md.c revision 92363
1/*
2 * ----------------------------------------------------------------------------
3 * "THE BEER-WARE LICENSE" (Revision 42):
4 * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
5 * can do whatever you want with this stuff. If we meet some day, and you think
6 * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
7 * ----------------------------------------------------------------------------
8 *
9 * $FreeBSD: head/sys/dev/md/md.c 92363 2002-03-15 18:49:47Z mckusick $
10 *
11 */
12
13/*
 * The following functions are based on the vn(4) driver: mdstart_swap(),
15 * mdstart_vnode(), mdcreate_swap(), mdcreate_vnode() and mddestroy(),
16 * and as such under the following copyright:
17 *
18 * Copyright (c) 1988 University of Utah.
19 * Copyright (c) 1990, 1993
20 *	The Regents of the University of California.  All rights reserved.
21 *
22 * This code is derived from software contributed to Berkeley by
23 * the Systems Programming Group of the University of Utah Computer
24 * Science Department.
25 *
26 * Redistribution and use in source and binary forms, with or without
27 * modification, are permitted provided that the following conditions
28 * are met:
29 * 1. Redistributions of source code must retain the above copyright
30 *    notice, this list of conditions and the following disclaimer.
31 * 2. Redistributions in binary form must reproduce the above copyright
32 *    notice, this list of conditions and the following disclaimer in the
33 *    documentation and/or other materials provided with the distribution.
34 * 3. All advertising materials mentioning features or use of this software
35 *    must display the following acknowledgement:
36 *	This product includes software developed by the University of
37 *	California, Berkeley and its contributors.
38 * 4. Neither the name of the University nor the names of its contributors
39 *    may be used to endorse or promote products derived from this software
40 *    without specific prior written permission.
41 *
42 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52 * SUCH DAMAGE.
53 *
54 * from: Utah Hdr: vn.c 1.13 94/04/02
55 *
56 *	from: @(#)vn.c	8.6 (Berkeley) 4/1/94
57 * From: src/sys/dev/vn/vn.c,v 1.122 2000/12/16 16:06:03
58 */
59
60#include "opt_md.h"
61
62#include <sys/param.h>
63#include <sys/systm.h>
64#include <sys/bio.h>
65#include <sys/conf.h>
66#include <sys/devicestat.h>
67#include <sys/disk.h>
68#include <sys/fcntl.h>
69#include <sys/kernel.h>
70#include <sys/linker.h>
71#include <sys/lock.h>
72#include <sys/malloc.h>
73#include <sys/mdioctl.h>
74#include <sys/mutex.h>
75#include <sys/namei.h>
76#include <sys/proc.h>
77#include <sys/queue.h>
78#include <sys/sysctl.h>
79#include <sys/vnode.h>
80
81#include <machine/atomic.h>
82
83#include <vm/vm.h>
84#include <vm/vm_object.h>
85#include <vm/vm_page.h>
86#include <vm/vm_pager.h>
87#include <vm/vm_zone.h>
88#include <vm/swap_pager.h>
89
/* Module version handed to MODULE_VERSION() at the bottom of this file. */
#define MD_MODVER 1

/*
 * Default for MD_NSECT when the build does not supply one.  Nothing in the
 * code visible in this file references it.
 */
#ifndef MD_NSECT
#define MD_NSECT (10000 * 2)
#endif

/*
 * malloc(9) types: M_MD is used for the softc and the per-sector pointer
 * table, M_MDSECT for the sector buffers of malloc-backed units.
 */
static MALLOC_DEFINE(M_MD, "MD disk", "Memory Disk");
static MALLOC_DEFINE(M_MDSECT, "MD sectors", "Memory Disk Sectors");

/* Debug verbosity, exported read/write as the debug.mddebug sysctl. */
static int md_debug;
SYSCTL_INT(_debug, OID_AUTO, mddebug, CTLFLAG_RW, &md_debug, 0, "");

#if defined(MD_ROOT) && defined(MD_ROOT_SIZE)
/* Image gets put here: */
static u_char mfs_root[MD_ROOT_SIZE*1024] = "MFS Filesystem goes here";
static u_char end_mfs_root[] __unused = "MFS Filesystem had better STOP here";
#endif

/* Set by md_preloaded() when unit 0 is a preloaded image; read by md_takeroot(). */
static int	mdrootready;
/*
 * Only read (by the banner printf in md_drvinit()); never written in this
 * file, so it stays 0.
 */
static int	mdunits;
/* Control device created in md_drvinit() and destroyed on MOD_UNLOAD. */
static dev_t	status_dev = 0;


/* Character device major number shared by both cdevsw tables below. */
#define CDEV_MAJOR	95

/* Forward declarations of the cdevsw entry points defined later in the file. */
static d_strategy_t mdstrategy;
static d_open_t mdopen;
static d_close_t mdclose;
static d_ioctl_t mdioctl, mdctlioctl;
119
/*
 * Device switch for the memory disk units themselves.  Handed to
 * disk_create() in mdinit(); read and write go through physread/physwrite,
 * which in turn reach mdstrategy().
 */
static struct cdevsw md_cdevsw = {
        /* open */      mdopen,
        /* close */     mdclose,
        /* read */      physread,
        /* write */     physwrite,
        /* ioctl */     mdioctl,
        /* poll */      nopoll,
        /* mmap */      nommap,
        /* strategy */  mdstrategy,
        /* name */      MD_NAME,
        /* maj */       CDEV_MAJOR,
        /* dump */      nodump,
        /* psize */     nopsize,
        /* flags */     D_DISK | D_CANFREE | D_MEMDISK,
};
135
/*
 * Device switch for the control node created in md_drvinit().  Only ioctl
 * does real work (mdctlioctl); open and close are no-ops and data transfer
 * is refused.  The members after "maj" (dump, psize, flags) are omitted and
 * therefore zero-initialized.
 */
static struct cdevsw mdctl_cdevsw = {
        /* open */      nullopen,
        /* close */     nullclose,
        /* read */      noread,
        /* write */     nowrite,
        /* ioctl */     mdctlioctl,
        /* poll */      nopoll,
        /* mmap */      nommap,
        /* strategy */  nostrategy,
        /* name */      MD_NAME,
        /* maj */       CDEV_MAJOR
};
148
/* Zero-initialized switch passed as the last argument of disk_create() in mdinit(). */
static struct cdevsw mddisk_cdevsw;

/*
 * All configured units.  mdnew() inserts at the head, mddestroy() removes;
 * mdfind() and md_modevent() walk it.  No lock protects it (see the XXX
 * markers at each use).
 */
static LIST_HEAD(, md_s) md_softc_list = LIST_HEAD_INITIALIZER(&md_softc_list);
152
/*
 * Per-unit software state.  Allocated zeroed by mdnew(), filled in by one of
 * the mdcreate_*() routines or md_preloaded(), torn down by mddestroy().
 */
struct md_s {
	int unit;			/* unit number, unique on md_softc_list */
	LIST_ENTRY(md_s) list;		/* linkage on md_softc_list */
	struct devstat stats;		/* registered in mdinit() */
	struct bio_queue_head bio_queue; /* pending requests, see mdstrategy() */
	struct disk disk;		/* disk layer state; holds the label */
	dev_t dev;			/* returned by disk_create(); NULL until mdinit() */
	int busy;			/* set via atomic_cmpset_int while the queue is drained */
	enum md_types type;		/* backing store: malloc/preload/vnode/swap */
	unsigned nsect;			/* size of the unit in secsize sectors */
	unsigned opencount;		/* bumped in mdopen(), dropped in mdclose() */
	unsigned secsize;		/* DEV_BSIZE, or PAGE_SIZE for swap units */
	unsigned flags;			/* MD_* option bits kept from the attach request */

	/* MD_MALLOC related fields */
	u_char **secp;			/* one slot per sector: buffer pointer, or fill byte if <= 255 */

	/* MD_PRELOAD related fields */
	u_char *pl_ptr;			/* start of the image in memory */
	unsigned pl_len;		/* image length in bytes */

	/* MD_VNODE related fields */
	struct vnode *vnode;		/* backing file, opened in mdcreate_vnode() */
	struct ucred *cred;		/* credentials held by mdsetcred() for vnode I/O */

	/* MD_SWAP related fields */
	vm_object_t object;		/* OBJT_SWAP object from vm_pager_allocate() */
};
181
182static int
183mdopen(dev_t dev, int flag, int fmt, struct thread *td)
184{
185	struct md_s *sc;
186	struct disklabel *dl;
187
188	if (md_debug)
189		printf("mdopen(%s %x %x %p)\n",
190			devtoname(dev), flag, fmt, td->td_proc);
191
192	sc = dev->si_drv1;
193
194	dl = &sc->disk.d_label;
195	bzero(dl, sizeof(*dl));
196	dl->d_secsize = sc->secsize;
197	dl->d_nsectors = sc->nsect > 63 ? 63 : sc->nsect;
198	dl->d_ntracks = 1;
199	dl->d_secpercyl = dl->d_nsectors * dl->d_ntracks;
200	dl->d_secperunit = sc->nsect;
201	dl->d_ncylinders = dl->d_secperunit / dl->d_secpercyl;
202	sc->opencount++;
203	return (0);
204}
205
206static int
207mdclose(dev_t dev, int flags, int fmt, struct thread *td)
208{
209	struct md_s *sc = dev->si_drv1;
210
211	sc->opencount--;
212	return (0);
213}
214
215static int
216mdioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct thread *td)
217{
218
219	if (md_debug)
220		printf("mdioctl(%s %lx %p %x %p)\n",
221			devtoname(dev), cmd, addr, flags, td);
222
223	return (ENOIOCTL);
224}
225
226static int
227mdstart_malloc(struct md_s *sc, struct bio *bp)
228{
229	int i;
230	devstat_trans_flags dop;
231	u_char *secp, **secpp, *dst;
232	unsigned secno, nsec, secval, uc;
233
234	if (bp->bio_cmd == BIO_DELETE)
235		dop = DEVSTAT_NO_DATA;
236	else if (bp->bio_cmd == BIO_READ)
237		dop = DEVSTAT_READ;
238	else
239		dop = DEVSTAT_WRITE;
240
241	nsec = bp->bio_bcount / sc->secsize;
242	secno = bp->bio_pblkno;
243	dst = bp->bio_data;
244	while (nsec--) {
245		secpp = &sc->secp[secno];
246		if ((uintptr_t)*secpp > 255) {
247			secp = *secpp;
248			secval = 0;
249		} else {
250			secp = NULL;
251			secval = (uintptr_t) *secpp;
252		}
253
254		if (md_debug > 2)
255			printf("%x %p %p %d\n",
256			    bp->bio_flags, secpp, secp, secval);
257
258		if (bp->bio_cmd == BIO_DELETE) {
259			if (!(sc->flags & MD_RESERVE) && secp != NULL) {
260				FREE(secp, M_MDSECT);
261				*secpp = 0;
262			}
263		} else if (bp->bio_cmd == BIO_READ) {
264			if (secp != NULL) {
265				bcopy(secp, dst, sc->secsize);
266			} else if (secval) {
267				for (i = 0; i < sc->secsize; i++)
268					dst[i] = secval;
269			} else {
270				bzero(dst, sc->secsize);
271			}
272		} else {
273			if (sc->flags & MD_COMPRESS) {
274				uc = dst[0];
275				for (i = 1; i < sc->secsize; i++)
276					if (dst[i] != uc)
277						break;
278			} else {
279				i = 0;
280				uc = 0;
281			}
282			if (i == sc->secsize) {
283				if (secp)
284					FREE(secp, M_MDSECT);
285				*secpp = (u_char *)(uintptr_t)uc;
286			} else {
287				if (secp == NULL)
288					MALLOC(secp, u_char *, sc->secsize, M_MDSECT, M_WAITOK);
289				bcopy(dst, secp, sc->secsize);
290				*secpp = secp;
291			}
292		}
293		secno++;
294		dst += sc->secsize;
295	}
296	bp->bio_resid = 0;
297	return (0);
298}
299
300
301static int
302mdstart_preload(struct md_s *sc, struct bio *bp)
303{
304	devstat_trans_flags dop;
305
306	if (bp->bio_cmd == BIO_DELETE) {
307		dop = DEVSTAT_NO_DATA;
308	} else if (bp->bio_cmd == BIO_READ) {
309		dop = DEVSTAT_READ;
310		bcopy(sc->pl_ptr + (bp->bio_pblkno << DEV_BSHIFT), bp->bio_data, bp->bio_bcount);
311	} else {
312		dop = DEVSTAT_WRITE;
313		bcopy(bp->bio_data, sc->pl_ptr + (bp->bio_pblkno << DEV_BSHIFT), bp->bio_bcount);
314	}
315	bp->bio_resid = 0;
316	return (0);
317}
318
319static int
320mdstart_vnode(struct md_s *sc, struct bio *bp)
321{
322	int error;
323	struct uio auio;
324	struct iovec aiov;
325	struct mount *mp;
326
327	/*
328	 * VNODE I/O
329	 *
330	 * If an error occurs, we set BIO_ERROR but we do not set
331	 * B_INVAL because (for a write anyway), the buffer is
332	 * still valid.
333	 */
334
335	bzero(&auio, sizeof(auio));
336
337	aiov.iov_base = bp->bio_data;
338	aiov.iov_len = bp->bio_bcount;
339	auio.uio_iov = &aiov;
340	auio.uio_iovcnt = 1;
341	auio.uio_offset = (vm_ooffset_t)bp->bio_pblkno * sc->secsize;
342	auio.uio_segflg = UIO_SYSSPACE;
343	if(bp->bio_cmd == BIO_READ)
344		auio.uio_rw = UIO_READ;
345	else
346		auio.uio_rw = UIO_WRITE;
347	auio.uio_resid = bp->bio_bcount;
348	auio.uio_td = curthread;
349	/*
350	 * When reading set IO_DIRECT to try to avoid double-caching
351	 * the data.  When writing IO_DIRECT is not optimal, but we
352	 * must set IO_NOWDRAIN to avoid a wdrain deadlock.
353	 */
354	if (bp->bio_cmd == BIO_READ) {
355		vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread);
356		error = VOP_READ(sc->vnode, &auio, IO_DIRECT, sc->cred);
357	} else {
358		(void) vn_start_write(sc->vnode, &mp, V_WAIT);
359		vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread);
360		error = VOP_WRITE(sc->vnode, &auio, IO_NOWDRAIN, sc->cred);
361		vn_finished_write(mp);
362	}
363	VOP_UNLOCK(sc->vnode, 0, curthread);
364	bp->bio_resid = auio.uio_resid;
365	return (error);
366}
367
368static int
369mdstart_swap(struct md_s *sc, struct bio *bp)
370{
371
372	if ((bp->bio_cmd == BIO_DELETE) && (sc->flags & MD_RESERVE))
373		biodone(bp);
374	else
375		vm_pager_strategy(sc->object, bp);
376	return (-1);
377}
378
379static void
380mdstrategy(struct bio *bp)
381{
382	struct md_s *sc;
383	int error;
384
385	if (md_debug > 1)
386		printf("mdstrategy(%p) %s %x, %lld, %ld, %p)\n",
387		    bp, devtoname(bp->bio_dev), bp->bio_flags, bp->bio_blkno,
388		    bp->bio_bcount / DEV_BSIZE, bp->bio_data);
389
390	sc = bp->bio_dev->si_drv1;
391
392	/* XXX: LOCK(sc->lock) */
393	bioqdisksort(&sc->bio_queue, bp);
394	/* XXX: UNLOCK(sc->lock) */
395
396	if (atomic_cmpset_int(&sc->busy, 0, 1) == 0)
397		return;
398
399	for (;;) {
400		/* XXX: LOCK(unique unit numbers) */
401		bp = bioq_first(&sc->bio_queue);
402		if (bp)
403			bioq_remove(&sc->bio_queue, bp);
404		/* XXX: UNLOCK(unique unit numbers) */
405		if (!bp)
406			break;
407
408
409		switch (sc->type) {
410		case MD_MALLOC:
411			devstat_start_transaction(&sc->stats);
412			error = mdstart_malloc(sc, bp);
413			break;
414		case MD_PRELOAD:
415			devstat_start_transaction(&sc->stats);
416			error = mdstart_preload(sc, bp);
417			break;
418		case MD_VNODE:
419			devstat_start_transaction(&sc->stats);
420			error = mdstart_vnode(sc, bp);
421			break;
422		case MD_SWAP:
423			error = mdstart_swap(sc, bp);
424			break;
425		default:
426			panic("Impossible md(type)");
427			break;
428		}
429
430		if (error != -1)
431			biofinish(bp, &sc->stats, error);
432	}
433	sc->busy = 0;
434}
435
436static struct md_s *
437mdfind(int unit)
438{
439	struct md_s *sc;
440
441	/* XXX: LOCK(unique unit numbers) */
442	LIST_FOREACH(sc, &md_softc_list, list) {
443		if (sc->unit == unit)
444			break;
445	}
446	/* XXX: UNLOCK(unique unit numbers) */
447	return (sc);
448}
449
450static struct md_s *
451mdnew(int unit)
452{
453	struct md_s *sc;
454	int max = -1;
455
456	/* XXX: LOCK(unique unit numbers) */
457	LIST_FOREACH(sc, &md_softc_list, list) {
458		if (sc->unit == unit) {
459			/* XXX: UNLOCK(unique unit numbers) */
460			return (NULL);
461		}
462		if (sc->unit > max)
463			max = sc->unit;
464	}
465	if (unit == -1)
466		unit = max + 1;
467	if (unit > DKMAXUNIT)
468		return (NULL);
469	MALLOC(sc, struct md_s *, sizeof(*sc), M_MD, M_WAITOK | M_ZERO);
470	sc->unit = unit;
471	LIST_INSERT_HEAD(&md_softc_list, sc, list);
472	/* XXX: UNLOCK(unique unit numbers) */
473	return (sc);
474}
475
476static void
477mdinit(struct md_s *sc)
478{
479
480	bioq_init(&sc->bio_queue);
481	devstat_add_entry(&sc->stats, MD_NAME, sc->unit, sc->secsize,
482		DEVSTAT_NO_ORDERED_TAGS,
483		DEVSTAT_TYPE_DIRECT | DEVSTAT_TYPE_IF_OTHER,
484		DEVSTAT_PRIORITY_OTHER);
485	sc->dev = disk_create(sc->unit, &sc->disk, 0, &md_cdevsw, &mddisk_cdevsw);
486	sc->dev->si_drv1 = sc;
487}
488
489/*
490 * XXX: we should check that the range they feed us is mapped.
491 * XXX: we should implement read-only.
492 */
493
494static int
495mdcreate_preload(struct md_ioctl *mdio)
496{
497	struct md_s *sc;
498
499	if (mdio->md_size == 0)
500		return (EINVAL);
501	if (mdio->md_options & ~(MD_AUTOUNIT))
502		return (EINVAL);
503	if (mdio->md_options & MD_AUTOUNIT) {
504		sc = mdnew(-1);
505		if (sc == NULL)
506			return (ENOMEM);
507		mdio->md_unit = sc->unit;
508	} else {
509		sc = mdnew(mdio->md_unit);
510		if (sc == NULL)
511			return (EBUSY);
512	}
513	sc->type = MD_PRELOAD;
514	sc->secsize = DEV_BSIZE;
515	sc->nsect = mdio->md_size;
516	sc->flags = mdio->md_options & MD_FORCE;
517	/* Cast to pointer size, then to pointer to avoid warning */
518	sc->pl_ptr = (u_char *)(uintptr_t)mdio->md_base;
519	sc->pl_len = (mdio->md_size << DEV_BSHIFT);
520	mdinit(sc);
521	return (0);
522}
523
524
525static int
526mdcreate_malloc(struct md_ioctl *mdio)
527{
528	struct md_s *sc;
529	unsigned u;
530
531	if (mdio->md_size == 0)
532		return (EINVAL);
533	if (mdio->md_options & ~(MD_AUTOUNIT | MD_COMPRESS | MD_RESERVE))
534		return (EINVAL);
535	/* Compression doesn't make sense if we have reserved space */
536	if (mdio->md_options & MD_RESERVE)
537		mdio->md_options &= ~MD_COMPRESS;
538	if (mdio->md_options & MD_AUTOUNIT) {
539		sc = mdnew(-1);
540		if (sc == NULL)
541			return (ENOMEM);
542		mdio->md_unit = sc->unit;
543	} else {
544		sc = mdnew(mdio->md_unit);
545		if (sc == NULL)
546			return (EBUSY);
547	}
548	sc->type = MD_MALLOC;
549	sc->secsize = DEV_BSIZE;
550	sc->nsect = mdio->md_size;
551	sc->flags = mdio->md_options & (MD_COMPRESS | MD_FORCE);
552	MALLOC(sc->secp, u_char **, sc->nsect * sizeof(u_char *), M_MD, M_WAITOK | M_ZERO);
553	if (mdio->md_options & MD_RESERVE) {
554		for (u = 0; u < sc->nsect; u++)
555			MALLOC(sc->secp[u], u_char *, DEV_BSIZE, M_MDSECT, M_WAITOK | M_ZERO);
556	}
557	printf("%s%d: Malloc disk\n", MD_NAME, sc->unit);
558	mdinit(sc);
559	return (0);
560}
561
562
563static int
564mdsetcred(struct md_s *sc, struct ucred *cred)
565{
566	char *tmpbuf;
567	int error = 0;
568
569	/*
570	 * Set credits in our softc
571	 */
572
573	if (sc->cred)
574		crfree(sc->cred);
575	sc->cred = crhold(cred);
576
577	/*
578	 * Horrible kludge to establish credentials for NFS  XXX.
579	 */
580
581	if (sc->vnode) {
582		struct uio auio;
583		struct iovec aiov;
584
585		tmpbuf = malloc(sc->secsize, M_TEMP, M_WAITOK);
586		bzero(&auio, sizeof(auio));
587
588		aiov.iov_base = tmpbuf;
589		aiov.iov_len = sc->secsize;
590		auio.uio_iov = &aiov;
591		auio.uio_iovcnt = 1;
592		auio.uio_offset = 0;
593		auio.uio_rw = UIO_READ;
594		auio.uio_segflg = UIO_SYSSPACE;
595		auio.uio_resid = aiov.iov_len;
596		vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread);
597		error = VOP_READ(sc->vnode, &auio, 0, sc->cred);
598		VOP_UNLOCK(sc->vnode, 0, curthread);
599		free(tmpbuf, M_TEMP);
600	}
601	return (error);
602}
603
604static int
605mdcreate_vnode(struct md_ioctl *mdio, struct thread *td)
606{
607	struct md_s *sc;
608	struct vattr vattr;
609	struct nameidata nd;
610	int error, flags;
611
612	if (mdio->md_options & MD_AUTOUNIT) {
613		sc = mdnew(-1);
614		mdio->md_unit = sc->unit;
615	} else {
616		sc = mdnew(mdio->md_unit);
617	}
618	if (sc == NULL)
619		return (EBUSY);
620
621	sc->type = MD_VNODE;
622	sc->flags = mdio->md_options & MD_FORCE;
623
624	flags = FREAD|FWRITE;
625	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, mdio->md_file, td);
626	error = vn_open(&nd, &flags, 0);
627	if (error) {
628		if (error != EACCES && error != EPERM && error != EROFS)
629			return (error);
630		flags &= ~FWRITE;
631		sc->flags |= MD_READONLY;
632		NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, mdio->md_file, td);
633		error = vn_open(&nd, &flags, 0);
634		if (error)
635			return (error);
636	}
637	NDFREE(&nd, NDF_ONLY_PNBUF);
638	if (nd.ni_vp->v_type != VREG ||
639	    (error = VOP_GETATTR(nd.ni_vp, &vattr, td->td_ucred, td))) {
640		VOP_UNLOCK(nd.ni_vp, 0, td);
641		(void) vn_close(nd.ni_vp, flags, td->td_ucred, td);
642		return (error ? error : EINVAL);
643	}
644	VOP_UNLOCK(nd.ni_vp, 0, td);
645	sc->secsize = DEV_BSIZE;
646	sc->vnode = nd.ni_vp;
647
648	/*
649	 * If the size is specified, override the file attributes.
650	 */
651	if (mdio->md_size)
652		sc->nsect = mdio->md_size;
653	else
654		sc->nsect = vattr.va_size / sc->secsize; /* XXX: round up ? */
655	if (sc->nsect == 0) {
656		(void) vn_close(nd.ni_vp, flags, td->td_ucred, td);
657		return (EINVAL);
658	}
659	error = mdsetcred(sc, td->td_ucred);
660	if (error) {
661		(void) vn_close(nd.ni_vp, flags, td->td_ucred, td);
662		return (error);
663	}
664	mdinit(sc);
665	return (0);
666}
667
668static int
669mddestroy(struct md_s *sc, struct thread *td)
670{
671	unsigned u;
672
673	GIANT_REQUIRED;
674
675	if (sc->dev != NULL) {
676		devstat_remove_entry(&sc->stats);
677		disk_destroy(sc->dev);
678	}
679	if (sc->vnode != NULL)
680		(void)vn_close(sc->vnode, sc->flags & MD_READONLY ?
681		    FREAD : (FREAD|FWRITE), sc->cred, td);
682	if (sc->cred != NULL)
683		crfree(sc->cred);
684	if (sc->object != NULL) {
685		vm_pager_deallocate(sc->object);
686	}
687	if (sc->secp != NULL) {
688		for (u = 0; u < sc->nsect; u++)
689			if ((uintptr_t)sc->secp[u] > 255)
690				FREE(sc->secp[u], M_MDSECT);
691		FREE(sc->secp, M_MD);
692	}
693
694	/* XXX: LOCK(unique unit numbers) */
695	LIST_REMOVE(sc, list);
696	/* XXX: UNLOCK(unique unit numbers) */
697	FREE(sc, M_MD);
698	return (0);
699}
700
701static int
702mdcreate_swap(struct md_ioctl *mdio, struct thread *td)
703{
704	int error;
705	struct md_s *sc;
706
707	GIANT_REQUIRED;
708
709	if (mdio->md_options & MD_AUTOUNIT) {
710		sc = mdnew(-1);
711		mdio->md_unit = sc->unit;
712	} else {
713		sc = mdnew(mdio->md_unit);
714	}
715	if (sc == NULL)
716		return (EBUSY);
717
718	sc->type = MD_SWAP;
719
720	/*
721	 * Range check.  Disallow negative sizes or any size less then the
722	 * size of a page.  Then round to a page.
723	 */
724
725	if (mdio->md_size == 0) {
726		mddestroy(sc, td);
727		return (EDOM);
728	}
729
730	/*
731	 * Allocate an OBJT_SWAP object.
732	 *
733	 * sc_secsize is PAGE_SIZE'd
734	 *
735	 * mdio->size is in DEV_BSIZE'd chunks.
736	 * Note the truncation.
737	 */
738
739	sc->secsize = PAGE_SIZE;
740	sc->nsect = mdio->md_size / (PAGE_SIZE / DEV_BSIZE);
741	sc->object = vm_pager_allocate(OBJT_SWAP, NULL, sc->secsize * (vm_offset_t)sc->nsect, VM_PROT_DEFAULT, 0);
742	sc->flags = mdio->md_options & MD_FORCE;
743	if (mdio->md_options & MD_RESERVE) {
744		if (swap_pager_reserve(sc->object, 0, sc->nsect) < 0) {
745			vm_pager_deallocate(sc->object);
746			sc->object = NULL;
747			mddestroy(sc, td);
748			return (EDOM);
749		}
750	}
751	error = mdsetcred(sc, td->td_ucred);
752	if (error)
753		mddestroy(sc, td);
754	else
755		mdinit(sc);
756	return (error);
757}
758
759static int
760mddetach(int unit, struct thread *td)
761{
762	struct md_s *sc;
763
764	sc = mdfind(unit);
765	if (sc == NULL)
766		return (ENOENT);
767	if (sc->opencount != 0 && !(sc->flags & MD_FORCE))
768		return (EBUSY);
769	switch(sc->type) {
770	case MD_VNODE:
771	case MD_SWAP:
772	case MD_MALLOC:
773	case MD_PRELOAD:
774		return (mddestroy(sc, td));
775	default:
776		return (EOPNOTSUPP);
777	}
778}
779
780static int
781mdctlioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct thread *td)
782{
783	struct md_ioctl *mdio;
784	struct md_s *sc;
785
786	if (md_debug)
787		printf("mdctlioctl(%s %lx %p %x %p)\n",
788			devtoname(dev), cmd, addr, flags, td);
789
790	/*
791	 * We assert the version number in the individual ioctl
792	 * handlers instead of out here because (a) it is possible we
793	 * may add another ioctl in the future which doesn't read an
794	 * mdio, and (b) the correct return value for an unknown ioctl
795	 * is ENOIOCTL, not EINVAL.
796	 */
797	mdio = (struct md_ioctl *)addr;
798	switch (cmd) {
799	case MDIOCATTACH:
800		if (mdio->md_version != MDIOVERSION)
801			return (EINVAL);
802		switch (mdio->md_type) {
803		case MD_MALLOC:
804			return (mdcreate_malloc(mdio));
805		case MD_PRELOAD:
806			return (mdcreate_preload(mdio));
807		case MD_VNODE:
808			return (mdcreate_vnode(mdio, td));
809		case MD_SWAP:
810			return (mdcreate_swap(mdio, td));
811		default:
812			return (EINVAL);
813		}
814	case MDIOCDETACH:
815		if (mdio->md_version != MDIOVERSION)
816			return (EINVAL);
817		if (mdio->md_file != NULL || mdio->md_size != 0 ||
818		    mdio->md_options != 0)
819			return (EINVAL);
820		return (mddetach(mdio->md_unit, td));
821	case MDIOCQUERY:
822		if (mdio->md_version != MDIOVERSION)
823			return (EINVAL);
824		sc = mdfind(mdio->md_unit);
825		if (sc == NULL)
826			return (ENOENT);
827		mdio->md_type = sc->type;
828		mdio->md_options = sc->flags;
829		switch (sc->type) {
830		case MD_MALLOC:
831			mdio->md_size = sc->nsect;
832			break;
833		case MD_PRELOAD:
834			mdio->md_size = sc->nsect;
835			(u_char *)(uintptr_t)mdio->md_base = sc->pl_ptr;
836			break;
837		case MD_SWAP:
838			mdio->md_size = sc->nsect * (PAGE_SIZE / DEV_BSIZE);
839			break;
840		case MD_VNODE:
841			mdio->md_size = sc->nsect;
842			/* XXX fill this in */
843			mdio->md_file = NULL;
844			break;
845		}
846		return (0);
847	default:
848		return (ENOIOCTL);
849	};
850	return (ENOIOCTL);
851}
852
853static void
854md_preloaded(u_char *image, unsigned length)
855{
856	struct md_s *sc;
857
858	sc = mdnew(-1);
859	if (sc == NULL)
860		return;
861	sc->type = MD_PRELOAD;
862	sc->secsize = DEV_BSIZE;
863	sc->nsect = length / DEV_BSIZE;
864	sc->pl_ptr = image;
865	sc->pl_len = length;
866	if (sc->unit == 0)
867		mdrootready = 1;
868	mdinit(sc);
869}
870
871static void
872md_drvinit(void *unused)
873{
874
875	caddr_t mod;
876	caddr_t c;
877	u_char *ptr, *name, *type;
878	unsigned len;
879
880#ifdef MD_ROOT_SIZE
881	md_preloaded(mfs_root, MD_ROOT_SIZE*1024);
882#endif
883	mod = NULL;
884	while ((mod = preload_search_next_name(mod)) != NULL) {
885		name = (char *)preload_search_info(mod, MODINFO_NAME);
886		type = (char *)preload_search_info(mod, MODINFO_TYPE);
887		if (name == NULL)
888			continue;
889		if (type == NULL)
890			continue;
891		if (strcmp(type, "md_image") && strcmp(type, "mfs_root"))
892			continue;
893		c = preload_search_info(mod, MODINFO_ADDR);
894		ptr = *(u_char **)c;
895		c = preload_search_info(mod, MODINFO_SIZE);
896		len = *(unsigned *)c;
897		printf("%s%d: Preloaded image <%s> %d bytes at %p\n",
898		    MD_NAME, mdunits, name, len, ptr);
899		md_preloaded(ptr, len);
900	}
901	status_dev = make_dev(&mdctl_cdevsw, 0xffff00ff, UID_ROOT, GID_WHEEL,
902	    0600, MDCTL_NAME);
903}
904
905static int
906md_modevent(module_t mod, int type, void *data)
907{
908	int error;
909	struct md_s *sc;
910
911	switch (type) {
912	case MOD_LOAD:
913		md_drvinit(NULL);
914		break;
915	case MOD_UNLOAD:
916		LIST_FOREACH(sc, &md_softc_list, list) {
917			error = mddetach(sc->unit, curthread);
918			if (error != 0)
919				return (error);
920		}
921		if (status_dev)
922			destroy_dev(status_dev);
923		status_dev = 0;
924		break;
925	default:
926		break;
927	}
928	return (0);
929}
930
/* Module glue: name, event handler, and no private data argument. */
static moduledata_t md_mod = {
	MD_NAME,
	md_modevent,
	NULL
};
/* Registered in the SI_SUB_DRIVERS stage, ordered by the device major. */
DECLARE_MODULE(md, md_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE+CDEV_MAJOR);
MODULE_VERSION(md, MD_MODVER);
938
939
#ifdef MD_ROOT
/*
 * Runs first in the SI_SUB_MOUNT_ROOT stage.  If unit 0 was created from a
 * preloaded image, make it the first root device candidate.
 */
static void
md_takeroot(void *junk)
{

	if (!mdrootready)
		return;
	rootdevnames[0] = "ufs:/dev/md0c";
}

SYSINIT(md_root, SI_SUB_MOUNT_ROOT, SI_ORDER_FIRST, md_takeroot, NULL);
#endif
950