md.c revision 89628
1/*
2 * ----------------------------------------------------------------------------
3 * "THE BEER-WARE LICENSE" (Revision 42):
4 * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
5 * can do whatever you want with this stuff. If we meet some day, and you think
6 * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
7 * ----------------------------------------------------------------------------
8 *
9 * $FreeBSD: head/sys/dev/md/md.c 89628 2002-01-21 20:50:06Z phk $
10 *
11 */
12
13/*
14 * The following functions are based on the vn(4) driver: mdstart_swap(),
15 * mdstart_vnode(), mdcreate_swap(), mdcreate_vnode() and mddestroy(),
16 * and as such under the following copyright:
17 *
18 * Copyright (c) 1988 University of Utah.
19 * Copyright (c) 1990, 1993
20 *	The Regents of the University of California.  All rights reserved.
21 *
22 * This code is derived from software contributed to Berkeley by
23 * the Systems Programming Group of the University of Utah Computer
24 * Science Department.
25 *
26 * Redistribution and use in source and binary forms, with or without
27 * modification, are permitted provided that the following conditions
28 * are met:
29 * 1. Redistributions of source code must retain the above copyright
30 *    notice, this list of conditions and the following disclaimer.
31 * 2. Redistributions in binary form must reproduce the above copyright
32 *    notice, this list of conditions and the following disclaimer in the
33 *    documentation and/or other materials provided with the distribution.
34 * 3. All advertising materials mentioning features or use of this software
35 *    must display the following acknowledgement:
36 *	This product includes software developed by the University of
37 *	California, Berkeley and its contributors.
38 * 4. Neither the name of the University nor the names of its contributors
39 *    may be used to endorse or promote products derived from this software
40 *    without specific prior written permission.
41 *
42 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52 * SUCH DAMAGE.
53 *
54 * from: Utah Hdr: vn.c 1.13 94/04/02
55 *
56 *	from: @(#)vn.c	8.6 (Berkeley) 4/1/94
57 * From: src/sys/dev/vn/vn.c,v 1.122 2000/12/16 16:06:03
58 */
59
60#include "opt_md.h"
61
62#include <sys/param.h>
63#include <sys/systm.h>
64#include <sys/bio.h>
65#include <sys/conf.h>
66#include <sys/devicestat.h>
67#include <sys/disk.h>
68#include <sys/fcntl.h>
69#include <sys/kernel.h>
70#include <sys/linker.h>
71#include <sys/lock.h>
72#include <sys/malloc.h>
73#include <sys/mdioctl.h>
74#include <sys/mutex.h>
75#include <sys/namei.h>
76#include <sys/proc.h>
77#include <sys/queue.h>
78#include <sys/sysctl.h>
79#include <sys/vnode.h>
80
81#include <machine/atomic.h>
82
83#include <vm/vm.h>
84#include <vm/vm_object.h>
85#include <vm/vm_page.h>
86#include <vm/vm_pager.h>
87#include <vm/vm_zone.h>
88#include <vm/swap_pager.h>
89
90#define MD_MODVER 1
91
92#ifndef MD_NSECT
93#define MD_NSECT (10000 * 2)
94#endif
95
96MALLOC_DEFINE(M_MD, "MD disk", "Memory Disk");
97MALLOC_DEFINE(M_MDSECT, "MD sectors", "Memory Disk Sectors");
98
99static int md_debug;
100SYSCTL_INT(_debug, OID_AUTO, mddebug, CTLFLAG_RW, &md_debug, 0, "");
101
102#if defined(MD_ROOT) && defined(MD_ROOT_SIZE)
103/* Image gets put here: */
104static u_char mfs_root[MD_ROOT_SIZE*1024] = "MFS Filesystem goes here";
105static u_char end_mfs_root[] __unused = "MFS Filesystem had better STOP here";
106#endif
107
108static int	mdrootready;
109static int	mdunits;
110static dev_t	status_dev = 0;
111
112
113#define CDEV_MAJOR	95
114
115static d_strategy_t mdstrategy;
116static d_open_t mdopen;
117static d_close_t mdclose;
118static d_ioctl_t mdioctl, mdctlioctl;
119
120static struct cdevsw md_cdevsw = {
121        /* open */      mdopen,
122        /* close */     mdclose,
123        /* read */      physread,
124        /* write */     physwrite,
125        /* ioctl */     mdioctl,
126        /* poll */      nopoll,
127        /* mmap */      nommap,
128        /* strategy */  mdstrategy,
129        /* name */      MD_NAME,
130        /* maj */       CDEV_MAJOR,
131        /* dump */      nodump,
132        /* psize */     nopsize,
133        /* flags */     D_DISK | D_CANFREE | D_MEMDISK,
134};
135
136static struct cdevsw mdctl_cdevsw = {
137        /* open */      nullopen,
138        /* close */     nullclose,
139        /* read */      noread,
140        /* write */     nowrite,
141        /* ioctl */     mdctlioctl,
142        /* poll */      nopoll,
143        /* mmap */      nommap,
144        /* strategy */  nostrategy,
145        /* name */      MD_NAME,
146        /* maj */       CDEV_MAJOR
147};
148
149static struct cdevsw mddisk_cdevsw;
150
151static LIST_HEAD(, md_s) md_softc_list = LIST_HEAD_INITIALIZER(&md_softc_list);
152
153struct md_s {
154	int unit;
155	LIST_ENTRY(md_s) list;
156	struct devstat stats;
157	struct bio_queue_head bio_queue;
158	struct disk disk;
159	dev_t dev;
160	int busy;
161	enum md_types type;
162	unsigned nsect;
163	unsigned opencount;
164	unsigned secsize;
165	unsigned flags;
166
167	/* MD_MALLOC related fields */
168	u_char **secp;
169
170	/* MD_PRELOAD related fields */
171	u_char *pl_ptr;
172	unsigned pl_len;
173
174	/* MD_VNODE related fields */
175	struct vnode *vnode;
176	struct ucred *cred;
177
178	/* MD_SWAP related fields */
179	vm_object_t object;
180};
181
182static int
183mdopen(dev_t dev, int flag, int fmt, struct thread *td)
184{
185	struct md_s *sc;
186	struct disklabel *dl;
187
188	if (md_debug)
189		printf("mdopen(%s %x %x %p)\n",
190			devtoname(dev), flag, fmt, td->td_proc);
191
192	sc = dev->si_drv1;
193
194	dl = &sc->disk.d_label;
195	bzero(dl, sizeof(*dl));
196	dl->d_secsize = sc->secsize;
197	dl->d_nsectors = sc->nsect > 63 ? 63 : sc->nsect;
198	dl->d_ntracks = 1;
199	dl->d_secpercyl = dl->d_nsectors * dl->d_ntracks;
200	dl->d_secperunit = sc->nsect;
201	dl->d_ncylinders = dl->d_secperunit / dl->d_secpercyl;
202	sc->opencount++;
203	return (0);
204}
205
206static int
207mdclose(dev_t dev, int flags, int fmt, struct thread *td)
208{
209	struct md_s *sc = dev->si_drv1;
210
211	sc->opencount--;
212	return (0);
213}
214
215static int
216mdioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct thread *td)
217{
218
219	if (md_debug)
220		printf("mdioctl(%s %lx %p %x %p)\n",
221			devtoname(dev), cmd, addr, flags, td);
222
223	return (ENOIOCTL);
224}
225
226static int
227mdstart_malloc(struct md_s *sc, struct bio *bp)
228{
229	int i;
230	devstat_trans_flags dop;
231	u_char *secp, **secpp, *dst;
232	unsigned secno, nsec, secval, uc;
233
234		if (bp->bio_cmd == BIO_DELETE)
235			dop = DEVSTAT_NO_DATA;
236		else if (bp->bio_cmd == BIO_READ)
237			dop = DEVSTAT_READ;
238		else
239			dop = DEVSTAT_WRITE;
240
241		nsec = bp->bio_bcount / sc->secsize;
242		secno = bp->bio_pblkno;
243		dst = bp->bio_data;
244		while (nsec--) {
245			secpp = &sc->secp[secno];
246			if ((uintptr_t)*secpp > 255) {
247				secp = *secpp;
248				secval = 0;
249			} else {
250				secp = NULL;
251				secval = (uintptr_t) *secpp;
252			}
253
254			if (md_debug > 2)
255				printf("%x %p %p %d\n",
256				    bp->bio_flags, secpp, secp, secval);
257
258			if (bp->bio_cmd == BIO_DELETE) {
259				if (!(sc->flags & MD_RESERVE) && secp != NULL) {
260					FREE(secp, M_MDSECT);
261					*secpp = 0;
262				}
263			} else if (bp->bio_cmd == BIO_READ) {
264				if (secp != NULL) {
265					bcopy(secp, dst, sc->secsize);
266				} else if (secval) {
267					for (i = 0; i < sc->secsize; i++)
268						dst[i] = secval;
269				} else {
270					bzero(dst, sc->secsize);
271				}
272			} else {
273				if (sc->flags & MD_COMPRESS) {
274					uc = dst[0];
275					for (i = 1; i < sc->secsize; i++)
276						if (dst[i] != uc)
277							break;
278				} else {
279					i = 0;
280					uc = 0;
281				}
282				if (i == sc->secsize) {
283					if (secp)
284						FREE(secp, M_MDSECT);
285					*secpp = (u_char *)(uintptr_t)uc;
286				} else {
287					if (secp == NULL)
288						MALLOC(secp, u_char *, sc->secsize, M_MDSECT, M_WAITOK);
289					bcopy(dst, secp, sc->secsize);
290					*secpp = secp;
291				}
292			}
293			secno++;
294			dst += sc->secsize;
295		}
296		bp->bio_resid = 0;
297	return (0);
298}
299
300
301static int
302mdstart_preload(struct md_s *sc, struct bio *bp)
303{
304	devstat_trans_flags dop;
305
306		if (bp->bio_cmd == BIO_DELETE) {
307			dop = DEVSTAT_NO_DATA;
308		} else if (bp->bio_cmd == BIO_READ) {
309			dop = DEVSTAT_READ;
310			bcopy(sc->pl_ptr + (bp->bio_pblkno << DEV_BSHIFT), bp->bio_data, bp->bio_bcount);
311		} else {
312			dop = DEVSTAT_WRITE;
313			bcopy(bp->bio_data, sc->pl_ptr + (bp->bio_pblkno << DEV_BSHIFT), bp->bio_bcount);
314		}
315		bp->bio_resid = 0;
316	return (0);
317}
318
319static int
320mdstart_vnode(struct md_s *sc, struct bio *bp)
321{
322	int error;
323	struct uio auio;
324	struct iovec aiov;
325	struct mount *mp;
326
327	/*
328	 * VNODE I/O
329	 *
330	 * If an error occurs, we set BIO_ERROR but we do not set
331	 * B_INVAL because (for a write anyway), the buffer is
332	 * still valid.
333	 */
334
335		bzero(&auio, sizeof(auio));
336
337		aiov.iov_base = bp->bio_data;
338		aiov.iov_len = bp->bio_bcount;
339		auio.uio_iov = &aiov;
340		auio.uio_iovcnt = 1;
341		auio.uio_offset = (vm_ooffset_t)bp->bio_pblkno * sc->secsize;
342		auio.uio_segflg = UIO_SYSSPACE;
343		if(bp->bio_cmd == BIO_READ)
344			auio.uio_rw = UIO_READ;
345		else
346			auio.uio_rw = UIO_WRITE;
347		auio.uio_resid = bp->bio_bcount;
348		auio.uio_td = curthread;
349		/*
350		 * When reading set IO_DIRECT to try to avoid double-caching
351		 * the data.  When writing IO_DIRECT is not optimal, but we
352		 * must set IO_NOWDRAIN to avoid a wdrain deadlock.
353		 */
354		if (bp->bio_cmd == BIO_READ) {
355			vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread);
356			error = VOP_READ(sc->vnode, &auio, IO_DIRECT, sc->cred);
357		} else {
358			(void) vn_start_write(sc->vnode, &mp, V_WAIT);
359			vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread);
360			error = VOP_WRITE(sc->vnode, &auio, IO_NOWDRAIN, sc->cred);
361			vn_finished_write(mp);
362		}
363		VOP_UNLOCK(sc->vnode, 0, curthread);
364		bp->bio_resid = auio.uio_resid;
365	return (error);
366}
367
368static int
369mdstart_swap(struct md_s *sc, struct bio *bp)
370{
371
372		if ((bp->bio_cmd == BIO_DELETE) && (sc->flags & MD_RESERVE))
373			biodone(bp);
374		else
375			vm_pager_strategy(sc->object, bp);
376	return (-1);
377}
378
379static void
380mdstrategy(struct bio *bp)
381{
382	struct md_s *sc;
383	int error;
384
385	if (md_debug > 1)
386		printf("mdstrategy(%p) %s %x, %d, %ld, %p)\n",
387		    bp, devtoname(bp->bio_dev), bp->bio_flags, bp->bio_blkno,
388		    bp->bio_bcount / DEV_BSIZE, bp->bio_data);
389
390	sc = bp->bio_dev->si_drv1;
391
392	/* XXX: LOCK(sc->lock) */
393	bioqdisksort(&sc->bio_queue, bp);
394	/* XXX: UNLOCK(sc->lock) */
395
396	if (atomic_cmpset_int(&sc->busy, 0, 1) == 0)
397		return;
398
399	for (;;) {
400		/* XXX: LOCK(unique unit numbers) */
401		bp = bioq_first(&sc->bio_queue);
402		if (bp)
403			bioq_remove(&sc->bio_queue, bp);
404		/* XXX: UNLOCK(unique unit numbers) */
405		if (!bp)
406			break;
407
408
409	switch (sc->type) {
410	case MD_MALLOC:
411			devstat_start_transaction(&sc->stats);
412			error = mdstart_malloc(sc, bp);
413		break;
414	case MD_PRELOAD:
415			devstat_start_transaction(&sc->stats);
416			error = mdstart_preload(sc, bp);
417		break;
418	case MD_VNODE:
419			devstat_start_transaction(&sc->stats);
420			error = mdstart_vnode(sc, bp);
421		break;
422	case MD_SWAP:
423			error = mdstart_swap(sc, bp);
424		break;
425	default:
426		panic("Impossible md(type)");
427		break;
428		}
429
430		if (error != -1)
431			biofinish(bp, &sc->stats, error);
432	}
433	sc->busy = 0;
434}
435
436static struct md_s *
437mdfind(int unit)
438{
439	struct md_s *sc;
440
441	/* XXX: LOCK(unique unit numbers) */
442	LIST_FOREACH(sc, &md_softc_list, list) {
443		if (sc->unit == unit)
444			break;
445	}
446	/* XXX: UNLOCK(unique unit numbers) */
447	return (sc);
448}
449
450static struct md_s *
451mdnew(int unit)
452{
453	struct md_s *sc;
454	int max = -1;
455
456	/* XXX: LOCK(unique unit numbers) */
457	LIST_FOREACH(sc, &md_softc_list, list) {
458		if (sc->unit == unit) {
459			/* XXX: UNLOCK(unique unit numbers) */
460			return (NULL);
461		}
462		if (sc->unit > max)
463			max = sc->unit;
464	}
465	if (unit == -1)
466		unit = max + 1;
467	if (unit > DKMAXUNIT)
468		return (NULL);
469	MALLOC(sc, struct md_s *, sizeof(*sc), M_MD, M_WAITOK | M_ZERO);
470	sc->unit = unit;
471	LIST_INSERT_HEAD(&md_softc_list, sc, list);
472	/* XXX: UNLOCK(unique unit numbers) */
473	return (sc);
474}
475
476static void
477mdinit(struct md_s *sc)
478{
479
480	bioq_init(&sc->bio_queue);
481	devstat_add_entry(&sc->stats, MD_NAME, sc->unit, sc->secsize,
482		DEVSTAT_NO_ORDERED_TAGS,
483		DEVSTAT_TYPE_DIRECT | DEVSTAT_TYPE_IF_OTHER,
484		DEVSTAT_PRIORITY_OTHER);
485	sc->dev = disk_create(sc->unit, &sc->disk, 0, &md_cdevsw, &mddisk_cdevsw);
486	sc->dev->si_drv1 = sc;
487}
488
489/*
490 * XXX: we should check that the range they feed us is mapped.
491 * XXX: we should implement read-only.
492 */
493
494static int
495mdcreate_preload(struct md_ioctl *mdio)
496{
497	struct md_s *sc;
498
499	if (mdio->md_size == 0)
500		return (EINVAL);
501	if (mdio->md_options & ~(MD_AUTOUNIT))
502		return (EINVAL);
503	if (mdio->md_options & MD_AUTOUNIT) {
504		sc = mdnew(-1);
505		if (sc == NULL)
506			return (ENOMEM);
507		mdio->md_unit = sc->unit;
508	} else {
509		sc = mdnew(mdio->md_unit);
510		if (sc == NULL)
511			return (EBUSY);
512	}
513	sc->type = MD_PRELOAD;
514	sc->secsize = DEV_BSIZE;
515	sc->nsect = mdio->md_size;
516	sc->flags = mdio->md_options & MD_FORCE;
517	/* Cast to pointer size, then to pointer to avoid warning */
518	sc->pl_ptr = (u_char *)(uintptr_t)mdio->md_base;
519	sc->pl_len = (mdio->md_size << DEV_BSHIFT);
520	mdinit(sc);
521	return (0);
522}
523
524
525static int
526mdcreate_malloc(struct md_ioctl *mdio)
527{
528	struct md_s *sc;
529	unsigned u;
530
531	if (mdio->md_size == 0)
532		return (EINVAL);
533	if (mdio->md_options & ~(MD_AUTOUNIT | MD_COMPRESS | MD_RESERVE))
534		return (EINVAL);
535	/* Compression doesn't make sense if we have reserved space */
536	if (mdio->md_options & MD_RESERVE)
537		mdio->md_options &= ~MD_COMPRESS;
538	if (mdio->md_options & MD_AUTOUNIT) {
539		sc = mdnew(-1);
540		if (sc == NULL)
541			return (ENOMEM);
542		mdio->md_unit = sc->unit;
543	} else {
544		sc = mdnew(mdio->md_unit);
545		if (sc == NULL)
546			return (EBUSY);
547	}
548	sc->type = MD_MALLOC;
549	sc->secsize = DEV_BSIZE;
550	sc->nsect = mdio->md_size;
551	sc->flags = mdio->md_options & (MD_COMPRESS | MD_FORCE);
552	MALLOC(sc->secp, u_char **, sc->nsect * sizeof(u_char *), M_MD, M_WAITOK | M_ZERO);
553	if (mdio->md_options & MD_RESERVE) {
554		for (u = 0; u < sc->nsect; u++)
555			MALLOC(sc->secp[u], u_char *, DEV_BSIZE, M_MDSECT, M_WAITOK | M_ZERO);
556	}
557	printf("%s%d: Malloc disk\n", MD_NAME, sc->unit);
558	mdinit(sc);
559	return (0);
560}
561
562
563static int
564mdsetcred(struct md_s *sc, struct ucred *cred)
565{
566	char *tmpbuf;
567	int error = 0;
568
569	/*
570	 * Set credits in our softc
571	 */
572
573	if (sc->cred)
574		crfree(sc->cred);
575	sc->cred = crhold(cred);
576
577	/*
578	 * Horrible kludge to establish credentials for NFS  XXX.
579	 */
580
581	if (sc->vnode) {
582		struct uio auio;
583		struct iovec aiov;
584
585		tmpbuf = malloc(sc->secsize, M_TEMP, M_WAITOK);
586		bzero(&auio, sizeof(auio));
587
588		aiov.iov_base = tmpbuf;
589		aiov.iov_len = sc->secsize;
590		auio.uio_iov = &aiov;
591		auio.uio_iovcnt = 1;
592		auio.uio_offset = 0;
593		auio.uio_rw = UIO_READ;
594		auio.uio_segflg = UIO_SYSSPACE;
595		auio.uio_resid = aiov.iov_len;
596		vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread);
597		error = VOP_READ(sc->vnode, &auio, 0, sc->cred);
598		VOP_UNLOCK(sc->vnode, 0, curthread);
599		free(tmpbuf, M_TEMP);
600	}
601	return (error);
602}
603
604static int
605mdcreate_vnode(struct md_ioctl *mdio, struct thread *td)
606{
607	struct proc *p = td->td_proc;
608	struct md_s *sc;
609	struct vattr vattr;
610	struct nameidata nd;
611	int error, flags;
612
613	if (mdio->md_options & MD_AUTOUNIT) {
614		sc = mdnew(-1);
615		mdio->md_unit = sc->unit;
616	} else {
617		sc = mdnew(mdio->md_unit);
618	}
619	if (sc == NULL)
620		return (EBUSY);
621
622	sc->type = MD_VNODE;
623	sc->flags = mdio->md_options & MD_FORCE;
624
625	flags = FREAD|FWRITE;
626	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, mdio->md_file, td);
627	error = vn_open(&nd, &flags, 0);
628	if (error) {
629		if (error != EACCES && error != EPERM && error != EROFS)
630			return (error);
631		flags &= ~FWRITE;
632		sc->flags |= MD_READONLY;
633		NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, mdio->md_file, td);
634		error = vn_open(&nd, &flags, 0);
635		if (error)
636			return (error);
637	}
638	NDFREE(&nd, NDF_ONLY_PNBUF);
639	if (nd.ni_vp->v_type != VREG ||
640	    (error = VOP_GETATTR(nd.ni_vp, &vattr, p->p_ucred, td))) {
641		VOP_UNLOCK(nd.ni_vp, 0, td);
642		(void) vn_close(nd.ni_vp, flags, p->p_ucred, td);
643		return (error ? error : EINVAL);
644	}
645	VOP_UNLOCK(nd.ni_vp, 0, td);
646	sc->secsize = DEV_BSIZE;
647	sc->vnode = nd.ni_vp;
648
649	/*
650	 * If the size is specified, override the file attributes.
651	 */
652	if (mdio->md_size)
653		sc->nsect = mdio->md_size;
654	else
655		sc->nsect = vattr.va_size / sc->secsize; /* XXX: round up ? */
656	if (sc->nsect == 0) {
657		(void) vn_close(nd.ni_vp, flags, p->p_ucred, td);
658		return (EINVAL);
659	}
660	error = mdsetcred(sc, p->p_ucred);
661	if (error) {
662		(void) vn_close(nd.ni_vp, flags, p->p_ucred, td);
663		return (error);
664	}
665	mdinit(sc);
666	return (0);
667}
668
669static int
670mddestroy(struct md_s *sc, struct thread *td)
671{
672	unsigned u;
673
674	GIANT_REQUIRED;
675
676	if (sc->dev != NULL) {
677		devstat_remove_entry(&sc->stats);
678		disk_destroy(sc->dev);
679	}
680	if (sc->vnode != NULL)
681		(void)vn_close(sc->vnode, sc->flags & MD_READONLY ?
682		    FREAD : (FREAD|FWRITE), sc->cred, td);
683	if (sc->cred != NULL)
684		crfree(sc->cred);
685	if (sc->object != NULL) {
686		vm_pager_deallocate(sc->object);
687	}
688	if (sc->secp != NULL) {
689		for (u = 0; u < sc->nsect; u++)
690			if ((uintptr_t)sc->secp[u] > 255)
691				FREE(sc->secp[u], M_MDSECT);
692		FREE(sc->secp, M_MD);
693	}
694
695	/* XXX: LOCK(unique unit numbers) */
696	LIST_REMOVE(sc, list);
697	/* XXX: UNLOCK(unique unit numbers) */
698	FREE(sc, M_MD);
699	return (0);
700}
701
702static int
703mdcreate_swap(struct md_ioctl *mdio, struct thread *td)
704{
705	int error;
706	struct md_s *sc;
707
708	GIANT_REQUIRED;
709
710	if (mdio->md_options & MD_AUTOUNIT) {
711		sc = mdnew(-1);
712		mdio->md_unit = sc->unit;
713	} else {
714		sc = mdnew(mdio->md_unit);
715	}
716	if (sc == NULL)
717		return (EBUSY);
718
719	sc->type = MD_SWAP;
720
721	/*
722	 * Range check.  Disallow negative sizes or any size less then the
723	 * size of a page.  Then round to a page.
724	 */
725
726	if (mdio->md_size == 0) {
727		mddestroy(sc, td);
728		return (EDOM);
729	}
730
731	/*
732	 * Allocate an OBJT_SWAP object.
733	 *
734	 * sc_secsize is PAGE_SIZE'd
735	 *
736	 * mdio->size is in DEV_BSIZE'd chunks.
737	 * Note the truncation.
738	 */
739
740	sc->secsize = PAGE_SIZE;
741	sc->nsect = mdio->md_size / (PAGE_SIZE / DEV_BSIZE);
742	sc->object = vm_pager_allocate(OBJT_SWAP, NULL, sc->secsize * (vm_offset_t)sc->nsect, VM_PROT_DEFAULT, 0);
743	sc->flags = mdio->md_options & MD_FORCE;
744	if (mdio->md_options & MD_RESERVE) {
745		if (swap_pager_reserve(sc->object, 0, sc->nsect) < 0) {
746			vm_pager_deallocate(sc->object);
747			sc->object = NULL;
748			mddestroy(sc, td);
749			return (EDOM);
750		}
751	}
752	error = mdsetcred(sc, td->td_proc->p_ucred);
753	if (error)
754		mddestroy(sc, td);
755	else
756		mdinit(sc);
757	return (error);
758}
759
760static int
761mddetach(int unit, struct thread *td)
762{
763	struct md_s *sc;
764
765	sc = mdfind(unit);
766	if (sc == NULL)
767		return (ENOENT);
768	if (sc->opencount != 0 && !(sc->flags & MD_FORCE))
769		return (EBUSY);
770	switch(sc->type) {
771	case MD_VNODE:
772	case MD_SWAP:
773	case MD_MALLOC:
774	case MD_PRELOAD:
775		return (mddestroy(sc, td));
776	default:
777		return (EOPNOTSUPP);
778	}
779}
780
781static int
782mdctlioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct thread *td)
783{
784	struct md_ioctl *mdio;
785	struct md_s *sc;
786
787	if (md_debug)
788		printf("mdctlioctl(%s %lx %p %x %p)\n",
789			devtoname(dev), cmd, addr, flags, td);
790
791	/*
792	 * We assert the version number in the individual ioctl
793	 * handlers instead of out here because (a) it is possible we
794	 * may add another ioctl in the future which doesn't read an
795	 * mdio, and (b) the correct return value for an unknown ioctl
796	 * is ENOIOCTL, not EINVAL.
797	 */
798	mdio = (struct md_ioctl *)addr;
799	switch (cmd) {
800	case MDIOCATTACH:
801		if (mdio->md_version != MDIOVERSION)
802			return (EINVAL);
803		switch (mdio->md_type) {
804		case MD_MALLOC:
805			return (mdcreate_malloc(mdio));
806		case MD_PRELOAD:
807			return (mdcreate_preload(mdio));
808		case MD_VNODE:
809			return (mdcreate_vnode(mdio, td));
810		case MD_SWAP:
811			return (mdcreate_swap(mdio, td));
812		default:
813			return (EINVAL);
814		}
815	case MDIOCDETACH:
816		if (mdio->md_version != MDIOVERSION)
817			return (EINVAL);
818		if (mdio->md_file != NULL || mdio->md_size != 0 ||
819		    mdio->md_options != 0)
820			return (EINVAL);
821		return (mddetach(mdio->md_unit, td));
822	case MDIOCQUERY:
823		if (mdio->md_version != MDIOVERSION)
824			return (EINVAL);
825		sc = mdfind(mdio->md_unit);
826		if (sc == NULL)
827			return (ENOENT);
828		mdio->md_type = sc->type;
829		mdio->md_options = sc->flags;
830		switch (sc->type) {
831		case MD_MALLOC:
832			mdio->md_size = sc->nsect;
833			break;
834		case MD_PRELOAD:
835			mdio->md_size = sc->nsect;
836			(u_char *)(uintptr_t)mdio->md_base = sc->pl_ptr;
837			break;
838		case MD_SWAP:
839			mdio->md_size = sc->nsect * (PAGE_SIZE / DEV_BSIZE);
840			break;
841		case MD_VNODE:
842			mdio->md_size = sc->nsect;
843			/* XXX fill this in */
844			mdio->md_file = NULL;
845			break;
846		}
847		return (0);
848	default:
849		return (ENOIOCTL);
850	};
851	return (ENOIOCTL);
852}
853
854static void
855md_preloaded(u_char *image, unsigned length)
856{
857	struct md_s *sc;
858
859	sc = mdnew(-1);
860	if (sc == NULL)
861		return;
862	sc->type = MD_PRELOAD;
863	sc->secsize = DEV_BSIZE;
864	sc->nsect = length / DEV_BSIZE;
865	sc->pl_ptr = image;
866	sc->pl_len = length;
867	if (sc->unit == 0)
868		mdrootready = 1;
869	mdinit(sc);
870}
871
872static void
873md_drvinit(void *unused)
874{
875
876	caddr_t mod;
877	caddr_t c;
878	u_char *ptr, *name, *type;
879	unsigned len;
880
881#ifdef MD_ROOT_SIZE
882	md_preloaded(mfs_root, MD_ROOT_SIZE*1024);
883#endif
884	mod = NULL;
885	while ((mod = preload_search_next_name(mod)) != NULL) {
886		name = (char *)preload_search_info(mod, MODINFO_NAME);
887		type = (char *)preload_search_info(mod, MODINFO_TYPE);
888		if (name == NULL)
889			continue;
890		if (type == NULL)
891			continue;
892		if (strcmp(type, "md_image") && strcmp(type, "mfs_root"))
893			continue;
894		c = preload_search_info(mod, MODINFO_ADDR);
895		ptr = *(u_char **)c;
896		c = preload_search_info(mod, MODINFO_SIZE);
897		len = *(unsigned *)c;
898		printf("%s%d: Preloaded image <%s> %d bytes at %p\n",
899		    MD_NAME, mdunits, name, len, ptr);
900		md_preloaded(ptr, len);
901	}
902	status_dev = make_dev(&mdctl_cdevsw, 0xffff00ff, UID_ROOT, GID_WHEEL,
903	    0600, MDCTL_NAME);
904}
905
906static int
907md_modevent(module_t mod, int type, void *data)
908{
909	int error;
910	struct md_s *sc;
911
912	switch (type) {
913	case MOD_LOAD:
914		md_drvinit(NULL);
915		break;
916	case MOD_UNLOAD:
917		LIST_FOREACH(sc, &md_softc_list, list) {
918			error = mddetach(sc->unit, curthread);
919			if (error != 0)
920				return (error);
921		}
922		if (status_dev)
923			destroy_dev(status_dev);
924		status_dev = 0;
925		break;
926	default:
927		break;
928	}
929	return (0);
930}
931
932static moduledata_t md_mod = {
933	MD_NAME,
934	md_modevent,
935	NULL
936};
937DECLARE_MODULE(md, md_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE+CDEV_MAJOR);
938MODULE_VERSION(md, MD_MODVER);
939
940
#ifdef MD_ROOT
/*
 * Run at SI_SUB_MOUNT_ROOT: when unit 0 was created from a preloaded
 * image (mdrootready is set), put it first in the root device list.
 */
static void
md_takeroot(void *junk)
{
	if (!mdrootready)
		return;
	rootdevnames[0] = "ufs:/dev/md0c";
}

SYSINIT(md_root, SI_SUB_MOUNT_ROOT, SI_ORDER_FIRST, md_takeroot, NULL);
#endif
951