md.c revision 112555
1/*
2 * ----------------------------------------------------------------------------
3 * "THE BEER-WARE LICENSE" (Revision 42):
4 * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
5 * can do whatever you want with this stuff. If we meet some day, and you think
6 * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
7 * ----------------------------------------------------------------------------
8 *
9 * $FreeBSD: head/sys/dev/md/md.c 112555 2003-03-24 19:46:26Z phk $
10 *
11 */
12
13/*
 * The following functions are based on the vn(4) driver: mdstart_swap(),
15 * mdstart_vnode(), mdcreate_swap(), mdcreate_vnode() and mddestroy(),
16 * and as such under the following copyright:
17 *
18 * Copyright (c) 1988 University of Utah.
19 * Copyright (c) 1990, 1993
20 *	The Regents of the University of California.  All rights reserved.
21 *
22 * This code is derived from software contributed to Berkeley by
23 * the Systems Programming Group of the University of Utah Computer
24 * Science Department.
25 *
26 * Redistribution and use in source and binary forms, with or without
27 * modification, are permitted provided that the following conditions
28 * are met:
29 * 1. Redistributions of source code must retain the above copyright
30 *    notice, this list of conditions and the following disclaimer.
31 * 2. Redistributions in binary form must reproduce the above copyright
32 *    notice, this list of conditions and the following disclaimer in the
33 *    documentation and/or other materials provided with the distribution.
34 * 3. All advertising materials mentioning features or use of this software
35 *    must display the following acknowledgement:
36 *	This product includes software developed by the University of
37 *	California, Berkeley and its contributors.
38 * 4. Neither the name of the University nor the names of its contributors
39 *    may be used to endorse or promote products derived from this software
40 *    without specific prior written permission.
41 *
42 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52 * SUCH DAMAGE.
53 *
54 * from: Utah Hdr: vn.c 1.13 94/04/02
55 *
56 *	from: @(#)vn.c	8.6 (Berkeley) 4/1/94
57 * From: src/sys/dev/vn/vn.c,v 1.122 2000/12/16 16:06:03
58 */
59
60#include "opt_geom.h"
61#include "opt_md.h"
62
63#include <sys/param.h>
64#include <sys/systm.h>
65#include <sys/bio.h>
66#include <sys/conf.h>
67#include <sys/disk.h>
68#include <sys/fcntl.h>
69#include <sys/kernel.h>
70#include <sys/kthread.h>
71#include <sys/linker.h>
72#include <sys/lock.h>
73#include <sys/malloc.h>
74#include <sys/mdioctl.h>
75#include <sys/mutex.h>
76#include <sys/namei.h>
77#include <sys/proc.h>
78#include <sys/queue.h>
79#include <sys/sysctl.h>
80#include <sys/vnode.h>
81
82#include <geom/geom.h>
83
84#include <vm/vm.h>
85#include <vm/vm_object.h>
86#include <vm/vm_page.h>
87#include <vm/vm_pager.h>
88#include <vm/swap_pager.h>
89#include <vm/uma.h>
90
91#define MD_MODVER 1
92
93#define MD_SHUTDOWN 0x10000	/* Tell worker thread to terminate. */
94
95#ifndef MD_NSECT
96#define MD_NSECT (10000 * 2)
97#endif
98
99static MALLOC_DEFINE(M_MD, "MD disk", "Memory Disk");
100static MALLOC_DEFINE(M_MDSECT, "MD sectors", "Memory Disk Sectors");
101
102static int md_debug;
103SYSCTL_INT(_debug, OID_AUTO, mddebug, CTLFLAG_RW, &md_debug, 0, "");
104
105#if defined(MD_ROOT) && defined(MD_ROOT_SIZE)
106/* Image gets put here: */
107static u_char mfs_root[MD_ROOT_SIZE*1024] = "MFS Filesystem goes here";
108static u_char end_mfs_root[] __unused = "MFS Filesystem had better STOP here";
109#endif
110
111static int	mdrootready;
112static int	mdunits;
113static dev_t	status_dev = 0;
114
115#define CDEV_MAJOR	95
116
117static d_ioctl_t mdctlioctl;
118
119static struct cdevsw mdctl_cdevsw = {
120	.d_open =	nullopen,
121	.d_close =	nullclose,
122	.d_ioctl =	mdctlioctl,
123	.d_name =	MD_NAME,
124	.d_maj =	CDEV_MAJOR
125};
126
127
128static LIST_HEAD(, md_s) md_softc_list = LIST_HEAD_INITIALIZER(&md_softc_list);
129
130#define NINDIR	(PAGE_SIZE / sizeof(uintptr_t))
131#define NMASK	(NINDIR-1)
132static int nshift;
133
134struct indir {
135	uintptr_t	*array;
136	uint		total;
137	uint		used;
138	uint		shift;
139};
140
141struct md_s {
142	int unit;
143	LIST_ENTRY(md_s) list;
144	struct bio_queue_head bio_queue;
145	struct mtx queue_mtx;
146	struct disk disk;
147	dev_t dev;
148	enum md_types type;
149	unsigned nsect;
150	unsigned opencount;
151	unsigned secsize;
152	unsigned flags;
153	char name[20];
154	struct proc *procp;
155	struct g_geom *gp;
156	struct g_provider *pp;
157
158	/* MD_MALLOC related fields */
159	struct indir *indir;
160	uma_zone_t uma;
161
162	/* MD_PRELOAD related fields */
163	u_char *pl_ptr;
164	unsigned pl_len;
165
166	/* MD_VNODE related fields */
167	struct vnode *vnode;
168	struct ucred *cred;
169
170	/* MD_SWAP related fields */
171	vm_object_t object;
172};
173
174static int mddestroy(struct md_s *sc, struct thread *td);
175
176static struct indir *
177new_indir(uint shift)
178{
179	struct indir *ip;
180
181	ip = malloc(sizeof *ip, M_MD, M_NOWAIT | M_ZERO);
182	if (ip == NULL)
183		return (NULL);
184	ip->array = malloc(sizeof(uintptr_t) * NINDIR,
185	    M_MDSECT, M_NOWAIT | M_ZERO);
186	if (ip->array == NULL) {
187		free(ip, M_MD);
188		return (NULL);
189	}
190	ip->total = NINDIR;
191	ip->shift = shift;
192	return (ip);
193}
194
195static void
196del_indir(struct indir *ip)
197{
198
199	free(ip->array, M_MDSECT);
200	free(ip, M_MD);
201}
202
203static void
204destroy_indir(struct md_s *sc, struct indir *ip)
205{
206	int i;
207
208	for (i = 0; i < NINDIR; i++) {
209		if (!ip->array[i])
210			continue;
211		if (ip->shift)
212			destroy_indir(sc, (struct indir*)(ip->array[i]));
213		else if (ip->array[i] > 255)
214			uma_zfree(sc->uma, (void *)(ip->array[i]));
215	}
216	del_indir(ip);
217}
218
/*
 * This function does the math and allocates the top level "indir" structure
 * for a device of "size" sectors.
222 */
223
224static struct indir *
225dimension(off_t size)
226{
227	off_t rcnt;
228	struct indir *ip;
229	int i, layer;
230
231	rcnt = size;
232	layer = 0;
233	while (rcnt > NINDIR) {
234		rcnt /= NINDIR;
235		layer++;
236	}
237	/* figure out log2(NINDIR) */
238	for (i = NINDIR, nshift = -1; i; nshift++)
239		i >>= 1;
240
241	/*
242	 * XXX: the top layer is probably not fully populated, so we allocate
243	 * too much space for ip->array in new_indir() here.
244	 */
245	ip = new_indir(layer * nshift);
246	return (ip);
247}
248
249/*
250 * Read a given sector
251 */
252
253static uintptr_t
254s_read(struct indir *ip, off_t offset)
255{
256	struct indir *cip;
257	int idx;
258	uintptr_t up;
259
260	if (md_debug > 1)
261		printf("s_read(%jd)\n", (intmax_t)offset);
262	up = 0;
263	for (cip = ip; cip != NULL;) {
264		if (cip->shift) {
265			idx = (offset >> cip->shift) & NMASK;
266			up = cip->array[idx];
267			cip = (struct indir *)up;
268			continue;
269		}
270		idx = offset & NMASK;
271		return (cip->array[idx]);
272	}
273	return (0);
274}
275
276/*
277 * Write a given sector, prune the tree if the value is 0
278 */
279
280static int
281s_write(struct indir *ip, off_t offset, uintptr_t ptr)
282{
283	struct indir *cip, *lip[10];
284	int idx, li;
285	uintptr_t up;
286
287	if (md_debug > 1)
288		printf("s_write(%jd, %p)\n", (intmax_t)offset, (void *)ptr);
289	up = 0;
290	li = 0;
291	cip = ip;
292	for (;;) {
293		lip[li++] = cip;
294		if (cip->shift) {
295			idx = (offset >> cip->shift) & NMASK;
296			up = cip->array[idx];
297			if (up != 0) {
298				cip = (struct indir *)up;
299				continue;
300			}
301			/* Allocate branch */
302			cip->array[idx] =
303			    (uintptr_t)new_indir(cip->shift - nshift);
304			if (cip->array[idx] == 0)
305				return (ENOSPC);
306			cip->used++;
307			up = cip->array[idx];
308			cip = (struct indir *)up;
309			continue;
310		}
311		/* leafnode */
312		idx = offset & NMASK;
313		up = cip->array[idx];
314		if (up != 0)
315			cip->used--;
316		cip->array[idx] = ptr;
317		if (ptr != 0)
318			cip->used++;
319		break;
320	}
321	if (cip->used != 0 || li == 1)
322		return (0);
323	li--;
324	while (cip->used == 0 && cip != ip) {
325		li--;
326		idx = (offset >> lip[li]->shift) & NMASK;
327		up = lip[li]->array[idx];
328		KASSERT(up == (uintptr_t)cip, ("md screwed up"));
329		del_indir(cip);
330		lip[li]->array[idx] = 0;
331		lip[li]->used--;
332		cip = lip[li];
333	}
334	return (0);
335}
336
337
338struct g_class g_md_class = {
339	.name = "MD",
340	G_CLASS_INITIALIZER
341
342};
343
344static int
345g_md_access(struct g_provider *pp, int r, int w, int e)
346{
347	struct md_s *sc;
348
349	sc = pp->geom->softc;
350	r += pp->acr;
351	w += pp->acw;
352	e += pp->ace;
353	if ((pp->acr + pp->acw + pp->ace) == 0 && (r + w + e) > 0) {
354		sc->opencount = 1;
355	} else if ((pp->acr + pp->acw + pp->ace) > 0 && (r + w + e) == 0) {
356		sc->opencount = 0;
357	}
358	return (0);
359}
360
361static void
362g_md_start(struct bio *bp)
363{
364	struct md_s *sc;
365
366	sc = bp->bio_to->geom->softc;
367
368	switch(bp->bio_cmd) {
369	case BIO_GETATTR:
370	case BIO_SETATTR:
371		g_io_deliver(bp, EOPNOTSUPP);
372		return;
373	}
374	bp->bio_blkno = bp->bio_offset >> DEV_BSHIFT;
375	bp->bio_pblkno = bp->bio_offset / sc->secsize;
376	bp->bio_bcount = bp->bio_length;
377	mtx_lock(&sc->queue_mtx);
378	bioqdisksort(&sc->bio_queue, bp);
379	mtx_unlock(&sc->queue_mtx);
380
381	wakeup(sc);
382}
383
/* Register the MD class with GEOM. */
DECLARE_GEOM_CLASS(g_md_class, g_md);
385
386
387static int
388mdstart_malloc(struct md_s *sc, struct bio *bp)
389{
390	int i, error;
391	u_char *dst;
392	unsigned secno, nsec, uc;
393	uintptr_t sp, osp;
394
395	nsec = bp->bio_bcount / sc->secsize;
396	secno = bp->bio_pblkno;
397	dst = bp->bio_data;
398	error = 0;
399	while (nsec--) {
400		osp = s_read(sc->indir, secno);
401		if (bp->bio_cmd == BIO_DELETE) {
402			if (osp != 0)
403				error = s_write(sc->indir, secno, 0);
404		} else if (bp->bio_cmd == BIO_READ) {
405			if (osp == 0)
406				bzero(dst, sc->secsize);
407			else if (osp <= 255)
408				for (i = 0; i < sc->secsize; i++)
409					dst[i] = osp;
410			else
411				bcopy((void *)osp, dst, sc->secsize);
412			osp = 0;
413		} else if (bp->bio_cmd == BIO_WRITE) {
414			if (sc->flags & MD_COMPRESS) {
415				uc = dst[0];
416				for (i = 1; i < sc->secsize; i++)
417					if (dst[i] != uc)
418						break;
419			} else {
420				i = 0;
421				uc = 0;
422			}
423			if (i == sc->secsize) {
424				if (osp != uc)
425					error = s_write(sc->indir, secno, uc);
426			} else {
427				if (osp <= 255) {
428					sp = (uintptr_t) uma_zalloc(
429					    sc->uma, M_NOWAIT);
430					if (sp == 0) {
431						error = ENOSPC;
432						break;
433					}
434					bcopy(dst, (void *)sp, sc->secsize);
435					error = s_write(sc->indir, secno, sp);
436				} else {
437					bcopy(dst, (void *)osp, sc->secsize);
438					osp = 0;
439				}
440			}
441		} else {
442			error = EOPNOTSUPP;
443		}
444		if (osp > 255)
445			uma_zfree(sc->uma, (void*)osp);
446		if (error)
447			break;
448		secno++;
449		dst += sc->secsize;
450	}
451	bp->bio_resid = 0;
452	return (error);
453}
454
455static int
456mdstart_preload(struct md_s *sc, struct bio *bp)
457{
458
459	if (bp->bio_cmd == BIO_DELETE) {
460	} else if (bp->bio_cmd == BIO_READ) {
461		bcopy(sc->pl_ptr + (bp->bio_pblkno << DEV_BSHIFT), bp->bio_data, bp->bio_bcount);
462	} else {
463		bcopy(bp->bio_data, sc->pl_ptr + (bp->bio_pblkno << DEV_BSHIFT), bp->bio_bcount);
464	}
465	bp->bio_resid = 0;
466	return (0);
467}
468
469static int
470mdstart_vnode(struct md_s *sc, struct bio *bp)
471{
472	int error;
473	struct uio auio;
474	struct iovec aiov;
475	struct mount *mp;
476
477	/*
478	 * VNODE I/O
479	 *
480	 * If an error occurs, we set BIO_ERROR but we do not set
481	 * B_INVAL because (for a write anyway), the buffer is
482	 * still valid.
483	 */
484
485	bzero(&auio, sizeof(auio));
486
487	aiov.iov_base = bp->bio_data;
488	aiov.iov_len = bp->bio_bcount;
489	auio.uio_iov = &aiov;
490	auio.uio_iovcnt = 1;
491	auio.uio_offset = (vm_ooffset_t)bp->bio_pblkno * sc->secsize;
492	auio.uio_segflg = UIO_SYSSPACE;
493	if(bp->bio_cmd == BIO_READ)
494		auio.uio_rw = UIO_READ;
495	else
496		auio.uio_rw = UIO_WRITE;
497	auio.uio_resid = bp->bio_bcount;
498	auio.uio_td = curthread;
499	/*
500	 * When reading set IO_DIRECT to try to avoid double-caching
501	 * the data.  When writing IO_DIRECT is not optimal, but we
502	 * must set IO_NOWDRAIN to avoid a wdrain deadlock.
503	 */
504	if (bp->bio_cmd == BIO_READ) {
505		vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread);
506		error = VOP_READ(sc->vnode, &auio, IO_DIRECT, sc->cred);
507	} else {
508		(void) vn_start_write(sc->vnode, &mp, V_WAIT);
509		vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread);
510		error = VOP_WRITE(sc->vnode, &auio, IO_NOWDRAIN, sc->cred);
511		vn_finished_write(mp);
512	}
513	VOP_UNLOCK(sc->vnode, 0, curthread);
514	bp->bio_resid = auio.uio_resid;
515	return (error);
516}
517
518static void
519mddone_swap(struct bio *bp)
520{
521
522	bp->bio_completed = bp->bio_length - bp->bio_resid;
523	g_std_done(bp);
524}
525
526static int
527mdstart_swap(struct md_s *sc, struct bio *bp)
528{
529	{
530	struct bio *bp2;
531
532	bp2 = g_clone_bio(bp);
533	bp2->bio_done = mddone_swap;
534	bp2->bio_blkno = bp2->bio_offset >> DEV_BSHIFT;
535	bp2->bio_pblkno = bp2->bio_offset / sc->secsize;
536	bp2->bio_bcount = bp2->bio_length;
537	bp = bp2;
538	}
539
540	bp->bio_resid = 0;
541	if ((bp->bio_cmd == BIO_DELETE) && (sc->flags & MD_RESERVE))
542		biodone(bp);
543	else
544		vm_pager_strategy(sc->object, bp);
545	return (-1);
546}
547
548static void
549md_kthread(void *arg)
550{
551	struct md_s *sc;
552	struct bio *bp;
553	int error, hasgiant;
554
555	sc = arg;
556	curthread->td_base_pri = PRIBIO;
557
558	switch (sc->type) {
559	case MD_SWAP:
560	case MD_VNODE:
561		mtx_lock(&Giant);
562		hasgiant = 1;
563		break;
564	case MD_MALLOC:
565	case MD_PRELOAD:
566	default:
567		hasgiant = 0;
568		break;
569	}
570
571	for (;;) {
572		mtx_lock(&sc->queue_mtx);
573		bp = bioq_first(&sc->bio_queue);
574		if (bp)
575			bioq_remove(&sc->bio_queue, bp);
576		if (!bp) {
577			if (sc->flags & MD_SHUTDOWN) {
578				mtx_unlock(&sc->queue_mtx);
579				sc->procp = NULL;
580				wakeup(&sc->procp);
581				if (!hasgiant)
582					mtx_lock(&Giant);
583				kthread_exit(0);
584			}
585			msleep(sc, &sc->queue_mtx, PRIBIO | PDROP, "mdwait", 0);
586			continue;
587		}
588		mtx_unlock(&sc->queue_mtx);
589
590		switch (sc->type) {
591		case MD_MALLOC:
592			error = mdstart_malloc(sc, bp);
593			break;
594		case MD_PRELOAD:
595			error = mdstart_preload(sc, bp);
596			break;
597		case MD_VNODE:
598			error = mdstart_vnode(sc, bp);
599			break;
600		case MD_SWAP:
601			error = mdstart_swap(sc, bp);
602			break;
603		default:
604			panic("Impossible md(type)");
605			break;
606		}
607
608		if (error != -1) {
609			bp->bio_completed = bp->bio_length;
610			g_io_deliver(bp, error);
611		}
612	}
613}
614
615static struct md_s *
616mdfind(int unit)
617{
618	struct md_s *sc;
619
620	/* XXX: LOCK(unique unit numbers) */
621	LIST_FOREACH(sc, &md_softc_list, list) {
622		if (sc->unit == unit)
623			break;
624	}
625	/* XXX: UNLOCK(unique unit numbers) */
626	return (sc);
627}
628
629static struct md_s *
630mdnew(int unit)
631{
632	struct md_s *sc;
633	int error, max = -1;
634
635	/* XXX: LOCK(unique unit numbers) */
636	LIST_FOREACH(sc, &md_softc_list, list) {
637		if (sc->unit == unit) {
638			/* XXX: UNLOCK(unique unit numbers) */
639			return (NULL);
640		}
641		if (sc->unit > max)
642			max = sc->unit;
643	}
644	if (unit == -1)
645		unit = max + 1;
646	if (unit > 255)
647		return (NULL);
648	sc = (struct md_s *)malloc(sizeof *sc, M_MD, M_WAITOK | M_ZERO);
649	sc->unit = unit;
650	bioq_init(&sc->bio_queue);
651	mtx_init(&sc->queue_mtx, "md bio queue", NULL, MTX_DEF);
652	sprintf(sc->name, "md%d", unit);
653	error = kthread_create(md_kthread, sc, &sc->procp, 0, 0,"%s", sc->name);
654	if (error) {
655		free(sc, M_MD);
656		return (NULL);
657	}
658	LIST_INSERT_HEAD(&md_softc_list, sc, list);
659	/* XXX: UNLOCK(unique unit numbers) */
660	return (sc);
661}
662
663static void
664mdinit(struct md_s *sc)
665{
666
667	struct g_geom *gp;
668	struct g_provider *pp;
669
670	DROP_GIANT();
671	g_topology_lock();
672	gp = g_new_geomf(&g_md_class, "md%d", sc->unit);
673	gp->start = g_md_start;
674	gp->access = g_md_access;
675	gp->softc = sc;
676	pp = g_new_providerf(gp, "md%d", sc->unit);
677	pp->mediasize = (off_t)sc->nsect * sc->secsize;
678	pp->sectorsize = sc->secsize;
679	sc->gp = gp;
680	sc->pp = pp;
681	g_error_provider(pp, 0);
682	g_topology_unlock();
683	PICKUP_GIANT();
684}
685
686/*
687 * XXX: we should check that the range they feed us is mapped.
688 * XXX: we should implement read-only.
689 */
690
691static int
692mdcreate_preload(struct md_ioctl *mdio)
693{
694	struct md_s *sc;
695
696	if (mdio->md_size == 0)
697		return (EINVAL);
698	if (mdio->md_options & ~(MD_AUTOUNIT))
699		return (EINVAL);
700	if (mdio->md_options & MD_AUTOUNIT) {
701		sc = mdnew(-1);
702		if (sc == NULL)
703			return (ENOMEM);
704		mdio->md_unit = sc->unit;
705	} else {
706		sc = mdnew(mdio->md_unit);
707		if (sc == NULL)
708			return (EBUSY);
709	}
710	sc->type = MD_PRELOAD;
711	sc->secsize = DEV_BSIZE;
712	sc->nsect = mdio->md_size;
713	sc->flags = mdio->md_options & MD_FORCE;
714	/* Cast to pointer size, then to pointer to avoid warning */
715	sc->pl_ptr = (u_char *)(uintptr_t)mdio->md_base;
716	sc->pl_len = (mdio->md_size << DEV_BSHIFT);
717	mdinit(sc);
718	return (0);
719}
720
721
722static int
723mdcreate_malloc(struct md_ioctl *mdio)
724{
725	struct md_s *sc;
726	off_t u;
727	uintptr_t sp;
728	int error;
729
730	error = 0;
731	if (mdio->md_size == 0)
732		return (EINVAL);
733	if (mdio->md_options & ~(MD_AUTOUNIT | MD_COMPRESS | MD_RESERVE))
734		return (EINVAL);
735	if (mdio->md_secsize != 0 && !powerof2(mdio->md_secsize))
736		return (EINVAL);
737	/* Compression doesn't make sense if we have reserved space */
738	if (mdio->md_options & MD_RESERVE)
739		mdio->md_options &= ~MD_COMPRESS;
740	if (mdio->md_options & MD_AUTOUNIT) {
741		sc = mdnew(-1);
742		if (sc == NULL)
743			return (ENOMEM);
744		mdio->md_unit = sc->unit;
745	} else {
746		sc = mdnew(mdio->md_unit);
747		if (sc == NULL)
748			return (EBUSY);
749	}
750	sc->type = MD_MALLOC;
751	if (mdio->md_secsize != 0)
752		sc->secsize = mdio->md_secsize;
753	else
754		sc->secsize = DEV_BSIZE;
755	sc->nsect = mdio->md_size;
756	sc->nsect /= (sc->secsize / DEV_BSIZE);
757	sc->flags = mdio->md_options & (MD_COMPRESS | MD_FORCE);
758	sc->indir = dimension(sc->nsect);
759	sc->uma = uma_zcreate(sc->name, sc->secsize,
760	    NULL, NULL, NULL, NULL, 0x1ff, 0);
761	if (mdio->md_options & MD_RESERVE) {
762		for (u = 0; u < sc->nsect; u++) {
763			sp = (uintptr_t) uma_zalloc(sc->uma, M_NOWAIT | M_ZERO);
764			if (sp != 0)
765				error = s_write(sc->indir, u, sp);
766			else
767				error = ENOMEM;
768			if (error)
769				break;
770		}
771	}
772	if (error)  {
773		mddestroy(sc, NULL);
774		return (error);
775	}
776	mdinit(sc);
777	if (!(mdio->md_options & MD_RESERVE))
778		sc->pp->flags |= G_PF_CANDELETE;
779	return (0);
780}
781
782
783static int
784mdsetcred(struct md_s *sc, struct ucred *cred)
785{
786	char *tmpbuf;
787	int error = 0;
788
789	/*
790	 * Set credits in our softc
791	 */
792
793	if (sc->cred)
794		crfree(sc->cred);
795	sc->cred = crhold(cred);
796
797	/*
798	 * Horrible kludge to establish credentials for NFS  XXX.
799	 */
800
801	if (sc->vnode) {
802		struct uio auio;
803		struct iovec aiov;
804
805		tmpbuf = malloc(sc->secsize, M_TEMP, M_WAITOK);
806		bzero(&auio, sizeof(auio));
807
808		aiov.iov_base = tmpbuf;
809		aiov.iov_len = sc->secsize;
810		auio.uio_iov = &aiov;
811		auio.uio_iovcnt = 1;
812		auio.uio_offset = 0;
813		auio.uio_rw = UIO_READ;
814		auio.uio_segflg = UIO_SYSSPACE;
815		auio.uio_resid = aiov.iov_len;
816		vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread);
817		error = VOP_READ(sc->vnode, &auio, 0, sc->cred);
818		VOP_UNLOCK(sc->vnode, 0, curthread);
819		free(tmpbuf, M_TEMP);
820	}
821	return (error);
822}
823
824static int
825mdcreate_vnode(struct md_ioctl *mdio, struct thread *td)
826{
827	struct md_s *sc;
828	struct vattr vattr;
829	struct nameidata nd;
830	int error, flags;
831
832	flags = FREAD|FWRITE;
833	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, mdio->md_file, td);
834	error = vn_open(&nd, &flags, 0);
835	if (error) {
836		if (error != EACCES && error != EPERM && error != EROFS)
837			return (error);
838		flags &= ~FWRITE;
839		NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, mdio->md_file, td);
840		error = vn_open(&nd, &flags, 0);
841		if (error)
842			return (error);
843	}
844	NDFREE(&nd, NDF_ONLY_PNBUF);
845	if (nd.ni_vp->v_type != VREG ||
846	    (error = VOP_GETATTR(nd.ni_vp, &vattr, td->td_ucred, td))) {
847		VOP_UNLOCK(nd.ni_vp, 0, td);
848		(void) vn_close(nd.ni_vp, flags, td->td_ucred, td);
849		return (error ? error : EINVAL);
850	}
851	VOP_UNLOCK(nd.ni_vp, 0, td);
852
853	if (mdio->md_options & MD_AUTOUNIT) {
854		sc = mdnew(-1);
855		mdio->md_unit = sc->unit;
856	} else {
857		sc = mdnew(mdio->md_unit);
858	}
859	if (sc == NULL) {
860		(void) vn_close(nd.ni_vp, flags, td->td_ucred, td);
861		return (EBUSY);
862	}
863
864	sc->type = MD_VNODE;
865	sc->flags = mdio->md_options & MD_FORCE;
866	if (!(flags & FWRITE))
867		sc->flags |= MD_READONLY;
868	sc->secsize = DEV_BSIZE;
869	sc->vnode = nd.ni_vp;
870
871	/*
872	 * If the size is specified, override the file attributes.
873	 */
874	if (mdio->md_size)
875		sc->nsect = mdio->md_size;
876	else
877		sc->nsect = vattr.va_size / sc->secsize; /* XXX: round up ? */
878	if (sc->nsect == 0) {
879		mddestroy(sc, td);
880		return (EINVAL);
881	}
882	error = mdsetcred(sc, td->td_ucred);
883	if (error) {
884		mddestroy(sc, td);
885		return (error);
886	}
887	mdinit(sc);
888	return (0);
889}
890
891static int
892mddestroy(struct md_s *sc, struct thread *td)
893{
894
895	GIANT_REQUIRED;
896
897	mtx_destroy(&sc->queue_mtx);
898	if (sc->gp) {
899		sc->gp->flags |= G_GEOM_WITHER;
900		sc->gp->softc = NULL;
901	}
902	if (sc->pp)
903		g_orphan_provider(sc->pp, ENXIO);
904	sc->flags |= MD_SHUTDOWN;
905	wakeup(sc);
906	while (sc->procp != NULL)
907		tsleep(&sc->procp, PRIBIO, "mddestroy", hz / 10);
908	if (sc->vnode != NULL)
909		(void)vn_close(sc->vnode, sc->flags & MD_READONLY ?
910		    FREAD : (FREAD|FWRITE), sc->cred, td);
911	if (sc->cred != NULL)
912		crfree(sc->cred);
913	if (sc->object != NULL) {
914		vm_pager_deallocate(sc->object);
915	}
916	if (sc->indir)
917		destroy_indir(sc, sc->indir);
918	if (sc->uma)
919		uma_zdestroy(sc->uma);
920
921	/* XXX: LOCK(unique unit numbers) */
922	LIST_REMOVE(sc, list);
923	/* XXX: UNLOCK(unique unit numbers) */
924	free(sc, M_MD);
925	return (0);
926}
927
928static int
929mdcreate_swap(struct md_ioctl *mdio, struct thread *td)
930{
931	int error;
932	struct md_s *sc;
933
934	GIANT_REQUIRED;
935
936	if (mdio->md_options & MD_AUTOUNIT) {
937		sc = mdnew(-1);
938		mdio->md_unit = sc->unit;
939	} else {
940		sc = mdnew(mdio->md_unit);
941	}
942	if (sc == NULL)
943		return (EBUSY);
944
945	sc->type = MD_SWAP;
946
947	/*
948	 * Range check.  Disallow negative sizes or any size less then the
949	 * size of a page.  Then round to a page.
950	 */
951
952	if (mdio->md_size == 0) {
953		mddestroy(sc, td);
954		return (EDOM);
955	}
956
957	/*
958	 * Allocate an OBJT_SWAP object.
959	 *
960	 * sc_secsize is PAGE_SIZE'd
961	 *
962	 * mdio->size is in DEV_BSIZE'd chunks.
963	 * Note the truncation.
964	 */
965
966	sc->secsize = PAGE_SIZE;
967	sc->nsect = mdio->md_size / (PAGE_SIZE / DEV_BSIZE);
968	sc->object = vm_pager_allocate(OBJT_SWAP, NULL, sc->secsize * (vm_offset_t)sc->nsect, VM_PROT_DEFAULT, 0);
969	sc->flags = mdio->md_options & MD_FORCE;
970	if (mdio->md_options & MD_RESERVE) {
971		if (swap_pager_reserve(sc->object, 0, sc->nsect) < 0) {
972			vm_pager_deallocate(sc->object);
973			sc->object = NULL;
974			mddestroy(sc, td);
975			return (EDOM);
976		}
977	}
978	error = mdsetcred(sc, td->td_ucred);
979	if (error) {
980		mddestroy(sc, td);
981		return (error);
982	}
983	mdinit(sc);
984	if (!(mdio->md_options & MD_RESERVE))
985		sc->pp->flags |= G_PF_CANDELETE;
986	return (0);
987}
988
989static int
990mddetach(int unit, struct thread *td)
991{
992	struct md_s *sc;
993
994	sc = mdfind(unit);
995	if (sc == NULL)
996		return (ENOENT);
997	if (sc->opencount != 0 && !(sc->flags & MD_FORCE))
998		return (EBUSY);
999	switch(sc->type) {
1000	case MD_VNODE:
1001	case MD_SWAP:
1002	case MD_MALLOC:
1003	case MD_PRELOAD:
1004		return (mddestroy(sc, td));
1005	default:
1006		return (EOPNOTSUPP);
1007	}
1008}
1009
1010static int
1011mdctlioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct thread *td)
1012{
1013	struct md_ioctl *mdio;
1014	struct md_s *sc;
1015	int i;
1016
1017	if (md_debug)
1018		printf("mdctlioctl(%s %lx %p %x %p)\n",
1019			devtoname(dev), cmd, addr, flags, td);
1020
1021	/*
1022	 * We assert the version number in the individual ioctl
1023	 * handlers instead of out here because (a) it is possible we
1024	 * may add another ioctl in the future which doesn't read an
1025	 * mdio, and (b) the correct return value for an unknown ioctl
1026	 * is ENOIOCTL, not EINVAL.
1027	 */
1028	mdio = (struct md_ioctl *)addr;
1029	switch (cmd) {
1030	case MDIOCATTACH:
1031		if (mdio->md_version != MDIOVERSION)
1032			return (EINVAL);
1033		switch (mdio->md_type) {
1034		case MD_MALLOC:
1035			return (mdcreate_malloc(mdio));
1036		case MD_PRELOAD:
1037			return (mdcreate_preload(mdio));
1038		case MD_VNODE:
1039			return (mdcreate_vnode(mdio, td));
1040		case MD_SWAP:
1041			return (mdcreate_swap(mdio, td));
1042		default:
1043			return (EINVAL);
1044		}
1045	case MDIOCDETACH:
1046		if (mdio->md_version != MDIOVERSION)
1047			return (EINVAL);
1048		if (mdio->md_file != NULL || mdio->md_size != 0 ||
1049		    mdio->md_options != 0)
1050			return (EINVAL);
1051		return (mddetach(mdio->md_unit, td));
1052	case MDIOCQUERY:
1053		if (mdio->md_version != MDIOVERSION)
1054			return (EINVAL);
1055		sc = mdfind(mdio->md_unit);
1056		if (sc == NULL)
1057			return (ENOENT);
1058		mdio->md_type = sc->type;
1059		mdio->md_options = sc->flags;
1060		switch (sc->type) {
1061		case MD_MALLOC:
1062			mdio->md_size = sc->nsect;
1063			break;
1064		case MD_PRELOAD:
1065			mdio->md_size = sc->nsect;
1066			mdio->md_base = (uint64_t)(intptr_t)sc->pl_ptr;
1067			break;
1068		case MD_SWAP:
1069			mdio->md_size = sc->nsect * (PAGE_SIZE / DEV_BSIZE);
1070			break;
1071		case MD_VNODE:
1072			mdio->md_size = sc->nsect;
1073			/* XXX fill this in */
1074			mdio->md_file = NULL;
1075			break;
1076		}
1077		return (0);
1078	case MDIOCLIST:
1079		i = 1;
1080		LIST_FOREACH(sc, &md_softc_list, list) {
1081			if (i == MDNPAD - 1)
1082				mdio->md_pad[i] = -1;
1083			else
1084				mdio->md_pad[i++] = sc->unit;
1085		}
1086		mdio->md_pad[0] = i - 1;
1087		return (0);
1088	default:
1089		return (ENOIOCTL);
1090	};
1091	return (ENOIOCTL);
1092}
1093
1094static void
1095md_preloaded(u_char *image, unsigned length)
1096{
1097	struct md_s *sc;
1098
1099	sc = mdnew(-1);
1100	if (sc == NULL)
1101		return;
1102	sc->type = MD_PRELOAD;
1103	sc->secsize = DEV_BSIZE;
1104	sc->nsect = length / DEV_BSIZE;
1105	sc->pl_ptr = image;
1106	sc->pl_len = length;
1107	if (sc->unit == 0)
1108		mdrootready = 1;
1109	mdinit(sc);
1110}
1111
1112static void
1113md_drvinit(void *unused)
1114{
1115
1116	caddr_t mod;
1117	caddr_t c;
1118	u_char *ptr, *name, *type;
1119	unsigned len;
1120
1121#ifdef MD_ROOT_SIZE
1122	md_preloaded(mfs_root, MD_ROOT_SIZE*1024);
1123#endif
1124	mod = NULL;
1125	while ((mod = preload_search_next_name(mod)) != NULL) {
1126		name = (char *)preload_search_info(mod, MODINFO_NAME);
1127		type = (char *)preload_search_info(mod, MODINFO_TYPE);
1128		if (name == NULL)
1129			continue;
1130		if (type == NULL)
1131			continue;
1132		if (strcmp(type, "md_image") && strcmp(type, "mfs_root"))
1133			continue;
1134		c = preload_search_info(mod, MODINFO_ADDR);
1135		ptr = *(u_char **)c;
1136		c = preload_search_info(mod, MODINFO_SIZE);
1137		len = *(size_t *)c;
1138		printf("%s%d: Preloaded image <%s> %d bytes at %p\n",
1139		    MD_NAME, mdunits, name, len, ptr);
1140		md_preloaded(ptr, len);
1141	}
1142	status_dev = make_dev(&mdctl_cdevsw, 0xffff00ff, UID_ROOT, GID_WHEEL,
1143	    0600, MDCTL_NAME);
1144}
1145
1146static int
1147md_modevent(module_t mod, int type, void *data)
1148{
1149	int error;
1150	struct md_s *sc;
1151
1152	switch (type) {
1153	case MOD_LOAD:
1154		md_drvinit(NULL);
1155		break;
1156	case MOD_UNLOAD:
1157		LIST_FOREACH(sc, &md_softc_list, list) {
1158			error = mddetach(sc->unit, curthread);
1159			if (error != 0)
1160				return (error);
1161		}
1162		if (status_dev)
1163			destroy_dev(status_dev);
1164		status_dev = 0;
1165		break;
1166	default:
1167		break;
1168	}
1169	return (0);
1170}
1171
1172static moduledata_t md_mod = {
1173	MD_NAME,
1174	md_modevent,
1175	NULL
1176};
1177DECLARE_MODULE(md, md_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE+CDEV_MAJOR);
1178MODULE_VERSION(md, MD_MODVER);
1179
1180
#ifdef MD_ROOT
/*
 * Run at SI_SUB_MOUNT_ROOT: if a preloaded image became unit 0, make
 * /dev/md0 the first root device candidate.
 */
static void
md_takeroot(void *junk)
{
	if (!mdrootready)
		return;
	rootdevnames[0] = "ufs:/dev/md0";
}

SYSINIT(md_root, SI_SUB_MOUNT_ROOT, SI_ORDER_FIRST, md_takeroot, NULL);
#endif /* MD_ROOT */
1191