/*-
 * ----------------------------------------------------------------------------
 * "THE BEER-WARE LICENSE" (Revision 42):
 * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
 * can do whatever you want with this stuff. If we meet some day, and you think
 * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
 * ----------------------------------------------------------------------------
 *
 * $FreeBSD: head/sys/dev/md/md.c 216794 2010-12-29 12:11:07Z kib $
 *
 */

/*-
 * The following functions are based on the vn(4) driver: mdstart_swap(),
 * mdstart_vnode(), mdcreate_swap(), mdcreate_vnode() and mddestroy(),
 * and as such are under the following copyright:
 *
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah Hdr: vn.c 1.13 94/04/02
 *
 *	from: @(#)vn.c	8.6 (Berkeley) 4/1/94
 * From: src/sys/dev/vn/vn.c,v 1.122 2000/12/16 16:06:03
 */

#include "opt_geom.h"
#include "opt_md.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/conf.h>
#include <sys/devicestat.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/limits.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mdioctl.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/sx.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sched.h>
#include <sys/sf_buf.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>

#include <geom/geom.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <vm/uma.h>

#define MD_MODVER 1

#define MD_SHUTDOWN	0x10000		/* Tell worker thread to terminate. */
#define	MD_EXITING	0x20000		/* Worker thread is exiting. */

#ifndef MD_NSECT
#define MD_NSECT (10000 * 2)
#endif

static MALLOC_DEFINE(M_MD, "md_disk", "Memory Disk");
static MALLOC_DEFINE(M_MDSECT, "md_sectors", "Memory Disk Sectors");

static int md_debug;
SYSCTL_INT(_debug, OID_AUTO, mddebug, CTLFLAG_RW, &md_debug, 0, "");
static int md_malloc_wait;
SYSCTL_INT(_vm, OID_AUTO, md_malloc_wait, CTLFLAG_RW, &md_malloc_wait, 0, "");

#if defined(MD_ROOT) && defined(MD_ROOT_SIZE)
/*
 * Preloaded image gets put here.
 * Applications that patch the object with the image can determine
 * the size by looking at the start and end markers (strings),
 * so we want them contiguous.
 */
static struct {
	u_char start[MD_ROOT_SIZE*1024];
	u_char end[128];
} mfs_root = {
	.start = "MFS Filesystem goes here",
	.end = "MFS Filesystem had better STOP here",
};
#endif
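
/*
 * MD_ROOT_SIZE is given in kilobytes, so the start[] array above reserves
 * MD_ROOT_SIZE kB; e.g. a kernel built with "options MD_ROOT_SIZE=4096"
 * sets aside a 4 MB region between the two marker strings (illustrative
 * numbers).
 */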

static g_init_t g_md_init;
static g_fini_t g_md_fini;
static g_start_t g_md_start;
static g_access_t g_md_access;
static void g_md_dumpconf(struct sbuf *sb, const char *indent,
    struct g_geom *gp, struct g_consumer *cp __unused, struct g_provider *pp);

static int mdunits;
static struct cdev *status_dev = 0;
static struct sx md_sx;
static struct unrhdr *md_uh;

static d_ioctl_t mdctlioctl;

static struct cdevsw mdctl_cdevsw = {
	.d_version =	D_VERSION,
	.d_ioctl =	mdctlioctl,
	.d_name =	MD_NAME,
};

struct g_class g_md_class = {
	.name = "MD",
	.version = G_VERSION,
	.init = g_md_init,
	.fini = g_md_fini,
	.start = g_md_start,
	.access = g_md_access,
	.dumpconf = g_md_dumpconf,
};

DECLARE_GEOM_CLASS(g_md_class, g_md);


static LIST_HEAD(, md_s) md_softc_list = LIST_HEAD_INITIALIZER(md_softc_list);

#define NINDIR	(PAGE_SIZE / sizeof(uintptr_t))
#define NMASK	(NINDIR-1)
static int nshift;
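
/*
 * Illustrative numbers (assuming 4 kB pages and 8-byte pointers): NINDIR
 * is 512 and nshift, computed as log2(NINDIR) in g_md_init(), is 9, so a
 * leaf indir covers 512 sectors, one layer of indirection covers 512 * 512
 * sectors, and so on.
 */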

struct indir {
	uintptr_t	*array;
	u_int		total;
	u_int		used;
	u_int		shift;
};

struct md_s {
	int unit;
	LIST_ENTRY(md_s) list;
	struct bio_queue_head bio_queue;
	struct mtx queue_mtx;
	struct cdev *dev;
	enum md_types type;
	off_t mediasize;
	unsigned sectorsize;
	unsigned opencount;
	unsigned fwheads;
	unsigned fwsectors;
	unsigned flags;
	char name[20];
	struct proc *procp;
	struct g_geom *gp;
	struct g_provider *pp;
	int (*start)(struct md_s *sc, struct bio *bp);
	struct devstat *devstat;

	/* MD_MALLOC related fields */
	struct indir *indir;
	uma_zone_t uma;

	/* MD_PRELOAD related fields */
	u_char *pl_ptr;
	size_t pl_len;

	/* MD_VNODE related fields */
	struct vnode *vnode;
	char file[PATH_MAX];
	struct ucred *cred;

	/* MD_SWAP related fields */
	vm_object_t object;
};

static struct indir *
new_indir(u_int shift)
{
	struct indir *ip;

	ip = malloc(sizeof *ip, M_MD, (md_malloc_wait ? M_WAITOK : M_NOWAIT)
	    | M_ZERO);
	if (ip == NULL)
		return (NULL);
	ip->array = malloc(sizeof(uintptr_t) * NINDIR,
	    M_MDSECT, (md_malloc_wait ? M_WAITOK : M_NOWAIT) | M_ZERO);
	if (ip->array == NULL) {
		free(ip, M_MD);
		return (NULL);
	}
	ip->total = NINDIR;
	ip->shift = shift;
	return (ip);
}

static void
del_indir(struct indir *ip)
{

	free(ip->array, M_MDSECT);
	free(ip, M_MD);
}

static void
destroy_indir(struct md_s *sc, struct indir *ip)
{
	int i;

	for (i = 0; i < NINDIR; i++) {
		if (!ip->array[i])
			continue;
		if (ip->shift)
			destroy_indir(sc, (struct indir*)(ip->array[i]));
		else if (ip->array[i] > 255)
			uma_zfree(sc->uma, (void *)(ip->array[i]));
	}
	del_indir(ip);
}

/*
 * This function does the math and allocates the top level "indir" structure
 * for a device of "size" sectors.
 */

static struct indir *
dimension(off_t size)
{
	off_t rcnt;
	struct indir *ip;
	int layer;

	rcnt = size;
	layer = 0;
	while (rcnt > NINDIR) {
		rcnt /= NINDIR;
		layer++;
	}

	/*
	 * XXX: the top layer is probably not fully populated, so we allocate
	 * too much space for ip->array in here.
	 */
	ip = malloc(sizeof *ip, M_MD, M_WAITOK | M_ZERO);
	ip->array = malloc(sizeof(uintptr_t) * NINDIR,
	    M_MDSECT, M_WAITOK | M_ZERO);
	ip->total = NINDIR;
	ip->shift = layer * nshift;
	return (ip);
}
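
/*
 * Worked example for dimension() (assuming NINDIR == 512, nshift == 9):
 * for a device of 2^20 sectors the loop divides 1048576 -> 2048 -> 4, so
 * layer ends up as 2 and the root node gets shift 18; s_read()/s_write()
 * then index it with (offset >> 18) & NMASK.
 */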

/*
 * Read a given sector
 */

static uintptr_t
s_read(struct indir *ip, off_t offset)
{
	struct indir *cip;
	int idx;
	uintptr_t up;

	if (md_debug > 1)
		printf("s_read(%jd)\n", (intmax_t)offset);
	up = 0;
	for (cip = ip; cip != NULL;) {
		if (cip->shift) {
			idx = (offset >> cip->shift) & NMASK;
			up = cip->array[idx];
			cip = (struct indir *)up;
			continue;
		}
		idx = offset & NMASK;
		return (cip->array[idx]);
	}
	return (0);
}

/*
 * Write a given sector, prune the tree if the value is 0
 */

static int
s_write(struct indir *ip, off_t offset, uintptr_t ptr)
{
	struct indir *cip, *lip[10];
	int idx, li;
	uintptr_t up;

	if (md_debug > 1)
		printf("s_write(%jd, %p)\n", (intmax_t)offset, (void *)ptr);
	up = 0;
	li = 0;
	cip = ip;
	for (;;) {
		lip[li++] = cip;
		if (cip->shift) {
			idx = (offset >> cip->shift) & NMASK;
			up = cip->array[idx];
			if (up != 0) {
				cip = (struct indir *)up;
				continue;
			}
			/* Allocate branch */
			cip->array[idx] =
			    (uintptr_t)new_indir(cip->shift - nshift);
			if (cip->array[idx] == 0)
				return (ENOSPC);
			cip->used++;
			up = cip->array[idx];
			cip = (struct indir *)up;
			continue;
		}
		/* leafnode */
		idx = offset & NMASK;
		up = cip->array[idx];
		if (up != 0)
			cip->used--;
		cip->array[idx] = ptr;
		if (ptr != 0)
			cip->used++;
		break;
	}
	if (cip->used != 0 || li == 1)
		return (0);
	li--;
	while (cip->used == 0 && cip != ip) {
		li--;
		idx = (offset >> lip[li]->shift) & NMASK;
		up = lip[li]->array[idx];
		KASSERT(up == (uintptr_t)cip, ("md screwed up"));
		del_indir(cip);
		lip[li]->array[idx] = 0;
		lip[li]->used--;
		cip = lip[li];
	}
	return (0);
}
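
/*
 * Pruning example: storing ptr == 0 in the only populated slot of a leaf
 * drops that leaf's "used" count to zero, so the loop above frees the leaf,
 * clears the corresponding slot in its parent and walks upward, collapsing
 * the chain until it reaches a node that is still in use (or the root).
 */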


static int
g_md_access(struct g_provider *pp, int r, int w, int e)
{
	struct md_s *sc;

	sc = pp->geom->softc;
	if (sc == NULL) {
		if (r <= 0 && w <= 0 && e <= 0)
			return (0);
		return (ENXIO);
	}
	r += pp->acr;
	w += pp->acw;
	e += pp->ace;
	if ((sc->flags & MD_READONLY) != 0 && w > 0)
		return (EROFS);
	if ((pp->acr + pp->acw + pp->ace) == 0 && (r + w + e) > 0) {
		sc->opencount = 1;
	} else if ((pp->acr + pp->acw + pp->ace) > 0 && (r + w + e) == 0) {
		sc->opencount = 0;
	}
	return (0);
}

static void
g_md_start(struct bio *bp)
{
	struct md_s *sc;

	sc = bp->bio_to->geom->softc;
	if ((bp->bio_cmd == BIO_READ) || (bp->bio_cmd == BIO_WRITE))
		devstat_start_transaction_bio(sc->devstat, bp);
	mtx_lock(&sc->queue_mtx);
	bioq_disksort(&sc->bio_queue, bp);
	mtx_unlock(&sc->queue_mtx);
	wakeup(sc);
}

static int
mdstart_malloc(struct md_s *sc, struct bio *bp)
{
	int i, error;
	u_char *dst;
	off_t secno, nsec, uc;
	uintptr_t sp, osp;

	switch (bp->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
	case BIO_DELETE:
		break;
	default:
		return (EOPNOTSUPP);
	}

	nsec = bp->bio_length / sc->sectorsize;
	secno = bp->bio_offset / sc->sectorsize;
	dst = bp->bio_data;
	error = 0;
	while (nsec--) {
		osp = s_read(sc->indir, secno);
		if (bp->bio_cmd == BIO_DELETE) {
			if (osp != 0)
				error = s_write(sc->indir, secno, 0);
		} else if (bp->bio_cmd == BIO_READ) {
			if (osp == 0)
				bzero(dst, sc->sectorsize);
			else if (osp <= 255)
				memset(dst, osp, sc->sectorsize);
			else {
				bcopy((void *)osp, dst, sc->sectorsize);
				cpu_flush_dcache(dst, sc->sectorsize);
			}
			osp = 0;
		} else if (bp->bio_cmd == BIO_WRITE) {
			if (sc->flags & MD_COMPRESS) {
				uc = dst[0];
				for (i = 1; i < sc->sectorsize; i++)
					if (dst[i] != uc)
						break;
			} else {
				i = 0;
				uc = 0;
			}
			if (i == sc->sectorsize) {
				if (osp != uc)
					error = s_write(sc->indir, secno, uc);
			} else {
				if (osp <= 255) {
					sp = (uintptr_t)uma_zalloc(sc->uma,
					    md_malloc_wait ? M_WAITOK :
					    M_NOWAIT);
					if (sp == 0) {
						error = ENOSPC;
						break;
					}
					bcopy(dst, (void *)sp, sc->sectorsize);
					error = s_write(sc->indir, secno, sp);
				} else {
					bcopy(dst, (void *)osp, sc->sectorsize);
					osp = 0;
				}
			}
		} else {
			error = EOPNOTSUPP;
		}
		if (osp > 255)
			uma_zfree(sc->uma, (void*)osp);
		if (error != 0)
			break;
		secno++;
		dst += sc->sectorsize;
	}
	bp->bio_resid = 0;
	return (error);
}
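
/*
 * Note on the malloc backing store: with MD_COMPRESS enabled, a sector
 * whose bytes all hold the same value 0..255 is stored directly in its
 * indir slot (the "osp <= 255" cases above), so e.g. an all-zero sector
 * needs no uma allocation at all; only sectors with mixed contents get a
 * real buffer from sc->uma.
 */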

static int
mdstart_preload(struct md_s *sc, struct bio *bp)
{

	switch (bp->bio_cmd) {
	case BIO_READ:
		bcopy(sc->pl_ptr + bp->bio_offset, bp->bio_data,
		    bp->bio_length);
		cpu_flush_dcache(bp->bio_data, bp->bio_length);
		break;
	case BIO_WRITE:
		bcopy(bp->bio_data, sc->pl_ptr + bp->bio_offset,
		    bp->bio_length);
		break;
	}
	bp->bio_resid = 0;
	return (0);
}

static int
mdstart_vnode(struct md_s *sc, struct bio *bp)
{
	int error, vfslocked;
	struct uio auio;
	struct iovec aiov;
	struct mount *mp;
	struct vnode *vp;
	struct thread *td;

	switch (bp->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
	case BIO_FLUSH:
		break;
	default:
		return (EOPNOTSUPP);
	}

	td = curthread;
	vp = sc->vnode;

	/*
	 * VNODE I/O
	 *
	 * If an error occurs, we set BIO_ERROR but we do not set
	 * B_INVAL because (for a write anyway), the buffer is
	 * still valid.
	 */

	if (bp->bio_cmd == BIO_FLUSH) {
		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
		(void) vn_start_write(vp, &mp, V_WAIT);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		error = VOP_FSYNC(vp, MNT_WAIT, td);
		VOP_UNLOCK(vp, 0);
		vn_finished_write(mp);
		VFS_UNLOCK_GIANT(vfslocked);
		return (error);
	}

	bzero(&auio, sizeof(auio));

	aiov.iov_base = bp->bio_data;
	aiov.iov_len = bp->bio_length;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = (vm_ooffset_t)bp->bio_offset;
	auio.uio_segflg = UIO_SYSSPACE;
	if (bp->bio_cmd == BIO_READ)
		auio.uio_rw = UIO_READ;
	else if (bp->bio_cmd == BIO_WRITE)
		auio.uio_rw = UIO_WRITE;
	else
		panic("wrong BIO_OP in mdstart_vnode");
	auio.uio_resid = bp->bio_length;
	auio.uio_td = td;
	/*
	 * When reading set IO_DIRECT to try to avoid double-caching
	 * the data.  When writing IO_DIRECT is not optimal.
	 */
	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	if (bp->bio_cmd == BIO_READ) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		error = VOP_READ(vp, &auio, IO_DIRECT, sc->cred);
		VOP_UNLOCK(vp, 0);
	} else {
		(void) vn_start_write(vp, &mp, V_WAIT);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		error = VOP_WRITE(vp, &auio, sc->flags & MD_ASYNC ? 0 : IO_SYNC,
		    sc->cred);
		VOP_UNLOCK(vp, 0);
		vn_finished_write(mp);
	}
	VFS_UNLOCK_GIANT(vfslocked);
	bp->bio_resid = auio.uio_resid;
	return (error);
}

static int
mdstart_swap(struct md_s *sc, struct bio *bp)
{
	struct sf_buf *sf;
	int rv, offs, len, lastend;
	vm_pindex_t i, lastp;
	vm_page_t m;
	u_char *p;

	switch (bp->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
	case BIO_DELETE:
		break;
	default:
		return (EOPNOTSUPP);
	}

	p = bp->bio_data;

	/*
	 * offs is the offset at which to start operating on the
	 * next (ie, first) page.  lastp is the last page on
	 * which we're going to operate.  lastend is the ending
	 * position within that last page (ie, PAGE_SIZE if
	 * we're operating on complete aligned pages).
	 */
	offs = bp->bio_offset % PAGE_SIZE;
	lastp = (bp->bio_offset + bp->bio_length - 1) / PAGE_SIZE;
	lastend = (bp->bio_offset + bp->bio_length - 1) % PAGE_SIZE + 1;
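	/*
	 * Example with 4 kB pages (illustrative): bio_offset 5120 and
	 * bio_length 8192 give offs 1024, a first page index of 1, lastp 3
	 * and lastend 1024, so the loop below copies 3072, 4096 and 1024
	 * bytes from pages 1, 2 and 3 respectively.
	 */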

	rv = VM_PAGER_OK;
	VM_OBJECT_LOCK(sc->object);
	vm_object_pip_add(sc->object, 1);
	for (i = bp->bio_offset / PAGE_SIZE; i <= lastp; i++) {
		len = ((i == lastp) ? lastend : PAGE_SIZE) - offs;

		m = vm_page_grab(sc->object, i,
		    VM_ALLOC_NORMAL|VM_ALLOC_RETRY);
		VM_OBJECT_UNLOCK(sc->object);
		sched_pin();
		sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
		VM_OBJECT_LOCK(sc->object);
		if (bp->bio_cmd == BIO_READ) {
			if (m->valid != VM_PAGE_BITS_ALL)
				rv = vm_pager_get_pages(sc->object, &m, 1, 0);
			if (rv == VM_PAGER_ERROR) {
				sf_buf_free(sf);
				sched_unpin();
				vm_page_wakeup(m);
				break;
			}
			bcopy((void *)(sf_buf_kva(sf) + offs), p, len);
			cpu_flush_dcache(p, len);
		} else if (bp->bio_cmd == BIO_WRITE) {
			if (len != PAGE_SIZE && m->valid != VM_PAGE_BITS_ALL)
				rv = vm_pager_get_pages(sc->object, &m, 1, 0);
			if (rv == VM_PAGER_ERROR) {
				sf_buf_free(sf);
				sched_unpin();
				vm_page_wakeup(m);
				break;
			}
			bcopy(p, (void *)(sf_buf_kva(sf) + offs), len);
			m->valid = VM_PAGE_BITS_ALL;
#if 0
		} else if (bp->bio_cmd == BIO_DELETE) {
			if (len != PAGE_SIZE && m->valid != VM_PAGE_BITS_ALL)
				rv = vm_pager_get_pages(sc->object, &m, 1, 0);
			if (rv == VM_PAGER_ERROR) {
				sf_buf_free(sf);
				sched_unpin();
				vm_page_wakeup(m);
				break;
			}
			bzero((void *)(sf_buf_kva(sf) + offs), len);
			vm_page_dirty(m);
			m->valid = VM_PAGE_BITS_ALL;
#endif
		}
		sf_buf_free(sf);
		sched_unpin();
		vm_page_wakeup(m);
		vm_page_lock(m);
		vm_page_activate(m);
		vm_page_unlock(m);
		if (bp->bio_cmd == BIO_WRITE)
			vm_page_dirty(m);

		/* Actions on further pages start at offset 0 */
		p += PAGE_SIZE - offs;
		offs = 0;
#if 0
if (bootverbose || bp->bio_offset / PAGE_SIZE < 17)
printf("wire_count %d busy %d flags %x hold_count %d act_count %d queue %d valid %d dirty %d @ %d\n",
    m->wire_count, m->busy,
    m->flags, m->hold_count, m->act_count, m->queue, m->valid, m->dirty, i);
#endif
	}
	vm_object_pip_subtract(sc->object, 1);
	vm_object_set_writeable_dirty(sc->object);
	VM_OBJECT_UNLOCK(sc->object);
	return (rv != VM_PAGER_ERROR ? 0 : ENOSPC);
}

static void
md_kthread(void *arg)
{
	struct md_s *sc;
	struct bio *bp;
	int error;

	sc = arg;
	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);
	if (sc->type == MD_VNODE)
		curthread->td_pflags |= TDP_NORUNNINGBUF;

	for (;;) {
		mtx_lock(&sc->queue_mtx);
		if (sc->flags & MD_SHUTDOWN) {
			sc->flags |= MD_EXITING;
			mtx_unlock(&sc->queue_mtx);
			kproc_exit(0);
		}
		bp = bioq_takefirst(&sc->bio_queue);
		if (!bp) {
			msleep(sc, &sc->queue_mtx, PRIBIO | PDROP, "mdwait", 0);
			continue;
		}
		mtx_unlock(&sc->queue_mtx);
		if (bp->bio_cmd == BIO_GETATTR) {
			if ((sc->fwsectors && sc->fwheads &&
			    (g_handleattr_int(bp, "GEOM::fwsectors",
			    sc->fwsectors) ||
			    g_handleattr_int(bp, "GEOM::fwheads",
			    sc->fwheads))) ||
			    g_handleattr_int(bp, "GEOM::candelete", 1))
				error = -1;
			else
				error = EOPNOTSUPP;
		} else {
			error = sc->start(sc, bp);
		}

		if (error != -1) {
			bp->bio_completed = bp->bio_length;
			g_io_deliver(bp, error);
			if ((bp->bio_cmd == BIO_READ) || (bp->bio_cmd == BIO_WRITE))
				devstat_end_transaction_bio(sc->devstat, bp);
		}
	}
}

static struct md_s *
mdfind(int unit)
{
	struct md_s *sc;

	LIST_FOREACH(sc, &md_softc_list, list) {
		if (sc->unit == unit)
			break;
	}
	return (sc);
}

static struct md_s *
mdnew(int unit, int *errp, enum md_types type)
{
	struct md_s *sc;
	int error;

	*errp = 0;
	if (unit == -1)
		unit = alloc_unr(md_uh);
	else
		unit = alloc_unr_specific(md_uh, unit);

	if (unit == -1) {
		*errp = EBUSY;
		return (NULL);
	}

	sc = (struct md_s *)malloc(sizeof *sc, M_MD, M_WAITOK | M_ZERO);
	sc->type = type;
	bioq_init(&sc->bio_queue);
	mtx_init(&sc->queue_mtx, "md bio queue", NULL, MTX_DEF);
	sc->unit = unit;
	sprintf(sc->name, "md%d", unit);
	LIST_INSERT_HEAD(&md_softc_list, sc, list);
	error = kproc_create(md_kthread, sc, &sc->procp, 0, 0,"%s", sc->name);
	if (error == 0)
		return (sc);
	LIST_REMOVE(sc, list);
	mtx_destroy(&sc->queue_mtx);
	free_unr(md_uh, sc->unit);
	free(sc, M_MD);
	*errp = error;
	return (NULL);
}

static void
mdinit(struct md_s *sc)
{
	struct g_geom *gp;
	struct g_provider *pp;

	g_topology_lock();
	gp = g_new_geomf(&g_md_class, "md%d", sc->unit);
	gp->softc = sc;
	pp = g_new_providerf(gp, "md%d", sc->unit);
	pp->mediasize = sc->mediasize;
	pp->sectorsize = sc->sectorsize;
	sc->gp = gp;
	sc->pp = pp;
	g_error_provider(pp, 0);
	g_topology_unlock();
	sc->devstat = devstat_new_entry("md", sc->unit, sc->sectorsize,
	    DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX);
}

/*
 * XXX: we should check that the range they feed us is mapped.
 * XXX: we should implement read-only.
 */

static int
mdcreate_preload(struct md_s *sc, struct md_ioctl *mdio)
{

	if (mdio->md_options & ~(MD_AUTOUNIT | MD_FORCE))
		return (EINVAL);
	if (mdio->md_base == 0)
		return (EINVAL);
	sc->flags = mdio->md_options & MD_FORCE;
	/* Cast to pointer size, then to pointer to avoid warning */
	sc->pl_ptr = (u_char *)(uintptr_t)mdio->md_base;
	sc->pl_len = (size_t)sc->mediasize;
	return (0);
}


static int
mdcreate_malloc(struct md_s *sc, struct md_ioctl *mdio)
{
	uintptr_t sp;
	int error;
	off_t u;

	error = 0;
	if (mdio->md_options & ~(MD_AUTOUNIT | MD_COMPRESS | MD_RESERVE))
		return (EINVAL);
	if (mdio->md_sectorsize != 0 && !powerof2(mdio->md_sectorsize))
		return (EINVAL);
	/* Compression doesn't make sense if we have reserved space */
	if (mdio->md_options & MD_RESERVE)
		mdio->md_options &= ~MD_COMPRESS;
	if (mdio->md_fwsectors != 0)
		sc->fwsectors = mdio->md_fwsectors;
	if (mdio->md_fwheads != 0)
		sc->fwheads = mdio->md_fwheads;
	sc->flags = mdio->md_options & (MD_COMPRESS | MD_FORCE);
	sc->indir = dimension(sc->mediasize / sc->sectorsize);
	sc->uma = uma_zcreate(sc->name, sc->sectorsize, NULL, NULL, NULL, NULL,
	    0x1ff, 0);
	if (mdio->md_options & MD_RESERVE) {
		off_t nsectors;

		nsectors = sc->mediasize / sc->sectorsize;
		for (u = 0; u < nsectors; u++) {
			sp = (uintptr_t)uma_zalloc(sc->uma, md_malloc_wait ?
			    M_WAITOK : M_NOWAIT | M_ZERO);
			if (sp != 0)
				error = s_write(sc->indir, u, sp);
			else
				error = ENOMEM;
			if (error != 0)
				break;
		}
	}
	return (error);
}


static int
mdsetcred(struct md_s *sc, struct ucred *cred)
{
	char *tmpbuf;
	int error = 0;

	/*
	 * Set credits in our softc
	 */

	if (sc->cred)
		crfree(sc->cred);
	sc->cred = crhold(cred);

	/*
	 * Horrible kludge to establish credentials for NFS  XXX.
	 */

	if (sc->vnode) {
		struct uio auio;
		struct iovec aiov;

		tmpbuf = malloc(sc->sectorsize, M_TEMP, M_WAITOK);
		bzero(&auio, sizeof(auio));

		aiov.iov_base = tmpbuf;
		aiov.iov_len = sc->sectorsize;
		auio.uio_iov = &aiov;
		auio.uio_iovcnt = 1;
		auio.uio_offset = 0;
		auio.uio_rw = UIO_READ;
		auio.uio_segflg = UIO_SYSSPACE;
		auio.uio_resid = aiov.iov_len;
		vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY);
		error = VOP_READ(sc->vnode, &auio, 0, sc->cred);
		VOP_UNLOCK(sc->vnode, 0);
		free(tmpbuf, M_TEMP);
	}
	return (error);
}

static int
mdcreate_vnode(struct md_s *sc, struct md_ioctl *mdio, struct thread *td)
{
	struct vattr vattr;
	struct nameidata nd;
	char *fname;
	int error, flags, vfslocked;

	/*
	 * Kernel-originated requests must have the filename appended
	 * to the mdio structure to protect against malicious software.
	 */
	fname = mdio->md_file;
	if ((void *)fname != (void *)(mdio + 1)) {
		error = copyinstr(fname, sc->file, sizeof(sc->file), NULL);
		if (error != 0)
			return (error);
	} else
		strlcpy(sc->file, fname, sizeof(sc->file));

	/*
	 * If the user specified that this is a read only device, don't
	 * set the FWRITE mask before trying to open the backing store.
	 */
	flags = FREAD | ((mdio->md_options & MD_READONLY) ? 0 : FWRITE);
	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, sc->file, td);
	error = vn_open(&nd, &flags, 0, NULL);
	if (error != 0)
		return (error);
	vfslocked = NDHASGIANT(&nd);
	NDFREE(&nd, NDF_ONLY_PNBUF);
	if (nd.ni_vp->v_type != VREG) {
		error = EINVAL;
		goto bad;
	}
	error = VOP_GETATTR(nd.ni_vp, &vattr, td->td_ucred);
	if (error != 0)
		goto bad;
	if (VOP_ISLOCKED(nd.ni_vp) != LK_EXCLUSIVE) {
		vn_lock(nd.ni_vp, LK_UPGRADE | LK_RETRY);
		if (nd.ni_vp->v_iflag & VI_DOOMED) {
			/* Forced unmount. */
			error = EBADF;
			goto bad;
		}
	}
	nd.ni_vp->v_vflag |= VV_MD;
	VOP_UNLOCK(nd.ni_vp, 0);

	if (mdio->md_fwsectors != 0)
		sc->fwsectors = mdio->md_fwsectors;
	if (mdio->md_fwheads != 0)
		sc->fwheads = mdio->md_fwheads;
	sc->flags = mdio->md_options & (MD_FORCE | MD_ASYNC);
	if (!(flags & FWRITE))
		sc->flags |= MD_READONLY;
	sc->vnode = nd.ni_vp;

	error = mdsetcred(sc, td->td_ucred);
	if (error != 0) {
		sc->vnode = NULL;
		vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY);
		nd.ni_vp->v_vflag &= ~VV_MD;
		goto bad;
	}
	VFS_UNLOCK_GIANT(vfslocked);
	return (0);
bad:
	VOP_UNLOCK(nd.ni_vp, 0);
	(void)vn_close(nd.ni_vp, flags, td->td_ucred, td);
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}

static int
mddestroy(struct md_s *sc, struct thread *td)
{
	int vfslocked;

	if (sc->gp) {
		sc->gp->softc = NULL;
		g_topology_lock();
		g_wither_geom(sc->gp, ENXIO);
		g_topology_unlock();
		sc->gp = NULL;
		sc->pp = NULL;
	}
	if (sc->devstat) {
		devstat_remove_entry(sc->devstat);
		sc->devstat = NULL;
	}
	mtx_lock(&sc->queue_mtx);
	sc->flags |= MD_SHUTDOWN;
	wakeup(sc);
	while (!(sc->flags & MD_EXITING))
		msleep(sc->procp, &sc->queue_mtx, PRIBIO, "mddestroy", hz / 10);
	mtx_unlock(&sc->queue_mtx);
	mtx_destroy(&sc->queue_mtx);
	if (sc->vnode != NULL) {
		vfslocked = VFS_LOCK_GIANT(sc->vnode->v_mount);
		vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY);
		sc->vnode->v_vflag &= ~VV_MD;
		VOP_UNLOCK(sc->vnode, 0);
		(void)vn_close(sc->vnode, sc->flags & MD_READONLY ?
		    FREAD : (FREAD|FWRITE), sc->cred, td);
		VFS_UNLOCK_GIANT(vfslocked);
	}
	if (sc->cred != NULL)
		crfree(sc->cred);
	if (sc->object != NULL)
		vm_object_deallocate(sc->object);
	if (sc->indir)
		destroy_indir(sc, sc->indir);
	if (sc->uma)
		uma_zdestroy(sc->uma);

	LIST_REMOVE(sc, list);
	free_unr(md_uh, sc->unit);
	free(sc, M_MD);
	return (0);
}

static int
mdcreate_swap(struct md_s *sc, struct md_ioctl *mdio, struct thread *td)
{
	vm_ooffset_t npage;
	int error;

	/*
	 * Range check.  Disallow zero or negative sizes and any size that
	 * is not a multiple of the page size.
	 */
	if (sc->mediasize == 0 || (sc->mediasize % PAGE_SIZE) != 0)
		return (EDOM);

	/*
	 * Allocate an OBJT_SWAP object.
	 *
	 * Note the truncation.
	 */

	npage = mdio->md_mediasize / PAGE_SIZE;
	if (mdio->md_fwsectors != 0)
		sc->fwsectors = mdio->md_fwsectors;
	if (mdio->md_fwheads != 0)
		sc->fwheads = mdio->md_fwheads;
	sc->object = vm_pager_allocate(OBJT_SWAP, NULL, PAGE_SIZE * npage,
	    VM_PROT_DEFAULT, 0, td->td_ucred);
	if (sc->object == NULL)
		return (ENOMEM);
	sc->flags = mdio->md_options & MD_FORCE;
	if (mdio->md_options & MD_RESERVE) {
		if (swap_pager_reserve(sc->object, 0, npage) < 0) {
			error = EDOM;
			goto finish;
		}
	}
	error = mdsetcred(sc, td->td_ucred);
 finish:
	if (error != 0) {
		vm_object_deallocate(sc->object);
		sc->object = NULL;
	}
	return (error);
}


static int
xmdctlioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td)
{
	struct md_ioctl *mdio;
	struct md_s *sc;
	int error, i;

	if (md_debug)
		printf("mdctlioctl(%s %lx %p %x %p)\n",
			devtoname(dev), cmd, addr, flags, td);

	mdio = (struct md_ioctl *)addr;
	if (mdio->md_version != MDIOVERSION)
		return (EINVAL);

	/*
	 * We assert the version number in the individual ioctl
	 * handlers instead of out here because (a) it is possible we
	 * may add another ioctl in the future which doesn't read an
	 * mdio, and (b) the correct return value for an unknown ioctl
	 * is ENOIOCTL, not EINVAL.
	 */
	error = 0;
	switch (cmd) {
	case MDIOCATTACH:
		switch (mdio->md_type) {
		case MD_MALLOC:
		case MD_PRELOAD:
		case MD_VNODE:
		case MD_SWAP:
			break;
		default:
			return (EINVAL);
		}
		if (mdio->md_options & MD_AUTOUNIT)
			sc = mdnew(-1, &error, mdio->md_type);
		else {
			if (mdio->md_unit > INT_MAX)
				return (EINVAL);
			sc = mdnew(mdio->md_unit, &error, mdio->md_type);
		}
		if (sc == NULL)
			return (error);
		if (mdio->md_options & MD_AUTOUNIT)
			mdio->md_unit = sc->unit;
		sc->mediasize = mdio->md_mediasize;
		if (mdio->md_sectorsize == 0)
			sc->sectorsize = DEV_BSIZE;
		else
			sc->sectorsize = mdio->md_sectorsize;
		error = EDOOFUS;
		switch (sc->type) {
		case MD_MALLOC:
			sc->start = mdstart_malloc;
			error = mdcreate_malloc(sc, mdio);
			break;
		case MD_PRELOAD:
			sc->start = mdstart_preload;
			error = mdcreate_preload(sc, mdio);
			break;
		case MD_VNODE:
			sc->start = mdstart_vnode;
			error = mdcreate_vnode(sc, mdio, td);
			break;
		case MD_SWAP:
			sc->start = mdstart_swap;
			error = mdcreate_swap(sc, mdio, td);
			break;
		}
		if (error != 0) {
			mddestroy(sc, td);
			return (error);
		}

		/* Prune off any residual fractional sector */
		i = sc->mediasize % sc->sectorsize;
		sc->mediasize -= i;

		mdinit(sc);
		return (0);
	case MDIOCDETACH:
		if (mdio->md_mediasize != 0 ||
		    (mdio->md_options & ~MD_FORCE) != 0)
			return (EINVAL);

		sc = mdfind(mdio->md_unit);
		if (sc == NULL)
			return (ENOENT);
		if (sc->opencount != 0 && !(sc->flags & MD_FORCE) &&
		    !(mdio->md_options & MD_FORCE))
			return (EBUSY);
		return (mddestroy(sc, td));
	case MDIOCQUERY:
		sc = mdfind(mdio->md_unit);
		if (sc == NULL)
			return (ENOENT);
		mdio->md_type = sc->type;
		mdio->md_options = sc->flags;
		mdio->md_mediasize = sc->mediasize;
		mdio->md_sectorsize = sc->sectorsize;
		if (sc->type == MD_VNODE)
			error = copyout(sc->file, mdio->md_file,
			    strlen(sc->file) + 1);
		return (error);
	case MDIOCLIST:
		i = 1;
		LIST_FOREACH(sc, &md_softc_list, list) {
			if (i == MDNPAD - 1)
				mdio->md_pad[i] = -1;
			else
				mdio->md_pad[i++] = sc->unit;
		}
		mdio->md_pad[0] = i - 1;
		return (0);
	default:
		return (ENOIOCTL);
	};
}
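
/*
 * These ioctls are normally driven from userland by mdconfig(8) on
 * /dev/mdctl; for example "mdconfig -a -t swap -s 64m" should end up in
 * the MDIOCATTACH case above with md_type set to MD_SWAP.
 */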

static int
mdctlioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td)
{
	int error;

	sx_xlock(&md_sx);
	error = xmdctlioctl(dev, cmd, addr, flags, td);
	sx_xunlock(&md_sx);
	return (error);
}

static void
md_preloaded(u_char *image, size_t length)
{
	struct md_s *sc;
	int error;

	sc = mdnew(-1, &error, MD_PRELOAD);
	if (sc == NULL)
		return;
	sc->mediasize = length;
	sc->sectorsize = DEV_BSIZE;
	sc->pl_ptr = image;
	sc->pl_len = length;
	sc->start = mdstart_preload;
#ifdef MD_ROOT
	if (sc->unit == 0)
		rootdevnames[0] = "ufs:/dev/md0";
#endif
	mdinit(sc);
}

static void
g_md_init(struct g_class *mp __unused)
{
	caddr_t mod;
	caddr_t c;
	u_char *ptr, *name, *type;
	unsigned len;
	int i;

	/* figure out log2(NINDIR) */
	for (i = NINDIR, nshift = -1; i; nshift++)
		i >>= 1;

	mod = NULL;
	sx_init(&md_sx, "MD config lock");
	g_topology_unlock();
	md_uh = new_unrhdr(0, INT_MAX, NULL);
#ifdef MD_ROOT_SIZE
	sx_xlock(&md_sx);
	md_preloaded(mfs_root.start, sizeof(mfs_root.start));
	sx_xunlock(&md_sx);
#endif
	/* XXX: are preload_* static or do they need Giant ? */
	while ((mod = preload_search_next_name(mod)) != NULL) {
		name = (char *)preload_search_info(mod, MODINFO_NAME);
		if (name == NULL)
			continue;
		type = (char *)preload_search_info(mod, MODINFO_TYPE);
		if (type == NULL)
			continue;
		if (strcmp(type, "md_image") && strcmp(type, "mfs_root"))
			continue;
		c = preload_search_info(mod, MODINFO_ADDR);
		ptr = *(u_char **)c;
		c = preload_search_info(mod, MODINFO_SIZE);
		len = *(size_t *)c;
		printf("%s%d: Preloaded image <%s> %d bytes at %p\n",
		    MD_NAME, mdunits, name, len, ptr);
		sx_xlock(&md_sx);
		md_preloaded(ptr, len);
		sx_xunlock(&md_sx);
	}
	status_dev = make_dev(&mdctl_cdevsw, INT_MAX, UID_ROOT, GID_WHEEL,
	    0600, MDCTL_NAME);
	g_topology_lock();
}

static void
g_md_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
    struct g_consumer *cp __unused, struct g_provider *pp)
{
	struct md_s *mp;
	char *type;

	mp = gp->softc;
	if (mp == NULL)
		return;

	switch (mp->type) {
	case MD_MALLOC:
		type = "malloc";
		break;
	case MD_PRELOAD:
		type = "preload";
		break;
	case MD_VNODE:
		type = "vnode";
		break;
	case MD_SWAP:
		type = "swap";
		break;
	default:
		type = "unknown";
		break;
	}

	if (pp != NULL) {
		if (indent == NULL) {
			sbuf_printf(sb, " u %d", mp->unit);
			sbuf_printf(sb, " s %ju", (uintmax_t) mp->sectorsize);
			sbuf_printf(sb, " f %ju", (uintmax_t) mp->fwheads);
			sbuf_printf(sb, " fs %ju", (uintmax_t) mp->fwsectors);
			sbuf_printf(sb, " l %ju", (uintmax_t) mp->mediasize);
			sbuf_printf(sb, " t %s", type);
			if (mp->type == MD_VNODE && mp->vnode != NULL)
				sbuf_printf(sb, " file %s", mp->file);
		} else {
			sbuf_printf(sb, "%s<unit>%d</unit>\n", indent,
			    mp->unit);
			sbuf_printf(sb, "%s<sectorsize>%ju</sectorsize>\n",
			    indent, (uintmax_t) mp->sectorsize);
			sbuf_printf(sb, "%s<fwheads>%ju</fwheads>\n",
			    indent, (uintmax_t) mp->fwheads);
			sbuf_printf(sb, "%s<fwsectors>%ju</fwsectors>\n",
			    indent, (uintmax_t) mp->fwsectors);
			sbuf_printf(sb, "%s<length>%ju</length>\n",
			    indent, (uintmax_t) mp->mediasize);
			sbuf_printf(sb, "%s<type>%s</type>\n", indent,
			    type);
			if (mp->type == MD_VNODE && mp->vnode != NULL)
				sbuf_printf(sb, "%s<file>%s</file>\n",
				    indent, mp->file);
		}
	}
}

static void
g_md_fini(struct g_class *mp __unused)
{

	sx_destroy(&md_sx);
	if (status_dev != NULL)
		destroy_dev(status_dev);
	delete_unrhdr(md_uh);
}
1346