/*-
 * ----------------------------------------------------------------------------
 * "THE BEER-WARE LICENSE" (Revision 42):
 * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
 * can do whatever you want with this stuff. If we meet some day, and you think
 * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
 * ----------------------------------------------------------------------------
 *
 * $FreeBSD: head/sys/dev/md/md.c 217583 2011-01-19 16:48:07Z kib $
 *
 */

/*-
 * The following functions are based in the vn(4) driver: mdstart_swap(),
 * mdstart_vnode(), mdcreate_swap(), mdcreate_vnode() and mddestroy(),
 * and as such under the following copyright:
 *
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah Hdr: vn.c 1.13 94/04/02
 *
 *	from: @(#)vn.c	8.6 (Berkeley) 4/1/94
 * From: src/sys/dev/vn/vn.c,v 1.122 2000/12/16 16:06:03
 */

#include "opt_geom.h"
#include "opt_md.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/conf.h>
#include <sys/devicestat.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/limits.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mdioctl.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/sx.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/sched.h>
#include <sys/sf_buf.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>

#include <geom/geom.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <vm/uma.h>

#define MD_MODVER 1

#define MD_SHUTDOWN	0x10000		/* Tell worker thread to terminate. */
#define	MD_EXITING	0x20000		/* Worker thread is exiting. */

#ifndef MD_NSECT
#define MD_NSECT (10000 * 2)
#endif

static MALLOC_DEFINE(M_MD, "md_disk", "Memory Disk");
static MALLOC_DEFINE(M_MDSECT, "md_sectors", "Memory Disk Sectors");

static int md_debug;
SYSCTL_INT(_debug, OID_AUTO, mddebug, CTLFLAG_RW, &md_debug, 0, "");
static int md_malloc_wait;
SYSCTL_INT(_vm, OID_AUTO, md_malloc_wait, CTLFLAG_RW, &md_malloc_wait, 0, "");

#if defined(MD_ROOT) && defined(MD_ROOT_SIZE)
/*
 * Preloaded image gets put here.
 * Applications that patch the object with the image can determine
 * the size looking at the start and end markers (strings),
 * so we want them contiguous.
 */
static struct {
	u_char start[MD_ROOT_SIZE*1024];
	u_char end[128];
} mfs_root = {
	.start = "MFS Filesystem goes here",
	.end = "MFS Filesystem had better STOP here",
};
#endif

static g_init_t g_md_init;
static g_fini_t g_md_fini;
static g_start_t g_md_start;
static g_access_t g_md_access;
static void g_md_dumpconf(struct sbuf *sb, const char *indent,
    struct g_geom *gp, struct g_consumer *cp __unused, struct g_provider *pp);

static int mdunits;
static struct cdev *status_dev = 0;
static struct sx md_sx;
static struct unrhdr *md_uh;

static d_ioctl_t mdctlioctl;

static struct cdevsw mdctl_cdevsw = {
	.d_version =	D_VERSION,
	.d_ioctl =	mdctlioctl,
	.d_name =	MD_NAME,
};

struct g_class g_md_class = {
	.name = "MD",
	.version = G_VERSION,
	.init = g_md_init,
	.fini = g_md_fini,
	.start = g_md_start,
	.access = g_md_access,
	.dumpconf = g_md_dumpconf,
};

DECLARE_GEOM_CLASS(g_md_class, g_md);


static LIST_HEAD(, md_s) md_softc_list = LIST_HEAD_INITIALIZER(md_softc_list);

#define NINDIR	(PAGE_SIZE / sizeof(uintptr_t))
#define NMASK	(NINDIR-1)
static int nshift;

struct indir {
	uintptr_t	*array;
	u_int		total;
	u_int		used;
	u_int		shift;
};

struct md_s {
	int unit;
	LIST_ENTRY(md_s) list;
	struct bio_queue_head bio_queue;
	struct mtx queue_mtx;
	struct cdev *dev;
	enum md_types type;
	off_t mediasize;
	unsigned sectorsize;
	unsigned opencount;
	unsigned fwheads;
	unsigned fwsectors;
	unsigned flags;
	char name[20];
	struct proc *procp;
	struct g_geom *gp;
	struct g_provider *pp;
	int (*start)(struct md_s *sc, struct bio *bp);
	struct devstat *devstat;

	/* MD_MALLOC related fields */
	struct indir *indir;
	uma_zone_t uma;

	/* MD_PRELOAD related fields */
	u_char *pl_ptr;
	size_t pl_len;

	/* MD_VNODE related fields */
	struct vnode *vnode;
	char file[PATH_MAX];
	struct ucred *cred;

	/* MD_SWAP related fields */
	vm_object_t object;
};

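/*
 * Allocate and zero one indirection node with room for NINDIR entries.
 * Honours the vm.md_malloc_wait sysctl: sleep for memory when it is set,
 * otherwise fail with NULL so the caller can report ENOSPC.
 */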
static struct indir *
new_indir(u_int shift)
{
	struct indir *ip;

	ip = malloc(sizeof *ip, M_MD, (md_malloc_wait ? M_WAITOK : M_NOWAIT)
	    | M_ZERO);
	if (ip == NULL)
		return (NULL);
	ip->array = malloc(sizeof(uintptr_t) * NINDIR,
	    M_MDSECT, (md_malloc_wait ? M_WAITOK : M_NOWAIT) | M_ZERO);
	if (ip->array == NULL) {
		free(ip, M_MD);
		return (NULL);
	}
	ip->total = NINDIR;
	ip->shift = shift;
	return (ip);
}

static void
del_indir(struct indir *ip)
{

	free(ip->array, M_MDSECT);
	free(ip, M_MD);
}

static void
destroy_indir(struct md_s *sc, struct indir *ip)
{
	int i;

	for (i = 0; i < NINDIR; i++) {
		if (!ip->array[i])
			continue;
		if (ip->shift)
			destroy_indir(sc, (struct indir*)(ip->array[i]));
		else if (ip->array[i] > 255)
			uma_zfree(sc->uma, (void *)(ip->array[i]));
	}
	del_indir(ip);
}

/*
 * This function does the math and allocates the top level "indir" structure
 * for a device of "size" sectors.
 */

static struct indir *
dimension(off_t size)
{
	off_t rcnt;
	struct indir *ip;
	int layer;

	rcnt = size;
	layer = 0;
	while (rcnt > NINDIR) {
		rcnt /= NINDIR;
		layer++;
	}

	/*
	 * XXX: the top layer is probably not fully populated, so we allocate
	 * too much space for ip->array in here.
	 */
	ip = malloc(sizeof *ip, M_MD, M_WAITOK | M_ZERO);
	ip->array = malloc(sizeof(uintptr_t) * NINDIR,
	    M_MDSECT, M_WAITOK | M_ZERO);
	ip->total = NINDIR;
	ip->shift = layer * nshift;
	return (ip);
}

/*
 * Read a given sector
 */

static uintptr_t
s_read(struct indir *ip, off_t offset)
{
	struct indir *cip;
	int idx;
	uintptr_t up;

	if (md_debug > 1)
		printf("s_read(%jd)\n", (intmax_t)offset);
	up = 0;
	for (cip = ip; cip != NULL;) {
		if (cip->shift) {
			idx = (offset >> cip->shift) & NMASK;
			up = cip->array[idx];
			cip = (struct indir *)up;
			continue;
		}
		idx = offset & NMASK;
		return (cip->array[idx]);
	}
	return (0);
}

/*
 * Write a given sector, prune the tree if the value is 0
 */

static int
s_write(struct indir *ip, off_t offset, uintptr_t ptr)
{
	struct indir *cip, *lip[10];
	int idx, li;
	uintptr_t up;

	if (md_debug > 1)
		printf("s_write(%jd, %p)\n", (intmax_t)offset, (void *)ptr);
	up = 0;
	li = 0;
	cip = ip;
	for (;;) {
		lip[li++] = cip;
		if (cip->shift) {
			idx = (offset >> cip->shift) & NMASK;
			up = cip->array[idx];
			if (up != 0) {
				cip = (struct indir *)up;
				continue;
			}
			/* Allocate branch */
			cip->array[idx] =
			    (uintptr_t)new_indir(cip->shift - nshift);
			if (cip->array[idx] == 0)
				return (ENOSPC);
			cip->used++;
			up = cip->array[idx];
			cip = (struct indir *)up;
			continue;
		}
		/* leafnode */
		idx = offset & NMASK;
		up = cip->array[idx];
		if (up != 0)
			cip->used--;
		cip->array[idx] = ptr;
		if (ptr != 0)
			cip->used++;
		break;
	}
	if (cip->used != 0 || li == 1)
		return (0);
	li--;
	while (cip->used == 0 && cip != ip) {
		li--;
		idx = (offset >> lip[li]->shift) & NMASK;
		up = lip[li]->array[idx];
		KASSERT(up == (uintptr_t)cip, ("md screwed up"));
		del_indir(cip);
		lip[li]->array[idx] = 0;
		lip[li]->used--;
		cip = lip[li];
	}
	return (0);
}


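/*
 * GEOM access method: refuse new write opens on read-only devices and
 * track whether the provider currently has any consumers open.
 */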
static int
g_md_access(struct g_provider *pp, int r, int w, int e)
{
	struct md_s *sc;

	sc = pp->geom->softc;
	if (sc == NULL) {
		if (r <= 0 && w <= 0 && e <= 0)
			return (0);
		return (ENXIO);
	}
	r += pp->acr;
	w += pp->acw;
	e += pp->ace;
	if ((sc->flags & MD_READONLY) != 0 && w > 0)
		return (EROFS);
	if ((pp->acr + pp->acw + pp->ace) == 0 && (r + w + e) > 0) {
		sc->opencount = 1;
	} else if ((pp->acr + pp->acw + pp->ace) > 0 && (r + w + e) == 0) {
		sc->opencount = 0;
	}
	return (0);
}

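/*
 * GEOM start method: queue the bio on the softc and wake up the unit's
 * worker thread, which performs the actual I/O.
 */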
static void
g_md_start(struct bio *bp)
{
	struct md_s *sc;

	sc = bp->bio_to->geom->softc;
	if ((bp->bio_cmd == BIO_READ) || (bp->bio_cmd == BIO_WRITE))
		devstat_start_transaction_bio(sc->devstat, bp);
	mtx_lock(&sc->queue_mtx);
	bioq_disksort(&sc->bio_queue, bp);
	mtx_unlock(&sc->queue_mtx);
	wakeup(sc);
}

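/*
 * I/O for malloc-backed devices.  Sectors live in the indirection tree;
 * a value of 0 means an unallocated (all-zero) sector, values <= 255
 * encode a sector filled with that single byte (MD_COMPRESS), and larger
 * values are pointers to UMA-allocated sector buffers.
 */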
static int
mdstart_malloc(struct md_s *sc, struct bio *bp)
{
	int i, error;
	u_char *dst;
	off_t secno, nsec, uc;
	uintptr_t sp, osp;

	switch (bp->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
	case BIO_DELETE:
		break;
	default:
		return (EOPNOTSUPP);
	}

	nsec = bp->bio_length / sc->sectorsize;
	secno = bp->bio_offset / sc->sectorsize;
	dst = bp->bio_data;
	error = 0;
	while (nsec--) {
		osp = s_read(sc->indir, secno);
		if (bp->bio_cmd == BIO_DELETE) {
			if (osp != 0)
				error = s_write(sc->indir, secno, 0);
		} else if (bp->bio_cmd == BIO_READ) {
			if (osp == 0)
				bzero(dst, sc->sectorsize);
			else if (osp <= 255)
				memset(dst, osp, sc->sectorsize);
			else {
				bcopy((void *)osp, dst, sc->sectorsize);
				cpu_flush_dcache(dst, sc->sectorsize);
			}
			osp = 0;
		} else if (bp->bio_cmd == BIO_WRITE) {
			if (sc->flags & MD_COMPRESS) {
				uc = dst[0];
				for (i = 1; i < sc->sectorsize; i++)
					if (dst[i] != uc)
						break;
			} else {
				i = 0;
				uc = 0;
			}
			if (i == sc->sectorsize) {
				if (osp != uc)
					error = s_write(sc->indir, secno, uc);
			} else {
				if (osp <= 255) {
					sp = (uintptr_t)uma_zalloc(sc->uma,
					    md_malloc_wait ? M_WAITOK :
					    M_NOWAIT);
					if (sp == 0) {
						error = ENOSPC;
						break;
					}
					bcopy(dst, (void *)sp, sc->sectorsize);
					error = s_write(sc->indir, secno, sp);
				} else {
					bcopy(dst, (void *)osp, sc->sectorsize);
					osp = 0;
				}
			}
		} else {
			error = EOPNOTSUPP;
		}
		if (osp > 255)
			uma_zfree(sc->uma, (void*)osp);
		if (error != 0)
			break;
		secno++;
		dst += sc->sectorsize;
	}
	bp->bio_resid = 0;
	return (error);
}

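/*
 * I/O for preloaded images is a straight copy to or from the in-kernel
 * image; there is nothing to allocate or free.
 */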
static int
mdstart_preload(struct md_s *sc, struct bio *bp)
{

	switch (bp->bio_cmd) {
	case BIO_READ:
		bcopy(sc->pl_ptr + bp->bio_offset, bp->bio_data,
		    bp->bio_length);
		cpu_flush_dcache(bp->bio_data, bp->bio_length);
		break;
	case BIO_WRITE:
		bcopy(bp->bio_data, sc->pl_ptr + bp->bio_offset,
		    bp->bio_length);
		break;
	}
	bp->bio_resid = 0;
	return (0);
}

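/*
 * I/O for vnode-backed devices is translated into VOP_READ/VOP_WRITE on
 * the backing file; BIO_FLUSH becomes VOP_FSYNC.
 */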
static int
mdstart_vnode(struct md_s *sc, struct bio *bp)
{
	int error, vfslocked;
	struct uio auio;
	struct iovec aiov;
	struct mount *mp;
	struct vnode *vp;
	struct thread *td;

	switch (bp->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
	case BIO_FLUSH:
		break;
	default:
		return (EOPNOTSUPP);
	}

	td = curthread;
	vp = sc->vnode;

	/*
	 * VNODE I/O
	 *
	 * If an error occurs, we set BIO_ERROR but we do not set
	 * B_INVAL because (for a write anyway), the buffer is
	 * still valid.
	 */

	if (bp->bio_cmd == BIO_FLUSH) {
		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
		(void) vn_start_write(vp, &mp, V_WAIT);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		error = VOP_FSYNC(vp, MNT_WAIT, td);
		VOP_UNLOCK(vp, 0);
		vn_finished_write(mp);
		VFS_UNLOCK_GIANT(vfslocked);
		return (error);
	}

	bzero(&auio, sizeof(auio));

	aiov.iov_base = bp->bio_data;
	aiov.iov_len = bp->bio_length;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = (vm_ooffset_t)bp->bio_offset;
	auio.uio_segflg = UIO_SYSSPACE;
	if (bp->bio_cmd == BIO_READ)
		auio.uio_rw = UIO_READ;
	else if (bp->bio_cmd == BIO_WRITE)
		auio.uio_rw = UIO_WRITE;
	else
		panic("wrong BIO_OP in mdstart_vnode");
	auio.uio_resid = bp->bio_length;
	auio.uio_td = td;
	/*
	 * When reading set IO_DIRECT to try to avoid double-caching
	 * the data.  When writing IO_DIRECT is not optimal.
	 */
	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	if (bp->bio_cmd == BIO_READ) {
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		error = VOP_READ(vp, &auio, IO_DIRECT, sc->cred);
		VOP_UNLOCK(vp, 0);
	} else {
		(void) vn_start_write(vp, &mp, V_WAIT);
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
		error = VOP_WRITE(vp, &auio, sc->flags & MD_ASYNC ? 0 : IO_SYNC,
		    sc->cred);
		VOP_UNLOCK(vp, 0);
		vn_finished_write(mp);
	}
	VFS_UNLOCK_GIANT(vfslocked);
	bp->bio_resid = auio.uio_resid;
	return (error);
}

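/*
 * I/O for swap-backed devices operates page by page on the anonymous
 * OBJT_SWAP object, mapping each page through a temporary sf_buf.
 */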
static int
mdstart_swap(struct md_s *sc, struct bio *bp)
{
	struct sf_buf *sf;
	int rv, offs, len, lastend;
	vm_pindex_t i, lastp;
	vm_page_t m;
	u_char *p;

	switch (bp->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
	case BIO_DELETE:
		break;
	default:
		return (EOPNOTSUPP);
	}

	p = bp->bio_data;

	/*
	 * offs is the offset at which to start operating on the
	 * next (ie, first) page.  lastp is the last page on
	 * which we're going to operate.  lastend is the ending
	 * position within that last page (ie, PAGE_SIZE if
	 * we're operating on complete aligned pages).
	 */
	offs = bp->bio_offset % PAGE_SIZE;
	lastp = (bp->bio_offset + bp->bio_length - 1) / PAGE_SIZE;
	lastend = (bp->bio_offset + bp->bio_length - 1) % PAGE_SIZE + 1;

	rv = VM_PAGER_OK;
	VM_OBJECT_LOCK(sc->object);
	vm_object_pip_add(sc->object, 1);
	for (i = bp->bio_offset / PAGE_SIZE; i <= lastp; i++) {
		len = ((i == lastp) ? lastend : PAGE_SIZE) - offs;

		m = vm_page_grab(sc->object, i,
		    VM_ALLOC_NORMAL|VM_ALLOC_RETRY);
		VM_OBJECT_UNLOCK(sc->object);
		sched_pin();
		sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
		VM_OBJECT_LOCK(sc->object);
		if (bp->bio_cmd == BIO_READ) {
			if (m->valid != VM_PAGE_BITS_ALL)
				rv = vm_pager_get_pages(sc->object, &m, 1, 0);
			if (rv == VM_PAGER_ERROR) {
				sf_buf_free(sf);
				sched_unpin();
				vm_page_wakeup(m);
				break;
			}
			bcopy((void *)(sf_buf_kva(sf) + offs), p, len);
			cpu_flush_dcache(p, len);
		} else if (bp->bio_cmd == BIO_WRITE) {
			if (len != PAGE_SIZE && m->valid != VM_PAGE_BITS_ALL)
				rv = vm_pager_get_pages(sc->object, &m, 1, 0);
			if (rv == VM_PAGER_ERROR) {
				sf_buf_free(sf);
				sched_unpin();
				vm_page_wakeup(m);
				break;
			}
			bcopy(p, (void *)(sf_buf_kva(sf) + offs), len);
			m->valid = VM_PAGE_BITS_ALL;
#if 0
		} else if (bp->bio_cmd == BIO_DELETE) {
			if (len != PAGE_SIZE && m->valid != VM_PAGE_BITS_ALL)
				rv = vm_pager_get_pages(sc->object, &m, 1, 0);
			if (rv == VM_PAGER_ERROR) {
				sf_buf_free(sf);
				sched_unpin();
				vm_page_wakeup(m);
				break;
			}
			bzero((void *)(sf_buf_kva(sf) + offs), len);
			vm_page_dirty(m);
			m->valid = VM_PAGE_BITS_ALL;
#endif
		}
		sf_buf_free(sf);
		sched_unpin();
		vm_page_wakeup(m);
		vm_page_lock(m);
		vm_page_activate(m);
		vm_page_unlock(m);
		if (bp->bio_cmd == BIO_WRITE)
			vm_page_dirty(m);

		/* Actions on further pages start at offset 0 */
		p += PAGE_SIZE - offs;
		offs = 0;
#if 0
if (bootverbose || bp->bio_offset / PAGE_SIZE < 17)
printf("wire_count %d busy %d flags %x hold_count %d act_count %d queue %d valid %d dirty %d @ %d\n",
    m->wire_count, m->busy,
    m->flags, m->hold_count, m->act_count, m->queue, m->valid, m->dirty, i);
#endif
	}
	vm_object_pip_subtract(sc->object, 1);
	VM_OBJECT_UNLOCK(sc->object);
	return (rv != VM_PAGER_ERROR ? 0 : ENOSPC);
}

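/*
 * Per-unit worker thread: take bios off the queue, hand them to the
 * type-specific start routine and deliver the completed requests.
 */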
static void
md_kthread(void *arg)
{
	struct md_s *sc;
	struct bio *bp;
	int error;

	sc = arg;
	thread_lock(curthread);
	sched_prio(curthread, PRIBIO);
	thread_unlock(curthread);
	if (sc->type == MD_VNODE)
		curthread->td_pflags |= TDP_NORUNNINGBUF;

	for (;;) {
		mtx_lock(&sc->queue_mtx);
		if (sc->flags & MD_SHUTDOWN) {
			sc->flags |= MD_EXITING;
			mtx_unlock(&sc->queue_mtx);
			kproc_exit(0);
		}
		bp = bioq_takefirst(&sc->bio_queue);
		if (!bp) {
			msleep(sc, &sc->queue_mtx, PRIBIO | PDROP, "mdwait", 0);
			continue;
		}
		mtx_unlock(&sc->queue_mtx);
		if (bp->bio_cmd == BIO_GETATTR) {
			if ((sc->fwsectors && sc->fwheads &&
			    (g_handleattr_int(bp, "GEOM::fwsectors",
			    sc->fwsectors) ||
			    g_handleattr_int(bp, "GEOM::fwheads",
			    sc->fwheads))) ||
			    g_handleattr_int(bp, "GEOM::candelete", 1))
				error = -1;
			else
				error = EOPNOTSUPP;
		} else {
			error = sc->start(sc, bp);
		}

		if (error != -1) {
			bp->bio_completed = bp->bio_length;
			g_io_deliver(bp, error);
			if ((bp->bio_cmd == BIO_READ) || (bp->bio_cmd == BIO_WRITE))
				devstat_end_transaction_bio(sc->devstat, bp);
		}
	}
}

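/*
 * Look up the softc for a unit number; returns NULL if the unit is not
 * configured.
 */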
static struct md_s *
mdfind(int unit)
{
	struct md_s *sc;

	LIST_FOREACH(sc, &md_softc_list, list) {
		if (sc->unit == unit)
			break;
	}
	return (sc);
}

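/*
 * Allocate a softc for a new unit (a specific unit number, or the next
 * free one when unit is -1) and start its worker thread.
 */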
static struct md_s *
mdnew(int unit, int *errp, enum md_types type)
{
	struct md_s *sc;
	int error;

	*errp = 0;
	if (unit == -1)
		unit = alloc_unr(md_uh);
	else
		unit = alloc_unr_specific(md_uh, unit);

	if (unit == -1) {
		*errp = EBUSY;
		return (NULL);
	}

	sc = (struct md_s *)malloc(sizeof *sc, M_MD, M_WAITOK | M_ZERO);
	sc->type = type;
	bioq_init(&sc->bio_queue);
	mtx_init(&sc->queue_mtx, "md bio queue", NULL, MTX_DEF);
	sc->unit = unit;
	sprintf(sc->name, "md%d", unit);
	LIST_INSERT_HEAD(&md_softc_list, sc, list);
	error = kproc_create(md_kthread, sc, &sc->procp, 0, 0,"%s", sc->name);
	if (error == 0)
		return (sc);
	LIST_REMOVE(sc, list);
	mtx_destroy(&sc->queue_mtx);
	free_unr(md_uh, sc->unit);
	free(sc, M_MD);
	*errp = error;
	return (NULL);
}

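/*
 * Create the GEOM geom and provider for a configured unit and register
 * its devstat entry.
 */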
static void
mdinit(struct md_s *sc)
{
	struct g_geom *gp;
	struct g_provider *pp;

	g_topology_lock();
	gp = g_new_geomf(&g_md_class, "md%d", sc->unit);
	gp->softc = sc;
	pp = g_new_providerf(gp, "md%d", sc->unit);
	pp->mediasize = sc->mediasize;
	pp->sectorsize = sc->sectorsize;
	sc->gp = gp;
	sc->pp = pp;
	g_error_provider(pp, 0);
	g_topology_unlock();
	sc->devstat = devstat_new_entry("md", sc->unit, sc->sectorsize,
	    DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX);
}

/*
 * XXX: we should check that the range they feed us is mapped.
 * XXX: we should implement read-only.
 */

static int
mdcreate_preload(struct md_s *sc, struct md_ioctl *mdio)
{

	if (mdio->md_options & ~(MD_AUTOUNIT | MD_FORCE))
		return (EINVAL);
	if (mdio->md_base == 0)
		return (EINVAL);
	sc->flags = mdio->md_options & MD_FORCE;
	/* Cast to pointer size, then to pointer to avoid warning */
	sc->pl_ptr = (u_char *)(uintptr_t)mdio->md_base;
	sc->pl_len = (size_t)sc->mediasize;
	return (0);
}


static int
mdcreate_malloc(struct md_s *sc, struct md_ioctl *mdio)
{
	uintptr_t sp;
	int error;
	off_t u;

	error = 0;
	if (mdio->md_options & ~(MD_AUTOUNIT | MD_COMPRESS | MD_RESERVE))
		return (EINVAL);
	if (mdio->md_sectorsize != 0 && !powerof2(mdio->md_sectorsize))
		return (EINVAL);
	/* Compression doesn't make sense if we have reserved space */
	if (mdio->md_options & MD_RESERVE)
		mdio->md_options &= ~MD_COMPRESS;
	if (mdio->md_fwsectors != 0)
		sc->fwsectors = mdio->md_fwsectors;
	if (mdio->md_fwheads != 0)
		sc->fwheads = mdio->md_fwheads;
	sc->flags = mdio->md_options & (MD_COMPRESS | MD_FORCE);
	sc->indir = dimension(sc->mediasize / sc->sectorsize);
	sc->uma = uma_zcreate(sc->name, sc->sectorsize, NULL, NULL, NULL, NULL,
	    0x1ff, 0);
	if (mdio->md_options & MD_RESERVE) {
		off_t nsectors;

		nsectors = sc->mediasize / sc->sectorsize;
		for (u = 0; u < nsectors; u++) {
			sp = (uintptr_t)uma_zalloc(sc->uma, (md_malloc_wait ?
			    M_WAITOK : M_NOWAIT) | M_ZERO);
			if (sp != 0)
				error = s_write(sc->indir, u, sp);
			else
				error = ENOMEM;
			if (error != 0)
				break;
		}
	}
	return (error);
}


static int
mdsetcred(struct md_s *sc, struct ucred *cred)
{
	char *tmpbuf;
	int error = 0;

	/*
	 * Set credits in our softc
	 */

	if (sc->cred)
		crfree(sc->cred);
	sc->cred = crhold(cred);

	/*
	 * Horrible kludge to establish credentials for NFS  XXX.
	 */

	if (sc->vnode) {
		struct uio auio;
		struct iovec aiov;

		tmpbuf = malloc(sc->sectorsize, M_TEMP, M_WAITOK);
		bzero(&auio, sizeof(auio));

		aiov.iov_base = tmpbuf;
		aiov.iov_len = sc->sectorsize;
		auio.uio_iov = &aiov;
		auio.uio_iovcnt = 1;
		auio.uio_offset = 0;
		auio.uio_rw = UIO_READ;
		auio.uio_segflg = UIO_SYSSPACE;
		auio.uio_resid = aiov.iov_len;
		vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY);
		error = VOP_READ(sc->vnode, &auio, 0, sc->cred);
		VOP_UNLOCK(sc->vnode, 0);
		free(tmpbuf, M_TEMP);
	}
	return (error);
}

static int
mdcreate_vnode(struct md_s *sc, struct md_ioctl *mdio, struct thread *td)
{
	struct vattr vattr;
	struct nameidata nd;
	char *fname;
	int error, flags, vfslocked;

	/*
	 * Kernel-originated requests must have the filename appended
	 * to the mdio structure to protect against malicious software.
	 */
	fname = mdio->md_file;
	if ((void *)fname != (void *)(mdio + 1)) {
		error = copyinstr(fname, sc->file, sizeof(sc->file), NULL);
		if (error != 0)
			return (error);
	} else
		strlcpy(sc->file, fname, sizeof(sc->file));

	/*
	 * If the user specified that this is a read only device, don't
	 * set the FWRITE mask before trying to open the backing store.
	 */
	flags = FREAD | ((mdio->md_options & MD_READONLY) ? 0 : FWRITE);
	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, sc->file, td);
	error = vn_open(&nd, &flags, 0, NULL);
	if (error != 0)
		return (error);
	vfslocked = NDHASGIANT(&nd);
	NDFREE(&nd, NDF_ONLY_PNBUF);
	if (nd.ni_vp->v_type != VREG) {
		error = EINVAL;
		goto bad;
	}
	error = VOP_GETATTR(nd.ni_vp, &vattr, td->td_ucred);
	if (error != 0)
		goto bad;
	if (VOP_ISLOCKED(nd.ni_vp) != LK_EXCLUSIVE) {
		vn_lock(nd.ni_vp, LK_UPGRADE | LK_RETRY);
		if (nd.ni_vp->v_iflag & VI_DOOMED) {
			/* Forced unmount. */
			error = EBADF;
			goto bad;
		}
	}
	nd.ni_vp->v_vflag |= VV_MD;
	VOP_UNLOCK(nd.ni_vp, 0);

	if (mdio->md_fwsectors != 0)
		sc->fwsectors = mdio->md_fwsectors;
	if (mdio->md_fwheads != 0)
		sc->fwheads = mdio->md_fwheads;
	sc->flags = mdio->md_options & (MD_FORCE | MD_ASYNC);
	if (!(flags & FWRITE))
		sc->flags |= MD_READONLY;
	sc->vnode = nd.ni_vp;

	error = mdsetcred(sc, td->td_ucred);
	if (error != 0) {
		sc->vnode = NULL;
		vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY);
		nd.ni_vp->v_vflag &= ~VV_MD;
		goto bad;
	}
	VFS_UNLOCK_GIANT(vfslocked);
	return (0);
bad:
	VOP_UNLOCK(nd.ni_vp, 0);
	(void)vn_close(nd.ni_vp, flags, td->td_ucred, td);
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}

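/*
 * Tear down a unit: wither the geom, stop the worker thread and release
 * whatever backing store the unit type acquired.
 */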
static int
mddestroy(struct md_s *sc, struct thread *td)
{
	int vfslocked;

	if (sc->gp) {
		sc->gp->softc = NULL;
		g_topology_lock();
		g_wither_geom(sc->gp, ENXIO);
		g_topology_unlock();
		sc->gp = NULL;
		sc->pp = NULL;
	}
	if (sc->devstat) {
		devstat_remove_entry(sc->devstat);
		sc->devstat = NULL;
	}
	mtx_lock(&sc->queue_mtx);
	sc->flags |= MD_SHUTDOWN;
	wakeup(sc);
	while (!(sc->flags & MD_EXITING))
		msleep(sc->procp, &sc->queue_mtx, PRIBIO, "mddestroy", hz / 10);
	mtx_unlock(&sc->queue_mtx);
	mtx_destroy(&sc->queue_mtx);
	if (sc->vnode != NULL) {
		vfslocked = VFS_LOCK_GIANT(sc->vnode->v_mount);
		vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY);
		sc->vnode->v_vflag &= ~VV_MD;
		VOP_UNLOCK(sc->vnode, 0);
		(void)vn_close(sc->vnode, sc->flags & MD_READONLY ?
		    FREAD : (FREAD|FWRITE), sc->cred, td);
		VFS_UNLOCK_GIANT(vfslocked);
	}
	if (sc->cred != NULL)
		crfree(sc->cred);
	if (sc->object != NULL)
		vm_object_deallocate(sc->object);
	if (sc->indir)
		destroy_indir(sc, sc->indir);
	if (sc->uma)
		uma_zdestroy(sc->uma);

	LIST_REMOVE(sc, list);
	free_unr(md_uh, sc->unit);
	free(sc, M_MD);
	return (0);
}

static int
mdcreate_swap(struct md_s *sc, struct md_ioctl *mdio, struct thread *td)
{
	vm_ooffset_t npage;
	int error;

	/*
	 * Range check.  Disallow negative sizes or any size less than the
	 * size of a page.  Then round to a page.
	 */
	if (sc->mediasize == 0 || (sc->mediasize % PAGE_SIZE) != 0)
		return (EDOM);

	/*
	 * Allocate an OBJT_SWAP object.
	 *
	 * Note the truncation.
	 */

	npage = mdio->md_mediasize / PAGE_SIZE;
	if (mdio->md_fwsectors != 0)
		sc->fwsectors = mdio->md_fwsectors;
	if (mdio->md_fwheads != 0)
		sc->fwheads = mdio->md_fwheads;
	sc->object = vm_pager_allocate(OBJT_SWAP, NULL, PAGE_SIZE * npage,
	    VM_PROT_DEFAULT, 0, td->td_ucred);
	if (sc->object == NULL)
		return (ENOMEM);
	sc->flags = mdio->md_options & MD_FORCE;
	if (mdio->md_options & MD_RESERVE) {
		if (swap_pager_reserve(sc->object, 0, npage) < 0) {
			error = EDOM;
			goto finish;
		}
	}
	error = mdsetcred(sc, td->td_ucred);
 finish:
	if (error != 0) {
		vm_object_deallocate(sc->object);
		sc->object = NULL;
	}
	return (error);
}


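/*
 * Configuration ioctls (attach, detach, query, list), called with the
 * md_sx configuration lock held by mdctlioctl() below.
 */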
static int
xmdctlioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td)
{
	struct md_ioctl *mdio;
	struct md_s *sc;
	int error, i;

	if (md_debug)
		printf("mdctlioctl(%s %lx %p %x %p)\n",
			devtoname(dev), cmd, addr, flags, td);

	mdio = (struct md_ioctl *)addr;
	if (mdio->md_version != MDIOVERSION)
		return (EINVAL);

	/*
	 * We assert the version number in the individual ioctl
	 * handlers instead of out here because (a) it is possible we
	 * may add another ioctl in the future which doesn't read an
	 * mdio, and (b) the correct return value for an unknown ioctl
	 * is ENOIOCTL, not EINVAL.
	 */
	error = 0;
	switch (cmd) {
	case MDIOCATTACH:
		switch (mdio->md_type) {
		case MD_MALLOC:
		case MD_PRELOAD:
		case MD_VNODE:
		case MD_SWAP:
			break;
		default:
			return (EINVAL);
		}
		if (mdio->md_options & MD_AUTOUNIT)
			sc = mdnew(-1, &error, mdio->md_type);
		else {
			if (mdio->md_unit > INT_MAX)
				return (EINVAL);
			sc = mdnew(mdio->md_unit, &error, mdio->md_type);
		}
		if (sc == NULL)
			return (error);
		if (mdio->md_options & MD_AUTOUNIT)
			mdio->md_unit = sc->unit;
		sc->mediasize = mdio->md_mediasize;
		if (mdio->md_sectorsize == 0)
			sc->sectorsize = DEV_BSIZE;
		else
			sc->sectorsize = mdio->md_sectorsize;
		error = EDOOFUS;
		switch (sc->type) {
		case MD_MALLOC:
			sc->start = mdstart_malloc;
			error = mdcreate_malloc(sc, mdio);
			break;
		case MD_PRELOAD:
			sc->start = mdstart_preload;
			error = mdcreate_preload(sc, mdio);
			break;
		case MD_VNODE:
			sc->start = mdstart_vnode;
			error = mdcreate_vnode(sc, mdio, td);
			break;
		case MD_SWAP:
			sc->start = mdstart_swap;
			error = mdcreate_swap(sc, mdio, td);
			break;
		}
		if (error != 0) {
			mddestroy(sc, td);
			return (error);
		}

		/* Prune off any residual fractional sector */
		i = sc->mediasize % sc->sectorsize;
		sc->mediasize -= i;

		mdinit(sc);
		return (0);
	case MDIOCDETACH:
		if (mdio->md_mediasize != 0 ||
		    (mdio->md_options & ~MD_FORCE) != 0)
			return (EINVAL);

		sc = mdfind(mdio->md_unit);
		if (sc == NULL)
			return (ENOENT);
		if (sc->opencount != 0 && !(sc->flags & MD_FORCE) &&
		    !(mdio->md_options & MD_FORCE))
			return (EBUSY);
		return (mddestroy(sc, td));
	case MDIOCQUERY:
		sc = mdfind(mdio->md_unit);
		if (sc == NULL)
			return (ENOENT);
		mdio->md_type = sc->type;
		mdio->md_options = sc->flags;
		mdio->md_mediasize = sc->mediasize;
		mdio->md_sectorsize = sc->sectorsize;
		if (sc->type == MD_VNODE)
			error = copyout(sc->file, mdio->md_file,
			    strlen(sc->file) + 1);
		return (error);
	case MDIOCLIST:
		i = 1;
		LIST_FOREACH(sc, &md_softc_list, list) {
			if (i == MDNPAD - 1)
				mdio->md_pad[i] = -1;
			else
				mdio->md_pad[i++] = sc->unit;
		}
		mdio->md_pad[0] = i - 1;
		return (0);
	default:
		return (ENOIOCTL);
	};
}

static int
mdctlioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td)
{
	int error;

	sx_xlock(&md_sx);
	error = xmdctlioctl(dev, cmd, addr, flags, td);
	sx_xunlock(&md_sx);
	return (error);
}

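/*
 * Attach a preloaded image as a new md unit; unit 0 may become the root
 * device when the kernel is built with MD_ROOT.
 */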
static void
md_preloaded(u_char *image, size_t length)
{
	struct md_s *sc;
	int error;

	sc = mdnew(-1, &error, MD_PRELOAD);
	if (sc == NULL)
		return;
	sc->mediasize = length;
	sc->sectorsize = DEV_BSIZE;
	sc->pl_ptr = image;
	sc->pl_len = length;
	sc->start = mdstart_preload;
#ifdef MD_ROOT
	if (sc->unit == 0)
		rootdevnames[0] = "ufs:/dev/md0";
#endif
	mdinit(sc);
}

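/*
 * Class initialization: attach the compiled-in root image (if any) and
 * any md_image/mfs_root modules supplied by the loader, then create the
 * /dev/mdctl control device.
 */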
1229
1230static void
1231g_md_init(struct g_class *mp __unused)
1232{
1233	caddr_t mod;
1234	caddr_t c;
1235	u_char *ptr, *name, *type;
1236	unsigned len;
1237	int i;
1238
1239	/* figure out log2(NINDIR) */
1240	for (i = NINDIR, nshift = -1; i; nshift++)
1241		i >>= 1;
1242
1243	mod = NULL;
1244	sx_init(&md_sx, "MD config lock");
1245	g_topology_unlock();
1246	md_uh = new_unrhdr(0, INT_MAX, NULL);
1247#ifdef MD_ROOT_SIZE
1248	sx_xlock(&md_sx);
1249	md_preloaded(mfs_root.start, sizeof(mfs_root.start));
1250	sx_xunlock(&md_sx);
1251#endif
1252	/* XXX: are preload_* static or do they need Giant ? */
1253	while ((mod = preload_search_next_name(mod)) != NULL) {
1254		name = (char *)preload_search_info(mod, MODINFO_NAME);
1255		if (name == NULL)
1256			continue;
1257		type = (char *)preload_search_info(mod, MODINFO_TYPE);
1258		if (type == NULL)
1259			continue;
1260		if (strcmp(type, "md_image") && strcmp(type, "mfs_root"))
1261			continue;
1262		c = preload_search_info(mod, MODINFO_ADDR);
1263		ptr = *(u_char **)c;
1264		c = preload_search_info(mod, MODINFO_SIZE);
1265		len = *(size_t *)c;
1266		printf("%s%d: Preloaded image <%s> %d bytes at %p\n",
1267		    MD_NAME, mdunits, name, len, ptr);
1268		sx_xlock(&md_sx);
1269		md_preloaded(ptr, len);
1270		sx_xunlock(&md_sx);
1271	}
1272	status_dev = make_dev(&mdctl_cdevsw, INT_MAX, UID_ROOT, GID_WHEEL,
1273	    0600, MDCTL_NAME);
1274	g_topology_lock();
1275}
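/*
 * Dump per-unit details for the kern.geom.conf* sysctls, in either the
 * terse one-line or the XML format depending on "indent".
 */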
1276
1277static void
1278g_md_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
1279    struct g_consumer *cp __unused, struct g_provider *pp)
1280{
1281	struct md_s *mp;
1282	char *type;
1283
1284	mp = gp->softc;
1285	if (mp == NULL)
1286		return;
1287
1288	switch (mp->type) {
1289	case MD_MALLOC:
1290		type = "malloc";
1291		break;
1292	case MD_PRELOAD:
1293		type = "preload";
1294		break;
1295	case MD_VNODE:
1296		type = "vnode";
1297		break;
1298	case MD_SWAP:
1299		type = "swap";
1300		break;
1301	default:
1302		type = "unknown";
1303		break;
1304	}
1305
1306	if (pp != NULL) {
1307		if (indent == NULL) {
1308			sbuf_printf(sb, " u %d", mp->unit);
1309			sbuf_printf(sb, " s %ju", (uintmax_t) mp->sectorsize);
1310			sbuf_printf(sb, " f %ju", (uintmax_t) mp->fwheads);
1311			sbuf_printf(sb, " fs %ju", (uintmax_t) mp->fwsectors);
1312			sbuf_printf(sb, " l %ju", (uintmax_t) mp->mediasize);
1313			sbuf_printf(sb, " t %s", type);
1314			if (mp->type == MD_VNODE && mp->vnode != NULL)
1315				sbuf_printf(sb, " file %s", mp->file);
1316		} else {
1317			sbuf_printf(sb, "%s<unit>%d</unit>\n", indent,
1318			    mp->unit);
1319			sbuf_printf(sb, "%s<sectorsize>%ju</sectorsize>\n",
1320			    indent, (uintmax_t) mp->sectorsize);
1321			sbuf_printf(sb, "%s<fwheads>%ju</fwheads>\n",
1322			    indent, (uintmax_t) mp->fwheads);
1323			sbuf_printf(sb, "%s<fwsectors>%ju</fwsectors>\n",
1324			    indent, (uintmax_t) mp->fwsectors);
1325			sbuf_printf(sb, "%s<length>%ju</length>\n",
1326			    indent, (uintmax_t) mp->mediasize);
1327			sbuf_printf(sb, "%s<type>%s</type>\n", indent,
1328			    type);
1329			if (mp->type == MD_VNODE && mp->vnode != NULL)
1330				sbuf_printf(sb, "%s<file>%s</file>\n",
1331				    indent, mp->file);
1332		}
1333	}
1334}
1335
1336static void
1337g_md_fini(struct g_class *mp __unused)
1338{
1339
1340	sx_destroy(&md_sx);
1341	if (status_dev != NULL)
1342		destroy_dev(status_dev);
1343	delete_unrhdr(md_uh);
1344}
1345