/*-
 * ----------------------------------------------------------------------------
 * "THE BEER-WARE LICENSE" (Revision 42):
 * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
 * can do whatever you want with this stuff. If we meet some day, and you think
 * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
 * ----------------------------------------------------------------------------
 *
 * $FreeBSD: stable/11/sys/dev/md/md.c 327655 2018-01-06 23:20:35Z ian $
 *
 */

/*-
 * The following functions are based on the vn(4) driver: mdstart_swap(),
 * mdstart_vnode(), mdcreate_swap(), mdcreate_vnode() and mddestroy(),
 * and as such are under the following copyright:
 *
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 * Copyright (c) 2013 The FreeBSD Foundation
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Portions of this software were developed by Konstantin Belousov
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah Hdr: vn.c 1.13 94/04/02
 *
 *	from: @(#)vn.c	8.6 (Berkeley) 4/1/94
 * From: src/sys/dev/vn/vn.c,v 1.122 2000/12/16 16:06:03
 */

#include "opt_rootdevname.h"
#include "opt_geom.h"
#include "opt_md.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/devicestat.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/limits.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mdioctl.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/sx.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sched.h>
#include <sys/sf_buf.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>

#include <geom/geom.h>
#include <geom/geom_int.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <vm/uma.h>

#include <machine/bus.h>

#define MD_MODVER 1

#define MD_SHUTDOWN	0x10000		/* Tell worker thread to terminate. */
#define	MD_EXITING	0x20000		/* Worker thread is exiting. */

#ifndef MD_NSECT
#define MD_NSECT (10000 * 2)
#endif

static MALLOC_DEFINE(M_MD, "md_disk", "Memory Disk");
static MALLOC_DEFINE(M_MDSECT, "md_sectors", "Memory Disk Sectors");

static int md_debug;
SYSCTL_INT(_debug, OID_AUTO, mddebug, CTLFLAG_RW, &md_debug, 0,
    "Enable md(4) debug messages");
static int md_malloc_wait;
SYSCTL_INT(_vm, OID_AUTO, md_malloc_wait, CTLFLAG_RW, &md_malloc_wait, 0,
    "Allow malloc to wait for memory allocations");

#if defined(MD_ROOT) && !defined(MD_ROOT_FSTYPE)
#define	MD_ROOT_FSTYPE	"ufs"
#endif

#if defined(MD_ROOT)
/*
 * Preloaded image gets put here.
 */
#if defined(MD_ROOT_SIZE)
/*
 * We put the mfs_root symbol into the oldmfs section of the kernel object file.
 * Applications that patch the object with the image can determine the size
 * by looking at the oldmfs section size within the kernel.
 */
u_char mfs_root[MD_ROOT_SIZE*1024] __attribute__ ((section ("oldmfs")));
const int mfs_root_size = sizeof(mfs_root);
#else
extern volatile u_char __weak_symbol mfs_root;
extern volatile u_char __weak_symbol mfs_root_end;
__GLOBL(mfs_root);
__GLOBL(mfs_root_end);
#define mfs_root_size ((uintptr_t)(&mfs_root_end - &mfs_root))
#endif
#endif
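
/*
 * Example sizing (illustrative values only): building with "options
 * MD_ROOT_SIZE=8192" reserves 8192 * 1024 bytes (8 MB) for mfs_root[] in the
 * "oldmfs" section, which a post-processing tool can locate and overwrite with a
 * filesystem image.  Without MD_ROOT_SIZE, the image is instead expected to be
 * placed between the weak symbols mfs_root and mfs_root_end, and mfs_root_size
 * is simply the distance between them.
 */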

static g_init_t g_md_init;
static g_fini_t g_md_fini;
static g_start_t g_md_start;
static g_access_t g_md_access;
static void g_md_dumpconf(struct sbuf *sb, const char *indent,
    struct g_geom *gp, struct g_consumer *cp __unused, struct g_provider *pp);

static struct cdev *status_dev = NULL;
static struct sx md_sx;
static struct unrhdr *md_uh;

static d_ioctl_t mdctlioctl;

static struct cdevsw mdctl_cdevsw = {
	.d_version =	D_VERSION,
	.d_ioctl =	mdctlioctl,
	.d_name =	MD_NAME,
};

struct g_class g_md_class = {
	.name = "MD",
	.version = G_VERSION,
	.init = g_md_init,
	.fini = g_md_fini,
	.start = g_md_start,
	.access = g_md_access,
	.dumpconf = g_md_dumpconf,
};

DECLARE_GEOM_CLASS(g_md_class, g_md);


static LIST_HEAD(, md_s) md_softc_list = LIST_HEAD_INITIALIZER(md_softc_list);

#define NINDIR	(PAGE_SIZE / sizeof(uintptr_t))
#define NMASK	(NINDIR-1)
static int nshift;

static int md_vnode_pbuf_freecnt;

struct indir {
	uintptr_t	*array;
	u_int		total;
	u_int		used;
	u_int		shift;
};
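
/*
 * The MD_MALLOC backing store is a radix tree of these "indir" nodes, keyed by
 * sector number.  Assuming 4 KB pages and 64-bit pointers (illustrative, not
 * required), NINDIR is 4096 / 8 = 512, NMASK is 511, and nshift (computed in
 * g_md_init()) is log2(512) = 9, so each tree level consumes 9 bits of the
 * sector number.  Interior nodes ("shift" != 0) hold pointers to child nodes,
 * leaf nodes hold per-sector values, and "used" counts the non-zero slots so
 * that empty nodes can be pruned by s_write().
 */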

struct md_s {
	int unit;
	LIST_ENTRY(md_s) list;
	struct bio_queue_head bio_queue;
	struct mtx queue_mtx;
	struct mtx stat_mtx;
	struct cdev *dev;
	enum md_types type;
	off_t mediasize;
	unsigned sectorsize;
	unsigned opencount;
	unsigned fwheads;
	unsigned fwsectors;
	unsigned flags;
	char name[20];
	struct proc *procp;
	struct g_geom *gp;
	struct g_provider *pp;
	int (*start)(struct md_s *sc, struct bio *bp);
	struct devstat *devstat;

	/* MD_MALLOC related fields */
	struct indir *indir;
	uma_zone_t uma;

	/* MD_PRELOAD related fields */
	u_char *pl_ptr;
	size_t pl_len;

	/* MD_VNODE related fields */
	struct vnode *vnode;
	char file[PATH_MAX];
	struct ucred *cred;

	/* MD_SWAP related fields */
	vm_object_t object;
};

static struct indir *
new_indir(u_int shift)
{
	struct indir *ip;

	ip = malloc(sizeof *ip, M_MD, (md_malloc_wait ? M_WAITOK : M_NOWAIT)
	    | M_ZERO);
	if (ip == NULL)
		return (NULL);
	ip->array = malloc(sizeof(uintptr_t) * NINDIR,
	    M_MDSECT, (md_malloc_wait ? M_WAITOK : M_NOWAIT) | M_ZERO);
	if (ip->array == NULL) {
		free(ip, M_MD);
		return (NULL);
	}
	ip->total = NINDIR;
	ip->shift = shift;
	return (ip);
}

static void
del_indir(struct indir *ip)
{

	free(ip->array, M_MDSECT);
	free(ip, M_MD);
}

static void
destroy_indir(struct md_s *sc, struct indir *ip)
{
	int i;

	for (i = 0; i < NINDIR; i++) {
		if (!ip->array[i])
			continue;
		if (ip->shift)
			destroy_indir(sc, (struct indir*)(ip->array[i]));
		else if (ip->array[i] > 255)
			uma_zfree(sc->uma, (void *)(ip->array[i]));
	}
	del_indir(ip);
}

/*
 * This function does the math and allocates the top level "indir" structure
 * for a device of "size" sectors.
 */

static struct indir *
dimension(off_t size)
{
	off_t rcnt;
	struct indir *ip;
	int layer;

	rcnt = size;
	layer = 0;
	while (rcnt > NINDIR) {
		rcnt /= NINDIR;
		layer++;
	}

	/*
	 * XXX: the top layer is probably not fully populated, so we allocate
	 * too much space for ip->array in here.
	 */
	ip = malloc(sizeof *ip, M_MD, M_WAITOK | M_ZERO);
	ip->array = malloc(sizeof(uintptr_t) * NINDIR,
	    M_MDSECT, M_WAITOK | M_ZERO);
	ip->total = NINDIR;
	ip->shift = layer * nshift;
	return (ip);
}
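
/*
 * Worked example, assuming 4 KB pages and 64-bit pointers (NINDIR = 512,
 * nshift = 9): a 1 GB device with 512-byte sectors has 2,097,152 sectors.
 * The loop above divides 2097152 -> 4096 -> 8, giving layer = 2, so the root
 * node gets shift = 18 and lookups walk three levels: the root indexes on
 * sector bits 18 and up, the middle level on bits 9-17, and the leaf on
 * bits 0-8.
 */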

/*
 * Read a given sector
 */

static uintptr_t
s_read(struct indir *ip, off_t offset)
{
	struct indir *cip;
	int idx;
	uintptr_t up;

	if (md_debug > 1)
		printf("s_read(%jd)\n", (intmax_t)offset);
	up = 0;
	for (cip = ip; cip != NULL;) {
		if (cip->shift) {
			idx = (offset >> cip->shift) & NMASK;
			up = cip->array[idx];
			cip = (struct indir *)up;
			continue;
		}
		idx = offset & NMASK;
		return (cip->array[idx]);
	}
	return (0);
}

/*
 * Write a given sector, prune the tree if the value is 0
 */

static int
s_write(struct indir *ip, off_t offset, uintptr_t ptr)
{
	struct indir *cip, *lip[10];
	int idx, li;
	uintptr_t up;

	if (md_debug > 1)
		printf("s_write(%jd, %p)\n", (intmax_t)offset, (void *)ptr);
	up = 0;
	li = 0;
	cip = ip;
	for (;;) {
		lip[li++] = cip;
		if (cip->shift) {
			idx = (offset >> cip->shift) & NMASK;
			up = cip->array[idx];
			if (up != 0) {
				cip = (struct indir *)up;
				continue;
			}
			/* Allocate branch */
			cip->array[idx] =
			    (uintptr_t)new_indir(cip->shift - nshift);
			if (cip->array[idx] == 0)
				return (ENOSPC);
			cip->used++;
			up = cip->array[idx];
			cip = (struct indir *)up;
			continue;
		}
		/* leafnode */
		idx = offset & NMASK;
		up = cip->array[idx];
		if (up != 0)
			cip->used--;
		cip->array[idx] = ptr;
		if (ptr != 0)
			cip->used++;
		break;
	}
	if (cip->used != 0 || li == 1)
		return (0);
	li--;
	while (cip->used == 0 && cip != ip) {
		li--;
		idx = (offset >> lip[li]->shift) & NMASK;
		up = lip[li]->array[idx];
		KASSERT(up == (uintptr_t)cip, ("md screwed up"));
		del_indir(cip);
		lip[li]->array[idx] = 0;
		lip[li]->used--;
		cip = lip[li];
	}
	return (0);
}
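
/*
 * Note on the leaf values stored by the MD_MALLOC code below: 0 means the sector
 * was never written (it reads back as zeros), 1..255 means the whole sector is
 * filled with that single byte (the MD_COMPRESS encoding), and anything larger is
 * a pointer to a sector buffer from sc->uma.  s_write(..., 0) only clears the
 * slot; freeing the old buffer is the caller's job, while the loop above walks
 * back through lip[] and deletes indir nodes whose "used" count has reached zero.
 */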
396
397
398static int
399g_md_access(struct g_provider *pp, int r, int w, int e)
400{
401	struct md_s *sc;
402
403	sc = pp->geom->softc;
404	if (sc == NULL) {
405		if (r <= 0 && w <= 0 && e <= 0)
406			return (0);
407		return (ENXIO);
408	}
409	r += pp->acr;
410	w += pp->acw;
411	e += pp->ace;
412	if ((sc->flags & MD_READONLY) != 0 && w > 0)
413		return (EROFS);
414	if ((pp->acr + pp->acw + pp->ace) == 0 && (r + w + e) > 0) {
415		sc->opencount = 1;
416	} else if ((pp->acr + pp->acw + pp->ace) > 0 && (r + w + e) == 0) {
417		sc->opencount = 0;
418	}
419	return (0);
420}
421
422static void
423g_md_start(struct bio *bp)
424{
425	struct md_s *sc;
426
427	sc = bp->bio_to->geom->softc;
428	if ((bp->bio_cmd == BIO_READ) || (bp->bio_cmd == BIO_WRITE)) {
429		mtx_lock(&sc->stat_mtx);
430		devstat_start_transaction_bio(sc->devstat, bp);
431		mtx_unlock(&sc->stat_mtx);
432	}
433	mtx_lock(&sc->queue_mtx);
434	bioq_disksort(&sc->bio_queue, bp);
435	mtx_unlock(&sc->queue_mtx);
436	wakeup(sc);
437}
438
439#define	MD_MALLOC_MOVE_ZERO	1
440#define	MD_MALLOC_MOVE_FILL	2
441#define	MD_MALLOC_MOVE_READ	3
442#define	MD_MALLOC_MOVE_WRITE	4
443#define	MD_MALLOC_MOVE_CMP	5
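
/*
 * Summary of the move operations implemented below: ZERO clears the caller's
 * buffer, FILL memsets it with a single byte, READ copies an md sector out to
 * the buffer, WRITE copies the buffer into an md sector, and CMP checks whether
 * every byte in the buffer equals the first one (EDOOFUS is used internally to
 * signal "not uniform"), which is how MD_COMPRESS decides that a sector can be
 * stored as a single fill byte.
 */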
444
445static int
446md_malloc_move_ma(vm_page_t **mp, int *ma_offs, unsigned sectorsize,
447    void *ptr, u_char fill, int op)
448{
449	struct sf_buf *sf;
450	vm_page_t m, *mp1;
451	char *p, first;
452	off_t *uc;
453	unsigned n;
454	int error, i, ma_offs1, sz, first_read;
455
456	m = NULL;
457	error = 0;
458	sf = NULL;
459	/* if (op == MD_MALLOC_MOVE_CMP) { gcc */
460		first = 0;
461		first_read = 0;
462		uc = ptr;
463		mp1 = *mp;
464		ma_offs1 = *ma_offs;
465	/* } */
466	sched_pin();
467	for (n = sectorsize; n != 0; n -= sz) {
468		sz = imin(PAGE_SIZE - *ma_offs, n);
469		if (m != **mp) {
470			if (sf != NULL)
471				sf_buf_free(sf);
472			m = **mp;
473			sf = sf_buf_alloc(m, SFB_CPUPRIVATE |
474			    (md_malloc_wait ? 0 : SFB_NOWAIT));
475			if (sf == NULL) {
476				error = ENOMEM;
477				break;
478			}
479		}
480		p = (char *)sf_buf_kva(sf) + *ma_offs;
481		switch (op) {
482		case MD_MALLOC_MOVE_ZERO:
483			bzero(p, sz);
484			break;
485		case MD_MALLOC_MOVE_FILL:
486			memset(p, fill, sz);
487			break;
488		case MD_MALLOC_MOVE_READ:
489			bcopy(ptr, p, sz);
490			cpu_flush_dcache(p, sz);
491			break;
492		case MD_MALLOC_MOVE_WRITE:
493			bcopy(p, ptr, sz);
494			break;
495		case MD_MALLOC_MOVE_CMP:
496			for (i = 0; i < sz; i++, p++) {
497				if (!first_read) {
498					*uc = (u_char)*p;
499					first = *p;
500					first_read = 1;
501				} else if (*p != first) {
502					error = EDOOFUS;
503					break;
504				}
505			}
506			break;
507		default:
508			KASSERT(0, ("md_malloc_move_ma unknown op %d\n", op));
509			break;
510		}
511		if (error != 0)
512			break;
513		*ma_offs += sz;
514		*ma_offs %= PAGE_SIZE;
515		if (*ma_offs == 0)
516			(*mp)++;
517		ptr = (char *)ptr + sz;
518	}
519
520	if (sf != NULL)
521		sf_buf_free(sf);
522	sched_unpin();
523	if (op == MD_MALLOC_MOVE_CMP && error != 0) {
524		*mp = mp1;
525		*ma_offs = ma_offs1;
526	}
527	return (error);
528}
529
530static int
531md_malloc_move_vlist(bus_dma_segment_t **pvlist, int *pma_offs,
532    unsigned len, void *ptr, u_char fill, int op)
533{
534	bus_dma_segment_t *vlist;
535	uint8_t *p, *end, first;
536	off_t *uc;
537	int ma_offs, seg_len;
538
539	vlist = *pvlist;
540	ma_offs = *pma_offs;
541	uc = ptr;
542
543	for (; len != 0; len -= seg_len) {
544		seg_len = imin(vlist->ds_len - ma_offs, len);
545		p = (uint8_t *)(uintptr_t)vlist->ds_addr + ma_offs;
546		switch (op) {
547		case MD_MALLOC_MOVE_ZERO:
548			bzero(p, seg_len);
549			break;
550		case MD_MALLOC_MOVE_FILL:
551			memset(p, fill, seg_len);
552			break;
553		case MD_MALLOC_MOVE_READ:
554			bcopy(ptr, p, seg_len);
555			cpu_flush_dcache(p, seg_len);
556			break;
557		case MD_MALLOC_MOVE_WRITE:
558			bcopy(p, ptr, seg_len);
559			break;
560		case MD_MALLOC_MOVE_CMP:
561			end = p + seg_len;
562			first = *uc = *p;
563			/* Confirm all following bytes match the first */
564			while (++p < end) {
565				if (*p != first)
566					return (EDOOFUS);
567			}
568			break;
569		default:
570			KASSERT(0, ("md_malloc_move_vlist unknown op %d\n", op));
571			break;
572		}
573
574		ma_offs += seg_len;
575		if (ma_offs == vlist->ds_len) {
576			ma_offs = 0;
577			vlist++;
578		}
579		ptr = (uint8_t *)ptr + seg_len;
580	}
581	*pvlist = vlist;
582	*pma_offs = ma_offs;
583
584	return (0);
585}
586
587static int
588mdstart_malloc(struct md_s *sc, struct bio *bp)
589{
590	u_char *dst;
591	vm_page_t *m;
592	bus_dma_segment_t *vlist;
593	int i, error, error1, ma_offs, notmapped;
594	off_t secno, nsec, uc;
595	uintptr_t sp, osp;
596
597	switch (bp->bio_cmd) {
598	case BIO_READ:
599	case BIO_WRITE:
600	case BIO_DELETE:
601		break;
602	default:
603		return (EOPNOTSUPP);
604	}
605
606	notmapped = (bp->bio_flags & BIO_UNMAPPED) != 0;
607	vlist = (bp->bio_flags & BIO_VLIST) != 0 ?
608	    (bus_dma_segment_t *)bp->bio_data : NULL;
609	if (notmapped) {
610		m = bp->bio_ma;
611		ma_offs = bp->bio_ma_offset;
612		dst = NULL;
613		KASSERT(vlist == NULL, ("vlists cannot be unmapped"));
614	} else if (vlist != NULL) {
615		ma_offs = bp->bio_ma_offset;
616		dst = NULL;
617	} else {
618		dst = bp->bio_data;
619	}
620
621	nsec = bp->bio_length / sc->sectorsize;
622	secno = bp->bio_offset / sc->sectorsize;
623	error = 0;
624	while (nsec--) {
625		osp = s_read(sc->indir, secno);
626		if (bp->bio_cmd == BIO_DELETE) {
627			if (osp != 0)
628				error = s_write(sc->indir, secno, 0);
629		} else if (bp->bio_cmd == BIO_READ) {
630			if (osp == 0) {
631				if (notmapped) {
632					error = md_malloc_move_ma(&m, &ma_offs,
633					    sc->sectorsize, NULL, 0,
634					    MD_MALLOC_MOVE_ZERO);
635				} else if (vlist != NULL) {
636					error = md_malloc_move_vlist(&vlist,
637					    &ma_offs, sc->sectorsize, NULL, 0,
638					    MD_MALLOC_MOVE_ZERO);
639				} else
640					bzero(dst, sc->sectorsize);
641			} else if (osp <= 255) {
642				if (notmapped) {
643					error = md_malloc_move_ma(&m, &ma_offs,
644					    sc->sectorsize, NULL, osp,
645					    MD_MALLOC_MOVE_FILL);
646				} else if (vlist != NULL) {
647					error = md_malloc_move_vlist(&vlist,
648					    &ma_offs, sc->sectorsize, NULL, osp,
649					    MD_MALLOC_MOVE_FILL);
650				} else
651					memset(dst, osp, sc->sectorsize);
652			} else {
653				if (notmapped) {
654					error = md_malloc_move_ma(&m, &ma_offs,
655					    sc->sectorsize, (void *)osp, 0,
656					    MD_MALLOC_MOVE_READ);
657				} else if (vlist != NULL) {
658					error = md_malloc_move_vlist(&vlist,
659					    &ma_offs, sc->sectorsize,
660					    (void *)osp, 0,
661					    MD_MALLOC_MOVE_READ);
662				} else {
663					bcopy((void *)osp, dst, sc->sectorsize);
664					cpu_flush_dcache(dst, sc->sectorsize);
665				}
666			}
667			osp = 0;
668		} else if (bp->bio_cmd == BIO_WRITE) {
669			if (sc->flags & MD_COMPRESS) {
670				if (notmapped) {
671					error1 = md_malloc_move_ma(&m, &ma_offs,
672					    sc->sectorsize, &uc, 0,
673					    MD_MALLOC_MOVE_CMP);
674					i = error1 == 0 ? sc->sectorsize : 0;
675				} else if (vlist != NULL) {
676					error1 = md_malloc_move_vlist(&vlist,
677					    &ma_offs, sc->sectorsize, &uc, 0,
678					    MD_MALLOC_MOVE_CMP);
679					i = error1 == 0 ? sc->sectorsize : 0;
680				} else {
681					uc = dst[0];
682					for (i = 1; i < sc->sectorsize; i++) {
683						if (dst[i] != uc)
684							break;
685					}
686				}
687			} else {
688				i = 0;
689				uc = 0;
690			}
691			if (i == sc->sectorsize) {
692				if (osp != uc)
693					error = s_write(sc->indir, secno, uc);
694			} else {
695				if (osp <= 255) {
696					sp = (uintptr_t)uma_zalloc(sc->uma,
697					    md_malloc_wait ? M_WAITOK :
698					    M_NOWAIT);
699					if (sp == 0) {
700						error = ENOSPC;
701						break;
702					}
703					if (notmapped) {
704						error = md_malloc_move_ma(&m,
705						    &ma_offs, sc->sectorsize,
706						    (void *)sp, 0,
707						    MD_MALLOC_MOVE_WRITE);
708					} else if (vlist != NULL) {
709						error = md_malloc_move_vlist(
710						    &vlist, &ma_offs,
711						    sc->sectorsize, (void *)sp,
712						    0, MD_MALLOC_MOVE_WRITE);
713					} else {
714						bcopy(dst, (void *)sp,
715						    sc->sectorsize);
716					}
717					error = s_write(sc->indir, secno, sp);
718				} else {
719					if (notmapped) {
720						error = md_malloc_move_ma(&m,
721						    &ma_offs, sc->sectorsize,
722						    (void *)osp, 0,
723						    MD_MALLOC_MOVE_WRITE);
724					} else if (vlist != NULL) {
725						error = md_malloc_move_vlist(
726						    &vlist, &ma_offs,
727						    sc->sectorsize, (void *)osp,
728						    0, MD_MALLOC_MOVE_WRITE);
729					} else {
730						bcopy(dst, (void *)osp,
731						    sc->sectorsize);
732					}
733					osp = 0;
734				}
735			}
736		} else {
737			error = EOPNOTSUPP;
738		}
739		if (osp > 255)
740			uma_zfree(sc->uma, (void*)osp);
741		if (error != 0)
742			break;
743		secno++;
744		if (!notmapped && vlist == NULL)
745			dst += sc->sectorsize;
746	}
747	bp->bio_resid = 0;
748	return (error);
749}
750
751static void
752mdcopyto_vlist(void *src, bus_dma_segment_t *vlist, off_t offset, off_t len)
753{
754	off_t seg_len;
755
756	while (offset >= vlist->ds_len) {
757		offset -= vlist->ds_len;
758		vlist++;
759	}
760
761	while (len != 0) {
762		seg_len = omin(len, vlist->ds_len - offset);
763		bcopy(src, (void *)(uintptr_t)(vlist->ds_addr + offset),
764		    seg_len);
765		offset = 0;
766		src = (uint8_t *)src + seg_len;
767		len -= seg_len;
768		vlist++;
769	}
770}
771
772static void
773mdcopyfrom_vlist(bus_dma_segment_t *vlist, off_t offset, void *dst, off_t len)
774{
775	off_t seg_len;
776
777	while (offset >= vlist->ds_len) {
778		offset -= vlist->ds_len;
779		vlist++;
780	}
781
782	while (len != 0) {
783		seg_len = omin(len, vlist->ds_len - offset);
784		bcopy((void *)(uintptr_t)(vlist->ds_addr + offset), dst,
785		    seg_len);
786		offset = 0;
787		dst = (uint8_t *)dst + seg_len;
788		len -= seg_len;
789		vlist++;
790	}
791}
792
793static int
794mdstart_preload(struct md_s *sc, struct bio *bp)
795{
796	uint8_t *p;
797
798	p = sc->pl_ptr + bp->bio_offset;
799	switch (bp->bio_cmd) {
800	case BIO_READ:
801		if ((bp->bio_flags & BIO_VLIST) != 0) {
802			mdcopyto_vlist(p, (bus_dma_segment_t *)bp->bio_data,
803			    bp->bio_ma_offset, bp->bio_length);
804		} else {
805			bcopy(p, bp->bio_data, bp->bio_length);
806		}
807		cpu_flush_dcache(bp->bio_data, bp->bio_length);
808		break;
809	case BIO_WRITE:
810		if ((bp->bio_flags & BIO_VLIST) != 0) {
811			mdcopyfrom_vlist((bus_dma_segment_t *)bp->bio_data,
812			    bp->bio_ma_offset, p, bp->bio_length);
813		} else {
814			bcopy(bp->bio_data, p, bp->bio_length);
815		}
816		break;
817	}
818	bp->bio_resid = 0;
819	return (0);
820}
821
822static int
823mdstart_vnode(struct md_s *sc, struct bio *bp)
824{
825	int error;
826	struct uio auio;
827	struct iovec aiov;
828	struct iovec *piov;
829	struct mount *mp;
830	struct vnode *vp;
831	struct buf *pb;
832	bus_dma_segment_t *vlist;
833	struct thread *td;
834	off_t iolen, len, zerosize;
835	int ma_offs, npages;
836
837	switch (bp->bio_cmd) {
838	case BIO_READ:
839		auio.uio_rw = UIO_READ;
840		break;
841	case BIO_WRITE:
842	case BIO_DELETE:
843		auio.uio_rw = UIO_WRITE;
844		break;
845	case BIO_FLUSH:
846		break;
847	default:
848		return (EOPNOTSUPP);
849	}
850
851	td = curthread;
852	vp = sc->vnode;
853	pb = NULL;
854	piov = NULL;
855	ma_offs = bp->bio_ma_offset;
856	len = bp->bio_length;
857
858	/*
859	 * VNODE I/O
860	 *
861	 * If an error occurs, we set BIO_ERROR but we do not set
862	 * B_INVAL because (for a write anyway), the buffer is
863	 * still valid.
864	 */
865
866	if (bp->bio_cmd == BIO_FLUSH) {
867		(void) vn_start_write(vp, &mp, V_WAIT);
868		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
869		error = VOP_FSYNC(vp, MNT_WAIT, td);
870		VOP_UNLOCK(vp, 0);
871		vn_finished_write(mp);
872		return (error);
873	}
874
875	auio.uio_offset = (vm_ooffset_t)bp->bio_offset;
876	auio.uio_resid = bp->bio_length;
877	auio.uio_segflg = UIO_SYSSPACE;
878	auio.uio_td = td;
879
880	if (bp->bio_cmd == BIO_DELETE) {
881		/*
882		 * Emulate BIO_DELETE by writing zeros.
883		 */
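		/*
		 * Illustration with made-up numbers: with 512-byte sectors and
		 * a 64 KB zero_region, zerosize stays 64 KB, so deleting 1 MB
		 * builds howmany(1 MB, 64 KB) = 16 iovecs, all pointing at the
		 * shared zero_region, which VOP_WRITE() below then writes out
		 * as ordinary zeros.
		 */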
884		zerosize = ZERO_REGION_SIZE -
885		    (ZERO_REGION_SIZE % sc->sectorsize);
886		auio.uio_iovcnt = howmany(bp->bio_length, zerosize);
887		piov = malloc(sizeof(*piov) * auio.uio_iovcnt, M_MD, M_WAITOK);
888		auio.uio_iov = piov;
889		while (len > 0) {
890			piov->iov_base = __DECONST(void *, zero_region);
891			piov->iov_len = len;
892			if (len > zerosize)
893				piov->iov_len = zerosize;
894			len -= piov->iov_len;
895			piov++;
896		}
897		piov = auio.uio_iov;
898	} else if ((bp->bio_flags & BIO_VLIST) != 0) {
899		piov = malloc(sizeof(*piov) * bp->bio_ma_n, M_MD, M_WAITOK);
900		auio.uio_iov = piov;
901		vlist = (bus_dma_segment_t *)bp->bio_data;
902		while (len > 0) {
903			piov->iov_base = (void *)(uintptr_t)(vlist->ds_addr +
904			    ma_offs);
905			piov->iov_len = vlist->ds_len - ma_offs;
906			if (piov->iov_len > len)
907				piov->iov_len = len;
908			len -= piov->iov_len;
909			ma_offs = 0;
910			vlist++;
911			piov++;
912		}
913		auio.uio_iovcnt = piov - auio.uio_iov;
914		piov = auio.uio_iov;
915	} else if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
916		pb = getpbuf(&md_vnode_pbuf_freecnt);
917		bp->bio_resid = len;
918unmapped_step:
919		npages = atop(min(MAXPHYS, round_page(len + (ma_offs &
920		    PAGE_MASK))));
921		iolen = min(ptoa(npages) - (ma_offs & PAGE_MASK), len);
922		KASSERT(iolen > 0, ("zero iolen"));
923		pmap_qenter((vm_offset_t)pb->b_data,
924		    &bp->bio_ma[atop(ma_offs)], npages);
925		aiov.iov_base = (void *)((vm_offset_t)pb->b_data +
926		    (ma_offs & PAGE_MASK));
927		aiov.iov_len = iolen;
928		auio.uio_iov = &aiov;
929		auio.uio_iovcnt = 1;
930		auio.uio_resid = iolen;
931	} else {
932		aiov.iov_base = bp->bio_data;
933		aiov.iov_len = bp->bio_length;
934		auio.uio_iov = &aiov;
935		auio.uio_iovcnt = 1;
936	}
937	/*
938	 * When reading set IO_DIRECT to try to avoid double-caching
939	 * the data.  When writing IO_DIRECT is not optimal.
940	 */
941	if (auio.uio_rw == UIO_READ) {
942		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
943		error = VOP_READ(vp, &auio, IO_DIRECT, sc->cred);
944		VOP_UNLOCK(vp, 0);
945	} else {
946		(void) vn_start_write(vp, &mp, V_WAIT);
947		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
948		error = VOP_WRITE(vp, &auio, sc->flags & MD_ASYNC ? 0 : IO_SYNC,
949		    sc->cred);
950		VOP_UNLOCK(vp, 0);
951		vn_finished_write(mp);
952	}
953
954	if (pb != NULL) {
955		pmap_qremove((vm_offset_t)pb->b_data, npages);
956		if (error == 0) {
957			len -= iolen;
958			bp->bio_resid -= iolen;
959			ma_offs += iolen;
960			if (len > 0)
961				goto unmapped_step;
962		}
963		relpbuf(pb, &md_vnode_pbuf_freecnt);
964	}
965
966	free(piov, M_MD);
967	if (pb == NULL)
968		bp->bio_resid = auio.uio_resid;
969	return (error);
970}
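
/*
 * Note on the unmapped case above: when the bio carries only vm_page_t pointers
 * (BIO_UNMAPPED), the pages are temporarily mapped into a borrowed pbuf with
 * pmap_qenter() in chunks of at most MAXPHYS bytes, and the "unmapped_step" loop
 * repeats the VOP_READ()/VOP_WRITE() until the whole request has been
 * transferred, tracking progress in bio_resid.
 */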
971
972static void
973md_swap_page_free(vm_page_t m)
974{
975
976	vm_page_xunbusy(m);
977	vm_page_lock(m);
978	vm_page_free(m);
979	vm_page_unlock(m);
980}
981
982static int
983mdstart_swap(struct md_s *sc, struct bio *bp)
984{
985	vm_page_t m;
986	u_char *p;
987	vm_pindex_t i, lastp;
988	bus_dma_segment_t *vlist;
989	int rv, ma_offs, offs, len, lastend;
990
991	switch (bp->bio_cmd) {
992	case BIO_READ:
993	case BIO_WRITE:
994	case BIO_DELETE:
995		break;
996	default:
997		return (EOPNOTSUPP);
998	}
999
1000	p = bp->bio_data;
1001	ma_offs = (bp->bio_flags & (BIO_UNMAPPED|BIO_VLIST)) != 0 ?
1002	    bp->bio_ma_offset : 0;
1003	vlist = (bp->bio_flags & BIO_VLIST) != 0 ?
1004	    (bus_dma_segment_t *)bp->bio_data : NULL;
1005
1006	/*
1007	 * offs is the offset at which to start operating on the
1008	 * next (ie, first) page.  lastp is the last page on
1009	 * which we're going to operate.  lastend is the ending
1010	 * position within that last page (ie, PAGE_SIZE if
1011	 * we're operating on complete aligned pages).
1012	 */
1013	offs = bp->bio_offset % PAGE_SIZE;
1014	lastp = (bp->bio_offset + bp->bio_length - 1) / PAGE_SIZE;
1015	lastend = (bp->bio_offset + bp->bio_length - 1) % PAGE_SIZE + 1;
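	/*
	 * Worked example, assuming 4 KB pages: a 1536-byte request at byte
	 * offset 3584 gives offs = 3584, lastp = (3584 + 1536 - 1) / 4096 = 1
	 * and lastend = 5119 % 4096 + 1 = 1024, so the loop below uses the
	 * last 512 bytes of page 0 and the first 1024 bytes of page 1.
	 */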
1016
1017	rv = VM_PAGER_OK;
1018	VM_OBJECT_WLOCK(sc->object);
1019	vm_object_pip_add(sc->object, 1);
1020	for (i = bp->bio_offset / PAGE_SIZE; i <= lastp; i++) {
1021		len = ((i == lastp) ? lastend : PAGE_SIZE) - offs;
1022		m = vm_page_grab(sc->object, i, VM_ALLOC_SYSTEM);
1023		if (bp->bio_cmd == BIO_READ) {
1024			if (m->valid == VM_PAGE_BITS_ALL)
1025				rv = VM_PAGER_OK;
1026			else
1027				rv = vm_pager_get_pages(sc->object, &m, 1,
1028				    NULL, NULL);
1029			if (rv == VM_PAGER_ERROR) {
1030				md_swap_page_free(m);
1031				break;
1032			} else if (rv == VM_PAGER_FAIL) {
1033				/*
1034				 * Pager does not have the page.  Zero
1035				 * the allocated page, and mark it as
1036				 * valid. Do not set dirty, the page
1037				 * can be recreated if thrown out.
1038				 */
1039				pmap_zero_page(m);
1040				m->valid = VM_PAGE_BITS_ALL;
1041			}
1042			if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
1043				pmap_copy_pages(&m, offs, bp->bio_ma,
1044				    ma_offs, len);
1045			} else if ((bp->bio_flags & BIO_VLIST) != 0) {
1046				physcopyout_vlist(VM_PAGE_TO_PHYS(m) + offs,
1047				    vlist, ma_offs, len);
1048				cpu_flush_dcache(p, len);
1049			} else {
1050				physcopyout(VM_PAGE_TO_PHYS(m) + offs, p, len);
1051				cpu_flush_dcache(p, len);
1052			}
1053		} else if (bp->bio_cmd == BIO_WRITE) {
1054			if (len == PAGE_SIZE || m->valid == VM_PAGE_BITS_ALL)
1055				rv = VM_PAGER_OK;
1056			else
1057				rv = vm_pager_get_pages(sc->object, &m, 1,
1058				    NULL, NULL);
1059			if (rv == VM_PAGER_ERROR) {
1060				md_swap_page_free(m);
1061				break;
1062			} else if (rv == VM_PAGER_FAIL)
1063				pmap_zero_page(m);
1064
1065			if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
1066				pmap_copy_pages(bp->bio_ma, ma_offs, &m,
1067				    offs, len);
1068			} else if ((bp->bio_flags & BIO_VLIST) != 0) {
1069				physcopyin_vlist(vlist, ma_offs,
1070				    VM_PAGE_TO_PHYS(m) + offs, len);
1071			} else {
1072				physcopyin(p, VM_PAGE_TO_PHYS(m) + offs, len);
1073			}
1074
1075			m->valid = VM_PAGE_BITS_ALL;
1076			if (m->dirty != VM_PAGE_BITS_ALL) {
1077				vm_page_dirty(m);
1078				vm_pager_page_unswapped(m);
1079			}
1080		} else if (bp->bio_cmd == BIO_DELETE) {
1081			if (len == PAGE_SIZE || m->valid == VM_PAGE_BITS_ALL)
1082				rv = VM_PAGER_OK;
1083			else
1084				rv = vm_pager_get_pages(sc->object, &m, 1,
1085				    NULL, NULL);
1086			if (rv == VM_PAGER_ERROR) {
1087				md_swap_page_free(m);
1088				break;
1089			} else if (rv == VM_PAGER_FAIL) {
1090				md_swap_page_free(m);
1091				m = NULL;
1092			} else {
1093				/* Page is valid. */
1094				if (len != PAGE_SIZE) {
1095					pmap_zero_page_area(m, offs, len);
1096					if (m->dirty != VM_PAGE_BITS_ALL) {
1097						vm_page_dirty(m);
1098						vm_pager_page_unswapped(m);
1099					}
1100				} else {
1101					vm_pager_page_unswapped(m);
1102					md_swap_page_free(m);
1103					m = NULL;
1104				}
1105			}
1106		}
1107		if (m != NULL) {
1108			vm_page_xunbusy(m);
1109			vm_page_lock(m);
1110			vm_page_activate(m);
1111			vm_page_unlock(m);
1112		}
1113
1114		/* Actions on further pages start at offset 0 */
1115		p += PAGE_SIZE - offs;
1116		offs = 0;
1117		ma_offs += len;
1118	}
1119	vm_object_pip_wakeup(sc->object);
1120	VM_OBJECT_WUNLOCK(sc->object);
1121	return (rv != VM_PAGER_ERROR ? 0 : ENOSPC);
1122}
1123
1124static int
1125mdstart_null(struct md_s *sc, struct bio *bp)
1126{
1127
1128	switch (bp->bio_cmd) {
1129	case BIO_READ:
1130		bzero(bp->bio_data, bp->bio_length);
1131		cpu_flush_dcache(bp->bio_data, bp->bio_length);
1132		break;
1133	case BIO_WRITE:
1134		break;
1135	}
1136	bp->bio_resid = 0;
1137	return (0);
1138}
1139
1140static void
1141md_kthread(void *arg)
1142{
1143	struct md_s *sc;
1144	struct bio *bp;
1145	int error;
1146
1147	sc = arg;
1148	thread_lock(curthread);
1149	sched_prio(curthread, PRIBIO);
1150	thread_unlock(curthread);
1151	if (sc->type == MD_VNODE)
1152		curthread->td_pflags |= TDP_NORUNNINGBUF;
1153
1154	for (;;) {
1155		mtx_lock(&sc->queue_mtx);
1156		if (sc->flags & MD_SHUTDOWN) {
1157			sc->flags |= MD_EXITING;
1158			mtx_unlock(&sc->queue_mtx);
1159			kproc_exit(0);
1160		}
1161		bp = bioq_takefirst(&sc->bio_queue);
1162		if (!bp) {
1163			msleep(sc, &sc->queue_mtx, PRIBIO | PDROP, "mdwait", 0);
1164			continue;
1165		}
1166		mtx_unlock(&sc->queue_mtx);
1167		if (bp->bio_cmd == BIO_GETATTR) {
1168			if ((sc->fwsectors && sc->fwheads &&
1169			    (g_handleattr_int(bp, "GEOM::fwsectors",
1170			    sc->fwsectors) ||
1171			    g_handleattr_int(bp, "GEOM::fwheads",
1172			    sc->fwheads))) ||
1173			    g_handleattr_int(bp, "GEOM::candelete", 1))
1174				error = -1;
1175			else
1176				error = EOPNOTSUPP;
1177		} else {
1178			error = sc->start(sc, bp);
1179		}
1180
1181		if (error != -1) {
1182			bp->bio_completed = bp->bio_length;
1183			if ((bp->bio_cmd == BIO_READ) || (bp->bio_cmd == BIO_WRITE))
1184				devstat_end_transaction_bio(sc->devstat, bp);
1185			g_io_deliver(bp, error);
1186		}
1187	}
1188}
1189
1190static struct md_s *
1191mdfind(int unit)
1192{
1193	struct md_s *sc;
1194
1195	LIST_FOREACH(sc, &md_softc_list, list) {
1196		if (sc->unit == unit)
1197			break;
1198	}
1199	return (sc);
1200}
1201
1202static struct md_s *
1203mdnew(int unit, int *errp, enum md_types type)
1204{
1205	struct md_s *sc;
1206	int error;
1207
1208	*errp = 0;
1209	if (unit == -1)
1210		unit = alloc_unr(md_uh);
1211	else
1212		unit = alloc_unr_specific(md_uh, unit);
1213
1214	if (unit == -1) {
1215		*errp = EBUSY;
1216		return (NULL);
1217	}
1218
1219	sc = (struct md_s *)malloc(sizeof *sc, M_MD, M_WAITOK | M_ZERO);
1220	sc->type = type;
1221	bioq_init(&sc->bio_queue);
1222	mtx_init(&sc->queue_mtx, "md bio queue", NULL, MTX_DEF);
1223	mtx_init(&sc->stat_mtx, "md stat", NULL, MTX_DEF);
1224	sc->unit = unit;
1225	sprintf(sc->name, "md%d", unit);
1226	LIST_INSERT_HEAD(&md_softc_list, sc, list);
1227	error = kproc_create(md_kthread, sc, &sc->procp, 0, 0,"%s", sc->name);
1228	if (error == 0)
1229		return (sc);
1230	LIST_REMOVE(sc, list);
1231	mtx_destroy(&sc->stat_mtx);
1232	mtx_destroy(&sc->queue_mtx);
1233	free_unr(md_uh, sc->unit);
1234	free(sc, M_MD);
1235	*errp = error;
1236	return (NULL);
1237}
1238
1239static void
1240mdinit(struct md_s *sc)
1241{
1242	struct g_geom *gp;
1243	struct g_provider *pp;
1244
1245	g_topology_lock();
1246	gp = g_new_geomf(&g_md_class, "md%d", sc->unit);
1247	gp->softc = sc;
1248	pp = g_new_providerf(gp, "md%d", sc->unit);
1249	pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE;
1250	pp->mediasize = sc->mediasize;
1251	pp->sectorsize = sc->sectorsize;
1252	switch (sc->type) {
1253	case MD_MALLOC:
1254	case MD_VNODE:
1255	case MD_SWAP:
1256		pp->flags |= G_PF_ACCEPT_UNMAPPED;
1257		break;
1258	case MD_PRELOAD:
1259	case MD_NULL:
1260		break;
1261	}
1262	sc->gp = gp;
1263	sc->pp = pp;
1264	g_error_provider(pp, 0);
1265	g_topology_unlock();
1266	sc->devstat = devstat_new_entry("md", sc->unit, sc->sectorsize,
1267	    DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX);
1268}
1269
1270static int
1271mdcreate_malloc(struct md_s *sc, struct md_ioctl *mdio)
1272{
1273	uintptr_t sp;
1274	int error;
1275	off_t u;
1276
1277	error = 0;
1278	if (mdio->md_options & ~(MD_AUTOUNIT | MD_COMPRESS | MD_RESERVE))
1279		return (EINVAL);
1280	if (mdio->md_sectorsize != 0 && !powerof2(mdio->md_sectorsize))
1281		return (EINVAL);
1282	/* Compression doesn't make sense if we have reserved space */
1283	if (mdio->md_options & MD_RESERVE)
1284		mdio->md_options &= ~MD_COMPRESS;
1285	if (mdio->md_fwsectors != 0)
1286		sc->fwsectors = mdio->md_fwsectors;
1287	if (mdio->md_fwheads != 0)
1288		sc->fwheads = mdio->md_fwheads;
1289	sc->flags = mdio->md_options & (MD_COMPRESS | MD_FORCE);
1290	sc->indir = dimension(sc->mediasize / sc->sectorsize);
1291	sc->uma = uma_zcreate(sc->name, sc->sectorsize, NULL, NULL, NULL, NULL,
1292	    0x1ff, 0);
1293	if (mdio->md_options & MD_RESERVE) {
1294		off_t nsectors;
1295
1296		nsectors = sc->mediasize / sc->sectorsize;
1297		for (u = 0; u < nsectors; u++) {
1298			sp = (uintptr_t)uma_zalloc(sc->uma, (md_malloc_wait ?
1299			    M_WAITOK : M_NOWAIT) | M_ZERO);
1300			if (sp != 0)
1301				error = s_write(sc->indir, u, sp);
1302			else
1303				error = ENOMEM;
1304			if (error != 0)
1305				break;
1306		}
1307	}
1308	return (error);
1309}
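
/*
 * Note: MD_RESERVE above backs every sector with a zero-filled buffer from the
 * per-device uma zone at attach time, so ordinary writes do not need to allocate
 * and should not hit ENOSPC later.  Since every sector then owns a full buffer
 * anyway, the single-byte MD_COMPRESS encoding would save nothing, which is why
 * the two options are made mutually exclusive earlier in this function.
 */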
1310
1311
1312static int
1313mdsetcred(struct md_s *sc, struct ucred *cred)
1314{
1315	char *tmpbuf;
1316	int error = 0;
1317
1318	/*
1319	 * Set credits in our softc
1320	 */
1321
1322	if (sc->cred)
1323		crfree(sc->cred);
1324	sc->cred = crhold(cred);
1325
1326	/*
1327	 * Horrible kludge to establish credentials for NFS  XXX.
1328	 */
1329
1330	if (sc->vnode) {
1331		struct uio auio;
1332		struct iovec aiov;
1333
1334		tmpbuf = malloc(sc->sectorsize, M_TEMP, M_WAITOK);
1335		bzero(&auio, sizeof(auio));
1336
1337		aiov.iov_base = tmpbuf;
1338		aiov.iov_len = sc->sectorsize;
1339		auio.uio_iov = &aiov;
1340		auio.uio_iovcnt = 1;
1341		auio.uio_offset = 0;
1342		auio.uio_rw = UIO_READ;
1343		auio.uio_segflg = UIO_SYSSPACE;
1344		auio.uio_resid = aiov.iov_len;
1345		vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY);
1346		error = VOP_READ(sc->vnode, &auio, 0, sc->cred);
1347		VOP_UNLOCK(sc->vnode, 0);
1348		free(tmpbuf, M_TEMP);
1349	}
1350	return (error);
1351}
1352
1353static int
1354mdcreate_vnode(struct md_s *sc, struct md_ioctl *mdio, struct thread *td)
1355{
1356	struct vattr vattr;
1357	struct nameidata nd;
1358	char *fname;
1359	int error, flags;
1360
1361	/*
1362	 * Kernel-originated requests must have the filename appended
1363	 * to the mdio structure to protect against malicious software.
1364	 */
1365	fname = mdio->md_file;
1366	if ((void *)fname != (void *)(mdio + 1)) {
1367		error = copyinstr(fname, sc->file, sizeof(sc->file), NULL);
1368		if (error != 0)
1369			return (error);
1370	} else
1371		strlcpy(sc->file, fname, sizeof(sc->file));
1372
1373	/*
1374	 * If the user specified that this is a read only device, don't
1375	 * set the FWRITE mask before trying to open the backing store.
1376	 */
1377	flags = FREAD | ((mdio->md_options & MD_READONLY) ? 0 : FWRITE);
1378	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, sc->file, td);
1379	error = vn_open(&nd, &flags, 0, NULL);
1380	if (error != 0)
1381		return (error);
1382	NDFREE(&nd, NDF_ONLY_PNBUF);
1383	if (nd.ni_vp->v_type != VREG) {
1384		error = EINVAL;
1385		goto bad;
1386	}
1387	error = VOP_GETATTR(nd.ni_vp, &vattr, td->td_ucred);
1388	if (error != 0)
1389		goto bad;
1390	if (VOP_ISLOCKED(nd.ni_vp) != LK_EXCLUSIVE) {
1391		vn_lock(nd.ni_vp, LK_UPGRADE | LK_RETRY);
1392		if (nd.ni_vp->v_iflag & VI_DOOMED) {
1393			/* Forced unmount. */
1394			error = EBADF;
1395			goto bad;
1396		}
1397	}
1398	nd.ni_vp->v_vflag |= VV_MD;
1399	VOP_UNLOCK(nd.ni_vp, 0);
1400
1401	if (mdio->md_fwsectors != 0)
1402		sc->fwsectors = mdio->md_fwsectors;
1403	if (mdio->md_fwheads != 0)
1404		sc->fwheads = mdio->md_fwheads;
1405	sc->flags = mdio->md_options & (MD_FORCE | MD_ASYNC);
1406	if (!(flags & FWRITE))
1407		sc->flags |= MD_READONLY;
1408	sc->vnode = nd.ni_vp;
1409
1410	error = mdsetcred(sc, td->td_ucred);
1411	if (error != 0) {
1412		sc->vnode = NULL;
1413		vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY);
1414		nd.ni_vp->v_vflag &= ~VV_MD;
1415		goto bad;
1416	}
1417	return (0);
1418bad:
1419	VOP_UNLOCK(nd.ni_vp, 0);
1420	(void)vn_close(nd.ni_vp, flags, td->td_ucred, td);
1421	return (error);
1422}
1423
1424static int
1425mddestroy(struct md_s *sc, struct thread *td)
1426{
1427
1428	if (sc->gp) {
1429		sc->gp->softc = NULL;
1430		g_topology_lock();
1431		g_wither_geom(sc->gp, ENXIO);
1432		g_topology_unlock();
1433		sc->gp = NULL;
1434		sc->pp = NULL;
1435	}
1436	if (sc->devstat) {
1437		devstat_remove_entry(sc->devstat);
1438		sc->devstat = NULL;
1439	}
1440	mtx_lock(&sc->queue_mtx);
1441	sc->flags |= MD_SHUTDOWN;
1442	wakeup(sc);
1443	while (!(sc->flags & MD_EXITING))
1444		msleep(sc->procp, &sc->queue_mtx, PRIBIO, "mddestroy", hz / 10);
1445	mtx_unlock(&sc->queue_mtx);
1446	mtx_destroy(&sc->stat_mtx);
1447	mtx_destroy(&sc->queue_mtx);
1448	if (sc->vnode != NULL) {
1449		vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY);
1450		sc->vnode->v_vflag &= ~VV_MD;
1451		VOP_UNLOCK(sc->vnode, 0);
1452		(void)vn_close(sc->vnode, sc->flags & MD_READONLY ?
1453		    FREAD : (FREAD|FWRITE), sc->cred, td);
1454	}
1455	if (sc->cred != NULL)
1456		crfree(sc->cred);
1457	if (sc->object != NULL)
1458		vm_object_deallocate(sc->object);
1459	if (sc->indir)
1460		destroy_indir(sc, sc->indir);
1461	if (sc->uma)
1462		uma_zdestroy(sc->uma);
1463
1464	LIST_REMOVE(sc, list);
1465	free_unr(md_uh, sc->unit);
1466	free(sc, M_MD);
1467	return (0);
1468}
1469
1470static int
1471mdresize(struct md_s *sc, struct md_ioctl *mdio)
1472{
1473	int error, res;
1474	vm_pindex_t oldpages, newpages;
1475
1476	switch (sc->type) {
1477	case MD_VNODE:
1478	case MD_NULL:
1479		break;
1480	case MD_SWAP:
1481		if (mdio->md_mediasize <= 0 ||
1482		    (mdio->md_mediasize % PAGE_SIZE) != 0)
1483			return (EDOM);
1484		oldpages = OFF_TO_IDX(round_page(sc->mediasize));
1485		newpages = OFF_TO_IDX(round_page(mdio->md_mediasize));
1486		if (newpages < oldpages) {
1487			VM_OBJECT_WLOCK(sc->object);
1488			vm_object_page_remove(sc->object, newpages, 0, 0);
1489			swap_pager_freespace(sc->object, newpages,
1490			    oldpages - newpages);
1491			swap_release_by_cred(IDX_TO_OFF(oldpages -
1492			    newpages), sc->cred);
1493			sc->object->charge = IDX_TO_OFF(newpages);
1494			sc->object->size = newpages;
1495			VM_OBJECT_WUNLOCK(sc->object);
1496		} else if (newpages > oldpages) {
1497			res = swap_reserve_by_cred(IDX_TO_OFF(newpages -
1498			    oldpages), sc->cred);
1499			if (!res)
1500				return (ENOMEM);
1501			if ((mdio->md_options & MD_RESERVE) ||
1502			    (sc->flags & MD_RESERVE)) {
1503				error = swap_pager_reserve(sc->object,
1504				    oldpages, newpages - oldpages);
1505				if (error < 0) {
1506					swap_release_by_cred(
1507					    IDX_TO_OFF(newpages - oldpages),
1508					    sc->cred);
1509					return (EDOM);
1510				}
1511			}
1512			VM_OBJECT_WLOCK(sc->object);
1513			sc->object->charge = IDX_TO_OFF(newpages);
1514			sc->object->size = newpages;
1515			VM_OBJECT_WUNLOCK(sc->object);
1516		}
1517		break;
1518	default:
1519		return (EOPNOTSUPP);
1520	}
1521
1522	sc->mediasize = mdio->md_mediasize;
1523	g_topology_lock();
1524	g_resize_provider(sc->pp, sc->mediasize);
1525	g_topology_unlock();
1526	return (0);
1527}
1528
1529static int
1530mdcreate_swap(struct md_s *sc, struct md_ioctl *mdio, struct thread *td)
1531{
1532	vm_ooffset_t npage;
1533	int error;
1534
1535	/*
1536	 * Range check.  Disallow negative sizes and sizes not being
1537	 * multiple of page size.
1538	 */
1539	if (sc->mediasize <= 0 || (sc->mediasize % PAGE_SIZE) != 0)
1540		return (EDOM);
1541
1542	/*
1543	 * Allocate an OBJT_SWAP object.
1544	 *
1545	 * Note the truncation.
1546	 */
1547
1548	npage = mdio->md_mediasize / PAGE_SIZE;
1549	if (mdio->md_fwsectors != 0)
1550		sc->fwsectors = mdio->md_fwsectors;
1551	if (mdio->md_fwheads != 0)
1552		sc->fwheads = mdio->md_fwheads;
1553	sc->object = vm_pager_allocate(OBJT_SWAP, NULL, PAGE_SIZE * npage,
1554	    VM_PROT_DEFAULT, 0, td->td_ucred);
1555	if (sc->object == NULL)
1556		return (ENOMEM);
1557	sc->flags = mdio->md_options & (MD_FORCE | MD_RESERVE);
1558	if (mdio->md_options & MD_RESERVE) {
1559		if (swap_pager_reserve(sc->object, 0, npage) < 0) {
1560			error = EDOM;
1561			goto finish;
1562		}
1563	}
1564	error = mdsetcred(sc, td->td_ucred);
1565 finish:
1566	if (error != 0) {
1567		vm_object_deallocate(sc->object);
1568		sc->object = NULL;
1569	}
1570	return (error);
1571}
1572
1573static int
1574mdcreate_null(struct md_s *sc, struct md_ioctl *mdio, struct thread *td)
1575{
1576
1577	/*
1578	 * Range check.  Disallow negative sizes and sizes not being
1579	 * multiple of page size.
1580	 */
1581	if (sc->mediasize <= 0 || (sc->mediasize % PAGE_SIZE) != 0)
1582		return (EDOM);
1583
1584	return (0);
1585}
1586
1587static int
1588xmdctlioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td)
1589{
1590	struct md_ioctl *mdio;
1591	struct md_s *sc;
1592	int error, i;
1593	unsigned sectsize;
1594
1595	if (md_debug)
1596		printf("mdctlioctl(%s %lx %p %x %p)\n",
1597			devtoname(dev), cmd, addr, flags, td);
1598
1599	mdio = (struct md_ioctl *)addr;
1600	if (mdio->md_version != MDIOVERSION)
1601		return (EINVAL);
1602
1603	/*
1604	 * We assert the version number in the individual ioctl
1605	 * handlers instead of out here because (a) it is possible we
1606	 * may add another ioctl in the future which doesn't read an
1607	 * mdio, and (b) the correct return value for an unknown ioctl
1608	 * is ENOIOCTL, not EINVAL.
1609	 */
1610	error = 0;
1611	switch (cmd) {
1612	case MDIOCATTACH:
1613		switch (mdio->md_type) {
1614		case MD_MALLOC:
1615		case MD_PRELOAD:
1616		case MD_VNODE:
1617		case MD_SWAP:
1618		case MD_NULL:
1619			break;
1620		default:
1621			return (EINVAL);
1622		}
1623		if (mdio->md_sectorsize == 0)
1624			sectsize = DEV_BSIZE;
1625		else
1626			sectsize = mdio->md_sectorsize;
1627		if (sectsize > MAXPHYS || mdio->md_mediasize < sectsize)
1628			return (EINVAL);
1629		if (mdio->md_options & MD_AUTOUNIT)
1630			sc = mdnew(-1, &error, mdio->md_type);
1631		else {
1632			if (mdio->md_unit > INT_MAX)
1633				return (EINVAL);
1634			sc = mdnew(mdio->md_unit, &error, mdio->md_type);
1635		}
1636		if (sc == NULL)
1637			return (error);
1638		if (mdio->md_options & MD_AUTOUNIT)
1639			mdio->md_unit = sc->unit;
1640		sc->mediasize = mdio->md_mediasize;
1641		sc->sectorsize = sectsize;
1642		error = EDOOFUS;
1643		switch (sc->type) {
1644		case MD_MALLOC:
1645			sc->start = mdstart_malloc;
1646			error = mdcreate_malloc(sc, mdio);
1647			break;
1648		case MD_PRELOAD:
1649			/*
1650			 * We disallow attaching preloaded memory disks via
1651			 * ioctl. Preloaded memory disks are automatically
1652			 * attached in g_md_init().
1653			 */
1654			error = EOPNOTSUPP;
1655			break;
1656		case MD_VNODE:
1657			sc->start = mdstart_vnode;
1658			error = mdcreate_vnode(sc, mdio, td);
1659			break;
1660		case MD_SWAP:
1661			sc->start = mdstart_swap;
1662			error = mdcreate_swap(sc, mdio, td);
1663			break;
1664		case MD_NULL:
1665			sc->start = mdstart_null;
1666			error = mdcreate_null(sc, mdio, td);
1667			break;
1668		}
1669		if (error != 0) {
1670			mddestroy(sc, td);
1671			return (error);
1672		}
1673
1674		/* Prune off any residual fractional sector */
1675		i = sc->mediasize % sc->sectorsize;
1676		sc->mediasize -= i;
1677
1678		mdinit(sc);
1679		return (0);
1680	case MDIOCDETACH:
1681		if (mdio->md_mediasize != 0 ||
1682		    (mdio->md_options & ~MD_FORCE) != 0)
1683			return (EINVAL);
1684
1685		sc = mdfind(mdio->md_unit);
1686		if (sc == NULL)
1687			return (ENOENT);
1688		if (sc->opencount != 0 && !(sc->flags & MD_FORCE) &&
1689		    !(mdio->md_options & MD_FORCE))
1690			return (EBUSY);
1691		return (mddestroy(sc, td));
1692	case MDIOCRESIZE:
1693		if ((mdio->md_options & ~(MD_FORCE | MD_RESERVE)) != 0)
1694			return (EINVAL);
1695
1696		sc = mdfind(mdio->md_unit);
1697		if (sc == NULL)
1698			return (ENOENT);
1699		if (mdio->md_mediasize < sc->sectorsize)
1700			return (EINVAL);
1701		if (mdio->md_mediasize < sc->mediasize &&
1702		    !(sc->flags & MD_FORCE) &&
1703		    !(mdio->md_options & MD_FORCE))
1704			return (EBUSY);
1705		return (mdresize(sc, mdio));
1706	case MDIOCQUERY:
1707		sc = mdfind(mdio->md_unit);
1708		if (sc == NULL)
1709			return (ENOENT);
1710		mdio->md_type = sc->type;
1711		mdio->md_options = sc->flags;
1712		mdio->md_mediasize = sc->mediasize;
1713		mdio->md_sectorsize = sc->sectorsize;
1714		if (sc->type == MD_VNODE)
1715			error = copyout(sc->file, mdio->md_file,
1716			    strlen(sc->file) + 1);
1717		return (error);
1718	case MDIOCLIST:
1719		i = 1;
1720		LIST_FOREACH(sc, &md_softc_list, list) {
1721			if (i == MDNPAD - 1)
1722				mdio->md_pad[i] = -1;
1723			else
1724				mdio->md_pad[i++] = sc->unit;
1725		}
1726		mdio->md_pad[0] = i - 1;
1727		return (0);
1728	default:
1729		return (ENOIOCTL);
1730	};
1731}
1732
1733static int
1734mdctlioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags, struct thread *td)
1735{
1736	int error;
1737
1738	sx_xlock(&md_sx);
1739	error = xmdctlioctl(dev, cmd, addr, flags, td);
1740	sx_xunlock(&md_sx);
1741	return (error);
1742}
1743
1744static void
1745md_preloaded(u_char *image, size_t length, const char *name)
1746{
1747	struct md_s *sc;
1748	int error;
1749
1750	sc = mdnew(-1, &error, MD_PRELOAD);
1751	if (sc == NULL)
1752		return;
1753	sc->mediasize = length;
1754	sc->sectorsize = DEV_BSIZE;
1755	sc->pl_ptr = image;
1756	sc->pl_len = length;
1757	sc->start = mdstart_preload;
1758#ifdef MD_ROOT
1759	if (sc->unit == 0) {
1760#ifndef ROOTDEVNAME
1761		rootdevnames[0] = MD_ROOT_FSTYPE ":/dev/md0";
1762#endif
1763#ifdef MD_ROOT_READONLY
1764		sc->flags |= MD_READONLY;
1765#endif
1766	}
1767#endif
1768	mdinit(sc);
1769	if (name != NULL) {
1770		printf("%s%d: Preloaded image <%s> %zd bytes at %p\n",
1771		    MD_NAME, sc->unit, name, length, image);
1772	} else {
1773		printf("%s%d: Embedded image %zd bytes at %p\n",
1774		    MD_NAME, sc->unit, length, image);
1775	}
1776}
1777
1778static void
1779g_md_init(struct g_class *mp __unused)
1780{
1781	caddr_t mod;
1782	u_char *ptr, *name, *type;
1783	unsigned len;
1784	int i;
1785
1786	/* figure out log2(NINDIR) */
1787	for (i = NINDIR, nshift = -1; i; nshift++)
1788		i >>= 1;
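	/*
	 * Example: assuming 4 KB pages and 64-bit pointers, NINDIR is 512, so
	 * the loop above shifts 512 down to 0 and leaves nshift = 9, i.e. each
	 * indir level consumes 9 bits of the sector number.
	 */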
1789
1790	mod = NULL;
1791	sx_init(&md_sx, "MD config lock");
1792	g_topology_unlock();
1793	md_uh = new_unrhdr(0, INT_MAX, NULL);
1794#ifdef MD_ROOT
1795	if (mfs_root_size != 0) {
1796		sx_xlock(&md_sx);
1797		md_preloaded(__DEVOLATILE(u_char *, &mfs_root), mfs_root_size,
1798		    NULL);
1799		sx_xunlock(&md_sx);
1800	}
1801#endif
1802	/* XXX: are preload_* static or do they need Giant ? */
1803	while ((mod = preload_search_next_name(mod)) != NULL) {
1804		name = (char *)preload_search_info(mod, MODINFO_NAME);
1805		if (name == NULL)
1806			continue;
1807		type = (char *)preload_search_info(mod, MODINFO_TYPE);
1808		if (type == NULL)
1809			continue;
1810		if (strcmp(type, "md_image") && strcmp(type, "mfs_root"))
1811			continue;
1812		ptr = preload_fetch_addr(mod);
1813		len = preload_fetch_size(mod);
1814		if (ptr != NULL && len != 0) {
1815			sx_xlock(&md_sx);
1816			md_preloaded(ptr, len, name);
1817			sx_xunlock(&md_sx);
1818		}
1819	}
1820	md_vnode_pbuf_freecnt = nswbuf / 10;
1821	status_dev = make_dev(&mdctl_cdevsw, INT_MAX, UID_ROOT, GID_WHEEL,
1822	    0600, MDCTL_NAME);
1823	g_topology_lock();
1824}
1825
1826static void
1827g_md_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
1828    struct g_consumer *cp __unused, struct g_provider *pp)
1829{
1830	struct md_s *mp;
1831	char *type;
1832
1833	mp = gp->softc;
1834	if (mp == NULL)
1835		return;
1836
1837	switch (mp->type) {
1838	case MD_MALLOC:
1839		type = "malloc";
1840		break;
1841	case MD_PRELOAD:
1842		type = "preload";
1843		break;
1844	case MD_VNODE:
1845		type = "vnode";
1846		break;
1847	case MD_SWAP:
1848		type = "swap";
1849		break;
1850	case MD_NULL:
1851		type = "null";
1852		break;
1853	default:
1854		type = "unknown";
1855		break;
1856	}
1857
1858	if (pp != NULL) {
1859		if (indent == NULL) {
1860			sbuf_printf(sb, " u %d", mp->unit);
1861			sbuf_printf(sb, " s %ju", (uintmax_t) mp->sectorsize);
1862			sbuf_printf(sb, " f %ju", (uintmax_t) mp->fwheads);
1863			sbuf_printf(sb, " fs %ju", (uintmax_t) mp->fwsectors);
1864			sbuf_printf(sb, " l %ju", (uintmax_t) mp->mediasize);
1865			sbuf_printf(sb, " t %s", type);
1866			if (mp->type == MD_VNODE && mp->vnode != NULL)
1867				sbuf_printf(sb, " file %s", mp->file);
1868		} else {
1869			sbuf_printf(sb, "%s<unit>%d</unit>\n", indent,
1870			    mp->unit);
1871			sbuf_printf(sb, "%s<sectorsize>%ju</sectorsize>\n",
1872			    indent, (uintmax_t) mp->sectorsize);
1873			sbuf_printf(sb, "%s<fwheads>%ju</fwheads>\n",
1874			    indent, (uintmax_t) mp->fwheads);
1875			sbuf_printf(sb, "%s<fwsectors>%ju</fwsectors>\n",
1876			    indent, (uintmax_t) mp->fwsectors);
1877			sbuf_printf(sb, "%s<length>%ju</length>\n",
1878			    indent, (uintmax_t) mp->mediasize);
1879			sbuf_printf(sb, "%s<compression>%s</compression>\n", indent,
1880			    (mp->flags & MD_COMPRESS) == 0 ? "off": "on");
1881			sbuf_printf(sb, "%s<access>%s</access>\n", indent,
1882			    (mp->flags & MD_READONLY) == 0 ? "read-write":
1883			    "read-only");
1884			sbuf_printf(sb, "%s<type>%s</type>\n", indent,
1885			    type);
1886			if (mp->type == MD_VNODE && mp->vnode != NULL) {
1887				sbuf_printf(sb, "%s<file>", indent);
1888				g_conf_printf_escaped(sb, "%s", mp->file);
1889				sbuf_printf(sb, "</file>\n");
1890			}
1891		}
1892	}
1893}
1894
1895static void
1896g_md_fini(struct g_class *mp __unused)
1897{
1898
1899	sx_destroy(&md_sx);
1900	if (status_dev != NULL)
1901		destroy_dev(status_dev);
1902	delete_unrhdr(md_uh);
1903}
1904