1/*-
2 * SPDX-License-Identifier: (Beerware AND BSD-3-Clause)
3 *
4 * ----------------------------------------------------------------------------
5 * "THE BEER-WARE LICENSE" (Revision 42):
6 * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
7 * can do whatever you want with this stuff. If we meet some day, and you think
8 * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
9 * ----------------------------------------------------------------------------
10 *
11 */
12
13/*-
14 * The following functions are based on the vn(4) driver: mdstart_swap(),
15 * mdstart_vnode(), mdcreate_swap(), mdcreate_vnode() and mddestroy(),
16 * and as such under the following copyright:
17 *
18 * Copyright (c) 1988 University of Utah.
19 * Copyright (c) 1990, 1993
20 *	The Regents of the University of California.  All rights reserved.
21 * Copyright (c) 2013 The FreeBSD Foundation
22 * All rights reserved.
23 *
24 * This code is derived from software contributed to Berkeley by
25 * the Systems Programming Group of the University of Utah Computer
26 * Science Department.
27 *
28 * Portions of this software were developed by Konstantin Belousov
29 * under sponsorship from the FreeBSD Foundation.
30 *
31 * Redistribution and use in source and binary forms, with or without
32 * modification, are permitted provided that the following conditions
33 * are met:
34 * 1. Redistributions of source code must retain the above copyright
35 *    notice, this list of conditions and the following disclaimer.
36 * 2. Redistributions in binary form must reproduce the above copyright
37 *    notice, this list of conditions and the following disclaimer in the
38 *    documentation and/or other materials provided with the distribution.
39 * 3. Neither the name of the University nor the names of its contributors
40 *    may be used to endorse or promote products derived from this software
41 *    without specific prior written permission.
42 *
43 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
44 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
45 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
46 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
47 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
48 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
49 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
50 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
51 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
52 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
53 * SUCH DAMAGE.
54 *
55 * from: Utah Hdr: vn.c 1.13 94/04/02
56 * From: src/sys/dev/vn/vn.c,v 1.122 2000/12/16 16:06:03
57 */
58
59#include "opt_rootdevname.h"
60#include "opt_geom.h"
61#include "opt_md.h"
62
63#include <sys/param.h>
64#include <sys/systm.h>
65#include <sys/bio.h>
66#include <sys/buf.h>
67#include <sys/conf.h>
68#include <sys/devicestat.h>
69#include <sys/fcntl.h>
70#include <sys/kernel.h>
71#include <sys/kthread.h>
72#include <sys/limits.h>
73#include <sys/linker.h>
74#include <sys/lock.h>
75#include <sys/malloc.h>
76#include <sys/mdioctl.h>
77#include <sys/mount.h>
78#include <sys/mutex.h>
79#include <sys/sx.h>
80#include <sys/namei.h>
81#include <sys/proc.h>
82#include <sys/queue.h>
83#include <sys/rwlock.h>
84#include <sys/sbuf.h>
85#include <sys/sched.h>
86#include <sys/sf_buf.h>
87#include <sys/sysctl.h>
88#include <sys/uio.h>
89#include <sys/unistd.h>
90#include <sys/vnode.h>
91#include <sys/disk.h>
92
93#include <geom/geom.h>
94#include <geom/geom_int.h>
95
96#include <vm/vm.h>
97#include <vm/vm_extern.h>
98#include <vm/vm_param.h>
99#include <vm/vm_object.h>
100#include <vm/vm_page.h>
101#include <vm/vm_pager.h>
102#include <vm/swap_pager.h>
103#include <vm/uma.h>
104
105#include <machine/bus.h>
106
107#define MD_MODVER 1
108
109#define MD_SHUTDOWN	0x10000		/* Tell worker thread to terminate. */
110#define	MD_EXITING	0x20000		/* Worker thread is exiting. */
111#define MD_PROVIDERGONE	0x40000		/* Safe to free the softc */
112
113#ifndef MD_NSECT
114#define MD_NSECT (10000 * 2)
115#endif
116
117struct md_req {
118	unsigned	md_unit;	/* unit number */
119	enum md_types	md_type;	/* type of disk */
120	off_t		md_mediasize;	/* size of disk in bytes */
121	unsigned	md_sectorsize;	/* sectorsize */
122	unsigned	md_options;	/* options */
123	int		md_fwheads;	/* firmware heads */
124	int		md_fwsectors;	/* firmware sectors */
125	char		*md_file;	/* pathname of file to mount */
126	enum uio_seg	md_file_seg;	/* location of md_file */
127	char		*md_label;	/* label of the device (userspace) */
128	int		*md_units;	/* pointer to units array (kernel) */
129	size_t		md_units_nitems; /* items in md_units array */
130};
131
132#ifdef COMPAT_FREEBSD32
133struct md_ioctl32 {
134	unsigned	md_version;
135	unsigned	md_unit;
136	enum md_types	md_type;
137	uint32_t	md_file;
138	off_t		md_mediasize;
139	unsigned	md_sectorsize;
140	unsigned	md_options;
141	uint64_t	md_base;
142	int		md_fwheads;
143	int		md_fwsectors;
144	uint32_t	md_label;
145	int		md_pad[MDNPAD];
146}
147#ifdef __amd64__
148__attribute__((__packed__))
149#endif
150;
151#ifndef __amd64__
152CTASSERT((sizeof(struct md_ioctl32)) == 440);
153#else
154CTASSERT((sizeof(struct md_ioctl32)) == 436);
155#endif
156
157#define	MDIOCATTACH_32	_IOC_NEWTYPE(MDIOCATTACH, struct md_ioctl32)
158#define	MDIOCDETACH_32	_IOC_NEWTYPE(MDIOCDETACH, struct md_ioctl32)
159#define	MDIOCQUERY_32	_IOC_NEWTYPE(MDIOCQUERY, struct md_ioctl32)
160#define	MDIOCRESIZE_32	_IOC_NEWTYPE(MDIOCRESIZE, struct md_ioctl32)
161#endif /* COMPAT_FREEBSD32 */
162
163static MALLOC_DEFINE(M_MD, "md_disk", "Memory Disk");
164static MALLOC_DEFINE(M_MDSECT, "md_sectors", "Memory Disk Sectors");
165
166static int md_debug;
167SYSCTL_INT(_debug, OID_AUTO, mddebug, CTLFLAG_RW, &md_debug, 0,
168    "Enable md(4) debug messages");
169static int md_malloc_wait;
170SYSCTL_INT(_vm, OID_AUTO, md_malloc_wait, CTLFLAG_RW, &md_malloc_wait, 0,
171    "Allow malloc to wait for memory allocations");
172
173#if defined(MD_ROOT) && !defined(MD_ROOT_FSTYPE)
174#define	MD_ROOT_FSTYPE	"ufs"
175#endif
176
177#if defined(MD_ROOT)
178/*
179 * Preloaded image gets put here.
180 */
181#if defined(MD_ROOT_SIZE)
/*
 * We put the mfs_root symbol into the oldmfs section of the kernel object
 * file.  Applications that patch the object with the image can determine
 * the size by looking at the oldmfs section size within the kernel.
 */
187u_char mfs_root[MD_ROOT_SIZE*1024] __attribute__ ((section ("oldmfs")));
188const int mfs_root_size = sizeof(mfs_root);
189#elif defined(MD_ROOT_MEM)
/* MD region already mapped into memory */
191u_char *mfs_root;
192int mfs_root_size;
193#else
194extern volatile u_char __weak_symbol mfs_root;
195extern volatile u_char __weak_symbol mfs_root_end;
196#define mfs_root_size ((uintptr_t)(&mfs_root_end - &mfs_root))
197#endif
198#endif
199
200static g_init_t g_md_init;
201static g_fini_t g_md_fini;
202static g_start_t g_md_start;
203static g_access_t g_md_access;
204static void g_md_dumpconf(struct sbuf *sb, const char *indent,
205    struct g_geom *gp, struct g_consumer *cp __unused, struct g_provider *pp);
206static g_provgone_t g_md_providergone;
207
208static struct cdev *status_dev = NULL;
209static struct sx md_sx;
210static struct unrhdr *md_uh;
211
212static d_ioctl_t mdctlioctl;
213
214static struct cdevsw mdctl_cdevsw = {
215	.d_version =	D_VERSION,
216	.d_ioctl =	mdctlioctl,
217	.d_name =	MD_NAME,
218};
219
220struct g_class g_md_class = {
221	.name = "MD",
222	.version = G_VERSION,
223	.init = g_md_init,
224	.fini = g_md_fini,
225	.start = g_md_start,
226	.access = g_md_access,
227	.dumpconf = g_md_dumpconf,
228	.providergone = g_md_providergone,
229};
230
231DECLARE_GEOM_CLASS(g_md_class, g_md);
232MODULE_VERSION(geom_md, 0);
233
234static LIST_HEAD(, md_s) md_softc_list = LIST_HEAD_INITIALIZER(md_softc_list);
235
236#define NINDIR	(PAGE_SIZE / sizeof(uintptr_t))
237#define NMASK	(NINDIR-1)
238static int nshift;
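/*
 * Sector remapping uses a radix tree: each "indir" node holds NINDIR
 * entries and nshift is log2(NINDIR), computed at init time in
 * g_md_init().  For example, with 4 KB pages and 64-bit pointers,
 * NINDIR == 512 and nshift == 9, so each tree level resolves 9 bits
 * of the sector number.
 */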
239
240struct indir {
241	uintptr_t	*array;
242	u_int		total;
243	u_int		used;
244	u_int		shift;
245};
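/*
 * Leaf entries of the indir tree encode a sector as follows: 0 means the
 * sector holds no data (reads return zeroes); values 1..255 mean the whole
 * sector is filled with that byte (the MD_COMPRESS case); larger values are
 * pointers to a sector-sized buffer allocated from the per-device UMA zone.
 * See s_read(), s_write() and mdstart_malloc().
 */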
246
247struct md_s {
248	int unit;
249	LIST_ENTRY(md_s) list;
250	struct bio_queue_head bio_queue;
251	struct mtx queue_mtx;
252	struct cdev *dev;
253	enum md_types type;
254	off_t mediasize;
255	unsigned sectorsize;
256	unsigned opencount;
257	unsigned fwheads;
258	unsigned fwsectors;
259	char ident[32];
260	unsigned flags;
261	char name[20];
262	struct proc *procp;
263	struct g_geom *gp;
264	struct g_provider *pp;
265	int (*start)(struct md_s *sc, struct bio *bp);
266	struct devstat *devstat;
267	bool candelete;
268
269	/* MD_MALLOC related fields */
270	struct indir *indir;
271	uma_zone_t uma;
272
273	/* MD_PRELOAD related fields */
274	u_char *pl_ptr;
275	size_t pl_len;
276
277	/* MD_VNODE related fields */
278	struct vnode *vnode;
279	char file[PATH_MAX];
280	char label[PATH_MAX];
281	struct ucred *cred;
282	vm_offset_t kva;
283
284	/* MD_SWAP related fields */
285	vm_object_t object;
286};
287
288static struct indir *
289new_indir(u_int shift)
290{
291	struct indir *ip;
292
293	ip = malloc(sizeof *ip, M_MD, (md_malloc_wait ? M_WAITOK : M_NOWAIT)
294	    | M_ZERO);
295	if (ip == NULL)
296		return (NULL);
297	ip->array = malloc(sizeof(uintptr_t) * NINDIR,
298	    M_MDSECT, (md_malloc_wait ? M_WAITOK : M_NOWAIT) | M_ZERO);
299	if (ip->array == NULL) {
300		free(ip, M_MD);
301		return (NULL);
302	}
303	ip->total = NINDIR;
304	ip->shift = shift;
305	return (ip);
306}
307
308static void
309del_indir(struct indir *ip)
310{
311
312	free(ip->array, M_MDSECT);
313	free(ip, M_MD);
314}
315
316static void
317destroy_indir(struct md_s *sc, struct indir *ip)
318{
319	int i;
320
321	for (i = 0; i < NINDIR; i++) {
322		if (!ip->array[i])
323			continue;
324		if (ip->shift)
325			destroy_indir(sc, (struct indir*)(ip->array[i]));
326		else if (ip->array[i] > 255)
327			uma_zfree(sc->uma, (void *)(ip->array[i]));
328	}
329	del_indir(ip);
330}
331
332/*
333 * This function does the math and allocates the top level "indir" structure
334 * for a device of "size" sectors.
335 */
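/*
 * For example (assuming NINDIR == 512 and nshift == 9): a device of
 * 2,097,152 sectors (1 GB at 512-byte sectors) leaves the loop below with
 * layer == 2, so the root node gets shift 18 and sector numbers are
 * resolved through three tree levels.
 */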
336
337static struct indir *
338dimension(off_t size)
339{
340	off_t rcnt;
341	struct indir *ip;
342	int layer;
343
344	rcnt = size;
345	layer = 0;
346	while (rcnt > NINDIR) {
347		rcnt /= NINDIR;
348		layer++;
349	}
350
351	/*
352	 * XXX: the top layer is probably not fully populated, so we allocate
353	 * too much space for ip->array in here.
354	 */
355	ip = malloc(sizeof *ip, M_MD, M_WAITOK | M_ZERO);
356	ip->array = malloc(sizeof(uintptr_t) * NINDIR,
357	    M_MDSECT, M_WAITOK | M_ZERO);
358	ip->total = NINDIR;
359	ip->shift = layer * nshift;
360	return (ip);
361}
362
363/*
364 * Read a given sector
365 */
366
367static uintptr_t
368s_read(struct indir *ip, off_t offset)
369{
370	struct indir *cip;
371	int idx;
372	uintptr_t up;
373
374	if (md_debug > 1)
375		printf("s_read(%jd)\n", (intmax_t)offset);
376	up = 0;
377	for (cip = ip; cip != NULL;) {
378		if (cip->shift) {
379			idx = (offset >> cip->shift) & NMASK;
380			up = cip->array[idx];
381			cip = (struct indir *)up;
382			continue;
383		}
384		idx = offset & NMASK;
385		return (cip->array[idx]);
386	}
387	return (0);
388}
389
390/*
391 * Write a given sector, prune the tree if the value is 0
392 */
393
394static int
395s_write(struct indir *ip, off_t offset, uintptr_t ptr)
396{
397	struct indir *cip, *lip[10];
398	int idx, li;
399	uintptr_t up;
400
401	if (md_debug > 1)
402		printf("s_write(%jd, %p)\n", (intmax_t)offset, (void *)ptr);
403	up = 0;
404	li = 0;
405	cip = ip;
406	for (;;) {
407		lip[li++] = cip;
408		if (cip->shift) {
409			idx = (offset >> cip->shift) & NMASK;
410			up = cip->array[idx];
411			if (up != 0) {
412				cip = (struct indir *)up;
413				continue;
414			}
415			/* Allocate branch */
416			cip->array[idx] =
417			    (uintptr_t)new_indir(cip->shift - nshift);
418			if (cip->array[idx] == 0)
419				return (ENOSPC);
420			cip->used++;
421			up = cip->array[idx];
422			cip = (struct indir *)up;
423			continue;
424		}
425		/* leafnode */
426		idx = offset & NMASK;
427		up = cip->array[idx];
428		if (up != 0)
429			cip->used--;
430		cip->array[idx] = ptr;
431		if (ptr != 0)
432			cip->used++;
433		break;
434	}
435	if (cip->used != 0 || li == 1)
436		return (0);
437	li--;
438	while (cip->used == 0 && cip != ip) {
439		li--;
440		idx = (offset >> lip[li]->shift) & NMASK;
441		up = lip[li]->array[idx];
442		KASSERT(up == (uintptr_t)cip, ("md screwed up"));
443		del_indir(cip);
444		lip[li]->array[idx] = 0;
445		lip[li]->used--;
446		cip = lip[li];
447	}
448	return (0);
449}
450
451static int
452g_md_access(struct g_provider *pp, int r, int w, int e)
453{
454	struct md_s *sc;
455
456	sc = pp->geom->softc;
457	if (sc == NULL) {
458		if (r <= 0 && w <= 0 && e <= 0)
459			return (0);
460		return (ENXIO);
461	}
462	r += pp->acr;
463	w += pp->acw;
464	e += pp->ace;
465	if ((sc->flags & MD_READONLY) != 0 && w > 0)
466		return (EROFS);
467	if ((pp->acr + pp->acw + pp->ace) == 0 && (r + w + e) > 0) {
468		sc->opencount = 1;
469	} else if ((pp->acr + pp->acw + pp->ace) > 0 && (r + w + e) == 0) {
470		sc->opencount = 0;
471	}
472	return (0);
473}
474
475static void
476g_md_start(struct bio *bp)
477{
478	struct md_s *sc;
479
480	sc = bp->bio_to->geom->softc;
481	if ((bp->bio_cmd == BIO_READ) || (bp->bio_cmd == BIO_WRITE)) {
482		devstat_start_transaction_bio(sc->devstat, bp);
483	}
484	mtx_lock(&sc->queue_mtx);
485	bioq_disksort(&sc->bio_queue, bp);
486	wakeup(sc);
487	mtx_unlock(&sc->queue_mtx);
488}
489
490#define	MD_MALLOC_MOVE_ZERO	1
491#define	MD_MALLOC_MOVE_FILL	2
492#define	MD_MALLOC_MOVE_READ	3
493#define	MD_MALLOC_MOVE_WRITE	4
494#define	MD_MALLOC_MOVE_CMP	5
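/*
 * Helpers for moving data between a sector image and an unmapped (page
 * array) or vlist bio: ZERO clears the caller's buffer, FILL sets it to a
 * single byte value, READ copies sector data into it, WRITE copies it out
 * into the sector, and CMP checks whether every byte in the buffer is
 * identical (used by the MD_COMPRESS write path), returning EDOOFUS if not.
 */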
495
496static int
497md_malloc_move_ma(vm_page_t **mp, int *ma_offs, unsigned sectorsize,
498    void *ptr, u_char fill, int op)
499{
500	struct sf_buf *sf;
501	vm_page_t m, *mp1;
502	char *p, first;
503	off_t *uc;
504	unsigned n;
505	int error, i, ma_offs1, sz, first_read;
506
507	m = NULL;
508	error = 0;
509	sf = NULL;
510	/* if (op == MD_MALLOC_MOVE_CMP) { gcc */
511		first = 0;
512		first_read = 0;
513		uc = ptr;
514		mp1 = *mp;
515		ma_offs1 = *ma_offs;
516	/* } */
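	/*
	 * Map one page at a time through a CPU-private sf_buf; sched_pin()
	 * keeps the thread on this CPU so the private mapping stays valid
	 * for the duration of the copy.
	 */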
517	sched_pin();
518	for (n = sectorsize; n != 0; n -= sz) {
519		sz = imin(PAGE_SIZE - *ma_offs, n);
520		if (m != **mp) {
521			if (sf != NULL)
522				sf_buf_free(sf);
523			m = **mp;
524			sf = sf_buf_alloc(m, SFB_CPUPRIVATE |
525			    (md_malloc_wait ? 0 : SFB_NOWAIT));
526			if (sf == NULL) {
527				error = ENOMEM;
528				break;
529			}
530		}
531		p = (char *)sf_buf_kva(sf) + *ma_offs;
532		switch (op) {
533		case MD_MALLOC_MOVE_ZERO:
534			bzero(p, sz);
535			break;
536		case MD_MALLOC_MOVE_FILL:
537			memset(p, fill, sz);
538			break;
539		case MD_MALLOC_MOVE_READ:
540			bcopy(ptr, p, sz);
541			cpu_flush_dcache(p, sz);
542			break;
543		case MD_MALLOC_MOVE_WRITE:
544			bcopy(p, ptr, sz);
545			break;
546		case MD_MALLOC_MOVE_CMP:
547			for (i = 0; i < sz; i++, p++) {
548				if (!first_read) {
549					*uc = (u_char)*p;
550					first = *p;
551					first_read = 1;
552				} else if (*p != first) {
553					error = EDOOFUS;
554					break;
555				}
556			}
557			break;
558		default:
559			KASSERT(0, ("md_malloc_move_ma unknown op %d\n", op));
560			break;
561		}
562		if (error != 0)
563			break;
564		*ma_offs += sz;
565		*ma_offs %= PAGE_SIZE;
566		if (*ma_offs == 0)
567			(*mp)++;
568		ptr = (char *)ptr + sz;
569	}
570
571	if (sf != NULL)
572		sf_buf_free(sf);
573	sched_unpin();
574	if (op == MD_MALLOC_MOVE_CMP && error != 0) {
575		*mp = mp1;
576		*ma_offs = ma_offs1;
577	}
578	return (error);
579}
580
581static int
582md_malloc_move_vlist(bus_dma_segment_t **pvlist, int *pma_offs,
583    unsigned len, void *ptr, u_char fill, int op)
584{
585	bus_dma_segment_t *vlist;
586	uint8_t *p, *end, first;
587	off_t *uc;
588	int ma_offs, seg_len;
589
590	vlist = *pvlist;
591	ma_offs = *pma_offs;
592	uc = ptr;
593
594	for (; len != 0; len -= seg_len) {
595		seg_len = imin(vlist->ds_len - ma_offs, len);
596		p = (uint8_t *)(uintptr_t)vlist->ds_addr + ma_offs;
597		switch (op) {
598		case MD_MALLOC_MOVE_ZERO:
599			bzero(p, seg_len);
600			break;
601		case MD_MALLOC_MOVE_FILL:
602			memset(p, fill, seg_len);
603			break;
604		case MD_MALLOC_MOVE_READ:
605			bcopy(ptr, p, seg_len);
606			cpu_flush_dcache(p, seg_len);
607			break;
608		case MD_MALLOC_MOVE_WRITE:
609			bcopy(p, ptr, seg_len);
610			break;
611		case MD_MALLOC_MOVE_CMP:
612			end = p + seg_len;
613			first = *uc = *p;
614			/* Confirm all following bytes match the first */
615			while (++p < end) {
616				if (*p != first)
617					return (EDOOFUS);
618			}
619			break;
620		default:
621			KASSERT(0, ("md_malloc_move_vlist unknown op %d\n", op));
622			break;
623		}
624
625		ma_offs += seg_len;
626		if (ma_offs == vlist->ds_len) {
627			ma_offs = 0;
628			vlist++;
629		}
630		ptr = (uint8_t *)ptr + seg_len;
631	}
632	*pvlist = vlist;
633	*pma_offs = ma_offs;
634
635	return (0);
636}
637
638static int
639mdstart_malloc(struct md_s *sc, struct bio *bp)
640{
641	u_char *dst;
642	vm_page_t *m;
643	bus_dma_segment_t *vlist;
644	int i, error, error1, ma_offs, notmapped;
645	off_t secno, nsec, uc;
646	uintptr_t sp, osp;
647
648	switch (bp->bio_cmd) {
649	case BIO_READ:
650	case BIO_WRITE:
651	case BIO_DELETE:
652		break;
653	case BIO_FLUSH:
654		return (0);
655	default:
656		return (EOPNOTSUPP);
657	}
658
659	notmapped = (bp->bio_flags & BIO_UNMAPPED) != 0;
660	vlist = (bp->bio_flags & BIO_VLIST) != 0 ?
661	    (bus_dma_segment_t *)bp->bio_data : NULL;
662	if (notmapped) {
663		m = bp->bio_ma;
664		ma_offs = bp->bio_ma_offset;
665		dst = NULL;
666		KASSERT(vlist == NULL, ("vlists cannot be unmapped"));
667	} else if (vlist != NULL) {
668		ma_offs = bp->bio_ma_offset;
669		dst = NULL;
670	} else {
671		dst = bp->bio_data;
672	}
673
674	nsec = bp->bio_length / sc->sectorsize;
675	secno = bp->bio_offset / sc->sectorsize;
676	error = 0;
677	while (nsec--) {
678		osp = s_read(sc->indir, secno);
679		if (bp->bio_cmd == BIO_DELETE) {
680			if (osp != 0)
681				error = s_write(sc->indir, secno, 0);
682		} else if (bp->bio_cmd == BIO_READ) {
683			if (osp == 0) {
684				if (notmapped) {
685					error = md_malloc_move_ma(&m, &ma_offs,
686					    sc->sectorsize, NULL, 0,
687					    MD_MALLOC_MOVE_ZERO);
688				} else if (vlist != NULL) {
689					error = md_malloc_move_vlist(&vlist,
690					    &ma_offs, sc->sectorsize, NULL, 0,
691					    MD_MALLOC_MOVE_ZERO);
692				} else
693					bzero(dst, sc->sectorsize);
694			} else if (osp <= 255) {
695				if (notmapped) {
696					error = md_malloc_move_ma(&m, &ma_offs,
697					    sc->sectorsize, NULL, osp,
698					    MD_MALLOC_MOVE_FILL);
699				} else if (vlist != NULL) {
700					error = md_malloc_move_vlist(&vlist,
701					    &ma_offs, sc->sectorsize, NULL, osp,
702					    MD_MALLOC_MOVE_FILL);
703				} else
704					memset(dst, osp, sc->sectorsize);
705			} else {
706				if (notmapped) {
707					error = md_malloc_move_ma(&m, &ma_offs,
708					    sc->sectorsize, (void *)osp, 0,
709					    MD_MALLOC_MOVE_READ);
710				} else if (vlist != NULL) {
711					error = md_malloc_move_vlist(&vlist,
712					    &ma_offs, sc->sectorsize,
713					    (void *)osp, 0,
714					    MD_MALLOC_MOVE_READ);
715				} else {
716					bcopy((void *)osp, dst, sc->sectorsize);
717					cpu_flush_dcache(dst, sc->sectorsize);
718				}
719			}
720			osp = 0;
721		} else if (bp->bio_cmd == BIO_WRITE) {
722			if (sc->flags & MD_COMPRESS) {
723				if (notmapped) {
724					error1 = md_malloc_move_ma(&m, &ma_offs,
725					    sc->sectorsize, &uc, 0,
726					    MD_MALLOC_MOVE_CMP);
727					i = error1 == 0 ? sc->sectorsize : 0;
728				} else if (vlist != NULL) {
729					error1 = md_malloc_move_vlist(&vlist,
730					    &ma_offs, sc->sectorsize, &uc, 0,
731					    MD_MALLOC_MOVE_CMP);
732					i = error1 == 0 ? sc->sectorsize : 0;
733				} else {
734					uc = dst[0];
735					for (i = 1; i < sc->sectorsize; i++) {
736						if (dst[i] != uc)
737							break;
738					}
739				}
740			} else {
741				i = 0;
742				uc = 0;
743			}
744			if (i == sc->sectorsize) {
745				if (osp != uc)
746					error = s_write(sc->indir, secno, uc);
747			} else {
748				if (osp <= 255) {
749					sp = (uintptr_t)uma_zalloc(sc->uma,
750					    md_malloc_wait ? M_WAITOK :
751					    M_NOWAIT);
752					if (sp == 0) {
753						error = ENOSPC;
754						break;
755					}
756					if (notmapped) {
757						error = md_malloc_move_ma(&m,
758						    &ma_offs, sc->sectorsize,
759						    (void *)sp, 0,
760						    MD_MALLOC_MOVE_WRITE);
761					} else if (vlist != NULL) {
762						error = md_malloc_move_vlist(
763						    &vlist, &ma_offs,
764						    sc->sectorsize, (void *)sp,
765						    0, MD_MALLOC_MOVE_WRITE);
766					} else {
767						bcopy(dst, (void *)sp,
768						    sc->sectorsize);
769					}
770					error = s_write(sc->indir, secno, sp);
771				} else {
772					if (notmapped) {
773						error = md_malloc_move_ma(&m,
774						    &ma_offs, sc->sectorsize,
775						    (void *)osp, 0,
776						    MD_MALLOC_MOVE_WRITE);
777					} else if (vlist != NULL) {
778						error = md_malloc_move_vlist(
779						    &vlist, &ma_offs,
780						    sc->sectorsize, (void *)osp,
781						    0, MD_MALLOC_MOVE_WRITE);
782					} else {
783						bcopy(dst, (void *)osp,
784						    sc->sectorsize);
785					}
786					osp = 0;
787				}
788			}
789		} else {
790			error = EOPNOTSUPP;
791		}
792		if (osp > 255)
793			uma_zfree(sc->uma, (void*)osp);
794		if (error != 0)
795			break;
796		secno++;
797		if (!notmapped && vlist == NULL)
798			dst += sc->sectorsize;
799	}
800	bp->bio_resid = 0;
801	return (error);
802}
803
804static void
805mdcopyto_vlist(void *src, bus_dma_segment_t *vlist, off_t offset, off_t len)
806{
807	off_t seg_len;
808
809	while (offset >= vlist->ds_len) {
810		offset -= vlist->ds_len;
811		vlist++;
812	}
813
814	while (len != 0) {
815		seg_len = omin(len, vlist->ds_len - offset);
816		bcopy(src, (void *)(uintptr_t)(vlist->ds_addr + offset),
817		    seg_len);
818		offset = 0;
819		src = (uint8_t *)src + seg_len;
820		len -= seg_len;
821		vlist++;
822	}
823}
824
825static void
826mdcopyfrom_vlist(bus_dma_segment_t *vlist, off_t offset, void *dst, off_t len)
827{
828	off_t seg_len;
829
830	while (offset >= vlist->ds_len) {
831		offset -= vlist->ds_len;
832		vlist++;
833	}
834
835	while (len != 0) {
836		seg_len = omin(len, vlist->ds_len - offset);
837		bcopy((void *)(uintptr_t)(vlist->ds_addr + offset), dst,
838		    seg_len);
839		offset = 0;
840		dst = (uint8_t *)dst + seg_len;
841		len -= seg_len;
842		vlist++;
843	}
844}
845
846static int
847mdstart_preload(struct md_s *sc, struct bio *bp)
848{
849	uint8_t *p;
850
851	p = sc->pl_ptr + bp->bio_offset;
852	switch (bp->bio_cmd) {
853	case BIO_READ:
854		if ((bp->bio_flags & BIO_VLIST) != 0) {
855			mdcopyto_vlist(p, (bus_dma_segment_t *)bp->bio_data,
856			    bp->bio_ma_offset, bp->bio_length);
857		} else {
858			bcopy(p, bp->bio_data, bp->bio_length);
859		}
860		cpu_flush_dcache(bp->bio_data, bp->bio_length);
861		break;
862	case BIO_WRITE:
863		if ((bp->bio_flags & BIO_VLIST) != 0) {
864			mdcopyfrom_vlist((bus_dma_segment_t *)bp->bio_data,
865			    bp->bio_ma_offset, p, bp->bio_length);
866		} else {
867			bcopy(bp->bio_data, p, bp->bio_length);
868		}
869		break;
870	}
871	bp->bio_resid = 0;
872	return (0);
873}
874
875static int
876mdstart_vnode(struct md_s *sc, struct bio *bp)
877{
878	int error;
879	struct uio auio;
880	struct iovec aiov;
881	struct iovec *piov;
882	struct mount *mp;
883	struct vnode *vp;
884	bus_dma_segment_t *vlist;
885	struct thread *td;
886	off_t iolen, iostart, off, len;
887	int ma_offs, npages;
888	bool mapped;
889
890	td = curthread;
891	vp = sc->vnode;
892	piov = NULL;
893	ma_offs = bp->bio_ma_offset;
894	off = bp->bio_offset;
895	len = bp->bio_length;
896	mapped = false;
897
898	/*
899	 * VNODE I/O
900	 *
	 * If an error occurs, we set BIO_ERROR but we do not set
	 * B_INVAL because (for a write, anyway) the buffer is
	 * still valid.
904	 */
905
906	switch (bp->bio_cmd) {
907	case BIO_READ:
908		auio.uio_rw = UIO_READ;
909		break;
910	case BIO_WRITE:
911		auio.uio_rw = UIO_WRITE;
912		break;
913	case BIO_FLUSH:
914		do {
915			(void)vn_start_write(vp, &mp, V_WAIT);
916			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
917			error = VOP_FSYNC(vp, MNT_WAIT, td);
918			VOP_UNLOCK(vp);
919			vn_finished_write(mp);
920		} while (error == ERELOOKUP);
921		return (error);
922	case BIO_DELETE:
923		if (sc->candelete) {
924			error = vn_deallocate(vp, &off, &len, 0,
925			    sc->flags & MD_ASYNC ? 0 : IO_SYNC, sc->cred,
926			    NOCRED);
927			bp->bio_resid = len;
928			return (error);
929		}
930		/* FALLTHROUGH */
931	default:
932		return (EOPNOTSUPP);
933	}
934
935	auio.uio_offset = (vm_ooffset_t)bp->bio_offset;
936	auio.uio_resid = bp->bio_length;
937	auio.uio_segflg = UIO_SYSSPACE;
938	auio.uio_td = td;
939
940	if ((bp->bio_flags & BIO_VLIST) != 0) {
941		piov = malloc(sizeof(*piov) * bp->bio_ma_n, M_MD, M_WAITOK);
942		auio.uio_iov = piov;
943		vlist = (bus_dma_segment_t *)bp->bio_data;
944		while (len > 0) {
945			piov->iov_base = (void *)(uintptr_t)(vlist->ds_addr +
946			    ma_offs);
947			piov->iov_len = vlist->ds_len - ma_offs;
948			if (piov->iov_len > len)
949				piov->iov_len = len;
950			len -= piov->iov_len;
951			ma_offs = 0;
952			vlist++;
953			piov++;
954		}
955		auio.uio_iovcnt = piov - auio.uio_iov;
956		piov = auio.uio_iov;
957	} else if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
958		bp->bio_resid = len;
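		/*
		 * Unmapped bio: map at most maxphys bytes worth of the bio's
		 * pages into the preallocated sc->kva window, perform the
		 * transfer, then unmap and repeat from unmapped_step until
		 * the whole request has been handled.
		 */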
959unmapped_step:
960		npages = atop(min(maxphys, round_page(len + (ma_offs &
961		    PAGE_MASK))));
962		iolen = min(ptoa(npages) - (ma_offs & PAGE_MASK), len);
963		KASSERT(iolen > 0, ("zero iolen"));
964		KASSERT(npages <= atop(maxphys + PAGE_SIZE),
965		    ("npages %d too large", npages));
966		pmap_qenter(sc->kva, &bp->bio_ma[atop(ma_offs)], npages);
967		aiov.iov_base = (void *)(sc->kva + (ma_offs & PAGE_MASK));
968		aiov.iov_len = iolen;
969		auio.uio_iov = &aiov;
970		auio.uio_iovcnt = 1;
971		auio.uio_resid = iolen;
972		mapped = true;
973	} else {
974		aiov.iov_base = bp->bio_data;
975		aiov.iov_len = bp->bio_length;
976		auio.uio_iov = &aiov;
977		auio.uio_iovcnt = 1;
978	}
979	iostart = auio.uio_offset;
980	if (bp->bio_cmd == BIO_READ) {
981		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
982		error = VOP_READ(vp, &auio, 0, sc->cred);
983		VOP_UNLOCK(vp);
984	} else {
985		(void) vn_start_write(vp, &mp, V_WAIT);
986		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
987		error = VOP_WRITE(vp, &auio, sc->flags & MD_ASYNC ? 0 : IO_SYNC,
988		    sc->cred);
989		VOP_UNLOCK(vp);
990		vn_finished_write(mp);
991		if (error == 0)
992			sc->flags &= ~MD_VERIFY;
993	}
994
995	/* When MD_CACHE is set, try to avoid double-caching the data. */
996	if (error == 0 && (sc->flags & MD_CACHE) == 0)
997		VOP_ADVISE(vp, iostart, auio.uio_offset - 1,
998		    POSIX_FADV_DONTNEED);
999
1000	if (mapped) {
1001		pmap_qremove(sc->kva, npages);
1002		if (error == 0) {
1003			len -= iolen;
1004			bp->bio_resid -= iolen;
1005			ma_offs += iolen;
1006			if (len > 0)
1007				goto unmapped_step;
1008		}
1009	} else {
1010		bp->bio_resid = auio.uio_resid;
1011	}
1012
1013	free(piov, M_MD);
1014	return (error);
1015}
1016
1017static int
1018mdstart_swap(struct md_s *sc, struct bio *bp)
1019{
1020	vm_page_t m;
1021	u_char *p;
1022	vm_pindex_t i, lastp;
1023	bus_dma_segment_t *vlist;
1024	int rv, ma_offs, offs, len, lastend;
1025
1026	switch (bp->bio_cmd) {
1027	case BIO_READ:
1028	case BIO_WRITE:
1029	case BIO_DELETE:
1030		break;
1031	case BIO_FLUSH:
1032		return (0);
1033	default:
1034		return (EOPNOTSUPP);
1035	}
1036
1037	p = bp->bio_data;
1038	ma_offs = (bp->bio_flags & (BIO_UNMAPPED|BIO_VLIST)) != 0 ?
1039	    bp->bio_ma_offset : 0;
1040	vlist = (bp->bio_flags & BIO_VLIST) != 0 ?
1041	    (bus_dma_segment_t *)bp->bio_data : NULL;
1042
1043	/*
1044	 * offs is the offset at which to start operating on the
1045	 * next (ie, first) page.  lastp is the last page on
1046	 * which we're going to operate.  lastend is the ending
1047	 * position within that last page (ie, PAGE_SIZE if
1048	 * we're operating on complete aligned pages).
1049	 */
1050	offs = bp->bio_offset % PAGE_SIZE;
1051	lastp = (bp->bio_offset + bp->bio_length - 1) / PAGE_SIZE;
1052	lastend = (bp->bio_offset + bp->bio_length - 1) % PAGE_SIZE + 1;
1053
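	/*
	 * For each page covered by the request: grab the page busy, page it
	 * in from swap when the existing contents matter (or zero-fill it if
	 * the pager has nothing), copy through the mapped buffer, the page
	 * array or the vlist as appropriate, and for BIO_DELETE either zero
	 * the partial range or free a fully-covered page together with its
	 * swap slot.
	 */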
1054	rv = VM_PAGER_OK;
1055	vm_object_pip_add(sc->object, 1);
1056	for (i = bp->bio_offset / PAGE_SIZE; i <= lastp; i++) {
1057		len = ((i == lastp) ? lastend : PAGE_SIZE) - offs;
1058		m = vm_page_grab_unlocked(sc->object, i, VM_ALLOC_SYSTEM);
1059		if (bp->bio_cmd == BIO_READ) {
1060			if (vm_page_all_valid(m))
1061				rv = VM_PAGER_OK;
1062			else
1063				rv = vm_pager_get_pages(sc->object, &m, 1,
1064				    NULL, NULL);
1065			if (rv == VM_PAGER_ERROR) {
1066				VM_OBJECT_WLOCK(sc->object);
1067				vm_page_free(m);
1068				VM_OBJECT_WUNLOCK(sc->object);
1069				break;
1070			} else if (rv == VM_PAGER_FAIL) {
1071				/*
1072				 * Pager does not have the page.  Zero
1073				 * the allocated page, and mark it as
1074				 * valid. Do not set dirty, the page
1075				 * can be recreated if thrown out.
1076				 */
1077				pmap_zero_page(m);
1078				vm_page_valid(m);
1079			}
1080			if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
1081				pmap_copy_pages(&m, offs, bp->bio_ma,
1082				    ma_offs, len);
1083			} else if ((bp->bio_flags & BIO_VLIST) != 0) {
1084				physcopyout_vlist(VM_PAGE_TO_PHYS(m) + offs,
1085				    vlist, ma_offs, len);
1086				cpu_flush_dcache(p, len);
1087			} else {
1088				physcopyout(VM_PAGE_TO_PHYS(m) + offs, p, len);
1089				cpu_flush_dcache(p, len);
1090			}
1091		} else if (bp->bio_cmd == BIO_WRITE) {
1092			if (len == PAGE_SIZE || vm_page_all_valid(m))
1093				rv = VM_PAGER_OK;
1094			else
1095				rv = vm_pager_get_pages(sc->object, &m, 1,
1096				    NULL, NULL);
1097			if (rv == VM_PAGER_ERROR) {
1098				VM_OBJECT_WLOCK(sc->object);
1099				vm_page_free(m);
1100				VM_OBJECT_WUNLOCK(sc->object);
1101				break;
1102			} else if (rv == VM_PAGER_FAIL)
1103				pmap_zero_page(m);
1104
1105			if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
1106				pmap_copy_pages(bp->bio_ma, ma_offs, &m,
1107				    offs, len);
1108			} else if ((bp->bio_flags & BIO_VLIST) != 0) {
1109				physcopyin_vlist(vlist, ma_offs,
1110				    VM_PAGE_TO_PHYS(m) + offs, len);
1111			} else {
1112				physcopyin(p, VM_PAGE_TO_PHYS(m) + offs, len);
1113			}
1114
1115			vm_page_valid(m);
1116			vm_page_set_dirty(m);
1117		} else if (bp->bio_cmd == BIO_DELETE) {
1118			if (len == PAGE_SIZE || vm_page_all_valid(m))
1119				rv = VM_PAGER_OK;
1120			else
1121				rv = vm_pager_get_pages(sc->object, &m, 1,
1122				    NULL, NULL);
1123			VM_OBJECT_WLOCK(sc->object);
1124			if (rv == VM_PAGER_ERROR) {
1125				vm_page_free(m);
1126				VM_OBJECT_WUNLOCK(sc->object);
1127				break;
1128			} else if (rv == VM_PAGER_FAIL) {
1129				vm_page_free(m);
1130				m = NULL;
1131			} else {
1132				/* Page is valid. */
1133				if (len != PAGE_SIZE) {
1134					pmap_zero_page_area(m, offs, len);
1135					vm_page_set_dirty(m);
1136				} else {
1137					vm_pager_page_unswapped(m);
1138					vm_page_free(m);
1139					m = NULL;
1140				}
1141			}
1142			VM_OBJECT_WUNLOCK(sc->object);
1143		}
1144		if (m != NULL) {
1145			/*
1146			 * The page may be deactivated prior to setting
1147			 * PGA_REFERENCED, but in this case it will be
1148			 * reactivated by the page daemon.
1149			 */
1150			if (vm_page_active(m))
1151				vm_page_reference(m);
1152			else
1153				vm_page_activate(m);
1154			vm_page_xunbusy(m);
1155		}
1156
1157		/* Actions on further pages start at offset 0 */
1158		p += PAGE_SIZE - offs;
1159		offs = 0;
1160		ma_offs += len;
1161	}
1162	vm_object_pip_wakeup(sc->object);
1163	return (rv != VM_PAGER_ERROR ? 0 : ENOSPC);
1164}
1165
1166static int
1167mdstart_null(struct md_s *sc, struct bio *bp)
1168{
1169
1170	switch (bp->bio_cmd) {
1171	case BIO_READ:
1172		bzero(bp->bio_data, bp->bio_length);
1173		cpu_flush_dcache(bp->bio_data, bp->bio_length);
1174		break;
1175	case BIO_WRITE:
1176		break;
1177	}
1178	bp->bio_resid = 0;
1179	return (0);
1180}
1181
1182static void
1183md_handleattr(struct md_s *sc, struct bio *bp)
1184{
1185	if (sc->fwsectors && sc->fwheads &&
1186	    (g_handleattr_int(bp, "GEOM::fwsectors", sc->fwsectors) != 0 ||
1187	    g_handleattr_int(bp, "GEOM::fwheads", sc->fwheads) != 0))
1188		return;
1189	if (g_handleattr_int(bp, "GEOM::candelete", sc->candelete) != 0)
1190		return;
1191	if (sc->ident[0] != '\0' &&
1192	    g_handleattr_str(bp, "GEOM::ident", sc->ident) != 0)
1193		return;
1194	if (g_handleattr_int(bp, "MNT::verified", (sc->flags & MD_VERIFY) != 0))
1195		return;
1196	g_io_deliver(bp, EOPNOTSUPP);
1197}
1198
1199static void
1200md_kthread(void *arg)
1201{
1202	struct md_s *sc;
1203	struct bio *bp;
1204	int error;
1205
1206	sc = arg;
1207	thread_lock(curthread);
1208	sched_prio(curthread, PRIBIO);
1209	thread_unlock(curthread);
1210	if (sc->type == MD_VNODE)
1211		curthread->td_pflags |= TDP_NORUNNINGBUF;
1212
1213	for (;;) {
1214		mtx_lock(&sc->queue_mtx);
1215		if (sc->flags & MD_SHUTDOWN) {
1216			sc->flags |= MD_EXITING;
1217			mtx_unlock(&sc->queue_mtx);
1218			kproc_exit(0);
1219		}
1220		bp = bioq_takefirst(&sc->bio_queue);
1221		if (!bp) {
1222			msleep(sc, &sc->queue_mtx, PRIBIO | PDROP, "mdwait", 0);
1223			continue;
1224		}
1225		mtx_unlock(&sc->queue_mtx);
1226		if (bp->bio_cmd == BIO_GETATTR) {
1227			md_handleattr(sc, bp);
1228		} else {
1229			error = sc->start(sc, bp);
1230			if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) {
1231				/*
1232				 * Devstat uses (bio_bcount, bio_resid) for
1233				 * determining the length of the completed part
1234				 * of the i/o.  g_io_deliver() will translate
1235				 * from bio_completed to that, but it also
1236				 * destroys the bio so we must do our own
1237				 * translation.
1238				 */
1239				bp->bio_bcount = bp->bio_length;
1240				devstat_end_transaction_bio(sc->devstat, bp);
1241			}
1242			bp->bio_completed = bp->bio_length - bp->bio_resid;
1243			g_io_deliver(bp, error);
1244		}
1245	}
1246}
1247
1248static struct md_s *
1249mdfind(int unit)
1250{
1251	struct md_s *sc;
1252
1253	LIST_FOREACH(sc, &md_softc_list, list) {
1254		if (sc->unit == unit)
1255			break;
1256	}
1257	return (sc);
1258}
1259
1260static struct md_s *
1261mdnew(int unit, int *errp, enum md_types type)
1262{
1263	struct md_s *sc;
1264	int error;
1265
1266	*errp = 0;
1267	if (unit == -1)
1268		unit = alloc_unr(md_uh);
1269	else
1270		unit = alloc_unr_specific(md_uh, unit);
1271
1272	if (unit == -1) {
1273		*errp = EBUSY;
1274		return (NULL);
1275	}
1276
1277	sc = malloc(sizeof(*sc), M_MD, M_WAITOK | M_ZERO);
1278	sc->type = type;
1279	bioq_init(&sc->bio_queue);
1280	mtx_init(&sc->queue_mtx, "md bio queue", NULL, MTX_DEF);
1281	sc->unit = unit;
1282	sprintf(sc->name, "md%d", unit);
1283	LIST_INSERT_HEAD(&md_softc_list, sc, list);
1284	error = kproc_create(md_kthread, sc, &sc->procp, 0, 0,"%s", sc->name);
1285	if (error == 0)
1286		return (sc);
1287	LIST_REMOVE(sc, list);
1288	mtx_destroy(&sc->queue_mtx);
1289	free_unr(md_uh, sc->unit);
1290	free(sc, M_MD);
1291	*errp = error;
1292	return (NULL);
1293}
1294
1295static void
1296mdinit(struct md_s *sc)
1297{
1298	struct g_geom *gp;
1299	struct g_provider *pp;
1300
1301	g_topology_lock();
1302	gp = g_new_geomf(&g_md_class, "md%d", sc->unit);
1303	gp->softc = sc;
1304	pp = g_new_providerf(gp, "md%d", sc->unit);
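	/*
	 * Drop the devstat entry GEOM attached to the provider; md keeps its
	 * own entry (created below) and updates it directly from
	 * g_md_start() and md_kthread().
	 */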
1305	devstat_remove_entry(pp->stat);
1306	pp->stat = NULL;
1307	pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE;
1308	pp->mediasize = sc->mediasize;
1309	pp->sectorsize = sc->sectorsize;
1310	switch (sc->type) {
1311	case MD_MALLOC:
1312	case MD_VNODE:
1313	case MD_SWAP:
1314		pp->flags |= G_PF_ACCEPT_UNMAPPED;
1315		break;
1316	case MD_PRELOAD:
1317	case MD_NULL:
1318		break;
1319	}
1320	sc->gp = gp;
1321	sc->pp = pp;
1322	sc->devstat = devstat_new_entry("md", sc->unit, sc->sectorsize,
1323	    DEVSTAT_ALL_SUPPORTED, DEVSTAT_TYPE_DIRECT, DEVSTAT_PRIORITY_MAX);
1324	sc->devstat->id = pp;
1325	g_error_provider(pp, 0);
1326	g_topology_unlock();
1327}
1328
1329static int
1330mdcreate_malloc(struct md_s *sc, struct md_req *mdr)
1331{
1332	uintptr_t sp;
1333	int error;
1334	off_t u;
1335
1336	error = 0;
1337	if (mdr->md_options & ~(MD_AUTOUNIT | MD_COMPRESS | MD_RESERVE))
1338		return (EINVAL);
1339	if (mdr->md_sectorsize != 0 && !powerof2(mdr->md_sectorsize))
1340		return (EINVAL);
1341	/* Compression doesn't make sense if we have reserved space */
1342	if (mdr->md_options & MD_RESERVE)
1343		mdr->md_options &= ~MD_COMPRESS;
1344	if (mdr->md_fwsectors != 0)
1345		sc->fwsectors = mdr->md_fwsectors;
1346	if (mdr->md_fwheads != 0)
1347		sc->fwheads = mdr->md_fwheads;
1348	sc->flags = mdr->md_options & (MD_COMPRESS | MD_FORCE | MD_RESERVE);
1349	sc->indir = dimension(sc->mediasize / sc->sectorsize);
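	/*
	 * Per-device UMA zone for sector data; the 0x1ff align mask requests
	 * 512-byte aligned buffers.
	 */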
1350	sc->uma = uma_zcreate(sc->name, sc->sectorsize, NULL, NULL, NULL, NULL,
1351	    0x1ff, 0);
1352	if (mdr->md_options & MD_RESERVE) {
1353		off_t nsectors;
1354
1355		nsectors = sc->mediasize / sc->sectorsize;
1356		for (u = 0; u < nsectors; u++) {
1357			sp = (uintptr_t)uma_zalloc(sc->uma, (md_malloc_wait ?
1358			    M_WAITOK : M_NOWAIT) | M_ZERO);
1359			if (sp != 0)
1360				error = s_write(sc->indir, u, sp);
1361			else
1362				error = ENOMEM;
1363			if (error != 0)
1364				break;
1365		}
1366	}
1367	return (error);
1368}
1369
1370static int
1371mdsetcred(struct md_s *sc, struct ucred *cred)
1372{
1373	char *tmpbuf;
1374	int error = 0;
1375
	/*
	 * Set credentials in our softc
	 */
1379
1380	if (sc->cred)
1381		crfree(sc->cred);
1382	sc->cred = crhold(cred);
1383
1384	/*
1385	 * Horrible kludge to establish credentials for NFS  XXX.
1386	 */
1387
1388	if (sc->vnode) {
1389		struct uio auio;
1390		struct iovec aiov;
1391
1392		tmpbuf = malloc(sc->sectorsize, M_TEMP, M_WAITOK);
1393		bzero(&auio, sizeof(auio));
1394
1395		aiov.iov_base = tmpbuf;
1396		aiov.iov_len = sc->sectorsize;
1397		auio.uio_iov = &aiov;
1398		auio.uio_iovcnt = 1;
1399		auio.uio_offset = 0;
1400		auio.uio_rw = UIO_READ;
1401		auio.uio_segflg = UIO_SYSSPACE;
1402		auio.uio_resid = aiov.iov_len;
1403		vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY);
1404		error = VOP_READ(sc->vnode, &auio, 0, sc->cred);
1405		VOP_UNLOCK(sc->vnode);
1406		free(tmpbuf, M_TEMP);
1407	}
1408	return (error);
1409}
1410
1411static int
1412mdcreate_vnode(struct md_s *sc, struct md_req *mdr, struct thread *td)
1413{
1414	struct vattr vattr;
1415	struct nameidata nd;
1416	char *fname;
1417	int error, flags;
1418	long v;
1419
1420	fname = mdr->md_file;
1421	if (mdr->md_file_seg == UIO_USERSPACE) {
1422		error = copyinstr(fname, sc->file, sizeof(sc->file), NULL);
1423		if (error != 0)
1424			return (error);
1425	} else if (mdr->md_file_seg == UIO_SYSSPACE)
1426		strlcpy(sc->file, fname, sizeof(sc->file));
1427	else
1428		return (EDOOFUS);
1429
1430	/*
1431	 * If the user specified that this is a read only device, don't
1432	 * set the FWRITE mask before trying to open the backing store.
1433	 */
	flags = FREAD | ((mdr->md_options & MD_READONLY) ? 0 : FWRITE) |
	    ((mdr->md_options & MD_VERIFY) ? O_VERIFY : 0);
1436	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, sc->file);
1437	error = vn_open(&nd, &flags, 0, NULL);
1438	if (error != 0)
1439		return (error);
1440	NDFREE_PNBUF(&nd);
1441	if (nd.ni_vp->v_type != VREG) {
1442		error = EINVAL;
1443		goto bad;
1444	}
1445	error = VOP_GETATTR(nd.ni_vp, &vattr, td->td_ucred);
1446	if (error != 0)
1447		goto bad;
1448	if ((mdr->md_options & MD_MUSTDEALLOC) != 0) {
1449		error = VOP_PATHCONF(nd.ni_vp, _PC_DEALLOC_PRESENT, &v);
1450		if (error != 0)
1451			goto bad;
1452		if (v == 0)
1453			sc->candelete = false;
1454	}
1455	if (VOP_ISLOCKED(nd.ni_vp) != LK_EXCLUSIVE) {
1456		vn_lock(nd.ni_vp, LK_UPGRADE | LK_RETRY);
1457		if (VN_IS_DOOMED(nd.ni_vp)) {
1458			/* Forced unmount. */
1459			error = EBADF;
1460			goto bad;
1461		}
1462	}
1463	nd.ni_vp->v_vflag |= VV_MD;
1464	VOP_UNLOCK(nd.ni_vp);
1465
1466	if (mdr->md_fwsectors != 0)
1467		sc->fwsectors = mdr->md_fwsectors;
1468	if (mdr->md_fwheads != 0)
1469		sc->fwheads = mdr->md_fwheads;
1470	snprintf(sc->ident, sizeof(sc->ident), "MD-DEV%ju-INO%ju",
1471	    (uintmax_t)vattr.va_fsid, (uintmax_t)vattr.va_fileid);
1472	sc->flags = mdr->md_options & (MD_ASYNC | MD_CACHE | MD_FORCE |
1473	    MD_VERIFY | MD_MUSTDEALLOC);
1474	if (!(flags & FWRITE))
1475		sc->flags |= MD_READONLY;
1476	sc->vnode = nd.ni_vp;
1477
1478	error = mdsetcred(sc, td->td_ucred);
1479	if (error != 0) {
1480		sc->vnode = NULL;
1481		vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY);
1482		nd.ni_vp->v_vflag &= ~VV_MD;
1483		goto bad;
1484	}
1485
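	/*
	 * KVA window used by mdstart_vnode() for unmapped bios; the extra
	 * page covers transfers that do not start on a page boundary.
	 */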
1486	sc->kva = kva_alloc(maxphys + PAGE_SIZE);
1487	return (0);
1488bad:
1489	VOP_UNLOCK(nd.ni_vp);
1490	(void)vn_close(nd.ni_vp, flags, td->td_ucred, td);
1491	return (error);
1492}
1493
1494static void
1495g_md_providergone(struct g_provider *pp)
1496{
1497	struct md_s *sc = pp->geom->softc;
1498
1499	mtx_lock(&sc->queue_mtx);
1500	sc->flags |= MD_PROVIDERGONE;
1501	wakeup(&sc->flags);
1502	mtx_unlock(&sc->queue_mtx);
1503}
1504
1505static int
1506mddestroy(struct md_s *sc, struct thread *td)
1507{
1508
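	/*
	 * Tear down in stages: wither the geom and wait for
	 * g_md_providergone() so no new requests can arrive, then tell the
	 * worker thread to exit and wait for it, and finally release the
	 * backing store.
	 */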
1509	if (sc->gp) {
1510		g_topology_lock();
1511		g_wither_geom(sc->gp, ENXIO);
1512		g_topology_unlock();
1513
1514		mtx_lock(&sc->queue_mtx);
1515		while (!(sc->flags & MD_PROVIDERGONE))
1516			msleep(&sc->flags, &sc->queue_mtx, PRIBIO, "mddestroy", 0);
1517		mtx_unlock(&sc->queue_mtx);
1518	}
1519	if (sc->devstat) {
1520		devstat_remove_entry(sc->devstat);
1521		sc->devstat = NULL;
1522	}
1523	mtx_lock(&sc->queue_mtx);
1524	sc->flags |= MD_SHUTDOWN;
1525	wakeup(sc);
1526	while (!(sc->flags & MD_EXITING))
1527		msleep(sc->procp, &sc->queue_mtx, PRIBIO, "mddestroy", hz / 10);
1528	mtx_unlock(&sc->queue_mtx);
1529	mtx_destroy(&sc->queue_mtx);
1530	if (sc->vnode != NULL) {
1531		vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY);
1532		sc->vnode->v_vflag &= ~VV_MD;
1533		VOP_UNLOCK(sc->vnode);
1534		(void)vn_close(sc->vnode, sc->flags & MD_READONLY ?
1535		    FREAD : (FREAD|FWRITE), sc->cred, td);
1536	}
1537	if (sc->cred != NULL)
1538		crfree(sc->cred);
1539	if (sc->object != NULL)
1540		vm_object_deallocate(sc->object);
1541	if (sc->indir)
1542		destroy_indir(sc, sc->indir);
1543	if (sc->uma)
1544		uma_zdestroy(sc->uma);
1545	if (sc->kva)
1546		kva_free(sc->kva, maxphys + PAGE_SIZE);
1547
1548	LIST_REMOVE(sc, list);
1549	free_unr(md_uh, sc->unit);
1550	free(sc, M_MD);
1551	return (0);
1552}
1553
1554static int
1555mdresize(struct md_s *sc, struct md_req *mdr)
1556{
1557	int error, res;
1558	vm_pindex_t oldpages, newpages;
1559
1560	switch (sc->type) {
1561	case MD_VNODE:
1562	case MD_NULL:
1563		break;
1564	case MD_SWAP:
1565		if (mdr->md_mediasize <= 0 ||
1566		    (mdr->md_mediasize % PAGE_SIZE) != 0)
1567			return (EDOM);
1568		oldpages = OFF_TO_IDX(sc->mediasize);
1569		newpages = OFF_TO_IDX(mdr->md_mediasize);
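		/*
		 * Shrinking releases the pages and the swap reservation
		 * beyond the new size; growing reserves additional swap
		 * against the owning credential and, with MD_RESERVE,
		 * pre-allocates it up front.
		 */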
1570		if (newpages < oldpages) {
1571			VM_OBJECT_WLOCK(sc->object);
1572			vm_object_page_remove(sc->object, newpages, 0, 0);
1573			swap_release_by_cred(IDX_TO_OFF(oldpages -
1574			    newpages), sc->cred);
1575			sc->object->charge = IDX_TO_OFF(newpages);
1576			sc->object->size = newpages;
1577			VM_OBJECT_WUNLOCK(sc->object);
1578		} else if (newpages > oldpages) {
1579			res = swap_reserve_by_cred(IDX_TO_OFF(newpages -
1580			    oldpages), sc->cred);
1581			if (!res)
1582				return (ENOMEM);
1583			if ((mdr->md_options & MD_RESERVE) ||
1584			    (sc->flags & MD_RESERVE)) {
1585				error = swap_pager_reserve(sc->object,
1586				    oldpages, newpages - oldpages);
1587				if (error < 0) {
1588					swap_release_by_cred(
1589					    IDX_TO_OFF(newpages - oldpages),
1590					    sc->cred);
1591					return (EDOM);
1592				}
1593			}
1594			VM_OBJECT_WLOCK(sc->object);
1595			sc->object->charge = IDX_TO_OFF(newpages);
1596			sc->object->size = newpages;
1597			VM_OBJECT_WUNLOCK(sc->object);
1598		}
1599		break;
1600	default:
1601		return (EOPNOTSUPP);
1602	}
1603
1604	sc->mediasize = mdr->md_mediasize;
1605
1606	g_topology_lock();
1607	g_resize_provider(sc->pp, sc->mediasize);
1608	g_topology_unlock();
1609	return (0);
1610}
1611
1612static int
1613mdcreate_swap(struct md_s *sc, struct md_req *mdr, struct thread *td)
1614{
1615	vm_ooffset_t npage;
1616	int error;
1617
	/*
	 * Range check.  Disallow negative sizes and sizes that are not a
	 * multiple of the page size.
	 */
1622	if (sc->mediasize <= 0 || (sc->mediasize % PAGE_SIZE) != 0)
1623		return (EDOM);
1624
	/*
	 * Allocate an OBJT_SWAP object.
	 *
	 * Note the truncation: the media size is rounded down to a whole
	 * number of pages.
	 */
1630
1631	if ((mdr->md_options & MD_VERIFY) != 0)
1632		return (EINVAL);
1633	npage = mdr->md_mediasize / PAGE_SIZE;
1634	if (mdr->md_fwsectors != 0)
1635		sc->fwsectors = mdr->md_fwsectors;
1636	if (mdr->md_fwheads != 0)
1637		sc->fwheads = mdr->md_fwheads;
1638	sc->object = vm_pager_allocate(OBJT_SWAP, NULL, PAGE_SIZE * npage,
1639	    VM_PROT_DEFAULT, 0, td->td_ucred);
1640	if (sc->object == NULL)
1641		return (ENOMEM);
1642	sc->flags = mdr->md_options & (MD_FORCE | MD_RESERVE);
1643	if (mdr->md_options & MD_RESERVE) {
1644		if (swap_pager_reserve(sc->object, 0, npage) < 0) {
1645			error = EDOM;
1646			goto finish;
1647		}
1648	}
1649	error = mdsetcred(sc, td->td_ucred);
1650 finish:
1651	if (error != 0) {
1652		vm_object_deallocate(sc->object);
1653		sc->object = NULL;
1654	}
1655	return (error);
1656}
1657
1658static int
1659mdcreate_null(struct md_s *sc, struct md_req *mdr, struct thread *td)
1660{
1661
	/*
	 * Range check.  Disallow negative sizes and sizes that are not a
	 * multiple of the page size.
	 */
1666	if (sc->mediasize <= 0 || (sc->mediasize % PAGE_SIZE) != 0)
1667		return (EDOM);
1668
1669	return (0);
1670}
1671
1672static int
1673kern_mdattach_locked(struct thread *td, struct md_req *mdr)
1674{
1675	struct md_s *sc;
1676	unsigned sectsize;
1677	int error, i;
1678
1679	sx_assert(&md_sx, SA_XLOCKED);
1680
1681	switch (mdr->md_type) {
1682	case MD_MALLOC:
1683	case MD_PRELOAD:
1684	case MD_VNODE:
1685	case MD_SWAP:
1686	case MD_NULL:
1687		break;
1688	default:
1689		return (EINVAL);
1690	}
1691	if (mdr->md_sectorsize == 0)
1692		sectsize = DEV_BSIZE;
1693	else
1694		sectsize = mdr->md_sectorsize;
1695	if (sectsize > maxphys || mdr->md_mediasize < sectsize)
1696		return (EINVAL);
1697	if (mdr->md_options & MD_AUTOUNIT)
1698		sc = mdnew(-1, &error, mdr->md_type);
1699	else {
1700		if (mdr->md_unit > INT_MAX)
1701			return (EINVAL);
1702		sc = mdnew(mdr->md_unit, &error, mdr->md_type);
1703	}
1704	if (sc == NULL)
1705		return (error);
1706	if (mdr->md_label != NULL)
1707		error = copyinstr(mdr->md_label, sc->label,
1708		    sizeof(sc->label), NULL);
1709	if (error != 0)
1710		goto err_after_new;
1711	if (mdr->md_options & MD_AUTOUNIT)
1712		mdr->md_unit = sc->unit;
1713	sc->mediasize = mdr->md_mediasize;
1714	sc->sectorsize = sectsize;
1715	sc->candelete = true;
1716	error = EDOOFUS;
1717	switch (sc->type) {
1718	case MD_MALLOC:
1719		sc->start = mdstart_malloc;
1720		error = mdcreate_malloc(sc, mdr);
1721		break;
1722	case MD_PRELOAD:
1723		/*
1724		 * We disallow attaching preloaded memory disks via
1725		 * ioctl. Preloaded memory disks are automatically
1726		 * attached in g_md_init().
1727		 */
1728		error = EOPNOTSUPP;
1729		break;
1730	case MD_VNODE:
1731		sc->start = mdstart_vnode;
1732		error = mdcreate_vnode(sc, mdr, td);
1733		break;
1734	case MD_SWAP:
1735		sc->start = mdstart_swap;
1736		error = mdcreate_swap(sc, mdr, td);
1737		break;
1738	case MD_NULL:
1739		sc->start = mdstart_null;
1740		error = mdcreate_null(sc, mdr, td);
1741		break;
1742	}
1743err_after_new:
1744	if (error != 0) {
1745		mddestroy(sc, td);
1746		return (error);
1747	}
1748
1749	/* Prune off any residual fractional sector */
1750	i = sc->mediasize % sc->sectorsize;
1751	sc->mediasize -= i;
1752
1753	mdinit(sc);
1754	return (0);
1755}
1756
1757static int
1758kern_mdattach(struct thread *td, struct md_req *mdr)
1759{
1760	int error;
1761
1762	sx_xlock(&md_sx);
1763	error = kern_mdattach_locked(td, mdr);
1764	sx_xunlock(&md_sx);
1765	return (error);
1766}
1767
1768static int
1769kern_mddetach_locked(struct thread *td, struct md_req *mdr)
1770{
1771	struct md_s *sc;
1772
1773	sx_assert(&md_sx, SA_XLOCKED);
1774
1775	if (mdr->md_mediasize != 0 ||
1776	    (mdr->md_options & ~MD_FORCE) != 0)
1777		return (EINVAL);
1778
1779	sc = mdfind(mdr->md_unit);
1780	if (sc == NULL)
1781		return (ENOENT);
1782	if (sc->opencount != 0 && !(sc->flags & MD_FORCE) &&
1783	    !(mdr->md_options & MD_FORCE))
1784		return (EBUSY);
1785	return (mddestroy(sc, td));
1786}
1787
1788static int
1789kern_mddetach(struct thread *td, struct md_req *mdr)
1790{
1791	int error;
1792
1793	sx_xlock(&md_sx);
1794	error = kern_mddetach_locked(td, mdr);
1795	sx_xunlock(&md_sx);
1796	return (error);
1797}
1798
1799static int
1800kern_mdresize_locked(struct md_req *mdr)
1801{
1802	struct md_s *sc;
1803
1804	sx_assert(&md_sx, SA_XLOCKED);
1805
1806	if ((mdr->md_options & ~(MD_FORCE | MD_RESERVE)) != 0)
1807		return (EINVAL);
1808
1809	sc = mdfind(mdr->md_unit);
1810	if (sc == NULL)
1811		return (ENOENT);
1812	if (mdr->md_mediasize < sc->sectorsize)
1813		return (EINVAL);
1814	mdr->md_mediasize -= mdr->md_mediasize % sc->sectorsize;
1815	if (mdr->md_mediasize < sc->mediasize &&
1816	    !(sc->flags & MD_FORCE) &&
1817	    !(mdr->md_options & MD_FORCE))
1818		return (EBUSY);
1819	return (mdresize(sc, mdr));
1820}
1821
1822static int
1823kern_mdresize(struct md_req *mdr)
1824{
1825	int error;
1826
1827	sx_xlock(&md_sx);
1828	error = kern_mdresize_locked(mdr);
1829	sx_xunlock(&md_sx);
1830	return (error);
1831}
1832
1833static int
1834kern_mdquery_locked(struct md_req *mdr)
1835{
1836	struct md_s *sc;
1837	int error;
1838
1839	sx_assert(&md_sx, SA_XLOCKED);
1840
1841	sc = mdfind(mdr->md_unit);
1842	if (sc == NULL)
1843		return (ENOENT);
1844	mdr->md_type = sc->type;
1845	mdr->md_options = sc->flags;
1846	mdr->md_mediasize = sc->mediasize;
1847	mdr->md_sectorsize = sc->sectorsize;
1848	error = 0;
1849	if (mdr->md_label != NULL) {
1850		error = copyout(sc->label, mdr->md_label,
1851		    strlen(sc->label) + 1);
1852		if (error != 0)
1853			return (error);
1854	}
1855	if (sc->type == MD_VNODE ||
1856	    (sc->type == MD_PRELOAD && mdr->md_file != NULL))
1857		error = copyout(sc->file, mdr->md_file,
1858		    strlen(sc->file) + 1);
1859	return (error);
1860}
1861
1862static int
1863kern_mdquery(struct md_req *mdr)
1864{
1865	int error;
1866
1867	sx_xlock(&md_sx);
1868	error = kern_mdquery_locked(mdr);
1869	sx_xunlock(&md_sx);
1870	return (error);
1871}
1872
1873/* Copy members that are not userspace pointers. */
1874#define	MD_IOCTL2REQ(mdio, mdr) do {					\
1875	(mdr)->md_unit = (mdio)->md_unit;				\
1876	(mdr)->md_type = (mdio)->md_type;				\
1877	(mdr)->md_mediasize = (mdio)->md_mediasize;			\
1878	(mdr)->md_sectorsize = (mdio)->md_sectorsize;			\
1879	(mdr)->md_options = (mdio)->md_options;				\
1880	(mdr)->md_fwheads = (mdio)->md_fwheads;				\
1881	(mdr)->md_fwsectors = (mdio)->md_fwsectors;			\
1882	(mdr)->md_units = &(mdio)->md_pad[0];				\
1883	(mdr)->md_units_nitems = nitems((mdio)->md_pad);		\
1884} while(0)
1885
1886/* Copy members that might have been updated */
1887#define MD_REQ2IOCTL(mdr, mdio) do {					\
1888	(mdio)->md_unit = (mdr)->md_unit;				\
1889	(mdio)->md_type = (mdr)->md_type;				\
1890	(mdio)->md_mediasize = (mdr)->md_mediasize;			\
1891	(mdio)->md_sectorsize = (mdr)->md_sectorsize;			\
1892	(mdio)->md_options = (mdr)->md_options;				\
1893	(mdio)->md_fwheads = (mdr)->md_fwheads;				\
1894	(mdio)->md_fwsectors = (mdr)->md_fwsectors;			\
1895} while(0)
1896
1897static int
1898mdctlioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flags,
1899    struct thread *td)
1900{
1901	struct md_req mdr;
1902	int error;
1903
1904	if (md_debug)
1905		printf("mdctlioctl(%s %lx %p %x %p)\n",
1906			devtoname(dev), cmd, addr, flags, td);
1907
1908	bzero(&mdr, sizeof(mdr));
1909	switch (cmd) {
1910	case MDIOCATTACH:
1911	case MDIOCDETACH:
1912	case MDIOCRESIZE:
1913	case MDIOCQUERY: {
1914		struct md_ioctl *mdio = (struct md_ioctl *)addr;
1915		if (mdio->md_version != MDIOVERSION)
1916			return (EINVAL);
1917		MD_IOCTL2REQ(mdio, &mdr);
1918		mdr.md_file = mdio->md_file;
1919		mdr.md_file_seg = UIO_USERSPACE;
		/* A file name adjacent to the md_ioctl is in kernel space. */
1921		if ((void *)mdio->md_file == (void *)(mdio + 1))
1922			mdr.md_file_seg = UIO_SYSSPACE;
1923		mdr.md_label = mdio->md_label;
1924		break;
1925	}
1926#ifdef COMPAT_FREEBSD32
1927	case MDIOCATTACH_32:
1928	case MDIOCDETACH_32:
1929	case MDIOCRESIZE_32:
1930	case MDIOCQUERY_32: {
1931		struct md_ioctl32 *mdio = (struct md_ioctl32 *)addr;
1932		if (mdio->md_version != MDIOVERSION)
1933			return (EINVAL);
1934		MD_IOCTL2REQ(mdio, &mdr);
1935		mdr.md_file = (void *)(uintptr_t)mdio->md_file;
1936		mdr.md_file_seg = UIO_USERSPACE;
1937		mdr.md_label = (void *)(uintptr_t)mdio->md_label;
1938		break;
1939	}
1940#endif
1941	default:
1942		/* Fall through to handler switch. */
1943		break;
1944	}
1945
1946	error = 0;
1947	switch (cmd) {
1948	case MDIOCATTACH:
1949#ifdef COMPAT_FREEBSD32
1950	case MDIOCATTACH_32:
1951#endif
1952		error = kern_mdattach(td, &mdr);
1953		break;
1954	case MDIOCDETACH:
1955#ifdef COMPAT_FREEBSD32
1956	case MDIOCDETACH_32:
1957#endif
1958		error = kern_mddetach(td, &mdr);
1959		break;
1960	case MDIOCRESIZE:
1961#ifdef COMPAT_FREEBSD32
1962	case MDIOCRESIZE_32:
1963#endif
1964		error = kern_mdresize(&mdr);
1965		break;
1966	case MDIOCQUERY:
1967#ifdef COMPAT_FREEBSD32
1968	case MDIOCQUERY_32:
1969#endif
1970		error = kern_mdquery(&mdr);
1971		break;
1972	default:
1973		error = ENOIOCTL;
1974	}
1975
1976	switch (cmd) {
1977	case MDIOCATTACH:
1978	case MDIOCQUERY: {
1979		struct md_ioctl *mdio = (struct md_ioctl *)addr;
1980		MD_REQ2IOCTL(&mdr, mdio);
1981		break;
1982	}
1983#ifdef COMPAT_FREEBSD32
1984	case MDIOCATTACH_32:
1985	case MDIOCQUERY_32: {
1986		struct md_ioctl32 *mdio = (struct md_ioctl32 *)addr;
1987		MD_REQ2IOCTL(&mdr, mdio);
1988		break;
1989	}
1990#endif
1991	default:
		/* Other commands do not alter mdr. */
1993		break;
1994	}
1995
1996	return (error);
1997}
1998
1999static void
2000md_preloaded(u_char *image, size_t length, const char *name)
2001{
2002	struct md_s *sc;
2003	int error;
2004
2005	sc = mdnew(-1, &error, MD_PRELOAD);
2006	if (sc == NULL)
2007		return;
2008	sc->mediasize = length;
2009	sc->sectorsize = DEV_BSIZE;
2010	sc->pl_ptr = image;
2011	sc->pl_len = length;
2012	sc->start = mdstart_preload;
2013	if (name != NULL)
2014		strlcpy(sc->file, name, sizeof(sc->file));
2015#ifdef MD_ROOT
2016	if (sc->unit == 0) {
2017#ifndef ROOTDEVNAME
2018		rootdevnames[0] = MD_ROOT_FSTYPE ":/dev/md0";
2019#endif
2020#ifdef MD_ROOT_READONLY
2021		sc->flags |= MD_READONLY;
2022#endif
2023	}
2024#endif
2025	mdinit(sc);
2026	if (name != NULL) {
2027		printf("%s%d: Preloaded image <%s> %zd bytes at %p\n",
2028		    MD_NAME, sc->unit, name, length, image);
2029	} else {
2030		printf("%s%d: Embedded image %zd bytes at %p\n",
2031		    MD_NAME, sc->unit, length, image);
2032	}
2033}
2034
2035static void
2036g_md_init(struct g_class *mp __unused)
2037{
2038	caddr_t mod;
2039	u_char *ptr, *name, *type;
2040	unsigned len;
2041	int i;
2042
2043	/* figure out log2(NINDIR) */
2044	for (i = NINDIR, nshift = -1; i; nshift++)
2045		i >>= 1;
2046
2047	mod = NULL;
2048	sx_init(&md_sx, "MD config lock");
2049	g_topology_unlock();
2050	md_uh = new_unrhdr(0, INT_MAX, NULL);
2051#ifdef MD_ROOT
2052	if (mfs_root_size != 0) {
2053		sx_xlock(&md_sx);
2054#ifdef MD_ROOT_MEM
2055		md_preloaded(mfs_root, mfs_root_size, NULL);
2056#else
2057		md_preloaded(__DEVOLATILE(u_char *, &mfs_root), mfs_root_size,
2058		    NULL);
2059#endif
2060		sx_xunlock(&md_sx);
2061	}
2062#endif
2063	/* XXX: are preload_* static or do they need Giant ? */
2064	while ((mod = preload_search_next_name(mod)) != NULL) {
2065		name = (char *)preload_search_info(mod, MODINFO_NAME);
2066		if (name == NULL)
2067			continue;
2068		type = (char *)preload_search_info(mod, MODINFO_TYPE);
2069		if (type == NULL)
2070			continue;
2071		if (strcmp(type, "md_image") && strcmp(type, "mfs_root"))
2072			continue;
2073		ptr = preload_fetch_addr(mod);
2074		len = preload_fetch_size(mod);
2075		if (ptr != NULL && len != 0) {
2076			sx_xlock(&md_sx);
2077			md_preloaded(ptr, len, name);
2078			sx_xunlock(&md_sx);
2079		}
2080	}
2081	status_dev = make_dev(&mdctl_cdevsw, INT_MAX, UID_ROOT, GID_WHEEL,
2082	    0600, MDCTL_NAME);
2083	g_topology_lock();
2084}
2085
2086static void
2087g_md_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
2088    struct g_consumer *cp __unused, struct g_provider *pp)
2089{
2090	struct md_s *mp;
2091	char *type;
2092
2093	mp = gp->softc;
2094	if (mp == NULL)
2095		return;
2096
2097	switch (mp->type) {
2098	case MD_MALLOC:
2099		type = "malloc";
2100		break;
2101	case MD_PRELOAD:
2102		type = "preload";
2103		break;
2104	case MD_VNODE:
2105		type = "vnode";
2106		break;
2107	case MD_SWAP:
2108		type = "swap";
2109		break;
2110	case MD_NULL:
2111		type = "null";
2112		break;
2113	default:
2114		type = "unknown";
2115		break;
2116	}
2117
2118	if (pp != NULL) {
2119		if (indent == NULL) {
2120			sbuf_printf(sb, " u %d", mp->unit);
2121			sbuf_printf(sb, " s %ju", (uintmax_t) mp->sectorsize);
2122			sbuf_printf(sb, " f %ju", (uintmax_t) mp->fwheads);
2123			sbuf_printf(sb, " fs %ju", (uintmax_t) mp->fwsectors);
2124			sbuf_printf(sb, " l %ju", (uintmax_t) mp->mediasize);
2125			sbuf_printf(sb, " t %s", type);
2126			if ((mp->type == MD_VNODE && mp->vnode != NULL) ||
2127			    (mp->type == MD_PRELOAD && mp->file[0] != '\0'))
2128				sbuf_printf(sb, " file %s", mp->file);
2129			sbuf_printf(sb, " label %s", mp->label);
2130		} else {
2131			sbuf_printf(sb, "%s<unit>%d</unit>\n", indent,
2132			    mp->unit);
2133			sbuf_printf(sb, "%s<sectorsize>%ju</sectorsize>\n",
2134			    indent, (uintmax_t) mp->sectorsize);
2135			sbuf_printf(sb, "%s<fwheads>%ju</fwheads>\n",
2136			    indent, (uintmax_t) mp->fwheads);
2137			sbuf_printf(sb, "%s<fwsectors>%ju</fwsectors>\n",
2138			    indent, (uintmax_t) mp->fwsectors);
2139			if (mp->ident[0] != '\0') {
2140				sbuf_printf(sb, "%s<ident>", indent);
2141				g_conf_printf_escaped(sb, "%s", mp->ident);
2142				sbuf_printf(sb, "</ident>\n");
2143			}
2144			sbuf_printf(sb, "%s<length>%ju</length>\n",
2145			    indent, (uintmax_t) mp->mediasize);
2146			sbuf_printf(sb, "%s<compression>%s</compression>\n", indent,
2147			    (mp->flags & MD_COMPRESS) == 0 ? "off": "on");
2148			sbuf_printf(sb, "%s<access>%s</access>\n", indent,
2149			    (mp->flags & MD_READONLY) == 0 ? "read-write":
2150			    "read-only");
2151			sbuf_printf(sb, "%s<type>%s</type>\n", indent,
2152			    type);
2153			if ((mp->type == MD_VNODE && mp->vnode != NULL) ||
2154			    (mp->type == MD_PRELOAD && mp->file[0] != '\0')) {
2155				sbuf_printf(sb, "%s<file>", indent);
2156				g_conf_printf_escaped(sb, "%s", mp->file);
2157				sbuf_printf(sb, "</file>\n");
2158			}
2159			if (mp->type == MD_VNODE)
2160				sbuf_printf(sb, "%s<cache>%s</cache>\n", indent,
2161				    (mp->flags & MD_CACHE) == 0 ? "off": "on");
2162			sbuf_printf(sb, "%s<label>", indent);
2163			g_conf_printf_escaped(sb, "%s", mp->label);
2164			sbuf_printf(sb, "</label>\n");
2165		}
2166	}
2167}
2168
2169static void
2170g_md_fini(struct g_class *mp __unused)
2171{
2172
2173	sx_destroy(&md_sx);
2174	if (status_dev != NULL)
2175		destroy_dev(status_dev);
2176	delete_unrhdr(md_uh);
2177}
2178