md.c revision 97291
1/*
2 * ----------------------------------------------------------------------------
3 * "THE BEER-WARE LICENSE" (Revision 42):
4 * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
5 * can do whatever you want with this stuff. If we meet some day, and you think
6 * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
7 * ----------------------------------------------------------------------------
8 *
9 * $FreeBSD: head/sys/dev/md/md.c 97291 2002-05-25 20:44:20Z phk $
10 *
11 */
12
13/*
14 * The following functions are based in the vn(4) driver: mdstart_swap(),
15 * mdstart_vnode(), mdcreate_swap(), mdcreate_vnode() and mddestroy(),
16 * and as such under the following copyright:
17 *
18 * Copyright (c) 1988 University of Utah.
19 * Copyright (c) 1990, 1993
20 *	The Regents of the University of California.  All rights reserved.
21 *
22 * This code is derived from software contributed to Berkeley by
23 * the Systems Programming Group of the University of Utah Computer
24 * Science Department.
25 *
26 * Redistribution and use in source and binary forms, with or without
27 * modification, are permitted provided that the following conditions
28 * are met:
29 * 1. Redistributions of source code must retain the above copyright
30 *    notice, this list of conditions and the following disclaimer.
31 * 2. Redistributions in binary form must reproduce the above copyright
32 *    notice, this list of conditions and the following disclaimer in the
33 *    documentation and/or other materials provided with the distribution.
34 * 3. All advertising materials mentioning features or use of this software
35 *    must display the following acknowledgement:
36 *	This product includes software developed by the University of
37 *	California, Berkeley and its contributors.
38 * 4. Neither the name of the University nor the names of its contributors
39 *    may be used to endorse or promote products derived from this software
40 *    without specific prior written permission.
41 *
42 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52 * SUCH DAMAGE.
53 *
54 * from: Utah Hdr: vn.c 1.13 94/04/02
55 *
56 *	from: @(#)vn.c	8.6 (Berkeley) 4/1/94
57 * From: src/sys/dev/vn/vn.c,v 1.122 2000/12/16 16:06:03
58 */
59
60#include "opt_md.h"
61
62#include <sys/param.h>
63#include <sys/systm.h>
64#include <sys/bio.h>
65#include <sys/conf.h>
66#include <sys/devicestat.h>
67#include <sys/disk.h>
68#include <sys/fcntl.h>
69#include <sys/kernel.h>
70#include <sys/linker.h>
71#include <sys/lock.h>
72#include <sys/malloc.h>
73#include <sys/mdioctl.h>
74#include <sys/mutex.h>
75#include <sys/namei.h>
76#include <sys/proc.h>
77#include <sys/queue.h>
78#include <sys/sysctl.h>
79#include <sys/vnode.h>
80
81#include <machine/atomic.h>
82
83#include <vm/vm.h>
84#include <vm/vm_object.h>
85#include <vm/vm_page.h>
86#include <vm/vm_pager.h>
87#include <vm/swap_pager.h>
88
89#define MD_MODVER 1
90
91#ifndef MD_NSECT
92#define MD_NSECT (10000 * 2)
93#endif
94
95static MALLOC_DEFINE(M_MD, "MD disk", "Memory Disk");
96static MALLOC_DEFINE(M_MDSECT, "MD sectors", "Memory Disk Sectors");
97
98static int md_debug;
99SYSCTL_INT(_debug, OID_AUTO, mddebug, CTLFLAG_RW, &md_debug, 0, "");
100
#if defined(MD_ROOT) && defined(MD_ROOT_SIZE)
/*
 * Room for an embedded root image of MD_ROOT_SIZE kilobytes.  The two
 * marker strings show where the image starts and where it must stop.
 */
static u_char mfs_root[MD_ROOT_SIZE*1024] = "MFS Filesystem goes here";
static u_char end_mfs_root[] __unused = "MFS Filesystem had better STOP here";
#endif
106
107static int	mdrootready;
108static int	mdunits;
109static dev_t	status_dev = 0;
110
111#define CDEV_MAJOR	95
112
113static d_strategy_t mdstrategy;
114static d_open_t mdopen;
115static d_close_t mdclose;
116static d_ioctl_t mdioctl, mdctlioctl;
117
118static struct cdevsw md_cdevsw = {
119        /* open */      mdopen,
120        /* close */     mdclose,
121        /* read */      physread,
122        /* write */     physwrite,
123        /* ioctl */     mdioctl,
124        /* poll */      nopoll,
125        /* mmap */      nommap,
126        /* strategy */  mdstrategy,
127        /* name */      MD_NAME,
128        /* maj */       CDEV_MAJOR,
129        /* dump */      nodump,
130        /* psize */     nopsize,
131        /* flags */     D_DISK | D_CANFREE | D_MEMDISK,
132};
133
134static struct cdevsw mdctl_cdevsw = {
135        /* open */      nullopen,
136        /* close */     nullclose,
137        /* read */      noread,
138        /* write */     nowrite,
139        /* ioctl */     mdctlioctl,
140        /* poll */      nopoll,
141        /* mmap */      nommap,
142        /* strategy */  nostrategy,
143        /* name */      MD_NAME,
144        /* maj */       CDEV_MAJOR
145};
146
147static struct cdevsw mddisk_cdevsw;
148
149static LIST_HEAD(, md_s) md_softc_list = LIST_HEAD_INITIALIZER(&md_softc_list);
150
151#define NINDIR	(PAGE_SIZE / sizeof(uintptr_t))
152#define NMASK	(NINDIR-1)
153static int nshift;
154
155struct indir {
156	uintptr_t	*array;
157	uint		total;
158	uint		used;
159	uint		shift;
160};
161
162struct md_s {
163	int unit;
164	LIST_ENTRY(md_s) list;
165	struct devstat stats;
166	struct bio_queue_head bio_queue;
167	struct disk disk;
168	dev_t dev;
169	int busy;
170	enum md_types type;
171	unsigned nsect;
172	unsigned opencount;
173	unsigned secsize;
174	unsigned flags;
175
176	/* MD_MALLOC related fields */
177	struct indir *indir;
178
179	/* MD_PRELOAD related fields */
180	u_char *pl_ptr;
181	unsigned pl_len;
182
183	/* MD_VNODE related fields */
184	struct vnode *vnode;
185	struct ucred *cred;
186
187	/* MD_SWAP related fields */
188	vm_object_t object;
189};
190
191static int mddestroy(struct md_s *sc, struct thread *td);
192
193static struct indir *
194new_indir(uint shift)
195{
196	struct indir *ip;
197
198	ip = malloc(sizeof *ip, M_MD, M_NOWAIT | M_ZERO);
199	if (ip == NULL)
200		return(NULL);
201	ip->array = malloc(sizeof(uintptr_t) * NINDIR,
202	    M_MDSECT, M_NOWAIT | M_ZERO);
203	if (ip->array == NULL) {
204		free(ip, M_MD);
205		return(NULL);
206	}
207	ip->total = NINDIR;
208	ip->shift = shift;
209	return(ip);
210}
211
212static void
213del_indir(struct indir *ip)
214{
215
216	free(ip->array, M_MD);
217	free(ip, M_MD);
218}
219
220/*
 * This function does the math and allocates the top level "indir" structure
 * for a device of "size" sectors.
223 */
224
225static struct indir *
226dimension(off_t size)
227{
228	off_t rcnt;
229	struct indir *ip;
230	int i, layer;
231
232	rcnt = size;
233	layer = 0;
234	while (rcnt > NINDIR) {
235		rcnt /= NINDIR;
236		layer++;
237	}
238	/* figure out log2(NINDIR) */
239	for (i = NINDIR, nshift = -1; i; nshift++)
240		i >>= 1;
241
242	/*
243	 * XXX: the top layer is probably not fully populated, so we allocate
244	 * too much space for ip->array in new_indir() here.
245	 */
246	ip = new_indir(layer * nshift);
247	return (ip);
248}
249
250/*
251 * Read a given sector
252 */
253
254static uintptr_t
255s_read(struct indir *ip, off_t offset)
256{
257	struct indir *cip;
258	int idx;
259	uintptr_t up;
260
261	if (md_debug > 1)
262		printf("s_read(%lld)\n", offset);
263	up = 0;
264	for (cip = ip; cip != NULL;) {
265		if (cip->shift) {
266			idx = (offset >> cip->shift) & NMASK;
267			up = cip->array[idx];
268			cip = (struct indir *)up;
269			continue;
270		}
271		idx = offset & NMASK;
272		return(cip->array[idx]);
273	}
274	return (0);
275}
276
277/*
278 * Write a given sector, prune the tree if the value is 0
279 * If the new value is different from the old, return the old value.
280 */
281
282static int
283s_write(struct indir *ip, off_t offset, uintptr_t ptr, uintptr_t *old)
284{
285	struct indir *cip, *lip[10];
286	int idx, li;
287	uintptr_t up;
288
289	if (md_debug > 1)
290		printf("s_write(%lld, %p, %p)\n", offset, (void *)ptr, old);
291	up = 0;
292	li = 0;
293	cip = ip;
294	for (;;) {
295		lip[li++] = cip;
296		if (cip->shift) {
297			idx = (offset >> cip->shift) & NMASK;
298			up = cip->array[idx];
299			if (up != 0) {
300				cip = (struct indir *)up;
301				continue;
302			}
303			/* Allocate branch */
304			cip->array[idx] =
305			    (uintptr_t)new_indir(cip->shift - nshift);
306			if (cip->array[idx] == 0)
307				return(ENOMEM);
308			cip->used++;
309			up = cip->array[idx];
310			cip = (struct indir *)up;
311			continue;
312		}
313		/* leafnode */
314		idx = offset & NMASK;
315		up = cip->array[idx];
316		if (old != NULL && up != ptr)
317			*old = up;
318		if (up != 0)
319			cip->used--;
320		cip->array[idx] = ptr;
321		if (ptr != 0)
322			cip->used++;
323		break;
324	}
325	if (cip->used != 0 || li == 1)
326		return (0);
327	li--;
328	while (cip->used == 0 && cip != ip) {
329		li--;
330		idx = (offset >> lip[li]->shift) & NMASK;
331		up = lip[li]->array[idx];
332		KASSERT(up == (uintptr_t)cip, ("md screwed up"));
333		del_indir(cip);
334		lip[li]->array[idx] = NULL;
335		lip[li]->used--;
336		cip = lip[li];
337	}
338	return (0);
339}
340
341static int
342mdopen(dev_t dev, int flag, int fmt, struct thread *td)
343{
344	struct md_s *sc;
345	struct disklabel *dl;
346
347	if (md_debug)
348		printf("mdopen(%p %x %x %p)\n",
349			devtoname(dev), flag, fmt, td);
350
351	sc = dev->si_drv1;
352
353	dl = &sc->disk.d_label;
354	bzero(dl, sizeof(*dl));
355	dl->d_secsize = sc->secsize;
356	dl->d_nsectors = sc->nsect > 63 ? 63 : sc->nsect;
357	dl->d_ntracks = 1;
358	dl->d_secpercyl = dl->d_nsectors * dl->d_ntracks;
359	dl->d_secperunit = sc->nsect;
360	dl->d_ncylinders = dl->d_secperunit / dl->d_secpercyl;
361	sc->opencount++;
362	return (0);
363}
364
365static int
366mdclose(dev_t dev, int flags, int fmt, struct thread *td)
367{
368	struct md_s *sc = dev->si_drv1;
369
370	sc->opencount--;
371	return (0);
372}
373
374static int
375mdioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct thread *td)
376{
377
378	if (md_debug)
379		printf("mdioctl(%s %lx %p %x %p)\n",
380			devtoname(dev), cmd, addr, flags, td);
381
382	return (ENOIOCTL);
383}
384
385static int
386mdstart_malloc(struct md_s *sc, struct bio *bp)
387{
388	int i, error;
389	u_char *dst;
390	unsigned secno, nsec, uc;
391	uintptr_t sp, osp;
392
393	nsec = bp->bio_bcount / sc->secsize;
394	secno = bp->bio_pblkno;
395	dst = bp->bio_data;
396	error = 0;
397	while (nsec--) {
398		osp = 0;
399		if (bp->bio_cmd == BIO_DELETE) {
400			error = s_write(sc->indir, secno, 0, &osp);
401		} else if (bp->bio_cmd == BIO_READ) {
402			sp = s_read(sc->indir, secno);
403			if (sp == 0)
404				bzero(dst, sc->secsize);
405			else if (sp <= 255)
406				for (i = 0; i < sc->secsize; i++)
407					dst[i] = sp;
408			else
409				bcopy((void *)sp, dst, sc->secsize);
410		} else if (bp->bio_cmd == BIO_WRITE) {
411			if (sc->flags & MD_COMPRESS) {
412				uc = dst[0];
413				for (i = 1; i < sc->secsize; i++)
414					if (dst[i] != uc)
415						break;
416			} else {
417				i = 0;
418				uc = 0;
419			}
420			if (i == sc->secsize) {
421				error = s_write(sc->indir, secno, uc, &osp);
422			} else {
423				sp = s_read(sc->indir, secno);
424				if (sp <= 255)
425					sp = (uintptr_t) malloc(
426					    sc->secsize, M_MDSECT, M_NOWAIT);
427				if (sp == 0) {
428					error = ENOMEM;
429				} else {
430					bcopy(dst, (void *)sp, sc->secsize);
431					error = s_write(sc->indir, secno,
432					    sp, &osp);
433				}
434			}
435		} else {
436			error = EOPNOTSUPP;
437		}
438		if (osp > 255)
439			free((void*)osp, M_MDSECT);
440		if (error)
441			break;
442		secno++;
443		dst += sc->secsize;
444	}
445	bp->bio_resid = 0;
446	return (error);
447}
448
449
450static int
451mdstart_preload(struct md_s *sc, struct bio *bp)
452{
453
454	if (bp->bio_cmd == BIO_DELETE) {
455	} else if (bp->bio_cmd == BIO_READ) {
456		bcopy(sc->pl_ptr + (bp->bio_pblkno << DEV_BSHIFT), bp->bio_data, bp->bio_bcount);
457	} else {
458		bcopy(bp->bio_data, sc->pl_ptr + (bp->bio_pblkno << DEV_BSHIFT), bp->bio_bcount);
459	}
460	bp->bio_resid = 0;
461	return (0);
462}
463
464static int
465mdstart_vnode(struct md_s *sc, struct bio *bp)
466{
467	int error;
468	struct uio auio;
469	struct iovec aiov;
470	struct mount *mp;
471
472	/*
473	 * VNODE I/O
474	 *
475	 * If an error occurs, we set BIO_ERROR but we do not set
476	 * B_INVAL because (for a write anyway), the buffer is
477	 * still valid.
478	 */
479
480	bzero(&auio, sizeof(auio));
481
482	aiov.iov_base = bp->bio_data;
483	aiov.iov_len = bp->bio_bcount;
484	auio.uio_iov = &aiov;
485	auio.uio_iovcnt = 1;
486	auio.uio_offset = (vm_ooffset_t)bp->bio_pblkno * sc->secsize;
487	auio.uio_segflg = UIO_SYSSPACE;
488	if(bp->bio_cmd == BIO_READ)
489		auio.uio_rw = UIO_READ;
490	else
491		auio.uio_rw = UIO_WRITE;
492	auio.uio_resid = bp->bio_bcount;
493	auio.uio_td = curthread;
494	/*
495	 * When reading set IO_DIRECT to try to avoid double-caching
496	 * the data.  When writing IO_DIRECT is not optimal, but we
497	 * must set IO_NOWDRAIN to avoid a wdrain deadlock.
498	 */
499	if (bp->bio_cmd == BIO_READ) {
500		vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread);
501		error = VOP_READ(sc->vnode, &auio, IO_DIRECT, sc->cred);
502	} else {
503		(void) vn_start_write(sc->vnode, &mp, V_WAIT);
504		vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread);
505		error = VOP_WRITE(sc->vnode, &auio, IO_NOWDRAIN, sc->cred);
506		vn_finished_write(mp);
507	}
508	VOP_UNLOCK(sc->vnode, 0, curthread);
509	bp->bio_resid = auio.uio_resid;
510	return (error);
511}
512
513static int
514mdstart_swap(struct md_s *sc, struct bio *bp)
515{
516
517	if ((bp->bio_cmd == BIO_DELETE) && (sc->flags & MD_RESERVE))
518		biodone(bp);
519	else
520		vm_pager_strategy(sc->object, bp);
521	return (-1);
522}
523
524static void
525mdstrategy(struct bio *bp)
526{
527	struct md_s *sc;
528	int error;
529
530	if (md_debug > 1)
531		printf("mdstrategy(%p) %s %x, %lld, %ld, %p)\n",
532		    (void *)bp, devtoname(bp->bio_dev), bp->bio_flags,
533		    (long long)bp->bio_blkno, bp->bio_bcount / DEV_BSIZE,
534		    (void *)bp->bio_data);
535
536	sc = bp->bio_dev->si_drv1;
537
538	/* XXX: LOCK(sc->lock) */
539	bioqdisksort(&sc->bio_queue, bp);
540	/* XXX: UNLOCK(sc->lock) */
541
542	if (atomic_cmpset_int(&sc->busy, 0, 1) == 0)
543		return;
544
545	for (;;) {
546		/* XXX: LOCK(unique unit numbers) */
547		bp = bioq_first(&sc->bio_queue);
548		if (bp)
549			bioq_remove(&sc->bio_queue, bp);
550		/* XXX: UNLOCK(unique unit numbers) */
551		if (!bp)
552			break;
553
554
555		switch (sc->type) {
556		case MD_MALLOC:
557			devstat_start_transaction(&sc->stats);
558			error = mdstart_malloc(sc, bp);
559			break;
560		case MD_PRELOAD:
561			devstat_start_transaction(&sc->stats);
562			error = mdstart_preload(sc, bp);
563			break;
564		case MD_VNODE:
565			devstat_start_transaction(&sc->stats);
566			error = mdstart_vnode(sc, bp);
567			break;
568		case MD_SWAP:
569			error = mdstart_swap(sc, bp);
570			break;
571		default:
572			panic("Impossible md(type)");
573			break;
574		}
575
576		if (error != -1)
577			biofinish(bp, &sc->stats, error);
578	}
579	sc->busy = 0;
580}
581
582static struct md_s *
583mdfind(int unit)
584{
585	struct md_s *sc;
586
587	/* XXX: LOCK(unique unit numbers) */
588	LIST_FOREACH(sc, &md_softc_list, list) {
589		if (sc->unit == unit)
590			break;
591	}
592	/* XXX: UNLOCK(unique unit numbers) */
593	return (sc);
594}
595
596static struct md_s *
597mdnew(int unit)
598{
599	struct md_s *sc;
600	int max = -1;
601
602	/* XXX: LOCK(unique unit numbers) */
603	LIST_FOREACH(sc, &md_softc_list, list) {
604		if (sc->unit == unit) {
605			/* XXX: UNLOCK(unique unit numbers) */
606			return (NULL);
607		}
608		if (sc->unit > max)
609			max = sc->unit;
610	}
611	if (unit == -1)
612		unit = max + 1;
613	if (unit > DKMAXUNIT)
614		return (NULL);
615	sc = (struct md_s *)malloc(sizeof *sc, M_MD, M_WAITOK | M_ZERO);
616	sc->unit = unit;
617	LIST_INSERT_HEAD(&md_softc_list, sc, list);
618	/* XXX: UNLOCK(unique unit numbers) */
619	return (sc);
620}
621
622static void
623mdinit(struct md_s *sc)
624{
625
626	bioq_init(&sc->bio_queue);
627	devstat_add_entry(&sc->stats, MD_NAME, sc->unit, sc->secsize,
628		DEVSTAT_NO_ORDERED_TAGS,
629		DEVSTAT_TYPE_DIRECT | DEVSTAT_TYPE_IF_OTHER,
630		DEVSTAT_PRIORITY_OTHER);
631	sc->dev = disk_create(sc->unit, &sc->disk, 0, &md_cdevsw, &mddisk_cdevsw);
632	sc->dev->si_drv1 = sc;
633}
634
635/*
636 * XXX: we should check that the range they feed us is mapped.
637 * XXX: we should implement read-only.
638 */
639
640static int
641mdcreate_preload(struct md_ioctl *mdio)
642{
643	struct md_s *sc;
644
645	if (mdio->md_size == 0)
646		return (EINVAL);
647	if (mdio->md_options & ~(MD_AUTOUNIT))
648		return (EINVAL);
649	if (mdio->md_options & MD_AUTOUNIT) {
650		sc = mdnew(-1);
651		if (sc == NULL)
652			return (ENOMEM);
653		mdio->md_unit = sc->unit;
654	} else {
655		sc = mdnew(mdio->md_unit);
656		if (sc == NULL)
657			return (EBUSY);
658	}
659	sc->type = MD_PRELOAD;
660	sc->secsize = DEV_BSIZE;
661	sc->nsect = mdio->md_size;
662	sc->flags = mdio->md_options & MD_FORCE;
663	/* Cast to pointer size, then to pointer to avoid warning */
664	sc->pl_ptr = (u_char *)(uintptr_t)mdio->md_base;
665	sc->pl_len = (mdio->md_size << DEV_BSHIFT);
666	mdinit(sc);
667	return (0);
668}
669
670
671static int
672mdcreate_malloc(struct md_ioctl *mdio)
673{
674	struct md_s *sc;
675	off_t u;
676	uintptr_t sp;
677	int error;
678
679	error = 0;
680	if (mdio->md_size == 0)
681		return (EINVAL);
682	if (mdio->md_options & ~(MD_AUTOUNIT | MD_COMPRESS | MD_RESERVE))
683		return (EINVAL);
684	/* Compression doesn't make sense if we have reserved space */
685	if (mdio->md_options & MD_RESERVE)
686		mdio->md_options &= ~MD_COMPRESS;
687	if (mdio->md_options & MD_AUTOUNIT) {
688		sc = mdnew(-1);
689		if (sc == NULL)
690			return (ENOMEM);
691		mdio->md_unit = sc->unit;
692	} else {
693		sc = mdnew(mdio->md_unit);
694		if (sc == NULL)
695			return (EBUSY);
696	}
697	sc->type = MD_MALLOC;
698	sc->secsize = DEV_BSIZE;
699	sc->nsect = mdio->md_size;
700	sc->flags = mdio->md_options & (MD_COMPRESS | MD_FORCE);
701	sc->indir = dimension(sc->nsect);
702	if (mdio->md_options & MD_RESERVE) {
703		for (u = 0; u < sc->nsect; u++) {
704			sp = (uintptr_t) malloc( sc->secsize,
705			    M_MDSECT, M_NOWAIT | M_ZERO);
706			if (sp != 0)
707				error = s_write(sc->indir, u, sp, NULL);
708			else
709				error = ENOMEM;
710			if (error)
711				break;
712		}
713	}
714	if (!error)  {
715		printf("%s%d: Malloc disk\n", MD_NAME, sc->unit);
716		mdinit(sc);
717	} else
718		mddestroy(sc, NULL);
719	return (error);
720}
721
722
723static int
724mdsetcred(struct md_s *sc, struct ucred *cred)
725{
726	char *tmpbuf;
727	int error = 0;
728
729	/*
730	 * Set credits in our softc
731	 */
732
733	if (sc->cred)
734		crfree(sc->cred);
735	sc->cred = crhold(cred);
736
737	/*
738	 * Horrible kludge to establish credentials for NFS  XXX.
739	 */
740
741	if (sc->vnode) {
742		struct uio auio;
743		struct iovec aiov;
744
745		tmpbuf = malloc(sc->secsize, M_TEMP, M_WAITOK);
746		bzero(&auio, sizeof(auio));
747
748		aiov.iov_base = tmpbuf;
749		aiov.iov_len = sc->secsize;
750		auio.uio_iov = &aiov;
751		auio.uio_iovcnt = 1;
752		auio.uio_offset = 0;
753		auio.uio_rw = UIO_READ;
754		auio.uio_segflg = UIO_SYSSPACE;
755		auio.uio_resid = aiov.iov_len;
756		vn_lock(sc->vnode, LK_EXCLUSIVE | LK_RETRY, curthread);
757		error = VOP_READ(sc->vnode, &auio, 0, sc->cred);
758		VOP_UNLOCK(sc->vnode, 0, curthread);
759		free(tmpbuf, M_TEMP);
760	}
761	return (error);
762}
763
764static int
765mdcreate_vnode(struct md_ioctl *mdio, struct thread *td)
766{
767	struct md_s *sc;
768	struct vattr vattr;
769	struct nameidata nd;
770	int error, flags;
771
772	flags = FREAD|FWRITE;
773	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, mdio->md_file, td);
774	error = vn_open(&nd, &flags, 0);
775	if (error) {
776		if (error != EACCES && error != EPERM && error != EROFS)
777			return (error);
778		flags &= ~FWRITE;
779		NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, mdio->md_file, td);
780		error = vn_open(&nd, &flags, 0);
781		if (error)
782			return (error);
783	}
784	NDFREE(&nd, NDF_ONLY_PNBUF);
785	if (nd.ni_vp->v_type != VREG ||
786	    (error = VOP_GETATTR(nd.ni_vp, &vattr, td->td_ucred, td))) {
787		VOP_UNLOCK(nd.ni_vp, 0, td);
788		(void) vn_close(nd.ni_vp, flags, td->td_ucred, td);
789		return (error ? error : EINVAL);
790	}
791	VOP_UNLOCK(nd.ni_vp, 0, td);
792
793	if (mdio->md_options & MD_AUTOUNIT) {
794		sc = mdnew(-1);
795		mdio->md_unit = sc->unit;
796	} else {
797		sc = mdnew(mdio->md_unit);
798	}
799	if (sc == NULL) {
800		(void) vn_close(nd.ni_vp, flags, td->td_ucred, td);
801		return (EBUSY);
802	}
803
804	sc->type = MD_VNODE;
805	sc->flags = mdio->md_options & MD_FORCE;
806	if (!(flags & FWRITE))
807		sc->flags |= MD_READONLY;
808	sc->secsize = DEV_BSIZE;
809	sc->vnode = nd.ni_vp;
810
811	/*
812	 * If the size is specified, override the file attributes.
813	 */
814	if (mdio->md_size)
815		sc->nsect = mdio->md_size;
816	else
817		sc->nsect = vattr.va_size / sc->secsize; /* XXX: round up ? */
818	if (sc->nsect == 0) {
819		(void) vn_close(nd.ni_vp, flags, td->td_ucred, td);
820		return (EINVAL);
821	}
822	error = mdsetcred(sc, td->td_ucred);
823	if (error) {
824		(void) vn_close(nd.ni_vp, flags, td->td_ucred, td);
825		return (error);
826	}
827	mdinit(sc);
828	return (0);
829}
830
831static int
832mddestroy(struct md_s *sc, struct thread *td)
833{
834
835	GIANT_REQUIRED;
836
837	if (sc->dev != NULL) {
838		devstat_remove_entry(&sc->stats);
839		disk_destroy(sc->dev);
840	}
841	if (sc->vnode != NULL)
842		(void)vn_close(sc->vnode, sc->flags & MD_READONLY ?
843		    FREAD : (FREAD|FWRITE), sc->cred, td);
844	if (sc->cred != NULL)
845		crfree(sc->cred);
846	if (sc->object != NULL) {
847		vm_pager_deallocate(sc->object);
848	}
849#if 0
850	if (sc->secp != NULL) {
851		for (u = 0; u < sc->nsect; u++)
852			if ((uintptr_t)sc->secp[u] > 255)
853				free(sc->secp[u], M_MDSECT);
854		free(sc->secp, M_MD);
855	}
856#endif
857
858	/* XXX: LOCK(unique unit numbers) */
859	LIST_REMOVE(sc, list);
860	/* XXX: UNLOCK(unique unit numbers) */
861	free(sc, M_MD);
862	return (0);
863}
864
865static int
866mdcreate_swap(struct md_ioctl *mdio, struct thread *td)
867{
868	int error;
869	struct md_s *sc;
870
871	GIANT_REQUIRED;
872
873	if (mdio->md_options & MD_AUTOUNIT) {
874		sc = mdnew(-1);
875		mdio->md_unit = sc->unit;
876	} else {
877		sc = mdnew(mdio->md_unit);
878	}
879	if (sc == NULL)
880		return (EBUSY);
881
882	sc->type = MD_SWAP;
883
884	/*
885	 * Range check.  Disallow negative sizes or any size less then the
886	 * size of a page.  Then round to a page.
887	 */
888
889	if (mdio->md_size == 0) {
890		mddestroy(sc, td);
891		return (EDOM);
892	}
893
894	/*
895	 * Allocate an OBJT_SWAP object.
896	 *
897	 * sc_secsize is PAGE_SIZE'd
898	 *
899	 * mdio->size is in DEV_BSIZE'd chunks.
900	 * Note the truncation.
901	 */
902
903	sc->secsize = PAGE_SIZE;
904	sc->nsect = mdio->md_size / (PAGE_SIZE / DEV_BSIZE);
905	sc->object = vm_pager_allocate(OBJT_SWAP, NULL, sc->secsize * (vm_offset_t)sc->nsect, VM_PROT_DEFAULT, 0);
906	sc->flags = mdio->md_options & MD_FORCE;
907	if (mdio->md_options & MD_RESERVE) {
908		if (swap_pager_reserve(sc->object, 0, sc->nsect) < 0) {
909			vm_pager_deallocate(sc->object);
910			sc->object = NULL;
911			mddestroy(sc, td);
912			return (EDOM);
913		}
914	}
915	error = mdsetcred(sc, td->td_ucred);
916	if (error)
917		mddestroy(sc, td);
918	else
919		mdinit(sc);
920	return (error);
921}
922
923static int
924mddetach(int unit, struct thread *td)
925{
926	struct md_s *sc;
927
928	sc = mdfind(unit);
929	if (sc == NULL)
930		return (ENOENT);
931	if (sc->opencount != 0 && !(sc->flags & MD_FORCE))
932		return (EBUSY);
933	switch(sc->type) {
934	case MD_VNODE:
935	case MD_SWAP:
936	case MD_MALLOC:
937	case MD_PRELOAD:
938		return (mddestroy(sc, td));
939	default:
940		return (EOPNOTSUPP);
941	}
942}
943
944static int
945mdctlioctl(dev_t dev, u_long cmd, caddr_t addr, int flags, struct thread *td)
946{
947	struct md_ioctl *mdio;
948	struct md_s *sc;
949
950	if (md_debug)
951		printf("mdctlioctl(%s %lx %p %x %p)\n",
952			devtoname(dev), cmd, addr, flags, td);
953
954	/*
955	 * We assert the version number in the individual ioctl
956	 * handlers instead of out here because (a) it is possible we
957	 * may add another ioctl in the future which doesn't read an
958	 * mdio, and (b) the correct return value for an unknown ioctl
959	 * is ENOIOCTL, not EINVAL.
960	 */
961	mdio = (struct md_ioctl *)addr;
962	switch (cmd) {
963	case MDIOCATTACH:
964		if (mdio->md_version != MDIOVERSION)
965			return (EINVAL);
966		switch (mdio->md_type) {
967		case MD_MALLOC:
968			return (mdcreate_malloc(mdio));
969		case MD_PRELOAD:
970			return (mdcreate_preload(mdio));
971		case MD_VNODE:
972			return (mdcreate_vnode(mdio, td));
973		case MD_SWAP:
974			return (mdcreate_swap(mdio, td));
975		default:
976			return (EINVAL);
977		}
978	case MDIOCDETACH:
979		if (mdio->md_version != MDIOVERSION)
980			return (EINVAL);
981		if (mdio->md_file != NULL || mdio->md_size != 0 ||
982		    mdio->md_options != 0)
983			return (EINVAL);
984		return (mddetach(mdio->md_unit, td));
985	case MDIOCQUERY:
986		if (mdio->md_version != MDIOVERSION)
987			return (EINVAL);
988		sc = mdfind(mdio->md_unit);
989		if (sc == NULL)
990			return (ENOENT);
991		mdio->md_type = sc->type;
992		mdio->md_options = sc->flags;
993		switch (sc->type) {
994		case MD_MALLOC:
995			mdio->md_size = sc->nsect;
996			break;
997		case MD_PRELOAD:
998			mdio->md_size = sc->nsect;
999			(u_char *)(uintptr_t)mdio->md_base = sc->pl_ptr;
1000			break;
1001		case MD_SWAP:
1002			mdio->md_size = sc->nsect * (PAGE_SIZE / DEV_BSIZE);
1003			break;
1004		case MD_VNODE:
1005			mdio->md_size = sc->nsect;
1006			/* XXX fill this in */
1007			mdio->md_file = NULL;
1008			break;
1009		}
1010		return (0);
1011	default:
1012		return (ENOIOCTL);
1013	};
1014	return (ENOIOCTL);
1015}
1016
1017static void
1018md_preloaded(u_char *image, unsigned length)
1019{
1020	struct md_s *sc;
1021
1022	sc = mdnew(-1);
1023	if (sc == NULL)
1024		return;
1025	sc->type = MD_PRELOAD;
1026	sc->secsize = DEV_BSIZE;
1027	sc->nsect = length / DEV_BSIZE;
1028	sc->pl_ptr = image;
1029	sc->pl_len = length;
1030	if (sc->unit == 0)
1031		mdrootready = 1;
1032	mdinit(sc);
1033}
1034
1035static void
1036md_drvinit(void *unused)
1037{
1038
1039	caddr_t mod;
1040	caddr_t c;
1041	u_char *ptr, *name, *type;
1042	unsigned len;
1043
1044#ifdef MD_ROOT_SIZE
1045	md_preloaded(mfs_root, MD_ROOT_SIZE*1024);
1046#endif
1047	mod = NULL;
1048	while ((mod = preload_search_next_name(mod)) != NULL) {
1049		name = (char *)preload_search_info(mod, MODINFO_NAME);
1050		type = (char *)preload_search_info(mod, MODINFO_TYPE);
1051		if (name == NULL)
1052			continue;
1053		if (type == NULL)
1054			continue;
1055		if (strcmp(type, "md_image") && strcmp(type, "mfs_root"))
1056			continue;
1057		c = preload_search_info(mod, MODINFO_ADDR);
1058		ptr = *(u_char **)c;
1059		c = preload_search_info(mod, MODINFO_SIZE);
1060		len = *(unsigned *)c;
1061		printf("%s%d: Preloaded image <%s> %d bytes at %p\n",
1062		    MD_NAME, mdunits, name, len, ptr);
1063		md_preloaded(ptr, len);
1064	}
1065	status_dev = make_dev(&mdctl_cdevsw, 0xffff00ff, UID_ROOT, GID_WHEEL,
1066	    0600, MDCTL_NAME);
1067}
1068
1069static int
1070md_modevent(module_t mod, int type, void *data)
1071{
1072	int error;
1073	struct md_s *sc;
1074
1075	switch (type) {
1076	case MOD_LOAD:
1077		md_drvinit(NULL);
1078		break;
1079	case MOD_UNLOAD:
1080		LIST_FOREACH(sc, &md_softc_list, list) {
1081			error = mddetach(sc->unit, curthread);
1082			if (error != 0)
1083				return (error);
1084		}
1085		if (status_dev)
1086			destroy_dev(status_dev);
1087		status_dev = 0;
1088		break;
1089	default:
1090		break;
1091	}
1092	return (0);
1093}
1094
1095static moduledata_t md_mod = {
1096	MD_NAME,
1097	md_modevent,
1098	NULL
1099};
1100DECLARE_MODULE(md, md_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE+CDEV_MAJOR);
1101MODULE_VERSION(md, MD_MODVER);
1102
1103
#ifdef MD_ROOT
/*
 * If a preloaded image became unit 0, name it as the first root device
 * candidate.
 */
static void
md_takeroot(void *junk)
{
	if (!mdrootready)
		return;
	rootdevnames[0] = "ufs:/dev/md0c";
}

SYSINIT(md_root, SI_SUB_MOUNT_ROOT, SI_ORDER_FIRST, md_takeroot, NULL);
#endif
1114