/*	$NetBSD: vnd.c,v 1.289 2023/05/19 15:42:43 mlelstv Exp $	*/

/*-
 * Copyright (c) 1996, 1997, 1998, 2008, 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1990, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vn.c 1.13 94/04/02$
 *
 *	@(#)vn.c	8.9 (Berkeley) 5/14/95
 */

/*
 * Vnode disk driver.
 *
 * Block/character interface to a vnode.  Allows one to treat a file
 * as a disk (e.g. build a file system in it, mount it, etc.).
 *
 * NOTE 1: If the vnode supports the VOP_BMAP and VOP_STRATEGY operations,
 * this driver uses them to avoid distorting the local buffer cache.  If
 * those block-level operations are not available, it falls back to the
 * regular read and write calls.  Using these may distort the cache in
 * some cases, but it is better to have the driver working than to prevent
 * it from working on file systems where the block-level operations are
 * not implemented for whatever reason.
 *
 * NOTE 2: There is a security issue involved with this driver.
 * Once mounted, all access to the contents of the "mapped" file via
 * the special file is controlled by the permissions on the special
 * file; the protection of the mapped file is ignored (effectively,
 * by using root credentials in all transactions).
 *
 * NOTE 3: Doesn't interact with leases; should it?
 */
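
/*
 * Typical use from userland (illustrative only; see vnconfig(8) for the
 * authoritative invocation and flags):
 *
 *	vnconfig vnd0 /var/tmp/disk.img		# map the file (VNDIOCSET)
 *	newfs /dev/rvnd0a			# build a file system in it
 *	mount /dev/vnd0a /mnt			# use it like any other disk
 *	umount /mnt
 *	vnconfig -u vnd0			# unmap it again (VNDIOCCLR)
 */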

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vnd.c,v 1.289 2023/05/19 15:42:43 mlelstv Exp $");

#if defined(_KERNEL_OPT)
#include "opt_vnd.h"
#include "opt_compat_netbsd.h"
#endif

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/errno.h>
#include <sys/buf.h>
#include <sys/bufq.h>
#include <sys/malloc.h>
#include <sys/ioctl.h>
#include <sys/disklabel.h>
#include <sys/device.h>
#include <sys/disk.h>
#include <sys/stat.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/fstrans.h>
#include <sys/file.h>
#include <sys/uio.h>
#include <sys/conf.h>
#include <sys/kauth.h>
#include <sys/module.h>
#include <sys/compat_stub.h>
#include <sys/atomic.h>

#include <uvm/uvm.h>

#include <net/zlib.h>

#include <miscfs/genfs/genfs.h>
#include <miscfs/specfs/specdev.h>

#include <dev/dkvar.h>
#include <dev/vndvar.h>

#include "ioconf.h"

#if defined(VNDDEBUG) && !defined(DEBUG)
#define DEBUG
#endif

#ifdef DEBUG
int dovndcluster = 1;
#define VDB_FOLLOW	0x01
#define VDB_INIT	0x02
#define VDB_IO		0x04
#define VDB_LABEL	0x08
int vnddebug = 0;
#endif

#define vndunit(x)	DISKUNIT(x)

struct vndxfer {
	struct buf vx_buf;
	struct vnd_softc *vx_vnd;
};
#define	VND_BUFTOXFER(bp)	((struct vndxfer *)(void *)bp)
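/*
 * The cast above is safe only because vx_buf is the first member of
 * struct vndxfer: a pointer to the embedded buf is also a pointer to
 * the enclosing transfer header.
 */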

#define VND_GETXFER(vnd)	pool_get(&(vnd)->sc_vxpool, PR_WAITOK)
#define VND_PUTXFER(vnd, vx)	pool_put(&(vnd)->sc_vxpool, (vx))

#define VNDLABELDEV(dev) \
    (MAKEDISKDEV(major((dev)), vndunit((dev)), RAW_PART))

#define	VND_MAXPENDING(vnd)	((vnd)->sc_maxactive * 4)
#define	VND_MAXPAGES(vnd)	(1024 * 1024 / PAGE_SIZE)
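/*
 * VND_MAXPENDING caps queued requests in the vn_rdwr() path at four
 * times the per-unit concurrency limit; VND_MAXPAGES is the point
 * (1 MiB worth of pages) past which cached pages of the backing
 * vnode are flushed so a vnd does not hog memory.
 */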


static void	vndclear(struct vnd_softc *, int);
static int	vnddoclear(struct vnd_softc *, int, int, bool);
static int	vndsetcred(struct vnd_softc *, kauth_cred_t);
static void	vndthrottle(struct vnd_softc *, struct vnode *);
static void	vndiodone(struct buf *);
#if 0
static void	vndshutdown(void);
#endif

static void	vndgetdefaultlabel(struct vnd_softc *, struct disklabel *);
static void	vndgetdisklabel(dev_t, struct vnd_softc *);

static int	vndlock(struct vnd_softc *);
static void	vndunlock(struct vnd_softc *);
#ifdef VND_COMPRESSION
static void	compstrategy(struct buf *, off_t);
static void	*vnd_alloc(void *, u_int, u_int);
static void	vnd_free(void *, void *);
#endif /* VND_COMPRESSION */

static void	vndthread(void *);
static bool	vnode_has_op(const struct vnode *, int);
static void	handle_with_rdwr(struct vnd_softc *, const struct buf *,
		    struct buf *);
static void	handle_with_strategy(struct vnd_softc *, const struct buf *,
		    struct buf *);
static void	vnd_set_geometry(struct vnd_softc *);

static dev_type_open(vndopen);
static dev_type_close(vndclose);
static dev_type_read(vndread);
static dev_type_write(vndwrite);
static dev_type_ioctl(vndioctl);
static dev_type_strategy(vndstrategy);
static dev_type_dump(vnddump);
static dev_type_size(vndsize);

const struct bdevsw vnd_bdevsw = {
	.d_open = vndopen,
	.d_close = vndclose,
	.d_strategy = vndstrategy,
	.d_ioctl = vndioctl,
	.d_dump = vnddump,
	.d_psize = vndsize,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

const struct cdevsw vnd_cdevsw = {
	.d_open = vndopen,
	.d_close = vndclose,
	.d_read = vndread,
	.d_write = vndwrite,
	.d_ioctl = vndioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_DISK
};

static int	vnd_match(device_t, cfdata_t, void *);
static void	vnd_attach(device_t, device_t, void *);
static int	vnd_detach(device_t, int);

CFATTACH_DECL3_NEW(vnd, sizeof(struct vnd_softc),
    vnd_match, vnd_attach, vnd_detach, NULL, NULL, NULL, DVF_DETACH_SHUTDOWN);

static struct vnd_softc	*vnd_spawn(int);
static int	vnd_destroy(device_t);

static const struct	dkdriver vnddkdriver = {
	.d_strategy = vndstrategy,
	.d_minphys = minphys
};

void
vndattach(int num)
{
	int error;

	error = config_cfattach_attach(vnd_cd.cd_name, &vnd_ca);
	if (error)
		aprint_error("%s: unable to register cfattach, error = %d\n",
		    vnd_cd.cd_name, error);
}

static int
vnd_match(device_t self, cfdata_t cfdata, void *aux)
{

	return 1;
}

static void
vnd_attach(device_t parent, device_t self, void *aux)
{
	struct vnd_softc *sc = device_private(self);

	sc->sc_dev = self;
	sc->sc_comp_offsets = NULL;
	sc->sc_comp_buff = NULL;
	sc->sc_comp_decombuf = NULL;
	bufq_alloc(&sc->sc_tab, "disksort", BUFQ_SORT_RAWBLOCK);
	disk_init(&sc->sc_dkdev, device_xname(self), &vnddkdriver);
	if (!pmf_device_register(self, NULL, NULL))
		aprint_error_dev(self, "couldn't establish power handler\n");
}

static int
vnd_detach(device_t self, int flags)
{
	int error;
	struct vnd_softc *sc = device_private(self);

	if (sc->sc_flags & VNF_INITED) {
		error = vnddoclear(sc, 0, -1, (flags & DETACH_FORCE) != 0);
		if (error != 0)
			return error;
	}

	pmf_device_deregister(self);
	bufq_free(sc->sc_tab);
	disk_destroy(&sc->sc_dkdev);

	return 0;
}

static struct vnd_softc *
vnd_spawn(int unit)
{
	cfdata_t cf;

	cf = malloc(sizeof(*cf), M_DEVBUF, M_WAITOK);
	cf->cf_name = vnd_cd.cd_name;
	cf->cf_atname = vnd_cd.cd_name;
	cf->cf_unit = unit;
	cf->cf_fstate = FSTATE_STAR;

	return device_private(config_attach_pseudo(cf));
}

static int
vnd_destroy(device_t dev)
{
	int error;
	cfdata_t cf;

	cf = device_cfdata(dev);
	error = config_detach(dev, DETACH_QUIET);
	if (error)
		return error;
	free(cf, M_DEVBUF);
	return 0;
}

static int
vndopen(dev_t dev, int flags, int mode, struct lwp *l)
{
	int unit = vndunit(dev);
	struct vnd_softc *sc;
	int error = 0, part, pmask;
	struct disklabel *lp;

#ifdef DEBUG
	if (vnddebug & VDB_FOLLOW)
		printf("vndopen(0x%"PRIx64", 0x%x, 0x%x, %p)\n", dev, flags, mode, l);
#endif
	sc = device_lookup_private(&vnd_cd, unit);
	if (sc == NULL) {
		sc = vnd_spawn(unit);
		if (sc == NULL)
			return ENOMEM;

		/* compatibility, keep disklabel after close */
		sc->sc_flags = VNF_KLABEL;
	}

	if ((error = vndlock(sc)) != 0)
		return error;

	mutex_enter(&sc->sc_dkdev.dk_openlock);

	if ((sc->sc_flags & VNF_CLEARING) != 0) {
		error = ENXIO;
		goto done;
	}

	lp = sc->sc_dkdev.dk_label;

	part = DISKPART(dev);
	pmask = (1 << part);

	if (sc->sc_dkdev.dk_nwedges != 0 && part != RAW_PART) {
		error = EBUSY;
		goto done;
	}

	if (sc->sc_flags & VNF_INITED) {
		if ((sc->sc_dkdev.dk_openmask & ~(1<<RAW_PART)) != 0) {
			/*
			 * If any non-raw partition is open, but the disk
			 * has been invalidated, disallow further opens.
			 */
			if ((sc->sc_flags & VNF_VLABEL) == 0) {
				error = EIO;
				goto done;
			}
		} else {
			/*
			 * Load the partition info if not already loaded.
			 */
			if ((sc->sc_flags & VNF_VLABEL) == 0) {
				sc->sc_flags |= VNF_VLABEL;
				vndgetdisklabel(dev, sc);
			}
		}
	}

	/* Check that the partition exists. */
	if (part != RAW_PART) {
		if (((sc->sc_flags & VNF_INITED) == 0) ||
		    ((part >= lp->d_npartitions) ||
		     (lp->d_partitions[part].p_fstype == FS_UNUSED))) {
			error = ENXIO;
			goto done;
		}
	}

	/* Prevent our unit from being unconfigured while open. */
	switch (mode) {
	case S_IFCHR:
		sc->sc_dkdev.dk_copenmask |= pmask;
		break;

	case S_IFBLK:
		sc->sc_dkdev.dk_bopenmask |= pmask;
		break;
	}
	sc->sc_dkdev.dk_openmask =
	    sc->sc_dkdev.dk_copenmask | sc->sc_dkdev.dk_bopenmask;

 done:
	mutex_exit(&sc->sc_dkdev.dk_openlock);
	vndunlock(sc);
	return error;
}

static int
vndclose(dev_t dev, int flags, int mode, struct lwp *l)
{
	int unit = vndunit(dev);
	struct vnd_softc *sc;
	int error = 0, part;

#ifdef DEBUG
	if (vnddebug & VDB_FOLLOW)
		printf("vndclose(0x%"PRIx64", 0x%x, 0x%x, %p)\n", dev, flags, mode, l);
#endif
	sc = device_lookup_private(&vnd_cd, unit);
	if (sc == NULL)
		return ENXIO;

	if ((error = vndlock(sc)) != 0)
		return error;

	mutex_enter(&sc->sc_dkdev.dk_openlock);

	part = DISKPART(dev);

	/* ...that much closer to allowing unconfiguration... */
	switch (mode) {
	case S_IFCHR:
		sc->sc_dkdev.dk_copenmask &= ~(1 << part);
		break;

	case S_IFBLK:
		sc->sc_dkdev.dk_bopenmask &= ~(1 << part);
		break;
	}
	sc->sc_dkdev.dk_openmask =
	    sc->sc_dkdev.dk_copenmask | sc->sc_dkdev.dk_bopenmask;

	/* are we the last opener? */
	if (sc->sc_dkdev.dk_openmask == 0) {
		if ((sc->sc_flags & VNF_KLABEL) == 0)
			sc->sc_flags &= ~VNF_VLABEL;
	}

	mutex_exit(&sc->sc_dkdev.dk_openlock);

	vndunlock(sc);

	if ((sc->sc_flags & VNF_INITED) == 0) {
		if ((error = vnd_destroy(sc->sc_dev)) != 0) {
			aprint_error_dev(sc->sc_dev,
			    "unable to detach instance\n");
			return error;
		}
	}

	return 0;
}

/*
 * Queue the request, and wake up the kernel thread to handle it.
 */
static void
vndstrategy(struct buf *bp)
{
	int unit = vndunit(bp->b_dev);
	struct vnd_softc *vnd =
	    device_lookup_private(&vnd_cd, unit);
	struct disklabel *lp;
	daddr_t blkno;
	int s = splbio();

	if (vnd == NULL) {
		bp->b_error = ENXIO;
		goto done;
	}
	lp = vnd->sc_dkdev.dk_label;

	if ((vnd->sc_flags & VNF_INITED) == 0) {
		bp->b_error = ENXIO;
		goto done;
	}

	/*
	 * The transfer must be a whole number of blocks.
	 */
	if ((bp->b_bcount % lp->d_secsize) != 0) {
		bp->b_error = EINVAL;
		goto done;
	}

	/*
	 * Check if we're read-only.
	 */
	if ((vnd->sc_flags & VNF_READONLY) && !(bp->b_flags & B_READ)) {
		bp->b_error = EACCES;
		goto done;
	}

	/* If it's a nil transfer, wake up the top half now. */
	if (bp->b_bcount == 0) {
		goto done;
	}

	/*
	 * Do bounds checking and adjust the transfer.  If there's an error,
	 * the bounds check will flag that for us.
	 */
	if (DISKPART(bp->b_dev) == RAW_PART) {
		if (bounds_check_with_mediasize(bp, DEV_BSIZE,
		    vnd->sc_size) <= 0)
			goto done;
	} else {
		if (bounds_check_with_label(&vnd->sc_dkdev,
		    bp, vnd->sc_flags & (VNF_WLABEL|VNF_LABELLING)) <= 0)
			goto done;
	}

	/*
	 * Put the block number in terms of the logical blocksize
	 * of the "device".
	 */

	blkno = bp->b_blkno / (lp->d_secsize / DEV_BSIZE);

	/*
	 * Translate the partition-relative block number to an absolute one.
	 */
	if (DISKPART(bp->b_dev) != RAW_PART) {
		struct partition *pp;

		pp = &vnd->sc_dkdev.dk_label->d_partitions[
		    DISKPART(bp->b_dev)];
		blkno += pp->p_offset;
	}
	bp->b_rawblkno = blkno;

#ifdef DEBUG
	if (vnddebug & VDB_FOLLOW)
		printf("vndstrategy(%p): unit %d\n", bp, unit);
#endif
	if ((vnd->sc_flags & VNF_USE_VN_RDWR)) {
		/*
		 * Limit the number of pending requests so as not to
		 * exhaust resources needed for I/O, but always allow the
		 * worker thread to add requests, as a wedge on vnd queues
		 * requests with biodone() -> dkstart() -> vndstrategy().
		 */
		if (curlwp != vnd->sc_kthread && curlwp != uvm.pagedaemon_lwp) {
			while (vnd->sc_pending >= VND_MAXPENDING(vnd))
				tsleep(&vnd->sc_pending, PRIBIO, "vndpc", 0);
		}
		vnd->sc_pending++;
		KASSERT(vnd->sc_pending > 0);
	}
	bufq_put(vnd->sc_tab, bp);
	wakeup(&vnd->sc_tab);
	splx(s);
	return;

done:
	bp->b_resid = bp->b_bcount;
	biodone(bp);
	splx(s);
}

static bool
vnode_has_strategy(struct vnd_softc *vnd)
{
	return vnode_has_op(vnd->sc_vp, VOFFSET(vop_bmap)) &&
	    vnode_has_op(vnd->sc_vp, VOFFSET(vop_strategy));
}

/*
 * Check whether the backend's smallest supported I/O size evenly
 * divides the vnd sector size.  If it does not, I/O at the vnd
 * sector granularity cannot be issued to the backend, and the
 * caller must fall back to vn_rdwr().
 */
static bool
vnode_has_large_blocks(struct vnd_softc *vnd)
{
	u_int32_t vnd_secsize, iosize;

	iosize = vnd->sc_iosize;
	vnd_secsize = vnd->sc_geom.vng_secsize;

	return vnd_secsize % iosize != 0;
}
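
/*
 * Example: a vnd configured with 512-byte sectors over a backing
 * object whose smallest I/O unit is 2048 bytes (say, a cd9660 image)
 * gives 512 % 2048 == 512, so this returns true and the driver falls
 * back to vn_rdwr().
 */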

/* XXX this function needs a reliable check to detect
 * sparse files.  Otherwise, bmap/strategy may be used
 * and fail on non-allocated blocks.  VOP_READ/VOP_WRITE
 * works on sparse files.
 */
#if notyet
static bool
vnode_strategy_probe(struct vnd_softc *vnd)
{
	int error;
	daddr_t nbn;

	if (!vnode_has_strategy(vnd))
		return false;

	if (vnode_has_large_blocks(vnd))
		return false;

	/* Convert the first logical block number to its
	 * physical block number.
	 */
	error = 0;
	vn_lock(vnd->sc_vp, LK_EXCLUSIVE | LK_RETRY);
	error = VOP_BMAP(vnd->sc_vp, 0, NULL, &nbn, NULL);
	VOP_UNLOCK(vnd->sc_vp);

	/* Test if that worked. */
	if (error == 0 && (long)nbn == -1)
		return false;

	return true;
}
#endif

static void
vndthread(void *arg)
{
	struct vnd_softc *vnd = arg;
	int s;

	/* Determine whether we can *use* VOP_BMAP and VOP_STRATEGY to
	 * directly access the backing vnode.  If we can, use these two
	 * operations to avoid messing with the local buffer cache.
	 * Otherwise fall back to regular VOP_READ/VOP_WRITE operations
	 * which are guaranteed to work with any file system. */
	if ((vnd->sc_flags & VNF_USE_VN_RDWR) == 0 &&
	    ! vnode_has_strategy(vnd))
		vnd->sc_flags |= VNF_USE_VN_RDWR;

	/* VOP_STRATEGY can only be used if the backing vnode allows
	 * access to blocks as small as those defined by the vnd geometry.
	 */
	if ((vnd->sc_flags & VNF_USE_VN_RDWR) == 0 &&
	    vnode_has_large_blocks(vnd))
		vnd->sc_flags |= VNF_USE_VN_RDWR;

#ifdef DEBUG
	if (vnddebug & VDB_INIT)
		printf("vndthread: vp %p, %s\n", vnd->sc_vp,
		    (vnd->sc_flags & VNF_USE_VN_RDWR) == 0 ?
		    "using bmap/strategy operations" :
		    "using read/write operations");
#endif

	s = splbio();
	vnd->sc_flags |= VNF_KTHREAD;
	wakeup(&vnd->sc_kthread);

	/*
	 * Dequeue requests and serve them depending on the available
	 * vnode operations.
	 */
	while ((vnd->sc_flags & VNF_VUNCONF) == 0) {
		struct vndxfer *vnx;
		struct buf *obp;
		struct buf *bp;

		obp = bufq_get(vnd->sc_tab);
		if (obp == NULL) {
			tsleep(&vnd->sc_tab, PRIBIO, "vndbp", 0);
			continue;
		}
		if ((vnd->sc_flags & VNF_USE_VN_RDWR)) {
			KASSERT(vnd->sc_pending > 0);
			if (vnd->sc_pending-- == VND_MAXPENDING(vnd))
				wakeup(&vnd->sc_pending);
		}
		splx(s);
#ifdef DEBUG
		if (vnddebug & VDB_FOLLOW)
			printf("vndthread(%p)\n", obp);
#endif

		if (vnd->sc_vp->v_mount == NULL) {
			obp->b_error = ENXIO;
			goto done;
		}
#ifdef VND_COMPRESSION
		/* handle a compressed read */
		if ((obp->b_flags & B_READ) != 0 && (vnd->sc_flags & VNF_COMP)) {
			off_t bn;

			/* Convert to a byte offset within the file. */
			bn = obp->b_rawblkno *
			    vnd->sc_dkdev.dk_label->d_secsize;

			compstrategy(obp, bn);
			goto done;
		}
#endif /* VND_COMPRESSION */

		/*
		 * Allocate a header for this transfer and link it to the
		 * buffer.
		 */
		s = splbio();
		vnx = VND_GETXFER(vnd);
		splx(s);
		vnx->vx_vnd = vnd;

		s = splbio();
		while (vnd->sc_active >= vnd->sc_maxactive) {
			tsleep(&vnd->sc_tab, PRIBIO, "vndac", 0);
		}
		vnd->sc_active++;
		splx(s);

		/* Instrumentation. */
		disk_busy(&vnd->sc_dkdev);

		bp = &vnx->vx_buf;
		buf_init(bp);
		bp->b_flags = (obp->b_flags & (B_READ | B_PHYS | B_RAW));
		bp->b_oflags = obp->b_oflags;
		bp->b_cflags = obp->b_cflags;
		bp->b_iodone = vndiodone;
		bp->b_private = obp;
		bp->b_vp = vnd->sc_vp;
		bp->b_objlock = bp->b_vp->v_interlock;
		bp->b_data = obp->b_data;
		bp->b_bcount = obp->b_bcount;
		BIO_COPYPRIO(bp, obp);

		/* Make sure the request succeeds while suspending this fs. */
		fstrans_start_lazy(vnd->sc_vp->v_mount);

		/* Handle the request using the appropriate operations. */
		if ((vnd->sc_flags & VNF_USE_VN_RDWR) == 0)
			handle_with_strategy(vnd, obp, bp);
		else
			handle_with_rdwr(vnd, obp, bp);

		fstrans_done(vnd->sc_vp->v_mount);

		s = splbio();
		continue;

done:
		biodone(obp);
		s = splbio();
	}

	vnd->sc_flags &= (~VNF_KTHREAD | VNF_VUNCONF);
	wakeup(&vnd->sc_kthread);
	splx(s);
	kthread_exit(0);
}

/*
 * Checks if the given vnode supports the requested operation.
 * The operation is specified by the offset returned by VOFFSET.
 *
 * XXX The test below used to determine this is quite fragile
 * because it relies on the file system to use genfs to specify
 * unimplemented operations.  There might be another way to do
 * it more cleanly.
 */
static bool
vnode_has_op(const struct vnode *vp, int opoffset)
{
	int (*defaultp)(void *);
	int (*opp)(void *);

	defaultp = vp->v_op[VOFFSET(vop_default)];
	opp = vp->v_op[opoffset];

	return opp != defaultp && opp != genfs_eopnotsupp &&
	    opp != genfs_badop && opp != genfs_nullop;
}

/*
 * Handles the read/write request given in 'bp' using the vnode's VOP_READ
 * and VOP_WRITE operations.
 *
 * 'obp' is a pointer to the original request fed to the vnd device.
 */
static void
handle_with_rdwr(struct vnd_softc *vnd, const struct buf *obp, struct buf *bp)
{
	bool doread;
	off_t offset;
	size_t len, resid;
	struct vnode *vp;
	int npages;

	doread = bp->b_flags & B_READ;
	offset = obp->b_rawblkno * vnd->sc_dkdev.dk_label->d_secsize;
	len = bp->b_bcount;
	vp = vnd->sc_vp;

#if defined(DEBUG)
	if (vnddebug & VDB_IO)
		printf("vnd (rdwr): vp %p, %s, rawblkno 0x%" PRIx64
		    ", secsize %d, offset %" PRIu64
		    ", bcount %d\n",
		    vp, doread ? "read" : "write", obp->b_rawblkno,
		    vnd->sc_dkdev.dk_label->d_secsize, offset,
		    bp->b_bcount);
#endif

	/* Issue the read or write operation. */
	bp->b_error =
	    vn_rdwr(doread ? UIO_READ : UIO_WRITE,
	    vp, bp->b_data, len, offset, UIO_SYSSPACE,
	    IO_ADV_ENCODE(POSIX_FADV_NOREUSE) | IO_DIRECT,
	    vnd->sc_cred, &resid, NULL);
	bp->b_resid = resid;

	/*
	 * Avoid caching too many pages; the vnd user is usually a
	 * file system that does its own caching.  We still need some
	 * amount of caching so as not to hinder read-ahead and
	 * write-behind operations.
	 */
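	/*
	 * With the common 4 KiB PAGE_SIZE, VND_MAXPAGES works out to
	 * 256 cached pages (1 MiB) per unit before we flush.
	 */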
	npages = atomic_load_relaxed(&vp->v_uobj.uo_npages);
	if (npages > VND_MAXPAGES(vnd)) {
		rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
		(void) VOP_PUTPAGES(vp, 0, 0,
		    PGO_ALLPAGES | PGO_CLEANIT | PGO_FREE);
	}

	/* We need to increase the number of outputs on the vnode if
	 * there was any write to it. */
	if (!doread) {
		mutex_enter(vp->v_interlock);
		vp->v_numoutput++;
		mutex_exit(vp->v_interlock);
	}

	biodone(bp);
}

/*
 * Handles the read/write request given in 'bp' using the vnode's VOP_BMAP
 * and VOP_STRATEGY operations.
 *
 * 'obp' is a pointer to the original request fed to the vnd device.
 */
static void
handle_with_strategy(struct vnd_softc *vnd, const struct buf *obp,
    struct buf *bp)
{
	int bsize, error, flags, skipped;
	size_t resid, sz;
	off_t bn, offset;
	struct vnode *vp;
	struct buf *nbp = NULL;

	flags = obp->b_flags;

	/* Convert to a byte offset within the file. */
	bn = obp->b_rawblkno * vnd->sc_dkdev.dk_label->d_secsize;

	bsize = vnd->sc_vp->v_mount->mnt_stat.f_iosize;
	skipped = 0;

	/*
	 * Break the request into bsize pieces and feed them
	 * sequentially using VOP_BMAP/VOP_STRATEGY.
	 * We do it this way to keep from flooding NFS servers if we
	 * are connected to an NFS file.  This places the burden on
	 * the client rather than the server.
	 */
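	/*
	 * Example: a 64 KiB request over a file system with an 8 KiB
	 * f_iosize is issued as up to eight nested bufs; fewer when
	 * VOP_BMAP reports block contiguity through 'nra' below.
	 */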
	error = 0;
	bp->b_resid = bp->b_bcount;
	for (offset = 0, resid = bp->b_resid; /* true */;
	    resid -= sz, offset += sz) {
		daddr_t nbn;
		int off, nra;

		nra = 0;
		vn_lock(vnd->sc_vp, LK_EXCLUSIVE | LK_RETRY);
		error = VOP_BMAP(vnd->sc_vp, bn / bsize, &vp, &nbn, &nra);
		VOP_UNLOCK(vnd->sc_vp);

		if (error == 0 && (long)nbn == -1)
			error = EIO;

		/*
		 * If there was an error or a hole in the file...punt.
		 * Note that we may have to wait for any operations
		 * that we have already fired off before releasing
		 * the buffer.
		 *
		 * XXX we could deal with holes here but it would be
		 * a hassle (in the write case).
		 */
		if (error) {
			skipped += resid;
			break;
		}

#ifdef DEBUG
		if (!dovndcluster)
			nra = 0;
#endif

		off = bn % bsize;
		sz = MIN(((off_t)1 + nra) * bsize - off, resid);
#ifdef	DEBUG
		if (vnddebug & VDB_IO)
			printf("vndstrategy: vp %p/%p bn 0x%qx/0x%" PRIx64
			    " sz 0x%zx\n", vnd->sc_vp, vp, (long long)bn,
			    nbn, sz);
#endif

		nbp = getiobuf(vp, true);
		nestiobuf_setup(bp, nbp, offset, sz);
		nbp->b_blkno = nbn + btodb(off);

#if 0 /* XXX #ifdef DEBUG */
		if (vnddebug & VDB_IO)
			printf("vndstart(%ld): bp %p vp %p blkno "
			    "0x%" PRIx64 " flags %x addr %p cnt 0x%x\n",
			    (long) (vnd-vnd_softc), &nbp->vb_buf,
			    nbp->vb_buf.b_vp, nbp->vb_buf.b_blkno,
			    nbp->vb_buf.b_flags, nbp->vb_buf.b_data,
			    nbp->vb_buf.b_bcount);
#endif
		if (resid == sz) {
			break;
		}
		VOP_STRATEGY(vp, nbp);
		bn += sz;
	}
	if (!(flags & B_READ)) {
		struct vnode *w_vp;
		/*
		 * This is the last nested buf, so account for the parent
		 * buf's write too.  This has to be done last, so that
		 * fsync won't wait for this write, which has no chance
		 * to complete before all nested bufs have been queued.
		 * But it has to be done before the last VOP_STRATEGY()
		 * or the call to nestiobuf_done().
		 */
		w_vp = bp->b_vp;
		mutex_enter(w_vp->v_interlock);
		w_vp->v_numoutput++;
		mutex_exit(w_vp->v_interlock);
	}
	KASSERT(skipped != 0 || nbp != NULL);
	if (skipped)
		nestiobuf_done(bp, skipped, error);
	else
		VOP_STRATEGY(vp, nbp);
}

static void
vndiodone(struct buf *bp)
{
	struct vndxfer *vnx = VND_BUFTOXFER(bp);
	struct vnd_softc *vnd = vnx->vx_vnd;
	struct buf *obp = bp->b_private;
	int s = splbio();

	KERNEL_LOCK(1, NULL);		/* XXXSMP */
	KASSERT(&vnx->vx_buf == bp);
	KASSERT(vnd->sc_active > 0);
#ifdef DEBUG
	if (vnddebug & VDB_IO) {
		printf("vndiodone1: bp %p iodone: error %d\n",
		    bp, bp->b_error);
	}
#endif
	disk_unbusy(&vnd->sc_dkdev, bp->b_bcount - bp->b_resid,
	    (bp->b_flags & B_READ));
	vnd->sc_active--;
	if (vnd->sc_active == 0) {
		wakeup(&vnd->sc_tab);
	}
	KERNEL_UNLOCK_ONE(NULL);	/* XXXSMP */
	splx(s);
	obp->b_error = bp->b_error;
	obp->b_resid = bp->b_resid;
	buf_destroy(bp);
	VND_PUTXFER(vnd, vnx);
	biodone(obp);
}

/* ARGSUSED */
static int
vndread(dev_t dev, struct uio *uio, int flags)
{
	int unit = vndunit(dev);
	struct vnd_softc *sc;

#ifdef DEBUG
	if (vnddebug & VDB_FOLLOW)
		printf("vndread(0x%"PRIx64", %p)\n", dev, uio);
#endif

	sc = device_lookup_private(&vnd_cd, unit);
	if (sc == NULL)
		return ENXIO;

	if ((sc->sc_flags & VNF_INITED) == 0)
		return ENXIO;

	return physio(vndstrategy, NULL, dev, B_READ, minphys, uio);
}

/* ARGSUSED */
static int
vndwrite(dev_t dev, struct uio *uio, int flags)
{
	int unit = vndunit(dev);
	struct vnd_softc *sc;

#ifdef DEBUG
	if (vnddebug & VDB_FOLLOW)
		printf("vndwrite(0x%"PRIx64", %p)\n", dev, uio);
#endif

	sc = device_lookup_private(&vnd_cd, unit);
	if (sc == NULL)
		return ENXIO;

	if ((sc->sc_flags & VNF_INITED) == 0)
		return ENXIO;

	return physio(vndstrategy, NULL, dev, B_WRITE, minphys, uio);
}

static int
vnd_cget(struct lwp *l, int unit, int *un, struct vattr *va)
{
	int error;
	struct vnd_softc *vnd;

	if (*un == -1)
		*un = unit;
	if (*un < 0)
		return EINVAL;

	vnd = device_lookup_private(&vnd_cd, *un);
	if (vnd == NULL)
		return -1;

	if ((vnd->sc_flags & VNF_INITED) == 0)
		return -1;

	vn_lock(vnd->sc_vp, LK_SHARED | LK_RETRY);
	error = VOP_GETATTR(vnd->sc_vp, va, l->l_cred);
	VOP_UNLOCK(vnd->sc_vp);
	return error;
}

static int
vnddoclear(struct vnd_softc *vnd, int pmask, int minor, bool force)
{
	int error;

	if ((error = vndlock(vnd)) != 0)
		return error;

	/*
	 * Don't unconfigure if any other partitions are open
	 * or if both the character and block flavors of this
	 * partition are open.
	 */
	if (DK_BUSY(vnd, pmask) && !force) {
		vndunlock(vnd);
		return EBUSY;
	}

	/* Delete all of our wedges */
	dkwedge_delall(&vnd->sc_dkdev);

	/*
	 * XXX vndclear() might call vndclose() implicitly;
	 * release lock to avoid recursion
	 *
	 * Set VNF_CLEARING to prevent vndopen() from
	 * sneaking in after we vndunlock().
	 */
	vnd->sc_flags |= VNF_CLEARING;
	vndunlock(vnd);
	vndclear(vnd, minor);
#ifdef DEBUG
	if (vnddebug & VDB_INIT)
		printf("%s: CLRed\n", __func__);
#endif

	/* Destroy the xfer and buffer pools. */
	pool_destroy(&vnd->sc_vxpool);

	/* Detach the disk. */
	disk_detach(&vnd->sc_dkdev);

	return 0;
}

static int
vndioctl_get(struct lwp *l, void *data, int unit, struct vattr *va)
{
	int error;

	KASSERT(l);

	/* The first member is always 'int vnd_unit' in all versions. */
	if (*(int *)data >= vnd_cd.cd_ndevs)
		return ENXIO;

	switch (error = vnd_cget(l, unit, (int *)data, va)) {
	case -1:
		/* unused is not an error */
		memset(va, 0, sizeof(*va));
		/*FALLTHROUGH*/
	case 0:
		return 0;
	default:
		return error;
	}
}

/* ARGSUSED */
static int
vndioctl(dev_t dev, u_long cmd, void *data, int flag, struct lwp *l)
{
	bool force;
	int unit = vndunit(dev);
	struct vnd_softc *vnd;
	struct vnd_ioctl *vio;
	struct vattr vattr;
	struct pathbuf *pb;
	struct vnode *vp;
	int error, part, pmask;
	uint64_t geomsize;
	int fflags;
#ifdef __HAVE_OLD_DISKLABEL
	struct disklabel newlabel;
#endif

#ifdef DEBUG
	if (vnddebug & VDB_FOLLOW)
		printf("vndioctl(0x%"PRIx64", 0x%lx, %p, 0x%x, %p): unit %d\n",
		    dev, cmd, data, flag, l->l_proc, unit);
#endif
	/* Do the gets first; they don't need initialization or verification */
	switch (cmd) {
	case VNDIOCGET:
		if ((error = vndioctl_get(l, data, unit, &vattr)) != 0)
			return error;

		struct vnd_user *vnu = data;
		vnu->vnu_dev = vattr.va_fsid;
		vnu->vnu_ino = vattr.va_fileid;
		return 0;

	default:
		/* First check for COMPAT_50 hook */
		MODULE_HOOK_CALL(compat_vndioctl_50_hook,
		    (cmd, l, data, unit, &vattr, vndioctl_get),
		    enosys(), error);

		/*
		 * If not present, then the COMPAT_30 hook is also not
		 * present, so just continue with checks for the
		 * "write" commands.
		 */
		if (error == ENOSYS) {
			error = 0;
			break;
		}

		/* If not already handled, try the COMPAT_30 hook */
		if (error == EPASSTHROUGH)
			MODULE_HOOK_CALL(compat_vndioctl_30_hook,
			    (cmd, l, data, unit, &vattr, vndioctl_get),
			    enosys(), error);

		/* If no COMPAT_30 module, or not handled, check writes */
		if (error == ENOSYS || error == EPASSTHROUGH) {
			error = 0;
			break;
		}
		return error;
	}

	vnd = device_lookup_private(&vnd_cd, unit);
	if (vnd == NULL)
		return ENXIO;
	vio = (struct vnd_ioctl *)data;

	/* Must be open for writes for these commands... */
	switch (cmd) {
	case VNDIOCSET50:
	case VNDIOCCLR50:
		if (!compat_vndioctl_50_hook.hooked)
			return EINVAL;
		/* FALLTHROUGH */
	case VNDIOCSET:
	case VNDIOCCLR:
	case DIOCSDINFO:
	case DIOCWDINFO:
#ifdef __HAVE_OLD_DISKLABEL
	case ODIOCSDINFO:
	case ODIOCWDINFO:
#endif
	case DIOCKLABEL:
	case DIOCWLABEL:
	case DIOCCACHESYNC:
		if ((flag & FWRITE) == 0)
			return EBADF;
	}

	switch (cmd) {
	case VNDIOCSET50:
	case VNDIOCSET:
		/* Must not be initialized */
		if (vnd->sc_flags & VNF_INITED)
			return EBUSY;
		break;
	default:
		/* Must be initialized */
		if ((vnd->sc_flags & VNF_INITED) == 0)
			return ENXIO;
		break;
	}

	error = disk_ioctl(&vnd->sc_dkdev, dev, cmd, data, flag, l);
	if (error != EPASSTHROUGH)
		return error;

	switch (cmd) {
	case VNDIOCSET50:
	case VNDIOCSET:
		if ((error = vndlock(vnd)) != 0)
			return error;

		fflags = FREAD;
		if ((vio->vnd_flags & VNDIOF_READONLY) == 0)
			fflags |= FWRITE;
		if ((vio->vnd_flags & VNDIOF_FILEIO) != 0)
			vnd->sc_flags |= VNF_USE_VN_RDWR;
		error = pathbuf_copyin(vio->vnd_file, &pb);
		if (error) {
			goto unlock_and_exit;
		}
		error = vn_open(NULL, pb, 0, fflags, 0, &vp, NULL, NULL);
		if (error != 0) {
			pathbuf_destroy(pb);
			goto unlock_and_exit;
		}
		KASSERT(l);
		error = VOP_GETATTR(vp, &vattr, l->l_cred);
		if (!error && vp->v_type != VREG)
			error = EOPNOTSUPP;
		if (!error && vattr.va_bytes < vattr.va_size)
			/* File is definitely sparse, use vn_rdwr() */
			vnd->sc_flags |= VNF_USE_VN_RDWR;
		if (error) {
			VOP_UNLOCK(vp);
			goto close_and_exit;
		}

		/*
		 * If using a compressed file, initialize its info
		 * (or abort with an error if the kernel lacks compression).
		 */
		if (vio->vnd_flags & VNDIOF_COMP) {
#ifdef VND_COMPRESSION
			struct vnd_comp_header *ch;
			int i;
			uint32_t comp_size;
			uint32_t comp_maxsize;

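			/*
			 * On-disk layout consumed here ("cloop"-style
			 * compressed image): a vnd_comp_header giving the
			 * uncompressed block size and block count, then
			 * (num_blocks + 1) big-endian 64-bit file offsets,
			 * then the zlib-compressed blocks themselves.  The
			 * final offset is the total file byte size.
			 */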
			/* allocate space for compressed file header */
			ch = malloc(sizeof(struct vnd_comp_header),
			    M_TEMP, M_WAITOK);

			/* read compressed file header */
			error = vn_rdwr(UIO_READ, vp, (void *)ch,
			    sizeof(struct vnd_comp_header), 0, UIO_SYSSPACE,
			    IO_UNIT|IO_NODELOCKED, l->l_cred, NULL, NULL);
			if (error) {
				free(ch, M_TEMP);
				VOP_UNLOCK(vp);
				goto close_and_exit;
			}

			if (be32toh(ch->block_size) == 0 ||
			    be32toh(ch->num_blocks) > UINT32_MAX - 1) {
				free(ch, M_TEMP);
				VOP_UNLOCK(vp);
				error = EINVAL;
				goto close_and_exit;
			}

			/* save some header info */
			vnd->sc_comp_blksz = be32toh(ch->block_size);
			/* note last offset is the file byte size */
			vnd->sc_comp_numoffs = be32toh(ch->num_blocks) + 1;
			free(ch, M_TEMP);
			if (!DK_DEV_BSIZE_OK(vnd->sc_comp_blksz)) {
				VOP_UNLOCK(vp);
				error = EINVAL;
				goto close_and_exit;
			}
			KASSERT(0 < vnd->sc_comp_blksz);
			KASSERT(0 < vnd->sc_comp_numoffs);
			/*
			 * @#^@!$& gcc -Wtype-limits refuses to let me
			 * write SIZE_MAX/sizeof(uint64_t) < numoffs,
			 * because the range of the type on amd64 makes
			 * the comparisons always false.
			 */
#if SIZE_MAX <= UINT32_MAX*(64/CHAR_BIT)
			if (SIZE_MAX/sizeof(uint64_t) < vnd->sc_comp_numoffs) {
				VOP_UNLOCK(vp);
				error = EINVAL;
				goto close_and_exit;
			}
#endif
			if ((vattr.va_size < sizeof(struct vnd_comp_header)) ||
			    (vattr.va_size - sizeof(struct vnd_comp_header) <
				sizeof(uint64_t)*vnd->sc_comp_numoffs) ||
			    (UQUAD_MAX/vnd->sc_comp_blksz <
				vnd->sc_comp_numoffs - 1)) {
				VOP_UNLOCK(vp);
				error = EINVAL;
				goto close_and_exit;
			}

			/* set decompressed file size */
			KASSERT(vnd->sc_comp_numoffs - 1 <=
			    UQUAD_MAX/vnd->sc_comp_blksz);
			vattr.va_size =
			    ((u_quad_t)vnd->sc_comp_numoffs - 1) *
			     (u_quad_t)vnd->sc_comp_blksz;

			/* allocate space for all the compressed offsets */
			__CTASSERT(UINT32_MAX <= UQUAD_MAX/sizeof(uint64_t));
			vnd->sc_comp_offsets =
			    malloc(sizeof(uint64_t) * vnd->sc_comp_numoffs,
				M_DEVBUF, M_WAITOK);

			/* read in the offsets */
			error = vn_rdwr(UIO_READ, vp,
			    (void *)vnd->sc_comp_offsets,
			    sizeof(uint64_t) * vnd->sc_comp_numoffs,
			    sizeof(struct vnd_comp_header), UIO_SYSSPACE,
			    IO_UNIT|IO_NODELOCKED, l->l_cred, NULL, NULL);
			if (error) {
				VOP_UNLOCK(vp);
				goto close_and_exit;
			}
			/*
			 * Find the largest block size (used for the
			 * allocation limit).  Also convert the offsets
			 * to native byte order.
			 */
			comp_maxsize = 0;
			for (i = 0; i < vnd->sc_comp_numoffs - 1; i++) {
				vnd->sc_comp_offsets[i] =
				    be64toh(vnd->sc_comp_offsets[i]);
				comp_size =
				    be64toh(vnd->sc_comp_offsets[i + 1])
				    - vnd->sc_comp_offsets[i];
				if (comp_size > comp_maxsize)
					comp_maxsize = comp_size;
			}
			vnd->sc_comp_offsets[vnd->sc_comp_numoffs - 1] =
			    be64toh(vnd->sc_comp_offsets[vnd->sc_comp_numoffs
				    - 1]);

			/* create compressed data buffer */
			vnd->sc_comp_buff = malloc(comp_maxsize,
			    M_DEVBUF, M_WAITOK);

			/* create decompressed buffer */
			vnd->sc_comp_decombuf = malloc(vnd->sc_comp_blksz,
			    M_DEVBUF, M_WAITOK);
			vnd->sc_comp_buffblk = -1;

			/* Initialize decompress stream */
			memset(&vnd->sc_comp_stream, 0, sizeof(z_stream));
			vnd->sc_comp_stream.zalloc = vnd_alloc;
			vnd->sc_comp_stream.zfree = vnd_free;
			error = inflateInit2(&vnd->sc_comp_stream, MAX_WBITS);
			if (error) {
				if (vnd->sc_comp_stream.msg)
					printf("vnd%d: compressed file, %s\n",
					    unit, vnd->sc_comp_stream.msg);
				VOP_UNLOCK(vp);
				error = EINVAL;
				goto close_and_exit;
			}

			vnd->sc_flags |= VNF_COMP | VNF_READONLY;
#else /* !VND_COMPRESSION */
			VOP_UNLOCK(vp);
			error = EOPNOTSUPP;
			goto close_and_exit;
#endif /* VND_COMPRESSION */
		}

		VOP_UNLOCK(vp);
		vnd->sc_vp = vp;
		vnd->sc_size = btodb(vattr.va_size);	/* note truncation */

		/*
		 * Get the smallest I/O size for the underlying device;
		 * fall back to the fundamental I/O size of the
		 * underlying file system.
		 */
		error = bdev_ioctl(vattr.va_fsid, DIOCGSECTORSIZE,
		    &vnd->sc_iosize, FKIOCTL, l);
		if (error)
			vnd->sc_iosize = vnd->sc_vp->v_mount->mnt_stat.f_frsize;

		/* Default the I/O size to DEV_BSIZE */
		if (vnd->sc_iosize == 0)
			vnd->sc_iosize = DEV_BSIZE;

		/*
		 * Use the pseudo-geometry specified.  If none was provided,
		 * use the "standard" Adaptec fictitious geometry.
		 */
		if (vio->vnd_flags & VNDIOF_HASGEOM) {
			memcpy(&vnd->sc_geom, &vio->vnd_geom,
			    sizeof(vio->vnd_geom));

			/*
			 * Sanity-check the sector size.
			 */
			if (!DK_DEV_BSIZE_OK(vnd->sc_geom.vng_secsize) ||
			    vnd->sc_geom.vng_ntracks == 0 ||
			    vnd->sc_geom.vng_nsectors == 0) {
				error = EINVAL;
				goto close_and_exit;
			}

			/*
			 * Compute the missing cylinder count from the size.
			 */
			if (vnd->sc_geom.vng_ncylinders == 0)
				vnd->sc_geom.vng_ncylinders = vnd->sc_size / (
					(vnd->sc_geom.vng_secsize / DEV_BSIZE) *
					vnd->sc_geom.vng_ntracks *
					vnd->sc_geom.vng_nsectors);

			/*
			 * Compute the size (in DEV_BSIZE blocks) specified
			 * by the geometry.
			 */
			geomsize = (int64_t)vnd->sc_geom.vng_nsectors *
			    vnd->sc_geom.vng_ntracks *
			    vnd->sc_geom.vng_ncylinders *
			    (vnd->sc_geom.vng_secsize / DEV_BSIZE);

			/*
			 * Sanity-check the size against the specified
			 * geometry.
			 */
			if (vnd->sc_size < geomsize) {
				error = EINVAL;
				goto close_and_exit;
			}
		} else if (vnd->sc_size >= (32 * 64)) {
			/*
			 * Size must be at least 2048 DEV_BSIZE blocks
			 * (1M) in order to use this geometry.
			 */
			vnd->sc_geom.vng_secsize = DEV_BSIZE;
			vnd->sc_geom.vng_nsectors = 32;
			vnd->sc_geom.vng_ntracks = 64;
			vnd->sc_geom.vng_ncylinders = vnd->sc_size / (64 * 32);
		} else {
			vnd->sc_geom.vng_secsize = DEV_BSIZE;
			vnd->sc_geom.vng_nsectors = 1;
			vnd->sc_geom.vng_ntracks = 1;
			vnd->sc_geom.vng_ncylinders = vnd->sc_size;
		}
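		/*
		 * Example: a 1 GiB backing file is 2097152 DEV_BSIZE
		 * blocks, so the default geometry above yields
		 * 2097152 / (64 * 32) = 1024 cylinders of 64 tracks
		 * with 32 sectors each.
		 */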

		vnd_set_geometry(vnd);

		if (vio->vnd_flags & VNDIOF_READONLY) {
			vnd->sc_flags |= VNF_READONLY;
		}

		if ((error = vndsetcred(vnd, l->l_cred)) != 0)
			goto close_and_exit;

		vndthrottle(vnd, vnd->sc_vp);
		vio->vnd_osize = dbtob(vnd->sc_size);
		if (cmd != VNDIOCSET50)
			vio->vnd_size = dbtob(vnd->sc_size);
		vnd->sc_flags |= VNF_INITED;

		/* create the kernel thread, wait for it to be up */
		error = kthread_create(PRI_NONE, 0, NULL, vndthread, vnd,
		    &vnd->sc_kthread, "%s", device_xname(vnd->sc_dev));
		if (error)
			goto close_and_exit;
		while ((vnd->sc_flags & VNF_KTHREAD) == 0) {
			tsleep(&vnd->sc_kthread, PRIBIO, "vndthr", 0);
		}
#ifdef DEBUG
		if (vnddebug & VDB_INIT)
			printf("vndioctl: SET vp %p size 0x%lx %d/%d/%d/%d\n",
			    vnd->sc_vp, (unsigned long) vnd->sc_size,
			    vnd->sc_geom.vng_secsize,
			    vnd->sc_geom.vng_nsectors,
			    vnd->sc_geom.vng_ntracks,
			    vnd->sc_geom.vng_ncylinders);
#endif

		/* Attach the disk. */
		disk_attach(&vnd->sc_dkdev);

		/* Initialize the xfer and buffer pools. */
		pool_init(&vnd->sc_vxpool, sizeof(struct vndxfer), 0,
		    0, 0, "vndxpl", NULL, IPL_BIO);

		vndunlock(vnd);

		pathbuf_destroy(pb);

		/* Discover wedges on this disk */
		dkwedge_discover(&vnd->sc_dkdev);

		break;

close_and_exit:
		(void) vn_close(vp, fflags, l->l_cred);
		pathbuf_destroy(pb);
unlock_and_exit:
#ifdef VND_COMPRESSION
		/* free any allocated memory (for compressed file) */
		if (vnd->sc_comp_offsets) {
			free(vnd->sc_comp_offsets, M_DEVBUF);
			vnd->sc_comp_offsets = NULL;
		}
		if (vnd->sc_comp_buff) {
			free(vnd->sc_comp_buff, M_DEVBUF);
			vnd->sc_comp_buff = NULL;
		}
		if (vnd->sc_comp_decombuf) {
			free(vnd->sc_comp_decombuf, M_DEVBUF);
			vnd->sc_comp_decombuf = NULL;
		}
#endif /* VND_COMPRESSION */
		vndunlock(vnd);
		return error;

	case VNDIOCCLR50:
	case VNDIOCCLR:
		part = DISKPART(dev);
		pmask = (1 << part);
		force = (vio->vnd_flags & VNDIOF_FORCE) != 0;

		if ((error = vnddoclear(vnd, pmask, minor(dev), force)) != 0)
			return error;

		break;

	case DIOCWDINFO:
	case DIOCSDINFO:
#ifdef __HAVE_OLD_DISKLABEL
	case ODIOCWDINFO:
	case ODIOCSDINFO:
#endif
	{
		struct disklabel *lp;

		if ((error = vndlock(vnd)) != 0)
			return error;

		vnd->sc_flags |= VNF_LABELLING;

#ifdef __HAVE_OLD_DISKLABEL
		if (cmd == ODIOCSDINFO || cmd == ODIOCWDINFO) {
			memset(&newlabel, 0, sizeof newlabel);
			memcpy(&newlabel, data, sizeof (struct olddisklabel));
			lp = &newlabel;
		} else
#endif
		lp = (struct disklabel *)data;

		error = setdisklabel(vnd->sc_dkdev.dk_label,
		    lp, 0, vnd->sc_dkdev.dk_cpulabel);
		if (error == 0) {
			if (cmd == DIOCWDINFO
#ifdef __HAVE_OLD_DISKLABEL
			    || cmd == ODIOCWDINFO
#endif
			   )
				error = writedisklabel(VNDLABELDEV(dev),
				    vndstrategy, vnd->sc_dkdev.dk_label,
				    vnd->sc_dkdev.dk_cpulabel);
		}

		vnd->sc_flags &= ~VNF_LABELLING;

		vndunlock(vnd);

		if (error)
			return error;
		break;
	}

	case DIOCKLABEL:
		if (*(int *)data != 0)
			vnd->sc_flags |= VNF_KLABEL;
		else
			vnd->sc_flags &= ~VNF_KLABEL;
		break;

	case DIOCWLABEL:
		if (*(int *)data != 0)
			vnd->sc_flags |= VNF_WLABEL;
		else
			vnd->sc_flags &= ~VNF_WLABEL;
		break;

	case DIOCGDEFLABEL:
		vndgetdefaultlabel(vnd, (struct disklabel *)data);
		break;

#ifdef __HAVE_OLD_DISKLABEL
	case ODIOCGDEFLABEL:
		vndgetdefaultlabel(vnd, &newlabel);
		if (newlabel.d_npartitions > OLDMAXPARTITIONS)
			return ENOTTY;
		memcpy(data, &newlabel, sizeof (struct olddisklabel));
		break;
#endif

	case DIOCGSTRATEGY:
	    {
		struct disk_strategy *dks = (void *)data;

		/* No lock needed, never changed */
		strlcpy(dks->dks_name,
		    bufq_getstrategyname(vnd->sc_tab),
		    sizeof(dks->dks_name));
		dks->dks_paramlen = 0;
		break;
	    }
	case DIOCGCACHE:
	    {
		int *bits = (int *)data;
		*bits |= DKCACHE_READ | DKCACHE_WRITE;
		break;
	    }
	case DIOCCACHESYNC:
		vn_lock(vnd->sc_vp, LK_EXCLUSIVE | LK_RETRY);
		error = VOP_FSYNC(vnd->sc_vp, vnd->sc_cred,
		    FSYNC_WAIT | FSYNC_DATAONLY | FSYNC_CACHE, 0, 0);
		VOP_UNLOCK(vnd->sc_vp);
		return error;

	default:
		return ENOTTY;
	}

	return 0;
}

/*
 * Duplicate the current process's credentials.  Since we are called only
 * as the result of a SET ioctl and only root can do that, any future access
 * to this "disk" is essentially as root.  Note that credentials may change
 * if some other uid can write directly to the mapped file (NFS).
 */
static int
vndsetcred(struct vnd_softc *vnd, kauth_cred_t cred)
{
	struct uio auio;
	struct iovec aiov;
	char *tmpbuf;
	int error;

	vnd->sc_cred = kauth_cred_dup(cred);
	tmpbuf = malloc(DEV_BSIZE, M_TEMP, M_WAITOK);

	/* XXX: Horrible kludge to establish credentials for NFS */
	aiov.iov_base = tmpbuf;
	aiov.iov_len = uimin(DEV_BSIZE, dbtob(vnd->sc_size));
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = 0;
	auio.uio_rw = UIO_READ;
	auio.uio_resid = aiov.iov_len;
	UIO_SETUP_SYSSPACE(&auio);
	vn_lock(vnd->sc_vp, LK_EXCLUSIVE | LK_RETRY);
	error = VOP_READ(vnd->sc_vp, &auio, 0, vnd->sc_cred);
	if (error == 0) {
		/*
		 * Because vnd does all I/O directly through the vnode,
		 * we need to flush (at least) the buffer from the above
		 * VOP_READ from the buffer cache to prevent cache
		 * incoherencies.  Also, be careful to write dirty
		 * buffers back to stable storage.
		 */
		error = vinvalbuf(vnd->sc_vp, V_SAVE, vnd->sc_cred,
			    curlwp, 0, 0);
	}
	VOP_UNLOCK(vnd->sc_vp);

	free(tmpbuf, M_TEMP);
	return error;
}

/*
 * Set maxactive based on the backing file system type.
 */
static void
vndthrottle(struct vnd_softc *vnd, struct vnode *vp)
{

	if (vp->v_tag == VT_NFS)
		vnd->sc_maxactive = 2;
	else
		vnd->sc_maxactive = 8;

	if (vnd->sc_maxactive < 1)
		vnd->sc_maxactive = 1;
}

#if 0
static void
vndshutdown(void)
{
	struct vnd_softc *vnd;

	for (vnd = &vnd_softc[0]; vnd < &vnd_softc[numvnd]; vnd++)
		if (vnd->sc_flags & VNF_INITED)
			vndclear(vnd);
}
#endif

static void
vndclear(struct vnd_softc *vnd, int myminor)
{
	struct vnode *vp = vnd->sc_vp;
	int fflags = FREAD;
	int bmaj, cmaj, i, mn;
	int s;

#ifdef DEBUG
	if (vnddebug & VDB_FOLLOW)
		printf("vndclear(%p): vp %p\n", vnd, vp);
#endif
	/* locate the major number */
	bmaj = bdevsw_lookup_major(&vnd_bdevsw);
	cmaj = cdevsw_lookup_major(&vnd_cdevsw);

	/* Nuke the vnodes for any open instances */
	for (i = 0; i < MAXPARTITIONS; i++) {
		mn = DISKMINOR(device_unit(vnd->sc_dev), i);
		if (mn != myminor) { /* XXX avoid killing our own vnode */
			vdevgone(bmaj, mn, mn, VBLK);
			vdevgone(cmaj, mn, mn, VCHR);
		}
	}

	if ((vnd->sc_flags & VNF_READONLY) == 0)
		fflags |= FWRITE;

	s = splbio();
	bufq_drain(vnd->sc_tab);
	splx(s);

	vnd->sc_flags |= VNF_VUNCONF;
	wakeup(&vnd->sc_tab);
	while (vnd->sc_flags & VNF_KTHREAD)
		tsleep(&vnd->sc_kthread, PRIBIO, "vnthr", 0);

#ifdef VND_COMPRESSION
	/* free the compressed file buffers */
	if (vnd->sc_flags & VNF_COMP) {
		if (vnd->sc_comp_offsets) {
			free(vnd->sc_comp_offsets, M_DEVBUF);
			vnd->sc_comp_offsets = NULL;
		}
		if (vnd->sc_comp_buff) {
			free(vnd->sc_comp_buff, M_DEVBUF);
			vnd->sc_comp_buff = NULL;
		}
		if (vnd->sc_comp_decombuf) {
			free(vnd->sc_comp_decombuf, M_DEVBUF);
			vnd->sc_comp_decombuf = NULL;
		}
	}
#endif /* VND_COMPRESSION */
	vnd->sc_flags &=
	    ~(VNF_INITED | VNF_READONLY | VNF_KLABEL | VNF_VLABEL
	      | VNF_VUNCONF | VNF_COMP | VNF_CLEARING);
	if (vp == NULL)
		panic("vndclear: null vp");
	(void) vn_close(vp, fflags, vnd->sc_cred);
	kauth_cred_free(vnd->sc_cred);
	vnd->sc_vp = NULL;
	vnd->sc_cred = NULL;
	vnd->sc_size = 0;
}

static int
vndsize(dev_t dev)
{
	struct vnd_softc *sc;
	struct disklabel *lp;
	int part, unit, omask;
	int size;

	unit = vndunit(dev);
	sc = device_lookup_private(&vnd_cd, unit);
	if (sc == NULL)
		return -1;

	if ((sc->sc_flags & VNF_INITED) == 0)
		return -1;

	part = DISKPART(dev);
	omask = sc->sc_dkdev.dk_openmask & (1 << part);
	lp = sc->sc_dkdev.dk_label;

	if (omask == 0 && vndopen(dev, 0, S_IFBLK, curlwp))	/* XXX */
		return -1;

	if (lp->d_partitions[part].p_fstype != FS_SWAP)
		size = -1;
	else
		size = lp->d_partitions[part].p_size *
		    (lp->d_secsize / DEV_BSIZE);

	if (omask == 0 && vndclose(dev, 0, S_IFBLK, curlwp))	/* XXX */
		return -1;

	return size;
}

static int
vnddump(dev_t dev, daddr_t blkno, void *va,
    size_t size)
{

	/* Not implemented. */
	return ENXIO;
}

static void
vndgetdefaultlabel(struct vnd_softc *sc, struct disklabel *lp)
{
	struct vndgeom *vng = &sc->sc_geom;
	struct partition *pp;
	unsigned spb;

	memset(lp, 0, sizeof(*lp));

	spb = vng->vng_secsize / DEV_BSIZE;
	if (sc->sc_size / spb > UINT32_MAX)
		lp->d_secperunit = UINT32_MAX;
	else
		lp->d_secperunit = sc->sc_size / spb;
	lp->d_secsize = vng->vng_secsize;
	lp->d_nsectors = vng->vng_nsectors;
	lp->d_ntracks = vng->vng_ntracks;
	lp->d_ncylinders = vng->vng_ncylinders;
	lp->d_secpercyl = lp->d_ntracks * lp->d_nsectors;

	strncpy(lp->d_typename, "vnd", sizeof(lp->d_typename));
	lp->d_type = DKTYPE_VND;
	strncpy(lp->d_packname, "fictitious", sizeof(lp->d_packname));
	lp->d_rpm = 3600;
	lp->d_interleave = 1;
	lp->d_flags = 0;

	pp = &lp->d_partitions[RAW_PART];
	pp->p_offset = 0;
	pp->p_size = lp->d_secperunit;
	pp->p_fstype = FS_UNUSED;
	lp->d_npartitions = RAW_PART + 1;

	lp->d_magic = DISKMAGIC;
	lp->d_magic2 = DISKMAGIC;
	lp->d_checksum = dkcksum(lp);
}

/*
 * Read the disklabel from a vnd.  If one is not present, create a fake one.
 */
static void
vndgetdisklabel(dev_t dev, struct vnd_softc *sc)
{
	const char *errstring;
	struct disklabel *lp = sc->sc_dkdev.dk_label;
	struct cpu_disklabel *clp = sc->sc_dkdev.dk_cpulabel;
	int i;

	memset(clp, 0, sizeof(*clp));

	vndgetdefaultlabel(sc, lp);

	/*
	 * Call the generic disklabel extraction routine.
	 */
	errstring = readdisklabel(VNDLABELDEV(dev), vndstrategy, lp, clp);
	if (errstring) {
		/*
		 * Lack of disklabel is common, but we print the warning
		 * anyway, since it might contain other useful information.
		 */
		aprint_normal_dev(sc->sc_dev, "%s\n", errstring);

		/*
		 * For historical reasons, if there's no disklabel
		 * present, all partitions must be FS_BSDFFS and
		 * occupy the entire disk.
		 */
		for (i = 0; i < MAXPARTITIONS; i++) {
			/*
			 * Don't wipe out port specific hack (such as
			 * dos partition hack of i386 port).
			 */
			if (lp->d_partitions[i].p_size != 0)
				continue;

			lp->d_partitions[i].p_size = lp->d_secperunit;
			lp->d_partitions[i].p_offset = 0;
			lp->d_partitions[i].p_fstype = FS_BSDFFS;
		}

		strncpy(lp->d_packname, "default label",
		    sizeof(lp->d_packname));

		lp->d_npartitions = MAXPARTITIONS;
		lp->d_checksum = dkcksum(lp);
	}
}

/*
 * Wait interruptibly for an exclusive lock.
 *
 * XXX
 * Several drivers do this; it should be abstracted and made MP-safe.
 */
static int
vndlock(struct vnd_softc *sc)
{
	int error;

	while ((sc->sc_flags & VNF_LOCKED) != 0) {
		sc->sc_flags |= VNF_WANTED;
		if ((error = tsleep(sc, PRIBIO | PCATCH, "vndlck", 0)) != 0)
			return error;
	}
	sc->sc_flags |= VNF_LOCKED;
	return 0;
}

/*
 * Unlock and wake up any waiters.
 */
static void
vndunlock(struct vnd_softc *sc)
{

	sc->sc_flags &= ~VNF_LOCKED;
	if ((sc->sc_flags & VNF_WANTED) != 0) {
		sc->sc_flags &= ~VNF_WANTED;
		wakeup(sc);
	}
}

#ifdef VND_COMPRESSION
/* compressed file read */
static void
compstrategy(struct buf *bp, off_t bn)
{
	int error;
	int unit = vndunit(bp->b_dev);
	struct vnd_softc *vnd =
	    device_lookup_private(&vnd_cd, unit);
	u_int32_t comp_block;
	struct uio auio;
	char *addr;
	int s;

	/* set up constants for data move */
	auio.uio_rw = UIO_READ;
	UIO_SETUP_SYSSPACE(&auio);

	/* read, and transfer the data */
	addr = bp->b_data;
	bp->b_resid = bp->b_bcount;
	s = splbio();
	while (bp->b_resid > 0) {
		unsigned length;
		size_t length_in_buffer;
		u_int32_t offset_in_buffer;
		struct iovec aiov;

		/* calculate the compressed block number */
		comp_block = bn / (off_t)vnd->sc_comp_blksz;

		/* check for a good block number */
		if (comp_block >= vnd->sc_comp_numoffs) {
			bp->b_error = EINVAL;
			splx(s);
			return;
		}

		/* read in the compressed block, if not in buffer */
		if (comp_block != vnd->sc_comp_buffblk) {
			length = vnd->sc_comp_offsets[comp_block + 1] -
			    vnd->sc_comp_offsets[comp_block];
			vn_lock(vnd->sc_vp, LK_EXCLUSIVE | LK_RETRY);
			error = vn_rdwr(UIO_READ, vnd->sc_vp, vnd->sc_comp_buff,
			    length, vnd->sc_comp_offsets[comp_block],
			    UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, vnd->sc_cred,
			    NULL, NULL);
			if (error) {
				bp->b_error = error;
				VOP_UNLOCK(vnd->sc_vp);
				splx(s);
				return;
			}
			/* uncompress the buffer */
			vnd->sc_comp_stream.next_in = vnd->sc_comp_buff;
			vnd->sc_comp_stream.avail_in = length;
			vnd->sc_comp_stream.next_out = vnd->sc_comp_decombuf;
			vnd->sc_comp_stream.avail_out = vnd->sc_comp_blksz;
			inflateReset(&vnd->sc_comp_stream);
			error = inflate(&vnd->sc_comp_stream, Z_FINISH);
			if (error != Z_STREAM_END) {
				if (vnd->sc_comp_stream.msg)
					aprint_normal_dev(vnd->sc_dev,
					    "compressed file, %s\n",
					    vnd->sc_comp_stream.msg);
				bp->b_error = EBADMSG;
				VOP_UNLOCK(vnd->sc_vp);
				splx(s);
				return;
			}
			vnd->sc_comp_buffblk = comp_block;
			VOP_UNLOCK(vnd->sc_vp);
		}

		/* transfer the usable uncompressed data */
		offset_in_buffer = bn % (off_t)vnd->sc_comp_blksz;
		length_in_buffer = vnd->sc_comp_blksz - offset_in_buffer;
		if (length_in_buffer > bp->b_resid)
			length_in_buffer = bp->b_resid;
		auio.uio_iov = &aiov;
		auio.uio_iovcnt = 1;
		aiov.iov_base = addr;
		aiov.iov_len = length_in_buffer;
		auio.uio_resid = aiov.iov_len;
		auio.uio_offset = 0;
		error = uiomove(vnd->sc_comp_decombuf + offset_in_buffer,
		    length_in_buffer, &auio);
		if (error) {
			bp->b_error = error;
			splx(s);
			return;
		}

		bn += length_in_buffer;
		addr += length_in_buffer;
		bp->b_resid -= length_in_buffer;
	}
	splx(s);
}

/* compression memory allocation routines */
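/*
 * These adapt kernel malloc(9)/free(9) to zlib's zalloc/zfree callback
 * signatures.  An M_NOWAIT allocation may fail; zlib sees the NULL
 * return and reports Z_MEM_ERROR to its caller.
 */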
static void *
vnd_alloc(void *aux, u_int items, u_int siz)
{
	return malloc(items * siz, M_TEMP, M_NOWAIT);
}

static void
vnd_free(void *aux, void *ptr)
{
	free(ptr, M_TEMP);
}
#endif /* VND_COMPRESSION */

static void
vnd_set_geometry(struct vnd_softc *vnd)
{
	struct disk_geom *dg = &vnd->sc_dkdev.dk_geom;
	unsigned spb;

	memset(dg, 0, sizeof(*dg));

	spb = vnd->sc_geom.vng_secsize / DEV_BSIZE;
	dg->dg_secperunit = vnd->sc_size / spb;
	dg->dg_secsize = vnd->sc_geom.vng_secsize;
	dg->dg_nsectors = vnd->sc_geom.vng_nsectors;
	dg->dg_ntracks = vnd->sc_geom.vng_ntracks;
	dg->dg_ncylinders = vnd->sc_geom.vng_ncylinders;

#ifdef DEBUG
	if (vnddebug & VDB_LABEL) {
		printf("dg->dg_secperunit: %" PRId64 "\n", dg->dg_secperunit);
		printf("dg->dg_ncylinders: %u\n", dg->dg_ncylinders);
	}
#endif
	disk_set_info(vnd->sc_dev, &vnd->sc_dkdev, NULL);
}

#ifdef VND_COMPRESSION
#define VND_DEPENDS "zlib"
#else
#define VND_DEPENDS NULL
#endif

MODULE(MODULE_CLASS_DRIVER, vnd, VND_DEPENDS);

#ifdef _MODULE
int vnd_bmajor = -1, vnd_cmajor = -1;

CFDRIVER_DECL(vnd, DV_DISK, NULL);
#endif

static int
vnd_modcmd(modcmd_t cmd, void *arg)
{
	int error = 0;

	switch (cmd) {
	case MODULE_CMD_INIT:
#ifdef _MODULE
		/*
		 * Attach the {b,c}devsw's
		 */
		error = devsw_attach("vnd", &vnd_bdevsw, &vnd_bmajor,
		    &vnd_cdevsw, &vnd_cmajor);
		if (error) {
#ifdef DIAGNOSTIC
			aprint_error("%s: unable to attach %s devsw, "
			    "error %d\n", __func__, vnd_cd.cd_name, error);
#endif
			break;
		}

		error = config_cfdriver_attach(&vnd_cd);
		if (error) {
			devsw_detach(&vnd_bdevsw, &vnd_cdevsw);
			break;
		}

		error = config_cfattach_attach(vnd_cd.cd_name, &vnd_ca);
		if (error) {
			config_cfdriver_detach(&vnd_cd);
			devsw_detach(&vnd_bdevsw, &vnd_cdevsw);
#ifdef DIAGNOSTIC
			aprint_error("%s: unable to register cfattach for "
			    "%s, error %d\n", __func__, vnd_cd.cd_name, error);
#endif
			break;
		}
#endif
		break;

	case MODULE_CMD_FINI:
#ifdef _MODULE
		/*
		 * Remove the device from the autoconf database
		 */
		error = config_cfattach_detach(vnd_cd.cd_name, &vnd_ca);
		if (error) {
#ifdef DIAGNOSTIC
			aprint_error("%s: failed to detach %s cfattach, "
			    "error %d\n", __func__, vnd_cd.cd_name, error);
#endif
			break;
		}
		error = config_cfdriver_detach(&vnd_cd);
		if (error) {
			(void)config_cfattach_attach(vnd_cd.cd_name, &vnd_ca);
#ifdef DIAGNOSTIC
			aprint_error("%s: failed to detach %s cfdriver, "
			    "error %d\n", __func__, vnd_cd.cd_name, error);
#endif
			break;
		}
		/*
		 * Remove the {b,c}devsw's
		 */
		devsw_detach(&vnd_bdevsw, &vnd_cdevsw);
#endif
		break;

	case MODULE_CMD_STAT:
		return ENOTTY;

	default:
		return ENOTTY;
	}

	return error;
}