lofi.c revision 7656:2621e50fdf4a
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26
27/*
28 * lofi (loopback file) driver - allows you to attach a file to a device,
29 * which can then be accessed through that device. The simple model is that
30 * you tell lofi to open a file, and then use the block device you get as
31 * you would any block device. lofi translates access to the block device
32 * into I/O on the underlying file. This is mostly useful for
33 * mounting images of filesystems.
34 *
35 * lofi is controlled through /dev/lofictl - this is the only device exported
36 * during attach, and is minor number 0. lofiadm communicates with lofi through
37 * ioctls on this device. When a file is attached to lofi, block and character
38 * devices are exported in /dev/lofi and /dev/rlofi. Currently, these devices
39 * are identified by their minor number, and the minor number is also used
40 * as the name in /dev/lofi. If we ever decide to support virtual disks,
41 * we'll have to divide the minor number space to identify fdisk partitions
42 * and slices, and the name will then be the minor number shifted down a
43 * few bits. Minor devices are tracked with state structures handled with
44 * ddi_soft_state(9F) for simplicity.
45 *
46 * A file attached to lofi is opened when attached and not closed until
47 * explicitly detached from lofi. This seems more sensible than deferring
48 * the open until the /dev/lofi device is opened, for a number of reasons.
49 * One is that any failure is likely to be noticed by the person (or script)
50 * running lofiadm. Another is that it would be a security problem if the
51 * file was replaced by another one after being added but before being opened.
52 *
53 * The only hard part about lofi is the ioctls. In order to support things
54 * like 'newfs' on a lofi device, it needs to support certain disk ioctls.
55 * So it has to fake disk geometry and partition information. More may need
56 * to be faked if your favorite utility doesn't work and you think it should
57 * (fdformat doesn't work because it really wants to know the type of floppy
58 * controller to talk to, and that didn't seem easy to fake. Or possibly even
59 * necessary, since we have mkfs_pcfs now).
60 *
61 * Normally, a lofi device cannot be detached if it is open (i.e. busy).  To
62 * support simulation of hotplug events, an optional force flag is provided.
63 * If a lofi device is open when a force detach is requested, then the
64 * underlying file is closed and any subsequent operations return EIO.  When the
65 * device is closed for the last time, it will be cleaned up at that time.  In
66 * addition, the DKIOCSTATE ioctl will return DKIO_DEV_GONE when the device is
67 * detached but not removed.
68 *
69 * Known problems:
70 *
71 *	UFS logging. Mounting a UFS filesystem image "logging"
72 *	works for basic copy testing but wedges during a build of ON through
73 *	that image. Some deadlock in lufs holding the log mutex and then
74 *	getting stuck on a buf. So for now, don't do that.
75 *
76 *	Direct I/O. Since the filesystem data is being cached in the buffer
77 *	cache, _and_ again in the underlying filesystem, it's tempting to
78 *	enable direct I/O on the underlying file. Don't, because that deadlocks.
79 *	I think to fix the cache-twice problem we might need filesystem support.
80 *
81 *	lofi on itself. The simple lock strategy (lofi_lock) precludes this
82 *	because you'll be in lofi_ioctl, holding the lock when you open the
83 *	file, which, if it's lofi, will grab lofi_lock. We prevent this for
84 *	now, though not using ddi_soft_state(9F) would make it possible to
85 *	do. Though it would still be silly.
86 *
87 * Interesting things to do:
88 *
89 *	Allow multiple files for each device. A poor-man's metadisk, basically.
90 *
91 *	Pass-through ioctls on block devices. You can (though it's not
92 *	documented) give lofi a block device as a file name. Then we shouldn't
93 *	need to fake a geometry. But this is also silly unless you're replacing
94 *	metadisk.
95 *
96 *	Encryption. tpm would like this. Apparently Windows 2000 has it, and
97 *	so does Linux.
98 */
99
100#include <sys/types.h>
101#include <netinet/in.h>
102#include <sys/sysmacros.h>
103#include <sys/uio.h>
104#include <sys/kmem.h>
105#include <sys/cred.h>
106#include <sys/mman.h>
107#include <sys/errno.h>
108#include <sys/aio_req.h>
109#include <sys/stat.h>
110#include <sys/file.h>
111#include <sys/modctl.h>
112#include <sys/conf.h>
113#include <sys/debug.h>
114#include <sys/vnode.h>
115#include <sys/lofi.h>
116#include <sys/fcntl.h>
117#include <sys/pathname.h>
118#include <sys/filio.h>
119#include <sys/fdio.h>
120#include <sys/open.h>
121#include <sys/disp.h>
122#include <vm/seg_map.h>
123#include <sys/ddi.h>
124#include <sys/sunddi.h>
125#include <sys/zmod.h>
126
127#define	NBLOCKS_PROP_NAME	"Nblocks"
128#define	SIZE_PROP_NAME		"Size"
129
130static dev_info_t *lofi_dip;
131static void	*lofi_statep;
132static kmutex_t lofi_lock;		/* state lock */
133
134/*
135 * Because lofi_taskq_nthreads limits the actual swamping of the device, the
136 * maxalloc parameter (lofi_taskq_maxalloc) should be tuned conservatively
137 * high.  If we want to be assured that the underlying device is always busy,
138 * we must be sure that the number of bytes enqueued when the number of
139 * enqueued tasks exceeds maxalloc is sufficient to keep the device busy for
140 * the duration of the sleep time in taskq_ent_alloc().  That is, lofi should
141 * set maxalloc to be the maximum throughput (in bytes per second) of the
142 * underlying device divided by the minimum I/O size.  We assume a realistic
143 * maximum throughput of one hundred megabytes per second; we set maxalloc on
144 * the lofi task queue to be 104857600 divided by DEV_BSIZE.
145 */
146static int lofi_taskq_maxalloc = 104857600 / DEV_BSIZE;
147static int lofi_taskq_nthreads = 4;	/* # of taskq threads per device */
148
149uint32_t lofi_max_files = LOFI_MAX_FILES;
150
151static int gzip_decompress(void *src, size_t srclen, void *dst,
152	size_t *destlen, int level);
153
154lofi_compress_info_t lofi_compress_table[LOFI_COMPRESS_FUNCTIONS] = {
155	{gzip_decompress,	NULL,	6,	"gzip"}, /* default */
156	{gzip_decompress,	NULL,	6,	"gzip-6"},
157	{gzip_decompress,	NULL,	9,	"gzip-9"}
158};
159
160static int
161lofi_busy(void)
162{
163	minor_t	minor;
164
165	/*
166	 * We need to make sure no mappings exist - mod_remove won't
167	 * help because the device isn't open.
168	 */
169	mutex_enter(&lofi_lock);
170	for (minor = 1; minor <= lofi_max_files; minor++) {
171		if (ddi_get_soft_state(lofi_statep, minor) != NULL) {
172			mutex_exit(&lofi_lock);
173			return (EBUSY);
174		}
175	}
176	mutex_exit(&lofi_lock);
177	return (0);
178}
179
180static int
181is_opened(struct lofi_state *lsp)
182{
183	ASSERT(mutex_owned(&lofi_lock));
184	return (lsp->ls_chr_open || lsp->ls_blk_open || lsp->ls_lyr_open_count);
185}
186
187static int
188mark_opened(struct lofi_state *lsp, int otyp)
189{
190	ASSERT(mutex_owned(&lofi_lock));
191	switch (otyp) {
192	case OTYP_CHR:
193		lsp->ls_chr_open = 1;
194		break;
195	case OTYP_BLK:
196		lsp->ls_blk_open = 1;
197		break;
198	case OTYP_LYR:
199		lsp->ls_lyr_open_count++;
200		break;
201	default:
202		return (-1);
203	}
204	return (0);
205}
206
207static void
208mark_closed(struct lofi_state *lsp, int otyp)
209{
210	ASSERT(mutex_owned(&lofi_lock));
211	switch (otyp) {
212	case OTYP_CHR:
213		lsp->ls_chr_open = 0;
214		break;
215	case OTYP_BLK:
216		lsp->ls_blk_open = 0;
217		break;
218	case OTYP_LYR:
219		lsp->ls_lyr_open_count--;
220		break;
221	default:
222		break;
223	}
224}
225
226static void
227lofi_free_handle(dev_t dev, minor_t minor, struct lofi_state *lsp,
228    cred_t *credp)
229{
230	dev_t	newdev;
231	char	namebuf[50];
232
233	if (lsp->ls_vp) {
234		(void) VOP_CLOSE(lsp->ls_vp, lsp->ls_openflag,
235		    1, 0, credp, NULL);
236		VN_RELE(lsp->ls_vp);
237		lsp->ls_vp = NULL;
238	}
239
240	newdev = makedevice(getmajor(dev), minor);
241	(void) ddi_prop_remove(newdev, lofi_dip, SIZE_PROP_NAME);
242	(void) ddi_prop_remove(newdev, lofi_dip, NBLOCKS_PROP_NAME);
243
244	(void) snprintf(namebuf, sizeof (namebuf), "%d", minor);
245	ddi_remove_minor_node(lofi_dip, namebuf);
246	(void) snprintf(namebuf, sizeof (namebuf), "%d,raw", minor);
247	ddi_remove_minor_node(lofi_dip, namebuf);
248
249	kmem_free(lsp->ls_filename, lsp->ls_filename_sz);
250	taskq_destroy(lsp->ls_taskq);
251	if (lsp->ls_kstat) {
252		kstat_delete(lsp->ls_kstat);
253		mutex_destroy(&lsp->ls_kstat_lock);
254	}
255
256	if (lsp->ls_uncomp_seg_sz > 0) {
257		kmem_free(lsp->ls_comp_index_data, lsp->ls_comp_index_data_sz);
258		lsp->ls_uncomp_seg_sz = 0;
259	}
260	ddi_soft_state_free(lofi_statep, minor);
261}
262
263/*ARGSUSED*/
264static int
265lofi_open(dev_t *devp, int flag, int otyp, struct cred *credp)
266{
267	minor_t	minor;
268	struct lofi_state *lsp;
269
270	mutex_enter(&lofi_lock);
271	minor = getminor(*devp);
272	if (minor == 0) {
273		/* master control device */
274		/* must be opened exclusively */
275		if (((flag & FEXCL) != FEXCL) || (otyp != OTYP_CHR)) {
276			mutex_exit(&lofi_lock);
277			return (EINVAL);
278		}
279		lsp = ddi_get_soft_state(lofi_statep, 0);
280		if (lsp == NULL) {
281			mutex_exit(&lofi_lock);
282			return (ENXIO);
283		}
284		if (is_opened(lsp)) {
285			mutex_exit(&lofi_lock);
286			return (EBUSY);
287		}
288		(void) mark_opened(lsp, OTYP_CHR);
289		mutex_exit(&lofi_lock);
290		return (0);
291	}
292
293	/* otherwise, the mapping should already exist */
294	lsp = ddi_get_soft_state(lofi_statep, minor);
295	if (lsp == NULL) {
296		mutex_exit(&lofi_lock);
297		return (EINVAL);
298	}
299
300	if (lsp->ls_vp == NULL) {
301		mutex_exit(&lofi_lock);
302		return (ENXIO);
303	}
304
305	if (mark_opened(lsp, otyp) == -1) {
306		mutex_exit(&lofi_lock);
307		return (EINVAL);
308	}
309
310	mutex_exit(&lofi_lock);
311	return (0);
312}
313
314/*ARGSUSED*/
315static int
316lofi_close(dev_t dev, int flag, int otyp, struct cred *credp)
317{
318	minor_t	minor;
319	struct lofi_state *lsp;
320
321	mutex_enter(&lofi_lock);
322	minor = getminor(dev);
323	lsp = ddi_get_soft_state(lofi_statep, minor);
324	if (lsp == NULL) {
325		mutex_exit(&lofi_lock);
326		return (EINVAL);
327	}
328	mark_closed(lsp, otyp);
329
330	/*
331	 * If we forcibly closed the underlying device (li_force), or
332	 * asked for cleanup (li_cleanup), finish up if we're the last
333	 * out of the door.
334	 */
335	if (minor != 0 && !is_opened(lsp) &&
336	    (lsp->ls_cleanup || lsp->ls_vp == NULL))
337		lofi_free_handle(dev, minor, lsp, credp);
338
339	mutex_exit(&lofi_lock);
340	return (0);
341}
342
343static int
344lofi_mapped_rdwr(caddr_t bufaddr, offset_t offset, struct buf *bp,
345	struct lofi_state *lsp)
346{
347	int error;
348	offset_t alignedoffset, mapoffset;
349	size_t	xfersize;
350	int	isread;
351	int 	smflags;
352	caddr_t	mapaddr;
353	size_t	len;
354	enum seg_rw srw;
355
356	/*
357	 * segmap always gives us an 8K (MAXBSIZE) chunk, aligned on
358	 * an 8K boundary, but the buf transfer address may not be
359	 * aligned on more than a 512-byte boundary (we don't enforce
360	 * that even though we could). This matters since the initial
361	 * part of the transfer may not start at offset 0 within the
362	 * segmap'd chunk. So we have to compensate for that with
363	 * 'mapoffset'. Subsequent chunks always start off at the
364	 * beginning, and the last is capped by b_resid
365	 */
366	mapoffset = offset & MAXBOFFSET;
367	alignedoffset = offset - mapoffset;
368	bp->b_resid = bp->b_bcount;
369	isread = bp->b_flags & B_READ;
370	srw = isread ? S_READ : S_WRITE;
371	do {
372		xfersize = MIN(lsp->ls_vp_comp_size - offset,
373		    MIN(MAXBSIZE - mapoffset, bp->b_resid));
374		len = roundup(mapoffset + xfersize, PAGESIZE);
375		mapaddr = segmap_getmapflt(segkmap, lsp->ls_vp,
376		    alignedoffset, MAXBSIZE, 1, srw);
377		/*
378		 * Now fault in the pages. This lets us check
379		 * for errors before we reference mapaddr and
380		 * try to resolve the fault in bcopy (which would
381		 * panic instead). And this can easily happen,
382		 * particularly if you've lofi'd a file over NFS
383		 * and someone deletes the file on the server.
384		 */
385		error = segmap_fault(kas.a_hat, segkmap, mapaddr,
386		    len, F_SOFTLOCK, srw);
387		if (error) {
388			(void) segmap_release(segkmap, mapaddr, 0);
389			if (FC_CODE(error) == FC_OBJERR)
390				error = FC_ERRNO(error);
391			else
392				error = EIO;
393			break;
394		}
395		smflags = 0;
396		if (isread) {
397			smflags |= SM_FREE;
398			/*
399			 * If we're reading an entire page starting
400			 * at a page boundary, there's a good chance
401			 * we won't need it again. Put it on the
402			 * head of the freelist.
403			 */
404			if (mapoffset == 0 && xfersize == PAGESIZE)
405				smflags |= SM_DONTNEED;
406			bcopy(mapaddr + mapoffset, bufaddr, xfersize);
407		} else {
408			smflags |= SM_WRITE;
409			bcopy(bufaddr, mapaddr + mapoffset, xfersize);
410		}
411		bp->b_resid -= xfersize;
412		bufaddr += xfersize;
413		offset += xfersize;
414		(void) segmap_fault(kas.a_hat, segkmap, mapaddr,
415		    len, F_SOFTUNLOCK, srw);
416		error = segmap_release(segkmap, mapaddr, smflags);
417		/* only the first map may start partial */
418		mapoffset = 0;
419		alignedoffset += MAXBSIZE;
420	} while ((error == 0) && (bp->b_resid > 0) &&
421	    (offset < lsp->ls_vp_comp_size));
422
423	return (error);
424}
425
426/*ARGSUSED*/
427static int gzip_decompress(void *src, size_t srclen, void *dst,
428    size_t *dstlen, int level)
429{
430	ASSERT(*dstlen >= srclen);
431
432	if (z_uncompress(dst, dstlen, src, srclen) != Z_OK)
433		return (-1);
434	return (0);
435}
436
437/*
438 * This is basically what strategy used to be before we found we
439 * needed task queues.
440 */
441static void
442lofi_strategy_task(void *arg)
443{
444	struct buf *bp = (struct buf *)arg;
445	int error;
446	struct lofi_state *lsp;
447	uint64_t sblkno, eblkno, cmpbytes;
448	offset_t offset, sblkoff, eblkoff;
449	u_offset_t salign, ealign;
450	u_offset_t sdiff;
451	uint32_t comp_data_sz;
452	caddr_t bufaddr;
453	unsigned char *compressed_seg = NULL, *cmpbuf;
454	unsigned char *uncompressed_seg = NULL;
455	lofi_compress_info_t *li;
456	size_t oblkcount, xfersize;
457	unsigned long seglen;
458
459	lsp = ddi_get_soft_state(lofi_statep, getminor(bp->b_edev));
460	if (lsp->ls_kstat) {
461		mutex_enter(lsp->ls_kstat->ks_lock);
462		kstat_waitq_to_runq(KSTAT_IO_PTR(lsp->ls_kstat));
463		mutex_exit(lsp->ls_kstat->ks_lock);
464	}
465	bp_mapin(bp);
466	bufaddr = bp->b_un.b_addr;
467	offset = bp->b_lblkno * DEV_BSIZE;	/* offset within file */
468
469	/*
470	 * We used to always use vn_rdwr here, but we cannot do that because
471	 * we might decide to read or write from the the underlying
472	 * file during this call, which would be a deadlock because
473	 * we have the rw_lock. So instead we page, unless it's not
474	 * mapable or it's a character device.
475	 */
476	if (lsp->ls_vp == NULL || lsp->ls_vp_closereq) {
477		error = EIO;
478	} else if (((lsp->ls_vp->v_flag & VNOMAP) == 0) &&
479	    (lsp->ls_vp->v_type != VCHR)) {
480		uint64_t i;
481
482		/*
483		 * Handle uncompressed files with a regular read
484		 */
485		if (lsp->ls_uncomp_seg_sz == 0) {
486			error = lofi_mapped_rdwr(bufaddr, offset, bp, lsp);
487			goto done;
488		}
489
490		/*
491		 * From here on we're dealing primarily with compressed files
492		 */
493
494		/*
495		 * Compressed files can only be read from and
496		 * not written to
497		 */
498		if (!(bp->b_flags & B_READ)) {
499			bp->b_resid = bp->b_bcount;
500			error = EROFS;
501			goto done;
502		}
503
504		ASSERT(lsp->ls_comp_algorithm_index >= 0);
505		li = &lofi_compress_table[lsp->ls_comp_algorithm_index];
506		/*
507		 * Compute starting and ending compressed segment numbers
508		 * We use only bitwise operations avoiding division and
509		 * modulus because we enforce the compression segment size
510		 * to a power of 2
511		 */
512		sblkno = offset >> lsp->ls_comp_seg_shift;
513		sblkoff = offset & (lsp->ls_uncomp_seg_sz - 1);
514		eblkno = (offset + bp->b_bcount) >> lsp->ls_comp_seg_shift;
515		eblkoff = (offset + bp->b_bcount) & (lsp->ls_uncomp_seg_sz - 1);
516
517		/*
518		 * Align start offset to block boundary for segmap
519		 */
520		salign = lsp->ls_comp_seg_index[sblkno];
521		sdiff = salign & (DEV_BSIZE - 1);
522		salign -= sdiff;
523		if (eblkno >= (lsp->ls_comp_index_sz - 1)) {
524			/*
525			 * We're dealing with the last segment of
526			 * the compressed file -- the size of this
527			 * segment *may not* be the same as the
528			 * segment size for the file
529			 */
530			eblkoff = (offset + bp->b_bcount) &
531			    (lsp->ls_uncomp_last_seg_sz - 1);
532			ealign = lsp->ls_vp_comp_size;
533		} else {
534			ealign = lsp->ls_comp_seg_index[eblkno + 1];
535		}
536
537		/*
538		 * Preserve original request paramaters
539		 */
540		oblkcount = bp->b_bcount;
541
542		/*
543		 * Assign the calculated parameters
544		 */
545		comp_data_sz = ealign - salign;
546		bp->b_bcount = comp_data_sz;
547
548		/*
549		 * Allocate fixed size memory blocks to hold compressed
550		 * segments and one uncompressed segment since we
551		 * uncompress segments one at a time
552		 */
553		compressed_seg = kmem_alloc(bp->b_bcount, KM_SLEEP);
554		uncompressed_seg = kmem_alloc(lsp->ls_uncomp_seg_sz, KM_SLEEP);
555		/*
556		 * Map in the calculated number of blocks
557		 */
558		error = lofi_mapped_rdwr((caddr_t)compressed_seg, salign,
559		    bp, lsp);
560
561		bp->b_bcount = oblkcount;
562		bp->b_resid = oblkcount;
563		if (error != 0)
564			goto done;
565
566		/*
567		 * We have the compressed blocks, now uncompress them
568		 */
569		cmpbuf = compressed_seg + sdiff;
570		for (i = sblkno; i < (eblkno + 1) && i < lsp->ls_comp_index_sz;
571		    i++) {
572			/*
573			 * Each of the segment index entries contains
574			 * the starting block number for that segment.
575			 * The number of compressed bytes in a segment
576			 * is thus the difference between the starting
577			 * block number of this segment and the starting
578			 * block number of the next segment.
579			 */
580			if ((i == eblkno) &&
581			    (i == lsp->ls_comp_index_sz - 1)) {
582				cmpbytes = lsp->ls_vp_comp_size -
583				    lsp->ls_comp_seg_index[i];
584			} else {
585				cmpbytes = lsp->ls_comp_seg_index[i + 1] -
586				    lsp->ls_comp_seg_index[i];
587			}
588
589			/*
590			 * The first byte in a compressed segment is a flag
591			 * that indicates whether this segment is compressed
592			 * at all
593			 */
594			if (*cmpbuf == UNCOMPRESSED) {
595				bcopy((cmpbuf + SEGHDR), uncompressed_seg,
596				    (cmpbytes - SEGHDR));
597			} else {
598				seglen = lsp->ls_uncomp_seg_sz;
599
600				if (li->l_decompress((cmpbuf + SEGHDR),
601				    (cmpbytes - SEGHDR), uncompressed_seg,
602				    &seglen, li->l_level) != 0) {
603					error = EIO;
604					goto done;
605				}
606			}
607
608			/*
609			 * Determine how much uncompressed data we
610			 * have to copy and copy it
611			 */
612			xfersize = lsp->ls_uncomp_seg_sz - sblkoff;
613			if (i == eblkno) {
614				if (i == (lsp->ls_comp_index_sz - 1))
615					xfersize -= (lsp->ls_uncomp_last_seg_sz
616					    - eblkoff);
617				else
618					xfersize -=
619					    (lsp->ls_uncomp_seg_sz - eblkoff);
620			}
621
622			bcopy((uncompressed_seg + sblkoff), bufaddr, xfersize);
623
624			cmpbuf += cmpbytes;
625			bufaddr += xfersize;
626			bp->b_resid -= xfersize;
627			sblkoff = 0;
628
629			if (bp->b_resid == 0)
630				break;
631		}
632	} else {
633		ssize_t	resid;
634		enum uio_rw rw;
635
636		if (bp->b_flags & B_READ)
637			rw = UIO_READ;
638		else
639			rw = UIO_WRITE;
640		error = vn_rdwr(rw, lsp->ls_vp, bufaddr, bp->b_bcount,
641		    offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
642		bp->b_resid = resid;
643	}
644
645done:
646	if (compressed_seg != NULL)
647		kmem_free(compressed_seg, comp_data_sz);
648	if (uncompressed_seg != NULL)
649		kmem_free(uncompressed_seg, lsp->ls_uncomp_seg_sz);
650
651	if (lsp->ls_kstat) {
652		size_t n_done = bp->b_bcount - bp->b_resid;
653		kstat_io_t *kioptr;
654
655		mutex_enter(lsp->ls_kstat->ks_lock);
656		kioptr = KSTAT_IO_PTR(lsp->ls_kstat);
657		if (bp->b_flags & B_READ) {
658			kioptr->nread += n_done;
659			kioptr->reads++;
660		} else {
661			kioptr->nwritten += n_done;
662			kioptr->writes++;
663		}
664		kstat_runq_exit(kioptr);
665		mutex_exit(lsp->ls_kstat->ks_lock);
666	}
667
668	mutex_enter(&lsp->ls_vp_lock);
669	if (--lsp->ls_vp_iocount == 0)
670		cv_broadcast(&lsp->ls_vp_cv);
671	mutex_exit(&lsp->ls_vp_lock);
672
673	bioerror(bp, error);
674	biodone(bp);
675}
676
677static int
678lofi_strategy(struct buf *bp)
679{
680	struct lofi_state *lsp;
681	offset_t	offset;
682
683	/*
684	 * We cannot just do I/O here, because the current thread
685	 * _might_ end up back in here because the underlying filesystem
686	 * wants a buffer, which eventually gets into bio_recycle and
687	 * might call into lofi to write out a delayed-write buffer.
688	 * This is bad if the filesystem above lofi is the same as below.
689	 *
690	 * We could come up with a complex strategy using threads to
691	 * do the I/O asynchronously, or we could use task queues. task
692	 * queues were incredibly easy so they win.
693	 */
694	lsp = ddi_get_soft_state(lofi_statep, getminor(bp->b_edev));
695	mutex_enter(&lsp->ls_vp_lock);
696	if (lsp->ls_vp == NULL || lsp->ls_vp_closereq) {
697		bioerror(bp, EIO);
698		biodone(bp);
699		mutex_exit(&lsp->ls_vp_lock);
700		return (0);
701	}
702
703	offset = bp->b_lblkno * DEV_BSIZE;	/* offset within file */
704	if (offset == lsp->ls_vp_size) {
705		/* EOF */
706		if ((bp->b_flags & B_READ) != 0) {
707			bp->b_resid = bp->b_bcount;
708			bioerror(bp, 0);
709		} else {
710			/* writes should fail */
711			bioerror(bp, ENXIO);
712		}
713		biodone(bp);
714		mutex_exit(&lsp->ls_vp_lock);
715		return (0);
716	}
717	if (offset > lsp->ls_vp_size) {
718		bioerror(bp, ENXIO);
719		biodone(bp);
720		mutex_exit(&lsp->ls_vp_lock);
721		return (0);
722	}
723	lsp->ls_vp_iocount++;
724	mutex_exit(&lsp->ls_vp_lock);
725
726	if (lsp->ls_kstat) {
727		mutex_enter(lsp->ls_kstat->ks_lock);
728		kstat_waitq_enter(KSTAT_IO_PTR(lsp->ls_kstat));
729		mutex_exit(lsp->ls_kstat->ks_lock);
730	}
731	(void) taskq_dispatch(lsp->ls_taskq, lofi_strategy_task, bp, KM_SLEEP);
732	return (0);
733}
734
735/*ARGSUSED2*/
736static int
737lofi_read(dev_t dev, struct uio *uio, struct cred *credp)
738{
739	if (getminor(dev) == 0)
740		return (EINVAL);
741	return (physio(lofi_strategy, NULL, dev, B_READ, minphys, uio));
742}
743
744/*ARGSUSED2*/
745static int
746lofi_write(dev_t dev, struct uio *uio, struct cred *credp)
747{
748	if (getminor(dev) == 0)
749		return (EINVAL);
750	return (physio(lofi_strategy, NULL, dev, B_WRITE, minphys, uio));
751}
752
753/*ARGSUSED2*/
754static int
755lofi_aread(dev_t dev, struct aio_req *aio, struct cred *credp)
756{
757	if (getminor(dev) == 0)
758		return (EINVAL);
759	return (aphysio(lofi_strategy, anocancel, dev, B_READ, minphys, aio));
760}
761
762/*ARGSUSED2*/
763static int
764lofi_awrite(dev_t dev, struct aio_req *aio, struct cred *credp)
765{
766	if (getminor(dev) == 0)
767		return (EINVAL);
768	return (aphysio(lofi_strategy, anocancel, dev, B_WRITE, minphys, aio));
769}
770
771/*ARGSUSED*/
772static int
773lofi_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
774{
775	switch (infocmd) {
776	case DDI_INFO_DEVT2DEVINFO:
777		*result = lofi_dip;
778		return (DDI_SUCCESS);
779	case DDI_INFO_DEVT2INSTANCE:
780		*result = 0;
781		return (DDI_SUCCESS);
782	}
783	return (DDI_FAILURE);
784}
785
786static int
787lofi_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
788{
789	int	error;
790
791	if (cmd != DDI_ATTACH)
792		return (DDI_FAILURE);
793	error = ddi_soft_state_zalloc(lofi_statep, 0);
794	if (error == DDI_FAILURE) {
795		return (DDI_FAILURE);
796	}
797	error = ddi_create_minor_node(dip, LOFI_CTL_NODE, S_IFCHR, 0,
798	    DDI_PSEUDO, NULL);
799	if (error == DDI_FAILURE) {
800		ddi_soft_state_free(lofi_statep, 0);
801		return (DDI_FAILURE);
802	}
803	/* driver handles kernel-issued IOCTLs */
804	if (ddi_prop_create(DDI_DEV_T_NONE, dip, DDI_PROP_CANSLEEP,
805	    DDI_KERNEL_IOCTL, NULL, 0) != DDI_PROP_SUCCESS) {
806		ddi_remove_minor_node(dip, NULL);
807		ddi_soft_state_free(lofi_statep, 0);
808		return (DDI_FAILURE);
809	}
810	lofi_dip = dip;
811	ddi_report_dev(dip);
812	return (DDI_SUCCESS);
813}
814
815static int
816lofi_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
817{
818	if (cmd != DDI_DETACH)
819		return (DDI_FAILURE);
820	if (lofi_busy())
821		return (DDI_FAILURE);
822	lofi_dip = NULL;
823	ddi_remove_minor_node(dip, NULL);
824	ddi_prop_remove_all(dip);
825	ddi_soft_state_free(lofi_statep, 0);
826	return (DDI_SUCCESS);
827}
828
829/*
830 * These two just simplify the rest of the ioctls that need to copyin/out
831 * the lofi_ioctl structure.
832 */
833struct lofi_ioctl *
834copy_in_lofi_ioctl(const struct lofi_ioctl *ulip, int flag)
835{
836	struct lofi_ioctl *klip;
837	int	error;
838
839	klip = kmem_alloc(sizeof (struct lofi_ioctl), KM_SLEEP);
840	error = ddi_copyin(ulip, klip, sizeof (struct lofi_ioctl), flag);
841	if (error) {
842		kmem_free(klip, sizeof (struct lofi_ioctl));
843		return (NULL);
844	}
845
846	/* make sure filename is always null-terminated */
847	klip->li_filename[MAXPATHLEN] = '\0';
848
849	/* validate minor number */
850	if (klip->li_minor > lofi_max_files) {
851		kmem_free(klip, sizeof (struct lofi_ioctl));
852		return (NULL);
853	}
854	return (klip);
855}
856
857int
858copy_out_lofi_ioctl(const struct lofi_ioctl *klip, struct lofi_ioctl *ulip,
859	int flag)
860{
861	int	error;
862
863	error = ddi_copyout(klip, ulip, sizeof (struct lofi_ioctl), flag);
864	if (error)
865		return (EFAULT);
866	return (0);
867}
868
869void
870free_lofi_ioctl(struct lofi_ioctl *klip)
871{
872	kmem_free(klip, sizeof (struct lofi_ioctl));
873}
874
875/*
876 * Return the minor number 'filename' is mapped to, if it is.
877 */
878static int
879file_to_minor(char *filename)
880{
881	minor_t	minor;
882	struct lofi_state *lsp;
883
884	ASSERT(mutex_owned(&lofi_lock));
885	for (minor = 1; minor <= lofi_max_files; minor++) {
886		lsp = ddi_get_soft_state(lofi_statep, minor);
887		if (lsp == NULL)
888			continue;
889		if (strcmp(lsp->ls_filename, filename) == 0)
890			return (minor);
891	}
892	return (0);
893}
894
895/*
896 * lofiadm does some validation, but since Joe Random (or crashme) could
897 * do our ioctls, we need to do some validation too.
898 */
899static int
900valid_filename(const char *filename)
901{
902	static char *blkprefix = "/dev/" LOFI_BLOCK_NAME "/";
903	static char *charprefix = "/dev/" LOFI_CHAR_NAME "/";
904
905	/* must be absolute path */
906	if (filename[0] != '/')
907		return (0);
908	/* must not be lofi */
909	if (strncmp(filename, blkprefix, strlen(blkprefix)) == 0)
910		return (0);
911	if (strncmp(filename, charprefix, strlen(charprefix)) == 0)
912		return (0);
913	return (1);
914}
915
916/*
917 * Fakes up a disk geometry, and one big partition, based on the size
918 * of the file. This is needed because we allow newfs'ing the device,
919 * and newfs will do several disk ioctls to figure out the geometry and
920 * partition information. It uses that information to determine the parameters
921 * to pass to mkfs. Geometry is pretty much irrelevant these days, but we
922 * have to support it.
923 */
924static void
925fake_disk_geometry(struct lofi_state *lsp)
926{
927	/* dk_geom - see dkio(7I) */
928	/*
929	 * dkg_ncyl _could_ be set to one here (one big cylinder with gobs
930	 * of sectors), but that breaks programs like fdisk which want to
931	 * partition a disk by cylinder. With one cylinder, you can't create
932	 * an fdisk partition and put pcfs on it for testing (hard to pick
933	 * a number between one and one).
934	 *
935	 * The cheezy floppy test is an attempt to not have too few cylinders
936	 * for a small file, or so many on a big file that you waste space
937	 * for backup superblocks or cylinder group structures.
938	 */
939	if (lsp->ls_vp_size < (2 * 1024 * 1024)) /* floppy? */
940		lsp->ls_dkg.dkg_ncyl = lsp->ls_vp_size / (100 * 1024);
941	else
942		lsp->ls_dkg.dkg_ncyl = lsp->ls_vp_size / (300 * 1024);
943	/* in case file file is < 100k */
944	if (lsp->ls_dkg.dkg_ncyl == 0)
945		lsp->ls_dkg.dkg_ncyl = 1;
946	lsp->ls_dkg.dkg_acyl = 0;
947	lsp->ls_dkg.dkg_bcyl = 0;
948	lsp->ls_dkg.dkg_nhead = 1;
949	lsp->ls_dkg.dkg_obs1 = 0;
950	lsp->ls_dkg.dkg_intrlv = 0;
951	lsp->ls_dkg.dkg_obs2 = 0;
952	lsp->ls_dkg.dkg_obs3 = 0;
953	lsp->ls_dkg.dkg_apc = 0;
954	lsp->ls_dkg.dkg_rpm = 7200;
955	lsp->ls_dkg.dkg_pcyl = lsp->ls_dkg.dkg_ncyl + lsp->ls_dkg.dkg_acyl;
956	lsp->ls_dkg.dkg_nsect = lsp->ls_vp_size /
957	    (DEV_BSIZE * lsp->ls_dkg.dkg_ncyl);
958	lsp->ls_dkg.dkg_write_reinstruct = 0;
959	lsp->ls_dkg.dkg_read_reinstruct = 0;
960
961	/* vtoc - see dkio(7I) */
962	bzero(&lsp->ls_vtoc, sizeof (struct vtoc));
963	lsp->ls_vtoc.v_sanity = VTOC_SANE;
964	lsp->ls_vtoc.v_version = V_VERSION;
965	bcopy(LOFI_DRIVER_NAME, lsp->ls_vtoc.v_volume, 7);
966	lsp->ls_vtoc.v_sectorsz = DEV_BSIZE;
967	lsp->ls_vtoc.v_nparts = 1;
968	lsp->ls_vtoc.v_part[0].p_tag = V_UNASSIGNED;
969
970	/*
971	 * A compressed file is read-only, other files can
972	 * be read-write
973	 */
974	if (lsp->ls_uncomp_seg_sz > 0) {
975		lsp->ls_vtoc.v_part[0].p_flag = V_UNMNT | V_RONLY;
976	} else {
977		lsp->ls_vtoc.v_part[0].p_flag = V_UNMNT;
978	}
979	lsp->ls_vtoc.v_part[0].p_start = (daddr_t)0;
980	/*
981	 * The partition size cannot just be the number of sectors, because
982	 * that might not end on a cylinder boundary. And if that's the case,
983	 * newfs/mkfs will print a scary warning. So just figure the size
984	 * based on the number of cylinders and sectors/cylinder.
985	 */
986	lsp->ls_vtoc.v_part[0].p_size = lsp->ls_dkg.dkg_pcyl *
987	    lsp->ls_dkg.dkg_nsect * lsp->ls_dkg.dkg_nhead;
988
989	/* dk_cinfo - see dkio(7I) */
990	bzero(&lsp->ls_ci, sizeof (struct dk_cinfo));
991	(void) strcpy(lsp->ls_ci.dki_cname, LOFI_DRIVER_NAME);
992	lsp->ls_ci.dki_ctype = DKC_MD;
993	lsp->ls_ci.dki_flags = 0;
994	lsp->ls_ci.dki_cnum = 0;
995	lsp->ls_ci.dki_addr = 0;
996	lsp->ls_ci.dki_space = 0;
997	lsp->ls_ci.dki_prio = 0;
998	lsp->ls_ci.dki_vec = 0;
999	(void) strcpy(lsp->ls_ci.dki_dname, LOFI_DRIVER_NAME);
1000	lsp->ls_ci.dki_unit = 0;
1001	lsp->ls_ci.dki_slave = 0;
1002	lsp->ls_ci.dki_partition = 0;
1003	/*
1004	 * newfs uses this to set maxcontig. Must not be < 16, or it
1005	 * will be 0 when newfs multiplies it by DEV_BSIZE and divides
1006	 * it by the block size. Then tunefs doesn't work because
1007	 * maxcontig is 0.
1008	 */
1009	lsp->ls_ci.dki_maxtransfer = 16;
1010}
1011
1012/*
1013 * map in a compressed file
1014 *
1015 * Read in the header and the index that follows.
1016 *
1017 * The header is as follows -
1018 *
1019 * Signature (name of the compression algorithm)
1020 * Compression segment size (a multiple of 512)
1021 * Number of index entries
1022 * Size of the last block
1023 * The array containing the index entries
1024 *
1025 * The header information is always stored in
1026 * network byte order on disk.
1027 */
1028static int
1029lofi_map_compressed_file(struct lofi_state *lsp, char *buf)
1030{
1031	uint32_t index_sz, header_len, i;
1032	ssize_t	resid;
1033	enum uio_rw rw;
1034	char *tbuf = buf;
1035	int error;
1036
1037	/* The signature has already been read */
1038	tbuf += sizeof (lsp->ls_comp_algorithm);
1039	bcopy(tbuf, &(lsp->ls_uncomp_seg_sz), sizeof (lsp->ls_uncomp_seg_sz));
1040	lsp->ls_uncomp_seg_sz = ntohl(lsp->ls_uncomp_seg_sz);
1041
1042	/*
1043	 * The compressed segment size must be a power of 2
1044	 */
1045	if (lsp->ls_uncomp_seg_sz % 2)
1046		return (EINVAL);
1047
1048	for (i = 0; !((lsp->ls_uncomp_seg_sz >> i) & 1); i++)
1049		;
1050
1051	lsp->ls_comp_seg_shift = i;
1052
1053	tbuf += sizeof (lsp->ls_uncomp_seg_sz);
1054	bcopy(tbuf, &(lsp->ls_comp_index_sz), sizeof (lsp->ls_comp_index_sz));
1055	lsp->ls_comp_index_sz = ntohl(lsp->ls_comp_index_sz);
1056
1057	tbuf += sizeof (lsp->ls_comp_index_sz);
1058	bcopy(tbuf, &(lsp->ls_uncomp_last_seg_sz),
1059	    sizeof (lsp->ls_uncomp_last_seg_sz));
1060	lsp->ls_uncomp_last_seg_sz = ntohl(lsp->ls_uncomp_last_seg_sz);
1061
1062	/*
1063	 * Compute the total size of the uncompressed data
1064	 * for use in fake_disk_geometry and other calculations.
1065	 * Disk geometry has to be faked with respect to the
1066	 * actual uncompressed data size rather than the
1067	 * compressed file size.
1068	 */
1069	lsp->ls_vp_size = (lsp->ls_comp_index_sz - 2) * lsp->ls_uncomp_seg_sz
1070	    + lsp->ls_uncomp_last_seg_sz;
1071
1072	/*
1073	 * Index size is rounded up to a 512 byte boundary for ease
1074	 * of segmapping
1075	 */
1076	index_sz = sizeof (*lsp->ls_comp_seg_index) * lsp->ls_comp_index_sz;
1077	header_len = sizeof (lsp->ls_comp_algorithm) +
1078	    sizeof (lsp->ls_uncomp_seg_sz) +
1079	    sizeof (lsp->ls_comp_index_sz) +
1080	    sizeof (lsp->ls_uncomp_last_seg_sz);
1081	lsp->ls_comp_offbase = header_len + index_sz;
1082
1083	index_sz += header_len;
1084	index_sz = roundup(index_sz, DEV_BSIZE);
1085
1086	lsp->ls_comp_index_data = kmem_alloc(index_sz, KM_SLEEP);
1087	lsp->ls_comp_index_data_sz = index_sz;
1088
1089	/*
1090	 * Read in the index -- this has a side-effect
1091	 * of reading in the header as well
1092	 */
1093	rw = UIO_READ;
1094	error = vn_rdwr(rw, lsp->ls_vp, lsp->ls_comp_index_data, index_sz,
1095	    0, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
1096
1097	if (error != 0)
1098		return (error);
1099
1100	/* Skip the header, this is where the index really begins */
1101	lsp->ls_comp_seg_index =
1102	    /*LINTED*/
1103	    (uint64_t *)(lsp->ls_comp_index_data + header_len);
1104
1105	/*
1106	 * Now recompute offsets in the index to account for
1107	 * the header length
1108	 */
1109	for (i = 0; i < lsp->ls_comp_index_sz; i++) {
1110		lsp->ls_comp_seg_index[i] = lsp->ls_comp_offbase +
1111		    BE_64(lsp->ls_comp_seg_index[i]);
1112	}
1113
1114	return (error);
1115}
1116
1117/*
1118 * Check to see if the passed in signature is a valid
1119 * one. If it is valid, return the index into
1120 * lofi_compress_table.
1121 *
1122 * Return -1 if it is invalid
1123 */
1124static int lofi_compress_select(char *signature)
1125{
1126	int i;
1127
1128	for (i = 0; i < LOFI_COMPRESS_FUNCTIONS; i++) {
1129		if (strcmp(lofi_compress_table[i].l_name, signature) == 0)
1130			return (i);
1131	}
1132
1133	return (-1);
1134}
1135
1136/*
1137 * map a file to a minor number. Return the minor number.
1138 */
1139static int
1140lofi_map_file(dev_t dev, struct lofi_ioctl *ulip, int pickminor,
1141    int *rvalp, struct cred *credp, int ioctl_flag)
1142{
1143	minor_t	newminor;
1144	struct lofi_state *lsp;
1145	struct lofi_ioctl *klip;
1146	int	error;
1147	struct vnode *vp;
1148	int64_t	Nblocks_prop_val;
1149	int64_t	Size_prop_val;
1150	int	compress_index;
1151	vattr_t	vattr;
1152	int	flag;
1153	enum vtype v_type;
1154	int zalloced = 0;
1155	dev_t	newdev;
1156	char	namebuf[50];
1157	char 	buf[DEV_BSIZE];
1158	char 	*tbuf;
1159	ssize_t	resid;
1160	enum uio_rw rw;
1161
1162	klip = copy_in_lofi_ioctl(ulip, ioctl_flag);
1163	if (klip == NULL)
1164		return (EFAULT);
1165
1166	mutex_enter(&lofi_lock);
1167
1168	if (!valid_filename(klip->li_filename)) {
1169		error = EINVAL;
1170		goto out;
1171	}
1172
1173	if (file_to_minor(klip->li_filename) != 0) {
1174		error = EBUSY;
1175		goto out;
1176	}
1177
1178	if (pickminor) {
1179		/* Find a free one */
1180		for (newminor = 1; newminor <= lofi_max_files; newminor++)
1181			if (ddi_get_soft_state(lofi_statep, newminor) == NULL)
1182				break;
1183		if (newminor >= lofi_max_files) {
1184			error = EAGAIN;
1185			goto out;
1186		}
1187	} else {
1188		newminor = klip->li_minor;
1189		if (ddi_get_soft_state(lofi_statep, newminor) != NULL) {
1190			error = EEXIST;
1191			goto out;
1192		}
1193	}
1194
1195	/* make sure it's valid */
1196	error = lookupname(klip->li_filename, UIO_SYSSPACE, FOLLOW,
1197	    NULLVPP, &vp);
1198	if (error) {
1199		goto out;
1200	}
1201	v_type = vp->v_type;
1202	VN_RELE(vp);
1203	if (!V_ISLOFIABLE(v_type)) {
1204		error = EINVAL;
1205		goto out;
1206	}
1207	flag = FREAD | FWRITE | FOFFMAX | FEXCL;
1208	error = vn_open(klip->li_filename, UIO_SYSSPACE, flag, 0, &vp, 0, 0);
1209	if (error) {
1210		/* try read-only */
1211		flag &= ~FWRITE;
1212		error = vn_open(klip->li_filename, UIO_SYSSPACE, flag, 0,
1213		    &vp, 0, 0);
1214		if (error) {
1215			goto out;
1216		}
1217	}
1218	vattr.va_mask = AT_SIZE;
1219	error = VOP_GETATTR(vp, &vattr, 0, credp, NULL);
1220	if (error) {
1221		goto closeout;
1222	}
1223	/* the file needs to be a multiple of the block size */
1224	if ((vattr.va_size % DEV_BSIZE) != 0) {
1225		error = EINVAL;
1226		goto closeout;
1227	}
1228	newdev = makedevice(getmajor(dev), newminor);
1229	Size_prop_val = vattr.va_size;
1230	if ((ddi_prop_update_int64(newdev, lofi_dip,
1231	    SIZE_PROP_NAME, Size_prop_val)) != DDI_PROP_SUCCESS) {
1232		error = EINVAL;
1233		goto closeout;
1234	}
1235	Nblocks_prop_val = vattr.va_size / DEV_BSIZE;
1236	if ((ddi_prop_update_int64(newdev, lofi_dip,
1237	    NBLOCKS_PROP_NAME, Nblocks_prop_val)) != DDI_PROP_SUCCESS) {
1238		error = EINVAL;
1239		goto propout;
1240	}
1241	error = ddi_soft_state_zalloc(lofi_statep, newminor);
1242	if (error == DDI_FAILURE) {
1243		error = ENOMEM;
1244		goto propout;
1245	}
1246	zalloced = 1;
1247	(void) snprintf(namebuf, sizeof (namebuf), "%d", newminor);
1248	error = ddi_create_minor_node(lofi_dip, namebuf, S_IFBLK, newminor,
1249	    DDI_PSEUDO, NULL);
1250	if (error != DDI_SUCCESS) {
1251		error = ENXIO;
1252		goto propout;
1253	}
1254	(void) snprintf(namebuf, sizeof (namebuf), "%d,raw", newminor);
1255	error = ddi_create_minor_node(lofi_dip, namebuf, S_IFCHR, newminor,
1256	    DDI_PSEUDO, NULL);
1257	if (error != DDI_SUCCESS) {
1258		/* remove block node */
1259		(void) snprintf(namebuf, sizeof (namebuf), "%d", newminor);
1260		ddi_remove_minor_node(lofi_dip, namebuf);
1261		error = ENXIO;
1262		goto propout;
1263	}
1264	lsp = ddi_get_soft_state(lofi_statep, newminor);
1265	lsp->ls_filename_sz = strlen(klip->li_filename) + 1;
1266	lsp->ls_filename = kmem_alloc(lsp->ls_filename_sz, KM_SLEEP);
1267	(void) snprintf(namebuf, sizeof (namebuf), "%s_taskq_%d",
1268	    LOFI_DRIVER_NAME, newminor);
1269	lsp->ls_taskq = taskq_create(namebuf, lofi_taskq_nthreads,
1270	    minclsyspri, 1, lofi_taskq_maxalloc, 0);
1271	lsp->ls_kstat = kstat_create(LOFI_DRIVER_NAME, newminor,
1272	    NULL, "disk", KSTAT_TYPE_IO, 1, 0);
1273	if (lsp->ls_kstat) {
1274		mutex_init(&lsp->ls_kstat_lock, NULL, MUTEX_DRIVER, NULL);
1275		lsp->ls_kstat->ks_lock = &lsp->ls_kstat_lock;
1276		kstat_install(lsp->ls_kstat);
1277	}
1278	cv_init(&lsp->ls_vp_cv, NULL, CV_DRIVER, NULL);
1279	mutex_init(&lsp->ls_vp_lock, NULL, MUTEX_DRIVER, NULL);
1280
1281	/*
1282	 * save open mode so file can be closed properly and vnode counts
1283	 * updated correctly.
1284	 */
1285	lsp->ls_openflag = flag;
1286
1287	/*
1288	 * Try to handle stacked lofs vnodes.
1289	 */
1290	if (vp->v_type == VREG) {
1291		if (VOP_REALVP(vp, &lsp->ls_vp, NULL) != 0) {
1292			lsp->ls_vp = vp;
1293		} else {
1294			/*
1295			 * Even though vp was obtained via vn_open(), we
1296			 * can't call vn_close() on it, since lofs will
1297			 * pass the VOP_CLOSE() on down to the realvp
1298			 * (which we are about to use). Hence we merely
1299			 * drop the reference to the lofs vnode and hold
1300			 * the realvp so things behave as if we've
1301			 * opened the realvp without any interaction
1302			 * with lofs.
1303			 */
1304			VN_HOLD(lsp->ls_vp);
1305			VN_RELE(vp);
1306		}
1307	} else {
1308		lsp->ls_vp = vp;
1309	}
1310	lsp->ls_vp_size = vattr.va_size;
1311	(void) strcpy(lsp->ls_filename, klip->li_filename);
1312	if (rvalp)
1313		*rvalp = (int)newminor;
1314	klip->li_minor = newminor;
1315
1316	/*
1317	 * Read the file signature to check if it is compressed.
1318	 * 'rw' is set to read since only reads are allowed to
1319	 * a compressed file.
1320	 */
1321	rw = UIO_READ;
1322	error = vn_rdwr(rw, lsp->ls_vp, buf, DEV_BSIZE, 0, UIO_SYSSPACE,
1323	    0, RLIM64_INFINITY, kcred, &resid);
1324
1325	if (error != 0)
1326		goto propout;
1327
1328	tbuf = buf;
1329	lsp->ls_uncomp_seg_sz = 0;
1330	lsp->ls_vp_comp_size = lsp->ls_vp_size;
1331	lsp->ls_comp_algorithm[0] = '\0';
1332
1333	compress_index = lofi_compress_select(tbuf);
1334	if (compress_index != -1) {
1335		lsp->ls_comp_algorithm_index = compress_index;
1336		(void) strlcpy(lsp->ls_comp_algorithm,
1337		    lofi_compress_table[compress_index].l_name,
1338		    sizeof (lsp->ls_comp_algorithm));
1339		error = lofi_map_compressed_file(lsp, buf);
1340		if (error != 0)
1341			goto propout;
1342
1343		/* update DDI properties */
1344		Size_prop_val = lsp->ls_vp_size;
1345		if ((ddi_prop_update_int64(newdev, lofi_dip, SIZE_PROP_NAME,
1346		    Size_prop_val)) != DDI_PROP_SUCCESS) {
1347			error = EINVAL;
1348			goto propout;
1349		}
1350
1351		Nblocks_prop_val = lsp->ls_vp_size / DEV_BSIZE;
1352		if ((ddi_prop_update_int64(newdev, lofi_dip, NBLOCKS_PROP_NAME,
1353		    Nblocks_prop_val)) != DDI_PROP_SUCCESS) {
1354			error = EINVAL;
1355			goto propout;
1356		}
1357	}
1358
1359	fake_disk_geometry(lsp);
1360	mutex_exit(&lofi_lock);
1361	(void) copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
1362	free_lofi_ioctl(klip);
1363	return (0);
1364
1365propout:
1366	(void) ddi_prop_remove(newdev, lofi_dip, SIZE_PROP_NAME);
1367	(void) ddi_prop_remove(newdev, lofi_dip, NBLOCKS_PROP_NAME);
1368closeout:
1369	(void) VOP_CLOSE(vp, flag, 1, 0, credp, NULL);
1370	VN_RELE(vp);
1371out:
1372	if (zalloced)
1373		ddi_soft_state_free(lofi_statep, newminor);
1374	mutex_exit(&lofi_lock);
1375	free_lofi_ioctl(klip);
1376	return (error);
1377}
1378
1379/*
1380 * unmap a file.
1381 */
1382static int
1383lofi_unmap_file(dev_t dev, struct lofi_ioctl *ulip, int byfilename,
1384    struct cred *credp, int ioctl_flag)
1385{
1386	struct lofi_state *lsp;
1387	struct lofi_ioctl *klip;
1388	minor_t	minor;
1389
1390	klip = copy_in_lofi_ioctl(ulip, ioctl_flag);
1391	if (klip == NULL)
1392		return (EFAULT);
1393
1394	mutex_enter(&lofi_lock);
1395	if (byfilename) {
1396		minor = file_to_minor(klip->li_filename);
1397	} else {
1398		minor = klip->li_minor;
1399	}
1400	if (minor == 0) {
1401		mutex_exit(&lofi_lock);
1402		free_lofi_ioctl(klip);
1403		return (ENXIO);
1404	}
1405	lsp = ddi_get_soft_state(lofi_statep, minor);
1406	if (lsp == NULL || lsp->ls_vp == NULL) {
1407		mutex_exit(&lofi_lock);
1408		free_lofi_ioctl(klip);
1409		return (ENXIO);
1410	}
1411
1412	/*
1413	 * If it's still held open, we'll do one of three things:
1414	 *
1415	 * If no flag is set, just return EBUSY.
1416	 *
1417	 * If the 'cleanup' flag is set, unmap and remove the device when
1418	 * the last user finishes.
1419	 *
1420	 * If the 'force' flag is set, then we forcibly close the underlying
1421	 * file.  Subsequent operations will fail, and the DKIOCSTATE ioctl
1422	 * will return DKIO_DEV_GONE.  When the device is last closed, the
1423	 * device will be cleaned up appropriately.
1424	 *
1425	 * This is complicated by the fact that we may have outstanding
1426	 * dispatched I/Os.  Rather than having a single mutex to serialize all
1427	 * I/O, we keep a count of the number of outstanding I/O requests, as
1428	 * well as a flag to indicate that no new I/Os should be dispatched.
1429	 * We set the flag, wait for the number of outstanding I/Os to reach 0,
1430	 * and then close the underlying vnode.
1431	 */
1432
1433	if (is_opened(lsp)) {
1434		if (klip->li_force) {
1435			mutex_enter(&lsp->ls_vp_lock);
1436			lsp->ls_vp_closereq = B_TRUE;
1437			while (lsp->ls_vp_iocount > 0)
1438				cv_wait(&lsp->ls_vp_cv, &lsp->ls_vp_lock);
1439			(void) VOP_CLOSE(lsp->ls_vp, lsp->ls_openflag, 1, 0,
1440			    credp, NULL);
1441			VN_RELE(lsp->ls_vp);
1442			lsp->ls_vp = NULL;
1443			cv_broadcast(&lsp->ls_vp_cv);
1444			mutex_exit(&lsp->ls_vp_lock);
1445			mutex_exit(&lofi_lock);
1446			klip->li_minor = minor;
1447			(void) copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
1448			free_lofi_ioctl(klip);
1449			return (0);
1450		} else if (klip->li_cleanup) {
1451			lsp->ls_cleanup = 1;
1452			mutex_exit(&lofi_lock);
1453			free_lofi_ioctl(klip);
1454			return (0);
1455		}
1456
1457		mutex_exit(&lofi_lock);
1458		free_lofi_ioctl(klip);
1459		return (EBUSY);
1460	}
1461
1462	lofi_free_handle(dev, minor, lsp, credp);
1463
1464	klip->li_minor = minor;
1465	mutex_exit(&lofi_lock);
1466	(void) copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
1467	free_lofi_ioctl(klip);
1468	return (0);
1469}
1470
1471/*
1472 * get the filename given the minor number, or the minor number given
1473 * the name.
1474 */
1475/*ARGSUSED*/
1476static int
1477lofi_get_info(dev_t dev, struct lofi_ioctl *ulip, int which,
1478    struct cred *credp, int ioctl_flag)
1479{
1480	struct lofi_state *lsp;
1481	struct lofi_ioctl *klip;
1482	int	error;
1483	minor_t	minor;
1484
1485	klip = copy_in_lofi_ioctl(ulip, ioctl_flag);
1486	if (klip == NULL)
1487		return (EFAULT);
1488
1489	switch (which) {
1490	case LOFI_GET_FILENAME:
1491		minor = klip->li_minor;
1492		if (minor == 0) {
1493			free_lofi_ioctl(klip);
1494			return (EINVAL);
1495		}
1496
1497		mutex_enter(&lofi_lock);
1498		lsp = ddi_get_soft_state(lofi_statep, minor);
1499		if (lsp == NULL) {
1500			mutex_exit(&lofi_lock);
1501			free_lofi_ioctl(klip);
1502			return (ENXIO);
1503		}
1504		(void) strcpy(klip->li_filename, lsp->ls_filename);
1505		(void) strlcpy(klip->li_algorithm, lsp->ls_comp_algorithm,
1506		    sizeof (klip->li_algorithm));
1507		mutex_exit(&lofi_lock);
1508		error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
1509		free_lofi_ioctl(klip);
1510		return (error);
1511	case LOFI_GET_MINOR:
1512		mutex_enter(&lofi_lock);
1513		klip->li_minor = file_to_minor(klip->li_filename);
1514		mutex_exit(&lofi_lock);
1515		if (klip->li_minor == 0) {
1516			free_lofi_ioctl(klip);
1517			return (ENOENT);
1518		}
1519		error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
1520		free_lofi_ioctl(klip);
1521		return (error);
1522	case LOFI_CHECK_COMPRESSED:
1523		mutex_enter(&lofi_lock);
1524		klip->li_minor = file_to_minor(klip->li_filename);
1525		mutex_exit(&lofi_lock);
1526		if (klip->li_minor == 0) {
1527			free_lofi_ioctl(klip);
1528			return (ENOENT);
1529		}
1530		mutex_enter(&lofi_lock);
1531		lsp = ddi_get_soft_state(lofi_statep, klip->li_minor);
1532		if (lsp == NULL) {
1533			mutex_exit(&lofi_lock);
1534			free_lofi_ioctl(klip);
1535			return (ENXIO);
1536		}
1537		ASSERT(strcmp(klip->li_filename, lsp->ls_filename) == 0);
1538
1539		(void) strlcpy(klip->li_algorithm, lsp->ls_comp_algorithm,
1540		    sizeof (klip->li_algorithm));
1541		mutex_exit(&lofi_lock);
1542		error = copy_out_lofi_ioctl(klip, ulip, ioctl_flag);
1543		free_lofi_ioctl(klip);
1544		return (error);
1545	default:
1546		free_lofi_ioctl(klip);
1547		return (EINVAL);
1548	}
1549
1550}
1551
1552static int
1553lofi_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *credp,
1554    int *rvalp)
1555{
1556	int	error;
1557	enum dkio_state dkstate;
1558	struct lofi_state *lsp;
1559	minor_t	minor;
1560
1561#ifdef lint
1562	credp = credp;
1563#endif
1564
1565	minor = getminor(dev);
1566	/* lofi ioctls only apply to the master device */
1567	if (minor == 0) {
1568		struct lofi_ioctl *lip = (struct lofi_ioctl *)arg;
1569
1570		/*
1571		 * the query command only need read-access - i.e., normal
1572		 * users are allowed to do those on the ctl device as
1573		 * long as they can open it read-only.
1574		 */
1575		switch (cmd) {
1576		case LOFI_MAP_FILE:
1577			if ((flag & FWRITE) == 0)
1578				return (EPERM);
1579			return (lofi_map_file(dev, lip, 1, rvalp, credp, flag));
1580		case LOFI_MAP_FILE_MINOR:
1581			if ((flag & FWRITE) == 0)
1582				return (EPERM);
1583			return (lofi_map_file(dev, lip, 0, rvalp, credp, flag));
1584		case LOFI_UNMAP_FILE:
1585			if ((flag & FWRITE) == 0)
1586				return (EPERM);
1587			return (lofi_unmap_file(dev, lip, 1, credp, flag));
1588		case LOFI_UNMAP_FILE_MINOR:
1589			if ((flag & FWRITE) == 0)
1590				return (EPERM);
1591			return (lofi_unmap_file(dev, lip, 0, credp, flag));
1592		case LOFI_GET_FILENAME:
1593			return (lofi_get_info(dev, lip, LOFI_GET_FILENAME,
1594			    credp, flag));
1595		case LOFI_GET_MINOR:
1596			return (lofi_get_info(dev, lip, LOFI_GET_MINOR,
1597			    credp, flag));
1598		case LOFI_GET_MAXMINOR:
1599			error = ddi_copyout(&lofi_max_files, &lip->li_minor,
1600			    sizeof (lofi_max_files), flag);
1601			if (error)
1602				return (EFAULT);
1603			return (0);
1604		case LOFI_CHECK_COMPRESSED:
1605			return (lofi_get_info(dev, lip, LOFI_CHECK_COMPRESSED,
1606			    credp, flag));
1607		default:
1608			break;
1609		}
1610	}
1611
1612	lsp = ddi_get_soft_state(lofi_statep, minor);
1613	if (lsp == NULL)
1614		return (ENXIO);
1615
1616	/*
1617	 * We explicitly allow DKIOCSTATE, but all other ioctls should fail with
1618	 * EIO as if the device was no longer present.
1619	 */
1620	if (lsp->ls_vp == NULL && cmd != DKIOCSTATE)
1621		return (EIO);
1622
1623	/* these are for faking out utilities like newfs */
1624	switch (cmd) {
1625	case DKIOCGVTOC:
1626		switch (ddi_model_convert_from(flag & FMODELS)) {
1627		case DDI_MODEL_ILP32: {
1628			struct vtoc32 vtoc32;
1629
1630			vtoctovtoc32(lsp->ls_vtoc, vtoc32);
1631			if (ddi_copyout(&vtoc32, (void *)arg,
1632			    sizeof (struct vtoc32), flag))
1633				return (EFAULT);
1634				break;
1635			}
1636
1637		case DDI_MODEL_NONE:
1638			if (ddi_copyout(&lsp->ls_vtoc, (void *)arg,
1639			    sizeof (struct vtoc), flag))
1640				return (EFAULT);
1641			break;
1642		}
1643		return (0);
1644	case DKIOCINFO:
1645		error = ddi_copyout(&lsp->ls_ci, (void *)arg,
1646		    sizeof (struct dk_cinfo), flag);
1647		if (error)
1648			return (EFAULT);
1649		return (0);
1650	case DKIOCG_VIRTGEOM:
1651	case DKIOCG_PHYGEOM:
1652	case DKIOCGGEOM:
1653		error = ddi_copyout(&lsp->ls_dkg, (void *)arg,
1654		    sizeof (struct dk_geom), flag);
1655		if (error)
1656			return (EFAULT);
1657		return (0);
1658	case DKIOCSTATE:
1659		/*
1660		 * Normally, lofi devices are always in the INSERTED state.  If
1661		 * a device is forcefully unmapped, then the device transitions
1662		 * to the DKIO_DEV_GONE state.
1663		 */
1664		if (ddi_copyin((void *)arg, &dkstate, sizeof (dkstate),
1665		    flag) != 0)
1666			return (EFAULT);
1667
1668		mutex_enter(&lsp->ls_vp_lock);
1669		while ((dkstate == DKIO_INSERTED && lsp->ls_vp != NULL) ||
1670		    (dkstate == DKIO_DEV_GONE && lsp->ls_vp == NULL)) {
1671			/*
1672			 * By virtue of having the device open, we know that
1673			 * 'lsp' will remain valid when we return.
1674			 */
1675			if (!cv_wait_sig(&lsp->ls_vp_cv,
1676			    &lsp->ls_vp_lock)) {
1677				mutex_exit(&lsp->ls_vp_lock);
1678				return (EINTR);
1679			}
1680		}
1681
1682		dkstate = (lsp->ls_vp != NULL ? DKIO_INSERTED : DKIO_DEV_GONE);
1683		mutex_exit(&lsp->ls_vp_lock);
1684
1685		if (ddi_copyout(&dkstate, (void *)arg,
1686		    sizeof (dkstate), flag) != 0)
1687			return (EFAULT);
1688		return (0);
1689	default:
1690		return (ENOTTY);
1691	}
1692}
1693
1694static struct cb_ops lofi_cb_ops = {
1695	lofi_open,		/* open */
1696	lofi_close,		/* close */
1697	lofi_strategy,		/* strategy */
1698	nodev,			/* print */
1699	nodev,			/* dump */
1700	lofi_read,		/* read */
1701	lofi_write,		/* write */
1702	lofi_ioctl,		/* ioctl */
1703	nodev,			/* devmap */
1704	nodev,			/* mmap */
1705	nodev,			/* segmap */
1706	nochpoll,		/* poll */
1707	ddi_prop_op,		/* prop_op */
1708	0,			/* streamtab  */
1709	D_64BIT | D_NEW | D_MP,	/* Driver compatibility flag */
1710	CB_REV,
1711	lofi_aread,
1712	lofi_awrite
1713};
1714
1715static struct dev_ops lofi_ops = {
1716	DEVO_REV,		/* devo_rev, */
1717	0,			/* refcnt  */
1718	lofi_info,		/* info */
1719	nulldev,		/* identify */
1720	nulldev,		/* probe */
1721	lofi_attach,		/* attach */
1722	lofi_detach,		/* detach */
1723	nodev,			/* reset */
1724	&lofi_cb_ops,		/* driver operations */
1725	NULL,			/* no bus operations */
1726	NULL,			/* power */
1727	ddi_quiesce_not_needed,		/* quiesce */
1728};
1729
1730static struct modldrv modldrv = {
1731	&mod_driverops,
1732	"loopback file driver",
1733	&lofi_ops,
1734};
1735
1736static struct modlinkage modlinkage = {
1737	MODREV_1,
1738	&modldrv,
1739	NULL
1740};
1741
1742int
1743_init(void)
1744{
1745	int error;
1746
1747	error = ddi_soft_state_init(&lofi_statep,
1748	    sizeof (struct lofi_state), 0);
1749	if (error)
1750		return (error);
1751
1752	mutex_init(&lofi_lock, NULL, MUTEX_DRIVER, NULL);
1753	error = mod_install(&modlinkage);
1754	if (error) {
1755		mutex_destroy(&lofi_lock);
1756		ddi_soft_state_fini(&lofi_statep);
1757	}
1758
1759	return (error);
1760}
1761
1762int
1763_fini(void)
1764{
1765	int	error;
1766
1767	if (lofi_busy())
1768		return (EBUSY);
1769
1770	error = mod_remove(&modlinkage);
1771	if (error)
1772		return (error);
1773
1774	mutex_destroy(&lofi_lock);
1775	ddi_soft_state_fini(&lofi_statep);
1776
1777	return (error);
1778}
1779
1780int
1781_info(struct modinfo *modinfop)
1782{
1783	return (mod_info(&modlinkage, modinfop));
1784}
1785