1/*
2 * Copyright (c) 2000-2010 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29/*
30 * Copyright (c) 1988 University of Utah.
31 * Copyright (c) 1990, 1993
32 *	The Regents of the University of California.  All rights reserved.
33 *
34 * This code is derived from software contributed to Berkeley by
35 * the Systems Programming Group of the University of Utah Computer
36 * Science Department.
37 *
38 * Redistribution and use in source and binary forms, with or without
39 * modification, are permitted provided that the following conditions
40 * are met:
41 * 1. Redistributions of source code must retain the above copyright
42 *    notice, this list of conditions and the following disclaimer.
43 * 2. Redistributions in binary form must reproduce the above copyright
44 *    notice, this list of conditions and the following disclaimer in the
45 *    documentation and/or other materials provided with the distribution.
46 * 3. All advertising materials mentioning features or use of this software
47 *    must display the following acknowledgement:
48 *	This product includes software developed by the University of
49 *	California, Berkeley and its contributors.
50 * 4. Neither the name of the University nor the names of its contributors
51 *    may be used to endorse or promote products derived from this software
52 *    without specific prior written permission.
53 *
54 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64 * SUCH DAMAGE.
65 *
66 * from: Utah Hdr: vn.c 1.13 94/04/02
67 *
68 *	from: @(#)vn.c	8.6 (Berkeley) 4/1/94
69 * $FreeBSD: src/sys/dev/vn/vn.c,v 1.105.2.4 2001/11/18 07:11:00 dillon Exp $
70 */
71
72/*
73 * Vnode disk driver.
74 *
75 * Block/character interface to a vnode.  Allows one to treat a file
76 * as a disk (e.g. build a filesystem in it, mount it, etc.).
77 *
78 * NOTE 1: This uses the vnop_blockmap/vnop_strategy interface to the vnode
79 * instead of a simple VOP_RDWR.  We do this to avoid distorting the
80 * local buffer cache.
81 *
82 * NOTE 2: There is a security issue involved with this driver.
83 * Once mounted all access to the contents of the "mapped" file via
84 * the special file is controlled by the permissions on the special
85 * file, the protection of the mapped file is ignored (effectively,
86 * by using root credentials in all transactions).
87 *
88 * NOTE 3: Doesn't interact with leases, should it?
89 */
90
91#include "vndevice.h"
92
93#if NVNDEVICE > 0
94
95#include <sys/param.h>
96#include <sys/systm.h>
97#include <sys/kernel.h>
98#include <sys/mount.h>
99#include <sys/namei.h>
100#include <sys/proc.h>
101#include <sys/kauth.h>
102#include <sys/buf.h>
103#include <sys/malloc.h>
104#include <sys/vnode_internal.h>
105#include <sys/fcntl.h>
106#include <sys/conf.h>
107#include <sys/disk.h>
108#include <sys/stat.h>
109#include <sys/conf.h>
110#include <sys/uio_internal.h>
111
112#include <sys/vnioctl.h>
113
114#include <sys/vm.h>
115
116#include <vm/vm_pager.h>
117#include <mach/memory_object_types.h>
118
119#include <miscfs/devfs/devfs.h>
120
121
122#include "shadow.h"
123static void
124vndevice_do_init(void);
125
126static ioctl_fcn_t		vnioctl_chr;
127static ioctl_fcn_t		vnioctl_blk;
128static open_close_fcn_t		vnopen;
129static open_close_fcn_t		vnclose;
130static psize_fcn_t		vnsize;
131static strategy_fcn_t		vnstrategy;
132static read_write_fcn_t		vnread;
133static read_write_fcn_t		vnwrite;
134
135static int	vndevice_bdev_major;
136static int	vndevice_cdev_major;
137
/*
 * bdevsw / cdevsw entry-point tables
 *	D_DISK		we want to look like a disk
 *
 * NOTE: earlier versions of this comment also listed D_CANFREE
 * (B_FREEBUF support); neither table below sets that flag.
 */

/*
 * Block-device switch: I/O arrives through vnstrategy, ioctls are routed
 * through vnioctl_blk, and crash dumps are not supported (eno_dump).
 */
static struct bdevsw vn_bdevsw = {
	/* open */	vnopen,
	/* close */	vnclose,
	/* strategy */	vnstrategy,
	/* ioctl */	vnioctl_blk,
	/* dump */	eno_dump,
	/* psize */	vnsize,
	/* flags */	D_DISK,
};
153
/*
 * Character-device switch: shares vnopen/vnclose with the block device,
 * does byte-oriented I/O through vnread/vnwrite and routes ioctls through
 * vnioctl_chr.  Every other entry point is the corresponding eno_* stub.
 */
static struct cdevsw vn_cdevsw = {
	/* open */	vnopen,
	/* close */	vnclose,
	/* read */	vnread,
	/* write */	vnwrite,
	/* ioctl */	vnioctl_chr,
	/* stop */	eno_stop,
	/* reset */	eno_reset,
	/* ttys */	NULL,
	/* select */	eno_select,
	/* mmap */	eno_mmap,
	/* strategy */	eno_strat,
	/* getc */	eno_getc,
	/* putc */	eno_putc,
	/* flags */	D_DISK,
};
170
/*
 * Per-unit driver state.  vn_table is indexed by the unit number returned
 * by vnunit(), i.e. the minor number of the device.
 */
struct vn_softc {
	u_int64_t	sc_fsize;	/* file size in bytes 		*/
	u_int64_t	sc_size;	/* size of vn, sc_secsize scale	*/
	int		sc_flags;	/* flags 			*/
	u_int32_t		sc_secsize;	/* sector size			*/
	struct vnode	*sc_vp;		/* vnode if not NULL		*/
	uint32_t	sc_vid;		/* vnode_vid() of sc_vp, checked by vnode_getwithvid() */
	int		sc_open_flags;	/* FREAD/FWRITE flags sc_vp was opened with */
	struct vnode	*sc_shadow_vp;	/* shadow vnode if not NULL	*/
	uint32_t	sc_shadow_vid;	/* vid passed to vnode_getwithvid() for sc_shadow_vp */
	shadow_map_t *	sc_shadow_map;	/* shadow map if not NULL	*/
	kauth_cred_t	sc_cred;	/* credentials 			*/
	u_int32_t	sc_options;	/* options 			*/
	void *		sc_bdev;	/* presumably the block devfs node; set outside this view -- TODO confirm */
	void *		sc_cdev;	/* devfs node of the "rvn%d" character device */
} vn_table[NVNDEVICE];
187
/* presumably the unit used for the root disk image -- not referenced in the visible code, TODO confirm */
#define ROOT_IMAGE_UNIT	0

/* sc_flags */
#define VNF_INITED	0x01	/* a backing file is attached (set by vniocattach_file) */
#define	VNF_READONLY	0x02	/* backing file could only be opened FREAD */

/* driver-wide option bits, changed through VNIOCGSET / VNIOCGCLEAR */
static u_int32_t	vn_options;

/*
 * Test an option bit against the union of the per-unit and driver-wide
 * options.  IFOPT expands to a bare `if (...)`: it must be followed by a
 * statement, and a following `else` binds to the hidden `if`.
 */
#define IFOPT(vn,opt) if (((vn)->sc_options|vn_options) & (opt))
#define TESTOPT(vn,opt) (((vn)->sc_options|vn_options) & (opt))
198
199static int	setcred(struct vnode * vp, kauth_cred_t cred);
200static void	vnclear (struct vn_softc *vn, vfs_context_t  ctx);
201static void vn_ioctl_to_64(struct vn_ioctl_32 *from, struct vn_ioctl_64 *to);
202void vndevice_init(void);
203int vndevice_root_image(char * path, char devname[], dev_t * dev_p);
204
205static int
206vniocattach_file(struct vn_softc *vn,
207		 struct vn_ioctl_64 *vniop,
208		 dev_t dev,
209		 int in_kernel,
210		 proc_t p);
211static int
212vniocattach_shadow(struct vn_softc * vn,
213		   struct vn_ioctl_64 *vniop,
214		   dev_t dev,
215		   int in_kernel,
216		   proc_t p);
/*
 * Translate a device number into the vn unit index, which is simply the
 * device's minor number.
 */
static __inline__ int
vnunit(dev_t dev)
{
	int	unit = minor(dev);

	return unit;
}
222
223static	int
224vnclose(__unused dev_t dev, __unused int flags,
225		__unused int devtype, __unused proc_t p)
226{
227	return (0);
228}
229
230static	int
231vnopen(dev_t dev, int flags, __unused int devtype, __unused proc_t p)
232{
233	struct vn_softc *vn;
234	int unit;
235
236	unit = vnunit(dev);
237	if (vnunit(dev) >= NVNDEVICE) {
238		return (ENXIO);
239	}
240	vn = vn_table + unit;
241	if ((flags & FWRITE) && (vn->sc_flags & VNF_READONLY))
242		return (EACCES);
243
244	return(0);
245}
246
247static int
248file_io(struct vnode * vp, vfs_context_t ctx,
249	enum uio_rw op, char * base, off_t offset, user_ssize_t count,
250	user_ssize_t * resid)
251{
252	uio_t 		auio;
253	int		error;
254	char		uio_buf[UIO_SIZEOF(1)];
255
256	auio = uio_createwithbuffer(1, offset, UIO_SYSSPACE, op,
257				    &uio_buf[0], sizeof(uio_buf));
258	uio_addiov(auio, CAST_USER_ADDR_T(base), count);
259	if (op == UIO_READ)
260		error = VNOP_READ(vp, auio, IO_SYNC, ctx);
261	else
262		error = VNOP_WRITE(vp, auio, IO_SYNC, ctx);
263
264	if (resid != NULL) {
265		*resid = uio_resid(auio);
266	}
267	return (error);
268}
269
/*
 * Number of blocksize-sized blocks needed to hold `o' bytes
 * (division rounding up).
 */
static __inline__ off_t
block_round(off_t o, int blocksize)
{
	const off_t	padded = o + blocksize - 1;

	return padded / blocksize;
}
275
/*
 * Index of the block that contains byte offset `o'
 * (division rounding down).
 */
static __inline__ off_t
block_truncate(off_t o, int blocksize)
{
	const off_t	block = o / blocksize;

	return block;
}
281
/*
 * Byte offset of `o' within its block.
 */
static __inline__ int
block_remainder(off_t o, int blocksize)
{
	const off_t	within = o % blocksize;

	return within;
}
287
288static int
289vnread_shadow(struct vn_softc * vn, struct uio *uio, int ioflag,
290	      vfs_context_t ctx)
291{
292	u_int32_t		blocksize = vn->sc_secsize;
293	int 		error = 0;
294	off_t		offset;
295	user_ssize_t	resid;
296	off_t		orig_offset;
297	user_ssize_t	orig_resid;
298
299	orig_resid = resid = uio_resid(uio);
300	orig_offset = offset = uio_offset(uio);
301
302	while (resid > 0) {
303		u_int32_t		remainder;
304		u_int32_t		this_block_number;
305		u_int32_t		this_block_count;
306		off_t		this_offset;
307		user_ssize_t	this_resid;
308		struct vnode *	vp;
309
310		/* figure out which blocks to read */
311		remainder = block_remainder(offset, blocksize);
312		if (shadow_map_read(vn->sc_shadow_map,
313				    block_truncate(offset, blocksize),
314				    block_round(resid + remainder, blocksize),
315				    &this_block_number, &this_block_count)) {
316			vp = vn->sc_shadow_vp;
317		}
318		else {
319			vp = vn->sc_vp;
320		}
321
322		/* read the blocks (or parts thereof) */
323		this_offset = (off_t)this_block_number * blocksize + remainder;
324		uio_setoffset(uio, this_offset);
325		this_resid = this_block_count * blocksize - remainder;
326		if (this_resid > resid) {
327			this_resid = resid;
328		}
329		uio_setresid(uio, this_resid);
330		error = VNOP_READ(vp, uio, ioflag, ctx);
331		if (error) {
332			break;
333		}
334
335		/* figure out how much we actually read */
336		this_resid -= uio_resid(uio);
337		if (this_resid == 0) {
338			printf("vn device: vnread_shadow zero length read\n");
339			break;
340		}
341		resid -= this_resid;
342		offset += this_resid;
343	}
344	uio_setresid(uio, resid);
345	uio_setoffset(uio, offset);
346	return (error);
347}
348
349static int
350vncopy_block_to_shadow(struct vn_softc * vn, vfs_context_t ctx,
351		       u_int32_t file_block, u_int32_t shadow_block)
352{
353	int	error;
354	char *	tmpbuf;
355
356	tmpbuf = _MALLOC(vn->sc_secsize, M_TEMP, M_WAITOK);
357	if (tmpbuf == NULL) {
358	    return (ENOMEM);
359	}
360	/* read one block from file at file_block offset */
361	error = file_io(vn->sc_vp, ctx, UIO_READ,
362			tmpbuf, (off_t)file_block * vn->sc_secsize,
363			vn->sc_secsize, NULL);
364	if (error) {
365		goto done;
366	}
367	/* write one block to shadow file at shadow_block offset */
368	error = file_io(vn->sc_shadow_vp, ctx, UIO_WRITE,
369			tmpbuf, (off_t)shadow_block * vn->sc_secsize,
370			vn->sc_secsize, NULL);
371 done:
372	FREE(tmpbuf, M_TEMP);
373	return (error);
374}
375
/*
 * Per-iteration flags used by vnwrite_shadow to remember which edge
 * blocks of a write are only partially covered and not yet present in
 * the shadow map, and therefore must first be copied from the original
 * file.
 */
enum {
	FLAGS_FIRST_BLOCK_PARTIAL = 0x1,	/* write starts in the middle of a block */
	FLAGS_LAST_BLOCK_PARTIAL = 0x2		/* write ends in the middle of a block */
};
380
/*
 * vnwrite_shadow:
 *
 * Character-device write path used when a shadow file is attached.
 * All data is written to the shadow vnode; shadow_map_write() supplies
 * the shadow block number/count for each run of device blocks.  When the
 * first or last block of the request is only partially overwritten and
 * is not yet marked written in the shadow map, the whole original block
 * is first copied into the shadow (vncopy_block_to_shadow) so the bytes
 * outside the write keep their original contents.
 *
 * On return the uio's resid/offset are restored to device-relative
 * values.  Returns 0, or the error from the block copy or VNOP_WRITE.
 *
 * NOTE(review): a write that starts block-aligned (remainder == 0) and
 * covers less than one block (resid_block_count == 1) sets neither
 * partial flag below, so no original data is copied for it -- confirm
 * whether that case is intentionally unhandled.
 */
static int
vnwrite_shadow(struct vn_softc * vn, struct uio *uio, int ioflag,
	       vfs_context_t ctx)
{
	u_int32_t		blocksize = vn->sc_secsize;
	int 		error = 0;
	user_ssize_t	resid;
	off_t		offset;

	resid = uio_resid(uio);
	offset = uio_offset(uio);

	while (resid > 0) {
		int		flags = 0;
		u_int32_t		offset_block_number;
		u_int32_t		remainder;
		u_int32_t		resid_block_count;
		u_int32_t		shadow_block_count;
		u_int32_t		shadow_block_number;
		user_ssize_t	this_resid;

		/* figure out which blocks to write */
		offset_block_number = block_truncate(offset, blocksize);
		remainder = block_remainder(offset, blocksize);
		resid_block_count = block_round(resid + remainder, blocksize);
		/* figure out if the first or last blocks are partial writes */
		if (remainder > 0
		    && !shadow_map_is_written(vn->sc_shadow_map,
					      offset_block_number)) {
			/* the first block is a partial write */
			flags |= FLAGS_FIRST_BLOCK_PARTIAL;
		}
		if (resid_block_count > 1
		    && !shadow_map_is_written(vn->sc_shadow_map,
					      offset_block_number
					      + resid_block_count - 1)
		    && block_remainder(offset + resid, blocksize) > 0) {
			/* the last block is a partial write */
			flags |= FLAGS_LAST_BLOCK_PARTIAL;
		}
		/*
		 * Must run after the is_written checks above: this call is
		 * what records the blocks in the map (presumably marking
		 * them written -- confirm against shadow.c).
		 */
		if (shadow_map_write(vn->sc_shadow_map,
				     offset_block_number, resid_block_count,
				     &shadow_block_number,
				     &shadow_block_count)) {
			/* shadow file is growing */
#if 0
			/* truncate the file to its new length before write */
			off_t	size;
			size = (off_t)shadow_map_shadow_size(vn->sc_shadow_map)
				* vn->sc_secsize;
			vnode_setsize(vn->sc_shadow_vp, size, IO_SYNC, ctx);
#endif
		}
		/* write the blocks (or parts thereof) */
		uio_setoffset(uio, (off_t)
			      shadow_block_number * blocksize + remainder);
		this_resid = (off_t)shadow_block_count * blocksize - remainder;
		if (this_resid >= resid) {
			/* this run reaches the end of the request */
			this_resid = resid;
			if ((flags & FLAGS_LAST_BLOCK_PARTIAL) != 0) {
				/* copy the last block to the shadow */
				u_int32_t 	d;
				u_int32_t	s;

				/* s: last device block, d: last shadow block of this run */
				s = offset_block_number
					+ resid_block_count - 1;
				d = shadow_block_number
					+ shadow_block_count - 1;
				error = vncopy_block_to_shadow(vn, ctx, s, d);
				if (error) {
					printf("vnwrite_shadow: failed to copy"
					       " block %u to shadow block %u\n",
					       s, d);
					break;
				}
			}
		}
		uio_setresid(uio, this_resid);
		if ((flags & FLAGS_FIRST_BLOCK_PARTIAL) != 0) {
			/* copy the first block to the shadow */
			error = vncopy_block_to_shadow(vn, ctx,
						       offset_block_number,
						       shadow_block_number);
			if (error) {
				printf("vnwrite_shadow: failed to"
				       " copy block %u to shadow block %u\n",
				       offset_block_number,
				       shadow_block_number);
				break;
			}
		}
		/* the user data goes on top of any block copied above */
		error = VNOP_WRITE(vn->sc_shadow_vp, uio, ioflag, ctx);
		if (error) {
			break;
		}
		/* figure out how much we actually wrote */
		this_resid -= uio_resid(uio);
		if (this_resid == 0) {
			/* no progress: bail out rather than loop forever */
			printf("vn device: vnwrite_shadow zero length write\n");
			break;
		}
		resid -= this_resid;
		offset += this_resid;
	}
	/* hand back device-relative resid/offset to the caller */
	uio_setresid(uio, resid);
	uio_setoffset(uio, offset);
	return (error);
}
489
490static int
491vnread(dev_t dev, struct uio *uio, int ioflag)
492{
493	struct vfs_context  	context;
494	int 			error = 0;
495	off_t			offset;
496	proc_t			p;
497	user_ssize_t		resid;
498	struct vn_softc *	vn;
499	int 			unit;
500
501	unit = vnunit(dev);
502	if (vnunit(dev) >= NVNDEVICE) {
503		return (ENXIO);
504	}
505	p = current_proc();
506	vn = vn_table + unit;
507	if ((vn->sc_flags & VNF_INITED) == 0) {
508		error = ENXIO;
509		goto done;
510	}
511
512	context.vc_thread = current_thread();
513	context.vc_ucred = vn->sc_cred;
514
515	error = vnode_getwithvid(vn->sc_vp, vn->sc_vid);
516	if (error != 0) {
517		/* the vnode is no longer available, abort */
518		error = ENXIO;
519		vnclear(vn, &context);
520		goto done;
521	}
522
523	resid = uio_resid(uio);
524	offset = uio_offset(uio);
525
526	/*
527	 * If out of bounds return an error.  If at the EOF point,
528	 * simply read less.
529	 */
530	if (offset >= (off_t)vn->sc_fsize) {
531		if (offset > (off_t)vn->sc_fsize) {
532			error = EINVAL;
533		}
534		goto done;
535	}
536	/*
537	 * If the request crosses EOF, truncate the request.
538	 */
539	if ((offset + resid) > (off_t)vn->sc_fsize) {
540		resid = vn->sc_fsize - offset;
541		uio_setresid(uio, resid);
542	}
543
544	if (vn->sc_shadow_vp != NULL) {
545		error = vnode_getwithvid(vn->sc_shadow_vp,
546					 vn->sc_shadow_vid);
547		if (error != 0) {
548			/* the vnode is no longer available, abort */
549			error = ENXIO;
550			vnode_put(vn->sc_vp);
551			vnclear(vn, &context);
552			goto done;
553		}
554		error = vnread_shadow(vn, uio, ioflag, &context);
555		vnode_put(vn->sc_shadow_vp);
556	} else {
557		error = VNOP_READ(vn->sc_vp, uio, ioflag, &context);
558	}
559	vnode_put(vn->sc_vp);
560 done:
561	return (error);
562}
563
564static int
565vnwrite(dev_t dev, struct uio *uio, int ioflag)
566{
567	struct vfs_context  	context;
568	int 			error;
569	off_t			offset;
570	proc_t			p;
571	user_ssize_t		resid;
572	struct vn_softc *	vn;
573	int 			unit;
574
575	unit = vnunit(dev);
576	if (vnunit(dev) >= NVNDEVICE) {
577		return (ENXIO);
578	}
579	p = current_proc();
580	vn = vn_table + unit;
581	if ((vn->sc_flags & VNF_INITED) == 0) {
582		error = ENXIO;
583		goto done;
584	}
585	if (vn->sc_flags & VNF_READONLY) {
586		error = EROFS;
587		goto done;
588	}
589
590	context.vc_thread = current_thread();
591	context.vc_ucred = vn->sc_cred;
592
593	error = vnode_getwithvid(vn->sc_vp, vn->sc_vid);
594	if (error != 0) {
595		/* the vnode is no longer available, abort */
596		error = ENXIO;
597		vnclear(vn, &context);
598		goto done;
599	}
600	resid = uio_resid(uio);
601	offset = uio_offset(uio);
602
603	/*
604	 * If out of bounds return an error.  If at the EOF point,
605	 * simply write less.
606	 */
607	if (offset >= (off_t)vn->sc_fsize) {
608		if (offset > (off_t)vn->sc_fsize) {
609			error = EINVAL;
610		}
611		goto done;
612	}
613	/*
614	 * If the request crosses EOF, truncate the request.
615	 */
616	if ((offset + resid) > (off_t)vn->sc_fsize) {
617		resid = (off_t)vn->sc_fsize - offset;
618		uio_setresid(uio, resid);
619	}
620
621	if (vn->sc_shadow_vp != NULL) {
622		error = vnode_getwithvid(vn->sc_shadow_vp,
623					 vn->sc_shadow_vid);
624		if (error != 0) {
625			/* the vnode is no longer available, abort */
626			error = ENXIO;
627			vnode_put(vn->sc_vp);
628			vnclear(vn, &context);
629			goto done;
630		}
631		error = vnwrite_shadow(vn, uio, ioflag, &context);
632		vnode_put(vn->sc_shadow_vp);
633	} else {
634		error = VNOP_WRITE(vn->sc_vp, uio, ioflag, &context);
635	}
636	vnode_put(vn->sc_vp);
637 done:
638	return (error);
639}
640
641static int
642shadow_read(struct vn_softc * vn, struct buf * bp, char * base,
643	vfs_context_t ctx)
644{
645	u_int32_t		blocksize = vn->sc_secsize;
646	int 		error = 0;
647	u_int32_t		offset;
648	boolean_t	read_shadow;
649	u_int32_t		resid;
650	u_int32_t		start = 0;
651
652	offset = buf_blkno(bp);
653	resid =  buf_resid(bp) / blocksize;
654	while (resid > 0) {
655		user_ssize_t	temp_resid;
656		u_int32_t		this_offset;
657		u_int32_t		this_resid;
658		struct vnode *	vp;
659
660		read_shadow = shadow_map_read(vn->sc_shadow_map,
661					      offset, resid,
662					      &this_offset, &this_resid);
663		if (read_shadow) {
664			vp = vn->sc_shadow_vp;
665		}
666		else {
667			vp = vn->sc_vp;
668		}
669		error = file_io(vp, ctx, UIO_READ, base + start,
670				(off_t)this_offset * blocksize,
671				(user_ssize_t)this_resid * blocksize,
672				&temp_resid);
673		if (error) {
674			break;
675		}
676		this_resid -= (temp_resid / blocksize);
677		if (this_resid == 0) {
678			printf("vn device: shadow_read zero length read\n");
679			break;
680		}
681		resid -= this_resid;
682		offset += this_resid;
683		start += this_resid * blocksize;
684	}
685	buf_setresid(bp, resid * blocksize);
686	return (error);
687}
688
689static int
690shadow_write(struct vn_softc * vn, struct buf * bp, char * base,
691	     vfs_context_t ctx)
692{
693	u_int32_t		blocksize = vn->sc_secsize;
694	int 		error = 0;
695	u_int32_t		offset;
696	boolean_t	shadow_grew;
697	u_int32_t		resid;
698	u_int32_t		start = 0;
699
700	offset = buf_blkno(bp);
701	resid =  buf_resid(bp) / blocksize;
702	while (resid > 0) {
703		user_ssize_t	temp_resid;
704		u_int32_t		this_offset;
705		u_int32_t		this_resid;
706
707		shadow_grew = shadow_map_write(vn->sc_shadow_map,
708					       offset, resid,
709					       &this_offset, &this_resid);
710		if (shadow_grew) {
711#if 0
712			off_t	size;
713			/* truncate the file to its new length before write */
714			size = (off_t)shadow_map_shadow_size(vn->sc_shadow_map)
715				* blocksize;
716			vnode_setsize(vn->sc_shadow_vp, size, IO_SYNC, ctx);
717#endif
718		}
719		error = file_io(vn->sc_shadow_vp, ctx, UIO_WRITE,
720				base + start,
721				(off_t)this_offset * blocksize,
722				(user_ssize_t)this_resid * blocksize,
723				&temp_resid);
724		if (error) {
725			break;
726		}
727		this_resid -= (temp_resid / blocksize);
728		if (this_resid == 0) {
729			printf("vn device: shadow_write zero length write\n");
730			break;
731		}
732		resid -= this_resid;
733		offset += this_resid;
734		start += this_resid * blocksize;
735	}
736	buf_setresid(bp, resid * blocksize);
737	return (error);
738}
739
740static int
741vn_readwrite_io(struct vn_softc * vn, struct buf * bp, vfs_context_t ctx)
742{
743	int			error = 0;
744	char *			iov_base;
745	caddr_t 		vaddr;
746
747	if (buf_map(bp, &vaddr))
748	        panic("vn device: buf_map failed");
749	iov_base = (char *)vaddr;
750
751	if (vn->sc_shadow_vp == NULL) {
752	        user_ssize_t		temp_resid;
753
754		error = file_io(vn->sc_vp, ctx,
755				buf_flags(bp) & B_READ ? UIO_READ : UIO_WRITE,
756				iov_base,
757				(off_t)buf_blkno(bp) * vn->sc_secsize,
758				buf_resid(bp), &temp_resid);
759		buf_setresid(bp, temp_resid);
760	}
761	else {
762		if (buf_flags(bp) & B_READ)
763			error = shadow_read(vn, bp, iov_base, ctx);
764		else
765			error = shadow_write(vn, bp, iov_base, ctx);
766	}
767	buf_unmap(bp);
768
769	return (error);
770}
771
772static void
773vnstrategy(struct buf *bp)
774{
775	struct vn_softc *vn;
776	int error = 0;
777	long sz;	/* in sc_secsize chunks */
778	daddr64_t blk_num;
779	struct vnode *		shadow_vp = NULL;
780	struct vnode *		vp = NULL;
781	struct vfs_context  	context;
782
783	vn = vn_table + vnunit(buf_device(bp));
784	if ((vn->sc_flags & VNF_INITED) == 0) {
785		error = ENXIO;
786		goto done;
787	}
788
789	context.vc_thread = current_thread();
790	context.vc_ucred = vn->sc_cred;
791
792	buf_setresid(bp, buf_count(bp));
793	/*
794	 * Check for required alignment.  Transfers must be a valid
795	 * multiple of the sector size.
796	 */
797	blk_num = buf_blkno(bp);
798	if (buf_count(bp) % vn->sc_secsize != 0) {
799		error = EINVAL;
800		goto done;
801	}
802	sz = howmany(buf_count(bp), vn->sc_secsize);
803
804	/*
805	 * If out of bounds return an error.  If at the EOF point,
806	 * simply read or write less.
807	 */
808	if (blk_num >= 0 && (u_int64_t)blk_num >= vn->sc_size) {
809		if (blk_num > 0 && (u_int64_t)blk_num > vn->sc_size) {
810			error = EINVAL;
811		}
812		goto done;
813	}
814	/*
815	 * If the request crosses EOF, truncate the request.
816	 */
817	if ((blk_num + sz) > 0 && ((u_int64_t)(blk_num + sz)) > vn->sc_size) {
818		buf_setcount(bp, (vn->sc_size - blk_num) * vn->sc_secsize);
819		buf_setresid(bp, buf_count(bp));
820	}
821	vp = vn->sc_vp;
822	if (vp == NULL) {
823		error = ENXIO;
824		goto done;
825	}
826
827	error = vnode_getwithvid(vp, vn->sc_vid);
828	if (error != 0) {
829		/* the vnode is no longer available, abort */
830		error = ENXIO;
831		vnclear(vn, &context);
832		goto done;
833	}
834	shadow_vp = vn->sc_shadow_vp;
835	if (shadow_vp != NULL) {
836		error = vnode_getwithvid(shadow_vp,
837					 vn->sc_shadow_vid);
838		if (error != 0) {
839			/* the vnode is no longer available, abort */
840			error = ENXIO;
841			vnode_put(vn->sc_vp);
842			vnclear(vn, &context);
843			goto done;
844		}
845	}
846
847	error = vn_readwrite_io(vn, bp, &context);
848	vnode_put(vp);
849	if (shadow_vp != NULL) {
850		vnode_put(shadow_vp);
851	}
852
853 done:
854	if (error) {
855	        buf_seterror(bp, error);
856	}
857	buf_biodone(bp);
858	return;
859}
860
/*
 * vnioctl:
 *
 * Common ioctl handler behind vnioctl_chr (is_char != 0) and
 * vnioctl_blk (is_char == 0).
 *
 * Every command requires superuser (the proc_suser() error is returned
 * otherwise).  `data' is interpreted per command as a 32-bit value (f),
 * a 64-bit value (o) or a vn_ioctl structure (viop).
 *
 * Returns 0 on success, ENXIO for a bad unit or for a command that
 * needs an attached unit when none is attached, ENODEV for block-only
 * commands issued on the character device, EBUSY / EINVAL as noted
 * below, and ENOTTY for an unknown command.
 */
/* ARGSUSED */
static	int
vnioctl(dev_t dev, u_long cmd, caddr_t data,
	__unused int flag, proc_t p,
	int is_char)
{
	struct vn_softc *vn;
	struct vn_ioctl_64 *viop;
	int error;
	u_int32_t *f;
	u_int64_t * o;
	int unit;
	struct vfsioattr ioattr;
	struct vn_ioctl_64 user_vnio;
	struct vfs_context  	context;

	unit = vnunit(dev);
	if (vnunit(dev) >= NVNDEVICE) {
		return (ENXIO);
	}

	vn = vn_table + unit;
	error = proc_suser(p);
	if (error) {
		goto done;
	}

	context.vc_thread = current_thread();
	context.vc_ucred = vn->sc_cred;

	/* three views of the same argument buffer; which one is valid depends on cmd */
	viop = (struct vn_ioctl_64 *)data;
	f = (u_int32_t *)data;
	o = (u_int64_t *)data;
	/* first pass: reject the commands that only make sense on an attached unit */
	switch (cmd) {
#ifdef __LP64__
	case VNIOCDETACH32:
	case VNIOCDETACH:
#else
	case VNIOCDETACH:
	case VNIOCDETACH64:
#endif
	case DKIOCGETBLOCKSIZE:
	case DKIOCSETBLOCKSIZE:
	case DKIOCGETMAXBLOCKCOUNTREAD:
	case DKIOCGETMAXBLOCKCOUNTWRITE:
	case DKIOCGETMAXSEGMENTCOUNTREAD:
	case DKIOCGETMAXSEGMENTCOUNTWRITE:
	case DKIOCGETMAXSEGMENTBYTECOUNTREAD:
	case DKIOCGETMAXSEGMENTBYTECOUNTWRITE:
	case DKIOCGETBLOCKCOUNT:
	case DKIOCGETBLOCKCOUNT32:
		if ((vn->sc_flags & VNF_INITED) == 0) {
			error = ENXIO;
			goto done;
		}
		break;
	default:
		break;
	}

	/* I/O limits come from the mount holding the backing file; all zero when unattached */
	if (vn->sc_vp != NULL)
		vfs_ioattr(vnode_mount(vn->sc_vp), &ioattr);
	else
		bzero(&ioattr, sizeof(ioattr));

	/* second pass: carry out the command */
	switch (cmd) {
	case DKIOCISVIRTUAL:
		*f = 1;
		break;
	case DKIOCGETMAXBLOCKCOUNTREAD:
		/* sc_secsize is non-zero here: the first pass required an attached unit */
		*o = ioattr.io_maxreadcnt / vn->sc_secsize;
		break;
	case DKIOCGETMAXBLOCKCOUNTWRITE:
		*o = ioattr.io_maxwritecnt / vn->sc_secsize;
		break;
	case DKIOCGETMAXBYTECOUNTREAD:
		*o = ioattr.io_maxreadcnt;
		break;
	case DKIOCGETMAXBYTECOUNTWRITE:
		*o = ioattr.io_maxwritecnt;
		break;
	case DKIOCGETMAXSEGMENTCOUNTREAD:
		*o = ioattr.io_segreadcnt;
		break;
	case DKIOCGETMAXSEGMENTCOUNTWRITE:
		*o = ioattr.io_segwritecnt;
		break;
	case DKIOCGETMAXSEGMENTBYTECOUNTREAD:
		*o = ioattr.io_maxsegreadsize;
		break;
	case DKIOCGETMAXSEGMENTBYTECOUNTWRITE:
		*o = ioattr.io_maxsegwritesize;
		break;
	case DKIOCGETBLOCKSIZE:
	        *f = vn->sc_secsize;
		break;
	case DKIOCSETBLOCKSIZE:
		if (is_char) {
			/* can only set block size on block device */
			error = ENODEV;
			break;
		}
		/* no upper bound or power-of-two check is applied to the new size */
		if (*f < DEV_BSIZE) {
			error = EINVAL;
			break;
		}
		if (vn->sc_shadow_vp != NULL) {
			/* setting the size it already has is a harmless no-op */
			if (*f == (unsigned)vn->sc_secsize) {
				break;
			}
			/* can't change the block size if already shadowing */
			error = EBUSY;
			break;
		}
		vn->sc_secsize = *f;
		/* recompute the size in terms of the new blocksize */
		vn->sc_size = vn->sc_fsize / vn->sc_secsize;
		break;
	case DKIOCISWRITABLE:
		/* NOTE(review): reports writable even when VNF_READONLY is set -- confirm intended */
		*f = 1;
		break;
	case DKIOCGETBLOCKCOUNT32:
		/* 64-bit sc_size is truncated to 32 bits here */
		*f = vn->sc_size;
		break;
	case DKIOCGETBLOCKCOUNT:
		*o = vn->sc_size;
		break;
#ifdef __LP64__
	case VNIOCSHADOW32:
	case VNIOCSHADOW:
#else
	case VNIOCSHADOW:
	case VNIOCSHADOW64:
#endif
		if (vn->sc_shadow_vp != NULL) {
			error = EBUSY;
			break;
		}
		if (vn->sc_vp == NULL) {
			/* must be attached before we can shadow */
			error = EINVAL;
			break;
		}
		if (!proc_is64bit(p)) {
			/* downstream code expects LP64 version of vn_ioctl structure */
			vn_ioctl_to_64((struct vn_ioctl_32 *)viop, &user_vnio);
			viop = &user_vnio;
		}
		if (viop->vn_file == USER_ADDR_NULL) {
			error = EINVAL;
			break;
		}
		error = vniocattach_shadow(vn, viop, dev, 0, p);
		break;

#ifdef __LP64__
	case VNIOCATTACH32:
	case VNIOCATTACH:
#else
	case VNIOCATTACH:
	case VNIOCATTACH64:
#endif
		if (is_char) {
			/* attach only on block device */
			error = ENODEV;
			break;
		}
		if (vn->sc_flags & VNF_INITED) {
			error = EBUSY;
			break;
		}
		if (!proc_is64bit(p)) {
			/* downstream code expects LP64 version of vn_ioctl structure */
			vn_ioctl_to_64((struct vn_ioctl_32 *)viop, &user_vnio);
			viop = &user_vnio;
		}
		if (viop->vn_file == USER_ADDR_NULL) {
			error = EINVAL;
			break;
		}
		error = vniocattach_file(vn, viop, dev, 0, p);
		break;

#ifdef __LP64__
	case VNIOCDETACH32:
	case VNIOCDETACH:
#else
	case VNIOCDETACH:
	case VNIOCDETACH64:
#endif
		if (is_char) {
			/* detach only on block device */
			error = ENODEV;
			break;
		}
		/* Note: spec_open won't open a mounted block device */

		/*
		 * XXX handle i/o in progress.  Return EBUSY, or wait, or
		 * flush the i/o.
		 * XXX handle multiple opens of the device.  Return EBUSY,
		 * or revoke the fd's.
		 * How are these problems handled for removable and failing
		 * hardware devices? (Hint: They are not)
		 */
		vnclear(vn, &context);
		break;

	/* the four option commands set/clear bits and hand back the resulting mask */
	case VNIOCGSET:
		vn_options |= *f;
		*f = vn_options;
		break;

	case VNIOCGCLEAR:
		vn_options &= ~(*f);
		*f = vn_options;
		break;

	case VNIOCUSET:
		vn->sc_options |= *f;
		*f = vn->sc_options;
		break;

	case VNIOCUCLEAR:
		vn->sc_options &= ~(*f);
		*f = vn->sc_options;
		break;

	default:
		error = ENOTTY;
		break;
	}
 done:
	return(error);
}
1096
1097static	int
1098vnioctl_chr(dev_t dev, u_long cmd, caddr_t data, int flag, proc_t p)
1099{
1100	return (vnioctl(dev, cmd, data, flag, p, TRUE));
1101}
1102
1103static	int
1104vnioctl_blk(dev_t dev, u_long cmd, caddr_t data, int flag, proc_t p)
1105{
1106	return (vnioctl(dev, cmd, data, flag, p, FALSE));
1107}
1108
1109/*
1110 *	vniocattach_file:
1111 *
1112 *	Attach a file to a VN partition.  Return the size in the vn_size
1113 *	field.
1114 */
1115
1116static int
1117vniocattach_file(struct vn_softc *vn,
1118		 struct vn_ioctl_64 *vniop,
1119		 dev_t dev,
1120		 int in_kernel,
1121		 proc_t p)
1122{
1123	dev_t	cdev;
1124	vfs_context_t ctx = vfs_context_current();
1125	kauth_cred_t cred;
1126	struct nameidata nd;
1127	off_t file_size;
1128	int error, flags;
1129
1130	flags = FREAD|FWRITE;
1131	if (in_kernel) {
1132		NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW, UIO_SYSSPACE, vniop->vn_file, ctx);
1133	}
1134	else {
1135		NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW,
1136			   (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
1137			   vniop->vn_file, ctx);
1138	}
1139	/* vn_open gives both long- and short-term references */
1140	error = vn_open(&nd, flags, 0);
1141	if (error) {
1142		if (error != EACCES && error != EPERM && error != EROFS) {
1143			return (error);
1144		}
1145		flags &= ~FWRITE;
1146		if (in_kernel) {
1147			NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW, UIO_SYSSPACE,
1148			       vniop->vn_file, ctx);
1149		}
1150		else {
1151			NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW,
1152				   (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
1153			       vniop->vn_file, ctx);
1154		}
1155		error = vn_open(&nd, flags, 0);
1156		if (error) {
1157			return (error);
1158		}
1159	}
1160	if (nd.ni_vp->v_type != VREG) {
1161		error = EINVAL;
1162	}
1163	else {
1164		error = vnode_size(nd.ni_vp, &file_size, ctx);
1165	}
1166	if (error != 0) {
1167		(void) vn_close(nd.ni_vp, flags, ctx);
1168		vnode_put(nd.ni_vp);
1169		return (error);
1170	}
1171	cred = kauth_cred_proc_ref(p);
1172	nd.ni_vp->v_flag |= VNOCACHE_DATA;
1173	error = setcred(nd.ni_vp, cred);
1174	if (error) {
1175		(void)vn_close(nd.ni_vp, flags, ctx);
1176		vnode_put(nd.ni_vp);
1177		kauth_cred_unref(&cred);
1178		return(error);
1179	}
1180	vn->sc_secsize = DEV_BSIZE;
1181	vn->sc_fsize = file_size;
1182	vn->sc_size = file_size / vn->sc_secsize;
1183	vn->sc_vp = nd.ni_vp;
1184	vn->sc_vid = vnode_vid(nd.ni_vp);
1185	vn->sc_open_flags = flags;
1186	vn->sc_cred = cred;
1187	cdev = makedev(vndevice_cdev_major, minor(dev));
1188	vn->sc_cdev = devfs_make_node(cdev, DEVFS_CHAR,
1189				      UID_ROOT, GID_OPERATOR,
1190				      0600, "rvn%d",
1191				      minor(dev));
1192	vn->sc_flags |= VNF_INITED;
1193	if (flags == FREAD)
1194		vn->sc_flags |= VNF_READONLY;
1195	/* lose the short-term reference */
1196	vnode_put(nd.ni_vp);
1197	return(0);
1198}
1199
1200static int
1201vniocattach_shadow(struct vn_softc *vn, struct vn_ioctl_64 *vniop,
1202				   __unused dev_t dev, int in_kernel, proc_t p)
1203{
1204	vfs_context_t ctx = vfs_context_current();
1205	struct nameidata nd;
1206	int error, flags;
1207	shadow_map_t *	map;
1208	off_t file_size;
1209
1210	flags = FREAD|FWRITE;
1211	if (in_kernel) {
1212		NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW, UIO_SYSSPACE, vniop->vn_file, ctx);
1213	}
1214	else {
1215		NDINIT(&nd, LOOKUP, OP_OPEN, FOLLOW,
1216			   (IS_64BIT_PROCESS(p) ? UIO_USERSPACE64 : UIO_USERSPACE32),
1217			   vniop->vn_file, ctx);
1218	}
1219	/* vn_open gives both long- and short-term references */
1220	error = vn_open(&nd, flags, 0);
1221	if (error) {
1222		/* shadow MUST be writable! */
1223		return (error);
1224	}
1225	if (nd.ni_vp->v_type != VREG
1226	    || (error = vnode_size(nd.ni_vp, &file_size, ctx))) {
1227		(void)vn_close(nd.ni_vp, flags, ctx);
1228		vnode_put(nd.ni_vp);
1229		return (error ? error : EINVAL);
1230	}
1231	map = shadow_map_create(vn->sc_fsize, file_size,
1232				0, vn->sc_secsize);
1233	if (map == NULL) {
1234		(void)vn_close(nd.ni_vp, flags, ctx);
1235		vnode_put(nd.ni_vp);
1236		vn->sc_shadow_vp = NULL;
1237		return (ENOMEM);
1238	}
1239	vn->sc_shadow_vp = nd.ni_vp;
1240	vn->sc_shadow_vid = vnode_vid(nd.ni_vp);
1241	vn->sc_shadow_vp->v_flag |= VNOCACHE_DATA;
1242	vn->sc_shadow_map = map;
1243	vn->sc_flags &= ~VNF_READONLY; /* we're now read/write */
1244
1245	/* lose the short-term reference */
1246	vnode_put(nd.ni_vp);
1247	return(0);
1248}
1249
1250int
1251vndevice_root_image(char * path, char devname[], dev_t * dev_p)
1252{
1253	int 			error = 0;
1254	struct vn_softc *		vn;
1255	struct vn_ioctl_64 	vnio;
1256
1257	vnio.vn_file = CAST_USER_ADDR_T(path);
1258	vnio.vn_size = 0;
1259
1260	vn = vn_table + ROOT_IMAGE_UNIT;
1261	*dev_p = makedev(vndevice_bdev_major,
1262			 ROOT_IMAGE_UNIT);
1263	snprintf(devname, 16, "vn%d", ROOT_IMAGE_UNIT);
1264	error = vniocattach_file(vn, &vnio, *dev_p, 1, current_proc());
1265	return (error);
1266}
1267
1268/*
1269 * Duplicate the current processes' credentials.  Since we are called only
1270 * as the result of a SET ioctl and only root can do that, any future access
1271 * to this "disk" is essentially as root.  Note that credentials may change
1272 * if some other uid can write directly to the mapped file (NFS).
1273 */
1274static int
1275setcred(struct vnode * vp, kauth_cred_t cred)
1276{
1277	char *tmpbuf;
1278	int error = 0;
1279	struct vfs_context  context;
1280
1281	/*
1282	 * Horrible kludge to establish credentials for NFS  XXX.
1283	 */
1284	context.vc_thread = current_thread();
1285	context.vc_ucred = cred;
1286	tmpbuf = _MALLOC(DEV_BSIZE, M_TEMP, M_WAITOK);
1287	error = file_io(vp, &context, UIO_READ, tmpbuf, 0, DEV_BSIZE, NULL);
1288	FREE(tmpbuf, M_TEMP);
1289	return (error);
1290}
1291
1292void
1293vnclear(struct vn_softc *vn, vfs_context_t ctx)
1294{
1295	if (vn->sc_vp != NULL) {
1296		/* release long-term reference */
1297		(void)vn_close(vn->sc_vp, vn->sc_open_flags, ctx);
1298		vn->sc_vp = NULL;
1299	}
1300	if (vn->sc_shadow_vp != NULL) {
1301		/* release long-term reference */
1302		(void)vn_close(vn->sc_shadow_vp, FREAD | FWRITE, ctx);
1303		vn->sc_shadow_vp = NULL;
1304	}
1305	if (vn->sc_shadow_map != NULL) {
1306		shadow_map_free(vn->sc_shadow_map);
1307		vn->sc_shadow_map = NULL;
1308	}
1309	vn->sc_flags &= ~(VNF_INITED | VNF_READONLY);
1310	if (vn->sc_cred) {
1311		kauth_cred_unref(&vn->sc_cred);
1312	}
1313	vn->sc_size = 0;
1314	vn->sc_fsize = 0;
1315	if (vn->sc_cdev) {
1316		devfs_remove(vn->sc_cdev);
1317		vn->sc_cdev = NULL;
1318	}
1319}
1320
1321static	int
1322vnsize(dev_t dev)
1323{
1324	int	secsize;
1325	struct vn_softc *vn;
1326	int unit;
1327
1328	unit = vnunit(dev);
1329	if (vnunit(dev) >= NVNDEVICE) {
1330		return (-1);
1331	}
1332
1333	vn = vn_table + unit;
1334	if ((vn->sc_flags & VNF_INITED) == 0)
1335		secsize = -1;
1336	else
1337		secsize = vn->sc_secsize;
1338
1339	return (secsize);
1340}
1341
/*
 * Major numbers requested when registering the block and character
 * switch entries (presumably -1 asks for dynamic assignment -- confirm
 * against bdevsw_add()/cdevsw_add_with_bdev()).
 */
#define CDEV_MAJOR 	-1
#define BDEV_MAJOR 	-1

/* Non-zero once vndevice_init() has run; prevents repeated registration. */
static int vndevice_inited = 0;

/*
 * vndevice_init:
 *
 * One-time initialisation of the VN driver.  The first call registers
 * the device switch entries and creates the block device nodes through
 * vndevice_do_init(); subsequent calls return immediately.
 */
void
vndevice_init(void)
{
	if (vndevice_inited)
		return;

	/*
	 * Latch the guard before doing the work.  Previously the flag was
	 * tested but never set, so a second call would have registered the
	 * switch entries and devfs nodes all over again.
	 */
	vndevice_inited = 1;

	vndevice_do_init();
}
1354
1355static void
1356vndevice_do_init( void )
1357{
1358	int 	i;
1359
1360	vndevice_bdev_major = bdevsw_add(BDEV_MAJOR, &vn_bdevsw);
1361
1362	if (vndevice_bdev_major < 0) {
1363		printf("vndevice_init: bdevsw_add() returned %d\n",
1364		       vndevice_bdev_major);
1365		return;
1366	}
1367	vndevice_cdev_major = cdevsw_add_with_bdev(CDEV_MAJOR, &vn_cdevsw,
1368						   vndevice_bdev_major);
1369	if (vndevice_cdev_major < 0) {
1370		printf("vndevice_init: cdevsw_add() returned %d\n",
1371		       vndevice_cdev_major);
1372		return;
1373	}
1374	for (i = 0; i < NVNDEVICE; i++) {
1375		dev_t	dev = makedev(vndevice_bdev_major, i);
1376		vn_table[i].sc_bdev = devfs_make_node(dev, DEVFS_BLOCK,
1377						      UID_ROOT, GID_OPERATOR,
1378						      0600, "vn%d",
1379						      i);
1380		if (vn_table[i].sc_bdev == NULL)
1381			printf("vninit: devfs_make_node failed!\n");
1382	}
1383}
1384
1385static void
1386vn_ioctl_to_64(struct vn_ioctl_32 *from, struct vn_ioctl_64 *to)
1387{
1388	to->vn_file = CAST_USER_ADDR_T(from->vn_file);
1389	to->vn_size = from->vn_size;
1390	to->vn_control = from->vn_control;
1391}
1392
1393#endif /* NVNDEVICE */
1394