null_vnops.c revision 67145
/*
 * Copyright (c) 1992, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * John Heidemann of the UCLA Ficus project.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)null_vnops.c	8.6 (Berkeley) 5/27/95
 *
 * Ancestors:
 *	@(#)lofs_vnops.c	1.2 (Berkeley) 6/18/92
 *	...and...
 *	@(#)null_vnodeops.c 1.20 92/07/07 UCLA Ficus project
 *
 * $FreeBSD: head/sys/fs/nullfs/null_vnops.c 67145 2000-10-15 06:25:42Z bp $
 */

/*
 * Null Layer
 *
 * (See mount_null(8) for more information.)
 *
 * The null layer duplicates a portion of the file system
 * name space under a new name.  In this respect, it is
 * similar to the loopback file system.  It differs from
 * the loopback fs in two respects:  it is implemented using
 * stackable-layer techniques, and its "null-node"s stack above
 * all lower-layer vnodes, not just over directory vnodes.
 *
 * The null layer has two purposes.  First, it serves as a demonstration
 * of layering by providing a layer which does nothing.  (It actually
 * does everything the loopback file system does, which is slightly
 * more than nothing.)  Second, the null layer can serve as a prototype
 * layer.  Since it provides all necessary layer framework,
 * new file system layers can be created very easily by starting
 * with a null layer.
 *
 * The remainder of this comment examines the null layer as a basis
 * for constructing new layers.
 *
 *
 * INSTANTIATING NEW NULL LAYERS
 *
 * New null layers are created with mount_null(8).
 * Mount_null(8) takes two arguments, the pathname
 * of the lower vfs (target-pn) and the pathname where the null
 * layer will appear in the namespace (alias-pn).  After
 * the null layer is put into place, the contents
 * of the target-pn subtree will be aliased under alias-pn.
 *
 *
 * OPERATION OF A NULL LAYER
 *
 * The null layer is the minimum file system layer,
 * simply bypassing all possible operations to the lower layer
 * for processing there.  The majority of its activity centers
 * on the bypass routine, through which nearly all vnode operations
 * pass.
 *
 * The bypass routine accepts arbitrary vnode operations for
 * handling by the lower layer.  It begins by examining the vnode
 * operation arguments and replacing any null-nodes by their
 * lower-layer equivalents.  It then invokes the operation
 * on the lower layer.  Finally, it restores the null-nodes
 * in the arguments and, if a vnode is returned by the operation,
 * stacks a null-node on top of the returned vnode.
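 *
 * As an illustrative sketch only (the real logic is in null_bypass()
 * below), a bypassed getattr amounts to roughly the following, where
 * the names are shorthand rather than actual code:
 *
 *	lowervp = NULLVPTOLOWERVP(nullvp);	map the argument down
 *	error = VOP_GETATTR(lowervp, vap, cred, p);
 *	(restore nullvp in the argument structure)
 *	return (error);
 *
 * Operations that return a vnode (lookup, create, ...) additionally
 * have a null-node stacked over the returned lower vnode, via
 * null_node_create(), before the result is handed back to the caller.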
 *
 * Although bypass handles most operations, vop_getattr, vop_lock,
 * vop_unlock, vop_inactive, vop_reclaim, and vop_print are not
 * bypassed. Vop_getattr must change the fsid being returned.
 * Vop_lock and vop_unlock must handle any locking for the
 * current vnode as well as pass the lock request down.
 * Vop_inactive and vop_reclaim are not bypassed so that
 * they can handle freeing null-layer specific data. Vop_print
 * is not bypassed to avoid excessive debugging information.
 * Also, certain vnode operations change the locking state within
 * the operation (create, mknod, remove, link, rename, mkdir, rmdir,
 * and symlink). Ideally these operations should not change the
 * lock state; instead they should be changed to let the caller of the
 * function unlock them. Otherwise all intermediate vnode layers
 * (such as union, umapfs, etc.) must catch these functions to do
 * the necessary locking at their layer.
 *
 *
 * INSTANTIATING VNODE STACKS
 *
 * Mounting associates the null layer with a lower layer,
 * in effect stacking two VFSes.  Vnode stacks, by contrast, are
 * created on demand as files are accessed.
 *
 * The initial mount creates a single vnode stack for the
 * root of the new null layer.  All other vnode stacks
 * are created as a result of vnode operations on
 * this or other null vnode stacks.
 *
 * New vnode stacks come into existence as a result of
 * an operation which returns a vnode.
 * The bypass routine stacks a null-node above the new
 * vnode before returning it to the caller.
 *
 * For example, imagine mounting a null layer with
 * "mount_null /usr/include /dev/layer/null".
 * Changing directory to /dev/layer/null will assign
 * the root null-node (which was created when the null layer was mounted).
 * Now consider opening "sys".  A vop_lookup would be
 * done on the root null-node.  This operation would bypass through
 * to the lower layer which would return a vnode representing
 * the UFS "sys".  Null_bypass then builds a null-node
 * aliasing the UFS "sys" and returns this to the caller.
 * Later operations on the null-node "sys" will repeat this
 * process when constructing other vnode stacks.
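 *
 * Schematically, and only as a sketch of the steps described above
 * (the actual code is in null_lookup() and null_bypass() below), the
 * lookup of "sys" amounts to:
 *
 *	ldvp = NULLVPTOLOWERVP(null_root_vp);	find the lower directory
 *	error = VOP_LOOKUP(ldvp, &lvp, cnp);	lower layer returns the
 *						UFS vnode for "sys"
 *	error = null_node_create(mp, lvp, &vp);	stack a null-node over it
 *	*vpp = vp;				hand the alias back
 *
 * Here null_root_vp, mp, cnp and vpp are stand-ins for the usual
 * lookup arguments, not identifiers used elsewhere in this file.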
 *
 *
 * CREATING OTHER FILE SYSTEM LAYERS
 *
 * One of the easiest ways to construct new file system layers is to make
 * a copy of the null layer, rename all files and variables, and
 * then begin modifying the copy.  Sed can be used to rename
 * all variables easily.
 *
 * The umap layer is an example of a layer descended from the
 * null layer.
 *
 *
 * INVOKING OPERATIONS ON LOWER LAYERS
 *
 * There are two techniques to invoke operations on a lower layer
 * when the operation cannot be completely bypassed.  Each method
 * is appropriate in different situations.  In both cases,
 * it is the responsibility of the aliasing layer to make
 * the operation arguments "correct" for the lower layer
 * by mapping any vnode arguments to the lower layer.
 *
 * The first approach is to call the aliasing layer's bypass routine.
 * This method is most suitable when you wish to invoke the operation
 * currently being handled on the lower layer.  It has the advantage
 * that the bypass routine already must do argument mapping.
 * An example of this is null_getattr in the null layer.
 *
 * A second approach is to directly invoke vnode operations on
 * the lower layer with the VOP_OPERATIONNAME interface.
 * The advantage of this method is that it is easy to invoke
 * arbitrary operations on the lower layer.  The disadvantage
 * is that vnode arguments must be manually mapped.
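 *
 * As a rough sketch only (example_getattr is a hypothetical name, not
 * a routine in this file), the first approach looks like:
 *
 *	static int
 *	example_getattr(struct vop_getattr_args *ap)
 *	{
 *		int error;
 *
 *		if ((error = null_bypass((struct vop_generic_args *)ap)) != 0)
 *			return (error);
 *		(adjust the results here)
 *		return (0);
 *	}
 *
 * while the second approach maps the vnode argument by hand and calls
 * the lower layer directly:
 *
 *	error = VOP_GETATTR(NULLVPTOLOWERVP(ap->a_vp), ap->a_vap,
 *	    ap->a_cred, ap->a_p);
 *
 * The first form inherits the bypass routine's argument mapping; the
 * second makes it easy to invoke operations that the current one would
 * not otherwise reach.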
 *
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/malloc.h>
#include <miscfs/nullfs/null.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_object.h>
#include <vm/vnode_pager.h>

static int null_bug_bypass = 0;   /* for debugging: enables bypass printf'ing */
SYSCTL_INT(_debug, OID_AUTO, nullfs_bug_bypass, CTLFLAG_RW,
	&null_bug_bypass, 0, "");

static int	null_access(struct vop_access_args *ap);
static int	null_createvobject(struct vop_createvobject_args *ap);
static int	null_destroyvobject(struct vop_destroyvobject_args *ap);
static int	null_getattr(struct vop_getattr_args *ap);
static int	null_getvobject(struct vop_getvobject_args *ap);
static int	null_inactive(struct vop_inactive_args *ap);
static int	null_islocked(struct vop_islocked_args *ap);
static int	null_lock(struct vop_lock_args *ap);
static int	null_lookup(struct vop_lookup_args *ap);
static int	null_open(struct vop_open_args *ap);
static int	null_print(struct vop_print_args *ap);
static int	null_reclaim(struct vop_reclaim_args *ap);
static int	null_rename(struct vop_rename_args *ap);
static int	null_setattr(struct vop_setattr_args *ap);
static int	null_unlock(struct vop_unlock_args *ap);

/*
 * This is the 10-Apr-92 bypass routine.
 *    This version has been optimized for speed, throwing away some
 * safety checks.  It should still always work, but it's not as
 * robust to programmer errors.
 *
 * In general, we map all vnodes going down and unmap them on the way back.
 * As an exception to this, vnodes can be marked "unmapped" by setting
 * the Nth bit in the operation's vdesc_flags.
 *
 * Also, some BSD vnode operations have the side effect of vrele'ing
 * their arguments.  With stacking, the reference counts are held
 * by the upper node, not the lower one, so we must handle these
 * side-effects here.  This is not of concern in Sun-derived systems
 * since there are no such side-effects.
 *
 * This makes the following assumptions:
 * - only one returned vpp
 * - no INOUT vpp's (Sun's vop_open has one of these)
 * - the vnode operation vector of the first vnode should be used
 *   to determine what implementation of the op should be invoked
 * - all mapped vnodes are of our vnode-type (NEEDSWORK:
 *   problems on rmdir'ing mount points and renaming?)
 */
int
null_bypass(ap)
	struct vop_generic_args /* {
		struct vnodeop_desc *a_desc;
		<other random data follows, presumably>
	} */ *ap;
{
	register struct vnode **this_vp_p;
	int error;
	struct vnode *old_vps[VDESC_MAX_VPS];
	struct vnode **vps_p[VDESC_MAX_VPS];
	struct vnode ***vppp;
	struct vnodeop_desc *descp = ap->a_desc;
	int reles, i;

	if (null_bug_bypass)
		printf ("null_bypass: %s\n", descp->vdesc_name);

#ifdef DIAGNOSTIC
	/*
	 * We require at least one vp.
	 */
	if (descp->vdesc_vp_offsets == NULL ||
	    descp->vdesc_vp_offsets[0] == VDESC_NO_OFFSET)
		panic ("null_bypass: no vp's in map");
#endif

	/*
	 * Map the vnodes going in.
	 * Later, we'll invoke the operation based on
	 * the first mapped vnode's operation vector.
	 */
	reles = descp->vdesc_flags;
	for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) {
		if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET)
			break;   /* bail out at end of list */
		vps_p[i] = this_vp_p =
			VOPARG_OFFSETTO(struct vnode**,descp->vdesc_vp_offsets[i],ap);
		/*
		 * We're not guaranteed that any but the first vnode
		 * are of our type.  Check for and don't map any
		 * that aren't.  (We must always map first vp or vclean fails.)
		 */
		if (i && (*this_vp_p == NULLVP ||
		    (*this_vp_p)->v_op != null_vnodeop_p)) {
			old_vps[i] = NULLVP;
		} else {
			old_vps[i] = *this_vp_p;
			*(vps_p[i]) = NULLVPTOLOWERVP(*this_vp_p);
			/*
			 * XXX - Several operations have the side effect
			 * of vrele'ing their vp's.  We must account for
			 * that.  (This should go away in the future.)
			 */
			if (reles & VDESC_VP0_WILLRELE)
				VREF(*this_vp_p);
		}

	}

	/*
	 * Call the operation on the lower layer
	 * with the modified argument structure.
	 */
	if (vps_p[0] && *vps_p[0])
		error = VCALL(*(vps_p[0]), descp->vdesc_offset, ap);
	else {
		printf("null_bypass: no map for %s\n", descp->vdesc_name);
		error = EINVAL;
	}

	/*
	 * Maintain the illusion of call-by-value
	 * by restoring vnodes in the argument structure
	 * to their original value.
	 */
	reles = descp->vdesc_flags;
	for (i = 0; i < VDESC_MAX_VPS; reles >>= 1, i++) {
		if (descp->vdesc_vp_offsets[i] == VDESC_NO_OFFSET)
			break;   /* bail out at end of list */
		if (old_vps[i]) {
			*(vps_p[i]) = old_vps[i];
#if 0
			if (reles & VDESC_VP0_WILLUNLOCK)
				VOP_UNLOCK(*(vps_p[i]), LK_THISLAYER, curproc);
#endif
			if (reles & VDESC_VP0_WILLRELE)
				vrele(*(vps_p[i]));
		}
	}

	/*
	 * Map the possible out-going vpp
	 * (Assumes that the lower layer always returns
	 * a VREF'ed vpp unless it gets an error.)
	 */
	if (descp->vdesc_vpp_offset != VDESC_NO_OFFSET &&
	    !(descp->vdesc_flags & VDESC_NOMAP_VPP) &&
	    !error) {
		/*
		 * XXX - even though some ops have vpp returned vp's,
		 * several ops actually vrele this before returning.
		 * We must avoid these ops.
		 * (This should go away when these ops are regularized.)
		 */
		if (descp->vdesc_flags & VDESC_VPP_WILLRELE)
			goto out;
		vppp = VOPARG_OFFSETTO(struct vnode***,
				 descp->vdesc_vpp_offset,ap);
		if (*vppp)
			error = null_node_create(old_vps[0]->v_mount, **vppp, *vppp);
	}

 out:
	return (error);
}

/*
 * We have to carry on the locking protocol on the null layer vnodes
 * as we progress through the tree. We also have to enforce read-only
 * if this layer is mounted read-only.
 */
static int
null_lookup(ap)
	struct vop_lookup_args /* {
		struct vnode * a_dvp;
		struct vnode ** a_vpp;
		struct componentname * a_cnp;
	} */ *ap;
{
	struct componentname *cnp = ap->a_cnp;
	struct vnode *dvp = ap->a_dvp;
	struct proc *p = cnp->cn_proc;
	int flags = cnp->cn_flags;
	struct vnode *vp, *ldvp, *lvp;
	int error;

	if ((flags & ISLASTCN) && (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
	    (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME))
		return (EROFS);
	/*
	 * Although it is possible to call null_bypass(), we make
	 * a direct call here to reduce overhead.
	 */
	ldvp = NULLVPTOLOWERVP(dvp);
	vp = lvp = NULL;
	error = VOP_LOOKUP(ldvp, &lvp, cnp);
	if (error == EJUSTRETURN && (flags & ISLASTCN) &&
	    (dvp->v_mount->mnt_flag & MNT_RDONLY) &&
	    (cnp->cn_nameiop == CREATE || cnp->cn_nameiop == RENAME))
		error = EROFS;

	/*
	 * Rely only on the PDIRUNLOCK flag, which should be carefully
	 * tracked by the underlying filesystem.
	 */
	if (cnp->cn_flags & PDIRUNLOCK)
		VOP_UNLOCK(dvp, LK_THISLAYER, p);
	if ((error == 0 || error == EJUSTRETURN) && lvp != NULL) {
		if (ldvp == lvp) {
			*ap->a_vpp = dvp;
			VREF(dvp);
			vrele(lvp);
		} else {
			error = null_node_create(dvp->v_mount, lvp, &vp);
			if (error == 0)
				*ap->a_vpp = vp;
		}
	}
	return (error);
}

/*
 * Setattr call. Disallow write attempts if the layer is mounted read-only.
 */
int
null_setattr(ap)
	struct vop_setattr_args /* {
		struct vnodeop_desc *a_desc;
		struct vnode *a_vp;
		struct vattr *a_vap;
		struct ucred *a_cred;
		struct proc *a_p;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	struct vattr *vap = ap->a_vap;

	if ((vap->va_flags != VNOVAL || vap->va_uid != (uid_t)VNOVAL ||
	    vap->va_gid != (gid_t)VNOVAL || vap->va_atime.tv_sec != VNOVAL ||
	    vap->va_mtime.tv_sec != VNOVAL || vap->va_mode != (mode_t)VNOVAL) &&
	    (vp->v_mount->mnt_flag & MNT_RDONLY))
		return (EROFS);
	if (vap->va_size != VNOVAL) {
		switch (vp->v_type) {
		case VDIR:
			return (EISDIR);
		case VCHR:
		case VBLK:
		case VSOCK:
		case VFIFO:
			if (vap->va_flags != VNOVAL)
				return (EOPNOTSUPP);
			return (0);
		case VREG:
		case VLNK:
		default:
			/*
			 * Disallow write attempts if the filesystem is
			 * mounted read-only.
			 */
			if (vp->v_mount->mnt_flag & MNT_RDONLY)
				return (EROFS);
		}
	}

	return (null_bypass((struct vop_generic_args *)ap));
}

/*
 * We handle getattr only to change the fsid.
 */
static int
null_getattr(ap)
	struct vop_getattr_args /* {
		struct vnode *a_vp;
		struct vattr *a_vap;
		struct ucred *a_cred;
		struct proc *a_p;
	} */ *ap;
{
	int error;

	if ((error = null_bypass((struct vop_generic_args *)ap)) != 0)
		return (error);

	ap->a_vap->va_fsid = ap->a_vp->v_mount->mnt_stat.f_fsid.val[0];
	return (0);
}

/*
 * Handle access() ourselves to disallow write access if the layer is
 * mounted read-only.
 */
static int
null_access(ap)
	struct vop_access_args /* {
		struct vnode *a_vp;
		int  a_mode;
		struct ucred *a_cred;
		struct proc *a_p;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	mode_t mode = ap->a_mode;

	/*
	 * Disallow write attempts on read-only layers unless the file
	 * is a socket, fifo, or a block or character device resident
	 * on the file system.
	 */
	if (mode & VWRITE) {
		switch (vp->v_type) {
		case VDIR:
		case VLNK:
		case VREG:
			if (vp->v_mount->mnt_flag & MNT_RDONLY)
				return (EROFS);
			break;
		default:
			break;
		}
	}
	return (null_bypass((struct vop_generic_args *)ap));
}

/*
 * We must handle open to be able to catch MNT_NODEV and friends.
 */
static int
null_open(ap)
	struct vop_open_args /* {
		struct vnode *a_vp;
		int  a_mode;
		struct ucred *a_cred;
		struct proc *a_p;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	struct vnode *lvp = NULLVPTOLOWERVP(ap->a_vp);

	if ((vp->v_mount->mnt_flag & MNT_NODEV) &&
	    (lvp->v_type == VBLK || lvp->v_type == VCHR))
		return (ENXIO);

	return (null_bypass((struct vop_generic_args *)ap));
}

/*
 * We handle this to prevent moving files between the null FS and the
 * lower FS.  It is not clear why this is disallowed; possibly it
 * should be permitted.
 */
static int
null_rename(ap)
	struct vop_rename_args /* {
		struct vnode *a_fdvp;
		struct vnode *a_fvp;
		struct componentname *a_fcnp;
		struct vnode *a_tdvp;
		struct vnode *a_tvp;
		struct componentname *a_tcnp;
	} */ *ap;
{
	struct vnode *tdvp = ap->a_tdvp;
	struct vnode *fvp = ap->a_fvp;
	struct vnode *fdvp = ap->a_fdvp;
	struct vnode *tvp = ap->a_tvp;

	/* Check for cross-device rename. */
	if ((fvp->v_mount != tdvp->v_mount) ||
	    (tvp && (fvp->v_mount != tvp->v_mount))) {
		if (tdvp == tvp)
			vrele(tdvp);
		else
			vput(tdvp);
		if (tvp)
			vput(tvp);
		vrele(fdvp);
		vrele(fvp);
		return (EXDEV);
	}

	return (null_bypass((struct vop_generic_args *)ap));
}

/*
 * We need to process our own vnode lock and then clear the
 * interlock flag as it applies only to our vnode, not the
 * vnodes below us on the stack.
 */
static int
null_lock(ap)
	struct vop_lock_args /* {
		struct vnode *a_vp;
		int a_flags;
		struct proc *a_p;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	int flags = ap->a_flags;
	struct proc *p = ap->a_p;
	struct vnode *lvp;
	int error;

	if (flags & LK_THISLAYER) {
		if (vp->v_vnlock != NULL)
			return 0;	/* lock is shared across layers */
		error = lockmgr(&vp->v_lock, flags & ~LK_THISLAYER,
		    &vp->v_interlock, p);
		return (error);
	}

	if (vp->v_vnlock != NULL) {
		/*
		 * The lower level has exported a struct lock to us. Use
		 * it so that all vnodes in the stack lock and unlock
		 * simultaneously. Note: we don't DRAIN the lock as DRAIN
		 * decommissions the lock - just because our vnode is
		 * going away doesn't mean the struct lock below us is.
		 * LK_EXCLUSIVE is fine.
		 */
		if ((flags & LK_TYPE_MASK) == LK_DRAIN) {
			NULLFSDEBUG("null_lock: avoiding LK_DRAIN\n");
			return(lockmgr(vp->v_vnlock,
				(flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE,
				&vp->v_interlock, p));
		}
		return(lockmgr(vp->v_vnlock, flags, &vp->v_interlock, p));
	} else {
		/*
		 * To prevent race conditions involving doing a lookup
		 * on "..", we have to lock the lower node, then lock our
		 * node. Most of the time it won't matter that we lock our
		 * node (as any locking would need the lower one locked
		 * first). But we can LK_DRAIN the upper lock as a step
		 * towards decommissioning it.
		 */
		lvp = NULLVPTOLOWERVP(vp);
		if (lvp == NULL)
			return (lockmgr(&vp->v_lock, flags, &vp->v_interlock, p));
		if (flags & LK_INTERLOCK) {
			mtx_exit(&vp->v_interlock, MTX_DEF);
			flags &= ~LK_INTERLOCK;
		}
		if ((flags & LK_TYPE_MASK) == LK_DRAIN) {
			error = VOP_LOCK(lvp,
				(flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE, p);
		} else
			error = VOP_LOCK(lvp, flags, p);
		if (error)
			return (error);
		error = lockmgr(&vp->v_lock, flags, &vp->v_interlock, p);
		if (error)
			VOP_UNLOCK(lvp, 0, p);
		return (error);
	}
}

/*
 * We need to process our own vnode unlock and then clear the
 * interlock flag as it applies only to our vnode, not the
 * vnodes below us on the stack.
 */
static int
null_unlock(ap)
	struct vop_unlock_args /* {
		struct vnode *a_vp;
		int a_flags;
		struct proc *a_p;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	int flags = ap->a_flags;
	struct proc *p = ap->a_p;
	struct vnode *lvp;

	if (vp->v_vnlock != NULL) {
		if (flags & LK_THISLAYER)
			return 0;	/* the lock is shared across layers */
		flags &= ~LK_THISLAYER;
		return (lockmgr(vp->v_vnlock, flags | LK_RELEASE,
			&vp->v_interlock, p));
	}
	lvp = NULLVPTOLOWERVP(vp);
	if (lvp == NULL)
		return (lockmgr(&vp->v_lock, flags | LK_RELEASE, &vp->v_interlock, p));
	if ((flags & LK_THISLAYER) == 0) {
		if (flags & LK_INTERLOCK) {
			mtx_exit(&vp->v_interlock, MTX_DEF);
			flags &= ~LK_INTERLOCK;
		}
		VOP_UNLOCK(lvp, flags & ~LK_INTERLOCK, p);
	} else
		flags &= ~LK_THISLAYER;
	return (lockmgr(&vp->v_lock, flags | LK_RELEASE, &vp->v_interlock, p));
}

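/*
 * Report our lock status.  If the lower layer has exported its lock
 * to us (vp->v_vnlock is set), report the status of that shared lock;
 * otherwise report the status of our own lock.
 */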
static int
null_islocked(ap)
	struct vop_islocked_args /* {
		struct vnode *a_vp;
		struct proc *a_p;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	struct proc *p = ap->a_p;

	if (vp->v_vnlock != NULL)
		return (lockstatus(vp->v_vnlock, p));
	return (lockstatus(&vp->v_lock, p));
}

/*
 * There is no way to tell that someone issued a remove/rmdir operation
 * on the underlying filesystem.  For now we just have to release lowervp
 * as soon as possible.
 */
static int
null_inactive(ap)
	struct vop_inactive_args /* {
		struct vnode *a_vp;
		struct proc *a_p;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	struct proc *p = ap->a_p;
	struct null_node *xp = VTONULL(vp);
	struct vnode *lowervp = xp->null_lowervp;

	lockmgr(&null_hashlock, LK_EXCLUSIVE, NULL, p);
	LIST_REMOVE(xp, null_hash);
	lockmgr(&null_hashlock, LK_RELEASE, NULL, p);

	xp->null_lowervp = NULLVP;
	if (vp->v_vnlock != NULL) {
		vp->v_vnlock = &vp->v_lock;	/* we no longer share the lock */
	} else
		VOP_UNLOCK(vp, LK_THISLAYER, p);

	vput(lowervp);
	/*
	 * Now it is safe to drop references to the lower vnode.
	 * VOP_INACTIVE() will be called by vrele() if necessary.
	 */
	vrele(lowervp);

	return (0);
}

/*
 * We could free the null node's memory in null_inactive, but we free
 * it here instead.  (It would be possible to guard vp->v_data so that
 * it points somewhere safe.)
 */
static int
null_reclaim(ap)
	struct vop_reclaim_args /* {
		struct vnode *a_vp;
		struct proc *a_p;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	void *vdata = vp->v_data;

	vp->v_data = NULL;
	FREE(vdata, M_NULLFSNODE);

	return (0);
}

static int
null_print(ap)
	struct vop_print_args /* {
		struct vnode *a_vp;
	} */ *ap;
{
	register struct vnode *vp = ap->a_vp;
	printf ("\ttag VT_NULLFS, vp=%p, lowervp=%p\n", vp, NULLVPTOLOWERVP(vp));
	return (0);
}

/*
 * Let an underlying filesystem do the work
 */
static int
null_createvobject(ap)
	struct vop_createvobject_args /* {
		struct vnode *vp;
		struct ucred *cred;
		struct proc *p;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;
	struct vnode *lowervp = VTONULL(vp) ? NULLVPTOLOWERVP(vp) : NULL;
	int error;

	if (vp->v_type == VNON || lowervp == NULL)
		return 0;
	error = VOP_CREATEVOBJECT(lowervp, ap->a_cred, ap->a_p);
	if (error)
		return (error);
	vp->v_flag |= VOBJBUF;
	return (0);
}

/*
 * We have nothing to destroy and this operation shouldn't be bypassed.
 */
static int
null_destroyvobject(ap)
	struct vop_destroyvobject_args /* {
		struct vnode *vp;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;

	vp->v_flag &= ~VOBJBUF;
	return (0);
}

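/*
 * Return the lower layer's VM object; the null layer has no VM object
 * of its own.
 */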
static int
null_getvobject(ap)
	struct vop_getvobject_args /* {
		struct vnode *vp;
		struct vm_object **objpp;
	} */ *ap;
{
	struct vnode *lvp = NULLVPTOLOWERVP(ap->a_vp);

	if (lvp == NULL)
		return EINVAL;
	return (VOP_GETVOBJECT(lvp, ap->a_objpp));
}

/*
 * Global vfs data structures
 */
vop_t **null_vnodeop_p;
static struct vnodeopv_entry_desc null_vnodeop_entries[] = {
	{ &vop_default_desc,		(vop_t *) null_bypass },

	{ &vop_access_desc,		(vop_t *) null_access },
	{ &vop_bmap_desc,		(vop_t *) vop_eopnotsupp },
	{ &vop_createvobject_desc,	(vop_t *) null_createvobject },
	{ &vop_destroyvobject_desc,	(vop_t *) null_destroyvobject },
	{ &vop_getattr_desc,		(vop_t *) null_getattr },
	{ &vop_getvobject_desc,		(vop_t *) null_getvobject },
	{ &vop_getwritemount_desc,	(vop_t *) vop_stdgetwritemount },
	{ &vop_inactive_desc,		(vop_t *) null_inactive },
	{ &vop_islocked_desc,		(vop_t *) null_islocked },
	{ &vop_lock_desc,		(vop_t *) null_lock },
	{ &vop_lookup_desc,		(vop_t *) null_lookup },
	{ &vop_open_desc,		(vop_t *) null_open },
	{ &vop_print_desc,		(vop_t *) null_print },
	{ &vop_reclaim_desc,		(vop_t *) null_reclaim },
	{ &vop_rename_desc,		(vop_t *) null_rename },
	{ &vop_setattr_desc,		(vop_t *) null_setattr },
	{ &vop_strategy_desc,		(vop_t *) vop_eopnotsupp },
	{ &vop_unlock_desc,		(vop_t *) null_unlock },
	{ NULL, NULL }
};
static struct vnodeopv_desc null_vnodeop_opv_desc =
	{ &null_vnodeop_p, null_vnodeop_entries };

VNODEOP_SET(null_vnodeop_opv_desc);