/*-
 * Copyright (c) 1994 Jan-Simon Pendry
 * Copyright (c) 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Jan-Simon Pendry.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)union_subr.c	8.20 (Berkeley) 5/20/95
 * $FreeBSD: head/sys/fs/unionfs/union_subr.c 139776 2005-01-06 18:10:42Z imp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/stat.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>	/* for vnode_pager_setsize */
#include <vm/vm_object.h>	/* for vm cache coherency */
#include <vm/uma.h>

#include <fs/unionfs/union.h>

#include <sys/proc.h>

extern int	union_init(void);

/* must be power of two, otherwise change UNION_HASH() */
#define NHASH 32

/* unsigned int ... */
#define UNION_HASH(u, l) \
	(((((uintptr_t) (u)) + ((uintptr_t) l)) >> 8) & (NHASH-1))
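/*
 * The hash key is the sum of the upper and lower vnode pointers, shifted
 * right 8 bits to drop low-order bits (which presumably carry little
 * entropy for allocator-aligned vnodes) and masked to NHASH-1.  A node
 * known by only one layer hashes with NULLVP (zero) for the missing side.
 */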
69
70static MALLOC_DEFINE(M_UNPATH, "unpath", "UNION path component");
71static MALLOC_DEFINE(M_UNDCACHE, "undcac", "UNION directory cache");
72
73static LIST_HEAD(unhead, union_node) unhead[NHASH];
74static int unvplock[NHASH];
75
76static void	union_dircache_r(struct vnode *vp, struct vnode ***vppp,
77				      int *cntp);
78static int	union_list_lock(int ix);
79static void	union_list_unlock(int ix);
80static int	union_relookup(struct union_mount *um, struct vnode *dvp,
81				    struct vnode **vpp,
82				    struct componentname *cnp,
83				    struct componentname *cn, char *path,
84				    int pathlen);
85static void	union_updatevp(struct union_node *un,
86				    struct vnode *uppervp,
87				    struct vnode *lowervp);
88static void union_newlower(struct union_node *, struct vnode *);
89static void union_newupper(struct union_node *, struct vnode *);
90static int union_copyfile(struct vnode *, struct vnode *,
91					struct ucred *, struct thread *);
92static int union_vn_create(struct vnode **, struct union_node *,
93				struct thread *);
94static int union_vn_close(struct vnode *, int, struct ucred *,
95				struct thread *);
96
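/*
 * Initialize the hash chains and per-chain lock flags.  This is expected
 * to run once, from the unionfs VFS initialization path.
 */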
int
union_init()
{
	int i;

	for (i = 0; i < NHASH; i++)
		LIST_INIT(&unhead[i]);
	bzero((caddr_t)unvplock, sizeof(unvplock));
	return (0);
}

static int
union_list_lock(ix)
	int ix;
{
	if (unvplock[ix] & UNVP_LOCKED) {
		unvplock[ix] |= UNVP_WANT;
		(void) tsleep(&unvplock[ix], PINOD, "unllck", 0);
		return (1);
	}
	unvplock[ix] |= UNVP_LOCKED;
	return (0);
}
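
/*
 * union_list_lock() returns non-zero when it had to sleep, in which case
 * the state may have changed and the caller must retry; hence the idiom
 * used throughout this file:
 *
 *	while (union_list_lock(hash))
 *		continue;
 *
 * which loops until the chain lock is actually acquired.
 */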

static void
union_list_unlock(ix)
	int ix;
{
	unvplock[ix] &= ~UNVP_LOCKED;

	if (unvplock[ix] & UNVP_WANT) {
		unvplock[ix] &= ~UNVP_WANT;
		wakeup(&unvplock[ix]);
	}
}

/*
 *	union_updatevp:
 *
 *	The uppervp, if not NULL, must be referenced and not locked by us.
 *	The lowervp, if not NULL, must be referenced.
 *
 *	If uppervp and lowervp match pointers already installed, then
 *	nothing happens. The passed vp's (when matching) are not adjusted.
 *
 *	This routine may only be called by union_newupper() and
 *	union_newlower().
 */

static void
union_updatevp(un, uppervp, lowervp)
	struct union_node *un;
	struct vnode *uppervp;
	struct vnode *lowervp;
{
	int ohash = UNION_HASH(un->un_uppervp, un->un_lowervp);
	int nhash = UNION_HASH(uppervp, lowervp);
	int docache = (lowervp != NULLVP || uppervp != NULLVP);
	int lhash, uhash;

	/*
	 * Ensure locking is ordered from lower to higher
	 * to avoid deadlocks.
	 */
	if (nhash < ohash) {
		lhash = nhash;
		uhash = ohash;
	} else {
		lhash = ohash;
		uhash = nhash;
	}

	if (lhash != uhash) {
		while (union_list_lock(lhash))
			continue;
	}

	while (union_list_lock(uhash))
		continue;

	if (ohash != nhash || !docache) {
		if (un->un_flags & UN_CACHED) {
			un->un_flags &= ~UN_CACHED;
			LIST_REMOVE(un, un_cache);
		}
	}

	if (ohash != nhash)
		union_list_unlock(ohash);

	if (un->un_lowervp != lowervp) {
		if (un->un_lowervp) {
			vrele(un->un_lowervp);
			if (un->un_path) {
				free(un->un_path, M_UNPATH);
				un->un_path = 0;
			}
		}
		un->un_lowervp = lowervp;
		un->un_lowersz = VNOVAL;
	}

	if (un->un_uppervp != uppervp) {
		if (un->un_uppervp)
			vrele(un->un_uppervp);
		un->un_uppervp = uppervp;
		un->un_uppersz = VNOVAL;
	}

	if (docache && (ohash != nhash)) {
		LIST_INSERT_HEAD(&unhead[nhash], un, un_cache);
		un->un_flags |= UN_CACHED;
	}

	union_list_unlock(nhash);
}

/*
 * Set a new lowervp.  The passed lowervp must be referenced and will be
 * stored in the vp in a referenced state.
 */

static void
union_newlower(un, lowervp)
	struct union_node *un;
	struct vnode *lowervp;
{
	union_updatevp(un, un->un_uppervp, lowervp);
}

/*
 * Set a new uppervp.  The passed uppervp must be locked and will be
 * stored in the vp in a locked state.  The caller should not unlock
 * uppervp.
 */

static void
union_newupper(un, uppervp)
	struct union_node *un;
	struct vnode *uppervp;
{
	union_updatevp(un, uppervp, un->un_lowervp);
}

/*
 * Keep track of size changes in the underlying vnodes.
 * If the size changes, then callback to the vm layer
 * giving priority to the upper layer size.
 */
void
union_newsize(vp, uppersz, lowersz)
	struct vnode *vp;
	off_t uppersz, lowersz;
{
	struct union_node *un;
	off_t sz;

	/* only interested in regular files */
	if (vp->v_type != VREG)
		return;

	un = VTOUNION(vp);
	sz = VNOVAL;

	if ((uppersz != VNOVAL) && (un->un_uppersz != uppersz)) {
		un->un_uppersz = uppersz;
		if (sz == VNOVAL)
			sz = un->un_uppersz;
	}

	if ((lowersz != VNOVAL) && (un->un_lowersz != lowersz)) {
		un->un_lowersz = lowersz;
		if (sz == VNOVAL)
			sz = un->un_lowersz;
	}

	if (sz != VNOVAL) {
		UDEBUG(("union: %s size now %ld\n",
			(uppersz != VNOVAL ? "upper" : "lower"), (long)sz));
		/*
		 * There is no need to change the size of a non-existent
		 * object.
		 */
		/* vnode_pager_setsize(vp, sz); */
	}
}

/*
 *	union_allocvp:	allocate a union_node and associate it with a
 *			parent union_node and one or two vnodes.
 *
 *	vpp	Holds the returned vnode locked and referenced if no
 *		error occurs.
 *
 *	mp	Holds the mount point.  mp may or may not be busied.
 *		allocvp() makes no changes to mp.
 *
 *	dvp	Holds the parent union_node to the one we wish to create.
 *		XXX may only be used to traverse an uncopied lowervp-based
 *		tree?  XXX
 *
 *		dvp may or may not be locked.  allocvp() makes no changes
 *		to dvp.
 *
 *	upperdvp Holds the parent vnode to uppervp, generally used along
 *		with path component information to create a shadow of
 *		lowervp when uppervp does not exist.
 *
 *		upperdvp is referenced but unlocked on entry, and will be
 *		dereferenced on return.
 *
 *	uppervp	Holds the new uppervp vnode to be stored in the
 *		union_node we are allocating.  uppervp is referenced but
 *		not locked, and will be dereferenced on return.
 *
 *	lowervp	Holds the new lowervp vnode to be stored in the
 *		union_node we are allocating.  lowervp is referenced but
 *		not locked, and will be dereferenced on return.
 *
 *	cnp	Holds path component information to be coupled with
 *		lowervp and upperdvp to allow unionfs to create an uppervp
 *		later on.  Only used if lowervp is valid.  The contents
 *		of cnp are only valid for the duration of the call.
 *
 *	docache	Determines whether this node should be entered in the
 *		cache or whether it should be destroyed as soon as possible.
 *
 * All union_nodes are maintained on a singly-linked
 * list.  New nodes are only allocated when they cannot
 * be found on this list.  Entries on the list are
 * removed when the vfs reclaim entry is called.
 *
 * A single lock is kept for the entire list.  This is
 * needed because the getnewvnode() function can block
 * waiting for a vnode to become free, in which case there
 * may be more than one process trying to get the same
 * vnode.  This lock is only taken if we are going to
 * call getnewvnode(), since the kernel itself is single-threaded.
 *
 * If an entry is found on the list, then call vget() to
 * take a reference.  This is done because there may be
 * zero references to it and so it needs to be removed from
 * the vnode free list.
 */

int
union_allocvp(vpp, mp, dvp, upperdvp, cnp, uppervp, lowervp, docache)
	struct vnode **vpp;
	struct mount *mp;
	struct vnode *dvp;		/* parent union vnode */
	struct vnode *upperdvp;		/* parent vnode of uppervp */
	struct componentname *cnp;	/* may be null */
	struct vnode *uppervp;		/* may be null */
	struct vnode *lowervp;		/* may be null */
	int docache;
{
	int error;
	struct union_node *un = 0;
	struct union_mount *um = MOUNTTOUNIONMOUNT(mp);
	struct thread *td = (cnp) ? cnp->cn_thread : curthread;
	int hash = 0;
	int vflag;
	int try;

	if (uppervp == NULLVP && lowervp == NULLVP)
		panic("union: unidentifiable allocation");

	if (uppervp && lowervp && (uppervp->v_type != lowervp->v_type)) {
		vrele(lowervp);
		lowervp = NULLVP;
	}

	/* detect the root vnode (and aliases) */
	vflag = 0;
	if ((uppervp == um->um_uppervp) &&
	    ((lowervp == NULLVP) || lowervp == um->um_lowervp)) {
		if (lowervp == NULLVP) {
			lowervp = um->um_lowervp;
			if (lowervp != NULLVP)
				VREF(lowervp);
		}
		vflag = VV_ROOT;
	}

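	/*
	 * Try to find an existing union_node.  Up to three hash chains
	 * are probed: the (uppervp, lowervp) pair, uppervp alone, and
	 * lowervp alone, since the node may have been created before
	 * both layers were known.  If vget() fails, the chain may have
	 * changed underneath us, so the whole lookup is restarted.
	 */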
loop:
	if (!docache) {
		un = 0;
	} else for (try = 0; try < 3; try++) {
		switch (try) {
		case 0:
			if (lowervp == NULLVP)
				continue;
			hash = UNION_HASH(uppervp, lowervp);
			break;

		case 1:
			if (uppervp == NULLVP)
				continue;
			hash = UNION_HASH(uppervp, NULLVP);
			break;

		case 2:
			if (lowervp == NULLVP)
				continue;
			hash = UNION_HASH(NULLVP, lowervp);
			break;
		}

		while (union_list_lock(hash))
			continue;

		LIST_FOREACH(un, &unhead[hash], un_cache) {
			if ((un->un_lowervp == lowervp ||
			     un->un_lowervp == NULLVP) &&
			    (un->un_uppervp == uppervp ||
			     un->un_uppervp == NULLVP) &&
			    (UNIONTOV(un)->v_mount == mp)) {
				if (vget(UNIONTOV(un), 0,
				    cnp ? cnp->cn_thread : NULL)) {
					union_list_unlock(hash);
					goto loop;
				}
				break;
			}
		}

		union_list_unlock(hash);

		if (un)
			break;
	}

	if (un) {
		/*
		 * Obtain a lock on the union_node.  Everything is unlocked
		 * except for dvp, so check that case.  If they match, our
		 * new un is already locked.  Otherwise we have to lock our
		 * new un.
		 *
		 * A potential deadlock situation occurs when we are holding
		 * one lock while trying to get another.  We must follow
		 * strict ordering rules to avoid it.  We try to locate dvp
		 * by scanning up from un_vnode, since the most likely
		 * scenario is un being under dvp.
		 */

		if (dvp && un->un_vnode != dvp) {
			struct vnode *scan = un->un_vnode;

			do {
				scan = VTOUNION(scan)->un_pvp;
			} while (scan && scan->v_op == &union_vnodeops &&
				 scan != dvp);
			if (scan != dvp) {
				/*
				 * our new un is above dvp (we never saw dvp
				 * while moving up the tree).
				 */
				VREF(dvp);
				VOP_UNLOCK(dvp, 0, td);
				error = vn_lock(un->un_vnode, LK_EXCLUSIVE, td);
				vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td);
				vrele(dvp);
			} else {
				/*
				 * our new un is under dvp
				 */
				error = vn_lock(un->un_vnode, LK_EXCLUSIVE, td);
			}
		} else if (dvp == NULLVP) {
			/*
			 * dvp is NULL, we need to lock un.
			 */
			error = vn_lock(un->un_vnode, LK_EXCLUSIVE, td);
		} else {
			/*
			 * dvp == un->un_vnode, we are already locked.
			 */
			error = 0;
		}

		if (error)
			goto loop;

		/*
		 * At this point, the union_node is locked and referenced.
		 *
		 * uppervp is locked and referenced or NULL, lowervp is
		 * referenced or NULL.
		 */
		UDEBUG(("Modify existing un %p vn %p upper %p(refs %d) -> %p(refs %d)\n",
			un, un->un_vnode, un->un_uppervp,
			(un->un_uppervp ? vrefcnt(un->un_uppervp) : -99),
			uppervp,
			(uppervp ? vrefcnt(uppervp) : -99)
		));

		if (uppervp != un->un_uppervp) {
			KASSERT(uppervp == NULL || vrefcnt(uppervp) > 0, ("union_allocvp: too few refs %d (at least 1 required) on uppervp", vrefcnt(uppervp)));
			union_newupper(un, uppervp);
		} else if (uppervp) {
			KASSERT(vrefcnt(uppervp) > 1, ("union_allocvp: too few refs %d (at least 2 required) on uppervp", vrefcnt(uppervp)));
			vrele(uppervp);
		}

		/*
		 * Save information about the lower layer.
		 * This needs to keep track of pathname
		 * and directory information which union_vn_create()
		 * might need.
		 */
		if (lowervp != un->un_lowervp) {
			union_newlower(un, lowervp);
			if (cnp && (lowervp != NULLVP)) {
				un->un_path = malloc(cnp->cn_namelen+1,
						M_UNPATH, M_WAITOK);
				bcopy(cnp->cn_nameptr, un->un_path,
						cnp->cn_namelen);
				un->un_path[cnp->cn_namelen] = '\0';
			}
		} else if (lowervp) {
			vrele(lowervp);
		}

		/*
		 * and upperdvp
		 */
		if (upperdvp != un->un_dirvp) {
			if (un->un_dirvp)
				vrele(un->un_dirvp);
			un->un_dirvp = upperdvp;
		} else if (upperdvp) {
			vrele(upperdvp);
		}

		*vpp = UNIONTOV(un);
		return (0);
	}

	if (docache) {
		/*
		 * Otherwise lock the vp list while we call getnewvnode()
		 * since that can block.
		 */
		hash = UNION_HASH(uppervp, lowervp);

		if (union_list_lock(hash))
			goto loop;
	}

	/*
	 * Create new node rather than replace old node.
	 */

	error = getnewvnode("union", mp, &union_vnodeops, vpp);
	if (error) {
		/*
		 * If an error occurs, clear out vnodes.
		 */
		if (lowervp)
			vrele(lowervp);
		if (uppervp)
			vrele(uppervp);
		if (upperdvp)
			vrele(upperdvp);
		*vpp = NULL;
		goto out;
	}

	MALLOC((*vpp)->v_data, void *, sizeof(struct union_node),
		M_TEMP, M_WAITOK);

	ASSERT_VOP_LOCKED(*vpp, "union_allocvp");
	(*vpp)->v_vflag |= vflag;
	if (uppervp)
		(*vpp)->v_type = uppervp->v_type;
	else
		(*vpp)->v_type = lowervp->v_type;

	un = VTOUNION(*vpp);
	bzero(un, sizeof(*un));

	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);

	un->un_vnode = *vpp;
	un->un_uppervp = uppervp;
	un->un_uppersz = VNOVAL;
	un->un_lowervp = lowervp;
	un->un_lowersz = VNOVAL;
	un->un_dirvp = upperdvp;
	un->un_pvp = dvp;		/* only parent dir in new allocation */
	if (dvp != NULLVP)
		VREF(dvp);
	un->un_dircache = NULL;
	un->un_openl = 0;

	if (cnp && (lowervp != NULLVP)) {
		un->un_path = malloc(cnp->cn_namelen+1, M_UNPATH, M_WAITOK);
		bcopy(cnp->cn_nameptr, un->un_path, cnp->cn_namelen);
		un->un_path[cnp->cn_namelen] = '\0';
	} else {
		un->un_path = NULL;
		un->un_dirvp = NULL;
	}

	if (docache) {
		LIST_INSERT_HEAD(&unhead[hash], un, un_cache);
		un->un_flags |= UN_CACHED;
	}

out:
	if (docache)
		union_list_unlock(hash);

	return (error);
}

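/*
 * union_freevp:  tear down a union_node, dropping whatever layer
 * references it still holds and freeing its pathname buffer and
 * private data.  This runs when the vfs reclaim entry is called
 * (see the union_allocvp() comments above).
 */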
int
union_freevp(vp)
	struct vnode *vp;
{
	struct union_node *un = VTOUNION(vp);

	if (un->un_flags & UN_CACHED) {
		un->un_flags &= ~UN_CACHED;
		LIST_REMOVE(un, un_cache);
	}

	if (un->un_pvp != NULLVP) {
		vrele(un->un_pvp);
		un->un_pvp = NULL;
	}
	if (un->un_uppervp != NULLVP) {
		vrele(un->un_uppervp);
		un->un_uppervp = NULL;
	}
	if (un->un_lowervp != NULLVP) {
		vrele(un->un_lowervp);
		un->un_lowervp = NULL;
	}
	if (un->un_dirvp != NULLVP) {
		vrele(un->un_dirvp);
		un->un_dirvp = NULL;
	}
	if (un->un_path) {
		free(un->un_path, M_UNPATH);
		un->un_path = NULL;
	}

	FREE(vp->v_data, M_TEMP);
	vp->v_data = 0;

	return (0);
}

/*
 * copyfile.  Copy the vnode (fvp) to the vnode (tvp)
 * using a sequence of reads and writes.  Both (fvp)
 * and (tvp) are locked on entry and exit.
 *
 * fvp and tvp are both exclusively locked on call, but their
 * refcounts have not been bumped at all.
 */
static int
union_copyfile(fvp, tvp, cred, td)
	struct vnode *fvp;
	struct vnode *tvp;
	struct ucred *cred;
	struct thread *td;
{
	char *buf;
	struct uio uio;
	struct iovec iov;
	int error = 0;

	/*
	 * strategy:
	 * Allocate a buffer of size MAXBSIZE.
	 * Loop doing reads and writes, keeping track
	 * of the current uio offset.
	 * Give up at the first sign of trouble.
	 */

	bzero(&uio, sizeof(uio));

	uio.uio_td = td;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_offset = 0;

	VOP_LEASE(fvp, td, cred, LEASE_READ);
	VOP_LEASE(tvp, td, cred, LEASE_WRITE);

	buf = malloc(MAXBSIZE, M_TEMP, M_WAITOK);

	/* ugly loop follows... */
	do {
		off_t offset = uio.uio_offset;
		int count;
		int bufoffset;

		/*
		 * Setup for big read.
		 */
		uio.uio_iov = &iov;
		uio.uio_iovcnt = 1;
		iov.iov_base = buf;
		iov.iov_len = MAXBSIZE;
		uio.uio_resid = iov.iov_len;
		uio.uio_rw = UIO_READ;

		if ((error = VOP_READ(fvp, &uio, 0, cred)) != 0)
			break;

		/*
		 * Get bytes read, handle read eof case and setup for
		 * write loop.
		 */
		if ((count = MAXBSIZE - uio.uio_resid) == 0)
			break;
		bufoffset = 0;

		/*
		 * Write until an error occurs or our buffer has been
		 * exhausted, then update the offset for the next read.
		 */
		while (bufoffset < count) {
			uio.uio_iov = &iov;
			uio.uio_iovcnt = 1;
			iov.iov_base = buf + bufoffset;
			iov.iov_len = count - bufoffset;
			uio.uio_offset = offset + bufoffset;
			uio.uio_rw = UIO_WRITE;
			uio.uio_resid = iov.iov_len;

			if ((error = VOP_WRITE(tvp, &uio, 0, cred)) != 0)
				break;
			bufoffset += (count - bufoffset) - uio.uio_resid;
		}
		uio.uio_offset = offset + bufoffset;
	} while (error == 0);

	free(buf, M_TEMP);
	return (error);
}

/*
 * un's vnode is assumed to be locked on entry and remains locked on exit.
 */

int
union_copyup(un, docopy, cred, td)
	struct union_node *un;
	int docopy;
	struct ucred *cred;
	struct thread *td;
{
	int error;
	struct mount *mp;
	struct vnode *lvp, *uvp;

	/*
	 * If the user does not have read permission, the vnode should not
	 * be copied to the upper layer.
	 */
	vn_lock(un->un_lowervp, LK_EXCLUSIVE | LK_RETRY, td);
	error = VOP_ACCESS(un->un_lowervp, VREAD, cred, td);
	VOP_UNLOCK(un->un_lowervp, 0, td);
	if (error)
		return (error);

	if ((error = vn_start_write(un->un_dirvp, &mp, V_WAIT | PCATCH)) != 0)
		return (error);
	if ((error = union_vn_create(&uvp, un, td)) != 0) {
		vn_finished_write(mp);
		return (error);
	}

	lvp = un->un_lowervp;

	KASSERT(vrefcnt(uvp) > 0, ("copy: uvp refcount 0: %d", vrefcnt(uvp)));
	if (docopy) {
		/*
		 * XXX - should not ignore errors
		 * from VOP_CLOSE()
		 */
		vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY, td);
		error = VOP_OPEN(lvp, FREAD, cred, td, -1);
		if (error == 0 && vn_canvmio(lvp) == TRUE)
			error = vfs_object_create(lvp, td, cred);
		if (error == 0) {
			error = union_copyfile(lvp, uvp, cred, td);
			VOP_UNLOCK(lvp, 0, td);
			(void) VOP_CLOSE(lvp, FREAD, cred, td);
		}
		if (error == 0)
			UDEBUG(("union: copied up %s\n", un->un_path));
	}
	VOP_UNLOCK(uvp, 0, td);
	vn_finished_write(mp);
	union_newupper(un, uvp);
	KASSERT(vrefcnt(uvp) > 0, ("copy: uvp refcount 0: %d", vrefcnt(uvp)));
	union_vn_close(uvp, FWRITE, cred, td);
	KASSERT(vrefcnt(uvp) > 0, ("copy: uvp refcount 0: %d", vrefcnt(uvp)));
	/*
	 * Subsequent IOs will go to the top layer, so
	 * call close on the lower vnode and open on the
	 * upper vnode to ensure that the filesystem keeps
	 * its reference counts right.  This doesn't do
	 * the right thing with (cred) and (FREAD) though.
	 * Ignoring error returns is not right, either.
	 */
	if (error == 0) {
		int i;

		for (i = 0; i < un->un_openl; i++) {
			(void) VOP_CLOSE(lvp, FREAD, cred, td);
			(void) VOP_OPEN(uvp, FREAD, cred, td, -1);
		}
		if (un->un_openl) {
			if (vn_canvmio(uvp) == TRUE)
				error = vfs_object_create(uvp, td, cred);
		}
		un->un_openl = 0;
	}

	return (error);
}

/*
 *	union_relookup:
 *
 *	dvp should be locked on entry and will be locked on return.  No
 *	net change in the ref count will occur.
 *
 *	If an error is returned, *vpp will be invalid, otherwise it
 *	will hold a locked, referenced vnode.  If *vpp == dvp then
 *	remember that only one exclusive lock is held.
 */

static int
union_relookup(um, dvp, vpp, cnp, cn, path, pathlen)
	struct union_mount *um;
	struct vnode *dvp;
	struct vnode **vpp;
	struct componentname *cnp;
	struct componentname *cn;
	char *path;
	int pathlen;
{
	int error;

	/*
	 * A new componentname structure must be faked up because
	 * there is no way to know where the upper level cnp came
	 * from or what it is being used for.  This must duplicate
	 * some of the work done by NDINIT(), some of the work done
	 * by namei(), some of the work done by lookup() and some of
	 * the work done by VOP_LOOKUP() when given a CREATE flag.
	 * Conclusion: Horrible.
	 */
	cn->cn_namelen = pathlen;
	cn->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
	bcopy(path, cn->cn_pnbuf, cn->cn_namelen);
	cn->cn_pnbuf[cn->cn_namelen] = '\0';

	cn->cn_nameiop = CREATE;
	cn->cn_flags = (LOCKPARENT|LOCKLEAF|HASBUF|SAVENAME|ISLASTCN);
	cn->cn_thread = cnp->cn_thread;
	if (um->um_op == UNMNT_ABOVE)
		cn->cn_cred = cnp->cn_cred;
	else
		cn->cn_cred = um->um_cred;
	cn->cn_nameptr = cn->cn_pnbuf;
	cn->cn_consume = cnp->cn_consume;

	VREF(dvp);
	VOP_UNLOCK(dvp, 0, cnp->cn_thread);

	/*
	 * Pass dvp unlocked and referenced on call to relookup().
	 *
	 * If an error occurs, dvp will be returned unlocked and dereferenced.
	 */

	if ((error = relookup(dvp, vpp, cn)) != 0) {
		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, cnp->cn_thread);
		return (error);
	}

	/*
	 * If no error occurs, dvp will be returned locked with the reference
	 * left as before, and vpp will be returned referenced and locked.
	 *
	 * We want to return with dvp as it was passed to us, so we get
	 * rid of our reference.
	 */
	vrele(dvp);
	return (0);
}

/*
 * Create a shadow directory in the upper layer.
 * The new vnode is returned locked.
 *
 * (um) points to the union mount structure for access to the
 * mounting process's credentials.
 * (dvp) is the directory in which to create the shadow directory.
 * It is locked (but not ref'd) on entry and return.
 * (cnp) is the component name to be created.
 * (vpp) is the returned newly created shadow directory, which
 * is returned locked and ref'd.
 */
int
union_mkshadow(um, dvp, cnp, vpp)
	struct union_mount *um;
	struct vnode *dvp;
	struct componentname *cnp;
	struct vnode **vpp;
{
	int error;
	struct vattr va;
	struct thread *td = cnp->cn_thread;
	struct componentname cn;
	struct mount *mp;

	if ((error = vn_start_write(dvp, &mp, V_WAIT | PCATCH)) != 0)
		return (error);
	if ((error = union_relookup(um, dvp, vpp, cnp, &cn,
			cnp->cn_nameptr, cnp->cn_namelen)) != 0) {
		vn_finished_write(mp);
		return (error);
	}

	if (*vpp) {
		if (cn.cn_flags & HASBUF) {
			uma_zfree(namei_zone, cn.cn_pnbuf);
			cn.cn_flags &= ~HASBUF;
		}
		if (dvp == *vpp)
			vrele(*vpp);
		else
			vput(*vpp);
		vn_finished_write(mp);
		*vpp = NULLVP;
		return (EEXIST);
	}

	/*
	 * Policy: when creating the shadow directory in the
	 * upper layer, create it owned by the user who did
	 * the mount, group from parent directory, and mode
	 * 777 modified by umask (ie mostly identical to the
	 * mkdir syscall).  (jsp, kb)
	 */

	VATTR_NULL(&va);
	va.va_type = VDIR;
	va.va_mode = um->um_cmode;

	/* VOP_LEASE: dvp is locked */
	VOP_LEASE(dvp, td, cn.cn_cred, LEASE_WRITE);

	error = VOP_MKDIR(dvp, vpp, &cn, &va);
	if (cn.cn_flags & HASBUF) {
		uma_zfree(namei_zone, cn.cn_pnbuf);
		cn.cn_flags &= ~HASBUF;
	}
	/*vput(dvp);*/
	vn_finished_write(mp);
	return (error);
}

/*
 * Create a whiteout entry in the upper layer.
 *
 * (um) points to the union mount structure for access to the
 * mounting process's credentials.
 * (dvp) is the directory in which to create the whiteout.
 * It is locked on entry and return.
 * (cnp) is the component name to be created.
 */
int
union_mkwhiteout(um, dvp, cnp, path)
	struct union_mount *um;
	struct vnode *dvp;
	struct componentname *cnp;
	char *path;
{
	int error;
	struct thread *td = cnp->cn_thread;
	struct vnode *wvp;
	struct componentname cn;
	struct mount *mp;

	if ((error = vn_start_write(dvp, &mp, V_WAIT | PCATCH)) != 0)
		return (error);
	error = union_relookup(um, dvp, &wvp, cnp, &cn, path, strlen(path));
	if (error) {
		vn_finished_write(mp);
		return (error);
	}

	if (wvp) {
		if (cn.cn_flags & HASBUF) {
			uma_zfree(namei_zone, cn.cn_pnbuf);
			cn.cn_flags &= ~HASBUF;
		}
		if (wvp == dvp)
			vrele(wvp);
		else
			vput(wvp);
		vn_finished_write(mp);
		return (EEXIST);
	}

	/* VOP_LEASE: dvp is locked */
	VOP_LEASE(dvp, td, td->td_ucred, LEASE_WRITE);

	error = VOP_WHITEOUT(dvp, &cn, CREATE);
	if (cn.cn_flags & HASBUF) {
		uma_zfree(namei_zone, cn.cn_pnbuf);
		cn.cn_flags &= ~HASBUF;
	}
	vn_finished_write(mp);
	return (error);
}

/*
 * union_vn_create: creates and opens a new shadow file
 * on the upper union layer.  This function is similar
 * in spirit to calling vn_open() but it avoids calling namei().
 * The problem with calling namei() is that a) it locks too many
 * things, and b) it doesn't start at the "right" directory,
 * whereas relookup() is told where to start.
 *
 * On entry, the vnode associated with un is locked.  It remains locked
 * on return.
 *
 * If no error occurs, *vpp contains a locked referenced vnode for your
 * use.  If an error occurs *vpp is undefined.
 */
static int
union_vn_create(vpp, un, td)
	struct vnode **vpp;
	struct union_node *un;
	struct thread *td;
{
	struct vnode *vp;
	struct ucred *cred = td->td_ucred;
	struct vattr vat;
	struct vattr *vap = &vat;
	int fmode = FFLAGS(O_WRONLY|O_CREAT|O_TRUNC|O_EXCL);
	int error;
	int cmode;
	struct componentname cn;

	*vpp = NULLVP;
	FILEDESC_LOCK_FAST(td->td_proc->p_fd);
	cmode = UN_FILEMODE & ~td->td_proc->p_fd->fd_cmask;
	FILEDESC_UNLOCK_FAST(td->td_proc->p_fd);

	/*
	 * Build a new componentname structure (for the same
	 * reasons outlined in union_mkshadow()).
	 * The difference here is that the file is owned by
	 * the current user, rather than by the person who
	 * did the mount, since the current user needs to be
	 * able to write the file (that's why it is being
	 * copied in the first place).
	 */
	cn.cn_namelen = strlen(un->un_path);
	cn.cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
	bcopy(un->un_path, cn.cn_pnbuf, cn.cn_namelen+1);
	cn.cn_nameiop = CREATE;
	cn.cn_flags = (LOCKPARENT|LOCKLEAF|HASBUF|SAVENAME|ISLASTCN);
	cn.cn_thread = td;
	cn.cn_cred = td->td_ucred;
	cn.cn_nameptr = cn.cn_pnbuf;
	cn.cn_consume = 0;

	/*
	 * Pass dvp unlocked and referenced on call to relookup().
	 *
	 * If an error occurs, dvp will be returned unlocked and dereferenced.
	 */
	VREF(un->un_dirvp);
	error = relookup(un->un_dirvp, &vp, &cn);
	if (error)
		return (error);

	/*
	 * If no error occurs, dvp will be returned locked with the reference
	 * left as before, and vpp will be returned referenced and locked.
	 */
	if (vp) {
		vput(un->un_dirvp);
		if (cn.cn_flags & HASBUF) {
			uma_zfree(namei_zone, cn.cn_pnbuf);
			cn.cn_flags &= ~HASBUF;
		}
		if (vp == un->un_dirvp)
			vrele(vp);
		else
			vput(vp);
		return (EEXIST);
	}

	/*
	 * Good - there was no race to create the file
	 * so go ahead and create it.  The permissions
	 * on the file will be 0666 modified by the
	 * current user's umask.  Access to the file, while
	 * it is unioned, will require access to the top *and*
	 * bottom files.  Access when not unioned will simply
	 * require access to the top-level file.
	 * TODO: confirm choice of access permissions.
	 */
	VATTR_NULL(vap);
	vap->va_type = VREG;
	vap->va_mode = cmode;
	VOP_LEASE(un->un_dirvp, td, cred, LEASE_WRITE);
	error = VOP_CREATE(un->un_dirvp, &vp, &cn, vap);
	if (cn.cn_flags & HASBUF) {
		uma_zfree(namei_zone, cn.cn_pnbuf);
		cn.cn_flags &= ~HASBUF;
	}
	vput(un->un_dirvp);
	if (error)
		return (error);

	error = VOP_OPEN(vp, fmode, cred, td, -1);
	if (error == 0 && vn_canvmio(vp) == TRUE)
		error = vfs_object_create(vp, td, cred);
	if (error) {
		vput(vp);
		return (error);
	}
	vp->v_writecount++;
	*vpp = vp;
	return (0);
}

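/*
 * Close a vnode opened by union_vn_create(), undoing the v_writecount
 * bump taken there when the file was opened for writing.
 */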
static int
union_vn_close(vp, fmode, cred, td)
	struct vnode *vp;
	int fmode;
	struct ucred *cred;
	struct thread *td;
{

	if (fmode & FWRITE)
		--vp->v_writecount;
	return (VOP_CLOSE(vp, fmode, cred, td));
}

/*
 *	union_removed_upper:
 *
 *	An upper-only file/directory has been removed; un-cache it so
 *	that the unionfs vnode gets reclaimed and the last uppervp
 *	reference disappears.
 *
 *	Called with union_node unlocked.
 */

void
union_removed_upper(un)
	struct union_node *un;
{
	if (un->un_flags & UN_CACHED) {
		int hash = UNION_HASH(un->un_uppervp, un->un_lowervp);

		while (union_list_lock(hash))
			continue;
		un->un_flags &= ~UN_CACHED;
		LIST_REMOVE(un, un_cache);
		union_list_unlock(hash);
	}
}

/*
 * Determine whether a whiteout is needed
 * during a remove/rmdir operation.
 */
int
union_dowhiteout(un, cred, td)
	struct union_node *un;
	struct ucred *cred;
	struct thread *td;
{
	struct vattr va;

	if (un->un_lowervp != NULLVP)
		return (1);

	if (VOP_GETATTR(un->un_uppervp, &va, cred, td) == 0 &&
	    (va.va_flags & OPAQUE))
		return (1);

	return (0);
}

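/*
 * Recursively walk a union vnode, visiting the upper and lower layer
 * of each union node reached.  Two passes are intended: with vppp ==
 * NULL only *cntp is incremented for each non-union vnode found; with
 * a table supplied, a referenced pointer to each such vnode is stored
 * and the routine panics if the table fills up between the two passes.
 */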
static void
union_dircache_r(vp, vppp, cntp)
	struct vnode *vp;
	struct vnode ***vppp;
	int *cntp;
{
	struct union_node *un;

	if (vp->v_op != &union_vnodeops) {
		if (vppp) {
			VREF(vp);
			*(*vppp)++ = vp;
			if (--(*cntp) == 0)
				panic("union: dircache table too small");
		} else {
			(*cntp)++;
		}
	} else {
		un = VTOUNION(vp);
		if (un->un_uppervp != NULLVP)
			union_dircache_r(un->un_uppervp, vppp, cntp);
		if (un->un_lowervp != NULLVP)
			union_dircache_r(un->un_lowervp, vppp, cntp);
	}
}

struct vnode *
union_dircache_get(vp, td)
	struct vnode *vp;
	struct thread *td;
{
	int cnt;
	struct vnode *nvp;
	struct vnode **vpp;
	struct vnode **dircache, **newdircache;
	struct union_node *un;
	int error;

	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	un = VTOUNION(vp);
	dircache = un->un_dircache;
	newdircache = NULL;

	nvp = NULLVP;

	if (dircache == NULL) {
		cnt = 0;
		union_dircache_r(vp, 0, &cnt);
		cnt++;
		newdircache = dircache = malloc(cnt * sizeof(struct vnode *),
						M_UNDCACHE, M_WAITOK);
		vpp = dircache;
		union_dircache_r(vp, &vpp, &cnt);
		*vpp = NULLVP;
		vpp = dircache + 1;
	} else {
		vpp = dircache;
		do {
			if (*vpp++ == un->un_uppervp)
				break;
		} while (*vpp != NULLVP);
	}

	if (*vpp == NULLVP)
		goto out;

	/*vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);*/
	UDEBUG(("ALLOCVP-3 %p ref %d\n", *vpp, (*vpp ? vrefcnt(*vpp) : -99)));
	VREF(*vpp);
	error = union_allocvp(&nvp, vp->v_mount, NULLVP, NULLVP, NULL, *vpp, NULLVP, 0);
	UDEBUG(("ALLOCVP-3B %p ref %d\n", nvp, (*vpp ? vrefcnt(*vpp) : -99)));
	if (error)
		goto out;

	un->un_dircache = NULL;
	VTOUNION(nvp)->un_dircache = dircache;
	newdircache = NULL;

out:
	/*
	 * If we allocated a new dircache and couldn't attach
	 * it to a new vp, free the resources we allocated.
	 */
	if (newdircache) {
		for (vpp = newdircache; *vpp != NULLVP; vpp++)
			vrele(*vpp);
		free(newdircache, M_UNDCACHE);
	}

	VOP_UNLOCK(vp, 0, td);
	return (nvp);
}

void
union_dircache_free(struct union_node *un)
{
	struct vnode **vpp;

	for (vpp = un->un_dircache; *vpp != NULLVP; vpp++)
		vrele(*vpp);
	free(un->un_dircache, M_UNDCACHE);
	un->un_dircache = NULL;
}

/*
 * Module glue to remove #ifdef UNION from vfs_syscalls.c
 */
static int
union_dircheck(struct thread *td, struct vnode **vp, struct file *fp)
{
	int error = 0;

	if ((*vp)->v_op == &union_vnodeops) {
		struct vnode *lvp;

		lvp = union_dircache_get(*vp, td);
		if (lvp != NULLVP) {
			struct vattr va;

			/*
			 * If the directory is opaque,
			 * then don't show lower entries.
			 */
			error = VOP_GETATTR(*vp, &va, fp->f_cred, td);
			if (va.va_flags & OPAQUE) {
				vput(lvp);
				lvp = NULLVP;
			}
		}

		if (lvp != NULLVP) {
			error = VOP_OPEN(lvp, FREAD, fp->f_cred, td, -1);
			if (error == 0 && vn_canvmio(lvp) == TRUE)
				error = vfs_object_create(lvp, td, fp->f_cred);
			if (error) {
				vput(lvp);
				return (error);
			}
			VOP_UNLOCK(lvp, 0, td);
			FILE_LOCK(fp);
			fp->f_vnode = lvp;
			fp->f_data = lvp;
			fp->f_offset = 0;
			FILE_UNLOCK(fp);
			error = vn_close(*vp, FREAD, fp->f_cred, td);
			if (error)
				return (error);
			*vp = lvp;
			return (-1);	/* goto unionread */
		}
	}
	return (error);
}

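/*
 * Hook and unhook the union_dircheckp function pointer that the
 * vfs_syscalls.c directory-read path invokes on union directories,
 * so the check above is only active while this module is loaded.
 */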
static int
union_modevent(module_t mod, int type, void *data)
{
	switch (type) {
	case MOD_LOAD:
		union_dircheckp = union_dircheck;
		break;
	case MOD_UNLOAD:
		union_dircheckp = NULL;
		break;
	default:
		return (EOPNOTSUPP);
	}
	return (0);
}

static moduledata_t union_mod = {
	"union_dircheck",
	union_modevent,
	NULL
};

DECLARE_MODULE(union_dircheck, union_mod, SI_SUB_VFS, SI_ORDER_ANY);