/*
 * Copyright (c) 1994 Jan-Simon Pendry
 * Copyright (c) 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Jan-Simon Pendry.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)union_subr.c	8.20 (Berkeley) 5/20/95
 * $FreeBSD: head/sys/fs/unionfs/union_subr.c 116357 2003-06-14 23:48:20Z das $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mount.h>
#include <sys/mutex.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/stat.h>
#include <sys/vnode.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>	/* for vnode_pager_setsize */
#include <vm/vm_object.h>	/* for vm cache coherency */
#include <vm/uma.h>

#include <fs/unionfs/union.h>

extern int	union_init(void);

/* must be power of two, otherwise change UNION_HASH() */
#define NHASH 32

/* unsigned int ... */
#define UNION_HASH(u, l) \
	(((((uintptr_t) (u)) + ((uintptr_t) l)) >> 8) & (NHASH-1))

static MALLOC_DEFINE(M_UNPATH, "unpath", "UNION path component");
static MALLOC_DEFINE(M_UNDCACHE, "undcac", "UNION directory cache");

static LIST_HEAD(unhead, union_node) unhead[NHASH];
static int unvplock[NHASH];

static void	union_dircache_r(struct vnode *vp, struct vnode ***vppp,
				      int *cntp);
static int	union_list_lock(int ix);
static void	union_list_unlock(int ix);
static int	union_relookup(struct union_mount *um, struct vnode *dvp,
				    struct vnode **vpp,
				    struct componentname *cnp,
				    struct componentname *cn, char *path,
				    int pathlen);
static void	union_updatevp(struct union_node *un,
				    struct vnode *uppervp,
				    struct vnode *lowervp);
static void union_newlower(struct union_node *, struct vnode *);
static void union_newupper(struct union_node *, struct vnode *);
static int union_copyfile(struct vnode *, struct vnode *,
					struct ucred *, struct thread *);
static int union_vn_create(struct vnode **, struct union_node *,
				struct thread *);
static int union_vn_close(struct vnode *, int, struct ucred *,
				struct thread *);

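/*
 * Initialize the hash chains used to cache union_nodes and the
 * per-chain lock words that protect them.
 */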
int
union_init()
{
	int i;

	for (i = 0; i < NHASH; i++)
		LIST_INIT(&unhead[i]);
	bzero((caddr_t)unvplock, sizeof(unvplock));
	return (0);
}

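/*
 * Lock the hash chain at index (ix).  Returns 0 once the lock has
 * been acquired.  If the chain is already locked, note that a wakeup
 * is wanted, sleep, and return 1 so that the caller re-evaluates its
 * state before retrying.
 */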
static int
union_list_lock(ix)
	int ix;
{
	if (unvplock[ix] & UNVP_LOCKED) {
		unvplock[ix] |= UNVP_WANT;
		(void) tsleep(&unvplock[ix], PINOD, "unllck", 0);
		return (1);
	}
	unvplock[ix] |= UNVP_LOCKED;
	return (0);
}

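/*
 * Unlock the hash chain at index (ix), waking up any thread that is
 * sleeping in union_list_lock().
 */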
static void
union_list_unlock(ix)
	int ix;
{
	unvplock[ix] &= ~UNVP_LOCKED;

	if (unvplock[ix] & UNVP_WANT) {
		unvplock[ix] &= ~UNVP_WANT;
		wakeup(&unvplock[ix]);
	}
}

/*
 *	union_updatevp:
 *
 *	The uppervp, if not NULL, must be referenced and not locked by us.
 *	The lowervp, if not NULL, must be referenced.
 *
 *	If uppervp and lowervp match pointers already installed, then
 *	nothing happens. The passed vp's (when matching) are not adjusted.
 *
 *	This routine may only be called by union_newupper() and
 *	union_newlower().
 */

static void
union_updatevp(un, uppervp, lowervp)
	struct union_node *un;
	struct vnode *uppervp;
	struct vnode *lowervp;
{
	int ohash = UNION_HASH(un->un_uppervp, un->un_lowervp);
	int nhash = UNION_HASH(uppervp, lowervp);
	int docache = (lowervp != NULLVP || uppervp != NULLVP);
	int lhash, uhash;

	/*
	 * Ensure locking is ordered from lower to higher
	 * to avoid deadlocks.
	 */
	if (nhash < ohash) {
		lhash = nhash;
		uhash = ohash;
	} else {
		lhash = ohash;
		uhash = nhash;
	}

	if (lhash != uhash) {
		while (union_list_lock(lhash))
			continue;
	}

	while (union_list_lock(uhash))
		continue;

	if (ohash != nhash || !docache) {
		if (un->un_flags & UN_CACHED) {
			un->un_flags &= ~UN_CACHED;
			LIST_REMOVE(un, un_cache);
		}
	}

	if (ohash != nhash)
		union_list_unlock(ohash);

	if (un->un_lowervp != lowervp) {
		if (un->un_lowervp) {
			vrele(un->un_lowervp);
			if (un->un_path) {
				free(un->un_path, M_UNPATH);
				un->un_path = NULL;
			}
		}
		un->un_lowervp = lowervp;
		un->un_lowersz = VNOVAL;
	}

	if (un->un_uppervp != uppervp) {
		if (un->un_uppervp)
			vrele(un->un_uppervp);
		un->un_uppervp = uppervp;
		un->un_uppersz = VNOVAL;
	}

	if (docache && (ohash != nhash)) {
		LIST_INSERT_HEAD(&unhead[nhash], un, un_cache);
		un->un_flags |= UN_CACHED;
	}

	union_list_unlock(nhash);
}

/*
 * Set a new lowervp.  The passed lowervp must be referenced and will be
 * stored in the vp in a referenced state.
 */

static void
union_newlower(un, lowervp)
	struct union_node *un;
	struct vnode *lowervp;
{
	union_updatevp(un, un->un_uppervp, lowervp);
}

/*
 * Set a new uppervp.  The passed uppervp must be locked and will be
 * stored in the vp in a locked state.  The caller should not unlock
 * uppervp.
 */

static void
union_newupper(un, uppervp)
	struct union_node *un;
	struct vnode *uppervp;
{
	union_updatevp(un, uppervp, un->un_lowervp);
}

/*
 * Keep track of size changes in the underlying vnodes.
 * If the size changes, then callback to the vm layer
 * giving priority to the upper layer size.
 */
void
union_newsize(vp, uppersz, lowersz)
	struct vnode *vp;
	off_t uppersz, lowersz;
{
	struct union_node *un;
	off_t sz;

	/* only interested in regular files */
	if (vp->v_type != VREG)
		return;

	un = VTOUNION(vp);
	sz = VNOVAL;

	if ((uppersz != VNOVAL) && (un->un_uppersz != uppersz)) {
		un->un_uppersz = uppersz;
		if (sz == VNOVAL)
			sz = un->un_uppersz;
	}

	if ((lowersz != VNOVAL) && (un->un_lowersz != lowersz)) {
		un->un_lowersz = lowersz;
		if (sz == VNOVAL)
			sz = un->un_lowersz;
	}

	if (sz != VNOVAL) {
		UDEBUG(("union: %s size now %ld\n",
			(uppersz != VNOVAL ? "upper" : "lower"), (long)sz));
		/*
		 * There is no need to change size of non-existent object.
		 */
		/* vnode_pager_setsize(vp, sz); */
	}
}

/*
 *	union_allocvp:	allocate a union_node and associate it with a
 *			parent union_node and one or two vnodes.
 *
 *	vpp	Holds the returned vnode locked and referenced if no
 *		error occurs.
 *
 *	mp	Holds the mount point.  mp may or may not be busied.
 *		allocvp() makes no changes to mp.
 *
 *	dvp	Holds the parent union_node to the one we wish to create.
 *		XXX may only be used to traverse an uncopied lowervp-based
 *		tree?  XXX
 *
 *		dvp may or may not be locked.  allocvp() makes no changes
 *		to dvp.
 *
 *	upperdvp Holds the parent vnode to uppervp, generally used along
 *		with path component information to create a shadow of
 *		lowervp when uppervp does not exist.
 *
 *		upperdvp is referenced but unlocked on entry, and will be
 *		dereferenced on return.
 *
 *	uppervp	Holds the new uppervp vnode to be stored in the
 *		union_node we are allocating.  uppervp is referenced but
 *		not locked, and will be dereferenced on return.
 *
 *	lowervp	Holds the new lowervp vnode to be stored in the
 *		union_node we are allocating.  lowervp is referenced but
 *		not locked, and will be dereferenced on return.
 *
 *	cnp	Holds path component information to be coupled with
 *		lowervp and upperdvp to allow unionfs to create an uppervp
 *		later on.  Only used if lowervp is valid.  The contents
 *		of cnp are only valid for the duration of the call.
 *
 *	docache	Determines whether this node should be entered in the
 *		cache or whether it should be destroyed as soon as possible.
 *
 * All union_nodes are maintained on a singly-linked
 * list.  New nodes are only allocated when they cannot
 * be found on this list.  Entries on the list are
 * removed when the vfs reclaim entry is called.
 *
 * A single lock is kept for the entire list.  This is
 * needed because the getnewvnode() function can block
 * waiting for a vnode to become free, in which case there
 * may be more than one process trying to get the same
 * vnode.  This lock is only taken if we are going to
 * call getnewvnode(), since the kernel itself is single-threaded.
 *
 * If an entry is found on the list, then call vget() to
 * take a reference.  This is done because there may be
 * zero references to it and so it needs to be removed from
 * the vnode free list.
 */

int
union_allocvp(vpp, mp, dvp, upperdvp, cnp, uppervp, lowervp, docache)
	struct vnode **vpp;
	struct mount *mp;
	struct vnode *dvp;		/* parent union vnode */
	struct vnode *upperdvp;		/* parent vnode of uppervp */
	struct componentname *cnp;	/* may be null */
	struct vnode *uppervp;		/* may be null */
	struct vnode *lowervp;		/* may be null */
	int docache;
{
	int error;
	struct union_node *un = NULL;
	struct union_mount *um = MOUNTTOUNIONMOUNT(mp);
	struct thread *td = (cnp) ? cnp->cn_thread : curthread;
	int hash = 0;
	int vflag;
	int try;

	if (uppervp == NULLVP && lowervp == NULLVP)
		panic("union: unidentifiable allocation");

	if (uppervp && lowervp && (uppervp->v_type != lowervp->v_type)) {
		vrele(lowervp);
		lowervp = NULLVP;
	}

	/* detect the root vnode (and aliases) */
	vflag = 0;
	if ((uppervp == um->um_uppervp) &&
	    ((lowervp == NULLVP) || lowervp == um->um_lowervp)) {
		if (lowervp == NULLVP) {
			lowervp = um->um_lowervp;
			if (lowervp != NULLVP)
				VREF(lowervp);
		}
		vflag = VV_ROOT;
	}

loop:
	if (!docache) {
		un = NULL;
	} else for (try = 0; try < 3; try++) {
		switch (try) {
		case 0:
			if (lowervp == NULLVP)
				continue;
			hash = UNION_HASH(uppervp, lowervp);
			break;

		case 1:
			if (uppervp == NULLVP)
				continue;
			hash = UNION_HASH(uppervp, NULLVP);
			break;

		case 2:
			if (lowervp == NULLVP)
				continue;
			hash = UNION_HASH(NULLVP, lowervp);
			break;
		}

		while (union_list_lock(hash))
			continue;

		LIST_FOREACH(un, &unhead[hash], un_cache) {
			if ((un->un_lowervp == lowervp ||
			     un->un_lowervp == NULLVP) &&
			    (un->un_uppervp == uppervp ||
			     un->un_uppervp == NULLVP) &&
			    (UNIONTOV(un)->v_mount == mp)) {
				if (vget(UNIONTOV(un), 0,
				    cnp ? cnp->cn_thread : NULL)) {
					union_list_unlock(hash);
					goto loop;
				}
				break;
			}
		}

		union_list_unlock(hash);

		if (un)
			break;
	}

	if (un) {
		/*
		 * Obtain a lock on the union_node.  Everything is unlocked
		 * except for dvp, so check that case.  If they match, our
		 * new un is already locked.  Otherwise we have to lock our
		 * new un.
		 *
		 * A potential deadlock situation occurs when we are holding
		 * one lock while trying to get another.  We must follow
		 * strict ordering rules to avoid it.  We try to locate dvp
		 * by scanning up from un_vnode, since the most likely
		 * scenario is un being under dvp.
		 */

		if (dvp && un->un_vnode != dvp) {
			struct vnode *scan = un->un_vnode;

			do {
				scan = VTOUNION(scan)->un_pvp;
			} while (scan && scan->v_op == union_vnodeop_p &&
				 scan != dvp);
			if (scan != dvp) {
				/*
				 * our new un is above dvp (we never saw dvp
				 * while moving up the tree).
				 */
				VREF(dvp);
				VOP_UNLOCK(dvp, 0, td);
				error = vn_lock(un->un_vnode, LK_EXCLUSIVE, td);
				vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td);
				vrele(dvp);
			} else {
				/*
				 * our new un is under dvp
				 */
				error = vn_lock(un->un_vnode, LK_EXCLUSIVE, td);
			}
		} else if (dvp == NULLVP) {
			/*
			 * dvp is NULL, we need to lock un.
			 */
			error = vn_lock(un->un_vnode, LK_EXCLUSIVE, td);
		} else {
			/*
			 * dvp == un->un_vnode, we are already locked.
			 */
			error = 0;
		}

		if (error)
			goto loop;

		/*
		 * At this point, the union_node is locked and referenced.
		 *
		 * uppervp is locked and referenced or NULL, lowervp is
		 * referenced or NULL.
		 */
		UDEBUG(("Modify existing un %p vn %p upper %p(refs %d) -> %p(refs %d)\n",
			un, un->un_vnode, un->un_uppervp,
			(un->un_uppervp ? vrefcnt(un->un_uppervp) : -99),
			uppervp,
			(uppervp ? vrefcnt(uppervp) : -99)
		));

		if (uppervp != un->un_uppervp) {
			KASSERT(uppervp == NULL || vrefcnt(uppervp) > 0,
			    ("union_allocvp: too few refs %d (at least 1 "
			    "required) on uppervp", vrefcnt(uppervp)));
			union_newupper(un, uppervp);
		} else if (uppervp) {
			KASSERT(vrefcnt(uppervp) > 1,
			    ("union_allocvp: too few refs %d (at least 2 "
			    "required) on uppervp", vrefcnt(uppervp)));
			vrele(uppervp);
		}

		/*
		 * Save information about the lower layer.
		 * This needs to keep track of pathname
		 * and directory information which union_vn_create()
		 * might need.
		 */
		if (lowervp != un->un_lowervp) {
			union_newlower(un, lowervp);
			if (cnp && (lowervp != NULLVP)) {
				un->un_path = malloc(cnp->cn_namelen+1,
						M_UNPATH, M_WAITOK);
				bcopy(cnp->cn_nameptr, un->un_path,
						cnp->cn_namelen);
				un->un_path[cnp->cn_namelen] = '\0';
			}
		} else if (lowervp) {
			vrele(lowervp);
		}

		/*
		 * and upperdvp
		 */
		if (upperdvp != un->un_dirvp) {
			if (un->un_dirvp)
				vrele(un->un_dirvp);
			un->un_dirvp = upperdvp;
		} else if (upperdvp) {
			vrele(upperdvp);
		}

		*vpp = UNIONTOV(un);
		return (0);
	}

	if (docache) {
		/*
		 * Otherwise lock the vp list while we call getnewvnode()
		 * since that can block.
		 */
		hash = UNION_HASH(uppervp, lowervp);

		if (union_list_lock(hash))
			goto loop;
	}

	/*
	 * Create new node rather than replace old node.
	 */

	error = getnewvnode("union", mp, union_vnodeop_p, vpp);
	if (error) {
		/*
		 * If an error occurs, clear out vnodes.
		 */
		if (lowervp)
			vrele(lowervp);
		if (uppervp)
			vrele(uppervp);
		if (upperdvp)
			vrele(upperdvp);
		*vpp = NULL;
		goto out;
	}

	MALLOC((*vpp)->v_data, void *, sizeof(struct union_node),
		M_TEMP, M_WAITOK);

	ASSERT_VOP_LOCKED(*vpp, "union_allocvp");
	(*vpp)->v_vflag |= vflag;
	if (uppervp)
		(*vpp)->v_type = uppervp->v_type;
	else
		(*vpp)->v_type = lowervp->v_type;

	un = VTOUNION(*vpp);
	bzero(un, sizeof(*un));

	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);

	un->un_vnode = *vpp;
	un->un_uppervp = uppervp;
	un->un_uppersz = VNOVAL;
	un->un_lowervp = lowervp;
	un->un_lowersz = VNOVAL;
	un->un_dirvp = upperdvp;
	un->un_pvp = dvp;		/* only parent dir in new allocation */
	if (dvp != NULLVP)
		VREF(dvp);
	un->un_dircache = NULL;
	un->un_openl = 0;

	if (cnp && (lowervp != NULLVP)) {
		un->un_path = malloc(cnp->cn_namelen+1, M_UNPATH, M_WAITOK);
		bcopy(cnp->cn_nameptr, un->un_path, cnp->cn_namelen);
		un->un_path[cnp->cn_namelen] = '\0';
	} else {
		un->un_path = NULL;
		un->un_dirvp = NULL;
	}

	if (docache) {
		LIST_INSERT_HEAD(&unhead[hash], un, un_cache);
		un->un_flags |= UN_CACHED;
	}

out:
	if (docache)
		union_list_unlock(hash);

	return (error);
}

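/*
 * union_freevp:  Remove the union_node from the cache, release every
 * vnode reference it holds (parent, upper, lower, and upper directory)
 * and free its path storage.  Called when the union vnode is being
 * reclaimed.
 */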
int
union_freevp(vp)
	struct vnode *vp;
{
	struct union_node *un = VTOUNION(vp);

	if (un->un_flags & UN_CACHED) {
		un->un_flags &= ~UN_CACHED;
		LIST_REMOVE(un, un_cache);
	}

	if (un->un_pvp != NULLVP) {
		vrele(un->un_pvp);
		un->un_pvp = NULL;
	}
	if (un->un_uppervp != NULLVP) {
		vrele(un->un_uppervp);
		un->un_uppervp = NULL;
	}
	if (un->un_lowervp != NULLVP) {
		vrele(un->un_lowervp);
		un->un_lowervp = NULL;
	}
	if (un->un_dirvp != NULLVP) {
		vrele(un->un_dirvp);
		un->un_dirvp = NULL;
	}
	if (un->un_path) {
		free(un->un_path, M_UNPATH);
		un->un_path = NULL;
	}

	FREE(vp->v_data, M_TEMP);
	vp->v_data = NULL;

	return (0);
}

/*
 * copyfile.  Copy the vnode (fvp) to the vnode (tvp)
 * using a sequence of reads and writes.  Both (fvp)
 * and (tvp) are locked on entry and exit.
 *
 * fvp and tvp are both exclusively locked on call, but their refcounts
 * haven't been bumped at all.
 */
static int
union_copyfile(fvp, tvp, cred, td)
	struct vnode *fvp;
	struct vnode *tvp;
	struct ucred *cred;
	struct thread *td;
{
	char *buf;
	struct uio uio;
	struct iovec iov;
	int error = 0;

	/*
	 * strategy:
	 * Allocate a buffer of size MAXBSIZE.
	 * Loop doing reads and writes, keeping track
	 * of the current uio offset.
	 * Give up at the first sign of trouble.
	 */

	bzero(&uio, sizeof(uio));

	uio.uio_td = td;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_offset = 0;

	VOP_LEASE(fvp, td, cred, LEASE_READ);
	VOP_LEASE(tvp, td, cred, LEASE_WRITE);

	buf = malloc(MAXBSIZE, M_TEMP, M_WAITOK);

	/* ugly loop follows... */
	do {
		off_t offset = uio.uio_offset;
		int count;
		int bufoffset;

		/*
		 * Setup for big read.
		 */
		uio.uio_iov = &iov;
		uio.uio_iovcnt = 1;
		iov.iov_base = buf;
		iov.iov_len = MAXBSIZE;
		uio.uio_resid = iov.iov_len;
		uio.uio_rw = UIO_READ;

		if ((error = VOP_READ(fvp, &uio, 0, cred)) != 0)
			break;

		/*
		 * Get bytes read, handle read eof case and setup for
		 * write loop.
		 */
		if ((count = MAXBSIZE - uio.uio_resid) == 0)
			break;
		bufoffset = 0;

		/*
		 * Write until an error occurs or our buffer has been
		 * exhausted, then update the offset for the next read.
		 */
		while (bufoffset < count) {
			uio.uio_iov = &iov;
			uio.uio_iovcnt = 1;
			iov.iov_base = buf + bufoffset;
			iov.iov_len = count - bufoffset;
			uio.uio_offset = offset + bufoffset;
			uio.uio_rw = UIO_WRITE;
			uio.uio_resid = iov.iov_len;

			if ((error = VOP_WRITE(tvp, &uio, 0, cred)) != 0)
				break;
			bufoffset += (count - bufoffset) - uio.uio_resid;
		}
		uio.uio_offset = offset + bufoffset;
	} while (error == 0);

	free(buf, M_TEMP);
	return (error);
}

/*
 * union_copyup:  Create the shadow file in the upper layer and, if
 * (docopy) is set, copy the contents of the lower vnode up to it.
 *
 * un's vnode is assumed to be locked on entry and remains locked on exit.
 */

int
union_copyup(un, docopy, cred, td)
	struct union_node *un;
	int docopy;
	struct ucred *cred;
	struct thread *td;
{
	int error;
	struct mount *mp;
	struct vnode *lvp, *uvp;

	/*
	 * If the user does not have read permission, the vnode should not
	 * be copied to the upper layer.
	 */
	vn_lock(un->un_lowervp, LK_EXCLUSIVE | LK_RETRY, td);
	error = VOP_ACCESS(un->un_lowervp, VREAD, cred, td);
	VOP_UNLOCK(un->un_lowervp, 0, td);
	if (error)
		return (error);

	if ((error = vn_start_write(un->un_dirvp, &mp, V_WAIT | PCATCH)) != 0)
		return (error);
	if ((error = union_vn_create(&uvp, un, td)) != 0) {
		vn_finished_write(mp);
		return (error);
	}

	lvp = un->un_lowervp;

	KASSERT(vrefcnt(uvp) > 0, ("copy: uvp refcount 0: %d", vrefcnt(uvp)));
	if (docopy) {
		/*
		 * XXX - should not ignore errors from VOP_CLOSE().
		 */
		vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY, td);
		error = VOP_OPEN(lvp, FREAD, cred, td);
		if (error == 0 && vn_canvmio(lvp) == TRUE)
			error = vfs_object_create(lvp, td, cred);
		if (error == 0) {
			error = union_copyfile(lvp, uvp, cred, td);
			VOP_UNLOCK(lvp, 0, td);
			(void) VOP_CLOSE(lvp, FREAD, cred, td);
		}
		if (error == 0)
			UDEBUG(("union: copied up %s\n", un->un_path));
	}
	VOP_UNLOCK(uvp, 0, td);
	vn_finished_write(mp);
	union_newupper(un, uvp);
	KASSERT(vrefcnt(uvp) > 0, ("copy: uvp refcount 0: %d", vrefcnt(uvp)));
	union_vn_close(uvp, FWRITE, cred, td);
	KASSERT(vrefcnt(uvp) > 0, ("copy: uvp refcount 0: %d", vrefcnt(uvp)));
	/*
	 * Subsequent IOs will go to the top layer, so
	 * call close on the lower vnode and open on the
	 * upper vnode to ensure that the filesystem keeps
	 * its reference counts right.  This doesn't do
	 * the right thing with (cred) and (FREAD) though.
	 * Ignoring error returns is not right, either.
	 */
	if (error == 0) {
		int i;

		for (i = 0; i < un->un_openl; i++) {
			(void) VOP_CLOSE(lvp, FREAD, cred, td);
			(void) VOP_OPEN(uvp, FREAD, cred, td);
		}
		if (un->un_openl) {
			if (vn_canvmio(uvp) == TRUE)
				error = vfs_object_create(uvp, td, cred);
		}
		un->un_openl = 0;
	}

	return (error);
}

/*
 *	union_relookup:
 *
 *	dvp should be locked on entry and will be locked on return.  No
 *	net change in the ref count will occur.
 *
 *	If an error is returned, *vpp will be invalid, otherwise it
 *	will hold a locked, referenced vnode.  If *vpp == dvp then
 *	remember that only one exclusive lock is held.
 */

static int
union_relookup(um, dvp, vpp, cnp, cn, path, pathlen)
	struct union_mount *um;
	struct vnode *dvp;
	struct vnode **vpp;
	struct componentname *cnp;
	struct componentname *cn;
	char *path;
	int pathlen;
{
	int error;

	/*
	 * A new componentname structure must be faked up because
	 * there is no way to know where the upper level cnp came
	 * from or what it is being used for.  This must duplicate
	 * some of the work done by NDINIT(), some of the work done
	 * by namei(), some of the work done by lookup() and some of
	 * the work done by VOP_LOOKUP() when given a CREATE flag.
	 * Conclusion: Horrible.
	 */
	cn->cn_namelen = pathlen;
	cn->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
	bcopy(path, cn->cn_pnbuf, cn->cn_namelen);
	cn->cn_pnbuf[cn->cn_namelen] = '\0';

	cn->cn_nameiop = CREATE;
	cn->cn_flags = (LOCKPARENT|LOCKLEAF|HASBUF|SAVENAME|ISLASTCN);
	cn->cn_thread = cnp->cn_thread;
	if (um->um_op == UNMNT_ABOVE)
		cn->cn_cred = cnp->cn_cred;
	else
		cn->cn_cred = um->um_cred;
	cn->cn_nameptr = cn->cn_pnbuf;
	cn->cn_consume = cnp->cn_consume;

	VREF(dvp);
	VOP_UNLOCK(dvp, 0, cnp->cn_thread);

	/*
	 * Pass dvp unlocked and referenced on call to relookup().
	 *
	 * If an error occurs, dvp will be returned unlocked and dereferenced.
	 */

	if ((error = relookup(dvp, vpp, cn)) != 0) {
		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, cnp->cn_thread);
		return (error);
	}

	/*
	 * If no error occurs, dvp will be returned locked with the reference
	 * left as before, and vpp will be returned referenced and locked.
	 *
	 * We want to return with dvp as it was passed to us, so we get
	 * rid of our reference.
	 */
	vrele(dvp);
	return (0);
}

/*
 * Create a shadow directory in the upper layer.
 * The new vnode is returned locked.
 *
 * (um) points to the union mount structure for access to the
 * mounting process's credentials.
 * (dvp) is the directory in which to create the shadow directory.
 * It is locked (but not ref'd) on entry and return.
 * (cnp) is the component name to be created.
 * (vpp) is the returned newly created shadow directory, which
 * is returned locked and ref'd.
 */
int
union_mkshadow(um, dvp, cnp, vpp)
	struct union_mount *um;
	struct vnode *dvp;
	struct componentname *cnp;
	struct vnode **vpp;
{
	int error;
	struct vattr va;
	struct thread *td = cnp->cn_thread;
	struct componentname cn;
	struct mount *mp;

	if ((error = vn_start_write(dvp, &mp, V_WAIT | PCATCH)) != 0)
		return (error);
	if ((error = union_relookup(um, dvp, vpp, cnp, &cn,
			cnp->cn_nameptr, cnp->cn_namelen)) != 0) {
		vn_finished_write(mp);
		return (error);
	}

	if (*vpp) {
		if (cn.cn_flags & HASBUF) {
			uma_zfree(namei_zone, cn.cn_pnbuf);
			cn.cn_flags &= ~HASBUF;
		}
		if (dvp == *vpp)
			vrele(*vpp);
		else
			vput(*vpp);
		vn_finished_write(mp);
		*vpp = NULLVP;
		return (EEXIST);
	}

	/*
	 * Policy: when creating the shadow directory in the
	 * upper layer, create it owned by the user who did
	 * the mount, group from parent directory, and mode
	 * 777 modified by umask (ie mostly identical to the
	 * mkdir syscall).  (jsp, kb)
	 */

	VATTR_NULL(&va);
	va.va_type = VDIR;
	va.va_mode = um->um_cmode;

	/* VOP_LEASE: dvp is locked */
	VOP_LEASE(dvp, td, cn.cn_cred, LEASE_WRITE);

	error = VOP_MKDIR(dvp, vpp, &cn, &va);
	if (cn.cn_flags & HASBUF) {
		uma_zfree(namei_zone, cn.cn_pnbuf);
		cn.cn_flags &= ~HASBUF;
	}
	/*vput(dvp);*/
	vn_finished_write(mp);
	return (error);
}

/*
 * Create a whiteout entry in the upper layer.
 *
 * (um) points to the union mount structure for access to the
 * mounting process's credentials.
 * (dvp) is the directory in which to create the whiteout.
 * It is locked on entry and return.
 * (cnp) is the component name to be created.
 */
int
union_mkwhiteout(um, dvp, cnp, path)
	struct union_mount *um;
	struct vnode *dvp;
	struct componentname *cnp;
	char *path;
{
	int error;
	struct thread *td = cnp->cn_thread;
	struct vnode *wvp;
	struct componentname cn;
	struct mount *mp;

	if ((error = vn_start_write(dvp, &mp, V_WAIT | PCATCH)) != 0)
		return (error);
	error = union_relookup(um, dvp, &wvp, cnp, &cn, path, strlen(path));
	if (error) {
		vn_finished_write(mp);
		return (error);
	}

	if (wvp) {
		if (cn.cn_flags & HASBUF) {
			uma_zfree(namei_zone, cn.cn_pnbuf);
			cn.cn_flags &= ~HASBUF;
		}
		if (wvp == dvp)
			vrele(wvp);
		else
			vput(wvp);
		vn_finished_write(mp);
		return (EEXIST);
	}

	/* VOP_LEASE: dvp is locked */
	VOP_LEASE(dvp, td, td->td_ucred, LEASE_WRITE);

	error = VOP_WHITEOUT(dvp, &cn, CREATE);
	if (cn.cn_flags & HASBUF) {
		uma_zfree(namei_zone, cn.cn_pnbuf);
		cn.cn_flags &= ~HASBUF;
	}
	vn_finished_write(mp);
	return (error);
}

/*
 * union_vn_create: creates and opens a new shadow file
 * on the upper union layer.  This function is similar
 * in spirit to calling vn_open() but it avoids calling namei().
 * The problem with calling namei() is that a) it locks too many
 * things, and b) it doesn't start at the "right" directory,
 * whereas relookup() is told where to start.
 *
 * On entry, the vnode associated with un is locked.  It remains locked
 * on return.
 *
 * If no error occurs, *vpp contains a locked referenced vnode for your
 * use.  If an error occurs, *vpp is undefined.
 */
static int
union_vn_create(vpp, un, td)
	struct vnode **vpp;
	struct union_node *un;
	struct thread *td;
{
	struct vnode *vp;
	struct ucred *cred = td->td_ucred;
	struct vattr vat;
	struct vattr *vap = &vat;
	int fmode = FFLAGS(O_WRONLY|O_CREAT|O_TRUNC|O_EXCL);
	int error;
	int cmode;
	struct componentname cn;

	*vpp = NULLVP;
	FILEDESC_LOCK(td->td_proc->p_fd);
	cmode = UN_FILEMODE & ~td->td_proc->p_fd->fd_cmask;
	FILEDESC_UNLOCK(td->td_proc->p_fd);

	/*
	 * Build a new componentname structure (for the same
	 * reasons outlined in union_mkshadow()).
	 * The difference here is that the file is owned by
	 * the current user, rather than by the person who
	 * did the mount, since the current user needs to be
	 * able to write the file (that's why it is being
	 * copied in the first place).
	 */
	cn.cn_namelen = strlen(un->un_path);
	cn.cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
	bcopy(un->un_path, cn.cn_pnbuf, cn.cn_namelen+1);
	cn.cn_nameiop = CREATE;
	cn.cn_flags = (LOCKPARENT|LOCKLEAF|HASBUF|SAVENAME|ISLASTCN);
	cn.cn_thread = td;
	cn.cn_cred = td->td_ucred;
	cn.cn_nameptr = cn.cn_pnbuf;
	cn.cn_consume = 0;

	/*
	 * Pass dvp unlocked and referenced on call to relookup().
	 *
	 * If an error occurs, dvp will be returned unlocked and dereferenced.
	 */
	VREF(un->un_dirvp);
	error = relookup(un->un_dirvp, &vp, &cn);
	if (error)
		return (error);

	/*
	 * If no error occurs, dvp will be returned locked with the reference
	 * left as before, and vpp will be returned referenced and locked.
	 */
	if (vp) {
		vput(un->un_dirvp);
		if (cn.cn_flags & HASBUF) {
			uma_zfree(namei_zone, cn.cn_pnbuf);
			cn.cn_flags &= ~HASBUF;
		}
		if (vp == un->un_dirvp)
			vrele(vp);
		else
			vput(vp);
		return (EEXIST);
	}

	/*
	 * Good - there was no race to create the file
	 * so go ahead and create it.  The permissions
	 * on the file will be 0666 modified by the
	 * current user's umask.  Access to the file, while
	 * it is unioned, will require access to the top *and*
	 * bottom files.  Access when not unioned will simply
	 * require access to the top-level file.
	 * TODO: confirm choice of access permissions.
	 */
	VATTR_NULL(vap);
	vap->va_type = VREG;
	vap->va_mode = cmode;
	VOP_LEASE(un->un_dirvp, td, cred, LEASE_WRITE);
	error = VOP_CREATE(un->un_dirvp, &vp, &cn, vap);
	if (cn.cn_flags & HASBUF) {
		uma_zfree(namei_zone, cn.cn_pnbuf);
		cn.cn_flags &= ~HASBUF;
	}
	vput(un->un_dirvp);
	if (error)
		return (error);

	error = VOP_OPEN(vp, fmode, cred, td);
	if (error == 0 && vn_canvmio(vp) == TRUE)
		error = vfs_object_create(vp, td, cred);
	if (error) {
		vput(vp);
		return (error);
	}
	vp->v_writecount++;
	*vpp = vp;
	return (0);
}

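/*
 * union_vn_close:  Undo the open done by union_vn_create(): drop the
 * writecount taken there and close the vnode.
 */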
static int
union_vn_close(vp, fmode, cred, td)
	struct vnode *vp;
	int fmode;
	struct ucred *cred;
	struct thread *td;
{

	if (fmode & FWRITE)
		--vp->v_writecount;
	return (VOP_CLOSE(vp, fmode, cred, td));
}

#if 0

/*
 *	union_removed_upper:
 *
 *	called with union_node unlocked. XXX
 */

void
union_removed_upper(un)
	struct union_node *un;
{
	struct thread *td = curthread;	/* XXX */
	struct vnode **vpp;

	/*
	 * Do not set the uppervp to NULLVP.  If lowervp is NULLVP,
	 * the union node will have neither uppervp nor lowervp.  We remove
	 * the union node from the cache, so that it will not be referenced.
	 */
	union_newupper(un, NULLVP);
	if (un->un_dircache != NULL)
		union_dircache_free(un);

	if (un->un_flags & UN_CACHED) {
		un->un_flags &= ~UN_CACHED;
		LIST_REMOVE(un, un_cache);
	}
}

#endif

/*
 * Determine whether a whiteout is needed
 * during a remove/rmdir operation.
 */
int
union_dowhiteout(un, cred, td)
	struct union_node *un;
	struct ucred *cred;
	struct thread *td;
{
	struct vattr va;

	if (un->un_lowervp != NULLVP)
		return (1);

	if (VOP_GETATTR(un->un_uppervp, &va, cred, td) == 0 &&
	    (va.va_flags & OPAQUE))
		return (1);

	return (0);
}

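/*
 * union_dircache_r:  Recursively walk the upper and lower layers below
 * (vp).  If (vppp) is NULL, just count the non-union vnodes found in
 * (*cntp); otherwise store a referenced pointer to each one in the
 * table, panicking if the table turns out to be too small.
 */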
static void
union_dircache_r(vp, vppp, cntp)
	struct vnode *vp;
	struct vnode ***vppp;
	int *cntp;
{
	struct union_node *un;

	if (vp->v_op != union_vnodeop_p) {
		if (vppp) {
			VREF(vp);
			*(*vppp)++ = vp;
			if (--(*cntp) == 0)
				panic("union: dircache table too small");
		} else {
			(*cntp)++;
		}
	} else {
		un = VTOUNION(vp);
		if (un->un_uppervp != NULLVP)
			union_dircache_r(un->un_uppervp, vppp, cntp);
		if (un->un_lowervp != NULLVP)
			union_dircache_r(un->un_lowervp, vppp, cntp);
	}
}

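/*
 * union_dircache_get:  Build (or consult) the table of directory
 * vnodes making up the union stack of (vp) and return a new union
 * vnode layered above the next directory in the table, or NULLVP
 * when the table is exhausted.
 */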
struct vnode *
union_dircache_get(vp, td)
	struct vnode *vp;
	struct thread *td;
{
	int cnt;
	struct vnode *nvp;
	struct vnode **vpp;
	struct vnode **dircache, **newdircache;
	struct union_node *un;
	int error;

	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
	un = VTOUNION(vp);
	dircache = un->un_dircache;
	newdircache = NULL;

	nvp = NULLVP;

	if (dircache == NULL) {
		cnt = 0;
		union_dircache_r(vp, NULL, &cnt);
		cnt++;
		newdircache = dircache = malloc(cnt * sizeof(struct vnode *),
						M_UNDCACHE, M_WAITOK);
		vpp = dircache;
		union_dircache_r(vp, &vpp, &cnt);
		*vpp = NULLVP;
		vpp = dircache + 1;
	} else {
		vpp = dircache;
		do {
			if (*vpp++ == un->un_uppervp)
				break;
		} while (*vpp != NULLVP);
	}

	if (*vpp == NULLVP)
		goto out;

	/*vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);*/
	UDEBUG(("ALLOCVP-3 %p ref %d\n", *vpp, (*vpp ? vrefcnt(*vpp) : -99)));
	VREF(*vpp);
	error = union_allocvp(&nvp, vp->v_mount, NULLVP, NULLVP, NULL,
	    *vpp, NULLVP, 0);
	UDEBUG(("ALLOCVP-3B %p ref %d\n", nvp, (*vpp ? vrefcnt(*vpp) : -99)));
	if (error)
		goto out;

	un->un_dircache = NULL;
	VTOUNION(nvp)->un_dircache = dircache;
	newdircache = NULL;

out:
	/*
	 * If we allocated a new dircache and couldn't attach
	 * it to a new vp, free the resources we allocated.
	 */
	if (newdircache) {
		for (vpp = newdircache; *vpp != NULLVP; vpp++)
			vrele(*vpp);
		free(newdircache, M_UNDCACHE);
	}

	VOP_UNLOCK(vp, 0, td);
	return (nvp);
}

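/*
 * union_dircache_free:  Release the vnode references held in the
 * dircache table and free the table itself.
 */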
void
union_dircache_free(struct union_node *un)
{
	struct vnode **vpp;

	for (vpp = un->un_dircache; *vpp != NULLVP; vpp++)
		vrele(*vpp);
	free(un->un_dircache, M_UNDCACHE);
	un->un_dircache = NULL;
}

/*
 * Module glue to remove #ifdef UNION from vfs_syscalls.c
 */
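/*
 * union_dircheck() is installed as the union_dircheckp hook.  It
 * returns 0 to let the directory read proceed normally, an errno on
 * failure, or -1 to tell the caller to restart the read on the vnode
 * now stored in *vp.
 */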
static int
union_dircheck(struct thread *td, struct vnode **vp, struct file *fp)
{
	int error = 0;

	if ((*vp)->v_op == union_vnodeop_p) {
		struct vnode *lvp;

		lvp = union_dircache_get(*vp, td);
		if (lvp != NULLVP) {
			struct vattr va;

			/*
			 * If the directory is opaque,
			 * then don't show lower entries.
			 */
			error = VOP_GETATTR(*vp, &va, fp->f_cred, td);
			if (va.va_flags & OPAQUE) {
				vput(lvp);
				lvp = NULLVP;
			}
		}

		if (lvp != NULLVP) {
			error = VOP_OPEN(lvp, FREAD, fp->f_cred, td);
			if (error == 0 && vn_canvmio(lvp) == TRUE)
				error = vfs_object_create(lvp, td, fp->f_cred);
			if (error) {
				vput(lvp);
				return (error);
			}
			VOP_UNLOCK(lvp, 0, td);
			FILE_LOCK(fp);
			fp->f_data = lvp;
			fp->f_offset = 0;
			FILE_UNLOCK(fp);
			error = vn_close(*vp, FREAD, fp->f_cred, td);
			if (error)
				return (error);
			*vp = lvp;
			return (-1);	/* goto unionread */
		}
	}
	return (error);
}

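/*
 * Install or remove the union_dircheckp hook as the module is loaded
 * or unloaded.
 */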
static int
union_modevent(module_t mod, int type, void *data)
{
	switch (type) {
	case MOD_LOAD:
		union_dircheckp = union_dircheck;
		break;
	case MOD_UNLOAD:
		union_dircheckp = NULL;
		break;
	default:
		break;
	}
	return (0);
}

static moduledata_t union_mod = {
	"union_dircheck",
	union_modevent,
	NULL
};

DECLARE_MODULE(union_dircheck, union_mod, SI_SUB_VFS, SI_ORDER_ANY);