/*
 * Copyright (c) 1994 Jan-Simon Pendry
 * Copyright (c) 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Jan-Simon Pendry.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)union_subr.c	8.20 (Berkeley) 5/20/95
 * $Id: union_subr.c,v 1.19 1997/08/02 14:32:28 bde Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/vnode.h>
#include <sys/namei.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <vm/vm.h>
#include <vm/vm_extern.h>	/* for vnode_pager_setsize */
#include <miscfs/union/union.h>

#include <sys/proc.h>

extern int	union_init __P((void));

/* must be power of two, otherwise change UNION_HASH() */
#define NHASH 32

/* unsigned int ... */
#define UNION_HASH(u, l) \
	(((((unsigned long) (u)) + ((unsigned long) l)) >> 8) & (NHASH-1))
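
/*
 * Note (illustrative): both layer pointers feed the hash, so a node
 * keyed on (uppervp, lowervp) generally lands in a different bucket
 * than one keyed on (uppervp, NULLVP) or (NULLVP, lowervp), which is
 * why union_allocvp() below probes up to three chains.  The >> 8
 * discards the low-order vnode address bits, which carry little
 * entropy:
 *
 *	hash = UNION_HASH(uppervp, lowervp);	(always in [0, NHASH-1])
 */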

static LIST_HEAD(unhead, union_node) unhead[NHASH];
static int unvplock[NHASH];

static void	union_dircache_r __P((struct vnode *vp, struct vnode ***vppp,
				      int *cntp));
static int	union_list_lock __P((int ix));
static void	union_list_unlock __P((int ix));
static int	union_relookup __P((struct union_mount *um, struct vnode *dvp,
				    struct vnode **vpp,
				    struct componentname *cnp,
				    struct componentname *cn, char *path,
				    int pathlen));
extern void	union_updatevp __P((struct union_node *un,
				    struct vnode *uppervp,
				    struct vnode *lowervp));

int
union_init()
{
	int i;

	for (i = 0; i < NHASH; i++)
		LIST_INIT(&unhead[i]);
	bzero((caddr_t) unvplock, sizeof(unvplock));
	return (0);
}

static int
union_list_lock(ix)
	int ix;
{

	if (unvplock[ix] & UN_LOCKED) {
		unvplock[ix] |= UN_WANT;
		(void) tsleep((caddr_t) &unvplock[ix], PINOD, "unllck", 0);
		return (1);
	}

	unvplock[ix] |= UN_LOCKED;

	return (0);
}

static void
union_list_unlock(ix)
	int ix;
{

	unvplock[ix] &= ~UN_LOCKED;

	if (unvplock[ix] & UN_WANT) {
		unvplock[ix] &= ~UN_WANT;
		wakeup((caddr_t) &unvplock[ix]);
	}
}
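
/*
 * A sketch of the locking protocol implemented above: union_list_lock()
 * returns non-zero after sleeping if the slot was busy, so callers must
 * loop until it returns 0 holding the lock:
 *
 *	while (union_list_lock(hash))
 *		continue;
 *	... manipulate unhead[hash] ...
 *	union_list_unlock(hash);
 */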

void
union_updatevp(un, uppervp, lowervp)
	struct union_node *un;
	struct vnode *uppervp;
	struct vnode *lowervp;
{
	int ohash = UNION_HASH(un->un_uppervp, un->un_lowervp);
	int nhash = UNION_HASH(uppervp, lowervp);
	int docache = (lowervp != NULLVP || uppervp != NULLVP);
	int lhash, uhash;

	/*
	 * Ensure locking is ordered from lower to higher
	 * to avoid deadlocks.
	 */
	if (nhash < ohash) {
		lhash = nhash;
		uhash = ohash;
	} else {
		lhash = ohash;
		uhash = nhash;
	}

	if (lhash != uhash)
		while (union_list_lock(lhash))
			continue;

	while (union_list_lock(uhash))
		continue;

	if (ohash != nhash || !docache) {
		if (un->un_flags & UN_CACHED) {
			un->un_flags &= ~UN_CACHED;
			LIST_REMOVE(un, un_cache);
		}
	}

	if (ohash != nhash)
		union_list_unlock(ohash);

	if (un->un_lowervp != lowervp) {
		if (un->un_lowervp) {
			vrele(un->un_lowervp);
			if (un->un_path) {
				free(un->un_path, M_TEMP);
				un->un_path = 0;
			}
			if (un->un_dirvp) {
				vrele(un->un_dirvp);
				un->un_dirvp = NULLVP;
			}
		}
		un->un_lowervp = lowervp;
		un->un_lowersz = VNOVAL;
	}

	if (un->un_uppervp != uppervp) {
		if (un->un_uppervp)
			vrele(un->un_uppervp);

		un->un_uppervp = uppervp;
		un->un_uppersz = VNOVAL;
	}

	if (docache && (ohash != nhash)) {
		LIST_INSERT_HEAD(&unhead[nhash], un, un_cache);
		un->un_flags |= UN_CACHED;
	}

	union_list_unlock(nhash);
}

void
union_newlower(un, lowervp)
	struct union_node *un;
	struct vnode *lowervp;
{

	union_updatevp(un, un->un_uppervp, lowervp);
}

void
union_newupper(un, uppervp)
	struct union_node *un;
	struct vnode *uppervp;
{

	union_updatevp(un, uppervp, un->un_lowervp);
}

/*
 * Keep track of size changes in the underlying vnodes.
 * If the size changes, then call back to the vm layer
 * giving priority to the upper layer size.
 */
void
union_newsize(vp, uppersz, lowersz)
	struct vnode *vp;
	off_t uppersz, lowersz;
{
	struct union_node *un;
	off_t sz;

	/* only interested in regular files */
	if (vp->v_type != VREG)
		return;

	un = VTOUNION(vp);
	sz = VNOVAL;

	if ((uppersz != VNOVAL) && (un->un_uppersz != uppersz)) {
		un->un_uppersz = uppersz;
		if (sz == VNOVAL)
			sz = un->un_uppersz;
	}

	if ((lowersz != VNOVAL) && (un->un_lowersz != lowersz)) {
		un->un_lowersz = lowersz;
		if (sz == VNOVAL)
			sz = un->un_lowersz;
	}

	if (sz != VNOVAL) {
#ifdef UNION_DIAGNOSTIC
		printf("union: %s size now %ld\n",
			uppersz != VNOVAL ? "upper" : "lower", (long) sz);
#endif
		vnode_pager_setsize(vp, sz);
	}
}
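
/*
 * Callers typically report only the layer whose size they have just
 * learned, and pass VNOVAL for the other, e.g. (illustrative):
 *
 *	union_newsize(vp, va.va_size, VNOVAL);
 *
 * The VNOVAL tests above then give the upper size priority when both
 * layers change at once.
 */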

/*
 * allocate a union_node/vnode pair.  the vnode is
 * referenced and locked.  the new vnode is returned
 * via (vpp).  (mp) is the mountpoint of the union filesystem,
 * (dvp) is the parent directory where the upper layer object
 * should exist (but doesn't) and (cnp) is the componentname
 * information which is partially copied to allow the upper
 * layer object to be created at a later time.  (uppervp)
 * and (lowervp) reference the upper and lower layer objects
 * being mapped.  either, but not both, can be nil.
 * if supplied, (uppervp) is locked.
 * the reference is either maintained in the new union_node
 * object which is allocated, or they are vrele'd.
 *
 * all union_nodes are maintained on a singly-linked
 * list.  new nodes are only allocated when they cannot
 * be found on this list.  entries on the list are
 * removed when the vfs reclaim entry is called.
 *
 * a single lock is kept for the entire list.  this is
 * needed because the getnewvnode() function can block
 * waiting for a vnode to become free, in which case there
 * may be more than one process trying to get the same
 * vnode.  this lock is only taken if we are going to
 * call getnewvnode, since the kernel itself is single-threaded.
 *
 * if an entry is found on the list, then call vget() to
 * take a reference.  this is done because there may be
 * zero references to it and so it needs to be removed from
 * the vnode free list.
 */
int
union_allocvp(vpp, mp, undvp, dvp, cnp, uppervp, lowervp, docache)
	struct vnode **vpp;
	struct mount *mp;
	struct vnode *undvp;		/* parent union vnode */
	struct vnode *dvp;		/* may be null */
	struct componentname *cnp;	/* may be null */
	struct vnode *uppervp;		/* may be null */
	struct vnode *lowervp;		/* may be null */
	int docache;
{
	int error;
	struct union_node *un = 0;
	struct vnode *xlowervp = NULLVP;
	struct union_mount *um = MOUNTTOUNIONMOUNT(mp);
	int hash;
	int vflag;
	int try;

	if (uppervp == NULLVP && lowervp == NULLVP)
		panic("union: unidentifiable allocation");

	if (uppervp && lowervp && (uppervp->v_type != lowervp->v_type)) {
		xlowervp = lowervp;
		lowervp = NULLVP;
	}

	/* detect the root vnode (and aliases) */
	vflag = 0;
	if ((uppervp == um->um_uppervp) &&
	    ((lowervp == NULLVP) || lowervp == um->um_lowervp)) {
		if (lowervp == NULLVP) {
			lowervp = um->um_lowervp;
			if (lowervp != NULLVP)
				VREF(lowervp);
		}
		vflag = VROOT;
	}

loop:
	if (!docache) {
		un = 0;
	} else for (try = 0; try < 3; try++) {
		switch (try) {
		case 0:
			if (lowervp == NULLVP)
				continue;
			hash = UNION_HASH(uppervp, lowervp);
			break;

		case 1:
			if (uppervp == NULLVP)
				continue;
			hash = UNION_HASH(uppervp, NULLVP);
			break;

		case 2:
			if (lowervp == NULLVP)
				continue;
			hash = UNION_HASH(NULLVP, lowervp);
			break;
		}

		while (union_list_lock(hash))
			continue;

		for (un = unhead[hash].lh_first; un != 0;
					un = un->un_cache.le_next) {
			if ((un->un_lowervp == lowervp ||
			     un->un_lowervp == NULLVP) &&
			    (un->un_uppervp == uppervp ||
			     un->un_uppervp == NULLVP) &&
			    (UNIONTOV(un)->v_mount == mp)) {
				if (vget(UNIONTOV(un), 0,
				    cnp ? cnp->cn_proc : NULL)) {
					union_list_unlock(hash);
					goto loop;
				}
				break;
			}
		}

		union_list_unlock(hash);

		if (un)
			break;
	}

	if (un) {
		/*
		 * Obtain a lock on the union_node.
		 * uppervp is locked, though un->un_uppervp
		 * may not be.  this doesn't break the locking
		 * hierarchy since in the case that un->un_uppervp
		 * is not yet locked it will be vrele'd and replaced
		 * with uppervp.
		 */

		if ((dvp != NULLVP) && (uppervp == dvp)) {
			/*
			 * Access ``.'', so (un) will already
			 * be locked.  Since this process has
			 * the lock on (uppervp) no other
			 * process can hold the lock on (un).
			 */
#ifdef DIAGNOSTIC
			if ((un->un_flags & UN_LOCKED) == 0)
				panic("union: . not locked");
			else if (curproc && un->un_pid != curproc->p_pid &&
				    un->un_pid > -1 && curproc->p_pid > -1)
				panic("union: allocvp not lock owner");
#endif
		} else {
			if (un->un_flags & UN_LOCKED) {
				vrele(UNIONTOV(un));
				un->un_flags |= UN_WANT;
				(void) tsleep((caddr_t) &un->un_flags, PINOD, "unalvp", 0);
				goto loop;
			}
			un->un_flags |= UN_LOCKED;

#ifdef DIAGNOSTIC
			if (curproc)
				un->un_pid = curproc->p_pid;
			else
				un->un_pid = -1;
#endif
		}

		/*
		 * At this point, the union_node is locked,
		 * un->un_uppervp may not be locked, and uppervp
		 * is locked or nil.
		 */

		/*
		 * Save information about the upper layer.
		 */
		if (uppervp != un->un_uppervp) {
			union_newupper(un, uppervp);
		} else if (uppervp) {
			vrele(uppervp);
		}

		if (un->un_uppervp) {
			un->un_flags |= UN_ULOCK;
			un->un_flags &= ~UN_KLOCK;
		}

		/*
		 * Save information about the lower layer.
		 * This needs to keep track of pathname
		 * and directory information which union_vn_create
		 * might need.
		 */
		if (lowervp != un->un_lowervp) {
			union_newlower(un, lowervp);
			if (cnp && (lowervp != NULLVP)) {
				un->un_hash = cnp->cn_hash;
				un->un_path = malloc(cnp->cn_namelen+1,
						M_TEMP, M_WAITOK);
				bcopy(cnp->cn_nameptr, un->un_path,
						cnp->cn_namelen);
				un->un_path[cnp->cn_namelen] = '\0';
				VREF(dvp);
				un->un_dirvp = dvp;
			}
		} else if (lowervp) {
			vrele(lowervp);
		}
		*vpp = UNIONTOV(un);
		return (0);
	}

	if (docache) {
		/*
		 * otherwise lock the vp list while we call getnewvnode
		 * since that can block.
		 */
		hash = UNION_HASH(uppervp, lowervp);

		if (union_list_lock(hash))
			goto loop;
	}

	error = getnewvnode(VT_UNION, mp, union_vnodeop_p, vpp);
	if (error) {
		if (uppervp) {
			if (dvp == uppervp)
				vrele(uppervp);
			else
				vput(uppervp);
		}
		if (lowervp)
			vrele(lowervp);

		goto out;
	}

	MALLOC((*vpp)->v_data, void *, sizeof(struct union_node),
		M_TEMP, M_WAITOK);

	(*vpp)->v_flag |= vflag;
	if (uppervp)
		(*vpp)->v_type = uppervp->v_type;
	else
		(*vpp)->v_type = lowervp->v_type;
	un = VTOUNION(*vpp);
	un->un_vnode = *vpp;
	un->un_uppervp = uppervp;
	un->un_uppersz = VNOVAL;
	un->un_lowervp = lowervp;
	un->un_lowersz = VNOVAL;
	un->un_pvp = undvp;
	if (undvp != NULLVP)
		VREF(undvp);
	un->un_dircache = 0;
	un->un_openl = 0;
	un->un_flags = UN_LOCKED;
	if (un->un_uppervp)
		un->un_flags |= UN_ULOCK;
#ifdef DIAGNOSTIC
	if (curproc)
		un->un_pid = curproc->p_pid;
	else
		un->un_pid = -1;
#endif
	if (cnp && (lowervp != NULLVP)) {
		un->un_hash = cnp->cn_hash;
		un->un_path = malloc(cnp->cn_namelen+1, M_TEMP, M_WAITOK);
		bcopy(cnp->cn_nameptr, un->un_path, cnp->cn_namelen);
		un->un_path[cnp->cn_namelen] = '\0';
		VREF(dvp);
		un->un_dirvp = dvp;
	} else {
		un->un_hash = 0;
		un->un_path = 0;
		un->un_dirvp = 0;
	}

	if (docache) {
		LIST_INSERT_HEAD(&unhead[hash], un, un_cache);
		un->un_flags |= UN_CACHED;
	}

	if (xlowervp)
		vrele(xlowervp);

out:
	if (docache)
		union_list_unlock(hash);

	return (error);
}
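
/*
 * Hypothetical caller sketch: a lookup routine holding a locked upper
 * vnode and a referenced lower vnode would obtain the merged vnode with
 *
 *	error = union_allocvp(&vp, mp, undvp, dvp, cnp, uppervp, lowervp, 1);
 *
 * and on success gets (vp) back locked and referenced, with the layer
 * references now owned by the union_node.
 */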

int
union_freevp(vp)
	struct vnode *vp;
{
	struct union_node *un = VTOUNION(vp);

	if (un->un_flags & UN_CACHED) {
		un->un_flags &= ~UN_CACHED;
		LIST_REMOVE(un, un_cache);
	}

	if (un->un_pvp != NULLVP)
		vrele(un->un_pvp);
	if (un->un_uppervp != NULLVP)
		vrele(un->un_uppervp);
	if (un->un_lowervp != NULLVP)
		vrele(un->un_lowervp);
	if (un->un_dirvp != NULLVP)
		vrele(un->un_dirvp);
	if (un->un_path)
		free(un->un_path, M_TEMP);

	FREE(vp->v_data, M_TEMP);
	vp->v_data = 0;

	return (0);
}

/*
 * copyfile.  copy the vnode (fvp) to the vnode (tvp)
 * using a sequence of reads and writes.  both (fvp)
 * and (tvp) are locked on entry and exit.
 */
int
union_copyfile(fvp, tvp, cred, p)
	struct vnode *fvp;
	struct vnode *tvp;
	struct ucred *cred;
	struct proc *p;
{
	char *buf;
	struct uio uio;
	struct iovec iov;
	int error = 0;

	/*
	 * strategy:
	 * allocate a buffer of size MAXBSIZE.
	 * loop doing reads and writes, keeping track
	 * of the current uio offset.
	 * give up at the first sign of trouble.
	 */

	uio.uio_procp = p;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_offset = 0;

	VOP_UNLOCK(fvp, 0, p);				/* XXX */
	VOP_LEASE(fvp, p, cred, LEASE_READ);
	vn_lock(fvp, LK_EXCLUSIVE | LK_RETRY, p);	/* XXX */
	VOP_UNLOCK(tvp, 0, p);				/* XXX */
	VOP_LEASE(tvp, p, cred, LEASE_WRITE);
	vn_lock(tvp, LK_EXCLUSIVE | LK_RETRY, p);	/* XXX */

	buf = malloc(MAXBSIZE, M_TEMP, M_WAITOK);

	/* ugly loop follows... */
	do {
		off_t offset = uio.uio_offset;

		uio.uio_iov = &iov;
		uio.uio_iovcnt = 1;
		iov.iov_base = buf;
		iov.iov_len = MAXBSIZE;
		uio.uio_resid = iov.iov_len;
		uio.uio_rw = UIO_READ;
		error = VOP_READ(fvp, &uio, 0, cred);

		if (error == 0) {
			uio.uio_iov = &iov;
			uio.uio_iovcnt = 1;
			iov.iov_base = buf;
			iov.iov_len = MAXBSIZE - uio.uio_resid;
			uio.uio_offset = offset;
			uio.uio_rw = UIO_WRITE;
			uio.uio_resid = iov.iov_len;

			if (uio.uio_resid == 0)
				break;

			do {
				error = VOP_WRITE(tvp, &uio, 0, cred);
			} while ((uio.uio_resid > 0) && (error == 0));
		}

	} while (error == 0);

	free(buf, M_TEMP);
	return (error);
}

/*
 * (un) is assumed to be locked on entry and remains
 * locked on exit.
 */
int
union_copyup(un, docopy, cred, p)
	struct union_node *un;
	int docopy;
	struct ucred *cred;
	struct proc *p;
{
	int error;
	struct vnode *lvp, *uvp;

	/*
	 * If the user does not have read permission, the vnode should not
	 * be copied to the upper layer.
	 */
	vn_lock(un->un_lowervp, LK_EXCLUSIVE | LK_RETRY, p);
	error = VOP_ACCESS(un->un_lowervp, VREAD, cred, p);
	VOP_UNLOCK(un->un_lowervp, 0, p);
	if (error)
		return (error);

	error = union_vn_create(&uvp, un, p);
	if (error)
		return (error);

	/* at this point, uppervp is locked */
	union_newupper(un, uvp);
	un->un_flags |= UN_ULOCK;

	lvp = un->un_lowervp;

	if (docopy) {
		/*
		 * XXX - should not ignore errors
		 * from VOP_CLOSE
		 */
		vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY, p);
		error = VOP_OPEN(lvp, FREAD, cred, p);
		if (error == 0) {
			error = union_copyfile(lvp, uvp, cred, p);
			VOP_UNLOCK(lvp, 0, p);
			(void) VOP_CLOSE(lvp, FREAD, cred, p);
		}
#ifdef UNION_DIAGNOSTIC
		if (error == 0)
			uprintf("union: copied up %s\n", un->un_path);
#endif
	}
	un->un_flags &= ~UN_ULOCK;
	VOP_UNLOCK(uvp, 0, p);
	union_vn_close(uvp, FWRITE, cred, p);
	vn_lock(uvp, LK_EXCLUSIVE | LK_RETRY, p);
	un->un_flags |= UN_ULOCK;

	/*
	 * Subsequent IOs will go to the top layer, so
	 * call close on the lower vnode and open on the
	 * upper vnode to ensure that the filesystem keeps
	 * its reference counts right.  This doesn't do
	 * the right thing with (cred) and (FREAD) though.
	 * Ignoring error returns is not right, either.
	 */
	if (error == 0) {
		int i;

		for (i = 0; i < un->un_openl; i++) {
			(void) VOP_CLOSE(lvp, FREAD, cred, p);
			(void) VOP_OPEN(uvp, FREAD, cred, p);
		}
		un->un_openl = 0;
	}

	return (error);
}

static int
union_relookup(um, dvp, vpp, cnp, cn, path, pathlen)
	struct union_mount *um;
	struct vnode *dvp;
	struct vnode **vpp;
	struct componentname *cnp;
	struct componentname *cn;
	char *path;
	int pathlen;
{
	int error;

	/*
	 * A new componentname structure must be faked up because
	 * there is no way to know where the upper level cnp came
	 * from or what it is being used for.  This must duplicate
	 * some of the work done by NDINIT, some of the work done
	 * by namei, some of the work done by lookup and some of
	 * the work done by VOP_LOOKUP when given a CREATE flag.
	 * Conclusion: Horrible.
	 *
	 * The pathname buffer will be FREEed by VOP_MKDIR.
	 */
	cn->cn_namelen = pathlen;
	cn->cn_pnbuf = malloc(cn->cn_namelen+1, M_NAMEI, M_WAITOK);
	bcopy(path, cn->cn_pnbuf, cn->cn_namelen);
	cn->cn_pnbuf[cn->cn_namelen] = '\0';

	cn->cn_nameiop = CREATE;
	cn->cn_flags = (LOCKPARENT|HASBUF|SAVENAME|SAVESTART|ISLASTCN);
	cn->cn_proc = cnp->cn_proc;
	if (um->um_op == UNMNT_ABOVE)
		cn->cn_cred = cnp->cn_cred;
	else
		cn->cn_cred = um->um_cred;
	cn->cn_nameptr = cn->cn_pnbuf;
	cn->cn_hash = cnp->cn_hash;
	cn->cn_consume = cnp->cn_consume;

	VREF(dvp);
	error = relookup(dvp, vpp, cn);
	if (!error)
		vrele(dvp);
	else {
		free(cn->cn_pnbuf, M_NAMEI);
		cn->cn_pnbuf = NULL;
	}

	return (error);
}

/*
 * Create a shadow directory in the upper layer.
 * The new vnode is returned locked.
 *
 * (um) points to the union mount structure for access to the
 * mounting process's credentials.
 * (dvp) is the directory in which to create the shadow directory.
 * it is unlocked on entry and exit.
 * (cnp) is the componentname to be created.
 * (vpp) is the returned newly created shadow directory, which
 * is returned locked.
 */
int
union_mkshadow(um, dvp, cnp, vpp)
	struct union_mount *um;
	struct vnode *dvp;
	struct componentname *cnp;
	struct vnode **vpp;
{
	int error;
	struct vattr va;
	struct proc *p = cnp->cn_proc;
	struct componentname cn;

	error = union_relookup(um, dvp, vpp, cnp, &cn,
			cnp->cn_nameptr, cnp->cn_namelen);
	if (error)
		return (error);

	if (*vpp) {
		VOP_ABORTOP(dvp, &cn);
		VOP_UNLOCK(dvp, 0, p);
		vrele(*vpp);
		*vpp = NULLVP;
		return (EEXIST);
	}

	/*
	 * policy: when creating the shadow directory in the
	 * upper layer, create it owned by the user who did
	 * the mount, group from parent directory, and mode
	 * 777 modified by umask (ie mostly identical to the
	 * mkdir syscall).  (jsp, kb)
	 */

	VATTR_NULL(&va);
	va.va_type = VDIR;
	va.va_mode = um->um_cmode;

	/* VOP_LEASE: dvp is locked */
	VOP_LEASE(dvp, p, cn.cn_cred, LEASE_WRITE);

	error = VOP_MKDIR(dvp, vpp, &cn, &va);
	return (error);
}

/*
 * Create a whiteout entry in the upper layer.
 *
 * (um) points to the union mount structure for access to the
 * mounting process's credentials.
 * (dvp) is the directory in which to create the whiteout.
 * it is locked on entry and exit.
 * (cnp) is the componentname to be created.
 */
int
union_mkwhiteout(um, dvp, cnp, path)
	struct union_mount *um;
	struct vnode *dvp;
	struct componentname *cnp;
	char *path;
{
	int error;
	struct proc *p = cnp->cn_proc;
	struct vnode *wvp;
	struct componentname cn;

	VOP_UNLOCK(dvp, 0, p);
	error = union_relookup(um, dvp, &wvp, cnp, &cn, path, strlen(path));
	if (error) {
		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, p);
		return (error);
	}

	if (wvp) {
		VOP_ABORTOP(dvp, &cn);
		vrele(dvp);
		vrele(wvp);
		return (EEXIST);
	}

	/* VOP_LEASE: dvp is locked */
	VOP_LEASE(dvp, p, p->p_ucred, LEASE_WRITE);

	error = VOP_WHITEOUT(dvp, &cn, CREATE);
	if (error)
		VOP_ABORTOP(dvp, &cn);

	vrele(dvp);

	return (error);
}

/*
 * union_vn_create: creates and opens a new shadow file
 * on the upper union layer.  this function is similar
 * in spirit to calling vn_open but it avoids calling namei().
 * the problem with calling namei is that a) it locks too many
 * things, and b) it doesn't start at the "right" directory,
 * whereas relookup is told where to start.
 */
int
union_vn_create(vpp, un, p)
	struct vnode **vpp;
	struct union_node *un;
	struct proc *p;
{
	struct vnode *vp;
	struct ucred *cred = p->p_ucred;
	struct vattr vat;
	struct vattr *vap = &vat;
	int fmode = FFLAGS(O_WRONLY|O_CREAT|O_TRUNC|O_EXCL);
	int error;
	int cmode = UN_FILEMODE & ~p->p_fd->fd_cmask;
	struct componentname cn;

	*vpp = NULLVP;

	/*
	 * Build a new componentname structure (for the same
	 * reasons outlined in union_mkshadow).
	 * The difference here is that the file is owned by
	 * the current user, rather than by the person who
	 * did the mount, since the current user needs to be
	 * able to write the file (that's why it is being
	 * copied in the first place).
	 */
	cn.cn_namelen = strlen(un->un_path);
	cn.cn_pnbuf = (caddr_t) malloc(cn.cn_namelen+1, M_NAMEI, M_WAITOK);
	bcopy(un->un_path, cn.cn_pnbuf, cn.cn_namelen+1);
	cn.cn_nameiop = CREATE;
	cn.cn_flags = (LOCKPARENT|HASBUF|SAVENAME|SAVESTART|ISLASTCN);
	cn.cn_proc = p;
	cn.cn_cred = p->p_ucred;
	cn.cn_nameptr = cn.cn_pnbuf;
	cn.cn_hash = un->un_hash;
	cn.cn_consume = 0;

	VREF(un->un_dirvp);
	error = relookup(un->un_dirvp, &vp, &cn);
	if (error)
		return (error);
	vrele(un->un_dirvp);

	if (vp) {
		VOP_ABORTOP(un->un_dirvp, &cn);
		if (un->un_dirvp == vp)
			vrele(un->un_dirvp);
		else
			vput(un->un_dirvp);
		vrele(vp);
		return (EEXIST);
	}

	/*
	 * Good - there was no race to create the file
	 * so go ahead and create it.  The permissions
	 * on the file will be 0666 modified by the
	 * current user's umask.  Access to the file, while
	 * it is unioned, will require access to the top *and*
	 * bottom files.  Access when not unioned will simply
	 * require access to the top-level file.
	 * TODO: confirm choice of access permissions.
	 */
	VATTR_NULL(vap);
	vap->va_type = VREG;
	vap->va_mode = cmode;
	VOP_LEASE(un->un_dirvp, p, cred, LEASE_WRITE);
	if ((error = VOP_CREATE(un->un_dirvp, &vp, &cn, vap)) != 0)
		return (error);

	error = VOP_OPEN(vp, fmode, cred, p);
	if (error) {
		vput(vp);
		return (error);
	}

	vp->v_writecount++;
	*vpp = vp;
	return (0);
}

int
union_vn_close(vp, fmode, cred, p)
	struct vnode *vp;
	int fmode;
	struct ucred *cred;
	struct proc *p;
{

	if (fmode & FWRITE)
		--vp->v_writecount;
	return (VOP_CLOSE(vp, fmode, cred, p));
}

void
union_removed_upper(un)
	struct union_node *un;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode **vpp;

	/*
	 * Do not set the uppervp to NULLVP.  If lowervp is NULLVP, the
	 * union node will have neither uppervp nor lowervp.  We remove
	 * the union node from the cache, so that it will not be referenced.
	 */
#if 0
	union_newupper(un, NULLVP);
#endif
	if (un->un_dircache != 0) {
		for (vpp = un->un_dircache; *vpp != NULLVP; vpp++)
			vrele(*vpp);
		free(un->un_dircache, M_TEMP);
		un->un_dircache = 0;
	}

	if (un->un_flags & UN_CACHED) {
		un->un_flags &= ~UN_CACHED;
		LIST_REMOVE(un, un_cache);
	}

	if (un->un_flags & UN_ULOCK) {
		un->un_flags &= ~UN_ULOCK;
		VOP_UNLOCK(un->un_uppervp, 0, p);
	}
}

#if 0
struct vnode *
union_lowervp(vp)
	struct vnode *vp;
{
	struct union_node *un = VTOUNION(vp);

	if ((un->un_lowervp != NULLVP) &&
	    (vp->v_type == un->un_lowervp->v_type)) {
		if (vget(un->un_lowervp, 0) == 0)
			return (un->un_lowervp);
	}

	return (NULLVP);
}
#endif

/*
 * determine whether a whiteout is needed
 * during a remove/rmdir operation.
 */
int
union_dowhiteout(un, cred, p)
	struct union_node *un;
	struct ucred *cred;
	struct proc *p;
{
	struct vattr va;

	if (un->un_lowervp != NULLVP)
		return (1);

	if (VOP_GETATTR(un->un_uppervp, &va, cred, p) == 0 &&
	    (va.va_flags & OPAQUE))
		return (1);

	return (0);
}

static void
union_dircache_r(vp, vppp, cntp)
	struct vnode *vp;
	struct vnode ***vppp;
	int *cntp;
{
	struct union_node *un;

	if (vp->v_op != union_vnodeop_p) {
		if (vppp) {
			VREF(vp);
			*(*vppp)++ = vp;
			if (--(*cntp) == 0)
				panic("union: dircache table too small");
		} else {
			(*cntp)++;
		}

		return;
	}

	un = VTOUNION(vp);
	if (un->un_uppervp != NULLVP)
		union_dircache_r(un->un_uppervp, vppp, cntp);
	if (un->un_lowervp != NULLVP)
		union_dircache_r(un->un_lowervp, vppp, cntp);
}
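
/*
 * union_dircache_r() above is used in two passes: with a nil (vppp) it
 * merely counts the non-union vnodes reachable from (vp); given a table
 * it stores a referenced pointer to each, upper layers first.  The
 * result is a NULLVP-terminated array which union_dircache() below
 * hands out one directory at a time.
 */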

struct vnode *
union_dircache(vp, p)
	struct vnode *vp;
	struct proc *p;
{
	int cnt;
	struct vnode *nvp;
	struct vnode **vpp;
	struct vnode **dircache;
	struct union_node *un;
	int error;

	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
	dircache = VTOUNION(vp)->un_dircache;

	nvp = NULLVP;

	if (dircache == 0) {
		cnt = 0;
		union_dircache_r(vp, 0, &cnt);
		cnt++;
		dircache = (struct vnode **)
				malloc(cnt * sizeof(struct vnode *),
					M_TEMP, M_WAITOK);
		vpp = dircache;
		union_dircache_r(vp, &vpp, &cnt);
		*vpp = NULLVP;
		vpp = dircache + 1;
	} else {
		vpp = dircache;
		do {
			if (*vpp++ == VTOUNION(vp)->un_uppervp)
				break;
		} while (*vpp != NULLVP);
	}

	if (*vpp == NULLVP)
		goto out;

	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, p);
	VREF(*vpp);
	error = union_allocvp(&nvp, vp->v_mount, NULLVP, NULLVP, 0, *vpp, NULLVP, 0);
	if (error)
		goto out;

	VTOUNION(vp)->un_dircache = 0;
	un = VTOUNION(nvp);
	un->un_dircache = dircache;

out:
	VOP_UNLOCK(vp, 0, p);
	return (nvp);
}