/*	$NetBSD: union_subr.c,v 1.82 2022/07/18 04:30:30 thorpej Exp $	*/

/*
 * Copyright (c) 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Jan-Simon Pendry.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)union_subr.c	8.20 (Berkeley) 5/20/95
 */

/*
 * Copyright (c) 1994 Jan-Simon Pendry
 *
 * This code is derived from software contributed to Berkeley by
 * Jan-Simon Pendry.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)union_subr.c	8.20 (Berkeley) 5/20/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: union_subr.c,v 1.82 2022/07/18 04:30:30 thorpej Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/namei.h>
#include <sys/malloc.h>
#include <sys/dirent.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/queue.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/kauth.h>

#include <uvm/uvm_extern.h>

#include <fs/union/union.h>
#include <miscfs/genfs/genfs.h>
#include <miscfs/specfs/specdev.h>

static LIST_HEAD(uhashhead, union_node) *uhashtbl;
static u_long uhash_mask;		/* size of hash table - 1 */
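/*
 * Hash on the sum of the upper and lower vnode addresses, shifted
 * right to discard the low-order bits, which carry little information
 * (presumably due to allocation alignment).
 */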
#define UNION_HASH(u, l) \
	((((u_long) (u) + (u_long) (l)) >> 8) & uhash_mask)
#define NOHASH	((u_long)-1)

static kmutex_t uhash_lock;

static void union_newupper(struct union_node *, struct vnode *);
static void union_newlower(struct union_node *, struct vnode *);
static void union_ref(struct union_node *);
static void union_rele(struct union_node *);
static int union_do_lookup(struct vnode *, struct componentname *,
    kauth_cred_t, const char *);
int union_vn_close(struct vnode *, int, kauth_cred_t, struct lwp *);
static void union_dircache_r(struct vnode *, struct vnode ***, int *);
struct vnode *union_dircache(struct vnode *, struct lwp *);

void
union_init(void)
{

	mutex_init(&uhash_lock, MUTEX_DEFAULT, IPL_NONE);
	uhashtbl = hashinit(desiredvnodes, HASH_LIST, true, &uhash_mask);
}

void
union_reinit(void)
{
	struct union_node *un;
	struct uhashhead *oldhash, *hash;
	u_long oldmask, mask, val;
	int i;

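	/*
	 * Allocate a table sized for the current desiredvnodes and
	 * rehash every cached union node into it.
	 */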
	hash = hashinit(desiredvnodes, HASH_LIST, true, &mask);
	mutex_enter(&uhash_lock);
	oldhash = uhashtbl;
	oldmask = uhash_mask;
	uhashtbl = hash;
	uhash_mask = mask;
	for (i = 0; i <= oldmask; i++) {
		while ((un = LIST_FIRST(&oldhash[i])) != NULL) {
			LIST_REMOVE(un, un_cache);
			val = UNION_HASH(un->un_uppervp, un->un_lowervp);
			LIST_INSERT_HEAD(&hash[val], un, un_cache);
		}
	}
	mutex_exit(&uhash_lock);
	hashdone(oldhash, HASH_LIST, oldmask);
}

/*
 * Free global unionfs resources.
 */
void
union_done(void)
{

	hashdone(uhashtbl, HASH_LIST, uhash_mask);
	mutex_destroy(&uhash_lock);

	/* Make sure to unset the readdir hook. */
	vn_union_readdir_hook = NULL;
}

void
union_newlower(struct union_node *un, struct vnode *lowervp)
{
	int ohash = UNION_HASH(un->un_uppervp, un->un_lowervp);
	int nhash = UNION_HASH(un->un_uppervp, lowervp);

	if (un->un_lowervp == lowervp)
		return;

	KASSERT(VOP_ISLOCKED(UNIONTOV(un)) == LK_EXCLUSIVE);
	KASSERT(un->un_lowervp == NULL);

	mutex_enter(&uhash_lock);

	if (ohash != nhash && (un->un_cflags & UN_CACHED)) {
		un->un_cflags &= ~UN_CACHED;
		LIST_REMOVE(un, un_cache);
	}
	mutex_enter(&un->un_lock);
	un->un_lowervp = lowervp;
	un->un_lowersz = VNOVAL;
	mutex_exit(&un->un_lock);
	if (ohash != nhash) {
		LIST_INSERT_HEAD(&uhashtbl[nhash], un, un_cache);
		un->un_cflags |= UN_CACHED;
	}

	mutex_exit(&uhash_lock);
}

void
union_newupper(struct union_node *un, struct vnode *uppervp)
{
	int ohash = UNION_HASH(un->un_uppervp, un->un_lowervp);
	int nhash = UNION_HASH(uppervp, un->un_lowervp);
	struct vop_lock_args lock_ap;
	struct vop_unlock_args unlock_ap;
	int error __diagused;

	if (un->un_uppervp == uppervp)
		return;

	KASSERT(VOP_ISLOCKED(UNIONTOV(un)) == LK_EXCLUSIVE);
	KASSERT(un->un_uppervp == NULL);

	/*
	 * We have to transfer the vnode lock from the union vnode to
	 * the upper vnode.  Lock the upper vnode first.  We cannot use
	 * VOP_LOCK() here as it would break the fstrans state.
	 */
	lock_ap.a_desc = VDESC(vop_lock);
	lock_ap.a_vp = uppervp;
	lock_ap.a_flags = LK_EXCLUSIVE;
	error = VCALL(lock_ap.a_vp, VOFFSET(vop_lock), &lock_ap);
	KASSERT(error == 0);

	mutex_enter(&uhash_lock);

	if (ohash != nhash && (un->un_cflags & UN_CACHED)) {
		un->un_cflags &= ~UN_CACHED;
		LIST_REMOVE(un, un_cache);
	}
	mutex_enter(&un->un_lock);
	un->un_uppervp = uppervp;
	un->un_uppersz = VNOVAL;
	/*
	 * With the upper vnode in place unlock the union vnode to
	 * finalize the lock transfer.
	 */
	unlock_ap.a_desc = VDESC(vop_unlock);
	unlock_ap.a_vp = UNIONTOV(un);
	genfs_unlock(&unlock_ap);
	/* Update union vnode interlock, vmobjlock, & klist. */
	vshareilock(UNIONTOV(un), uppervp);
	rw_obj_hold(uppervp->v_uobj.vmobjlock);
	uvm_obj_setlock(&UNIONTOV(un)->v_uobj, uppervp->v_uobj.vmobjlock);
	vshareklist(UNIONTOV(un), uppervp);
	mutex_exit(&un->un_lock);
	if (ohash != nhash) {
		LIST_INSERT_HEAD(&uhashtbl[nhash], un, un_cache);
		un->un_cflags |= UN_CACHED;
	}

	mutex_exit(&uhash_lock);
}

/*
 * Keep track of size changes in the underlying vnodes.
 * If the size changes, then call back to the vm layer
 * giving priority to the upper layer size.
 *
 * Mutex un_lock held on entry and released on return.
 */
void
union_newsize(struct vnode *vp, off_t uppersz, off_t lowersz)
{
	struct union_node *un = VTOUNION(vp);
	off_t sz;

	KASSERT(mutex_owned(&un->un_lock));
	/* only interested in regular files */
	if (vp->v_type != VREG) {
		mutex_exit(&un->un_lock);
		uvm_vnp_setsize(vp, 0);
		return;
	}

	sz = VNOVAL;

	if ((uppersz != VNOVAL) && (un->un_uppersz != uppersz)) {
		un->un_uppersz = uppersz;
		if (sz == VNOVAL)
			sz = un->un_uppersz;
	}

	if ((lowersz != VNOVAL) && (un->un_lowersz != lowersz)) {
		un->un_lowersz = lowersz;
		if (sz == VNOVAL)
			sz = un->un_lowersz;
	}
	mutex_exit(&un->un_lock);

	if (sz != VNOVAL) {
#ifdef UNION_DIAGNOSTIC
		printf("union: %s size now %qd\n",
		    uppersz != VNOVAL ? "upper" : "lower", sz);
#endif
		uvm_vnp_setsize(vp, sz);
	}
}

static void
union_ref(struct union_node *un)
{

	KASSERT(mutex_owned(&uhash_lock));
	un->un_refs++;
}

static void
union_rele(struct union_node *un)
{

	mutex_enter(&uhash_lock);
	un->un_refs--;
	if (un->un_refs > 0) {
		mutex_exit(&uhash_lock);
		return;
	}
	if (un->un_cflags & UN_CACHED) {
		un->un_cflags &= ~UN_CACHED;
		LIST_REMOVE(un, un_cache);
	}
	mutex_exit(&uhash_lock);

	if (un->un_pvp != NULLVP)
		vrele(un->un_pvp);
	if (un->un_uppervp != NULLVP)
		vrele(un->un_uppervp);
	if (un->un_lowervp != NULLVP)
		vrele(un->un_lowervp);
	if (un->un_dirvp != NULLVP)
		vrele(un->un_dirvp);
	if (un->un_path)
		free(un->un_path, M_TEMP);
	mutex_destroy(&un->un_lock);

	free(un, M_TEMP);
}

/*
 * allocate a union_node/vnode pair.  the vnode is
 * referenced and unlocked.  the new vnode is returned
 * via (vpp).  (mp) is the mountpoint of the union filesystem,
 * (dvp) is the parent directory where the upper layer object
 * should exist (but doesn't) and (cnp) is the componentname
 * information which is partially copied to allow the upper
 * layer object to be created at a later time.  (uppervp)
 * and (lowervp) reference the upper and lower layer objects
 * being mapped.  either, but not both, can be nil.
 * both, if supplied, are unlocked.
 * the references are either maintained in the new union_node
 * object which is allocated, or they are vrele'd.
 *
 * all union_nodes are maintained on a hash
 * list.  new nodes are only allocated when they cannot
 * be found on this list.  entries on the list are
 * removed when the vfs reclaim entry is called.
 *
 * the vnode gets attached or referenced with vcache_get().
 */
int
union_allocvp(
	struct vnode **vpp,
	struct mount *mp,
	struct vnode *undvp,		/* parent union vnode */
	struct vnode *dvp,		/* may be null */
	struct componentname *cnp,	/* may be null */
	struct vnode *uppervp,		/* may be null */
	struct vnode *lowervp,		/* may be null */
	int docache)
{
	int error;
	struct union_node *un = NULL, *un1;
	struct vnode *vp, *xlowervp = NULLVP;
	u_long hash[3];
	int try;
	bool is_dotdot;

	is_dotdot = (dvp != NULL && cnp != NULL && (cnp->cn_flags & ISDOTDOT));

	if (uppervp == NULLVP && lowervp == NULLVP)
		panic("union: unidentifiable allocation");

	if (uppervp && lowervp && (uppervp->v_type != lowervp->v_type)) {
		xlowervp = lowervp;
		lowervp = NULLVP;
	}

	/*
	 * If both uppervp and lowervp are not NULL we have to
	 * search union nodes with one vnode as NULL too.
	 */
	hash[0] = UNION_HASH(uppervp, lowervp);
	if (uppervp == NULL || lowervp == NULL) {
		hash[1] = hash[2] = NOHASH;
	} else {
		hash[1] = UNION_HASH(uppervp, NULLVP);
		hash[2] = UNION_HASH(NULLVP, lowervp);
	}

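	/* Uncached nodes bypass the hash table entirely. */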
	if (!docache) {
		un = NULL;
		goto found;
	}

loop:
	mutex_enter(&uhash_lock);

	for (try = 0; try < 3; try++) {
		if (hash[try] == NOHASH)
			continue;
		LIST_FOREACH(un, &uhashtbl[hash[try]], un_cache) {
			if ((un->un_lowervp && un->un_lowervp != lowervp) ||
			    (un->un_uppervp && un->un_uppervp != uppervp) ||
			    un->un_mount != mp)
				continue;

			union_ref(un);
			mutex_exit(&uhash_lock);
			error = vcache_get(mp, &un, sizeof(un), &vp);
			KASSERT(error != 0 || UNIONTOV(un) == vp);
			union_rele(un);
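			/*
			 * ENOENT here means the vnode went away while
			 * we slept (presumably reclaimed); retry.
			 */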
			if (error == ENOENT)
				goto loop;
			else if (error)
				goto out;
			goto found;
		}
	}

	mutex_exit(&uhash_lock);

found:
	if (un) {
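		/*
		 * Lock the union vnode.  For a '..' lookup the locked
		 * parent is released first to keep the vnode locking
		 * order, then relocked afterwards.
		 */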
		if (uppervp != dvp) {
			if (is_dotdot)
				VOP_UNLOCK(dvp);
			vn_lock(UNIONTOV(un), LK_EXCLUSIVE | LK_RETRY);
			if (is_dotdot)
				vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
		}
		/*
		 * Save information about the upper layer.
		 */
		if (uppervp != un->un_uppervp) {
			union_newupper(un, uppervp);
		} else if (uppervp) {
			vrele(uppervp);
		}

		/*
		 * Save information about the lower layer.
		 * This needs to keep track of pathname
		 * and directory information which union_vn_create
		 * might need.
		 */
		if (lowervp != un->un_lowervp) {
			union_newlower(un, lowervp);
			if (cnp && (lowervp != NULLVP)) {
				un->un_path = malloc(cnp->cn_namelen+1,
						M_TEMP, M_WAITOK);
				memcpy(un->un_path, cnp->cn_nameptr,
						cnp->cn_namelen);
				un->un_path[cnp->cn_namelen] = '\0';
				vref(dvp);
				un->un_dirvp = dvp;
			}
		} else if (lowervp) {
			vrele(lowervp);
		}
		*vpp = UNIONTOV(un);
		if (uppervp != dvp)
			VOP_UNLOCK(*vpp);
		error = 0;
		goto out;
	}

	un = malloc(sizeof(struct union_node), M_TEMP, M_WAITOK);
	mutex_init(&un->un_lock, MUTEX_DEFAULT, IPL_NONE);
	un->un_refs = 1;
	un->un_mount = mp;
	un->un_vnode = NULL;
	un->un_uppervp = uppervp;
	un->un_lowervp = lowervp;
	un->un_pvp = undvp;
	if (undvp != NULLVP)
		vref(undvp);
	un->un_dircache = 0;
	un->un_openl = 0;
	un->un_cflags = 0;
	un->un_hooknode = false;

	un->un_uppersz = VNOVAL;
	un->un_lowersz = VNOVAL;

	if (dvp && cnp && (lowervp != NULLVP)) {
		un->un_path = malloc(cnp->cn_namelen+1, M_TEMP, M_WAITOK);
		memcpy(un->un_path, cnp->cn_nameptr, cnp->cn_namelen);
		un->un_path[cnp->cn_namelen] = '\0';
		vref(dvp);
		un->un_dirvp = dvp;
	} else {
		un->un_path = 0;
		un->un_dirvp = 0;
	}

	if (docache) {
		mutex_enter(&uhash_lock);
		LIST_FOREACH(un1, &uhashtbl[hash[0]], un_cache) {
			if (un1->un_lowervp == lowervp &&
			    un1->un_uppervp == uppervp &&
			    un1->un_mount == mp) {
				/*
				 * Another thread beat us, push back freshly
				 * allocated node and retry.
				 */
				mutex_exit(&uhash_lock);
				union_rele(un);
				goto loop;
			}
		}
		LIST_INSERT_HEAD(&uhashtbl[hash[0]], un, un_cache);
		un->un_cflags |= UN_CACHED;
		mutex_exit(&uhash_lock);
	}

	error = vcache_get(mp, &un, sizeof(un), vpp);
	KASSERT(error != 0 || UNIONTOV(un) == *vpp);
	union_rele(un);
	if (error == ENOENT)
		goto loop;

out:
	if (xlowervp)
		vrele(xlowervp);

	return error;
}

int
union_freevp(struct vnode *vp)
{
	struct union_node *un = VTOUNION(vp);

	/* Detach vnode from union node. */
	un->un_vnode = NULL;
	un->un_uppersz = VNOVAL;
	un->un_lowersz = VNOVAL;

	/* Detach union node from vnode. */
	mutex_enter(vp->v_interlock);
	vp->v_data = NULL;
	mutex_exit(vp->v_interlock);

	union_rele(un);

	return 0;
}

int
union_loadvnode(struct mount *mp, struct vnode *vp,
    const void *key, size_t key_len, const void **new_key)
{
	struct vattr va;
	struct vnode *svp;
	struct union_node *un;
	struct union_mount *um;
	voff_t uppersz, lowersz;

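	/* The vcache key is the address of the union node itself. */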
	KASSERT(key_len == sizeof(un));
	memcpy(&un, key, key_len);

	um = MOUNTTOUNIONMOUNT(mp);
	svp = (un->un_uppervp != NULLVP) ? un->un_uppervp : un->un_lowervp;

	vp->v_tag = VT_UNION;
	vp->v_op = union_vnodeop_p;
	vp->v_data = un;
	un->un_vnode = vp;

	vp->v_type = svp->v_type;
	if (svp->v_type == VCHR || svp->v_type == VBLK)
		spec_node_init(vp, svp->v_rdev);

	vshareilock(vp, svp);
	rw_obj_hold(svp->v_uobj.vmobjlock);
	uvm_obj_setlock(&vp->v_uobj, svp->v_uobj.vmobjlock);
	vshareklist(vp, svp);

	/* detect the root vnode (and aliases) */
	if ((un->un_uppervp == um->um_uppervp) &&
	    ((un->un_lowervp == NULLVP) || un->un_lowervp == um->um_lowervp)) {
		if (un->un_lowervp == NULLVP) {
			un->un_lowervp = um->um_lowervp;
			if (un->un_lowervp != NULLVP)
				vref(un->un_lowervp);
		}
		vp->v_vflag |= VV_ROOT;
	}

	uppersz = lowersz = VNOVAL;
	if (un->un_uppervp != NULLVP) {
		if (vn_lock(un->un_uppervp, LK_SHARED) == 0) {
			if (VOP_GETATTR(un->un_uppervp, &va, FSCRED) == 0)
				uppersz = va.va_size;
			VOP_UNLOCK(un->un_uppervp);
		}
	}
	if (un->un_lowervp != NULLVP) {
		if (vn_lock(un->un_lowervp, LK_SHARED) == 0) {
			if (VOP_GETATTR(un->un_lowervp, &va, FSCRED) == 0)
				lowersz = va.va_size;
			VOP_UNLOCK(un->un_lowervp);
		}
	}

	mutex_enter(&un->un_lock);
	union_newsize(vp, uppersz, lowersz);

	mutex_enter(&uhash_lock);
	union_ref(un);
	mutex_exit(&uhash_lock);

	*new_key = &vp->v_data;

	return 0;
}

/*
 * copyfile.  copy the vnode (fvp) to the vnode (tvp)
 * using a sequence of reads and writes.  both (fvp)
 * and (tvp) are locked on entry and exit.
 */
int
union_copyfile(struct vnode *fvp, struct vnode *tvp, kauth_cred_t cred,
	struct lwp *l)
{
	char *tbuf;
	struct uio uio;
	struct iovec iov;
	int error = 0;

	/*
	 * strategy:
	 * allocate a buffer of size MAXBSIZE.
	 * loop doing reads and writes, keeping track
	 * of the current uio offset.
	 * give up at the first sign of trouble.
	 */

	uio.uio_offset = 0;
	UIO_SETUP_SYSSPACE(&uio);

	tbuf = malloc(MAXBSIZE, M_TEMP, M_WAITOK);

	/* ugly loop follows... */
	do {
		off_t offset = uio.uio_offset;

		uio.uio_iov = &iov;
		uio.uio_iovcnt = 1;
		iov.iov_base = tbuf;
		iov.iov_len = MAXBSIZE;
		uio.uio_resid = iov.iov_len;
		uio.uio_rw = UIO_READ;
		error = VOP_READ(fvp, &uio, 0, cred);

		if (error == 0) {
			uio.uio_iov = &iov;
			uio.uio_iovcnt = 1;
			iov.iov_base = tbuf;
			iov.iov_len = MAXBSIZE - uio.uio_resid;
			uio.uio_offset = offset;
			uio.uio_rw = UIO_WRITE;
			uio.uio_resid = iov.iov_len;

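			/* A zero-length read means EOF; we are done. */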
			if (uio.uio_resid == 0)
				break;

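			/*
			 * VOP_WRITE() may be partial; loop until the
			 * whole chunk has been written.
			 */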
			do {
				error = VOP_WRITE(tvp, &uio, 0, cred);
			} while ((uio.uio_resid > 0) && (error == 0));
		}

	} while (error == 0);

	free(tbuf, M_TEMP);
	return (error);
}

/*
 * (un) is assumed to be locked on entry and remains
 * locked on exit.
 */
int
union_copyup(struct union_node *un, int docopy, kauth_cred_t cred,
	struct lwp *l)
{
	int error;
	struct vnode *lvp, *uvp;
	struct vattr lvattr, uvattr;

	error = union_vn_create(&uvp, un, l);
	if (error)
		return (error);

	union_newupper(un, uvp);

	lvp = un->un_lowervp;

	if (docopy) {
		/*
		 * XXX - should not ignore errors
		 * from VOP_CLOSE
		 */
		vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY);

		error = VOP_GETATTR(lvp, &lvattr, cred);
		if (error == 0)
			error = VOP_OPEN(lvp, FREAD, cred);
		if (error == 0) {
			error = union_copyfile(lvp, uvp, cred, l);
			(void) VOP_CLOSE(lvp, FREAD, cred);
		}
		if (error == 0) {
			/* Copy permissions up too */
			vattr_null(&uvattr);
			uvattr.va_mode = lvattr.va_mode;
			uvattr.va_flags = lvattr.va_flags;
			error = VOP_SETATTR(uvp, &uvattr, cred);
		}
		VOP_UNLOCK(lvp);
#ifdef UNION_DIAGNOSTIC
		if (error == 0)
			uprintf("union: copied up %s\n", un->un_path);
#endif

	}
	union_vn_close(uvp, FWRITE, cred, l);

	/*
	 * Subsequent IOs will go to the top layer, so
	 * call close on the lower vnode and open on the
	 * upper vnode to ensure that the filesystem keeps
	 * its reference counts right.  This doesn't do
	 * the right thing with (cred) and (FREAD) though.
	 * Ignoring error returns is not right, either.
	 */
	if (error == 0) {
		int i;

		vn_lock(lvp, LK_EXCLUSIVE | LK_RETRY);
		for (i = 0; i < un->un_openl; i++) {
			(void) VOP_CLOSE(lvp, FREAD, cred);
			(void) VOP_OPEN(uvp, FREAD, cred);
		}
		un->un_openl = 0;
		VOP_UNLOCK(lvp);
	}

	return (error);
}

/*
 * Prepare the creation of a new node in the upper layer.
 *
 * (dvp) is the directory in which to create the new node.
 * it is locked on entry and exit.
 * (cnp) is the componentname to be created.
 * (cred, path) are the credentials and path used to fill (cnp).
 */
static int
union_do_lookup(struct vnode *dvp, struct componentname *cnp, kauth_cred_t cred,
    const char *path)
{
	int error;
	struct vnode *vp;

	cnp->cn_nameiop = CREATE;
	cnp->cn_flags = LOCKPARENT | ISLASTCN;
	cnp->cn_cred = cred;
	cnp->cn_nameptr = path;
	cnp->cn_namelen = strlen(path);

	error = VOP_LOOKUP(dvp, &vp, cnp);

	if (error == 0) {
		KASSERT(vp != NULL);
		VOP_ABORTOP(dvp, cnp);
		vrele(vp);
		error = EEXIST;
	} else if (error == EJUSTRETURN) {
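		/*
		 * EJUSTRETURN: the name does not exist and may be
		 * created here, which is exactly what we want.
		 */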
		error = 0;
	}

	return error;
}

/*
 * Create a shadow directory in the upper layer.
 * The new vnode is returned locked.
 *
 * (um) points to the union mount structure for access to
 * the mounting process's credentials.
 * (dvp) is the directory in which to create the shadow directory.
 * it is unlocked on entry and exit.
 * (cnp) is the componentname to be created.
 * (vpp) is the returned newly created shadow directory, which
 * is returned locked.
 *
 * N.B. We still attempt to create shadow directories even if the union
 * is mounted read-only, which is a little nonintuitive.
 */
int
union_mkshadow(struct union_mount *um, struct vnode *dvp,
	struct componentname *cnp, struct vnode **vpp)
{
	int error;
	struct vattr va;
	struct componentname cn;
	char *pnbuf;

	if (cnp->cn_namelen + 1 > MAXPATHLEN)
		return ENAMETOOLONG;
	pnbuf = PNBUF_GET();
	memcpy(pnbuf, cnp->cn_nameptr, cnp->cn_namelen);
	pnbuf[cnp->cn_namelen] = '\0';

	vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);

	error = union_do_lookup(dvp, &cn,
	    (um->um_op == UNMNT_ABOVE ? cnp->cn_cred : um->um_cred), pnbuf);
	if (error) {
		VOP_UNLOCK(dvp);
		PNBUF_PUT(pnbuf);
		return error;
	}

	/*
	 * policy: when creating the shadow directory in the
	 * upper layer, create it owned by the user who did
	 * the mount, group from parent directory, and mode
	 * 777 modified by umask (ie mostly identical to the
	 * mkdir syscall).  (jsp, kb)
	 */

	vattr_null(&va);
	va.va_type = VDIR;
	va.va_mode = um->um_cmode;

	KASSERT(*vpp == NULL);
	error = VOP_MKDIR(dvp, vpp, &cn, &va);
	VOP_UNLOCK(dvp);
	PNBUF_PUT(pnbuf);
	return error;
}

/*
 * Create a whiteout entry in the upper layer.
 *
 * (um) points to the union mount structure for access to
 * the mounting process's credentials.
 * (dvp) is the directory in which to create the whiteout.
 * it is locked on entry and exit.
 * (cnp) is the componentname to be created.
 * (un) holds the path to be created.
 */
int
union_mkwhiteout(struct union_mount *um, struct vnode *dvp,
	struct componentname *cnp, struct union_node *un)
{
	int error;
	struct componentname cn;

	error = union_do_lookup(dvp, &cn,
	    (um->um_op == UNMNT_ABOVE ? cnp->cn_cred : um->um_cred),
	    un->un_path);
	if (error)
		return error;

	error = VOP_WHITEOUT(dvp, &cn, CREATE);
	return error;
}

/*
 * union_vn_create: creates and opens a new shadow file
 * on the upper union layer.  this function is similar
 * in spirit to calling vn_open but it avoids calling namei().
 * the problem with calling namei is that a) it locks too many
 * things, and b) it doesn't start at the "right" directory,
 * whereas union_do_lookup is told where to start.
 */
int
union_vn_create(struct vnode **vpp, struct union_node *un, struct lwp *l)
{
	struct vnode *vp;
	kauth_cred_t cred = l->l_cred;
	struct vattr vat;
	struct vattr *vap = &vat;
	int fmode = FFLAGS(O_WRONLY|O_CREAT|O_TRUNC|O_EXCL);
	int error;
	int cmode = UN_FILEMODE & ~l->l_proc->p_cwdi->cwdi_cmask;
	struct componentname cn;

	*vpp = NULLVP;

	vn_lock(un->un_dirvp, LK_EXCLUSIVE | LK_RETRY);

	error = union_do_lookup(un->un_dirvp, &cn, l->l_cred,
	    un->un_path);
	if (error) {
		VOP_UNLOCK(un->un_dirvp);
		return error;
	}

	/*
	 * Good - there was no race to create the file
	 * so go ahead and create it.  The permissions
	 * on the file will be 0666 modified by the
	 * current user's umask.  Access to the file, while
	 * it is unioned, will require access to the top *and*
	 * bottom files.  Access when not unioned will simply
	 * require access to the top-level file.
	 * TODO: confirm choice of access permissions.
	 */
	vattr_null(vap);
	vap->va_type = VREG;
	vap->va_mode = cmode;
	vp = NULL;
	error = VOP_CREATE(un->un_dirvp, &vp, &cn, vap);
	if (error) {
		VOP_UNLOCK(un->un_dirvp);
		return error;
	}

	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	VOP_UNLOCK(un->un_dirvp);
	error = VOP_OPEN(vp, fmode, cred);
	if (error) {
		vput(vp);
		return error;
	}

	vp->v_writecount++;
	VOP_UNLOCK(vp);
	*vpp = vp;
	return 0;
}

int
union_vn_close(struct vnode *vp, int fmode, kauth_cred_t cred, struct lwp *l)
{

	if (fmode & FWRITE)
		--vp->v_writecount;
	return (VOP_CLOSE(vp, fmode, cred));
}

void
union_removed_upper(struct union_node *un)
{
	struct vnode *vp = UNIONTOV(un);

	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
#if 1
	/*
	 * We do not set the uppervp to NULLVP here, because lowervp
	 * may also be NULLVP, so this routine would end up creating
	 * a bogus union node with no upper or lower VP (that causes
	 * pain in many places that assume at least one VP exists).
	 * Since we've removed this node from the cache hash chains,
	 * it won't be found again.  When all current holders
	 * release it, union_inactive() will vgone() it.
	 */
	union_diruncache(un);
#else
	union_newupper(un, NULLVP);
#endif

	VOP_UNLOCK(vp);

	mutex_enter(&uhash_lock);
	if (un->un_cflags & UN_CACHED) {
		un->un_cflags &= ~UN_CACHED;
		LIST_REMOVE(un, un_cache);
	}
	mutex_exit(&uhash_lock);
}

#if 0
struct vnode *
union_lowervp(struct vnode *vp)
{
	struct union_node *un = VTOUNION(vp);

	if ((un->un_lowervp != NULLVP) &&
	    (vp->v_type == un->un_lowervp->v_type)) {
		if (vget(un->un_lowervp, 0, true /* wait */) == 0)
			return (un->un_lowervp);
	}

	return (NULLVP);
}
#endif

/*
 * determine whether a whiteout is needed
 * during a remove/rmdir operation.
 */
int
union_dowhiteout(struct union_node *un, kauth_cred_t cred)
{
	struct vattr va;

	if (un->un_lowervp != NULLVP)
		return (1);

	if (VOP_GETATTR(un->un_uppervp, &va, cred) == 0 &&
	    (va.va_flags & OPAQUE))
		return (1);

	return (0);
}

static void
union_dircache_r(struct vnode *vp, struct vnode ***vppp, int *cntp)
{
	struct union_node *un;

	if (vp->v_op != union_vnodeop_p) {
		if (vppp) {
			vref(vp);
			*(*vppp)++ = vp;
			if (--(*cntp) == 0)
				panic("union: dircache table too small");
		} else {
			(*cntp)++;
		}

		return;
	}

	un = VTOUNION(vp);
	if (un->un_uppervp != NULLVP)
		union_dircache_r(un->un_uppervp, vppp, cntp);
	if (un->un_lowervp != NULLVP)
		union_dircache_r(un->un_lowervp, vppp, cntp);
}

struct vnode *
union_dircache(struct vnode *vp, struct lwp *l)
{
	int cnt;
	struct vnode *nvp = NULLVP;
	struct vnode **vpp;
	struct vnode **dircache;
	int error;

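	/*
	 * un_dircache is a NULLVP-terminated array with a referenced
	 * vnode for each constituent layer, gathered recursively by
	 * union_dircache_r().
	 */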
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	dircache = VTOUNION(vp)->un_dircache;

	nvp = NULLVP;

	if (dircache == 0) {
		cnt = 0;
		union_dircache_r(vp, 0, &cnt);
		cnt++;
		dircache = (struct vnode **)
				malloc(cnt * sizeof(struct vnode *),
					M_TEMP, M_WAITOK);
		vpp = dircache;
		union_dircache_r(vp, &vpp, &cnt);
		VTOUNION(vp)->un_dircache = dircache;
		*vpp = NULLVP;
		vpp = dircache + 1;
	} else {
		vpp = dircache;
		do {
			if (*vpp++ == VTOUNION(vp)->un_lowervp)
				break;
		} while (*vpp != NULLVP);
	}

	if (*vpp == NULLVP)
		goto out;

	vref(*vpp);
	error = union_allocvp(&nvp, vp->v_mount, NULLVP, NULLVP, 0,
	    NULLVP, *vpp, 0);
	if (!error) {
		vn_lock(nvp, LK_EXCLUSIVE | LK_RETRY);
		VTOUNION(vp)->un_dircache = 0;
		VTOUNION(nvp)->un_hooknode = true;
		VTOUNION(nvp)->un_dircache = dircache;
	}

out:
	VOP_UNLOCK(vp);
	return (nvp);
}

void
union_diruncache(struct union_node *un)
{
	struct vnode **vpp;

	KASSERT(VOP_ISLOCKED(UNIONTOV(un)) == LK_EXCLUSIVE);
	if (un->un_dircache != 0) {
		for (vpp = un->un_dircache; *vpp != NULLVP; vpp++)
			vrele(*vpp);
		free(un->un_dircache, M_TEMP);
		un->un_dircache = 0;
	}
}

/*
 * Check whether the node can be rmdir'ed, i.e. whether the lower
 * directory is effectively empty (an opaque upper layer hides the
 * lower layer entirely).
 */
int
union_check_rmdir(struct union_node *un, kauth_cred_t cred)
{
	int dirlen, eofflag, error;
	char *dirbuf;
	struct vattr va;
	struct vnode *tvp;
	struct dirent *dp, *edp;
	struct componentname cn;
	struct iovec aiov;
	struct uio auio;

	KASSERT(un->un_uppervp != NULL);

	/* Check upper for being opaque. */
	KASSERT(VOP_ISLOCKED(un->un_uppervp));
	error = VOP_GETATTR(un->un_uppervp, &va, cred);
	if (error || (va.va_flags & OPAQUE))
		return error;

	if (un->un_lowervp == NULL)
		return 0;

	/* Check lower for being empty. */
	vn_lock(un->un_lowervp, LK_SHARED | LK_RETRY);
	error = VOP_GETATTR(un->un_lowervp, &va, cred);
	if (error) {
		VOP_UNLOCK(un->un_lowervp);
		return error;
	}
	dirlen = va.va_blocksize;
	dirbuf = kmem_alloc(dirlen, KM_SLEEP);
	/* error = 0; */
	eofflag = 0;
	auio.uio_offset = 0;
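	/*
	 * Scan the lower directory: any entry other than '.', '..',
	 * a whiteout, or a name whited out in the upper layer makes
	 * the directory non-empty.
	 */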
	do {
		aiov.iov_len = dirlen;
		aiov.iov_base = dirbuf;
		auio.uio_iov = &aiov;
		auio.uio_iovcnt = 1;
		auio.uio_resid = aiov.iov_len;
		auio.uio_rw = UIO_READ;
		UIO_SETUP_SYSSPACE(&auio);
		error = VOP_READDIR(un->un_lowervp, &auio, cred, &eofflag,
		    NULL, NULL);
		if (error)
			break;
		edp = (struct dirent *)&dirbuf[dirlen - auio.uio_resid];
		for (dp = (struct dirent *)dirbuf;
		    error == 0 && dp < edp;
		    dp = (struct dirent *)((char *)dp + dp->d_reclen)) {
			if (dp->d_reclen == 0) {
				error = ENOTEMPTY;
				break;
			}
			if (dp->d_type == DT_WHT ||
			    (dp->d_namlen == 1 && dp->d_name[0] == '.') ||
			    (dp->d_namlen == 2 && !memcmp(dp->d_name, "..", 2)))
				continue;
			/* Check for presence in the upper layer. */
			cn.cn_nameiop = LOOKUP;
			cn.cn_flags = ISLASTCN | RDONLY;
			cn.cn_cred = cred;
			cn.cn_nameptr = dp->d_name;
			cn.cn_namelen = dp->d_namlen;
			error = VOP_LOOKUP(un->un_uppervp, &tvp, &cn);
			if (error == ENOENT && (cn.cn_flags & ISWHITEOUT)) {
				error = 0;
				continue;
			}
			if (error == 0)
				vrele(tvp);
			error = ENOTEMPTY;
		}
	} while (error == 0 && !eofflag);
	kmem_free(dirbuf, dirlen);
	VOP_UNLOCK(un->un_lowervp);

	return error;
}

/*
 * This hook is called from vn_readdir() to switch to the lower
 * directory after the upper directory has been read.
 */
int
union_readdirhook(struct vnode **vpp, struct file *fp, struct lwp *l)
{
	struct vnode *vp = *vpp, *lvp;
	struct vattr va;
	int error;

	if (vp->v_op != union_vnodeop_p)
		return (0);

	/*
	 * If the directory is opaque,
	 * then don't show lower entries
	 */
	vn_lock(vp, LK_SHARED | LK_RETRY);
	error = VOP_GETATTR(vp, &va, fp->f_cred);
	VOP_UNLOCK(vp);
	if (error || (va.va_flags & OPAQUE))
		return error;

	if ((lvp = union_dircache(vp, l)) == NULLVP)
		return (0);

	error = VOP_OPEN(lvp, FREAD, fp->f_cred);
	if (error) {
		vput(lvp);
		return (error);
	}
	VOP_UNLOCK(lvp);
	fp->f_vnode = lvp;
	fp->f_offset = 0;
	error = vn_close(vp, FREAD, fp->f_cred);
	if (error)
		return (error);
	*vpp = lvp;
	return (0);
}