/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1994 Jan-Simon Pendry
 * Copyright (c) 1994
 *	The Regents of the University of California.  All rights reserved.
 * Copyright (c) 2005, 2006, 2012 Masanori Ozawa <ozawa@ongs.co.jp>, ONGS Inc.
 * Copyright (c) 2006, 2012 Daichi Goto <daichi@freebsd.org>
 *
 * This code is derived from software contributed to Berkeley by
 * Jan-Simon Pendry.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/dirent.h>
#include <sys/fcntl.h>
#include <sys/filedesc.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/taskqueue.h>
#include <sys/resourcevar.h>

#include <machine/atomic.h>

#include <security/mac/mac_framework.h>

#include <vm/uma.h>

#include <fs/unionfs/union.h>

#define NUNIONFSNODECACHE 16
#define UNIONFSHASHMASK (NUNIONFSNODECACHE - 1)

static MALLOC_DEFINE(M_UNIONFSHASH, "UNIONFS hash", "UNIONFS hash table");
MALLOC_DEFINE(M_UNIONFSNODE, "UNIONFS node", "UNIONFS vnode private part");
MALLOC_DEFINE(M_UNIONFSPATH, "UNIONFS path", "UNIONFS path private part");

static struct task unionfs_deferred_rele_task;
static struct mtx unionfs_deferred_rele_lock;
static STAILQ_HEAD(, unionfs_node) unionfs_deferred_rele_list =
    STAILQ_HEAD_INITIALIZER(unionfs_deferred_rele_list);
static TASKQUEUE_DEFINE_THREAD(unionfs_rele);

unsigned int unionfs_ndeferred = 0;
SYSCTL_UINT(_vfs, OID_AUTO, unionfs_ndeferred, CTLFLAG_RD,
    &unionfs_ndeferred, 0, "unionfs deferred vnode release");

static void unionfs_deferred_rele(void *, int);

/*
 * Initialize
 */
int
unionfs_init(struct vfsconf *vfsp)
{
	UNIONFSDEBUG("unionfs_init\n");	/* printed during system boot */
	TASK_INIT(&unionfs_deferred_rele_task, 0, unionfs_deferred_rele, NULL);
	mtx_init(&unionfs_deferred_rele_lock, "uniondefr", NULL, MTX_DEF);
	return (0);
}

/*
 * Uninitialize
 */
int
unionfs_uninit(struct vfsconf *vfsp)
{
	taskqueue_quiesce(taskqueue_unionfs_rele);
	taskqueue_free(taskqueue_unionfs_rele);
	mtx_destroy(&unionfs_deferred_rele_lock);
	return (0);
}

static void
unionfs_deferred_rele(void *arg __unused, int pending __unused)
{
	STAILQ_HEAD(, unionfs_node) local_rele_list;
	struct unionfs_node *unp, *tunp;
	unsigned int ndeferred;

	ndeferred = 0;
	STAILQ_INIT(&local_rele_list);
	mtx_lock(&unionfs_deferred_rele_lock);
	STAILQ_CONCAT(&local_rele_list, &unionfs_deferred_rele_list);
	mtx_unlock(&unionfs_deferred_rele_lock);
	STAILQ_FOREACH_SAFE(unp, &local_rele_list, un_rele, tunp) {
		++ndeferred;
		MPASS(unp->un_dvp != NULL);
		vrele(unp->un_dvp);
		free(unp, M_UNIONFSNODE);
	}

	/* We expect this function to be single-threaded, thus no atomic */
	unionfs_ndeferred += ndeferred;
}

static struct unionfs_node_hashhead *
unionfs_get_hashhead(struct vnode *dvp, struct vnode *lookup)
{
	struct unionfs_node *unp;

	unp = VTOUNIONFS(dvp);

	return (&(unp->un_hashtbl[vfs_hash_index(lookup) & UNIONFSHASHMASK]));
}

/*
 * Attempt to lookup a cached unionfs vnode by upper/lower vp
 * from dvp, with dvp's interlock held.
 */
static struct vnode *
unionfs_get_cached_vnode_locked(struct vnode *lookup, struct vnode *dvp)
{
	struct unionfs_node *unp;
	struct unionfs_node_hashhead *hd;
	struct vnode *vp;

	hd = unionfs_get_hashhead(dvp, lookup);

	LIST_FOREACH(unp, hd, un_hash) {
		if (unp->un_uppervp == lookup ||
		    unp->un_lowervp == lookup) {
			vp = UNIONFSTOV(unp);
			VI_LOCK_FLAGS(vp, MTX_DUPOK);
			vp->v_iflag &= ~VI_OWEINACT;
			if (VN_IS_DOOMED(vp) ||
			    ((vp->v_iflag & VI_DOINGINACT) != 0)) {
				VI_UNLOCK(vp);
				vp = NULLVP;
			} else {
				vrefl(vp);
				VI_UNLOCK(vp);
			}
			return (vp);
		}
	}

	return (NULLVP);
}

/*
 * Get the cached vnode.
 */
static struct vnode *
unionfs_get_cached_vnode(struct vnode *uvp, struct vnode *lvp,
    struct vnode *dvp)
{
	struct vnode *vp;

	vp = NULLVP;
	VI_LOCK(dvp);
	if (uvp != NULLVP)
		vp = unionfs_get_cached_vnode_locked(uvp, dvp);
	else if (lvp != NULLVP)
		vp = unionfs_get_cached_vnode_locked(lvp, dvp);
	VI_UNLOCK(dvp);

	return (vp);
}

/*
 * Add the new unionfs vnode to the cache.
 */
static struct vnode *
unionfs_ins_cached_vnode(struct unionfs_node *uncp,
    struct vnode *dvp)
{
	struct unionfs_node_hashhead *hd;
	struct vnode *vp;

	ASSERT_VOP_ELOCKED(uncp->un_uppervp, __func__);
	ASSERT_VOP_ELOCKED(uncp->un_lowervp, __func__);
	KASSERT(uncp->un_uppervp == NULLVP || uncp->un_uppervp->v_type == VDIR,
	    ("%s: v_type != VDIR", __func__));
	KASSERT(uncp->un_lowervp == NULLVP || uncp->un_lowervp->v_type == VDIR,
	    ("%s: v_type != VDIR", __func__));

	vp = NULLVP;
	VI_LOCK(dvp);
	if (uncp->un_uppervp != NULL)
		vp = unionfs_get_cached_vnode_locked(uncp->un_uppervp, dvp);
	else if (uncp->un_lowervp != NULL)
		vp = unionfs_get_cached_vnode_locked(uncp->un_lowervp, dvp);
	if (vp == NULLVP) {
		hd = unionfs_get_hashhead(dvp, (uncp->un_uppervp != NULLVP ?
		    uncp->un_uppervp : uncp->un_lowervp));
		LIST_INSERT_HEAD(hd, uncp, un_hash);
	}
	VI_UNLOCK(dvp);

	return (vp);
}

/*
 * Remove the unionfs node from the cache.
 */
static void
unionfs_rem_cached_vnode(struct unionfs_node *unp, struct vnode *dvp)
{
	KASSERT(unp != NULL, ("%s: null node", __func__));
	KASSERT(dvp != NULLVP,
	    ("%s: null parent vnode", __func__));

	VI_LOCK(dvp);
	if (unp->un_hash.le_prev != NULL) {
		LIST_REMOVE(unp, un_hash);
		unp->un_hash.le_next = NULL;
		unp->un_hash.le_prev = NULL;
	}
	VI_UNLOCK(dvp);
}

/*
 * Common cleanup handling for unionfs_nodeget
 * Upper, lower, and parent directory vnodes are expected to be referenced by
 * the caller.  Upper and lower vnodes, if non-NULL, are also expected to be
 * exclusively locked by the caller.
 * This function will return with the caller's locks and references undone.
 */
static void
unionfs_nodeget_cleanup(struct vnode *vp, struct unionfs_node *unp)
{

	/*
	 * Lock and reset the default vnode lock; vgone() expects a locked
	 * vnode, and we're going to reset the vnode ops.
	 */
	lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL);

	/*
	 * Clear out private data and reset the vnode ops to avoid use of
	 * unionfs vnode ops on a partially constructed vnode.
	 */
	VI_LOCK(vp);
	vp->v_data = NULL;
	vp->v_vnlock = &vp->v_lock;
	vp->v_op = &dead_vnodeops;
	VI_UNLOCK(vp);
	vgone(vp);
	vput(vp);

	if (unp->un_dvp != NULLVP)
		vrele(unp->un_dvp);
	if (unp->un_uppervp != NULLVP)
		vput(unp->un_uppervp);
	if (unp->un_lowervp != NULLVP)
		vput(unp->un_lowervp);
	if (unp->un_hashtbl != NULL)
		hashdestroy(unp->un_hashtbl, M_UNIONFSHASH, UNIONFSHASHMASK);
	free(unp->un_path, M_UNIONFSPATH);
	free(unp, M_UNIONFSNODE);
}

/*
 * Make a new unionfs node or get an existing cached one.
 *
 * uppervp and lowervp should be unlocked on entry.  The new unionfs vnode
 * shares its lock with uppervp or lowervp, so locking it locks that layer
 * vnode as well; to prevent deadlock, the caller must not hold several of
 * these locks simultaneously.
 */
int
unionfs_nodeget(struct mount *mp, struct vnode *uppervp,
    struct vnode *lowervp, struct vnode *dvp, struct vnode **vpp,
    struct componentname *cnp)
{
	char	       *path;
	struct unionfs_mount *ump;
	struct unionfs_node *unp;
	struct vnode   *vp;
	u_long		hashmask;
	int		error;
	int		lkflags;
	__enum_uint8(vtype)	vt;

	error = 0;
	ump = MOUNTTOUNIONFSMOUNT(mp);
	lkflags = (cnp ? cnp->cn_lkflags : 0);
	path = (cnp ? cnp->cn_nameptr : NULL);
	*vpp = NULLVP;

	if (uppervp == NULLVP && lowervp == NULLVP)
		panic("%s: upper and lower is null", __func__);

	vt = (uppervp != NULLVP ? uppervp->v_type : lowervp->v_type);

	/* If it has no ISLASTCN flag, path check is skipped. */
	if (cnp && !(cnp->cn_flags & ISLASTCN))
		path = NULL;

	/* check the cache */
	if (dvp != NULLVP && vt == VDIR) {
		vp = unionfs_get_cached_vnode(uppervp, lowervp, dvp);
		if (vp != NULLVP) {
			*vpp = vp;
			goto unionfs_nodeget_out;
		}
	}

	unp = malloc(sizeof(struct unionfs_node),
	    M_UNIONFSNODE, M_WAITOK | M_ZERO);

	error = getnewvnode("unionfs", mp, &unionfs_vnodeops, &vp);
	if (error != 0) {
		free(unp, M_UNIONFSNODE);
		return (error);
	}
	if (dvp != NULLVP)
		vref(dvp);
	if (uppervp != NULLVP)
		vref(uppervp);
	if (lowervp != NULLVP)
		vref(lowervp);

	if (vt == VDIR) {
		unp->un_hashtbl = hashinit(NUNIONFSNODECACHE, M_UNIONFSHASH,
		    &hashmask);
		KASSERT(hashmask == UNIONFSHASHMASK,
		    ("unexpected unionfs hash mask 0x%lx", hashmask));
	}

	unp->un_vnode = vp;
	unp->un_uppervp = uppervp;
	unp->un_lowervp = lowervp;
	unp->un_dvp = dvp;
	if (uppervp != NULLVP)
		vp->v_vnlock = uppervp->v_vnlock;
	else
		vp->v_vnlock = lowervp->v_vnlock;

	if (path != NULL) {
		unp->un_path = malloc(cnp->cn_namelen + 1,
		    M_UNIONFSPATH, M_WAITOK | M_ZERO);
		bcopy(cnp->cn_nameptr, unp->un_path, cnp->cn_namelen);
		unp->un_path[cnp->cn_namelen] = '\0';
		unp->un_pathlen = cnp->cn_namelen;
	}
	vp->v_type = vt;
	vp->v_data = unp;

	/*
	 * TODO: This is an imperfect check, as there's no guarantee that
	 * the underlying filesystems will always return vnode pointers
	 * for the root inodes that match our cached values.  To reduce
	 * the likelihood of failure, for example in the case where either
	 * vnode has been forcibly doomed, we check both pointers and set
	 * VV_ROOT if either matches.
	 */
	if (ump->um_uppervp == uppervp || ump->um_lowervp == lowervp)
		vp->v_vflag |= VV_ROOT;
	KASSERT(dvp != NULL || (vp->v_vflag & VV_ROOT) != 0,
	    ("%s: NULL dvp for non-root vp %p", __func__, vp));

	vn_lock_pair(lowervp, false, LK_EXCLUSIVE, uppervp, false,
	    LK_EXCLUSIVE);
	error = insmntque1(vp, mp);
	if (error != 0) {
		unionfs_nodeget_cleanup(vp, unp);
		return (error);
	}
	if (lowervp != NULL && VN_IS_DOOMED(lowervp)) {
		vput(lowervp);
		unp->un_lowervp = lowervp = NULL;
	}
	if (uppervp != NULL && VN_IS_DOOMED(uppervp)) {
		vput(uppervp);
		unp->un_uppervp = uppervp = NULL;
		if (lowervp != NULLVP)
			vp->v_vnlock = lowervp->v_vnlock;
	}
	if (lowervp == NULL && uppervp == NULL) {
		unionfs_nodeget_cleanup(vp, unp);
		return (ENOENT);
	}

	vn_set_state(vp, VSTATE_CONSTRUCTED);

	if (dvp != NULLVP && vt == VDIR)
		*vpp = unionfs_ins_cached_vnode(unp, dvp);
	if (*vpp != NULLVP) {
		unionfs_nodeget_cleanup(vp, unp);
		vp = *vpp;
	} else {
		if (uppervp != NULL)
			VOP_UNLOCK(uppervp);
		if (lowervp != NULL)
			VOP_UNLOCK(lowervp);
		*vpp = vp;
	}

unionfs_nodeget_out:
	if (lkflags & LK_TYPE_MASK)
		vn_lock(vp, lkflags | LK_RETRY);

	return (0);
}
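
/*
 * Illustrative sketch (not part of the build): how a lookup-style caller
 * is expected to use unionfs_nodeget().  The variable names here are
 * hypothetical; only the unionfs_nodeget() contract described above is
 * taken from this file.
 *
 *	struct vnode *uvp, *lvp, *vp;
 *	int error;
 *
 *	// uvp/lvp: referenced but *unlocked* results of the upper/lower
 *	// layer lookups; either may be NULLVP, but not both.
 *	error = unionfs_nodeget(mp, uvp, lvp, dvp, &vp, cnp);
 *	if (error == 0) {
 *		// vp is referenced, and locked if cnp->cn_lkflags
 *		// requested a lock type.  The caller's references on
 *		// uvp/lvp remain owned by the caller, since
 *		// unionfs_nodeget() takes its own.
 *		*vpp = vp;
 *	}
 */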

/*
 * Clean up the unionfs node.
 */
void
unionfs_noderem(struct vnode *vp)
{
	struct unionfs_node *unp, *unp_t1, *unp_t2;
	struct unionfs_node_hashhead *hd;
	struct unionfs_node_status *unsp, *unsp_tmp;
	struct vnode   *lvp;
	struct vnode   *uvp;
	struct vnode   *dvp;
	int		count;
	int		writerefs;

	/*
	 * The root vnode lock may be recursed during unmount, because
	 * it may share the same lock as the unionfs mount's covered vnode,
	 * which is locked across VFS_UNMOUNT().  This lock will then be
	 * recursively taken during the vflush() issued by unionfs_unmount().
	 * But we still only need to lock the unionfs lock once, because only
	 * one of those lock operations was taken against a unionfs vnode and
	 * will be undone against a unionfs vnode.
	 */
	KASSERT(vp->v_vnlock->lk_recurse == 0 || (vp->v_vflag & VV_ROOT) != 0,
	    ("%s: vnode %p locked recursively", __func__, vp));
	if (lockmgr(&vp->v_lock, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
		panic("%s: failed to acquire lock for vnode lock", __func__);

	/*
	 * Use the interlock to protect the clearing of v_data to
	 * prevent faults in unionfs_lock().
	 */
	VI_LOCK(vp);
	unp = VTOUNIONFS(vp);
	lvp = unp->un_lowervp;
	uvp = unp->un_uppervp;
	dvp = unp->un_dvp;
	unp->un_lowervp = unp->un_uppervp = NULLVP;
	vp->v_vnlock = &(vp->v_lock);
	vp->v_data = NULL;
	vp->v_object = NULL;
	if (unp->un_hashtbl != NULL) {
		/*
		 * Clear out any cached child vnodes.  This should only
		 * be necessary during forced unmount, when the vnode may
		 * be reclaimed with a non-zero use count.  Otherwise the
		 * reference held by each child should prevent reclamation.
		 */
		for (count = 0; count <= UNIONFSHASHMASK; count++) {
			hd = unp->un_hashtbl + count;
			LIST_FOREACH_SAFE(unp_t1, hd, un_hash, unp_t2) {
				LIST_REMOVE(unp_t1, un_hash);
				unp_t1->un_hash.le_next = NULL;
				unp_t1->un_hash.le_prev = NULL;
			}
		}
	}
	VI_UNLOCK(vp);

	writerefs = atomic_load_int(&vp->v_writecount);
	VNASSERT(writerefs >= 0, vp,
	    ("%s: write count %d, unexpected text ref", __func__, writerefs));
	/*
	 * If we were opened for write, we leased the write reference
	 * to the lower vnode.  If this is a reclamation due to the
	 * forced unmount, undo the reference now.
	 */
	if (writerefs > 0) {
		VNASSERT(uvp != NULL, vp,
		    ("%s: write reference without upper vnode", __func__));
		VOP_ADD_WRITECOUNT(uvp, -writerefs);
	}
	if (lvp != NULLVP)
		VOP_UNLOCK(lvp);
	if (uvp != NULLVP)
		VOP_UNLOCK(uvp);

	if (dvp != NULLVP)
		unionfs_rem_cached_vnode(unp, dvp);

	if (lvp != NULLVP)
		vrele(lvp);
	if (uvp != NULLVP)
		vrele(uvp);
	if (unp->un_path != NULL) {
		free(unp->un_path, M_UNIONFSPATH);
		unp->un_path = NULL;
		unp->un_pathlen = 0;
	}

	if (unp->un_hashtbl != NULL) {
		hashdestroy(unp->un_hashtbl, M_UNIONFSHASH, UNIONFSHASHMASK);
	}

	LIST_FOREACH_SAFE(unsp, &(unp->un_unshead), uns_list, unsp_tmp) {
		LIST_REMOVE(unsp, uns_list);
		free(unsp, M_TEMP);
	}
	if (dvp != NULLVP) {
		mtx_lock(&unionfs_deferred_rele_lock);
		STAILQ_INSERT_TAIL(&unionfs_deferred_rele_list, unp, un_rele);
		mtx_unlock(&unionfs_deferred_rele_lock);
		taskqueue_enqueue(taskqueue_unionfs_rele,
		    &unionfs_deferred_rele_task);
	} else
		free(unp, M_UNIONFSNODE);
}

/*
 * Get the unionfs node status object for the vnode corresponding to unp,
 * for the process that owns td.  Allocate a new status object if one
 * does not already exist.
 */
void
unionfs_get_node_status(struct unionfs_node *unp, struct thread *td,
    struct unionfs_node_status **unspp)
{
	struct unionfs_node_status *unsp;
	pid_t pid;

	pid = td->td_proc->p_pid;

	KASSERT(NULL != unspp, ("%s: NULL status", __func__));
	ASSERT_VOP_ELOCKED(UNIONFSTOV(unp), __func__);

	LIST_FOREACH(unsp, &(unp->un_unshead), uns_list) {
		if (unsp->uns_pid == pid) {
			*unspp = unsp;
			return;
		}
	}

	/* create a new unionfs node status */
	unsp = malloc(sizeof(struct unionfs_node_status),
	    M_TEMP, M_WAITOK | M_ZERO);

	unsp->uns_pid = pid;
	LIST_INSERT_HEAD(&(unp->un_unshead), unsp, uns_list);

	*unspp = unsp;
}

/*
 * Remove the unionfs node status if it is no longer in use.
 * The unionfs vnode must be exclusively locked.
 */
void
unionfs_tryrem_node_status(struct unionfs_node *unp,
    struct unionfs_node_status *unsp)
{
	KASSERT(NULL != unsp, ("%s: NULL status", __func__));
	ASSERT_VOP_ELOCKED(UNIONFSTOV(unp), __func__);

	if (0 < unsp->uns_lower_opencnt || 0 < unsp->uns_upper_opencnt)
		return;

	LIST_REMOVE(unsp, uns_list);
	free(unsp, M_TEMP);
}
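
/*
 * Illustrative sketch (not part of the build): the intended pairing of
 * unionfs_get_node_status() and unionfs_tryrem_node_status() around a
 * per-process open-count update, e.g. in a close path.  Everything other
 * than the two helpers and the uns_* fields is hypothetical.
 *
 *	struct unionfs_node_status *unsp;
 *
 *	// unionfs vnode exclusively locked
 *	unionfs_get_node_status(unp, td, &unsp);	// find or allocate
 *	unsp->uns_upper_opencnt--;			// e.g. on close
 *	unionfs_tryrem_node_status(unp, unsp);		// frees if unused
 *	// unsp must not be dereferenced after the tryrem call
 */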

/*
 * Build the attributes for a new upper-layer node from the lower node's
 * attributes, according to the mount's copy mode.
 */
void
unionfs_create_uppervattr_core(struct unionfs_mount *ump, struct vattr *lva,
    struct vattr *uva, struct thread *td)
{
	VATTR_NULL(uva);
	uva->va_type = lva->va_type;
	uva->va_atime = lva->va_atime;
	uva->va_mtime = lva->va_mtime;
	uva->va_ctime = lva->va_ctime;

	switch (ump->um_copymode) {
	case UNIONFS_TRANSPARENT:
		uva->va_mode = lva->va_mode;
		uva->va_uid = lva->va_uid;
		uva->va_gid = lva->va_gid;
		break;
	case UNIONFS_MASQUERADE:
		if (ump->um_uid == lva->va_uid) {
			uva->va_mode = lva->va_mode & 077077;
			uva->va_mode |= (lva->va_type == VDIR ?
			    ump->um_udir : ump->um_ufile) & 0700;
			uva->va_uid = lva->va_uid;
			uva->va_gid = lva->va_gid;
		} else {
			uva->va_mode = (lva->va_type == VDIR ?
			    ump->um_udir : ump->um_ufile);
			uva->va_uid = ump->um_uid;
			uva->va_gid = ump->um_gid;
		}
		break;
	default:		/* UNIONFS_TRADITIONAL */
		uva->va_mode = 0777 & ~td->td_proc->p_pd->pd_cmask;
		uva->va_uid = ump->um_uid;
		uva->va_gid = ump->um_gid;
		break;
	}
}
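
/*
 * Worked example for the UNIONFS_MASQUERADE case above (illustrative
 * numbers only): for a lower directory with mode 0777 whose owner matches
 * um_uid, and um_udir = 0500, the upper directory's mode becomes
 *
 *	(0777 & 077077) | (0500 & 0700) = 0077 | 0500 = 0577
 *
 * i.e. the owner bits always come from um_udir/um_ufile, while the
 * group/other (and setuid/setgid/sticky) bits are preserved from the
 * lower node.
 */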

/*
 * Fetch the lower node's attributes and build the corresponding upper
 * node attributes.
 */
int
unionfs_create_uppervattr(struct unionfs_mount *ump, struct vnode *lvp,
    struct vattr *uva, struct ucred *cred, struct thread *td)
{
	struct vattr	lva;
	int		error;

	if ((error = VOP_GETATTR(lvp, &lva, cred)))
		return (error);

	unionfs_create_uppervattr_core(ump, &lva, uva, td);

	return (error);
}

/*
 * relookup
 *
 * dvp should be locked on entry and will be locked on return.
 *
 * If an error is returned, *vpp will be invalid, otherwise it will hold a
 * locked, referenced vnode. If *vpp == dvp then remember that only one
 * LK_EXCLUSIVE lock is held.
 */
int
unionfs_relookup(struct vnode *dvp, struct vnode **vpp,
    struct componentname *cnp, struct componentname *cn, struct thread *td,
    char *path, int pathlen, u_long nameiop)
{
	int error;
	bool refstart;

	cn->cn_namelen = pathlen;
	cn->cn_pnbuf = path;
	cn->cn_nameiop = nameiop;
	cn->cn_flags = (LOCKPARENT | LOCKLEAF | ISLASTCN);
	cn->cn_lkflags = LK_EXCLUSIVE;
	cn->cn_cred = cnp->cn_cred;
	cn->cn_nameptr = cn->cn_pnbuf;

	refstart = false;
	if (nameiop == DELETE) {
		cn->cn_flags |= (cnp->cn_flags & DOWHITEOUT);
	} else if (nameiop == RENAME) {
		refstart = true;
	} else if (nameiop == CREATE) {
		cn->cn_flags |= NOCACHE;
	}

	vref(dvp);
	VOP_UNLOCK(dvp);

	if ((error = vfs_relookup(dvp, vpp, cn, refstart))) {
		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY);
	} else
		vrele(dvp);

	KASSERT(cn->cn_pnbuf == path, ("%s: cn_pnbuf changed", __func__));

	return (error);
}

/*
 * relookup for CREATE namei operation.
 *
 * dvp is the unionfs vnode; it should be locked on entry.
 *
 * When unionfs_copyfile() has been called on behalf of an operation such
 * as unionfs_link(), the state gathered by the earlier VOP_LOOKUP() is
 * stale, so a relookup is needed before the link (or similar object) can
 * be created.
 */
int
unionfs_relookup_for_create(struct vnode *dvp, struct componentname *cnp,
    struct thread *td)
{
	struct vnode *udvp;
	struct vnode *vp;
	struct componentname cn;
	int error;

	udvp = UNIONFSVPTOUPPERVP(dvp);
	vp = NULLVP;

	error = unionfs_relookup(udvp, &vp, cnp, &cn, td, cnp->cn_nameptr,
	    cnp->cn_namelen, CREATE);
	if (error)
		return (error);

	if (vp != NULLVP) {
		if (udvp == vp)
			vrele(vp);
		else
			vput(vp);

		error = EEXIST;
	}

	return (error);
}

/*
 * relookup for DELETE namei operation.
 *
 * dvp is the unionfs vnode; it should be locked on entry.
 */
int
unionfs_relookup_for_delete(struct vnode *dvp, struct componentname *cnp,
    struct thread *td)
{
	struct vnode *udvp;
	struct vnode *vp;
	struct componentname cn;
	int error;

	udvp = UNIONFSVPTOUPPERVP(dvp);
	vp = NULLVP;

	error = unionfs_relookup(udvp, &vp, cnp, &cn, td, cnp->cn_nameptr,
	    cnp->cn_namelen, DELETE);
	if (error)
		return (error);

	if (vp == NULLVP)
		error = ENOENT;
	else {
		if (udvp == vp)
			vrele(vp);
		else
			vput(vp);
	}

	return (error);
}

/*
 * relookup for RENAME namei operation.
 *
 * dvp is the unionfs vnode; it should be locked on entry.
 */
int
unionfs_relookup_for_rename(struct vnode *dvp, struct componentname *cnp,
    struct thread *td)
{
	struct vnode *udvp;
	struct vnode *vp;
	struct componentname cn;
	int error;

	udvp = UNIONFSVPTOUPPERVP(dvp);
	vp = NULLVP;

	error = unionfs_relookup(udvp, &vp, cnp, &cn, td, cnp->cn_nameptr,
	    cnp->cn_namelen, RENAME);
	if (error)
		return (error);

	if (vp != NULLVP) {
		if (udvp == vp)
			vrele(vp);
		else
			vput(vp);
	}

	return (error);
}

/*
 * Update the unionfs_node.
 *
 * uvp is the new, locked upper vnode.  The unionfs vnode's lock will be
 * switched to uvp's lock, and the lower vnode's lock will be released.
 */
static void
unionfs_node_update(struct unionfs_node *unp, struct vnode *uvp,
    struct thread *td)
{
	struct unionfs_node_hashhead *hd;
	struct vnode   *vp;
	struct vnode   *lvp;
	struct vnode   *dvp;
	unsigned	count, lockrec;

	vp = UNIONFSTOV(unp);
	lvp = unp->un_lowervp;
	ASSERT_VOP_ELOCKED(lvp, __func__);
	ASSERT_VOP_ELOCKED(uvp, __func__);
	dvp = unp->un_dvp;

	VNASSERT(vp->v_writecount == 0, vp,
	    ("%s: non-zero writecount", __func__));
	/*
	 * Update the upper vnode's lock state to match the lower vnode,
	 * and then switch the unionfs vnode's lock to the upper vnode.
	 */
	lockrec = lvp->v_vnlock->lk_recurse;
	for (count = 0; count < lockrec; count++)
		vn_lock(uvp, LK_EXCLUSIVE | LK_CANRECURSE | LK_RETRY);
	VI_LOCK(vp);
	unp->un_uppervp = uvp;
	vp->v_vnlock = uvp->v_vnlock;
	VI_UNLOCK(vp);

	/*
	 * Re-cache the unionfs vnode against the upper vnode
	 */
	if (dvp != NULLVP && vp->v_type == VDIR) {
		VI_LOCK(dvp);
		if (unp->un_hash.le_prev != NULL) {
			LIST_REMOVE(unp, un_hash);
			hd = unionfs_get_hashhead(dvp, uvp);
			LIST_INSERT_HEAD(hd, unp, un_hash);
		}
		VI_UNLOCK(unp->un_dvp);
	}
}

/*
 * Create a new shadow dir.
 *
 * udvp should be locked on entry and will be locked on return.
 *
 * If no error is returned, unp will be updated.
 */
int
unionfs_mkshadowdir(struct unionfs_mount *ump, struct vnode *udvp,
    struct unionfs_node *unp, struct componentname *cnp, struct thread *td)
{
	struct vnode   *lvp;
	struct vnode   *uvp;
	struct vattr	va;
	struct vattr	lva;
	struct nameidata nd;
	struct mount   *mp;
	struct ucred   *cred;
	struct ucred   *credbk;
	struct uidinfo *rootinfo;
	int		error;

	if (unp->un_uppervp != NULLVP)
		return (EEXIST);

	lvp = unp->un_lowervp;
	uvp = NULLVP;
	credbk = cnp->cn_cred;

	/* Authority change to root */
	rootinfo = uifind((uid_t)0);
	cred = crdup(cnp->cn_cred);
	/*
	 * The calls to chgproccnt() are needed to compensate for change_ruid()
	 * calling chgproccnt().
	 */
	chgproccnt(cred->cr_ruidinfo, 1, 0);
	change_euid(cred, rootinfo);
	change_ruid(cred, rootinfo);
	change_svuid(cred, (uid_t)0);
	uifree(rootinfo);
	cnp->cn_cred = cred;

	memset(&nd.ni_cnd, 0, sizeof(struct componentname));
	NDPREINIT(&nd);

	if ((error = VOP_GETATTR(lvp, &lva, cnp->cn_cred)))
		goto unionfs_mkshadowdir_abort;

	if ((error = unionfs_relookup(udvp, &uvp, cnp, &nd.ni_cnd, td,
	    cnp->cn_nameptr, cnp->cn_namelen, CREATE)))
		goto unionfs_mkshadowdir_abort;
	if (uvp != NULLVP) {
		if (udvp == uvp)
			vrele(uvp);
		else
			vput(uvp);

		error = EEXIST;
		goto unionfs_mkshadowdir_abort;
	}

	if ((error = vn_start_write(udvp, &mp, V_WAIT | V_PCATCH)))
		goto unionfs_mkshadowdir_abort;
	unionfs_create_uppervattr_core(ump, &lva, &va, td);

	/*
	 * Temporarily NUL-terminate the current pathname component.
	 * This function may be called during lookup operations in which
	 * the current pathname component is not the leaf, meaning that
	 * the NUL terminator is some distance beyond the end of the current
	 * component.  This *should* be fine, as cn_namelen will still
	 * correctly indicate the length of only the current component,
	 * but ZFS in particular does not respect cn_namelen in its
	 * VOP_MKDIR implementation.
	 * Note that this assumes nd.ni_cnd.cn_pnbuf was allocated by
	 * something like a local namei() operation and the temporary
	 * NUL-termination will not have an effect on other threads.
	 */
	char *pathend = &nd.ni_cnd.cn_nameptr[nd.ni_cnd.cn_namelen];
	char pathterm = *pathend;
	*pathend = '\0';
	error = VOP_MKDIR(udvp, &uvp, &nd.ni_cnd, &va);
	*pathend = pathterm;

	if (!error) {
		/*
		 * XXX Apply the attributes, in particular the uid/gid that
		 * VOP_MKDIR() could not set.  Ignore errors.
		 */
		va.va_type = VNON;
		VOP_SETATTR(uvp, &va, nd.ni_cnd.cn_cred);

		/*
		 * VOP_SETATTR() may transiently drop uvp's lock, so it's
		 * important to call it before unionfs_node_update() transfers
		 * the unionfs vnode's lock from lvp to uvp; otherwise the
		 * unionfs vnode itself would be transiently unlocked and
		 * potentially doomed.
		 */
		unionfs_node_update(unp, uvp, td);
	}
	vn_finished_write(mp);

unionfs_mkshadowdir_abort:
	cnp->cn_cred = credbk;
	chgproccnt(cred->cr_ruidinfo, -1, 0);
	crfree(cred);

	return (error);
}

static inline void
unionfs_forward_vop_ref(struct vnode *basevp, int *lkflags)
{
	ASSERT_VOP_LOCKED(basevp, __func__);
	*lkflags = VOP_ISLOCKED(basevp);
	vref(basevp);
}

/*
 * Prepare unionfs to issue a forwarded VOP to either the upper or lower
 * FS.  This should be used for any VOP which may drop the vnode lock;
 * it is not required otherwise.
 * The unionfs vnode shares its lock with the base-layer vnode(s); if the
 * base FS must transiently drop its vnode lock, the unionfs vnode may
 * effectively become unlocked.  During that window, a concurrent forced
 * unmount may doom the unionfs vnode, which leads to two significant
 * issues:
 * 1) Completion of, and return from, the unionfs VOP with the unionfs
 *    vnode completely unlocked.  When the unionfs vnode becomes doomed
 *    it stops sharing its lock with the base vnode, so even if the
 *    forwarded VOP reacquires the base vnode lock the unionfs vnode
 *    lock will no longer be held.  This can lead to violation of the
 *    caller's synchronization requirements as well as various failed
 *    locking assertions when DEBUG_VFS_LOCKS is enabled.
 * 2) Loss of reference on the base vnode.  The caller is expected to
 *    hold a v_usecount reference on the unionfs vnode, while the
 *    unionfs vnode holds a reference on the base-layer vnode(s).  But
 *    these references are released when the unionfs vnode becomes
 *    doomed, violating the base layer's expectation that its caller
 *    must hold a reference to prevent vnode recycling.
 *
 * basevp1 and basevp2 represent two base-layer vnodes which are
 * expected to be locked when this function is called.  basevp2
 * may be NULL, but if not NULL basevp1 and basevp2 should represent
 * a parent directory and a file linked to it, respectively.
 * lkflags1 and lkflags2 are output parameters that will store the
 * current lock status of basevp1 and basevp2, respectively.  They
 * are intended to be passed as the lkflags1 and lkflags2 parameters
 * in the subsequent call to unionfs_forward_vop_finish_pair().
 * lkflags2 may be NULL iff basevp2 is NULL.
 */
void
unionfs_forward_vop_start_pair(struct vnode *basevp1, int *lkflags1,
    struct vnode *basevp2, int *lkflags2)
{
	/*
	 * Take an additional reference on the base-layer vnodes to
	 * avoid loss of reference if the unionfs vnodes are doomed.
	 */
	unionfs_forward_vop_ref(basevp1, lkflags1);
	if (basevp2 != NULL)
		unionfs_forward_vop_ref(basevp2, lkflags2);
}

static inline bool
unionfs_forward_vop_rele(struct vnode *unionvp, struct vnode *basevp,
    int lkflags)
{
	bool unionvp_doomed;

	if (__predict_false(VTOUNIONFS(unionvp) == NULL)) {
		if ((lkflags & LK_EXCLUSIVE) != 0)
			ASSERT_VOP_ELOCKED(basevp, __func__);
		else
			ASSERT_VOP_LOCKED(basevp, __func__);
		unionvp_doomed = true;
	} else {
		vrele(basevp);
		unionvp_doomed = false;
	}

	return (unionvp_doomed);
}

/*
 * Indicate completion of a forwarded VOP previously prepared by
 * unionfs_forward_vop_start_pair().
 * basevp1 and basevp2 must be the same values passed to the prior
 * call to unionfs_forward_vop_start_pair().  unionvp1 and unionvp2
 * must be the unionfs vnodes that were initially above basevp1 and
 * basevp2, respectively.
 * basevp1 and basevp2 (if not NULL) must be locked when this function
 * is called, while unionvp1 and/or unionvp2 may be unlocked if either
 * unionfs vnode has become doomed.
 * lkflags1 and lkflags2 represent the locking flags that should be
 * used to re-lock unionvp1 and unionvp2, respectively, if either
 * vnode has become doomed.
 *
 * Returns true if any unionfs vnode was found to be doomed, false
 * otherwise.
 */
bool
unionfs_forward_vop_finish_pair(
    struct vnode *unionvp1, struct vnode *basevp1, int lkflags1,
    struct vnode *unionvp2, struct vnode *basevp2, int lkflags2)
{
	bool vp1_doomed, vp2_doomed;

	/*
	 * If either vnode is found to have been doomed, set
	 * a flag indicating that it needs to be re-locked.
	 * Otherwise, simply drop the base-vnode reference that
	 * was taken in unionfs_forward_vop_start().
	 */
	vp1_doomed = unionfs_forward_vop_rele(unionvp1, basevp1, lkflags1);

	if (unionvp2 != NULL)
		vp2_doomed = unionfs_forward_vop_rele(unionvp2, basevp2, lkflags2);
	else
		vp2_doomed = false;

	/*
	 * If any of the unionfs vnodes need to be re-locked, that
	 * means the unionfs vnode's lock is now de-coupled from the
	 * corresponding base vnode.  We therefore need to drop the
	 * base vnode lock (since nothing else will after this point),
	 * and also release the reference taken in
	 * unionfs_forward_vop_start_pair().
	 */
	if (__predict_false(vp1_doomed && vp2_doomed))
		VOP_VPUT_PAIR(basevp1, &basevp2, true);
	else if (__predict_false(vp1_doomed)) {
		/*
		 * If basevp1 needs to be unlocked, then we may not
		 * be able to safely unlock it with basevp2 still locked,
		 * for the same reason that an ordinary VFS call would
		 * need to use VOP_VPUT_PAIR() here.  We might be able
		 * to use VOP_VPUT_PAIR(..., false) here, but then we
		 * would need to deal with the possibility of basevp2
		 * changing out from under us, which could result in
		 * either the unionfs vnode becoming doomed or its
		 * upper/lower vp no longer matching basevp2.  Either
		 * scenario would require at least re-locking the unionfs
		 * vnode anyway.
		 */
		if (unionvp2 != NULL) {
			VOP_UNLOCK(unionvp2);
			vp2_doomed = true;
		}
		vput(basevp1);
	} else if (__predict_false(vp2_doomed))
		vput(basevp2);

	if (__predict_false(vp1_doomed || vp2_doomed))
		vn_lock_pair(unionvp1, !vp1_doomed, lkflags1,
		    unionvp2, !vp2_doomed, lkflags2);

	return (vp1_doomed || vp2_doomed);
}
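
/*
 * Illustrative sketch (not part of the build): the intended bracketing of
 * a forwarded VOP that may drop the vnode lock, here for a hypothetical
 * parent/child pair as in a remove.  vp/dvp are unionfs vnodes, uvp/udvp
 * their upper vnodes; the names and the specific VOP are assumptions.
 *
 *	int udvp_lkflags, uvp_lkflags;
 *
 *	unionfs_forward_vop_start_pair(udvp, &udvp_lkflags,
 *	    uvp, &uvp_lkflags);
 *	error = VOP_REMOVE(udvp, uvp, cnp);
 *	unionfs_forward_vop_finish_pair(dvp, udvp, udvp_lkflags,
 *	    vp, uvp, uvp_lkflags);
 *	// A true return means at least one unionfs vnode was doomed
 *	// during the VOP and has been re-locked per the lkflags.
 *
 * The single-vnode unionfs_forward_vop_start()/finish() forms used in
 * unionfs_mkwhiteout() below appear to wrap these pair functions with a
 * NULL second vnode.
 */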

/*
 * Create a new whiteout.
 *
 * udvp and dvp should be locked on entry and will be locked on return.
 */
int
unionfs_mkwhiteout(struct vnode *dvp, struct vnode *udvp,
    struct componentname *cnp, struct thread *td, char *path, int pathlen)
{
	struct vnode   *wvp;
	struct nameidata nd;
	struct mount   *mp;
	int		error;
	int		lkflags;

	wvp = NULLVP;
	NDPREINIT(&nd);
	if ((error = unionfs_relookup(udvp, &wvp, cnp, &nd.ni_cnd, td, path,
	    pathlen, CREATE))) {
		return (error);
	}
	if (wvp != NULLVP) {
		if (udvp == wvp)
			vrele(wvp);
		else
			vput(wvp);

		return (EEXIST);
	}

	if ((error = vn_start_write(udvp, &mp, V_WAIT | V_PCATCH)))
		goto unionfs_mkwhiteout_free_out;
	unionfs_forward_vop_start(udvp, &lkflags);
	error = VOP_WHITEOUT(udvp, &nd.ni_cnd, CREATE);
	unionfs_forward_vop_finish(dvp, udvp, lkflags);

	vn_finished_write(mp);

unionfs_mkwhiteout_free_out:
	return (error);
}

/*
 * Create a new vnode for a new shadow file.
 *
 * If an error is returned, *vpp will be invalid; otherwise it will hold a
 * locked, referenced, and opened vnode.
 *
 * unp is never updated.
 */
static int
unionfs_vn_create_on_upper(struct vnode **vpp, struct vnode *udvp,
    struct unionfs_node *unp, struct vattr *uvap, struct thread *td)
{
	struct unionfs_mount *ump;
	struct vnode   *vp;
	struct vnode   *lvp;
	struct ucred   *cred;
	struct vattr	lva;
	struct nameidata nd;
	int		fmode;
	int		error;

	ump = MOUNTTOUNIONFSMOUNT(UNIONFSTOV(unp)->v_mount);
	vp = NULLVP;
	lvp = unp->un_lowervp;
	cred = td->td_ucred;
	fmode = FFLAGS(O_WRONLY | O_CREAT | O_TRUNC | O_EXCL);
	error = 0;

	if ((error = VOP_GETATTR(lvp, &lva, cred)) != 0)
		return (error);
	unionfs_create_uppervattr_core(ump, &lva, uvap, td);

	if (unp->un_path == NULL)
		panic("%s: NULL un_path", __func__);

	nd.ni_cnd.cn_namelen = unp->un_pathlen;
	nd.ni_cnd.cn_pnbuf = unp->un_path;
	nd.ni_cnd.cn_nameiop = CREATE;
	nd.ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF | ISLASTCN;
	nd.ni_cnd.cn_lkflags = LK_EXCLUSIVE;
	nd.ni_cnd.cn_cred = cred;
	nd.ni_cnd.cn_nameptr = nd.ni_cnd.cn_pnbuf;
	NDPREINIT(&nd);

	vref(udvp);
	if ((error = vfs_relookup(udvp, &vp, &nd.ni_cnd, false)) != 0)
		goto unionfs_vn_create_on_upper_free_out2;
	vrele(udvp);

	if (vp != NULLVP) {
		if (vp == udvp)
			vrele(vp);
		else
			vput(vp);
		error = EEXIST;
		goto unionfs_vn_create_on_upper_free_out1;
	}

	if ((error = VOP_CREATE(udvp, &vp, &nd.ni_cnd, uvap)) != 0)
		goto unionfs_vn_create_on_upper_free_out1;

	if ((error = VOP_OPEN(vp, fmode, cred, td, NULL)) != 0) {
		vput(vp);
		goto unionfs_vn_create_on_upper_free_out1;
	}
	error = VOP_ADD_WRITECOUNT(vp, 1);
	CTR3(KTR_VFS, "%s: vp %p v_writecount increased to %d",
	    __func__, vp, vp->v_writecount);
	if (error == 0) {
		*vpp = vp;
	} else {
		VOP_CLOSE(vp, fmode, cred, td);
	}

unionfs_vn_create_on_upper_free_out1:
	VOP_UNLOCK(udvp);

unionfs_vn_create_on_upper_free_out2:
	KASSERT(nd.ni_cnd.cn_pnbuf == unp->un_path,
	    ("%s: cn_pnbuf changed", __func__));

	return (error);
}

/*
 * Copy from lvp to uvp.
 *
 * lvp and uvp should be locked and opened on entry and will be locked and
 * opened on return.
 */
static int
unionfs_copyfile_core(struct vnode *lvp, struct vnode *uvp,
    struct ucred *cred, struct thread *td)
{
	char           *buf;
	struct uio	uio;
	struct iovec	iov;
	off_t		offset;
	int		count;
	int		error;
	int		bufoffset;

	error = 0;
	memset(&uio, 0, sizeof(uio));

	uio.uio_td = td;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_offset = 0;

	buf = malloc(MAXBSIZE, M_TEMP, M_WAITOK);

	while (error == 0) {
		offset = uio.uio_offset;

		uio.uio_iov = &iov;
		uio.uio_iovcnt = 1;
		iov.iov_base = buf;
		iov.iov_len = MAXBSIZE;
		uio.uio_resid = iov.iov_len;
		uio.uio_rw = UIO_READ;

		if ((error = VOP_READ(lvp, &uio, 0, cred)) != 0)
			break;
		if ((count = MAXBSIZE - uio.uio_resid) == 0)
			break;

		bufoffset = 0;
		while (bufoffset < count) {
			uio.uio_iov = &iov;
			uio.uio_iovcnt = 1;
			iov.iov_base = buf + bufoffset;
			iov.iov_len = count - bufoffset;
			uio.uio_offset = offset + bufoffset;
			uio.uio_resid = iov.iov_len;
			uio.uio_rw = UIO_WRITE;

			if ((error = VOP_WRITE(uvp, &uio, 0, cred)) != 0)
				break;

			bufoffset += (count - bufoffset) - uio.uio_resid;
		}

		uio.uio_offset = offset + bufoffset;
	}

	free(buf, M_TEMP);

	return (error);
}

/*
 * Copy a file from the lower to the upper layer.
 *
 * Set docopy to 1 to copy the file contents as well; set it to 0 to create
 * the upper vnode without copying.
 *
 * If no error is returned, unp will be updated.
 */
int
unionfs_copyfile(struct unionfs_node *unp, int docopy, struct ucred *cred,
    struct thread *td)
{
	struct mount   *mp;
	struct vnode   *udvp;
	struct vnode   *lvp;
	struct vnode   *uvp;
	struct vattr	uva;
	int		error;

	lvp = unp->un_lowervp;
	uvp = NULLVP;

	if ((UNIONFSTOV(unp)->v_mount->mnt_flag & MNT_RDONLY))
		return (EROFS);
	if (unp->un_dvp == NULLVP)
		return (EINVAL);
	if (unp->un_uppervp != NULLVP)
		return (EEXIST);
	udvp = VTOUNIONFS(unp->un_dvp)->un_uppervp;
	if (udvp == NULLVP)
		return (EROFS);
	if ((udvp->v_mount->mnt_flag & MNT_RDONLY))
		return (EROFS);

	error = VOP_ACCESS(lvp, VREAD, cred, td);
	if (error != 0)
		return (error);

	if ((error = vn_start_write(udvp, &mp, V_WAIT | V_PCATCH)) != 0)
		return (error);
	error = unionfs_vn_create_on_upper(&uvp, udvp, unp, &uva, td);
	if (error != 0) {
		vn_finished_write(mp);
		return (error);
	}

	if (docopy != 0) {
		error = VOP_OPEN(lvp, FREAD, cred, td, NULL);
		if (error == 0) {
			error = unionfs_copyfile_core(lvp, uvp, cred, td);
			VOP_CLOSE(lvp, FREAD, cred, td);
		}
	}
	VOP_CLOSE(uvp, FWRITE, cred, td);
	VOP_ADD_WRITECOUNT_CHECKED(uvp, -1);
	CTR3(KTR_VFS, "%s: vp %p v_writecount decreased to %d",
	    __func__, uvp, uvp->v_writecount);

	vn_finished_write(mp);

	if (error == 0) {
		/* Reset the attributes. Ignore errors. */
		uva.va_type = VNON;
		VOP_SETATTR(uvp, &uva, cred);
	}

	unionfs_node_update(unp, uvp, td);

	return (error);
}
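
/*
 * Illustrative sketch (not part of the build): a typical copy-up before
 * an operation that needs a writable upper vnode.  The surrounding names
 * are hypothetical; only the unionfs_copyfile() contract described above
 * is taken from this file.
 *
 *	if (unp->un_uppervp == NULLVP) {
 *		// docopy != 0: copy the file contents as well, since the
 *		// file is about to be written.
 *		error = unionfs_copyfile(unp, 1, td->td_ucred, td);
 *		if (error != 0)
 *			return (error);
 *	}
 *	// unp->un_uppervp is now valid, and the unionfs vnode's lock
 *	// has been switched to it by unionfs_node_update().
 */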

/*
 * Check whether vp may be removed with rmdir, i.e. whether the
 * directory is empty.
 *
 * vp is the unionfs vnode; it should be locked on entry.
 */
int
unionfs_check_rmdir(struct vnode *vp, struct ucred *cred, struct thread *td)
{
	struct vnode   *uvp;
	struct vnode   *lvp;
	struct vnode   *tvp;
	struct dirent  *dp;
	struct dirent  *edp;
	struct componentname cn;
	struct iovec	iov;
	struct uio	uio;
	struct vattr	va;
	int		error;
	int		eofflag;
	int		lookuperr;

	/*
	 * The size of buf needs to be larger than DIRBLKSIZ.
	 */
	char		buf[256 * 6];

	ASSERT_VOP_ELOCKED(vp, __func__);

	eofflag = 0;
	uvp = UNIONFSVPTOUPPERVP(vp);
	lvp = UNIONFSVPTOLOWERVP(vp);

	/* check opaque */
	if ((error = VOP_GETATTR(uvp, &va, cred)) != 0)
		return (error);
	if (va.va_flags & OPAQUE)
		return (0);

	/* open vnode */
#ifdef MAC
	if ((error = mac_vnode_check_open(cred, vp, VEXEC|VREAD)) != 0)
		return (error);
#endif
	if ((error = VOP_ACCESS(vp, VEXEC|VREAD, cred, td)) != 0)
		return (error);
	if ((error = VOP_OPEN(vp, FREAD, cred, td, NULL)) != 0)
		return (error);

	uio.uio_rw = UIO_READ;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_td = td;
	uio.uio_offset = 0;

#ifdef MAC
	error = mac_vnode_check_readdir(td->td_ucred, lvp);
#endif
	while (!error && !eofflag) {
		iov.iov_base = buf;
		iov.iov_len = sizeof(buf);
		uio.uio_iov = &iov;
		uio.uio_iovcnt = 1;
		uio.uio_resid = iov.iov_len;

		error = VOP_READDIR(lvp, &uio, cred, &eofflag, NULL, NULL);
		if (error != 0)
			break;
		KASSERT(eofflag != 0 || uio.uio_resid < sizeof(buf),
		    ("%s: empty read from lower FS", __func__));

		edp = (struct dirent*)&buf[sizeof(buf) - uio.uio_resid];
		for (dp = (struct dirent*)buf; !error && dp < edp;
		     dp = (struct dirent*)((caddr_t)dp + dp->d_reclen)) {
			if (dp->d_type == DT_WHT || dp->d_fileno == 0 ||
			    (dp->d_namlen == 1 && dp->d_name[0] == '.') ||
			    (dp->d_namlen == 2 && !bcmp(dp->d_name, "..", 2)))
				continue;

			cn.cn_namelen = dp->d_namlen;
			cn.cn_pnbuf = NULL;
			cn.cn_nameptr = dp->d_name;
			cn.cn_nameiop = LOOKUP;
			cn.cn_flags = LOCKPARENT | LOCKLEAF | RDONLY | ISLASTCN;
			cn.cn_lkflags = LK_EXCLUSIVE;
			cn.cn_cred = cred;

			/*
			 * Check that the entry exists in the lower layer;
			 * readdir sometimes returns a stale or bogus
			 * entry.
			 */
			lookuperr = VOP_LOOKUP(lvp, &tvp, &cn);

			if (!lookuperr)
				vput(tvp);
			else
				continue; /* skip entry */

			/*
			 * Check the entry in the upper layer.  If there is
			 * neither an existing entry nor a whiteout for it,
			 * the directory is not empty.
			 */
			cn.cn_flags = LOCKPARENT | LOCKLEAF | RDONLY | ISLASTCN;
			lookuperr = VOP_LOOKUP(uvp, &tvp, &cn);

			if (!lookuperr)
				vput(tvp);

			/* ignore exist or whiteout entry */
			if (!lookuperr ||
			    (lookuperr == ENOENT && (cn.cn_flags & ISWHITEOUT)))
				continue;

			error = ENOTEMPTY;
		}
	}

	/* close vnode */
	VOP_CLOSE(vp, FREAD, cred, td);

	return (error);
}