zfs_vnops.c revision 212951
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25/* Portions Copyright 2007 Jeremy Teo */
26
27#include <sys/types.h>
28#include <sys/param.h>
29#include <sys/time.h>
30#include <sys/systm.h>
31#include <sys/sysmacros.h>
32#include <sys/resource.h>
33#include <sys/vfs.h>
34#include <sys/vnode.h>
35#include <sys/file.h>
36#include <sys/stat.h>
37#include <sys/kmem.h>
38#include <sys/taskq.h>
39#include <sys/uio.h>
40#include <sys/atomic.h>
41#include <sys/namei.h>
42#include <sys/mman.h>
43#include <sys/cmn_err.h>
44#include <sys/errno.h>
45#include <sys/unistd.h>
46#include <sys/zfs_dir.h>
47#include <sys/zfs_ioctl.h>
48#include <sys/fs/zfs.h>
49#include <sys/dmu.h>
50#include <sys/spa.h>
51#include <sys/txg.h>
52#include <sys/dbuf.h>
53#include <sys/zap.h>
54#include <sys/dirent.h>
55#include <sys/policy.h>
56#include <sys/sunddi.h>
57#include <sys/filio.h>
58#include <sys/sid.h>
59#include <sys/zfs_ctldir.h>
60#include <sys/zfs_fuid.h>
61#include <sys/dnlc.h>
62#include <sys/zfs_rlock.h>
63#include <sys/extdirent.h>
64#include <sys/kidmap.h>
65#include <sys/bio.h>
66#include <sys/buf.h>
67#include <sys/sf_buf.h>
68#include <sys/sched.h>
69#include <sys/acl.h>
70
71/*
72 * Programming rules.
73 *
74 * Each vnode op performs some logical unit of work.  To do this, the ZPL must
75 * properly lock its in-core state, create a DMU transaction, do the work,
76 * record this work in the intent log (ZIL), commit the DMU transaction,
77 * and wait for the intent log to commit if it is a synchronous operation.
78 * Moreover, the vnode ops must work in both normal and log replay context.
79 * The ordering of events is important to avoid deadlocks and references
80 * to freed memory.  The example below illustrates the following Big Rules:
81 *
82 *  (1) A check must be made in each zfs thread for a mounted file system.
 *	This is done, while avoiding races, by using ZFS_ENTER(zfsvfs).
84 *      A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
85 *      must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
86 *      can return EIO from the calling function.
87 *
88 *  (2)	VN_RELE() should always be the last thing except for zil_commit()
89 *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
90 *	First, if it's the last reference, the vnode/znode
91 *	can be freed, so the zp may point to freed memory.  Second, the last
92 *	reference will call zfs_zinactive(), which may induce a lot of work --
93 *	pushing cached pages (which acquires range locks) and syncing out
94 *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
95 *	which could deadlock the system if you were already holding one.
96 *	If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
97 *
98 *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
99 *	as they can span dmu_tx_assign() calls.
100 *
101 *  (4)	Always pass TXG_NOWAIT as the second argument to dmu_tx_assign().
102 *	This is critical because we don't want to block while holding locks.
103 *	Note, in particular, that if a lock is sometimes acquired before
104 *	the tx assigns, and sometimes after (e.g. z_lock), then failing to
105 *	use a non-blocking assign can deadlock the system.  The scenario:
106 *
107 *	Thread A has grabbed a lock before calling dmu_tx_assign().
108 *	Thread B is in an already-assigned tx, and blocks for this lock.
109 *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
110 *	forever, because the previous txg can't quiesce until B's tx commits.
111 *
112 *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
113 *	then drop all locks, call dmu_tx_wait(), and try again.
114 *
115 *  (5)	If the operation succeeded, generate the intent log entry for it
116 *	before dropping locks.  This ensures that the ordering of events
117 *	in the intent log matches the order in which they actually occurred.
118 *      During ZIL replay the zfs_log_* functions will update the sequence
119 *	number to indicate the zil transaction has replayed.
120 *
121 *  (6)	At the end of each vnode op, the DMU tx must always commit,
122 *	regardless of whether there were any errors.
123 *
124 *  (7)	After dropping all locks, invoke zil_commit(zilog, seq, foid)
125 *	to ensure that synchronous semantics are provided when necessary.
126 *
127 * In general, this is how things should be ordered in each vnode op:
128 *
129 *	ZFS_ENTER(zfsvfs);		// exit if unmounted
130 * top:
131 *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may VN_HOLD())
132 *	rw_enter(...);			// grab any other locks you need
133 *	tx = dmu_tx_create(...);	// get DMU tx
134 *	dmu_tx_hold_*();		// hold each object you might modify
135 *	error = dmu_tx_assign(tx, TXG_NOWAIT);	// try to assign
136 *	if (error) {
137 *		rw_exit(...);		// drop locks
138 *		zfs_dirent_unlock(dl);	// unlock directory entry
139 *		VN_RELE(...);		// release held vnodes
140 *		if (error == ERESTART) {
141 *			dmu_tx_wait(tx);
142 *			dmu_tx_abort(tx);
143 *			goto top;
144 *		}
145 *		dmu_tx_abort(tx);	// abort DMU tx
146 *		ZFS_EXIT(zfsvfs);	// finished in zfs
147 *		return (error);		// really out of space
148 *	}
149 *	error = do_real_work();		// do whatever this VOP does
150 *	if (error == 0)
151 *		zfs_log_*(...);		// on success, make ZIL entry
152 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
153 *	rw_exit(...);			// drop locks
154 *	zfs_dirent_unlock(dl);		// unlock directory entry
155 *	VN_RELE(...);			// release held vnodes
156 *	zil_commit(zilog, seq, foid);	// synchronous when necessary
157 *	ZFS_EXIT(zfsvfs);		// finished in zfs
158 *	return (error);			// done, report error
159 */
160
161/* ARGSUSED */
162static int
163zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
164{
165	znode_t	*zp = VTOZ(*vpp);
166	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
167
168	ZFS_ENTER(zfsvfs);
169	ZFS_VERIFY_ZP(zp);
170
171	if ((flag & FWRITE) && (zp->z_phys->zp_flags & ZFS_APPENDONLY) &&
172	    ((flag & FAPPEND) == 0)) {
173		ZFS_EXIT(zfsvfs);
174		return (EPERM);
175	}
176
177	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
178	    ZTOV(zp)->v_type == VREG &&
179	    !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
180	    zp->z_phys->zp_size > 0) {
181		if (fs_vscan(*vpp, cr, 0) != 0) {
182			ZFS_EXIT(zfsvfs);
183			return (EACCES);
184		}
185	}
186
187	/* Keep a count of the synchronous opens in the znode */
188	if (flag & (FSYNC | FDSYNC))
189		atomic_inc_32(&zp->z_sync_cnt);
190
191	ZFS_EXIT(zfsvfs);
192	return (0);
193}
194
195/* ARGSUSED */
196static int
197zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
198    caller_context_t *ct)
199{
200	znode_t	*zp = VTOZ(vp);
201	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
202
203	/*
204	 * Clean up any locks held by this process on the vp.
205	 */
206	cleanlocks(vp, ddi_get_pid(), 0);
207	cleanshares(vp, ddi_get_pid());
208
209	ZFS_ENTER(zfsvfs);
210	ZFS_VERIFY_ZP(zp);
211
212	/* Decrement the synchronous opens in the znode */
213	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
214		atomic_dec_32(&zp->z_sync_cnt);
215
216	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
217	    ZTOV(zp)->v_type == VREG &&
218	    !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
219	    zp->z_phys->zp_size > 0)
220		VERIFY(fs_vscan(vp, cr, 1) == 0);
221
222	ZFS_EXIT(zfsvfs);
223	return (0);
224}
225
226/*
227 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
228 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
229 */
230static int
231zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
232{
233	znode_t	*zp = VTOZ(vp);
234	uint64_t noff = (uint64_t)*off; /* new offset */
235	uint64_t file_sz;
236	int error;
237	boolean_t hole;
238
239	file_sz = zp->z_phys->zp_size;
240	if (noff >= file_sz)  {
241		return (ENXIO);
242	}
243
244	if (cmd == _FIO_SEEK_HOLE)
245		hole = B_TRUE;
246	else
247		hole = B_FALSE;
248
249	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
250
251	/* end of file? */
252	if ((error == ESRCH) || (noff > file_sz)) {
253		/*
254		 * Handle the virtual hole at the end of file.
255		 */
256		if (hole) {
257			*off = file_sz;
258			return (0);
259		}
260		return (ENXIO);
261	}
262
263	if (noff < *off)
264		return (error);
265	*off = noff;
266	return (error);
267}
268
269/* ARGSUSED */
270static int
271zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
272    int *rvalp, caller_context_t *ct)
273{
274	offset_t off;
275	int error;
276	zfsvfs_t *zfsvfs;
277	znode_t *zp;
278
279	switch (com) {
280	case _FIOFFS:
281		return (0);
282
283		/*
284		 * The following two ioctls are used by bfu.  Faking out,
285		 * necessary to avoid bfu errors.
286		 */
287	case _FIOGDIO:
288	case _FIOSDIO:
289		return (0);
290
291	case _FIO_SEEK_DATA:
292	case _FIO_SEEK_HOLE:
293		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
294			return (EFAULT);
295
296		zp = VTOZ(vp);
297		zfsvfs = zp->z_zfsvfs;
298		ZFS_ENTER(zfsvfs);
299		ZFS_VERIFY_ZP(zp);
300
301		/* offset parameter is in/out */
302		error = zfs_holey(vp, com, &off);
303		ZFS_EXIT(zfsvfs);
304		if (error)
305			return (error);
306		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
307			return (EFAULT);
308		return (0);
309	}
310	return (ENOTTY);
311}
312
/*
 * Look up the VM page caching byte range [off, off + nbytes) of the file
 * at object offset "start".  If the page is resident and fully valid for
 * that range, return it busied with its dirty bits cleared; otherwise
 * return NULL (freeing any stale cached page for that index first).
 * The VM object lock must be held on entry; vm_page_sleep() may release
 * it while waiting for a busy page — NOTE(review): confirm against this
 * FreeBSD version's vm_page_sleep() semantics.
 */
static vm_page_t
page_lookup(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
{
	vm_object_t obj;
	vm_page_t pp;

	obj = vp->v_object;
	VM_OBJECT_LOCK_ASSERT(obj, MA_OWNED);

	for (;;) {
		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
		    vm_page_is_valid(pp, (vm_offset_t)off, nbytes)) {
			if ((pp->oflags & VPO_BUSY) != 0) {
				/*
				 * Reference the page before unlocking and
				 * sleeping so that the page daemon is less
				 * likely to reclaim it.
				 */
				vm_page_lock_queues();
				vm_page_flag_set(pp, PG_REFERENCED);
				vm_page_sleep(pp, "zfsmwb");
				/* Retry the lookup after waking up. */
				continue;
			}
			vm_page_busy(pp);
			vm_page_undirty(pp);
		} else {
			if (__predict_false(obj->cache != NULL)) {
				/* Drop any stale cached page at this index. */
				vm_page_cache_free(obj, OFF_TO_IDX(start),
				    OFF_TO_IDX(start) + 1);
			}
			pp = NULL;
		}
		break;
	}
	return (pp);
}
349
/*
 * Release the busy state on a page returned by page_lookup(), waking
 * anyone sleeping on it.
 */
static void
page_unlock(vm_page_t pp)
{

	vm_page_wakeup(pp);
}
356
357static caddr_t
358zfs_map_page(vm_page_t pp, struct sf_buf **sfp)
359{
360
361	*sfp = sf_buf_alloc(pp, 0);
362	return ((caddr_t)sf_buf_kva(*sfp));
363}
364
/*
 * Release an sf_buf KVA mapping obtained via zfs_map_page().
 */
static void
zfs_unmap_page(struct sf_buf *sf)
{

	sf_buf_free(sf);
}
371
372
373/*
374 * When a file is memory mapped, we must keep the IO data synchronized
375 * between the DMU cache and the memory mapped pages.  What this means:
376 *
377 * On Write:	If we find a memory mapped page, we write to *both*
378 *		the page and the dmu buffer.
379 */
380
381static void
382update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
383    int segflg, dmu_tx_t *tx)
384{
385	vm_object_t obj;
386	struct sf_buf *sf;
387	int off;
388
389	ASSERT(vp->v_mount != NULL);
390	obj = vp->v_object;
391	ASSERT(obj != NULL);
392
393	off = start & PAGEOFFSET;
394	VM_OBJECT_LOCK(obj);
395	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
396		vm_page_t pp;
397		int nbytes = MIN(PAGESIZE - off, len);
398
399		if ((pp = page_lookup(vp, start, off, nbytes)) != NULL) {
400			caddr_t va;
401
402			VM_OBJECT_UNLOCK(obj);
403			va = zfs_map_page(pp, &sf);
404			if (segflg == UIO_NOCOPY) {
405				(void) dmu_write(os, oid, start+off, nbytes,
406				    va+off, tx);
407			} else {
408				(void) dmu_read(os, oid, start+off, nbytes,
409				    va+off, DMU_READ_PREFETCH);;
410			}
411			zfs_unmap_page(sf);
412			VM_OBJECT_LOCK(obj);
413			page_unlock(pp);
414
415		}
416		len -= nbytes;
417		off = 0;
418	}
419	VM_OBJECT_UNLOCK(obj);
420}
421
422/*
423 * When a file is memory mapped, we must keep the IO data synchronized
424 * between the DMU cache and the memory mapped pages.  What this means:
425 *
426 * On Read:	We "read" preferentially from memory mapped pages,
427 *		else we default from the dmu buffer.
428 *
429 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
430 *	the file is memory mapped.
431 */
/*
 * Satisfy a read against a memory-mapped file.  For each page-sized
 * piece of the request: if a resident, valid page exists, copy from the
 * page; otherwise the piece is accumulated into "dirbytes" and read from
 * the DMU in one batched dmu_read_uio() call.  A special path handles
 * sendfile(2) (UIO_NOCOPY), which fills resident-but-invalid pages from
 * the DMU directly.  Returns 0 or an errno value.
 */
static int
mappedread(vnode_t *vp, int nbytes, uio_t *uio)
{
	znode_t *zp = VTOZ(vp);
	objset_t *os = zp->z_zfsvfs->z_os;
	vm_object_t obj;
	vm_page_t m;
	struct sf_buf *sf;
	int64_t start;
	caddr_t va;
	int len = nbytes;
	int off;
	int error = 0;
	uint64_t dirbytes;	/* length of pending uncached (DMU) run */

	ASSERT(vp->v_mount != NULL);
	obj = vp->v_object;
	ASSERT(obj != NULL);

	start = uio->uio_loffset;
	off = start & PAGEOFFSET;
	dirbytes = 0;
	VM_OBJECT_LOCK(obj);
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		int bytes = MIN(PAGESIZE - off, len);

again:
		if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
		    vm_page_is_valid(m, off, bytes)) {
			if ((m->oflags & VPO_BUSY) != 0) {
				/*
				 * Reference the page before unlocking and
				 * sleeping so that the page daemon is less
				 * likely to reclaim it.
				 */
				vm_page_lock_queues();
				vm_page_flag_set(m, PG_REFERENCED);
				vm_page_sleep(m, "zfsmrb");
				goto again;
			}

			vm_page_busy(m);
			VM_OBJECT_UNLOCK(obj);
			/* Flush any accumulated uncached run first. */
			if (dirbytes > 0) {
				error = dmu_read_uio(os, zp->z_id, uio,
				    dirbytes);
				dirbytes = 0;
			}
			if (error == 0)
				uiomove_fromphys(&m, off, bytes, uio);
			VM_OBJECT_LOCK(obj);
			vm_page_wakeup(m);
		} else if (m != NULL && uio->uio_segflg == UIO_NOCOPY) {
			/*
			 * The code below is here to make sendfile(2) work
			 * correctly with ZFS. As pointed out by ups@
			 * sendfile(2) should be changed to use VOP_GETPAGES(),
			 * but it would pessimize performance of sendfile/UFS,
			 * that's why I handle this special case in ZFS code.
			 */
			if ((m->oflags & VPO_BUSY) != 0) {
				/*
				 * Reference the page before unlocking and
				 * sleeping so that the page daemon is less
				 * likely to reclaim it.
				 */
				vm_page_lock_queues();
				vm_page_flag_set(m, PG_REFERENCED);
				vm_page_sleep(m, "zfsmrb");
				goto again;
			}
			vm_page_busy(m);
			VM_OBJECT_UNLOCK(obj);
			/* Flush any accumulated uncached run first. */
			if (dirbytes > 0) {
				error = dmu_read_uio(os, zp->z_id, uio,
				    dirbytes);
				dirbytes = 0;
			}
			if (error == 0) {
				/* Fill the resident page from the DMU. */
				va = zfs_map_page(m, &sf);
				error = dmu_read(os, zp->z_id, start + off,
				    bytes, (void *)(va + off),
				    DMU_READ_PREFETCH);
				zfs_unmap_page(sf);
			}
			VM_OBJECT_LOCK(obj);
			if (error == 0)
				vm_page_set_valid(m, off, bytes);
			vm_page_wakeup(m);
			if (error == 0) {
				/* NOCOPY: advance the uio by hand. */
				uio->uio_resid -= bytes;
				uio->uio_offset += bytes;
			}
		} else {
			/* Not cached: defer to a batched DMU read. */
			dirbytes += bytes;
		}
		len -= bytes;
		off = 0;
		if (error)
			break;
	}
	VM_OBJECT_UNLOCK(obj);
	/* Flush any trailing uncached run. */
	if (error == 0 && dirbytes > 0)
		error = dmu_read_uio(os, zp->z_id, uio, dirbytes);
	return (error);
}
538
539offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
540
541/*
542 * Read bytes from specified file into supplied buffer.
543 *
544 *	IN:	vp	- vnode of file to be read from.
545 *		uio	- structure supplying read location, range info,
546 *			  and return buffer.
547 *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
548 *		cr	- credentials of caller.
549 *		ct	- caller context
550 *
551 *	OUT:	uio	- updated offset and range, buffer filled.
552 *
553 *	RETURN:	0 if success
554 *		error code if failure
555 *
556 * Side Effects:
557 *	vp - atime updated if byte count > 0
558 */
/* ARGSUSED */
static int
zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	objset_t	*os;
	ssize_t		n, nbytes;
	int		error;
	rl_t		*rl;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	os = zfsvfs->z_os;

	/* Anti-virus quarantined files may not be read. */
	if (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) {
		ZFS_EXIT(zfsvfs);
		return (EACCES);
	}

	/*
	 * Validate file offset
	 */
	if (uio->uio_loffset < (offset_t)0) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/*
	 * Fasttrack empty reads
	 */
	if (uio->uio_resid == 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	/*
	 * Check for mandatory locks
	 */
	if (MANDMODE((mode_t)zp->z_phys->zp_mode)) {
		if (error = chklock(vp, FREAD,
		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * If we're in FRSYNC mode, sync out this znode before reading it.
	 */
	if (ioflag & FRSYNC)
		zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);

	/*
	 * Lock the range against changes.
	 */
	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);

	/*
	 * If we are reading past end-of-file we can skip
	 * to the end; but we might still need to set atime.
	 */
	if (uio->uio_loffset >= zp->z_phys->zp_size) {
		error = 0;
		goto out;
	}

	ASSERT(uio->uio_loffset < zp->z_phys->zp_size);
	n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset);

	/* Read in chunks of at most zfs_read_chunk_size bytes. */
	while (n > 0) {
		nbytes = MIN(n, zfs_read_chunk_size -
		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));

		/* Prefer resident VM pages when the file is mmapped. */
		if (vn_has_cached_data(vp))
			error = mappedread(vp, nbytes, uio);
		else
			error = dmu_read_uio(os, zp->z_id, uio, nbytes);
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = EIO;
			break;
		}

		n -= nbytes;
	}

out:
	zfs_range_unlock(rl);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);
	return (error);
}
654
655/*
656 * Fault in the pages of the first n bytes specified by the uio structure.
657 * 1 byte in each page is touched and the uio struct is unmodified.
658 * Any error will exit this routine as this is only a best
659 * attempt to get the pages resident. This is a copy of ufs_trans_touch().
660 */
661static void
662zfs_prefault_write(ssize_t n, struct uio *uio)
663{
664	struct iovec *iov;
665	ulong_t cnt, incr;
666	caddr_t p;
667
668	if (uio->uio_segflg != UIO_USERSPACE)
669		return;
670
671	iov = uio->uio_iov;
672
673	while (n) {
674		cnt = MIN(iov->iov_len, n);
675		if (cnt == 0) {
676			/* empty iov entry */
677			iov++;
678			continue;
679		}
680		n -= cnt;
681		/*
682		 * touch each page in this segment.
683		 */
684		p = iov->iov_base;
685		while (cnt) {
686			if (fubyte(p) == -1)
687				return;
688			incr = MIN(cnt, PAGESIZE);
689			p += incr;
690			cnt -= incr;
691		}
692		/*
693		 * touch the last byte in case it straddles a page.
694		 */
695		p--;
696		if (fubyte(p) == -1)
697			return;
698		iov++;
699	}
700}
701
702/*
703 * Write the bytes to a file.
704 *
705 *	IN:	vp	- vnode of file to be written to.
706 *		uio	- structure supplying write location, range info,
707 *			  and data buffer.
708 *		ioflag	- IO_APPEND flag set if in append mode.
709 *		cr	- credentials of caller.
710 *		ct	- caller context (NFS/CIFS fem monitor only)
711 *
712 *	OUT:	uio	- updated offset and range.
713 *
714 *	RETURN:	0 if success
715 *		error code if failure
716 *
717 * Timestamps:
718 *	vp - ctime|mtime updated if byte count > 0
719 */
/* ARGSUSED */
static int
zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	rlim64_t	limit = MAXOFFSET_T;
	ssize_t		start_resid = uio->uio_resid;
	ssize_t		tx_bytes;
	uint64_t	end_size;
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	zilog_t		*zilog;
	offset_t	woff;
	ssize_t		n, nbytes;
	rl_t		*rl;
	int		max_blksz = zfsvfs->z_max_blksz;
	uint64_t	pflags;
	int		error;
	arc_buf_t	*abuf;

	/*
	 * Fasttrack empty write
	 */
	n = start_resid;
	if (n == 0)
		return (0);

	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
		limit = MAXOFFSET_T;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/*
	 * If immutable or not appending then return EPERM
	 */
	pflags = zp->z_phys->zp_flags;
	if ((pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
	    ((pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
	    (uio->uio_loffset < zp->z_phys->zp_size))) {
		ZFS_EXIT(zfsvfs);
		return (EPERM);
	}

	zilog = zfsvfs->z_log;

	/*
	 * Pre-fault the pages to ensure slow (eg NFS) pages
	 * don't hold up txg.
	 */
	zfs_prefault_write(n, uio);

	/*
	 * If in append mode, set the io offset pointer to eof.
	 */
	if (ioflag & IO_APPEND) {
		/*
		 * Range lock for a file append:
		 * The value for the start of range will be determined by
		 * zfs_range_lock() (to guarantee append semantics).
		 * If this write will cause the block size to increase,
		 * zfs_range_lock() will lock the entire file, so we must
		 * later reduce the range after we grow the block size.
		 */
		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
		if (rl->r_len == UINT64_MAX) {
			/* overlocked, zp_size can't change */
			woff = uio->uio_loffset = zp->z_phys->zp_size;
		} else {
			woff = uio->uio_loffset = rl->r_off;
		}
	} else {
		woff = uio->uio_loffset;
		/*
		 * Validate file offset
		 */
		if (woff < 0) {
			ZFS_EXIT(zfsvfs);
			return (EINVAL);
		}

		/*
		 * If we need to grow the block size then zfs_range_lock()
		 * will lock a wider range than we request here.
		 * Later after growing the block size we reduce the range.
		 */
		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
	}

	/* Enforce the offset limit; trim the request to fit under it. */
	if (woff >= limit) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (EFBIG);
	}

	if ((woff + n) > limit || woff > (limit - n))
		n = limit - woff;

	/*
	 * Check for mandatory locks
	 */
	if (MANDMODE((mode_t)zp->z_phys->zp_mode) &&
	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	end_size = MAX(zp->z_phys->zp_size, woff + n);

	/*
	 * Write the file in reasonable size chunks.  Each chunk is written
	 * in a separate transaction; this keeps the intent log records small
	 * and allows us to do more fine-grained space accounting.
	 */
	while (n > 0) {
		abuf = NULL;
		woff = uio->uio_loffset;

again:
		if (zfs_usergroup_overquota(zfsvfs,
		    B_FALSE, zp->z_phys->zp_uid) ||
		    zfs_usergroup_overquota(zfsvfs,
		    B_TRUE, zp->z_phys->zp_gid)) {
			if (abuf != NULL)
				dmu_return_arcbuf(abuf);
			error = EDQUOT;
			break;
		}

		/*
		 * If dmu_assign_arcbuf() is expected to execute with minimum
		 * overhead loan an arc buffer and copy user data to it before
		 * we enter a txg.  This avoids holding a txg forever while we
		 * pagefault on a hanging NFS server mapping.
		 */
		if (abuf == NULL && n >= max_blksz &&
		    woff >= zp->z_phys->zp_size &&
		    P2PHASE(woff, max_blksz) == 0 &&
		    zp->z_blksz == max_blksz) {
			size_t cbytes;

			abuf = dmu_request_arcbuf(zp->z_dbuf, max_blksz);
			ASSERT(abuf != NULL);
			ASSERT(arc_buf_size(abuf) == max_blksz);
			if (error = uiocopy(abuf->b_data, max_blksz,
			    UIO_WRITE, uio, &cbytes)) {
				dmu_return_arcbuf(abuf);
				break;
			}
			ASSERT(cbytes == max_blksz);
		}

		/*
		 * Start a transaction.
		 */
		tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_bonus(tx, zp->z_id);
		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
		error = dmu_tx_assign(tx, TXG_NOWAIT);
		if (error) {
			/* Per Big Rule (4): wait outside the tx, then retry. */
			if (error == ERESTART) {
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				goto again;
			}
			dmu_tx_abort(tx);
			if (abuf != NULL)
				dmu_return_arcbuf(abuf);
			break;
		}

		/*
		 * If zfs_range_lock() over-locked we grow the blocksize
		 * and then reduce the lock range.  This will only happen
		 * on the first iteration since zfs_range_reduce() will
		 * shrink down r_len to the appropriate size.
		 */
		if (rl->r_len == UINT64_MAX) {
			uint64_t new_blksz;

			if (zp->z_blksz > max_blksz) {
				ASSERT(!ISP2(zp->z_blksz));
				new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
			} else {
				new_blksz = MIN(end_size, max_blksz);
			}
			zfs_grow_blocksize(zp, new_blksz, tx);
			zfs_range_reduce(rl, woff, n);
		}

		/*
		 * XXX - should we really limit each write to z_max_blksz?
		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
		 */
		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));

		/* Tell the VM layer about the new file size before copying. */
		if (woff + nbytes > zp->z_phys->zp_size)
			vnode_pager_setsize(vp, woff + nbytes);

		if (abuf == NULL) {
			/* Copy straight from the uio into the DMU. */
			tx_bytes = uio->uio_resid;
			error = dmu_write_uio(zfsvfs->z_os, zp->z_id, uio,
			    nbytes, tx);
			tx_bytes -= uio->uio_resid;
		} else {
			/* Hand the pre-filled loaned arc buffer to the DMU. */
			tx_bytes = nbytes;
			ASSERT(tx_bytes == max_blksz);
			dmu_assign_arcbuf(zp->z_dbuf, woff, abuf, tx);
			ASSERT(tx_bytes <= uio->uio_resid);
			uioskip(uio, tx_bytes);
		}

		/* Keep any resident mmapped pages coherent with the DMU. */
		if (tx_bytes && vn_has_cached_data(vp)) {
			update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
			    zp->z_id, uio->uio_segflg, tx);
		}

		/*
		 * If we made no progress, we're done.  If we made even
		 * partial progress, update the znode and ZIL accordingly.
		 */
		if (tx_bytes == 0) {
			dmu_tx_commit(tx);
			ASSERT(error != 0);
			break;
		}

		/*
		 * Clear Set-UID/Set-GID bits on successful write if not
		 * privileged and at least one of the execute bits is set.
		 *
		 * It would be nice to do this after all writes have
		 * been done, but that would still expose the ISUID/ISGID
		 * to another app after the partial write is committed.
		 *
		 * Note: we don't call zfs_fuid_map_id() here because
		 * user 0 is not an ephemeral uid.
		 */
		mutex_enter(&zp->z_acl_lock);
		if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) |
		    (S_IXUSR >> 6))) != 0 &&
		    (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 &&
		    secpolicy_vnode_setid_retain(vp, cr,
		    (zp->z_phys->zp_mode & S_ISUID) != 0 &&
		    zp->z_phys->zp_uid == 0) != 0) {
			zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID);
		}
		mutex_exit(&zp->z_acl_lock);

		/*
		 * Update time stamp.  NOTE: This marks the bonus buffer as
		 * dirty, so we don't have to do it again for zp_size.
		 */
		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);

		/*
		 * Update the file size (zp_size) if it has changed;
		 * account for possible concurrent updates.
		 */
		while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset)
			(void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
			    uio->uio_loffset);
		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
		dmu_tx_commit(tx);

		if (error != 0)
			break;
		ASSERT(tx_bytes == nbytes);
		n -= nbytes;
	}

	zfs_range_unlock(rl);

	/*
	 * If we're in replay mode, or we made no progress, return error.
	 * Otherwise, it's at least a partial write, so it's successful.
	 */
	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/* Honor synchronous-write semantics before returning. */
	if (ioflag & (FSYNC | FDSYNC))
		zil_commit(zilog, zp->z_last_itx, zp->z_id);

	ZFS_EXIT(zfsvfs);
	return (0);
}
1008
/*
 * dmu_sync() completion callback for the indirect-write path of
 * zfs_get_data(): release the dbuf and the range lock, drop the vnode
 * reference, record the synced block in the ZIL, and free the zgd.
 */
void
zfs_get_done(dmu_buf_t *db, void *vzgd)
{
	zgd_t *zgd = (zgd_t *)vzgd;
	rl_t *rl = zgd->zgd_rl;
	vnode_t *vp = ZTOV(rl->r_zp);
	objset_t *os = rl->r_zp->z_zfsvfs->z_os;
	int vfslocked;

	vfslocked = VFS_LOCK_GIANT(vp->v_vfsp);
	dmu_buf_rele(db, vzgd);
	zfs_range_unlock(rl);
	/*
	 * Release the vnode asynchronously as we currently have the
	 * txg stopped from syncing.
	 */
	VN_RELE_ASYNC(vp, dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
	zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
	kmem_free(zgd, sizeof (zgd_t));
	VFS_UNLOCK_GIANT(vfslocked);
}
1030
1031/*
1032 * Get data to generate a TX_WRITE intent log record.
1033 */
/*
 * Fetch the data for a TX_WRITE intent log record.  When "buf" is
 * non-NULL (immediate write) the data is copied into it directly; when
 * "buf" is NULL (indirect write) the block is synced via dmu_sync() and
 * the log record carries a pointer to it instead.  Returns 0 on
 * success, ENOENT if the file was removed or truncated past the range,
 * or another errno value from the DMU.
 */
int
zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
	zfsvfs_t *zfsvfs = arg;
	objset_t *os = zfsvfs->z_os;
	znode_t *zp;
	uint64_t off = lr->lr_offset;
	dmu_buf_t *db;
	rl_t *rl;
	zgd_t *zgd;
	int dlen = lr->lr_length;		/* length of user data */
	int error = 0;

	ASSERT(zio);
	ASSERT(dlen != 0);

	/*
	 * Nothing to do if the file has been removed
	 */
	if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
		return (ENOENT);
	if (zp->z_unlinked) {
		/*
		 * Release the vnode asynchronously as we currently have the
		 * txg stopped from syncing.
		 */
		VN_RELE_ASYNC(ZTOV(zp),
		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
		return (ENOENT);
	}

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) { /* immediate write */
		rl = zfs_range_lock(zp, off, dlen, RL_READER);
		/* test for truncation needs to be done while range locked */
		if (off >= zp->z_phys->zp_size) {
			error = ENOENT;
			goto out;
		}
		VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf,
		    DMU_READ_NO_PREFETCH));
	} else { /* indirect write */
		uint64_t boff; /* block starting offset */

		/*
		 * Have to lock the whole block to ensure when it's
		 * written out and it's checksum is being calculated
		 * that no one can change the data. We need to re-check
		 * blocksize after we get the lock in case it's changed!
		 */
		for (;;) {
			/* Non-power-of-2 blocksize means a single block. */
			if (ISP2(zp->z_blksz)) {
				boff = P2ALIGN_TYPED(off, zp->z_blksz,
				    uint64_t);
			} else {
				boff = 0;
			}
			dlen = zp->z_blksz;
			rl = zfs_range_lock(zp, boff, dlen, RL_READER);
			if (zp->z_blksz == dlen)
				break;
			/* Blocksize changed under us; retry with new size. */
			zfs_range_unlock(rl);
		}
		/* test for truncation needs to be done while range locked */
		if (off >= zp->z_phys->zp_size) {
			error = ENOENT;
			goto out;
		}
		zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
		zgd->zgd_rl = rl;
		zgd->zgd_zilog = zfsvfs->z_log;
		zgd->zgd_bp = &lr->lr_blkptr;
		VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db));
		ASSERT(boff == db->db_offset);
		lr->lr_blkoff = off - boff;
		error = dmu_sync(zio, db, &lr->lr_blkptr,
		    lr->lr_common.lrc_txg, zfs_get_done, zgd);
		ASSERT((error && error != EINPROGRESS) ||
		    lr->lr_length <= zp->z_blksz);
		if (error == 0) {
			/*
			 * dmu_sync() can compress a block of zeros to a null
			 * blkptr but the block size still needs to be passed
			 * through to replay.
			 */
			BP_SET_LSIZE(&lr->lr_blkptr, db->db_size);
			zil_add_block(zfsvfs->z_log, &lr->lr_blkptr);
		}

		/*
		 * If we get EINPROGRESS, then we need to wait for a
		 * write IO initiated by dmu_sync() to complete before
		 * we can release this dbuf.  We will finish everything
		 * up in the zfs_get_done() callback.
		 */
		if (error == EINPROGRESS) {
			return (0);
		} else if (error == EALREADY) {
			/* Block already on disk: log it as a TX_WRITE2. */
			lr->lr_common.lrc_txtype = TX_WRITE2;
			error = 0;
		}
		dmu_buf_rele(db, zgd);
		kmem_free(zgd, sizeof (zgd_t));
	}
out:
	zfs_range_unlock(rl);
	/*
	 * Release the vnode asynchronously as we currently have the
	 * txg stopped from syncing.
	 */
	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
	return (error);
}
1153
1154/*ARGSUSED*/
1155static int
1156zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
1157    caller_context_t *ct)
1158{
1159	znode_t *zp = VTOZ(vp);
1160	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1161	int error;
1162
1163	ZFS_ENTER(zfsvfs);
1164	ZFS_VERIFY_ZP(zp);
1165
1166	if (flag & V_ACE_MASK)
1167		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
1168	else
1169		error = zfs_zaccess_rwx(zp, mode, flag, cr);
1170
1171	ZFS_EXIT(zfsvfs);
1172	return (error);
1173}
1174
1175/*
1176 * If vnode is for a device return a specfs vnode instead.
1177 */
1178static int
1179specvp_check(vnode_t **vpp, cred_t *cr)
1180{
1181	int error = 0;
1182
1183	if (IS_DEVVP(*vpp)) {
1184		struct vnode *svp;
1185
1186		svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
1187		VN_RELE(*vpp);
1188		if (svp == NULL)
1189			error = ENOSYS;
1190		*vpp = svp;
1191	}
1192	return (error);
1193}
1194
1195
1196/*
1197 * Lookup an entry in a directory, or an extended attribute directory.
1198 * If it exists, return a held vnode reference for it.
1199 *
1200 *	IN:	dvp	- vnode of directory to search.
1201 *		nm	- name of entry to lookup.
1202 *		pnp	- full pathname to lookup [UNUSED].
1203 *		flags	- LOOKUP_XATTR set if looking for an attribute.
1204 *		rdir	- root directory vnode [UNUSED].
1205 *		cr	- credentials of caller.
1206 *		ct	- caller context
1207 *		direntflags - directory lookup flags
1208 *		realpnp - returned pathname.
1209 *
1210 *	OUT:	vpp	- vnode of located entry, NULL if not found.
1211 *
1212 *	RETURN:	0 if success
1213 *		error code if failure
1214 *
1215 * Timestamps:
1216 *	NA
1217 */
/* ARGSUSED */
static int
zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
    int nameiop, cred_t *cr, kthread_t *td, int flags)
{
	znode_t *zdp = VTOZ(dvp);
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
	int	error = 0;
	/* Solaris-style out-arguments; not used by this FreeBSD port. */
	int *direntflags = NULL;
	void *realpnp = NULL;

	/*
	 * fast path: plain case-sensitive, non-xattr lookups can be
	 * answered from the DNLC without paying for ZFS_ENTER.
	 */
	if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {

		if (dvp->v_type != VDIR) {
			return (ENOTDIR);
		} else if (zdp->z_dbuf == NULL) {
			/* znode no longer has backing storage; give up */
			return (EIO);
		}

		if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
			/* "" and "." both refer to the directory itself */
			error = zfs_fastaccesschk_execute(zdp, cr);
			if (!error) {
				*vpp = dvp;
				VN_HOLD(*vpp);
				return (0);
			}
			return (error);
		} else {
			vnode_t *tvp = dnlc_lookup(dvp, nm);

			if (tvp) {
				error = zfs_fastaccesschk_execute(zdp, cr);
				if (error) {
					VN_RELE(tvp);
					return (error);
				}
				if (tvp == DNLC_NO_VNODE) {
					/* cached negative entry */
					VN_RELE(tvp);
					return (ENOENT);
				} else {
					*vpp = tvp;
					return (specvp_check(vpp, cr));
				}
			}
			/* DNLC miss: fall through to the slow path. */
		}
	}

	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zdp);

	*vpp = NULL;

	if (flags & LOOKUP_XATTR) {
#ifdef TODO
		/*
		 * If the xattr property is off, refuse the lookup request.
		 */
		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
			ZFS_EXIT(zfsvfs);
			return (EINVAL);
		}
#endif

		/*
		 * We don't allow recursive attributes..
		 * Maybe someday we will.
		 */
		if (zdp->z_phys->zp_flags & ZFS_XATTR) {
			ZFS_EXIT(zfsvfs);
			return (EINVAL);
		}

		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Do we have permission to get into attribute directory?
		 */

		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
		    B_FALSE, cr)) {
			VN_RELE(*vpp);
			*vpp = NULL;
		}

		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (dvp->v_type != VDIR) {
		ZFS_EXIT(zfsvfs);
		return (ENOTDIR);
	}

	/*
	 * Check accessibility of directory.
	 */

	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/* Reject names that are not valid UTF-8 when the fs demands it. */
	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (EILSEQ);
	}

	error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
	if (error == 0)
		error = specvp_check(vpp, cr);

	/* Translate errors and add SAVENAME when needed. */
	if (cnp->cn_flags & ISLASTCN) {
		switch (nameiop) {
		case CREATE:
		case RENAME:
			if (error == ENOENT) {
				/*
				 * A missing last component is expected for
				 * create/rename; tell namei to proceed and
				 * keep the name around for the caller.
				 */
				error = EJUSTRETURN;
				cnp->cn_flags |= SAVENAME;
				break;
			}
			/* FALLTHROUGH */
		case DELETE:
			if (error == 0)
				cnp->cn_flags |= SAVENAME;
			break;
		}
	}
	if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) {
		int ltype = 0;

		/*
		 * For ".." drop the child-directory lock before locking
		 * the parent result, then retake it afterwards; locking
		 * up the tree in the other order risks deadlock.
		 */
		if (cnp->cn_flags & ISDOTDOT) {
			ltype = VOP_ISLOCKED(dvp);
			VOP_UNLOCK(dvp, 0);
		}
		ZFS_EXIT(zfsvfs);
		error = vn_lock(*vpp, cnp->cn_lkflags);
		if (cnp->cn_flags & ISDOTDOT)
			vn_lock(dvp, ltype | LK_RETRY);
		if (error != 0) {
			VN_RELE(*vpp);
			*vpp = NULL;
			return (error);
		}
	} else {
		ZFS_EXIT(zfsvfs);
	}

#ifdef FREEBSD_NAMECACHE
	/*
	 * Insert name into cache (as non-existent) if appropriate.
	 */
	if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
		cache_enter(dvp, *vpp, cnp);
	/*
	 * Insert name into cache if appropriate.
	 */
	if (error == 0 && (cnp->cn_flags & MAKEENTRY)) {
		if (!(cnp->cn_flags & ISLASTCN) ||
		    (nameiop != DELETE && nameiop != RENAME)) {
			cache_enter(dvp, *vpp, cnp);
		}
	}
#endif

	return (error);
}
1392
1393/*
1394 * Attempt to create a new entry in a directory.  If the entry
1395 * already exists, truncate the file if permissible, else return
1396 * an error.  Return the vp of the created or trunc'd file.
1397 *
1398 *	IN:	dvp	- vnode of directory to put new file entry in.
1399 *		name	- name of new file entry.
1400 *		vap	- attributes of new file.
1401 *		excl	- flag indicating exclusive or non-exclusive mode.
1402 *		mode	- mode to open file with.
1403 *		cr	- credentials of caller.
1404 *		flag	- large file flag [UNUSED].
1405 *		ct	- caller context
1406 *		vsecp 	- ACL to be set
1407 *
1408 *	OUT:	vpp	- vnode of created or trunc'd entry.
1409 *
1410 *	RETURN:	0 if success
1411 *		error code if failure
1412 *
1413 * Timestamps:
1414 *	dvp - ctime|mtime updated if new entry created
1415 *	 vp - ctime|mtime always, atime if new
1416 */
1417
/* ARGSUSED */
static int
zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
    vnode_t **vpp, cred_t *cr, kthread_t *td)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	objset_t	*os;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	int		error;
	ksid_t		*ksid;
	uid_t		uid;
	gid_t		gid = crgetgid(cr);
	zfs_acl_ids_t	acl_ids;
	boolean_t	fuid_dirtied;
	void		*vsecp = NULL;	/* no ACL passed in on FreeBSD */
	int		flag = 0;	/* no FIGNORECASE/FAPPEND flags here */

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	/*
	 * NOTE(review): uid/gid computed below are not referenced later
	 * in this function (the version check re-derives the ids from
	 * cr directly) — presumably leftover from the Solaris code.
	 */
	ksid = crgetsid(cr, KSID_OWNER);
	if (ksid)
		uid = ksid_getid(ksid);
	else
		uid = crgetuid(cr);
	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || (vap->va_mask & AT_XVATTR) ||
	    IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))))
		return (EINVAL);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	os = zfsvfs->z_os;
	zilog = zfsvfs->z_log;

	/* Reject names that are not valid UTF-8 when the fs demands it. */
	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (EILSEQ);
	}

	if (vap->va_mask & AT_XVATTR) {
		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_type)) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}
top:
	/* Restart point after a dmu_tx_assign() ERESTART. */
	*vpp = NULL;

	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
		vap->va_mode &= ~S_ISVTX;

	if (*name == '\0') {
		/*
		 * Null component name refers to the directory itself.
		 */
		VN_HOLD(dvp);
		zp = dzp;
		dl = NULL;
		error = 0;
	} else {
		/* possible VN_HOLD(zp) */
		int zflg = 0;

		if (flag & FIGNORECASE)
			zflg |= ZCILOOK;

		error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
		    NULL, NULL);
		if (error) {
			if (strcmp(name, "..") == 0)
				error = EISDIR;
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}
	if (zp == NULL) {
		uint64_t txtype;

		/*
		 * Create a new file object and update the directory
		 * to reference it.
		 */
		if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
			goto out;
		}

		/*
		 * We only support the creation of regular files in
		 * extended attribute directories.
		 */
		if ((dzp->z_phys->zp_flags & ZFS_XATTR) &&
		    (vap->va_type != VREG)) {
			error = EINVAL;
			goto out;
		}


		if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp,
		    &acl_ids)) != 0)
			goto out;
		if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
			zfs_acl_ids_free(&acl_ids);
			error = EDQUOT;
			goto out;
		}

		/* Reserve everything the create might touch in one tx. */
		tx = dmu_tx_create(os);
		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
		fuid_dirtied = zfsvfs->z_fuid_dirty;
		if (fuid_dirtied)
			zfs_fuid_txhold(zfsvfs, tx);
		dmu_tx_hold_bonus(tx, dzp->z_id);
		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
		if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
			/* ACL too big for the bonus buffer: spill object */
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
			    0, SPA_MAXBLOCKSIZE);
		}
		error = dmu_tx_assign(tx, TXG_NOWAIT);
		if (error) {
			/* Drop locks, wait for the txg if asked, retry. */
			zfs_acl_ids_free(&acl_ids);
			zfs_dirent_unlock(dl);
			if (error == ERESTART) {
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				goto top;
			}
			dmu_tx_abort(tx);
			ZFS_EXIT(zfsvfs);
			return (error);
		}
		zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids);

		if (fuid_dirtied)
			zfs_fuid_sync(zfsvfs, tx);

		(void) zfs_link_create(dl, zp, tx, ZNEW);

		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
		if (flag & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
		    vsecp, acl_ids.z_fuidp, vap);
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_commit(tx);
	} else {
		int aflags = (flag & FAPPEND) ? V_APPEND : 0;

		/*
		 * A directory entry already exists for this name.
		 */
		/*
		 * Can't truncate an existing file if in exclusive mode.
		 */
		if (excl == EXCL) {
			error = EEXIST;
			goto out;
		}
		/*
		 * Can't open a directory for writing.
		 */
		if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
			error = EISDIR;
			goto out;
		}
		/*
		 * Verify requested access to file.
		 */
		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
			goto out;
		}

		/* Bump the directory sequence number. */
		mutex_enter(&dzp->z_lock);
		dzp->z_seq++;
		mutex_exit(&dzp->z_lock);

		/*
		 * Truncate regular files if requested.
		 */
		if ((ZTOV(zp)->v_type == VREG) &&
		    (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
			/* we can't hold any locks when calling zfs_freesp() */
			zfs_dirent_unlock(dl);
			dl = NULL;
			error = zfs_freesp(zp, 0, 0, mode, TRUE);
			if (error == 0) {
				/*
				 * NOTE(review): `ct` is not declared in this
				 * function's scope; presumably it is absorbed
				 * by the FreeBSD vnevent_create() compat
				 * macro — verify against sys/vnode.h.
				 */
				vnevent_create(ZTOV(zp), ct);
			}
		}
	}
out:
	if (dl)
		zfs_dirent_unlock(dl);

	if (error) {
		if (zp)
			VN_RELE(ZTOV(zp));
	} else {
		/* Hand the held vnode (or its specfs shadow) to the caller. */
		*vpp = ZTOV(zp);
		error = specvp_check(vpp, cr);
	}

	ZFS_EXIT(zfsvfs);
	return (error);
}
1630
1631/*
1632 * Remove an entry from a directory.
1633 *
1634 *	IN:	dvp	- vnode of directory to remove entry from.
1635 *		name	- name of entry to remove.
1636 *		cr	- credentials of caller.
1637 *		ct	- caller context
1638 *		flags	- case flags
1639 *
1640 *	RETURN:	0 if success
1641 *		error code if failure
1642 *
1643 * Timestamps:
1644 *	dvp - ctime|mtime
1645 *	 vp - ctime (if nlink > 0)
1646 */
/*ARGSUSED*/
static int
zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
    int flags)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	znode_t		*xzp = NULL;
	vnode_t		*vp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	uint64_t	acl_obj, xattr_obj;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	boolean_t	may_delete_now, delete_now = FALSE;
	boolean_t	unlinked, toobig = FALSE;
	uint64_t	txtype;
	pathname_t	*realnmp = NULL;
	pathname_t	realnm;
	int		error;
	int		zflg = ZEXISTS;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	if (flags & FIGNORECASE) {
		/* Case-insensitive: get the on-disk name back in realnm. */
		zflg |= ZCILOOK;
		pn_alloc(&realnm);
		realnmp = &realnm;
	}

top:
	/*
	 * Attempt to lock directory; fail if entry doesn't exist.
	 */
	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
	    NULL, realnmp)) {
		if (realnmp)
			pn_free(realnmp);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	vp = ZTOV(zp);

	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
		goto out;
	}

	/*
	 * Need to use rmdir for removing directories.
	 */
	if (vp->v_type == VDIR) {
		error = EPERM;
		goto out;
	}

	vnevent_remove(vp, dvp, name, ct);

	/* Purge the name (real on-disk name if case-insensitive). */
	if (realnmp)
		dnlc_remove(dvp, realnmp->pn_buf);
	else
		dnlc_remove(dvp, name);

	/*
	 * NOTE(review): forcing may_delete_now to FALSE disables the
	 * immediate-delete optimization in this port; the object always
	 * goes through the unlinked set instead (see the "0 &&" below).
	 */
	may_delete_now = FALSE;

	/*
	 * We may delete the znode now, or we may put it in the unlinked set;
	 * it depends on whether we're the last link, and on whether there are
	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
	 * allow for either case.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_bonus(tx, zp->z_id);
	if (may_delete_now) {
		toobig =
		    zp->z_phys->zp_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
		/* if the file is too big, only hold_free a token amount */
		dmu_tx_hold_free(tx, zp->z_id, 0,
		    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
	}

	/* are there any extended attributes? */
	if ((xattr_obj = zp->z_phys->zp_xattr) != 0) {
		/* XXX - do we need this if we are deleting? */
		dmu_tx_hold_bonus(tx, xattr_obj);
	}

	/* are there any additional acls */
	if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 &&
	    may_delete_now)
		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);

	/* charge as an update -- would be nice not to charge at all */
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

	error = dmu_tx_assign(tx, TXG_NOWAIT);
	if (error) {
		/* Drop locks/holds, wait if throttled, and retry. */
		zfs_dirent_unlock(dl);
		VN_RELE(vp);
		if (error == ERESTART) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		if (realnmp)
			pn_free(realnmp);
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Remove the directory entry.
	 */
	error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);

	if (error) {
		dmu_tx_commit(tx);
		goto out;
	}

	/* Deliberately disabled ("0 &&"): delete_now stays FALSE here. */
	if (0 && unlinked) {
		VI_LOCK(vp);
		delete_now = may_delete_now && !toobig &&
		    vp->v_count == 1 && !vn_has_cached_data(vp) &&
		    zp->z_phys->zp_xattr == xattr_obj &&
		    zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj;
		VI_UNLOCK(vp);
	}

	if (delete_now) {
		/* Dead code in this port (see above); kept for parity. */
		if (zp->z_phys->zp_xattr) {
			error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
			ASSERT3U(error, ==, 0);
			ASSERT3U(xzp->z_phys->zp_links, ==, 2);
			dmu_buf_will_dirty(xzp->z_dbuf, tx);
			mutex_enter(&xzp->z_lock);
			xzp->z_unlinked = 1;
			xzp->z_phys->zp_links = 0;
			mutex_exit(&xzp->z_lock);
			zfs_unlinked_add(xzp, tx);
			zp->z_phys->zp_xattr = 0; /* probably unnecessary */
		}
		mutex_enter(&zp->z_lock);
		VI_LOCK(vp);
		vp->v_count--;
		ASSERT3U(vp->v_count, ==, 0);
		VI_UNLOCK(vp);
		mutex_exit(&zp->z_lock);
		zfs_znode_delete(zp, tx);
	} else if (unlinked) {
		/* Last link gone: defer reclaim via the unlinked set. */
		zfs_unlinked_add(zp, tx);
	}

	txtype = TX_REMOVE;
	if (flags & FIGNORECASE)
		txtype |= TX_CI;
	zfs_log_remove(zilog, tx, txtype, dzp, name);

	dmu_tx_commit(tx);
out:
	if (realnmp)
		pn_free(realnmp);

	zfs_dirent_unlock(dl);

	if (!delete_now) {
		VN_RELE(vp);
	} else if (xzp) {
		/* this rele is delayed to prevent nesting transactions */
		VN_RELE(ZTOV(xzp));
	}

	ZFS_EXIT(zfsvfs);
	return (error);
}
1825
1826/*
1827 * Create a new directory and insert it into dvp using the name
1828 * provided.  Return a pointer to the inserted directory.
1829 *
1830 *	IN:	dvp	- vnode of directory to add subdir to.
1831 *		dirname	- name of new directory.
1832 *		vap	- attributes of new directory.
1833 *		cr	- credentials of caller.
1834 *		ct	- caller context
1835 *		vsecp	- ACL to be set
1836 *
1837 *	OUT:	vpp	- vnode of created directory.
1838 *
1839 *	RETURN:	0 if success
1840 *		error code if failure
1841 *
1842 * Timestamps:
1843 *	dvp - ctime|mtime updated
1844 *	 vp - ctime|mtime|atime updated
1845 */
/*ARGSUSED*/
static int
zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
    caller_context_t *ct, int flags, vsecattr_t *vsecp)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	zfs_dirlock_t	*dl;
	uint64_t	txtype;
	dmu_tx_t	*tx;
	int		error;
	int		zf = ZNEW;
	ksid_t		*ksid;
	uid_t		uid;
	gid_t		gid = crgetgid(cr);
	zfs_acl_ids_t	acl_ids;
	boolean_t	fuid_dirtied;

	ASSERT(vap->va_type == VDIR);

	/*
	 * If we have an ephemeral id, ACL, or XVATTR then
	 * make sure file system is at proper version
	 */

	/*
	 * NOTE(review): as in zfs_create(), uid/gid computed here are not
	 * used later in this function — the check below re-derives the
	 * ids from cr.
	 */
	ksid = crgetsid(cr, KSID_OWNER);
	if (ksid)
		uid = ksid_getid(ksid);
	else
		uid = crgetuid(cr);
	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (vsecp || (vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(crgetuid(cr))||
	    IS_EPHEMERAL(crgetgid(cr))))
		return (EINVAL);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	/* Directories may not be created inside an xattr directory. */
	if (dzp->z_phys->zp_flags & ZFS_XATTR) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/* Reject names that are not valid UTF-8 when the fs demands it. */
	if (zfsvfs->z_utf8 && u8_validate(dirname,
	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (EILSEQ);
	}
	if (flags & FIGNORECASE)
		zf |= ZCILOOK;

	if (vap->va_mask & AT_XVATTR)
		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
		    crgetuid(cr), cr, vap->va_type)) != 0) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

	/*
	 * First make sure the new directory doesn't exist.
	 */
top:
	/* Restart point after a dmu_tx_assign() ERESTART. */
	*vpp = NULL;

	/* ZNEW makes this fail with EEXIST if the entry already exists. */
	if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
	    NULL, NULL)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
		zfs_dirent_unlock(dl);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp,
	    &acl_ids)) != 0) {
		zfs_dirent_unlock(dl);
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
		zfs_acl_ids_free(&acl_ids);
		zfs_dirent_unlock(dl);
		ZFS_EXIT(zfsvfs);
		return (EDQUOT);
	}

	/*
	 * Add a new entry to the directory.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
	fuid_dirtied = zfsvfs->z_fuid_dirty;
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE)
		/* ACL too big for the bonus buffer: needs a spill object */
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
		    0, SPA_MAXBLOCKSIZE);
	error = dmu_tx_assign(tx, TXG_NOWAIT);
	if (error) {
		/* Drop locks, wait for the txg if asked, and retry. */
		zfs_acl_ids_free(&acl_ids);
		zfs_dirent_unlock(dl);
		if (error == ERESTART) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Create new node.
	 */
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);
	/*
	 * Now put new name in parent dir.
	 */
	(void) zfs_link_create(dl, zp, tx, ZNEW);

	*vpp = ZTOV(zp);

	txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
	if (flags & FIGNORECASE)
		txtype |= TX_CI;
	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
	    acl_ids.z_fuidp, vap);

	zfs_acl_ids_free(&acl_ids);
	dmu_tx_commit(tx);

	zfs_dirent_unlock(dl);

	ZFS_EXIT(zfsvfs);
	return (0);
}
1991
1992/*
1993 * Remove a directory subdir entry.  If the current working
1994 * directory is the same as the subdir to be removed, the
1995 * remove will fail.
1996 *
1997 *	IN:	dvp	- vnode of directory to remove from.
1998 *		name	- name of directory to be removed.
1999 *		cwd	- vnode of current working directory.
2000 *		cr	- credentials of caller.
2001 *		ct	- caller context
2002 *		flags	- case flags
2003 *
2004 *	RETURN:	0 if success
2005 *		error code if failure
2006 *
2007 * Timestamps:
2008 *	dvp - ctime|mtime updated
2009 */
/*ARGSUSED*/
static int
zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
    caller_context_t *ct, int flags)
{
	znode_t		*dzp = VTOZ(dvp);
	znode_t		*zp;
	vnode_t		*vp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	int		error;
	int		zflg = ZEXISTS;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	if (flags & FIGNORECASE)
		zflg |= ZCILOOK;
top:
	/* Restart point after a dmu_tx_assign() ERESTART. */
	zp = NULL;

	/*
	 * Attempt to lock directory; fail if entry doesn't exist.
	 */
	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
	    NULL, NULL)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	vp = ZTOV(zp);

	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
		goto out;
	}

	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	/* Refuse to remove the caller's current working directory. */
	if (vp == cwd) {
		error = EINVAL;
		goto out;
	}

	vnevent_rmdir(vp, dvp, name, ct);

	/*
	 * Grab a lock on the directory to make sure that no one is
	 * trying to add (or lookup) entries while we are removing it.
	 */
	rw_enter(&zp->z_name_lock, RW_WRITER);

	/*
	 * Grab a lock on the parent pointer to make sure we play well
	 * with the treewalk and directory rename code.
	 */
	rw_enter(&zp->z_parent_lock, RW_WRITER);

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_bonus(tx, zp->z_id);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	error = dmu_tx_assign(tx, TXG_NOWAIT);
	if (error) {
		/* Drop all locks/holds, wait if throttled, and retry. */
		rw_exit(&zp->z_parent_lock);
		rw_exit(&zp->z_name_lock);
		zfs_dirent_unlock(dl);
		VN_RELE(vp);
		if (error == ERESTART) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

#ifdef FREEBSD_NAMECACHE
	cache_purge(dvp);
#endif

	/* zfs_link_destroy() fails with ENOTEMPTY et al. as appropriate. */
	error = zfs_link_destroy(dl, zp, tx, zflg, NULL);

	if (error == 0) {
		uint64_t txtype = TX_RMDIR;
		if (flags & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_remove(zilog, tx, txtype, dzp, name);
	}

	dmu_tx_commit(tx);

	rw_exit(&zp->z_parent_lock);
	rw_exit(&zp->z_name_lock);
#ifdef FREEBSD_NAMECACHE
	cache_purge(vp);
#endif
out:
	zfs_dirent_unlock(dl);

	VN_RELE(vp);

	ZFS_EXIT(zfsvfs);
	return (error);
}
2121
2122/*
2123 * Read as many directory entries as will fit into the provided
2124 * buffer from the given directory cursor position (specified in
 * the uio structure).
2126 *
2127 *	IN:	vp	- vnode of directory to read.
2128 *		uio	- structure supplying read location, range info,
2129 *			  and return buffer.
2130 *		cr	- credentials of caller.
2131 *		ct	- caller context
2132 *		flags	- case flags
2133 *
2134 *	OUT:	uio	- updated offset and range, buffer filled.
2135 *		eofp	- set to true if end-of-file detected.
2136 *
2137 *	RETURN:	0 if success
2138 *		error code if failure
2139 *
2140 * Timestamps:
2141 *	vp - atime updated
2142 *
2143 * Note that the low 4 bits of the cookie returned by zap is always zero.
2144 * This allows us to use the low range for "special" directory entries:
2145 * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
2146 * we use the offset 2 for the '.zfs' directory.
2147 */
2148/* ARGSUSED */
2149static int
2150zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
2151{
2152	znode_t		*zp = VTOZ(vp);
2153	iovec_t		*iovp;
2154	edirent_t	*eodp;
2155	dirent64_t	*odp;
2156	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2157	objset_t	*os;
2158	caddr_t		outbuf;
2159	size_t		bufsize;
2160	zap_cursor_t	zc;
2161	zap_attribute_t	zap;
2162	uint_t		bytes_wanted;
2163	uint64_t	offset; /* must be unsigned; checks for < 1 */
2164	int		local_eof;
2165	int		outcount;
2166	int		error;
2167	uint8_t		prefetch;
2168	boolean_t	check_sysattrs;
2169	uint8_t		type;
2170	int		ncooks;
2171	u_long		*cooks = NULL;
2172	int		flags = 0;
2173
2174	ZFS_ENTER(zfsvfs);
2175	ZFS_VERIFY_ZP(zp);
2176
2177	/*
2178	 * If we are not given an eof variable,
2179	 * use a local one.
2180	 */
2181	if (eofp == NULL)
2182		eofp = &local_eof;
2183
2184	/*
2185	 * Check for valid iov_len.
2186	 */
2187	if (uio->uio_iov->iov_len <= 0) {
2188		ZFS_EXIT(zfsvfs);
2189		return (EINVAL);
2190	}
2191
2192	/*
2193	 * Quit if directory has been removed (posix)
2194	 */
2195	if ((*eofp = zp->z_unlinked) != 0) {
2196		ZFS_EXIT(zfsvfs);
2197		return (0);
2198	}
2199
2200	error = 0;
2201	os = zfsvfs->z_os;
2202	offset = uio->uio_loffset;
2203	prefetch = zp->z_zn_prefetch;
2204
2205	/*
2206	 * Initialize the iterator cursor.
2207	 */
2208	if (offset <= 3) {
2209		/*
2210		 * Start iteration from the beginning of the directory.
2211		 */
2212		zap_cursor_init(&zc, os, zp->z_id);
2213	} else {
2214		/*
2215		 * The offset is a serialized cursor.
2216		 */
2217		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
2218	}
2219
2220	/*
2221	 * Get space to change directory entries into fs independent format.
2222	 */
2223	iovp = uio->uio_iov;
2224	bytes_wanted = iovp->iov_len;
2225	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
2226		bufsize = bytes_wanted;
2227		outbuf = kmem_alloc(bufsize, KM_SLEEP);
2228		odp = (struct dirent64 *)outbuf;
2229	} else {
2230		bufsize = bytes_wanted;
2231		odp = (struct dirent64 *)iovp->iov_base;
2232	}
2233	eodp = (struct edirent *)odp;
2234
2235	if (ncookies != NULL) {
2236		/*
2237		 * Minimum entry size is dirent size and 1 byte for a file name.
2238		 */
2239		ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
2240		cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
2241		*cookies = cooks;
2242		*ncookies = ncooks;
2243	}
2244	/*
2245	 * If this VFS supports the system attribute view interface; and
2246	 * we're looking at an extended attribute directory; and we care
2247	 * about normalization conflicts on this vfs; then we must check
2248	 * for normalization conflicts with the sysattr name space.
2249	 */
2250#ifdef TODO
2251	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
2252	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
2253	    (flags & V_RDDIR_ENTFLAGS);
2254#else
2255	check_sysattrs = 0;
2256#endif
2257
2258	/*
2259	 * Transform to file-system independent format
2260	 */
2261	outcount = 0;
2262	while (outcount < bytes_wanted) {
2263		ino64_t objnum;
2264		ushort_t reclen;
2265		off64_t *next;
2266
2267		/*
2268		 * Special case `.', `..', and `.zfs'.
2269		 */
2270		if (offset == 0) {
2271			(void) strcpy(zap.za_name, ".");
2272			zap.za_normalization_conflict = 0;
2273			objnum = zp->z_id;
2274			type = DT_DIR;
2275		} else if (offset == 1) {
2276			(void) strcpy(zap.za_name, "..");
2277			zap.za_normalization_conflict = 0;
2278			objnum = zp->z_phys->zp_parent;
2279			type = DT_DIR;
2280		} else if (offset == 2 && zfs_show_ctldir(zp)) {
2281			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
2282			zap.za_normalization_conflict = 0;
2283			objnum = ZFSCTL_INO_ROOT;
2284			type = DT_DIR;
2285		} else {
2286			/*
2287			 * Grab next entry.
2288			 */
2289			if (error = zap_cursor_retrieve(&zc, &zap)) {
2290				if ((*eofp = (error == ENOENT)) != 0)
2291					break;
2292				else
2293					goto update;
2294			}
2295
2296			if (zap.za_integer_length != 8 ||
2297			    zap.za_num_integers != 1) {
2298				cmn_err(CE_WARN, "zap_readdir: bad directory "
2299				    "entry, obj = %lld, offset = %lld\n",
2300				    (u_longlong_t)zp->z_id,
2301				    (u_longlong_t)offset);
2302				error = ENXIO;
2303				goto update;
2304			}
2305
2306			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
2307			/*
2308			 * MacOS X can extract the object type here such as:
2309			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2310			 */
2311			type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2312
2313			if (check_sysattrs && !zap.za_normalization_conflict) {
2314#ifdef TODO
2315				zap.za_normalization_conflict =
2316				    xattr_sysattr_casechk(zap.za_name);
2317#else
2318				panic("%s:%u: TODO", __func__, __LINE__);
2319#endif
2320			}
2321		}
2322
2323		if (flags & V_RDDIR_ACCFILTER) {
2324			/*
2325			 * If we have no access at all, don't include
2326			 * this entry in the returned information
2327			 */
2328			znode_t	*ezp;
2329			if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
2330				goto skip_entry;
2331			if (!zfs_has_access(ezp, cr)) {
2332				VN_RELE(ZTOV(ezp));
2333				goto skip_entry;
2334			}
2335			VN_RELE(ZTOV(ezp));
2336		}
2337
2338		if (flags & V_RDDIR_ENTFLAGS)
2339			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
2340		else
2341			reclen = DIRENT64_RECLEN(strlen(zap.za_name));
2342
2343		/*
2344		 * Will this entry fit in the buffer?
2345		 */
2346		if (outcount + reclen > bufsize) {
2347			/*
2348			 * Did we manage to fit anything in the buffer?
2349			 */
2350			if (!outcount) {
2351				error = EINVAL;
2352				goto update;
2353			}
2354			break;
2355		}
2356		if (flags & V_RDDIR_ENTFLAGS) {
2357			/*
2358			 * Add extended flag entry:
2359			 */
2360			eodp->ed_ino = objnum;
2361			eodp->ed_reclen = reclen;
2362			/* NOTE: ed_off is the offset for the *next* entry */
2363			next = &(eodp->ed_off);
2364			eodp->ed_eflags = zap.za_normalization_conflict ?
2365			    ED_CASE_CONFLICT : 0;
2366			(void) strncpy(eodp->ed_name, zap.za_name,
2367			    EDIRENT_NAMELEN(reclen));
2368			eodp = (edirent_t *)((intptr_t)eodp + reclen);
2369		} else {
2370			/*
2371			 * Add normal entry:
2372			 */
2373			odp->d_ino = objnum;
2374			odp->d_reclen = reclen;
2375			odp->d_namlen = strlen(zap.za_name);
2376			(void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
2377			odp->d_type = type;
2378			odp = (dirent64_t *)((intptr_t)odp + reclen);
2379		}
2380		outcount += reclen;
2381
2382		ASSERT(outcount <= bufsize);
2383
2384		/* Prefetch znode */
2385		if (prefetch)
2386			dmu_prefetch(os, objnum, 0, 0);
2387
2388	skip_entry:
2389		/*
2390		 * Move to the next entry, fill in the previous offset.
2391		 */
2392		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2393			zap_cursor_advance(&zc);
2394			offset = zap_cursor_serialize(&zc);
2395		} else {
2396			offset += 1;
2397		}
2398
2399		if (cooks != NULL) {
2400			*cooks++ = offset;
2401			ncooks--;
2402			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
2403		}
2404	}
2405	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2406
2407	/* Subtract unused cookies */
2408	if (ncookies != NULL)
2409		*ncookies -= ncooks;
2410
2411	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
2412		iovp->iov_base += outcount;
2413		iovp->iov_len -= outcount;
2414		uio->uio_resid -= outcount;
2415	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
2416		/*
2417		 * Reset the pointer.
2418		 */
2419		offset = uio->uio_loffset;
2420	}
2421
2422update:
2423	zap_cursor_fini(&zc);
2424	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
2425		kmem_free(outbuf, bufsize);
2426
2427	if (error == ENOENT)
2428		error = 0;
2429
2430	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2431
2432	uio->uio_loffset = offset;
2433	ZFS_EXIT(zfsvfs);
2434	if (error != 0 && cookies != NULL) {
2435		free(*cookies, M_TEMP);
2436		*cookies = NULL;
2437		*ncookies = 0;
2438	}
2439	return (error);
2440}
2441
/*
 * Tunable stored into the per-thread zfs_fsyncer_key TSD slot on every
 * fsync; presumably read elsewhere as a write-behind/sync hint for the
 * ZIL -- confirm against the tsd_get(zfs_fsyncer_key) consumers.
 */
ulong_t zfs_fsync_sync_cnt = 4;
2443
/*
 * Flush a file's outstanding transactions to stable storage.
 *
 *	IN:	vp	- vnode of file to sync.
 *		syncflag- unused here; all syncs force a zil_commit.
 *		cr	- credentials of caller (unused).
 *		ct	- caller context (unused).
 *
 *	RETURN:	0 (always succeeds; zil_commit blocks until done)
 */
static int
zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	/*
	 * Record the fsync hint in thread-specific data before entering
	 * the filesystem (done outside ZFS_ENTER on purpose).
	 */
	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	/* Commit this file's intent-log records up to its last itx. */
	zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
	ZFS_EXIT(zfsvfs);
	return (0);
}
2458
2459
2460/*
2461 * Get the requested file attributes and place them in the provided
2462 * vattr structure.
2463 *
2464 *	IN:	vp	- vnode of file.
2465 *		vap	- va_mask identifies requested attributes.
2466 *			  If AT_XVATTR set, then optional attrs are requested
2467 *		flags	- ATTR_NOACLCHECK (CIFS server context)
2468 *		cr	- credentials of caller.
2469 *		ct	- caller context
2470 *
2471 *	OUT:	vap	- attribute values.
2472 *
2473 *	RETURN:	0 (always succeeds)
2474 */
2475/* ARGSUSED */
static int
zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	znode_phys_t *pzp;
	int	error = 0;
	uint32_t blksize;
	u_longlong_t nblocks;
	uint64_t links;
	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
	xoptattr_t *xoap = NULL;
	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	pzp = zp->z_phys;

	/*
	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
	 * Also, if we are the owner don't bother, since owner should
	 * always be allowed to read basic attributes of file.
	 */
	if (!(pzp->zp_flags & ZFS_ACL_TRIVIAL) &&
	    (pzp->zp_uid != crgetuid(cr))) {
		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
		    skipaclchk, cr)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * Return all attributes.  It's cheaper to provide the answer
	 * than to determine whether we were asked the question.
	 */

	/* z_lock keeps the phys attributes stable while we copy them out. */
	mutex_enter(&zp->z_lock);
	vap->va_type = IFTOVT(pzp->zp_mode);
	vap->va_mode = pzp->zp_mode & ~S_IFMT;
	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
//	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
	vap->va_nodeid = zp->z_id;
	/* Count the hidden .zfs control directory as a link on the root. */
	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
		links = pzp->zp_links + 1;
	else
		links = pzp->zp_links;
	vap->va_nlink = MIN(links, UINT32_MAX);	/* nlink_t limit! */
	vap->va_size = pzp->zp_size;
	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
	vap->va_rdev = zfs_cmpldev(pzp->zp_rdev);
	vap->va_seq = zp->z_seq;
	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */

	/*
	 * Add in any requested optional attributes and the create time.
	 * Also set the corresponding bits in the returned attribute bitmap.
	 * Optional attrs are only available on FUID-capable filesystems.
	 */
	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
			xoap->xoa_archive =
			    ((pzp->zp_flags & ZFS_ARCHIVE) != 0);
			XVA_SET_RTN(xvap, XAT_ARCHIVE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
			xoap->xoa_readonly =
			    ((pzp->zp_flags & ZFS_READONLY) != 0);
			XVA_SET_RTN(xvap, XAT_READONLY);
		}

		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
			xoap->xoa_system =
			    ((pzp->zp_flags & ZFS_SYSTEM) != 0);
			XVA_SET_RTN(xvap, XAT_SYSTEM);
		}

		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
			xoap->xoa_hidden =
			    ((pzp->zp_flags & ZFS_HIDDEN) != 0);
			XVA_SET_RTN(xvap, XAT_HIDDEN);
		}

		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
			xoap->xoa_nounlink =
			    ((pzp->zp_flags & ZFS_NOUNLINK) != 0);
			XVA_SET_RTN(xvap, XAT_NOUNLINK);
		}

		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
			xoap->xoa_immutable =
			    ((pzp->zp_flags & ZFS_IMMUTABLE) != 0);
			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
			xoap->xoa_appendonly =
			    ((pzp->zp_flags & ZFS_APPENDONLY) != 0);
			XVA_SET_RTN(xvap, XAT_APPENDONLY);
		}

		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
			xoap->xoa_nodump =
			    ((pzp->zp_flags & ZFS_NODUMP) != 0);
			XVA_SET_RTN(xvap, XAT_NODUMP);
		}

		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
			xoap->xoa_opaque =
			    ((pzp->zp_flags & ZFS_OPAQUE) != 0);
			XVA_SET_RTN(xvap, XAT_OPAQUE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
			xoap->xoa_av_quarantined =
			    ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0);
			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
			xoap->xoa_av_modified =
			    ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0);
			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
		    vp->v_type == VREG &&
		    (pzp->zp_flags & ZFS_BONUS_SCANSTAMP)) {
			size_t len;
			dmu_object_info_t doi;

			/*
			 * Only VREG files have anti-virus scanstamps, so we
			 * won't conflict with symlinks in the bonus buffer.
			 */
			dmu_object_info_from_db(zp->z_dbuf, &doi);
			len = sizeof (xoap->xoa_av_scanstamp) +
			    sizeof (znode_phys_t);
			/* Copy only if the bonus buffer really holds a stamp. */
			if (len <= doi.doi_bonus_size) {
				/*
				 * pzp points to the start of the
				 * znode_phys_t. pzp + 1 points to the
				 * first byte after the znode_phys_t.
				 */
				(void) memcpy(xoap->xoa_av_scanstamp,
				    pzp + 1,
				    sizeof (xoap->xoa_av_scanstamp));
				XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
			}
		}

		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
			ZFS_TIME_DECODE(&xoap->xoa_createtime, pzp->zp_crtime);
			XVA_SET_RTN(xvap, XAT_CREATETIME);
		}
	}

	ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime);
	ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime);
	ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime);
	ZFS_TIME_DECODE(&vap->va_birthtime, pzp->zp_crtime);

	mutex_exit(&zp->z_lock);

	dmu_object_size_from_db(zp->z_dbuf, &blksize, &nblocks);
	vap->va_blksize = blksize;
	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */

	if (zp->z_blksz == 0) {
		/*
		 * Block size hasn't been set; suggest maximal I/O transfers.
		 */
		vap->va_blksize = zfsvfs->z_max_blksz;
	}

	ZFS_EXIT(zfsvfs);
	return (0);
}
2655
2656/*
2657 * Set the file attributes to the values contained in the
2658 * vattr structure.
2659 *
2660 *	IN:	vp	- vnode of file to be modified.
2661 *		vap	- new attribute values.
2662 *			  If AT_XVATTR set, then optional attrs are being set
2663 *		flags	- ATTR_UTIME set if non-default time values provided.
2664 *			- ATTR_NOACLCHECK (CIFS context only).
2665 *		cr	- credentials of caller.
2666 *		ct	- caller context
2667 *
2668 *	RETURN:	0 if success
2669 *		error code if failure
2670 *
2671 * Timestamps:
2672 *	vp - ctime updated, mtime updated if size changed.
2673 */
2674/* ARGSUSED */
static int
zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
	caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	znode_phys_t	*pzp;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	zilog_t		*zilog;
	dmu_tx_t	*tx;
	vattr_t		oldva;
	xvattr_t	tmpxvattr;
	uint_t		mask = vap->va_mask;
	uint_t		saved_mask;
	uint64_t	saved_mode;
	int		trim_mask = 0;
	uint64_t	new_mode;
	uint64_t	new_uid, new_gid;
	znode_t		*attrzp;
	int		need_policy = FALSE;
	int		err;
	zfs_fuid_info_t *fuidp = NULL;
	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
	xoptattr_t	*xoap;
	zfs_acl_t	*aclp = NULL;
	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
	boolean_t fuid_dirtied = B_FALSE;

	/* Nothing requested, nothing to do. */
	if (mask == 0)
		return (0);

	if (mask & AT_NOSET)
		return (EINVAL);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	pzp = zp->z_phys;
	zilog = zfsvfs->z_log;

	/*
	 * Make sure that if we have ephemeral uid/gid or xvattr specified
	 * that file system is at proper version level
	 */

	if (zfsvfs->z_use_fuids == B_FALSE &&
	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
	    (mask & AT_XVATTR))) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	if (mask & AT_SIZE && vp->v_type == VDIR) {
		ZFS_EXIT(zfsvfs);
		return (EISDIR);
	}

	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/*
	 * If this is an xvattr_t, then get a pointer to the structure of
	 * optional attributes.  If this is NULL, then we have a vattr_t.
	 */
	xoap = xva_getxoptattr(xvap);

	xva_init(&tmpxvattr);

	/*
	 * Immutable files can only alter immutable bit and atime
	 */
	if ((pzp->zp_flags & ZFS_IMMUTABLE) &&
	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
		ZFS_EXIT(zfsvfs);
		return (EPERM);
	}

	if ((mask & AT_SIZE) && (pzp->zp_flags & ZFS_READONLY)) {
		ZFS_EXIT(zfsvfs);
		return (EPERM);
	}

	/*
	 * Verify timestamps doesn't overflow 32 bits.
	 * ZFS can handle large timestamps, but 32bit syscalls can't
	 * handle times greater than 2039.  This check should be removed
	 * once large timestamps are fully supported.
	 */
	if (mask & (AT_ATIME | AT_MTIME)) {
		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
			ZFS_EXIT(zfsvfs);
			return (EOVERFLOW);
		}
	}

/* Restart point: re-entered (via "goto top" below) after an ERESTART
 * from dmu_tx_assign(), once dmu_tx_wait() has let the txg quiesce. */
top:
	attrzp = NULL;

	/* Can this be moved to before the top label? */
	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
		ZFS_EXIT(zfsvfs);
		return (EROFS);
	}

	/*
	 * First validate permissions
	 */

	if (mask & AT_SIZE) {
		err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
		if (err) {
			ZFS_EXIT(zfsvfs);
			return (err);
		}
		/*
		 * XXX - Note, we are not providing any open
		 * mode flags here (like FNDELAY), so we may
		 * block if there are locks present... this
		 * should be addressed in openat().
		 */
		/* XXX - would it be OK to generate a log record here? */
		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
		if (err) {
			ZFS_EXIT(zfsvfs);
			return (err);
		}
	}

	if (mask & (AT_ATIME|AT_MTIME) ||
	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
	    XVA_ISSET_REQ(xvap, XAT_SYSTEM))))
		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
		    skipaclchk, cr);

	if (mask & (AT_UID|AT_GID)) {
		int	idmask = (mask & (AT_UID|AT_GID));
		int	take_owner;
		int	take_group;

		/*
		 * NOTE: even if a new mode is being set,
		 * we may clear S_ISUID/S_ISGID bits.
		 */

		if (!(mask & AT_MODE))
			vap->va_mode = pzp->zp_mode;

		/*
		 * Take ownership or chgrp to group we are a member of
		 */

		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
		take_group = (mask & AT_GID) &&
		    zfs_groupmember(zfsvfs, vap->va_gid, cr);

		/*
		 * If both AT_UID and AT_GID are set then take_owner and
		 * take_group must both be set in order to allow taking
		 * ownership.
		 *
		 * Otherwise, send the check through secpolicy_vnode_setattr()
		 *
		 */

		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
		    ((idmask == AT_UID) && take_owner) ||
		    ((idmask == AT_GID) && take_group)) {
			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
			    skipaclchk, cr) == 0) {
				/*
				 * Remove setuid/setgid for non-privileged users
				 */
				secpolicy_setid_clear(vap, vp, cr);
				trim_mask = (mask & (AT_UID|AT_GID));
			} else {
				need_policy =  TRUE;
			}
		} else {
			need_policy =  TRUE;
		}
	}

	mutex_enter(&zp->z_lock);
	oldva.va_mode = pzp->zp_mode;
	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
	if (mask & AT_XVATTR) {
		/*
		 * Update xvattr mask to include only those attributes
		 * that are actually changing.
		 *
		 * the bits will be restored prior to actually setting
		 * the attributes so the caller thinks they were set.
		 */
		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
			if (xoap->xoa_appendonly !=
			    ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) {
				need_policy = TRUE;
			} else {
				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
				XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
			}
		}

		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
			if (xoap->xoa_nounlink !=
			    ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) {
				need_policy = TRUE;
			} else {
				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
				XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
			}
		}

		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
			if (xoap->xoa_immutable !=
			    ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) {
				need_policy = TRUE;
			} else {
				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
				XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
			}
		}

		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
			if (xoap->xoa_nodump !=
			    ((pzp->zp_flags & ZFS_NODUMP) != 0)) {
				need_policy = TRUE;
			} else {
				XVA_CLR_REQ(xvap, XAT_NODUMP);
				XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
			}
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
			if (xoap->xoa_av_modified !=
			    ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) {
				need_policy = TRUE;
			} else {
				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
				XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
			}
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
			if ((vp->v_type != VREG &&
			    xoap->xoa_av_quarantined) ||
			    xoap->xoa_av_quarantined !=
			    ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)) {
				need_policy = TRUE;
			} else {
				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
				XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
			}
		}

		if (need_policy == FALSE &&
		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
		    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
			need_policy = TRUE;
		}
	}

	mutex_exit(&zp->z_lock);

	if (mask & AT_MODE) {
		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
			err = secpolicy_setid_setsticky_clear(vp, vap,
			    &oldva, cr);
			if (err) {
				ZFS_EXIT(zfsvfs);
				return (err);
			}
			trim_mask |= AT_MODE;
		} else {
			need_policy = TRUE;
		}
	}

	if (need_policy) {
		/*
		 * If trim_mask is set then take ownership
		 * has been granted or write_acl is present and user
		 * has the ability to modify mode.  In that case remove
		 * UID|GID and or MODE from mask so that
		 * secpolicy_vnode_setattr() doesn't revoke it.
		 */

		if (trim_mask) {
			saved_mask = vap->va_mask;
			vap->va_mask &= ~trim_mask;
			if (trim_mask & AT_MODE) {
				/*
				 * Save the mode, as secpolicy_vnode_setattr()
				 * will overwrite it with ova.va_mode.
				 */
				saved_mode = vap->va_mode;
			}
		}
		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
		if (err) {
			ZFS_EXIT(zfsvfs);
			return (err);
		}

		if (trim_mask) {
			vap->va_mask |= saved_mask;
			if (trim_mask & AT_MODE) {
				/*
				 * Recover the mode after
				 * secpolicy_vnode_setattr().
				 */
				vap->va_mode = saved_mode;
			}
		}
	}

	/*
	 * secpolicy_vnode_setattr, or take ownership may have
	 * changed va_mask
	 */
	mask = vap->va_mask;

	/*
	 * Build the transaction, declaring up front every object we
	 * may dirty (bonus buffer, ACL objects, xattr dir, FUID table).
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_bonus(tx, zp->z_id);

	if (mask & AT_MODE) {
		uint64_t pmode = pzp->zp_mode;

		/* Preserve the file-type bits; only permission bits change. */
		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);

		if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
			goto out;
		if (pzp->zp_acl.z_acl_extern_obj) {
			/* Are we upgrading ACL from old V0 format to new V1 */
			if (zfsvfs->z_version <= ZPL_VERSION_FUID &&
			    pzp->zp_acl.z_acl_version ==
			    ZFS_ACL_VERSION_INITIAL) {
				dmu_tx_hold_free(tx,
				    pzp->zp_acl.z_acl_extern_obj, 0,
				    DMU_OBJECT_END);
				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
				    0, aclp->z_acl_bytes);
			} else {
				dmu_tx_hold_write(tx,
				    pzp->zp_acl.z_acl_extern_obj, 0,
				    aclp->z_acl_bytes);
			}
		} else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
			    0, aclp->z_acl_bytes);
		}
	}

	if (mask & (AT_UID | AT_GID)) {
		/* The extended-attribute directory znode changes owner too. */
		if (pzp->zp_xattr) {
			err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp);
			if (err)
				goto out;
			dmu_tx_hold_bonus(tx, attrzp->z_id);
		}
		if (mask & AT_UID) {
			new_uid = zfs_fuid_create(zfsvfs,
			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
			if (new_uid != pzp->zp_uid &&
			    zfs_usergroup_overquota(zfsvfs, B_FALSE, new_uid)) {
				err = EDQUOT;
				goto out;
			}
		}

		if (mask & AT_GID) {
			new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
			    cr, ZFS_GROUP, &fuidp);
			if (new_gid != pzp->zp_gid &&
			    zfs_usergroup_overquota(zfsvfs, B_TRUE, new_gid)) {
				err = EDQUOT;
				goto out;
			}
		}
		fuid_dirtied = zfsvfs->z_fuid_dirty;
		if (fuid_dirtied) {
			if (zfsvfs->z_fuid_obj == 0) {
				dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
				dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
				    FUID_SIZE_ESTIMATE(zfsvfs));
				dmu_tx_hold_zap(tx, MASTER_NODE_OBJ,
				    FALSE, NULL);
			} else {
				dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
				dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
				    FUID_SIZE_ESTIMATE(zfsvfs));
			}
		}
	}

	/*
	 * TXG_NOWAIT: on ERESTART we wait for the txg outside the tx
	 * and retry the whole operation from "top" (see out: below).
	 */
	err = dmu_tx_assign(tx, TXG_NOWAIT);
	if (err) {
		if (err == ERESTART)
			dmu_tx_wait(tx);
		goto out;
	}

	dmu_buf_will_dirty(zp->z_dbuf, tx);

	/*
	 * Set each attribute requested.
	 * We group settings according to the locks they need to acquire.
	 *
	 * Note: you cannot set ctime directly, although it will be
	 * updated as a side-effect of calling this function.
	 */

	mutex_enter(&zp->z_lock);

	if (mask & AT_MODE) {
		mutex_enter(&zp->z_acl_lock);
		zp->z_phys->zp_mode = new_mode;
		err = zfs_aclset_common(zp, aclp, cr, tx);
		ASSERT3U(err, ==, 0);
		/* Cache the new ACL on the znode; ownership transfers here. */
		zp->z_acl_cached = aclp;
		aclp = NULL;
		mutex_exit(&zp->z_acl_lock);
	}

	if (attrzp)
		mutex_enter(&attrzp->z_lock);

	if (mask & AT_UID) {
		pzp->zp_uid = new_uid;
		if (attrzp)
			attrzp->z_phys->zp_uid = new_uid;
	}

	if (mask & AT_GID) {
		pzp->zp_gid = new_gid;
		if (attrzp)
			attrzp->z_phys->zp_gid = new_gid;
	}

	if (attrzp)
		mutex_exit(&attrzp->z_lock);

	if (mask & AT_ATIME)
		ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);

	if (mask & AT_MTIME)
		ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);

	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
	if (mask & AT_SIZE)
		zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx);
	else if (mask != 0)
		zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
	/*
	 * Do this after setting timestamps to prevent timestamp
	 * update from toggling bit
	 */

	if (xoap && (mask & AT_XVATTR)) {

		/*
		 * restore trimmed off masks
		 * so that return masks can be set for caller.
		 */

		if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
			XVA_SET_REQ(xvap, XAT_APPENDONLY);
		}
		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
			XVA_SET_REQ(xvap, XAT_NOUNLINK);
		}
		if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
			XVA_SET_REQ(xvap, XAT_IMMUTABLE);
		}
		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
			XVA_SET_REQ(xvap, XAT_NODUMP);
		}
		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
			XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
		}
		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
			size_t len;
			dmu_object_info_t doi;

			ASSERT(vp->v_type == VREG);

			/* Grow the bonus buffer if necessary. */
			dmu_object_info_from_db(zp->z_dbuf, &doi);
			len = sizeof (xoap->xoa_av_scanstamp) +
			    sizeof (znode_phys_t);
			if (len > doi.doi_bonus_size)
				VERIFY(dmu_set_bonus(zp->z_dbuf, len, tx) == 0);
		}
		zfs_xvattr_set(zp, xvap);
	}

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	if (mask != 0)
		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);

	mutex_exit(&zp->z_lock);

/* Common cleanup for both success and every error path above. */
out:
	if (attrzp)
		VN_RELE(ZTOV(attrzp));

	if (aclp)
		zfs_acl_free(aclp);

	if (fuidp) {
		zfs_fuid_info_free(fuidp);
		fuidp = NULL;
	}

	if (err)
		dmu_tx_abort(tx);
	else
		dmu_tx_commit(tx);

	/* Retry the whole operation after dmu_tx_wait() above. */
	if (err == ERESTART)
		goto top;

	ZFS_EXIT(zfsvfs);
	return (err);
}
3214
/*
 * One entry in the chain of locks taken by zfs_rename_lock(); the
 * entries form a singly-linked stack that zfs_rename_unlock() unwinds.
 */
typedef struct zfs_zlock {
	krwlock_t	*zl_rwlock;	/* lock we acquired */
	znode_t		*zl_znode;	/* znode we held */
	struct zfs_zlock *zl_next;	/* next in list */
} zfs_zlock_t;
3220
3221/*
3222 * Drop locks and release vnodes that were held by zfs_rename_lock().
3223 */
3224static void
3225zfs_rename_unlock(zfs_zlock_t **zlpp)
3226{
3227	zfs_zlock_t *zl;
3228
3229	while ((zl = *zlpp) != NULL) {
3230		if (zl->zl_znode != NULL)
3231			VN_RELE(ZTOV(zl->zl_znode));
3232		rw_exit(zl->zl_rwlock);
3233		*zlpp = zl->zl_next;
3234		kmem_free(zl, sizeof (*zl));
3235	}
3236}
3237
3238/*
3239 * Search back through the directory tree, using the ".." entries.
3240 * Lock each directory in the chain to prevent concurrent renames.
3241 * Fail any attempt to move a directory into one of its own descendants.
3242 * XXX - z_parent_lock can overlap with map or grow locks
3243 */
static int
zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
{
	zfs_zlock_t	*zl;
	znode_t		*zp = tdzp;
	uint64_t	rootid = zp->z_zfsvfs->z_root;
	uint64_t	*oidp = &zp->z_id;
	krwlock_t	*rwlp = &szp->z_parent_lock;
	krw_t		rw = RW_WRITER;

	/*
	 * First pass write-locks szp and compares to zp->z_id.
	 * Later passes read-lock zp and compare to zp->z_parent.
	 */
	do {
		if (!rw_tryenter(rwlp, rw)) {
			/*
			 * Another thread is renaming in this path.
			 * Note that if we are a WRITER, we don't have any
			 * parent_locks held yet.
			 *
			 * NOTE: zl is only used here after at least one
			 * iteration has pushed an entry (rw starts as
			 * RW_WRITER, so this branch is unreachable on the
			 * first pass while zl is still uninitialized).
			 */
			if (rw == RW_READER && zp->z_id > szp->z_id) {
				/*
				 * Drop our locks and restart
				 */
				zfs_rename_unlock(&zl);
				*zlpp = NULL;
				zp = tdzp;
				oidp = &zp->z_id;
				rwlp = &szp->z_parent_lock;
				rw = RW_WRITER;
				continue;
			} else {
				/*
				 * Wait for other thread to drop its locks
				 */
				rw_enter(rwlp, rw);
			}
		}

		/* Push this lock onto the caller's unwind stack. */
		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
		zl->zl_rwlock = rwlp;
		zl->zl_znode = NULL;
		zl->zl_next = *zlpp;
		*zlpp = zl;

		if (*oidp == szp->z_id)		/* We're a descendant of szp */
			return (EINVAL);

		if (*oidp == rootid)		/* We've hit the top */
			return (0);

		if (rw == RW_READER) {		/* i.e. not the first pass */
			int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp);
			if (error)
				return (error);
			/* Record the held znode so unlock can VN_RELE it. */
			zl->zl_znode = zp;
		}
		/* Walk up one level via the ".." (parent) object id. */
		oidp = &zp->z_phys->zp_parent;
		rwlp = &zp->z_parent_lock;
		rw = RW_READER;

	} while (zp->z_id != sdzp->z_id);

	return (0);
}
3310
3311/*
3312 * Move an entry from the provided source directory to the target
3313 * directory.  Change the entry name as indicated.
3314 *
3315 *	IN:	sdvp	- Source directory containing the "old entry".
3316 *		snm	- Old entry name.
3317 *		tdvp	- Target directory to contain the "new entry".
3318 *		tnm	- New entry name.
3319 *		cr	- credentials of caller.
3320 *		ct	- caller context
3321 *		flags	- case flags
3322 *
3323 *	RETURN:	0 if success
3324 *		error code if failure
3325 *
3326 * Timestamps:
3327 *	sdvp,tdvp - ctime|mtime updated
3328 */
3329/*ARGSUSED*/
3330static int
3331zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
3332    caller_context_t *ct, int flags)
3333{
3334	znode_t		*tdzp, *szp, *tzp;
3335	znode_t		*sdzp = VTOZ(sdvp);
3336	zfsvfs_t	*zfsvfs = sdzp->z_zfsvfs;
3337	zilog_t		*zilog;
3338	vnode_t		*realvp;
3339	zfs_dirlock_t	*sdl, *tdl;
3340	dmu_tx_t	*tx;
3341	zfs_zlock_t	*zl;
3342	int		cmp, serr, terr;
3343	int		error = 0;
3344	int		zflg = 0;
3345
3346	ZFS_ENTER(zfsvfs);
3347	ZFS_VERIFY_ZP(sdzp);
3348	zilog = zfsvfs->z_log;
3349
3350	/*
3351	 * Make sure we have the real vp for the target directory.
3352	 */
3353	if (VOP_REALVP(tdvp, &realvp, ct) == 0)
3354		tdvp = realvp;
3355
3356	if (tdvp->v_vfsp != sdvp->v_vfsp || zfsctl_is_node(tdvp)) {
3357		ZFS_EXIT(zfsvfs);
3358		return (EXDEV);
3359	}
3360
3361	tdzp = VTOZ(tdvp);
3362	ZFS_VERIFY_ZP(tdzp);
3363	if (zfsvfs->z_utf8 && u8_validate(tnm,
3364	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3365		ZFS_EXIT(zfsvfs);
3366		return (EILSEQ);
3367	}
3368
3369	if (flags & FIGNORECASE)
3370		zflg |= ZCILOOK;
3371
3372top:
3373	szp = NULL;
3374	tzp = NULL;
3375	zl = NULL;
3376
3377	/*
3378	 * This is to prevent the creation of links into attribute space
3379	 * by renaming a linked file into/outof an attribute directory.
3380	 * See the comment in zfs_link() for why this is considered bad.
3381	 */
3382	if ((tdzp->z_phys->zp_flags & ZFS_XATTR) !=
3383	    (sdzp->z_phys->zp_flags & ZFS_XATTR)) {
3384		ZFS_EXIT(zfsvfs);
3385		return (EINVAL);
3386	}
3387
3388	/*
3389	 * Lock source and target directory entries.  To prevent deadlock,
3390	 * a lock ordering must be defined.  We lock the directory with
3391	 * the smallest object id first, or if it's a tie, the one with
3392	 * the lexically first name.
3393	 */
3394	if (sdzp->z_id < tdzp->z_id) {
3395		cmp = -1;
3396	} else if (sdzp->z_id > tdzp->z_id) {
3397		cmp = 1;
3398	} else {
3399		/*
3400		 * First compare the two name arguments without
3401		 * considering any case folding.
3402		 */
3403		int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
3404
3405		cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
3406		ASSERT(error == 0 || !zfsvfs->z_utf8);
3407		if (cmp == 0) {
3408			/*
3409			 * POSIX: "If the old argument and the new argument
3410			 * both refer to links to the same existing file,
3411			 * the rename() function shall return successfully
3412			 * and perform no other action."
3413			 */
3414			ZFS_EXIT(zfsvfs);
3415			return (0);
3416		}
3417		/*
3418		 * If the file system is case-folding, then we may
3419		 * have some more checking to do.  A case-folding file
3420		 * system is either supporting mixed case sensitivity
3421		 * access or is completely case-insensitive.  Note
3422		 * that the file system is always case preserving.
3423		 *
3424		 * In mixed sensitivity mode case sensitive behavior
3425		 * is the default.  FIGNORECASE must be used to
3426		 * explicitly request case insensitive behavior.
3427		 *
3428		 * If the source and target names provided differ only
3429		 * by case (e.g., a request to rename 'tim' to 'Tim'),
3430		 * we will treat this as a special case in the
3431		 * case-insensitive mode: as long as the source name
3432		 * is an exact match, we will allow this to proceed as
3433		 * a name-change request.
3434		 */
3435		if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
3436		    (zfsvfs->z_case == ZFS_CASE_MIXED &&
3437		    flags & FIGNORECASE)) &&
3438		    u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
3439		    &error) == 0) {
3440			/*
3441			 * case preserving rename request, require exact
3442			 * name matches
3443			 */
3444			zflg |= ZCIEXACT;
3445			zflg &= ~ZCILOOK;
3446		}
3447	}
3448
3449	/*
3450	 * If the source and destination directories are the same, we should
3451	 * grab the z_name_lock of that directory only once.
3452	 */
3453	if (sdzp == tdzp) {
3454		zflg |= ZHAVELOCK;
3455		rw_enter(&sdzp->z_name_lock, RW_READER);
3456	}
3457
3458	if (cmp < 0) {
3459		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
3460		    ZEXISTS | zflg, NULL, NULL);
3461		terr = zfs_dirent_lock(&tdl,
3462		    tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
3463	} else {
3464		terr = zfs_dirent_lock(&tdl,
3465		    tdzp, tnm, &tzp, zflg, NULL, NULL);
3466		serr = zfs_dirent_lock(&sdl,
3467		    sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
3468		    NULL, NULL);
3469	}
3470
3471	if (serr) {
3472		/*
3473		 * Source entry invalid or not there.
3474		 */
3475		if (!terr) {
3476			zfs_dirent_unlock(tdl);
3477			if (tzp)
3478				VN_RELE(ZTOV(tzp));
3479		}
3480
3481		if (sdzp == tdzp)
3482			rw_exit(&sdzp->z_name_lock);
3483
3484		if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0)
3485			serr = EINVAL;
3486		ZFS_EXIT(zfsvfs);
3487		return (serr);
3488	}
3489	if (terr) {
3490		zfs_dirent_unlock(sdl);
3491		VN_RELE(ZTOV(szp));
3492
3493		if (sdzp == tdzp)
3494			rw_exit(&sdzp->z_name_lock);
3495
3496		if (strcmp(tnm, "..") == 0)
3497			terr = EINVAL;
3498		ZFS_EXIT(zfsvfs);
3499		return (terr);
3500	}
3501
3502	/*
3503	 * Must have write access at the source to remove the old entry
3504	 * and write access at the target to create the new entry.
3505	 * Note that if target and source are the same, this can be
3506	 * done in a single check.
3507	 */
3508
3509	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
3510		goto out;
3511
3512	if (ZTOV(szp)->v_type == VDIR) {
3513		/*
3514		 * Check to make sure rename is valid.
3515		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
3516		 */
3517		if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
3518			goto out;
3519	}
3520
3521	/*
3522	 * Does target exist?
3523	 */
3524	if (tzp) {
3525		/*
3526		 * Source and target must be the same type.
3527		 */
3528		if (ZTOV(szp)->v_type == VDIR) {
3529			if (ZTOV(tzp)->v_type != VDIR) {
3530				error = ENOTDIR;
3531				goto out;
3532			}
3533		} else {
3534			if (ZTOV(tzp)->v_type == VDIR) {
3535				error = EISDIR;
3536				goto out;
3537			}
3538		}
3539		/*
3540		 * POSIX dictates that when the source and target
3541		 * entries refer to the same file object, rename
3542		 * must do nothing and exit without error.
3543		 */
3544		if (szp->z_id == tzp->z_id) {
3545			error = 0;
3546			goto out;
3547		}
3548	}
3549
3550	vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
3551	if (tzp)
3552		vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);
3553
3554	/*
3555	 * notify the target directory if it is not the same
3556	 * as source directory.
3557	 */
3558	if (tdvp != sdvp) {
3559		vnevent_rename_dest_dir(tdvp, ct);
3560	}
3561
3562	tx = dmu_tx_create(zfsvfs->z_os);
3563	dmu_tx_hold_bonus(tx, szp->z_id);	/* nlink changes */
3564	dmu_tx_hold_bonus(tx, sdzp->z_id);	/* nlink changes */
3565	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
3566	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
3567	if (sdzp != tdzp)
3568		dmu_tx_hold_bonus(tx, tdzp->z_id);	/* nlink changes */
3569	if (tzp)
3570		dmu_tx_hold_bonus(tx, tzp->z_id);	/* parent changes */
3571	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3572	error = dmu_tx_assign(tx, TXG_NOWAIT);
3573	if (error) {
3574		if (zl != NULL)
3575			zfs_rename_unlock(&zl);
3576		zfs_dirent_unlock(sdl);
3577		zfs_dirent_unlock(tdl);
3578
3579		if (sdzp == tdzp)
3580			rw_exit(&sdzp->z_name_lock);
3581
3582		VN_RELE(ZTOV(szp));
3583		if (tzp)
3584			VN_RELE(ZTOV(tzp));
3585		if (error == ERESTART) {
3586			dmu_tx_wait(tx);
3587			dmu_tx_abort(tx);
3588			goto top;
3589		}
3590		dmu_tx_abort(tx);
3591		ZFS_EXIT(zfsvfs);
3592		return (error);
3593	}
3594
3595	if (tzp)	/* Attempt to remove the existing target */
3596		error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
3597
3598	if (error == 0) {
3599		error = zfs_link_create(tdl, szp, tx, ZRENAMING);
3600		if (error == 0) {
3601			szp->z_phys->zp_flags |= ZFS_AV_MODIFIED;
3602
3603			error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
3604			ASSERT(error == 0);
3605
3606			zfs_log_rename(zilog, tx,
3607			    TX_RENAME | (flags & FIGNORECASE ? TX_CI : 0),
3608			    sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp);
3609
3610			/* Update path information for the target vnode */
3611			vn_renamepath(tdvp, ZTOV(szp), tnm, strlen(tnm));
3612		}
3613#ifdef FREEBSD_NAMECACHE
3614		if (error == 0) {
3615			cache_purge(sdvp);
3616			cache_purge(tdvp);
3617		}
3618#endif
3619	}
3620
3621	dmu_tx_commit(tx);
3622out:
3623	if (zl != NULL)
3624		zfs_rename_unlock(&zl);
3625
3626	zfs_dirent_unlock(sdl);
3627	zfs_dirent_unlock(tdl);
3628
3629	if (sdzp == tdzp)
3630		rw_exit(&sdzp->z_name_lock);
3631
3632	VN_RELE(ZTOV(szp));
3633	if (tzp)
3634		VN_RELE(ZTOV(tzp));
3635
3636	ZFS_EXIT(zfsvfs);
3637
3638	return (error);
3639}
3640
3641/*
3642 * Insert the indicated symbolic reference entry into the directory.
3643 *
3644 *	IN:	dvp	- Directory to contain new symbolic link.
3645 *		link	- Name for new symlink entry.
3646 *		vap	- Attributes of new entry.
3647 *		target	- Target path of new symlink.
3648 *		cr	- credentials of caller.
3649 *		ct	- caller context
3650 *		flags	- case flags
3651 *
3652 *	RETURN:	0 if success
3653 *		error code if failure
3654 *
3655 * Timestamps:
3656 *	dvp - ctime|mtime updated
3657 */
3658/*ARGSUSED*/
3659static int
3660zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
3661    cred_t *cr, kthread_t *td)
3662{
3663	znode_t		*zp, *dzp = VTOZ(dvp);
3664	zfs_dirlock_t	*dl;
3665	dmu_tx_t	*tx;
3666	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
3667	zilog_t		*zilog;
3668	int		len = strlen(link);
3669	int		error;
3670	int		zflg = ZNEW;
3671	zfs_acl_ids_t	acl_ids;
3672	boolean_t	fuid_dirtied;
3673	int		flags = 0;
3674
3675	ASSERT(vap->va_type == VLNK);
3676
3677	ZFS_ENTER(zfsvfs);
3678	ZFS_VERIFY_ZP(dzp);
3679	zilog = zfsvfs->z_log;
3680
3681	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
3682	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3683		ZFS_EXIT(zfsvfs);
3684		return (EILSEQ);
3685	}
3686	if (flags & FIGNORECASE)
3687		zflg |= ZCILOOK;
3688top:
3689	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
3690		ZFS_EXIT(zfsvfs);
3691		return (error);
3692	}
3693
3694	if (len > MAXPATHLEN) {
3695		ZFS_EXIT(zfsvfs);
3696		return (ENAMETOOLONG);
3697	}
3698
3699	/*
3700	 * Attempt to lock directory; fail if entry already exists.
3701	 */
3702	error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
3703	if (error) {
3704		ZFS_EXIT(zfsvfs);
3705		return (error);
3706	}
3707
3708	VERIFY(0 == zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids));
3709	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
3710		zfs_acl_ids_free(&acl_ids);
3711		zfs_dirent_unlock(dl);
3712		ZFS_EXIT(zfsvfs);
3713		return (EDQUOT);
3714	}
3715	tx = dmu_tx_create(zfsvfs->z_os);
3716	fuid_dirtied = zfsvfs->z_fuid_dirty;
3717	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
3718	dmu_tx_hold_bonus(tx, dzp->z_id);
3719	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3720	if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE)
3721		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE);
3722	if (fuid_dirtied)
3723		zfs_fuid_txhold(zfsvfs, tx);
3724	error = dmu_tx_assign(tx, TXG_NOWAIT);
3725	if (error) {
3726		zfs_acl_ids_free(&acl_ids);
3727		zfs_dirent_unlock(dl);
3728		if (error == ERESTART) {
3729			dmu_tx_wait(tx);
3730			dmu_tx_abort(tx);
3731			goto top;
3732		}
3733		dmu_tx_abort(tx);
3734		ZFS_EXIT(zfsvfs);
3735		return (error);
3736	}
3737
3738	dmu_buf_will_dirty(dzp->z_dbuf, tx);
3739
3740	/*
3741	 * Create a new object for the symlink.
3742	 * Put the link content into bonus buffer if it will fit;
3743	 * otherwise, store it just like any other file data.
3744	 */
3745	if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) {
3746		zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, &acl_ids);
3747		if (len != 0)
3748			bcopy(link, zp->z_phys + 1, len);
3749	} else {
3750		dmu_buf_t *dbp;
3751
3752		zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids);
3753
3754		if (fuid_dirtied)
3755			zfs_fuid_sync(zfsvfs, tx);
3756		/*
3757		 * Nothing can access the znode yet so no locking needed
3758		 * for growing the znode's blocksize.
3759		 */
3760		zfs_grow_blocksize(zp, len, tx);
3761
3762		VERIFY(0 == dmu_buf_hold(zfsvfs->z_os,
3763		    zp->z_id, 0, FTAG, &dbp));
3764		dmu_buf_will_dirty(dbp, tx);
3765
3766		ASSERT3U(len, <=, dbp->db_size);
3767		bcopy(link, dbp->db_data, len);
3768		dmu_buf_rele(dbp, FTAG);
3769	}
3770	zp->z_phys->zp_size = len;
3771
3772	/*
3773	 * Insert the new object into the directory.
3774	 */
3775	(void) zfs_link_create(dl, zp, tx, ZNEW);
3776	if (error == 0) {
3777		uint64_t txtype = TX_SYMLINK;
3778		if (flags & FIGNORECASE)
3779			txtype |= TX_CI;
3780		zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
3781		*vpp = ZTOV(zp);
3782	}
3783
3784	zfs_acl_ids_free(&acl_ids);
3785
3786	dmu_tx_commit(tx);
3787
3788	zfs_dirent_unlock(dl);
3789
3790	ZFS_EXIT(zfsvfs);
3791	return (error);
3792}
3793
3794/*
3795 * Return, in the buffer contained in the provided uio structure,
3796 * the symbolic path referred to by vp.
3797 *
3798 *	IN:	vp	- vnode of symbolic link.
3799 *		uoip	- structure to contain the link path.
3800 *		cr	- credentials of caller.
3801 *		ct	- caller context
3802 *
3803 *	OUT:	uio	- structure to contain the link path.
3804 *
3805 *	RETURN:	0 if success
3806 *		error code if failure
3807 *
3808 * Timestamps:
3809 *	vp - atime updated
3810 */
3811/* ARGSUSED */
3812static int
3813zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
3814{
3815	znode_t		*zp = VTOZ(vp);
3816	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
3817	size_t		bufsz;
3818	int		error;
3819
3820	ZFS_ENTER(zfsvfs);
3821	ZFS_VERIFY_ZP(zp);
3822
3823	bufsz = (size_t)zp->z_phys->zp_size;
3824	if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) {
3825		error = uiomove(zp->z_phys + 1,
3826		    MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
3827	} else {
3828		dmu_buf_t *dbp;
3829		error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp);
3830		if (error) {
3831			ZFS_EXIT(zfsvfs);
3832			return (error);
3833		}
3834		error = uiomove(dbp->db_data,
3835		    MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
3836		dmu_buf_rele(dbp, FTAG);
3837	}
3838
3839	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
3840	ZFS_EXIT(zfsvfs);
3841	return (error);
3842}
3843
3844/*
3845 * Insert a new entry into directory tdvp referencing svp.
3846 *
3847 *	IN:	tdvp	- Directory to contain new entry.
3848 *		svp	- vnode of new entry.
3849 *		name	- name of new entry.
3850 *		cr	- credentials of caller.
3851 *		ct	- caller context
3852 *
3853 *	RETURN:	0 if success
3854 *		error code if failure
3855 *
3856 * Timestamps:
3857 *	tdvp - ctime|mtime updated
3858 *	 svp - ctime updated
3859 */
3860/* ARGSUSED */
3861static int
3862zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
3863    caller_context_t *ct, int flags)
3864{
3865	znode_t		*dzp = VTOZ(tdvp);
3866	znode_t		*tzp, *szp;
3867	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
3868	zilog_t		*zilog;
3869	zfs_dirlock_t	*dl;
3870	dmu_tx_t	*tx;
3871	vnode_t		*realvp;
3872	int		error;
3873	int		zf = ZNEW;
3874	uint64_t	parent;
3875	uid_t		owner;
3876
3877	ASSERT(tdvp->v_type == VDIR);
3878
3879	ZFS_ENTER(zfsvfs);
3880	ZFS_VERIFY_ZP(dzp);
3881	zilog = zfsvfs->z_log;
3882
3883	if (VOP_REALVP(svp, &realvp, ct) == 0)
3884		svp = realvp;
3885
3886	/*
3887	 * POSIX dictates that we return EPERM here.
3888	 * Better choices include ENOTSUP or EISDIR.
3889	 */
3890	if (svp->v_type == VDIR) {
3891		ZFS_EXIT(zfsvfs);
3892		return (EPERM);
3893	}
3894
3895	if (svp->v_vfsp != tdvp->v_vfsp || zfsctl_is_node(svp)) {
3896		ZFS_EXIT(zfsvfs);
3897		return (EXDEV);
3898	}
3899
3900	szp = VTOZ(svp);
3901	ZFS_VERIFY_ZP(szp);
3902
3903	/* Prevent links to .zfs/shares files */
3904
3905	if (szp->z_phys->zp_parent == zfsvfs->z_shares_dir) {
3906		ZFS_EXIT(zfsvfs);
3907		return (EPERM);
3908	}
3909
3910	if (zfsvfs->z_utf8 && u8_validate(name,
3911	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3912		ZFS_EXIT(zfsvfs);
3913		return (EILSEQ);
3914	}
3915	if (flags & FIGNORECASE)
3916		zf |= ZCILOOK;
3917
3918	/*
3919	 * We do not support links between attributes and non-attributes
3920	 * because of the potential security risk of creating links
3921	 * into "normal" file space in order to circumvent restrictions
3922	 * imposed in attribute space.
3923	 */
3924	if ((szp->z_phys->zp_flags & ZFS_XATTR) !=
3925	    (dzp->z_phys->zp_flags & ZFS_XATTR)) {
3926		ZFS_EXIT(zfsvfs);
3927		return (EINVAL);
3928	}
3929
3930
3931	owner = zfs_fuid_map_id(zfsvfs, szp->z_phys->zp_uid, cr, ZFS_OWNER);
3932	if (owner != crgetuid(cr) &&
3933	    secpolicy_basic_link(svp, cr) != 0) {
3934		ZFS_EXIT(zfsvfs);
3935		return (EPERM);
3936	}
3937
3938	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
3939		ZFS_EXIT(zfsvfs);
3940		return (error);
3941	}
3942
3943top:
3944	/*
3945	 * Attempt to lock directory; fail if entry already exists.
3946	 */
3947	error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
3948	if (error) {
3949		ZFS_EXIT(zfsvfs);
3950		return (error);
3951	}
3952
3953	tx = dmu_tx_create(zfsvfs->z_os);
3954	dmu_tx_hold_bonus(tx, szp->z_id);
3955	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3956	error = dmu_tx_assign(tx, TXG_NOWAIT);
3957	if (error) {
3958		zfs_dirent_unlock(dl);
3959		if (error == ERESTART) {
3960			dmu_tx_wait(tx);
3961			dmu_tx_abort(tx);
3962			goto top;
3963		}
3964		dmu_tx_abort(tx);
3965		ZFS_EXIT(zfsvfs);
3966		return (error);
3967	}
3968
3969	error = zfs_link_create(dl, szp, tx, 0);
3970
3971	if (error == 0) {
3972		uint64_t txtype = TX_LINK;
3973		if (flags & FIGNORECASE)
3974			txtype |= TX_CI;
3975		zfs_log_link(zilog, tx, txtype, dzp, szp, name);
3976	}
3977
3978	dmu_tx_commit(tx);
3979
3980	zfs_dirent_unlock(dl);
3981
3982	if (error == 0) {
3983		vnevent_link(svp, ct);
3984	}
3985
3986	ZFS_EXIT(zfsvfs);
3987	return (error);
3988}
3989
/*
 * Called when the last reference to a vnode is dropped.  Pushes any
 * dirty atime to disk and lets the znode layer decide whether the
 * znode can be freed (zfs_zinactive).
 */
/*ARGSUSED*/
void
zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
	if (zp->z_dbuf == NULL) {
		/*
		 * The fs has been unmounted, or we did a
		 * suspend/resume and this file no longer exists.
		 */
		VI_LOCK(vp);
		vp->v_count = 0; /* count arrives as 1 */
		VI_UNLOCK(vp);
		/* FreeBSD: push the dead vnode onto the free list. */
		vrecycle(vp, curthread);
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
		return;
	}

	/* Write out a deferred atime update, unless the file is unlinked. */
	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);

		dmu_tx_hold_bonus(tx, zp->z_id);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			/* Best effort: drop the atime update on failure. */
			dmu_tx_abort(tx);
		} else {
			dmu_buf_will_dirty(zp->z_dbuf, tx);
			mutex_enter(&zp->z_lock);
			zp->z_atime_dirty = 0;
			mutex_exit(&zp->z_lock);
			dmu_tx_commit(tx);
		}
	}

	zfs_zinactive(zp);
	rw_exit(&zfsvfs->z_teardown_inactive_lock);
}
4031
/*
 * Both ZFS file ID forms must fit inside the generic struct fid so
 * they can be returned through the VFS fid interfaces (see zfs_fid).
 */
CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
4034
/*
 * Build a file identifier (for NFS export and the like) from the znode:
 * the object number and the low bits of the generation number, packed
 * byte-by-byte little-endian.  When zfsvfs->z_parent != zfsvfs
 * (presumably a dataset reached under .zfs — TODO confirm), the long
 * form is used, which additionally encodes the objset id.
 */
/*ARGSUSED*/
static int
zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	uint32_t	gen;
	uint64_t	object = zp->z_id;
	zfid_short_t	*zfid;
	int		size, i;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	gen = (uint32_t)zp->z_gen;

	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
	fidp->fid_len = size;

	zfid = (zfid_short_t *)fidp;

	zfid->zf_len = size;

	/* Pack the object number least-significant byte first. */
	for (i = 0; i < sizeof (zfid->zf_object); i++)
		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));

	/* Must have a non-zero generation number to distinguish from .zfs */
	if (gen == 0)
		gen = 1;
	for (i = 0; i < sizeof (zfid->zf_gen); i++)
		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));

	if (size == LONG_FID_LEN) {
		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
		zfid_long_t	*zlfid;

		zlfid = (zfid_long_t *)fidp;

		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));

		/* XXX - this should be the generation number for the objset */
		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
			zlfid->zf_setgen[i] = 0;
	}

	ZFS_EXIT(zfsvfs);
	return (0);
}
4083
/*
 * Report filesystem capability limits for pathconf(2)-style queries.
 * Unhandled selectors return EOPNOTSUPP so the caller can fall back
 * to generic defaults.
 */
static int
zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
    caller_context_t *ct)
{
	znode_t		*zp, *xzp;
	zfsvfs_t	*zfsvfs;
	zfs_dirlock_t	*dl;
	int		error;

	switch (cmd) {
	case _PC_LINK_MAX:
		*valp = INT_MAX;
		return (0);

	case _PC_FILESIZEBITS:
		*valp = 64;
		return (0);

	/* Extended-attribute probing is disabled in this FreeBSD port. */
#if 0
	case _PC_XATTR_EXISTS:
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);
		*valp = 0;
		error = zfs_dirent_lock(&dl, zp, "", &xzp,
		    ZXATTR | ZEXISTS | ZSHARED, NULL, NULL);
		if (error == 0) {
			zfs_dirent_unlock(dl);
			if (!zfs_dirempty(xzp))
				*valp = 1;
			VN_RELE(ZTOV(xzp));
		} else if (error == ENOENT) {
			/*
			 * If there aren't extended attributes, it's the
			 * same as having zero of them.
			 */
			error = 0;
		}
		ZFS_EXIT(zfsvfs);
		return (error);
#endif

	/* POSIX.1e ACLs are not supported; NFSv4 ACLs are. */
	case _PC_ACL_EXTENDED:
		*valp = 0;
		return (0);

	case _PC_ACL_NFS4:
		*valp = 1;
		return (0);

	case _PC_ACL_PATH_MAX:
		*valp = ACL_MAX_ENTRIES;
		return (0);

	case _PC_MIN_HOLE_SIZE:
		*valp = (int)SPA_MINBLOCKSIZE;
		return (0);

	default:
		return (EOPNOTSUPP);
	}
}
4147
4148/*ARGSUSED*/
4149static int
4150zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4151    caller_context_t *ct)
4152{
4153	znode_t *zp = VTOZ(vp);
4154	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4155	int error;
4156	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4157
4158	ZFS_ENTER(zfsvfs);
4159	ZFS_VERIFY_ZP(zp);
4160	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
4161	ZFS_EXIT(zfsvfs);
4162
4163	return (error);
4164}
4165
4166/*ARGSUSED*/
4167static int
4168zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4169    caller_context_t *ct)
4170{
4171	znode_t *zp = VTOZ(vp);
4172	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4173	int error;
4174	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4175
4176	ZFS_ENTER(zfsvfs);
4177	ZFS_VERIFY_ZP(zp);
4178	error = zfs_setacl(zp, vsecp, skipaclchk, cr);
4179	ZFS_EXIT(zfsvfs);
4180	return (error);
4181}
4182
4183static int
4184zfs_freebsd_open(ap)
4185	struct vop_open_args /* {
4186		struct vnode *a_vp;
4187		int a_mode;
4188		struct ucred *a_cred;
4189		struct thread *a_td;
4190	} */ *ap;
4191{
4192	vnode_t	*vp = ap->a_vp;
4193	znode_t *zp = VTOZ(vp);
4194	int error;
4195
4196	error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
4197	if (error == 0)
4198		vnode_create_vobject(vp, zp->z_phys->zp_size, ap->a_td);
4199	return (error);
4200}
4201
4202static int
4203zfs_freebsd_close(ap)
4204	struct vop_close_args /* {
4205		struct vnode *a_vp;
4206		int  a_fflag;
4207		struct ucred *a_cred;
4208		struct thread *a_td;
4209	} */ *ap;
4210{
4211
4212	return (zfs_close(ap->a_vp, ap->a_fflag, 0, 0, ap->a_cred, NULL));
4213}
4214
4215static int
4216zfs_freebsd_ioctl(ap)
4217	struct vop_ioctl_args /* {
4218		struct vnode *a_vp;
4219		u_long a_command;
4220		caddr_t a_data;
4221		int a_fflag;
4222		struct ucred *cred;
4223		struct thread *td;
4224	} */ *ap;
4225{
4226
4227	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
4228	    ap->a_fflag, ap->a_cred, NULL, NULL));
4229}
4230
4231static int
4232zfs_freebsd_read(ap)
4233	struct vop_read_args /* {
4234		struct vnode *a_vp;
4235		struct uio *a_uio;
4236		int a_ioflag;
4237		struct ucred *a_cred;
4238	} */ *ap;
4239{
4240
4241	return (zfs_read(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL));
4242}
4243
4244static int
4245zfs_freebsd_write(ap)
4246	struct vop_write_args /* {
4247		struct vnode *a_vp;
4248		struct uio *a_uio;
4249		int a_ioflag;
4250		struct ucred *a_cred;
4251	} */ *ap;
4252{
4253
4254	if (vn_rlimit_fsize(ap->a_vp, ap->a_uio, ap->a_uio->uio_td))
4255		return (EFBIG);
4256
4257	return (zfs_write(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL));
4258}
4259
/*
 * VOP_ACCESS(9) entry point.  Splits the request into the bits ZFS
 * understands (checked via zfs_access), VADMIN-style bits (checked via
 * the generic vaccess), and a final exec-bit sanity check.
 */
static int
zfs_freebsd_access(ap)
	struct vop_access_args /* {
		struct vnode *a_vp;
		accmode_t a_accmode;
		struct ucred *a_cred;
		struct thread *a_td;
	} */ *ap;
{
	vnode_t *vp = ap->a_vp;
	znode_t *zp = VTOZ(vp);
	znode_phys_t *zphys = zp->z_phys;
	accmode_t accmode;
	int error = 0;

	/*
	 * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND,
	 */
	accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
	if (accmode != 0)
		error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL);

	/*
	 * VADMIN has to be handled by vaccess().
	 */
	if (error == 0) {
		accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
		if (accmode != 0) {
			/* Classic UNIX owner/group/mode check. */
			error = vaccess(vp->v_type, zphys->zp_mode,
			    zphys->zp_uid, zphys->zp_gid, accmode, ap->a_cred,
			    NULL);
		}
	}

	/*
	 * For VEXEC, ensure that at least one execute bit is set for
	 * non-directories.
	 */
	if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
	    (zphys->zp_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0)
		error = EACCES;

	return (error);
}
4304
4305static int
4306zfs_freebsd_lookup(ap)
4307	struct vop_lookup_args /* {
4308		struct vnode *a_dvp;
4309		struct vnode **a_vpp;
4310		struct componentname *a_cnp;
4311	} */ *ap;
4312{
4313	struct componentname *cnp = ap->a_cnp;
4314	char nm[NAME_MAX + 1];
4315
4316	ASSERT(cnp->cn_namelen < sizeof(nm));
4317	strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
4318
4319	return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
4320	    cnp->cn_cred, cnp->cn_thread, 0));
4321}
4322
4323static int
4324zfs_freebsd_create(ap)
4325	struct vop_create_args /* {
4326		struct vnode *a_dvp;
4327		struct vnode **a_vpp;
4328		struct componentname *a_cnp;
4329		struct vattr *a_vap;
4330	} */ *ap;
4331{
4332	struct componentname *cnp = ap->a_cnp;
4333	vattr_t *vap = ap->a_vap;
4334	int mode;
4335
4336	ASSERT(cnp->cn_flags & SAVENAME);
4337
4338	vattr_init_mask(vap);
4339	mode = vap->va_mode & ALLPERMS;
4340
4341	return (zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
4342	    ap->a_vpp, cnp->cn_cred, cnp->cn_thread));
4343}
4344
4345static int
4346zfs_freebsd_remove(ap)
4347	struct vop_remove_args /* {
4348		struct vnode *a_dvp;
4349		struct vnode *a_vp;
4350		struct componentname *a_cnp;
4351	} */ *ap;
4352{
4353
4354	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
4355
4356	return (zfs_remove(ap->a_dvp, ap->a_cnp->cn_nameptr,
4357	    ap->a_cnp->cn_cred, NULL, 0));
4358}
4359
4360static int
4361zfs_freebsd_mkdir(ap)
4362	struct vop_mkdir_args /* {
4363		struct vnode *a_dvp;
4364		struct vnode **a_vpp;
4365		struct componentname *a_cnp;
4366		struct vattr *a_vap;
4367	} */ *ap;
4368{
4369	vattr_t *vap = ap->a_vap;
4370
4371	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
4372
4373	vattr_init_mask(vap);
4374
4375	return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
4376	    ap->a_cnp->cn_cred, NULL, 0, NULL));
4377}
4378
4379static int
4380zfs_freebsd_rmdir(ap)
4381	struct vop_rmdir_args /* {
4382		struct vnode *a_dvp;
4383		struct vnode *a_vp;
4384		struct componentname *a_cnp;
4385	} */ *ap;
4386{
4387	struct componentname *cnp = ap->a_cnp;
4388
4389	ASSERT(cnp->cn_flags & SAVENAME);
4390
4391	return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred, NULL, 0));
4392}
4393
4394static int
4395zfs_freebsd_readdir(ap)
4396	struct vop_readdir_args /* {
4397		struct vnode *a_vp;
4398		struct uio *a_uio;
4399		struct ucred *a_cred;
4400		int *a_eofflag;
4401		int *a_ncookies;
4402		u_long **a_cookies;
4403	} */ *ap;
4404{
4405
4406	return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
4407	    ap->a_ncookies, ap->a_cookies));
4408}
4409
4410static int
4411zfs_freebsd_fsync(ap)
4412	struct vop_fsync_args /* {
4413		struct vnode *a_vp;
4414		int a_waitfor;
4415		struct thread *a_td;
4416	} */ *ap;
4417{
4418
4419	vop_stdfsync(ap);
4420	return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL));
4421}
4422
/*
 * VOP_GETATTR(9) entry point.  Requests the extended (xvattr) system
 * attributes from ZFS and translates them into the BSD chflags bits
 * reported in va_flags.
 */
static int
zfs_freebsd_getattr(ap)
	struct vop_getattr_args /* {
		struct vnode *a_vp;
		struct vattr *a_vap;
		struct ucred *a_cred;
		struct thread *a_td;
	} */ *ap;
{
	vattr_t *vap = ap->a_vap;
	xvattr_t xvap;
	u_long fflags = 0;
	int error;

	xva_init(&xvap);
	xvap.xva_vattr = *vap;
	xvap.xva_vattr.va_mask |= AT_XVATTR;

	/* Convert chflags into ZFS-type flags. */
	/* XXX: what about SF_SETTABLE?. */
	XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
	XVA_SET_REQ(&xvap, XAT_APPENDONLY);
	XVA_SET_REQ(&xvap, XAT_NOUNLINK);
	XVA_SET_REQ(&xvap, XAT_NODUMP);
	error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
	if (error != 0)
		return (error);

	/* Convert ZFS xattr into chflags. */
	/* Sets fflag iff ZFS returned the attribute and it is non-zero. */
#define	FLAG_CHECK(fflag, xflag, xfield)	do {			\
	if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0)		\
		fflags |= (fflag);					\
} while (0)
	FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
	    xvap.xva_xoptattrs.xoa_immutable);
	FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
	    xvap.xva_xoptattrs.xoa_appendonly);
	FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
	    xvap.xva_xoptattrs.xoa_nounlink);
	FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
	    xvap.xva_xoptattrs.xoa_nodump);
#undef	FLAG_CHECK
	*vap = xvap.xva_vattr;
	vap->va_flags = fflags;
	return (0);
}
4469
/*
 * VOP_SETATTR(9) entry point.  Translates BSD chflags bits in va_flags
 * into ZFS extended attributes, enforcing the securelevel/jail policy
 * for system flags, then forwards everything to zfs_setattr().
 */
static int
zfs_freebsd_setattr(ap)
	struct vop_setattr_args /* {
		struct vnode *a_vp;
		struct vattr *a_vap;
		struct ucred *a_cred;
		struct thread *a_td;
	} */ *ap;
{
	vnode_t *vp = ap->a_vp;
	vattr_t *vap = ap->a_vap;
	cred_t *cred = ap->a_cred;
	xvattr_t xvap;
	u_long fflags;
	uint64_t zflags;

	vattr_init_mask(vap);
	/* Strip attributes that may never be set directly. */
	vap->va_mask &= ~AT_NOSET;

	xva_init(&xvap);
	xvap.xva_vattr = *vap;

	zflags = VTOZ(vp)->z_phys->zp_flags;

	if (vap->va_flags != VNOVAL) {
		zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
		int error;

		/* File flags require the FUID-capable on-disk format. */
		if (zfsvfs->z_use_fuids == B_FALSE)
			return (EOPNOTSUPP);

		fflags = vap->va_flags;
		/* Only these four chflags bits map onto ZFS attributes. */
		if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_NODUMP)) != 0)
			return (EOPNOTSUPP);
		/*
		 * Unprivileged processes are not permitted to unset system
		 * flags, or modify flags if any system flags are set.
		 * Privileged non-jail processes may not modify system flags
		 * if securelevel > 0 and any existing system flags are set.
		 * Privileged jail processes behave like privileged non-jail
		 * processes if the security.jail.chflags_allowed sysctl is
		 * is non-zero; otherwise, they behave like unprivileged
		 * processes.
		 */
		if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
		    priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) {
			if (zflags &
			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
				error = securelevel_gt(cred, 0);
				if (error != 0)
					return (error);
			}
		} else {
			/*
			 * Callers may only modify the file flags on objects they
			 * have VADMIN rights for.
			 */
			if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0)
				return (error);
			if (zflags &
			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
				return (EPERM);
			}
			if (fflags &
			    (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
				return (EPERM);
			}
		}

/* Request a change only where the BSD and ZFS views disagree. */
#define	FLAG_CHANGE(fflag, zflag, xflag, xfield)	do {		\
	if (((fflags & (fflag)) && !(zflags & (zflag))) ||		\
	    ((zflags & (zflag)) && !(fflags & (fflag)))) {		\
		XVA_SET_REQ(&xvap, (xflag));				\
		(xfield) = ((fflags & (fflag)) != 0);			\
	}								\
} while (0)
		/* Convert chflags into ZFS-type flags. */
		/* XXX: what about SF_SETTABLE?. */
		FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
		    xvap.xva_xoptattrs.xoa_immutable);
		FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
		    xvap.xva_xoptattrs.xoa_appendonly);
		FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
		    xvap.xva_xoptattrs.xoa_nounlink);
		FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
		    xvap.xva_xoptattrs.xoa_nodump);
#undef	FLAG_CHANGE
	}
	return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL));
}
4560
/*
 * VOP_RENAME(9) entry point.  Performs the rename, then releases all
 * four vnode references handed in by the VFS — the rename VOP consumes
 * them regardless of success or failure.
 */
static int
zfs_freebsd_rename(ap)
	struct vop_rename_args  /* {
		struct vnode *a_fdvp;
		struct vnode *a_fvp;
		struct componentname *a_fcnp;
		struct vnode *a_tdvp;
		struct vnode *a_tvp;
		struct componentname *a_tcnp;
	} */ *ap;
{
	vnode_t *fdvp = ap->a_fdvp;
	vnode_t *fvp = ap->a_fvp;
	vnode_t *tdvp = ap->a_tdvp;
	vnode_t *tvp = ap->a_tvp;
	int error;

	ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
	ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));

	error = zfs_rename(fdvp, ap->a_fcnp->cn_nameptr, tdvp,
	    ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred, NULL, 0);

	/*
	 * tdvp arrives locked; VN_URELE unlocks and releases, while
	 * VN_RELE only drops the reference (used when tdvp == tvp so
	 * the node is not unlocked twice).
	 */
	if (tdvp == tvp)
		VN_RELE(tdvp);
	else
		VN_URELE(tdvp);
	if (tvp)
		VN_URELE(tvp);
	VN_RELE(fdvp);
	VN_RELE(fvp);

	return (error);
}
4595
4596static int
4597zfs_freebsd_symlink(ap)
4598	struct vop_symlink_args /* {
4599		struct vnode *a_dvp;
4600		struct vnode **a_vpp;
4601		struct componentname *a_cnp;
4602		struct vattr *a_vap;
4603		char *a_target;
4604	} */ *ap;
4605{
4606	struct componentname *cnp = ap->a_cnp;
4607	vattr_t *vap = ap->a_vap;
4608
4609	ASSERT(cnp->cn_flags & SAVENAME);
4610
4611	vap->va_type = VLNK;	/* FreeBSD: Syscall only sets va_mode. */
4612	vattr_init_mask(vap);
4613
4614	return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
4615	    ap->a_target, cnp->cn_cred, cnp->cn_thread));
4616}
4617
4618static int
4619zfs_freebsd_readlink(ap)
4620	struct vop_readlink_args /* {
4621		struct vnode *a_vp;
4622		struct uio *a_uio;
4623		struct ucred *a_cred;
4624	} */ *ap;
4625{
4626
4627	return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
4628}
4629
4630static int
4631zfs_freebsd_link(ap)
4632	struct vop_link_args /* {
4633		struct vnode *a_tdvp;
4634		struct vnode *a_vp;
4635		struct componentname *a_cnp;
4636	} */ *ap;
4637{
4638	struct componentname *cnp = ap->a_cnp;
4639
4640	ASSERT(cnp->cn_flags & SAVENAME);
4641
4642	return (zfs_link(ap->a_tdvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
4643}
4644
4645static int
4646zfs_freebsd_inactive(ap)
4647	struct vop_inactive_args /* {
4648		struct vnode *a_vp;
4649		struct thread *a_td;
4650	} */ *ap;
4651{
4652	vnode_t *vp = ap->a_vp;
4653
4654	zfs_inactive(vp, ap->a_td->td_ucred, NULL);
4655	return (0);
4656}
4657
/*
 * Taskqueue callback used by zfs_freebsd_reclaim() when the object
 * mutex could not be taken without risking deadlock: finishes tearing
 * down the znode's DMU state and frees it.  Wakes an unmounting
 * process that may be waiting for the last znode to go away.
 */
static void
zfs_reclaim_complete(void *arg, int pending)
{
	znode_t	*zp = arg;
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
	if (zp->z_dbuf != NULL) {
		ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id);
		zfs_znode_dmu_fini(zp);
		ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
	}
	zfs_znode_free(zp);
	rw_exit(&zfsvfs->z_teardown_inactive_lock);
	/*
	 * If the file system is being unmounted, there is a process waiting
	 * for us, wake it up.
	 */
	if (zfsvfs->z_unmounted)
		wakeup_one(zfsvfs);
}
4679
/*
 * Reclaim a vnode: detach it from its znode and destroy the znode.
 * If the znode is still attached to the DMU and the per-object hold
 * lock cannot be acquired safely, final destruction is deferred to a
 * taskqueue (zfs_reclaim_complete()).
 */
static int
zfs_freebsd_reclaim(ap)
	struct vop_reclaim_args /* {
		struct vnode *a_vp;
		struct thread *a_td;
	} */ *ap;
{
	vnode_t	*vp = ap->a_vp;
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);

	ASSERT(zp != NULL);

	/*
	 * Destroy the vm object and flush associated pages.
	 */
	vnode_destroy_vobject(vp);

	/* Break the vnode -> znode association under z_lock. */
	mutex_enter(&zp->z_lock);
	ASSERT(zp->z_phys != NULL);
	zp->z_vnode = NULL;
	mutex_exit(&zp->z_lock);

	if (zp->z_unlinked)
		;	/* Do nothing. */
	else if (zp->z_dbuf == NULL)
		zfs_znode_free(zp);
	else /* if (!zp->z_unlinked && zp->z_dbuf != NULL) */ {
		int locked;

		/*
		 * locked == 2: object-hold mutex already owned by this
		 *              thread, must not be dropped here;
		 * locked == 1: trylock succeeded, drop it when done;
		 * locked == 0: trylock failed, defer to the taskqueue.
		 */
		locked = MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)) ? 2 :
		    ZFS_OBJ_HOLD_TRYENTER(zfsvfs, zp->z_id);
		if (locked == 0) {
			/*
			 * Lock can't be obtained due to deadlock possibility,
			 * so defer znode destruction.
			 */
			TASK_INIT(&zp->z_task, 0, zfs_reclaim_complete, zp);
			taskqueue_enqueue(taskqueue_thread, &zp->z_task);
		} else {
			zfs_znode_dmu_fini(zp);
			if (locked == 1)
				ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
			zfs_znode_free(zp);
		}
	}
	/* Clear v_data so the vnode no longer points at the freed znode. */
	VI_LOCK(vp);
	vp->v_data = NULL;
	ASSERT(vp->v_holdcnt >= 1);
	VI_UNLOCK(vp);
	rw_exit(&zfsvfs->z_teardown_inactive_lock);
	return (0);
}
4735
4736static int
4737zfs_freebsd_fid(ap)
4738	struct vop_fid_args /* {
4739		struct vnode *a_vp;
4740		struct fid *a_fid;
4741	} */ *ap;
4742{
4743
4744	return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
4745}
4746
4747static int
4748zfs_freebsd_pathconf(ap)
4749	struct vop_pathconf_args /* {
4750		struct vnode *a_vp;
4751		int a_name;
4752		register_t *a_retval;
4753	} */ *ap;
4754{
4755	ulong_t val;
4756	int error;
4757
4758	error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL);
4759	if (error == 0)
4760		*ap->a_retval = val;
4761	else if (error == EOPNOTSUPP)
4762		error = vop_stdpathconf(ap);
4763	return (error);
4764}
4765
4766static int
4767zfs_freebsd_fifo_pathconf(ap)
4768	struct vop_pathconf_args /* {
4769		struct vnode *a_vp;
4770		int a_name;
4771		register_t *a_retval;
4772	} */ *ap;
4773{
4774
4775	switch (ap->a_name) {
4776	case _PC_ACL_EXTENDED:
4777	case _PC_ACL_NFS4:
4778	case _PC_ACL_PATH_MAX:
4779	case _PC_MAC_PRESENT:
4780		return (zfs_freebsd_pathconf(ap));
4781	default:
4782		return (fifo_specops.vop_pathconf(ap));
4783	}
4784}
4785
4786/*
4787 * FreeBSD's extended attributes namespace defines file name prefix for ZFS'
4788 * extended attribute name:
4789 *
4790 *	NAMESPACE	PREFIX
4791 *	system		freebsd:system:
4792 *	user		(none, can be used to access ZFS fsattr(5) attributes
4793 *			created on Solaris)
4794 */
4795static int
4796zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
4797    size_t size)
4798{
4799	const char *namespace, *prefix, *suffix;
4800
4801	/* We don't allow '/' character in attribute name. */
4802	if (strchr(name, '/') != NULL)
4803		return (EINVAL);
4804	/* We don't allow attribute names that start with "freebsd:" string. */
4805	if (strncmp(name, "freebsd:", 8) == 0)
4806		return (EINVAL);
4807
4808	bzero(attrname, size);
4809
4810	switch (attrnamespace) {
4811	case EXTATTR_NAMESPACE_USER:
4812#if 0
4813		prefix = "freebsd:";
4814		namespace = EXTATTR_NAMESPACE_USER_STRING;
4815		suffix = ":";
4816#else
4817		/*
4818		 * This is the default namespace by which we can access all
4819		 * attributes created on Solaris.
4820		 */
4821		prefix = namespace = suffix = "";
4822#endif
4823		break;
4824	case EXTATTR_NAMESPACE_SYSTEM:
4825		prefix = "freebsd:";
4826		namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
4827		suffix = ":";
4828		break;
4829	case EXTATTR_NAMESPACE_EMPTY:
4830	default:
4831		return (EINVAL);
4832	}
4833	if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
4834	    name) >= size) {
4835		return (ENAMETOOLONG);
4836	}
4837	return (0);
4838}
4839
4840/*
4841 * Vnode operating to retrieve a named extended attribute.
4842 */
4843static int
4844zfs_getextattr(struct vop_getextattr_args *ap)
4845/*
4846vop_getextattr {
4847	IN struct vnode *a_vp;
4848	IN int a_attrnamespace;
4849	IN const char *a_name;
4850	INOUT struct uio *a_uio;
4851	OUT size_t *a_size;
4852	IN struct ucred *a_cred;
4853	IN struct thread *a_td;
4854};
4855*/
4856{
4857	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
4858	struct thread *td = ap->a_td;
4859	struct nameidata nd;
4860	char attrname[255];
4861	struct vattr va;
4862	vnode_t *xvp = NULL, *vp;
4863	int error, flags;
4864
4865	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
4866	    ap->a_cred, ap->a_td, VREAD);
4867	if (error != 0)
4868		return (error);
4869
4870	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
4871	    sizeof(attrname));
4872	if (error != 0)
4873		return (error);
4874
4875	ZFS_ENTER(zfsvfs);
4876
4877	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
4878	    LOOKUP_XATTR);
4879	if (error != 0) {
4880		ZFS_EXIT(zfsvfs);
4881		return (error);
4882	}
4883
4884	flags = FREAD;
4885	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, attrname,
4886	    xvp, td);
4887	error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL);
4888	vp = nd.ni_vp;
4889	NDFREE(&nd, NDF_ONLY_PNBUF);
4890	if (error != 0) {
4891		ZFS_EXIT(zfsvfs);
4892		if (error == ENOENT)
4893			error = ENOATTR;
4894		return (error);
4895	}
4896
4897	if (ap->a_size != NULL) {
4898		error = VOP_GETATTR(vp, &va, ap->a_cred);
4899		if (error == 0)
4900			*ap->a_size = (size_t)va.va_size;
4901	} else if (ap->a_uio != NULL)
4902		error = VOP_READ(vp, ap->a_uio, IO_UNIT | IO_SYNC, ap->a_cred);
4903
4904	VOP_UNLOCK(vp, 0);
4905	vn_close(vp, flags, ap->a_cred, td);
4906	ZFS_EXIT(zfsvfs);
4907
4908	return (error);
4909}
4910
4911/*
4912 * Vnode operation to remove a named attribute.
4913 */
4914int
4915zfs_deleteextattr(struct vop_deleteextattr_args *ap)
4916/*
4917vop_deleteextattr {
4918	IN struct vnode *a_vp;
4919	IN int a_attrnamespace;
4920	IN const char *a_name;
4921	IN struct ucred *a_cred;
4922	IN struct thread *a_td;
4923};
4924*/
4925{
4926	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
4927	struct thread *td = ap->a_td;
4928	struct nameidata nd;
4929	char attrname[255];
4930	struct vattr va;
4931	vnode_t *xvp = NULL, *vp;
4932	int error, flags;
4933
4934	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
4935	    ap->a_cred, ap->a_td, VWRITE);
4936	if (error != 0)
4937		return (error);
4938
4939	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
4940	    sizeof(attrname));
4941	if (error != 0)
4942		return (error);
4943
4944	ZFS_ENTER(zfsvfs);
4945
4946	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
4947	    LOOKUP_XATTR);
4948	if (error != 0) {
4949		ZFS_EXIT(zfsvfs);
4950		return (error);
4951	}
4952
4953	NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF | MPSAFE,
4954	    UIO_SYSSPACE, attrname, xvp, td);
4955	error = namei(&nd);
4956	vp = nd.ni_vp;
4957	NDFREE(&nd, NDF_ONLY_PNBUF);
4958	if (error != 0) {
4959		ZFS_EXIT(zfsvfs);
4960		if (error == ENOENT)
4961			error = ENOATTR;
4962		return (error);
4963	}
4964	error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
4965
4966	vput(nd.ni_dvp);
4967	if (vp == nd.ni_dvp)
4968		vrele(vp);
4969	else
4970		vput(vp);
4971	ZFS_EXIT(zfsvfs);
4972
4973	return (error);
4974}
4975
4976/*
4977 * Vnode operation to set a named attribute.
4978 */
4979static int
4980zfs_setextattr(struct vop_setextattr_args *ap)
4981/*
4982vop_setextattr {
4983	IN struct vnode *a_vp;
4984	IN int a_attrnamespace;
4985	IN const char *a_name;
4986	INOUT struct uio *a_uio;
4987	IN struct ucred *a_cred;
4988	IN struct thread *a_td;
4989};
4990*/
4991{
4992	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
4993	struct thread *td = ap->a_td;
4994	struct nameidata nd;
4995	char attrname[255];
4996	struct vattr va;
4997	vnode_t *xvp = NULL, *vp;
4998	int error, flags;
4999
5000	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5001	    ap->a_cred, ap->a_td, VWRITE);
5002	if (error != 0)
5003		return (error);
5004
5005	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5006	    sizeof(attrname));
5007	if (error != 0)
5008		return (error);
5009
5010	ZFS_ENTER(zfsvfs);
5011
5012	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5013	    LOOKUP_XATTR | CREATE_XATTR_DIR);
5014	if (error != 0) {
5015		ZFS_EXIT(zfsvfs);
5016		return (error);
5017	}
5018
5019	flags = FFLAGS(O_WRONLY | O_CREAT);
5020	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, attrname,
5021	    xvp, td);
5022	error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL);
5023	vp = nd.ni_vp;
5024	NDFREE(&nd, NDF_ONLY_PNBUF);
5025	if (error != 0) {
5026		ZFS_EXIT(zfsvfs);
5027		return (error);
5028	}
5029
5030	VATTR_NULL(&va);
5031	va.va_size = 0;
5032	error = VOP_SETATTR(vp, &va, ap->a_cred);
5033	if (error == 0)
5034		VOP_WRITE(vp, ap->a_uio, IO_UNIT | IO_SYNC, ap->a_cred);
5035
5036	VOP_UNLOCK(vp, 0);
5037	vn_close(vp, flags, ap->a_cred, td);
5038	ZFS_EXIT(zfsvfs);
5039
5040	return (error);
5041}
5042
5043/*
5044 * Vnode operation to retrieve extended attributes on a vnode.
5045 */
5046static int
5047zfs_listextattr(struct vop_listextattr_args *ap)
5048/*
5049vop_listextattr {
5050	IN struct vnode *a_vp;
5051	IN int a_attrnamespace;
5052	INOUT struct uio *a_uio;
5053	OUT size_t *a_size;
5054	IN struct ucred *a_cred;
5055	IN struct thread *a_td;
5056};
5057*/
5058{
5059	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5060	struct thread *td = ap->a_td;
5061	struct nameidata nd;
5062	char attrprefix[16];
5063	u_char dirbuf[sizeof(struct dirent)];
5064	struct dirent *dp;
5065	struct iovec aiov;
5066	struct uio auio, *uio = ap->a_uio;
5067	size_t *sizep = ap->a_size;
5068	size_t plen;
5069	vnode_t *xvp = NULL, *vp;
5070	int done, error, eof, pos;
5071
5072	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5073	    ap->a_cred, ap->a_td, VREAD);
5074	if (error != 0)
5075		return (error);
5076
5077	error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
5078	    sizeof(attrprefix));
5079	if (error != 0)
5080		return (error);
5081	plen = strlen(attrprefix);
5082
5083	ZFS_ENTER(zfsvfs);
5084
5085	if (sizep != NULL)
5086		*sizep = 0;
5087
5088	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5089	    LOOKUP_XATTR);
5090	if (error != 0) {
5091		ZFS_EXIT(zfsvfs);
5092		/*
5093		 * ENOATTR means that the EA directory does not yet exist,
5094		 * i.e. there are no extended attributes there.
5095		 */
5096		if (error == ENOATTR)
5097			error = 0;
5098		return (error);
5099	}
5100
5101	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED | MPSAFE,
5102	    UIO_SYSSPACE, ".", xvp, td);
5103	error = namei(&nd);
5104	vp = nd.ni_vp;
5105	NDFREE(&nd, NDF_ONLY_PNBUF);
5106	if (error != 0) {
5107		ZFS_EXIT(zfsvfs);
5108		return (error);
5109	}
5110
5111	auio.uio_iov = &aiov;
5112	auio.uio_iovcnt = 1;
5113	auio.uio_segflg = UIO_SYSSPACE;
5114	auio.uio_td = td;
5115	auio.uio_rw = UIO_READ;
5116	auio.uio_offset = 0;
5117
5118	do {
5119		u_char nlen;
5120
5121		aiov.iov_base = (void *)dirbuf;
5122		aiov.iov_len = sizeof(dirbuf);
5123		auio.uio_resid = sizeof(dirbuf);
5124		error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
5125		done = sizeof(dirbuf) - auio.uio_resid;
5126		if (error != 0)
5127			break;
5128		for (pos = 0; pos < done;) {
5129			dp = (struct dirent *)(dirbuf + pos);
5130			pos += dp->d_reclen;
5131			/*
5132			 * XXX: Temporarily we also accept DT_UNKNOWN, as this
5133			 * is what we get when attribute was created on Solaris.
5134			 */
5135			if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
5136				continue;
5137			if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0)
5138				continue;
5139			else if (strncmp(dp->d_name, attrprefix, plen) != 0)
5140				continue;
5141			nlen = dp->d_namlen - plen;
5142			if (sizep != NULL)
5143				*sizep += 1 + nlen;
5144			else if (uio != NULL) {
5145				/*
5146				 * Format of extattr name entry is one byte for
5147				 * length and the rest for name.
5148				 */
5149				error = uiomove(&nlen, 1, uio->uio_rw, uio);
5150				if (error == 0) {
5151					error = uiomove(dp->d_name + plen, nlen,
5152					    uio->uio_rw, uio);
5153				}
5154				if (error != 0)
5155					break;
5156			}
5157		}
5158	} while (!eof && error == 0);
5159
5160	vput(vp);
5161	ZFS_EXIT(zfsvfs);
5162
5163	return (error);
5164}
5165
5166int
5167zfs_freebsd_getacl(ap)
5168	struct vop_getacl_args /* {
5169		struct vnode *vp;
5170		acl_type_t type;
5171		struct acl *aclp;
5172		struct ucred *cred;
5173		struct thread *td;
5174	} */ *ap;
5175{
5176	int		error;
5177	vsecattr_t      vsecattr;
5178
5179	if (ap->a_type != ACL_TYPE_NFS4)
5180		return (EINVAL);
5181
5182	vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
5183	if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL))
5184		return (error);
5185
5186	error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt);
5187	if (vsecattr.vsa_aclentp != NULL)
5188		kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);
5189
5190	return (error);
5191}
5192
5193int
5194zfs_freebsd_setacl(ap)
5195	struct vop_setacl_args /* {
5196		struct vnode *vp;
5197		acl_type_t type;
5198		struct acl *aclp;
5199		struct ucred *cred;
5200		struct thread *td;
5201	} */ *ap;
5202{
5203	int		error;
5204	vsecattr_t      vsecattr;
5205	int		aclbsize;	/* size of acl list in bytes */
5206	aclent_t	*aaclp;
5207
5208	if (ap->a_type != ACL_TYPE_NFS4)
5209		return (EINVAL);
5210
5211	if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
5212		return (EINVAL);
5213
5214	/*
5215	 * With NFSv4 ACLs, chmod(2) may need to add additional entries,
5216	 * splitting every entry into two and appending "canonical six"
5217	 * entries at the end.  Don't allow for setting an ACL that would
5218	 * cause chmod(2) to run out of ACL entries.
5219	 */
5220	if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
5221		return (ENOSPC);
5222
5223	error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
5224	if (error != 0)
5225		return (error);
5226
5227	vsecattr.vsa_mask = VSA_ACE;
5228	aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t);
5229	vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
5230	aaclp = vsecattr.vsa_aclentp;
5231	vsecattr.vsa_aclentsz = aclbsize;
5232
5233	aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
5234	error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL);
5235	kmem_free(aaclp, aclbsize);
5236
5237	return (error);
5238}
5239
/*
 * ACL pre-validation is not implemented for ZFS; always report that
 * the operation is unsupported.
 */
int
zfs_freebsd_aclcheck(ap)
	struct vop_aclcheck_args /* {
		struct vnode *vp;
		acl_type_t type;
		struct acl *aclp;
		struct ucred *cred;
		struct thread *td;
	} */ *ap;
{

	return (EOPNOTSUPP);
}
5253
/* Forward declarations of the vnode operation vectors defined below. */
struct vop_vector zfs_vnodeops;
struct vop_vector zfs_fifoops;
struct vop_vector zfs_shareops;
5257
/*
 * Vnode operations for regular ZFS files and directories.  Unlisted
 * operations fall through to default_vnodeops.
 */
struct vop_vector zfs_vnodeops = {
	.vop_default =		&default_vnodeops,
	.vop_inactive =		zfs_freebsd_inactive,
	.vop_reclaim =		zfs_freebsd_reclaim,
	.vop_access =		zfs_freebsd_access,
#ifdef FREEBSD_NAMECACHE
	/* Use the name cache; fall back to zfs_freebsd_lookup on a miss. */
	.vop_lookup =		vfs_cache_lookup,
	.vop_cachedlookup =	zfs_freebsd_lookup,
#else
	.vop_lookup =		zfs_freebsd_lookup,
#endif
	.vop_getattr =		zfs_freebsd_getattr,
	.vop_setattr =		zfs_freebsd_setattr,
	.vop_create =		zfs_freebsd_create,
	.vop_mknod =		zfs_freebsd_create,
	.vop_mkdir =		zfs_freebsd_mkdir,
	.vop_readdir =		zfs_freebsd_readdir,
	.vop_fsync =		zfs_freebsd_fsync,
	.vop_open =		zfs_freebsd_open,
	.vop_close =		zfs_freebsd_close,
	.vop_rmdir =		zfs_freebsd_rmdir,
	.vop_ioctl =		zfs_freebsd_ioctl,
	.vop_link =		zfs_freebsd_link,
	.vop_symlink =		zfs_freebsd_symlink,
	.vop_readlink =		zfs_freebsd_readlink,
	.vop_read =		zfs_freebsd_read,
	.vop_write =		zfs_freebsd_write,
	.vop_remove =		zfs_freebsd_remove,
	.vop_rename =		zfs_freebsd_rename,
	.vop_pathconf =		zfs_freebsd_pathconf,
	.vop_bmap =		VOP_EOPNOTSUPP,
	.vop_fid =		zfs_freebsd_fid,
	.vop_getextattr =	zfs_getextattr,
	.vop_deleteextattr =	zfs_deleteextattr,
	.vop_setextattr =	zfs_setextattr,
	.vop_listextattr =	zfs_listextattr,
	.vop_getacl =		zfs_freebsd_getacl,
	.vop_setacl =		zfs_freebsd_setacl,
	.vop_aclcheck =		zfs_freebsd_aclcheck,
};
5298
/*
 * Vnode operations for fifos on ZFS.  Metadata operations are handled
 * by ZFS; data transfer falls through to fifo_specops, so vop_read and
 * vop_write must never be reached here (hence VOP_PANIC).
 */
struct vop_vector zfs_fifoops = {
	.vop_default =		&fifo_specops,
	.vop_fsync =		zfs_freebsd_fsync,
	.vop_access =		zfs_freebsd_access,
	.vop_getattr =		zfs_freebsd_getattr,
	.vop_inactive =		zfs_freebsd_inactive,
	.vop_read =		VOP_PANIC,
	.vop_reclaim =		zfs_freebsd_reclaim,
	.vop_setattr =		zfs_freebsd_setattr,
	.vop_write =		VOP_PANIC,
	.vop_pathconf = 	zfs_freebsd_fifo_pathconf,
	.vop_fid =		zfs_freebsd_fid,
	.vop_getacl =		zfs_freebsd_getacl,
	.vop_setacl =		zfs_freebsd_setacl,
	.vop_aclcheck =		zfs_freebsd_aclcheck,
};
5315
5316/*
5317 * special share hidden files vnode operations template
5318 */
5319struct vop_vector zfs_shareops = {
5320	.vop_default =		&default_vnodeops,
5321	.vop_access =		zfs_freebsd_access,
5322	.vop_inactive =		zfs_freebsd_inactive,
5323	.vop_reclaim =		zfs_freebsd_reclaim,
5324	.vop_fid =		zfs_freebsd_fid,
5325	.vop_pathconf =		zfs_freebsd_pathconf,
5326};
5327