zfs_vnops.c revision 330991
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
25 * Copyright (c) 2014 Integros [integros.com]
26 * Copyright 2017 Nexenta Systems, Inc.
27 */
28
29/* Portions Copyright 2007 Jeremy Teo */
30/* Portions Copyright 2010 Robert Milkowski */
31
32#include <sys/types.h>
33#include <sys/param.h>
34#include <sys/time.h>
35#include <sys/systm.h>
36#include <sys/sysmacros.h>
37#include <sys/resource.h>
38#include <sys/vfs.h>
39#include <sys/vm.h>
40#include <sys/vnode.h>
41#include <sys/file.h>
42#include <sys/stat.h>
43#include <sys/kmem.h>
44#include <sys/taskq.h>
45#include <sys/uio.h>
46#include <sys/atomic.h>
47#include <sys/namei.h>
48#include <sys/mman.h>
49#include <sys/cmn_err.h>
50#include <sys/errno.h>
51#include <sys/unistd.h>
52#include <sys/zfs_dir.h>
53#include <sys/zfs_ioctl.h>
54#include <sys/fs/zfs.h>
55#include <sys/dmu.h>
56#include <sys/dmu_objset.h>
57#include <sys/spa.h>
58#include <sys/txg.h>
59#include <sys/dbuf.h>
60#include <sys/zap.h>
61#include <sys/sa.h>
62#include <sys/dirent.h>
63#include <sys/policy.h>
64#include <sys/sunddi.h>
65#include <sys/filio.h>
66#include <sys/sid.h>
67#include <sys/zfs_ctldir.h>
68#include <sys/zfs_fuid.h>
69#include <sys/zfs_sa.h>
70#include <sys/zfs_rlock.h>
71#include <sys/extdirent.h>
72#include <sys/kidmap.h>
73#include <sys/bio.h>
74#include <sys/buf.h>
75#include <sys/sched.h>
76#include <sys/acl.h>
77#include <vm/vm_param.h>
78#include <sys/zil.h>
79
80/*
81 * Programming rules.
82 *
83 * Each vnode op performs some logical unit of work.  To do this, the ZPL must
84 * properly lock its in-core state, create a DMU transaction, do the work,
85 * record this work in the intent log (ZIL), commit the DMU transaction,
86 * and wait for the intent log to commit if it is a synchronous operation.
87 * Moreover, the vnode ops must work in both normal and log replay context.
88 * The ordering of events is important to avoid deadlocks and references
89 * to freed memory.  The example below illustrates the following Big Rules:
90 *
91 *  (1)	A check must be made in each zfs thread for a mounted file system.
92 *	This is done avoiding races using ZFS_ENTER(zfsvfs).
93 *	A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
94 *	must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
95 *	can return EIO from the calling function.
96 *
97 *  (2)	VN_RELE() should always be the last thing except for zil_commit()
98 *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
99 *	First, if it's the last reference, the vnode/znode
100 *	can be freed, so the zp may point to freed memory.  Second, the last
101 *	reference will call zfs_zinactive(), which may induce a lot of work --
102 *	pushing cached pages (which acquires range locks) and syncing out
103 *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
104 *	which could deadlock the system if you were already holding one.
105 *	If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
106 *
107 *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
108 *	as they can span dmu_tx_assign() calls.
109 *
110 *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
111 *      dmu_tx_assign().  This is critical because we don't want to block
112 *      while holding locks.
113 *
114 *	If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT.  This
115 *	reduces lock contention and CPU usage when we must wait (note that if
116 *	throughput is constrained by the storage, nearly every transaction
117 *	must wait).
118 *
119 *      Note, in particular, that if a lock is sometimes acquired before
120 *      the tx assigns, and sometimes after (e.g. z_lock), then failing
121 *      to use a non-blocking assign can deadlock the system.  The scenario:
122 *
123 *	Thread A has grabbed a lock before calling dmu_tx_assign().
124 *	Thread B is in an already-assigned tx, and blocks for this lock.
125 *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
126 *	forever, because the previous txg can't quiesce until B's tx commits.
127 *
128 *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
129 *	then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
130 *	calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
131 *	to indicate that this operation has already called dmu_tx_wait().
132 *	This will ensure that we don't retry forever, waiting a short bit
133 *	each time.
134 *
135 *  (5)	If the operation succeeded, generate the intent log entry for it
136 *	before dropping locks.  This ensures that the ordering of events
137 *	in the intent log matches the order in which they actually occurred.
138 *	During ZIL replay the zfs_log_* functions will update the sequence
139 *	number to indicate the zil transaction has replayed.
140 *
141 *  (6)	At the end of each vnode op, the DMU tx must always commit,
142 *	regardless of whether there were any errors.
143 *
144 *  (7)	After dropping all locks, invoke zil_commit(zilog, foid)
145 *	to ensure that synchronous semantics are provided when necessary.
146 *
147 * In general, this is how things should be ordered in each vnode op:
148 *
149 *	ZFS_ENTER(zfsvfs);		// exit if unmounted
150 * top:
151 *	zfs_dirent_lookup(&dl, ...)	// lock directory entry (may VN_HOLD())
152 *	rw_enter(...);			// grab any other locks you need
153 *	tx = dmu_tx_create(...);	// get DMU tx
154 *	dmu_tx_hold_*();		// hold each object you might modify
155 *	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
156 *	if (error) {
157 *		rw_exit(...);		// drop locks
158 *		zfs_dirent_unlock(dl);	// unlock directory entry
159 *		VN_RELE(...);		// release held vnodes
160 *		if (error == ERESTART) {
161 *			waited = B_TRUE;
162 *			dmu_tx_wait(tx);
163 *			dmu_tx_abort(tx);
164 *			goto top;
165 *		}
166 *		dmu_tx_abort(tx);	// abort DMU tx
167 *		ZFS_EXIT(zfsvfs);	// finished in zfs
168 *		return (error);		// really out of space
169 *	}
170 *	error = do_real_work();		// do whatever this VOP does
171 *	if (error == 0)
172 *		zfs_log_*(...);		// on success, make ZIL entry
173 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
174 *	rw_exit(...);			// drop locks
175 *	zfs_dirent_unlock(dl);		// unlock directory entry
176 *	VN_RELE(...);			// release held vnodes
177 *	zil_commit(zilog, foid);	// synchronous when necessary
178 *	ZFS_EXIT(zfsvfs);		// finished in zfs
179 *	return (error);			// done, report error
180 */
181
182/* ARGSUSED */
183static int
184zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
185{
186	znode_t	*zp = VTOZ(*vpp);
187	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
188
189	ZFS_ENTER(zfsvfs);
190	ZFS_VERIFY_ZP(zp);
191
192	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
193	    ((flag & FAPPEND) == 0)) {
194		ZFS_EXIT(zfsvfs);
195		return (SET_ERROR(EPERM));
196	}
197
198	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
199	    ZTOV(zp)->v_type == VREG &&
200	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
201		if (fs_vscan(*vpp, cr, 0) != 0) {
202			ZFS_EXIT(zfsvfs);
203			return (SET_ERROR(EACCES));
204		}
205	}
206
207	/* Keep a count of the synchronous opens in the znode */
208	if (flag & (FSYNC | FDSYNC))
209		atomic_inc_32(&zp->z_sync_cnt);
210
211	ZFS_EXIT(zfsvfs);
212	return (0);
213}
214
215/* ARGSUSED */
216static int
217zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
218    caller_context_t *ct)
219{
220	znode_t	*zp = VTOZ(vp);
221	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
222
223	/*
224	 * Clean up any locks held by this process on the vp.
225	 */
226	cleanlocks(vp, ddi_get_pid(), 0);
227	cleanshares(vp, ddi_get_pid());
228
229	ZFS_ENTER(zfsvfs);
230	ZFS_VERIFY_ZP(zp);
231
232	/* Decrement the synchronous opens in the znode */
233	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
234		atomic_dec_32(&zp->z_sync_cnt);
235
236	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
237	    ZTOV(zp)->v_type == VREG &&
238	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
239		VERIFY(fs_vscan(vp, cr, 1) == 0);
240
241	ZFS_EXIT(zfsvfs);
242	return (0);
243}
244
245/*
246 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
247 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
248 */
249static int
250zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
251{
252	znode_t	*zp = VTOZ(vp);
253	uint64_t noff = (uint64_t)*off; /* new offset */
254	uint64_t file_sz;
255	int error;
256	boolean_t hole;
257
258	file_sz = zp->z_size;
259	if (noff >= file_sz)  {
260		return (SET_ERROR(ENXIO));
261	}
262
263	if (cmd == _FIO_SEEK_HOLE)
264		hole = B_TRUE;
265	else
266		hole = B_FALSE;
267
268	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
269
270	if (error == ESRCH)
271		return (SET_ERROR(ENXIO));
272
273	/*
274	 * We could find a hole that begins after the logical end-of-file,
275	 * because dmu_offset_next() only works on whole blocks.  If the
276	 * EOF falls mid-block, then indicate that the "virtual hole"
277	 * at the end of the file begins at the logical EOF, rather than
278	 * at the end of the last block.
279	 */
280	if (noff > file_sz) {
281		ASSERT(hole);
282		noff = file_sz;
283	}
284
285	if (noff < *off)
286		return (error);
287	*off = noff;
288	return (error);
289}
290
/*
 * Handle the file-level ioctls ZFS supports; anything else gets ENOTTY.
 */
/* ARGSUSED */
static int
zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
    int *rvalp, caller_context_t *ct)
{
	offset_t off;
	offset_t ndata;
	dmu_object_info_t doi;
	int error;
	zfsvfs_t *zfsvfs;
	znode_t *zp;

	switch (com) {
	case _FIOFFS:
	{
		/* "Flush file system" is a no-op for ZFS. */
		return (0);

		/*
		 * The following two ioctls are used by bfu.  Faking out,
		 * necessary to avoid bfu errors.
		 */
	}
	case _FIOGDIO:
	case _FIOSDIO:
	{
		return (0);
	}

	case _FIO_SEEK_DATA:
	case _FIO_SEEK_HOLE:
	{
#ifdef illumos
		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
			return (SET_ERROR(EFAULT));
#else
		/* On FreeBSD "data" is an in-kernel pointer; no copyin. */
		off = *(offset_t *)data;
#endif
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/* offset parameter is in/out */
		error = zfs_holey(vp, com, &off);
		ZFS_EXIT(zfsvfs);
		if (error)
			return (error);
#ifdef illumos
		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
			return (SET_ERROR(EFAULT));
#else
		*(offset_t *)data = off;
#endif
		return (0);
	}
#ifdef illumos
	case _FIO_COUNT_FILLED:
	{
		/*
		 * _FIO_COUNT_FILLED adds a new ioctl command which
		 * exposes the number of filled blocks in a
		 * ZFS object.
		 */
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/*
		 * Wait for all dirty blocks for this object
		 * to get synced out to disk, and the DMU info
		 * updated.
		 */
		error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Retrieve fill count from DMU object.
		 */
		error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		ndata = doi.doi_fill_count;

		ZFS_EXIT(zfsvfs);
		if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
			return (SET_ERROR(EFAULT));
		return (0);
	}
#endif
	}
	/* Unrecognized ioctl command. */
	return (SET_ERROR(ENOTTY));
}
390
/*
 * Look up the page of vp backing file offset "start" and shared-busy it
 * if it is resident and fully valid, sleeping until any exclusive busy
 * drains.  On success the returned page has been write-protected and the
 * [off, off + nbytes) subrange marked clean, and a paging-in-progress
 * reference has been added to the object (released by page_unbusy()).
 * Returns NULL if no fully-valid resident page exists.  The object write
 * lock must be held on entry; it is dropped and reacquired while
 * sleeping but held on return.
 */
static vm_page_t
page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
{
	vm_object_t obj;
	vm_page_t pp;
	int64_t end;

	/*
	 * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
	 * aligned boundaries, if the range is not aligned.  As a result a
	 * DEV_BSIZE subrange with partially dirty data may get marked as clean.
	 * It may happen that all DEV_BSIZE subranges are marked clean and thus
	 * the whole page would be considered clean despite having some dirty
	 * data.  For this reason we should shrink the range to DEV_BSIZE
	 * aligned boundaries before calling vm_page_clear_dirty.
	 */
	end = rounddown2(off + nbytes, DEV_BSIZE);
	off = roundup2(off, DEV_BSIZE);
	nbytes = end - off;

	obj = vp->v_object;
	zfs_vmobject_assert_wlocked(obj);

	for (;;) {
		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
		    pp->valid) {
			if (vm_page_xbusied(pp)) {
				/*
				 * Reference the page before unlocking and
				 * sleeping so that the page daemon is less
				 * likely to reclaim it.
				 */
				vm_page_reference(pp);
				vm_page_lock(pp);
				zfs_vmobject_wunlock(obj);
				vm_page_busy_sleep(pp, "zfsmwb", true);
				zfs_vmobject_wlock(obj);
				continue;
			}
			vm_page_sbusy(pp);
		} else if (pp != NULL) {
			/* A resident but invalid page is treated as absent. */
			ASSERT(!pp->valid);
			pp = NULL;
		}

		if (pp != NULL) {
			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
			vm_object_pip_add(obj, 1);
			/* Force future writers to fault so dirtying is seen. */
			pmap_remove_write(pp);
			if (nbytes != 0)
				vm_page_clear_dirty(pp, off, nbytes);
		}
		break;
	}
	return (pp);
}
447
/*
 * Undo page_busy(): drop the shared-busy state and release the
 * paging-in-progress reference taken on the page's object.
 */
static void
page_unbusy(vm_page_t pp)
{

	vm_page_sunbusy(pp);
	vm_object_pip_subtract(pp->object, 1);
}
455
/*
 * Look up the page of vp backing file offset "start" and place a hold
 * on it if it is resident and fully valid, sleeping until any exclusive
 * busy drains.  Returns the held page (release with page_unhold()), or
 * NULL if no fully-valid page is resident.  The object write lock must
 * be held on entry; it is dropped and reacquired while sleeping but
 * held on return.
 */
static vm_page_t
page_hold(vnode_t *vp, int64_t start)
{
	vm_object_t obj;
	vm_page_t pp;

	obj = vp->v_object;
	zfs_vmobject_assert_wlocked(obj);

	for (;;) {
		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
		    pp->valid) {
			if (vm_page_xbusied(pp)) {
				/*
				 * Reference the page before unlocking and
				 * sleeping so that the page daemon is less
				 * likely to reclaim it.
				 */
				vm_page_reference(pp);
				vm_page_lock(pp);
				zfs_vmobject_wunlock(obj);
				vm_page_busy_sleep(pp, "zfsmwb", true);
				zfs_vmobject_wlock(obj);
				continue;
			}

			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
			vm_page_lock(pp);
			vm_page_hold(pp);
			vm_page_unlock(pp);

		} else
			pp = NULL;
		break;
	}
	return (pp);
}
493
/*
 * Release a hold acquired by page_hold().  The page lock brackets the
 * unhold as required by the vm_page_hold()/vm_page_unhold() protocol.
 */
static void
page_unhold(vm_page_t pp)
{

	vm_page_lock(pp);
	vm_page_unhold(pp);
	vm_page_unlock(pp);
}
502
503/*
504 * When a file is memory mapped, we must keep the IO data synchronized
505 * between the DMU cache and the memory mapped pages.  What this means:
506 *
507 * On Write:	If we find a memory mapped page, we write to *both*
508 *		the page and the dmu buffer.
509 */
510static void
511update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
512    int segflg, dmu_tx_t *tx)
513{
514	vm_object_t obj;
515	struct sf_buf *sf;
516	caddr_t va;
517	int off;
518
519	ASSERT(segflg != UIO_NOCOPY);
520	ASSERT(vp->v_mount != NULL);
521	obj = vp->v_object;
522	ASSERT(obj != NULL);
523
524	off = start & PAGEOFFSET;
525	zfs_vmobject_wlock(obj);
526	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
527		vm_page_t pp;
528		int nbytes = imin(PAGESIZE - off, len);
529
530		if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
531			zfs_vmobject_wunlock(obj);
532
533			va = zfs_map_page(pp, &sf);
534			(void) dmu_read(os, oid, start+off, nbytes,
535			    va+off, DMU_READ_PREFETCH);;
536			zfs_unmap_page(sf);
537
538			zfs_vmobject_wlock(obj);
539			page_unbusy(pp);
540		}
541		len -= nbytes;
542		off = 0;
543	}
544	vm_object_pip_wakeupn(obj, 0);
545	zfs_vmobject_wunlock(obj);
546}
547
/*
 * Read with UIO_NOCOPY flag means that sendfile(2) requests
 * ZFS to populate a range of page cache pages with data.
 *
 * NOTE: this function could be optimized to pre-allocate
 * all pages in advance, drain exclusive busy on all of them,
 * map them into contiguous KVA region and populate them
 * in one single dmu_read() call.
 */
static int
mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
{
	znode_t *zp = VTOZ(vp);
	objset_t *os = zp->z_zfsvfs->z_os;
	struct sf_buf *sf;
	vm_object_t obj;
	vm_page_t pp;
	int64_t start;
	caddr_t va;
	int len = nbytes;
	int off;
	int error = 0;

	ASSERT(uio->uio_segflg == UIO_NOCOPY);
	ASSERT(vp->v_mount != NULL);
	obj = vp->v_object;
	ASSERT(obj != NULL);
	/* The sendfile(2) path always starts on a page boundary. */
	ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);

	zfs_vmobject_wlock(obj);
	for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
		int bytes = MIN(PAGESIZE, len);

		/* Find or allocate the page, taking it shared-busy. */
		pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY |
		    VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
		if (pp->valid == 0) {
			/* Freshly allocated page: fill it from the DMU. */
			zfs_vmobject_wunlock(obj);
			va = zfs_map_page(pp, &sf);
			error = dmu_read(os, zp->z_id, start, bytes, va,
			    DMU_READ_PREFETCH);
			/* Zero the tail of a short final page. */
			if (bytes != PAGESIZE && error == 0)
				bzero(va + bytes, PAGESIZE - bytes);
			zfs_unmap_page(sf);
			zfs_vmobject_wlock(obj);
			vm_page_sunbusy(pp);
			vm_page_lock(pp);
			if (error) {
				/*
				 * On error, free the still-invalid page
				 * unless someone else has gained a use of it.
				 */
				if (pp->wire_count == 0 && pp->valid == 0 &&
				    !vm_page_busied(pp))
					vm_page_free(pp);
			} else {
				pp->valid = VM_PAGE_BITS_ALL;
				vm_page_activate(pp);
			}
			vm_page_unlock(pp);
		} else {
			/* Page is already fully valid; nothing to read. */
			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
			vm_page_sunbusy(pp);
		}
		if (error)
			break;
		uio->uio_resid -= bytes;
		uio->uio_offset += bytes;
		len -= bytes;
	}
	zfs_vmobject_wunlock(obj);
	return (error);
}
616
617/*
618 * When a file is memory mapped, we must keep the IO data synchronized
619 * between the DMU cache and the memory mapped pages.  What this means:
620 *
621 * On Read:	We "read" preferentially from memory mapped pages,
622 *		else we default from the dmu buffer.
623 *
624 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
625 *	 the file is memory mapped.
626 */
627static int
628mappedread(vnode_t *vp, int nbytes, uio_t *uio)
629{
630	znode_t *zp = VTOZ(vp);
631	vm_object_t obj;
632	int64_t start;
633	caddr_t va;
634	int len = nbytes;
635	int off;
636	int error = 0;
637
638	ASSERT(vp->v_mount != NULL);
639	obj = vp->v_object;
640	ASSERT(obj != NULL);
641
642	start = uio->uio_loffset;
643	off = start & PAGEOFFSET;
644	zfs_vmobject_wlock(obj);
645	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
646		vm_page_t pp;
647		uint64_t bytes = MIN(PAGESIZE - off, len);
648
649		if (pp = page_hold(vp, start)) {
650			struct sf_buf *sf;
651			caddr_t va;
652
653			zfs_vmobject_wunlock(obj);
654			va = zfs_map_page(pp, &sf);
655#ifdef illumos
656			error = uiomove(va + off, bytes, UIO_READ, uio);
657#else
658			error = vn_io_fault_uiomove(va + off, bytes, uio);
659#endif
660			zfs_unmap_page(sf);
661			zfs_vmobject_wlock(obj);
662			page_unhold(pp);
663		} else {
664			zfs_vmobject_wunlock(obj);
665			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
666			    uio, bytes);
667			zfs_vmobject_wlock(obj);
668		}
669		len -= bytes;
670		off = 0;
671		if (error)
672			break;
673	}
674	zfs_vmobject_wunlock(obj);
675	return (error);
676}
677
678offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
679
/*
 * Read bytes from specified file into supplied buffer.
 *
 *	IN:	vp	- vnode of file to be read from.
 *		uio	- structure supplying read location, range info,
 *			  and return buffer.
 *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *
 *	OUT:	uio	- updated offset and range, buffer filled.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Side Effects:
 *	vp - atime updated if byte count > 0
 */
/* ARGSUSED */
static int
zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	ssize_t		n, nbytes;
	int		error = 0;
	rl_t		*rl;
	xuio_t		*xuio = NULL;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Anti-virus quarantined files may not be read. */
	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EACCES));
	}

	/*
	 * Validate file offset
	 */
	if (uio->uio_loffset < (offset_t)0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Fasttrack empty reads
	 */
	if (uio->uio_resid == 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	/*
	 * Check for mandatory locks
	 */
	if (MANDMODE(zp->z_mode)) {
		if (error = chklock(vp, FREAD,
		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * If we're in FRSYNC mode, sync out this znode before reading it.
	 */
	if (zfsvfs->z_log &&
	    (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
		zil_commit(zfsvfs->z_log, zp->z_id);

	/*
	 * Lock the range against changes.
	 */
	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);

	/*
	 * If we are reading past end-of-file we can skip
	 * to the end; but we might still need to set atime.
	 */
	if (uio->uio_loffset >= zp->z_size) {
		error = 0;
		goto out;
	}

	ASSERT(uio->uio_loffset < zp->z_size);
	/* Clamp the request to the bytes remaining before EOF. */
	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);

#ifdef illumos
	/* Zero-copy (xuio) read: pre-stage one ARC buffer per block. */
	if ((uio->uio_extflg == UIO_XUIO) &&
	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
		int nblk;
		int blksz = zp->z_blksz;
		uint64_t offset = uio->uio_loffset;

		xuio = (xuio_t *)uio;
		if ((ISP2(blksz))) {
			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
			    blksz)) / blksz;
		} else {
			/* Sub-block files consist of a single block. */
			ASSERT(offset + n <= blksz);
			nblk = 1;
		}
		(void) dmu_xuio_init(xuio, nblk);

		if (vn_has_cached_data(vp)) {
			/*
			 * For simplicity, we always allocate a full buffer
			 * even if we only expect to read a portion of a block.
			 */
			while (--nblk >= 0) {
				(void) dmu_xuio_add(xuio,
				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
				    blksz), 0, blksz);
			}
		}
	}
#endif	/* illumos */

	/* Read in chunks of at most zfs_read_chunk_size bytes each. */
	while (n > 0) {
		nbytes = MIN(n, zfs_read_chunk_size -
		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));

#ifdef __FreeBSD__
		if (uio->uio_segflg == UIO_NOCOPY)
			error = mappedread_sf(vp, nbytes, uio);
		else
#endif /* __FreeBSD__ */
		if (vn_has_cached_data(vp)) {
			error = mappedread(vp, nbytes, uio);
		} else {
			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes);
		}
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}

		n -= nbytes;
	}
out:
	zfs_range_unlock(rl);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);
	return (error);
}
829
830/*
831 * Write the bytes to a file.
832 *
833 *	IN:	vp	- vnode of file to be written to.
834 *		uio	- structure supplying write location, range info,
835 *			  and data buffer.
836 *		ioflag	- FAPPEND, FSYNC, and/or FDSYNC.  FAPPEND is
837 *			  set if in append mode.
838 *		cr	- credentials of caller.
839 *		ct	- caller context (NFS/CIFS fem monitor only)
840 *
841 *	OUT:	uio	- updated offset and range.
842 *
843 *	RETURN:	0 on success, error code on failure.
844 *
845 * Timestamps:
846 *	vp - ctime|mtime updated if byte count > 0
847 */
848
849/* ARGSUSED */
850static int
851zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
852{
853	znode_t		*zp = VTOZ(vp);
854	rlim64_t	limit = MAXOFFSET_T;
855	ssize_t		start_resid = uio->uio_resid;
856	ssize_t		tx_bytes;
857	uint64_t	end_size;
858	dmu_tx_t	*tx;
859	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
860	zilog_t		*zilog;
861	offset_t	woff;
862	ssize_t		n, nbytes;
863	rl_t		*rl;
864	int		max_blksz = zfsvfs->z_max_blksz;
865	int		error = 0;
866	arc_buf_t	*abuf;
867	iovec_t		*aiov = NULL;
868	xuio_t		*xuio = NULL;
869	int		i_iov = 0;
870	int		iovcnt = uio->uio_iovcnt;
871	iovec_t		*iovp = uio->uio_iov;
872	int		write_eof;
873	int		count = 0;
874	sa_bulk_attr_t	bulk[4];
875	uint64_t	mtime[2], ctime[2];
876
877	/*
878	 * Fasttrack empty write
879	 */
880	n = start_resid;
881	if (n == 0)
882		return (0);
883
884	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
885		limit = MAXOFFSET_T;
886
887	ZFS_ENTER(zfsvfs);
888	ZFS_VERIFY_ZP(zp);
889
890	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
891	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
892	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
893	    &zp->z_size, 8);
894	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
895	    &zp->z_pflags, 8);
896
897	/*
898	 * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our
899	 * callers might not be able to detect properly that we are read-only,
900	 * so check it explicitly here.
901	 */
902	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
903		ZFS_EXIT(zfsvfs);
904		return (SET_ERROR(EROFS));
905	}
906
907	/*
908	 * If immutable or not appending then return EPERM.
909	 * Intentionally allow ZFS_READONLY through here.
910	 * See zfs_zaccess_common()
911	 */
912	if ((zp->z_pflags & ZFS_IMMUTABLE) ||
913	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
914	    (uio->uio_loffset < zp->z_size))) {
915		ZFS_EXIT(zfsvfs);
916		return (SET_ERROR(EPERM));
917	}
918
919	zilog = zfsvfs->z_log;
920
921	/*
922	 * Validate file offset
923	 */
924	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
925	if (woff < 0) {
926		ZFS_EXIT(zfsvfs);
927		return (SET_ERROR(EINVAL));
928	}
929
930	/*
931	 * Check for mandatory locks before calling zfs_range_lock()
932	 * in order to prevent a deadlock with locks set via fcntl().
933	 */
934	if (MANDMODE((mode_t)zp->z_mode) &&
935	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
936		ZFS_EXIT(zfsvfs);
937		return (error);
938	}
939
940#ifdef illumos
941	/*
942	 * Pre-fault the pages to ensure slow (eg NFS) pages
943	 * don't hold up txg.
944	 * Skip this if uio contains loaned arc_buf.
945	 */
946	if ((uio->uio_extflg == UIO_XUIO) &&
947	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
948		xuio = (xuio_t *)uio;
949	else
950		uio_prefaultpages(MIN(n, max_blksz), uio);
951#endif
952
953	/*
954	 * If in append mode, set the io offset pointer to eof.
955	 */
956	if (ioflag & FAPPEND) {
957		/*
958		 * Obtain an appending range lock to guarantee file append
959		 * semantics.  We reset the write offset once we have the lock.
960		 */
961		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
962		woff = rl->r_off;
963		if (rl->r_len == UINT64_MAX) {
964			/*
965			 * We overlocked the file because this write will cause
966			 * the file block size to increase.
967			 * Note that zp_size cannot change with this lock held.
968			 */
969			woff = zp->z_size;
970		}
971		uio->uio_loffset = woff;
972	} else {
973		/*
974		 * Note that if the file block size will change as a result of
975		 * this write, then this range lock will lock the entire file
976		 * so that we can re-write the block safely.
977		 */
978		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
979	}
980
981	if (vn_rlimit_fsize(vp, uio, uio->uio_td)) {
982		zfs_range_unlock(rl);
983		ZFS_EXIT(zfsvfs);
984		return (EFBIG);
985	}
986
987	if (woff >= limit) {
988		zfs_range_unlock(rl);
989		ZFS_EXIT(zfsvfs);
990		return (SET_ERROR(EFBIG));
991	}
992
993	if ((woff + n) > limit || woff > (limit - n))
994		n = limit - woff;
995
996	/* Will this write extend the file length? */
997	write_eof = (woff + n > zp->z_size);
998
999	end_size = MAX(zp->z_size, woff + n);
1000
1001	/*
1002	 * Write the file in reasonable size chunks.  Each chunk is written
1003	 * in a separate transaction; this keeps the intent log records small
1004	 * and allows us to do more fine-grained space accounting.
1005	 */
1006	while (n > 0) {
1007		abuf = NULL;
1008		woff = uio->uio_loffset;
1009		if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
1010		    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
1011			if (abuf != NULL)
1012				dmu_return_arcbuf(abuf);
1013			error = SET_ERROR(EDQUOT);
1014			break;
1015		}
1016
1017		if (xuio && abuf == NULL) {
1018			ASSERT(i_iov < iovcnt);
1019			aiov = &iovp[i_iov];
1020			abuf = dmu_xuio_arcbuf(xuio, i_iov);
1021			dmu_xuio_clear(xuio, i_iov);
1022			DTRACE_PROBE3(zfs_cp_write, int, i_iov,
1023			    iovec_t *, aiov, arc_buf_t *, abuf);
1024			ASSERT((aiov->iov_base == abuf->b_data) ||
1025			    ((char *)aiov->iov_base - (char *)abuf->b_data +
1026			    aiov->iov_len == arc_buf_size(abuf)));
1027			i_iov++;
1028		} else if (abuf == NULL && n >= max_blksz &&
1029		    woff >= zp->z_size &&
1030		    P2PHASE(woff, max_blksz) == 0 &&
1031		    zp->z_blksz == max_blksz) {
1032			/*
1033			 * This write covers a full block.  "Borrow" a buffer
1034			 * from the dmu so that we can fill it before we enter
1035			 * a transaction.  This avoids the possibility of
1036			 * holding up the transaction if the data copy hangs
1037			 * up on a pagefault (e.g., from an NFS server mapping).
1038			 */
1039			size_t cbytes;
1040
1041			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
1042			    max_blksz);
1043			ASSERT(abuf != NULL);
1044			ASSERT(arc_buf_size(abuf) == max_blksz);
1045			if (error = uiocopy(abuf->b_data, max_blksz,
1046			    UIO_WRITE, uio, &cbytes)) {
1047				dmu_return_arcbuf(abuf);
1048				break;
1049			}
1050			ASSERT(cbytes == max_blksz);
1051		}
1052
1053		/*
1054		 * Start a transaction.
1055		 */
1056		tx = dmu_tx_create(zfsvfs->z_os);
1057		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1058		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
1059		zfs_sa_upgrade_txholds(tx, zp);
1060		error = dmu_tx_assign(tx, TXG_WAIT);
1061		if (error) {
1062			dmu_tx_abort(tx);
1063			if (abuf != NULL)
1064				dmu_return_arcbuf(abuf);
1065			break;
1066		}
1067
1068		/*
1069		 * If zfs_range_lock() over-locked we grow the blocksize
1070		 * and then reduce the lock range.  This will only happen
1071		 * on the first iteration since zfs_range_reduce() will
1072		 * shrink down r_len to the appropriate size.
1073		 */
1074		if (rl->r_len == UINT64_MAX) {
1075			uint64_t new_blksz;
1076
1077			if (zp->z_blksz > max_blksz) {
1078				/*
1079				 * File's blocksize is already larger than the
1080				 * "recordsize" property.  Only let it grow to
1081				 * the next power of 2.
1082				 */
1083				ASSERT(!ISP2(zp->z_blksz));
1084				new_blksz = MIN(end_size,
1085				    1 << highbit64(zp->z_blksz));
1086			} else {
1087				new_blksz = MIN(end_size, max_blksz);
1088			}
1089			zfs_grow_blocksize(zp, new_blksz, tx);
1090			zfs_range_reduce(rl, woff, n);
1091		}
1092
1093		/*
1094		 * XXX - should we really limit each write to z_max_blksz?
1095		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
1096		 */
1097		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
1098
1099		if (woff + nbytes > zp->z_size)
1100			vnode_pager_setsize(vp, woff + nbytes);
1101
1102		if (abuf == NULL) {
1103			tx_bytes = uio->uio_resid;
1104			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
1105			    uio, nbytes, tx);
1106			tx_bytes -= uio->uio_resid;
1107		} else {
1108			tx_bytes = nbytes;
1109			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
1110			/*
1111			 * If this is not a full block write, but we are
1112			 * extending the file past EOF and this data starts
1113			 * block-aligned, use assign_arcbuf().  Otherwise,
1114			 * write via dmu_write().
1115			 */
1116			if (tx_bytes < max_blksz && (!write_eof ||
1117			    aiov->iov_base != abuf->b_data)) {
1118				ASSERT(xuio);
1119				dmu_write(zfsvfs->z_os, zp->z_id, woff,
1120				    aiov->iov_len, aiov->iov_base, tx);
1121				dmu_return_arcbuf(abuf);
1122				xuio_stat_wbuf_copied();
1123			} else {
1124				ASSERT(xuio || tx_bytes == max_blksz);
1125				dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
1126				    woff, abuf, tx);
1127			}
1128			ASSERT(tx_bytes <= uio->uio_resid);
1129			uioskip(uio, tx_bytes);
1130		}
1131		if (tx_bytes && vn_has_cached_data(vp)) {
1132			update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
1133			    zp->z_id, uio->uio_segflg, tx);
1134		}
1135
1136		/*
1137		 * If we made no progress, we're done.  If we made even
1138		 * partial progress, update the znode and ZIL accordingly.
1139		 */
1140		if (tx_bytes == 0) {
1141			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
1142			    (void *)&zp->z_size, sizeof (uint64_t), tx);
1143			dmu_tx_commit(tx);
1144			ASSERT(error != 0);
1145			break;
1146		}
1147
1148		/*
1149		 * Clear Set-UID/Set-GID bits on successful write if not
1150		 * privileged and at least one of the excute bits is set.
1151		 *
1152		 * It would be nice to to this after all writes have
1153		 * been done, but that would still expose the ISUID/ISGID
1154		 * to another app after the partial write is committed.
1155		 *
1156		 * Note: we don't call zfs_fuid_map_id() here because
1157		 * user 0 is not an ephemeral uid.
1158		 */
1159		mutex_enter(&zp->z_acl_lock);
1160		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
1161		    (S_IXUSR >> 6))) != 0 &&
1162		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
1163		    secpolicy_vnode_setid_retain(vp, cr,
1164		    (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
1165			uint64_t newmode;
1166			zp->z_mode &= ~(S_ISUID | S_ISGID);
1167			newmode = zp->z_mode;
1168			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
1169			    (void *)&newmode, sizeof (uint64_t), tx);
1170		}
1171		mutex_exit(&zp->z_acl_lock);
1172
1173		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
1174		    B_TRUE);
1175
1176		/*
1177		 * Update the file size (zp_size) if it has changed;
1178		 * account for possible concurrent updates.
1179		 */
1180		while ((end_size = zp->z_size) < uio->uio_loffset) {
1181			(void) atomic_cas_64(&zp->z_size, end_size,
1182			    uio->uio_loffset);
1183#ifdef illumos
1184			ASSERT(error == 0);
1185#else
1186			ASSERT(error == 0 || error == EFAULT);
1187#endif
1188		}
1189		/*
1190		 * If we are replaying and eof is non zero then force
1191		 * the file size to the specified eof. Note, there's no
1192		 * concurrency during replay.
1193		 */
1194		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
1195			zp->z_size = zfsvfs->z_replay_eof;
1196
1197		if (error == 0)
1198			error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1199		else
1200			(void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1201
1202		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
1203		dmu_tx_commit(tx);
1204
1205		if (error != 0)
1206			break;
1207		ASSERT(tx_bytes == nbytes);
1208		n -= nbytes;
1209
1210#ifdef illumos
1211		if (!xuio && n > 0)
1212			uio_prefaultpages(MIN(n, max_blksz), uio);
1213#endif
1214	}
1215
1216	zfs_range_unlock(rl);
1217
1218	/*
1219	 * If we're in replay mode, or we made no progress, return error.
1220	 * Otherwise, it's at least a partial write, so it's successful.
1221	 */
1222	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
1223		ZFS_EXIT(zfsvfs);
1224		return (error);
1225	}
1226
1227#ifdef __FreeBSD__
1228	/*
1229	 * EFAULT means that at least one page of the source buffer was not
1230	 * available.  VFS will re-try remaining I/O upon this error.
1231	 */
1232	if (error == EFAULT) {
1233		ZFS_EXIT(zfsvfs);
1234		return (error);
1235	}
1236#endif
1237
1238	if (ioflag & (FSYNC | FDSYNC) ||
1239	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1240		zil_commit(zilog, zp->z_id);
1241
1242	ZFS_EXIT(zfsvfs);
1243	return (0);
1244}
1245
/*
 * Completion callback for zfs_get_data().
 *
 * Releases the dbuf hold (if one was taken for an indirect write), the
 * range lock, and the vnode hold; on success records the just-written
 * block in the lwb; finally frees the zgd itself.
 */
void
zfs_get_done(zgd_t *zgd, int error)
{
	znode_t *zp = zgd->zgd_private;
	objset_t *os = zp->z_zfsvfs->z_os;

	/* Drop the dbuf hold taken by the indirect-write path, if any. */
	if (zgd->zgd_db)
		dmu_buf_rele(zgd->zgd_db, zgd);

	zfs_range_unlock(zgd->zgd_rl);

	/*
	 * Release the vnode asynchronously as we currently have the
	 * txg stopped from syncing.
	 */
	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));

	/* Let the ZIL track the block so it can be freed with the lwb. */
	if (error == 0 && zgd->zgd_bp)
		zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);

	kmem_free(zgd, sizeof (zgd_t));
}
1268
#ifdef DEBUG
/*
 * Debug-only fault-injection knob: when set non-zero, the next indirect
 * write handled by zfs_get_data() fails with EIO and the knob clears
 * itself (see the DEBUG block in zfs_get_data()).
 */
static int zil_fault_io = 0;
#endif
1272
1273/*
1274 * Get data to generate a TX_WRITE intent log record.
1275 */
1276int
1277zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
1278{
1279	zfsvfs_t *zfsvfs = arg;
1280	objset_t *os = zfsvfs->z_os;
1281	znode_t *zp;
1282	uint64_t object = lr->lr_foid;
1283	uint64_t offset = lr->lr_offset;
1284	uint64_t size = lr->lr_length;
1285	dmu_buf_t *db;
1286	zgd_t *zgd;
1287	int error = 0;
1288
1289	ASSERT3P(lwb, !=, NULL);
1290	ASSERT3P(zio, !=, NULL);
1291	ASSERT3U(size, !=, 0);
1292
1293	/*
1294	 * Nothing to do if the file has been removed
1295	 */
1296	if (zfs_zget(zfsvfs, object, &zp) != 0)
1297		return (SET_ERROR(ENOENT));
1298	if (zp->z_unlinked) {
1299		/*
1300		 * Release the vnode asynchronously as we currently have the
1301		 * txg stopped from syncing.
1302		 */
1303		VN_RELE_ASYNC(ZTOV(zp),
1304		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1305		return (SET_ERROR(ENOENT));
1306	}
1307
1308	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
1309	zgd->zgd_lwb = lwb;
1310	zgd->zgd_private = zp;
1311
1312	/*
1313	 * Write records come in two flavors: immediate and indirect.
1314	 * For small writes it's cheaper to store the data with the
1315	 * log record (immediate); for large writes it's cheaper to
1316	 * sync the data and get a pointer to it (indirect) so that
1317	 * we don't have to write the data twice.
1318	 */
1319	if (buf != NULL) { /* immediate write */
1320		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
1321		/* test for truncation needs to be done while range locked */
1322		if (offset >= zp->z_size) {
1323			error = SET_ERROR(ENOENT);
1324		} else {
1325			error = dmu_read(os, object, offset, size, buf,
1326			    DMU_READ_NO_PREFETCH);
1327		}
1328		ASSERT(error == 0 || error == ENOENT);
1329	} else { /* indirect write */
1330		/*
1331		 * Have to lock the whole block to ensure when it's
1332		 * written out and its checksum is being calculated
1333		 * that no one can change the data. We need to re-check
1334		 * blocksize after we get the lock in case it's changed!
1335		 */
1336		for (;;) {
1337			uint64_t blkoff;
1338			size = zp->z_blksz;
1339			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
1340			offset -= blkoff;
1341			zgd->zgd_rl = zfs_range_lock(zp, offset, size,
1342			    RL_READER);
1343			if (zp->z_blksz == size)
1344				break;
1345			offset += blkoff;
1346			zfs_range_unlock(zgd->zgd_rl);
1347		}
1348		/* test for truncation needs to be done while range locked */
1349		if (lr->lr_offset >= zp->z_size)
1350			error = SET_ERROR(ENOENT);
1351#ifdef DEBUG
1352		if (zil_fault_io) {
1353			error = SET_ERROR(EIO);
1354			zil_fault_io = 0;
1355		}
1356#endif
1357		if (error == 0)
1358			error = dmu_buf_hold(os, object, offset, zgd, &db,
1359			    DMU_READ_NO_PREFETCH);
1360
1361		if (error == 0) {
1362			blkptr_t *bp = &lr->lr_blkptr;
1363
1364			zgd->zgd_db = db;
1365			zgd->zgd_bp = bp;
1366
1367			ASSERT(db->db_offset == offset);
1368			ASSERT(db->db_size == size);
1369
1370			error = dmu_sync(zio, lr->lr_common.lrc_txg,
1371			    zfs_get_done, zgd);
1372			ASSERT(error || lr->lr_length <= size);
1373
1374			/*
1375			 * On success, we need to wait for the write I/O
1376			 * initiated by dmu_sync() to complete before we can
1377			 * release this dbuf.  We will finish everything up
1378			 * in the zfs_get_done() callback.
1379			 */
1380			if (error == 0)
1381				return (0);
1382
1383			if (error == EALREADY) {
1384				lr->lr_common.lrc_txtype = TX_WRITE2;
1385				error = 0;
1386			}
1387		}
1388	}
1389
1390	zfs_get_done(zgd, error);
1391
1392	return (error);
1393}
1394
1395/*ARGSUSED*/
1396static int
1397zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
1398    caller_context_t *ct)
1399{
1400	znode_t *zp = VTOZ(vp);
1401	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1402	int error;
1403
1404	ZFS_ENTER(zfsvfs);
1405	ZFS_VERIFY_ZP(zp);
1406
1407	if (flag & V_ACE_MASK)
1408		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
1409	else
1410		error = zfs_zaccess_rwx(zp, mode, flag, cr);
1411
1412	ZFS_EXIT(zfsvfs);
1413	return (error);
1414}
1415
/*
 * Callback for vn_vget_ino_gen() used by the ".." lookup path: lock the
 * vnode passed in "arg" with the requested flags; drop the reference if
 * locking fails.
 */
static int
zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
{
	int rc;

	*vpp = arg;
	rc = vn_lock(*vpp, lkflags);
	if (rc == 0)
		return (0);
	vrele(*vpp);
	return (rc);
}
1427
/*
 * Lock the vnode produced by looking up "name" in directory "dvp",
 * observing FreeBSD's lock-order rules for the "." and ".." special
 * cases.  On failure the reference taken on the vnode is released.
 */
static int
zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
{
	znode_t *zdp = VTOZ(dvp);
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
	int error;
	int ltype;

	ASSERT_VOP_LOCKED(dvp, __func__);
#ifdef DIAGNOSTIC
	/* The teardown lock may only be held for xattr directories here. */
	if ((zdp->z_pflags & ZFS_XATTR) == 0)
		VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
#endif

	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
		/* "." lookup: same vnode; only the lock type may differ. */
		ASSERT3P(dvp, ==, vp);
		vref(dvp);
		ltype = lkflags & LK_TYPE_MASK;
		if (ltype != VOP_ISLOCKED(dvp)) {
			if (ltype == LK_EXCLUSIVE)
				vn_lock(dvp, LK_UPGRADE | LK_RETRY);
			else /* if (ltype == LK_SHARED) */
				vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);

			/*
			 * Relock for the "." case could leave us with
			 * reclaimed vnode.
			 */
			if (dvp->v_iflag & VI_DOOMED) {
				vrele(dvp);
				return (SET_ERROR(ENOENT));
			}
		}
		return (0);
	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
		/*
		 * Note that in this case, dvp is the child vnode, and we
		 * are looking up the parent vnode - exactly reverse from
		 * normal operation.  Unlocking dvp requires some rather
		 * tricky unlock/relock dance to prevent mp from being freed;
		 * use vn_vget_ino_gen() which takes care of all that.
		 *
		 * XXX Note that there is a time window when both vnodes are
		 * unlocked.  It is possible, although highly unlikely, that
		 * during that window the parent-child relationship between
		 * the vnodes may change, for example, get reversed.
		 * In that case we would have a wrong lock order for the vnodes.
		 * All other filesystems seem to ignore this problem, so we
		 * do the same here.
		 * A potential solution could be implemented as follows:
		 * - using LK_NOWAIT when locking the second vnode and retrying
		 *   if necessary
		 * - checking that the parent-child relationship still holds
		 *   after locking both vnodes and retrying if it doesn't
		 */
		error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
		return (error);
	} else {
		/* Ordinary child lookup: child locks after parent. */
		error = vn_lock(vp, lkflags);
		if (error != 0)
			vrele(vp);
		return (error);
	}
}
1492
1493/*
1494 * Lookup an entry in a directory, or an extended attribute directory.
1495 * If it exists, return a held vnode reference for it.
1496 *
1497 *	IN:	dvp	- vnode of directory to search.
1498 *		nm	- name of entry to lookup.
1499 *		pnp	- full pathname to lookup [UNUSED].
1500 *		flags	- LOOKUP_XATTR set if looking for an attribute.
1501 *		rdir	- root directory vnode [UNUSED].
1502 *		cr	- credentials of caller.
1503 *		ct	- caller context
1504 *
1505 *	OUT:	vpp	- vnode of located entry, NULL if not found.
1506 *
1507 *	RETURN:	0 on success, error code on failure.
1508 *
1509 * Timestamps:
1510 *	NA
1511 */
1512/* ARGSUSED */
1513static int
1514zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
1515    int nameiop, cred_t *cr, kthread_t *td, int flags)
1516{
1517	znode_t *zdp = VTOZ(dvp);
1518	znode_t *zp;
1519	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1520	int	error = 0;
1521
1522	/*
1523	 * Fast path lookup, however we must skip DNLC lookup
1524	 * for case folding or normalizing lookups because the
1525	 * DNLC code only stores the passed in name.  This means
1526	 * creating 'a' and removing 'A' on a case insensitive
1527	 * file system would work, but DNLC still thinks 'a'
1528	 * exists and won't let you create it again on the next
1529	 * pass through fast path.
1530	 */
1531	if (!(flags & LOOKUP_XATTR)) {
1532		if (dvp->v_type != VDIR) {
1533			return (SET_ERROR(ENOTDIR));
1534		} else if (zdp->z_sa_hdl == NULL) {
1535			return (SET_ERROR(EIO));
1536		}
1537	}
1538
1539	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
1540
1541	ZFS_ENTER(zfsvfs);
1542	ZFS_VERIFY_ZP(zdp);
1543
1544	*vpp = NULL;
1545
1546	if (flags & LOOKUP_XATTR) {
1547#ifdef TODO
1548		/*
1549		 * If the xattr property is off, refuse the lookup request.
1550		 */
1551		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
1552			ZFS_EXIT(zfsvfs);
1553			return (SET_ERROR(EINVAL));
1554		}
1555#endif
1556
1557		/*
1558		 * We don't allow recursive attributes..
1559		 * Maybe someday we will.
1560		 */
1561		if (zdp->z_pflags & ZFS_XATTR) {
1562			ZFS_EXIT(zfsvfs);
1563			return (SET_ERROR(EINVAL));
1564		}
1565
1566		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
1567			ZFS_EXIT(zfsvfs);
1568			return (error);
1569		}
1570
1571		/*
1572		 * Do we have permission to get into attribute directory?
1573		 */
1574		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
1575		    B_FALSE, cr)) {
1576			vrele(*vpp);
1577			*vpp = NULL;
1578		}
1579
1580		ZFS_EXIT(zfsvfs);
1581		return (error);
1582	}
1583
1584	/*
1585	 * Check accessibility of directory.
1586	 */
1587	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
1588		ZFS_EXIT(zfsvfs);
1589		return (error);
1590	}
1591
1592	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
1593	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1594		ZFS_EXIT(zfsvfs);
1595		return (SET_ERROR(EILSEQ));
1596	}
1597
1598
1599	/*
1600	 * First handle the special cases.
1601	 */
1602	if ((cnp->cn_flags & ISDOTDOT) != 0) {
1603		/*
1604		 * If we are a snapshot mounted under .zfs, return
1605		 * the vp for the snapshot directory.
1606		 */
1607		if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
1608			struct componentname cn;
1609			vnode_t *zfsctl_vp;
1610			int ltype;
1611
1612			ZFS_EXIT(zfsvfs);
1613			ltype = VOP_ISLOCKED(dvp);
1614			VOP_UNLOCK(dvp, 0);
1615			error = zfsctl_root(zfsvfs->z_parent, LK_SHARED,
1616			    &zfsctl_vp);
1617			if (error == 0) {
1618				cn.cn_nameptr = "snapshot";
1619				cn.cn_namelen = strlen(cn.cn_nameptr);
1620				cn.cn_nameiop = cnp->cn_nameiop;
1621				cn.cn_flags = cnp->cn_flags & ~ISDOTDOT;
1622				cn.cn_lkflags = cnp->cn_lkflags;
1623				error = VOP_LOOKUP(zfsctl_vp, vpp, &cn);
1624				vput(zfsctl_vp);
1625			}
1626			vn_lock(dvp, ltype | LK_RETRY);
1627			return (error);
1628		}
1629	}
1630	if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
1631		ZFS_EXIT(zfsvfs);
1632		if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
1633			return (SET_ERROR(ENOTSUP));
1634		error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp);
1635		return (error);
1636	}
1637
1638	/*
1639	 * The loop is retry the lookup if the parent-child relationship
1640	 * changes during the dot-dot locking complexities.
1641	 */
1642	for (;;) {
1643		uint64_t parent;
1644
1645		error = zfs_dirlook(zdp, nm, &zp);
1646		if (error == 0)
1647			*vpp = ZTOV(zp);
1648
1649		ZFS_EXIT(zfsvfs);
1650		if (error != 0)
1651			break;
1652
1653		error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
1654		if (error != 0) {
1655			/*
1656			 * If we've got a locking error, then the vnode
1657			 * got reclaimed because of a force unmount.
1658			 * We never enter doomed vnodes into the name cache.
1659			 */
1660			*vpp = NULL;
1661			return (error);
1662		}
1663
1664		if ((cnp->cn_flags & ISDOTDOT) == 0)
1665			break;
1666
1667		ZFS_ENTER(zfsvfs);
1668		if (zdp->z_sa_hdl == NULL) {
1669			error = SET_ERROR(EIO);
1670		} else {
1671			error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
1672			    &parent, sizeof (parent));
1673		}
1674		if (error != 0) {
1675			ZFS_EXIT(zfsvfs);
1676			vput(ZTOV(zp));
1677			break;
1678		}
1679		if (zp->z_id == parent) {
1680			ZFS_EXIT(zfsvfs);
1681			break;
1682		}
1683		vput(ZTOV(zp));
1684	}
1685
1686out:
1687	if (error != 0)
1688		*vpp = NULL;
1689
1690	/* Translate errors and add SAVENAME when needed. */
1691	if (cnp->cn_flags & ISLASTCN) {
1692		switch (nameiop) {
1693		case CREATE:
1694		case RENAME:
1695			if (error == ENOENT) {
1696				error = EJUSTRETURN;
1697				cnp->cn_flags |= SAVENAME;
1698				break;
1699			}
1700			/* FALLTHROUGH */
1701		case DELETE:
1702			if (error == 0)
1703				cnp->cn_flags |= SAVENAME;
1704			break;
1705		}
1706	}
1707
1708	/* Insert name into cache (as non-existent) if appropriate. */
1709	if (zfsvfs->z_use_namecache &&
1710	    error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
1711		cache_enter(dvp, NULL, cnp);
1712
1713	/* Insert name into cache if appropriate. */
1714	if (zfsvfs->z_use_namecache &&
1715	    error == 0 && (cnp->cn_flags & MAKEENTRY)) {
1716		if (!(cnp->cn_flags & ISLASTCN) ||
1717		    (nameiop != DELETE && nameiop != RENAME)) {
1718			cache_enter(dvp, *vpp, cnp);
1719		}
1720	}
1721
1722	return (error);
1723}
1724
1725/*
1726 * Attempt to create a new entry in a directory.  If the entry
1727 * already exists, truncate the file if permissible, else return
1728 * an error.  Return the vp of the created or trunc'd file.
1729 *
1730 *	IN:	dvp	- vnode of directory to put new file entry in.
1731 *		name	- name of new file entry.
1732 *		vap	- attributes of new file.
1733 *		excl	- flag indicating exclusive or non-exclusive mode.
1734 *		mode	- mode to open file with.
1735 *		cr	- credentials of caller.
1736 *		flag	- large file flag [UNUSED].
1737 *		ct	- caller context
1738 *		vsecp	- ACL to be set
1739 *
1740 *	OUT:	vpp	- vnode of created or trunc'd entry.
1741 *
1742 *	RETURN:	0 on success, error code on failure.
1743 *
1744 * Timestamps:
1745 *	dvp - ctime|mtime updated if new entry created
1746 *	 vp - ctime|mtime always, atime if new
1747 */
1748
1749/* ARGSUSED */
1750static int
1751zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
1752    vnode_t **vpp, cred_t *cr, kthread_t *td)
1753{
1754	znode_t		*zp, *dzp = VTOZ(dvp);
1755	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1756	zilog_t		*zilog;
1757	objset_t	*os;
1758	dmu_tx_t	*tx;
1759	int		error;
1760	ksid_t		*ksid;
1761	uid_t		uid;
1762	gid_t		gid = crgetgid(cr);
1763	zfs_acl_ids_t   acl_ids;
1764	boolean_t	fuid_dirtied;
1765	void		*vsecp = NULL;
1766	int		flag = 0;
1767	uint64_t	txtype;
1768
1769	/*
1770	 * If we have an ephemeral id, ACL, or XVATTR then
1771	 * make sure file system is at proper version
1772	 */
1773
1774	ksid = crgetsid(cr, KSID_OWNER);
1775	if (ksid)
1776		uid = ksid_getid(ksid);
1777	else
1778		uid = crgetuid(cr);
1779
1780	if (zfsvfs->z_use_fuids == B_FALSE &&
1781	    (vsecp || (vap->va_mask & AT_XVATTR) ||
1782	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1783		return (SET_ERROR(EINVAL));
1784
1785	ZFS_ENTER(zfsvfs);
1786	ZFS_VERIFY_ZP(dzp);
1787	os = zfsvfs->z_os;
1788	zilog = zfsvfs->z_log;
1789
1790	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
1791	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1792		ZFS_EXIT(zfsvfs);
1793		return (SET_ERROR(EILSEQ));
1794	}
1795
1796	if (vap->va_mask & AT_XVATTR) {
1797		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
1798		    crgetuid(cr), cr, vap->va_type)) != 0) {
1799			ZFS_EXIT(zfsvfs);
1800			return (error);
1801		}
1802	}
1803
1804	*vpp = NULL;
1805
1806	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
1807		vap->va_mode &= ~S_ISVTX;
1808
1809	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
1810	if (error) {
1811		ZFS_EXIT(zfsvfs);
1812		return (error);
1813	}
1814	ASSERT3P(zp, ==, NULL);
1815
1816	/*
1817	 * Create a new file object and update the directory
1818	 * to reference it.
1819	 */
1820	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
1821		goto out;
1822	}
1823
1824	/*
1825	 * We only support the creation of regular files in
1826	 * extended attribute directories.
1827	 */
1828
1829	if ((dzp->z_pflags & ZFS_XATTR) &&
1830	    (vap->va_type != VREG)) {
1831		error = SET_ERROR(EINVAL);
1832		goto out;
1833	}
1834
1835	if ((error = zfs_acl_ids_create(dzp, 0, vap,
1836	    cr, vsecp, &acl_ids)) != 0)
1837		goto out;
1838
1839	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1840		zfs_acl_ids_free(&acl_ids);
1841		error = SET_ERROR(EDQUOT);
1842		goto out;
1843	}
1844
1845	getnewvnode_reserve(1);
1846
1847	tx = dmu_tx_create(os);
1848
1849	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1850	    ZFS_SA_BASE_ATTR_SIZE);
1851
1852	fuid_dirtied = zfsvfs->z_fuid_dirty;
1853	if (fuid_dirtied)
1854		zfs_fuid_txhold(zfsvfs, tx);
1855	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1856	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
1857	if (!zfsvfs->z_use_sa &&
1858	    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1859		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1860		    0, acl_ids.z_aclp->z_acl_bytes);
1861	}
1862	error = dmu_tx_assign(tx, TXG_WAIT);
1863	if (error) {
1864		zfs_acl_ids_free(&acl_ids);
1865		dmu_tx_abort(tx);
1866		getnewvnode_drop_reserve();
1867		ZFS_EXIT(zfsvfs);
1868		return (error);
1869	}
1870	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1871
1872	if (fuid_dirtied)
1873		zfs_fuid_sync(zfsvfs, tx);
1874
1875	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
1876	txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1877	zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1878	    vsecp, acl_ids.z_fuidp, vap);
1879	zfs_acl_ids_free(&acl_ids);
1880	dmu_tx_commit(tx);
1881
1882	getnewvnode_drop_reserve();
1883
1884out:
1885	if (error == 0) {
1886		*vpp = ZTOV(zp);
1887	}
1888
1889	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1890		zil_commit(zilog, 0);
1891
1892	ZFS_EXIT(zfsvfs);
1893	return (error);
1894}
1895
1896/*
1897 * Remove an entry from a directory.
1898 *
1899 *	IN:	dvp	- vnode of directory to remove entry from.
1900 *		name	- name of entry to remove.
1901 *		cr	- credentials of caller.
1902 *		ct	- caller context
1903 *		flags	- case flags
1904 *
1905 *	RETURN:	0 on success, error code on failure.
1906 *
1907 * Timestamps:
1908 *	dvp - ctime|mtime
1909 *	 vp - ctime (if nlink > 0)
1910 */
1911
1912/*ARGSUSED*/
1913static int
1914zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
1915{
1916	znode_t		*dzp = VTOZ(dvp);
1917	znode_t		*zp = VTOZ(vp);
1918	znode_t		*xzp;
1919	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1920	zilog_t		*zilog;
1921	uint64_t	acl_obj, xattr_obj;
1922	uint64_t	obj = 0;
1923	dmu_tx_t	*tx;
1924	boolean_t	unlinked, toobig = FALSE;
1925	uint64_t	txtype;
1926	int		error;
1927
1928	ZFS_ENTER(zfsvfs);
1929	ZFS_VERIFY_ZP(dzp);
1930	ZFS_VERIFY_ZP(zp);
1931	zilog = zfsvfs->z_log;
1932	zp = VTOZ(vp);
1933
1934	xattr_obj = 0;
1935	xzp = NULL;
1936
1937	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1938		goto out;
1939	}
1940
1941	/*
1942	 * Need to use rmdir for removing directories.
1943	 */
1944	if (vp->v_type == VDIR) {
1945		error = SET_ERROR(EPERM);
1946		goto out;
1947	}
1948
1949	vnevent_remove(vp, dvp, name, ct);
1950
1951	obj = zp->z_id;
1952
1953	/* are there any extended attributes? */
1954	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1955	    &xattr_obj, sizeof (xattr_obj));
1956	if (error == 0 && xattr_obj) {
1957		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1958		ASSERT0(error);
1959	}
1960
1961	/*
1962	 * We may delete the znode now, or we may put it in the unlinked set;
1963	 * it depends on whether we're the last link, and on whether there are
1964	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
1965	 * allow for either case.
1966	 */
1967	tx = dmu_tx_create(zfsvfs->z_os);
1968	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1969	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1970	zfs_sa_upgrade_txholds(tx, zp);
1971	zfs_sa_upgrade_txholds(tx, dzp);
1972
1973	if (xzp) {
1974		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1975		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1976	}
1977
1978	/* charge as an update -- would be nice not to charge at all */
1979	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1980
1981	/*
1982	 * Mark this transaction as typically resulting in a net free of space
1983	 */
1984	dmu_tx_mark_netfree(tx);
1985
1986	error = dmu_tx_assign(tx, TXG_WAIT);
1987	if (error) {
1988		dmu_tx_abort(tx);
1989		ZFS_EXIT(zfsvfs);
1990		return (error);
1991	}
1992
1993	/*
1994	 * Remove the directory entry.
1995	 */
1996	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);
1997
1998	if (error) {
1999		dmu_tx_commit(tx);
2000		goto out;
2001	}
2002
2003	if (unlinked) {
2004		zfs_unlinked_add(zp, tx);
2005		vp->v_vflag |= VV_NOSYNC;
2006	}
2007
2008	txtype = TX_REMOVE;
2009	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
2010
2011	dmu_tx_commit(tx);
2012out:
2013
2014	if (xzp)
2015		vrele(ZTOV(xzp));
2016
2017	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2018		zil_commit(zilog, 0);
2019
2020	ZFS_EXIT(zfsvfs);
2021	return (error);
2022}
2023
2024/*
2025 * Create a new directory and insert it into dvp using the name
2026 * provided.  Return a pointer to the inserted directory.
2027 *
2028 *	IN:	dvp	- vnode of directory to add subdir to.
2029 *		dirname	- name of new directory.
2030 *		vap	- attributes of new directory.
2031 *		cr	- credentials of caller.
2032 *		ct	- caller context
2033 *		flags	- case flags
2034 *		vsecp	- ACL to be set
2035 *
2036 *	OUT:	vpp	- vnode of created directory.
2037 *
2038 *	RETURN:	0 on success, error code on failure.
2039 *
2040 * Timestamps:
2041 *	dvp - ctime|mtime updated
2042 *	 vp - ctime|mtime|atime updated
2043 */
2044/*ARGSUSED*/
2045static int
2046zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
2047{
2048	znode_t		*zp, *dzp = VTOZ(dvp);
2049	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2050	zilog_t		*zilog;
2051	uint64_t	txtype;
2052	dmu_tx_t	*tx;
2053	int		error;
2054	ksid_t		*ksid;
2055	uid_t		uid;
2056	gid_t		gid = crgetgid(cr);
2057	zfs_acl_ids_t   acl_ids;
2058	boolean_t	fuid_dirtied;
2059
2060	ASSERT(vap->va_type == VDIR);
2061
2062	/*
2063	 * If we have an ephemeral id, ACL, or XVATTR then
2064	 * make sure file system is at proper version
2065	 */
2066
2067	ksid = crgetsid(cr, KSID_OWNER);
2068	if (ksid)
2069		uid = ksid_getid(ksid);
2070	else
2071		uid = crgetuid(cr);
2072	if (zfsvfs->z_use_fuids == B_FALSE &&
2073	    ((vap->va_mask & AT_XVATTR) ||
2074	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
2075		return (SET_ERROR(EINVAL));
2076
2077	ZFS_ENTER(zfsvfs);
2078	ZFS_VERIFY_ZP(dzp);
2079	zilog = zfsvfs->z_log;
2080
2081	if (dzp->z_pflags & ZFS_XATTR) {
2082		ZFS_EXIT(zfsvfs);
2083		return (SET_ERROR(EINVAL));
2084	}
2085
2086	if (zfsvfs->z_utf8 && u8_validate(dirname,
2087	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2088		ZFS_EXIT(zfsvfs);
2089		return (SET_ERROR(EILSEQ));
2090	}
2091
2092	if (vap->va_mask & AT_XVATTR) {
2093		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
2094		    crgetuid(cr), cr, vap->va_type)) != 0) {
2095			ZFS_EXIT(zfsvfs);
2096			return (error);
2097		}
2098	}
2099
2100	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
2101	    NULL, &acl_ids)) != 0) {
2102		ZFS_EXIT(zfsvfs);
2103		return (error);
2104	}
2105
2106	/*
2107	 * First make sure the new directory doesn't exist.
2108	 *
2109	 * Existence is checked first to make sure we don't return
2110	 * EACCES instead of EEXIST which can cause some applications
2111	 * to fail.
2112	 */
2113	*vpp = NULL;
2114
2115	if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) {
2116		zfs_acl_ids_free(&acl_ids);
2117		ZFS_EXIT(zfsvfs);
2118		return (error);
2119	}
2120	ASSERT3P(zp, ==, NULL);
2121
2122	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
2123		zfs_acl_ids_free(&acl_ids);
2124		ZFS_EXIT(zfsvfs);
2125		return (error);
2126	}
2127
2128	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
2129		zfs_acl_ids_free(&acl_ids);
2130		ZFS_EXIT(zfsvfs);
2131		return (SET_ERROR(EDQUOT));
2132	}
2133
2134	/*
2135	 * Add a new entry to the directory.
2136	 */
2137	getnewvnode_reserve(1);
2138	tx = dmu_tx_create(zfsvfs->z_os);
2139	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
2140	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2141	fuid_dirtied = zfsvfs->z_fuid_dirty;
2142	if (fuid_dirtied)
2143		zfs_fuid_txhold(zfsvfs, tx);
2144	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2145		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
2146		    acl_ids.z_aclp->z_acl_bytes);
2147	}
2148
2149	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
2150	    ZFS_SA_BASE_ATTR_SIZE);
2151
2152	error = dmu_tx_assign(tx, TXG_WAIT);
2153	if (error) {
2154		zfs_acl_ids_free(&acl_ids);
2155		dmu_tx_abort(tx);
2156		getnewvnode_drop_reserve();
2157		ZFS_EXIT(zfsvfs);
2158		return (error);
2159	}
2160
2161	/*
2162	 * Create new node.
2163	 */
2164	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
2165
2166	if (fuid_dirtied)
2167		zfs_fuid_sync(zfsvfs, tx);
2168
2169	/*
2170	 * Now put new name in parent dir.
2171	 */
2172	(void) zfs_link_create(dzp, dirname, zp, tx, ZNEW);
2173
2174	*vpp = ZTOV(zp);
2175
2176	txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
2177	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
2178	    acl_ids.z_fuidp, vap);
2179
2180	zfs_acl_ids_free(&acl_ids);
2181
2182	dmu_tx_commit(tx);
2183
2184	getnewvnode_drop_reserve();
2185
2186	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2187		zil_commit(zilog, 0);
2188
2189	ZFS_EXIT(zfsvfs);
2190	return (0);
2191}
2192
2193/*
2194 * Remove a directory subdir entry.  If the current working
2195 * directory is the same as the subdir to be removed, the
2196 * remove will fail.
2197 *
2198 *	IN:	dvp	- vnode of directory to remove from.
2199 *		name	- name of directory to be removed.
2200 *		cwd	- vnode of current working directory.
2201 *		cr	- credentials of caller.
2202 *		ct	- caller context
2203 *		flags	- case flags
2204 *
2205 *	RETURN:	0 on success, error code on failure.
2206 *
2207 * Timestamps:
2208 *	dvp - ctime|mtime updated
2209 */
2210/*ARGSUSED*/
2211static int
2212zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
2213{
2214	znode_t		*dzp = VTOZ(dvp);
2215	znode_t		*zp = VTOZ(vp);
2216	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2217	zilog_t		*zilog;
2218	dmu_tx_t	*tx;
2219	int		error;
2220
2221	ZFS_ENTER(zfsvfs);
2222	ZFS_VERIFY_ZP(dzp);
2223	ZFS_VERIFY_ZP(zp);
2224	zilog = zfsvfs->z_log;
2225
2226
2227	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
2228		goto out;
2229	}
2230
2231	if (vp->v_type != VDIR) {
2232		error = SET_ERROR(ENOTDIR);
2233		goto out;
2234	}
2235
2236	vnevent_rmdir(vp, dvp, name, ct);
2237
2238	tx = dmu_tx_create(zfsvfs->z_os);
2239	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2240	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2241	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2242	zfs_sa_upgrade_txholds(tx, zp);
2243	zfs_sa_upgrade_txholds(tx, dzp);
2244	dmu_tx_mark_netfree(tx);
2245	error = dmu_tx_assign(tx, TXG_WAIT);
2246	if (error) {
2247		dmu_tx_abort(tx);
2248		ZFS_EXIT(zfsvfs);
2249		return (error);
2250	}
2251
2252	cache_purge(dvp);
2253
2254	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);
2255
2256	if (error == 0) {
2257		uint64_t txtype = TX_RMDIR;
2258		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
2259	}
2260
2261	dmu_tx_commit(tx);
2262
2263	cache_purge(vp);
2264out:
2265	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2266		zil_commit(zilog, 0);
2267
2268	ZFS_EXIT(zfsvfs);
2269	return (error);
2270}
2271
2272/*
2273 * Read as many directory entries as will fit into the provided
2274 * buffer from the given directory cursor position (specified in
2275 * the uio structure).
2276 *
 *	IN:	vp	- vnode of directory to read.
 *		uio	- structure supplying read location, range info,
 *			  and return buffer.
 *		cr	- credentials of caller.
 *
 *	OUT:	uio	- updated offset and range, buffer filled.
 *		eofp	- set to true if end-of-file detected.
 *		ncookies - number of seek cookies returned (if requested).
 *		cookies	- array of seek cookies, one per entry (if requested).
2286 *
2287 *	RETURN:	0 on success, error code on failure.
2288 *
2289 * Timestamps:
2290 *	vp - atime updated
2291 *
2292 * Note that the low 4 bits of the cookie returned by zap is always zero.
2293 * This allows us to use the low range for "special" directory entries:
2294 * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
2295 * we use the offset 2 for the '.zfs' directory.
2296 */
/* ARGSUSED */
static int
zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
{
	znode_t		*zp = VTOZ(vp);
	iovec_t		*iovp;
	edirent_t	*eodp;	/* extended-entry view of the output buffer */
	dirent64_t	*odp;	/* normal-entry view of the output buffer */
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	objset_t	*os;
	caddr_t		outbuf;	/* bounce buffer when uio isn't 1 kernel iovec */
	size_t		bufsize;
	zap_cursor_t	zc;
	zap_attribute_t	zap;
	uint_t		bytes_wanted;
	uint64_t	offset; /* must be unsigned; checks for < 1 */
	uint64_t	parent;	/* object number reported for ".." */
	int		local_eof;
	int		outcount;	/* bytes of entries emitted so far */
	int		error;
	uint8_t		prefetch;
	boolean_t	check_sysattrs;
	uint8_t		type;
	int		ncooks;		/* remaining seek-cookie slots */
	u_long		*cooks = NULL;	/* next seek-cookie slot to fill */
	int		flags = 0;	/* V_RDDIR_* flags; always 0 here */

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Fetch the parent object number for the ".." entry. */
	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
	    &parent, sizeof (parent))) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * If we are not given an eof variable,
	 * use a local one.
	 */
	if (eofp == NULL)
		eofp = &local_eof;

	/*
	 * Check for valid iov_len.
	 */
	if (uio->uio_iov->iov_len <= 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Quit if directory has been removed (posix)
	 */
	if ((*eofp = zp->z_unlinked) != 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	error = 0;
	os = zfsvfs->z_os;
	offset = uio->uio_loffset;
	prefetch = zp->z_zn_prefetch;

	/*
	 * Initialize the iterator cursor.  Offsets 0-2 are the synthetic
	 * '.', '..', and '.zfs' entries (see block comment above); real
	 * ZAP cursor offsets always have the low 4 bits clear, so any
	 * offset <= 3 means "start from the beginning".
	 */
	if (offset <= 3) {
		/*
		 * Start iteration from the beginning of the directory.
		 */
		zap_cursor_init(&zc, os, zp->z_id);
	} else {
		/*
		 * The offset is a serialized cursor.
		 */
		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
	}

	/*
	 * Get space to change directory entries into fs independent format.
	 * A single kernel-space iovec can be filled in place; anything else
	 * goes through a kmem bounce buffer and a final uiomove().
	 */
	iovp = uio->uio_iov;
	bytes_wanted = iovp->iov_len;
	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
		bufsize = bytes_wanted;
		outbuf = kmem_alloc(bufsize, KM_SLEEP);
		odp = (struct dirent64 *)outbuf;
	} else {
		bufsize = bytes_wanted;
		outbuf = NULL;
		odp = (struct dirent64 *)iovp->iov_base;
	}
	eodp = (struct edirent *)odp;

	if (ncookies != NULL) {
		/*
		 * Minimum entry size is dirent size and 1 byte for a file name.
		 */
		ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
		cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
		*cookies = cooks;
		*ncookies = ncooks;
	}
	/*
	 * If this VFS supports the system attribute view interface; and
	 * we're looking at an extended attribute directory; and we care
	 * about normalization conflicts on this vfs; then we must check
	 * for normalization conflicts with the sysattr name space.
	 */
#ifdef TODO
	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
	    (flags & V_RDDIR_ENTFLAGS);
#else
	check_sysattrs = 0;
#endif

	/*
	 * Transform to file-system independent format
	 */
	outcount = 0;
	while (outcount < bytes_wanted) {
		ino64_t objnum;
		ushort_t reclen;
		off64_t *next = NULL;

		/*
		 * Special case `.', `..', and `.zfs'.
		 */
		if (offset == 0) {
			(void) strcpy(zap.za_name, ".");
			zap.za_normalization_conflict = 0;
			objnum = zp->z_id;
			type = DT_DIR;
		} else if (offset == 1) {
			(void) strcpy(zap.za_name, "..");
			zap.za_normalization_conflict = 0;
			objnum = parent;
			type = DT_DIR;
		} else if (offset == 2 && zfs_show_ctldir(zp)) {
			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
			zap.za_normalization_conflict = 0;
			objnum = ZFSCTL_INO_ROOT;
			type = DT_DIR;
		} else {
			/*
			 * Grab next entry.  ENOENT from the cursor means we
			 * have walked off the end of the directory.
			 */
			if (error = zap_cursor_retrieve(&zc, &zap)) {
				if ((*eofp = (error == ENOENT)) != 0)
					break;
				else
					goto update;
			}

			/*
			 * Directory entries are single 8-byte integers
			 * (type byte + object number); anything else is
			 * on-disk corruption.
			 */
			if (zap.za_integer_length != 8 ||
			    zap.za_num_integers != 1) {
				cmn_err(CE_WARN, "zap_readdir: bad directory "
				    "entry, obj = %lld, offset = %lld\n",
				    (u_longlong_t)zp->z_id,
				    (u_longlong_t)offset);
				error = SET_ERROR(ENXIO);
				goto update;
			}

			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
			/*
			 * MacOS X can extract the object type here such as:
			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
			 */
			type = ZFS_DIRENT_TYPE(zap.za_first_integer);

			if (check_sysattrs && !zap.za_normalization_conflict) {
#ifdef TODO
				zap.za_normalization_conflict =
				    xattr_sysattr_casechk(zap.za_name);
#else
				panic("%s:%u: TODO", __func__, __LINE__);
#endif
			}
		}

		/* Dead code on this path: flags is always 0 (see above). */
		if (flags & V_RDDIR_ACCFILTER) {
			/*
			 * If we have no access at all, don't include
			 * this entry in the returned information
			 */
			znode_t	*ezp;
			if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
				goto skip_entry;
			if (!zfs_has_access(ezp, cr)) {
				vrele(ZTOV(ezp));
				goto skip_entry;
			}
			vrele(ZTOV(ezp));
		}

		if (flags & V_RDDIR_ENTFLAGS)
			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
		else
			reclen = DIRENT64_RECLEN(strlen(zap.za_name));

		/*
		 * Will this entry fit in the buffer?
		 */
		if (outcount + reclen > bufsize) {
			/*
			 * Did we manage to fit anything in the buffer?
			 */
			if (!outcount) {
				error = SET_ERROR(EINVAL);
				goto update;
			}
			break;
		}
		if (flags & V_RDDIR_ENTFLAGS) {
			/*
			 * Add extended flag entry:
			 */
			eodp->ed_ino = objnum;
			eodp->ed_reclen = reclen;
			/* NOTE: ed_off is the offset for the *next* entry */
			next = &(eodp->ed_off);
			eodp->ed_eflags = zap.za_normalization_conflict ?
			    ED_CASE_CONFLICT : 0;
			(void) strncpy(eodp->ed_name, zap.za_name,
			    EDIRENT_NAMELEN(reclen));
			eodp = (edirent_t *)((intptr_t)eodp + reclen);
		} else {
			/*
			 * Add normal entry:
			 */
			odp->d_ino = objnum;
			odp->d_reclen = reclen;
			odp->d_namlen = strlen(zap.za_name);
			(void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
			odp->d_type = type;
			odp = (dirent64_t *)((intptr_t)odp + reclen);
		}
		outcount += reclen;

		ASSERT(outcount <= bufsize);

		/* Prefetch znode */
		if (prefetch)
			dmu_prefetch(os, objnum, 0, 0, 0,
			    ZIO_PRIORITY_SYNC_READ);

	skip_entry:
		/*
		 * Move to the next entry, fill in the previous offset.
		 * Synthetic entries just bump the offset; real entries
		 * advance the ZAP cursor and re-serialize it.
		 */
		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
			zap_cursor_advance(&zc);
			offset = zap_cursor_serialize(&zc);
		} else {
			offset += 1;
		}

		/* Record the seek cookie (offset of the *next* entry). */
		if (cooks != NULL) {
			*cooks++ = offset;
			ncooks--;
			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
		}
	}
	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */

	/* Subtract unused cookies */
	if (ncookies != NULL)
		*ncookies -= ncooks;

	/*
	 * In-place fill: just consume the iovec.  Otherwise copy the bounce
	 * buffer out; on uiomove failure, rewind our offset so the caller
	 * can retry from the same position.
	 */
	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
		iovp->iov_base += outcount;
		iovp->iov_len -= outcount;
		uio->uio_resid -= outcount;
	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
		/*
		 * Reset the pointer.
		 */
		offset = uio->uio_loffset;
	}

update:
	zap_cursor_fini(&zc);
	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
		kmem_free(outbuf, bufsize);

	/* ENOENT just means end-of-directory, not an error. */
	if (error == ENOENT)
		error = 0;

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);

	uio->uio_loffset = offset;
	ZFS_EXIT(zfsvfs);
	/* On failure, release the cookie array rather than return junk. */
	if (error != 0 && cookies != NULL) {
		free(*cookies, M_TEMP);
		*cookies = NULL;
		*ncookies = 0;
	}
	return (error);
}
2599
2600ulong_t zfs_fsync_sync_cnt = 4;
2601
2602static int
2603zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
2604{
2605	znode_t	*zp = VTOZ(vp);
2606	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2607
2608	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2609
2610	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
2611		ZFS_ENTER(zfsvfs);
2612		ZFS_VERIFY_ZP(zp);
2613		zil_commit(zfsvfs->z_log, zp->z_id);
2614		ZFS_EXIT(zfsvfs);
2615	}
2616	return (0);
2617}
2618
2619
2620/*
2621 * Get the requested file attributes and place them in the provided
2622 * vattr structure.
2623 *
2624 *	IN:	vp	- vnode of file.
2625 *		vap	- va_mask identifies requested attributes.
2626 *			  If AT_XVATTR set, then optional attrs are requested
2627 *		flags	- ATTR_NOACLCHECK (CIFS server context)
2628 *		cr	- credentials of caller.
2629 *		ct	- caller context
2630 *
2631 *	OUT:	vap	- attribute values.
2632 *
2633 *	RETURN:	0 (always succeeds).
2634 */
/* ARGSUSED */
static int
zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int	error = 0;
	uint32_t blksize;
	u_longlong_t nblocks;
	uint64_t links;
	uint64_t mtime[2], ctime[2], crtime[2], rdev;
	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
	xoptattr_t *xoap = NULL;
	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
	/* At most 4 attrs are queued below: mtime, ctime, crtime, rdev. */
	sa_bulk_attr_t bulk[4];
	int count = 0;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);

	/* Fetch the timestamps (and rdev for devices) in one SA lookup. */
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
		    &rdev, 8);

	if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
	 * Also, if we are the owner don't bother, since owner should
	 * always be allowed to read basic attributes of file.
	 */
	if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
	    (vap->va_uid != crgetuid(cr))) {
		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
		    skipaclchk, cr)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * Return all attributes.  It's cheaper to provide the answer
	 * than to determine whether we were asked the question.
	 */

	vap->va_type = IFTOVT(zp->z_mode);
	vap->va_mode = zp->z_mode & ~S_IFMT;
#ifdef illumos
	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
#else
	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
#endif
	vap->va_nodeid = zp->z_id;
	/* Count the hidden '.zfs' directory as a link on the fs root. */
	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
		links = zp->z_links + 1;
	else
		links = zp->z_links;
	vap->va_nlink = MIN(links, LINK_MAX);	/* nlink_t limit! */
	vap->va_size = zp->z_size;
#ifdef illumos
	vap->va_rdev = vp->v_rdev;
#else
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		vap->va_rdev = zfs_cmpldev(rdev);
#endif
	vap->va_seq = zp->z_seq;
	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */
	vap->va_filerev = zp->z_seq;

	/*
	 * Add in any requested optional attributes and the create time.
	 * Also set the corresponding bits in the returned attribute bitmap.
	 */
	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
			xoap->xoa_archive =
			    ((zp->z_pflags & ZFS_ARCHIVE) != 0);
			XVA_SET_RTN(xvap, XAT_ARCHIVE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
			xoap->xoa_readonly =
			    ((zp->z_pflags & ZFS_READONLY) != 0);
			XVA_SET_RTN(xvap, XAT_READONLY);
		}

		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
			xoap->xoa_system =
			    ((zp->z_pflags & ZFS_SYSTEM) != 0);
			XVA_SET_RTN(xvap, XAT_SYSTEM);
		}

		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
			xoap->xoa_hidden =
			    ((zp->z_pflags & ZFS_HIDDEN) != 0);
			XVA_SET_RTN(xvap, XAT_HIDDEN);
		}

		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
			xoap->xoa_nounlink =
			    ((zp->z_pflags & ZFS_NOUNLINK) != 0);
			XVA_SET_RTN(xvap, XAT_NOUNLINK);
		}

		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
			xoap->xoa_immutable =
			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
			xoap->xoa_appendonly =
			    ((zp->z_pflags & ZFS_APPENDONLY) != 0);
			XVA_SET_RTN(xvap, XAT_APPENDONLY);
		}

		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
			xoap->xoa_nodump =
			    ((zp->z_pflags & ZFS_NODUMP) != 0);
			XVA_SET_RTN(xvap, XAT_NODUMP);
		}

		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
			xoap->xoa_opaque =
			    ((zp->z_pflags & ZFS_OPAQUE) != 0);
			XVA_SET_RTN(xvap, XAT_OPAQUE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
			xoap->xoa_av_quarantined =
			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
			xoap->xoa_av_modified =
			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
		}

		/* Scanstamps only exist on regular files. */
		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
		    vp->v_type == VREG) {
			zfs_sa_get_scanstamp(zp, xvap);
		}

		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
			xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
			XVA_SET_RTN(xvap, XAT_REPARSE);
		}
		if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
			xoap->xoa_generation = zp->z_gen;
			XVA_SET_RTN(xvap, XAT_GEN);
		}

		if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
			xoap->xoa_offline =
			    ((zp->z_pflags & ZFS_OFFLINE) != 0);
			XVA_SET_RTN(xvap, XAT_OFFLINE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
			xoap->xoa_sparse =
			    ((zp->z_pflags & ZFS_SPARSE) != 0);
			XVA_SET_RTN(xvap, XAT_SPARSE);
		}
	}

	ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
	ZFS_TIME_DECODE(&vap->va_ctime, ctime);
	ZFS_TIME_DECODE(&vap->va_birthtime, crtime);


	sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
	vap->va_blksize = blksize;
	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */

	if (zp->z_blksz == 0) {
		/*
		 * Block size hasn't been set; suggest maximal I/O transfers.
		 */
		vap->va_blksize = zfsvfs->z_max_blksz;
	}

	ZFS_EXIT(zfsvfs);
	return (0);
}
2831
2832/*
2833 * Set the file attributes to the values contained in the
2834 * vattr structure.
2835 *
2836 *	IN:	vp	- vnode of file to be modified.
2837 *		vap	- new attribute values.
2838 *			  If AT_XVATTR set, then optional attrs are being set
2839 *		flags	- ATTR_UTIME set if non-default time values provided.
2840 *			- ATTR_NOACLCHECK (CIFS context only).
2841 *		cr	- credentials of caller.
2842 *		ct	- caller context
2843 *
2844 *	RETURN:	0 on success, error code on failure.
2845 *
2846 * Timestamps:
2847 *	vp - ctime updated, mtime updated if size changed.
2848 */
2849/* ARGSUSED */
2850static int
2851zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2852    caller_context_t *ct)
2853{
2854	znode_t		*zp = VTOZ(vp);
2855	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2856	zilog_t		*zilog;
2857	dmu_tx_t	*tx;
2858	vattr_t		oldva;
2859	xvattr_t	tmpxvattr;
2860	uint_t		mask = vap->va_mask;
2861	uint_t		saved_mask = 0;
2862	uint64_t	saved_mode;
2863	int		trim_mask = 0;
2864	uint64_t	new_mode;
2865	uint64_t	new_uid, new_gid;
2866	uint64_t	xattr_obj;
2867	uint64_t	mtime[2], ctime[2];
2868	znode_t		*attrzp;
2869	int		need_policy = FALSE;
2870	int		err, err2;
2871	zfs_fuid_info_t *fuidp = NULL;
2872	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
2873	xoptattr_t	*xoap;
2874	zfs_acl_t	*aclp;
2875	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2876	boolean_t	fuid_dirtied = B_FALSE;
2877	sa_bulk_attr_t	bulk[7], xattr_bulk[7];
2878	int		count = 0, xattr_count = 0;
2879
2880	if (mask == 0)
2881		return (0);
2882
2883	if (mask & AT_NOSET)
2884		return (SET_ERROR(EINVAL));
2885
2886	ZFS_ENTER(zfsvfs);
2887	ZFS_VERIFY_ZP(zp);
2888
2889	zilog = zfsvfs->z_log;
2890
2891	/*
2892	 * Make sure that if we have ephemeral uid/gid or xvattr specified
2893	 * that file system is at proper version level
2894	 */
2895
2896	if (zfsvfs->z_use_fuids == B_FALSE &&
2897	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2898	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2899	    (mask & AT_XVATTR))) {
2900		ZFS_EXIT(zfsvfs);
2901		return (SET_ERROR(EINVAL));
2902	}
2903
2904	if (mask & AT_SIZE && vp->v_type == VDIR) {
2905		ZFS_EXIT(zfsvfs);
2906		return (SET_ERROR(EISDIR));
2907	}
2908
2909	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
2910		ZFS_EXIT(zfsvfs);
2911		return (SET_ERROR(EINVAL));
2912	}
2913
2914	/*
2915	 * If this is an xvattr_t, then get a pointer to the structure of
2916	 * optional attributes.  If this is NULL, then we have a vattr_t.
2917	 */
2918	xoap = xva_getxoptattr(xvap);
2919
2920	xva_init(&tmpxvattr);
2921
2922	/*
2923	 * Immutable files can only alter immutable bit and atime
2924	 */
2925	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
2926	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
2927	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2928		ZFS_EXIT(zfsvfs);
2929		return (SET_ERROR(EPERM));
2930	}
2931
2932	/*
2933	 * Note: ZFS_READONLY is handled in zfs_zaccess_common.
2934	 */
2935
2936	/*
2937	 * Verify timestamps doesn't overflow 32 bits.
2938	 * ZFS can handle large timestamps, but 32bit syscalls can't
2939	 * handle times greater than 2039.  This check should be removed
2940	 * once large timestamps are fully supported.
2941	 */
2942	if (mask & (AT_ATIME | AT_MTIME)) {
2943		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2944		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2945			ZFS_EXIT(zfsvfs);
2946			return (SET_ERROR(EOVERFLOW));
2947		}
2948	}
2949	if (xoap && (mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME) &&
2950	    TIMESPEC_OVERFLOW(&vap->va_birthtime)) {
2951		ZFS_EXIT(zfsvfs);
2952		return (SET_ERROR(EOVERFLOW));
2953	}
2954
2955	attrzp = NULL;
2956	aclp = NULL;
2957
2958	/* Can this be moved to before the top label? */
2959	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
2960		ZFS_EXIT(zfsvfs);
2961		return (SET_ERROR(EROFS));
2962	}
2963
2964	/*
2965	 * First validate permissions
2966	 */
2967
2968	if (mask & AT_SIZE) {
2969		/*
2970		 * XXX - Note, we are not providing any open
2971		 * mode flags here (like FNDELAY), so we may
2972		 * block if there are locks present... this
2973		 * should be addressed in openat().
2974		 */
2975		/* XXX - would it be OK to generate a log record here? */
2976		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2977		if (err) {
2978			ZFS_EXIT(zfsvfs);
2979			return (err);
2980		}
2981	}
2982
2983	if (mask & (AT_ATIME|AT_MTIME) ||
2984	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2985	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2986	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2987	    XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
2988	    XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
2989	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2990	    XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
2991		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
2992		    skipaclchk, cr);
2993	}
2994
2995	if (mask & (AT_UID|AT_GID)) {
2996		int	idmask = (mask & (AT_UID|AT_GID));
2997		int	take_owner;
2998		int	take_group;
2999
3000		/*
3001		 * NOTE: even if a new mode is being set,
3002		 * we may clear S_ISUID/S_ISGID bits.
3003		 */
3004
3005		if (!(mask & AT_MODE))
3006			vap->va_mode = zp->z_mode;
3007
3008		/*
3009		 * Take ownership or chgrp to group we are a member of
3010		 */
3011
3012		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
3013		take_group = (mask & AT_GID) &&
3014		    zfs_groupmember(zfsvfs, vap->va_gid, cr);
3015
3016		/*
3017		 * If both AT_UID and AT_GID are set then take_owner and
3018		 * take_group must both be set in order to allow taking
3019		 * ownership.
3020		 *
3021		 * Otherwise, send the check through secpolicy_vnode_setattr()
3022		 *
3023		 */
3024
3025		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
3026		    ((idmask == AT_UID) && take_owner) ||
3027		    ((idmask == AT_GID) && take_group)) {
3028			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
3029			    skipaclchk, cr) == 0) {
3030				/*
3031				 * Remove setuid/setgid for non-privileged users
3032				 */
3033				secpolicy_setid_clear(vap, vp, cr);
3034				trim_mask = (mask & (AT_UID|AT_GID));
3035			} else {
3036				need_policy =  TRUE;
3037			}
3038		} else {
3039			need_policy =  TRUE;
3040		}
3041	}
3042
3043	oldva.va_mode = zp->z_mode;
3044	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
3045	if (mask & AT_XVATTR) {
3046		/*
3047		 * Update xvattr mask to include only those attributes
3048		 * that are actually changing.
3049		 *
3050		 * the bits will be restored prior to actually setting
3051		 * the attributes so the caller thinks they were set.
3052		 */
3053		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
3054			if (xoap->xoa_appendonly !=
3055			    ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
3056				need_policy = TRUE;
3057			} else {
3058				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
3059				XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
3060			}
3061		}
3062
3063		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
3064			if (xoap->xoa_nounlink !=
3065			    ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
3066				need_policy = TRUE;
3067			} else {
3068				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
3069				XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
3070			}
3071		}
3072
3073		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
3074			if (xoap->xoa_immutable !=
3075			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
3076				need_policy = TRUE;
3077			} else {
3078				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
3079				XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
3080			}
3081		}
3082
3083		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
3084			if (xoap->xoa_nodump !=
3085			    ((zp->z_pflags & ZFS_NODUMP) != 0)) {
3086				need_policy = TRUE;
3087			} else {
3088				XVA_CLR_REQ(xvap, XAT_NODUMP);
3089				XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
3090			}
3091		}
3092
3093		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
3094			if (xoap->xoa_av_modified !=
3095			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
3096				need_policy = TRUE;
3097			} else {
3098				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
3099				XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
3100			}
3101		}
3102
3103		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
3104			if ((vp->v_type != VREG &&
3105			    xoap->xoa_av_quarantined) ||
3106			    xoap->xoa_av_quarantined !=
3107			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
3108				need_policy = TRUE;
3109			} else {
3110				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
3111				XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
3112			}
3113		}
3114
3115		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
3116			ZFS_EXIT(zfsvfs);
3117			return (SET_ERROR(EPERM));
3118		}
3119
3120		if (need_policy == FALSE &&
3121		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
3122		    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
3123			need_policy = TRUE;
3124		}
3125	}
3126
3127	if (mask & AT_MODE) {
3128		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
3129			err = secpolicy_setid_setsticky_clear(vp, vap,
3130			    &oldva, cr);
3131			if (err) {
3132				ZFS_EXIT(zfsvfs);
3133				return (err);
3134			}
3135			trim_mask |= AT_MODE;
3136		} else {
3137			need_policy = TRUE;
3138		}
3139	}
3140
3141	if (need_policy) {
3142		/*
3143		 * If trim_mask is set then take ownership
3144		 * has been granted or write_acl is present and user
3145		 * has the ability to modify mode.  In that case remove
3146		 * UID|GID and or MODE from mask so that
3147		 * secpolicy_vnode_setattr() doesn't revoke it.
3148		 */
3149
3150		if (trim_mask) {
3151			saved_mask = vap->va_mask;
3152			vap->va_mask &= ~trim_mask;
3153			if (trim_mask & AT_MODE) {
3154				/*
3155				 * Save the mode, as secpolicy_vnode_setattr()
3156				 * will overwrite it with ova.va_mode.
3157				 */
3158				saved_mode = vap->va_mode;
3159			}
3160		}
3161		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
3162		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
3163		if (err) {
3164			ZFS_EXIT(zfsvfs);
3165			return (err);
3166		}
3167
3168		if (trim_mask) {
3169			vap->va_mask |= saved_mask;
3170			if (trim_mask & AT_MODE) {
3171				/*
3172				 * Recover the mode after
3173				 * secpolicy_vnode_setattr().
3174				 */
3175				vap->va_mode = saved_mode;
3176			}
3177		}
3178	}
3179
3180	/*
3181	 * secpolicy_vnode_setattr, or take ownership may have
3182	 * changed va_mask
3183	 */
3184	mask = vap->va_mask;
3185
3186	if ((mask & (AT_UID | AT_GID))) {
3187		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
3188		    &xattr_obj, sizeof (xattr_obj));
3189
3190		if (err == 0 && xattr_obj) {
3191			err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
3192			if (err == 0) {
3193				err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE);
3194				if (err != 0)
3195					vrele(ZTOV(attrzp));
3196			}
3197			if (err)
3198				goto out2;
3199		}
3200		if (mask & AT_UID) {
3201			new_uid = zfs_fuid_create(zfsvfs,
3202			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
3203			if (new_uid != zp->z_uid &&
3204			    zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
3205				if (attrzp)
3206					vput(ZTOV(attrzp));
3207				err = SET_ERROR(EDQUOT);
3208				goto out2;
3209			}
3210		}
3211
3212		if (mask & AT_GID) {
3213			new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
3214			    cr, ZFS_GROUP, &fuidp);
3215			if (new_gid != zp->z_gid &&
3216			    zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
3217				if (attrzp)
3218					vput(ZTOV(attrzp));
3219				err = SET_ERROR(EDQUOT);
3220				goto out2;
3221			}
3222		}
3223	}
3224	tx = dmu_tx_create(zfsvfs->z_os);
3225
3226	if (mask & AT_MODE) {
3227		uint64_t pmode = zp->z_mode;
3228		uint64_t acl_obj;
3229		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
3230
3231		if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
3232		    !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
3233			err = SET_ERROR(EPERM);
3234			goto out;
3235		}
3236
3237		if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
3238			goto out;
3239
3240		if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
3241			/*
3242			 * Are we upgrading ACL from old V0 format
3243			 * to V1 format?
3244			 */
3245			if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
3246			    zfs_znode_acl_version(zp) ==
3247			    ZFS_ACL_VERSION_INITIAL) {
3248				dmu_tx_hold_free(tx, acl_obj, 0,
3249				    DMU_OBJECT_END);
3250				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3251				    0, aclp->z_acl_bytes);
3252			} else {
3253				dmu_tx_hold_write(tx, acl_obj, 0,
3254				    aclp->z_acl_bytes);
3255			}
3256		} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3257			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3258			    0, aclp->z_acl_bytes);
3259		}
3260		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3261	} else {
3262		if ((mask & AT_XVATTR) &&
3263		    XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3264			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3265		else
3266			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3267	}
3268
3269	if (attrzp) {
3270		dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3271	}
3272
3273	fuid_dirtied = zfsvfs->z_fuid_dirty;
3274	if (fuid_dirtied)
3275		zfs_fuid_txhold(zfsvfs, tx);
3276
3277	zfs_sa_upgrade_txholds(tx, zp);
3278
3279	err = dmu_tx_assign(tx, TXG_WAIT);
3280	if (err)
3281		goto out;
3282
3283	count = 0;
3284	/*
3285	 * Set each attribute requested.
3286	 * We group settings according to the locks they need to acquire.
3287	 *
3288	 * Note: you cannot set ctime directly, although it will be
3289	 * updated as a side-effect of calling this function.
3290	 */
3291
3292	if (mask & (AT_UID|AT_GID|AT_MODE))
3293		mutex_enter(&zp->z_acl_lock);
3294
3295	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3296	    &zp->z_pflags, sizeof (zp->z_pflags));
3297
3298	if (attrzp) {
3299		if (mask & (AT_UID|AT_GID|AT_MODE))
3300			mutex_enter(&attrzp->z_acl_lock);
3301		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3302		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
3303		    sizeof (attrzp->z_pflags));
3304	}
3305
3306	if (mask & (AT_UID|AT_GID)) {
3307
3308		if (mask & AT_UID) {
3309			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
3310			    &new_uid, sizeof (new_uid));
3311			zp->z_uid = new_uid;
3312			if (attrzp) {
3313				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3314				    SA_ZPL_UID(zfsvfs), NULL, &new_uid,
3315				    sizeof (new_uid));
3316				attrzp->z_uid = new_uid;
3317			}
3318		}
3319
3320		if (mask & AT_GID) {
3321			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
3322			    NULL, &new_gid, sizeof (new_gid));
3323			zp->z_gid = new_gid;
3324			if (attrzp) {
3325				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3326				    SA_ZPL_GID(zfsvfs), NULL, &new_gid,
3327				    sizeof (new_gid));
3328				attrzp->z_gid = new_gid;
3329			}
3330		}
3331		if (!(mask & AT_MODE)) {
3332			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
3333			    NULL, &new_mode, sizeof (new_mode));
3334			new_mode = zp->z_mode;
3335		}
3336		err = zfs_acl_chown_setattr(zp);
3337		ASSERT(err == 0);
3338		if (attrzp) {
3339			err = zfs_acl_chown_setattr(attrzp);
3340			ASSERT(err == 0);
3341		}
3342	}
3343
3344	if (mask & AT_MODE) {
3345		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
3346		    &new_mode, sizeof (new_mode));
3347		zp->z_mode = new_mode;
3348		ASSERT3U((uintptr_t)aclp, !=, 0);
3349		err = zfs_aclset_common(zp, aclp, cr, tx);
3350		ASSERT0(err);
3351		if (zp->z_acl_cached)
3352			zfs_acl_free(zp->z_acl_cached);
3353		zp->z_acl_cached = aclp;
3354		aclp = NULL;
3355	}
3356
3357
3358	if (mask & AT_ATIME) {
3359		ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
3360		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
3361		    &zp->z_atime, sizeof (zp->z_atime));
3362	}
3363
3364	if (mask & AT_MTIME) {
3365		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
3366		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
3367		    mtime, sizeof (mtime));
3368	}
3369
3370	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
3371	if (mask & AT_SIZE && !(mask & AT_MTIME)) {
3372		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
3373		    NULL, mtime, sizeof (mtime));
3374		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3375		    &ctime, sizeof (ctime));
3376		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
3377		    B_TRUE);
3378	} else if (mask != 0) {
3379		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3380		    &ctime, sizeof (ctime));
3381		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
3382		    B_TRUE);
3383		if (attrzp) {
3384			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3385			    SA_ZPL_CTIME(zfsvfs), NULL,
3386			    &ctime, sizeof (ctime));
3387			zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
3388			    mtime, ctime, B_TRUE);
3389		}
3390	}
3391	/*
3392	 * Do this after setting timestamps to prevent timestamp
3393	 * update from toggling bit
3394	 */
3395
3396	if (xoap && (mask & AT_XVATTR)) {
3397
3398		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
3399			xoap->xoa_createtime = vap->va_birthtime;
3400		/*
3401		 * restore trimmed off masks
3402		 * so that return masks can be set for caller.
3403		 */
3404
3405		if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
3406			XVA_SET_REQ(xvap, XAT_APPENDONLY);
3407		}
3408		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
3409			XVA_SET_REQ(xvap, XAT_NOUNLINK);
3410		}
3411		if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
3412			XVA_SET_REQ(xvap, XAT_IMMUTABLE);
3413		}
3414		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
3415			XVA_SET_REQ(xvap, XAT_NODUMP);
3416		}
3417		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
3418			XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
3419		}
3420		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
3421			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
3422		}
3423
3424		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3425			ASSERT(vp->v_type == VREG);
3426
3427		zfs_xvattr_set(zp, xvap, tx);
3428	}
3429
3430	if (fuid_dirtied)
3431		zfs_fuid_sync(zfsvfs, tx);
3432
3433	if (mask != 0)
3434		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3435
3436	if (mask & (AT_UID|AT_GID|AT_MODE))
3437		mutex_exit(&zp->z_acl_lock);
3438
3439	if (attrzp) {
3440		if (mask & (AT_UID|AT_GID|AT_MODE))
3441			mutex_exit(&attrzp->z_acl_lock);
3442	}
3443out:
3444	if (err == 0 && attrzp) {
3445		err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
3446		    xattr_count, tx);
3447		ASSERT(err2 == 0);
3448	}
3449
3450	if (attrzp)
3451		vput(ZTOV(attrzp));
3452
3453	if (aclp)
3454		zfs_acl_free(aclp);
3455
3456	if (fuidp) {
3457		zfs_fuid_info_free(fuidp);
3458		fuidp = NULL;
3459	}
3460
3461	if (err) {
3462		dmu_tx_abort(tx);
3463	} else {
3464		err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
3465		dmu_tx_commit(tx);
3466	}
3467
3468out2:
3469	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3470		zil_commit(zilog, 0);
3471
3472	ZFS_EXIT(zfsvfs);
3473	return (err);
3474}
3475
3476/*
3477 * We acquire all but fdvp locks using non-blocking acquisitions.  If we
3478 * fail to acquire any lock in the path we will drop all held locks,
3479 * acquire the new lock in a blocking fashion, and then release it and
3480 * restart the rename.  This acquire/release step ensures that we do not
3481 * spin on a lock waiting for release.  On error release all vnode locks
3482 * and decrement references the way tmpfs_rename() would do.
3483 */
3484static int
3485zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
3486    struct vnode *tdvp, struct vnode **tvpp,
3487    const struct componentname *scnp, const struct componentname *tcnp)
3488{
3489	zfsvfs_t	*zfsvfs;
3490	struct vnode	*nvp, *svp, *tvp;
3491	znode_t		*sdzp, *tdzp, *szp, *tzp;
3492	const char	*snm = scnp->cn_nameptr;
3493	const char	*tnm = tcnp->cn_nameptr;
3494	int error;
3495
3496	VOP_UNLOCK(tdvp, 0);
3497	if (*tvpp != NULL && *tvpp != tdvp)
3498		VOP_UNLOCK(*tvpp, 0);
3499
3500relock:
3501	error = vn_lock(sdvp, LK_EXCLUSIVE);
3502	if (error)
3503		goto out;
3504	sdzp = VTOZ(sdvp);
3505
3506	error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
3507	if (error != 0) {
3508		VOP_UNLOCK(sdvp, 0);
3509		if (error != EBUSY)
3510			goto out;
3511		error = vn_lock(tdvp, LK_EXCLUSIVE);
3512		if (error)
3513			goto out;
3514		VOP_UNLOCK(tdvp, 0);
3515		goto relock;
3516	}
3517	tdzp = VTOZ(tdvp);
3518
3519	/*
3520	 * Before using sdzp and tdzp we must ensure that they are live.
3521	 * As a porting legacy from illumos we have two things to worry
3522	 * about.  One is typical for FreeBSD and it is that the vnode is
3523	 * not reclaimed (doomed).  The other is that the znode is live.
3524	 * The current code can invalidate the znode without acquiring the
3525	 * corresponding vnode lock if the object represented by the znode
3526	 * and vnode is no longer valid after a rollback or receive operation.
3527	 * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock
3528	 * that protects the znodes from the invalidation.
3529	 */
3530	zfsvfs = sdzp->z_zfsvfs;
3531	ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
3532	ZFS_ENTER(zfsvfs);
3533
3534	/*
3535	 * We can not use ZFS_VERIFY_ZP() here because it could directly return
3536	 * bypassing the cleanup code in the case of an error.
3537	 */
3538	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
3539		ZFS_EXIT(zfsvfs);
3540		VOP_UNLOCK(sdvp, 0);
3541		VOP_UNLOCK(tdvp, 0);
3542		error = SET_ERROR(EIO);
3543		goto out;
3544	}
3545
3546	/*
3547	 * Re-resolve svp to be certain it still exists and fetch the
3548	 * correct vnode.
3549	 */
3550	error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS);
3551	if (error != 0) {
3552		/* Source entry invalid or not there. */
3553		ZFS_EXIT(zfsvfs);
3554		VOP_UNLOCK(sdvp, 0);
3555		VOP_UNLOCK(tdvp, 0);
3556		if ((scnp->cn_flags & ISDOTDOT) != 0 ||
3557		    (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
3558			error = SET_ERROR(EINVAL);
3559		goto out;
3560	}
3561	svp = ZTOV(szp);
3562
3563	/*
3564	 * Re-resolve tvp, if it disappeared we just carry on.
3565	 */
3566	error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0);
3567	if (error != 0) {
3568		ZFS_EXIT(zfsvfs);
3569		VOP_UNLOCK(sdvp, 0);
3570		VOP_UNLOCK(tdvp, 0);
3571		vrele(svp);
3572		if ((tcnp->cn_flags & ISDOTDOT) != 0)
3573			error = SET_ERROR(EINVAL);
3574		goto out;
3575	}
3576	if (tzp != NULL)
3577		tvp = ZTOV(tzp);
3578	else
3579		tvp = NULL;
3580
3581	/*
3582	 * At present the vnode locks must be acquired before z_teardown_lock,
3583	 * although it would be more logical to use the opposite order.
3584	 */
3585	ZFS_EXIT(zfsvfs);
3586
3587	/*
3588	 * Now try acquire locks on svp and tvp.
3589	 */
3590	nvp = svp;
3591	error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
3592	if (error != 0) {
3593		VOP_UNLOCK(sdvp, 0);
3594		VOP_UNLOCK(tdvp, 0);
3595		if (tvp != NULL)
3596			vrele(tvp);
3597		if (error != EBUSY) {
3598			vrele(nvp);
3599			goto out;
3600		}
3601		error = vn_lock(nvp, LK_EXCLUSIVE);
3602		if (error != 0) {
3603			vrele(nvp);
3604			goto out;
3605		}
3606		VOP_UNLOCK(nvp, 0);
3607		/*
3608		 * Concurrent rename race.
3609		 * XXX ?
3610		 */
3611		if (nvp == tdvp) {
3612			vrele(nvp);
3613			error = SET_ERROR(EINVAL);
3614			goto out;
3615		}
3616		vrele(*svpp);
3617		*svpp = nvp;
3618		goto relock;
3619	}
3620	vrele(*svpp);
3621	*svpp = nvp;
3622
3623	if (*tvpp != NULL)
3624		vrele(*tvpp);
3625	*tvpp = NULL;
3626	if (tvp != NULL) {
3627		nvp = tvp;
3628		error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
3629		if (error != 0) {
3630			VOP_UNLOCK(sdvp, 0);
3631			VOP_UNLOCK(tdvp, 0);
3632			VOP_UNLOCK(*svpp, 0);
3633			if (error != EBUSY) {
3634				vrele(nvp);
3635				goto out;
3636			}
3637			error = vn_lock(nvp, LK_EXCLUSIVE);
3638			if (error != 0) {
3639				vrele(nvp);
3640				goto out;
3641			}
3642			vput(nvp);
3643			goto relock;
3644		}
3645		*tvpp = nvp;
3646	}
3647
3648	return (0);
3649
3650out:
3651	return (error);
3652}
3653
3654/*
3655 * Note that we must use VRELE_ASYNC in this function as it walks
3656 * up the directory tree and vrele may need to acquire an exclusive
3657 * lock if a last reference to a vnode is dropped.
3658 */
3659static int
3660zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
3661{
3662	zfsvfs_t	*zfsvfs;
3663	znode_t		*zp, *zp1;
3664	uint64_t	parent;
3665	int		error;
3666
3667	zfsvfs = tdzp->z_zfsvfs;
3668	if (tdzp == szp)
3669		return (SET_ERROR(EINVAL));
3670	if (tdzp == sdzp)
3671		return (0);
3672	if (tdzp->z_id == zfsvfs->z_root)
3673		return (0);
3674	zp = tdzp;
3675	for (;;) {
3676		ASSERT(!zp->z_unlinked);
3677		if ((error = sa_lookup(zp->z_sa_hdl,
3678		    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
3679			break;
3680
3681		if (parent == szp->z_id) {
3682			error = SET_ERROR(EINVAL);
3683			break;
3684		}
3685		if (parent == zfsvfs->z_root)
3686			break;
3687		if (parent == sdzp->z_id)
3688			break;
3689
3690		error = zfs_zget(zfsvfs, parent, &zp1);
3691		if (error != 0)
3692			break;
3693
3694		if (zp != tdzp)
3695			VN_RELE_ASYNC(ZTOV(zp),
3696			    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
3697		zp = zp1;
3698	}
3699
3700	if (error == ENOTDIR)
3701		panic("checkpath: .. not a directory\n");
3702	if (zp != tdzp)
3703		VN_RELE_ASYNC(ZTOV(zp),
3704		    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
3705	return (error);
3706}
3707
3708/*
3709 * Move an entry from the provided source directory to the target
3710 * directory.  Change the entry name as indicated.
3711 *
3712 *	IN:	sdvp	- Source directory containing the "old entry".
3713 *		snm	- Old entry name.
3714 *		tdvp	- Target directory to contain the "new entry".
3715 *		tnm	- New entry name.
3716 *		cr	- credentials of caller.
3717 *		ct	- caller context
3718 *		flags	- case flags
3719 *
3720 *	RETURN:	0 on success, error code on failure.
3721 *
3722 * Timestamps:
3723 *	sdvp,tdvp - ctime|mtime updated
3724 */
3725/*ARGSUSED*/
3726static int
3727zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
3728    vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
3729    cred_t *cr)
3730{
3731	zfsvfs_t	*zfsvfs;
3732	znode_t		*sdzp, *tdzp, *szp, *tzp;
3733	zilog_t		*zilog = NULL;
3734	dmu_tx_t	*tx;
3735	char		*snm = scnp->cn_nameptr;
3736	char		*tnm = tcnp->cn_nameptr;
3737	int		error = 0;
3738
3739	/* Reject renames across filesystems. */
3740	if ((*svpp)->v_mount != tdvp->v_mount ||
3741	    ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
3742		error = SET_ERROR(EXDEV);
3743		goto out;
3744	}
3745
3746	if (zfsctl_is_node(tdvp)) {
3747		error = SET_ERROR(EXDEV);
3748		goto out;
3749	}
3750
3751	/*
3752	 * Lock all four vnodes to ensure safety and semantics of renaming.
3753	 */
3754	error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
3755	if (error != 0) {
3756		/* no vnodes are locked in the case of error here */
3757		return (error);
3758	}
3759
3760	tdzp = VTOZ(tdvp);
3761	sdzp = VTOZ(sdvp);
3762	zfsvfs = tdzp->z_zfsvfs;
3763	zilog = zfsvfs->z_log;
3764
3765	/*
3766	 * After we re-enter ZFS_ENTER() we will have to revalidate all
3767	 * znodes involved.
3768	 */
3769	ZFS_ENTER(zfsvfs);
3770
3771	if (zfsvfs->z_utf8 && u8_validate(tnm,
3772	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3773		error = SET_ERROR(EILSEQ);
3774		goto unlockout;
3775	}
3776
3777	/* If source and target are the same file, there is nothing to do. */
3778	if ((*svpp) == (*tvpp)) {
3779		error = 0;
3780		goto unlockout;
3781	}
3782
3783	if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
3784	    ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
3785	    (*tvpp)->v_mountedhere != NULL)) {
3786		error = SET_ERROR(EXDEV);
3787		goto unlockout;
3788	}
3789
3790	/*
3791	 * We can not use ZFS_VERIFY_ZP() here because it could directly return
3792	 * bypassing the cleanup code in the case of an error.
3793	 */
3794	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
3795		error = SET_ERROR(EIO);
3796		goto unlockout;
3797	}
3798
3799	szp = VTOZ(*svpp);
3800	tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
3801	if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) {
3802		error = SET_ERROR(EIO);
3803		goto unlockout;
3804	}
3805
3806	/*
3807	 * This is to prevent the creation of links into attribute space
3808	 * by renaming a linked file into/outof an attribute directory.
3809	 * See the comment in zfs_link() for why this is considered bad.
3810	 */
3811	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
3812		error = SET_ERROR(EINVAL);
3813		goto unlockout;
3814	}
3815
3816	/*
3817	 * Must have write access at the source to remove the old entry
3818	 * and write access at the target to create the new entry.
3819	 * Note that if target and source are the same, this can be
3820	 * done in a single check.
3821	 */
3822	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
3823		goto unlockout;
3824
3825	if ((*svpp)->v_type == VDIR) {
3826		/*
3827		 * Avoid ".", "..", and aliases of "." for obvious reasons.
3828		 */
3829		if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
3830		    sdzp == szp ||
3831		    (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
3832			error = EINVAL;
3833			goto unlockout;
3834		}
3835
3836		/*
3837		 * Check to make sure rename is valid.
3838		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
3839		 */
3840		if (error = zfs_rename_check(szp, sdzp, tdzp))
3841			goto unlockout;
3842	}
3843
3844	/*
3845	 * Does target exist?
3846	 */
3847	if (tzp) {
3848		/*
3849		 * Source and target must be the same type.
3850		 */
3851		if ((*svpp)->v_type == VDIR) {
3852			if ((*tvpp)->v_type != VDIR) {
3853				error = SET_ERROR(ENOTDIR);
3854				goto unlockout;
3855			} else {
3856				cache_purge(tdvp);
3857				if (sdvp != tdvp)
3858					cache_purge(sdvp);
3859			}
3860		} else {
3861			if ((*tvpp)->v_type == VDIR) {
3862				error = SET_ERROR(EISDIR);
3863				goto unlockout;
3864			}
3865		}
3866	}
3867
3868	vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
3869	if (tzp)
3870		vnevent_rename_dest(*tvpp, tdvp, tnm, ct);
3871
3872	/*
3873	 * notify the target directory if it is not the same
3874	 * as source directory.
3875	 */
3876	if (tdvp != sdvp) {
3877		vnevent_rename_dest_dir(tdvp, ct);
3878	}
3879
3880	tx = dmu_tx_create(zfsvfs->z_os);
3881	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3882	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
3883	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
3884	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
3885	if (sdzp != tdzp) {
3886		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
3887		zfs_sa_upgrade_txholds(tx, tdzp);
3888	}
3889	if (tzp) {
3890		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
3891		zfs_sa_upgrade_txholds(tx, tzp);
3892	}
3893
3894	zfs_sa_upgrade_txholds(tx, szp);
3895	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3896	error = dmu_tx_assign(tx, TXG_WAIT);
3897	if (error) {
3898		dmu_tx_abort(tx);
3899		goto unlockout;
3900	}
3901
3902
3903	if (tzp)	/* Attempt to remove the existing target */
3904		error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);
3905
3906	if (error == 0) {
3907		error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
3908		if (error == 0) {
3909			szp->z_pflags |= ZFS_AV_MODIFIED;
3910
3911			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
3912			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
3913			ASSERT0(error);
3914
3915			error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING,
3916			    NULL);
3917			if (error == 0) {
3918				zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
3919				    snm, tdzp, tnm, szp);
3920
3921				/*
3922				 * Update path information for the target vnode
3923				 */
3924				vn_renamepath(tdvp, *svpp, tnm, strlen(tnm));
3925			} else {
3926				/*
3927				 * At this point, we have successfully created
3928				 * the target name, but have failed to remove
3929				 * the source name.  Since the create was done
3930				 * with the ZRENAMING flag, there are
3931				 * complications; for one, the link count is
3932				 * wrong.  The easiest way to deal with this
3933				 * is to remove the newly created target, and
3934				 * return the original error.  This must
3935				 * succeed; fortunately, it is very unlikely to
3936				 * fail, since we just created it.
3937				 */
3938				VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx,
3939				    ZRENAMING, NULL), ==, 0);
3940			}
3941		}
3942		if (error == 0) {
3943			cache_purge(*svpp);
3944			if (*tvpp != NULL)
3945				cache_purge(*tvpp);
3946			cache_purge_negative(tdvp);
3947		}
3948	}
3949
3950	dmu_tx_commit(tx);
3951
3952unlockout:			/* all 4 vnodes are locked, ZFS_ENTER called */
3953	ZFS_EXIT(zfsvfs);
3954	VOP_UNLOCK(*svpp, 0);
3955	VOP_UNLOCK(sdvp, 0);
3956
3957out:				/* original two vnodes are locked */
3958	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3959		zil_commit(zilog, 0);
3960
3961	if (*tvpp != NULL)
3962		VOP_UNLOCK(*tvpp, 0);
3963	if (tdvp != *tvpp)
3964		VOP_UNLOCK(tdvp, 0);
3965	return (error);
3966}
3967
3968/*
3969 * Insert the indicated symbolic reference entry into the directory.
3970 *
3971 *	IN:	dvp	- Directory to contain new symbolic link.
3972 *		link	- Name for new symlink entry.
3973 *		vap	- Attributes of new entry.
3974 *		cr	- credentials of caller.
3975 *		ct	- caller context
3976 *		flags	- case flags
3977 *
3978 *	RETURN:	0 on success, error code on failure.
3979 *
3980 * Timestamps:
3981 *	dvp - ctime|mtime updated
3982 */
3983/*ARGSUSED*/
3984static int
3985zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
3986    cred_t *cr, kthread_t *td)
3987{
3988	znode_t		*zp, *dzp = VTOZ(dvp);
3989	dmu_tx_t	*tx;
3990	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
3991	zilog_t		*zilog;
3992	uint64_t	len = strlen(link);
3993	int		error;
3994	zfs_acl_ids_t	acl_ids;
3995	boolean_t	fuid_dirtied;
3996	uint64_t	txtype = TX_SYMLINK;
3997	int		flags = 0;
3998
3999	ASSERT(vap->va_type == VLNK);
4000
4001	ZFS_ENTER(zfsvfs);
4002	ZFS_VERIFY_ZP(dzp);
4003	zilog = zfsvfs->z_log;
4004
4005	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
4006	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4007		ZFS_EXIT(zfsvfs);
4008		return (SET_ERROR(EILSEQ));
4009	}
4010
4011	if (len > MAXPATHLEN) {
4012		ZFS_EXIT(zfsvfs);
4013		return (SET_ERROR(ENAMETOOLONG));
4014	}
4015
4016	if ((error = zfs_acl_ids_create(dzp, 0,
4017	    vap, cr, NULL, &acl_ids)) != 0) {
4018		ZFS_EXIT(zfsvfs);
4019		return (error);
4020	}
4021
4022	/*
4023	 * Attempt to lock directory; fail if entry already exists.
4024	 */
4025	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
4026	if (error) {
4027		zfs_acl_ids_free(&acl_ids);
4028		ZFS_EXIT(zfsvfs);
4029		return (error);
4030	}
4031
4032	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4033		zfs_acl_ids_free(&acl_ids);
4034		ZFS_EXIT(zfsvfs);
4035		return (error);
4036	}
4037
4038	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
4039		zfs_acl_ids_free(&acl_ids);
4040		ZFS_EXIT(zfsvfs);
4041		return (SET_ERROR(EDQUOT));
4042	}
4043
4044	getnewvnode_reserve(1);
4045	tx = dmu_tx_create(zfsvfs->z_os);
4046	fuid_dirtied = zfsvfs->z_fuid_dirty;
4047	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
4048	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4049	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
4050	    ZFS_SA_BASE_ATTR_SIZE + len);
4051	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
4052	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
4053		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
4054		    acl_ids.z_aclp->z_acl_bytes);
4055	}
4056	if (fuid_dirtied)
4057		zfs_fuid_txhold(zfsvfs, tx);
4058	error = dmu_tx_assign(tx, TXG_WAIT);
4059	if (error) {
4060		zfs_acl_ids_free(&acl_ids);
4061		dmu_tx_abort(tx);
4062		getnewvnode_drop_reserve();
4063		ZFS_EXIT(zfsvfs);
4064		return (error);
4065	}
4066
4067	/*
4068	 * Create a new object for the symlink.
4069	 * for version 4 ZPL datsets the symlink will be an SA attribute
4070	 */
4071	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
4072
4073	if (fuid_dirtied)
4074		zfs_fuid_sync(zfsvfs, tx);
4075
4076	if (zp->z_is_sa)
4077		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
4078		    link, len, tx);
4079	else
4080		zfs_sa_symlink(zp, link, len, tx);
4081
4082	zp->z_size = len;
4083	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
4084	    &zp->z_size, sizeof (zp->z_size), tx);
4085	/*
4086	 * Insert the new object into the directory.
4087	 */
4088	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
4089
4090	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
4091	*vpp = ZTOV(zp);
4092
4093	zfs_acl_ids_free(&acl_ids);
4094
4095	dmu_tx_commit(tx);
4096
4097	getnewvnode_drop_reserve();
4098
4099	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4100		zil_commit(zilog, 0);
4101
4102	ZFS_EXIT(zfsvfs);
4103	return (error);
4104}
4105
4106/*
4107 * Return, in the buffer contained in the provided uio structure,
4108 * the symbolic path referred to by vp.
4109 *
4110 *	IN:	vp	- vnode of symbolic link.
4111 *		uio	- structure to contain the link path.
4112 *		cr	- credentials of caller.
4113 *		ct	- caller context
4114 *
4115 *	OUT:	uio	- structure containing the link path.
4116 *
4117 *	RETURN:	0 on success, error code on failure.
4118 *
4119 * Timestamps:
4120 *	vp - atime updated
4121 */
4122/* ARGSUSED */
4123static int
4124zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
4125{
4126	znode_t		*zp = VTOZ(vp);
4127	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4128	int		error;
4129
4130	ZFS_ENTER(zfsvfs);
4131	ZFS_VERIFY_ZP(zp);
4132
4133	if (zp->z_is_sa)
4134		error = sa_lookup_uio(zp->z_sa_hdl,
4135		    SA_ZPL_SYMLINK(zfsvfs), uio);
4136	else
4137		error = zfs_sa_readlink(zp, uio);
4138
4139	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4140
4141	ZFS_EXIT(zfsvfs);
4142	return (error);
4143}
4144
4145/*
4146 * Insert a new entry into directory tdvp referencing svp.
4147 *
4148 *	IN:	tdvp	- Directory to contain new entry.
4149 *		svp	- vnode of new entry.
4150 *		name	- name of new entry.
4151 *		cr	- credentials of caller.
4152 *		ct	- caller context
4153 *
4154 *	RETURN:	0 on success, error code on failure.
4155 *
4156 * Timestamps:
4157 *	tdvp - ctime|mtime updated
4158 *	 svp - ctime updated
4159 */
4160/* ARGSUSED */
4161static int
4162zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
4163    caller_context_t *ct, int flags)
4164{
4165	znode_t		*dzp = VTOZ(tdvp);
4166	znode_t		*tzp, *szp;
4167	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
4168	zilog_t		*zilog;
4169	dmu_tx_t	*tx;
4170	int		error;
4171	uint64_t	parent;
4172	uid_t		owner;
4173
4174	ASSERT(tdvp->v_type == VDIR);
4175
4176	ZFS_ENTER(zfsvfs);
4177	ZFS_VERIFY_ZP(dzp);
4178	zilog = zfsvfs->z_log;
4179
4180	/*
4181	 * POSIX dictates that we return EPERM here.
4182	 * Better choices include ENOTSUP or EISDIR.
4183	 */
4184	if (svp->v_type == VDIR) {
4185		ZFS_EXIT(zfsvfs);
4186		return (SET_ERROR(EPERM));
4187	}
4188
4189	szp = VTOZ(svp);
4190	ZFS_VERIFY_ZP(szp);
4191
4192	if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) {
4193		ZFS_EXIT(zfsvfs);
4194		return (SET_ERROR(EPERM));
4195	}
4196
4197	/* Prevent links to .zfs/shares files */
4198
4199	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
4200	    &parent, sizeof (uint64_t))) != 0) {
4201		ZFS_EXIT(zfsvfs);
4202		return (error);
4203	}
4204	if (parent == zfsvfs->z_shares_dir) {
4205		ZFS_EXIT(zfsvfs);
4206		return (SET_ERROR(EPERM));
4207	}
4208
4209	if (zfsvfs->z_utf8 && u8_validate(name,
4210	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4211		ZFS_EXIT(zfsvfs);
4212		return (SET_ERROR(EILSEQ));
4213	}
4214
4215	/*
4216	 * We do not support links between attributes and non-attributes
4217	 * because of the potential security risk of creating links
4218	 * into "normal" file space in order to circumvent restrictions
4219	 * imposed in attribute space.
4220	 */
4221	if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
4222		ZFS_EXIT(zfsvfs);
4223		return (SET_ERROR(EINVAL));
4224	}
4225
4226
4227	owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
4228	if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) {
4229		ZFS_EXIT(zfsvfs);
4230		return (SET_ERROR(EPERM));
4231	}
4232
4233	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4234		ZFS_EXIT(zfsvfs);
4235		return (error);
4236	}
4237
4238	/*
4239	 * Attempt to lock directory; fail if entry already exists.
4240	 */
4241	error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW);
4242	if (error) {
4243		ZFS_EXIT(zfsvfs);
4244		return (error);
4245	}
4246
4247	tx = dmu_tx_create(zfsvfs->z_os);
4248	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
4249	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4250	zfs_sa_upgrade_txholds(tx, szp);
4251	zfs_sa_upgrade_txholds(tx, dzp);
4252	error = dmu_tx_assign(tx, TXG_WAIT);
4253	if (error) {
4254		dmu_tx_abort(tx);
4255		ZFS_EXIT(zfsvfs);
4256		return (error);
4257	}
4258
4259	error = zfs_link_create(dzp, name, szp, tx, 0);
4260
4261	if (error == 0) {
4262		uint64_t txtype = TX_LINK;
4263		zfs_log_link(zilog, tx, txtype, dzp, szp, name);
4264	}
4265
4266	dmu_tx_commit(tx);
4267
4268	if (error == 0) {
4269		vnevent_link(svp, ct);
4270	}
4271
4272	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4273		zil_commit(zilog, 0);
4274
4275	ZFS_EXIT(zfsvfs);
4276	return (error);
4277}
4278
4279
/*
 * Called when the last file reference on a vnode is dropped.  Recycles
 * the vnode immediately when the znode is stale or unlinked, otherwise
 * pushes any dirty atime to the SA before the vnode goes idle.
 */
/*ARGSUSED*/
void
zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
	if (zp->z_sa_hdl == NULL) {
		/*
		 * The fs has been unmounted, or we did a
		 * suspend/resume and this file no longer exists.
		 */
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
		vrecycle(vp);
		return;
	}

	if (zp->z_unlinked) {
		/*
		 * Fast path to recycle a vnode of a removed file.
		 */
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
		vrecycle(vp);
		return;
	}

	/*
	 * Flush a dirty atime with its own transaction.  (The
	 * z_unlinked recheck is redundant given the early return above,
	 * but harmless.)
	 */
	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);

		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, zp);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
		} else {
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
			zp->z_atime_dirty = 0;
			dmu_tx_commit(tx);
		}
	}
	rw_exit(&zfsvfs->z_teardown_inactive_lock);
}
4325
4326
4327CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
4328CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
4329
/*
 * Encode a file identifier for vp into *fidp for NFS and other FID
 * consumers.  Uses the short form (object + generation) for the parent
 * filesystem and the long form (additionally objset id) for snapshots
 * and child datasets.
 */
/*ARGSUSED*/
static int
zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	uint32_t	gen;
	uint64_t	gen64;
	uint64_t	object = zp->z_id;
	zfid_short_t	*zfid;
	int		size, i, error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
	    &gen64, sizeof (uint64_t))) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	gen = (uint32_t)gen64;

	/* Long FIDs are needed when this is not the top-level filesystem. */
	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;

#ifdef illumos
	if (fidp->fid_len < size) {
		fidp->fid_len = size;
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENOSPC));
	}
#else
	fidp->fid_len = size;
#endif

	zfid = (zfid_short_t *)fidp;

	zfid->zf_len = size;

	/* Pack the object number and generation in little-endian order. */
	for (i = 0; i < sizeof (zfid->zf_object); i++)
		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));

	/* Must have a non-zero generation number to distinguish from .zfs */
	if (gen == 0)
		gen = 1;
	for (i = 0; i < sizeof (zfid->zf_gen); i++)
		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));

	if (size == LONG_FID_LEN) {
		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
		zfid_long_t	*zlfid;

		zlfid = (zfid_long_t *)fidp;

		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));

		/* XXX - this should be the generation number for the objset */
		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
			zlfid->zf_setgen[i] = 0;
	}

	ZFS_EXIT(zfsvfs);
	return (0);
}
4395
4396static int
4397zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
4398    caller_context_t *ct)
4399{
4400	znode_t		*zp, *xzp;
4401	zfsvfs_t	*zfsvfs;
4402	int		error;
4403
4404	switch (cmd) {
4405	case _PC_LINK_MAX:
4406		*valp = INT_MAX;
4407		return (0);
4408
4409	case _PC_FILESIZEBITS:
4410		*valp = 64;
4411		return (0);
4412#ifdef illumos
4413	case _PC_XATTR_EXISTS:
4414		zp = VTOZ(vp);
4415		zfsvfs = zp->z_zfsvfs;
4416		ZFS_ENTER(zfsvfs);
4417		ZFS_VERIFY_ZP(zp);
4418		*valp = 0;
4419		error = zfs_dirent_lookup(zp, "", &xzp,
4420		    ZXATTR | ZEXISTS | ZSHARED);
4421		if (error == 0) {
4422			if (!zfs_dirempty(xzp))
4423				*valp = 1;
4424			vrele(ZTOV(xzp));
4425		} else if (error == ENOENT) {
4426			/*
4427			 * If there aren't extended attributes, it's the
4428			 * same as having zero of them.
4429			 */
4430			error = 0;
4431		}
4432		ZFS_EXIT(zfsvfs);
4433		return (error);
4434
4435	case _PC_SATTR_ENABLED:
4436	case _PC_SATTR_EXISTS:
4437		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
4438		    (vp->v_type == VREG || vp->v_type == VDIR);
4439		return (0);
4440
4441	case _PC_ACCESS_FILTERING:
4442		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
4443		    vp->v_type == VDIR;
4444		return (0);
4445
4446	case _PC_ACL_ENABLED:
4447		*valp = _ACL_ACE_ENABLED;
4448		return (0);
4449#endif	/* illumos */
4450	case _PC_MIN_HOLE_SIZE:
4451		*valp = (int)SPA_MINBLOCKSIZE;
4452		return (0);
4453#ifdef illumos
4454	case _PC_TIMESTAMP_RESOLUTION:
4455		/* nanosecond timestamp resolution */
4456		*valp = 1L;
4457		return (0);
4458#endif
4459	case _PC_ACL_EXTENDED:
4460		*valp = 0;
4461		return (0);
4462
4463	case _PC_ACL_NFS4:
4464		*valp = 1;
4465		return (0);
4466
4467	case _PC_ACL_PATH_MAX:
4468		*valp = ACL_MAX_ENTRIES;
4469		return (0);
4470
4471	default:
4472		return (EOPNOTSUPP);
4473	}
4474}
4475
4476/*ARGSUSED*/
4477static int
4478zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4479    caller_context_t *ct)
4480{
4481	znode_t *zp = VTOZ(vp);
4482	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4483	int error;
4484	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4485
4486	ZFS_ENTER(zfsvfs);
4487	ZFS_VERIFY_ZP(zp);
4488	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
4489	ZFS_EXIT(zfsvfs);
4490
4491	return (error);
4492}
4493
4494/*ARGSUSED*/
4495int
4496zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4497    caller_context_t *ct)
4498{
4499	znode_t *zp = VTOZ(vp);
4500	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4501	int error;
4502	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4503	zilog_t	*zilog = zfsvfs->z_log;
4504
4505	ZFS_ENTER(zfsvfs);
4506	ZFS_VERIFY_ZP(zp);
4507
4508	error = zfs_setacl(zp, vsecp, skipaclchk, cr);
4509
4510	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4511		zil_commit(zilog, 0);
4512
4513	ZFS_EXIT(zfsvfs);
4514	return (error);
4515}
4516
/*
 * VOP_GETPAGES() worker: fill the pages in 'ma' from the DMU, plus up to
 * *rbehind read-behind and *rahead read-ahead pages, while holding a
 * read-mode range lock over all covered file blocks.  Returns one of the
 * zfs_vm_pagerret_* codes; on success *rbehind and *rahead are updated to
 * the counts actually read.
 */
static int
zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
    int *rahead)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	objset_t *os = zp->z_zfsvfs->z_os;
	rl_t *rl;
	vm_object_t object;
	off_t start, end, obj_size;
	uint_t blksz;
	int pgsin_b, pgsin_a;
	int error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Byte range covered by the requested pages. */
	start = IDX_TO_OFF(ma[0]->pindex);
	end = IDX_TO_OFF(ma[count - 1]->pindex + 1);

	/*
	 * Lock a range covering all required and optional pages.
	 * Note that we need to handle the case of the block size growing.
	 */
	for (;;) {
		blksz = zp->z_blksz;
		rl = zfs_range_lock(zp, rounddown(start, blksz),
		    roundup(end, blksz) - rounddown(start, blksz), RL_READER);
		/* Retry if z_blksz changed while we waited for the lock. */
		if (blksz == zp->z_blksz)
			break;
		zfs_range_unlock(rl);
	}

	/* Snapshot the pager's idea of the file size under the VM lock. */
	object = ma[0]->object;
	zfs_vmobject_wlock(object);
	obj_size = object->un_pager.vnp.vnp_size;
	zfs_vmobject_wunlock(object);
	/* The last requested page must still lie within the file. */
	if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (zfs_vm_pagerret_bad);
	}

	/* Read-behind: pages between the block start and 'start'. */
	pgsin_b = 0;
	if (rbehind != NULL) {
		pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz));
		pgsin_b = MIN(*rbehind, pgsin_b);
	}

	/* Read-ahead: pages between 'end' and block end, clamped to EOF. */
	pgsin_a = 0;
	if (rahead != NULL) {
		pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end);
		if (end + IDX_TO_OFF(pgsin_a) >= obj_size)
			pgsin_a = OFF_TO_IDX(round_page(obj_size) - end);
		pgsin_a = MIN(*rahead, pgsin_a);
	}

	/*
	 * NB: we need to pass the exact byte size of the data that we expect
	 * to read after accounting for the file size.  This is required because
	 * ZFS will panic if we request DMU to read beyond the end of the last
	 * allocated block.
	 */
	error = dmu_read_pages(os, zp->z_id, ma, count, &pgsin_b, &pgsin_a,
	    MIN(end, obj_size) - (end - PAGE_SIZE));

	zfs_range_unlock(rl);
	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);

	if (error != 0)
		return (zfs_vm_pagerret_error);

	/* Account the page-in activity for vmstat. */
	PCPU_INC(cnt.v_vnodein);
	PCPU_ADD(cnt.v_vnodepgsin, count + pgsin_b + pgsin_a);
	if (rbehind != NULL)
		*rbehind = pgsin_b;
	if (rahead != NULL)
		*rahead = pgsin_a;
	return (zfs_vm_pagerret_ok);
}
4598
4599static int
4600zfs_freebsd_getpages(ap)
4601	struct vop_getpages_args /* {
4602		struct vnode *a_vp;
4603		vm_page_t *a_m;
4604		int a_count;
4605		int *a_rbehind;
4606		int *a_rahead;
4607	} */ *ap;
4608{
4609
4610	return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind,
4611	    ap->a_rahead));
4612}
4613
/*
 * VOP_PUTPAGES() worker: write the dirty pages in 'ma' back to the DMU
 * in a single transaction, under a write-mode range lock covering whole
 * file blocks.  Pages past EOF are reported as zfs_vm_pagerret_bad; the
 * per-page status is returned through 'rtvals' and the first entry is
 * also the function's return value.
 */
static int
zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
    int *rtvals)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	rl_t		*rl;
	dmu_tx_t	*tx;
	struct sf_buf	*sf;
	vm_object_t	object;
	vm_page_t	m;
	caddr_t		va;
	size_t		tocopy;
	size_t		lo_len;
	vm_ooffset_t	lo_off;
	vm_ooffset_t	off;
	uint_t		blksz;
	int		ncount;
	int		pcount;
	int		err;
	int		i;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	object = vp->v_object;
	pcount = btoc(len);
	ncount = pcount;

	KASSERT(ma[0]->object == object, ("mismatching object"));
	KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length"));

	/* Pessimistically mark every page as failed until proven otherwise. */
	for (i = 0; i < pcount; i++)
		rtvals[i] = zfs_vm_pagerret_error;

	/* Lock the whole file blocks overlapped by [off, off + len). */
	off = IDX_TO_OFF(ma[0]->pindex);
	blksz = zp->z_blksz;
	lo_off = rounddown(off, blksz);
	lo_len = roundup(len + (off - lo_off), blksz);
	rl = zfs_range_lock(zp, lo_off, lo_len, RL_WRITER);

	/* Clip the request to the pager's file size; pages past EOF are bad. */
	zfs_vmobject_wlock(object);
	if (len + off > object->un_pager.vnp.vnp_size) {
		if (object->un_pager.vnp.vnp_size > off) {
			int pgoff;

			len = object->un_pager.vnp.vnp_size - off;
			ncount = btoc(len);
			if ((pgoff = (int)len & PAGE_MASK) != 0) {
				/*
				 * If the object is locked and the following
				 * conditions hold, then the page's dirty
				 * field cannot be concurrently changed by a
				 * pmap operation.
				 */
				m = ma[ncount - 1];
				vm_page_assert_sbusied(m);
				KASSERT(!pmap_page_is_write_mapped(m),
				    ("zfs_putpages: page %p is not read-only", m));
				vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
				    pgoff);
			}
		} else {
			len = 0;
			ncount = 0;
		}
		if (ncount < pcount) {
			for (i = ncount; i < pcount; i++) {
				rtvals[i] = zfs_vm_pagerret_bad;
			}
		}
	}
	zfs_vmobject_wunlock(object);

	if (ncount == 0)
		goto out;

	/* Silently skip the write when the owner or group is over quota. */
	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
		goto out;
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_write(tx, zp->z_id, off, len);

	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		goto out;
	}

	if (zp->z_blksz < PAGE_SIZE) {
		/* Sub-page block size: copy page by page via dmu_write(). */
		for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
			tocopy = len > PAGE_SIZE ? PAGE_SIZE : len;
			va = zfs_map_page(ma[i], &sf);
			dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
			zfs_unmap_page(sf);
		}
	} else {
		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx);
	}

	if (err == 0) {
		/* Update mtime/ctime and log the write in the same tx. */
		uint64_t mtime[2], ctime[2];
		sa_bulk_attr_t bulk[3];
		int count = 0;

		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
		    &mtime, 16);
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
		    &ctime, 16);
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
		    &zp->z_pflags, 8);
		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
		    B_TRUE);
		err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
		ASSERT0(err);
		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);

		/* Mark all written pages clean and successful. */
		zfs_vmobject_wlock(object);
		for (i = 0; i < ncount; i++) {
			rtvals[i] = zfs_vm_pagerret_ok;
			vm_page_undirty(ma[i]);
		}
		zfs_vmobject_wunlock(object);
		PCPU_INC(cnt.v_vnodeout);
		PCPU_ADD(cnt.v_vnodepgsout, ncount);
	}
	dmu_tx_commit(tx);

out:
	zfs_range_unlock(rl);
	/* Synchronous/invalidating pageouts and sync=always need a commit. */
	if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 ||
	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zfsvfs->z_log, zp->z_id);
	ZFS_EXIT(zfsvfs);
	return (rtvals[0]);
}
4754
4755int
4756zfs_freebsd_putpages(ap)
4757	struct vop_putpages_args /* {
4758		struct vnode *a_vp;
4759		vm_page_t *a_m;
4760		int a_count;
4761		int a_sync;
4762		int *a_rtvals;
4763	} */ *ap;
4764{
4765
4766	return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync,
4767	    ap->a_rtvals));
4768}
4769
4770static int
4771zfs_freebsd_bmap(ap)
4772	struct vop_bmap_args /* {
4773		struct vnode *a_vp;
4774		daddr_t  a_bn;
4775		struct bufobj **a_bop;
4776		daddr_t *a_bnp;
4777		int *a_runp;
4778		int *a_runb;
4779	} */ *ap;
4780{
4781
4782	if (ap->a_bop != NULL)
4783		*ap->a_bop = &ap->a_vp->v_bufobj;
4784	if (ap->a_bnp != NULL)
4785		*ap->a_bnp = ap->a_bn;
4786	if (ap->a_runp != NULL)
4787		*ap->a_runp = 0;
4788	if (ap->a_runb != NULL)
4789		*ap->a_runb = 0;
4790
4791	return (0);
4792}
4793
4794static int
4795zfs_freebsd_open(ap)
4796	struct vop_open_args /* {
4797		struct vnode *a_vp;
4798		int a_mode;
4799		struct ucred *a_cred;
4800		struct thread *a_td;
4801	} */ *ap;
4802{
4803	vnode_t	*vp = ap->a_vp;
4804	znode_t *zp = VTOZ(vp);
4805	int error;
4806
4807	error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
4808	if (error == 0)
4809		vnode_create_vobject(vp, zp->z_size, ap->a_td);
4810	return (error);
4811}
4812
4813static int
4814zfs_freebsd_close(ap)
4815	struct vop_close_args /* {
4816		struct vnode *a_vp;
4817		int  a_fflag;
4818		struct ucred *a_cred;
4819		struct thread *a_td;
4820	} */ *ap;
4821{
4822
4823	return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL));
4824}
4825
4826static int
4827zfs_freebsd_ioctl(ap)
4828	struct vop_ioctl_args /* {
4829		struct vnode *a_vp;
4830		u_long a_command;
4831		caddr_t a_data;
4832		int a_fflag;
4833		struct ucred *cred;
4834		struct thread *td;
4835	} */ *ap;
4836{
4837
4838	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
4839	    ap->a_fflag, ap->a_cred, NULL, NULL));
4840}
4841
4842static int
4843ioflags(int ioflags)
4844{
4845	int flags = 0;
4846
4847	if (ioflags & IO_APPEND)
4848		flags |= FAPPEND;
4849	if (ioflags & IO_NDELAY)
4850		flags |= FNONBLOCK;
4851	if (ioflags & IO_SYNC)
4852		flags |= (FSYNC | FDSYNC | FRSYNC);
4853
4854	return (flags);
4855}
4856
4857static int
4858zfs_freebsd_read(ap)
4859	struct vop_read_args /* {
4860		struct vnode *a_vp;
4861		struct uio *a_uio;
4862		int a_ioflag;
4863		struct ucred *a_cred;
4864	} */ *ap;
4865{
4866
4867	return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
4868	    ap->a_cred, NULL));
4869}
4870
4871static int
4872zfs_freebsd_write(ap)
4873	struct vop_write_args /* {
4874		struct vnode *a_vp;
4875		struct uio *a_uio;
4876		int a_ioflag;
4877		struct ucred *a_cred;
4878	} */ *ap;
4879{
4880
4881	return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
4882	    ap->a_cred, NULL));
4883}
4884
4885static int
4886zfs_freebsd_access(ap)
4887	struct vop_access_args /* {
4888		struct vnode *a_vp;
4889		accmode_t a_accmode;
4890		struct ucred *a_cred;
4891		struct thread *a_td;
4892	} */ *ap;
4893{
4894	vnode_t *vp = ap->a_vp;
4895	znode_t *zp = VTOZ(vp);
4896	accmode_t accmode;
4897	int error = 0;
4898
4899	/*
4900	 * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND,
4901	 */
4902	accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
4903	if (accmode != 0)
4904		error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL);
4905
4906	/*
4907	 * VADMIN has to be handled by vaccess().
4908	 */
4909	if (error == 0) {
4910		accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
4911		if (accmode != 0) {
4912			error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
4913			    zp->z_gid, accmode, ap->a_cred, NULL);
4914		}
4915	}
4916
4917	/*
4918	 * For VEXEC, ensure that at least one execute bit is set for
4919	 * non-directories.
4920	 */
4921	if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
4922	    (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
4923		error = EACCES;
4924	}
4925
4926	return (error);
4927}
4928
4929static int
4930zfs_freebsd_lookup(ap)
4931	struct vop_lookup_args /* {
4932		struct vnode *a_dvp;
4933		struct vnode **a_vpp;
4934		struct componentname *a_cnp;
4935	} */ *ap;
4936{
4937	struct componentname *cnp = ap->a_cnp;
4938	char nm[NAME_MAX + 1];
4939
4940	ASSERT(cnp->cn_namelen < sizeof(nm));
4941	strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
4942
4943	return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
4944	    cnp->cn_cred, cnp->cn_thread, 0));
4945}
4946
4947static int
4948zfs_cache_lookup(ap)
4949	struct vop_lookup_args /* {
4950		struct vnode *a_dvp;
4951		struct vnode **a_vpp;
4952		struct componentname *a_cnp;
4953	} */ *ap;
4954{
4955	zfsvfs_t *zfsvfs;
4956
4957	zfsvfs = ap->a_dvp->v_mount->mnt_data;
4958	if (zfsvfs->z_use_namecache)
4959		return (vfs_cache_lookup(ap));
4960	else
4961		return (zfs_freebsd_lookup(ap));
4962}
4963
4964static int
4965zfs_freebsd_create(ap)
4966	struct vop_create_args /* {
4967		struct vnode *a_dvp;
4968		struct vnode **a_vpp;
4969		struct componentname *a_cnp;
4970		struct vattr *a_vap;
4971	} */ *ap;
4972{
4973	zfsvfs_t *zfsvfs;
4974	struct componentname *cnp = ap->a_cnp;
4975	vattr_t *vap = ap->a_vap;
4976	int error, mode;
4977
4978	ASSERT(cnp->cn_flags & SAVENAME);
4979
4980	vattr_init_mask(vap);
4981	mode = vap->va_mode & ALLPERMS;
4982	zfsvfs = ap->a_dvp->v_mount->mnt_data;
4983
4984	error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
4985	    ap->a_vpp, cnp->cn_cred, cnp->cn_thread);
4986	if (zfsvfs->z_use_namecache &&
4987	    error == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
4988		cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
4989	return (error);
4990}
4991
4992static int
4993zfs_freebsd_remove(ap)
4994	struct vop_remove_args /* {
4995		struct vnode *a_dvp;
4996		struct vnode *a_vp;
4997		struct componentname *a_cnp;
4998	} */ *ap;
4999{
5000
5001	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
5002
5003	return (zfs_remove(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
5004	    ap->a_cnp->cn_cred));
5005}
5006
5007static int
5008zfs_freebsd_mkdir(ap)
5009	struct vop_mkdir_args /* {
5010		struct vnode *a_dvp;
5011		struct vnode **a_vpp;
5012		struct componentname *a_cnp;
5013		struct vattr *a_vap;
5014	} */ *ap;
5015{
5016	vattr_t *vap = ap->a_vap;
5017
5018	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
5019
5020	vattr_init_mask(vap);
5021
5022	return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
5023	    ap->a_cnp->cn_cred));
5024}
5025
5026static int
5027zfs_freebsd_rmdir(ap)
5028	struct vop_rmdir_args /* {
5029		struct vnode *a_dvp;
5030		struct vnode *a_vp;
5031		struct componentname *a_cnp;
5032	} */ *ap;
5033{
5034	struct componentname *cnp = ap->a_cnp;
5035
5036	ASSERT(cnp->cn_flags & SAVENAME);
5037
5038	return (zfs_rmdir(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
5039}
5040
5041static int
5042zfs_freebsd_readdir(ap)
5043	struct vop_readdir_args /* {
5044		struct vnode *a_vp;
5045		struct uio *a_uio;
5046		struct ucred *a_cred;
5047		int *a_eofflag;
5048		int *a_ncookies;
5049		u_long **a_cookies;
5050	} */ *ap;
5051{
5052
5053	return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
5054	    ap->a_ncookies, ap->a_cookies));
5055}
5056
5057static int
5058zfs_freebsd_fsync(ap)
5059	struct vop_fsync_args /* {
5060		struct vnode *a_vp;
5061		int a_waitfor;
5062		struct thread *a_td;
5063	} */ *ap;
5064{
5065
5066	vop_stdfsync(ap);
5067	return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL));
5068}
5069
/*
 * VOP_GETATTR() entry point: fetch the attributes via zfs_getattr(),
 * requesting the full set of ZFS extended system attributes, then fold
 * the returned xoptattr flags into the FreeBSD chflags(2) va_flags word.
 */
static int
zfs_freebsd_getattr(ap)
	struct vop_getattr_args /* {
		struct vnode *a_vp;
		struct vattr *a_vap;
		struct ucred *a_cred;
	} */ *ap;
{
	vattr_t *vap = ap->a_vap;
	xvattr_t xvap;
	u_long fflags = 0;
	int error;

	xva_init(&xvap);
	xvap.xva_vattr = *vap;
	xvap.xva_vattr.va_mask |= AT_XVATTR;

	/* Convert chflags into ZFS-type flags. */
	/* XXX: what about SF_SETTABLE?. */
	XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
	XVA_SET_REQ(&xvap, XAT_APPENDONLY);
	XVA_SET_REQ(&xvap, XAT_NOUNLINK);
	XVA_SET_REQ(&xvap, XAT_NODUMP);
	XVA_SET_REQ(&xvap, XAT_READONLY);
	XVA_SET_REQ(&xvap, XAT_ARCHIVE);
	XVA_SET_REQ(&xvap, XAT_SYSTEM);
	XVA_SET_REQ(&xvap, XAT_HIDDEN);
	XVA_SET_REQ(&xvap, XAT_REPARSE);
	XVA_SET_REQ(&xvap, XAT_OFFLINE);
	XVA_SET_REQ(&xvap, XAT_SPARSE);

	error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
	if (error != 0)
		return (error);

	/* Convert ZFS xattr into chflags. */
	/*
	 * For each attribute that zfs_getattr() actually returned, set the
	 * corresponding SF_/UF_ bit when the attribute is non-zero.
	 */
#define	FLAG_CHECK(fflag, xflag, xfield)	do {			\
	if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0)		\
		fflags |= (fflag);					\
} while (0)
	FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
	    xvap.xva_xoptattrs.xoa_immutable);
	FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
	    xvap.xva_xoptattrs.xoa_appendonly);
	FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
	    xvap.xva_xoptattrs.xoa_nounlink);
	FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE,
	    xvap.xva_xoptattrs.xoa_archive);
	FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
	    xvap.xva_xoptattrs.xoa_nodump);
	FLAG_CHECK(UF_READONLY, XAT_READONLY,
	    xvap.xva_xoptattrs.xoa_readonly);
	FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM,
	    xvap.xva_xoptattrs.xoa_system);
	FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN,
	    xvap.xva_xoptattrs.xoa_hidden);
	FLAG_CHECK(UF_REPARSE, XAT_REPARSE,
	    xvap.xva_xoptattrs.xoa_reparse);
	FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE,
	    xvap.xva_xoptattrs.xoa_offline);
	FLAG_CHECK(UF_SPARSE, XAT_SPARSE,
	    xvap.xva_xoptattrs.xoa_sparse);

#undef	FLAG_CHECK
	*vap = xvap.xva_vattr;
	vap->va_flags = fflags;
	return (0);
}
5138
5139static int
5140zfs_freebsd_setattr(ap)
5141	struct vop_setattr_args /* {
5142		struct vnode *a_vp;
5143		struct vattr *a_vap;
5144		struct ucred *a_cred;
5145	} */ *ap;
5146{
5147	vnode_t *vp = ap->a_vp;
5148	vattr_t *vap = ap->a_vap;
5149	cred_t *cred = ap->a_cred;
5150	xvattr_t xvap;
5151	u_long fflags;
5152	uint64_t zflags;
5153
5154	vattr_init_mask(vap);
5155	vap->va_mask &= ~AT_NOSET;
5156
5157	xva_init(&xvap);
5158	xvap.xva_vattr = *vap;
5159
5160	zflags = VTOZ(vp)->z_pflags;
5161
5162	if (vap->va_flags != VNOVAL) {
5163		zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
5164		int error;
5165
5166		if (zfsvfs->z_use_fuids == B_FALSE)
5167			return (EOPNOTSUPP);
5168
5169		fflags = vap->va_flags;
5170		/*
5171		 * XXX KDM
5172		 * We need to figure out whether it makes sense to allow
5173		 * UF_REPARSE through, since we don't really have other
5174		 * facilities to handle reparse points and zfs_setattr()
5175		 * doesn't currently allow setting that attribute anyway.
5176		 */
5177		if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE|
5178		     UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE|
5179		     UF_OFFLINE|UF_SPARSE)) != 0)
5180			return (EOPNOTSUPP);
5181		/*
5182		 * Unprivileged processes are not permitted to unset system
5183		 * flags, or modify flags if any system flags are set.
5184		 * Privileged non-jail processes may not modify system flags
5185		 * if securelevel > 0 and any existing system flags are set.
5186		 * Privileged jail processes behave like privileged non-jail
5187		 * processes if the security.jail.chflags_allowed sysctl is
5188		 * is non-zero; otherwise, they behave like unprivileged
5189		 * processes.
5190		 */
5191		if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
5192		    priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) {
5193			if (zflags &
5194			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
5195				error = securelevel_gt(cred, 0);
5196				if (error != 0)
5197					return (error);
5198			}
5199		} else {
5200			/*
5201			 * Callers may only modify the file flags on objects they
5202			 * have VADMIN rights for.
5203			 */
5204			if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0)
5205				return (error);
5206			if (zflags &
5207			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
5208				return (EPERM);
5209			}
5210			if (fflags &
5211			    (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
5212				return (EPERM);
5213			}
5214		}
5215
5216#define	FLAG_CHANGE(fflag, zflag, xflag, xfield)	do {		\
5217	if (((fflags & (fflag)) && !(zflags & (zflag))) ||		\
5218	    ((zflags & (zflag)) && !(fflags & (fflag)))) {		\
5219		XVA_SET_REQ(&xvap, (xflag));				\
5220		(xfield) = ((fflags & (fflag)) != 0);			\
5221	}								\
5222} while (0)
5223		/* Convert chflags into ZFS-type flags. */
5224		/* XXX: what about SF_SETTABLE?. */
5225		FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
5226		    xvap.xva_xoptattrs.xoa_immutable);
5227		FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
5228		    xvap.xva_xoptattrs.xoa_appendonly);
5229		FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
5230		    xvap.xva_xoptattrs.xoa_nounlink);
5231		FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE,
5232		    xvap.xva_xoptattrs.xoa_archive);
5233		FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
5234		    xvap.xva_xoptattrs.xoa_nodump);
5235		FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY,
5236		    xvap.xva_xoptattrs.xoa_readonly);
5237		FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM,
5238		    xvap.xva_xoptattrs.xoa_system);
5239		FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN,
5240		    xvap.xva_xoptattrs.xoa_hidden);
5241		FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE,
5242		    xvap.xva_xoptattrs.xoa_hidden);
5243		FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE,
5244		    xvap.xva_xoptattrs.xoa_offline);
5245		FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE,
5246		    xvap.xva_xoptattrs.xoa_sparse);
5247#undef	FLAG_CHANGE
5248	}
5249	if (vap->va_birthtime.tv_sec != VNOVAL) {
5250		xvap.xva_vattr.va_mask |= AT_XVATTR;
5251		XVA_SET_REQ(&xvap, XAT_CREATETIME);
5252	}
5253	return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL));
5254}
5255
5256static int
5257zfs_freebsd_rename(ap)
5258	struct vop_rename_args  /* {
5259		struct vnode *a_fdvp;
5260		struct vnode *a_fvp;
5261		struct componentname *a_fcnp;
5262		struct vnode *a_tdvp;
5263		struct vnode *a_tvp;
5264		struct componentname *a_tcnp;
5265	} */ *ap;
5266{
5267	vnode_t *fdvp = ap->a_fdvp;
5268	vnode_t *fvp = ap->a_fvp;
5269	vnode_t *tdvp = ap->a_tdvp;
5270	vnode_t *tvp = ap->a_tvp;
5271	int error;
5272
5273	ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
5274	ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
5275
5276	error = zfs_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
5277	    ap->a_tcnp, ap->a_fcnp->cn_cred);
5278
5279	vrele(fdvp);
5280	vrele(fvp);
5281	vrele(tdvp);
5282	if (tvp != NULL)
5283		vrele(tvp);
5284
5285	return (error);
5286}
5287
5288static int
5289zfs_freebsd_symlink(ap)
5290	struct vop_symlink_args /* {
5291		struct vnode *a_dvp;
5292		struct vnode **a_vpp;
5293		struct componentname *a_cnp;
5294		struct vattr *a_vap;
5295		char *a_target;
5296	} */ *ap;
5297{
5298	struct componentname *cnp = ap->a_cnp;
5299	vattr_t *vap = ap->a_vap;
5300
5301	ASSERT(cnp->cn_flags & SAVENAME);
5302
5303	vap->va_type = VLNK;	/* FreeBSD: Syscall only sets va_mode. */
5304	vattr_init_mask(vap);
5305
5306	return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
5307	    ap->a_target, cnp->cn_cred, cnp->cn_thread));
5308}
5309
5310static int
5311zfs_freebsd_readlink(ap)
5312	struct vop_readlink_args /* {
5313		struct vnode *a_vp;
5314		struct uio *a_uio;
5315		struct ucred *a_cred;
5316	} */ *ap;
5317{
5318
5319	return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
5320}
5321
5322static int
5323zfs_freebsd_link(ap)
5324	struct vop_link_args /* {
5325		struct vnode *a_tdvp;
5326		struct vnode *a_vp;
5327		struct componentname *a_cnp;
5328	} */ *ap;
5329{
5330	struct componentname *cnp = ap->a_cnp;
5331	vnode_t *vp = ap->a_vp;
5332	vnode_t *tdvp = ap->a_tdvp;
5333
5334	if (tdvp->v_mount != vp->v_mount)
5335		return (EXDEV);
5336
5337	ASSERT(cnp->cn_flags & SAVENAME);
5338
5339	return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
5340}
5341
5342static int
5343zfs_freebsd_inactive(ap)
5344	struct vop_inactive_args /* {
5345		struct vnode *a_vp;
5346		struct thread *a_td;
5347	} */ *ap;
5348{
5349	vnode_t *vp = ap->a_vp;
5350
5351	zfs_inactive(vp, ap->a_td->td_ucred, NULL);
5352	return (0);
5353}
5354
/*
 * VOP_RECLAIM() entry point: tear down the vnode's VM object and detach
 * the znode from the vnode.  The znode itself is freed directly when
 * its SA handle is already gone (forced unmount), otherwise it goes
 * through zfs_zinactive().
 */
static int
zfs_freebsd_reclaim(ap)
	struct vop_reclaim_args /* {
		struct vnode *a_vp;
		struct thread *a_td;
	} */ *ap;
{
	vnode_t	*vp = ap->a_vp;
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ASSERT(zp != NULL);

	/* Destroy the vm object and flush associated pages. */
	vnode_destroy_vobject(vp);

	/*
	 * z_teardown_inactive_lock protects from a race with
	 * zfs_znode_dmu_fini in zfsvfs_teardown during
	 * force unmount.
	 */
	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
	if (zp->z_sa_hdl == NULL)
		zfs_znode_free(zp);
	else
		zfs_zinactive(zp);
	rw_exit(&zfsvfs->z_teardown_inactive_lock);

	/* Detach the znode; the vnode no longer references ZFS state. */
	vp->v_data = NULL;
	return (0);
}
5386
5387static int
5388zfs_freebsd_fid(ap)
5389	struct vop_fid_args /* {
5390		struct vnode *a_vp;
5391		struct fid *a_fid;
5392	} */ *ap;
5393{
5394
5395	return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
5396}
5397
5398static int
5399zfs_freebsd_pathconf(ap)
5400	struct vop_pathconf_args /* {
5401		struct vnode *a_vp;
5402		int a_name;
5403		register_t *a_retval;
5404	} */ *ap;
5405{
5406	ulong_t val;
5407	int error;
5408
5409	error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL);
5410	if (error == 0) {
5411		*ap->a_retval = val;
5412		return (error);
5413	}
5414	if (error != EOPNOTSUPP)
5415		return (error);
5416
5417	switch (ap->a_name) {
5418	case _PC_NAME_MAX:
5419		*ap->a_retval = NAME_MAX;
5420		return (0);
5421	case _PC_PIPE_BUF:
5422		if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) {
5423			*ap->a_retval = PIPE_BUF;
5424			return (0);
5425		}
5426		return (EINVAL);
5427	default:
5428		return (vop_stdpathconf(ap));
5429	}
5430}
5431
5432/*
5433 * FreeBSD's extended attributes namespace defines file name prefix for ZFS'
5434 * extended attribute name:
5435 *
5436 *	NAMESPACE	PREFIX
5437 *	system		freebsd:system:
5438 *	user		(none, can be used to access ZFS fsattr(5) attributes
5439 *			created on Solaris)
5440 */
5441static int
5442zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
5443    size_t size)
5444{
5445	const char *namespace, *prefix, *suffix;
5446
5447	/* We don't allow '/' character in attribute name. */
5448	if (strchr(name, '/') != NULL)
5449		return (EINVAL);
5450	/* We don't allow attribute names that start with "freebsd:" string. */
5451	if (strncmp(name, "freebsd:", 8) == 0)
5452		return (EINVAL);
5453
5454	bzero(attrname, size);
5455
5456	switch (attrnamespace) {
5457	case EXTATTR_NAMESPACE_USER:
5458#if 0
5459		prefix = "freebsd:";
5460		namespace = EXTATTR_NAMESPACE_USER_STRING;
5461		suffix = ":";
5462#else
5463		/*
5464		 * This is the default namespace by which we can access all
5465		 * attributes created on Solaris.
5466		 */
5467		prefix = namespace = suffix = "";
5468#endif
5469		break;
5470	case EXTATTR_NAMESPACE_SYSTEM:
5471		prefix = "freebsd:";
5472		namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
5473		suffix = ":";
5474		break;
5475	case EXTATTR_NAMESPACE_EMPTY:
5476	default:
5477		return (EINVAL);
5478	}
5479	if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
5480	    name) >= size) {
5481		return (ENAMETOOLONG);
5482	}
5483	return (0);
5484}
5485
5486/*
5487 * Vnode operating to retrieve a named extended attribute.
5488 */
5489static int
5490zfs_getextattr(struct vop_getextattr_args *ap)
5491/*
5492vop_getextattr {
5493	IN struct vnode *a_vp;
5494	IN int a_attrnamespace;
5495	IN const char *a_name;
5496	INOUT struct uio *a_uio;
5497	OUT size_t *a_size;
5498	IN struct ucred *a_cred;
5499	IN struct thread *a_td;
5500};
5501*/
5502{
5503	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5504	struct thread *td = ap->a_td;
5505	struct nameidata nd;
5506	char attrname[255];
5507	struct vattr va;
5508	vnode_t *xvp = NULL, *vp;
5509	int error, flags;
5510
5511	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5512	    ap->a_cred, ap->a_td, VREAD);
5513	if (error != 0)
5514		return (error);
5515
5516	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5517	    sizeof(attrname));
5518	if (error != 0)
5519		return (error);
5520
5521	ZFS_ENTER(zfsvfs);
5522
5523	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5524	    LOOKUP_XATTR);
5525	if (error != 0) {
5526		ZFS_EXIT(zfsvfs);
5527		return (error);
5528	}
5529
5530	flags = FREAD;
5531	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
5532	    xvp, td);
5533	error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL);
5534	vp = nd.ni_vp;
5535	NDFREE(&nd, NDF_ONLY_PNBUF);
5536	if (error != 0) {
5537		ZFS_EXIT(zfsvfs);
5538		if (error == ENOENT)
5539			error = ENOATTR;
5540		return (error);
5541	}
5542
5543	if (ap->a_size != NULL) {
5544		error = VOP_GETATTR(vp, &va, ap->a_cred);
5545		if (error == 0)
5546			*ap->a_size = (size_t)va.va_size;
5547	} else if (ap->a_uio != NULL)
5548		error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred);
5549
5550	VOP_UNLOCK(vp, 0);
5551	vn_close(vp, flags, ap->a_cred, td);
5552	ZFS_EXIT(zfsvfs);
5553
5554	return (error);
5555}
5556
5557/*
5558 * Vnode operation to remove a named attribute.
5559 */
5560int
5561zfs_deleteextattr(struct vop_deleteextattr_args *ap)
5562/*
5563vop_deleteextattr {
5564	IN struct vnode *a_vp;
5565	IN int a_attrnamespace;
5566	IN const char *a_name;
5567	IN struct ucred *a_cred;
5568	IN struct thread *a_td;
5569};
5570*/
5571{
5572	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5573	struct thread *td = ap->a_td;
5574	struct nameidata nd;
5575	char attrname[255];
5576	struct vattr va;
5577	vnode_t *xvp = NULL, *vp;
5578	int error, flags;
5579
5580	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5581	    ap->a_cred, ap->a_td, VWRITE);
5582	if (error != 0)
5583		return (error);
5584
5585	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5586	    sizeof(attrname));
5587	if (error != 0)
5588		return (error);
5589
5590	ZFS_ENTER(zfsvfs);
5591
5592	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5593	    LOOKUP_XATTR);
5594	if (error != 0) {
5595		ZFS_EXIT(zfsvfs);
5596		return (error);
5597	}
5598
5599	NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
5600	    UIO_SYSSPACE, attrname, xvp, td);
5601	error = namei(&nd);
5602	vp = nd.ni_vp;
5603	if (error != 0) {
5604		ZFS_EXIT(zfsvfs);
5605		NDFREE(&nd, NDF_ONLY_PNBUF);
5606		if (error == ENOENT)
5607			error = ENOATTR;
5608		return (error);
5609	}
5610
5611	error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
5612	NDFREE(&nd, NDF_ONLY_PNBUF);
5613
5614	vput(nd.ni_dvp);
5615	if (vp == nd.ni_dvp)
5616		vrele(vp);
5617	else
5618		vput(vp);
5619	ZFS_EXIT(zfsvfs);
5620
5621	return (error);
5622}
5623
5624/*
5625 * Vnode operation to set a named attribute.
5626 */
5627static int
5628zfs_setextattr(struct vop_setextattr_args *ap)
5629/*
5630vop_setextattr {
5631	IN struct vnode *a_vp;
5632	IN int a_attrnamespace;
5633	IN const char *a_name;
5634	INOUT struct uio *a_uio;
5635	IN struct ucred *a_cred;
5636	IN struct thread *a_td;
5637};
5638*/
5639{
5640	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5641	struct thread *td = ap->a_td;
5642	struct nameidata nd;
5643	char attrname[255];
5644	struct vattr va;
5645	vnode_t *xvp = NULL, *vp;
5646	int error, flags;
5647
5648	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5649	    ap->a_cred, ap->a_td, VWRITE);
5650	if (error != 0)
5651		return (error);
5652
5653	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5654	    sizeof(attrname));
5655	if (error != 0)
5656		return (error);
5657
5658	ZFS_ENTER(zfsvfs);
5659
5660	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5661	    LOOKUP_XATTR | CREATE_XATTR_DIR);
5662	if (error != 0) {
5663		ZFS_EXIT(zfsvfs);
5664		return (error);
5665	}
5666
5667	flags = FFLAGS(O_WRONLY | O_CREAT);
5668	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
5669	    xvp, td);
5670	error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL);
5671	vp = nd.ni_vp;
5672	NDFREE(&nd, NDF_ONLY_PNBUF);
5673	if (error != 0) {
5674		ZFS_EXIT(zfsvfs);
5675		return (error);
5676	}
5677
5678	VATTR_NULL(&va);
5679	va.va_size = 0;
5680	error = VOP_SETATTR(vp, &va, ap->a_cred);
5681	if (error == 0)
5682		VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred);
5683
5684	VOP_UNLOCK(vp, 0);
5685	vn_close(vp, flags, ap->a_cred, td);
5686	ZFS_EXIT(zfsvfs);
5687
5688	return (error);
5689}
5690
5691/*
5692 * Vnode operation to retrieve extended attributes on a vnode.
5693 */
5694static int
5695zfs_listextattr(struct vop_listextattr_args *ap)
5696/*
5697vop_listextattr {
5698	IN struct vnode *a_vp;
5699	IN int a_attrnamespace;
5700	INOUT struct uio *a_uio;
5701	OUT size_t *a_size;
5702	IN struct ucred *a_cred;
5703	IN struct thread *a_td;
5704};
5705*/
5706{
5707	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5708	struct thread *td = ap->a_td;
5709	struct nameidata nd;
5710	char attrprefix[16];
5711	u_char dirbuf[sizeof(struct dirent)];
5712	struct dirent *dp;
5713	struct iovec aiov;
5714	struct uio auio, *uio = ap->a_uio;
5715	size_t *sizep = ap->a_size;
5716	size_t plen;
5717	vnode_t *xvp = NULL, *vp;
5718	int done, error, eof, pos;
5719
5720	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5721	    ap->a_cred, ap->a_td, VREAD);
5722	if (error != 0)
5723		return (error);
5724
5725	error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
5726	    sizeof(attrprefix));
5727	if (error != 0)
5728		return (error);
5729	plen = strlen(attrprefix);
5730
5731	ZFS_ENTER(zfsvfs);
5732
5733	if (sizep != NULL)
5734		*sizep = 0;
5735
5736	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5737	    LOOKUP_XATTR);
5738	if (error != 0) {
5739		ZFS_EXIT(zfsvfs);
5740		/*
5741		 * ENOATTR means that the EA directory does not yet exist,
5742		 * i.e. there are no extended attributes there.
5743		 */
5744		if (error == ENOATTR)
5745			error = 0;
5746		return (error);
5747	}
5748
5749	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
5750	    UIO_SYSSPACE, ".", xvp, td);
5751	error = namei(&nd);
5752	vp = nd.ni_vp;
5753	NDFREE(&nd, NDF_ONLY_PNBUF);
5754	if (error != 0) {
5755		ZFS_EXIT(zfsvfs);
5756		return (error);
5757	}
5758
5759	auio.uio_iov = &aiov;
5760	auio.uio_iovcnt = 1;
5761	auio.uio_segflg = UIO_SYSSPACE;
5762	auio.uio_td = td;
5763	auio.uio_rw = UIO_READ;
5764	auio.uio_offset = 0;
5765
5766	do {
5767		u_char nlen;
5768
5769		aiov.iov_base = (void *)dirbuf;
5770		aiov.iov_len = sizeof(dirbuf);
5771		auio.uio_resid = sizeof(dirbuf);
5772		error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
5773		done = sizeof(dirbuf) - auio.uio_resid;
5774		if (error != 0)
5775			break;
5776		for (pos = 0; pos < done;) {
5777			dp = (struct dirent *)(dirbuf + pos);
5778			pos += dp->d_reclen;
5779			/*
5780			 * XXX: Temporarily we also accept DT_UNKNOWN, as this
5781			 * is what we get when attribute was created on Solaris.
5782			 */
5783			if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
5784				continue;
5785			if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0)
5786				continue;
5787			else if (strncmp(dp->d_name, attrprefix, plen) != 0)
5788				continue;
5789			nlen = dp->d_namlen - plen;
5790			if (sizep != NULL)
5791				*sizep += 1 + nlen;
5792			else if (uio != NULL) {
5793				/*
5794				 * Format of extattr name entry is one byte for
5795				 * length and the rest for name.
5796				 */
5797				error = uiomove(&nlen, 1, uio->uio_rw, uio);
5798				if (error == 0) {
5799					error = uiomove(dp->d_name + plen, nlen,
5800					    uio->uio_rw, uio);
5801				}
5802				if (error != 0)
5803					break;
5804			}
5805		}
5806	} while (!eof && error == 0);
5807
5808	vput(vp);
5809	ZFS_EXIT(zfsvfs);
5810
5811	return (error);
5812}
5813
5814int
5815zfs_freebsd_getacl(ap)
5816	struct vop_getacl_args /* {
5817		struct vnode *vp;
5818		acl_type_t type;
5819		struct acl *aclp;
5820		struct ucred *cred;
5821		struct thread *td;
5822	} */ *ap;
5823{
5824	int		error;
5825	vsecattr_t      vsecattr;
5826
5827	if (ap->a_type != ACL_TYPE_NFS4)
5828		return (EINVAL);
5829
5830	vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
5831	if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL))
5832		return (error);
5833
5834	error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt);
5835	if (vsecattr.vsa_aclentp != NULL)
5836		kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);
5837
5838	return (error);
5839}
5840
5841int
5842zfs_freebsd_setacl(ap)
5843	struct vop_setacl_args /* {
5844		struct vnode *vp;
5845		acl_type_t type;
5846		struct acl *aclp;
5847		struct ucred *cred;
5848		struct thread *td;
5849	} */ *ap;
5850{
5851	int		error;
5852	vsecattr_t      vsecattr;
5853	int		aclbsize;	/* size of acl list in bytes */
5854	aclent_t	*aaclp;
5855
5856	if (ap->a_type != ACL_TYPE_NFS4)
5857		return (EINVAL);
5858
5859	if (ap->a_aclp == NULL)
5860		return (EINVAL);
5861
5862	if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
5863		return (EINVAL);
5864
5865	/*
5866	 * With NFSv4 ACLs, chmod(2) may need to add additional entries,
5867	 * splitting every entry into two and appending "canonical six"
5868	 * entries at the end.  Don't allow for setting an ACL that would
5869	 * cause chmod(2) to run out of ACL entries.
5870	 */
5871	if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
5872		return (ENOSPC);
5873
5874	error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
5875	if (error != 0)
5876		return (error);
5877
5878	vsecattr.vsa_mask = VSA_ACE;
5879	aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t);
5880	vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
5881	aaclp = vsecattr.vsa_aclentp;
5882	vsecattr.vsa_aclentsz = aclbsize;
5883
5884	aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
5885	error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL);
5886	kmem_free(aaclp, aclbsize);
5887
5888	return (error);
5889}
5890
5891int
5892zfs_freebsd_aclcheck(ap)
5893	struct vop_aclcheck_args /* {
5894		struct vnode *vp;
5895		acl_type_t type;
5896		struct acl *aclp;
5897		struct ucred *cred;
5898		struct thread *td;
5899	} */ *ap;
5900{
5901
5902	return (EOPNOTSUPP);
5903}
5904
5905static int
5906zfs_vptocnp(struct vop_vptocnp_args *ap)
5907{
5908	vnode_t *covered_vp;
5909	vnode_t *vp = ap->a_vp;;
5910	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
5911	znode_t *zp = VTOZ(vp);
5912	int ltype;
5913	int error;
5914
5915	ZFS_ENTER(zfsvfs);
5916	ZFS_VERIFY_ZP(zp);
5917
5918	/*
5919	 * If we are a snapshot mounted under .zfs, run the operation
5920	 * on the covered vnode.
5921	 */
5922	if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) {
5923		char name[MAXNAMLEN + 1];
5924		znode_t *dzp;
5925		size_t len;
5926
5927		error = zfs_znode_parent_and_name(zp, &dzp, name);
5928		if (error == 0) {
5929			len = strlen(name);
5930			if (*ap->a_buflen < len)
5931				error = SET_ERROR(ENOMEM);
5932		}
5933		if (error == 0) {
5934			*ap->a_buflen -= len;
5935			bcopy(name, ap->a_buf + *ap->a_buflen, len);
5936			*ap->a_vpp = ZTOV(dzp);
5937		}
5938		ZFS_EXIT(zfsvfs);
5939		return (error);
5940	}
5941	ZFS_EXIT(zfsvfs);
5942
5943	covered_vp = vp->v_mount->mnt_vnodecovered;
5944	vhold(covered_vp);
5945	ltype = VOP_ISLOCKED(vp);
5946	VOP_UNLOCK(vp, 0);
5947	error = vget(covered_vp, LK_SHARED | LK_VNHELD, curthread);
5948	if (error == 0) {
5949		error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred,
5950		    ap->a_buf, ap->a_buflen);
5951		vput(covered_vp);
5952	}
5953	vn_lock(vp, ltype | LK_RETRY);
5954	if ((vp->v_iflag & VI_DOOMED) != 0)
5955		error = SET_ERROR(ENOENT);
5956	return (error);
5957}
5958
#ifdef DIAGNOSTIC
/*
 * Diagnostic wrapper around vop_stdlock().  After a potentially
 * blocking lock acquisition succeeds, assert that this thread does not
 * hold the zfsvfs teardown lock for a live, non-xattr vnode.
 */
static int
zfs_lock(struct vop_lock1_args *ap)
{
	vnode_t *vp;
	znode_t *zp;
	int error;

	error = vop_stdlock(ap);
	if (error != 0 || (ap->a_flags & LK_NOWAIT) != 0)
		return (error);

	vp = ap->a_vp;
	zp = vp->v_data;
	if (vp->v_mount != NULL && (vp->v_iflag & VI_DOOMED) == 0 &&
	    zp != NULL && (zp->z_pflags & ZFS_XATTR) == 0)
		VERIFY(!RRM_LOCK_HELD(&zp->z_zfsvfs->z_teardown_lock));
	return (error);
}
#endif
5984
/*
 * Forward declarations of the vnode operation vectors defined below.
 */
struct vop_vector zfs_vnodeops;
struct vop_vector zfs_fifoops;
struct vop_vector zfs_shareops;
5988
/*
 * Vnode operations template for regular ZFS files and directories.
 * Unlisted operations fall through to default_vnodeops.
 */
struct vop_vector zfs_vnodeops = {
	.vop_default =		&default_vnodeops,
	.vop_inactive =		zfs_freebsd_inactive,
	.vop_reclaim =		zfs_freebsd_reclaim,
	.vop_access =		zfs_freebsd_access,
	.vop_lookup =		zfs_cache_lookup,
	.vop_cachedlookup =	zfs_freebsd_lookup,
	.vop_getattr =		zfs_freebsd_getattr,
	.vop_setattr =		zfs_freebsd_setattr,
	.vop_create =		zfs_freebsd_create,
	.vop_mknod =		zfs_freebsd_create,
	.vop_mkdir =		zfs_freebsd_mkdir,
	.vop_readdir =		zfs_freebsd_readdir,
	.vop_fsync =		zfs_freebsd_fsync,
	.vop_open =		zfs_freebsd_open,
	.vop_close =		zfs_freebsd_close,
	.vop_rmdir =		zfs_freebsd_rmdir,
	.vop_ioctl =		zfs_freebsd_ioctl,
	.vop_link =		zfs_freebsd_link,
	.vop_symlink =		zfs_freebsd_symlink,
	.vop_readlink =		zfs_freebsd_readlink,
	.vop_read =		zfs_freebsd_read,
	.vop_write =		zfs_freebsd_write,
	.vop_remove =		zfs_freebsd_remove,
	.vop_rename =		zfs_freebsd_rename,
	.vop_pathconf =		zfs_freebsd_pathconf,
	.vop_bmap =		zfs_freebsd_bmap,
	.vop_fid =		zfs_freebsd_fid,
	.vop_getextattr =	zfs_getextattr,
	.vop_deleteextattr =	zfs_deleteextattr,
	.vop_setextattr =	zfs_setextattr,
	.vop_listextattr =	zfs_listextattr,
	.vop_getacl =		zfs_freebsd_getacl,
	.vop_setacl =		zfs_freebsd_setacl,
	.vop_aclcheck =		zfs_freebsd_aclcheck,
	.vop_getpages =		zfs_freebsd_getpages,
	.vop_putpages =		zfs_freebsd_putpages,
	.vop_vptocnp =		zfs_vptocnp,
#ifdef DIAGNOSTIC
	.vop_lock1 =		zfs_lock,
#endif
};
6031
/*
 * Vnode operations template for FIFOs on ZFS.  Data operations go
 * through fifo_specops; read/write must never be reached here, hence
 * VOP_PANIC.  Metadata operations are handled by ZFS.
 */
struct vop_vector zfs_fifoops = {
	.vop_default =		&fifo_specops,
	.vop_fsync =		zfs_freebsd_fsync,
	.vop_access =		zfs_freebsd_access,
	.vop_getattr =		zfs_freebsd_getattr,
	.vop_inactive =		zfs_freebsd_inactive,
	.vop_read =		VOP_PANIC,
	.vop_reclaim =		zfs_freebsd_reclaim,
	.vop_setattr =		zfs_freebsd_setattr,
	.vop_write =		VOP_PANIC,
	.vop_pathconf = 	zfs_freebsd_pathconf,
	.vop_fid =		zfs_freebsd_fid,
	.vop_getacl =		zfs_freebsd_getacl,
	.vop_setacl =		zfs_freebsd_setacl,
	.vop_aclcheck =		zfs_freebsd_aclcheck,
};
6048
6049/*
6050 * special share hidden files vnode operations template
6051 */
6052struct vop_vector zfs_shareops = {
6053	.vop_default =		&default_vnodeops,
6054	.vop_access =		zfs_freebsd_access,
6055	.vop_inactive =		zfs_freebsd_inactive,
6056	.vop_reclaim =		zfs_freebsd_reclaim,
6057	.vop_fid =		zfs_freebsd_fid,
6058	.vop_pathconf =		zfs_freebsd_pathconf,
6059};
6060