zfs_vnops.c revision 307142
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

/* Portions Copyright 2007 Jeremy Teo */
/* Portions Copyright 2010 Robert Milkowski */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/vfs.h>
#include <sys/vm.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/kmem.h>
#include <sys/taskq.h>
#include <sys/uio.h>
#include <sys/atomic.h>
#include <sys/namei.h>
#include <sys/mman.h>
#include <sys/cmn_err.h>
#include <sys/errno.h>
#include <sys/unistd.h>
#include <sys/zfs_dir.h>
#include <sys/zfs_ioctl.h>
#include <sys/fs/zfs.h>
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/sa.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/sunddi.h>
#include <sys/filio.h>
#include <sys/sid.h>
#include <sys/zfs_ctldir.h>
#include <sys/zfs_fuid.h>
#include <sys/zfs_sa.h>
#include <sys/zfs_rlock.h>
#include <sys/extdirent.h>
#include <sys/kidmap.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/sched.h>
#include <sys/acl.h>
#include <vm/vm_param.h>

/*
 * Programming rules.
 *
 * Each vnode op performs some logical unit of work.  To do this, the ZPL must
 * properly lock its in-core state, create a DMU transaction, do the work,
 * record this work in the intent log (ZIL), commit the DMU transaction,
 * and wait for the intent log to commit if it is a synchronous operation.
 * Moreover, the vnode ops must work in both normal and log replay context.
 * The ordering of events is important to avoid deadlocks and references
 * to freed memory.  The example below illustrates the following Big Rules:
 *
 *  (1)	A check must be made in each zfs thread for a mounted file system.
 *	This is done in a race-free way using ZFS_ENTER(zfsvfs).
 *	A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
 *	must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
 *	can cause the calling function to return EIO.
 *
 *  (2)	VN_RELE() should always be the last thing except for zil_commit()
 *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
 *	First, if it's the last reference, the vnode/znode
 *	can be freed, so the zp may point to freed memory.  Second, the last
 *	reference will call zfs_zinactive(), which may induce a lot of work --
 *	pushing cached pages (which acquires range locks) and syncing out
 *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
 *	which could deadlock the system if you were already holding one.
 *	If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
 *
 *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
 *	as they can span dmu_tx_assign() calls.
 *
 *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
 *      dmu_tx_assign().  This is critical because we don't want to block
 *      while holding locks.
 *
 *	If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT.  This
 *	reduces lock contention and CPU usage when we must wait (note that if
 *	throughput is constrained by the storage, nearly every transaction
 *	must wait).
 *
 *      Note, in particular, that if a lock is sometimes acquired before
 *      the tx assigns, and sometimes after (e.g. z_lock), then failing
 *      to use a non-blocking assign can deadlock the system.  The scenario:
 *
 *	Thread A has grabbed a lock before calling dmu_tx_assign().
 *	Thread B is in an already-assigned tx, and blocks for this lock.
 *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
 *	forever, because the previous txg can't quiesce until B's tx commits.
 *
 *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
 *	then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
 *	calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT,
 *	to indicate that this operation has already called dmu_tx_wait().
 *	This will ensure that we don't retry forever, waiting a short bit
 *	each time.
 *
 *  (5)	If the operation succeeded, generate the intent log entry for it
 *	before dropping locks.  This ensures that the ordering of events
 *	in the intent log matches the order in which they actually occurred.
 *	During ZIL replay the zfs_log_* functions will update the sequence
 *	number to indicate the zil transaction has replayed.
 *
 *  (6)	At the end of each vnode op, the DMU tx must always commit,
 *	regardless of whether there were any errors.
 *
 *  (7)	After dropping all locks, invoke zil_commit(zilog, foid)
 *	to ensure that synchronous semantics are provided when necessary.
 *
 * In general, this is how things should be ordered in each vnode op:
 *
 *	ZFS_ENTER(zfsvfs);		// exit if unmounted
 * top:
 *	zfs_dirent_lookup(&dl, ...)	// lock directory entry (may VN_HOLD())
 *	rw_enter(...);			// grab any other locks you need
 *	tx = dmu_tx_create(...);	// get DMU tx
 *	dmu_tx_hold_*();		// hold each object you might modify
 *	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
 *	if (error) {
 *		rw_exit(...);		// drop locks
 *		zfs_dirent_unlock(dl);	// unlock directory entry
 *		VN_RELE(...);		// release held vnodes
 *		if (error == ERESTART) {
 *			waited = B_TRUE;
 *			dmu_tx_wait(tx);
 *			dmu_tx_abort(tx);
 *			goto top;
 *		}
 *		dmu_tx_abort(tx);	// abort DMU tx
 *		ZFS_EXIT(zfsvfs);	// finished in zfs
 *		return (error);		// really out of space
 *	}
 *	error = do_real_work();		// do whatever this VOP does
 *	if (error == 0)
 *		zfs_log_*(...);		// on success, make ZIL entry
 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
 *	rw_exit(...);			// drop locks
 *	zfs_dirent_unlock(dl);		// unlock directory entry
 *	VN_RELE(...);			// release held vnodes
 *	zil_commit(zilog, foid);	// synchronous when necessary
 *	ZFS_EXIT(zfsvfs);		// finished in zfs
 *	return (error);			// done, report error
 */

/* ARGSUSED */
static int
zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(*vpp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
	    ((flag & FAPPEND) == 0)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
		if (fs_vscan(*vpp, cr, 0) != 0) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EACCES));
		}
	}

	/* Keep a count of the synchronous opens in the znode */
	if (flag & (FSYNC | FDSYNC))
		atomic_inc_32(&zp->z_sync_cnt);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/* ARGSUSED */
static int
zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
    caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	/*
	 * Clean up any locks held by this process on the vp.
	 */
	cleanlocks(vp, ddi_get_pid(), 0);
	cleanshares(vp, ddi_get_pid());

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Decrement the synchronous opens in the znode */
	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
		atomic_dec_32(&zp->z_sync_cnt);

	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
		VERIFY(fs_vscan(vp, cr, 1) == 0);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
 */
static int
zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
{
	znode_t	*zp = VTOZ(vp);
	uint64_t noff = (uint64_t)*off; /* new offset */
	uint64_t file_sz;
	int error;
	boolean_t hole;

	file_sz = zp->z_size;
	if (noff >= file_sz)  {
		return (SET_ERROR(ENXIO));
	}

	if (cmd == _FIO_SEEK_HOLE)
		hole = B_TRUE;
	else
		hole = B_FALSE;

	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);

	if (error == ESRCH)
		return (SET_ERROR(ENXIO));

	/*
	 * We could find a hole that begins after the logical end-of-file,
	 * because dmu_offset_next() only works on whole blocks.  If the
	 * EOF falls mid-block, then indicate that the "virtual hole"
	 * at the end of the file begins at the logical EOF, rather than
	 * at the end of the last block.
	 */
	if (noff > file_sz) {
		ASSERT(hole);
		noff = file_sz;
	}

	if (noff < *off)
		return (error);
	*off = noff;
	return (error);
}
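
/*
 * A hedged userland sketch (illustrative, not from this file): on systems
 * where lseek(2)'s SEEK_HOLE/SEEK_DATA are backed by these ioctls, a
 * sparse-file scanner might walk a file's extents like this, with fd and
 * error handling assumed:
 *
 *	off_t data = lseek(fd, 0, SEEK_DATA);	 // first data at or after 0
 *	off_t hole = lseek(fd, data, SEEK_HOLE); // hole ending that extent
 *	// [data, hole) holds data; repeat from 'hole' until ENXIO.
 */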

/* ARGSUSED */
static int
zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
    int *rvalp, caller_context_t *ct)
{
	offset_t off;
	offset_t ndata;
	dmu_object_info_t doi;
	int error;
	zfsvfs_t *zfsvfs;
	znode_t *zp;

	switch (com) {
	case _FIOFFS:
	{
		return (0);

		/*
		 * The following two ioctls are used by bfu.  Faking them out
		 * is necessary to avoid bfu errors.
		 */
	}
	case _FIOGDIO:
	case _FIOSDIO:
	{
		return (0);
	}

	case _FIO_SEEK_DATA:
	case _FIO_SEEK_HOLE:
	{
#ifdef illumos
		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
			return (SET_ERROR(EFAULT));
#else
		off = *(offset_t *)data;
#endif
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/* offset parameter is in/out */
		error = zfs_holey(vp, com, &off);
		ZFS_EXIT(zfsvfs);
		if (error)
			return (error);
#ifdef illumos
		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
			return (SET_ERROR(EFAULT));
#else
		*(offset_t *)data = off;
#endif
		return (0);
	}
#ifdef illumos
	case _FIO_COUNT_FILLED:
	{
		/*
		 * _FIO_COUNT_FILLED adds a new ioctl command which
		 * exposes the number of filled blocks in a
		 * ZFS object.
		 */
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/*
		 * Wait for all dirty blocks for this object
		 * to get synced out to disk, and the DMU info
		 * updated.
		 */
		error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Retrieve fill count from DMU object.
		 */
		error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		ndata = doi.doi_fill_count;

		ZFS_EXIT(zfsvfs);
		if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
			return (SET_ERROR(EFAULT));
		return (0);
	}
#endif
	}
	return (SET_ERROR(ENOTTY));
}

static vm_page_t
page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
{
	vm_object_t obj;
	vm_page_t pp;
	int64_t end;

	/*
	 * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
	 * aligned boundaries, if the range is not aligned.  As a result a
	 * DEV_BSIZE subrange with partially dirty data may get marked as clean.
	 * It may happen that all DEV_BSIZE subranges are marked clean and thus
	 * the whole page would be considered clean despite having some dirty
	 * data.  For this reason we should shrink the range to DEV_BSIZE
	 * aligned boundaries before calling vm_page_clear_dirty.
	 */
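	/*
	 * Illustrative arithmetic (hypothetical values): with DEV_BSIZE = 512,
	 * off = 300 and nbytes = 1000 cover the byte range [300, 1300).  The
	 * largest fully covered 512-byte subrange is [512, 1024), so off
	 * becomes 512, end becomes 1024, and nbytes becomes 512; partially
	 * covered subranges keep their dirty bits.
	 */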
	end = rounddown2(off + nbytes, DEV_BSIZE);
	off = roundup2(off, DEV_BSIZE);
	nbytes = end - off;

	obj = vp->v_object;
	zfs_vmobject_assert_wlocked(obj);

	for (;;) {
		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
		    pp->valid) {
			if (vm_page_xbusied(pp)) {
				/*
				 * Reference the page before unlocking and
				 * sleeping so that the page daemon is less
				 * likely to reclaim it.
				 */
				vm_page_reference(pp);
				vm_page_lock(pp);
				zfs_vmobject_wunlock(obj);
				vm_page_busy_sleep(pp, "zfsmwb");
				zfs_vmobject_wlock(obj);
				continue;
			}
			vm_page_sbusy(pp);
		} else if (pp == NULL) {
			pp = vm_page_alloc(obj, OFF_TO_IDX(start),
			    VM_ALLOC_SYSTEM | VM_ALLOC_IFCACHED |
			    VM_ALLOC_SBUSY);
		} else {
			ASSERT(pp != NULL && !pp->valid);
			pp = NULL;
		}

		if (pp != NULL) {
			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
			vm_object_pip_add(obj, 1);
			pmap_remove_write(pp);
			if (nbytes != 0)
				vm_page_clear_dirty(pp, off, nbytes);
		}
		break;
	}
	return (pp);
}

static void
page_unbusy(vm_page_t pp)
{

	vm_page_sunbusy(pp);
	vm_object_pip_subtract(pp->object, 1);
}

static vm_page_t
page_hold(vnode_t *vp, int64_t start)
{
	vm_object_t obj;
	vm_page_t pp;

	obj = vp->v_object;
	zfs_vmobject_assert_wlocked(obj);

	for (;;) {
		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
		    pp->valid) {
			if (vm_page_xbusied(pp)) {
				/*
				 * Reference the page before unlocking and
				 * sleeping so that the page daemon is less
				 * likely to reclaim it.
				 */
				vm_page_reference(pp);
				vm_page_lock(pp);
				zfs_vmobject_wunlock(obj);
				vm_page_busy_sleep(pp, "zfsmwb");
				zfs_vmobject_wlock(obj);
				continue;
			}

			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
			vm_page_lock(pp);
			vm_page_hold(pp);
			vm_page_unlock(pp);

		} else
			pp = NULL;
		break;
	}
	return (pp);
}

static void
page_unhold(vm_page_t pp)
{

	vm_page_lock(pp);
	vm_page_unhold(pp);
	vm_page_unlock(pp);
}

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Write:	If we find a memory mapped page, we write to *both*
 *		the page and the dmu buffer.
 */
static void
update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
    int segflg, dmu_tx_t *tx)
{
	vm_object_t obj;
	struct sf_buf *sf;
	caddr_t va;
	int off;

	ASSERT(segflg != UIO_NOCOPY);
	ASSERT(vp->v_mount != NULL);
	obj = vp->v_object;
	ASSERT(obj != NULL);

	off = start & PAGEOFFSET;
	zfs_vmobject_wlock(obj);
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		vm_page_t pp;
		int nbytes = imin(PAGESIZE - off, len);

		if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
			zfs_vmobject_wunlock(obj);

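			/*
			 * The DMU buffer already holds the newly written
			 * bytes, so copy them back into the mapped page to
			 * keep the two views of the file coherent.
			 */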
			va = zfs_map_page(pp, &sf);
			(void) dmu_read(os, oid, start+off, nbytes,
			    va+off, DMU_READ_PREFETCH);
			zfs_unmap_page(sf);

			zfs_vmobject_wlock(obj);
			page_unbusy(pp);
		}
		len -= nbytes;
		off = 0;
	}
	vm_object_pip_wakeupn(obj, 0);
	zfs_vmobject_wunlock(obj);
}

/*
 * A read with the UIO_NOCOPY flag means that sendfile(2) is requesting
 * that ZFS populate a range of page cache pages with data.
 *
 * NOTE: this function could be optimized to pre-allocate
 * all pages in advance, drain exclusive busy on all of them,
 * map them into a contiguous KVA region and populate them
 * in a single dmu_read() call.
 */
static int
mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
{
	znode_t *zp = VTOZ(vp);
	objset_t *os = zp->z_zfsvfs->z_os;
	struct sf_buf *sf;
	vm_object_t obj;
	vm_page_t pp;
	int64_t start;
	caddr_t va;
	int len = nbytes;
	int off;
	int error = 0;

	ASSERT(uio->uio_segflg == UIO_NOCOPY);
	ASSERT(vp->v_mount != NULL);
	obj = vp->v_object;
	ASSERT(obj != NULL);
	ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);

	zfs_vmobject_wlock(obj);
	for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
		int bytes = MIN(PAGESIZE, len);

		pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY |
		    VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
		if (pp->valid == 0) {
			zfs_vmobject_wunlock(obj);
			va = zfs_map_page(pp, &sf);
			error = dmu_read(os, zp->z_id, start, bytes, va,
			    DMU_READ_PREFETCH);
			if (bytes != PAGESIZE && error == 0)
				bzero(va + bytes, PAGESIZE - bytes);
			zfs_unmap_page(sf);
			zfs_vmobject_wlock(obj);
			vm_page_sunbusy(pp);
			vm_page_lock(pp);
			if (error) {
				if (pp->wire_count == 0 && pp->valid == 0 &&
				    !vm_page_busied(pp))
					vm_page_free(pp);
			} else {
				pp->valid = VM_PAGE_BITS_ALL;
				vm_page_activate(pp);
			}
			vm_page_unlock(pp);
		} else {
			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
			vm_page_sunbusy(pp);
		}
		if (error)
			break;
		uio->uio_resid -= bytes;
		uio->uio_offset += bytes;
		len -= bytes;
	}
	zfs_vmobject_wunlock(obj);
	return (error);
}

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Read:	We "read" preferentially from memory mapped pages,
 *		otherwise we fall back to the dmu buffer.
 *
 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 *	 the file is memory mapped.
 */
static int
mappedread(vnode_t *vp, int nbytes, uio_t *uio)
{
	znode_t *zp = VTOZ(vp);
	vm_object_t obj;
	int64_t start;
	caddr_t va;
	int len = nbytes;
	int off;
	int error = 0;

	ASSERT(vp->v_mount != NULL);
	obj = vp->v_object;
	ASSERT(obj != NULL);

	start = uio->uio_loffset;
	off = start & PAGEOFFSET;
	zfs_vmobject_wlock(obj);
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		vm_page_t pp;
		uint64_t bytes = MIN(PAGESIZE - off, len);

		if (pp = page_hold(vp, start)) {
			struct sf_buf *sf;
			caddr_t va;

			zfs_vmobject_wunlock(obj);
			va = zfs_map_page(pp, &sf);
#ifdef illumos
			error = uiomove(va + off, bytes, UIO_READ, uio);
#else
			error = vn_io_fault_uiomove(va + off, bytes, uio);
#endif
			zfs_unmap_page(sf);
			zfs_vmobject_wlock(obj);
			page_unhold(pp);
		} else {
			zfs_vmobject_wunlock(obj);
			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, bytes);
			zfs_vmobject_wlock(obj);
		}
		len -= bytes;
		off = 0;
		if (error)
			break;
	}
	zfs_vmobject_wunlock(obj);
	return (error);
}

offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */

/*
 * Read bytes from specified file into supplied buffer.
 *
 *	IN:	vp	- vnode of file to be read from.
 *		uio	- structure supplying read location, range info,
 *			  and return buffer.
 *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *
 *	OUT:	uio	- updated offset and range, buffer filled.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Side Effects:
 *	vp - atime updated if byte count > 0
 */
/* ARGSUSED */
static int
zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	ssize_t		n, nbytes;
	int		error = 0;
	rl_t		*rl;
	xuio_t		*xuio = NULL;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EACCES));
	}

	/*
	 * Validate file offset
	 */
	if (uio->uio_loffset < (offset_t)0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Fasttrack empty reads
	 */
	if (uio->uio_resid == 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	/*
	 * Check for mandatory locks
	 */
	if (MANDMODE(zp->z_mode)) {
		if (error = chklock(vp, FREAD,
		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * If we're in FRSYNC mode, sync out this znode before reading it.
	 */
	if (zfsvfs->z_log &&
	    (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
		zil_commit(zfsvfs->z_log, zp->z_id);

	/*
	 * Lock the range against changes.
	 */
	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);

	/*
	 * If we are reading past end-of-file we can skip
	 * to the end; but we might still need to set atime.
	 */
	if (uio->uio_loffset >= zp->z_size) {
		error = 0;
		goto out;
	}

	ASSERT(uio->uio_loffset < zp->z_size);
	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);

#ifdef illumos
	if ((uio->uio_extflg == UIO_XUIO) &&
	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
		int nblk;
		int blksz = zp->z_blksz;
		uint64_t offset = uio->uio_loffset;

		xuio = (xuio_t *)uio;
		if ((ISP2(blksz))) {
			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
			    blksz)) / blksz;
		} else {
			ASSERT(offset + n <= blksz);
			nblk = 1;
		}
		(void) dmu_xuio_init(xuio, nblk);

		if (vn_has_cached_data(vp)) {
			/*
			 * For simplicity, we always allocate a full buffer
			 * even if we only expect to read a portion of a block.
			 */
			while (--nblk >= 0) {
				(void) dmu_xuio_add(xuio,
				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
				    blksz), 0, blksz);
			}
		}
	}
#endif	/* illumos */

	while (n > 0) {
		nbytes = MIN(n, zfs_read_chunk_size -
		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
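		/*
		 * Illustrative numbers (hypothetical): with the default 1 MB
		 * chunk size and uio_loffset at 1.25 MB, P2PHASE() yields
		 * 0.25 MB, so nbytes is capped at 0.75 MB and this chunk
		 * ends exactly on a chunk boundary; later iterations then
		 * proceed in whole, aligned chunks.
		 */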

#ifdef __FreeBSD__
		if (uio->uio_segflg == UIO_NOCOPY)
			error = mappedread_sf(vp, nbytes, uio);
		else
#endif /* __FreeBSD__ */
		if (vn_has_cached_data(vp)) {
			error = mappedread(vp, nbytes, uio);
		} else {
			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes);
		}
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}

		n -= nbytes;
	}
out:
	zfs_range_unlock(rl);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Write the bytes to a file.
 *
 *	IN:	vp	- vnode of file to be written to.
 *		uio	- structure supplying write location, range info,
 *			  and data buffer.
 *		ioflag	- FAPPEND, FSYNC, and/or FDSYNC.  FAPPEND is
 *			  set if in append mode.
 *		cr	- credentials of caller.
 *		ct	- caller context (NFS/CIFS fem monitor only)
 *
 *	OUT:	uio	- updated offset and range.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - ctime|mtime updated if byte count > 0
 */

/* ARGSUSED */
static int
zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	rlim64_t	limit = MAXOFFSET_T;
	ssize_t		start_resid = uio->uio_resid;
	ssize_t		tx_bytes;
	uint64_t	end_size;
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	zilog_t		*zilog;
	offset_t	woff;
	ssize_t		n, nbytes;
	rl_t		*rl;
	int		max_blksz = zfsvfs->z_max_blksz;
	int		error = 0;
	arc_buf_t	*abuf;
	iovec_t		*aiov = NULL;
	xuio_t		*xuio = NULL;
	int		i_iov = 0;
	int		iovcnt = uio->uio_iovcnt;
	iovec_t		*iovp = uio->uio_iov;
	int		write_eof;
	int		count = 0;
	sa_bulk_attr_t	bulk[4];
	uint64_t	mtime[2], ctime[2];

	/*
	 * Fasttrack empty write
	 */
	n = start_resid;
	if (n == 0)
		return (0);

	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
		limit = MAXOFFSET_T;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, 8);

	/*
	 * In the case where vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. for
	 * snapshots), our callers might not be able to detect properly that
	 * we are read-only, so check it explicitly here.
	 */
	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EROFS));
	}

	/*
	 * If immutable, or if append-only and not appending, return EPERM.
	 */
	if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
	    (uio->uio_loffset < zp->z_size))) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	zilog = zfsvfs->z_log;

	/*
	 * Validate file offset
	 */
	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
	if (woff < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Check for mandatory locks before calling zfs_range_lock()
	 * in order to prevent a deadlock with locks set via fcntl().
	 */
	if (MANDMODE((mode_t)zp->z_mode) &&
	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

#ifdef illumos
	/*
	 * Pre-fault the pages to ensure slow (e.g. NFS) pages
	 * don't hold up txg.
	 * Skip this if uio contains loaned arc_buf.
	 */
	if ((uio->uio_extflg == UIO_XUIO) &&
	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
		xuio = (xuio_t *)uio;
	else
		uio_prefaultpages(MIN(n, max_blksz), uio);
#endif

	/*
	 * If in append mode, set the io offset pointer to eof.
	 */
	if (ioflag & FAPPEND) {
		/*
		 * Obtain an appending range lock to guarantee file append
		 * semantics.  We reset the write offset once we have the lock.
		 */
		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
		woff = rl->r_off;
		if (rl->r_len == UINT64_MAX) {
			/*
			 * We overlocked the file because this write will cause
			 * the file block size to increase.
			 * Note that zp_size cannot change with this lock held.
			 */
			woff = zp->z_size;
		}
		uio->uio_loffset = woff;
	} else {
		/*
		 * Note that if the file block size will change as a result of
		 * this write, then this range lock will lock the entire file
		 * so that we can re-write the block safely.
		 */
		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
	}

	if (vn_rlimit_fsize(vp, uio, uio->uio_td)) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (EFBIG);
	}

	if (woff >= limit) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EFBIG));
	}

	if ((woff + n) > limit || woff > (limit - n))
		n = limit - woff;

	/* Will this write extend the file length? */
	write_eof = (woff + n > zp->z_size);

	end_size = MAX(zp->z_size, woff + n);

	/*
	 * Write the file in reasonable size chunks.  Each chunk is written
	 * in a separate transaction; this keeps the intent log records small
	 * and allows us to do more fine-grained space accounting.
	 */
	while (n > 0) {
		abuf = NULL;
		woff = uio->uio_loffset;
		if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
		    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
			if (abuf != NULL)
				dmu_return_arcbuf(abuf);
			error = SET_ERROR(EDQUOT);
			break;
		}

		if (xuio && abuf == NULL) {
			ASSERT(i_iov < iovcnt);
			aiov = &iovp[i_iov];
			abuf = dmu_xuio_arcbuf(xuio, i_iov);
			dmu_xuio_clear(xuio, i_iov);
			DTRACE_PROBE3(zfs_cp_write, int, i_iov,
			    iovec_t *, aiov, arc_buf_t *, abuf);
			ASSERT((aiov->iov_base == abuf->b_data) ||
			    ((char *)aiov->iov_base - (char *)abuf->b_data +
			    aiov->iov_len == arc_buf_size(abuf)));
			i_iov++;
		} else if (abuf == NULL && n >= max_blksz &&
		    woff >= zp->z_size &&
		    P2PHASE(woff, max_blksz) == 0 &&
		    zp->z_blksz == max_blksz) {
			/*
			 * This write covers a full block.  "Borrow" a buffer
			 * from the dmu so that we can fill it before we enter
			 * a transaction.  This avoids the possibility of
			 * holding up the transaction if the data copy hangs
			 * up on a pagefault (e.g., from an NFS server mapping).
			 */
#ifdef illumos
			size_t cbytes;
#endif

			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
			    max_blksz);
			ASSERT(abuf != NULL);
			ASSERT(arc_buf_size(abuf) == max_blksz);
#ifdef illumos
			if (error = uiocopy(abuf->b_data, max_blksz,
			    UIO_WRITE, uio, &cbytes)) {
				dmu_return_arcbuf(abuf);
				break;
			}
			ASSERT(cbytes == max_blksz);
#else
			ssize_t resid = uio->uio_resid;
			error = vn_io_fault_uiomove(abuf->b_data, max_blksz, uio);
			if (error != 0) {
				uio->uio_offset -= resid - uio->uio_resid;
				uio->uio_resid = resid;
				dmu_return_arcbuf(abuf);
				break;
			}
#endif
		}

		/*
		 * Start a transaction.
		 */
		tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
		zfs_sa_upgrade_txholds(tx, zp);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			if (abuf != NULL)
				dmu_return_arcbuf(abuf);
			break;
		}

		/*
		 * If zfs_range_lock() over-locked we grow the blocksize
		 * and then reduce the lock range.  This will only happen
		 * on the first iteration since zfs_range_reduce() will
		 * shrink down r_len to the appropriate size.
		 */
		if (rl->r_len == UINT64_MAX) {
			uint64_t new_blksz;

			if (zp->z_blksz > max_blksz) {
				/*
				 * File's blocksize is already larger than the
				 * "recordsize" property.  Only let it grow to
				 * the next power of 2.
				 */
				ASSERT(!ISP2(zp->z_blksz));
				new_blksz = MIN(end_size,
				    1 << highbit64(zp->z_blksz));
			} else {
				new_blksz = MIN(end_size, max_blksz);
			}
			zfs_grow_blocksize(zp, new_blksz, tx);
			zfs_range_reduce(rl, woff, n);
		}
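		/*
		 * Illustrative numbers (hypothetical, assuming highbit64()
		 * numbers bits from 1): if z_blksz were a non-power-of-2
		 * value such as 96KB, highbit64() returns 17, so the block
		 * may grow to 1 << 17 = 128KB, subject to the MIN() with
		 * end_size above.
		 */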

		/*
		 * XXX - should we really limit each write to z_max_blksz?
		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
		 */
		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));

		if (woff + nbytes > zp->z_size)
			vnode_pager_setsize(vp, woff + nbytes);

		if (abuf == NULL) {
			tx_bytes = uio->uio_resid;
			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes, tx);
			tx_bytes -= uio->uio_resid;
		} else {
			tx_bytes = nbytes;
			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
			/*
			 * If this is not a full block write, but we are
			 * extending the file past EOF and this data starts
			 * block-aligned, use assign_arcbuf().  Otherwise,
			 * write via dmu_write().
			 */
			if (tx_bytes < max_blksz && (!write_eof ||
			    aiov->iov_base != abuf->b_data)) {
				ASSERT(xuio);
				dmu_write(zfsvfs->z_os, zp->z_id, woff,
				    aiov->iov_len, aiov->iov_base, tx);
				dmu_return_arcbuf(abuf);
				xuio_stat_wbuf_copied();
			} else {
				ASSERT(xuio || tx_bytes == max_blksz);
				dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
				    woff, abuf, tx);
			}
#ifdef illumos
			ASSERT(tx_bytes <= uio->uio_resid);
			uioskip(uio, tx_bytes);
#endif
		}
		if (tx_bytes && vn_has_cached_data(vp)) {
			update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
			    zp->z_id, uio->uio_segflg, tx);
		}

		/*
		 * If we made no progress, we're done.  If we made even
		 * partial progress, update the znode and ZIL accordingly.
		 */
		if (tx_bytes == 0) {
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
			    (void *)&zp->z_size, sizeof (uint64_t), tx);
			dmu_tx_commit(tx);
			ASSERT(error != 0);
			break;
		}

		/*
		 * Clear Set-UID/Set-GID bits on successful write if not
		 * privileged and at least one of the execute bits is set.
		 *
		 * It would be nice to do this after all writes have
		 * been done, but that would still expose the ISUID/ISGID
		 * to another app after the partial write is committed.
		 *
		 * Note: we don't call zfs_fuid_map_id() here because
		 * user 0 is not an ephemeral uid.
		 */
		mutex_enter(&zp->z_acl_lock);
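		/* (S_IXUSR >> 3) is S_IXGRP and (S_IXUSR >> 6) is S_IXOTH. */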
		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
		    (S_IXUSR >> 6))) != 0 &&
		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
		    secpolicy_vnode_setid_retain(vp, cr,
		    (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
			uint64_t newmode;
			zp->z_mode &= ~(S_ISUID | S_ISGID);
			newmode = zp->z_mode;
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
			    (void *)&newmode, sizeof (uint64_t), tx);
		}
		mutex_exit(&zp->z_acl_lock);

		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
		    B_TRUE);

		/*
		 * Update the file size (zp_size) if it has changed;
		 * account for possible concurrent updates.
		 */
		while ((end_size = zp->z_size) < uio->uio_loffset) {
			(void) atomic_cas_64(&zp->z_size, end_size,
			    uio->uio_loffset);
#ifdef illumos
			ASSERT(error == 0);
#else
			ASSERT(error == 0 || error == EFAULT);
#endif
		}
		/*
		 * If we are replaying and eof is non-zero then force
		 * the file size to the specified eof. Note, there's no
		 * concurrency during replay.
		 */
		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
			zp->z_size = zfsvfs->z_replay_eof;

		if (error == 0)
			error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
		else
			(void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);

		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
		dmu_tx_commit(tx);

		if (error != 0)
			break;
		ASSERT(tx_bytes == nbytes);
		n -= nbytes;

#ifdef illumos
		if (!xuio && n > 0)
			uio_prefaultpages(MIN(n, max_blksz), uio);
#endif
	}

	zfs_range_unlock(rl);

	/*
	 * If we're in replay mode, or we made no progress, return error.
	 * Otherwise, it's at least a partial write, so it's successful.
	 */
	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

#ifdef __FreeBSD__
	/*
	 * EFAULT means that at least one page of the source buffer was not
	 * available.  VFS will re-try remaining I/O upon this error.
	 */
	if (error == EFAULT) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}
#endif

	if (ioflag & (FSYNC | FDSYNC) ||
	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, zp->z_id);

	ZFS_EXIT(zfsvfs);
	return (0);
}

void
zfs_get_done(zgd_t *zgd, int error)
{
	znode_t *zp = zgd->zgd_private;
	objset_t *os = zp->z_zfsvfs->z_os;

	if (zgd->zgd_db)
		dmu_buf_rele(zgd->zgd_db, zgd);

	zfs_range_unlock(zgd->zgd_rl);

	/*
	 * Release the vnode asynchronously as we currently have the
	 * txg stopped from syncing.
	 */
	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));

	if (error == 0 && zgd->zgd_bp)
		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);

	kmem_free(zgd, sizeof (zgd_t));
}

#ifdef DEBUG
static int zil_fault_io = 0;
#endif

/*
 * Get data to generate a TX_WRITE intent log record.
 */
int
zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
	zfsvfs_t *zfsvfs = arg;
	objset_t *os = zfsvfs->z_os;
	znode_t *zp;
	uint64_t object = lr->lr_foid;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;
	blkptr_t *bp = &lr->lr_blkptr;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error = 0;

	ASSERT(zio != NULL);
	ASSERT(size != 0);

	/*
	 * Nothing to do if the file has been removed
	 */
	if (zfs_zget(zfsvfs, object, &zp) != 0)
		return (SET_ERROR(ENOENT));
	if (zp->z_unlinked) {
		/*
		 * Release the vnode asynchronously as we currently have the
		 * txg stopped from syncing.
		 */
		VN_RELE_ASYNC(ZTOV(zp),
		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
		return (SET_ERROR(ENOENT));
	}

	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
	zgd->zgd_zilog = zfsvfs->z_log;
	zgd->zgd_private = zp;

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) { /* immediate write */
		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
		/* test for truncation needs to be done while range locked */
		if (offset >= zp->z_size) {
			error = SET_ERROR(ENOENT);
		} else {
			error = dmu_read(os, object, offset, size, buf,
			    DMU_READ_NO_PREFETCH);
		}
		ASSERT(error == 0 || error == ENOENT);
	} else { /* indirect write */
		/*
		 * Have to lock the whole block to ensure that when it's
		 * written out and its checksum is being calculated
		 * no one can change the data. We need to re-check the
		 * blocksize after we get the lock in case it has changed!
		 */
		for (;;) {
			uint64_t blkoff;
			size = zp->z_blksz;
			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
			offset -= blkoff;
			zgd->zgd_rl = zfs_range_lock(zp, offset, size,
			    RL_READER);
			if (zp->z_blksz == size)
				break;
			offset += blkoff;
			zfs_range_unlock(zgd->zgd_rl);
		}
		/* test for truncation needs to be done while range locked */
		if (lr->lr_offset >= zp->z_size)
			error = SET_ERROR(ENOENT);
#ifdef DEBUG
		if (zil_fault_io) {
			error = SET_ERROR(EIO);
			zil_fault_io = 0;
		}
#endif
		if (error == 0)
			error = dmu_buf_hold(os, object, offset, zgd, &db,
			    DMU_READ_NO_PREFETCH);

		if (error == 0) {
			blkptr_t *obp = dmu_buf_get_blkptr(db);
			if (obp) {
				ASSERT(BP_IS_HOLE(bp));
				*bp = *obp;
			}

			zgd->zgd_db = db;
			zgd->zgd_bp = bp;

			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    zfs_get_done, zgd);
			ASSERT(error || lr->lr_length <= zp->z_blksz);

			/*
			 * On success, we need to wait for the write I/O
			 * initiated by dmu_sync() to complete before we can
			 * release this dbuf.  We will finish everything up
			 * in the zfs_get_done() callback.
			 */
			if (error == 0)
				return (0);

			if (error == EALREADY) {
				lr->lr_common.lrc_txtype = TX_WRITE2;
				error = 0;
			}
		}
	}

	zfs_get_done(zgd, error);

	return (error);
}
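
/*
 * For context, a hedged sketch of how this callback is wired up: the ZPL
 * passes zfs_get_data when it opens the intent log (illustrative of what
 * zfs_vfsops.c does, not a verbatim quote):
 *
 *	zfsvfs->z_log = zil_open(os, zfs_get_data);
 *
 * zil_commit() later invokes the callback for any TX_WRITE record whose
 * data was not stored in the log record itself.
 */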

/*ARGSUSED*/
static int
zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (flag & V_ACE_MASK)
		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
	else
		error = zfs_zaccess_rwx(zp, mode, flag, cr);

	ZFS_EXIT(zfsvfs);
	return (error);
}

static int
zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
{
	int error;

	*vpp = arg;
	error = vn_lock(*vpp, lkflags);
	if (error != 0)
		vrele(*vpp);
	return (error);
}

static int
zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
{
	znode_t *zdp = VTOZ(dvp);
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
	int error;
	int ltype;

	ASSERT_VOP_LOCKED(dvp, __func__);
#ifdef DIAGNOSTIC
	if ((zdp->z_pflags & ZFS_XATTR) == 0)
		VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
#endif

	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
		ASSERT3P(dvp, ==, vp);
		vref(dvp);
		ltype = lkflags & LK_TYPE_MASK;
		if (ltype != VOP_ISLOCKED(dvp)) {
			if (ltype == LK_EXCLUSIVE)
				vn_lock(dvp, LK_UPGRADE | LK_RETRY);
			else /* if (ltype == LK_SHARED) */
				vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);

			/*
			 * Relocking for the "." case could leave us with
			 * a reclaimed vnode.
			 */
			if (dvp->v_iflag & VI_DOOMED) {
				vrele(dvp);
				return (SET_ERROR(ENOENT));
			}
		}
		return (0);
	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
		/*
		 * Note that in this case, dvp is the child vnode, and we
		 * are looking up the parent vnode - exactly reverse from
		 * normal operation.  Unlocking dvp requires some rather
		 * tricky unlock/relock dance to prevent mp from being freed;
		 * use vn_vget_ino_gen() which takes care of all that.
		 *
		 * XXX Note that there is a time window when both vnodes are
		 * unlocked.  It is possible, although highly unlikely, that
		 * during that window the parent-child relationship between
		 * the vnodes may change, for example, get reversed.
		 * In that case we would have a wrong lock order for the vnodes.
		 * All other filesystems seem to ignore this problem, so we
		 * do the same here.
		 * A potential solution could be implemented as follows:
		 * - using LK_NOWAIT when locking the second vnode and retrying
		 *   if necessary
		 * - checking that the parent-child relationship still holds
		 *   after locking both vnodes and retrying if it doesn't
		 */
		error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
		return (error);
	} else {
		error = vn_lock(vp, lkflags);
		if (error != 0)
			vrele(vp);
		return (error);
	}
}

/*
 * Lookup an entry in a directory, or an extended attribute directory.
 * If it exists, return a held vnode reference for it.
 *
 *	IN:	dvp	- vnode of directory to search.
 *		nm	- name of entry to lookup.
 *		pnp	- full pathname to lookup [UNUSED].
 *		flags	- LOOKUP_XATTR set if looking for an attribute.
 *		rdir	- root directory vnode [UNUSED].
 *		cr	- credentials of caller.
 *		ct	- caller context
 *
 *	OUT:	vpp	- vnode of located entry, NULL if not found.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	NA
 */
/* ARGSUSED */
static int
zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
    int nameiop, cred_t *cr, kthread_t *td, int flags)
{
	znode_t *zdp = VTOZ(dvp);
	znode_t *zp;
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
	int	error = 0;

	/* fast path (should be redundant with vfs namecache) */
	if (!(flags & LOOKUP_XATTR)) {
		if (dvp->v_type != VDIR) {
			return (SET_ERROR(ENOTDIR));
		} else if (zdp->z_sa_hdl == NULL) {
			return (SET_ERROR(EIO));
		}
	}

	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zdp);

	*vpp = NULL;

	if (flags & LOOKUP_XATTR) {
#ifdef TODO
		/*
		 * If the xattr property is off, refuse the lookup request.
		 */
		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}
#endif

		/*
		 * We don't allow recursive attributes.
		 * Maybe someday we will.
		 */
		if (zdp->z_pflags & ZFS_XATTR) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}

		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Do we have permission to get into attribute directory?
		 */
		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
		    B_FALSE, cr)) {
			vrele(*vpp);
			*vpp = NULL;
		}

		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Check accessibility of directory.
	 */
	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}


	/*
	 * First handle the special cases.
	 */
	if ((cnp->cn_flags & ISDOTDOT) != 0) {
		/*
		 * If we are a snapshot mounted under .zfs, return
		 * the vp for the snapshot directory.
		 */
		if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
			error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
			    "snapshot", vpp, NULL, 0, NULL, kcred,
			    NULL, NULL, NULL);
			ZFS_EXIT(zfsvfs);
			if (error == 0) {
				error = zfs_lookup_lock(dvp, *vpp, nm,
				    cnp->cn_lkflags);
			}
			goto out;
		}
	}
	if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
		error = 0;
		if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
			error = SET_ERROR(ENOTSUP);
		else
			*vpp = zfsctl_root(zdp);
		ZFS_EXIT(zfsvfs);
		if (error == 0)
			error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
		goto out;
	}

	/*
	 * The loop retries the lookup if the parent-child relationship
	 * changes during the dot-dot locking complexities.
	 */
	for (;;) {
		uint64_t parent;

		error = zfs_dirlook(zdp, nm, &zp);
		if (error == 0)
			*vpp = ZTOV(zp);

		ZFS_EXIT(zfsvfs);
		if (error != 0)
			break;

		error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
		if (error != 0) {
			/*
			 * If we've got a locking error, then the vnode
			 * got reclaimed because of a force unmount.
			 * We never enter doomed vnodes into the name cache.
			 */
			*vpp = NULL;
			return (error);
		}

		if ((cnp->cn_flags & ISDOTDOT) == 0)
			break;

		ZFS_ENTER(zfsvfs);
		if (zdp->z_sa_hdl == NULL) {
			error = SET_ERROR(EIO);
		} else {
			error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
			    &parent, sizeof (parent));
		}
		if (error != 0) {
			ZFS_EXIT(zfsvfs);
			vput(ZTOV(zp));
			break;
		}
		if (zp->z_id == parent) {
			ZFS_EXIT(zfsvfs);
			break;
		}
		vput(ZTOV(zp));
	}

out:
	if (error != 0)
		*vpp = NULL;

	/* Translate errors and add SAVENAME when needed. */
	if (cnp->cn_flags & ISLASTCN) {
		switch (nameiop) {
		case CREATE:
		case RENAME:
			if (error == ENOENT) {
				error = EJUSTRETURN;
				cnp->cn_flags |= SAVENAME;
				break;
			}
			/* FALLTHROUGH */
		case DELETE:
			if (error == 0)
				cnp->cn_flags |= SAVENAME;
			break;
		}
	}

	/* Insert name into cache (as non-existent) if appropriate. */
	if (zfsvfs->z_use_namecache &&
	    error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
		cache_enter(dvp, NULL, cnp);

	/* Insert name into cache if appropriate. */
	if (zfsvfs->z_use_namecache &&
	    error == 0 && (cnp->cn_flags & MAKEENTRY)) {
		if (!(cnp->cn_flags & ISLASTCN) ||
		    (nameiop != DELETE && nameiop != RENAME)) {
			cache_enter(dvp, *vpp, cnp);
		}
	}

	return (error);
}
1728
1729/*
1730 * Attempt to create a new entry in a directory.  If the entry
1731 * already exists, truncate the file if permissible, else return
1732 * an error.  Return the vp of the created or trunc'd file.
1733 *
1734 *	IN:	dvp	- vnode of directory to put new file entry in.
1735 *		name	- name of new file entry.
1736 *		vap	- attributes of new file.
1737 *		excl	- flag indicating exclusive or non-exclusive mode.
1738 *		mode	- mode to open file with.
1739 *		cr	- credentials of caller.
1740 *		flag	- large file flag [UNUSED].
1741 *		ct	- caller context
1742 *		vsecp	- ACL to be set
1743 *
1744 *	OUT:	vpp	- vnode of created or trunc'd entry.
1745 *
1746 *	RETURN:	0 on success, error code on failure.
1747 *
1748 * Timestamps:
1749 *	dvp - ctime|mtime updated if new entry created
1750 *	 vp - ctime|mtime always, atime if new
1751 */
1752
1753/* ARGSUSED */
1754static int
1755zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
1756    vnode_t **vpp, cred_t *cr, kthread_t *td)
1757{
1758	znode_t		*zp, *dzp = VTOZ(dvp);
1759	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1760	zilog_t		*zilog;
1761	objset_t	*os;
1762	dmu_tx_t	*tx;
1763	int		error;
1764	ksid_t		*ksid;
1765	uid_t		uid;
1766	gid_t		gid = crgetgid(cr);
1767	zfs_acl_ids_t   acl_ids;
1768	boolean_t	fuid_dirtied;
1769	void		*vsecp = NULL;
1770	int		flag = 0;
1771	uint64_t	txtype;
1772
1773	/*
1774	 * If we have an ephemeral id, ACL, or XVATTR then
1775	 * make sure file system is at proper version
1776	 */
1777
1778	ksid = crgetsid(cr, KSID_OWNER);
1779	if (ksid)
1780		uid = ksid_getid(ksid);
1781	else
1782		uid = crgetuid(cr);
1783
1784	if (zfsvfs->z_use_fuids == B_FALSE &&
1785	    (vsecp || (vap->va_mask & AT_XVATTR) ||
1786	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1787		return (SET_ERROR(EINVAL));
1788
1789	ZFS_ENTER(zfsvfs);
1790	ZFS_VERIFY_ZP(dzp);
1791	os = zfsvfs->z_os;
1792	zilog = zfsvfs->z_log;
1793
1794	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
1795	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1796		ZFS_EXIT(zfsvfs);
1797		return (SET_ERROR(EILSEQ));
1798	}
1799
1800	if (vap->va_mask & AT_XVATTR) {
1801		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
1802		    crgetuid(cr), cr, vap->va_type)) != 0) {
1803			ZFS_EXIT(zfsvfs);
1804			return (error);
1805		}
1806	}
1807
1808	*vpp = NULL;
1809
1810	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
1811		vap->va_mode &= ~S_ISVTX;
1812
1813	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
1814	if (error) {
1815		ZFS_EXIT(zfsvfs);
1816		return (error);
1817	}
1818	ASSERT3P(zp, ==, NULL);
1819
1820	/*
1821	 * Create a new file object and update the directory
1822	 * to reference it.
1823	 */
1824	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
1825		goto out;
1826	}
1827
1828	/*
1829	 * We only support the creation of regular files in
1830	 * extended attribute directories.
1831	 */
1832
1833	if ((dzp->z_pflags & ZFS_XATTR) &&
1834	    (vap->va_type != VREG)) {
1835		error = SET_ERROR(EINVAL);
1836		goto out;
1837	}
1838
1839	if ((error = zfs_acl_ids_create(dzp, 0, vap,
1840	    cr, vsecp, &acl_ids)) != 0)
1841		goto out;
1842
1843	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1844		zfs_acl_ids_free(&acl_ids);
1845		error = SET_ERROR(EDQUOT);
1846		goto out;
1847	}
1848
1849	getnewvnode_reserve(1);
1850
1851	tx = dmu_tx_create(os);
1852
1853	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1854	    ZFS_SA_BASE_ATTR_SIZE);
1855
1856	fuid_dirtied = zfsvfs->z_fuid_dirty;
1857	if (fuid_dirtied)
1858		zfs_fuid_txhold(zfsvfs, tx);
1859	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1860	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
1861	if (!zfsvfs->z_use_sa &&
1862	    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1863		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1864		    0, acl_ids.z_aclp->z_acl_bytes);
1865	}
1866	error = dmu_tx_assign(tx, TXG_WAIT);
1867	if (error) {
1868		zfs_acl_ids_free(&acl_ids);
1869		dmu_tx_abort(tx);
1870		getnewvnode_drop_reserve();
1871		ZFS_EXIT(zfsvfs);
1872		return (error);
1873	}
1874	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1875
1876	if (fuid_dirtied)
1877		zfs_fuid_sync(zfsvfs, tx);
1878
1879	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
1880	txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1881	zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1882	    vsecp, acl_ids.z_fuidp, vap);
1883	zfs_acl_ids_free(&acl_ids);
1884	dmu_tx_commit(tx);
1885
1886	getnewvnode_drop_reserve();
1887
1888out:
1889	if (error == 0) {
1890		*vpp = ZTOV(zp);
1891	}
1892
1893	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1894		zil_commit(zilog, 0);
1895
1896	ZFS_EXIT(zfsvfs);
1897	return (error);
1898}
1899
1900/*
1901 * Remove an entry from a directory.
1902 *
1903 *	IN:	dvp	- vnode of directory to remove entry from.
1904 *		name	- name of entry to remove.
1905 *		cr	- credentials of caller.
1906 *		ct	- caller context
1907 *		flags	- case flags
1908 *
1909 *	RETURN:	0 on success, error code on failure.
1910 *
1911 * Timestamps:
1912 *	dvp - ctime|mtime
1913 *	 vp - ctime (if nlink > 0)
1914 */
1915
1916/*ARGSUSED*/
1917static int
1918zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
1919{
1920	znode_t		*dzp = VTOZ(dvp);
1921	znode_t		*zp = VTOZ(vp);
1922	znode_t		*xzp;
1923	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1924	zilog_t		*zilog;
1925	uint64_t	acl_obj, xattr_obj;
1926	uint64_t	obj = 0;
1927	dmu_tx_t	*tx;
1928	boolean_t	unlinked, toobig = FALSE;
1929	uint64_t	txtype;
1930	int		error;
1931
1932	ZFS_ENTER(zfsvfs);
1933	ZFS_VERIFY_ZP(dzp);
1934	ZFS_VERIFY_ZP(zp);
1935	zilog = zfsvfs->z_log;
1936	zp = VTOZ(vp);
1937
1938	xattr_obj = 0;
1939	xzp = NULL;
1940
1941	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1942		goto out;
1943	}
1944
1945	/*
1946	 * Need to use rmdir for removing directories.
1947	 */
1948	if (vp->v_type == VDIR) {
1949		error = SET_ERROR(EPERM);
1950		goto out;
1951	}
1952
1953	vnevent_remove(vp, dvp, name, ct);
1954
1955	obj = zp->z_id;
1956
1957	/* are there any extended attributes? */
1958	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1959	    &xattr_obj, sizeof (xattr_obj));
1960	if (error == 0 && xattr_obj) {
1961		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1962		ASSERT0(error);
1963	}
1964
1965	/*
1966	 * We may delete the znode now, or we may put it in the unlinked set;
1967	 * it depends on whether we're the last link, and on whether there are
1968	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
1969	 * allow for either case.
1970	 */
1971	tx = dmu_tx_create(zfsvfs->z_os);
1972	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1973	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1974	zfs_sa_upgrade_txholds(tx, zp);
1975	zfs_sa_upgrade_txholds(tx, dzp);
1976
1977	if (xzp) {
1978		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1979		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1980	}
1981
1982	/* charge as an update -- would be nice not to charge at all */
1983	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1984
1985	/*
1986	 * Mark this transaction as typically resulting in a net free of space
1987	 */
1988	dmu_tx_mark_netfree(tx);
1989
1990	error = dmu_tx_assign(tx, TXG_WAIT);
1991	if (error) {
1992		dmu_tx_abort(tx);
1993		ZFS_EXIT(zfsvfs);
1994		return (error);
1995	}
1996
1997	/*
1998	 * Remove the directory entry.
1999	 */
2000	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);
2001
2002	if (error) {
2003		dmu_tx_commit(tx);
2004		goto out;
2005	}
2006
2007	if (unlinked) {
2008		zfs_unlinked_add(zp, tx);
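		/*
		 * The znode now lives on the unlinked set; VV_NOSYNC
		 * hints to the syncer that the file's dirty data no
		 * longer needs to be written back, since the object is
		 * freed once the last hold goes away.
		 */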
2009		vp->v_vflag |= VV_NOSYNC;
2010	}
2011
2012	txtype = TX_REMOVE;
2013	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
2014
2015	dmu_tx_commit(tx);
2016out:
2017
2018	if (xzp)
2019		vrele(ZTOV(xzp));
2020
2021	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2022		zil_commit(zilog, 0);
2023
2024	ZFS_EXIT(zfsvfs);
2025	return (error);
2026}
2027
2028/*
2029 * Create a new directory and insert it into dvp using the name
2030 * provided.  Return a pointer to the inserted directory.
2031 *
2032 *	IN:	dvp	- vnode of directory to add subdir to.
2033 *		dirname	- name of new directory.
2034 *		vap	- attributes of new directory.
2035 *		cr	- credentials of caller.
2039 *
2040 *	OUT:	vpp	- vnode of created directory.
2041 *
2042 *	RETURN:	0 on success, error code on failure.
2043 *
2044 * Timestamps:
2045 *	dvp - ctime|mtime updated
2046 *	 vp - ctime|mtime|atime updated
2047 */
2048/*ARGSUSED*/
2049static int
2050zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
2051{
2052	znode_t		*zp, *dzp = VTOZ(dvp);
2053	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2054	zilog_t		*zilog;
2055	uint64_t	txtype;
2056	dmu_tx_t	*tx;
2057	int		error;
2058	ksid_t		*ksid;
2059	uid_t		uid;
2060	gid_t		gid = crgetgid(cr);
2061	zfs_acl_ids_t   acl_ids;
2062	boolean_t	fuid_dirtied;
2063
2064	ASSERT(vap->va_type == VDIR);
2065
2066	/*
2067	 * If we have an ephemeral id, ACL, or XVATTR then
2068	 * make sure the file system is at the proper version.
2069	 */
2070
2071	ksid = crgetsid(cr, KSID_OWNER);
2072	if (ksid)
2073		uid = ksid_getid(ksid);
2074	else
2075		uid = crgetuid(cr);
2076	if (zfsvfs->z_use_fuids == B_FALSE &&
2077	    ((vap->va_mask & AT_XVATTR) ||
2078	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
2079		return (SET_ERROR(EINVAL));
2080
2081	ZFS_ENTER(zfsvfs);
2082	ZFS_VERIFY_ZP(dzp);
2083	zilog = zfsvfs->z_log;
2084
2085	if (dzp->z_pflags & ZFS_XATTR) {
2086		ZFS_EXIT(zfsvfs);
2087		return (SET_ERROR(EINVAL));
2088	}
2089
2090	if (zfsvfs->z_utf8 && u8_validate(dirname,
2091	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2092		ZFS_EXIT(zfsvfs);
2093		return (SET_ERROR(EILSEQ));
2094	}
2095
2096	if (vap->va_mask & AT_XVATTR) {
2097		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
2098		    crgetuid(cr), cr, vap->va_type)) != 0) {
2099			ZFS_EXIT(zfsvfs);
2100			return (error);
2101		}
2102	}
2103
2104	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
2105	    NULL, &acl_ids)) != 0) {
2106		ZFS_EXIT(zfsvfs);
2107		return (error);
2108	}
2109
2110	/*
2111	 * First make sure the new directory doesn't exist.
2112	 *
2113	 * Existence is checked first to make sure we don't return
2114	 * EACCES instead of EEXIST which can cause some applications
2115	 * to fail.
2116	 */
2117	*vpp = NULL;
2118
2119	if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) {
2120		zfs_acl_ids_free(&acl_ids);
2121		ZFS_EXIT(zfsvfs);
2122		return (error);
2123	}
2124	ASSERT3P(zp, ==, NULL);
2125
2126	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
2127		zfs_acl_ids_free(&acl_ids);
2128		ZFS_EXIT(zfsvfs);
2129		return (error);
2130	}
2131
2132	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
2133		zfs_acl_ids_free(&acl_ids);
2134		ZFS_EXIT(zfsvfs);
2135		return (SET_ERROR(EDQUOT));
2136	}
2137
2138	/*
2139	 * Add a new entry to the directory.
2140	 */
2141	getnewvnode_reserve(1);
2142	tx = dmu_tx_create(zfsvfs->z_os);
2143	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
2144	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2145	fuid_dirtied = zfsvfs->z_fuid_dirty;
2146	if (fuid_dirtied)
2147		zfs_fuid_txhold(zfsvfs, tx);
2148	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2149		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
2150		    acl_ids.z_aclp->z_acl_bytes);
2151	}
2152
2153	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
2154	    ZFS_SA_BASE_ATTR_SIZE);
2155
2156	error = dmu_tx_assign(tx, TXG_WAIT);
2157	if (error) {
2158		zfs_acl_ids_free(&acl_ids);
2159		dmu_tx_abort(tx);
2160		getnewvnode_drop_reserve();
2161		ZFS_EXIT(zfsvfs);
2162		return (error);
2163	}
2164
2165	/*
2166	 * Create new node.
2167	 */
2168	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
2169
2170	if (fuid_dirtied)
2171		zfs_fuid_sync(zfsvfs, tx);
2172
2173	/*
2174	 * Now put new name in parent dir.
2175	 */
2176	(void) zfs_link_create(dzp, dirname, zp, tx, ZNEW);
2177
2178	*vpp = ZTOV(zp);
2179
2180	txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
2181	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
2182	    acl_ids.z_fuidp, vap);
2183
2184	zfs_acl_ids_free(&acl_ids);
2185
2186	dmu_tx_commit(tx);
2187
2188	getnewvnode_drop_reserve();
2189
2190	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2191		zil_commit(zilog, 0);
2192
2193	ZFS_EXIT(zfsvfs);
2194	return (0);
2195}
2196
2197/*
2198 * Remove a directory subdir entry.  The directory to be removed has
2199 * already been looked up by the caller and is passed in as vp.
2200 *
2201 *	IN:	dvp	- vnode of directory to remove from.
2202 *		vp	- vnode of the directory to be removed.
2203 *		name	- name of directory to be removed.
2204 *		cr	- credentials of caller.
2208 *
2209 *	RETURN:	0 on success, error code on failure.
2210 *
2211 * Timestamps:
2212 *	dvp - ctime|mtime updated
2213 */
2214/*ARGSUSED*/
2215static int
2216zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
2217{
2218	znode_t		*dzp = VTOZ(dvp);
2219	znode_t		*zp = VTOZ(vp);
2220	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2221	zilog_t		*zilog;
2222	dmu_tx_t	*tx;
2223	int		error;
2224
2225	ZFS_ENTER(zfsvfs);
2226	ZFS_VERIFY_ZP(dzp);
2227	ZFS_VERIFY_ZP(zp);
2228	zilog = zfsvfs->z_log;
2229
2231	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
2232		goto out;
2233	}
2234
2235	if (vp->v_type != VDIR) {
2236		error = SET_ERROR(ENOTDIR);
2237		goto out;
2238	}
2239
2240	vnevent_rmdir(vp, dvp, name, ct);
2241
2242	tx = dmu_tx_create(zfsvfs->z_os);
2243	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2244	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2245	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2246	zfs_sa_upgrade_txholds(tx, zp);
2247	zfs_sa_upgrade_txholds(tx, dzp);
2248	dmu_tx_mark_netfree(tx);
2249	error = dmu_tx_assign(tx, TXG_WAIT);
2250	if (error) {
2251		dmu_tx_abort(tx);
2252		ZFS_EXIT(zfsvfs);
2253		return (error);
2254	}
2255
2256	cache_purge(dvp);
2257
2258	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);
2259
2260	if (error == 0) {
2261		uint64_t txtype = TX_RMDIR;
2262		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
2263	}
2264
2265	dmu_tx_commit(tx);
2266
2267	cache_purge(vp);
2268out:
2269	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2270		zil_commit(zilog, 0);
2271
2272	ZFS_EXIT(zfsvfs);
2273	return (error);
2274}
2275
2276/*
2277 * Read as many directory entries as will fit into the provided
2278 * buffer from the given directory cursor position (specified in
2279 * the uio structure).
2280 *
2281 *	IN:	vp	- vnode of directory to read.
2282 *		uio	- structure supplying read location, range info,
2283 *			  and return buffer.
2284 *		cr	- credentials of caller.
2285 *		ncookies- count of NFS directory cookies wanted, or NULL.
2286 *		cookies	- array of NFS directory cookies, allocated here.
2287 *
2288 *	OUT:	uio	- updated offset and range, buffer filled.
2289 *		eofp	- set to true if end-of-file detected.
2290 *
2291 *	RETURN:	0 on success, error code on failure.
2292 *
2293 * Timestamps:
2294 *	vp - atime updated
2295 *
2296 * Note that the low 4 bits of the cookie returned by the ZAP are always zero.
2297 * This allows us to use the low range for "special" directory entries:
2298 * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
2299 * we use the offset 2 for the '.zfs' directory.
2300 */
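/*
 * A sketch of the offset space consumed by the loop below (offsets
 * other than 0-2 are serialized ZAP cursors):
 *
 *	offset 0	->  "."    (this directory, zp->z_id)
 *	offset 1	->  ".."   (parent directory)
 *	offset 2	->  ".zfs" (filesystem root only, when visible)
 *	offset > 2	->  regular entries fetched through a ZAP cursor
 *			    serialized into the offset (low 4 bits zero)
 */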
2301/* ARGSUSED */
2302static int
2303zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp,
    int *ncookies, u_long **cookies)
2304{
2305	znode_t		*zp = VTOZ(vp);
2306	iovec_t		*iovp;
2307	edirent_t	*eodp;
2308	dirent64_t	*odp;
2309	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2310	objset_t	*os;
2311	caddr_t		outbuf;
2312	size_t		bufsize;
2313	zap_cursor_t	zc;
2314	zap_attribute_t	zap;
2315	uint_t		bytes_wanted;
2316	uint64_t	offset; /* must be unsigned; checks for < 1 */
2317	uint64_t	parent;
2318	int		local_eof;
2319	int		outcount;
2320	int		error;
2321	uint8_t		prefetch;
2322	boolean_t	check_sysattrs;
2323	uint8_t		type;
2324	int		ncooks;
2325	u_long		*cooks = NULL;
2326	int		flags = 0;
2327
2328	ZFS_ENTER(zfsvfs);
2329	ZFS_VERIFY_ZP(zp);
2330
2331	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
2332	    &parent, sizeof (parent))) != 0) {
2333		ZFS_EXIT(zfsvfs);
2334		return (error);
2335	}
2336
2337	/*
2338	 * If we are not given an eof variable,
2339	 * use a local one.
2340	 */
2341	if (eofp == NULL)
2342		eofp = &local_eof;
2343
2344	/*
2345	 * Check for valid iov_len.
2346	 */
2347	if (uio->uio_iov->iov_len <= 0) {
2348		ZFS_EXIT(zfsvfs);
2349		return (SET_ERROR(EINVAL));
2350	}
2351
2352	/*
2353	 * Quit if directory has been removed (posix)
2354	 */
2355	if ((*eofp = zp->z_unlinked) != 0) {
2356		ZFS_EXIT(zfsvfs);
2357		return (0);
2358	}
2359
2360	error = 0;
2361	os = zfsvfs->z_os;
2362	offset = uio->uio_loffset;
2363	prefetch = zp->z_zn_prefetch;
2364
2365	/*
2366	 * Initialize the iterator cursor.
2367	 */
2368	if (offset <= 3) {
2369		/*
2370		 * Start iteration from the beginning of the directory.
2371		 */
2372		zap_cursor_init(&zc, os, zp->z_id);
2373	} else {
2374		/*
2375		 * The offset is a serialized cursor.
2376		 */
2377		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
2378	}
2379
2380	/*
2381	 * Get space to change directory entries into fs independent format.
2382	 */
2383	iovp = uio->uio_iov;
2384	bytes_wanted = iovp->iov_len;
2385	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
2386		bufsize = bytes_wanted;
2387		outbuf = kmem_alloc(bufsize, KM_SLEEP);
2388		odp = (struct dirent64 *)outbuf;
2389	} else {
2390		bufsize = bytes_wanted;
2391		outbuf = NULL;
2392		odp = (struct dirent64 *)iovp->iov_base;
2393	}
2394	eodp = (struct edirent *)odp;
2395
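	/*
	 * Directory cookies are consumed by the NFS server to resume
	 * directory reads.  The array is sized for the smallest possible
	 * entry, so it cannot be overrun; unused slots are subtracted
	 * from *ncookies after the main loop.
	 */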
2396	if (ncookies != NULL) {
2397		/*
2398		 * Minimum entry size is dirent size plus 1 byte for the file name.
2399		 */
2400		ncooks = uio->uio_resid / (sizeof (struct dirent) -
		    sizeof (((struct dirent *)NULL)->d_name) + 1);
2401		cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
2402		*cookies = cooks;
2403		*ncookies = ncooks;
2404	}
2405	/*
2406	 * If this VFS supports the system attribute view interface, and
2407	 * we're looking at an extended attribute directory, and we care
2408	 * about normalization conflicts on this vfs, then we must check
2409	 * for normalization conflicts with the sysattr name space.
2410	 */
2411#ifdef TODO
2412	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
2413	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
2414	    (flags & V_RDDIR_ENTFLAGS);
2415#else
2416	check_sysattrs = 0;
2417#endif
2418
2419	/*
2420	 * Transform to file-system independent format
2421	 */
2422	outcount = 0;
2423	while (outcount < bytes_wanted) {
2424		ino64_t objnum;
2425		ushort_t reclen;
2426		off64_t *next = NULL;
2427
2428		/*
2429		 * Special case `.', `..', and `.zfs'.
2430		 */
2431		if (offset == 0) {
2432			(void) strcpy(zap.za_name, ".");
2433			zap.za_normalization_conflict = 0;
2434			objnum = zp->z_id;
2435			type = DT_DIR;
2436		} else if (offset == 1) {
2437			(void) strcpy(zap.za_name, "..");
2438			zap.za_normalization_conflict = 0;
2439			objnum = parent;
2440			type = DT_DIR;
2441		} else if (offset == 2 && zfs_show_ctldir(zp)) {
2442			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
2443			zap.za_normalization_conflict = 0;
2444			objnum = ZFSCTL_INO_ROOT;
2445			type = DT_DIR;
2446		} else {
2447			/*
2448			 * Grab next entry.
2449			 */
2450			if (error = zap_cursor_retrieve(&zc, &zap)) {
2451				if ((*eofp = (error == ENOENT)) != 0)
2452					break;
2453				else
2454					goto update;
2455			}
2456
2457			if (zap.za_integer_length != 8 ||
2458			    zap.za_num_integers != 1) {
2459				cmn_err(CE_WARN, "zap_readdir: bad directory "
2460				    "entry, obj = %lld, offset = %lld\n",
2461				    (u_longlong_t)zp->z_id,
2462				    (u_longlong_t)offset);
2463				error = SET_ERROR(ENXIO);
2464				goto update;
2465			}
2466
2467			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
2468			/*
2469			 * Extract the object type encoded in the upper
2470			 * bits of the ZAP integer (see ZFS_DIRENT_TYPE()).
2471			 */
2472			type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2473
2474			if (check_sysattrs && !zap.za_normalization_conflict) {
2475#ifdef TODO
2476				zap.za_normalization_conflict =
2477				    xattr_sysattr_casechk(zap.za_name);
2478#else
2479				panic("%s:%u: TODO", __func__, __LINE__);
2480#endif
2481			}
2482		}
2483
2484		if (flags & V_RDDIR_ACCFILTER) {
2485			/*
2486			 * If we have no access at all, don't include
2487			 * this entry in the returned information
2488			 */
2489			znode_t	*ezp;
2490			if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
2491				goto skip_entry;
2492			if (!zfs_has_access(ezp, cr)) {
2493				vrele(ZTOV(ezp));
2494				goto skip_entry;
2495			}
2496			vrele(ZTOV(ezp));
2497		}
2498
2499		if (flags & V_RDDIR_ENTFLAGS)
2500			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
2501		else
2502			reclen = DIRENT64_RECLEN(strlen(zap.za_name));
2503
2504		/*
2505		 * Will this entry fit in the buffer?
2506		 */
2507		if (outcount + reclen > bufsize) {
2508			/*
2509			 * Did we manage to fit anything in the buffer?
2510			 */
2511			if (!outcount) {
2512				error = SET_ERROR(EINVAL);
2513				goto update;
2514			}
2515			break;
2516		}
2517		if (flags & V_RDDIR_ENTFLAGS) {
2518			/*
2519			 * Add extended flag entry:
2520			 */
2521			eodp->ed_ino = objnum;
2522			eodp->ed_reclen = reclen;
2523			/* NOTE: ed_off is the offset for the *next* entry */
2524			next = &(eodp->ed_off);
2525			eodp->ed_eflags = zap.za_normalization_conflict ?
2526			    ED_CASE_CONFLICT : 0;
2527			(void) strncpy(eodp->ed_name, zap.za_name,
2528			    EDIRENT_NAMELEN(reclen));
2529			eodp = (edirent_t *)((intptr_t)eodp + reclen);
2530		} else {
2531			/*
2532			 * Add normal entry:
2533			 */
2534			odp->d_ino = objnum;
2535			odp->d_reclen = reclen;
2536			odp->d_namlen = strlen(zap.za_name);
2537			(void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
2538			odp->d_type = type;
2539			odp = (dirent64_t *)((intptr_t)odp + reclen);
2540		}
2541		outcount += reclen;
2542
2543		ASSERT(outcount <= bufsize);
2544
2545		/* Prefetch znode */
2546		if (prefetch)
2547			dmu_prefetch(os, objnum, 0, 0, 0,
2548			    ZIO_PRIORITY_SYNC_READ);
2549
2550	skip_entry:
2551		/*
2552		 * Move to the next entry, fill in the previous offset.
2553		 */
2554		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2555			zap_cursor_advance(&zc);
2556			offset = zap_cursor_serialize(&zc);
2557		} else {
2558			offset += 1;
2559		}
2560
2561		if (cooks != NULL) {
2562			*cooks++ = offset;
2563			ncooks--;
2564			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
2565		}
2566	}
2567	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2568
2569	/* Subtract unused cookies */
2570	if (ncookies != NULL)
2571		*ncookies -= ncooks;
2572
2573	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
2574		iovp->iov_base += outcount;
2575		iovp->iov_len -= outcount;
2576		uio->uio_resid -= outcount;
2577	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
2578		/*
2579		 * Reset the pointer.
2580		 */
2581		offset = uio->uio_loffset;
2582	}
2583
2584update:
2585	zap_cursor_fini(&zc);
2586	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
2587		kmem_free(outbuf, bufsize);
2588
2589	if (error == ENOENT)
2590		error = 0;
2591
2592	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2593
2594	uio->uio_loffset = offset;
2595	ZFS_EXIT(zfsvfs);
2596	if (error != 0 && cookies != NULL) {
2597		free(*cookies, M_TEMP);
2598		*cookies = NULL;
2599		*ncookies = 0;
2600	}
2601	return (error);
2602}
2603
2604ulong_t zfs_fsync_sync_cnt = 4;
2605
2606static int
2607zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
2608{
2609	znode_t	*zp = VTOZ(vp);
2610	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2611
2612	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2613
2614	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
2615		ZFS_ENTER(zfsvfs);
2616		ZFS_VERIFY_ZP(zp);
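		/*
		 * Passing zp->z_id as the foid lets the ZIL promote just
		 * this file's asynchronous log records to synchronous
		 * before committing, rather than forcing every
		 * outstanding async record in the dataset.
		 */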
2617		zil_commit(zfsvfs->z_log, zp->z_id);
2618		ZFS_EXIT(zfsvfs);
2619	}
2620	return (0);
2621}
2622
2623
2624/*
2625 * Get the requested file attributes and place them in the provided
2626 * vattr structure.
2627 *
2628 *	IN:	vp	- vnode of file.
2629 *		vap	- va_mask identifies requested attributes.
2630 *			  If AT_XVATTR set, then optional attrs are requested
2631 *		flags	- ATTR_NOACLCHECK (CIFS server context)
2632 *		cr	- credentials of caller.
2633 *		ct	- caller context
2634 *
2635 *	OUT:	vap	- attribute values.
2636 *
2637 *	RETURN:	0 on success, error code on failure.
2638 */
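/*
 * A minimal sketch of how a caller would request one of the optional
 * attributes handled below (hypothetical caller code, not part of this
 * file; note that xva_init() already sets va_mask to AT_XVATTR):
 *
 *	xvattr_t xva;
 *
 *	xva_init(&xva);
 *	XVA_SET_REQ(&xva, XAT_READONLY);
 *	error = zfs_getattr(vp, &xva.xva_vattr, 0, cr, NULL);
 *	if (error == 0 && XVA_ISSET_RTN(&xva, XAT_READONLY))
 *		readonly = xva.xva_xoptattrs.xoa_readonly;
 */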
2639/* ARGSUSED */
2640static int
2641zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2642    caller_context_t *ct)
2643{
2644	znode_t *zp = VTOZ(vp);
2645	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2646	int	error = 0;
2647	uint32_t blksize;
2648	u_longlong_t nblocks;
2649	uint64_t links;
2650	uint64_t mtime[2], ctime[2], crtime[2], rdev;
2651	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
2652	xoptattr_t *xoap = NULL;
2653	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2654	sa_bulk_attr_t bulk[4];
2655	int count = 0;
2656
2657	ZFS_ENTER(zfsvfs);
2658	ZFS_VERIFY_ZP(zp);
2659
2660	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
2661
2662	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
2663	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
2664	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
2665	if (vp->v_type == VBLK || vp->v_type == VCHR)
2666		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
2667		    &rdev, 8);
2668
2669	if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
2670		ZFS_EXIT(zfsvfs);
2671		return (error);
2672	}
2673
2674	/*
2675	 * If the ACL is trivial, don't bother looking for ACE_READ_ATTRIBUTES.
2676	 * Also, if we are the owner, don't bother, since the owner is
2677	 * always allowed to read the basic attributes of a file.
2678	 */
2679	if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
2680	    (vap->va_uid != crgetuid(cr))) {
2681		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
2682		    skipaclchk, cr)) {
2683			ZFS_EXIT(zfsvfs);
2684			return (error);
2685		}
2686	}
2687
2688	/*
2689	 * Return all attributes.  It's cheaper to provide the answer
2690	 * than to determine whether we were asked the question.
2691	 */
2692
2693	vap->va_type = IFTOVT(zp->z_mode);
2694	vap->va_mode = zp->z_mode & ~S_IFMT;
2695#ifdef illumos
2696	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
2697#else
2698	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
2699#endif
2700	vap->va_nodeid = zp->z_id;
2701	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
2702		links = zp->z_links + 1;
2703	else
2704		links = zp->z_links;
2705	vap->va_nlink = MIN(links, LINK_MAX);	/* nlink_t limit! */
2706	vap->va_size = zp->z_size;
2707#ifdef illumos
2708	vap->va_rdev = vp->v_rdev;
2709#else
2710	if (vp->v_type == VBLK || vp->v_type == VCHR)
2711		vap->va_rdev = zfs_cmpldev(rdev);
2712#endif
2713	vap->va_seq = zp->z_seq;
2714	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */
2715	vap->va_filerev = zp->z_seq;
2716
2717	/*
2718	 * Add in any requested optional attributes and the create time.
2719	 * Also set the corresponding bits in the returned attribute bitmap.
2720	 */
2721	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
2722		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
2723			xoap->xoa_archive =
2724			    ((zp->z_pflags & ZFS_ARCHIVE) != 0);
2725			XVA_SET_RTN(xvap, XAT_ARCHIVE);
2726		}
2727
2728		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
2729			xoap->xoa_readonly =
2730			    ((zp->z_pflags & ZFS_READONLY) != 0);
2731			XVA_SET_RTN(xvap, XAT_READONLY);
2732		}
2733
2734		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
2735			xoap->xoa_system =
2736			    ((zp->z_pflags & ZFS_SYSTEM) != 0);
2737			XVA_SET_RTN(xvap, XAT_SYSTEM);
2738		}
2739
2740		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
2741			xoap->xoa_hidden =
2742			    ((zp->z_pflags & ZFS_HIDDEN) != 0);
2743			XVA_SET_RTN(xvap, XAT_HIDDEN);
2744		}
2745
2746		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2747			xoap->xoa_nounlink =
2748			    ((zp->z_pflags & ZFS_NOUNLINK) != 0);
2749			XVA_SET_RTN(xvap, XAT_NOUNLINK);
2750		}
2751
2752		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2753			xoap->xoa_immutable =
2754			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
2755			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
2756		}
2757
2758		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2759			xoap->xoa_appendonly =
2760			    ((zp->z_pflags & ZFS_APPENDONLY) != 0);
2761			XVA_SET_RTN(xvap, XAT_APPENDONLY);
2762		}
2763
2764		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2765			xoap->xoa_nodump =
2766			    ((zp->z_pflags & ZFS_NODUMP) != 0);
2767			XVA_SET_RTN(xvap, XAT_NODUMP);
2768		}
2769
2770		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
2771			xoap->xoa_opaque =
2772			    ((zp->z_pflags & ZFS_OPAQUE) != 0);
2773			XVA_SET_RTN(xvap, XAT_OPAQUE);
2774		}
2775
2776		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2777			xoap->xoa_av_quarantined =
2778			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
2779			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
2780		}
2781
2782		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2783			xoap->xoa_av_modified =
2784			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
2785			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
2786		}
2787
2788		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
2789		    vp->v_type == VREG) {
2790			zfs_sa_get_scanstamp(zp, xvap);
2791		}
2792
2793		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
2794			uint64_t times[2];
2795
2796			(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
2797			    times, sizeof (times));
2798			ZFS_TIME_DECODE(&xoap->xoa_createtime, times);
2799			XVA_SET_RTN(xvap, XAT_CREATETIME);
2800		}
2801
2802		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2803			xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
2804			XVA_SET_RTN(xvap, XAT_REPARSE);
2805		}
2806		if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
2807			xoap->xoa_generation = zp->z_gen;
2808			XVA_SET_RTN(xvap, XAT_GEN);
2809		}
2810
2811		if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
2812			xoap->xoa_offline =
2813			    ((zp->z_pflags & ZFS_OFFLINE) != 0);
2814			XVA_SET_RTN(xvap, XAT_OFFLINE);
2815		}
2816
2817		if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
2818			xoap->xoa_sparse =
2819			    ((zp->z_pflags & ZFS_SPARSE) != 0);
2820			XVA_SET_RTN(xvap, XAT_SPARSE);
2821		}
2822	}
2823
2824	ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
2825	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
2826	ZFS_TIME_DECODE(&vap->va_ctime, ctime);
2827	ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
2828
2830	sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
2831	vap->va_blksize = blksize;
2832	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */
2833
2834	if (zp->z_blksz == 0) {
2835		/*
2836		 * Block size hasn't been set; suggest maximal I/O transfers.
2837		 */
2838		vap->va_blksize = zfsvfs->z_max_blksz;
2839	}
2840
2841	ZFS_EXIT(zfsvfs);
2842	return (0);
2843}
2844
2845/*
2846 * Set the file attributes to the values contained in the
2847 * vattr structure.
2848 *
2849 *	IN:	vp	- vnode of file to be modified.
2850 *		vap	- new attribute values.
2851 *			  If AT_XVATTR set, then optional attrs are being set
2852 *		flags	- ATTR_UTIME set if non-default time values provided.
2853 *			- ATTR_NOACLCHECK (CIFS context only).
2854 *		cr	- credentials of caller.
2855 *		ct	- caller context
2856 *
2857 *	RETURN:	0 on success, error code on failure.
2858 *
2859 * Timestamps:
2860 *	vp - ctime updated, mtime updated if size changed.
2861 */
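/*
 * Rough shape of the function below: validate the request and the
 * filesystem state, resolve permissions (delegating generic cases to
 * secpolicy_vnode_setattr()), accumulate DMU transaction holds, then
 * apply all changes through a single sa_bulk_update() and log one
 * TX_SETATTR record.
 */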
2862/* ARGSUSED */
2863static int
2864zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2865    caller_context_t *ct)
2866{
2867	znode_t		*zp = VTOZ(vp);
2868	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2869	zilog_t		*zilog;
2870	dmu_tx_t	*tx;
2871	vattr_t		oldva;
2872	xvattr_t	tmpxvattr;
2873	uint_t		mask = vap->va_mask;
2874	uint_t		saved_mask = 0;
2875	uint64_t	saved_mode;
2876	int		trim_mask = 0;
2877	uint64_t	new_mode;
2878	uint64_t	new_uid, new_gid;
2879	uint64_t	xattr_obj;
2880	uint64_t	mtime[2], ctime[2];
2881	znode_t		*attrzp;
2882	int		need_policy = FALSE;
2883	int		err, err2;
2884	zfs_fuid_info_t *fuidp = NULL;
2885	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
2886	xoptattr_t	*xoap;
2887	zfs_acl_t	*aclp;
2888	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2889	boolean_t	fuid_dirtied = B_FALSE;
2890	sa_bulk_attr_t	bulk[7], xattr_bulk[7];
2891	int		count = 0, xattr_count = 0;
2892
2893	if (mask == 0)
2894		return (0);
2895
2896	if (mask & AT_NOSET)
2897		return (SET_ERROR(EINVAL));
2898
2899	ZFS_ENTER(zfsvfs);
2900	ZFS_VERIFY_ZP(zp);
2901
2902	zilog = zfsvfs->z_log;
2903
2904	/*
2905	 * Make sure that if we have ephemeral uid/gid or xvattr specified
2906	 * that the file system is at the proper version level.
2907	 */
2908
2909	if (zfsvfs->z_use_fuids == B_FALSE &&
2910	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2911	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2912	    (mask & AT_XVATTR))) {
2913		ZFS_EXIT(zfsvfs);
2914		return (SET_ERROR(EINVAL));
2915	}
2916
2917	if (mask & AT_SIZE && vp->v_type == VDIR) {
2918		ZFS_EXIT(zfsvfs);
2919		return (SET_ERROR(EISDIR));
2920	}
2921
2922	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
2923		ZFS_EXIT(zfsvfs);
2924		return (SET_ERROR(EINVAL));
2925	}
2926
2927	/*
2928	 * If this is an xvattr_t, then get a pointer to the structure of
2929	 * optional attributes.  If this is NULL, then we have a vattr_t.
2930	 */
2931	xoap = xva_getxoptattr(xvap);
2932
2933	xva_init(&tmpxvattr);
2934
2935	/*
2936	 * Immutable files can only alter immutable bit and atime
2937	 */
2938	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
2939	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
2940	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2941		ZFS_EXIT(zfsvfs);
2942		return (SET_ERROR(EPERM));
2943	}
2944
2945	if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
2946		ZFS_EXIT(zfsvfs);
2947		return (SET_ERROR(EPERM));
2948	}
2949
2950	/*
2951	 * Verify that the timestamps don't overflow 32 bits.
2952	 * ZFS can handle large timestamps, but 32-bit syscalls can't
2953	 * handle times beyond January 2038.  This check should be removed
2954	 * once large timestamps are fully supported.
2955	 */
2956	if (mask & (AT_ATIME | AT_MTIME)) {
2957		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2958		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2959			ZFS_EXIT(zfsvfs);
2960			return (SET_ERROR(EOVERFLOW));
2961		}
2962	}
2963
2964	attrzp = NULL;
2965	aclp = NULL;
2966
2967	/* XXX: leftover question from illumos, which had a top: retry label here. */
2968	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
2969		ZFS_EXIT(zfsvfs);
2970		return (SET_ERROR(EROFS));
2971	}
2972
2973	/*
2974	 * First validate permissions
2975	 */
2976
2977	if (mask & AT_SIZE) {
2978		/*
2979		 * XXX - Note, we are not providing any open
2980		 * mode flags here (like FNDELAY), so we may
2981		 * block if there are locks present... this
2982		 * should be addressed in openat().
2983		 */
2984		/* XXX - would it be OK to generate a log record here? */
2985		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2986		if (err) {
2987			ZFS_EXIT(zfsvfs);
2988			return (err);
2989		}
2990	}
2991
2992	if (mask & (AT_ATIME|AT_MTIME) ||
2993	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2994	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2995	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2996	    XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
2997	    XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
2998	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2999	    XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
3000		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
3001		    skipaclchk, cr);
3002	}
3003
3004	if (mask & (AT_UID|AT_GID)) {
3005		int	idmask = (mask & (AT_UID|AT_GID));
3006		int	take_owner;
3007		int	take_group;
3008
3009		/*
3010		 * NOTE: even if a new mode is being set,
3011		 * we may clear S_ISUID/S_ISGID bits.
3012		 */
3013
3014		if (!(mask & AT_MODE))
3015			vap->va_mode = zp->z_mode;
3016
3017		/*
3018		 * Take ownership or chgrp to group we are a member of
3019		 */
3020
3021		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
3022		take_group = (mask & AT_GID) &&
3023		    zfs_groupmember(zfsvfs, vap->va_gid, cr);
3024
3025		/*
3026		 * If both AT_UID and AT_GID are set then take_owner and
3027		 * take_group must both be set in order to allow taking
3028		 * ownership.
3029		 *
3030		 * Otherwise, send the check through secpolicy_vnode_setattr()
3031		 *
3032		 */
3033
3034		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
3035		    ((idmask == AT_UID) && take_owner) ||
3036		    ((idmask == AT_GID) && take_group)) {
3037			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
3038			    skipaclchk, cr) == 0) {
3039				/*
3040				 * Remove setuid/setgid for non-privileged users
3041				 */
3042				secpolicy_setid_clear(vap, vp, cr);
3043				trim_mask = (mask & (AT_UID|AT_GID));
3044			} else {
3045				need_policy = TRUE;
3046			}
3047		} else {
3048			need_policy = TRUE;
3049		}
3050	}
3051
3052	oldva.va_mode = zp->z_mode;
3053	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
3054	if (mask & AT_XVATTR) {
3055		/*
3056		 * Update xvattr mask to include only those attributes
3057		 * that are actually changing.
3058		 *
3059		 * The bits will be restored prior to actually setting
3060		 * the attributes, so the caller thinks they were set.
3061		 */
3062		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
3063			if (xoap->xoa_appendonly !=
3064			    ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
3065				need_policy = TRUE;
3066			} else {
3067				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
3068				XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
3069			}
3070		}
3071
3072		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
3073			if (xoap->xoa_nounlink !=
3074			    ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
3075				need_policy = TRUE;
3076			} else {
3077				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
3078				XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
3079			}
3080		}
3081
3082		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
3083			if (xoap->xoa_immutable !=
3084			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
3085				need_policy = TRUE;
3086			} else {
3087				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
3088				XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
3089			}
3090		}
3091
3092		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
3093			if (xoap->xoa_nodump !=
3094			    ((zp->z_pflags & ZFS_NODUMP) != 0)) {
3095				need_policy = TRUE;
3096			} else {
3097				XVA_CLR_REQ(xvap, XAT_NODUMP);
3098				XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
3099			}
3100		}
3101
3102		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
3103			if (xoap->xoa_av_modified !=
3104			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
3105				need_policy = TRUE;
3106			} else {
3107				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
3108				XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
3109			}
3110		}
3111
3112		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
3113			if ((vp->v_type != VREG &&
3114			    xoap->xoa_av_quarantined) ||
3115			    xoap->xoa_av_quarantined !=
3116			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
3117				need_policy = TRUE;
3118			} else {
3119				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
3120				XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
3121			}
3122		}
3123
3124		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
3125			ZFS_EXIT(zfsvfs);
3126			return (SET_ERROR(EPERM));
3127		}
3128
3129		if (need_policy == FALSE &&
3130		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
3131		    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
3132			need_policy = TRUE;
3133		}
3134	}
3135
3136	if (mask & AT_MODE) {
3137		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
3138			err = secpolicy_setid_setsticky_clear(vp, vap,
3139			    &oldva, cr);
3140			if (err) {
3141				ZFS_EXIT(zfsvfs);
3142				return (err);
3143			}
3144			trim_mask |= AT_MODE;
3145		} else {
3146			need_policy = TRUE;
3147		}
3148	}
3149
3150	if (need_policy) {
3151		/*
3152		 * If trim_mask is set then take ownership
3153		 * has been granted or write_acl is present and user
3154		 * has the ability to modify mode.  In that case remove
3155		 * UID|GID and/or MODE from the mask so that
3156		 * secpolicy_vnode_setattr() doesn't revoke it.
3157		 */
3158
3159		if (trim_mask) {
3160			saved_mask = vap->va_mask;
3161			vap->va_mask &= ~trim_mask;
3162			if (trim_mask & AT_MODE) {
3163				/*
3164				 * Save the mode, as secpolicy_vnode_setattr()
3165				 * will overwrite it with ova.va_mode.
3166				 */
3167				saved_mode = vap->va_mode;
3168			}
3169		}
3170		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
3171		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
3172		if (err) {
3173			ZFS_EXIT(zfsvfs);
3174			return (err);
3175		}
3176
3177		if (trim_mask) {
3178			vap->va_mask |= saved_mask;
3179			if (trim_mask & AT_MODE) {
3180				/*
3181				 * Recover the mode after
3182				 * secpolicy_vnode_setattr().
3183				 */
3184				vap->va_mode = saved_mode;
3185			}
3186		}
3187	}
3188
3189	/*
3190	 * secpolicy_vnode_setattr() or the take-ownership path may have
3191	 * changed va_mask.
3192	 */
3193	mask = vap->va_mask;
3194
3195	if ((mask & (AT_UID | AT_GID))) {
3196		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
3197		    &xattr_obj, sizeof (xattr_obj));
3198
3199		if (err == 0 && xattr_obj) {
3200			err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
3201			if (err == 0) {
3202				err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE);
3203				if (err != 0)
3204					vrele(ZTOV(attrzp));
3205			}
3206			if (err)
3207				goto out2;
3208		}
3209		if (mask & AT_UID) {
3210			new_uid = zfs_fuid_create(zfsvfs,
3211			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
3212			if (new_uid != zp->z_uid &&
3213			    zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
3214				if (attrzp)
3215					vput(ZTOV(attrzp));
3216				err = SET_ERROR(EDQUOT);
3217				goto out2;
3218			}
3219		}
3220
3221		if (mask & AT_GID) {
3222			new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
3223			    cr, ZFS_GROUP, &fuidp);
3224			if (new_gid != zp->z_gid &&
3225			    zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
3226				if (attrzp)
3227					vput(ZTOV(attrzp));
3228				err = SET_ERROR(EDQUOT);
3229				goto out2;
3230			}
3231		}
3232	}
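	/*
	 * Permission and quota checks are done; from here until
	 * dmu_tx_assign() we only accumulate transaction holds.
	 */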
3233	tx = dmu_tx_create(zfsvfs->z_os);
3234
3235	if (mask & AT_MODE) {
3236		uint64_t pmode = zp->z_mode;
3237		uint64_t acl_obj;
3238		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
3239
3240		if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
3241		    !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
3242			err = SET_ERROR(EPERM);
3243			goto out;
3244		}
3245
3246		if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
3247			goto out;
3248
3249		if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
3250			/*
3251			 * Are we upgrading ACL from old V0 format
3252			 * to V1 format?
3253			 */
3254			if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
3255			    zfs_znode_acl_version(zp) ==
3256			    ZFS_ACL_VERSION_INITIAL) {
3257				dmu_tx_hold_free(tx, acl_obj, 0,
3258				    DMU_OBJECT_END);
3259				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3260				    0, aclp->z_acl_bytes);
3261			} else {
3262				dmu_tx_hold_write(tx, acl_obj, 0,
3263				    aclp->z_acl_bytes);
3264			}
3265		} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3266			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3267			    0, aclp->z_acl_bytes);
3268		}
3269		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3270	} else {
3271		if ((mask & AT_XVATTR) &&
3272		    XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3273			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3274		else
3275			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3276	}
3277
3278	if (attrzp) {
3279		dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3280	}
3281
3282	fuid_dirtied = zfsvfs->z_fuid_dirty;
3283	if (fuid_dirtied)
3284		zfs_fuid_txhold(zfsvfs, tx);
3285
3286	zfs_sa_upgrade_txholds(tx, zp);
3287
3288	err = dmu_tx_assign(tx, TXG_WAIT);
3289	if (err)
3290		goto out;
3291
3292	count = 0;
3293	/*
3294	 * Set each attribute requested.
3295	 * We group settings according to the locks they need to acquire.
3296	 *
3297	 * Note: you cannot set ctime directly, although it will be
3298	 * updated as a side-effect of calling this function.
3299	 */
3300
3301	if (mask & (AT_UID|AT_GID|AT_MODE))
3302		mutex_enter(&zp->z_acl_lock);
3303
3304	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3305	    &zp->z_pflags, sizeof (zp->z_pflags));
3306
3307	if (attrzp) {
3308		if (mask & (AT_UID|AT_GID|AT_MODE))
3309			mutex_enter(&attrzp->z_acl_lock);
3310		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3311		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
3312		    sizeof (attrzp->z_pflags));
3313	}
3314
3315	if (mask & (AT_UID|AT_GID)) {
3316
3317		if (mask & AT_UID) {
3318			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
3319			    &new_uid, sizeof (new_uid));
3320			zp->z_uid = new_uid;
3321			if (attrzp) {
3322				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3323				    SA_ZPL_UID(zfsvfs), NULL, &new_uid,
3324				    sizeof (new_uid));
3325				attrzp->z_uid = new_uid;
3326			}
3327		}
3328
3329		if (mask & AT_GID) {
3330			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
3331			    NULL, &new_gid, sizeof (new_gid));
3332			zp->z_gid = new_gid;
3333			if (attrzp) {
3334				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3335				    SA_ZPL_GID(zfsvfs), NULL, &new_gid,
3336				    sizeof (new_gid));
3337				attrzp->z_gid = new_gid;
3338			}
3339		}
3340		if (!(mask & AT_MODE)) {
3341			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
3342			    NULL, &new_mode, sizeof (new_mode));
3343			new_mode = zp->z_mode;
3344		}
3345		err = zfs_acl_chown_setattr(zp);
3346		ASSERT(err == 0);
3347		if (attrzp) {
3348			err = zfs_acl_chown_setattr(attrzp);
3349			ASSERT(err == 0);
3350		}
3351	}
3352
3353	if (mask & AT_MODE) {
3354		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
3355		    &new_mode, sizeof (new_mode));
3356		zp->z_mode = new_mode;
3357		ASSERT3U((uintptr_t)aclp, !=, 0);
3358		err = zfs_aclset_common(zp, aclp, cr, tx);
3359		ASSERT0(err);
3360		if (zp->z_acl_cached)
3361			zfs_acl_free(zp->z_acl_cached);
3362		zp->z_acl_cached = aclp;
3363		aclp = NULL;
3364	}
3365
3367	if (mask & AT_ATIME) {
3368		ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
3369		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
3370		    &zp->z_atime, sizeof (zp->z_atime));
3371	}
3372
3373	if (mask & AT_MTIME) {
3374		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
3375		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
3376		    mtime, sizeof (mtime));
3377	}
3378
3379	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
3380	if (mask & AT_SIZE && !(mask & AT_MTIME)) {
3381		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
3382		    NULL, mtime, sizeof (mtime));
3383		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3384		    &ctime, sizeof (ctime));
3385		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
3386		    B_TRUE);
3387	} else if (mask != 0) {
3388		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3389		    &ctime, sizeof (ctime));
3390		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
3391		    B_TRUE);
3392		if (attrzp) {
3393			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3394			    SA_ZPL_CTIME(zfsvfs), NULL,
3395			    &ctime, sizeof (ctime));
3396			zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
3397			    mtime, ctime, B_TRUE);
3398		}
3399	}
3400	/*
3401	 * Do this after setting timestamps to prevent timestamp
3402	 * update from toggling bit
3403	 */
3404
3405	if (xoap && (mask & AT_XVATTR)) {
3406
3407		/*
3408		 * Restore the trimmed-off mask bits
3409		 * so that the return masks can be set for the caller.
3410		 */
3411
3412		if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
3413			XVA_SET_REQ(xvap, XAT_APPENDONLY);
3414		}
3415		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
3416			XVA_SET_REQ(xvap, XAT_NOUNLINK);
3417		}
3418		if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
3419			XVA_SET_REQ(xvap, XAT_IMMUTABLE);
3420		}
3421		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
3422			XVA_SET_REQ(xvap, XAT_NODUMP);
3423		}
3424		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
3425			XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
3426		}
3427		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
3428			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
3429		}
3430
3431		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3432			ASSERT(vp->v_type == VREG);
3433
3434		zfs_xvattr_set(zp, xvap, tx);
3435	}
3436
3437	if (fuid_dirtied)
3438		zfs_fuid_sync(zfsvfs, tx);
3439
3440	if (mask != 0)
3441		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3442
3443	if (mask & (AT_UID|AT_GID|AT_MODE))
3444		mutex_exit(&zp->z_acl_lock);
3445
3446	if (attrzp) {
3447		if (mask & (AT_UID|AT_GID|AT_MODE))
3448			mutex_exit(&attrzp->z_acl_lock);
3449	}
3450out:
3451	if (err == 0 && attrzp) {
3452		err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
3453		    xattr_count, tx);
3454		ASSERT(err2 == 0);
3455	}
3456
3457	if (attrzp)
3458		vput(ZTOV(attrzp));
3459
3460	if (aclp)
3461		zfs_acl_free(aclp);
3462
3463	if (fuidp) {
3464		zfs_fuid_info_free(fuidp);
3465		fuidp = NULL;
3466	}
3467
3468	if (err) {
3469		dmu_tx_abort(tx);
3470	} else {
3471		err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
3472		dmu_tx_commit(tx);
3473	}
3474
3475out2:
3476	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3477		zil_commit(zilog, 0);
3478
3479	ZFS_EXIT(zfsvfs);
3480	return (err);
3481}
3482
3483/*
3484 * We acquire all but the sdvp lock using non-blocking acquisitions.
3485 * If we fail to acquire any lock in the path we will drop all held
3486 * locks, acquire the new lock in a blocking fashion, and then release
3487 * it and restart the rename.  This acquire/release step ensures that
3488 * we do not spin on a lock waiting for release.  On error, release
3489 * all vnode locks and decrement references the way tmpfs_rename() does.
3490 */
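/*
 * Lock ordering used below, assuming four distinct vnodes (a failed
 * try-lock drops everything, waits for the contested lock once, and
 * restarts at the "relock" label):
 *
 *	sdvp (blocking)  ->  tdvp (try)  ->  svp (try)  ->  tvp (try)
 */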
3491static int
3492zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
3493    struct vnode *tdvp, struct vnode **tvpp,
3494    const struct componentname *scnp, const struct componentname *tcnp)
3495{
3496	zfsvfs_t	*zfsvfs;
3497	struct vnode	*nvp, *svp, *tvp;
3498	znode_t		*sdzp, *tdzp, *szp, *tzp;
3499	const char	*snm = scnp->cn_nameptr;
3500	const char	*tnm = tcnp->cn_nameptr;
3501	int error;
3502
3503	VOP_UNLOCK(tdvp, 0);
3504	if (*tvpp != NULL && *tvpp != tdvp)
3505		VOP_UNLOCK(*tvpp, 0);
3506
3507relock:
3508	error = vn_lock(sdvp, LK_EXCLUSIVE);
3509	if (error)
3510		goto out;
3511	sdzp = VTOZ(sdvp);
3512
3513	error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
3514	if (error != 0) {
3515		VOP_UNLOCK(sdvp, 0);
3516		if (error != EBUSY)
3517			goto out;
3518		error = vn_lock(tdvp, LK_EXCLUSIVE);
3519		if (error)
3520			goto out;
3521		VOP_UNLOCK(tdvp, 0);
3522		goto relock;
3523	}
3524	tdzp = VTOZ(tdvp);
3525
3526	/*
3527	 * Before using sdzp and tdzp we must ensure that they are live.
3528	 * As a porting legacy from illumos we have two things to worry
3529	 * about.  The first is the usual FreeBSD concern that the vnode
3530	 * is not reclaimed (doomed).  The other is that the znode is live.
3531	 * The current code can invalidate the znode without acquiring the
3532	 * corresponding vnode lock if the object represented by the znode
3533	 * and vnode is no longer valid after a rollback or receive operation.
3534	 * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock
3535	 * that protects the znodes from the invalidation.
3536	 */
3537	zfsvfs = sdzp->z_zfsvfs;
3538	ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
3539	ZFS_ENTER(zfsvfs);
3540
3541	/*
3542	 * We cannot use ZFS_VERIFY_ZP() here, because it could return
3543	 * directly and bypass the cleanup code in the case of an error.
3544	 */
3545	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
3546		ZFS_EXIT(zfsvfs);
3547		VOP_UNLOCK(sdvp, 0);
3548		VOP_UNLOCK(tdvp, 0);
3549		error = SET_ERROR(EIO);
3550		goto out;
3551	}
3552
3553	/*
3554	 * Re-resolve svp to be certain it still exists and fetch the
3555	 * correct vnode.
3556	 */
3557	error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS);
3558	if (error != 0) {
3559		/* Source entry invalid or not there. */
3560		ZFS_EXIT(zfsvfs);
3561		VOP_UNLOCK(sdvp, 0);
3562		VOP_UNLOCK(tdvp, 0);
3563		if ((scnp->cn_flags & ISDOTDOT) != 0 ||
3564		    (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
3565			error = SET_ERROR(EINVAL);
3566		goto out;
3567	}
3568	svp = ZTOV(szp);
3569
3570	/*
3571	 * Re-resolve tvp, if it disappeared we just carry on.
3572	 */
3573	error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0);
3574	if (error != 0) {
3575		ZFS_EXIT(zfsvfs);
3576		VOP_UNLOCK(sdvp, 0);
3577		VOP_UNLOCK(tdvp, 0);
3578		vrele(svp);
3579		if ((tcnp->cn_flags & ISDOTDOT) != 0)
3580			error = SET_ERROR(EINVAL);
3581		goto out;
3582	}
3583	if (tzp != NULL)
3584		tvp = ZTOV(tzp);
3585	else
3586		tvp = NULL;
3587
3588	/*
3589	 * At present the vnode locks must be acquired before z_teardown_lock,
3590	 * although it would be more logical to use the opposite order.
3591	 */
3592	ZFS_EXIT(zfsvfs);
3593
3594	/*
3595	 * Now try acquire locks on svp and tvp.
3596	 */
3597	nvp = svp;
3598	error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
3599	if (error != 0) {
3600		VOP_UNLOCK(sdvp, 0);
3601		VOP_UNLOCK(tdvp, 0);
3602		if (tvp != NULL)
3603			vrele(tvp);
3604		if (error != EBUSY) {
3605			vrele(nvp);
3606			goto out;
3607		}
3608		error = vn_lock(nvp, LK_EXCLUSIVE);
3609		if (error != 0) {
3610			vrele(nvp);
3611			goto out;
3612		}
3613		VOP_UNLOCK(nvp, 0);
3614		/*
3615		 * Concurrent rename race.
3616		 * XXX ?
3617		 */
3618		if (nvp == tdvp) {
3619			vrele(nvp);
3620			error = SET_ERROR(EINVAL);
3621			goto out;
3622		}
3623		vrele(*svpp);
3624		*svpp = nvp;
3625		goto relock;
3626	}
3627	vrele(*svpp);
3628	*svpp = nvp;
3629
3630	if (*tvpp != NULL)
3631		vrele(*tvpp);
3632	*tvpp = NULL;
3633	if (tvp != NULL) {
3634		nvp = tvp;
3635		error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
3636		if (error != 0) {
3637			VOP_UNLOCK(sdvp, 0);
3638			VOP_UNLOCK(tdvp, 0);
3639			VOP_UNLOCK(*svpp, 0);
3640			if (error != EBUSY) {
3641				vrele(nvp);
3642				goto out;
3643			}
3644			error = vn_lock(nvp, LK_EXCLUSIVE);
3645			if (error != 0) {
3646				vrele(nvp);
3647				goto out;
3648			}
3649			vput(nvp);
3650			goto relock;
3651		}
3652		*tvpp = nvp;
3653	}
3654
3655	return (0);
3656
3657out:
3658	return (error);
3659}
3660
3661/*
3662 * Note that we must use VRELE_ASYNC in this function as it walks
3663 * up the directory tree and vrele may need to acquire an exclusive
3664 * lock when the last reference to a vnode is dropped.
3665 */
3666static int
3667zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
3668{
3669	zfsvfs_t	*zfsvfs;
3670	znode_t		*zp, *zp1;
3671	uint64_t	parent;
3672	int		error;
3673
3674	zfsvfs = tdzp->z_zfsvfs;
3675	if (tdzp == szp)
3676		return (SET_ERROR(EINVAL));
3677	if (tdzp == sdzp)
3678		return (0);
3679	if (tdzp->z_id == zfsvfs->z_root)
3680		return (0);
3681	zp = tdzp;
3682	for (;;) {
3683		ASSERT(!zp->z_unlinked);
3684		if ((error = sa_lookup(zp->z_sa_hdl,
3685		    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
3686			break;
3687
3688		if (parent == szp->z_id) {
3689			error = SET_ERROR(EINVAL);
3690			break;
3691		}
3692		if (parent == zfsvfs->z_root)
3693			break;
3694		if (parent == sdzp->z_id)
3695			break;
3696
3697		error = zfs_zget(zfsvfs, parent, &zp1);
3698		if (error != 0)
3699			break;
3700
3701		if (zp != tdzp)
3702			VN_RELE_ASYNC(ZTOV(zp),
3703			    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
3704		zp = zp1;
3705	}
3706
3707	if (error == ENOTDIR)
3708		panic("checkpath: .. not a directory\n");
3709	if (zp != tdzp)
3710		VN_RELE_ASYNC(ZTOV(zp),
3711		    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
3712	return (error);
3713}
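/*
 * Worked example: renaming /usr/a/b over /usr/a/b/c/d starts the walk
 * at tdzp (c), whose parent is szp (b), so the rename is rejected with
 * EINVAL before any transaction is created.
 */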
3714
3715/*
3716 * Move an entry from the provided source directory to the target
3717 * directory.  Change the entry name as indicated.
3718 *
3719 *	IN:	sdvp	- Source directory containing the "old entry".
3720 *		svpp	- Source entry vnode (in/out).
3721 *		scnp	- componentname holding the old entry name.
3722 *		tdvp	- Target directory to contain the "new entry".
3723 *		tvpp	- Target entry vnode, if it exists (in/out).
3724 *		tcnp	- componentname holding the new entry name.
3725 *		cr	- credentials of caller.
3726 *
3727 *	RETURN:	0 on success, error code on failure.
3728 *
3729 * Timestamps:
3730 *	sdvp,tdvp - ctime|mtime updated
3731 */
3732/*ARGSUSED*/
3733static int
3734zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
3735    vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
3736    cred_t *cr)
3737{
3738	zfsvfs_t	*zfsvfs;
3739	znode_t		*sdzp, *tdzp, *szp, *tzp;
3740	zilog_t		*zilog = NULL;
3741	dmu_tx_t	*tx;
3742	char		*snm = scnp->cn_nameptr;
3743	char		*tnm = tcnp->cn_nameptr;
3744	int		error = 0;
3745
3746	/* Reject renames across filesystems. */
3747	if ((*svpp)->v_mount != tdvp->v_mount ||
3748	    ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
3749		error = SET_ERROR(EXDEV);
3750		goto out;
3751	}
3752
3753	if (zfsctl_is_node(tdvp)) {
3754		error = SET_ERROR(EXDEV);
3755		goto out;
3756	}
3757
3758	/*
3759	 * Lock all four vnodes to ensure safety and semantics of renaming.
3760	 */
3761	error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
3762	if (error != 0) {
3763		/* no vnodes are locked in the case of error here */
3764		return (error);
3765	}
3766
3767	tdzp = VTOZ(tdvp);
3768	sdzp = VTOZ(sdvp);
3769	zfsvfs = tdzp->z_zfsvfs;
3770	zilog = zfsvfs->z_log;
3771
3772	/*
3773	 * After we re-enter ZFS_ENTER() we will have to revalidate all
3774	 * znodes involved.
3775	 */
3776	ZFS_ENTER(zfsvfs);
3777
3778	if (zfsvfs->z_utf8 && u8_validate(tnm,
3779	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3780		error = SET_ERROR(EILSEQ);
3781		goto unlockout;
3782	}
3783
3784	/* If source and target are the same file, there is nothing to do. */
3785	if ((*svpp) == (*tvpp)) {
3786		error = 0;
3787		goto unlockout;
3788	}
3789
3790	if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
3791	    ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
3792	    (*tvpp)->v_mountedhere != NULL)) {
3793		error = SET_ERROR(EXDEV);
3794		goto unlockout;
3795	}
3796
3797	/*
3798	 * We cannot use ZFS_VERIFY_ZP() here, because it could return
3799	 * directly and bypass the cleanup code in the case of an error.
3800	 */
3801	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
3802		error = SET_ERROR(EIO);
3803		goto unlockout;
3804	}
3805
3806	szp = VTOZ(*svpp);
3807	tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
3808	if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) {
3809		error = SET_ERROR(EIO);
3810		goto unlockout;
3811	}
3812
3813	/*
3814	 * This is to prevent the creation of links into attribute space
3815	 * by renaming a linked file into/out of an attribute directory.
3816	 * See the comment in zfs_link() for why this is considered bad.
3817	 */
3818	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
3819		error = SET_ERROR(EINVAL);
3820		goto unlockout;
3821	}
3822
3823	/*
3824	 * Must have write access at the source to remove the old entry
3825	 * and write access at the target to create the new entry.
3826	 * Note that if target and source are the same, this can be
3827	 * done in a single check.
3828	 */
3829	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
3830		goto unlockout;
3831
3832	if ((*svpp)->v_type == VDIR) {
3833		/*
3834		 * Avoid ".", "..", and aliases of "." for obvious reasons.
3835		 */
3836		if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
3837		    sdzp == szp ||
3838		    (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
3839			error = SET_ERROR(EINVAL);
3840			goto unlockout;
3841		}
3842
3843		/*
3844		 * Check to make sure rename is valid.
3845		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
3846		 */
3847		if (error = zfs_rename_check(szp, sdzp, tdzp))
3848			goto unlockout;
3849	}
3850
3851	/*
3852	 * Does target exist?
3853	 */
3854	if (tzp) {
3855		/*
3856		 * Source and target must be the same type.
3857		 */
3858		if ((*svpp)->v_type == VDIR) {
3859			if ((*tvpp)->v_type != VDIR) {
3860				error = SET_ERROR(ENOTDIR);
3861				goto unlockout;
3862			} else {
3863				cache_purge(tdvp);
3864				if (sdvp != tdvp)
3865					cache_purge(sdvp);
3866			}
3867		} else {
3868			if ((*tvpp)->v_type == VDIR) {
3869				error = SET_ERROR(EISDIR);
3870				goto unlockout;
3871			}
3872		}
3873	}
3874
3875	vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
3876	if (tzp)
3877		vnevent_rename_dest(*tvpp, tdvp, tnm, ct);
3878
3879	/*
3880	 * Notify the target directory if it is not the same
3881	 * as the source directory.
3882	 */
3883	if (tdvp != sdvp) {
3884		vnevent_rename_dest_dir(tdvp, ct);
3885	}
3886
3887	tx = dmu_tx_create(zfsvfs->z_os);
3888	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3889	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
3890	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
3891	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
3892	if (sdzp != tdzp) {
3893		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
3894		zfs_sa_upgrade_txholds(tx, tdzp);
3895	}
3896	if (tzp) {
3897		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
3898		zfs_sa_upgrade_txholds(tx, tzp);
3899	}
3900
3901	zfs_sa_upgrade_txholds(tx, szp);
3902	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3903	error = dmu_tx_assign(tx, TXG_WAIT);
3904	if (error) {
3905		dmu_tx_abort(tx);
3906		goto unlockout;
3907	}
3908
3910	if (tzp)	/* Attempt to remove the existing target */
3911		error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);
3912
3913	if (error == 0) {
3914		error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
3915		if (error == 0) {
3916			szp->z_pflags |= ZFS_AV_MODIFIED;
3917
3918			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
3919			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
3920			ASSERT0(error);
3921
3922			error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING,
3923			    NULL);
3924			if (error == 0) {
3925				zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
3926				    snm, tdzp, tnm, szp);
3927
3928				/*
3929				 * Update path information for the target vnode
3930				 */
3931				vn_renamepath(tdvp, *svpp, tnm, strlen(tnm));
3932			} else {
3933				/*
3934				 * At this point, we have successfully created
3935				 * the target name, but have failed to remove
3936				 * the source name.  Since the create was done
3937				 * with the ZRENAMING flag, there are
3938				 * complications; for one, the link count is
3939				 * wrong.  The easiest way to deal with this
3940				 * is to remove the newly created target, and
3941				 * return the original error.  This must
3942				 * succeed; fortunately, it is very unlikely to
3943				 * fail, since we just created it.
3944				 */
3945				VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx,
3946				    ZRENAMING, NULL), ==, 0);
3947			}
3948		}
3949		if (error == 0) {
3950			cache_purge(*svpp);
3951			if (*tvpp != NULL)
3952				cache_purge(*tvpp);
3953			cache_purge_negative(tdvp);
3954		}
3955	}
3956
3957	dmu_tx_commit(tx);
3958
3959unlockout:			/* all 4 vnodes are locked, ZFS_ENTER called */
3960	ZFS_EXIT(zfsvfs);
3961	VOP_UNLOCK(*svpp, 0);
3962	VOP_UNLOCK(sdvp, 0);
3963
3964out:				/* original two vnodes are locked */
3965	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3966		zil_commit(zilog, 0);
3967
3968	if (*tvpp != NULL)
3969		VOP_UNLOCK(*tvpp, 0);
3970	if (tdvp != *tvpp)
3971		VOP_UNLOCK(tdvp, 0);
3972	return (error);
3973}
3974
3975/*
3976 * Insert the indicated symbolic reference entry into the directory.
3977 *
3978 *	IN:	dvp	- Directory to contain new symbolic link.
3979 *		name	- Name of the directory entry for the symlink.
3980 *		vap	- Attributes of new entry.
3981 *		link	- Target path the symlink will point to.
3982 *		cr	- credentials of caller.
3983 *	OUT:	vpp	- vnode of the created symlink.
3984 *
3985 *	RETURN:	0 on success, error code on failure.
3986 *
3987 * Timestamps:
3988 *	dvp - ctime|mtime updated
3989 */
3990/*ARGSUSED*/
3991static int
3992zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
3993    cred_t *cr, kthread_t *td)
3994{
3995	znode_t		*zp, *dzp = VTOZ(dvp);
3996	dmu_tx_t	*tx;
3997	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
3998	zilog_t		*zilog;
3999	uint64_t	len = strlen(link);
4000	int		error;
4001	zfs_acl_ids_t	acl_ids;
4002	boolean_t	fuid_dirtied;
4003	uint64_t	txtype = TX_SYMLINK;
4005
4006	ASSERT(vap->va_type == VLNK);
4007
4008	ZFS_ENTER(zfsvfs);
4009	ZFS_VERIFY_ZP(dzp);
4010	zilog = zfsvfs->z_log;
4011
4012	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
4013	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4014		ZFS_EXIT(zfsvfs);
4015		return (SET_ERROR(EILSEQ));
4016	}
4017
4018	if (len > MAXPATHLEN) {
4019		ZFS_EXIT(zfsvfs);
4020		return (SET_ERROR(ENAMETOOLONG));
4021	}
4022
4023	if ((error = zfs_acl_ids_create(dzp, 0,
4024	    vap, cr, NULL, &acl_ids)) != 0) {
4025		ZFS_EXIT(zfsvfs);
4026		return (error);
4027	}
4028
4029	/*
4030	 * Attempt to lock directory; fail if entry already exists.
4031	 */
4032	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
4033	if (error) {
4034		zfs_acl_ids_free(&acl_ids);
4035		ZFS_EXIT(zfsvfs);
4036		return (error);
4037	}
4038
4039	if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) != 0) {
4040		zfs_acl_ids_free(&acl_ids);
4041		ZFS_EXIT(zfsvfs);
4042		return (error);
4043	}
4044
4045	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
4046		zfs_acl_ids_free(&acl_ids);
4047		ZFS_EXIT(zfsvfs);
4048		return (SET_ERROR(EDQUOT));
4049	}
4050
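	/*
	 * Reserve a vnode up front so that allocating it later, while
	 * the DMU transaction is assigned, cannot block.
	 */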
4051	getnewvnode_reserve(1);
4052	tx = dmu_tx_create(zfsvfs->z_os);
4053	fuid_dirtied = zfsvfs->z_fuid_dirty;
4054	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
4055	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4056	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
4057	    ZFS_SA_BASE_ATTR_SIZE + len);
4058	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
4059	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
4060		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
4061		    acl_ids.z_aclp->z_acl_bytes);
4062	}
4063	if (fuid_dirtied)
4064		zfs_fuid_txhold(zfsvfs, tx);
4065	error = dmu_tx_assign(tx, TXG_WAIT);
4066	if (error) {
4067		zfs_acl_ids_free(&acl_ids);
4068		dmu_tx_abort(tx);
4069		getnewvnode_drop_reserve();
4070		ZFS_EXIT(zfsvfs);
4071		return (error);
4072	}
4073
4074	/*
4075	 * Create a new object for the symlink.
4076	 * For version 4 ZPL datasets the symlink will be an SA attribute.
4077	 */
4078	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
4079
4080	if (fuid_dirtied)
4081		zfs_fuid_sync(zfsvfs, tx);
4082
4083	if (zp->z_is_sa)
4084		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
4085		    link, len, tx);
4086	else
4087		zfs_sa_symlink(zp, link, len, tx);
4088
4089	zp->z_size = len;
4090	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
4091	    &zp->z_size, sizeof (zp->z_size), tx);
4092	/*
4093	 * Insert the new object into the directory.
4094	 */
4095	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
4096
4097	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
4098	*vpp = ZTOV(zp);
4099
4100	zfs_acl_ids_free(&acl_ids);
4101
4102	dmu_tx_commit(tx);
4103
4104	getnewvnode_drop_reserve();
4105
4106	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4107		zil_commit(zilog, 0);
4108
4109	ZFS_EXIT(zfsvfs);
4110	return (error);
4111}
4112
4113/*
4114 * Return, in the buffer contained in the provided uio structure,
4115 * the symbolic path referred to by vp.
4116 *
4117 *	IN:	vp	- vnode of symbolic link.
4118 *		uio	- structure to contain the link path.
4119 *		cr	- credentials of caller.
4120 *		ct	- caller context
4121 *
4122 *	OUT:	uio	- structure containing the link path.
4123 *
4124 *	RETURN:	0 on success, error code on failure.
4125 *
4126 * Timestamps:
4127 *	vp - atime updated
4128 */
4129/* ARGSUSED */
4130static int
4131zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
4132{
4133	znode_t		*zp = VTOZ(vp);
4134	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4135	int		error;
4136
4137	ZFS_ENTER(zfsvfs);
4138	ZFS_VERIFY_ZP(zp);
4139
4140	if (zp->z_is_sa)
4141		error = sa_lookup_uio(zp->z_sa_hdl,
4142		    SA_ZPL_SYMLINK(zfsvfs), uio);
4143	else
4144		error = zfs_sa_readlink(zp, uio);
4145
4146	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4147
4148	ZFS_EXIT(zfsvfs);
4149	return (error);
4150}
4151
4152/*
4153 * Insert a new entry into directory tdvp referencing svp.
4154 *
4155 *	IN:	tdvp	- Directory to contain new entry.
4156 *		svp	- vnode of new entry.
4157 *		name	- name of new entry.
4158 *		cr	- credentials of caller.
4159 *		ct	- caller context
4160 *
4161 *	RETURN:	0 on success, error code on failure.
4162 *
4163 * Timestamps:
4164 *	tdvp - ctime|mtime updated
4165 *	 svp - ctime updated
4166 */
4167/* ARGSUSED */
4168static int
4169zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
4170    caller_context_t *ct, int flags)
4171{
4172	znode_t		*dzp = VTOZ(tdvp);
4173	znode_t		*tzp, *szp;
4174	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
4175	zilog_t		*zilog;
4176	dmu_tx_t	*tx;
4177	int		error;
4178	uint64_t	parent;
4179	uid_t		owner;
4180
4181	ASSERT(tdvp->v_type == VDIR);
4182
4183	ZFS_ENTER(zfsvfs);
4184	ZFS_VERIFY_ZP(dzp);
4185	zilog = zfsvfs->z_log;
4186
4187	/*
4188	 * POSIX dictates that we return EPERM here.
4189	 * Better choices include ENOTSUP or EISDIR.
4190	 */
4191	if (svp->v_type == VDIR) {
4192		ZFS_EXIT(zfsvfs);
4193		return (SET_ERROR(EPERM));
4194	}
4195
4196	szp = VTOZ(svp);
4197	ZFS_VERIFY_ZP(szp);
4198
4199	if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) {
4200		ZFS_EXIT(zfsvfs);
4201		return (SET_ERROR(EPERM));
4202	}
4203
4204	/* Prevent links to .zfs/shares files */
4205
4206	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
4207	    &parent, sizeof (uint64_t))) != 0) {
4208		ZFS_EXIT(zfsvfs);
4209		return (error);
4210	}
4211	if (parent == zfsvfs->z_shares_dir) {
4212		ZFS_EXIT(zfsvfs);
4213		return (SET_ERROR(EPERM));
4214	}
4215
4216	if (zfsvfs->z_utf8 && u8_validate(name,
4217	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4218		ZFS_EXIT(zfsvfs);
4219		return (SET_ERROR(EILSEQ));
4220	}
4221
4222	/*
4223	 * We do not support links between attributes and non-attributes
4224	 * because of the potential security risk of creating links
4225	 * into "normal" file space in order to circumvent restrictions
4226	 * imposed in attribute space.
4227	 */
4228	if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
4229		ZFS_EXIT(zfsvfs);
4230		return (SET_ERROR(EINVAL));
4231	}
4232
4233
4234	owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
4235	if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) {
4236		ZFS_EXIT(zfsvfs);
4237		return (SET_ERROR(EPERM));
4238	}
4239
4240	if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) != 0) {
4241		ZFS_EXIT(zfsvfs);
4242		return (error);
4243	}
4244
4245	/*
4246	 * Attempt to lock directory; fail if entry already exists.
4247	 */
4248	error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW);
4249	if (error) {
4250		ZFS_EXIT(zfsvfs);
4251		return (error);
4252	}
4253
4254	tx = dmu_tx_create(zfsvfs->z_os);
4255	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
4256	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4257	zfs_sa_upgrade_txholds(tx, szp);
4258	zfs_sa_upgrade_txholds(tx, dzp);
4259	error = dmu_tx_assign(tx, TXG_WAIT);
4260	if (error) {
4261		dmu_tx_abort(tx);
4262		ZFS_EXIT(zfsvfs);
4263		return (error);
4264	}
4265
4266	error = zfs_link_create(dzp, name, szp, tx, 0);
4267
4268	if (error == 0) {
4269		uint64_t txtype = TX_LINK;
4270		zfs_log_link(zilog, tx, txtype, dzp, szp, name);
4271	}
4272
4273	dmu_tx_commit(tx);
4274
4275	if (error == 0) {
4276		vnevent_link(svp, ct);
4277	}
4278
4279	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4280		zil_commit(zilog, 0);
4281
4282	ZFS_EXIT(zfsvfs);
4283	return (error);
4284}
4285
4286
4287/*ARGSUSED*/
4288void
4289zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4290{
4291	znode_t	*zp = VTOZ(vp);
4292	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4293	int error;
4294
4295	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4296	if (zp->z_sa_hdl == NULL) {
4297		/*
4298		 * The fs has been unmounted, or we did a
4299		 * suspend/resume and this file no longer exists.
4300		 */
4301		rw_exit(&zfsvfs->z_teardown_inactive_lock);
4302		vrecycle(vp);
4303		return;
4304	}
4305
4306	if (zp->z_unlinked) {
4307		/*
4308		 * Fast path to recycle a vnode of a removed file.
4309		 */
4310		rw_exit(&zfsvfs->z_teardown_inactive_lock);
4311		vrecycle(vp);
4312		return;
4313	}
4314
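	/*
	 * If only the atime is dirty, push it out here in a small
	 * transaction of its own; losing the update on error is
	 * acceptable for atime.
	 */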
4315	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
4316		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
4317
4318		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4319		zfs_sa_upgrade_txholds(tx, zp);
4320		error = dmu_tx_assign(tx, TXG_WAIT);
4321		if (error) {
4322			dmu_tx_abort(tx);
4323		} else {
4324			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
4325			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
4326			zp->z_atime_dirty = 0;
4327			dmu_tx_commit(tx);
4328		}
4329	}
4330	rw_exit(&zfsvfs->z_teardown_inactive_lock);
4331}
4332
4333
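/*
 * Both ZFS file identifier layouts must fit into the generic fid
 * structure that the VFS hands to VOP_FID (e.g., for NFS export).
 */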
4334CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
4335CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
4336
4337/*ARGSUSED*/
4338static int
4339zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
4340{
4341	znode_t		*zp = VTOZ(vp);
4342	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4343	uint32_t	gen;
4344	uint64_t	gen64;
4345	uint64_t	object = zp->z_id;
4346	zfid_short_t	*zfid;
4347	int		size, i, error;
4348
4349	ZFS_ENTER(zfsvfs);
4350	ZFS_VERIFY_ZP(zp);
4351
4352	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
4353	    &gen64, sizeof (uint64_t))) != 0) {
4354		ZFS_EXIT(zfsvfs);
4355		return (error);
4356	}
4357
4358	gen = (uint32_t)gen64;
4359
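	/*
	 * Snapshots mounted under .zfs are not the parent file system,
	 * so they need the long fid, which additionally encodes the
	 * objset id.
	 */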
4360	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
4361
4362#ifdef illumos
4363	if (fidp->fid_len < size) {
4364		fidp->fid_len = size;
4365		ZFS_EXIT(zfsvfs);
4366		return (SET_ERROR(ENOSPC));
4367	}
4368#else
4369	fidp->fid_len = size;
4370#endif
4371
4372	zfid = (zfid_short_t *)fidp;
4373
4374	zfid->zf_len = size;
4375
4376	for (i = 0; i < sizeof (zfid->zf_object); i++)
4377		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
4378
4379	/* Must have a non-zero generation number to distinguish from .zfs */
4380	if (gen == 0)
4381		gen = 1;
4382	for (i = 0; i < sizeof (zfid->zf_gen); i++)
4383		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
4384
4385	if (size == LONG_FID_LEN) {
4386		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
4387		zfid_long_t	*zlfid;
4388
4389		zlfid = (zfid_long_t *)fidp;
4390
4391		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
4392			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
4393
4394		/* XXX - this should be the generation number for the objset */
4395		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
4396			zlfid->zf_setgen[i] = 0;
4397	}
4398
4399	ZFS_EXIT(zfsvfs);
4400	return (0);
4401}
4402
4403static int
4404zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
4405    caller_context_t *ct)
4406{
4407	znode_t		*zp, *xzp;
4408	zfsvfs_t	*zfsvfs;
4409	int		error;
4410
4411	switch (cmd) {
4412	case _PC_LINK_MAX:
4413		*valp = INT_MAX;
4414		return (0);
4415
4416	case _PC_FILESIZEBITS:
4417		*valp = 64;
4418		return (0);
4419#ifdef illumos
4420	case _PC_XATTR_EXISTS:
4421		zp = VTOZ(vp);
4422		zfsvfs = zp->z_zfsvfs;
4423		ZFS_ENTER(zfsvfs);
4424		ZFS_VERIFY_ZP(zp);
4425		*valp = 0;
4426		error = zfs_dirent_lookup(zp, "", &xzp,
4427		    ZXATTR | ZEXISTS | ZSHARED);
4428		if (error == 0) {
4429			if (!zfs_dirempty(xzp))
4430				*valp = 1;
4431			vrele(ZTOV(xzp));
4432		} else if (error == ENOENT) {
4433			/*
4434			 * If there aren't extended attributes, it's the
4435			 * same as having zero of them.
4436			 */
4437			error = 0;
4438		}
4439		ZFS_EXIT(zfsvfs);
4440		return (error);
4441
4442	case _PC_SATTR_ENABLED:
4443	case _PC_SATTR_EXISTS:
4444		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
4445		    (vp->v_type == VREG || vp->v_type == VDIR);
4446		return (0);
4447
4448	case _PC_ACCESS_FILTERING:
4449		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
4450		    vp->v_type == VDIR;
4451		return (0);
4452
4453	case _PC_ACL_ENABLED:
4454		*valp = _ACL_ACE_ENABLED;
4455		return (0);
4456#endif	/* illumos */
4457	case _PC_MIN_HOLE_SIZE:
4458		*valp = (int)SPA_MINBLOCKSIZE;
4459		return (0);
4460#ifdef illumos
4461	case _PC_TIMESTAMP_RESOLUTION:
4462		/* nanosecond timestamp resolution */
4463		*valp = 1L;
4464		return (0);
4465#endif
4466	case _PC_ACL_EXTENDED:
4467		*valp = 0;
4468		return (0);
4469
4470	case _PC_ACL_NFS4:
4471		*valp = 1;
4472		return (0);
4473
4474	case _PC_ACL_PATH_MAX:
4475		*valp = ACL_MAX_ENTRIES;
4476		return (0);
4477
4478	default:
4479		return (EOPNOTSUPP);
4480	}
4481}
4482
4483/*ARGSUSED*/
4484static int
4485zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4486    caller_context_t *ct)
4487{
4488	znode_t *zp = VTOZ(vp);
4489	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4490	int error;
4491	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4492
4493	ZFS_ENTER(zfsvfs);
4494	ZFS_VERIFY_ZP(zp);
4495	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
4496	ZFS_EXIT(zfsvfs);
4497
4498	return (error);
4499}
4500
4501/*ARGSUSED*/
4502int
4503zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4504    caller_context_t *ct)
4505{
4506	znode_t *zp = VTOZ(vp);
4507	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4508	int error;
4509	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4510	zilog_t	*zilog = zfsvfs->z_log;
4511
4512	ZFS_ENTER(zfsvfs);
4513	ZFS_VERIFY_ZP(zp);
4514
4515	error = zfs_setacl(zp, vsecp, skipaclchk, cr);
4516
4517	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4518		zil_commit(zilog, 0);
4519
4520	ZFS_EXIT(zfsvfs);
4521	return (error);
4522}
4523
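/*
 * Translate FreeBSD's vnode-level I/O flags (IO_*) into the
 * Solaris-style file flags (F*) that the shared ZFS code expects.
 */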
4524static int
4525ioflags(int ioflags)
4526{
4527	int flags = 0;
4528
4529	if (ioflags & IO_APPEND)
4530		flags |= FAPPEND;
4531	if (ioflags & IO_NDELAY)
4532		flags |= FNONBLOCK;
4533	if (ioflags & IO_SYNC)
4534		flags |= (FSYNC | FDSYNC | FRSYNC);
4535
4536	return (flags);
4537}
4538
4539static int
4540zfs_getpages(struct vnode *vp, vm_page_t *m, int count, int *rbehind,
4541    int *rahead)
4542{
4543	znode_t *zp = VTOZ(vp);
4544	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4545	objset_t *os = zp->z_zfsvfs->z_os;
4546	vm_page_t mlast;
4547	vm_object_t object;
4548	caddr_t va;
4549	struct sf_buf *sf;
4550	off_t startoff, endoff;
4551	int i, error;
4552	vm_pindex_t reqstart, reqend;
4553	int lsize, size;
4554
4555	object = m[0]->object;
4556	error = 0;
4557
4558	ZFS_ENTER(zfsvfs);
4559	ZFS_VERIFY_ZP(zp);
4560
4561	zfs_vmobject_wlock(object);
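	/*
	 * If the last page of the run is already valid, the pager does
	 * not need it read again; trim it from the request and bail
	 * out if it was the only page.
	 */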
4562	if (m[count - 1]->valid != 0 && --count == 0) {
4563		zfs_vmobject_wunlock(object);
4564		goto out;
4565	}
4566
4567	mlast = m[count - 1];
4568
4569	if (IDX_TO_OFF(mlast->pindex) >=
4570	    object->un_pager.vnp.vnp_size) {
4571		zfs_vmobject_wunlock(object);
4572		ZFS_EXIT(zfsvfs);
4573		return (zfs_vm_pagerret_bad);
4574	}
4575
4576	PCPU_INC(cnt.v_vnodein);
4577	PCPU_ADD(cnt.v_vnodepgsin, count);
4578
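	/*
	 * Clamp the size of the final page's read to the end of the
	 * file; the remainder of that page is zero-filled below.
	 */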
4579	lsize = PAGE_SIZE;
4580	if (IDX_TO_OFF(mlast->pindex) + lsize > object->un_pager.vnp.vnp_size)
4581		lsize = object->un_pager.vnp.vnp_size -
4582		    IDX_TO_OFF(mlast->pindex);
4583	zfs_vmobject_wunlock(object);
4584
4585	for (i = 0; i < count; i++) {
4586		size = PAGE_SIZE;
4587		if (i == count - 1)
4588			size = lsize;
4589		va = zfs_map_page(m[i], &sf);
4590		error = dmu_read(os, zp->z_id, IDX_TO_OFF(m[i]->pindex),
4591		    size, va, DMU_READ_PREFETCH);
4592		if (size != PAGE_SIZE)
4593			bzero(va + size, PAGE_SIZE - size);
4594		zfs_unmap_page(sf);
4595		if (error != 0)
4596			goto out;
4597	}
4598
4599	zfs_vmobject_wlock(object);
4600	for (i = 0; i < count; i++)
4601		m[i]->valid = VM_PAGE_BITS_ALL;
4602	zfs_vmobject_wunlock(object);
4603
4604out:
4605	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4606	ZFS_EXIT(zfsvfs);
4607	if (error == 0) {
4608		if (rbehind)
4609			*rbehind = 0;
4610		if (rahead)
4611			*rahead = 0;
4612		return (zfs_vm_pagerret_ok);
4613	} else
4614		return (zfs_vm_pagerret_error);
4615}
4616
4617static int
4618zfs_freebsd_getpages(ap)
4619	struct vop_getpages_args /* {
4620		struct vnode *a_vp;
4621		vm_page_t *a_m;
4622		int a_count;
4623		int *a_rbehind;
4624		int *a_rahead;
4625	} */ *ap;
4626{
4627
4628	return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind,
4629	    ap->a_rahead));
4630}
4631
4632static int
4633zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
4634    int *rtvals)
4635{
4636	znode_t		*zp = VTOZ(vp);
4637	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4638	rl_t		*rl;
4639	dmu_tx_t	*tx;
4640	struct sf_buf	*sf;
4641	vm_object_t	object;
4642	vm_page_t	m;
4643	caddr_t		va;
4644	size_t		tocopy;
4645	size_t		lo_len;
4646	vm_ooffset_t	lo_off;
4647	vm_ooffset_t	off;
4648	uint_t		blksz;
4649	int		ncount;
4650	int		pcount;
4651	int		err;
4652	int		i;
4653
4654	ZFS_ENTER(zfsvfs);
4655	ZFS_VERIFY_ZP(zp);
4656
4657	object = vp->v_object;
4658	pcount = btoc(len);
4659	ncount = pcount;
4660
4661	KASSERT(ma[0]->object == object, ("mismatching object"));
4662	KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length"));
4663
4664	for (i = 0; i < pcount; i++)
4665		rtvals[i] = zfs_vm_pagerret_error;
4666
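	/*
	 * Extend the range lock to full block boundaries so that the
	 * write cannot race with a concurrent change of the file's
	 * block size.
	 */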
4667	off = IDX_TO_OFF(ma[0]->pindex);
4668	blksz = zp->z_blksz;
4669	lo_off = rounddown(off, blksz);
4670	lo_len = roundup(len + (off - lo_off), blksz);
4671	rl = zfs_range_lock(zp, lo_off, lo_len, RL_WRITER);
4672
4673	zfs_vmobject_wlock(object);
4674	if (len + off > object->un_pager.vnp.vnp_size) {
4675		if (object->un_pager.vnp.vnp_size > off) {
4676			int pgoff;
4677
4678			len = object->un_pager.vnp.vnp_size - off;
4679			ncount = btoc(len);
4680			if ((pgoff = (int)len & PAGE_MASK) != 0) {
4681				/*
4682				 * If the object is locked and the following
4683				 * conditions hold, then the page's dirty
4684				 * field cannot be concurrently changed by a
4685				 * pmap operation.
4686				 */
4687				m = ma[ncount - 1];
4688				vm_page_assert_sbusied(m);
4689				KASSERT(!pmap_page_is_write_mapped(m),
4690				    ("zfs_putpages: page %p is not read-only", m));
4691				vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
4692				    pgoff);
4693			}
4694		} else {
4695			len = 0;
4696			ncount = 0;
4697		}
4698		if (ncount < pcount) {
4699			for (i = ncount; i < pcount; i++) {
4700				rtvals[i] = zfs_vm_pagerret_bad;
4701			}
4702		}
4703	}
4704	zfs_vmobject_wunlock(object);
4705
4706	if (ncount == 0)
4707		goto out;
4708
4709	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
4710	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
4711		goto out;
4712	}
4713
4714top:
4715	tx = dmu_tx_create(zfsvfs->z_os);
4716	dmu_tx_hold_write(tx, zp->z_id, off, len);
4717
4718	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4719	zfs_sa_upgrade_txholds(tx, zp);
4720	err = dmu_tx_assign(tx, TXG_NOWAIT);
4721	if (err != 0) {
4722		if (err == ERESTART) {
4723			dmu_tx_wait(tx);
4724			dmu_tx_abort(tx);
4725			goto top;
4726		}
4727		dmu_tx_abort(tx);
4728		goto out;
4729	}
4730
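	/*
	 * For block sizes smaller than a page, copy the data out
	 * through a temporary kernel mapping; dmu_write_pages() only
	 * handles files whose block size is at least a page.
	 */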
4731	if (zp->z_blksz < PAGE_SIZE) {
4733		for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
4734			tocopy = len > PAGE_SIZE ? PAGE_SIZE : len;
4735			va = zfs_map_page(ma[i], &sf);
4736			dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
4737			zfs_unmap_page(sf);
4738		}
4739	} else {
4740		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx);
4741	}
4742
4743	if (err == 0) {
4744		uint64_t mtime[2], ctime[2];
4745		sa_bulk_attr_t bulk[3];
4746		int count = 0;
4747
4748		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
4749		    &mtime, 16);
4750		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
4751		    &ctime, 16);
4752		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
4753		    &zp->z_pflags, 8);
4754		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
4755		    B_TRUE);
4756		(void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
4757		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
4758
4759		zfs_vmobject_wlock(object);
4760		for (i = 0; i < ncount; i++) {
4761			rtvals[i] = zfs_vm_pagerret_ok;
4762			vm_page_undirty(ma[i]);
4763		}
4764		zfs_vmobject_wunlock(object);
4765		PCPU_INC(cnt.v_vnodeout);
4766		PCPU_ADD(cnt.v_vnodepgsout, ncount);
4767	}
4768	dmu_tx_commit(tx);
4769
4770out:
4771	zfs_range_unlock(rl);
4772	if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 ||
4773	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4774		zil_commit(zfsvfs->z_log, zp->z_id);
4775	ZFS_EXIT(zfsvfs);
4776	return (rtvals[0]);
4777}
4778
4779int
4780zfs_freebsd_putpages(ap)
4781	struct vop_putpages_args /* {
4782		struct vnode *a_vp;
4783		vm_page_t *a_m;
4784		int a_count;
4785		int a_sync;
4786		int *a_rtvals;
4787	} */ *ap;
4788{
4789
4790	return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync,
4791	    ap->a_rtvals));
4792}
4793
4794static int
4795zfs_freebsd_bmap(ap)
4796	struct vop_bmap_args /* {
4797		struct vnode *a_vp;
4798		daddr_t  a_bn;
4799		struct bufobj **a_bop;
4800		daddr_t *a_bnp;
4801		int *a_runp;
4802		int *a_runb;
4803	} */ *ap;
4804{
4805
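	/*
	 * ZFS does not use the buffer cache, so this is only a stub:
	 * report an identity mapping with no read-ahead or read-behind
	 * clustering for the few callers that probe VOP_BMAP.
	 */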
4806	if (ap->a_bop != NULL)
4807		*ap->a_bop = &ap->a_vp->v_bufobj;
4808	if (ap->a_bnp != NULL)
4809		*ap->a_bnp = ap->a_bn;
4810	if (ap->a_runp != NULL)
4811		*ap->a_runp = 0;
4812	if (ap->a_runb != NULL)
4813		*ap->a_runb = 0;
4814
4815	return (0);
4816}
4817
4818static int
4819zfs_freebsd_open(ap)
4820	struct vop_open_args /* {
4821		struct vnode *a_vp;
4822		int a_mode;
4823		struct ucred *a_cred;
4824		struct thread *a_td;
4825	} */ *ap;
4826{
4827	vnode_t	*vp = ap->a_vp;
4828	znode_t *zp = VTOZ(vp);
4829	int error;
4830
4831	error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
4832	if (error == 0)
4833		vnode_create_vobject(vp, zp->z_size, ap->a_td);
4834	return (error);
4835}
4836
4837static int
4838zfs_freebsd_close(ap)
4839	struct vop_close_args /* {
4840		struct vnode *a_vp;
4841		int  a_fflag;
4842		struct ucred *a_cred;
4843		struct thread *a_td;
4844	} */ *ap;
4845{
4846
4847	return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL));
4848}
4849
4850static int
4851zfs_freebsd_ioctl(ap)
4852	struct vop_ioctl_args /* {
4853		struct vnode *a_vp;
4854		u_long a_command;
4855		caddr_t a_data;
4856		int a_fflag;
4857		struct ucred *cred;
4858		struct thread *td;
4859	} */ *ap;
4860{
4861
4862	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
4863	    ap->a_fflag, ap->a_cred, NULL, NULL));
4864}
4865
4866static int
4867zfs_freebsd_read(ap)
4868	struct vop_read_args /* {
4869		struct vnode *a_vp;
4870		struct uio *a_uio;
4871		int a_ioflag;
4872		struct ucred *a_cred;
4873	} */ *ap;
4874{
4875
4876	return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
4877	    ap->a_cred, NULL));
4878}
4879
4880static int
4881zfs_freebsd_write(ap)
4882	struct vop_write_args /* {
4883		struct vnode *a_vp;
4884		struct uio *a_uio;
4885		int a_ioflag;
4886		struct ucred *a_cred;
4887	} */ *ap;
4888{
4889
4890	return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
4891	    ap->a_cred, NULL));
4892}
4893
4894static int
4895zfs_freebsd_access(ap)
4896	struct vop_access_args /* {
4897		struct vnode *a_vp;
4898		accmode_t a_accmode;
4899		struct ucred *a_cred;
4900		struct thread *a_td;
4901	} */ *ap;
4902{
4903	vnode_t *vp = ap->a_vp;
4904	znode_t *zp = VTOZ(vp);
4905	accmode_t accmode;
4906	int error = 0;
4907
4908	/*
4909	 * ZFS itself only knows about VREAD, VWRITE, VEXEC and VAPPEND.
4910	 */
4911	accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
4912	if (accmode != 0)
4913		error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL);
4914
4915	/*
4916	 * VADMIN has to be handled by vaccess().
4917	 */
4918	if (error == 0) {
4919		accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
4920		if (accmode != 0) {
4921			error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
4922			    zp->z_gid, accmode, ap->a_cred, NULL);
4923		}
4924	}
4925
4926	/*
4927	 * For VEXEC, ensure that at least one execute bit is set for
4928	 * non-directories.
4929	 */
4930	if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
4931	    (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
4932		error = EACCES;
4933	}
4934
4935	return (error);
4936}
4937
4938static int
4939zfs_freebsd_lookup(ap)
4940	struct vop_lookup_args /* {
4941		struct vnode *a_dvp;
4942		struct vnode **a_vpp;
4943		struct componentname *a_cnp;
4944	} */ *ap;
4945{
4946	struct componentname *cnp = ap->a_cnp;
4947	char nm[NAME_MAX + 1];
4948
4949	ASSERT(cnp->cn_namelen < sizeof(nm));
4950	strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
4951
4952	return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
4953	    cnp->cn_cred, cnp->cn_thread, 0));
4954}
4955
4956static int
4957zfs_cache_lookup(ap)
4958	struct vop_lookup_args /* {
4959		struct vnode *a_dvp;
4960		struct vnode **a_vpp;
4961		struct componentname *a_cnp;
4962	} */ *ap;
4963{
4964	zfsvfs_t *zfsvfs;
4965
4966	zfsvfs = ap->a_dvp->v_mount->mnt_data;
4967	if (zfsvfs->z_use_namecache)
4968		return (vfs_cache_lookup(ap));
4969	else
4970		return (zfs_freebsd_lookup(ap));
4971}
4972
4973static int
4974zfs_freebsd_create(ap)
4975	struct vop_create_args /* {
4976		struct vnode *a_dvp;
4977		struct vnode **a_vpp;
4978		struct componentname *a_cnp;
4979		struct vattr *a_vap;
4980	} */ *ap;
4981{
4982	zfsvfs_t *zfsvfs;
4983	struct componentname *cnp = ap->a_cnp;
4984	vattr_t *vap = ap->a_vap;
4985	int error, mode;
4986
4987	ASSERT(cnp->cn_flags & SAVENAME);
4988
4989	vattr_init_mask(vap);
4990	mode = vap->va_mode & ALLPERMS;
4991	zfsvfs = ap->a_dvp->v_mount->mnt_data;
4992
4993	error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
4994	    ap->a_vpp, cnp->cn_cred, cnp->cn_thread);
4995	if (zfsvfs->z_use_namecache &&
4996	    error == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
4997		cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
4998	return (error);
4999}
5000
5001static int
5002zfs_freebsd_remove(ap)
5003	struct vop_remove_args /* {
5004		struct vnode *a_dvp;
5005		struct vnode *a_vp;
5006		struct componentname *a_cnp;
5007	} */ *ap;
5008{
5009
5010	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
5011
5012	return (zfs_remove(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
5013	    ap->a_cnp->cn_cred));
5014}
5015
5016static int
5017zfs_freebsd_mkdir(ap)
5018	struct vop_mkdir_args /* {
5019		struct vnode *a_dvp;
5020		struct vnode **a_vpp;
5021		struct componentname *a_cnp;
5022		struct vattr *a_vap;
5023	} */ *ap;
5024{
5025	vattr_t *vap = ap->a_vap;
5026
5027	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
5028
5029	vattr_init_mask(vap);
5030
5031	return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
5032	    ap->a_cnp->cn_cred));
5033}
5034
5035static int
5036zfs_freebsd_rmdir(ap)
5037	struct vop_rmdir_args /* {
5038		struct vnode *a_dvp;
5039		struct vnode *a_vp;
5040		struct componentname *a_cnp;
5041	} */ *ap;
5042{
5043	struct componentname *cnp = ap->a_cnp;
5044
5045	ASSERT(cnp->cn_flags & SAVENAME);
5046
5047	return (zfs_rmdir(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
5048}
5049
5050static int
5051zfs_freebsd_readdir(ap)
5052	struct vop_readdir_args /* {
5053		struct vnode *a_vp;
5054		struct uio *a_uio;
5055		struct ucred *a_cred;
5056		int *a_eofflag;
5057		int *a_ncookies;
5058		u_long **a_cookies;
5059	} */ *ap;
5060{
5061
5062	return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
5063	    ap->a_ncookies, ap->a_cookies));
5064}
5065
5066static int
5067zfs_freebsd_fsync(ap)
5068	struct vop_fsync_args /* {
5069		struct vnode *a_vp;
5070		int a_waitfor;
5071		struct thread *a_td;
5072	} */ *ap;
5073{
5074
5075	vop_stdfsync(ap);
5076	return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL));
5077}
5078
5079static int
5080zfs_freebsd_getattr(ap)
5081	struct vop_getattr_args /* {
5082		struct vnode *a_vp;
5083		struct vattr *a_vap;
5084		struct ucred *a_cred;
5085	} */ *ap;
5086{
5087	vattr_t *vap = ap->a_vap;
5088	xvattr_t xvap;
5089	u_long fflags = 0;
5090	int error;
5091
5092	xva_init(&xvap);
5093	xvap.xva_vattr = *vap;
5094	xvap.xva_vattr.va_mask |= AT_XVATTR;
5095
5096	/* Convert chflags into ZFS-type flags. */
5097	/* XXX: what about SF_SETTABLE? */
5098	XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
5099	XVA_SET_REQ(&xvap, XAT_APPENDONLY);
5100	XVA_SET_REQ(&xvap, XAT_NOUNLINK);
5101	XVA_SET_REQ(&xvap, XAT_NODUMP);
5102	XVA_SET_REQ(&xvap, XAT_READONLY);
5103	XVA_SET_REQ(&xvap, XAT_ARCHIVE);
5104	XVA_SET_REQ(&xvap, XAT_SYSTEM);
5105	XVA_SET_REQ(&xvap, XAT_HIDDEN);
5106	XVA_SET_REQ(&xvap, XAT_REPARSE);
5107	XVA_SET_REQ(&xvap, XAT_OFFLINE);
5108	XVA_SET_REQ(&xvap, XAT_SPARSE);
5109
5110	error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
5111	if (error != 0)
5112		return (error);
5113
5114	/* Convert ZFS xattr into chflags. */
5115#define	FLAG_CHECK(fflag, xflag, xfield)	do {			\
5116	if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0)		\
5117		fflags |= (fflag);					\
5118} while (0)
5119	FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
5120	    xvap.xva_xoptattrs.xoa_immutable);
5121	FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
5122	    xvap.xva_xoptattrs.xoa_appendonly);
5123	FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
5124	    xvap.xva_xoptattrs.xoa_nounlink);
5125	FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE,
5126	    xvap.xva_xoptattrs.xoa_archive);
5127	FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
5128	    xvap.xva_xoptattrs.xoa_nodump);
5129	FLAG_CHECK(UF_READONLY, XAT_READONLY,
5130	    xvap.xva_xoptattrs.xoa_readonly);
5131	FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM,
5132	    xvap.xva_xoptattrs.xoa_system);
5133	FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN,
5134	    xvap.xva_xoptattrs.xoa_hidden);
5135	FLAG_CHECK(UF_REPARSE, XAT_REPARSE,
5136	    xvap.xva_xoptattrs.xoa_reparse);
5137	FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE,
5138	    xvap.xva_xoptattrs.xoa_offline);
5139	FLAG_CHECK(UF_SPARSE, XAT_SPARSE,
5140	    xvap.xva_xoptattrs.xoa_sparse);
5141
5142#undef	FLAG_CHECK
5143	*vap = xvap.xva_vattr;
5144	vap->va_flags = fflags;
5145	return (0);
5146}
5147
5148static int
5149zfs_freebsd_setattr(ap)
5150	struct vop_setattr_args /* {
5151		struct vnode *a_vp;
5152		struct vattr *a_vap;
5153		struct ucred *a_cred;
5154	} */ *ap;
5155{
5156	vnode_t *vp = ap->a_vp;
5157	vattr_t *vap = ap->a_vap;
5158	cred_t *cred = ap->a_cred;
5159	xvattr_t xvap;
5160	u_long fflags;
5161	uint64_t zflags;
5162
5163	vattr_init_mask(vap);
5164	vap->va_mask &= ~AT_NOSET;
5165
5166	xva_init(&xvap);
5167	xvap.xva_vattr = *vap;
5168
5169	zflags = VTOZ(vp)->z_pflags;
5170
5171	if (vap->va_flags != VNOVAL) {
5172		zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
5173		int error;
5174
5175		if (zfsvfs->z_use_fuids == B_FALSE)
5176			return (EOPNOTSUPP);
5177
5178		fflags = vap->va_flags;
5179		/*
5180		 * XXX KDM
5181		 * We need to figure out whether it makes sense to allow
5182		 * UF_REPARSE through, since we don't really have other
5183		 * facilities to handle reparse points and zfs_setattr()
5184		 * doesn't currently allow setting that attribute anyway.
5185		 */
5186		if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE|
5187		     UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE|
5188		     UF_OFFLINE|UF_SPARSE)) != 0)
5189			return (EOPNOTSUPP);
5190		/*
5191		 * Unprivileged processes are not permitted to unset system
5192		 * flags, or modify flags if any system flags are set.
5193		 * Privileged non-jail processes may not modify system flags
5194		 * if securelevel > 0 and any existing system flags are set.
5195		 * Privileged jail processes behave like privileged non-jail
5196	 * processes if the security.jail.chflags_allowed sysctl is
5197	 * non-zero; otherwise, they behave like unprivileged
5198		 * processes.
5199		 */
5200		if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
5201		    priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) {
5202			if (zflags &
5203			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
5204				error = securelevel_gt(cred, 0);
5205				if (error != 0)
5206					return (error);
5207			}
5208		} else {
5209			/*
5210			 * Callers may only modify the file flags on objects they
5211			 * have VADMIN rights for.
5212			 */
5213			if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0)
5214				return (error);
5215			if (zflags &
5216			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
5217				return (EPERM);
5218			}
5219			if (fflags &
5220			    (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
5221				return (EPERM);
5222			}
5223		}
5224
5225#define	FLAG_CHANGE(fflag, zflag, xflag, xfield)	do {		\
5226	if (((fflags & (fflag)) && !(zflags & (zflag))) ||		\
5227	    ((zflags & (zflag)) && !(fflags & (fflag)))) {		\
5228		XVA_SET_REQ(&xvap, (xflag));				\
5229		(xfield) = ((fflags & (fflag)) != 0);			\
5230	}								\
5231} while (0)
5232		/* Convert chflags into ZFS-type flags. */
5233		/* XXX: what about SF_SETTABLE?. */
5234		/* XXX: what about SF_SETTABLE? */
5235		    xvap.xva_xoptattrs.xoa_immutable);
5236		FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
5237		    xvap.xva_xoptattrs.xoa_appendonly);
5238		FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
5239		    xvap.xva_xoptattrs.xoa_nounlink);
5240		FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE,
5241		    xvap.xva_xoptattrs.xoa_archive);
5242		FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
5243		    xvap.xva_xoptattrs.xoa_nodump);
5244		FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY,
5245		    xvap.xva_xoptattrs.xoa_readonly);
5246		FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM,
5247		    xvap.xva_xoptattrs.xoa_system);
5248		FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN,
5249		    xvap.xva_xoptattrs.xoa_hidden);
5250		FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE,
5251		    xvap.xva_xoptattrs.xoa_reparse);
5252		FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE,
5253		    xvap.xva_xoptattrs.xoa_offline);
5254		FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE,
5255		    xvap.xva_xoptattrs.xoa_sparse);
5256#undef	FLAG_CHANGE
5257	}
5258	return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL));
5259}
5260
5261static int
5262zfs_freebsd_rename(ap)
5263	struct vop_rename_args  /* {
5264		struct vnode *a_fdvp;
5265		struct vnode *a_fvp;
5266		struct componentname *a_fcnp;
5267		struct vnode *a_tdvp;
5268		struct vnode *a_tvp;
5269		struct componentname *a_tcnp;
5270	} */ *ap;
5271{
5272	vnode_t *fdvp = ap->a_fdvp;
5273	vnode_t *fvp = ap->a_fvp;
5274	vnode_t *tdvp = ap->a_tdvp;
5275	vnode_t *tvp = ap->a_tvp;
5276	int error;
5277
5278	ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
5279	ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
5280
5281	error = zfs_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
5282	    ap->a_tcnp, ap->a_fcnp->cn_cred);
5283
5284	vrele(fdvp);
5285	vrele(fvp);
5286	vrele(tdvp);
5287	if (tvp != NULL)
5288		vrele(tvp);
5289
5290	return (error);
5291}
5292
5293static int
5294zfs_freebsd_symlink(ap)
5295	struct vop_symlink_args /* {
5296		struct vnode *a_dvp;
5297		struct vnode **a_vpp;
5298		struct componentname *a_cnp;
5299		struct vattr *a_vap;
5300		char *a_target;
5301	} */ *ap;
5302{
5303	struct componentname *cnp = ap->a_cnp;
5304	vattr_t *vap = ap->a_vap;
5305
5306	ASSERT(cnp->cn_flags & SAVENAME);
5307
5308	vap->va_type = VLNK;	/* FreeBSD: Syscall only sets va_mode. */
5309	vattr_init_mask(vap);
5310
5311	return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
5312	    ap->a_target, cnp->cn_cred, cnp->cn_thread));
5313}
5314
5315static int
5316zfs_freebsd_readlink(ap)
5317	struct vop_readlink_args /* {
5318		struct vnode *a_vp;
5319		struct uio *a_uio;
5320		struct ucred *a_cred;
5321	} */ *ap;
5322{
5323
5324	return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
5325}
5326
5327static int
5328zfs_freebsd_link(ap)
5329	struct vop_link_args /* {
5330		struct vnode *a_tdvp;
5331		struct vnode *a_vp;
5332		struct componentname *a_cnp;
5333	} */ *ap;
5334{
5335	struct componentname *cnp = ap->a_cnp;
5336	vnode_t *vp = ap->a_vp;
5337	vnode_t *tdvp = ap->a_tdvp;
5338
5339	if (tdvp->v_mount != vp->v_mount)
5340		return (EXDEV);
5341
5342	ASSERT(cnp->cn_flags & SAVENAME);
5343
5344	return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
5345}
5346
5347static int
5348zfs_freebsd_inactive(ap)
5349	struct vop_inactive_args /* {
5350		struct vnode *a_vp;
5351		struct thread *a_td;
5352	} */ *ap;
5353{
5354	vnode_t *vp = ap->a_vp;
5355
5356	zfs_inactive(vp, ap->a_td->td_ucred, NULL);
5357	return (0);
5358}
5359
5360static int
5361zfs_freebsd_reclaim(ap)
5362	struct vop_reclaim_args /* {
5363		struct vnode *a_vp;
5364		struct thread *a_td;
5365	} */ *ap;
5366{
5367	vnode_t	*vp = ap->a_vp;
5368	znode_t	*zp = VTOZ(vp);
5369	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5370
5371	ASSERT(zp != NULL);
5372
5373	/* Destroy the vm object and flush associated pages. */
5374	vnode_destroy_vobject(vp);
5375
5376	/*
5377	 * z_teardown_inactive_lock protects from a race with
5378	 * zfs_znode_dmu_fini in zfsvfs_teardown during
5379	 * force unmount.
5380	 */
5381	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
5382	if (zp->z_sa_hdl == NULL)
5383		zfs_znode_free(zp);
5384	else
5385		zfs_zinactive(zp);
5386	rw_exit(&zfsvfs->z_teardown_inactive_lock);
5387
5388	vp->v_data = NULL;
5389	return (0);
5390}
5391
5392static int
5393zfs_freebsd_fid(ap)
5394	struct vop_fid_args /* {
5395		struct vnode *a_vp;
5396		struct fid *a_fid;
5397	} */ *ap;
5398{
5399
5400	return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
5401}
5402
5403static int
5404zfs_freebsd_pathconf(ap)
5405	struct vop_pathconf_args /* {
5406		struct vnode *a_vp;
5407		int a_name;
5408		register_t *a_retval;
5409	} */ *ap;
5410{
5411	ulong_t val;
5412	int error;
5413
5414	error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL);
5415	if (error == 0)
5416		*ap->a_retval = val;
5417	else if (error == EOPNOTSUPP)
5418		error = vop_stdpathconf(ap);
5419	return (error);
5420}
5421
5422static int
5423zfs_freebsd_fifo_pathconf(ap)
5424	struct vop_pathconf_args /* {
5425		struct vnode *a_vp;
5426		int a_name;
5427		register_t *a_retval;
5428	} */ *ap;
5429{
5430
5431	switch (ap->a_name) {
5432	case _PC_ACL_EXTENDED:
5433	case _PC_ACL_NFS4:
5434	case _PC_ACL_PATH_MAX:
5435	case _PC_MAC_PRESENT:
5436		return (zfs_freebsd_pathconf(ap));
5437	default:
5438		return (fifo_specops.vop_pathconf(ap));
5439	}
5440}
5441
5442/*
5443 * FreeBSD's extended attribute namespaces are mapped onto file name prefixes
5444 * for ZFS extended attribute names:
5445 *
5446 *	NAMESPACE	PREFIX
5447 *	system		freebsd:system:
5448 *	user		(none, can be used to access ZFS fsattr(5) attributes
5449 *			created on Solaris)
5450 */
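/*
 * For example (illustrative attribute names, assuming a large enough
 * buffer):
 *
 *	zfs_create_attrname(EXTATTR_NAMESPACE_SYSTEM, "md5", buf, size)
 *		=> "freebsd:system:md5"
 *	zfs_create_attrname(EXTATTR_NAMESPACE_USER, "md5", buf, size)
 *		=> "md5"
 */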
5451static int
5452zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
5453    size_t size)
5454{
5455	const char *namespace, *prefix, *suffix;
5456
5457	/* We don't allow the '/' character in attribute names. */
5458	if (strchr(name, '/') != NULL)
5459		return (EINVAL);
5460	/* We don't allow attribute names that start with the "freebsd:" prefix. */
5461	if (strncmp(name, "freebsd:", 8) == 0)
5462		return (EINVAL);
5463
5464	bzero(attrname, size);
5465
5466	switch (attrnamespace) {
5467	case EXTATTR_NAMESPACE_USER:
5468#if 0
5469		prefix = "freebsd:";
5470		namespace = EXTATTR_NAMESPACE_USER_STRING;
5471		suffix = ":";
5472#else
5473		/*
5474		 * This is the default namespace by which we can access all
5475		 * attributes created on Solaris.
5476		 */
5477		prefix = namespace = suffix = "";
5478#endif
5479		break;
5480	case EXTATTR_NAMESPACE_SYSTEM:
5481		prefix = "freebsd:";
5482		namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
5483		suffix = ":";
5484		break;
5485	case EXTATTR_NAMESPACE_EMPTY:
5486	default:
5487		return (EINVAL);
5488	}
5489	if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
5490	    name) >= size) {
5491		return (ENAMETOOLONG);
5492	}
5493	return (0);
5494}
5495
5496/*
5497 * Vnode operation to retrieve a named extended attribute.
5498 */
5499static int
5500zfs_getextattr(struct vop_getextattr_args *ap)
5501/*
5502vop_getextattr {
5503	IN struct vnode *a_vp;
5504	IN int a_attrnamespace;
5505	IN const char *a_name;
5506	INOUT struct uio *a_uio;
5507	OUT size_t *a_size;
5508	IN struct ucred *a_cred;
5509	IN struct thread *a_td;
5510};
5511*/
5512{
5513	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5514	struct thread *td = ap->a_td;
5515	struct nameidata nd;
5516	char attrname[255];
5517	struct vattr va;
5518	vnode_t *xvp = NULL, *vp;
5519	int error, flags;
5520
5521	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5522	    ap->a_cred, ap->a_td, VREAD);
5523	if (error != 0)
5524		return (error);
5525
5526	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5527	    sizeof(attrname));
5528	if (error != 0)
5529		return (error);
5530
5531	ZFS_ENTER(zfsvfs);
5532
5533	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5534	    LOOKUP_XATTR);
5535	if (error != 0) {
5536		ZFS_EXIT(zfsvfs);
5537		return (error);
5538	}
5539
5540	flags = FREAD;
5541	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
5542	    xvp, td);
5543	error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL);
5544	vp = nd.ni_vp;
5545	NDFREE(&nd, NDF_ONLY_PNBUF);
5546	if (error != 0) {
5547		ZFS_EXIT(zfsvfs);
5548		if (error == ENOENT)
5549			error = ENOATTR;
5550		return (error);
5551	}
5552
5553	if (ap->a_size != NULL) {
5554		error = VOP_GETATTR(vp, &va, ap->a_cred);
5555		if (error == 0)
5556			*ap->a_size = (size_t)va.va_size;
5557	} else if (ap->a_uio != NULL)
5558		error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred);
5559
5560	VOP_UNLOCK(vp, 0);
5561	vn_close(vp, flags, ap->a_cred, td);
5562	ZFS_EXIT(zfsvfs);
5563
5564	return (error);
5565}
5566
5567/*
5568 * Vnode operation to remove a named attribute.
5569 */
5570int
5571zfs_deleteextattr(struct vop_deleteextattr_args *ap)
5572/*
5573vop_deleteextattr {
5574	IN struct vnode *a_vp;
5575	IN int a_attrnamespace;
5576	IN const char *a_name;
5577	IN struct ucred *a_cred;
5578	IN struct thread *a_td;
5579};
5580*/
5581{
5582	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5583	struct thread *td = ap->a_td;
5584	struct nameidata nd;
5585	char attrname[255];
5587	vnode_t *xvp = NULL, *vp;
5588	int error;
5589
5590	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5591	    ap->a_cred, ap->a_td, VWRITE);
5592	if (error != 0)
5593		return (error);
5594
5595	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5596	    sizeof(attrname));
5597	if (error != 0)
5598		return (error);
5599
5600	ZFS_ENTER(zfsvfs);
5601
5602	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5603	    LOOKUP_XATTR);
5604	if (error != 0) {
5605		ZFS_EXIT(zfsvfs);
5606		return (error);
5607	}
5608
5609	NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
5610	    UIO_SYSSPACE, attrname, xvp, td);
5611	error = namei(&nd);
5612	vp = nd.ni_vp;
5613	if (error != 0) {
5614		ZFS_EXIT(zfsvfs);
5615		NDFREE(&nd, NDF_ONLY_PNBUF);
5616		if (error == ENOENT)
5617			error = ENOATTR;
5618		return (error);
5619	}
5620
5621	error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
5622	NDFREE(&nd, NDF_ONLY_PNBUF);
5623
5624	vput(nd.ni_dvp);
5625	if (vp == nd.ni_dvp)
5626		vrele(vp);
5627	else
5628		vput(vp);
5629	ZFS_EXIT(zfsvfs);
5630
5631	return (error);
5632}
5633
5634/*
5635 * Vnode operation to set a named attribute.
5636 */
5637static int
5638zfs_setextattr(struct vop_setextattr_args *ap)
5639/*
5640vop_setextattr {
5641	IN struct vnode *a_vp;
5642	IN int a_attrnamespace;
5643	IN const char *a_name;
5644	INOUT struct uio *a_uio;
5645	IN struct ucred *a_cred;
5646	IN struct thread *a_td;
5647};
5648*/
5649{
5650	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5651	struct thread *td = ap->a_td;
5652	struct nameidata nd;
5653	char attrname[255];
5654	struct vattr va;
5655	vnode_t *xvp = NULL, *vp;
5656	int error, flags;
5657
5658	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5659	    ap->a_cred, ap->a_td, VWRITE);
5660	if (error != 0)
5661		return (error);
5662
5663	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5664	    sizeof(attrname));
5665	if (error != 0)
5666		return (error);
5667
5668	ZFS_ENTER(zfsvfs);
5669
5670	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5671	    LOOKUP_XATTR | CREATE_XATTR_DIR);
5672	if (error != 0) {
5673		ZFS_EXIT(zfsvfs);
5674		return (error);
5675	}
5676
5677	flags = FFLAGS(O_WRONLY | O_CREAT);
5678	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
5679	    xvp, td);
5680	error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL);
5681	vp = nd.ni_vp;
5682	NDFREE(&nd, NDF_ONLY_PNBUF);
5683	if (error != 0) {
5684		ZFS_EXIT(zfsvfs);
5685		return (error);
5686	}
5687
5688	VATTR_NULL(&va);
5689	va.va_size = 0;
5690	error = VOP_SETATTR(vp, &va, ap->a_cred);
5691	if (error == 0)
5692		error = VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred);
5693
5694	VOP_UNLOCK(vp, 0);
5695	vn_close(vp, flags, ap->a_cred, td);
5696	ZFS_EXIT(zfsvfs);
5697
5698	return (error);
5699}
5700
5701/*
5702 * Vnode operation to list extended attributes on a vnode.
5703 */
5704static int
5705zfs_listextattr(struct vop_listextattr_args *ap)
5706/*
5707vop_listextattr {
5708	IN struct vnode *a_vp;
5709	IN int a_attrnamespace;
5710	INOUT struct uio *a_uio;
5711	OUT size_t *a_size;
5712	IN struct ucred *a_cred;
5713	IN struct thread *a_td;
5714};
5715*/
5716{
5717	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5718	struct thread *td = ap->a_td;
5719	struct nameidata nd;
5720	char attrprefix[16];
5721	u_char dirbuf[sizeof(struct dirent)];
5722	struct dirent *dp;
5723	struct iovec aiov;
5724	struct uio auio, *uio = ap->a_uio;
5725	size_t *sizep = ap->a_size;
5726	size_t plen;
5727	vnode_t *xvp = NULL, *vp;
5728	int done, error, eof, pos;
5729
5730	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5731	    ap->a_cred, ap->a_td, VREAD);
5732	if (error != 0)
5733		return (error);
5734
5735	error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
5736	    sizeof(attrprefix));
5737	if (error != 0)
5738		return (error);
5739	plen = strlen(attrprefix);
5740
5741	ZFS_ENTER(zfsvfs);
5742
5743	if (sizep != NULL)
5744		*sizep = 0;
5745
5746	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5747	    LOOKUP_XATTR);
5748	if (error != 0) {
5749		ZFS_EXIT(zfsvfs);
5750		/*
5751		 * ENOATTR means that the EA directory does not yet exist,
5752		 * i.e. there are no extended attributes there.
5753		 */
5754		if (error == ENOATTR)
5755			error = 0;
5756		return (error);
5757	}
5758
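	/*
	 * Look up "." in the attribute directory so that we get the
	 * directory vnode back referenced and locked, as VOP_READDIR
	 * requires.
	 */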
5759	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
5760	    UIO_SYSSPACE, ".", xvp, td);
5761	error = namei(&nd);
5762	vp = nd.ni_vp;
5763	NDFREE(&nd, NDF_ONLY_PNBUF);
5764	if (error != 0) {
5765		ZFS_EXIT(zfsvfs);
5766		return (error);
5767	}
5768
5769	auio.uio_iov = &aiov;
5770	auio.uio_iovcnt = 1;
5771	auio.uio_segflg = UIO_SYSSPACE;
5772	auio.uio_td = td;
5773	auio.uio_rw = UIO_READ;
5774	auio.uio_offset = 0;
5775
5776	do {
5777		u_char nlen;
5778
5779		aiov.iov_base = (void *)dirbuf;
5780		aiov.iov_len = sizeof(dirbuf);
5781		auio.uio_resid = sizeof(dirbuf);
5782		error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
5783		done = sizeof(dirbuf) - auio.uio_resid;
5784		if (error != 0)
5785			break;
5786		for (pos = 0; pos < done;) {
5787			dp = (struct dirent *)(dirbuf + pos);
5788			pos += dp->d_reclen;
5789			/*
5790			 * XXX: Temporarily we also accept DT_UNKNOWN, as this
5791			 * is what we get when the attribute was created on Solaris.
5792			 */
5793			if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
5794				continue;
5795			if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0)
5796				continue;
5797			else if (strncmp(dp->d_name, attrprefix, plen) != 0)
5798				continue;
5799			nlen = dp->d_namlen - plen;
5800			if (sizep != NULL)
5801				*sizep += 1 + nlen;
5802			else if (uio != NULL) {
5803				/*
5804				 * Format of extattr name entry is one byte for
5805				 * length and the rest for name.
5806				 */
5807				error = uiomove(&nlen, 1, uio->uio_rw, uio);
5808				if (error == 0) {
5809					error = uiomove(dp->d_name + plen, nlen,
5810					    uio->uio_rw, uio);
5811				}
5812				if (error != 0)
5813					break;
5814			}
5815		}
5816	} while (!eof && error == 0);
5817
5818	vput(vp);
5819	ZFS_EXIT(zfsvfs);
5820
5821	return (error);
5822}
5823
5824int
5825zfs_freebsd_getacl(ap)
5826	struct vop_getacl_args /* {
5827		struct vnode *vp;
5828		acl_type_t type;
5829		struct acl *aclp;
5830		struct ucred *cred;
5831		struct thread *td;
5832	} */ *ap;
5833{
5834	int		error;
5835	vsecattr_t      vsecattr;
5836
5837	if (ap->a_type != ACL_TYPE_NFS4)
5838		return (EINVAL);
5839
5840	vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
5841	if ((error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL)) != 0)
5842		return (error);
5843
5844	error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt);
5845	if (vsecattr.vsa_aclentp != NULL)
5846		kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);
5847
5848	return (error);
5849}
5850
5851int
5852zfs_freebsd_setacl(ap)
5853	struct vop_setacl_args /* {
5854		struct vnode *vp;
5855		acl_type_t type;
5856		struct acl *aclp;
5857		struct ucred *cred;
5858		struct thread *td;
5859	} */ *ap;
5860{
5861	int		error;
5862	vsecattr_t      vsecattr;
5863	int		aclbsize;	/* size of acl list in bytes */
5864	aclent_t	*aaclp;
5865
5866	if (ap->a_type != ACL_TYPE_NFS4)
5867		return (EINVAL);
5868
5869	if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
5870		return (EINVAL);
5871
5872	/*
5873	 * With NFSv4 ACLs, chmod(2) may need to add additional entries,
5874	 * splitting every entry into two and appending "canonical six"
5875	 * entries at the end.  Don't allow setting an ACL that would
5876	 * cause chmod(2) to run out of ACL entries.
5877	 */
5878	if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
5879		return (ENOSPC);
5880
5881	error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
5882	if (error != 0)
5883		return (error);
5884
5885	vsecattr.vsa_mask = VSA_ACE;
5886	aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t);
5887	vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
5888	aaclp = vsecattr.vsa_aclentp;
5889	vsecattr.vsa_aclentsz = aclbsize;
5890
5891	aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
5892	error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL);
5893	kmem_free(aaclp, aclbsize);
5894
5895	return (error);
5896}
5897
5898int
5899zfs_freebsd_aclcheck(ap)
5900	struct vop_aclcheck_args /* {
5901		struct vnode *vp;
5902		acl_type_t type;
5903		struct acl *aclp;
5904		struct ucred *cred;
5905		struct thread *td;
5906	} */ *ap;
5907{
5908
5909	return (EOPNOTSUPP);
5910}
5911
5912static int
5913zfs_vptocnp(struct vop_vptocnp_args *ap)
5914{
5915	vnode_t *covered_vp;
5916	vnode_t *vp = ap->a_vp;
5917	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
5918	znode_t *zp = VTOZ(vp);
5919	uint64_t parent;
5920	int ltype;
5921	int error;
5922
5923	ZFS_ENTER(zfsvfs);
5924	ZFS_VERIFY_ZP(zp);
5925
5926	/*
5927	 * If we are a snapshot mounted under .zfs, run the operation
5928	 * on the covered vnode.
5929	 */
5930	if ((error = sa_lookup(zp->z_sa_hdl,
5931	    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) {
5932		ZFS_EXIT(zfsvfs);
5933		return (error);
5934	}
5935
5936	if (zp->z_id != parent || zfsvfs->z_parent == zfsvfs) {
5937		ZFS_EXIT(zfsvfs);
5938		return (vop_stdvptocnp(ap));
5939	}
5940	ZFS_EXIT(zfsvfs);
5941
5942	covered_vp = vp->v_mount->mnt_vnodecovered;
5943	vhold(covered_vp);
5944	ltype = VOP_ISLOCKED(vp);
5945	VOP_UNLOCK(vp, 0);
5946	error = vget(covered_vp, LK_EXCLUSIVE | LK_VNHELD, curthread);
5947	if (error == 0) {
5948		error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred,
5949		    ap->a_buf, ap->a_buflen);
5950		vput(covered_vp);
5951	}
5952	vn_lock(vp, ltype | LK_RETRY);
5953	if ((vp->v_iflag & VI_DOOMED) != 0)
5954		error = SET_ERROR(ENOENT);
5955	return (error);
5956}
5957
5958#ifdef DIAGNOSTIC
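/*
 * DIAGNOSTIC-only wrapper around vop_stdlock(): verify that the thread
 * is not holding z_teardown_lock while it acquires a vnode lock,
 * catching lock-order violations against forced unmount.
 */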
5959static int
5960zfs_lock(ap)
5961	struct vop_lock1_args /* {
5962		struct vnode *a_vp;
5963		int a_flags;
5964		char *file;
5965		int line;
5966	} */ *ap;
5967{
5968	zfsvfs_t *zfsvfs;
5969	znode_t *zp;
5970	vnode_t *vp;
5971	int flags;
5972	int err;
5973
5974	vp = ap->a_vp;
5975	flags = ap->a_flags;
5976	if ((flags & LK_INTERLOCK) == 0 && (flags & LK_NOWAIT) == 0 &&
5977	    (vp->v_iflag & VI_DOOMED) == 0 && (zp = vp->v_data) != NULL &&
5978	    (zp->z_pflags & ZFS_XATTR) == 0) {
5979		zfsvfs = zp->z_zfsvfs;
5980		VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
5981	}
5982	err = vop_stdlock(ap);
5983	if ((flags & LK_INTERLOCK) != 0 && (flags & LK_NOWAIT) == 0 &&
5984	    (vp->v_iflag & VI_DOOMED) == 0 && (zp = vp->v_data) != NULL &&
5985	    (zp->z_pflags & ZFS_XATTR) == 0) {
5986		zfsvfs = zp->z_zfsvfs;
5987		VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
5988	}
5989	return (err);
5990}
5991#endif
5992
5993struct vop_vector zfs_vnodeops;
5994struct vop_vector zfs_fifoops;
5995struct vop_vector zfs_shareops;
5996
5997struct vop_vector zfs_vnodeops = {
5998	.vop_default =		&default_vnodeops,
5999	.vop_inactive =		zfs_freebsd_inactive,
6000	.vop_reclaim =		zfs_freebsd_reclaim,
6001	.vop_access =		zfs_freebsd_access,
6002	.vop_lookup =		zfs_cache_lookup,
6003	.vop_cachedlookup =	zfs_freebsd_lookup,
6004	.vop_getattr =		zfs_freebsd_getattr,
6005	.vop_setattr =		zfs_freebsd_setattr,
6006	.vop_create =		zfs_freebsd_create,
6007	.vop_mknod =		zfs_freebsd_create,
6008	.vop_mkdir =		zfs_freebsd_mkdir,
6009	.vop_readdir =		zfs_freebsd_readdir,
6010	.vop_fsync =		zfs_freebsd_fsync,
6011	.vop_open =		zfs_freebsd_open,
6012	.vop_close =		zfs_freebsd_close,
6013	.vop_rmdir =		zfs_freebsd_rmdir,
6014	.vop_ioctl =		zfs_freebsd_ioctl,
6015	.vop_link =		zfs_freebsd_link,
6016	.vop_symlink =		zfs_freebsd_symlink,
6017	.vop_readlink =		zfs_freebsd_readlink,
6018	.vop_read =		zfs_freebsd_read,
6019	.vop_write =		zfs_freebsd_write,
6020	.vop_remove =		zfs_freebsd_remove,
6021	.vop_rename =		zfs_freebsd_rename,
6022	.vop_pathconf =		zfs_freebsd_pathconf,
6023	.vop_bmap =		zfs_freebsd_bmap,
6024	.vop_fid =		zfs_freebsd_fid,
6025	.vop_getextattr =	zfs_getextattr,
6026	.vop_deleteextattr =	zfs_deleteextattr,
6027	.vop_setextattr =	zfs_setextattr,
6028	.vop_listextattr =	zfs_listextattr,
6029	.vop_getacl =		zfs_freebsd_getacl,
6030	.vop_setacl =		zfs_freebsd_setacl,
6031	.vop_aclcheck =		zfs_freebsd_aclcheck,
6032	.vop_getpages =		zfs_freebsd_getpages,
6033	.vop_putpages =		zfs_freebsd_putpages,
6034	.vop_vptocnp =		zfs_vptocnp,
6035#ifdef DIAGNOSTIC
6036	.vop_lock1 =		zfs_lock,
6037#endif
6038};
6039
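/*
 * Vnode operations for fifos: actual I/O is serviced by the generic
 * fifo code via fifo_specops, while ZFS supplies attribute, ACL and
 * lifecycle operations.  vop_read/vop_write should be unreachable
 * here, hence VOP_PANIC.
 */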
6040struct vop_vector zfs_fifoops = {
6041	.vop_default =		&fifo_specops,
6042	.vop_fsync =		zfs_freebsd_fsync,
6043	.vop_access =		zfs_freebsd_access,
6044	.vop_getattr =		zfs_freebsd_getattr,
6045	.vop_inactive =		zfs_freebsd_inactive,
6046	.vop_read =		VOP_PANIC,
6047	.vop_reclaim =		zfs_freebsd_reclaim,
6048	.vop_setattr =		zfs_freebsd_setattr,
6049	.vop_write =		VOP_PANIC,
6050	.vop_pathconf =		zfs_freebsd_fifo_pathconf,
6051	.vop_fid =		zfs_freebsd_fid,
6052	.vop_getacl =		zfs_freebsd_getacl,
6053	.vop_setacl =		zfs_freebsd_setacl,
6054	.vop_aclcheck =		zfs_freebsd_aclcheck,
6055};
6056
6057/*
6058 * special share hidden files vnode operations template
6059 */
6060struct vop_vector zfs_shareops = {
6061	.vop_default =		&default_vnodeops,
6062	.vop_access =		zfs_freebsd_access,
6063	.vop_inactive =		zfs_freebsd_inactive,
6064	.vop_reclaim =		zfs_freebsd_reclaim,
6065	.vop_fid =		zfs_freebsd_fid,
6066	.vop_pathconf =		zfs_freebsd_pathconf,
6067};
6068