zfs_vnops.c revision 324204
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24 * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
25 * Copyright (c) 2014 Integros [integros.com]
26 */
27
28/* Portions Copyright 2007 Jeremy Teo */
29/* Portions Copyright 2010 Robert Milkowski */
30
31#include <sys/types.h>
32#include <sys/param.h>
33#include <sys/time.h>
34#include <sys/systm.h>
35#include <sys/sysmacros.h>
36#include <sys/resource.h>
37#include <sys/vfs.h>
38#include <sys/vm.h>
39#include <sys/vnode.h>
40#include <sys/file.h>
41#include <sys/stat.h>
42#include <sys/kmem.h>
43#include <sys/taskq.h>
44#include <sys/uio.h>
45#include <sys/atomic.h>
46#include <sys/namei.h>
47#include <sys/mman.h>
48#include <sys/cmn_err.h>
49#include <sys/errno.h>
50#include <sys/unistd.h>
51#include <sys/zfs_dir.h>
52#include <sys/zfs_ioctl.h>
53#include <sys/fs/zfs.h>
54#include <sys/dmu.h>
55#include <sys/dmu_objset.h>
56#include <sys/spa.h>
57#include <sys/txg.h>
58#include <sys/dbuf.h>
59#include <sys/zap.h>
60#include <sys/sa.h>
61#include <sys/dirent.h>
62#include <sys/policy.h>
63#include <sys/sunddi.h>
64#include <sys/filio.h>
65#include <sys/sid.h>
66#include <sys/zfs_ctldir.h>
67#include <sys/zfs_fuid.h>
68#include <sys/zfs_sa.h>
69#include <sys/zfs_rlock.h>
70#include <sys/extdirent.h>
71#include <sys/kidmap.h>
72#include <sys/bio.h>
73#include <sys/buf.h>
74#include <sys/sched.h>
75#include <sys/acl.h>
76#include <vm/vm_param.h>
77
78/*
79 * Programming rules.
80 *
81 * Each vnode op performs some logical unit of work.  To do this, the ZPL must
82 * properly lock its in-core state, create a DMU transaction, do the work,
83 * record this work in the intent log (ZIL), commit the DMU transaction,
84 * and wait for the intent log to commit if it is a synchronous operation.
85 * Moreover, the vnode ops must work in both normal and log replay context.
86 * The ordering of events is important to avoid deadlocks and references
87 * to freed memory.  The example below illustrates the following Big Rules:
88 *
89 *  (1)	A check must be made in each zfs thread for a mounted file system.
90 *	This is done avoiding races using ZFS_ENTER(zfsvfs).
91 *	A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
92 *	must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
93 *	can return EIO from the calling function.
94 *
95 *  (2)	VN_RELE() should always be the last thing except for zil_commit()
96 *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
97 *	First, if it's the last reference, the vnode/znode
98 *	can be freed, so the zp may point to freed memory.  Second, the last
99 *	reference will call zfs_zinactive(), which may induce a lot of work --
100 *	pushing cached pages (which acquires range locks) and syncing out
101 *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
102 *	which could deadlock the system if you were already holding one.
103 *	If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
104 *
105 *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
106 *	as they can span dmu_tx_assign() calls.
107 *
108 *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
109 *      dmu_tx_assign().  This is critical because we don't want to block
110 *      while holding locks.
111 *
112 *	If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT.  This
113 *	reduces lock contention and CPU usage when we must wait (note that if
114 *	throughput is constrained by the storage, nearly every transaction
115 *	must wait).
116 *
117 *      Note, in particular, that if a lock is sometimes acquired before
118 *      the tx assigns, and sometimes after (e.g. z_lock), then failing
119 *      to use a non-blocking assign can deadlock the system.  The scenario:
120 *
121 *	Thread A has grabbed a lock before calling dmu_tx_assign().
122 *	Thread B is in an already-assigned tx, and blocks for this lock.
123 *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
124 *	forever, because the previous txg can't quiesce until B's tx commits.
125 *
126 *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
127 *	then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
128 *	calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT,
129 *	to indicate that this operation has already called dmu_tx_wait().
130 *	This will ensure that we don't retry forever, waiting a short bit
131 *	each time.
132 *
133 *  (5)	If the operation succeeded, generate the intent log entry for it
134 *	before dropping locks.  This ensures that the ordering of events
135 *	in the intent log matches the order in which they actually occurred.
136 *	During ZIL replay the zfs_log_* functions will update the sequence
137 *	number to indicate the zil transaction has replayed.
138 *
139 *  (6)	At the end of each vnode op, the DMU tx must always commit,
140 *	regardless of whether there were any errors.
141 *
142 *  (7)	After dropping all locks, invoke zil_commit(zilog, foid)
143 *	to ensure that synchronous semantics are provided when necessary.
144 *
145 * In general, this is how things should be ordered in each vnode op:
146 *
147 *	ZFS_ENTER(zfsvfs);		// exit if unmounted
148 * top:
149 *	zfs_dirent_lookup(&dl, ...)	// lock directory entry (may VN_HOLD())
150 *	rw_enter(...);			// grab any other locks you need
151 *	tx = dmu_tx_create(...);	// get DMU tx
152 *	dmu_tx_hold_*();		// hold each object you might modify
153 *	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
154 *	if (error) {
155 *		rw_exit(...);		// drop locks
156 *		zfs_dirent_unlock(dl);	// unlock directory entry
157 *		VN_RELE(...);		// release held vnodes
158 *		if (error == ERESTART) {
159 *			waited = B_TRUE;
160 *			dmu_tx_wait(tx);
161 *			dmu_tx_abort(tx);
162 *			goto top;
163 *		}
164 *		dmu_tx_abort(tx);	// abort DMU tx
165 *		ZFS_EXIT(zfsvfs);	// finished in zfs
166 *		return (error);		// really out of space
167 *	}
168 *	error = do_real_work();		// do whatever this VOP does
169 *	if (error == 0)
170 *		zfs_log_*(...);		// on success, make ZIL entry
171 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
172 *	rw_exit(...);			// drop locks
173 *	zfs_dirent_unlock(dl);		// unlock directory entry
174 *	VN_RELE(...);			// release held vnodes
175 *	zil_commit(zilog, foid);	// synchronous when necessary
176 *	ZFS_EXIT(zfsvfs);		// finished in zfs
177 *	return (error);			// done, report error
178 */
179
180/* ARGSUSED */
181static int
182zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
183{
184	znode_t	*zp = VTOZ(*vpp);
185	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
186
187	ZFS_ENTER(zfsvfs);
188	ZFS_VERIFY_ZP(zp);
189
190	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
191	    ((flag & FAPPEND) == 0)) {
192		ZFS_EXIT(zfsvfs);
193		return (SET_ERROR(EPERM));
194	}
195
196	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
197	    ZTOV(zp)->v_type == VREG &&
198	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
199		if (fs_vscan(*vpp, cr, 0) != 0) {
200			ZFS_EXIT(zfsvfs);
201			return (SET_ERROR(EACCES));
202		}
203	}
204
205	/* Keep a count of the synchronous opens in the znode */
206	if (flag & (FSYNC | FDSYNC))
207		atomic_inc_32(&zp->z_sync_cnt);
208
209	ZFS_EXIT(zfsvfs);
210	return (0);
211}
212
213/* ARGSUSED */
214static int
215zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
216    caller_context_t *ct)
217{
218	znode_t	*zp = VTOZ(vp);
219	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
220
221	/*
222	 * Clean up any locks held by this process on the vp.
223	 */
224	cleanlocks(vp, ddi_get_pid(), 0);
225	cleanshares(vp, ddi_get_pid());
226
227	ZFS_ENTER(zfsvfs);
228	ZFS_VERIFY_ZP(zp);
229
230	/* Decrement the synchronous opens in the znode */
231	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
232		atomic_dec_32(&zp->z_sync_cnt);
233
234	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
235	    ZTOV(zp)->v_type == VREG &&
236	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
237		VERIFY(fs_vscan(vp, cr, 1) == 0);
238
239	ZFS_EXIT(zfsvfs);
240	return (0);
241}
242
243/*
244 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
245 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
246 */
247static int
248zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
249{
250	znode_t	*zp = VTOZ(vp);
251	uint64_t noff = (uint64_t)*off; /* new offset */
252	uint64_t file_sz;
253	int error;
254	boolean_t hole;
255
256	file_sz = zp->z_size;
257	if (noff >= file_sz)  {
258		return (SET_ERROR(ENXIO));
259	}
260
261	if (cmd == _FIO_SEEK_HOLE)
262		hole = B_TRUE;
263	else
264		hole = B_FALSE;
265
266	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
267
268	if (error == ESRCH)
269		return (SET_ERROR(ENXIO));
270
271	/*
272	 * We could find a hole that begins after the logical end-of-file,
273	 * because dmu_offset_next() only works on whole blocks.  If the
274	 * EOF falls mid-block, then indicate that the "virtual hole"
275	 * at the end of the file begins at the logical EOF, rather than
276	 * at the end of the last block.
277	 */
278	if (noff > file_sz) {
279		ASSERT(hole);
280		noff = file_sz;
281	}
282
283	if (noff < *off)
284		return (error);
285	*off = noff;
286	return (error);
287}
288
289/* ARGSUSED */
290static int
291zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
292    int *rvalp, caller_context_t *ct)
293{
294	offset_t off;
295	offset_t ndata;
296	dmu_object_info_t doi;
297	int error;
298	zfsvfs_t *zfsvfs;
299	znode_t *zp;
300
301	switch (com) {
302	case _FIOFFS:
303	{
304		return (0);
305
306		/*
307		 * The following two ioctls are used by bfu.  Faking out,
308		 * necessary to avoid bfu errors.
309		 */
310	}
311	case _FIOGDIO:
312	case _FIOSDIO:
313	{
314		return (0);
315	}
316
317	case _FIO_SEEK_DATA:
318	case _FIO_SEEK_HOLE:
319	{
320#ifdef illumos
321		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
322			return (SET_ERROR(EFAULT));
323#else
324		off = *(offset_t *)data;
325#endif
326		zp = VTOZ(vp);
327		zfsvfs = zp->z_zfsvfs;
328		ZFS_ENTER(zfsvfs);
329		ZFS_VERIFY_ZP(zp);
330
331		/* offset parameter is in/out */
332		error = zfs_holey(vp, com, &off);
333		ZFS_EXIT(zfsvfs);
334		if (error)
335			return (error);
336#ifdef illumos
337		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
338			return (SET_ERROR(EFAULT));
339#else
340		*(offset_t *)data = off;
341#endif
342		return (0);
343	}
344#ifdef illumos
345	case _FIO_COUNT_FILLED:
346	{
347		/*
348		 * _FIO_COUNT_FILLED adds a new ioctl command which
349		 * exposes the number of filled blocks in a
350		 * ZFS object.
351		 */
352		zp = VTOZ(vp);
353		zfsvfs = zp->z_zfsvfs;
354		ZFS_ENTER(zfsvfs);
355		ZFS_VERIFY_ZP(zp);
356
357		/*
358		 * Wait for all dirty blocks for this object
359		 * to get synced out to disk, and the DMU info
360		 * updated.
361		 */
362		error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
363		if (error) {
364			ZFS_EXIT(zfsvfs);
365			return (error);
366		}
367
368		/*
369		 * Retrieve fill count from DMU object.
370		 */
371		error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
372		if (error) {
373			ZFS_EXIT(zfsvfs);
374			return (error);
375		}
376
377		ndata = doi.doi_fill_count;
378
379		ZFS_EXIT(zfsvfs);
380		if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
381			return (SET_ERROR(EFAULT));
382		return (0);
383	}
384#endif
385	}
386	return (SET_ERROR(ENOTTY));
387}
388
389static vm_page_t
390page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
391{
392	vm_object_t obj;
393	vm_page_t pp;
394	int64_t end;
395
396	/*
397	 * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
398	 * aligned boundaries, if the range is not aligned.  As a result a
399	 * DEV_BSIZE subrange with partially dirty data may get marked as clean.
400	 * It may happen that all DEV_BSIZE subranges are marked clean and thus
401	 * the whole page would be considred clean despite have some dirty data.
402	 * For this reason we should shrink the range to DEV_BSIZE aligned
403	 * boundaries before calling vm_page_clear_dirty.
404	 */
405	end = rounddown2(off + nbytes, DEV_BSIZE);
406	off = roundup2(off, DEV_BSIZE);
407	nbytes = end - off;
408
409	obj = vp->v_object;
410	zfs_vmobject_assert_wlocked(obj);
411
412	for (;;) {
413		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
414		    pp->valid) {
415			if (vm_page_xbusied(pp)) {
416				/*
417				 * Reference the page before unlocking and
418				 * sleeping so that the page daemon is less
419				 * likely to reclaim it.
420				 */
421				vm_page_reference(pp);
422				vm_page_lock(pp);
423				zfs_vmobject_wunlock(obj);
424				vm_page_busy_sleep(pp, "zfsmwb", true);
425				zfs_vmobject_wlock(obj);
426				continue;
427			}
428			vm_page_sbusy(pp);
429		} else if (pp == NULL) {
430			pp = vm_page_alloc(obj, OFF_TO_IDX(start),
431			    VM_ALLOC_SYSTEM | VM_ALLOC_IFCACHED |
432			    VM_ALLOC_SBUSY);
433		} else {
434			ASSERT(pp != NULL && !pp->valid);
435			pp = NULL;
436		}
437
438		if (pp != NULL) {
439			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
440			vm_object_pip_add(obj, 1);
441			pmap_remove_write(pp);
442			if (nbytes != 0)
443				vm_page_clear_dirty(pp, off, nbytes);
444		}
445		break;
446	}
447	return (pp);
448}
449
450static void
451page_unbusy(vm_page_t pp)
452{
453
454	vm_page_sunbusy(pp);
455	vm_object_pip_subtract(pp->object, 1);
456}
457
458static vm_page_t
459page_hold(vnode_t *vp, int64_t start)
460{
461	vm_object_t obj;
462	vm_page_t pp;
463
464	obj = vp->v_object;
465	zfs_vmobject_assert_wlocked(obj);
466
467	for (;;) {
468		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
469		    pp->valid) {
470			if (vm_page_xbusied(pp)) {
471				/*
472				 * Reference the page before unlocking and
473				 * sleeping so that the page daemon is less
474				 * likely to reclaim it.
475				 */
476				vm_page_reference(pp);
477				vm_page_lock(pp);
478				zfs_vmobject_wunlock(obj);
479				vm_page_busy_sleep(pp, "zfsmwb", true);
480				zfs_vmobject_wlock(obj);
481				continue;
482			}
483
484			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
485			vm_page_lock(pp);
486			vm_page_hold(pp);
487			vm_page_unlock(pp);
488
489		} else
490			pp = NULL;
491		break;
492	}
493	return (pp);
494}
495
496static void
497page_unhold(vm_page_t pp)
498{
499
500	vm_page_lock(pp);
501	vm_page_unhold(pp);
502	vm_page_unlock(pp);
503}
504
505/*
506 * When a file is memory mapped, we must keep the IO data synchronized
507 * between the DMU cache and the memory mapped pages.  What this means:
508 *
509 * On Write:	If we find a memory mapped page, we write to *both*
510 *		the page and the dmu buffer.
511 */
512static void
513update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
514    int segflg, dmu_tx_t *tx)
515{
516	vm_object_t obj;
517	struct sf_buf *sf;
518	caddr_t va;
519	int off;
520
521	ASSERT(segflg != UIO_NOCOPY);
522	ASSERT(vp->v_mount != NULL);
523	obj = vp->v_object;
524	ASSERT(obj != NULL);
525
526	off = start & PAGEOFFSET;
527	zfs_vmobject_wlock(obj);
528	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
529		vm_page_t pp;
530		int nbytes = imin(PAGESIZE - off, len);
531
532		if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
533			zfs_vmobject_wunlock(obj);
534
535			va = zfs_map_page(pp, &sf);
536			(void) dmu_read(os, oid, start+off, nbytes,
537			    va+off, DMU_READ_PREFETCH);;
538			zfs_unmap_page(sf);
539
540			zfs_vmobject_wlock(obj);
541			page_unbusy(pp);
542		}
543		len -= nbytes;
544		off = 0;
545	}
546	vm_object_pip_wakeupn(obj, 0);
547	zfs_vmobject_wunlock(obj);
548}
549
550/*
551 * Read with UIO_NOCOPY flag means that sendfile(2) requests
552 * ZFS to populate a range of page cache pages with data.
553 *
554 * NOTE: this function could be optimized to pre-allocate
555 * all pages in advance, drain exclusive busy on all of them,
556 * map them into contiguous KVA region and populate them
557 * in one single dmu_read() call.
558 */
559static int
560mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
561{
562	znode_t *zp = VTOZ(vp);
563	objset_t *os = zp->z_zfsvfs->z_os;
564	struct sf_buf *sf;
565	vm_object_t obj;
566	vm_page_t pp;
567	int64_t start;
568	caddr_t va;
569	int len = nbytes;
570	int off;
571	int error = 0;
572
573	ASSERT(uio->uio_segflg == UIO_NOCOPY);
574	ASSERT(vp->v_mount != NULL);
575	obj = vp->v_object;
576	ASSERT(obj != NULL);
577	ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);
578
579	zfs_vmobject_wlock(obj);
580	for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
581		int bytes = MIN(PAGESIZE, len);
582
583		pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY |
584		    VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
585		if (pp->valid == 0) {
586			zfs_vmobject_wunlock(obj);
587			va = zfs_map_page(pp, &sf);
588			error = dmu_read(os, zp->z_id, start, bytes, va,
589			    DMU_READ_PREFETCH);
590			if (bytes != PAGESIZE && error == 0)
591				bzero(va + bytes, PAGESIZE - bytes);
592			zfs_unmap_page(sf);
593			zfs_vmobject_wlock(obj);
594			vm_page_sunbusy(pp);
595			vm_page_lock(pp);
596			if (error) {
597				if (pp->wire_count == 0 && pp->valid == 0 &&
598				    !vm_page_busied(pp))
599					vm_page_free(pp);
600			} else {
601				pp->valid = VM_PAGE_BITS_ALL;
602				vm_page_activate(pp);
603			}
604			vm_page_unlock(pp);
605		} else {
606			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
607			vm_page_sunbusy(pp);
608		}
609		if (error)
610			break;
611		uio->uio_resid -= bytes;
612		uio->uio_offset += bytes;
613		len -= bytes;
614	}
615	zfs_vmobject_wunlock(obj);
616	return (error);
617}
618
619/*
620 * When a file is memory mapped, we must keep the IO data synchronized
621 * between the DMU cache and the memory mapped pages.  What this means:
622 *
623 * On Read:	We "read" preferentially from memory mapped pages,
624 *		else we default from the dmu buffer.
625 *
626 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
627 *	 the file is memory mapped.
628 */
629static int
630mappedread(vnode_t *vp, int nbytes, uio_t *uio)
631{
632	znode_t *zp = VTOZ(vp);
633	vm_object_t obj;
634	int64_t start;
635	caddr_t va;
636	int len = nbytes;
637	int off;
638	int error = 0;
639
640	ASSERT(vp->v_mount != NULL);
641	obj = vp->v_object;
642	ASSERT(obj != NULL);
643
644	start = uio->uio_loffset;
645	off = start & PAGEOFFSET;
646	zfs_vmobject_wlock(obj);
647	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
648		vm_page_t pp;
649		uint64_t bytes = MIN(PAGESIZE - off, len);
650
651		if (pp = page_hold(vp, start)) {
652			struct sf_buf *sf;
653			caddr_t va;
654
655			zfs_vmobject_wunlock(obj);
656			va = zfs_map_page(pp, &sf);
657#ifdef illumos
658			error = uiomove(va + off, bytes, UIO_READ, uio);
659#else
660			error = vn_io_fault_uiomove(va + off, bytes, uio);
661#endif
662			zfs_unmap_page(sf);
663			zfs_vmobject_wlock(obj);
664			page_unhold(pp);
665		} else {
666			zfs_vmobject_wunlock(obj);
667			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
668			    uio, bytes);
669			zfs_vmobject_wlock(obj);
670		}
671		len -= bytes;
672		off = 0;
673		if (error)
674			break;
675	}
676	zfs_vmobject_wunlock(obj);
677	return (error);
678}
679
680offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
681
682/*
683 * Read bytes from specified file into supplied buffer.
684 *
685 *	IN:	vp	- vnode of file to be read from.
686 *		uio	- structure supplying read location, range info,
687 *			  and return buffer.
688 *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
689 *		cr	- credentials of caller.
690 *		ct	- caller context
691 *
692 *	OUT:	uio	- updated offset and range, buffer filled.
693 *
694 *	RETURN:	0 on success, error code on failure.
695 *
696 * Side Effects:
697 *	vp - atime updated if byte count > 0
698 */
699/* ARGSUSED */
700static int
701zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
702{
703	znode_t		*zp = VTOZ(vp);
704	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
705	ssize_t		n, nbytes;
706	int		error = 0;
707	rl_t		*rl;
708	xuio_t		*xuio = NULL;
709
710	ZFS_ENTER(zfsvfs);
711	ZFS_VERIFY_ZP(zp);
712
713	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
714		ZFS_EXIT(zfsvfs);
715		return (SET_ERROR(EACCES));
716	}
717
718	/*
719	 * Validate file offset
720	 */
721	if (uio->uio_loffset < (offset_t)0) {
722		ZFS_EXIT(zfsvfs);
723		return (SET_ERROR(EINVAL));
724	}
725
726	/*
727	 * Fasttrack empty reads
728	 */
729	if (uio->uio_resid == 0) {
730		ZFS_EXIT(zfsvfs);
731		return (0);
732	}
733
734	/*
735	 * Check for mandatory locks
736	 */
737	if (MANDMODE(zp->z_mode)) {
738		if (error = chklock(vp, FREAD,
739		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
740			ZFS_EXIT(zfsvfs);
741			return (error);
742		}
743	}
744
745	/*
746	 * If we're in FRSYNC mode, sync out this znode before reading it.
747	 */
748	if (zfsvfs->z_log &&
749	    (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
750		zil_commit(zfsvfs->z_log, zp->z_id);
751
752	/*
753	 * Lock the range against changes.
754	 */
755	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
756
757	/*
758	 * If we are reading past end-of-file we can skip
759	 * to the end; but we might still need to set atime.
760	 */
761	if (uio->uio_loffset >= zp->z_size) {
762		error = 0;
763		goto out;
764	}
765
766	ASSERT(uio->uio_loffset < zp->z_size);
767	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
768
769#ifdef illumos
770	if ((uio->uio_extflg == UIO_XUIO) &&
771	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
772		int nblk;
773		int blksz = zp->z_blksz;
774		uint64_t offset = uio->uio_loffset;
775
776		xuio = (xuio_t *)uio;
777		if ((ISP2(blksz))) {
778			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
779			    blksz)) / blksz;
780		} else {
781			ASSERT(offset + n <= blksz);
782			nblk = 1;
783		}
784		(void) dmu_xuio_init(xuio, nblk);
785
786		if (vn_has_cached_data(vp)) {
787			/*
788			 * For simplicity, we always allocate a full buffer
789			 * even if we only expect to read a portion of a block.
790			 */
791			while (--nblk >= 0) {
792				(void) dmu_xuio_add(xuio,
793				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
794				    blksz), 0, blksz);
795			}
796		}
797	}
798#endif	/* illumos */
799
800	while (n > 0) {
801		nbytes = MIN(n, zfs_read_chunk_size -
802		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
803
804#ifdef __FreeBSD__
805		if (uio->uio_segflg == UIO_NOCOPY)
806			error = mappedread_sf(vp, nbytes, uio);
807		else
808#endif /* __FreeBSD__ */
809		if (vn_has_cached_data(vp)) {
810			error = mappedread(vp, nbytes, uio);
811		} else {
812			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
813			    uio, nbytes);
814		}
815		if (error) {
816			/* convert checksum errors into IO errors */
817			if (error == ECKSUM)
818				error = SET_ERROR(EIO);
819			break;
820		}
821
822		n -= nbytes;
823	}
824out:
825	zfs_range_unlock(rl);
826
827	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
828	ZFS_EXIT(zfsvfs);
829	return (error);
830}
831
832/*
833 * Write the bytes to a file.
834 *
835 *	IN:	vp	- vnode of file to be written to.
836 *		uio	- structure supplying write location, range info,
837 *			  and data buffer.
838 *		ioflag	- FAPPEND, FSYNC, and/or FDSYNC.  FAPPEND is
839 *			  set if in append mode.
840 *		cr	- credentials of caller.
841 *		ct	- caller context (NFS/CIFS fem monitor only)
842 *
843 *	OUT:	uio	- updated offset and range.
844 *
845 *	RETURN:	0 on success, error code on failure.
846 *
847 * Timestamps:
848 *	vp - ctime|mtime updated if byte count > 0
849 */
850
851/* ARGSUSED */
852static int
853zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
854{
855	znode_t		*zp = VTOZ(vp);
856	rlim64_t	limit = MAXOFFSET_T;
857	ssize_t		start_resid = uio->uio_resid;
858	ssize_t		tx_bytes;
859	uint64_t	end_size;
860	dmu_tx_t	*tx;
861	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
862	zilog_t		*zilog;
863	offset_t	woff;
864	ssize_t		n, nbytes;
865	rl_t		*rl;
866	int		max_blksz = zfsvfs->z_max_blksz;
867	int		error = 0;
868	arc_buf_t	*abuf;
869	iovec_t		*aiov = NULL;
870	xuio_t		*xuio = NULL;
871	int		i_iov = 0;
872	int		iovcnt = uio->uio_iovcnt;
873	iovec_t		*iovp = uio->uio_iov;
874	int		write_eof;
875	int		count = 0;
876	sa_bulk_attr_t	bulk[4];
877	uint64_t	mtime[2], ctime[2];
878
879	/*
880	 * Fasttrack empty write
881	 */
882	n = start_resid;
883	if (n == 0)
884		return (0);
885
886	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
887		limit = MAXOFFSET_T;
888
889	ZFS_ENTER(zfsvfs);
890	ZFS_VERIFY_ZP(zp);
891
892	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
893	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
894	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
895	    &zp->z_size, 8);
896	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
897	    &zp->z_pflags, 8);
898
899	/*
900	 * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our
901	 * callers might not be able to detect properly that we are read-only,
902	 * so check it explicitly here.
903	 */
904	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
905		ZFS_EXIT(zfsvfs);
906		return (SET_ERROR(EROFS));
907	}
908
909	/*
910	 * If immutable or not appending then return EPERM
911	 */
912	if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
913	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
914	    (uio->uio_loffset < zp->z_size))) {
915		ZFS_EXIT(zfsvfs);
916		return (SET_ERROR(EPERM));
917	}
918
919	zilog = zfsvfs->z_log;
920
921	/*
922	 * Validate file offset
923	 */
924	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
925	if (woff < 0) {
926		ZFS_EXIT(zfsvfs);
927		return (SET_ERROR(EINVAL));
928	}
929
930	/*
931	 * Check for mandatory locks before calling zfs_range_lock()
932	 * in order to prevent a deadlock with locks set via fcntl().
933	 */
934	if (MANDMODE((mode_t)zp->z_mode) &&
935	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
936		ZFS_EXIT(zfsvfs);
937		return (error);
938	}
939
940#ifdef illumos
941	/*
942	 * Pre-fault the pages to ensure slow (eg NFS) pages
943	 * don't hold up txg.
944	 * Skip this if uio contains loaned arc_buf.
945	 */
946	if ((uio->uio_extflg == UIO_XUIO) &&
947	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
948		xuio = (xuio_t *)uio;
949	else
950		uio_prefaultpages(MIN(n, max_blksz), uio);
951#endif
952
953	/*
954	 * If in append mode, set the io offset pointer to eof.
955	 */
956	if (ioflag & FAPPEND) {
957		/*
958		 * Obtain an appending range lock to guarantee file append
959		 * semantics.  We reset the write offset once we have the lock.
960		 */
961		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
962		woff = rl->r_off;
963		if (rl->r_len == UINT64_MAX) {
964			/*
965			 * We overlocked the file because this write will cause
966			 * the file block size to increase.
967			 * Note that zp_size cannot change with this lock held.
968			 */
969			woff = zp->z_size;
970		}
971		uio->uio_loffset = woff;
972	} else {
973		/*
974		 * Note that if the file block size will change as a result of
975		 * this write, then this range lock will lock the entire file
976		 * so that we can re-write the block safely.
977		 */
978		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
979	}
980
981	if (vn_rlimit_fsize(vp, uio, uio->uio_td)) {
982		zfs_range_unlock(rl);
983		ZFS_EXIT(zfsvfs);
984		return (EFBIG);
985	}
986
987	if (woff >= limit) {
988		zfs_range_unlock(rl);
989		ZFS_EXIT(zfsvfs);
990		return (SET_ERROR(EFBIG));
991	}
992
993	if ((woff + n) > limit || woff > (limit - n))
994		n = limit - woff;
995
996	/* Will this write extend the file length? */
997	write_eof = (woff + n > zp->z_size);
998
999	end_size = MAX(zp->z_size, woff + n);
1000
1001	/*
1002	 * Write the file in reasonable size chunks.  Each chunk is written
1003	 * in a separate transaction; this keeps the intent log records small
1004	 * and allows us to do more fine-grained space accounting.
1005	 */
1006	while (n > 0) {
1007		abuf = NULL;
1008		woff = uio->uio_loffset;
1009		if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
1010		    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
1011			if (abuf != NULL)
1012				dmu_return_arcbuf(abuf);
1013			error = SET_ERROR(EDQUOT);
1014			break;
1015		}
1016
1017		if (xuio && abuf == NULL) {
1018			ASSERT(i_iov < iovcnt);
1019			aiov = &iovp[i_iov];
1020			abuf = dmu_xuio_arcbuf(xuio, i_iov);
1021			dmu_xuio_clear(xuio, i_iov);
1022			DTRACE_PROBE3(zfs_cp_write, int, i_iov,
1023			    iovec_t *, aiov, arc_buf_t *, abuf);
1024			ASSERT((aiov->iov_base == abuf->b_data) ||
1025			    ((char *)aiov->iov_base - (char *)abuf->b_data +
1026			    aiov->iov_len == arc_buf_size(abuf)));
1027			i_iov++;
1028		} else if (abuf == NULL && n >= max_blksz &&
1029		    woff >= zp->z_size &&
1030		    P2PHASE(woff, max_blksz) == 0 &&
1031		    zp->z_blksz == max_blksz) {
1032			/*
1033			 * This write covers a full block.  "Borrow" a buffer
1034			 * from the dmu so that we can fill it before we enter
1035			 * a transaction.  This avoids the possibility of
1036			 * holding up the transaction if the data copy hangs
1037			 * up on a pagefault (e.g., from an NFS server mapping).
1038			 */
1039#ifdef illumos
1040			size_t cbytes;
1041#endif
1042
1043			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
1044			    max_blksz);
1045			ASSERT(abuf != NULL);
1046			ASSERT(arc_buf_size(abuf) == max_blksz);
1047#ifdef illumos
1048			if (error = uiocopy(abuf->b_data, max_blksz,
1049			    UIO_WRITE, uio, &cbytes)) {
1050				dmu_return_arcbuf(abuf);
1051				break;
1052			}
1053			ASSERT(cbytes == max_blksz);
1054#else
1055			ssize_t resid = uio->uio_resid;
1056			error = vn_io_fault_uiomove(abuf->b_data, max_blksz, uio);
1057			if (error != 0) {
1058				uio->uio_offset -= resid - uio->uio_resid;
1059				uio->uio_resid = resid;
1060				dmu_return_arcbuf(abuf);
1061				break;
1062			}
1063#endif
1064		}
1065
1066		/*
1067		 * Start a transaction.
1068		 */
1069		tx = dmu_tx_create(zfsvfs->z_os);
1070		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1071		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
1072		zfs_sa_upgrade_txholds(tx, zp);
1073		error = dmu_tx_assign(tx, TXG_WAIT);
1074		if (error) {
1075			dmu_tx_abort(tx);
1076			if (abuf != NULL)
1077				dmu_return_arcbuf(abuf);
1078			break;
1079		}
1080
1081		/*
1082		 * If zfs_range_lock() over-locked we grow the blocksize
1083		 * and then reduce the lock range.  This will only happen
1084		 * on the first iteration since zfs_range_reduce() will
1085		 * shrink down r_len to the appropriate size.
1086		 */
1087		if (rl->r_len == UINT64_MAX) {
1088			uint64_t new_blksz;
1089
1090			if (zp->z_blksz > max_blksz) {
1091				/*
1092				 * File's blocksize is already larger than the
1093				 * "recordsize" property.  Only let it grow to
1094				 * the next power of 2.
1095				 */
1096				ASSERT(!ISP2(zp->z_blksz));
1097				new_blksz = MIN(end_size,
1098				    1 << highbit64(zp->z_blksz));
1099			} else {
1100				new_blksz = MIN(end_size, max_blksz);
1101			}
1102			zfs_grow_blocksize(zp, new_blksz, tx);
1103			zfs_range_reduce(rl, woff, n);
1104		}
1105
1106		/*
1107		 * XXX - should we really limit each write to z_max_blksz?
1108		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
1109		 */
1110		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
1111
1112		if (woff + nbytes > zp->z_size)
1113			vnode_pager_setsize(vp, woff + nbytes);
1114
1115		if (abuf == NULL) {
1116			tx_bytes = uio->uio_resid;
1117			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
1118			    uio, nbytes, tx);
1119			tx_bytes -= uio->uio_resid;
1120		} else {
1121			tx_bytes = nbytes;
1122			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
1123			/*
1124			 * If this is not a full block write, but we are
1125			 * extending the file past EOF and this data starts
1126			 * block-aligned, use assign_arcbuf().  Otherwise,
1127			 * write via dmu_write().
1128			 */
1129			if (tx_bytes < max_blksz && (!write_eof ||
1130			    aiov->iov_base != abuf->b_data)) {
1131				ASSERT(xuio);
1132				dmu_write(zfsvfs->z_os, zp->z_id, woff,
1133				    aiov->iov_len, aiov->iov_base, tx);
1134				dmu_return_arcbuf(abuf);
1135				xuio_stat_wbuf_copied();
1136			} else {
1137				ASSERT(xuio || tx_bytes == max_blksz);
1138				dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
1139				    woff, abuf, tx);
1140			}
1141#ifdef illumos
1142			ASSERT(tx_bytes <= uio->uio_resid);
1143			uioskip(uio, tx_bytes);
1144#endif
1145		}
1146		if (tx_bytes && vn_has_cached_data(vp)) {
1147			update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
1148			    zp->z_id, uio->uio_segflg, tx);
1149		}
1150
1151		/*
1152		 * If we made no progress, we're done.  If we made even
1153		 * partial progress, update the znode and ZIL accordingly.
1154		 */
1155		if (tx_bytes == 0) {
1156			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
1157			    (void *)&zp->z_size, sizeof (uint64_t), tx);
1158			dmu_tx_commit(tx);
1159			ASSERT(error != 0);
1160			break;
1161		}
1162
		/*
		 * Clear Set-UID/Set-GID bits on successful write if not
		 * privileged and at least one of the execute bits is set.
		 *
		 * It would be nice to do this after all writes have
		 * been done, but that would still expose the ISUID/ISGID
		 * to another app after the partial write is committed.
		 *
		 * Note: we don't call zfs_fuid_map_id() here because
		 * user 0 is not an ephemeral uid.
		 */
1174		mutex_enter(&zp->z_acl_lock);
1175		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
1176		    (S_IXUSR >> 6))) != 0 &&
1177		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
1178		    secpolicy_vnode_setid_retain(vp, cr,
1179		    (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
1180			uint64_t newmode;
1181			zp->z_mode &= ~(S_ISUID | S_ISGID);
1182			newmode = zp->z_mode;
1183			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
1184			    (void *)&newmode, sizeof (uint64_t), tx);
1185		}
1186		mutex_exit(&zp->z_acl_lock);
1187
1188		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
1189		    B_TRUE);
1190
1191		/*
1192		 * Update the file size (zp_size) if it has changed;
1193		 * account for possible concurrent updates.
1194		 */
1195		while ((end_size = zp->z_size) < uio->uio_loffset) {
1196			(void) atomic_cas_64(&zp->z_size, end_size,
1197			    uio->uio_loffset);
1198#ifdef illumos
1199			ASSERT(error == 0);
1200#else
1201			ASSERT(error == 0 || error == EFAULT);
1202#endif
1203		}
1204		/*
1205		 * If we are replaying and eof is non zero then force
1206		 * the file size to the specified eof. Note, there's no
1207		 * concurrency during replay.
1208		 */
1209		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
1210			zp->z_size = zfsvfs->z_replay_eof;
1211
1212		if (error == 0)
1213			error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1214		else
1215			(void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1216
1217		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
1218		dmu_tx_commit(tx);
1219
1220		if (error != 0)
1221			break;
1222		ASSERT(tx_bytes == nbytes);
1223		n -= nbytes;
1224
1225#ifdef illumos
1226		if (!xuio && n > 0)
1227			uio_prefaultpages(MIN(n, max_blksz), uio);
1228#endif
1229	}
1230
1231	zfs_range_unlock(rl);
1232
1233	/*
1234	 * If we're in replay mode, or we made no progress, return error.
1235	 * Otherwise, it's at least a partial write, so it's successful.
1236	 */
1237	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
1238		ZFS_EXIT(zfsvfs);
1239		return (error);
1240	}
1241
1242#ifdef __FreeBSD__
1243	/*
1244	 * EFAULT means that at least one page of the source buffer was not
1245	 * available.  VFS will re-try remaining I/O upon this error.
1246	 */
1247	if (error == EFAULT) {
1248		ZFS_EXIT(zfsvfs);
1249		return (error);
1250	}
1251#endif
1252
1253	if (ioflag & (FSYNC | FDSYNC) ||
1254	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1255		zil_commit(zilog, zp->z_id);
1256
1257	ZFS_EXIT(zfsvfs);
1258	return (0);
1259}
1260
1261void
1262zfs_get_done(zgd_t *zgd, int error)
1263{
1264	znode_t *zp = zgd->zgd_private;
1265	objset_t *os = zp->z_zfsvfs->z_os;
1266
1267	if (zgd->zgd_db)
1268		dmu_buf_rele(zgd->zgd_db, zgd);
1269
1270	zfs_range_unlock(zgd->zgd_rl);
1271
1272	/*
1273	 * Release the vnode asynchronously as we currently have the
1274	 * txg stopped from syncing.
1275	 */
1276	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1277
1278	if (error == 0 && zgd->zgd_bp)
1279		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
1280
1281	kmem_free(zgd, sizeof (zgd_t));
1282}
1283
#ifdef DEBUG
/*
 * Debug-only fault injection switch.  When nonzero, the next indirect
 * write processed by zfs_get_data() is failed with EIO and the flag is
 * reset to zero.
 */
static int zil_fault_io = 0;
#endif
1287
1288/*
1289 * Get data to generate a TX_WRITE intent log record.
1290 */
1291int
1292zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
1293{
1294	zfsvfs_t *zfsvfs = arg;
1295	objset_t *os = zfsvfs->z_os;
1296	znode_t *zp;
1297	uint64_t object = lr->lr_foid;
1298	uint64_t offset = lr->lr_offset;
1299	uint64_t size = lr->lr_length;
1300	blkptr_t *bp = &lr->lr_blkptr;
1301	dmu_buf_t *db;
1302	zgd_t *zgd;
1303	int error = 0;
1304
1305	ASSERT(zio != NULL);
1306	ASSERT(size != 0);
1307
1308	/*
1309	 * Nothing to do if the file has been removed
1310	 */
1311	if (zfs_zget(zfsvfs, object, &zp) != 0)
1312		return (SET_ERROR(ENOENT));
1313	if (zp->z_unlinked) {
1314		/*
1315		 * Release the vnode asynchronously as we currently have the
1316		 * txg stopped from syncing.
1317		 */
1318		VN_RELE_ASYNC(ZTOV(zp),
1319		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1320		return (SET_ERROR(ENOENT));
1321	}
1322
1323	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
1324	zgd->zgd_zilog = zfsvfs->z_log;
1325	zgd->zgd_private = zp;
1326
1327	/*
1328	 * Write records come in two flavors: immediate and indirect.
1329	 * For small writes it's cheaper to store the data with the
1330	 * log record (immediate); for large writes it's cheaper to
1331	 * sync the data and get a pointer to it (indirect) so that
1332	 * we don't have to write the data twice.
1333	 */
1334	if (buf != NULL) { /* immediate write */
1335		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
1336		/* test for truncation needs to be done while range locked */
1337		if (offset >= zp->z_size) {
1338			error = SET_ERROR(ENOENT);
1339		} else {
1340			error = dmu_read(os, object, offset, size, buf,
1341			    DMU_READ_NO_PREFETCH);
1342		}
1343		ASSERT(error == 0 || error == ENOENT);
1344	} else { /* indirect write */
1345		/*
1346		 * Have to lock the whole block to ensure when it's
1347		 * written out and its checksum is being calculated
1348		 * that no one can change the data. We need to re-check
1349		 * blocksize after we get the lock in case it's changed!
1350		 */
1351		for (;;) {
1352			uint64_t blkoff;
1353			size = zp->z_blksz;
1354			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
1355			offset -= blkoff;
1356			zgd->zgd_rl = zfs_range_lock(zp, offset, size,
1357			    RL_READER);
1358			if (zp->z_blksz == size)
1359				break;
1360			offset += blkoff;
1361			zfs_range_unlock(zgd->zgd_rl);
1362		}
1363		/* test for truncation needs to be done while range locked */
1364		if (lr->lr_offset >= zp->z_size)
1365			error = SET_ERROR(ENOENT);
1366#ifdef DEBUG
1367		if (zil_fault_io) {
1368			error = SET_ERROR(EIO);
1369			zil_fault_io = 0;
1370		}
1371#endif
1372		if (error == 0)
1373			error = dmu_buf_hold(os, object, offset, zgd, &db,
1374			    DMU_READ_NO_PREFETCH);
1375
1376		if (error == 0) {
1377			blkptr_t *obp = dmu_buf_get_blkptr(db);
1378			if (obp) {
1379				ASSERT(BP_IS_HOLE(bp));
1380				*bp = *obp;
1381			}
1382
1383			zgd->zgd_db = db;
1384			zgd->zgd_bp = bp;
1385
1386			ASSERT(db->db_offset == offset);
1387			ASSERT(db->db_size == size);
1388
1389			error = dmu_sync(zio, lr->lr_common.lrc_txg,
1390			    zfs_get_done, zgd);
1391			ASSERT(error || lr->lr_length <= zp->z_blksz);
1392
1393			/*
1394			 * On success, we need to wait for the write I/O
1395			 * initiated by dmu_sync() to complete before we can
1396			 * release this dbuf.  We will finish everything up
1397			 * in the zfs_get_done() callback.
1398			 */
1399			if (error == 0)
1400				return (0);
1401
1402			if (error == EALREADY) {
1403				lr->lr_common.lrc_txtype = TX_WRITE2;
1404				error = 0;
1405			}
1406		}
1407	}
1408
1409	zfs_get_done(zgd, error);
1410
1411	return (error);
1412}
1413
1414/*ARGSUSED*/
1415static int
1416zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
1417    caller_context_t *ct)
1418{
1419	znode_t *zp = VTOZ(vp);
1420	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1421	int error;
1422
1423	ZFS_ENTER(zfsvfs);
1424	ZFS_VERIFY_ZP(zp);
1425
1426	if (flag & V_ACE_MASK)
1427		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
1428	else
1429		error = zfs_zaccess_rwx(zp, mode, flag, cr);
1430
1431	ZFS_EXIT(zfsvfs);
1432	return (error);
1433}
1434
/*
 * Callback handed to vn_vget_ino_gen() by zfs_lookup_lock() for the ".."
 * case.  'arg' is the vnode to return: it is stored in *vpp and locked
 * with 'lkflags'.  If locking fails the vnode is released with vrele()
 * and the lock error is returned.  'mp' is not used.
 */
static int
zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
{
	struct vnode *target = arg;
	int error;

	*vpp = target;
	error = vn_lock(target, lkflags);
	if (error != 0)
		vrele(target);
	return (error);
}
1446
1447static int
1448zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
1449{
1450	znode_t *zdp = VTOZ(dvp);
1451	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1452	int error;
1453	int ltype;
1454
1455	ASSERT_VOP_LOCKED(dvp, __func__);
1456#ifdef DIAGNOSTIC
1457	if ((zdp->z_pflags & ZFS_XATTR) == 0)
1458		VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
1459#endif
1460
1461	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
1462		ASSERT3P(dvp, ==, vp);
1463		vref(dvp);
1464		ltype = lkflags & LK_TYPE_MASK;
1465		if (ltype != VOP_ISLOCKED(dvp)) {
1466			if (ltype == LK_EXCLUSIVE)
1467				vn_lock(dvp, LK_UPGRADE | LK_RETRY);
1468			else /* if (ltype == LK_SHARED) */
1469				vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
1470
1471			/*
1472			 * Relock for the "." case could leave us with
1473			 * reclaimed vnode.
1474			 */
1475			if (dvp->v_iflag & VI_DOOMED) {
1476				vrele(dvp);
1477				return (SET_ERROR(ENOENT));
1478			}
1479		}
1480		return (0);
1481	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
1482		/*
1483		 * Note that in this case, dvp is the child vnode, and we
1484		 * are looking up the parent vnode - exactly reverse from
1485		 * normal operation.  Unlocking dvp requires some rather
1486		 * tricky unlock/relock dance to prevent mp from being freed;
1487		 * use vn_vget_ino_gen() which takes care of all that.
1488		 *
1489		 * XXX Note that there is a time window when both vnodes are
1490		 * unlocked.  It is possible, although highly unlikely, that
1491		 * during that window the parent-child relationship between
1492		 * the vnodes may change, for example, get reversed.
1493		 * In that case we would have a wrong lock order for the vnodes.
1494		 * All other filesystems seem to ignore this problem, so we
1495		 * do the same here.
1496		 * A potential solution could be implemented as follows:
1497		 * - using LK_NOWAIT when locking the second vnode and retrying
1498		 *   if necessary
1499		 * - checking that the parent-child relationship still holds
1500		 *   after locking both vnodes and retrying if it doesn't
1501		 */
1502		error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
1503		return (error);
1504	} else {
1505		error = vn_lock(vp, lkflags);
1506		if (error != 0)
1507			vrele(vp);
1508		return (error);
1509	}
1510}
1511
1512/*
1513 * Lookup an entry in a directory, or an extended attribute directory.
1514 * If it exists, return a held vnode reference for it.
1515 *
1516 *	IN:	dvp	- vnode of directory to search.
1517 *		nm	- name of entry to lookup.
1518 *		pnp	- full pathname to lookup [UNUSED].
1519 *		flags	- LOOKUP_XATTR set if looking for an attribute.
1520 *		rdir	- root directory vnode [UNUSED].
1521 *		cr	- credentials of caller.
1522 *		ct	- caller context
1523 *
1524 *	OUT:	vpp	- vnode of located entry, NULL if not found.
1525 *
1526 *	RETURN:	0 on success, error code on failure.
1527 *
1528 * Timestamps:
1529 *	NA
1530 */
1531/* ARGSUSED */
1532static int
1533zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
1534    int nameiop, cred_t *cr, kthread_t *td, int flags)
1535{
1536	znode_t *zdp = VTOZ(dvp);
1537	znode_t *zp;
1538	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1539	int	error = 0;
1540
1541	/* fast path (should be redundant with vfs namecache) */
1542	if (!(flags & LOOKUP_XATTR)) {
1543		if (dvp->v_type != VDIR) {
1544			return (SET_ERROR(ENOTDIR));
1545		} else if (zdp->z_sa_hdl == NULL) {
1546			return (SET_ERROR(EIO));
1547		}
1548	}
1549
1550	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
1551
1552	ZFS_ENTER(zfsvfs);
1553	ZFS_VERIFY_ZP(zdp);
1554
1555	*vpp = NULL;
1556
1557	if (flags & LOOKUP_XATTR) {
1558#ifdef TODO
1559		/*
1560		 * If the xattr property is off, refuse the lookup request.
1561		 */
1562		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
1563			ZFS_EXIT(zfsvfs);
1564			return (SET_ERROR(EINVAL));
1565		}
1566#endif
1567
1568		/*
1569		 * We don't allow recursive attributes..
1570		 * Maybe someday we will.
1571		 */
1572		if (zdp->z_pflags & ZFS_XATTR) {
1573			ZFS_EXIT(zfsvfs);
1574			return (SET_ERROR(EINVAL));
1575		}
1576
1577		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
1578			ZFS_EXIT(zfsvfs);
1579			return (error);
1580		}
1581
1582		/*
1583		 * Do we have permission to get into attribute directory?
1584		 */
1585		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
1586		    B_FALSE, cr)) {
1587			vrele(*vpp);
1588			*vpp = NULL;
1589		}
1590
1591		ZFS_EXIT(zfsvfs);
1592		return (error);
1593	}
1594
1595	/*
1596	 * Check accessibility of directory.
1597	 */
1598	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
1599		ZFS_EXIT(zfsvfs);
1600		return (error);
1601	}
1602
1603	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
1604	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1605		ZFS_EXIT(zfsvfs);
1606		return (SET_ERROR(EILSEQ));
1607	}
1608
1609
1610	/*
1611	 * First handle the special cases.
1612	 */
1613	if ((cnp->cn_flags & ISDOTDOT) != 0) {
1614		/*
1615		 * If we are a snapshot mounted under .zfs, return
1616		 * the vp for the snapshot directory.
1617		 */
1618		if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
1619			struct componentname cn;
1620			vnode_t *zfsctl_vp;
1621			int ltype;
1622
1623			ZFS_EXIT(zfsvfs);
1624			ltype = VOP_ISLOCKED(dvp);
1625			VOP_UNLOCK(dvp, 0);
1626			error = zfsctl_root(zfsvfs->z_parent, LK_SHARED,
1627			    &zfsctl_vp);
1628			if (error == 0) {
1629				cn.cn_nameptr = "snapshot";
1630				cn.cn_namelen = strlen(cn.cn_nameptr);
1631				cn.cn_nameiop = cnp->cn_nameiop;
1632				cn.cn_flags = cnp->cn_flags & ~ISDOTDOT;
1633				cn.cn_lkflags = cnp->cn_lkflags;
1634				error = VOP_LOOKUP(zfsctl_vp, vpp, &cn);
1635				vput(zfsctl_vp);
1636			}
1637			vn_lock(dvp, ltype | LK_RETRY);
1638			return (error);
1639		}
1640	}
1641	if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
1642		ZFS_EXIT(zfsvfs);
1643		if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
1644			return (SET_ERROR(ENOTSUP));
1645		error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp);
1646		return (error);
1647	}
1648
1649	/*
1650	 * The loop is retry the lookup if the parent-child relationship
1651	 * changes during the dot-dot locking complexities.
1652	 */
1653	for (;;) {
1654		uint64_t parent;
1655
1656		error = zfs_dirlook(zdp, nm, &zp);
1657		if (error == 0)
1658			*vpp = ZTOV(zp);
1659
1660		ZFS_EXIT(zfsvfs);
1661		if (error != 0)
1662			break;
1663
1664		error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
1665		if (error != 0) {
1666			/*
1667			 * If we've got a locking error, then the vnode
1668			 * got reclaimed because of a force unmount.
1669			 * We never enter doomed vnodes into the name cache.
1670			 */
1671			*vpp = NULL;
1672			return (error);
1673		}
1674
1675		if ((cnp->cn_flags & ISDOTDOT) == 0)
1676			break;
1677
1678		ZFS_ENTER(zfsvfs);
1679		if (zdp->z_sa_hdl == NULL) {
1680			error = SET_ERROR(EIO);
1681		} else {
1682			error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
1683			    &parent, sizeof (parent));
1684		}
1685		if (error != 0) {
1686			ZFS_EXIT(zfsvfs);
1687			vput(ZTOV(zp));
1688			break;
1689		}
1690		if (zp->z_id == parent) {
1691			ZFS_EXIT(zfsvfs);
1692			break;
1693		}
1694		vput(ZTOV(zp));
1695	}
1696
1697out:
1698	if (error != 0)
1699		*vpp = NULL;
1700
1701	/* Translate errors and add SAVENAME when needed. */
1702	if (cnp->cn_flags & ISLASTCN) {
1703		switch (nameiop) {
1704		case CREATE:
1705		case RENAME:
1706			if (error == ENOENT) {
1707				error = EJUSTRETURN;
1708				cnp->cn_flags |= SAVENAME;
1709				break;
1710			}
1711			/* FALLTHROUGH */
1712		case DELETE:
1713			if (error == 0)
1714				cnp->cn_flags |= SAVENAME;
1715			break;
1716		}
1717	}
1718
1719	/* Insert name into cache (as non-existent) if appropriate. */
1720	if (zfsvfs->z_use_namecache &&
1721	    error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
1722		cache_enter(dvp, NULL, cnp);
1723
1724	/* Insert name into cache if appropriate. */
1725	if (zfsvfs->z_use_namecache &&
1726	    error == 0 && (cnp->cn_flags & MAKEENTRY)) {
1727		if (!(cnp->cn_flags & ISLASTCN) ||
1728		    (nameiop != DELETE && nameiop != RENAME)) {
1729			cache_enter(dvp, *vpp, cnp);
1730		}
1731	}
1732
1733	return (error);
1734}
1735
1736/*
1737 * Attempt to create a new entry in a directory.  If the entry
1738 * already exists, truncate the file if permissible, else return
1739 * an error.  Return the vp of the created or trunc'd file.
1740 *
1741 *	IN:	dvp	- vnode of directory to put new file entry in.
1742 *		name	- name of new file entry.
1743 *		vap	- attributes of new file.
1744 *		excl	- flag indicating exclusive or non-exclusive mode.
1745 *		mode	- mode to open file with.
1746 *		cr	- credentials of caller.
1747 *		flag	- large file flag [UNUSED].
1748 *		ct	- caller context
1749 *		vsecp	- ACL to be set
1750 *
1751 *	OUT:	vpp	- vnode of created or trunc'd entry.
1752 *
1753 *	RETURN:	0 on success, error code on failure.
1754 *
1755 * Timestamps:
1756 *	dvp - ctime|mtime updated if new entry created
1757 *	 vp - ctime|mtime always, atime if new
1758 */
1759
1760/* ARGSUSED */
1761static int
1762zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
1763    vnode_t **vpp, cred_t *cr, kthread_t *td)
1764{
1765	znode_t		*zp, *dzp = VTOZ(dvp);
1766	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1767	zilog_t		*zilog;
1768	objset_t	*os;
1769	dmu_tx_t	*tx;
1770	int		error;
1771	ksid_t		*ksid;
1772	uid_t		uid;
1773	gid_t		gid = crgetgid(cr);
1774	zfs_acl_ids_t   acl_ids;
1775	boolean_t	fuid_dirtied;
1776	void		*vsecp = NULL;
1777	int		flag = 0;
1778	uint64_t	txtype;
1779
1780	/*
1781	 * If we have an ephemeral id, ACL, or XVATTR then
1782	 * make sure file system is at proper version
1783	 */
1784
1785	ksid = crgetsid(cr, KSID_OWNER);
1786	if (ksid)
1787		uid = ksid_getid(ksid);
1788	else
1789		uid = crgetuid(cr);
1790
1791	if (zfsvfs->z_use_fuids == B_FALSE &&
1792	    (vsecp || (vap->va_mask & AT_XVATTR) ||
1793	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1794		return (SET_ERROR(EINVAL));
1795
1796	ZFS_ENTER(zfsvfs);
1797	ZFS_VERIFY_ZP(dzp);
1798	os = zfsvfs->z_os;
1799	zilog = zfsvfs->z_log;
1800
1801	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
1802	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1803		ZFS_EXIT(zfsvfs);
1804		return (SET_ERROR(EILSEQ));
1805	}
1806
1807	if (vap->va_mask & AT_XVATTR) {
1808		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
1809		    crgetuid(cr), cr, vap->va_type)) != 0) {
1810			ZFS_EXIT(zfsvfs);
1811			return (error);
1812		}
1813	}
1814
1815	*vpp = NULL;
1816
1817	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
1818		vap->va_mode &= ~S_ISVTX;
1819
1820	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
1821	if (error) {
1822		ZFS_EXIT(zfsvfs);
1823		return (error);
1824	}
1825	ASSERT3P(zp, ==, NULL);
1826
1827	/*
1828	 * Create a new file object and update the directory
1829	 * to reference it.
1830	 */
1831	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
1832		goto out;
1833	}
1834
1835	/*
1836	 * We only support the creation of regular files in
1837	 * extended attribute directories.
1838	 */
1839
1840	if ((dzp->z_pflags & ZFS_XATTR) &&
1841	    (vap->va_type != VREG)) {
1842		error = SET_ERROR(EINVAL);
1843		goto out;
1844	}
1845
1846	if ((error = zfs_acl_ids_create(dzp, 0, vap,
1847	    cr, vsecp, &acl_ids)) != 0)
1848		goto out;
1849
1850	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1851		zfs_acl_ids_free(&acl_ids);
1852		error = SET_ERROR(EDQUOT);
1853		goto out;
1854	}
1855
1856	getnewvnode_reserve(1);
1857
1858	tx = dmu_tx_create(os);
1859
1860	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1861	    ZFS_SA_BASE_ATTR_SIZE);
1862
1863	fuid_dirtied = zfsvfs->z_fuid_dirty;
1864	if (fuid_dirtied)
1865		zfs_fuid_txhold(zfsvfs, tx);
1866	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1867	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
1868	if (!zfsvfs->z_use_sa &&
1869	    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1870		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1871		    0, acl_ids.z_aclp->z_acl_bytes);
1872	}
1873	error = dmu_tx_assign(tx, TXG_WAIT);
1874	if (error) {
1875		zfs_acl_ids_free(&acl_ids);
1876		dmu_tx_abort(tx);
1877		getnewvnode_drop_reserve();
1878		ZFS_EXIT(zfsvfs);
1879		return (error);
1880	}
1881	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1882
1883	if (fuid_dirtied)
1884		zfs_fuid_sync(zfsvfs, tx);
1885
1886	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
1887	txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1888	zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1889	    vsecp, acl_ids.z_fuidp, vap);
1890	zfs_acl_ids_free(&acl_ids);
1891	dmu_tx_commit(tx);
1892
1893	getnewvnode_drop_reserve();
1894
1895out:
1896	if (error == 0) {
1897		*vpp = ZTOV(zp);
1898	}
1899
1900	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1901		zil_commit(zilog, 0);
1902
1903	ZFS_EXIT(zfsvfs);
1904	return (error);
1905}
1906
1907/*
1908 * Remove an entry from a directory.
1909 *
1910 *	IN:	dvp	- vnode of directory to remove entry from.
1911 *		name	- name of entry to remove.
1912 *		cr	- credentials of caller.
1913 *		ct	- caller context
1914 *		flags	- case flags
1915 *
1916 *	RETURN:	0 on success, error code on failure.
1917 *
1918 * Timestamps:
1919 *	dvp - ctime|mtime
1920 *	 vp - ctime (if nlink > 0)
1921 */
1922
1923/*ARGSUSED*/
1924static int
1925zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
1926{
1927	znode_t		*dzp = VTOZ(dvp);
1928	znode_t		*zp = VTOZ(vp);
1929	znode_t		*xzp;
1930	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1931	zilog_t		*zilog;
1932	uint64_t	acl_obj, xattr_obj;
1933	uint64_t	obj = 0;
1934	dmu_tx_t	*tx;
1935	boolean_t	unlinked, toobig = FALSE;
1936	uint64_t	txtype;
1937	int		error;
1938
1939	ZFS_ENTER(zfsvfs);
1940	ZFS_VERIFY_ZP(dzp);
1941	ZFS_VERIFY_ZP(zp);
1942	zilog = zfsvfs->z_log;
1943	zp = VTOZ(vp);
1944
1945	xattr_obj = 0;
1946	xzp = NULL;
1947
1948	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1949		goto out;
1950	}
1951
1952	/*
1953	 * Need to use rmdir for removing directories.
1954	 */
1955	if (vp->v_type == VDIR) {
1956		error = SET_ERROR(EPERM);
1957		goto out;
1958	}
1959
1960	vnevent_remove(vp, dvp, name, ct);
1961
1962	obj = zp->z_id;
1963
1964	/* are there any extended attributes? */
1965	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1966	    &xattr_obj, sizeof (xattr_obj));
1967	if (error == 0 && xattr_obj) {
1968		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1969		ASSERT0(error);
1970	}
1971
1972	/*
1973	 * We may delete the znode now, or we may put it in the unlinked set;
1974	 * it depends on whether we're the last link, and on whether there are
1975	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
1976	 * allow for either case.
1977	 */
1978	tx = dmu_tx_create(zfsvfs->z_os);
1979	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1980	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1981	zfs_sa_upgrade_txholds(tx, zp);
1982	zfs_sa_upgrade_txholds(tx, dzp);
1983
1984	if (xzp) {
1985		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1986		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1987	}
1988
1989	/* charge as an update -- would be nice not to charge at all */
1990	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1991
1992	/*
1993	 * Mark this transaction as typically resulting in a net free of space
1994	 */
1995	dmu_tx_mark_netfree(tx);
1996
1997	error = dmu_tx_assign(tx, TXG_WAIT);
1998	if (error) {
1999		dmu_tx_abort(tx);
2000		ZFS_EXIT(zfsvfs);
2001		return (error);
2002	}
2003
2004	/*
2005	 * Remove the directory entry.
2006	 */
2007	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);
2008
2009	if (error) {
2010		dmu_tx_commit(tx);
2011		goto out;
2012	}
2013
2014	if (unlinked) {
2015		zfs_unlinked_add(zp, tx);
2016		vp->v_vflag |= VV_NOSYNC;
2017	}
2018
2019	txtype = TX_REMOVE;
2020	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
2021
2022	dmu_tx_commit(tx);
2023out:
2024
2025	if (xzp)
2026		vrele(ZTOV(xzp));
2027
2028	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2029		zil_commit(zilog, 0);
2030
2031	ZFS_EXIT(zfsvfs);
2032	return (error);
2033}
2034
2035/*
2036 * Create a new directory and insert it into dvp using the name
2037 * provided.  Return a pointer to the inserted directory.
2038 *
2039 *	IN:	dvp	- vnode of directory to add subdir to.
2040 *		dirname	- name of new directory.
2041 *		vap	- attributes of new directory.
2042 *		cr	- credentials of caller.
2043 *		ct	- caller context
2044 *		flags	- case flags
2045 *		vsecp	- ACL to be set
2046 *
2047 *	OUT:	vpp	- vnode of created directory.
2048 *
2049 *	RETURN:	0 on success, error code on failure.
2050 *
2051 * Timestamps:
2052 *	dvp - ctime|mtime updated
2053 *	 vp - ctime|mtime|atime updated
2054 */
2055/*ARGSUSED*/
2056static int
2057zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
2058{
2059	znode_t		*zp, *dzp = VTOZ(dvp);
2060	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2061	zilog_t		*zilog;
2062	uint64_t	txtype;
2063	dmu_tx_t	*tx;
2064	int		error;
2065	ksid_t		*ksid;
2066	uid_t		uid;
2067	gid_t		gid = crgetgid(cr);
2068	zfs_acl_ids_t   acl_ids;
2069	boolean_t	fuid_dirtied;
2070
2071	ASSERT(vap->va_type == VDIR);
2072
2073	/*
2074	 * If we have an ephemeral id, ACL, or XVATTR then
2075	 * make sure file system is at proper version
2076	 */
2077
2078	ksid = crgetsid(cr, KSID_OWNER);
2079	if (ksid)
2080		uid = ksid_getid(ksid);
2081	else
2082		uid = crgetuid(cr);
2083	if (zfsvfs->z_use_fuids == B_FALSE &&
2084	    ((vap->va_mask & AT_XVATTR) ||
2085	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
2086		return (SET_ERROR(EINVAL));
2087
2088	ZFS_ENTER(zfsvfs);
2089	ZFS_VERIFY_ZP(dzp);
2090	zilog = zfsvfs->z_log;
2091
2092	if (dzp->z_pflags & ZFS_XATTR) {
2093		ZFS_EXIT(zfsvfs);
2094		return (SET_ERROR(EINVAL));
2095	}
2096
2097	if (zfsvfs->z_utf8 && u8_validate(dirname,
2098	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2099		ZFS_EXIT(zfsvfs);
2100		return (SET_ERROR(EILSEQ));
2101	}
2102
2103	if (vap->va_mask & AT_XVATTR) {
2104		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
2105		    crgetuid(cr), cr, vap->va_type)) != 0) {
2106			ZFS_EXIT(zfsvfs);
2107			return (error);
2108		}
2109	}
2110
2111	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
2112	    NULL, &acl_ids)) != 0) {
2113		ZFS_EXIT(zfsvfs);
2114		return (error);
2115	}
2116
2117	/*
2118	 * First make sure the new directory doesn't exist.
2119	 *
2120	 * Existence is checked first to make sure we don't return
2121	 * EACCES instead of EEXIST which can cause some applications
2122	 * to fail.
2123	 */
2124	*vpp = NULL;
2125
2126	if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) {
2127		zfs_acl_ids_free(&acl_ids);
2128		ZFS_EXIT(zfsvfs);
2129		return (error);
2130	}
2131	ASSERT3P(zp, ==, NULL);
2132
2133	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
2134		zfs_acl_ids_free(&acl_ids);
2135		ZFS_EXIT(zfsvfs);
2136		return (error);
2137	}
2138
2139	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
2140		zfs_acl_ids_free(&acl_ids);
2141		ZFS_EXIT(zfsvfs);
2142		return (SET_ERROR(EDQUOT));
2143	}
2144
2145	/*
2146	 * Add a new entry to the directory.
2147	 */
2148	getnewvnode_reserve(1);
2149	tx = dmu_tx_create(zfsvfs->z_os);
2150	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
2151	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2152	fuid_dirtied = zfsvfs->z_fuid_dirty;
2153	if (fuid_dirtied)
2154		zfs_fuid_txhold(zfsvfs, tx);
2155	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2156		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
2157		    acl_ids.z_aclp->z_acl_bytes);
2158	}
2159
2160	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
2161	    ZFS_SA_BASE_ATTR_SIZE);
2162
2163	error = dmu_tx_assign(tx, TXG_WAIT);
2164	if (error) {
2165		zfs_acl_ids_free(&acl_ids);
2166		dmu_tx_abort(tx);
2167		getnewvnode_drop_reserve();
2168		ZFS_EXIT(zfsvfs);
2169		return (error);
2170	}
2171
2172	/*
2173	 * Create new node.
2174	 */
2175	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
2176
2177	if (fuid_dirtied)
2178		zfs_fuid_sync(zfsvfs, tx);
2179
2180	/*
2181	 * Now put new name in parent dir.
2182	 */
2183	(void) zfs_link_create(dzp, dirname, zp, tx, ZNEW);
2184
2185	*vpp = ZTOV(zp);
2186
2187	txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
2188	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
2189	    acl_ids.z_fuidp, vap);
2190
2191	zfs_acl_ids_free(&acl_ids);
2192
2193	dmu_tx_commit(tx);
2194
2195	getnewvnode_drop_reserve();
2196
2197	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2198		zil_commit(zilog, 0);
2199
2200	ZFS_EXIT(zfsvfs);
2201	return (0);
2202}
2203
2204/*
2205 * Remove a directory subdir entry.  If the current working
2206 * directory is the same as the subdir to be removed, the
2207 * remove will fail.
2208 *
2209 *	IN:	dvp	- vnode of directory to remove from.
2210 *		name	- name of directory to be removed.
2211 *		cwd	- vnode of current working directory.
2212 *		cr	- credentials of caller.
2213 *		ct	- caller context
2214 *		flags	- case flags
2215 *
2216 *	RETURN:	0 on success, error code on failure.
2217 *
2218 * Timestamps:
2219 *	dvp - ctime|mtime updated
2220 */
2221/*ARGSUSED*/
2222static int
2223zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
2224{
2225	znode_t		*dzp = VTOZ(dvp);
2226	znode_t		*zp = VTOZ(vp);
2227	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2228	zilog_t		*zilog;
2229	dmu_tx_t	*tx;
2230	int		error;
2231
2232	ZFS_ENTER(zfsvfs);
2233	ZFS_VERIFY_ZP(dzp);
2234	ZFS_VERIFY_ZP(zp);
2235	zilog = zfsvfs->z_log;
2236
2237
2238	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
2239		goto out;
2240	}
2241
2242	if (vp->v_type != VDIR) {
2243		error = SET_ERROR(ENOTDIR);
2244		goto out;
2245	}
2246
2247	vnevent_rmdir(vp, dvp, name, ct);
2248
2249	tx = dmu_tx_create(zfsvfs->z_os);
2250	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2251	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2252	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2253	zfs_sa_upgrade_txholds(tx, zp);
2254	zfs_sa_upgrade_txholds(tx, dzp);
2255	dmu_tx_mark_netfree(tx);
2256	error = dmu_tx_assign(tx, TXG_WAIT);
2257	if (error) {
2258		dmu_tx_abort(tx);
2259		ZFS_EXIT(zfsvfs);
2260		return (error);
2261	}
2262
2263	cache_purge(dvp);
2264
2265	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);
2266
2267	if (error == 0) {
2268		uint64_t txtype = TX_RMDIR;
2269		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
2270	}
2271
2272	dmu_tx_commit(tx);
2273
2274	cache_purge(vp);
2275out:
2276	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2277		zil_commit(zilog, 0);
2278
2279	ZFS_EXIT(zfsvfs);
2280	return (error);
2281}
2282
2283/*
2284 * Read as many directory entries as will fit into the provided
2285 * buffer from the given directory cursor position (specified in
2286 * the uio structure).
2287 *
2288 *	IN:	vp	- vnode of directory to read.
2289 *		uio	- structure supplying read location, range info,
2290 *			  and return buffer.
2291 *		cr	- credentials of caller.
2292 *		ct	- caller context
2293 *		flags	- case flags
2294 *
2295 *	OUT:	uio	- updated offset and range, buffer filled.
2296 *		eofp	- set to true if end-of-file detected.
2297 *
2298 *	RETURN:	0 on success, error code on failure.
2299 *
2300 * Timestamps:
2301 *	vp - atime updated
2302 *
2303 * Note that the low 4 bits of the cookie returned by zap is always zero.
2304 * This allows us to use the low range for "special" directory entries:
2305 * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
2306 * we use the offset 2 for the '.zfs' directory.
2307 */
2308/* ARGSUSED */
2309static int
2310zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
2311{
2312	znode_t		*zp = VTOZ(vp);
2313	iovec_t		*iovp;
2314	edirent_t	*eodp;
2315	dirent64_t	*odp;
2316	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2317	objset_t	*os;
2318	caddr_t		outbuf;
2319	size_t		bufsize;
2320	zap_cursor_t	zc;
2321	zap_attribute_t	zap;
2322	uint_t		bytes_wanted;
2323	uint64_t	offset; /* must be unsigned; checks for < 1 */
2324	uint64_t	parent;
2325	int		local_eof;
2326	int		outcount;
2327	int		error;
2328	uint8_t		prefetch;
2329	boolean_t	check_sysattrs;
2330	uint8_t		type;
2331	int		ncooks;
2332	u_long		*cooks = NULL;
2333	int		flags = 0;
2334
2335	ZFS_ENTER(zfsvfs);
2336	ZFS_VERIFY_ZP(zp);
2337
2338	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
2339	    &parent, sizeof (parent))) != 0) {
2340		ZFS_EXIT(zfsvfs);
2341		return (error);
2342	}
2343
2344	/*
2345	 * If we are not given an eof variable,
2346	 * use a local one.
2347	 */
2348	if (eofp == NULL)
2349		eofp = &local_eof;
2350
2351	/*
2352	 * Check for valid iov_len.
2353	 */
2354	if (uio->uio_iov->iov_len <= 0) {
2355		ZFS_EXIT(zfsvfs);
2356		return (SET_ERROR(EINVAL));
2357	}
2358
2359	/*
2360	 * Quit if directory has been removed (posix)
2361	 */
2362	if ((*eofp = zp->z_unlinked) != 0) {
2363		ZFS_EXIT(zfsvfs);
2364		return (0);
2365	}
2366
2367	error = 0;
2368	os = zfsvfs->z_os;
2369	offset = uio->uio_loffset;
2370	prefetch = zp->z_zn_prefetch;
2371
2372	/*
2373	 * Initialize the iterator cursor.
2374	 */
2375	if (offset <= 3) {
2376		/*
2377		 * Start iteration from the beginning of the directory.
2378		 */
2379		zap_cursor_init(&zc, os, zp->z_id);
2380	} else {
2381		/*
2382		 * The offset is a serialized cursor.
2383		 */
2384		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
2385	}
2386
2387	/*
2388	 * Get space to change directory entries into fs independent format.
2389	 */
2390	iovp = uio->uio_iov;
2391	bytes_wanted = iovp->iov_len;
2392	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
2393		bufsize = bytes_wanted;
2394		outbuf = kmem_alloc(bufsize, KM_SLEEP);
2395		odp = (struct dirent64 *)outbuf;
2396	} else {
2397		bufsize = bytes_wanted;
2398		outbuf = NULL;
2399		odp = (struct dirent64 *)iovp->iov_base;
2400	}
2401	eodp = (struct edirent *)odp;
2402
2403	if (ncookies != NULL) {
2404		/*
2405		 * Minimum entry size is dirent size and 1 byte for a file name.
2406		 */
2407		ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
2408		cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
2409		*cookies = cooks;
2410		*ncookies = ncooks;
2411	}
2412	/*
2413	 * If this VFS supports the system attribute view interface; and
2414	 * we're looking at an extended attribute directory; and we care
2415	 * about normalization conflicts on this vfs; then we must check
2416	 * for normalization conflicts with the sysattr name space.
2417	 */
2418#ifdef TODO
2419	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
2420	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
2421	    (flags & V_RDDIR_ENTFLAGS);
2422#else
2423	check_sysattrs = 0;
2424#endif
2425
2426	/*
2427	 * Transform to file-system independent format
2428	 */
2429	outcount = 0;
2430	while (outcount < bytes_wanted) {
2431		ino64_t objnum;
2432		ushort_t reclen;
2433		off64_t *next = NULL;
2434
2435		/*
2436		 * Special case `.', `..', and `.zfs'.
2437		 */
2438		if (offset == 0) {
2439			(void) strcpy(zap.za_name, ".");
2440			zap.za_normalization_conflict = 0;
2441			objnum = zp->z_id;
2442			type = DT_DIR;
2443		} else if (offset == 1) {
2444			(void) strcpy(zap.za_name, "..");
2445			zap.za_normalization_conflict = 0;
2446			objnum = parent;
2447			type = DT_DIR;
2448		} else if (offset == 2 && zfs_show_ctldir(zp)) {
2449			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
2450			zap.za_normalization_conflict = 0;
2451			objnum = ZFSCTL_INO_ROOT;
2452			type = DT_DIR;
2453		} else {
2454			/*
2455			 * Grab next entry.
2456			 */
2457			if (error = zap_cursor_retrieve(&zc, &zap)) {
2458				if ((*eofp = (error == ENOENT)) != 0)
2459					break;
2460				else
2461					goto update;
2462			}
2463
2464			if (zap.za_integer_length != 8 ||
2465			    zap.za_num_integers != 1) {
2466				cmn_err(CE_WARN, "zap_readdir: bad directory "
2467				    "entry, obj = %lld, offset = %lld\n",
2468				    (u_longlong_t)zp->z_id,
2469				    (u_longlong_t)offset);
2470				error = SET_ERROR(ENXIO);
2471				goto update;
2472			}
2473
2474			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
2475			/*
2476			 * MacOS X can extract the object type here such as:
2477			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2478			 */
2479			type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2480
2481			if (check_sysattrs && !zap.za_normalization_conflict) {
2482#ifdef TODO
2483				zap.za_normalization_conflict =
2484				    xattr_sysattr_casechk(zap.za_name);
2485#else
2486				panic("%s:%u: TODO", __func__, __LINE__);
2487#endif
2488			}
2489		}
2490
2491		if (flags & V_RDDIR_ACCFILTER) {
2492			/*
2493			 * If we have no access at all, don't include
2494			 * this entry in the returned information
2495			 */
2496			znode_t	*ezp;
2497			if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
2498				goto skip_entry;
2499			if (!zfs_has_access(ezp, cr)) {
2500				vrele(ZTOV(ezp));
2501				goto skip_entry;
2502			}
2503			vrele(ZTOV(ezp));
2504		}
2505
2506		if (flags & V_RDDIR_ENTFLAGS)
2507			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
2508		else
2509			reclen = DIRENT64_RECLEN(strlen(zap.za_name));
2510
2511		/*
2512		 * Will this entry fit in the buffer?
2513		 */
2514		if (outcount + reclen > bufsize) {
2515			/*
2516			 * Did we manage to fit anything in the buffer?
2517			 */
2518			if (!outcount) {
2519				error = SET_ERROR(EINVAL);
2520				goto update;
2521			}
2522			break;
2523		}
2524		if (flags & V_RDDIR_ENTFLAGS) {
2525			/*
2526			 * Add extended flag entry:
2527			 */
2528			eodp->ed_ino = objnum;
2529			eodp->ed_reclen = reclen;
2530			/* NOTE: ed_off is the offset for the *next* entry */
2531			next = &(eodp->ed_off);
2532			eodp->ed_eflags = zap.za_normalization_conflict ?
2533			    ED_CASE_CONFLICT : 0;
2534			(void) strncpy(eodp->ed_name, zap.za_name,
2535			    EDIRENT_NAMELEN(reclen));
2536			eodp = (edirent_t *)((intptr_t)eodp + reclen);
2537		} else {
2538			/*
2539			 * Add normal entry:
2540			 */
2541			odp->d_ino = objnum;
2542			odp->d_reclen = reclen;
2543			odp->d_namlen = strlen(zap.za_name);
2544			(void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
2545			odp->d_type = type;
2546			odp = (dirent64_t *)((intptr_t)odp + reclen);
2547		}
2548		outcount += reclen;
2549
2550		ASSERT(outcount <= bufsize);
2551
2552		/* Prefetch znode */
2553		if (prefetch)
2554			dmu_prefetch(os, objnum, 0, 0, 0,
2555			    ZIO_PRIORITY_SYNC_READ);
2556
2557	skip_entry:
2558		/*
2559		 * Move to the next entry, fill in the previous offset.
2560		 */
2561		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2562			zap_cursor_advance(&zc);
2563			offset = zap_cursor_serialize(&zc);
2564		} else {
2565			offset += 1;
2566		}
2567
2568		if (cooks != NULL) {
2569			*cooks++ = offset;
2570			ncooks--;
2571			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
2572		}
2573	}
2574	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2575
2576	/* Subtract unused cookies */
2577	if (ncookies != NULL)
2578		*ncookies -= ncooks;
2579
2580	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
2581		iovp->iov_base += outcount;
2582		iovp->iov_len -= outcount;
2583		uio->uio_resid -= outcount;
2584	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
2585		/*
2586		 * Reset the pointer.
2587		 */
2588		offset = uio->uio_loffset;
2589	}
2590
2591update:
2592	zap_cursor_fini(&zc);
2593	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
2594		kmem_free(outbuf, bufsize);
2595
2596	if (error == ENOENT)
2597		error = 0;
2598
2599	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2600
2601	uio->uio_loffset = offset;
2602	ZFS_EXIT(zfsvfs);
2603	if (error != 0 && cookies != NULL) {
2604		free(*cookies, M_TEMP);
2605		*cookies = NULL;
2606		*ncookies = 0;
2607	}
2608	return (error);
2609}
2610
2611ulong_t zfs_fsync_sync_cnt = 4;
2612
2613static int
2614zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
2615{
2616	znode_t	*zp = VTOZ(vp);
2617	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2618
2619	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2620
2621	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
2622		ZFS_ENTER(zfsvfs);
2623		ZFS_VERIFY_ZP(zp);
2624		zil_commit(zfsvfs->z_log, zp->z_id);
2625		ZFS_EXIT(zfsvfs);
2626	}
2627	return (0);
2628}
2629
2630
2631/*
2632 * Get the requested file attributes and place them in the provided
2633 * vattr structure.
2634 *
2635 *	IN:	vp	- vnode of file.
2636 *		vap	- va_mask identifies requested attributes.
2637 *			  If AT_XVATTR set, then optional attrs are requested
2638 *		flags	- ATTR_NOACLCHECK (CIFS server context)
2639 *		cr	- credentials of caller.
2640 *		ct	- caller context
2641 *
2642 *	OUT:	vap	- attribute values.
2643 *
2644 *	RETURN:	0 (always succeeds).
2645 */
2646/* ARGSUSED */
2647static int
2648zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2649    caller_context_t *ct)
2650{
2651	znode_t *zp = VTOZ(vp);
2652	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2653	int	error = 0;
2654	uint32_t blksize;
2655	u_longlong_t nblocks;
2656	uint64_t links;
2657	uint64_t mtime[2], ctime[2], crtime[2], rdev;
2658	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
2659	xoptattr_t *xoap = NULL;
2660	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2661	sa_bulk_attr_t bulk[4];
2662	int count = 0;
2663
2664	ZFS_ENTER(zfsvfs);
2665	ZFS_VERIFY_ZP(zp);
2666
2667	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
2668
2669	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
2670	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
2671	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
2672	if (vp->v_type == VBLK || vp->v_type == VCHR)
2673		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
2674		    &rdev, 8);
2675
2676	if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
2677		ZFS_EXIT(zfsvfs);
2678		return (error);
2679	}
2680
2681	/*
2682	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
2683	 * Also, if we are the owner don't bother, since owner should
2684	 * always be allowed to read basic attributes of file.
2685	 */
2686	if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
2687	    (vap->va_uid != crgetuid(cr))) {
2688		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
2689		    skipaclchk, cr)) {
2690			ZFS_EXIT(zfsvfs);
2691			return (error);
2692		}
2693	}
2694
2695	/*
2696	 * Return all attributes.  It's cheaper to provide the answer
2697	 * than to determine whether we were asked the question.
2698	 */
2699
2700	vap->va_type = IFTOVT(zp->z_mode);
2701	vap->va_mode = zp->z_mode & ~S_IFMT;
2702#ifdef illumos
2703	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
2704#else
2705	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
2706#endif
2707	vap->va_nodeid = zp->z_id;
2708	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
2709		links = zp->z_links + 1;
2710	else
2711		links = zp->z_links;
2712	vap->va_nlink = MIN(links, LINK_MAX);	/* nlink_t limit! */
2713	vap->va_size = zp->z_size;
2714#ifdef illumos
2715	vap->va_rdev = vp->v_rdev;
2716#else
2717	if (vp->v_type == VBLK || vp->v_type == VCHR)
2718		vap->va_rdev = zfs_cmpldev(rdev);
2719#endif
2720	vap->va_seq = zp->z_seq;
2721	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */
2722	vap->va_filerev = zp->z_seq;
2723
2724	/*
2725	 * Add in any requested optional attributes and the create time.
2726	 * Also set the corresponding bits in the returned attribute bitmap.
2727	 */
2728	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
2729		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
2730			xoap->xoa_archive =
2731			    ((zp->z_pflags & ZFS_ARCHIVE) != 0);
2732			XVA_SET_RTN(xvap, XAT_ARCHIVE);
2733		}
2734
2735		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
2736			xoap->xoa_readonly =
2737			    ((zp->z_pflags & ZFS_READONLY) != 0);
2738			XVA_SET_RTN(xvap, XAT_READONLY);
2739		}
2740
2741		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
2742			xoap->xoa_system =
2743			    ((zp->z_pflags & ZFS_SYSTEM) != 0);
2744			XVA_SET_RTN(xvap, XAT_SYSTEM);
2745		}
2746
2747		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
2748			xoap->xoa_hidden =
2749			    ((zp->z_pflags & ZFS_HIDDEN) != 0);
2750			XVA_SET_RTN(xvap, XAT_HIDDEN);
2751		}
2752
2753		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2754			xoap->xoa_nounlink =
2755			    ((zp->z_pflags & ZFS_NOUNLINK) != 0);
2756			XVA_SET_RTN(xvap, XAT_NOUNLINK);
2757		}
2758
2759		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2760			xoap->xoa_immutable =
2761			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
2762			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
2763		}
2764
2765		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2766			xoap->xoa_appendonly =
2767			    ((zp->z_pflags & ZFS_APPENDONLY) != 0);
2768			XVA_SET_RTN(xvap, XAT_APPENDONLY);
2769		}
2770
2771		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2772			xoap->xoa_nodump =
2773			    ((zp->z_pflags & ZFS_NODUMP) != 0);
2774			XVA_SET_RTN(xvap, XAT_NODUMP);
2775		}
2776
2777		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
2778			xoap->xoa_opaque =
2779			    ((zp->z_pflags & ZFS_OPAQUE) != 0);
2780			XVA_SET_RTN(xvap, XAT_OPAQUE);
2781		}
2782
2783		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2784			xoap->xoa_av_quarantined =
2785			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
2786			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
2787		}
2788
2789		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2790			xoap->xoa_av_modified =
2791			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
2792			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
2793		}
2794
2795		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
2796		    vp->v_type == VREG) {
2797			zfs_sa_get_scanstamp(zp, xvap);
2798		}
2799
2800		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
2801			uint64_t times[2];
2802
2803			(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
2804			    times, sizeof (times));
2805			ZFS_TIME_DECODE(&xoap->xoa_createtime, times);
2806			XVA_SET_RTN(xvap, XAT_CREATETIME);
2807		}
2808
2809		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2810			xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
2811			XVA_SET_RTN(xvap, XAT_REPARSE);
2812		}
2813		if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
2814			xoap->xoa_generation = zp->z_gen;
2815			XVA_SET_RTN(xvap, XAT_GEN);
2816		}
2817
2818		if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
2819			xoap->xoa_offline =
2820			    ((zp->z_pflags & ZFS_OFFLINE) != 0);
2821			XVA_SET_RTN(xvap, XAT_OFFLINE);
2822		}
2823
2824		if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
2825			xoap->xoa_sparse =
2826			    ((zp->z_pflags & ZFS_SPARSE) != 0);
2827			XVA_SET_RTN(xvap, XAT_SPARSE);
2828		}
2829	}
2830
2831	ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
2832	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
2833	ZFS_TIME_DECODE(&vap->va_ctime, ctime);
2834	ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
2835
2836
2837	sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
2838	vap->va_blksize = blksize;
2839	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */
2840
2841	if (zp->z_blksz == 0) {
2842		/*
2843		 * Block size hasn't been set; suggest maximal I/O transfers.
2844		 */
2845		vap->va_blksize = zfsvfs->z_max_blksz;
2846	}
2847
2848	ZFS_EXIT(zfsvfs);
2849	return (0);
2850}
2851
2852/*
2853 * Set the file attributes to the values contained in the
2854 * vattr structure.
2855 *
2856 *	IN:	vp	- vnode of file to be modified.
2857 *		vap	- new attribute values.
2858 *			  If AT_XVATTR set, then optional attrs are being set
2859 *		flags	- ATTR_UTIME set if non-default time values provided.
2860 *			- ATTR_NOACLCHECK (CIFS context only).
2861 *		cr	- credentials of caller.
2862 *		ct	- caller context
2863 *
2864 *	RETURN:	0 on success, error code on failure.
2865 *
2866 * Timestamps:
2867 *	vp - ctime updated, mtime updated if size changed.
2868 */
2869/* ARGSUSED */
2870static int
2871zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2872    caller_context_t *ct)
2873{
2874	znode_t		*zp = VTOZ(vp);
2875	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2876	zilog_t		*zilog;
2877	dmu_tx_t	*tx;
2878	vattr_t		oldva;
2879	xvattr_t	tmpxvattr;
2880	uint_t		mask = vap->va_mask;
2881	uint_t		saved_mask = 0;
2882	uint64_t	saved_mode;
2883	int		trim_mask = 0;
2884	uint64_t	new_mode;
2885	uint64_t	new_uid, new_gid;
2886	uint64_t	xattr_obj;
2887	uint64_t	mtime[2], ctime[2];
2888	znode_t		*attrzp;
2889	int		need_policy = FALSE;
2890	int		err, err2;
2891	zfs_fuid_info_t *fuidp = NULL;
2892	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
2893	xoptattr_t	*xoap;
2894	zfs_acl_t	*aclp;
2895	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2896	boolean_t	fuid_dirtied = B_FALSE;
2897	sa_bulk_attr_t	bulk[7], xattr_bulk[7];
2898	int		count = 0, xattr_count = 0;
2899
2900	if (mask == 0)
2901		return (0);
2902
2903	if (mask & AT_NOSET)
2904		return (SET_ERROR(EINVAL));
2905
2906	ZFS_ENTER(zfsvfs);
2907	ZFS_VERIFY_ZP(zp);
2908
2909	zilog = zfsvfs->z_log;
2910
2911	/*
2912	 * Make sure that if we have ephemeral uid/gid or xvattr specified
2913	 * that file system is at proper version level
2914	 */
2915
2916	if (zfsvfs->z_use_fuids == B_FALSE &&
2917	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2918	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2919	    (mask & AT_XVATTR))) {
2920		ZFS_EXIT(zfsvfs);
2921		return (SET_ERROR(EINVAL));
2922	}
2923
2924	if (mask & AT_SIZE && vp->v_type == VDIR) {
2925		ZFS_EXIT(zfsvfs);
2926		return (SET_ERROR(EISDIR));
2927	}
2928
2929	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
2930		ZFS_EXIT(zfsvfs);
2931		return (SET_ERROR(EINVAL));
2932	}
2933
2934	/*
2935	 * If this is an xvattr_t, then get a pointer to the structure of
2936	 * optional attributes.  If this is NULL, then we have a vattr_t.
2937	 */
2938	xoap = xva_getxoptattr(xvap);
2939
2940	xva_init(&tmpxvattr);
2941
2942	/*
2943	 * Immutable files can only alter immutable bit and atime
2944	 */
2945	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
2946	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
2947	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2948		ZFS_EXIT(zfsvfs);
2949		return (SET_ERROR(EPERM));
2950	}
2951
2952	if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
2953		ZFS_EXIT(zfsvfs);
2954		return (SET_ERROR(EPERM));
2955	}
2956
2957	/*
2958	 * Verify timestamps doesn't overflow 32 bits.
2959	 * ZFS can handle large timestamps, but 32bit syscalls can't
2960	 * handle times greater than 2039.  This check should be removed
2961	 * once large timestamps are fully supported.
2962	 */
2963	if (mask & (AT_ATIME | AT_MTIME)) {
2964		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2965		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2966			ZFS_EXIT(zfsvfs);
2967			return (SET_ERROR(EOVERFLOW));
2968		}
2969	}
2970
2971	attrzp = NULL;
2972	aclp = NULL;
2973
2974	/* Can this be moved to before the top label? */
2975	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
2976		ZFS_EXIT(zfsvfs);
2977		return (SET_ERROR(EROFS));
2978	}
2979
2980	/*
2981	 * First validate permissions
2982	 */
2983
2984	if (mask & AT_SIZE) {
2985		/*
2986		 * XXX - Note, we are not providing any open
2987		 * mode flags here (like FNDELAY), so we may
2988		 * block if there are locks present... this
2989		 * should be addressed in openat().
2990		 */
2991		/* XXX - would it be OK to generate a log record here? */
2992		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2993		if (err) {
2994			ZFS_EXIT(zfsvfs);
2995			return (err);
2996		}
2997	}
2998
2999	if (mask & (AT_ATIME|AT_MTIME) ||
3000	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
3001	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
3002	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
3003	    XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
3004	    XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
3005	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
3006	    XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
3007		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
3008		    skipaclchk, cr);
3009	}
3010
3011	if (mask & (AT_UID|AT_GID)) {
3012		int	idmask = (mask & (AT_UID|AT_GID));
3013		int	take_owner;
3014		int	take_group;
3015
3016		/*
3017		 * NOTE: even if a new mode is being set,
3018		 * we may clear S_ISUID/S_ISGID bits.
3019		 */
3020
3021		if (!(mask & AT_MODE))
3022			vap->va_mode = zp->z_mode;
3023
3024		/*
3025		 * Take ownership or chgrp to group we are a member of
3026		 */
3027
3028		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
3029		take_group = (mask & AT_GID) &&
3030		    zfs_groupmember(zfsvfs, vap->va_gid, cr);
3031
3032		/*
3033		 * If both AT_UID and AT_GID are set then take_owner and
3034		 * take_group must both be set in order to allow taking
3035		 * ownership.
3036		 *
3037		 * Otherwise, send the check through secpolicy_vnode_setattr()
3038		 *
3039		 */
3040
3041		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
3042		    ((idmask == AT_UID) && take_owner) ||
3043		    ((idmask == AT_GID) && take_group)) {
3044			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
3045			    skipaclchk, cr) == 0) {
3046				/*
3047				 * Remove setuid/setgid for non-privileged users
3048				 */
3049				secpolicy_setid_clear(vap, vp, cr);
3050				trim_mask = (mask & (AT_UID|AT_GID));
3051			} else {
3052				need_policy =  TRUE;
3053			}
3054		} else {
3055			need_policy =  TRUE;
3056		}
3057	}
3058
3059	oldva.va_mode = zp->z_mode;
3060	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
3061	if (mask & AT_XVATTR) {
3062		/*
3063		 * Update xvattr mask to include only those attributes
3064		 * that are actually changing.
3065		 *
3066		 * the bits will be restored prior to actually setting
3067		 * the attributes so the caller thinks they were set.
3068		 */
3069		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
3070			if (xoap->xoa_appendonly !=
3071			    ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
3072				need_policy = TRUE;
3073			} else {
3074				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
3075				XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
3076			}
3077		}
3078
3079		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
3080			if (xoap->xoa_nounlink !=
3081			    ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
3082				need_policy = TRUE;
3083			} else {
3084				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
3085				XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
3086			}
3087		}
3088
3089		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
3090			if (xoap->xoa_immutable !=
3091			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
3092				need_policy = TRUE;
3093			} else {
3094				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
3095				XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
3096			}
3097		}
3098
3099		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
3100			if (xoap->xoa_nodump !=
3101			    ((zp->z_pflags & ZFS_NODUMP) != 0)) {
3102				need_policy = TRUE;
3103			} else {
3104				XVA_CLR_REQ(xvap, XAT_NODUMP);
3105				XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
3106			}
3107		}
3108
3109		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
3110			if (xoap->xoa_av_modified !=
3111			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
3112				need_policy = TRUE;
3113			} else {
3114				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
3115				XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
3116			}
3117		}
3118
3119		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
3120			if ((vp->v_type != VREG &&
3121			    xoap->xoa_av_quarantined) ||
3122			    xoap->xoa_av_quarantined !=
3123			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
3124				need_policy = TRUE;
3125			} else {
3126				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
3127				XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
3128			}
3129		}
3130
3131		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
3132			ZFS_EXIT(zfsvfs);
3133			return (SET_ERROR(EPERM));
3134		}
3135
3136		if (need_policy == FALSE &&
3137		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
3138		    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
3139			need_policy = TRUE;
3140		}
3141	}
3142
3143	if (mask & AT_MODE) {
3144		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
3145			err = secpolicy_setid_setsticky_clear(vp, vap,
3146			    &oldva, cr);
3147			if (err) {
3148				ZFS_EXIT(zfsvfs);
3149				return (err);
3150			}
3151			trim_mask |= AT_MODE;
3152		} else {
3153			need_policy = TRUE;
3154		}
3155	}
3156
3157	if (need_policy) {
3158		/*
3159		 * If trim_mask is set then take ownership
3160		 * has been granted or write_acl is present and user
3161		 * has the ability to modify mode.  In that case remove
3162		 * UID|GID and or MODE from mask so that
3163		 * secpolicy_vnode_setattr() doesn't revoke it.
3164		 */
3165
3166		if (trim_mask) {
3167			saved_mask = vap->va_mask;
3168			vap->va_mask &= ~trim_mask;
3169			if (trim_mask & AT_MODE) {
3170				/*
3171				 * Save the mode, as secpolicy_vnode_setattr()
3172				 * will overwrite it with ova.va_mode.
3173				 */
3174				saved_mode = vap->va_mode;
3175			}
3176		}
3177		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
3178		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
3179		if (err) {
3180			ZFS_EXIT(zfsvfs);
3181			return (err);
3182		}
3183
3184		if (trim_mask) {
3185			vap->va_mask |= saved_mask;
3186			if (trim_mask & AT_MODE) {
3187				/*
3188				 * Recover the mode after
3189				 * secpolicy_vnode_setattr().
3190				 */
3191				vap->va_mode = saved_mode;
3192			}
3193		}
3194	}
3195
3196	/*
3197	 * secpolicy_vnode_setattr, or take ownership may have
3198	 * changed va_mask
3199	 */
3200	mask = vap->va_mask;
3201
3202	if ((mask & (AT_UID | AT_GID))) {
3203		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
3204		    &xattr_obj, sizeof (xattr_obj));
3205
3206		if (err == 0 && xattr_obj) {
3207			err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
3208			if (err == 0) {
3209				err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE);
3210				if (err != 0)
3211					vrele(ZTOV(attrzp));
3212			}
3213			if (err)
3214				goto out2;
3215		}
3216		if (mask & AT_UID) {
3217			new_uid = zfs_fuid_create(zfsvfs,
3218			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
3219			if (new_uid != zp->z_uid &&
3220			    zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
3221				if (attrzp)
3222					vput(ZTOV(attrzp));
3223				err = SET_ERROR(EDQUOT);
3224				goto out2;
3225			}
3226		}
3227
3228		if (mask & AT_GID) {
3229			new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
3230			    cr, ZFS_GROUP, &fuidp);
3231			if (new_gid != zp->z_gid &&
3232			    zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
3233				if (attrzp)
3234					vput(ZTOV(attrzp));
3235				err = SET_ERROR(EDQUOT);
3236				goto out2;
3237			}
3238		}
3239	}
3240	tx = dmu_tx_create(zfsvfs->z_os);
3241
3242	if (mask & AT_MODE) {
3243		uint64_t pmode = zp->z_mode;
3244		uint64_t acl_obj;
3245		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
3246
3247		if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
3248		    !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
3249			err = SET_ERROR(EPERM);
3250			goto out;
3251		}
3252
3253		if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
3254			goto out;
3255
3256		if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
3257			/*
3258			 * Are we upgrading ACL from old V0 format
3259			 * to V1 format?
3260			 */
3261			if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
3262			    zfs_znode_acl_version(zp) ==
3263			    ZFS_ACL_VERSION_INITIAL) {
3264				dmu_tx_hold_free(tx, acl_obj, 0,
3265				    DMU_OBJECT_END);
3266				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3267				    0, aclp->z_acl_bytes);
3268			} else {
3269				dmu_tx_hold_write(tx, acl_obj, 0,
3270				    aclp->z_acl_bytes);
3271			}
3272		} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3273			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3274			    0, aclp->z_acl_bytes);
3275		}
3276		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3277	} else {
3278		if ((mask & AT_XVATTR) &&
3279		    XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3280			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3281		else
3282			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3283	}
3284
3285	if (attrzp) {
3286		dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3287	}
3288
3289	fuid_dirtied = zfsvfs->z_fuid_dirty;
3290	if (fuid_dirtied)
3291		zfs_fuid_txhold(zfsvfs, tx);
3292
3293	zfs_sa_upgrade_txholds(tx, zp);
3294
3295	err = dmu_tx_assign(tx, TXG_WAIT);
3296	if (err)
3297		goto out;
3298
3299	count = 0;
3300	/*
3301	 * Set each attribute requested.
3302	 * We group settings according to the locks they need to acquire.
3303	 *
3304	 * Note: you cannot set ctime directly, although it will be
3305	 * updated as a side-effect of calling this function.
3306	 */
3307
3308	if (mask & (AT_UID|AT_GID|AT_MODE))
3309		mutex_enter(&zp->z_acl_lock);
3310
3311	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3312	    &zp->z_pflags, sizeof (zp->z_pflags));
3313
3314	if (attrzp) {
3315		if (mask & (AT_UID|AT_GID|AT_MODE))
3316			mutex_enter(&attrzp->z_acl_lock);
3317		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3318		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
3319		    sizeof (attrzp->z_pflags));
3320	}
3321
3322	if (mask & (AT_UID|AT_GID)) {
3323
3324		if (mask & AT_UID) {
3325			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
3326			    &new_uid, sizeof (new_uid));
3327			zp->z_uid = new_uid;
3328			if (attrzp) {
3329				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3330				    SA_ZPL_UID(zfsvfs), NULL, &new_uid,
3331				    sizeof (new_uid));
3332				attrzp->z_uid = new_uid;
3333			}
3334		}
3335
3336		if (mask & AT_GID) {
3337			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
3338			    NULL, &new_gid, sizeof (new_gid));
3339			zp->z_gid = new_gid;
3340			if (attrzp) {
3341				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3342				    SA_ZPL_GID(zfsvfs), NULL, &new_gid,
3343				    sizeof (new_gid));
3344				attrzp->z_gid = new_gid;
3345			}
3346		}
3347		if (!(mask & AT_MODE)) {
3348			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
3349			    NULL, &new_mode, sizeof (new_mode));
3350			new_mode = zp->z_mode;
3351		}
3352		err = zfs_acl_chown_setattr(zp);
3353		ASSERT(err == 0);
3354		if (attrzp) {
3355			err = zfs_acl_chown_setattr(attrzp);
3356			ASSERT(err == 0);
3357		}
3358	}
3359
3360	if (mask & AT_MODE) {
3361		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
3362		    &new_mode, sizeof (new_mode));
3363		zp->z_mode = new_mode;
3364		ASSERT3U((uintptr_t)aclp, !=, 0);
3365		err = zfs_aclset_common(zp, aclp, cr, tx);
3366		ASSERT0(err);
3367		if (zp->z_acl_cached)
3368			zfs_acl_free(zp->z_acl_cached);
3369		zp->z_acl_cached = aclp;
3370		aclp = NULL;
3371	}
3372
3373
3374	if (mask & AT_ATIME) {
3375		ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
3376		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
3377		    &zp->z_atime, sizeof (zp->z_atime));
3378	}
3379
3380	if (mask & AT_MTIME) {
3381		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
3382		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
3383		    mtime, sizeof (mtime));
3384	}
3385
3386	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
3387	if (mask & AT_SIZE && !(mask & AT_MTIME)) {
3388		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
3389		    NULL, mtime, sizeof (mtime));
3390		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3391		    &ctime, sizeof (ctime));
3392		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
3393		    B_TRUE);
3394	} else if (mask != 0) {
3395		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3396		    &ctime, sizeof (ctime));
3397		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
3398		    B_TRUE);
3399		if (attrzp) {
3400			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3401			    SA_ZPL_CTIME(zfsvfs), NULL,
3402			    &ctime, sizeof (ctime));
3403			zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
3404			    mtime, ctime, B_TRUE);
3405		}
3406	}
3407	/*
3408	 * Do this after setting timestamps to prevent timestamp
3409	 * update from toggling bit
3410	 */
3411
3412	if (xoap && (mask & AT_XVATTR)) {
3413
3414		/*
3415		 * restore trimmed off masks
3416		 * so that return masks can be set for caller.
3417		 */
3418
3419		if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
3420			XVA_SET_REQ(xvap, XAT_APPENDONLY);
3421		}
3422		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
3423			XVA_SET_REQ(xvap, XAT_NOUNLINK);
3424		}
3425		if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
3426			XVA_SET_REQ(xvap, XAT_IMMUTABLE);
3427		}
3428		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
3429			XVA_SET_REQ(xvap, XAT_NODUMP);
3430		}
3431		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
3432			XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
3433		}
3434		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
3435			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
3436		}
3437
3438		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3439			ASSERT(vp->v_type == VREG);
3440
3441		zfs_xvattr_set(zp, xvap, tx);
3442	}
3443
3444	if (fuid_dirtied)
3445		zfs_fuid_sync(zfsvfs, tx);
3446
3447	if (mask != 0)
3448		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3449
3450	if (mask & (AT_UID|AT_GID|AT_MODE))
3451		mutex_exit(&zp->z_acl_lock);
3452
3453	if (attrzp) {
3454		if (mask & (AT_UID|AT_GID|AT_MODE))
3455			mutex_exit(&attrzp->z_acl_lock);
3456	}
3457out:
3458	if (err == 0 && attrzp) {
3459		err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
3460		    xattr_count, tx);
3461		ASSERT(err2 == 0);
3462	}
3463
3464	if (attrzp)
3465		vput(ZTOV(attrzp));
3466
3467	if (aclp)
3468		zfs_acl_free(aclp);
3469
3470	if (fuidp) {
3471		zfs_fuid_info_free(fuidp);
3472		fuidp = NULL;
3473	}
3474
3475	if (err) {
3476		dmu_tx_abort(tx);
3477	} else {
3478		err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
3479		dmu_tx_commit(tx);
3480	}
3481
3482out2:
3483	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3484		zil_commit(zilog, 0);
3485
3486	ZFS_EXIT(zfsvfs);
3487	return (err);
3488}
3489
3490/*
3491 * We acquire all but fdvp locks using non-blocking acquisitions.  If we
3492 * fail to acquire any lock in the path we will drop all held locks,
3493 * acquire the new lock in a blocking fashion, and then release it and
3494 * restart the rename.  This acquire/release step ensures that we do not
3495 * spin on a lock waiting for release.  On error release all vnode locks
3496 * and decrement references the way tmpfs_rename() would do.
3497 */
3498static int
3499zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
3500    struct vnode *tdvp, struct vnode **tvpp,
3501    const struct componentname *scnp, const struct componentname *tcnp)
3502{
3503	zfsvfs_t	*zfsvfs;
3504	struct vnode	*nvp, *svp, *tvp;
3505	znode_t		*sdzp, *tdzp, *szp, *tzp;
3506	const char	*snm = scnp->cn_nameptr;
3507	const char	*tnm = tcnp->cn_nameptr;
3508	int error;
3509
3510	VOP_UNLOCK(tdvp, 0);
3511	if (*tvpp != NULL && *tvpp != tdvp)
3512		VOP_UNLOCK(*tvpp, 0);
3513
3514relock:
3515	error = vn_lock(sdvp, LK_EXCLUSIVE);
3516	if (error)
3517		goto out;
3518	sdzp = VTOZ(sdvp);
3519
3520	error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
3521	if (error != 0) {
3522		VOP_UNLOCK(sdvp, 0);
3523		if (error != EBUSY)
3524			goto out;
3525		error = vn_lock(tdvp, LK_EXCLUSIVE);
3526		if (error)
3527			goto out;
3528		VOP_UNLOCK(tdvp, 0);
3529		goto relock;
3530	}
3531	tdzp = VTOZ(tdvp);
3532
3533	/*
3534	 * Before using sdzp and tdzp we must ensure that they are live.
3535	 * As a porting legacy from illumos we have two things to worry
3536	 * about.  One is typical for FreeBSD and it is that the vnode is
3537	 * not reclaimed (doomed).  The other is that the znode is live.
3538	 * The current code can invalidate the znode without acquiring the
3539	 * corresponding vnode lock if the object represented by the znode
3540	 * and vnode is no longer valid after a rollback or receive operation.
3541	 * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock
3542	 * that protects the znodes from the invalidation.
3543	 */
3544	zfsvfs = sdzp->z_zfsvfs;
3545	ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
3546	ZFS_ENTER(zfsvfs);
3547
3548	/*
3549	 * We can not use ZFS_VERIFY_ZP() here because it could directly return
3550	 * bypassing the cleanup code in the case of an error.
3551	 */
3552	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
3553		ZFS_EXIT(zfsvfs);
3554		VOP_UNLOCK(sdvp, 0);
3555		VOP_UNLOCK(tdvp, 0);
3556		error = SET_ERROR(EIO);
3557		goto out;
3558	}
3559
3560	/*
3561	 * Re-resolve svp to be certain it still exists and fetch the
3562	 * correct vnode.
3563	 */
3564	error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS);
3565	if (error != 0) {
3566		/* Source entry invalid or not there. */
3567		ZFS_EXIT(zfsvfs);
3568		VOP_UNLOCK(sdvp, 0);
3569		VOP_UNLOCK(tdvp, 0);
3570		if ((scnp->cn_flags & ISDOTDOT) != 0 ||
3571		    (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
3572			error = SET_ERROR(EINVAL);
3573		goto out;
3574	}
3575	svp = ZTOV(szp);
3576
3577	/*
3578	 * Re-resolve tvp, if it disappeared we just carry on.
3579	 */
3580	error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0);
3581	if (error != 0) {
3582		ZFS_EXIT(zfsvfs);
3583		VOP_UNLOCK(sdvp, 0);
3584		VOP_UNLOCK(tdvp, 0);
3585		vrele(svp);
3586		if ((tcnp->cn_flags & ISDOTDOT) != 0)
3587			error = SET_ERROR(EINVAL);
3588		goto out;
3589	}
3590	if (tzp != NULL)
3591		tvp = ZTOV(tzp);
3592	else
3593		tvp = NULL;
3594
3595	/*
3596	 * At present the vnode locks must be acquired before z_teardown_lock,
3597	 * although it would be more logical to use the opposite order.
3598	 */
3599	ZFS_EXIT(zfsvfs);
3600
3601	/*
3602	 * Now try acquire locks on svp and tvp.
3603	 */
3604	nvp = svp;
3605	error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
3606	if (error != 0) {
3607		VOP_UNLOCK(sdvp, 0);
3608		VOP_UNLOCK(tdvp, 0);
3609		if (tvp != NULL)
3610			vrele(tvp);
3611		if (error != EBUSY) {
3612			vrele(nvp);
3613			goto out;
3614		}
3615		error = vn_lock(nvp, LK_EXCLUSIVE);
3616		if (error != 0) {
3617			vrele(nvp);
3618			goto out;
3619		}
3620		VOP_UNLOCK(nvp, 0);
3621		/*
3622		 * Concurrent rename race.
3623		 * XXX ?
3624		 */
3625		if (nvp == tdvp) {
3626			vrele(nvp);
3627			error = SET_ERROR(EINVAL);
3628			goto out;
3629		}
3630		vrele(*svpp);
3631		*svpp = nvp;
3632		goto relock;
3633	}
3634	vrele(*svpp);
3635	*svpp = nvp;
3636
3637	if (*tvpp != NULL)
3638		vrele(*tvpp);
3639	*tvpp = NULL;
3640	if (tvp != NULL) {
3641		nvp = tvp;
3642		error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
3643		if (error != 0) {
3644			VOP_UNLOCK(sdvp, 0);
3645			VOP_UNLOCK(tdvp, 0);
3646			VOP_UNLOCK(*svpp, 0);
3647			if (error != EBUSY) {
3648				vrele(nvp);
3649				goto out;
3650			}
3651			error = vn_lock(nvp, LK_EXCLUSIVE);
3652			if (error != 0) {
3653				vrele(nvp);
3654				goto out;
3655			}
3656			vput(nvp);
3657			goto relock;
3658		}
3659		*tvpp = nvp;
3660	}
3661
3662	return (0);
3663
3664out:
3665	return (error);
3666}
3667
3668/*
3669 * Note that we must use VRELE_ASYNC in this function as it walks
3670 * up the directory tree and vrele may need to acquire an exclusive
3671 * lock if a last reference to a vnode is dropped.
3672 */
3673static int
3674zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
3675{
3676	zfsvfs_t	*zfsvfs;
3677	znode_t		*zp, *zp1;
3678	uint64_t	parent;
3679	int		error;
3680
3681	zfsvfs = tdzp->z_zfsvfs;
3682	if (tdzp == szp)
3683		return (SET_ERROR(EINVAL));
3684	if (tdzp == sdzp)
3685		return (0);
3686	if (tdzp->z_id == zfsvfs->z_root)
3687		return (0);
3688	zp = tdzp;
3689	for (;;) {
3690		ASSERT(!zp->z_unlinked);
3691		if ((error = sa_lookup(zp->z_sa_hdl,
3692		    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
3693			break;
3694
3695		if (parent == szp->z_id) {
3696			error = SET_ERROR(EINVAL);
3697			break;
3698		}
3699		if (parent == zfsvfs->z_root)
3700			break;
3701		if (parent == sdzp->z_id)
3702			break;
3703
3704		error = zfs_zget(zfsvfs, parent, &zp1);
3705		if (error != 0)
3706			break;
3707
3708		if (zp != tdzp)
3709			VN_RELE_ASYNC(ZTOV(zp),
3710			    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
3711		zp = zp1;
3712	}
3713
3714	if (error == ENOTDIR)
3715		panic("checkpath: .. not a directory\n");
3716	if (zp != tdzp)
3717		VN_RELE_ASYNC(ZTOV(zp),
3718		    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
3719	return (error);
3720}
3721
3722/*
3723 * Move an entry from the provided source directory to the target
3724 * directory.  Change the entry name as indicated.
3725 *
3726 *	IN:	sdvp	- Source directory containing the "old entry".
3727 *		snm	- Old entry name.
3728 *		tdvp	- Target directory to contain the "new entry".
3729 *		tnm	- New entry name.
3730 *		cr	- credentials of caller.
3731 *		ct	- caller context
3732 *		flags	- case flags
3733 *
3734 *	RETURN:	0 on success, error code on failure.
3735 *
3736 * Timestamps:
3737 *	sdvp,tdvp - ctime|mtime updated
3738 */
3739/*ARGSUSED*/
3740static int
3741zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
3742    vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
3743    cred_t *cr)
3744{
3745	zfsvfs_t	*zfsvfs;
3746	znode_t		*sdzp, *tdzp, *szp, *tzp;
3747	zilog_t		*zilog = NULL;
3748	dmu_tx_t	*tx;
3749	char		*snm = scnp->cn_nameptr;
3750	char		*tnm = tcnp->cn_nameptr;
3751	int		error = 0;
3752
3753	/* Reject renames across filesystems. */
3754	if ((*svpp)->v_mount != tdvp->v_mount ||
3755	    ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
3756		error = SET_ERROR(EXDEV);
3757		goto out;
3758	}
3759
3760	if (zfsctl_is_node(tdvp)) {
3761		error = SET_ERROR(EXDEV);
3762		goto out;
3763	}
3764
3765	/*
3766	 * Lock all four vnodes to ensure safety and semantics of renaming.
3767	 */
3768	error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
3769	if (error != 0) {
3770		/* no vnodes are locked in the case of error here */
3771		return (error);
3772	}
3773
3774	tdzp = VTOZ(tdvp);
3775	sdzp = VTOZ(sdvp);
3776	zfsvfs = tdzp->z_zfsvfs;
3777	zilog = zfsvfs->z_log;
3778
3779	/*
3780	 * After we re-enter ZFS_ENTER() we will have to revalidate all
3781	 * znodes involved.
3782	 */
3783	ZFS_ENTER(zfsvfs);
3784
3785	if (zfsvfs->z_utf8 && u8_validate(tnm,
3786	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3787		error = SET_ERROR(EILSEQ);
3788		goto unlockout;
3789	}
3790
3791	/* If source and target are the same file, there is nothing to do. */
3792	if ((*svpp) == (*tvpp)) {
3793		error = 0;
3794		goto unlockout;
3795	}
3796
3797	if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
3798	    ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
3799	    (*tvpp)->v_mountedhere != NULL)) {
3800		error = SET_ERROR(EXDEV);
3801		goto unlockout;
3802	}
3803
3804	/*
3805	 * We can not use ZFS_VERIFY_ZP() here because it could directly return
3806	 * bypassing the cleanup code in the case of an error.
3807	 */
3808	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
3809		error = SET_ERROR(EIO);
3810		goto unlockout;
3811	}
3812
3813	szp = VTOZ(*svpp);
3814	tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
3815	if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) {
3816		error = SET_ERROR(EIO);
3817		goto unlockout;
3818	}
3819
3820	/*
3821	 * This is to prevent the creation of links into attribute space
3822	 * by renaming a linked file into/outof an attribute directory.
3823	 * See the comment in zfs_link() for why this is considered bad.
3824	 */
3825	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
3826		error = SET_ERROR(EINVAL);
3827		goto unlockout;
3828	}
3829
3830	/*
3831	 * Must have write access at the source to remove the old entry
3832	 * and write access at the target to create the new entry.
3833	 * Note that if target and source are the same, this can be
3834	 * done in a single check.
3835	 */
3836	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
3837		goto unlockout;
3838
3839	if ((*svpp)->v_type == VDIR) {
3840		/*
3841		 * Avoid ".", "..", and aliases of "." for obvious reasons.
3842		 */
3843		if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
3844		    sdzp == szp ||
3845		    (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
3846			error = EINVAL;
3847			goto unlockout;
3848		}
3849
3850		/*
3851		 * Check to make sure rename is valid.
3852		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
3853		 */
3854		if (error = zfs_rename_check(szp, sdzp, tdzp))
3855			goto unlockout;
3856	}
3857
3858	/*
3859	 * Does target exist?
3860	 */
3861	if (tzp) {
3862		/*
3863		 * Source and target must be the same type.
3864		 */
3865		if ((*svpp)->v_type == VDIR) {
3866			if ((*tvpp)->v_type != VDIR) {
3867				error = SET_ERROR(ENOTDIR);
3868				goto unlockout;
3869			} else {
3870				cache_purge(tdvp);
3871				if (sdvp != tdvp)
3872					cache_purge(sdvp);
3873			}
3874		} else {
3875			if ((*tvpp)->v_type == VDIR) {
3876				error = SET_ERROR(EISDIR);
3877				goto unlockout;
3878			}
3879		}
3880	}
3881
3882	vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
3883	if (tzp)
3884		vnevent_rename_dest(*tvpp, tdvp, tnm, ct);
3885
3886	/*
3887	 * notify the target directory if it is not the same
3888	 * as source directory.
3889	 */
3890	if (tdvp != sdvp) {
3891		vnevent_rename_dest_dir(tdvp, ct);
3892	}
3893
3894	tx = dmu_tx_create(zfsvfs->z_os);
3895	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3896	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
3897	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
3898	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
3899	if (sdzp != tdzp) {
3900		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
3901		zfs_sa_upgrade_txholds(tx, tdzp);
3902	}
3903	if (tzp) {
3904		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
3905		zfs_sa_upgrade_txholds(tx, tzp);
3906	}
3907
3908	zfs_sa_upgrade_txholds(tx, szp);
3909	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3910	error = dmu_tx_assign(tx, TXG_WAIT);
3911	if (error) {
3912		dmu_tx_abort(tx);
3913		goto unlockout;
3914	}
3915
3916
3917	if (tzp)	/* Attempt to remove the existing target */
3918		error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);
3919
3920	if (error == 0) {
3921		error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
3922		if (error == 0) {
3923			szp->z_pflags |= ZFS_AV_MODIFIED;
3924
3925			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
3926			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
3927			ASSERT0(error);
3928
3929			error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING,
3930			    NULL);
3931			if (error == 0) {
3932				zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
3933				    snm, tdzp, tnm, szp);
3934
3935				/*
3936				 * Update path information for the target vnode
3937				 */
3938				vn_renamepath(tdvp, *svpp, tnm, strlen(tnm));
3939			} else {
3940				/*
3941				 * At this point, we have successfully created
3942				 * the target name, but have failed to remove
3943				 * the source name.  Since the create was done
3944				 * with the ZRENAMING flag, there are
3945				 * complications; for one, the link count is
3946				 * wrong.  The easiest way to deal with this
3947				 * is to remove the newly created target, and
3948				 * return the original error.  This must
3949				 * succeed; fortunately, it is very unlikely to
3950				 * fail, since we just created it.
3951				 */
3952				VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx,
3953				    ZRENAMING, NULL), ==, 0);
3954			}
3955		}
3956		if (error == 0) {
3957			cache_purge(*svpp);
3958			if (*tvpp != NULL)
3959				cache_purge(*tvpp);
3960			cache_purge_negative(tdvp);
3961		}
3962	}
3963
3964	dmu_tx_commit(tx);
3965
3966unlockout:			/* all 4 vnodes are locked, ZFS_ENTER called */
3967	ZFS_EXIT(zfsvfs);
3968	VOP_UNLOCK(*svpp, 0);
3969	VOP_UNLOCK(sdvp, 0);
3970
3971out:				/* original two vnodes are locked */
3972	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3973		zil_commit(zilog, 0);
3974
3975	if (*tvpp != NULL)
3976		VOP_UNLOCK(*tvpp, 0);
3977	if (tdvp != *tvpp)
3978		VOP_UNLOCK(tdvp, 0);
3979	return (error);
3980}
3981
3982/*
3983 * Insert the indicated symbolic reference entry into the directory.
3984 *
3985 *	IN:	dvp	- Directory to contain new symbolic link.
3986 *		link	- Name for new symlink entry.
3987 *		vap	- Attributes of new entry.
3988 *		cr	- credentials of caller.
3989 *		ct	- caller context
3990 *		flags	- case flags
3991 *
3992 *	RETURN:	0 on success, error code on failure.
3993 *
3994 * Timestamps:
3995 *	dvp - ctime|mtime updated
3996 */
3997/*ARGSUSED*/
3998static int
3999zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
4000    cred_t *cr, kthread_t *td)
4001{
4002	znode_t		*zp, *dzp = VTOZ(dvp);
4003	dmu_tx_t	*tx;
4004	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
4005	zilog_t		*zilog;
4006	uint64_t	len = strlen(link);
4007	int		error;
4008	zfs_acl_ids_t	acl_ids;
4009	boolean_t	fuid_dirtied;
4010	uint64_t	txtype = TX_SYMLINK;
4011	int		flags = 0;
4012
4013	ASSERT(vap->va_type == VLNK);
4014
4015	ZFS_ENTER(zfsvfs);
4016	ZFS_VERIFY_ZP(dzp);
4017	zilog = zfsvfs->z_log;
4018
4019	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
4020	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4021		ZFS_EXIT(zfsvfs);
4022		return (SET_ERROR(EILSEQ));
4023	}
4024
4025	if (len > MAXPATHLEN) {
4026		ZFS_EXIT(zfsvfs);
4027		return (SET_ERROR(ENAMETOOLONG));
4028	}
4029
4030	if ((error = zfs_acl_ids_create(dzp, 0,
4031	    vap, cr, NULL, &acl_ids)) != 0) {
4032		ZFS_EXIT(zfsvfs);
4033		return (error);
4034	}
4035
4036	/*
4037	 * Attempt to lock directory; fail if entry already exists.
4038	 */
4039	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
4040	if (error) {
4041		zfs_acl_ids_free(&acl_ids);
4042		ZFS_EXIT(zfsvfs);
4043		return (error);
4044	}
4045
4046	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4047		zfs_acl_ids_free(&acl_ids);
4048		ZFS_EXIT(zfsvfs);
4049		return (error);
4050	}
4051
4052	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
4053		zfs_acl_ids_free(&acl_ids);
4054		ZFS_EXIT(zfsvfs);
4055		return (SET_ERROR(EDQUOT));
4056	}
4057
4058	getnewvnode_reserve(1);
4059	tx = dmu_tx_create(zfsvfs->z_os);
4060	fuid_dirtied = zfsvfs->z_fuid_dirty;
4061	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
4062	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4063	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
4064	    ZFS_SA_BASE_ATTR_SIZE + len);
4065	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
4066	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
4067		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
4068		    acl_ids.z_aclp->z_acl_bytes);
4069	}
4070	if (fuid_dirtied)
4071		zfs_fuid_txhold(zfsvfs, tx);
4072	error = dmu_tx_assign(tx, TXG_WAIT);
4073	if (error) {
4074		zfs_acl_ids_free(&acl_ids);
4075		dmu_tx_abort(tx);
4076		getnewvnode_drop_reserve();
4077		ZFS_EXIT(zfsvfs);
4078		return (error);
4079	}
4080
4081	/*
4082	 * Create a new object for the symlink.
4083	 * for version 4 ZPL datsets the symlink will be an SA attribute
4084	 */
4085	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
4086
4087	if (fuid_dirtied)
4088		zfs_fuid_sync(zfsvfs, tx);
4089
4090	if (zp->z_is_sa)
4091		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
4092		    link, len, tx);
4093	else
4094		zfs_sa_symlink(zp, link, len, tx);
4095
4096	zp->z_size = len;
4097	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
4098	    &zp->z_size, sizeof (zp->z_size), tx);
4099	/*
4100	 * Insert the new object into the directory.
4101	 */
4102	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
4103
4104	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
4105	*vpp = ZTOV(zp);
4106
4107	zfs_acl_ids_free(&acl_ids);
4108
4109	dmu_tx_commit(tx);
4110
4111	getnewvnode_drop_reserve();
4112
4113	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4114		zil_commit(zilog, 0);
4115
4116	ZFS_EXIT(zfsvfs);
4117	return (error);
4118}
4119
4120/*
4121 * Return, in the buffer contained in the provided uio structure,
4122 * the symbolic path referred to by vp.
4123 *
4124 *	IN:	vp	- vnode of symbolic link.
4125 *		uio	- structure to contain the link path.
4126 *		cr	- credentials of caller.
4127 *		ct	- caller context
4128 *
4129 *	OUT:	uio	- structure containing the link path.
4130 *
4131 *	RETURN:	0 on success, error code on failure.
4132 *
4133 * Timestamps:
4134 *	vp - atime updated
4135 */
4136/* ARGSUSED */
4137static int
4138zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
4139{
4140	znode_t		*zp = VTOZ(vp);
4141	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4142	int		error;
4143
4144	ZFS_ENTER(zfsvfs);
4145	ZFS_VERIFY_ZP(zp);
4146
4147	if (zp->z_is_sa)
4148		error = sa_lookup_uio(zp->z_sa_hdl,
4149		    SA_ZPL_SYMLINK(zfsvfs), uio);
4150	else
4151		error = zfs_sa_readlink(zp, uio);
4152
4153	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4154
4155	ZFS_EXIT(zfsvfs);
4156	return (error);
4157}
4158
4159/*
4160 * Insert a new entry into directory tdvp referencing svp.
4161 *
4162 *	IN:	tdvp	- Directory to contain new entry.
4163 *		svp	- vnode of new entry.
4164 *		name	- name of new entry.
4165 *		cr	- credentials of caller.
4166 *		ct	- caller context
4167 *
4168 *	RETURN:	0 on success, error code on failure.
4169 *
4170 * Timestamps:
4171 *	tdvp - ctime|mtime updated
4172 *	 svp - ctime updated
4173 */
4174/* ARGSUSED */
4175static int
4176zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
4177    caller_context_t *ct, int flags)
4178{
4179	znode_t		*dzp = VTOZ(tdvp);
4180	znode_t		*tzp, *szp;
4181	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
4182	zilog_t		*zilog;
4183	dmu_tx_t	*tx;
4184	int		error;
4185	uint64_t	parent;
4186	uid_t		owner;
4187
4188	ASSERT(tdvp->v_type == VDIR);
4189
4190	ZFS_ENTER(zfsvfs);
4191	ZFS_VERIFY_ZP(dzp);
4192	zilog = zfsvfs->z_log;
4193
4194	/*
4195	 * POSIX dictates that we return EPERM here.
4196	 * Better choices include ENOTSUP or EISDIR.
4197	 */
4198	if (svp->v_type == VDIR) {
4199		ZFS_EXIT(zfsvfs);
4200		return (SET_ERROR(EPERM));
4201	}
4202
4203	szp = VTOZ(svp);
4204	ZFS_VERIFY_ZP(szp);
4205
4206	if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) {
4207		ZFS_EXIT(zfsvfs);
4208		return (SET_ERROR(EPERM));
4209	}
4210
4211	/* Prevent links to .zfs/shares files */
4212
4213	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
4214	    &parent, sizeof (uint64_t))) != 0) {
4215		ZFS_EXIT(zfsvfs);
4216		return (error);
4217	}
4218	if (parent == zfsvfs->z_shares_dir) {
4219		ZFS_EXIT(zfsvfs);
4220		return (SET_ERROR(EPERM));
4221	}
4222
4223	if (zfsvfs->z_utf8 && u8_validate(name,
4224	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4225		ZFS_EXIT(zfsvfs);
4226		return (SET_ERROR(EILSEQ));
4227	}
4228
4229	/*
4230	 * We do not support links between attributes and non-attributes
4231	 * because of the potential security risk of creating links
4232	 * into "normal" file space in order to circumvent restrictions
4233	 * imposed in attribute space.
4234	 */
4235	if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
4236		ZFS_EXIT(zfsvfs);
4237		return (SET_ERROR(EINVAL));
4238	}
4239
4240
4241	owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
4242	if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) {
4243		ZFS_EXIT(zfsvfs);
4244		return (SET_ERROR(EPERM));
4245	}
4246
4247	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4248		ZFS_EXIT(zfsvfs);
4249		return (error);
4250	}
4251
4252	/*
4253	 * Attempt to lock directory; fail if entry already exists.
4254	 */
4255	error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW);
4256	if (error) {
4257		ZFS_EXIT(zfsvfs);
4258		return (error);
4259	}
4260
4261	tx = dmu_tx_create(zfsvfs->z_os);
4262	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
4263	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4264	zfs_sa_upgrade_txholds(tx, szp);
4265	zfs_sa_upgrade_txholds(tx, dzp);
4266	error = dmu_tx_assign(tx, TXG_WAIT);
4267	if (error) {
4268		dmu_tx_abort(tx);
4269		ZFS_EXIT(zfsvfs);
4270		return (error);
4271	}
4272
4273	error = zfs_link_create(dzp, name, szp, tx, 0);
4274
4275	if (error == 0) {
4276		uint64_t txtype = TX_LINK;
4277		zfs_log_link(zilog, tx, txtype, dzp, szp, name);
4278	}
4279
4280	dmu_tx_commit(tx);
4281
4282	if (error == 0) {
4283		vnevent_link(svp, ct);
4284	}
4285
4286	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4287		zil_commit(zilog, 0);
4288
4289	ZFS_EXIT(zfsvfs);
4290	return (error);
4291}
4292
4293
4294/*ARGSUSED*/
4295void
4296zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4297{
4298	znode_t	*zp = VTOZ(vp);
4299	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4300	int error;
4301
4302	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4303	if (zp->z_sa_hdl == NULL) {
4304		/*
4305		 * The fs has been unmounted, or we did a
4306		 * suspend/resume and this file no longer exists.
4307		 */
4308		rw_exit(&zfsvfs->z_teardown_inactive_lock);
4309		vrecycle(vp);
4310		return;
4311	}
4312
4313	if (zp->z_unlinked) {
4314		/*
4315		 * Fast path to recycle a vnode of a removed file.
4316		 */
4317		rw_exit(&zfsvfs->z_teardown_inactive_lock);
4318		vrecycle(vp);
4319		return;
4320	}
4321
4322	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
4323		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
4324
4325		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4326		zfs_sa_upgrade_txholds(tx, zp);
4327		error = dmu_tx_assign(tx, TXG_WAIT);
4328		if (error) {
4329			dmu_tx_abort(tx);
4330		} else {
4331			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
4332			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
4333			zp->z_atime_dirty = 0;
4334			dmu_tx_commit(tx);
4335		}
4336	}
4337	rw_exit(&zfsvfs->z_teardown_inactive_lock);
4338}
4339
4340
4341CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
4342CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
4343
4344/*ARGSUSED*/
4345static int
4346zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
4347{
4348	znode_t		*zp = VTOZ(vp);
4349	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4350	uint32_t	gen;
4351	uint64_t	gen64;
4352	uint64_t	object = zp->z_id;
4353	zfid_short_t	*zfid;
4354	int		size, i, error;
4355
4356	ZFS_ENTER(zfsvfs);
4357	ZFS_VERIFY_ZP(zp);
4358
4359	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
4360	    &gen64, sizeof (uint64_t))) != 0) {
4361		ZFS_EXIT(zfsvfs);
4362		return (error);
4363	}
4364
4365	gen = (uint32_t)gen64;
4366
4367	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
4368
4369#ifdef illumos
4370	if (fidp->fid_len < size) {
4371		fidp->fid_len = size;
4372		ZFS_EXIT(zfsvfs);
4373		return (SET_ERROR(ENOSPC));
4374	}
4375#else
4376	fidp->fid_len = size;
4377#endif
4378
4379	zfid = (zfid_short_t *)fidp;
4380
4381	zfid->zf_len = size;
4382
4383	for (i = 0; i < sizeof (zfid->zf_object); i++)
4384		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
4385
4386	/* Must have a non-zero generation number to distinguish from .zfs */
4387	if (gen == 0)
4388		gen = 1;
4389	for (i = 0; i < sizeof (zfid->zf_gen); i++)
4390		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
4391
4392	if (size == LONG_FID_LEN) {
4393		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
4394		zfid_long_t	*zlfid;
4395
4396		zlfid = (zfid_long_t *)fidp;
4397
4398		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
4399			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
4400
4401		/* XXX - this should be the generation number for the objset */
4402		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
4403			zlfid->zf_setgen[i] = 0;
4404	}
4405
4406	ZFS_EXIT(zfsvfs);
4407	return (0);
4408}
4409
4410static int
4411zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
4412    caller_context_t *ct)
4413{
4414	znode_t		*zp, *xzp;
4415	zfsvfs_t	*zfsvfs;
4416	int		error;
4417
4418	switch (cmd) {
4419	case _PC_LINK_MAX:
4420		*valp = INT_MAX;
4421		return (0);
4422
4423	case _PC_FILESIZEBITS:
4424		*valp = 64;
4425		return (0);
4426#ifdef illumos
4427	case _PC_XATTR_EXISTS:
4428		zp = VTOZ(vp);
4429		zfsvfs = zp->z_zfsvfs;
4430		ZFS_ENTER(zfsvfs);
4431		ZFS_VERIFY_ZP(zp);
4432		*valp = 0;
4433		error = zfs_dirent_lookup(zp, "", &xzp,
4434		    ZXATTR | ZEXISTS | ZSHARED);
4435		if (error == 0) {
4436			if (!zfs_dirempty(xzp))
4437				*valp = 1;
4438			vrele(ZTOV(xzp));
4439		} else if (error == ENOENT) {
4440			/*
4441			 * If there aren't extended attributes, it's the
4442			 * same as having zero of them.
4443			 */
4444			error = 0;
4445		}
4446		ZFS_EXIT(zfsvfs);
4447		return (error);
4448
4449	case _PC_SATTR_ENABLED:
4450	case _PC_SATTR_EXISTS:
4451		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
4452		    (vp->v_type == VREG || vp->v_type == VDIR);
4453		return (0);
4454
4455	case _PC_ACCESS_FILTERING:
4456		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
4457		    vp->v_type == VDIR;
4458		return (0);
4459
4460	case _PC_ACL_ENABLED:
4461		*valp = _ACL_ACE_ENABLED;
4462		return (0);
4463#endif	/* illumos */
4464	case _PC_MIN_HOLE_SIZE:
4465		*valp = (int)SPA_MINBLOCKSIZE;
4466		return (0);
4467#ifdef illumos
4468	case _PC_TIMESTAMP_RESOLUTION:
4469		/* nanosecond timestamp resolution */
4470		*valp = 1L;
4471		return (0);
4472#endif
4473	case _PC_ACL_EXTENDED:
4474		*valp = 0;
4475		return (0);
4476
4477	case _PC_ACL_NFS4:
4478		*valp = 1;
4479		return (0);
4480
4481	case _PC_ACL_PATH_MAX:
4482		*valp = ACL_MAX_ENTRIES;
4483		return (0);
4484
4485	default:
4486		return (EOPNOTSUPP);
4487	}
4488}
4489
4490/*ARGSUSED*/
4491static int
4492zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4493    caller_context_t *ct)
4494{
4495	znode_t *zp = VTOZ(vp);
4496	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4497	int error;
4498	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4499
4500	ZFS_ENTER(zfsvfs);
4501	ZFS_VERIFY_ZP(zp);
4502	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
4503	ZFS_EXIT(zfsvfs);
4504
4505	return (error);
4506}
4507
4508/*ARGSUSED*/
4509int
4510zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4511    caller_context_t *ct)
4512{
4513	znode_t *zp = VTOZ(vp);
4514	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4515	int error;
4516	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4517	zilog_t	*zilog = zfsvfs->z_log;
4518
4519	ZFS_ENTER(zfsvfs);
4520	ZFS_VERIFY_ZP(zp);
4521
4522	error = zfs_setacl(zp, vsecp, skipaclchk, cr);
4523
4524	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4525		zil_commit(zilog, 0);
4526
4527	ZFS_EXIT(zfsvfs);
4528	return (error);
4529}
4530
4531static int
4532ioflags(int ioflags)
4533{
4534	int flags = 0;
4535
4536	if (ioflags & IO_APPEND)
4537		flags |= FAPPEND;
4538	if (ioflags & IO_NDELAY)
4539		flags |= FNONBLOCK;
4540	if (ioflags & IO_SYNC)
4541		flags |= (FSYNC | FDSYNC | FRSYNC);
4542
4543	return (flags);
4544}
4545
4546static int
4547zfs_getpages(struct vnode *vp, vm_page_t *m, int count, int reqpage)
4548{
4549	znode_t *zp = VTOZ(vp);
4550	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4551	objset_t *os = zp->z_zfsvfs->z_os;
4552	vm_page_t mfirst, mlast, mreq;
4553	vm_object_t object;
4554	caddr_t va;
4555	struct sf_buf *sf;
4556	off_t startoff, endoff;
4557	int i, error;
4558	vm_pindex_t reqstart, reqend;
4559	int pcount, lsize, reqsize, size;
4560
4561	ZFS_ENTER(zfsvfs);
4562	ZFS_VERIFY_ZP(zp);
4563
4564	pcount = OFF_TO_IDX(round_page(count));
4565	mreq = m[reqpage];
4566	object = mreq->object;
4567	error = 0;
4568
4569	KASSERT(vp->v_object == object, ("mismatching object"));
4570
4571	if (pcount > 1 && zp->z_blksz > PAGESIZE) {
4572		startoff = rounddown(IDX_TO_OFF(mreq->pindex), zp->z_blksz);
4573		reqstart = OFF_TO_IDX(round_page(startoff));
4574		if (reqstart < m[0]->pindex)
4575			reqstart = 0;
4576		else
4577			reqstart = reqstart - m[0]->pindex;
4578		endoff = roundup(IDX_TO_OFF(mreq->pindex) + PAGE_SIZE,
4579		    zp->z_blksz);
4580		reqend = OFF_TO_IDX(trunc_page(endoff)) - 1;
4581		if (reqend > m[pcount - 1]->pindex)
4582			reqend = m[pcount - 1]->pindex;
4583		reqsize = reqend - m[reqstart]->pindex + 1;
4584		KASSERT(reqstart <= reqpage && reqpage < reqstart + reqsize,
4585		    ("reqpage beyond [reqstart, reqstart + reqsize[ bounds"));
4586	} else {
4587		reqstart = reqpage;
4588		reqsize = 1;
4589	}
4590	mfirst = m[reqstart];
4591	mlast = m[reqstart + reqsize - 1];
4592
4593	zfs_vmobject_wlock(object);
4594
4595	for (i = 0; i < reqstart; i++) {
4596		vm_page_lock(m[i]);
4597		vm_page_free(m[i]);
4598		vm_page_unlock(m[i]);
4599	}
4600	for (i = reqstart + reqsize; i < pcount; i++) {
4601		vm_page_lock(m[i]);
4602		vm_page_free(m[i]);
4603		vm_page_unlock(m[i]);
4604	}
4605
4606	if (mreq->valid && reqsize == 1) {
4607		if (mreq->valid != VM_PAGE_BITS_ALL)
4608			vm_page_zero_invalid(mreq, TRUE);
4609		zfs_vmobject_wunlock(object);
4610		ZFS_EXIT(zfsvfs);
4611		return (zfs_vm_pagerret_ok);
4612	}
4613
4614	PCPU_INC(cnt.v_vnodein);
4615	PCPU_ADD(cnt.v_vnodepgsin, reqsize);
4616
4617	if (IDX_TO_OFF(mreq->pindex) >= object->un_pager.vnp.vnp_size) {
4618		for (i = reqstart; i < reqstart + reqsize; i++) {
4619			if (i != reqpage) {
4620				vm_page_lock(m[i]);
4621				vm_page_free(m[i]);
4622				vm_page_unlock(m[i]);
4623			}
4624		}
4625		zfs_vmobject_wunlock(object);
4626		ZFS_EXIT(zfsvfs);
4627		return (zfs_vm_pagerret_bad);
4628	}
4629
4630	lsize = PAGE_SIZE;
4631	if (IDX_TO_OFF(mlast->pindex) + lsize > object->un_pager.vnp.vnp_size)
4632		lsize = object->un_pager.vnp.vnp_size - IDX_TO_OFF(mlast->pindex);
4633
4634	zfs_vmobject_wunlock(object);
4635
4636	for (i = reqstart; i < reqstart + reqsize; i++) {
4637		size = PAGE_SIZE;
4638		if (i == (reqstart + reqsize - 1))
4639			size = lsize;
4640		va = zfs_map_page(m[i], &sf);
4641		error = dmu_read(os, zp->z_id, IDX_TO_OFF(m[i]->pindex),
4642		    size, va, DMU_READ_PREFETCH);
4643		if (size != PAGE_SIZE)
4644			bzero(va + size, PAGE_SIZE - size);
4645		zfs_unmap_page(sf);
4646		if (error != 0)
4647			break;
4648	}
4649
4650	zfs_vmobject_wlock(object);
4651
4652	for (i = reqstart; i < reqstart + reqsize; i++) {
4653		if (!error)
4654			m[i]->valid = VM_PAGE_BITS_ALL;
4655		KASSERT(m[i]->dirty == 0, ("zfs_getpages: page %p is dirty", m[i]));
4656		if (i != reqpage)
4657			vm_page_readahead_finish(m[i]);
4658	}
4659
4660	zfs_vmobject_wunlock(object);
4661
4662	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4663	ZFS_EXIT(zfsvfs);
4664	return (error ? zfs_vm_pagerret_error : zfs_vm_pagerret_ok);
4665}
4666
4667static int
4668zfs_freebsd_getpages(ap)
4669	struct vop_getpages_args /* {
4670		struct vnode *a_vp;
4671		vm_page_t *a_m;
4672		int a_count;
4673		int a_reqpage;
4674		vm_ooffset_t a_offset;
4675	} */ *ap;
4676{
4677
4678	return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_reqpage));
4679}
4680
4681static int
4682zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
4683    int *rtvals)
4684{
4685	znode_t		*zp = VTOZ(vp);
4686	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4687	rl_t		*rl;
4688	dmu_tx_t	*tx;
4689	struct sf_buf	*sf;
4690	vm_object_t	object;
4691	vm_page_t	m;
4692	caddr_t		va;
4693	size_t		tocopy;
4694	size_t		lo_len;
4695	vm_ooffset_t	lo_off;
4696	vm_ooffset_t	off;
4697	uint_t		blksz;
4698	int		ncount;
4699	int		pcount;
4700	int		err;
4701	int		i;
4702
4703	ZFS_ENTER(zfsvfs);
4704	ZFS_VERIFY_ZP(zp);
4705
4706	object = vp->v_object;
4707	pcount = btoc(len);
4708	ncount = pcount;
4709
4710	KASSERT(ma[0]->object == object, ("mismatching object"));
4711	KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length"));
4712
4713	for (i = 0; i < pcount; i++)
4714		rtvals[i] = zfs_vm_pagerret_error;
4715
4716	off = IDX_TO_OFF(ma[0]->pindex);
4717	blksz = zp->z_blksz;
4718	lo_off = rounddown(off, blksz);
4719	lo_len = roundup(len + (off - lo_off), blksz);
4720	rl = zfs_range_lock(zp, lo_off, lo_len, RL_WRITER);
4721
4722	zfs_vmobject_wlock(object);
4723	if (len + off > object->un_pager.vnp.vnp_size) {
4724		if (object->un_pager.vnp.vnp_size > off) {
4725			int pgoff;
4726
4727			len = object->un_pager.vnp.vnp_size - off;
4728			ncount = btoc(len);
4729			if ((pgoff = (int)len & PAGE_MASK) != 0) {
4730				/*
4731				 * If the object is locked and the following
4732				 * conditions hold, then the page's dirty
4733				 * field cannot be concurrently changed by a
4734				 * pmap operation.
4735				 */
4736				m = ma[ncount - 1];
4737				vm_page_assert_sbusied(m);
4738				KASSERT(!pmap_page_is_write_mapped(m),
4739				    ("zfs_putpages: page %p is not read-only", m));
4740				vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
4741				    pgoff);
4742			}
4743		} else {
4744			len = 0;
4745			ncount = 0;
4746		}
4747		if (ncount < pcount) {
4748			for (i = ncount; i < pcount; i++) {
4749				rtvals[i] = zfs_vm_pagerret_bad;
4750			}
4751		}
4752	}
4753	zfs_vmobject_wunlock(object);
4754
4755	if (ncount == 0)
4756		goto out;
4757
4758	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
4759	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
4760		goto out;
4761	}
4762
4763	tx = dmu_tx_create(zfsvfs->z_os);
4764	dmu_tx_hold_write(tx, zp->z_id, off, len);
4765
4766	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4767	zfs_sa_upgrade_txholds(tx, zp);
4768	err = dmu_tx_assign(tx, TXG_WAIT);
4769	if (err != 0) {
4770		dmu_tx_abort(tx);
4771		goto out;
4772	}
4773
4774	if (zp->z_blksz < PAGE_SIZE) {
4775		i = 0;
4776		for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
4777			tocopy = len > PAGE_SIZE ? PAGE_SIZE : len;
4778			va = zfs_map_page(ma[i], &sf);
4779			dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
4780			zfs_unmap_page(sf);
4781		}
4782	} else {
4783		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx);
4784	}
4785
4786	if (err == 0) {
4787		uint64_t mtime[2], ctime[2];
4788		sa_bulk_attr_t bulk[3];
4789		int count = 0;
4790
4791		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
4792		    &mtime, 16);
4793		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
4794		    &ctime, 16);
4795		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
4796		    &zp->z_pflags, 8);
4797		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
4798		    B_TRUE);
4799		(void)sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
4800		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
4801
4802		zfs_vmobject_wlock(object);
4803		for (i = 0; i < ncount; i++) {
4804			rtvals[i] = zfs_vm_pagerret_ok;
4805			vm_page_undirty(ma[i]);
4806		}
4807		zfs_vmobject_wunlock(object);
4808		PCPU_INC(cnt.v_vnodeout);
4809		PCPU_ADD(cnt.v_vnodepgsout, ncount);
4810	}
4811	dmu_tx_commit(tx);
4812
4813out:
4814	zfs_range_unlock(rl);
4815	if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 ||
4816	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4817		zil_commit(zfsvfs->z_log, zp->z_id);
4818	ZFS_EXIT(zfsvfs);
4819	return (rtvals[0]);
4820}
4821
4822int
4823zfs_freebsd_putpages(ap)
4824	struct vop_putpages_args /* {
4825		struct vnode *a_vp;
4826		vm_page_t *a_m;
4827		int a_count;
4828		int a_sync;
4829		int *a_rtvals;
4830		vm_ooffset_t a_offset;
4831	} */ *ap;
4832{
4833
4834	return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync,
4835	    ap->a_rtvals));
4836}
4837
4838static int
4839zfs_freebsd_bmap(ap)
4840	struct vop_bmap_args /* {
4841		struct vnode *a_vp;
4842		daddr_t  a_bn;
4843		struct bufobj **a_bop;
4844		daddr_t *a_bnp;
4845		int *a_runp;
4846		int *a_runb;
4847	} */ *ap;
4848{
4849
4850	if (ap->a_bop != NULL)
4851		*ap->a_bop = &ap->a_vp->v_bufobj;
4852	if (ap->a_bnp != NULL)
4853		*ap->a_bnp = ap->a_bn;
4854	if (ap->a_runp != NULL)
4855		*ap->a_runp = 0;
4856	if (ap->a_runb != NULL)
4857		*ap->a_runb = 0;
4858
4859	return (0);
4860}
4861
4862static int
4863zfs_freebsd_open(ap)
4864	struct vop_open_args /* {
4865		struct vnode *a_vp;
4866		int a_mode;
4867		struct ucred *a_cred;
4868		struct thread *a_td;
4869	} */ *ap;
4870{
4871	vnode_t	*vp = ap->a_vp;
4872	znode_t *zp = VTOZ(vp);
4873	int error;
4874
4875	error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
4876	if (error == 0)
4877		vnode_create_vobject(vp, zp->z_size, ap->a_td);
4878	return (error);
4879}
4880
4881static int
4882zfs_freebsd_close(ap)
4883	struct vop_close_args /* {
4884		struct vnode *a_vp;
4885		int  a_fflag;
4886		struct ucred *a_cred;
4887		struct thread *a_td;
4888	} */ *ap;
4889{
4890
4891	return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL));
4892}
4893
4894static int
4895zfs_freebsd_ioctl(ap)
4896	struct vop_ioctl_args /* {
4897		struct vnode *a_vp;
4898		u_long a_command;
4899		caddr_t a_data;
4900		int a_fflag;
4901		struct ucred *cred;
4902		struct thread *td;
4903	} */ *ap;
4904{
4905
4906	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
4907	    ap->a_fflag, ap->a_cred, NULL, NULL));
4908}
4909
4910static int
4911zfs_freebsd_read(ap)
4912	struct vop_read_args /* {
4913		struct vnode *a_vp;
4914		struct uio *a_uio;
4915		int a_ioflag;
4916		struct ucred *a_cred;
4917	} */ *ap;
4918{
4919
4920	return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
4921	    ap->a_cred, NULL));
4922}
4923
4924static int
4925zfs_freebsd_write(ap)
4926	struct vop_write_args /* {
4927		struct vnode *a_vp;
4928		struct uio *a_uio;
4929		int a_ioflag;
4930		struct ucred *a_cred;
4931	} */ *ap;
4932{
4933
4934	return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
4935	    ap->a_cred, NULL));
4936}
4937
4938static int
4939zfs_freebsd_access(ap)
4940	struct vop_access_args /* {
4941		struct vnode *a_vp;
4942		accmode_t a_accmode;
4943		struct ucred *a_cred;
4944		struct thread *a_td;
4945	} */ *ap;
4946{
4947	vnode_t *vp = ap->a_vp;
4948	znode_t *zp = VTOZ(vp);
4949	accmode_t accmode;
4950	int error = 0;
4951
4952	/*
4953	 * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND,
4954	 */
4955	accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
4956	if (accmode != 0)
4957		error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL);
4958
4959	/*
4960	 * VADMIN has to be handled by vaccess().
4961	 */
4962	if (error == 0) {
4963		accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
4964		if (accmode != 0) {
4965			error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
4966			    zp->z_gid, accmode, ap->a_cred, NULL);
4967		}
4968	}
4969
4970	/*
4971	 * For VEXEC, ensure that at least one execute bit is set for
4972	 * non-directories.
4973	 */
4974	if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
4975	    (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
4976		error = EACCES;
4977	}
4978
4979	return (error);
4980}
4981
4982static int
4983zfs_freebsd_lookup(ap)
4984	struct vop_lookup_args /* {
4985		struct vnode *a_dvp;
4986		struct vnode **a_vpp;
4987		struct componentname *a_cnp;
4988	} */ *ap;
4989{
4990	struct componentname *cnp = ap->a_cnp;
4991	char nm[NAME_MAX + 1];
4992
4993	ASSERT(cnp->cn_namelen < sizeof(nm));
4994	strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
4995
4996	return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
4997	    cnp->cn_cred, cnp->cn_thread, 0));
4998}
4999
5000static int
5001zfs_cache_lookup(ap)
5002	struct vop_lookup_args /* {
5003		struct vnode *a_dvp;
5004		struct vnode **a_vpp;
5005		struct componentname *a_cnp;
5006	} */ *ap;
5007{
5008	zfsvfs_t *zfsvfs;
5009
5010	zfsvfs = ap->a_dvp->v_mount->mnt_data;
5011	if (zfsvfs->z_use_namecache)
5012		return (vfs_cache_lookup(ap));
5013	else
5014		return (zfs_freebsd_lookup(ap));
5015}
5016
5017static int
5018zfs_freebsd_create(ap)
5019	struct vop_create_args /* {
5020		struct vnode *a_dvp;
5021		struct vnode **a_vpp;
5022		struct componentname *a_cnp;
5023		struct vattr *a_vap;
5024	} */ *ap;
5025{
5026	zfsvfs_t *zfsvfs;
5027	struct componentname *cnp = ap->a_cnp;
5028	vattr_t *vap = ap->a_vap;
5029	int error, mode;
5030
5031	ASSERT(cnp->cn_flags & SAVENAME);
5032
5033	vattr_init_mask(vap);
5034	mode = vap->va_mode & ALLPERMS;
5035	zfsvfs = ap->a_dvp->v_mount->mnt_data;
5036
5037	error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
5038	    ap->a_vpp, cnp->cn_cred, cnp->cn_thread);
5039	if (zfsvfs->z_use_namecache &&
5040	    error == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
5041		cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
5042	return (error);
5043}
5044
5045static int
5046zfs_freebsd_remove(ap)
5047	struct vop_remove_args /* {
5048		struct vnode *a_dvp;
5049		struct vnode *a_vp;
5050		struct componentname *a_cnp;
5051	} */ *ap;
5052{
5053
5054	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
5055
5056	return (zfs_remove(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
5057	    ap->a_cnp->cn_cred));
5058}
5059
5060static int
5061zfs_freebsd_mkdir(ap)
5062	struct vop_mkdir_args /* {
5063		struct vnode *a_dvp;
5064		struct vnode **a_vpp;
5065		struct componentname *a_cnp;
5066		struct vattr *a_vap;
5067	} */ *ap;
5068{
5069	vattr_t *vap = ap->a_vap;
5070
5071	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
5072
5073	vattr_init_mask(vap);
5074
5075	return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
5076	    ap->a_cnp->cn_cred));
5077}
5078
5079static int
5080zfs_freebsd_rmdir(ap)
5081	struct vop_rmdir_args /* {
5082		struct vnode *a_dvp;
5083		struct vnode *a_vp;
5084		struct componentname *a_cnp;
5085	} */ *ap;
5086{
5087	struct componentname *cnp = ap->a_cnp;
5088
5089	ASSERT(cnp->cn_flags & SAVENAME);
5090
5091	return (zfs_rmdir(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
5092}
5093
5094static int
5095zfs_freebsd_readdir(ap)
5096	struct vop_readdir_args /* {
5097		struct vnode *a_vp;
5098		struct uio *a_uio;
5099		struct ucred *a_cred;
5100		int *a_eofflag;
5101		int *a_ncookies;
5102		u_long **a_cookies;
5103	} */ *ap;
5104{
5105
5106	return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
5107	    ap->a_ncookies, ap->a_cookies));
5108}
5109
5110static int
5111zfs_freebsd_fsync(ap)
5112	struct vop_fsync_args /* {
5113		struct vnode *a_vp;
5114		int a_waitfor;
5115		struct thread *a_td;
5116	} */ *ap;
5117{
5118
5119	vop_stdfsync(ap);
5120	return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL));
5121}
5122
5123static int
5124zfs_freebsd_getattr(ap)
5125	struct vop_getattr_args /* {
5126		struct vnode *a_vp;
5127		struct vattr *a_vap;
5128		struct ucred *a_cred;
5129	} */ *ap;
5130{
5131	vattr_t *vap = ap->a_vap;
5132	xvattr_t xvap;
5133	u_long fflags = 0;
5134	int error;
5135
5136	xva_init(&xvap);
5137	xvap.xva_vattr = *vap;
5138	xvap.xva_vattr.va_mask |= AT_XVATTR;
5139
5140	/* Convert chflags into ZFS-type flags. */
5141	/* XXX: what about SF_SETTABLE?. */
5142	XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
5143	XVA_SET_REQ(&xvap, XAT_APPENDONLY);
5144	XVA_SET_REQ(&xvap, XAT_NOUNLINK);
5145	XVA_SET_REQ(&xvap, XAT_NODUMP);
5146	XVA_SET_REQ(&xvap, XAT_READONLY);
5147	XVA_SET_REQ(&xvap, XAT_ARCHIVE);
5148	XVA_SET_REQ(&xvap, XAT_SYSTEM);
5149	XVA_SET_REQ(&xvap, XAT_HIDDEN);
5150	XVA_SET_REQ(&xvap, XAT_REPARSE);
5151	XVA_SET_REQ(&xvap, XAT_OFFLINE);
5152	XVA_SET_REQ(&xvap, XAT_SPARSE);
5153
5154	error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
5155	if (error != 0)
5156		return (error);
5157
5158	/* Convert ZFS xattr into chflags. */
5159#define	FLAG_CHECK(fflag, xflag, xfield)	do {			\
5160	if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0)		\
5161		fflags |= (fflag);					\
5162} while (0)
5163	FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
5164	    xvap.xva_xoptattrs.xoa_immutable);
5165	FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
5166	    xvap.xva_xoptattrs.xoa_appendonly);
5167	FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
5168	    xvap.xva_xoptattrs.xoa_nounlink);
5169	FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE,
5170	    xvap.xva_xoptattrs.xoa_archive);
5171	FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
5172	    xvap.xva_xoptattrs.xoa_nodump);
5173	FLAG_CHECK(UF_READONLY, XAT_READONLY,
5174	    xvap.xva_xoptattrs.xoa_readonly);
5175	FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM,
5176	    xvap.xva_xoptattrs.xoa_system);
5177	FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN,
5178	    xvap.xva_xoptattrs.xoa_hidden);
5179	FLAG_CHECK(UF_REPARSE, XAT_REPARSE,
5180	    xvap.xva_xoptattrs.xoa_reparse);
5181	FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE,
5182	    xvap.xva_xoptattrs.xoa_offline);
5183	FLAG_CHECK(UF_SPARSE, XAT_SPARSE,
5184	    xvap.xva_xoptattrs.xoa_sparse);
5185
5186#undef	FLAG_CHECK
5187	*vap = xvap.xva_vattr;
5188	vap->va_flags = fflags;
5189	return (0);
5190}
5191
5192static int
5193zfs_freebsd_setattr(ap)
5194	struct vop_setattr_args /* {
5195		struct vnode *a_vp;
5196		struct vattr *a_vap;
5197		struct ucred *a_cred;
5198	} */ *ap;
5199{
5200	vnode_t *vp = ap->a_vp;
5201	vattr_t *vap = ap->a_vap;
5202	cred_t *cred = ap->a_cred;
5203	xvattr_t xvap;
5204	u_long fflags;
5205	uint64_t zflags;
5206
5207	vattr_init_mask(vap);
5208	vap->va_mask &= ~AT_NOSET;
5209
5210	xva_init(&xvap);
5211	xvap.xva_vattr = *vap;
5212
5213	zflags = VTOZ(vp)->z_pflags;
5214
5215	if (vap->va_flags != VNOVAL) {
5216		zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
5217		int error;
5218
5219		if (zfsvfs->z_use_fuids == B_FALSE)
5220			return (EOPNOTSUPP);
5221
5222		fflags = vap->va_flags;
5223		/*
5224		 * XXX KDM
5225		 * We need to figure out whether it makes sense to allow
5226		 * UF_REPARSE through, since we don't really have other
5227		 * facilities to handle reparse points and zfs_setattr()
5228		 * doesn't currently allow setting that attribute anyway.
5229		 */
5230		if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE|
5231		     UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE|
5232		     UF_OFFLINE|UF_SPARSE)) != 0)
5233			return (EOPNOTSUPP);
5234		/*
5235		 * Unprivileged processes are not permitted to unset system
5236		 * flags, or modify flags if any system flags are set.
5237		 * Privileged non-jail processes may not modify system flags
5238		 * if securelevel > 0 and any existing system flags are set.
5239		 * Privileged jail processes behave like privileged non-jail
5240		 * processes if the security.jail.chflags_allowed sysctl is
5241		 * is non-zero; otherwise, they behave like unprivileged
5242		 * processes.
5243		 */
5244		if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
5245		    priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) {
5246			if (zflags &
5247			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
5248				error = securelevel_gt(cred, 0);
5249				if (error != 0)
5250					return (error);
5251			}
5252		} else {
5253			/*
5254			 * Callers may only modify the file flags on objects they
5255			 * have VADMIN rights for.
5256			 */
5257			if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0)
5258				return (error);
5259			if (zflags &
5260			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
5261				return (EPERM);
5262			}
5263			if (fflags &
5264			    (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
5265				return (EPERM);
5266			}
5267		}
5268
5269#define	FLAG_CHANGE(fflag, zflag, xflag, xfield)	do {		\
5270	if (((fflags & (fflag)) && !(zflags & (zflag))) ||		\
5271	    ((zflags & (zflag)) && !(fflags & (fflag)))) {		\
5272		XVA_SET_REQ(&xvap, (xflag));				\
5273		(xfield) = ((fflags & (fflag)) != 0);			\
5274	}								\
5275} while (0)
5276		/* Convert chflags into ZFS-type flags. */
5277		/* XXX: what about SF_SETTABLE?. */
5278		FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
5279		    xvap.xva_xoptattrs.xoa_immutable);
5280		FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
5281		    xvap.xva_xoptattrs.xoa_appendonly);
5282		FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
5283		    xvap.xva_xoptattrs.xoa_nounlink);
5284		FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE,
5285		    xvap.xva_xoptattrs.xoa_archive);
5286		FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
5287		    xvap.xva_xoptattrs.xoa_nodump);
5288		FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY,
5289		    xvap.xva_xoptattrs.xoa_readonly);
5290		FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM,
5291		    xvap.xva_xoptattrs.xoa_system);
5292		FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN,
5293		    xvap.xva_xoptattrs.xoa_hidden);
5294		FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE,
5295		    xvap.xva_xoptattrs.xoa_hidden);
5296		FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE,
5297		    xvap.xva_xoptattrs.xoa_offline);
5298		FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE,
5299		    xvap.xva_xoptattrs.xoa_sparse);
5300#undef	FLAG_CHANGE
5301	}
5302	return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL));
5303}
5304
5305static int
5306zfs_freebsd_rename(ap)
5307	struct vop_rename_args  /* {
5308		struct vnode *a_fdvp;
5309		struct vnode *a_fvp;
5310		struct componentname *a_fcnp;
5311		struct vnode *a_tdvp;
5312		struct vnode *a_tvp;
5313		struct componentname *a_tcnp;
5314	} */ *ap;
5315{
5316	vnode_t *fdvp = ap->a_fdvp;
5317	vnode_t *fvp = ap->a_fvp;
5318	vnode_t *tdvp = ap->a_tdvp;
5319	vnode_t *tvp = ap->a_tvp;
5320	int error;
5321
5322	ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
5323	ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
5324
5325	error = zfs_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
5326	    ap->a_tcnp, ap->a_fcnp->cn_cred);
5327
5328	vrele(fdvp);
5329	vrele(fvp);
5330	vrele(tdvp);
5331	if (tvp != NULL)
5332		vrele(tvp);
5333
5334	return (error);
5335}
5336
5337static int
5338zfs_freebsd_symlink(ap)
5339	struct vop_symlink_args /* {
5340		struct vnode *a_dvp;
5341		struct vnode **a_vpp;
5342		struct componentname *a_cnp;
5343		struct vattr *a_vap;
5344		char *a_target;
5345	} */ *ap;
5346{
5347	struct componentname *cnp = ap->a_cnp;
5348	vattr_t *vap = ap->a_vap;
5349
5350	ASSERT(cnp->cn_flags & SAVENAME);
5351
5352	vap->va_type = VLNK;	/* FreeBSD: Syscall only sets va_mode. */
5353	vattr_init_mask(vap);
5354
5355	return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
5356	    ap->a_target, cnp->cn_cred, cnp->cn_thread));
5357}
5358
5359static int
5360zfs_freebsd_readlink(ap)
5361	struct vop_readlink_args /* {
5362		struct vnode *a_vp;
5363		struct uio *a_uio;
5364		struct ucred *a_cred;
5365	} */ *ap;
5366{
5367
5368	return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
5369}
5370
5371static int
5372zfs_freebsd_link(ap)
5373	struct vop_link_args /* {
5374		struct vnode *a_tdvp;
5375		struct vnode *a_vp;
5376		struct componentname *a_cnp;
5377	} */ *ap;
5378{
5379	struct componentname *cnp = ap->a_cnp;
5380	vnode_t *vp = ap->a_vp;
5381	vnode_t *tdvp = ap->a_tdvp;
5382
5383	if (tdvp->v_mount != vp->v_mount)
5384		return (EXDEV);
5385
5386	ASSERT(cnp->cn_flags & SAVENAME);
5387
5388	return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
5389}
5390
5391static int
5392zfs_freebsd_inactive(ap)
5393	struct vop_inactive_args /* {
5394		struct vnode *a_vp;
5395		struct thread *a_td;
5396	} */ *ap;
5397{
5398	vnode_t *vp = ap->a_vp;
5399
5400	zfs_inactive(vp, ap->a_td->td_ucred, NULL);
5401	return (0);
5402}
5403
5404static int
5405zfs_freebsd_reclaim(ap)
5406	struct vop_reclaim_args /* {
5407		struct vnode *a_vp;
5408		struct thread *a_td;
5409	} */ *ap;
5410{
5411	vnode_t	*vp = ap->a_vp;
5412	znode_t	*zp = VTOZ(vp);
5413	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5414
5415	ASSERT(zp != NULL);
5416
5417	/* Destroy the vm object and flush associated pages. */
5418	vnode_destroy_vobject(vp);
5419
5420	/*
5421	 * z_teardown_inactive_lock protects from a race with
5422	 * zfs_znode_dmu_fini in zfsvfs_teardown during
5423	 * force unmount.
5424	 */
5425	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
5426	if (zp->z_sa_hdl == NULL)
5427		zfs_znode_free(zp);
5428	else
5429		zfs_zinactive(zp);
5430	rw_exit(&zfsvfs->z_teardown_inactive_lock);
5431
5432	vp->v_data = NULL;
5433	return (0);
5434}
5435
5436static int
5437zfs_freebsd_fid(ap)
5438	struct vop_fid_args /* {
5439		struct vnode *a_vp;
5440		struct fid *a_fid;
5441	} */ *ap;
5442{
5443
5444	return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
5445}
5446
5447static int
5448zfs_freebsd_pathconf(ap)
5449	struct vop_pathconf_args /* {
5450		struct vnode *a_vp;
5451		int a_name;
5452		register_t *a_retval;
5453	} */ *ap;
5454{
5455	ulong_t val;
5456	int error;
5457
5458	error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL);
5459	if (error == 0)
5460		*ap->a_retval = val;
5461	else if (error == EOPNOTSUPP)
5462		error = vop_stdpathconf(ap);
5463	return (error);
5464}
5465
5466static int
5467zfs_freebsd_fifo_pathconf(ap)
5468	struct vop_pathconf_args /* {
5469		struct vnode *a_vp;
5470		int a_name;
5471		register_t *a_retval;
5472	} */ *ap;
5473{
5474
5475	switch (ap->a_name) {
5476	case _PC_ACL_EXTENDED:
5477	case _PC_ACL_NFS4:
5478	case _PC_ACL_PATH_MAX:
5479	case _PC_MAC_PRESENT:
5480		return (zfs_freebsd_pathconf(ap));
5481	default:
5482		return (fifo_specops.vop_pathconf(ap));
5483	}
5484}
5485
5486/*
5487 * FreeBSD's extended attributes namespace defines file name prefix for ZFS'
5488 * extended attribute name:
5489 *
5490 *	NAMESPACE	PREFIX
5491 *	system		freebsd:system:
5492 *	user		(none, can be used to access ZFS fsattr(5) attributes
5493 *			created on Solaris)
5494 */
5495static int
5496zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
5497    size_t size)
5498{
5499	const char *namespace, *prefix, *suffix;
5500
5501	/* We don't allow '/' character in attribute name. */
5502	if (strchr(name, '/') != NULL)
5503		return (EINVAL);
5504	/* We don't allow attribute names that start with "freebsd:" string. */
5505	if (strncmp(name, "freebsd:", 8) == 0)
5506		return (EINVAL);
5507
5508	bzero(attrname, size);
5509
5510	switch (attrnamespace) {
5511	case EXTATTR_NAMESPACE_USER:
5512#if 0
5513		prefix = "freebsd:";
5514		namespace = EXTATTR_NAMESPACE_USER_STRING;
5515		suffix = ":";
5516#else
5517		/*
5518		 * This is the default namespace by which we can access all
5519		 * attributes created on Solaris.
5520		 */
5521		prefix = namespace = suffix = "";
5522#endif
5523		break;
5524	case EXTATTR_NAMESPACE_SYSTEM:
5525		prefix = "freebsd:";
5526		namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
5527		suffix = ":";
5528		break;
5529	case EXTATTR_NAMESPACE_EMPTY:
5530	default:
5531		return (EINVAL);
5532	}
5533	if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
5534	    name) >= size) {
5535		return (ENAMETOOLONG);
5536	}
5537	return (0);
5538}
5539
5540/*
5541 * Vnode operating to retrieve a named extended attribute.
5542 */
5543static int
5544zfs_getextattr(struct vop_getextattr_args *ap)
5545/*
5546vop_getextattr {
5547	IN struct vnode *a_vp;
5548	IN int a_attrnamespace;
5549	IN const char *a_name;
5550	INOUT struct uio *a_uio;
5551	OUT size_t *a_size;
5552	IN struct ucred *a_cred;
5553	IN struct thread *a_td;
5554};
5555*/
5556{
5557	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5558	struct thread *td = ap->a_td;
5559	struct nameidata nd;
5560	char attrname[255];
5561	struct vattr va;
5562	vnode_t *xvp = NULL, *vp;
5563	int error, flags;
5564
5565	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5566	    ap->a_cred, ap->a_td, VREAD);
5567	if (error != 0)
5568		return (error);
5569
5570	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5571	    sizeof(attrname));
5572	if (error != 0)
5573		return (error);
5574
5575	ZFS_ENTER(zfsvfs);
5576
5577	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5578	    LOOKUP_XATTR);
5579	if (error != 0) {
5580		ZFS_EXIT(zfsvfs);
5581		return (error);
5582	}
5583
5584	flags = FREAD;
5585	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
5586	    xvp, td);
5587	error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL);
5588	vp = nd.ni_vp;
5589	NDFREE(&nd, NDF_ONLY_PNBUF);
5590	if (error != 0) {
5591		ZFS_EXIT(zfsvfs);
5592		if (error == ENOENT)
5593			error = ENOATTR;
5594		return (error);
5595	}
5596
5597	if (ap->a_size != NULL) {
5598		error = VOP_GETATTR(vp, &va, ap->a_cred);
5599		if (error == 0)
5600			*ap->a_size = (size_t)va.va_size;
5601	} else if (ap->a_uio != NULL)
5602		error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred);
5603
5604	VOP_UNLOCK(vp, 0);
5605	vn_close(vp, flags, ap->a_cred, td);
5606	ZFS_EXIT(zfsvfs);
5607
5608	return (error);
5609}
5610
5611/*
5612 * Vnode operation to remove a named attribute.
5613 */
5614int
5615zfs_deleteextattr(struct vop_deleteextattr_args *ap)
5616/*
5617vop_deleteextattr {
5618	IN struct vnode *a_vp;
5619	IN int a_attrnamespace;
5620	IN const char *a_name;
5621	IN struct ucred *a_cred;
5622	IN struct thread *a_td;
5623};
5624*/
5625{
5626	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5627	struct thread *td = ap->a_td;
5628	struct nameidata nd;
5629	char attrname[255];
5630	struct vattr va;
5631	vnode_t *xvp = NULL, *vp;
5632	int error, flags;
5633
5634	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5635	    ap->a_cred, ap->a_td, VWRITE);
5636	if (error != 0)
5637		return (error);
5638
5639	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5640	    sizeof(attrname));
5641	if (error != 0)
5642		return (error);
5643
5644	ZFS_ENTER(zfsvfs);
5645
5646	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5647	    LOOKUP_XATTR);
5648	if (error != 0) {
5649		ZFS_EXIT(zfsvfs);
5650		return (error);
5651	}
5652
5653	NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
5654	    UIO_SYSSPACE, attrname, xvp, td);
5655	error = namei(&nd);
5656	vp = nd.ni_vp;
5657	if (error != 0) {
5658		ZFS_EXIT(zfsvfs);
5659		NDFREE(&nd, NDF_ONLY_PNBUF);
5660		if (error == ENOENT)
5661			error = ENOATTR;
5662		return (error);
5663	}
5664
5665	error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
5666	NDFREE(&nd, NDF_ONLY_PNBUF);
5667
5668	vput(nd.ni_dvp);
5669	if (vp == nd.ni_dvp)
5670		vrele(vp);
5671	else
5672		vput(vp);
5673	ZFS_EXIT(zfsvfs);
5674
5675	return (error);
5676}
5677
5678/*
5679 * Vnode operation to set a named attribute.
5680 */
5681static int
5682zfs_setextattr(struct vop_setextattr_args *ap)
5683/*
5684vop_setextattr {
5685	IN struct vnode *a_vp;
5686	IN int a_attrnamespace;
5687	IN const char *a_name;
5688	INOUT struct uio *a_uio;
5689	IN struct ucred *a_cred;
5690	IN struct thread *a_td;
5691};
5692*/
5693{
5694	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5695	struct thread *td = ap->a_td;
5696	struct nameidata nd;
5697	char attrname[255];
5698	struct vattr va;
5699	vnode_t *xvp = NULL, *vp;
5700	int error, flags;
5701
5702	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5703	    ap->a_cred, ap->a_td, VWRITE);
5704	if (error != 0)
5705		return (error);
5706
5707	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5708	    sizeof(attrname));
5709	if (error != 0)
5710		return (error);
5711
5712	ZFS_ENTER(zfsvfs);
5713
5714	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5715	    LOOKUP_XATTR | CREATE_XATTR_DIR);
5716	if (error != 0) {
5717		ZFS_EXIT(zfsvfs);
5718		return (error);
5719	}
5720
5721	flags = FFLAGS(O_WRONLY | O_CREAT);
5722	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
5723	    xvp, td);
5724	error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL);
5725	vp = nd.ni_vp;
5726	NDFREE(&nd, NDF_ONLY_PNBUF);
5727	if (error != 0) {
5728		ZFS_EXIT(zfsvfs);
5729		return (error);
5730	}
5731
5732	VATTR_NULL(&va);
5733	va.va_size = 0;
5734	error = VOP_SETATTR(vp, &va, ap->a_cred);
5735	if (error == 0)
5736		VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred);
5737
5738	VOP_UNLOCK(vp, 0);
5739	vn_close(vp, flags, ap->a_cred, td);
5740	ZFS_EXIT(zfsvfs);
5741
5742	return (error);
5743}
5744
5745/*
5746 * Vnode operation to retrieve extended attributes on a vnode.
5747 */
5748static int
5749zfs_listextattr(struct vop_listextattr_args *ap)
5750/*
5751vop_listextattr {
5752	IN struct vnode *a_vp;
5753	IN int a_attrnamespace;
5754	INOUT struct uio *a_uio;
5755	OUT size_t *a_size;
5756	IN struct ucred *a_cred;
5757	IN struct thread *a_td;
5758};
5759*/
5760{
5761	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5762	struct thread *td = ap->a_td;
5763	struct nameidata nd;
5764	char attrprefix[16];
5765	u_char dirbuf[sizeof(struct dirent)];
5766	struct dirent *dp;
5767	struct iovec aiov;
5768	struct uio auio, *uio = ap->a_uio;
5769	size_t *sizep = ap->a_size;
5770	size_t plen;
5771	vnode_t *xvp = NULL, *vp;
5772	int done, error, eof, pos;
5773
5774	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5775	    ap->a_cred, ap->a_td, VREAD);
5776	if (error != 0)
5777		return (error);
5778
5779	error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
5780	    sizeof(attrprefix));
5781	if (error != 0)
5782		return (error);
5783	plen = strlen(attrprefix);
5784
5785	ZFS_ENTER(zfsvfs);
5786
5787	if (sizep != NULL)
5788		*sizep = 0;
5789
5790	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5791	    LOOKUP_XATTR);
5792	if (error != 0) {
5793		ZFS_EXIT(zfsvfs);
5794		/*
5795		 * ENOATTR means that the EA directory does not yet exist,
5796		 * i.e. there are no extended attributes there.
5797		 */
5798		if (error == ENOATTR)
5799			error = 0;
5800		return (error);
5801	}
5802
5803	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
5804	    UIO_SYSSPACE, ".", xvp, td);
5805	error = namei(&nd);
5806	vp = nd.ni_vp;
5807	NDFREE(&nd, NDF_ONLY_PNBUF);
5808	if (error != 0) {
5809		ZFS_EXIT(zfsvfs);
5810		return (error);
5811	}
5812
5813	auio.uio_iov = &aiov;
5814	auio.uio_iovcnt = 1;
5815	auio.uio_segflg = UIO_SYSSPACE;
5816	auio.uio_td = td;
5817	auio.uio_rw = UIO_READ;
5818	auio.uio_offset = 0;
5819
5820	do {
5821		u_char nlen;
5822
5823		aiov.iov_base = (void *)dirbuf;
5824		aiov.iov_len = sizeof(dirbuf);
5825		auio.uio_resid = sizeof(dirbuf);
5826		error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
5827		done = sizeof(dirbuf) - auio.uio_resid;
5828		if (error != 0)
5829			break;
5830		for (pos = 0; pos < done;) {
5831			dp = (struct dirent *)(dirbuf + pos);
5832			pos += dp->d_reclen;
5833			/*
5834			 * XXX: Temporarily we also accept DT_UNKNOWN, as this
5835			 * is what we get when attribute was created on Solaris.
5836			 */
5837			if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
5838				continue;
5839			if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0)
5840				continue;
5841			else if (strncmp(dp->d_name, attrprefix, plen) != 0)
5842				continue;
5843			nlen = dp->d_namlen - plen;
5844			if (sizep != NULL)
5845				*sizep += 1 + nlen;
5846			else if (uio != NULL) {
5847				/*
5848				 * Format of extattr name entry is one byte for
5849				 * length and the rest for name.
5850				 */
5851				error = uiomove(&nlen, 1, uio->uio_rw, uio);
5852				if (error == 0) {
5853					error = uiomove(dp->d_name + plen, nlen,
5854					    uio->uio_rw, uio);
5855				}
5856				if (error != 0)
5857					break;
5858			}
5859		}
5860	} while (!eof && error == 0);
5861
5862	vput(vp);
5863	ZFS_EXIT(zfsvfs);
5864
5865	return (error);
5866}
5867
5868int
5869zfs_freebsd_getacl(ap)
5870	struct vop_getacl_args /* {
5871		struct vnode *vp;
5872		acl_type_t type;
5873		struct acl *aclp;
5874		struct ucred *cred;
5875		struct thread *td;
5876	} */ *ap;
5877{
5878	int		error;
5879	vsecattr_t      vsecattr;
5880
5881	if (ap->a_type != ACL_TYPE_NFS4)
5882		return (EINVAL);
5883
5884	vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
5885	if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL))
5886		return (error);
5887
5888	error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt);
5889	if (vsecattr.vsa_aclentp != NULL)
5890		kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);
5891
5892	return (error);
5893}
5894
5895int
5896zfs_freebsd_setacl(ap)
5897	struct vop_setacl_args /* {
5898		struct vnode *vp;
5899		acl_type_t type;
5900		struct acl *aclp;
5901		struct ucred *cred;
5902		struct thread *td;
5903	} */ *ap;
5904{
5905	int		error;
5906	vsecattr_t      vsecattr;
5907	int		aclbsize;	/* size of acl list in bytes */
5908	aclent_t	*aaclp;
5909
5910	if (ap->a_type != ACL_TYPE_NFS4)
5911		return (EINVAL);
5912
5913	if (ap->a_aclp == NULL)
5914		return (EINVAL);
5915
5916	if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
5917		return (EINVAL);
5918
5919	/*
5920	 * With NFSv4 ACLs, chmod(2) may need to add additional entries,
5921	 * splitting every entry into two and appending "canonical six"
5922	 * entries at the end.  Don't allow for setting an ACL that would
5923	 * cause chmod(2) to run out of ACL entries.
5924	 */
5925	if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
5926		return (ENOSPC);
5927
5928	error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
5929	if (error != 0)
5930		return (error);
5931
5932	vsecattr.vsa_mask = VSA_ACE;
5933	aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t);
5934	vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
5935	aaclp = vsecattr.vsa_aclentp;
5936	vsecattr.vsa_aclentsz = aclbsize;
5937
5938	aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
5939	error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL);
5940	kmem_free(aaclp, aclbsize);
5941
5942	return (error);
5943}
5944
/*
 * ACL-check vnode operation.
 *
 * The arguments are not examined; every call returns EOPNOTSUPP.
 */
int
zfs_freebsd_aclcheck(struct vop_aclcheck_args *ap)
/*
vop_aclcheck {
	struct vnode *vp;
	acl_type_t type;
	struct acl *aclp;
	struct ucred *cred;
	struct thread *td;
};
*/
{

	return (EOPNOTSUPP);
}
5958
5959static int
5960zfs_vptocnp(struct vop_vptocnp_args *ap)
5961{
5962	vnode_t *covered_vp;
5963	vnode_t *vp = ap->a_vp;;
5964	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
5965	znode_t *zp = VTOZ(vp);
5966	int ltype;
5967	int error;
5968
5969	ZFS_ENTER(zfsvfs);
5970	ZFS_VERIFY_ZP(zp);
5971
5972	/*
5973	 * If we are a snapshot mounted under .zfs, run the operation
5974	 * on the covered vnode.
5975	 */
5976	if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) {
5977		char name[MAXNAMLEN + 1];
5978		znode_t *dzp;
5979		size_t len;
5980
5981		error = zfs_znode_parent_and_name(zp, &dzp, name);
5982		if (error == 0) {
5983			len = strlen(name);
5984			if (*ap->a_buflen < len)
5985				error = SET_ERROR(ENOMEM);
5986		}
5987		if (error == 0) {
5988			*ap->a_buflen -= len;
5989			bcopy(name, ap->a_buf + *ap->a_buflen, len);
5990			*ap->a_vpp = ZTOV(dzp);
5991		}
5992		ZFS_EXIT(zfsvfs);
5993		return (error);
5994	}
5995	ZFS_EXIT(zfsvfs);
5996
5997	covered_vp = vp->v_mount->mnt_vnodecovered;
5998	vhold(covered_vp);
5999	ltype = VOP_ISLOCKED(vp);
6000	VOP_UNLOCK(vp, 0);
6001	error = vget(covered_vp, LK_SHARED, curthread);
6002	vdrop(covered_vp);
6003	if (error == 0) {
6004		error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred,
6005		    ap->a_buf, ap->a_buflen);
6006		vput(covered_vp);
6007	}
6008	vn_lock(vp, ltype | LK_RETRY);
6009	if ((vp->v_iflag & VI_DOOMED) != 0)
6010		error = SET_ERROR(ENOENT);
6011	return (error);
6012}
6013
#ifdef DIAGNOSTIC
/*
 * Diagnostic wrapper around vop_stdlock().
 *
 * After a successful lock taken without LK_NOWAIT, verifies that the
 * file system's z_teardown_lock is not held.  The check is skipped for
 * vnodes without a mount, doomed vnodes, vnodes without a znode, and
 * znodes flagged ZFS_XATTR.  The result of vop_stdlock() is always
 * returned unchanged.
 */
static int
zfs_lock(struct vop_lock1_args *ap)
/*
vop_lock1 {
	struct vnode *a_vp;
	int a_flags;
	char *file;
	int line;
};
*/
{
	vnode_t *vp;
	znode_t *zp;
	int err;

	err = vop_stdlock(ap);
	if (err != 0 || (ap->a_flags & LK_NOWAIT) != 0)
		return (err);

	vp = ap->a_vp;
	zp = vp->v_data;
	if (vp->v_mount == NULL || (vp->v_iflag & VI_DOOMED) != 0)
		return (err);
	if (zp == NULL || (zp->z_pflags & ZFS_XATTR) != 0)
		return (err);

	VERIFY(!RRM_LOCK_HELD(&zp->z_zfsvfs->z_teardown_lock));
	return (err);
}
#endif
6039
6040struct vop_vector zfs_vnodeops;
6041struct vop_vector zfs_fifoops;
6042struct vop_vector zfs_shareops;
6043
6044struct vop_vector zfs_vnodeops = {
6045	.vop_default =		&default_vnodeops,
6046	.vop_inactive =		zfs_freebsd_inactive,
6047	.vop_reclaim =		zfs_freebsd_reclaim,
6048	.vop_access =		zfs_freebsd_access,
6049	.vop_lookup =		zfs_cache_lookup,
6050	.vop_cachedlookup =	zfs_freebsd_lookup,
6051	.vop_getattr =		zfs_freebsd_getattr,
6052	.vop_setattr =		zfs_freebsd_setattr,
6053	.vop_create =		zfs_freebsd_create,
6054	.vop_mknod =		zfs_freebsd_create,
6055	.vop_mkdir =		zfs_freebsd_mkdir,
6056	.vop_readdir =		zfs_freebsd_readdir,
6057	.vop_fsync =		zfs_freebsd_fsync,
6058	.vop_open =		zfs_freebsd_open,
6059	.vop_close =		zfs_freebsd_close,
6060	.vop_rmdir =		zfs_freebsd_rmdir,
6061	.vop_ioctl =		zfs_freebsd_ioctl,
6062	.vop_link =		zfs_freebsd_link,
6063	.vop_symlink =		zfs_freebsd_symlink,
6064	.vop_readlink =		zfs_freebsd_readlink,
6065	.vop_read =		zfs_freebsd_read,
6066	.vop_write =		zfs_freebsd_write,
6067	.vop_remove =		zfs_freebsd_remove,
6068	.vop_rename =		zfs_freebsd_rename,
6069	.vop_pathconf =		zfs_freebsd_pathconf,
6070	.vop_bmap =		zfs_freebsd_bmap,
6071	.vop_fid =		zfs_freebsd_fid,
6072	.vop_getextattr =	zfs_getextattr,
6073	.vop_deleteextattr =	zfs_deleteextattr,
6074	.vop_setextattr =	zfs_setextattr,
6075	.vop_listextattr =	zfs_listextattr,
6076	.vop_getacl =		zfs_freebsd_getacl,
6077	.vop_setacl =		zfs_freebsd_setacl,
6078	.vop_aclcheck =		zfs_freebsd_aclcheck,
6079	.vop_getpages =		zfs_freebsd_getpages,
6080	.vop_putpages =		zfs_freebsd_putpages,
6081	.vop_vptocnp =		zfs_vptocnp,
6082#ifdef DIAGNOSTIC
6083	.vop_lock1 =		zfs_lock,
6084#endif
6085};
6086
6087struct vop_vector zfs_fifoops = {
6088	.vop_default =		&fifo_specops,
6089	.vop_fsync =		zfs_freebsd_fsync,
6090	.vop_access =		zfs_freebsd_access,
6091	.vop_getattr =		zfs_freebsd_getattr,
6092	.vop_inactive =		zfs_freebsd_inactive,
6093	.vop_read =		VOP_PANIC,
6094	.vop_reclaim =		zfs_freebsd_reclaim,
6095	.vop_setattr =		zfs_freebsd_setattr,
6096	.vop_write =		VOP_PANIC,
6097	.vop_pathconf = 	zfs_freebsd_fifo_pathconf,
6098	.vop_fid =		zfs_freebsd_fid,
6099	.vop_getacl =		zfs_freebsd_getacl,
6100	.vop_setacl =		zfs_freebsd_setacl,
6101	.vop_aclcheck =		zfs_freebsd_aclcheck,
6102};
6103
6104/*
6105 * special share hidden files vnode operations template
6106 */
6107struct vop_vector zfs_shareops = {
6108	.vop_default =		&default_vnodeops,
6109	.vop_access =		zfs_freebsd_access,
6110	.vop_inactive =		zfs_freebsd_inactive,
6111	.vop_reclaim =		zfs_freebsd_reclaim,
6112	.vop_fid =		zfs_freebsd_fid,
6113	.vop_pathconf =		zfs_freebsd_pathconf,
6114};
6115