/* zfs_vnops.c revision 304671 */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

/* Portions Copyright 2007 Jeremy Teo */
/* Portions Copyright 2010 Robert Milkowski */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/vfs.h>
#include <sys/vm.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/kmem.h>
#include <sys/taskq.h>
#include <sys/uio.h>
#include <sys/atomic.h>
#include <sys/namei.h>
#include <sys/mman.h>
#include <sys/cmn_err.h>
#include <sys/errno.h>
#include <sys/unistd.h>
#include <sys/zfs_dir.h>
#include <sys/zfs_ioctl.h>
#include <sys/fs/zfs.h>
#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/dbuf.h>
#include <sys/zap.h>
#include <sys/sa.h>
#include <sys/dirent.h>
#include <sys/policy.h>
#include <sys/sunddi.h>
#include <sys/filio.h>
#include <sys/sid.h>
#include <sys/zfs_ctldir.h>
#include <sys/zfs_fuid.h>
#include <sys/zfs_sa.h>
#include <sys/zfs_rlock.h>
#include <sys/extdirent.h>
#include <sys/kidmap.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/sched.h>
#include <sys/acl.h>
#include <vm/vm_param.h>

/*
 * Programming rules.
 *
 * Each vnode op performs some logical unit of work.  To do this, the ZPL must
 * properly lock its in-core state, create a DMU transaction, do the work,
 * record this work in the intent log (ZIL), commit the DMU transaction,
 * and wait for the intent log to commit if it is a synchronous operation.
 * Moreover, the vnode ops must work in both normal and log replay context.
 * The ordering of events is important to avoid deadlocks and references
 * to freed memory.  The example below illustrates the following Big Rules:
 *
 *  (1)	A check must be made in each zfs thread for a mounted file system.
 *	This is done, while avoiding races, using ZFS_ENTER(zfsvfs).
 *	A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
 *	must be checked with ZFS_VERIFY_ZP(zp).  Either of these macros
 *	can cause the calling function to return EIO.
 *
 *  (2)	VN_RELE() should always be the last thing except for zil_commit()
 *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
 *	First, if it's the last reference, the vnode/znode
 *	can be freed, so the zp may point to freed memory.  Second, the last
 *	reference will call zfs_zinactive(), which may induce a lot of work --
 *	pushing cached pages (which acquires range locks) and syncing out
 *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
 *	which could deadlock the system if you were already holding one.
 *	If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
 *
 *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
 *	as they can span dmu_tx_assign() calls.
 *
 *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
 *      dmu_tx_assign().  This is critical because we don't want to block
 *      while holding locks.
 *
 *	If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT.  This
 *	reduces lock contention and CPU usage when we must wait (note that if
 *	throughput is constrained by the storage, nearly every transaction
 *	must wait).
 *
 *      Note, in particular, that if a lock is sometimes acquired before
 *      the tx assigns, and sometimes after (e.g. z_lock), then failing
 *      to use a non-blocking assign can deadlock the system.  The scenario:
 *
 *	Thread A has grabbed a lock before calling dmu_tx_assign().
 *	Thread B is in an already-assigned tx, and blocks for this lock.
 *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
 *	forever, because the previous txg can't quiesce until B's tx commits.
 *
 *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
 *	then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
 *	calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT,
 *	to indicate that this operation has already called dmu_tx_wait().
 *	This will ensure that we don't retry forever, waiting a short bit
 *	each time.
 *
 *  (5)	If the operation succeeded, generate the intent log entry for it
 *	before dropping locks.  This ensures that the ordering of events
 *	in the intent log matches the order in which they actually occurred.
 *	During ZIL replay the zfs_log_* functions will update the sequence
 *	number to indicate the zil transaction has replayed.
 *
 *  (6)	At the end of each vnode op, the DMU tx must always commit,
 *	regardless of whether there were any errors.
 *
 *  (7)	After dropping all locks, invoke zil_commit(zilog, foid)
 *	to ensure that synchronous semantics are provided when necessary.
 *
 * In general, this is how things should be ordered in each vnode op:
 *
 *	ZFS_ENTER(zfsvfs);		// exit if unmounted
 * top:
 *	zfs_dirent_lookup(&dl, ...)	// lock directory entry (may VN_HOLD())
 *	rw_enter(...);			// grab any other locks you need
 *	tx = dmu_tx_create(...);	// get DMU tx
 *	dmu_tx_hold_*();		// hold each object you might modify
 *	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
 *	if (error) {
 *		rw_exit(...);		// drop locks
 *		zfs_dirent_unlock(dl);	// unlock directory entry
 *		VN_RELE(...);		// release held vnodes
 *		if (error == ERESTART) {
 *			waited = B_TRUE;
 *			dmu_tx_wait(tx);
 *			dmu_tx_abort(tx);
 *			goto top;
 *		}
 *		dmu_tx_abort(tx);	// abort DMU tx
 *		ZFS_EXIT(zfsvfs);	// finished in zfs
 *		return (error);		// really out of space
 *	}
 *	error = do_real_work();		// do whatever this VOP does
 *	if (error == 0)
 *		zfs_log_*(...);		// on success, make ZIL entry
 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
 *	rw_exit(...);			// drop locks
 *	zfs_dirent_unlock(dl);		// unlock directory entry
 *	VN_RELE(...);			// release held vnodes
 *	zil_commit(zilog, foid);	// synchronous when necessary
 *	ZFS_EXIT(zfsvfs);		// finished in zfs
 *	return (error);			// done, report error
 */

/* ARGSUSED */
static int
zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(*vpp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
	    ((flag & FAPPEND) == 0)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
		if (fs_vscan(*vpp, cr, 0) != 0) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EACCES));
		}
	}

	/* Keep a count of the synchronous opens in the znode */
	if (flag & (FSYNC | FDSYNC))
		atomic_inc_32(&zp->z_sync_cnt);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/* ARGSUSED */
static int
zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
    caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	/*
	 * Clean up any locks held by this process on the vp.
	 */
	cleanlocks(vp, ddi_get_pid(), 0);
	cleanshares(vp, ddi_get_pid());

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Decrement the synchronous opens in the znode */
	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
		atomic_dec_32(&zp->z_sync_cnt);

	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
		VERIFY(fs_vscan(vp, cr, 1) == 0);

	ZFS_EXIT(zfsvfs);
	return (0);
}

/*
 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
 */
static int
zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
{
	znode_t	*zp = VTOZ(vp);
	uint64_t noff = (uint64_t)*off; /* new offset */
	uint64_t file_sz;
	int error;
	boolean_t hole;

	file_sz = zp->z_size;
	if (noff >= file_sz) {
		return (SET_ERROR(ENXIO));
	}

	if (cmd == _FIO_SEEK_HOLE)
		hole = B_TRUE;
	else
		hole = B_FALSE;

	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);

	if (error == ESRCH)
		return (SET_ERROR(ENXIO));

	/*
	 * We could find a hole that begins after the logical end-of-file,
	 * because dmu_offset_next() only works on whole blocks.  If the
	 * EOF falls mid-block, then indicate that the "virtual hole"
	 * at the end of the file begins at the logical EOF, rather than
	 * at the end of the last block.
	 */
	if (noff > file_sz) {
		ASSERT(hole);
		noff = file_sz;
	}

	if (noff < *off)
		return (error);
	*off = noff;
	return (error);
}
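
/*
 * For illustration only (not part of the original source): a userland
 * caller reaches zfs_holey() through the _FIO_SEEK_HOLE/_FIO_SEEK_DATA
 * ioctl cases in zfs_ioctl() below, which back lseek(2)'s SEEK_HOLE and
 * SEEK_DATA.  A minimal sketch of walking every hole in a file:
 *
 *	off_t off = 0, hole, data;
 *	for (;;) {
 *		if ((hole = lseek(fd, off, SEEK_HOLE)) == -1)
 *			break;			// ENXIO: past EOF
 *		if ((data = lseek(fd, hole, SEEK_DATA)) == -1)
 *			data = lseek(fd, 0, SEEK_END);
 *		if (hole < data)
 *			printf("hole: [%jd, %jd)\n",
 *			    (intmax_t)hole, (intmax_t)data);
 *		off = data;
 *	}
 *
 * The "virtual hole" logic above means that if EOF falls mid-block, the
 * last hole reported begins at the logical EOF.
 */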

/* ARGSUSED */
static int
zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
    int *rvalp, caller_context_t *ct)
{
	offset_t off;
	offset_t ndata;
	dmu_object_info_t doi;
	int error;
	zfsvfs_t *zfsvfs;
	znode_t *zp;

	switch (com) {
	case _FIOFFS:
	{
		return (0);

		/*
		 * The following two ioctls are used by bfu.
		 * Fake them out; this is necessary to avoid bfu errors.
		 */
	}
	case _FIOGDIO:
	case _FIOSDIO:
	{
		return (0);
	}

	case _FIO_SEEK_DATA:
	case _FIO_SEEK_HOLE:
	{
#ifdef illumos
		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
			return (SET_ERROR(EFAULT));
#else
		off = *(offset_t *)data;
#endif
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/* offset parameter is in/out */
		error = zfs_holey(vp, com, &off);
		ZFS_EXIT(zfsvfs);
		if (error)
			return (error);
#ifdef illumos
		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
			return (SET_ERROR(EFAULT));
#else
		*(offset_t *)data = off;
#endif
		return (0);
	}
#ifdef illumos
	case _FIO_COUNT_FILLED:
	{
		/*
		 * _FIO_COUNT_FILLED adds a new ioctl command which
		 * exposes the number of filled blocks in a
		 * ZFS object.
		 */
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/*
		 * Wait for all dirty blocks for this object
		 * to get synced out to disk, and the DMU info
		 * updated.
		 */
		error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Retrieve fill count from DMU object.
		 */
		error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		ndata = doi.doi_fill_count;

		ZFS_EXIT(zfsvfs);
		if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
			return (SET_ERROR(EFAULT));
		return (0);
	}
#endif
	}
	return (SET_ERROR(ENOTTY));
}

static vm_page_t
page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
{
	vm_object_t obj;
	vm_page_t pp;
	int64_t end;

	/*
	 * At present vm_page_clear_dirty extends the cleared range to
	 * DEV_BSIZE-aligned boundaries if the range is not aligned.  As a
	 * result a DEV_BSIZE subrange with partially dirty data may get
	 * marked as clean.  It may happen that all DEV_BSIZE subranges are
	 * marked clean and thus the whole page would be considered clean
	 * despite having some dirty data.  For this reason we should shrink
	 * the range to DEV_BSIZE-aligned boundaries before calling
	 * vm_page_clear_dirty.
	 */
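	/*
	 * For example (an illustration, not from the original): with
	 * DEV_BSIZE == 512, off == 100 and nbytes == 900,
	 * end = rounddown2(1000, 512) == 512 and
	 * off = roundup2(100, 512) == 512, so nbytes becomes 0 and nothing
	 * is cleared -- only fully covered DEV_BSIZE subranges (here, none)
	 * are ever marked clean.
	 */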
	end = rounddown2(off + nbytes, DEV_BSIZE);
	off = roundup2(off, DEV_BSIZE);
	nbytes = end - off;

	obj = vp->v_object;
	zfs_vmobject_assert_wlocked(obj);

	for (;;) {
		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
		    pp->valid) {
			if (vm_page_xbusied(pp)) {
				/*
				 * Reference the page before unlocking and
				 * sleeping so that the page daemon is less
				 * likely to reclaim it.
				 */
				vm_page_reference(pp);
				vm_page_lock(pp);
				zfs_vmobject_wunlock(obj);
				vm_page_busy_sleep(pp, "zfsmwb");
				zfs_vmobject_wlock(obj);
				continue;
			}
			vm_page_sbusy(pp);
		} else if (pp == NULL) {
			pp = vm_page_alloc(obj, OFF_TO_IDX(start),
			    VM_ALLOC_SYSTEM | VM_ALLOC_IFCACHED |
			    VM_ALLOC_SBUSY);
		} else {
			ASSERT(pp != NULL && !pp->valid);
			pp = NULL;
		}

		if (pp != NULL) {
			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
			vm_object_pip_add(obj, 1);
			pmap_remove_write(pp);
			if (nbytes != 0)
				vm_page_clear_dirty(pp, off, nbytes);
		}
		break;
	}
	return (pp);
}

static void
page_unbusy(vm_page_t pp)
{

	vm_page_sunbusy(pp);
	vm_object_pip_subtract(pp->object, 1);
}

static vm_page_t
page_hold(vnode_t *vp, int64_t start)
{
	vm_object_t obj;
	vm_page_t pp;

	obj = vp->v_object;
	zfs_vmobject_assert_wlocked(obj);

	for (;;) {
		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
		    pp->valid) {
			if (vm_page_xbusied(pp)) {
				/*
				 * Reference the page before unlocking and
				 * sleeping so that the page daemon is less
				 * likely to reclaim it.
				 */
				vm_page_reference(pp);
				vm_page_lock(pp);
				zfs_vmobject_wunlock(obj);
				vm_page_busy_sleep(pp, "zfsmwb");
				zfs_vmobject_wlock(obj);
				continue;
			}

			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
			vm_page_lock(pp);
			vm_page_hold(pp);
			vm_page_unlock(pp);

		} else
			pp = NULL;
		break;
	}
	return (pp);
}

static void
page_unhold(vm_page_t pp)
{

	vm_page_lock(pp);
	vm_page_unhold(pp);
	vm_page_unlock(pp);
}

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Write:	If we find a memory mapped page, we write to *both*
 *		the page and the dmu buffer.
 */
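/*
 * The loop below splits [start, start + len) into page-sized pieces.
 * Worked example (an illustration): with PAGESIZE == 4096, start == 5000
 * and len == 6000, off == 904 and the pieces copied are [5000, 8192)
 * (nbytes == 3192) and then [8192, 11000) (nbytes == 2808).
 */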
static void
update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
    int segflg, dmu_tx_t *tx)
{
	vm_object_t obj;
	struct sf_buf *sf;
	caddr_t va;
	int off;

	ASSERT(segflg != UIO_NOCOPY);
	ASSERT(vp->v_mount != NULL);
	obj = vp->v_object;
	ASSERT(obj != NULL);

	off = start & PAGEOFFSET;
	zfs_vmobject_wlock(obj);
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		vm_page_t pp;
		int nbytes = imin(PAGESIZE - off, len);

		if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
			zfs_vmobject_wunlock(obj);

			va = zfs_map_page(pp, &sf);
			(void) dmu_read(os, oid, start + off, nbytes,
			    va + off, DMU_READ_PREFETCH);
			zfs_unmap_page(sf);

			zfs_vmobject_wlock(obj);
			page_unbusy(pp);
		}
		len -= nbytes;
		off = 0;
	}
	vm_object_pip_wakeupn(obj, 0);
	zfs_vmobject_wunlock(obj);
}

/*
 * Read with UIO_NOCOPY flag means that sendfile(2) requests
 * ZFS to populate a range of page cache pages with data.
 *
 * NOTE: this function could be optimized to pre-allocate
 * all pages in advance, drain exclusive busy on all of them,
 * map them into contiguous KVA region and populate them
 * in one single dmu_read() call.
 */
static int
mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
{
	znode_t *zp = VTOZ(vp);
	objset_t *os = zp->z_zfsvfs->z_os;
	struct sf_buf *sf;
	vm_object_t obj;
	vm_page_t pp;
	int64_t start;
	caddr_t va;
	int len = nbytes;
	int off;
	int error = 0;

	ASSERT(uio->uio_segflg == UIO_NOCOPY);
	ASSERT(vp->v_mount != NULL);
	obj = vp->v_object;
	ASSERT(obj != NULL);
	ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);

	zfs_vmobject_wlock(obj);
	for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
		int bytes = MIN(PAGESIZE, len);

		pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY |
		    VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
		if (pp->valid == 0) {
			zfs_vmobject_wunlock(obj);
			va = zfs_map_page(pp, &sf);
			error = dmu_read(os, zp->z_id, start, bytes, va,
			    DMU_READ_PREFETCH);
			if (bytes != PAGESIZE && error == 0)
				bzero(va + bytes, PAGESIZE - bytes);
			zfs_unmap_page(sf);
			zfs_vmobject_wlock(obj);
			vm_page_sunbusy(pp);
			vm_page_lock(pp);
			if (error) {
				if (pp->wire_count == 0 && pp->valid == 0 &&
				    !vm_page_busied(pp))
					vm_page_free(pp);
			} else {
				pp->valid = VM_PAGE_BITS_ALL;
				vm_page_activate(pp);
			}
			vm_page_unlock(pp);
		} else {
			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
			vm_page_sunbusy(pp);
		}
		if (error)
			break;
		uio->uio_resid -= bytes;
		uio->uio_offset += bytes;
		len -= bytes;
	}
	zfs_vmobject_wunlock(obj);
	return (error);
}

/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Read:	We "read" preferentially from memory mapped pages,
 *		otherwise we fall back to the dmu buffer.
 *
 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 *	 the file is memory mapped.
 */
static int
mappedread(vnode_t *vp, int nbytes, uio_t *uio)
{
	znode_t *zp = VTOZ(vp);
	vm_object_t obj;
	int64_t start;
	caddr_t va;
	int len = nbytes;
	int off;
	int error = 0;

	ASSERT(vp->v_mount != NULL);
	obj = vp->v_object;
	ASSERT(obj != NULL);

	start = uio->uio_loffset;
	off = start & PAGEOFFSET;
	zfs_vmobject_wlock(obj);
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		vm_page_t pp;
		uint64_t bytes = MIN(PAGESIZE - off, len);

		if (pp = page_hold(vp, start)) {
			struct sf_buf *sf;
			caddr_t va;

			zfs_vmobject_wunlock(obj);
			va = zfs_map_page(pp, &sf);
#ifdef illumos
			error = uiomove(va + off, bytes, UIO_READ, uio);
#else
			error = vn_io_fault_uiomove(va + off, bytes, uio);
#endif
			zfs_unmap_page(sf);
			zfs_vmobject_wlock(obj);
			page_unhold(pp);
		} else {
			zfs_vmobject_wunlock(obj);
			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, bytes);
			zfs_vmobject_wlock(obj);
		}
		len -= bytes;
		off = 0;
		if (error)
			break;
	}
	zfs_vmobject_wunlock(obj);
	return (error);
}

offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */

/*
 * Read bytes from specified file into supplied buffer.
 *
 *	IN:	vp	- vnode of file to be read from.
 *		uio	- structure supplying read location, range info,
 *			  and return buffer.
 *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *
 *	OUT:	uio	- updated offset and range, buffer filled.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Side Effects:
 *	vp - atime updated if byte count > 0
 */
/* ARGSUSED */
static int
zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	ssize_t		n, nbytes;
	int		error = 0;
	rl_t		*rl;
	xuio_t		*xuio = NULL;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EACCES));
	}

	/*
	 * Validate file offset
	 */
	if (uio->uio_loffset < (offset_t)0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Fasttrack empty reads
	 */
	if (uio->uio_resid == 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	/*
	 * Check for mandatory locks
	 */
	if (MANDMODE(zp->z_mode)) {
		if (error = chklock(vp, FREAD,
		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * If we're in FRSYNC mode, sync out this znode before reading it.
	 */
	if (zfsvfs->z_log &&
	    (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
		zil_commit(zfsvfs->z_log, zp->z_id);

	/*
	 * Lock the range against changes.
	 */
	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);

	/*
	 * If we are reading past end-of-file we can skip
	 * to the end; but we might still need to set atime.
	 */
	if (uio->uio_loffset >= zp->z_size) {
		error = 0;
		goto out;
	}

	ASSERT(uio->uio_loffset < zp->z_size);
	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);

#ifdef illumos
	if ((uio->uio_extflg == UIO_XUIO) &&
	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
		int nblk;
		int blksz = zp->z_blksz;
		uint64_t offset = uio->uio_loffset;

		xuio = (xuio_t *)uio;
		if ((ISP2(blksz))) {
			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
			    blksz)) / blksz;
		} else {
			ASSERT(offset + n <= blksz);
			nblk = 1;
		}
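		/*
		 * Worked example (an illustration, not from the original):
		 * with blksz == 128K, offset == 100K and n == 200K,
		 * P2ROUNDUP(300K, 128K) == 384K and P2ALIGN(100K, 128K) == 0,
		 * so nblk == 3 -- the three 128K blocks the request touches.
		 */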
		(void) dmu_xuio_init(xuio, nblk);

		if (vn_has_cached_data(vp)) {
			/*
			 * For simplicity, we always allocate a full buffer
			 * even if we only expect to read a portion of a block.
			 */
			while (--nblk >= 0) {
				(void) dmu_xuio_add(xuio,
				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
				    blksz), 0, blksz);
			}
		}
	}
#endif	/* illumos */

	while (n > 0) {
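		/*
		 * Each pass covers at most up to the next zfs_read_chunk_size
		 * boundary.  E.g. (an illustration) with the default 1MB
		 * chunk and uio_loffset == 1536K, P2PHASE() == 512K, so
		 * nbytes is capped at 512K and the next pass starts
		 * chunk-aligned.
		 */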
		nbytes = MIN(n, zfs_read_chunk_size -
		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));

#ifdef __FreeBSD__
		if (uio->uio_segflg == UIO_NOCOPY)
			error = mappedread_sf(vp, nbytes, uio);
		else
#endif /* __FreeBSD__ */
		if (vn_has_cached_data(vp)) {
			error = mappedread(vp, nbytes, uio);
		} else {
			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes);
		}
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}

		n -= nbytes;
	}
out:
	zfs_range_unlock(rl);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);
	return (error);
}

/*
 * Write the bytes to a file.
 *
 *	IN:	vp	- vnode of file to be written to.
 *		uio	- structure supplying write location, range info,
 *			  and data buffer.
 *		ioflag	- FAPPEND, FSYNC, and/or FDSYNC.  FAPPEND is
 *			  set if in append mode.
 *		cr	- credentials of caller.
 *		ct	- caller context (NFS/CIFS fem monitor only)
 *
 *	OUT:	uio	- updated offset and range.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - ctime|mtime updated if byte count > 0
 */

/* ARGSUSED */
static int
zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	rlim64_t	limit = MAXOFFSET_T;
	ssize_t		start_resid = uio->uio_resid;
	ssize_t		tx_bytes;
	uint64_t	end_size;
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	zilog_t		*zilog;
	offset_t	woff;
	ssize_t		n, nbytes;
	rl_t		*rl;
	int		max_blksz = zfsvfs->z_max_blksz;
	int		error = 0;
	arc_buf_t	*abuf;
	iovec_t		*aiov = NULL;
	xuio_t		*xuio = NULL;
	int		i_iov = 0;
	int		iovcnt = uio->uio_iovcnt;
	iovec_t		*iovp = uio->uio_iov;
	int		write_eof;
	int		count = 0;
	sa_bulk_attr_t	bulk[4];
	uint64_t	mtime[2], ctime[2];

	/*
	 * Fasttrack empty write
	 */
	n = start_resid;
	if (n == 0)
		return (0);

	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
		limit = MAXOFFSET_T;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, 8);

	/*
	 * In the case where vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. for
	 * snapshots), our callers might not be able to properly detect
	 * that we are read-only, so check it explicitly here.
	 */
	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EROFS));
	}

	/*
	 * If immutable or not appending then return EPERM
	 */
	if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
	    (uio->uio_loffset < zp->z_size))) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	zilog = zfsvfs->z_log;

	/*
	 * Validate file offset
	 */
	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
	if (woff < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Check for mandatory locks before calling zfs_range_lock()
	 * in order to prevent a deadlock with locks set via fcntl().
	 */
	if (MANDMODE((mode_t)zp->z_mode) &&
	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

#ifdef illumos
	/*
	 * Pre-fault the pages to ensure slow (e.g. NFS) pages
	 * don't hold up the txg.
	 * Skip this if the uio contains a loaned arc_buf.
	 */
	if ((uio->uio_extflg == UIO_XUIO) &&
	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
		xuio = (xuio_t *)uio;
	else
		uio_prefaultpages(MIN(n, max_blksz), uio);
#endif

	/*
	 * If in append mode, set the io offset pointer to eof.
	 */
	if (ioflag & FAPPEND) {
		/*
		 * Obtain an appending range lock to guarantee file append
		 * semantics.  We reset the write offset once we have the lock.
		 */
		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
		woff = rl->r_off;
		if (rl->r_len == UINT64_MAX) {
			/*
			 * We overlocked the file because this write will cause
			 * the file block size to increase.
			 * Note that zp_size cannot change with this lock held.
			 */
			woff = zp->z_size;
		}
		uio->uio_loffset = woff;
	} else {
		/*
		 * Note that if the file block size will change as a result of
		 * this write, then this range lock will lock the entire file
		 * so that we can re-write the block safely.
		 */
		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
	}

	if (vn_rlimit_fsize(vp, uio, uio->uio_td)) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (EFBIG);
	}

	if (woff >= limit) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EFBIG));
	}

	if ((woff + n) > limit || woff > (limit - n))
		n = limit - woff;

	/* Will this write extend the file length? */
	write_eof = (woff + n > zp->z_size);

	end_size = MAX(zp->z_size, woff + n);

	/*
	 * Write the file in reasonable size chunks.  Each chunk is written
	 * in a separate transaction; this keeps the intent log records small
	 * and allows us to do more fine-grained space accounting.
	 */
	while (n > 0) {
		abuf = NULL;
		woff = uio->uio_loffset;
		if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
		    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
			if (abuf != NULL)
				dmu_return_arcbuf(abuf);
			error = SET_ERROR(EDQUOT);
			break;
		}

		if (xuio && abuf == NULL) {
			ASSERT(i_iov < iovcnt);
			aiov = &iovp[i_iov];
			abuf = dmu_xuio_arcbuf(xuio, i_iov);
			dmu_xuio_clear(xuio, i_iov);
			DTRACE_PROBE3(zfs_cp_write, int, i_iov,
			    iovec_t *, aiov, arc_buf_t *, abuf);
			ASSERT((aiov->iov_base == abuf->b_data) ||
			    ((char *)aiov->iov_base - (char *)abuf->b_data +
			    aiov->iov_len == arc_buf_size(abuf)));
			i_iov++;
		} else if (abuf == NULL && n >= max_blksz &&
		    woff >= zp->z_size &&
		    P2PHASE(woff, max_blksz) == 0 &&
		    zp->z_blksz == max_blksz) {
			/*
			 * This write covers a full block.  "Borrow" a buffer
			 * from the dmu so that we can fill it before we enter
			 * a transaction.  This avoids the possibility of
			 * holding up the transaction if the data copy hangs
			 * up on a pagefault (e.g., from an NFS server mapping).
			 */
#ifdef illumos
			size_t cbytes;
#endif

			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
			    max_blksz);
			ASSERT(abuf != NULL);
			ASSERT(arc_buf_size(abuf) == max_blksz);
#ifdef illumos
			if (error = uiocopy(abuf->b_data, max_blksz,
			    UIO_WRITE, uio, &cbytes)) {
				dmu_return_arcbuf(abuf);
				break;
			}
			ASSERT(cbytes == max_blksz);
#else
			ssize_t resid = uio->uio_resid;
			error = vn_io_fault_uiomove(abuf->b_data, max_blksz, uio);
			if (error != 0) {
				uio->uio_offset -= resid - uio->uio_resid;
				uio->uio_resid = resid;
				dmu_return_arcbuf(abuf);
				break;
			}
#endif
		}

		/*
		 * Start a transaction.
		 */
		tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
		zfs_sa_upgrade_txholds(tx, zp);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
			if (abuf != NULL)
				dmu_return_arcbuf(abuf);
			break;
		}

		/*
		 * If zfs_range_lock() over-locked we grow the blocksize
		 * and then reduce the lock range.  This will only happen
		 * on the first iteration since zfs_range_reduce() will
		 * shrink down r_len to the appropriate size.
		 */
		if (rl->r_len == UINT64_MAX) {
			uint64_t new_blksz;

			if (zp->z_blksz > max_blksz) {
				/*
				 * File's blocksize is already larger than the
				 * "recordsize" property.  Only let it grow to
				 * the next power of 2.
				 */
				ASSERT(!ISP2(zp->z_blksz));
				new_blksz = MIN(end_size,
				    1 << highbit64(zp->z_blksz));
			} else {
				new_blksz = MIN(end_size, max_blksz);
			}
			zfs_grow_blocksize(zp, new_blksz, tx);
			zfs_range_reduce(rl, woff, n);
		}
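		/*
		 * Example of the non-power-of-2 case above (an illustration):
		 * a file written while "recordsize" was 128K may carry a
		 * single 96K block; if recordsize has since been lowered to
		 * 64K, zp->z_blksz (96K) > max_blksz (64K), and the block may
		 * only grow to the next power of 2:
		 * 1 << highbit64(96K) == 128K.
		 */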

		/*
		 * XXX - should we really limit each write to z_max_blksz?
		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
		 */
		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));

		if (woff + nbytes > zp->z_size)
			vnode_pager_setsize(vp, woff + nbytes);

		if (abuf == NULL) {
			tx_bytes = uio->uio_resid;
			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes, tx);
			tx_bytes -= uio->uio_resid;
		} else {
			tx_bytes = nbytes;
			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
			/*
			 * If this is not a full block write, but we are
			 * extending the file past EOF and this data starts
			 * block-aligned, use assign_arcbuf().  Otherwise,
			 * write via dmu_write().
			 */
			if (tx_bytes < max_blksz && (!write_eof ||
			    aiov->iov_base != abuf->b_data)) {
				ASSERT(xuio);
				dmu_write(zfsvfs->z_os, zp->z_id, woff,
				    aiov->iov_len, aiov->iov_base, tx);
				dmu_return_arcbuf(abuf);
				xuio_stat_wbuf_copied();
			} else {
				ASSERT(xuio || tx_bytes == max_blksz);
				dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
				    woff, abuf, tx);
			}
#ifdef illumos
			ASSERT(tx_bytes <= uio->uio_resid);
			uioskip(uio, tx_bytes);
#endif
		}
		if (tx_bytes && vn_has_cached_data(vp)) {
			update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
			    zp->z_id, uio->uio_segflg, tx);
		}

		/*
		 * If we made no progress, we're done.  If we made even
		 * partial progress, update the znode and ZIL accordingly.
		 */
		if (tx_bytes == 0) {
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
			    (void *)&zp->z_size, sizeof (uint64_t), tx);
			dmu_tx_commit(tx);
			ASSERT(error != 0);
			break;
		}

		/*
		 * Clear Set-UID/Set-GID bits on successful write if not
		 * privileged and at least one of the execute bits is set.
		 *
		 * It would be nice to do this after all writes have
		 * been done, but that would still expose the ISUID/ISGID
		 * to another app after the partial write is committed.
		 *
		 * Note: we don't call zfs_fuid_map_id() here because
		 * user 0 is not an ephemeral uid.
		 */
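		/*
		 * The mask below covers the three execute bits:
		 * S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6) ==
		 * S_IXUSR | S_IXGRP | S_IXOTH (0100 | 0010 | 0001 octal).
		 */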
		mutex_enter(&zp->z_acl_lock);
		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
		    (S_IXUSR >> 6))) != 0 &&
		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
		    secpolicy_vnode_setid_retain(vp, cr,
		    (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
			uint64_t newmode;
			zp->z_mode &= ~(S_ISUID | S_ISGID);
			newmode = zp->z_mode;
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
			    (void *)&newmode, sizeof (uint64_t), tx);
		}
		mutex_exit(&zp->z_acl_lock);

		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
		    B_TRUE);

		/*
		 * Update the file size (zp_size) if it has changed;
		 * account for possible concurrent updates.
		 */
		while ((end_size = zp->z_size) < uio->uio_loffset) {
			(void) atomic_cas_64(&zp->z_size, end_size,
			    uio->uio_loffset);
#ifdef illumos
			ASSERT(error == 0);
#else
			ASSERT(error == 0 || error == EFAULT);
#endif
		}
		/*
		 * If we are replaying and eof is non-zero then force
		 * the file size to the specified eof. Note, there's no
		 * concurrency during replay.
		 */
		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
			zp->z_size = zfsvfs->z_replay_eof;

		if (error == 0)
			error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
		else
			(void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);

		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
		dmu_tx_commit(tx);

		if (error != 0)
			break;
		ASSERT(tx_bytes == nbytes);
		n -= nbytes;

#ifdef illumos
		if (!xuio && n > 0)
			uio_prefaultpages(MIN(n, max_blksz), uio);
#endif
	}

	zfs_range_unlock(rl);

	/*
	 * If we're in replay mode, or we made no progress, return error.
	 * Otherwise, it's at least a partial write, so it's successful.
	 */
	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

#ifdef __FreeBSD__
	/*
	 * EFAULT means that at least one page of the source buffer was not
	 * available.  VFS will re-try remaining I/O upon this error.
	 */
	if (error == EFAULT) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}
#endif

	if (ioflag & (FSYNC | FDSYNC) ||
	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, zp->z_id);

	ZFS_EXIT(zfsvfs);
	return (0);
}

void
zfs_get_done(zgd_t *zgd, int error)
{
	znode_t *zp = zgd->zgd_private;
	objset_t *os = zp->z_zfsvfs->z_os;

	if (zgd->zgd_db)
		dmu_buf_rele(zgd->zgd_db, zgd);

	zfs_range_unlock(zgd->zgd_rl);

	/*
	 * Release the vnode asynchronously as we currently have the
	 * txg stopped from syncing.
	 */
	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));

	if (error == 0 && zgd->zgd_bp)
		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);

	kmem_free(zgd, sizeof (zgd_t));
}

#ifdef DEBUG
static int zil_fault_io = 0;
#endif

/*
 * Get data to generate a TX_WRITE intent log record.
 */
int
zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
	zfsvfs_t *zfsvfs = arg;
	objset_t *os = zfsvfs->z_os;
	znode_t *zp;
	uint64_t object = lr->lr_foid;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;
	blkptr_t *bp = &lr->lr_blkptr;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error = 0;

	ASSERT(zio != NULL);
	ASSERT(size != 0);

	/*
	 * Nothing to do if the file has been removed
	 */
	if (zfs_zget(zfsvfs, object, &zp) != 0)
		return (SET_ERROR(ENOENT));
	if (zp->z_unlinked) {
		/*
		 * Release the vnode asynchronously as we currently have the
		 * txg stopped from syncing.
		 */
		VN_RELE_ASYNC(ZTOV(zp),
		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
		return (SET_ERROR(ENOENT));
	}

	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
	zgd->zgd_zilog = zfsvfs->z_log;
	zgd->zgd_private = zp;

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
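	/*
	 * The choice between the two flavors was already made when the
	 * TX_WRITE record was created (see zfs_log_write()); here it shows
	 * up as whether the ZIL passed us a buffer to copy the data into
	 * (immediate) or not (indirect).
	 */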
	if (buf != NULL) { /* immediate write */
		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
		/* test for truncation needs to be done while range locked */
		if (offset >= zp->z_size) {
			error = SET_ERROR(ENOENT);
		} else {
			error = dmu_read(os, object, offset, size, buf,
			    DMU_READ_NO_PREFETCH);
		}
		ASSERT(error == 0 || error == ENOENT);
	} else { /* indirect write */
		/*
		 * Have to lock the whole block to ensure that when it's
		 * written out and its checksum is being calculated
		 * no one can change the data.  We need to re-check the
		 * blocksize after we get the lock in case it's changed!
		 */
		for (;;) {
			uint64_t blkoff;
			size = zp->z_blksz;
			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
			offset -= blkoff;
			zgd->zgd_rl = zfs_range_lock(zp, offset, size,
			    RL_READER);
			if (zp->z_blksz == size)
				break;
			offset += blkoff;
			zfs_range_unlock(zgd->zgd_rl);
		}
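		/*
		 * Worked example (an illustration): with offset == 70000 and
		 * a 64K (65536) block size, blkoff == P2PHASE(70000, 65536)
		 * == 4464, so the whole block [65536, 131072) gets locked;
		 * if the block size changed under us, offset is restored
		 * and we retry.
		 */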
		/* test for truncation needs to be done while range locked */
		if (lr->lr_offset >= zp->z_size)
			error = SET_ERROR(ENOENT);
#ifdef DEBUG
		if (zil_fault_io) {
			error = SET_ERROR(EIO);
			zil_fault_io = 0;
		}
#endif
		if (error == 0)
			error = dmu_buf_hold(os, object, offset, zgd, &db,
			    DMU_READ_NO_PREFETCH);

		if (error == 0) {
			blkptr_t *obp = dmu_buf_get_blkptr(db);
			if (obp) {
				ASSERT(BP_IS_HOLE(bp));
				*bp = *obp;
			}

			zgd->zgd_db = db;
			zgd->zgd_bp = bp;

			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    zfs_get_done, zgd);
			ASSERT(error || lr->lr_length <= zp->z_blksz);

			/*
			 * On success, we need to wait for the write I/O
			 * initiated by dmu_sync() to complete before we can
			 * release this dbuf.  We will finish everything up
			 * in the zfs_get_done() callback.
			 */
			if (error == 0)
				return (0);

			if (error == EALREADY) {
				lr->lr_common.lrc_txtype = TX_WRITE2;
				error = 0;
			}
		}
	}

	zfs_get_done(zgd, error);

	return (error);
}

/*ARGSUSED*/
static int
zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (flag & V_ACE_MASK)
		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
	else
		error = zfs_zaccess_rwx(zp, mode, flag, cr);

	ZFS_EXIT(zfsvfs);
	return (error);
}

static int
zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
{
	int error;

	*vpp = arg;
	error = vn_lock(*vpp, lkflags);
	if (error != 0)
		vrele(*vpp);
	return (error);
}

static int
zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
{
	znode_t *zdp = VTOZ(dvp);
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
	int error;
	int ltype;

	ASSERT_VOP_LOCKED(dvp, __func__);
#ifdef DIAGNOSTIC
	ASSERT(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
#endif

	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
		ASSERT3P(dvp, ==, vp);
		vref(dvp);
		ltype = lkflags & LK_TYPE_MASK;
		if (ltype != VOP_ISLOCKED(dvp)) {
			if (ltype == LK_EXCLUSIVE)
				vn_lock(dvp, LK_UPGRADE | LK_RETRY);
			else /* if (ltype == LK_SHARED) */
				vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);

			/*
			 * Relocking for the "." case could leave us with
			 * a reclaimed vnode.
			 */
			if (dvp->v_iflag & VI_DOOMED) {
				vrele(dvp);
				return (SET_ERROR(ENOENT));
			}
		}
		return (0);
	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
		/*
		 * Note that in this case, dvp is the child vnode, and we
		 * are looking up the parent vnode - exactly reverse from
		 * normal operation.  Unlocking dvp requires some rather
		 * tricky unlock/relock dance to prevent mp from being freed;
		 * use vn_vget_ino_gen() which takes care of all that.
		 *
		 * XXX Note that there is a time window when both vnodes are
		 * unlocked.  It is possible, although highly unlikely, that
		 * during that window the parent-child relationship between
		 * the vnodes may change, for example, get reversed.
		 * In that case we would have a wrong lock order for the vnodes.
		 * All other filesystems seem to ignore this problem, so we
		 * do the same here.
		 * A potential solution could be implemented as follows:
		 * - using LK_NOWAIT when locking the second vnode and retrying
		 *   if necessary
		 * - checking that the parent-child relationship still holds
		 *   after locking both vnodes and retrying if it doesn't
		 */
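		/*
		 * A minimal sketch of the first idea above (not implemented
		 * here; an assumption of how it could look):
		 *
		 *	while ((error = vn_lock(vp, lkflags | LK_NOWAIT)) ==
		 *	    EBUSY) {
		 *		// back off, take both locks in a safe order,
		 *		// then re-verify the parent-child relationship
		 *	}
		 */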
		error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
		return (error);
	} else {
		error = vn_lock(vp, lkflags);
		if (error != 0)
			vrele(vp);
		return (error);
	}
}

/*
 * Lookup an entry in a directory, or an extended attribute directory.
 * If it exists, return a held vnode reference for it.
 *
 *	IN:	dvp	- vnode of directory to search.
 *		nm	- name of entry to lookup.
 *		pnp	- full pathname to lookup [UNUSED].
 *		flags	- LOOKUP_XATTR set if looking for an attribute.
 *		rdir	- root directory vnode [UNUSED].
 *		cr	- credentials of caller.
 *		ct	- caller context
 *
 *	OUT:	vpp	- vnode of located entry, NULL if not found.
 *
 *	RETURN:	0 on success, error code on failure.
 *
 * Timestamps:
 *	NA
 */
/* ARGSUSED */
static int
zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
    int nameiop, cred_t *cr, kthread_t *td, int flags)
{
	znode_t *zdp = VTOZ(dvp);
	znode_t *zp;
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
	int	error = 0;

	/* fast path (should be redundant with vfs namecache) */
	if (!(flags & LOOKUP_XATTR)) {
		if (dvp->v_type != VDIR) {
			return (SET_ERROR(ENOTDIR));
		} else if (zdp->z_sa_hdl == NULL) {
			return (SET_ERROR(EIO));
		}
	}

	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zdp);

	*vpp = NULL;

	if (flags & LOOKUP_XATTR) {
#ifdef TODO
		/*
		 * If the xattr property is off, refuse the lookup request.
		 */
		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}
#endif

		/*
		 * We don't allow recursive attributes.
		 * Maybe someday we will.
		 */
		if (zdp->z_pflags & ZFS_XATTR) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}

		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Do we have permission to get into attribute directory?
		 */
		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
		    B_FALSE, cr)) {
			vrele(*vpp);
			*vpp = NULL;
		}

		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Check accessibility of directory.
	 */
	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	/*
	 * First handle the special cases.
	 */
	if ((cnp->cn_flags & ISDOTDOT) != 0) {
		/*
		 * If we are a snapshot mounted under .zfs, return
		 * the vp for the snapshot directory.
		 */
		if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
			error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
			    "snapshot", vpp, NULL, 0, NULL, kcred,
			    NULL, NULL, NULL);
			ZFS_EXIT(zfsvfs);
			if (error == 0) {
				error = zfs_lookup_lock(dvp, *vpp, nm,
				    cnp->cn_lkflags);
			}
			goto out;
		}
	}
	if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
		error = 0;
		if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
			error = SET_ERROR(ENOTSUP);
		else
			*vpp = zfsctl_root(zdp);
		ZFS_EXIT(zfsvfs);
		if (error == 0)
			error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
		goto out;
	}

	/*
	 * The loop retries the lookup if the parent-child relationship
	 * changes while we work through the dot-dot locking complexities.
	 */
1645	for (;;) {
1646		uint64_t parent;
1647
1648		error = zfs_dirlook(zdp, nm, &zp);
1649		if (error == 0)
1650			*vpp = ZTOV(zp);
1651
1652		ZFS_EXIT(zfsvfs);
1653		if (error != 0)
1654			break;
1655
1656		error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
1657		if (error != 0) {
1658			/*
1659			 * If we've got a locking error, then the vnode
1660			 * got reclaimed because of a force unmount.
1661			 * We never enter doomed vnodes into the name cache.
1662			 */
1663			*vpp = NULL;
1664			return (error);
1665		}
1666
1667		if ((cnp->cn_flags & ISDOTDOT) == 0)
1668			break;
1669
1670		ZFS_ENTER(zfsvfs);
1671		if (zdp->z_sa_hdl == NULL) {
1672			error = SET_ERROR(EIO);
1673		} else {
1674			error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
1675			    &parent, sizeof (parent));
1676		}
1677		if (error != 0) {
1678			ZFS_EXIT(zfsvfs);
1679			vput(ZTOV(zp));
1680			break;
1681		}
1682		if (zp->z_id == parent) {
1683			ZFS_EXIT(zfsvfs);
1684			break;
1685		}
1686		vput(ZTOV(zp));
1687	}
1688
1689out:
1690	if (error != 0)
1691		*vpp = NULL;
1692
1693	/* Translate errors and add SAVENAME when needed. */
1694	if (cnp->cn_flags & ISLASTCN) {
1695		switch (nameiop) {
1696		case CREATE:
1697		case RENAME:
1698			if (error == ENOENT) {
1699				error = EJUSTRETURN;
1700				cnp->cn_flags |= SAVENAME;
1701				break;
1702			}
1703			/* FALLTHROUGH */
1704		case DELETE:
1705			if (error == 0)
1706				cnp->cn_flags |= SAVENAME;
1707			break;
1708		}
1709	}
1710
1711	/* Insert name into cache (as non-existent) if appropriate. */
1712	if (zfsvfs->z_use_namecache &&
1713	    error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
1714		cache_enter(dvp, NULL, cnp);
1715
1716	/* Insert name into cache if appropriate. */
1717	if (zfsvfs->z_use_namecache &&
1718	    error == 0 && (cnp->cn_flags & MAKEENTRY)) {
1719		if (!(cnp->cn_flags & ISLASTCN) ||
1720		    (nameiop != DELETE && nameiop != RENAME)) {
1721			cache_enter(dvp, *vpp, cnp);
1722		}
1723	}
1724
1725	return (error);
1726}
1727
1728/*
1729 * Attempt to create a new entry in a directory.  If the entry
1730 * already exists, truncate the file if permissible, else return
1731 * an error.  Return the vp of the created or trunc'd file.
1732 *
1733 *	IN:	dvp	- vnode of directory to put new file entry in.
1734 *		name	- name of new file entry.
1735 *		vap	- attributes of new file.
1736 *		excl	- flag indicating exclusive or non-exclusive mode.
1737 *		mode	- mode to open file with.
1738 *		cr	- credentials of caller.
1739 *		flag	- large file flag [UNUSED].
1740 *		ct	- caller context
1741 *		vsecp	- ACL to be set
1742 *
1743 *	OUT:	vpp	- vnode of created or trunc'd entry.
1744 *
1745 *	RETURN:	0 on success, error code on failure.
1746 *
1747 * Timestamps:
1748 *	dvp - ctime|mtime updated if new entry created
1749 *	 vp - ctime|mtime always, atime if new
1750 */
1751
1752/* ARGSUSED */
1753static int
1754zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
1755    vnode_t **vpp, cred_t *cr, kthread_t *td)
1756{
1757	znode_t		*zp, *dzp = VTOZ(dvp);
1758	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1759	zilog_t		*zilog;
1760	objset_t	*os;
1761	dmu_tx_t	*tx;
1762	int		error;
1763	ksid_t		*ksid;
1764	uid_t		uid;
1765	gid_t		gid = crgetgid(cr);
1766	zfs_acl_ids_t   acl_ids;
1767	boolean_t	fuid_dirtied;
1768	void		*vsecp = NULL;
1769	int		flag = 0;
1770	uint64_t	txtype;
1771
1772	/*
1773	 * If we have an ephemeral id, ACL, or XVATTR then
1774	 * make sure file system is at proper version
1775	 */
1776
1777	ksid = crgetsid(cr, KSID_OWNER);
1778	if (ksid)
1779		uid = ksid_getid(ksid);
1780	else
1781		uid = crgetuid(cr);
1782
1783	if (zfsvfs->z_use_fuids == B_FALSE &&
1784	    (vsecp || (vap->va_mask & AT_XVATTR) ||
1785	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1786		return (SET_ERROR(EINVAL));
1787
1788	ZFS_ENTER(zfsvfs);
1789	ZFS_VERIFY_ZP(dzp);
1790	os = zfsvfs->z_os;
1791	zilog = zfsvfs->z_log;
1792
1793	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
1794	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1795		ZFS_EXIT(zfsvfs);
1796		return (SET_ERROR(EILSEQ));
1797	}
1798
1799	if (vap->va_mask & AT_XVATTR) {
1800		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
1801		    crgetuid(cr), cr, vap->va_type)) != 0) {
1802			ZFS_EXIT(zfsvfs);
1803			return (error);
1804		}
1805	}
1806
1807	*vpp = NULL;
1808
1809	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
1810		vap->va_mode &= ~S_ISVTX;
1811
1812	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
1813	if (error) {
1814		ZFS_EXIT(zfsvfs);
1815		return (error);
1816	}
1817	ASSERT3P(zp, ==, NULL);
1818
1819	/*
1820	 * Create a new file object and update the directory
1821	 * to reference it.
1822	 */
1823	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
1824		goto out;
1825	}
1826
1827	/*
1828	 * We only support the creation of regular files in
1829	 * extended attribute directories.
1830	 */
1831
1832	if ((dzp->z_pflags & ZFS_XATTR) &&
1833	    (vap->va_type != VREG)) {
1834		error = SET_ERROR(EINVAL);
1835		goto out;
1836	}
1837
1838	if ((error = zfs_acl_ids_create(dzp, 0, vap,
1839	    cr, vsecp, &acl_ids)) != 0)
1840		goto out;
1841
1842	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1843		zfs_acl_ids_free(&acl_ids);
1844		error = SET_ERROR(EDQUOT);
1845		goto out;
1846	}
1847
1848	getnewvnode_reserve(1);
1849
1850	tx = dmu_tx_create(os);
1851
1852	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1853	    ZFS_SA_BASE_ATTR_SIZE);
1854
1855	fuid_dirtied = zfsvfs->z_fuid_dirty;
1856	if (fuid_dirtied)
1857		zfs_fuid_txhold(zfsvfs, tx);
1858	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1859	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
1860	if (!zfsvfs->z_use_sa &&
1861	    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1862		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1863		    0, acl_ids.z_aclp->z_acl_bytes);
1864	}
1865	error = dmu_tx_assign(tx, TXG_WAIT);
1866	if (error) {
1867		zfs_acl_ids_free(&acl_ids);
1868		dmu_tx_abort(tx);
1869		getnewvnode_drop_reserve();
1870		ZFS_EXIT(zfsvfs);
1871		return (error);
1872	}
1873	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1874
1875	if (fuid_dirtied)
1876		zfs_fuid_sync(zfsvfs, tx);
1877
1878	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
1879	txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1880	zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1881	    vsecp, acl_ids.z_fuidp, vap);
1882	zfs_acl_ids_free(&acl_ids);
1883	dmu_tx_commit(tx);
1884
1885	getnewvnode_drop_reserve();
1886
1887out:
1888	if (error == 0) {
1889		*vpp = ZTOV(zp);
1890	}
1891
1892	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1893		zil_commit(zilog, 0);
1894
1895	ZFS_EXIT(zfsvfs);
1896	return (error);
1897}
1898
1899/*
1900 * Remove an entry from a directory.
1901 *
1902 *	IN:	dvp	- vnode of directory to remove entry from.
1903 *		name	- name of entry to remove.
1904 *		cr	- credentials of caller.
1905 *		ct	- caller context
1906 *		flags	- case flags
1907 *
1908 *	RETURN:	0 on success, error code on failure.
1909 *
1910 * Timestamps:
1911 *	dvp - ctime|mtime
1912 *	 vp - ctime (if nlink > 0)
1913 */
1914
1915/*ARGSUSED*/
1916static int
1917zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
1918{
1919	znode_t		*dzp = VTOZ(dvp);
1920	znode_t		*zp = VTOZ(vp);
1921	znode_t		*xzp;
1922	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1923	zilog_t		*zilog;
1924	uint64_t	acl_obj, xattr_obj;
1925	uint64_t	obj = 0;
1926	dmu_tx_t	*tx;
1927	boolean_t	unlinked, toobig = FALSE;
1928	uint64_t	txtype;
1929	int		error;
1930
1931	ZFS_ENTER(zfsvfs);
1932	ZFS_VERIFY_ZP(dzp);
1933	ZFS_VERIFY_ZP(zp);
1934	zilog = zfsvfs->z_log;
1935	zp = VTOZ(vp);
1936
1937	xattr_obj = 0;
1938	xzp = NULL;
1939
1940	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1941		goto out;
1942	}
1943
1944	/*
1945	 * Need to use rmdir for removing directories.
1946	 */
1947	if (vp->v_type == VDIR) {
1948		error = SET_ERROR(EPERM);
1949		goto out;
1950	}
1951
1952	vnevent_remove(vp, dvp, name, ct);
1953
1954	obj = zp->z_id;
1955
1956	/* are there any extended attributes? */
1957	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1958	    &xattr_obj, sizeof (xattr_obj));
1959	if (error == 0 && xattr_obj) {
1960		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1961		ASSERT0(error);
1962	}
1963
1964	/*
1965	 * We may delete the znode now, or we may put it in the unlinked set;
1966	 * it depends on whether we're the last link, and on whether there are
1967	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
1968	 * allow for either case.
1969	 */
1970	tx = dmu_tx_create(zfsvfs->z_os);
1971	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1972	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1973	zfs_sa_upgrade_txholds(tx, zp);
1974	zfs_sa_upgrade_txholds(tx, dzp);
1975
1976	if (xzp) {
1977		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1978		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1979	}
1980
1981	/* charge as an update -- would be nice not to charge at all */
1982	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1983
1984	/*
1985	 * Mark this transaction as typically resulting in a net free of space
1986	 */
1987	dmu_tx_mark_netfree(tx);
1988
1989	error = dmu_tx_assign(tx, TXG_WAIT);
1990	if (error) {
1991		dmu_tx_abort(tx);
1992		ZFS_EXIT(zfsvfs);
1993		return (error);
1994	}
1995
1996	/*
1997	 * Remove the directory entry.
1998	 */
1999	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);
2000
2001	if (error) {
2002		dmu_tx_commit(tx);
2003		goto out;
2004	}
2005
2006	if (unlinked) {
2007		zfs_unlinked_add(zp, tx);
2008		vp->v_vflag |= VV_NOSYNC;
2009	}
2010
2011	txtype = TX_REMOVE;
2012	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
2013
2014	dmu_tx_commit(tx);
2015out:
2016
2017	if (xzp)
2018		vrele(ZTOV(xzp));
2019
2020	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2021		zil_commit(zilog, 0);
2022
2023	ZFS_EXIT(zfsvfs);
2024	return (error);
2025}
2026
2027/*
2028 * Create a new directory and insert it into dvp using the name
2029 * provided.  Return a pointer to the inserted directory.
2030 *
2031 *	IN:	dvp	- vnode of directory to add subdir to.
2032 *		dirname	- name of new directory.
2033 *		vap	- attributes of new directory.
2034 *		cr	- credentials of caller.
2038 *
2039 *	OUT:	vpp	- vnode of created directory.
2040 *
2041 *	RETURN:	0 on success, error code on failure.
2042 *
2043 * Timestamps:
2044 *	dvp - ctime|mtime updated
2045 *	 vp - ctime|mtime|atime updated
2046 */
2047/*ARGSUSED*/
2048static int
2049zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
2050{
2051	znode_t		*zp, *dzp = VTOZ(dvp);
2052	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2053	zilog_t		*zilog;
2054	uint64_t	txtype;
2055	dmu_tx_t	*tx;
2056	int		error;
2057	ksid_t		*ksid;
2058	uid_t		uid;
2059	gid_t		gid = crgetgid(cr);
2060	zfs_acl_ids_t   acl_ids;
2061	boolean_t	fuid_dirtied;
2062
2063	ASSERT(vap->va_type == VDIR);
2064
2065	/*
2066	 * If we have an ephemeral id, ACL, or XVATTR then
2067	 * make sure the file system is at the proper version.
2068	 */
2069
2070	ksid = crgetsid(cr, KSID_OWNER);
2071	if (ksid)
2072		uid = ksid_getid(ksid);
2073	else
2074		uid = crgetuid(cr);
2075	if (zfsvfs->z_use_fuids == B_FALSE &&
2076	    ((vap->va_mask & AT_XVATTR) ||
2077	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
2078		return (SET_ERROR(EINVAL));
2079
2080	ZFS_ENTER(zfsvfs);
2081	ZFS_VERIFY_ZP(dzp);
2082	zilog = zfsvfs->z_log;
2083
2084	if (dzp->z_pflags & ZFS_XATTR) {
2085		ZFS_EXIT(zfsvfs);
2086		return (SET_ERROR(EINVAL));
2087	}
2088
2089	if (zfsvfs->z_utf8 && u8_validate(dirname,
2090	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2091		ZFS_EXIT(zfsvfs);
2092		return (SET_ERROR(EILSEQ));
2093	}
2094
2095	if (vap->va_mask & AT_XVATTR) {
2096		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
2097		    crgetuid(cr), cr, vap->va_type)) != 0) {
2098			ZFS_EXIT(zfsvfs);
2099			return (error);
2100		}
2101	}
2102
2103	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
2104	    NULL, &acl_ids)) != 0) {
2105		ZFS_EXIT(zfsvfs);
2106		return (error);
2107	}
2108
2109	/*
2110	 * First make sure the new directory doesn't exist.
2111	 *
2112	 * Existence is checked first to make sure we don't return
2113	 * EACCES instead of EEXIST which can cause some applications
2114	 * to fail.
2115	 */
2116	*vpp = NULL;
2117
2118	if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) {
2119		zfs_acl_ids_free(&acl_ids);
2120		ZFS_EXIT(zfsvfs);
2121		return (error);
2122	}
2123	ASSERT3P(zp, ==, NULL);
2124
2125	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
2126		zfs_acl_ids_free(&acl_ids);
2127		ZFS_EXIT(zfsvfs);
2128		return (error);
2129	}
2130
2131	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
2132		zfs_acl_ids_free(&acl_ids);
2133		ZFS_EXIT(zfsvfs);
2134		return (SET_ERROR(EDQUOT));
2135	}
2136
2137	/*
2138	 * Add a new entry to the directory.
2139	 */
2140	getnewvnode_reserve(1);
2141	tx = dmu_tx_create(zfsvfs->z_os);
2142	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
2143	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2144	fuid_dirtied = zfsvfs->z_fuid_dirty;
2145	if (fuid_dirtied)
2146		zfs_fuid_txhold(zfsvfs, tx);
2147	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2148		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
2149		    acl_ids.z_aclp->z_acl_bytes);
2150	}
2151
2152	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
2153	    ZFS_SA_BASE_ATTR_SIZE);
2154
2155	error = dmu_tx_assign(tx, TXG_WAIT);
2156	if (error) {
2157		zfs_acl_ids_free(&acl_ids);
2158		dmu_tx_abort(tx);
2159		getnewvnode_drop_reserve();
2160		ZFS_EXIT(zfsvfs);
2161		return (error);
2162	}
2163
2164	/*
2165	 * Create new node.
2166	 */
2167	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
2168
2169	if (fuid_dirtied)
2170		zfs_fuid_sync(zfsvfs, tx);
2171
2172	/*
2173	 * Now put new name in parent dir.
2174	 */
2175	(void) zfs_link_create(dzp, dirname, zp, tx, ZNEW);
2176
2177	*vpp = ZTOV(zp);
2178
2179	txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
2180	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
2181	    acl_ids.z_fuidp, vap);
2182
2183	zfs_acl_ids_free(&acl_ids);
2184
2185	dmu_tx_commit(tx);
2186
2187	getnewvnode_drop_reserve();
2188
2189	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2190		zil_commit(zilog, 0);
2191
2192	ZFS_EXIT(zfsvfs);
2193	return (0);
2194}
2195
2196/*
2197 * Remove a subdirectory entry from a directory.
2198 *
2199 *	IN:	dvp	- vnode of directory to remove from.
2200 *		vp	- vnode of the directory to be removed.
2201 *		name	- name of directory to be removed.
2202 *		cr	- credentials of caller.
2207 *
2208 *	RETURN:	0 on success, error code on failure.
2209 *
2210 * Timestamps:
2211 *	dvp - ctime|mtime updated
2212 */
2213/*ARGSUSED*/
2214static int
2215zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
2216{
2217	znode_t		*dzp = VTOZ(dvp);
2218	znode_t		*zp = VTOZ(vp);
2219	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2220	zilog_t		*zilog;
2221	dmu_tx_t	*tx;
2222	int		error;
2223
2224	ZFS_ENTER(zfsvfs);
2225	ZFS_VERIFY_ZP(dzp);
2226	ZFS_VERIFY_ZP(zp);
2227	zilog = zfsvfs->z_log;
2228
2230	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
2231		goto out;
2232	}
2233
2234	if (vp->v_type != VDIR) {
2235		error = SET_ERROR(ENOTDIR);
2236		goto out;
2237	}
2238
2239	vnevent_rmdir(vp, dvp, name, ct);
2240
2241	tx = dmu_tx_create(zfsvfs->z_os);
2242	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2243	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2244	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2245	zfs_sa_upgrade_txholds(tx, zp);
2246	zfs_sa_upgrade_txholds(tx, dzp);
2247	dmu_tx_mark_netfree(tx);
2248	error = dmu_tx_assign(tx, TXG_WAIT);
2249	if (error) {
2250		dmu_tx_abort(tx);
2251		ZFS_EXIT(zfsvfs);
2252		return (error);
2253	}
2254
2255	cache_purge(dvp);
2256
2257	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);
2258
2259	if (error == 0) {
2260		uint64_t txtype = TX_RMDIR;
2261		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
2262	}
2263
2264	dmu_tx_commit(tx);
2265
2266	cache_purge(vp);
2267out:
2268	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2269		zil_commit(zilog, 0);
2270
2271	ZFS_EXIT(zfsvfs);
2272	return (error);
2273}
2274
2275/*
2276 * Read as many directory entries as will fit into the provided
2277 * buffer from the given directory cursor position (specified in
2278 * the uio structure).
2279 *
2280 *	IN:	vp	- vnode of directory to read.
2281 *		uio	- structure supplying read location, range info,
2282 *			  and return buffer.
2283 *		cr	- credentials of caller.
2284 *		ncookies - if non-NULL, the caller wants seek cookies.
2285 *
2286 *	OUT:	uio	- updated offset and range, buffer filled.
2287 *		eofp	- set to true if end-of-file detected.
2288 *		ncookies - number of seek cookies returned.
2289 *		cookies	- malloc'd array of seek cookies, one per entry.
2289 *
2290 *	RETURN:	0 on success, error code on failure.
2291 *
2292 * Timestamps:
2293 *	vp - atime updated
2294 *
2295 * Note that the low 4 bits of the cookie returned by zap are always zero.
2296 * This allows us to use the low range for "special" directory entries:
2297 * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
2298 * we use the offset 2 for the '.zfs' directory.
2299 */
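/*
 * For reference, the offset scheme above works out to (illustrative):
 *
 *	offset 0	"."	(zp->z_id)
 *	offset 1	".."	(parent)
 *	offset 2	".zfs"	(filesystem root only, when visible)
 *	offset <= 3	iteration (re)starts at the beginning of the ZAP
 *	offset >  3	a value from zap_cursor_serialize(), restored with
 *			zap_cursor_init_serialized()
 *
 * Because serialized cursors have the low 4 bits clear, a real entry's
 * cookie can never collide with the reserved low offsets.
 */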
2300/* ARGSUSED */
2301static int
2302zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp,
    int *ncookies, u_long **cookies)
2303{
2304	znode_t		*zp = VTOZ(vp);
2305	iovec_t		*iovp;
2306	edirent_t	*eodp;
2307	dirent64_t	*odp;
2308	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2309	objset_t	*os;
2310	caddr_t		outbuf;
2311	size_t		bufsize;
2312	zap_cursor_t	zc;
2313	zap_attribute_t	zap;
2314	uint_t		bytes_wanted;
2315	uint64_t	offset; /* must be unsigned; checks for < 1 */
2316	uint64_t	parent;
2317	int		local_eof;
2318	int		outcount;
2319	int		error;
2320	uint8_t		prefetch;
2321	boolean_t	check_sysattrs;
2322	uint8_t		type;
2323	int		ncooks;
2324	u_long		*cooks = NULL;
2325	int		flags = 0;
2326
2327	ZFS_ENTER(zfsvfs);
2328	ZFS_VERIFY_ZP(zp);
2329
2330	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
2331	    &parent, sizeof (parent))) != 0) {
2332		ZFS_EXIT(zfsvfs);
2333		return (error);
2334	}
2335
2336	/*
2337	 * If we are not given an eof variable,
2338	 * use a local one.
2339	 */
2340	if (eofp == NULL)
2341		eofp = &local_eof;
2342
2343	/*
2344	 * Check for valid iov_len.
2345	 */
2346	if (uio->uio_iov->iov_len <= 0) {
2347		ZFS_EXIT(zfsvfs);
2348		return (SET_ERROR(EINVAL));
2349	}
2350
2351	/*
2352	 * Quit if the directory has been removed (POSIX).
2353	 */
2354	if ((*eofp = zp->z_unlinked) != 0) {
2355		ZFS_EXIT(zfsvfs);
2356		return (0);
2357	}
2358
2359	error = 0;
2360	os = zfsvfs->z_os;
2361	offset = uio->uio_loffset;
2362	prefetch = zp->z_zn_prefetch;
2363
2364	/*
2365	 * Initialize the iterator cursor.
2366	 */
2367	if (offset <= 3) {
2368		/*
2369		 * Start iteration from the beginning of the directory.
2370		 */
2371		zap_cursor_init(&zc, os, zp->z_id);
2372	} else {
2373		/*
2374		 * The offset is a serialized cursor.
2375		 */
2376		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
2377	}
2378
2379	/*
2380	 * Get space to change directory entries into fs independent format.
2381	 */
2382	iovp = uio->uio_iov;
2383	bytes_wanted = iovp->iov_len;
2384	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
2385		bufsize = bytes_wanted;
2386		outbuf = kmem_alloc(bufsize, KM_SLEEP);
2387		odp = (struct dirent64 *)outbuf;
2388	} else {
2389		bufsize = bytes_wanted;
2390		outbuf = NULL;
2391		odp = (struct dirent64 *)iovp->iov_base;
2392	}
2393	eodp = (struct edirent *)odp;
2394
2395	if (ncookies != NULL) {
2396		/*
2397		 * Minimum entry size is the dirent header plus 1 byte of name.
2398		 */
2399		ncooks = uio->uio_resid / (sizeof(struct dirent) -
		    sizeof(((struct dirent *)NULL)->d_name) + 1);
2400		cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
2401		*cookies = cooks;
2402		*ncookies = ncooks;
2403	}
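	/*
	 * Note: sizeof(struct dirent) minus the d_name array is the
	 * fixed dirent header, so the divisor above is the smallest
	 * record a name could produce; ncooks is therefore an upper
	 * bound on how many entries (and cookies) can fit in uio_resid.
	 */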
2404	/*
2405	 * If this VFS supports the system attribute view interface; and
2406	 * we're looking at an extended attribute directory; and we care
2407	 * about normalization conflicts on this vfs; then we must check
2408	 * for normalization conflicts with the sysattr name space.
2409	 */
2410#ifdef TODO
2411	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
2412	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
2413	    (flags & V_RDDIR_ENTFLAGS);
2414#else
2415	check_sysattrs = 0;
2416#endif
2417
2418	/*
2419	 * Transform to file-system independent format
2420	 */
2421	outcount = 0;
2422	while (outcount < bytes_wanted) {
2423		ino64_t objnum;
2424		ushort_t reclen;
2425		off64_t *next = NULL;
2426
2427		/*
2428		 * Special case `.', `..', and `.zfs'.
2429		 */
2430		if (offset == 0) {
2431			(void) strcpy(zap.za_name, ".");
2432			zap.za_normalization_conflict = 0;
2433			objnum = zp->z_id;
2434			type = DT_DIR;
2435		} else if (offset == 1) {
2436			(void) strcpy(zap.za_name, "..");
2437			zap.za_normalization_conflict = 0;
2438			objnum = parent;
2439			type = DT_DIR;
2440		} else if (offset == 2 && zfs_show_ctldir(zp)) {
2441			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
2442			zap.za_normalization_conflict = 0;
2443			objnum = ZFSCTL_INO_ROOT;
2444			type = DT_DIR;
2445		} else {
2446			/*
2447			 * Grab next entry.
2448			 */
2449			if (error = zap_cursor_retrieve(&zc, &zap)) {
2450				if ((*eofp = (error == ENOENT)) != 0)
2451					break;
2452				else
2453					goto update;
2454			}
2455
2456			if (zap.za_integer_length != 8 ||
2457			    zap.za_num_integers != 1) {
2458				cmn_err(CE_WARN, "zap_readdir: bad directory "
2459				    "entry, obj = %lld, offset = %lld\n",
2460				    (u_longlong_t)zp->z_id,
2461				    (u_longlong_t)offset);
2462				error = SET_ERROR(ENXIO);
2463				goto update;
2464			}
2465
2466			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
2467			/*
2468			 * The object type is encoded in the high bits of
2469			 * za_first_integer; extract it (as Mac OS X does):
2470			 */
2471			type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2472
2473			if (check_sysattrs && !zap.za_normalization_conflict) {
2474#ifdef TODO
2475				zap.za_normalization_conflict =
2476				    xattr_sysattr_casechk(zap.za_name);
2477#else
2478				panic("%s:%u: TODO", __func__, __LINE__);
2479#endif
2480			}
2481		}
2482
2483		if (flags & V_RDDIR_ACCFILTER) {
2484			/*
2485			 * If we have no access at all, don't include
2486			 * this entry in the returned information
2487			 */
2488			znode_t	*ezp;
2489			if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
2490				goto skip_entry;
2491			if (!zfs_has_access(ezp, cr)) {
2492				vrele(ZTOV(ezp));
2493				goto skip_entry;
2494			}
2495			vrele(ZTOV(ezp));
2496		}
2497
2498		if (flags & V_RDDIR_ENTFLAGS)
2499			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
2500		else
2501			reclen = DIRENT64_RECLEN(strlen(zap.za_name));
2502
2503		/*
2504		 * Will this entry fit in the buffer?
2505		 */
2506		if (outcount + reclen > bufsize) {
2507			/*
2508			 * Did we manage to fit anything in the buffer?
2509			 */
2510			if (!outcount) {
2511				error = SET_ERROR(EINVAL);
2512				goto update;
2513			}
2514			break;
2515		}
2516		if (flags & V_RDDIR_ENTFLAGS) {
2517			/*
2518			 * Add extended flag entry:
2519			 */
2520			eodp->ed_ino = objnum;
2521			eodp->ed_reclen = reclen;
2522			/* NOTE: ed_off is the offset for the *next* entry */
2523			next = &(eodp->ed_off);
2524			eodp->ed_eflags = zap.za_normalization_conflict ?
2525			    ED_CASE_CONFLICT : 0;
2526			(void) strncpy(eodp->ed_name, zap.za_name,
2527			    EDIRENT_NAMELEN(reclen));
2528			eodp = (edirent_t *)((intptr_t)eodp + reclen);
2529		} else {
2530			/*
2531			 * Add normal entry:
2532			 */
2533			odp->d_ino = objnum;
2534			odp->d_reclen = reclen;
2535			odp->d_namlen = strlen(zap.za_name);
2536			(void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
2537			odp->d_type = type;
2538			odp = (dirent64_t *)((intptr_t)odp + reclen);
2539		}
2540		outcount += reclen;
2541
2542		ASSERT(outcount <= bufsize);
2543
2544		/* Prefetch znode */
2545		if (prefetch)
2546			dmu_prefetch(os, objnum, 0, 0, 0,
2547			    ZIO_PRIORITY_SYNC_READ);
2548
2549	skip_entry:
2550		/*
2551		 * Move to the next entry, fill in the previous offset.
2552		 */
2553		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2554			zap_cursor_advance(&zc);
2555			offset = zap_cursor_serialize(&zc);
2556		} else {
2557			offset += 1;
2558		}
2559
2560		if (cooks != NULL) {
2561			*cooks++ = offset;
2562			ncooks--;
2563			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
2564		}
2565	}
2566	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2567
2568	/* Subtract unused cookies */
2569	if (ncookies != NULL)
2570		*ncookies -= ncooks;
2571
2572	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
2573		iovp->iov_base += outcount;
2574		iovp->iov_len -= outcount;
2575		uio->uio_resid -= outcount;
2576	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
2577		/*
2578		 * Reset the pointer.
2579		 */
2580		offset = uio->uio_loffset;
2581	}
2582
2583update:
2584	zap_cursor_fini(&zc);
2585	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
2586		kmem_free(outbuf, bufsize);
2587
2588	if (error == ENOENT)
2589		error = 0;
2590
2591	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2592
2593	uio->uio_loffset = offset;
2594	ZFS_EXIT(zfsvfs);
2595	if (error != 0 && cookies != NULL) {
2596		free(*cookies, M_TEMP);
2597		*cookies = NULL;
2598		*ncookies = 0;
2599	}
2600	return (error);
2601}
2602
2603ulong_t zfs_fsync_sync_cnt = 4;
2604
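/*
 * Flush any buffered changes for the given file out to stable storage.
 *
 *	IN:	vp	- vnode of file to be synced.
 *		syncflag- sync flags (unused here).
 *		cr	- credentials of caller.
 *		ct	- caller context.
 *
 *	RETURN:	0 (always succeeds).
 *
 * Unless the dataset runs with sync=disabled, this amounts to a
 * zil_commit() of the intent log records for this object.
 */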
2605static int
2606zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
2607{
2608	znode_t	*zp = VTOZ(vp);
2609	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2610
2611	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2612
2613	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
2614		ZFS_ENTER(zfsvfs);
2615		ZFS_VERIFY_ZP(zp);
2616		zil_commit(zfsvfs->z_log, zp->z_id);
2617		ZFS_EXIT(zfsvfs);
2618	}
2619	return (0);
2620}
2621
2623/*
2624 * Get the requested file attributes and place them in the provided
2625 * vattr structure.
2626 *
2627 *	IN:	vp	- vnode of file.
2628 *		vap	- va_mask identifies requested attributes.
2629 *			  If AT_XVATTR set, then optional attrs are requested
2630 *		flags	- ATTR_NOACLCHECK (CIFS server context)
2631 *		cr	- credentials of caller.
2632 *		ct	- caller context
2633 *
2634 *	OUT:	vap	- attribute values.
2635 *
2636 *	RETURN:	0 on success, error code on failure.
2637 */
2638/* ARGSUSED */
2639static int
2640zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2641    caller_context_t *ct)
2642{
2643	znode_t *zp = VTOZ(vp);
2644	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2645	int	error = 0;
2646	uint32_t blksize;
2647	u_longlong_t nblocks;
2648	uint64_t links;
2649	uint64_t mtime[2], ctime[2], crtime[2], rdev;
2650	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
2651	xoptattr_t *xoap = NULL;
2652	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2653	sa_bulk_attr_t bulk[4];
2654	int count = 0;
2655
2656	ZFS_ENTER(zfsvfs);
2657	ZFS_VERIFY_ZP(zp);
2658
2659	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
2660
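	/*
	 * Note: SA_ADD_BULK_ATTR() only records pointers in bulk[]; the
	 * single sa_bulk_lookup() call below then fills mtime, ctime,
	 * crtime (and rdev for devices) in one pass over the znode's
	 * system attributes.
	 */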
2661	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
2662	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
2663	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
2664	if (vp->v_type == VBLK || vp->v_type == VCHR)
2665		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
2666		    &rdev, 8);
2667
2668	if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
2669		ZFS_EXIT(zfsvfs);
2670		return (error);
2671	}
2672
2673	/*
2674	 * If the ACL is trivial, don't bother looking for ACE_READ_ATTRIBUTES.
2675	 * Also, if we are the owner, don't bother, since the owner should
2676	 * always be allowed to read the basic attributes of the file.
2677	 */
2678	if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
2679	    (vap->va_uid != crgetuid(cr))) {
2680		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
2681		    skipaclchk, cr)) {
2682			ZFS_EXIT(zfsvfs);
2683			return (error);
2684		}
2685	}
2686
2687	/*
2688	 * Return all attributes.  It's cheaper to provide the answer
2689	 * than to determine whether we were asked the question.
2690	 */
2691
2692	vap->va_type = IFTOVT(zp->z_mode);
2693	vap->va_mode = zp->z_mode & ~S_IFMT;
2694#ifdef illumos
2695	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
2696#else
2697	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
2698#endif
2699	vap->va_nodeid = zp->z_id;
2700	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
2701		links = zp->z_links + 1;
2702	else
2703		links = zp->z_links;
2704	vap->va_nlink = MIN(links, LINK_MAX);	/* nlink_t limit! */
2705	vap->va_size = zp->z_size;
2706#ifdef illumos
2707	vap->va_rdev = vp->v_rdev;
2708#else
2709	if (vp->v_type == VBLK || vp->v_type == VCHR)
2710		vap->va_rdev = zfs_cmpldev(rdev);
2711#endif
2712	vap->va_seq = zp->z_seq;
2713	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */
2714	vap->va_filerev = zp->z_seq;
2715
2716	/*
2717	 * Add in any requested optional attributes and the create time.
2718	 * Also set the corresponding bits in the returned attribute bitmap.
2719	 */
2720	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
2721		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
2722			xoap->xoa_archive =
2723			    ((zp->z_pflags & ZFS_ARCHIVE) != 0);
2724			XVA_SET_RTN(xvap, XAT_ARCHIVE);
2725		}
2726
2727		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
2728			xoap->xoa_readonly =
2729			    ((zp->z_pflags & ZFS_READONLY) != 0);
2730			XVA_SET_RTN(xvap, XAT_READONLY);
2731		}
2732
2733		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
2734			xoap->xoa_system =
2735			    ((zp->z_pflags & ZFS_SYSTEM) != 0);
2736			XVA_SET_RTN(xvap, XAT_SYSTEM);
2737		}
2738
2739		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
2740			xoap->xoa_hidden =
2741			    ((zp->z_pflags & ZFS_HIDDEN) != 0);
2742			XVA_SET_RTN(xvap, XAT_HIDDEN);
2743		}
2744
2745		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2746			xoap->xoa_nounlink =
2747			    ((zp->z_pflags & ZFS_NOUNLINK) != 0);
2748			XVA_SET_RTN(xvap, XAT_NOUNLINK);
2749		}
2750
2751		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2752			xoap->xoa_immutable =
2753			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
2754			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
2755		}
2756
2757		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2758			xoap->xoa_appendonly =
2759			    ((zp->z_pflags & ZFS_APPENDONLY) != 0);
2760			XVA_SET_RTN(xvap, XAT_APPENDONLY);
2761		}
2762
2763		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2764			xoap->xoa_nodump =
2765			    ((zp->z_pflags & ZFS_NODUMP) != 0);
2766			XVA_SET_RTN(xvap, XAT_NODUMP);
2767		}
2768
2769		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
2770			xoap->xoa_opaque =
2771			    ((zp->z_pflags & ZFS_OPAQUE) != 0);
2772			XVA_SET_RTN(xvap, XAT_OPAQUE);
2773		}
2774
2775		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2776			xoap->xoa_av_quarantined =
2777			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
2778			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
2779		}
2780
2781		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2782			xoap->xoa_av_modified =
2783			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
2784			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
2785		}
2786
2787		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
2788		    vp->v_type == VREG) {
2789			zfs_sa_get_scanstamp(zp, xvap);
2790		}
2791
2792		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
2793			uint64_t times[2];
2794
2795			(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
2796			    times, sizeof (times));
2797			ZFS_TIME_DECODE(&xoap->xoa_createtime, times);
2798			XVA_SET_RTN(xvap, XAT_CREATETIME);
2799		}
2800
2801		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2802			xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
2803			XVA_SET_RTN(xvap, XAT_REPARSE);
2804		}
2805		if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
2806			xoap->xoa_generation = zp->z_gen;
2807			XVA_SET_RTN(xvap, XAT_GEN);
2808		}
2809
2810		if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
2811			xoap->xoa_offline =
2812			    ((zp->z_pflags & ZFS_OFFLINE) != 0);
2813			XVA_SET_RTN(xvap, XAT_OFFLINE);
2814		}
2815
2816		if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
2817			xoap->xoa_sparse =
2818			    ((zp->z_pflags & ZFS_SPARSE) != 0);
2819			XVA_SET_RTN(xvap, XAT_SPARSE);
2820		}
2821	}
2822
2823	ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
2824	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
2825	ZFS_TIME_DECODE(&vap->va_ctime, ctime);
2826	ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
2827
2829	sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
2830	vap->va_blksize = blksize;
2831	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */
2832
2833	if (zp->z_blksz == 0) {
2834		/*
2835		 * Block size hasn't been set; suggest maximal I/O transfers.
2836		 */
2837		vap->va_blksize = zfsvfs->z_max_blksz;
2838	}
2839
2840	ZFS_EXIT(zfsvfs);
2841	return (0);
2842}
2843
2844/*
2845 * Set the file attributes to the values contained in the
2846 * vattr structure.
2847 *
2848 *	IN:	vp	- vnode of file to be modified.
2849 *		vap	- new attribute values.
2850 *			  If AT_XVATTR set, then optional attrs are being set
2851 *		flags	- ATTR_UTIME set if non-default time values provided.
2852 *			- ATTR_NOACLCHECK (CIFS context only).
2853 *		cr	- credentials of caller.
2854 *		ct	- caller context
2855 *
2856 *	RETURN:	0 on success, error code on failure.
2857 *
2858 * Timestamps:
2859 *	vp - ctime updated, mtime updated if size changed.
2860 */
2861/* ARGSUSED */
2862static int
2863zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2864    caller_context_t *ct)
2865{
2866	znode_t		*zp = VTOZ(vp);
2867	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2868	zilog_t		*zilog;
2869	dmu_tx_t	*tx;
2870	vattr_t		oldva;
2871	xvattr_t	tmpxvattr;
2872	uint_t		mask = vap->va_mask;
2873	uint_t		saved_mask = 0;
2874	uint64_t	saved_mode;
2875	int		trim_mask = 0;
2876	uint64_t	new_mode;
2877	uint64_t	new_uid, new_gid;
2878	uint64_t	xattr_obj;
2879	uint64_t	mtime[2], ctime[2];
2880	znode_t		*attrzp;
2881	int		need_policy = FALSE;
2882	int		err, err2;
2883	zfs_fuid_info_t *fuidp = NULL;
2884	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
2885	xoptattr_t	*xoap;
2886	zfs_acl_t	*aclp;
2887	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2888	boolean_t	fuid_dirtied = B_FALSE;
2889	sa_bulk_attr_t	bulk[7], xattr_bulk[7];
2890	int		count = 0, xattr_count = 0;
2891
2892	if (mask == 0)
2893		return (0);
2894
2895	if (mask & AT_NOSET)
2896		return (SET_ERROR(EINVAL));
2897
2898	ZFS_ENTER(zfsvfs);
2899	ZFS_VERIFY_ZP(zp);
2900
2901	zilog = zfsvfs->z_log;
2902
2903	/*
2904	 * Make sure that if an ephemeral uid/gid or an xvattr is specified,
2905	 * the file system is at the proper version level.
2906	 */
2907
2908	if (zfsvfs->z_use_fuids == B_FALSE &&
2909	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2910	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2911	    (mask & AT_XVATTR))) {
2912		ZFS_EXIT(zfsvfs);
2913		return (SET_ERROR(EINVAL));
2914	}
2915
2916	if (mask & AT_SIZE && vp->v_type == VDIR) {
2917		ZFS_EXIT(zfsvfs);
2918		return (SET_ERROR(EISDIR));
2919	}
2920
2921	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
2922		ZFS_EXIT(zfsvfs);
2923		return (SET_ERROR(EINVAL));
2924	}
2925
2926	/*
2927	 * If this is an xvattr_t, then get a pointer to the structure of
2928	 * optional attributes.  If this is NULL, then we have a vattr_t.
2929	 */
2930	xoap = xva_getxoptattr(xvap);
2931
2932	xva_init(&tmpxvattr);
2933
2934	/*
2935	 * On immutable files only the immutable bit and atime may be altered.
2936	 */
2937	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
2938	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
2939	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2940		ZFS_EXIT(zfsvfs);
2941		return (SET_ERROR(EPERM));
2942	}
2943
2944	if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
2945		ZFS_EXIT(zfsvfs);
2946		return (SET_ERROR(EPERM));
2947	}
2948
2949	/*
2950	 * Verify that the timestamps don't overflow 32 bits.
2951	 * ZFS can handle large timestamps, but 32-bit syscalls can't
2952	 * handle times beyond January 2038.  This check should be removed
2953	 * once large timestamps are fully supported.
2954	 */
2955	if (mask & (AT_ATIME | AT_MTIME)) {
2956		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2957		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2958			ZFS_EXIT(zfsvfs);
2959			return (SET_ERROR(EOVERFLOW));
2960		}
2961	}
2962
2963	attrzp = NULL;
2964	aclp = NULL;
2965
2966	/* XXX: could this check be done earlier, with the other validations? */
2967	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
2968		ZFS_EXIT(zfsvfs);
2969		return (SET_ERROR(EROFS));
2970	}
2971
2972	/*
2973	 * First validate permissions
2974	 */
2975
2976	if (mask & AT_SIZE) {
2977		/*
2978		 * XXX - Note, we are not providing any open
2979		 * mode flags here (like FNDELAY), so we may
2980		 * block if there are locks present... this
2981		 * should be addressed in openat().
2982		 */
2983		/* XXX - would it be OK to generate a log record here? */
2984		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2985		if (err) {
2986			ZFS_EXIT(zfsvfs);
2987			return (err);
2988		}
2989	}
2990
2991	if (mask & (AT_ATIME|AT_MTIME) ||
2992	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2993	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2994	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2995	    XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
2996	    XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
2997	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2998	    XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
2999		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
3000		    skipaclchk, cr);
3001	}
3002
3003	if (mask & (AT_UID|AT_GID)) {
3004		int	idmask = (mask & (AT_UID|AT_GID));
3005		int	take_owner;
3006		int	take_group;
3007
3008		/*
3009		 * NOTE: even if a new mode is being set,
3010		 * we may clear S_ISUID/S_ISGID bits.
3011		 */
3012
3013		if (!(mask & AT_MODE))
3014			vap->va_mode = zp->z_mode;
3015
3016		/*
3017		 * Take ownership or chgrp to group we are a member of
3018		 */
3019
3020		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
3021		take_group = (mask & AT_GID) &&
3022		    zfs_groupmember(zfsvfs, vap->va_gid, cr);
3023
3024		/*
3025		 * If both AT_UID and AT_GID are set then take_owner and
3026		 * take_group must both be set in order to allow taking
3027		 * ownership.
3028		 *
3029		 * Otherwise, send the check through secpolicy_vnode_setattr().
3031		 */
3032
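		/*
		 * The same rule in table form (for reference):
		 *
		 *	idmask		bypasses secpolicy when
		 *	AT_UID		take_owner
		 *	AT_GID		take_group
		 *	AT_UID|AT_GID	take_owner && take_group
		 *
		 * and even then only if ACE_WRITE_OWNER is granted below.
		 */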
3033		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
3034		    ((idmask == AT_UID) && take_owner) ||
3035		    ((idmask == AT_GID) && take_group)) {
3036			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
3037			    skipaclchk, cr) == 0) {
3038				/*
3039				 * Remove setuid/setgid for non-privileged users
3040				 */
3041				secpolicy_setid_clear(vap, vp, cr);
3042				trim_mask = (mask & (AT_UID|AT_GID));
3043			} else {
3044				need_policy =  TRUE;
3045			}
3046		} else {
3047			need_policy =  TRUE;
3048		}
3049	}
3050
3051	oldva.va_mode = zp->z_mode;
3052	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
3053	if (mask & AT_XVATTR) {
3054		/*
3055		 * Update xvattr mask to include only those attributes
3056		 * that are actually changing.
3057		 *
3058		 * The bits will be restored prior to actually setting
3059		 * the attributes so the caller thinks they were set.
3060		 */
3061		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
3062			if (xoap->xoa_appendonly !=
3063			    ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
3064				need_policy = TRUE;
3065			} else {
3066				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
3067				XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
3068			}
3069		}
3070
3071		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
3072			if (xoap->xoa_nounlink !=
3073			    ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
3074				need_policy = TRUE;
3075			} else {
3076				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
3077				XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
3078			}
3079		}
3080
3081		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
3082			if (xoap->xoa_immutable !=
3083			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
3084				need_policy = TRUE;
3085			} else {
3086				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
3087				XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
3088			}
3089		}
3090
3091		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
3092			if (xoap->xoa_nodump !=
3093			    ((zp->z_pflags & ZFS_NODUMP) != 0)) {
3094				need_policy = TRUE;
3095			} else {
3096				XVA_CLR_REQ(xvap, XAT_NODUMP);
3097				XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
3098			}
3099		}
3100
3101		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
3102			if (xoap->xoa_av_modified !=
3103			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
3104				need_policy = TRUE;
3105			} else {
3106				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
3107				XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
3108			}
3109		}
3110
3111		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
3112			if ((vp->v_type != VREG &&
3113			    xoap->xoa_av_quarantined) ||
3114			    xoap->xoa_av_quarantined !=
3115			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
3116				need_policy = TRUE;
3117			} else {
3118				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
3119				XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
3120			}
3121		}
3122
3123		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
3124			ZFS_EXIT(zfsvfs);
3125			return (SET_ERROR(EPERM));
3126		}
3127
3128		if (need_policy == FALSE &&
3129		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
3130		    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
3131			need_policy = TRUE;
3132		}
3133	}
3134
3135	if (mask & AT_MODE) {
3136		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
3137			err = secpolicy_setid_setsticky_clear(vp, vap,
3138			    &oldva, cr);
3139			if (err) {
3140				ZFS_EXIT(zfsvfs);
3141				return (err);
3142			}
3143			trim_mask |= AT_MODE;
3144		} else {
3145			need_policy = TRUE;
3146		}
3147	}
3148
3149	if (need_policy) {
3150		/*
3151		 * If trim_mask is set then take ownership
3152		 * has been granted or write_acl is present and the user
3153		 * has the ability to modify the mode.  In that case remove
3154		 * UID|GID and/or MODE from the mask so that
3155		 * secpolicy_vnode_setattr() doesn't revoke it.
3156		 */
3157
3158		if (trim_mask) {
3159			saved_mask = vap->va_mask;
3160			vap->va_mask &= ~trim_mask;
3161			if (trim_mask & AT_MODE) {
3162				/*
3163				 * Save the mode, as secpolicy_vnode_setattr()
3164				 * will overwrite it with ova.va_mode.
3165				 */
3166				saved_mode = vap->va_mode;
3167			}
3168		}
3169		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
3170		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
3171		if (err) {
3172			ZFS_EXIT(zfsvfs);
3173			return (err);
3174		}
3175
3176		if (trim_mask) {
3177			vap->va_mask |= saved_mask;
3178			if (trim_mask & AT_MODE) {
3179				/*
3180				 * Recover the mode after
3181				 * secpolicy_vnode_setattr().
3182				 */
3183				vap->va_mode = saved_mode;
3184			}
3185		}
3186	}
3187
3188	/*
3189	 * secpolicy_vnode_setattr() or the take-ownership path above may
3190	 * have changed va_mask.
3191	 */
3192	mask = vap->va_mask;
3193
3194	if ((mask & (AT_UID | AT_GID))) {
3195		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
3196		    &xattr_obj, sizeof (xattr_obj));
3197
3198		if (err == 0 && xattr_obj) {
3199			err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
3200			if (err)
3201				goto out2;
3202		}
3203		if (mask & AT_UID) {
3204			new_uid = zfs_fuid_create(zfsvfs,
3205			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
3206			if (new_uid != zp->z_uid &&
3207			    zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
3208				if (attrzp)
3209					vrele(ZTOV(attrzp));
3210				err = SET_ERROR(EDQUOT);
3211				goto out2;
3212			}
3213		}
3214
3215		if (mask & AT_GID) {
3216			new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
3217			    cr, ZFS_GROUP, &fuidp);
3218			if (new_gid != zp->z_gid &&
3219			    zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
3220				if (attrzp)
3221					vrele(ZTOV(attrzp));
3222				err = SET_ERROR(EDQUOT);
3223				goto out2;
3224			}
3225		}
3226	}
3227	tx = dmu_tx_create(zfsvfs->z_os);
3228
3229	if (mask & AT_MODE) {
3230		uint64_t pmode = zp->z_mode;
3231		uint64_t acl_obj;
3232		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
3233
3234		if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
3235		    !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
3236			err = SET_ERROR(EPERM);
3237			goto out;
3238		}
3239
3240		if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
3241			goto out;
3242
3243		if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
3244			/*
3245			 * Are we upgrading ACL from old V0 format
3246			 * to V1 format?
3247			 */
3248			if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
3249			    zfs_znode_acl_version(zp) ==
3250			    ZFS_ACL_VERSION_INITIAL) {
3251				dmu_tx_hold_free(tx, acl_obj, 0,
3252				    DMU_OBJECT_END);
3253				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3254				    0, aclp->z_acl_bytes);
3255			} else {
3256				dmu_tx_hold_write(tx, acl_obj, 0,
3257				    aclp->z_acl_bytes);
3258			}
3259		} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3260			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3261			    0, aclp->z_acl_bytes);
3262		}
3263		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3264	} else {
3265		if ((mask & AT_XVATTR) &&
3266		    XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3267			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3268		else
3269			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3270	}
3271
3272	if (attrzp) {
3273		dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3274	}
3275
3276	fuid_dirtied = zfsvfs->z_fuid_dirty;
3277	if (fuid_dirtied)
3278		zfs_fuid_txhold(zfsvfs, tx);
3279
3280	zfs_sa_upgrade_txholds(tx, zp);
3281
3282	err = dmu_tx_assign(tx, TXG_WAIT);
3283	if (err)
3284		goto out;
3285
3286	count = 0;
3287	/*
3288	 * Set each attribute requested.
3289	 * We group settings according to the locks they need to acquire.
3290	 *
3291	 * Note: you cannot set ctime directly, although it will be
3292	 * updated as a side-effect of calling this function.
3293	 */
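	/*
	 * Note: the SA_ADD_BULK_ATTR() calls that follow only stage
	 * updates in bulk[] (and xattr_bulk[] for an xattr directory);
	 * nothing reaches disk until sa_bulk_update() runs on the way
	 * out, inside this same transaction.
	 */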
3294
3295	if (mask & (AT_UID|AT_GID|AT_MODE))
3296		mutex_enter(&zp->z_acl_lock);
3297
3298	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3299	    &zp->z_pflags, sizeof (zp->z_pflags));
3300
3301	if (attrzp) {
3302		if (mask & (AT_UID|AT_GID|AT_MODE))
3303			mutex_enter(&attrzp->z_acl_lock);
3304		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3305		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
3306		    sizeof (attrzp->z_pflags));
3307	}
3308
3309	if (mask & (AT_UID|AT_GID)) {
3310
3311		if (mask & AT_UID) {
3312			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
3313			    &new_uid, sizeof (new_uid));
3314			zp->z_uid = new_uid;
3315			if (attrzp) {
3316				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3317				    SA_ZPL_UID(zfsvfs), NULL, &new_uid,
3318				    sizeof (new_uid));
3319				attrzp->z_uid = new_uid;
3320			}
3321		}
3322
3323		if (mask & AT_GID) {
3324			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
3325			    NULL, &new_gid, sizeof (new_gid));
3326			zp->z_gid = new_gid;
3327			if (attrzp) {
3328				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3329				    SA_ZPL_GID(zfsvfs), NULL, &new_gid,
3330				    sizeof (new_gid));
3331				attrzp->z_gid = new_gid;
3332			}
3333		}
3334		if (!(mask & AT_MODE)) {
3335			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
3336			    NULL, &new_mode, sizeof (new_mode));
3337			new_mode = zp->z_mode;
3338		}
3339		err = zfs_acl_chown_setattr(zp);
3340		ASSERT(err == 0);
3341		if (attrzp) {
3342			err = zfs_acl_chown_setattr(attrzp);
3343			ASSERT(err == 0);
3344		}
3345	}
3346
3347	if (mask & AT_MODE) {
3348		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
3349		    &new_mode, sizeof (new_mode));
3350		zp->z_mode = new_mode;
3351		ASSERT3U((uintptr_t)aclp, !=, 0);
3352		err = zfs_aclset_common(zp, aclp, cr, tx);
3353		ASSERT0(err);
3354		if (zp->z_acl_cached)
3355			zfs_acl_free(zp->z_acl_cached);
3356		zp->z_acl_cached = aclp;
3357		aclp = NULL;
3358	}
3359
3361	if (mask & AT_ATIME) {
3362		ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
3363		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
3364		    &zp->z_atime, sizeof (zp->z_atime));
3365	}
3366
3367	if (mask & AT_MTIME) {
3368		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
3369		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
3370		    mtime, sizeof (mtime));
3371	}
3372
3373	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
3374	if (mask & AT_SIZE && !(mask & AT_MTIME)) {
3375		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
3376		    NULL, mtime, sizeof (mtime));
3377		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3378		    &ctime, sizeof (ctime));
3379		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
3380		    B_TRUE);
3381	} else if (mask != 0) {
3382		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3383		    &ctime, sizeof (ctime));
3384		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
3385		    B_TRUE);
3386		if (attrzp) {
3387			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3388			    SA_ZPL_CTIME(zfsvfs), NULL,
3389			    &ctime, sizeof (ctime));
3390			zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
3391			    mtime, ctime, B_TRUE);
3392		}
3393	}
3394	/*
3395	 * Do this after setting the timestamps to prevent the timestamp
3396	 * update from toggling the bit.
3397	 */
3398
3399	if (xoap && (mask & AT_XVATTR)) {
3400
3401		/*
3402		 * Restore the trimmed-off masks
3403		 * so that return masks can be set for the caller.
3404		 */
3405
3406		if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
3407			XVA_SET_REQ(xvap, XAT_APPENDONLY);
3408		}
3409		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
3410			XVA_SET_REQ(xvap, XAT_NOUNLINK);
3411		}
3412		if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
3413			XVA_SET_REQ(xvap, XAT_IMMUTABLE);
3414		}
3415		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
3416			XVA_SET_REQ(xvap, XAT_NODUMP);
3417		}
3418		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
3419			XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
3420		}
3421		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
3422			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
3423		}
3424
3425		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3426			ASSERT(vp->v_type == VREG);
3427
3428		zfs_xvattr_set(zp, xvap, tx);
3429	}
3430
3431	if (fuid_dirtied)
3432		zfs_fuid_sync(zfsvfs, tx);
3433
3434	if (mask != 0)
3435		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3436
3437	if (mask & (AT_UID|AT_GID|AT_MODE))
3438		mutex_exit(&zp->z_acl_lock);
3439
3440	if (attrzp) {
3441		if (mask & (AT_UID|AT_GID|AT_MODE))
3442			mutex_exit(&attrzp->z_acl_lock);
3443	}
3444out:
3445	if (err == 0 && attrzp) {
3446		err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
3447		    xattr_count, tx);
3448		ASSERT(err2 == 0);
3449	}
3450
3451	if (attrzp)
3452		vrele(ZTOV(attrzp));
3453
3454	if (aclp)
3455		zfs_acl_free(aclp);
3456
3457	if (fuidp) {
3458		zfs_fuid_info_free(fuidp);
3459		fuidp = NULL;
3460	}
3461
3462	if (err) {
3463		dmu_tx_abort(tx);
3464	} else {
3465		err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
3466		dmu_tx_commit(tx);
3467	}
3468
3469out2:
3470	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3471		zil_commit(zilog, 0);
3472
3473	ZFS_EXIT(zfsvfs);
3474	return (err);
3475}
3476
3477/*
3478 * We acquire all but the sdvp lock using non-blocking acquisitions.  If we
3479 * fail to acquire any lock in the path we will drop all held locks,
3480 * acquire the new lock in a blocking fashion, and then release it and
3481 * restart the rename.  This acquire/release step ensures that we do not
3482 * spin on a lock waiting for release.  On error release all vnode locks
3483 * and decrement references the way tmpfs_rename() would do.
3484 */
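/*
 * For reference, the retry pattern below in outline (a sketch, error
 * handling omitted; see the code for the real thing):
 *
 * relock:
 *	vn_lock(sdvp, LK_EXCLUSIVE);
 *	if (vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT) == EBUSY) {
 *		VOP_UNLOCK(sdvp, 0);
 *		vn_lock(tdvp, LK_EXCLUSIVE);	wait with no locks held
 *		VOP_UNLOCK(tdvp, 0);		but do not keep it
 *		goto relock;
 *	}
 *	... the same dance for svp and tvp ...
 *
 * Blocking on a contended lock only while holding no other vnode locks
 * is what prevents an ABBA deadlock against a rename running in the
 * opposite direction.
 */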
3485static int
3486zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
3487    struct vnode *tdvp, struct vnode **tvpp,
3488    const struct componentname *scnp, const struct componentname *tcnp)
3489{
3490	zfsvfs_t	*zfsvfs;
3491	struct vnode	*nvp, *svp, *tvp;
3492	znode_t		*sdzp, *tdzp, *szp, *tzp;
3493	const char	*snm = scnp->cn_nameptr;
3494	const char	*tnm = tcnp->cn_nameptr;
3495	int error;
3496
3497	VOP_UNLOCK(tdvp, 0);
3498	if (*tvpp != NULL && *tvpp != tdvp)
3499		VOP_UNLOCK(*tvpp, 0);
3500
3501relock:
3502	error = vn_lock(sdvp, LK_EXCLUSIVE);
3503	if (error)
3504		goto out;
3505	sdzp = VTOZ(sdvp);
3506
3507	error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
3508	if (error != 0) {
3509		VOP_UNLOCK(sdvp, 0);
3510		if (error != EBUSY)
3511			goto out;
3512		error = vn_lock(tdvp, LK_EXCLUSIVE);
3513		if (error)
3514			goto out;
3515		VOP_UNLOCK(tdvp, 0);
3516		goto relock;
3517	}
3518	tdzp = VTOZ(tdvp);
3519
3520	/*
3521	 * Before using sdzp and tdzp we must ensure that they are live.
3522	 * As a porting legacy from illumos we have two things to worry
3523 * about.  One check is typical for FreeBSD: that the vnode has not
3524 * been reclaimed (doomed).  The other is that the znode is live.
3525	 * The current code can invalidate the znode without acquiring the
3526	 * corresponding vnode lock if the object represented by the znode
3527	 * and vnode is no longer valid after a rollback or receive operation.
3528	 * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock
3529	 * that protects the znodes from the invalidation.
3530	 */
3531	zfsvfs = sdzp->z_zfsvfs;
3532	ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
3533	ZFS_ENTER(zfsvfs);
3534
3535	/*
3536 * We cannot use ZFS_VERIFY_ZP() here, because on error it would return
3537 * directly, bypassing the cleanup code.
3538	 */
3539	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
3540		ZFS_EXIT(zfsvfs);
3541		VOP_UNLOCK(sdvp, 0);
3542		VOP_UNLOCK(tdvp, 0);
3543		error = SET_ERROR(EIO);
3544		goto out;
3545	}
3546
3547	/*
3548	 * Re-resolve svp to be certain it still exists and fetch the
3549	 * correct vnode.
3550	 */
3551	error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS);
3552	if (error != 0) {
3553		/* Source entry invalid or not there. */
3554		ZFS_EXIT(zfsvfs);
3555		VOP_UNLOCK(sdvp, 0);
3556		VOP_UNLOCK(tdvp, 0);
3557		if ((scnp->cn_flags & ISDOTDOT) != 0 ||
3558		    (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
3559			error = SET_ERROR(EINVAL);
3560		goto out;
3561	}
3562	svp = ZTOV(szp);
3563
3564	/*
3565 * Re-resolve tvp; if it disappeared we just carry on.
3566	 */
3567	error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0);
3568	if (error != 0) {
3569		ZFS_EXIT(zfsvfs);
3570		VOP_UNLOCK(sdvp, 0);
3571		VOP_UNLOCK(tdvp, 0);
3572		vrele(svp);
3573		if ((tcnp->cn_flags & ISDOTDOT) != 0)
3574			error = SET_ERROR(EINVAL);
3575		goto out;
3576	}
3577	if (tzp != NULL)
3578		tvp = ZTOV(tzp);
3579	else
3580		tvp = NULL;
3581
3582	/*
3583	 * At present the vnode locks must be acquired before z_teardown_lock,
3584	 * although it would be more logical to use the opposite order.
3585	 */
3586	ZFS_EXIT(zfsvfs);
3587
3588	/*
3589 * Now try to acquire locks on svp and tvp.
3590	 */
3591	nvp = svp;
3592	error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
3593	if (error != 0) {
3594		VOP_UNLOCK(sdvp, 0);
3595		VOP_UNLOCK(tdvp, 0);
3596		if (tvp != NULL)
3597			vrele(tvp);
3598		if (error != EBUSY) {
3599			vrele(nvp);
3600			goto out;
3601		}
3602		error = vn_lock(nvp, LK_EXCLUSIVE);
3603		if (error != 0) {
3604			vrele(nvp);
3605			goto out;
3606		}
3607		VOP_UNLOCK(nvp, 0);
3608		/*
3609		 * Concurrent rename race.
3610		 * XXX ?
3611		 */
3612		if (nvp == tdvp) {
3613			vrele(nvp);
3614			error = SET_ERROR(EINVAL);
3615			goto out;
3616		}
3617		vrele(*svpp);
3618		*svpp = nvp;
3619		goto relock;
3620	}
3621	vrele(*svpp);
3622	*svpp = nvp;
3623
3624	if (*tvpp != NULL)
3625		vrele(*tvpp);
3626	*tvpp = NULL;
3627	if (tvp != NULL) {
3628		nvp = tvp;
3629		error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
3630		if (error != 0) {
3631			VOP_UNLOCK(sdvp, 0);
3632			VOP_UNLOCK(tdvp, 0);
3633			VOP_UNLOCK(*svpp, 0);
3634			if (error != EBUSY) {
3635				vrele(nvp);
3636				goto out;
3637			}
3638			error = vn_lock(nvp, LK_EXCLUSIVE);
3639			if (error != 0) {
3640				vrele(nvp);
3641				goto out;
3642			}
3643			vput(nvp);
3644			goto relock;
3645		}
3646		*tvpp = nvp;
3647	}
3648
3649	return (0);
3650
3651out:
3652	return (error);
3653}
3654
3655/*
3656 * Note that we must use VRELE_ASYNC in this function as it walks
3657 * up the directory tree and vrele may need to acquire an exclusive
3658 * lock when the last reference to a vnode is dropped.
3659 */
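/*
 * The walk below follows tdzp's SA_ZPL_PARENT chain toward the root.
 * Renaming a directory into one of its own descendants (e.g. /usr/a/b
 * to /usr/a/b/c/d) would create a cycle, so finding szp on that chain
 * fails with EINVAL.
 */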
3660static int
3661zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
3662{
3663	zfsvfs_t	*zfsvfs;
3664	znode_t		*zp, *zp1;
3665	uint64_t	parent;
3666	int		error;
3667
3668	zfsvfs = tdzp->z_zfsvfs;
3669	if (tdzp == szp)
3670		return (SET_ERROR(EINVAL));
3671	if (tdzp == sdzp)
3672		return (0);
3673	if (tdzp->z_id == zfsvfs->z_root)
3674		return (0);
3675	zp = tdzp;
3676	for (;;) {
3677		ASSERT(!zp->z_unlinked);
3678		if ((error = sa_lookup(zp->z_sa_hdl,
3679		    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
3680			break;
3681
3682		if (parent == szp->z_id) {
3683			error = SET_ERROR(EINVAL);
3684			break;
3685		}
3686		if (parent == zfsvfs->z_root)
3687			break;
3688		if (parent == sdzp->z_id)
3689			break;
3690
3691		error = zfs_zget(zfsvfs, parent, &zp1);
3692		if (error != 0)
3693			break;
3694
3695		if (zp != tdzp)
3696			VN_RELE_ASYNC(ZTOV(zp),
3697			    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
3698		zp = zp1;
3699	}
3700
3701	if (error == ENOTDIR)
3702		panic("checkpath: .. not a directory\n");
3703	if (zp != tdzp)
3704		VN_RELE_ASYNC(ZTOV(zp),
3705		    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
3706	return (error);
3707}
3708
3709/*
3710 * Move an entry from the provided source directory to the target
3711 * directory.  Change the entry name as indicated.
3712 *
3713 *	IN:	sdvp	- Source directory containing the "old entry".
3714 *		svpp	- Source entry's vnode (may be replaced on relock).
3715 *		scnp	- Component name of the old entry.
3716 *		tdvp	- Target directory to contain the "new entry".
3717 *		tvpp	- Target entry's vnode, if it exists.
3718 *		tcnp	- Component name of the new entry.
3719 *		cr	- credentials of caller.
3720 *
3721 *	RETURN:	0 on success, error code on failure.
3722 *
3723 * Timestamps:
3724 *	sdvp,tdvp - ctime|mtime updated
3725 */
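/*
 * In outline (a summary of the code below, not normative):
 *
 *	1. zfs_rename_relock() takes all four vnode locks deadlock-free.
 *	2. The znodes are revalidated by hand (ZFS_VERIFY_ZP() cannot be
 *	   used here, as it would return without unlocking).
 *	3. zfs_zaccess_rename() checks permissions at source and target.
 *	4. For directories, zfs_rename_check() rejects moving a directory
 *	   beneath itself.
 *	5. In one transaction: any existing target entry is destroyed, the
 *	   new link is created, and the old link is destroyed; if that
 *	   last step fails, the new link is removed again and the original
 *	   error returned.
 */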
3726/*ARGSUSED*/
3727static int
3728zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
3729    vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
3730    cred_t *cr)
3731{
3732	zfsvfs_t	*zfsvfs;
3733	znode_t		*sdzp, *tdzp, *szp, *tzp;
3734	zilog_t		*zilog = NULL;
3735	dmu_tx_t	*tx;
3736	char		*snm = scnp->cn_nameptr;
3737	char		*tnm = tcnp->cn_nameptr;
3738	int		error = 0;
3739
3740	/* Reject renames across filesystems. */
3741	if ((*svpp)->v_mount != tdvp->v_mount ||
3742	    ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
3743		error = SET_ERROR(EXDEV);
3744		goto out;
3745	}
3746
3747	if (zfsctl_is_node(tdvp)) {
3748		error = SET_ERROR(EXDEV);
3749		goto out;
3750	}
3751
3752	/*
3753	 * Lock all four vnodes to ensure safety and semantics of renaming.
3754	 */
3755	error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
3756	if (error != 0) {
3757		/* no vnodes are locked in the case of error here */
3758		return (error);
3759	}
3760
3761	tdzp = VTOZ(tdvp);
3762	sdzp = VTOZ(sdvp);
3763	zfsvfs = tdzp->z_zfsvfs;
3764	zilog = zfsvfs->z_log;
3765
3766	/*
3767 * After we re-enter with ZFS_ENTER() we will have to revalidate all
3768	 * znodes involved.
3769	 */
3770	ZFS_ENTER(zfsvfs);
3771
3772	if (zfsvfs->z_utf8 && u8_validate(tnm,
3773	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3774		error = SET_ERROR(EILSEQ);
3775		goto unlockout;
3776	}
3777
3778	/* If source and target are the same file, there is nothing to do. */
3779	if ((*svpp) == (*tvpp)) {
3780		error = 0;
3781		goto unlockout;
3782	}
3783
3784	if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
3785	    ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
3786	    (*tvpp)->v_mountedhere != NULL)) {
3787		error = SET_ERROR(EXDEV);
3788		goto unlockout;
3789	}
3790
3791	/*
3792	 * We cannot use ZFS_VERIFY_ZP() here, because on error it would return
3793	 * directly, bypassing the cleanup code.
3794	 */
3795	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
3796		error = SET_ERROR(EIO);
3797		goto unlockout;
3798	}
3799
3800	szp = VTOZ(*svpp);
3801	tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
3802	if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) {
3803		error = SET_ERROR(EIO);
3804		goto unlockout;
3805	}
3806
3807	/*
3808	 * This is to prevent the creation of links into attribute space
3809	 * by renaming a linked file into/out of an attribute directory.
3810	 * See the comment in zfs_link() for why this is considered bad.
3811	 */
3812	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
3813		error = SET_ERROR(EINVAL);
3814		goto unlockout;
3815	}
3816
3817	/*
3818	 * Must have write access at the source to remove the old entry
3819	 * and write access at the target to create the new entry.
3820	 * Note that if target and source are the same, this can be
3821	 * done in a single check.
3822	 */
3823	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
3824		goto unlockout;
3825
3826	if ((*svpp)->v_type == VDIR) {
3827		/*
3828		 * Avoid ".", "..", and aliases of "." for obvious reasons.
3829		 */
3830		if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
3831		    sdzp == szp ||
3832		    (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
3833			error = SET_ERROR(EINVAL);
3834			goto unlockout;
3835		}
3836
3837		/*
3838		 * Check to make sure rename is valid.
3839		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
3840		 */
3841		if (error = zfs_rename_check(szp, sdzp, tdzp))
3842			goto unlockout;
3843	}
3844
3845	/*
3846	 * Does target exist?
3847	 */
3848	if (tzp) {
3849		/*
3850		 * Source and target must be the same type.
3851		 */
3852		if ((*svpp)->v_type == VDIR) {
3853			if ((*tvpp)->v_type != VDIR) {
3854				error = SET_ERROR(ENOTDIR);
3855				goto unlockout;
3856			} else {
3857				cache_purge(tdvp);
3858				if (sdvp != tdvp)
3859					cache_purge(sdvp);
3860			}
3861		} else {
3862			if ((*tvpp)->v_type == VDIR) {
3863				error = SET_ERROR(EISDIR);
3864				goto unlockout;
3865			}
3866		}
3867	}
3868
3869	vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
3870	if (tzp)
3871		vnevent_rename_dest(*tvpp, tdvp, tnm, ct);
3872
3873	/*
3874	 * Notify the target directory if it is not the same
3875	 * as the source directory.
3876	 */
3877	if (tdvp != sdvp) {
3878		vnevent_rename_dest_dir(tdvp, ct);
3879	}
3880
3881	tx = dmu_tx_create(zfsvfs->z_os);
3882	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3883	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
3884	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
3885	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
3886	if (sdzp != tdzp) {
3887		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
3888		zfs_sa_upgrade_txholds(tx, tdzp);
3889	}
3890	if (tzp) {
3891		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
3892		zfs_sa_upgrade_txholds(tx, tzp);
3893	}
3894
3895	zfs_sa_upgrade_txholds(tx, szp);
3896	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3897	error = dmu_tx_assign(tx, TXG_WAIT);
3898	if (error) {
3899		dmu_tx_abort(tx);
3900		goto unlockout;
3901	}
3902
3904	if (tzp)	/* Attempt to remove the existing target */
3905		error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);
3906
3907	if (error == 0) {
3908		error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
3909		if (error == 0) {
3910			szp->z_pflags |= ZFS_AV_MODIFIED;
3911
3912			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
3913			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
3914			ASSERT0(error);
3915
3916			error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING,
3917			    NULL);
3918			if (error == 0) {
3919				zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
3920				    snm, tdzp, tnm, szp);
3921
3922				/*
3923				 * Update path information for the target vnode
3924				 */
3925				vn_renamepath(tdvp, *svpp, tnm, strlen(tnm));
3926			} else {
3927				/*
3928				 * At this point, we have successfully created
3929				 * the target name, but have failed to remove
3930				 * the source name.  Since the create was done
3931				 * with the ZRENAMING flag, there are
3932				 * complications; for one, the link count is
3933				 * wrong.  The easiest way to deal with this
3934				 * is to remove the newly created target, and
3935				 * return the original error.  This must
3936				 * succeed; fortunately, it is very unlikely to
3937				 * fail, since we just created it.
3938				 */
3939				VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx,
3940				    ZRENAMING, NULL), ==, 0);
3941			}
3942		}
3943		if (error == 0) {
3944			cache_purge(*svpp);
3945			if (*tvpp != NULL)
3946				cache_purge(*tvpp);
3947			cache_purge_negative(tdvp);
3948		}
3949	}
3950
3951	dmu_tx_commit(tx);
3952
3953unlockout:			/* all 4 vnodes are locked, ZFS_ENTER called */
3954	ZFS_EXIT(zfsvfs);
3955	VOP_UNLOCK(*svpp, 0);
3956	VOP_UNLOCK(sdvp, 0);
3957
3958out:				/* original two vnodes are locked */
3959	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3960		zil_commit(zilog, 0);
3961
3962	if (*tvpp != NULL)
3963		VOP_UNLOCK(*tvpp, 0);
3964	if (tdvp != *tvpp)
3965		VOP_UNLOCK(tdvp, 0);
3966	return (error);
3967}
3968
3969/*
3970 * Insert the indicated symbolic reference entry into the directory.
3971 *
3972 *	IN:	dvp	- Directory to contain new symbolic link.
3973 *		name	- Name of directory entry for the new symlink.
3974 *		vap	- Attributes of new entry.
3975 *		link	- Target path of the symlink (its contents).
3976 *		cr	- credentials of caller.
3977 *
3978 *	OUT:	vpp	- vnode of the created symlink.
3979 *
3979 *	RETURN:	0 on success, error code on failure.
3980 *
3981 * Timestamps:
3982 *	dvp - ctime|mtime updated
3983 */
3984/*ARGSUSED*/
3985static int
3986zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
3987    cred_t *cr, kthread_t *td)
3988{
3989	znode_t		*zp, *dzp = VTOZ(dvp);
3990	dmu_tx_t	*tx;
3991	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
3992	zilog_t		*zilog;
3993	uint64_t	len = strlen(link);
3994	int		error;
3995	zfs_acl_ids_t	acl_ids;
3996	boolean_t	fuid_dirtied;
3997	uint64_t	txtype = TX_SYMLINK;
3998	int		flags = 0;
3999
4000	ASSERT(vap->va_type == VLNK);
4001
4002	ZFS_ENTER(zfsvfs);
4003	ZFS_VERIFY_ZP(dzp);
4004	zilog = zfsvfs->z_log;
4005
4006	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
4007	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4008		ZFS_EXIT(zfsvfs);
4009		return (SET_ERROR(EILSEQ));
4010	}
4011
4012	if (len > MAXPATHLEN) {
4013		ZFS_EXIT(zfsvfs);
4014		return (SET_ERROR(ENAMETOOLONG));
4015	}
4016
4017	if ((error = zfs_acl_ids_create(dzp, 0,
4018	    vap, cr, NULL, &acl_ids)) != 0) {
4019		ZFS_EXIT(zfsvfs);
4020		return (error);
4021	}
4022
4023	/*
4024	 * Attempt to lock directory; fail if entry already exists.
4025	 */
4026	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
4027	if (error) {
4028		zfs_acl_ids_free(&acl_ids);
4029		ZFS_EXIT(zfsvfs);
4030		return (error);
4031	}
4032
4033	if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) != 0) {
4034		zfs_acl_ids_free(&acl_ids);
4035		ZFS_EXIT(zfsvfs);
4036		return (error);
4037	}
4038
4039	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
4040		zfs_acl_ids_free(&acl_ids);
4041		ZFS_EXIT(zfsvfs);
4042		return (SET_ERROR(EDQUOT));
4043	}
4044
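	/*
	 * Reserve the vnode before creating the DMU transaction so that
	 * vnode allocation cannot recurse into the filesystem (e.g. via
	 * reclamation) while the transaction is open.
	 */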
4045	getnewvnode_reserve(1);
4046	tx = dmu_tx_create(zfsvfs->z_os);
4047	fuid_dirtied = zfsvfs->z_fuid_dirty;
4048	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
4049	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4050	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
4051	    ZFS_SA_BASE_ATTR_SIZE + len);
4052	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
4053	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
4054		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
4055		    acl_ids.z_aclp->z_acl_bytes);
4056	}
4057	if (fuid_dirtied)
4058		zfs_fuid_txhold(zfsvfs, tx);
4059	error = dmu_tx_assign(tx, TXG_WAIT);
4060	if (error) {
4061		zfs_acl_ids_free(&acl_ids);
4062		dmu_tx_abort(tx);
4063		getnewvnode_drop_reserve();
4064		ZFS_EXIT(zfsvfs);
4065		return (error);
4066	}
4067
4068	/*
4069	 * Create a new object for the symlink.
4070	 * For version 4 ZPL datasets the symlink is stored as an SA attribute.
4071	 */
4072	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
4073
4074	if (fuid_dirtied)
4075		zfs_fuid_sync(zfsvfs, tx);
4076
4077	if (zp->z_is_sa)
4078		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
4079		    link, len, tx);
4080	else
4081		zfs_sa_symlink(zp, link, len, tx);
4082
4083	zp->z_size = len;
4084	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
4085	    &zp->z_size, sizeof (zp->z_size), tx);
4086	/*
4087	 * Insert the new object into the directory.
4088	 */
4089	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
4090
4091	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
4092	*vpp = ZTOV(zp);
4093
4094	zfs_acl_ids_free(&acl_ids);
4095
4096	dmu_tx_commit(tx);
4097
4098	getnewvnode_drop_reserve();
4099
4100	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4101		zil_commit(zilog, 0);
4102
4103	ZFS_EXIT(zfsvfs);
4104	return (error);
4105}
4106
4107/*
4108 * Return, in the buffer contained in the provided uio structure,
4109 * the symbolic path referred to by vp.
4110 *
4111 *	IN:	vp	- vnode of symbolic link.
4112 *		uio	- structure to contain the link path.
4113 *		cr	- credentials of caller.
4114 *		ct	- caller context
4115 *
4116 *	OUT:	uio	- structure containing the link path.
4117 *
4118 *	RETURN:	0 on success, error code on failure.
4119 *
4120 * Timestamps:
4121 *	vp - atime updated
4122 */
4123/* ARGSUSED */
4124static int
4125zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
4126{
4127	znode_t		*zp = VTOZ(vp);
4128	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4129	int		error;
4130
4131	ZFS_ENTER(zfsvfs);
4132	ZFS_VERIFY_ZP(zp);
4133
4134	if (zp->z_is_sa)
4135		error = sa_lookup_uio(zp->z_sa_hdl,
4136		    SA_ZPL_SYMLINK(zfsvfs), uio);
4137	else
4138		error = zfs_sa_readlink(zp, uio);
4139
4140	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4141
4142	ZFS_EXIT(zfsvfs);
4143	return (error);
4144}
4145
4146/*
4147 * Insert a new entry into directory tdvp referencing svp.
4148 *
4149 *	IN:	tdvp	- Directory to contain new entry.
4150 *		svp	- vnode of new entry.
4151 *		name	- name of new entry.
4152 *		cr	- credentials of caller.
4153 *		ct	- caller context
4154 *
4155 *	RETURN:	0 on success, error code on failure.
4156 *
4157 * Timestamps:
4158 *	tdvp - ctime|mtime updated
4159 *	 svp - ctime updated
4160 */
4161/* ARGSUSED */
4162static int
4163zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
4164    caller_context_t *ct, int flags)
4165{
4166	znode_t		*dzp = VTOZ(tdvp);
4167	znode_t		*tzp, *szp;
4168	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
4169	zilog_t		*zilog;
4170	dmu_tx_t	*tx;
4171	int		error;
4172	uint64_t	parent;
4173	uid_t		owner;
4174
4175	ASSERT(tdvp->v_type == VDIR);
4176
4177	ZFS_ENTER(zfsvfs);
4178	ZFS_VERIFY_ZP(dzp);
4179	zilog = zfsvfs->z_log;
4180
4181	/*
4182	 * POSIX dictates that we return EPERM here.
4183	 * Better choices include ENOTSUP or EISDIR.
4184	 */
4185	if (svp->v_type == VDIR) {
4186		ZFS_EXIT(zfsvfs);
4187		return (SET_ERROR(EPERM));
4188	}
4189
4190	szp = VTOZ(svp);
4191	ZFS_VERIFY_ZP(szp);
4192
4193	if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) {
4194		ZFS_EXIT(zfsvfs);
4195		return (SET_ERROR(EPERM));
4196	}
4197
4198	/* Prevent links to .zfs/shares files */
4199
4200	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
4201	    &parent, sizeof (uint64_t))) != 0) {
4202		ZFS_EXIT(zfsvfs);
4203		return (error);
4204	}
4205	if (parent == zfsvfs->z_shares_dir) {
4206		ZFS_EXIT(zfsvfs);
4207		return (SET_ERROR(EPERM));
4208	}
4209
4210	if (zfsvfs->z_utf8 && u8_validate(name,
4211	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4212		ZFS_EXIT(zfsvfs);
4213		return (SET_ERROR(EILSEQ));
4214	}
4215
4216	/*
4217	 * We do not support links between attributes and non-attributes
4218	 * because of the potential security risk of creating links
4219	 * into "normal" file space in order to circumvent restrictions
4220	 * imposed in attribute space.
4221	 */
4222	if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
4223		ZFS_EXIT(zfsvfs);
4224		return (SET_ERROR(EINVAL));
4225	}
4226
4227
4228	owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
4229	if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) {
4230		ZFS_EXIT(zfsvfs);
4231		return (SET_ERROR(EPERM));
4232	}
4233
4234	if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) != 0) {
4235		ZFS_EXIT(zfsvfs);
4236		return (error);
4237	}
4238
4239	/*
4240	 * Attempt to lock directory; fail if entry already exists.
4241	 */
4242	error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW);
4243	if (error) {
4244		ZFS_EXIT(zfsvfs);
4245		return (error);
4246	}
4247
4248	tx = dmu_tx_create(zfsvfs->z_os);
4249	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
4250	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4251	zfs_sa_upgrade_txholds(tx, szp);
4252	zfs_sa_upgrade_txholds(tx, dzp);
4253	error = dmu_tx_assign(tx, TXG_WAIT);
4254	if (error) {
4255		dmu_tx_abort(tx);
4256		ZFS_EXIT(zfsvfs);
4257		return (error);
4258	}
4259
4260	error = zfs_link_create(dzp, name, szp, tx, 0);
4261
4262	if (error == 0) {
4263		uint64_t txtype = TX_LINK;
4264		zfs_log_link(zilog, tx, txtype, dzp, szp, name);
4265	}
4266
4267	dmu_tx_commit(tx);
4268
4269	if (error == 0) {
4270		vnevent_link(svp, ct);
4271	}
4272
4273	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4274		zil_commit(zilog, 0);
4275
4276	ZFS_EXIT(zfsvfs);
4277	return (error);
4278}
4279
4280
4281/*ARGSUSED*/
4282void
4283zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4284{
4285	znode_t	*zp = VTOZ(vp);
4286	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4287	int error;
4288
4289	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4290	if (zp->z_sa_hdl == NULL) {
4291		/*
4292		 * The fs has been unmounted, or we did a
4293		 * suspend/resume and this file no longer exists.
4294		 */
4295		rw_exit(&zfsvfs->z_teardown_inactive_lock);
4296		vrecycle(vp);
4297		return;
4298	}
4299
4300	if (zp->z_unlinked) {
4301		/*
4302		 * Fast path to recycle a vnode of a removed file.
4303		 */
4304		rw_exit(&zfsvfs->z_teardown_inactive_lock);
4305		vrecycle(vp);
4306		return;
4307	}
4308
4309	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
4310		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
4311
4312		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4313		zfs_sa_upgrade_txholds(tx, zp);
4314		error = dmu_tx_assign(tx, TXG_WAIT);
4315		if (error) {
4316			dmu_tx_abort(tx);
4317		} else {
4318			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
4319			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
4320			zp->z_atime_dirty = 0;
4321			dmu_tx_commit(tx);
4322		}
4323	}
4324	rw_exit(&zfsvfs->z_teardown_inactive_lock);
4325}
4326
4327
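/*
 * Both ZFS file ID layouts must fit in the generic struct fid; these
 * compile-time assertions catch any header change that breaks that.
 */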
4328CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
4329CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
4330
4331/*ARGSUSED*/
4332static int
4333zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
4334{
4335	znode_t		*zp = VTOZ(vp);
4336	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4337	uint32_t	gen;
4338	uint64_t	gen64;
4339	uint64_t	object = zp->z_id;
4340	zfid_short_t	*zfid;
4341	int		size, i, error;
4342
4343	ZFS_ENTER(zfsvfs);
4344	ZFS_VERIFY_ZP(zp);
4345
4346	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
4347	    &gen64, sizeof (uint64_t))) != 0) {
4348		ZFS_EXIT(zfsvfs);
4349		return (error);
4350	}
4351
4352	gen = (uint32_t)gen64;
4353
4354	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
4355
4356#ifdef illumos
4357	if (fidp->fid_len < size) {
4358		fidp->fid_len = size;
4359		ZFS_EXIT(zfsvfs);
4360		return (SET_ERROR(ENOSPC));
4361	}
4362#else
4363	fidp->fid_len = size;
4364#endif
4365
4366	zfid = (zfid_short_t *)fidp;
4367
4368	zfid->zf_len = size;
4369
4370	for (i = 0; i < sizeof (zfid->zf_object); i++)
4371		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
4372
4373	/* Must have a non-zero generation number to distinguish from .zfs */
4374	if (gen == 0)
4375		gen = 1;
4376	for (i = 0; i < sizeof (zfid->zf_gen); i++)
4377		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
4378
4379	if (size == LONG_FID_LEN) {
4380		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
4381		zfid_long_t	*zlfid;
4382
4383		zlfid = (zfid_long_t *)fidp;
4384
4385		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
4386			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
4387
4388		/* XXX - this should be the generation number for the objset */
4389		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
4390			zlfid->zf_setgen[i] = 0;
4391	}
4392
4393	ZFS_EXIT(zfsvfs);
4394	return (0);
4395}
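
/*
 * Illustrative sketch (not compiled, names hypothetical): a consumer
 * such as an NFS server can invert the byte packing done in zfs_fid()
 * above to recover the object and generation numbers.
 */
#if 0
static void
zfid_decode_short(const zfid_short_t *zfid, uint64_t *objp, uint32_t *genp)
{
	uint64_t object = 0;
	uint32_t gen = 0;
	int i;

	/* Reassemble the little-endian byte arrays packed by zfs_fid(). */
	for (i = 0; i < sizeof (zfid->zf_object); i++)
		object |= (uint64_t)zfid->zf_object[i] << (8 * i);
	for (i = 0; i < sizeof (zfid->zf_gen); i++)
		gen |= (uint32_t)zfid->zf_gen[i] << (8 * i);

	*objp = object;
	*genp = gen;
}
#endif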
4396
4397static int
4398zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
4399    caller_context_t *ct)
4400{
4401	znode_t		*zp, *xzp;
4402	zfsvfs_t	*zfsvfs;
4403	int		error;
4404
4405	switch (cmd) {
4406	case _PC_LINK_MAX:
4407		*valp = INT_MAX;
4408		return (0);
4409
4410	case _PC_FILESIZEBITS:
4411		*valp = 64;
4412		return (0);
4413#ifdef illumos
4414	case _PC_XATTR_EXISTS:
4415		zp = VTOZ(vp);
4416		zfsvfs = zp->z_zfsvfs;
4417		ZFS_ENTER(zfsvfs);
4418		ZFS_VERIFY_ZP(zp);
4419		*valp = 0;
4420		error = zfs_dirent_lookup(zp, "", &xzp,
4421		    ZXATTR | ZEXISTS | ZSHARED);
4422		if (error == 0) {
4423			if (!zfs_dirempty(xzp))
4424				*valp = 1;
4425			vrele(ZTOV(xzp));
4426		} else if (error == ENOENT) {
4427			/*
4428			 * If the extended attribute directory does not
4429			 * exist, that is the same as having zero of them.
4430			 */
4431			error = 0;
4432		}
4433		ZFS_EXIT(zfsvfs);
4434		return (error);
4435
4436	case _PC_SATTR_ENABLED:
4437	case _PC_SATTR_EXISTS:
4438		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
4439		    (vp->v_type == VREG || vp->v_type == VDIR);
4440		return (0);
4441
4442	case _PC_ACCESS_FILTERING:
4443		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
4444		    vp->v_type == VDIR;
4445		return (0);
4446
4447	case _PC_ACL_ENABLED:
4448		*valp = _ACL_ACE_ENABLED;
4449		return (0);
4450#endif	/* illumos */
4451	case _PC_MIN_HOLE_SIZE:
4452		*valp = (int)SPA_MINBLOCKSIZE;
4453		return (0);
4454#ifdef illumos
4455	case _PC_TIMESTAMP_RESOLUTION:
4456		/* nanosecond timestamp resolution */
4457		*valp = 1L;
4458		return (0);
4459#endif
4460	case _PC_ACL_EXTENDED:
4461		*valp = 0;
4462		return (0);
4463
4464	case _PC_ACL_NFS4:
4465		*valp = 1;
4466		return (0);
4467
4468	case _PC_ACL_PATH_MAX:
4469		*valp = ACL_MAX_ENTRIES;
4470		return (0);
4471
4472	default:
4473		return (EOPNOTSUPP);
4474	}
4475}
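
/*
 * Illustrative userland sketch (not compiled here, path hypothetical):
 * pathconf(2) reaches zfs_pathconf() via zfs_freebsd_pathconf() below,
 * so _PC_MIN_HOLE_SIZE reports SPA_MINBLOCKSIZE and _PC_ACL_NFS4
 * reports that NFSv4 ACLs are available.
 */
#if 0
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	long hole = pathconf("/tank/file", _PC_MIN_HOLE_SIZE);
	long nfs4 = pathconf("/tank/file", _PC_ACL_NFS4);

	printf("min hole size: %ld, NFSv4 ACLs: %ld\n", hole, nfs4);
	return (0);
}
#endif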
4476
4477/*ARGSUSED*/
4478static int
4479zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4480    caller_context_t *ct)
4481{
4482	znode_t *zp = VTOZ(vp);
4483	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4484	int error;
4485	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4486
4487	ZFS_ENTER(zfsvfs);
4488	ZFS_VERIFY_ZP(zp);
4489	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
4490	ZFS_EXIT(zfsvfs);
4491
4492	return (error);
4493}
4494
4495/*ARGSUSED*/
4496int
4497zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4498    caller_context_t *ct)
4499{
4500	znode_t *zp = VTOZ(vp);
4501	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4502	int error;
4503	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4504	zilog_t	*zilog = zfsvfs->z_log;
4505
4506	ZFS_ENTER(zfsvfs);
4507	ZFS_VERIFY_ZP(zp);
4508
4509	error = zfs_setacl(zp, vsecp, skipaclchk, cr);
4510
4511	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4512		zil_commit(zilog, 0);
4513
4514	ZFS_EXIT(zfsvfs);
4515	return (error);
4516}
4517
4518static int
4519ioflags(int ioflags)
4520{
4521	int flags = 0;
4522
4523	if (ioflags & IO_APPEND)
4524		flags |= FAPPEND;
4525	if (ioflags & IO_NDELAY)
4526		flags |= FNONBLOCK;
4527	if (ioflags & IO_SYNC)
4528		flags |= (FSYNC | FDSYNC | FRSYNC);
4529
4530	return (flags);
4531}
4532
4533static int
4534zfs_getpages(struct vnode *vp, vm_page_t *m, int count, int reqpage)
4535{
4536	znode_t *zp = VTOZ(vp);
4537	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4538	objset_t *os = zp->z_zfsvfs->z_os;
4539	vm_page_t mfirst, mlast, mreq;
4540	vm_object_t object;
4541	caddr_t va;
4542	struct sf_buf *sf;
4543	off_t startoff, endoff;
4544	int i, error;
4545	vm_pindex_t reqstart, reqend;
4546	int pcount, lsize, reqsize, size;
4547
4548	ZFS_ENTER(zfsvfs);
4549	ZFS_VERIFY_ZP(zp);
4550
4551	pcount = OFF_TO_IDX(round_page(count));
4552	mreq = m[reqpage];
4553	object = mreq->object;
4554	error = 0;
4555
4556	KASSERT(vp->v_object == object, ("mismatching object"));
4557
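	/*
	 * If the dataset block size is larger than a page, widen the
	 * requested run (within the supplied page array) to cover whole
	 * blocks, since the DMU has to read the full block anyway.
	 */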
4558	if (pcount > 1 && zp->z_blksz > PAGESIZE) {
4559		startoff = rounddown(IDX_TO_OFF(mreq->pindex), zp->z_blksz);
4560		reqstart = OFF_TO_IDX(round_page(startoff));
4561		if (reqstart < m[0]->pindex)
4562			reqstart = 0;
4563		else
4564			reqstart = reqstart - m[0]->pindex;
4565		endoff = roundup(IDX_TO_OFF(mreq->pindex) + PAGE_SIZE,
4566		    zp->z_blksz);
4567		reqend = OFF_TO_IDX(trunc_page(endoff)) - 1;
4568		if (reqend > m[pcount - 1]->pindex)
4569			reqend = m[pcount - 1]->pindex;
4570		reqsize = reqend - m[reqstart]->pindex + 1;
4571		KASSERT(reqstart <= reqpage && reqpage < reqstart + reqsize,
4572		    ("reqpage beyond [reqstart, reqstart + reqsize) bounds"));
4573	} else {
4574		reqstart = reqpage;
4575		reqsize = 1;
4576	}
4577	mfirst = m[reqstart];
4578	mlast = m[reqstart + reqsize - 1];
4579
4580	zfs_vmobject_wlock(object);
4581
4582	for (i = 0; i < reqstart; i++) {
4583		vm_page_lock(m[i]);
4584		vm_page_free(m[i]);
4585		vm_page_unlock(m[i]);
4586	}
4587	for (i = reqstart + reqsize; i < pcount; i++) {
4588		vm_page_lock(m[i]);
4589		vm_page_free(m[i]);
4590		vm_page_unlock(m[i]);
4591	}
4592
4593	if (mreq->valid && reqsize == 1) {
4594		if (mreq->valid != VM_PAGE_BITS_ALL)
4595			vm_page_zero_invalid(mreq, TRUE);
4596		zfs_vmobject_wunlock(object);
4597		ZFS_EXIT(zfsvfs);
4598		return (zfs_vm_pagerret_ok);
4599	}
4600
4601	PCPU_INC(cnt.v_vnodein);
4602	PCPU_ADD(cnt.v_vnodepgsin, reqsize);
4603
4604	if (IDX_TO_OFF(mreq->pindex) >= object->un_pager.vnp.vnp_size) {
4605		for (i = reqstart; i < reqstart + reqsize; i++) {
4606			if (i != reqpage) {
4607				vm_page_lock(m[i]);
4608				vm_page_free(m[i]);
4609				vm_page_unlock(m[i]);
4610			}
4611		}
4612		zfs_vmobject_wunlock(object);
4613		ZFS_EXIT(zfsvfs);
4614		return (zfs_vm_pagerret_bad);
4615	}
4616
4617	lsize = PAGE_SIZE;
4618	if (IDX_TO_OFF(mlast->pindex) + lsize > object->un_pager.vnp.vnp_size)
4619		lsize = object->un_pager.vnp.vnp_size - IDX_TO_OFF(mlast->pindex);
4620
4621	zfs_vmobject_wunlock(object);
4622
4623	for (i = reqstart; i < reqstart + reqsize; i++) {
4624		size = PAGE_SIZE;
4625		if (i == (reqstart + reqsize - 1))
4626			size = lsize;
4627		va = zfs_map_page(m[i], &sf);
4628		error = dmu_read(os, zp->z_id, IDX_TO_OFF(m[i]->pindex),
4629		    size, va, DMU_READ_PREFETCH);
4630		if (size != PAGE_SIZE)
4631			bzero(va + size, PAGE_SIZE - size);
4632		zfs_unmap_page(sf);
4633		if (error != 0)
4634			break;
4635	}
4636
4637	zfs_vmobject_wlock(object);
4638
4639	for (i = reqstart; i < reqstart + reqsize; i++) {
4640		if (!error)
4641			m[i]->valid = VM_PAGE_BITS_ALL;
4642		KASSERT(m[i]->dirty == 0, ("zfs_getpages: page %p is dirty", m[i]));
4643		if (i != reqpage)
4644			vm_page_readahead_finish(m[i]);
4645	}
4646
4647	zfs_vmobject_wunlock(object);
4648
4649	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4650	ZFS_EXIT(zfsvfs);
4651	return (error ? zfs_vm_pagerret_error : zfs_vm_pagerret_ok);
4652}
4653
4654static int
4655zfs_freebsd_getpages(ap)
4656	struct vop_getpages_args /* {
4657		struct vnode *a_vp;
4658		vm_page_t *a_m;
4659		int a_count;
4660		int a_reqpage;
4661		vm_ooffset_t a_offset;
4662	} */ *ap;
4663{
4664
4665	return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_reqpage));
4666}
4667
4668static int
4669zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
4670    int *rtvals)
4671{
4672	znode_t		*zp = VTOZ(vp);
4673	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4674	rl_t		*rl;
4675	dmu_tx_t	*tx;
4676	struct sf_buf	*sf;
4677	vm_object_t	object;
4678	vm_page_t	m;
4679	caddr_t		va;
4680	size_t		tocopy;
4681	size_t		lo_len;
4682	vm_ooffset_t	lo_off;
4683	vm_ooffset_t	off;
4684	uint_t		blksz;
4685	int		ncount;
4686	int		pcount;
4687	int		err;
4688	int		i;
4689
4690	ZFS_ENTER(zfsvfs);
4691	ZFS_VERIFY_ZP(zp);
4692
4693	object = vp->v_object;
4694	pcount = btoc(len);
4695	ncount = pcount;
4696
4697	KASSERT(ma[0]->object == object, ("mismatching object"));
4698	KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length"));
4699
4700	for (i = 0; i < pcount; i++)
4701		rtvals[i] = zfs_vm_pagerret_error;
4702
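	/*
	 * Round the range lock out to whole dataset blocks so that a
	 * putpages covering part of a block cannot run concurrently
	 * with another writer modifying the same block.
	 */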
4703	off = IDX_TO_OFF(ma[0]->pindex);
4704	blksz = zp->z_blksz;
4705	lo_off = rounddown(off, blksz);
4706	lo_len = roundup(len + (off - lo_off), blksz);
4707	rl = zfs_range_lock(zp, lo_off, lo_len, RL_WRITER);
4708
4709	zfs_vmobject_wlock(object);
4710	if (len + off > object->un_pager.vnp.vnp_size) {
4711		if (object->un_pager.vnp.vnp_size > off) {
4712			int pgoff;
4713
4714			len = object->un_pager.vnp.vnp_size - off;
4715			ncount = btoc(len);
4716			if ((pgoff = (int)len & PAGE_MASK) != 0) {
4717				/*
4718				 * If the object is locked and the following
4719				 * conditions hold, then the page's dirty
4720				 * field cannot be concurrently changed by a
4721				 * pmap operation.
4722				 */
4723				m = ma[ncount - 1];
4724				vm_page_assert_sbusied(m);
4725				KASSERT(!pmap_page_is_write_mapped(m),
4726				    ("zfs_putpages: page %p is not read-only", m));
4727				vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
4728				    pgoff);
4729			}
4730		} else {
4731			len = 0;
4732			ncount = 0;
4733		}
4734		if (ncount < pcount) {
4735			for (i = ncount; i < pcount; i++) {
4736				rtvals[i] = zfs_vm_pagerret_bad;
4737			}
4738		}
4739	}
4740	zfs_vmobject_wunlock(object);
4741
4742	if (ncount == 0)
4743		goto out;
4744
4745	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
4746	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
4747		goto out;
4748	}
4749
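	/*
	 * Standard DMU transaction retry loop: a TXG_NOWAIT assignment
	 * fails with ERESTART when the current txg cannot accept more
	 * dirty data, in which case we wait for the next txg and retry.
	 */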
4750top:
4751	tx = dmu_tx_create(zfsvfs->z_os);
4752	dmu_tx_hold_write(tx, zp->z_id, off, len);
4753
4754	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4755	zfs_sa_upgrade_txholds(tx, zp);
4756	err = dmu_tx_assign(tx, TXG_NOWAIT);
4757	if (err != 0) {
4758		if (err == ERESTART) {
4759			dmu_tx_wait(tx);
4760			dmu_tx_abort(tx);
4761			goto top;
4762		}
4763		dmu_tx_abort(tx);
4764		goto out;
4765	}
4766
4767	if (zp->z_blksz < PAGE_SIZE) {
4769		for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
4770			tocopy = len > PAGE_SIZE ? PAGE_SIZE : len;
4771			va = zfs_map_page(ma[i], &sf);
4772			dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
4773			zfs_unmap_page(sf);
4774		}
4775	} else {
4776		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx);
4777	}
4778
4779	if (err == 0) {
4780		uint64_t mtime[2], ctime[2];
4781		sa_bulk_attr_t bulk[3];
4782		int count = 0;
4783
4784		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
4785		    &mtime, 16);
4786		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
4787		    &ctime, 16);
4788		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
4789		    &zp->z_pflags, 8);
4790		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
4791		    B_TRUE);
4792		(void)sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
4793		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
4794
4795		zfs_vmobject_wlock(object);
4796		for (i = 0; i < ncount; i++) {
4797			rtvals[i] = zfs_vm_pagerret_ok;
4798			vm_page_undirty(ma[i]);
4799		}
4800		zfs_vmobject_wunlock(object);
4801		PCPU_INC(cnt.v_vnodeout);
4802		PCPU_ADD(cnt.v_vnodepgsout, ncount);
4803	}
4804	dmu_tx_commit(tx);
4805
4806out:
4807	zfs_range_unlock(rl);
4808	if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 ||
4809	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4810		zil_commit(zfsvfs->z_log, zp->z_id);
4811	ZFS_EXIT(zfsvfs);
4812	return (rtvals[0]);
4813}
4814
4815int
4816zfs_freebsd_putpages(ap)
4817	struct vop_putpages_args /* {
4818		struct vnode *a_vp;
4819		vm_page_t *a_m;
4820		int a_count;
4821		int a_sync;
4822		int *a_rtvals;
4823		vm_ooffset_t a_offset;
4824	} */ *ap;
4825{
4826
4827	return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync,
4828	    ap->a_rtvals));
4829}
4830
4831static int
4832zfs_freebsd_bmap(ap)
4833	struct vop_bmap_args /* {
4834		struct vnode *a_vp;
4835		daddr_t  a_bn;
4836		struct bufobj **a_bop;
4837		daddr_t *a_bnp;
4838		int *a_runp;
4839		int *a_runb;
4840	} */ *ap;
4841{
4842
4843	if (ap->a_bop != NULL)
4844		*ap->a_bop = &ap->a_vp->v_bufobj;
4845	if (ap->a_bnp != NULL)
4846		*ap->a_bnp = ap->a_bn;
4847	if (ap->a_runp != NULL)
4848		*ap->a_runp = 0;
4849	if (ap->a_runb != NULL)
4850		*ap->a_runb = 0;
4851
4852	return (0);
4853}
4854
4855static int
4856zfs_freebsd_open(ap)
4857	struct vop_open_args /* {
4858		struct vnode *a_vp;
4859		int a_mode;
4860		struct ucred *a_cred;
4861		struct thread *a_td;
4862	} */ *ap;
4863{
4864	vnode_t	*vp = ap->a_vp;
4865	znode_t *zp = VTOZ(vp);
4866	int error;
4867
4868	error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
4869	if (error == 0)
4870		vnode_create_vobject(vp, zp->z_size, ap->a_td);
4871	return (error);
4872}
4873
4874static int
4875zfs_freebsd_close(ap)
4876	struct vop_close_args /* {
4877		struct vnode *a_vp;
4878		int  a_fflag;
4879		struct ucred *a_cred;
4880		struct thread *a_td;
4881	} */ *ap;
4882{
4883
4884	return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL));
4885}
4886
4887static int
4888zfs_freebsd_ioctl(ap)
4889	struct vop_ioctl_args /* {
4890		struct vnode *a_vp;
4891		u_long a_command;
4892		caddr_t a_data;
4893		int a_fflag;
4894		struct ucred *cred;
4895		struct thread *td;
4896	} */ *ap;
4897{
4898
4899	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
4900	    ap->a_fflag, ap->a_cred, NULL, NULL));
4901}
4902
4903static int
4904zfs_freebsd_read(ap)
4905	struct vop_read_args /* {
4906		struct vnode *a_vp;
4907		struct uio *a_uio;
4908		int a_ioflag;
4909		struct ucred *a_cred;
4910	} */ *ap;
4911{
4912
4913	return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
4914	    ap->a_cred, NULL));
4915}
4916
4917static int
4918zfs_freebsd_write(ap)
4919	struct vop_write_args /* {
4920		struct vnode *a_vp;
4921		struct uio *a_uio;
4922		int a_ioflag;
4923		struct ucred *a_cred;
4924	} */ *ap;
4925{
4926
4927	return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
4928	    ap->a_cred, NULL));
4929}
4930
4931static int
4932zfs_freebsd_access(ap)
4933	struct vop_access_args /* {
4934		struct vnode *a_vp;
4935		accmode_t a_accmode;
4936		struct ucred *a_cred;
4937		struct thread *a_td;
4938	} */ *ap;
4939{
4940	vnode_t *vp = ap->a_vp;
4941	znode_t *zp = VTOZ(vp);
4942	accmode_t accmode;
4943	int error = 0;
4944
4945	/*
4946	 * ZFS itself only knows about VREAD, VWRITE, VEXEC and VAPPEND.
4947	 */
4948	accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
4949	if (accmode != 0)
4950		error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL);
4951
4952	/*
4953	 * VADMIN has to be handled by vaccess().
4954	 */
4955	if (error == 0) {
4956		accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
4957		if (accmode != 0) {
4958			error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
4959			    zp->z_gid, accmode, ap->a_cred, NULL);
4960		}
4961	}
4962
4963	/*
4964	 * For VEXEC, ensure that at least one execute bit is set for
4965	 * non-directories.
4966	 */
4967	if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
4968	    (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
4969		error = EACCES;
4970	}
4971
4972	return (error);
4973}
4974
4975static int
4976zfs_freebsd_lookup(ap)
4977	struct vop_lookup_args /* {
4978		struct vnode *a_dvp;
4979		struct vnode **a_vpp;
4980		struct componentname *a_cnp;
4981	} */ *ap;
4982{
4983	struct componentname *cnp = ap->a_cnp;
4984	char nm[NAME_MAX + 1];
4985
4986	ASSERT(cnp->cn_namelen < sizeof(nm));
4987	strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
4988
4989	return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
4990	    cnp->cn_cred, cnp->cn_thread, 0));
4991}
4992
4993static int
4994zfs_cache_lookup(ap)
4995	struct vop_lookup_args /* {
4996		struct vnode *a_dvp;
4997		struct vnode **a_vpp;
4998		struct componentname *a_cnp;
4999	} */ *ap;
5000{
5001	zfsvfs_t *zfsvfs;
5002
5003	zfsvfs = ap->a_dvp->v_mount->mnt_data;
5004	if (zfsvfs->z_use_namecache)
5005		return (vfs_cache_lookup(ap));
5006	else
5007		return (zfs_freebsd_lookup(ap));
5008}
5009
5010static int
5011zfs_freebsd_create(ap)
5012	struct vop_create_args /* {
5013		struct vnode *a_dvp;
5014		struct vnode **a_vpp;
5015		struct componentname *a_cnp;
5016		struct vattr *a_vap;
5017	} */ *ap;
5018{
5019	zfsvfs_t *zfsvfs;
5020	struct componentname *cnp = ap->a_cnp;
5021	vattr_t *vap = ap->a_vap;
5022	int error, mode;
5023
5024	ASSERT(cnp->cn_flags & SAVENAME);
5025
5026	vattr_init_mask(vap);
5027	mode = vap->va_mode & ALLPERMS;
5028	zfsvfs = ap->a_dvp->v_mount->mnt_data;
5029
5030	error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
5031	    ap->a_vpp, cnp->cn_cred, cnp->cn_thread);
5032	if (zfsvfs->z_use_namecache &&
5033	    error == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
5034		cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
5035	return (error);
5036}
5037
5038static int
5039zfs_freebsd_remove(ap)
5040	struct vop_remove_args /* {
5041		struct vnode *a_dvp;
5042		struct vnode *a_vp;
5043		struct componentname *a_cnp;
5044	} */ *ap;
5045{
5046
5047	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
5048
5049	return (zfs_remove(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
5050	    ap->a_cnp->cn_cred));
5051}
5052
5053static int
5054zfs_freebsd_mkdir(ap)
5055	struct vop_mkdir_args /* {
5056		struct vnode *a_dvp;
5057		struct vnode **a_vpp;
5058		struct componentname *a_cnp;
5059		struct vattr *a_vap;
5060	} */ *ap;
5061{
5062	vattr_t *vap = ap->a_vap;
5063
5064	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
5065
5066	vattr_init_mask(vap);
5067
5068	return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
5069	    ap->a_cnp->cn_cred));
5070}
5071
5072static int
5073zfs_freebsd_rmdir(ap)
5074	struct vop_rmdir_args /* {
5075		struct vnode *a_dvp;
5076		struct vnode *a_vp;
5077		struct componentname *a_cnp;
5078	} */ *ap;
5079{
5080	struct componentname *cnp = ap->a_cnp;
5081
5082	ASSERT(cnp->cn_flags & SAVENAME);
5083
5084	return (zfs_rmdir(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
5085}
5086
5087static int
5088zfs_freebsd_readdir(ap)
5089	struct vop_readdir_args /* {
5090		struct vnode *a_vp;
5091		struct uio *a_uio;
5092		struct ucred *a_cred;
5093		int *a_eofflag;
5094		int *a_ncookies;
5095		u_long **a_cookies;
5096	} */ *ap;
5097{
5098
5099	return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
5100	    ap->a_ncookies, ap->a_cookies));
5101}
5102
5103static int
5104zfs_freebsd_fsync(ap)
5105	struct vop_fsync_args /* {
5106		struct vnode *a_vp;
5107		int a_waitfor;
5108		struct thread *a_td;
5109	} */ *ap;
5110{
5111
5112	vop_stdfsync(ap);
5113	return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL));
5114}
5115
5116static int
5117zfs_freebsd_getattr(ap)
5118	struct vop_getattr_args /* {
5119		struct vnode *a_vp;
5120		struct vattr *a_vap;
5121		struct ucred *a_cred;
5122	} */ *ap;
5123{
5124	vattr_t *vap = ap->a_vap;
5125	xvattr_t xvap;
5126	u_long fflags = 0;
5127	int error;
5128
5129	xva_init(&xvap);
5130	xvap.xva_vattr = *vap;
5131	xvap.xva_vattr.va_mask |= AT_XVATTR;
5132
5133	/* Request the ZFS-type flags that map to chflags bits. */
5134	/* XXX: what about SF_SETTABLE? */
5135	XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
5136	XVA_SET_REQ(&xvap, XAT_APPENDONLY);
5137	XVA_SET_REQ(&xvap, XAT_NOUNLINK);
5138	XVA_SET_REQ(&xvap, XAT_NODUMP);
5139	XVA_SET_REQ(&xvap, XAT_READONLY);
5140	XVA_SET_REQ(&xvap, XAT_ARCHIVE);
5141	XVA_SET_REQ(&xvap, XAT_SYSTEM);
5142	XVA_SET_REQ(&xvap, XAT_HIDDEN);
5143	XVA_SET_REQ(&xvap, XAT_REPARSE);
5144	XVA_SET_REQ(&xvap, XAT_OFFLINE);
5145	XVA_SET_REQ(&xvap, XAT_SPARSE);
5146
5147	error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
5148	if (error != 0)
5149		return (error);
5150
5151	/* Convert ZFS xattr into chflags. */
5152#define	FLAG_CHECK(fflag, xflag, xfield)	do {			\
5153	if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0)		\
5154		fflags |= (fflag);					\
5155} while (0)
5156	FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
5157	    xvap.xva_xoptattrs.xoa_immutable);
5158	FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
5159	    xvap.xva_xoptattrs.xoa_appendonly);
5160	FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
5161	    xvap.xva_xoptattrs.xoa_nounlink);
5162	FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE,
5163	    xvap.xva_xoptattrs.xoa_archive);
5164	FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
5165	    xvap.xva_xoptattrs.xoa_nodump);
5166	FLAG_CHECK(UF_READONLY, XAT_READONLY,
5167	    xvap.xva_xoptattrs.xoa_readonly);
5168	FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM,
5169	    xvap.xva_xoptattrs.xoa_system);
5170	FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN,
5171	    xvap.xva_xoptattrs.xoa_hidden);
5172	FLAG_CHECK(UF_REPARSE, XAT_REPARSE,
5173	    xvap.xva_xoptattrs.xoa_reparse);
5174	FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE,
5175	    xvap.xva_xoptattrs.xoa_offline);
5176	FLAG_CHECK(UF_SPARSE, XAT_SPARSE,
5177	    xvap.xva_xoptattrs.xoa_sparse);
5178
5179#undef	FLAG_CHECK
5180	*vap = xvap.xva_vattr;
5181	vap->va_flags = fflags;
5182	return (0);
5183}
5184
5185static int
5186zfs_freebsd_setattr(ap)
5187	struct vop_setattr_args /* {
5188		struct vnode *a_vp;
5189		struct vattr *a_vap;
5190		struct ucred *a_cred;
5191	} */ *ap;
5192{
5193	vnode_t *vp = ap->a_vp;
5194	vattr_t *vap = ap->a_vap;
5195	cred_t *cred = ap->a_cred;
5196	xvattr_t xvap;
5197	u_long fflags;
5198	uint64_t zflags;
5199
5200	vattr_init_mask(vap);
5201	vap->va_mask &= ~AT_NOSET;
5202
5203	xva_init(&xvap);
5204	xvap.xva_vattr = *vap;
5205
5206	zflags = VTOZ(vp)->z_pflags;
5207
5208	if (vap->va_flags != VNOVAL) {
5209		zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
5210		int error;
5211
5212		if (zfsvfs->z_use_fuids == B_FALSE)
5213			return (EOPNOTSUPP);
5214
5215		fflags = vap->va_flags;
5216		/*
5217		 * XXX KDM
5218		 * We need to figure out whether it makes sense to allow
5219		 * UF_REPARSE through, since we don't really have other
5220		 * facilities to handle reparse points and zfs_setattr()
5221		 * doesn't currently allow setting that attribute anyway.
5222		 */
5223		if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE|
5224		     UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE|
5225		     UF_OFFLINE|UF_SPARSE)) != 0)
5226			return (EOPNOTSUPP);
5227		/*
5228		 * Unprivileged processes are not permitted to unset system
5229		 * flags, or modify flags if any system flags are set.
5230		 * Privileged non-jail processes may not modify system flags
5231		 * if securelevel > 0 and any existing system flags are set.
5232		 * Privileged jail processes behave like privileged non-jail
5233	 * processes if the security.jail.chflags_allowed sysctl is
5234	 * non-zero; otherwise, they behave like unprivileged
5235		 * processes.
5236		 */
5237		if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
5238		    priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) {
5239			if (zflags &
5240			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
5241				error = securelevel_gt(cred, 0);
5242				if (error != 0)
5243					return (error);
5244			}
5245		} else {
5246			/*
5247			 * Callers may only modify the file flags on objects they
5248			 * have VADMIN rights for.
5249			 */
5250			if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0)
5251				return (error);
5252			if (zflags &
5253			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
5254				return (EPERM);
5255			}
5256			if (fflags &
5257			    (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
5258				return (EPERM);
5259			}
5260		}
5261
5262#define	FLAG_CHANGE(fflag, zflag, xflag, xfield)	do {		\
5263	if (((fflags & (fflag)) && !(zflags & (zflag))) ||		\
5264	    ((zflags & (zflag)) && !(fflags & (fflag)))) {		\
5265		XVA_SET_REQ(&xvap, (xflag));				\
5266		(xfield) = ((fflags & (fflag)) != 0);			\
5267	}								\
5268} while (0)
5269		/* Convert chflags into ZFS-type flags. */
5270		/* XXX: what about SF_SETTABLE? */
5271		FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
5272		    xvap.xva_xoptattrs.xoa_immutable);
5273		FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
5274		    xvap.xva_xoptattrs.xoa_appendonly);
5275		FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
5276		    xvap.xva_xoptattrs.xoa_nounlink);
5277		FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE,
5278		    xvap.xva_xoptattrs.xoa_archive);
5279		FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
5280		    xvap.xva_xoptattrs.xoa_nodump);
5281		FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY,
5282		    xvap.xva_xoptattrs.xoa_readonly);
5283		FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM,
5284		    xvap.xva_xoptattrs.xoa_system);
5285		FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN,
5286		    xvap.xva_xoptattrs.xoa_hidden);
5287		FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE,
5288		    xvap.xva_xoptattrs.xoa_reparse);
5289		FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE,
5290		    xvap.xva_xoptattrs.xoa_offline);
5291		FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE,
5292		    xvap.xva_xoptattrs.xoa_sparse);
5293#undef	FLAG_CHANGE
5294	}
5295	return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL));
5296}
5297
5298static int
5299zfs_freebsd_rename(ap)
5300	struct vop_rename_args  /* {
5301		struct vnode *a_fdvp;
5302		struct vnode *a_fvp;
5303		struct componentname *a_fcnp;
5304		struct vnode *a_tdvp;
5305		struct vnode *a_tvp;
5306		struct componentname *a_tcnp;
5307	} */ *ap;
5308{
5309	vnode_t *fdvp = ap->a_fdvp;
5310	vnode_t *fvp = ap->a_fvp;
5311	vnode_t *tdvp = ap->a_tdvp;
5312	vnode_t *tvp = ap->a_tvp;
5313	int error;
5314
5315	ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
5316	ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
5317
5318	error = zfs_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
5319	    ap->a_tcnp, ap->a_fcnp->cn_cred);
5320
5321	vrele(fdvp);
5322	vrele(fvp);
5323	vrele(tdvp);
5324	if (tvp != NULL)
5325		vrele(tvp);
5326
5327	return (error);
5328}
5329
5330static int
5331zfs_freebsd_symlink(ap)
5332	struct vop_symlink_args /* {
5333		struct vnode *a_dvp;
5334		struct vnode **a_vpp;
5335		struct componentname *a_cnp;
5336		struct vattr *a_vap;
5337		char *a_target;
5338	} */ *ap;
5339{
5340	struct componentname *cnp = ap->a_cnp;
5341	vattr_t *vap = ap->a_vap;
5342
5343	ASSERT(cnp->cn_flags & SAVENAME);
5344
5345	vap->va_type = VLNK;	/* FreeBSD: Syscall only sets va_mode. */
5346	vattr_init_mask(vap);
5347
5348	return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
5349	    ap->a_target, cnp->cn_cred, cnp->cn_thread));
5350}
5351
5352static int
5353zfs_freebsd_readlink(ap)
5354	struct vop_readlink_args /* {
5355		struct vnode *a_vp;
5356		struct uio *a_uio;
5357		struct ucred *a_cred;
5358	} */ *ap;
5359{
5360
5361	return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
5362}
5363
5364static int
5365zfs_freebsd_link(ap)
5366	struct vop_link_args /* {
5367		struct vnode *a_tdvp;
5368		struct vnode *a_vp;
5369		struct componentname *a_cnp;
5370	} */ *ap;
5371{
5372	struct componentname *cnp = ap->a_cnp;
5373	vnode_t *vp = ap->a_vp;
5374	vnode_t *tdvp = ap->a_tdvp;
5375
5376	if (tdvp->v_mount != vp->v_mount)
5377		return (EXDEV);
5378
5379	ASSERT(cnp->cn_flags & SAVENAME);
5380
5381	return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
5382}
5383
5384static int
5385zfs_freebsd_inactive(ap)
5386	struct vop_inactive_args /* {
5387		struct vnode *a_vp;
5388		struct thread *a_td;
5389	} */ *ap;
5390{
5391	vnode_t *vp = ap->a_vp;
5392
5393	zfs_inactive(vp, ap->a_td->td_ucred, NULL);
5394	return (0);
5395}
5396
5397static int
5398zfs_freebsd_reclaim(ap)
5399	struct vop_reclaim_args /* {
5400		struct vnode *a_vp;
5401		struct thread *a_td;
5402	} */ *ap;
5403{
5404	vnode_t	*vp = ap->a_vp;
5405	znode_t	*zp = VTOZ(vp);
5406	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5407
5408	ASSERT(zp != NULL);
5409
5410	/* Destroy the vm object and flush associated pages. */
5411	vnode_destroy_vobject(vp);
5412
5413	/*
5414	 * z_teardown_inactive_lock protects from a race with
5415	 * zfs_znode_dmu_fini in zfsvfs_teardown during
5416	 * force unmount.
5417	 */
5418	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
5419	if (zp->z_sa_hdl == NULL)
5420		zfs_znode_free(zp);
5421	else
5422		zfs_zinactive(zp);
5423	rw_exit(&zfsvfs->z_teardown_inactive_lock);
5424
5425	vp->v_data = NULL;
5426	return (0);
5427}
5428
5429static int
5430zfs_freebsd_fid(ap)
5431	struct vop_fid_args /* {
5432		struct vnode *a_vp;
5433		struct fid *a_fid;
5434	} */ *ap;
5435{
5436
5437	return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
5438}
5439
5440static int
5441zfs_freebsd_pathconf(ap)
5442	struct vop_pathconf_args /* {
5443		struct vnode *a_vp;
5444		int a_name;
5445		register_t *a_retval;
5446	} */ *ap;
5447{
5448	ulong_t val;
5449	int error;
5450
5451	error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL);
5452	if (error == 0)
5453		*ap->a_retval = val;
5454	else if (error == EOPNOTSUPP)
5455		error = vop_stdpathconf(ap);
5456	return (error);
5457}
5458
5459static int
5460zfs_freebsd_fifo_pathconf(ap)
5461	struct vop_pathconf_args /* {
5462		struct vnode *a_vp;
5463		int a_name;
5464		register_t *a_retval;
5465	} */ *ap;
5466{
5467
5468	switch (ap->a_name) {
5469	case _PC_ACL_EXTENDED:
5470	case _PC_ACL_NFS4:
5471	case _PC_ACL_PATH_MAX:
5472	case _PC_MAC_PRESENT:
5473		return (zfs_freebsd_pathconf(ap));
5474	default:
5475		return (fifo_specops.vop_pathconf(ap));
5476	}
5477}
5478
5479/*
5480 * FreeBSD's extended attribute namespaces map to name prefixes for the
5481 * underlying ZFS extended attributes:
5482 *
5483 *	NAMESPACE	PREFIX
5484 *	system		freebsd:system:
5485 *	user		(none, can be used to access ZFS fsattr(5) attributes
5486 *			created on Solaris)
5487 */
5488static int
5489zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
5490    size_t size)
5491{
5492	const char *namespace, *prefix, *suffix;
5493
5494	/* We don't allow the '/' character in attribute names. */
5495	if (strchr(name, '/') != NULL)
5496		return (EINVAL);
5497	/* We don't allow attribute names that start with the "freebsd:" prefix. */
5498	if (strncmp(name, "freebsd:", 8) == 0)
5499		return (EINVAL);
5500
5501	bzero(attrname, size);
5502
5503	switch (attrnamespace) {
5504	case EXTATTR_NAMESPACE_USER:
5505#if 0
5506		prefix = "freebsd:";
5507		namespace = EXTATTR_NAMESPACE_USER_STRING;
5508		suffix = ":";
5509#else
5510		/*
5511		 * This is the default namespace by which we can access all
5512		 * attributes created on Solaris.
5513		 */
5514		prefix = namespace = suffix = "";
5515#endif
5516		break;
5517	case EXTATTR_NAMESPACE_SYSTEM:
5518		prefix = "freebsd:";
5519		namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
5520		suffix = ":";
5521		break;
5522	case EXTATTR_NAMESPACE_EMPTY:
5523	default:
5524		return (EINVAL);
5525	}
5526	if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
5527	    name) >= size) {
5528		return (ENAMETOOLONG);
5529	}
5530	return (0);
5531}
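
/*
 * Illustrative examples (attribute name hypothetical): with the mapping
 * above, a system-namespace attribute "md5" becomes the ZFS attribute
 * "freebsd:system:md5", while a user-namespace "md5" stays plain "md5",
 * which is what makes Solaris-created attributes visible here.
 */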
5532
5533/*
5534 * Vnode operation to retrieve a named extended attribute.
5535 */
5536static int
5537zfs_getextattr(struct vop_getextattr_args *ap)
5538/*
5539vop_getextattr {
5540	IN struct vnode *a_vp;
5541	IN int a_attrnamespace;
5542	IN const char *a_name;
5543	INOUT struct uio *a_uio;
5544	OUT size_t *a_size;
5545	IN struct ucred *a_cred;
5546	IN struct thread *a_td;
5547};
5548*/
5549{
5550	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5551	struct thread *td = ap->a_td;
5552	struct nameidata nd;
5553	char attrname[255];
5554	struct vattr va;
5555	vnode_t *xvp = NULL, *vp;
5556	int error, flags;
5557
5558	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5559	    ap->a_cred, ap->a_td, VREAD);
5560	if (error != 0)
5561		return (error);
5562
5563	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5564	    sizeof(attrname));
5565	if (error != 0)
5566		return (error);
5567
5568	ZFS_ENTER(zfsvfs);
5569
5570	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5571	    LOOKUP_XATTR);
5572	if (error != 0) {
5573		ZFS_EXIT(zfsvfs);
5574		return (error);
5575	}
5576
5577	flags = FREAD;
5578	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
5579	    xvp, td);
5580	error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL);
5581	vp = nd.ni_vp;
5582	NDFREE(&nd, NDF_ONLY_PNBUF);
5583	if (error != 0) {
5584		ZFS_EXIT(zfsvfs);
5585		if (error == ENOENT)
5586			error = ENOATTR;
5587		return (error);
5588	}
5589
5590	if (ap->a_size != NULL) {
5591		error = VOP_GETATTR(vp, &va, ap->a_cred);
5592		if (error == 0)
5593			*ap->a_size = (size_t)va.va_size;
5594	} else if (ap->a_uio != NULL)
5595		error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred);
5596
5597	VOP_UNLOCK(vp, 0);
5598	vn_close(vp, flags, ap->a_cred, td);
5599	ZFS_EXIT(zfsvfs);
5600
5601	return (error);
5602}
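
/*
 * Illustrative userland sketch (not compiled here, path and attribute
 * name hypothetical): extattr_get_file(2) is one route into
 * zfs_getextattr(); passing a NULL buffer performs the size-only query
 * served by the ap->a_size branch above.
 */
#if 0
#include <sys/types.h>
#include <sys/extattr.h>
#include <stdio.h>

int
main(void)
{
	ssize_t size;

	size = extattr_get_file("/tank/file", EXTATTR_NAMESPACE_USER,
	    "myattr", NULL, 0);
	if (size != -1)
		printf("attribute is %zd bytes\n", size);
	return (0);
}
#endif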
5603
5604/*
5605 * Vnode operation to remove a named attribute.
5606 */
5607int
5608zfs_deleteextattr(struct vop_deleteextattr_args *ap)
5609/*
5610vop_deleteextattr {
5611	IN struct vnode *a_vp;
5612	IN int a_attrnamespace;
5613	IN const char *a_name;
5614	IN struct ucred *a_cred;
5615	IN struct thread *a_td;
5616};
5617*/
5618{
5619	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5620	struct thread *td = ap->a_td;
5621	struct nameidata nd;
5622	char attrname[255];
5624	vnode_t *xvp = NULL, *vp;
5625	int error;
5626
5627	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5628	    ap->a_cred, ap->a_td, VWRITE);
5629	if (error != 0)
5630		return (error);
5631
5632	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5633	    sizeof(attrname));
5634	if (error != 0)
5635		return (error);
5636
5637	ZFS_ENTER(zfsvfs);
5638
5639	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5640	    LOOKUP_XATTR);
5641	if (error != 0) {
5642		ZFS_EXIT(zfsvfs);
5643		return (error);
5644	}
5645
5646	NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
5647	    UIO_SYSSPACE, attrname, xvp, td);
5648	error = namei(&nd);
5649	vp = nd.ni_vp;
5650	if (error != 0) {
5651		ZFS_EXIT(zfsvfs);
5652		NDFREE(&nd, NDF_ONLY_PNBUF);
5653		if (error == ENOENT)
5654			error = ENOATTR;
5655		return (error);
5656	}
5657
5658	error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
5659	NDFREE(&nd, NDF_ONLY_PNBUF);
5660
5661	vput(nd.ni_dvp);
5662	if (vp == nd.ni_dvp)
5663		vrele(vp);
5664	else
5665		vput(vp);
5666	ZFS_EXIT(zfsvfs);
5667
5668	return (error);
5669}
5670
5671/*
5672 * Vnode operation to set a named attribute.
5673 */
5674static int
5675zfs_setextattr(struct vop_setextattr_args *ap)
5676/*
5677vop_setextattr {
5678	IN struct vnode *a_vp;
5679	IN int a_attrnamespace;
5680	IN const char *a_name;
5681	INOUT struct uio *a_uio;
5682	IN struct ucred *a_cred;
5683	IN struct thread *a_td;
5684};
5685*/
5686{
5687	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5688	struct thread *td = ap->a_td;
5689	struct nameidata nd;
5690	char attrname[255];
5691	struct vattr va;
5692	vnode_t *xvp = NULL, *vp;
5693	int error, flags;
5694
5695	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5696	    ap->a_cred, ap->a_td, VWRITE);
5697	if (error != 0)
5698		return (error);
5699
5700	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5701	    sizeof(attrname));
5702	if (error != 0)
5703		return (error);
5704
5705	ZFS_ENTER(zfsvfs);
5706
5707	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5708	    LOOKUP_XATTR | CREATE_XATTR_DIR);
5709	if (error != 0) {
5710		ZFS_EXIT(zfsvfs);
5711		return (error);
5712	}
5713
5714	flags = FFLAGS(O_WRONLY | O_CREAT);
5715	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
5716	    xvp, td);
5717	error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL);
5718	vp = nd.ni_vp;
5719	NDFREE(&nd, NDF_ONLY_PNBUF);
5720	if (error != 0) {
5721		ZFS_EXIT(zfsvfs);
5722		return (error);
5723	}
5724
5725	VATTR_NULL(&va);
5726	va.va_size = 0;
5727	error = VOP_SETATTR(vp, &va, ap->a_cred);
5728	if (error == 0)
5729		VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred);
5730
5731	VOP_UNLOCK(vp, 0);
5732	vn_close(vp, flags, ap->a_cred, td);
5733	ZFS_EXIT(zfsvfs);
5734
5735	return (error);
5736}
5737
5738/*
5739 * Vnode operation to list extended attribute names on a vnode.
5740 */
5741static int
5742zfs_listextattr(struct vop_listextattr_args *ap)
5743/*
5744vop_listextattr {
5745	IN struct vnode *a_vp;
5746	IN int a_attrnamespace;
5747	INOUT struct uio *a_uio;
5748	OUT size_t *a_size;
5749	IN struct ucred *a_cred;
5750	IN struct thread *a_td;
5751};
5752*/
5753{
5754	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5755	struct thread *td = ap->a_td;
5756	struct nameidata nd;
5757	char attrprefix[16];
5758	u_char dirbuf[sizeof(struct dirent)];
5759	struct dirent *dp;
5760	struct iovec aiov;
5761	struct uio auio, *uio = ap->a_uio;
5762	size_t *sizep = ap->a_size;
5763	size_t plen;
5764	vnode_t *xvp = NULL, *vp;
5765	int done, error, eof, pos;
5766
5767	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5768	    ap->a_cred, ap->a_td, VREAD);
5769	if (error != 0)
5770		return (error);
5771
5772	error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
5773	    sizeof(attrprefix));
5774	if (error != 0)
5775		return (error);
5776	plen = strlen(attrprefix);
5777
5778	ZFS_ENTER(zfsvfs);
5779
5780	if (sizep != NULL)
5781		*sizep = 0;
5782
5783	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5784	    LOOKUP_XATTR);
5785	if (error != 0) {
5786		ZFS_EXIT(zfsvfs);
5787		/*
5788		 * ENOATTR means that the EA directory does not yet exist,
5789		 * i.e. there are no extended attributes there.
5790		 */
5791		if (error == ENOATTR)
5792			error = 0;
5793		return (error);
5794	}
5795
5796	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
5797	    UIO_SYSSPACE, ".", xvp, td);
5798	error = namei(&nd);
5799	vp = nd.ni_vp;
5800	NDFREE(&nd, NDF_ONLY_PNBUF);
5801	if (error != 0) {
5802		ZFS_EXIT(zfsvfs);
5803		return (error);
5804	}
5805
5806	auio.uio_iov = &aiov;
5807	auio.uio_iovcnt = 1;
5808	auio.uio_segflg = UIO_SYSSPACE;
5809	auio.uio_td = td;
5810	auio.uio_rw = UIO_READ;
5811	auio.uio_offset = 0;
5812
5813	do {
5814		u_char nlen;
5815
5816		aiov.iov_base = (void *)dirbuf;
5817		aiov.iov_len = sizeof(dirbuf);
5818		auio.uio_resid = sizeof(dirbuf);
5819		error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
5820		done = sizeof(dirbuf) - auio.uio_resid;
5821		if (error != 0)
5822			break;
5823		for (pos = 0; pos < done;) {
5824			dp = (struct dirent *)(dirbuf + pos);
5825			pos += dp->d_reclen;
5826			/*
5827			 * XXX: Temporarily we also accept DT_UNKNOWN, as this
5828			 * is what we get when the attribute was created on Solaris.
5829			 */
5830			if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
5831				continue;
5832			if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0)
5833				continue;
5834			else if (strncmp(dp->d_name, attrprefix, plen) != 0)
5835				continue;
5836			nlen = dp->d_namlen - plen;
5837			if (sizep != NULL)
5838				*sizep += 1 + nlen;
5839			else if (uio != NULL) {
5840				/*
5841				 * Format of extattr name entry is one byte for
5842				 * length and the rest for name.
5843				 */
5844				error = uiomove(&nlen, 1, uio->uio_rw, uio);
5845				if (error == 0) {
5846					error = uiomove(dp->d_name + plen, nlen,
5847					    uio->uio_rw, uio);
5848				}
5849				if (error != 0)
5850					break;
5851			}
5852		}
5853	} while (!eof && error == 0);
5854
5855	vput(vp);
5856	ZFS_EXIT(zfsvfs);
5857
5858	return (error);
5859}
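
/*
 * Illustrative userland sketch (not compiled here, path hypothetical):
 * the length-prefixed records built above are exactly what
 * extattr_list_file(2) returns; each entry is one unsigned length byte
 * followed by that many name bytes, with no NUL terminator.
 */
#if 0
#include <sys/types.h>
#include <sys/extattr.h>
#include <stdio.h>

int
main(void)
{
	char buf[1024];
	ssize_t nbytes;
	int pos;

	nbytes = extattr_list_file("/tank/file", EXTATTR_NAMESPACE_USER,
	    buf, sizeof(buf));
	for (pos = 0; pos < nbytes;) {
		u_char nlen = (u_char)buf[pos++];

		printf("%.*s\n", (int)nlen, buf + pos);
		pos += nlen;
	}
	return (0);
}
#endif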
5860
5861int
5862zfs_freebsd_getacl(ap)
5863	struct vop_getacl_args /* {
5864		struct vnode *vp;
5865		acl_type_t type;
5866		struct acl *aclp;
5867		struct ucred *cred;
5868		struct thread *td;
5869	} */ *ap;
5870{
5871	int		error;
5872	vsecattr_t      vsecattr;
5873
5874	if (ap->a_type != ACL_TYPE_NFS4)
5875		return (EINVAL);
5876
5877	vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
5878	if ((error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL)) != 0)
5879		return (error);
5880
5881	error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt);
5882	if (vsecattr.vsa_aclentp != NULL)
5883		kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);
5884
5885	return (error);
5886}
5887
5888int
5889zfs_freebsd_setacl(ap)
5890	struct vop_setacl_args /* {
5891		struct vnode *vp;
5892		acl_type_t type;
5893		struct acl *aclp;
5894		struct ucred *cred;
5895		struct thread *td;
5896	} */ *ap;
5897{
5898	int		error;
5899	vsecattr_t      vsecattr;
5900	int		aclbsize;	/* size of acl list in bytes */
5901	aclent_t	*aaclp;
5902
5903	if (ap->a_type != ACL_TYPE_NFS4)
5904		return (EINVAL);
5905
5906	if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
5907		return (EINVAL);
5908
5909	/*
5910	 * With NFSv4 ACLs, chmod(2) may need to add entries, splitting
5911	 * every existing entry into two and appending the "canonical six"
5912	 * entries at the end.  Don't allow setting an ACL that would
5913	 * cause chmod(2) to run out of ACL entries.
5914	 */
5915	if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
5916		return (ENOSPC);
5917
5918	error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
5919	if (error != 0)
5920		return (error);
5921
5922	vsecattr.vsa_mask = VSA_ACE;
5923	aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t);
5924	vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
5925	aaclp = vsecattr.vsa_aclentp;
5926	vsecattr.vsa_aclentsz = aclbsize;
5927
5928	aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
5929	error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL);
5930	kmem_free(aaclp, aclbsize);
5931
5932	return (error);
5933}
5934
5935int
5936zfs_freebsd_aclcheck(ap)
5937	struct vop_aclcheck_args /* {
5938		struct vnode *vp;
5939		acl_type_t type;
5940		struct acl *aclp;
5941		struct ucred *cred;
5942		struct thread *td;
5943	} */ *ap;
5944{
5945
5946	return (EOPNOTSUPP);
5947}
5948
5949static int
5950zfs_vptocnp(struct vop_vptocnp_args *ap)
5951{
5952	vnode_t *covered_vp;
5953	vnode_t *vp = ap->a_vp;
5954	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
5955	znode_t *zp = VTOZ(vp);
5956	uint64_t parent;
5957	int ltype;
5958	int error;
5959
5960	ZFS_ENTER(zfsvfs);
5961	ZFS_VERIFY_ZP(zp);
5962
5963	/*
5964	 * If we are a snapshot mounted under .zfs, run the operation
5965	 * on the covered vnode.
5966	 */
5967	if ((error = sa_lookup(zp->z_sa_hdl,
5968	    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) {
5969		ZFS_EXIT(zfsvfs);
5970		return (error);
5971	}
5972
5973	if (zp->z_id != parent || zfsvfs->z_parent == zfsvfs) {
5974		ZFS_EXIT(zfsvfs);
5975		return (vop_stdvptocnp(ap));
5976	}
5977	ZFS_EXIT(zfsvfs);
5978
5979	covered_vp = vp->v_mount->mnt_vnodecovered;
5980	vhold(covered_vp);
5981	ltype = VOP_ISLOCKED(vp);
5982	VOP_UNLOCK(vp, 0);
5983	error = vget(covered_vp, LK_EXCLUSIVE, curthread);
5984	vdrop(covered_vp);
5985	if (error == 0) {
5986		error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred,
5987		    ap->a_buf, ap->a_buflen);
5988		vput(covered_vp);
5989	}
5990	vn_lock(vp, ltype | LK_RETRY);
5991	if ((vp->v_iflag & VI_DOOMED) != 0)
5992		error = SET_ERROR(ENOENT);
5993	return (error);
5994}
5995
5996#ifdef DIAGNOSTIC
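/*
 * Diagnostic wrapper around vop_stdlock(): verify that the ZFS
 * teardown lock is not held across a vnode lock acquisition, which
 * could otherwise deadlock against a forced unmount.
 */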
5997static int
5998zfs_lock(ap)
5999	struct vop_lock1_args /* {
6000		struct vnode *a_vp;
6001		int a_flags;
6002		char *file;
6003		int line;
6004	} */ *ap;
6005{
6006	zfsvfs_t *zfsvfs;
6007	znode_t *zp;
6008	vnode_t *vp;
6009	int flags;
6010	int err;
6011
6012	vp = ap->a_vp;
6013	flags = ap->a_flags;
6014	if ((flags & LK_INTERLOCK) == 0 && (flags & LK_NOWAIT) == 0 &&
6015	    (vp->v_iflag & VI_DOOMED) == 0 && (zp = vp->v_data) != NULL) {
6016		zfsvfs = zp->z_zfsvfs;
6017		VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
6018	}
6019	err = vop_stdlock(ap);
6020	if ((flags & LK_INTERLOCK) != 0 && (flags & LK_NOWAIT) == 0 &&
6021	    (vp->v_iflag & VI_DOOMED) == 0 && (zp = vp->v_data) != NULL) {
6022		zfsvfs = zp->z_zfsvfs;
6023		VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
6024	}
6025	return (err);
6026}
6027#endif
6028
6029struct vop_vector zfs_vnodeops;
6030struct vop_vector zfs_fifoops;
6031struct vop_vector zfs_shareops;
6032
6033struct vop_vector zfs_vnodeops = {
6034	.vop_default =		&default_vnodeops,
6035	.vop_inactive =		zfs_freebsd_inactive,
6036	.vop_reclaim =		zfs_freebsd_reclaim,
6037	.vop_access =		zfs_freebsd_access,
6038	.vop_lookup =		zfs_cache_lookup,
6039	.vop_cachedlookup =	zfs_freebsd_lookup,
6040	.vop_getattr =		zfs_freebsd_getattr,
6041	.vop_setattr =		zfs_freebsd_setattr,
6042	.vop_create =		zfs_freebsd_create,
6043	.vop_mknod =		zfs_freebsd_create,
6044	.vop_mkdir =		zfs_freebsd_mkdir,
6045	.vop_readdir =		zfs_freebsd_readdir,
6046	.vop_fsync =		zfs_freebsd_fsync,
6047	.vop_open =		zfs_freebsd_open,
6048	.vop_close =		zfs_freebsd_close,
6049	.vop_rmdir =		zfs_freebsd_rmdir,
6050	.vop_ioctl =		zfs_freebsd_ioctl,
6051	.vop_link =		zfs_freebsd_link,
6052	.vop_symlink =		zfs_freebsd_symlink,
6053	.vop_readlink =		zfs_freebsd_readlink,
6054	.vop_read =		zfs_freebsd_read,
6055	.vop_write =		zfs_freebsd_write,
6056	.vop_remove =		zfs_freebsd_remove,
6057	.vop_rename =		zfs_freebsd_rename,
6058	.vop_pathconf =		zfs_freebsd_pathconf,
6059	.vop_bmap =		zfs_freebsd_bmap,
6060	.vop_fid =		zfs_freebsd_fid,
6061	.vop_getextattr =	zfs_getextattr,
6062	.vop_deleteextattr =	zfs_deleteextattr,
6063	.vop_setextattr =	zfs_setextattr,
6064	.vop_listextattr =	zfs_listextattr,
6065	.vop_getacl =		zfs_freebsd_getacl,
6066	.vop_setacl =		zfs_freebsd_setacl,
6067	.vop_aclcheck =		zfs_freebsd_aclcheck,
6068	.vop_getpages =		zfs_freebsd_getpages,
6069	.vop_putpages =		zfs_freebsd_putpages,
6070	.vop_vptocnp =		zfs_vptocnp,
6071#ifdef DIAGNOSTIC
6072	.vop_lock1 =		zfs_lock,
6073#endif
6074};
6075
6076struct vop_vector zfs_fifoops = {
6077	.vop_default =		&fifo_specops,
6078	.vop_fsync =		zfs_freebsd_fsync,
6079	.vop_access =		zfs_freebsd_access,
6080	.vop_getattr =		zfs_freebsd_getattr,
6081	.vop_inactive =		zfs_freebsd_inactive,
6082	.vop_read =		VOP_PANIC,
6083	.vop_reclaim =		zfs_freebsd_reclaim,
6084	.vop_setattr =		zfs_freebsd_setattr,
6085	.vop_write =		VOP_PANIC,
6086	.vop_pathconf = 	zfs_freebsd_fifo_pathconf,
6087	.vop_fid =		zfs_freebsd_fid,
6088	.vop_getacl =		zfs_freebsd_getacl,
6089	.vop_setacl =		zfs_freebsd_setacl,
6090	.vop_aclcheck =		zfs_freebsd_aclcheck,
6091};
6092
6093/*
6094 * special share hidden files vnode operations template
6095 */
6096struct vop_vector zfs_shareops = {
6097	.vop_default =		&default_vnodeops,
6098	.vop_access =		zfs_freebsd_access,
6099	.vop_inactive =		zfs_freebsd_inactive,
6100	.vop_reclaim =		zfs_freebsd_reclaim,
6101	.vop_fid =		zfs_freebsd_fid,
6102	.vop_pathconf =		zfs_freebsd_pathconf,
6103};
6104