zfs_vnops.c revision 274337
1168404Spjd/*
2168404Spjd * CDDL HEADER START
3168404Spjd *
4168404Spjd * The contents of this file are subject to the terms of the
5168404Spjd * Common Development and Distribution License (the "License").
6168404Spjd * You may not use this file except in compliance with the License.
7168404Spjd *
8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9168404Spjd * or http://www.opensolaris.org/os/licensing.
10168404Spjd * See the License for the specific language governing permissions
11168404Spjd * and limitations under the License.
12168404Spjd *
13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each
14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15168404Spjd * If applicable, add the following below this CDDL HEADER, with the
16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying
17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
18168404Spjd *
19168404Spjd * CDDL HEADER END
20168404Spjd */
21168404Spjd/*
22212694Smm * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23271536Sdelphij * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
24262990Sdelphij * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
25168404Spjd */
26168404Spjd
27169195Spjd/* Portions Copyright 2007 Jeremy Teo */
28219089Spjd/* Portions Copyright 2010 Robert Milkowski */
29169195Spjd
30168404Spjd#include <sys/types.h>
31168404Spjd#include <sys/param.h>
32168404Spjd#include <sys/time.h>
33168404Spjd#include <sys/systm.h>
34168404Spjd#include <sys/sysmacros.h>
35168404Spjd#include <sys/resource.h>
36168404Spjd#include <sys/vfs.h>
37248084Sattilio#include <sys/vm.h>
38168404Spjd#include <sys/vnode.h>
39168404Spjd#include <sys/file.h>
40168404Spjd#include <sys/stat.h>
41168404Spjd#include <sys/kmem.h>
42168404Spjd#include <sys/taskq.h>
43168404Spjd#include <sys/uio.h>
44168404Spjd#include <sys/atomic.h>
45168404Spjd#include <sys/namei.h>
46168404Spjd#include <sys/mman.h>
47168404Spjd#include <sys/cmn_err.h>
48168404Spjd#include <sys/errno.h>
49168404Spjd#include <sys/unistd.h>
50168404Spjd#include <sys/zfs_dir.h>
51168404Spjd#include <sys/zfs_ioctl.h>
52168404Spjd#include <sys/fs/zfs.h>
53168404Spjd#include <sys/dmu.h>
54219089Spjd#include <sys/dmu_objset.h>
55168404Spjd#include <sys/spa.h>
56168404Spjd#include <sys/txg.h>
57168404Spjd#include <sys/dbuf.h>
58168404Spjd#include <sys/zap.h>
59219089Spjd#include <sys/sa.h>
60168404Spjd#include <sys/dirent.h>
61168962Spjd#include <sys/policy.h>
62168962Spjd#include <sys/sunddi.h>
63168404Spjd#include <sys/filio.h>
64209962Smm#include <sys/sid.h>
65168404Spjd#include <sys/zfs_ctldir.h>
66185029Spjd#include <sys/zfs_fuid.h>
67219089Spjd#include <sys/zfs_sa.h>
68168404Spjd#include <sys/dnlc.h>
69168404Spjd#include <sys/zfs_rlock.h>
70185029Spjd#include <sys/extdirent.h>
71185029Spjd#include <sys/kidmap.h>
72168404Spjd#include <sys/bio.h>
73168404Spjd#include <sys/buf.h>
74168404Spjd#include <sys/sched.h>
75192800Strasz#include <sys/acl.h>
76239077Smarius#include <vm/vm_param.h>
77215401Savg#include <vm/vm_pageout.h>
78168404Spjd
79168404Spjd/*
80168404Spjd * Programming rules.
81168404Spjd *
82168404Spjd * Each vnode op performs some logical unit of work.  To do this, the ZPL must
83168404Spjd * properly lock its in-core state, create a DMU transaction, do the work,
84168404Spjd * record this work in the intent log (ZIL), commit the DMU transaction,
85185029Spjd * and wait for the intent log to commit if it is a synchronous operation.
86185029Spjd * Moreover, the vnode ops must work in both normal and log replay context.
87168404Spjd * The ordering of events is important to avoid deadlocks and references
88168404Spjd * to freed memory.  The example below illustrates the following Big Rules:
89168404Spjd *
 *  (1)	A check must be made in each zfs thread for a mounted file system.
 *	This is done, in a race-free manner, with ZFS_ENTER(zfsvfs).
 *	A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
 *	must be checked with ZFS_VERIFY_ZP(zp).  Both ZFS_ENTER() and
 *	ZFS_VERIFY_ZP() can make the calling function return EIO.
95168404Spjd *
96168404Spjd *  (2)	VN_RELE() should always be the last thing except for zil_commit()
97168404Spjd *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
98168404Spjd *	First, if it's the last reference, the vnode/znode
99168404Spjd *	can be freed, so the zp may point to freed memory.  Second, the last
100168404Spjd *	reference will call zfs_zinactive(), which may induce a lot of work --
101168404Spjd *	pushing cached pages (which acquires range locks) and syncing out
102168404Spjd *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
103168404Spjd *	which could deadlock the system if you were already holding one.
104191900Skmacy *	If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
105168404Spjd *
106168404Spjd *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
107168404Spjd *	as they can span dmu_tx_assign() calls.
108168404Spjd *
109258720Savg *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
110258720Savg *      dmu_tx_assign().  This is critical because we don't want to block
111258720Savg *      while holding locks.
112168404Spjd *
113258720Savg *	If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT.  This
114258720Savg *	reduces lock contention and CPU usage when we must wait (note that if
115258720Savg *	throughput is constrained by the storage, nearly every transaction
116258720Savg *	must wait).
117258720Savg *
118258720Savg *      Note, in particular, that if a lock is sometimes acquired before
119258720Savg *      the tx assigns, and sometimes after (e.g. z_lock), then failing
120258720Savg *      to use a non-blocking assign can deadlock the system.  The scenario:
121258720Savg *
122168404Spjd *	Thread A has grabbed a lock before calling dmu_tx_assign().
123168404Spjd *	Thread B is in an already-assigned tx, and blocks for this lock.
124168404Spjd *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
125168404Spjd *	forever, because the previous txg can't quiesce until B's tx commits.
126168404Spjd *
127168404Spjd *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
128258632Savg *	then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
129258632Savg *	calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT,
130258632Savg *	to indicate that this operation has already called dmu_tx_wait().
131258632Savg *	This will ensure that we don't retry forever, waiting a short bit
132258632Savg *	each time.
133168404Spjd *
134168404Spjd *  (5)	If the operation succeeded, generate the intent log entry for it
135168404Spjd *	before dropping locks.  This ensures that the ordering of events
136168404Spjd *	in the intent log matches the order in which they actually occurred.
137251631Sdelphij *	During ZIL replay the zfs_log_* functions will update the sequence
138209962Smm *	number to indicate the zil transaction has replayed.
139168404Spjd *
140168404Spjd *  (6)	At the end of each vnode op, the DMU tx must always commit,
141168404Spjd *	regardless of whether there were any errors.
142168404Spjd *
143219089Spjd *  (7)	After dropping all locks, invoke zil_commit(zilog, foid)
144168404Spjd *	to ensure that synchronous semantics are provided when necessary.
145168404Spjd *
146168404Spjd * In general, this is how things should be ordered in each vnode op:
147168404Spjd *
148168404Spjd *	ZFS_ENTER(zfsvfs);		// exit if unmounted
149168404Spjd * top:
150168404Spjd *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may VN_HOLD())
151168404Spjd *	rw_enter(...);			// grab any other locks you need
152168404Spjd *	tx = dmu_tx_create(...);	// get DMU tx
153168404Spjd *	dmu_tx_hold_*();		// hold each object you might modify
154258632Savg *	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
155168404Spjd *	if (error) {
156168404Spjd *		rw_exit(...);		// drop locks
157168404Spjd *		zfs_dirent_unlock(dl);	// unlock directory entry
158168404Spjd *		VN_RELE(...);		// release held vnodes
159209962Smm *		if (error == ERESTART) {
160258632Savg *			waited = B_TRUE;
161168404Spjd *			dmu_tx_wait(tx);
162168404Spjd *			dmu_tx_abort(tx);
163168404Spjd *			goto top;
164168404Spjd *		}
165168404Spjd *		dmu_tx_abort(tx);	// abort DMU tx
166168404Spjd *		ZFS_EXIT(zfsvfs);	// finished in zfs
167168404Spjd *		return (error);		// really out of space
168168404Spjd *	}
169168404Spjd *	error = do_real_work();		// do whatever this VOP does
170168404Spjd *	if (error == 0)
171168404Spjd *		zfs_log_*(...);		// on success, make ZIL entry
172168404Spjd *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
173168404Spjd *	rw_exit(...);			// drop locks
174168404Spjd *	zfs_dirent_unlock(dl);		// unlock directory entry
175168404Spjd *	VN_RELE(...);			// release held vnodes
176219089Spjd *	zil_commit(zilog, foid);	// synchronous when necessary
177168404Spjd *	ZFS_EXIT(zfsvfs);		// finished in zfs
178168404Spjd *	return (error);			// done, report error
179168404Spjd */
180185029Spjd
181168404Spjd/* ARGSUSED */
182168404Spjdstatic int
183185029Spjdzfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
184168404Spjd{
185168962Spjd	znode_t	*zp = VTOZ(*vpp);
186209962Smm	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
187168404Spjd
188209962Smm	ZFS_ENTER(zfsvfs);
189209962Smm	ZFS_VERIFY_ZP(zp);
190209962Smm
191219089Spjd	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
192185029Spjd	    ((flag & FAPPEND) == 0)) {
193209962Smm		ZFS_EXIT(zfsvfs);
194249195Smm		return (SET_ERROR(EPERM));
195185029Spjd	}
196185029Spjd
197185029Spjd	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
198185029Spjd	    ZTOV(zp)->v_type == VREG &&
199219089Spjd	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
200209962Smm		if (fs_vscan(*vpp, cr, 0) != 0) {
201209962Smm			ZFS_EXIT(zfsvfs);
202249195Smm			return (SET_ERROR(EACCES));
203209962Smm		}
204209962Smm	}
205185029Spjd
206168404Spjd	/* Keep a count of the synchronous opens in the znode */
207168962Spjd	if (flag & (FSYNC | FDSYNC))
208168404Spjd		atomic_inc_32(&zp->z_sync_cnt);
209185029Spjd
210209962Smm	ZFS_EXIT(zfsvfs);
211168404Spjd	return (0);
212168404Spjd}
213168404Spjd
214168404Spjd/* ARGSUSED */
215168404Spjdstatic int
216185029Spjdzfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
217185029Spjd    caller_context_t *ct)
218168404Spjd{
219168962Spjd	znode_t	*zp = VTOZ(vp);
220209962Smm	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
221168404Spjd
222210470Smm	/*
223210470Smm	 * Clean up any locks held by this process on the vp.
224210470Smm	 */
225210470Smm	cleanlocks(vp, ddi_get_pid(), 0);
226210470Smm	cleanshares(vp, ddi_get_pid());
227210470Smm
228209962Smm	ZFS_ENTER(zfsvfs);
229209962Smm	ZFS_VERIFY_ZP(zp);
230209962Smm
231168404Spjd	/* Decrement the synchronous opens in the znode */
232185029Spjd	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
233168404Spjd		atomic_dec_32(&zp->z_sync_cnt);
234168404Spjd
235185029Spjd	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
236185029Spjd	    ZTOV(zp)->v_type == VREG &&
237219089Spjd	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
238185029Spjd		VERIFY(fs_vscan(vp, cr, 1) == 0);
239185029Spjd
240209962Smm	ZFS_EXIT(zfsvfs);
241168404Spjd	return (0);
242168404Spjd}
243168404Spjd
244168404Spjd/*
245168404Spjd * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
246168404Spjd * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
247168404Spjd */
248168404Spjdstatic int
249168978Spjdzfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
250168404Spjd{
251168404Spjd	znode_t	*zp = VTOZ(vp);
252168404Spjd	uint64_t noff = (uint64_t)*off; /* new offset */
253168404Spjd	uint64_t file_sz;
254168404Spjd	int error;
255168404Spjd	boolean_t hole;
256168404Spjd
257219089Spjd	file_sz = zp->z_size;
258168404Spjd	if (noff >= file_sz)  {
259249195Smm		return (SET_ERROR(ENXIO));
260168404Spjd	}
261168404Spjd
262168962Spjd	if (cmd == _FIO_SEEK_HOLE)
263168404Spjd		hole = B_TRUE;
264168404Spjd	else
265168404Spjd		hole = B_FALSE;
266168404Spjd
267168404Spjd	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
268168404Spjd
269271536Sdelphij	if (error == ESRCH)
270249195Smm		return (SET_ERROR(ENXIO));
271271536Sdelphij
272271536Sdelphij	/*
273271536Sdelphij	 * We could find a hole that begins after the logical end-of-file,
274271536Sdelphij	 * because dmu_offset_next() only works on whole blocks.  If the
275271536Sdelphij	 * EOF falls mid-block, then indicate that the "virtual hole"
276271536Sdelphij	 * at the end of the file begins at the logical EOF, rather than
277271536Sdelphij	 * at the end of the last block.
278271536Sdelphij	 */
279271536Sdelphij	if (noff > file_sz) {
280271536Sdelphij		ASSERT(hole);
281271536Sdelphij		noff = file_sz;
282168404Spjd	}
283168404Spjd
284168404Spjd	if (noff < *off)
285168404Spjd		return (error);
286168404Spjd	*off = noff;
287168404Spjd	return (error);
288168404Spjd}
289168404Spjd
290168404Spjd/* ARGSUSED */
291168404Spjdstatic int
292168978Spjdzfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
293185029Spjd    int *rvalp, caller_context_t *ct)
294168404Spjd{
295168962Spjd	offset_t off;
296168962Spjd	int error;
297168962Spjd	zfsvfs_t *zfsvfs;
298185029Spjd	znode_t *zp;
299168404Spjd
300168404Spjd	switch (com) {
301185029Spjd	case _FIOFFS:
302168962Spjd		return (0);
303168404Spjd
304168962Spjd		/*
305168962Spjd		 * The following two ioctls are used by bfu.  Faking out,
306168962Spjd		 * necessary to avoid bfu errors.
307168962Spjd		 */
308185029Spjd	case _FIOGDIO:
309185029Spjd	case _FIOSDIO:
310168962Spjd		return (0);
311168962Spjd
312185029Spjd	case _FIO_SEEK_DATA:
313185029Spjd	case _FIO_SEEK_HOLE:
314233918Savg#ifdef sun
315168962Spjd		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
316249195Smm			return (SET_ERROR(EFAULT));
317233918Savg#else
318233918Savg		off = *(offset_t *)data;
319233918Savg#endif
320185029Spjd		zp = VTOZ(vp);
321185029Spjd		zfsvfs = zp->z_zfsvfs;
322168404Spjd		ZFS_ENTER(zfsvfs);
323185029Spjd		ZFS_VERIFY_ZP(zp);
324168404Spjd
325168404Spjd		/* offset parameter is in/out */
326168404Spjd		error = zfs_holey(vp, com, &off);
327168404Spjd		ZFS_EXIT(zfsvfs);
328168404Spjd		if (error)
329168404Spjd			return (error);
330233918Savg#ifdef sun
331168962Spjd		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
332249195Smm			return (SET_ERROR(EFAULT));
333233918Savg#else
334233918Savg		*(offset_t *)data = off;
335233918Savg#endif
336168404Spjd		return (0);
337168404Spjd	}
338249195Smm	return (SET_ERROR(ENOTTY));
339168404Spjd}
340168404Spjd
341209962Smmstatic vm_page_t
342253953Sattiliopage_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
343209962Smm{
344209962Smm	vm_object_t obj;
345209962Smm	vm_page_t pp;
346258353Savg	int64_t end;
347209962Smm
348258353Savg	/*
349258353Savg	 * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
350258353Savg	 * aligned boundaries, if the range is not aligned.  As a result a
351258353Savg	 * DEV_BSIZE subrange with partially dirty data may get marked as clean.
352258353Savg	 * It may happen that all DEV_BSIZE subranges are marked clean and thus
353258353Savg	 * the whole page would be considred clean despite have some dirty data.
354258353Savg	 * For this reason we should shrink the range to DEV_BSIZE aligned
355258353Savg	 * boundaries before calling vm_page_clear_dirty.
356258353Savg	 */
357258353Savg	end = rounddown2(off + nbytes, DEV_BSIZE);
358258353Savg	off = roundup2(off, DEV_BSIZE);
359258353Savg	nbytes = end - off;
360258353Savg
361209962Smm	obj = vp->v_object;
362248084Sattilio	zfs_vmobject_assert_wlocked(obj);
363209962Smm
364209962Smm	for (;;) {
365209962Smm		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
366246293Savg		    pp->valid) {
367254138Sattilio			if (vm_page_xbusied(pp)) {
368212652Savg				/*
369212652Savg				 * Reference the page before unlocking and
370212652Savg				 * sleeping so that the page daemon is less
371212652Savg				 * likely to reclaim it.
372212652Savg				 */
373225418Skib				vm_page_reference(pp);
374254138Sattilio				vm_page_lock(pp);
375254138Sattilio				zfs_vmobject_wunlock(obj);
376254138Sattilio				vm_page_busy_sleep(pp, "zfsmwb");
377254138Sattilio				zfs_vmobject_wlock(obj);
378209962Smm				continue;
379212652Savg			}
380254138Sattilio			vm_page_sbusy(pp);
381252337Sgavin		} else if (pp == NULL) {
382246293Savg			pp = vm_page_alloc(obj, OFF_TO_IDX(start),
383246293Savg			    VM_ALLOC_SYSTEM | VM_ALLOC_IFCACHED |
384254138Sattilio			    VM_ALLOC_SBUSY);
385252337Sgavin		} else {
386252337Sgavin			ASSERT(pp != NULL && !pp->valid);
387252337Sgavin			pp = NULL;
388209962Smm		}
389246293Savg
390246293Savg		if (pp != NULL) {
391246293Savg			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
392253953Sattilio			vm_object_pip_add(obj, 1);
393246293Savg			pmap_remove_write(pp);
394258353Savg			if (nbytes != 0)
395258353Savg				vm_page_clear_dirty(pp, off, nbytes);
396246293Savg		}
397209962Smm		break;
398209962Smm	}
399209962Smm	return (pp);
400209962Smm}
401209962Smm
402209962Smmstatic void
403253953Sattiliopage_unbusy(vm_page_t pp)
404209962Smm{
405209962Smm
406254138Sattilio	vm_page_sunbusy(pp);
407253953Sattilio	vm_object_pip_subtract(pp->object, 1);
408209962Smm}
409209962Smm
410253953Sattiliostatic vm_page_t
411253953Sattiliopage_hold(vnode_t *vp, int64_t start)
412253953Sattilio{
413253953Sattilio	vm_object_t obj;
414253953Sattilio	vm_page_t pp;
415253953Sattilio
416253953Sattilio	obj = vp->v_object;
417253953Sattilio	zfs_vmobject_assert_wlocked(obj);
418253953Sattilio
419253953Sattilio	for (;;) {
420253953Sattilio		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
421253953Sattilio		    pp->valid) {
422254138Sattilio			if (vm_page_xbusied(pp)) {
423253953Sattilio				/*
424253953Sattilio				 * Reference the page before unlocking and
425253953Sattilio				 * sleeping so that the page daemon is less
426253953Sattilio				 * likely to reclaim it.
427253953Sattilio				 */
428253953Sattilio				vm_page_reference(pp);
429254138Sattilio				vm_page_lock(pp);
430254138Sattilio				zfs_vmobject_wunlock(obj);
431254138Sattilio				vm_page_busy_sleep(pp, "zfsmwb");
432254138Sattilio				zfs_vmobject_wlock(obj);
433253953Sattilio				continue;
434253953Sattilio			}
435253953Sattilio
436253953Sattilio			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
437253953Sattilio			vm_page_lock(pp);
438253953Sattilio			vm_page_hold(pp);
439253953Sattilio			vm_page_unlock(pp);
440253953Sattilio
441253953Sattilio		} else
442253953Sattilio			pp = NULL;
443253953Sattilio		break;
444253953Sattilio	}
445253953Sattilio	return (pp);
446253953Sattilio}
447253953Sattilio
448253953Sattiliostatic void
449253953Sattiliopage_unhold(vm_page_t pp)
450253953Sattilio{
451253953Sattilio
452253953Sattilio	vm_page_lock(pp);
453253953Sattilio	vm_page_unhold(pp);
454253953Sattilio	vm_page_unlock(pp);
455253953Sattilio}
456253953Sattilio
457168404Spjd/*
458168404Spjd * When a file is memory mapped, we must keep the IO data synchronized
459168404Spjd * between the DMU cache and the memory mapped pages.  What this means:
460168404Spjd *
461168404Spjd * On Write:	If we find a memory mapped page, we write to *both*
462168404Spjd *		the page and the dmu buffer.
463168404Spjd */
464209962Smmstatic void
465209962Smmupdate_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
466209962Smm    int segflg, dmu_tx_t *tx)
467168404Spjd{
468168404Spjd	vm_object_t obj;
469168404Spjd	struct sf_buf *sf;
470246293Savg	caddr_t va;
471212655Savg	int off;
472168404Spjd
473258746Savg	ASSERT(segflg != UIO_NOCOPY);
474168404Spjd	ASSERT(vp->v_mount != NULL);
475168404Spjd	obj = vp->v_object;
476168404Spjd	ASSERT(obj != NULL);
477168404Spjd
478168404Spjd	off = start & PAGEOFFSET;
479248084Sattilio	zfs_vmobject_wlock(obj);
480168404Spjd	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
481209962Smm		vm_page_t pp;
482246293Savg		int nbytes = imin(PAGESIZE - off, len);
483168404Spjd
484258746Savg		if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
485248084Sattilio			zfs_vmobject_wunlock(obj);
486168404Spjd
487246293Savg			va = zfs_map_page(pp, &sf);
488246293Savg			(void) dmu_read(os, oid, start+off, nbytes,
489246293Savg			    va+off, DMU_READ_PREFETCH);;
490209962Smm			zfs_unmap_page(sf);
491246293Savg
492248084Sattilio			zfs_vmobject_wlock(obj);
493253953Sattilio			page_unbusy(pp);
494168404Spjd		}
495209962Smm		len -= nbytes;
496168404Spjd		off = 0;
497168404Spjd	}
498258746Savg	vm_object_pip_wakeupn(obj, 0);
499248084Sattilio	zfs_vmobject_wunlock(obj);
500168404Spjd}
501168404Spjd
502168404Spjd/*
503219089Spjd * Read with UIO_NOCOPY flag means that sendfile(2) requests
504219089Spjd * ZFS to populate a range of page cache pages with data.
505219089Spjd *
506219089Spjd * NOTE: this function could be optimized to pre-allocate
507254138Sattilio * all pages in advance, drain exclusive busy on all of them,
508219089Spjd * map them into contiguous KVA region and populate them
509219089Spjd * in one single dmu_read() call.
510219089Spjd */
511219089Spjdstatic int
512219089Spjdmappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
513219089Spjd{
514219089Spjd	znode_t *zp = VTOZ(vp);
515219089Spjd	objset_t *os = zp->z_zfsvfs->z_os;
516219089Spjd	struct sf_buf *sf;
517219089Spjd	vm_object_t obj;
518219089Spjd	vm_page_t pp;
519219089Spjd	int64_t start;
520219089Spjd	caddr_t va;
521219089Spjd	int len = nbytes;
522219089Spjd	int off;
523219089Spjd	int error = 0;
524219089Spjd
525219089Spjd	ASSERT(uio->uio_segflg == UIO_NOCOPY);
526219089Spjd	ASSERT(vp->v_mount != NULL);
527219089Spjd	obj = vp->v_object;
528219089Spjd	ASSERT(obj != NULL);
529219089Spjd	ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);
530219089Spjd
531248084Sattilio	zfs_vmobject_wlock(obj);
532219089Spjd	for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
533219089Spjd		int bytes = MIN(PAGESIZE, len);
534219089Spjd
535254138Sattilio		pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY |
536254649Skib		    VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
537219089Spjd		if (pp->valid == 0) {
538248084Sattilio			zfs_vmobject_wunlock(obj);
539219089Spjd			va = zfs_map_page(pp, &sf);
540219089Spjd			error = dmu_read(os, zp->z_id, start, bytes, va,
541219089Spjd			    DMU_READ_PREFETCH);
542219089Spjd			if (bytes != PAGESIZE && error == 0)
543219089Spjd				bzero(va + bytes, PAGESIZE - bytes);
544219089Spjd			zfs_unmap_page(sf);
545248084Sattilio			zfs_vmobject_wlock(obj);
546254138Sattilio			vm_page_sunbusy(pp);
547219089Spjd			vm_page_lock(pp);
548219089Spjd			if (error) {
549253073Savg				if (pp->wire_count == 0 && pp->valid == 0 &&
550254138Sattilio				    !vm_page_busied(pp))
551253073Savg					vm_page_free(pp);
552219089Spjd			} else {
553219089Spjd				pp->valid = VM_PAGE_BITS_ALL;
554219089Spjd				vm_page_activate(pp);
555219089Spjd			}
556219089Spjd			vm_page_unlock(pp);
557258739Savg		} else {
558258739Savg			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
559254138Sattilio			vm_page_sunbusy(pp);
560258739Savg		}
561219089Spjd		if (error)
562219089Spjd			break;
563219089Spjd		uio->uio_resid -= bytes;
564219089Spjd		uio->uio_offset += bytes;
565219089Spjd		len -= bytes;
566219089Spjd	}
567248084Sattilio	zfs_vmobject_wunlock(obj);
568219089Spjd	return (error);
569219089Spjd}
570219089Spjd
571219089Spjd/*
572168404Spjd * When a file is memory mapped, we must keep the IO data synchronized
573168404Spjd * between the DMU cache and the memory mapped pages.  What this means:
574168404Spjd *
575168404Spjd * On Read:	We "read" preferentially from memory mapped pages,
576168404Spjd *		else we default from the dmu buffer.
577168404Spjd *
578168404Spjd * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
579251631Sdelphij *	 the file is memory mapped.
580168404Spjd */
581168404Spjdstatic int
582168404Spjdmappedread(vnode_t *vp, int nbytes, uio_t *uio)
583168404Spjd{
584168404Spjd	znode_t *zp = VTOZ(vp);
585168404Spjd	vm_object_t obj;
586212655Savg	int64_t start;
587168926Spjd	caddr_t va;
588168404Spjd	int len = nbytes;
589212655Savg	int off;
590168404Spjd	int error = 0;
591168404Spjd
592168404Spjd	ASSERT(vp->v_mount != NULL);
593168404Spjd	obj = vp->v_object;
594168404Spjd	ASSERT(obj != NULL);
595168404Spjd
596168404Spjd	start = uio->uio_loffset;
597168404Spjd	off = start & PAGEOFFSET;
598248084Sattilio	zfs_vmobject_wlock(obj);
599168404Spjd	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
600219089Spjd		vm_page_t pp;
601219089Spjd		uint64_t bytes = MIN(PAGESIZE - off, len);
602168404Spjd
603253953Sattilio		if (pp = page_hold(vp, start)) {
604219089Spjd			struct sf_buf *sf;
605219089Spjd			caddr_t va;
606212652Savg
607248084Sattilio			zfs_vmobject_wunlock(obj);
608219089Spjd			va = zfs_map_page(pp, &sf);
609219089Spjd			error = uiomove(va + off, bytes, UIO_READ, uio);
610219089Spjd			zfs_unmap_page(sf);
611248084Sattilio			zfs_vmobject_wlock(obj);
612253953Sattilio			page_unhold(pp);
613219089Spjd		} else {
614248084Sattilio			zfs_vmobject_wunlock(obj);
615272809Sdelphij			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
616272809Sdelphij			    uio, bytes);
617248084Sattilio			zfs_vmobject_wlock(obj);
618168404Spjd		}
619168404Spjd		len -= bytes;
620168404Spjd		off = 0;
621168404Spjd		if (error)
622168404Spjd			break;
623168404Spjd	}
624248084Sattilio	zfs_vmobject_wunlock(obj);
625168404Spjd	return (error);
626168404Spjd}
627168404Spjd
628168404Spjdoffset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
629168404Spjd
630168404Spjd/*
631168404Spjd * Read bytes from specified file into supplied buffer.
632168404Spjd *
633168404Spjd *	IN:	vp	- vnode of file to be read from.
634168404Spjd *		uio	- structure supplying read location, range info,
635168404Spjd *			  and return buffer.
636168404Spjd *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
637168404Spjd *		cr	- credentials of caller.
638185029Spjd *		ct	- caller context
639168404Spjd *
640168404Spjd *	OUT:	uio	- updated offset and range, buffer filled.
641168404Spjd *
642251631Sdelphij *	RETURN:	0 on success, error code on failure.
643168404Spjd *
644168404Spjd * Side Effects:
645168404Spjd *	vp - atime updated if byte count > 0
646168404Spjd */
647168404Spjd/* ARGSUSED */
648168404Spjdstatic int
649168962Spjdzfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
650168404Spjd{
651168404Spjd	znode_t		*zp = VTOZ(vp);
652168404Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
653168404Spjd	ssize_t		n, nbytes;
654247187Smm	int		error = 0;
655168404Spjd	rl_t		*rl;
656219089Spjd	xuio_t		*xuio = NULL;
657168404Spjd
658168404Spjd	ZFS_ENTER(zfsvfs);
659185029Spjd	ZFS_VERIFY_ZP(zp);
660168404Spjd
661219089Spjd	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
662185029Spjd		ZFS_EXIT(zfsvfs);
663249195Smm		return (SET_ERROR(EACCES));
664185029Spjd	}
665185029Spjd
666168404Spjd	/*
667168404Spjd	 * Validate file offset
668168404Spjd	 */
669168404Spjd	if (uio->uio_loffset < (offset_t)0) {
670168404Spjd		ZFS_EXIT(zfsvfs);
671249195Smm		return (SET_ERROR(EINVAL));
672168404Spjd	}
673168404Spjd
674168404Spjd	/*
675168404Spjd	 * Fasttrack empty reads
676168404Spjd	 */
677168404Spjd	if (uio->uio_resid == 0) {
678168404Spjd		ZFS_EXIT(zfsvfs);
679168404Spjd		return (0);
680168404Spjd	}
681168404Spjd
682168404Spjd	/*
683168962Spjd	 * Check for mandatory locks
684168962Spjd	 */
685219089Spjd	if (MANDMODE(zp->z_mode)) {
686168962Spjd		if (error = chklock(vp, FREAD,
687168962Spjd		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
688168962Spjd			ZFS_EXIT(zfsvfs);
689168962Spjd			return (error);
690168962Spjd		}
691168962Spjd	}
692168962Spjd
693168962Spjd	/*
694168404Spjd	 * If we're in FRSYNC mode, sync out this znode before reading it.
695168404Spjd	 */
696224605Smm	if (zfsvfs->z_log &&
697224605Smm	    (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
698219089Spjd		zil_commit(zfsvfs->z_log, zp->z_id);
699168404Spjd
700168404Spjd	/*
701168404Spjd	 * Lock the range against changes.
702168404Spjd	 */
703168404Spjd	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
704168404Spjd
705168404Spjd	/*
706168404Spjd	 * If we are reading past end-of-file we can skip
707168404Spjd	 * to the end; but we might still need to set atime.
708168404Spjd	 */
709219089Spjd	if (uio->uio_loffset >= zp->z_size) {
710168404Spjd		error = 0;
711168404Spjd		goto out;
712168404Spjd	}
713168404Spjd
714219089Spjd	ASSERT(uio->uio_loffset < zp->z_size);
715219089Spjd	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
716168404Spjd
717219089Spjd#ifdef sun
718219089Spjd	if ((uio->uio_extflg == UIO_XUIO) &&
719219089Spjd	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
720219089Spjd		int nblk;
721219089Spjd		int blksz = zp->z_blksz;
722219089Spjd		uint64_t offset = uio->uio_loffset;
723219089Spjd
724219089Spjd		xuio = (xuio_t *)uio;
725219089Spjd		if ((ISP2(blksz))) {
726219089Spjd			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
727219089Spjd			    blksz)) / blksz;
728219089Spjd		} else {
729219089Spjd			ASSERT(offset + n <= blksz);
730219089Spjd			nblk = 1;
731219089Spjd		}
732219089Spjd		(void) dmu_xuio_init(xuio, nblk);
733219089Spjd
734219089Spjd		if (vn_has_cached_data(vp)) {
735219089Spjd			/*
736219089Spjd			 * For simplicity, we always allocate a full buffer
737219089Spjd			 * even if we only expect to read a portion of a block.
738219089Spjd			 */
739219089Spjd			while (--nblk >= 0) {
740219089Spjd				(void) dmu_xuio_add(xuio,
741219089Spjd				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
742219089Spjd				    blksz), 0, blksz);
743219089Spjd			}
744219089Spjd		}
745219089Spjd	}
746219089Spjd#endif	/* sun */
747219089Spjd
748168404Spjd	while (n > 0) {
749168404Spjd		nbytes = MIN(n, zfs_read_chunk_size -
750168404Spjd		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
751168404Spjd
752219089Spjd#ifdef __FreeBSD__
753219089Spjd		if (uio->uio_segflg == UIO_NOCOPY)
754219089Spjd			error = mappedread_sf(vp, nbytes, uio);
755219089Spjd		else
756219089Spjd#endif /* __FreeBSD__ */
757272809Sdelphij		if (vn_has_cached_data(vp)) {
758168404Spjd			error = mappedread(vp, nbytes, uio);
759272809Sdelphij		} else {
760272809Sdelphij			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
761272809Sdelphij			    uio, nbytes);
762272809Sdelphij		}
763185029Spjd		if (error) {
764185029Spjd			/* convert checksum errors into IO errors */
765185029Spjd			if (error == ECKSUM)
766249195Smm				error = SET_ERROR(EIO);
767168404Spjd			break;
768185029Spjd		}
769168962Spjd
770168404Spjd		n -= nbytes;
771168404Spjd	}
772168404Spjdout:
773168404Spjd	zfs_range_unlock(rl);
774168404Spjd
775168404Spjd	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
776168404Spjd	ZFS_EXIT(zfsvfs);
777168404Spjd	return (error);
778168404Spjd}
779168404Spjd
780168404Spjd/*
781168404Spjd * Write the bytes to a file.
782168404Spjd *
783168404Spjd *	IN:	vp	- vnode of file to be written to.
784168404Spjd *		uio	- structure supplying write location, range info,
785168404Spjd *			  and data buffer.
786251631Sdelphij *		ioflag	- FAPPEND, FSYNC, and/or FDSYNC.  FAPPEND is
787251631Sdelphij *			  set if in append mode.
788168404Spjd *		cr	- credentials of caller.
789185029Spjd *		ct	- caller context (NFS/CIFS fem monitor only)
790168404Spjd *
791168404Spjd *	OUT:	uio	- updated offset and range.
792168404Spjd *
793251631Sdelphij *	RETURN:	0 on success, error code on failure.
794168404Spjd *
795168404Spjd * Timestamps:
796168404Spjd *	vp - ctime|mtime updated if byte count > 0
797168404Spjd */
798219089Spjd
799168404Spjd/* ARGSUSED */
800168404Spjdstatic int
801168962Spjdzfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
802168404Spjd{
803168404Spjd	znode_t		*zp = VTOZ(vp);
804168962Spjd	rlim64_t	limit = MAXOFFSET_T;
805168404Spjd	ssize_t		start_resid = uio->uio_resid;
806168404Spjd	ssize_t		tx_bytes;
807168404Spjd	uint64_t	end_size;
808168404Spjd	dmu_tx_t	*tx;
809168404Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
810185029Spjd	zilog_t		*zilog;
811168404Spjd	offset_t	woff;
812168404Spjd	ssize_t		n, nbytes;
813168404Spjd	rl_t		*rl;
814168404Spjd	int		max_blksz = zfsvfs->z_max_blksz;
815247187Smm	int		error = 0;
816209962Smm	arc_buf_t	*abuf;
817247187Smm	iovec_t		*aiov = NULL;
818219089Spjd	xuio_t		*xuio = NULL;
819219089Spjd	int		i_iov = 0;
820219089Spjd	int		iovcnt = uio->uio_iovcnt;
821219089Spjd	iovec_t		*iovp = uio->uio_iov;
822219089Spjd	int		write_eof;
823219089Spjd	int		count = 0;
824219089Spjd	sa_bulk_attr_t	bulk[4];
825219089Spjd	uint64_t	mtime[2], ctime[2];
826168404Spjd
827168404Spjd	/*
828168404Spjd	 * Fasttrack empty write
829168404Spjd	 */
830168404Spjd	n = start_resid;
831168404Spjd	if (n == 0)
832168404Spjd		return (0);
833168404Spjd
834168962Spjd	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
835168962Spjd		limit = MAXOFFSET_T;
836168962Spjd
837168404Spjd	ZFS_ENTER(zfsvfs);
838185029Spjd	ZFS_VERIFY_ZP(zp);
839168404Spjd
840219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
841219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
842219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
843219089Spjd	    &zp->z_size, 8);
844219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
845219089Spjd	    &zp->z_pflags, 8);
846219089Spjd
847168404Spjd	/*
848262990Sdelphij	 * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our
849262990Sdelphij	 * callers might not be able to detect properly that we are read-only,
850262990Sdelphij	 * so check it explicitly here.
851262990Sdelphij	 */
852262990Sdelphij	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
853262990Sdelphij		ZFS_EXIT(zfsvfs);
854262990Sdelphij		return (SET_ERROR(EROFS));
855262990Sdelphij	}
856262990Sdelphij
857262990Sdelphij	/*
858185029Spjd	 * If immutable or not appending then return EPERM
859185029Spjd	 */
860219089Spjd	if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
861219089Spjd	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
862219089Spjd	    (uio->uio_loffset < zp->z_size))) {
863185029Spjd		ZFS_EXIT(zfsvfs);
864249195Smm		return (SET_ERROR(EPERM));
865185029Spjd	}
866185029Spjd
867185029Spjd	zilog = zfsvfs->z_log;
868185029Spjd
869185029Spjd	/*
870219089Spjd	 * Validate file offset
871219089Spjd	 */
872219089Spjd	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
873219089Spjd	if (woff < 0) {
874219089Spjd		ZFS_EXIT(zfsvfs);
875249195Smm		return (SET_ERROR(EINVAL));
876219089Spjd	}
877219089Spjd
878219089Spjd	/*
879219089Spjd	 * Check for mandatory locks before calling zfs_range_lock()
880219089Spjd	 * in order to prevent a deadlock with locks set via fcntl().
881219089Spjd	 */
882219089Spjd	if (MANDMODE((mode_t)zp->z_mode) &&
883219089Spjd	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
884219089Spjd		ZFS_EXIT(zfsvfs);
885219089Spjd		return (error);
886219089Spjd	}
887219089Spjd
888219089Spjd#ifdef sun
889219089Spjd	/*
890168404Spjd	 * Pre-fault the pages to ensure slow (eg NFS) pages
891168404Spjd	 * don't hold up txg.
892219089Spjd	 * Skip this if uio contains loaned arc_buf.
893168404Spjd	 */
894219089Spjd	if ((uio->uio_extflg == UIO_XUIO) &&
895219089Spjd	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
896219089Spjd		xuio = (xuio_t *)uio;
897219089Spjd	else
898219089Spjd		uio_prefaultpages(MIN(n, max_blksz), uio);
899219089Spjd#endif	/* sun */
900168404Spjd
901168404Spjd	/*
902168404Spjd	 * If in append mode, set the io offset pointer to eof.
903168404Spjd	 */
904213673Spjd	if (ioflag & FAPPEND) {
905168404Spjd		/*
906219089Spjd		 * Obtain an appending range lock to guarantee file append
907219089Spjd		 * semantics.  We reset the write offset once we have the lock.
908168404Spjd		 */
909168404Spjd		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
910219089Spjd		woff = rl->r_off;
911168404Spjd		if (rl->r_len == UINT64_MAX) {
912219089Spjd			/*
913219089Spjd			 * We overlocked the file because this write will cause
914219089Spjd			 * the file block size to increase.
915219089Spjd			 * Note that zp_size cannot change with this lock held.
916219089Spjd			 */
917219089Spjd			woff = zp->z_size;
918168404Spjd		}
919219089Spjd		uio->uio_loffset = woff;
920168404Spjd	} else {
921168404Spjd		/*
922219089Spjd		 * Note that if the file block size will change as a result of
923219089Spjd		 * this write, then this range lock will lock the entire file
924219089Spjd		 * so that we can re-write the block safely.
925168404Spjd		 */
926168404Spjd		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
927168404Spjd	}
928168404Spjd
929235781Strasz	if (vn_rlimit_fsize(vp, uio, uio->uio_td)) {
930235781Strasz		zfs_range_unlock(rl);
931235781Strasz		ZFS_EXIT(zfsvfs);
932235781Strasz		return (EFBIG);
933235781Strasz	}
934235781Strasz
935168962Spjd	if (woff >= limit) {
936168962Spjd		zfs_range_unlock(rl);
937168962Spjd		ZFS_EXIT(zfsvfs);
938249195Smm		return (SET_ERROR(EFBIG));
939168962Spjd	}
940168962Spjd
941168962Spjd	if ((woff + n) > limit || woff > (limit - n))
942168962Spjd		n = limit - woff;
943168962Spjd
944219089Spjd	/* Will this write extend the file length? */
945219089Spjd	write_eof = (woff + n > zp->z_size);
946168404Spjd
947219089Spjd	end_size = MAX(zp->z_size, woff + n);
948219089Spjd
949168404Spjd	/*
950168404Spjd	 * Write the file in reasonable size chunks.  Each chunk is written
951168404Spjd	 * in a separate transaction; this keeps the intent log records small
952168404Spjd	 * and allows us to do more fine-grained space accounting.
953168404Spjd	 */
954168404Spjd	while (n > 0) {
955209962Smm		abuf = NULL;
956209962Smm		woff = uio->uio_loffset;
957219089Spjd		if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
958219089Spjd		    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
959209962Smm			if (abuf != NULL)
960209962Smm				dmu_return_arcbuf(abuf);
961249195Smm			error = SET_ERROR(EDQUOT);
962209962Smm			break;
963209962Smm		}
964209962Smm
965219089Spjd		if (xuio && abuf == NULL) {
966219089Spjd			ASSERT(i_iov < iovcnt);
967219089Spjd			aiov = &iovp[i_iov];
968219089Spjd			abuf = dmu_xuio_arcbuf(xuio, i_iov);
969219089Spjd			dmu_xuio_clear(xuio, i_iov);
970219089Spjd			DTRACE_PROBE3(zfs_cp_write, int, i_iov,
971219089Spjd			    iovec_t *, aiov, arc_buf_t *, abuf);
972219089Spjd			ASSERT((aiov->iov_base == abuf->b_data) ||
973219089Spjd			    ((char *)aiov->iov_base - (char *)abuf->b_data +
974219089Spjd			    aiov->iov_len == arc_buf_size(abuf)));
975219089Spjd			i_iov++;
976219089Spjd		} else if (abuf == NULL && n >= max_blksz &&
977219089Spjd		    woff >= zp->z_size &&
978209962Smm		    P2PHASE(woff, max_blksz) == 0 &&
979209962Smm		    zp->z_blksz == max_blksz) {
980219089Spjd			/*
981219089Spjd			 * This write covers a full block.  "Borrow" a buffer
982219089Spjd			 * from the dmu so that we can fill it before we enter
983219089Spjd			 * a transaction.  This avoids the possibility of
984219089Spjd			 * holding up the transaction if the data copy hangs
985219089Spjd			 * up on a pagefault (e.g., from an NFS server mapping).
986219089Spjd			 */
987209962Smm			size_t cbytes;
988209962Smm
989219089Spjd			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
990219089Spjd			    max_blksz);
991209962Smm			ASSERT(abuf != NULL);
992209962Smm			ASSERT(arc_buf_size(abuf) == max_blksz);
993209962Smm			if (error = uiocopy(abuf->b_data, max_blksz,
994209962Smm			    UIO_WRITE, uio, &cbytes)) {
995209962Smm				dmu_return_arcbuf(abuf);
996209962Smm				break;
997209962Smm			}
998209962Smm			ASSERT(cbytes == max_blksz);
999209962Smm		}
1000209962Smm
1001209962Smm		/*
1002168404Spjd		 * Start a transaction.
1003168404Spjd		 */
1004168404Spjd		tx = dmu_tx_create(zfsvfs->z_os);
1005219089Spjd		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1006168404Spjd		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
1007219089Spjd		zfs_sa_upgrade_txholds(tx, zp);
1008258720Savg		error = dmu_tx_assign(tx, TXG_WAIT);
1009168404Spjd		if (error) {
1010168404Spjd			dmu_tx_abort(tx);
1011209962Smm			if (abuf != NULL)
1012209962Smm				dmu_return_arcbuf(abuf);
1013168404Spjd			break;
1014168404Spjd		}
1015168404Spjd
1016168404Spjd		/*
1017168404Spjd		 * If zfs_range_lock() over-locked we grow the blocksize
1018168404Spjd		 * and then reduce the lock range.  This will only happen
1019168404Spjd		 * on the first iteration since zfs_range_reduce() will
1020168404Spjd		 * shrink down r_len to the appropriate size.
1021168404Spjd		 */
1022168404Spjd		if (rl->r_len == UINT64_MAX) {
1023168404Spjd			uint64_t new_blksz;
1024168404Spjd
1025168404Spjd			if (zp->z_blksz > max_blksz) {
1026274337Sdelphij				/*
1027274337Sdelphij				 * File's blocksize is already larger than the
1028274337Sdelphij				 * "recordsize" property.  Only let it grow to
1029274337Sdelphij				 * the next power of 2.
1030274337Sdelphij				 */
1031168404Spjd				ASSERT(!ISP2(zp->z_blksz));
1032274337Sdelphij				new_blksz = MIN(end_size,
1033274337Sdelphij				    1 << highbit64(zp->z_blksz));
1034168404Spjd			} else {
1035168404Spjd				new_blksz = MIN(end_size, max_blksz);
1036168404Spjd			}
1037168404Spjd			zfs_grow_blocksize(zp, new_blksz, tx);
1038168404Spjd			zfs_range_reduce(rl, woff, n);
1039168404Spjd		}
1040168404Spjd
1041168404Spjd		/*
1042168404Spjd		 * XXX - should we really limit each write to z_max_blksz?
1043168404Spjd		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
1044168404Spjd		 */
1045168404Spjd		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
1046168404Spjd
1047219089Spjd		if (woff + nbytes > zp->z_size)
1048168404Spjd			vnode_pager_setsize(vp, woff + nbytes);
1049168404Spjd
1050209962Smm		if (abuf == NULL) {
1051209962Smm			tx_bytes = uio->uio_resid;
1052219089Spjd			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
1053219089Spjd			    uio, nbytes, tx);
1054209962Smm			tx_bytes -= uio->uio_resid;
1055168404Spjd		} else {
1056209962Smm			tx_bytes = nbytes;
1057219089Spjd			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
1058219089Spjd			/*
1059219089Spjd			 * If this is not a full block write, but we are
1060219089Spjd			 * extending the file past EOF and this data starts
1061219089Spjd			 * block-aligned, use assign_arcbuf().  Otherwise,
1062219089Spjd			 * write via dmu_write().
1063219089Spjd			 */
1064219089Spjd			if (tx_bytes < max_blksz && (!write_eof ||
1065219089Spjd			    aiov->iov_base != abuf->b_data)) {
1066219089Spjd				ASSERT(xuio);
1067219089Spjd				dmu_write(zfsvfs->z_os, zp->z_id, woff,
1068219089Spjd				    aiov->iov_len, aiov->iov_base, tx);
1069219089Spjd				dmu_return_arcbuf(abuf);
1070219089Spjd				xuio_stat_wbuf_copied();
1071219089Spjd			} else {
1072219089Spjd				ASSERT(xuio || tx_bytes == max_blksz);
1073219089Spjd				dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
1074219089Spjd				    woff, abuf, tx);
1075219089Spjd			}
1076209962Smm			ASSERT(tx_bytes <= uio->uio_resid);
1077209962Smm			uioskip(uio, tx_bytes);
1078168404Spjd		}
1079212657Savg		if (tx_bytes && vn_has_cached_data(vp)) {
1080209962Smm			update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
1081209962Smm			    zp->z_id, uio->uio_segflg, tx);
1082209962Smm		}
1083209962Smm
1084209962Smm		/*
1085168404Spjd		 * If we made no progress, we're done.  If we made even
1086168404Spjd		 * partial progress, update the znode and ZIL accordingly.
1087168404Spjd		 */
1088168404Spjd		if (tx_bytes == 0) {
1089219089Spjd			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
1090219089Spjd			    (void *)&zp->z_size, sizeof (uint64_t), tx);
1091168404Spjd			dmu_tx_commit(tx);
1092168404Spjd			ASSERT(error != 0);
1093168404Spjd			break;
1094168404Spjd		}
1095168404Spjd
1096168404Spjd		/*
1097168404Spjd		 * Clear Set-UID/Set-GID bits on successful write if not
1098168404Spjd		 * privileged and at least one of the excute bits is set.
1099168404Spjd		 *
1100168404Spjd		 * It would be nice to to this after all writes have
1101168404Spjd		 * been done, but that would still expose the ISUID/ISGID
1102168404Spjd		 * to another app after the partial write is committed.
1103185029Spjd		 *
1104185029Spjd		 * Note: we don't call zfs_fuid_map_id() here because
1105185029Spjd		 * user 0 is not an ephemeral uid.
1106168404Spjd		 */
1107168404Spjd		mutex_enter(&zp->z_acl_lock);
1108219089Spjd		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
1109168404Spjd		    (S_IXUSR >> 6))) != 0 &&
1110219089Spjd		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
1111185029Spjd		    secpolicy_vnode_setid_retain(vp, cr,
1112219089Spjd		    (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
1113219089Spjd			uint64_t newmode;
1114219089Spjd			zp->z_mode &= ~(S_ISUID | S_ISGID);
1115219089Spjd			newmode = zp->z_mode;
1116219089Spjd			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
1117219089Spjd			    (void *)&newmode, sizeof (uint64_t), tx);
1118168404Spjd		}
1119168404Spjd		mutex_exit(&zp->z_acl_lock);
1120168404Spjd
1121219089Spjd		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
1122219089Spjd		    B_TRUE);
1123168404Spjd
1124168404Spjd		/*
1125168404Spjd		 * Update the file size (zp_size) if it has changed;
1126168404Spjd		 * account for possible concurrent updates.
1127168404Spjd		 */
1128219089Spjd		while ((end_size = zp->z_size) < uio->uio_loffset) {
1129219089Spjd			(void) atomic_cas_64(&zp->z_size, end_size,
1130168404Spjd			    uio->uio_loffset);
1131219089Spjd			ASSERT(error == 0);
1132219089Spjd		}
1133219089Spjd		/*
1134219089Spjd		 * If we are replaying and eof is non zero then force
1135219089Spjd		 * the file size to the specified eof. Note, there's no
1136219089Spjd		 * concurrency during replay.
1137219089Spjd		 */
1138219089Spjd		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
1139219089Spjd			zp->z_size = zfsvfs->z_replay_eof;
1140219089Spjd
1141219089Spjd		error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1142219089Spjd
1143168404Spjd		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
1144168404Spjd		dmu_tx_commit(tx);
1145168404Spjd
1146168404Spjd		if (error != 0)
1147168404Spjd			break;
1148168404Spjd		ASSERT(tx_bytes == nbytes);
1149168404Spjd		n -= nbytes;
1150219089Spjd
1151219089Spjd#ifdef sun
1152219089Spjd		if (!xuio && n > 0)
1153219089Spjd			uio_prefaultpages(MIN(n, max_blksz), uio);
1154219089Spjd#endif	/* sun */
1155168404Spjd	}
1156168404Spjd
1157168404Spjd	zfs_range_unlock(rl);
1158168404Spjd
1159168404Spjd	/*
1160168404Spjd	 * If we're in replay mode, or we made no progress, return error.
1161168404Spjd	 * Otherwise, it's at least a partial write, so it's successful.
1162168404Spjd	 */
1163209962Smm	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
1164168404Spjd		ZFS_EXIT(zfsvfs);
1165168404Spjd		return (error);
1166168404Spjd	}
1167168404Spjd
1168219089Spjd	if (ioflag & (FSYNC | FDSYNC) ||
1169219089Spjd	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1170219089Spjd		zil_commit(zilog, zp->z_id);
1171168404Spjd
1172168404Spjd	ZFS_EXIT(zfsvfs);
1173168404Spjd	return (0);
1174168404Spjd}
1175168404Spjd
1176168404Spjdvoid
1177219089Spjdzfs_get_done(zgd_t *zgd, int error)
1178168404Spjd{
1179219089Spjd	znode_t *zp = zgd->zgd_private;
1180219089Spjd	objset_t *os = zp->z_zfsvfs->z_os;
1181168404Spjd
1182219089Spjd	if (zgd->zgd_db)
1183219089Spjd		dmu_buf_rele(zgd->zgd_db, zgd);
1184219089Spjd
1185219089Spjd	zfs_range_unlock(zgd->zgd_rl);
1186219089Spjd
1187191900Skmacy	/*
1188191900Skmacy	 * Release the vnode asynchronously as we currently have the
1189191900Skmacy	 * txg stopped from syncing.
1190191900Skmacy	 */
1191219089Spjd	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1192219089Spjd
1193219089Spjd	if (error == 0 && zgd->zgd_bp)
1194219089Spjd		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
1195219089Spjd
1196168404Spjd	kmem_free(zgd, sizeof (zgd_t));
1197168404Spjd}
1198168404Spjd
1199214378Smm#ifdef DEBUG
1200214378Smmstatic int zil_fault_io = 0;
1201214378Smm#endif
1202214378Smm
1203168404Spjd/*
1204168404Spjd * Get data to generate a TX_WRITE intent log record.
1205168404Spjd */
1206168404Spjdint
1207168404Spjdzfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
1208168404Spjd{
1209168404Spjd	zfsvfs_t *zfsvfs = arg;
1210168404Spjd	objset_t *os = zfsvfs->z_os;
1211168404Spjd	znode_t *zp;
1212219089Spjd	uint64_t object = lr->lr_foid;
1213219089Spjd	uint64_t offset = lr->lr_offset;
1214219089Spjd	uint64_t size = lr->lr_length;
1215219089Spjd	blkptr_t *bp = &lr->lr_blkptr;
1216168404Spjd	dmu_buf_t *db;
1217168404Spjd	zgd_t *zgd;
1218168404Spjd	int error = 0;
1219168404Spjd
1220219089Spjd	ASSERT(zio != NULL);
1221219089Spjd	ASSERT(size != 0);
1222168404Spjd
1223168404Spjd	/*
1224168404Spjd	 * Nothing to do if the file has been removed
1225168404Spjd	 */
1226219089Spjd	if (zfs_zget(zfsvfs, object, &zp) != 0)
1227249195Smm		return (SET_ERROR(ENOENT));
1228168404Spjd	if (zp->z_unlinked) {
1229191900Skmacy		/*
1230191900Skmacy		 * Release the vnode asynchronously as we currently have the
1231191900Skmacy		 * txg stopped from syncing.
1232191900Skmacy		 */
1233196307Spjd		VN_RELE_ASYNC(ZTOV(zp),
1234196307Spjd		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1235249195Smm		return (SET_ERROR(ENOENT));
1236168404Spjd	}
1237168404Spjd
1238219089Spjd	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
1239219089Spjd	zgd->zgd_zilog = zfsvfs->z_log;
1240219089Spjd	zgd->zgd_private = zp;
1241219089Spjd
1242168404Spjd	/*
1243168404Spjd	 * Write records come in two flavors: immediate and indirect.
1244168404Spjd	 * For small writes it's cheaper to store the data with the
1245168404Spjd	 * log record (immediate); for large writes it's cheaper to
1246168404Spjd	 * sync the data and get a pointer to it (indirect) so that
1247168404Spjd	 * we don't have to write the data twice.
1248168404Spjd	 */
1249168404Spjd	if (buf != NULL) { /* immediate write */
1250219089Spjd		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
1251168404Spjd		/* test for truncation needs to be done while range locked */
1252219089Spjd		if (offset >= zp->z_size) {
1253249195Smm			error = SET_ERROR(ENOENT);
1254219089Spjd		} else {
1255219089Spjd			error = dmu_read(os, object, offset, size, buf,
1256219089Spjd			    DMU_READ_NO_PREFETCH);
1257168404Spjd		}
1258219089Spjd		ASSERT(error == 0 || error == ENOENT);
1259168404Spjd	} else { /* indirect write */
1260168404Spjd		/*
1261168404Spjd		 * Have to lock the whole block to ensure when it's
1262168404Spjd		 * written out and it's checksum is being calculated
1263168404Spjd		 * that no one can change the data. We need to re-check
1264168404Spjd		 * blocksize after we get the lock in case it's changed!
1265168404Spjd		 */
1266168404Spjd		for (;;) {
1267219089Spjd			uint64_t blkoff;
1268219089Spjd			size = zp->z_blksz;
1269219089Spjd			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
1270219089Spjd			offset -= blkoff;
1271219089Spjd			zgd->zgd_rl = zfs_range_lock(zp, offset, size,
1272219089Spjd			    RL_READER);
1273219089Spjd			if (zp->z_blksz == size)
1274168404Spjd				break;
1275219089Spjd			offset += blkoff;
1276219089Spjd			zfs_range_unlock(zgd->zgd_rl);
1277168404Spjd		}
1278168404Spjd		/* test for truncation needs to be done while range locked */
1279219089Spjd		if (lr->lr_offset >= zp->z_size)
1280249195Smm			error = SET_ERROR(ENOENT);
1281214378Smm#ifdef DEBUG
1282214378Smm		if (zil_fault_io) {
1283249195Smm			error = SET_ERROR(EIO);
1284214378Smm			zil_fault_io = 0;
1285214378Smm		}
1286214378Smm#endif
1287219089Spjd		if (error == 0)
1288219089Spjd			error = dmu_buf_hold(os, object, offset, zgd, &db,
1289219089Spjd			    DMU_READ_NO_PREFETCH);
1290214378Smm
1291209962Smm		if (error == 0) {
1292243524Smm			blkptr_t *obp = dmu_buf_get_blkptr(db);
1293243524Smm			if (obp) {
1294243524Smm				ASSERT(BP_IS_HOLE(bp));
1295243524Smm				*bp = *obp;
1296243524Smm			}
1297243524Smm
1298219089Spjd			zgd->zgd_db = db;
1299219089Spjd			zgd->zgd_bp = bp;
1300219089Spjd
1301219089Spjd			ASSERT(db->db_offset == offset);
1302219089Spjd			ASSERT(db->db_size == size);
1303219089Spjd
1304219089Spjd			error = dmu_sync(zio, lr->lr_common.lrc_txg,
1305219089Spjd			    zfs_get_done, zgd);
1306219089Spjd			ASSERT(error || lr->lr_length <= zp->z_blksz);
1307219089Spjd
1308209962Smm			/*
1309219089Spjd			 * On success, we need to wait for the write I/O
1310219089Spjd			 * initiated by dmu_sync() to complete before we can
1311219089Spjd			 * release this dbuf.  We will finish everything up
1312219089Spjd			 * in the zfs_get_done() callback.
1313209962Smm			 */
1314219089Spjd			if (error == 0)
1315219089Spjd				return (0);
1316209962Smm
1317219089Spjd			if (error == EALREADY) {
1318219089Spjd				lr->lr_common.lrc_txtype = TX_WRITE2;
1319219089Spjd				error = 0;
1320219089Spjd			}
1321209962Smm		}
1322168404Spjd	}
1323219089Spjd
1324219089Spjd	zfs_get_done(zgd, error);
1325219089Spjd
1326168404Spjd	return (error);
1327168404Spjd}
1328168404Spjd
1329168404Spjd/*ARGSUSED*/
1330168404Spjdstatic int
1331185029Spjdzfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
1332185029Spjd    caller_context_t *ct)
1333168404Spjd{
1334168404Spjd	znode_t *zp = VTOZ(vp);
1335168404Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1336168404Spjd	int error;
1337168404Spjd
1338168404Spjd	ZFS_ENTER(zfsvfs);
1339185029Spjd	ZFS_VERIFY_ZP(zp);
1340185029Spjd
1341185029Spjd	if (flag & V_ACE_MASK)
1342185029Spjd		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
1343185029Spjd	else
1344185029Spjd		error = zfs_zaccess_rwx(zp, mode, flag, cr);
1345185029Spjd
1346168404Spjd	ZFS_EXIT(zfsvfs);
1347168404Spjd	return (error);
1348168404Spjd}
1349168404Spjd
1350168404Spjd/*
1351211932Smm * If vnode is for a device return a specfs vnode instead.
1352211932Smm */
1353211932Smmstatic int
1354211932Smmspecvp_check(vnode_t **vpp, cred_t *cr)
1355211932Smm{
1356211932Smm	int error = 0;
1357211932Smm
1358211932Smm	if (IS_DEVVP(*vpp)) {
1359211932Smm		struct vnode *svp;
1360211932Smm
1361211932Smm		svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
1362211932Smm		VN_RELE(*vpp);
1363211932Smm		if (svp == NULL)
1364249195Smm			error = SET_ERROR(ENOSYS);
1365211932Smm		*vpp = svp;
1366211932Smm	}
1367211932Smm	return (error);
1368211932Smm}
1369211932Smm
1370211932Smm
1371211932Smm/*
1372168404Spjd * Lookup an entry in a directory, or an extended attribute directory.
1373168404Spjd * If it exists, return a held vnode reference for it.
1374168404Spjd *
1375168404Spjd *	IN:	dvp	- vnode of directory to search.
1376168404Spjd *		nm	- name of entry to lookup.
1377168404Spjd *		pnp	- full pathname to lookup [UNUSED].
1378168404Spjd *		flags	- LOOKUP_XATTR set if looking for an attribute.
1379168404Spjd *		rdir	- root directory vnode [UNUSED].
1380168404Spjd *		cr	- credentials of caller.
1381185029Spjd *		ct	- caller context
1382185029Spjd *		direntflags - directory lookup flags
1383185029Spjd *		realpnp - returned pathname.
1384168404Spjd *
1385168404Spjd *	OUT:	vpp	- vnode of located entry, NULL if not found.
1386168404Spjd *
1387251631Sdelphij *	RETURN:	0 on success, error code on failure.
1388168404Spjd *
1389168404Spjd * Timestamps:
1390168404Spjd *	NA
1391168404Spjd */
1392168404Spjd/* ARGSUSED */
1393168962Spjdstatic int
1394168962Spjdzfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
1395185029Spjd    int nameiop, cred_t *cr, kthread_t *td, int flags)
1396168404Spjd{
1397168962Spjd	znode_t *zdp = VTOZ(dvp);
1398168962Spjd	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1399211932Smm	int	error = 0;
1400185029Spjd	int *direntflags = NULL;
1401185029Spjd	void *realpnp = NULL;
1402168404Spjd
1403211932Smm	/* fast path */
1404211932Smm	if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
1405211932Smm
1406211932Smm		if (dvp->v_type != VDIR) {
1407249195Smm			return (SET_ERROR(ENOTDIR));
1408219089Spjd		} else if (zdp->z_sa_hdl == NULL) {
1409249195Smm			return (SET_ERROR(EIO));
1410211932Smm		}
1411211932Smm
1412211932Smm		if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
1413211932Smm			error = zfs_fastaccesschk_execute(zdp, cr);
1414211932Smm			if (!error) {
1415211932Smm				*vpp = dvp;
1416211932Smm				VN_HOLD(*vpp);
1417211932Smm				return (0);
1418211932Smm			}
1419211932Smm			return (error);
1420211932Smm		} else {
1421211932Smm			vnode_t *tvp = dnlc_lookup(dvp, nm);
1422211932Smm
1423211932Smm			if (tvp) {
1424211932Smm				error = zfs_fastaccesschk_execute(zdp, cr);
1425211932Smm				if (error) {
1426211932Smm					VN_RELE(tvp);
1427211932Smm					return (error);
1428211932Smm				}
1429211932Smm				if (tvp == DNLC_NO_VNODE) {
1430211932Smm					VN_RELE(tvp);
1431249195Smm					return (SET_ERROR(ENOENT));
1432211932Smm				} else {
1433211932Smm					*vpp = tvp;
1434211932Smm					return (specvp_check(vpp, cr));
1435211932Smm				}
1436211932Smm			}
1437211932Smm		}
1438211932Smm	}
1439211932Smm
1440211932Smm	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
1441211932Smm
1442168404Spjd	ZFS_ENTER(zfsvfs);
1443185029Spjd	ZFS_VERIFY_ZP(zdp);
1444168404Spjd
1445168404Spjd	*vpp = NULL;
1446168404Spjd
1447185029Spjd	if (flags & LOOKUP_XATTR) {
1448168404Spjd#ifdef TODO
1449168404Spjd		/*
1450168404Spjd		 * If the xattr property is off, refuse the lookup request.
1451168404Spjd		 */
1452168404Spjd		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
1453168404Spjd			ZFS_EXIT(zfsvfs);
1454249195Smm			return (SET_ERROR(EINVAL));
1455168404Spjd		}
1456185029Spjd#endif
1457168404Spjd
1458168404Spjd		/*
1459168404Spjd		 * We don't allow recursive attributes..
1460168404Spjd		 * Maybe someday we will.
1461168404Spjd		 */
1462219089Spjd		if (zdp->z_pflags & ZFS_XATTR) {
1463168404Spjd			ZFS_EXIT(zfsvfs);
1464249195Smm			return (SET_ERROR(EINVAL));
1465168404Spjd		}
1466168404Spjd
1467168404Spjd		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
1468168404Spjd			ZFS_EXIT(zfsvfs);
1469168404Spjd			return (error);
1470168404Spjd		}
1471168404Spjd
1472168404Spjd		/*
1473168404Spjd		 * Do we have permission to get into attribute directory?
1474168404Spjd		 */
1475168404Spjd
1476185029Spjd		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
1477185029Spjd		    B_FALSE, cr)) {
1478168404Spjd			VN_RELE(*vpp);
1479185029Spjd			*vpp = NULL;
1480168404Spjd		}
1481168404Spjd
1482168404Spjd		ZFS_EXIT(zfsvfs);
1483168404Spjd		return (error);
1484168404Spjd	}
1485168404Spjd
1486168404Spjd	if (dvp->v_type != VDIR) {
1487168404Spjd		ZFS_EXIT(zfsvfs);
1488249195Smm		return (SET_ERROR(ENOTDIR));
1489168404Spjd	}
1490168404Spjd
1491168404Spjd	/*
1492168404Spjd	 * Check accessibility of directory.
1493168404Spjd	 */
1494168404Spjd
1495185029Spjd	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
1496168404Spjd		ZFS_EXIT(zfsvfs);
1497168404Spjd		return (error);
1498168404Spjd	}
1499168404Spjd
1500185029Spjd	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
1501185029Spjd	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1502185029Spjd		ZFS_EXIT(zfsvfs);
1503249195Smm		return (SET_ERROR(EILSEQ));
1504185029Spjd	}
1505168404Spjd
1506185029Spjd	error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
1507211932Smm	if (error == 0)
1508211932Smm		error = specvp_check(vpp, cr);
1509168962Spjd
1510168404Spjd	/* Translate errors and add SAVENAME when needed. */
1511168404Spjd	if (cnp->cn_flags & ISLASTCN) {
1512168404Spjd		switch (nameiop) {
1513168404Spjd		case CREATE:
1514168404Spjd		case RENAME:
1515168404Spjd			if (error == ENOENT) {
1516168404Spjd				error = EJUSTRETURN;
1517168404Spjd				cnp->cn_flags |= SAVENAME;
1518168404Spjd				break;
1519168404Spjd			}
1520168404Spjd			/* FALLTHROUGH */
1521168404Spjd		case DELETE:
1522168404Spjd			if (error == 0)
1523168404Spjd				cnp->cn_flags |= SAVENAME;
1524168404Spjd			break;
1525168404Spjd		}
1526168404Spjd	}
1527168404Spjd	if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) {
1528169198Spjd		int ltype = 0;
1529169198Spjd
1530169198Spjd		if (cnp->cn_flags & ISDOTDOT) {
1531176559Sattilio			ltype = VOP_ISLOCKED(dvp);
1532175294Sattilio			VOP_UNLOCK(dvp, 0);
1533169198Spjd		}
1534206667Spjd		ZFS_EXIT(zfsvfs);
1535254711Savg		error = vn_lock(*vpp, cnp->cn_lkflags);
1536168962Spjd		if (cnp->cn_flags & ISDOTDOT)
1537175202Sattilio			vn_lock(dvp, ltype | LK_RETRY);
1538169172Spjd		if (error != 0) {
1539169172Spjd			VN_RELE(*vpp);
1540169172Spjd			*vpp = NULL;
1541169172Spjd			return (error);
1542169172Spjd		}
1543206667Spjd	} else {
1544206667Spjd		ZFS_EXIT(zfsvfs);
1545168404Spjd	}
1546168404Spjd
1547168404Spjd#ifdef FREEBSD_NAMECACHE
1548168404Spjd	/*
1549168404Spjd	 * Insert name into cache (as non-existent) if appropriate.
1550168404Spjd	 */
1551168404Spjd	if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
1552168404Spjd		cache_enter(dvp, *vpp, cnp);
1553169170Spjd	/*
1554169170Spjd	 * Insert name into cache if appropriate.
1555169170Spjd	 */
1556168404Spjd	if (error == 0 && (cnp->cn_flags & MAKEENTRY)) {
1557168404Spjd		if (!(cnp->cn_flags & ISLASTCN) ||
1558168404Spjd		    (nameiop != DELETE && nameiop != RENAME)) {
1559168404Spjd			cache_enter(dvp, *vpp, cnp);
1560168404Spjd		}
1561168404Spjd	}
1562168404Spjd#endif
1563168404Spjd
1564168404Spjd	return (error);
1565168404Spjd}
1566168404Spjd
1567168404Spjd/*
1568168404Spjd * Attempt to create a new entry in a directory.  If the entry
1569168404Spjd * already exists, truncate the file if permissible, else return
1570168404Spjd * an error.  Return the vp of the created or trunc'd file.
1571168404Spjd *
1572168404Spjd *	IN:	dvp	- vnode of directory to put new file entry in.
1573168404Spjd *		name	- name of new file entry.
1574168404Spjd *		vap	- attributes of new file.
1575168404Spjd *		excl	- flag indicating exclusive or non-exclusive mode.
1576168404Spjd *		mode	- mode to open file with.
1577168404Spjd *		cr	- credentials of caller.
1578168404Spjd *		flag	- large file flag [UNUSED].
1579185029Spjd *		ct	- caller context
1580268464Sdelphij *		vsecp	- ACL to be set
1581168404Spjd *
1582168404Spjd *	OUT:	vpp	- vnode of created or trunc'd entry.
1583168404Spjd *
1584251631Sdelphij *	RETURN:	0 on success, error code on failure.
1585168404Spjd *
1586168404Spjd * Timestamps:
1587168404Spjd *	dvp - ctime|mtime updated if new entry created
1588168404Spjd *	 vp - ctime|mtime always, atime if new
1589168404Spjd */
1590185029Spjd
1591168404Spjd/* ARGSUSED */
1592168404Spjdstatic int
1593168962Spjdzfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
1594185029Spjd    vnode_t **vpp, cred_t *cr, kthread_t *td)
1595168404Spjd{
1596168404Spjd	znode_t		*zp, *dzp = VTOZ(dvp);
1597168404Spjd	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1598185029Spjd	zilog_t		*zilog;
1599185029Spjd	objset_t	*os;
1600168404Spjd	zfs_dirlock_t	*dl;
1601168404Spjd	dmu_tx_t	*tx;
1602168404Spjd	int		error;
1603209962Smm	ksid_t		*ksid;
1604209962Smm	uid_t		uid;
1605209962Smm	gid_t		gid = crgetgid(cr);
1606219089Spjd	zfs_acl_ids_t   acl_ids;
1607209962Smm	boolean_t	fuid_dirtied;
1608219089Spjd	boolean_t	have_acl = B_FALSE;
1609258632Savg	boolean_t	waited = B_FALSE;
1610185029Spjd	void		*vsecp = NULL;
1611185029Spjd	int		flag = 0;
	/*
	 * vsecp and flag are never assigned after their initializers in
	 * this function, so the vsecp, FIGNORECASE and FAPPEND tests below
	 * always see NULL / 0.
	 */
1612168404Spjd
1613185029Spjd	/*
1614185029Spjd	 * If we have an ephemeral id, ACL, or XVATTR then
1615185029Spjd	 * make sure file system is at proper version
1616185029Spjd	 */
1617185029Spjd
1618209962Smm	ksid = crgetsid(cr, KSID_OWNER);
1619209962Smm	if (ksid)
1620209962Smm		uid = ksid_getid(ksid);
1621209962Smm	else
1622209962Smm		uid = crgetuid(cr);
1623219089Spjd
1624185029Spjd	if (zfsvfs->z_use_fuids == B_FALSE &&
1625185029Spjd	    (vsecp || (vap->va_mask & AT_XVATTR) ||
1626219089Spjd	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1627249195Smm		return (SET_ERROR(EINVAL));
1628185029Spjd
1629168404Spjd	ZFS_ENTER(zfsvfs);
1630185029Spjd	ZFS_VERIFY_ZP(dzp);
1631185029Spjd	os = zfsvfs->z_os;
1632185029Spjd	zilog = zfsvfs->z_log;
1633168404Spjd
	/* Reject names that are not valid UTF-8 when the fs requires it. */
1634185029Spjd	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
1635185029Spjd	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1636185029Spjd		ZFS_EXIT(zfsvfs);
1637249195Smm		return (SET_ERROR(EILSEQ));
1638185029Spjd	}
1639185029Spjd
1640185029Spjd	if (vap->va_mask & AT_XVATTR) {
1641197861Spjd		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
1642185029Spjd		    crgetuid(cr), cr, vap->va_type)) != 0) {
1643185029Spjd			ZFS_EXIT(zfsvfs);
1644185029Spjd			return (error);
1645185029Spjd		}
1646185029Spjd	}
1647260704Savg
	/*
	 * Take a vnode reservation once, before the retry label.  Every
	 * exit path from here on pairs it with getnewvnode_drop_reserve().
	 */
1648260704Savg	getnewvnode_reserve(1);
1649260704Savg
	/*
	 * Retry point: re-entered after dmu_tx_wait() when dmu_tx_assign()
	 * returned ERESTART.  acl_ids (tracked by have_acl) and the vnode
	 * reservation survive across retries.
	 */
1650168404Spjdtop:
1651168404Spjd	*vpp = NULL;
1652168404Spjd
1653182905Strasz	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
1654182905Strasz		vap->va_mode &= ~S_ISVTX;
1655168404Spjd
1656168404Spjd	if (*name == '\0') {
1657168404Spjd		/*
1658168404Spjd		 * Null component name refers to the directory itself.
1659168404Spjd		 */
1660168404Spjd		VN_HOLD(dvp);
1661168404Spjd		zp = dzp;
1662168404Spjd		dl = NULL;
1663168404Spjd		error = 0;
1664168404Spjd	} else {
1665168404Spjd		/* possible VN_HOLD(zp) */
1666185029Spjd		int zflg = 0;
1667185029Spjd
1668185029Spjd		if (flag & FIGNORECASE)
1669185029Spjd			zflg |= ZCILOOK;
1670185029Spjd
1671185029Spjd		error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1672185029Spjd		    NULL, NULL);
1673185029Spjd		if (error) {
			/* acl_ids may be left over from a pass that restarted. */
1674219089Spjd			if (have_acl)
1675219089Spjd				zfs_acl_ids_free(&acl_ids);
1676168404Spjd			if (strcmp(name, "..") == 0)
1677249195Smm				error = SET_ERROR(EISDIR);
1678260704Savg			getnewvnode_drop_reserve();
1679168404Spjd			ZFS_EXIT(zfsvfs);
1680168404Spjd			return (error);
1681168404Spjd		}
1682168404Spjd	}
1683219089Spjd
1684185029Spjd	if (zp == NULL) {
1685185029Spjd		uint64_t txtype;
1686168404Spjd
1687168404Spjd		/*
1688168404Spjd		 * Create a new file object and update the directory
1689168404Spjd		 * to reference it.
1690168404Spjd		 */
1691185029Spjd		if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
1692219089Spjd			if (have_acl)
1693219089Spjd				zfs_acl_ids_free(&acl_ids);
1694168404Spjd			goto out;
1695168404Spjd		}
1696168404Spjd
1697168404Spjd		/*
1698168404Spjd		 * We only support the creation of regular files in
1699168404Spjd		 * extended attribute directories.
1700168404Spjd		 */
1701219089Spjd
1702219089Spjd		if ((dzp->z_pflags & ZFS_XATTR) &&
1703168404Spjd		    (vap->va_type != VREG)) {
1704219089Spjd			if (have_acl)
1705219089Spjd				zfs_acl_ids_free(&acl_ids);
1706249195Smm			error = SET_ERROR(EINVAL);
1707168404Spjd			goto out;
1708168404Spjd		}
1709168404Spjd
		/*
		 * Build the ACL ids only on the first pass; a retry from
		 * 'top' reuses the ones created earlier.
		 */
1710219089Spjd		if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
1711219089Spjd		    cr, vsecp, &acl_ids)) != 0)
1712219089Spjd			goto out;
1713219089Spjd		have_acl = B_TRUE;
1714209962Smm
1715209962Smm		if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1716211932Smm			zfs_acl_ids_free(&acl_ids);
1717249195Smm			error = SET_ERROR(EDQUOT);
1718209962Smm			goto out;
1719209962Smm		}
1720209962Smm
1721168404Spjd		tx = dmu_tx_create(os);
1722219089Spjd
1723219089Spjd		dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1724219089Spjd		    ZFS_SA_BASE_ATTR_SIZE);
1725219089Spjd
1726209962Smm		fuid_dirtied = zfsvfs->z_fuid_dirty;
1727209962Smm		if (fuid_dirtied)
1728209962Smm			zfs_fuid_txhold(zfsvfs, tx);
1729168404Spjd		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1730219089Spjd		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
1731219089Spjd		if (!zfsvfs->z_use_sa &&
1732219089Spjd		    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1733168404Spjd			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1734219089Spjd			    0, acl_ids.z_aclp->z_acl_bytes);
1735185029Spjd		}
		/*
		 * First attempt uses TXG_NOWAIT; once a pass has gone
		 * through dmu_tx_wait(), 'waited' makes retries use
		 * TXG_WAITED.
		 */
1736258632Savg		error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
1737168404Spjd		if (error) {
1738168404Spjd			zfs_dirent_unlock(dl);
1739209962Smm			if (error == ERESTART) {
1740258632Savg				waited = B_TRUE;
1741168404Spjd				dmu_tx_wait(tx);
1742168404Spjd				dmu_tx_abort(tx);
1743168404Spjd				goto top;
1744168404Spjd			}
1745219089Spjd			zfs_acl_ids_free(&acl_ids);
1746168404Spjd			dmu_tx_abort(tx);
1747260704Savg			getnewvnode_drop_reserve();
1748168404Spjd			ZFS_EXIT(zfsvfs);
1749168404Spjd			return (error);
1750168404Spjd		}
1751219089Spjd		zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1752209962Smm
1753209962Smm		if (fuid_dirtied)
1754209962Smm			zfs_fuid_sync(zfsvfs, tx);
1755209962Smm
1756168404Spjd		(void) zfs_link_create(dl, zp, tx, ZNEW);
1757185029Spjd		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1758185029Spjd		if (flag & FIGNORECASE)
1759185029Spjd			txtype |= TX_CI;
1760185029Spjd		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1761209962Smm		    vsecp, acl_ids.z_fuidp, vap);
1762209962Smm		zfs_acl_ids_free(&acl_ids);
1763168404Spjd		dmu_tx_commit(tx);
1764168404Spjd	} else {
1765185029Spjd		int aflags = (flag & FAPPEND) ? V_APPEND : 0;
1766185029Spjd
		/*
		 * The entry exists, so ACL ids built on an earlier pass
		 * (before a restart) are no longer needed.
		 */
1767219089Spjd		if (have_acl)
1768219089Spjd			zfs_acl_ids_free(&acl_ids);
1769219089Spjd		have_acl = B_FALSE;
1770219089Spjd
1771168404Spjd		/*
1772168404Spjd		 * A directory entry already exists for this name.
1773168404Spjd		 */
1774168404Spjd		/*
1775168962Spjd		 * Can't truncate an existing file if in exclusive mode.
1776168962Spjd		 */
1777168962Spjd		if (excl == EXCL) {
1778249195Smm			error = SET_ERROR(EEXIST);
1779168962Spjd			goto out;
1780168962Spjd		}
1781168962Spjd		/*
1782168404Spjd		 * Can't open a directory for writing.
1783168404Spjd		 */
1784168404Spjd		if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
1785249195Smm			error = SET_ERROR(EISDIR);
1786168404Spjd			goto out;
1787168404Spjd		}
1788168404Spjd		/*
1789168404Spjd		 * Verify requested access to file.
1790168404Spjd		 */
1791185029Spjd		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
1792168404Spjd			goto out;
1793168404Spjd		}
1794168404Spjd
1795168404Spjd		mutex_enter(&dzp->z_lock);
1796168404Spjd		dzp->z_seq++;
1797168404Spjd		mutex_exit(&dzp->z_lock);
1798168404Spjd
1799168404Spjd		/*
1800168404Spjd		 * Truncate regular files if requested.
1801168404Spjd		 */
1802168404Spjd		if ((ZTOV(zp)->v_type == VREG) &&
1803168404Spjd		    (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
1804185029Spjd			/* we can't hold any locks when calling zfs_freesp() */
1805185029Spjd			zfs_dirent_unlock(dl);
1806185029Spjd			dl = NULL;
1807168404Spjd			error = zfs_freesp(zp, 0, 0, mode, TRUE);
1808185029Spjd			if (error == 0) {
				/*
				 * NOTE(review): 'ct' is neither a parameter
				 * nor a local of this function; presumably
				 * vnevent_create() is a macro that discards
				 * its arguments on this platform -- confirm.
				 */
1809185029Spjd				vnevent_create(ZTOV(zp), ct);
1810168404Spjd			}
1811168404Spjd		}
1812168404Spjd	}
	/*
	 * Common exit: drop the vnode reservation and the directory entry
	 * lock (if still held), then either release zp on error or hand its
	 * vnode back through *vpp.
	 */
1813168404Spjdout:
1814260704Savg	getnewvnode_drop_reserve();
1815168404Spjd	if (dl)
1816168404Spjd		zfs_dirent_unlock(dl);
1817168404Spjd
1818168404Spjd	if (error) {
1819168404Spjd		if (zp)
1820168404Spjd			VN_RELE(ZTOV(zp));
1821168962Spjd	} else {
1822168962Spjd		*vpp = ZTOV(zp);
1823211932Smm		error = specvp_check(vpp, cr);
1824168404Spjd	}
1825168404Spjd
	/* With sync=always the intent log is committed before returning. */
1826219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1827219089Spjd		zil_commit(zilog, 0);
1828219089Spjd
1829168404Spjd	ZFS_EXIT(zfsvfs);
1830168404Spjd	return (error);
1831168404Spjd}
1832168404Spjd
1833168404Spjd/*
1834168404Spjd * Remove an entry from a directory.
1835168404Spjd *
1836168404Spjd *	IN:	dvp	- vnode of directory to remove entry from.
1837168404Spjd *		name	- name of entry to remove.
1838168404Spjd *		cr	- credentials of caller.
1839185029Spjd *		ct	- caller context
1840185029Spjd *		flags	- case flags
1841168404Spjd *
1842251631Sdelphij *	RETURN:	0 on success, error code on failure.
1843168404Spjd *
1844168404Spjd * Timestamps:
1845168404Spjd *	dvp - ctime|mtime
1846168404Spjd *	 vp - ctime (if nlink > 0)
1847168404Spjd */
1848219089Spjd
/*
 * Zero value that zfs_remove() writes as the SA_ZPL_XATTR attribute (via
 * sa_update()) to clear the xattr object reference on a znode that is not
 * in SA format.
 */
1849219089Spjduint64_t null_xattr = 0;
1850219089Spjd
1851185029Spjd/*ARGSUSED*/
1852168404Spjdstatic int
1853185029Spjdzfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
1854185029Spjd    int flags)
1855168404Spjd{
1856168404Spjd	znode_t		*zp, *dzp = VTOZ(dvp);
1857219089Spjd	znode_t		*xzp;
1858168404Spjd	vnode_t		*vp;
1859168404Spjd	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1860185029Spjd	zilog_t		*zilog;
1861168962Spjd	uint64_t	acl_obj, xattr_obj;
1862268464Sdelphij	uint64_t	xattr_obj_unlinked = 0;
1863219089Spjd	uint64_t	obj = 0;
1864168404Spjd	zfs_dirlock_t	*dl;
1865168404Spjd	dmu_tx_t	*tx;
1866168962Spjd	boolean_t	may_delete_now, delete_now = FALSE;
1867185029Spjd	boolean_t	unlinked, toobig = FALSE;
1868185029Spjd	uint64_t	txtype;
1869185029Spjd	pathname_t	*realnmp = NULL;
1870185029Spjd	pathname_t	realnm;
1871168404Spjd	int		error;
1872185029Spjd	int		zflg = ZEXISTS;
1873258632Savg	boolean_t	waited = B_FALSE;
1874168404Spjd
1875168404Spjd	ZFS_ENTER(zfsvfs);
1876185029Spjd	ZFS_VERIFY_ZP(dzp);
1877185029Spjd	zilog = zfsvfs->z_log;
1878168404Spjd
	/*
	 * For case-insensitive removal, allocate a pathname buffer that
	 * zfs_dirent_lock() is given below; it is released with pn_free()
	 * on every exit path.
	 */
1879185029Spjd	if (flags & FIGNORECASE) {
1880185029Spjd		zflg |= ZCILOOK;
1881185029Spjd		pn_alloc(&realnm);
1882185029Spjd		realnmp = &realnm;
1883185029Spjd	}
1884185029Spjd
	/*
	 * Retry point: re-entered after dmu_tx_wait() when dmu_tx_assign()
	 * returned ERESTART.  The dirent lock and the vnode holds are
	 * dropped before jumping back and are re-acquired from here.
	 */
1885168404Spjdtop:
1886219089Spjd	xattr_obj = 0;
1887219089Spjd	xzp = NULL;
1888168404Spjd	/*
1889168404Spjd	 * Attempt to lock directory; fail if entry doesn't exist.
1890168404Spjd	 */
1891185029Spjd	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1892185029Spjd	    NULL, realnmp)) {
1893185029Spjd		if (realnmp)
1894185029Spjd			pn_free(realnmp);
1895168404Spjd		ZFS_EXIT(zfsvfs);
1896168404Spjd		return (error);
1897168404Spjd	}
1898168404Spjd
1899168404Spjd	vp = ZTOV(zp);
1900168404Spjd
1901168962Spjd	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1902168404Spjd		goto out;
1903168962Spjd	}
1904168404Spjd
1905168962Spjd	/*
1906168962Spjd	 * Need to use rmdir for removing directories.
1907168962Spjd	 */
1908168962Spjd	if (vp->v_type == VDIR) {
1909249195Smm		error = SET_ERROR(EPERM);
1910168962Spjd		goto out;
1911168962Spjd	}
1912168962Spjd
1913185029Spjd	vnevent_remove(vp, dvp, name, ct);
1914168962Spjd
1915185029Spjd	if (realnmp)
1916185029Spjd		dnlc_remove(dvp, realnmp->pn_buf);
1917185029Spjd	else
1918185029Spjd		dnlc_remove(dvp, name);
1919168404Spjd
	/*
	 * Sampled under the vnode interlock: immediate deletion is only a
	 * candidate when v_count is exactly 1 and there is no cached data.
	 * The condition is re-checked after the directory entry is removed.
	 */
1920219089Spjd	VI_LOCK(vp);
1921219089Spjd	may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp);
1922219089Spjd	VI_UNLOCK(vp);
1923168962Spjd
1924168404Spjd	/*
1925168404Spjd	 * We may delete the znode now, or we may put it in the unlinked set;
1926168404Spjd	 * it depends on whether we're the last link, and on whether there are
1927168404Spjd	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
1928168404Spjd	 * allow for either case.
1929168404Spjd	 */
1930219089Spjd	obj = zp->z_id;
1931168404Spjd	tx = dmu_tx_create(zfsvfs->z_os);
1932168404Spjd	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1933219089Spjd	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1934219089Spjd	zfs_sa_upgrade_txholds(tx, zp);
1935219089Spjd	zfs_sa_upgrade_txholds(tx, dzp);
1936185029Spjd	if (may_delete_now) {
1937185029Spjd		toobig =
1938219089Spjd		    zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
1939185029Spjd		/* if the file is too big, only hold_free a token amount */
1940185029Spjd		dmu_tx_hold_free(tx, zp->z_id, 0,
1941185029Spjd		    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
1942185029Spjd	}
1943168404Spjd
1944168404Spjd	/* are there any extended attributes? */
1945219089Spjd	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1946219089Spjd	    &xattr_obj, sizeof (xattr_obj));
1947219089Spjd	if (error == 0 && xattr_obj) {
		/*
		 * NOTE(review): a zfs_zget() failure is only caught by
		 * ASSERT0(); xzp is dereferenced two lines later
		 * regardless.  Confirm this cannot fail here, or that
		 * ASSERT0 is active in the builds that matter.
		 */
1948219089Spjd		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1949240415Smm		ASSERT0(error);
1950219089Spjd		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1951219089Spjd		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1952168404Spjd	}
1953168404Spjd
1954219089Spjd	mutex_enter(&zp->z_lock);
1955219089Spjd	if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
1956168962Spjd		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
1957219089Spjd	mutex_exit(&zp->z_lock);
1958168962Spjd
1959168404Spjd	/* charge as an update -- would be nice not to charge at all */
1960168404Spjd	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1961168404Spjd
1962268464Sdelphij	/*
1963268464Sdelphij	 * Mark this transaction as typically resulting in a net free of
1964268464Sdelphij	 * space, unless object removal will be delayed indefinitely
1965268464Sdelphij	 * (due to active holds on the vnode due to the file being open).
1966268464Sdelphij	 */
1967268464Sdelphij	if (may_delete_now)
1968268464Sdelphij		dmu_tx_mark_netfree(tx);
1969268464Sdelphij
	/*
	 * First attempt uses TXG_NOWAIT; once a pass has gone through
	 * dmu_tx_wait(), 'waited' makes retries use TXG_WAITED.
	 */
1970258632Savg	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
1971168404Spjd	if (error) {
		/*
		 * Release everything acquired since 'top' so that either
		 * the retry or the error return starts clean.
		 */
1972168404Spjd		zfs_dirent_unlock(dl);
1973168962Spjd		VN_RELE(vp);
1974219089Spjd		if (xzp)
1975219089Spjd			VN_RELE(ZTOV(xzp));
1976209962Smm		if (error == ERESTART) {
1977258632Savg			waited = B_TRUE;
1978168404Spjd			dmu_tx_wait(tx);
1979168404Spjd			dmu_tx_abort(tx);
1980168404Spjd			goto top;
1981168404Spjd		}
1982185029Spjd		if (realnmp)
1983185029Spjd			pn_free(realnmp);
1984168404Spjd		dmu_tx_abort(tx);
1985168404Spjd		ZFS_EXIT(zfsvfs);
1986168404Spjd		return (error);
1987168404Spjd	}
1988168404Spjd
1989168404Spjd	/*
1990168404Spjd	 * Remove the directory entry.
1991168404Spjd	 */
1992185029Spjd	error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
1993168404Spjd
1994168404Spjd	if (error) {
1995168404Spjd		dmu_tx_commit(tx);
1996168404Spjd		goto out;
1997168404Spjd	}
1998168404Spjd
1999219089Spjd	if (unlinked) {
2000219089Spjd		/*
2001219089Spjd		 * Hold z_lock so that we can make sure that the ACL obj
2002219089Spjd		 * hasn't changed.  Could have been deleted due to
2003219089Spjd		 * zfs_sa_upgrade().
2004219089Spjd		 */
		/*
		 * z_lock stays held past this block; it is released in
		 * whichever of the two branches below runs (delete_now or
		 * unlinked).
		 */
2005219089Spjd		mutex_enter(&zp->z_lock);
2006168962Spjd		VI_LOCK(vp);
2007219089Spjd		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
2008219089Spjd		    &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
2009185029Spjd		delete_now = may_delete_now && !toobig &&
2010168962Spjd		    vp->v_count == 1 && !vn_has_cached_data(vp) &&
2011219089Spjd		    xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) ==
2012219089Spjd		    acl_obj;
2013168962Spjd		VI_UNLOCK(vp);
2014168962Spjd	}
2015168962Spjd
2016168962Spjd	if (delete_now) {
		/* On FreeBSD builds this branch panics immediately. */
2017243270Savg#ifdef __FreeBSD__
2018243270Savg		panic("zfs_remove: delete_now branch taken");
2019243270Savg#endif
2020219089Spjd		if (xattr_obj_unlinked) {
2021219089Spjd			ASSERT3U(xzp->z_links, ==, 2);
2022168962Spjd			mutex_enter(&xzp->z_lock);
2023168962Spjd			xzp->z_unlinked = 1;
2024219089Spjd			xzp->z_links = 0;
2025219089Spjd			error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
2026219089Spjd			    &xzp->z_links, sizeof (xzp->z_links), tx);
2027219089Spjd			ASSERT3U(error,  ==,  0);
2028168962Spjd			mutex_exit(&xzp->z_lock);
2029168962Spjd			zfs_unlinked_add(xzp, tx);
2030219089Spjd
			/*
			 * Clear the xattr reference on zp: remove the
			 * attribute for SA znodes, otherwise overwrite it
			 * with zero (null_xattr).
			 */
2031219089Spjd			if (zp->z_is_sa)
2032219089Spjd				error = sa_remove(zp->z_sa_hdl,
2033219089Spjd				    SA_ZPL_XATTR(zfsvfs), tx);
2034219089Spjd			else
2035219089Spjd				error = sa_update(zp->z_sa_hdl,
2036219089Spjd				    SA_ZPL_XATTR(zfsvfs), &null_xattr,
2037219089Spjd				    sizeof (uint64_t), tx);
2038240415Smm			ASSERT0(error);
2039168962Spjd		}
		/*
		 * Drop our reference by hand; this is why the VN_RELE(vp)
		 * at 'out' is skipped when delete_now is set.
		 */
2040168962Spjd		VI_LOCK(vp);
2041168962Spjd		vp->v_count--;
2042240415Smm		ASSERT0(vp->v_count);
2043168962Spjd		VI_UNLOCK(vp);
2044168962Spjd		mutex_exit(&zp->z_lock);
2045168962Spjd		zfs_znode_delete(zp, tx);
2046168962Spjd	} else if (unlinked) {
2047219089Spjd		mutex_exit(&zp->z_lock);
2048168404Spjd		zfs_unlinked_add(zp, tx);
2049243268Savg#ifdef __FreeBSD__
2050243268Savg		vp->v_vflag |= VV_NOSYNC;
2051243268Savg#endif
2052168962Spjd	}
2053168404Spjd
2054185029Spjd	txtype = TX_REMOVE;
2055185029Spjd	if (flags & FIGNORECASE)
2056185029Spjd		txtype |= TX_CI;
2057219089Spjd	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
2058168404Spjd
2059168404Spjd	dmu_tx_commit(tx);
	/*
	 * Common exit for paths that still hold the dirent lock and the
	 * vnode reference(s).
	 */
2060168404Spjdout:
2061185029Spjd	if (realnmp)
2062185029Spjd		pn_free(realnmp);
2063185029Spjd
2064168404Spjd	zfs_dirent_unlock(dl);
2065168404Spjd
2066219089Spjd	if (!delete_now)
2067168962Spjd		VN_RELE(vp);
2068219089Spjd	if (xzp)
2069168962Spjd		VN_RELE(ZTOV(xzp));
2070168962Spjd
	/* With sync=always the intent log is committed before returning. */
2071219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2072219089Spjd		zil_commit(zilog, 0);
2073219089Spjd
2074168404Spjd	ZFS_EXIT(zfsvfs);
2075168404Spjd	return (error);
2076168404Spjd}
2077168404Spjd
2078168404Spjd/*
2079168404Spjd * Create a new directory and insert it into dvp using the name
2080168404Spjd * provided.  Return a pointer to the inserted directory.
2081168404Spjd *
2082168404Spjd *	IN:	dvp	- vnode of directory to add subdir to.
2083168404Spjd *		dirname	- name of new directory.
2084168404Spjd *		vap	- attributes of new directory.
2085168404Spjd *		cr	- credentials of caller.
2086185029Spjd *		ct	- caller context
2087251631Sdelphij *		flags	- case flags
2088185029Spjd *		vsecp	- ACL to be set
2089168404Spjd *
2090168404Spjd *	OUT:	vpp	- vnode of created directory.
2091168404Spjd *
2092251631Sdelphij *	RETURN:	0 on success, error code on failure.
2093168404Spjd *
2094168404Spjd * Timestamps:
2095168404Spjd *	dvp - ctime|mtime updated
2096168404Spjd *	 vp - ctime|mtime|atime updated
2097168404Spjd */
2098185029Spjd/*ARGSUSED*/
2099168404Spjdstatic int
2100185029Spjdzfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
2101185029Spjd    caller_context_t *ct, int flags, vsecattr_t *vsecp)
2102168404Spjd{
2103168404Spjd	znode_t		*zp, *dzp = VTOZ(dvp);
2104168404Spjd	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2105185029Spjd	zilog_t		*zilog;
2106168404Spjd	zfs_dirlock_t	*dl;
2107185029Spjd	uint64_t	txtype;
2108168404Spjd	dmu_tx_t	*tx;
2109168404Spjd	int		error;
2110185029Spjd	int		zf = ZNEW;
2111209962Smm	ksid_t		*ksid;
2112209962Smm	uid_t		uid;
2113209962Smm	gid_t		gid = crgetgid(cr);
2114219089Spjd	zfs_acl_ids_t   acl_ids;
2115209962Smm	boolean_t	fuid_dirtied;
2116258632Savg	boolean_t	waited = B_FALSE;
2117168404Spjd
2118168404Spjd	ASSERT(vap->va_type == VDIR);
2119168404Spjd
2120185029Spjd	/*
2121185029Spjd	 * If we have an ephemeral id, ACL, or XVATTR then
2122185029Spjd	 * make sure file system is at proper version
2123185029Spjd	 */
2124185029Spjd
2125209962Smm	ksid = crgetsid(cr, KSID_OWNER);
2126209962Smm	if (ksid)
2127209962Smm		uid = ksid_getid(ksid);
2128209962Smm	else
2129209962Smm		uid = crgetuid(cr);
2130185029Spjd	if (zfsvfs->z_use_fuids == B_FALSE &&
2131219089Spjd	    (vsecp || (vap->va_mask & AT_XVATTR) ||
2132219089Spjd	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
2133249195Smm		return (SET_ERROR(EINVAL));
2134185029Spjd
2135168404Spjd	ZFS_ENTER(zfsvfs);
2136185029Spjd	ZFS_VERIFY_ZP(dzp);
2137185029Spjd	zilog = zfsvfs->z_log;
2138168404Spjd
	/* Directories cannot be created inside an extended attribute dir. */
2139219089Spjd	if (dzp->z_pflags & ZFS_XATTR) {
2140168404Spjd		ZFS_EXIT(zfsvfs);
2141249195Smm		return (SET_ERROR(EINVAL));
2142168404Spjd	}
2143168404Spjd
	/* Reject names that are not valid UTF-8 when the fs requires it. */
2144185029Spjd	if (zfsvfs->z_utf8 && u8_validate(dirname,
2145185029Spjd	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2146185029Spjd		ZFS_EXIT(zfsvfs);
2147249195Smm		return (SET_ERROR(EILSEQ));
2148185029Spjd	}
2149185029Spjd	if (flags & FIGNORECASE)
2150185029Spjd		zf |= ZCILOOK;
2151185029Spjd
2152219089Spjd	if (vap->va_mask & AT_XVATTR) {
2153197861Spjd		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
2154185029Spjd		    crgetuid(cr), cr, vap->va_type)) != 0) {
2155185029Spjd			ZFS_EXIT(zfsvfs);
2156185029Spjd			return (error);
2157185029Spjd		}
2158219089Spjd	}
2159185029Spjd
	/*
	 * acl_ids is built once, before the retry label.  Every exit after
	 * this point frees it with zfs_acl_ids_free(); the ERESTART retry
	 * keeps and reuses it.
	 */
2160219089Spjd	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
2161219089Spjd	    vsecp, &acl_ids)) != 0) {
2162219089Spjd		ZFS_EXIT(zfsvfs);
2163219089Spjd		return (error);
2164219089Spjd	}
2165260704Savg
	/*
	 * Take a vnode reservation once, before the retry label.  Every
	 * exit path from here on pairs it with getnewvnode_drop_reserve().
	 */
2166260704Savg	getnewvnode_reserve(1);
2167260704Savg
2168168404Spjd	/*
2169168404Spjd	 * First make sure the new directory doesn't exist.
2170219089Spjd	 *
2171219089Spjd	 * Existence is checked first to make sure we don't return
2172219089Spjd	 * EACCES instead of EEXIST which can cause some applications
2173219089Spjd	 * to fail.
2174168404Spjd	 */
2175185029Spjdtop:
2176185029Spjd	*vpp = NULL;
2177185029Spjd
2178185029Spjd	if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
2179185029Spjd	    NULL, NULL)) {
2180219089Spjd		zfs_acl_ids_free(&acl_ids);
2181260704Savg		getnewvnode_drop_reserve();
2182168404Spjd		ZFS_EXIT(zfsvfs);
2183168404Spjd		return (error);
2184168404Spjd	}
2185168404Spjd
2186185029Spjd	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
2187219089Spjd		zfs_acl_ids_free(&acl_ids);
2188168404Spjd		zfs_dirent_unlock(dl);
2189260704Savg		getnewvnode_drop_reserve();
2190168404Spjd		ZFS_EXIT(zfsvfs);
2191168404Spjd		return (error);
2192168404Spjd	}
2193168404Spjd
2194209962Smm	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
2195211932Smm		zfs_acl_ids_free(&acl_ids);
2196209962Smm		zfs_dirent_unlock(dl);
2197260704Savg		getnewvnode_drop_reserve();
2198209962Smm		ZFS_EXIT(zfsvfs);
2199249195Smm		return (SET_ERROR(EDQUOT));
2200209962Smm	}
2201209962Smm
2202168404Spjd	/*
2203168404Spjd	 * Add a new entry to the directory.
2204168404Spjd	 */
2205168404Spjd	tx = dmu_tx_create(zfsvfs->z_os);
2206168404Spjd	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
2207168404Spjd	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2208209962Smm	fuid_dirtied = zfsvfs->z_fuid_dirty;
2209209962Smm	if (fuid_dirtied)
2210209962Smm		zfs_fuid_txhold(zfsvfs, tx);
2211219089Spjd	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2212219089Spjd		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
2213219089Spjd		    acl_ids.z_aclp->z_acl_bytes);
2214219089Spjd	}
2215219089Spjd
2216219089Spjd	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
2217219089Spjd	    ZFS_SA_BASE_ATTR_SIZE);
2218219089Spjd
	/*
	 * First attempt uses TXG_NOWAIT; once a pass has gone through
	 * dmu_tx_wait(), 'waited' makes retries use TXG_WAITED.
	 */
2219258632Savg	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
2220168404Spjd	if (error) {
2221168404Spjd		zfs_dirent_unlock(dl);
2222209962Smm		if (error == ERESTART) {
2223258632Savg			waited = B_TRUE;
2224168404Spjd			dmu_tx_wait(tx);
2225168404Spjd			dmu_tx_abort(tx);
2226168404Spjd			goto top;
2227168404Spjd		}
2228219089Spjd		zfs_acl_ids_free(&acl_ids);
2229168404Spjd		dmu_tx_abort(tx);
2230260704Savg		getnewvnode_drop_reserve();
2231168404Spjd		ZFS_EXIT(zfsvfs);
2232168404Spjd		return (error);
2233168404Spjd	}
2234168404Spjd
2235168404Spjd	/*
2236168404Spjd	 * Create new node.
2237168404Spjd	 */
2238219089Spjd	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
2239168404Spjd
2240209962Smm	if (fuid_dirtied)
2241209962Smm		zfs_fuid_sync(zfsvfs, tx);
2242219089Spjd
2243168404Spjd	/*
2244168404Spjd	 * Now put new name in parent dir.
2245168404Spjd	 */
2246168404Spjd	(void) zfs_link_create(dl, zp, tx, ZNEW);
2247168404Spjd
	/* Hand the new directory's vnode back to the caller. */
2248168404Spjd	*vpp = ZTOV(zp);
2249168404Spjd
2250185029Spjd	txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
2251185029Spjd	if (flags & FIGNORECASE)
2252185029Spjd		txtype |= TX_CI;
2253209962Smm	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
2254209962Smm	    acl_ids.z_fuidp, vap);
2255185029Spjd
2256209962Smm	zfs_acl_ids_free(&acl_ids);
2257219089Spjd
2258168404Spjd	dmu_tx_commit(tx);
2259168404Spjd
2260260704Savg	getnewvnode_drop_reserve();
2261260704Savg
2262168404Spjd	zfs_dirent_unlock(dl);
2263168404Spjd
	/* With sync=always the intent log is committed before returning. */
2264219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2265219089Spjd		zil_commit(zilog, 0);
2266219089Spjd
2267168404Spjd	ZFS_EXIT(zfsvfs);
2268168404Spjd	return (0);
2269168404Spjd}
2270168404Spjd
2271168404Spjd/*
2272168404Spjd * Remove a directory subdir entry.  If the current working
2273168404Spjd * directory is the same as the subdir to be removed, the
2274168404Spjd * remove will fail.
2275168404Spjd *
2276168404Spjd *	IN:	dvp	- vnode of directory to remove from.
2277168404Spjd *		name	- name of directory to be removed.
2278168404Spjd *		cwd	- vnode of current working directory.
2279168404Spjd *		cr	- credentials of caller.
2280185029Spjd *		ct	- caller context
2281185029Spjd *		flags	- case flags
2282168404Spjd *
2283251631Sdelphij *	RETURN:	0 on success, error code on failure.
2284168404Spjd *
2285168404Spjd * Timestamps:
2286168404Spjd *	dvp - ctime|mtime updated
2287168404Spjd */
2288185029Spjd/*ARGSUSED*/
2289168404Spjdstatic int
2290185029Spjdzfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
2291185029Spjd    caller_context_t *ct, int flags)
2292168404Spjd{
2293168404Spjd	znode_t		*dzp = VTOZ(dvp);
2294168404Spjd	znode_t		*zp;
2295168404Spjd	vnode_t		*vp;
2296168404Spjd	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2297185029Spjd	zilog_t		*zilog;
2298168404Spjd	zfs_dirlock_t	*dl;
2299168404Spjd	dmu_tx_t	*tx;
2300168404Spjd	int		error;
2301185029Spjd	int		zflg = ZEXISTS;
2302258632Savg	boolean_t	waited = B_FALSE;
2303168404Spjd
2304168962Spjd	ZFS_ENTER(zfsvfs);
2305185029Spjd	ZFS_VERIFY_ZP(dzp);
2306185029Spjd	zilog = zfsvfs->z_log;
2307168404Spjd
2308185029Spjd	if (flags & FIGNORECASE)
2309185029Spjd		zflg |= ZCILOOK;
	/*
	 * Retry point: re-entered after dmu_tx_wait() when dmu_tx_assign()
	 * returned ERESTART.  All locks and the vnode hold are dropped
	 * before jumping back and are re-acquired from here.
	 */
2310168404Spjdtop:
2311168404Spjd	zp = NULL;
2312168404Spjd
2313168404Spjd	/*
2314168404Spjd	 * Attempt to lock directory; fail if entry doesn't exist.
2315168404Spjd	 */
2316185029Spjd	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
2317185029Spjd	    NULL, NULL)) {
2318168404Spjd		ZFS_EXIT(zfsvfs);
2319168404Spjd		return (error);
2320168404Spjd	}
2321168404Spjd
2322168404Spjd	vp = ZTOV(zp);
2323168404Spjd
2324168404Spjd	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
2325168404Spjd		goto out;
2326168404Spjd	}
2327168404Spjd
2328168962Spjd	if (vp->v_type != VDIR) {
2329249195Smm		error = SET_ERROR(ENOTDIR);
2330168962Spjd		goto out;
2331168962Spjd	}
2332168962Spjd
	/* Refuse to remove the caller's current working directory. */
2333168962Spjd	if (vp == cwd) {
2334249195Smm		error = SET_ERROR(EINVAL);
2335168962Spjd		goto out;
2336168962Spjd	}
2337168962Spjd
2338185029Spjd	vnevent_rmdir(vp, dvp, name, ct);
2339168962Spjd
2340168404Spjd	/*
2341168404Spjd	 * Grab a lock on the directory to make sure that no one is
2342168404Spjd	 * trying to add (or lookup) entries while we are removing it.
2343168404Spjd	 */
2344168404Spjd	rw_enter(&zp->z_name_lock, RW_WRITER);
2345168404Spjd
2346168404Spjd	/*
2347168404Spjd	 * Grab a lock on the parent pointer to make sure we play well
2348168404Spjd	 * with the treewalk and directory rename code.
2349168404Spjd	 */
2350168404Spjd	rw_enter(&zp->z_parent_lock, RW_WRITER);
2351168404Spjd
2352168404Spjd	tx = dmu_tx_create(zfsvfs->z_os);
2353168404Spjd	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2354219089Spjd	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2355168404Spjd	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2356219089Spjd	zfs_sa_upgrade_txholds(tx, zp);
2357219089Spjd	zfs_sa_upgrade_txholds(tx, dzp);
	/*
	 * First attempt uses TXG_NOWAIT; once a pass has gone through
	 * dmu_tx_wait(), 'waited' makes retries use TXG_WAITED.
	 */
2358258632Savg	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
2359168404Spjd	if (error) {
		/*
		 * Undo everything acquired since 'top', in reverse order
		 * of acquisition, before retrying or failing.
		 */
2360168404Spjd		rw_exit(&zp->z_parent_lock);
2361168404Spjd		rw_exit(&zp->z_name_lock);
2362168404Spjd		zfs_dirent_unlock(dl);
2363168962Spjd		VN_RELE(vp);
2364209962Smm		if (error == ERESTART) {
2365258632Savg			waited = B_TRUE;
2366168404Spjd			dmu_tx_wait(tx);
2367168404Spjd			dmu_tx_abort(tx);
2368168404Spjd			goto top;
2369168404Spjd		}
2370168404Spjd		dmu_tx_abort(tx);
2371168404Spjd		ZFS_EXIT(zfsvfs);
2372168404Spjd		return (error);
2373168404Spjd	}
2374168404Spjd
2375168404Spjd#ifdef FREEBSD_NAMECACHE
2376168404Spjd	cache_purge(dvp);
2377168404Spjd#endif
2378168404Spjd
2379185029Spjd	error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
2380168404Spjd
	/*
	 * The removal is logged only if the link was destroyed; the
	 * transaction is committed either way.
	 */
2381185029Spjd	if (error == 0) {
2382185029Spjd		uint64_t txtype = TX_RMDIR;
2383185029Spjd		if (flags & FIGNORECASE)
2384185029Spjd			txtype |= TX_CI;
2385219089Spjd		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
2386185029Spjd	}
2387168404Spjd
2388168404Spjd	dmu_tx_commit(tx);
2389168404Spjd
2390168404Spjd	rw_exit(&zp->z_parent_lock);
2391168404Spjd	rw_exit(&zp->z_name_lock);
2392168404Spjd#ifdef FREEBSD_NAMECACHE
2393168404Spjd	cache_purge(vp);
2394168404Spjd#endif
	/*
	 * Common exit for paths that hold only the dirent lock and the
	 * vnode reference.
	 */
2395168404Spjdout:
2396168404Spjd	zfs_dirent_unlock(dl);
2397168404Spjd
2398168962Spjd	VN_RELE(vp);
2399168962Spjd
	/* With sync=always the intent log is committed before returning. */
2400219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2401219089Spjd		zil_commit(zilog, 0);
2402219089Spjd
2403168404Spjd	ZFS_EXIT(zfsvfs);
2404168404Spjd	return (error);
2405168404Spjd}
2406168404Spjd
2407168404Spjd/*
2408168404Spjd * Read as many directory entries as will fit into the provided
2409168404Spjd * buffer from the given directory cursor position (specified in
2410251631Sdelphij * the uio structure).
2411168404Spjd *
2412168404Spjd *	IN:	vp	- vnode of directory to read.
2413168404Spjd *		uio	- structure supplying read location, range info,
2414168404Spjd *			  and return buffer.
2415168404Spjd *		cr	- credentials of caller.
2416185029Spjd *		ct	- caller context
2417185029Spjd *		flags	- case flags
2418168404Spjd *
2419168404Spjd *	OUT:	uio	- updated offset and range, buffer filled.
2420168404Spjd *		eofp	- set to true if end-of-file detected.
2421168404Spjd *
2422251631Sdelphij *	RETURN:	0 on success, error code on failure.
2423168404Spjd *
2424168404Spjd * Timestamps:
2425168404Spjd *	vp - atime updated
2426168404Spjd *
2427168404Spjd * Note that the low 4 bits of the cookie returned by zap is always zero.
2428168404Spjd * This allows us to use the low range for "special" directory entries:
2429168404Spjd * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
2430168404Spjd * we use the offset 2 for the '.zfs' directory.
2431168404Spjd */
2432168404Spjd/* ARGSUSED */
2433168404Spjdstatic int
2434168962Spjdzfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
2435168404Spjd{
2436168404Spjd	znode_t		*zp = VTOZ(vp);
2437168404Spjd	iovec_t		*iovp;
2438185029Spjd	edirent_t	*eodp;
2439168404Spjd	dirent64_t	*odp;
2440168404Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2441168404Spjd	objset_t	*os;
2442168404Spjd	caddr_t		outbuf;
2443168404Spjd	size_t		bufsize;
2444168404Spjd	zap_cursor_t	zc;
2445168404Spjd	zap_attribute_t	zap;
2446168404Spjd	uint_t		bytes_wanted;
2447168404Spjd	uint64_t	offset; /* must be unsigned; checks for < 1 */
2448219089Spjd	uint64_t	parent;
2449168404Spjd	int		local_eof;
2450168404Spjd	int		outcount;
2451168404Spjd	int		error;
2452168404Spjd	uint8_t		prefetch;
2453185029Spjd	boolean_t	check_sysattrs;
2454168404Spjd	uint8_t		type;
2455168962Spjd	int		ncooks;
2456168962Spjd	u_long		*cooks = NULL;
2457185029Spjd	int		flags = 0;
2458168404Spjd
2459168404Spjd	ZFS_ENTER(zfsvfs);
2460185029Spjd	ZFS_VERIFY_ZP(zp);
2461168404Spjd
2462219089Spjd	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
2463219089Spjd	    &parent, sizeof (parent))) != 0) {
2464219089Spjd		ZFS_EXIT(zfsvfs);
2465219089Spjd		return (error);
2466219089Spjd	}
2467219089Spjd
2468168404Spjd	/*
2469168404Spjd	 * If we are not given an eof variable,
2470168404Spjd	 * use a local one.
2471168404Spjd	 */
2472168404Spjd	if (eofp == NULL)
2473168404Spjd		eofp = &local_eof;
2474168404Spjd
2475168404Spjd	/*
2476168404Spjd	 * Check for valid iov_len.
2477168404Spjd	 */
2478168404Spjd	if (uio->uio_iov->iov_len <= 0) {
2479168404Spjd		ZFS_EXIT(zfsvfs);
2480249195Smm		return (SET_ERROR(EINVAL));
2481168404Spjd	}
2482168404Spjd
2483168404Spjd	/*
2484168404Spjd	 * Quit if directory has been removed (posix)
2485168404Spjd	 */
2486168404Spjd	if ((*eofp = zp->z_unlinked) != 0) {
2487168404Spjd		ZFS_EXIT(zfsvfs);
2488168404Spjd		return (0);
2489168404Spjd	}
2490168404Spjd
2491168404Spjd	error = 0;
2492168404Spjd	os = zfsvfs->z_os;
2493168404Spjd	offset = uio->uio_loffset;
2494168404Spjd	prefetch = zp->z_zn_prefetch;
2495168404Spjd
2496168404Spjd	/*
2497168404Spjd	 * Initialize the iterator cursor.
2498168404Spjd	 */
2499168404Spjd	if (offset <= 3) {
2500168404Spjd		/*
2501168404Spjd		 * Start iteration from the beginning of the directory.
2502168404Spjd		 */
2503168404Spjd		zap_cursor_init(&zc, os, zp->z_id);
2504168404Spjd	} else {
2505168404Spjd		/*
2506168404Spjd		 * The offset is a serialized cursor.
2507168404Spjd		 */
2508168404Spjd		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
2509168404Spjd	}
2510168404Spjd
2511168404Spjd	/*
2512168404Spjd	 * Get space to change directory entries into fs independent format.
2513168404Spjd	 */
2514168404Spjd	iovp = uio->uio_iov;
2515168404Spjd	bytes_wanted = iovp->iov_len;
2516168404Spjd	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
2517168404Spjd		bufsize = bytes_wanted;
2518168404Spjd		outbuf = kmem_alloc(bufsize, KM_SLEEP);
2519168404Spjd		odp = (struct dirent64 *)outbuf;
2520168404Spjd	} else {
2521168404Spjd		bufsize = bytes_wanted;
2522247187Smm		outbuf = NULL;
2523168404Spjd		odp = (struct dirent64 *)iovp->iov_base;
2524168404Spjd	}
2525185029Spjd	eodp = (struct edirent *)odp;
2526168404Spjd
2527169170Spjd	if (ncookies != NULL) {
2528168404Spjd		/*
2529168404Spjd		 * Minimum entry size is dirent size and 1 byte for a file name.
2530168404Spjd		 */
2531168962Spjd		ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
2532219404Spjd		cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
2533219404Spjd		*cookies = cooks;
2534168962Spjd		*ncookies = ncooks;
2535168404Spjd	}
2536185029Spjd	/*
2537185029Spjd	 * If this VFS supports the system attribute view interface; and
2538185029Spjd	 * we're looking at an extended attribute directory; and we care
2539185029Spjd	 * about normalization conflicts on this vfs; then we must check
2540185029Spjd	 * for normalization conflicts with the sysattr name space.
2541185029Spjd	 */
2542185029Spjd#ifdef TODO
2543185029Spjd	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
2544185029Spjd	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
2545185029Spjd	    (flags & V_RDDIR_ENTFLAGS);
2546185029Spjd#else
2547185029Spjd	check_sysattrs = 0;
2548185029Spjd#endif
2549168404Spjd
2550168404Spjd	/*
2551168404Spjd	 * Transform to file-system independent format
2552168404Spjd	 */
2553168404Spjd	outcount = 0;
2554168404Spjd	while (outcount < bytes_wanted) {
2555168404Spjd		ino64_t objnum;
2556168404Spjd		ushort_t reclen;
2557219089Spjd		off64_t *next = NULL;
2558168404Spjd
2559168404Spjd		/*
2560168404Spjd		 * Special case `.', `..', and `.zfs'.
2561168404Spjd		 */
2562168404Spjd		if (offset == 0) {
2563168404Spjd			(void) strcpy(zap.za_name, ".");
2564185029Spjd			zap.za_normalization_conflict = 0;
2565168404Spjd			objnum = zp->z_id;
2566169108Spjd			type = DT_DIR;
2567168404Spjd		} else if (offset == 1) {
2568168404Spjd			(void) strcpy(zap.za_name, "..");
2569185029Spjd			zap.za_normalization_conflict = 0;
2570219089Spjd			objnum = parent;
2571169108Spjd			type = DT_DIR;
2572168404Spjd		} else if (offset == 2 && zfs_show_ctldir(zp)) {
2573168404Spjd			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
2574185029Spjd			zap.za_normalization_conflict = 0;
2575168404Spjd			objnum = ZFSCTL_INO_ROOT;
2576169108Spjd			type = DT_DIR;
2577168404Spjd		} else {
2578168404Spjd			/*
2579168404Spjd			 * Grab next entry.
2580168404Spjd			 */
2581168404Spjd			if (error = zap_cursor_retrieve(&zc, &zap)) {
2582168404Spjd				if ((*eofp = (error == ENOENT)) != 0)
2583168404Spjd					break;
2584168404Spjd				else
2585168404Spjd					goto update;
2586168404Spjd			}
2587168404Spjd
2588168404Spjd			if (zap.za_integer_length != 8 ||
2589168404Spjd			    zap.za_num_integers != 1) {
2590168404Spjd				cmn_err(CE_WARN, "zap_readdir: bad directory "
2591168404Spjd				    "entry, obj = %lld, offset = %lld\n",
2592168404Spjd				    (u_longlong_t)zp->z_id,
2593168404Spjd				    (u_longlong_t)offset);
2594249195Smm				error = SET_ERROR(ENXIO);
2595168404Spjd				goto update;
2596168404Spjd			}
2597168404Spjd
2598168404Spjd			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
2599168404Spjd			/*
2600168404Spjd			 * MacOS X can extract the object type here such as:
2601168404Spjd			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2602168404Spjd			 */
2603168404Spjd			type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2604185029Spjd
2605185029Spjd			if (check_sysattrs && !zap.za_normalization_conflict) {
2606185029Spjd#ifdef TODO
2607185029Spjd				zap.za_normalization_conflict =
2608185029Spjd				    xattr_sysattr_casechk(zap.za_name);
2609185029Spjd#else
2610185029Spjd				panic("%s:%u: TODO", __func__, __LINE__);
2611185029Spjd#endif
2612185029Spjd			}
2613168404Spjd		}
2614168404Spjd
2615211932Smm		if (flags & V_RDDIR_ACCFILTER) {
2616211932Smm			/*
2617211932Smm			 * If we have no access at all, don't include
2618211932Smm			 * this entry in the returned information
2619211932Smm			 */
2620211932Smm			znode_t	*ezp;
2621211932Smm			if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
2622211932Smm				goto skip_entry;
2623211932Smm			if (!zfs_has_access(ezp, cr)) {
2624211932Smm				VN_RELE(ZTOV(ezp));
2625211932Smm				goto skip_entry;
2626211932Smm			}
2627211932Smm			VN_RELE(ZTOV(ezp));
2628211932Smm		}
2629211932Smm
2630185029Spjd		if (flags & V_RDDIR_ENTFLAGS)
2631185029Spjd			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
2632185029Spjd		else
2633185029Spjd			reclen = DIRENT64_RECLEN(strlen(zap.za_name));
2634185029Spjd
2635168404Spjd		/*
2636168404Spjd		 * Will this entry fit in the buffer?
2637168404Spjd		 */
2638168404Spjd		if (outcount + reclen > bufsize) {
2639168404Spjd			/*
2640168404Spjd			 * Did we manage to fit anything in the buffer?
2641168404Spjd			 */
2642168404Spjd			if (!outcount) {
2643249195Smm				error = SET_ERROR(EINVAL);
2644168404Spjd				goto update;
2645168404Spjd			}
2646168404Spjd			break;
2647168404Spjd		}
2648185029Spjd		if (flags & V_RDDIR_ENTFLAGS) {
2649185029Spjd			/*
2650185029Spjd			 * Add extended flag entry:
2651185029Spjd			 */
2652185029Spjd			eodp->ed_ino = objnum;
2653185029Spjd			eodp->ed_reclen = reclen;
2654185029Spjd			/* NOTE: ed_off is the offset for the *next* entry */
2655185029Spjd			next = &(eodp->ed_off);
2656185029Spjd			eodp->ed_eflags = zap.za_normalization_conflict ?
2657185029Spjd			    ED_CASE_CONFLICT : 0;
2658185029Spjd			(void) strncpy(eodp->ed_name, zap.za_name,
2659185029Spjd			    EDIRENT_NAMELEN(reclen));
2660185029Spjd			eodp = (edirent_t *)((intptr_t)eodp + reclen);
2661185029Spjd		} else {
2662185029Spjd			/*
2663185029Spjd			 * Add normal entry:
2664185029Spjd			 */
2665185029Spjd			odp->d_ino = objnum;
2666185029Spjd			odp->d_reclen = reclen;
2667185029Spjd			odp->d_namlen = strlen(zap.za_name);
2668185029Spjd			(void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
2669185029Spjd			odp->d_type = type;
2670185029Spjd			odp = (dirent64_t *)((intptr_t)odp + reclen);
2671185029Spjd		}
2672168404Spjd		outcount += reclen;
2673168404Spjd
2674168404Spjd		ASSERT(outcount <= bufsize);
2675168404Spjd
2676168404Spjd		/* Prefetch znode */
2677168404Spjd		if (prefetch)
2678168404Spjd			dmu_prefetch(os, objnum, 0, 0);
2679168404Spjd
2680211932Smm	skip_entry:
2681168404Spjd		/*
2682168404Spjd		 * Move to the next entry, fill in the previous offset.
2683168404Spjd		 */
2684168404Spjd		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2685168404Spjd			zap_cursor_advance(&zc);
2686168404Spjd			offset = zap_cursor_serialize(&zc);
2687168404Spjd		} else {
2688168404Spjd			offset += 1;
2689168404Spjd		}
2690219404Spjd
2691219404Spjd		if (cooks != NULL) {
2692219404Spjd			*cooks++ = offset;
2693219404Spjd			ncooks--;
2694219404Spjd			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
2695219404Spjd		}
2696168404Spjd	}
2697168404Spjd	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2698168404Spjd
2699168404Spjd	/* Subtract unused cookies */
2700168962Spjd	if (ncookies != NULL)
2701168962Spjd		*ncookies -= ncooks;
2702168404Spjd
2703168404Spjd	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
2704168404Spjd		iovp->iov_base += outcount;
2705168404Spjd		iovp->iov_len -= outcount;
2706168404Spjd		uio->uio_resid -= outcount;
2707168404Spjd	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
2708168404Spjd		/*
2709168404Spjd		 * Reset the pointer.
2710168404Spjd		 */
2711168404Spjd		offset = uio->uio_loffset;
2712168404Spjd	}
2713168404Spjd
2714168404Spjdupdate:
2715168404Spjd	zap_cursor_fini(&zc);
2716168404Spjd	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
2717168404Spjd		kmem_free(outbuf, bufsize);
2718168404Spjd
2719168404Spjd	if (error == ENOENT)
2720168404Spjd		error = 0;
2721168404Spjd
2722168404Spjd	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2723168404Spjd
2724168404Spjd	uio->uio_loffset = offset;
2725168404Spjd	ZFS_EXIT(zfsvfs);
2726169107Spjd	if (error != 0 && cookies != NULL) {
2727168962Spjd		free(*cookies, M_TEMP);
2728168962Spjd		*cookies = NULL;
2729168962Spjd		*ncookies = 0;
2730168404Spjd	}
2731168404Spjd	return (error);
2732168404Spjd}
2733168404Spjd
2734185029Spjdulong_t zfs_fsync_sync_cnt = 4;
2735185029Spjd
2736168404Spjdstatic int
2737185029Spjdzfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
2738168404Spjd{
2739168962Spjd	znode_t	*zp = VTOZ(vp);
2740168962Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2741168404Spjd
2742185029Spjd	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2743185029Spjd
2744219089Spjd	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
2745219089Spjd		ZFS_ENTER(zfsvfs);
2746219089Spjd		ZFS_VERIFY_ZP(zp);
2747219089Spjd		zil_commit(zfsvfs->z_log, zp->z_id);
2748219089Spjd		ZFS_EXIT(zfsvfs);
2749219089Spjd	}
2750168404Spjd	return (0);
2751168404Spjd}
2752168404Spjd
2753185029Spjd
2754168404Spjd/*
2755168404Spjd * Get the requested file attributes and place them in the provided
2756168404Spjd * vattr structure.
2757168404Spjd *
2758168404Spjd *	IN:	vp	- vnode of file.
2759168404Spjd *		vap	- va_mask identifies requested attributes.
2760185029Spjd *			  If AT_XVATTR set, then optional attrs are requested
2761185029Spjd *		flags	- ATTR_NOACLCHECK (CIFS server context)
2762168404Spjd *		cr	- credentials of caller.
2763185029Spjd *		ct	- caller context
2764168404Spjd *
2765168404Spjd *	OUT:	vap	- attribute values.
2766168404Spjd *
2767251631Sdelphij *	RETURN:	0 (always succeeds).
2768168404Spjd */
2769168404Spjd/* ARGSUSED */
2770168404Spjdstatic int
2771185029Spjdzfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2772185029Spjd    caller_context_t *ct)
2773168404Spjd{
2774168962Spjd	znode_t *zp = VTOZ(vp);
2775168962Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2776185029Spjd	int	error = 0;
2777168962Spjd	uint32_t blksize;
2778168962Spjd	u_longlong_t nblocks;
2779185029Spjd	uint64_t links;
2780224251Sdelphij	uint64_t mtime[2], ctime[2], crtime[2], rdev;
2781185029Spjd	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
2782185029Spjd	xoptattr_t *xoap = NULL;
2783185029Spjd	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2784224251Sdelphij	sa_bulk_attr_t bulk[4];
2785219089Spjd	int count = 0;
2786168404Spjd
2787168404Spjd	ZFS_ENTER(zfsvfs);
2788185029Spjd	ZFS_VERIFY_ZP(zp);
2789168404Spjd
2790219089Spjd	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
2791219089Spjd
2792219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
2793219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
2794243807Sdelphij	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
2795224251Sdelphij	if (vp->v_type == VBLK || vp->v_type == VCHR)
2796224251Sdelphij		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
2797224251Sdelphij		    &rdev, 8);
2798219089Spjd
2799219089Spjd	if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
2800219089Spjd		ZFS_EXIT(zfsvfs);
2801219089Spjd		return (error);
2802219089Spjd	}
2803219089Spjd
2804168404Spjd	/*
2805185029Spjd	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
2806185029Spjd	 * Also, if we are the owner don't bother, since owner should
2807185029Spjd	 * always be allowed to read basic attributes of file.
2808185029Spjd	 */
2809219089Spjd	if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
2810219089Spjd	    (vap->va_uid != crgetuid(cr))) {
2811185029Spjd		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
2812185029Spjd		    skipaclchk, cr)) {
2813185029Spjd			ZFS_EXIT(zfsvfs);
2814185029Spjd			return (error);
2815185029Spjd		}
2816185029Spjd	}
2817185029Spjd
2818185029Spjd	/*
2819168404Spjd	 * Return all attributes.  It's cheaper to provide the answer
2820168404Spjd	 * than to determine whether we were asked the question.
2821168404Spjd	 */
2822168404Spjd
2823209097Smm	mutex_enter(&zp->z_lock);
2824219089Spjd	vap->va_type = IFTOVT(zp->z_mode);
2825219089Spjd	vap->va_mode = zp->z_mode & ~S_IFMT;
2826224252Sdelphij#ifdef sun
2827224252Sdelphij	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
2828224252Sdelphij#else
2829224252Sdelphij	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
2830224252Sdelphij#endif
2831168404Spjd	vap->va_nodeid = zp->z_id;
2832185029Spjd	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
2833219089Spjd		links = zp->z_links + 1;
2834185029Spjd	else
2835219089Spjd		links = zp->z_links;
2836229425Sdim	vap->va_nlink = MIN(links, LINK_MAX);	/* nlink_t limit! */
2837219089Spjd	vap->va_size = zp->z_size;
2838224252Sdelphij#ifdef sun
2839224252Sdelphij	vap->va_rdev = vp->v_rdev;
2840224252Sdelphij#else
2841224251Sdelphij	if (vp->v_type == VBLK || vp->v_type == VCHR)
2842224251Sdelphij		vap->va_rdev = zfs_cmpldev(rdev);
2843224252Sdelphij#endif
2844168404Spjd	vap->va_seq = zp->z_seq;
2845168404Spjd	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */
2846272467Saraujo     	vap->va_filerev = zp->z_seq;
2847168404Spjd
2848185029Spjd	/*
2849185029Spjd	 * Add in any requested optional attributes and the create time.
2850185029Spjd	 * Also set the corresponding bits in the returned attribute bitmap.
2851185029Spjd	 */
2852185029Spjd	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
2853185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
2854185029Spjd			xoap->xoa_archive =
2855219089Spjd			    ((zp->z_pflags & ZFS_ARCHIVE) != 0);
2856185029Spjd			XVA_SET_RTN(xvap, XAT_ARCHIVE);
2857185029Spjd		}
2858185029Spjd
2859185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
2860185029Spjd			xoap->xoa_readonly =
2861219089Spjd			    ((zp->z_pflags & ZFS_READONLY) != 0);
2862185029Spjd			XVA_SET_RTN(xvap, XAT_READONLY);
2863185029Spjd		}
2864185029Spjd
2865185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
2866185029Spjd			xoap->xoa_system =
2867219089Spjd			    ((zp->z_pflags & ZFS_SYSTEM) != 0);
2868185029Spjd			XVA_SET_RTN(xvap, XAT_SYSTEM);
2869185029Spjd		}
2870185029Spjd
2871185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
2872185029Spjd			xoap->xoa_hidden =
2873219089Spjd			    ((zp->z_pflags & ZFS_HIDDEN) != 0);
2874185029Spjd			XVA_SET_RTN(xvap, XAT_HIDDEN);
2875185029Spjd		}
2876185029Spjd
2877185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2878185029Spjd			xoap->xoa_nounlink =
2879219089Spjd			    ((zp->z_pflags & ZFS_NOUNLINK) != 0);
2880185029Spjd			XVA_SET_RTN(xvap, XAT_NOUNLINK);
2881185029Spjd		}
2882185029Spjd
2883185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2884185029Spjd			xoap->xoa_immutable =
2885219089Spjd			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
2886185029Spjd			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
2887185029Spjd		}
2888185029Spjd
2889185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2890185029Spjd			xoap->xoa_appendonly =
2891219089Spjd			    ((zp->z_pflags & ZFS_APPENDONLY) != 0);
2892185029Spjd			XVA_SET_RTN(xvap, XAT_APPENDONLY);
2893185029Spjd		}
2894185029Spjd
2895185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2896185029Spjd			xoap->xoa_nodump =
2897219089Spjd			    ((zp->z_pflags & ZFS_NODUMP) != 0);
2898185029Spjd			XVA_SET_RTN(xvap, XAT_NODUMP);
2899185029Spjd		}
2900185029Spjd
2901185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
2902185029Spjd			xoap->xoa_opaque =
2903219089Spjd			    ((zp->z_pflags & ZFS_OPAQUE) != 0);
2904185029Spjd			XVA_SET_RTN(xvap, XAT_OPAQUE);
2905185029Spjd		}
2906185029Spjd
2907185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2908185029Spjd			xoap->xoa_av_quarantined =
2909219089Spjd			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
2910185029Spjd			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
2911185029Spjd		}
2912185029Spjd
2913185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2914185029Spjd			xoap->xoa_av_modified =
2915219089Spjd			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
2916185029Spjd			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
2917185029Spjd		}
2918185029Spjd
2919185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
2920219089Spjd		    vp->v_type == VREG) {
2921219089Spjd			zfs_sa_get_scanstamp(zp, xvap);
2922185029Spjd		}
2923185029Spjd
2924185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
2925219089Spjd			uint64_t times[2];
2926219089Spjd
2927219089Spjd			(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
2928219089Spjd			    times, sizeof (times));
2929219089Spjd			ZFS_TIME_DECODE(&xoap->xoa_createtime, times);
2930185029Spjd			XVA_SET_RTN(xvap, XAT_CREATETIME);
2931185029Spjd		}
2932219089Spjd
2933219089Spjd		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2934219089Spjd			xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
2935219089Spjd			XVA_SET_RTN(xvap, XAT_REPARSE);
2936219089Spjd		}
2937219089Spjd		if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
2938219089Spjd			xoap->xoa_generation = zp->z_gen;
2939219089Spjd			XVA_SET_RTN(xvap, XAT_GEN);
2940219089Spjd		}
2941219089Spjd
2942219089Spjd		if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
2943219089Spjd			xoap->xoa_offline =
2944219089Spjd			    ((zp->z_pflags & ZFS_OFFLINE) != 0);
2945219089Spjd			XVA_SET_RTN(xvap, XAT_OFFLINE);
2946219089Spjd		}
2947219089Spjd
2948219089Spjd		if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
2949219089Spjd			xoap->xoa_sparse =
2950219089Spjd			    ((zp->z_pflags & ZFS_SPARSE) != 0);
2951219089Spjd			XVA_SET_RTN(xvap, XAT_SPARSE);
2952219089Spjd		}
2953185029Spjd	}
2954185029Spjd
2955219089Spjd	ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
2956219089Spjd	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
2957219089Spjd	ZFS_TIME_DECODE(&vap->va_ctime, ctime);
2958219089Spjd	ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
2959168404Spjd
2960168404Spjd	mutex_exit(&zp->z_lock);
2961168404Spjd
2962219089Spjd	sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
2963168404Spjd	vap->va_blksize = blksize;
2964168404Spjd	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */
2965168404Spjd
2966168404Spjd	if (zp->z_blksz == 0) {
2967168404Spjd		/*
2968168404Spjd		 * Block size hasn't been set; suggest maximal I/O transfers.
2969168404Spjd		 */
2970168404Spjd		vap->va_blksize = zfsvfs->z_max_blksz;
2971168404Spjd	}
2972168404Spjd
2973168404Spjd	ZFS_EXIT(zfsvfs);
2974168404Spjd	return (0);
2975168404Spjd}
2976168404Spjd
2977168404Spjd/*
2978168404Spjd * Set the file attributes to the values contained in the
2979168404Spjd * vattr structure.
2980168404Spjd *
2981168404Spjd *	IN:	vp	- vnode of file to be modified.
2982168404Spjd *		vap	- new attribute values.
2983185029Spjd *			  If AT_XVATTR set, then optional attrs are being set
2984168404Spjd *		flags	- ATTR_UTIME set if non-default time values provided.
2985185029Spjd *			- ATTR_NOACLCHECK (CIFS context only).
2986168404Spjd *		cr	- credentials of caller.
2987185029Spjd *		ct	- caller context
2988168404Spjd *
2989251631Sdelphij *	RETURN:	0 on success, error code on failure.
2990168404Spjd *
2991168404Spjd * Timestamps:
2992168404Spjd *	vp - ctime updated, mtime updated if size changed.
2993168404Spjd */
2994168404Spjd/* ARGSUSED */
2995168404Spjdstatic int
2996168962Spjdzfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2997251631Sdelphij    caller_context_t *ct)
2998168404Spjd{
2999185029Spjd	znode_t		*zp = VTOZ(vp);
3000168404Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
3001185029Spjd	zilog_t		*zilog;
3002168404Spjd	dmu_tx_t	*tx;
3003168404Spjd	vattr_t		oldva;
3004209962Smm	xvattr_t	tmpxvattr;
3005168962Spjd	uint_t		mask = vap->va_mask;
3006247187Smm	uint_t		saved_mask = 0;
3007197831Spjd	uint64_t	saved_mode;
3008168404Spjd	int		trim_mask = 0;
3009168404Spjd	uint64_t	new_mode;
3010209962Smm	uint64_t	new_uid, new_gid;
3011219089Spjd	uint64_t	xattr_obj;
3012219089Spjd	uint64_t	mtime[2], ctime[2];
3013168404Spjd	znode_t		*attrzp;
3014168404Spjd	int		need_policy = FALSE;
3015219089Spjd	int		err, err2;
3016185029Spjd	zfs_fuid_info_t *fuidp = NULL;
3017185029Spjd	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
3018185029Spjd	xoptattr_t	*xoap;
3019219089Spjd	zfs_acl_t	*aclp;
3020185029Spjd	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
3021219089Spjd	boolean_t	fuid_dirtied = B_FALSE;
3022219089Spjd	sa_bulk_attr_t	bulk[7], xattr_bulk[7];
3023219089Spjd	int		count = 0, xattr_count = 0;
3024168404Spjd
3025168404Spjd	if (mask == 0)
3026168404Spjd		return (0);
3027168404Spjd
3028168962Spjd	if (mask & AT_NOSET)
3029249195Smm		return (SET_ERROR(EINVAL));
3030168962Spjd
3031185029Spjd	ZFS_ENTER(zfsvfs);
3032185029Spjd	ZFS_VERIFY_ZP(zp);
3033185029Spjd
3034185029Spjd	zilog = zfsvfs->z_log;
3035185029Spjd
3036185029Spjd	/*
3037185029Spjd	 * Make sure that if we have ephemeral uid/gid or xvattr specified
3038185029Spjd	 * that file system is at proper version level
3039185029Spjd	 */
3040185029Spjd
3041185029Spjd	if (zfsvfs->z_use_fuids == B_FALSE &&
3042185029Spjd	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
3043185029Spjd	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
3044185029Spjd	    (mask & AT_XVATTR))) {
3045185029Spjd		ZFS_EXIT(zfsvfs);
3046249195Smm		return (SET_ERROR(EINVAL));
3047185029Spjd	}
3048185029Spjd
3049185029Spjd	if (mask & AT_SIZE && vp->v_type == VDIR) {
3050185029Spjd		ZFS_EXIT(zfsvfs);
3051249195Smm		return (SET_ERROR(EISDIR));
3052185029Spjd	}
3053168404Spjd
3054185029Spjd	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
3055185029Spjd		ZFS_EXIT(zfsvfs);
3056249195Smm		return (SET_ERROR(EINVAL));
3057185029Spjd	}
3058168404Spjd
3059185029Spjd	/*
3060185029Spjd	 * If this is an xvattr_t, then get a pointer to the structure of
3061185029Spjd	 * optional attributes.  If this is NULL, then we have a vattr_t.
3062185029Spjd	 */
3063185029Spjd	xoap = xva_getxoptattr(xvap);
3064168404Spjd
3065209962Smm	xva_init(&tmpxvattr);
3066209962Smm
3067185029Spjd	/*
3068185029Spjd	 * Immutable files can only alter immutable bit and atime
3069185029Spjd	 */
3070219089Spjd	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
3071185029Spjd	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
3072185029Spjd	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
3073185029Spjd		ZFS_EXIT(zfsvfs);
3074249195Smm		return (SET_ERROR(EPERM));
3075185029Spjd	}
3076185029Spjd
3077219089Spjd	if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
3078185029Spjd		ZFS_EXIT(zfsvfs);
3079249195Smm		return (SET_ERROR(EPERM));
3080185029Spjd	}
3081185029Spjd
3082185029Spjd	/*
3083185029Spjd	 * Verify timestamps doesn't overflow 32 bits.
3084185029Spjd	 * ZFS can handle large timestamps, but 32bit syscalls can't
3085185029Spjd	 * handle times greater than 2039.  This check should be removed
3086185029Spjd	 * once large timestamps are fully supported.
3087185029Spjd	 */
3088185029Spjd	if (mask & (AT_ATIME | AT_MTIME)) {
3089185029Spjd		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
3090185029Spjd		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
3091185029Spjd			ZFS_EXIT(zfsvfs);
3092249195Smm			return (SET_ERROR(EOVERFLOW));
3093185029Spjd		}
3094185029Spjd	}
3095185029Spjd
3096168404Spjdtop:
3097168404Spjd	attrzp = NULL;
3098219089Spjd	aclp = NULL;
3099168404Spjd
3100211932Smm	/* Can this be moved to before the top label? */
3101168404Spjd	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
3102168404Spjd		ZFS_EXIT(zfsvfs);
3103249195Smm		return (SET_ERROR(EROFS));
3104168404Spjd	}
3105168404Spjd
3106168404Spjd	/*
3107168404Spjd	 * First validate permissions
3108168404Spjd	 */
3109168404Spjd
3110168404Spjd	if (mask & AT_SIZE) {
3111168404Spjd		/*
3112168404Spjd		 * XXX - Note, we are not providing any open
3113168404Spjd		 * mode flags here (like FNDELAY), so we may
3114168404Spjd		 * block if there are locks present... this
3115168404Spjd		 * should be addressed in openat().
3116168404Spjd		 */
3117185029Spjd		/* XXX - would it be OK to generate a log record here? */
3118185029Spjd		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
3119168404Spjd		if (err) {
3120168404Spjd			ZFS_EXIT(zfsvfs);
3121168404Spjd			return (err);
3122168404Spjd		}
3123168404Spjd	}
3124168404Spjd
3125185029Spjd	if (mask & (AT_ATIME|AT_MTIME) ||
3126185029Spjd	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
3127185029Spjd	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
3128185029Spjd	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
3129219089Spjd	    XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
3130219089Spjd	    XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
3131185029Spjd	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
3132219089Spjd	    XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
3133185029Spjd		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
3134185029Spjd		    skipaclchk, cr);
3135219089Spjd	}
3136168404Spjd
3137168404Spjd	if (mask & (AT_UID|AT_GID)) {
3138168404Spjd		int	idmask = (mask & (AT_UID|AT_GID));
3139168404Spjd		int	take_owner;
3140168404Spjd		int	take_group;
3141168404Spjd
3142168404Spjd		/*
3143168404Spjd		 * NOTE: even if a new mode is being set,
3144168404Spjd		 * we may clear S_ISUID/S_ISGID bits.
3145168404Spjd		 */
3146168404Spjd
3147168404Spjd		if (!(mask & AT_MODE))
3148219089Spjd			vap->va_mode = zp->z_mode;
3149168404Spjd
3150168404Spjd		/*
3151168404Spjd		 * Take ownership or chgrp to group we are a member of
3152168404Spjd		 */
3153168404Spjd
3154168404Spjd		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
3155185029Spjd		take_group = (mask & AT_GID) &&
3156185029Spjd		    zfs_groupmember(zfsvfs, vap->va_gid, cr);
3157168404Spjd
3158168404Spjd		/*
3159168404Spjd		 * If both AT_UID and AT_GID are set then take_owner and
3160168404Spjd		 * take_group must both be set in order to allow taking
3161168404Spjd		 * ownership.
3162168404Spjd		 *
3163168404Spjd		 * Otherwise, send the check through secpolicy_vnode_setattr()
3164168404Spjd		 *
3165168404Spjd		 */
3166168404Spjd
3167168404Spjd		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
3168168404Spjd		    ((idmask == AT_UID) && take_owner) ||
3169168404Spjd		    ((idmask == AT_GID) && take_group)) {
3170185029Spjd			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
3171185029Spjd			    skipaclchk, cr) == 0) {
3172168404Spjd				/*
3173168404Spjd				 * Remove setuid/setgid for non-privileged users
3174168404Spjd				 */
3175185029Spjd				secpolicy_setid_clear(vap, vp, cr);
3176168404Spjd				trim_mask = (mask & (AT_UID|AT_GID));
3177168404Spjd			} else {
3178168404Spjd				need_policy =  TRUE;
3179168404Spjd			}
3180168404Spjd		} else {
3181168404Spjd			need_policy =  TRUE;
3182168404Spjd		}
3183168404Spjd	}
3184168404Spjd
3185168404Spjd	mutex_enter(&zp->z_lock);
3186219089Spjd	oldva.va_mode = zp->z_mode;
3187185029Spjd	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
3188185029Spjd	if (mask & AT_XVATTR) {
3189209962Smm		/*
3190209962Smm		 * Update xvattr mask to include only those attributes
3191209962Smm		 * that are actually changing.
3192209962Smm		 *
3193209962Smm		 * the bits will be restored prior to actually setting
3194209962Smm		 * the attributes so the caller thinks they were set.
3195209962Smm		 */
3196209962Smm		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
3197209962Smm			if (xoap->xoa_appendonly !=
3198219089Spjd			    ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
3199209962Smm				need_policy = TRUE;
3200209962Smm			} else {
3201209962Smm				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
3202209962Smm				XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
3203209962Smm			}
3204209962Smm		}
3205209962Smm
3206209962Smm		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
3207209962Smm			if (xoap->xoa_nounlink !=
3208219089Spjd			    ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
3209209962Smm				need_policy = TRUE;
3210209962Smm			} else {
3211209962Smm				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
3212209962Smm				XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
3213209962Smm			}
3214209962Smm		}
3215209962Smm
3216209962Smm		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
3217209962Smm			if (xoap->xoa_immutable !=
3218219089Spjd			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
3219209962Smm				need_policy = TRUE;
3220209962Smm			} else {
3221209962Smm				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
3222209962Smm				XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
3223209962Smm			}
3224209962Smm		}
3225209962Smm
3226209962Smm		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
3227209962Smm			if (xoap->xoa_nodump !=
3228219089Spjd			    ((zp->z_pflags & ZFS_NODUMP) != 0)) {
3229209962Smm				need_policy = TRUE;
3230209962Smm			} else {
3231209962Smm				XVA_CLR_REQ(xvap, XAT_NODUMP);
3232209962Smm				XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
3233209962Smm			}
3234209962Smm		}
3235209962Smm
3236209962Smm		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
3237209962Smm			if (xoap->xoa_av_modified !=
3238219089Spjd			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
3239209962Smm				need_policy = TRUE;
3240209962Smm			} else {
3241209962Smm				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
3242209962Smm				XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
3243209962Smm			}
3244209962Smm		}
3245209962Smm
3246209962Smm		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
3247209962Smm			if ((vp->v_type != VREG &&
3248209962Smm			    xoap->xoa_av_quarantined) ||
3249209962Smm			    xoap->xoa_av_quarantined !=
3250219089Spjd			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
3251209962Smm				need_policy = TRUE;
3252209962Smm			} else {
3253209962Smm				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
3254209962Smm				XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
3255209962Smm			}
3256209962Smm		}
3257209962Smm
3258219089Spjd		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
3259219089Spjd			mutex_exit(&zp->z_lock);
3260219089Spjd			ZFS_EXIT(zfsvfs);
3261249195Smm			return (SET_ERROR(EPERM));
3262219089Spjd		}
3263219089Spjd
3264209962Smm		if (need_policy == FALSE &&
3265209962Smm		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
3266209962Smm		    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
3267185029Spjd			need_policy = TRUE;
3268185029Spjd		}
3269185029Spjd	}
3270185029Spjd
3271168404Spjd	mutex_exit(&zp->z_lock);
3272168404Spjd
3273168404Spjd	if (mask & AT_MODE) {
3274185029Spjd		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
3275168962Spjd			err = secpolicy_setid_setsticky_clear(vp, vap,
3276168962Spjd			    &oldva, cr);
3277168962Spjd			if (err) {
3278168962Spjd				ZFS_EXIT(zfsvfs);
3279168962Spjd				return (err);
3280168962Spjd			}
3281168404Spjd			trim_mask |= AT_MODE;
3282168404Spjd		} else {
3283168404Spjd			need_policy = TRUE;
3284168404Spjd		}
3285168404Spjd	}
3286168404Spjd
3287168404Spjd	if (need_policy) {
3288168404Spjd		/*
3289168404Spjd		 * If trim_mask is set then take ownership
3290168404Spjd		 * has been granted or write_acl is present and user
3291168404Spjd		 * has the ability to modify mode.  In that case remove
3292168404Spjd		 * UID|GID and or MODE from mask so that
3293168404Spjd		 * secpolicy_vnode_setattr() doesn't revoke it.
3294168404Spjd		 */
3295168404Spjd
3296168404Spjd		if (trim_mask) {
3297168404Spjd			saved_mask = vap->va_mask;
3298168404Spjd			vap->va_mask &= ~trim_mask;
3299197831Spjd			if (trim_mask & AT_MODE) {
3300197831Spjd				/*
3301197831Spjd				 * Save the mode, as secpolicy_vnode_setattr()
3302197831Spjd				 * will overwrite it with ova.va_mode.
3303197831Spjd				 */
3304197831Spjd				saved_mode = vap->va_mode;
3305197831Spjd			}
3306168404Spjd		}
3307168404Spjd		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
3308185029Spjd		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
3309168404Spjd		if (err) {
3310168404Spjd			ZFS_EXIT(zfsvfs);
3311168404Spjd			return (err);
3312168404Spjd		}
3313168404Spjd
3314197831Spjd		if (trim_mask) {
3315168404Spjd			vap->va_mask |= saved_mask;
3316197831Spjd			if (trim_mask & AT_MODE) {
3317197831Spjd				/*
3318197831Spjd				 * Recover the mode after
3319197831Spjd				 * secpolicy_vnode_setattr().
3320197831Spjd				 */
3321197831Spjd				vap->va_mode = saved_mode;
3322197831Spjd			}
3323197831Spjd		}
3324168404Spjd	}
3325168404Spjd
3326168404Spjd	/*
3327168404Spjd	 * secpolicy_vnode_setattr, or take ownership may have
3328168404Spjd	 * changed va_mask
3329168404Spjd	 */
3330168404Spjd	mask = vap->va_mask;
3331168404Spjd
3332219089Spjd	if ((mask & (AT_UID | AT_GID))) {
3333219089Spjd		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
3334219089Spjd		    &xattr_obj, sizeof (xattr_obj));
3335168404Spjd
3336219089Spjd		if (err == 0 && xattr_obj) {
3337219089Spjd			err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
3338209962Smm			if (err)
3339219089Spjd				goto out2;
3340168404Spjd		}
3341209962Smm		if (mask & AT_UID) {
3342209962Smm			new_uid = zfs_fuid_create(zfsvfs,
3343209962Smm			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
3344219089Spjd			if (new_uid != zp->z_uid &&
3345219089Spjd			    zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
3346219089Spjd				if (attrzp)
3347219089Spjd					VN_RELE(ZTOV(attrzp));
3348249195Smm				err = SET_ERROR(EDQUOT);
3349219089Spjd				goto out2;
3350209962Smm			}
3351209962Smm		}
3352209962Smm
3353209962Smm		if (mask & AT_GID) {
3354209962Smm			new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
3355209962Smm			    cr, ZFS_GROUP, &fuidp);
3356219089Spjd			if (new_gid != zp->z_gid &&
3357219089Spjd			    zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
3358219089Spjd				if (attrzp)
3359219089Spjd					VN_RELE(ZTOV(attrzp));
3360249195Smm				err = SET_ERROR(EDQUOT);
3361219089Spjd				goto out2;
3362209962Smm			}
3363209962Smm		}
3364219089Spjd	}
3365219089Spjd	tx = dmu_tx_create(zfsvfs->z_os);
3366219089Spjd
3367219089Spjd	if (mask & AT_MODE) {
3368219089Spjd		uint64_t pmode = zp->z_mode;
3369219089Spjd		uint64_t acl_obj;
3370219089Spjd		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
3371219089Spjd
3372243560Smm		if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
3373243560Smm		    !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
3374249195Smm			err = SET_ERROR(EPERM);
3375243560Smm			goto out;
3376243560Smm		}
3377243560Smm
3378224174Smm		if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
3379224174Smm			goto out;
3380219089Spjd
3381219089Spjd		mutex_enter(&zp->z_lock);
3382219089Spjd		if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
3383219089Spjd			/*
3384219089Spjd			 * Are we upgrading ACL from old V0 format
3385219089Spjd			 * to V1 format?
3386219089Spjd			 */
3387219089Spjd			if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
3388219089Spjd			    zfs_znode_acl_version(zp) ==
3389219089Spjd			    ZFS_ACL_VERSION_INITIAL) {
3390219089Spjd				dmu_tx_hold_free(tx, acl_obj, 0,
3391219089Spjd				    DMU_OBJECT_END);
3392219089Spjd				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3393219089Spjd				    0, aclp->z_acl_bytes);
3394209962Smm			} else {
3395219089Spjd				dmu_tx_hold_write(tx, acl_obj, 0,
3396219089Spjd				    aclp->z_acl_bytes);
3397209962Smm			}
3398219089Spjd		} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3399219089Spjd			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3400219089Spjd			    0, aclp->z_acl_bytes);
3401209962Smm		}
3402219089Spjd		mutex_exit(&zp->z_lock);
3403219089Spjd		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3404219089Spjd	} else {
3405219089Spjd		if ((mask & AT_XVATTR) &&
3406219089Spjd		    XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3407219089Spjd			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3408219089Spjd		else
3409219089Spjd			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3410168404Spjd	}
3411168404Spjd
3412219089Spjd	if (attrzp) {
3413219089Spjd		dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3414219089Spjd	}
3415219089Spjd
3416219089Spjd	fuid_dirtied = zfsvfs->z_fuid_dirty;
3417219089Spjd	if (fuid_dirtied)
3418219089Spjd		zfs_fuid_txhold(zfsvfs, tx);
3419219089Spjd
3420219089Spjd	zfs_sa_upgrade_txholds(tx, zp);
3421219089Spjd
3422258720Savg	err = dmu_tx_assign(tx, TXG_WAIT);
3423258720Savg	if (err)
3424209962Smm		goto out;
3425168404Spjd
3426219089Spjd	count = 0;
3427168404Spjd	/*
3428168404Spjd	 * Set each attribute requested.
3429168404Spjd	 * We group settings according to the locks they need to acquire.
3430168404Spjd	 *
3431168404Spjd	 * Note: you cannot set ctime directly, although it will be
3432168404Spjd	 * updated as a side-effect of calling this function.
3433168404Spjd	 */
3434168404Spjd
3435219089Spjd
3436219089Spjd	if (mask & (AT_UID|AT_GID|AT_MODE))
3437219089Spjd		mutex_enter(&zp->z_acl_lock);
3438168404Spjd	mutex_enter(&zp->z_lock);
3439168404Spjd
3440219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3441219089Spjd	    &zp->z_pflags, sizeof (zp->z_pflags));
3442219089Spjd
3443219089Spjd	if (attrzp) {
3444219089Spjd		if (mask & (AT_UID|AT_GID|AT_MODE))
3445219089Spjd			mutex_enter(&attrzp->z_acl_lock);
3446219089Spjd		mutex_enter(&attrzp->z_lock);
3447219089Spjd		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3448219089Spjd		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
3449219089Spjd		    sizeof (attrzp->z_pflags));
3450219089Spjd	}
3451219089Spjd
3452219089Spjd	if (mask & (AT_UID|AT_GID)) {
3453219089Spjd
3454219089Spjd		if (mask & AT_UID) {
3455219089Spjd			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
3456219089Spjd			    &new_uid, sizeof (new_uid));
3457219089Spjd			zp->z_uid = new_uid;
3458219089Spjd			if (attrzp) {
3459219089Spjd				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3460219089Spjd				    SA_ZPL_UID(zfsvfs), NULL, &new_uid,
3461219089Spjd				    sizeof (new_uid));
3462219089Spjd				attrzp->z_uid = new_uid;
3463219089Spjd			}
3464219089Spjd		}
3465219089Spjd
3466219089Spjd		if (mask & AT_GID) {
3467219089Spjd			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
3468219089Spjd			    NULL, &new_gid, sizeof (new_gid));
3469219089Spjd			zp->z_gid = new_gid;
3470219089Spjd			if (attrzp) {
3471219089Spjd				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3472219089Spjd				    SA_ZPL_GID(zfsvfs), NULL, &new_gid,
3473219089Spjd				    sizeof (new_gid));
3474219089Spjd				attrzp->z_gid = new_gid;
3475219089Spjd			}
3476219089Spjd		}
3477219089Spjd		if (!(mask & AT_MODE)) {
3478219089Spjd			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
3479219089Spjd			    NULL, &new_mode, sizeof (new_mode));
3480219089Spjd			new_mode = zp->z_mode;
3481219089Spjd		}
3482219089Spjd		err = zfs_acl_chown_setattr(zp);
3483219089Spjd		ASSERT(err == 0);
3484219089Spjd		if (attrzp) {
3485219089Spjd			err = zfs_acl_chown_setattr(attrzp);
3486219089Spjd			ASSERT(err == 0);
3487219089Spjd		}
3488219089Spjd	}
3489219089Spjd
3490168404Spjd	if (mask & AT_MODE) {
3491219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
3492219089Spjd		    &new_mode, sizeof (new_mode));
3493219089Spjd		zp->z_mode = new_mode;
3494219089Spjd		ASSERT3U((uintptr_t)aclp, !=, 0);
3495209962Smm		err = zfs_aclset_common(zp, aclp, cr, tx);
3496240415Smm		ASSERT0(err);
3497219089Spjd		if (zp->z_acl_cached)
3498219089Spjd			zfs_acl_free(zp->z_acl_cached);
3499211932Smm		zp->z_acl_cached = aclp;
3500211932Smm		aclp = NULL;
3501168404Spjd	}
3502168404Spjd
3503168404Spjd
3504219089Spjd	if (mask & AT_ATIME) {
3505219089Spjd		ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
3506219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
3507219089Spjd		    &zp->z_atime, sizeof (zp->z_atime));
3508168404Spjd	}
3509168404Spjd
3510219089Spjd	if (mask & AT_MTIME) {
3511219089Spjd		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
3512219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
3513219089Spjd		    mtime, sizeof (mtime));
3514168404Spjd	}
3515168404Spjd
3516185029Spjd	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
3517219089Spjd	if (mask & AT_SIZE && !(mask & AT_MTIME)) {
3518219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
3519219089Spjd		    NULL, mtime, sizeof (mtime));
3520219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3521219089Spjd		    &ctime, sizeof (ctime));
3522219089Spjd		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
3523219089Spjd		    B_TRUE);
3524219089Spjd	} else if (mask != 0) {
3525219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3526219089Spjd		    &ctime, sizeof (ctime));
3527219089Spjd		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
3528219089Spjd		    B_TRUE);
3529219089Spjd		if (attrzp) {
3530219089Spjd			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3531219089Spjd			    SA_ZPL_CTIME(zfsvfs), NULL,
3532219089Spjd			    &ctime, sizeof (ctime));
3533219089Spjd			zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
3534219089Spjd			    mtime, ctime, B_TRUE);
3535219089Spjd		}
3536219089Spjd	}
3537185029Spjd	/*
3538185029Spjd	 * Do this after setting timestamps to prevent timestamp
3539185029Spjd	 * update from toggling bit
3540185029Spjd	 */
3541168404Spjd
3542185029Spjd	if (xoap && (mask & AT_XVATTR)) {
3543209962Smm
3544209962Smm		/*
3545209962Smm		 * restore trimmed off masks
3546209962Smm		 * so that return masks can be set for caller.
3547209962Smm		 */
3548209962Smm
3549209962Smm		if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
3550209962Smm			XVA_SET_REQ(xvap, XAT_APPENDONLY);
3551209962Smm		}
3552209962Smm		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
3553209962Smm			XVA_SET_REQ(xvap, XAT_NOUNLINK);
3554209962Smm		}
3555209962Smm		if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
3556209962Smm			XVA_SET_REQ(xvap, XAT_IMMUTABLE);
3557209962Smm		}
3558209962Smm		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
3559209962Smm			XVA_SET_REQ(xvap, XAT_NODUMP);
3560209962Smm		}
3561209962Smm		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
3562209962Smm			XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
3563209962Smm		}
3564209962Smm		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
3565209962Smm			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
3566209962Smm		}
3567209962Smm
3568219089Spjd		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3569185029Spjd			ASSERT(vp->v_type == VREG);
3570185029Spjd
3571219089Spjd		zfs_xvattr_set(zp, xvap, tx);
3572185029Spjd	}
3573185029Spjd
3574209962Smm	if (fuid_dirtied)
3575209962Smm		zfs_fuid_sync(zfsvfs, tx);
3576209962Smm
3577168404Spjd	if (mask != 0)
3578185029Spjd		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3579168404Spjd
3580168404Spjd	mutex_exit(&zp->z_lock);
3581219089Spjd	if (mask & (AT_UID|AT_GID|AT_MODE))
3582219089Spjd		mutex_exit(&zp->z_acl_lock);
3583168404Spjd
3584219089Spjd	if (attrzp) {
3585219089Spjd		if (mask & (AT_UID|AT_GID|AT_MODE))
3586219089Spjd			mutex_exit(&attrzp->z_acl_lock);
3587219089Spjd		mutex_exit(&attrzp->z_lock);
3588219089Spjd	}
3589209962Smmout:
3590219089Spjd	if (err == 0 && attrzp) {
3591219089Spjd		err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
3592219089Spjd		    xattr_count, tx);
3593219089Spjd		ASSERT(err2 == 0);
3594219089Spjd	}
3595219089Spjd
3596168404Spjd	if (attrzp)
3597168404Spjd		VN_RELE(ZTOV(attrzp));
3598251631Sdelphij
3599211932Smm	if (aclp)
3600209962Smm		zfs_acl_free(aclp);
3601168404Spjd
3602209962Smm	if (fuidp) {
3603209962Smm		zfs_fuid_info_free(fuidp);
3604209962Smm		fuidp = NULL;
3605209962Smm	}
3606209962Smm
3607219089Spjd	if (err) {
3608209962Smm		dmu_tx_abort(tx);
3609219089Spjd		if (err == ERESTART)
3610219089Spjd			goto top;
3611219089Spjd	} else {
3612219089Spjd		err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
3613209962Smm		dmu_tx_commit(tx);
3614219089Spjd	}
3615209962Smm
3616219089Spjdout2:
3617219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3618219089Spjd		zil_commit(zilog, 0);
3619209962Smm
3620168404Spjd	ZFS_EXIT(zfsvfs);
3621168404Spjd	return (err);
3622168404Spjd}
3623168404Spjd
3624168404Spjdtypedef struct zfs_zlock {
3625168404Spjd	krwlock_t	*zl_rwlock;	/* lock we acquired */
3626168404Spjd	znode_t		*zl_znode;	/* znode we held */
3627168404Spjd	struct zfs_zlock *zl_next;	/* next in list */
3628168404Spjd} zfs_zlock_t;
3629168404Spjd
3630168404Spjd/*
3631168404Spjd * Drop locks and release vnodes that were held by zfs_rename_lock().
3632168404Spjd */
3633168404Spjdstatic void
3634168404Spjdzfs_rename_unlock(zfs_zlock_t **zlpp)
3635168404Spjd{
3636168404Spjd	zfs_zlock_t *zl;
3637168404Spjd
3638168404Spjd	while ((zl = *zlpp) != NULL) {
3639168404Spjd		if (zl->zl_znode != NULL)
3640168404Spjd			VN_RELE(ZTOV(zl->zl_znode));
3641168404Spjd		rw_exit(zl->zl_rwlock);
3642168404Spjd		*zlpp = zl->zl_next;
3643168404Spjd		kmem_free(zl, sizeof (*zl));
3644168404Spjd	}
3645168404Spjd}
3646168404Spjd
3647168404Spjd/*
3648168404Spjd * Search back through the directory tree, using the ".." entries.
3649168404Spjd * Lock each directory in the chain to prevent concurrent renames.
3650168404Spjd * Fail any attempt to move a directory into one of its own descendants.
3651168404Spjd * XXX - z_parent_lock can overlap with map or grow locks
 *
 *	IN:	szp	- znode being renamed; its z_parent_lock is the first
 *			  lock taken (as writer).
 *		tdzp	- target directory; the upward walk starts here.
 *		sdzp	- source directory; the walk ends once it, or the
 *			  file system root, is reached.
 *	OUT:	zlpp	- list of the locks taken, most recent first.  Nodes
 *			  already pushed stay on the list (and stay locked)
 *			  on every return path, including error returns.
 *
 *	RETURN:	0 on success, EINVAL if the walk reaches szp, or the
 *		error returned by zfs_zget().
3652168404Spjd */
3653168404Spjdstatic int
3654168404Spjdzfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
3655168404Spjd{
3656168404Spjd	zfs_zlock_t	*zl;
3657168404Spjd	znode_t		*zp = tdzp;
3658168404Spjd	uint64_t	rootid = zp->z_zfsvfs->z_root;
3659219089Spjd	uint64_t	oidp = zp->z_id;
3660168404Spjd	krwlock_t	*rwlp = &szp->z_parent_lock;
3661168404Spjd	krw_t		rw = RW_WRITER;
3662168404Spjd
3663168404Spjd	/*
3664168404Spjd	 * First pass write-locks szp and compares to zp->z_id.
3665168404Spjd	 * Later passes read-lock zp and compare to zp->z_parent.
3666168404Spjd	 */
3667168404Spjd	do {
3668168404Spjd		if (!rw_tryenter(rwlp, rw)) {
3669168404Spjd			/*
3670168404Spjd			 * Another thread is renaming in this path.
3671168404Spjd			 * Note that if we are a WRITER, we don't have any
3672168404Spjd			 * parent_locks held yet.
3673168404Spjd			 */
3674168404Spjd			if (rw == RW_READER && zp->z_id > szp->z_id) {
3675168404Spjd				/*
3676168404Spjd				 * Drop our locks and restart
3677168404Spjd				 */
				/*
				 * rw only becomes RW_READER at the end of a
				 * completed pass, so zl is the node pushed
				 * most recently onto *zlpp, i.e. the head of
				 * the whole list.
				 */
3678168404Spjd				zfs_rename_unlock(&zl);
3679168404Spjd				*zlpp = NULL;
3680168404Spjd				zp = tdzp;
3681219089Spjd				oidp = zp->z_id;
3682168404Spjd				rwlp = &szp->z_parent_lock;
3683168404Spjd				rw = RW_WRITER;
				/*
				 * In a do-while, continue evaluates the loop
				 * condition next, with zp already reset to
				 * tdzp.
				 */
3684168404Spjd				continue;
3685168404Spjd			} else {
3686168404Spjd				/*
3687168404Spjd				 * Wait for other thread to drop its locks
3688168404Spjd				 */
3689168404Spjd				rw_enter(rwlp, rw);
3690168404Spjd			}
3691168404Spjd		}
3692168404Spjd
		/*
		 * Push the lock just taken onto the front of the list before
		 * any of the checks below can return.
		 */
3693168404Spjd		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
3694168404Spjd		zl->zl_rwlock = rwlp;
3695168404Spjd		zl->zl_znode = NULL;
3696168404Spjd		zl->zl_next = *zlpp;
3697168404Spjd		*zlpp = zl;
3698168404Spjd
3699219089Spjd		if (oidp == szp->z_id)		/* We're a descendant of szp */
3700249195Smm			return (SET_ERROR(EINVAL));
3701168404Spjd
3702219089Spjd		if (oidp == rootid)		/* We've hit the top */
3703168404Spjd			return (0);
3704168404Spjd
3705168404Spjd		if (rw == RW_READER) {		/* i.e. not the first pass */
			/*
			 * Look up the znode for oidp and record it in the
			 * node so that zfs_rename_unlock() can release it.
			 */
3706219089Spjd			int error = zfs_zget(zp->z_zfsvfs, oidp, &zp);
3707168404Spjd			if (error)
3708168404Spjd				return (error);
3709168404Spjd			zl->zl_znode = zp;
3710168404Spjd		}
		/*
		 * Read zp's parent object id into oidp for the next pass.
		 * NOTE(review): the return value is discarded, so a failed
		 * lookup is not detected here -- confirm this is intended.
		 */
3711219089Spjd		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs),
3712219089Spjd		    &oidp, sizeof (oidp));
3713168404Spjd		rwlp = &zp->z_parent_lock;
3714168404Spjd		rw = RW_READER;
3715168404Spjd
3716168404Spjd	} while (zp->z_id != sdzp->z_id);
3717168404Spjd
3718168404Spjd	return (0);
3719168404Spjd}
3720168404Spjd
3720168404Spjd
3721168404Spjd/*
3722168404Spjd * Move an entry from the provided source directory to the target
3723168404Spjd * directory.  Change the entry name as indicated.
3724168404Spjd *
3725168404Spjd *	IN:	sdvp	- Source directory containing the "old entry".
3726168404Spjd *		snm	- Old entry name.
3727168404Spjd *		tdvp	- Target directory to contain the "new entry".
3728168404Spjd *		tnm	- New entry name.
3729168404Spjd *		cr	- credentials of caller.
3730185029Spjd *		ct	- caller context
3731185029Spjd *		flags	- case flags
3732168404Spjd *
3733251631Sdelphij *	RETURN:	0 on success, error code on failure.
 *		Errors set directly in this function:
 *		EIO	- the source directory znode has no SA handle.
 *		EXDEV	- the directories belong to different file systems,
 *			  or tdvp is a ctldir node.
 *		EILSEQ	- tnm is not valid UTF-8 on a utf8-only file system.
 *		EINVAL	- exactly one of the directories is an attribute
 *			  directory; the source lookup failed and snm is
 *			  "." or ".."; or the target lookup failed and
 *			  tnm is "..".
 *		ENOTDIR	- source is a directory, existing target is not.
 *		EISDIR	- existing target is a directory, source is not.
 *		Errors from the dirent locks, the access check,
 *		zfs_rename_lock(), dmu_tx_assign() and the link
 *		create/destroy calls are returned unchanged.
 *
 *		0 is also returned, without changing anything, when both
 *		names are identical within one directory or when both
 *		entries refer to the same object.
3734168404Spjd *
3735168404Spjd * Timestamps:
3736168404Spjd *	sdvp,tdvp - ctime|mtime updated
3737168404Spjd */
3738185029Spjd/*ARGSUSED*/
3739168404Spjdstatic int
3740185029Spjdzfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
3741185029Spjd    caller_context_t *ct, int flags)
3742168404Spjd{
3743264392Sdavide	znode_t		*tdzp, *sdzp, *szp, *tzp;
3744264392Sdavide	zfsvfs_t 	*zfsvfs;
3745185029Spjd	zilog_t		*zilog;
3746168962Spjd	vnode_t		*realvp;
3747168404Spjd	zfs_dirlock_t	*sdl, *tdl;
3748168404Spjd	dmu_tx_t	*tx;
3749168404Spjd	zfs_zlock_t	*zl;
3750185029Spjd	int		cmp, serr, terr;
3751185029Spjd	int		error = 0;
3752185029Spjd	int		zflg = 0;
3753258632Savg	boolean_t	waited = B_FALSE;
3754168404Spjd
	/*
	 * NOTE(review): ZFS_VERIFY_ZP() runs before ZFS_ENTER() here, while
	 * zfs_symlink() in this file enters first.  Confirm that the macro's
	 * failure path is safe without a prior ZFS_ENTER().
	 */
3755264392Sdavide	tdzp = VTOZ(tdvp);
3756264392Sdavide	ZFS_VERIFY_ZP(tdzp);
3757264392Sdavide	zfsvfs = tdzp->z_zfsvfs;
3758168404Spjd	ZFS_ENTER(zfsvfs);
3759185029Spjd	zilog = zfsvfs->z_log;
3760264392Sdavide	sdzp = VTOZ(sdvp);
3761168404Spjd
3762168962Spjd	/*
3763264392Sdavide	 * In case sdzp is not valid, let's be sure to exit from the right
3764264392Sdavide	 * zfsvfs_t.
3765168962Spjd	 */
3766264392Sdavide	if (sdzp->z_sa_hdl == NULL) {
3767264392Sdavide		ZFS_EXIT(zfsvfs);
3768264392Sdavide		return (SET_ERROR(EIO));
3769264392Sdavide	}
3770168962Spjd
3771254585Sdelphij	/*
3772254585Sdelphij	 * We check z_zfsvfs rather than v_vfsp here, because snapshots and the
3773254585Sdelphij	 * ctldir appear to have the same v_vfsp.
3774254585Sdelphij	 */
3775264392Sdavide	if (sdzp->z_zfsvfs != zfsvfs || zfsctl_is_node(tdvp)) {
3776168404Spjd		ZFS_EXIT(zfsvfs);
3777249195Smm		return (SET_ERROR(EXDEV));
3778168404Spjd	}
3779168404Spjd
3780185029Spjd	if (zfsvfs->z_utf8 && u8_validate(tnm,
3781185029Spjd	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3782185029Spjd		ZFS_EXIT(zfsvfs);
3783249195Smm		return (SET_ERROR(EILSEQ));
3784185029Spjd	}
3785185029Spjd
3786185029Spjd	if (flags & FIGNORECASE)
3787185029Spjd		zflg |= ZCILOOK;
3788185029Spjd
	/*
	 * Restart point after dmu_tx_assign() returns ERESTART.  The retry
	 * path drops every lock and vnode hold taken below before jumping
	 * back here.
	 */
3789168404Spjdtop:
3790168404Spjd	szp = NULL;
3791168404Spjd	tzp = NULL;
3792168404Spjd	zl = NULL;
3793168404Spjd
3794168404Spjd	/*
3795168404Spjd	 * This is to prevent the creation of links into attribute space
3796168404Spjd	 * by renaming a linked file into/outof an attribute directory.
3797168404Spjd	 * See the comment in zfs_link() for why this is considered bad.
3798168404Spjd	 */
3799219089Spjd	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
3800168962Spjd		ZFS_EXIT(zfsvfs);
3801249195Smm		return (SET_ERROR(EINVAL));
3802168404Spjd	}
3803168404Spjd
3804168404Spjd	/*
3805168404Spjd	 * Lock source and target directory entries.  To prevent deadlock,
3806168404Spjd	 * a lock ordering must be defined.  We lock the directory with
3807168404Spjd	 * the smallest object id first, or if it's a tie, the one with
3808168404Spjd	 * the lexically first name.
3809168404Spjd	 */
3810168404Spjd	if (sdzp->z_id < tdzp->z_id) {
3811168962Spjd		cmp = -1;
3812168962Spjd	} else if (sdzp->z_id > tdzp->z_id) {
3813168962Spjd		cmp = 1;
3814168962Spjd	} else {
3815185029Spjd		/*
3816185029Spjd		 * First compare the two name arguments without
3817185029Spjd		 * considering any case folding.
3818185029Spjd		 */
3819185029Spjd		int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
3820185029Spjd
3821185029Spjd		cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
3822185029Spjd		ASSERT(error == 0 || !zfsvfs->z_utf8);
3823168962Spjd		if (cmp == 0) {
3824168962Spjd			/*
3825168962Spjd			 * POSIX: "If the old argument and the new argument
3826168962Spjd			 * both refer to links to the same existing file,
3827168962Spjd			 * the rename() function shall return successfully
3828168962Spjd			 * and perform no other action."
3829168962Spjd			 */
3830168962Spjd			ZFS_EXIT(zfsvfs);
3831168962Spjd			return (0);
3832168962Spjd		}
3833185029Spjd		/*
3834185029Spjd		 * If the file system is case-folding, then we may
3835185029Spjd		 * have some more checking to do.  A case-folding file
3836185029Spjd		 * system is either supporting mixed case sensitivity
3837185029Spjd		 * access or is completely case-insensitive.  Note
3838185029Spjd		 * that the file system is always case preserving.
3839185029Spjd		 *
3840185029Spjd		 * In mixed sensitivity mode case sensitive behavior
3841185029Spjd		 * is the default.  FIGNORECASE must be used to
3842185029Spjd		 * explicitly request case insensitive behavior.
3843185029Spjd		 *
3844185029Spjd		 * If the source and target names provided differ only
3845185029Spjd		 * by case (e.g., a request to rename 'tim' to 'Tim'),
3846185029Spjd		 * we will treat this as a special case in the
3847185029Spjd		 * case-insensitive mode: as long as the source name
3848185029Spjd		 * is an exact match, we will allow this to proceed as
3849185029Spjd		 * a name-change request.
3850185029Spjd		 */
3851185029Spjd		if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
3852185029Spjd		    (zfsvfs->z_case == ZFS_CASE_MIXED &&
3853185029Spjd		    flags & FIGNORECASE)) &&
3854185029Spjd		    u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
3855185029Spjd		    &error) == 0) {
3856185029Spjd			/*
3857185029Spjd			 * case preserving rename request, require exact
3858185029Spjd			 * name matches
3859185029Spjd			 */
3860185029Spjd			zflg |= ZCIEXACT;
3861185029Spjd			zflg &= ~ZCILOOK;
3862185029Spjd		}
3863168962Spjd	}
3864185029Spjd
3865208131Smm	/*
3866208131Smm	 * If the source and destination directories are the same, we should
3867208131Smm	 * grab the z_name_lock of that directory only once.
3868208131Smm	 */
3869208131Smm	if (sdzp == tdzp) {
3870208131Smm		zflg |= ZHAVELOCK;
3871208131Smm		rw_enter(&sdzp->z_name_lock, RW_READER);
3872208131Smm	}
3873208131Smm
	/*
	 * Take the two dirent locks in the order chosen above.  ZRENAMING
	 * is passed only on whichever lock is taken second; the source
	 * lookup always carries ZEXISTS.
	 */
3874168962Spjd	if (cmp < 0) {
3875185029Spjd		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
3876185029Spjd		    ZEXISTS | zflg, NULL, NULL);
3877185029Spjd		terr = zfs_dirent_lock(&tdl,
3878185029Spjd		    tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
3879168962Spjd	} else {
3880185029Spjd		terr = zfs_dirent_lock(&tdl,
3881185029Spjd		    tdzp, tnm, &tzp, zflg, NULL, NULL);
3882185029Spjd		serr = zfs_dirent_lock(&sdl,
3883185029Spjd		    sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
3884185029Spjd		    NULL, NULL);
3885168404Spjd	}
3886168404Spjd
3887168962Spjd	if (serr) {
3888168404Spjd		/*
3889168404Spjd		 * Source entry invalid or not there.
3890168404Spjd		 */
3891168962Spjd		if (!terr) {
3892168404Spjd			zfs_dirent_unlock(tdl);
3893168962Spjd			if (tzp)
3894168962Spjd				VN_RELE(ZTOV(tzp));
3895168962Spjd		}
3896208131Smm
3897208131Smm		if (sdzp == tdzp)
3898208131Smm			rw_exit(&sdzp->z_name_lock);
3899208131Smm
3900219089Spjd		/*
3901219089Spjd		 * FreeBSD: In OpenSolaris they only check if rename source is
3902219089Spjd		 * ".." here, because "." is handled in their lookup. This is
3903219089Spjd		 * not the case for FreeBSD, so we check for "." explicitly.
3904219089Spjd		 */
3905168404Spjd		if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0)
3906249195Smm			serr = SET_ERROR(EINVAL);
3907168962Spjd		ZFS_EXIT(zfsvfs);
3908168962Spjd		return (serr);
3909168404Spjd	}
3910168404Spjd	if (terr) {
		/* The source lock succeeded, so undo it and drop szp. */
3911168404Spjd		zfs_dirent_unlock(sdl);
3912168962Spjd		VN_RELE(ZTOV(szp));
3913208131Smm
3914208131Smm		if (sdzp == tdzp)
3915208131Smm			rw_exit(&sdzp->z_name_lock);
3916208131Smm
3917168404Spjd		if (strcmp(tnm, "..") == 0)
3918249195Smm			terr = SET_ERROR(EINVAL);
3919168962Spjd		ZFS_EXIT(zfsvfs);
3920168962Spjd		return (terr);
3921168404Spjd	}
3922168404Spjd
3923168404Spjd	/*
3924168404Spjd	 * Must have write access at the source to remove the old entry
3925168404Spjd	 * and write access at the target to create the new entry.
3926168404Spjd	 * Note that if target and source are the same, this can be
3927168404Spjd	 * done in a single check.
3928168404Spjd	 */
3929168404Spjd
3930168404Spjd	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
3931168404Spjd		goto out;
3932168404Spjd
3933168962Spjd	if (ZTOV(szp)->v_type == VDIR) {
3934168404Spjd		/*
3935168404Spjd		 * Check to make sure rename is valid.
3936168404Spjd		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
3937168404Spjd		 */
3938168404Spjd		if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
3939168404Spjd			goto out;
3940168404Spjd	}
3941168404Spjd
3942168404Spjd	/*
3943168404Spjd	 * Does target exist?
3944168404Spjd	 */
3945168404Spjd	if (tzp) {
3946168404Spjd		/*
3947168404Spjd		 * Source and target must be the same type.
3948168404Spjd		 */
3949168962Spjd		if (ZTOV(szp)->v_type == VDIR) {
3950168962Spjd			if (ZTOV(tzp)->v_type != VDIR) {
3951249195Smm				error = SET_ERROR(ENOTDIR);
3952168404Spjd				goto out;
3953168404Spjd			}
3954168404Spjd		} else {
3955168962Spjd			if (ZTOV(tzp)->v_type == VDIR) {
3956249195Smm				error = SET_ERROR(EISDIR);
3957168404Spjd				goto out;
3958168404Spjd			}
3959168404Spjd		}
3960168404Spjd		/*
3961168404Spjd		 * POSIX dictates that when the source and target
3962168404Spjd		 * entries refer to the same file object, rename
3963168404Spjd		 * must do nothing and exit without error.
3964168404Spjd		 */
3965168404Spjd		if (szp->z_id == tzp->z_id) {
3966168404Spjd			error = 0;
3967168404Spjd			goto out;
3968168404Spjd		}
3969168404Spjd	}
3970168404Spjd
3971185029Spjd	vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
3972168962Spjd	if (tzp)
3973185029Spjd		vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);
3974168962Spjd
3975185029Spjd	/*
3976185029Spjd	 * notify the target directory if it is not the same
3977185029Spjd	 * as source directory.
3978185029Spjd	 */
3979185029Spjd	if (tdvp != sdvp) {
3980185029Spjd		vnevent_rename_dest_dir(tdvp, ct);
3981185029Spjd	}
3982185029Spjd
	/*
	 * Build the transaction: SA holds on szp, sdzp, and (when present
	 * or distinct) tzp and tdzp, ZAP holds on both directories for the
	 * two names, and a ZAP hold on the unlinked-object set.
	 */
3983168404Spjd	tx = dmu_tx_create(zfsvfs->z_os);
3984219089Spjd	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3985219089Spjd	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
3986168404Spjd	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
3987168404Spjd	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
3988219089Spjd	if (sdzp != tdzp) {
3989219089Spjd		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
3990219089Spjd		zfs_sa_upgrade_txholds(tx, tdzp);
3991219089Spjd	}
3992219089Spjd	if (tzp) {
3993219089Spjd		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
3994219089Spjd		zfs_sa_upgrade_txholds(tx, tzp);
3995219089Spjd	}
3996219089Spjd
3997219089Spjd	zfs_sa_upgrade_txholds(tx, szp);
3998168404Spjd	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	/*
	 * The first attempt uses TXG_NOWAIT; once a previous attempt has
	 * gone through dmu_tx_wait(), the retry uses TXG_WAITED.
	 */
3999258632Savg	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
4000168404Spjd	if (error) {
		/*
		 * Release everything acquired since top: before either
		 * waiting and retrying or returning the error.
		 */
4001168404Spjd		if (zl != NULL)
4002168404Spjd			zfs_rename_unlock(&zl);
4003168404Spjd		zfs_dirent_unlock(sdl);
4004168404Spjd		zfs_dirent_unlock(tdl);
4005208131Smm
4006208131Smm		if (sdzp == tdzp)
4007208131Smm			rw_exit(&sdzp->z_name_lock);
4008208131Smm
4009168962Spjd		VN_RELE(ZTOV(szp));
4010168962Spjd		if (tzp)
4011168962Spjd			VN_RELE(ZTOV(tzp));
4012209962Smm		if (error == ERESTART) {
4013258632Savg			waited = B_TRUE;
4014168404Spjd			dmu_tx_wait(tx);
4015168404Spjd			dmu_tx_abort(tx);
4016168404Spjd			goto top;
4017168404Spjd		}
4018168404Spjd		dmu_tx_abort(tx);
4019168962Spjd		ZFS_EXIT(zfsvfs);
4020168962Spjd		return (error);
4021168404Spjd	}
4022168404Spjd
4023168404Spjd	if (tzp)	/* Attempt to remove the existing target */
4024185029Spjd		error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
4025168404Spjd
4026168404Spjd	if (error == 0) {
4027168404Spjd		error = zfs_link_create(tdl, szp, tx, ZRENAMING);
4028168404Spjd		if (error == 0) {
4029219089Spjd			szp->z_pflags |= ZFS_AV_MODIFIED;
4030185029Spjd
4031219089Spjd			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
4032219089Spjd			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
4033240415Smm			ASSERT0(error);
4034219089Spjd
4035168404Spjd			error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
4036219089Spjd			if (error == 0) {
4037219089Spjd				zfs_log_rename(zilog, tx, TX_RENAME |
4038219089Spjd				    (flags & FIGNORECASE ? TX_CI : 0), sdzp,
4039219089Spjd				    sdl->dl_name, tdzp, tdl->dl_name, szp);
4040185029Spjd
4041219089Spjd				/*
4042219089Spjd				 * Update path information for the target vnode
4043219089Spjd				 */
4044219089Spjd				vn_renamepath(tdvp, ZTOV(szp), tnm,
4045219089Spjd				    strlen(tnm));
4046219089Spjd			} else {
4047219089Spjd				/*
4048219089Spjd				 * At this point, we have successfully created
4049219089Spjd				 * the target name, but have failed to remove
4050219089Spjd				 * the source name.  Since the create was done
4051219089Spjd				 * with the ZRENAMING flag, there are
4052219089Spjd				 * complications; for one, the link count is
4053219089Spjd				 * wrong.  The easiest way to deal with this
4054219089Spjd				 * is to remove the newly created target, and
4055219089Spjd				 * return the original error.  This must
4056219089Spjd				 * succeed; fortunately, it is very unlikely to
4057219089Spjd				 * fail, since we just created it.
4058219089Spjd				 */
4059219089Spjd				VERIFY3U(zfs_link_destroy(tdl, szp, tx,
4060219089Spjd				    ZRENAMING, NULL), ==, 0);
4061219089Spjd			}
4062168404Spjd		}
4063168404Spjd#ifdef FREEBSD_NAMECACHE
		/*
		 * On success, purge name cache entries for both
		 * directories, the renamed vnode and any replaced target.
		 */
4064168404Spjd		if (error == 0) {
4065168404Spjd			cache_purge(sdvp);
4066168404Spjd			cache_purge(tdvp);
4067240829Spjd			cache_purge(ZTOV(szp));
4068240829Spjd			if (tzp)
4069240829Spjd				cache_purge(ZTOV(tzp));
4070168404Spjd		}
4071168404Spjd#endif
4072168404Spjd	}
4073168404Spjd
	/* The transaction is committed whether or not the rename succeeded. */
4074168404Spjd	dmu_tx_commit(tx);
	/*
	 * Common exit.  Every path that reaches here holds both dirent
	 * locks and a reference on szp; zl and tzp may be NULL.
	 */
4075168404Spjdout:
4076168404Spjd	if (zl != NULL)
4077168404Spjd		zfs_rename_unlock(&zl);
4078168404Spjd
4079168404Spjd	zfs_dirent_unlock(sdl);
4080168404Spjd	zfs_dirent_unlock(tdl);
4081168404Spjd
4082208131Smm	if (sdzp == tdzp)
4083208131Smm		rw_exit(&sdzp->z_name_lock);
4084208131Smm
4085219089Spjd
4086168962Spjd	VN_RELE(ZTOV(szp));
4087168404Spjd	if (tzp)
4088168962Spjd		VN_RELE(ZTOV(tzp));
4089168404Spjd
4090219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4091219089Spjd		zil_commit(zilog, 0);
4092219089Spjd
4093168404Spjd	ZFS_EXIT(zfsvfs);
4094168404Spjd
4095168404Spjd	return (error);
4096168404Spjd}
4097168404Spjd
4097168404Spjd
4098168404Spjd/*
4099168404Spjd * Insert the indicated symbolic reference entry into the directory.
4100168404Spjd *
4101168404Spjd *	IN:	dvp	- Directory to contain new symbolic link.
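4102168404Spjd *		name	- Name for new symlink entry.
4103168404Spjd *		vap	- Attributes of new entry.
 *		link	- Path the new symlink points to; rejected with
 *			  ENAMETOOLONG if longer than MAXPATHLEN.
4104168404Spjd *		cr	- credentials of caller.
 *		td	- thread argument (kthread_t *)
 *
 *	Note: this signature takes no ct or flags argument; flags is a
 *	local variable initialized to 0.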
4107168404Spjd *
4108251631Sdelphij *	RETURN:	0 on success, error code on failure.
4109168404Spjd *
4110168404Spjd * Timestamps:
4111168404Spjd *	dvp - ctime|mtime updated
4112168404Spjd */
4113185029Spjd/*ARGSUSED*/
4114168404Spjdstatic int
4115185029Spjdzfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
4116185029Spjd    cred_t *cr, kthread_t *td)
4117168404Spjd{
4118168404Spjd	znode_t		*zp, *dzp = VTOZ(dvp);
4119168404Spjd	zfs_dirlock_t	*dl;
4120168404Spjd	dmu_tx_t	*tx;
4121168404Spjd	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
4122185029Spjd	zilog_t		*zilog;
4123219089Spjd	uint64_t	len = strlen(link);
4124168404Spjd	int		error;
4125185029Spjd	int		zflg = ZNEW;
4126209962Smm	zfs_acl_ids_t	acl_ids;
4127209962Smm	boolean_t	fuid_dirtied;
4128219089Spjd	uint64_t	txtype = TX_SYMLINK;
4129258632Savg	boolean_t	waited = B_FALSE;
4130185029Spjd	int		flags = 0;
4131168404Spjd
4132168962Spjd	ASSERT(vap->va_type == VLNK);
4133168404Spjd
4134168404Spjd	ZFS_ENTER(zfsvfs);
4135185029Spjd	ZFS_VERIFY_ZP(dzp);
4136185029Spjd	zilog = zfsvfs->z_log;
4137185029Spjd
4138185029Spjd	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
4139185029Spjd	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4140185029Spjd		ZFS_EXIT(zfsvfs);
4141249195Smm		return (SET_ERROR(EILSEQ));
4142185029Spjd	}
4143185029Spjd	if (flags & FIGNORECASE)
4144185029Spjd		zflg |= ZCILOOK;
4145168404Spjd
4146168404Spjd	if (len > MAXPATHLEN) {
4147168404Spjd		ZFS_EXIT(zfsvfs);
4148249195Smm		return (SET_ERROR(ENAMETOOLONG));
4149168404Spjd	}
4150168404Spjd
4151219089Spjd	if ((error = zfs_acl_ids_create(dzp, 0,
4152219089Spjd	    vap, cr, NULL, &acl_ids)) != 0) {
4153219089Spjd		ZFS_EXIT(zfsvfs);
4154219089Spjd		return (error);
4155219089Spjd	}
4156260704Savg
4157260704Savg	getnewvnode_reserve(1);
4158260704Savg
4159219089Spjdtop:
4160168404Spjd	/*
4161168404Spjd	 * Attempt to lock directory; fail if entry already exists.
4162168404Spjd	 */
4163185029Spjd	error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
4164185029Spjd	if (error) {
4165219089Spjd		zfs_acl_ids_free(&acl_ids);
4166260704Savg		getnewvnode_drop_reserve();
4167168404Spjd		ZFS_EXIT(zfsvfs);
4168168404Spjd		return (error);
4169168404Spjd	}
4170168404Spjd
4171219089Spjd	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4172219089Spjd		zfs_acl_ids_free(&acl_ids);
4173219089Spjd		zfs_dirent_unlock(dl);
4174260704Savg		getnewvnode_drop_reserve();
4175219089Spjd		ZFS_EXIT(zfsvfs);
4176219089Spjd		return (error);
4177219089Spjd	}
4178219089Spjd
4179209962Smm	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
4180209962Smm		zfs_acl_ids_free(&acl_ids);
4181209962Smm		zfs_dirent_unlock(dl);
4182260704Savg		getnewvnode_drop_reserve();
4183209962Smm		ZFS_EXIT(zfsvfs);
4184249195Smm		return (SET_ERROR(EDQUOT));
4185209962Smm	}
4186168404Spjd	tx = dmu_tx_create(zfsvfs->z_os);
4187209962Smm	fuid_dirtied = zfsvfs->z_fuid_dirty;
4188168404Spjd	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
4189168404Spjd	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4190219089Spjd	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
4191219089Spjd	    ZFS_SA_BASE_ATTR_SIZE + len);
4192219089Spjd	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
4193219089Spjd	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
4194219089Spjd		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
4195219089Spjd		    acl_ids.z_aclp->z_acl_bytes);
4196219089Spjd	}
4197209962Smm	if (fuid_dirtied)
4198209962Smm		zfs_fuid_txhold(zfsvfs, tx);
4199258632Savg	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
4200168404Spjd	if (error) {
4201168404Spjd		zfs_dirent_unlock(dl);
4202209962Smm		if (error == ERESTART) {
4203258632Savg			waited = B_TRUE;
4204168404Spjd			dmu_tx_wait(tx);
4205168404Spjd			dmu_tx_abort(tx);
4206168404Spjd			goto top;
4207168404Spjd		}
4208219089Spjd		zfs_acl_ids_free(&acl_ids);
4209168404Spjd		dmu_tx_abort(tx);
4210260704Savg		getnewvnode_drop_reserve();
4211168404Spjd		ZFS_EXIT(zfsvfs);
4212168404Spjd		return (error);
4213168404Spjd	}
4214168404Spjd
4215168404Spjd	/*
4216168404Spjd	 * Create a new object for the symlink.
4217219089Spjd	 * for version 4 ZPL datsets the symlink will be an SA attribute
4218168404Spjd	 */
4219219089Spjd	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
4220168404Spjd
4221219089Spjd	if (fuid_dirtied)
4222219089Spjd		zfs_fuid_sync(zfsvfs, tx);
4223209962Smm
4224219089Spjd	mutex_enter(&zp->z_lock);
4225219089Spjd	if (zp->z_is_sa)
4226219089Spjd		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
4227219089Spjd		    link, len, tx);
4228219089Spjd	else
4229219089Spjd		zfs_sa_symlink(zp, link, len, tx);
4230219089Spjd	mutex_exit(&zp->z_lock);
4231168404Spjd
4232219089Spjd	zp->z_size = len;
4233219089Spjd	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
4234219089Spjd	    &zp->z_size, sizeof (zp->z_size), tx);
4235168404Spjd	/*
4236168404Spjd	 * Insert the new object into the directory.
4237168404Spjd	 */
4238168404Spjd	(void) zfs_link_create(dl, zp, tx, ZNEW);
4239168404Spjd
4240219089Spjd	if (flags & FIGNORECASE)
4241219089Spjd		txtype |= TX_CI;
4242219089Spjd	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
4243219089Spjd	*vpp = ZTOV(zp);
4244219089Spjd
4245209962Smm	zfs_acl_ids_free(&acl_ids);
4246209962Smm
4247168404Spjd	dmu_tx_commit(tx);
4248168404Spjd
4249260704Savg	getnewvnode_drop_reserve();
4250260704Savg
4251168404Spjd	zfs_dirent_unlock(dl);
4252168404Spjd
4253219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4254219089Spjd		zil_commit(zilog, 0);
4255219089Spjd
4256168404Spjd	ZFS_EXIT(zfsvfs);
4257168404Spjd	return (error);
4258168404Spjd}
4259168404Spjd
4260168404Spjd/*
4261168404Spjd * Return, in the buffer contained in the provided uio structure,
4262168404Spjd * the symbolic path referred to by vp.
4263168404Spjd *
4264168404Spjd *	IN:	vp	- vnode of symbolic link.
4265251631Sdelphij *		uio	- structure to contain the link path.
4266168404Spjd *		cr	- credentials of caller.
4267185029Spjd *		ct	- caller context
4268168404Spjd *
4269251631Sdelphij *	OUT:	uio	- structure containing the link path.
4270168404Spjd *
4271251631Sdelphij *	RETURN:	0 on success, error code on failure.
4272168404Spjd *
4273168404Spjd * Timestamps:
4274168404Spjd *	vp - atime updated
4275168404Spjd */
4276168404Spjd/* ARGSUSED */
4277168404Spjdstatic int
4278185029Spjdzfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
4279168404Spjd{
4280168404Spjd	znode_t		*zp = VTOZ(vp);
4281168404Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4282168404Spjd	int		error;
4283168404Spjd
4284168404Spjd	ZFS_ENTER(zfsvfs);
4285185029Spjd	ZFS_VERIFY_ZP(zp);
4286168404Spjd
4287219089Spjd	mutex_enter(&zp->z_lock);
4288219089Spjd	if (zp->z_is_sa)
4289219089Spjd		error = sa_lookup_uio(zp->z_sa_hdl,
4290219089Spjd		    SA_ZPL_SYMLINK(zfsvfs), uio);
4291219089Spjd	else
4292219089Spjd		error = zfs_sa_readlink(zp, uio);
4293219089Spjd	mutex_exit(&zp->z_lock);
4294168404Spjd
4295168404Spjd	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4296219089Spjd
4297168404Spjd	ZFS_EXIT(zfsvfs);
4298168404Spjd	return (error);
4299168404Spjd}
4300168404Spjd
4301168404Spjd/*
4302168404Spjd * Insert a new entry into directory tdvp referencing svp.
4303168404Spjd *
4304168404Spjd *	IN:	tdvp	- Directory to contain new entry.
4305168404Spjd *		svp	- vnode of new entry.
4306168404Spjd *		name	- name of new entry.
4307168404Spjd *		cr	- credentials of caller.
4308185029Spjd *		ct	- caller context
4309168404Spjd *
4310251631Sdelphij *	RETURN:	0 on success, error code on failure.
4311168404Spjd *
4312168404Spjd * Timestamps:
4313168404Spjd *	tdvp - ctime|mtime updated
4314168404Spjd *	 svp - ctime updated
4315168404Spjd */
4316168404Spjd/* ARGSUSED */
4317168404Spjdstatic int
4318185029Spjdzfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
4319185029Spjd    caller_context_t *ct, int flags)
4320168404Spjd{
4321168404Spjd	znode_t		*dzp = VTOZ(tdvp);
4322168404Spjd	znode_t		*tzp, *szp;
4323168404Spjd	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
4324185029Spjd	zilog_t		*zilog;
4325168404Spjd	zfs_dirlock_t	*dl;
4326168404Spjd	dmu_tx_t	*tx;
4327168962Spjd	vnode_t		*realvp;
4328168404Spjd	int		error;
4329185029Spjd	int		zf = ZNEW;
4330212694Smm	uint64_t	parent;
4331185029Spjd	uid_t		owner;
4332258632Savg	boolean_t	waited = B_FALSE;
4333168404Spjd
4334168404Spjd	ASSERT(tdvp->v_type == VDIR);
4335168404Spjd
4336168404Spjd	ZFS_ENTER(zfsvfs);
4337185029Spjd	ZFS_VERIFY_ZP(dzp);
4338185029Spjd	zilog = zfsvfs->z_log;
4339168404Spjd
4340185029Spjd	if (VOP_REALVP(svp, &realvp, ct) == 0)
4341168962Spjd		svp = realvp;
4342168962Spjd
4343212694Smm	/*
4344212694Smm	 * POSIX dictates that we return EPERM here.
4345212694Smm	 * Better choices include ENOTSUP or EISDIR.
4346212694Smm	 */
4347212694Smm	if (svp->v_type == VDIR) {
4348168404Spjd		ZFS_EXIT(zfsvfs);
4349249195Smm		return (SET_ERROR(EPERM));
4350212694Smm	}
4351212694Smm
4352254585Sdelphij	szp = VTOZ(svp);
4353254585Sdelphij	ZFS_VERIFY_ZP(szp);
4354254585Sdelphij
4355258597Spjd	if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) {
4356258597Spjd		ZFS_EXIT(zfsvfs);
4357258597Spjd		return (SET_ERROR(EPERM));
4358258597Spjd	}
4359258597Spjd
4360254585Sdelphij	/*
4361254585Sdelphij	 * We check z_zfsvfs rather than v_vfsp here, because snapshots and the
4362254585Sdelphij	 * ctldir appear to have the same v_vfsp.
4363254585Sdelphij	 */
4364254585Sdelphij	if (szp->z_zfsvfs != zfsvfs || zfsctl_is_node(svp)) {
4365212694Smm		ZFS_EXIT(zfsvfs);
4366249195Smm		return (SET_ERROR(EXDEV));
4367168404Spjd	}
4368212694Smm
4369212694Smm	/* Prevent links to .zfs/shares files */
4370212694Smm
4371219089Spjd	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
4372219089Spjd	    &parent, sizeof (uint64_t))) != 0) {
4373212694Smm		ZFS_EXIT(zfsvfs);
4374219089Spjd		return (error);
4375219089Spjd	}
4376219089Spjd	if (parent == zfsvfs->z_shares_dir) {
4377219089Spjd		ZFS_EXIT(zfsvfs);
4378249195Smm		return (SET_ERROR(EPERM));
4379212694Smm	}
4380212694Smm
4381185029Spjd	if (zfsvfs->z_utf8 && u8_validate(name,
4382185029Spjd	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4383185029Spjd		ZFS_EXIT(zfsvfs);
4384249195Smm		return (SET_ERROR(EILSEQ));
4385185029Spjd	}
4386185029Spjd	if (flags & FIGNORECASE)
4387185029Spjd		zf |= ZCILOOK;
4388185029Spjd
4389168404Spjd	/*
4390168404Spjd	 * We do not support links between attributes and non-attributes
4391168404Spjd	 * because of the potential security risk of creating links
4392168404Spjd	 * into "normal" file space in order to circumvent restrictions
4393168404Spjd	 * imposed in attribute space.
4394168404Spjd	 */
4395219089Spjd	if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
4396168404Spjd		ZFS_EXIT(zfsvfs);
4397249195Smm		return (SET_ERROR(EINVAL));
4398168404Spjd	}
4399168404Spjd
4400168404Spjd
4401219089Spjd	owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
4402219089Spjd	if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) {
4403168404Spjd		ZFS_EXIT(zfsvfs);
4404249195Smm		return (SET_ERROR(EPERM));
4405168404Spjd	}
4406168404Spjd
4407185029Spjd	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4408168404Spjd		ZFS_EXIT(zfsvfs);
4409168404Spjd		return (error);
4410168404Spjd	}
4411168404Spjd
4412212694Smmtop:
4413168404Spjd	/*
4414168404Spjd	 * Attempt to lock directory; fail if entry already exists.
4415168404Spjd	 */
4416185029Spjd	error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
4417185029Spjd	if (error) {
4418168404Spjd		ZFS_EXIT(zfsvfs);
4419168404Spjd		return (error);
4420168404Spjd	}
4421168404Spjd
4422168404Spjd	tx = dmu_tx_create(zfsvfs->z_os);
4423219089Spjd	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
4424168404Spjd	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4425219089Spjd	zfs_sa_upgrade_txholds(tx, szp);
4426219089Spjd	zfs_sa_upgrade_txholds(tx, dzp);
4427258632Savg	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
4428168404Spjd	if (error) {
4429168404Spjd		zfs_dirent_unlock(dl);
4430209962Smm		if (error == ERESTART) {
4431258632Savg			waited = B_TRUE;
4432168404Spjd			dmu_tx_wait(tx);
4433168404Spjd			dmu_tx_abort(tx);
4434168404Spjd			goto top;
4435168404Spjd		}
4436168404Spjd		dmu_tx_abort(tx);
4437168404Spjd		ZFS_EXIT(zfsvfs);
4438168404Spjd		return (error);
4439168404Spjd	}
4440168404Spjd
4441168404Spjd	error = zfs_link_create(dl, szp, tx, 0);
4442168404Spjd
4443185029Spjd	if (error == 0) {
4444185029Spjd		uint64_t txtype = TX_LINK;
4445185029Spjd		if (flags & FIGNORECASE)
4446185029Spjd			txtype |= TX_CI;
4447185029Spjd		zfs_log_link(zilog, tx, txtype, dzp, szp, name);
4448185029Spjd	}
4449168404Spjd
4450168404Spjd	dmu_tx_commit(tx);
4451168404Spjd
4452168404Spjd	zfs_dirent_unlock(dl);
4453168404Spjd
4454185029Spjd	if (error == 0) {
4455185029Spjd		vnevent_link(svp, ct);
4456185029Spjd	}
4457185029Spjd
4458219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4459219089Spjd		zil_commit(zilog, 0);
4460219089Spjd
4461168404Spjd	ZFS_EXIT(zfsvfs);
4462168404Spjd	return (error);
4463168404Spjd}
4464168404Spjd
4465219089Spjd#ifdef sun
4466219089Spjd/*
4467219089Spjd * zfs_null_putapage() is used when the file system has been force
4468219089Spjd * unmounted. It just drops the pages.
4469219089Spjd */
4470219089Spjd/* ARGSUSED */
4471219089Spjdstatic int
4472219089Spjdzfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
4473219089Spjd		size_t *lenp, int flags, cred_t *cr)
4474219089Spjd{
4475219089Spjd	pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR);
4476219089Spjd	return (0);
4477219089Spjd}
4478219089Spjd
4479219089Spjd/*
4480219089Spjd * Push a page out to disk, klustering if possible.
4481219089Spjd *
4482219089Spjd *	IN:	vp	- file to push page to.
4483219089Spjd *		pp	- page to push.
4484219089Spjd *		flags	- additional flags.
4485219089Spjd *		cr	- credentials of caller.
4486219089Spjd *
4487219089Spjd *	OUT:	offp	- start of range pushed.
4488219089Spjd *		lenp	- len of range pushed.
4489219089Spjd *
4490251631Sdelphij *	RETURN:	0 on success, error code on failure.
4491219089Spjd *
4492219089Spjd * NOTE: callers must have locked the page to be pushed.  On
4493219089Spjd * exit, the page (and all other pages in the kluster) must be
4494219089Spjd * unlocked.
4495219089Spjd */
4496219089Spjd/* ARGSUSED */
4497219089Spjdstatic int
4498219089Spjdzfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
4499219089Spjd		size_t *lenp, int flags, cred_t *cr)
4500219089Spjd{
4501219089Spjd	znode_t		*zp = VTOZ(vp);
4502219089Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4503219089Spjd	dmu_tx_t	*tx;
4504219089Spjd	u_offset_t	off, koff;
4505219089Spjd	size_t		len, klen;
4506219089Spjd	int		err;
4507219089Spjd
4508219089Spjd	off = pp->p_offset;
4509219089Spjd	len = PAGESIZE;
4510219089Spjd	/*
4511219089Spjd	 * If our blocksize is bigger than the page size, try to kluster
4512219089Spjd	 * multiple pages so that we write a full block (thus avoiding
4513219089Spjd	 * a read-modify-write).
4514219089Spjd	 */
4515219089Spjd	if (off < zp->z_size && zp->z_blksz > PAGESIZE) {
4516219089Spjd		klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
4517219089Spjd		koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0;
4518219089Spjd		ASSERT(koff <= zp->z_size);
4519219089Spjd		if (koff + klen > zp->z_size)
4520219089Spjd			klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE);
4521219089Spjd		pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
4522219089Spjd	}
4523219089Spjd	ASSERT3U(btop(len), ==, btopr(len));
4524219089Spjd
4525219089Spjd	/*
4526219089Spjd	 * Can't push pages past end-of-file.
4527219089Spjd	 */
4528219089Spjd	if (off >= zp->z_size) {
4529219089Spjd		/* ignore all pages */
4530219089Spjd		err = 0;
4531219089Spjd		goto out;
4532219089Spjd	} else if (off + len > zp->z_size) {
4533219089Spjd		int npages = btopr(zp->z_size - off);
4534219089Spjd		page_t *trunc;
4535219089Spjd
4536219089Spjd		page_list_break(&pp, &trunc, npages);
4537219089Spjd		/* ignore pages past end of file */
4538219089Spjd		if (trunc)
4539219089Spjd			pvn_write_done(trunc, flags);
4540219089Spjd		len = zp->z_size - off;
4541219089Spjd	}
4542219089Spjd
4543219089Spjd	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
4544219089Spjd	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
4545249195Smm		err = SET_ERROR(EDQUOT);
4546219089Spjd		goto out;
4547219089Spjd	}
4548219089Spjd	tx = dmu_tx_create(zfsvfs->z_os);
4549219089Spjd	dmu_tx_hold_write(tx, zp->z_id, off, len);
4550219089Spjd
4551219089Spjd	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4552219089Spjd	zfs_sa_upgrade_txholds(tx, zp);
4553258720Savg	err = dmu_tx_assign(tx, TXG_WAIT);
4554219089Spjd	if (err != 0) {
4555219089Spjd		dmu_tx_abort(tx);
4556219089Spjd		goto out;
4557219089Spjd	}
4558219089Spjd
4559219089Spjd	if (zp->z_blksz <= PAGESIZE) {
4560219089Spjd		caddr_t va = zfs_map_page(pp, S_READ);
4561219089Spjd		ASSERT3U(len, <=, PAGESIZE);
4562219089Spjd		dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
4563219089Spjd		zfs_unmap_page(pp, va);
4564219089Spjd	} else {
4565219089Spjd		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
4566219089Spjd	}
4567219089Spjd
4568219089Spjd	if (err == 0) {
4569219089Spjd		uint64_t mtime[2], ctime[2];
4570219089Spjd		sa_bulk_attr_t bulk[3];
4571219089Spjd		int count = 0;
4572219089Spjd
4573219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
4574219089Spjd		    &mtime, 16);
4575219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
4576219089Spjd		    &ctime, 16);
4577219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
4578219089Spjd		    &zp->z_pflags, 8);
4579219089Spjd		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
4580219089Spjd		    B_TRUE);
4581219089Spjd		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
4582219089Spjd	}
4583219089Spjd	dmu_tx_commit(tx);
4584219089Spjd
4585219089Spjdout:
4586219089Spjd	pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
4587219089Spjd	if (offp)
4588219089Spjd		*offp = off;
4589219089Spjd	if (lenp)
4590219089Spjd		*lenp = len;
4591219089Spjd
4592219089Spjd	return (err);
4593219089Spjd}
4594219089Spjd
4595219089Spjd/*
4596219089Spjd * Copy the portion of the file indicated from pages into the file.
4597219089Spjd * The pages are stored in a page list attached to the files vnode.
4598219089Spjd *
4599219089Spjd *	IN:	vp	- vnode of file to push page data to.
4600219089Spjd *		off	- position in file to put data.
4601219089Spjd *		len	- amount of data to write.
4602219089Spjd *		flags	- flags to control the operation.
4603219089Spjd *		cr	- credentials of caller.
4604219089Spjd *		ct	- caller context.
4605219089Spjd *
4606251631Sdelphij *	RETURN:	0 on success, error code on failure.
4607219089Spjd *
4608219089Spjd * Timestamps:
4609219089Spjd *	vp - ctime|mtime updated
4610219089Spjd */
4611185029Spjd/*ARGSUSED*/
4612219089Spjdstatic int
4613219089Spjdzfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
4614219089Spjd    caller_context_t *ct)
4615219089Spjd{
4616219089Spjd	znode_t		*zp = VTOZ(vp);
4617219089Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4618219089Spjd	page_t		*pp;
4619219089Spjd	size_t		io_len;
4620219089Spjd	u_offset_t	io_off;
4621219089Spjd	uint_t		blksz;
4622219089Spjd	rl_t		*rl;
4623219089Spjd	int		error = 0;
4624219089Spjd
4625219089Spjd	ZFS_ENTER(zfsvfs);
4626219089Spjd	ZFS_VERIFY_ZP(zp);
4627219089Spjd
4628219089Spjd	/*
4629219089Spjd	 * Align this request to the file block size in case we kluster.
4630219089Spjd	 * XXX - this can result in pretty aggresive locking, which can
4631219089Spjd	 * impact simultanious read/write access.  One option might be
4632219089Spjd	 * to break up long requests (len == 0) into block-by-block
4633219089Spjd	 * operations to get narrower locking.
4634219089Spjd	 */
4635219089Spjd	blksz = zp->z_blksz;
4636219089Spjd	if (ISP2(blksz))
4637219089Spjd		io_off = P2ALIGN_TYPED(off, blksz, u_offset_t);
4638219089Spjd	else
4639219089Spjd		io_off = 0;
4640219089Spjd	if (len > 0 && ISP2(blksz))
4641219089Spjd		io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
4642219089Spjd	else
4643219089Spjd		io_len = 0;
4644219089Spjd
4645219089Spjd	if (io_len == 0) {
4646219089Spjd		/*
4647219089Spjd		 * Search the entire vp list for pages >= io_off.
4648219089Spjd		 */
4649219089Spjd		rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER);
4650219089Spjd		error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
4651219089Spjd		goto out;
4652219089Spjd	}
4653219089Spjd	rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER);
4654219089Spjd
4655219089Spjd	if (off > zp->z_size) {
4656219089Spjd		/* past end of file */
4657219089Spjd		zfs_range_unlock(rl);
4658219089Spjd		ZFS_EXIT(zfsvfs);
4659219089Spjd		return (0);
4660219089Spjd	}
4661219089Spjd
4662219089Spjd	len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off);
4663219089Spjd
4664219089Spjd	for (off = io_off; io_off < off + len; io_off += io_len) {
4665219089Spjd		if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
4666219089Spjd			pp = page_lookup(vp, io_off,
4667219089Spjd			    (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
4668219089Spjd		} else {
4669219089Spjd			pp = page_lookup_nowait(vp, io_off,
4670219089Spjd			    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
4671219089Spjd		}
4672219089Spjd
4673219089Spjd		if (pp != NULL && pvn_getdirty(pp, flags)) {
4674219089Spjd			int err;
4675219089Spjd
4676219089Spjd			/*
4677219089Spjd			 * Found a dirty page to push
4678219089Spjd			 */
4679219089Spjd			err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr);
4680219089Spjd			if (err)
4681219089Spjd				error = err;
4682219089Spjd		} else {
4683219089Spjd			io_len = PAGESIZE;
4684219089Spjd		}
4685219089Spjd	}
4686219089Spjdout:
4687219089Spjd	zfs_range_unlock(rl);
4688219089Spjd	if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4689219089Spjd		zil_commit(zfsvfs->z_log, zp->z_id);
4690219089Spjd	ZFS_EXIT(zfsvfs);
4691219089Spjd	return (error);
4692219089Spjd}
4693219089Spjd#endif	/* sun */
4694219089Spjd
4695219089Spjd/*ARGSUSED*/
4696168962Spjdvoid
4697185029Spjdzfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4698168404Spjd{
4699168962Spjd	znode_t	*zp = VTOZ(vp);
4700168962Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4701168962Spjd	int error;
4702168404Spjd
4703185029Spjd	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4704219089Spjd	if (zp->z_sa_hdl == NULL) {
4705185029Spjd		/*
4706185029Spjd		 * The fs has been unmounted, or we did a
4707185029Spjd		 * suspend/resume and this file no longer exists.
4708185029Spjd		 */
4709243520Savg		rw_exit(&zfsvfs->z_teardown_inactive_lock);
4710234607Strasz		vrecycle(vp);
4711243520Savg		return;
4712243520Savg	}
4713243520Savg
4714243520Savg	mutex_enter(&zp->z_lock);
4715243520Savg	if (zp->z_unlinked) {
4716243520Savg		/*
4717243520Savg		 * Fast path to recycle a vnode of a removed file.
4718243520Savg		 */
4719243520Savg		mutex_exit(&zp->z_lock);
4720185029Spjd		rw_exit(&zfsvfs->z_teardown_inactive_lock);
4721243520Savg		vrecycle(vp);
4722168962Spjd		return;
4723168404Spjd	}
4724243520Savg	mutex_exit(&zp->z_lock);
4725168404Spjd
4726168404Spjd	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
4727168404Spjd		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
4728168404Spjd
4729219089Spjd		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4730219089Spjd		zfs_sa_upgrade_txholds(tx, zp);
4731168404Spjd		error = dmu_tx_assign(tx, TXG_WAIT);
4732168404Spjd		if (error) {
4733168404Spjd			dmu_tx_abort(tx);
4734168404Spjd		} else {
4735168404Spjd			mutex_enter(&zp->z_lock);
4736219089Spjd			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
4737219089Spjd			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
4738168404Spjd			zp->z_atime_dirty = 0;
4739168404Spjd			mutex_exit(&zp->z_lock);
4740168404Spjd			dmu_tx_commit(tx);
4741168404Spjd		}
4742168404Spjd	}
4743185029Spjd	rw_exit(&zfsvfs->z_teardown_inactive_lock);
4744168404Spjd}
4745168404Spjd
4746219089Spjd#ifdef sun
4747219089Spjd/*
4748219089Spjd * Bounds-check the seek operation.
4749219089Spjd *
4750219089Spjd *	IN:	vp	- vnode seeking within
4751219089Spjd *		ooff	- old file offset
4752219089Spjd *		noffp	- pointer to new file offset
4753219089Spjd *		ct	- caller context
4754219089Spjd *
4755251631Sdelphij *	RETURN:	0 on success, EINVAL if new offset invalid.
4756219089Spjd */
4757219089Spjd/* ARGSUSED */
4758219089Spjdstatic int
4759219089Spjdzfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
4760219089Spjd    caller_context_t *ct)
4761219089Spjd{
4762219089Spjd	if (vp->v_type == VDIR)
4763219089Spjd		return (0);
4764219089Spjd	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
4765219089Spjd}
4766219089Spjd
4767219089Spjd/*
4768219089Spjd * Pre-filter the generic locking function to trap attempts to place
4769219089Spjd * a mandatory lock on a memory mapped file.
4770219089Spjd */
4771219089Spjdstatic int
4772219089Spjdzfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
4773219089Spjd    flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct)
4774219089Spjd{
4775219089Spjd	znode_t *zp = VTOZ(vp);
4776219089Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4777219089Spjd
4778219089Spjd	ZFS_ENTER(zfsvfs);
4779219089Spjd	ZFS_VERIFY_ZP(zp);
4780219089Spjd
4781219089Spjd	/*
4782219089Spjd	 * We are following the UFS semantics with respect to mapcnt
4783219089Spjd	 * here: If we see that the file is mapped already, then we will
4784219089Spjd	 * return an error, but we don't worry about races between this
4785219089Spjd	 * function and zfs_map().
4786219089Spjd	 */
4787219089Spjd	if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) {
4788219089Spjd		ZFS_EXIT(zfsvfs);
4789249195Smm		return (SET_ERROR(EAGAIN));
4790219089Spjd	}
4791219089Spjd	ZFS_EXIT(zfsvfs);
4792219089Spjd	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
4793219089Spjd}
4794219089Spjd
4795219089Spjd/*
4796219089Spjd * If we can't find a page in the cache, we will create a new page
4797219089Spjd * and fill it with file data.  For efficiency, we may try to fill
4798219089Spjd * multiple pages at once (klustering) to fill up the supplied page
4799219089Spjd * list.  Note that the pages to be filled are held with an exclusive
4800219089Spjd * lock to prevent access by other threads while they are being filled.
4801219089Spjd */
4802219089Spjdstatic int
4803219089Spjdzfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
4804219089Spjd    caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
4805219089Spjd{
4806219089Spjd	znode_t *zp = VTOZ(vp);
4807219089Spjd	page_t *pp, *cur_pp;
4808219089Spjd	objset_t *os = zp->z_zfsvfs->z_os;
4809219089Spjd	u_offset_t io_off, total;
4810219089Spjd	size_t io_len;
4811219089Spjd	int err;
4812219089Spjd
4813219089Spjd	if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
4814219089Spjd		/*
4815219089Spjd		 * We only have a single page, don't bother klustering
4816219089Spjd		 */
4817219089Spjd		io_off = off;
4818219089Spjd		io_len = PAGESIZE;
4819219089Spjd		pp = page_create_va(vp, io_off, io_len,
4820219089Spjd		    PG_EXCL | PG_WAIT, seg, addr);
4821219089Spjd	} else {
4822219089Spjd		/*
4823219089Spjd		 * Try to find enough pages to fill the page list
4824219089Spjd		 */
4825219089Spjd		pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
4826219089Spjd		    &io_len, off, plsz, 0);
4827219089Spjd	}
4828219089Spjd	if (pp == NULL) {
4829219089Spjd		/*
4830219089Spjd		 * The page already exists, nothing to do here.
4831219089Spjd		 */
4832219089Spjd		*pl = NULL;
4833219089Spjd		return (0);
4834219089Spjd	}
4835219089Spjd
4836219089Spjd	/*
4837219089Spjd	 * Fill the pages in the kluster.
4838219089Spjd	 */
4839219089Spjd	cur_pp = pp;
4840219089Spjd	for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
4841219089Spjd		caddr_t va;
4842219089Spjd
4843219089Spjd		ASSERT3U(io_off, ==, cur_pp->p_offset);
4844219089Spjd		va = zfs_map_page(cur_pp, S_WRITE);
4845219089Spjd		err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
4846219089Spjd		    DMU_READ_PREFETCH);
4847219089Spjd		zfs_unmap_page(cur_pp, va);
4848219089Spjd		if (err) {
4849219089Spjd			/* On error, toss the entire kluster */
4850219089Spjd			pvn_read_done(pp, B_ERROR);
4851219089Spjd			/* convert checksum errors into IO errors */
4852219089Spjd			if (err == ECKSUM)
4853249195Smm				err = SET_ERROR(EIO);
4854219089Spjd			return (err);
4855219089Spjd		}
4856219089Spjd		cur_pp = cur_pp->p_next;
4857219089Spjd	}
4858219089Spjd
4859219089Spjd	/*
4860219089Spjd	 * Fill in the page list array from the kluster starting
4861219089Spjd	 * from the desired offset `off'.
4862219089Spjd	 * NOTE: the page list will always be null terminated.
4863219089Spjd	 */
4864219089Spjd	pvn_plist_init(pp, pl, plsz, off, io_len, rw);
4865219089Spjd	ASSERT(pl == NULL || (*pl)->p_offset == off);
4866219089Spjd
4867219089Spjd	return (0);
4868219089Spjd}
4869219089Spjd
4870219089Spjd/*
4871219089Spjd * Return pointers to the pages for the file region [off, off + len]
4872219089Spjd * in the pl array.  If plsz is greater than len, this function may
4873219089Spjd * also return page pointers from after the specified region
4874219089Spjd * (i.e. the region [off, off + plsz]).  These additional pages are
4875219089Spjd * only returned if they are already in the cache, or were created as
4876219089Spjd * part of a klustered read.
4877219089Spjd *
4878219089Spjd *	IN:	vp	- vnode of file to get data from.
4879219089Spjd *		off	- position in file to get data from.
4880219089Spjd *		len	- amount of data to retrieve.
4881219089Spjd *		plsz	- length of provided page list.
4882219089Spjd *		seg	- segment to obtain pages for.
4883219089Spjd *		addr	- virtual address of fault.
4884219089Spjd *		rw	- mode of created pages.
4885219089Spjd *		cr	- credentials of caller.
4886219089Spjd *		ct	- caller context.
4887219089Spjd *
4888219089Spjd *	OUT:	protp	- protection mode of created pages.
4889219089Spjd *		pl	- list of pages created.
4890219089Spjd *
4891251631Sdelphij *	RETURN:	0 on success, error code on failure.
4892219089Spjd *
4893219089Spjd * Timestamps:
4894219089Spjd *	vp - atime updated
4895219089Spjd */
4896219089Spjd/* ARGSUSED */
4897219089Spjdstatic int
4898219089Spjdzfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
4899251631Sdelphij    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
4900251631Sdelphij    enum seg_rw rw, cred_t *cr, caller_context_t *ct)
4901219089Spjd{
4902219089Spjd	znode_t		*zp = VTOZ(vp);
4903219089Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4904219089Spjd	page_t		**pl0 = pl;
4905219089Spjd	int		err = 0;
4906219089Spjd
4907219089Spjd	/* we do our own caching, faultahead is unnecessary */
4908219089Spjd	if (pl == NULL)
4909219089Spjd		return (0);
4910219089Spjd	else if (len > plsz)
4911219089Spjd		len = plsz;
4912219089Spjd	else
4913219089Spjd		len = P2ROUNDUP(len, PAGESIZE);
4914219089Spjd	ASSERT(plsz >= len);
4915219089Spjd
4916219089Spjd	ZFS_ENTER(zfsvfs);
4917219089Spjd	ZFS_VERIFY_ZP(zp);
4918219089Spjd
4919219089Spjd	if (protp)
4920219089Spjd		*protp = PROT_ALL;
4921219089Spjd
4922219089Spjd	/*
4923219089Spjd	 * Loop through the requested range [off, off + len) looking
4924219089Spjd	 * for pages.  If we don't find a page, we will need to create
4925219089Spjd	 * a new page and fill it with data from the file.
4926219089Spjd	 */
4927219089Spjd	while (len > 0) {
4928219089Spjd		if (*pl = page_lookup(vp, off, SE_SHARED))
4929219089Spjd			*(pl+1) = NULL;
4930219089Spjd		else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw))
4931219089Spjd			goto out;
4932219089Spjd		while (*pl) {
4933219089Spjd			ASSERT3U((*pl)->p_offset, ==, off);
4934219089Spjd			off += PAGESIZE;
4935219089Spjd			addr += PAGESIZE;
4936219089Spjd			if (len > 0) {
4937219089Spjd				ASSERT3U(len, >=, PAGESIZE);
4938219089Spjd				len -= PAGESIZE;
4939219089Spjd			}
4940219089Spjd			ASSERT3U(plsz, >=, PAGESIZE);
4941219089Spjd			plsz -= PAGESIZE;
4942219089Spjd			pl++;
4943219089Spjd		}
4944219089Spjd	}
4945219089Spjd
4946219089Spjd	/*
4947219089Spjd	 * Fill out the page array with any pages already in the cache.
4948219089Spjd	 */
4949219089Spjd	while (plsz > 0 &&
4950219089Spjd	    (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) {
4951219089Spjd			off += PAGESIZE;
4952219089Spjd			plsz -= PAGESIZE;
4953219089Spjd	}
4954219089Spjdout:
4955219089Spjd	if (err) {
4956219089Spjd		/*
4957219089Spjd		 * Release any pages we have previously locked.
4958219089Spjd		 */
4959219089Spjd		while (pl > pl0)
4960219089Spjd			page_unlock(*--pl);
4961219089Spjd	} else {
4962219089Spjd		ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4963219089Spjd	}
4964219089Spjd
4965219089Spjd	*pl = NULL;
4966219089Spjd
4967219089Spjd	ZFS_EXIT(zfsvfs);
4968219089Spjd	return (err);
4969219089Spjd}
4970219089Spjd
4971219089Spjd/*
4972219089Spjd * Request a memory map for a section of a file.  This code interacts
4973219089Spjd * with common code and the VM system as follows:
4974219089Spjd *
4975251631Sdelphij * - common code calls mmap(), which ends up in smmap_common()
4976251631Sdelphij * - this calls VOP_MAP(), which takes you into (say) zfs
4977251631Sdelphij * - zfs_map() calls as_map(), passing segvn_create() as the callback
4978251631Sdelphij * - segvn_create() creates the new segment and calls VOP_ADDMAP()
4979251631Sdelphij * - zfs_addmap() updates z_mapcnt
4980219089Spjd */
4981219089Spjd/*ARGSUSED*/
4982219089Spjdstatic int
4983219089Spjdzfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
4984219089Spjd    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4985219089Spjd    caller_context_t *ct)
4986219089Spjd{
4987219089Spjd	znode_t *zp = VTOZ(vp);
4988219089Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4989219089Spjd	segvn_crargs_t	vn_a;
4990219089Spjd	int		error;
4991219089Spjd
4992219089Spjd	ZFS_ENTER(zfsvfs);
4993219089Spjd	ZFS_VERIFY_ZP(zp);
4994219089Spjd
4995219089Spjd	if ((prot & PROT_WRITE) && (zp->z_pflags &
4996219089Spjd	    (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
4997219089Spjd		ZFS_EXIT(zfsvfs);
4998249195Smm		return (SET_ERROR(EPERM));
4999219089Spjd	}
5000219089Spjd
5001219089Spjd	if ((prot & (PROT_READ | PROT_EXEC)) &&
5002219089Spjd	    (zp->z_pflags & ZFS_AV_QUARANTINED)) {
5003219089Spjd		ZFS_EXIT(zfsvfs);
5004249195Smm		return (SET_ERROR(EACCES));
5005219089Spjd	}
5006219089Spjd
5007219089Spjd	if (vp->v_flag & VNOMAP) {
5008219089Spjd		ZFS_EXIT(zfsvfs);
5009249195Smm		return (SET_ERROR(ENOSYS));
5010219089Spjd	}
5011219089Spjd
5012219089Spjd	if (off < 0 || len > MAXOFFSET_T - off) {
5013219089Spjd		ZFS_EXIT(zfsvfs);
5014249195Smm		return (SET_ERROR(ENXIO));
5015219089Spjd	}
5016219089Spjd
5017219089Spjd	if (vp->v_type != VREG) {
5018219089Spjd		ZFS_EXIT(zfsvfs);
5019249195Smm		return (SET_ERROR(ENODEV));
5020219089Spjd	}
5021219089Spjd
5022219089Spjd	/*
5023219089Spjd	 * If file is locked, disallow mapping.
5024219089Spjd	 */
5025219089Spjd	if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) {
5026219089Spjd		ZFS_EXIT(zfsvfs);
5027249195Smm		return (SET_ERROR(EAGAIN));
5028219089Spjd	}
5029219089Spjd
5030219089Spjd	as_rangelock(as);
5031219089Spjd	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
5032219089Spjd	if (error != 0) {
5033219089Spjd		as_rangeunlock(as);
5034219089Spjd		ZFS_EXIT(zfsvfs);
5035219089Spjd		return (error);
5036219089Spjd	}
5037219089Spjd
5038219089Spjd	vn_a.vp = vp;
5039219089Spjd	vn_a.offset = (u_offset_t)off;
5040219089Spjd	vn_a.type = flags & MAP_TYPE;
5041219089Spjd	vn_a.prot = prot;
5042219089Spjd	vn_a.maxprot = maxprot;
5043219089Spjd	vn_a.cred = cr;
5044219089Spjd	vn_a.amp = NULL;
5045219089Spjd	vn_a.flags = flags & ~MAP_TYPE;
5046219089Spjd	vn_a.szc = 0;
5047219089Spjd	vn_a.lgrp_mem_policy_flags = 0;
5048219089Spjd
5049219089Spjd	error = as_map(as, *addrp, len, segvn_create, &vn_a);
5050219089Spjd
5051219089Spjd	as_rangeunlock(as);
5052219089Spjd	ZFS_EXIT(zfsvfs);
5053219089Spjd	return (error);
5054219089Spjd}
5055219089Spjd
5056219089Spjd/* ARGSUSED */
5057219089Spjdstatic int
5058219089Spjdzfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
5059219089Spjd    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
5060219089Spjd    caller_context_t *ct)
5061219089Spjd{
5062219089Spjd	uint64_t pages = btopr(len);
5063219089Spjd
5064219089Spjd	atomic_add_64(&VTOZ(vp)->z_mapcnt, pages);
5065219089Spjd	return (0);
5066219089Spjd}
5067219089Spjd
5068219089Spjd/*
5069219089Spjd * The reason we push dirty pages as part of zfs_delmap() is so that we get a
5070219089Spjd * more accurate mtime for the associated file.  Since we don't have a way of
5071219089Spjd * detecting when the data was actually modified, we have to resort to
5072219089Spjd * heuristics.  If an explicit msync() is done, then we mark the mtime when the
5073219089Spjd * last page is pushed.  The problem occurs when the msync() call is omitted,
5074219089Spjd * which by far the most common case:
5075219089Spjd *
5076268464Sdelphij *	open()
5077268464Sdelphij *	mmap()
5078268464Sdelphij *	<modify memory>
5079268464Sdelphij *	munmap()
5080268464Sdelphij *	close()
5081268464Sdelphij *	<time lapse>
5082268464Sdelphij *	putpage() via fsflush
5083219089Spjd *
5084219089Spjd * If we wait until fsflush to come along, we can have a modification time that
5085219089Spjd * is some arbitrary point in the future.  In order to prevent this in the
5086219089Spjd * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is
5087219089Spjd * torn down.
5088219089Spjd */
5089219089Spjd/* ARGSUSED */
5090219089Spjdstatic int
5091219089Spjdzfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
5092219089Spjd    size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
5093219089Spjd    caller_context_t *ct)
5094219089Spjd{
5095219089Spjd	uint64_t pages = btopr(len);
5096219089Spjd
5097219089Spjd	ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages);
5098219089Spjd	atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages);
5099219089Spjd
5100219089Spjd	if ((flags & MAP_SHARED) && (prot & PROT_WRITE) &&
5101219089Spjd	    vn_has_cached_data(vp))
5102219089Spjd		(void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct);
5103219089Spjd
5104219089Spjd	return (0);
5105219089Spjd}
5106219089Spjd
5107219089Spjd/*
5108219089Spjd * Free or allocate space in a file.  Currently, this function only
5109219089Spjd * supports the `F_FREESP' command.  However, this command is somewhat
5110219089Spjd * misnamed, as its functionality includes the ability to allocate as
5111219089Spjd * well as free space.
5112219089Spjd *
5113219089Spjd *	IN:	vp	- vnode of file to free data in.
5114219089Spjd *		cmd	- action to take (only F_FREESP supported).
5115219089Spjd *		bfp	- section of file to free/alloc.
5116219089Spjd *		flag	- current file open mode flags.
5117219089Spjd *		offset	- current file offset.
5118219089Spjd *		cr	- credentials of caller [UNUSED].
5119219089Spjd *		ct	- caller context.
5120219089Spjd *
5121251631Sdelphij *	RETURN:	0 on success, error code on failure.
5122219089Spjd *
5123219089Spjd * Timestamps:
5124219089Spjd *	vp - ctime|mtime updated
5125219089Spjd */
5126219089Spjd/* ARGSUSED */
5127219089Spjdstatic int
5128219089Spjdzfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag,
5129219089Spjd    offset_t offset, cred_t *cr, caller_context_t *ct)
5130219089Spjd{
5131219089Spjd	znode_t		*zp = VTOZ(vp);
5132219089Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
5133219089Spjd	uint64_t	off, len;
5134219089Spjd	int		error;
5135219089Spjd
5136219089Spjd	ZFS_ENTER(zfsvfs);
5137219089Spjd	ZFS_VERIFY_ZP(zp);
5138219089Spjd
5139219089Spjd	if (cmd != F_FREESP) {
5140219089Spjd		ZFS_EXIT(zfsvfs);
5141249195Smm		return (SET_ERROR(EINVAL));
5142219089Spjd	}
5143219089Spjd
5144262990Sdelphij	/*
5145262990Sdelphij	 * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our
5146262990Sdelphij	 * callers might not be able to detect properly that we are read-only,
5147262990Sdelphij	 * so check it explicitly here.
5148262990Sdelphij	 */
5149262990Sdelphij	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
5150262990Sdelphij		ZFS_EXIT(zfsvfs);
5151262990Sdelphij		return (SET_ERROR(EROFS));
5152262990Sdelphij	}
5153262990Sdelphij
5154219089Spjd	if (error = convoff(vp, bfp, 0, offset)) {
5155219089Spjd		ZFS_EXIT(zfsvfs);
5156219089Spjd		return (error);
5157219089Spjd	}
5158219089Spjd
5159219089Spjd	if (bfp->l_len < 0) {
5160219089Spjd		ZFS_EXIT(zfsvfs);
5161249195Smm		return (SET_ERROR(EINVAL));
5162219089Spjd	}
5163219089Spjd
5164219089Spjd	off = bfp->l_start;
5165219089Spjd	len = bfp->l_len; /* 0 means from off to end of file */
5166219089Spjd
5167219089Spjd	error = zfs_freesp(zp, off, len, flag, TRUE);
5168219089Spjd
5169219089Spjd	ZFS_EXIT(zfsvfs);
5170219089Spjd	return (error);
5171219089Spjd}
5172219089Spjd#endif	/* sun */
5173219089Spjd
5174168404SpjdCTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
5175168404SpjdCTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
5176168404Spjd
5177185029Spjd/*ARGSUSED*/
5178168404Spjdstatic int
5179185029Spjdzfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
5180168404Spjd{
5181168404Spjd	znode_t		*zp = VTOZ(vp);
5182168404Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
5183185029Spjd	uint32_t	gen;
5184219089Spjd	uint64_t	gen64;
5185168404Spjd	uint64_t	object = zp->z_id;
5186168404Spjd	zfid_short_t	*zfid;
5187219089Spjd	int		size, i, error;
5188168404Spjd
5189168404Spjd	ZFS_ENTER(zfsvfs);
5190185029Spjd	ZFS_VERIFY_ZP(zp);
5191168404Spjd
5192219089Spjd	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
5193219089Spjd	    &gen64, sizeof (uint64_t))) != 0) {
5194219089Spjd		ZFS_EXIT(zfsvfs);
5195219089Spjd		return (error);
5196219089Spjd	}
5197219089Spjd
5198219089Spjd	gen = (uint32_t)gen64;
5199219089Spjd
5200168404Spjd	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
5201249195Smm
5202249195Smm#ifdef illumos
5203249195Smm	if (fidp->fid_len < size) {
5204249195Smm		fidp->fid_len = size;
5205249195Smm		ZFS_EXIT(zfsvfs);
5206249195Smm		return (SET_ERROR(ENOSPC));
5207249195Smm	}
5208249195Smm#else
5209168404Spjd	fidp->fid_len = size;
5210249195Smm#endif
5211168404Spjd
5212168404Spjd	zfid = (zfid_short_t *)fidp;
5213168404Spjd
5214168404Spjd	zfid->zf_len = size;
5215168404Spjd
5216168404Spjd	for (i = 0; i < sizeof (zfid->zf_object); i++)
5217168404Spjd		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
5218168404Spjd
5219168404Spjd	/* Must have a non-zero generation number to distinguish from .zfs */
5220168404Spjd	if (gen == 0)
5221168404Spjd		gen = 1;
5222168404Spjd	for (i = 0; i < sizeof (zfid->zf_gen); i++)
5223168404Spjd		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
5224168404Spjd
5225168404Spjd	if (size == LONG_FID_LEN) {
5226168404Spjd		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
5227169023Spjd		zfid_long_t	*zlfid;
5228168404Spjd
5229168404Spjd		zlfid = (zfid_long_t *)fidp;
5230168404Spjd
5231168404Spjd		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
5232168404Spjd			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
5233168404Spjd
5234168404Spjd		/* XXX - this should be the generation number for the objset */
5235168404Spjd		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
5236168404Spjd			zlfid->zf_setgen[i] = 0;
5237168404Spjd	}
5238168404Spjd
5239168404Spjd	ZFS_EXIT(zfsvfs);
5240168404Spjd	return (0);
5241168404Spjd}
5242168404Spjd
5243168404Spjdstatic int
5244185029Spjdzfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
5245185029Spjd    caller_context_t *ct)
5246168404Spjd{
5247168404Spjd	znode_t		*zp, *xzp;
5248168404Spjd	zfsvfs_t	*zfsvfs;
5249168404Spjd	zfs_dirlock_t	*dl;
5250168404Spjd	int		error;
5251168404Spjd
5252168404Spjd	switch (cmd) {
5253168404Spjd	case _PC_LINK_MAX:
5254168404Spjd		*valp = INT_MAX;
5255168404Spjd		return (0);
5256168404Spjd
5257168404Spjd	case _PC_FILESIZEBITS:
5258168404Spjd		*valp = 64;
5259168404Spjd		return (0);
5260219089Spjd#ifdef sun
5261168404Spjd	case _PC_XATTR_EXISTS:
5262168404Spjd		zp = VTOZ(vp);
5263168404Spjd		zfsvfs = zp->z_zfsvfs;
5264168404Spjd		ZFS_ENTER(zfsvfs);
5265185029Spjd		ZFS_VERIFY_ZP(zp);
5266168404Spjd		*valp = 0;
5267168404Spjd		error = zfs_dirent_lock(&dl, zp, "", &xzp,
5268185029Spjd		    ZXATTR | ZEXISTS | ZSHARED, NULL, NULL);
5269168404Spjd		if (error == 0) {
5270168404Spjd			zfs_dirent_unlock(dl);
5271168404Spjd			if (!zfs_dirempty(xzp))
5272168404Spjd				*valp = 1;
5273168404Spjd			VN_RELE(ZTOV(xzp));
5274168404Spjd		} else if (error == ENOENT) {
5275168404Spjd			/*
5276168404Spjd			 * If there aren't extended attributes, it's the
5277168404Spjd			 * same as having zero of them.
5278168404Spjd			 */
5279168404Spjd			error = 0;
5280168404Spjd		}
5281168404Spjd		ZFS_EXIT(zfsvfs);
5282168404Spjd		return (error);
5283168404Spjd
5284219089Spjd	case _PC_SATTR_ENABLED:
5285219089Spjd	case _PC_SATTR_EXISTS:
5286219089Spjd		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
5287219089Spjd		    (vp->v_type == VREG || vp->v_type == VDIR);
5288219089Spjd		return (0);
5289219089Spjd
5290219089Spjd	case _PC_ACCESS_FILTERING:
5291219089Spjd		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
5292219089Spjd		    vp->v_type == VDIR;
5293219089Spjd		return (0);
5294219089Spjd
5295219089Spjd	case _PC_ACL_ENABLED:
5296219089Spjd		*valp = _ACL_ACE_ENABLED;
5297219089Spjd		return (0);
5298219089Spjd#endif	/* sun */
5299219089Spjd	case _PC_MIN_HOLE_SIZE:
5300219089Spjd		*valp = (int)SPA_MINBLOCKSIZE;
5301219089Spjd		return (0);
5302219089Spjd#ifdef sun
5303219089Spjd	case _PC_TIMESTAMP_RESOLUTION:
5304219089Spjd		/* nanosecond timestamp resolution */
5305219089Spjd		*valp = 1L;
5306219089Spjd		return (0);
5307219089Spjd#endif	/* sun */
5308168404Spjd	case _PC_ACL_EXTENDED:
5309196949Strasz		*valp = 0;
5310168404Spjd		return (0);
5311168404Spjd
5312196949Strasz	case _PC_ACL_NFS4:
5313196949Strasz		*valp = 1;
5314196949Strasz		return (0);
5315196949Strasz
5316196949Strasz	case _PC_ACL_PATH_MAX:
5317196949Strasz		*valp = ACL_MAX_ENTRIES;
5318196949Strasz		return (0);
5319196949Strasz
5320168404Spjd	default:
5321168962Spjd		return (EOPNOTSUPP);
5322168404Spjd	}
5323168404Spjd}
5324168404Spjd
5325168404Spjd/*ARGSUSED*/
5326168404Spjdstatic int
5327185029Spjdzfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
5328185029Spjd    caller_context_t *ct)
5329168404Spjd{
5330168404Spjd	znode_t *zp = VTOZ(vp);
5331168404Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5332168404Spjd	int error;
5333185029Spjd	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
5334168404Spjd
5335168404Spjd	ZFS_ENTER(zfsvfs);
5336185029Spjd	ZFS_VERIFY_ZP(zp);
5337185029Spjd	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
5338168404Spjd	ZFS_EXIT(zfsvfs);
5339168404Spjd
5340168404Spjd	return (error);
5341168404Spjd}
5342168404Spjd
5343168404Spjd/*ARGSUSED*/
5344228685Spjdint
5345185029Spjdzfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
5346185029Spjd    caller_context_t *ct)
5347168404Spjd{
5348168404Spjd	znode_t *zp = VTOZ(vp);
5349168404Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5350168404Spjd	int error;
5351185029Spjd	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
5352219089Spjd	zilog_t	*zilog = zfsvfs->z_log;
5353168404Spjd
5354168404Spjd	ZFS_ENTER(zfsvfs);
5355185029Spjd	ZFS_VERIFY_ZP(zp);
5356219089Spjd
5357185029Spjd	error = zfs_setacl(zp, vsecp, skipaclchk, cr);
5358219089Spjd
5359219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
5360219089Spjd		zil_commit(zilog, 0);
5361219089Spjd
5362168404Spjd	ZFS_EXIT(zfsvfs);
5363168404Spjd	return (error);
5364168404Spjd}
5365168404Spjd
5366219089Spjd#ifdef sun
5367219089Spjd/*
5368251631Sdelphij * The smallest read we may consider to loan out an arcbuf.
5369251631Sdelphij * This must be a power of 2.
5370219089Spjd */
5371219089Spjdint zcr_blksz_min = (1 << 10);	/* 1K */
5372251631Sdelphij/*
5373251631Sdelphij * If set to less than the file block size, allow loaning out of an
5374251631Sdelphij * arcbuf for a partial block read.  This must be a power of 2.
5375251631Sdelphij */
5376219089Spjdint zcr_blksz_max = (1 << 17);	/* 128K */
5377219089Spjd
5378219089Spjd/*ARGSUSED*/
5379168962Spjdstatic int
5380219089Spjdzfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr,
5381219089Spjd    caller_context_t *ct)
5382219089Spjd{
5383219089Spjd	znode_t	*zp = VTOZ(vp);
5384219089Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5385219089Spjd	int max_blksz = zfsvfs->z_max_blksz;
5386219089Spjd	uio_t *uio = &xuio->xu_uio;
5387219089Spjd	ssize_t size = uio->uio_resid;
5388219089Spjd	offset_t offset = uio->uio_loffset;
5389219089Spjd	int blksz;
5390219089Spjd	int fullblk, i;
5391219089Spjd	arc_buf_t *abuf;
5392219089Spjd	ssize_t maxsize;
5393219089Spjd	int preamble, postamble;
5394219089Spjd
5395219089Spjd	if (xuio->xu_type != UIOTYPE_ZEROCOPY)
5396249195Smm		return (SET_ERROR(EINVAL));
5397219089Spjd
5398219089Spjd	ZFS_ENTER(zfsvfs);
5399219089Spjd	ZFS_VERIFY_ZP(zp);
5400219089Spjd	switch (ioflag) {
5401219089Spjd	case UIO_WRITE:
5402219089Spjd		/*
5403219089Spjd		 * Loan out an arc_buf for write if write size is bigger than
5404219089Spjd		 * max_blksz, and the file's block size is also max_blksz.
5405219089Spjd		 */
5406219089Spjd		blksz = max_blksz;
5407219089Spjd		if (size < blksz || zp->z_blksz != blksz) {
5408219089Spjd			ZFS_EXIT(zfsvfs);
5409249195Smm			return (SET_ERROR(EINVAL));
5410219089Spjd		}
5411219089Spjd		/*
5412219089Spjd		 * Caller requests buffers for write before knowing where the
5413219089Spjd		 * write offset might be (e.g. NFS TCP write).
5414219089Spjd		 */
5415219089Spjd		if (offset == -1) {
5416219089Spjd			preamble = 0;
5417219089Spjd		} else {
5418219089Spjd			preamble = P2PHASE(offset, blksz);
5419219089Spjd			if (preamble) {
5420219089Spjd				preamble = blksz - preamble;
5421219089Spjd				size -= preamble;
5422219089Spjd			}
5423219089Spjd		}
5424219089Spjd
5425219089Spjd		postamble = P2PHASE(size, blksz);
5426219089Spjd		size -= postamble;
5427219089Spjd
5428219089Spjd		fullblk = size / blksz;
5429219089Spjd		(void) dmu_xuio_init(xuio,
5430219089Spjd		    (preamble != 0) + fullblk + (postamble != 0));
5431219089Spjd		DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble,
5432219089Spjd		    int, postamble, int,
5433219089Spjd		    (preamble != 0) + fullblk + (postamble != 0));
5434219089Spjd
5435219089Spjd		/*
5436219089Spjd		 * Have to fix iov base/len for partial buffers.  They
5437219089Spjd		 * currently represent full arc_buf's.
5438219089Spjd		 */
5439219089Spjd		if (preamble) {
5440219089Spjd			/* data begins in the middle of the arc_buf */
5441219089Spjd			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5442219089Spjd			    blksz);
5443219089Spjd			ASSERT(abuf);
5444219089Spjd			(void) dmu_xuio_add(xuio, abuf,
5445219089Spjd			    blksz - preamble, preamble);
5446219089Spjd		}
5447219089Spjd
5448219089Spjd		for (i = 0; i < fullblk; i++) {
5449219089Spjd			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5450219089Spjd			    blksz);
5451219089Spjd			ASSERT(abuf);
5452219089Spjd			(void) dmu_xuio_add(xuio, abuf, 0, blksz);
5453219089Spjd		}
5454219089Spjd
5455219089Spjd		if (postamble) {
5456219089Spjd			/* data ends in the middle of the arc_buf */
5457219089Spjd			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5458219089Spjd			    blksz);
5459219089Spjd			ASSERT(abuf);
5460219089Spjd			(void) dmu_xuio_add(xuio, abuf, 0, postamble);
5461219089Spjd		}
5462219089Spjd		break;
5463219089Spjd	case UIO_READ:
5464219089Spjd		/*
5465219089Spjd		 * Loan out an arc_buf for read if the read size is larger than
5466219089Spjd		 * the current file block size.  Block alignment is not
5467219089Spjd		 * considered.  Partial arc_buf will be loaned out for read.
5468219089Spjd		 */
5469219089Spjd		blksz = zp->z_blksz;
5470219089Spjd		if (blksz < zcr_blksz_min)
5471219089Spjd			blksz = zcr_blksz_min;
5472219089Spjd		if (blksz > zcr_blksz_max)
5473219089Spjd			blksz = zcr_blksz_max;
5474219089Spjd		/* avoid potential complexity of dealing with it */
5475219089Spjd		if (blksz > max_blksz) {
5476219089Spjd			ZFS_EXIT(zfsvfs);
5477249195Smm			return (SET_ERROR(EINVAL));
5478219089Spjd		}
5479219089Spjd
5480219089Spjd		maxsize = zp->z_size - uio->uio_loffset;
5481219089Spjd		if (size > maxsize)
5482219089Spjd			size = maxsize;
5483219089Spjd
5484219089Spjd		if (size < blksz || vn_has_cached_data(vp)) {
5485219089Spjd			ZFS_EXIT(zfsvfs);
5486249195Smm			return (SET_ERROR(EINVAL));
5487219089Spjd		}
5488219089Spjd		break;
5489219089Spjd	default:
5490219089Spjd		ZFS_EXIT(zfsvfs);
5491249195Smm		return (SET_ERROR(EINVAL));
5492219089Spjd	}
5493219089Spjd
5494219089Spjd	uio->uio_extflg = UIO_XUIO;
5495219089Spjd	XUIO_XUZC_RW(xuio) = ioflag;
5496219089Spjd	ZFS_EXIT(zfsvfs);
5497219089Spjd	return (0);
5498219089Spjd}
5499219089Spjd
5500219089Spjd/*ARGSUSED*/
5501219089Spjdstatic int
5502219089Spjdzfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct)
5503219089Spjd{
5504219089Spjd	int i;
5505219089Spjd	arc_buf_t *abuf;
5506219089Spjd	int ioflag = XUIO_XUZC_RW(xuio);
5507219089Spjd
5508219089Spjd	ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
5509219089Spjd
5510219089Spjd	i = dmu_xuio_cnt(xuio);
5511219089Spjd	while (i-- > 0) {
5512219089Spjd		abuf = dmu_xuio_arcbuf(xuio, i);
5513219089Spjd		/*
5514219089Spjd		 * if abuf == NULL, it must be a write buffer
5515219089Spjd		 * that has been returned in zfs_write().
5516219089Spjd		 */
5517219089Spjd		if (abuf)
5518219089Spjd			dmu_return_arcbuf(abuf);
5519219089Spjd		ASSERT(abuf || ioflag == UIO_WRITE);
5520219089Spjd	}
5521219089Spjd
5522219089Spjd	dmu_xuio_fini(xuio);
5523219089Spjd	return (0);
5524219089Spjd}
5525219089Spjd
5526219089Spjd/*
5527219089Spjd * Predeclare these here so that the compiler assumes that
5528219089Spjd * this is an "old style" function declaration that does
5529219089Spjd * not include arguments => we won't get type mismatch errors
5530219089Spjd * in the initializations that follow.
5531219089Spjd */
5532219089Spjdstatic int zfs_inval();
5533219089Spjdstatic int zfs_isdir();
5534219089Spjd
5535219089Spjdstatic int
5536219089Spjdzfs_inval()
5537219089Spjd{
5538249195Smm	return (SET_ERROR(EINVAL));
5539219089Spjd}
5540219089Spjd
5541219089Spjdstatic int
5542219089Spjdzfs_isdir()
5543219089Spjd{
5544249195Smm	return (SET_ERROR(EISDIR));
5545219089Spjd}
5546219089Spjd/*
5547219089Spjd * Directory vnode operations template
5548219089Spjd */
5549219089Spjdvnodeops_t *zfs_dvnodeops;
5550219089Spjdconst fs_operation_def_t zfs_dvnodeops_template[] = {
5551219089Spjd	VOPNAME_OPEN,		{ .vop_open = zfs_open },
5552219089Spjd	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
5553219089Spjd	VOPNAME_READ,		{ .error = zfs_isdir },
5554219089Spjd	VOPNAME_WRITE,		{ .error = zfs_isdir },
5555219089Spjd	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
5556219089Spjd	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5557219089Spjd	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
5558219089Spjd	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5559219089Spjd	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
5560219089Spjd	VOPNAME_CREATE,		{ .vop_create = zfs_create },
5561219089Spjd	VOPNAME_REMOVE,		{ .vop_remove = zfs_remove },
5562219089Spjd	VOPNAME_LINK,		{ .vop_link = zfs_link },
5563219089Spjd	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
5564219089Spjd	VOPNAME_MKDIR,		{ .vop_mkdir = zfs_mkdir },
5565219089Spjd	VOPNAME_RMDIR,		{ .vop_rmdir = zfs_rmdir },
5566219089Spjd	VOPNAME_READDIR,	{ .vop_readdir = zfs_readdir },
5567219089Spjd	VOPNAME_SYMLINK,	{ .vop_symlink = zfs_symlink },
5568219089Spjd	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
5569219089Spjd	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5570219089Spjd	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5571219089Spjd	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
5572219089Spjd	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5573219089Spjd	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
5574219089Spjd	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
5575268464Sdelphij	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5576219089Spjd	NULL,			NULL
5577219089Spjd};
5578219089Spjd
5579219089Spjd/*
5580219089Spjd * Regular file vnode operations template
5581219089Spjd */
5582219089Spjdvnodeops_t *zfs_fvnodeops;
5583219089Spjdconst fs_operation_def_t zfs_fvnodeops_template[] = {
5584219089Spjd	VOPNAME_OPEN,		{ .vop_open = zfs_open },
5585219089Spjd	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
5586219089Spjd	VOPNAME_READ,		{ .vop_read = zfs_read },
5587219089Spjd	VOPNAME_WRITE,		{ .vop_write = zfs_write },
5588219089Spjd	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
5589219089Spjd	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5590219089Spjd	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
5591219089Spjd	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5592219089Spjd	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
5593219089Spjd	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
5594219089Spjd	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
5595219089Spjd	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5596219089Spjd	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5597219089Spjd	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
5598219089Spjd	VOPNAME_FRLOCK,		{ .vop_frlock = zfs_frlock },
5599219089Spjd	VOPNAME_SPACE,		{ .vop_space = zfs_space },
5600219089Spjd	VOPNAME_GETPAGE,	{ .vop_getpage = zfs_getpage },
5601219089Spjd	VOPNAME_PUTPAGE,	{ .vop_putpage = zfs_putpage },
5602219089Spjd	VOPNAME_MAP,		{ .vop_map = zfs_map },
5603219089Spjd	VOPNAME_ADDMAP,		{ .vop_addmap = zfs_addmap },
5604219089Spjd	VOPNAME_DELMAP,		{ .vop_delmap = zfs_delmap },
5605219089Spjd	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5606219089Spjd	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
5607219089Spjd	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
5608219089Spjd	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5609268464Sdelphij	VOPNAME_REQZCBUF,	{ .vop_reqzcbuf = zfs_reqzcbuf },
5610268464Sdelphij	VOPNAME_RETZCBUF,	{ .vop_retzcbuf = zfs_retzcbuf },
5611219089Spjd	NULL,			NULL
5612219089Spjd};
5613219089Spjd
5614219089Spjd/*
5615219089Spjd * Symbolic link vnode operations template
5616219089Spjd */
5617219089Spjdvnodeops_t *zfs_symvnodeops;
5618219089Spjdconst fs_operation_def_t zfs_symvnodeops_template[] = {
5619219089Spjd	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5620219089Spjd	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
5621219089Spjd	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5622219089Spjd	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
5623219089Spjd	VOPNAME_READLINK,	{ .vop_readlink = zfs_readlink },
5624219089Spjd	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5625219089Spjd	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5626219089Spjd	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5627219089Spjd	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5628219089Spjd	NULL,			NULL
5629219089Spjd};
5630219089Spjd
5631219089Spjd/*
5632219089Spjd * special share hidden files vnode operations template
5633219089Spjd */
5634219089Spjdvnodeops_t *zfs_sharevnodeops;
5635219089Spjdconst fs_operation_def_t zfs_sharevnodeops_template[] = {
5636219089Spjd	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5637219089Spjd	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5638219089Spjd	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5639219089Spjd	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5640219089Spjd	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5641219089Spjd	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
5642219089Spjd	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
5643219089Spjd	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5644219089Spjd	NULL,			NULL
5645219089Spjd};
5646219089Spjd
5647219089Spjd/*
5648219089Spjd * Extended attribute directory vnode operations template
5649251631Sdelphij *
5650251631Sdelphij * This template is identical to the directory vnodes
5651251631Sdelphij * operation template except for restricted operations:
5652251631Sdelphij *	VOP_MKDIR()
5653251631Sdelphij *	VOP_SYMLINK()
5654251631Sdelphij *
5655219089Spjd * Note that there are other restrictions embedded in:
5656219089Spjd *	zfs_create()	- restrict type to VREG
5657219089Spjd *	zfs_link()	- no links into/out of attribute space
5658219089Spjd *	zfs_rename()	- no moves into/out of attribute space
5659219089Spjd */
5660219089Spjdvnodeops_t *zfs_xdvnodeops;
5661219089Spjdconst fs_operation_def_t zfs_xdvnodeops_template[] = {
5662219089Spjd	VOPNAME_OPEN,		{ .vop_open = zfs_open },
5663219089Spjd	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
5664219089Spjd	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
5665219089Spjd	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5666219089Spjd	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
5667219089Spjd	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5668219089Spjd	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
5669219089Spjd	VOPNAME_CREATE,		{ .vop_create = zfs_create },
5670219089Spjd	VOPNAME_REMOVE,		{ .vop_remove = zfs_remove },
5671219089Spjd	VOPNAME_LINK,		{ .vop_link = zfs_link },
5672219089Spjd	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
5673219089Spjd	VOPNAME_MKDIR,		{ .error = zfs_inval },
5674219089Spjd	VOPNAME_RMDIR,		{ .vop_rmdir = zfs_rmdir },
5675219089Spjd	VOPNAME_READDIR,	{ .vop_readdir = zfs_readdir },
5676219089Spjd	VOPNAME_SYMLINK,	{ .error = zfs_inval },
5677219089Spjd	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
5678219089Spjd	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5679219089Spjd	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5680219089Spjd	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
5681219089Spjd	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5682219089Spjd	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
5683219089Spjd	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
5684219089Spjd	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5685219089Spjd	NULL,			NULL
5686219089Spjd};
5687219089Spjd
5688219089Spjd/*
5689219089Spjd * Error vnode operations template
5690219089Spjd */
5691219089Spjdvnodeops_t *zfs_evnodeops;
5692219089Spjdconst fs_operation_def_t zfs_evnodeops_template[] = {
5693219089Spjd	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5694219089Spjd	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5695219089Spjd	NULL,			NULL
5696219089Spjd};
5697219089Spjd#endif	/* sun */
5698219089Spjd
5699219089Spjdstatic int
5700213673Spjdioflags(int ioflags)
5701213673Spjd{
5702213673Spjd	int flags = 0;
5703213673Spjd
5704213673Spjd	if (ioflags & IO_APPEND)
5705213673Spjd		flags |= FAPPEND;
5706213673Spjd	if (ioflags & IO_NDELAY)
5707213673Spjd        	flags |= FNONBLOCK;
5708213673Spjd	if (ioflags & IO_SYNC)
5709213673Spjd		flags |= (FSYNC | FDSYNC | FRSYNC);
5710213673Spjd
5711213673Spjd	return (flags);
5712213673Spjd}
5713213673Spjd
5714213673Spjdstatic int
5715213937Savgzfs_getpages(struct vnode *vp, vm_page_t *m, int count, int reqpage)
5716213937Savg{
5717213937Savg	znode_t *zp = VTOZ(vp);
5718213937Savg	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5719213937Savg	objset_t *os = zp->z_zfsvfs->z_os;
5720243517Savg	vm_page_t mfirst, mlast, mreq;
5721213937Savg	vm_object_t object;
5722213937Savg	caddr_t va;
5723213937Savg	struct sf_buf *sf;
5724243517Savg	off_t startoff, endoff;
5725213937Savg	int i, error;
5726243517Savg	vm_pindex_t reqstart, reqend;
5727243517Savg	int pcount, lsize, reqsize, size;
5728213937Savg
5729213937Savg	ZFS_ENTER(zfsvfs);
5730213937Savg	ZFS_VERIFY_ZP(zp);
5731213937Savg
5732243517Savg	pcount = OFF_TO_IDX(round_page(count));
5733213937Savg	mreq = m[reqpage];
5734213937Savg	object = mreq->object;
5735213937Savg	error = 0;
5736213937Savg
5737213937Savg	KASSERT(vp->v_object == object, ("mismatching object"));
5738213937Savg
5739243517Savg	if (pcount > 1 && zp->z_blksz > PAGESIZE) {
5740243517Savg		startoff = rounddown(IDX_TO_OFF(mreq->pindex), zp->z_blksz);
5741243517Savg		reqstart = OFF_TO_IDX(round_page(startoff));
5742243517Savg		if (reqstart < m[0]->pindex)
5743243517Savg			reqstart = 0;
5744243517Savg		else
5745243517Savg			reqstart = reqstart - m[0]->pindex;
5746243517Savg		endoff = roundup(IDX_TO_OFF(mreq->pindex) + PAGE_SIZE,
5747243517Savg		    zp->z_blksz);
5748243517Savg		reqend = OFF_TO_IDX(trunc_page(endoff)) - 1;
5749243517Savg		if (reqend > m[pcount - 1]->pindex)
5750243517Savg			reqend = m[pcount - 1]->pindex;
5751243517Savg		reqsize = reqend - m[reqstart]->pindex + 1;
5752243517Savg		KASSERT(reqstart <= reqpage && reqpage < reqstart + reqsize,
5753243517Savg		    ("reqpage beyond [reqstart, reqstart + reqsize[ bounds"));
5754243517Savg	} else {
5755243517Savg		reqstart = reqpage;
5756243517Savg		reqsize = 1;
5757243517Savg	}
5758243517Savg	mfirst = m[reqstart];
5759243517Savg	mlast = m[reqstart + reqsize - 1];
5760243517Savg
5761248084Sattilio	zfs_vmobject_wlock(object);
5762213937Savg
5763243517Savg	for (i = 0; i < reqstart; i++) {
5764243517Savg		vm_page_lock(m[i]);
5765243517Savg		vm_page_free(m[i]);
5766243517Savg		vm_page_unlock(m[i]);
5767213937Savg	}
5768243517Savg	for (i = reqstart + reqsize; i < pcount; i++) {
5769243517Savg		vm_page_lock(m[i]);
5770243517Savg		vm_page_free(m[i]);
5771243517Savg		vm_page_unlock(m[i]);
5772243517Savg	}
5773213937Savg
5774243517Savg	if (mreq->valid && reqsize == 1) {
5775213937Savg		if (mreq->valid != VM_PAGE_BITS_ALL)
5776213937Savg			vm_page_zero_invalid(mreq, TRUE);
5777248084Sattilio		zfs_vmobject_wunlock(object);
5778213937Savg		ZFS_EXIT(zfsvfs);
5779248084Sattilio		return (zfs_vm_pagerret_ok);
5780213937Savg	}
5781213937Savg
5782213937Savg	PCPU_INC(cnt.v_vnodein);
5783243517Savg	PCPU_ADD(cnt.v_vnodepgsin, reqsize);
5784213937Savg
5785213937Savg	if (IDX_TO_OFF(mreq->pindex) >= object->un_pager.vnp.vnp_size) {
5786243517Savg		for (i = reqstart; i < reqstart + reqsize; i++) {
5787243517Savg			if (i != reqpage) {
5788243517Savg				vm_page_lock(m[i]);
5789243517Savg				vm_page_free(m[i]);
5790243517Savg				vm_page_unlock(m[i]);
5791243517Savg			}
5792243517Savg		}
5793248084Sattilio		zfs_vmobject_wunlock(object);
5794213937Savg		ZFS_EXIT(zfsvfs);
5795248084Sattilio		return (zfs_vm_pagerret_bad);
5796213937Savg	}
5797213937Savg
5798243517Savg	lsize = PAGE_SIZE;
5799243517Savg	if (IDX_TO_OFF(mlast->pindex) + lsize > object->un_pager.vnp.vnp_size)
5800243517Savg		lsize = object->un_pager.vnp.vnp_size - IDX_TO_OFF(mlast->pindex);
5801213937Savg
5802248084Sattilio	zfs_vmobject_wunlock(object);
5803243517Savg
5804243517Savg	for (i = reqstart; i < reqstart + reqsize; i++) {
5805243517Savg		size = PAGE_SIZE;
5806243517Savg		if (i == (reqstart + reqsize - 1))
5807243517Savg			size = lsize;
5808243517Savg		va = zfs_map_page(m[i], &sf);
5809243517Savg		error = dmu_read(os, zp->z_id, IDX_TO_OFF(m[i]->pindex),
5810243517Savg		    size, va, DMU_READ_PREFETCH);
5811243517Savg		if (size != PAGE_SIZE)
5812243517Savg			bzero(va + size, PAGE_SIZE - size);
5813243517Savg		zfs_unmap_page(sf);
5814243517Savg		if (error != 0)
5815243517Savg			break;
5816243517Savg	}
5817243517Savg
5818248084Sattilio	zfs_vmobject_wlock(object);
5819213937Savg
5820243517Savg	for (i = reqstart; i < reqstart + reqsize; i++) {
5821243763Savg		if (!error)
5822243763Savg			m[i]->valid = VM_PAGE_BITS_ALL;
5823243517Savg		KASSERT(m[i]->dirty == 0, ("zfs_getpages: page %p is dirty", m[i]));
5824243763Savg		if (i != reqpage)
5825243763Savg			vm_page_readahead_finish(m[i]);
5826243517Savg	}
5827243517Savg
5828248084Sattilio	zfs_vmobject_wunlock(object);
5829213937Savg
5830213937Savg	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
5831213937Savg	ZFS_EXIT(zfsvfs);
5832248084Sattilio	return (error ? zfs_vm_pagerret_error : zfs_vm_pagerret_ok);
5833213937Savg}
5834213937Savg
5835213937Savgstatic int
5836213937Savgzfs_freebsd_getpages(ap)
5837213937Savg	struct vop_getpages_args /* {
5838213937Savg		struct vnode *a_vp;
5839213937Savg		vm_page_t *a_m;
5840213937Savg		int a_count;
5841213937Savg		int a_reqpage;
5842213937Savg	} */ *ap;
5843213937Savg{
5844213937Savg
5845213937Savg	return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_reqpage));
5846213937Savg}
5847213937Savg
5848213937Savgstatic int
5849258746Savgzfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
5850258746Savg    int *rtvals)
5851258746Savg{
5852258746Savg	znode_t		*zp = VTOZ(vp);
5853258746Savg	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
5854258746Savg	rl_t		*rl;
5855258746Savg	dmu_tx_t	*tx;
5856258746Savg	struct sf_buf	*sf;
5857258746Savg	vm_object_t	object;
5858258746Savg	vm_page_t	m;
5859258746Savg	caddr_t		va;
5860258746Savg	size_t		tocopy;
5861258746Savg	size_t		lo_len;
5862258746Savg	vm_ooffset_t	lo_off;
5863258746Savg	vm_ooffset_t	off;
5864258746Savg	uint_t		blksz;
5865258746Savg	int		ncount;
5866258746Savg	int		pcount;
5867258746Savg	int		err;
5868258746Savg	int		i;
5869258746Savg
5870258746Savg	ZFS_ENTER(zfsvfs);
5871258746Savg	ZFS_VERIFY_ZP(zp);
5872258746Savg
5873258746Savg	object = vp->v_object;
5874258746Savg	pcount = btoc(len);
5875258746Savg	ncount = pcount;
5876258746Savg
5877258746Savg	KASSERT(ma[0]->object == object, ("mismatching object"));
5878258746Savg	KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length"));
5879258746Savg
5880258746Savg	for (i = 0; i < pcount; i++)
5881258746Savg		rtvals[i] = zfs_vm_pagerret_error;
5882258746Savg
5883258746Savg	off = IDX_TO_OFF(ma[0]->pindex);
5884258746Savg	blksz = zp->z_blksz;
5885258746Savg	lo_off = rounddown(off, blksz);
5886258746Savg	lo_len = roundup(len + (off - lo_off), blksz);
5887258746Savg	rl = zfs_range_lock(zp, lo_off, lo_len, RL_WRITER);
5888258746Savg
5889258746Savg	zfs_vmobject_wlock(object);
5890258746Savg	if (len + off > object->un_pager.vnp.vnp_size) {
5891258746Savg		if (object->un_pager.vnp.vnp_size > off) {
5892258746Savg			int pgoff;
5893258746Savg
5894258746Savg			len = object->un_pager.vnp.vnp_size - off;
5895258746Savg			ncount = btoc(len);
5896258746Savg			if ((pgoff = (int)len & PAGE_MASK) != 0) {
5897258746Savg				/*
5898258746Savg				 * If the object is locked and the following
5899258746Savg				 * conditions hold, then the page's dirty
5900258746Savg				 * field cannot be concurrently changed by a
5901258746Savg				 * pmap operation.
5902258746Savg				 */
5903258746Savg				m = ma[ncount - 1];
5904258746Savg				vm_page_assert_sbusied(m);
5905258746Savg				KASSERT(!pmap_page_is_write_mapped(m),
5906258746Savg				    ("zfs_putpages: page %p is not read-only", m));
5907258746Savg				vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
5908258746Savg				    pgoff);
5909258746Savg			}
5910258746Savg		} else {
5911258746Savg			len = 0;
5912258746Savg			ncount = 0;
5913258746Savg		}
5914258746Savg		if (ncount < pcount) {
5915258746Savg			for (i = ncount; i < pcount; i++) {
5916258746Savg				rtvals[i] = zfs_vm_pagerret_bad;
5917258746Savg			}
5918258746Savg		}
5919258746Savg	}
5920258746Savg	zfs_vmobject_wunlock(object);
5921258746Savg
5922258746Savg	if (ncount == 0)
5923258746Savg		goto out;
5924258746Savg
5925258746Savg	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
5926258746Savg	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
5927258746Savg		goto out;
5928258746Savg	}
5929258746Savg
5930258746Savgtop:
5931258746Savg	tx = dmu_tx_create(zfsvfs->z_os);
5932258746Savg	dmu_tx_hold_write(tx, zp->z_id, off, len);
5933258746Savg
5934258746Savg	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
5935258746Savg	zfs_sa_upgrade_txholds(tx, zp);
5936258746Savg	err = dmu_tx_assign(tx, TXG_NOWAIT);
5937258746Savg	if (err != 0) {
5938258746Savg		if (err == ERESTART) {
5939258746Savg			dmu_tx_wait(tx);
5940258746Savg			dmu_tx_abort(tx);
5941258746Savg			goto top;
5942258746Savg		}
5943258746Savg		dmu_tx_abort(tx);
5944258746Savg		goto out;
5945258746Savg	}
5946258746Savg
5947258746Savg	if (zp->z_blksz < PAGE_SIZE) {
5948258746Savg		i = 0;
5949258746Savg		for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
5950258746Savg			tocopy = len > PAGE_SIZE ? PAGE_SIZE : len;
5951258746Savg			va = zfs_map_page(ma[i], &sf);
5952258746Savg			dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
5953258746Savg			zfs_unmap_page(sf);
5954258746Savg		}
5955258746Savg	} else {
5956258746Savg		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx);
5957258746Savg	}
5958258746Savg
5959258746Savg	if (err == 0) {
5960258746Savg		uint64_t mtime[2], ctime[2];
5961258746Savg		sa_bulk_attr_t bulk[3];
5962258746Savg		int count = 0;
5963258746Savg
5964258746Savg		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
5965258746Savg		    &mtime, 16);
5966258746Savg		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
5967258746Savg		    &ctime, 16);
5968258746Savg		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
5969258746Savg		    &zp->z_pflags, 8);
5970258746Savg		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
5971258746Savg		    B_TRUE);
5972258746Savg		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
5973258746Savg
5974258746Savg		zfs_vmobject_wlock(object);
5975258746Savg		for (i = 0; i < ncount; i++) {
5976258746Savg			rtvals[i] = zfs_vm_pagerret_ok;
5977258746Savg			vm_page_undirty(ma[i]);
5978258746Savg		}
5979258746Savg		zfs_vmobject_wunlock(object);
5980258746Savg		PCPU_INC(cnt.v_vnodeout);
5981258746Savg		PCPU_ADD(cnt.v_vnodepgsout, ncount);
5982258746Savg	}
5983258746Savg	dmu_tx_commit(tx);
5984258746Savg
5985258746Savgout:
5986258746Savg	zfs_range_unlock(rl);
5987258746Savg	if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 ||
5988258746Savg	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
5989258746Savg		zil_commit(zfsvfs->z_log, zp->z_id);
5990258746Savg	ZFS_EXIT(zfsvfs);
5991258746Savg	return (rtvals[0]);
5992258746Savg}
5993258746Savg
5994258746Savgint
5995258746Savgzfs_freebsd_putpages(ap)
5996258746Savg	struct vop_putpages_args /* {
5997258746Savg		struct vnode *a_vp;
5998258746Savg		vm_page_t *a_m;
5999258746Savg		int a_count;
6000258746Savg		int a_sync;
6001258746Savg		int *a_rtvals;
6002258746Savg	} */ *ap;
6003258746Savg{
6004258746Savg
6005258746Savg	return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync,
6006258746Savg	    ap->a_rtvals));
6007258746Savg}
6008258746Savg
6009258746Savgstatic int
6010243518Savgzfs_freebsd_bmap(ap)
6011243518Savg	struct vop_bmap_args /* {
6012243518Savg		struct vnode *a_vp;
6013243518Savg		daddr_t  a_bn;
6014243518Savg		struct bufobj **a_bop;
6015243518Savg		daddr_t *a_bnp;
6016243518Savg		int *a_runp;
6017243518Savg		int *a_runb;
6018243518Savg	} */ *ap;
6019243518Savg{
6020243518Savg
6021243518Savg	if (ap->a_bop != NULL)
6022243518Savg		*ap->a_bop = &ap->a_vp->v_bufobj;
6023243518Savg	if (ap->a_bnp != NULL)
6024243518Savg		*ap->a_bnp = ap->a_bn;
6025243518Savg	if (ap->a_runp != NULL)
6026243518Savg		*ap->a_runp = 0;
6027243518Savg	if (ap->a_runb != NULL)
6028243518Savg		*ap->a_runb = 0;
6029243518Savg
6030243518Savg	return (0);
6031243518Savg}
6032243518Savg
6033243518Savgstatic int
6034168962Spjdzfs_freebsd_open(ap)
6035168962Spjd	struct vop_open_args /* {
6036168962Spjd		struct vnode *a_vp;
6037168962Spjd		int a_mode;
6038168962Spjd		struct ucred *a_cred;
6039168962Spjd		struct thread *a_td;
6040168962Spjd	} */ *ap;
6041168962Spjd{
6042168962Spjd	vnode_t	*vp = ap->a_vp;
6043168962Spjd	znode_t *zp = VTOZ(vp);
6044168962Spjd	int error;
6045168962Spjd
6046185029Spjd	error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
6047168962Spjd	if (error == 0)
6048219089Spjd		vnode_create_vobject(vp, zp->z_size, ap->a_td);
6049168962Spjd	return (error);
6050168962Spjd}
6051168962Spjd
6052168962Spjdstatic int
6053168962Spjdzfs_freebsd_close(ap)
6054168962Spjd	struct vop_close_args /* {
6055168962Spjd		struct vnode *a_vp;
6056168962Spjd		int  a_fflag;
6057168962Spjd		struct ucred *a_cred;
6058168962Spjd		struct thread *a_td;
6059168962Spjd	} */ *ap;
6060168962Spjd{
6061168962Spjd
6062242566Savg	return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL));
6063168962Spjd}
6064168962Spjd
6065168962Spjdstatic int
6066168962Spjdzfs_freebsd_ioctl(ap)
6067168962Spjd	struct vop_ioctl_args /* {
6068168962Spjd		struct vnode *a_vp;
6069168962Spjd		u_long a_command;
6070168962Spjd		caddr_t a_data;
6071168962Spjd		int a_fflag;
6072168962Spjd		struct ucred *cred;
6073168962Spjd		struct thread *td;
6074168962Spjd	} */ *ap;
6075168962Spjd{
6076168962Spjd
6077168978Spjd	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
6078185029Spjd	    ap->a_fflag, ap->a_cred, NULL, NULL));
6079168962Spjd}
6080168962Spjd
6081168962Spjdstatic int
6082168962Spjdzfs_freebsd_read(ap)
6083168962Spjd	struct vop_read_args /* {
6084168962Spjd		struct vnode *a_vp;
6085168962Spjd		struct uio *a_uio;
6086168962Spjd		int a_ioflag;
6087168962Spjd		struct ucred *a_cred;
6088168962Spjd	} */ *ap;
6089168962Spjd{
6090168962Spjd
6091213673Spjd	return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
6092213673Spjd	    ap->a_cred, NULL));
6093168962Spjd}
6094168962Spjd
6095168962Spjdstatic int
6096168962Spjdzfs_freebsd_write(ap)
6097168962Spjd	struct vop_write_args /* {
6098168962Spjd		struct vnode *a_vp;
6099168962Spjd		struct uio *a_uio;
6100168962Spjd		int a_ioflag;
6101168962Spjd		struct ucred *a_cred;
6102168962Spjd	} */ *ap;
6103168962Spjd{
6104168962Spjd
6105213673Spjd	return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
6106213673Spjd	    ap->a_cred, NULL));
6107168962Spjd}
6108168962Spjd
6109168962Spjdstatic int
6110168962Spjdzfs_freebsd_access(ap)
6111168962Spjd	struct vop_access_args /* {
6112168962Spjd		struct vnode *a_vp;
6113192689Strasz		accmode_t a_accmode;
6114168962Spjd		struct ucred *a_cred;
6115168962Spjd		struct thread *a_td;
6116168962Spjd	} */ *ap;
6117168962Spjd{
6118212002Sjh	vnode_t *vp = ap->a_vp;
6119212002Sjh	znode_t *zp = VTOZ(vp);
6120198703Spjd	accmode_t accmode;
6121198703Spjd	int error = 0;
6122168962Spjd
6123185172Spjd	/*
6124198703Spjd	 * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND,
6125185172Spjd	 */
6126198703Spjd	accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
6127198703Spjd	if (accmode != 0)
6128198703Spjd		error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL);
6129185172Spjd
6130198703Spjd	/*
6131198703Spjd	 * VADMIN has to be handled by vaccess().
6132198703Spjd	 */
6133198703Spjd	if (error == 0) {
6134198703Spjd		accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
6135198703Spjd		if (accmode != 0) {
6136219089Spjd			error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
6137219089Spjd			    zp->z_gid, accmode, ap->a_cred, NULL);
6138198703Spjd		}
6139185172Spjd	}
6140185172Spjd
6141212002Sjh	/*
6142212002Sjh	 * For VEXEC, ensure that at least one execute bit is set for
6143212002Sjh	 * non-directories.
6144212002Sjh	 */
6145212002Sjh	if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
6146219089Spjd	    (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
6147212002Sjh		error = EACCES;
6148219089Spjd	}
6149212002Sjh
6150198703Spjd	return (error);
6151168962Spjd}
6152168962Spjd
6153168962Spjdstatic int
6154168962Spjdzfs_freebsd_lookup(ap)
6155168962Spjd	struct vop_lookup_args /* {
6156168962Spjd		struct vnode *a_dvp;
6157168962Spjd		struct vnode **a_vpp;
6158168962Spjd		struct componentname *a_cnp;
6159168962Spjd	} */ *ap;
6160168962Spjd{
6161168962Spjd	struct componentname *cnp = ap->a_cnp;
6162168962Spjd	char nm[NAME_MAX + 1];
6163168962Spjd
6164168962Spjd	ASSERT(cnp->cn_namelen < sizeof(nm));
6165168962Spjd	strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
6166168962Spjd
6167168962Spjd	return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
6168185029Spjd	    cnp->cn_cred, cnp->cn_thread, 0));
6169168962Spjd}
6170168962Spjd
6171168962Spjdstatic int
6172168962Spjdzfs_freebsd_create(ap)
6173168962Spjd	struct vop_create_args /* {
6174168962Spjd		struct vnode *a_dvp;
6175168962Spjd		struct vnode **a_vpp;
6176168962Spjd		struct componentname *a_cnp;
6177168962Spjd		struct vattr *a_vap;
6178168962Spjd	} */ *ap;
6179168962Spjd{
6180168962Spjd	struct componentname *cnp = ap->a_cnp;
6181168962Spjd	vattr_t *vap = ap->a_vap;
6182168962Spjd	int mode;
6183168962Spjd
6184168962Spjd	ASSERT(cnp->cn_flags & SAVENAME);
6185168962Spjd
6186168962Spjd	vattr_init_mask(vap);
6187168962Spjd	mode = vap->va_mode & ALLPERMS;
6188168962Spjd
6189168962Spjd	return (zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
6190185029Spjd	    ap->a_vpp, cnp->cn_cred, cnp->cn_thread));
6191168962Spjd}
6192168962Spjd
6193168962Spjdstatic int
6194168962Spjdzfs_freebsd_remove(ap)
6195168962Spjd	struct vop_remove_args /* {
6196168962Spjd		struct vnode *a_dvp;
6197168962Spjd		struct vnode *a_vp;
6198168962Spjd		struct componentname *a_cnp;
6199168962Spjd	} */ *ap;
6200168962Spjd{
6201168962Spjd
6202168962Spjd	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
6203168962Spjd
6204168962Spjd	return (zfs_remove(ap->a_dvp, ap->a_cnp->cn_nameptr,
6205185029Spjd	    ap->a_cnp->cn_cred, NULL, 0));
6206168962Spjd}
6207168962Spjd
6208168962Spjdstatic int
6209168962Spjdzfs_freebsd_mkdir(ap)
6210168962Spjd	struct vop_mkdir_args /* {
6211168962Spjd		struct vnode *a_dvp;
6212168962Spjd		struct vnode **a_vpp;
6213168962Spjd		struct componentname *a_cnp;
6214168962Spjd		struct vattr *a_vap;
6215168962Spjd	} */ *ap;
6216168962Spjd{
6217168962Spjd	vattr_t *vap = ap->a_vap;
6218168962Spjd
6219168962Spjd	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
6220168962Spjd
6221168962Spjd	vattr_init_mask(vap);
6222168962Spjd
6223168962Spjd	return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
6224185029Spjd	    ap->a_cnp->cn_cred, NULL, 0, NULL));
6225168962Spjd}
6226168962Spjd
6227168962Spjdstatic int
6228168962Spjdzfs_freebsd_rmdir(ap)
6229168962Spjd	struct vop_rmdir_args /* {
6230168962Spjd		struct vnode *a_dvp;
6231168962Spjd		struct vnode *a_vp;
6232168962Spjd		struct componentname *a_cnp;
6233168962Spjd	} */ *ap;
6234168962Spjd{
6235168962Spjd	struct componentname *cnp = ap->a_cnp;
6236168962Spjd
6237168962Spjd	ASSERT(cnp->cn_flags & SAVENAME);
6238168962Spjd
6239185029Spjd	return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred, NULL, 0));
6240168962Spjd}
6241168962Spjd
6242168962Spjdstatic int
6243168962Spjdzfs_freebsd_readdir(ap)
6244168962Spjd	struct vop_readdir_args /* {
6245168962Spjd		struct vnode *a_vp;
6246168962Spjd		struct uio *a_uio;
6247168962Spjd		struct ucred *a_cred;
6248168962Spjd		int *a_eofflag;
6249168962Spjd		int *a_ncookies;
6250168962Spjd		u_long **a_cookies;
6251168962Spjd	} */ *ap;
6252168962Spjd{
6253168962Spjd
6254168962Spjd	return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
6255168962Spjd	    ap->a_ncookies, ap->a_cookies));
6256168962Spjd}
6257168962Spjd
6258168962Spjdstatic int
6259168962Spjdzfs_freebsd_fsync(ap)
6260168962Spjd	struct vop_fsync_args /* {
6261168962Spjd		struct vnode *a_vp;
6262168962Spjd		int a_waitfor;
6263168962Spjd		struct thread *a_td;
6264168962Spjd	} */ *ap;
6265168962Spjd{
6266168962Spjd
6267168962Spjd	vop_stdfsync(ap);
6268185029Spjd	return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL));
6269168962Spjd}
6270168962Spjd
6271168962Spjdstatic int
6272168962Spjdzfs_freebsd_getattr(ap)
6273168962Spjd	struct vop_getattr_args /* {
6274168962Spjd		struct vnode *a_vp;
6275168962Spjd		struct vattr *a_vap;
6276168962Spjd		struct ucred *a_cred;
6277168962Spjd	} */ *ap;
6278168962Spjd{
6279185029Spjd	vattr_t *vap = ap->a_vap;
6280185029Spjd	xvattr_t xvap;
6281185029Spjd	u_long fflags = 0;
6282185029Spjd	int error;
6283168962Spjd
6284185029Spjd	xva_init(&xvap);
6285185029Spjd	xvap.xva_vattr = *vap;
6286185029Spjd	xvap.xva_vattr.va_mask |= AT_XVATTR;
6287185029Spjd
6288185029Spjd	/* Convert chflags into ZFS-type flags. */
6289185029Spjd	/* XXX: what about SF_SETTABLE?. */
6290185029Spjd	XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
6291185029Spjd	XVA_SET_REQ(&xvap, XAT_APPENDONLY);
6292185029Spjd	XVA_SET_REQ(&xvap, XAT_NOUNLINK);
6293185029Spjd	XVA_SET_REQ(&xvap, XAT_NODUMP);
6294254627Sken	XVA_SET_REQ(&xvap, XAT_READONLY);
6295254627Sken	XVA_SET_REQ(&xvap, XAT_ARCHIVE);
6296254627Sken	XVA_SET_REQ(&xvap, XAT_SYSTEM);
6297254627Sken	XVA_SET_REQ(&xvap, XAT_HIDDEN);
6298254627Sken	XVA_SET_REQ(&xvap, XAT_REPARSE);
6299254627Sken	XVA_SET_REQ(&xvap, XAT_OFFLINE);
6300254627Sken	XVA_SET_REQ(&xvap, XAT_SPARSE);
6301254627Sken
6302185029Spjd	error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
6303185029Spjd	if (error != 0)
6304185029Spjd		return (error);
6305185029Spjd
6306185029Spjd	/* Convert ZFS xattr into chflags. */
6307185029Spjd#define	FLAG_CHECK(fflag, xflag, xfield)	do {			\
6308185029Spjd	if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0)		\
6309185029Spjd		fflags |= (fflag);					\
6310185029Spjd} while (0)
6311185029Spjd	FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
6312185029Spjd	    xvap.xva_xoptattrs.xoa_immutable);
6313185029Spjd	FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
6314185029Spjd	    xvap.xva_xoptattrs.xoa_appendonly);
6315185029Spjd	FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
6316185029Spjd	    xvap.xva_xoptattrs.xoa_nounlink);
6317254627Sken	FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE,
6318254627Sken	    xvap.xva_xoptattrs.xoa_archive);
6319185029Spjd	FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
6320185029Spjd	    xvap.xva_xoptattrs.xoa_nodump);
6321254627Sken	FLAG_CHECK(UF_READONLY, XAT_READONLY,
6322254627Sken	    xvap.xva_xoptattrs.xoa_readonly);
6323254627Sken	FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM,
6324254627Sken	    xvap.xva_xoptattrs.xoa_system);
6325254627Sken	FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN,
6326254627Sken	    xvap.xva_xoptattrs.xoa_hidden);
6327254627Sken	FLAG_CHECK(UF_REPARSE, XAT_REPARSE,
6328254627Sken	    xvap.xva_xoptattrs.xoa_reparse);
6329254627Sken	FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE,
6330254627Sken	    xvap.xva_xoptattrs.xoa_offline);
6331254627Sken	FLAG_CHECK(UF_SPARSE, XAT_SPARSE,
6332254627Sken	    xvap.xva_xoptattrs.xoa_sparse);
6333254627Sken
6334185029Spjd#undef	FLAG_CHECK
6335185029Spjd	*vap = xvap.xva_vattr;
6336185029Spjd	vap->va_flags = fflags;
6337185029Spjd	return (0);
6338168962Spjd}
6339168962Spjd
6340168962Spjdstatic int
6341168962Spjdzfs_freebsd_setattr(ap)
6342168962Spjd	struct vop_setattr_args /* {
6343168962Spjd		struct vnode *a_vp;
6344168962Spjd		struct vattr *a_vap;
6345168962Spjd		struct ucred *a_cred;
6346168962Spjd	} */ *ap;
6347168962Spjd{
6348185172Spjd	vnode_t *vp = ap->a_vp;
6349168962Spjd	vattr_t *vap = ap->a_vap;
6350185172Spjd	cred_t *cred = ap->a_cred;
6351185029Spjd	xvattr_t xvap;
6352185029Spjd	u_long fflags;
6353185029Spjd	uint64_t zflags;
6354168962Spjd
6355168962Spjd	vattr_init_mask(vap);
6356170044Spjd	vap->va_mask &= ~AT_NOSET;
6357168962Spjd
6358185029Spjd	xva_init(&xvap);
6359185029Spjd	xvap.xva_vattr = *vap;
6360185029Spjd
6361219089Spjd	zflags = VTOZ(vp)->z_pflags;
6362185172Spjd
6363185029Spjd	if (vap->va_flags != VNOVAL) {
6364197683Sdelphij		zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
6365185172Spjd		int error;
6366185172Spjd
6367197683Sdelphij		if (zfsvfs->z_use_fuids == B_FALSE)
6368197683Sdelphij			return (EOPNOTSUPP);
6369197683Sdelphij
6370185029Spjd		fflags = vap->va_flags;
6371254627Sken		/*
6372254627Sken		 * XXX KDM
6373254627Sken		 * We need to figure out whether it makes sense to allow
6374254627Sken		 * UF_REPARSE through, since we don't really have other
6375254627Sken		 * facilities to handle reparse points and zfs_setattr()
6376254627Sken		 * doesn't currently allow setting that attribute anyway.
6377254627Sken		 */
6378254627Sken		if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE|
6379254627Sken		     UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE|
6380254627Sken		     UF_OFFLINE|UF_SPARSE)) != 0)
6381185029Spjd			return (EOPNOTSUPP);
6382185172Spjd		/*
6383185172Spjd		 * Unprivileged processes are not permitted to unset system
6384185172Spjd		 * flags, or modify flags if any system flags are set.
6385185172Spjd		 * Privileged non-jail processes may not modify system flags
6386185172Spjd		 * if securelevel > 0 and any existing system flags are set.
6387185172Spjd		 * Privileged jail processes behave like privileged non-jail
6388185172Spjd		 * processes if the security.jail.chflags_allowed sysctl is
6389185172Spjd		 * is non-zero; otherwise, they behave like unprivileged
6390185172Spjd		 * processes.
6391185172Spjd		 */
6392197861Spjd		if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
6393197861Spjd		    priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) {
6394185172Spjd			if (zflags &
6395185172Spjd			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
6396185172Spjd				error = securelevel_gt(cred, 0);
6397197861Spjd				if (error != 0)
6398185172Spjd					return (error);
6399185172Spjd			}
6400185172Spjd		} else {
6401197861Spjd			/*
6402197861Spjd			 * Callers may only modify the file flags on objects they
6403197861Spjd			 * have VADMIN rights for.
6404197861Spjd			 */
6405197861Spjd			if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0)
6406197861Spjd				return (error);
6407185172Spjd			if (zflags &
6408185172Spjd			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
6409185172Spjd				return (EPERM);
6410185172Spjd			}
6411185172Spjd			if (fflags &
6412185172Spjd			    (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
6413185172Spjd				return (EPERM);
6414185172Spjd			}
6415185172Spjd		}
6416185029Spjd
6417185029Spjd#define	FLAG_CHANGE(fflag, zflag, xflag, xfield)	do {		\
6418185029Spjd	if (((fflags & (fflag)) && !(zflags & (zflag))) ||		\
6419185029Spjd	    ((zflags & (zflag)) && !(fflags & (fflag)))) {		\
6420185029Spjd		XVA_SET_REQ(&xvap, (xflag));				\
6421185029Spjd		(xfield) = ((fflags & (fflag)) != 0);			\
6422185029Spjd	}								\
6423185029Spjd} while (0)
6424185029Spjd		/* Convert chflags into ZFS-type flags. */
6425185029Spjd		/* XXX: what about SF_SETTABLE?. */
6426185029Spjd		FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
6427185029Spjd		    xvap.xva_xoptattrs.xoa_immutable);
6428185029Spjd		FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
6429185029Spjd		    xvap.xva_xoptattrs.xoa_appendonly);
6430185029Spjd		FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
6431185029Spjd		    xvap.xva_xoptattrs.xoa_nounlink);
6432254627Sken		FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE,
6433254627Sken		    xvap.xva_xoptattrs.xoa_archive);
6434185029Spjd		FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
6435185172Spjd		    xvap.xva_xoptattrs.xoa_nodump);
6436254627Sken		FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY,
6437254627Sken		    xvap.xva_xoptattrs.xoa_readonly);
6438254627Sken		FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM,
6439254627Sken		    xvap.xva_xoptattrs.xoa_system);
6440254627Sken		FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN,
6441254627Sken		    xvap.xva_xoptattrs.xoa_hidden);
6442254627Sken		FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE,
6443254627Sken		    xvap.xva_xoptattrs.xoa_hidden);
6444254627Sken		FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE,
6445254627Sken		    xvap.xva_xoptattrs.xoa_offline);
6446254627Sken		FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE,
6447254627Sken		    xvap.xva_xoptattrs.xoa_sparse);
6448185029Spjd#undef	FLAG_CHANGE
6449185029Spjd	}
6450185172Spjd	return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL));
6451168962Spjd}
6452168962Spjd
6453168962Spjdstatic int
6454168962Spjdzfs_freebsd_rename(ap)
6455168962Spjd	struct vop_rename_args  /* {
6456168962Spjd		struct vnode *a_fdvp;
6457168962Spjd		struct vnode *a_fvp;
6458168962Spjd		struct componentname *a_fcnp;
6459168962Spjd		struct vnode *a_tdvp;
6460168962Spjd		struct vnode *a_tvp;
6461168962Spjd		struct componentname *a_tcnp;
6462168962Spjd	} */ *ap;
6463168962Spjd{
6464168962Spjd	vnode_t *fdvp = ap->a_fdvp;
6465168962Spjd	vnode_t *fvp = ap->a_fvp;
6466168962Spjd	vnode_t *tdvp = ap->a_tdvp;
6467168962Spjd	vnode_t *tvp = ap->a_tvp;
6468168962Spjd	int error;
6469168962Spjd
6470192237Skmacy	ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
6471192237Skmacy	ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
6472168962Spjd
6473255748Sdavide	/*
6474255748Sdavide	 * Check for cross-device rename.
6475255748Sdavide	 */
6476255748Sdavide	if ((fdvp->v_mount != tdvp->v_mount) ||
6477255748Sdavide	    (tvp && (fdvp->v_mount != tvp->v_mount)))
6478255748Sdavide		error = EXDEV;
6479255748Sdavide	else
6480254982Sdelphij		error = zfs_rename(fdvp, ap->a_fcnp->cn_nameptr, tdvp,
6481254982Sdelphij		    ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred, NULL, 0);
6482168962Spjd	if (tdvp == tvp)
6483168962Spjd		VN_RELE(tdvp);
6484168962Spjd	else
6485168962Spjd		VN_URELE(tdvp);
6486168962Spjd	if (tvp)
6487168962Spjd		VN_URELE(tvp);
6488168962Spjd	VN_RELE(fdvp);
6489168962Spjd	VN_RELE(fvp);
6490168962Spjd
6491168962Spjd	return (error);
6492168962Spjd}
6493168962Spjd
6494168962Spjdstatic int
6495168962Spjdzfs_freebsd_symlink(ap)
6496168962Spjd	struct vop_symlink_args /* {
6497168962Spjd		struct vnode *a_dvp;
6498168962Spjd		struct vnode **a_vpp;
6499168962Spjd		struct componentname *a_cnp;
6500168962Spjd		struct vattr *a_vap;
6501168962Spjd		char *a_target;
6502168962Spjd	} */ *ap;
6503168962Spjd{
6504168962Spjd	struct componentname *cnp = ap->a_cnp;
6505168962Spjd	vattr_t *vap = ap->a_vap;
6506168962Spjd
6507168962Spjd	ASSERT(cnp->cn_flags & SAVENAME);
6508168962Spjd
6509168962Spjd	vap->va_type = VLNK;	/* FreeBSD: Syscall only sets va_mode. */
6510168962Spjd	vattr_init_mask(vap);
6511168962Spjd
6512168962Spjd	return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
6513168962Spjd	    ap->a_target, cnp->cn_cred, cnp->cn_thread));
6514168962Spjd}
6515168962Spjd
6516168962Spjdstatic int
6517168962Spjdzfs_freebsd_readlink(ap)
6518168962Spjd	struct vop_readlink_args /* {
6519168962Spjd		struct vnode *a_vp;
6520168962Spjd		struct uio *a_uio;
6521168962Spjd		struct ucred *a_cred;
6522168962Spjd	} */ *ap;
6523168962Spjd{
6524168962Spjd
6525185029Spjd	return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
6526168962Spjd}
6527168962Spjd
6528168962Spjdstatic int
6529168962Spjdzfs_freebsd_link(ap)
6530168962Spjd	struct vop_link_args /* {
6531168962Spjd		struct vnode *a_tdvp;
6532168962Spjd		struct vnode *a_vp;
6533168962Spjd		struct componentname *a_cnp;
6534168962Spjd	} */ *ap;
6535168962Spjd{
6536168962Spjd	struct componentname *cnp = ap->a_cnp;
6537254982Sdelphij	vnode_t *vp = ap->a_vp;
6538254982Sdelphij	vnode_t *tdvp = ap->a_tdvp;
6539168962Spjd
6540254982Sdelphij	if (tdvp->v_mount != vp->v_mount)
6541254982Sdelphij		return (EXDEV);
6542254982Sdelphij
6543168962Spjd	ASSERT(cnp->cn_flags & SAVENAME);
6544168962Spjd
6545254982Sdelphij	return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
6546168962Spjd}
6547168962Spjd
6548168962Spjdstatic int
6549168962Spjdzfs_freebsd_inactive(ap)
6550169170Spjd	struct vop_inactive_args /* {
6551169170Spjd		struct vnode *a_vp;
6552169170Spjd		struct thread *a_td;
6553169170Spjd	} */ *ap;
6554168962Spjd{
6555168962Spjd	vnode_t *vp = ap->a_vp;
6556168962Spjd
6557185029Spjd	zfs_inactive(vp, ap->a_td->td_ucred, NULL);
6558168962Spjd	return (0);
6559168962Spjd}
6560168962Spjd
6561168962Spjdstatic int
6562168962Spjdzfs_freebsd_reclaim(ap)
6563168962Spjd	struct vop_reclaim_args /* {
6564168962Spjd		struct vnode *a_vp;
6565168962Spjd		struct thread *a_td;
6566168962Spjd	} */ *ap;
6567168962Spjd{
6568169170Spjd	vnode_t	*vp = ap->a_vp;
6569168962Spjd	znode_t	*zp = VTOZ(vp);
6570197133Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
6571168962Spjd
6572169025Spjd	ASSERT(zp != NULL);
6573169025Spjd
6574243520Savg	/* Destroy the vm object and flush associated pages. */
6575243520Savg	vnode_destroy_vobject(vp);
6576243520Savg
6577168962Spjd	/*
6578243520Savg	 * z_teardown_inactive_lock protects from a race with
6579243520Savg	 * zfs_znode_dmu_fini in zfsvfs_teardown during
6580243520Savg	 * force unmount.
6581168962Spjd	 */
6582243520Savg	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
6583243520Savg	if (zp->z_sa_hdl == NULL)
6584196301Spjd		zfs_znode_free(zp);
6585243520Savg	else
6586243520Savg		zfs_zinactive(zp);
6587243520Savg	rw_exit(&zfsvfs->z_teardown_inactive_lock);
6588185029Spjd
6589168962Spjd	vp->v_data = NULL;
6590168962Spjd	return (0);
6591168962Spjd}
6592168962Spjd
6593168962Spjdstatic int
6594168962Spjdzfs_freebsd_fid(ap)
6595168962Spjd	struct vop_fid_args /* {
6596168962Spjd		struct vnode *a_vp;
6597168962Spjd		struct fid *a_fid;
6598168962Spjd	} */ *ap;
6599168962Spjd{
6600168962Spjd
6601185029Spjd	return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
6602168962Spjd}
6603168962Spjd
6604168962Spjdstatic int
6605168962Spjdzfs_freebsd_pathconf(ap)
6606168962Spjd	struct vop_pathconf_args /* {
6607168962Spjd		struct vnode *a_vp;
6608168962Spjd		int a_name;
6609168962Spjd		register_t *a_retval;
6610168962Spjd	} */ *ap;
6611168962Spjd{
6612168962Spjd	ulong_t val;
6613168962Spjd	int error;
6614168962Spjd
6615185029Spjd	error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL);
6616168962Spjd	if (error == 0)
6617168962Spjd		*ap->a_retval = val;
6618168962Spjd	else if (error == EOPNOTSUPP)
6619168962Spjd		error = vop_stdpathconf(ap);
6620168962Spjd	return (error);
6621168962Spjd}
6622168962Spjd
6623196949Straszstatic int
6624196949Straszzfs_freebsd_fifo_pathconf(ap)
6625196949Strasz	struct vop_pathconf_args /* {
6626196949Strasz		struct vnode *a_vp;
6627196949Strasz		int a_name;
6628196949Strasz		register_t *a_retval;
6629196949Strasz	} */ *ap;
6630196949Strasz{
6631196949Strasz
6632196949Strasz	switch (ap->a_name) {
6633196949Strasz	case _PC_ACL_EXTENDED:
6634196949Strasz	case _PC_ACL_NFS4:
6635196949Strasz	case _PC_ACL_PATH_MAX:
6636196949Strasz	case _PC_MAC_PRESENT:
6637196949Strasz		return (zfs_freebsd_pathconf(ap));
6638196949Strasz	default:
6639196949Strasz		return (fifo_specops.vop_pathconf(ap));
6640196949Strasz	}
6641196949Strasz}
6642196949Strasz
6643185029Spjd/*
6644185029Spjd * FreeBSD's extended attributes namespace defines file name prefix for ZFS'
6645185029Spjd * extended attribute name:
6646185029Spjd *
6647185029Spjd *	NAMESPACE	PREFIX
6648185029Spjd *	system		freebsd:system:
6649185029Spjd *	user		(none, can be used to access ZFS fsattr(5) attributes
6650185029Spjd *			created on Solaris)
6651185029Spjd */
6652185029Spjdstatic int
6653185029Spjdzfs_create_attrname(int attrnamespace, const char *name, char *attrname,
6654185029Spjd    size_t size)
6655185029Spjd{
6656185029Spjd	const char *namespace, *prefix, *suffix;
6657185029Spjd
6658185029Spjd	/* We don't allow '/' character in attribute name. */
6659185029Spjd	if (strchr(name, '/') != NULL)
6660185029Spjd		return (EINVAL);
6661185029Spjd	/* We don't allow attribute names that start with "freebsd:" string. */
6662185029Spjd	if (strncmp(name, "freebsd:", 8) == 0)
6663185029Spjd		return (EINVAL);
6664185029Spjd
6665185029Spjd	bzero(attrname, size);
6666185029Spjd
6667185029Spjd	switch (attrnamespace) {
6668185029Spjd	case EXTATTR_NAMESPACE_USER:
6669185029Spjd#if 0
6670185029Spjd		prefix = "freebsd:";
6671185029Spjd		namespace = EXTATTR_NAMESPACE_USER_STRING;
6672185029Spjd		suffix = ":";
6673185029Spjd#else
6674185029Spjd		/*
6675185029Spjd		 * This is the default namespace by which we can access all
6676185029Spjd		 * attributes created on Solaris.
6677185029Spjd		 */
6678185029Spjd		prefix = namespace = suffix = "";
6679185029Spjd#endif
6680185029Spjd		break;
6681185029Spjd	case EXTATTR_NAMESPACE_SYSTEM:
6682185029Spjd		prefix = "freebsd:";
6683185029Spjd		namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
6684185029Spjd		suffix = ":";
6685185029Spjd		break;
6686185029Spjd	case EXTATTR_NAMESPACE_EMPTY:
6687185029Spjd	default:
6688185029Spjd		return (EINVAL);
6689185029Spjd	}
6690185029Spjd	if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
6691185029Spjd	    name) >= size) {
6692185029Spjd		return (ENAMETOOLONG);
6693185029Spjd	}
6694185029Spjd	return (0);
6695185029Spjd}
6696185029Spjd
6697185029Spjd/*
6698185029Spjd * Vnode operating to retrieve a named extended attribute.
6699185029Spjd */
6700185029Spjdstatic int
6701185029Spjdzfs_getextattr(struct vop_getextattr_args *ap)
6702185029Spjd/*
6703185029Spjdvop_getextattr {
6704185029Spjd	IN struct vnode *a_vp;
6705185029Spjd	IN int a_attrnamespace;
6706185029Spjd	IN const char *a_name;
6707185029Spjd	INOUT struct uio *a_uio;
6708185029Spjd	OUT size_t *a_size;
6709185029Spjd	IN struct ucred *a_cred;
6710185029Spjd	IN struct thread *a_td;
6711185029Spjd};
6712185029Spjd*/
6713185029Spjd{
6714185029Spjd	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
6715185029Spjd	struct thread *td = ap->a_td;
6716185029Spjd	struct nameidata nd;
6717185029Spjd	char attrname[255];
6718185029Spjd	struct vattr va;
6719185029Spjd	vnode_t *xvp = NULL, *vp;
6720185029Spjd	int error, flags;
6721185029Spjd
6722195785Strasz	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
6723195785Strasz	    ap->a_cred, ap->a_td, VREAD);
6724195785Strasz	if (error != 0)
6725195785Strasz		return (error);
6726195785Strasz
6727185029Spjd	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
6728185029Spjd	    sizeof(attrname));
6729185029Spjd	if (error != 0)
6730185029Spjd		return (error);
6731185029Spjd
6732185029Spjd	ZFS_ENTER(zfsvfs);
6733185029Spjd
6734185029Spjd	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
6735185029Spjd	    LOOKUP_XATTR);
6736185029Spjd	if (error != 0) {
6737185029Spjd		ZFS_EXIT(zfsvfs);
6738185029Spjd		return (error);
6739185029Spjd	}
6740185029Spjd
6741185029Spjd	flags = FREAD;
6742241896Skib	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
6743185029Spjd	    xvp, td);
6744194586Skib	error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL);
6745185029Spjd	vp = nd.ni_vp;
6746185029Spjd	NDFREE(&nd, NDF_ONLY_PNBUF);
6747185029Spjd	if (error != 0) {
6748196303Spjd		ZFS_EXIT(zfsvfs);
6749195785Strasz		if (error == ENOENT)
6750195785Strasz			error = ENOATTR;
6751185029Spjd		return (error);
6752185029Spjd	}
6753185029Spjd
6754185029Spjd	if (ap->a_size != NULL) {
6755185029Spjd		error = VOP_GETATTR(vp, &va, ap->a_cred);
6756185029Spjd		if (error == 0)
6757185029Spjd			*ap->a_size = (size_t)va.va_size;
6758185029Spjd	} else if (ap->a_uio != NULL)
6759224605Smm		error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred);
6760185029Spjd
6761185029Spjd	VOP_UNLOCK(vp, 0);
6762185029Spjd	vn_close(vp, flags, ap->a_cred, td);
6763185029Spjd	ZFS_EXIT(zfsvfs);
6764185029Spjd
6765185029Spjd	return (error);
6766185029Spjd}
6767185029Spjd
6768185029Spjd/*
6769185029Spjd * Vnode operation to remove a named attribute.
6770185029Spjd */
6771185029Spjdint
6772185029Spjdzfs_deleteextattr(struct vop_deleteextattr_args *ap)
6773185029Spjd/*
6774185029Spjdvop_deleteextattr {
6775185029Spjd	IN struct vnode *a_vp;
6776185029Spjd	IN int a_attrnamespace;
6777185029Spjd	IN const char *a_name;
6778185029Spjd	IN struct ucred *a_cred;
6779185029Spjd	IN struct thread *a_td;
6780185029Spjd};
6781185029Spjd*/
6782185029Spjd{
6783185029Spjd	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
6784185029Spjd	struct thread *td = ap->a_td;
6785185029Spjd	struct nameidata nd;
6786185029Spjd	char attrname[255];
6787185029Spjd	struct vattr va;
6788185029Spjd	vnode_t *xvp = NULL, *vp;
6789185029Spjd	int error, flags;
6790185029Spjd
6791195785Strasz	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
6792195785Strasz	    ap->a_cred, ap->a_td, VWRITE);
6793195785Strasz	if (error != 0)
6794195785Strasz		return (error);
6795195785Strasz
6796185029Spjd	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
6797185029Spjd	    sizeof(attrname));
6798185029Spjd	if (error != 0)
6799185029Spjd		return (error);
6800185029Spjd
6801185029Spjd	ZFS_ENTER(zfsvfs);
6802185029Spjd
6803185029Spjd	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
6804185029Spjd	    LOOKUP_XATTR);
6805185029Spjd	if (error != 0) {
6806185029Spjd		ZFS_EXIT(zfsvfs);
6807185029Spjd		return (error);
6808185029Spjd	}
6809185029Spjd
6810241896Skib	NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
6811185029Spjd	    UIO_SYSSPACE, attrname, xvp, td);
6812185029Spjd	error = namei(&nd);
6813185029Spjd	vp = nd.ni_vp;
6814185029Spjd	if (error != 0) {
6815196303Spjd		ZFS_EXIT(zfsvfs);
6816260706Savg		NDFREE(&nd, NDF_ONLY_PNBUF);
6817195785Strasz		if (error == ENOENT)
6818195785Strasz			error = ENOATTR;
6819185029Spjd		return (error);
6820185029Spjd	}
6821260706Savg
6822185029Spjd	error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
6823260706Savg	NDFREE(&nd, NDF_ONLY_PNBUF);
6824185029Spjd
6825185029Spjd	vput(nd.ni_dvp);
6826185029Spjd	if (vp == nd.ni_dvp)
6827185029Spjd		vrele(vp);
6828185029Spjd	else
6829185029Spjd		vput(vp);
6830185029Spjd	ZFS_EXIT(zfsvfs);
6831185029Spjd
6832185029Spjd	return (error);
6833185029Spjd}
6834185029Spjd
6835185029Spjd/*
6836185029Spjd * Vnode operation to set a named attribute.
6837185029Spjd */
6838185029Spjdstatic int
6839185029Spjdzfs_setextattr(struct vop_setextattr_args *ap)
6840185029Spjd/*
6841185029Spjdvop_setextattr {
6842185029Spjd	IN struct vnode *a_vp;
6843185029Spjd	IN int a_attrnamespace;
6844185029Spjd	IN const char *a_name;
6845185029Spjd	INOUT struct uio *a_uio;
6846185029Spjd	IN struct ucred *a_cred;
6847185029Spjd	IN struct thread *a_td;
6848185029Spjd};
6849185029Spjd*/
6850185029Spjd{
6851185029Spjd	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
6852185029Spjd	struct thread *td = ap->a_td;
6853185029Spjd	struct nameidata nd;
6854185029Spjd	char attrname[255];
6855185029Spjd	struct vattr va;
6856185029Spjd	vnode_t *xvp = NULL, *vp;
6857185029Spjd	int error, flags;
6858185029Spjd
6859195785Strasz	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
6860195785Strasz	    ap->a_cred, ap->a_td, VWRITE);
6861195785Strasz	if (error != 0)
6862195785Strasz		return (error);
6863195785Strasz
6864185029Spjd	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
6865185029Spjd	    sizeof(attrname));
6866185029Spjd	if (error != 0)
6867185029Spjd		return (error);
6868185029Spjd
6869185029Spjd	ZFS_ENTER(zfsvfs);
6870185029Spjd
6871185029Spjd	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
6872195785Strasz	    LOOKUP_XATTR | CREATE_XATTR_DIR);
6873185029Spjd	if (error != 0) {
6874185029Spjd		ZFS_EXIT(zfsvfs);
6875185029Spjd		return (error);
6876185029Spjd	}
6877185029Spjd
6878185029Spjd	flags = FFLAGS(O_WRONLY | O_CREAT);
6879241896Skib	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
6880185029Spjd	    xvp, td);
6881194586Skib	error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL);
6882185029Spjd	vp = nd.ni_vp;
6883185029Spjd	NDFREE(&nd, NDF_ONLY_PNBUF);
6884185029Spjd	if (error != 0) {
6885185029Spjd		ZFS_EXIT(zfsvfs);
6886185029Spjd		return (error);
6887185029Spjd	}
6888185029Spjd
6889185029Spjd	VATTR_NULL(&va);
6890185029Spjd	va.va_size = 0;
6891185029Spjd	error = VOP_SETATTR(vp, &va, ap->a_cred);
6892185029Spjd	if (error == 0)
6893268420Smav		VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred);
6894185029Spjd
6895185029Spjd	VOP_UNLOCK(vp, 0);
6896185029Spjd	vn_close(vp, flags, ap->a_cred, td);
6897185029Spjd	ZFS_EXIT(zfsvfs);
6898185029Spjd
6899185029Spjd	return (error);
6900185029Spjd}
6901185029Spjd
6902185029Spjd/*
6903185029Spjd * Vnode operation to retrieve extended attributes on a vnode.
6904185029Spjd */
6905185029Spjdstatic int
6906185029Spjdzfs_listextattr(struct vop_listextattr_args *ap)
6907185029Spjd/*
6908185029Spjdvop_listextattr {
6909185029Spjd	IN struct vnode *a_vp;
6910185029Spjd	IN int a_attrnamespace;
6911185029Spjd	INOUT struct uio *a_uio;
6912185029Spjd	OUT size_t *a_size;
6913185029Spjd	IN struct ucred *a_cred;
6914185029Spjd	IN struct thread *a_td;
6915185029Spjd};
6916185029Spjd*/
6917185029Spjd{
6918185029Spjd	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
6919185029Spjd	struct thread *td = ap->a_td;
6920185029Spjd	struct nameidata nd;
6921185029Spjd	char attrprefix[16];
6922185029Spjd	u_char dirbuf[sizeof(struct dirent)];
6923185029Spjd	struct dirent *dp;
6924185029Spjd	struct iovec aiov;
6925185029Spjd	struct uio auio, *uio = ap->a_uio;
6926185029Spjd	size_t *sizep = ap->a_size;
6927185029Spjd	size_t plen;
6928185029Spjd	vnode_t *xvp = NULL, *vp;
6929185029Spjd	int done, error, eof, pos;
6930185029Spjd
6931195785Strasz	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
6932195785Strasz	    ap->a_cred, ap->a_td, VREAD);
6933196303Spjd	if (error != 0)
6934195785Strasz		return (error);
6935195785Strasz
6936185029Spjd	error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
6937185029Spjd	    sizeof(attrprefix));
6938185029Spjd	if (error != 0)
6939185029Spjd		return (error);
6940185029Spjd	plen = strlen(attrprefix);
6941185029Spjd
6942185029Spjd	ZFS_ENTER(zfsvfs);
6943185029Spjd
6944195822Strasz	if (sizep != NULL)
6945195822Strasz		*sizep = 0;
6946195822Strasz
6947185029Spjd	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
6948185029Spjd	    LOOKUP_XATTR);
6949185029Spjd	if (error != 0) {
6950196303Spjd		ZFS_EXIT(zfsvfs);
6951195785Strasz		/*
6952195785Strasz		 * ENOATTR means that the EA directory does not yet exist,
6953195785Strasz		 * i.e. there are no extended attributes there.
6954195785Strasz		 */
6955195785Strasz		if (error == ENOATTR)
6956195785Strasz			error = 0;
6957185029Spjd		return (error);
6958185029Spjd	}
6959185029Spjd
6960241896Skib	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
6961188588Sjhb	    UIO_SYSSPACE, ".", xvp, td);
6962185029Spjd	error = namei(&nd);
6963185029Spjd	vp = nd.ni_vp;
6964185029Spjd	NDFREE(&nd, NDF_ONLY_PNBUF);
6965185029Spjd	if (error != 0) {
6966185029Spjd		ZFS_EXIT(zfsvfs);
6967185029Spjd		return (error);
6968185029Spjd	}
6969185029Spjd
6970185029Spjd	auio.uio_iov = &aiov;
6971185029Spjd	auio.uio_iovcnt = 1;
6972185029Spjd	auio.uio_segflg = UIO_SYSSPACE;
6973185029Spjd	auio.uio_td = td;
6974185029Spjd	auio.uio_rw = UIO_READ;
6975185029Spjd	auio.uio_offset = 0;
6976185029Spjd
6977185029Spjd	do {
6978185029Spjd		u_char nlen;
6979185029Spjd
6980185029Spjd		aiov.iov_base = (void *)dirbuf;
6981185029Spjd		aiov.iov_len = sizeof(dirbuf);
6982185029Spjd		auio.uio_resid = sizeof(dirbuf);
6983185029Spjd		error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
6984185029Spjd		done = sizeof(dirbuf) - auio.uio_resid;
6985185029Spjd		if (error != 0)
6986185029Spjd			break;
6987185029Spjd		for (pos = 0; pos < done;) {
6988185029Spjd			dp = (struct dirent *)(dirbuf + pos);
6989185029Spjd			pos += dp->d_reclen;
6990185029Spjd			/*
6991185029Spjd			 * XXX: Temporarily we also accept DT_UNKNOWN, as this
6992185029Spjd			 * is what we get when attribute was created on Solaris.
6993185029Spjd			 */
6994185029Spjd			if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
6995185029Spjd				continue;
6996185029Spjd			if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0)
6997185029Spjd				continue;
6998185029Spjd			else if (strncmp(dp->d_name, attrprefix, plen) != 0)
6999185029Spjd				continue;
7000185029Spjd			nlen = dp->d_namlen - plen;
7001185029Spjd			if (sizep != NULL)
7002185029Spjd				*sizep += 1 + nlen;
7003185029Spjd			else if (uio != NULL) {
7004185029Spjd				/*
7005185029Spjd				 * Format of extattr name entry is one byte for
7006185029Spjd				 * length and the rest for name.
7007185029Spjd				 */
7008185029Spjd				error = uiomove(&nlen, 1, uio->uio_rw, uio);
7009185029Spjd				if (error == 0) {
7010185029Spjd					error = uiomove(dp->d_name + plen, nlen,
7011185029Spjd					    uio->uio_rw, uio);
7012185029Spjd				}
7013185029Spjd				if (error != 0)
7014185029Spjd					break;
7015185029Spjd			}
7016185029Spjd		}
7017185029Spjd	} while (!eof && error == 0);
7018185029Spjd
7019185029Spjd	vput(vp);
7020185029Spjd	ZFS_EXIT(zfsvfs);
7021185029Spjd
7022185029Spjd	return (error);
7023185029Spjd}
7024185029Spjd
7025192800Straszint
7026192800Straszzfs_freebsd_getacl(ap)
7027192800Strasz	struct vop_getacl_args /* {
7028192800Strasz		struct vnode *vp;
7029192800Strasz		acl_type_t type;
7030192800Strasz		struct acl *aclp;
7031192800Strasz		struct ucred *cred;
7032192800Strasz		struct thread *td;
7033192800Strasz	} */ *ap;
7034192800Strasz{
7035192800Strasz	int		error;
7036192800Strasz	vsecattr_t      vsecattr;
7037192800Strasz
7038192800Strasz	if (ap->a_type != ACL_TYPE_NFS4)
7039197435Strasz		return (EINVAL);
7040192800Strasz
7041192800Strasz	vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
7042192800Strasz	if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL))
7043192800Strasz		return (error);
7044192800Strasz
7045192800Strasz	error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt);
7046196303Spjd	if (vsecattr.vsa_aclentp != NULL)
7047196303Spjd		kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);
7048192800Strasz
7049196303Spjd	return (error);
7050192800Strasz}
7051192800Strasz
7052192800Straszint
7053192800Straszzfs_freebsd_setacl(ap)
7054192800Strasz	struct vop_setacl_args /* {
7055192800Strasz		struct vnode *vp;
7056192800Strasz		acl_type_t type;
7057192800Strasz		struct acl *aclp;
7058192800Strasz		struct ucred *cred;
7059192800Strasz		struct thread *td;
7060192800Strasz	} */ *ap;
7061192800Strasz{
7062192800Strasz	int		error;
7063192800Strasz	vsecattr_t      vsecattr;
7064192800Strasz	int		aclbsize;	/* size of acl list in bytes */
7065192800Strasz	aclent_t	*aaclp;
7066192800Strasz
7067192800Strasz	if (ap->a_type != ACL_TYPE_NFS4)
7068197435Strasz		return (EINVAL);
7069192800Strasz
7070192800Strasz	if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
7071192800Strasz		return (EINVAL);
7072192800Strasz
7073192800Strasz	/*
7074196949Strasz	 * With NFSv4 ACLs, chmod(2) may need to add additional entries,
7075192800Strasz	 * splitting every entry into two and appending "canonical six"
7076192800Strasz	 * entries at the end.  Don't allow for setting an ACL that would
7077192800Strasz	 * cause chmod(2) to run out of ACL entries.
7078192800Strasz	 */
7079192800Strasz	if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
7080192800Strasz		return (ENOSPC);
7081192800Strasz
7082208030Strasz	error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
7083208030Strasz	if (error != 0)
7084208030Strasz		return (error);
7085208030Strasz
7086192800Strasz	vsecattr.vsa_mask = VSA_ACE;
7087192800Strasz	aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t);
7088192800Strasz	vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
7089192800Strasz	aaclp = vsecattr.vsa_aclentp;
7090192800Strasz	vsecattr.vsa_aclentsz = aclbsize;
7091192800Strasz
7092192800Strasz	aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
7093192800Strasz	error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL);
7094192800Strasz	kmem_free(aaclp, aclbsize);
7095192800Strasz
7096192800Strasz	return (error);
7097192800Strasz}
7098192800Strasz
/*
 * VOP_ACLCHECK entry point: unconditionally answers EOPNOTSUPP without
 * looking at its arguments.
 */
int
zfs_freebsd_aclcheck(struct vop_aclcheck_args *ap)
{
	(void)ap;

	return (EOPNOTSUPP);
}
7112192800Strasz
7113168404Spjdstruct vop_vector zfs_vnodeops;
7114168404Spjdstruct vop_vector zfs_fifoops;
7115209962Smmstruct vop_vector zfs_shareops;
7116168404Spjd
7117168404Spjdstruct vop_vector zfs_vnodeops = {
7118185029Spjd	.vop_default =		&default_vnodeops,
7119185029Spjd	.vop_inactive =		zfs_freebsd_inactive,
7120185029Spjd	.vop_reclaim =		zfs_freebsd_reclaim,
7121185029Spjd	.vop_access =		zfs_freebsd_access,
7122168404Spjd#ifdef FREEBSD_NAMECACHE
7123185029Spjd	.vop_lookup =		vfs_cache_lookup,
7124185029Spjd	.vop_cachedlookup =	zfs_freebsd_lookup,
7125168404Spjd#else
7126185029Spjd	.vop_lookup =		zfs_freebsd_lookup,
7127168404Spjd#endif
7128185029Spjd	.vop_getattr =		zfs_freebsd_getattr,
7129185029Spjd	.vop_setattr =		zfs_freebsd_setattr,
7130185029Spjd	.vop_create =		zfs_freebsd_create,
7131185029Spjd	.vop_mknod =		zfs_freebsd_create,
7132185029Spjd	.vop_mkdir =		zfs_freebsd_mkdir,
7133185029Spjd	.vop_readdir =		zfs_freebsd_readdir,
7134185029Spjd	.vop_fsync =		zfs_freebsd_fsync,
7135185029Spjd	.vop_open =		zfs_freebsd_open,
7136185029Spjd	.vop_close =		zfs_freebsd_close,
7137185029Spjd	.vop_rmdir =		zfs_freebsd_rmdir,
7138185029Spjd	.vop_ioctl =		zfs_freebsd_ioctl,
7139185029Spjd	.vop_link =		zfs_freebsd_link,
7140185029Spjd	.vop_symlink =		zfs_freebsd_symlink,
7141185029Spjd	.vop_readlink =		zfs_freebsd_readlink,
7142185029Spjd	.vop_read =		zfs_freebsd_read,
7143185029Spjd	.vop_write =		zfs_freebsd_write,
7144185029Spjd	.vop_remove =		zfs_freebsd_remove,
7145185029Spjd	.vop_rename =		zfs_freebsd_rename,
7146185029Spjd	.vop_pathconf =		zfs_freebsd_pathconf,
7147243518Savg	.vop_bmap =		zfs_freebsd_bmap,
7148185029Spjd	.vop_fid =		zfs_freebsd_fid,
7149185029Spjd	.vop_getextattr =	zfs_getextattr,
7150185029Spjd	.vop_deleteextattr =	zfs_deleteextattr,
7151185029Spjd	.vop_setextattr =	zfs_setextattr,
7152185029Spjd	.vop_listextattr =	zfs_listextattr,
7153192800Strasz	.vop_getacl =		zfs_freebsd_getacl,
7154192800Strasz	.vop_setacl =		zfs_freebsd_setacl,
7155192800Strasz	.vop_aclcheck =		zfs_freebsd_aclcheck,
7156213937Savg	.vop_getpages =		zfs_freebsd_getpages,
7157258746Savg	.vop_putpages =		zfs_freebsd_putpages,
7158168404Spjd};
7159168404Spjd
7160169170Spjdstruct vop_vector zfs_fifoops = {
7161185029Spjd	.vop_default =		&fifo_specops,
7162200162Skib	.vop_fsync =		zfs_freebsd_fsync,
7163185029Spjd	.vop_access =		zfs_freebsd_access,
7164185029Spjd	.vop_getattr =		zfs_freebsd_getattr,
7165185029Spjd	.vop_inactive =		zfs_freebsd_inactive,
7166185029Spjd	.vop_read =		VOP_PANIC,
7167185029Spjd	.vop_reclaim =		zfs_freebsd_reclaim,
7168185029Spjd	.vop_setattr =		zfs_freebsd_setattr,
7169185029Spjd	.vop_write =		VOP_PANIC,
7170196949Strasz	.vop_pathconf = 	zfs_freebsd_fifo_pathconf,
7171185029Spjd	.vop_fid =		zfs_freebsd_fid,
7172192800Strasz	.vop_getacl =		zfs_freebsd_getacl,
7173192800Strasz	.vop_setacl =		zfs_freebsd_setacl,
7174192800Strasz	.vop_aclcheck =		zfs_freebsd_aclcheck,
7175168404Spjd};
7176209962Smm
7177209962Smm/*
7178209962Smm * special share hidden files vnode operations template
7179209962Smm */
7180209962Smmstruct vop_vector zfs_shareops = {
7181209962Smm	.vop_default =		&default_vnodeops,
7182209962Smm	.vop_access =		zfs_freebsd_access,
7183209962Smm	.vop_inactive =		zfs_freebsd_inactive,
7184209962Smm	.vop_reclaim =		zfs_freebsd_reclaim,
7185209962Smm	.vop_fid =		zfs_freebsd_fid,
7186209962Smm	.vop_pathconf =		zfs_freebsd_pathconf,
7187209962Smm};
7188