zfs_vnops.c revision 224251
1168404Spjd/*
2168404Spjd * CDDL HEADER START
3168404Spjd *
4168404Spjd * The contents of this file are subject to the terms of the
5168404Spjd * Common Development and Distribution License (the "License").
6168404Spjd * You may not use this file except in compliance with the License.
7168404Spjd *
8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9168404Spjd * or http://www.opensolaris.org/os/licensing.
10168404Spjd * See the License for the specific language governing permissions
11168404Spjd * and limitations under the License.
12168404Spjd *
13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each
14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15168404Spjd * If applicable, add the following below this CDDL HEADER, with the
16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying
17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
18168404Spjd *
19168404Spjd * CDDL HEADER END
20168404Spjd */
21168404Spjd/*
22212694Smm * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23168404Spjd */
24168404Spjd
25169195Spjd/* Portions Copyright 2007 Jeremy Teo */
26219089Spjd/* Portions Copyright 2010 Robert Milkowski */
27169195Spjd
28168404Spjd#include <sys/types.h>
29168404Spjd#include <sys/param.h>
30168404Spjd#include <sys/time.h>
31168404Spjd#include <sys/systm.h>
32168404Spjd#include <sys/sysmacros.h>
33168404Spjd#include <sys/resource.h>
34168404Spjd#include <sys/vfs.h>
35168404Spjd#include <sys/vnode.h>
36168404Spjd#include <sys/file.h>
37168404Spjd#include <sys/stat.h>
38168404Spjd#include <sys/kmem.h>
39168404Spjd#include <sys/taskq.h>
40168404Spjd#include <sys/uio.h>
41168404Spjd#include <sys/atomic.h>
42168404Spjd#include <sys/namei.h>
43168404Spjd#include <sys/mman.h>
44168404Spjd#include <sys/cmn_err.h>
45168404Spjd#include <sys/errno.h>
46168404Spjd#include <sys/unistd.h>
47168404Spjd#include <sys/zfs_dir.h>
48168404Spjd#include <sys/zfs_ioctl.h>
49168404Spjd#include <sys/fs/zfs.h>
50168404Spjd#include <sys/dmu.h>
51219089Spjd#include <sys/dmu_objset.h>
52168404Spjd#include <sys/spa.h>
53168404Spjd#include <sys/txg.h>
54168404Spjd#include <sys/dbuf.h>
55168404Spjd#include <sys/zap.h>
56219089Spjd#include <sys/sa.h>
57168404Spjd#include <sys/dirent.h>
58168962Spjd#include <sys/policy.h>
59168962Spjd#include <sys/sunddi.h>
60168404Spjd#include <sys/filio.h>
61209962Smm#include <sys/sid.h>
62168404Spjd#include <sys/zfs_ctldir.h>
63185029Spjd#include <sys/zfs_fuid.h>
64219089Spjd#include <sys/zfs_sa.h>
65168404Spjd#include <sys/dnlc.h>
66168404Spjd#include <sys/zfs_rlock.h>
67185029Spjd#include <sys/extdirent.h>
68185029Spjd#include <sys/kidmap.h>
69168404Spjd#include <sys/bio.h>
70168404Spjd#include <sys/buf.h>
71168404Spjd#include <sys/sf_buf.h>
72168404Spjd#include <sys/sched.h>
73192800Strasz#include <sys/acl.h>
74215401Savg#include <vm/vm_pageout.h>
75168404Spjd
76168404Spjd/*
77168404Spjd * Programming rules.
78168404Spjd *
79168404Spjd * Each vnode op performs some logical unit of work.  To do this, the ZPL must
80168404Spjd * properly lock its in-core state, create a DMU transaction, do the work,
81168404Spjd * record this work in the intent log (ZIL), commit the DMU transaction,
82185029Spjd * and wait for the intent log to commit if it is a synchronous operation.
83185029Spjd * Moreover, the vnode ops must work in both normal and log replay context.
84168404Spjd * The ordering of events is important to avoid deadlocks and references
85168404Spjd * to freed memory.  The example below illustrates the following Big Rules:
86168404Spjd *
87168404Spjd *  (1) A check must be made in each zfs thread for a mounted file system.
88168404Spjd *	This is done avoiding races using ZFS_ENTER(zfsvfs).
89185029Spjd *      A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
90185029Spjd *      must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
91185029Spjd *      can return EIO from the calling function.
92168404Spjd *
93168404Spjd *  (2)	VN_RELE() should always be the last thing except for zil_commit()
94168404Spjd *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
95168404Spjd *	First, if it's the last reference, the vnode/znode
96168404Spjd *	can be freed, so the zp may point to freed memory.  Second, the last
97168404Spjd *	reference will call zfs_zinactive(), which may induce a lot of work --
98168404Spjd *	pushing cached pages (which acquires range locks) and syncing out
99168404Spjd *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
100168404Spjd *	which could deadlock the system if you were already holding one.
101191900Skmacy *	If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
102168404Spjd *
103168404Spjd *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
104168404Spjd *	as they can span dmu_tx_assign() calls.
105168404Spjd *
106209962Smm *  (4)	Always pass TXG_NOWAIT as the second argument to dmu_tx_assign().
107168404Spjd *	This is critical because we don't want to block while holding locks.
108168404Spjd *	Note, in particular, that if a lock is sometimes acquired before
109168404Spjd *	the tx assigns, and sometimes after (e.g. z_lock), then failing to
110168404Spjd *	use a non-blocking assign can deadlock the system.  The scenario:
111168404Spjd *
112168404Spjd *	Thread A has grabbed a lock before calling dmu_tx_assign().
113168404Spjd *	Thread B is in an already-assigned tx, and blocks for this lock.
114168404Spjd *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
115168404Spjd *	forever, because the previous txg can't quiesce until B's tx commits.
116168404Spjd *
117168404Spjd *	If dmu_tx_assign() returns ERESTART, then drop all locks, call
118168404Spjd *	dmu_tx_wait(), and try again.
119168404Spjd *
120168404Spjd *  (5)	If the operation succeeded, generate the intent log entry for it
121168404Spjd *	before dropping locks.  This ensures that the ordering of events
122168404Spjd *	in the intent log matches the order in which they actually occurred.
123209962Smm *      During ZIL replay the zfs_log_* functions will update the sequence
124209962Smm *	number to indicate the zil transaction has replayed.
125168404Spjd *
126168404Spjd *  (6)	At the end of each vnode op, the DMU tx must always commit,
127168404Spjd *	regardless of whether there were any errors.
128168404Spjd *
129219089Spjd *  (7)	After dropping all locks, invoke zil_commit(zilog, foid)
130168404Spjd *	to ensure that synchronous semantics are provided when necessary.
131168404Spjd *
132168404Spjd * In general, this is how things should be ordered in each vnode op:
133168404Spjd *
134168404Spjd *	ZFS_ENTER(zfsvfs);		// exit if unmounted
135168404Spjd * top:
136168404Spjd *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may VN_HOLD())
137168404Spjd *	rw_enter(...);			// grab any other locks you need
138168404Spjd *	tx = dmu_tx_create(...);	// get DMU tx
139168404Spjd *	dmu_tx_hold_*();		// hold each object you might modify
140209962Smm *	error = dmu_tx_assign(tx, TXG_NOWAIT);	// try to assign
141168404Spjd *	if (error) {
142168404Spjd *		rw_exit(...);		// drop locks
143168404Spjd *		zfs_dirent_unlock(dl);	// unlock directory entry
144168404Spjd *		VN_RELE(...);		// release held vnodes
145209962Smm *		if (error == ERESTART) {
146168404Spjd *			dmu_tx_wait(tx);
147168404Spjd *			dmu_tx_abort(tx);
148168404Spjd *			goto top;
149168404Spjd *		}
150168404Spjd *		dmu_tx_abort(tx);	// abort DMU tx
151168404Spjd *		ZFS_EXIT(zfsvfs);	// finished in zfs
152168404Spjd *		return (error);		// really out of space
153168404Spjd *	}
154168404Spjd *	error = do_real_work();		// do whatever this VOP does
155168404Spjd *	if (error == 0)
156168404Spjd *		zfs_log_*(...);		// on success, make ZIL entry
157168404Spjd *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
158168404Spjd *	rw_exit(...);			// drop locks
159168404Spjd *	zfs_dirent_unlock(dl);		// unlock directory entry
160168404Spjd *	VN_RELE(...);			// release held vnodes
161219089Spjd *	zil_commit(zilog, foid);	// synchronous when necessary
162168404Spjd *	ZFS_EXIT(zfsvfs);		// finished in zfs
163168404Spjd *	return (error);			// done, report error
164168404Spjd */
165185029Spjd
166168404Spjd/* ARGSUSED */
167168404Spjdstatic int
168185029Spjdzfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
169168404Spjd{
170168962Spjd	znode_t	*zp = VTOZ(*vpp);
171209962Smm	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
172168404Spjd
173209962Smm	ZFS_ENTER(zfsvfs);
174209962Smm	ZFS_VERIFY_ZP(zp);
175209962Smm
176219089Spjd	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
177185029Spjd	    ((flag & FAPPEND) == 0)) {
178209962Smm		ZFS_EXIT(zfsvfs);
179185029Spjd		return (EPERM);
180185029Spjd	}
181185029Spjd
182185029Spjd	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
183185029Spjd	    ZTOV(zp)->v_type == VREG &&
184219089Spjd	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
185209962Smm		if (fs_vscan(*vpp, cr, 0) != 0) {
186209962Smm			ZFS_EXIT(zfsvfs);
187185029Spjd			return (EACCES);
188209962Smm		}
189209962Smm	}
190185029Spjd
191168404Spjd	/* Keep a count of the synchronous opens in the znode */
192168962Spjd	if (flag & (FSYNC | FDSYNC))
193168404Spjd		atomic_inc_32(&zp->z_sync_cnt);
194185029Spjd
195209962Smm	ZFS_EXIT(zfsvfs);
196168404Spjd	return (0);
197168404Spjd}
198168404Spjd
199168404Spjd/* ARGSUSED */
200168404Spjdstatic int
201185029Spjdzfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
202185029Spjd    caller_context_t *ct)
203168404Spjd{
204168962Spjd	znode_t	*zp = VTOZ(vp);
205209962Smm	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
206168404Spjd
207210470Smm	/*
208210470Smm	 * Clean up any locks held by this process on the vp.
209210470Smm	 */
210210470Smm	cleanlocks(vp, ddi_get_pid(), 0);
211210470Smm	cleanshares(vp, ddi_get_pid());
212210470Smm
213209962Smm	ZFS_ENTER(zfsvfs);
214209962Smm	ZFS_VERIFY_ZP(zp);
215209962Smm
216168404Spjd	/* Decrement the synchronous opens in the znode */
217185029Spjd	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
218168404Spjd		atomic_dec_32(&zp->z_sync_cnt);
219168404Spjd
220185029Spjd	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
221185029Spjd	    ZTOV(zp)->v_type == VREG &&
222219089Spjd	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
223185029Spjd		VERIFY(fs_vscan(vp, cr, 1) == 0);
224185029Spjd
225209962Smm	ZFS_EXIT(zfsvfs);
226168404Spjd	return (0);
227168404Spjd}
228168404Spjd
229168404Spjd/*
230168404Spjd * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
231168404Spjd * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
232168404Spjd */
233168404Spjdstatic int
234168978Spjdzfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
235168404Spjd{
236168404Spjd	znode_t	*zp = VTOZ(vp);
237168404Spjd	uint64_t noff = (uint64_t)*off; /* new offset */
238168404Spjd	uint64_t file_sz;
239168404Spjd	int error;
240168404Spjd	boolean_t hole;
241168404Spjd
242219089Spjd	file_sz = zp->z_size;
243168404Spjd	if (noff >= file_sz)  {
244168404Spjd		return (ENXIO);
245168404Spjd	}
246168404Spjd
247168962Spjd	if (cmd == _FIO_SEEK_HOLE)
248168404Spjd		hole = B_TRUE;
249168404Spjd	else
250168404Spjd		hole = B_FALSE;
251168404Spjd
252168404Spjd	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
253168404Spjd
254168404Spjd	/* end of file? */
255168404Spjd	if ((error == ESRCH) || (noff > file_sz)) {
256168404Spjd		/*
257168404Spjd		 * Handle the virtual hole at the end of file.
258168404Spjd		 */
259168404Spjd		if (hole) {
260168404Spjd			*off = file_sz;
261168404Spjd			return (0);
262168404Spjd		}
263168404Spjd		return (ENXIO);
264168404Spjd	}
265168404Spjd
266168404Spjd	if (noff < *off)
267168404Spjd		return (error);
268168404Spjd	*off = noff;
269168404Spjd	return (error);
270168404Spjd}
271168404Spjd
272168404Spjd/* ARGSUSED */
273168404Spjdstatic int
274168978Spjdzfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
275185029Spjd    int *rvalp, caller_context_t *ct)
276168404Spjd{
277168962Spjd	offset_t off;
278168962Spjd	int error;
279168962Spjd	zfsvfs_t *zfsvfs;
280185029Spjd	znode_t *zp;
281168404Spjd
282168404Spjd	switch (com) {
283185029Spjd	case _FIOFFS:
284168962Spjd		return (0);
285168404Spjd
286168962Spjd		/*
287168962Spjd		 * The following two ioctls are used by bfu.  Faking out,
288168962Spjd		 * necessary to avoid bfu errors.
289168962Spjd		 */
290185029Spjd	case _FIOGDIO:
291185029Spjd	case _FIOSDIO:
292168962Spjd		return (0);
293168962Spjd
294185029Spjd	case _FIO_SEEK_DATA:
295185029Spjd	case _FIO_SEEK_HOLE:
296168962Spjd		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
297168962Spjd			return (EFAULT);
298168962Spjd
299185029Spjd		zp = VTOZ(vp);
300185029Spjd		zfsvfs = zp->z_zfsvfs;
301168404Spjd		ZFS_ENTER(zfsvfs);
302185029Spjd		ZFS_VERIFY_ZP(zp);
303168404Spjd
304168404Spjd		/* offset parameter is in/out */
305168404Spjd		error = zfs_holey(vp, com, &off);
306168404Spjd		ZFS_EXIT(zfsvfs);
307168404Spjd		if (error)
308168404Spjd			return (error);
309168962Spjd		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
310168962Spjd			return (EFAULT);
311168404Spjd		return (0);
312168404Spjd	}
313168404Spjd	return (ENOTTY);
314168404Spjd}
315168404Spjd
316209962Smmstatic vm_page_t
317209962Smmpage_lookup(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
318209962Smm{
319209962Smm	vm_object_t obj;
320209962Smm	vm_page_t pp;
321209962Smm
322209962Smm	obj = vp->v_object;
323209962Smm	VM_OBJECT_LOCK_ASSERT(obj, MA_OWNED);
324209962Smm
325209962Smm	for (;;) {
326209962Smm		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
327209962Smm		    vm_page_is_valid(pp, (vm_offset_t)off, nbytes)) {
328212652Savg			if ((pp->oflags & VPO_BUSY) != 0) {
329212652Savg				/*
330212652Savg				 * Reference the page before unlocking and
331212652Savg				 * sleeping so that the page daemon is less
332212652Savg				 * likely to reclaim it.
333212652Savg				 */
334212652Savg				vm_page_lock_queues();
335212652Savg				vm_page_flag_set(pp, PG_REFERENCED);
336212652Savg				vm_page_sleep(pp, "zfsmwb");
337209962Smm				continue;
338212652Savg			}
339209962Smm			vm_page_busy(pp);
340209962Smm			vm_page_undirty(pp);
341209962Smm		} else {
342209962Smm			if (__predict_false(obj->cache != NULL)) {
343209962Smm				vm_page_cache_free(obj, OFF_TO_IDX(start),
344209962Smm				    OFF_TO_IDX(start) + 1);
345209962Smm			}
346209962Smm			pp = NULL;
347209962Smm		}
348209962Smm		break;
349209962Smm	}
350209962Smm	return (pp);
351209962Smm}
352209962Smm
353209962Smmstatic void
354209962Smmpage_unlock(vm_page_t pp)
355209962Smm{
356209962Smm
357209962Smm	vm_page_wakeup(pp);
358209962Smm}
359209962Smm
360209962Smmstatic caddr_t
361209962Smmzfs_map_page(vm_page_t pp, struct sf_buf **sfp)
362209962Smm{
363209962Smm
364212951Savg	*sfp = sf_buf_alloc(pp, 0);
365209962Smm	return ((caddr_t)sf_buf_kva(*sfp));
366209962Smm}
367209962Smm
/*
 * Release an sf_buf obtained from zfs_map_page().
 */
static void
zfs_unmap_page(struct sf_buf *sf)
{
	sf_buf_free(sf);
}
374209962Smm
375168404Spjd/*
376168404Spjd * When a file is memory mapped, we must keep the IO data synchronized
377168404Spjd * between the DMU cache and the memory mapped pages.  What this means:
378168404Spjd *
379168404Spjd * On Write:	If we find a memory mapped page, we write to *both*
380168404Spjd *		the page and the dmu buffer.
381168404Spjd */
382209962Smmstatic void
383209962Smmupdate_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
384209962Smm    int segflg, dmu_tx_t *tx)
385168404Spjd{
386168404Spjd	vm_object_t obj;
387168404Spjd	struct sf_buf *sf;
388212655Savg	int off;
389168404Spjd
390168404Spjd	ASSERT(vp->v_mount != NULL);
391168404Spjd	obj = vp->v_object;
392168404Spjd	ASSERT(obj != NULL);
393168404Spjd
394168404Spjd	off = start & PAGEOFFSET;
395168404Spjd	VM_OBJECT_LOCK(obj);
396168404Spjd	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
397209962Smm		vm_page_t pp;
398212655Savg		int nbytes = MIN(PAGESIZE - off, len);
399168404Spjd
400209962Smm		if ((pp = page_lookup(vp, start, off, nbytes)) != NULL) {
401168404Spjd			caddr_t va;
402168404Spjd
403168404Spjd			VM_OBJECT_UNLOCK(obj);
404209962Smm			va = zfs_map_page(pp, &sf);
405209962Smm			if (segflg == UIO_NOCOPY) {
406209962Smm				(void) dmu_write(os, oid, start+off, nbytes,
407209962Smm				    va+off, tx);
408209962Smm			} else {
409209962Smm				(void) dmu_read(os, oid, start+off, nbytes,
410216378Spjd				    va+off, DMU_READ_PREFETCH);
411169059Spjd			}
412209962Smm			zfs_unmap_page(sf);
413168404Spjd			VM_OBJECT_LOCK(obj);
414209962Smm			page_unlock(pp);
415168404Spjd		}
416209962Smm		len -= nbytes;
417168404Spjd		off = 0;
418168404Spjd	}
419168404Spjd	VM_OBJECT_UNLOCK(obj);
420168404Spjd}
421168404Spjd
422168404Spjd/*
423219089Spjd * Read with UIO_NOCOPY flag means that sendfile(2) requests
424219089Spjd * ZFS to populate a range of page cache pages with data.
425219089Spjd *
426219089Spjd * NOTE: this function could be optimized to pre-allocate
427219089Spjd * all pages in advance, drain VPO_BUSY on all of them,
428219089Spjd * map them into contiguous KVA region and populate them
429219089Spjd * in one single dmu_read() call.
430219089Spjd */
431219089Spjdstatic int
432219089Spjdmappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
433219089Spjd{
434219089Spjd	znode_t *zp = VTOZ(vp);
435219089Spjd	objset_t *os = zp->z_zfsvfs->z_os;
436219089Spjd	struct sf_buf *sf;
437219089Spjd	vm_object_t obj;
438219089Spjd	vm_page_t pp;
439219089Spjd	int64_t start;
440219089Spjd	caddr_t va;
441219089Spjd	int len = nbytes;
442219089Spjd	int off;
443219089Spjd	int error = 0;
444219089Spjd
445219089Spjd	ASSERT(uio->uio_segflg == UIO_NOCOPY);
446219089Spjd	ASSERT(vp->v_mount != NULL);
447219089Spjd	obj = vp->v_object;
448219089Spjd	ASSERT(obj != NULL);
449219089Spjd	ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);
450219089Spjd
451219089Spjd	VM_OBJECT_LOCK(obj);
452219089Spjd	for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
453219089Spjd		int bytes = MIN(PAGESIZE, len);
454219089Spjd
455219089Spjd		pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_NOBUSY |
456219089Spjd		    VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_IGN_SBUSY);
457219089Spjd		if (pp->valid == 0) {
458219089Spjd			vm_page_io_start(pp);
459219089Spjd			VM_OBJECT_UNLOCK(obj);
460219089Spjd			va = zfs_map_page(pp, &sf);
461219089Spjd			error = dmu_read(os, zp->z_id, start, bytes, va,
462219089Spjd			    DMU_READ_PREFETCH);
463219089Spjd			if (bytes != PAGESIZE && error == 0)
464219089Spjd				bzero(va + bytes, PAGESIZE - bytes);
465219089Spjd			zfs_unmap_page(sf);
466219089Spjd			VM_OBJECT_LOCK(obj);
467219089Spjd			vm_page_io_finish(pp);
468219089Spjd			vm_page_lock(pp);
469219089Spjd			if (error) {
470219089Spjd				vm_page_free(pp);
471219089Spjd			} else {
472219089Spjd				pp->valid = VM_PAGE_BITS_ALL;
473219089Spjd				vm_page_activate(pp);
474219089Spjd			}
475219089Spjd			vm_page_unlock(pp);
476219089Spjd		}
477219089Spjd		if (error)
478219089Spjd			break;
479219089Spjd		uio->uio_resid -= bytes;
480219089Spjd		uio->uio_offset += bytes;
481219089Spjd		len -= bytes;
482219089Spjd	}
483219089Spjd	VM_OBJECT_UNLOCK(obj);
484219089Spjd	return (error);
485219089Spjd}
486219089Spjd
487219089Spjd/*
488168404Spjd * When a file is memory mapped, we must keep the IO data synchronized
489168404Spjd * between the DMU cache and the memory mapped pages.  What this means:
490168404Spjd *
491168404Spjd * On Read:	We "read" preferentially from memory mapped pages,
492168404Spjd *		else we default from the dmu buffer.
493168404Spjd *
494168404Spjd * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
495168404Spjd *	the file is memory mapped.
496168404Spjd */
497168404Spjdstatic int
498168404Spjdmappedread(vnode_t *vp, int nbytes, uio_t *uio)
499168404Spjd{
500168404Spjd	znode_t *zp = VTOZ(vp);
501168404Spjd	objset_t *os = zp->z_zfsvfs->z_os;
502168404Spjd	vm_object_t obj;
503212655Savg	int64_t start;
504168926Spjd	caddr_t va;
505168404Spjd	int len = nbytes;
506212655Savg	int off;
507168404Spjd	int error = 0;
508168404Spjd
509168404Spjd	ASSERT(vp->v_mount != NULL);
510168404Spjd	obj = vp->v_object;
511168404Spjd	ASSERT(obj != NULL);
512168404Spjd
513168404Spjd	start = uio->uio_loffset;
514168404Spjd	off = start & PAGEOFFSET;
515168404Spjd	VM_OBJECT_LOCK(obj);
516168404Spjd	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
517219089Spjd		vm_page_t pp;
518219089Spjd		uint64_t bytes = MIN(PAGESIZE - off, len);
519168404Spjd
520219089Spjd		if (pp = page_lookup(vp, start, off, bytes)) {
521219089Spjd			struct sf_buf *sf;
522219089Spjd			caddr_t va;
523212652Savg
524168404Spjd			VM_OBJECT_UNLOCK(obj);
525219089Spjd			va = zfs_map_page(pp, &sf);
526219089Spjd			error = uiomove(va + off, bytes, UIO_READ, uio);
527219089Spjd			zfs_unmap_page(sf);
528168404Spjd			VM_OBJECT_LOCK(obj);
529219089Spjd			page_unlock(pp);
530219089Spjd		} else {
531168926Spjd			VM_OBJECT_UNLOCK(obj);
532219089Spjd			error = dmu_read_uio(os, zp->z_id, uio, bytes);
533168926Spjd			VM_OBJECT_LOCK(obj);
534168404Spjd		}
535168404Spjd		len -= bytes;
536168404Spjd		off = 0;
537168404Spjd		if (error)
538168404Spjd			break;
539168404Spjd	}
540168404Spjd	VM_OBJECT_UNLOCK(obj);
541168404Spjd	return (error);
542168404Spjd}
543168404Spjd
544168404Spjdoffset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
545168404Spjd
546168404Spjd/*
547168404Spjd * Read bytes from specified file into supplied buffer.
548168404Spjd *
549168404Spjd *	IN:	vp	- vnode of file to be read from.
550168404Spjd *		uio	- structure supplying read location, range info,
551168404Spjd *			  and return buffer.
552168404Spjd *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
553168404Spjd *		cr	- credentials of caller.
554185029Spjd *		ct	- caller context
555168404Spjd *
556168404Spjd *	OUT:	uio	- updated offset and range, buffer filled.
557168404Spjd *
558168404Spjd *	RETURN:	0 if success
559168404Spjd *		error code if failure
560168404Spjd *
561168404Spjd * Side Effects:
562168404Spjd *	vp - atime updated if byte count > 0
563168404Spjd */
564168404Spjd/* ARGSUSED */
565168404Spjdstatic int
566168962Spjdzfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
567168404Spjd{
568168404Spjd	znode_t		*zp = VTOZ(vp);
569168404Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
570185029Spjd	objset_t	*os;
571168404Spjd	ssize_t		n, nbytes;
572168404Spjd	int		error;
573168404Spjd	rl_t		*rl;
574219089Spjd	xuio_t		*xuio = NULL;
575168404Spjd
576168404Spjd	ZFS_ENTER(zfsvfs);
577185029Spjd	ZFS_VERIFY_ZP(zp);
578185029Spjd	os = zfsvfs->z_os;
579168404Spjd
580219089Spjd	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
581185029Spjd		ZFS_EXIT(zfsvfs);
582185029Spjd		return (EACCES);
583185029Spjd	}
584185029Spjd
585168404Spjd	/*
586168404Spjd	 * Validate file offset
587168404Spjd	 */
588168404Spjd	if (uio->uio_loffset < (offset_t)0) {
589168404Spjd		ZFS_EXIT(zfsvfs);
590168404Spjd		return (EINVAL);
591168404Spjd	}
592168404Spjd
593168404Spjd	/*
594168404Spjd	 * Fasttrack empty reads
595168404Spjd	 */
596168404Spjd	if (uio->uio_resid == 0) {
597168404Spjd		ZFS_EXIT(zfsvfs);
598168404Spjd		return (0);
599168404Spjd	}
600168404Spjd
601168404Spjd	/*
602168962Spjd	 * Check for mandatory locks
603168962Spjd	 */
604219089Spjd	if (MANDMODE(zp->z_mode)) {
605168962Spjd		if (error = chklock(vp, FREAD,
606168962Spjd		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
607168962Spjd			ZFS_EXIT(zfsvfs);
608168962Spjd			return (error);
609168962Spjd		}
610168962Spjd	}
611168962Spjd
612168962Spjd	/*
613168404Spjd	 * If we're in FRSYNC mode, sync out this znode before reading it.
614168404Spjd	 */
615219089Spjd	if (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
616219089Spjd		zil_commit(zfsvfs->z_log, zp->z_id);
617168404Spjd
618168404Spjd	/*
619168404Spjd	 * Lock the range against changes.
620168404Spjd	 */
621168404Spjd	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
622168404Spjd
623168404Spjd	/*
624168404Spjd	 * If we are reading past end-of-file we can skip
625168404Spjd	 * to the end; but we might still need to set atime.
626168404Spjd	 */
627219089Spjd	if (uio->uio_loffset >= zp->z_size) {
628168404Spjd		error = 0;
629168404Spjd		goto out;
630168404Spjd	}
631168404Spjd
632219089Spjd	ASSERT(uio->uio_loffset < zp->z_size);
633219089Spjd	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
634168404Spjd
635219089Spjd#ifdef sun
636219089Spjd	if ((uio->uio_extflg == UIO_XUIO) &&
637219089Spjd	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
638219089Spjd		int nblk;
639219089Spjd		int blksz = zp->z_blksz;
640219089Spjd		uint64_t offset = uio->uio_loffset;
641219089Spjd
642219089Spjd		xuio = (xuio_t *)uio;
643219089Spjd		if ((ISP2(blksz))) {
644219089Spjd			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
645219089Spjd			    blksz)) / blksz;
646219089Spjd		} else {
647219089Spjd			ASSERT(offset + n <= blksz);
648219089Spjd			nblk = 1;
649219089Spjd		}
650219089Spjd		(void) dmu_xuio_init(xuio, nblk);
651219089Spjd
652219089Spjd		if (vn_has_cached_data(vp)) {
653219089Spjd			/*
654219089Spjd			 * For simplicity, we always allocate a full buffer
655219089Spjd			 * even if we only expect to read a portion of a block.
656219089Spjd			 */
657219089Spjd			while (--nblk >= 0) {
658219089Spjd				(void) dmu_xuio_add(xuio,
659219089Spjd				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
660219089Spjd				    blksz), 0, blksz);
661219089Spjd			}
662219089Spjd		}
663219089Spjd	}
664219089Spjd#endif	/* sun */
665219089Spjd
666168404Spjd	while (n > 0) {
667168404Spjd		nbytes = MIN(n, zfs_read_chunk_size -
668168404Spjd		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
669168404Spjd
670219089Spjd#ifdef __FreeBSD__
671219089Spjd		if (uio->uio_segflg == UIO_NOCOPY)
672219089Spjd			error = mappedread_sf(vp, nbytes, uio);
673219089Spjd		else
674219089Spjd#endif /* __FreeBSD__ */
675168404Spjd		if (vn_has_cached_data(vp))
676168404Spjd			error = mappedread(vp, nbytes, uio);
677168404Spjd		else
678168404Spjd			error = dmu_read_uio(os, zp->z_id, uio, nbytes);
679185029Spjd		if (error) {
680185029Spjd			/* convert checksum errors into IO errors */
681185029Spjd			if (error == ECKSUM)
682185029Spjd				error = EIO;
683168404Spjd			break;
684185029Spjd		}
685168962Spjd
686168404Spjd		n -= nbytes;
687168404Spjd	}
688168404Spjdout:
689168404Spjd	zfs_range_unlock(rl);
690168404Spjd
691168404Spjd	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
692168404Spjd	ZFS_EXIT(zfsvfs);
693168404Spjd	return (error);
694168404Spjd}
695168404Spjd
696168404Spjd/*
697168404Spjd * Write the bytes to a file.
698168404Spjd *
699168404Spjd *	IN:	vp	- vnode of file to be written to.
700168404Spjd *		uio	- structure supplying write location, range info,
701168404Spjd *			  and data buffer.
702213673Spjd *		ioflag	- FAPPEND flag set if in append mode.
703168404Spjd *		cr	- credentials of caller.
704185029Spjd *		ct	- caller context (NFS/CIFS fem monitor only)
705168404Spjd *
706168404Spjd *	OUT:	uio	- updated offset and range.
707168404Spjd *
708168404Spjd *	RETURN:	0 if success
709168404Spjd *		error code if failure
710168404Spjd *
711168404Spjd * Timestamps:
712168404Spjd *	vp - ctime|mtime updated if byte count > 0
713168404Spjd */
714219089Spjd
715168404Spjd/* ARGSUSED */
716168404Spjdstatic int
717168962Spjdzfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
718168404Spjd{
719168404Spjd	znode_t		*zp = VTOZ(vp);
720168962Spjd	rlim64_t	limit = MAXOFFSET_T;
721168404Spjd	ssize_t		start_resid = uio->uio_resid;
722168404Spjd	ssize_t		tx_bytes;
723168404Spjd	uint64_t	end_size;
724168404Spjd	dmu_tx_t	*tx;
725168404Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
726185029Spjd	zilog_t		*zilog;
727168404Spjd	offset_t	woff;
728168404Spjd	ssize_t		n, nbytes;
729168404Spjd	rl_t		*rl;
730168404Spjd	int		max_blksz = zfsvfs->z_max_blksz;
731168404Spjd	int		error;
732209962Smm	arc_buf_t	*abuf;
733219089Spjd	iovec_t		*aiov;
734219089Spjd	xuio_t		*xuio = NULL;
735219089Spjd	int		i_iov = 0;
736219089Spjd	int		iovcnt = uio->uio_iovcnt;
737219089Spjd	iovec_t		*iovp = uio->uio_iov;
738219089Spjd	int		write_eof;
739219089Spjd	int		count = 0;
740219089Spjd	sa_bulk_attr_t	bulk[4];
741219089Spjd	uint64_t	mtime[2], ctime[2];
742168404Spjd
743168404Spjd	/*
744168404Spjd	 * Fasttrack empty write
745168404Spjd	 */
746168404Spjd	n = start_resid;
747168404Spjd	if (n == 0)
748168404Spjd		return (0);
749168404Spjd
750168962Spjd	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
751168962Spjd		limit = MAXOFFSET_T;
752168962Spjd
753168404Spjd	ZFS_ENTER(zfsvfs);
754185029Spjd	ZFS_VERIFY_ZP(zp);
755168404Spjd
756219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
757219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
758219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
759219089Spjd	    &zp->z_size, 8);
760219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
761219089Spjd	    &zp->z_pflags, 8);
762219089Spjd
763168404Spjd	/*
764185029Spjd	 * If immutable or not appending then return EPERM
765185029Spjd	 */
766219089Spjd	if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
767219089Spjd	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
768219089Spjd	    (uio->uio_loffset < zp->z_size))) {
769185029Spjd		ZFS_EXIT(zfsvfs);
770185029Spjd		return (EPERM);
771185029Spjd	}
772185029Spjd
773185029Spjd	zilog = zfsvfs->z_log;
774185029Spjd
775185029Spjd	/*
776219089Spjd	 * Validate file offset
777219089Spjd	 */
778219089Spjd	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
779219089Spjd	if (woff < 0) {
780219089Spjd		ZFS_EXIT(zfsvfs);
781219089Spjd		return (EINVAL);
782219089Spjd	}
783219089Spjd
784219089Spjd	/*
785219089Spjd	 * Check for mandatory locks before calling zfs_range_lock()
786219089Spjd	 * in order to prevent a deadlock with locks set via fcntl().
787219089Spjd	 */
788219089Spjd	if (MANDMODE((mode_t)zp->z_mode) &&
789219089Spjd	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
790219089Spjd		ZFS_EXIT(zfsvfs);
791219089Spjd		return (error);
792219089Spjd	}
793219089Spjd
794219089Spjd#ifdef sun
795219089Spjd	/*
796168404Spjd	 * Pre-fault the pages to ensure slow (eg NFS) pages
797168404Spjd	 * don't hold up txg.
798219089Spjd	 * Skip this if uio contains loaned arc_buf.
799168404Spjd	 */
800219089Spjd	if ((uio->uio_extflg == UIO_XUIO) &&
801219089Spjd	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
802219089Spjd		xuio = (xuio_t *)uio;
803219089Spjd	else
804219089Spjd		uio_prefaultpages(MIN(n, max_blksz), uio);
805219089Spjd#endif	/* sun */
806168404Spjd
807168404Spjd	/*
808168404Spjd	 * If in append mode, set the io offset pointer to eof.
809168404Spjd	 */
810213673Spjd	if (ioflag & FAPPEND) {
811168404Spjd		/*
812219089Spjd		 * Obtain an appending range lock to guarantee file append
813219089Spjd		 * semantics.  We reset the write offset once we have the lock.
814168404Spjd		 */
815168404Spjd		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
816219089Spjd		woff = rl->r_off;
817168404Spjd		if (rl->r_len == UINT64_MAX) {
818219089Spjd			/*
819219089Spjd			 * We overlocked the file because this write will cause
820219089Spjd			 * the file block size to increase.
821219089Spjd			 * Note that zp_size cannot change with this lock held.
822219089Spjd			 */
823219089Spjd			woff = zp->z_size;
824168404Spjd		}
825219089Spjd		uio->uio_loffset = woff;
826168404Spjd	} else {
827168404Spjd		/*
828219089Spjd		 * Note that if the file block size will change as a result of
829219089Spjd		 * this write, then this range lock will lock the entire file
830219089Spjd		 * so that we can re-write the block safely.
831168404Spjd		 */
832168404Spjd		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
833168404Spjd	}
834168404Spjd
835168962Spjd	if (woff >= limit) {
836168962Spjd		zfs_range_unlock(rl);
837168962Spjd		ZFS_EXIT(zfsvfs);
838168962Spjd		return (EFBIG);
839168962Spjd	}
840168962Spjd
841168962Spjd	if ((woff + n) > limit || woff > (limit - n))
842168962Spjd		n = limit - woff;
843168962Spjd
844219089Spjd	/* Will this write extend the file length? */
845219089Spjd	write_eof = (woff + n > zp->z_size);
846168404Spjd
847219089Spjd	end_size = MAX(zp->z_size, woff + n);
848219089Spjd
849168404Spjd	/*
850168404Spjd	 * Write the file in reasonable size chunks.  Each chunk is written
851168404Spjd	 * in a separate transaction; this keeps the intent log records small
852168404Spjd	 * and allows us to do more fine-grained space accounting.
853168404Spjd	 */
854168404Spjd	while (n > 0) {
855209962Smm		abuf = NULL;
856209962Smm		woff = uio->uio_loffset;
857209962Smmagain:
858219089Spjd		if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
859219089Spjd		    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
860209962Smm			if (abuf != NULL)
861209962Smm				dmu_return_arcbuf(abuf);
862209962Smm			error = EDQUOT;
863209962Smm			break;
864209962Smm		}
865209962Smm
866219089Spjd		if (xuio && abuf == NULL) {
867219089Spjd			ASSERT(i_iov < iovcnt);
868219089Spjd			aiov = &iovp[i_iov];
869219089Spjd			abuf = dmu_xuio_arcbuf(xuio, i_iov);
870219089Spjd			dmu_xuio_clear(xuio, i_iov);
871219089Spjd			DTRACE_PROBE3(zfs_cp_write, int, i_iov,
872219089Spjd			    iovec_t *, aiov, arc_buf_t *, abuf);
873219089Spjd			ASSERT((aiov->iov_base == abuf->b_data) ||
874219089Spjd			    ((char *)aiov->iov_base - (char *)abuf->b_data +
875219089Spjd			    aiov->iov_len == arc_buf_size(abuf)));
876219089Spjd			i_iov++;
877219089Spjd		} else if (abuf == NULL && n >= max_blksz &&
878219089Spjd		    woff >= zp->z_size &&
879209962Smm		    P2PHASE(woff, max_blksz) == 0 &&
880209962Smm		    zp->z_blksz == max_blksz) {
881219089Spjd			/*
882219089Spjd			 * This write covers a full block.  "Borrow" a buffer
883219089Spjd			 * from the dmu so that we can fill it before we enter
884219089Spjd			 * a transaction.  This avoids the possibility of
885219089Spjd			 * holding up the transaction if the data copy hangs
886219089Spjd			 * up on a pagefault (e.g., from an NFS server mapping).
887219089Spjd			 */
888209962Smm			size_t cbytes;
889209962Smm
890219089Spjd			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
891219089Spjd			    max_blksz);
892209962Smm			ASSERT(abuf != NULL);
893209962Smm			ASSERT(arc_buf_size(abuf) == max_blksz);
894209962Smm			if (error = uiocopy(abuf->b_data, max_blksz,
895209962Smm			    UIO_WRITE, uio, &cbytes)) {
896209962Smm				dmu_return_arcbuf(abuf);
897209962Smm				break;
898209962Smm			}
899209962Smm			ASSERT(cbytes == max_blksz);
900209962Smm		}
901209962Smm
902209962Smm		/*
903168404Spjd		 * Start a transaction.
904168404Spjd		 */
905168404Spjd		tx = dmu_tx_create(zfsvfs->z_os);
906219089Spjd		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
907168404Spjd		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
908219089Spjd		zfs_sa_upgrade_txholds(tx, zp);
909209962Smm		error = dmu_tx_assign(tx, TXG_NOWAIT);
910168404Spjd		if (error) {
911209962Smm			if (error == ERESTART) {
912168404Spjd				dmu_tx_wait(tx);
913168404Spjd				dmu_tx_abort(tx);
914209962Smm				goto again;
915168404Spjd			}
916168404Spjd			dmu_tx_abort(tx);
917209962Smm			if (abuf != NULL)
918209962Smm				dmu_return_arcbuf(abuf);
919168404Spjd			break;
920168404Spjd		}
921168404Spjd
922168404Spjd		/*
923168404Spjd		 * If zfs_range_lock() over-locked we grow the blocksize
924168404Spjd		 * and then reduce the lock range.  This will only happen
925168404Spjd		 * on the first iteration since zfs_range_reduce() will
926168404Spjd		 * shrink down r_len to the appropriate size.
927168404Spjd		 */
928168404Spjd		if (rl->r_len == UINT64_MAX) {
929168404Spjd			uint64_t new_blksz;
930168404Spjd
931168404Spjd			if (zp->z_blksz > max_blksz) {
932168404Spjd				ASSERT(!ISP2(zp->z_blksz));
933168404Spjd				new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
934168404Spjd			} else {
935168404Spjd				new_blksz = MIN(end_size, max_blksz);
936168404Spjd			}
937168404Spjd			zfs_grow_blocksize(zp, new_blksz, tx);
938168404Spjd			zfs_range_reduce(rl, woff, n);
939168404Spjd		}
940168404Spjd
941168404Spjd		/*
942168404Spjd		 * XXX - should we really limit each write to z_max_blksz?
943168404Spjd		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
944168404Spjd		 */
945168404Spjd		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
946168404Spjd
947219089Spjd		if (woff + nbytes > zp->z_size)
948168404Spjd			vnode_pager_setsize(vp, woff + nbytes);
949168404Spjd
950209962Smm		if (abuf == NULL) {
951209962Smm			tx_bytes = uio->uio_resid;
952219089Spjd			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
953219089Spjd			    uio, nbytes, tx);
954209962Smm			tx_bytes -= uio->uio_resid;
955168404Spjd		} else {
956209962Smm			tx_bytes = nbytes;
957219089Spjd			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
958219089Spjd			/*
959219089Spjd			 * If this is not a full block write, but we are
960219089Spjd			 * extending the file past EOF and this data starts
961219089Spjd			 * block-aligned, use assign_arcbuf().  Otherwise,
962219089Spjd			 * write via dmu_write().
963219089Spjd			 */
964219089Spjd			if (tx_bytes < max_blksz && (!write_eof ||
965219089Spjd			    aiov->iov_base != abuf->b_data)) {
966219089Spjd				ASSERT(xuio);
967219089Spjd				dmu_write(zfsvfs->z_os, zp->z_id, woff,
968219089Spjd				    aiov->iov_len, aiov->iov_base, tx);
969219089Spjd				dmu_return_arcbuf(abuf);
970219089Spjd				xuio_stat_wbuf_copied();
971219089Spjd			} else {
972219089Spjd				ASSERT(xuio || tx_bytes == max_blksz);
973219089Spjd				dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
974219089Spjd				    woff, abuf, tx);
975219089Spjd			}
976209962Smm			ASSERT(tx_bytes <= uio->uio_resid);
977209962Smm			uioskip(uio, tx_bytes);
978168404Spjd		}
979212657Savg		if (tx_bytes && vn_has_cached_data(vp)) {
980209962Smm			update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
981209962Smm			    zp->z_id, uio->uio_segflg, tx);
982209962Smm		}
983209962Smm
984209962Smm		/*
985168404Spjd		 * If we made no progress, we're done.  If we made even
986168404Spjd		 * partial progress, update the znode and ZIL accordingly.
987168404Spjd		 */
988168404Spjd		if (tx_bytes == 0) {
989219089Spjd			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
990219089Spjd			    (void *)&zp->z_size, sizeof (uint64_t), tx);
991168404Spjd			dmu_tx_commit(tx);
992168404Spjd			ASSERT(error != 0);
993168404Spjd			break;
994168404Spjd		}
995168404Spjd
996168404Spjd		/*
997168404Spjd		 * Clear Set-UID/Set-GID bits on successful write if not
998168404Spjd		 * privileged and at least one of the execute bits is set.
999168404Spjd		 *
1000168404Spjd		 * It would be nice to do this after all writes have
1001168404Spjd		 * been done, but that would still expose the ISUID/ISGID
1002168404Spjd		 * to another app after the partial write is committed.
1003185029Spjd		 *
1004185029Spjd		 * Note: we don't call zfs_fuid_map_id() here because
1005185029Spjd		 * user 0 is not an ephemeral uid.
1006168404Spjd		 */
1007168404Spjd		mutex_enter(&zp->z_acl_lock);
1008219089Spjd		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
1009168404Spjd		    (S_IXUSR >> 6))) != 0 &&
1010219089Spjd		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
1011185029Spjd		    secpolicy_vnode_setid_retain(vp, cr,
1012219089Spjd		    (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
1013219089Spjd			uint64_t newmode;
1014219089Spjd			zp->z_mode &= ~(S_ISUID | S_ISGID);
1015219089Spjd			newmode = zp->z_mode;
1016219089Spjd			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
1017219089Spjd			    (void *)&newmode, sizeof (uint64_t), tx);
1018168404Spjd		}
1019168404Spjd		mutex_exit(&zp->z_acl_lock);
1020168404Spjd
1021219089Spjd		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
1022219089Spjd		    B_TRUE);
1023168404Spjd
1024168404Spjd		/*
1025168404Spjd		 * Update the file size (zp_size) if it has changed;
1026168404Spjd		 * account for possible concurrent updates.
1027168404Spjd		 */
1028219089Spjd		while ((end_size = zp->z_size) < uio->uio_loffset) {
1029219089Spjd			(void) atomic_cas_64(&zp->z_size, end_size,
1030168404Spjd			    uio->uio_loffset);
1031219089Spjd			ASSERT(error == 0);
1032219089Spjd		}
1033219089Spjd		/*
1034219089Spjd		 * If we are replaying and eof is non zero then force
1035219089Spjd		 * the file size to the specified eof. Note, there's no
1036219089Spjd		 * concurrency during replay.
1037219089Spjd		 */
1038219089Spjd		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
1039219089Spjd			zp->z_size = zfsvfs->z_replay_eof;
1040219089Spjd
1041219089Spjd		error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1042219089Spjd
1043168404Spjd		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
1044168404Spjd		dmu_tx_commit(tx);
1045168404Spjd
1046168404Spjd		if (error != 0)
1047168404Spjd			break;
1048168404Spjd		ASSERT(tx_bytes == nbytes);
1049168404Spjd		n -= nbytes;
1050219089Spjd
1051219089Spjd#ifdef sun
1052219089Spjd		if (!xuio && n > 0)
1053219089Spjd			uio_prefaultpages(MIN(n, max_blksz), uio);
1054219089Spjd#endif	/* sun */
1055168404Spjd	}
1056168404Spjd
1057168404Spjd	zfs_range_unlock(rl);
1058168404Spjd
1059168404Spjd	/*
1060168404Spjd	 * If we're in replay mode, or we made no progress, return error.
1061168404Spjd	 * Otherwise, it's at least a partial write, so it's successful.
1062168404Spjd	 */
1063209962Smm	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
1064168404Spjd		ZFS_EXIT(zfsvfs);
1065168404Spjd		return (error);
1066168404Spjd	}
1067168404Spjd
1068219089Spjd	if (ioflag & (FSYNC | FDSYNC) ||
1069219089Spjd	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1070219089Spjd		zil_commit(zilog, zp->z_id);
1071168404Spjd
1072168404Spjd	ZFS_EXIT(zfsvfs);
1073168404Spjd	return (0);
1074168404Spjd}
1075168404Spjd
1076168404Spjdvoid
1077219089Spjdzfs_get_done(zgd_t *zgd, int error)
1078168404Spjd{
1079219089Spjd	znode_t *zp = zgd->zgd_private;
1080219089Spjd	objset_t *os = zp->z_zfsvfs->z_os;
1081168404Spjd	int vfslocked;
1082168404Spjd
1083219089Spjd	if (zgd->zgd_db)
1084219089Spjd		dmu_buf_rele(zgd->zgd_db, zgd);
1085219089Spjd
1086219089Spjd	zfs_range_unlock(zgd->zgd_rl);
1087219089Spjd
1088219089Spjd	vfslocked = VFS_LOCK_GIANT(zp->z_zfsvfs->z_vfs);
1089191900Skmacy	/*
1090191900Skmacy	 * Release the vnode asynchronously as we currently have the
1091191900Skmacy	 * txg stopped from syncing.
1092191900Skmacy	 */
1093219089Spjd	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1094219089Spjd
1095219089Spjd	if (error == 0 && zgd->zgd_bp)
1096219089Spjd		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
1097219089Spjd
1098168404Spjd	kmem_free(zgd, sizeof (zgd_t));
1099168404Spjd	VFS_UNLOCK_GIANT(vfslocked);
1100168404Spjd}
1101168404Spjd
1102214378Smm#ifdef DEBUG
1103214378Smmstatic int zil_fault_io = 0;
1104214378Smm#endif
1105214378Smm
1106168404Spjd/*
1107168404Spjd * Get data to generate a TX_WRITE intent log record.
1108168404Spjd */
1109168404Spjdint
1110168404Spjdzfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
1111168404Spjd{
1112168404Spjd	zfsvfs_t *zfsvfs = arg;
1113168404Spjd	objset_t *os = zfsvfs->z_os;
1114168404Spjd	znode_t *zp;
1115219089Spjd	uint64_t object = lr->lr_foid;
1116219089Spjd	uint64_t offset = lr->lr_offset;
1117219089Spjd	uint64_t size = lr->lr_length;
1118219089Spjd	blkptr_t *bp = &lr->lr_blkptr;
1119168404Spjd	dmu_buf_t *db;
1120168404Spjd	zgd_t *zgd;
1121168404Spjd	int error = 0;
1122168404Spjd
1123219089Spjd	ASSERT(zio != NULL);
1124219089Spjd	ASSERT(size != 0);
1125168404Spjd
1126168404Spjd	/*
1127168404Spjd	 * Nothing to do if the file has been removed
1128168404Spjd	 */
1129219089Spjd	if (zfs_zget(zfsvfs, object, &zp) != 0)
1130168404Spjd		return (ENOENT);
1131168404Spjd	if (zp->z_unlinked) {
1132191900Skmacy		/*
1133191900Skmacy		 * Release the vnode asynchronously as we currently have the
1134191900Skmacy		 * txg stopped from syncing.
1135191900Skmacy		 */
1136196307Spjd		VN_RELE_ASYNC(ZTOV(zp),
1137196307Spjd		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1138168404Spjd		return (ENOENT);
1139168404Spjd	}
1140168404Spjd
1141219089Spjd	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
1142219089Spjd	zgd->zgd_zilog = zfsvfs->z_log;
1143219089Spjd	zgd->zgd_private = zp;
1144219089Spjd
1145168404Spjd	/*
1146168404Spjd	 * Write records come in two flavors: immediate and indirect.
1147168404Spjd	 * For small writes it's cheaper to store the data with the
1148168404Spjd	 * log record (immediate); for large writes it's cheaper to
1149168404Spjd	 * sync the data and get a pointer to it (indirect) so that
1150168404Spjd	 * we don't have to write the data twice.
1151168404Spjd	 */
1152168404Spjd	if (buf != NULL) { /* immediate write */
1153219089Spjd		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
1154168404Spjd		/* test for truncation needs to be done while range locked */
1155219089Spjd		if (offset >= zp->z_size) {
1156168404Spjd			error = ENOENT;
1157219089Spjd		} else {
1158219089Spjd			error = dmu_read(os, object, offset, size, buf,
1159219089Spjd			    DMU_READ_NO_PREFETCH);
1160168404Spjd		}
1161219089Spjd		ASSERT(error == 0 || error == ENOENT);
1162168404Spjd	} else { /* indirect write */
1163168404Spjd		/*
1164168404Spjd		 * Have to lock the whole block to ensure when it's
1165168404Spjd		 * written out and it's checksum is being calculated
1166168404Spjd		 * that no one can change the data. We need to re-check
1167168404Spjd		 * blocksize after we get the lock in case it's changed!
1168168404Spjd		 */
1169168404Spjd		for (;;) {
1170219089Spjd			uint64_t blkoff;
1171219089Spjd			size = zp->z_blksz;
1172219089Spjd			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
1173219089Spjd			offset -= blkoff;
1174219089Spjd			zgd->zgd_rl = zfs_range_lock(zp, offset, size,
1175219089Spjd			    RL_READER);
1176219089Spjd			if (zp->z_blksz == size)
1177168404Spjd				break;
1178219089Spjd			offset += blkoff;
1179219089Spjd			zfs_range_unlock(zgd->zgd_rl);
1180168404Spjd		}
1181168404Spjd		/* test for truncation needs to be done while range locked */
1182219089Spjd		if (lr->lr_offset >= zp->z_size)
1183168404Spjd			error = ENOENT;
1184214378Smm#ifdef DEBUG
1185214378Smm		if (zil_fault_io) {
1186214378Smm			error = EIO;
1187214378Smm			zil_fault_io = 0;
1188214378Smm		}
1189214378Smm#endif
1190219089Spjd		if (error == 0)
1191219089Spjd			error = dmu_buf_hold(os, object, offset, zgd, &db,
1192219089Spjd			    DMU_READ_NO_PREFETCH);
1193214378Smm
1194209962Smm		if (error == 0) {
1195219089Spjd			zgd->zgd_db = db;
1196219089Spjd			zgd->zgd_bp = bp;
1197219089Spjd
1198219089Spjd			ASSERT(db->db_offset == offset);
1199219089Spjd			ASSERT(db->db_size == size);
1200219089Spjd
1201219089Spjd			error = dmu_sync(zio, lr->lr_common.lrc_txg,
1202219089Spjd			    zfs_get_done, zgd);
1203219089Spjd			ASSERT(error || lr->lr_length <= zp->z_blksz);
1204219089Spjd
1205209962Smm			/*
1206219089Spjd			 * On success, we need to wait for the write I/O
1207219089Spjd			 * initiated by dmu_sync() to complete before we can
1208219089Spjd			 * release this dbuf.  We will finish everything up
1209219089Spjd			 * in the zfs_get_done() callback.
1210209962Smm			 */
1211219089Spjd			if (error == 0)
1212219089Spjd				return (0);
1213209962Smm
1214219089Spjd			if (error == EALREADY) {
1215219089Spjd				lr->lr_common.lrc_txtype = TX_WRITE2;
1216219089Spjd				error = 0;
1217219089Spjd			}
1218209962Smm		}
1219168404Spjd	}
1220219089Spjd
1221219089Spjd	zfs_get_done(zgd, error);
1222219089Spjd
1223168404Spjd	return (error);
1224168404Spjd}
1225168404Spjd
1226168404Spjd/*ARGSUSED*/
1227168404Spjdstatic int
1228185029Spjdzfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
1229185029Spjd    caller_context_t *ct)
1230168404Spjd{
1231168404Spjd	znode_t *zp = VTOZ(vp);
1232168404Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1233168404Spjd	int error;
1234168404Spjd
1235168404Spjd	ZFS_ENTER(zfsvfs);
1236185029Spjd	ZFS_VERIFY_ZP(zp);
1237185029Spjd
1238185029Spjd	if (flag & V_ACE_MASK)
1239185029Spjd		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
1240185029Spjd	else
1241185029Spjd		error = zfs_zaccess_rwx(zp, mode, flag, cr);
1242185029Spjd
1243168404Spjd	ZFS_EXIT(zfsvfs);
1244168404Spjd	return (error);
1245168404Spjd}
1246168404Spjd
1247168404Spjd/*
1248211932Smm * If vnode is for a device return a specfs vnode instead.
1249211932Smm */
1250211932Smmstatic int
1251211932Smmspecvp_check(vnode_t **vpp, cred_t *cr)
1252211932Smm{
1253211932Smm	int error = 0;
1254211932Smm
1255211932Smm	if (IS_DEVVP(*vpp)) {
1256211932Smm		struct vnode *svp;
1257211932Smm
1258211932Smm		svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
1259211932Smm		VN_RELE(*vpp);
1260211932Smm		if (svp == NULL)
1261211932Smm			error = ENOSYS;
1262211932Smm		*vpp = svp;
1263211932Smm	}
1264211932Smm	return (error);
1265211932Smm}
1266211932Smm
1267211932Smm
1268211932Smm/*
1269168404Spjd * Lookup an entry in a directory, or an extended attribute directory.
1270168404Spjd * If it exists, return a held vnode reference for it.
1271168404Spjd *
1272168404Spjd *	IN:	dvp	- vnode of directory to search.
1273168404Spjd *		nm	- name of entry to lookup.
1274168404Spjd *		pnp	- full pathname to lookup [UNUSED].
1275168404Spjd *		flags	- LOOKUP_XATTR set if looking for an attribute.
1276168404Spjd *		rdir	- root directory vnode [UNUSED].
1277168404Spjd *		cr	- credentials of caller.
1278185029Spjd *		ct	- caller context
1279185029Spjd *		direntflags - directory lookup flags
1280185029Spjd *		realpnp - returned pathname.
1281168404Spjd *
1282168404Spjd *	OUT:	vpp	- vnode of located entry, NULL if not found.
1283168404Spjd *
1284168404Spjd *	RETURN:	0 if success
1285168404Spjd *		error code if failure
1286168404Spjd *
1287168404Spjd * Timestamps:
1288168404Spjd *	NA
1289168404Spjd */
1290168404Spjd/* ARGSUSED */
1291168962Spjdstatic int
1292168962Spjdzfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
1293185029Spjd    int nameiop, cred_t *cr, kthread_t *td, int flags)
1294168404Spjd{
1295168962Spjd	znode_t *zdp = VTOZ(dvp);
1296168962Spjd	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1297211932Smm	int	error = 0;
1298185029Spjd	int *direntflags = NULL;
1299185029Spjd	void *realpnp = NULL;
1300168404Spjd
1301211932Smm	/* fast path */
1302211932Smm	if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
1303211932Smm
1304211932Smm		if (dvp->v_type != VDIR) {
1305211932Smm			return (ENOTDIR);
1306219089Spjd		} else if (zdp->z_sa_hdl == NULL) {
1307211932Smm			return (EIO);
1308211932Smm		}
1309211932Smm
1310211932Smm		if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
1311211932Smm			error = zfs_fastaccesschk_execute(zdp, cr);
1312211932Smm			if (!error) {
1313211932Smm				*vpp = dvp;
1314211932Smm				VN_HOLD(*vpp);
1315211932Smm				return (0);
1316211932Smm			}
1317211932Smm			return (error);
1318211932Smm		} else {
1319211932Smm			vnode_t *tvp = dnlc_lookup(dvp, nm);
1320211932Smm
1321211932Smm			if (tvp) {
1322211932Smm				error = zfs_fastaccesschk_execute(zdp, cr);
1323211932Smm				if (error) {
1324211932Smm					VN_RELE(tvp);
1325211932Smm					return (error);
1326211932Smm				}
1327211932Smm				if (tvp == DNLC_NO_VNODE) {
1328211932Smm					VN_RELE(tvp);
1329211932Smm					return (ENOENT);
1330211932Smm				} else {
1331211932Smm					*vpp = tvp;
1332211932Smm					return (specvp_check(vpp, cr));
1333211932Smm				}
1334211932Smm			}
1335211932Smm		}
1336211932Smm	}
1337211932Smm
1338211932Smm	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
1339211932Smm
1340168404Spjd	ZFS_ENTER(zfsvfs);
1341185029Spjd	ZFS_VERIFY_ZP(zdp);
1342168404Spjd
1343168404Spjd	*vpp = NULL;
1344168404Spjd
1345185029Spjd	if (flags & LOOKUP_XATTR) {
1346168404Spjd#ifdef TODO
1347168404Spjd		/*
1348168404Spjd		 * If the xattr property is off, refuse the lookup request.
1349168404Spjd		 */
1350168404Spjd		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
1351168404Spjd			ZFS_EXIT(zfsvfs);
1352168404Spjd			return (EINVAL);
1353168404Spjd		}
1354185029Spjd#endif
1355168404Spjd
1356168404Spjd		/*
1357168404Spjd		 * We don't allow recursive attributes..
1358168404Spjd		 * Maybe someday we will.
1359168404Spjd		 */
1360219089Spjd		if (zdp->z_pflags & ZFS_XATTR) {
1361168404Spjd			ZFS_EXIT(zfsvfs);
1362168404Spjd			return (EINVAL);
1363168404Spjd		}
1364168404Spjd
1365168404Spjd		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
1366168404Spjd			ZFS_EXIT(zfsvfs);
1367168404Spjd			return (error);
1368168404Spjd		}
1369168404Spjd
1370168404Spjd		/*
1371168404Spjd		 * Do we have permission to get into attribute directory?
1372168404Spjd		 */
1373168404Spjd
1374185029Spjd		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
1375185029Spjd		    B_FALSE, cr)) {
1376168404Spjd			VN_RELE(*vpp);
1377185029Spjd			*vpp = NULL;
1378168404Spjd		}
1379168404Spjd
1380168404Spjd		ZFS_EXIT(zfsvfs);
1381168404Spjd		return (error);
1382168404Spjd	}
1383168404Spjd
1384168404Spjd	if (dvp->v_type != VDIR) {
1385168404Spjd		ZFS_EXIT(zfsvfs);
1386168404Spjd		return (ENOTDIR);
1387168404Spjd	}
1388168404Spjd
1389168404Spjd	/*
1390168404Spjd	 * Check accessibility of directory.
1391168404Spjd	 */
1392168404Spjd
1393185029Spjd	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
1394168404Spjd		ZFS_EXIT(zfsvfs);
1395168404Spjd		return (error);
1396168404Spjd	}
1397168404Spjd
1398185029Spjd	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
1399185029Spjd	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1400185029Spjd		ZFS_EXIT(zfsvfs);
1401185029Spjd		return (EILSEQ);
1402185029Spjd	}
1403168404Spjd
1404185029Spjd	error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
1405211932Smm	if (error == 0)
1406211932Smm		error = specvp_check(vpp, cr);
1407168962Spjd
1408168404Spjd	/* Translate errors and add SAVENAME when needed. */
1409168404Spjd	if (cnp->cn_flags & ISLASTCN) {
1410168404Spjd		switch (nameiop) {
1411168404Spjd		case CREATE:
1412168404Spjd		case RENAME:
1413168404Spjd			if (error == ENOENT) {
1414168404Spjd				error = EJUSTRETURN;
1415168404Spjd				cnp->cn_flags |= SAVENAME;
1416168404Spjd				break;
1417168404Spjd			}
1418168404Spjd			/* FALLTHROUGH */
1419168404Spjd		case DELETE:
1420168404Spjd			if (error == 0)
1421168404Spjd				cnp->cn_flags |= SAVENAME;
1422168404Spjd			break;
1423168404Spjd		}
1424168404Spjd	}
1425168404Spjd	if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) {
1426169198Spjd		int ltype = 0;
1427169198Spjd
1428169198Spjd		if (cnp->cn_flags & ISDOTDOT) {
1429176559Sattilio			ltype = VOP_ISLOCKED(dvp);
1430175294Sattilio			VOP_UNLOCK(dvp, 0);
1431169198Spjd		}
1432206667Spjd		ZFS_EXIT(zfsvfs);
1433219089Spjd		error = zfs_vnode_lock(*vpp, cnp->cn_lkflags);
1434168962Spjd		if (cnp->cn_flags & ISDOTDOT)
1435175202Sattilio			vn_lock(dvp, ltype | LK_RETRY);
1436169172Spjd		if (error != 0) {
1437169172Spjd			VN_RELE(*vpp);
1438169172Spjd			*vpp = NULL;
1439169172Spjd			return (error);
1440169172Spjd		}
1441206667Spjd	} else {
1442206667Spjd		ZFS_EXIT(zfsvfs);
1443168404Spjd	}
1444168404Spjd
1445168404Spjd#ifdef FREEBSD_NAMECACHE
1446168404Spjd	/*
1447168404Spjd	 * Insert name into cache (as non-existent) if appropriate.
1448168404Spjd	 */
1449168404Spjd	if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
1450168404Spjd		cache_enter(dvp, *vpp, cnp);
1451169170Spjd	/*
1452169170Spjd	 * Insert name into cache if appropriate.
1453169170Spjd	 */
1454168404Spjd	if (error == 0 && (cnp->cn_flags & MAKEENTRY)) {
1455168404Spjd		if (!(cnp->cn_flags & ISLASTCN) ||
1456168404Spjd		    (nameiop != DELETE && nameiop != RENAME)) {
1457168404Spjd			cache_enter(dvp, *vpp, cnp);
1458168404Spjd		}
1459168404Spjd	}
1460168404Spjd#endif
1461168404Spjd
1462168404Spjd	return (error);
1463168404Spjd}
1464168404Spjd
1465168404Spjd/*
1466168404Spjd * Attempt to create a new entry in a directory.  If the entry
1467168404Spjd * already exists, truncate the file if permissible, else return
1468168404Spjd * an error.  Return the vp of the created or trunc'd file.
1469168404Spjd *
1470168404Spjd *	IN:	dvp	- vnode of directory to put new file entry in.
1471168404Spjd *		name	- name of new file entry.
1472168404Spjd *		vap	- attributes of new file.
1473168404Spjd *		excl	- flag indicating exclusive or non-exclusive mode.
1474168404Spjd *		mode	- mode to open file with.
1475168404Spjd *		cr	- credentials of caller.
1476168404Spjd *		flag	- large file flag [UNUSED].
1477185029Spjd *		ct	- caller context
1478185029Spjd *		vsecp 	- ACL to be set
1479168404Spjd *
1480168404Spjd *	OUT:	vpp	- vnode of created or trunc'd entry.
1481168404Spjd *
1482168404Spjd *	RETURN:	0 if success
1483168404Spjd *		error code if failure
1484168404Spjd *
1485168404Spjd * Timestamps:
1486168404Spjd *	dvp - ctime|mtime updated if new entry created
1487168404Spjd *	 vp - ctime|mtime always, atime if new
1488168404Spjd */
1489185029Spjd
1490168404Spjd/* ARGSUSED */
1491168404Spjdstatic int
1492168962Spjdzfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
1493185029Spjd    vnode_t **vpp, cred_t *cr, kthread_t *td)
1494168404Spjd{
1495168404Spjd	znode_t		*zp, *dzp = VTOZ(dvp);
1496168404Spjd	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1497185029Spjd	zilog_t		*zilog;
1498185029Spjd	objset_t	*os;
1499168404Spjd	zfs_dirlock_t	*dl;
1500168404Spjd	dmu_tx_t	*tx;
1501168404Spjd	int		error;
1502209962Smm	ksid_t		*ksid;
1503209962Smm	uid_t		uid;
1504209962Smm	gid_t		gid = crgetgid(cr);
1505219089Spjd	zfs_acl_ids_t   acl_ids;
1506209962Smm	boolean_t	fuid_dirtied;
1507219089Spjd	boolean_t	have_acl = B_FALSE;
1508185029Spjd	void		*vsecp = NULL;
1509185029Spjd	int		flag = 0;
1510168404Spjd
1511185029Spjd	/*
1512185029Spjd	 * If we have an ephemeral id, ACL, or XVATTR then
1513185029Spjd	 * make sure file system is at proper version
1514185029Spjd	 */
1515185029Spjd
1516209962Smm	ksid = crgetsid(cr, KSID_OWNER);
1517209962Smm	if (ksid)
1518209962Smm		uid = ksid_getid(ksid);
1519209962Smm	else
1520209962Smm		uid = crgetuid(cr);
1521219089Spjd
1522185029Spjd	if (zfsvfs->z_use_fuids == B_FALSE &&
1523185029Spjd	    (vsecp || (vap->va_mask & AT_XVATTR) ||
1524219089Spjd	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1525185029Spjd		return (EINVAL);
1526185029Spjd
1527168404Spjd	ZFS_ENTER(zfsvfs);
1528185029Spjd	ZFS_VERIFY_ZP(dzp);
1529185029Spjd	os = zfsvfs->z_os;
1530185029Spjd	zilog = zfsvfs->z_log;
1531168404Spjd
1532185029Spjd	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
1533185029Spjd	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1534185029Spjd		ZFS_EXIT(zfsvfs);
1535185029Spjd		return (EILSEQ);
1536185029Spjd	}
1537185029Spjd
1538185029Spjd	if (vap->va_mask & AT_XVATTR) {
1539197861Spjd		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
1540185029Spjd		    crgetuid(cr), cr, vap->va_type)) != 0) {
1541185029Spjd			ZFS_EXIT(zfsvfs);
1542185029Spjd			return (error);
1543185029Spjd		}
1544185029Spjd	}
1545168404Spjdtop:
1546168404Spjd	*vpp = NULL;
1547168404Spjd
1548182905Strasz	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
1549182905Strasz		vap->va_mode &= ~S_ISVTX;
1550168404Spjd
1551168404Spjd	if (*name == '\0') {
1552168404Spjd		/*
1553168404Spjd		 * Null component name refers to the directory itself.
1554168404Spjd		 */
1555168404Spjd		VN_HOLD(dvp);
1556168404Spjd		zp = dzp;
1557168404Spjd		dl = NULL;
1558168404Spjd		error = 0;
1559168404Spjd	} else {
1560168404Spjd		/* possible VN_HOLD(zp) */
1561185029Spjd		int zflg = 0;
1562185029Spjd
1563185029Spjd		if (flag & FIGNORECASE)
1564185029Spjd			zflg |= ZCILOOK;
1565185029Spjd
1566185029Spjd		error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1567185029Spjd		    NULL, NULL);
1568185029Spjd		if (error) {
1569219089Spjd			if (have_acl)
1570219089Spjd				zfs_acl_ids_free(&acl_ids);
1571168404Spjd			if (strcmp(name, "..") == 0)
1572168404Spjd				error = EISDIR;
1573168404Spjd			ZFS_EXIT(zfsvfs);
1574168404Spjd			return (error);
1575168404Spjd		}
1576168404Spjd	}
1577219089Spjd
1578185029Spjd	if (zp == NULL) {
1579185029Spjd		uint64_t txtype;
1580168404Spjd
1581168404Spjd		/*
1582168404Spjd		 * Create a new file object and update the directory
1583168404Spjd		 * to reference it.
1584168404Spjd		 */
1585185029Spjd		if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
1586219089Spjd			if (have_acl)
1587219089Spjd				zfs_acl_ids_free(&acl_ids);
1588168404Spjd			goto out;
1589168404Spjd		}
1590168404Spjd
1591168404Spjd		/*
1592168404Spjd		 * We only support the creation of regular files in
1593168404Spjd		 * extended attribute directories.
1594168404Spjd		 */
1595219089Spjd
1596219089Spjd		if ((dzp->z_pflags & ZFS_XATTR) &&
1597168404Spjd		    (vap->va_type != VREG)) {
1598219089Spjd			if (have_acl)
1599219089Spjd				zfs_acl_ids_free(&acl_ids);
1600168404Spjd			error = EINVAL;
1601168404Spjd			goto out;
1602168404Spjd		}
1603168404Spjd
1604219089Spjd		if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
1605219089Spjd		    cr, vsecp, &acl_ids)) != 0)
1606219089Spjd			goto out;
1607219089Spjd		have_acl = B_TRUE;
1608209962Smm
1609209962Smm		if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1610211932Smm			zfs_acl_ids_free(&acl_ids);
1611209962Smm			error = EDQUOT;
1612209962Smm			goto out;
1613209962Smm		}
1614209962Smm
1615168404Spjd		tx = dmu_tx_create(os);
1616219089Spjd
1617219089Spjd		dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1618219089Spjd		    ZFS_SA_BASE_ATTR_SIZE);
1619219089Spjd
1620209962Smm		fuid_dirtied = zfsvfs->z_fuid_dirty;
1621209962Smm		if (fuid_dirtied)
1622209962Smm			zfs_fuid_txhold(zfsvfs, tx);
1623168404Spjd		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1624219089Spjd		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
1625219089Spjd		if (!zfsvfs->z_use_sa &&
1626219089Spjd		    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1627168404Spjd			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1628219089Spjd			    0, acl_ids.z_aclp->z_acl_bytes);
1629185029Spjd		}
1630209962Smm		error = dmu_tx_assign(tx, TXG_NOWAIT);
1631168404Spjd		if (error) {
1632168404Spjd			zfs_dirent_unlock(dl);
1633209962Smm			if (error == ERESTART) {
1634168404Spjd				dmu_tx_wait(tx);
1635168404Spjd				dmu_tx_abort(tx);
1636168404Spjd				goto top;
1637168404Spjd			}
1638219089Spjd			zfs_acl_ids_free(&acl_ids);
1639168404Spjd			dmu_tx_abort(tx);
1640168404Spjd			ZFS_EXIT(zfsvfs);
1641168404Spjd			return (error);
1642168404Spjd		}
1643219089Spjd		zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1644209962Smm
1645209962Smm		if (fuid_dirtied)
1646209962Smm			zfs_fuid_sync(zfsvfs, tx);
1647209962Smm
1648168404Spjd		(void) zfs_link_create(dl, zp, tx, ZNEW);
1649185029Spjd		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1650185029Spjd		if (flag & FIGNORECASE)
1651185029Spjd			txtype |= TX_CI;
1652185029Spjd		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1653209962Smm		    vsecp, acl_ids.z_fuidp, vap);
1654209962Smm		zfs_acl_ids_free(&acl_ids);
1655168404Spjd		dmu_tx_commit(tx);
1656168404Spjd	} else {
1657185029Spjd		int aflags = (flag & FAPPEND) ? V_APPEND : 0;
1658185029Spjd
1659219089Spjd		if (have_acl)
1660219089Spjd			zfs_acl_ids_free(&acl_ids);
1661219089Spjd		have_acl = B_FALSE;
1662219089Spjd
1663168404Spjd		/*
1664168404Spjd		 * A directory entry already exists for this name.
1665168404Spjd		 */
1666168404Spjd		/*
1667168962Spjd		 * Can't truncate an existing file if in exclusive mode.
1668168962Spjd		 */
1669168962Spjd		if (excl == EXCL) {
1670168962Spjd			error = EEXIST;
1671168962Spjd			goto out;
1672168962Spjd		}
1673168962Spjd		/*
1674168404Spjd		 * Can't open a directory for writing.
1675168404Spjd		 */
1676168404Spjd		if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
1677168404Spjd			error = EISDIR;
1678168404Spjd			goto out;
1679168404Spjd		}
1680168404Spjd		/*
1681168404Spjd		 * Verify requested access to file.
1682168404Spjd		 */
1683185029Spjd		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
1684168404Spjd			goto out;
1685168404Spjd		}
1686168404Spjd
1687168404Spjd		mutex_enter(&dzp->z_lock);
1688168404Spjd		dzp->z_seq++;
1689168404Spjd		mutex_exit(&dzp->z_lock);
1690168404Spjd
1691168404Spjd		/*
1692168404Spjd		 * Truncate regular files if requested.
1693168404Spjd		 */
1694168404Spjd		if ((ZTOV(zp)->v_type == VREG) &&
1695168404Spjd		    (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
1696185029Spjd			/* we can't hold any locks when calling zfs_freesp() */
1697185029Spjd			zfs_dirent_unlock(dl);
1698185029Spjd			dl = NULL;
1699168404Spjd			error = zfs_freesp(zp, 0, 0, mode, TRUE);
1700185029Spjd			if (error == 0) {
1701185029Spjd				vnevent_create(ZTOV(zp), ct);
1702168404Spjd			}
1703168404Spjd		}
1704168404Spjd	}
1705168404Spjdout:
1706168404Spjd	if (dl)
1707168404Spjd		zfs_dirent_unlock(dl);
1708168404Spjd
1709168404Spjd	if (error) {
1710168404Spjd		if (zp)
1711168404Spjd			VN_RELE(ZTOV(zp));
1712168962Spjd	} else {
1713168962Spjd		*vpp = ZTOV(zp);
1714211932Smm		error = specvp_check(vpp, cr);
1715168404Spjd	}
1716168404Spjd
1717219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1718219089Spjd		zil_commit(zilog, 0);
1719219089Spjd
1720168404Spjd	ZFS_EXIT(zfsvfs);
1721168404Spjd	return (error);
1722168404Spjd}
1723168404Spjd
1724168404Spjd/*
1725168404Spjd * Remove an entry from a directory.
1726168404Spjd *
1727168404Spjd *	IN:	dvp	- vnode of directory to remove entry from.
1728168404Spjd *		name	- name of entry to remove.
1729168404Spjd *		cr	- credentials of caller.
1730185029Spjd *		ct	- caller context
1731185029Spjd *		flags	- case flags
1732168404Spjd *
1733168404Spjd *	RETURN:	0 if success
1734168404Spjd *		error code if failure
1735168404Spjd *
1736168404Spjd * Timestamps:
1737168404Spjd *	dvp - ctime|mtime
1738168404Spjd *	 vp - ctime (if nlink > 0)
1739168404Spjd */
1740219089Spjd
1741219089Spjduint64_t null_xattr = 0;
1742219089Spjd
1743185029Spjd/*ARGSUSED*/
1744168404Spjdstatic int
1745185029Spjdzfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
1746185029Spjd    int flags)
1747168404Spjd{
1748168404Spjd	znode_t		*zp, *dzp = VTOZ(dvp);
1749219089Spjd	znode_t		*xzp;
1750168404Spjd	vnode_t		*vp;
1751168404Spjd	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1752185029Spjd	zilog_t		*zilog;
1753168962Spjd	uint64_t	acl_obj, xattr_obj;
1754219089Spjd	uint64_t 	xattr_obj_unlinked = 0;
1755219089Spjd	uint64_t	obj = 0;
1756168404Spjd	zfs_dirlock_t	*dl;
1757168404Spjd	dmu_tx_t	*tx;
1758168962Spjd	boolean_t	may_delete_now, delete_now = FALSE;
1759185029Spjd	boolean_t	unlinked, toobig = FALSE;
1760185029Spjd	uint64_t	txtype;
1761185029Spjd	pathname_t	*realnmp = NULL;
1762185029Spjd	pathname_t	realnm;
1763168404Spjd	int		error;
1764185029Spjd	int		zflg = ZEXISTS;
1765168404Spjd
1766168404Spjd	ZFS_ENTER(zfsvfs);
1767185029Spjd	ZFS_VERIFY_ZP(dzp);
1768185029Spjd	zilog = zfsvfs->z_log;
1769168404Spjd
1770185029Spjd	if (flags & FIGNORECASE) {
1771185029Spjd		zflg |= ZCILOOK;
1772185029Spjd		pn_alloc(&realnm);
1773185029Spjd		realnmp = &realnm;
1774185029Spjd	}
1775185029Spjd
1776168404Spjdtop:
1777219089Spjd	xattr_obj = 0;
1778219089Spjd	xzp = NULL;
1779168404Spjd	/*
1780168404Spjd	 * Attempt to lock directory; fail if entry doesn't exist.
1781168404Spjd	 */
1782185029Spjd	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1783185029Spjd	    NULL, realnmp)) {
1784185029Spjd		if (realnmp)
1785185029Spjd			pn_free(realnmp);
1786168404Spjd		ZFS_EXIT(zfsvfs);
1787168404Spjd		return (error);
1788168404Spjd	}
1789168404Spjd
1790168404Spjd	vp = ZTOV(zp);
1791168404Spjd
1792168962Spjd	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1793168404Spjd		goto out;
1794168962Spjd	}
1795168404Spjd
1796168962Spjd	/*
1797168962Spjd	 * Need to use rmdir for removing directories.
1798168962Spjd	 */
1799168962Spjd	if (vp->v_type == VDIR) {
1800168962Spjd		error = EPERM;
1801168962Spjd		goto out;
1802168962Spjd	}
1803168962Spjd
1804185029Spjd	vnevent_remove(vp, dvp, name, ct);
1805168962Spjd
1806185029Spjd	if (realnmp)
1807185029Spjd		dnlc_remove(dvp, realnmp->pn_buf);
1808185029Spjd	else
1809185029Spjd		dnlc_remove(dvp, name);
1810168404Spjd
1811219089Spjd	VI_LOCK(vp);
1812219089Spjd	may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp);
1813219089Spjd	VI_UNLOCK(vp);
1814168962Spjd
1815168404Spjd	/*
1816168404Spjd	 * We may delete the znode now, or we may put it in the unlinked set;
1817168404Spjd	 * it depends on whether we're the last link, and on whether there are
1818168404Spjd	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
1819168404Spjd	 * allow for either case.
1820168404Spjd	 */
1821219089Spjd	obj = zp->z_id;
1822168404Spjd	tx = dmu_tx_create(zfsvfs->z_os);
1823168404Spjd	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1824219089Spjd	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1825219089Spjd	zfs_sa_upgrade_txholds(tx, zp);
1826219089Spjd	zfs_sa_upgrade_txholds(tx, dzp);
1827185029Spjd	if (may_delete_now) {
1828185029Spjd		toobig =
1829219089Spjd		    zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
1830185029Spjd		/* if the file is too big, only hold_free a token amount */
1831185029Spjd		dmu_tx_hold_free(tx, zp->z_id, 0,
1832185029Spjd		    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
1833185029Spjd	}
1834168404Spjd
1835168404Spjd	/* are there any extended attributes? */
1836219089Spjd	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1837219089Spjd	    &xattr_obj, sizeof (xattr_obj));
1838219089Spjd	if (error == 0 && xattr_obj) {
1839219089Spjd		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1840219089Spjd		ASSERT3U(error, ==, 0);
1841219089Spjd		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1842219089Spjd		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1843168404Spjd	}
1844168404Spjd
1845219089Spjd	mutex_enter(&zp->z_lock);
1846219089Spjd	if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
1847168962Spjd		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
1848219089Spjd	mutex_exit(&zp->z_lock);
1849168962Spjd
1850168404Spjd	/* charge as an update -- would be nice not to charge at all */
1851168404Spjd	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1852168404Spjd
1853209962Smm	error = dmu_tx_assign(tx, TXG_NOWAIT);
1854168404Spjd	if (error) {
1855168404Spjd		zfs_dirent_unlock(dl);
1856168962Spjd		VN_RELE(vp);
1857219089Spjd		if (xzp)
1858219089Spjd			VN_RELE(ZTOV(xzp));
1859209962Smm		if (error == ERESTART) {
1860168404Spjd			dmu_tx_wait(tx);
1861168404Spjd			dmu_tx_abort(tx);
1862168404Spjd			goto top;
1863168404Spjd		}
1864185029Spjd		if (realnmp)
1865185029Spjd			pn_free(realnmp);
1866168404Spjd		dmu_tx_abort(tx);
1867168404Spjd		ZFS_EXIT(zfsvfs);
1868168404Spjd		return (error);
1869168404Spjd	}
1870168404Spjd
1871168404Spjd	/*
1872168404Spjd	 * Remove the directory entry.
1873168404Spjd	 */
1874185029Spjd	error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
1875168404Spjd
1876168404Spjd	if (error) {
1877168404Spjd		dmu_tx_commit(tx);
1878168404Spjd		goto out;
1879168404Spjd	}
1880168404Spjd
1881219089Spjd	if (unlinked) {
1882219089Spjd
1883219089Spjd		/*
1884219089Spjd		 * Hold z_lock so that we can make sure that the ACL obj
1885219089Spjd		 * hasn't changed.  Could have been deleted due to
1886219089Spjd		 * zfs_sa_upgrade().
1887219089Spjd		 */
1888219089Spjd		mutex_enter(&zp->z_lock);
1889168962Spjd		VI_LOCK(vp);
1890219089Spjd		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1891219089Spjd		    &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
1892185029Spjd		delete_now = may_delete_now && !toobig &&
1893168962Spjd		    vp->v_count == 1 && !vn_has_cached_data(vp) &&
1894219089Spjd		    xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) ==
1895219089Spjd		    acl_obj;
1896168962Spjd		VI_UNLOCK(vp);
1897168962Spjd	}
1898168962Spjd
1899168962Spjd	if (delete_now) {
1900219089Spjd		if (xattr_obj_unlinked) {
1901219089Spjd			ASSERT3U(xzp->z_links, ==, 2);
1902168962Spjd			mutex_enter(&xzp->z_lock);
1903168962Spjd			xzp->z_unlinked = 1;
1904219089Spjd			xzp->z_links = 0;
1905219089Spjd			error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
1906219089Spjd			    &xzp->z_links, sizeof (xzp->z_links), tx);
1907219089Spjd			ASSERT3U(error,  ==,  0);
1908168962Spjd			mutex_exit(&xzp->z_lock);
1909168962Spjd			zfs_unlinked_add(xzp, tx);
1910219089Spjd
1911219089Spjd			if (zp->z_is_sa)
1912219089Spjd				error = sa_remove(zp->z_sa_hdl,
1913219089Spjd				    SA_ZPL_XATTR(zfsvfs), tx);
1914219089Spjd			else
1915219089Spjd				error = sa_update(zp->z_sa_hdl,
1916219089Spjd				    SA_ZPL_XATTR(zfsvfs), &null_xattr,
1917219089Spjd				    sizeof (uint64_t), tx);
1918219089Spjd			ASSERT3U(error, ==, 0);
1919168962Spjd		}
1920168962Spjd		VI_LOCK(vp);
1921168962Spjd		vp->v_count--;
1922168962Spjd		ASSERT3U(vp->v_count, ==, 0);
1923168962Spjd		VI_UNLOCK(vp);
1924168962Spjd		mutex_exit(&zp->z_lock);
1925168962Spjd		zfs_znode_delete(zp, tx);
1926168962Spjd	} else if (unlinked) {
1927219089Spjd		mutex_exit(&zp->z_lock);
1928168404Spjd		zfs_unlinked_add(zp, tx);
1929168962Spjd	}
1930168404Spjd
1931185029Spjd	txtype = TX_REMOVE;
1932185029Spjd	if (flags & FIGNORECASE)
1933185029Spjd		txtype |= TX_CI;
1934219089Spjd	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
1935168404Spjd
1936168404Spjd	dmu_tx_commit(tx);
1937168404Spjdout:
1938185029Spjd	if (realnmp)
1939185029Spjd		pn_free(realnmp);
1940185029Spjd
1941168404Spjd	zfs_dirent_unlock(dl);
1942168404Spjd
1943219089Spjd	if (!delete_now)
1944168962Spjd		VN_RELE(vp);
1945219089Spjd	if (xzp)
1946168962Spjd		VN_RELE(ZTOV(xzp));
1947168962Spjd
1948219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1949219089Spjd		zil_commit(zilog, 0);
1950219089Spjd
1951168404Spjd	ZFS_EXIT(zfsvfs);
1952168404Spjd	return (error);
1953168404Spjd}
1954168404Spjd
1955168404Spjd/*
1956168404Spjd * Create a new directory and insert it into dvp using the name
1957168404Spjd * provided.  Return a pointer to the inserted directory.
1958168404Spjd *
1959168404Spjd *	IN:	dvp	- vnode of directory to add subdir to.
1960168404Spjd *		dirname	- name of new directory.
1961168404Spjd *		vap	- attributes of new directory.
1962168404Spjd *		cr	- credentials of caller.
1963185029Spjd *		ct	- caller context
1964185029Spjd *		vsecp	- ACL to be set
1965168404Spjd *
1966168404Spjd *	OUT:	vpp	- vnode of created directory.
1967168404Spjd *
1968168404Spjd *	RETURN:	0 if success
1969168404Spjd *		error code if failure
1970168404Spjd *
1971168404Spjd * Timestamps:
1972168404Spjd *	dvp - ctime|mtime updated
1973168404Spjd *	 vp - ctime|mtime|atime updated
1974168404Spjd */
1975185029Spjd/*ARGSUSED*/
1976168404Spjdstatic int
1977185029Spjdzfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
1978185029Spjd    caller_context_t *ct, int flags, vsecattr_t *vsecp)
1979168404Spjd{
1980168404Spjd	znode_t		*zp, *dzp = VTOZ(dvp);
1981168404Spjd	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1982185029Spjd	zilog_t		*zilog;
1983168404Spjd	zfs_dirlock_t	*dl;
1984185029Spjd	uint64_t	txtype;
1985168404Spjd	dmu_tx_t	*tx;
1986168404Spjd	int		error;
1987185029Spjd	int		zf = ZNEW;
1988209962Smm	ksid_t		*ksid;
1989209962Smm	uid_t		uid;
1990209962Smm	gid_t		gid = crgetgid(cr);
1991219089Spjd	zfs_acl_ids_t   acl_ids;
1992209962Smm	boolean_t	fuid_dirtied;
1993168404Spjd
1994168404Spjd	ASSERT(vap->va_type == VDIR);
1995168404Spjd
1996185029Spjd	/*
1997185029Spjd	 * If we have an ephemeral id, ACL, or XVATTR then
1998185029Spjd	 * make sure file system is at proper version
1999185029Spjd	 */
2000185029Spjd
2001209962Smm	ksid = crgetsid(cr, KSID_OWNER);
2002209962Smm	if (ksid)
2003209962Smm		uid = ksid_getid(ksid);
2004209962Smm	else
2005209962Smm		uid = crgetuid(cr);
2006185029Spjd	if (zfsvfs->z_use_fuids == B_FALSE &&
2007219089Spjd	    (vsecp || (vap->va_mask & AT_XVATTR) ||
2008219089Spjd	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
2009185029Spjd		return (EINVAL);
2010185029Spjd
2011168404Spjd	ZFS_ENTER(zfsvfs);
2012185029Spjd	ZFS_VERIFY_ZP(dzp);
2013185029Spjd	zilog = zfsvfs->z_log;
2014168404Spjd
2015219089Spjd	if (dzp->z_pflags & ZFS_XATTR) {
2016168404Spjd		ZFS_EXIT(zfsvfs);
2017168404Spjd		return (EINVAL);
2018168404Spjd	}
2019168404Spjd
2020185029Spjd	if (zfsvfs->z_utf8 && u8_validate(dirname,
2021185029Spjd	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2022185029Spjd		ZFS_EXIT(zfsvfs);
2023185029Spjd		return (EILSEQ);
2024185029Spjd	}
2025185029Spjd	if (flags & FIGNORECASE)
2026185029Spjd		zf |= ZCILOOK;
2027185029Spjd
2028219089Spjd	if (vap->va_mask & AT_XVATTR) {
2029197861Spjd		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
2030185029Spjd		    crgetuid(cr), cr, vap->va_type)) != 0) {
2031185029Spjd			ZFS_EXIT(zfsvfs);
2032185029Spjd			return (error);
2033185029Spjd		}
2034219089Spjd	}
2035185029Spjd
2036219089Spjd	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
2037219089Spjd	    vsecp, &acl_ids)) != 0) {
2038219089Spjd		ZFS_EXIT(zfsvfs);
2039219089Spjd		return (error);
2040219089Spjd	}
2041168404Spjd	/*
2042168404Spjd	 * First make sure the new directory doesn't exist.
2043219089Spjd	 *
2044219089Spjd	 * Existence is checked first to make sure we don't return
2045219089Spjd	 * EACCES instead of EEXIST which can cause some applications
2046219089Spjd	 * to fail.
2047168404Spjd	 */
2048185029Spjdtop:
2049185029Spjd	*vpp = NULL;
2050185029Spjd
2051185029Spjd	if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
2052185029Spjd	    NULL, NULL)) {
2053219089Spjd		zfs_acl_ids_free(&acl_ids);
2054168404Spjd		ZFS_EXIT(zfsvfs);
2055168404Spjd		return (error);
2056168404Spjd	}
2057168404Spjd
2058185029Spjd	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
2059219089Spjd		zfs_acl_ids_free(&acl_ids);
2060168404Spjd		zfs_dirent_unlock(dl);
2061168404Spjd		ZFS_EXIT(zfsvfs);
2062168404Spjd		return (error);
2063168404Spjd	}
2064168404Spjd
2065209962Smm	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
2066211932Smm		zfs_acl_ids_free(&acl_ids);
2067209962Smm		zfs_dirent_unlock(dl);
2068209962Smm		ZFS_EXIT(zfsvfs);
2069209962Smm		return (EDQUOT);
2070209962Smm	}
2071209962Smm
2072168404Spjd	/*
2073168404Spjd	 * Add a new entry to the directory.
2074168404Spjd	 */
2075168404Spjd	tx = dmu_tx_create(zfsvfs->z_os);
2076168404Spjd	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
2077168404Spjd	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2078209962Smm	fuid_dirtied = zfsvfs->z_fuid_dirty;
2079209962Smm	if (fuid_dirtied)
2080209962Smm		zfs_fuid_txhold(zfsvfs, tx);
2081219089Spjd	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2082219089Spjd		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
2083219089Spjd		    acl_ids.z_aclp->z_acl_bytes);
2084219089Spjd	}
2085219089Spjd
2086219089Spjd	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
2087219089Spjd	    ZFS_SA_BASE_ATTR_SIZE);
2088219089Spjd
2089209962Smm	error = dmu_tx_assign(tx, TXG_NOWAIT);
2090168404Spjd	if (error) {
2091168404Spjd		zfs_dirent_unlock(dl);
2092209962Smm		if (error == ERESTART) {
2093168404Spjd			dmu_tx_wait(tx);
2094168404Spjd			dmu_tx_abort(tx);
2095168404Spjd			goto top;
2096168404Spjd		}
2097219089Spjd		zfs_acl_ids_free(&acl_ids);
2098168404Spjd		dmu_tx_abort(tx);
2099168404Spjd		ZFS_EXIT(zfsvfs);
2100168404Spjd		return (error);
2101168404Spjd	}
2102168404Spjd
2103168404Spjd	/*
2104168404Spjd	 * Create new node.
2105168404Spjd	 */
2106219089Spjd	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
2107168404Spjd
2108209962Smm	if (fuid_dirtied)
2109209962Smm		zfs_fuid_sync(zfsvfs, tx);
2110219089Spjd
2111168404Spjd	/*
2112168404Spjd	 * Now put new name in parent dir.
2113168404Spjd	 */
2114168404Spjd	(void) zfs_link_create(dl, zp, tx, ZNEW);
2115168404Spjd
2116168404Spjd	*vpp = ZTOV(zp);
2117168404Spjd
2118185029Spjd	txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
2119185029Spjd	if (flags & FIGNORECASE)
2120185029Spjd		txtype |= TX_CI;
2121209962Smm	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
2122209962Smm	    acl_ids.z_fuidp, vap);
2123185029Spjd
2124209962Smm	zfs_acl_ids_free(&acl_ids);
2125219089Spjd
2126168404Spjd	dmu_tx_commit(tx);
2127168404Spjd
2128168404Spjd	zfs_dirent_unlock(dl);
2129168404Spjd
2130219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2131219089Spjd		zil_commit(zilog, 0);
2132219089Spjd
2133168404Spjd	ZFS_EXIT(zfsvfs);
2134168404Spjd	return (0);
2135168404Spjd}
2136168404Spjd
2137168404Spjd/*
2138168404Spjd * Remove a directory subdir entry.  If the current working
2139168404Spjd * directory is the same as the subdir to be removed, the
2140168404Spjd * remove will fail.
2141168404Spjd *
2142168404Spjd *	IN:	dvp	- vnode of directory to remove from.
2143168404Spjd *		name	- name of directory to be removed.
2144168404Spjd *		cwd	- vnode of current working directory.
2145168404Spjd *		cr	- credentials of caller.
2146185029Spjd *		ct	- caller context
2147185029Spjd *		flags	- case flags
2148168404Spjd *
2149168404Spjd *	RETURN:	0 if success
2150168404Spjd *		error code if failure
2151168404Spjd *
2152168404Spjd * Timestamps:
2153168404Spjd *	dvp - ctime|mtime updated
2154168404Spjd */
2155185029Spjd/*ARGSUSED*/
2156168404Spjdstatic int
2157185029Spjdzfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
2158185029Spjd    caller_context_t *ct, int flags)
2159168404Spjd{
2160168404Spjd	znode_t		*dzp = VTOZ(dvp);
2161168404Spjd	znode_t		*zp;
2162168404Spjd	vnode_t		*vp;
2163168404Spjd	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2164185029Spjd	zilog_t		*zilog;
2165168404Spjd	zfs_dirlock_t	*dl;
2166168404Spjd	dmu_tx_t	*tx;
2167168404Spjd	int		error;
2168185029Spjd	int		zflg = ZEXISTS;
2169168404Spjd
2170168962Spjd	ZFS_ENTER(zfsvfs);
2171185029Spjd	ZFS_VERIFY_ZP(dzp);
2172185029Spjd	zilog = zfsvfs->z_log;
2173168404Spjd
2174185029Spjd	if (flags & FIGNORECASE)
2175185029Spjd		zflg |= ZCILOOK;
2176168404Spjdtop:
2177168404Spjd	zp = NULL;
2178168404Spjd
2179168404Spjd	/*
2180168404Spjd	 * Attempt to lock directory; fail if entry doesn't exist.
2181168404Spjd	 */
2182185029Spjd	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
2183185029Spjd	    NULL, NULL)) {
2184168404Spjd		ZFS_EXIT(zfsvfs);
2185168404Spjd		return (error);
2186168404Spjd	}
2187168404Spjd
2188168404Spjd	vp = ZTOV(zp);
2189168404Spjd
2190168404Spjd	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
2191168404Spjd		goto out;
2192168404Spjd	}
2193168404Spjd
2194168962Spjd	if (vp->v_type != VDIR) {
2195168962Spjd		error = ENOTDIR;
2196168962Spjd		goto out;
2197168962Spjd	}
2198168962Spjd
2199168962Spjd	if (vp == cwd) {
2200168962Spjd		error = EINVAL;
2201168962Spjd		goto out;
2202168962Spjd	}
2203168962Spjd
2204185029Spjd	vnevent_rmdir(vp, dvp, name, ct);
2205168962Spjd
2206168404Spjd	/*
2207168404Spjd	 * Grab a lock on the directory to make sure that noone is
2208168404Spjd	 * trying to add (or lookup) entries while we are removing it.
2209168404Spjd	 */
2210168404Spjd	rw_enter(&zp->z_name_lock, RW_WRITER);
2211168404Spjd
2212168404Spjd	/*
2213168404Spjd	 * Grab a lock on the parent pointer to make sure we play well
2214168404Spjd	 * with the treewalk and directory rename code.
2215168404Spjd	 */
2216168404Spjd	rw_enter(&zp->z_parent_lock, RW_WRITER);
2217168404Spjd
2218168404Spjd	tx = dmu_tx_create(zfsvfs->z_os);
2219168404Spjd	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2220219089Spjd	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2221168404Spjd	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2222219089Spjd	zfs_sa_upgrade_txholds(tx, zp);
2223219089Spjd	zfs_sa_upgrade_txholds(tx, dzp);
2224209962Smm	error = dmu_tx_assign(tx, TXG_NOWAIT);
2225168404Spjd	if (error) {
2226168404Spjd		rw_exit(&zp->z_parent_lock);
2227168404Spjd		rw_exit(&zp->z_name_lock);
2228168404Spjd		zfs_dirent_unlock(dl);
2229168962Spjd		VN_RELE(vp);
2230209962Smm		if (error == ERESTART) {
2231168404Spjd			dmu_tx_wait(tx);
2232168404Spjd			dmu_tx_abort(tx);
2233168404Spjd			goto top;
2234168404Spjd		}
2235168404Spjd		dmu_tx_abort(tx);
2236168404Spjd		ZFS_EXIT(zfsvfs);
2237168404Spjd		return (error);
2238168404Spjd	}
2239168404Spjd
2240168404Spjd#ifdef FREEBSD_NAMECACHE
2241168404Spjd	cache_purge(dvp);
2242168404Spjd#endif
2243168404Spjd
2244185029Spjd	error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
2245168404Spjd
2246185029Spjd	if (error == 0) {
2247185029Spjd		uint64_t txtype = TX_RMDIR;
2248185029Spjd		if (flags & FIGNORECASE)
2249185029Spjd			txtype |= TX_CI;
2250219089Spjd		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
2251185029Spjd	}
2252168404Spjd
2253168404Spjd	dmu_tx_commit(tx);
2254168404Spjd
2255168404Spjd	rw_exit(&zp->z_parent_lock);
2256168404Spjd	rw_exit(&zp->z_name_lock);
2257168404Spjd#ifdef FREEBSD_NAMECACHE
2258168404Spjd	cache_purge(vp);
2259168404Spjd#endif
2260168404Spjdout:
2261168404Spjd	zfs_dirent_unlock(dl);
2262168404Spjd
2263168962Spjd	VN_RELE(vp);
2264168962Spjd
2265219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2266219089Spjd		zil_commit(zilog, 0);
2267219089Spjd
2268168404Spjd	ZFS_EXIT(zfsvfs);
2269168404Spjd	return (error);
2270168404Spjd}
2271168404Spjd
2272168404Spjd/*
2273168404Spjd * Read as many directory entries as will fit into the provided
2274168404Spjd * buffer from the given directory cursor position (specified in
2275168404Spjd * the uio structure.
2276168404Spjd *
2277168404Spjd *	IN:	vp	- vnode of directory to read.
2278168404Spjd *		uio	- structure supplying read location, range info,
2279168404Spjd *			  and return buffer.
2280168404Spjd *		cr	- credentials of caller.
2281185029Spjd *		ct	- caller context
2282185029Spjd *		flags	- case flags
2283168404Spjd *
2284168404Spjd *	OUT:	uio	- updated offset and range, buffer filled.
2285168404Spjd *		eofp	- set to true if end-of-file detected.
2286168404Spjd *
2287168404Spjd *	RETURN:	0 if success
2288168404Spjd *		error code if failure
2289168404Spjd *
2290168404Spjd * Timestamps:
2291168404Spjd *	vp - atime updated
2292168404Spjd *
2293168404Spjd * Note that the low 4 bits of the cookie returned by zap is always zero.
2294168404Spjd * This allows us to use the low range for "special" directory entries:
2295168404Spjd * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
2296168404Spjd * we use the offset 2 for the '.zfs' directory.
2297168404Spjd */
2298168404Spjd/* ARGSUSED */
2299168404Spjdstatic int
2300168962Spjdzfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
2301168404Spjd{
2302168404Spjd	znode_t		*zp = VTOZ(vp);
2303168404Spjd	iovec_t		*iovp;
2304185029Spjd	edirent_t	*eodp;
2305168404Spjd	dirent64_t	*odp;
2306168404Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2307168404Spjd	objset_t	*os;
2308168404Spjd	caddr_t		outbuf;
2309168404Spjd	size_t		bufsize;
2310168404Spjd	zap_cursor_t	zc;
2311168404Spjd	zap_attribute_t	zap;
2312168404Spjd	uint_t		bytes_wanted;
2313168404Spjd	uint64_t	offset; /* must be unsigned; checks for < 1 */
2314219089Spjd	uint64_t	parent;
2315168404Spjd	int		local_eof;
2316168404Spjd	int		outcount;
2317168404Spjd	int		error;
2318168404Spjd	uint8_t		prefetch;
2319185029Spjd	boolean_t	check_sysattrs;
2320168404Spjd	uint8_t		type;
2321168962Spjd	int		ncooks;
2322168962Spjd	u_long		*cooks = NULL;
2323185029Spjd	int		flags = 0;
2324168404Spjd
2325168404Spjd	ZFS_ENTER(zfsvfs);
2326185029Spjd	ZFS_VERIFY_ZP(zp);
2327168404Spjd
2328219089Spjd	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
2329219089Spjd	    &parent, sizeof (parent))) != 0) {
2330219089Spjd		ZFS_EXIT(zfsvfs);
2331219089Spjd		return (error);
2332219089Spjd	}
2333219089Spjd
2334168404Spjd	/*
2335168404Spjd	 * If we are not given an eof variable,
2336168404Spjd	 * use a local one.
2337168404Spjd	 */
2338168404Spjd	if (eofp == NULL)
2339168404Spjd		eofp = &local_eof;
2340168404Spjd
2341168404Spjd	/*
2342168404Spjd	 * Check for valid iov_len.
2343168404Spjd	 */
2344168404Spjd	if (uio->uio_iov->iov_len <= 0) {
2345168404Spjd		ZFS_EXIT(zfsvfs);
2346168404Spjd		return (EINVAL);
2347168404Spjd	}
2348168404Spjd
2349168404Spjd	/*
2350168404Spjd	 * Quit if directory has been removed (posix)
2351168404Spjd	 */
2352168404Spjd	if ((*eofp = zp->z_unlinked) != 0) {
2353168404Spjd		ZFS_EXIT(zfsvfs);
2354168404Spjd		return (0);
2355168404Spjd	}
2356168404Spjd
2357168404Spjd	error = 0;
2358168404Spjd	os = zfsvfs->z_os;
2359168404Spjd	offset = uio->uio_loffset;
2360168404Spjd	prefetch = zp->z_zn_prefetch;
2361168404Spjd
2362168404Spjd	/*
2363168404Spjd	 * Initialize the iterator cursor.
2364168404Spjd	 */
2365168404Spjd	if (offset <= 3) {
2366168404Spjd		/*
2367168404Spjd		 * Start iteration from the beginning of the directory.
2368168404Spjd		 */
2369168404Spjd		zap_cursor_init(&zc, os, zp->z_id);
2370168404Spjd	} else {
2371168404Spjd		/*
2372168404Spjd		 * The offset is a serialized cursor.
2373168404Spjd		 */
2374168404Spjd		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
2375168404Spjd	}
2376168404Spjd
2377168404Spjd	/*
2378168404Spjd	 * Get space to change directory entries into fs independent format.
2379168404Spjd	 */
2380168404Spjd	iovp = uio->uio_iov;
2381168404Spjd	bytes_wanted = iovp->iov_len;
2382168404Spjd	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
2383168404Spjd		bufsize = bytes_wanted;
2384168404Spjd		outbuf = kmem_alloc(bufsize, KM_SLEEP);
2385168404Spjd		odp = (struct dirent64 *)outbuf;
2386168404Spjd	} else {
2387168404Spjd		bufsize = bytes_wanted;
2388168404Spjd		odp = (struct dirent64 *)iovp->iov_base;
2389168404Spjd	}
2390185029Spjd	eodp = (struct edirent *)odp;
2391168404Spjd
2392169170Spjd	if (ncookies != NULL) {
2393168404Spjd		/*
2394168404Spjd		 * Minimum entry size is dirent size and 1 byte for a file name.
2395168404Spjd		 */
2396168962Spjd		ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
2397219404Spjd		cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
2398219404Spjd		*cookies = cooks;
2399168962Spjd		*ncookies = ncooks;
2400168404Spjd	}
2401185029Spjd	/*
2402185029Spjd	 * If this VFS supports the system attribute view interface; and
2403185029Spjd	 * we're looking at an extended attribute directory; and we care
2404185029Spjd	 * about normalization conflicts on this vfs; then we must check
2405185029Spjd	 * for normalization conflicts with the sysattr name space.
2406185029Spjd	 */
2407185029Spjd#ifdef TODO
2408185029Spjd	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
2409185029Spjd	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
2410185029Spjd	    (flags & V_RDDIR_ENTFLAGS);
2411185029Spjd#else
2412185029Spjd	check_sysattrs = 0;
2413185029Spjd#endif
2414168404Spjd
2415168404Spjd	/*
2416168404Spjd	 * Transform to file-system independent format
2417168404Spjd	 */
2418168404Spjd	outcount = 0;
2419168404Spjd	while (outcount < bytes_wanted) {
2420168404Spjd		ino64_t objnum;
2421168404Spjd		ushort_t reclen;
2422219089Spjd		off64_t *next = NULL;
2423168404Spjd
2424168404Spjd		/*
2425168404Spjd		 * Special case `.', `..', and `.zfs'.
2426168404Spjd		 */
2427168404Spjd		if (offset == 0) {
2428168404Spjd			(void) strcpy(zap.za_name, ".");
2429185029Spjd			zap.za_normalization_conflict = 0;
2430168404Spjd			objnum = zp->z_id;
2431169108Spjd			type = DT_DIR;
2432168404Spjd		} else if (offset == 1) {
2433168404Spjd			(void) strcpy(zap.za_name, "..");
2434185029Spjd			zap.za_normalization_conflict = 0;
2435219089Spjd			objnum = parent;
2436169108Spjd			type = DT_DIR;
2437168404Spjd		} else if (offset == 2 && zfs_show_ctldir(zp)) {
2438168404Spjd			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
2439185029Spjd			zap.za_normalization_conflict = 0;
2440168404Spjd			objnum = ZFSCTL_INO_ROOT;
2441169108Spjd			type = DT_DIR;
2442168404Spjd		} else {
2443168404Spjd			/*
2444168404Spjd			 * Grab next entry.
2445168404Spjd			 */
2446168404Spjd			if (error = zap_cursor_retrieve(&zc, &zap)) {
2447168404Spjd				if ((*eofp = (error == ENOENT)) != 0)
2448168404Spjd					break;
2449168404Spjd				else
2450168404Spjd					goto update;
2451168404Spjd			}
2452168404Spjd
2453168404Spjd			if (zap.za_integer_length != 8 ||
2454168404Spjd			    zap.za_num_integers != 1) {
2455168404Spjd				cmn_err(CE_WARN, "zap_readdir: bad directory "
2456168404Spjd				    "entry, obj = %lld, offset = %lld\n",
2457168404Spjd				    (u_longlong_t)zp->z_id,
2458168404Spjd				    (u_longlong_t)offset);
2459168404Spjd				error = ENXIO;
2460168404Spjd				goto update;
2461168404Spjd			}
2462168404Spjd
2463168404Spjd			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
2464168404Spjd			/*
2465168404Spjd			 * MacOS X can extract the object type here such as:
2466168404Spjd			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2467168404Spjd			 */
2468168404Spjd			type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2469185029Spjd
2470185029Spjd			if (check_sysattrs && !zap.za_normalization_conflict) {
2471185029Spjd#ifdef TODO
2472185029Spjd				zap.za_normalization_conflict =
2473185029Spjd				    xattr_sysattr_casechk(zap.za_name);
2474185029Spjd#else
2475185029Spjd				panic("%s:%u: TODO", __func__, __LINE__);
2476185029Spjd#endif
2477185029Spjd			}
2478168404Spjd		}
2479168404Spjd
2480211932Smm		if (flags & V_RDDIR_ACCFILTER) {
2481211932Smm			/*
2482211932Smm			 * If we have no access at all, don't include
2483211932Smm			 * this entry in the returned information
2484211932Smm			 */
2485211932Smm			znode_t	*ezp;
2486211932Smm			if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
2487211932Smm				goto skip_entry;
2488211932Smm			if (!zfs_has_access(ezp, cr)) {
2489211932Smm				VN_RELE(ZTOV(ezp));
2490211932Smm				goto skip_entry;
2491211932Smm			}
2492211932Smm			VN_RELE(ZTOV(ezp));
2493211932Smm		}
2494211932Smm
2495185029Spjd		if (flags & V_RDDIR_ENTFLAGS)
2496185029Spjd			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
2497185029Spjd		else
2498185029Spjd			reclen = DIRENT64_RECLEN(strlen(zap.za_name));
2499185029Spjd
2500168404Spjd		/*
2501168404Spjd		 * Will this entry fit in the buffer?
2502168404Spjd		 */
2503168404Spjd		if (outcount + reclen > bufsize) {
2504168404Spjd			/*
2505168404Spjd			 * Did we manage to fit anything in the buffer?
2506168404Spjd			 */
2507168404Spjd			if (!outcount) {
2508168404Spjd				error = EINVAL;
2509168404Spjd				goto update;
2510168404Spjd			}
2511168404Spjd			break;
2512168404Spjd		}
2513185029Spjd		if (flags & V_RDDIR_ENTFLAGS) {
2514185029Spjd			/*
2515185029Spjd			 * Add extended flag entry:
2516185029Spjd			 */
2517185029Spjd			eodp->ed_ino = objnum;
2518185029Spjd			eodp->ed_reclen = reclen;
2519185029Spjd			/* NOTE: ed_off is the offset for the *next* entry */
2520185029Spjd			next = &(eodp->ed_off);
2521185029Spjd			eodp->ed_eflags = zap.za_normalization_conflict ?
2522185029Spjd			    ED_CASE_CONFLICT : 0;
2523185029Spjd			(void) strncpy(eodp->ed_name, zap.za_name,
2524185029Spjd			    EDIRENT_NAMELEN(reclen));
2525185029Spjd			eodp = (edirent_t *)((intptr_t)eodp + reclen);
2526185029Spjd		} else {
2527185029Spjd			/*
2528185029Spjd			 * Add normal entry:
2529185029Spjd			 */
2530185029Spjd			odp->d_ino = objnum;
2531185029Spjd			odp->d_reclen = reclen;
2532185029Spjd			odp->d_namlen = strlen(zap.za_name);
2533185029Spjd			(void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
2534185029Spjd			odp->d_type = type;
2535185029Spjd			odp = (dirent64_t *)((intptr_t)odp + reclen);
2536185029Spjd		}
2537168404Spjd		outcount += reclen;
2538168404Spjd
2539168404Spjd		ASSERT(outcount <= bufsize);
2540168404Spjd
2541168404Spjd		/* Prefetch znode */
2542168404Spjd		if (prefetch)
2543168404Spjd			dmu_prefetch(os, objnum, 0, 0);
2544168404Spjd
2545211932Smm	skip_entry:
2546168404Spjd		/*
2547168404Spjd		 * Move to the next entry, fill in the previous offset.
2548168404Spjd		 */
2549168404Spjd		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2550168404Spjd			zap_cursor_advance(&zc);
2551168404Spjd			offset = zap_cursor_serialize(&zc);
2552168404Spjd		} else {
2553168404Spjd			offset += 1;
2554168404Spjd		}
2555219404Spjd
2556219404Spjd		if (cooks != NULL) {
2557219404Spjd			*cooks++ = offset;
2558219404Spjd			ncooks--;
2559219404Spjd			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
2560219404Spjd		}
2561168404Spjd	}
2562168404Spjd	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2563168404Spjd
2564168404Spjd	/* Subtract unused cookies */
2565168962Spjd	if (ncookies != NULL)
2566168962Spjd		*ncookies -= ncooks;
2567168404Spjd
2568168404Spjd	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
2569168404Spjd		iovp->iov_base += outcount;
2570168404Spjd		iovp->iov_len -= outcount;
2571168404Spjd		uio->uio_resid -= outcount;
2572168404Spjd	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
2573168404Spjd		/*
2574168404Spjd		 * Reset the pointer.
2575168404Spjd		 */
2576168404Spjd		offset = uio->uio_loffset;
2577168404Spjd	}
2578168404Spjd
2579168404Spjdupdate:
2580168404Spjd	zap_cursor_fini(&zc);
2581168404Spjd	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
2582168404Spjd		kmem_free(outbuf, bufsize);
2583168404Spjd
2584168404Spjd	if (error == ENOENT)
2585168404Spjd		error = 0;
2586168404Spjd
2587168404Spjd	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2588168404Spjd
2589168404Spjd	uio->uio_loffset = offset;
2590168404Spjd	ZFS_EXIT(zfsvfs);
2591169107Spjd	if (error != 0 && cookies != NULL) {
2592168962Spjd		free(*cookies, M_TEMP);
2593168962Spjd		*cookies = NULL;
2594168962Spjd		*ncookies = 0;
2595168404Spjd	}
2596168404Spjd	return (error);
2597168404Spjd}
2598168404Spjd
2599185029Spjdulong_t zfs_fsync_sync_cnt = 4;
2600185029Spjd
2601168404Spjdstatic int
2602185029Spjdzfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
2603168404Spjd{
2604168962Spjd	znode_t	*zp = VTOZ(vp);
2605168962Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2606168404Spjd
2607185029Spjd	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2608185029Spjd
2609219089Spjd	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
2610219089Spjd		ZFS_ENTER(zfsvfs);
2611219089Spjd		ZFS_VERIFY_ZP(zp);
2612219089Spjd		zil_commit(zfsvfs->z_log, zp->z_id);
2613219089Spjd		ZFS_EXIT(zfsvfs);
2614219089Spjd	}
2615168404Spjd	return (0);
2616168404Spjd}
2617168404Spjd
2618185029Spjd
2619168404Spjd/*
2620168404Spjd * Get the requested file attributes and place them in the provided
2621168404Spjd * vattr structure.
2622168404Spjd *
2623168404Spjd *	IN:	vp	- vnode of file.
2624168404Spjd *		vap	- va_mask identifies requested attributes.
2625185029Spjd *			  If AT_XVATTR set, then optional attrs are requested
2626185029Spjd *		flags	- ATTR_NOACLCHECK (CIFS server context)
2627168404Spjd *		cr	- credentials of caller.
2628185029Spjd *		ct	- caller context
2629168404Spjd *
2630168404Spjd *	OUT:	vap	- attribute values.
2631168404Spjd *
2632168404Spjd *	RETURN:	0 (always succeeds)
2633168404Spjd */
2634168404Spjd/* ARGSUSED */
2635168404Spjdstatic int
2636185029Spjdzfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2637185029Spjd    caller_context_t *ct)
2638168404Spjd{
2639168962Spjd	znode_t *zp = VTOZ(vp);
2640168962Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2641185029Spjd	int	error = 0;
2642168962Spjd	uint32_t blksize;
2643168962Spjd	u_longlong_t nblocks;
2644185029Spjd	uint64_t links;
2645224251Sdelphij	uint64_t mtime[2], ctime[2], crtime[2], rdev;
2646185029Spjd	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
2647185029Spjd	xoptattr_t *xoap = NULL;
2648185029Spjd	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2649224251Sdelphij	sa_bulk_attr_t bulk[4];
2650219089Spjd	int count = 0;
2651168404Spjd
2652168404Spjd	ZFS_ENTER(zfsvfs);
2653185029Spjd	ZFS_VERIFY_ZP(zp);
2654168404Spjd
2655219089Spjd	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
2656219089Spjd
2657219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
2658219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
2659219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &crtime, 16);
2660224251Sdelphij	if (vp->v_type == VBLK || vp->v_type == VCHR)
2661224251Sdelphij		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
2662224251Sdelphij		    &rdev, 8);
2663219089Spjd
2664219089Spjd	if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
2665219089Spjd		ZFS_EXIT(zfsvfs);
2666219089Spjd		return (error);
2667219089Spjd	}
2668219089Spjd
2669168404Spjd	/*
2670185029Spjd	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
2671185029Spjd	 * Also, if we are the owner don't bother, since owner should
2672185029Spjd	 * always be allowed to read basic attributes of file.
2673185029Spjd	 */
2674219089Spjd	if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
2675219089Spjd	    (vap->va_uid != crgetuid(cr))) {
2676185029Spjd		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
2677185029Spjd		    skipaclchk, cr)) {
2678185029Spjd			ZFS_EXIT(zfsvfs);
2679185029Spjd			return (error);
2680185029Spjd		}
2681185029Spjd	}
2682185029Spjd
2683185029Spjd	/*
2684168404Spjd	 * Return all attributes.  It's cheaper to provide the answer
2685168404Spjd	 * than to determine whether we were asked the question.
2686168404Spjd	 */
2687168404Spjd
2688209097Smm	mutex_enter(&zp->z_lock);
2689219089Spjd	vap->va_type = IFTOVT(zp->z_mode);
2690219089Spjd	vap->va_mode = zp->z_mode & ~S_IFMT;
2691185029Spjd//	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
2692168404Spjd	vap->va_nodeid = zp->z_id;
2693185029Spjd	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
2694219089Spjd		links = zp->z_links + 1;
2695185029Spjd	else
2696219089Spjd		links = zp->z_links;
2697185029Spjd	vap->va_nlink = MIN(links, UINT32_MAX);	/* nlink_t limit! */
2698219089Spjd	vap->va_size = zp->z_size;
2699168404Spjd	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
2700224251Sdelphij	if (vp->v_type == VBLK || vp->v_type == VCHR)
2701224251Sdelphij		vap->va_rdev = zfs_cmpldev(rdev);
2702168404Spjd	vap->va_seq = zp->z_seq;
2703168404Spjd	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */
2704168404Spjd
2705185029Spjd	/*
2706185029Spjd	 * Add in any requested optional attributes and the create time.
2707185029Spjd	 * Also set the corresponding bits in the returned attribute bitmap.
2708185029Spjd	 */
2709185029Spjd	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
2710185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
2711185029Spjd			xoap->xoa_archive =
2712219089Spjd			    ((zp->z_pflags & ZFS_ARCHIVE) != 0);
2713185029Spjd			XVA_SET_RTN(xvap, XAT_ARCHIVE);
2714185029Spjd		}
2715185029Spjd
2716185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
2717185029Spjd			xoap->xoa_readonly =
2718219089Spjd			    ((zp->z_pflags & ZFS_READONLY) != 0);
2719185029Spjd			XVA_SET_RTN(xvap, XAT_READONLY);
2720185029Spjd		}
2721185029Spjd
2722185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
2723185029Spjd			xoap->xoa_system =
2724219089Spjd			    ((zp->z_pflags & ZFS_SYSTEM) != 0);
2725185029Spjd			XVA_SET_RTN(xvap, XAT_SYSTEM);
2726185029Spjd		}
2727185029Spjd
2728185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
2729185029Spjd			xoap->xoa_hidden =
2730219089Spjd			    ((zp->z_pflags & ZFS_HIDDEN) != 0);
2731185029Spjd			XVA_SET_RTN(xvap, XAT_HIDDEN);
2732185029Spjd		}
2733185029Spjd
2734185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2735185029Spjd			xoap->xoa_nounlink =
2736219089Spjd			    ((zp->z_pflags & ZFS_NOUNLINK) != 0);
2737185029Spjd			XVA_SET_RTN(xvap, XAT_NOUNLINK);
2738185029Spjd		}
2739185029Spjd
2740185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2741185029Spjd			xoap->xoa_immutable =
2742219089Spjd			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
2743185029Spjd			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
2744185029Spjd		}
2745185029Spjd
2746185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2747185029Spjd			xoap->xoa_appendonly =
2748219089Spjd			    ((zp->z_pflags & ZFS_APPENDONLY) != 0);
2749185029Spjd			XVA_SET_RTN(xvap, XAT_APPENDONLY);
2750185029Spjd		}
2751185029Spjd
2752185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2753185029Spjd			xoap->xoa_nodump =
2754219089Spjd			    ((zp->z_pflags & ZFS_NODUMP) != 0);
2755185029Spjd			XVA_SET_RTN(xvap, XAT_NODUMP);
2756185029Spjd		}
2757185029Spjd
2758185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
2759185029Spjd			xoap->xoa_opaque =
2760219089Spjd			    ((zp->z_pflags & ZFS_OPAQUE) != 0);
2761185029Spjd			XVA_SET_RTN(xvap, XAT_OPAQUE);
2762185029Spjd		}
2763185029Spjd
2764185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2765185029Spjd			xoap->xoa_av_quarantined =
2766219089Spjd			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
2767185029Spjd			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
2768185029Spjd		}
2769185029Spjd
2770185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2771185029Spjd			xoap->xoa_av_modified =
2772219089Spjd			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
2773185029Spjd			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
2774185029Spjd		}
2775185029Spjd
2776185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
2777219089Spjd		    vp->v_type == VREG) {
2778219089Spjd			zfs_sa_get_scanstamp(zp, xvap);
2779185029Spjd		}
2780185029Spjd
2781185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
2782219089Spjd			uint64_t times[2];
2783219089Spjd
2784219089Spjd			(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
2785219089Spjd			    times, sizeof (times));
2786219089Spjd			ZFS_TIME_DECODE(&xoap->xoa_createtime, times);
2787185029Spjd			XVA_SET_RTN(xvap, XAT_CREATETIME);
2788185029Spjd		}
2789219089Spjd
2790219089Spjd		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2791219089Spjd			xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
2792219089Spjd			XVA_SET_RTN(xvap, XAT_REPARSE);
2793219089Spjd		}
2794219089Spjd		if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
2795219089Spjd			xoap->xoa_generation = zp->z_gen;
2796219089Spjd			XVA_SET_RTN(xvap, XAT_GEN);
2797219089Spjd		}
2798219089Spjd
2799219089Spjd		if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
2800219089Spjd			xoap->xoa_offline =
2801219089Spjd			    ((zp->z_pflags & ZFS_OFFLINE) != 0);
2802219089Spjd			XVA_SET_RTN(xvap, XAT_OFFLINE);
2803219089Spjd		}
2804219089Spjd
2805219089Spjd		if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
2806219089Spjd			xoap->xoa_sparse =
2807219089Spjd			    ((zp->z_pflags & ZFS_SPARSE) != 0);
2808219089Spjd			XVA_SET_RTN(xvap, XAT_SPARSE);
2809219089Spjd		}
2810185029Spjd	}
2811185029Spjd
2812219089Spjd	ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
2813219089Spjd	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
2814219089Spjd	ZFS_TIME_DECODE(&vap->va_ctime, ctime);
2815219089Spjd	ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
2816168404Spjd
2817168404Spjd	mutex_exit(&zp->z_lock);
2818168404Spjd
2819219089Spjd	sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
2820168404Spjd	vap->va_blksize = blksize;
2821168404Spjd	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */
2822168404Spjd
2823168404Spjd	if (zp->z_blksz == 0) {
2824168404Spjd		/*
2825168404Spjd		 * Block size hasn't been set; suggest maximal I/O transfers.
2826168404Spjd		 */
2827168404Spjd		vap->va_blksize = zfsvfs->z_max_blksz;
2828168404Spjd	}
2829168404Spjd
2830168404Spjd	ZFS_EXIT(zfsvfs);
2831168404Spjd	return (0);
2832168404Spjd}
2833168404Spjd
2834168404Spjd/*
2835168404Spjd * Set the file attributes to the values contained in the
2836168404Spjd * vattr structure.
2837168404Spjd *
2838168404Spjd *	IN:	vp	- vnode of file to be modified.
2839168404Spjd *		vap	- new attribute values.
2840185029Spjd *			  If AT_XVATTR set, then optional attrs are being set
2841168404Spjd *		flags	- ATTR_UTIME set if non-default time values provided.
2842185029Spjd *			- ATTR_NOACLCHECK (CIFS context only).
2843168404Spjd *		cr	- credentials of caller.
2844185029Spjd *		ct	- caller context
2845168404Spjd *
2846168404Spjd *	RETURN:	0 if success
2847168404Spjd *		error code if failure
2848168404Spjd *
2849168404Spjd * Timestamps:
2850168404Spjd *	vp - ctime updated, mtime updated if size changed.
2851168404Spjd */
2852168404Spjd/* ARGSUSED */
2853168404Spjdstatic int
2854168962Spjdzfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2855168962Spjd	caller_context_t *ct)
2856168404Spjd{
2857185029Spjd	znode_t		*zp = VTOZ(vp);
2858168404Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2859185029Spjd	zilog_t		*zilog;
2860168404Spjd	dmu_tx_t	*tx;
2861168404Spjd	vattr_t		oldva;
2862209962Smm	xvattr_t	tmpxvattr;
2863168962Spjd	uint_t		mask = vap->va_mask;
2864168404Spjd	uint_t		saved_mask;
2865197831Spjd	uint64_t	saved_mode;
2866168404Spjd	int		trim_mask = 0;
2867168404Spjd	uint64_t	new_mode;
2868209962Smm	uint64_t	new_uid, new_gid;
2869219089Spjd	uint64_t	xattr_obj;
2870219089Spjd	uint64_t	mtime[2], ctime[2];
2871168404Spjd	znode_t		*attrzp;
2872168404Spjd	int		need_policy = FALSE;
2873219089Spjd	int		err, err2;
2874185029Spjd	zfs_fuid_info_t *fuidp = NULL;
2875185029Spjd	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
2876185029Spjd	xoptattr_t	*xoap;
2877219089Spjd	zfs_acl_t	*aclp;
2878185029Spjd	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2879219089Spjd	boolean_t	fuid_dirtied = B_FALSE;
2880219089Spjd	sa_bulk_attr_t	bulk[7], xattr_bulk[7];
2881219089Spjd	int		count = 0, xattr_count = 0;
2882168404Spjd
2883168404Spjd	if (mask == 0)
2884168404Spjd		return (0);
2885168404Spjd
2886168962Spjd	if (mask & AT_NOSET)
2887168962Spjd		return (EINVAL);
2888168962Spjd
2889185029Spjd	ZFS_ENTER(zfsvfs);
2890185029Spjd	ZFS_VERIFY_ZP(zp);
2891185029Spjd
2892185029Spjd	zilog = zfsvfs->z_log;
2893185029Spjd
2894185029Spjd	/*
2895185029Spjd	 * Make sure that if we have ephemeral uid/gid or xvattr specified
2896185029Spjd	 * that file system is at proper version level
2897185029Spjd	 */
2898185029Spjd
2899185029Spjd	if (zfsvfs->z_use_fuids == B_FALSE &&
2900185029Spjd	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2901185029Spjd	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2902185029Spjd	    (mask & AT_XVATTR))) {
2903185029Spjd		ZFS_EXIT(zfsvfs);
2904185029Spjd		return (EINVAL);
2905185029Spjd	}
2906185029Spjd
2907185029Spjd	if (mask & AT_SIZE && vp->v_type == VDIR) {
2908185029Spjd		ZFS_EXIT(zfsvfs);
2909168404Spjd		return (EISDIR);
2910185029Spjd	}
2911168404Spjd
2912185029Spjd	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
2913185029Spjd		ZFS_EXIT(zfsvfs);
2914168404Spjd		return (EINVAL);
2915185029Spjd	}
2916168404Spjd
2917185029Spjd	/*
2918185029Spjd	 * If this is an xvattr_t, then get a pointer to the structure of
2919185029Spjd	 * optional attributes.  If this is NULL, then we have a vattr_t.
2920185029Spjd	 */
2921185029Spjd	xoap = xva_getxoptattr(xvap);
2922168404Spjd
2923209962Smm	xva_init(&tmpxvattr);
2924209962Smm
2925185029Spjd	/*
2926185029Spjd	 * Immutable files can only alter immutable bit and atime
2927185029Spjd	 */
2928219089Spjd	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
2929185029Spjd	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
2930185029Spjd	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2931185029Spjd		ZFS_EXIT(zfsvfs);
2932185029Spjd		return (EPERM);
2933185029Spjd	}
2934185029Spjd
2935219089Spjd	if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
2936185029Spjd		ZFS_EXIT(zfsvfs);
2937185029Spjd		return (EPERM);
2938185029Spjd	}
2939185029Spjd
2940185029Spjd	/*
2941185029Spjd	 * Verify timestamps doesn't overflow 32 bits.
2942185029Spjd	 * ZFS can handle large timestamps, but 32bit syscalls can't
2943185029Spjd	 * handle times greater than 2039.  This check should be removed
2944185029Spjd	 * once large timestamps are fully supported.
2945185029Spjd	 */
2946185029Spjd	if (mask & (AT_ATIME | AT_MTIME)) {
2947185029Spjd		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2948185029Spjd		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2949185029Spjd			ZFS_EXIT(zfsvfs);
2950185029Spjd			return (EOVERFLOW);
2951185029Spjd		}
2952185029Spjd	}
2953185029Spjd
2954168404Spjdtop:
2955168404Spjd	attrzp = NULL;
2956219089Spjd	aclp = NULL;
2957168404Spjd
2958211932Smm	/* Can this be moved to before the top label? */
2959168404Spjd	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
2960168404Spjd		ZFS_EXIT(zfsvfs);
2961168404Spjd		return (EROFS);
2962168404Spjd	}
2963168404Spjd
2964168404Spjd	/*
2965168404Spjd	 * First validate permissions
2966168404Spjd	 */
2967168404Spjd
2968168404Spjd	if (mask & AT_SIZE) {
2969168404Spjd		/*
2970168404Spjd		 * XXX - Note, we are not providing any open
2971168404Spjd		 * mode flags here (like FNDELAY), so we may
2972168404Spjd		 * block if there are locks present... this
2973168404Spjd		 * should be addressed in openat().
2974168404Spjd		 */
2975185029Spjd		/* XXX - would it be OK to generate a log record here? */
2976185029Spjd		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2977168404Spjd		if (err) {
2978168404Spjd			ZFS_EXIT(zfsvfs);
2979168404Spjd			return (err);
2980168404Spjd		}
2981168404Spjd	}
2982168404Spjd
2983185029Spjd	if (mask & (AT_ATIME|AT_MTIME) ||
2984185029Spjd	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2985185029Spjd	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2986185029Spjd	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2987219089Spjd	    XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
2988219089Spjd	    XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
2989185029Spjd	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2990219089Spjd	    XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
2991185029Spjd		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
2992185029Spjd		    skipaclchk, cr);
2993219089Spjd	}
2994168404Spjd
2995168404Spjd	if (mask & (AT_UID|AT_GID)) {
2996168404Spjd		int	idmask = (mask & (AT_UID|AT_GID));
2997168404Spjd		int	take_owner;
2998168404Spjd		int	take_group;
2999168404Spjd
3000168404Spjd		/*
3001168404Spjd		 * NOTE: even if a new mode is being set,
3002168404Spjd		 * we may clear S_ISUID/S_ISGID bits.
3003168404Spjd		 */
3004168404Spjd
3005168404Spjd		if (!(mask & AT_MODE))
3006219089Spjd			vap->va_mode = zp->z_mode;
3007168404Spjd
3008168404Spjd		/*
3009168404Spjd		 * Take ownership or chgrp to group we are a member of
3010168404Spjd		 */
3011168404Spjd
3012168404Spjd		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
3013185029Spjd		take_group = (mask & AT_GID) &&
3014185029Spjd		    zfs_groupmember(zfsvfs, vap->va_gid, cr);
3015168404Spjd
3016168404Spjd		/*
3017168404Spjd		 * If both AT_UID and AT_GID are set then take_owner and
3018168404Spjd		 * take_group must both be set in order to allow taking
3019168404Spjd		 * ownership.
3020168404Spjd		 *
3021168404Spjd		 * Otherwise, send the check through secpolicy_vnode_setattr()
3022168404Spjd		 *
3023168404Spjd		 */
3024168404Spjd
3025168404Spjd		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
3026168404Spjd		    ((idmask == AT_UID) && take_owner) ||
3027168404Spjd		    ((idmask == AT_GID) && take_group)) {
3028185029Spjd			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
3029185029Spjd			    skipaclchk, cr) == 0) {
3030168404Spjd				/*
3031168404Spjd				 * Remove setuid/setgid for non-privileged users
3032168404Spjd				 */
3033185029Spjd				secpolicy_setid_clear(vap, vp, cr);
3034168404Spjd				trim_mask = (mask & (AT_UID|AT_GID));
3035168404Spjd			} else {
3036168404Spjd				need_policy =  TRUE;
3037168404Spjd			}
3038168404Spjd		} else {
3039168404Spjd			need_policy =  TRUE;
3040168404Spjd		}
3041168404Spjd	}
3042168404Spjd
3043168404Spjd	mutex_enter(&zp->z_lock);
3044219089Spjd	oldva.va_mode = zp->z_mode;
3045185029Spjd	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
3046185029Spjd	if (mask & AT_XVATTR) {
3047209962Smm		/*
3048209962Smm		 * Update xvattr mask to include only those attributes
3049209962Smm		 * that are actually changing.
3050209962Smm		 *
3051209962Smm		 * the bits will be restored prior to actually setting
3052209962Smm		 * the attributes so the caller thinks they were set.
3053209962Smm		 */
3054209962Smm		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
3055209962Smm			if (xoap->xoa_appendonly !=
3056219089Spjd			    ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
3057209962Smm				need_policy = TRUE;
3058209962Smm			} else {
3059209962Smm				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
3060209962Smm				XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
3061209962Smm			}
3062209962Smm		}
3063209962Smm
3064209962Smm		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
3065209962Smm			if (xoap->xoa_nounlink !=
3066219089Spjd			    ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
3067209962Smm				need_policy = TRUE;
3068209962Smm			} else {
3069209962Smm				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
3070209962Smm				XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
3071209962Smm			}
3072209962Smm		}
3073209962Smm
3074209962Smm		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
3075209962Smm			if (xoap->xoa_immutable !=
3076219089Spjd			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
3077209962Smm				need_policy = TRUE;
3078209962Smm			} else {
3079209962Smm				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
3080209962Smm				XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
3081209962Smm			}
3082209962Smm		}
3083209962Smm
3084209962Smm		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
3085209962Smm			if (xoap->xoa_nodump !=
3086219089Spjd			    ((zp->z_pflags & ZFS_NODUMP) != 0)) {
3087209962Smm				need_policy = TRUE;
3088209962Smm			} else {
3089209962Smm				XVA_CLR_REQ(xvap, XAT_NODUMP);
3090209962Smm				XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
3091209962Smm			}
3092209962Smm		}
3093209962Smm
3094209962Smm		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
3095209962Smm			if (xoap->xoa_av_modified !=
3096219089Spjd			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
3097209962Smm				need_policy = TRUE;
3098209962Smm			} else {
3099209962Smm				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
3100209962Smm				XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
3101209962Smm			}
3102209962Smm		}
3103209962Smm
3104209962Smm		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
3105209962Smm			if ((vp->v_type != VREG &&
3106209962Smm			    xoap->xoa_av_quarantined) ||
3107209962Smm			    xoap->xoa_av_quarantined !=
3108219089Spjd			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
3109209962Smm				need_policy = TRUE;
3110209962Smm			} else {
3111209962Smm				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
3112209962Smm				XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
3113209962Smm			}
3114209962Smm		}
3115209962Smm
3116219089Spjd		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
3117219089Spjd			mutex_exit(&zp->z_lock);
3118219089Spjd			ZFS_EXIT(zfsvfs);
3119219089Spjd			return (EPERM);
3120219089Spjd		}
3121219089Spjd
3122209962Smm		if (need_policy == FALSE &&
3123209962Smm		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
3124209962Smm		    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
3125185029Spjd			need_policy = TRUE;
3126185029Spjd		}
3127185029Spjd	}
3128185029Spjd
3129168404Spjd	mutex_exit(&zp->z_lock);
3130168404Spjd
3131168404Spjd	if (mask & AT_MODE) {
3132185029Spjd		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
3133168962Spjd			err = secpolicy_setid_setsticky_clear(vp, vap,
3134168962Spjd			    &oldva, cr);
3135168962Spjd			if (err) {
3136168962Spjd				ZFS_EXIT(zfsvfs);
3137168962Spjd				return (err);
3138168962Spjd			}
3139168404Spjd			trim_mask |= AT_MODE;
3140168404Spjd		} else {
3141168404Spjd			need_policy = TRUE;
3142168404Spjd		}
3143168404Spjd	}
3144168404Spjd
3145168404Spjd	if (need_policy) {
3146168404Spjd		/*
3147168404Spjd		 * If trim_mask is set then take ownership
3148168404Spjd		 * has been granted or write_acl is present and user
3149168404Spjd		 * has the ability to modify mode.  In that case remove
3150168404Spjd		 * UID|GID and or MODE from mask so that
3151168404Spjd		 * secpolicy_vnode_setattr() doesn't revoke it.
3152168404Spjd		 */
3153168404Spjd
3154168404Spjd		if (trim_mask) {
3155168404Spjd			saved_mask = vap->va_mask;
3156168404Spjd			vap->va_mask &= ~trim_mask;
3157197831Spjd			if (trim_mask & AT_MODE) {
3158197831Spjd				/*
3159197831Spjd				 * Save the mode, as secpolicy_vnode_setattr()
3160197831Spjd				 * will overwrite it with ova.va_mode.
3161197831Spjd				 */
3162197831Spjd				saved_mode = vap->va_mode;
3163197831Spjd			}
3164168404Spjd		}
3165168404Spjd		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
3166185029Spjd		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
3167168404Spjd		if (err) {
3168168404Spjd			ZFS_EXIT(zfsvfs);
3169168404Spjd			return (err);
3170168404Spjd		}
3171168404Spjd
3172197831Spjd		if (trim_mask) {
3173168404Spjd			vap->va_mask |= saved_mask;
3174197831Spjd			if (trim_mask & AT_MODE) {
3175197831Spjd				/*
3176197831Spjd				 * Recover the mode after
3177197831Spjd				 * secpolicy_vnode_setattr().
3178197831Spjd				 */
3179197831Spjd				vap->va_mode = saved_mode;
3180197831Spjd			}
3181197831Spjd		}
3182168404Spjd	}
3183168404Spjd
3184168404Spjd	/*
3185168404Spjd	 * secpolicy_vnode_setattr, or take ownership may have
3186168404Spjd	 * changed va_mask
3187168404Spjd	 */
3188168404Spjd	mask = vap->va_mask;
3189168404Spjd
3190219089Spjd	if ((mask & (AT_UID | AT_GID))) {
3191219089Spjd		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
3192219089Spjd		    &xattr_obj, sizeof (xattr_obj));
3193168404Spjd
3194219089Spjd		if (err == 0 && xattr_obj) {
3195219089Spjd			err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
3196209962Smm			if (err)
3197219089Spjd				goto out2;
3198168404Spjd		}
3199209962Smm		if (mask & AT_UID) {
3200209962Smm			new_uid = zfs_fuid_create(zfsvfs,
3201209962Smm			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
3202219089Spjd			if (new_uid != zp->z_uid &&
3203219089Spjd			    zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
3204219089Spjd				if (attrzp)
3205219089Spjd					VN_RELE(ZTOV(attrzp));
3206209962Smm				err = EDQUOT;
3207219089Spjd				goto out2;
3208209962Smm			}
3209209962Smm		}
3210209962Smm
3211209962Smm		if (mask & AT_GID) {
3212209962Smm			new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
3213209962Smm			    cr, ZFS_GROUP, &fuidp);
3214219089Spjd			if (new_gid != zp->z_gid &&
3215219089Spjd			    zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
3216219089Spjd				if (attrzp)
3217219089Spjd					VN_RELE(ZTOV(attrzp));
3218209962Smm				err = EDQUOT;
3219219089Spjd				goto out2;
3220209962Smm			}
3221209962Smm		}
3222219089Spjd	}
3223219089Spjd	tx = dmu_tx_create(zfsvfs->z_os);
3224219089Spjd
3225219089Spjd	if (mask & AT_MODE) {
3226219089Spjd		uint64_t pmode = zp->z_mode;
3227219089Spjd		uint64_t acl_obj;
3228219089Spjd		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
3229219089Spjd
3230224174Smm		if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
3231224174Smm			goto out;
3232219089Spjd
3233219089Spjd		mutex_enter(&zp->z_lock);
3234219089Spjd		if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
3235219089Spjd			/*
3236219089Spjd			 * Are we upgrading ACL from old V0 format
3237219089Spjd			 * to V1 format?
3238219089Spjd			 */
3239219089Spjd			if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
3240219089Spjd			    zfs_znode_acl_version(zp) ==
3241219089Spjd			    ZFS_ACL_VERSION_INITIAL) {
3242219089Spjd				dmu_tx_hold_free(tx, acl_obj, 0,
3243219089Spjd				    DMU_OBJECT_END);
3244219089Spjd				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3245219089Spjd				    0, aclp->z_acl_bytes);
3246209962Smm			} else {
3247219089Spjd				dmu_tx_hold_write(tx, acl_obj, 0,
3248219089Spjd				    aclp->z_acl_bytes);
3249209962Smm			}
3250219089Spjd		} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3251219089Spjd			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3252219089Spjd			    0, aclp->z_acl_bytes);
3253209962Smm		}
3254219089Spjd		mutex_exit(&zp->z_lock);
3255219089Spjd		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3256219089Spjd	} else {
3257219089Spjd		if ((mask & AT_XVATTR) &&
3258219089Spjd		    XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3259219089Spjd			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3260219089Spjd		else
3261219089Spjd			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3262168404Spjd	}
3263168404Spjd
3264219089Spjd	if (attrzp) {
3265219089Spjd		dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3266219089Spjd	}
3267219089Spjd
3268219089Spjd	fuid_dirtied = zfsvfs->z_fuid_dirty;
3269219089Spjd	if (fuid_dirtied)
3270219089Spjd		zfs_fuid_txhold(zfsvfs, tx);
3271219089Spjd
3272219089Spjd	zfs_sa_upgrade_txholds(tx, zp);
3273219089Spjd
3274209962Smm	err = dmu_tx_assign(tx, TXG_NOWAIT);
3275168404Spjd	if (err) {
3276209962Smm		if (err == ERESTART)
3277168404Spjd			dmu_tx_wait(tx);
3278209962Smm		goto out;
3279168404Spjd	}
3280168404Spjd
3281219089Spjd	count = 0;
3282168404Spjd	/*
3283168404Spjd	 * Set each attribute requested.
3284168404Spjd	 * We group settings according to the locks they need to acquire.
3285168404Spjd	 *
3286168404Spjd	 * Note: you cannot set ctime directly, although it will be
3287168404Spjd	 * updated as a side-effect of calling this function.
3288168404Spjd	 */
3289168404Spjd
3290219089Spjd
3291219089Spjd	if (mask & (AT_UID|AT_GID|AT_MODE))
3292219089Spjd		mutex_enter(&zp->z_acl_lock);
3293168404Spjd	mutex_enter(&zp->z_lock);
3294168404Spjd
3295219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3296219089Spjd	    &zp->z_pflags, sizeof (zp->z_pflags));
3297219089Spjd
3298219089Spjd	if (attrzp) {
3299219089Spjd		if (mask & (AT_UID|AT_GID|AT_MODE))
3300219089Spjd			mutex_enter(&attrzp->z_acl_lock);
3301219089Spjd		mutex_enter(&attrzp->z_lock);
3302219089Spjd		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3303219089Spjd		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
3304219089Spjd		    sizeof (attrzp->z_pflags));
3305219089Spjd	}
3306219089Spjd
3307219089Spjd	if (mask & (AT_UID|AT_GID)) {
3308219089Spjd
3309219089Spjd		if (mask & AT_UID) {
3310219089Spjd			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
3311219089Spjd			    &new_uid, sizeof (new_uid));
3312219089Spjd			zp->z_uid = new_uid;
3313219089Spjd			if (attrzp) {
3314219089Spjd				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3315219089Spjd				    SA_ZPL_UID(zfsvfs), NULL, &new_uid,
3316219089Spjd				    sizeof (new_uid));
3317219089Spjd				attrzp->z_uid = new_uid;
3318219089Spjd			}
3319219089Spjd		}
3320219089Spjd
3321219089Spjd		if (mask & AT_GID) {
3322219089Spjd			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
3323219089Spjd			    NULL, &new_gid, sizeof (new_gid));
3324219089Spjd			zp->z_gid = new_gid;
3325219089Spjd			if (attrzp) {
3326219089Spjd				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3327219089Spjd				    SA_ZPL_GID(zfsvfs), NULL, &new_gid,
3328219089Spjd				    sizeof (new_gid));
3329219089Spjd				attrzp->z_gid = new_gid;
3330219089Spjd			}
3331219089Spjd		}
3332219089Spjd		if (!(mask & AT_MODE)) {
3333219089Spjd			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
3334219089Spjd			    NULL, &new_mode, sizeof (new_mode));
3335219089Spjd			new_mode = zp->z_mode;
3336219089Spjd		}
3337219089Spjd		err = zfs_acl_chown_setattr(zp);
3338219089Spjd		ASSERT(err == 0);
3339219089Spjd		if (attrzp) {
3340219089Spjd			err = zfs_acl_chown_setattr(attrzp);
3341219089Spjd			ASSERT(err == 0);
3342219089Spjd		}
3343219089Spjd	}
3344219089Spjd
3345168404Spjd	if (mask & AT_MODE) {
3346219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
3347219089Spjd		    &new_mode, sizeof (new_mode));
3348219089Spjd		zp->z_mode = new_mode;
3349219089Spjd		ASSERT3U((uintptr_t)aclp, !=, 0);
3350209962Smm		err = zfs_aclset_common(zp, aclp, cr, tx);
3351168404Spjd		ASSERT3U(err, ==, 0);
3352219089Spjd		if (zp->z_acl_cached)
3353219089Spjd			zfs_acl_free(zp->z_acl_cached);
3354211932Smm		zp->z_acl_cached = aclp;
3355211932Smm		aclp = NULL;
3356168404Spjd	}
3357168404Spjd
3358168404Spjd
3359219089Spjd	if (mask & AT_ATIME) {
3360219089Spjd		ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
3361219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
3362219089Spjd		    &zp->z_atime, sizeof (zp->z_atime));
3363168404Spjd	}
3364168404Spjd
3365219089Spjd	if (mask & AT_MTIME) {
3366219089Spjd		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
3367219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
3368219089Spjd		    mtime, sizeof (mtime));
3369168404Spjd	}
3370168404Spjd
3371185029Spjd	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
3372219089Spjd	if (mask & AT_SIZE && !(mask & AT_MTIME)) {
3373219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
3374219089Spjd		    NULL, mtime, sizeof (mtime));
3375219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3376219089Spjd		    &ctime, sizeof (ctime));
3377219089Spjd		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
3378219089Spjd		    B_TRUE);
3379219089Spjd	} else if (mask != 0) {
3380219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3381219089Spjd		    &ctime, sizeof (ctime));
3382219089Spjd		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
3383219089Spjd		    B_TRUE);
3384219089Spjd		if (attrzp) {
3385219089Spjd			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3386219089Spjd			    SA_ZPL_CTIME(zfsvfs), NULL,
3387219089Spjd			    &ctime, sizeof (ctime));
3388219089Spjd			zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
3389219089Spjd			    mtime, ctime, B_TRUE);
3390219089Spjd		}
3391219089Spjd	}
3392185029Spjd	/*
3393185029Spjd	 * Do this after setting timestamps to prevent timestamp
3394185029Spjd	 * update from toggling bit
3395185029Spjd	 */
3396168404Spjd
3397185029Spjd	if (xoap && (mask & AT_XVATTR)) {
3398209962Smm
3399209962Smm		/*
3400209962Smm		 * restore trimmed off masks
3401209962Smm		 * so that return masks can be set for caller.
3402209962Smm		 */
3403209962Smm
3404209962Smm		if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
3405209962Smm			XVA_SET_REQ(xvap, XAT_APPENDONLY);
3406209962Smm		}
3407209962Smm		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
3408209962Smm			XVA_SET_REQ(xvap, XAT_NOUNLINK);
3409209962Smm		}
3410209962Smm		if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
3411209962Smm			XVA_SET_REQ(xvap, XAT_IMMUTABLE);
3412209962Smm		}
3413209962Smm		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
3414209962Smm			XVA_SET_REQ(xvap, XAT_NODUMP);
3415209962Smm		}
3416209962Smm		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
3417209962Smm			XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
3418209962Smm		}
3419209962Smm		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
3420209962Smm			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
3421209962Smm		}
3422209962Smm
3423219089Spjd		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3424185029Spjd			ASSERT(vp->v_type == VREG);
3425185029Spjd
3426219089Spjd		zfs_xvattr_set(zp, xvap, tx);
3427185029Spjd	}
3428185029Spjd
3429209962Smm	if (fuid_dirtied)
3430209962Smm		zfs_fuid_sync(zfsvfs, tx);
3431209962Smm
3432168404Spjd	if (mask != 0)
3433185029Spjd		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3434168404Spjd
3435168404Spjd	mutex_exit(&zp->z_lock);
3436219089Spjd	if (mask & (AT_UID|AT_GID|AT_MODE))
3437219089Spjd		mutex_exit(&zp->z_acl_lock);
3438168404Spjd
3439219089Spjd	if (attrzp) {
3440219089Spjd		if (mask & (AT_UID|AT_GID|AT_MODE))
3441219089Spjd			mutex_exit(&attrzp->z_acl_lock);
3442219089Spjd		mutex_exit(&attrzp->z_lock);
3443219089Spjd	}
3444209962Smmout:
3445219089Spjd	if (err == 0 && attrzp) {
3446219089Spjd		err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
3447219089Spjd		    xattr_count, tx);
3448219089Spjd		ASSERT(err2 == 0);
3449219089Spjd	}
3450219089Spjd
3451168404Spjd	if (attrzp)
3452168404Spjd		VN_RELE(ZTOV(attrzp));
3453211932Smm	if (aclp)
3454209962Smm		zfs_acl_free(aclp);
3455168404Spjd
3456209962Smm	if (fuidp) {
3457209962Smm		zfs_fuid_info_free(fuidp);
3458209962Smm		fuidp = NULL;
3459209962Smm	}
3460209962Smm
3461219089Spjd	if (err) {
3462209962Smm		dmu_tx_abort(tx);
3463219089Spjd		if (err == ERESTART)
3464219089Spjd			goto top;
3465219089Spjd	} else {
3466219089Spjd		err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
3467209962Smm		dmu_tx_commit(tx);
3468219089Spjd	}
3469209962Smm
3470219089Spjdout2:
3471219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3472219089Spjd		zil_commit(zilog, 0);
3473209962Smm
3474168404Spjd	ZFS_EXIT(zfsvfs);
3475168404Spjd	return (err);
3476168404Spjd}
3477168404Spjd
3478168404Spjdtypedef struct zfs_zlock {
3479168404Spjd	krwlock_t	*zl_rwlock;	/* lock we acquired */
3480168404Spjd	znode_t		*zl_znode;	/* znode we held */
3481168404Spjd	struct zfs_zlock *zl_next;	/* next in list */
3482168404Spjd} zfs_zlock_t;
3483168404Spjd
3484168404Spjd/*
3485168404Spjd * Drop locks and release vnodes that were held by zfs_rename_lock().
3486168404Spjd */
3487168404Spjdstatic void
3488168404Spjdzfs_rename_unlock(zfs_zlock_t **zlpp)
3489168404Spjd{
3490168404Spjd	zfs_zlock_t *zl;
3491168404Spjd
3492168404Spjd	while ((zl = *zlpp) != NULL) {
3493168404Spjd		if (zl->zl_znode != NULL)
3494168404Spjd			VN_RELE(ZTOV(zl->zl_znode));
3495168404Spjd		rw_exit(zl->zl_rwlock);
3496168404Spjd		*zlpp = zl->zl_next;
3497168404Spjd		kmem_free(zl, sizeof (*zl));
3498168404Spjd	}
3499168404Spjd}
3500168404Spjd
3501168404Spjd/*
3502168404Spjd * Search back through the directory tree, using the ".." entries.
3503168404Spjd * Lock each directory in the chain to prevent concurrent renames.
3504168404Spjd * Fail any attempt to move a directory into one of its own descendants.
3505168404Spjd * XXX - z_parent_lock can overlap with map or grow locks
3506168404Spjd */
3507168404Spjdstatic int
3508168404Spjdzfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
3509168404Spjd{
3510168404Spjd	zfs_zlock_t	*zl;
3511168404Spjd	znode_t		*zp = tdzp;
3512168404Spjd	uint64_t	rootid = zp->z_zfsvfs->z_root;
3513219089Spjd	uint64_t	oidp = zp->z_id;
3514168404Spjd	krwlock_t	*rwlp = &szp->z_parent_lock;
3515168404Spjd	krw_t		rw = RW_WRITER;
3516168404Spjd
3517168404Spjd	/*
3518168404Spjd	 * First pass write-locks szp and compares to zp->z_id.
3519168404Spjd	 * Later passes read-lock zp and compare to zp->z_parent.
3520168404Spjd	 */
3521168404Spjd	do {
3522168404Spjd		if (!rw_tryenter(rwlp, rw)) {
3523168404Spjd			/*
3524168404Spjd			 * Another thread is renaming in this path.
3525168404Spjd			 * Note that if we are a WRITER, we don't have any
3526168404Spjd			 * parent_locks held yet.
3527168404Spjd			 */
3528168404Spjd			if (rw == RW_READER && zp->z_id > szp->z_id) {
3529168404Spjd				/*
3530168404Spjd				 * Drop our locks and restart
3531168404Spjd				 */
3532168404Spjd				zfs_rename_unlock(&zl);
3533168404Spjd				*zlpp = NULL;
3534168404Spjd				zp = tdzp;
3535219089Spjd				oidp = zp->z_id;
3536168404Spjd				rwlp = &szp->z_parent_lock;
3537168404Spjd				rw = RW_WRITER;
3538168404Spjd				continue;
3539168404Spjd			} else {
3540168404Spjd				/*
3541168404Spjd				 * Wait for other thread to drop its locks
3542168404Spjd				 */
3543168404Spjd				rw_enter(rwlp, rw);
3544168404Spjd			}
3545168404Spjd		}
3546168404Spjd
3547168404Spjd		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
3548168404Spjd		zl->zl_rwlock = rwlp;
3549168404Spjd		zl->zl_znode = NULL;
3550168404Spjd		zl->zl_next = *zlpp;
3551168404Spjd		*zlpp = zl;
3552168404Spjd
3553219089Spjd		if (oidp == szp->z_id)		/* We're a descendant of szp */
3554168404Spjd			return (EINVAL);
3555168404Spjd
3556219089Spjd		if (oidp == rootid)		/* We've hit the top */
3557168404Spjd			return (0);
3558168404Spjd
3559168404Spjd		if (rw == RW_READER) {		/* i.e. not the first pass */
3560219089Spjd			int error = zfs_zget(zp->z_zfsvfs, oidp, &zp);
3561168404Spjd			if (error)
3562168404Spjd				return (error);
3563168404Spjd			zl->zl_znode = zp;
3564168404Spjd		}
3565219089Spjd		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs),
3566219089Spjd		    &oidp, sizeof (oidp));
3567168404Spjd		rwlp = &zp->z_parent_lock;
3568168404Spjd		rw = RW_READER;
3569168404Spjd
3570168404Spjd	} while (zp->z_id != sdzp->z_id);
3571168404Spjd
3572168404Spjd	return (0);
3573168404Spjd}
3574168404Spjd
3575168404Spjd/*
3576168404Spjd * Move an entry from the provided source directory to the target
3577168404Spjd * directory.  Change the entry name as indicated.
3578168404Spjd *
3579168404Spjd *	IN:	sdvp	- Source directory containing the "old entry".
3580168404Spjd *		snm	- Old entry name.
3581168404Spjd *		tdvp	- Target directory to contain the "new entry".
3582168404Spjd *		tnm	- New entry name.
3583168404Spjd *		cr	- credentials of caller.
3584185029Spjd *		ct	- caller context
3585185029Spjd *		flags	- case flags
3586168404Spjd *
3587168404Spjd *	RETURN:	0 if success
3588168404Spjd *		error code if failure
3589168404Spjd *
3590168404Spjd * Timestamps:
3591168404Spjd *	sdvp,tdvp - ctime|mtime updated
3592168404Spjd */
3593185029Spjd/*ARGSUSED*/
3594168404Spjdstatic int
3595185029Spjdzfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
3596185029Spjd    caller_context_t *ct, int flags)
3597168404Spjd{
3598168404Spjd	znode_t		*tdzp, *szp, *tzp;
3599168404Spjd	znode_t		*sdzp = VTOZ(sdvp);
3600168404Spjd	zfsvfs_t	*zfsvfs = sdzp->z_zfsvfs;
3601185029Spjd	zilog_t		*zilog;
3602168962Spjd	vnode_t		*realvp;
3603168404Spjd	zfs_dirlock_t	*sdl, *tdl;
3604168404Spjd	dmu_tx_t	*tx;
3605168404Spjd	zfs_zlock_t	*zl;
3606185029Spjd	int		cmp, serr, terr;
3607185029Spjd	int		error = 0;
3608185029Spjd	int		zflg = 0;
3609168404Spjd
3610168404Spjd	ZFS_ENTER(zfsvfs);
3611185029Spjd	ZFS_VERIFY_ZP(sdzp);
3612185029Spjd	zilog = zfsvfs->z_log;
3613168404Spjd
3614168962Spjd	/*
3615168962Spjd	 * Make sure we have the real vp for the target directory.
3616168962Spjd	 */
3617185029Spjd	if (VOP_REALVP(tdvp, &realvp, ct) == 0)
3618168962Spjd		tdvp = realvp;
3619168962Spjd
3620212694Smm	if (tdvp->v_vfsp != sdvp->v_vfsp || zfsctl_is_node(tdvp)) {
3621168404Spjd		ZFS_EXIT(zfsvfs);
3622168962Spjd		return (EXDEV);
3623168404Spjd	}
3624168404Spjd
3625168404Spjd	tdzp = VTOZ(tdvp);
3626185029Spjd	ZFS_VERIFY_ZP(tdzp);
3627185029Spjd	if (zfsvfs->z_utf8 && u8_validate(tnm,
3628185029Spjd	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3629185029Spjd		ZFS_EXIT(zfsvfs);
3630185029Spjd		return (EILSEQ);
3631185029Spjd	}
3632185029Spjd
3633185029Spjd	if (flags & FIGNORECASE)
3634185029Spjd		zflg |= ZCILOOK;
3635185029Spjd
3636168404Spjdtop:
3637168404Spjd	szp = NULL;
3638168404Spjd	tzp = NULL;
3639168404Spjd	zl = NULL;
3640168404Spjd
3641168404Spjd	/*
3642168404Spjd	 * This is to prevent the creation of links into attribute space
3643168404Spjd	 * by renaming a linked file into/outof an attribute directory.
3644168404Spjd	 * See the comment in zfs_link() for why this is considered bad.
3645168404Spjd	 */
3646219089Spjd	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
3647168962Spjd		ZFS_EXIT(zfsvfs);
3648168962Spjd		return (EINVAL);
3649168404Spjd	}
3650168404Spjd
3651168404Spjd	/*
3652168404Spjd	 * Lock source and target directory entries.  To prevent deadlock,
3653168404Spjd	 * a lock ordering must be defined.  We lock the directory with
3654168404Spjd	 * the smallest object id first, or if it's a tie, the one with
3655168404Spjd	 * the lexically first name.
3656168404Spjd	 */
3657168404Spjd	if (sdzp->z_id < tdzp->z_id) {
3658168962Spjd		cmp = -1;
3659168962Spjd	} else if (sdzp->z_id > tdzp->z_id) {
3660168962Spjd		cmp = 1;
3661168962Spjd	} else {
3662185029Spjd		/*
3663185029Spjd		 * First compare the two name arguments without
3664185029Spjd		 * considering any case folding.
3665185029Spjd		 */
3666185029Spjd		int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
3667185029Spjd
3668185029Spjd		cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
3669185029Spjd		ASSERT(error == 0 || !zfsvfs->z_utf8);
3670168962Spjd		if (cmp == 0) {
3671168962Spjd			/*
3672168962Spjd			 * POSIX: "If the old argument and the new argument
3673168962Spjd			 * both refer to links to the same existing file,
3674168962Spjd			 * the rename() function shall return successfully
3675168962Spjd			 * and perform no other action."
3676168962Spjd			 */
3677168962Spjd			ZFS_EXIT(zfsvfs);
3678168962Spjd			return (0);
3679168962Spjd		}
3680185029Spjd		/*
3681185029Spjd		 * If the file system is case-folding, then we may
3682185029Spjd		 * have some more checking to do.  A case-folding file
3683185029Spjd		 * system is either supporting mixed case sensitivity
3684185029Spjd		 * access or is completely case-insensitive.  Note
3685185029Spjd		 * that the file system is always case preserving.
3686185029Spjd		 *
3687185029Spjd		 * In mixed sensitivity mode case sensitive behavior
3688185029Spjd		 * is the default.  FIGNORECASE must be used to
3689185029Spjd		 * explicitly request case insensitive behavior.
3690185029Spjd		 *
3691185029Spjd		 * If the source and target names provided differ only
3692185029Spjd		 * by case (e.g., a request to rename 'tim' to 'Tim'),
3693185029Spjd		 * we will treat this as a special case in the
3694185029Spjd		 * case-insensitive mode: as long as the source name
3695185029Spjd		 * is an exact match, we will allow this to proceed as
3696185029Spjd		 * a name-change request.
3697185029Spjd		 */
3698185029Spjd		if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
3699185029Spjd		    (zfsvfs->z_case == ZFS_CASE_MIXED &&
3700185029Spjd		    flags & FIGNORECASE)) &&
3701185029Spjd		    u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
3702185029Spjd		    &error) == 0) {
3703185029Spjd			/*
3704185029Spjd			 * case preserving rename request, require exact
3705185029Spjd			 * name matches
3706185029Spjd			 */
3707185029Spjd			zflg |= ZCIEXACT;
3708185029Spjd			zflg &= ~ZCILOOK;
3709185029Spjd		}
3710168962Spjd	}
3711185029Spjd
3712208131Smm	/*
3713208131Smm	 * If the source and destination directories are the same, we should
3714208131Smm	 * grab the z_name_lock of that directory only once.
3715208131Smm	 */
3716208131Smm	if (sdzp == tdzp) {
3717208131Smm		zflg |= ZHAVELOCK;
3718208131Smm		rw_enter(&sdzp->z_name_lock, RW_READER);
3719208131Smm	}
3720208131Smm
3721168962Spjd	if (cmp < 0) {
3722185029Spjd		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
3723185029Spjd		    ZEXISTS | zflg, NULL, NULL);
3724185029Spjd		terr = zfs_dirent_lock(&tdl,
3725185029Spjd		    tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
3726168962Spjd	} else {
3727185029Spjd		terr = zfs_dirent_lock(&tdl,
3728185029Spjd		    tdzp, tnm, &tzp, zflg, NULL, NULL);
3729185029Spjd		serr = zfs_dirent_lock(&sdl,
3730185029Spjd		    sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
3731185029Spjd		    NULL, NULL);
3732168404Spjd	}
3733168404Spjd
3734168962Spjd	if (serr) {
3735168404Spjd		/*
3736168404Spjd		 * Source entry invalid or not there.
3737168404Spjd		 */
3738168962Spjd		if (!terr) {
3739168404Spjd			zfs_dirent_unlock(tdl);
3740168962Spjd			if (tzp)
3741168962Spjd				VN_RELE(ZTOV(tzp));
3742168962Spjd		}
3743208131Smm
3744208131Smm		if (sdzp == tdzp)
3745208131Smm			rw_exit(&sdzp->z_name_lock);
3746208131Smm
3747219089Spjd		/*
3748219089Spjd		 * FreeBSD: In OpenSolaris they only check if rename source is
3749219089Spjd		 * ".." here, because "." is handled in their lookup. This is
3750219089Spjd		 * not the case for FreeBSD, so we check for "." explicitly.
3751219089Spjd		 */
3752168404Spjd		if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0)
3753168404Spjd			serr = EINVAL;
3754168962Spjd		ZFS_EXIT(zfsvfs);
3755168962Spjd		return (serr);
3756168404Spjd	}
3757168404Spjd	if (terr) {
3758168404Spjd		zfs_dirent_unlock(sdl);
3759168962Spjd		VN_RELE(ZTOV(szp));
3760208131Smm
3761208131Smm		if (sdzp == tdzp)
3762208131Smm			rw_exit(&sdzp->z_name_lock);
3763208131Smm
3764168404Spjd		if (strcmp(tnm, "..") == 0)
3765168404Spjd			terr = EINVAL;
3766168962Spjd		ZFS_EXIT(zfsvfs);
3767168962Spjd		return (terr);
3768168404Spjd	}
3769168404Spjd
3770168404Spjd	/*
3771168404Spjd	 * Must have write access at the source to remove the old entry
3772168404Spjd	 * and write access at the target to create the new entry.
3773168404Spjd	 * Note that if target and source are the same, this can be
3774168404Spjd	 * done in a single check.
3775168404Spjd	 */
3776168404Spjd
3777168404Spjd	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
3778168404Spjd		goto out;
3779168404Spjd
3780168962Spjd	if (ZTOV(szp)->v_type == VDIR) {
3781168404Spjd		/*
3782168404Spjd		 * Check to make sure rename is valid.
3783168404Spjd		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
3784168404Spjd		 */
3785168404Spjd		if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
3786168404Spjd			goto out;
3787168404Spjd	}
3788168404Spjd
3789168404Spjd	/*
3790168404Spjd	 * Does target exist?
3791168404Spjd	 */
3792168404Spjd	if (tzp) {
3793168404Spjd		/*
3794168404Spjd		 * Source and target must be the same type.
3795168404Spjd		 */
3796168962Spjd		if (ZTOV(szp)->v_type == VDIR) {
3797168962Spjd			if (ZTOV(tzp)->v_type != VDIR) {
3798168404Spjd				error = ENOTDIR;
3799168404Spjd				goto out;
3800168404Spjd			}
3801168404Spjd		} else {
3802168962Spjd			if (ZTOV(tzp)->v_type == VDIR) {
3803168404Spjd				error = EISDIR;
3804168404Spjd				goto out;
3805168404Spjd			}
3806168404Spjd		}
3807168404Spjd		/*
3808168404Spjd		 * POSIX dictates that when the source and target
3809168404Spjd		 * entries refer to the same file object, rename
3810168404Spjd		 * must do nothing and exit without error.
3811168404Spjd		 */
3812168404Spjd		if (szp->z_id == tzp->z_id) {
3813168404Spjd			error = 0;
3814168404Spjd			goto out;
3815168404Spjd		}
3816168404Spjd	}
3817168404Spjd
3818185029Spjd	vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
3819168962Spjd	if (tzp)
3820185029Spjd		vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);
3821168962Spjd
3822185029Spjd	/*
3823185029Spjd	 * notify the target directory if it is not the same
3824185029Spjd	 * as source directory.
3825185029Spjd	 */
3826185029Spjd	if (tdvp != sdvp) {
3827185029Spjd		vnevent_rename_dest_dir(tdvp, ct);
3828185029Spjd	}
3829185029Spjd
3830168404Spjd	tx = dmu_tx_create(zfsvfs->z_os);
3831219089Spjd	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3832219089Spjd	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
3833168404Spjd	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
3834168404Spjd	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
3835219089Spjd	if (sdzp != tdzp) {
3836219089Spjd		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
3837219089Spjd		zfs_sa_upgrade_txholds(tx, tdzp);
3838219089Spjd	}
3839219089Spjd	if (tzp) {
3840219089Spjd		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
3841219089Spjd		zfs_sa_upgrade_txholds(tx, tzp);
3842219089Spjd	}
3843219089Spjd
3844219089Spjd	zfs_sa_upgrade_txholds(tx, szp);
3845168404Spjd	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3846209962Smm	error = dmu_tx_assign(tx, TXG_NOWAIT);
3847168404Spjd	if (error) {
3848168404Spjd		if (zl != NULL)
3849168404Spjd			zfs_rename_unlock(&zl);
3850168404Spjd		zfs_dirent_unlock(sdl);
3851168404Spjd		zfs_dirent_unlock(tdl);
3852208131Smm
3853208131Smm		if (sdzp == tdzp)
3854208131Smm			rw_exit(&sdzp->z_name_lock);
3855208131Smm
3856168962Spjd		VN_RELE(ZTOV(szp));
3857168962Spjd		if (tzp)
3858168962Spjd			VN_RELE(ZTOV(tzp));
3859209962Smm		if (error == ERESTART) {
3860168404Spjd			dmu_tx_wait(tx);
3861168404Spjd			dmu_tx_abort(tx);
3862168404Spjd			goto top;
3863168404Spjd		}
3864168404Spjd		dmu_tx_abort(tx);
3865168962Spjd		ZFS_EXIT(zfsvfs);
3866168962Spjd		return (error);
3867168404Spjd	}
3868168404Spjd
3869168404Spjd	if (tzp)	/* Attempt to remove the existing target */
3870185029Spjd		error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
3871168404Spjd
3872168404Spjd	if (error == 0) {
3873168404Spjd		error = zfs_link_create(tdl, szp, tx, ZRENAMING);
3874168404Spjd		if (error == 0) {
3875219089Spjd			szp->z_pflags |= ZFS_AV_MODIFIED;
3876185029Spjd
3877219089Spjd			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
3878219089Spjd			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
3879219089Spjd			ASSERT3U(error, ==, 0);
3880219089Spjd
3881168404Spjd			error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
3882219089Spjd			if (error == 0) {
3883219089Spjd				zfs_log_rename(zilog, tx, TX_RENAME |
3884219089Spjd				    (flags & FIGNORECASE ? TX_CI : 0), sdzp,
3885219089Spjd				    sdl->dl_name, tdzp, tdl->dl_name, szp);
3886185029Spjd
3887219089Spjd				/*
3888219089Spjd				 * Update path information for the target vnode
3889219089Spjd				 */
3890219089Spjd				vn_renamepath(tdvp, ZTOV(szp), tnm,
3891219089Spjd				    strlen(tnm));
3892219089Spjd			} else {
3893219089Spjd				/*
3894219089Spjd				 * At this point, we have successfully created
3895219089Spjd				 * the target name, but have failed to remove
3896219089Spjd				 * the source name.  Since the create was done
3897219089Spjd				 * with the ZRENAMING flag, there are
3898219089Spjd				 * complications; for one, the link count is
3899219089Spjd				 * wrong.  The easiest way to deal with this
3900219089Spjd				 * is to remove the newly created target, and
3901219089Spjd				 * return the original error.  This must
3902219089Spjd				 * succeed; fortunately, it is very unlikely to
3903219089Spjd				 * fail, since we just created it.
3904219089Spjd				 */
3905219089Spjd				VERIFY3U(zfs_link_destroy(tdl, szp, tx,
3906219089Spjd				    ZRENAMING, NULL), ==, 0);
3907219089Spjd			}
3908168404Spjd		}
3909168404Spjd#ifdef FREEBSD_NAMECACHE
3910168404Spjd		if (error == 0) {
3911168404Spjd			cache_purge(sdvp);
3912168404Spjd			cache_purge(tdvp);
3913168404Spjd		}
3914168404Spjd#endif
3915168404Spjd	}
3916168404Spjd
3917168404Spjd	dmu_tx_commit(tx);
3918168404Spjdout:
3919168404Spjd	if (zl != NULL)
3920168404Spjd		zfs_rename_unlock(&zl);
3921168404Spjd
3922168404Spjd	zfs_dirent_unlock(sdl);
3923168404Spjd	zfs_dirent_unlock(tdl);
3924168404Spjd
3925208131Smm	if (sdzp == tdzp)
3926208131Smm		rw_exit(&sdzp->z_name_lock);
3927208131Smm
3928219089Spjd
3929168962Spjd	VN_RELE(ZTOV(szp));
3930168404Spjd	if (tzp)
3931168962Spjd		VN_RELE(ZTOV(tzp));
3932168404Spjd
3933219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3934219089Spjd		zil_commit(zilog, 0);
3935219089Spjd
3936168404Spjd	ZFS_EXIT(zfsvfs);
3937168404Spjd
3938168404Spjd	return (error);
3939168404Spjd}
3940168404Spjd
3941168404Spjd/*
3942168404Spjd * Insert the indicated symbolic reference entry into the directory.
3943168404Spjd *
3944168404Spjd *	IN:	dvp	- Directory to contain new symbolic link.
3945168404Spjd *		link	- Name for new symlink entry.
3946168404Spjd *		vap	- Attributes of new entry.
3947168404Spjd *		target	- Target path of new symlink.
3948168404Spjd *		cr	- credentials of caller.
3949185029Spjd *		ct	- caller context
3950185029Spjd *		flags	- case flags
3951168404Spjd *
3952168404Spjd *	RETURN:	0 if success
3953168404Spjd *		error code if failure
3954168404Spjd *
3955168404Spjd * Timestamps:
3956168404Spjd *	dvp - ctime|mtime updated
3957168404Spjd */
3958185029Spjd/*ARGSUSED*/
3959168404Spjdstatic int
3960185029Spjdzfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
3961185029Spjd    cred_t *cr, kthread_t *td)
3962168404Spjd{
3963168404Spjd	znode_t		*zp, *dzp = VTOZ(dvp);
3964168404Spjd	zfs_dirlock_t	*dl;
3965168404Spjd	dmu_tx_t	*tx;
3966168404Spjd	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
3967185029Spjd	zilog_t		*zilog;
3968219089Spjd	uint64_t	len = strlen(link);
3969168404Spjd	int		error;
3970185029Spjd	int		zflg = ZNEW;
3971209962Smm	zfs_acl_ids_t	acl_ids;
3972209962Smm	boolean_t	fuid_dirtied;
3973219089Spjd	uint64_t	txtype = TX_SYMLINK;
3974185029Spjd	int		flags = 0;
3975168404Spjd
3976168962Spjd	ASSERT(vap->va_type == VLNK);
3977168404Spjd
3978168404Spjd	ZFS_ENTER(zfsvfs);
3979185029Spjd	ZFS_VERIFY_ZP(dzp);
3980185029Spjd	zilog = zfsvfs->z_log;
3981185029Spjd
3982185029Spjd	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
3983185029Spjd	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3984185029Spjd		ZFS_EXIT(zfsvfs);
3985185029Spjd		return (EILSEQ);
3986185029Spjd	}
3987185029Spjd	if (flags & FIGNORECASE)
3988185029Spjd		zflg |= ZCILOOK;
3989168404Spjd
3990168404Spjd	if (len > MAXPATHLEN) {
3991168404Spjd		ZFS_EXIT(zfsvfs);
3992168404Spjd		return (ENAMETOOLONG);
3993168404Spjd	}
3994168404Spjd
3995219089Spjd	if ((error = zfs_acl_ids_create(dzp, 0,
3996219089Spjd	    vap, cr, NULL, &acl_ids)) != 0) {
3997219089Spjd		ZFS_EXIT(zfsvfs);
3998219089Spjd		return (error);
3999219089Spjd	}
4000219089Spjdtop:
4001168404Spjd	/*
4002168404Spjd	 * Attempt to lock directory; fail if entry already exists.
4003168404Spjd	 */
4004185029Spjd	error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
4005185029Spjd	if (error) {
4006219089Spjd		zfs_acl_ids_free(&acl_ids);
4007168404Spjd		ZFS_EXIT(zfsvfs);
4008168404Spjd		return (error);
4009168404Spjd	}
4010168404Spjd
4011219089Spjd	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4012219089Spjd		zfs_acl_ids_free(&acl_ids);
4013219089Spjd		zfs_dirent_unlock(dl);
4014219089Spjd		ZFS_EXIT(zfsvfs);
4015219089Spjd		return (error);
4016219089Spjd	}
4017219089Spjd
4018209962Smm	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
4019209962Smm		zfs_acl_ids_free(&acl_ids);
4020209962Smm		zfs_dirent_unlock(dl);
4021209962Smm		ZFS_EXIT(zfsvfs);
4022209962Smm		return (EDQUOT);
4023209962Smm	}
4024168404Spjd	tx = dmu_tx_create(zfsvfs->z_os);
4025209962Smm	fuid_dirtied = zfsvfs->z_fuid_dirty;
4026168404Spjd	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
4027168404Spjd	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4028219089Spjd	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
4029219089Spjd	    ZFS_SA_BASE_ATTR_SIZE + len);
4030219089Spjd	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
4031219089Spjd	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
4032219089Spjd		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
4033219089Spjd		    acl_ids.z_aclp->z_acl_bytes);
4034219089Spjd	}
4035209962Smm	if (fuid_dirtied)
4036209962Smm		zfs_fuid_txhold(zfsvfs, tx);
4037209962Smm	error = dmu_tx_assign(tx, TXG_NOWAIT);
4038168404Spjd	if (error) {
4039168404Spjd		zfs_dirent_unlock(dl);
4040209962Smm		if (error == ERESTART) {
4041168404Spjd			dmu_tx_wait(tx);
4042168404Spjd			dmu_tx_abort(tx);
4043168404Spjd			goto top;
4044168404Spjd		}
4045219089Spjd		zfs_acl_ids_free(&acl_ids);
4046168404Spjd		dmu_tx_abort(tx);
4047168404Spjd		ZFS_EXIT(zfsvfs);
4048168404Spjd		return (error);
4049168404Spjd	}
4050168404Spjd
4051168404Spjd	/*
4052168404Spjd	 * Create a new object for the symlink.
4053219089Spjd	 * for version 4 ZPL datsets the symlink will be an SA attribute
4054168404Spjd	 */
4055219089Spjd	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
4056168404Spjd
4057219089Spjd	if (fuid_dirtied)
4058219089Spjd		zfs_fuid_sync(zfsvfs, tx);
4059209962Smm
4060219089Spjd	mutex_enter(&zp->z_lock);
4061219089Spjd	if (zp->z_is_sa)
4062219089Spjd		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
4063219089Spjd		    link, len, tx);
4064219089Spjd	else
4065219089Spjd		zfs_sa_symlink(zp, link, len, tx);
4066219089Spjd	mutex_exit(&zp->z_lock);
4067168404Spjd
4068219089Spjd	zp->z_size = len;
4069219089Spjd	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
4070219089Spjd	    &zp->z_size, sizeof (zp->z_size), tx);
4071168404Spjd	/*
4072168404Spjd	 * Insert the new object into the directory.
4073168404Spjd	 */
4074168404Spjd	(void) zfs_link_create(dl, zp, tx, ZNEW);
4075168404Spjd
4076219089Spjd	if (flags & FIGNORECASE)
4077219089Spjd		txtype |= TX_CI;
4078219089Spjd	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
4079219089Spjd	*vpp = ZTOV(zp);
4080219089Spjd
4081209962Smm	zfs_acl_ids_free(&acl_ids);
4082209962Smm
4083168404Spjd	dmu_tx_commit(tx);
4084168404Spjd
4085168404Spjd	zfs_dirent_unlock(dl);
4086168404Spjd
4087219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4088219089Spjd		zil_commit(zilog, 0);
4089219089Spjd
4090168404Spjd	ZFS_EXIT(zfsvfs);
4091168404Spjd	return (error);
4092168404Spjd}
4093168404Spjd
4094168404Spjd/*
4095168404Spjd * Return, in the buffer contained in the provided uio structure,
4096168404Spjd * the symbolic path referred to by vp.
4097168404Spjd *
4098168404Spjd *	IN:	vp	- vnode of symbolic link.
4099168404Spjd *		uoip	- structure to contain the link path.
4100168404Spjd *		cr	- credentials of caller.
4101185029Spjd *		ct	- caller context
4102168404Spjd *
4103168404Spjd *	OUT:	uio	- structure to contain the link path.
4104168404Spjd *
4105168404Spjd *	RETURN:	0 if success
4106168404Spjd *		error code if failure
4107168404Spjd *
4108168404Spjd * Timestamps:
4109168404Spjd *	vp - atime updated
4110168404Spjd */
4111168404Spjd/* ARGSUSED */
4112168404Spjdstatic int
4113185029Spjdzfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
4114168404Spjd{
4115168404Spjd	znode_t		*zp = VTOZ(vp);
4116168404Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4117168404Spjd	int		error;
4118168404Spjd
4119168404Spjd	ZFS_ENTER(zfsvfs);
4120185029Spjd	ZFS_VERIFY_ZP(zp);
4121168404Spjd
4122219089Spjd	mutex_enter(&zp->z_lock);
4123219089Spjd	if (zp->z_is_sa)
4124219089Spjd		error = sa_lookup_uio(zp->z_sa_hdl,
4125219089Spjd		    SA_ZPL_SYMLINK(zfsvfs), uio);
4126219089Spjd	else
4127219089Spjd		error = zfs_sa_readlink(zp, uio);
4128219089Spjd	mutex_exit(&zp->z_lock);
4129168404Spjd
4130168404Spjd	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4131219089Spjd
4132168404Spjd	ZFS_EXIT(zfsvfs);
4133168404Spjd	return (error);
4134168404Spjd}
4135168404Spjd
4136168404Spjd/*
4137168404Spjd * Insert a new entry into directory tdvp referencing svp.
4138168404Spjd *
4139168404Spjd *	IN:	tdvp	- Directory to contain new entry.
4140168404Spjd *		svp	- vnode of new entry.
4141168404Spjd *		name	- name of new entry.
4142168404Spjd *		cr	- credentials of caller.
4143185029Spjd *		ct	- caller context
4144168404Spjd *
4145168404Spjd *	RETURN:	0 if success
4146168404Spjd *		error code if failure
4147168404Spjd *
4148168404Spjd * Timestamps:
4149168404Spjd *	tdvp - ctime|mtime updated
4150168404Spjd *	 svp - ctime updated
4151168404Spjd */
4152168404Spjd/* ARGSUSED */
4153168404Spjdstatic int
4154185029Spjdzfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
4155185029Spjd    caller_context_t *ct, int flags)
4156168404Spjd{
4157168404Spjd	znode_t		*dzp = VTOZ(tdvp);
4158168404Spjd	znode_t		*tzp, *szp;
4159168404Spjd	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
4160185029Spjd	zilog_t		*zilog;
4161168404Spjd	zfs_dirlock_t	*dl;
4162168404Spjd	dmu_tx_t	*tx;
4163168962Spjd	vnode_t		*realvp;
4164168404Spjd	int		error;
4165185029Spjd	int		zf = ZNEW;
4166212694Smm	uint64_t	parent;
4167185029Spjd	uid_t		owner;
4168168404Spjd
4169168404Spjd	ASSERT(tdvp->v_type == VDIR);
4170168404Spjd
4171168404Spjd	ZFS_ENTER(zfsvfs);
4172185029Spjd	ZFS_VERIFY_ZP(dzp);
4173185029Spjd	zilog = zfsvfs->z_log;
4174168404Spjd
4175185029Spjd	if (VOP_REALVP(svp, &realvp, ct) == 0)
4176168962Spjd		svp = realvp;
4177168962Spjd
4178212694Smm	/*
4179212694Smm	 * POSIX dictates that we return EPERM here.
4180212694Smm	 * Better choices include ENOTSUP or EISDIR.
4181212694Smm	 */
4182212694Smm	if (svp->v_type == VDIR) {
4183168404Spjd		ZFS_EXIT(zfsvfs);
4184212694Smm		return (EPERM);
4185212694Smm	}
4186212694Smm
4187212694Smm	if (svp->v_vfsp != tdvp->v_vfsp || zfsctl_is_node(svp)) {
4188212694Smm		ZFS_EXIT(zfsvfs);
4189168404Spjd		return (EXDEV);
4190168404Spjd	}
4191212694Smm
4192185029Spjd	szp = VTOZ(svp);
4193185029Spjd	ZFS_VERIFY_ZP(szp);
4194168404Spjd
4195212694Smm	/* Prevent links to .zfs/shares files */
4196212694Smm
4197219089Spjd	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
4198219089Spjd	    &parent, sizeof (uint64_t))) != 0) {
4199212694Smm		ZFS_EXIT(zfsvfs);
4200219089Spjd		return (error);
4201219089Spjd	}
4202219089Spjd	if (parent == zfsvfs->z_shares_dir) {
4203219089Spjd		ZFS_EXIT(zfsvfs);
4204212694Smm		return (EPERM);
4205212694Smm	}
4206212694Smm
4207185029Spjd	if (zfsvfs->z_utf8 && u8_validate(name,
4208185029Spjd	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4209185029Spjd		ZFS_EXIT(zfsvfs);
4210185029Spjd		return (EILSEQ);
4211185029Spjd	}
4212185029Spjd	if (flags & FIGNORECASE)
4213185029Spjd		zf |= ZCILOOK;
4214185029Spjd
4215168404Spjd	/*
4216168404Spjd	 * We do not support links between attributes and non-attributes
4217168404Spjd	 * because of the potential security risk of creating links
4218168404Spjd	 * into "normal" file space in order to circumvent restrictions
4219168404Spjd	 * imposed in attribute space.
4220168404Spjd	 */
4221219089Spjd	if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
4222168404Spjd		ZFS_EXIT(zfsvfs);
4223168404Spjd		return (EINVAL);
4224168404Spjd	}
4225168404Spjd
4226168404Spjd
4227219089Spjd	owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
4228219089Spjd	if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) {
4229168404Spjd		ZFS_EXIT(zfsvfs);
4230168404Spjd		return (EPERM);
4231168404Spjd	}
4232168404Spjd
4233185029Spjd	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4234168404Spjd		ZFS_EXIT(zfsvfs);
4235168404Spjd		return (error);
4236168404Spjd	}
4237168404Spjd
4238212694Smmtop:
4239168404Spjd	/*
4240168404Spjd	 * Attempt to lock directory; fail if entry already exists.
4241168404Spjd	 */
4242185029Spjd	error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
4243185029Spjd	if (error) {
4244168404Spjd		ZFS_EXIT(zfsvfs);
4245168404Spjd		return (error);
4246168404Spjd	}
4247168404Spjd
4248168404Spjd	tx = dmu_tx_create(zfsvfs->z_os);
4249219089Spjd	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
4250168404Spjd	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4251219089Spjd	zfs_sa_upgrade_txholds(tx, szp);
4252219089Spjd	zfs_sa_upgrade_txholds(tx, dzp);
4253209962Smm	error = dmu_tx_assign(tx, TXG_NOWAIT);
4254168404Spjd	if (error) {
4255168404Spjd		zfs_dirent_unlock(dl);
4256209962Smm		if (error == ERESTART) {
4257168404Spjd			dmu_tx_wait(tx);
4258168404Spjd			dmu_tx_abort(tx);
4259168404Spjd			goto top;
4260168404Spjd		}
4261168404Spjd		dmu_tx_abort(tx);
4262168404Spjd		ZFS_EXIT(zfsvfs);
4263168404Spjd		return (error);
4264168404Spjd	}
4265168404Spjd
4266168404Spjd	error = zfs_link_create(dl, szp, tx, 0);
4267168404Spjd
4268185029Spjd	if (error == 0) {
4269185029Spjd		uint64_t txtype = TX_LINK;
4270185029Spjd		if (flags & FIGNORECASE)
4271185029Spjd			txtype |= TX_CI;
4272185029Spjd		zfs_log_link(zilog, tx, txtype, dzp, szp, name);
4273185029Spjd	}
4274168404Spjd
4275168404Spjd	dmu_tx_commit(tx);
4276168404Spjd
4277168404Spjd	zfs_dirent_unlock(dl);
4278168404Spjd
4279185029Spjd	if (error == 0) {
4280185029Spjd		vnevent_link(svp, ct);
4281185029Spjd	}
4282185029Spjd
4283219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4284219089Spjd		zil_commit(zilog, 0);
4285219089Spjd
4286168404Spjd	ZFS_EXIT(zfsvfs);
4287168404Spjd	return (error);
4288168404Spjd}
4289168404Spjd
4290219089Spjd#ifdef sun
4291219089Spjd/*
4292219089Spjd * zfs_null_putapage() is used when the file system has been force
4293219089Spjd * unmounted. It just drops the pages.
4294219089Spjd */
4295219089Spjd/* ARGSUSED */
4296219089Spjdstatic int
4297219089Spjdzfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
4298219089Spjd		size_t *lenp, int flags, cred_t *cr)
4299219089Spjd{
4300219089Spjd	pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR);
4301219089Spjd	return (0);
4302219089Spjd}
4303219089Spjd
4304219089Spjd/*
4305219089Spjd * Push a page out to disk, klustering if possible.
4306219089Spjd *
4307219089Spjd *	IN:	vp	- file to push page to.
4308219089Spjd *		pp	- page to push.
4309219089Spjd *		flags	- additional flags.
4310219089Spjd *		cr	- credentials of caller.
4311219089Spjd *
4312219089Spjd *	OUT:	offp	- start of range pushed.
4313219089Spjd *		lenp	- len of range pushed.
4314219089Spjd *
4315219089Spjd *	RETURN:	0 if success
4316219089Spjd *		error code if failure
4317219089Spjd *
4318219089Spjd * NOTE: callers must have locked the page to be pushed.  On
4319219089Spjd * exit, the page (and all other pages in the kluster) must be
4320219089Spjd * unlocked.
4321219089Spjd */
4322219089Spjd/* ARGSUSED */
4323219089Spjdstatic int
4324219089Spjdzfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
4325219089Spjd		size_t *lenp, int flags, cred_t *cr)
4326219089Spjd{
4327219089Spjd	znode_t		*zp = VTOZ(vp);
4328219089Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4329219089Spjd	dmu_tx_t	*tx;
4330219089Spjd	u_offset_t	off, koff;
4331219089Spjd	size_t		len, klen;
4332219089Spjd	int		err;
4333219089Spjd
4334219089Spjd	off = pp->p_offset;
4335219089Spjd	len = PAGESIZE;
4336219089Spjd	/*
4337219089Spjd	 * If our blocksize is bigger than the page size, try to kluster
4338219089Spjd	 * multiple pages so that we write a full block (thus avoiding
4339219089Spjd	 * a read-modify-write).
4340219089Spjd	 */
4341219089Spjd	if (off < zp->z_size && zp->z_blksz > PAGESIZE) {
4342219089Spjd		klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
4343219089Spjd		koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0;
4344219089Spjd		ASSERT(koff <= zp->z_size);
4345219089Spjd		if (koff + klen > zp->z_size)
4346219089Spjd			klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE);
4347219089Spjd		pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
4348219089Spjd	}
4349219089Spjd	ASSERT3U(btop(len), ==, btopr(len));
4350219089Spjd
4351219089Spjd	/*
4352219089Spjd	 * Can't push pages past end-of-file.
4353219089Spjd	 */
4354219089Spjd	if (off >= zp->z_size) {
4355219089Spjd		/* ignore all pages */
4356219089Spjd		err = 0;
4357219089Spjd		goto out;
4358219089Spjd	} else if (off + len > zp->z_size) {
4359219089Spjd		int npages = btopr(zp->z_size - off);
4360219089Spjd		page_t *trunc;
4361219089Spjd
4362219089Spjd		page_list_break(&pp, &trunc, npages);
4363219089Spjd		/* ignore pages past end of file */
4364219089Spjd		if (trunc)
4365219089Spjd			pvn_write_done(trunc, flags);
4366219089Spjd		len = zp->z_size - off;
4367219089Spjd	}
4368219089Spjd
4369219089Spjd	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
4370219089Spjd	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
4371219089Spjd		err = EDQUOT;
4372219089Spjd		goto out;
4373219089Spjd	}
4374219089Spjdtop:
4375219089Spjd	tx = dmu_tx_create(zfsvfs->z_os);
4376219089Spjd	dmu_tx_hold_write(tx, zp->z_id, off, len);
4377219089Spjd
4378219089Spjd	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4379219089Spjd	zfs_sa_upgrade_txholds(tx, zp);
4380219089Spjd	err = dmu_tx_assign(tx, TXG_NOWAIT);
4381219089Spjd	if (err != 0) {
4382219089Spjd		if (err == ERESTART) {
4383219089Spjd			dmu_tx_wait(tx);
4384219089Spjd			dmu_tx_abort(tx);
4385219089Spjd			goto top;
4386219089Spjd		}
4387219089Spjd		dmu_tx_abort(tx);
4388219089Spjd		goto out;
4389219089Spjd	}
4390219089Spjd
4391219089Spjd	if (zp->z_blksz <= PAGESIZE) {
4392219089Spjd		caddr_t va = zfs_map_page(pp, S_READ);
4393219089Spjd		ASSERT3U(len, <=, PAGESIZE);
4394219089Spjd		dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
4395219089Spjd		zfs_unmap_page(pp, va);
4396219089Spjd	} else {
4397219089Spjd		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
4398219089Spjd	}
4399219089Spjd
4400219089Spjd	if (err == 0) {
4401219089Spjd		uint64_t mtime[2], ctime[2];
4402219089Spjd		sa_bulk_attr_t bulk[3];
4403219089Spjd		int count = 0;
4404219089Spjd
4405219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
4406219089Spjd		    &mtime, 16);
4407219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
4408219089Spjd		    &ctime, 16);
4409219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
4410219089Spjd		    &zp->z_pflags, 8);
4411219089Spjd		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
4412219089Spjd		    B_TRUE);
4413219089Spjd		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
4414219089Spjd	}
4415219089Spjd	dmu_tx_commit(tx);
4416219089Spjd
4417219089Spjdout:
4418219089Spjd	pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
4419219089Spjd	if (offp)
4420219089Spjd		*offp = off;
4421219089Spjd	if (lenp)
4422219089Spjd		*lenp = len;
4423219089Spjd
4424219089Spjd	return (err);
4425219089Spjd}
4426219089Spjd
4427219089Spjd/*
4428219089Spjd * Copy the portion of the file indicated from pages into the file.
4429219089Spjd * The pages are stored in a page list attached to the files vnode.
4430219089Spjd *
4431219089Spjd *	IN:	vp	- vnode of file to push page data to.
4432219089Spjd *		off	- position in file to put data.
4433219089Spjd *		len	- amount of data to write.
4434219089Spjd *		flags	- flags to control the operation.
4435219089Spjd *		cr	- credentials of caller.
4436219089Spjd *		ct	- caller context.
4437219089Spjd *
4438219089Spjd *	RETURN:	0 if success
4439219089Spjd *		error code if failure
4440219089Spjd *
4441219089Spjd * Timestamps:
4442219089Spjd *	vp - ctime|mtime updated
4443219089Spjd */
4444185029Spjd/*ARGSUSED*/
4445219089Spjdstatic int
4446219089Spjdzfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
4447219089Spjd    caller_context_t *ct)
4448219089Spjd{
4449219089Spjd	znode_t		*zp = VTOZ(vp);
4450219089Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4451219089Spjd	page_t		*pp;
4452219089Spjd	size_t		io_len;
4453219089Spjd	u_offset_t	io_off;
4454219089Spjd	uint_t		blksz;
4455219089Spjd	rl_t		*rl;
4456219089Spjd	int		error = 0;
4457219089Spjd
4458219089Spjd	ZFS_ENTER(zfsvfs);
4459219089Spjd	ZFS_VERIFY_ZP(zp);
4460219089Spjd
4461219089Spjd	/*
4462219089Spjd	 * Align this request to the file block size in case we kluster.
4463219089Spjd	 * XXX - this can result in pretty aggresive locking, which can
4464219089Spjd	 * impact simultanious read/write access.  One option might be
4465219089Spjd	 * to break up long requests (len == 0) into block-by-block
4466219089Spjd	 * operations to get narrower locking.
4467219089Spjd	 */
4468219089Spjd	blksz = zp->z_blksz;
4469219089Spjd	if (ISP2(blksz))
4470219089Spjd		io_off = P2ALIGN_TYPED(off, blksz, u_offset_t);
4471219089Spjd	else
4472219089Spjd		io_off = 0;
4473219089Spjd	if (len > 0 && ISP2(blksz))
4474219089Spjd		io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
4475219089Spjd	else
4476219089Spjd		io_len = 0;
4477219089Spjd
4478219089Spjd	if (io_len == 0) {
4479219089Spjd		/*
4480219089Spjd		 * Search the entire vp list for pages >= io_off.
4481219089Spjd		 */
4482219089Spjd		rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER);
4483219089Spjd		error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
4484219089Spjd		goto out;
4485219089Spjd	}
4486219089Spjd	rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER);
4487219089Spjd
4488219089Spjd	if (off > zp->z_size) {
4489219089Spjd		/* past end of file */
4490219089Spjd		zfs_range_unlock(rl);
4491219089Spjd		ZFS_EXIT(zfsvfs);
4492219089Spjd		return (0);
4493219089Spjd	}
4494219089Spjd
4495219089Spjd	len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off);
4496219089Spjd
4497219089Spjd	for (off = io_off; io_off < off + len; io_off += io_len) {
4498219089Spjd		if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
4499219089Spjd			pp = page_lookup(vp, io_off,
4500219089Spjd			    (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
4501219089Spjd		} else {
4502219089Spjd			pp = page_lookup_nowait(vp, io_off,
4503219089Spjd			    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
4504219089Spjd		}
4505219089Spjd
4506219089Spjd		if (pp != NULL && pvn_getdirty(pp, flags)) {
4507219089Spjd			int err;
4508219089Spjd
4509219089Spjd			/*
4510219089Spjd			 * Found a dirty page to push
4511219089Spjd			 */
4512219089Spjd			err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr);
4513219089Spjd			if (err)
4514219089Spjd				error = err;
4515219089Spjd		} else {
4516219089Spjd			io_len = PAGESIZE;
4517219089Spjd		}
4518219089Spjd	}
4519219089Spjdout:
4520219089Spjd	zfs_range_unlock(rl);
4521219089Spjd	if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4522219089Spjd		zil_commit(zfsvfs->z_log, zp->z_id);
4523219089Spjd	ZFS_EXIT(zfsvfs);
4524219089Spjd	return (error);
4525219089Spjd}
4526219089Spjd#endif	/* sun */
4527219089Spjd
4528219089Spjd/*ARGSUSED*/
4529168962Spjdvoid
4530185029Spjdzfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4531168404Spjd{
4532168962Spjd	znode_t	*zp = VTOZ(vp);
4533168962Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4534168962Spjd	int error;
4535168404Spjd
4536185029Spjd	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4537219089Spjd	if (zp->z_sa_hdl == NULL) {
4538185029Spjd		/*
4539185029Spjd		 * The fs has been unmounted, or we did a
4540185029Spjd		 * suspend/resume and this file no longer exists.
4541185029Spjd		 */
4542168404Spjd		VI_LOCK(vp);
4543219089Spjd		ASSERT(vp->v_count <= 1);
4544219089Spjd		vp->v_count = 0;
4545196299Spjd		VI_UNLOCK(vp);
4546196299Spjd		vrecycle(vp, curthread);
4547185029Spjd		rw_exit(&zfsvfs->z_teardown_inactive_lock);
4548168962Spjd		return;
4549168404Spjd	}
4550168404Spjd
4551168404Spjd	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
4552168404Spjd		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
4553168404Spjd
4554219089Spjd		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4555219089Spjd		zfs_sa_upgrade_txholds(tx, zp);
4556168404Spjd		error = dmu_tx_assign(tx, TXG_WAIT);
4557168404Spjd		if (error) {
4558168404Spjd			dmu_tx_abort(tx);
4559168404Spjd		} else {
4560168404Spjd			mutex_enter(&zp->z_lock);
4561219089Spjd			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
4562219089Spjd			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
4563168404Spjd			zp->z_atime_dirty = 0;
4564168404Spjd			mutex_exit(&zp->z_lock);
4565168404Spjd			dmu_tx_commit(tx);
4566168404Spjd		}
4567168404Spjd	}
4568168404Spjd
4569168404Spjd	zfs_zinactive(zp);
4570185029Spjd	rw_exit(&zfsvfs->z_teardown_inactive_lock);
4571168404Spjd}
4572168404Spjd
4573219089Spjd#ifdef sun
4574219089Spjd/*
4575219089Spjd * Bounds-check the seek operation.
4576219089Spjd *
4577219089Spjd *	IN:	vp	- vnode seeking within
4578219089Spjd *		ooff	- old file offset
4579219089Spjd *		noffp	- pointer to new file offset
4580219089Spjd *		ct	- caller context
4581219089Spjd *
4582219089Spjd *	RETURN:	0 if success
4583219089Spjd *		EINVAL if new offset invalid
4584219089Spjd */
4585219089Spjd/* ARGSUSED */
4586219089Spjdstatic int
4587219089Spjdzfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
4588219089Spjd    caller_context_t *ct)
4589219089Spjd{
4590219089Spjd	if (vp->v_type == VDIR)
4591219089Spjd		return (0);
4592219089Spjd	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
4593219089Spjd}
4594219089Spjd
4595219089Spjd/*
4596219089Spjd * Pre-filter the generic locking function to trap attempts to place
4597219089Spjd * a mandatory lock on a memory mapped file.
4598219089Spjd */
4599219089Spjdstatic int
4600219089Spjdzfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
4601219089Spjd    flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct)
4602219089Spjd{
4603219089Spjd	znode_t *zp = VTOZ(vp);
4604219089Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4605219089Spjd
4606219089Spjd	ZFS_ENTER(zfsvfs);
4607219089Spjd	ZFS_VERIFY_ZP(zp);
4608219089Spjd
4609219089Spjd	/*
4610219089Spjd	 * We are following the UFS semantics with respect to mapcnt
4611219089Spjd	 * here: If we see that the file is mapped already, then we will
4612219089Spjd	 * return an error, but we don't worry about races between this
4613219089Spjd	 * function and zfs_map().
4614219089Spjd	 */
4615219089Spjd	if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) {
4616219089Spjd		ZFS_EXIT(zfsvfs);
4617219089Spjd		return (EAGAIN);
4618219089Spjd	}
4619219089Spjd	ZFS_EXIT(zfsvfs);
4620219089Spjd	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
4621219089Spjd}
4622219089Spjd
4623219089Spjd/*
4624219089Spjd * If we can't find a page in the cache, we will create a new page
4625219089Spjd * and fill it with file data.  For efficiency, we may try to fill
4626219089Spjd * multiple pages at once (klustering) to fill up the supplied page
4627219089Spjd * list.  Note that the pages to be filled are held with an exclusive
4628219089Spjd * lock to prevent access by other threads while they are being filled.
4629219089Spjd */
4630219089Spjdstatic int
4631219089Spjdzfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
4632219089Spjd    caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
4633219089Spjd{
4634219089Spjd	znode_t *zp = VTOZ(vp);
4635219089Spjd	page_t *pp, *cur_pp;
4636219089Spjd	objset_t *os = zp->z_zfsvfs->z_os;
4637219089Spjd	u_offset_t io_off, total;
4638219089Spjd	size_t io_len;
4639219089Spjd	int err;
4640219089Spjd
4641219089Spjd	if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
4642219089Spjd		/*
4643219089Spjd		 * We only have a single page, don't bother klustering
4644219089Spjd		 */
4645219089Spjd		io_off = off;
4646219089Spjd		io_len = PAGESIZE;
4647219089Spjd		pp = page_create_va(vp, io_off, io_len,
4648219089Spjd		    PG_EXCL | PG_WAIT, seg, addr);
4649219089Spjd	} else {
4650219089Spjd		/*
4651219089Spjd		 * Try to find enough pages to fill the page list
4652219089Spjd		 */
4653219089Spjd		pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
4654219089Spjd		    &io_len, off, plsz, 0);
4655219089Spjd	}
4656219089Spjd	if (pp == NULL) {
4657219089Spjd		/*
4658219089Spjd		 * The page already exists, nothing to do here.
4659219089Spjd		 */
4660219089Spjd		*pl = NULL;
4661219089Spjd		return (0);
4662219089Spjd	}
4663219089Spjd
4664219089Spjd	/*
4665219089Spjd	 * Fill the pages in the kluster.
4666219089Spjd	 */
4667219089Spjd	cur_pp = pp;
4668219089Spjd	for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
4669219089Spjd		caddr_t va;
4670219089Spjd
4671219089Spjd		ASSERT3U(io_off, ==, cur_pp->p_offset);
4672219089Spjd		va = zfs_map_page(cur_pp, S_WRITE);
4673219089Spjd		err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
4674219089Spjd		    DMU_READ_PREFETCH);
4675219089Spjd		zfs_unmap_page(cur_pp, va);
4676219089Spjd		if (err) {
4677219089Spjd			/* On error, toss the entire kluster */
4678219089Spjd			pvn_read_done(pp, B_ERROR);
4679219089Spjd			/* convert checksum errors into IO errors */
4680219089Spjd			if (err == ECKSUM)
4681219089Spjd				err = EIO;
4682219089Spjd			return (err);
4683219089Spjd		}
4684219089Spjd		cur_pp = cur_pp->p_next;
4685219089Spjd	}
4686219089Spjd
4687219089Spjd	/*
4688219089Spjd	 * Fill in the page list array from the kluster starting
4689219089Spjd	 * from the desired offset `off'.
4690219089Spjd	 * NOTE: the page list will always be null terminated.
4691219089Spjd	 */
4692219089Spjd	pvn_plist_init(pp, pl, plsz, off, io_len, rw);
4693219089Spjd	ASSERT(pl == NULL || (*pl)->p_offset == off);
4694219089Spjd
4695219089Spjd	return (0);
4696219089Spjd}
4697219089Spjd
4698219089Spjd/*
4699219089Spjd * Return pointers to the pages for the file region [off, off + len]
4700219089Spjd * in the pl array.  If plsz is greater than len, this function may
4701219089Spjd * also return page pointers from after the specified region
4702219089Spjd * (i.e. the region [off, off + plsz]).  These additional pages are
4703219089Spjd * only returned if they are already in the cache, or were created as
4704219089Spjd * part of a klustered read.
4705219089Spjd *
4706219089Spjd *	IN:	vp	- vnode of file to get data from.
4707219089Spjd *		off	- position in file to get data from.
4708219089Spjd *		len	- amount of data to retrieve.
4709219089Spjd *		plsz	- length of provided page list.
4710219089Spjd *		seg	- segment to obtain pages for.
4711219089Spjd *		addr	- virtual address of fault.
4712219089Spjd *		rw	- mode of created pages.
4713219089Spjd *		cr	- credentials of caller.
4714219089Spjd *		ct	- caller context.
4715219089Spjd *
4716219089Spjd *	OUT:	protp	- protection mode of created pages.
4717219089Spjd *		pl	- list of pages created.
4718219089Spjd *
4719219089Spjd *	RETURN:	0 if success
4720219089Spjd *		error code if failure
4721219089Spjd *
4722219089Spjd * Timestamps:
4723219089Spjd *	vp - atime updated
4724219089Spjd */
4725219089Spjd/* ARGSUSED */
4726219089Spjdstatic int
4727219089Spjdzfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
4728219089Spjd	page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
4729219089Spjd	enum seg_rw rw, cred_t *cr, caller_context_t *ct)
4730219089Spjd{
4731219089Spjd	znode_t		*zp = VTOZ(vp);
4732219089Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4733219089Spjd	page_t		**pl0 = pl;
4734219089Spjd	int		err = 0;
4735219089Spjd
4736219089Spjd	/* we do our own caching, faultahead is unnecessary */
4737219089Spjd	if (pl == NULL)
4738219089Spjd		return (0);
4739219089Spjd	else if (len > plsz)
4740219089Spjd		len = plsz;
4741219089Spjd	else
4742219089Spjd		len = P2ROUNDUP(len, PAGESIZE);
4743219089Spjd	ASSERT(plsz >= len);
4744219089Spjd
4745219089Spjd	ZFS_ENTER(zfsvfs);
4746219089Spjd	ZFS_VERIFY_ZP(zp);
4747219089Spjd
4748219089Spjd	if (protp)
4749219089Spjd		*protp = PROT_ALL;
4750219089Spjd
4751219089Spjd	/*
4752219089Spjd	 * Loop through the requested range [off, off + len) looking
4753219089Spjd	 * for pages.  If we don't find a page, we will need to create
4754219089Spjd	 * a new page and fill it with data from the file.
4755219089Spjd	 */
4756219089Spjd	while (len > 0) {
4757219089Spjd		if (*pl = page_lookup(vp, off, SE_SHARED))
4758219089Spjd			*(pl+1) = NULL;
4759219089Spjd		else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw))
4760219089Spjd			goto out;
4761219089Spjd		while (*pl) {
4762219089Spjd			ASSERT3U((*pl)->p_offset, ==, off);
4763219089Spjd			off += PAGESIZE;
4764219089Spjd			addr += PAGESIZE;
4765219089Spjd			if (len > 0) {
4766219089Spjd				ASSERT3U(len, >=, PAGESIZE);
4767219089Spjd				len -= PAGESIZE;
4768219089Spjd			}
4769219089Spjd			ASSERT3U(plsz, >=, PAGESIZE);
4770219089Spjd			plsz -= PAGESIZE;
4771219089Spjd			pl++;
4772219089Spjd		}
4773219089Spjd	}
4774219089Spjd
4775219089Spjd	/*
4776219089Spjd	 * Fill out the page array with any pages already in the cache.
4777219089Spjd	 */
4778219089Spjd	while (plsz > 0 &&
4779219089Spjd	    (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) {
4780219089Spjd			off += PAGESIZE;
4781219089Spjd			plsz -= PAGESIZE;
4782219089Spjd	}
4783219089Spjdout:
4784219089Spjd	if (err) {
4785219089Spjd		/*
4786219089Spjd		 * Release any pages we have previously locked.
4787219089Spjd		 */
4788219089Spjd		while (pl > pl0)
4789219089Spjd			page_unlock(*--pl);
4790219089Spjd	} else {
4791219089Spjd		ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4792219089Spjd	}
4793219089Spjd
4794219089Spjd	*pl = NULL;
4795219089Spjd
4796219089Spjd	ZFS_EXIT(zfsvfs);
4797219089Spjd	return (err);
4798219089Spjd}
4799219089Spjd
4800219089Spjd/*
4801219089Spjd * Request a memory map for a section of a file.  This code interacts
4802219089Spjd * with common code and the VM system as follows:
4803219089Spjd *
4804219089Spjd *	common code calls mmap(), which ends up in smmap_common()
4805219089Spjd *
4806219089Spjd *	this calls VOP_MAP(), which takes you into (say) zfs
4807219089Spjd *
4808219089Spjd *	zfs_map() calls as_map(), passing segvn_create() as the callback
4809219089Spjd *
4810219089Spjd *	segvn_create() creates the new segment and calls VOP_ADDMAP()
4811219089Spjd *
4812219089Spjd *	zfs_addmap() updates z_mapcnt
4813219089Spjd */
4814219089Spjd/*ARGSUSED*/
4815219089Spjdstatic int
4816219089Spjdzfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
4817219089Spjd    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4818219089Spjd    caller_context_t *ct)
4819219089Spjd{
4820219089Spjd	znode_t *zp = VTOZ(vp);
4821219089Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4822219089Spjd	segvn_crargs_t	vn_a;
4823219089Spjd	int		error;
4824219089Spjd
4825219089Spjd	ZFS_ENTER(zfsvfs);
4826219089Spjd	ZFS_VERIFY_ZP(zp);
4827219089Spjd
4828219089Spjd	if ((prot & PROT_WRITE) && (zp->z_pflags &
4829219089Spjd	    (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
4830219089Spjd		ZFS_EXIT(zfsvfs);
4831219089Spjd		return (EPERM);
4832219089Spjd	}
4833219089Spjd
4834219089Spjd	if ((prot & (PROT_READ | PROT_EXEC)) &&
4835219089Spjd	    (zp->z_pflags & ZFS_AV_QUARANTINED)) {
4836219089Spjd		ZFS_EXIT(zfsvfs);
4837219089Spjd		return (EACCES);
4838219089Spjd	}
4839219089Spjd
4840219089Spjd	if (vp->v_flag & VNOMAP) {
4841219089Spjd		ZFS_EXIT(zfsvfs);
4842219089Spjd		return (ENOSYS);
4843219089Spjd	}
4844219089Spjd
4845219089Spjd	if (off < 0 || len > MAXOFFSET_T - off) {
4846219089Spjd		ZFS_EXIT(zfsvfs);
4847219089Spjd		return (ENXIO);
4848219089Spjd	}
4849219089Spjd
4850219089Spjd	if (vp->v_type != VREG) {
4851219089Spjd		ZFS_EXIT(zfsvfs);
4852219089Spjd		return (ENODEV);
4853219089Spjd	}
4854219089Spjd
4855219089Spjd	/*
4856219089Spjd	 * If file is locked, disallow mapping.
4857219089Spjd	 */
4858219089Spjd	if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) {
4859219089Spjd		ZFS_EXIT(zfsvfs);
4860219089Spjd		return (EAGAIN);
4861219089Spjd	}
4862219089Spjd
4863219089Spjd	as_rangelock(as);
4864219089Spjd	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
4865219089Spjd	if (error != 0) {
4866219089Spjd		as_rangeunlock(as);
4867219089Spjd		ZFS_EXIT(zfsvfs);
4868219089Spjd		return (error);
4869219089Spjd	}
4870219089Spjd
4871219089Spjd	vn_a.vp = vp;
4872219089Spjd	vn_a.offset = (u_offset_t)off;
4873219089Spjd	vn_a.type = flags & MAP_TYPE;
4874219089Spjd	vn_a.prot = prot;
4875219089Spjd	vn_a.maxprot = maxprot;
4876219089Spjd	vn_a.cred = cr;
4877219089Spjd	vn_a.amp = NULL;
4878219089Spjd	vn_a.flags = flags & ~MAP_TYPE;
4879219089Spjd	vn_a.szc = 0;
4880219089Spjd	vn_a.lgrp_mem_policy_flags = 0;
4881219089Spjd
4882219089Spjd	error = as_map(as, *addrp, len, segvn_create, &vn_a);
4883219089Spjd
4884219089Spjd	as_rangeunlock(as);
4885219089Spjd	ZFS_EXIT(zfsvfs);
4886219089Spjd	return (error);
4887219089Spjd}
4888219089Spjd
4889219089Spjd/* ARGSUSED */
4890219089Spjdstatic int
4891219089Spjdzfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4892219089Spjd    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4893219089Spjd    caller_context_t *ct)
4894219089Spjd{
4895219089Spjd	uint64_t pages = btopr(len);
4896219089Spjd
4897219089Spjd	atomic_add_64(&VTOZ(vp)->z_mapcnt, pages);
4898219089Spjd	return (0);
4899219089Spjd}
4900219089Spjd
4901219089Spjd/*
4902219089Spjd * The reason we push dirty pages as part of zfs_delmap() is so that we get a
4903219089Spjd * more accurate mtime for the associated file.  Since we don't have a way of
4904219089Spjd * detecting when the data was actually modified, we have to resort to
4905219089Spjd * heuristics.  If an explicit msync() is done, then we mark the mtime when the
4906219089Spjd * last page is pushed.  The problem occurs when the msync() call is omitted,
4907219089Spjd * which by far the most common case:
4908219089Spjd *
4909219089Spjd * 	open()
4910219089Spjd * 	mmap()
4911219089Spjd * 	<modify memory>
4912219089Spjd * 	munmap()
4913219089Spjd * 	close()
4914219089Spjd * 	<time lapse>
4915219089Spjd * 	putpage() via fsflush
4916219089Spjd *
4917219089Spjd * If we wait until fsflush to come along, we can have a modification time that
4918219089Spjd * is some arbitrary point in the future.  In order to prevent this in the
4919219089Spjd * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is
4920219089Spjd * torn down.
4921219089Spjd */
4922219089Spjd/* ARGSUSED */
4923219089Spjdstatic int
4924219089Spjdzfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4925219089Spjd    size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
4926219089Spjd    caller_context_t *ct)
4927219089Spjd{
4928219089Spjd	uint64_t pages = btopr(len);
4929219089Spjd
4930219089Spjd	ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages);
4931219089Spjd	atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages);
4932219089Spjd
4933219089Spjd	if ((flags & MAP_SHARED) && (prot & PROT_WRITE) &&
4934219089Spjd	    vn_has_cached_data(vp))
4935219089Spjd		(void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct);
4936219089Spjd
4937219089Spjd	return (0);
4938219089Spjd}
4939219089Spjd
4940219089Spjd/*
4941219089Spjd * Free or allocate space in a file.  Currently, this function only
4942219089Spjd * supports the `F_FREESP' command.  However, this command is somewhat
4943219089Spjd * misnamed, as its functionality includes the ability to allocate as
4944219089Spjd * well as free space.
4945219089Spjd *
4946219089Spjd *	IN:	vp	- vnode of file to free data in.
4947219089Spjd *		cmd	- action to take (only F_FREESP supported).
4948219089Spjd *		bfp	- section of file to free/alloc.
4949219089Spjd *		flag	- current file open mode flags.
4950219089Spjd *		offset	- current file offset.
4951219089Spjd *		cr	- credentials of caller [UNUSED].
4952219089Spjd *		ct	- caller context.
4953219089Spjd *
4954219089Spjd *	RETURN:	0 if success
4955219089Spjd *		error code if failure
4956219089Spjd *
4957219089Spjd * Timestamps:
4958219089Spjd *	vp - ctime|mtime updated
4959219089Spjd */
4960219089Spjd/* ARGSUSED */
4961219089Spjdstatic int
4962219089Spjdzfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag,
4963219089Spjd    offset_t offset, cred_t *cr, caller_context_t *ct)
4964219089Spjd{
4965219089Spjd	znode_t		*zp = VTOZ(vp);
4966219089Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4967219089Spjd	uint64_t	off, len;
4968219089Spjd	int		error;
4969219089Spjd
4970219089Spjd	ZFS_ENTER(zfsvfs);
4971219089Spjd	ZFS_VERIFY_ZP(zp);
4972219089Spjd
4973219089Spjd	if (cmd != F_FREESP) {
4974219089Spjd		ZFS_EXIT(zfsvfs);
4975219089Spjd		return (EINVAL);
4976219089Spjd	}
4977219089Spjd
4978219089Spjd	if (error = convoff(vp, bfp, 0, offset)) {
4979219089Spjd		ZFS_EXIT(zfsvfs);
4980219089Spjd		return (error);
4981219089Spjd	}
4982219089Spjd
4983219089Spjd	if (bfp->l_len < 0) {
4984219089Spjd		ZFS_EXIT(zfsvfs);
4985219089Spjd		return (EINVAL);
4986219089Spjd	}
4987219089Spjd
4988219089Spjd	off = bfp->l_start;
4989219089Spjd	len = bfp->l_len; /* 0 means from off to end of file */
4990219089Spjd
4991219089Spjd	error = zfs_freesp(zp, off, len, flag, TRUE);
4992219089Spjd
4993219089Spjd	ZFS_EXIT(zfsvfs);
4994219089Spjd	return (error);
4995219089Spjd}
4996219089Spjd#endif	/* sun */
4997219089Spjd
4998168404SpjdCTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
4999168404SpjdCTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
5000168404Spjd
5001185029Spjd/*ARGSUSED*/
5002168404Spjdstatic int
5003185029Spjdzfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
5004168404Spjd{
5005168404Spjd	znode_t		*zp = VTOZ(vp);
5006168404Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
5007185029Spjd	uint32_t	gen;
5008219089Spjd	uint64_t	gen64;
5009168404Spjd	uint64_t	object = zp->z_id;
5010168404Spjd	zfid_short_t	*zfid;
5011219089Spjd	int		size, i, error;
5012168404Spjd
5013168404Spjd	ZFS_ENTER(zfsvfs);
5014185029Spjd	ZFS_VERIFY_ZP(zp);
5015168404Spjd
5016219089Spjd	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
5017219089Spjd	    &gen64, sizeof (uint64_t))) != 0) {
5018219089Spjd		ZFS_EXIT(zfsvfs);
5019219089Spjd		return (error);
5020219089Spjd	}
5021219089Spjd
5022219089Spjd	gen = (uint32_t)gen64;
5023219089Spjd
5024168404Spjd	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
5025168404Spjd	fidp->fid_len = size;
5026168404Spjd
5027168404Spjd	zfid = (zfid_short_t *)fidp;
5028168404Spjd
5029168404Spjd	zfid->zf_len = size;
5030168404Spjd
5031168404Spjd	for (i = 0; i < sizeof (zfid->zf_object); i++)
5032168404Spjd		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
5033168404Spjd
5034168404Spjd	/* Must have a non-zero generation number to distinguish from .zfs */
5035168404Spjd	if (gen == 0)
5036168404Spjd		gen = 1;
5037168404Spjd	for (i = 0; i < sizeof (zfid->zf_gen); i++)
5038168404Spjd		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
5039168404Spjd
5040168404Spjd	if (size == LONG_FID_LEN) {
5041168404Spjd		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
5042169023Spjd		zfid_long_t	*zlfid;
5043168404Spjd
5044168404Spjd		zlfid = (zfid_long_t *)fidp;
5045168404Spjd
5046168404Spjd		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
5047168404Spjd			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
5048168404Spjd
5049168404Spjd		/* XXX - this should be the generation number for the objset */
5050168404Spjd		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
5051168404Spjd			zlfid->zf_setgen[i] = 0;
5052168404Spjd	}
5053168404Spjd
5054168404Spjd	ZFS_EXIT(zfsvfs);
5055168404Spjd	return (0);
5056168404Spjd}
5057168404Spjd
5058168404Spjdstatic int
5059185029Spjdzfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
5060185029Spjd    caller_context_t *ct)
5061168404Spjd{
5062168404Spjd	znode_t		*zp, *xzp;
5063168404Spjd	zfsvfs_t	*zfsvfs;
5064168404Spjd	zfs_dirlock_t	*dl;
5065168404Spjd	int		error;
5066168404Spjd
5067168404Spjd	switch (cmd) {
5068168404Spjd	case _PC_LINK_MAX:
5069168404Spjd		*valp = INT_MAX;
5070168404Spjd		return (0);
5071168404Spjd
5072168404Spjd	case _PC_FILESIZEBITS:
5073168404Spjd		*valp = 64;
5074168404Spjd		return (0);
5075219089Spjd#ifdef sun
5076168404Spjd	case _PC_XATTR_EXISTS:
5077168404Spjd		zp = VTOZ(vp);
5078168404Spjd		zfsvfs = zp->z_zfsvfs;
5079168404Spjd		ZFS_ENTER(zfsvfs);
5080185029Spjd		ZFS_VERIFY_ZP(zp);
5081168404Spjd		*valp = 0;
5082168404Spjd		error = zfs_dirent_lock(&dl, zp, "", &xzp,
5083185029Spjd		    ZXATTR | ZEXISTS | ZSHARED, NULL, NULL);
5084168404Spjd		if (error == 0) {
5085168404Spjd			zfs_dirent_unlock(dl);
5086168404Spjd			if (!zfs_dirempty(xzp))
5087168404Spjd				*valp = 1;
5088168404Spjd			VN_RELE(ZTOV(xzp));
5089168404Spjd		} else if (error == ENOENT) {
5090168404Spjd			/*
5091168404Spjd			 * If there aren't extended attributes, it's the
5092168404Spjd			 * same as having zero of them.
5093168404Spjd			 */
5094168404Spjd			error = 0;
5095168404Spjd		}
5096168404Spjd		ZFS_EXIT(zfsvfs);
5097168404Spjd		return (error);
5098168404Spjd
5099219089Spjd	case _PC_SATTR_ENABLED:
5100219089Spjd	case _PC_SATTR_EXISTS:
5101219089Spjd		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
5102219089Spjd		    (vp->v_type == VREG || vp->v_type == VDIR);
5103219089Spjd		return (0);
5104219089Spjd
5105219089Spjd	case _PC_ACCESS_FILTERING:
5106219089Spjd		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
5107219089Spjd		    vp->v_type == VDIR;
5108219089Spjd		return (0);
5109219089Spjd
5110219089Spjd	case _PC_ACL_ENABLED:
5111219089Spjd		*valp = _ACL_ACE_ENABLED;
5112219089Spjd		return (0);
5113219089Spjd#endif	/* sun */
5114219089Spjd	case _PC_MIN_HOLE_SIZE:
5115219089Spjd		*valp = (int)SPA_MINBLOCKSIZE;
5116219089Spjd		return (0);
5117219089Spjd#ifdef sun
5118219089Spjd	case _PC_TIMESTAMP_RESOLUTION:
5119219089Spjd		/* nanosecond timestamp resolution */
5120219089Spjd		*valp = 1L;
5121219089Spjd		return (0);
5122219089Spjd#endif	/* sun */
5123168404Spjd	case _PC_ACL_EXTENDED:
5124196949Strasz		*valp = 0;
5125168404Spjd		return (0);
5126168404Spjd
5127196949Strasz	case _PC_ACL_NFS4:
5128196949Strasz		*valp = 1;
5129196949Strasz		return (0);
5130196949Strasz
5131196949Strasz	case _PC_ACL_PATH_MAX:
5132196949Strasz		*valp = ACL_MAX_ENTRIES;
5133196949Strasz		return (0);
5134196949Strasz
5135168404Spjd	default:
5136168962Spjd		return (EOPNOTSUPP);
5137168404Spjd	}
5138168404Spjd}
5139168404Spjd
5140168404Spjd/*ARGSUSED*/
5141168404Spjdstatic int
5142185029Spjdzfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
5143185029Spjd    caller_context_t *ct)
5144168404Spjd{
5145168404Spjd	znode_t *zp = VTOZ(vp);
5146168404Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5147168404Spjd	int error;
5148185029Spjd	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
5149168404Spjd
5150168404Spjd	ZFS_ENTER(zfsvfs);
5151185029Spjd	ZFS_VERIFY_ZP(zp);
5152185029Spjd	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
5153168404Spjd	ZFS_EXIT(zfsvfs);
5154168404Spjd
5155168404Spjd	return (error);
5156168404Spjd}
5157168404Spjd
5158168404Spjd/*ARGSUSED*/
5159168404Spjdstatic int
5160185029Spjdzfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
5161185029Spjd    caller_context_t *ct)
5162168404Spjd{
5163168404Spjd	znode_t *zp = VTOZ(vp);
5164168404Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5165168404Spjd	int error;
5166185029Spjd	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
5167219089Spjd	zilog_t	*zilog = zfsvfs->z_log;
5168168404Spjd
5169168404Spjd	ZFS_ENTER(zfsvfs);
5170185029Spjd	ZFS_VERIFY_ZP(zp);
5171219089Spjd
5172185029Spjd	error = zfs_setacl(zp, vsecp, skipaclchk, cr);
5173219089Spjd
5174219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
5175219089Spjd		zil_commit(zilog, 0);
5176219089Spjd
5177168404Spjd	ZFS_EXIT(zfsvfs);
5178168404Spjd	return (error);
5179168404Spjd}
5180168404Spjd
5181219089Spjd#ifdef sun
/*
 * Zero-copy read tunables.  Each value must be a power of 2.
 *
 * zcr_blksz_min: lower bound applied to the block size that a read is
 *                compared against before an arcbuf may be loaned out
 * zcr_blksz_max: upper bound applied to that block size; setting it below
 *                the file block size allows loaning out an arcbuf for a
 *                partial block read
 */
int zcr_blksz_min = 1024;		/* 1K */
int zcr_blksz_max = 128 * 1024;		/* 128K */
5191219089Spjd
5192219089Spjd/*ARGSUSED*/
5193168962Spjdstatic int
5194219089Spjdzfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr,
5195219089Spjd    caller_context_t *ct)
5196219089Spjd{
5197219089Spjd	znode_t	*zp = VTOZ(vp);
5198219089Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5199219089Spjd	int max_blksz = zfsvfs->z_max_blksz;
5200219089Spjd	uio_t *uio = &xuio->xu_uio;
5201219089Spjd	ssize_t size = uio->uio_resid;
5202219089Spjd	offset_t offset = uio->uio_loffset;
5203219089Spjd	int blksz;
5204219089Spjd	int fullblk, i;
5205219089Spjd	arc_buf_t *abuf;
5206219089Spjd	ssize_t maxsize;
5207219089Spjd	int preamble, postamble;
5208219089Spjd
5209219089Spjd	if (xuio->xu_type != UIOTYPE_ZEROCOPY)
5210219089Spjd		return (EINVAL);
5211219089Spjd
5212219089Spjd	ZFS_ENTER(zfsvfs);
5213219089Spjd	ZFS_VERIFY_ZP(zp);
5214219089Spjd	switch (ioflag) {
5215219089Spjd	case UIO_WRITE:
5216219089Spjd		/*
5217219089Spjd		 * Loan out an arc_buf for write if write size is bigger than
5218219089Spjd		 * max_blksz, and the file's block size is also max_blksz.
5219219089Spjd		 */
5220219089Spjd		blksz = max_blksz;
5221219089Spjd		if (size < blksz || zp->z_blksz != blksz) {
5222219089Spjd			ZFS_EXIT(zfsvfs);
5223219089Spjd			return (EINVAL);
5224219089Spjd		}
5225219089Spjd		/*
5226219089Spjd		 * Caller requests buffers for write before knowing where the
5227219089Spjd		 * write offset might be (e.g. NFS TCP write).
5228219089Spjd		 */
5229219089Spjd		if (offset == -1) {
5230219089Spjd			preamble = 0;
5231219089Spjd		} else {
5232219089Spjd			preamble = P2PHASE(offset, blksz);
5233219089Spjd			if (preamble) {
5234219089Spjd				preamble = blksz - preamble;
5235219089Spjd				size -= preamble;
5236219089Spjd			}
5237219089Spjd		}
5238219089Spjd
5239219089Spjd		postamble = P2PHASE(size, blksz);
5240219089Spjd		size -= postamble;
5241219089Spjd
5242219089Spjd		fullblk = size / blksz;
5243219089Spjd		(void) dmu_xuio_init(xuio,
5244219089Spjd		    (preamble != 0) + fullblk + (postamble != 0));
5245219089Spjd		DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble,
5246219089Spjd		    int, postamble, int,
5247219089Spjd		    (preamble != 0) + fullblk + (postamble != 0));
5248219089Spjd
5249219089Spjd		/*
5250219089Spjd		 * Have to fix iov base/len for partial buffers.  They
5251219089Spjd		 * currently represent full arc_buf's.
5252219089Spjd		 */
5253219089Spjd		if (preamble) {
5254219089Spjd			/* data begins in the middle of the arc_buf */
5255219089Spjd			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5256219089Spjd			    blksz);
5257219089Spjd			ASSERT(abuf);
5258219089Spjd			(void) dmu_xuio_add(xuio, abuf,
5259219089Spjd			    blksz - preamble, preamble);
5260219089Spjd		}
5261219089Spjd
5262219089Spjd		for (i = 0; i < fullblk; i++) {
5263219089Spjd			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5264219089Spjd			    blksz);
5265219089Spjd			ASSERT(abuf);
5266219089Spjd			(void) dmu_xuio_add(xuio, abuf, 0, blksz);
5267219089Spjd		}
5268219089Spjd
5269219089Spjd		if (postamble) {
5270219089Spjd			/* data ends in the middle of the arc_buf */
5271219089Spjd			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5272219089Spjd			    blksz);
5273219089Spjd			ASSERT(abuf);
5274219089Spjd			(void) dmu_xuio_add(xuio, abuf, 0, postamble);
5275219089Spjd		}
5276219089Spjd		break;
5277219089Spjd	case UIO_READ:
5278219089Spjd		/*
5279219089Spjd		 * Loan out an arc_buf for read if the read size is larger than
5280219089Spjd		 * the current file block size.  Block alignment is not
5281219089Spjd		 * considered.  Partial arc_buf will be loaned out for read.
5282219089Spjd		 */
5283219089Spjd		blksz = zp->z_blksz;
5284219089Spjd		if (blksz < zcr_blksz_min)
5285219089Spjd			blksz = zcr_blksz_min;
5286219089Spjd		if (blksz > zcr_blksz_max)
5287219089Spjd			blksz = zcr_blksz_max;
5288219089Spjd		/* avoid potential complexity of dealing with it */
5289219089Spjd		if (blksz > max_blksz) {
5290219089Spjd			ZFS_EXIT(zfsvfs);
5291219089Spjd			return (EINVAL);
5292219089Spjd		}
5293219089Spjd
5294219089Spjd		maxsize = zp->z_size - uio->uio_loffset;
5295219089Spjd		if (size > maxsize)
5296219089Spjd			size = maxsize;
5297219089Spjd
5298219089Spjd		if (size < blksz || vn_has_cached_data(vp)) {
5299219089Spjd			ZFS_EXIT(zfsvfs);
5300219089Spjd			return (EINVAL);
5301219089Spjd		}
5302219089Spjd		break;
5303219089Spjd	default:
5304219089Spjd		ZFS_EXIT(zfsvfs);
5305219089Spjd		return (EINVAL);
5306219089Spjd	}
5307219089Spjd
5308219089Spjd	uio->uio_extflg = UIO_XUIO;
5309219089Spjd	XUIO_XUZC_RW(xuio) = ioflag;
5310219089Spjd	ZFS_EXIT(zfsvfs);
5311219089Spjd	return (0);
5312219089Spjd}
5313219089Spjd
5314219089Spjd/*ARGSUSED*/
5315219089Spjdstatic int
5316219089Spjdzfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct)
5317219089Spjd{
5318219089Spjd	int i;
5319219089Spjd	arc_buf_t *abuf;
5320219089Spjd	int ioflag = XUIO_XUZC_RW(xuio);
5321219089Spjd
5322219089Spjd	ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
5323219089Spjd
5324219089Spjd	i = dmu_xuio_cnt(xuio);
5325219089Spjd	while (i-- > 0) {
5326219089Spjd		abuf = dmu_xuio_arcbuf(xuio, i);
5327219089Spjd		/*
5328219089Spjd		 * if abuf == NULL, it must be a write buffer
5329219089Spjd		 * that has been returned in zfs_write().
5330219089Spjd		 */
5331219089Spjd		if (abuf)
5332219089Spjd			dmu_return_arcbuf(abuf);
5333219089Spjd		ASSERT(abuf || ioflag == UIO_WRITE);
5334219089Spjd	}
5335219089Spjd
5336219089Spjd	dmu_xuio_fini(xuio);
5337219089Spjd	return (0);
5338219089Spjd}
5339219089Spjd
/*
 * The two stubs below are deliberately declared without a parameter list
 * ("old style"), so that the compiler does not complain about a type
 * mismatch when they are stored in the ".error" slots of the operation
 * templates that follow.
 */
static int zfs_inval();
static int zfs_isdir();

/*
 * Stub for operations that are not permitted: always fails with EINVAL.
 */
static int
zfs_inval()
{
	return (EINVAL);
}
5354219089Spjd
/*
 * Stub for file-only operations attempted on a directory: always fails
 * with EISDIR.  Left without a parameter list on purpose so it can be
 * stored in an ".error" template slot.
 */
static int
zfs_isdir()
{
	return (EISDIR);
}
5360219089Spjd/*
5361219089Spjd * Directory vnode operations template
5362219089Spjd */
5363219089Spjdvnodeops_t *zfs_dvnodeops;
5364219089Spjdconst fs_operation_def_t zfs_dvnodeops_template[] = {
5365219089Spjd	VOPNAME_OPEN,		{ .vop_open = zfs_open },
5366219089Spjd	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
5367219089Spjd	VOPNAME_READ,		{ .error = zfs_isdir },
5368219089Spjd	VOPNAME_WRITE,		{ .error = zfs_isdir },
5369219089Spjd	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
5370219089Spjd	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5371219089Spjd	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
5372219089Spjd	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5373219089Spjd	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
5374219089Spjd	VOPNAME_CREATE,		{ .vop_create = zfs_create },
5375219089Spjd	VOPNAME_REMOVE,		{ .vop_remove = zfs_remove },
5376219089Spjd	VOPNAME_LINK,		{ .vop_link = zfs_link },
5377219089Spjd	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
5378219089Spjd	VOPNAME_MKDIR,		{ .vop_mkdir = zfs_mkdir },
5379219089Spjd	VOPNAME_RMDIR,		{ .vop_rmdir = zfs_rmdir },
5380219089Spjd	VOPNAME_READDIR,	{ .vop_readdir = zfs_readdir },
5381219089Spjd	VOPNAME_SYMLINK,	{ .vop_symlink = zfs_symlink },
5382219089Spjd	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
5383219089Spjd	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5384219089Spjd	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5385219089Spjd	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
5386219089Spjd	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5387219089Spjd	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
5388219089Spjd	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
5389219089Spjd	VOPNAME_VNEVENT, 	{ .vop_vnevent = fs_vnevent_support },
5390219089Spjd	NULL,			NULL
5391219089Spjd};
5392219089Spjd
5393219089Spjd/*
5394219089Spjd * Regular file vnode operations template
5395219089Spjd */
5396219089Spjdvnodeops_t *zfs_fvnodeops;
5397219089Spjdconst fs_operation_def_t zfs_fvnodeops_template[] = {
5398219089Spjd	VOPNAME_OPEN,		{ .vop_open = zfs_open },
5399219089Spjd	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
5400219089Spjd	VOPNAME_READ,		{ .vop_read = zfs_read },
5401219089Spjd	VOPNAME_WRITE,		{ .vop_write = zfs_write },
5402219089Spjd	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
5403219089Spjd	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5404219089Spjd	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
5405219089Spjd	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5406219089Spjd	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
5407219089Spjd	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
5408219089Spjd	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
5409219089Spjd	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5410219089Spjd	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5411219089Spjd	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
5412219089Spjd	VOPNAME_FRLOCK,		{ .vop_frlock = zfs_frlock },
5413219089Spjd	VOPNAME_SPACE,		{ .vop_space = zfs_space },
5414219089Spjd	VOPNAME_GETPAGE,	{ .vop_getpage = zfs_getpage },
5415219089Spjd	VOPNAME_PUTPAGE,	{ .vop_putpage = zfs_putpage },
5416219089Spjd	VOPNAME_MAP,		{ .vop_map = zfs_map },
5417219089Spjd	VOPNAME_ADDMAP,		{ .vop_addmap = zfs_addmap },
5418219089Spjd	VOPNAME_DELMAP,		{ .vop_delmap = zfs_delmap },
5419219089Spjd	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5420219089Spjd	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
5421219089Spjd	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
5422219089Spjd	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5423219089Spjd	VOPNAME_REQZCBUF, 	{ .vop_reqzcbuf = zfs_reqzcbuf },
5424219089Spjd	VOPNAME_RETZCBUF, 	{ .vop_retzcbuf = zfs_retzcbuf },
5425219089Spjd	NULL,			NULL
5426219089Spjd};
5427219089Spjd
5428219089Spjd/*
5429219089Spjd * Symbolic link vnode operations template
5430219089Spjd */
5431219089Spjdvnodeops_t *zfs_symvnodeops;
5432219089Spjdconst fs_operation_def_t zfs_symvnodeops_template[] = {
5433219089Spjd	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5434219089Spjd	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
5435219089Spjd	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5436219089Spjd	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
5437219089Spjd	VOPNAME_READLINK,	{ .vop_readlink = zfs_readlink },
5438219089Spjd	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5439219089Spjd	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5440219089Spjd	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5441219089Spjd	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5442219089Spjd	NULL,			NULL
5443219089Spjd};
5444219089Spjd
5445219089Spjd/*
5446219089Spjd * special share hidden files vnode operations template
5447219089Spjd */
5448219089Spjdvnodeops_t *zfs_sharevnodeops;
5449219089Spjdconst fs_operation_def_t zfs_sharevnodeops_template[] = {
5450219089Spjd	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5451219089Spjd	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5452219089Spjd	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5453219089Spjd	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5454219089Spjd	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5455219089Spjd	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
5456219089Spjd	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
5457219089Spjd	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5458219089Spjd	NULL,			NULL
5459219089Spjd};
5460219089Spjd
5461219089Spjd/*
5462219089Spjd * Extended attribute directory vnode operations template
5463219089Spjd *	This template is identical to the directory vnodes
5464219089Spjd *	operation template except for restricted operations:
5465219089Spjd *		VOP_MKDIR()
5466219089Spjd *		VOP_SYMLINK()
5467219089Spjd * Note that there are other restrictions embedded in:
5468219089Spjd *	zfs_create()	- restrict type to VREG
5469219089Spjd *	zfs_link()	- no links into/out of attribute space
5470219089Spjd *	zfs_rename()	- no moves into/out of attribute space
5471219089Spjd */
5472219089Spjdvnodeops_t *zfs_xdvnodeops;
5473219089Spjdconst fs_operation_def_t zfs_xdvnodeops_template[] = {
5474219089Spjd	VOPNAME_OPEN,		{ .vop_open = zfs_open },
5475219089Spjd	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
5476219089Spjd	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
5477219089Spjd	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5478219089Spjd	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
5479219089Spjd	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5480219089Spjd	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
5481219089Spjd	VOPNAME_CREATE,		{ .vop_create = zfs_create },
5482219089Spjd	VOPNAME_REMOVE,		{ .vop_remove = zfs_remove },
5483219089Spjd	VOPNAME_LINK,		{ .vop_link = zfs_link },
5484219089Spjd	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
5485219089Spjd	VOPNAME_MKDIR,		{ .error = zfs_inval },
5486219089Spjd	VOPNAME_RMDIR,		{ .vop_rmdir = zfs_rmdir },
5487219089Spjd	VOPNAME_READDIR,	{ .vop_readdir = zfs_readdir },
5488219089Spjd	VOPNAME_SYMLINK,	{ .error = zfs_inval },
5489219089Spjd	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
5490219089Spjd	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5491219089Spjd	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5492219089Spjd	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
5493219089Spjd	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5494219089Spjd	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
5495219089Spjd	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
5496219089Spjd	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5497219089Spjd	NULL,			NULL
5498219089Spjd};
5499219089Spjd
5500219089Spjd/*
5501219089Spjd * Error vnode operations template
5502219089Spjd */
5503219089Spjdvnodeops_t *zfs_evnodeops;
5504219089Spjdconst fs_operation_def_t zfs_evnodeops_template[] = {
5505219089Spjd	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5506219089Spjd	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5507219089Spjd	NULL,			NULL
5508219089Spjd};
5509219089Spjd#endif	/* sun */
5510219089Spjd
5511219089Spjdstatic int
5512213673Spjdioflags(int ioflags)
5513213673Spjd{
5514213673Spjd	int flags = 0;
5515213673Spjd
5516213673Spjd	if (ioflags & IO_APPEND)
5517213673Spjd		flags |= FAPPEND;
5518213673Spjd	if (ioflags & IO_NDELAY)
5519213673Spjd        	flags |= FNONBLOCK;
5520213673Spjd	if (ioflags & IO_SYNC)
5521213673Spjd		flags |= (FSYNC | FDSYNC | FRSYNC);
5522213673Spjd
5523213673Spjd	return (flags);
5524213673Spjd}
5525213673Spjd
5526213673Spjdstatic int
5527213937Savgzfs_getpages(struct vnode *vp, vm_page_t *m, int count, int reqpage)
5528213937Savg{
5529213937Savg	znode_t *zp = VTOZ(vp);
5530213937Savg	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5531213937Savg	objset_t *os = zp->z_zfsvfs->z_os;
5532213937Savg	vm_page_t mreq;
5533213937Savg	vm_object_t object;
5534213937Savg	caddr_t va;
5535213937Savg	struct sf_buf *sf;
5536213937Savg	int i, error;
5537213937Savg	int pcount, size;
5538213937Savg
5539213937Savg	ZFS_ENTER(zfsvfs);
5540213937Savg	ZFS_VERIFY_ZP(zp);
5541213937Savg
5542213937Savg	pcount = round_page(count) / PAGE_SIZE;
5543213937Savg	mreq = m[reqpage];
5544213937Savg	object = mreq->object;
5545213937Savg	error = 0;
5546213937Savg
5547213937Savg	KASSERT(vp->v_object == object, ("mismatching object"));
5548213937Savg
5549213937Savg	VM_OBJECT_LOCK(object);
5550213937Savg
5551213937Savg	for (i = 0; i < pcount; i++) {
5552213937Savg		if (i != reqpage) {
5553213937Savg			vm_page_lock(m[i]);
5554213937Savg			vm_page_free(m[i]);
5555213937Savg			vm_page_unlock(m[i]);
5556213937Savg		}
5557213937Savg	}
5558213937Savg
5559213937Savg	if (mreq->valid) {
5560213937Savg		if (mreq->valid != VM_PAGE_BITS_ALL)
5561213937Savg			vm_page_zero_invalid(mreq, TRUE);
5562213937Savg		VM_OBJECT_UNLOCK(object);
5563213937Savg		ZFS_EXIT(zfsvfs);
5564213937Savg		return (VM_PAGER_OK);
5565213937Savg	}
5566213937Savg
5567213937Savg	PCPU_INC(cnt.v_vnodein);
5568213937Savg	PCPU_INC(cnt.v_vnodepgsin);
5569213937Savg
5570213937Savg	if (IDX_TO_OFF(mreq->pindex) >= object->un_pager.vnp.vnp_size) {
5571213937Savg		VM_OBJECT_UNLOCK(object);
5572213937Savg		ZFS_EXIT(zfsvfs);
5573213937Savg		return (VM_PAGER_BAD);
5574213937Savg	}
5575213937Savg
5576213937Savg	size = PAGE_SIZE;
5577213937Savg	if (IDX_TO_OFF(mreq->pindex) + size > object->un_pager.vnp.vnp_size)
5578213937Savg		size = object->un_pager.vnp.vnp_size - IDX_TO_OFF(mreq->pindex);
5579213937Savg
5580213937Savg	VM_OBJECT_UNLOCK(object);
5581213937Savg	va = zfs_map_page(mreq, &sf);
5582213937Savg	error = dmu_read(os, zp->z_id, IDX_TO_OFF(mreq->pindex),
5583213937Savg	    size, va, DMU_READ_PREFETCH);
5584213937Savg	if (size != PAGE_SIZE)
5585213937Savg		bzero(va + size, PAGE_SIZE - size);
5586213937Savg	zfs_unmap_page(sf);
5587213937Savg	VM_OBJECT_LOCK(object);
5588213937Savg
5589213937Savg	if (!error)
5590213937Savg		mreq->valid = VM_PAGE_BITS_ALL;
5591213937Savg	KASSERT(mreq->dirty == 0, ("zfs_getpages: page %p is dirty", mreq));
5592213937Savg
5593213937Savg	VM_OBJECT_UNLOCK(object);
5594213937Savg
5595213937Savg	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
5596213937Savg	ZFS_EXIT(zfsvfs);
5597213937Savg	return (error ? VM_PAGER_ERROR : VM_PAGER_OK);
5598213937Savg}
5599213937Savg
5600213937Savgstatic int
5601213937Savgzfs_freebsd_getpages(ap)
5602213937Savg	struct vop_getpages_args /* {
5603213937Savg		struct vnode *a_vp;
5604213937Savg		vm_page_t *a_m;
5605213937Savg		int a_count;
5606213937Savg		int a_reqpage;
5607213937Savg		vm_ooffset_t a_offset;
5608213937Savg	} */ *ap;
5609213937Savg{
5610213937Savg
5611213937Savg	return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_reqpage));
5612213937Savg}
5613213937Savg
5614213937Savgstatic int
5615168962Spjdzfs_freebsd_open(ap)
5616168962Spjd	struct vop_open_args /* {
5617168962Spjd		struct vnode *a_vp;
5618168962Spjd		int a_mode;
5619168962Spjd		struct ucred *a_cred;
5620168962Spjd		struct thread *a_td;
5621168962Spjd	} */ *ap;
5622168962Spjd{
5623168962Spjd	vnode_t	*vp = ap->a_vp;
5624168962Spjd	znode_t *zp = VTOZ(vp);
5625168962Spjd	int error;
5626168962Spjd
5627185029Spjd	error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
5628168962Spjd	if (error == 0)
5629219089Spjd		vnode_create_vobject(vp, zp->z_size, ap->a_td);
5630168962Spjd	return (error);
5631168962Spjd}
5632168962Spjd
5633168962Spjdstatic int
5634168962Spjdzfs_freebsd_close(ap)
5635168962Spjd	struct vop_close_args /* {
5636168962Spjd		struct vnode *a_vp;
5637168962Spjd		int  a_fflag;
5638168962Spjd		struct ucred *a_cred;
5639168962Spjd		struct thread *a_td;
5640168962Spjd	} */ *ap;
5641168962Spjd{
5642168962Spjd
5643185029Spjd	return (zfs_close(ap->a_vp, ap->a_fflag, 0, 0, ap->a_cred, NULL));
5644168962Spjd}
5645168962Spjd
5646168962Spjdstatic int
5647168962Spjdzfs_freebsd_ioctl(ap)
5648168962Spjd	struct vop_ioctl_args /* {
5649168962Spjd		struct vnode *a_vp;
5650168962Spjd		u_long a_command;
5651168962Spjd		caddr_t a_data;
5652168962Spjd		int a_fflag;
5653168962Spjd		struct ucred *cred;
5654168962Spjd		struct thread *td;
5655168962Spjd	} */ *ap;
5656168962Spjd{
5657168962Spjd
5658168978Spjd	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
5659185029Spjd	    ap->a_fflag, ap->a_cred, NULL, NULL));
5660168962Spjd}
5661168962Spjd
5662168962Spjdstatic int
5663168962Spjdzfs_freebsd_read(ap)
5664168962Spjd	struct vop_read_args /* {
5665168962Spjd		struct vnode *a_vp;
5666168962Spjd		struct uio *a_uio;
5667168962Spjd		int a_ioflag;
5668168962Spjd		struct ucred *a_cred;
5669168962Spjd	} */ *ap;
5670168962Spjd{
5671168962Spjd
5672213673Spjd	return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
5673213673Spjd	    ap->a_cred, NULL));
5674168962Spjd}
5675168962Spjd
5676168962Spjdstatic int
5677168962Spjdzfs_freebsd_write(ap)
5678168962Spjd	struct vop_write_args /* {
5679168962Spjd		struct vnode *a_vp;
5680168962Spjd		struct uio *a_uio;
5681168962Spjd		int a_ioflag;
5682168962Spjd		struct ucred *a_cred;
5683168962Spjd	} */ *ap;
5684168962Spjd{
5685168962Spjd
5686207745Strasz	if (vn_rlimit_fsize(ap->a_vp, ap->a_uio, ap->a_uio->uio_td))
5687207745Strasz		return (EFBIG);
5688207745Strasz
5689213673Spjd	return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
5690213673Spjd	    ap->a_cred, NULL));
5691168962Spjd}
5692168962Spjd
5693168962Spjdstatic int
5694168962Spjdzfs_freebsd_access(ap)
5695168962Spjd	struct vop_access_args /* {
5696168962Spjd		struct vnode *a_vp;
5697192689Strasz		accmode_t a_accmode;
5698168962Spjd		struct ucred *a_cred;
5699168962Spjd		struct thread *a_td;
5700168962Spjd	} */ *ap;
5701168962Spjd{
5702212002Sjh	vnode_t *vp = ap->a_vp;
5703212002Sjh	znode_t *zp = VTOZ(vp);
5704198703Spjd	accmode_t accmode;
5705198703Spjd	int error = 0;
5706168962Spjd
5707185172Spjd	/*
5708198703Spjd	 * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND,
5709185172Spjd	 */
5710198703Spjd	accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
5711198703Spjd	if (accmode != 0)
5712198703Spjd		error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL);
5713185172Spjd
5714198703Spjd	/*
5715198703Spjd	 * VADMIN has to be handled by vaccess().
5716198703Spjd	 */
5717198703Spjd	if (error == 0) {
5718198703Spjd		accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
5719198703Spjd		if (accmode != 0) {
5720219089Spjd			error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
5721219089Spjd			    zp->z_gid, accmode, ap->a_cred, NULL);
5722198703Spjd		}
5723185172Spjd	}
5724185172Spjd
5725212002Sjh	/*
5726212002Sjh	 * For VEXEC, ensure that at least one execute bit is set for
5727212002Sjh	 * non-directories.
5728212002Sjh	 */
5729212002Sjh	if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
5730219089Spjd	    (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
5731212002Sjh		error = EACCES;
5732219089Spjd	}
5733212002Sjh
5734198703Spjd	return (error);
5735168962Spjd}
5736168962Spjd
5737168962Spjdstatic int
5738168962Spjdzfs_freebsd_lookup(ap)
5739168962Spjd	struct vop_lookup_args /* {
5740168962Spjd		struct vnode *a_dvp;
5741168962Spjd		struct vnode **a_vpp;
5742168962Spjd		struct componentname *a_cnp;
5743168962Spjd	} */ *ap;
5744168962Spjd{
5745168962Spjd	struct componentname *cnp = ap->a_cnp;
5746168962Spjd	char nm[NAME_MAX + 1];
5747168962Spjd
5748168962Spjd	ASSERT(cnp->cn_namelen < sizeof(nm));
5749168962Spjd	strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
5750168962Spjd
5751168962Spjd	return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
5752185029Spjd	    cnp->cn_cred, cnp->cn_thread, 0));
5753168962Spjd}
5754168962Spjd
5755168962Spjdstatic int
5756168962Spjdzfs_freebsd_create(ap)
5757168962Spjd	struct vop_create_args /* {
5758168962Spjd		struct vnode *a_dvp;
5759168962Spjd		struct vnode **a_vpp;
5760168962Spjd		struct componentname *a_cnp;
5761168962Spjd		struct vattr *a_vap;
5762168962Spjd	} */ *ap;
5763168962Spjd{
5764168962Spjd	struct componentname *cnp = ap->a_cnp;
5765168962Spjd	vattr_t *vap = ap->a_vap;
5766168962Spjd	int mode;
5767168962Spjd
5768168962Spjd	ASSERT(cnp->cn_flags & SAVENAME);
5769168962Spjd
5770168962Spjd	vattr_init_mask(vap);
5771168962Spjd	mode = vap->va_mode & ALLPERMS;
5772168962Spjd
5773168962Spjd	return (zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
5774185029Spjd	    ap->a_vpp, cnp->cn_cred, cnp->cn_thread));
5775168962Spjd}
5776168962Spjd
5777168962Spjdstatic int
5778168962Spjdzfs_freebsd_remove(ap)
5779168962Spjd	struct vop_remove_args /* {
5780168962Spjd		struct vnode *a_dvp;
5781168962Spjd		struct vnode *a_vp;
5782168962Spjd		struct componentname *a_cnp;
5783168962Spjd	} */ *ap;
5784168962Spjd{
5785168962Spjd
5786168962Spjd	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
5787168962Spjd
5788168962Spjd	return (zfs_remove(ap->a_dvp, ap->a_cnp->cn_nameptr,
5789185029Spjd	    ap->a_cnp->cn_cred, NULL, 0));
5790168962Spjd}
5791168962Spjd
5792168962Spjdstatic int
5793168962Spjdzfs_freebsd_mkdir(ap)
5794168962Spjd	struct vop_mkdir_args /* {
5795168962Spjd		struct vnode *a_dvp;
5796168962Spjd		struct vnode **a_vpp;
5797168962Spjd		struct componentname *a_cnp;
5798168962Spjd		struct vattr *a_vap;
5799168962Spjd	} */ *ap;
5800168962Spjd{
5801168962Spjd	vattr_t *vap = ap->a_vap;
5802168962Spjd
5803168962Spjd	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
5804168962Spjd
5805168962Spjd	vattr_init_mask(vap);
5806168962Spjd
5807168962Spjd	return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
5808185029Spjd	    ap->a_cnp->cn_cred, NULL, 0, NULL));
5809168962Spjd}
5810168962Spjd
5811168962Spjdstatic int
5812168962Spjdzfs_freebsd_rmdir(ap)
5813168962Spjd	struct vop_rmdir_args /* {
5814168962Spjd		struct vnode *a_dvp;
5815168962Spjd		struct vnode *a_vp;
5816168962Spjd		struct componentname *a_cnp;
5817168962Spjd	} */ *ap;
5818168962Spjd{
5819168962Spjd	struct componentname *cnp = ap->a_cnp;
5820168962Spjd
5821168962Spjd	ASSERT(cnp->cn_flags & SAVENAME);
5822168962Spjd
5823185029Spjd	return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred, NULL, 0));
5824168962Spjd}
5825168962Spjd
5826168962Spjdstatic int
5827168962Spjdzfs_freebsd_readdir(ap)
5828168962Spjd	struct vop_readdir_args /* {
5829168962Spjd		struct vnode *a_vp;
5830168962Spjd		struct uio *a_uio;
5831168962Spjd		struct ucred *a_cred;
5832168962Spjd		int *a_eofflag;
5833168962Spjd		int *a_ncookies;
5834168962Spjd		u_long **a_cookies;
5835168962Spjd	} */ *ap;
5836168962Spjd{
5837168962Spjd
5838168962Spjd	return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
5839168962Spjd	    ap->a_ncookies, ap->a_cookies));
5840168962Spjd}
5841168962Spjd
5842168962Spjdstatic int
5843168962Spjdzfs_freebsd_fsync(ap)
5844168962Spjd	struct vop_fsync_args /* {
5845168962Spjd		struct vnode *a_vp;
5846168962Spjd		int a_waitfor;
5847168962Spjd		struct thread *a_td;
5848168962Spjd	} */ *ap;
5849168962Spjd{
5850168962Spjd
5851168962Spjd	vop_stdfsync(ap);
5852185029Spjd	return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL));
5853168962Spjd}
5854168962Spjd
5855168962Spjdstatic int
5856168962Spjdzfs_freebsd_getattr(ap)
5857168962Spjd	struct vop_getattr_args /* {
5858168962Spjd		struct vnode *a_vp;
5859168962Spjd		struct vattr *a_vap;
5860168962Spjd		struct ucred *a_cred;
5861168962Spjd	} */ *ap;
5862168962Spjd{
5863185029Spjd	vattr_t *vap = ap->a_vap;
5864185029Spjd	xvattr_t xvap;
5865185029Spjd	u_long fflags = 0;
5866185029Spjd	int error;
5867168962Spjd
5868185029Spjd	xva_init(&xvap);
5869185029Spjd	xvap.xva_vattr = *vap;
5870185029Spjd	xvap.xva_vattr.va_mask |= AT_XVATTR;
5871185029Spjd
5872185029Spjd	/* Convert chflags into ZFS-type flags. */
5873185029Spjd	/* XXX: what about SF_SETTABLE?. */
5874185029Spjd	XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
5875185029Spjd	XVA_SET_REQ(&xvap, XAT_APPENDONLY);
5876185029Spjd	XVA_SET_REQ(&xvap, XAT_NOUNLINK);
5877185029Spjd	XVA_SET_REQ(&xvap, XAT_NODUMP);
5878185029Spjd	error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
5879185029Spjd	if (error != 0)
5880185029Spjd		return (error);
5881185029Spjd
5882185029Spjd	/* Convert ZFS xattr into chflags. */
5883185029Spjd#define	FLAG_CHECK(fflag, xflag, xfield)	do {			\
5884185029Spjd	if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0)		\
5885185029Spjd		fflags |= (fflag);					\
5886185029Spjd} while (0)
5887185029Spjd	FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
5888185029Spjd	    xvap.xva_xoptattrs.xoa_immutable);
5889185029Spjd	FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
5890185029Spjd	    xvap.xva_xoptattrs.xoa_appendonly);
5891185029Spjd	FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
5892185029Spjd	    xvap.xva_xoptattrs.xoa_nounlink);
5893185029Spjd	FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
5894185029Spjd	    xvap.xva_xoptattrs.xoa_nodump);
5895185029Spjd#undef	FLAG_CHECK
5896185029Spjd	*vap = xvap.xva_vattr;
5897185029Spjd	vap->va_flags = fflags;
5898185029Spjd	return (0);
5899168962Spjd}
5900168962Spjd
5901168962Spjdstatic int
5902168962Spjdzfs_freebsd_setattr(ap)
5903168962Spjd	struct vop_setattr_args /* {
5904168962Spjd		struct vnode *a_vp;
5905168962Spjd		struct vattr *a_vap;
5906168962Spjd		struct ucred *a_cred;
5907168962Spjd	} */ *ap;
5908168962Spjd{
5909185172Spjd	vnode_t *vp = ap->a_vp;
5910168962Spjd	vattr_t *vap = ap->a_vap;
5911185172Spjd	cred_t *cred = ap->a_cred;
5912185029Spjd	xvattr_t xvap;
5913185029Spjd	u_long fflags;
5914185029Spjd	uint64_t zflags;
5915168962Spjd
5916168962Spjd	vattr_init_mask(vap);
5917170044Spjd	vap->va_mask &= ~AT_NOSET;
5918168962Spjd
5919185029Spjd	xva_init(&xvap);
5920185029Spjd	xvap.xva_vattr = *vap;
5921185029Spjd
5922219089Spjd	zflags = VTOZ(vp)->z_pflags;
5923185172Spjd
5924185029Spjd	if (vap->va_flags != VNOVAL) {
5925197683Sdelphij		zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
5926185172Spjd		int error;
5927185172Spjd
5928197683Sdelphij		if (zfsvfs->z_use_fuids == B_FALSE)
5929197683Sdelphij			return (EOPNOTSUPP);
5930197683Sdelphij
5931185029Spjd		fflags = vap->va_flags;
5932185029Spjd		if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_NODUMP)) != 0)
5933185029Spjd			return (EOPNOTSUPP);
5934185172Spjd		/*
5935185172Spjd		 * Unprivileged processes are not permitted to unset system
5936185172Spjd		 * flags, or modify flags if any system flags are set.
5937185172Spjd		 * Privileged non-jail processes may not modify system flags
5938185172Spjd		 * if securelevel > 0 and any existing system flags are set.
5939185172Spjd		 * Privileged jail processes behave like privileged non-jail
5940185172Spjd		 * processes if the security.jail.chflags_allowed sysctl is
5941185172Spjd		 * is non-zero; otherwise, they behave like unprivileged
5942185172Spjd		 * processes.
5943185172Spjd		 */
5944197861Spjd		if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
5945197861Spjd		    priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) {
5946185172Spjd			if (zflags &
5947185172Spjd			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
5948185172Spjd				error = securelevel_gt(cred, 0);
5949197861Spjd				if (error != 0)
5950185172Spjd					return (error);
5951185172Spjd			}
5952185172Spjd		} else {
5953197861Spjd			/*
5954197861Spjd			 * Callers may only modify the file flags on objects they
5955197861Spjd			 * have VADMIN rights for.
5956197861Spjd			 */
5957197861Spjd			if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0)
5958197861Spjd				return (error);
5959185172Spjd			if (zflags &
5960185172Spjd			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
5961185172Spjd				return (EPERM);
5962185172Spjd			}
5963185172Spjd			if (fflags &
5964185172Spjd			    (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
5965185172Spjd				return (EPERM);
5966185172Spjd			}
5967185172Spjd		}
5968185029Spjd
5969185029Spjd#define	FLAG_CHANGE(fflag, zflag, xflag, xfield)	do {		\
5970185029Spjd	if (((fflags & (fflag)) && !(zflags & (zflag))) ||		\
5971185029Spjd	    ((zflags & (zflag)) && !(fflags & (fflag)))) {		\
5972185029Spjd		XVA_SET_REQ(&xvap, (xflag));				\
5973185029Spjd		(xfield) = ((fflags & (fflag)) != 0);			\
5974185029Spjd	}								\
5975185029Spjd} while (0)
5976185029Spjd		/* Convert chflags into ZFS-type flags. */
5977185029Spjd		/* XXX: what about SF_SETTABLE?. */
5978185029Spjd		FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
5979185029Spjd		    xvap.xva_xoptattrs.xoa_immutable);
5980185029Spjd		FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
5981185029Spjd		    xvap.xva_xoptattrs.xoa_appendonly);
5982185029Spjd		FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
5983185029Spjd		    xvap.xva_xoptattrs.xoa_nounlink);
5984185029Spjd		FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
5985185172Spjd		    xvap.xva_xoptattrs.xoa_nodump);
5986185029Spjd#undef	FLAG_CHANGE
5987185029Spjd	}
5988185172Spjd	return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL));
5989168962Spjd}
5990168962Spjd
5991168962Spjdstatic int
5992168962Spjdzfs_freebsd_rename(ap)
5993168962Spjd	struct vop_rename_args  /* {
5994168962Spjd		struct vnode *a_fdvp;
5995168962Spjd		struct vnode *a_fvp;
5996168962Spjd		struct componentname *a_fcnp;
5997168962Spjd		struct vnode *a_tdvp;
5998168962Spjd		struct vnode *a_tvp;
5999168962Spjd		struct componentname *a_tcnp;
6000168962Spjd	} */ *ap;
6001168962Spjd{
6002168962Spjd	vnode_t *fdvp = ap->a_fdvp;
6003168962Spjd	vnode_t *fvp = ap->a_fvp;
6004168962Spjd	vnode_t *tdvp = ap->a_tdvp;
6005168962Spjd	vnode_t *tvp = ap->a_tvp;
6006168962Spjd	int error;
6007168962Spjd
6008192237Skmacy	ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
6009192237Skmacy	ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
6010168962Spjd
6011168962Spjd	error = zfs_rename(fdvp, ap->a_fcnp->cn_nameptr, tdvp,
6012185029Spjd	    ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred, NULL, 0);
6013168962Spjd
6014168962Spjd	if (tdvp == tvp)
6015168962Spjd		VN_RELE(tdvp);
6016168962Spjd	else
6017168962Spjd		VN_URELE(tdvp);
6018168962Spjd	if (tvp)
6019168962Spjd		VN_URELE(tvp);
6020168962Spjd	VN_RELE(fdvp);
6021168962Spjd	VN_RELE(fvp);
6022168962Spjd
6023168962Spjd	return (error);
6024168962Spjd}
6025168962Spjd
6026168962Spjdstatic int
6027168962Spjdzfs_freebsd_symlink(ap)
6028168962Spjd	struct vop_symlink_args /* {
6029168962Spjd		struct vnode *a_dvp;
6030168962Spjd		struct vnode **a_vpp;
6031168962Spjd		struct componentname *a_cnp;
6032168962Spjd		struct vattr *a_vap;
6033168962Spjd		char *a_target;
6034168962Spjd	} */ *ap;
6035168962Spjd{
6036168962Spjd	struct componentname *cnp = ap->a_cnp;
6037168962Spjd	vattr_t *vap = ap->a_vap;
6038168962Spjd
6039168962Spjd	ASSERT(cnp->cn_flags & SAVENAME);
6040168962Spjd
6041168962Spjd	vap->va_type = VLNK;	/* FreeBSD: Syscall only sets va_mode. */
6042168962Spjd	vattr_init_mask(vap);
6043168962Spjd
6044168962Spjd	return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
6045168962Spjd	    ap->a_target, cnp->cn_cred, cnp->cn_thread));
6046168962Spjd}
6047168962Spjd
6048168962Spjdstatic int
6049168962Spjdzfs_freebsd_readlink(ap)
6050168962Spjd	struct vop_readlink_args /* {
6051168962Spjd		struct vnode *a_vp;
6052168962Spjd		struct uio *a_uio;
6053168962Spjd		struct ucred *a_cred;
6054168962Spjd	} */ *ap;
6055168962Spjd{
6056168962Spjd
6057185029Spjd	return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
6058168962Spjd}
6059168962Spjd
6060168962Spjdstatic int
6061168962Spjdzfs_freebsd_link(ap)
6062168962Spjd	struct vop_link_args /* {
6063168962Spjd		struct vnode *a_tdvp;
6064168962Spjd		struct vnode *a_vp;
6065168962Spjd		struct componentname *a_cnp;
6066168962Spjd	} */ *ap;
6067168962Spjd{
6068168962Spjd	struct componentname *cnp = ap->a_cnp;
6069168962Spjd
6070168962Spjd	ASSERT(cnp->cn_flags & SAVENAME);
6071168962Spjd
6072185029Spjd	return (zfs_link(ap->a_tdvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
6073168962Spjd}
6074168962Spjd
6075168962Spjdstatic int
6076168962Spjdzfs_freebsd_inactive(ap)
6077169170Spjd	struct vop_inactive_args /* {
6078169170Spjd		struct vnode *a_vp;
6079169170Spjd		struct thread *a_td;
6080169170Spjd	} */ *ap;
6081168962Spjd{
6082168962Spjd	vnode_t *vp = ap->a_vp;
6083168962Spjd
6084185029Spjd	zfs_inactive(vp, ap->a_td->td_ucred, NULL);
6085168962Spjd	return (0);
6086168962Spjd}
6087168962Spjd
6088185029Spjdstatic void
6089185029Spjdzfs_reclaim_complete(void *arg, int pending)
6090185029Spjd{
6091185029Spjd	znode_t	*zp = arg;
6092185029Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
6093185029Spjd
6094197133Spjd	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
6095219089Spjd	if (zp->z_sa_hdl != NULL) {
6096197133Spjd		ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id);
6097197133Spjd		zfs_znode_dmu_fini(zp);
6098197133Spjd		ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
6099197133Spjd	}
6100185029Spjd	zfs_znode_free(zp);
6101197133Spjd	rw_exit(&zfsvfs->z_teardown_inactive_lock);
6102197133Spjd	/*
6103197133Spjd	 * If the file system is being unmounted, there is a process waiting
6104197133Spjd	 * for us, wake it up.
6105197133Spjd	 */
6106197133Spjd	if (zfsvfs->z_unmounted)
6107197133Spjd		wakeup_one(zfsvfs);
6108185029Spjd}
6109185029Spjd
6110168962Spjdstatic int
6111168962Spjdzfs_freebsd_reclaim(ap)
6112168962Spjd	struct vop_reclaim_args /* {
6113168962Spjd		struct vnode *a_vp;
6114168962Spjd		struct thread *a_td;
6115168962Spjd	} */ *ap;
6116168962Spjd{
6117169170Spjd	vnode_t	*vp = ap->a_vp;
6118168962Spjd	znode_t	*zp = VTOZ(vp);
6119197133Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
6120219089Spjd	boolean_t rlocked;
6121168962Spjd
6122219089Spjd	rlocked = rw_tryenter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
6123197133Spjd
6124169025Spjd	ASSERT(zp != NULL);
6125169025Spjd
6126168962Spjd	/*
6127168962Spjd	 * Destroy the vm object and flush associated pages.
6128168962Spjd	 */
6129168962Spjd	vnode_destroy_vobject(vp);
6130169025Spjd
6131169025Spjd	mutex_enter(&zp->z_lock);
6132197153Spjd	zp->z_vnode = NULL;
6133196301Spjd	mutex_exit(&zp->z_lock);
6134196301Spjd
6135219089Spjd	if (zp->z_unlinked) {
6136196301Spjd		;	/* Do nothing. */
6137219089Spjd	} else if (!rlocked) {
6138219089Spjd		TASK_INIT(&zp->z_task, 0, zfs_reclaim_complete, zp);
6139219089Spjd		taskqueue_enqueue(taskqueue_thread, &zp->z_task);
6140219089Spjd	} else if (zp->z_sa_hdl == NULL) {
6141196301Spjd		zfs_znode_free(zp);
6142219089Spjd	} else /* if (!zp->z_unlinked && zp->z_dbuf != NULL) */ {
6143185029Spjd		int locked;
6144185029Spjd
6145185029Spjd		locked = MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)) ? 2 :
6146185029Spjd		    ZFS_OBJ_HOLD_TRYENTER(zfsvfs, zp->z_id);
6147185029Spjd		if (locked == 0) {
6148185029Spjd			/*
6149185029Spjd			 * Lock can't be obtained due to deadlock possibility,
6150185029Spjd			 * so defer znode destruction.
6151185029Spjd			 */
6152185029Spjd			TASK_INIT(&zp->z_task, 0, zfs_reclaim_complete, zp);
6153185029Spjd			taskqueue_enqueue(taskqueue_thread, &zp->z_task);
6154185029Spjd		} else {
6155185029Spjd			zfs_znode_dmu_fini(zp);
6156185029Spjd			if (locked == 1)
6157185029Spjd				ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
6158185029Spjd			zfs_znode_free(zp);
6159185029Spjd		}
6160169025Spjd	}
6161168962Spjd	VI_LOCK(vp);
6162168962Spjd	vp->v_data = NULL;
6163171567Spjd	ASSERT(vp->v_holdcnt >= 1);
6164171316Sdfr	VI_UNLOCK(vp);
6165219089Spjd	if (rlocked)
6166219089Spjd		rw_exit(&zfsvfs->z_teardown_inactive_lock);
6167168962Spjd	return (0);
6168168962Spjd}
6169168962Spjd
6170168962Spjdstatic int
6171168962Spjdzfs_freebsd_fid(ap)
6172168962Spjd	struct vop_fid_args /* {
6173168962Spjd		struct vnode *a_vp;
6174168962Spjd		struct fid *a_fid;
6175168962Spjd	} */ *ap;
6176168962Spjd{
6177168962Spjd
6178185029Spjd	return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
6179168962Spjd}
6180168962Spjd
6181168962Spjdstatic int
6182168962Spjdzfs_freebsd_pathconf(ap)
6183168962Spjd	struct vop_pathconf_args /* {
6184168962Spjd		struct vnode *a_vp;
6185168962Spjd		int a_name;
6186168962Spjd		register_t *a_retval;
6187168962Spjd	} */ *ap;
6188168962Spjd{
6189168962Spjd	ulong_t val;
6190168962Spjd	int error;
6191168962Spjd
6192185029Spjd	error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL);
6193168962Spjd	if (error == 0)
6194168962Spjd		*ap->a_retval = val;
6195168962Spjd	else if (error == EOPNOTSUPP)
6196168962Spjd		error = vop_stdpathconf(ap);
6197168962Spjd	return (error);
6198168962Spjd}
6199168962Spjd
6200196949Straszstatic int
6201196949Straszzfs_freebsd_fifo_pathconf(ap)
6202196949Strasz	struct vop_pathconf_args /* {
6203196949Strasz		struct vnode *a_vp;
6204196949Strasz		int a_name;
6205196949Strasz		register_t *a_retval;
6206196949Strasz	} */ *ap;
6207196949Strasz{
6208196949Strasz
6209196949Strasz	switch (ap->a_name) {
6210196949Strasz	case _PC_ACL_EXTENDED:
6211196949Strasz	case _PC_ACL_NFS4:
6212196949Strasz	case _PC_ACL_PATH_MAX:
6213196949Strasz	case _PC_MAC_PRESENT:
6214196949Strasz		return (zfs_freebsd_pathconf(ap));
6215196949Strasz	default:
6216196949Strasz		return (fifo_specops.vop_pathconf(ap));
6217196949Strasz	}
6218196949Strasz}
6219196949Strasz
6220185029Spjd/*
6221185029Spjd * FreeBSD's extended attributes namespace defines file name prefix for ZFS'
6222185029Spjd * extended attribute name:
6223185029Spjd *
6224185029Spjd *	NAMESPACE	PREFIX
6225185029Spjd *	system		freebsd:system:
6226185029Spjd *	user		(none, can be used to access ZFS fsattr(5) attributes
6227185029Spjd *			created on Solaris)
6228185029Spjd */
6229185029Spjdstatic int
6230185029Spjdzfs_create_attrname(int attrnamespace, const char *name, char *attrname,
6231185029Spjd    size_t size)
6232185029Spjd{
6233185029Spjd	const char *namespace, *prefix, *suffix;
6234185029Spjd
6235185029Spjd	/* We don't allow '/' character in attribute name. */
6236185029Spjd	if (strchr(name, '/') != NULL)
6237185029Spjd		return (EINVAL);
6238185029Spjd	/* We don't allow attribute names that start with "freebsd:" string. */
6239185029Spjd	if (strncmp(name, "freebsd:", 8) == 0)
6240185029Spjd		return (EINVAL);
6241185029Spjd
6242185029Spjd	bzero(attrname, size);
6243185029Spjd
6244185029Spjd	switch (attrnamespace) {
6245185029Spjd	case EXTATTR_NAMESPACE_USER:
6246185029Spjd#if 0
6247185029Spjd		prefix = "freebsd:";
6248185029Spjd		namespace = EXTATTR_NAMESPACE_USER_STRING;
6249185029Spjd		suffix = ":";
6250185029Spjd#else
6251185029Spjd		/*
6252185029Spjd		 * This is the default namespace by which we can access all
6253185029Spjd		 * attributes created on Solaris.
6254185029Spjd		 */
6255185029Spjd		prefix = namespace = suffix = "";
6256185029Spjd#endif
6257185029Spjd		break;
6258185029Spjd	case EXTATTR_NAMESPACE_SYSTEM:
6259185029Spjd		prefix = "freebsd:";
6260185029Spjd		namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
6261185029Spjd		suffix = ":";
6262185029Spjd		break;
6263185029Spjd	case EXTATTR_NAMESPACE_EMPTY:
6264185029Spjd	default:
6265185029Spjd		return (EINVAL);
6266185029Spjd	}
6267185029Spjd	if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
6268185029Spjd	    name) >= size) {
6269185029Spjd		return (ENAMETOOLONG);
6270185029Spjd	}
6271185029Spjd	return (0);
6272185029Spjd}
6273185029Spjd
6274185029Spjd/*
6275185029Spjd * Vnode operating to retrieve a named extended attribute.
6276185029Spjd */
6277185029Spjdstatic int
6278185029Spjdzfs_getextattr(struct vop_getextattr_args *ap)
6279185029Spjd/*
6280185029Spjdvop_getextattr {
6281185029Spjd	IN struct vnode *a_vp;
6282185029Spjd	IN int a_attrnamespace;
6283185029Spjd	IN const char *a_name;
6284185029Spjd	INOUT struct uio *a_uio;
6285185029Spjd	OUT size_t *a_size;
6286185029Spjd	IN struct ucred *a_cred;
6287185029Spjd	IN struct thread *a_td;
6288185029Spjd};
6289185029Spjd*/
6290185029Spjd{
6291185029Spjd	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
6292185029Spjd	struct thread *td = ap->a_td;
6293185029Spjd	struct nameidata nd;
6294185029Spjd	char attrname[255];
6295185029Spjd	struct vattr va;
6296185029Spjd	vnode_t *xvp = NULL, *vp;
6297185029Spjd	int error, flags;
6298185029Spjd
6299195785Strasz	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
6300195785Strasz	    ap->a_cred, ap->a_td, VREAD);
6301195785Strasz	if (error != 0)
6302195785Strasz		return (error);
6303195785Strasz
6304185029Spjd	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
6305185029Spjd	    sizeof(attrname));
6306185029Spjd	if (error != 0)
6307185029Spjd		return (error);
6308185029Spjd
6309185029Spjd	ZFS_ENTER(zfsvfs);
6310185029Spjd
6311185029Spjd	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
6312185029Spjd	    LOOKUP_XATTR);
6313185029Spjd	if (error != 0) {
6314185029Spjd		ZFS_EXIT(zfsvfs);
6315185029Spjd		return (error);
6316185029Spjd	}
6317185029Spjd
6318185029Spjd	flags = FREAD;
6319185029Spjd	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, attrname,
6320185029Spjd	    xvp, td);
6321194586Skib	error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL);
6322185029Spjd	vp = nd.ni_vp;
6323185029Spjd	NDFREE(&nd, NDF_ONLY_PNBUF);
6324185029Spjd	if (error != 0) {
6325196303Spjd		ZFS_EXIT(zfsvfs);
6326195785Strasz		if (error == ENOENT)
6327195785Strasz			error = ENOATTR;
6328185029Spjd		return (error);
6329185029Spjd	}
6330185029Spjd
6331185029Spjd	if (ap->a_size != NULL) {
6332185029Spjd		error = VOP_GETATTR(vp, &va, ap->a_cred);
6333185029Spjd		if (error == 0)
6334185029Spjd			*ap->a_size = (size_t)va.va_size;
6335185029Spjd	} else if (ap->a_uio != NULL)
6336185029Spjd		error = VOP_READ(vp, ap->a_uio, IO_UNIT | IO_SYNC, ap->a_cred);
6337185029Spjd
6338185029Spjd	VOP_UNLOCK(vp, 0);
6339185029Spjd	vn_close(vp, flags, ap->a_cred, td);
6340185029Spjd	ZFS_EXIT(zfsvfs);
6341185029Spjd
6342185029Spjd	return (error);
6343185029Spjd}
6344185029Spjd
6345185029Spjd/*
6346185029Spjd * Vnode operation to remove a named attribute.
6347185029Spjd */
6348185029Spjdint
6349185029Spjdzfs_deleteextattr(struct vop_deleteextattr_args *ap)
6350185029Spjd/*
6351185029Spjdvop_deleteextattr {
6352185029Spjd	IN struct vnode *a_vp;
6353185029Spjd	IN int a_attrnamespace;
6354185029Spjd	IN const char *a_name;
6355185029Spjd	IN struct ucred *a_cred;
6356185029Spjd	IN struct thread *a_td;
6357185029Spjd};
6358185029Spjd*/
6359185029Spjd{
6360185029Spjd	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
6361185029Spjd	struct thread *td = ap->a_td;
6362185029Spjd	struct nameidata nd;
6363185029Spjd	char attrname[255];
6364185029Spjd	struct vattr va;
6365185029Spjd	vnode_t *xvp = NULL, *vp;
6366185029Spjd	int error, flags;
6367185029Spjd
6368195785Strasz	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
6369195785Strasz	    ap->a_cred, ap->a_td, VWRITE);
6370195785Strasz	if (error != 0)
6371195785Strasz		return (error);
6372195785Strasz
6373185029Spjd	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
6374185029Spjd	    sizeof(attrname));
6375185029Spjd	if (error != 0)
6376185029Spjd		return (error);
6377185029Spjd
6378185029Spjd	ZFS_ENTER(zfsvfs);
6379185029Spjd
6380185029Spjd	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
6381185029Spjd	    LOOKUP_XATTR);
6382185029Spjd	if (error != 0) {
6383185029Spjd		ZFS_EXIT(zfsvfs);
6384185029Spjd		return (error);
6385185029Spjd	}
6386185029Spjd
6387185029Spjd	NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF | MPSAFE,
6388185029Spjd	    UIO_SYSSPACE, attrname, xvp, td);
6389185029Spjd	error = namei(&nd);
6390185029Spjd	vp = nd.ni_vp;
6391185029Spjd	NDFREE(&nd, NDF_ONLY_PNBUF);
6392185029Spjd	if (error != 0) {
6393196303Spjd		ZFS_EXIT(zfsvfs);
6394195785Strasz		if (error == ENOENT)
6395195785Strasz			error = ENOATTR;
6396185029Spjd		return (error);
6397185029Spjd	}
6398185029Spjd	error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
6399185029Spjd
6400185029Spjd	vput(nd.ni_dvp);
6401185029Spjd	if (vp == nd.ni_dvp)
6402185029Spjd		vrele(vp);
6403185029Spjd	else
6404185029Spjd		vput(vp);
6405185029Spjd	ZFS_EXIT(zfsvfs);
6406185029Spjd
6407185029Spjd	return (error);
6408185029Spjd}
6409185029Spjd
6410185029Spjd/*
6411185029Spjd * Vnode operation to set a named attribute.
6412185029Spjd */
6413185029Spjdstatic int
6414185029Spjdzfs_setextattr(struct vop_setextattr_args *ap)
6415185029Spjd/*
6416185029Spjdvop_setextattr {
6417185029Spjd	IN struct vnode *a_vp;
6418185029Spjd	IN int a_attrnamespace;
6419185029Spjd	IN const char *a_name;
6420185029Spjd	INOUT struct uio *a_uio;
6421185029Spjd	IN struct ucred *a_cred;
6422185029Spjd	IN struct thread *a_td;
6423185029Spjd};
6424185029Spjd*/
6425185029Spjd{
6426185029Spjd	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
6427185029Spjd	struct thread *td = ap->a_td;
6428185029Spjd	struct nameidata nd;
6429185029Spjd	char attrname[255];
6430185029Spjd	struct vattr va;
6431185029Spjd	vnode_t *xvp = NULL, *vp;
6432185029Spjd	int error, flags;
6433185029Spjd
6434195785Strasz	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
6435195785Strasz	    ap->a_cred, ap->a_td, VWRITE);
6436195785Strasz	if (error != 0)
6437195785Strasz		return (error);
6438195785Strasz
6439185029Spjd	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
6440185029Spjd	    sizeof(attrname));
6441185029Spjd	if (error != 0)
6442185029Spjd		return (error);
6443185029Spjd
6444185029Spjd	ZFS_ENTER(zfsvfs);
6445185029Spjd
6446185029Spjd	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
6447195785Strasz	    LOOKUP_XATTR | CREATE_XATTR_DIR);
6448185029Spjd	if (error != 0) {
6449185029Spjd		ZFS_EXIT(zfsvfs);
6450185029Spjd		return (error);
6451185029Spjd	}
6452185029Spjd
6453185029Spjd	flags = FFLAGS(O_WRONLY | O_CREAT);
6454185029Spjd	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, attrname,
6455185029Spjd	    xvp, td);
6456194586Skib	error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL);
6457185029Spjd	vp = nd.ni_vp;
6458185029Spjd	NDFREE(&nd, NDF_ONLY_PNBUF);
6459185029Spjd	if (error != 0) {
6460185029Spjd		ZFS_EXIT(zfsvfs);
6461185029Spjd		return (error);
6462185029Spjd	}
6463185029Spjd
6464185029Spjd	VATTR_NULL(&va);
6465185029Spjd	va.va_size = 0;
6466185029Spjd	error = VOP_SETATTR(vp, &va, ap->a_cred);
6467185029Spjd	if (error == 0)
6468185029Spjd		VOP_WRITE(vp, ap->a_uio, IO_UNIT | IO_SYNC, ap->a_cred);
6469185029Spjd
6470185029Spjd	VOP_UNLOCK(vp, 0);
6471185029Spjd	vn_close(vp, flags, ap->a_cred, td);
6472185029Spjd	ZFS_EXIT(zfsvfs);
6473185029Spjd
6474185029Spjd	return (error);
6475185029Spjd}
6476185029Spjd
6477185029Spjd/*
6478185029Spjd * Vnode operation to retrieve extended attributes on a vnode.
6479185029Spjd */
6480185029Spjdstatic int
6481185029Spjdzfs_listextattr(struct vop_listextattr_args *ap)
6482185029Spjd/*
6483185029Spjdvop_listextattr {
6484185029Spjd	IN struct vnode *a_vp;
6485185029Spjd	IN int a_attrnamespace;
6486185029Spjd	INOUT struct uio *a_uio;
6487185029Spjd	OUT size_t *a_size;
6488185029Spjd	IN struct ucred *a_cred;
6489185029Spjd	IN struct thread *a_td;
6490185029Spjd};
6491185029Spjd*/
6492185029Spjd{
6493185029Spjd	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
6494185029Spjd	struct thread *td = ap->a_td;
6495185029Spjd	struct nameidata nd;
6496185029Spjd	char attrprefix[16];
6497185029Spjd	u_char dirbuf[sizeof(struct dirent)];
6498185029Spjd	struct dirent *dp;
6499185029Spjd	struct iovec aiov;
6500185029Spjd	struct uio auio, *uio = ap->a_uio;
6501185029Spjd	size_t *sizep = ap->a_size;
6502185029Spjd	size_t plen;
6503185029Spjd	vnode_t *xvp = NULL, *vp;
6504185029Spjd	int done, error, eof, pos;
6505185029Spjd
6506195785Strasz	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
6507195785Strasz	    ap->a_cred, ap->a_td, VREAD);
6508196303Spjd	if (error != 0)
6509195785Strasz		return (error);
6510195785Strasz
6511185029Spjd	error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
6512185029Spjd	    sizeof(attrprefix));
6513185029Spjd	if (error != 0)
6514185029Spjd		return (error);
6515185029Spjd	plen = strlen(attrprefix);
6516185029Spjd
6517185029Spjd	ZFS_ENTER(zfsvfs);
6518185029Spjd
6519195822Strasz	if (sizep != NULL)
6520195822Strasz		*sizep = 0;
6521195822Strasz
6522185029Spjd	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
6523185029Spjd	    LOOKUP_XATTR);
6524185029Spjd	if (error != 0) {
6525196303Spjd		ZFS_EXIT(zfsvfs);
6526195785Strasz		/*
6527195785Strasz		 * ENOATTR means that the EA directory does not yet exist,
6528195785Strasz		 * i.e. there are no extended attributes there.
6529195785Strasz		 */
6530195785Strasz		if (error == ENOATTR)
6531195785Strasz			error = 0;
6532185029Spjd		return (error);
6533185029Spjd	}
6534185029Spjd
6535188588Sjhb	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED | MPSAFE,
6536188588Sjhb	    UIO_SYSSPACE, ".", xvp, td);
6537185029Spjd	error = namei(&nd);
6538185029Spjd	vp = nd.ni_vp;
6539185029Spjd	NDFREE(&nd, NDF_ONLY_PNBUF);
6540185029Spjd	if (error != 0) {
6541185029Spjd		ZFS_EXIT(zfsvfs);
6542185029Spjd		return (error);
6543185029Spjd	}
6544185029Spjd
6545185029Spjd	auio.uio_iov = &aiov;
6546185029Spjd	auio.uio_iovcnt = 1;
6547185029Spjd	auio.uio_segflg = UIO_SYSSPACE;
6548185029Spjd	auio.uio_td = td;
6549185029Spjd	auio.uio_rw = UIO_READ;
6550185029Spjd	auio.uio_offset = 0;
6551185029Spjd
6552185029Spjd	do {
6553185029Spjd		u_char nlen;
6554185029Spjd
6555185029Spjd		aiov.iov_base = (void *)dirbuf;
6556185029Spjd		aiov.iov_len = sizeof(dirbuf);
6557185029Spjd		auio.uio_resid = sizeof(dirbuf);
6558185029Spjd		error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
6559185029Spjd		done = sizeof(dirbuf) - auio.uio_resid;
6560185029Spjd		if (error != 0)
6561185029Spjd			break;
6562185029Spjd		for (pos = 0; pos < done;) {
6563185029Spjd			dp = (struct dirent *)(dirbuf + pos);
6564185029Spjd			pos += dp->d_reclen;
6565185029Spjd			/*
6566185029Spjd			 * XXX: Temporarily we also accept DT_UNKNOWN, as this
6567185029Spjd			 * is what we get when attribute was created on Solaris.
6568185029Spjd			 */
6569185029Spjd			if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
6570185029Spjd				continue;
6571185029Spjd			if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0)
6572185029Spjd				continue;
6573185029Spjd			else if (strncmp(dp->d_name, attrprefix, plen) != 0)
6574185029Spjd				continue;
6575185029Spjd			nlen = dp->d_namlen - plen;
6576185029Spjd			if (sizep != NULL)
6577185029Spjd				*sizep += 1 + nlen;
6578185029Spjd			else if (uio != NULL) {
6579185029Spjd				/*
6580185029Spjd				 * Format of extattr name entry is one byte for
6581185029Spjd				 * length and the rest for name.
6582185029Spjd				 */
6583185029Spjd				error = uiomove(&nlen, 1, uio->uio_rw, uio);
6584185029Spjd				if (error == 0) {
6585185029Spjd					error = uiomove(dp->d_name + plen, nlen,
6586185029Spjd					    uio->uio_rw, uio);
6587185029Spjd				}
6588185029Spjd				if (error != 0)
6589185029Spjd					break;
6590185029Spjd			}
6591185029Spjd		}
6592185029Spjd	} while (!eof && error == 0);
6593185029Spjd
6594185029Spjd	vput(vp);
6595185029Spjd	ZFS_EXIT(zfsvfs);
6596185029Spjd
6597185029Spjd	return (error);
6598185029Spjd}
6599185029Spjd
6600192800Straszint
6601192800Straszzfs_freebsd_getacl(ap)
6602192800Strasz	struct vop_getacl_args /* {
6603192800Strasz		struct vnode *vp;
6604192800Strasz		acl_type_t type;
6605192800Strasz		struct acl *aclp;
6606192800Strasz		struct ucred *cred;
6607192800Strasz		struct thread *td;
6608192800Strasz	} */ *ap;
6609192800Strasz{
6610192800Strasz	int		error;
6611192800Strasz	vsecattr_t      vsecattr;
6612192800Strasz
6613192800Strasz	if (ap->a_type != ACL_TYPE_NFS4)
6614197435Strasz		return (EINVAL);
6615192800Strasz
6616192800Strasz	vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
6617192800Strasz	if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL))
6618192800Strasz		return (error);
6619192800Strasz
6620192800Strasz	error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt);
6621196303Spjd	if (vsecattr.vsa_aclentp != NULL)
6622196303Spjd		kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);
6623192800Strasz
6624196303Spjd	return (error);
6625192800Strasz}
6626192800Strasz
6627192800Straszint
6628192800Straszzfs_freebsd_setacl(ap)
6629192800Strasz	struct vop_setacl_args /* {
6630192800Strasz		struct vnode *vp;
6631192800Strasz		acl_type_t type;
6632192800Strasz		struct acl *aclp;
6633192800Strasz		struct ucred *cred;
6634192800Strasz		struct thread *td;
6635192800Strasz	} */ *ap;
6636192800Strasz{
6637192800Strasz	int		error;
6638192800Strasz	vsecattr_t      vsecattr;
6639192800Strasz	int		aclbsize;	/* size of acl list in bytes */
6640192800Strasz	aclent_t	*aaclp;
6641192800Strasz
6642192800Strasz	if (ap->a_type != ACL_TYPE_NFS4)
6643197435Strasz		return (EINVAL);
6644192800Strasz
6645192800Strasz	if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
6646192800Strasz		return (EINVAL);
6647192800Strasz
6648192800Strasz	/*
6649196949Strasz	 * With NFSv4 ACLs, chmod(2) may need to add additional entries,
6650192800Strasz	 * splitting every entry into two and appending "canonical six"
6651192800Strasz	 * entries at the end.  Don't allow for setting an ACL that would
6652192800Strasz	 * cause chmod(2) to run out of ACL entries.
6653192800Strasz	 */
6654192800Strasz	if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
6655192800Strasz		return (ENOSPC);
6656192800Strasz
6657208030Strasz	error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
6658208030Strasz	if (error != 0)
6659208030Strasz		return (error);
6660208030Strasz
6661192800Strasz	vsecattr.vsa_mask = VSA_ACE;
6662192800Strasz	aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t);
6663192800Strasz	vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
6664192800Strasz	aaclp = vsecattr.vsa_aclentp;
6665192800Strasz	vsecattr.vsa_aclentsz = aclbsize;
6666192800Strasz
6667192800Strasz	aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
6668192800Strasz	error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL);
6669192800Strasz	kmem_free(aaclp, aclbsize);
6670192800Strasz
6671192800Strasz	return (error);
6672192800Strasz}
6673192800Strasz
6674192800Straszint
6675192800Straszzfs_freebsd_aclcheck(ap)
6676192800Strasz	struct vop_aclcheck_args /* {
6677192800Strasz		struct vnode *vp;
6678192800Strasz		acl_type_t type;
6679192800Strasz		struct acl *aclp;
6680192800Strasz		struct ucred *cred;
6681192800Strasz		struct thread *td;
6682192800Strasz	} */ *ap;
6683192800Strasz{
6684192800Strasz
6685192800Strasz	return (EOPNOTSUPP);
6686192800Strasz}
6687192800Strasz
6688168404Spjdstruct vop_vector zfs_vnodeops;
6689168404Spjdstruct vop_vector zfs_fifoops;
6690209962Smmstruct vop_vector zfs_shareops;
6691168404Spjd
6692168404Spjdstruct vop_vector zfs_vnodeops = {
6693185029Spjd	.vop_default =		&default_vnodeops,
6694185029Spjd	.vop_inactive =		zfs_freebsd_inactive,
6695185029Spjd	.vop_reclaim =		zfs_freebsd_reclaim,
6696185029Spjd	.vop_access =		zfs_freebsd_access,
6697168404Spjd#ifdef FREEBSD_NAMECACHE
6698185029Spjd	.vop_lookup =		vfs_cache_lookup,
6699185029Spjd	.vop_cachedlookup =	zfs_freebsd_lookup,
6700168404Spjd#else
6701185029Spjd	.vop_lookup =		zfs_freebsd_lookup,
6702168404Spjd#endif
6703185029Spjd	.vop_getattr =		zfs_freebsd_getattr,
6704185029Spjd	.vop_setattr =		zfs_freebsd_setattr,
6705185029Spjd	.vop_create =		zfs_freebsd_create,
6706185029Spjd	.vop_mknod =		zfs_freebsd_create,
6707185029Spjd	.vop_mkdir =		zfs_freebsd_mkdir,
6708185029Spjd	.vop_readdir =		zfs_freebsd_readdir,
6709185029Spjd	.vop_fsync =		zfs_freebsd_fsync,
6710185029Spjd	.vop_open =		zfs_freebsd_open,
6711185029Spjd	.vop_close =		zfs_freebsd_close,
6712185029Spjd	.vop_rmdir =		zfs_freebsd_rmdir,
6713185029Spjd	.vop_ioctl =		zfs_freebsd_ioctl,
6714185029Spjd	.vop_link =		zfs_freebsd_link,
6715185029Spjd	.vop_symlink =		zfs_freebsd_symlink,
6716185029Spjd	.vop_readlink =		zfs_freebsd_readlink,
6717185029Spjd	.vop_read =		zfs_freebsd_read,
6718185029Spjd	.vop_write =		zfs_freebsd_write,
6719185029Spjd	.vop_remove =		zfs_freebsd_remove,
6720185029Spjd	.vop_rename =		zfs_freebsd_rename,
6721185029Spjd	.vop_pathconf =		zfs_freebsd_pathconf,
6722185029Spjd	.vop_bmap =		VOP_EOPNOTSUPP,
6723185029Spjd	.vop_fid =		zfs_freebsd_fid,
6724185029Spjd	.vop_getextattr =	zfs_getextattr,
6725185029Spjd	.vop_deleteextattr =	zfs_deleteextattr,
6726185029Spjd	.vop_setextattr =	zfs_setextattr,
6727185029Spjd	.vop_listextattr =	zfs_listextattr,
6728192800Strasz	.vop_getacl =		zfs_freebsd_getacl,
6729192800Strasz	.vop_setacl =		zfs_freebsd_setacl,
6730192800Strasz	.vop_aclcheck =		zfs_freebsd_aclcheck,
6731213937Savg	.vop_getpages =		zfs_freebsd_getpages,
6732168404Spjd};
6733168404Spjd
6734169170Spjdstruct vop_vector zfs_fifoops = {
6735185029Spjd	.vop_default =		&fifo_specops,
6736200162Skib	.vop_fsync =		zfs_freebsd_fsync,
6737185029Spjd	.vop_access =		zfs_freebsd_access,
6738185029Spjd	.vop_getattr =		zfs_freebsd_getattr,
6739185029Spjd	.vop_inactive =		zfs_freebsd_inactive,
6740185029Spjd	.vop_read =		VOP_PANIC,
6741185029Spjd	.vop_reclaim =		zfs_freebsd_reclaim,
6742185029Spjd	.vop_setattr =		zfs_freebsd_setattr,
6743185029Spjd	.vop_write =		VOP_PANIC,
6744196949Strasz	.vop_pathconf = 	zfs_freebsd_fifo_pathconf,
6745185029Spjd	.vop_fid =		zfs_freebsd_fid,
6746192800Strasz	.vop_getacl =		zfs_freebsd_getacl,
6747192800Strasz	.vop_setacl =		zfs_freebsd_setacl,
6748192800Strasz	.vop_aclcheck =		zfs_freebsd_aclcheck,
6749168404Spjd};
6750209962Smm
6751209962Smm/*
6752209962Smm * special share hidden files vnode operations template
6753209962Smm */
6754209962Smmstruct vop_vector zfs_shareops = {
6755209962Smm	.vop_default =		&default_vnodeops,
6756209962Smm	.vop_access =		zfs_freebsd_access,
6757209962Smm	.vop_inactive =		zfs_freebsd_inactive,
6758209962Smm	.vop_reclaim =		zfs_freebsd_reclaim,
6759209962Smm	.vop_fid =		zfs_freebsd_fid,
6760209962Smm	.vop_pathconf =		zfs_freebsd_pathconf,
6761209962Smm};
6762