zfs_vnops.c revision 249195
1168404Spjd/*
2168404Spjd * CDDL HEADER START
3168404Spjd *
4168404Spjd * The contents of this file are subject to the terms of the
5168404Spjd * Common Development and Distribution License (the "License").
6168404Spjd * You may not use this file except in compliance with the License.
7168404Spjd *
8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9168404Spjd * or http://www.opensolaris.org/os/licensing.
10168404Spjd * See the License for the specific language governing permissions
11168404Spjd * and limitations under the License.
12168404Spjd *
13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each
14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15168404Spjd * If applicable, add the following below this CDDL HEADER, with the
16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying
17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
18168404Spjd *
19168404Spjd * CDDL HEADER END
20168404Spjd */
21168404Spjd/*
22212694Smm * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23249195Smm * Copyright (c) 2013 by Delphix. All rights reserved.
24168404Spjd */
25168404Spjd
26169195Spjd/* Portions Copyright 2007 Jeremy Teo */
27219089Spjd/* Portions Copyright 2010 Robert Milkowski */
28169195Spjd
29168404Spjd#include <sys/types.h>
30168404Spjd#include <sys/param.h>
31168404Spjd#include <sys/time.h>
32168404Spjd#include <sys/systm.h>
33168404Spjd#include <sys/sysmacros.h>
34168404Spjd#include <sys/resource.h>
35168404Spjd#include <sys/vfs.h>
36248084Sattilio#include <sys/vm.h>
37168404Spjd#include <sys/vnode.h>
38168404Spjd#include <sys/file.h>
39168404Spjd#include <sys/stat.h>
40168404Spjd#include <sys/kmem.h>
41168404Spjd#include <sys/taskq.h>
42168404Spjd#include <sys/uio.h>
43168404Spjd#include <sys/atomic.h>
44168404Spjd#include <sys/namei.h>
45168404Spjd#include <sys/mman.h>
46168404Spjd#include <sys/cmn_err.h>
47168404Spjd#include <sys/errno.h>
48168404Spjd#include <sys/unistd.h>
49168404Spjd#include <sys/zfs_dir.h>
50168404Spjd#include <sys/zfs_ioctl.h>
51168404Spjd#include <sys/fs/zfs.h>
52168404Spjd#include <sys/dmu.h>
53219089Spjd#include <sys/dmu_objset.h>
54168404Spjd#include <sys/spa.h>
55168404Spjd#include <sys/txg.h>
56168404Spjd#include <sys/dbuf.h>
57168404Spjd#include <sys/zap.h>
58219089Spjd#include <sys/sa.h>
59168404Spjd#include <sys/dirent.h>
60168962Spjd#include <sys/policy.h>
61168962Spjd#include <sys/sunddi.h>
62168404Spjd#include <sys/filio.h>
63209962Smm#include <sys/sid.h>
64168404Spjd#include <sys/zfs_ctldir.h>
65185029Spjd#include <sys/zfs_fuid.h>
66219089Spjd#include <sys/zfs_sa.h>
67168404Spjd#include <sys/dnlc.h>
68168404Spjd#include <sys/zfs_rlock.h>
69185029Spjd#include <sys/extdirent.h>
70185029Spjd#include <sys/kidmap.h>
71168404Spjd#include <sys/bio.h>
72168404Spjd#include <sys/buf.h>
73168404Spjd#include <sys/sf_buf.h>
74168404Spjd#include <sys/sched.h>
75192800Strasz#include <sys/acl.h>
76239077Smarius#include <vm/vm_param.h>
77215401Savg#include <vm/vm_pageout.h>
78168404Spjd
79168404Spjd/*
80168404Spjd * Programming rules.
81168404Spjd *
82168404Spjd * Each vnode op performs some logical unit of work.  To do this, the ZPL must
83168404Spjd * properly lock its in-core state, create a DMU transaction, do the work,
84168404Spjd * record this work in the intent log (ZIL), commit the DMU transaction,
85185029Spjd * and wait for the intent log to commit if it is a synchronous operation.
86185029Spjd * Moreover, the vnode ops must work in both normal and log replay context.
87168404Spjd * The ordering of events is important to avoid deadlocks and references
88168404Spjd * to freed memory.  The example below illustrates the following Big Rules:
89168404Spjd *
90168404Spjd *  (1) A check must be made in each zfs thread for a mounted file system.
91168404Spjd *	This is done avoiding races using ZFS_ENTER(zfsvfs).
92185029Spjd *      A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
93185029Spjd *      must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
94185029Spjd *      can return EIO from the calling function.
95168404Spjd *
96168404Spjd *  (2)	VN_RELE() should always be the last thing except for zil_commit()
97168404Spjd *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
98168404Spjd *	First, if it's the last reference, the vnode/znode
99168404Spjd *	can be freed, so the zp may point to freed memory.  Second, the last
100168404Spjd *	reference will call zfs_zinactive(), which may induce a lot of work --
101168404Spjd *	pushing cached pages (which acquires range locks) and syncing out
102168404Spjd *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
103168404Spjd *	which could deadlock the system if you were already holding one.
104191900Skmacy *	If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
105168404Spjd *
106168404Spjd *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
107168404Spjd *	as they can span dmu_tx_assign() calls.
108168404Spjd *
109209962Smm *  (4)	Always pass TXG_NOWAIT as the second argument to dmu_tx_assign().
110168404Spjd *	This is critical because we don't want to block while holding locks.
111168404Spjd *	Note, in particular, that if a lock is sometimes acquired before
112168404Spjd *	the tx assigns, and sometimes after (e.g. z_lock), then failing to
113168404Spjd *	use a non-blocking assign can deadlock the system.  The scenario:
114168404Spjd *
115168404Spjd *	Thread A has grabbed a lock before calling dmu_tx_assign().
116168404Spjd *	Thread B is in an already-assigned tx, and blocks for this lock.
117168404Spjd *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
118168404Spjd *	forever, because the previous txg can't quiesce until B's tx commits.
119168404Spjd *
120168404Spjd *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
121168404Spjd *	then drop all locks, call dmu_tx_wait(), and try again.
122168404Spjd *
123168404Spjd *  (5)	If the operation succeeded, generate the intent log entry for it
124168404Spjd *	before dropping locks.  This ensures that the ordering of events
125168404Spjd *	in the intent log matches the order in which they actually occurred.
126209962Smm *      During ZIL replay the zfs_log_* functions will update the sequence
127209962Smm *	number to indicate the zil transaction has replayed.
128168404Spjd *
129168404Spjd *  (6)	At the end of each vnode op, the DMU tx must always commit,
130168404Spjd *	regardless of whether there were any errors.
131168404Spjd *
132219089Spjd *  (7)	After dropping all locks, invoke zil_commit(zilog, foid)
133168404Spjd *	to ensure that synchronous semantics are provided when necessary.
134168404Spjd *
135168404Spjd * In general, this is how things should be ordered in each vnode op:
136168404Spjd *
137168404Spjd *	ZFS_ENTER(zfsvfs);		// exit if unmounted
138168404Spjd * top:
139168404Spjd *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may VN_HOLD())
140168404Spjd *	rw_enter(...);			// grab any other locks you need
141168404Spjd *	tx = dmu_tx_create(...);	// get DMU tx
142168404Spjd *	dmu_tx_hold_*();		// hold each object you might modify
143209962Smm *	error = dmu_tx_assign(tx, TXG_NOWAIT);	// try to assign
144168404Spjd *	if (error) {
145168404Spjd *		rw_exit(...);		// drop locks
146168404Spjd *		zfs_dirent_unlock(dl);	// unlock directory entry
147168404Spjd *		VN_RELE(...);		// release held vnodes
148209962Smm *		if (error == ERESTART) {
149168404Spjd *			dmu_tx_wait(tx);
150168404Spjd *			dmu_tx_abort(tx);
151168404Spjd *			goto top;
152168404Spjd *		}
153168404Spjd *		dmu_tx_abort(tx);	// abort DMU tx
154168404Spjd *		ZFS_EXIT(zfsvfs);	// finished in zfs
155168404Spjd *		return (error);		// really out of space
156168404Spjd *	}
157168404Spjd *	error = do_real_work();		// do whatever this VOP does
158168404Spjd *	if (error == 0)
159168404Spjd *		zfs_log_*(...);		// on success, make ZIL entry
160168404Spjd *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
161168404Spjd *	rw_exit(...);			// drop locks
162168404Spjd *	zfs_dirent_unlock(dl);		// unlock directory entry
163168404Spjd *	VN_RELE(...);			// release held vnodes
164219089Spjd *	zil_commit(zilog, foid);	// synchronous when necessary
165168404Spjd *	ZFS_EXIT(zfsvfs);		// finished in zfs
166168404Spjd *	return (error);			// done, report error
167168404Spjd */
168185029Spjd
169168404Spjd/* ARGSUSED */
170168404Spjdstatic int
171185029Spjdzfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
172168404Spjd{
173168962Spjd	znode_t	*zp = VTOZ(*vpp);
174209962Smm	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
175168404Spjd
176209962Smm	ZFS_ENTER(zfsvfs);
177209962Smm	ZFS_VERIFY_ZP(zp);
178209962Smm
179219089Spjd	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
180185029Spjd	    ((flag & FAPPEND) == 0)) {
181209962Smm		ZFS_EXIT(zfsvfs);
182249195Smm		return (SET_ERROR(EPERM));
183185029Spjd	}
184185029Spjd
185185029Spjd	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
186185029Spjd	    ZTOV(zp)->v_type == VREG &&
187219089Spjd	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
188209962Smm		if (fs_vscan(*vpp, cr, 0) != 0) {
189209962Smm			ZFS_EXIT(zfsvfs);
190249195Smm			return (SET_ERROR(EACCES));
191209962Smm		}
192209962Smm	}
193185029Spjd
194168404Spjd	/* Keep a count of the synchronous opens in the znode */
195168962Spjd	if (flag & (FSYNC | FDSYNC))
196168404Spjd		atomic_inc_32(&zp->z_sync_cnt);
197185029Spjd
198209962Smm	ZFS_EXIT(zfsvfs);
199168404Spjd	return (0);
200168404Spjd}
201168404Spjd
202168404Spjd/* ARGSUSED */
203168404Spjdstatic int
204185029Spjdzfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
205185029Spjd    caller_context_t *ct)
206168404Spjd{
207168962Spjd	znode_t	*zp = VTOZ(vp);
208209962Smm	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
209168404Spjd
210210470Smm	/*
211210470Smm	 * Clean up any locks held by this process on the vp.
212210470Smm	 */
213210470Smm	cleanlocks(vp, ddi_get_pid(), 0);
214210470Smm	cleanshares(vp, ddi_get_pid());
215210470Smm
216209962Smm	ZFS_ENTER(zfsvfs);
217209962Smm	ZFS_VERIFY_ZP(zp);
218209962Smm
219168404Spjd	/* Decrement the synchronous opens in the znode */
220185029Spjd	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
221168404Spjd		atomic_dec_32(&zp->z_sync_cnt);
222168404Spjd
223185029Spjd	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
224185029Spjd	    ZTOV(zp)->v_type == VREG &&
225219089Spjd	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
226185029Spjd		VERIFY(fs_vscan(vp, cr, 1) == 0);
227185029Spjd
228209962Smm	ZFS_EXIT(zfsvfs);
229168404Spjd	return (0);
230168404Spjd}
231168404Spjd
232168404Spjd/*
233168404Spjd * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
234168404Spjd * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
235168404Spjd */
236168404Spjdstatic int
237168978Spjdzfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
238168404Spjd{
239168404Spjd	znode_t	*zp = VTOZ(vp);
240168404Spjd	uint64_t noff = (uint64_t)*off; /* new offset */
241168404Spjd	uint64_t file_sz;
242168404Spjd	int error;
243168404Spjd	boolean_t hole;
244168404Spjd
245219089Spjd	file_sz = zp->z_size;
246168404Spjd	if (noff >= file_sz)  {
247249195Smm		return (SET_ERROR(ENXIO));
248168404Spjd	}
249168404Spjd
250168962Spjd	if (cmd == _FIO_SEEK_HOLE)
251168404Spjd		hole = B_TRUE;
252168404Spjd	else
253168404Spjd		hole = B_FALSE;
254168404Spjd
255168404Spjd	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
256168404Spjd
257168404Spjd	/* end of file? */
258168404Spjd	if ((error == ESRCH) || (noff > file_sz)) {
259168404Spjd		/*
260168404Spjd		 * Handle the virtual hole at the end of file.
261168404Spjd		 */
262168404Spjd		if (hole) {
263168404Spjd			*off = file_sz;
264168404Spjd			return (0);
265168404Spjd		}
266249195Smm		return (SET_ERROR(ENXIO));
267168404Spjd	}
268168404Spjd
269168404Spjd	if (noff < *off)
270168404Spjd		return (error);
271168404Spjd	*off = noff;
272168404Spjd	return (error);
273168404Spjd}
274168404Spjd
275168404Spjd/* ARGSUSED */
276168404Spjdstatic int
277168978Spjdzfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
278185029Spjd    int *rvalp, caller_context_t *ct)
279168404Spjd{
280168962Spjd	offset_t off;
281168962Spjd	int error;
282168962Spjd	zfsvfs_t *zfsvfs;
283185029Spjd	znode_t *zp;
284168404Spjd
285168404Spjd	switch (com) {
286185029Spjd	case _FIOFFS:
287168962Spjd		return (0);
288168404Spjd
289168962Spjd		/*
290168962Spjd		 * The following two ioctls are used by bfu.  Faking out,
291168962Spjd		 * necessary to avoid bfu errors.
292168962Spjd		 */
293185029Spjd	case _FIOGDIO:
294185029Spjd	case _FIOSDIO:
295168962Spjd		return (0);
296168962Spjd
297185029Spjd	case _FIO_SEEK_DATA:
298185029Spjd	case _FIO_SEEK_HOLE:
299233918Savg#ifdef sun
300168962Spjd		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
301249195Smm			return (SET_ERROR(EFAULT));
302233918Savg#else
303233918Savg		off = *(offset_t *)data;
304233918Savg#endif
305185029Spjd		zp = VTOZ(vp);
306185029Spjd		zfsvfs = zp->z_zfsvfs;
307168404Spjd		ZFS_ENTER(zfsvfs);
308185029Spjd		ZFS_VERIFY_ZP(zp);
309168404Spjd
310168404Spjd		/* offset parameter is in/out */
311168404Spjd		error = zfs_holey(vp, com, &off);
312168404Spjd		ZFS_EXIT(zfsvfs);
313168404Spjd		if (error)
314168404Spjd			return (error);
315233918Savg#ifdef sun
316168962Spjd		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
317249195Smm			return (SET_ERROR(EFAULT));
318233918Savg#else
319233918Savg		*(offset_t *)data = off;
320233918Savg#endif
321168404Spjd		return (0);
322168404Spjd	}
323249195Smm	return (SET_ERROR(ENOTTY));
324168404Spjd}
325168404Spjd
326209962Smmstatic vm_page_t
327246293Savgpage_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
328209962Smm{
329209962Smm	vm_object_t obj;
330209962Smm	vm_page_t pp;
331209962Smm
332209962Smm	obj = vp->v_object;
333248084Sattilio	zfs_vmobject_assert_wlocked(obj);
334209962Smm
335209962Smm	for (;;) {
336209962Smm		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
337246293Savg		    pp->valid) {
338212652Savg			if ((pp->oflags & VPO_BUSY) != 0) {
339212652Savg				/*
340212652Savg				 * Reference the page before unlocking and
341212652Savg				 * sleeping so that the page daemon is less
342212652Savg				 * likely to reclaim it.
343212652Savg				 */
344225418Skib				vm_page_reference(pp);
345212652Savg				vm_page_sleep(pp, "zfsmwb");
346209962Smm				continue;
347212652Savg			}
348209962Smm		} else {
349246293Savg			pp = vm_page_alloc(obj, OFF_TO_IDX(start),
350246293Savg			    VM_ALLOC_SYSTEM | VM_ALLOC_IFCACHED |
351246293Savg			    VM_ALLOC_NOBUSY);
352209962Smm		}
353246293Savg
354246293Savg		if (pp != NULL) {
355246293Savg			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
356246293Savg			vm_object_pip_add(obj, 1);
357246293Savg			vm_page_io_start(pp);
358246293Savg			pmap_remove_write(pp);
359246293Savg			vm_page_clear_dirty(pp, off, nbytes);
360246293Savg		}
361209962Smm		break;
362209962Smm	}
363209962Smm	return (pp);
364209962Smm}
365209962Smm
366209962Smmstatic void
367246293Savgpage_unbusy(vm_page_t pp)
368209962Smm{
369209962Smm
370246293Savg	vm_page_io_finish(pp);
371246293Savg	vm_object_pip_subtract(pp->object, 1);
372209962Smm}
373209962Smm
374246293Savgstatic vm_page_t
375246293Savgpage_hold(vnode_t *vp, int64_t start)
376246293Savg{
377246293Savg	vm_object_t obj;
378246293Savg	vm_page_t pp;
379246293Savg
380246293Savg	obj = vp->v_object;
381248084Sattilio	zfs_vmobject_assert_wlocked(obj);
382246293Savg
383246293Savg	for (;;) {
384246293Savg		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
385246293Savg		    pp->valid) {
386246293Savg			if ((pp->oflags & VPO_BUSY) != 0) {
387246293Savg				/*
388246293Savg				 * Reference the page before unlocking and
389246293Savg				 * sleeping so that the page daemon is less
390246293Savg				 * likely to reclaim it.
391246293Savg				 */
392246293Savg				vm_page_reference(pp);
393246293Savg				vm_page_sleep(pp, "zfsmwb");
394246293Savg				continue;
395246293Savg			}
396246293Savg
397246293Savg			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
398246293Savg			vm_page_lock(pp);
399246293Savg			vm_page_hold(pp);
400246293Savg			vm_page_unlock(pp);
401246293Savg
402246293Savg		} else
403246293Savg			pp = NULL;
404246293Savg		break;
405246293Savg	}
406246293Savg	return (pp);
407246293Savg}
408246293Savg
409246293Savgstatic void
410246293Savgpage_unhold(vm_page_t pp)
411246293Savg{
412246293Savg
413246293Savg	vm_page_lock(pp);
414246293Savg	vm_page_unhold(pp);
415246293Savg	vm_page_unlock(pp);
416246293Savg}
417246293Savg
418209962Smmstatic caddr_t
419209962Smmzfs_map_page(vm_page_t pp, struct sf_buf **sfp)
420209962Smm{
421209962Smm
422212951Savg	*sfp = sf_buf_alloc(pp, 0);
423209962Smm	return ((caddr_t)sf_buf_kva(*sfp));
424209962Smm}
425209962Smm
/*
 * Tear down a mapping created by zfs_map_page() by freeing its sf_buf.
 */
static void
zfs_unmap_page(struct sf_buf *sf)
{

	sf_buf_free(sf);
}
432209962Smm
433168404Spjd/*
434168404Spjd * When a file is memory mapped, we must keep the IO data synchronized
435168404Spjd * between the DMU cache and the memory mapped pages.  What this means:
436168404Spjd *
437168404Spjd * On Write:	If we find a memory mapped page, we write to *both*
438168404Spjd *		the page and the dmu buffer.
439168404Spjd */
440209962Smmstatic void
441209962Smmupdate_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
442209962Smm    int segflg, dmu_tx_t *tx)
443168404Spjd{
444168404Spjd	vm_object_t obj;
445168404Spjd	struct sf_buf *sf;
446246293Savg	caddr_t va;
447212655Savg	int off;
448168404Spjd
449168404Spjd	ASSERT(vp->v_mount != NULL);
450168404Spjd	obj = vp->v_object;
451168404Spjd	ASSERT(obj != NULL);
452168404Spjd
453168404Spjd	off = start & PAGEOFFSET;
454248084Sattilio	zfs_vmobject_wlock(obj);
455168404Spjd	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
456209962Smm		vm_page_t pp;
457246293Savg		int nbytes = imin(PAGESIZE - off, len);
458168404Spjd
459246293Savg		if (segflg == UIO_NOCOPY) {
460246293Savg			pp = vm_page_lookup(obj, OFF_TO_IDX(start));
461246293Savg			KASSERT(pp != NULL,
462246293Savg			    ("zfs update_pages: NULL page in putpages case"));
463246293Savg			KASSERT(off == 0,
464246293Savg			    ("zfs update_pages: unaligned data in putpages case"));
465246293Savg			KASSERT(pp->valid == VM_PAGE_BITS_ALL,
466246293Savg			    ("zfs update_pages: invalid page in putpages case"));
467246293Savg			KASSERT(pp->busy > 0,
468246293Savg			    ("zfs update_pages: unbusy page in putpages case"));
469246293Savg			KASSERT(!pmap_page_is_write_mapped(pp),
470246293Savg			    ("zfs update_pages: writable page in putpages case"));
471248084Sattilio			zfs_vmobject_wunlock(obj);
472168404Spjd
473246293Savg			va = zfs_map_page(pp, &sf);
474246293Savg			(void) dmu_write(os, oid, start, nbytes, va, tx);
475246293Savg			zfs_unmap_page(sf);
476246293Savg
477248084Sattilio			zfs_vmobject_wlock(obj);
478246293Savg			vm_page_undirty(pp);
479246293Savg		} else if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
480248084Sattilio			zfs_vmobject_wunlock(obj);
481246293Savg
482209962Smm			va = zfs_map_page(pp, &sf);
483246293Savg			(void) dmu_read(os, oid, start+off, nbytes,
484246293Savg			    va+off, DMU_READ_PREFETCH);;
485209962Smm			zfs_unmap_page(sf);
486246293Savg
487248084Sattilio			zfs_vmobject_wlock(obj);
488246293Savg			page_unbusy(pp);
489168404Spjd		}
490209962Smm		len -= nbytes;
491168404Spjd		off = 0;
492168404Spjd	}
493246293Savg	if (segflg != UIO_NOCOPY)
494246293Savg		vm_object_pip_wakeupn(obj, 0);
495248084Sattilio	zfs_vmobject_wunlock(obj);
496168404Spjd}
497168404Spjd
498168404Spjd/*
499219089Spjd * Read with UIO_NOCOPY flag means that sendfile(2) requests
500219089Spjd * ZFS to populate a range of page cache pages with data.
501219089Spjd *
502219089Spjd * NOTE: this function could be optimized to pre-allocate
503219089Spjd * all pages in advance, drain VPO_BUSY on all of them,
504219089Spjd * map them into contiguous KVA region and populate them
505219089Spjd * in one single dmu_read() call.
506219089Spjd */
507219089Spjdstatic int
508219089Spjdmappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
509219089Spjd{
510219089Spjd	znode_t *zp = VTOZ(vp);
511219089Spjd	objset_t *os = zp->z_zfsvfs->z_os;
512219089Spjd	struct sf_buf *sf;
513219089Spjd	vm_object_t obj;
514219089Spjd	vm_page_t pp;
515219089Spjd	int64_t start;
516219089Spjd	caddr_t va;
517219089Spjd	int len = nbytes;
518219089Spjd	int off;
519219089Spjd	int error = 0;
520219089Spjd
521219089Spjd	ASSERT(uio->uio_segflg == UIO_NOCOPY);
522219089Spjd	ASSERT(vp->v_mount != NULL);
523219089Spjd	obj = vp->v_object;
524219089Spjd	ASSERT(obj != NULL);
525219089Spjd	ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);
526219089Spjd
527248084Sattilio	zfs_vmobject_wlock(obj);
528219089Spjd	for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
529219089Spjd		int bytes = MIN(PAGESIZE, len);
530219089Spjd
531219089Spjd		pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_NOBUSY |
532219089Spjd		    VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_IGN_SBUSY);
533219089Spjd		if (pp->valid == 0) {
534219089Spjd			vm_page_io_start(pp);
535248084Sattilio			zfs_vmobject_wunlock(obj);
536219089Spjd			va = zfs_map_page(pp, &sf);
537219089Spjd			error = dmu_read(os, zp->z_id, start, bytes, va,
538219089Spjd			    DMU_READ_PREFETCH);
539219089Spjd			if (bytes != PAGESIZE && error == 0)
540219089Spjd				bzero(va + bytes, PAGESIZE - bytes);
541219089Spjd			zfs_unmap_page(sf);
542248084Sattilio			zfs_vmobject_wlock(obj);
543219089Spjd			vm_page_io_finish(pp);
544219089Spjd			vm_page_lock(pp);
545219089Spjd			if (error) {
546219089Spjd				vm_page_free(pp);
547219089Spjd			} else {
548219089Spjd				pp->valid = VM_PAGE_BITS_ALL;
549219089Spjd				vm_page_activate(pp);
550219089Spjd			}
551219089Spjd			vm_page_unlock(pp);
552219089Spjd		}
553219089Spjd		if (error)
554219089Spjd			break;
555219089Spjd		uio->uio_resid -= bytes;
556219089Spjd		uio->uio_offset += bytes;
557219089Spjd		len -= bytes;
558219089Spjd	}
559248084Sattilio	zfs_vmobject_wunlock(obj);
560219089Spjd	return (error);
561219089Spjd}
562219089Spjd
563219089Spjd/*
564168404Spjd * When a file is memory mapped, we must keep the IO data synchronized
565168404Spjd * between the DMU cache and the memory mapped pages.  What this means:
566168404Spjd *
567168404Spjd * On Read:	We "read" preferentially from memory mapped pages,
568168404Spjd *		else we default from the dmu buffer.
569168404Spjd *
570168404Spjd * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
571168404Spjd *	the file is memory mapped.
572168404Spjd */
573168404Spjdstatic int
574168404Spjdmappedread(vnode_t *vp, int nbytes, uio_t *uio)
575168404Spjd{
576168404Spjd	znode_t *zp = VTOZ(vp);
577168404Spjd	objset_t *os = zp->z_zfsvfs->z_os;
578168404Spjd	vm_object_t obj;
579212655Savg	int64_t start;
580168926Spjd	caddr_t va;
581168404Spjd	int len = nbytes;
582212655Savg	int off;
583168404Spjd	int error = 0;
584168404Spjd
585168404Spjd	ASSERT(vp->v_mount != NULL);
586168404Spjd	obj = vp->v_object;
587168404Spjd	ASSERT(obj != NULL);
588168404Spjd
589168404Spjd	start = uio->uio_loffset;
590168404Spjd	off = start & PAGEOFFSET;
591248084Sattilio	zfs_vmobject_wlock(obj);
592168404Spjd	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
593219089Spjd		vm_page_t pp;
594219089Spjd		uint64_t bytes = MIN(PAGESIZE - off, len);
595168404Spjd
596246293Savg		if (pp = page_hold(vp, start)) {
597219089Spjd			struct sf_buf *sf;
598219089Spjd			caddr_t va;
599212652Savg
600248084Sattilio			zfs_vmobject_wunlock(obj);
601219089Spjd			va = zfs_map_page(pp, &sf);
602219089Spjd			error = uiomove(va + off, bytes, UIO_READ, uio);
603219089Spjd			zfs_unmap_page(sf);
604248084Sattilio			zfs_vmobject_wlock(obj);
605246293Savg			page_unhold(pp);
606219089Spjd		} else {
607248084Sattilio			zfs_vmobject_wunlock(obj);
608219089Spjd			error = dmu_read_uio(os, zp->z_id, uio, bytes);
609248084Sattilio			zfs_vmobject_wlock(obj);
610168404Spjd		}
611168404Spjd		len -= bytes;
612168404Spjd		off = 0;
613168404Spjd		if (error)
614168404Spjd			break;
615168404Spjd	}
616248084Sattilio	zfs_vmobject_wunlock(obj);
617168404Spjd	return (error);
618168404Spjd}
619168404Spjd
620168404Spjdoffset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
621168404Spjd
622168404Spjd/*
623168404Spjd * Read bytes from specified file into supplied buffer.
624168404Spjd *
625168404Spjd *	IN:	vp	- vnode of file to be read from.
626168404Spjd *		uio	- structure supplying read location, range info,
627168404Spjd *			  and return buffer.
628168404Spjd *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
629168404Spjd *		cr	- credentials of caller.
630185029Spjd *		ct	- caller context
631168404Spjd *
632168404Spjd *	OUT:	uio	- updated offset and range, buffer filled.
633168404Spjd *
634168404Spjd *	RETURN:	0 if success
635168404Spjd *		error code if failure
636168404Spjd *
637168404Spjd * Side Effects:
638168404Spjd *	vp - atime updated if byte count > 0
639168404Spjd */
640168404Spjd/* ARGSUSED */
641168404Spjdstatic int
642168962Spjdzfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
643168404Spjd{
644168404Spjd	znode_t		*zp = VTOZ(vp);
645168404Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
646185029Spjd	objset_t	*os;
647168404Spjd	ssize_t		n, nbytes;
648247187Smm	int		error = 0;
649168404Spjd	rl_t		*rl;
650219089Spjd	xuio_t		*xuio = NULL;
651168404Spjd
652168404Spjd	ZFS_ENTER(zfsvfs);
653185029Spjd	ZFS_VERIFY_ZP(zp);
654185029Spjd	os = zfsvfs->z_os;
655168404Spjd
656219089Spjd	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
657185029Spjd		ZFS_EXIT(zfsvfs);
658249195Smm		return (SET_ERROR(EACCES));
659185029Spjd	}
660185029Spjd
661168404Spjd	/*
662168404Spjd	 * Validate file offset
663168404Spjd	 */
664168404Spjd	if (uio->uio_loffset < (offset_t)0) {
665168404Spjd		ZFS_EXIT(zfsvfs);
666249195Smm		return (SET_ERROR(EINVAL));
667168404Spjd	}
668168404Spjd
669168404Spjd	/*
670168404Spjd	 * Fasttrack empty reads
671168404Spjd	 */
672168404Spjd	if (uio->uio_resid == 0) {
673168404Spjd		ZFS_EXIT(zfsvfs);
674168404Spjd		return (0);
675168404Spjd	}
676168404Spjd
677168404Spjd	/*
678168962Spjd	 * Check for mandatory locks
679168962Spjd	 */
680219089Spjd	if (MANDMODE(zp->z_mode)) {
681168962Spjd		if (error = chklock(vp, FREAD,
682168962Spjd		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
683168962Spjd			ZFS_EXIT(zfsvfs);
684168962Spjd			return (error);
685168962Spjd		}
686168962Spjd	}
687168962Spjd
688168962Spjd	/*
689168404Spjd	 * If we're in FRSYNC mode, sync out this znode before reading it.
690168404Spjd	 */
691224605Smm	if (zfsvfs->z_log &&
692224605Smm	    (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
693219089Spjd		zil_commit(zfsvfs->z_log, zp->z_id);
694168404Spjd
695168404Spjd	/*
696168404Spjd	 * Lock the range against changes.
697168404Spjd	 */
698168404Spjd	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
699168404Spjd
700168404Spjd	/*
701168404Spjd	 * If we are reading past end-of-file we can skip
702168404Spjd	 * to the end; but we might still need to set atime.
703168404Spjd	 */
704219089Spjd	if (uio->uio_loffset >= zp->z_size) {
705168404Spjd		error = 0;
706168404Spjd		goto out;
707168404Spjd	}
708168404Spjd
709219089Spjd	ASSERT(uio->uio_loffset < zp->z_size);
710219089Spjd	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
711168404Spjd
712219089Spjd#ifdef sun
713219089Spjd	if ((uio->uio_extflg == UIO_XUIO) &&
714219089Spjd	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
715219089Spjd		int nblk;
716219089Spjd		int blksz = zp->z_blksz;
717219089Spjd		uint64_t offset = uio->uio_loffset;
718219089Spjd
719219089Spjd		xuio = (xuio_t *)uio;
720219089Spjd		if ((ISP2(blksz))) {
721219089Spjd			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
722219089Spjd			    blksz)) / blksz;
723219089Spjd		} else {
724219089Spjd			ASSERT(offset + n <= blksz);
725219089Spjd			nblk = 1;
726219089Spjd		}
727219089Spjd		(void) dmu_xuio_init(xuio, nblk);
728219089Spjd
729219089Spjd		if (vn_has_cached_data(vp)) {
730219089Spjd			/*
731219089Spjd			 * For simplicity, we always allocate a full buffer
732219089Spjd			 * even if we only expect to read a portion of a block.
733219089Spjd			 */
734219089Spjd			while (--nblk >= 0) {
735219089Spjd				(void) dmu_xuio_add(xuio,
736219089Spjd				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
737219089Spjd				    blksz), 0, blksz);
738219089Spjd			}
739219089Spjd		}
740219089Spjd	}
741219089Spjd#endif	/* sun */
742219089Spjd
743168404Spjd	while (n > 0) {
744168404Spjd		nbytes = MIN(n, zfs_read_chunk_size -
745168404Spjd		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
746168404Spjd
747219089Spjd#ifdef __FreeBSD__
748219089Spjd		if (uio->uio_segflg == UIO_NOCOPY)
749219089Spjd			error = mappedread_sf(vp, nbytes, uio);
750219089Spjd		else
751219089Spjd#endif /* __FreeBSD__ */
752168404Spjd		if (vn_has_cached_data(vp))
753168404Spjd			error = mappedread(vp, nbytes, uio);
754168404Spjd		else
755168404Spjd			error = dmu_read_uio(os, zp->z_id, uio, nbytes);
756185029Spjd		if (error) {
757185029Spjd			/* convert checksum errors into IO errors */
758185029Spjd			if (error == ECKSUM)
759249195Smm				error = SET_ERROR(EIO);
760168404Spjd			break;
761185029Spjd		}
762168962Spjd
763168404Spjd		n -= nbytes;
764168404Spjd	}
765168404Spjdout:
766168404Spjd	zfs_range_unlock(rl);
767168404Spjd
768168404Spjd	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
769168404Spjd	ZFS_EXIT(zfsvfs);
770168404Spjd	return (error);
771168404Spjd}
772168404Spjd
773168404Spjd/*
774168404Spjd * Write the bytes to a file.
775168404Spjd *
776168404Spjd *	IN:	vp	- vnode of file to be written to.
777168404Spjd *		uio	- structure supplying write location, range info,
778168404Spjd *			  and data buffer.
779213673Spjd *		ioflag	- FAPPEND flag set if in append mode.
780168404Spjd *		cr	- credentials of caller.
781185029Spjd *		ct	- caller context (NFS/CIFS fem monitor only)
782168404Spjd *
783168404Spjd *	OUT:	uio	- updated offset and range.
784168404Spjd *
785168404Spjd *	RETURN:	0 if success
786168404Spjd *		error code if failure
787168404Spjd *
788168404Spjd * Timestamps:
789168404Spjd *	vp - ctime|mtime updated if byte count > 0
790168404Spjd */
791219089Spjd
/*
 * zfs_write: copy the caller's uio into the file.  The data is written in
 * chunks of at most max_blksz bytes, each chunk in its own DMU transaction,
 * while a range lock taken before the loop is held on the file.  The znode
 * size/timestamps/flags are pushed with sa_bulk_update() and a TX_WRITE
 * record is logged after every chunk.
 */
792168404Spjd/* ARGSUSED */
793168404Spjdstatic int
794168962Spjdzfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
795168404Spjd{
796168404Spjd	znode_t		*zp = VTOZ(vp);
797168962Spjd	rlim64_t	limit = MAXOFFSET_T;
798168404Spjd	ssize_t		start_resid = uio->uio_resid;
799168404Spjd	ssize_t		tx_bytes;
800168404Spjd	uint64_t	end_size;
801168404Spjd	dmu_tx_t	*tx;
802168404Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
803185029Spjd	zilog_t		*zilog;
804168404Spjd	offset_t	woff;
805168404Spjd	ssize_t		n, nbytes;
806168404Spjd	rl_t		*rl;
807168404Spjd	int		max_blksz = zfsvfs->z_max_blksz;
808247187Smm	int		error = 0;
809209962Smm	arc_buf_t	*abuf;
810247187Smm	iovec_t		*aiov = NULL;
811219089Spjd	xuio_t		*xuio = NULL;
812219089Spjd	int		i_iov = 0;
813219089Spjd	int		iovcnt = uio->uio_iovcnt;
814219089Spjd	iovec_t		*iovp = uio->uio_iov;
815219089Spjd	int		write_eof;
816219089Spjd	int		count = 0;
817219089Spjd	sa_bulk_attr_t	bulk[4];
818219089Spjd	uint64_t	mtime[2], ctime[2];
819168404Spjd
820168404Spjd	/*
821168404Spjd	 * Fasttrack empty write
822168404Spjd	 */
823168404Spjd	n = start_resid;
824168404Spjd	if (n == 0)
825168404Spjd		return (0);
826168404Spjd
	/*
	 * NOTE: limit is initialized to MAXOFFSET_T in its declaration and is
	 * not modified before this point, so in this version the clamp below
	 * can only re-assign the same value.
	 */
827168962Spjd	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
828168962Spjd		limit = MAXOFFSET_T;
829168962Spjd
830168404Spjd	ZFS_ENTER(zfsvfs);
831185029Spjd	ZFS_VERIFY_ZP(zp);
832168404Spjd
	/*
	 * Describe the four attributes (mtime, ctime, size, flags) that are
	 * written together by sa_bulk_update() after each chunk in the loop
	 * below.  count ends up as the number of entries filled in bulk[].
	 */
833219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
834219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
835219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
836219089Spjd	    &zp->z_size, 8);
837219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
838219089Spjd	    &zp->z_pflags, 8);
839219089Spjd
840168404Spjd	/*
841185029Spjd	 * If immutable or not appending then return EPERM
842185029Spjd	 */
843219089Spjd	if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
844219089Spjd	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
845219089Spjd	    (uio->uio_loffset < zp->z_size))) {
846185029Spjd		ZFS_EXIT(zfsvfs);
847249195Smm		return (SET_ERROR(EPERM));
848185029Spjd	}
849185029Spjd
850185029Spjd	zilog = zfsvfs->z_log;
851185029Spjd
852185029Spjd	/*
853219089Spjd	 * Validate file offset
854219089Spjd	 */
855219089Spjd	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
856219089Spjd	if (woff < 0) {
857219089Spjd		ZFS_EXIT(zfsvfs);
858249195Smm		return (SET_ERROR(EINVAL));
859219089Spjd	}
860219089Spjd
861219089Spjd	/*
862219089Spjd	 * Check for mandatory locks before calling zfs_range_lock()
863219089Spjd	 * in order to prevent a deadlock with locks set via fcntl().
864219089Spjd	 */
865219089Spjd	if (MANDMODE((mode_t)zp->z_mode) &&
866219089Spjd	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
867219089Spjd		ZFS_EXIT(zfsvfs);
868219089Spjd		return (error);
869219089Spjd	}
870219089Spjd
871219089Spjd#ifdef sun
872219089Spjd	/*
873168404Spjd	 * Pre-fault the pages to ensure slow (eg NFS) pages
874168404Spjd	 * don't hold up txg.
875219089Spjd	 * Skip this if uio contains loaned arc_buf.
876168404Spjd	 */
877219089Spjd	if ((uio->uio_extflg == UIO_XUIO) &&
878219089Spjd	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
879219089Spjd		xuio = (xuio_t *)uio;
880219089Spjd	else
881219089Spjd		uio_prefaultpages(MIN(n, max_blksz), uio);
882219089Spjd#endif	/* sun */
883168404Spjd
884168404Spjd	/*
885168404Spjd	 * If in append mode, set the io offset pointer to eof.
886168404Spjd	 */
887213673Spjd	if (ioflag & FAPPEND) {
888168404Spjd		/*
889219089Spjd		 * Obtain an appending range lock to guarantee file append
890219089Spjd		 * semantics.  We reset the write offset once we have the lock.
891168404Spjd		 */
892168404Spjd		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
893219089Spjd		woff = rl->r_off;
894168404Spjd		if (rl->r_len == UINT64_MAX) {
895219089Spjd			/*
896219089Spjd			 * We overlocked the file because this write will cause
897219089Spjd			 * the file block size to increase.
898219089Spjd			 * Note that zp->z_size cannot change with this lock held.
899219089Spjd			 */
900219089Spjd			woff = zp->z_size;
901168404Spjd		}
902219089Spjd		uio->uio_loffset = woff;
903168404Spjd	} else {
904168404Spjd		/*
905219089Spjd		 * Note that if the file block size will change as a result of
906219089Spjd		 * this write, then this range lock will lock the entire file
907219089Spjd		 * so that we can re-write the block safely.
908168404Spjd		 */
909168404Spjd		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
910168404Spjd	}
911168404Spjd
	/*
	 * From here on the range lock is held: every return path below
	 * drops it with zfs_range_unlock() before ZFS_EXIT().
	 * A non-zero result from vn_rlimit_fsize() is reported as EFBIG.
	 */
912235781Strasz	if (vn_rlimit_fsize(vp, uio, uio->uio_td)) {
913235781Strasz		zfs_range_unlock(rl);
914235781Strasz		ZFS_EXIT(zfsvfs);
915235781Strasz		return (EFBIG);
916235781Strasz	}
917235781Strasz
918168962Spjd	if (woff >= limit) {
919168962Spjd		zfs_range_unlock(rl);
920168962Spjd		ZFS_EXIT(zfsvfs);
921249195Smm		return (SET_ERROR(EFBIG));
922168962Spjd	}
923168962Spjd
	/* Shorten the request so that it ends at limit. */
924168962Spjd	if ((woff + n) > limit || woff > (limit - n))
925168962Spjd		n = limit - woff;
926168962Spjd
927219089Spjd	/* Will this write extend the file length? */
928219089Spjd	write_eof = (woff + n > zp->z_size);
929168404Spjd
930219089Spjd	end_size = MAX(zp->z_size, woff + n);
931219089Spjd
932168404Spjd	/*
933168404Spjd	 * Write the file in reasonable size chunks.  Each chunk is written
934168404Spjd	 * in a separate transaction; this keeps the intent log records small
935168404Spjd	 * and allows us to do more fine-grained space accounting.
936168404Spjd	 */
937168404Spjd	while (n > 0) {
938209962Smm		abuf = NULL;
939209962Smm		woff = uio->uio_loffset;
940209962Smmagain:
941219089Spjd		if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
942219089Spjd		    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
943209962Smm			if (abuf != NULL)
944209962Smm				dmu_return_arcbuf(abuf);
945249195Smm			error = SET_ERROR(EDQUOT);
946209962Smm			break;
947209962Smm		}
948209962Smm
		/*
		 * xuio path: take the arc buffer associated with the next
		 * iovec and clear it from the xuio.  xuio is only assigned
		 * inside the "#ifdef sun" block above; otherwise it stays
		 * NULL and only the else-if branch can be taken.
		 * The "abuf == NULL" tests keep a buffer obtained before a
		 * "goto again" retry from being fetched a second time.
		 */
949219089Spjd		if (xuio && abuf == NULL) {
950219089Spjd			ASSERT(i_iov < iovcnt);
951219089Spjd			aiov = &iovp[i_iov];
952219089Spjd			abuf = dmu_xuio_arcbuf(xuio, i_iov);
953219089Spjd			dmu_xuio_clear(xuio, i_iov);
954219089Spjd			DTRACE_PROBE3(zfs_cp_write, int, i_iov,
955219089Spjd			    iovec_t *, aiov, arc_buf_t *, abuf);
956219089Spjd			ASSERT((aiov->iov_base == abuf->b_data) ||
957219089Spjd			    ((char *)aiov->iov_base - (char *)abuf->b_data +
958219089Spjd			    aiov->iov_len == arc_buf_size(abuf)));
959219089Spjd			i_iov++;
960219089Spjd		} else if (abuf == NULL && n >= max_blksz &&
961219089Spjd		    woff >= zp->z_size &&
962209962Smm		    P2PHASE(woff, max_blksz) == 0 &&
963209962Smm		    zp->z_blksz == max_blksz) {
964219089Spjd			/*
965219089Spjd			 * This write covers a full block.  "Borrow" a buffer
966219089Spjd			 * from the dmu so that we can fill it before we enter
967219089Spjd			 * a transaction.  This avoids the possibility of
968219089Spjd			 * holding up the transaction if the data copy hangs
969219089Spjd			 * up on a pagefault (e.g., from an NFS server mapping).
970219089Spjd			 */
971209962Smm			size_t cbytes;
972209962Smm
973219089Spjd			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
974219089Spjd			    max_blksz);
975209962Smm			ASSERT(abuf != NULL);
976209962Smm			ASSERT(arc_buf_size(abuf) == max_blksz);
977209962Smm			if (error = uiocopy(abuf->b_data, max_blksz,
978209962Smm			    UIO_WRITE, uio, &cbytes)) {
979209962Smm				dmu_return_arcbuf(abuf);
980209962Smm				break;
981209962Smm			}
982209962Smm			ASSERT(cbytes == max_blksz);
983209962Smm		}
984209962Smm
985209962Smm		/*
986168404Spjd		 * Start a transaction.
987168404Spjd		 */
988168404Spjd		tx = dmu_tx_create(zfsvfs->z_os);
989219089Spjd		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
990168404Spjd		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
991219089Spjd		zfs_sa_upgrade_txholds(tx, zp);
992209962Smm		error = dmu_tx_assign(tx, TXG_NOWAIT);
993168404Spjd		if (error) {
994209962Smm			if (error == ERESTART) {
				/*
				 * Wait on this tx, abort it and retry from
				 * "again".  abuf is deliberately kept so the
				 * data already copied is reused on retry.
				 */
995168404Spjd				dmu_tx_wait(tx);
996168404Spjd				dmu_tx_abort(tx);
997209962Smm				goto again;
998168404Spjd			}
999168404Spjd			dmu_tx_abort(tx);
1000209962Smm			if (abuf != NULL)
1001209962Smm				dmu_return_arcbuf(abuf);
1002168404Spjd			break;
1003168404Spjd		}
1004168404Spjd
1005168404Spjd		/*
1006168404Spjd		 * If zfs_range_lock() over-locked we grow the blocksize
1007168404Spjd		 * and then reduce the lock range.  This will only happen
1008168404Spjd		 * on the first iteration since zfs_range_reduce() will
1009168404Spjd		 * shrink down r_len to the appropriate size.
1010168404Spjd		 */
1011168404Spjd		if (rl->r_len == UINT64_MAX) {
1012168404Spjd			uint64_t new_blksz;
1013168404Spjd
1014168404Spjd			if (zp->z_blksz > max_blksz) {
1015168404Spjd				ASSERT(!ISP2(zp->z_blksz));
1016168404Spjd				new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
1017168404Spjd			} else {
1018168404Spjd				new_blksz = MIN(end_size, max_blksz);
1019168404Spjd			}
1020168404Spjd			zfs_grow_blocksize(zp, new_blksz, tx);
1021168404Spjd			zfs_range_reduce(rl, woff, n);
1022168404Spjd		}
1023168404Spjd
1024168404Spjd		/*
1025168404Spjd		 * XXX - should we really limit each write to z_max_blksz?
1026168404Spjd		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
1027168404Spjd		 */
1028168404Spjd		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
1029168404Spjd
1030219089Spjd		if (woff + nbytes > zp->z_size)
1031168404Spjd			vnode_pager_setsize(vp, woff + nbytes);
1032168404Spjd
		/*
		 * Without a pre-filled arc buffer the data is copied from the
		 * uio inside the transaction; tx_bytes is then the amount by
		 * which uio_resid shrank, which may be less than nbytes.
		 * With a buffer, the full nbytes are accounted and the uio is
		 * advanced by hand with uioskip().
		 */
1033209962Smm		if (abuf == NULL) {
1034209962Smm			tx_bytes = uio->uio_resid;
1035219089Spjd			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
1036219089Spjd			    uio, nbytes, tx);
1037209962Smm			tx_bytes -= uio->uio_resid;
1038168404Spjd		} else {
1039209962Smm			tx_bytes = nbytes;
1040219089Spjd			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
1041219089Spjd			/*
1042219089Spjd			 * If this is not a full block write, but we are
1043219089Spjd			 * extending the file past EOF and this data starts
1044219089Spjd			 * block-aligned, use assign_arcbuf().  Otherwise,
1045219089Spjd			 * write via dmu_write().
1046219089Spjd			 */
1047219089Spjd			if (tx_bytes < max_blksz && (!write_eof ||
1048219089Spjd			    aiov->iov_base != abuf->b_data)) {
1049219089Spjd				ASSERT(xuio);
1050219089Spjd				dmu_write(zfsvfs->z_os, zp->z_id, woff,
1051219089Spjd				    aiov->iov_len, aiov->iov_base, tx);
1052219089Spjd				dmu_return_arcbuf(abuf);
1053219089Spjd				xuio_stat_wbuf_copied();
1054219089Spjd			} else {
1055219089Spjd				ASSERT(xuio || tx_bytes == max_blksz);
1056219089Spjd				dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
1057219089Spjd				    woff, abuf, tx);
1058219089Spjd			}
1059209962Smm			ASSERT(tx_bytes <= uio->uio_resid);
1060209962Smm			uioskip(uio, tx_bytes);
1061168404Spjd		}
1062212657Savg		if (tx_bytes && vn_has_cached_data(vp)) {
1063209962Smm			update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
1064209962Smm			    zp->z_id, uio->uio_segflg, tx);
1065209962Smm		}
1066209962Smm
1067209962Smm		/*
1068168404Spjd		 * If we made no progress, we're done.  If we made even
1069168404Spjd		 * partial progress, update the znode and ZIL accordingly.
1070168404Spjd		 */
1071168404Spjd		if (tx_bytes == 0) {
			/*
			 * Write the current z_size into the SA before
			 * committing the transaction; no log record is
			 * generated on this path.
			 */
1072219089Spjd			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
1073219089Spjd			    (void *)&zp->z_size, sizeof (uint64_t), tx);
1074168404Spjd			dmu_tx_commit(tx);
1075168404Spjd			ASSERT(error != 0);
1076168404Spjd			break;
1077168404Spjd		}
1078168404Spjd
1079168404Spjd		/*
1080168404Spjd		 * Clear Set-UID/Set-GID bits on successful write if not
1081168404Spjd		 * privileged and at least one of the execute bits is set.
1082168404Spjd		 *
1083168404Spjd		 * It would be nice to do this after all writes have
1084168404Spjd		 * been done, but that would still expose the ISUID/ISGID
1085168404Spjd		 * to another app after the partial write is committed.
1086185029Spjd		 *
1087185029Spjd		 * Note: we don't call zfs_fuid_map_id() here because
1088185029Spjd		 * user 0 is not an ephemeral uid.
1089168404Spjd		 */
1090168404Spjd		mutex_enter(&zp->z_acl_lock);
1091219089Spjd		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
1092168404Spjd		    (S_IXUSR >> 6))) != 0 &&
1093219089Spjd		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
1094185029Spjd		    secpolicy_vnode_setid_retain(vp, cr,
1095219089Spjd		    (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
1096219089Spjd			uint64_t newmode;
1097219089Spjd			zp->z_mode &= ~(S_ISUID | S_ISGID);
1098219089Spjd			newmode = zp->z_mode;
1099219089Spjd			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
1100219089Spjd			    (void *)&newmode, sizeof (uint64_t), tx);
1101168404Spjd		}
1102168404Spjd		mutex_exit(&zp->z_acl_lock);
1103168404Spjd
1104219089Spjd		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
1105219089Spjd		    B_TRUE);
1106168404Spjd
1107168404Spjd		/*
1108168404Spjd		 * Update the file size (zp->z_size) if it has changed;
1109168404Spjd		 * account for possible concurrent updates.
1110168404Spjd		 */
1111219089Spjd		while ((end_size = zp->z_size) < uio->uio_loffset) {
1112219089Spjd			(void) atomic_cas_64(&zp->z_size, end_size,
1113168404Spjd			    uio->uio_loffset);
1114219089Spjd			ASSERT(error == 0);
1115219089Spjd		}
1116219089Spjd		/*
1117219089Spjd		 * If we are replaying and eof is non zero then force
1118219089Spjd		 * the file size to the specified eof. Note, there's no
1119219089Spjd		 * concurrency during replay.
1120219089Spjd		 */
1121219089Spjd		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
1122219089Spjd			zp->z_size = zfsvfs->z_replay_eof;
1123219089Spjd
1124219089Spjd		error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1125219089Spjd
1126168404Spjd		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
1127168404Spjd		dmu_tx_commit(tx);
1128168404Spjd
1129168404Spjd		if (error != 0)
1130168404Spjd			break;
1131168404Spjd		ASSERT(tx_bytes == nbytes);
1132168404Spjd		n -= nbytes;
1133219089Spjd
1134219089Spjd#ifdef sun
1135219089Spjd		if (!xuio && n > 0)
1136219089Spjd			uio_prefaultpages(MIN(n, max_blksz), uio);
1137219089Spjd#endif	/* sun */
1138168404Spjd	}
1139168404Spjd
1140168404Spjd	zfs_range_unlock(rl);
1141168404Spjd
1142168404Spjd	/*
1143168404Spjd	 * If we're in replay mode, or we made no progress, return error.
1144168404Spjd	 * Otherwise, it's at least a partial write, so it's successful.
1145168404Spjd	 */
1146209962Smm	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
1147168404Spjd		ZFS_EXIT(zfsvfs);
1148168404Spjd		return (error);
1149168404Spjd	}
1150168404Spjd
	/*
	 * Commit the intent log before returning when the caller passed
	 * FSYNC or FDSYNC, or when the objset's os_sync is ZFS_SYNC_ALWAYS.
	 */
1151219089Spjd	if (ioflag & (FSYNC | FDSYNC) ||
1152219089Spjd	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1153219089Spjd		zil_commit(zilog, zp->z_id);
1154168404Spjd
1155168404Spjd	ZFS_EXIT(zfsvfs);
1156168404Spjd	return (0);
1157168404Spjd}
1158168404Spjd
/*
 * Tear down a zgd_t that was set up by zfs_get_data().
 *
 *	zgd	- state to release; zgd_private holds the znode.
 *	error	- result of the operation; the block pointer is added to
 *		  the ZIL only when this is 0 and zgd_bp is set.
 *
 * Releases the dbuf hold (if one was taken), drops the range lock,
 * releases the vnode asynchronously and frees the zgd itself.
 * zfs_get_data() hands this function to dmu_sync() as its done callback
 * and also calls it directly on the paths that do not return early.
 */
1159168404Spjdvoid
1160219089Spjdzfs_get_done(zgd_t *zgd, int error)
1161168404Spjd{
1162219089Spjd	znode_t *zp = zgd->zgd_private;
1163219089Spjd	objset_t *os = zp->z_zfsvfs->z_os;
1164168404Spjd
1165219089Spjd	if (zgd->zgd_db)
1166219089Spjd		dmu_buf_rele(zgd->zgd_db, zgd);
1167219089Spjd
1168219089Spjd	zfs_range_unlock(zgd->zgd_rl);
1169219089Spjd
1170191900Skmacy	/*
1171191900Skmacy	 * Release the vnode asynchronously as we currently have the
1172191900Skmacy	 * txg stopped from syncing.
1173191900Skmacy	 */
1174219089Spjd	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1175219089Spjd
1176219089Spjd	if (error == 0 && zgd->zgd_bp)
1177219089Spjd		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
1178219089Spjd
1179168404Spjd	kmem_free(zgd, sizeof (zgd_t));
1180168404Spjd}
1181168404Spjd
/*
 * DEBUG-only fault injection flag.  When non-zero, the indirect-write path
 * of zfs_get_data() sets its error to EIO once and resets the flag to 0.
 */
1182214378Smm#ifdef DEBUG
1183214378Smmstatic int zil_fault_io = 0;
1184214378Smm#endif
1185214378Smm
1186168404Spjd/*
1187168404Spjd * Get data to generate a TX_WRITE intent log record.
 *
 *	IN:	arg	- the zfsvfs_t of the file system.
 *		lr	- write log record; lr_foid, lr_offset and lr_length
 *			  select the object and range.
 *		buf	- if non-NULL (immediate write), destination that the
 *			  file data is read into; if NULL (indirect write),
 *			  the containing block is synced via dmu_sync().
 *		zio	- parent zio handed to dmu_sync(); must not be NULL.
 *
 *	OUT:	lr	- on the indirect path lr_blkptr may be filled in, and
 *			  lrc_txtype is changed to TX_WRITE2 when dmu_sync()
 *			  returns EALREADY.
 *
 *	RETURN:	0 if success
 *		ENOENT if the object cannot be obtained, is unlinked, or the
 *		offset is at or beyond the current file size
 *		otherwise the error from dmu_read(), dmu_buf_hold() or
 *		dmu_sync()
 *
 * When dmu_sync() returns 0 the function returns immediately and the
 * cleanup is left to the zfs_get_done() callback; on every other path
 * past the zgd allocation zfs_get_done() is called here.
1188168404Spjd */
1189168404Spjdint
1190168404Spjdzfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
1191168404Spjd{
1192168404Spjd	zfsvfs_t *zfsvfs = arg;
1193168404Spjd	objset_t *os = zfsvfs->z_os;
1194168404Spjd	znode_t *zp;
1195219089Spjd	uint64_t object = lr->lr_foid;
1196219089Spjd	uint64_t offset = lr->lr_offset;
1197219089Spjd	uint64_t size = lr->lr_length;
1198219089Spjd	blkptr_t *bp = &lr->lr_blkptr;
1199168404Spjd	dmu_buf_t *db;
1200168404Spjd	zgd_t *zgd;
1201168404Spjd	int error = 0;
1202168404Spjd
1203219089Spjd	ASSERT(zio != NULL);
1204219089Spjd	ASSERT(size != 0);
1205168404Spjd
1206168404Spjd	/*
1207168404Spjd	 * Nothing to do if the file has been removed
1208168404Spjd	 */
1209219089Spjd	if (zfs_zget(zfsvfs, object, &zp) != 0)
1210249195Smm		return (SET_ERROR(ENOENT));
1211168404Spjd	if (zp->z_unlinked) {
1212191900Skmacy		/*
1213191900Skmacy		 * Release the vnode asynchronously as we currently have the
1214191900Skmacy		 * txg stopped from syncing.
1215191900Skmacy		 */
1216196307Spjd		VN_RELE_ASYNC(ZTOV(zp),
1217196307Spjd		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1218249195Smm		return (SET_ERROR(ENOENT));
1219168404Spjd	}
1220168404Spjd
	/*
	 * The zgd is zero-filled, so zgd_db and zgd_bp stay NULL unless the
	 * indirect path below sets them; zfs_get_done() tests both.
	 */
1221219089Spjd	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
1222219089Spjd	zgd->zgd_zilog = zfsvfs->z_log;
1223219089Spjd	zgd->zgd_private = zp;
1224219089Spjd
1225168404Spjd	/*
1226168404Spjd	 * Write records come in two flavors: immediate and indirect.
1227168404Spjd	 * For small writes it's cheaper to store the data with the
1228168404Spjd	 * log record (immediate); for large writes it's cheaper to
1229168404Spjd	 * sync the data and get a pointer to it (indirect) so that
1230168404Spjd	 * we don't have to write the data twice.
1231168404Spjd	 */
1232168404Spjd	if (buf != NULL) { /* immediate write */
1233219089Spjd		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
1234168404Spjd		/* test for truncation needs to be done while range locked */
1235219089Spjd		if (offset >= zp->z_size) {
1236249195Smm			error = SET_ERROR(ENOENT);
1237219089Spjd		} else {
1238219089Spjd			error = dmu_read(os, object, offset, size, buf,
1239219089Spjd			    DMU_READ_NO_PREFETCH);
1240168404Spjd		}
1241219089Spjd		ASSERT(error == 0 || error == ENOENT);
1242168404Spjd	} else { /* indirect write */
1243168404Spjd		/*
1244168404Spjd		 * Have to lock the whole block to ensure when it's
1245168404Spjd		 * written out and its checksum is being calculated
1246168404Spjd		 * that no one can change the data. We need to re-check
1247168404Spjd		 * blocksize after we get the lock in case it's changed!
1248168404Spjd		 */
1249168404Spjd		for (;;) {
1250219089Spjd			uint64_t blkoff;
			/*
			 * Round offset down to the start of its block when
			 * the block size is a power of two; otherwise blkoff
			 * equals offset, so the locked range starts at 0.
			 */
1251219089Spjd			size = zp->z_blksz;
1252219089Spjd			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
1253219089Spjd			offset -= blkoff;
1254219089Spjd			zgd->zgd_rl = zfs_range_lock(zp, offset, size,
1255219089Spjd			    RL_READER);
1256219089Spjd			if (zp->z_blksz == size)
1257168404Spjd				break;
			/*
			 * The block size changed before the lock was
			 * obtained: restore offset, drop the lock, retry.
			 */
1258219089Spjd			offset += blkoff;
1259219089Spjd			zfs_range_unlock(zgd->zgd_rl);
1260168404Spjd		}
1261168404Spjd		/* test for truncation needs to be done while range locked */
1262219089Spjd		if (lr->lr_offset >= zp->z_size)
1263249195Smm			error = SET_ERROR(ENOENT);
1264214378Smm#ifdef DEBUG
1265214378Smm		if (zil_fault_io) {
1266249195Smm			error = SET_ERROR(EIO);
1267214378Smm			zil_fault_io = 0;
1268214378Smm		}
1269214378Smm#endif
1270219089Spjd		if (error == 0)
1271219089Spjd			error = dmu_buf_hold(os, object, offset, zgd, &db,
1272219089Spjd			    DMU_READ_NO_PREFETCH);
1273214378Smm
1274209962Smm		if (error == 0) {
1275243524Smm			blkptr_t *obp = dmu_buf_get_blkptr(db);
1276243524Smm			if (obp) {
1277243524Smm				ASSERT(BP_IS_HOLE(bp));
1278243524Smm				*bp = *obp;
1279243524Smm			}
1280243524Smm
1281219089Spjd			zgd->zgd_db = db;
1282219089Spjd			zgd->zgd_bp = bp;
1283219089Spjd
1284219089Spjd			ASSERT(db->db_offset == offset);
1285219089Spjd			ASSERT(db->db_size == size);
1286219089Spjd
1287219089Spjd			error = dmu_sync(zio, lr->lr_common.lrc_txg,
1288219089Spjd			    zfs_get_done, zgd);
1289219089Spjd			ASSERT(error || lr->lr_length <= zp->z_blksz);
1290219089Spjd
1291209962Smm			/*
1292219089Spjd			 * On success, we need to wait for the write I/O
1293219089Spjd			 * initiated by dmu_sync() to complete before we can
1294219089Spjd			 * release this dbuf.  We will finish everything up
1295219089Spjd			 * in the zfs_get_done() callback.
1296209962Smm			 */
1297219089Spjd			if (error == 0)
1298219089Spjd				return (0);
1299209962Smm
			/*
			 * EALREADY is not treated as a failure: the record
			 * type becomes TX_WRITE2 and 0 is returned, with the
			 * cleanup done by the zfs_get_done() call below.
			 */
1300219089Spjd			if (error == EALREADY) {
1301219089Spjd				lr->lr_common.lrc_txtype = TX_WRITE2;
1302219089Spjd				error = 0;
1303219089Spjd			}
1304209962Smm		}
1305168404Spjd	}
1306219089Spjd
1307219089Spjd	zfs_get_done(zgd, error);
1308219089Spjd
1309168404Spjd	return (error);
1310168404Spjd}
1311168404Spjd
/*
 * Check access to a file.
 *
 *	IN:	vp	- vnode of the file to check.
 *		mode	- access being requested.
 *		flag	- if any V_ACE_MASK bit is set, mode is handed to
 *			  zfs_zaccess(); otherwise to zfs_zaccess_rwx().
 *		cr	- credentials of caller.
 *		ct	- caller context (not referenced in the body).
 *
 *	RETURN:	the result of the access check routine that was called.
 */
1312168404Spjd/*ARGSUSED*/
1313168404Spjdstatic int
1314185029Spjdzfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
1315185029Spjd    caller_context_t *ct)
1316168404Spjd{
1317168404Spjd	znode_t *zp = VTOZ(vp);
1318168404Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1319168404Spjd	int error;
1320168404Spjd
1321168404Spjd	ZFS_ENTER(zfsvfs);
1322185029Spjd	ZFS_VERIFY_ZP(zp);
1323185029Spjd
1324185029Spjd	if (flag & V_ACE_MASK)
1325185029Spjd		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
1326185029Spjd	else
1327185029Spjd		error = zfs_zaccess_rwx(zp, mode, flag, cr);
1328185029Spjd
1329168404Spjd	ZFS_EXIT(zfsvfs);
1330168404Spjd	return (error);
1331168404Spjd}
1332168404Spjd
1333168404Spjd/*
1334211932Smm * If vnode is for a device return a specfs vnode instead.
 *
 *	IN/OUT:	vpp	- vnode to check.  For a device vnode the original
 *			  is released and *vpp is replaced by the result of
 *			  specvp(), which may be NULL.  Other vnodes are
 *			  left untouched.
 *		cr	- credentials passed through to specvp().
 *
 *	RETURN:	0 if success
 *		ENOSYS if specvp() returned NULL
1335211932Smm */
1336211932Smmstatic int
1337211932Smmspecvp_check(vnode_t **vpp, cred_t *cr)
1338211932Smm{
1339211932Smm	int error = 0;
1340211932Smm
1341211932Smm	if (IS_DEVVP(*vpp)) {
1342211932Smm		struct vnode *svp;
1343211932Smm
1344211932Smm		svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
		/* The original vnode is released whether or not specvp() worked. */
1345211932Smm		VN_RELE(*vpp);
1346211932Smm		if (svp == NULL)
1347249195Smm			error = SET_ERROR(ENOSYS);
1348211932Smm		*vpp = svp;
1349211932Smm	}
1350211932Smm	return (error);
1351211932Smm}
1352211932Smm
1353211932Smm
1354211932Smm/*
1355168404Spjd * Lookup an entry in a directory, or an extended attribute directory.
1356168404Spjd * If it exists, return a held vnode reference for it.
1357168404Spjd *
1358168404Spjd *	IN:	dvp	- vnode of directory to search.
1359168404Spjd *		nm	- name of entry to lookup.
1360168404Spjd *		pnp	- full pathname to lookup [UNUSED].
1361168404Spjd *		flags	- LOOKUP_XATTR set if looking for an attribute.
1362168404Spjd *		rdir	- root directory vnode [UNUSED].
1363168404Spjd *		cr	- credentials of caller.
1364185029Spjd *		ct	- caller context
1365185029Spjd *		direntflags - directory lookup flags
1366185029Spjd *		realpnp - returned pathname.
1367168404Spjd *
1368168404Spjd *	OUT:	vpp	- vnode of located entry, NULL if not found.
1369168404Spjd *
1370168404Spjd *	RETURN:	0 if success
1371168404Spjd *		error code if failure
1372168404Spjd *
1373168404Spjd * Timestamps:
1374168404Spjd *	NA
 *
 * NOTE: the parameter list above does not match the signature below.
 * pnp, rdir and ct are not parameters of this function, and direntflags
 * and realpnp are local variables that are set to NULL and passed to
 * zfs_dirlook().  The additional parameters actually taken are:
 *
 *		cnp	- component name; its cn_flags are read and may get
 *			  SAVENAME set, and cn_lkflags is used to lock the
 *			  resulting vnode.
 *		nameiop	- name operation; CREATE, RENAME and DELETE are
 *			  handled specially for the last component.
 *		td	- not referenced in the body.
 *
 * For CREATE and RENAME on the last component, ENOENT from the lookup is
 * translated to EJUSTRETURN.
1375168404Spjd */
1376168404Spjd/* ARGSUSED */
1377168962Spjdstatic int
1378168962Spjdzfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
1379185029Spjd    int nameiop, cred_t *cr, kthread_t *td, int flags)
1380168404Spjd{
1381168962Spjd	znode_t *zdp = VTOZ(dvp);
1382168962Spjd	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1383211932Smm	int	error = 0;
1384185029Spjd	int *direntflags = NULL;
1385185029Spjd	void *realpnp = NULL;
1386168404Spjd
	/*
	 * The fast path runs before ZFS_ENTER() and is only attempted when
	 * neither LOOKUP_XATTR nor FIGNORECASE is set.  It returns directly
	 * and therefore skips the error translation, vnode locking and name
	 * cache handling done at the end of the function.
	 */
1387211932Smm	/* fast path */
1388211932Smm	if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
1389211932Smm
1390211932Smm		if (dvp->v_type != VDIR) {
1391249195Smm			return (SET_ERROR(ENOTDIR));
1392219089Spjd		} else if (zdp->z_sa_hdl == NULL) {
1393249195Smm			return (SET_ERROR(EIO));
1394211932Smm		}
1395211932Smm
		/*
		 * An empty name or a single dot refers to dvp itself, which
		 * is returned with a new hold if the execute check passes.
		 */
1396211932Smm		if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
1397211932Smm			error = zfs_fastaccesschk_execute(zdp, cr);
1398211932Smm			if (!error) {
1399211932Smm				*vpp = dvp;
1400211932Smm				VN_HOLD(*vpp);
1401211932Smm				return (0);
1402211932Smm			}
1403211932Smm			return (error);
1404211932Smm		} else {
1405211932Smm			vnode_t *tvp = dnlc_lookup(dvp, nm);
1406211932Smm
			/*
			 * A NULL result falls through to the full lookup.
			 * DNLC_NO_VNODE is reported as ENOENT.
			 */
1407211932Smm			if (tvp) {
1408211932Smm				error = zfs_fastaccesschk_execute(zdp, cr);
1409211932Smm				if (error) {
1410211932Smm					VN_RELE(tvp);
1411211932Smm					return (error);
1412211932Smm				}
1413211932Smm				if (tvp == DNLC_NO_VNODE) {
1414211932Smm					VN_RELE(tvp);
1415249195Smm					return (SET_ERROR(ENOENT));
1416211932Smm				} else {
1417211932Smm					*vpp = tvp;
1418211932Smm					return (specvp_check(vpp, cr));
1419211932Smm				}
1420211932Smm			}
1421211932Smm		}
1422211932Smm	}
1423211932Smm
1424211932Smm	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
1425211932Smm
1426168404Spjd	ZFS_ENTER(zfsvfs);
1427185029Spjd	ZFS_VERIFY_ZP(zdp);
1428168404Spjd
1429168404Spjd	*vpp = NULL;
1430168404Spjd
1431185029Spjd	if (flags & LOOKUP_XATTR) {
1432168404Spjd#ifdef TODO
1433168404Spjd		/*
1434168404Spjd		 * If the xattr property is off, refuse the lookup request.
1435168404Spjd		 */
1436168404Spjd		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
1437168404Spjd			ZFS_EXIT(zfsvfs);
1438249195Smm			return (SET_ERROR(EINVAL));
1439168404Spjd		}
1440185029Spjd#endif
1441168404Spjd
1442168404Spjd		/*
1443168404Spjd		 * We don't allow recursive attributes..
1444168404Spjd		 * Maybe someday we will.
1445168404Spjd		 */
1446219089Spjd		if (zdp->z_pflags & ZFS_XATTR) {
1447168404Spjd			ZFS_EXIT(zfsvfs);
1448249195Smm			return (SET_ERROR(EINVAL));
1449168404Spjd		}
1450168404Spjd
1451168404Spjd		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
1452168404Spjd			ZFS_EXIT(zfsvfs);
1453168404Spjd			return (error);
1454168404Spjd		}
1455168404Spjd
1456168404Spjd		/*
1457168404Spjd		 * Do we have permission to get into attribute directory?
1458168404Spjd		 */
1459168404Spjd
1460185029Spjd		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
1461185029Spjd		    B_FALSE, cr)) {
1462168404Spjd			VN_RELE(*vpp);
1463185029Spjd			*vpp = NULL;
1464168404Spjd		}
1465168404Spjd
1466168404Spjd		ZFS_EXIT(zfsvfs);
1467168404Spjd		return (error);
1468168404Spjd	}
1469168404Spjd
1470168404Spjd	if (dvp->v_type != VDIR) {
1471168404Spjd		ZFS_EXIT(zfsvfs);
1472249195Smm		return (SET_ERROR(ENOTDIR));
1473168404Spjd	}
1474168404Spjd
1475168404Spjd	/*
1476168404Spjd	 * Check accessibility of directory.
1477168404Spjd	 */
1478168404Spjd
1479185029Spjd	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
1480168404Spjd		ZFS_EXIT(zfsvfs);
1481168404Spjd		return (error);
1482168404Spjd	}
1483168404Spjd
	/*
	 * On file systems with z_utf8 set, a name that fails u8_validate()
	 * is rejected with EILSEQ regardless of the value stored in error.
	 */
1484185029Spjd	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
1485185029Spjd	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1486185029Spjd		ZFS_EXIT(zfsvfs);
1487249195Smm		return (SET_ERROR(EILSEQ));
1488185029Spjd	}
1489168404Spjd
1490185029Spjd	error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
1491211932Smm	if (error == 0)
1492211932Smm		error = specvp_check(vpp, cr);
1493168962Spjd
1494168404Spjd	/* Translate errors and add SAVENAME when needed. */
1495168404Spjd	if (cnp->cn_flags & ISLASTCN) {
1496168404Spjd		switch (nameiop) {
1497168404Spjd		case CREATE:
1498168404Spjd		case RENAME:
1499168404Spjd			if (error == ENOENT) {
1500168404Spjd				error = EJUSTRETURN;
1501168404Spjd				cnp->cn_flags |= SAVENAME;
1502168404Spjd				break;
1503168404Spjd			}
1504168404Spjd			/* FALLTHROUGH */
1505168404Spjd		case DELETE:
1506168404Spjd			if (error == 0)
1507168404Spjd				cnp->cn_flags |= SAVENAME;
1508168404Spjd			break;
1509168404Spjd		}
1510168404Spjd	}
	/*
	 * On success, lock the resulting vnode unless the name is exactly a
	 * single dot.  For a dot-dot lookup dvp is unlocked first and then
	 * re-locked with its previous lock type once the child lock attempt
	 * has finished (presumably to avoid a parent/child lock order
	 * problem -- confirm).  ZFS_EXIT() is done before taking the lock.
	 */
1511168404Spjd	if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) {
1512169198Spjd		int ltype = 0;
1513169198Spjd
1514169198Spjd		if (cnp->cn_flags & ISDOTDOT) {
1515176559Sattilio			ltype = VOP_ISLOCKED(dvp);
1516175294Sattilio			VOP_UNLOCK(dvp, 0);
1517169198Spjd		}
1518206667Spjd		ZFS_EXIT(zfsvfs);
1519219089Spjd		error = zfs_vnode_lock(*vpp, cnp->cn_lkflags);
1520168962Spjd		if (cnp->cn_flags & ISDOTDOT)
1521175202Sattilio			vn_lock(dvp, ltype | LK_RETRY);
1522169172Spjd		if (error != 0) {
1523169172Spjd			VN_RELE(*vpp);
1524169172Spjd			*vpp = NULL;
1525169172Spjd			return (error);
1526169172Spjd		}
1527206667Spjd	} else {
1528206667Spjd		ZFS_EXIT(zfsvfs);
1529168404Spjd	}
1530168404Spjd
1531168404Spjd#ifdef FREEBSD_NAMECACHE
1532168404Spjd	/*
1533168404Spjd	 * Insert name into cache (as non-existent) if appropriate.
1534168404Spjd	 */
1535168404Spjd	if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
1536168404Spjd		cache_enter(dvp, *vpp, cnp);
1537169170Spjd	/*
1538169170Spjd	 * Insert name into cache if appropriate.
1539169170Spjd	 */
1540168404Spjd	if (error == 0 && (cnp->cn_flags & MAKEENTRY)) {
1541168404Spjd		if (!(cnp->cn_flags & ISLASTCN) ||
1542168404Spjd		    (nameiop != DELETE && nameiop != RENAME)) {
1543168404Spjd			cache_enter(dvp, *vpp, cnp);
1544168404Spjd		}
1545168404Spjd	}
1546168404Spjd#endif
1547168404Spjd
1548168404Spjd	return (error);
1549168404Spjd}
1550168404Spjd
1551168404Spjd/*
1552168404Spjd * Attempt to create a new entry in a directory.  If the entry
1553168404Spjd * already exists, truncate the file if permissible, else return
1554168404Spjd * an error.  Return the vp of the created or trunc'd file.
1555168404Spjd *
1556168404Spjd *	IN:	dvp	- vnode of directory to put new file entry in.
1557168404Spjd *		name	- name of new file entry.
1558168404Spjd *		vap	- attributes of new file.
1559168404Spjd *		excl	- flag indicating exclusive or non-exclusive mode.
1560168404Spjd *		mode	- mode to open file with.
1561168404Spjd *		cr	- credentials of caller.
1562168404Spjd *		flag	- large file flag [UNUSED].
1563185029Spjd *		ct	- caller context
1564185029Spjd *		vsecp 	- ACL to be set
1565168404Spjd *
1566168404Spjd *	OUT:	vpp	- vnode of created or trunc'd entry.
1567168404Spjd *
1568168404Spjd *	RETURN:	0 if success
1569168404Spjd *		error code if failure
1570168404Spjd *
1571168404Spjd * Timestamps:
1572168404Spjd *	dvp - ctime|mtime updated if new entry created
1573168404Spjd *	 vp - ctime|mtime always, atime if new
1574168404Spjd */
1575185029Spjd
1576168404Spjd/* ARGSUSED */
1577168404Spjdstatic int
1578168962Spjdzfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
1579185029Spjd    vnode_t **vpp, cred_t *cr, kthread_t *td)
1580168404Spjd{
1581168404Spjd	znode_t		*zp, *dzp = VTOZ(dvp);
1582168404Spjd	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1583185029Spjd	zilog_t		*zilog;
1584185029Spjd	objset_t	*os;
1585168404Spjd	zfs_dirlock_t	*dl;
1586168404Spjd	dmu_tx_t	*tx;
1587168404Spjd	int		error;
1588209962Smm	ksid_t		*ksid;
1589209962Smm	uid_t		uid;
1590209962Smm	gid_t		gid = crgetgid(cr);
1591219089Spjd	zfs_acl_ids_t   acl_ids;
1592209962Smm	boolean_t	fuid_dirtied;
1593219089Spjd	boolean_t	have_acl = B_FALSE;
1594185029Spjd	void		*vsecp = NULL;
1595185029Spjd	int		flag = 0;
1596168404Spjd
1597185029Spjd	/*
1598185029Spjd	 * If we have an ephemeral id, ACL, or XVATTR then
1599185029Spjd	 * make sure file system is at proper version
1600185029Spjd	 */
1601185029Spjd
1602209962Smm	ksid = crgetsid(cr, KSID_OWNER);
1603209962Smm	if (ksid)
1604209962Smm		uid = ksid_getid(ksid);
1605209962Smm	else
1606209962Smm		uid = crgetuid(cr);
1607219089Spjd
1608185029Spjd	if (zfsvfs->z_use_fuids == B_FALSE &&
1609185029Spjd	    (vsecp || (vap->va_mask & AT_XVATTR) ||
1610219089Spjd	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1611249195Smm		return (SET_ERROR(EINVAL));
1612185029Spjd
1613168404Spjd	ZFS_ENTER(zfsvfs);
1614185029Spjd	ZFS_VERIFY_ZP(dzp);
1615185029Spjd	os = zfsvfs->z_os;
1616185029Spjd	zilog = zfsvfs->z_log;
1617168404Spjd
1618185029Spjd	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
1619185029Spjd	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1620185029Spjd		ZFS_EXIT(zfsvfs);
1621249195Smm		return (SET_ERROR(EILSEQ));
1622185029Spjd	}
1623185029Spjd
1624185029Spjd	if (vap->va_mask & AT_XVATTR) {
1625197861Spjd		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
1626185029Spjd		    crgetuid(cr), cr, vap->va_type)) != 0) {
1627185029Spjd			ZFS_EXIT(zfsvfs);
1628185029Spjd			return (error);
1629185029Spjd		}
1630185029Spjd	}
1631168404Spjdtop:
1632168404Spjd	*vpp = NULL;
1633168404Spjd
1634182905Strasz	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
1635182905Strasz		vap->va_mode &= ~S_ISVTX;
1636168404Spjd
1637168404Spjd	if (*name == '\0') {
1638168404Spjd		/*
1639168404Spjd		 * Null component name refers to the directory itself.
1640168404Spjd		 */
1641168404Spjd		VN_HOLD(dvp);
1642168404Spjd		zp = dzp;
1643168404Spjd		dl = NULL;
1644168404Spjd		error = 0;
1645168404Spjd	} else {
1646168404Spjd		/* possible VN_HOLD(zp) */
1647185029Spjd		int zflg = 0;
1648185029Spjd
1649185029Spjd		if (flag & FIGNORECASE)
1650185029Spjd			zflg |= ZCILOOK;
1651185029Spjd
1652185029Spjd		error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1653185029Spjd		    NULL, NULL);
1654185029Spjd		if (error) {
1655219089Spjd			if (have_acl)
1656219089Spjd				zfs_acl_ids_free(&acl_ids);
1657168404Spjd			if (strcmp(name, "..") == 0)
1658249195Smm				error = SET_ERROR(EISDIR);
1659168404Spjd			ZFS_EXIT(zfsvfs);
1660168404Spjd			return (error);
1661168404Spjd		}
1662168404Spjd	}
1663219089Spjd
1664185029Spjd	if (zp == NULL) {
1665185029Spjd		uint64_t txtype;
1666168404Spjd
1667168404Spjd		/*
1668168404Spjd		 * Create a new file object and update the directory
1669168404Spjd		 * to reference it.
1670168404Spjd		 */
1671185029Spjd		if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
1672219089Spjd			if (have_acl)
1673219089Spjd				zfs_acl_ids_free(&acl_ids);
1674168404Spjd			goto out;
1675168404Spjd		}
1676168404Spjd
1677168404Spjd		/*
1678168404Spjd		 * We only support the creation of regular files in
1679168404Spjd		 * extended attribute directories.
1680168404Spjd		 */
1681219089Spjd
1682219089Spjd		if ((dzp->z_pflags & ZFS_XATTR) &&
1683168404Spjd		    (vap->va_type != VREG)) {
1684219089Spjd			if (have_acl)
1685219089Spjd				zfs_acl_ids_free(&acl_ids);
1686249195Smm			error = SET_ERROR(EINVAL);
1687168404Spjd			goto out;
1688168404Spjd		}
1689168404Spjd
1690219089Spjd		if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
1691219089Spjd		    cr, vsecp, &acl_ids)) != 0)
1692219089Spjd			goto out;
1693219089Spjd		have_acl = B_TRUE;
1694209962Smm
1695209962Smm		if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1696211932Smm			zfs_acl_ids_free(&acl_ids);
1697249195Smm			error = SET_ERROR(EDQUOT);
1698209962Smm			goto out;
1699209962Smm		}
1700209962Smm
1701168404Spjd		tx = dmu_tx_create(os);
1702219089Spjd
1703219089Spjd		dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1704219089Spjd		    ZFS_SA_BASE_ATTR_SIZE);
1705219089Spjd
1706209962Smm		fuid_dirtied = zfsvfs->z_fuid_dirty;
1707209962Smm		if (fuid_dirtied)
1708209962Smm			zfs_fuid_txhold(zfsvfs, tx);
1709168404Spjd		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1710219089Spjd		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
1711219089Spjd		if (!zfsvfs->z_use_sa &&
1712219089Spjd		    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1713168404Spjd			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1714219089Spjd			    0, acl_ids.z_aclp->z_acl_bytes);
1715185029Spjd		}
1716209962Smm		error = dmu_tx_assign(tx, TXG_NOWAIT);
1717168404Spjd		if (error) {
1718168404Spjd			zfs_dirent_unlock(dl);
1719209962Smm			if (error == ERESTART) {
1720168404Spjd				dmu_tx_wait(tx);
1721168404Spjd				dmu_tx_abort(tx);
1722168404Spjd				goto top;
1723168404Spjd			}
1724219089Spjd			zfs_acl_ids_free(&acl_ids);
1725168404Spjd			dmu_tx_abort(tx);
1726168404Spjd			ZFS_EXIT(zfsvfs);
1727168404Spjd			return (error);
1728168404Spjd		}
1729219089Spjd		zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1730209962Smm
1731209962Smm		if (fuid_dirtied)
1732209962Smm			zfs_fuid_sync(zfsvfs, tx);
1733209962Smm
1734168404Spjd		(void) zfs_link_create(dl, zp, tx, ZNEW);
1735185029Spjd		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1736185029Spjd		if (flag & FIGNORECASE)
1737185029Spjd			txtype |= TX_CI;
1738185029Spjd		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1739209962Smm		    vsecp, acl_ids.z_fuidp, vap);
1740209962Smm		zfs_acl_ids_free(&acl_ids);
1741168404Spjd		dmu_tx_commit(tx);
1742168404Spjd	} else {
1743185029Spjd		int aflags = (flag & FAPPEND) ? V_APPEND : 0;
1744185029Spjd
1745219089Spjd		if (have_acl)
1746219089Spjd			zfs_acl_ids_free(&acl_ids);
1747219089Spjd		have_acl = B_FALSE;
1748219089Spjd
1749168404Spjd		/*
1750168404Spjd		 * A directory entry already exists for this name.
1751168404Spjd		 */
1752168404Spjd		/*
1753168962Spjd		 * Can't truncate an existing file if in exclusive mode.
1754168962Spjd		 */
1755168962Spjd		if (excl == EXCL) {
1756249195Smm			error = SET_ERROR(EEXIST);
1757168962Spjd			goto out;
1758168962Spjd		}
1759168962Spjd		/*
1760168404Spjd		 * Can't open a directory for writing.
1761168404Spjd		 */
1762168404Spjd		if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
1763249195Smm			error = SET_ERROR(EISDIR);
1764168404Spjd			goto out;
1765168404Spjd		}
1766168404Spjd		/*
1767168404Spjd		 * Verify requested access to file.
1768168404Spjd		 */
1769185029Spjd		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
1770168404Spjd			goto out;
1771168404Spjd		}
1772168404Spjd
1773168404Spjd		mutex_enter(&dzp->z_lock);
1774168404Spjd		dzp->z_seq++;
1775168404Spjd		mutex_exit(&dzp->z_lock);
1776168404Spjd
1777168404Spjd		/*
1778168404Spjd		 * Truncate regular files if requested.
1779168404Spjd		 */
1780168404Spjd		if ((ZTOV(zp)->v_type == VREG) &&
1781168404Spjd		    (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
1782185029Spjd			/* we can't hold any locks when calling zfs_freesp() */
1783185029Spjd			zfs_dirent_unlock(dl);
1784185029Spjd			dl = NULL;
1785168404Spjd			error = zfs_freesp(zp, 0, 0, mode, TRUE);
1786185029Spjd			if (error == 0) {
1787185029Spjd				vnevent_create(ZTOV(zp), ct);
1788168404Spjd			}
1789168404Spjd		}
1790168404Spjd	}
1791168404Spjdout:
1792168404Spjd	if (dl)
1793168404Spjd		zfs_dirent_unlock(dl);
1794168404Spjd
1795168404Spjd	if (error) {
1796168404Spjd		if (zp)
1797168404Spjd			VN_RELE(ZTOV(zp));
1798168962Spjd	} else {
1799168962Spjd		*vpp = ZTOV(zp);
1800211932Smm		error = specvp_check(vpp, cr);
1801168404Spjd	}
1802168404Spjd
1803219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1804219089Spjd		zil_commit(zilog, 0);
1805219089Spjd
1806168404Spjd	ZFS_EXIT(zfsvfs);
1807168404Spjd	return (error);
1808168404Spjd}
1809168404Spjd
1810168404Spjd/*
1811168404Spjd * Remove an entry from a directory.
1812168404Spjd *
1813168404Spjd *	IN:	dvp	- vnode of directory to remove entry from.
1814168404Spjd *		name	- name of entry to remove.
1815168404Spjd *		cr	- credentials of caller.
1816185029Spjd *		ct	- caller context
1817185029Spjd *		flags	- case flags
1818168404Spjd *
1819168404Spjd *	RETURN:	0 if success
1820168404Spjd *		error code if failure
1821168404Spjd *
1822168404Spjd * Timestamps:
1823168404Spjd *	dvp - ctime|mtime
1824168404Spjd *	 vp - ctime (if nlink > 0)
1825168404Spjd */
1826219089Spjd
/*
 * Zero value whose address zfs_remove() passes to sa_update() to overwrite
 * the SA_ZPL_XATTR attribute of a znode that is not in SA format
 * (zp->z_is_sa false) when its xattr directory is being unlinked.
 */
1827219089Spjduint64_t null_xattr = 0;
1828219089Spjd
/*
 * zfs_remove() implementation outline:
 *
 *   1. Lock the directory entry `name' in dvp (ZEXISTS, plus ZCILOOK when
 *      FIGNORECASE is set in flags).
 *   2. Check delete permission with zfs_zaccess_delete() and refuse
 *      directories with EPERM.
 *   3. Build a DMU transaction covering both possible outcomes (immediate
 *      delete or placement in the unlinked set) and assign it with
 *      TXG_NOWAIT.  On ERESTART everything taken in this attempt is
 *      released and the sequence restarts at `top'.
 *   4. zfs_link_destroy() removes the entry.  If it reports the znode as
 *      unlinked, the znode is either deleted on the spot (delete_now) or
 *      handed to zfs_unlinked_add().
 *   5. Log the removal, commit the tx, and release locks and holds at `out'.
 *
 * Returns 0 on success, otherwise the error from the failing step.
 */
1829185029Spjd/*ARGSUSED*/
1830168404Spjdstatic int
1831185029Spjdzfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
1832185029Spjd    int flags)
1833168404Spjd{
1834168404Spjd	znode_t		*zp, *dzp = VTOZ(dvp);
1835219089Spjd	znode_t		*xzp;
1836168404Spjd	vnode_t		*vp;
1837168404Spjd	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1838185029Spjd	zilog_t		*zilog;
1839168962Spjd	uint64_t	acl_obj, xattr_obj;
1840219089Spjd	uint64_t 	xattr_obj_unlinked = 0;
1841219089Spjd	uint64_t	obj = 0;
1842168404Spjd	zfs_dirlock_t	*dl;
1843168404Spjd	dmu_tx_t	*tx;
1844168962Spjd	boolean_t	may_delete_now, delete_now = FALSE;
1845185029Spjd	boolean_t	unlinked, toobig = FALSE;
1846185029Spjd	uint64_t	txtype;
1847185029Spjd	pathname_t	*realnmp = NULL;
1848185029Spjd	pathname_t	realnm;
1849168404Spjd	int		error;
1850185029Spjd	int		zflg = ZEXISTS;
1851168404Spjd
1852168404Spjd	ZFS_ENTER(zfsvfs);
1853185029Spjd	ZFS_VERIFY_ZP(dzp);
1854185029Spjd	zilog = zfsvfs->z_log;
1855168404Spjd
	/*
	 * Case-insensitive request: add ZCILOOK to the lookup flags and
	 * allocate a pathname buffer.  The buffer is passed to
	 * zfs_dirent_lock() and its pn_buf is later used for dnlc_remove();
	 * it is pn_free()d on every exit path below.
	 */
1856185029Spjd	if (flags & FIGNORECASE) {
1857185029Spjd		zflg |= ZCILOOK;
1858185029Spjd		pn_alloc(&realnm);
1859185029Spjd		realnmp = &realnm;
1860185029Spjd	}
1861185029Spjd
1862168404Spjdtop:
	/* Per-attempt state; reset because the ERESTART path jumps back here. */
1863219089Spjd	xattr_obj = 0;
1864219089Spjd	xzp = NULL;
1865168404Spjd	/*
1866168404Spjd	 * Attempt to lock directory; fail if entry doesn't exist.
1867168404Spjd	 */
1868185029Spjd	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1869185029Spjd	    NULL, realnmp)) {
1870185029Spjd		if (realnmp)
1871185029Spjd			pn_free(realnmp);
1872168404Spjd		ZFS_EXIT(zfsvfs);
1873168404Spjd		return (error);
1874168404Spjd	}
1875168404Spjd
1876168404Spjd	vp = ZTOV(zp);
1877168404Spjd
1878168962Spjd	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1879168404Spjd		goto out;
1880168962Spjd	}
1881168404Spjd
1882168962Spjd	/*
1883168962Spjd	 * Need to use rmdir for removing directories.
1884168962Spjd	 */
1885168962Spjd	if (vp->v_type == VDIR) {
1886249195Smm		error = SET_ERROR(EPERM);
1887168962Spjd		goto out;
1888168962Spjd	}
1889168962Spjd
1890185029Spjd	vnevent_remove(vp, dvp, name, ct);
1891168962Spjd
	/* Drop the name from the DNLC, using realnmp's buffer when we have one. */
1892185029Spjd	if (realnmp)
1893185029Spjd		dnlc_remove(dvp, realnmp->pn_buf);
1894185029Spjd	else
1895185029Spjd		dnlc_remove(dvp, name);
1896168404Spjd
	/*
	 * Sample, under VI_LOCK, whether ours is the only reference to the
	 * vnode (v_count == 1) and it has no cached data.  The same
	 * conditions are re-checked after the directory entry is destroyed.
	 */
1897219089Spjd	VI_LOCK(vp);
1898219089Spjd	may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp);
1899219089Spjd	VI_UNLOCK(vp);
1900168962Spjd
1901168404Spjd	/*
1902168404Spjd	 * We may delete the znode now, or we may put it in the unlinked set;
1903168404Spjd	 * it depends on whether we're the last link, and on whether there are
1904168404Spjd	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
1905168404Spjd	 * allow for either case.
1906168404Spjd	 */
	/* Remember the object number now; it is what gets logged below. */
1907219089Spjd	obj = zp->z_id;
1908168404Spjd	tx = dmu_tx_create(zfsvfs->z_os);
1909168404Spjd	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1910219089Spjd	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1911219089Spjd	zfs_sa_upgrade_txholds(tx, zp);
1912219089Spjd	zfs_sa_upgrade_txholds(tx, dzp);
1913185029Spjd	if (may_delete_now) {
1914185029Spjd		toobig =
1915219089Spjd		    zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
1916185029Spjd		/* if the file is too big, only hold_free a token amount */
1917185029Spjd		dmu_tx_hold_free(tx, zp->z_id, 0,
1918185029Spjd		    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
1919185029Spjd	}
1920168404Spjd
1921168404Spjd	/* are there any extended attributes? */
1922219089Spjd	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1923219089Spjd	    &xattr_obj, sizeof (xattr_obj));
1924219089Spjd	if (error == 0 && xattr_obj) {
		/*
		 * NOTE(review): a zfs_zget() failure is only caught by
		 * ASSERT0(); xzp is dereferenced immediately afterwards.
		 * Confirm this cannot fail in builds where assertions are
		 * compiled out.
		 */
1925219089Spjd		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1926240415Smm		ASSERT0(error);
1927219089Spjd		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1928219089Spjd		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1929168404Spjd	}
1930168404Spjd
	/*
	 * Read the external ACL object under z_lock; hold it for freeing
	 * only when immediate deletion is still a possibility.
	 */
1931219089Spjd	mutex_enter(&zp->z_lock);
1932219089Spjd	if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
1933168962Spjd		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
1934219089Spjd	mutex_exit(&zp->z_lock);
1935168962Spjd
1936168404Spjd	/* charge as an update -- would be nice not to charge at all */
1937168404Spjd	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1938168404Spjd
1939209962Smm	error = dmu_tx_assign(tx, TXG_NOWAIT);
1940168404Spjd	if (error) {
		/*
		 * Could not assign the tx: release the dirent lock and the
		 * vnode holds taken in this attempt.  For ERESTART wait on
		 * the tx, abort it and retry from `top'; any other error is
		 * returned to the caller.
		 */
1941168404Spjd		zfs_dirent_unlock(dl);
1942168962Spjd		VN_RELE(vp);
1943219089Spjd		if (xzp)
1944219089Spjd			VN_RELE(ZTOV(xzp));
1945209962Smm		if (error == ERESTART) {
1946168404Spjd			dmu_tx_wait(tx);
1947168404Spjd			dmu_tx_abort(tx);
1948168404Spjd			goto top;
1949168404Spjd		}
1950185029Spjd		if (realnmp)
1951185029Spjd			pn_free(realnmp);
1952168404Spjd		dmu_tx_abort(tx);
1953168404Spjd		ZFS_EXIT(zfsvfs);
1954168404Spjd		return (error);
1955168404Spjd	}
1956168404Spjd
1957168404Spjd	/*
1958168404Spjd	 * Remove the directory entry.
1959168404Spjd	 */
1960185029Spjd	error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
1961168404Spjd
1962168404Spjd	if (error) {
		/* The assigned tx is committed (not aborted) before the error exit. */
1963168404Spjd		dmu_tx_commit(tx);
1964168404Spjd		goto out;
1965168404Spjd	}
1966168404Spjd
	/*
	 * Note: z_lock taken inside this block stays held past it; it is
	 * dropped in the delete_now branch or the `else if (unlinked)'
	 * branch below.
	 */
1967219089Spjd	if (unlinked) {
1968219089Spjd
1969219089Spjd		/*
1970219089Spjd		 * Hold z_lock so that we can make sure that the ACL obj
1971219089Spjd		 * hasn't changed.  Could have been deleted due to
1972219089Spjd		 * zfs_sa_upgrade().
1973219089Spjd		 */
1974219089Spjd		mutex_enter(&zp->z_lock);
1975168962Spjd		VI_LOCK(vp);
1976219089Spjd		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1977219089Spjd		    &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
		/*
		 * Delete immediately only if the earlier sample still holds:
		 * sole vnode reference, no cached data, file not too big,
		 * and neither the xattr object nor the external ACL object
		 * differs from what was read before the tx was assigned.
		 */
1978185029Spjd		delete_now = may_delete_now && !toobig &&
1979168962Spjd		    vp->v_count == 1 && !vn_has_cached_data(vp) &&
1980219089Spjd		    xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) ==
1981219089Spjd		    acl_obj;
1982168962Spjd		VI_UNLOCK(vp);
1983168962Spjd	}
1984168962Spjd
1985168962Spjd	if (delete_now) {
		/* FreeBSD builds panic if this branch is ever reached. */
1986243270Savg#ifdef __FreeBSD__
1987243270Savg		panic("zfs_remove: delete_now branch taken");
1988243270Savg#endif
1989219089Spjd		if (xattr_obj_unlinked) {
			/*
			 * Mark the xattr directory znode unlinked with zero
			 * links, queue it via zfs_unlinked_add(), and clear
			 * the parent's SA_ZPL_XATTR reference to it.
			 */
1990219089Spjd			ASSERT3U(xzp->z_links, ==, 2);
1991168962Spjd			mutex_enter(&xzp->z_lock);
1992168962Spjd			xzp->z_unlinked = 1;
1993219089Spjd			xzp->z_links = 0;
1994219089Spjd			error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
1995219089Spjd			    &xzp->z_links, sizeof (xzp->z_links), tx);
1996219089Spjd			ASSERT3U(error,  ==,  0);
1997168962Spjd			mutex_exit(&xzp->z_lock);
1998168962Spjd			zfs_unlinked_add(xzp, tx);
1999219089Spjd
2000219089Spjd			if (zp->z_is_sa)
2001219089Spjd				error = sa_remove(zp->z_sa_hdl,
2002219089Spjd				    SA_ZPL_XATTR(zfsvfs), tx);
2003219089Spjd			else
2004219089Spjd				error = sa_update(zp->z_sa_hdl,
2005219089Spjd				    SA_ZPL_XATTR(zfsvfs), &null_xattr,
2006219089Spjd				    sizeof (uint64_t), tx);
2007240415Smm			ASSERT0(error);
2008168962Spjd		}
		/*
		 * Drop our vnode reference by hand; the `out' path skips
		 * VN_RELE(vp) when delete_now is set.
		 */
2009168962Spjd		VI_LOCK(vp);
2010168962Spjd		vp->v_count--;
2011240415Smm		ASSERT0(vp->v_count);
2012168962Spjd		VI_UNLOCK(vp);
2013168962Spjd		mutex_exit(&zp->z_lock);
2014168962Spjd		zfs_znode_delete(zp, tx);
2015168962Spjd	} else if (unlinked) {
		/* Unlinked but not deletable right now: queue it instead. */
2016219089Spjd		mutex_exit(&zp->z_lock);
2017168404Spjd		zfs_unlinked_add(zp, tx);
2018243268Savg#ifdef __FreeBSD__
2019243268Savg		vp->v_vflag |= VV_NOSYNC;
2020243268Savg#endif
2021168962Spjd	}
2022168404Spjd
	/* Log the removal; TX_CI marks a case-insensitive request. */
2023185029Spjd	txtype = TX_REMOVE;
2024185029Spjd	if (flags & FIGNORECASE)
2025185029Spjd		txtype |= TX_CI;
2026219089Spjd	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
2027168404Spjd
2028168404Spjd	dmu_tx_commit(tx);
2029168404Spjdout:
	/*
	 * Common exit for success and for failures that occur after the
	 * directory entry was locked.
	 */
2030185029Spjd	if (realnmp)
2031185029Spjd		pn_free(realnmp);
2032185029Spjd
2033168404Spjd	zfs_dirent_unlock(dl);
2034168404Spjd
2035219089Spjd	if (!delete_now)
2036168962Spjd		VN_RELE(vp);
2037219089Spjd	if (xzp)
2038168962Spjd		VN_RELE(ZTOV(xzp));
2039168962Spjd
	/* When os_sync is ZFS_SYNC_ALWAYS, commit the ZIL before returning. */
2040219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2041219089Spjd		zil_commit(zilog, 0);
2042219089Spjd
2043168404Spjd	ZFS_EXIT(zfsvfs);
2044168404Spjd	return (error);
2045168404Spjd}
2046168404Spjd
2047168404Spjd/*
2048168404Spjd * Create a new directory and insert it into dvp using the name
2049168404Spjd * provided.  Return a pointer to the inserted directory.
2050168404Spjd *
2051168404Spjd *	IN:	dvp	- vnode of directory to add subdir to.
2052168404Spjd *		dirname	- name of new directory.
2053168404Spjd *		vap	- attributes of new directory.
2054168404Spjd *		cr	- credentials of caller.
2055185029Spjd *		ct	- caller context
2056185029Spjd *		vsecp	- ACL to be set
 *		flags	- case flags (only FIGNORECASE is examined here)
2057168404Spjd *
2058168404Spjd *	OUT:	vpp	- vnode of created directory.
2059168404Spjd *
2060168404Spjd *	RETURN:	0 if success
2061168404Spjd *		error code if failure
 *
 *	Errors generated directly by this function:
 *		EINVAL	- the parent has ZFS_XATTR set in z_pflags, or an
 *			  ACL, XVATTR or ephemeral id is supplied while
 *			  z_use_fuids is B_FALSE.
 *		EILSEQ	- dirname fails u8_validate() on a z_utf8 filesystem.
 *		EDQUOT	- zfs_acl_ids_overquota() reports over quota.
 *	Other errors are passed through from the helpers called below.
2062168404Spjd *
2063168404Spjd * Timestamps:
2064168404Spjd *	dvp - ctime|mtime updated
2065168404Spjd *	 vp - ctime|mtime|atime updated
2066168404Spjd */
2067185029Spjd/*ARGSUSED*/
2068168404Spjdstatic int
2069185029Spjdzfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
2070185029Spjd    caller_context_t *ct, int flags, vsecattr_t *vsecp)
2071168404Spjd{
2072168404Spjd	znode_t		*zp, *dzp = VTOZ(dvp);
2073168404Spjd	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2074185029Spjd	zilog_t		*zilog;
2075168404Spjd	zfs_dirlock_t	*dl;
2076185029Spjd	uint64_t	txtype;
2077168404Spjd	dmu_tx_t	*tx;
2078168404Spjd	int		error;
2079185029Spjd	int		zf = ZNEW;
2080209962Smm	ksid_t		*ksid;
2081209962Smm	uid_t		uid;
2082209962Smm	gid_t		gid = crgetgid(cr);
2083219089Spjd	zfs_acl_ids_t   acl_ids;
2084209962Smm	boolean_t	fuid_dirtied;
2085168404Spjd
2086168404Spjd	ASSERT(vap->va_type == VDIR);
2087168404Spjd
2088185029Spjd	/*
2089185029Spjd	 * If we have an ephemeral id, ACL, or XVATTR then
2090185029Spjd	 * make sure file system is at proper version
2091185029Spjd	 */
2092185029Spjd
	/* Use the id from the credential's owner SID if present, else its uid. */
2093209962Smm	ksid = crgetsid(cr, KSID_OWNER);
2094209962Smm	if (ksid)
2095209962Smm		uid = ksid_getid(ksid);
2096209962Smm	else
2097209962Smm		uid = crgetuid(cr);
2098185029Spjd	if (zfsvfs->z_use_fuids == B_FALSE &&
2099219089Spjd	    (vsecp || (vap->va_mask & AT_XVATTR) ||
2100219089Spjd	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
2101249195Smm		return (SET_ERROR(EINVAL));
2102185029Spjd
2103168404Spjd	ZFS_ENTER(zfsvfs);
2104185029Spjd	ZFS_VERIFY_ZP(dzp);
2105185029Spjd	zilog = zfsvfs->z_log;
2106168404Spjd
	/* No subdirectories inside a directory flagged ZFS_XATTR. */
2107219089Spjd	if (dzp->z_pflags & ZFS_XATTR) {
2108168404Spjd		ZFS_EXIT(zfsvfs);
2109249195Smm		return (SET_ERROR(EINVAL));
2110168404Spjd	}
2111168404Spjd
	/* On z_utf8 filesystems the whole name must pass u8_validate(). */
2112185029Spjd	if (zfsvfs->z_utf8 && u8_validate(dirname,
2113185029Spjd	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2114185029Spjd		ZFS_EXIT(zfsvfs);
2115249195Smm		return (SET_ERROR(EILSEQ));
2116185029Spjd	}
2117185029Spjd	if (flags & FIGNORECASE)
2118185029Spjd		zf |= ZCILOOK;
2119185029Spjd
2120219089Spjd	if (vap->va_mask & AT_XVATTR) {
2121197861Spjd		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
2122185029Spjd		    crgetuid(cr), cr, vap->va_type)) != 0) {
2123185029Spjd			ZFS_EXIT(zfsvfs);
2124185029Spjd			return (error);
2125185029Spjd		}
2126219089Spjd	}
2127185029Spjd
	/*
	 * Build acl_ids once, before `top'.  It is kept across an ERESTART
	 * retry and released with zfs_acl_ids_free() on every other exit
	 * path, including success.
	 */
2128219089Spjd	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
2129219089Spjd	    vsecp, &acl_ids)) != 0) {
2130219089Spjd		ZFS_EXIT(zfsvfs);
2131219089Spjd		return (error);
2132219089Spjd	}
2133168404Spjd	/*
2134168404Spjd	 * First make sure the new directory doesn't exist.
2135219089Spjd	 *
2136219089Spjd	 * Existence is checked first to make sure we don't return
2137219089Spjd	 * EACCES instead of EEXIST which can cause some applications
2138219089Spjd	 * to fail.
2139168404Spjd	 */
2140185029Spjdtop:
2141185029Spjd	*vpp = NULL;
2142185029Spjd
2143185029Spjd	if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
2144185029Spjd	    NULL, NULL)) {
2145219089Spjd		zfs_acl_ids_free(&acl_ids);
2146168404Spjd		ZFS_EXIT(zfsvfs);
2147168404Spjd		return (error);
2148168404Spjd	}
2149168404Spjd
	/* The caller needs ACE_ADD_SUBDIRECTORY access on the parent. */
2150185029Spjd	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
2151219089Spjd		zfs_acl_ids_free(&acl_ids);
2152168404Spjd		zfs_dirent_unlock(dl);
2153168404Spjd		ZFS_EXIT(zfsvfs);
2154168404Spjd		return (error);
2155168404Spjd	}
2156168404Spjd
2157209962Smm	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
2158211932Smm		zfs_acl_ids_free(&acl_ids);
2159209962Smm		zfs_dirent_unlock(dl);
2160209962Smm		ZFS_EXIT(zfsvfs);
2161249195Smm		return (SET_ERROR(EDQUOT));
2162209962Smm	}
2163209962Smm
2164168404Spjd	/*
2165168404Spjd	 * Add a new entry to the directory.
2166168404Spjd	 */
2167168404Spjd	tx = dmu_tx_create(zfsvfs->z_os);
2168168404Spjd	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
2169168404Spjd	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
	/* Snapshot z_fuid_dirty so the hold here and the sync below agree. */
2170209962Smm	fuid_dirtied = zfsvfs->z_fuid_dirty;
2171209962Smm	if (fuid_dirtied)
2172209962Smm		zfs_fuid_txhold(zfsvfs, tx);
	/*
	 * Without SA, an ACL larger than ZFS_ACE_SPACE gets an extra write
	 * hold on the new object.
	 */
2173219089Spjd	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2174219089Spjd		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
2175219089Spjd		    acl_ids.z_aclp->z_acl_bytes);
2176219089Spjd	}
2177219089Spjd
2178219089Spjd	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
2179219089Spjd	    ZFS_SA_BASE_ATTR_SIZE);
2180219089Spjd
2181209962Smm	error = dmu_tx_assign(tx, TXG_NOWAIT);
2182168404Spjd	if (error) {
		/*
		 * Could not assign the tx.  For ERESTART: wait, abort and
		 * retry from `top' with acl_ids still intact.  Otherwise
		 * free acl_ids and fail.
		 */
2183168404Spjd		zfs_dirent_unlock(dl);
2184209962Smm		if (error == ERESTART) {
2185168404Spjd			dmu_tx_wait(tx);
2186168404Spjd			dmu_tx_abort(tx);
2187168404Spjd			goto top;
2188168404Spjd		}
2189219089Spjd		zfs_acl_ids_free(&acl_ids);
2190168404Spjd		dmu_tx_abort(tx);
2191168404Spjd		ZFS_EXIT(zfsvfs);
2192168404Spjd		return (error);
2193168404Spjd	}
2194168404Spjd
2195168404Spjd	/*
2196168404Spjd	 * Create new node.
2197168404Spjd	 */
2198219089Spjd	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
2199168404Spjd
2200209962Smm	if (fuid_dirtied)
2201209962Smm		zfs_fuid_sync(zfsvfs, tx);
2202219089Spjd
2203168404Spjd	/*
2204168404Spjd	 * Now put new name in parent dir.
2205168404Spjd	 */
	/* NOTE(review): the zfs_link_create() result is deliberately discarded. */
2206168404Spjd	(void) zfs_link_create(dl, zp, tx, ZNEW);
2207168404Spjd
	/* Hand the new directory's vnode back to the caller. */
2208168404Spjd	*vpp = ZTOV(zp);
2209168404Spjd
	/* Log the creation; TX_CI marks a case-insensitive request. */
2210185029Spjd	txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
2211185029Spjd	if (flags & FIGNORECASE)
2212185029Spjd		txtype |= TX_CI;
2213209962Smm	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
2214209962Smm	    acl_ids.z_fuidp, vap);
2215185029Spjd
2216209962Smm	zfs_acl_ids_free(&acl_ids);
2217219089Spjd
2218168404Spjd	dmu_tx_commit(tx);
2219168404Spjd
2220168404Spjd	zfs_dirent_unlock(dl);
2221168404Spjd
	/* When os_sync is ZFS_SYNC_ALWAYS, commit the ZIL before returning. */
2222219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2223219089Spjd		zil_commit(zilog, 0);
2224219089Spjd
2225168404Spjd	ZFS_EXIT(zfsvfs);
2226168404Spjd	return (0);
2227168404Spjd}
2228168404Spjd
2229168404Spjd/*
2230168404Spjd * Remove a directory subdir entry.  If the current working
2231168404Spjd * directory is the same as the subdir to be removed, the
2232168404Spjd * remove will fail.
2233168404Spjd *
2234168404Spjd *	IN:	dvp	- vnode of directory to remove from.
2235168404Spjd *		name	- name of directory to be removed.
2236168404Spjd *		cwd	- vnode of current working directory.
2237168404Spjd *		cr	- credentials of caller.
2238185029Spjd *		ct	- caller context
2239185029Spjd *		flags	- case flags
2240168404Spjd *
2241168404Spjd *	RETURN:	0 if success
2242168404Spjd *		error code if failure
 *
 *	Errors generated directly by this function:
 *		ENOTDIR	- the named entry is not a directory.
 *		EINVAL	- the named entry is the vnode passed as cwd.
 *	Other errors are passed through from the helpers called below.
2243168404Spjd *
2244168404Spjd * Timestamps:
2245168404Spjd *	dvp - ctime|mtime updated
2246168404Spjd */
2247185029Spjd/*ARGSUSED*/
2248168404Spjdstatic int
2249185029Spjdzfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
2250185029Spjd    caller_context_t *ct, int flags)
2251168404Spjd{
2252168404Spjd	znode_t		*dzp = VTOZ(dvp);
2253168404Spjd	znode_t		*zp;
2254168404Spjd	vnode_t		*vp;
2255168404Spjd	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2256185029Spjd	zilog_t		*zilog;
2257168404Spjd	zfs_dirlock_t	*dl;
2258168404Spjd	dmu_tx_t	*tx;
2259168404Spjd	int		error;
2260185029Spjd	int		zflg = ZEXISTS;
2261168404Spjd
2262168962Spjd	ZFS_ENTER(zfsvfs);
2263185029Spjd	ZFS_VERIFY_ZP(dzp);
2264185029Spjd	zilog = zfsvfs->z_log;
2265168404Spjd
2266185029Spjd	if (flags & FIGNORECASE)
2267185029Spjd		zflg |= ZCILOOK;
2268168404Spjdtop:
	/* Reset on every attempt; the ERESTART path jumps back here. */
2269168404Spjd	zp = NULL;
2270168404Spjd
2271168404Spjd	/*
2272168404Spjd	 * Attempt to lock directory; fail if entry doesn't exist.
2273168404Spjd	 */
2274185029Spjd	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
2275185029Spjd	    NULL, NULL)) {
2276168404Spjd		ZFS_EXIT(zfsvfs);
2277168404Spjd		return (error);
2278168404Spjd	}
2279168404Spjd
2280168404Spjd	vp = ZTOV(zp);
2281168404Spjd
2282168404Spjd	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
2283168404Spjd		goto out;
2284168404Spjd	}
2285168404Spjd
	/* Only directories may be removed through this entry point. */
2286168962Spjd	if (vp->v_type != VDIR) {
2287249195Smm		error = SET_ERROR(ENOTDIR);
2288168962Spjd		goto out;
2289168962Spjd	}
2290168962Spjd
	/* Refuse to remove the directory passed in as cwd. */
2291168962Spjd	if (vp == cwd) {
2292249195Smm		error = SET_ERROR(EINVAL);
2293168962Spjd		goto out;
2294168962Spjd	}
2295168962Spjd
2296185029Spjd	vnevent_rmdir(vp, dvp, name, ct);
2297168962Spjd
2298168404Spjd	/*
2299168404Spjd	 * Grab a lock on the directory to make sure that no one is
2300168404Spjd	 * trying to add (or lookup) entries while we are removing it.
2301168404Spjd	 */
2302168404Spjd	rw_enter(&zp->z_name_lock, RW_WRITER);
2303168404Spjd
2304168404Spjd	/*
2305168404Spjd	 * Grab a lock on the parent pointer to make sure we play well
2306168404Spjd	 * with the treewalk and directory rename code.
2307168404Spjd	 */
2308168404Spjd	rw_enter(&zp->z_parent_lock, RW_WRITER);
2309168404Spjd
2310168404Spjd	tx = dmu_tx_create(zfsvfs->z_os);
2311168404Spjd	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2312219089Spjd	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2313168404Spjd	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2314219089Spjd	zfs_sa_upgrade_txholds(tx, zp);
2315219089Spjd	zfs_sa_upgrade_txholds(tx, dzp);
2316209962Smm	error = dmu_tx_assign(tx, TXG_NOWAIT);
2317168404Spjd	if (error) {
		/*
		 * Could not assign the tx: release in reverse order of
		 * acquisition (parent lock, name lock, dirent lock, vnode
		 * hold).  For ERESTART wait, abort and retry from `top';
		 * otherwise abort the tx and fail.
		 */
2318168404Spjd		rw_exit(&zp->z_parent_lock);
2319168404Spjd		rw_exit(&zp->z_name_lock);
2320168404Spjd		zfs_dirent_unlock(dl);
2321168962Spjd		VN_RELE(vp);
2322209962Smm		if (error == ERESTART) {
2323168404Spjd			dmu_tx_wait(tx);
2324168404Spjd			dmu_tx_abort(tx);
2325168404Spjd			goto top;
2326168404Spjd		}
2327168404Spjd		dmu_tx_abort(tx);
2328168404Spjd		ZFS_EXIT(zfsvfs);
2329168404Spjd		return (error);
2330168404Spjd	}
2331168404Spjd
	/* With FREEBSD_NAMECACHE, purge name cache entries for the parent. */
2332168404Spjd#ifdef FREEBSD_NAMECACHE
2333168404Spjd	cache_purge(dvp);
2334168404Spjd#endif
2335168404Spjd
2336185029Spjd	error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
2337168404Spjd
	/*
	 * The removal is logged only if zfs_link_destroy() succeeded; the
	 * tx is committed either way.
	 */
2338185029Spjd	if (error == 0) {
2339185029Spjd		uint64_t txtype = TX_RMDIR;
2340185029Spjd		if (flags & FIGNORECASE)
2341185029Spjd			txtype |= TX_CI;
2342219089Spjd		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
2343185029Spjd	}
2344168404Spjd
2345168404Spjd	dmu_tx_commit(tx);
2346168404Spjd
2347168404Spjd	rw_exit(&zp->z_parent_lock);
2348168404Spjd	rw_exit(&zp->z_name_lock);
	/* With FREEBSD_NAMECACHE, also purge entries for the removed directory. */
2349168404Spjd#ifdef FREEBSD_NAMECACHE
2350168404Spjd	cache_purge(vp);
2351168404Spjd#endif
2352168404Spjdout:
	/*
	 * Common exit: drop the dirent lock and the vnode hold obtained
	 * through zfs_dirent_lock().
	 */
2353168404Spjd	zfs_dirent_unlock(dl);
2354168404Spjd
2355168962Spjd	VN_RELE(vp);
2356168962Spjd
	/* When os_sync is ZFS_SYNC_ALWAYS, commit the ZIL before returning. */
2357219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2358219089Spjd		zil_commit(zilog, 0);
2359219089Spjd
2360168404Spjd	ZFS_EXIT(zfsvfs);
2361168404Spjd	return (error);
2362168404Spjd}
2363168404Spjd
2364168404Spjd/*
2365168404Spjd * Read as many directory entries as will fit into the provided
2366168404Spjd * buffer from the given directory cursor position (specified in
2367168404Spjd * the uio structure).
2368168404Spjd *
2369168404Spjd *	IN:	vp	- vnode of directory to read.
2370168404Spjd *		uio	- structure supplying read location, range info,
2371168404Spjd *			  and return buffer.
2372168404Spjd *		cr	- credentials of caller.
2373185029Spjd *		ct	- caller context
2374185029Spjd *		flags	- case flags
2375168404Spjd *
2376168404Spjd *	OUT:	uio	- updated offset and range, buffer filled.
2377168404Spjd *		eofp	- set to true if end-of-file detected.
2378168404Spjd *
2379168404Spjd *	RETURN:	0 if success
2380168404Spjd *		error code if failure
2381168404Spjd *
2382168404Spjd * Timestamps:
2383168404Spjd *	vp - atime updated
2384168404Spjd *
2385168404Spjd * Note that the low 4 bits of the cookie returned by zap is always zero.
2386168404Spjd * This allows us to use the low range for "special" directory entries:
2387168404Spjd * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
2388168404Spjd * we use the offset 2 for the '.zfs' directory.
2389168404Spjd */
2390168404Spjd/* ARGSUSED */
2391168404Spjdstatic int
2392168962Spjdzfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
2393168404Spjd{
2394168404Spjd	znode_t		*zp = VTOZ(vp);
2395168404Spjd	iovec_t		*iovp;
2396185029Spjd	edirent_t	*eodp;
2397168404Spjd	dirent64_t	*odp;
2398168404Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2399168404Spjd	objset_t	*os;
2400168404Spjd	caddr_t		outbuf;
2401168404Spjd	size_t		bufsize;
2402168404Spjd	zap_cursor_t	zc;
2403168404Spjd	zap_attribute_t	zap;
2404168404Spjd	uint_t		bytes_wanted;
2405168404Spjd	uint64_t	offset; /* must be unsigned; checks for < 1 */
2406219089Spjd	uint64_t	parent;
2407168404Spjd	int		local_eof;
2408168404Spjd	int		outcount;
2409168404Spjd	int		error;
2410168404Spjd	uint8_t		prefetch;
2411185029Spjd	boolean_t	check_sysattrs;
2412168404Spjd	uint8_t		type;
2413168962Spjd	int		ncooks;
2414168962Spjd	u_long		*cooks = NULL;
2415185029Spjd	int		flags = 0;
2416168404Spjd
2417168404Spjd	ZFS_ENTER(zfsvfs);
2418185029Spjd	ZFS_VERIFY_ZP(zp);
2419168404Spjd
2420219089Spjd	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
2421219089Spjd	    &parent, sizeof (parent))) != 0) {
2422219089Spjd		ZFS_EXIT(zfsvfs);
2423219089Spjd		return (error);
2424219089Spjd	}
2425219089Spjd
2426168404Spjd	/*
2427168404Spjd	 * If we are not given an eof variable,
2428168404Spjd	 * use a local one.
2429168404Spjd	 */
2430168404Spjd	if (eofp == NULL)
2431168404Spjd		eofp = &local_eof;
2432168404Spjd
2433168404Spjd	/*
2434168404Spjd	 * Check for valid iov_len.
2435168404Spjd	 */
2436168404Spjd	if (uio->uio_iov->iov_len <= 0) {
2437168404Spjd		ZFS_EXIT(zfsvfs);
2438249195Smm		return (SET_ERROR(EINVAL));
2439168404Spjd	}
2440168404Spjd
2441168404Spjd	/*
2442168404Spjd	 * Quit if directory has been removed (posix)
2443168404Spjd	 */
2444168404Spjd	if ((*eofp = zp->z_unlinked) != 0) {
2445168404Spjd		ZFS_EXIT(zfsvfs);
2446168404Spjd		return (0);
2447168404Spjd	}
2448168404Spjd
2449168404Spjd	error = 0;
2450168404Spjd	os = zfsvfs->z_os;
2451168404Spjd	offset = uio->uio_loffset;
2452168404Spjd	prefetch = zp->z_zn_prefetch;
2453168404Spjd
2454168404Spjd	/*
2455168404Spjd	 * Initialize the iterator cursor.
2456168404Spjd	 */
2457168404Spjd	if (offset <= 3) {
2458168404Spjd		/*
2459168404Spjd		 * Start iteration from the beginning of the directory.
2460168404Spjd		 */
2461168404Spjd		zap_cursor_init(&zc, os, zp->z_id);
2462168404Spjd	} else {
2463168404Spjd		/*
2464168404Spjd		 * The offset is a serialized cursor.
2465168404Spjd		 */
2466168404Spjd		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
2467168404Spjd	}
2468168404Spjd
2469168404Spjd	/*
2470168404Spjd	 * Get space to change directory entries into fs independent format.
2471168404Spjd	 */
2472168404Spjd	iovp = uio->uio_iov;
2473168404Spjd	bytes_wanted = iovp->iov_len;
2474168404Spjd	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
2475168404Spjd		bufsize = bytes_wanted;
2476168404Spjd		outbuf = kmem_alloc(bufsize, KM_SLEEP);
2477168404Spjd		odp = (struct dirent64 *)outbuf;
2478168404Spjd	} else {
2479168404Spjd		bufsize = bytes_wanted;
2480247187Smm		outbuf = NULL;
2481168404Spjd		odp = (struct dirent64 *)iovp->iov_base;
2482168404Spjd	}
2483185029Spjd	eodp = (struct edirent *)odp;
2484168404Spjd
2485169170Spjd	if (ncookies != NULL) {
2486168404Spjd		/*
2487168404Spjd		 * Minimum entry size is dirent size and 1 byte for a file name.
2488168404Spjd		 */
2489168962Spjd		ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
2490219404Spjd		cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
2491219404Spjd		*cookies = cooks;
2492168962Spjd		*ncookies = ncooks;
2493168404Spjd	}
2494185029Spjd	/*
2495185029Spjd	 * If this VFS supports the system attribute view interface; and
2496185029Spjd	 * we're looking at an extended attribute directory; and we care
2497185029Spjd	 * about normalization conflicts on this vfs; then we must check
2498185029Spjd	 * for normalization conflicts with the sysattr name space.
2499185029Spjd	 */
2500185029Spjd#ifdef TODO
2501185029Spjd	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
2502185029Spjd	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
2503185029Spjd	    (flags & V_RDDIR_ENTFLAGS);
2504185029Spjd#else
2505185029Spjd	check_sysattrs = 0;
2506185029Spjd#endif
2507168404Spjd
2508168404Spjd	/*
2509168404Spjd	 * Transform to file-system independent format
2510168404Spjd	 */
2511168404Spjd	outcount = 0;
2512168404Spjd	while (outcount < bytes_wanted) {
2513168404Spjd		ino64_t objnum;
2514168404Spjd		ushort_t reclen;
2515219089Spjd		off64_t *next = NULL;
2516168404Spjd
2517168404Spjd		/*
2518168404Spjd		 * Special case `.', `..', and `.zfs'.
2519168404Spjd		 */
2520168404Spjd		if (offset == 0) {
2521168404Spjd			(void) strcpy(zap.za_name, ".");
2522185029Spjd			zap.za_normalization_conflict = 0;
2523168404Spjd			objnum = zp->z_id;
2524169108Spjd			type = DT_DIR;
2525168404Spjd		} else if (offset == 1) {
2526168404Spjd			(void) strcpy(zap.za_name, "..");
2527185029Spjd			zap.za_normalization_conflict = 0;
2528219089Spjd			objnum = parent;
2529169108Spjd			type = DT_DIR;
2530168404Spjd		} else if (offset == 2 && zfs_show_ctldir(zp)) {
2531168404Spjd			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
2532185029Spjd			zap.za_normalization_conflict = 0;
2533168404Spjd			objnum = ZFSCTL_INO_ROOT;
2534169108Spjd			type = DT_DIR;
2535168404Spjd		} else {
2536168404Spjd			/*
2537168404Spjd			 * Grab next entry.
2538168404Spjd			 */
2539168404Spjd			if (error = zap_cursor_retrieve(&zc, &zap)) {
2540168404Spjd				if ((*eofp = (error == ENOENT)) != 0)
2541168404Spjd					break;
2542168404Spjd				else
2543168404Spjd					goto update;
2544168404Spjd			}
2545168404Spjd
2546168404Spjd			if (zap.za_integer_length != 8 ||
2547168404Spjd			    zap.za_num_integers != 1) {
2548168404Spjd				cmn_err(CE_WARN, "zap_readdir: bad directory "
2549168404Spjd				    "entry, obj = %lld, offset = %lld\n",
2550168404Spjd				    (u_longlong_t)zp->z_id,
2551168404Spjd				    (u_longlong_t)offset);
2552249195Smm				error = SET_ERROR(ENXIO);
2553168404Spjd				goto update;
2554168404Spjd			}
2555168404Spjd
2556168404Spjd			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
2557168404Spjd			/*
2558168404Spjd			 * MacOS X can extract the object type here such as:
2559168404Spjd			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2560168404Spjd			 */
2561168404Spjd			type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2562185029Spjd
2563185029Spjd			if (check_sysattrs && !zap.za_normalization_conflict) {
2564185029Spjd#ifdef TODO
2565185029Spjd				zap.za_normalization_conflict =
2566185029Spjd				    xattr_sysattr_casechk(zap.za_name);
2567185029Spjd#else
2568185029Spjd				panic("%s:%u: TODO", __func__, __LINE__);
2569185029Spjd#endif
2570185029Spjd			}
2571168404Spjd		}
2572168404Spjd
2573211932Smm		if (flags & V_RDDIR_ACCFILTER) {
2574211932Smm			/*
2575211932Smm			 * If we have no access at all, don't include
2576211932Smm			 * this entry in the returned information
2577211932Smm			 */
2578211932Smm			znode_t	*ezp;
2579211932Smm			if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
2580211932Smm				goto skip_entry;
2581211932Smm			if (!zfs_has_access(ezp, cr)) {
2582211932Smm				VN_RELE(ZTOV(ezp));
2583211932Smm				goto skip_entry;
2584211932Smm			}
2585211932Smm			VN_RELE(ZTOV(ezp));
2586211932Smm		}
2587211932Smm
2588185029Spjd		if (flags & V_RDDIR_ENTFLAGS)
2589185029Spjd			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
2590185029Spjd		else
2591185029Spjd			reclen = DIRENT64_RECLEN(strlen(zap.za_name));
2592185029Spjd
2593168404Spjd		/*
2594168404Spjd		 * Will this entry fit in the buffer?
2595168404Spjd		 */
2596168404Spjd		if (outcount + reclen > bufsize) {
2597168404Spjd			/*
2598168404Spjd			 * Did we manage to fit anything in the buffer?
2599168404Spjd			 */
2600168404Spjd			if (!outcount) {
2601249195Smm				error = SET_ERROR(EINVAL);
2602168404Spjd				goto update;
2603168404Spjd			}
2604168404Spjd			break;
2605168404Spjd		}
2606185029Spjd		if (flags & V_RDDIR_ENTFLAGS) {
2607185029Spjd			/*
2608185029Spjd			 * Add extended flag entry:
2609185029Spjd			 */
2610185029Spjd			eodp->ed_ino = objnum;
2611185029Spjd			eodp->ed_reclen = reclen;
2612185029Spjd			/* NOTE: ed_off is the offset for the *next* entry */
2613185029Spjd			next = &(eodp->ed_off);
2614185029Spjd			eodp->ed_eflags = zap.za_normalization_conflict ?
2615185029Spjd			    ED_CASE_CONFLICT : 0;
2616185029Spjd			(void) strncpy(eodp->ed_name, zap.za_name,
2617185029Spjd			    EDIRENT_NAMELEN(reclen));
2618185029Spjd			eodp = (edirent_t *)((intptr_t)eodp + reclen);
2619185029Spjd		} else {
2620185029Spjd			/*
2621185029Spjd			 * Add normal entry:
2622185029Spjd			 */
2623185029Spjd			odp->d_ino = objnum;
2624185029Spjd			odp->d_reclen = reclen;
2625185029Spjd			odp->d_namlen = strlen(zap.za_name);
2626185029Spjd			(void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
2627185029Spjd			odp->d_type = type;
2628185029Spjd			odp = (dirent64_t *)((intptr_t)odp + reclen);
2629185029Spjd		}
2630168404Spjd		outcount += reclen;
2631168404Spjd
2632168404Spjd		ASSERT(outcount <= bufsize);
2633168404Spjd
2634168404Spjd		/* Prefetch znode */
2635168404Spjd		if (prefetch)
2636168404Spjd			dmu_prefetch(os, objnum, 0, 0);
2637168404Spjd
2638211932Smm	skip_entry:
2639168404Spjd		/*
2640168404Spjd		 * Move to the next entry, fill in the previous offset.
2641168404Spjd		 */
2642168404Spjd		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2643168404Spjd			zap_cursor_advance(&zc);
2644168404Spjd			offset = zap_cursor_serialize(&zc);
2645168404Spjd		} else {
2646168404Spjd			offset += 1;
2647168404Spjd		}
2648219404Spjd
2649219404Spjd		if (cooks != NULL) {
2650219404Spjd			*cooks++ = offset;
2651219404Spjd			ncooks--;
2652219404Spjd			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
2653219404Spjd		}
2654168404Spjd	}
2655168404Spjd	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2656168404Spjd
2657168404Spjd	/* Subtract unused cookies */
2658168962Spjd	if (ncookies != NULL)
2659168962Spjd		*ncookies -= ncooks;
2660168404Spjd
2661168404Spjd	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
2662168404Spjd		iovp->iov_base += outcount;
2663168404Spjd		iovp->iov_len -= outcount;
2664168404Spjd		uio->uio_resid -= outcount;
2665168404Spjd	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
2666168404Spjd		/*
2667168404Spjd		 * Reset the pointer.
2668168404Spjd		 */
2669168404Spjd		offset = uio->uio_loffset;
2670168404Spjd	}
2671168404Spjd
2672168404Spjdupdate:
2673168404Spjd	zap_cursor_fini(&zc);
2674168404Spjd	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
2675168404Spjd		kmem_free(outbuf, bufsize);
2676168404Spjd
2677168404Spjd	if (error == ENOENT)
2678168404Spjd		error = 0;
2679168404Spjd
2680168404Spjd	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2681168404Spjd
2682168404Spjd	uio->uio_loffset = offset;
2683168404Spjd	ZFS_EXIT(zfsvfs);
2684169107Spjd	if (error != 0 && cookies != NULL) {
2685168962Spjd		free(*cookies, M_TEMP);
2686168962Spjd		*cookies = NULL;
2687168962Spjd		*ncookies = 0;
2688168404Spjd	}
2689168404Spjd	return (error);
2690168404Spjd}
2691168404Spjd
2692185029Spjdulong_t zfs_fsync_sync_cnt = 4;
2693185029Spjd
2694168404Spjdstatic int
2695185029Spjdzfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
2696168404Spjd{
2697168962Spjd	znode_t	*zp = VTOZ(vp);
2698168962Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2699168404Spjd
2700185029Spjd	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2701185029Spjd
2702219089Spjd	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
2703219089Spjd		ZFS_ENTER(zfsvfs);
2704219089Spjd		ZFS_VERIFY_ZP(zp);
2705219089Spjd		zil_commit(zfsvfs->z_log, zp->z_id);
2706219089Spjd		ZFS_EXIT(zfsvfs);
2707219089Spjd	}
2708168404Spjd	return (0);
2709168404Spjd}
2710168404Spjd
2711185029Spjd
2712168404Spjd/*
2713168404Spjd * Get the requested file attributes and place them in the provided
2714168404Spjd * vattr structure.
2715168404Spjd *
2716168404Spjd *	IN:	vp	- vnode of file.
2717168404Spjd *		vap	- va_mask identifies requested attributes.
2718185029Spjd *			  If AT_XVATTR set, then optional attrs are requested
2719185029Spjd *		flags	- ATTR_NOACLCHECK (CIFS server context)
2720168404Spjd *		cr	- credentials of caller.
2721185029Spjd *		ct	- caller context
2722168404Spjd *
2723168404Spjd *	OUT:	vap	- attribute values.
2724168404Spjd *
2725168404Spjd *	RETURN:	0 (always succeeds)
2726168404Spjd */
2727168404Spjd/* ARGSUSED */
2728168404Spjdstatic int
2729185029Spjdzfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2730185029Spjd    caller_context_t *ct)
2731168404Spjd{
2732168962Spjd	znode_t *zp = VTOZ(vp);
2733168962Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2734185029Spjd	int	error = 0;
2735168962Spjd	uint32_t blksize;
2736168962Spjd	u_longlong_t nblocks;
2737185029Spjd	uint64_t links;
2738224251Sdelphij	uint64_t mtime[2], ctime[2], crtime[2], rdev;
2739185029Spjd	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
2740185029Spjd	xoptattr_t *xoap = NULL;
2741185029Spjd	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2742224251Sdelphij	sa_bulk_attr_t bulk[4];
2743219089Spjd	int count = 0;
2744168404Spjd
2745168404Spjd	ZFS_ENTER(zfsvfs);
2746185029Spjd	ZFS_VERIFY_ZP(zp);
2747168404Spjd
2748219089Spjd	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
2749219089Spjd
2750219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
2751219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
2752243807Sdelphij	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
2753224251Sdelphij	if (vp->v_type == VBLK || vp->v_type == VCHR)
2754224251Sdelphij		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
2755224251Sdelphij		    &rdev, 8);
2756219089Spjd
2757219089Spjd	if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
2758219089Spjd		ZFS_EXIT(zfsvfs);
2759219089Spjd		return (error);
2760219089Spjd	}
2761219089Spjd
2762168404Spjd	/*
2763185029Spjd	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
2764185029Spjd	 * Also, if we are the owner don't bother, since owner should
2765185029Spjd	 * always be allowed to read basic attributes of file.
2766185029Spjd	 */
2767219089Spjd	if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
2768219089Spjd	    (vap->va_uid != crgetuid(cr))) {
2769185029Spjd		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
2770185029Spjd		    skipaclchk, cr)) {
2771185029Spjd			ZFS_EXIT(zfsvfs);
2772185029Spjd			return (error);
2773185029Spjd		}
2774185029Spjd	}
2775185029Spjd
2776185029Spjd	/*
2777168404Spjd	 * Return all attributes.  It's cheaper to provide the answer
2778168404Spjd	 * than to determine whether we were asked the question.
2779168404Spjd	 */
2780168404Spjd
2781209097Smm	mutex_enter(&zp->z_lock);
2782219089Spjd	vap->va_type = IFTOVT(zp->z_mode);
2783219089Spjd	vap->va_mode = zp->z_mode & ~S_IFMT;
2784224252Sdelphij#ifdef sun
2785224252Sdelphij	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
2786224252Sdelphij#else
2787224252Sdelphij	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
2788224252Sdelphij#endif
2789168404Spjd	vap->va_nodeid = zp->z_id;
2790185029Spjd	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
2791219089Spjd		links = zp->z_links + 1;
2792185029Spjd	else
2793219089Spjd		links = zp->z_links;
2794229425Sdim	vap->va_nlink = MIN(links, LINK_MAX);	/* nlink_t limit! */
2795219089Spjd	vap->va_size = zp->z_size;
2796224252Sdelphij#ifdef sun
2797224252Sdelphij	vap->va_rdev = vp->v_rdev;
2798224252Sdelphij#else
2799224251Sdelphij	if (vp->v_type == VBLK || vp->v_type == VCHR)
2800224251Sdelphij		vap->va_rdev = zfs_cmpldev(rdev);
2801224252Sdelphij#endif
2802168404Spjd	vap->va_seq = zp->z_seq;
2803168404Spjd	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */
2804168404Spjd
2805185029Spjd	/*
2806185029Spjd	 * Add in any requested optional attributes and the create time.
2807185029Spjd	 * Also set the corresponding bits in the returned attribute bitmap.
2808185029Spjd	 */
2809185029Spjd	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
2810185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
2811185029Spjd			xoap->xoa_archive =
2812219089Spjd			    ((zp->z_pflags & ZFS_ARCHIVE) != 0);
2813185029Spjd			XVA_SET_RTN(xvap, XAT_ARCHIVE);
2814185029Spjd		}
2815185029Spjd
2816185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
2817185029Spjd			xoap->xoa_readonly =
2818219089Spjd			    ((zp->z_pflags & ZFS_READONLY) != 0);
2819185029Spjd			XVA_SET_RTN(xvap, XAT_READONLY);
2820185029Spjd		}
2821185029Spjd
2822185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
2823185029Spjd			xoap->xoa_system =
2824219089Spjd			    ((zp->z_pflags & ZFS_SYSTEM) != 0);
2825185029Spjd			XVA_SET_RTN(xvap, XAT_SYSTEM);
2826185029Spjd		}
2827185029Spjd
2828185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
2829185029Spjd			xoap->xoa_hidden =
2830219089Spjd			    ((zp->z_pflags & ZFS_HIDDEN) != 0);
2831185029Spjd			XVA_SET_RTN(xvap, XAT_HIDDEN);
2832185029Spjd		}
2833185029Spjd
2834185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2835185029Spjd			xoap->xoa_nounlink =
2836219089Spjd			    ((zp->z_pflags & ZFS_NOUNLINK) != 0);
2837185029Spjd			XVA_SET_RTN(xvap, XAT_NOUNLINK);
2838185029Spjd		}
2839185029Spjd
2840185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2841185029Spjd			xoap->xoa_immutable =
2842219089Spjd			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
2843185029Spjd			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
2844185029Spjd		}
2845185029Spjd
2846185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2847185029Spjd			xoap->xoa_appendonly =
2848219089Spjd			    ((zp->z_pflags & ZFS_APPENDONLY) != 0);
2849185029Spjd			XVA_SET_RTN(xvap, XAT_APPENDONLY);
2850185029Spjd		}
2851185029Spjd
2852185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2853185029Spjd			xoap->xoa_nodump =
2854219089Spjd			    ((zp->z_pflags & ZFS_NODUMP) != 0);
2855185029Spjd			XVA_SET_RTN(xvap, XAT_NODUMP);
2856185029Spjd		}
2857185029Spjd
2858185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
2859185029Spjd			xoap->xoa_opaque =
2860219089Spjd			    ((zp->z_pflags & ZFS_OPAQUE) != 0);
2861185029Spjd			XVA_SET_RTN(xvap, XAT_OPAQUE);
2862185029Spjd		}
2863185029Spjd
2864185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2865185029Spjd			xoap->xoa_av_quarantined =
2866219089Spjd			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
2867185029Spjd			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
2868185029Spjd		}
2869185029Spjd
2870185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2871185029Spjd			xoap->xoa_av_modified =
2872219089Spjd			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
2873185029Spjd			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
2874185029Spjd		}
2875185029Spjd
2876185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
2877219089Spjd		    vp->v_type == VREG) {
2878219089Spjd			zfs_sa_get_scanstamp(zp, xvap);
2879185029Spjd		}
2880185029Spjd
2881185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
2882219089Spjd			uint64_t times[2];
2883219089Spjd
2884219089Spjd			(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
2885219089Spjd			    times, sizeof (times));
2886219089Spjd			ZFS_TIME_DECODE(&xoap->xoa_createtime, times);
2887185029Spjd			XVA_SET_RTN(xvap, XAT_CREATETIME);
2888185029Spjd		}
2889219089Spjd
2890219089Spjd		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2891219089Spjd			xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
2892219089Spjd			XVA_SET_RTN(xvap, XAT_REPARSE);
2893219089Spjd		}
2894219089Spjd		if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
2895219089Spjd			xoap->xoa_generation = zp->z_gen;
2896219089Spjd			XVA_SET_RTN(xvap, XAT_GEN);
2897219089Spjd		}
2898219089Spjd
2899219089Spjd		if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
2900219089Spjd			xoap->xoa_offline =
2901219089Spjd			    ((zp->z_pflags & ZFS_OFFLINE) != 0);
2902219089Spjd			XVA_SET_RTN(xvap, XAT_OFFLINE);
2903219089Spjd		}
2904219089Spjd
2905219089Spjd		if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
2906219089Spjd			xoap->xoa_sparse =
2907219089Spjd			    ((zp->z_pflags & ZFS_SPARSE) != 0);
2908219089Spjd			XVA_SET_RTN(xvap, XAT_SPARSE);
2909219089Spjd		}
2910185029Spjd	}
2911185029Spjd
2912219089Spjd	ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
2913219089Spjd	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
2914219089Spjd	ZFS_TIME_DECODE(&vap->va_ctime, ctime);
2915219089Spjd	ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
2916168404Spjd
2917168404Spjd	mutex_exit(&zp->z_lock);
2918168404Spjd
2919219089Spjd	sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
2920168404Spjd	vap->va_blksize = blksize;
2921168404Spjd	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */
2922168404Spjd
2923168404Spjd	if (zp->z_blksz == 0) {
2924168404Spjd		/*
2925168404Spjd		 * Block size hasn't been set; suggest maximal I/O transfers.
2926168404Spjd		 */
2927168404Spjd		vap->va_blksize = zfsvfs->z_max_blksz;
2928168404Spjd	}
2929168404Spjd
2930168404Spjd	ZFS_EXIT(zfsvfs);
2931168404Spjd	return (0);
2932168404Spjd}
2933168404Spjd
2934168404Spjd/*
2935168404Spjd * Set the file attributes to the values contained in the
2936168404Spjd * vattr structure.
2937168404Spjd *
2938168404Spjd *	IN:	vp	- vnode of file to be modified.
2939168404Spjd *		vap	- new attribute values.
2940185029Spjd *			  If AT_XVATTR set, then optional attrs are being set
2941168404Spjd *		flags	- ATTR_UTIME set if non-default time values provided.
2942185029Spjd *			- ATTR_NOACLCHECK (CIFS context only).
2943168404Spjd *		cr	- credentials of caller.
2944185029Spjd *		ct	- caller context
2945168404Spjd *
2946168404Spjd *	RETURN:	0 if success
2947168404Spjd *		error code if failure
2948168404Spjd *
2949168404Spjd * Timestamps:
2950168404Spjd *	vp - ctime updated, mtime updated if size changed.
2951168404Spjd */
2952168404Spjd/* ARGSUSED */
2953168404Spjdstatic int
2954168962Spjdzfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2955168962Spjd	caller_context_t *ct)
2956168404Spjd{
2957185029Spjd	znode_t		*zp = VTOZ(vp);
2958168404Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2959185029Spjd	zilog_t		*zilog;
2960168404Spjd	dmu_tx_t	*tx;
2961168404Spjd	vattr_t		oldva;
2962209962Smm	xvattr_t	tmpxvattr;
2963168962Spjd	uint_t		mask = vap->va_mask;
2964247187Smm	uint_t		saved_mask = 0;
2965197831Spjd	uint64_t	saved_mode;
2966168404Spjd	int		trim_mask = 0;
2967168404Spjd	uint64_t	new_mode;
2968209962Smm	uint64_t	new_uid, new_gid;
2969219089Spjd	uint64_t	xattr_obj;
2970219089Spjd	uint64_t	mtime[2], ctime[2];
2971168404Spjd	znode_t		*attrzp;
2972168404Spjd	int		need_policy = FALSE;
2973219089Spjd	int		err, err2;
2974185029Spjd	zfs_fuid_info_t *fuidp = NULL;
2975185029Spjd	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
2976185029Spjd	xoptattr_t	*xoap;
2977219089Spjd	zfs_acl_t	*aclp;
2978185029Spjd	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2979219089Spjd	boolean_t	fuid_dirtied = B_FALSE;
2980219089Spjd	sa_bulk_attr_t	bulk[7], xattr_bulk[7];
2981219089Spjd	int		count = 0, xattr_count = 0;
2982168404Spjd
2983168404Spjd	if (mask == 0)
2984168404Spjd		return (0);
2985168404Spjd
2986168962Spjd	if (mask & AT_NOSET)
2987249195Smm		return (SET_ERROR(EINVAL));
2988168962Spjd
2989185029Spjd	ZFS_ENTER(zfsvfs);
2990185029Spjd	ZFS_VERIFY_ZP(zp);
2991185029Spjd
2992185029Spjd	zilog = zfsvfs->z_log;
2993185029Spjd
2994185029Spjd	/*
2995185029Spjd	 * Make sure that if we have ephemeral uid/gid or xvattr specified
2996185029Spjd	 * that file system is at proper version level
2997185029Spjd	 */
2998185029Spjd
2999185029Spjd	if (zfsvfs->z_use_fuids == B_FALSE &&
3000185029Spjd	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
3001185029Spjd	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
3002185029Spjd	    (mask & AT_XVATTR))) {
3003185029Spjd		ZFS_EXIT(zfsvfs);
3004249195Smm		return (SET_ERROR(EINVAL));
3005185029Spjd	}
3006185029Spjd
3007185029Spjd	if (mask & AT_SIZE && vp->v_type == VDIR) {
3008185029Spjd		ZFS_EXIT(zfsvfs);
3009249195Smm		return (SET_ERROR(EISDIR));
3010185029Spjd	}
3011168404Spjd
3012185029Spjd	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
3013185029Spjd		ZFS_EXIT(zfsvfs);
3014249195Smm		return (SET_ERROR(EINVAL));
3015185029Spjd	}
3016168404Spjd
3017185029Spjd	/*
3018185029Spjd	 * If this is an xvattr_t, then get a pointer to the structure of
3019185029Spjd	 * optional attributes.  If this is NULL, then we have a vattr_t.
3020185029Spjd	 */
3021185029Spjd	xoap = xva_getxoptattr(xvap);
3022168404Spjd
3023209962Smm	xva_init(&tmpxvattr);
3024209962Smm
3025185029Spjd	/*
3026185029Spjd	 * Immutable files can only alter immutable bit and atime
3027185029Spjd	 */
3028219089Spjd	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
3029185029Spjd	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
3030185029Spjd	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
3031185029Spjd		ZFS_EXIT(zfsvfs);
3032249195Smm		return (SET_ERROR(EPERM));
3033185029Spjd	}
3034185029Spjd
3035219089Spjd	if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
3036185029Spjd		ZFS_EXIT(zfsvfs);
3037249195Smm		return (SET_ERROR(EPERM));
3038185029Spjd	}
3039185029Spjd
3040185029Spjd	/*
3041185029Spjd	 * Verify timestamps doesn't overflow 32 bits.
3042185029Spjd	 * ZFS can handle large timestamps, but 32bit syscalls can't
3043185029Spjd	 * handle times greater than 2039.  This check should be removed
3044185029Spjd	 * once large timestamps are fully supported.
3045185029Spjd	 */
3046185029Spjd	if (mask & (AT_ATIME | AT_MTIME)) {
3047185029Spjd		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
3048185029Spjd		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
3049185029Spjd			ZFS_EXIT(zfsvfs);
3050249195Smm			return (SET_ERROR(EOVERFLOW));
3051185029Spjd		}
3052185029Spjd	}
3053185029Spjd
3054168404Spjdtop:
3055168404Spjd	attrzp = NULL;
3056219089Spjd	aclp = NULL;
3057168404Spjd
3058211932Smm	/* Can this be moved to before the top label? */
3059168404Spjd	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
3060168404Spjd		ZFS_EXIT(zfsvfs);
3061249195Smm		return (SET_ERROR(EROFS));
3062168404Spjd	}
3063168404Spjd
3064168404Spjd	/*
3065168404Spjd	 * First validate permissions
3066168404Spjd	 */
3067168404Spjd
3068168404Spjd	if (mask & AT_SIZE) {
3069168404Spjd		/*
3070168404Spjd		 * XXX - Note, we are not providing any open
3071168404Spjd		 * mode flags here (like FNDELAY), so we may
3072168404Spjd		 * block if there are locks present... this
3073168404Spjd		 * should be addressed in openat().
3074168404Spjd		 */
3075185029Spjd		/* XXX - would it be OK to generate a log record here? */
3076185029Spjd		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
3077168404Spjd		if (err) {
3078168404Spjd			ZFS_EXIT(zfsvfs);
3079168404Spjd			return (err);
3080168404Spjd		}
3081168404Spjd	}
3082168404Spjd
3083185029Spjd	if (mask & (AT_ATIME|AT_MTIME) ||
3084185029Spjd	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
3085185029Spjd	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
3086185029Spjd	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
3087219089Spjd	    XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
3088219089Spjd	    XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
3089185029Spjd	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
3090219089Spjd	    XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
3091185029Spjd		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
3092185029Spjd		    skipaclchk, cr);
3093219089Spjd	}
3094168404Spjd
3095168404Spjd	if (mask & (AT_UID|AT_GID)) {
3096168404Spjd		int	idmask = (mask & (AT_UID|AT_GID));
3097168404Spjd		int	take_owner;
3098168404Spjd		int	take_group;
3099168404Spjd
3100168404Spjd		/*
3101168404Spjd		 * NOTE: even if a new mode is being set,
3102168404Spjd		 * we may clear S_ISUID/S_ISGID bits.
3103168404Spjd		 */
3104168404Spjd
3105168404Spjd		if (!(mask & AT_MODE))
3106219089Spjd			vap->va_mode = zp->z_mode;
3107168404Spjd
3108168404Spjd		/*
3109168404Spjd		 * Take ownership or chgrp to group we are a member of
3110168404Spjd		 */
3111168404Spjd
3112168404Spjd		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
3113185029Spjd		take_group = (mask & AT_GID) &&
3114185029Spjd		    zfs_groupmember(zfsvfs, vap->va_gid, cr);
3115168404Spjd
3116168404Spjd		/*
3117168404Spjd		 * If both AT_UID and AT_GID are set then take_owner and
3118168404Spjd		 * take_group must both be set in order to allow taking
3119168404Spjd		 * ownership.
3120168404Spjd		 *
3121168404Spjd		 * Otherwise, send the check through secpolicy_vnode_setattr()
3122168404Spjd		 *
3123168404Spjd		 */
3124168404Spjd
3125168404Spjd		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
3126168404Spjd		    ((idmask == AT_UID) && take_owner) ||
3127168404Spjd		    ((idmask == AT_GID) && take_group)) {
3128185029Spjd			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
3129185029Spjd			    skipaclchk, cr) == 0) {
3130168404Spjd				/*
3131168404Spjd				 * Remove setuid/setgid for non-privileged users
3132168404Spjd				 */
3133185029Spjd				secpolicy_setid_clear(vap, vp, cr);
3134168404Spjd				trim_mask = (mask & (AT_UID|AT_GID));
3135168404Spjd			} else {
3136168404Spjd				need_policy =  TRUE;
3137168404Spjd			}
3138168404Spjd		} else {
3139168404Spjd			need_policy =  TRUE;
3140168404Spjd		}
3141168404Spjd	}
3142168404Spjd
3143168404Spjd	mutex_enter(&zp->z_lock);
3144219089Spjd	oldva.va_mode = zp->z_mode;
3145185029Spjd	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
3146185029Spjd	if (mask & AT_XVATTR) {
3147209962Smm		/*
3148209962Smm		 * Update xvattr mask to include only those attributes
3149209962Smm		 * that are actually changing.
3150209962Smm		 *
3151209962Smm		 * the bits will be restored prior to actually setting
3152209962Smm		 * the attributes so the caller thinks they were set.
3153209962Smm		 */
3154209962Smm		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
3155209962Smm			if (xoap->xoa_appendonly !=
3156219089Spjd			    ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
3157209962Smm				need_policy = TRUE;
3158209962Smm			} else {
3159209962Smm				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
3160209962Smm				XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
3161209962Smm			}
3162209962Smm		}
3163209962Smm
3164209962Smm		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
3165209962Smm			if (xoap->xoa_nounlink !=
3166219089Spjd			    ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
3167209962Smm				need_policy = TRUE;
3168209962Smm			} else {
3169209962Smm				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
3170209962Smm				XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
3171209962Smm			}
3172209962Smm		}
3173209962Smm
3174209962Smm		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
3175209962Smm			if (xoap->xoa_immutable !=
3176219089Spjd			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
3177209962Smm				need_policy = TRUE;
3178209962Smm			} else {
3179209962Smm				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
3180209962Smm				XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
3181209962Smm			}
3182209962Smm		}
3183209962Smm
3184209962Smm		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
3185209962Smm			if (xoap->xoa_nodump !=
3186219089Spjd			    ((zp->z_pflags & ZFS_NODUMP) != 0)) {
3187209962Smm				need_policy = TRUE;
3188209962Smm			} else {
3189209962Smm				XVA_CLR_REQ(xvap, XAT_NODUMP);
3190209962Smm				XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
3191209962Smm			}
3192209962Smm		}
3193209962Smm
3194209962Smm		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
3195209962Smm			if (xoap->xoa_av_modified !=
3196219089Spjd			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
3197209962Smm				need_policy = TRUE;
3198209962Smm			} else {
3199209962Smm				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
3200209962Smm				XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
3201209962Smm			}
3202209962Smm		}
3203209962Smm
3204209962Smm		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
3205209962Smm			if ((vp->v_type != VREG &&
3206209962Smm			    xoap->xoa_av_quarantined) ||
3207209962Smm			    xoap->xoa_av_quarantined !=
3208219089Spjd			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
3209209962Smm				need_policy = TRUE;
3210209962Smm			} else {
3211209962Smm				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
3212209962Smm				XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
3213209962Smm			}
3214209962Smm		}
3215209962Smm
3216219089Spjd		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
3217219089Spjd			mutex_exit(&zp->z_lock);
3218219089Spjd			ZFS_EXIT(zfsvfs);
3219249195Smm			return (SET_ERROR(EPERM));
3220219089Spjd		}
3221219089Spjd
3222209962Smm		if (need_policy == FALSE &&
3223209962Smm		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
3224209962Smm		    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
3225185029Spjd			need_policy = TRUE;
3226185029Spjd		}
3227185029Spjd	}
3228185029Spjd
3229168404Spjd	mutex_exit(&zp->z_lock);
3230168404Spjd
3231168404Spjd	if (mask & AT_MODE) {
3232185029Spjd		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
3233168962Spjd			err = secpolicy_setid_setsticky_clear(vp, vap,
3234168962Spjd			    &oldva, cr);
3235168962Spjd			if (err) {
3236168962Spjd				ZFS_EXIT(zfsvfs);
3237168962Spjd				return (err);
3238168962Spjd			}
3239168404Spjd			trim_mask |= AT_MODE;
3240168404Spjd		} else {
3241168404Spjd			need_policy = TRUE;
3242168404Spjd		}
3243168404Spjd	}
3244168404Spjd
3245168404Spjd	if (need_policy) {
3246168404Spjd		/*
3247168404Spjd		 * If trim_mask is set then take ownership
3248168404Spjd		 * has been granted or write_acl is present and user
3249168404Spjd		 * has the ability to modify mode.  In that case remove
3250168404Spjd		 * UID|GID and or MODE from mask so that
3251168404Spjd		 * secpolicy_vnode_setattr() doesn't revoke it.
3252168404Spjd		 */
3253168404Spjd
3254168404Spjd		if (trim_mask) {
3255168404Spjd			saved_mask = vap->va_mask;
3256168404Spjd			vap->va_mask &= ~trim_mask;
3257197831Spjd			if (trim_mask & AT_MODE) {
3258197831Spjd				/*
3259197831Spjd				 * Save the mode, as secpolicy_vnode_setattr()
3260197831Spjd				 * will overwrite it with ova.va_mode.
3261197831Spjd				 */
3262197831Spjd				saved_mode = vap->va_mode;
3263197831Spjd			}
3264168404Spjd		}
3265168404Spjd		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
3266185029Spjd		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
3267168404Spjd		if (err) {
3268168404Spjd			ZFS_EXIT(zfsvfs);
3269168404Spjd			return (err);
3270168404Spjd		}
3271168404Spjd
3272197831Spjd		if (trim_mask) {
3273168404Spjd			vap->va_mask |= saved_mask;
3274197831Spjd			if (trim_mask & AT_MODE) {
3275197831Spjd				/*
3276197831Spjd				 * Recover the mode after
3277197831Spjd				 * secpolicy_vnode_setattr().
3278197831Spjd				 */
3279197831Spjd				vap->va_mode = saved_mode;
3280197831Spjd			}
3281197831Spjd		}
3282168404Spjd	}
3283168404Spjd
3284168404Spjd	/*
3285168404Spjd	 * secpolicy_vnode_setattr, or take ownership may have
3286168404Spjd	 * changed va_mask
3287168404Spjd	 */
3288168404Spjd	mask = vap->va_mask;
3289168404Spjd
3290219089Spjd	if ((mask & (AT_UID | AT_GID))) {
3291219089Spjd		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
3292219089Spjd		    &xattr_obj, sizeof (xattr_obj));
3293168404Spjd
3294219089Spjd		if (err == 0 && xattr_obj) {
3295219089Spjd			err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
3296209962Smm			if (err)
3297219089Spjd				goto out2;
3298168404Spjd		}
3299209962Smm		if (mask & AT_UID) {
3300209962Smm			new_uid = zfs_fuid_create(zfsvfs,
3301209962Smm			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
3302219089Spjd			if (new_uid != zp->z_uid &&
3303219089Spjd			    zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
3304219089Spjd				if (attrzp)
3305219089Spjd					VN_RELE(ZTOV(attrzp));
3306249195Smm				err = SET_ERROR(EDQUOT);
3307219089Spjd				goto out2;
3308209962Smm			}
3309209962Smm		}
3310209962Smm
3311209962Smm		if (mask & AT_GID) {
3312209962Smm			new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
3313209962Smm			    cr, ZFS_GROUP, &fuidp);
3314219089Spjd			if (new_gid != zp->z_gid &&
3315219089Spjd			    zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
3316219089Spjd				if (attrzp)
3317219089Spjd					VN_RELE(ZTOV(attrzp));
3318249195Smm				err = SET_ERROR(EDQUOT);
3319219089Spjd				goto out2;
3320209962Smm			}
3321209962Smm		}
3322219089Spjd	}
3323219089Spjd	tx = dmu_tx_create(zfsvfs->z_os);
3324219089Spjd
3325219089Spjd	if (mask & AT_MODE) {
3326219089Spjd		uint64_t pmode = zp->z_mode;
3327219089Spjd		uint64_t acl_obj;
3328219089Spjd		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
3329219089Spjd
3330243560Smm		if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
3331243560Smm		    !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
3332249195Smm			err = SET_ERROR(EPERM);
3333243560Smm			goto out;
3334243560Smm		}
3335243560Smm
3336224174Smm		if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
3337224174Smm			goto out;
3338219089Spjd
3339219089Spjd		mutex_enter(&zp->z_lock);
3340219089Spjd		if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
3341219089Spjd			/*
3342219089Spjd			 * Are we upgrading ACL from old V0 format
3343219089Spjd			 * to V1 format?
3344219089Spjd			 */
3345219089Spjd			if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
3346219089Spjd			    zfs_znode_acl_version(zp) ==
3347219089Spjd			    ZFS_ACL_VERSION_INITIAL) {
3348219089Spjd				dmu_tx_hold_free(tx, acl_obj, 0,
3349219089Spjd				    DMU_OBJECT_END);
3350219089Spjd				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3351219089Spjd				    0, aclp->z_acl_bytes);
3352209962Smm			} else {
3353219089Spjd				dmu_tx_hold_write(tx, acl_obj, 0,
3354219089Spjd				    aclp->z_acl_bytes);
3355209962Smm			}
3356219089Spjd		} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3357219089Spjd			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3358219089Spjd			    0, aclp->z_acl_bytes);
3359209962Smm		}
3360219089Spjd		mutex_exit(&zp->z_lock);
3361219089Spjd		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3362219089Spjd	} else {
3363219089Spjd		if ((mask & AT_XVATTR) &&
3364219089Spjd		    XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3365219089Spjd			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3366219089Spjd		else
3367219089Spjd			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3368168404Spjd	}
3369168404Spjd
3370219089Spjd	if (attrzp) {
3371219089Spjd		dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3372219089Spjd	}
3373219089Spjd
3374219089Spjd	fuid_dirtied = zfsvfs->z_fuid_dirty;
3375219089Spjd	if (fuid_dirtied)
3376219089Spjd		zfs_fuid_txhold(zfsvfs, tx);
3377219089Spjd
3378219089Spjd	zfs_sa_upgrade_txholds(tx, zp);
3379219089Spjd
3380209962Smm	err = dmu_tx_assign(tx, TXG_NOWAIT);
3381168404Spjd	if (err) {
3382209962Smm		if (err == ERESTART)
3383168404Spjd			dmu_tx_wait(tx);
3384209962Smm		goto out;
3385168404Spjd	}
3386168404Spjd
3387219089Spjd	count = 0;
3388168404Spjd	/*
3389168404Spjd	 * Set each attribute requested.
3390168404Spjd	 * We group settings according to the locks they need to acquire.
3391168404Spjd	 *
3392168404Spjd	 * Note: you cannot set ctime directly, although it will be
3393168404Spjd	 * updated as a side-effect of calling this function.
3394168404Spjd	 */
3395168404Spjd
3396219089Spjd
3397219089Spjd	if (mask & (AT_UID|AT_GID|AT_MODE))
3398219089Spjd		mutex_enter(&zp->z_acl_lock);
3399168404Spjd	mutex_enter(&zp->z_lock);
3400168404Spjd
3401219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3402219089Spjd	    &zp->z_pflags, sizeof (zp->z_pflags));
3403219089Spjd
3404219089Spjd	if (attrzp) {
3405219089Spjd		if (mask & (AT_UID|AT_GID|AT_MODE))
3406219089Spjd			mutex_enter(&attrzp->z_acl_lock);
3407219089Spjd		mutex_enter(&attrzp->z_lock);
3408219089Spjd		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3409219089Spjd		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
3410219089Spjd		    sizeof (attrzp->z_pflags));
3411219089Spjd	}
3412219089Spjd
3413219089Spjd	if (mask & (AT_UID|AT_GID)) {
3414219089Spjd
3415219089Spjd		if (mask & AT_UID) {
3416219089Spjd			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
3417219089Spjd			    &new_uid, sizeof (new_uid));
3418219089Spjd			zp->z_uid = new_uid;
3419219089Spjd			if (attrzp) {
3420219089Spjd				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3421219089Spjd				    SA_ZPL_UID(zfsvfs), NULL, &new_uid,
3422219089Spjd				    sizeof (new_uid));
3423219089Spjd				attrzp->z_uid = new_uid;
3424219089Spjd			}
3425219089Spjd		}
3426219089Spjd
3427219089Spjd		if (mask & AT_GID) {
3428219089Spjd			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
3429219089Spjd			    NULL, &new_gid, sizeof (new_gid));
3430219089Spjd			zp->z_gid = new_gid;
3431219089Spjd			if (attrzp) {
3432219089Spjd				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3433219089Spjd				    SA_ZPL_GID(zfsvfs), NULL, &new_gid,
3434219089Spjd				    sizeof (new_gid));
3435219089Spjd				attrzp->z_gid = new_gid;
3436219089Spjd			}
3437219089Spjd		}
3438219089Spjd		if (!(mask & AT_MODE)) {
3439219089Spjd			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
3440219089Spjd			    NULL, &new_mode, sizeof (new_mode));
3441219089Spjd			new_mode = zp->z_mode;
3442219089Spjd		}
3443219089Spjd		err = zfs_acl_chown_setattr(zp);
3444219089Spjd		ASSERT(err == 0);
3445219089Spjd		if (attrzp) {
3446219089Spjd			err = zfs_acl_chown_setattr(attrzp);
3447219089Spjd			ASSERT(err == 0);
3448219089Spjd		}
3449219089Spjd	}
3450219089Spjd
3451168404Spjd	if (mask & AT_MODE) {
3452219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
3453219089Spjd		    &new_mode, sizeof (new_mode));
3454219089Spjd		zp->z_mode = new_mode;
3455219089Spjd		ASSERT3U((uintptr_t)aclp, !=, 0);
3456209962Smm		err = zfs_aclset_common(zp, aclp, cr, tx);
3457240415Smm		ASSERT0(err);
3458219089Spjd		if (zp->z_acl_cached)
3459219089Spjd			zfs_acl_free(zp->z_acl_cached);
3460211932Smm		zp->z_acl_cached = aclp;
3461211932Smm		aclp = NULL;
3462168404Spjd	}
3463168404Spjd
3464168404Spjd
3465219089Spjd	if (mask & AT_ATIME) {
3466219089Spjd		ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
3467219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
3468219089Spjd		    &zp->z_atime, sizeof (zp->z_atime));
3469168404Spjd	}
3470168404Spjd
3471219089Spjd	if (mask & AT_MTIME) {
3472219089Spjd		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
3473219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
3474219089Spjd		    mtime, sizeof (mtime));
3475168404Spjd	}
3476168404Spjd
3477185029Spjd	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
3478219089Spjd	if (mask & AT_SIZE && !(mask & AT_MTIME)) {
3479219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
3480219089Spjd		    NULL, mtime, sizeof (mtime));
3481219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3482219089Spjd		    &ctime, sizeof (ctime));
3483219089Spjd		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
3484219089Spjd		    B_TRUE);
3485219089Spjd	} else if (mask != 0) {
3486219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3487219089Spjd		    &ctime, sizeof (ctime));
3488219089Spjd		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
3489219089Spjd		    B_TRUE);
3490219089Spjd		if (attrzp) {
3491219089Spjd			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3492219089Spjd			    SA_ZPL_CTIME(zfsvfs), NULL,
3493219089Spjd			    &ctime, sizeof (ctime));
3494219089Spjd			zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
3495219089Spjd			    mtime, ctime, B_TRUE);
3496219089Spjd		}
3497219089Spjd	}
3498185029Spjd	/*
3499185029Spjd	 * Do this after setting timestamps to prevent timestamp
3500185029Spjd	 * update from toggling bit
3501185029Spjd	 */
3502168404Spjd
3503185029Spjd	if (xoap && (mask & AT_XVATTR)) {
3504209962Smm
3505209962Smm		/*
3506209962Smm		 * restore trimmed off masks
3507209962Smm		 * so that return masks can be set for caller.
3508209962Smm		 */
3509209962Smm
3510209962Smm		if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
3511209962Smm			XVA_SET_REQ(xvap, XAT_APPENDONLY);
3512209962Smm		}
3513209962Smm		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
3514209962Smm			XVA_SET_REQ(xvap, XAT_NOUNLINK);
3515209962Smm		}
3516209962Smm		if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
3517209962Smm			XVA_SET_REQ(xvap, XAT_IMMUTABLE);
3518209962Smm		}
3519209962Smm		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
3520209962Smm			XVA_SET_REQ(xvap, XAT_NODUMP);
3521209962Smm		}
3522209962Smm		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
3523209962Smm			XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
3524209962Smm		}
3525209962Smm		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
3526209962Smm			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
3527209962Smm		}
3528209962Smm
3529219089Spjd		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3530185029Spjd			ASSERT(vp->v_type == VREG);
3531185029Spjd
3532219089Spjd		zfs_xvattr_set(zp, xvap, tx);
3533185029Spjd	}
3534185029Spjd
3535209962Smm	if (fuid_dirtied)
3536209962Smm		zfs_fuid_sync(zfsvfs, tx);
3537209962Smm
3538168404Spjd	if (mask != 0)
3539185029Spjd		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3540168404Spjd
3541168404Spjd	mutex_exit(&zp->z_lock);
3542219089Spjd	if (mask & (AT_UID|AT_GID|AT_MODE))
3543219089Spjd		mutex_exit(&zp->z_acl_lock);
3544168404Spjd
3545219089Spjd	if (attrzp) {
3546219089Spjd		if (mask & (AT_UID|AT_GID|AT_MODE))
3547219089Spjd			mutex_exit(&attrzp->z_acl_lock);
3548219089Spjd		mutex_exit(&attrzp->z_lock);
3549219089Spjd	}
3550209962Smmout:
3551219089Spjd	if (err == 0 && attrzp) {
3552219089Spjd		err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
3553219089Spjd		    xattr_count, tx);
3554219089Spjd		ASSERT(err2 == 0);
3555219089Spjd	}
3556219089Spjd
3557168404Spjd	if (attrzp)
3558168404Spjd		VN_RELE(ZTOV(attrzp));
3559211932Smm	if (aclp)
3560209962Smm		zfs_acl_free(aclp);
3561168404Spjd
3562209962Smm	if (fuidp) {
3563209962Smm		zfs_fuid_info_free(fuidp);
3564209962Smm		fuidp = NULL;
3565209962Smm	}
3566209962Smm
3567219089Spjd	if (err) {
3568209962Smm		dmu_tx_abort(tx);
3569219089Spjd		if (err == ERESTART)
3570219089Spjd			goto top;
3571219089Spjd	} else {
3572219089Spjd		err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
3573209962Smm		dmu_tx_commit(tx);
3574219089Spjd	}
3575209962Smm
3576219089Spjdout2:
3577219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3578219089Spjd		zil_commit(zilog, 0);
3579209962Smm
3580168404Spjd	ZFS_EXIT(zfsvfs);
3581168404Spjd	return (err);
3582168404Spjd}
3583168404Spjd
3584168404Spjdtypedef struct zfs_zlock {
3585168404Spjd	krwlock_t	*zl_rwlock;	/* lock we acquired */
3586168404Spjd	znode_t		*zl_znode;	/* znode we held */
3587168404Spjd	struct zfs_zlock *zl_next;	/* next in list */
3588168404Spjd} zfs_zlock_t;
3589168404Spjd
3590168404Spjd/*
3591168404Spjd * Drop locks and release vnodes that were held by zfs_rename_lock().
3592168404Spjd */
3593168404Spjdstatic void
3594168404Spjdzfs_rename_unlock(zfs_zlock_t **zlpp)
3595168404Spjd{
3596168404Spjd	zfs_zlock_t *zl;
3597168404Spjd
3598168404Spjd	while ((zl = *zlpp) != NULL) {
3599168404Spjd		if (zl->zl_znode != NULL)
3600168404Spjd			VN_RELE(ZTOV(zl->zl_znode));
3601168404Spjd		rw_exit(zl->zl_rwlock);
3602168404Spjd		*zlpp = zl->zl_next;
3603168404Spjd		kmem_free(zl, sizeof (*zl));
3604168404Spjd	}
3605168404Spjd}
3606168404Spjd
3607168404Spjd/*
3608168404Spjd * Search back through the directory tree, using the ".." entries.
3609168404Spjd * Lock each directory in the chain to prevent concurrent renames.
3610168404Spjd * Fail any attempt to move a directory into one of its own descendants.
3611168404Spjd * XXX - z_parent_lock can overlap with map or grow locks
3612168404Spjd */
3613168404Spjdstatic int
3614168404Spjdzfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
3615168404Spjd{
3616168404Spjd	zfs_zlock_t	*zl;
3617168404Spjd	znode_t		*zp = tdzp;
3618168404Spjd	uint64_t	rootid = zp->z_zfsvfs->z_root;
3619219089Spjd	uint64_t	oidp = zp->z_id;
3620168404Spjd	krwlock_t	*rwlp = &szp->z_parent_lock;
3621168404Spjd	krw_t		rw = RW_WRITER;
3622168404Spjd
3623168404Spjd	/*
3624168404Spjd	 * First pass write-locks szp and compares to zp->z_id.
3625168404Spjd	 * Later passes read-lock zp and compare to zp->z_parent.
3626168404Spjd	 */
3627168404Spjd	do {
3628168404Spjd		if (!rw_tryenter(rwlp, rw)) {
3629168404Spjd			/*
3630168404Spjd			 * Another thread is renaming in this path.
3631168404Spjd			 * Note that if we are a WRITER, we don't have any
3632168404Spjd			 * parent_locks held yet.
3633168404Spjd			 */
3634168404Spjd			if (rw == RW_READER && zp->z_id > szp->z_id) {
3635168404Spjd				/*
3636168404Spjd				 * Drop our locks and restart
3637168404Spjd				 */
3638168404Spjd				zfs_rename_unlock(&zl);
3639168404Spjd				*zlpp = NULL;
3640168404Spjd				zp = tdzp;
3641219089Spjd				oidp = zp->z_id;
3642168404Spjd				rwlp = &szp->z_parent_lock;
3643168404Spjd				rw = RW_WRITER;
3644168404Spjd				continue;
3645168404Spjd			} else {
3646168404Spjd				/*
3647168404Spjd				 * Wait for other thread to drop its locks
3648168404Spjd				 */
3649168404Spjd				rw_enter(rwlp, rw);
3650168404Spjd			}
3651168404Spjd		}
3652168404Spjd
3653168404Spjd		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
3654168404Spjd		zl->zl_rwlock = rwlp;
3655168404Spjd		zl->zl_znode = NULL;
3656168404Spjd		zl->zl_next = *zlpp;
3657168404Spjd		*zlpp = zl;
3658168404Spjd
3659219089Spjd		if (oidp == szp->z_id)		/* We're a descendant of szp */
3660249195Smm			return (SET_ERROR(EINVAL));
3661168404Spjd
3662219089Spjd		if (oidp == rootid)		/* We've hit the top */
3663168404Spjd			return (0);
3664168404Spjd
3665168404Spjd		if (rw == RW_READER) {		/* i.e. not the first pass */
3666219089Spjd			int error = zfs_zget(zp->z_zfsvfs, oidp, &zp);
3667168404Spjd			if (error)
3668168404Spjd				return (error);
3669168404Spjd			zl->zl_znode = zp;
3670168404Spjd		}
3671219089Spjd		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs),
3672219089Spjd		    &oidp, sizeof (oidp));
3673168404Spjd		rwlp = &zp->z_parent_lock;
3674168404Spjd		rw = RW_READER;
3675168404Spjd
3676168404Spjd	} while (zp->z_id != sdzp->z_id);
3677168404Spjd
3678168404Spjd	return (0);
3679168404Spjd}
3680168404Spjd
/*
 * Move an entry from the provided source directory to the target
 * directory.  Change the entry name as indicated.
 *
 *	IN:	sdvp	- Source directory containing the "old entry".
 *		snm	- Old entry name.
 *		tdvp	- Target directory to contain the "new entry".
 *		tnm	- New entry name.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		flags	- case flags
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	sdvp,tdvp - ctime|mtime updated
 *
 * The transaction is assigned with TXG_NOWAIT.  If that returns ERESTART,
 * everything acquired since the "top" label is released, the function
 * waits on the transaction and then starts over from "top".
 */
/*ARGSUSED*/
static int
zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
    caller_context_t *ct, int flags)
{
	znode_t		*tdzp, *szp, *tzp;
	znode_t		*sdzp = VTOZ(sdvp);
	zfsvfs_t	*zfsvfs = sdzp->z_zfsvfs;
	zilog_t		*zilog;
	vnode_t		*realvp;
	zfs_dirlock_t	*sdl, *tdl;
	dmu_tx_t	*tx;
	zfs_zlock_t	*zl;
	int		cmp, serr, terr;
	int		error = 0;
	int		zflg = 0;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(sdzp);
	zilog = zfsvfs->z_log;

	/*
	 * Make sure we have the real vp for the target directory.
	 */
	if (VOP_REALVP(tdvp, &realvp, ct) == 0)
		tdvp = realvp;

	/*
	 * Refuse a rename whose target is on a different vfs or is a
	 * zfsctl node.
	 */
	if (tdvp->v_vfsp != sdvp->v_vfsp || zfsctl_is_node(tdvp)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EXDEV));
	}

	tdzp = VTOZ(tdvp);
	ZFS_VERIFY_ZP(tdzp);
	/* On a UTF-8-only filesystem the new name must validate. */
	if (zfsvfs->z_utf8 && u8_validate(tnm,
	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	if (flags & FIGNORECASE)
		zflg |= ZCILOOK;

	/*
	 * Restart point for the ERESTART retry.  The per-attempt state
	 * (szp, tzp, zl) is reset here; zflg keeps whatever bits an earlier
	 * attempt set.
	 */
top:
	szp = NULL;
	tzp = NULL;
	zl = NULL;

	/*
	 * This is to prevent the creation of links into attribute space
	 * by renaming a linked file into/outof an attribute directory.
	 * See the comment in zfs_link() for why this is considered bad.
	 */
	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Lock source and target directory entries.  To prevent deadlock,
	 * a lock ordering must be defined.  We lock the directory with
	 * the smallest object id first, or if it's a tie, the one with
	 * the lexically first name.
	 */
	if (sdzp->z_id < tdzp->z_id) {
		cmp = -1;
	} else if (sdzp->z_id > tdzp->z_id) {
		cmp = 1;
	} else {
		/*
		 * First compare the two name arguments without
		 * considering any case folding.
		 */
		int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);

		cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
		ASSERT(error == 0 || !zfsvfs->z_utf8);
		if (cmp == 0) {
			/*
			 * POSIX: "If the old argument and the new argument
			 * both refer to links to the same existing file,
			 * the rename() function shall return successfully
			 * and perform no other action."
			 */
			ZFS_EXIT(zfsvfs);
			return (0);
		}
		/*
		 * If the file system is case-folding, then we may
		 * have some more checking to do.  A case-folding file
		 * system is either supporting mixed case sensitivity
		 * access or is completely case-insensitive.  Note
		 * that the file system is always case preserving.
		 *
		 * In mixed sensitivity mode case sensitive behavior
		 * is the default.  FIGNORECASE must be used to
		 * explicitly request case insensitive behavior.
		 *
		 * If the source and target names provided differ only
		 * by case (e.g., a request to rename 'tim' to 'Tim'),
		 * we will treat this as a special case in the
		 * case-insensitive mode: as long as the source name
		 * is an exact match, we will allow this to proceed as
		 * a name-change request.
		 */
		if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
		    (zfsvfs->z_case == ZFS_CASE_MIXED &&
		    flags & FIGNORECASE)) &&
		    u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
		    &error) == 0) {
			/*
			 * case preserving rename request, require exact
			 * name matches
			 */
			zflg |= ZCIEXACT;
			zflg &= ~ZCILOOK;
		}
	}

	/*
	 * If the source and destination directories are the same, we should
	 * grab the z_name_lock of that directory only once.
	 * (ZHAVELOCK presumably tells zfs_dirent_lock() that the caller
	 * already holds z_name_lock -- confirm against zfs_dir.c.)
	 */
	if (sdzp == tdzp) {
		zflg |= ZHAVELOCK;
		rw_enter(&sdzp->z_name_lock, RW_READER);
	}

	/*
	 * Take the two dirent locks in the order chosen above: source
	 * first when cmp < 0, target first otherwise.  Whichever entry is
	 * locked second is passed ZRENAMING.
	 */
	if (cmp < 0) {
		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
		    ZEXISTS | zflg, NULL, NULL);
		terr = zfs_dirent_lock(&tdl,
		    tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
	} else {
		terr = zfs_dirent_lock(&tdl,
		    tdzp, tnm, &tzp, zflg, NULL, NULL);
		serr = zfs_dirent_lock(&sdl,
		    sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
		    NULL, NULL);
	}

	if (serr) {
		/*
		 * Source entry invalid or not there.
		 */
		if (!terr) {
			/* The target lock did succeed, so undo it. */
			zfs_dirent_unlock(tdl);
			if (tzp)
				VN_RELE(ZTOV(tzp));
		}

		if (sdzp == tdzp)
			rw_exit(&sdzp->z_name_lock);

		/*
		 * FreeBSD: In OpenSolaris they only check if rename source is
		 * ".." here, because "." is handled in their lookup. This is
		 * not the case for FreeBSD, so we check for "." explicitly.
		 */
		if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0)
			serr = SET_ERROR(EINVAL);
		ZFS_EXIT(zfsvfs);
		return (serr);
	}
	if (terr) {
		/* Source was locked successfully; release it and its hold. */
		zfs_dirent_unlock(sdl);
		VN_RELE(ZTOV(szp));

		if (sdzp == tdzp)
			rw_exit(&sdzp->z_name_lock);

		if (strcmp(tnm, "..") == 0)
			terr = SET_ERROR(EINVAL);
		ZFS_EXIT(zfsvfs);
		return (terr);
	}

	/*
	 * Must have write access at the source to remove the old entry
	 * and write access at the target to create the new entry.
	 * Note that if target and source are the same, this can be
	 * done in a single check.
	 */

	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
		goto out;

	if (ZTOV(szp)->v_type == VDIR) {
		/*
		 * Check to make sure rename is valid.
		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
		 */
		if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
			goto out;
	}

	/*
	 * Does target exist?
	 */
	if (tzp) {
		/*
		 * Source and target must be the same type.
		 */
		if (ZTOV(szp)->v_type == VDIR) {
			if (ZTOV(tzp)->v_type != VDIR) {
				error = SET_ERROR(ENOTDIR);
				goto out;
			}
		} else {
			if (ZTOV(tzp)->v_type == VDIR) {
				error = SET_ERROR(EISDIR);
				goto out;
			}
		}
		/*
		 * POSIX dictates that when the source and target
		 * entries refer to the same file object, rename
		 * must do nothing and exit without error.
		 */
		if (szp->z_id == tzp->z_id) {
			error = 0;
			goto out;
		}
	}

	vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
	if (tzp)
		vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);

	/*
	 * notify the target directory if it is not the same
	 * as source directory.
	 */
	if (tdvp != sdvp) {
		vnevent_rename_dest_dir(tdvp, ct);
	}

	/*
	 * Build the transaction: SA holds on the source entry and source
	 * directory (plus the target directory and existing target when
	 * applicable), ZAP holds on both directories for the two names,
	 * and a ZAP hold on the unlinked set.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
	if (sdzp != tdzp) {
		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, tdzp);
	}
	if (tzp) {
		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, tzp);
	}

	zfs_sa_upgrade_txholds(tx, szp);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	error = dmu_tx_assign(tx, TXG_NOWAIT);
	if (error) {
		/*
		 * Assignment failed.  Release every lock and hold taken
		 * since "top" before either retrying or returning.
		 */
		if (zl != NULL)
			zfs_rename_unlock(&zl);
		zfs_dirent_unlock(sdl);
		zfs_dirent_unlock(tdl);

		if (sdzp == tdzp)
			rw_exit(&sdzp->z_name_lock);

		VN_RELE(ZTOV(szp));
		if (tzp)
			VN_RELE(ZTOV(tzp));
		if (error == ERESTART) {
			/* Wait on the tx, abort it, and retry from "top". */
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (tzp)	/* Attempt to remove the existing target */
		error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);

	if (error == 0) {
		error = zfs_link_create(tdl, szp, tx, ZRENAMING);
		if (error == 0) {
			szp->z_pflags |= ZFS_AV_MODIFIED;

			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
			ASSERT0(error);

			error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
			if (error == 0) {
				zfs_log_rename(zilog, tx, TX_RENAME |
				    (flags & FIGNORECASE ? TX_CI : 0), sdzp,
				    sdl->dl_name, tdzp, tdl->dl_name, szp);

				/*
				 * Update path information for the target vnode
				 */
				vn_renamepath(tdvp, ZTOV(szp), tnm,
				    strlen(tnm));
			} else {
				/*
				 * At this point, we have successfully created
				 * the target name, but have failed to remove
				 * the source name.  Since the create was done
				 * with the ZRENAMING flag, there are
				 * complications; for one, the link count is
				 * wrong.  The easiest way to deal with this
				 * is to remove the newly created target, and
				 * return the original error.  This must
				 * succeed; fortunately, it is very unlikely to
				 * fail, since we just created it.
				 */
				VERIFY3U(zfs_link_destroy(tdl, szp, tx,
				    ZRENAMING, NULL), ==, 0);
			}
		}
#ifdef FREEBSD_NAMECACHE
		/*
		 * On success, purge name cache entries for both directories,
		 * the moved vnode and any replaced target.
		 */
		if (error == 0) {
			cache_purge(sdvp);
			cache_purge(tdvp);
			cache_purge(ZTOV(szp));
			if (tzp)
				cache_purge(ZTOV(tzp));
		}
#endif
	}

	/* The transaction is committed whether or not the steps succeeded. */
	dmu_tx_commit(tx);
	/*
	 * Common exit for every path that still holds both dirent locks:
	 * drop rename locks, dirent locks, the shared name lock and the
	 * vnode holds.
	 */
out:
	if (zl != NULL)
		zfs_rename_unlock(&zl);

	zfs_dirent_unlock(sdl);
	zfs_dirent_unlock(tdl);

	if (sdzp == tdzp)
		rw_exit(&sdzp->z_name_lock);


	VN_RELE(ZTOV(szp));
	if (tzp)
		VN_RELE(ZTOV(tzp));

	/*
	 * When the objset's os_sync is ZFS_SYNC_ALWAYS, commit the intent
	 * log before returning.
	 */
	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);

	return (error);
}
4049168404Spjd
4050168404Spjd/*
4051168404Spjd * Insert the indicated symbolic reference entry into the directory.
4052168404Spjd *
4053168404Spjd *	IN:	dvp	- Directory to contain new symbolic link.
4054168404Spjd *		link	- Name for new symlink entry.
4055168404Spjd *		vap	- Attributes of new entry.
4056168404Spjd *		target	- Target path of new symlink.
4057168404Spjd *		cr	- credentials of caller.
4058185029Spjd *		ct	- caller context
4059185029Spjd *		flags	- case flags
4060168404Spjd *
4061168404Spjd *	RETURN:	0 if success
4062168404Spjd *		error code if failure
4063168404Spjd *
4064168404Spjd * Timestamps:
4065168404Spjd *	dvp - ctime|mtime updated
4066168404Spjd */
4067185029Spjd/*ARGSUSED*/
4068168404Spjdstatic int
4069185029Spjdzfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
4070185029Spjd    cred_t *cr, kthread_t *td)
4071168404Spjd{
4072168404Spjd	znode_t		*zp, *dzp = VTOZ(dvp);
4073168404Spjd	zfs_dirlock_t	*dl;
4074168404Spjd	dmu_tx_t	*tx;
4075168404Spjd	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
4076185029Spjd	zilog_t		*zilog;
4077219089Spjd	uint64_t	len = strlen(link);
4078168404Spjd	int		error;
4079185029Spjd	int		zflg = ZNEW;
4080209962Smm	zfs_acl_ids_t	acl_ids;
4081209962Smm	boolean_t	fuid_dirtied;
4082219089Spjd	uint64_t	txtype = TX_SYMLINK;
4083185029Spjd	int		flags = 0;
4084168404Spjd
4085168962Spjd	ASSERT(vap->va_type == VLNK);
4086168404Spjd
4087168404Spjd	ZFS_ENTER(zfsvfs);
4088185029Spjd	ZFS_VERIFY_ZP(dzp);
4089185029Spjd	zilog = zfsvfs->z_log;
4090185029Spjd
4091185029Spjd	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
4092185029Spjd	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4093185029Spjd		ZFS_EXIT(zfsvfs);
4094249195Smm		return (SET_ERROR(EILSEQ));
4095185029Spjd	}
4096185029Spjd	if (flags & FIGNORECASE)
4097185029Spjd		zflg |= ZCILOOK;
4098168404Spjd
4099168404Spjd	if (len > MAXPATHLEN) {
4100168404Spjd		ZFS_EXIT(zfsvfs);
4101249195Smm		return (SET_ERROR(ENAMETOOLONG));
4102168404Spjd	}
4103168404Spjd
4104219089Spjd	if ((error = zfs_acl_ids_create(dzp, 0,
4105219089Spjd	    vap, cr, NULL, &acl_ids)) != 0) {
4106219089Spjd		ZFS_EXIT(zfsvfs);
4107219089Spjd		return (error);
4108219089Spjd	}
4109219089Spjdtop:
4110168404Spjd	/*
4111168404Spjd	 * Attempt to lock directory; fail if entry already exists.
4112168404Spjd	 */
4113185029Spjd	error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
4114185029Spjd	if (error) {
4115219089Spjd		zfs_acl_ids_free(&acl_ids);
4116168404Spjd		ZFS_EXIT(zfsvfs);
4117168404Spjd		return (error);
4118168404Spjd	}
4119168404Spjd
4120219089Spjd	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4121219089Spjd		zfs_acl_ids_free(&acl_ids);
4122219089Spjd		zfs_dirent_unlock(dl);
4123219089Spjd		ZFS_EXIT(zfsvfs);
4124219089Spjd		return (error);
4125219089Spjd	}
4126219089Spjd
4127209962Smm	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
4128209962Smm		zfs_acl_ids_free(&acl_ids);
4129209962Smm		zfs_dirent_unlock(dl);
4130209962Smm		ZFS_EXIT(zfsvfs);
4131249195Smm		return (SET_ERROR(EDQUOT));
4132209962Smm	}
4133168404Spjd	tx = dmu_tx_create(zfsvfs->z_os);
4134209962Smm	fuid_dirtied = zfsvfs->z_fuid_dirty;
4135168404Spjd	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
4136168404Spjd	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4137219089Spjd	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
4138219089Spjd	    ZFS_SA_BASE_ATTR_SIZE + len);
4139219089Spjd	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
4140219089Spjd	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
4141219089Spjd		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
4142219089Spjd		    acl_ids.z_aclp->z_acl_bytes);
4143219089Spjd	}
4144209962Smm	if (fuid_dirtied)
4145209962Smm		zfs_fuid_txhold(zfsvfs, tx);
4146209962Smm	error = dmu_tx_assign(tx, TXG_NOWAIT);
4147168404Spjd	if (error) {
4148168404Spjd		zfs_dirent_unlock(dl);
4149209962Smm		if (error == ERESTART) {
4150168404Spjd			dmu_tx_wait(tx);
4151168404Spjd			dmu_tx_abort(tx);
4152168404Spjd			goto top;
4153168404Spjd		}
4154219089Spjd		zfs_acl_ids_free(&acl_ids);
4155168404Spjd		dmu_tx_abort(tx);
4156168404Spjd		ZFS_EXIT(zfsvfs);
4157168404Spjd		return (error);
4158168404Spjd	}
4159168404Spjd
4160168404Spjd	/*
4161168404Spjd	 * Create a new object for the symlink.
4162219089Spjd	 * for version 4 ZPL datsets the symlink will be an SA attribute
4163168404Spjd	 */
4164219089Spjd	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
4165168404Spjd
4166219089Spjd	if (fuid_dirtied)
4167219089Spjd		zfs_fuid_sync(zfsvfs, tx);
4168209962Smm
4169219089Spjd	mutex_enter(&zp->z_lock);
4170219089Spjd	if (zp->z_is_sa)
4171219089Spjd		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
4172219089Spjd		    link, len, tx);
4173219089Spjd	else
4174219089Spjd		zfs_sa_symlink(zp, link, len, tx);
4175219089Spjd	mutex_exit(&zp->z_lock);
4176168404Spjd
4177219089Spjd	zp->z_size = len;
4178219089Spjd	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
4179219089Spjd	    &zp->z_size, sizeof (zp->z_size), tx);
4180168404Spjd	/*
4181168404Spjd	 * Insert the new object into the directory.
4182168404Spjd	 */
4183168404Spjd	(void) zfs_link_create(dl, zp, tx, ZNEW);
4184168404Spjd
4185219089Spjd	if (flags & FIGNORECASE)
4186219089Spjd		txtype |= TX_CI;
4187219089Spjd	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
4188219089Spjd	*vpp = ZTOV(zp);
4189219089Spjd
4190209962Smm	zfs_acl_ids_free(&acl_ids);
4191209962Smm
4192168404Spjd	dmu_tx_commit(tx);
4193168404Spjd
4194168404Spjd	zfs_dirent_unlock(dl);
4195168404Spjd
4196219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4197219089Spjd		zil_commit(zilog, 0);
4198219089Spjd
4199168404Spjd	ZFS_EXIT(zfsvfs);
4200168404Spjd	return (error);
4201168404Spjd}
4202168404Spjd
4203168404Spjd/*
4204168404Spjd * Return, in the buffer contained in the provided uio structure,
4205168404Spjd * the symbolic path referred to by vp.
4206168404Spjd *
4207168404Spjd *	IN:	vp	- vnode of symbolic link.
4208168404Spjd *		uoip	- structure to contain the link path.
4209168404Spjd *		cr	- credentials of caller.
4210185029Spjd *		ct	- caller context
4211168404Spjd *
4212168404Spjd *	OUT:	uio	- structure to contain the link path.
4213168404Spjd *
4214168404Spjd *	RETURN:	0 if success
4215168404Spjd *		error code if failure
4216168404Spjd *
4217168404Spjd * Timestamps:
4218168404Spjd *	vp - atime updated
4219168404Spjd */
4220168404Spjd/* ARGSUSED */
4221168404Spjdstatic int
4222185029Spjdzfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
4223168404Spjd{
4224168404Spjd	znode_t		*zp = VTOZ(vp);
4225168404Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4226168404Spjd	int		error;
4227168404Spjd
4228168404Spjd	ZFS_ENTER(zfsvfs);
4229185029Spjd	ZFS_VERIFY_ZP(zp);
4230168404Spjd
4231219089Spjd	mutex_enter(&zp->z_lock);
4232219089Spjd	if (zp->z_is_sa)
4233219089Spjd		error = sa_lookup_uio(zp->z_sa_hdl,
4234219089Spjd		    SA_ZPL_SYMLINK(zfsvfs), uio);
4235219089Spjd	else
4236219089Spjd		error = zfs_sa_readlink(zp, uio);
4237219089Spjd	mutex_exit(&zp->z_lock);
4238168404Spjd
4239168404Spjd	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4240219089Spjd
4241168404Spjd	ZFS_EXIT(zfsvfs);
4242168404Spjd	return (error);
4243168404Spjd}
4244168404Spjd
4245168404Spjd/*
4246168404Spjd * Insert a new entry into directory tdvp referencing svp.
4247168404Spjd *
4248168404Spjd *	IN:	tdvp	- Directory to contain new entry.
4249168404Spjd *		svp	- vnode of new entry.
4250168404Spjd *		name	- name of new entry.
4251168404Spjd *		cr	- credentials of caller.
4252185029Spjd *		ct	- caller context
4253168404Spjd *
4254168404Spjd *	RETURN:	0 if success
4255168404Spjd *		error code if failure
4256168404Spjd *
4257168404Spjd * Timestamps:
4258168404Spjd *	tdvp - ctime|mtime updated
4259168404Spjd *	 svp - ctime updated
4260168404Spjd */
4261168404Spjd/* ARGSUSED */
4262168404Spjdstatic int
4263185029Spjdzfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
4264185029Spjd    caller_context_t *ct, int flags)
4265168404Spjd{
4266168404Spjd	znode_t		*dzp = VTOZ(tdvp);
4267168404Spjd	znode_t		*tzp, *szp;
4268168404Spjd	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
4269185029Spjd	zilog_t		*zilog;
4270168404Spjd	zfs_dirlock_t	*dl;
4271168404Spjd	dmu_tx_t	*tx;
4272168962Spjd	vnode_t		*realvp;
4273168404Spjd	int		error;
4274185029Spjd	int		zf = ZNEW;
4275212694Smm	uint64_t	parent;
4276185029Spjd	uid_t		owner;
4277168404Spjd
4278168404Spjd	ASSERT(tdvp->v_type == VDIR);
4279168404Spjd
4280168404Spjd	ZFS_ENTER(zfsvfs);
4281185029Spjd	ZFS_VERIFY_ZP(dzp);
4282185029Spjd	zilog = zfsvfs->z_log;
4283168404Spjd
4284185029Spjd	if (VOP_REALVP(svp, &realvp, ct) == 0)
4285168962Spjd		svp = realvp;
4286168962Spjd
4287212694Smm	/*
4288212694Smm	 * POSIX dictates that we return EPERM here.
4289212694Smm	 * Better choices include ENOTSUP or EISDIR.
4290212694Smm	 */
4291212694Smm	if (svp->v_type == VDIR) {
4292168404Spjd		ZFS_EXIT(zfsvfs);
4293249195Smm		return (SET_ERROR(EPERM));
4294212694Smm	}
4295212694Smm
4296212694Smm	if (svp->v_vfsp != tdvp->v_vfsp || zfsctl_is_node(svp)) {
4297212694Smm		ZFS_EXIT(zfsvfs);
4298249195Smm		return (SET_ERROR(EXDEV));
4299168404Spjd	}
4300212694Smm
4301185029Spjd	szp = VTOZ(svp);
4302185029Spjd	ZFS_VERIFY_ZP(szp);
4303168404Spjd
4304212694Smm	/* Prevent links to .zfs/shares files */
4305212694Smm
4306219089Spjd	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
4307219089Spjd	    &parent, sizeof (uint64_t))) != 0) {
4308212694Smm		ZFS_EXIT(zfsvfs);
4309219089Spjd		return (error);
4310219089Spjd	}
4311219089Spjd	if (parent == zfsvfs->z_shares_dir) {
4312219089Spjd		ZFS_EXIT(zfsvfs);
4313249195Smm		return (SET_ERROR(EPERM));
4314212694Smm	}
4315212694Smm
4316185029Spjd	if (zfsvfs->z_utf8 && u8_validate(name,
4317185029Spjd	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4318185029Spjd		ZFS_EXIT(zfsvfs);
4319249195Smm		return (SET_ERROR(EILSEQ));
4320185029Spjd	}
4321185029Spjd	if (flags & FIGNORECASE)
4322185029Spjd		zf |= ZCILOOK;
4323185029Spjd
4324168404Spjd	/*
4325168404Spjd	 * We do not support links between attributes and non-attributes
4326168404Spjd	 * because of the potential security risk of creating links
4327168404Spjd	 * into "normal" file space in order to circumvent restrictions
4328168404Spjd	 * imposed in attribute space.
4329168404Spjd	 */
4330219089Spjd	if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
4331168404Spjd		ZFS_EXIT(zfsvfs);
4332249195Smm		return (SET_ERROR(EINVAL));
4333168404Spjd	}
4334168404Spjd
4335168404Spjd
4336219089Spjd	owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
4337219089Spjd	if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) {
4338168404Spjd		ZFS_EXIT(zfsvfs);
4339249195Smm		return (SET_ERROR(EPERM));
4340168404Spjd	}
4341168404Spjd
4342185029Spjd	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4343168404Spjd		ZFS_EXIT(zfsvfs);
4344168404Spjd		return (error);
4345168404Spjd	}
4346168404Spjd
4347212694Smmtop:
4348168404Spjd	/*
4349168404Spjd	 * Attempt to lock directory; fail if entry already exists.
4350168404Spjd	 */
4351185029Spjd	error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
4352185029Spjd	if (error) {
4353168404Spjd		ZFS_EXIT(zfsvfs);
4354168404Spjd		return (error);
4355168404Spjd	}
4356168404Spjd
4357168404Spjd	tx = dmu_tx_create(zfsvfs->z_os);
4358219089Spjd	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
4359168404Spjd	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4360219089Spjd	zfs_sa_upgrade_txholds(tx, szp);
4361219089Spjd	zfs_sa_upgrade_txholds(tx, dzp);
4362209962Smm	error = dmu_tx_assign(tx, TXG_NOWAIT);
4363168404Spjd	if (error) {
4364168404Spjd		zfs_dirent_unlock(dl);
4365209962Smm		if (error == ERESTART) {
4366168404Spjd			dmu_tx_wait(tx);
4367168404Spjd			dmu_tx_abort(tx);
4368168404Spjd			goto top;
4369168404Spjd		}
4370168404Spjd		dmu_tx_abort(tx);
4371168404Spjd		ZFS_EXIT(zfsvfs);
4372168404Spjd		return (error);
4373168404Spjd	}
4374168404Spjd
4375168404Spjd	error = zfs_link_create(dl, szp, tx, 0);
4376168404Spjd
4377185029Spjd	if (error == 0) {
4378185029Spjd		uint64_t txtype = TX_LINK;
4379185029Spjd		if (flags & FIGNORECASE)
4380185029Spjd			txtype |= TX_CI;
4381185029Spjd		zfs_log_link(zilog, tx, txtype, dzp, szp, name);
4382185029Spjd	}
4383168404Spjd
4384168404Spjd	dmu_tx_commit(tx);
4385168404Spjd
4386168404Spjd	zfs_dirent_unlock(dl);
4387168404Spjd
4388185029Spjd	if (error == 0) {
4389185029Spjd		vnevent_link(svp, ct);
4390185029Spjd	}
4391185029Spjd
4392219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4393219089Spjd		zil_commit(zilog, 0);
4394219089Spjd
4395168404Spjd	ZFS_EXIT(zfsvfs);
4396168404Spjd	return (error);
4397168404Spjd}
4398168404Spjd
4399219089Spjd#ifdef sun
4400219089Spjd/*
4401219089Spjd * zfs_null_putapage() is used when the file system has been force
4402219089Spjd * unmounted. It just drops the pages.
4403219089Spjd */
4404219089Spjd/* ARGSUSED */
4405219089Spjdstatic int
4406219089Spjdzfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
4407219089Spjd		size_t *lenp, int flags, cred_t *cr)
4408219089Spjd{
4409219089Spjd	pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR);
4410219089Spjd	return (0);
4411219089Spjd}
4412219089Spjd
4413219089Spjd/*
4414219089Spjd * Push a page out to disk, klustering if possible.
4415219089Spjd *
4416219089Spjd *	IN:	vp	- file to push page to.
4417219089Spjd *		pp	- page to push.
4418219089Spjd *		flags	- additional flags.
4419219089Spjd *		cr	- credentials of caller.
4420219089Spjd *
4421219089Spjd *	OUT:	offp	- start of range pushed.
4422219089Spjd *		lenp	- len of range pushed.
4423219089Spjd *
4424219089Spjd *	RETURN:	0 if success
4425219089Spjd *		error code if failure
4426219089Spjd *
4427219089Spjd * NOTE: callers must have locked the page to be pushed.  On
4428219089Spjd * exit, the page (and all other pages in the kluster) must be
4429219089Spjd * unlocked.
4430219089Spjd */
4431219089Spjd/* ARGSUSED */
4432219089Spjdstatic int
4433219089Spjdzfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
4434219089Spjd		size_t *lenp, int flags, cred_t *cr)
4435219089Spjd{
4436219089Spjd	znode_t		*zp = VTOZ(vp);
4437219089Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4438219089Spjd	dmu_tx_t	*tx;
4439219089Spjd	u_offset_t	off, koff;
4440219089Spjd	size_t		len, klen;
4441219089Spjd	int		err;
4442219089Spjd
4443219089Spjd	off = pp->p_offset;
4444219089Spjd	len = PAGESIZE;
4445219089Spjd	/*
4446219089Spjd	 * If our blocksize is bigger than the page size, try to kluster
4447219089Spjd	 * multiple pages so that we write a full block (thus avoiding
4448219089Spjd	 * a read-modify-write).
4449219089Spjd	 */
4450219089Spjd	if (off < zp->z_size && zp->z_blksz > PAGESIZE) {
4451219089Spjd		klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
4452219089Spjd		koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0;
4453219089Spjd		ASSERT(koff <= zp->z_size);
4454219089Spjd		if (koff + klen > zp->z_size)
4455219089Spjd			klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE);
4456219089Spjd		pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
4457219089Spjd	}
4458219089Spjd	ASSERT3U(btop(len), ==, btopr(len));
4459219089Spjd
4460219089Spjd	/*
4461219089Spjd	 * Can't push pages past end-of-file.
4462219089Spjd	 */
4463219089Spjd	if (off >= zp->z_size) {
4464219089Spjd		/* ignore all pages */
4465219089Spjd		err = 0;
4466219089Spjd		goto out;
4467219089Spjd	} else if (off + len > zp->z_size) {
4468219089Spjd		int npages = btopr(zp->z_size - off);
4469219089Spjd		page_t *trunc;
4470219089Spjd
4471219089Spjd		page_list_break(&pp, &trunc, npages);
4472219089Spjd		/* ignore pages past end of file */
4473219089Spjd		if (trunc)
4474219089Spjd			pvn_write_done(trunc, flags);
4475219089Spjd		len = zp->z_size - off;
4476219089Spjd	}
4477219089Spjd
4478219089Spjd	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
4479219089Spjd	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
4480249195Smm		err = SET_ERROR(EDQUOT);
4481219089Spjd		goto out;
4482219089Spjd	}
4483219089Spjdtop:
4484219089Spjd	tx = dmu_tx_create(zfsvfs->z_os);
4485219089Spjd	dmu_tx_hold_write(tx, zp->z_id, off, len);
4486219089Spjd
4487219089Spjd	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4488219089Spjd	zfs_sa_upgrade_txholds(tx, zp);
4489219089Spjd	err = dmu_tx_assign(tx, TXG_NOWAIT);
4490219089Spjd	if (err != 0) {
4491219089Spjd		if (err == ERESTART) {
4492219089Spjd			dmu_tx_wait(tx);
4493219089Spjd			dmu_tx_abort(tx);
4494219089Spjd			goto top;
4495219089Spjd		}
4496219089Spjd		dmu_tx_abort(tx);
4497219089Spjd		goto out;
4498219089Spjd	}
4499219089Spjd
4500219089Spjd	if (zp->z_blksz <= PAGESIZE) {
4501219089Spjd		caddr_t va = zfs_map_page(pp, S_READ);
4502219089Spjd		ASSERT3U(len, <=, PAGESIZE);
4503219089Spjd		dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
4504219089Spjd		zfs_unmap_page(pp, va);
4505219089Spjd	} else {
4506219089Spjd		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
4507219089Spjd	}
4508219089Spjd
4509219089Spjd	if (err == 0) {
4510219089Spjd		uint64_t mtime[2], ctime[2];
4511219089Spjd		sa_bulk_attr_t bulk[3];
4512219089Spjd		int count = 0;
4513219089Spjd
4514219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
4515219089Spjd		    &mtime, 16);
4516219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
4517219089Spjd		    &ctime, 16);
4518219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
4519219089Spjd		    &zp->z_pflags, 8);
4520219089Spjd		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
4521219089Spjd		    B_TRUE);
4522219089Spjd		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
4523219089Spjd	}
4524219089Spjd	dmu_tx_commit(tx);
4525219089Spjd
4526219089Spjdout:
4527219089Spjd	pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
4528219089Spjd	if (offp)
4529219089Spjd		*offp = off;
4530219089Spjd	if (lenp)
4531219089Spjd		*lenp = len;
4532219089Spjd
4533219089Spjd	return (err);
4534219089Spjd}
4535219089Spjd
4536219089Spjd/*
4537219089Spjd * Copy the portion of the file indicated from pages into the file.
4538219089Spjd * The pages are stored in a page list attached to the files vnode.
4539219089Spjd *
4540219089Spjd *	IN:	vp	- vnode of file to push page data to.
4541219089Spjd *		off	- position in file to put data.
4542219089Spjd *		len	- amount of data to write.
4543219089Spjd *		flags	- flags to control the operation.
4544219089Spjd *		cr	- credentials of caller.
4545219089Spjd *		ct	- caller context.
4546219089Spjd *
4547219089Spjd *	RETURN:	0 if success
4548219089Spjd *		error code if failure
4549219089Spjd *
4550219089Spjd * Timestamps:
4551219089Spjd *	vp - ctime|mtime updated
4552219089Spjd */
4553185029Spjd/*ARGSUSED*/
4554219089Spjdstatic int
4555219089Spjdzfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
4556219089Spjd    caller_context_t *ct)
4557219089Spjd{
4558219089Spjd	znode_t		*zp = VTOZ(vp);
4559219089Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4560219089Spjd	page_t		*pp;
4561219089Spjd	size_t		io_len;
4562219089Spjd	u_offset_t	io_off;
4563219089Spjd	uint_t		blksz;
4564219089Spjd	rl_t		*rl;
4565219089Spjd	int		error = 0;
4566219089Spjd
4567219089Spjd	ZFS_ENTER(zfsvfs);
4568219089Spjd	ZFS_VERIFY_ZP(zp);
4569219089Spjd
4570219089Spjd	/*
4571219089Spjd	 * Align this request to the file block size in case we kluster.
4572219089Spjd	 * XXX - this can result in pretty aggresive locking, which can
4573219089Spjd	 * impact simultanious read/write access.  One option might be
4574219089Spjd	 * to break up long requests (len == 0) into block-by-block
4575219089Spjd	 * operations to get narrower locking.
4576219089Spjd	 */
4577219089Spjd	blksz = zp->z_blksz;
4578219089Spjd	if (ISP2(blksz))
4579219089Spjd		io_off = P2ALIGN_TYPED(off, blksz, u_offset_t);
4580219089Spjd	else
4581219089Spjd		io_off = 0;
4582219089Spjd	if (len > 0 && ISP2(blksz))
4583219089Spjd		io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
4584219089Spjd	else
4585219089Spjd		io_len = 0;
4586219089Spjd
4587219089Spjd	if (io_len == 0) {
4588219089Spjd		/*
4589219089Spjd		 * Search the entire vp list for pages >= io_off.
4590219089Spjd		 */
4591219089Spjd		rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER);
4592219089Spjd		error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
4593219089Spjd		goto out;
4594219089Spjd	}
4595219089Spjd	rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER);
4596219089Spjd
4597219089Spjd	if (off > zp->z_size) {
4598219089Spjd		/* past end of file */
4599219089Spjd		zfs_range_unlock(rl);
4600219089Spjd		ZFS_EXIT(zfsvfs);
4601219089Spjd		return (0);
4602219089Spjd	}
4603219089Spjd
4604219089Spjd	len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off);
4605219089Spjd
4606219089Spjd	for (off = io_off; io_off < off + len; io_off += io_len) {
4607219089Spjd		if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
4608219089Spjd			pp = page_lookup(vp, io_off,
4609219089Spjd			    (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
4610219089Spjd		} else {
4611219089Spjd			pp = page_lookup_nowait(vp, io_off,
4612219089Spjd			    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
4613219089Spjd		}
4614219089Spjd
4615219089Spjd		if (pp != NULL && pvn_getdirty(pp, flags)) {
4616219089Spjd			int err;
4617219089Spjd
4618219089Spjd			/*
4619219089Spjd			 * Found a dirty page to push
4620219089Spjd			 */
4621219089Spjd			err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr);
4622219089Spjd			if (err)
4623219089Spjd				error = err;
4624219089Spjd		} else {
4625219089Spjd			io_len = PAGESIZE;
4626219089Spjd		}
4627219089Spjd	}
4628219089Spjdout:
4629219089Spjd	zfs_range_unlock(rl);
4630219089Spjd	if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4631219089Spjd		zil_commit(zfsvfs->z_log, zp->z_id);
4632219089Spjd	ZFS_EXIT(zfsvfs);
4633219089Spjd	return (error);
4634219089Spjd}
4635219089Spjd#endif	/* sun */
4636219089Spjd
4637219089Spjd/*ARGSUSED*/
4638168962Spjdvoid
4639185029Spjdzfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4640168404Spjd{
4641168962Spjd	znode_t	*zp = VTOZ(vp);
4642168962Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4643168962Spjd	int error;
4644168404Spjd
4645185029Spjd	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4646219089Spjd	if (zp->z_sa_hdl == NULL) {
4647185029Spjd		/*
4648185029Spjd		 * The fs has been unmounted, or we did a
4649185029Spjd		 * suspend/resume and this file no longer exists.
4650185029Spjd		 */
4651243520Savg		rw_exit(&zfsvfs->z_teardown_inactive_lock);
4652234607Strasz		vrecycle(vp);
4653243520Savg		return;
4654243520Savg	}
4655243520Savg
4656243520Savg	mutex_enter(&zp->z_lock);
4657243520Savg	if (zp->z_unlinked) {
4658243520Savg		/*
4659243520Savg		 * Fast path to recycle a vnode of a removed file.
4660243520Savg		 */
4661243520Savg		mutex_exit(&zp->z_lock);
4662185029Spjd		rw_exit(&zfsvfs->z_teardown_inactive_lock);
4663243520Savg		vrecycle(vp);
4664168962Spjd		return;
4665168404Spjd	}
4666243520Savg	mutex_exit(&zp->z_lock);
4667168404Spjd
4668168404Spjd	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
4669168404Spjd		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
4670168404Spjd
4671219089Spjd		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4672219089Spjd		zfs_sa_upgrade_txholds(tx, zp);
4673168404Spjd		error = dmu_tx_assign(tx, TXG_WAIT);
4674168404Spjd		if (error) {
4675168404Spjd			dmu_tx_abort(tx);
4676168404Spjd		} else {
4677168404Spjd			mutex_enter(&zp->z_lock);
4678219089Spjd			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
4679219089Spjd			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
4680168404Spjd			zp->z_atime_dirty = 0;
4681168404Spjd			mutex_exit(&zp->z_lock);
4682168404Spjd			dmu_tx_commit(tx);
4683168404Spjd		}
4684168404Spjd	}
4685185029Spjd	rw_exit(&zfsvfs->z_teardown_inactive_lock);
4686168404Spjd}
4687168404Spjd
4688219089Spjd#ifdef sun
4689219089Spjd/*
4690219089Spjd * Bounds-check the seek operation.
4691219089Spjd *
4692219089Spjd *	IN:	vp	- vnode seeking within
4693219089Spjd *		ooff	- old file offset
4694219089Spjd *		noffp	- pointer to new file offset
4695219089Spjd *		ct	- caller context
4696219089Spjd *
4697219089Spjd *	RETURN:	0 if success
4698219089Spjd *		EINVAL if new offset invalid
4699219089Spjd */
4700219089Spjd/* ARGSUSED */
4701219089Spjdstatic int
4702219089Spjdzfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
4703219089Spjd    caller_context_t *ct)
4704219089Spjd{
4705219089Spjd	if (vp->v_type == VDIR)
4706219089Spjd		return (0);
4707219089Spjd	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
4708219089Spjd}
4709219089Spjd
4710219089Spjd/*
4711219089Spjd * Pre-filter the generic locking function to trap attempts to place
4712219089Spjd * a mandatory lock on a memory mapped file.
4713219089Spjd */
4714219089Spjdstatic int
4715219089Spjdzfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
4716219089Spjd    flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct)
4717219089Spjd{
4718219089Spjd	znode_t *zp = VTOZ(vp);
4719219089Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4720219089Spjd
4721219089Spjd	ZFS_ENTER(zfsvfs);
4722219089Spjd	ZFS_VERIFY_ZP(zp);
4723219089Spjd
4724219089Spjd	/*
4725219089Spjd	 * We are following the UFS semantics with respect to mapcnt
4726219089Spjd	 * here: If we see that the file is mapped already, then we will
4727219089Spjd	 * return an error, but we don't worry about races between this
4728219089Spjd	 * function and zfs_map().
4729219089Spjd	 */
4730219089Spjd	if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) {
4731219089Spjd		ZFS_EXIT(zfsvfs);
4732249195Smm		return (SET_ERROR(EAGAIN));
4733219089Spjd	}
4734219089Spjd	ZFS_EXIT(zfsvfs);
4735219089Spjd	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
4736219089Spjd}
4737219089Spjd
4738219089Spjd/*
4739219089Spjd * If we can't find a page in the cache, we will create a new page
4740219089Spjd * and fill it with file data.  For efficiency, we may try to fill
4741219089Spjd * multiple pages at once (klustering) to fill up the supplied page
4742219089Spjd * list.  Note that the pages to be filled are held with an exclusive
4743219089Spjd * lock to prevent access by other threads while they are being filled.
4744219089Spjd */
4745219089Spjdstatic int
4746219089Spjdzfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
4747219089Spjd    caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
4748219089Spjd{
4749219089Spjd	znode_t *zp = VTOZ(vp);
4750219089Spjd	page_t *pp, *cur_pp;
4751219089Spjd	objset_t *os = zp->z_zfsvfs->z_os;
4752219089Spjd	u_offset_t io_off, total;
4753219089Spjd	size_t io_len;
4754219089Spjd	int err;
4755219089Spjd
4756219089Spjd	if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
4757219089Spjd		/*
4758219089Spjd		 * We only have a single page, don't bother klustering
4759219089Spjd		 */
4760219089Spjd		io_off = off;
4761219089Spjd		io_len = PAGESIZE;
4762219089Spjd		pp = page_create_va(vp, io_off, io_len,
4763219089Spjd		    PG_EXCL | PG_WAIT, seg, addr);
4764219089Spjd	} else {
4765219089Spjd		/*
4766219089Spjd		 * Try to find enough pages to fill the page list
4767219089Spjd		 */
4768219089Spjd		pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
4769219089Spjd		    &io_len, off, plsz, 0);
4770219089Spjd	}
4771219089Spjd	if (pp == NULL) {
4772219089Spjd		/*
4773219089Spjd		 * The page already exists, nothing to do here.
4774219089Spjd		 */
4775219089Spjd		*pl = NULL;
4776219089Spjd		return (0);
4777219089Spjd	}
4778219089Spjd
4779219089Spjd	/*
4780219089Spjd	 * Fill the pages in the kluster.
4781219089Spjd	 */
4782219089Spjd	cur_pp = pp;
4783219089Spjd	for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
4784219089Spjd		caddr_t va;
4785219089Spjd
4786219089Spjd		ASSERT3U(io_off, ==, cur_pp->p_offset);
4787219089Spjd		va = zfs_map_page(cur_pp, S_WRITE);
4788219089Spjd		err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
4789219089Spjd		    DMU_READ_PREFETCH);
4790219089Spjd		zfs_unmap_page(cur_pp, va);
4791219089Spjd		if (err) {
4792219089Spjd			/* On error, toss the entire kluster */
4793219089Spjd			pvn_read_done(pp, B_ERROR);
4794219089Spjd			/* convert checksum errors into IO errors */
4795219089Spjd			if (err == ECKSUM)
4796249195Smm				err = SET_ERROR(EIO);
4797219089Spjd			return (err);
4798219089Spjd		}
4799219089Spjd		cur_pp = cur_pp->p_next;
4800219089Spjd	}
4801219089Spjd
4802219089Spjd	/*
4803219089Spjd	 * Fill in the page list array from the kluster starting
4804219089Spjd	 * from the desired offset `off'.
4805219089Spjd	 * NOTE: the page list will always be null terminated.
4806219089Spjd	 */
4807219089Spjd	pvn_plist_init(pp, pl, plsz, off, io_len, rw);
4808219089Spjd	ASSERT(pl == NULL || (*pl)->p_offset == off);
4809219089Spjd
4810219089Spjd	return (0);
4811219089Spjd}
4812219089Spjd
4813219089Spjd/*
4814219089Spjd * Return pointers to the pages for the file region [off, off + len]
4815219089Spjd * in the pl array.  If plsz is greater than len, this function may
4816219089Spjd * also return page pointers from after the specified region
4817219089Spjd * (i.e. the region [off, off + plsz]).  These additional pages are
4818219089Spjd * only returned if they are already in the cache, or were created as
4819219089Spjd * part of a klustered read.
4820219089Spjd *
4821219089Spjd *	IN:	vp	- vnode of file to get data from.
4822219089Spjd *		off	- position in file to get data from.
4823219089Spjd *		len	- amount of data to retrieve.
4824219089Spjd *		plsz	- length of provided page list.
4825219089Spjd *		seg	- segment to obtain pages for.
4826219089Spjd *		addr	- virtual address of fault.
4827219089Spjd *		rw	- mode of created pages.
4828219089Spjd *		cr	- credentials of caller.
4829219089Spjd *		ct	- caller context.
4830219089Spjd *
4831219089Spjd *	OUT:	protp	- protection mode of created pages.
4832219089Spjd *		pl	- list of pages created.
4833219089Spjd *
4834219089Spjd *	RETURN:	0 if success
4835219089Spjd *		error code if failure
4836219089Spjd *
4837219089Spjd * Timestamps:
4838219089Spjd *	vp - atime updated
4839219089Spjd */
4840219089Spjd/* ARGSUSED */
4841219089Spjdstatic int
4842219089Spjdzfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
4843219089Spjd	page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
4844219089Spjd	enum seg_rw rw, cred_t *cr, caller_context_t *ct)
4845219089Spjd{
4846219089Spjd	znode_t		*zp = VTOZ(vp);
4847219089Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4848219089Spjd	page_t		**pl0 = pl;
4849219089Spjd	int		err = 0;
4850219089Spjd
4851219089Spjd	/* we do our own caching, faultahead is unnecessary */
4852219089Spjd	if (pl == NULL)
4853219089Spjd		return (0);
4854219089Spjd	else if (len > plsz)
4855219089Spjd		len = plsz;
4856219089Spjd	else
4857219089Spjd		len = P2ROUNDUP(len, PAGESIZE);
4858219089Spjd	ASSERT(plsz >= len);
4859219089Spjd
4860219089Spjd	ZFS_ENTER(zfsvfs);
4861219089Spjd	ZFS_VERIFY_ZP(zp);
4862219089Spjd
4863219089Spjd	if (protp)
4864219089Spjd		*protp = PROT_ALL;
4865219089Spjd
4866219089Spjd	/*
4867219089Spjd	 * Loop through the requested range [off, off + len) looking
4868219089Spjd	 * for pages.  If we don't find a page, we will need to create
4869219089Spjd	 * a new page and fill it with data from the file.
4870219089Spjd	 */
4871219089Spjd	while (len > 0) {
4872219089Spjd		if (*pl = page_lookup(vp, off, SE_SHARED))
4873219089Spjd			*(pl+1) = NULL;
4874219089Spjd		else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw))
4875219089Spjd			goto out;
4876219089Spjd		while (*pl) {
4877219089Spjd			ASSERT3U((*pl)->p_offset, ==, off);
4878219089Spjd			off += PAGESIZE;
4879219089Spjd			addr += PAGESIZE;
4880219089Spjd			if (len > 0) {
4881219089Spjd				ASSERT3U(len, >=, PAGESIZE);
4882219089Spjd				len -= PAGESIZE;
4883219089Spjd			}
4884219089Spjd			ASSERT3U(plsz, >=, PAGESIZE);
4885219089Spjd			plsz -= PAGESIZE;
4886219089Spjd			pl++;
4887219089Spjd		}
4888219089Spjd	}
4889219089Spjd
4890219089Spjd	/*
4891219089Spjd	 * Fill out the page array with any pages already in the cache.
4892219089Spjd	 */
4893219089Spjd	while (plsz > 0 &&
4894219089Spjd	    (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) {
4895219089Spjd			off += PAGESIZE;
4896219089Spjd			plsz -= PAGESIZE;
4897219089Spjd	}
4898219089Spjdout:
4899219089Spjd	if (err) {
4900219089Spjd		/*
4901219089Spjd		 * Release any pages we have previously locked.
4902219089Spjd		 */
4903219089Spjd		while (pl > pl0)
4904219089Spjd			page_unlock(*--pl);
4905219089Spjd	} else {
4906219089Spjd		ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4907219089Spjd	}
4908219089Spjd
4909219089Spjd	*pl = NULL;
4910219089Spjd
4911219089Spjd	ZFS_EXIT(zfsvfs);
4912219089Spjd	return (err);
4913219089Spjd}
4914219089Spjd
4915219089Spjd/*
4916219089Spjd * Request a memory map for a section of a file.  This code interacts
4917219089Spjd * with common code and the VM system as follows:
4918219089Spjd *
4919219089Spjd *	common code calls mmap(), which ends up in smmap_common()
4920219089Spjd *
4921219089Spjd *	this calls VOP_MAP(), which takes you into (say) zfs
4922219089Spjd *
4923219089Spjd *	zfs_map() calls as_map(), passing segvn_create() as the callback
4924219089Spjd *
4925219089Spjd *	segvn_create() creates the new segment and calls VOP_ADDMAP()
4926219089Spjd *
4927219089Spjd *	zfs_addmap() updates z_mapcnt
4928219089Spjd */
4929219089Spjd/*ARGSUSED*/
4930219089Spjdstatic int
4931219089Spjdzfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
4932219089Spjd    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4933219089Spjd    caller_context_t *ct)
4934219089Spjd{
4935219089Spjd	znode_t *zp = VTOZ(vp);
4936219089Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4937219089Spjd	segvn_crargs_t	vn_a;
4938219089Spjd	int		error;
4939219089Spjd
4940219089Spjd	ZFS_ENTER(zfsvfs);
4941219089Spjd	ZFS_VERIFY_ZP(zp);
4942219089Spjd
4943219089Spjd	if ((prot & PROT_WRITE) && (zp->z_pflags &
4944219089Spjd	    (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
4945219089Spjd		ZFS_EXIT(zfsvfs);
4946249195Smm		return (SET_ERROR(EPERM));
4947219089Spjd	}
4948219089Spjd
4949219089Spjd	if ((prot & (PROT_READ | PROT_EXEC)) &&
4950219089Spjd	    (zp->z_pflags & ZFS_AV_QUARANTINED)) {
4951219089Spjd		ZFS_EXIT(zfsvfs);
4952249195Smm		return (SET_ERROR(EACCES));
4953219089Spjd	}
4954219089Spjd
4955219089Spjd	if (vp->v_flag & VNOMAP) {
4956219089Spjd		ZFS_EXIT(zfsvfs);
4957249195Smm		return (SET_ERROR(ENOSYS));
4958219089Spjd	}
4959219089Spjd
4960219089Spjd	if (off < 0 || len > MAXOFFSET_T - off) {
4961219089Spjd		ZFS_EXIT(zfsvfs);
4962249195Smm		return (SET_ERROR(ENXIO));
4963219089Spjd	}
4964219089Spjd
4965219089Spjd	if (vp->v_type != VREG) {
4966219089Spjd		ZFS_EXIT(zfsvfs);
4967249195Smm		return (SET_ERROR(ENODEV));
4968219089Spjd	}
4969219089Spjd
4970219089Spjd	/*
4971219089Spjd	 * If file is locked, disallow mapping.
4972219089Spjd	 */
4973219089Spjd	if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) {
4974219089Spjd		ZFS_EXIT(zfsvfs);
4975249195Smm		return (SET_ERROR(EAGAIN));
4976219089Spjd	}
4977219089Spjd
4978219089Spjd	as_rangelock(as);
4979219089Spjd	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
4980219089Spjd	if (error != 0) {
4981219089Spjd		as_rangeunlock(as);
4982219089Spjd		ZFS_EXIT(zfsvfs);
4983219089Spjd		return (error);
4984219089Spjd	}
4985219089Spjd
4986219089Spjd	vn_a.vp = vp;
4987219089Spjd	vn_a.offset = (u_offset_t)off;
4988219089Spjd	vn_a.type = flags & MAP_TYPE;
4989219089Spjd	vn_a.prot = prot;
4990219089Spjd	vn_a.maxprot = maxprot;
4991219089Spjd	vn_a.cred = cr;
4992219089Spjd	vn_a.amp = NULL;
4993219089Spjd	vn_a.flags = flags & ~MAP_TYPE;
4994219089Spjd	vn_a.szc = 0;
4995219089Spjd	vn_a.lgrp_mem_policy_flags = 0;
4996219089Spjd
4997219089Spjd	error = as_map(as, *addrp, len, segvn_create, &vn_a);
4998219089Spjd
4999219089Spjd	as_rangeunlock(as);
5000219089Spjd	ZFS_EXIT(zfsvfs);
5001219089Spjd	return (error);
5002219089Spjd}
5003219089Spjd
5004219089Spjd/* ARGSUSED */
5005219089Spjdstatic int
5006219089Spjdzfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
5007219089Spjd    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
5008219089Spjd    caller_context_t *ct)
5009219089Spjd{
5010219089Spjd	uint64_t pages = btopr(len);
5011219089Spjd
5012219089Spjd	atomic_add_64(&VTOZ(vp)->z_mapcnt, pages);
5013219089Spjd	return (0);
5014219089Spjd}
5015219089Spjd
5016219089Spjd/*
5017219089Spjd * The reason we push dirty pages as part of zfs_delmap() is so that we get a
5018219089Spjd * more accurate mtime for the associated file.  Since we don't have a way of
5019219089Spjd * detecting when the data was actually modified, we have to resort to
5020219089Spjd * heuristics.  If an explicit msync() is done, then we mark the mtime when the
5021219089Spjd * last page is pushed.  The problem occurs when the msync() call is omitted,
5022219089Spjd * which by far the most common case:
5023219089Spjd *
5024219089Spjd * 	open()
5025219089Spjd * 	mmap()
5026219089Spjd * 	<modify memory>
5027219089Spjd * 	munmap()
5028219089Spjd * 	close()
5029219089Spjd * 	<time lapse>
5030219089Spjd * 	putpage() via fsflush
5031219089Spjd *
5032219089Spjd * If we wait until fsflush to come along, we can have a modification time that
5033219089Spjd * is some arbitrary point in the future.  In order to prevent this in the
5034219089Spjd * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is
5035219089Spjd * torn down.
5036219089Spjd */
5037219089Spjd/* ARGSUSED */
5038219089Spjdstatic int
5039219089Spjdzfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
5040219089Spjd    size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
5041219089Spjd    caller_context_t *ct)
5042219089Spjd{
5043219089Spjd	uint64_t pages = btopr(len);
5044219089Spjd
5045219089Spjd	ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages);
5046219089Spjd	atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages);
5047219089Spjd
5048219089Spjd	if ((flags & MAP_SHARED) && (prot & PROT_WRITE) &&
5049219089Spjd	    vn_has_cached_data(vp))
5050219089Spjd		(void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct);
5051219089Spjd
5052219089Spjd	return (0);
5053219089Spjd}
5054219089Spjd
5055219089Spjd/*
5056219089Spjd * Free or allocate space in a file.  Currently, this function only
5057219089Spjd * supports the `F_FREESP' command.  However, this command is somewhat
5058219089Spjd * misnamed, as its functionality includes the ability to allocate as
5059219089Spjd * well as free space.
5060219089Spjd *
5061219089Spjd *	IN:	vp	- vnode of file to free data in.
5062219089Spjd *		cmd	- action to take (only F_FREESP supported).
5063219089Spjd *		bfp	- section of file to free/alloc.
5064219089Spjd *		flag	- current file open mode flags.
5065219089Spjd *		offset	- current file offset.
5066219089Spjd *		cr	- credentials of caller [UNUSED].
5067219089Spjd *		ct	- caller context.
5068219089Spjd *
5069219089Spjd *	RETURN:	0 if success
5070219089Spjd *		error code if failure
5071219089Spjd *
5072219089Spjd * Timestamps:
5073219089Spjd *	vp - ctime|mtime updated
5074219089Spjd */
5075219089Spjd/* ARGSUSED */
5076219089Spjdstatic int
5077219089Spjdzfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag,
5078219089Spjd    offset_t offset, cred_t *cr, caller_context_t *ct)
5079219089Spjd{
5080219089Spjd	znode_t		*zp = VTOZ(vp);
5081219089Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
5082219089Spjd	uint64_t	off, len;
5083219089Spjd	int		error;
5084219089Spjd
5085219089Spjd	ZFS_ENTER(zfsvfs);
5086219089Spjd	ZFS_VERIFY_ZP(zp);
5087219089Spjd
5088219089Spjd	if (cmd != F_FREESP) {
5089219089Spjd		ZFS_EXIT(zfsvfs);
5090249195Smm		return (SET_ERROR(EINVAL));
5091219089Spjd	}
5092219089Spjd
5093219089Spjd	if (error = convoff(vp, bfp, 0, offset)) {
5094219089Spjd		ZFS_EXIT(zfsvfs);
5095219089Spjd		return (error);
5096219089Spjd	}
5097219089Spjd
5098219089Spjd	if (bfp->l_len < 0) {
5099219089Spjd		ZFS_EXIT(zfsvfs);
5100249195Smm		return (SET_ERROR(EINVAL));
5101219089Spjd	}
5102219089Spjd
5103219089Spjd	off = bfp->l_start;
5104219089Spjd	len = bfp->l_len; /* 0 means from off to end of file */
5105219089Spjd
5106219089Spjd	error = zfs_freesp(zp, off, len, flag, TRUE);
5107219089Spjd
5108219089Spjd	ZFS_EXIT(zfsvfs);
5109219089Spjd	return (error);
5110219089Spjd}
5111219089Spjd#endif	/* sun */
5112219089Spjd
5113168404SpjdCTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
5114168404SpjdCTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
5115168404Spjd
5116185029Spjd/*ARGSUSED*/
5117168404Spjdstatic int
5118185029Spjdzfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
5119168404Spjd{
5120168404Spjd	znode_t		*zp = VTOZ(vp);
5121168404Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
5122185029Spjd	uint32_t	gen;
5123219089Spjd	uint64_t	gen64;
5124168404Spjd	uint64_t	object = zp->z_id;
5125168404Spjd	zfid_short_t	*zfid;
5126219089Spjd	int		size, i, error;
5127168404Spjd
5128168404Spjd	ZFS_ENTER(zfsvfs);
5129185029Spjd	ZFS_VERIFY_ZP(zp);
5130168404Spjd
5131219089Spjd	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
5132219089Spjd	    &gen64, sizeof (uint64_t))) != 0) {
5133219089Spjd		ZFS_EXIT(zfsvfs);
5134219089Spjd		return (error);
5135219089Spjd	}
5136219089Spjd
5137219089Spjd	gen = (uint32_t)gen64;
5138219089Spjd
5139168404Spjd	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
5140249195Smm
5141249195Smm#ifdef illumos
5142249195Smm	if (fidp->fid_len < size) {
5143249195Smm		fidp->fid_len = size;
5144249195Smm		ZFS_EXIT(zfsvfs);
5145249195Smm		return (SET_ERROR(ENOSPC));
5146249195Smm	}
5147249195Smm#else
5148168404Spjd	fidp->fid_len = size;
5149249195Smm#endif
5150168404Spjd
5151168404Spjd	zfid = (zfid_short_t *)fidp;
5152168404Spjd
5153168404Spjd	zfid->zf_len = size;
5154168404Spjd
5155168404Spjd	for (i = 0; i < sizeof (zfid->zf_object); i++)
5156168404Spjd		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
5157168404Spjd
5158168404Spjd	/* Must have a non-zero generation number to distinguish from .zfs */
5159168404Spjd	if (gen == 0)
5160168404Spjd		gen = 1;
5161168404Spjd	for (i = 0; i < sizeof (zfid->zf_gen); i++)
5162168404Spjd		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
5163168404Spjd
5164168404Spjd	if (size == LONG_FID_LEN) {
5165168404Spjd		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
5166169023Spjd		zfid_long_t	*zlfid;
5167168404Spjd
5168168404Spjd		zlfid = (zfid_long_t *)fidp;
5169168404Spjd
5170168404Spjd		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
5171168404Spjd			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
5172168404Spjd
5173168404Spjd		/* XXX - this should be the generation number for the objset */
5174168404Spjd		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
5175168404Spjd			zlfid->zf_setgen[i] = 0;
5176168404Spjd	}
5177168404Spjd
5178168404Spjd	ZFS_EXIT(zfsvfs);
5179168404Spjd	return (0);
5180168404Spjd}
5181168404Spjd
5182168404Spjdstatic int
5183185029Spjdzfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
5184185029Spjd    caller_context_t *ct)
5185168404Spjd{
5186168404Spjd	znode_t		*zp, *xzp;
5187168404Spjd	zfsvfs_t	*zfsvfs;
5188168404Spjd	zfs_dirlock_t	*dl;
5189168404Spjd	int		error;
5190168404Spjd
5191168404Spjd	switch (cmd) {
5192168404Spjd	case _PC_LINK_MAX:
5193168404Spjd		*valp = INT_MAX;
5194168404Spjd		return (0);
5195168404Spjd
5196168404Spjd	case _PC_FILESIZEBITS:
5197168404Spjd		*valp = 64;
5198168404Spjd		return (0);
5199219089Spjd#ifdef sun
5200168404Spjd	case _PC_XATTR_EXISTS:
5201168404Spjd		zp = VTOZ(vp);
5202168404Spjd		zfsvfs = zp->z_zfsvfs;
5203168404Spjd		ZFS_ENTER(zfsvfs);
5204185029Spjd		ZFS_VERIFY_ZP(zp);
5205168404Spjd		*valp = 0;
5206168404Spjd		error = zfs_dirent_lock(&dl, zp, "", &xzp,
5207185029Spjd		    ZXATTR | ZEXISTS | ZSHARED, NULL, NULL);
5208168404Spjd		if (error == 0) {
5209168404Spjd			zfs_dirent_unlock(dl);
5210168404Spjd			if (!zfs_dirempty(xzp))
5211168404Spjd				*valp = 1;
5212168404Spjd			VN_RELE(ZTOV(xzp));
5213168404Spjd		} else if (error == ENOENT) {
5214168404Spjd			/*
5215168404Spjd			 * If there aren't extended attributes, it's the
5216168404Spjd			 * same as having zero of them.
5217168404Spjd			 */
5218168404Spjd			error = 0;
5219168404Spjd		}
5220168404Spjd		ZFS_EXIT(zfsvfs);
5221168404Spjd		return (error);
5222168404Spjd
5223219089Spjd	case _PC_SATTR_ENABLED:
5224219089Spjd	case _PC_SATTR_EXISTS:
5225219089Spjd		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
5226219089Spjd		    (vp->v_type == VREG || vp->v_type == VDIR);
5227219089Spjd		return (0);
5228219089Spjd
5229219089Spjd	case _PC_ACCESS_FILTERING:
5230219089Spjd		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
5231219089Spjd		    vp->v_type == VDIR;
5232219089Spjd		return (0);
5233219089Spjd
5234219089Spjd	case _PC_ACL_ENABLED:
5235219089Spjd		*valp = _ACL_ACE_ENABLED;
5236219089Spjd		return (0);
5237219089Spjd#endif	/* sun */
5238219089Spjd	case _PC_MIN_HOLE_SIZE:
5239219089Spjd		*valp = (int)SPA_MINBLOCKSIZE;
5240219089Spjd		return (0);
5241219089Spjd#ifdef sun
5242219089Spjd	case _PC_TIMESTAMP_RESOLUTION:
5243219089Spjd		/* nanosecond timestamp resolution */
5244219089Spjd		*valp = 1L;
5245219089Spjd		return (0);
5246219089Spjd#endif	/* sun */
5247168404Spjd	case _PC_ACL_EXTENDED:
5248196949Strasz		*valp = 0;
5249168404Spjd		return (0);
5250168404Spjd
5251196949Strasz	case _PC_ACL_NFS4:
5252196949Strasz		*valp = 1;
5253196949Strasz		return (0);
5254196949Strasz
5255196949Strasz	case _PC_ACL_PATH_MAX:
5256196949Strasz		*valp = ACL_MAX_ENTRIES;
5257196949Strasz		return (0);
5258196949Strasz
5259168404Spjd	default:
5260168962Spjd		return (EOPNOTSUPP);
5261168404Spjd	}
5262168404Spjd}
5263168404Spjd
5264168404Spjd/*ARGSUSED*/
5265168404Spjdstatic int
5266185029Spjdzfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
5267185029Spjd    caller_context_t *ct)
5268168404Spjd{
5269168404Spjd	znode_t *zp = VTOZ(vp);
5270168404Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5271168404Spjd	int error;
5272185029Spjd	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
5273168404Spjd
5274168404Spjd	ZFS_ENTER(zfsvfs);
5275185029Spjd	ZFS_VERIFY_ZP(zp);
5276185029Spjd	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
5277168404Spjd	ZFS_EXIT(zfsvfs);
5278168404Spjd
5279168404Spjd	return (error);
5280168404Spjd}
5281168404Spjd
5282168404Spjd/*ARGSUSED*/
5283228685Spjdint
5284185029Spjdzfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
5285185029Spjd    caller_context_t *ct)
5286168404Spjd{
5287168404Spjd	znode_t *zp = VTOZ(vp);
5288168404Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5289168404Spjd	int error;
5290185029Spjd	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
5291219089Spjd	zilog_t	*zilog = zfsvfs->z_log;
5292168404Spjd
5293168404Spjd	ZFS_ENTER(zfsvfs);
5294185029Spjd	ZFS_VERIFY_ZP(zp);
5295219089Spjd
5296185029Spjd	error = zfs_setacl(zp, vsecp, skipaclchk, cr);
5297219089Spjd
5298219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
5299219089Spjd		zil_commit(zilog, 0);
5300219089Spjd
5301168404Spjd	ZFS_EXIT(zfsvfs);
5302168404Spjd	return (error);
5303168404Spjd}
5304168404Spjd
5305219089Spjd#ifdef sun
5306219089Spjd/*
5307219089Spjd * Tunables; each must be a power of 2.
5308219089Spjd *
5309219089Spjd * zcr_blksz_min: the smallest read we may consider to loan out an arcbuf
5310219089Spjd * zcr_blksz_max: if set to less than the file block size, allow loaning out of
5311219089Spjd *                an arcbuf for a partial block read
5312219089Spjd */
5313219089Spjdint zcr_blksz_min = (1 << 10);	/* 1K */
5314219089Spjdint zcr_blksz_max = (1 << 17);	/* 128K */
5315219089Spjd
/*
 * Prepare an extended uio for zero-copy I/O on vp.
 *
 *	IN:	vp	- vnode of the file the I/O targets.
 *		ioflag	- UIO_READ or UIO_WRITE; any other value fails.
 *		xuio	- extended uio; must be of type UIOTYPE_ZEROCOPY.
 *		cr	- credentials of caller [UNUSED].
 *		ct	- caller context [UNUSED].
 *
 *	RETURN:	0 if success
 *		EINVAL if the request does not qualify (wrong xuio type,
 *		unsupported direction, size/block-size checks below)
 *
 * For UIO_WRITE, arc_bufs are requested here and attached to xuio: one
 * for a partial leading block, one per full block, and one for a partial
 * trailing block.  For UIO_READ this function only checks eligibility; it
 * attaches no buffers itself.  On success the uio is flagged UIO_XUIO and
 * the direction is recorded in the xuio.
 */
5316219089Spjd/*ARGSUSED*/
5317168962Spjdstatic int
5318219089Spjdzfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr,
5319219089Spjd    caller_context_t *ct)
5320219089Spjd{
5321219089Spjd	znode_t	*zp = VTOZ(vp);
5322219089Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5323219089Spjd	int max_blksz = zfsvfs->z_max_blksz;
5324219089Spjd	uio_t *uio = &xuio->xu_uio;
5325219089Spjd	ssize_t size = uio->uio_resid;
5326219089Spjd	offset_t offset = uio->uio_loffset;
5327219089Spjd	int blksz;
5328219089Spjd	int fullblk, i;
5329219089Spjd	arc_buf_t *abuf;
5330219089Spjd	ssize_t maxsize;
5331219089Spjd	int preamble, postamble;
5332219089Spjd
5333219089Spjd	if (xuio->xu_type != UIOTYPE_ZEROCOPY)
5334249195Smm		return (SET_ERROR(EINVAL));
5335219089Spjd
5336219089Spjd	ZFS_ENTER(zfsvfs);
5337219089Spjd	ZFS_VERIFY_ZP(zp);
5338219089Spjd	switch (ioflag) {
5339219089Spjd	case UIO_WRITE:
5340219089Spjd		/*
5341219089Spjd		 * Loan out an arc_buf for write if write size is bigger than
5342219089Spjd		 * max_blksz, and the file's block size is also max_blksz.
5343219089Spjd		 */
5344219089Spjd		blksz = max_blksz;
5345219089Spjd		if (size < blksz || zp->z_blksz != blksz) {
5346219089Spjd			ZFS_EXIT(zfsvfs);
5347249195Smm			return (SET_ERROR(EINVAL));
5348219089Spjd		}
5349219089Spjd		/*
5350219089Spjd		 * Caller requests buffers for write before knowing where the
5351219089Spjd		 * write offset might be (e.g. NFS TCP write).
5352219089Spjd		 */
5353219089Spjd		if (offset == -1) {
5354219089Spjd			preamble = 0;
5355219089Spjd		} else {
			/*
			 * A non-zero phase means the write starts inside a
			 * block; preamble becomes the byte count from offset
			 * up to the next block boundary and is taken out of
			 * size.
			 */
5356219089Spjd			preamble = P2PHASE(offset, blksz);
5357219089Spjd			if (preamble) {
5358219089Spjd				preamble = blksz - preamble;
5359219089Spjd				size -= preamble;
5360219089Spjd			}
5361219089Spjd		}
5362219089Spjd
		/* Whatever does not fill a whole block at the end is the postamble. */
5363219089Spjd		postamble = P2PHASE(size, blksz);
5364219089Spjd		size -= postamble;
5365219089Spjd
		/*
		 * Buffer count: optional partial head + full blocks +
		 * optional partial tail.
		 */
5366219089Spjd		fullblk = size / blksz;
5367219089Spjd		(void) dmu_xuio_init(xuio,
5368219089Spjd		    (preamble != 0) + fullblk + (postamble != 0));
5369219089Spjd		DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble,
5370219089Spjd		    int, postamble, int,
5371219089Spjd		    (preamble != 0) + fullblk + (postamble != 0));
5372219089Spjd
5373219089Spjd		/*
5374219089Spjd		 * Have to fix iov base/len for partial buffers.  They
5375219089Spjd		 * currently represent full arc_buf's.
5376219089Spjd		 */
5377219089Spjd		if (preamble) {
5378219089Spjd			/* data begins in the middle of the arc_buf */
5379219089Spjd			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5380219089Spjd			    blksz);
5381219089Spjd			ASSERT(abuf);
5382219089Spjd			(void) dmu_xuio_add(xuio, abuf,
5383219089Spjd			    blksz - preamble, preamble);
5384219089Spjd		}
5385219089Spjd
5386219089Spjd		for (i = 0; i < fullblk; i++) {
5387219089Spjd			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5388219089Spjd			    blksz);
5389219089Spjd			ASSERT(abuf);
5390219089Spjd			(void) dmu_xuio_add(xuio, abuf, 0, blksz);
5391219089Spjd		}
5392219089Spjd
5393219089Spjd		if (postamble) {
5394219089Spjd			/* data ends in the middle of the arc_buf */
5395219089Spjd			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5396219089Spjd			    blksz);
5397219089Spjd			ASSERT(abuf);
5398219089Spjd			(void) dmu_xuio_add(xuio, abuf, 0, postamble);
5399219089Spjd		}
5400219089Spjd		break;
5401219089Spjd	case UIO_READ:
5402219089Spjd		/*
5403219089Spjd		 * Loan out an arc_buf for read if the read size is larger than
5404219089Spjd		 * the current file block size.  Block alignment is not
5405219089Spjd		 * considered.  Partial arc_buf will be loaned out for read.
5406219089Spjd		 */
		/* Clamp the threshold to the [zcr_blksz_min, zcr_blksz_max] tunables. */
5407219089Spjd		blksz = zp->z_blksz;
5408219089Spjd		if (blksz < zcr_blksz_min)
5409219089Spjd			blksz = zcr_blksz_min;
5410219089Spjd		if (blksz > zcr_blksz_max)
5411219089Spjd			blksz = zcr_blksz_max;
5412219089Spjd		/* avoid potential complexity of dealing with it */
5413219089Spjd		if (blksz > max_blksz) {
5414219089Spjd			ZFS_EXIT(zfsvfs);
5415249195Smm			return (SET_ERROR(EINVAL));
5416219089Spjd		}
5417219089Spjd
		/* Limit the request to the bytes between the offset and z_size. */
5418219089Spjd		maxsize = zp->z_size - uio->uio_loffset;
5419219089Spjd		if (size > maxsize)
5420219089Spjd			size = maxsize;
5421219089Spjd
		/* Too small a read, or a vnode with cached pages, is refused. */
5422219089Spjd		if (size < blksz || vn_has_cached_data(vp)) {
5423219089Spjd			ZFS_EXIT(zfsvfs);
5424249195Smm			return (SET_ERROR(EINVAL));
5425219089Spjd		}
5426219089Spjd		break;
5427219089Spjd	default:
5428219089Spjd		ZFS_EXIT(zfsvfs);
5429249195Smm		return (SET_ERROR(EINVAL));
5430219089Spjd	}
5431219089Spjd
	/* Flag the uio as extended and record the direction (read back by zfs_retzcbuf()). */
5432219089Spjd	uio->uio_extflg = UIO_XUIO;
5433219089Spjd	XUIO_XUZC_RW(xuio) = ioflag;
5434219089Spjd	ZFS_EXIT(zfsvfs);
5435219089Spjd	return (0);
5436219089Spjd}
5437219089Spjd
5438219089Spjd/*ARGSUSED*/
5439219089Spjdstatic int
5440219089Spjdzfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct)
5441219089Spjd{
5442219089Spjd	int i;
5443219089Spjd	arc_buf_t *abuf;
5444219089Spjd	int ioflag = XUIO_XUZC_RW(xuio);
5445219089Spjd
5446219089Spjd	ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
5447219089Spjd
5448219089Spjd	i = dmu_xuio_cnt(xuio);
5449219089Spjd	while (i-- > 0) {
5450219089Spjd		abuf = dmu_xuio_arcbuf(xuio, i);
5451219089Spjd		/*
5452219089Spjd		 * if abuf == NULL, it must be a write buffer
5453219089Spjd		 * that has been returned in zfs_write().
5454219089Spjd		 */
5455219089Spjd		if (abuf)
5456219089Spjd			dmu_return_arcbuf(abuf);
5457219089Spjd		ASSERT(abuf || ioflag == UIO_WRITE);
5458219089Spjd	}
5459219089Spjd
5460219089Spjd	dmu_xuio_fini(xuio);
5461219089Spjd	return (0);
5462219089Spjd}
5463219089Spjd
5464219089Spjd/*
5465219089Spjd * Predeclare these here so that the compiler assumes that
5466219089Spjd * this is an "old style" function declaration that does
5467219089Spjd * not include arguments => we won't get type mismatch errors
5468219089Spjd * in the initializations that follow.
5469219089Spjd */
5470219089Spjdstatic int zfs_inval();
5471219089Spjdstatic int zfs_isdir();
5472219089Spjd
/*
 * Error stub used in the vnode operation templates for operations that
 * are not permitted; always returns EINVAL.  It is deliberately defined
 * without a parameter list (see the predeclaration comment above) so it
 * can be stored in the templates' .error member.
 */
5473219089Spjdstatic int
5474219089Spjdzfs_inval()
5475219089Spjd{
5476249195Smm	return (SET_ERROR(EINVAL));
5477219089Spjd}
5478219089Spjd
/*
 * Error stub used in the directory vnode operation template for read
 * and write; always returns EISDIR.  Like zfs_inval(), it is deliberately
 * defined without a parameter list.
 */
5479219089Spjdstatic int
5480219089Spjdzfs_isdir()
5481219089Spjd{
5482249195Smm	return (SET_ERROR(EISDIR));
5483219089Spjd}
5484219089Spjd/*
5485219089Spjd * Directory vnode operations template
 *
 * Pairs each VOPNAME_* operation name with the handler used for
 * directory vnodes.  VOPNAME_READ and VOPNAME_WRITE use the zfs_isdir()
 * error stub (EISDIR).  The list is terminated by a NULL, NULL entry.
5486219089Spjd */
5487219089Spjdvnodeops_t *zfs_dvnodeops;
5488219089Spjdconst fs_operation_def_t zfs_dvnodeops_template[] = {
5489219089Spjd	VOPNAME_OPEN,		{ .vop_open = zfs_open },
5490219089Spjd	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
5491219089Spjd	VOPNAME_READ,		{ .error = zfs_isdir },
5492219089Spjd	VOPNAME_WRITE,		{ .error = zfs_isdir },
5493219089Spjd	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
5494219089Spjd	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5495219089Spjd	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
5496219089Spjd	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5497219089Spjd	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
5498219089Spjd	VOPNAME_CREATE,		{ .vop_create = zfs_create },
5499219089Spjd	VOPNAME_REMOVE,		{ .vop_remove = zfs_remove },
5500219089Spjd	VOPNAME_LINK,		{ .vop_link = zfs_link },
5501219089Spjd	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
5502219089Spjd	VOPNAME_MKDIR,		{ .vop_mkdir = zfs_mkdir },
5503219089Spjd	VOPNAME_RMDIR,		{ .vop_rmdir = zfs_rmdir },
5504219089Spjd	VOPNAME_READDIR,	{ .vop_readdir = zfs_readdir },
5505219089Spjd	VOPNAME_SYMLINK,	{ .vop_symlink = zfs_symlink },
5506219089Spjd	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
5507219089Spjd	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5508219089Spjd	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5509219089Spjd	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
5510219089Spjd	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5511219089Spjd	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
5512219089Spjd	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
5513219089Spjd	VOPNAME_VNEVENT, 	{ .vop_vnevent = fs_vnevent_support },
5514219089Spjd	NULL,			NULL
5515219089Spjd};
5516219089Spjd
5517219089Spjd/*
5518219089Spjd * Regular file vnode operations template
 *
 * In addition to the attribute, access and naming entries, this table
 * carries the data-path operations: read/write, frlock, space, the
 * paging and mapping entries (getpage, putpage, map, addmap, delmap)
 * and the zero-copy buffer loan pair (reqzcbuf/retzcbuf).  The list is
 * terminated by a NULL, NULL entry.
5519219089Spjd */
5520219089Spjdvnodeops_t *zfs_fvnodeops;
5521219089Spjdconst fs_operation_def_t zfs_fvnodeops_template[] = {
5522219089Spjd	VOPNAME_OPEN,		{ .vop_open = zfs_open },
5523219089Spjd	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
5524219089Spjd	VOPNAME_READ,		{ .vop_read = zfs_read },
5525219089Spjd	VOPNAME_WRITE,		{ .vop_write = zfs_write },
5526219089Spjd	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
5527219089Spjd	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5528219089Spjd	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
5529219089Spjd	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5530219089Spjd	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
5531219089Spjd	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
5532219089Spjd	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
5533219089Spjd	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5534219089Spjd	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5535219089Spjd	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
5536219089Spjd	VOPNAME_FRLOCK,		{ .vop_frlock = zfs_frlock },
5537219089Spjd	VOPNAME_SPACE,		{ .vop_space = zfs_space },
5538219089Spjd	VOPNAME_GETPAGE,	{ .vop_getpage = zfs_getpage },
5539219089Spjd	VOPNAME_PUTPAGE,	{ .vop_putpage = zfs_putpage },
5540219089Spjd	VOPNAME_MAP,		{ .vop_map = zfs_map },
5541219089Spjd	VOPNAME_ADDMAP,		{ .vop_addmap = zfs_addmap },
5542219089Spjd	VOPNAME_DELMAP,		{ .vop_delmap = zfs_delmap },
5543219089Spjd	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5544219089Spjd	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
5545219089Spjd	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
5546219089Spjd	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5547219089Spjd	VOPNAME_REQZCBUF, 	{ .vop_reqzcbuf = zfs_reqzcbuf },
5548219089Spjd	VOPNAME_RETZCBUF, 	{ .vop_retzcbuf = zfs_retzcbuf },
5549219089Spjd	NULL,			NULL
5550219089Spjd};
5551219089Spjd
5552219089Spjd/*
5553219089Spjd * Symbolic link vnode operations template
 *
 * Lists only getattr, setattr, access, rename, readlink, inactive, fid,
 * pathconf and vnevent.  The list is terminated by a NULL, NULL entry.
5554219089Spjd */
5555219089Spjdvnodeops_t *zfs_symvnodeops;
5556219089Spjdconst fs_operation_def_t zfs_symvnodeops_template[] = {
5557219089Spjd	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5558219089Spjd	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
5559219089Spjd	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5560219089Spjd	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
5561219089Spjd	VOPNAME_READLINK,	{ .vop_readlink = zfs_readlink },
5562219089Spjd	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5563219089Spjd	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5564219089Spjd	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5565219089Spjd	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5566219089Spjd	NULL,			NULL
5567219089Spjd};
5568219089Spjd
5569219089Spjd/*
5570219089Spjd * special share hidden files vnode operations template
 *
 * Lists only getattr, access, inactive, fid, pathconf, getsecattr,
 * setsecattr and vnevent.  The list is terminated by a NULL, NULL entry.
5571219089Spjd */
5572219089Spjdvnodeops_t *zfs_sharevnodeops;
5573219089Spjdconst fs_operation_def_t zfs_sharevnodeops_template[] = {
5574219089Spjd	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5575219089Spjd	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5576219089Spjd	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5577219089Spjd	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5578219089Spjd	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5579219089Spjd	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
5580219089Spjd	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
5581219089Spjd	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5582219089Spjd	NULL,			NULL
5583219089Spjd};
5584219089Spjd
5585219089Spjd/*
5586219089Spjd * Extended attribute directory vnode operations template
5587219089Spjd *	This template is identical to the directory vnodes
5588219089Spjd *	operation template except for restricted operations:
5589219089Spjd *		VOP_MKDIR()
5590219089Spjd *		VOP_SYMLINK()
5591219089Spjd * Note that there are other restrictions embedded in:
5592219089Spjd *	zfs_create()	- restrict type to VREG
5593219089Spjd *	zfs_link()	- no links into/out of attribute space
5594219089Spjd *	zfs_rename()	- no moves into/out of attribute space
 *
 * The restricted operations use the zfs_inval() error stub (EINVAL).
 * NOTE(review): unlike the directory template, this table has no
 * VOPNAME_READ/VOPNAME_WRITE entries, so "identical" above is not exact.
 * The list is terminated by a NULL, NULL entry.
5595219089Spjd */
5596219089Spjdvnodeops_t *zfs_xdvnodeops;
5597219089Spjdconst fs_operation_def_t zfs_xdvnodeops_template[] = {
5598219089Spjd	VOPNAME_OPEN,		{ .vop_open = zfs_open },
5599219089Spjd	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
5600219089Spjd	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
5601219089Spjd	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5602219089Spjd	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
5603219089Spjd	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5604219089Spjd	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
5605219089Spjd	VOPNAME_CREATE,		{ .vop_create = zfs_create },
5606219089Spjd	VOPNAME_REMOVE,		{ .vop_remove = zfs_remove },
5607219089Spjd	VOPNAME_LINK,		{ .vop_link = zfs_link },
5608219089Spjd	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
5609219089Spjd	VOPNAME_MKDIR,		{ .error = zfs_inval },
5610219089Spjd	VOPNAME_RMDIR,		{ .vop_rmdir = zfs_rmdir },
5611219089Spjd	VOPNAME_READDIR,	{ .vop_readdir = zfs_readdir },
5612219089Spjd	VOPNAME_SYMLINK,	{ .error = zfs_inval },
5613219089Spjd	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
5614219089Spjd	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5615219089Spjd	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5616219089Spjd	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
5617219089Spjd	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5618219089Spjd	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
5619219089Spjd	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
5620219089Spjd	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5621219089Spjd	NULL,			NULL
5622219089Spjd};
5623219089Spjd
5624219089Spjd/*
5625219089Spjd * Error vnode operations template
 *
 * Provides only inactive and pathconf.  The list is terminated by a
 * NULL, NULL entry.
5626219089Spjd */
5627219089Spjdvnodeops_t *zfs_evnodeops;
5628219089Spjdconst fs_operation_def_t zfs_evnodeops_template[] = {
5629219089Spjd	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5630219089Spjd	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5631219089Spjd	NULL,			NULL
5632219089Spjd};
5633219089Spjd#endif	/* sun */
5634219089Spjd
5635219089Spjdstatic int
5636213673Spjdioflags(int ioflags)
5637213673Spjd{
5638213673Spjd	int flags = 0;
5639213673Spjd
5640213673Spjd	if (ioflags & IO_APPEND)
5641213673Spjd		flags |= FAPPEND;
5642213673Spjd	if (ioflags & IO_NDELAY)
5643213673Spjd        	flags |= FNONBLOCK;
5644213673Spjd	if (ioflags & IO_SYNC)
5645213673Spjd		flags |= (FSYNC | FDSYNC | FRSYNC);
5646213673Spjd
5647213673Spjd	return (flags);
5648213673Spjd}
5649213673Spjd
/*
 * Page-in backend: fill the requested page of vp (and, when possible,
 * neighbouring pages from m[] that fall in the same file block) with
 * data read through dmu_read().
 *
 *	IN:	vp	- vnode being paged in.
 *		m	- array of pages supplied by the caller.
 *		count	- size of the request; treated as a byte count and
 *			  rounded up to a number of pages.
 *		reqpage	- index in m[] of the page that must be filled.
 *
 *	RETURN:	zfs_vm_pagerret_ok if the requested page was already valid
 *		or all reads succeeded
 *		zfs_vm_pagerret_bad if the requested page starts at or
 *		beyond the object's vnp_size
 *		zfs_vm_pagerret_error if dmu_read() failed
 *
 * Pages of m[] that are not going to be read are freed here.
 */
5650213673Spjdstatic int
5651213937Savgzfs_getpages(struct vnode *vp, vm_page_t *m, int count, int reqpage)
5652213937Savg{
5653213937Savg	znode_t *zp = VTOZ(vp);
5654213937Savg	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5655213937Savg	objset_t *os = zp->z_zfsvfs->z_os;
5656243517Savg	vm_page_t mfirst, mlast, mreq;
5657213937Savg	vm_object_t object;
5658213937Savg	caddr_t va;
5659213937Savg	struct sf_buf *sf;
5660243517Savg	off_t startoff, endoff;
5661213937Savg	int i, error;
5662243517Savg	vm_pindex_t reqstart, reqend;
5663243517Savg	int pcount, lsize, reqsize, size;
5664213937Savg
5665213937Savg	ZFS_ENTER(zfsvfs);
5666213937Savg	ZFS_VERIFY_ZP(zp);
5667213937Savg
5668243517Savg	pcount = OFF_TO_IDX(round_page(count));
5669213937Savg	mreq = m[reqpage];
5670213937Savg	object = mreq->object;
5671213937Savg	error = 0;
5672213937Savg
5673213937Savg	KASSERT(vp->v_object == object, ("mismatching object"));
5674213937Savg
	/*
	 * With more than one page on offer and a file block larger than a
	 * page, widen the read to the pages of m[] that share a block with
	 * the requested page.  reqstart ends up as an index into m[]
	 * (clamped to 0); reqend is a page index within the object (clamped
	 * to the last page in m[]).  Otherwise only the requested page is
	 * read.
	 */
5675243517Savg	if (pcount > 1 && zp->z_blksz > PAGESIZE) {
5676243517Savg		startoff = rounddown(IDX_TO_OFF(mreq->pindex), zp->z_blksz);
5677243517Savg		reqstart = OFF_TO_IDX(round_page(startoff));
5678243517Savg		if (reqstart < m[0]->pindex)
5679243517Savg			reqstart = 0;
5680243517Savg		else
5681243517Savg			reqstart = reqstart - m[0]->pindex;
5682243517Savg		endoff = roundup(IDX_TO_OFF(mreq->pindex) + PAGE_SIZE,
5683243517Savg		    zp->z_blksz);
5684243517Savg		reqend = OFF_TO_IDX(trunc_page(endoff)) - 1;
5685243517Savg		if (reqend > m[pcount - 1]->pindex)
5686243517Savg			reqend = m[pcount - 1]->pindex;
5687243517Savg		reqsize = reqend - m[reqstart]->pindex + 1;
5688243517Savg		KASSERT(reqstart <= reqpage && reqpage < reqstart + reqsize,
5689243517Savg		    ("reqpage beyond [reqstart, reqstart + reqsize[ bounds"));
5690243517Savg	} else {
5691243517Savg		reqstart = reqpage;
5692243517Savg		reqsize = 1;
5693243517Savg	}
5694243517Savg	mfirst = m[reqstart];
5695243517Savg	mlast = m[reqstart + reqsize - 1];
5696243517Savg
5697248084Sattilio	zfs_vmobject_wlock(object);
5698213937Savg
	/* Free the pages before and after the [reqstart, reqstart + reqsize) window. */
5699243517Savg	for (i = 0; i < reqstart; i++) {
5700243517Savg		vm_page_lock(m[i]);
5701243517Savg		vm_page_free(m[i]);
5702243517Savg		vm_page_unlock(m[i]);
5703213937Savg	}
5704243517Savg	for (i = reqstart + reqsize; i < pcount; i++) {
5705243517Savg		vm_page_lock(m[i]);
5706243517Savg		vm_page_free(m[i]);
5707243517Savg		vm_page_unlock(m[i]);
5708243517Savg	}
5709213937Savg
	/*
	 * Single-page request whose page already holds valid data: zero
	 * whatever part is not valid and return without reading.
	 */
5710243517Savg	if (mreq->valid && reqsize == 1) {
5711213937Savg		if (mreq->valid != VM_PAGE_BITS_ALL)
5712213937Savg			vm_page_zero_invalid(mreq, TRUE);
5713248084Sattilio		zfs_vmobject_wunlock(object);
5714213937Savg		ZFS_EXIT(zfsvfs);
5715248084Sattilio		return (zfs_vm_pagerret_ok);
5716213937Savg	}
5717213937Savg
5718213937Savg	PCPU_INC(cnt.v_vnodein);
5719243517Savg	PCPU_ADD(cnt.v_vnodepgsin, reqsize);
5720213937Savg
	/*
	 * The requested page starts at or past the object's size: free the
	 * other pages of the window (the requested page is left alone) and
	 * report a bad request.
	 */
5721213937Savg	if (IDX_TO_OFF(mreq->pindex) >= object->un_pager.vnp.vnp_size) {
5722243517Savg		for (i = reqstart; i < reqstart + reqsize; i++) {
5723243517Savg			if (i != reqpage) {
5724243517Savg				vm_page_lock(m[i]);
5725243517Savg				vm_page_free(m[i]);
5726243517Savg				vm_page_unlock(m[i]);
5727243517Savg			}
5728243517Savg		}
5729248084Sattilio		zfs_vmobject_wunlock(object);
5730213937Savg		ZFS_EXIT(zfsvfs);
5731248084Sattilio		return (zfs_vm_pagerret_bad);
5732213937Savg	}
5733213937Savg
	/* lsize: bytes to read into the last page when it straddles the object's size. */
5734243517Savg	lsize = PAGE_SIZE;
5735243517Savg	if (IDX_TO_OFF(mlast->pindex) + lsize > object->un_pager.vnp.vnp_size)
5736243517Savg		lsize = object->un_pager.vnp.vnp_size - IDX_TO_OFF(mlast->pindex);
5737213937Savg
	/* The object lock is not held across the reads below. */
5738248084Sattilio	zfs_vmobject_wunlock(object);
5739243517Savg
	/*
	 * Read each page of the window through a temporary mapping.  A short
	 * read into the last page is padded with zeroes.  The loop stops at
	 * the first dmu_read() error.
	 */
5740243517Savg	for (i = reqstart; i < reqstart + reqsize; i++) {
5741243517Savg		size = PAGE_SIZE;
5742243517Savg		if (i == (reqstart + reqsize - 1))
5743243517Savg			size = lsize;
5744243517Savg		va = zfs_map_page(m[i], &sf);
5745243517Savg		error = dmu_read(os, zp->z_id, IDX_TO_OFF(m[i]->pindex),
5746243517Savg		    size, va, DMU_READ_PREFETCH);
5747243517Savg		if (size != PAGE_SIZE)
5748243517Savg			bzero(va + size, PAGE_SIZE - size);
5749243517Savg		zfs_unmap_page(sf);
5750243517Savg		if (error != 0)
5751243517Savg			break;
5752243517Savg	}
5753243517Savg
5754248084Sattilio	zfs_vmobject_wlock(object);
5755213937Savg
	/*
	 * On success mark every page of the window fully valid.  Every page
	 * other than the requested one is passed to
	 * vm_page_readahead_finish(), on success or failure.
	 */
5756243517Savg	for (i = reqstart; i < reqstart + reqsize; i++) {
5757243763Savg		if (!error)
5758243763Savg			m[i]->valid = VM_PAGE_BITS_ALL;
5759243517Savg		KASSERT(m[i]->dirty == 0, ("zfs_getpages: page %p is dirty", m[i]));
5760243763Savg		if (i != reqpage)
5761243763Savg			vm_page_readahead_finish(m[i]);
5762243517Savg	}
5763243517Savg
5764248084Sattilio	zfs_vmobject_wunlock(object);
5765213937Savg
5766213937Savg	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
5767213937Savg	ZFS_EXIT(zfsvfs);
5768248084Sattilio	return (error ? zfs_vm_pagerret_error : zfs_vm_pagerret_ok);
5769213937Savg}
5770213937Savg
5771213937Savgstatic int
5772213937Savgzfs_freebsd_getpages(ap)
5773213937Savg	struct vop_getpages_args /* {
5774213937Savg		struct vnode *a_vp;
5775213937Savg		vm_page_t *a_m;
5776213937Savg		int a_count;
5777213937Savg		int a_reqpage;
5778213937Savg		vm_ooffset_t a_offset;
5779213937Savg	} */ *ap;
5780213937Savg{
5781213937Savg
5782213937Savg	return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_reqpage));
5783213937Savg}
5784213937Savg
5785213937Savgstatic int
5786243518Savgzfs_freebsd_bmap(ap)
5787243518Savg	struct vop_bmap_args /* {
5788243518Savg		struct vnode *a_vp;
5789243518Savg		daddr_t  a_bn;
5790243518Savg		struct bufobj **a_bop;
5791243518Savg		daddr_t *a_bnp;
5792243518Savg		int *a_runp;
5793243518Savg		int *a_runb;
5794243518Savg	} */ *ap;
5795243518Savg{
5796243518Savg
5797243518Savg	if (ap->a_bop != NULL)
5798243518Savg		*ap->a_bop = &ap->a_vp->v_bufobj;
5799243518Savg	if (ap->a_bnp != NULL)
5800243518Savg		*ap->a_bnp = ap->a_bn;
5801243518Savg	if (ap->a_runp != NULL)
5802243518Savg		*ap->a_runp = 0;
5803243518Savg	if (ap->a_runb != NULL)
5804243518Savg		*ap->a_runb = 0;
5805243518Savg
5806243518Savg	return (0);
5807243518Savg}
5808243518Savg
5809243518Savgstatic int
5810168962Spjdzfs_freebsd_open(ap)
5811168962Spjd	struct vop_open_args /* {
5812168962Spjd		struct vnode *a_vp;
5813168962Spjd		int a_mode;
5814168962Spjd		struct ucred *a_cred;
5815168962Spjd		struct thread *a_td;
5816168962Spjd	} */ *ap;
5817168962Spjd{
5818168962Spjd	vnode_t	*vp = ap->a_vp;
5819168962Spjd	znode_t *zp = VTOZ(vp);
5820168962Spjd	int error;
5821168962Spjd
5822185029Spjd	error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
5823168962Spjd	if (error == 0)
5824219089Spjd		vnode_create_vobject(vp, zp->z_size, ap->a_td);
5825168962Spjd	return (error);
5826168962Spjd}
5827168962Spjd
5828168962Spjdstatic int
5829168962Spjdzfs_freebsd_close(ap)
5830168962Spjd	struct vop_close_args /* {
5831168962Spjd		struct vnode *a_vp;
5832168962Spjd		int  a_fflag;
5833168962Spjd		struct ucred *a_cred;
5834168962Spjd		struct thread *a_td;
5835168962Spjd	} */ *ap;
5836168962Spjd{
5837168962Spjd
5838242566Savg	return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL));
5839168962Spjd}
5840168962Spjd
5841168962Spjdstatic int
5842168962Spjdzfs_freebsd_ioctl(ap)
5843168962Spjd	struct vop_ioctl_args /* {
5844168962Spjd		struct vnode *a_vp;
5845168962Spjd		u_long a_command;
5846168962Spjd		caddr_t a_data;
5847168962Spjd		int a_fflag;
5848168962Spjd		struct ucred *cred;
5849168962Spjd		struct thread *td;
5850168962Spjd	} */ *ap;
5851168962Spjd{
5852168962Spjd
5853168978Spjd	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
5854185029Spjd	    ap->a_fflag, ap->a_cred, NULL, NULL));
5855168962Spjd}
5856168962Spjd
5857168962Spjdstatic int
5858168962Spjdzfs_freebsd_read(ap)
5859168962Spjd	struct vop_read_args /* {
5860168962Spjd		struct vnode *a_vp;
5861168962Spjd		struct uio *a_uio;
5862168962Spjd		int a_ioflag;
5863168962Spjd		struct ucred *a_cred;
5864168962Spjd	} */ *ap;
5865168962Spjd{
5866168962Spjd
5867213673Spjd	return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
5868213673Spjd	    ap->a_cred, NULL));
5869168962Spjd}
5870168962Spjd
5871168962Spjdstatic int
5872168962Spjdzfs_freebsd_write(ap)
5873168962Spjd	struct vop_write_args /* {
5874168962Spjd		struct vnode *a_vp;
5875168962Spjd		struct uio *a_uio;
5876168962Spjd		int a_ioflag;
5877168962Spjd		struct ucred *a_cred;
5878168962Spjd	} */ *ap;
5879168962Spjd{
5880168962Spjd
5881213673Spjd	return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
5882213673Spjd	    ap->a_cred, NULL));
5883168962Spjd}
5884168962Spjd
5885168962Spjdstatic int
5886168962Spjdzfs_freebsd_access(ap)
5887168962Spjd	struct vop_access_args /* {
5888168962Spjd		struct vnode *a_vp;
5889192689Strasz		accmode_t a_accmode;
5890168962Spjd		struct ucred *a_cred;
5891168962Spjd		struct thread *a_td;
5892168962Spjd	} */ *ap;
5893168962Spjd{
5894212002Sjh	vnode_t *vp = ap->a_vp;
5895212002Sjh	znode_t *zp = VTOZ(vp);
5896198703Spjd	accmode_t accmode;
5897198703Spjd	int error = 0;
5898168962Spjd
5899185172Spjd	/*
5900198703Spjd	 * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND,
5901185172Spjd	 */
5902198703Spjd	accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
5903198703Spjd	if (accmode != 0)
5904198703Spjd		error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL);
5905185172Spjd
5906198703Spjd	/*
5907198703Spjd	 * VADMIN has to be handled by vaccess().
5908198703Spjd	 */
5909198703Spjd	if (error == 0) {
5910198703Spjd		accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
5911198703Spjd		if (accmode != 0) {
5912219089Spjd			error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
5913219089Spjd			    zp->z_gid, accmode, ap->a_cred, NULL);
5914198703Spjd		}
5915185172Spjd	}
5916185172Spjd
5917212002Sjh	/*
5918212002Sjh	 * For VEXEC, ensure that at least one execute bit is set for
5919212002Sjh	 * non-directories.
5920212002Sjh	 */
5921212002Sjh	if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
5922219089Spjd	    (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
5923212002Sjh		error = EACCES;
5924219089Spjd	}
5925212002Sjh
5926198703Spjd	return (error);
5927168962Spjd}
5928168962Spjd
5929168962Spjdstatic int
5930168962Spjdzfs_freebsd_lookup(ap)
5931168962Spjd	struct vop_lookup_args /* {
5932168962Spjd		struct vnode *a_dvp;
5933168962Spjd		struct vnode **a_vpp;
5934168962Spjd		struct componentname *a_cnp;
5935168962Spjd	} */ *ap;
5936168962Spjd{
5937168962Spjd	struct componentname *cnp = ap->a_cnp;
5938168962Spjd	char nm[NAME_MAX + 1];
5939168962Spjd
5940168962Spjd	ASSERT(cnp->cn_namelen < sizeof(nm));
5941168962Spjd	strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
5942168962Spjd
5943168962Spjd	return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
5944185029Spjd	    cnp->cn_cred, cnp->cn_thread, 0));
5945168962Spjd}
5946168962Spjd
5947168962Spjdstatic int
5948168962Spjdzfs_freebsd_create(ap)
5949168962Spjd	struct vop_create_args /* {
5950168962Spjd		struct vnode *a_dvp;
5951168962Spjd		struct vnode **a_vpp;
5952168962Spjd		struct componentname *a_cnp;
5953168962Spjd		struct vattr *a_vap;
5954168962Spjd	} */ *ap;
5955168962Spjd{
5956168962Spjd	struct componentname *cnp = ap->a_cnp;
5957168962Spjd	vattr_t *vap = ap->a_vap;
5958168962Spjd	int mode;
5959168962Spjd
5960168962Spjd	ASSERT(cnp->cn_flags & SAVENAME);
5961168962Spjd
5962168962Spjd	vattr_init_mask(vap);
5963168962Spjd	mode = vap->va_mode & ALLPERMS;
5964168962Spjd
5965168962Spjd	return (zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
5966185029Spjd	    ap->a_vpp, cnp->cn_cred, cnp->cn_thread));
5967168962Spjd}
5968168962Spjd
5969168962Spjdstatic int
5970168962Spjdzfs_freebsd_remove(ap)
5971168962Spjd	struct vop_remove_args /* {
5972168962Spjd		struct vnode *a_dvp;
5973168962Spjd		struct vnode *a_vp;
5974168962Spjd		struct componentname *a_cnp;
5975168962Spjd	} */ *ap;
5976168962Spjd{
5977168962Spjd
5978168962Spjd	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
5979168962Spjd
5980168962Spjd	return (zfs_remove(ap->a_dvp, ap->a_cnp->cn_nameptr,
5981185029Spjd	    ap->a_cnp->cn_cred, NULL, 0));
5982168962Spjd}
5983168962Spjd
5984168962Spjdstatic int
5985168962Spjdzfs_freebsd_mkdir(ap)
5986168962Spjd	struct vop_mkdir_args /* {
5987168962Spjd		struct vnode *a_dvp;
5988168962Spjd		struct vnode **a_vpp;
5989168962Spjd		struct componentname *a_cnp;
5990168962Spjd		struct vattr *a_vap;
5991168962Spjd	} */ *ap;
5992168962Spjd{
5993168962Spjd	vattr_t *vap = ap->a_vap;
5994168962Spjd
5995168962Spjd	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
5996168962Spjd
5997168962Spjd	vattr_init_mask(vap);
5998168962Spjd
5999168962Spjd	return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
6000185029Spjd	    ap->a_cnp->cn_cred, NULL, 0, NULL));
6001168962Spjd}
6002168962Spjd
6003168962Spjdstatic int
6004168962Spjdzfs_freebsd_rmdir(ap)
6005168962Spjd	struct vop_rmdir_args /* {
6006168962Spjd		struct vnode *a_dvp;
6007168962Spjd		struct vnode *a_vp;
6008168962Spjd		struct componentname *a_cnp;
6009168962Spjd	} */ *ap;
6010168962Spjd{
6011168962Spjd	struct componentname *cnp = ap->a_cnp;
6012168962Spjd
6013168962Spjd	ASSERT(cnp->cn_flags & SAVENAME);
6014168962Spjd
6015185029Spjd	return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred, NULL, 0));
6016168962Spjd}
6017168962Spjd
6018168962Spjdstatic int
6019168962Spjdzfs_freebsd_readdir(ap)
6020168962Spjd	struct vop_readdir_args /* {
6021168962Spjd		struct vnode *a_vp;
6022168962Spjd		struct uio *a_uio;
6023168962Spjd		struct ucred *a_cred;
6024168962Spjd		int *a_eofflag;
6025168962Spjd		int *a_ncookies;
6026168962Spjd		u_long **a_cookies;
6027168962Spjd	} */ *ap;
6028168962Spjd{
6029168962Spjd
6030168962Spjd	return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
6031168962Spjd	    ap->a_ncookies, ap->a_cookies));
6032168962Spjd}
6033168962Spjd
6034168962Spjdstatic int
6035168962Spjdzfs_freebsd_fsync(ap)
6036168962Spjd	struct vop_fsync_args /* {
6037168962Spjd		struct vnode *a_vp;
6038168962Spjd		int a_waitfor;
6039168962Spjd		struct thread *a_td;
6040168962Spjd	} */ *ap;
6041168962Spjd{
6042168962Spjd
6043168962Spjd	vop_stdfsync(ap);
6044185029Spjd	return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL));
6045168962Spjd}
6046168962Spjd
6047168962Spjdstatic int
6048168962Spjdzfs_freebsd_getattr(ap)
6049168962Spjd	struct vop_getattr_args /* {
6050168962Spjd		struct vnode *a_vp;
6051168962Spjd		struct vattr *a_vap;
6052168962Spjd		struct ucred *a_cred;
6053168962Spjd	} */ *ap;
6054168962Spjd{
6055185029Spjd	vattr_t *vap = ap->a_vap;
6056185029Spjd	xvattr_t xvap;
6057185029Spjd	u_long fflags = 0;
6058185029Spjd	int error;
6059168962Spjd
6060185029Spjd	xva_init(&xvap);
6061185029Spjd	xvap.xva_vattr = *vap;
6062185029Spjd	xvap.xva_vattr.va_mask |= AT_XVATTR;
6063185029Spjd
6064185029Spjd	/* Convert chflags into ZFS-type flags. */
6065185029Spjd	/* XXX: what about SF_SETTABLE?. */
6066185029Spjd	XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
6067185029Spjd	XVA_SET_REQ(&xvap, XAT_APPENDONLY);
6068185029Spjd	XVA_SET_REQ(&xvap, XAT_NOUNLINK);
6069185029Spjd	XVA_SET_REQ(&xvap, XAT_NODUMP);
6070185029Spjd	error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
6071185029Spjd	if (error != 0)
6072185029Spjd		return (error);
6073185029Spjd
6074185029Spjd	/* Convert ZFS xattr into chflags. */
6075185029Spjd#define	FLAG_CHECK(fflag, xflag, xfield)	do {			\
6076185029Spjd	if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0)		\
6077185029Spjd		fflags |= (fflag);					\
6078185029Spjd} while (0)
6079185029Spjd	FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
6080185029Spjd	    xvap.xva_xoptattrs.xoa_immutable);
6081185029Spjd	FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
6082185029Spjd	    xvap.xva_xoptattrs.xoa_appendonly);
6083185029Spjd	FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
6084185029Spjd	    xvap.xva_xoptattrs.xoa_nounlink);
6085185029Spjd	FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
6086185029Spjd	    xvap.xva_xoptattrs.xoa_nodump);
6087185029Spjd#undef	FLAG_CHECK
6088185029Spjd	*vap = xvap.xva_vattr;
6089185029Spjd	vap->va_flags = fflags;
6090185029Spjd	return (0);
6091168962Spjd}
6092168962Spjd
6093168962Spjdstatic int
6094168962Spjdzfs_freebsd_setattr(ap)
6095168962Spjd	struct vop_setattr_args /* {
6096168962Spjd		struct vnode *a_vp;
6097168962Spjd		struct vattr *a_vap;
6098168962Spjd		struct ucred *a_cred;
6099168962Spjd	} */ *ap;
6100168962Spjd{
6101185172Spjd	vnode_t *vp = ap->a_vp;
6102168962Spjd	vattr_t *vap = ap->a_vap;
6103185172Spjd	cred_t *cred = ap->a_cred;
6104185029Spjd	xvattr_t xvap;
6105185029Spjd	u_long fflags;
6106185029Spjd	uint64_t zflags;
6107168962Spjd
6108168962Spjd	vattr_init_mask(vap);
6109170044Spjd	vap->va_mask &= ~AT_NOSET;
6110168962Spjd
6111185029Spjd	xva_init(&xvap);
6112185029Spjd	xvap.xva_vattr = *vap;
6113185029Spjd
6114219089Spjd	zflags = VTOZ(vp)->z_pflags;
6115185172Spjd
6116185029Spjd	if (vap->va_flags != VNOVAL) {
6117197683Sdelphij		zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
6118185172Spjd		int error;
6119185172Spjd
6120197683Sdelphij		if (zfsvfs->z_use_fuids == B_FALSE)
6121197683Sdelphij			return (EOPNOTSUPP);
6122197683Sdelphij
6123185029Spjd		fflags = vap->va_flags;
6124185029Spjd		if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_NODUMP)) != 0)
6125185029Spjd			return (EOPNOTSUPP);
6126185172Spjd		/*
6127185172Spjd		 * Unprivileged processes are not permitted to unset system
6128185172Spjd		 * flags, or modify flags if any system flags are set.
6129185172Spjd		 * Privileged non-jail processes may not modify system flags
6130185172Spjd		 * if securelevel > 0 and any existing system flags are set.
6131185172Spjd		 * Privileged jail processes behave like privileged non-jail
6132185172Spjd		 * processes if the security.jail.chflags_allowed sysctl is
6133185172Spjd		 * is non-zero; otherwise, they behave like unprivileged
6134185172Spjd		 * processes.
6135185172Spjd		 */
6136197861Spjd		if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
6137197861Spjd		    priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) {
6138185172Spjd			if (zflags &
6139185172Spjd			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
6140185172Spjd				error = securelevel_gt(cred, 0);
6141197861Spjd				if (error != 0)
6142185172Spjd					return (error);
6143185172Spjd			}
6144185172Spjd		} else {
6145197861Spjd			/*
6146197861Spjd			 * Callers may only modify the file flags on objects they
6147197861Spjd			 * have VADMIN rights for.
6148197861Spjd			 */
6149197861Spjd			if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0)
6150197861Spjd				return (error);
6151185172Spjd			if (zflags &
6152185172Spjd			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
6153185172Spjd				return (EPERM);
6154185172Spjd			}
6155185172Spjd			if (fflags &
6156185172Spjd			    (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
6157185172Spjd				return (EPERM);
6158185172Spjd			}
6159185172Spjd		}
6160185029Spjd
6161185029Spjd#define	FLAG_CHANGE(fflag, zflag, xflag, xfield)	do {		\
6162185029Spjd	if (((fflags & (fflag)) && !(zflags & (zflag))) ||		\
6163185029Spjd	    ((zflags & (zflag)) && !(fflags & (fflag)))) {		\
6164185029Spjd		XVA_SET_REQ(&xvap, (xflag));				\
6165185029Spjd		(xfield) = ((fflags & (fflag)) != 0);			\
6166185029Spjd	}								\
6167185029Spjd} while (0)
6168185029Spjd		/* Convert chflags into ZFS-type flags. */
6169185029Spjd		/* XXX: what about SF_SETTABLE?. */
6170185029Spjd		FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
6171185029Spjd		    xvap.xva_xoptattrs.xoa_immutable);
6172185029Spjd		FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
6173185029Spjd		    xvap.xva_xoptattrs.xoa_appendonly);
6174185029Spjd		FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
6175185029Spjd		    xvap.xva_xoptattrs.xoa_nounlink);
6176185029Spjd		FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
6177185172Spjd		    xvap.xva_xoptattrs.xoa_nodump);
6178185029Spjd#undef	FLAG_CHANGE
6179185029Spjd	}
6180185172Spjd	return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL));
6181168962Spjd}
6182168962Spjd
6183168962Spjdstatic int
6184168962Spjdzfs_freebsd_rename(ap)
6185168962Spjd	struct vop_rename_args  /* {
6186168962Spjd		struct vnode *a_fdvp;
6187168962Spjd		struct vnode *a_fvp;
6188168962Spjd		struct componentname *a_fcnp;
6189168962Spjd		struct vnode *a_tdvp;
6190168962Spjd		struct vnode *a_tvp;
6191168962Spjd		struct componentname *a_tcnp;
6192168962Spjd	} */ *ap;
6193168962Spjd{
6194168962Spjd	vnode_t *fdvp = ap->a_fdvp;
6195168962Spjd	vnode_t *fvp = ap->a_fvp;
6196168962Spjd	vnode_t *tdvp = ap->a_tdvp;
6197168962Spjd	vnode_t *tvp = ap->a_tvp;
6198168962Spjd	int error;
6199168962Spjd
6200192237Skmacy	ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
6201192237Skmacy	ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
6202168962Spjd
6203168962Spjd	error = zfs_rename(fdvp, ap->a_fcnp->cn_nameptr, tdvp,
6204185029Spjd	    ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred, NULL, 0);
6205168962Spjd
6206168962Spjd	if (tdvp == tvp)
6207168962Spjd		VN_RELE(tdvp);
6208168962Spjd	else
6209168962Spjd		VN_URELE(tdvp);
6210168962Spjd	if (tvp)
6211168962Spjd		VN_URELE(tvp);
6212168962Spjd	VN_RELE(fdvp);
6213168962Spjd	VN_RELE(fvp);
6214168962Spjd
6215168962Spjd	return (error);
6216168962Spjd}
6217168962Spjd
6218168962Spjdstatic int
6219168962Spjdzfs_freebsd_symlink(ap)
6220168962Spjd	struct vop_symlink_args /* {
6221168962Spjd		struct vnode *a_dvp;
6222168962Spjd		struct vnode **a_vpp;
6223168962Spjd		struct componentname *a_cnp;
6224168962Spjd		struct vattr *a_vap;
6225168962Spjd		char *a_target;
6226168962Spjd	} */ *ap;
6227168962Spjd{
6228168962Spjd	struct componentname *cnp = ap->a_cnp;
6229168962Spjd	vattr_t *vap = ap->a_vap;
6230168962Spjd
6231168962Spjd	ASSERT(cnp->cn_flags & SAVENAME);
6232168962Spjd
6233168962Spjd	vap->va_type = VLNK;	/* FreeBSD: Syscall only sets va_mode. */
6234168962Spjd	vattr_init_mask(vap);
6235168962Spjd
6236168962Spjd	return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
6237168962Spjd	    ap->a_target, cnp->cn_cred, cnp->cn_thread));
6238168962Spjd}
6239168962Spjd
6240168962Spjdstatic int
6241168962Spjdzfs_freebsd_readlink(ap)
6242168962Spjd	struct vop_readlink_args /* {
6243168962Spjd		struct vnode *a_vp;
6244168962Spjd		struct uio *a_uio;
6245168962Spjd		struct ucred *a_cred;
6246168962Spjd	} */ *ap;
6247168962Spjd{
6248168962Spjd
6249185029Spjd	return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
6250168962Spjd}
6251168962Spjd
6252168962Spjdstatic int
6253168962Spjdzfs_freebsd_link(ap)
6254168962Spjd	struct vop_link_args /* {
6255168962Spjd		struct vnode *a_tdvp;
6256168962Spjd		struct vnode *a_vp;
6257168962Spjd		struct componentname *a_cnp;
6258168962Spjd	} */ *ap;
6259168962Spjd{
6260168962Spjd	struct componentname *cnp = ap->a_cnp;
6261168962Spjd
6262168962Spjd	ASSERT(cnp->cn_flags & SAVENAME);
6263168962Spjd
6264185029Spjd	return (zfs_link(ap->a_tdvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
6265168962Spjd}
6266168962Spjd
6267168962Spjdstatic int
6268168962Spjdzfs_freebsd_inactive(ap)
6269169170Spjd	struct vop_inactive_args /* {
6270169170Spjd		struct vnode *a_vp;
6271169170Spjd		struct thread *a_td;
6272169170Spjd	} */ *ap;
6273168962Spjd{
6274168962Spjd	vnode_t *vp = ap->a_vp;
6275168962Spjd
6276185029Spjd	zfs_inactive(vp, ap->a_td->td_ucred, NULL);
6277168962Spjd	return (0);
6278168962Spjd}
6279168962Spjd
6280168962Spjdstatic int
6281168962Spjdzfs_freebsd_reclaim(ap)
6282168962Spjd	struct vop_reclaim_args /* {
6283168962Spjd		struct vnode *a_vp;
6284168962Spjd		struct thread *a_td;
6285168962Spjd	} */ *ap;
6286168962Spjd{
6287169170Spjd	vnode_t	*vp = ap->a_vp;
6288168962Spjd	znode_t	*zp = VTOZ(vp);
6289197133Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
6290168962Spjd
6291169025Spjd	ASSERT(zp != NULL);
6292169025Spjd
6293243520Savg	/* Destroy the vm object and flush associated pages. */
6294243520Savg	vnode_destroy_vobject(vp);
6295243520Savg
6296168962Spjd	/*
6297243520Savg	 * z_teardown_inactive_lock protects from a race with
6298243520Savg	 * zfs_znode_dmu_fini in zfsvfs_teardown during
6299243520Savg	 * force unmount.
6300168962Spjd	 */
6301243520Savg	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
6302243520Savg	if (zp->z_sa_hdl == NULL)
6303196301Spjd		zfs_znode_free(zp);
6304243520Savg	else
6305243520Savg		zfs_zinactive(zp);
6306243520Savg	rw_exit(&zfsvfs->z_teardown_inactive_lock);
6307185029Spjd
6308168962Spjd	vp->v_data = NULL;
6309168962Spjd	return (0);
6310168962Spjd}
6311168962Spjd
6312168962Spjdstatic int
6313168962Spjdzfs_freebsd_fid(ap)
6314168962Spjd	struct vop_fid_args /* {
6315168962Spjd		struct vnode *a_vp;
6316168962Spjd		struct fid *a_fid;
6317168962Spjd	} */ *ap;
6318168962Spjd{
6319168962Spjd
6320185029Spjd	return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
6321168962Spjd}
6322168962Spjd
6323168962Spjdstatic int
6324168962Spjdzfs_freebsd_pathconf(ap)
6325168962Spjd	struct vop_pathconf_args /* {
6326168962Spjd		struct vnode *a_vp;
6327168962Spjd		int a_name;
6328168962Spjd		register_t *a_retval;
6329168962Spjd	} */ *ap;
6330168962Spjd{
6331168962Spjd	ulong_t val;
6332168962Spjd	int error;
6333168962Spjd
6334185029Spjd	error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL);
6335168962Spjd	if (error == 0)
6336168962Spjd		*ap->a_retval = val;
6337168962Spjd	else if (error == EOPNOTSUPP)
6338168962Spjd		error = vop_stdpathconf(ap);
6339168962Spjd	return (error);
6340168962Spjd}
6341168962Spjd
6342196949Straszstatic int
6343196949Straszzfs_freebsd_fifo_pathconf(ap)
6344196949Strasz	struct vop_pathconf_args /* {
6345196949Strasz		struct vnode *a_vp;
6346196949Strasz		int a_name;
6347196949Strasz		register_t *a_retval;
6348196949Strasz	} */ *ap;
6349196949Strasz{
6350196949Strasz
6351196949Strasz	switch (ap->a_name) {
6352196949Strasz	case _PC_ACL_EXTENDED:
6353196949Strasz	case _PC_ACL_NFS4:
6354196949Strasz	case _PC_ACL_PATH_MAX:
6355196949Strasz	case _PC_MAC_PRESENT:
6356196949Strasz		return (zfs_freebsd_pathconf(ap));
6357196949Strasz	default:
6358196949Strasz		return (fifo_specops.vop_pathconf(ap));
6359196949Strasz	}
6360196949Strasz}
6361196949Strasz
6362185029Spjd/*
6363185029Spjd * FreeBSD's extended attributes namespace defines file name prefix for ZFS'
6364185029Spjd * extended attribute name:
6365185029Spjd *
6366185029Spjd *	NAMESPACE	PREFIX
6367185029Spjd *	system		freebsd:system:
6368185029Spjd *	user		(none, can be used to access ZFS fsattr(5) attributes
6369185029Spjd *			created on Solaris)
6370185029Spjd */
6371185029Spjdstatic int
6372185029Spjdzfs_create_attrname(int attrnamespace, const char *name, char *attrname,
6373185029Spjd    size_t size)
6374185029Spjd{
6375185029Spjd	const char *namespace, *prefix, *suffix;
6376185029Spjd
6377185029Spjd	/* We don't allow '/' character in attribute name. */
6378185029Spjd	if (strchr(name, '/') != NULL)
6379185029Spjd		return (EINVAL);
6380185029Spjd	/* We don't allow attribute names that start with "freebsd:" string. */
6381185029Spjd	if (strncmp(name, "freebsd:", 8) == 0)
6382185029Spjd		return (EINVAL);
6383185029Spjd
6384185029Spjd	bzero(attrname, size);
6385185029Spjd
6386185029Spjd	switch (attrnamespace) {
6387185029Spjd	case EXTATTR_NAMESPACE_USER:
6388185029Spjd#if 0
6389185029Spjd		prefix = "freebsd:";
6390185029Spjd		namespace = EXTATTR_NAMESPACE_USER_STRING;
6391185029Spjd		suffix = ":";
6392185029Spjd#else
6393185029Spjd		/*
6394185029Spjd		 * This is the default namespace by which we can access all
6395185029Spjd		 * attributes created on Solaris.
6396185029Spjd		 */
6397185029Spjd		prefix = namespace = suffix = "";
6398185029Spjd#endif
6399185029Spjd		break;
6400185029Spjd	case EXTATTR_NAMESPACE_SYSTEM:
6401185029Spjd		prefix = "freebsd:";
6402185029Spjd		namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
6403185029Spjd		suffix = ":";
6404185029Spjd		break;
6405185029Spjd	case EXTATTR_NAMESPACE_EMPTY:
6406185029Spjd	default:
6407185029Spjd		return (EINVAL);
6408185029Spjd	}
6409185029Spjd	if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
6410185029Spjd	    name) >= size) {
6411185029Spjd		return (ENAMETOOLONG);
6412185029Spjd	}
6413185029Spjd	return (0);
6414185029Spjd}
6415185029Spjd
6416185029Spjd/*
6417185029Spjd * Vnode operating to retrieve a named extended attribute.
6418185029Spjd */
6419185029Spjdstatic int
6420185029Spjdzfs_getextattr(struct vop_getextattr_args *ap)
6421185029Spjd/*
6422185029Spjdvop_getextattr {
6423185029Spjd	IN struct vnode *a_vp;
6424185029Spjd	IN int a_attrnamespace;
6425185029Spjd	IN const char *a_name;
6426185029Spjd	INOUT struct uio *a_uio;
6427185029Spjd	OUT size_t *a_size;
6428185029Spjd	IN struct ucred *a_cred;
6429185029Spjd	IN struct thread *a_td;
6430185029Spjd};
6431185029Spjd*/
6432185029Spjd{
6433185029Spjd	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
6434185029Spjd	struct thread *td = ap->a_td;
6435185029Spjd	struct nameidata nd;
6436185029Spjd	char attrname[255];
6437185029Spjd	struct vattr va;
6438185029Spjd	vnode_t *xvp = NULL, *vp;
6439185029Spjd	int error, flags;
6440185029Spjd
6441195785Strasz	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
6442195785Strasz	    ap->a_cred, ap->a_td, VREAD);
6443195785Strasz	if (error != 0)
6444195785Strasz		return (error);
6445195785Strasz
6446185029Spjd	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
6447185029Spjd	    sizeof(attrname));
6448185029Spjd	if (error != 0)
6449185029Spjd		return (error);
6450185029Spjd
6451185029Spjd	ZFS_ENTER(zfsvfs);
6452185029Spjd
6453185029Spjd	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
6454185029Spjd	    LOOKUP_XATTR);
6455185029Spjd	if (error != 0) {
6456185029Spjd		ZFS_EXIT(zfsvfs);
6457185029Spjd		return (error);
6458185029Spjd	}
6459185029Spjd
6460185029Spjd	flags = FREAD;
6461241896Skib	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
6462185029Spjd	    xvp, td);
6463194586Skib	error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL);
6464185029Spjd	vp = nd.ni_vp;
6465185029Spjd	NDFREE(&nd, NDF_ONLY_PNBUF);
6466185029Spjd	if (error != 0) {
6467196303Spjd		ZFS_EXIT(zfsvfs);
6468195785Strasz		if (error == ENOENT)
6469195785Strasz			error = ENOATTR;
6470185029Spjd		return (error);
6471185029Spjd	}
6472185029Spjd
6473185029Spjd	if (ap->a_size != NULL) {
6474185029Spjd		error = VOP_GETATTR(vp, &va, ap->a_cred);
6475185029Spjd		if (error == 0)
6476185029Spjd			*ap->a_size = (size_t)va.va_size;
6477185029Spjd	} else if (ap->a_uio != NULL)
6478224605Smm		error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred);
6479185029Spjd
6480185029Spjd	VOP_UNLOCK(vp, 0);
6481185029Spjd	vn_close(vp, flags, ap->a_cred, td);
6482185029Spjd	ZFS_EXIT(zfsvfs);
6483185029Spjd
6484185029Spjd	return (error);
6485185029Spjd}
6486185029Spjd
6487185029Spjd/*
6488185029Spjd * Vnode operation to remove a named attribute.
6489185029Spjd */
6490185029Spjdint
6491185029Spjdzfs_deleteextattr(struct vop_deleteextattr_args *ap)
6492185029Spjd/*
6493185029Spjdvop_deleteextattr {
6494185029Spjd	IN struct vnode *a_vp;
6495185029Spjd	IN int a_attrnamespace;
6496185029Spjd	IN const char *a_name;
6497185029Spjd	IN struct ucred *a_cred;
6498185029Spjd	IN struct thread *a_td;
6499185029Spjd};
6500185029Spjd*/
6501185029Spjd{
6502185029Spjd	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
6503185029Spjd	struct thread *td = ap->a_td;
6504185029Spjd	struct nameidata nd;
6505185029Spjd	char attrname[255];
6506185029Spjd	struct vattr va;
6507185029Spjd	vnode_t *xvp = NULL, *vp;
6508185029Spjd	int error, flags;
6509185029Spjd
6510195785Strasz	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
6511195785Strasz	    ap->a_cred, ap->a_td, VWRITE);
6512195785Strasz	if (error != 0)
6513195785Strasz		return (error);
6514195785Strasz
6515185029Spjd	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
6516185029Spjd	    sizeof(attrname));
6517185029Spjd	if (error != 0)
6518185029Spjd		return (error);
6519185029Spjd
6520185029Spjd	ZFS_ENTER(zfsvfs);
6521185029Spjd
6522185029Spjd	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
6523185029Spjd	    LOOKUP_XATTR);
6524185029Spjd	if (error != 0) {
6525185029Spjd		ZFS_EXIT(zfsvfs);
6526185029Spjd		return (error);
6527185029Spjd	}
6528185029Spjd
6529241896Skib	NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
6530185029Spjd	    UIO_SYSSPACE, attrname, xvp, td);
6531185029Spjd	error = namei(&nd);
6532185029Spjd	vp = nd.ni_vp;
6533185029Spjd	NDFREE(&nd, NDF_ONLY_PNBUF);
6534185029Spjd	if (error != 0) {
6535196303Spjd		ZFS_EXIT(zfsvfs);
6536195785Strasz		if (error == ENOENT)
6537195785Strasz			error = ENOATTR;
6538185029Spjd		return (error);
6539185029Spjd	}
6540185029Spjd	error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
6541185029Spjd
6542185029Spjd	vput(nd.ni_dvp);
6543185029Spjd	if (vp == nd.ni_dvp)
6544185029Spjd		vrele(vp);
6545185029Spjd	else
6546185029Spjd		vput(vp);
6547185029Spjd	ZFS_EXIT(zfsvfs);
6548185029Spjd
6549185029Spjd	return (error);
6550185029Spjd}
6551185029Spjd
6552185029Spjd/*
6553185029Spjd * Vnode operation to set a named attribute.
6554185029Spjd */
6555185029Spjdstatic int
6556185029Spjdzfs_setextattr(struct vop_setextattr_args *ap)
6557185029Spjd/*
6558185029Spjdvop_setextattr {
6559185029Spjd	IN struct vnode *a_vp;
6560185029Spjd	IN int a_attrnamespace;
6561185029Spjd	IN const char *a_name;
6562185029Spjd	INOUT struct uio *a_uio;
6563185029Spjd	IN struct ucred *a_cred;
6564185029Spjd	IN struct thread *a_td;
6565185029Spjd};
6566185029Spjd*/
6567185029Spjd{
6568185029Spjd	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
6569185029Spjd	struct thread *td = ap->a_td;
6570185029Spjd	struct nameidata nd;
6571185029Spjd	char attrname[255];
6572185029Spjd	struct vattr va;
6573185029Spjd	vnode_t *xvp = NULL, *vp;
6574185029Spjd	int error, flags;
6575185029Spjd
6576195785Strasz	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
6577195785Strasz	    ap->a_cred, ap->a_td, VWRITE);
6578195785Strasz	if (error != 0)
6579195785Strasz		return (error);
6580195785Strasz
6581185029Spjd	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
6582185029Spjd	    sizeof(attrname));
6583185029Spjd	if (error != 0)
6584185029Spjd		return (error);
6585185029Spjd
6586185029Spjd	ZFS_ENTER(zfsvfs);
6587185029Spjd
6588185029Spjd	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
6589195785Strasz	    LOOKUP_XATTR | CREATE_XATTR_DIR);
6590185029Spjd	if (error != 0) {
6591185029Spjd		ZFS_EXIT(zfsvfs);
6592185029Spjd		return (error);
6593185029Spjd	}
6594185029Spjd
6595185029Spjd	flags = FFLAGS(O_WRONLY | O_CREAT);
6596241896Skib	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
6597185029Spjd	    xvp, td);
6598194586Skib	error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL);
6599185029Spjd	vp = nd.ni_vp;
6600185029Spjd	NDFREE(&nd, NDF_ONLY_PNBUF);
6601185029Spjd	if (error != 0) {
6602185029Spjd		ZFS_EXIT(zfsvfs);
6603185029Spjd		return (error);
6604185029Spjd	}
6605185029Spjd
6606185029Spjd	VATTR_NULL(&va);
6607185029Spjd	va.va_size = 0;
6608185029Spjd	error = VOP_SETATTR(vp, &va, ap->a_cred);
6609185029Spjd	if (error == 0)
6610185029Spjd		VOP_WRITE(vp, ap->a_uio, IO_UNIT | IO_SYNC, ap->a_cred);
6611185029Spjd
6612185029Spjd	VOP_UNLOCK(vp, 0);
6613185029Spjd	vn_close(vp, flags, ap->a_cred, td);
6614185029Spjd	ZFS_EXIT(zfsvfs);
6615185029Spjd
6616185029Spjd	return (error);
6617185029Spjd}
6618185029Spjd
6619185029Spjd/*
6620185029Spjd * Vnode operation to retrieve extended attributes on a vnode.
6621185029Spjd */
6622185029Spjdstatic int
6623185029Spjdzfs_listextattr(struct vop_listextattr_args *ap)
6624185029Spjd/*
6625185029Spjdvop_listextattr {
6626185029Spjd	IN struct vnode *a_vp;
6627185029Spjd	IN int a_attrnamespace;
6628185029Spjd	INOUT struct uio *a_uio;
6629185029Spjd	OUT size_t *a_size;
6630185029Spjd	IN struct ucred *a_cred;
6631185029Spjd	IN struct thread *a_td;
6632185029Spjd};
6633185029Spjd*/
6634185029Spjd{
6635185029Spjd	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
6636185029Spjd	struct thread *td = ap->a_td;
6637185029Spjd	struct nameidata nd;
6638185029Spjd	char attrprefix[16];
6639185029Spjd	u_char dirbuf[sizeof(struct dirent)];
6640185029Spjd	struct dirent *dp;
6641185029Spjd	struct iovec aiov;
6642185029Spjd	struct uio auio, *uio = ap->a_uio;
6643185029Spjd	size_t *sizep = ap->a_size;
6644185029Spjd	size_t plen;
6645185029Spjd	vnode_t *xvp = NULL, *vp;
6646185029Spjd	int done, error, eof, pos;
6647185029Spjd
6648195785Strasz	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
6649195785Strasz	    ap->a_cred, ap->a_td, VREAD);
6650196303Spjd	if (error != 0)
6651195785Strasz		return (error);
6652195785Strasz
6653185029Spjd	error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
6654185029Spjd	    sizeof(attrprefix));
6655185029Spjd	if (error != 0)
6656185029Spjd		return (error);
6657185029Spjd	plen = strlen(attrprefix);
6658185029Spjd
6659185029Spjd	ZFS_ENTER(zfsvfs);
6660185029Spjd
6661195822Strasz	if (sizep != NULL)
6662195822Strasz		*sizep = 0;
6663195822Strasz
6664185029Spjd	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
6665185029Spjd	    LOOKUP_XATTR);
6666185029Spjd	if (error != 0) {
6667196303Spjd		ZFS_EXIT(zfsvfs);
6668195785Strasz		/*
6669195785Strasz		 * ENOATTR means that the EA directory does not yet exist,
6670195785Strasz		 * i.e. there are no extended attributes there.
6671195785Strasz		 */
6672195785Strasz		if (error == ENOATTR)
6673195785Strasz			error = 0;
6674185029Spjd		return (error);
6675185029Spjd	}
6676185029Spjd
6677241896Skib	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
6678188588Sjhb	    UIO_SYSSPACE, ".", xvp, td);
6679185029Spjd	error = namei(&nd);
6680185029Spjd	vp = nd.ni_vp;
6681185029Spjd	NDFREE(&nd, NDF_ONLY_PNBUF);
6682185029Spjd	if (error != 0) {
6683185029Spjd		ZFS_EXIT(zfsvfs);
6684185029Spjd		return (error);
6685185029Spjd	}
6686185029Spjd
6687185029Spjd	auio.uio_iov = &aiov;
6688185029Spjd	auio.uio_iovcnt = 1;
6689185029Spjd	auio.uio_segflg = UIO_SYSSPACE;
6690185029Spjd	auio.uio_td = td;
6691185029Spjd	auio.uio_rw = UIO_READ;
6692185029Spjd	auio.uio_offset = 0;
6693185029Spjd
6694185029Spjd	do {
6695185029Spjd		u_char nlen;
6696185029Spjd
6697185029Spjd		aiov.iov_base = (void *)dirbuf;
6698185029Spjd		aiov.iov_len = sizeof(dirbuf);
6699185029Spjd		auio.uio_resid = sizeof(dirbuf);
6700185029Spjd		error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
6701185029Spjd		done = sizeof(dirbuf) - auio.uio_resid;
6702185029Spjd		if (error != 0)
6703185029Spjd			break;
6704185029Spjd		for (pos = 0; pos < done;) {
6705185029Spjd			dp = (struct dirent *)(dirbuf + pos);
6706185029Spjd			pos += dp->d_reclen;
6707185029Spjd			/*
6708185029Spjd			 * XXX: Temporarily we also accept DT_UNKNOWN, as this
6709185029Spjd			 * is what we get when attribute was created on Solaris.
6710185029Spjd			 */
6711185029Spjd			if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
6712185029Spjd				continue;
6713185029Spjd			if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0)
6714185029Spjd				continue;
6715185029Spjd			else if (strncmp(dp->d_name, attrprefix, plen) != 0)
6716185029Spjd				continue;
6717185029Spjd			nlen = dp->d_namlen - plen;
6718185029Spjd			if (sizep != NULL)
6719185029Spjd				*sizep += 1 + nlen;
6720185029Spjd			else if (uio != NULL) {
6721185029Spjd				/*
6722185029Spjd				 * Format of extattr name entry is one byte for
6723185029Spjd				 * length and the rest for name.
6724185029Spjd				 */
6725185029Spjd				error = uiomove(&nlen, 1, uio->uio_rw, uio);
6726185029Spjd				if (error == 0) {
6727185029Spjd					error = uiomove(dp->d_name + plen, nlen,
6728185029Spjd					    uio->uio_rw, uio);
6729185029Spjd				}
6730185029Spjd				if (error != 0)
6731185029Spjd					break;
6732185029Spjd			}
6733185029Spjd		}
6734185029Spjd	} while (!eof && error == 0);
6735185029Spjd
6736185029Spjd	vput(vp);
6737185029Spjd	ZFS_EXIT(zfsvfs);
6738185029Spjd
6739185029Spjd	return (error);
6740185029Spjd}
6741185029Spjd
6742192800Straszint
6743192800Straszzfs_freebsd_getacl(ap)
6744192800Strasz	struct vop_getacl_args /* {
6745192800Strasz		struct vnode *vp;
6746192800Strasz		acl_type_t type;
6747192800Strasz		struct acl *aclp;
6748192800Strasz		struct ucred *cred;
6749192800Strasz		struct thread *td;
6750192800Strasz	} */ *ap;
6751192800Strasz{
6752192800Strasz	int		error;
6753192800Strasz	vsecattr_t      vsecattr;
6754192800Strasz
6755192800Strasz	if (ap->a_type != ACL_TYPE_NFS4)
6756197435Strasz		return (EINVAL);
6757192800Strasz
6758192800Strasz	vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
6759192800Strasz	if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL))
6760192800Strasz		return (error);
6761192800Strasz
6762192800Strasz	error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt);
6763196303Spjd	if (vsecattr.vsa_aclentp != NULL)
6764196303Spjd		kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);
6765192800Strasz
6766196303Spjd	return (error);
6767192800Strasz}
6768192800Strasz
6769192800Straszint
6770192800Straszzfs_freebsd_setacl(ap)
6771192800Strasz	struct vop_setacl_args /* {
6772192800Strasz		struct vnode *vp;
6773192800Strasz		acl_type_t type;
6774192800Strasz		struct acl *aclp;
6775192800Strasz		struct ucred *cred;
6776192800Strasz		struct thread *td;
6777192800Strasz	} */ *ap;
6778192800Strasz{
6779192800Strasz	int		error;
6780192800Strasz	vsecattr_t      vsecattr;
6781192800Strasz	int		aclbsize;	/* size of acl list in bytes */
6782192800Strasz	aclent_t	*aaclp;
6783192800Strasz
6784192800Strasz	if (ap->a_type != ACL_TYPE_NFS4)
6785197435Strasz		return (EINVAL);
6786192800Strasz
6787192800Strasz	if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
6788192800Strasz		return (EINVAL);
6789192800Strasz
6790192800Strasz	/*
6791196949Strasz	 * With NFSv4 ACLs, chmod(2) may need to add additional entries,
6792192800Strasz	 * splitting every entry into two and appending "canonical six"
6793192800Strasz	 * entries at the end.  Don't allow for setting an ACL that would
6794192800Strasz	 * cause chmod(2) to run out of ACL entries.
6795192800Strasz	 */
6796192800Strasz	if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
6797192800Strasz		return (ENOSPC);
6798192800Strasz
6799208030Strasz	error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
6800208030Strasz	if (error != 0)
6801208030Strasz		return (error);
6802208030Strasz
6803192800Strasz	vsecattr.vsa_mask = VSA_ACE;
6804192800Strasz	aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t);
6805192800Strasz	vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
6806192800Strasz	aaclp = vsecattr.vsa_aclentp;
6807192800Strasz	vsecattr.vsa_aclentsz = aclbsize;
6808192800Strasz
6809192800Strasz	aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
6810192800Strasz	error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL);
6811192800Strasz	kmem_free(aaclp, aclbsize);
6812192800Strasz
6813192800Strasz	return (error);
6814192800Strasz}
6815192800Strasz
/*
 * VOP_ACLCHECK entry point.  The argument is not examined; every call
 * returns EOPNOTSUPP.
 */
int
zfs_freebsd_aclcheck(ap)
	struct vop_aclcheck_args /* {
		struct vnode *vp;
		acl_type_t type;
		struct acl *aclp;
		struct ucred *cred;
		struct thread *td;
	} */ *ap;
{
	return (EOPNOTSUPP);
}
6829192800Strasz
6830168404Spjdstruct vop_vector zfs_vnodeops;
6831168404Spjdstruct vop_vector zfs_fifoops;
6832209962Smmstruct vop_vector zfs_shareops;
6833168404Spjd
6834168404Spjdstruct vop_vector zfs_vnodeops = {
6835185029Spjd	.vop_default =		&default_vnodeops,
6836185029Spjd	.vop_inactive =		zfs_freebsd_inactive,
6837185029Spjd	.vop_reclaim =		zfs_freebsd_reclaim,
6838185029Spjd	.vop_access =		zfs_freebsd_access,
6839168404Spjd#ifdef FREEBSD_NAMECACHE
6840185029Spjd	.vop_lookup =		vfs_cache_lookup,
6841185029Spjd	.vop_cachedlookup =	zfs_freebsd_lookup,
6842168404Spjd#else
6843185029Spjd	.vop_lookup =		zfs_freebsd_lookup,
6844168404Spjd#endif
6845185029Spjd	.vop_getattr =		zfs_freebsd_getattr,
6846185029Spjd	.vop_setattr =		zfs_freebsd_setattr,
6847185029Spjd	.vop_create =		zfs_freebsd_create,
6848185029Spjd	.vop_mknod =		zfs_freebsd_create,
6849185029Spjd	.vop_mkdir =		zfs_freebsd_mkdir,
6850185029Spjd	.vop_readdir =		zfs_freebsd_readdir,
6851185029Spjd	.vop_fsync =		zfs_freebsd_fsync,
6852185029Spjd	.vop_open =		zfs_freebsd_open,
6853185029Spjd	.vop_close =		zfs_freebsd_close,
6854185029Spjd	.vop_rmdir =		zfs_freebsd_rmdir,
6855185029Spjd	.vop_ioctl =		zfs_freebsd_ioctl,
6856185029Spjd	.vop_link =		zfs_freebsd_link,
6857185029Spjd	.vop_symlink =		zfs_freebsd_symlink,
6858185029Spjd	.vop_readlink =		zfs_freebsd_readlink,
6859185029Spjd	.vop_read =		zfs_freebsd_read,
6860185029Spjd	.vop_write =		zfs_freebsd_write,
6861185029Spjd	.vop_remove =		zfs_freebsd_remove,
6862185029Spjd	.vop_rename =		zfs_freebsd_rename,
6863185029Spjd	.vop_pathconf =		zfs_freebsd_pathconf,
6864243518Savg	.vop_bmap =		zfs_freebsd_bmap,
6865185029Spjd	.vop_fid =		zfs_freebsd_fid,
6866185029Spjd	.vop_getextattr =	zfs_getextattr,
6867185029Spjd	.vop_deleteextattr =	zfs_deleteextattr,
6868185029Spjd	.vop_setextattr =	zfs_setextattr,
6869185029Spjd	.vop_listextattr =	zfs_listextattr,
6870192800Strasz	.vop_getacl =		zfs_freebsd_getacl,
6871192800Strasz	.vop_setacl =		zfs_freebsd_setacl,
6872192800Strasz	.vop_aclcheck =		zfs_freebsd_aclcheck,
6873213937Savg	.vop_getpages =		zfs_freebsd_getpages,
6874168404Spjd};
6875168404Spjd
6876169170Spjdstruct vop_vector zfs_fifoops = {
6877185029Spjd	.vop_default =		&fifo_specops,
6878200162Skib	.vop_fsync =		zfs_freebsd_fsync,
6879185029Spjd	.vop_access =		zfs_freebsd_access,
6880185029Spjd	.vop_getattr =		zfs_freebsd_getattr,
6881185029Spjd	.vop_inactive =		zfs_freebsd_inactive,
6882185029Spjd	.vop_read =		VOP_PANIC,
6883185029Spjd	.vop_reclaim =		zfs_freebsd_reclaim,
6884185029Spjd	.vop_setattr =		zfs_freebsd_setattr,
6885185029Spjd	.vop_write =		VOP_PANIC,
6886196949Strasz	.vop_pathconf = 	zfs_freebsd_fifo_pathconf,
6887185029Spjd	.vop_fid =		zfs_freebsd_fid,
6888192800Strasz	.vop_getacl =		zfs_freebsd_getacl,
6889192800Strasz	.vop_setacl =		zfs_freebsd_setacl,
6890192800Strasz	.vop_aclcheck =		zfs_freebsd_aclcheck,
6891168404Spjd};
6892209962Smm
6893209962Smm/*
6894209962Smm * special share hidden files vnode operations template
6895209962Smm */
6896209962Smmstruct vop_vector zfs_shareops = {
6897209962Smm	.vop_default =		&default_vnodeops,
6898209962Smm	.vop_access =		zfs_freebsd_access,
6899209962Smm	.vop_inactive =		zfs_freebsd_inactive,
6900209962Smm	.vop_reclaim =		zfs_freebsd_reclaim,
6901209962Smm	.vop_fid =		zfs_freebsd_fid,
6902209962Smm	.vop_pathconf =		zfs_freebsd_pathconf,
6903209962Smm};
6904