zfs_vnops.c revision 240829
1168404Spjd/*
2168404Spjd * CDDL HEADER START
3168404Spjd *
4168404Spjd * The contents of this file are subject to the terms of the
5168404Spjd * Common Development and Distribution License (the "License").
6168404Spjd * You may not use this file except in compliance with the License.
7168404Spjd *
8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9168404Spjd * or http://www.opensolaris.org/os/licensing.
10168404Spjd * See the License for the specific language governing permissions
11168404Spjd * and limitations under the License.
12168404Spjd *
13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each
14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15168404Spjd * If applicable, add the following below this CDDL HEADER, with the
16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying
17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
18168404Spjd *
19168404Spjd * CDDL HEADER END
20168404Spjd */
21168404Spjd/*
22212694Smm * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23240415Smm * Copyright (c) 2012 by Delphix. All rights reserved.
24168404Spjd */
25168404Spjd
26169195Spjd/* Portions Copyright 2007 Jeremy Teo */
27219089Spjd/* Portions Copyright 2010 Robert Milkowski */
28169195Spjd
29168404Spjd#include <sys/types.h>
30168404Spjd#include <sys/param.h>
31168404Spjd#include <sys/time.h>
32168404Spjd#include <sys/systm.h>
33168404Spjd#include <sys/sysmacros.h>
34168404Spjd#include <sys/resource.h>
35168404Spjd#include <sys/vfs.h>
36168404Spjd#include <sys/vnode.h>
37168404Spjd#include <sys/file.h>
38168404Spjd#include <sys/stat.h>
39168404Spjd#include <sys/kmem.h>
40168404Spjd#include <sys/taskq.h>
41168404Spjd#include <sys/uio.h>
42168404Spjd#include <sys/atomic.h>
43168404Spjd#include <sys/namei.h>
44168404Spjd#include <sys/mman.h>
45168404Spjd#include <sys/cmn_err.h>
46168404Spjd#include <sys/errno.h>
47168404Spjd#include <sys/unistd.h>
48168404Spjd#include <sys/zfs_dir.h>
49168404Spjd#include <sys/zfs_ioctl.h>
50168404Spjd#include <sys/fs/zfs.h>
51168404Spjd#include <sys/dmu.h>
52219089Spjd#include <sys/dmu_objset.h>
53168404Spjd#include <sys/spa.h>
54168404Spjd#include <sys/txg.h>
55168404Spjd#include <sys/dbuf.h>
56168404Spjd#include <sys/zap.h>
57219089Spjd#include <sys/sa.h>
58168404Spjd#include <sys/dirent.h>
59168962Spjd#include <sys/policy.h>
60168962Spjd#include <sys/sunddi.h>
61168404Spjd#include <sys/filio.h>
62209962Smm#include <sys/sid.h>
63168404Spjd#include <sys/zfs_ctldir.h>
64185029Spjd#include <sys/zfs_fuid.h>
65219089Spjd#include <sys/zfs_sa.h>
66168404Spjd#include <sys/dnlc.h>
67168404Spjd#include <sys/zfs_rlock.h>
68185029Spjd#include <sys/extdirent.h>
69185029Spjd#include <sys/kidmap.h>
70168404Spjd#include <sys/bio.h>
71168404Spjd#include <sys/buf.h>
72168404Spjd#include <sys/sf_buf.h>
73168404Spjd#include <sys/sched.h>
74192800Strasz#include <sys/acl.h>
75239077Smarius#include <vm/vm_param.h>
76215401Savg#include <vm/vm_pageout.h>
77168404Spjd
78168404Spjd/*
79168404Spjd * Programming rules.
80168404Spjd *
81168404Spjd * Each vnode op performs some logical unit of work.  To do this, the ZPL must
82168404Spjd * properly lock its in-core state, create a DMU transaction, do the work,
83168404Spjd * record this work in the intent log (ZIL), commit the DMU transaction,
84185029Spjd * and wait for the intent log to commit if it is a synchronous operation.
85185029Spjd * Moreover, the vnode ops must work in both normal and log replay context.
86168404Spjd * The ordering of events is important to avoid deadlocks and references
87168404Spjd * to freed memory.  The example below illustrates the following Big Rules:
88168404Spjd *
89168404Spjd *  (1) A check must be made in each zfs thread for a mounted file system.
90168404Spjd *	This is done avoiding races using ZFS_ENTER(zfsvfs).
91185029Spjd *      A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
92185029Spjd *      must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
93185029Spjd *      can return EIO from the calling function.
94168404Spjd *
95168404Spjd *  (2)	VN_RELE() should always be the last thing except for zil_commit()
96168404Spjd *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
97168404Spjd *	First, if it's the last reference, the vnode/znode
98168404Spjd *	can be freed, so the zp may point to freed memory.  Second, the last
99168404Spjd *	reference will call zfs_zinactive(), which may induce a lot of work --
100168404Spjd *	pushing cached pages (which acquires range locks) and syncing out
101168404Spjd *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
102168404Spjd *	which could deadlock the system if you were already holding one.
103191900Skmacy *	If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
104168404Spjd *
105168404Spjd *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
106168404Spjd *	as they can span dmu_tx_assign() calls.
107168404Spjd *
108209962Smm *  (4)	Always pass TXG_NOWAIT as the second argument to dmu_tx_assign().
109168404Spjd *	This is critical because we don't want to block while holding locks.
110168404Spjd *	Note, in particular, that if a lock is sometimes acquired before
111168404Spjd *	the tx assigns, and sometimes after (e.g. z_lock), then failing to
112168404Spjd *	use a non-blocking assign can deadlock the system.  The scenario:
113168404Spjd *
114168404Spjd *	Thread A has grabbed a lock before calling dmu_tx_assign().
115168404Spjd *	Thread B is in an already-assigned tx, and blocks for this lock.
116168404Spjd *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
117168404Spjd *	forever, because the previous txg can't quiesce until B's tx commits.
118168404Spjd *
119168404Spjd *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
120168404Spjd *	then drop all locks, call dmu_tx_wait(), and try again.
121168404Spjd *
122168404Spjd *  (5)	If the operation succeeded, generate the intent log entry for it
123168404Spjd *	before dropping locks.  This ensures that the ordering of events
124168404Spjd *	in the intent log matches the order in which they actually occurred.
125209962Smm *      During ZIL replay the zfs_log_* functions will update the sequence
126209962Smm *	number to indicate the zil transaction has replayed.
127168404Spjd *
128168404Spjd *  (6)	At the end of each vnode op, the DMU tx must always commit,
129168404Spjd *	regardless of whether there were any errors.
130168404Spjd *
131219089Spjd *  (7)	After dropping all locks, invoke zil_commit(zilog, foid)
132168404Spjd *	to ensure that synchronous semantics are provided when necessary.
133168404Spjd *
134168404Spjd * In general, this is how things should be ordered in each vnode op:
135168404Spjd *
136168404Spjd *	ZFS_ENTER(zfsvfs);		// exit if unmounted
137168404Spjd * top:
138168404Spjd *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may VN_HOLD())
139168404Spjd *	rw_enter(...);			// grab any other locks you need
140168404Spjd *	tx = dmu_tx_create(...);	// get DMU tx
141168404Spjd *	dmu_tx_hold_*();		// hold each object you might modify
142209962Smm *	error = dmu_tx_assign(tx, TXG_NOWAIT);	// try to assign
143168404Spjd *	if (error) {
144168404Spjd *		rw_exit(...);		// drop locks
145168404Spjd *		zfs_dirent_unlock(dl);	// unlock directory entry
146168404Spjd *		VN_RELE(...);		// release held vnodes
147209962Smm *		if (error == ERESTART) {
148168404Spjd *			dmu_tx_wait(tx);
149168404Spjd *			dmu_tx_abort(tx);
150168404Spjd *			goto top;
151168404Spjd *		}
152168404Spjd *		dmu_tx_abort(tx);	// abort DMU tx
153168404Spjd *		ZFS_EXIT(zfsvfs);	// finished in zfs
154168404Spjd *		return (error);		// really out of space
155168404Spjd *	}
156168404Spjd *	error = do_real_work();		// do whatever this VOP does
157168404Spjd *	if (error == 0)
158168404Spjd *		zfs_log_*(...);		// on success, make ZIL entry
159168404Spjd *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
160168404Spjd *	rw_exit(...);			// drop locks
161168404Spjd *	zfs_dirent_unlock(dl);		// unlock directory entry
162168404Spjd *	VN_RELE(...);			// release held vnodes
163219089Spjd *	zil_commit(zilog, foid);	// synchronous when necessary
164168404Spjd *	ZFS_EXIT(zfsvfs);		// finished in zfs
165168404Spjd *	return (error);			// done, report error
166168404Spjd */
167185029Spjd
168168404Spjd/* ARGSUSED */
169168404Spjdstatic int
170185029Spjdzfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
171168404Spjd{
172168962Spjd	znode_t	*zp = VTOZ(*vpp);
173209962Smm	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
174168404Spjd
175209962Smm	ZFS_ENTER(zfsvfs);
176209962Smm	ZFS_VERIFY_ZP(zp);
177209962Smm
178219089Spjd	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
179185029Spjd	    ((flag & FAPPEND) == 0)) {
180209962Smm		ZFS_EXIT(zfsvfs);
181185029Spjd		return (EPERM);
182185029Spjd	}
183185029Spjd
184185029Spjd	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
185185029Spjd	    ZTOV(zp)->v_type == VREG &&
186219089Spjd	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
187209962Smm		if (fs_vscan(*vpp, cr, 0) != 0) {
188209962Smm			ZFS_EXIT(zfsvfs);
189185029Spjd			return (EACCES);
190209962Smm		}
191209962Smm	}
192185029Spjd
193168404Spjd	/* Keep a count of the synchronous opens in the znode */
194168962Spjd	if (flag & (FSYNC | FDSYNC))
195168404Spjd		atomic_inc_32(&zp->z_sync_cnt);
196185029Spjd
197209962Smm	ZFS_EXIT(zfsvfs);
198168404Spjd	return (0);
199168404Spjd}
200168404Spjd
201168404Spjd/* ARGSUSED */
202168404Spjdstatic int
203185029Spjdzfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
204185029Spjd    caller_context_t *ct)
205168404Spjd{
206168962Spjd	znode_t	*zp = VTOZ(vp);
207209962Smm	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
208168404Spjd
209210470Smm	/*
210210470Smm	 * Clean up any locks held by this process on the vp.
211210470Smm	 */
212210470Smm	cleanlocks(vp, ddi_get_pid(), 0);
213210470Smm	cleanshares(vp, ddi_get_pid());
214210470Smm
215209962Smm	ZFS_ENTER(zfsvfs);
216209962Smm	ZFS_VERIFY_ZP(zp);
217209962Smm
218168404Spjd	/* Decrement the synchronous opens in the znode */
219185029Spjd	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
220168404Spjd		atomic_dec_32(&zp->z_sync_cnt);
221168404Spjd
222185029Spjd	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
223185029Spjd	    ZTOV(zp)->v_type == VREG &&
224219089Spjd	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
225185029Spjd		VERIFY(fs_vscan(vp, cr, 1) == 0);
226185029Spjd
227209962Smm	ZFS_EXIT(zfsvfs);
228168404Spjd	return (0);
229168404Spjd}
230168404Spjd
231168404Spjd/*
232168404Spjd * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
233168404Spjd * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
234168404Spjd */
235168404Spjdstatic int
236168978Spjdzfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
237168404Spjd{
238168404Spjd	znode_t	*zp = VTOZ(vp);
239168404Spjd	uint64_t noff = (uint64_t)*off; /* new offset */
240168404Spjd	uint64_t file_sz;
241168404Spjd	int error;
242168404Spjd	boolean_t hole;
243168404Spjd
244219089Spjd	file_sz = zp->z_size;
245168404Spjd	if (noff >= file_sz)  {
246168404Spjd		return (ENXIO);
247168404Spjd	}
248168404Spjd
249168962Spjd	if (cmd == _FIO_SEEK_HOLE)
250168404Spjd		hole = B_TRUE;
251168404Spjd	else
252168404Spjd		hole = B_FALSE;
253168404Spjd
254168404Spjd	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
255168404Spjd
256168404Spjd	/* end of file? */
257168404Spjd	if ((error == ESRCH) || (noff > file_sz)) {
258168404Spjd		/*
259168404Spjd		 * Handle the virtual hole at the end of file.
260168404Spjd		 */
261168404Spjd		if (hole) {
262168404Spjd			*off = file_sz;
263168404Spjd			return (0);
264168404Spjd		}
265168404Spjd		return (ENXIO);
266168404Spjd	}
267168404Spjd
268168404Spjd	if (noff < *off)
269168404Spjd		return (error);
270168404Spjd	*off = noff;
271168404Spjd	return (error);
272168404Spjd}
273168404Spjd
274168404Spjd/* ARGSUSED */
275168404Spjdstatic int
276168978Spjdzfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
277185029Spjd    int *rvalp, caller_context_t *ct)
278168404Spjd{
279168962Spjd	offset_t off;
280168962Spjd	int error;
281168962Spjd	zfsvfs_t *zfsvfs;
282185029Spjd	znode_t *zp;
283168404Spjd
284168404Spjd	switch (com) {
285185029Spjd	case _FIOFFS:
286168962Spjd		return (0);
287168404Spjd
288168962Spjd		/*
289168962Spjd		 * The following two ioctls are used by bfu.  Faking out,
290168962Spjd		 * necessary to avoid bfu errors.
291168962Spjd		 */
292185029Spjd	case _FIOGDIO:
293185029Spjd	case _FIOSDIO:
294168962Spjd		return (0);
295168962Spjd
296185029Spjd	case _FIO_SEEK_DATA:
297185029Spjd	case _FIO_SEEK_HOLE:
298233918Savg#ifdef sun
299168962Spjd		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
300168962Spjd			return (EFAULT);
301233918Savg#else
302233918Savg		off = *(offset_t *)data;
303233918Savg#endif
304185029Spjd		zp = VTOZ(vp);
305185029Spjd		zfsvfs = zp->z_zfsvfs;
306168404Spjd		ZFS_ENTER(zfsvfs);
307185029Spjd		ZFS_VERIFY_ZP(zp);
308168404Spjd
309168404Spjd		/* offset parameter is in/out */
310168404Spjd		error = zfs_holey(vp, com, &off);
311168404Spjd		ZFS_EXIT(zfsvfs);
312168404Spjd		if (error)
313168404Spjd			return (error);
314233918Savg#ifdef sun
315168962Spjd		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
316168962Spjd			return (EFAULT);
317233918Savg#else
318233918Savg		*(offset_t *)data = off;
319233918Savg#endif
320168404Spjd		return (0);
321168404Spjd	}
322168404Spjd	return (ENOTTY);
323168404Spjd}
324168404Spjd
325209962Smmstatic vm_page_t
326209962Smmpage_lookup(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
327209962Smm{
328209962Smm	vm_object_t obj;
329209962Smm	vm_page_t pp;
330209962Smm
331209962Smm	obj = vp->v_object;
332209962Smm	VM_OBJECT_LOCK_ASSERT(obj, MA_OWNED);
333209962Smm
334209962Smm	for (;;) {
335209962Smm		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
336209962Smm		    vm_page_is_valid(pp, (vm_offset_t)off, nbytes)) {
337212652Savg			if ((pp->oflags & VPO_BUSY) != 0) {
338212652Savg				/*
339212652Savg				 * Reference the page before unlocking and
340212652Savg				 * sleeping so that the page daemon is less
341212652Savg				 * likely to reclaim it.
342212652Savg				 */
343225418Skib				vm_page_reference(pp);
344212652Savg				vm_page_sleep(pp, "zfsmwb");
345209962Smm				continue;
346212652Savg			}
347209962Smm			vm_page_busy(pp);
348209962Smm			vm_page_undirty(pp);
349209962Smm		} else {
350234064Sattilio			if (vm_page_is_cached(obj, OFF_TO_IDX(start)))
351209962Smm				vm_page_cache_free(obj, OFF_TO_IDX(start),
352209962Smm				    OFF_TO_IDX(start) + 1);
353209962Smm			pp = NULL;
354209962Smm		}
355209962Smm		break;
356209962Smm	}
357209962Smm	return (pp);
358209962Smm}
359209962Smm
360209962Smmstatic void
361209962Smmpage_unlock(vm_page_t pp)
362209962Smm{
363209962Smm
364209962Smm	vm_page_wakeup(pp);
365209962Smm}
366209962Smm
367209962Smmstatic caddr_t
368209962Smmzfs_map_page(vm_page_t pp, struct sf_buf **sfp)
369209962Smm{
370209962Smm
371212951Savg	*sfp = sf_buf_alloc(pp, 0);
372209962Smm	return ((caddr_t)sf_buf_kva(*sfp));
373209962Smm}
374209962Smm
/*
 * Release an sf_buf obtained from zfs_map_page().
 */
static void
zfs_unmap_page(struct sf_buf *sf)
{
	sf_buf_free(sf);
}
381209962Smm
382168404Spjd/*
383168404Spjd * When a file is memory mapped, we must keep the IO data synchronized
384168404Spjd * between the DMU cache and the memory mapped pages.  What this means:
385168404Spjd *
386168404Spjd * On Write:	If we find a memory mapped page, we write to *both*
387168404Spjd *		the page and the dmu buffer.
388168404Spjd */
389209962Smmstatic void
390209962Smmupdate_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
391209962Smm    int segflg, dmu_tx_t *tx)
392168404Spjd{
393168404Spjd	vm_object_t obj;
394168404Spjd	struct sf_buf *sf;
395212655Savg	int off;
396168404Spjd
397168404Spjd	ASSERT(vp->v_mount != NULL);
398168404Spjd	obj = vp->v_object;
399168404Spjd	ASSERT(obj != NULL);
400168404Spjd
401168404Spjd	off = start & PAGEOFFSET;
402168404Spjd	VM_OBJECT_LOCK(obj);
403168404Spjd	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
404209962Smm		vm_page_t pp;
405212655Savg		int nbytes = MIN(PAGESIZE - off, len);
406168404Spjd
407209962Smm		if ((pp = page_lookup(vp, start, off, nbytes)) != NULL) {
408168404Spjd			caddr_t va;
409168404Spjd
410168404Spjd			VM_OBJECT_UNLOCK(obj);
411209962Smm			va = zfs_map_page(pp, &sf);
412209962Smm			if (segflg == UIO_NOCOPY) {
413209962Smm				(void) dmu_write(os, oid, start+off, nbytes,
414209962Smm				    va+off, tx);
415209962Smm			} else {
416209962Smm				(void) dmu_read(os, oid, start+off, nbytes,
417216378Spjd				    va+off, DMU_READ_PREFETCH);
418169059Spjd			}
419209962Smm			zfs_unmap_page(sf);
420168404Spjd			VM_OBJECT_LOCK(obj);
421209962Smm			page_unlock(pp);
422168404Spjd		}
423209962Smm		len -= nbytes;
424168404Spjd		off = 0;
425168404Spjd	}
426168404Spjd	VM_OBJECT_UNLOCK(obj);
427168404Spjd}
428168404Spjd
429168404Spjd/*
430219089Spjd * Read with UIO_NOCOPY flag means that sendfile(2) requests
431219089Spjd * ZFS to populate a range of page cache pages with data.
432219089Spjd *
433219089Spjd * NOTE: this function could be optimized to pre-allocate
434219089Spjd * all pages in advance, drain VPO_BUSY on all of them,
435219089Spjd * map them into contiguous KVA region and populate them
436219089Spjd * in one single dmu_read() call.
437219089Spjd */
438219089Spjdstatic int
439219089Spjdmappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
440219089Spjd{
441219089Spjd	znode_t *zp = VTOZ(vp);
442219089Spjd	objset_t *os = zp->z_zfsvfs->z_os;
443219089Spjd	struct sf_buf *sf;
444219089Spjd	vm_object_t obj;
445219089Spjd	vm_page_t pp;
446219089Spjd	int64_t start;
447219089Spjd	caddr_t va;
448219089Spjd	int len = nbytes;
449219089Spjd	int off;
450219089Spjd	int error = 0;
451219089Spjd
452219089Spjd	ASSERT(uio->uio_segflg == UIO_NOCOPY);
453219089Spjd	ASSERT(vp->v_mount != NULL);
454219089Spjd	obj = vp->v_object;
455219089Spjd	ASSERT(obj != NULL);
456219089Spjd	ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);
457219089Spjd
458219089Spjd	VM_OBJECT_LOCK(obj);
459219089Spjd	for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
460219089Spjd		int bytes = MIN(PAGESIZE, len);
461219089Spjd
462219089Spjd		pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_NOBUSY |
463219089Spjd		    VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_IGN_SBUSY);
464219089Spjd		if (pp->valid == 0) {
465219089Spjd			vm_page_io_start(pp);
466219089Spjd			VM_OBJECT_UNLOCK(obj);
467219089Spjd			va = zfs_map_page(pp, &sf);
468219089Spjd			error = dmu_read(os, zp->z_id, start, bytes, va,
469219089Spjd			    DMU_READ_PREFETCH);
470219089Spjd			if (bytes != PAGESIZE && error == 0)
471219089Spjd				bzero(va + bytes, PAGESIZE - bytes);
472219089Spjd			zfs_unmap_page(sf);
473219089Spjd			VM_OBJECT_LOCK(obj);
474219089Spjd			vm_page_io_finish(pp);
475219089Spjd			vm_page_lock(pp);
476219089Spjd			if (error) {
477219089Spjd				vm_page_free(pp);
478219089Spjd			} else {
479219089Spjd				pp->valid = VM_PAGE_BITS_ALL;
480219089Spjd				vm_page_activate(pp);
481219089Spjd			}
482219089Spjd			vm_page_unlock(pp);
483219089Spjd		}
484219089Spjd		if (error)
485219089Spjd			break;
486219089Spjd		uio->uio_resid -= bytes;
487219089Spjd		uio->uio_offset += bytes;
488219089Spjd		len -= bytes;
489219089Spjd	}
490219089Spjd	VM_OBJECT_UNLOCK(obj);
491219089Spjd	return (error);
492219089Spjd}
493219089Spjd
494219089Spjd/*
495168404Spjd * When a file is memory mapped, we must keep the IO data synchronized
496168404Spjd * between the DMU cache and the memory mapped pages.  What this means:
497168404Spjd *
498168404Spjd * On Read:	We "read" preferentially from memory mapped pages,
499168404Spjd *		else we default from the dmu buffer.
500168404Spjd *
501168404Spjd * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
502168404Spjd *	the file is memory mapped.
503168404Spjd */
504168404Spjdstatic int
505168404Spjdmappedread(vnode_t *vp, int nbytes, uio_t *uio)
506168404Spjd{
507168404Spjd	znode_t *zp = VTOZ(vp);
508168404Spjd	objset_t *os = zp->z_zfsvfs->z_os;
509168404Spjd	vm_object_t obj;
510212655Savg	int64_t start;
511168926Spjd	caddr_t va;
512168404Spjd	int len = nbytes;
513212655Savg	int off;
514168404Spjd	int error = 0;
515168404Spjd
516168404Spjd	ASSERT(vp->v_mount != NULL);
517168404Spjd	obj = vp->v_object;
518168404Spjd	ASSERT(obj != NULL);
519168404Spjd
520168404Spjd	start = uio->uio_loffset;
521168404Spjd	off = start & PAGEOFFSET;
522168404Spjd	VM_OBJECT_LOCK(obj);
523168404Spjd	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
524219089Spjd		vm_page_t pp;
525219089Spjd		uint64_t bytes = MIN(PAGESIZE - off, len);
526168404Spjd
527219089Spjd		if (pp = page_lookup(vp, start, off, bytes)) {
528219089Spjd			struct sf_buf *sf;
529219089Spjd			caddr_t va;
530212652Savg
531168404Spjd			VM_OBJECT_UNLOCK(obj);
532219089Spjd			va = zfs_map_page(pp, &sf);
533219089Spjd			error = uiomove(va + off, bytes, UIO_READ, uio);
534219089Spjd			zfs_unmap_page(sf);
535168404Spjd			VM_OBJECT_LOCK(obj);
536219089Spjd			page_unlock(pp);
537219089Spjd		} else {
538168926Spjd			VM_OBJECT_UNLOCK(obj);
539219089Spjd			error = dmu_read_uio(os, zp->z_id, uio, bytes);
540168926Spjd			VM_OBJECT_LOCK(obj);
541168404Spjd		}
542168404Spjd		len -= bytes;
543168404Spjd		off = 0;
544168404Spjd		if (error)
545168404Spjd			break;
546168404Spjd	}
547168404Spjd	VM_OBJECT_UNLOCK(obj);
548168404Spjd	return (error);
549168404Spjd}
550168404Spjd
551168404Spjdoffset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
552168404Spjd
553168404Spjd/*
554168404Spjd * Read bytes from specified file into supplied buffer.
555168404Spjd *
556168404Spjd *	IN:	vp	- vnode of file to be read from.
557168404Spjd *		uio	- structure supplying read location, range info,
558168404Spjd *			  and return buffer.
559168404Spjd *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
560168404Spjd *		cr	- credentials of caller.
561185029Spjd *		ct	- caller context
562168404Spjd *
563168404Spjd *	OUT:	uio	- updated offset and range, buffer filled.
564168404Spjd *
565168404Spjd *	RETURN:	0 if success
566168404Spjd *		error code if failure
567168404Spjd *
568168404Spjd * Side Effects:
569168404Spjd *	vp - atime updated if byte count > 0
570168404Spjd */
571168404Spjd/* ARGSUSED */
572168404Spjdstatic int
573168962Spjdzfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
574168404Spjd{
575168404Spjd	znode_t		*zp = VTOZ(vp);
576168404Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
577185029Spjd	objset_t	*os;
578168404Spjd	ssize_t		n, nbytes;
579168404Spjd	int		error;
580168404Spjd	rl_t		*rl;
581219089Spjd	xuio_t		*xuio = NULL;
582168404Spjd
583168404Spjd	ZFS_ENTER(zfsvfs);
584185029Spjd	ZFS_VERIFY_ZP(zp);
585185029Spjd	os = zfsvfs->z_os;
586168404Spjd
587219089Spjd	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
588185029Spjd		ZFS_EXIT(zfsvfs);
589185029Spjd		return (EACCES);
590185029Spjd	}
591185029Spjd
592168404Spjd	/*
593168404Spjd	 * Validate file offset
594168404Spjd	 */
595168404Spjd	if (uio->uio_loffset < (offset_t)0) {
596168404Spjd		ZFS_EXIT(zfsvfs);
597168404Spjd		return (EINVAL);
598168404Spjd	}
599168404Spjd
600168404Spjd	/*
601168404Spjd	 * Fasttrack empty reads
602168404Spjd	 */
603168404Spjd	if (uio->uio_resid == 0) {
604168404Spjd		ZFS_EXIT(zfsvfs);
605168404Spjd		return (0);
606168404Spjd	}
607168404Spjd
608168404Spjd	/*
609168962Spjd	 * Check for mandatory locks
610168962Spjd	 */
611219089Spjd	if (MANDMODE(zp->z_mode)) {
612168962Spjd		if (error = chklock(vp, FREAD,
613168962Spjd		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
614168962Spjd			ZFS_EXIT(zfsvfs);
615168962Spjd			return (error);
616168962Spjd		}
617168962Spjd	}
618168962Spjd
619168962Spjd	/*
620168404Spjd	 * If we're in FRSYNC mode, sync out this znode before reading it.
621168404Spjd	 */
622224605Smm	if (zfsvfs->z_log &&
623224605Smm	    (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
624219089Spjd		zil_commit(zfsvfs->z_log, zp->z_id);
625168404Spjd
626168404Spjd	/*
627168404Spjd	 * Lock the range against changes.
628168404Spjd	 */
629168404Spjd	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
630168404Spjd
631168404Spjd	/*
632168404Spjd	 * If we are reading past end-of-file we can skip
633168404Spjd	 * to the end; but we might still need to set atime.
634168404Spjd	 */
635219089Spjd	if (uio->uio_loffset >= zp->z_size) {
636168404Spjd		error = 0;
637168404Spjd		goto out;
638168404Spjd	}
639168404Spjd
640219089Spjd	ASSERT(uio->uio_loffset < zp->z_size);
641219089Spjd	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
642168404Spjd
643219089Spjd#ifdef sun
644219089Spjd	if ((uio->uio_extflg == UIO_XUIO) &&
645219089Spjd	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
646219089Spjd		int nblk;
647219089Spjd		int blksz = zp->z_blksz;
648219089Spjd		uint64_t offset = uio->uio_loffset;
649219089Spjd
650219089Spjd		xuio = (xuio_t *)uio;
651219089Spjd		if ((ISP2(blksz))) {
652219089Spjd			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
653219089Spjd			    blksz)) / blksz;
654219089Spjd		} else {
655219089Spjd			ASSERT(offset + n <= blksz);
656219089Spjd			nblk = 1;
657219089Spjd		}
658219089Spjd		(void) dmu_xuio_init(xuio, nblk);
659219089Spjd
660219089Spjd		if (vn_has_cached_data(vp)) {
661219089Spjd			/*
662219089Spjd			 * For simplicity, we always allocate a full buffer
663219089Spjd			 * even if we only expect to read a portion of a block.
664219089Spjd			 */
665219089Spjd			while (--nblk >= 0) {
666219089Spjd				(void) dmu_xuio_add(xuio,
667219089Spjd				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
668219089Spjd				    blksz), 0, blksz);
669219089Spjd			}
670219089Spjd		}
671219089Spjd	}
672219089Spjd#endif	/* sun */
673219089Spjd
674168404Spjd	while (n > 0) {
675168404Spjd		nbytes = MIN(n, zfs_read_chunk_size -
676168404Spjd		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
677168404Spjd
678219089Spjd#ifdef __FreeBSD__
679219089Spjd		if (uio->uio_segflg == UIO_NOCOPY)
680219089Spjd			error = mappedread_sf(vp, nbytes, uio);
681219089Spjd		else
682219089Spjd#endif /* __FreeBSD__ */
683168404Spjd		if (vn_has_cached_data(vp))
684168404Spjd			error = mappedread(vp, nbytes, uio);
685168404Spjd		else
686168404Spjd			error = dmu_read_uio(os, zp->z_id, uio, nbytes);
687185029Spjd		if (error) {
688185029Spjd			/* convert checksum errors into IO errors */
689185029Spjd			if (error == ECKSUM)
690185029Spjd				error = EIO;
691168404Spjd			break;
692185029Spjd		}
693168962Spjd
694168404Spjd		n -= nbytes;
695168404Spjd	}
696168404Spjdout:
697168404Spjd	zfs_range_unlock(rl);
698168404Spjd
699168404Spjd	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
700168404Spjd	ZFS_EXIT(zfsvfs);
701168404Spjd	return (error);
702168404Spjd}
703168404Spjd
704168404Spjd/*
705168404Spjd * Write the bytes to a file.
706168404Spjd *
707168404Spjd *	IN:	vp	- vnode of file to be written to.
708168404Spjd *		uio	- structure supplying write location, range info,
709168404Spjd *			  and data buffer.
710213673Spjd *		ioflag	- FAPPEND flag set if in append mode.
711168404Spjd *		cr	- credentials of caller.
712185029Spjd *		ct	- caller context (NFS/CIFS fem monitor only)
713168404Spjd *
714168404Spjd *	OUT:	uio	- updated offset and range.
715168404Spjd *
716168404Spjd *	RETURN:	0 if success
717168404Spjd *		error code if failure
718168404Spjd *
719168404Spjd * Timestamps:
720168404Spjd *	vp - ctime|mtime updated if byte count > 0
721168404Spjd */
722219089Spjd
723168404Spjd/* ARGSUSED */
724168404Spjdstatic int
725168962Spjdzfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
726168404Spjd{
727168404Spjd	znode_t		*zp = VTOZ(vp);
728168962Spjd	rlim64_t	limit = MAXOFFSET_T;
729168404Spjd	ssize_t		start_resid = uio->uio_resid;
730168404Spjd	ssize_t		tx_bytes;
731168404Spjd	uint64_t	end_size;
732168404Spjd	dmu_tx_t	*tx;
733168404Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
734185029Spjd	zilog_t		*zilog;
735168404Spjd	offset_t	woff;
736168404Spjd	ssize_t		n, nbytes;
737168404Spjd	rl_t		*rl;
738168404Spjd	int		max_blksz = zfsvfs->z_max_blksz;
739168404Spjd	int		error;
740209962Smm	arc_buf_t	*abuf;
741219089Spjd	iovec_t		*aiov;
742219089Spjd	xuio_t		*xuio = NULL;
743219089Spjd	int		i_iov = 0;
744219089Spjd	int		iovcnt = uio->uio_iovcnt;
745219089Spjd	iovec_t		*iovp = uio->uio_iov;
746219089Spjd	int		write_eof;
747219089Spjd	int		count = 0;
748219089Spjd	sa_bulk_attr_t	bulk[4];
749219089Spjd	uint64_t	mtime[2], ctime[2];
750168404Spjd
751168404Spjd	/*
752168404Spjd	 * Fasttrack empty write
753168404Spjd	 */
754168404Spjd	n = start_resid;
755168404Spjd	if (n == 0)
756168404Spjd		return (0);
757168404Spjd
758168962Spjd	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
759168962Spjd		limit = MAXOFFSET_T;
760168962Spjd
761168404Spjd	ZFS_ENTER(zfsvfs);
762185029Spjd	ZFS_VERIFY_ZP(zp);
763168404Spjd
764219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
765219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
766219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
767219089Spjd	    &zp->z_size, 8);
768219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
769219089Spjd	    &zp->z_pflags, 8);
770219089Spjd
771168404Spjd	/*
772185029Spjd	 * If immutable or not appending then return EPERM
773185029Spjd	 */
774219089Spjd	if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
775219089Spjd	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
776219089Spjd	    (uio->uio_loffset < zp->z_size))) {
777185029Spjd		ZFS_EXIT(zfsvfs);
778185029Spjd		return (EPERM);
779185029Spjd	}
780185029Spjd
781185029Spjd	zilog = zfsvfs->z_log;
782185029Spjd
783185029Spjd	/*
784219089Spjd	 * Validate file offset
785219089Spjd	 */
786219089Spjd	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
787219089Spjd	if (woff < 0) {
788219089Spjd		ZFS_EXIT(zfsvfs);
789219089Spjd		return (EINVAL);
790219089Spjd	}
791219089Spjd
792219089Spjd	/*
793219089Spjd	 * Check for mandatory locks before calling zfs_range_lock()
794219089Spjd	 * in order to prevent a deadlock with locks set via fcntl().
795219089Spjd	 */
796219089Spjd	if (MANDMODE((mode_t)zp->z_mode) &&
797219089Spjd	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
798219089Spjd		ZFS_EXIT(zfsvfs);
799219089Spjd		return (error);
800219089Spjd	}
801219089Spjd
802219089Spjd#ifdef sun
803219089Spjd	/*
804168404Spjd	 * Pre-fault the pages to ensure slow (eg NFS) pages
805168404Spjd	 * don't hold up txg.
806219089Spjd	 * Skip this if uio contains loaned arc_buf.
807168404Spjd	 */
808219089Spjd	if ((uio->uio_extflg == UIO_XUIO) &&
809219089Spjd	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
810219089Spjd		xuio = (xuio_t *)uio;
811219089Spjd	else
812219089Spjd		uio_prefaultpages(MIN(n, max_blksz), uio);
813219089Spjd#endif	/* sun */
814168404Spjd
815168404Spjd	/*
816168404Spjd	 * If in append mode, set the io offset pointer to eof.
817168404Spjd	 */
818213673Spjd	if (ioflag & FAPPEND) {
819168404Spjd		/*
820219089Spjd		 * Obtain an appending range lock to guarantee file append
821219089Spjd		 * semantics.  We reset the write offset once we have the lock.
822168404Spjd		 */
823168404Spjd		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
824219089Spjd		woff = rl->r_off;
825168404Spjd		if (rl->r_len == UINT64_MAX) {
826219089Spjd			/*
827219089Spjd			 * We overlocked the file because this write will cause
828219089Spjd			 * the file block size to increase.
829219089Spjd			 * Note that zp_size cannot change with this lock held.
830219089Spjd			 */
831219089Spjd			woff = zp->z_size;
832168404Spjd		}
833219089Spjd		uio->uio_loffset = woff;
834168404Spjd	} else {
835168404Spjd		/*
836219089Spjd		 * Note that if the file block size will change as a result of
837219089Spjd		 * this write, then this range lock will lock the entire file
838219089Spjd		 * so that we can re-write the block safely.
839168404Spjd		 */
840168404Spjd		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
841168404Spjd	}
842168404Spjd
843235781Strasz	if (vn_rlimit_fsize(vp, uio, uio->uio_td)) {
844235781Strasz		zfs_range_unlock(rl);
845235781Strasz		ZFS_EXIT(zfsvfs);
846235781Strasz		return (EFBIG);
847235781Strasz	}
848235781Strasz
849168962Spjd	if (woff >= limit) {
850168962Spjd		zfs_range_unlock(rl);
851168962Spjd		ZFS_EXIT(zfsvfs);
852168962Spjd		return (EFBIG);
853168962Spjd	}
854168962Spjd
855168962Spjd	if ((woff + n) > limit || woff > (limit - n))
856168962Spjd		n = limit - woff;
857168962Spjd
858219089Spjd	/* Will this write extend the file length? */
859219089Spjd	write_eof = (woff + n > zp->z_size);
860168404Spjd
861219089Spjd	end_size = MAX(zp->z_size, woff + n);
862219089Spjd
863168404Spjd	/*
864168404Spjd	 * Write the file in reasonable size chunks.  Each chunk is written
865168404Spjd	 * in a separate transaction; this keeps the intent log records small
866168404Spjd	 * and allows us to do more fine-grained space accounting.
867168404Spjd	 */
868168404Spjd	while (n > 0) {
869209962Smm		abuf = NULL;
870209962Smm		woff = uio->uio_loffset;
871209962Smmagain:
872219089Spjd		if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
873219089Spjd		    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
874209962Smm			if (abuf != NULL)
875209962Smm				dmu_return_arcbuf(abuf);
876209962Smm			error = EDQUOT;
877209962Smm			break;
878209962Smm		}
879209962Smm
880219089Spjd		if (xuio && abuf == NULL) {
881219089Spjd			ASSERT(i_iov < iovcnt);
882219089Spjd			aiov = &iovp[i_iov];
883219089Spjd			abuf = dmu_xuio_arcbuf(xuio, i_iov);
884219089Spjd			dmu_xuio_clear(xuio, i_iov);
885219089Spjd			DTRACE_PROBE3(zfs_cp_write, int, i_iov,
886219089Spjd			    iovec_t *, aiov, arc_buf_t *, abuf);
887219089Spjd			ASSERT((aiov->iov_base == abuf->b_data) ||
888219089Spjd			    ((char *)aiov->iov_base - (char *)abuf->b_data +
889219089Spjd			    aiov->iov_len == arc_buf_size(abuf)));
890219089Spjd			i_iov++;
891219089Spjd		} else if (abuf == NULL && n >= max_blksz &&
892219089Spjd		    woff >= zp->z_size &&
893209962Smm		    P2PHASE(woff, max_blksz) == 0 &&
894209962Smm		    zp->z_blksz == max_blksz) {
895219089Spjd			/*
896219089Spjd			 * This write covers a full block.  "Borrow" a buffer
897219089Spjd			 * from the dmu so that we can fill it before we enter
898219089Spjd			 * a transaction.  This avoids the possibility of
899219089Spjd			 * holding up the transaction if the data copy hangs
900219089Spjd			 * up on a pagefault (e.g., from an NFS server mapping).
901219089Spjd			 */
902209962Smm			size_t cbytes;
903209962Smm
904219089Spjd			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
905219089Spjd			    max_blksz);
906209962Smm			ASSERT(abuf != NULL);
907209962Smm			ASSERT(arc_buf_size(abuf) == max_blksz);
908209962Smm			if (error = uiocopy(abuf->b_data, max_blksz,
909209962Smm			    UIO_WRITE, uio, &cbytes)) {
910209962Smm				dmu_return_arcbuf(abuf);
911209962Smm				break;
912209962Smm			}
913209962Smm			ASSERT(cbytes == max_blksz);
914209962Smm		}
915209962Smm
916209962Smm		/*
917168404Spjd		 * Start a transaction.
918168404Spjd		 */
919168404Spjd		tx = dmu_tx_create(zfsvfs->z_os);
920219089Spjd		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
921168404Spjd		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
922219089Spjd		zfs_sa_upgrade_txholds(tx, zp);
923209962Smm		error = dmu_tx_assign(tx, TXG_NOWAIT);
924168404Spjd		if (error) {
925209962Smm			if (error == ERESTART) {
926168404Spjd				dmu_tx_wait(tx);
927168404Spjd				dmu_tx_abort(tx);
928209962Smm				goto again;
929168404Spjd			}
930168404Spjd			dmu_tx_abort(tx);
931209962Smm			if (abuf != NULL)
932209962Smm				dmu_return_arcbuf(abuf);
933168404Spjd			break;
934168404Spjd		}
935168404Spjd
936168404Spjd		/*
937168404Spjd		 * If zfs_range_lock() over-locked we grow the blocksize
938168404Spjd		 * and then reduce the lock range.  This will only happen
939168404Spjd		 * on the first iteration since zfs_range_reduce() will
940168404Spjd		 * shrink down r_len to the appropriate size.
941168404Spjd		 */
942168404Spjd		if (rl->r_len == UINT64_MAX) {
943168404Spjd			uint64_t new_blksz;
944168404Spjd
945168404Spjd			if (zp->z_blksz > max_blksz) {
946168404Spjd				ASSERT(!ISP2(zp->z_blksz));
947168404Spjd				new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
948168404Spjd			} else {
949168404Spjd				new_blksz = MIN(end_size, max_blksz);
950168404Spjd			}
951168404Spjd			zfs_grow_blocksize(zp, new_blksz, tx);
952168404Spjd			zfs_range_reduce(rl, woff, n);
953168404Spjd		}
954168404Spjd
955168404Spjd		/*
956168404Spjd		 * XXX - should we really limit each write to z_max_blksz?
957168404Spjd		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
958168404Spjd		 */
959168404Spjd		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
960168404Spjd
961219089Spjd		if (woff + nbytes > zp->z_size)
962168404Spjd			vnode_pager_setsize(vp, woff + nbytes);
963168404Spjd
964209962Smm		if (abuf == NULL) {
965209962Smm			tx_bytes = uio->uio_resid;
966219089Spjd			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
967219089Spjd			    uio, nbytes, tx);
968209962Smm			tx_bytes -= uio->uio_resid;
969168404Spjd		} else {
970209962Smm			tx_bytes = nbytes;
971219089Spjd			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
972219089Spjd			/*
973219089Spjd			 * If this is not a full block write, but we are
974219089Spjd			 * extending the file past EOF and this data starts
975219089Spjd			 * block-aligned, use assign_arcbuf().  Otherwise,
976219089Spjd			 * write via dmu_write().
977219089Spjd			 */
978219089Spjd			if (tx_bytes < max_blksz && (!write_eof ||
979219089Spjd			    aiov->iov_base != abuf->b_data)) {
980219089Spjd				ASSERT(xuio);
981219089Spjd				dmu_write(zfsvfs->z_os, zp->z_id, woff,
982219089Spjd				    aiov->iov_len, aiov->iov_base, tx);
983219089Spjd				dmu_return_arcbuf(abuf);
984219089Spjd				xuio_stat_wbuf_copied();
985219089Spjd			} else {
986219089Spjd				ASSERT(xuio || tx_bytes == max_blksz);
987219089Spjd				dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
988219089Spjd				    woff, abuf, tx);
989219089Spjd			}
990209962Smm			ASSERT(tx_bytes <= uio->uio_resid);
991209962Smm			uioskip(uio, tx_bytes);
992168404Spjd		}
993212657Savg		if (tx_bytes && vn_has_cached_data(vp)) {
994209962Smm			update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
995209962Smm			    zp->z_id, uio->uio_segflg, tx);
996209962Smm		}
997209962Smm
998209962Smm		/*
999168404Spjd		 * If we made no progress, we're done.  If we made even
1000168404Spjd		 * partial progress, update the znode and ZIL accordingly.
1001168404Spjd		 */
1002168404Spjd		if (tx_bytes == 0) {
1003219089Spjd			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
1004219089Spjd			    (void *)&zp->z_size, sizeof (uint64_t), tx);
1005168404Spjd			dmu_tx_commit(tx);
1006168404Spjd			ASSERT(error != 0);
1007168404Spjd			break;
1008168404Spjd		}
1009168404Spjd
1010168404Spjd		/*
1011168404Spjd		 * Clear Set-UID/Set-GID bits on successful write if not
1012168404Spjd		 * privileged and at least one of the execute bits is set.
1013168404Spjd		 *
1014168404Spjd		 * It would be nice to do this after all writes have
1015168404Spjd		 * been done, but that would still expose the ISUID/ISGID
1016168404Spjd		 * to another app after the partial write is committed.
1017185029Spjd		 *
1018185029Spjd		 * Note: we don't call zfs_fuid_map_id() here because
1019185029Spjd		 * user 0 is not an ephemeral uid.
1020168404Spjd		 */
1021168404Spjd		mutex_enter(&zp->z_acl_lock);
1022219089Spjd		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
1023168404Spjd		    (S_IXUSR >> 6))) != 0 &&
1024219089Spjd		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
1025185029Spjd		    secpolicy_vnode_setid_retain(vp, cr,
1026219089Spjd		    (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
1027219089Spjd			uint64_t newmode;
1028219089Spjd			zp->z_mode &= ~(S_ISUID | S_ISGID);
1029219089Spjd			newmode = zp->z_mode;
1030219089Spjd			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
1031219089Spjd			    (void *)&newmode, sizeof (uint64_t), tx);
1032168404Spjd		}
1033168404Spjd		mutex_exit(&zp->z_acl_lock);
1034168404Spjd
1035219089Spjd		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
1036219089Spjd		    B_TRUE);
1037168404Spjd
1038168404Spjd		/*
1039168404Spjd		 * Update the file size (z_size) if it has changed;
1040168404Spjd		 * account for possible concurrent updates.
1041168404Spjd		 */
1042219089Spjd		while ((end_size = zp->z_size) < uio->uio_loffset) {
1043219089Spjd			(void) atomic_cas_64(&zp->z_size, end_size,
1044168404Spjd			    uio->uio_loffset);
1045219089Spjd			ASSERT(error == 0);
1046219089Spjd		}
1047219089Spjd		/*
1048219089Spjd		 * If we are replaying and eof is non zero then force
1049219089Spjd		 * the file size to the specified eof. Note, there's no
1050219089Spjd		 * concurrency during replay.
1051219089Spjd		 */
1052219089Spjd		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
1053219089Spjd			zp->z_size = zfsvfs->z_replay_eof;
1054219089Spjd
1055219089Spjd		error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1056219089Spjd
1057168404Spjd		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
1058168404Spjd		dmu_tx_commit(tx);
1059168404Spjd
1060168404Spjd		if (error != 0)
1061168404Spjd			break;
1062168404Spjd		ASSERT(tx_bytes == nbytes);
1063168404Spjd		n -= nbytes;
1064219089Spjd
1065219089Spjd#ifdef sun
1066219089Spjd		if (!xuio && n > 0)
1067219089Spjd			uio_prefaultpages(MIN(n, max_blksz), uio);
1068219089Spjd#endif	/* sun */
1069168404Spjd	}
1070168404Spjd
1071168404Spjd	zfs_range_unlock(rl);
1072168404Spjd
1073168404Spjd	/*
1074168404Spjd	 * If we're in replay mode, or we made no progress, return error.
1075168404Spjd	 * Otherwise, it's at least a partial write, so it's successful.
1076168404Spjd	 */
1077209962Smm	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
1078168404Spjd		ZFS_EXIT(zfsvfs);
1079168404Spjd		return (error);
1080168404Spjd	}
1081168404Spjd
1082219089Spjd	if (ioflag & (FSYNC | FDSYNC) ||
1083219089Spjd	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1084219089Spjd		zil_commit(zilog, zp->z_id);
1085168404Spjd
1086168404Spjd	ZFS_EXIT(zfsvfs);
1087168404Spjd	return (0);
1088168404Spjd}
1089168404Spjd
/*
 * Finish a request started by zfs_get_data().  This routine is handed to
 * dmu_sync() as its completion callback and is also called directly by
 * zfs_get_data() on the paths that do not leave the work to dmu_sync().
 *
 *	IN:	zgd	- per-request state; zgd_private holds the znode.
 *		error	- result of the request; zgd_bp is passed to
 *			  zil_add_block() only when this is 0.
 *
 * Releases the dbuf hold (if one was taken), drops the range lock,
 * releases the vnode asynchronously and frees zgd.
 */
1090168404Spjdvoid
1091219089Spjdzfs_get_done(zgd_t *zgd, int error)
1092168404Spjd{
1093219089Spjd	znode_t *zp = zgd->zgd_private;
1094219089Spjd	objset_t *os = zp->z_zfsvfs->z_os;
1095168404Spjd	int vfslocked;
1096168404Spjd
	/* zgd_db is only set once zfs_get_data() has obtained a dbuf hold. */
1097219089Spjd	if (zgd->zgd_db)
1098219089Spjd		dmu_buf_rele(zgd->zgd_db, zgd);
1099219089Spjd
1100219089Spjd	zfs_range_unlock(zgd->zgd_rl);
1101219089Spjd
1102219089Spjd	vfslocked = VFS_LOCK_GIANT(zp->z_zfsvfs->z_vfs);
1103191900Skmacy	/*
1104191900Skmacy	 * Release the vnode asynchronously as we currently have the
1105191900Skmacy	 * txg stopped from syncing.
1106191900Skmacy	 */
1107219089Spjd	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1108219089Spjd
	/* zgd_bp is only set for an indirect write (see zfs_get_data()). */
1109219089Spjd	if (error == 0 && zgd->zgd_bp)
1110219089Spjd		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
1111219089Spjd
1112168404Spjd	kmem_free(zgd, sizeof (zgd_t));
1113168404Spjd	VFS_UNLOCK_GIANT(vfslocked);
1114168404Spjd}
1115168404Spjd
/*
 * DEBUG-only fault injection flag: when non-zero, the next indirect write
 * handled by zfs_get_data() fails with EIO and the flag is reset to 0.
 */
1116214378Smm#ifdef DEBUG
1117214378Smmstatic int zil_fault_io = 0;
1118214378Smm#endif
1119214378Smm
1120168404Spjd/*
1121168404Spjd * Get data to generate a TX_WRITE intent log record.
 *
 *	IN:	arg	- zfsvfs_t of the file system.
 *		lr	- write log record; lr_foid, lr_offset and lr_length
 *			  select the data.  For an indirect write the
 *			  address of lr_blkptr is stored in the zgd.
 *		buf	- destination buffer for an immediate write, or
 *			  NULL to request an indirect write.
 *		zio	- passed to dmu_sync() for an indirect write;
 *			  asserted to be non-NULL.
 *
 *	RETURN:	0 if success
 *		ENOENT if the object cannot be obtained, has been unlinked,
 *		or the record offset is at or beyond the current file size
 *		error code if failure
 *
 * When dmu_sync() returns 0 the cleanup is left to zfs_get_done(), which
 * was registered with dmu_sync(); on every other path after the zgd has
 * been allocated, zfs_get_done() is called directly before returning.
1122168404Spjd */
1123168404Spjdint
1124168404Spjdzfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
1125168404Spjd{
1126168404Spjd	zfsvfs_t *zfsvfs = arg;
1127168404Spjd	objset_t *os = zfsvfs->z_os;
1128168404Spjd	znode_t *zp;
1129219089Spjd	uint64_t object = lr->lr_foid;
1130219089Spjd	uint64_t offset = lr->lr_offset;
1131219089Spjd	uint64_t size = lr->lr_length;
1132219089Spjd	blkptr_t *bp = &lr->lr_blkptr;
1133168404Spjd	dmu_buf_t *db;
1134168404Spjd	zgd_t *zgd;
1135168404Spjd	int error = 0;
1136168404Spjd
1137219089Spjd	ASSERT(zio != NULL);
1138219089Spjd	ASSERT(size != 0);
1139168404Spjd
1140168404Spjd	/*
1141168404Spjd	 * Nothing to do if the file has been removed
1142168404Spjd	 */
1143219089Spjd	if (zfs_zget(zfsvfs, object, &zp) != 0)
1144168404Spjd		return (ENOENT);
1145168404Spjd	if (zp->z_unlinked) {
1146191900Skmacy		/*
1147191900Skmacy		 * Release the vnode asynchronously as we currently have the
1148191900Skmacy		 * txg stopped from syncing.
1149191900Skmacy		 */
1150196307Spjd		VN_RELE_ASYNC(ZTOV(zp),
1151196307Spjd		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1152168404Spjd		return (ENOENT);
1153168404Spjd	}
1154168404Spjd
	/*
	 * Zero-filled so that zgd_db and zgd_bp stay NULL unless set below;
	 * zfs_get_done() tests both before using them.
	 */
1155219089Spjd	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
1156219089Spjd	zgd->zgd_zilog = zfsvfs->z_log;
1157219089Spjd	zgd->zgd_private = zp;
1158219089Spjd
1159168404Spjd	/*
1160168404Spjd	 * Write records come in two flavors: immediate and indirect.
1161168404Spjd	 * For small writes it's cheaper to store the data with the
1162168404Spjd	 * log record (immediate); for large writes it's cheaper to
1163168404Spjd	 * sync the data and get a pointer to it (indirect) so that
1164168404Spjd	 * we don't have to write the data twice.
1165168404Spjd	 */
1166168404Spjd	if (buf != NULL) { /* immediate write */
1167219089Spjd		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
1168168404Spjd		/* test for truncation needs to be done while range locked */
1169219089Spjd		if (offset >= zp->z_size) {
1170168404Spjd			error = ENOENT;
1171219089Spjd		} else {
1172219089Spjd			error = dmu_read(os, object, offset, size, buf,
1173219089Spjd			    DMU_READ_NO_PREFETCH);
1174168404Spjd		}
1175219089Spjd		ASSERT(error == 0 || error == ENOENT);
1176168404Spjd	} else { /* indirect write */
1177168404Spjd		/*
1178168404Spjd		 * Have to lock the whole block to ensure when it's
1179168404Spjd		 * written out and its checksum is being calculated
1180168404Spjd		 * that no one can change the data. We need to re-check
1181168404Spjd		 * blocksize after we get the lock in case it's changed!
1182168404Spjd		 */
1183168404Spjd		for (;;) {
1184219089Spjd			uint64_t blkoff;
1185219089Spjd			size = zp->z_blksz;
			/*
			 * Move offset back to the start of the range to lock:
			 * the enclosing block when the block size is a power
			 * of two, otherwise offset 0 (blkoff == offset).
			 */
1186219089Spjd			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
1187219089Spjd			offset -= blkoff;
1188219089Spjd			zgd->zgd_rl = zfs_range_lock(zp, offset, size,
1189219089Spjd			    RL_READER);
1190219089Spjd			if (zp->z_blksz == size)
1191168404Spjd				break;
			/* Block size changed: restore offset and try again. */
1192219089Spjd			offset += blkoff;
1193219089Spjd			zfs_range_unlock(zgd->zgd_rl);
1194168404Spjd		}
1195168404Spjd		/* test for truncation needs to be done while range locked */
1196219089Spjd		if (lr->lr_offset >= zp->z_size)
1197168404Spjd			error = ENOENT;
1198214378Smm#ifdef DEBUG
		/* One-shot fault injection: fail this request with EIO. */
1199214378Smm		if (zil_fault_io) {
1200214378Smm			error = EIO;
1201214378Smm			zil_fault_io = 0;
1202214378Smm		}
1203214378Smm#endif
1204219089Spjd		if (error == 0)
1205219089Spjd			error = dmu_buf_hold(os, object, offset, zgd, &db,
1206219089Spjd			    DMU_READ_NO_PREFETCH);
1207214378Smm
1208209962Smm		if (error == 0) {
1209219089Spjd			zgd->zgd_db = db;
1210219089Spjd			zgd->zgd_bp = bp;
1211219089Spjd
1212219089Spjd			ASSERT(db->db_offset == offset);
1213219089Spjd			ASSERT(db->db_size == size);
1214219089Spjd
1215219089Spjd			error = dmu_sync(zio, lr->lr_common.lrc_txg,
1216219089Spjd			    zfs_get_done, zgd);
1217219089Spjd			ASSERT(error || lr->lr_length <= zp->z_blksz);
1218219089Spjd
1219209962Smm			/*
1220219089Spjd			 * On success, we need to wait for the write I/O
1221219089Spjd			 * initiated by dmu_sync() to complete before we can
1222219089Spjd			 * release this dbuf.  We will finish everything up
1223219089Spjd			 * in the zfs_get_done() callback.
1224209962Smm			 */
1225219089Spjd			if (error == 0)
1226219089Spjd				return (0);
1227209962Smm
			/*
			 * EALREADY from dmu_sync() is not treated as a
			 * failure: the record type is switched to TX_WRITE2
			 * and the error is cleared.
			 */
1228219089Spjd			if (error == EALREADY) {
1229219089Spjd				lr->lr_common.lrc_txtype = TX_WRITE2;
1230219089Spjd				error = 0;
1231219089Spjd			}
1232209962Smm		}
1233168404Spjd	}
1234219089Spjd
	/* Not handed off to dmu_sync(): clean up here. */
1235219089Spjd	zfs_get_done(zgd, error);
1236219089Spjd
1237168404Spjd	return (error);
1238168404Spjd}
1239168404Spjd
/*
 * Check the caller's access to a vnode.
 *
 *	IN:	vp	- vnode to check.
 *		mode	- requested access, passed unchanged to the
 *			  underlying check.
 *		flag	- if any V_ACE_MASK bit is set the check is done
 *			  by zfs_zaccess(), otherwise by zfs_zaccess_rwx().
 *		cr	- credentials of caller.
 *		ct	- caller context [UNUSED].
 *
 *	RETURN:	result of the zfs_zaccess()/zfs_zaccess_rwx() call
 */
1240168404Spjd/*ARGSUSED*/
1241168404Spjdstatic int
1242185029Spjdzfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
1243185029Spjd    caller_context_t *ct)
1244168404Spjd{
1245168404Spjd	znode_t *zp = VTOZ(vp);
1246168404Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1247168404Spjd	int error;
1248168404Spjd
1249168404Spjd	ZFS_ENTER(zfsvfs);
1250185029Spjd	ZFS_VERIFY_ZP(zp);
1251185029Spjd
1252185029Spjd	if (flag & V_ACE_MASK)
1253185029Spjd		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
1254185029Spjd	else
1255185029Spjd		error = zfs_zaccess_rwx(zp, mode, flag, cr);
1256185029Spjd
1257168404Spjd	ZFS_EXIT(zfsvfs);
1258168404Spjd	return (error);
1259168404Spjd}
1260168404Spjd
1261168404Spjd/*
1262211932Smm * If vnode is for a device return a specfs vnode instead.
 *
 *	IN/OUT:	vpp	- vnode to check.  When IS_DEVVP(*vpp) is true the
 *			  hold on the original vnode is released and *vpp
 *			  is replaced by the result of specvp(), which is
 *			  NULL on failure.  Otherwise *vpp is left alone.
 *	IN:	cr	- credentials passed to specvp().
 *
 *	RETURN:	0 if success (including the non-device case)
 *		ENOSYS if specvp() returned NULL
1263211932Smm */
1264211932Smmstatic int
1265211932Smmspecvp_check(vnode_t **vpp, cred_t *cr)
1266211932Smm{
1267211932Smm	int error = 0;
1268211932Smm
1269211932Smm	if (IS_DEVVP(*vpp)) {
1270211932Smm		struct vnode *svp;
1271211932Smm
1272211932Smm		svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
		/* The original vnode is released whether or not specvp() worked. */
1273211932Smm		VN_RELE(*vpp);
1274211932Smm		if (svp == NULL)
1275211932Smm			error = ENOSYS;
1276211932Smm		*vpp = svp;
1277211932Smm	}
1278211932Smm	return (error);
1279211932Smm}
1280211932Smm
1281211932Smm
1282211932Smm/*
1283168404Spjd * Lookup an entry in a directory, or an extended attribute directory.
1284168404Spjd * If it exists, return a held vnode reference for it.
1285168404Spjd *
1286168404Spjd *	IN:	dvp	- vnode of directory to search.
1287168404Spjd *		nm	- name of entry to lookup.
1288168404Spjd *		cnp	- component name of the lookup; cn_flags and
 *			  cn_lkflags are read and SAVENAME may be set.
1289168404Spjd *		flags	- LOOKUP_XATTR set if looking for an attribute.
1290168404Spjd *		nameiop	- lookup operation (CREATE, RENAME and DELETE
 *			  get special handling on the last component).
1291168404Spjd *		cr	- credentials of caller.
1292185029Spjd *		td	- calling thread [UNUSED].
1293185029Spjd *		(direntflags and realpnp are not parameters here; they are
1294185029Spjd *		locals that are always passed as NULL to zfs_dirlook().)
1295168404Spjd *
1296168404Spjd *	OUT:	vpp	- vnode of located entry, NULL if not found.
1297168404Spjd *
1298168404Spjd *	RETURN:	0 if success
 *		EJUSTRETURN if the last component does not exist and
 *		nameiop is CREATE or RENAME
1299168404Spjd *		error code if failure
1300168404Spjd *
1301168404Spjd * Timestamps:
1302168404Spjd *	NA
1303168404Spjd */
1304168404Spjd/* ARGSUSED */
1305168962Spjdstatic int
1306168962Spjdzfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
1307185029Spjd    int nameiop, cred_t *cr, kthread_t *td, int flags)
1308168404Spjd{
1309168962Spjd	znode_t *zdp = VTOZ(dvp);
1310168962Spjd	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1311211932Smm	int	error = 0;
1312185029Spjd	int *direntflags = NULL;
1313185029Spjd	void *realpnp = NULL;
1314168404Spjd
1315211932Smm	/* fast path */
	/*
	 * Lookups without LOOKUP_XATTR or FIGNORECASE are first tried
	 * against "." and the DNLC; every return inside this block happens
	 * before ZFS_ENTER() is called.
	 */
1316211932Smm	if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
1317211932Smm
1318211932Smm		if (dvp->v_type != VDIR) {
1319211932Smm			return (ENOTDIR);
1320219089Spjd		} else if (zdp->z_sa_hdl == NULL) {
1321211932Smm			return (EIO);
1322211932Smm		}
1323211932Smm
		/* An empty name or "." resolves to the directory itself. */
1324211932Smm		if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
1325211932Smm			error = zfs_fastaccesschk_execute(zdp, cr);
1326211932Smm			if (!error) {
1327211932Smm				*vpp = dvp;
1328211932Smm				VN_HOLD(*vpp);
1329211932Smm				return (0);
1330211932Smm			}
1331211932Smm			return (error);
1332211932Smm		} else {
1333211932Smm			vnode_t *tvp = dnlc_lookup(dvp, nm);
1334211932Smm
1335211932Smm			if (tvp) {
1336211932Smm				error = zfs_fastaccesschk_execute(zdp, cr);
1337211932Smm				if (error) {
1338211932Smm					VN_RELE(tvp);
1339211932Smm					return (error);
1340211932Smm				}
				/* DNLC_NO_VNODE is treated as "name does not exist". */
1341211932Smm				if (tvp == DNLC_NO_VNODE) {
1342211932Smm					VN_RELE(tvp);
1343211932Smm					return (ENOENT);
1344211932Smm				} else {
1345211932Smm					*vpp = tvp;
1346211932Smm					return (specvp_check(vpp, cr));
1347211932Smm				}
1348211932Smm			}
1349211932Smm		}
1350211932Smm	}
1351211932Smm
1352211932Smm	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
1353211932Smm
1354168404Spjd	ZFS_ENTER(zfsvfs);
1355185029Spjd	ZFS_VERIFY_ZP(zdp);
1356168404Spjd
1357168404Spjd	*vpp = NULL;
1358168404Spjd
1359185029Spjd	if (flags & LOOKUP_XATTR) {
1360168404Spjd#ifdef TODO
1361168404Spjd		/*
1362168404Spjd		 * If the xattr property is off, refuse the lookup request.
1363168404Spjd		 */
1364168404Spjd		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
1365168404Spjd			ZFS_EXIT(zfsvfs);
1366168404Spjd			return (EINVAL);
1367168404Spjd		}
1368185029Spjd#endif
1369168404Spjd
1370168404Spjd		/*
1371168404Spjd		 * We don't allow recursive attributes.
1372168404Spjd		 * Maybe someday we will.
1373168404Spjd		 */
1374219089Spjd		if (zdp->z_pflags & ZFS_XATTR) {
1375168404Spjd			ZFS_EXIT(zfsvfs);
1376168404Spjd			return (EINVAL);
1377168404Spjd		}
1378168404Spjd
1379168404Spjd		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
1380168404Spjd			ZFS_EXIT(zfsvfs);
1381168404Spjd			return (error);
1382168404Spjd		}
1383168404Spjd
1384168404Spjd		/*
1385168404Spjd		 * Do we have permission to get into attribute directory?
1386168404Spjd		 */
1387168404Spjd
1388185029Spjd		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
1389185029Spjd		    B_FALSE, cr)) {
1390168404Spjd			VN_RELE(*vpp);
1391185029Spjd			*vpp = NULL;
1392168404Spjd		}
1393168404Spjd
1394168404Spjd		ZFS_EXIT(zfsvfs);
1395168404Spjd		return (error);
1396168404Spjd	}
1397168404Spjd
1398168404Spjd	if (dvp->v_type != VDIR) {
1399168404Spjd		ZFS_EXIT(zfsvfs);
1400168404Spjd		return (ENOTDIR);
1401168404Spjd	}
1402168404Spjd
1403168404Spjd	/*
1404168404Spjd	 * Check accessibility of directory.
1405168404Spjd	 */
1406168404Spjd
1407185029Spjd	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
1408168404Spjd		ZFS_EXIT(zfsvfs);
1409168404Spjd		return (error);
1410168404Spjd	}
1411168404Spjd
	/* On UTF-8-only file systems reject names that fail validation. */
1412185029Spjd	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
1413185029Spjd	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1414185029Spjd		ZFS_EXIT(zfsvfs);
1415185029Spjd		return (EILSEQ);
1416185029Spjd	}
1417168404Spjd
1418185029Spjd	error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
1419211932Smm	if (error == 0)
1420211932Smm		error = specvp_check(vpp, cr);
1421168962Spjd
1422168404Spjd	/* Translate errors and add SAVENAME when needed. */
1423168404Spjd	if (cnp->cn_flags & ISLASTCN) {
1424168404Spjd		switch (nameiop) {
1425168404Spjd		case CREATE:
1426168404Spjd		case RENAME:
1427168404Spjd			if (error == ENOENT) {
1428168404Spjd				error = EJUSTRETURN;
1429168404Spjd				cnp->cn_flags |= SAVENAME;
1430168404Spjd				break;
1431168404Spjd			}
1432168404Spjd			/* FALLTHROUGH */
1433168404Spjd		case DELETE:
1434168404Spjd			if (error == 0)
1435168404Spjd				cnp->cn_flags |= SAVENAME;
1436168404Spjd			break;
1437168404Spjd		}
1438168404Spjd	}
	/*
	 * On success, lock the returned vnode unless the name was ".".
	 * For ".." the lock on dvp is dropped around the call and re-taken
	 * afterwards with the lock type saved in ltype.
	 * NOTE(review): presumably this avoids a lock-order problem between
	 * a directory and its parent - confirm.
	 */
1439168404Spjd	if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) {
1440169198Spjd		int ltype = 0;
1441169198Spjd
1442169198Spjd		if (cnp->cn_flags & ISDOTDOT) {
1443176559Sattilio			ltype = VOP_ISLOCKED(dvp);
1444175294Sattilio			VOP_UNLOCK(dvp, 0);
1445169198Spjd		}
1446206667Spjd		ZFS_EXIT(zfsvfs);
1447219089Spjd		error = zfs_vnode_lock(*vpp, cnp->cn_lkflags);
1448168962Spjd		if (cnp->cn_flags & ISDOTDOT)
1449175202Sattilio			vn_lock(dvp, ltype | LK_RETRY);
1450169172Spjd		if (error != 0) {
1451169172Spjd			VN_RELE(*vpp);
1452169172Spjd			*vpp = NULL;
1453169172Spjd			return (error);
1454169172Spjd		}
1455206667Spjd	} else {
1456206667Spjd		ZFS_EXIT(zfsvfs);
1457168404Spjd	}
1458168404Spjd
1459168404Spjd#ifdef FREEBSD_NAMECACHE
1460168404Spjd	/*
1461168404Spjd	 * Insert name into cache (as non-existent) if appropriate.
1462168404Spjd	 */
1463168404Spjd	if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
1464168404Spjd		cache_enter(dvp, *vpp, cnp);
1465169170Spjd	/*
1466169170Spjd	 * Insert name into cache if appropriate.
1467169170Spjd	 */
1468168404Spjd	if (error == 0 && (cnp->cn_flags & MAKEENTRY)) {
1469168404Spjd		if (!(cnp->cn_flags & ISLASTCN) ||
1470168404Spjd		    (nameiop != DELETE && nameiop != RENAME)) {
1471168404Spjd			cache_enter(dvp, *vpp, cnp);
1472168404Spjd		}
1473168404Spjd	}
1474168404Spjd#endif
1475168404Spjd
1476168404Spjd	return (error);
1477168404Spjd}
1478168404Spjd
1479168404Spjd/*
1480168404Spjd * Attempt to create a new entry in a directory.  If the entry
1481168404Spjd * already exists, truncate the file if permissible, else return
1482168404Spjd * an error.  Return the vp of the created or trunc'd file.
1483168404Spjd *
1484168404Spjd *	IN:	dvp	- vnode of directory to put new file entry in.
1485168404Spjd *		name	- name of new file entry.
1486168404Spjd *		vap	- attributes of new file.
1487168404Spjd *		excl	- flag indicating exclusive or non-exclusive mode.
1488168404Spjd *		mode	- mode to open file with.
1489168404Spjd *		cr	- credentials of caller.
1490168404Spjd *		td	- calling thread [UNUSED].
1491185029Spjd *		(flag and vsecp are not parameters here; they are locals
1492185029Spjd *		fixed at 0 and NULL respectively.)
1493168404Spjd *
1494168404Spjd *	OUT:	vpp	- vnode of created or trunc'd entry.
1495168404Spjd *
1496168404Spjd *	RETURN:	0 if success
 *		EEXIST if the entry already exists and excl is EXCL
1497168404Spjd *		error code if failure
1498168404Spjd *
1499168404Spjd * Timestamps:
1500168404Spjd *	dvp - ctime|mtime updated if new entry created
1501168404Spjd *	 vp - ctime|mtime always, atime if new
1502168404Spjd */
1503185029Spjd
1504168404Spjd/* ARGSUSED */
1505168404Spjdstatic int
1506168962Spjdzfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
1507185029Spjd    vnode_t **vpp, cred_t *cr, kthread_t *td)
1508168404Spjd{
1509168404Spjd	znode_t		*zp, *dzp = VTOZ(dvp);
1510168404Spjd	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1511185029Spjd	zilog_t		*zilog;
1512185029Spjd	objset_t	*os;
1513168404Spjd	zfs_dirlock_t	*dl;
1514168404Spjd	dmu_tx_t	*tx;
1515168404Spjd	int		error;
1516209962Smm	ksid_t		*ksid;
1517209962Smm	uid_t		uid;
1518209962Smm	gid_t		gid = crgetgid(cr);
1519219089Spjd	zfs_acl_ids_t   acl_ids;
1520209962Smm	boolean_t	fuid_dirtied;
	/* B_TRUE while acl_ids holds ids that must be freed or consumed. */
1521219089Spjd	boolean_t	have_acl = B_FALSE;
1522185029Spjd	void		*vsecp = NULL;
1523185029Spjd	int		flag = 0;
1524168404Spjd
1525185029Spjd	/*
1526185029Spjd	 * If we have an ephemeral id, ACL, or XVATTR then
1527185029Spjd	 * make sure file system is at proper version
1528185029Spjd	 */
1529185029Spjd
1530209962Smm	ksid = crgetsid(cr, KSID_OWNER);
1531209962Smm	if (ksid)
1532209962Smm		uid = ksid_getid(ksid);
1533209962Smm	else
1534209962Smm		uid = crgetuid(cr);
1535219089Spjd
1536185029Spjd	if (zfsvfs->z_use_fuids == B_FALSE &&
1537185029Spjd	    (vsecp || (vap->va_mask & AT_XVATTR) ||
1538219089Spjd	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1539185029Spjd		return (EINVAL);
1540185029Spjd
1541168404Spjd	ZFS_ENTER(zfsvfs);
1542185029Spjd	ZFS_VERIFY_ZP(dzp);
1543185029Spjd	os = zfsvfs->z_os;
1544185029Spjd	zilog = zfsvfs->z_log;
1545168404Spjd
1546185029Spjd	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
1547185029Spjd	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1548185029Spjd		ZFS_EXIT(zfsvfs);
1549185029Spjd		return (EILSEQ);
1550185029Spjd	}
1551185029Spjd
1552185029Spjd	if (vap->va_mask & AT_XVATTR) {
1553197861Spjd		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
1554185029Spjd		    crgetuid(cr), cr, vap->va_type)) != 0) {
1555185029Spjd			ZFS_EXIT(zfsvfs);
1556185029Spjd			return (error);
1557185029Spjd		}
1558185029Spjd	}
	/*
	 * Retry point: control comes back here when dmu_tx_assign() fails
	 * with ERESTART.  acl_ids is not freed on that path, so have_acl
	 * may already be B_TRUE on a later pass.
	 */
1559168404Spjdtop:
1560168404Spjd	*vpp = NULL;
1561168404Spjd
1562182905Strasz	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
1563182905Strasz		vap->va_mode &= ~S_ISVTX;
1564168404Spjd
1565168404Spjd	if (*name == '\0') {
1566168404Spjd		/*
1567168404Spjd		 * Null component name refers to the directory itself.
1568168404Spjd		 */
1569168404Spjd		VN_HOLD(dvp);
1570168404Spjd		zp = dzp;
1571168404Spjd		dl = NULL;
1572168404Spjd		error = 0;
1573168404Spjd	} else {
1574168404Spjd		/* possible VN_HOLD(zp) */
1575185029Spjd		int zflg = 0;
1576185029Spjd
1577185029Spjd		if (flag & FIGNORECASE)
1578185029Spjd			zflg |= ZCILOOK;
1579185029Spjd
1580185029Spjd		error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1581185029Spjd		    NULL, NULL);
1582185029Spjd		if (error) {
1583219089Spjd			if (have_acl)
1584219089Spjd				zfs_acl_ids_free(&acl_ids);
1585168404Spjd			if (strcmp(name, "..") == 0)
1586168404Spjd				error = EISDIR;
1587168404Spjd			ZFS_EXIT(zfsvfs);
1588168404Spjd			return (error);
1589168404Spjd		}
1590168404Spjd	}
1591219089Spjd
1592185029Spjd	if (zp == NULL) {
1593185029Spjd		uint64_t txtype;
1594168404Spjd
1595168404Spjd		/*
1596168404Spjd		 * Create a new file object and update the directory
1597168404Spjd		 * to reference it.
1598168404Spjd		 */
1599185029Spjd		if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
1600219089Spjd			if (have_acl)
1601219089Spjd				zfs_acl_ids_free(&acl_ids);
1602168404Spjd			goto out;
1603168404Spjd		}
1604168404Spjd
1605168404Spjd		/*
1606168404Spjd		 * We only support the creation of regular files in
1607168404Spjd		 * extended attribute directories.
1608168404Spjd		 */
1609219089Spjd
1610219089Spjd		if ((dzp->z_pflags & ZFS_XATTR) &&
1611168404Spjd		    (vap->va_type != VREG)) {
1612219089Spjd			if (have_acl)
1613219089Spjd				zfs_acl_ids_free(&acl_ids);
1614168404Spjd			error = EINVAL;
1615168404Spjd			goto out;
1616168404Spjd		}
1617168404Spjd
		/* Build the ACL ids only once; a retry reuses them. */
1618219089Spjd		if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
1619219089Spjd		    cr, vsecp, &acl_ids)) != 0)
1620219089Spjd			goto out;
1621219089Spjd		have_acl = B_TRUE;
1622209962Smm
1623209962Smm		if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1624211932Smm			zfs_acl_ids_free(&acl_ids);
1625209962Smm			error = EDQUOT;
1626209962Smm			goto out;
1627209962Smm		}
1628209962Smm
1629168404Spjd		tx = dmu_tx_create(os);
1630219089Spjd
1631219089Spjd		dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1632219089Spjd		    ZFS_SA_BASE_ATTR_SIZE);
1633219089Spjd
1634209962Smm		fuid_dirtied = zfsvfs->z_fuid_dirty;
1635209962Smm		if (fuid_dirtied)
1636209962Smm			zfs_fuid_txhold(zfsvfs, tx);
1637168404Spjd		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1638219089Spjd		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
1639219089Spjd		if (!zfsvfs->z_use_sa &&
1640219089Spjd		    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1641168404Spjd			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1642219089Spjd			    0, acl_ids.z_aclp->z_acl_bytes);
1643185029Spjd		}
1644209962Smm		error = dmu_tx_assign(tx, TXG_NOWAIT);
1645168404Spjd		if (error) {
1646168404Spjd			zfs_dirent_unlock(dl);
			/* ERESTART: wait, abort this tx and start over at top. */
1647209962Smm			if (error == ERESTART) {
1648168404Spjd				dmu_tx_wait(tx);
1649168404Spjd				dmu_tx_abort(tx);
1650168404Spjd				goto top;
1651168404Spjd			}
1652219089Spjd			zfs_acl_ids_free(&acl_ids);
1653168404Spjd			dmu_tx_abort(tx);
1654168404Spjd			ZFS_EXIT(zfsvfs);
1655168404Spjd			return (error);
1656168404Spjd		}
1657219089Spjd		zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1658209962Smm
1659209962Smm		if (fuid_dirtied)
1660209962Smm			zfs_fuid_sync(zfsvfs, tx);
1661209962Smm
1662168404Spjd		(void) zfs_link_create(dl, zp, tx, ZNEW);
1663185029Spjd		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1664185029Spjd		if (flag & FIGNORECASE)
1665185029Spjd			txtype |= TX_CI;
1666185029Spjd		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1667209962Smm		    vsecp, acl_ids.z_fuidp, vap);
1668209962Smm		zfs_acl_ids_free(&acl_ids);
1669168404Spjd		dmu_tx_commit(tx);
1670168404Spjd	} else {
1671185029Spjd		int aflags = (flag & FAPPEND) ? V_APPEND : 0;
1672185029Spjd
		/* The entry exists, so ids kept from an earlier pass are unused. */
1673219089Spjd		if (have_acl)
1674219089Spjd			zfs_acl_ids_free(&acl_ids);
1675219089Spjd		have_acl = B_FALSE;
1676219089Spjd
1677168404Spjd		/*
1678168404Spjd		 * A directory entry already exists for this name.
1679168404Spjd		 */
1680168404Spjd		/*
1681168962Spjd		 * Can't truncate an existing file if in exclusive mode.
1682168962Spjd		 */
1683168962Spjd		if (excl == EXCL) {
1684168962Spjd			error = EEXIST;
1685168962Spjd			goto out;
1686168962Spjd		}
1687168962Spjd		/*
1688168404Spjd		 * Can't open a directory for writing.
1689168404Spjd		 */
1690168404Spjd		if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
1691168404Spjd			error = EISDIR;
1692168404Spjd			goto out;
1693168404Spjd		}
1694168404Spjd		/*
1695168404Spjd		 * Verify requested access to file.
1696168404Spjd		 */
1697185029Spjd		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
1698168404Spjd			goto out;
1699168404Spjd		}
1700168404Spjd
1701168404Spjd		mutex_enter(&dzp->z_lock);
1702168404Spjd		dzp->z_seq++;
1703168404Spjd		mutex_exit(&dzp->z_lock);
1704168404Spjd
1705168404Spjd		/*
1706168404Spjd		 * Truncate regular files if requested.
1707168404Spjd		 */
1708168404Spjd		if ((ZTOV(zp)->v_type == VREG) &&
1709168404Spjd		    (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
1710185029Spjd			/* we can't hold any locks when calling zfs_freesp() */
1711185029Spjd			zfs_dirent_unlock(dl);
1712185029Spjd			dl = NULL;
1713168404Spjd			error = zfs_freesp(zp, 0, 0, mode, TRUE);
1714185029Spjd			if (error == 0) {
				/*
				 * NOTE(review): `ct` is not declared in this
				 * function; this presumably only builds
				 * because vnevent_create() is a macro that
				 * discards its arguments - confirm.
				 */
1715185029Spjd				vnevent_create(ZTOV(zp), ct);
1716168404Spjd			}
1717168404Spjd		}
1718168404Spjd	}
	/* Common exit: drop the dirent lock if held and publish or release zp. */
1719168404Spjdout:
1720168404Spjd	if (dl)
1721168404Spjd		zfs_dirent_unlock(dl);
1722168404Spjd
1723168404Spjd	if (error) {
1724168404Spjd		if (zp)
1725168404Spjd			VN_RELE(ZTOV(zp));
1726168962Spjd	} else {
1727168962Spjd		*vpp = ZTOV(zp);
1728211932Smm		error = specvp_check(vpp, cr);
1729168404Spjd	}
1730168404Spjd
	/* zil_commit() is called here whether or not an error occurred. */
1731219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1732219089Spjd		zil_commit(zilog, 0);
1733219089Spjd
1734168404Spjd	ZFS_EXIT(zfsvfs);
1735168404Spjd	return (error);
1736168404Spjd}
1737168404Spjd
1738168404Spjd/*
1739168404Spjd * Remove an entry from a directory.
1740168404Spjd *
 * The named entry is locked, delete access is checked, and the link is
 * destroyed inside a DMU transaction.  Directories are rejected with
 * EPERM.  When zfs_link_destroy() reports the znode as unlinked, it is
 * either deleted right away or added to the unlinked set, depending on
 * the checks made under z_lock below.
 *
1741168404Spjd *	IN:	dvp	- vnode of directory to remove entry from.
1742168404Spjd *		name	- name of entry to remove.
1743168404Spjd *		cr	- credentials of caller.
1744185029Spjd *		ct	- caller context
1745185029Spjd *		flags	- case flags
1746168404Spjd *
1747168404Spjd *	RETURN:	0 if success
1748168404Spjd *		error code if failure
1749168404Spjd *
1750168404Spjd * Timestamps:
1751168404Spjd *	dvp - ctime|mtime
1752168404Spjd *	 vp - ctime (if nlink > 0)
1753168404Spjd */
1754219089Spjd
/*
 * Zero value written to the SA_ZPL_XATTR attribute by zfs_remove() when
 * the znode is not SA based (the sa_update() branch).
 */
1755219089Spjduint64_t null_xattr = 0;
1756219089Spjd
1757185029Spjd/*ARGSUSED*/
1758168404Spjdstatic int
1759185029Spjdzfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
1760185029Spjd    int flags)
1761168404Spjd{
1762168404Spjd	znode_t		*zp, *dzp = VTOZ(dvp);
1763219089Spjd	znode_t		*xzp;
1764168404Spjd	vnode_t		*vp;
1765168404Spjd	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1766185029Spjd	zilog_t		*zilog;
1767168962Spjd	uint64_t	acl_obj, xattr_obj;
1768219089Spjd	uint64_t 	xattr_obj_unlinked = 0;
1769219089Spjd	uint64_t	obj = 0;
1770168404Spjd	zfs_dirlock_t	*dl;
1771168404Spjd	dmu_tx_t	*tx;
1772168962Spjd	boolean_t	may_delete_now, delete_now = FALSE;
1773185029Spjd	boolean_t	unlinked, toobig = FALSE;
1774185029Spjd	uint64_t	txtype;
1775185029Spjd	pathname_t	*realnmp = NULL;
1776185029Spjd	pathname_t	realnm;
1777168404Spjd	int		error;
1778185029Spjd	int		zflg = ZEXISTS;
1779168404Spjd
1780168404Spjd	ZFS_ENTER(zfsvfs);
1781185029Spjd	ZFS_VERIFY_ZP(dzp);
1782185029Spjd	zilog = zfsvfs->z_log;
1783168404Spjd
	/*
	 * Case-insensitive request: add ZCILOOK to the lookup flags and hand
	 * zfs_dirent_lock() a pathname buffer; its pn_buf is what is passed
	 * to dnlc_remove() further down.
	 */
1784185029Spjd	if (flags & FIGNORECASE) {
1785185029Spjd		zflg |= ZCILOOK;
1786185029Spjd		pn_alloc(&realnm);
1787185029Spjd		realnmp = &realnm;
1788185029Spjd	}
1789185029Spjd
	/*
	 * Retry point: control returns here when dmu_tx_assign() fails with
	 * ERESTART, so the per-attempt state is reset first.
	 */
1790168404Spjdtop:
1791219089Spjd	xattr_obj = 0;
1792219089Spjd	xzp = NULL;
1793168404Spjd	/*
1794168404Spjd	 * Attempt to lock directory; fail if entry doesn't exist.
1795168404Spjd	 */
1796185029Spjd	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1797185029Spjd	    NULL, realnmp)) {
1798185029Spjd		if (realnmp)
1799185029Spjd			pn_free(realnmp);
1800168404Spjd		ZFS_EXIT(zfsvfs);
1801168404Spjd		return (error);
1802168404Spjd	}
1803168404Spjd
1804168404Spjd	vp = ZTOV(zp);
1805168404Spjd
1806168962Spjd	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1807168404Spjd		goto out;
1808168962Spjd	}
1809168404Spjd
1810168962Spjd	/*
1811168962Spjd	 * Need to use rmdir for removing directories.
1812168962Spjd	 */
1813168962Spjd	if (vp->v_type == VDIR) {
1814168962Spjd		error = EPERM;
1815168962Spjd		goto out;
1816168962Spjd	}
1817168962Spjd
1818185029Spjd	vnevent_remove(vp, dvp, name, ct);
1819168962Spjd
1820185029Spjd	if (realnmp)
1821185029Spjd		dnlc_remove(dvp, realnmp->pn_buf);
1822185029Spjd	else
1823185029Spjd		dnlc_remove(dvp, name);
1824168404Spjd
	/*
	 * Sample the reference count under the vnode interlock.  Immediate
	 * deletion is only considered when v_count is 1 and the vnode has
	 * no cached data; the same conditions are re-checked after the link
	 * has been destroyed.
	 */
1825219089Spjd	VI_LOCK(vp);
1826219089Spjd	may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp);
1827219089Spjd	VI_UNLOCK(vp);
1828168962Spjd
1829168404Spjd	/*
1830168404Spjd	 * We may delete the znode now, or we may put it in the unlinked set;
1831168404Spjd	 * it depends on whether we're the last link, and on whether there are
1832168404Spjd	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
1833168404Spjd	 * allow for either case.
1834168404Spjd	 */
1835219089Spjd	obj = zp->z_id;
1836168404Spjd	tx = dmu_tx_create(zfsvfs->z_os);
1837168404Spjd	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1838219089Spjd	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1839219089Spjd	zfs_sa_upgrade_txholds(tx, zp);
1840219089Spjd	zfs_sa_upgrade_txholds(tx, dzp);
1841185029Spjd	if (may_delete_now) {
1842185029Spjd		toobig =
1843219089Spjd		    zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
1844185029Spjd		/* if the file is too big, only hold_free a token amount */
1845185029Spjd		dmu_tx_hold_free(tx, zp->z_id, 0,
1846185029Spjd		    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
1847185029Spjd	}
1848168404Spjd
1849168404Spjd	/* are there any extended attributes? */
1850219089Spjd	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1851219089Spjd	    &xattr_obj, sizeof (xattr_obj));
1852219089Spjd	if (error == 0 && xattr_obj) {
		/* xzp stays held until the VN_RELE() on the error or out path. */
1853219089Spjd		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1854240415Smm		ASSERT0(error);
1855219089Spjd		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1856219089Spjd		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1857168404Spjd	}
1858168404Spjd
	/*
	 * Remember the external ACL object seen now; delete_now below is
	 * only set if zfs_external_acl() still returns the same value.
	 */
1859219089Spjd	mutex_enter(&zp->z_lock);
1860219089Spjd	if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
1861168962Spjd		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
1862219089Spjd	mutex_exit(&zp->z_lock);
1863168962Spjd
1864168404Spjd	/* charge as an update -- would be nice not to charge at all */
1865168404Spjd	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1866168404Spjd
1867209962Smm	error = dmu_tx_assign(tx, TXG_NOWAIT);
1868168404Spjd	if (error) {
		/* Undo this attempt's lock and holds before retrying or failing. */
1869168404Spjd		zfs_dirent_unlock(dl);
1870168962Spjd		VN_RELE(vp);
1871219089Spjd		if (xzp)
1872219089Spjd			VN_RELE(ZTOV(xzp));
1873209962Smm		if (error == ERESTART) {
1874168404Spjd			dmu_tx_wait(tx);
1875168404Spjd			dmu_tx_abort(tx);
1876168404Spjd			goto top;
1877168404Spjd		}
1878185029Spjd		if (realnmp)
1879185029Spjd			pn_free(realnmp);
1880168404Spjd		dmu_tx_abort(tx);
1881168404Spjd		ZFS_EXIT(zfsvfs);
1882168404Spjd		return (error);
1883168404Spjd	}
1884168404Spjd
1885168404Spjd	/*
1886168404Spjd	 * Remove the directory entry.
1887168404Spjd	 */
1888185029Spjd	error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
1889168404Spjd
1890168404Spjd	if (error) {
1891168404Spjd		dmu_tx_commit(tx);
1892168404Spjd		goto out;
1893168404Spjd	}
1894168404Spjd
1895219089Spjd	if (unlinked) {
1896219089Spjd
1897219089Spjd		/*
1898219089Spjd		 * Hold z_lock so that we can make sure that the ACL obj
1899219089Spjd		 * hasn't changed.  Could have been deleted due to
1900219089Spjd		 * zfs_sa_upgrade().
1901219089Spjd		 */
1902219089Spjd		mutex_enter(&zp->z_lock);
1903168962Spjd		VI_LOCK(vp);
1904219089Spjd		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1905219089Spjd		    &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
1906185029Spjd		delete_now = may_delete_now && !toobig &&
1907168962Spjd		    vp->v_count == 1 && !vn_has_cached_data(vp) &&
1908219089Spjd		    xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) ==
1909219089Spjd		    acl_obj;
1910168962Spjd		VI_UNLOCK(vp);
		/* z_lock is still held here; both branches below release it. */
1911168962Spjd	}
1912168962Spjd
1913168962Spjd	if (delete_now) {
1914219089Spjd		if (xattr_obj_unlinked) {
			/*
			 * Mark the xattr znode unlinked with zero links, add
			 * it to the unlinked set, then clear the XATTR
			 * attribute on zp.
			 */
1915219089Spjd			ASSERT3U(xzp->z_links, ==, 2);
1916168962Spjd			mutex_enter(&xzp->z_lock);
1917168962Spjd			xzp->z_unlinked = 1;
1918219089Spjd			xzp->z_links = 0;
1919219089Spjd			error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
1920219089Spjd			    &xzp->z_links, sizeof (xzp->z_links), tx);
1921219089Spjd			ASSERT3U(error,  ==,  0);
1922168962Spjd			mutex_exit(&xzp->z_lock);
1923168962Spjd			zfs_unlinked_add(xzp, tx);
1924219089Spjd
1925219089Spjd			if (zp->z_is_sa)
1926219089Spjd				error = sa_remove(zp->z_sa_hdl,
1927219089Spjd				    SA_ZPL_XATTR(zfsvfs), tx);
1928219089Spjd			else
1929219089Spjd				error = sa_update(zp->z_sa_hdl,
1930219089Spjd				    SA_ZPL_XATTR(zfsvfs), &null_xattr,
1931219089Spjd				    sizeof (uint64_t), tx);
1932240415Smm			ASSERT0(error);
1933168962Spjd		}
		/*
		 * Drop the vnode reference by hand rather than through
		 * VN_RELE(); the VN_RELE(vp) at "out" is skipped when
		 * delete_now is set.
		 */
1934168962Spjd		VI_LOCK(vp);
1935168962Spjd		vp->v_count--;
1936240415Smm		ASSERT0(vp->v_count);
1937168962Spjd		VI_UNLOCK(vp);
1938168962Spjd		mutex_exit(&zp->z_lock);
1939168962Spjd		zfs_znode_delete(zp, tx);
1940168962Spjd	} else if (unlinked) {
1941219089Spjd		mutex_exit(&zp->z_lock);
1942168404Spjd		zfs_unlinked_add(zp, tx);
1943168962Spjd	}
1944168404Spjd
	/* Log the removal; TX_CI is added for case-insensitive requests. */
1945185029Spjd	txtype = TX_REMOVE;
1946185029Spjd	if (flags & FIGNORECASE)
1947185029Spjd		txtype |= TX_CI;
1948219089Spjd	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
1949168404Spjd
1950168404Spjd	dmu_tx_commit(tx);
	/*
	 * Common exit: reached on success and from the error paths that
	 * still hold the directory entry lock.
	 */
1951168404Spjdout:
1952185029Spjd	if (realnmp)
1953185029Spjd		pn_free(realnmp);
1954185029Spjd
1955168404Spjd	zfs_dirent_unlock(dl);
1956168404Spjd
1957219089Spjd	if (!delete_now)
1958168962Spjd		VN_RELE(vp);
1959219089Spjd	if (xzp)
1960168962Spjd		VN_RELE(ZTOV(xzp));
1961168962Spjd
	/* Commit the intent log now when os_sync is ZFS_SYNC_ALWAYS. */
1962219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1963219089Spjd		zil_commit(zilog, 0);
1964219089Spjd
1965168404Spjd	ZFS_EXIT(zfsvfs);
1966168404Spjd	return (error);
1967168404Spjd}
1968168404Spjd
1969168404Spjd/*
1970168404Spjd * Create a new directory and insert it into dvp using the name
1971168404Spjd * provided.  Return a pointer to the inserted directory.
1972168404Spjd *
1973168404Spjd *	IN:	dvp	- vnode of directory to add subdir to.
1974168404Spjd *		dirname	- name of new directory.
1975168404Spjd *		vap	- attributes of new directory.
1976168404Spjd *		cr	- credentials of caller.
1977185029Spjd *		ct	- caller context
1978185029Spjd *		vsecp	- ACL to be set
1979168404Spjd *
1980168404Spjd *	OUT:	vpp	- vnode of created directory.
1981168404Spjd *
1982168404Spjd *	RETURN:	0 if success
1983168404Spjd *		error code if failure
1984168404Spjd *
1985168404Spjd * Timestamps:
1986168404Spjd *	dvp - ctime|mtime updated
1987168404Spjd *	 vp - ctime|mtime|atime updated
1988168404Spjd */
1989185029Spjd/*ARGSUSED*/
1990168404Spjdstatic int
1991185029Spjdzfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
1992185029Spjd    caller_context_t *ct, int flags, vsecattr_t *vsecp)
1993168404Spjd{
1994168404Spjd	znode_t		*zp, *dzp = VTOZ(dvp);
1995168404Spjd	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1996185029Spjd	zilog_t		*zilog;
1997168404Spjd	zfs_dirlock_t	*dl;
1998185029Spjd	uint64_t	txtype;
1999168404Spjd	dmu_tx_t	*tx;
2000168404Spjd	int		error;
2001185029Spjd	int		zf = ZNEW;
2002209962Smm	ksid_t		*ksid;
2003209962Smm	uid_t		uid;
2004209962Smm	gid_t		gid = crgetgid(cr);
2005219089Spjd	zfs_acl_ids_t   acl_ids;
2006209962Smm	boolean_t	fuid_dirtied;
2007168404Spjd
2008168404Spjd	ASSERT(vap->va_type == VDIR);
2009168404Spjd
2010185029Spjd	/*
2011185029Spjd	 * If we have an ephemeral id, ACL, or XVATTR then
2012185029Spjd	 * make sure file system is at proper version
2013185029Spjd	 */
2014185029Spjd
2015209962Smm	ksid = crgetsid(cr, KSID_OWNER);
2016209962Smm	if (ksid)
2017209962Smm		uid = ksid_getid(ksid);
2018209962Smm	else
2019209962Smm		uid = crgetuid(cr);
2020185029Spjd	if (zfsvfs->z_use_fuids == B_FALSE &&
2021219089Spjd	    (vsecp || (vap->va_mask & AT_XVATTR) ||
2022219089Spjd	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
2023185029Spjd		return (EINVAL);
2024185029Spjd
2025168404Spjd	ZFS_ENTER(zfsvfs);
2026185029Spjd	ZFS_VERIFY_ZP(dzp);
2027185029Spjd	zilog = zfsvfs->z_log;
2028168404Spjd
2029219089Spjd	if (dzp->z_pflags & ZFS_XATTR) {
2030168404Spjd		ZFS_EXIT(zfsvfs);
2031168404Spjd		return (EINVAL);
2032168404Spjd	}
2033168404Spjd
2034185029Spjd	if (zfsvfs->z_utf8 && u8_validate(dirname,
2035185029Spjd	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2036185029Spjd		ZFS_EXIT(zfsvfs);
2037185029Spjd		return (EILSEQ);
2038185029Spjd	}
2039185029Spjd	if (flags & FIGNORECASE)
2040185029Spjd		zf |= ZCILOOK;
2041185029Spjd
2042219089Spjd	if (vap->va_mask & AT_XVATTR) {
2043197861Spjd		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
2044185029Spjd		    crgetuid(cr), cr, vap->va_type)) != 0) {
2045185029Spjd			ZFS_EXIT(zfsvfs);
2046185029Spjd			return (error);
2047185029Spjd		}
2048219089Spjd	}
2049185029Spjd
2050219089Spjd	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
2051219089Spjd	    vsecp, &acl_ids)) != 0) {
2052219089Spjd		ZFS_EXIT(zfsvfs);
2053219089Spjd		return (error);
2054219089Spjd	}
2055168404Spjd	/*
2056168404Spjd	 * First make sure the new directory doesn't exist.
2057219089Spjd	 *
2058219089Spjd	 * Existence is checked first to make sure we don't return
2059219089Spjd	 * EACCES instead of EEXIST which can cause some applications
2060219089Spjd	 * to fail.
2061168404Spjd	 */
2062185029Spjdtop:
2063185029Spjd	*vpp = NULL;
2064185029Spjd
2065185029Spjd	if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
2066185029Spjd	    NULL, NULL)) {
2067219089Spjd		zfs_acl_ids_free(&acl_ids);
2068168404Spjd		ZFS_EXIT(zfsvfs);
2069168404Spjd		return (error);
2070168404Spjd	}
2071168404Spjd
2072185029Spjd	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
2073219089Spjd		zfs_acl_ids_free(&acl_ids);
2074168404Spjd		zfs_dirent_unlock(dl);
2075168404Spjd		ZFS_EXIT(zfsvfs);
2076168404Spjd		return (error);
2077168404Spjd	}
2078168404Spjd
2079209962Smm	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
2080211932Smm		zfs_acl_ids_free(&acl_ids);
2081209962Smm		zfs_dirent_unlock(dl);
2082209962Smm		ZFS_EXIT(zfsvfs);
2083209962Smm		return (EDQUOT);
2084209962Smm	}
2085209962Smm
2086168404Spjd	/*
2087168404Spjd	 * Add a new entry to the directory.
2088168404Spjd	 */
2089168404Spjd	tx = dmu_tx_create(zfsvfs->z_os);
2090168404Spjd	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
2091168404Spjd	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2092209962Smm	fuid_dirtied = zfsvfs->z_fuid_dirty;
2093209962Smm	if (fuid_dirtied)
2094209962Smm		zfs_fuid_txhold(zfsvfs, tx);
2095219089Spjd	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2096219089Spjd		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
2097219089Spjd		    acl_ids.z_aclp->z_acl_bytes);
2098219089Spjd	}
2099219089Spjd
2100219089Spjd	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
2101219089Spjd	    ZFS_SA_BASE_ATTR_SIZE);
2102219089Spjd
2103209962Smm	error = dmu_tx_assign(tx, TXG_NOWAIT);
2104168404Spjd	if (error) {
2105168404Spjd		zfs_dirent_unlock(dl);
2106209962Smm		if (error == ERESTART) {
2107168404Spjd			dmu_tx_wait(tx);
2108168404Spjd			dmu_tx_abort(tx);
2109168404Spjd			goto top;
2110168404Spjd		}
2111219089Spjd		zfs_acl_ids_free(&acl_ids);
2112168404Spjd		dmu_tx_abort(tx);
2113168404Spjd		ZFS_EXIT(zfsvfs);
2114168404Spjd		return (error);
2115168404Spjd	}
2116168404Spjd
2117168404Spjd	/*
2118168404Spjd	 * Create new node.
2119168404Spjd	 */
2120219089Spjd	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
2121168404Spjd
2122209962Smm	if (fuid_dirtied)
2123209962Smm		zfs_fuid_sync(zfsvfs, tx);
2124219089Spjd
2125168404Spjd	/*
2126168404Spjd	 * Now put new name in parent dir.
2127168404Spjd	 */
2128168404Spjd	(void) zfs_link_create(dl, zp, tx, ZNEW);
2129168404Spjd
2130168404Spjd	*vpp = ZTOV(zp);
2131168404Spjd
2132185029Spjd	txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
2133185029Spjd	if (flags & FIGNORECASE)
2134185029Spjd		txtype |= TX_CI;
2135209962Smm	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
2136209962Smm	    acl_ids.z_fuidp, vap);
2137185029Spjd
2138209962Smm	zfs_acl_ids_free(&acl_ids);
2139219089Spjd
2140168404Spjd	dmu_tx_commit(tx);
2141168404Spjd
2142168404Spjd	zfs_dirent_unlock(dl);
2143168404Spjd
2144219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2145219089Spjd		zil_commit(zilog, 0);
2146219089Spjd
2147168404Spjd	ZFS_EXIT(zfsvfs);
2148168404Spjd	return (0);
2149168404Spjd}
2150168404Spjd
2151168404Spjd/*
2152168404Spjd * Remove a directory subdir entry.  If the current working
2153168404Spjd * directory is the same as the subdir to be removed, the
2154168404Spjd * remove will fail.
2155168404Spjd *
2156168404Spjd *	IN:	dvp	- vnode of directory to remove from.
2157168404Spjd *		name	- name of directory to be removed.
2158168404Spjd *		cwd	- vnode of current working directory.
2159168404Spjd *		cr	- credentials of caller.
2160185029Spjd *		ct	- caller context
2161185029Spjd *		flags	- case flags
2162168404Spjd *
2163168404Spjd *	RETURN:	0 if success
2164168404Spjd *		error code if failure
2165168404Spjd *
2166168404Spjd * Timestamps:
2167168404Spjd *	dvp - ctime|mtime updated
2168168404Spjd */
2169185029Spjd/*ARGSUSED*/
2170168404Spjdstatic int
2171185029Spjdzfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
2172185029Spjd    caller_context_t *ct, int flags)
2173168404Spjd{
2174168404Spjd	znode_t		*dzp = VTOZ(dvp);
2175168404Spjd	znode_t		*zp;
2176168404Spjd	vnode_t		*vp;
2177168404Spjd	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2178185029Spjd	zilog_t		*zilog;
2179168404Spjd	zfs_dirlock_t	*dl;
2180168404Spjd	dmu_tx_t	*tx;
2181168404Spjd	int		error;
2182185029Spjd	int		zflg = ZEXISTS;
2183168404Spjd
2184168962Spjd	ZFS_ENTER(zfsvfs);
2185185029Spjd	ZFS_VERIFY_ZP(dzp);
2186185029Spjd	zilog = zfsvfs->z_log;
2187168404Spjd
2188185029Spjd	if (flags & FIGNORECASE)
2189185029Spjd		zflg |= ZCILOOK;
2190168404Spjdtop:
2191168404Spjd	zp = NULL;
2192168404Spjd
2193168404Spjd	/*
2194168404Spjd	 * Attempt to lock directory; fail if entry doesn't exist.
2195168404Spjd	 */
2196185029Spjd	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
2197185029Spjd	    NULL, NULL)) {
2198168404Spjd		ZFS_EXIT(zfsvfs);
2199168404Spjd		return (error);
2200168404Spjd	}
2201168404Spjd
2202168404Spjd	vp = ZTOV(zp);
2203168404Spjd
2204168404Spjd	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
2205168404Spjd		goto out;
2206168404Spjd	}
2207168404Spjd
2208168962Spjd	if (vp->v_type != VDIR) {
2209168962Spjd		error = ENOTDIR;
2210168962Spjd		goto out;
2211168962Spjd	}
2212168962Spjd
2213168962Spjd	if (vp == cwd) {
2214168962Spjd		error = EINVAL;
2215168962Spjd		goto out;
2216168962Spjd	}
2217168962Spjd
2218185029Spjd	vnevent_rmdir(vp, dvp, name, ct);
2219168962Spjd
2220168404Spjd	/*
2221168404Spjd	 * Grab a lock on the directory to make sure that noone is
2222168404Spjd	 * trying to add (or lookup) entries while we are removing it.
2223168404Spjd	 */
2224168404Spjd	rw_enter(&zp->z_name_lock, RW_WRITER);
2225168404Spjd
2226168404Spjd	/*
2227168404Spjd	 * Grab a lock on the parent pointer to make sure we play well
2228168404Spjd	 * with the treewalk and directory rename code.
2229168404Spjd	 */
2230168404Spjd	rw_enter(&zp->z_parent_lock, RW_WRITER);
2231168404Spjd
2232168404Spjd	tx = dmu_tx_create(zfsvfs->z_os);
2233168404Spjd	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2234219089Spjd	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2235168404Spjd	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2236219089Spjd	zfs_sa_upgrade_txholds(tx, zp);
2237219089Spjd	zfs_sa_upgrade_txholds(tx, dzp);
2238209962Smm	error = dmu_tx_assign(tx, TXG_NOWAIT);
2239168404Spjd	if (error) {
2240168404Spjd		rw_exit(&zp->z_parent_lock);
2241168404Spjd		rw_exit(&zp->z_name_lock);
2242168404Spjd		zfs_dirent_unlock(dl);
2243168962Spjd		VN_RELE(vp);
2244209962Smm		if (error == ERESTART) {
2245168404Spjd			dmu_tx_wait(tx);
2246168404Spjd			dmu_tx_abort(tx);
2247168404Spjd			goto top;
2248168404Spjd		}
2249168404Spjd		dmu_tx_abort(tx);
2250168404Spjd		ZFS_EXIT(zfsvfs);
2251168404Spjd		return (error);
2252168404Spjd	}
2253168404Spjd
2254168404Spjd#ifdef FREEBSD_NAMECACHE
2255168404Spjd	cache_purge(dvp);
2256168404Spjd#endif
2257168404Spjd
2258185029Spjd	error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
2259168404Spjd
2260185029Spjd	if (error == 0) {
2261185029Spjd		uint64_t txtype = TX_RMDIR;
2262185029Spjd		if (flags & FIGNORECASE)
2263185029Spjd			txtype |= TX_CI;
2264219089Spjd		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
2265185029Spjd	}
2266168404Spjd
2267168404Spjd	dmu_tx_commit(tx);
2268168404Spjd
2269168404Spjd	rw_exit(&zp->z_parent_lock);
2270168404Spjd	rw_exit(&zp->z_name_lock);
2271168404Spjd#ifdef FREEBSD_NAMECACHE
2272168404Spjd	cache_purge(vp);
2273168404Spjd#endif
2274168404Spjdout:
2275168404Spjd	zfs_dirent_unlock(dl);
2276168404Spjd
2277168962Spjd	VN_RELE(vp);
2278168962Spjd
2279219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2280219089Spjd		zil_commit(zilog, 0);
2281219089Spjd
2282168404Spjd	ZFS_EXIT(zfsvfs);
2283168404Spjd	return (error);
2284168404Spjd}
2285168404Spjd
2286168404Spjd/*
2287168404Spjd * Read as many directory entries as will fit into the provided
2288168404Spjd * buffer from the given directory cursor position (specified in
2289168404Spjd * the uio structure).
2290168404Spjd *
2291168404Spjd *	IN:	vp	- vnode of directory to read.
2292168404Spjd *		uio	- structure supplying read location, range info,
2293168404Spjd *			  and return buffer.
2294168404Spjd *		cr	- credentials of caller.
2295185029Spjd *		ct	- caller context
2296185029Spjd *		flags	- case flags
 *
 *	NOTE: ct and flags are not parameters of this function's
 *	signature; flags is a local variable initialized to 0.
2297168404Spjd *
2298168404Spjd *	OUT:	uio	- updated offset and range, buffer filled.
2299168404Spjd *		eofp	- set to true if end-of-file detected.
 *		ncookies - if non-NULL, set to the number of cookies
 *			  stored in *cookies.
 *		cookies	- when ncookies is non-NULL, set to an array
 *			  allocated with malloc(M_TEMP) holding one offset
 *			  per entry; freed and set to NULL here on error.
2300168404Spjd *
2301168404Spjd *	RETURN:	0 if success
2302168404Spjd *		error code if failure
2303168404Spjd *
2304168404Spjd * Timestamps:
2305168404Spjd *	vp - atime updated
2306168404Spjd *
2307168404Spjd * Note that the low 4 bits of the cookie returned by zap is always zero.
2308168404Spjd * This allows us to use the low range for "special" directory entries:
2309168404Spjd * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
2310168404Spjd * we use the offset 2 for the '.zfs' directory.
2311168404Spjd */
2312168404Spjd/* ARGSUSED */
2313168404Spjdstatic int
2314168962Spjdzfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
2315168404Spjd{
2316168404Spjd	znode_t		*zp = VTOZ(vp);
2317168404Spjd	iovec_t		*iovp;
2318185029Spjd	edirent_t	*eodp;
2319168404Spjd	dirent64_t	*odp;
2320168404Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2321168404Spjd	objset_t	*os;
2322168404Spjd	caddr_t		outbuf;
2323168404Spjd	size_t		bufsize;
2324168404Spjd	zap_cursor_t	zc;
2325168404Spjd	zap_attribute_t	zap;
2326168404Spjd	uint_t		bytes_wanted;
2327168404Spjd	uint64_t	offset; /* must be unsigned; checks for < 1 */
2328219089Spjd	uint64_t	parent;
2329168404Spjd	int		local_eof;
2330168404Spjd	int		outcount;
2331168404Spjd	int		error;
2332168404Spjd	uint8_t		prefetch;
2333185029Spjd	boolean_t	check_sysattrs;
2334168404Spjd	uint8_t		type;
2335168962Spjd	int		ncooks;
2336168962Spjd	u_long		*cooks = NULL;
2337185029Spjd	int		flags = 0;
2338168404Spjd
2339168404Spjd	ZFS_ENTER(zfsvfs);
2340185029Spjd	ZFS_VERIFY_ZP(zp);
2341168404Spjd
	/* Fetch the parent object number; it is reported for the ".." entry. */
2342219089Spjd	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
2343219089Spjd	    &parent, sizeof (parent))) != 0) {
2344219089Spjd		ZFS_EXIT(zfsvfs);
2345219089Spjd		return (error);
2346219089Spjd	}
2347219089Spjd
2348168404Spjd	/*
2349168404Spjd	 * If we are not given an eof variable,
2350168404Spjd	 * use a local one.
2351168404Spjd	 */
2352168404Spjd	if (eofp == NULL)
2353168404Spjd		eofp = &local_eof;
2354168404Spjd
2355168404Spjd	/*
2356168404Spjd	 * Check for valid iov_len.
2357168404Spjd	 */
2358168404Spjd	if (uio->uio_iov->iov_len <= 0) {
2359168404Spjd		ZFS_EXIT(zfsvfs);
2360168404Spjd		return (EINVAL);
2361168404Spjd	}
2362168404Spjd
2363168404Spjd	/*
2364168404Spjd	 * Quit if directory has been removed (posix)
2365168404Spjd	 */
2366168404Spjd	if ((*eofp = zp->z_unlinked) != 0) {
2367168404Spjd		ZFS_EXIT(zfsvfs);
2368168404Spjd		return (0);
2369168404Spjd	}
2370168404Spjd
2371168404Spjd	error = 0;
2372168404Spjd	os = zfsvfs->z_os;
2373168404Spjd	offset = uio->uio_loffset;
2374168404Spjd	prefetch = zp->z_zn_prefetch;
2375168404Spjd
2376168404Spjd	/*
2377168404Spjd	 * Initialize the iterator cursor.
2378168404Spjd	 */
2379168404Spjd	if (offset <= 3) {
2380168404Spjd		/*
2381168404Spjd		 * Start iteration from the beginning of the directory.
2382168404Spjd		 */
2383168404Spjd		zap_cursor_init(&zc, os, zp->z_id);
2384168404Spjd	} else {
2385168404Spjd		/*
2386168404Spjd		 * The offset is a serialized cursor.
2387168404Spjd		 */
2388168404Spjd		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
2389168404Spjd	}
2390168404Spjd
2391168404Spjd	/*
2392168404Spjd	 * Get space to change directory entries into fs independent format.
	 *
	 * A temporary buffer is allocated unless the uio is a single
	 * UIO_SYSSPACE iovec, in which case entries are built directly in
	 * the caller's buffer.  Only the first iovec's length is used as
	 * the amount wanted.
2393168404Spjd	 */
2394168404Spjd	iovp = uio->uio_iov;
2395168404Spjd	bytes_wanted = iovp->iov_len;
2396168404Spjd	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
2397168404Spjd		bufsize = bytes_wanted;
2398168404Spjd		outbuf = kmem_alloc(bufsize, KM_SLEEP);
2399168404Spjd		odp = (struct dirent64 *)outbuf;
2400168404Spjd	} else {
2401168404Spjd		bufsize = bytes_wanted;
2402168404Spjd		odp = (struct dirent64 *)iovp->iov_base;
2403168404Spjd	}
2404185029Spjd	eodp = (struct edirent *)odp;
2405168404Spjd
	/*
	 * When cookies are requested, size the array for the largest
	 * possible number of entries and publish it to the caller right
	 * away; *ncookies is trimmed to the number used after the loop.
	 */
2406169170Spjd	if (ncookies != NULL) {
2407168404Spjd		/*
2408168404Spjd		 * Minimum entry size is dirent size and 1 byte for a file name.
2409168404Spjd		 */
2410168962Spjd		ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
2411219404Spjd		cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
2412219404Spjd		*cookies = cooks;
2413168962Spjd		*ncookies = ncooks;
2414168404Spjd	}
2415185029Spjd	/*
2416185029Spjd	 * If this VFS supports the system attribute view interface; and
2417185029Spjd	 * we're looking at an extended attribute directory; and we care
2418185029Spjd	 * about normalization conflicts on this vfs; then we must check
2419185029Spjd	 * for normalization conflicts with the sysattr name space.
2420185029Spjd	 */
2421185029Spjd#ifdef TODO
2422185029Spjd	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
2423185029Spjd	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
2424185029Spjd	    (flags & V_RDDIR_ENTFLAGS);
2425185029Spjd#else
2426185029Spjd	check_sysattrs = 0;
2427185029Spjd#endif
2428168404Spjd
2429168404Spjd	/*
2430168404Spjd	 * Transform to file-system independent format
2431168404Spjd	 */
2432168404Spjd	outcount = 0;
2433168404Spjd	while (outcount < bytes_wanted) {
2434168404Spjd		ino64_t objnum;
2435168404Spjd		ushort_t reclen;
2436219089Spjd		off64_t *next = NULL;
2437168404Spjd
2438168404Spjd		/*
2439168404Spjd		 * Special case `.', `..', and `.zfs'.
2440168404Spjd		 */
2441168404Spjd		if (offset == 0) {
2442168404Spjd			(void) strcpy(zap.za_name, ".");
2443185029Spjd			zap.za_normalization_conflict = 0;
2444168404Spjd			objnum = zp->z_id;
2445169108Spjd			type = DT_DIR;
2446168404Spjd		} else if (offset == 1) {
2447168404Spjd			(void) strcpy(zap.za_name, "..");
2448185029Spjd			zap.za_normalization_conflict = 0;
2449219089Spjd			objnum = parent;
2450169108Spjd			type = DT_DIR;
2451168404Spjd		} else if (offset == 2 && zfs_show_ctldir(zp)) {
2452168404Spjd			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
2453185029Spjd			zap.za_normalization_conflict = 0;
2454168404Spjd			objnum = ZFSCTL_INO_ROOT;
2455169108Spjd			type = DT_DIR;
2456168404Spjd		} else {
2457168404Spjd			/*
2458168404Spjd			 * Grab next entry.
			 *
			 * ENOENT from the cursor means end of directory:
			 * *eofp is set and the loop ends normally.  Any
			 * other error skips straight to cleanup.
2459168404Spjd			 */
2460168404Spjd			if (error = zap_cursor_retrieve(&zc, &zap)) {
2461168404Spjd				if ((*eofp = (error == ENOENT)) != 0)
2462168404Spjd					break;
2463168404Spjd				else
2464168404Spjd					goto update;
2465168404Spjd			}
2466168404Spjd
			/* A directory entry must be exactly one 8-byte integer. */
2467168404Spjd			if (zap.za_integer_length != 8 ||
2468168404Spjd			    zap.za_num_integers != 1) {
2469168404Spjd				cmn_err(CE_WARN, "zap_readdir: bad directory "
2470168404Spjd				    "entry, obj = %lld, offset = %lld\n",
2471168404Spjd				    (u_longlong_t)zp->z_id,
2472168404Spjd				    (u_longlong_t)offset);
2473168404Spjd				error = ENXIO;
2474168404Spjd				goto update;
2475168404Spjd			}
2476168404Spjd
2477168404Spjd			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
2478168404Spjd			/*
2479168404Spjd			 * MacOS X can extract the object type here such as:
2480168404Spjd			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2481168404Spjd			 */
2482168404Spjd			type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2483185029Spjd
2484185029Spjd			if (check_sysattrs && !zap.za_normalization_conflict) {
2485185029Spjd#ifdef TODO
2486185029Spjd				zap.za_normalization_conflict =
2487185029Spjd				    xattr_sysattr_casechk(zap.za_name);
2488185029Spjd#else
2489185029Spjd				panic("%s:%u: TODO", __func__, __LINE__);
2490185029Spjd#endif
2491185029Spjd			}
2492168404Spjd		}
2493168404Spjd
		/*
		 * NOTE: flags is a local initialized to 0 and is not assigned
		 * anywhere in this function, so the V_RDDIR_ACCFILTER and
		 * V_RDDIR_ENTFLAGS branches below are not taken here.
		 */
2494211932Smm		if (flags & V_RDDIR_ACCFILTER) {
2495211932Smm			/*
2496211932Smm			 * If we have no access at all, don't include
2497211932Smm			 * this entry in the returned information
2498211932Smm			 */
2499211932Smm			znode_t	*ezp;
2500211932Smm			if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
2501211932Smm				goto skip_entry;
2502211932Smm			if (!zfs_has_access(ezp, cr)) {
2503211932Smm				VN_RELE(ZTOV(ezp));
2504211932Smm				goto skip_entry;
2505211932Smm			}
2506211932Smm			VN_RELE(ZTOV(ezp));
2507211932Smm		}
2508211932Smm
2509185029Spjd		if (flags & V_RDDIR_ENTFLAGS)
2510185029Spjd			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
2511185029Spjd		else
2512185029Spjd			reclen = DIRENT64_RECLEN(strlen(zap.za_name));
2513185029Spjd
2514168404Spjd		/*
2515168404Spjd		 * Will this entry fit in the buffer?
2516168404Spjd		 */
2517168404Spjd		if (outcount + reclen > bufsize) {
2518168404Spjd			/*
2519168404Spjd			 * Did we manage to fit anything in the buffer?
2520168404Spjd			 */
2521168404Spjd			if (!outcount) {
2522168404Spjd				error = EINVAL;
2523168404Spjd				goto update;
2524168404Spjd			}
2525168404Spjd			break;
2526168404Spjd		}
2527185029Spjd		if (flags & V_RDDIR_ENTFLAGS) {
2528185029Spjd			/*
2529185029Spjd			 * Add extended flag entry:
2530185029Spjd			 */
2531185029Spjd			eodp->ed_ino = objnum;
2532185029Spjd			eodp->ed_reclen = reclen;
2533185029Spjd			/* NOTE: ed_off is the offset for the *next* entry */
2534185029Spjd			next = &(eodp->ed_off);
2535185029Spjd			eodp->ed_eflags = zap.za_normalization_conflict ?
2536185029Spjd			    ED_CASE_CONFLICT : 0;
2537185029Spjd			(void) strncpy(eodp->ed_name, zap.za_name,
2538185029Spjd			    EDIRENT_NAMELEN(reclen));
2539185029Spjd			eodp = (edirent_t *)((intptr_t)eodp + reclen);
2540185029Spjd		} else {
2541185029Spjd			/*
2542185029Spjd			 * Add normal entry:
2543185029Spjd			 */
2544185029Spjd			odp->d_ino = objnum;
2545185029Spjd			odp->d_reclen = reclen;
2546185029Spjd			odp->d_namlen = strlen(zap.za_name);
2547185029Spjd			(void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
2548185029Spjd			odp->d_type = type;
2549185029Spjd			odp = (dirent64_t *)((intptr_t)odp + reclen);
2550185029Spjd		}
2551168404Spjd		outcount += reclen;
2552168404Spjd
2553168404Spjd		ASSERT(outcount <= bufsize);
2554168404Spjd
2555168404Spjd		/* Prefetch znode */
2556168404Spjd		if (prefetch)
2557168404Spjd			dmu_prefetch(os, objnum, 0, 0);
2558168404Spjd
2559211932Smm	skip_entry:
2560168404Spjd		/*
2561168404Spjd		 * Move to the next entry, fill in the previous offset.
		 *
		 * Offsets 0..2 are the synthetic entries and simply count
		 * up; past them the offset is the serialized zap cursor.
2562168404Spjd		 */
2563168404Spjd		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2564168404Spjd			zap_cursor_advance(&zc);
2565168404Spjd			offset = zap_cursor_serialize(&zc);
2566168404Spjd		} else {
2567168404Spjd			offset += 1;
2568168404Spjd		}
2569219404Spjd
		/* Record the offset that follows this entry as its cookie. */
2570219404Spjd		if (cooks != NULL) {
2571219404Spjd			*cooks++ = offset;
2572219404Spjd			ncooks--;
2573219404Spjd			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
2574219404Spjd		}
2575168404Spjd	}
2576168404Spjd	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2577168404Spjd
2578168404Spjd	/* Subtract unused cookies */
2579168962Spjd	if (ncookies != NULL)
2580168962Spjd		*ncookies -= ncooks;
2581168404Spjd
	/*
	 * Entries built directly in the caller's buffer only need the
	 * iovec and residual adjusted; otherwise the temporary buffer is
	 * copied out with uiomove().
	 */
2582168404Spjd	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
2583168404Spjd		iovp->iov_base += outcount;
2584168404Spjd		iovp->iov_len -= outcount;
2585168404Spjd		uio->uio_resid -= outcount;
2586168404Spjd	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
2587168404Spjd		/*
2588168404Spjd		 * Reset the pointer.
2589168404Spjd		 */
2590168404Spjd		offset = uio->uio_loffset;
2591168404Spjd	}
2592168404Spjd
	/* Common cleanup, also the target of the error gotos in the loop. */
2593168404Spjdupdate:
2594168404Spjd	zap_cursor_fini(&zc);
2595168404Spjd	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
2596168404Spjd		kmem_free(outbuf, bufsize);
2597168404Spjd
	/* ENOENT only signalled end of directory; it is not returned. */
2598168404Spjd	if (error == ENOENT)
2599168404Spjd		error = 0;
2600168404Spjd
2601168404Spjd	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2602168404Spjd
2603168404Spjd	uio->uio_loffset = offset;
2604168404Spjd	ZFS_EXIT(zfsvfs);
	/* On failure, release the cookie array and clear the outputs. */
2605169107Spjd	if (error != 0 && cookies != NULL) {
2606168962Spjd		free(*cookies, M_TEMP);
2607168962Spjd		*cookies = NULL;
2608168962Spjd		*ncookies = 0;
2609168404Spjd	}
2610168404Spjd	return (error);
2611168404Spjd}
2612168404Spjd
2613185029Spjdulong_t zfs_fsync_sync_cnt = 4;
2614185029Spjd
2615168404Spjdstatic int
2616185029Spjdzfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
2617168404Spjd{
2618168962Spjd	znode_t	*zp = VTOZ(vp);
2619168962Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2620168404Spjd
2621185029Spjd	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2622185029Spjd
2623219089Spjd	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
2624219089Spjd		ZFS_ENTER(zfsvfs);
2625219089Spjd		ZFS_VERIFY_ZP(zp);
2626219089Spjd		zil_commit(zfsvfs->z_log, zp->z_id);
2627219089Spjd		ZFS_EXIT(zfsvfs);
2628219089Spjd	}
2629168404Spjd	return (0);
2630168404Spjd}
2631168404Spjd
2632185029Spjd
2633168404Spjd/*
2634168404Spjd * Get the requested file attributes and place them in the provided
2635168404Spjd * vattr structure.
2636168404Spjd *
2637168404Spjd *	IN:	vp	- vnode of file.
2638168404Spjd *		vap	- va_mask identifies requested attributes.
2639185029Spjd *			  If AT_XVATTR set, then optional attrs are requested
2640185029Spjd *		flags	- ATTR_NOACLCHECK (CIFS server context)
2641168404Spjd *		cr	- credentials of caller.
2642185029Spjd *		ct	- caller context
2643168404Spjd *
2644168404Spjd *	OUT:	vap	- attribute values.
2645168404Spjd *
2646168404Spjd *	RETURN:	0 (always succeeds)
2647168404Spjd */
2648168404Spjd/* ARGSUSED */
2649168404Spjdstatic int
2650185029Spjdzfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2651185029Spjd    caller_context_t *ct)
2652168404Spjd{
2653168962Spjd	znode_t *zp = VTOZ(vp);
2654168962Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2655185029Spjd	int	error = 0;
2656168962Spjd	uint32_t blksize;
2657168962Spjd	u_longlong_t nblocks;
2658185029Spjd	uint64_t links;
2659224251Sdelphij	uint64_t mtime[2], ctime[2], crtime[2], rdev;
2660185029Spjd	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
2661185029Spjd	xoptattr_t *xoap = NULL;
2662185029Spjd	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2663224251Sdelphij	sa_bulk_attr_t bulk[4];
2664219089Spjd	int count = 0;
2665168404Spjd
2666168404Spjd	ZFS_ENTER(zfsvfs);
2667185029Spjd	ZFS_VERIFY_ZP(zp);
2668168404Spjd
2669219089Spjd	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
2670219089Spjd
2671219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
2672219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
2673219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &crtime, 16);
2674224251Sdelphij	if (vp->v_type == VBLK || vp->v_type == VCHR)
2675224251Sdelphij		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
2676224251Sdelphij		    &rdev, 8);
2677219089Spjd
2678219089Spjd	if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
2679219089Spjd		ZFS_EXIT(zfsvfs);
2680219089Spjd		return (error);
2681219089Spjd	}
2682219089Spjd
2683168404Spjd	/*
2684185029Spjd	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
2685185029Spjd	 * Also, if we are the owner don't bother, since owner should
2686185029Spjd	 * always be allowed to read basic attributes of file.
2687185029Spjd	 */
2688219089Spjd	if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
2689219089Spjd	    (vap->va_uid != crgetuid(cr))) {
2690185029Spjd		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
2691185029Spjd		    skipaclchk, cr)) {
2692185029Spjd			ZFS_EXIT(zfsvfs);
2693185029Spjd			return (error);
2694185029Spjd		}
2695185029Spjd	}
2696185029Spjd
2697185029Spjd	/*
2698168404Spjd	 * Return all attributes.  It's cheaper to provide the answer
2699168404Spjd	 * than to determine whether we were asked the question.
2700168404Spjd	 */
2701168404Spjd
2702209097Smm	mutex_enter(&zp->z_lock);
2703219089Spjd	vap->va_type = IFTOVT(zp->z_mode);
2704219089Spjd	vap->va_mode = zp->z_mode & ~S_IFMT;
2705224252Sdelphij#ifdef sun
2706224252Sdelphij	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
2707224252Sdelphij#else
2708224252Sdelphij	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
2709224252Sdelphij#endif
2710168404Spjd	vap->va_nodeid = zp->z_id;
2711185029Spjd	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
2712219089Spjd		links = zp->z_links + 1;
2713185029Spjd	else
2714219089Spjd		links = zp->z_links;
2715229425Sdim	vap->va_nlink = MIN(links, LINK_MAX);	/* nlink_t limit! */
2716219089Spjd	vap->va_size = zp->z_size;
2717224252Sdelphij#ifdef sun
2718224252Sdelphij	vap->va_rdev = vp->v_rdev;
2719224252Sdelphij#else
2720224251Sdelphij	if (vp->v_type == VBLK || vp->v_type == VCHR)
2721224251Sdelphij		vap->va_rdev = zfs_cmpldev(rdev);
2722224252Sdelphij#endif
2723168404Spjd	vap->va_seq = zp->z_seq;
2724168404Spjd	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */
2725168404Spjd
2726185029Spjd	/*
2727185029Spjd	 * Add in any requested optional attributes and the create time.
2728185029Spjd	 * Also set the corresponding bits in the returned attribute bitmap.
2729185029Spjd	 */
2730185029Spjd	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
2731185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
2732185029Spjd			xoap->xoa_archive =
2733219089Spjd			    ((zp->z_pflags & ZFS_ARCHIVE) != 0);
2734185029Spjd			XVA_SET_RTN(xvap, XAT_ARCHIVE);
2735185029Spjd		}
2736185029Spjd
2737185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
2738185029Spjd			xoap->xoa_readonly =
2739219089Spjd			    ((zp->z_pflags & ZFS_READONLY) != 0);
2740185029Spjd			XVA_SET_RTN(xvap, XAT_READONLY);
2741185029Spjd		}
2742185029Spjd
2743185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
2744185029Spjd			xoap->xoa_system =
2745219089Spjd			    ((zp->z_pflags & ZFS_SYSTEM) != 0);
2746185029Spjd			XVA_SET_RTN(xvap, XAT_SYSTEM);
2747185029Spjd		}
2748185029Spjd
2749185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
2750185029Spjd			xoap->xoa_hidden =
2751219089Spjd			    ((zp->z_pflags & ZFS_HIDDEN) != 0);
2752185029Spjd			XVA_SET_RTN(xvap, XAT_HIDDEN);
2753185029Spjd		}
2754185029Spjd
2755185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2756185029Spjd			xoap->xoa_nounlink =
2757219089Spjd			    ((zp->z_pflags & ZFS_NOUNLINK) != 0);
2758185029Spjd			XVA_SET_RTN(xvap, XAT_NOUNLINK);
2759185029Spjd		}
2760185029Spjd
2761185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2762185029Spjd			xoap->xoa_immutable =
2763219089Spjd			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
2764185029Spjd			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
2765185029Spjd		}
2766185029Spjd
2767185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2768185029Spjd			xoap->xoa_appendonly =
2769219089Spjd			    ((zp->z_pflags & ZFS_APPENDONLY) != 0);
2770185029Spjd			XVA_SET_RTN(xvap, XAT_APPENDONLY);
2771185029Spjd		}
2772185029Spjd
2773185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2774185029Spjd			xoap->xoa_nodump =
2775219089Spjd			    ((zp->z_pflags & ZFS_NODUMP) != 0);
2776185029Spjd			XVA_SET_RTN(xvap, XAT_NODUMP);
2777185029Spjd		}
2778185029Spjd
2779185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
2780185029Spjd			xoap->xoa_opaque =
2781219089Spjd			    ((zp->z_pflags & ZFS_OPAQUE) != 0);
2782185029Spjd			XVA_SET_RTN(xvap, XAT_OPAQUE);
2783185029Spjd		}
2784185029Spjd
2785185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2786185029Spjd			xoap->xoa_av_quarantined =
2787219089Spjd			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
2788185029Spjd			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
2789185029Spjd		}
2790185029Spjd
2791185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2792185029Spjd			xoap->xoa_av_modified =
2793219089Spjd			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
2794185029Spjd			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
2795185029Spjd		}
2796185029Spjd
2797185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
2798219089Spjd		    vp->v_type == VREG) {
2799219089Spjd			zfs_sa_get_scanstamp(zp, xvap);
2800185029Spjd		}
2801185029Spjd
2802185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
2803219089Spjd			uint64_t times[2];
2804219089Spjd
2805219089Spjd			(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
2806219089Spjd			    times, sizeof (times));
2807219089Spjd			ZFS_TIME_DECODE(&xoap->xoa_createtime, times);
2808185029Spjd			XVA_SET_RTN(xvap, XAT_CREATETIME);
2809185029Spjd		}
2810219089Spjd
2811219089Spjd		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2812219089Spjd			xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
2813219089Spjd			XVA_SET_RTN(xvap, XAT_REPARSE);
2814219089Spjd		}
2815219089Spjd		if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
2816219089Spjd			xoap->xoa_generation = zp->z_gen;
2817219089Spjd			XVA_SET_RTN(xvap, XAT_GEN);
2818219089Spjd		}
2819219089Spjd
2820219089Spjd		if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
2821219089Spjd			xoap->xoa_offline =
2822219089Spjd			    ((zp->z_pflags & ZFS_OFFLINE) != 0);
2823219089Spjd			XVA_SET_RTN(xvap, XAT_OFFLINE);
2824219089Spjd		}
2825219089Spjd
2826219089Spjd		if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
2827219089Spjd			xoap->xoa_sparse =
2828219089Spjd			    ((zp->z_pflags & ZFS_SPARSE) != 0);
2829219089Spjd			XVA_SET_RTN(xvap, XAT_SPARSE);
2830219089Spjd		}
2831185029Spjd	}
2832185029Spjd
2833219089Spjd	ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
2834219089Spjd	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
2835219089Spjd	ZFS_TIME_DECODE(&vap->va_ctime, ctime);
2836219089Spjd	ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
2837168404Spjd
2838168404Spjd	mutex_exit(&zp->z_lock);
2839168404Spjd
2840219089Spjd	sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
2841168404Spjd	vap->va_blksize = blksize;
2842168404Spjd	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */
2843168404Spjd
2844168404Spjd	if (zp->z_blksz == 0) {
2845168404Spjd		/*
2846168404Spjd		 * Block size hasn't been set; suggest maximal I/O transfers.
2847168404Spjd		 */
2848168404Spjd		vap->va_blksize = zfsvfs->z_max_blksz;
2849168404Spjd	}
2850168404Spjd
2851168404Spjd	ZFS_EXIT(zfsvfs);
2852168404Spjd	return (0);
2853168404Spjd}
2854168404Spjd
2855168404Spjd/*
2856168404Spjd * Set the file attributes to the values contained in the
2857168404Spjd * vattr structure.
2858168404Spjd *
2859168404Spjd *	IN:	vp	- vnode of file to be modified.
2860168404Spjd *		vap	- new attribute values.
2861185029Spjd *			  If AT_XVATTR set, then optional attrs are being set
2862168404Spjd *		flags	- ATTR_UTIME set if non-default time values provided.
2863185029Spjd *			- ATTR_NOACLCHECK (CIFS context only).
2864168404Spjd *		cr	- credentials of caller.
2865185029Spjd *		ct	- caller context
2866168404Spjd *
2867168404Spjd *	RETURN:	0 if success
2868168404Spjd *		error code if failure
2869168404Spjd *
2870168404Spjd * Timestamps:
2871168404Spjd *	vp - ctime updated, mtime updated if size changed.
2872168404Spjd */
2873168404Spjd/* ARGSUSED */
2874168404Spjdstatic int
2875168962Spjdzfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2876168962Spjd	caller_context_t *ct)
2877168404Spjd{
2878185029Spjd	znode_t		*zp = VTOZ(vp);
2879168404Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2880185029Spjd	zilog_t		*zilog;
2881168404Spjd	dmu_tx_t	*tx;
2882168404Spjd	vattr_t		oldva;
2883209962Smm	xvattr_t	tmpxvattr;
2884168962Spjd	uint_t		mask = vap->va_mask;
2885168404Spjd	uint_t		saved_mask;
2886197831Spjd	uint64_t	saved_mode;
2887168404Spjd	int		trim_mask = 0;
2888168404Spjd	uint64_t	new_mode;
2889209962Smm	uint64_t	new_uid, new_gid;
2890219089Spjd	uint64_t	xattr_obj;
2891219089Spjd	uint64_t	mtime[2], ctime[2];
2892168404Spjd	znode_t		*attrzp;
2893168404Spjd	int		need_policy = FALSE;
2894219089Spjd	int		err, err2;
2895185029Spjd	zfs_fuid_info_t *fuidp = NULL;
2896185029Spjd	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
2897185029Spjd	xoptattr_t	*xoap;
2898219089Spjd	zfs_acl_t	*aclp;
2899185029Spjd	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2900219089Spjd	boolean_t	fuid_dirtied = B_FALSE;
2901219089Spjd	sa_bulk_attr_t	bulk[7], xattr_bulk[7];
2902219089Spjd	int		count = 0, xattr_count = 0;
2903168404Spjd
2904168404Spjd	if (mask == 0)
2905168404Spjd		return (0);
2906168404Spjd
2907168962Spjd	if (mask & AT_NOSET)
2908168962Spjd		return (EINVAL);
2909168962Spjd
2910185029Spjd	ZFS_ENTER(zfsvfs);
2911185029Spjd	ZFS_VERIFY_ZP(zp);
2912185029Spjd
2913185029Spjd	zilog = zfsvfs->z_log;
2914185029Spjd
2915185029Spjd	/*
2916185029Spjd	 * Make sure that if we have ephemeral uid/gid or xvattr specified
2917185029Spjd	 * that file system is at proper version level
2918185029Spjd	 */
2919185029Spjd
2920185029Spjd	if (zfsvfs->z_use_fuids == B_FALSE &&
2921185029Spjd	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2922185029Spjd	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2923185029Spjd	    (mask & AT_XVATTR))) {
2924185029Spjd		ZFS_EXIT(zfsvfs);
2925185029Spjd		return (EINVAL);
2926185029Spjd	}
2927185029Spjd
2928185029Spjd	if (mask & AT_SIZE && vp->v_type == VDIR) {
2929185029Spjd		ZFS_EXIT(zfsvfs);
2930168404Spjd		return (EISDIR);
2931185029Spjd	}
2932168404Spjd
2933185029Spjd	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
2934185029Spjd		ZFS_EXIT(zfsvfs);
2935168404Spjd		return (EINVAL);
2936185029Spjd	}
2937168404Spjd
2938185029Spjd	/*
2939185029Spjd	 * If this is an xvattr_t, then get a pointer to the structure of
2940185029Spjd	 * optional attributes.  If this is NULL, then we have a vattr_t.
2941185029Spjd	 */
2942185029Spjd	xoap = xva_getxoptattr(xvap);
2943168404Spjd
2944209962Smm	xva_init(&tmpxvattr);
2945209962Smm
2946185029Spjd	/*
2947185029Spjd	 * Immutable files can only alter immutable bit and atime
2948185029Spjd	 */
2949219089Spjd	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
2950185029Spjd	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
2951185029Spjd	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2952185029Spjd		ZFS_EXIT(zfsvfs);
2953185029Spjd		return (EPERM);
2954185029Spjd	}
2955185029Spjd
2956219089Spjd	if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
2957185029Spjd		ZFS_EXIT(zfsvfs);
2958185029Spjd		return (EPERM);
2959185029Spjd	}
2960185029Spjd
2961185029Spjd	/*
2962185029Spjd	 * Verify timestamps doesn't overflow 32 bits.
2963185029Spjd	 * ZFS can handle large timestamps, but 32bit syscalls can't
2964185029Spjd	 * handle times greater than 2039.  This check should be removed
2965185029Spjd	 * once large timestamps are fully supported.
2966185029Spjd	 */
2967185029Spjd	if (mask & (AT_ATIME | AT_MTIME)) {
2968185029Spjd		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2969185029Spjd		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2970185029Spjd			ZFS_EXIT(zfsvfs);
2971185029Spjd			return (EOVERFLOW);
2972185029Spjd		}
2973185029Spjd	}
2974185029Spjd
2975168404Spjdtop:
2976168404Spjd	attrzp = NULL;
2977219089Spjd	aclp = NULL;
2978168404Spjd
2979211932Smm	/* Can this be moved to before the top label? */
2980168404Spjd	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
2981168404Spjd		ZFS_EXIT(zfsvfs);
2982168404Spjd		return (EROFS);
2983168404Spjd	}
2984168404Spjd
2985168404Spjd	/*
2986168404Spjd	 * First validate permissions
2987168404Spjd	 */
2988168404Spjd
2989168404Spjd	if (mask & AT_SIZE) {
2990168404Spjd		/*
2991168404Spjd		 * XXX - Note, we are not providing any open
2992168404Spjd		 * mode flags here (like FNDELAY), so we may
2993168404Spjd		 * block if there are locks present... this
2994168404Spjd		 * should be addressed in openat().
2995168404Spjd		 */
2996185029Spjd		/* XXX - would it be OK to generate a log record here? */
2997185029Spjd		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2998168404Spjd		if (err) {
2999168404Spjd			ZFS_EXIT(zfsvfs);
3000168404Spjd			return (err);
3001168404Spjd		}
3002168404Spjd	}
3003168404Spjd
3004185029Spjd	if (mask & (AT_ATIME|AT_MTIME) ||
3005185029Spjd	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
3006185029Spjd	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
3007185029Spjd	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
3008219089Spjd	    XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
3009219089Spjd	    XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
3010185029Spjd	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
3011219089Spjd	    XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
3012185029Spjd		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
3013185029Spjd		    skipaclchk, cr);
3014219089Spjd	}
3015168404Spjd
3016168404Spjd	if (mask & (AT_UID|AT_GID)) {
3017168404Spjd		int	idmask = (mask & (AT_UID|AT_GID));
3018168404Spjd		int	take_owner;
3019168404Spjd		int	take_group;
3020168404Spjd
3021168404Spjd		/*
3022168404Spjd		 * NOTE: even if a new mode is being set,
3023168404Spjd		 * we may clear S_ISUID/S_ISGID bits.
3024168404Spjd		 */
3025168404Spjd
3026168404Spjd		if (!(mask & AT_MODE))
3027219089Spjd			vap->va_mode = zp->z_mode;
3028168404Spjd
3029168404Spjd		/*
3030168404Spjd		 * Take ownership or chgrp to group we are a member of
3031168404Spjd		 */
3032168404Spjd
3033168404Spjd		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
3034185029Spjd		take_group = (mask & AT_GID) &&
3035185029Spjd		    zfs_groupmember(zfsvfs, vap->va_gid, cr);
3036168404Spjd
3037168404Spjd		/*
3038168404Spjd		 * If both AT_UID and AT_GID are set then take_owner and
3039168404Spjd		 * take_group must both be set in order to allow taking
3040168404Spjd		 * ownership.
3041168404Spjd		 *
3042168404Spjd		 * Otherwise, send the check through secpolicy_vnode_setattr()
3043168404Spjd		 *
3044168404Spjd		 */
3045168404Spjd
3046168404Spjd		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
3047168404Spjd		    ((idmask == AT_UID) && take_owner) ||
3048168404Spjd		    ((idmask == AT_GID) && take_group)) {
3049185029Spjd			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
3050185029Spjd			    skipaclchk, cr) == 0) {
3051168404Spjd				/*
3052168404Spjd				 * Remove setuid/setgid for non-privileged users
3053168404Spjd				 */
3054185029Spjd				secpolicy_setid_clear(vap, vp, cr);
3055168404Spjd				trim_mask = (mask & (AT_UID|AT_GID));
3056168404Spjd			} else {
3057168404Spjd				need_policy =  TRUE;
3058168404Spjd			}
3059168404Spjd		} else {
3060168404Spjd			need_policy =  TRUE;
3061168404Spjd		}
3062168404Spjd	}
3063168404Spjd
3064168404Spjd	mutex_enter(&zp->z_lock);
3065219089Spjd	oldva.va_mode = zp->z_mode;
3066185029Spjd	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
3067185029Spjd	if (mask & AT_XVATTR) {
3068209962Smm		/*
3069209962Smm		 * Update xvattr mask to include only those attributes
3070209962Smm		 * that are actually changing.
3071209962Smm		 *
3072209962Smm		 * the bits will be restored prior to actually setting
3073209962Smm		 * the attributes so the caller thinks they were set.
3074209962Smm		 */
3075209962Smm		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
3076209962Smm			if (xoap->xoa_appendonly !=
3077219089Spjd			    ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
3078209962Smm				need_policy = TRUE;
3079209962Smm			} else {
3080209962Smm				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
3081209962Smm				XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
3082209962Smm			}
3083209962Smm		}
3084209962Smm
3085209962Smm		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
3086209962Smm			if (xoap->xoa_nounlink !=
3087219089Spjd			    ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
3088209962Smm				need_policy = TRUE;
3089209962Smm			} else {
3090209962Smm				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
3091209962Smm				XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
3092209962Smm			}
3093209962Smm		}
3094209962Smm
3095209962Smm		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
3096209962Smm			if (xoap->xoa_immutable !=
3097219089Spjd			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
3098209962Smm				need_policy = TRUE;
3099209962Smm			} else {
3100209962Smm				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
3101209962Smm				XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
3102209962Smm			}
3103209962Smm		}
3104209962Smm
3105209962Smm		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
3106209962Smm			if (xoap->xoa_nodump !=
3107219089Spjd			    ((zp->z_pflags & ZFS_NODUMP) != 0)) {
3108209962Smm				need_policy = TRUE;
3109209962Smm			} else {
3110209962Smm				XVA_CLR_REQ(xvap, XAT_NODUMP);
3111209962Smm				XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
3112209962Smm			}
3113209962Smm		}
3114209962Smm
3115209962Smm		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
3116209962Smm			if (xoap->xoa_av_modified !=
3117219089Spjd			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
3118209962Smm				need_policy = TRUE;
3119209962Smm			} else {
3120209962Smm				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
3121209962Smm				XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
3122209962Smm			}
3123209962Smm		}
3124209962Smm
3125209962Smm		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
3126209962Smm			if ((vp->v_type != VREG &&
3127209962Smm			    xoap->xoa_av_quarantined) ||
3128209962Smm			    xoap->xoa_av_quarantined !=
3129219089Spjd			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
3130209962Smm				need_policy = TRUE;
3131209962Smm			} else {
3132209962Smm				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
3133209962Smm				XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
3134209962Smm			}
3135209962Smm		}
3136209962Smm
3137219089Spjd		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
3138219089Spjd			mutex_exit(&zp->z_lock);
3139219089Spjd			ZFS_EXIT(zfsvfs);
3140219089Spjd			return (EPERM);
3141219089Spjd		}
3142219089Spjd
3143209962Smm		if (need_policy == FALSE &&
3144209962Smm		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
3145209962Smm		    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
3146185029Spjd			need_policy = TRUE;
3147185029Spjd		}
3148185029Spjd	}
3149185029Spjd
3150168404Spjd	mutex_exit(&zp->z_lock);
3151168404Spjd
3152168404Spjd	if (mask & AT_MODE) {
3153185029Spjd		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
3154168962Spjd			err = secpolicy_setid_setsticky_clear(vp, vap,
3155168962Spjd			    &oldva, cr);
3156168962Spjd			if (err) {
3157168962Spjd				ZFS_EXIT(zfsvfs);
3158168962Spjd				return (err);
3159168962Spjd			}
3160168404Spjd			trim_mask |= AT_MODE;
3161168404Spjd		} else {
3162168404Spjd			need_policy = TRUE;
3163168404Spjd		}
3164168404Spjd	}
3165168404Spjd
3166168404Spjd	if (need_policy) {
3167168404Spjd		/*
3168168404Spjd		 * If trim_mask is set then take ownership
3169168404Spjd		 * has been granted or write_acl is present and user
3170168404Spjd		 * has the ability to modify mode.  In that case remove
3171168404Spjd		 * UID|GID and or MODE from mask so that
3172168404Spjd		 * secpolicy_vnode_setattr() doesn't revoke it.
3173168404Spjd		 */
3174168404Spjd
3175168404Spjd		if (trim_mask) {
3176168404Spjd			saved_mask = vap->va_mask;
3177168404Spjd			vap->va_mask &= ~trim_mask;
3178197831Spjd			if (trim_mask & AT_MODE) {
3179197831Spjd				/*
3180197831Spjd				 * Save the mode, as secpolicy_vnode_setattr()
3181197831Spjd				 * will overwrite it with ova.va_mode.
3182197831Spjd				 */
3183197831Spjd				saved_mode = vap->va_mode;
3184197831Spjd			}
3185168404Spjd		}
3186168404Spjd		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
3187185029Spjd		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
3188168404Spjd		if (err) {
3189168404Spjd			ZFS_EXIT(zfsvfs);
3190168404Spjd			return (err);
3191168404Spjd		}
3192168404Spjd
3193197831Spjd		if (trim_mask) {
3194168404Spjd			vap->va_mask |= saved_mask;
3195197831Spjd			if (trim_mask & AT_MODE) {
3196197831Spjd				/*
3197197831Spjd				 * Recover the mode after
3198197831Spjd				 * secpolicy_vnode_setattr().
3199197831Spjd				 */
3200197831Spjd				vap->va_mode = saved_mode;
3201197831Spjd			}
3202197831Spjd		}
3203168404Spjd	}
3204168404Spjd
3205168404Spjd	/*
3206168404Spjd	 * secpolicy_vnode_setattr, or take ownership may have
3207168404Spjd	 * changed va_mask
3208168404Spjd	 */
3209168404Spjd	mask = vap->va_mask;
3210168404Spjd
3211219089Spjd	if ((mask & (AT_UID | AT_GID))) {
3212219089Spjd		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
3213219089Spjd		    &xattr_obj, sizeof (xattr_obj));
3214168404Spjd
3215219089Spjd		if (err == 0 && xattr_obj) {
3216219089Spjd			err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
3217209962Smm			if (err)
3218219089Spjd				goto out2;
3219168404Spjd		}
3220209962Smm		if (mask & AT_UID) {
3221209962Smm			new_uid = zfs_fuid_create(zfsvfs,
3222209962Smm			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
3223219089Spjd			if (new_uid != zp->z_uid &&
3224219089Spjd			    zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
3225219089Spjd				if (attrzp)
3226219089Spjd					VN_RELE(ZTOV(attrzp));
3227209962Smm				err = EDQUOT;
3228219089Spjd				goto out2;
3229209962Smm			}
3230209962Smm		}
3231209962Smm
3232209962Smm		if (mask & AT_GID) {
3233209962Smm			new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
3234209962Smm			    cr, ZFS_GROUP, &fuidp);
3235219089Spjd			if (new_gid != zp->z_gid &&
3236219089Spjd			    zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
3237219089Spjd				if (attrzp)
3238219089Spjd					VN_RELE(ZTOV(attrzp));
3239209962Smm				err = EDQUOT;
3240219089Spjd				goto out2;
3241209962Smm			}
3242209962Smm		}
3243219089Spjd	}
3244219089Spjd	tx = dmu_tx_create(zfsvfs->z_os);
3245219089Spjd
3246219089Spjd	if (mask & AT_MODE) {
3247219089Spjd		uint64_t pmode = zp->z_mode;
3248219089Spjd		uint64_t acl_obj;
3249219089Spjd		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
3250219089Spjd
3251224174Smm		if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
3252224174Smm			goto out;
3253219089Spjd
3254219089Spjd		mutex_enter(&zp->z_lock);
3255219089Spjd		if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
3256219089Spjd			/*
3257219089Spjd			 * Are we upgrading ACL from old V0 format
3258219089Spjd			 * to V1 format?
3259219089Spjd			 */
3260219089Spjd			if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
3261219089Spjd			    zfs_znode_acl_version(zp) ==
3262219089Spjd			    ZFS_ACL_VERSION_INITIAL) {
3263219089Spjd				dmu_tx_hold_free(tx, acl_obj, 0,
3264219089Spjd				    DMU_OBJECT_END);
3265219089Spjd				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3266219089Spjd				    0, aclp->z_acl_bytes);
3267209962Smm			} else {
3268219089Spjd				dmu_tx_hold_write(tx, acl_obj, 0,
3269219089Spjd				    aclp->z_acl_bytes);
3270209962Smm			}
3271219089Spjd		} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3272219089Spjd			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3273219089Spjd			    0, aclp->z_acl_bytes);
3274209962Smm		}
3275219089Spjd		mutex_exit(&zp->z_lock);
3276219089Spjd		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3277219089Spjd	} else {
3278219089Spjd		if ((mask & AT_XVATTR) &&
3279219089Spjd		    XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3280219089Spjd			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3281219089Spjd		else
3282219089Spjd			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3283168404Spjd	}
3284168404Spjd
3285219089Spjd	if (attrzp) {
3286219089Spjd		dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3287219089Spjd	}
3288219089Spjd
3289219089Spjd	fuid_dirtied = zfsvfs->z_fuid_dirty;
3290219089Spjd	if (fuid_dirtied)
3291219089Spjd		zfs_fuid_txhold(zfsvfs, tx);
3292219089Spjd
3293219089Spjd	zfs_sa_upgrade_txholds(tx, zp);
3294219089Spjd
3295209962Smm	err = dmu_tx_assign(tx, TXG_NOWAIT);
3296168404Spjd	if (err) {
3297209962Smm		if (err == ERESTART)
3298168404Spjd			dmu_tx_wait(tx);
3299209962Smm		goto out;
3300168404Spjd	}
3301168404Spjd
3302219089Spjd	count = 0;
3303168404Spjd	/*
3304168404Spjd	 * Set each attribute requested.
3305168404Spjd	 * We group settings according to the locks they need to acquire.
3306168404Spjd	 *
3307168404Spjd	 * Note: you cannot set ctime directly, although it will be
3308168404Spjd	 * updated as a side-effect of calling this function.
3309168404Spjd	 */
3310168404Spjd
3311219089Spjd
3312219089Spjd	if (mask & (AT_UID|AT_GID|AT_MODE))
3313219089Spjd		mutex_enter(&zp->z_acl_lock);
3314168404Spjd	mutex_enter(&zp->z_lock);
3315168404Spjd
3316219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3317219089Spjd	    &zp->z_pflags, sizeof (zp->z_pflags));
3318219089Spjd
3319219089Spjd	if (attrzp) {
3320219089Spjd		if (mask & (AT_UID|AT_GID|AT_MODE))
3321219089Spjd			mutex_enter(&attrzp->z_acl_lock);
3322219089Spjd		mutex_enter(&attrzp->z_lock);
3323219089Spjd		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3324219089Spjd		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
3325219089Spjd		    sizeof (attrzp->z_pflags));
3326219089Spjd	}
3327219089Spjd
3328219089Spjd	if (mask & (AT_UID|AT_GID)) {
3329219089Spjd
3330219089Spjd		if (mask & AT_UID) {
3331219089Spjd			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
3332219089Spjd			    &new_uid, sizeof (new_uid));
3333219089Spjd			zp->z_uid = new_uid;
3334219089Spjd			if (attrzp) {
3335219089Spjd				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3336219089Spjd				    SA_ZPL_UID(zfsvfs), NULL, &new_uid,
3337219089Spjd				    sizeof (new_uid));
3338219089Spjd				attrzp->z_uid = new_uid;
3339219089Spjd			}
3340219089Spjd		}
3341219089Spjd
3342219089Spjd		if (mask & AT_GID) {
3343219089Spjd			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
3344219089Spjd			    NULL, &new_gid, sizeof (new_gid));
3345219089Spjd			zp->z_gid = new_gid;
3346219089Spjd			if (attrzp) {
3347219089Spjd				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3348219089Spjd				    SA_ZPL_GID(zfsvfs), NULL, &new_gid,
3349219089Spjd				    sizeof (new_gid));
3350219089Spjd				attrzp->z_gid = new_gid;
3351219089Spjd			}
3352219089Spjd		}
3353219089Spjd		if (!(mask & AT_MODE)) {
3354219089Spjd			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
3355219089Spjd			    NULL, &new_mode, sizeof (new_mode));
3356219089Spjd			new_mode = zp->z_mode;
3357219089Spjd		}
3358219089Spjd		err = zfs_acl_chown_setattr(zp);
3359219089Spjd		ASSERT(err == 0);
3360219089Spjd		if (attrzp) {
3361219089Spjd			err = zfs_acl_chown_setattr(attrzp);
3362219089Spjd			ASSERT(err == 0);
3363219089Spjd		}
3364219089Spjd	}
3365219089Spjd
3366168404Spjd	if (mask & AT_MODE) {
3367219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
3368219089Spjd		    &new_mode, sizeof (new_mode));
3369219089Spjd		zp->z_mode = new_mode;
3370219089Spjd		ASSERT3U((uintptr_t)aclp, !=, 0);
3371209962Smm		err = zfs_aclset_common(zp, aclp, cr, tx);
3372240415Smm		ASSERT0(err);
3373219089Spjd		if (zp->z_acl_cached)
3374219089Spjd			zfs_acl_free(zp->z_acl_cached);
3375211932Smm		zp->z_acl_cached = aclp;
3376211932Smm		aclp = NULL;
3377168404Spjd	}
3378168404Spjd
3379168404Spjd
3380219089Spjd	if (mask & AT_ATIME) {
3381219089Spjd		ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
3382219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
3383219089Spjd		    &zp->z_atime, sizeof (zp->z_atime));
3384168404Spjd	}
3385168404Spjd
3386219089Spjd	if (mask & AT_MTIME) {
3387219089Spjd		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
3388219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
3389219089Spjd		    mtime, sizeof (mtime));
3390168404Spjd	}
3391168404Spjd
3392185029Spjd	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
3393219089Spjd	if (mask & AT_SIZE && !(mask & AT_MTIME)) {
3394219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
3395219089Spjd		    NULL, mtime, sizeof (mtime));
3396219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3397219089Spjd		    &ctime, sizeof (ctime));
3398219089Spjd		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
3399219089Spjd		    B_TRUE);
3400219089Spjd	} else if (mask != 0) {
3401219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3402219089Spjd		    &ctime, sizeof (ctime));
3403219089Spjd		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
3404219089Spjd		    B_TRUE);
3405219089Spjd		if (attrzp) {
3406219089Spjd			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3407219089Spjd			    SA_ZPL_CTIME(zfsvfs), NULL,
3408219089Spjd			    &ctime, sizeof (ctime));
3409219089Spjd			zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
3410219089Spjd			    mtime, ctime, B_TRUE);
3411219089Spjd		}
3412219089Spjd	}
3413185029Spjd	/*
3414185029Spjd	 * Do this after setting timestamps to prevent timestamp
3415185029Spjd	 * update from toggling bit
3416185029Spjd	 */
3417168404Spjd
3418185029Spjd	if (xoap && (mask & AT_XVATTR)) {
3419209962Smm
3420209962Smm		/*
3421209962Smm		 * restore trimmed off masks
3422209962Smm		 * so that return masks can be set for caller.
3423209962Smm		 */
3424209962Smm
3425209962Smm		if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
3426209962Smm			XVA_SET_REQ(xvap, XAT_APPENDONLY);
3427209962Smm		}
3428209962Smm		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
3429209962Smm			XVA_SET_REQ(xvap, XAT_NOUNLINK);
3430209962Smm		}
3431209962Smm		if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
3432209962Smm			XVA_SET_REQ(xvap, XAT_IMMUTABLE);
3433209962Smm		}
3434209962Smm		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
3435209962Smm			XVA_SET_REQ(xvap, XAT_NODUMP);
3436209962Smm		}
3437209962Smm		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
3438209962Smm			XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
3439209962Smm		}
3440209962Smm		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
3441209962Smm			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
3442209962Smm		}
3443209962Smm
3444219089Spjd		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3445185029Spjd			ASSERT(vp->v_type == VREG);
3446185029Spjd
3447219089Spjd		zfs_xvattr_set(zp, xvap, tx);
3448185029Spjd	}
3449185029Spjd
3450209962Smm	if (fuid_dirtied)
3451209962Smm		zfs_fuid_sync(zfsvfs, tx);
3452209962Smm
3453168404Spjd	if (mask != 0)
3454185029Spjd		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3455168404Spjd
3456168404Spjd	mutex_exit(&zp->z_lock);
3457219089Spjd	if (mask & (AT_UID|AT_GID|AT_MODE))
3458219089Spjd		mutex_exit(&zp->z_acl_lock);
3459168404Spjd
3460219089Spjd	if (attrzp) {
3461219089Spjd		if (mask & (AT_UID|AT_GID|AT_MODE))
3462219089Spjd			mutex_exit(&attrzp->z_acl_lock);
3463219089Spjd		mutex_exit(&attrzp->z_lock);
3464219089Spjd	}
3465209962Smmout:
3466219089Spjd	if (err == 0 && attrzp) {
3467219089Spjd		err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
3468219089Spjd		    xattr_count, tx);
3469219089Spjd		ASSERT(err2 == 0);
3470219089Spjd	}
3471219089Spjd
3472168404Spjd	if (attrzp)
3473168404Spjd		VN_RELE(ZTOV(attrzp));
3474211932Smm	if (aclp)
3475209962Smm		zfs_acl_free(aclp);
3476168404Spjd
3477209962Smm	if (fuidp) {
3478209962Smm		zfs_fuid_info_free(fuidp);
3479209962Smm		fuidp = NULL;
3480209962Smm	}
3481209962Smm
3482219089Spjd	if (err) {
3483209962Smm		dmu_tx_abort(tx);
3484219089Spjd		if (err == ERESTART)
3485219089Spjd			goto top;
3486219089Spjd	} else {
3487219089Spjd		err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
3488209962Smm		dmu_tx_commit(tx);
3489219089Spjd	}
3490209962Smm
3491219089Spjdout2:
3492219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3493219089Spjd		zil_commit(zilog, 0);
3494209962Smm
3495168404Spjd	ZFS_EXIT(zfsvfs);
3496168404Spjd	return (err);
3497168404Spjd}
3498168404Spjd
/*
 * One entry in the singly linked list of parent locks that
 * zfs_rename_lock() builds and zfs_rename_unlock() tears down.
 * zl_znode is NULL when no vnode hold was taken for the entry.
 */
typedef struct zfs_zlock {
	krwlock_t	*zl_rwlock;	/* lock we acquired */
	znode_t		*zl_znode;	/* znode we held */
	struct zfs_zlock *zl_next;	/* next in list */
} zfs_zlock_t;
3504168404Spjd
3505168404Spjd/*
3506168404Spjd * Drop locks and release vnodes that were held by zfs_rename_lock().
3507168404Spjd */
3508168404Spjdstatic void
3509168404Spjdzfs_rename_unlock(zfs_zlock_t **zlpp)
3510168404Spjd{
3511168404Spjd	zfs_zlock_t *zl;
3512168404Spjd
3513168404Spjd	while ((zl = *zlpp) != NULL) {
3514168404Spjd		if (zl->zl_znode != NULL)
3515168404Spjd			VN_RELE(ZTOV(zl->zl_znode));
3516168404Spjd		rw_exit(zl->zl_rwlock);
3517168404Spjd		*zlpp = zl->zl_next;
3518168404Spjd		kmem_free(zl, sizeof (*zl));
3519168404Spjd	}
3520168404Spjd}
3521168404Spjd
/*
 * Search back through the directory tree, using the ".." entries.
 * Lock each directory in the chain to prevent concurrent renames.
 * Fail any attempt to move a directory into one of its own descendants.
 * XXX - z_parent_lock can overlap with map or grow locks
 *
 *	IN:	szp	- znode being renamed; its z_parent_lock is taken as
 *			  writer on the first pass.
 *		tdzp	- target directory; the walk toward the root starts
 *			  here.
 *		sdzp	- source directory; the walk stops once it is reached.
 *
 *	IN/OUT:	zlpp	- head of the list of locks taken so far.  Every
 *			  acquired lock is pushed onto this list, and the
 *			  list is left populated on the error returns too,
 *			  so the caller releases it with zfs_rename_unlock().
 *
 *	RETURN:	0 if the walk reached sdzp or the root
 *		EINVAL if tdzp is szp itself or one of its descendants
 *		error code from zfs_zget() if a parent cannot be obtained
 */
static int
zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
{
	zfs_zlock_t	*zl;
	znode_t		*zp = tdzp;
	uint64_t	rootid = zp->z_zfsvfs->z_root;
	uint64_t	oidp = zp->z_id;
	krwlock_t	*rwlp = &szp->z_parent_lock;
	krw_t		rw = RW_WRITER;

	/*
	 * First pass write-locks szp and compares to zp->z_id.
	 * Later passes read-lock zp and compare to zp->z_parent.
	 */
	do {
		if (!rw_tryenter(rwlp, rw)) {
			/*
			 * Another thread is renaming in this path.
			 * Note that if we are a WRITER, we don't have any
			 * parent_locks held yet.
			 */
			if (rw == RW_READER && zp->z_id > szp->z_id) {
				/*
				 * Drop our locks and restart
				 *
				 * zl still points at the most recently pushed
				 * entry, which is the current head of the
				 * list, so unlocking through it releases
				 * everything taken so far.  All walk state is
				 * then reset to its initial values.
				 */
				zfs_rename_unlock(&zl);
				*zlpp = NULL;
				zp = tdzp;
				oidp = zp->z_id;
				rwlp = &szp->z_parent_lock;
				rw = RW_WRITER;
				continue;
			} else {
				/*
				 * Wait for other thread to drop its locks
				 */
				rw_enter(rwlp, rw);
			}
		}

		/* Record the lock just taken at the head of the list. */
		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
		zl->zl_rwlock = rwlp;
		zl->zl_znode = NULL;
		zl->zl_next = *zlpp;
		*zlpp = zl;

		if (oidp == szp->z_id)		/* We're a descendant of szp */
			return (EINVAL);

		if (oidp == rootid)		/* We've hit the top */
			return (0);

		if (rw == RW_READER) {		/* i.e. not the first pass */
			/*
			 * Step up to the parent.  The znode obtained here is
			 * stored in the list entry so that
			 * zfs_rename_unlock() releases it.
			 */
			int error = zfs_zget(zp->z_zfsvfs, oidp, &zp);
			if (error)
				return (error);
			zl->zl_znode = zp;
		}
		/*
		 * Fetch the parent object id of the current znode; the
		 * lookup result is deliberately not checked here.
		 */
		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs),
		    &oidp, sizeof (oidp));
		rwlp = &zp->z_parent_lock;
		rw = RW_READER;

	} while (zp->z_id != sdzp->z_id);

	return (0);
}
3595168404Spjd
/*
 * Move an entry from the provided source directory to the target
 * directory.  Change the entry name as indicated.
 *
 *	IN:	sdvp	- Source directory containing the "old entry".
 *		snm	- Old entry name.
 *		tdvp	- Target directory to contain the "new entry".
 *		tnm	- New entry name.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		flags	- case flags
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Errors produced directly by this function include EXDEV (target is on
 * another vfs or is a control node), EILSEQ (tnm fails UTF-8 validation),
 * EINVAL (xattr/non-xattr mix, "." or ".." as a name, or a directory moved
 * below itself), and ENOTDIR/EISDIR (source and target types differ).
 *
 * Timestamps:
 *	sdvp,tdvp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
    caller_context_t *ct, int flags)
{
	znode_t		*tdzp, *szp, *tzp;
	znode_t		*sdzp = VTOZ(sdvp);
	zfsvfs_t	*zfsvfs = sdzp->z_zfsvfs;
	zilog_t		*zilog;
	vnode_t		*realvp;
	zfs_dirlock_t	*sdl, *tdl;
	dmu_tx_t	*tx;
	zfs_zlock_t	*zl;
	int		cmp, serr, terr;
	int		error = 0;
	int		zflg = 0;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(sdzp);
	zilog = zfsvfs->z_log;

	/*
	 * Make sure we have the real vp for the target directory.
	 */
	if (VOP_REALVP(tdvp, &realvp, ct) == 0)
		tdvp = realvp;

	if (tdvp->v_vfsp != sdvp->v_vfsp || zfsctl_is_node(tdvp)) {
		ZFS_EXIT(zfsvfs);
		return (EXDEV);
	}

	tdzp = VTOZ(tdvp);
	ZFS_VERIFY_ZP(tdzp);
	if (zfsvfs->z_utf8 && u8_validate(tnm,
	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (EILSEQ);
	}

	if (flags & FIGNORECASE)
		zflg |= ZCILOOK;

	/*
	 * Retry point: reached again when dmu_tx_assign() below returns
	 * ERESTART.  The per-attempt pointers are reset on every pass.
	 */
top:
	szp = NULL;
	tzp = NULL;
	zl = NULL;

	/*
	 * This is to prevent the creation of links into attribute space
	 * by renaming a linked file into/outof an attribute directory.
	 * See the comment in zfs_link() for why this is considered bad.
	 */
	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/*
	 * Lock source and target directory entries.  To prevent deadlock,
	 * a lock ordering must be defined.  We lock the directory with
	 * the smallest object id first, or if it's a tie, the one with
	 * the lexically first name.
	 */
	if (sdzp->z_id < tdzp->z_id) {
		cmp = -1;
	} else if (sdzp->z_id > tdzp->z_id) {
		cmp = 1;
	} else {
		/*
		 * First compare the two name arguments without
		 * considering any case folding.
		 */
		int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);

		cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
		ASSERT(error == 0 || !zfsvfs->z_utf8);
		if (cmp == 0) {
			/*
			 * POSIX: "If the old argument and the new argument
			 * both refer to links to the same existing file,
			 * the rename() function shall return successfully
			 * and perform no other action."
			 */
			ZFS_EXIT(zfsvfs);
			return (0);
		}
		/*
		 * If the file system is case-folding, then we may
		 * have some more checking to do.  A case-folding file
		 * system is either supporting mixed case sensitivity
		 * access or is completely case-insensitive.  Note
		 * that the file system is always case preserving.
		 *
		 * In mixed sensitivity mode case sensitive behavior
		 * is the default.  FIGNORECASE must be used to
		 * explicitly request case insensitive behavior.
		 *
		 * If the source and target names provided differ only
		 * by case (e.g., a request to rename 'tim' to 'Tim'),
		 * we will treat this as a special case in the
		 * case-insensitive mode: as long as the source name
		 * is an exact match, we will allow this to proceed as
		 * a name-change request.
		 */
		if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
		    (zfsvfs->z_case == ZFS_CASE_MIXED &&
		    flags & FIGNORECASE)) &&
		    u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
		    &error) == 0) {
			/*
			 * case preserving rename request, require exact
			 * name matches
			 */
			zflg |= ZCIEXACT;
			zflg &= ~ZCILOOK;
		}
	}

	/*
	 * If the source and destination directories are the same, we should
	 * grab the z_name_lock of that directory only once.
	 */
	if (sdzp == tdzp) {
		zflg |= ZHAVELOCK;
		rw_enter(&sdzp->z_name_lock, RW_READER);
	}

	/*
	 * Take the two directory-entry locks in the order decided above.
	 * Whichever lock is taken second is requested with ZRENAMING.
	 */
	if (cmp < 0) {
		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
		    ZEXISTS | zflg, NULL, NULL);
		terr = zfs_dirent_lock(&tdl,
		    tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
	} else {
		terr = zfs_dirent_lock(&tdl,
		    tdzp, tnm, &tzp, zflg, NULL, NULL);
		serr = zfs_dirent_lock(&sdl,
		    sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
		    NULL, NULL);
	}

	if (serr) {
		/*
		 * Source entry invalid or not there.
		 */
		if (!terr) {
			zfs_dirent_unlock(tdl);
			if (tzp)
				VN_RELE(ZTOV(tzp));
		}

		if (sdzp == tdzp)
			rw_exit(&sdzp->z_name_lock);

		/*
		 * FreeBSD: In OpenSolaris they only check if rename source is
		 * ".." here, because "." is handled in their lookup. This is
		 * not the case for FreeBSD, so we check for "." explicitly.
		 */
		if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0)
			serr = EINVAL;
		ZFS_EXIT(zfsvfs);
		return (serr);
	}
	if (terr) {
		/* Source lock succeeded; undo it before reporting terr. */
		zfs_dirent_unlock(sdl);
		VN_RELE(ZTOV(szp));

		if (sdzp == tdzp)
			rw_exit(&sdzp->z_name_lock);

		if (strcmp(tnm, "..") == 0)
			terr = EINVAL;
		ZFS_EXIT(zfsvfs);
		return (terr);
	}

	/*
	 * Must have write access at the source to remove the old entry
	 * and write access at the target to create the new entry.
	 * Note that if target and source are the same, this can be
	 * done in a single check.
	 */

	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
		goto out;

	if (ZTOV(szp)->v_type == VDIR) {
		/*
		 * Check to make sure rename is valid.
		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
		 */
		if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
			goto out;
	}

	/*
	 * Does target exist?
	 */
	if (tzp) {
		/*
		 * Source and target must be the same type.
		 */
		if (ZTOV(szp)->v_type == VDIR) {
			if (ZTOV(tzp)->v_type != VDIR) {
				error = ENOTDIR;
				goto out;
			}
		} else {
			if (ZTOV(tzp)->v_type == VDIR) {
				error = EISDIR;
				goto out;
			}
		}
		/*
		 * POSIX dictates that when the source and target
		 * entries refer to the same file object, rename
		 * must do nothing and exit without error.
		 */
		if (szp->z_id == tzp->z_id) {
			error = 0;
			goto out;
		}
	}

	vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
	if (tzp)
		vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);

	/*
	 * notify the target directory if it is not the same
	 * as source directory.
	 */
	if (tdvp != sdvp) {
		vnevent_rename_dest_dir(tdvp, ct);
	}

	/*
	 * Build the transaction: holds are declared for the source znode,
	 * both directories and their ZAPs, the existing target (if any)
	 * and the unlinked set, before the transaction is assigned.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
	if (sdzp != tdzp) {
		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, tdzp);
	}
	if (tzp) {
		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, tzp);
	}

	zfs_sa_upgrade_txholds(tx, szp);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	error = dmu_tx_assign(tx, TXG_NOWAIT);
	if (error) {
		/*
		 * Assignment failed.  Release every lock and vnode hold
		 * taken in this attempt; on ERESTART wait on the
		 * transaction and start over from top, otherwise fail.
		 */
		if (zl != NULL)
			zfs_rename_unlock(&zl);
		zfs_dirent_unlock(sdl);
		zfs_dirent_unlock(tdl);

		if (sdzp == tdzp)
			rw_exit(&sdzp->z_name_lock);

		VN_RELE(ZTOV(szp));
		if (tzp)
			VN_RELE(ZTOV(tzp));
		if (error == ERESTART) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (tzp)	/* Attempt to remove the existing target */
		error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);

	if (error == 0) {
		error = zfs_link_create(tdl, szp, tx, ZRENAMING);
		if (error == 0) {
			/* Flag the renamed object and persist its flags. */
			szp->z_pflags |= ZFS_AV_MODIFIED;

			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
			ASSERT0(error);

			error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
			if (error == 0) {
				zfs_log_rename(zilog, tx, TX_RENAME |
				    (flags & FIGNORECASE ? TX_CI : 0), sdzp,
				    sdl->dl_name, tdzp, tdl->dl_name, szp);

				/*
				 * Update path information for the target vnode
				 */
				vn_renamepath(tdvp, ZTOV(szp), tnm,
				    strlen(tnm));
			} else {
				/*
				 * At this point, we have successfully created
				 * the target name, but have failed to remove
				 * the source name.  Since the create was done
				 * with the ZRENAMING flag, there are
				 * complications; for one, the link count is
				 * wrong.  The easiest way to deal with this
				 * is to remove the newly created target, and
				 * return the original error.  This must
				 * succeed; fortunately, it is very unlikely to
				 * fail, since we just created it.
				 */
				VERIFY3U(zfs_link_destroy(tdl, szp, tx,
				    ZRENAMING, NULL), ==, 0);
			}
		}
#ifdef FREEBSD_NAMECACHE
		/*
		 * On success, purge name cache entries for both
		 * directories, the renamed vnode and the replaced target.
		 */
		if (error == 0) {
			cache_purge(sdvp);
			cache_purge(tdvp);
			cache_purge(ZTOV(szp));
			if (tzp)
				cache_purge(ZTOV(tzp));
		}
#endif
	}

	/* The transaction is committed whether or not the steps above failed. */
	dmu_tx_commit(tx);
	/*
	 * Common exit once both dirent locks are held: release the parent
	 * lock chain, the dirent locks, the shared name lock and the vnode
	 * holds.
	 */
out:
	if (zl != NULL)
		zfs_rename_unlock(&zl);

	zfs_dirent_unlock(sdl);
	zfs_dirent_unlock(tdl);

	if (sdzp == tdzp)
		rw_exit(&sdzp->z_name_lock);


	VN_RELE(ZTOV(szp));
	if (tzp)
		VN_RELE(ZTOV(tzp));

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);

	return (error);
}
3964168404Spjd
3965168404Spjd/*
3966168404Spjd * Insert the indicated symbolic reference entry into the directory.
3967168404Spjd *
3968168404Spjd *	IN:	dvp	- Directory to contain new symbolic link.
3969168404Spjd *		link	- Name for new symlink entry.
3970168404Spjd *		vap	- Attributes of new entry.
3971168404Spjd *		target	- Target path of new symlink.
3972168404Spjd *		cr	- credentials of caller.
3973185029Spjd *		ct	- caller context
3974185029Spjd *		flags	- case flags
3975168404Spjd *
3976168404Spjd *	RETURN:	0 if success
3977168404Spjd *		error code if failure
3978168404Spjd *
3979168404Spjd * Timestamps:
3980168404Spjd *	dvp - ctime|mtime updated
3981168404Spjd */
3982185029Spjd/*ARGSUSED*/
3983168404Spjdstatic int
3984185029Spjdzfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
3985185029Spjd    cred_t *cr, kthread_t *td)
3986168404Spjd{
3987168404Spjd	znode_t		*zp, *dzp = VTOZ(dvp);
3988168404Spjd	zfs_dirlock_t	*dl;
3989168404Spjd	dmu_tx_t	*tx;
3990168404Spjd	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
3991185029Spjd	zilog_t		*zilog;
3992219089Spjd	uint64_t	len = strlen(link);
3993168404Spjd	int		error;
3994185029Spjd	int		zflg = ZNEW;
3995209962Smm	zfs_acl_ids_t	acl_ids;
3996209962Smm	boolean_t	fuid_dirtied;
3997219089Spjd	uint64_t	txtype = TX_SYMLINK;
3998185029Spjd	int		flags = 0;
3999168404Spjd
4000168962Spjd	ASSERT(vap->va_type == VLNK);
4001168404Spjd
4002168404Spjd	ZFS_ENTER(zfsvfs);
4003185029Spjd	ZFS_VERIFY_ZP(dzp);
4004185029Spjd	zilog = zfsvfs->z_log;
4005185029Spjd
4006185029Spjd	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
4007185029Spjd	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4008185029Spjd		ZFS_EXIT(zfsvfs);
4009185029Spjd		return (EILSEQ);
4010185029Spjd	}
4011185029Spjd	if (flags & FIGNORECASE)
4012185029Spjd		zflg |= ZCILOOK;
4013168404Spjd
4014168404Spjd	if (len > MAXPATHLEN) {
4015168404Spjd		ZFS_EXIT(zfsvfs);
4016168404Spjd		return (ENAMETOOLONG);
4017168404Spjd	}
4018168404Spjd
4019219089Spjd	if ((error = zfs_acl_ids_create(dzp, 0,
4020219089Spjd	    vap, cr, NULL, &acl_ids)) != 0) {
4021219089Spjd		ZFS_EXIT(zfsvfs);
4022219089Spjd		return (error);
4023219089Spjd	}
4024219089Spjdtop:
4025168404Spjd	/*
4026168404Spjd	 * Attempt to lock directory; fail if entry already exists.
4027168404Spjd	 */
4028185029Spjd	error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
4029185029Spjd	if (error) {
4030219089Spjd		zfs_acl_ids_free(&acl_ids);
4031168404Spjd		ZFS_EXIT(zfsvfs);
4032168404Spjd		return (error);
4033168404Spjd	}
4034168404Spjd
4035219089Spjd	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4036219089Spjd		zfs_acl_ids_free(&acl_ids);
4037219089Spjd		zfs_dirent_unlock(dl);
4038219089Spjd		ZFS_EXIT(zfsvfs);
4039219089Spjd		return (error);
4040219089Spjd	}
4041219089Spjd
4042209962Smm	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
4043209962Smm		zfs_acl_ids_free(&acl_ids);
4044209962Smm		zfs_dirent_unlock(dl);
4045209962Smm		ZFS_EXIT(zfsvfs);
4046209962Smm		return (EDQUOT);
4047209962Smm	}
4048168404Spjd	tx = dmu_tx_create(zfsvfs->z_os);
4049209962Smm	fuid_dirtied = zfsvfs->z_fuid_dirty;
4050168404Spjd	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
4051168404Spjd	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4052219089Spjd	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
4053219089Spjd	    ZFS_SA_BASE_ATTR_SIZE + len);
4054219089Spjd	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
4055219089Spjd	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
4056219089Spjd		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
4057219089Spjd		    acl_ids.z_aclp->z_acl_bytes);
4058219089Spjd	}
4059209962Smm	if (fuid_dirtied)
4060209962Smm		zfs_fuid_txhold(zfsvfs, tx);
4061209962Smm	error = dmu_tx_assign(tx, TXG_NOWAIT);
4062168404Spjd	if (error) {
4063168404Spjd		zfs_dirent_unlock(dl);
4064209962Smm		if (error == ERESTART) {
4065168404Spjd			dmu_tx_wait(tx);
4066168404Spjd			dmu_tx_abort(tx);
4067168404Spjd			goto top;
4068168404Spjd		}
4069219089Spjd		zfs_acl_ids_free(&acl_ids);
4070168404Spjd		dmu_tx_abort(tx);
4071168404Spjd		ZFS_EXIT(zfsvfs);
4072168404Spjd		return (error);
4073168404Spjd	}
4074168404Spjd
4075168404Spjd	/*
4076168404Spjd	 * Create a new object for the symlink.
4077219089Spjd	 * for version 4 ZPL datsets the symlink will be an SA attribute
4078168404Spjd	 */
4079219089Spjd	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
4080168404Spjd
4081219089Spjd	if (fuid_dirtied)
4082219089Spjd		zfs_fuid_sync(zfsvfs, tx);
4083209962Smm
4084219089Spjd	mutex_enter(&zp->z_lock);
4085219089Spjd	if (zp->z_is_sa)
4086219089Spjd		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
4087219089Spjd		    link, len, tx);
4088219089Spjd	else
4089219089Spjd		zfs_sa_symlink(zp, link, len, tx);
4090219089Spjd	mutex_exit(&zp->z_lock);
4091168404Spjd
4092219089Spjd	zp->z_size = len;
4093219089Spjd	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
4094219089Spjd	    &zp->z_size, sizeof (zp->z_size), tx);
4095168404Spjd	/*
4096168404Spjd	 * Insert the new object into the directory.
4097168404Spjd	 */
4098168404Spjd	(void) zfs_link_create(dl, zp, tx, ZNEW);
4099168404Spjd
4100219089Spjd	if (flags & FIGNORECASE)
4101219089Spjd		txtype |= TX_CI;
4102219089Spjd	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
4103219089Spjd	*vpp = ZTOV(zp);
4104219089Spjd
4105209962Smm	zfs_acl_ids_free(&acl_ids);
4106209962Smm
4107168404Spjd	dmu_tx_commit(tx);
4108168404Spjd
4109168404Spjd	zfs_dirent_unlock(dl);
4110168404Spjd
4111219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4112219089Spjd		zil_commit(zilog, 0);
4113219089Spjd
4114168404Spjd	ZFS_EXIT(zfsvfs);
4115168404Spjd	return (error);
4116168404Spjd}
4117168404Spjd
4118168404Spjd/*
4119168404Spjd * Return, in the buffer contained in the provided uio structure,
4120168404Spjd * the symbolic path referred to by vp.
4121168404Spjd *
4122168404Spjd *	IN:	vp	- vnode of symbolic link.
4123168404Spjd *		uoip	- structure to contain the link path.
4124168404Spjd *		cr	- credentials of caller.
4125185029Spjd *		ct	- caller context
4126168404Spjd *
4127168404Spjd *	OUT:	uio	- structure to contain the link path.
4128168404Spjd *
4129168404Spjd *	RETURN:	0 if success
4130168404Spjd *		error code if failure
4131168404Spjd *
4132168404Spjd * Timestamps:
4133168404Spjd *	vp - atime updated
4134168404Spjd */
4135168404Spjd/* ARGSUSED */
4136168404Spjdstatic int
4137185029Spjdzfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
4138168404Spjd{
4139168404Spjd	znode_t		*zp = VTOZ(vp);
4140168404Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4141168404Spjd	int		error;
4142168404Spjd
4143168404Spjd	ZFS_ENTER(zfsvfs);
4144185029Spjd	ZFS_VERIFY_ZP(zp);
4145168404Spjd
4146219089Spjd	mutex_enter(&zp->z_lock);
4147219089Spjd	if (zp->z_is_sa)
4148219089Spjd		error = sa_lookup_uio(zp->z_sa_hdl,
4149219089Spjd		    SA_ZPL_SYMLINK(zfsvfs), uio);
4150219089Spjd	else
4151219089Spjd		error = zfs_sa_readlink(zp, uio);
4152219089Spjd	mutex_exit(&zp->z_lock);
4153168404Spjd
4154168404Spjd	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4155219089Spjd
4156168404Spjd	ZFS_EXIT(zfsvfs);
4157168404Spjd	return (error);
4158168404Spjd}
4159168404Spjd
4160168404Spjd/*
4161168404Spjd * Insert a new entry into directory tdvp referencing svp.
4162168404Spjd *
4163168404Spjd *	IN:	tdvp	- Directory to contain new entry.
4164168404Spjd *		svp	- vnode of new entry.
4165168404Spjd *		name	- name of new entry.
4166168404Spjd *		cr	- credentials of caller.
4167185029Spjd *		ct	- caller context
4168168404Spjd *
4169168404Spjd *	RETURN:	0 if success
4170168404Spjd *		error code if failure
4171168404Spjd *
4172168404Spjd * Timestamps:
4173168404Spjd *	tdvp - ctime|mtime updated
4174168404Spjd *	 svp - ctime updated
4175168404Spjd */
4176168404Spjd/* ARGSUSED */
4177168404Spjdstatic int
4178185029Spjdzfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
4179185029Spjd    caller_context_t *ct, int flags)
4180168404Spjd{
4181168404Spjd	znode_t		*dzp = VTOZ(tdvp);
4182168404Spjd	znode_t		*tzp, *szp;
4183168404Spjd	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
4184185029Spjd	zilog_t		*zilog;
4185168404Spjd	zfs_dirlock_t	*dl;
4186168404Spjd	dmu_tx_t	*tx;
4187168962Spjd	vnode_t		*realvp;
4188168404Spjd	int		error;
4189185029Spjd	int		zf = ZNEW;
4190212694Smm	uint64_t	parent;
4191185029Spjd	uid_t		owner;
4192168404Spjd
4193168404Spjd	ASSERT(tdvp->v_type == VDIR);
4194168404Spjd
4195168404Spjd	ZFS_ENTER(zfsvfs);
4196185029Spjd	ZFS_VERIFY_ZP(dzp);
4197185029Spjd	zilog = zfsvfs->z_log;
4198168404Spjd
4199185029Spjd	if (VOP_REALVP(svp, &realvp, ct) == 0)
4200168962Spjd		svp = realvp;
4201168962Spjd
4202212694Smm	/*
4203212694Smm	 * POSIX dictates that we return EPERM here.
4204212694Smm	 * Better choices include ENOTSUP or EISDIR.
4205212694Smm	 */
4206212694Smm	if (svp->v_type == VDIR) {
4207168404Spjd		ZFS_EXIT(zfsvfs);
4208212694Smm		return (EPERM);
4209212694Smm	}
4210212694Smm
4211212694Smm	if (svp->v_vfsp != tdvp->v_vfsp || zfsctl_is_node(svp)) {
4212212694Smm		ZFS_EXIT(zfsvfs);
4213168404Spjd		return (EXDEV);
4214168404Spjd	}
4215212694Smm
4216185029Spjd	szp = VTOZ(svp);
4217185029Spjd	ZFS_VERIFY_ZP(szp);
4218168404Spjd
4219212694Smm	/* Prevent links to .zfs/shares files */
4220212694Smm
4221219089Spjd	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
4222219089Spjd	    &parent, sizeof (uint64_t))) != 0) {
4223212694Smm		ZFS_EXIT(zfsvfs);
4224219089Spjd		return (error);
4225219089Spjd	}
4226219089Spjd	if (parent == zfsvfs->z_shares_dir) {
4227219089Spjd		ZFS_EXIT(zfsvfs);
4228212694Smm		return (EPERM);
4229212694Smm	}
4230212694Smm
4231185029Spjd	if (zfsvfs->z_utf8 && u8_validate(name,
4232185029Spjd	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4233185029Spjd		ZFS_EXIT(zfsvfs);
4234185029Spjd		return (EILSEQ);
4235185029Spjd	}
4236185029Spjd	if (flags & FIGNORECASE)
4237185029Spjd		zf |= ZCILOOK;
4238185029Spjd
4239168404Spjd	/*
4240168404Spjd	 * We do not support links between attributes and non-attributes
4241168404Spjd	 * because of the potential security risk of creating links
4242168404Spjd	 * into "normal" file space in order to circumvent restrictions
4243168404Spjd	 * imposed in attribute space.
4244168404Spjd	 */
4245219089Spjd	if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
4246168404Spjd		ZFS_EXIT(zfsvfs);
4247168404Spjd		return (EINVAL);
4248168404Spjd	}
4249168404Spjd
4250168404Spjd
4251219089Spjd	owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
4252219089Spjd	if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) {
4253168404Spjd		ZFS_EXIT(zfsvfs);
4254168404Spjd		return (EPERM);
4255168404Spjd	}
4256168404Spjd
4257185029Spjd	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4258168404Spjd		ZFS_EXIT(zfsvfs);
4259168404Spjd		return (error);
4260168404Spjd	}
4261168404Spjd
4262212694Smmtop:
4263168404Spjd	/*
4264168404Spjd	 * Attempt to lock directory; fail if entry already exists.
4265168404Spjd	 */
4266185029Spjd	error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
4267185029Spjd	if (error) {
4268168404Spjd		ZFS_EXIT(zfsvfs);
4269168404Spjd		return (error);
4270168404Spjd	}
4271168404Spjd
4272168404Spjd	tx = dmu_tx_create(zfsvfs->z_os);
4273219089Spjd	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
4274168404Spjd	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4275219089Spjd	zfs_sa_upgrade_txholds(tx, szp);
4276219089Spjd	zfs_sa_upgrade_txholds(tx, dzp);
4277209962Smm	error = dmu_tx_assign(tx, TXG_NOWAIT);
4278168404Spjd	if (error) {
4279168404Spjd		zfs_dirent_unlock(dl);
4280209962Smm		if (error == ERESTART) {
4281168404Spjd			dmu_tx_wait(tx);
4282168404Spjd			dmu_tx_abort(tx);
4283168404Spjd			goto top;
4284168404Spjd		}
4285168404Spjd		dmu_tx_abort(tx);
4286168404Spjd		ZFS_EXIT(zfsvfs);
4287168404Spjd		return (error);
4288168404Spjd	}
4289168404Spjd
4290168404Spjd	error = zfs_link_create(dl, szp, tx, 0);
4291168404Spjd
4292185029Spjd	if (error == 0) {
4293185029Spjd		uint64_t txtype = TX_LINK;
4294185029Spjd		if (flags & FIGNORECASE)
4295185029Spjd			txtype |= TX_CI;
4296185029Spjd		zfs_log_link(zilog, tx, txtype, dzp, szp, name);
4297185029Spjd	}
4298168404Spjd
4299168404Spjd	dmu_tx_commit(tx);
4300168404Spjd
4301168404Spjd	zfs_dirent_unlock(dl);
4302168404Spjd
4303185029Spjd	if (error == 0) {
4304185029Spjd		vnevent_link(svp, ct);
4305185029Spjd	}
4306185029Spjd
4307219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4308219089Spjd		zil_commit(zilog, 0);
4309219089Spjd
4310168404Spjd	ZFS_EXIT(zfsvfs);
4311168404Spjd	return (error);
4312168404Spjd}
4313168404Spjd
4314219089Spjd#ifdef sun
4315219089Spjd/*
4316219089Spjd * zfs_null_putapage() is used when the file system has been force
4317219089Spjd * unmounted. It just drops the pages.
4318219089Spjd */
4319219089Spjd/* ARGSUSED */
4320219089Spjdstatic int
4321219089Spjdzfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
4322219089Spjd		size_t *lenp, int flags, cred_t *cr)
4323219089Spjd{
4324219089Spjd	pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR);
4325219089Spjd	return (0);
4326219089Spjd}
4327219089Spjd
4328219089Spjd/*
4329219089Spjd * Push a page out to disk, klustering if possible.
4330219089Spjd *
4331219089Spjd *	IN:	vp	- file to push page to.
4332219089Spjd *		pp	- page to push.
4333219089Spjd *		flags	- additional flags.
4334219089Spjd *		cr	- credentials of caller.
4335219089Spjd *
4336219089Spjd *	OUT:	offp	- start of range pushed.
4337219089Spjd *		lenp	- len of range pushed.
4338219089Spjd *
4339219089Spjd *	RETURN:	0 if success
4340219089Spjd *		error code if failure
4341219089Spjd *
4342219089Spjd * NOTE: callers must have locked the page to be pushed.  On
4343219089Spjd * exit, the page (and all other pages in the kluster) must be
4344219089Spjd * unlocked.
4345219089Spjd */
4346219089Spjd/* ARGSUSED */
4347219089Spjdstatic int
4348219089Spjdzfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
4349219089Spjd		size_t *lenp, int flags, cred_t *cr)
4350219089Spjd{
4351219089Spjd	znode_t		*zp = VTOZ(vp);
4352219089Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4353219089Spjd	dmu_tx_t	*tx;
4354219089Spjd	u_offset_t	off, koff;
4355219089Spjd	size_t		len, klen;
4356219089Spjd	int		err;
4357219089Spjd
4358219089Spjd	off = pp->p_offset;
4359219089Spjd	len = PAGESIZE;
4360219089Spjd	/*
4361219089Spjd	 * If our blocksize is bigger than the page size, try to kluster
4362219089Spjd	 * multiple pages so that we write a full block (thus avoiding
4363219089Spjd	 * a read-modify-write).
4364219089Spjd	 */
4365219089Spjd	if (off < zp->z_size && zp->z_blksz > PAGESIZE) {
4366219089Spjd		klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
4367219089Spjd		koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0;
4368219089Spjd		ASSERT(koff <= zp->z_size);
4369219089Spjd		if (koff + klen > zp->z_size)
4370219089Spjd			klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE);
4371219089Spjd		pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
4372219089Spjd	}
4373219089Spjd	ASSERT3U(btop(len), ==, btopr(len));
4374219089Spjd
4375219089Spjd	/*
4376219089Spjd	 * Can't push pages past end-of-file.
4377219089Spjd	 */
4378219089Spjd	if (off >= zp->z_size) {
4379219089Spjd		/* ignore all pages */
4380219089Spjd		err = 0;
4381219089Spjd		goto out;
4382219089Spjd	} else if (off + len > zp->z_size) {
4383219089Spjd		int npages = btopr(zp->z_size - off);
4384219089Spjd		page_t *trunc;
4385219089Spjd
4386219089Spjd		page_list_break(&pp, &trunc, npages);
4387219089Spjd		/* ignore pages past end of file */
4388219089Spjd		if (trunc)
4389219089Spjd			pvn_write_done(trunc, flags);
4390219089Spjd		len = zp->z_size - off;
4391219089Spjd	}
4392219089Spjd
4393219089Spjd	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
4394219089Spjd	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
4395219089Spjd		err = EDQUOT;
4396219089Spjd		goto out;
4397219089Spjd	}
4398219089Spjdtop:
4399219089Spjd	tx = dmu_tx_create(zfsvfs->z_os);
4400219089Spjd	dmu_tx_hold_write(tx, zp->z_id, off, len);
4401219089Spjd
4402219089Spjd	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4403219089Spjd	zfs_sa_upgrade_txholds(tx, zp);
4404219089Spjd	err = dmu_tx_assign(tx, TXG_NOWAIT);
4405219089Spjd	if (err != 0) {
4406219089Spjd		if (err == ERESTART) {
4407219089Spjd			dmu_tx_wait(tx);
4408219089Spjd			dmu_tx_abort(tx);
4409219089Spjd			goto top;
4410219089Spjd		}
4411219089Spjd		dmu_tx_abort(tx);
4412219089Spjd		goto out;
4413219089Spjd	}
4414219089Spjd
4415219089Spjd	if (zp->z_blksz <= PAGESIZE) {
4416219089Spjd		caddr_t va = zfs_map_page(pp, S_READ);
4417219089Spjd		ASSERT3U(len, <=, PAGESIZE);
4418219089Spjd		dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
4419219089Spjd		zfs_unmap_page(pp, va);
4420219089Spjd	} else {
4421219089Spjd		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
4422219089Spjd	}
4423219089Spjd
4424219089Spjd	if (err == 0) {
4425219089Spjd		uint64_t mtime[2], ctime[2];
4426219089Spjd		sa_bulk_attr_t bulk[3];
4427219089Spjd		int count = 0;
4428219089Spjd
4429219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
4430219089Spjd		    &mtime, 16);
4431219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
4432219089Spjd		    &ctime, 16);
4433219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
4434219089Spjd		    &zp->z_pflags, 8);
4435219089Spjd		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
4436219089Spjd		    B_TRUE);
4437219089Spjd		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
4438219089Spjd	}
4439219089Spjd	dmu_tx_commit(tx);
4440219089Spjd
4441219089Spjdout:
4442219089Spjd	pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
4443219089Spjd	if (offp)
4444219089Spjd		*offp = off;
4445219089Spjd	if (lenp)
4446219089Spjd		*lenp = len;
4447219089Spjd
4448219089Spjd	return (err);
4449219089Spjd}
4450219089Spjd
4451219089Spjd/*
4452219089Spjd * Copy the portion of the file indicated from pages into the file.
4453219089Spjd * The pages are stored in a page list attached to the files vnode.
4454219089Spjd *
4455219089Spjd *	IN:	vp	- vnode of file to push page data to.
4456219089Spjd *		off	- position in file to put data.
4457219089Spjd *		len	- amount of data to write.
4458219089Spjd *		flags	- flags to control the operation.
4459219089Spjd *		cr	- credentials of caller.
4460219089Spjd *		ct	- caller context.
4461219089Spjd *
4462219089Spjd *	RETURN:	0 if success
4463219089Spjd *		error code if failure
4464219089Spjd *
4465219089Spjd * Timestamps:
4466219089Spjd *	vp - ctime|mtime updated
4467219089Spjd */
4468185029Spjd/*ARGSUSED*/
4469219089Spjdstatic int
4470219089Spjdzfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
4471219089Spjd    caller_context_t *ct)
4472219089Spjd{
4473219089Spjd	znode_t		*zp = VTOZ(vp);
4474219089Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4475219089Spjd	page_t		*pp;
4476219089Spjd	size_t		io_len;
4477219089Spjd	u_offset_t	io_off;
4478219089Spjd	uint_t		blksz;
4479219089Spjd	rl_t		*rl;
4480219089Spjd	int		error = 0;
4481219089Spjd
4482219089Spjd	ZFS_ENTER(zfsvfs);
4483219089Spjd	ZFS_VERIFY_ZP(zp);
4484219089Spjd
4485219089Spjd	/*
4486219089Spjd	 * Align this request to the file block size in case we kluster.
4487219089Spjd	 * XXX - this can result in pretty aggresive locking, which can
4488219089Spjd	 * impact simultanious read/write access.  One option might be
4489219089Spjd	 * to break up long requests (len == 0) into block-by-block
4490219089Spjd	 * operations to get narrower locking.
4491219089Spjd	 */
4492219089Spjd	blksz = zp->z_blksz;
4493219089Spjd	if (ISP2(blksz))
4494219089Spjd		io_off = P2ALIGN_TYPED(off, blksz, u_offset_t);
4495219089Spjd	else
4496219089Spjd		io_off = 0;
4497219089Spjd	if (len > 0 && ISP2(blksz))
4498219089Spjd		io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
4499219089Spjd	else
4500219089Spjd		io_len = 0;
4501219089Spjd
4502219089Spjd	if (io_len == 0) {
4503219089Spjd		/*
4504219089Spjd		 * Search the entire vp list for pages >= io_off.
4505219089Spjd		 */
4506219089Spjd		rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER);
4507219089Spjd		error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
4508219089Spjd		goto out;
4509219089Spjd	}
4510219089Spjd	rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER);
4511219089Spjd
4512219089Spjd	if (off > zp->z_size) {
4513219089Spjd		/* past end of file */
4514219089Spjd		zfs_range_unlock(rl);
4515219089Spjd		ZFS_EXIT(zfsvfs);
4516219089Spjd		return (0);
4517219089Spjd	}
4518219089Spjd
4519219089Spjd	len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off);
4520219089Spjd
4521219089Spjd	for (off = io_off; io_off < off + len; io_off += io_len) {
4522219089Spjd		if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
4523219089Spjd			pp = page_lookup(vp, io_off,
4524219089Spjd			    (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
4525219089Spjd		} else {
4526219089Spjd			pp = page_lookup_nowait(vp, io_off,
4527219089Spjd			    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
4528219089Spjd		}
4529219089Spjd
4530219089Spjd		if (pp != NULL && pvn_getdirty(pp, flags)) {
4531219089Spjd			int err;
4532219089Spjd
4533219089Spjd			/*
4534219089Spjd			 * Found a dirty page to push
4535219089Spjd			 */
4536219089Spjd			err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr);
4537219089Spjd			if (err)
4538219089Spjd				error = err;
4539219089Spjd		} else {
4540219089Spjd			io_len = PAGESIZE;
4541219089Spjd		}
4542219089Spjd	}
4543219089Spjdout:
4544219089Spjd	zfs_range_unlock(rl);
4545219089Spjd	if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4546219089Spjd		zil_commit(zfsvfs->z_log, zp->z_id);
4547219089Spjd	ZFS_EXIT(zfsvfs);
4548219089Spjd	return (error);
4549219089Spjd}
4550219089Spjd#endif	/* sun */
4551219089Spjd
4552219089Spjd/*ARGSUSED*/
4553168962Spjdvoid
4554185029Spjdzfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4555168404Spjd{
4556168962Spjd	znode_t	*zp = VTOZ(vp);
4557168962Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4558168962Spjd	int error;
4559168404Spjd
4560185029Spjd	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4561219089Spjd	if (zp->z_sa_hdl == NULL) {
4562185029Spjd		/*
4563185029Spjd		 * The fs has been unmounted, or we did a
4564185029Spjd		 * suspend/resume and this file no longer exists.
4565185029Spjd		 */
4566168404Spjd		VI_LOCK(vp);
4567219089Spjd		ASSERT(vp->v_count <= 1);
4568219089Spjd		vp->v_count = 0;
4569196299Spjd		VI_UNLOCK(vp);
4570234607Strasz		vrecycle(vp);
4571185029Spjd		rw_exit(&zfsvfs->z_teardown_inactive_lock);
4572168962Spjd		return;
4573168404Spjd	}
4574168404Spjd
4575168404Spjd	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
4576168404Spjd		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
4577168404Spjd
4578219089Spjd		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4579219089Spjd		zfs_sa_upgrade_txholds(tx, zp);
4580168404Spjd		error = dmu_tx_assign(tx, TXG_WAIT);
4581168404Spjd		if (error) {
4582168404Spjd			dmu_tx_abort(tx);
4583168404Spjd		} else {
4584168404Spjd			mutex_enter(&zp->z_lock);
4585219089Spjd			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
4586219089Spjd			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
4587168404Spjd			zp->z_atime_dirty = 0;
4588168404Spjd			mutex_exit(&zp->z_lock);
4589168404Spjd			dmu_tx_commit(tx);
4590168404Spjd		}
4591168404Spjd	}
4592168404Spjd
4593168404Spjd	zfs_zinactive(zp);
4594185029Spjd	rw_exit(&zfsvfs->z_teardown_inactive_lock);
4595168404Spjd}
4596168404Spjd
4597219089Spjd#ifdef sun
4598219089Spjd/*
4599219089Spjd * Bounds-check the seek operation.
4600219089Spjd *
4601219089Spjd *	IN:	vp	- vnode seeking within
4602219089Spjd *		ooff	- old file offset
4603219089Spjd *		noffp	- pointer to new file offset
4604219089Spjd *		ct	- caller context
4605219089Spjd *
4606219089Spjd *	RETURN:	0 if success
4607219089Spjd *		EINVAL if new offset invalid
4608219089Spjd */
4609219089Spjd/* ARGSUSED */
4610219089Spjdstatic int
4611219089Spjdzfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
4612219089Spjd    caller_context_t *ct)
4613219089Spjd{
4614219089Spjd	if (vp->v_type == VDIR)
4615219089Spjd		return (0);
4616219089Spjd	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
4617219089Spjd}
4618219089Spjd
4619219089Spjd/*
4620219089Spjd * Pre-filter the generic locking function to trap attempts to place
4621219089Spjd * a mandatory lock on a memory mapped file.
4622219089Spjd */
4623219089Spjdstatic int
4624219089Spjdzfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
4625219089Spjd    flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct)
4626219089Spjd{
4627219089Spjd	znode_t *zp = VTOZ(vp);
4628219089Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4629219089Spjd
4630219089Spjd	ZFS_ENTER(zfsvfs);
4631219089Spjd	ZFS_VERIFY_ZP(zp);
4632219089Spjd
4633219089Spjd	/*
4634219089Spjd	 * We are following the UFS semantics with respect to mapcnt
4635219089Spjd	 * here: If we see that the file is mapped already, then we will
4636219089Spjd	 * return an error, but we don't worry about races between this
4637219089Spjd	 * function and zfs_map().
4638219089Spjd	 */
4639219089Spjd	if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) {
4640219089Spjd		ZFS_EXIT(zfsvfs);
4641219089Spjd		return (EAGAIN);
4642219089Spjd	}
4643219089Spjd	ZFS_EXIT(zfsvfs);
4644219089Spjd	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
4645219089Spjd}
4646219089Spjd
4647219089Spjd/*
4648219089Spjd * If we can't find a page in the cache, we will create a new page
4649219089Spjd * and fill it with file data.  For efficiency, we may try to fill
4650219089Spjd * multiple pages at once (klustering) to fill up the supplied page
4651219089Spjd * list.  Note that the pages to be filled are held with an exclusive
4652219089Spjd * lock to prevent access by other threads while they are being filled.
4653219089Spjd */
4654219089Spjdstatic int
4655219089Spjdzfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
4656219089Spjd    caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
4657219089Spjd{
4658219089Spjd	znode_t *zp = VTOZ(vp);
4659219089Spjd	page_t *pp, *cur_pp;
4660219089Spjd	objset_t *os = zp->z_zfsvfs->z_os;
4661219089Spjd	u_offset_t io_off, total;
4662219089Spjd	size_t io_len;
4663219089Spjd	int err;
4664219089Spjd
4665219089Spjd	if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
4666219089Spjd		/*
4667219089Spjd		 * We only have a single page, don't bother klustering
4668219089Spjd		 */
4669219089Spjd		io_off = off;
4670219089Spjd		io_len = PAGESIZE;
4671219089Spjd		pp = page_create_va(vp, io_off, io_len,
4672219089Spjd		    PG_EXCL | PG_WAIT, seg, addr);
4673219089Spjd	} else {
4674219089Spjd		/*
4675219089Spjd		 * Try to find enough pages to fill the page list
4676219089Spjd		 */
4677219089Spjd		pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
4678219089Spjd		    &io_len, off, plsz, 0);
4679219089Spjd	}
4680219089Spjd	if (pp == NULL) {
4681219089Spjd		/*
4682219089Spjd		 * The page already exists, nothing to do here.
4683219089Spjd		 */
4684219089Spjd		*pl = NULL;
4685219089Spjd		return (0);
4686219089Spjd	}
4687219089Spjd
4688219089Spjd	/*
4689219089Spjd	 * Fill the pages in the kluster.
4690219089Spjd	 */
4691219089Spjd	cur_pp = pp;
4692219089Spjd	for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
4693219089Spjd		caddr_t va;
4694219089Spjd
4695219089Spjd		ASSERT3U(io_off, ==, cur_pp->p_offset);
4696219089Spjd		va = zfs_map_page(cur_pp, S_WRITE);
4697219089Spjd		err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
4698219089Spjd		    DMU_READ_PREFETCH);
4699219089Spjd		zfs_unmap_page(cur_pp, va);
4700219089Spjd		if (err) {
4701219089Spjd			/* On error, toss the entire kluster */
4702219089Spjd			pvn_read_done(pp, B_ERROR);
4703219089Spjd			/* convert checksum errors into IO errors */
4704219089Spjd			if (err == ECKSUM)
4705219089Spjd				err = EIO;
4706219089Spjd			return (err);
4707219089Spjd		}
4708219089Spjd		cur_pp = cur_pp->p_next;
4709219089Spjd	}
4710219089Spjd
4711219089Spjd	/*
4712219089Spjd	 * Fill in the page list array from the kluster starting
4713219089Spjd	 * from the desired offset `off'.
4714219089Spjd	 * NOTE: the page list will always be null terminated.
4715219089Spjd	 */
4716219089Spjd	pvn_plist_init(pp, pl, plsz, off, io_len, rw);
4717219089Spjd	ASSERT(pl == NULL || (*pl)->p_offset == off);
4718219089Spjd
4719219089Spjd	return (0);
4720219089Spjd}
4721219089Spjd
4722219089Spjd/*
4723219089Spjd * Return pointers to the pages for the file region [off, off + len]
4724219089Spjd * in the pl array.  If plsz is greater than len, this function may
4725219089Spjd * also return page pointers from after the specified region
4726219089Spjd * (i.e. the region [off, off + plsz]).  These additional pages are
4727219089Spjd * only returned if they are already in the cache, or were created as
4728219089Spjd * part of a klustered read.
4729219089Spjd *
4730219089Spjd *	IN:	vp	- vnode of file to get data from.
4731219089Spjd *		off	- position in file to get data from.
4732219089Spjd *		len	- amount of data to retrieve.
4733219089Spjd *		plsz	- length of provided page list.
4734219089Spjd *		seg	- segment to obtain pages for.
4735219089Spjd *		addr	- virtual address of fault.
4736219089Spjd *		rw	- mode of created pages.
4737219089Spjd *		cr	- credentials of caller.
4738219089Spjd *		ct	- caller context.
4739219089Spjd *
4740219089Spjd *	OUT:	protp	- protection mode of created pages.
4741219089Spjd *		pl	- list of pages created.
4742219089Spjd *
4743219089Spjd *	RETURN:	0 if success
4744219089Spjd *		error code if failure
4745219089Spjd *
4746219089Spjd * Timestamps:
4747219089Spjd *	vp - atime updated
4748219089Spjd */
4749219089Spjd/* ARGSUSED */
4750219089Spjdstatic int
4751219089Spjdzfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
4752219089Spjd	page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
4753219089Spjd	enum seg_rw rw, cred_t *cr, caller_context_t *ct)
4754219089Spjd{
4755219089Spjd	znode_t		*zp = VTOZ(vp);
4756219089Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4757219089Spjd	page_t		**pl0 = pl;
4758219089Spjd	int		err = 0;
4759219089Spjd
4760219089Spjd	/* we do our own caching, faultahead is unnecessary */
4761219089Spjd	if (pl == NULL)
4762219089Spjd		return (0);
4763219089Spjd	else if (len > plsz)
4764219089Spjd		len = plsz;
4765219089Spjd	else
4766219089Spjd		len = P2ROUNDUP(len, PAGESIZE);
4767219089Spjd	ASSERT(plsz >= len);
4768219089Spjd
4769219089Spjd	ZFS_ENTER(zfsvfs);
4770219089Spjd	ZFS_VERIFY_ZP(zp);
4771219089Spjd
4772219089Spjd	if (protp)
4773219089Spjd		*protp = PROT_ALL;
4774219089Spjd
4775219089Spjd	/*
4776219089Spjd	 * Loop through the requested range [off, off + len) looking
4777219089Spjd	 * for pages.  If we don't find a page, we will need to create
4778219089Spjd	 * a new page and fill it with data from the file.
4779219089Spjd	 */
4780219089Spjd	while (len > 0) {
4781219089Spjd		if (*pl = page_lookup(vp, off, SE_SHARED))
4782219089Spjd			*(pl+1) = NULL;
4783219089Spjd		else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw))
4784219089Spjd			goto out;
4785219089Spjd		while (*pl) {
4786219089Spjd			ASSERT3U((*pl)->p_offset, ==, off);
4787219089Spjd			off += PAGESIZE;
4788219089Spjd			addr += PAGESIZE;
4789219089Spjd			if (len > 0) {
4790219089Spjd				ASSERT3U(len, >=, PAGESIZE);
4791219089Spjd				len -= PAGESIZE;
4792219089Spjd			}
4793219089Spjd			ASSERT3U(plsz, >=, PAGESIZE);
4794219089Spjd			plsz -= PAGESIZE;
4795219089Spjd			pl++;
4796219089Spjd		}
4797219089Spjd	}
4798219089Spjd
4799219089Spjd	/*
4800219089Spjd	 * Fill out the page array with any pages already in the cache.
4801219089Spjd	 */
4802219089Spjd	while (plsz > 0 &&
4803219089Spjd	    (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) {
4804219089Spjd			off += PAGESIZE;
4805219089Spjd			plsz -= PAGESIZE;
4806219089Spjd	}
4807219089Spjdout:
4808219089Spjd	if (err) {
4809219089Spjd		/*
4810219089Spjd		 * Release any pages we have previously locked.
4811219089Spjd		 */
4812219089Spjd		while (pl > pl0)
4813219089Spjd			page_unlock(*--pl);
4814219089Spjd	} else {
4815219089Spjd		ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4816219089Spjd	}
4817219089Spjd
4818219089Spjd	*pl = NULL;
4819219089Spjd
4820219089Spjd	ZFS_EXIT(zfsvfs);
4821219089Spjd	return (err);
4822219089Spjd}
4823219089Spjd
4824219089Spjd/*
4825219089Spjd * Request a memory map for a section of a file.  This code interacts
4826219089Spjd * with common code and the VM system as follows:
4827219089Spjd *
4828219089Spjd *	common code calls mmap(), which ends up in smmap_common()
4829219089Spjd *
4830219089Spjd *	this calls VOP_MAP(), which takes you into (say) zfs
4831219089Spjd *
4832219089Spjd *	zfs_map() calls as_map(), passing segvn_create() as the callback
4833219089Spjd *
4834219089Spjd *	segvn_create() creates the new segment and calls VOP_ADDMAP()
4835219089Spjd *
4836219089Spjd *	zfs_addmap() updates z_mapcnt
4837219089Spjd */
4838219089Spjd/*ARGSUSED*/
4839219089Spjdstatic int
4840219089Spjdzfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
4841219089Spjd    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4842219089Spjd    caller_context_t *ct)
4843219089Spjd{
4844219089Spjd	znode_t *zp = VTOZ(vp);
4845219089Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4846219089Spjd	segvn_crargs_t	vn_a;
4847219089Spjd	int		error;
4848219089Spjd
4849219089Spjd	ZFS_ENTER(zfsvfs);
4850219089Spjd	ZFS_VERIFY_ZP(zp);
4851219089Spjd
4852219089Spjd	if ((prot & PROT_WRITE) && (zp->z_pflags &
4853219089Spjd	    (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
4854219089Spjd		ZFS_EXIT(zfsvfs);
4855219089Spjd		return (EPERM);
4856219089Spjd	}
4857219089Spjd
4858219089Spjd	if ((prot & (PROT_READ | PROT_EXEC)) &&
4859219089Spjd	    (zp->z_pflags & ZFS_AV_QUARANTINED)) {
4860219089Spjd		ZFS_EXIT(zfsvfs);
4861219089Spjd		return (EACCES);
4862219089Spjd	}
4863219089Spjd
4864219089Spjd	if (vp->v_flag & VNOMAP) {
4865219089Spjd		ZFS_EXIT(zfsvfs);
4866219089Spjd		return (ENOSYS);
4867219089Spjd	}
4868219089Spjd
4869219089Spjd	if (off < 0 || len > MAXOFFSET_T - off) {
4870219089Spjd		ZFS_EXIT(zfsvfs);
4871219089Spjd		return (ENXIO);
4872219089Spjd	}
4873219089Spjd
4874219089Spjd	if (vp->v_type != VREG) {
4875219089Spjd		ZFS_EXIT(zfsvfs);
4876219089Spjd		return (ENODEV);
4877219089Spjd	}
4878219089Spjd
4879219089Spjd	/*
4880219089Spjd	 * If file is locked, disallow mapping.
4881219089Spjd	 */
4882219089Spjd	if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) {
4883219089Spjd		ZFS_EXIT(zfsvfs);
4884219089Spjd		return (EAGAIN);
4885219089Spjd	}
4886219089Spjd
4887219089Spjd	as_rangelock(as);
4888219089Spjd	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
4889219089Spjd	if (error != 0) {
4890219089Spjd		as_rangeunlock(as);
4891219089Spjd		ZFS_EXIT(zfsvfs);
4892219089Spjd		return (error);
4893219089Spjd	}
4894219089Spjd
4895219089Spjd	vn_a.vp = vp;
4896219089Spjd	vn_a.offset = (u_offset_t)off;
4897219089Spjd	vn_a.type = flags & MAP_TYPE;
4898219089Spjd	vn_a.prot = prot;
4899219089Spjd	vn_a.maxprot = maxprot;
4900219089Spjd	vn_a.cred = cr;
4901219089Spjd	vn_a.amp = NULL;
4902219089Spjd	vn_a.flags = flags & ~MAP_TYPE;
4903219089Spjd	vn_a.szc = 0;
4904219089Spjd	vn_a.lgrp_mem_policy_flags = 0;
4905219089Spjd
4906219089Spjd	error = as_map(as, *addrp, len, segvn_create, &vn_a);
4907219089Spjd
4908219089Spjd	as_rangeunlock(as);
4909219089Spjd	ZFS_EXIT(zfsvfs);
4910219089Spjd	return (error);
4911219089Spjd}
4912219089Spjd
4913219089Spjd/* ARGSUSED */
4914219089Spjdstatic int
4915219089Spjdzfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4916219089Spjd    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4917219089Spjd    caller_context_t *ct)
4918219089Spjd{
4919219089Spjd	uint64_t pages = btopr(len);
4920219089Spjd
4921219089Spjd	atomic_add_64(&VTOZ(vp)->z_mapcnt, pages);
4922219089Spjd	return (0);
4923219089Spjd}
4924219089Spjd
4925219089Spjd/*
4926219089Spjd * The reason we push dirty pages as part of zfs_delmap() is so that we get a
4927219089Spjd * more accurate mtime for the associated file.  Since we don't have a way of
4928219089Spjd * detecting when the data was actually modified, we have to resort to
4929219089Spjd * heuristics.  If an explicit msync() is done, then we mark the mtime when the
4930219089Spjd * last page is pushed.  The problem occurs when the msync() call is omitted,
4931219089Spjd * which by far the most common case:
4932219089Spjd *
4933219089Spjd * 	open()
4934219089Spjd * 	mmap()
4935219089Spjd * 	<modify memory>
4936219089Spjd * 	munmap()
4937219089Spjd * 	close()
4938219089Spjd * 	<time lapse>
4939219089Spjd * 	putpage() via fsflush
4940219089Spjd *
4941219089Spjd * If we wait until fsflush to come along, we can have a modification time that
4942219089Spjd * is some arbitrary point in the future.  In order to prevent this in the
4943219089Spjd * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is
4944219089Spjd * torn down.
4945219089Spjd */
4946219089Spjd/* ARGSUSED */
4947219089Spjdstatic int
4948219089Spjdzfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4949219089Spjd    size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
4950219089Spjd    caller_context_t *ct)
4951219089Spjd{
4952219089Spjd	uint64_t pages = btopr(len);
4953219089Spjd
4954219089Spjd	ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages);
4955219089Spjd	atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages);
4956219089Spjd
4957219089Spjd	if ((flags & MAP_SHARED) && (prot & PROT_WRITE) &&
4958219089Spjd	    vn_has_cached_data(vp))
4959219089Spjd		(void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct);
4960219089Spjd
4961219089Spjd	return (0);
4962219089Spjd}
4963219089Spjd
4964219089Spjd/*
4965219089Spjd * Free or allocate space in a file.  Currently, this function only
4966219089Spjd * supports the `F_FREESP' command.  However, this command is somewhat
4967219089Spjd * misnamed, as its functionality includes the ability to allocate as
4968219089Spjd * well as free space.
4969219089Spjd *
4970219089Spjd *	IN:	vp	- vnode of file to free data in.
4971219089Spjd *		cmd	- action to take (only F_FREESP supported).
4972219089Spjd *		bfp	- section of file to free/alloc.
4973219089Spjd *		flag	- current file open mode flags.
4974219089Spjd *		offset	- current file offset.
4975219089Spjd *		cr	- credentials of caller [UNUSED].
4976219089Spjd *		ct	- caller context.
4977219089Spjd *
4978219089Spjd *	RETURN:	0 if success
4979219089Spjd *		error code if failure
4980219089Spjd *
4981219089Spjd * Timestamps:
4982219089Spjd *	vp - ctime|mtime updated
4983219089Spjd */
4984219089Spjd/* ARGSUSED */
4985219089Spjdstatic int
4986219089Spjdzfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag,
4987219089Spjd    offset_t offset, cred_t *cr, caller_context_t *ct)
4988219089Spjd{
4989219089Spjd	znode_t		*zp = VTOZ(vp);
4990219089Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4991219089Spjd	uint64_t	off, len;
4992219089Spjd	int		error;
4993219089Spjd
4994219089Spjd	ZFS_ENTER(zfsvfs);
4995219089Spjd	ZFS_VERIFY_ZP(zp);
4996219089Spjd
4997219089Spjd	if (cmd != F_FREESP) {
4998219089Spjd		ZFS_EXIT(zfsvfs);
4999219089Spjd		return (EINVAL);
5000219089Spjd	}
5001219089Spjd
5002219089Spjd	if (error = convoff(vp, bfp, 0, offset)) {
5003219089Spjd		ZFS_EXIT(zfsvfs);
5004219089Spjd		return (error);
5005219089Spjd	}
5006219089Spjd
5007219089Spjd	if (bfp->l_len < 0) {
5008219089Spjd		ZFS_EXIT(zfsvfs);
5009219089Spjd		return (EINVAL);
5010219089Spjd	}
5011219089Spjd
5012219089Spjd	off = bfp->l_start;
5013219089Spjd	len = bfp->l_len; /* 0 means from off to end of file */
5014219089Spjd
5015219089Spjd	error = zfs_freesp(zp, off, len, flag, TRUE);
5016219089Spjd
5017219089Spjd	ZFS_EXIT(zfsvfs);
5018219089Spjd	return (error);
5019219089Spjd}
5020219089Spjd#endif	/* sun */
5021219089Spjd
5022168404SpjdCTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
5023168404SpjdCTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
5024168404Spjd
5025185029Spjd/*ARGSUSED*/
5026168404Spjdstatic int
5027185029Spjdzfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
5028168404Spjd{
5029168404Spjd	znode_t		*zp = VTOZ(vp);
5030168404Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
5031185029Spjd	uint32_t	gen;
5032219089Spjd	uint64_t	gen64;
5033168404Spjd	uint64_t	object = zp->z_id;
5034168404Spjd	zfid_short_t	*zfid;
5035219089Spjd	int		size, i, error;
5036168404Spjd
5037168404Spjd	ZFS_ENTER(zfsvfs);
5038185029Spjd	ZFS_VERIFY_ZP(zp);
5039168404Spjd
5040219089Spjd	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
5041219089Spjd	    &gen64, sizeof (uint64_t))) != 0) {
5042219089Spjd		ZFS_EXIT(zfsvfs);
5043219089Spjd		return (error);
5044219089Spjd	}
5045219089Spjd
5046219089Spjd	gen = (uint32_t)gen64;
5047219089Spjd
5048168404Spjd	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
5049168404Spjd	fidp->fid_len = size;
5050168404Spjd
5051168404Spjd	zfid = (zfid_short_t *)fidp;
5052168404Spjd
5053168404Spjd	zfid->zf_len = size;
5054168404Spjd
5055168404Spjd	for (i = 0; i < sizeof (zfid->zf_object); i++)
5056168404Spjd		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
5057168404Spjd
5058168404Spjd	/* Must have a non-zero generation number to distinguish from .zfs */
5059168404Spjd	if (gen == 0)
5060168404Spjd		gen = 1;
5061168404Spjd	for (i = 0; i < sizeof (zfid->zf_gen); i++)
5062168404Spjd		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
5063168404Spjd
5064168404Spjd	if (size == LONG_FID_LEN) {
5065168404Spjd		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
5066169023Spjd		zfid_long_t	*zlfid;
5067168404Spjd
5068168404Spjd		zlfid = (zfid_long_t *)fidp;
5069168404Spjd
5070168404Spjd		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
5071168404Spjd			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
5072168404Spjd
5073168404Spjd		/* XXX - this should be the generation number for the objset */
5074168404Spjd		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
5075168404Spjd			zlfid->zf_setgen[i] = 0;
5076168404Spjd	}
5077168404Spjd
5078168404Spjd	ZFS_EXIT(zfsvfs);
5079168404Spjd	return (0);
5080168404Spjd}
5081168404Spjd
5082168404Spjdstatic int
5083185029Spjdzfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
5084185029Spjd    caller_context_t *ct)
5085168404Spjd{
5086168404Spjd	znode_t		*zp, *xzp;
5087168404Spjd	zfsvfs_t	*zfsvfs;
5088168404Spjd	zfs_dirlock_t	*dl;
5089168404Spjd	int		error;
5090168404Spjd
5091168404Spjd	switch (cmd) {
5092168404Spjd	case _PC_LINK_MAX:
5093168404Spjd		*valp = INT_MAX;
5094168404Spjd		return (0);
5095168404Spjd
5096168404Spjd	case _PC_FILESIZEBITS:
5097168404Spjd		*valp = 64;
5098168404Spjd		return (0);
5099219089Spjd#ifdef sun
5100168404Spjd	case _PC_XATTR_EXISTS:
5101168404Spjd		zp = VTOZ(vp);
5102168404Spjd		zfsvfs = zp->z_zfsvfs;
5103168404Spjd		ZFS_ENTER(zfsvfs);
5104185029Spjd		ZFS_VERIFY_ZP(zp);
5105168404Spjd		*valp = 0;
5106168404Spjd		error = zfs_dirent_lock(&dl, zp, "", &xzp,
5107185029Spjd		    ZXATTR | ZEXISTS | ZSHARED, NULL, NULL);
5108168404Spjd		if (error == 0) {
5109168404Spjd			zfs_dirent_unlock(dl);
5110168404Spjd			if (!zfs_dirempty(xzp))
5111168404Spjd				*valp = 1;
5112168404Spjd			VN_RELE(ZTOV(xzp));
5113168404Spjd		} else if (error == ENOENT) {
5114168404Spjd			/*
5115168404Spjd			 * If there aren't extended attributes, it's the
5116168404Spjd			 * same as having zero of them.
5117168404Spjd			 */
5118168404Spjd			error = 0;
5119168404Spjd		}
5120168404Spjd		ZFS_EXIT(zfsvfs);
5121168404Spjd		return (error);
5122168404Spjd
5123219089Spjd	case _PC_SATTR_ENABLED:
5124219089Spjd	case _PC_SATTR_EXISTS:
5125219089Spjd		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
5126219089Spjd		    (vp->v_type == VREG || vp->v_type == VDIR);
5127219089Spjd		return (0);
5128219089Spjd
5129219089Spjd	case _PC_ACCESS_FILTERING:
5130219089Spjd		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
5131219089Spjd		    vp->v_type == VDIR;
5132219089Spjd		return (0);
5133219089Spjd
5134219089Spjd	case _PC_ACL_ENABLED:
5135219089Spjd		*valp = _ACL_ACE_ENABLED;
5136219089Spjd		return (0);
5137219089Spjd#endif	/* sun */
5138219089Spjd	case _PC_MIN_HOLE_SIZE:
5139219089Spjd		*valp = (int)SPA_MINBLOCKSIZE;
5140219089Spjd		return (0);
5141219089Spjd#ifdef sun
5142219089Spjd	case _PC_TIMESTAMP_RESOLUTION:
5143219089Spjd		/* nanosecond timestamp resolution */
5144219089Spjd		*valp = 1L;
5145219089Spjd		return (0);
5146219089Spjd#endif	/* sun */
5147168404Spjd	case _PC_ACL_EXTENDED:
5148196949Strasz		*valp = 0;
5149168404Spjd		return (0);
5150168404Spjd
5151196949Strasz	case _PC_ACL_NFS4:
5152196949Strasz		*valp = 1;
5153196949Strasz		return (0);
5154196949Strasz
5155196949Strasz	case _PC_ACL_PATH_MAX:
5156196949Strasz		*valp = ACL_MAX_ENTRIES;
5157196949Strasz		return (0);
5158196949Strasz
5159168404Spjd	default:
5160168962Spjd		return (EOPNOTSUPP);
5161168404Spjd	}
5162168404Spjd}
5163168404Spjd
5164168404Spjd/*ARGSUSED*/
5165168404Spjdstatic int
5166185029Spjdzfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
5167185029Spjd    caller_context_t *ct)
5168168404Spjd{
5169168404Spjd	znode_t *zp = VTOZ(vp);
5170168404Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5171168404Spjd	int error;
5172185029Spjd	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
5173168404Spjd
5174168404Spjd	ZFS_ENTER(zfsvfs);
5175185029Spjd	ZFS_VERIFY_ZP(zp);
5176185029Spjd	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
5177168404Spjd	ZFS_EXIT(zfsvfs);
5178168404Spjd
5179168404Spjd	return (error);
5180168404Spjd}
5181168404Spjd
5182168404Spjd/*ARGSUSED*/
5183228685Spjdint
5184185029Spjdzfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
5185185029Spjd    caller_context_t *ct)
5186168404Spjd{
5187168404Spjd	znode_t *zp = VTOZ(vp);
5188168404Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5189168404Spjd	int error;
5190185029Spjd	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
5191219089Spjd	zilog_t	*zilog = zfsvfs->z_log;
5192168404Spjd
5193168404Spjd	ZFS_ENTER(zfsvfs);
5194185029Spjd	ZFS_VERIFY_ZP(zp);
5195219089Spjd
5196185029Spjd	error = zfs_setacl(zp, vsecp, skipaclchk, cr);
5197219089Spjd
5198219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
5199219089Spjd		zil_commit(zilog, 0);
5200219089Spjd
5201168404Spjd	ZFS_EXIT(zfsvfs);
5202168404Spjd	return (error);
5203168404Spjd}
5204168404Spjd
5205219089Spjd#ifdef sun
/*
 * Tunables used by the read path of zfs_reqzcbuf(); both must be a power
 * of 2.
 *
 * zcr_blksz_min: the smallest read we may consider to loan out an arcbuf
 * zcr_blksz_max: if set to less than the file block size, allow loaning out
 *                of an arcbuf for a partial block read
 */
int zcr_blksz_min = 1 << 10;	/* 1K */
int zcr_blksz_max = 1 << 17;	/* 128K */
5215219089Spjd
5216219089Spjd/*ARGSUSED*/
5217168962Spjdstatic int
5218219089Spjdzfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr,
5219219089Spjd    caller_context_t *ct)
5220219089Spjd{
5221219089Spjd	znode_t	*zp = VTOZ(vp);
5222219089Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5223219089Spjd	int max_blksz = zfsvfs->z_max_blksz;
5224219089Spjd	uio_t *uio = &xuio->xu_uio;
5225219089Spjd	ssize_t size = uio->uio_resid;
5226219089Spjd	offset_t offset = uio->uio_loffset;
5227219089Spjd	int blksz;
5228219089Spjd	int fullblk, i;
5229219089Spjd	arc_buf_t *abuf;
5230219089Spjd	ssize_t maxsize;
5231219089Spjd	int preamble, postamble;
5232219089Spjd
5233219089Spjd	if (xuio->xu_type != UIOTYPE_ZEROCOPY)
5234219089Spjd		return (EINVAL);
5235219089Spjd
5236219089Spjd	ZFS_ENTER(zfsvfs);
5237219089Spjd	ZFS_VERIFY_ZP(zp);
5238219089Spjd	switch (ioflag) {
5239219089Spjd	case UIO_WRITE:
5240219089Spjd		/*
5241219089Spjd		 * Loan out an arc_buf for write if write size is bigger than
5242219089Spjd		 * max_blksz, and the file's block size is also max_blksz.
5243219089Spjd		 */
5244219089Spjd		blksz = max_blksz;
5245219089Spjd		if (size < blksz || zp->z_blksz != blksz) {
5246219089Spjd			ZFS_EXIT(zfsvfs);
5247219089Spjd			return (EINVAL);
5248219089Spjd		}
5249219089Spjd		/*
5250219089Spjd		 * Caller requests buffers for write before knowing where the
5251219089Spjd		 * write offset might be (e.g. NFS TCP write).
5252219089Spjd		 */
5253219089Spjd		if (offset == -1) {
5254219089Spjd			preamble = 0;
5255219089Spjd		} else {
5256219089Spjd			preamble = P2PHASE(offset, blksz);
5257219089Spjd			if (preamble) {
5258219089Spjd				preamble = blksz - preamble;
5259219089Spjd				size -= preamble;
5260219089Spjd			}
5261219089Spjd		}
5262219089Spjd
5263219089Spjd		postamble = P2PHASE(size, blksz);
5264219089Spjd		size -= postamble;
5265219089Spjd
5266219089Spjd		fullblk = size / blksz;
5267219089Spjd		(void) dmu_xuio_init(xuio,
5268219089Spjd		    (preamble != 0) + fullblk + (postamble != 0));
5269219089Spjd		DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble,
5270219089Spjd		    int, postamble, int,
5271219089Spjd		    (preamble != 0) + fullblk + (postamble != 0));
5272219089Spjd
5273219089Spjd		/*
5274219089Spjd		 * Have to fix iov base/len for partial buffers.  They
5275219089Spjd		 * currently represent full arc_buf's.
5276219089Spjd		 */
5277219089Spjd		if (preamble) {
5278219089Spjd			/* data begins in the middle of the arc_buf */
5279219089Spjd			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5280219089Spjd			    blksz);
5281219089Spjd			ASSERT(abuf);
5282219089Spjd			(void) dmu_xuio_add(xuio, abuf,
5283219089Spjd			    blksz - preamble, preamble);
5284219089Spjd		}
5285219089Spjd
5286219089Spjd		for (i = 0; i < fullblk; i++) {
5287219089Spjd			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5288219089Spjd			    blksz);
5289219089Spjd			ASSERT(abuf);
5290219089Spjd			(void) dmu_xuio_add(xuio, abuf, 0, blksz);
5291219089Spjd		}
5292219089Spjd
5293219089Spjd		if (postamble) {
5294219089Spjd			/* data ends in the middle of the arc_buf */
5295219089Spjd			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5296219089Spjd			    blksz);
5297219089Spjd			ASSERT(abuf);
5298219089Spjd			(void) dmu_xuio_add(xuio, abuf, 0, postamble);
5299219089Spjd		}
5300219089Spjd		break;
5301219089Spjd	case UIO_READ:
5302219089Spjd		/*
5303219089Spjd		 * Loan out an arc_buf for read if the read size is larger than
5304219089Spjd		 * the current file block size.  Block alignment is not
5305219089Spjd		 * considered.  Partial arc_buf will be loaned out for read.
5306219089Spjd		 */
5307219089Spjd		blksz = zp->z_blksz;
5308219089Spjd		if (blksz < zcr_blksz_min)
5309219089Spjd			blksz = zcr_blksz_min;
5310219089Spjd		if (blksz > zcr_blksz_max)
5311219089Spjd			blksz = zcr_blksz_max;
5312219089Spjd		/* avoid potential complexity of dealing with it */
5313219089Spjd		if (blksz > max_blksz) {
5314219089Spjd			ZFS_EXIT(zfsvfs);
5315219089Spjd			return (EINVAL);
5316219089Spjd		}
5317219089Spjd
5318219089Spjd		maxsize = zp->z_size - uio->uio_loffset;
5319219089Spjd		if (size > maxsize)
5320219089Spjd			size = maxsize;
5321219089Spjd
5322219089Spjd		if (size < blksz || vn_has_cached_data(vp)) {
5323219089Spjd			ZFS_EXIT(zfsvfs);
5324219089Spjd			return (EINVAL);
5325219089Spjd		}
5326219089Spjd		break;
5327219089Spjd	default:
5328219089Spjd		ZFS_EXIT(zfsvfs);
5329219089Spjd		return (EINVAL);
5330219089Spjd	}
5331219089Spjd
5332219089Spjd	uio->uio_extflg = UIO_XUIO;
5333219089Spjd	XUIO_XUZC_RW(xuio) = ioflag;
5334219089Spjd	ZFS_EXIT(zfsvfs);
5335219089Spjd	return (0);
5336219089Spjd}
5337219089Spjd
5338219089Spjd/*ARGSUSED*/
5339219089Spjdstatic int
5340219089Spjdzfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct)
5341219089Spjd{
5342219089Spjd	int i;
5343219089Spjd	arc_buf_t *abuf;
5344219089Spjd	int ioflag = XUIO_XUZC_RW(xuio);
5345219089Spjd
5346219089Spjd	ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
5347219089Spjd
5348219089Spjd	i = dmu_xuio_cnt(xuio);
5349219089Spjd	while (i-- > 0) {
5350219089Spjd		abuf = dmu_xuio_arcbuf(xuio, i);
5351219089Spjd		/*
5352219089Spjd		 * if abuf == NULL, it must be a write buffer
5353219089Spjd		 * that has been returned in zfs_write().
5354219089Spjd		 */
5355219089Spjd		if (abuf)
5356219089Spjd			dmu_return_arcbuf(abuf);
5357219089Spjd		ASSERT(abuf || ioflag == UIO_WRITE);
5358219089Spjd	}
5359219089Spjd
5360219089Spjd	dmu_xuio_fini(xuio);
5361219089Spjd	return (0);
5362219089Spjd}
5363219089Spjd
/*
 * Predeclare these here so that the compiler assumes that
 * this is an "old style" function declaration that does
 * not include arguments => we won't get type mismatch errors
 * in the initializations that follow.  The empty parameter lists
 * are therefore intentional and must not be changed to (void).
 */
static int zfs_inval();
static int zfs_isdir();

/*
 * Generic "operation not valid here" handler: always returns EINVAL.
 */
static int
zfs_inval()
{
	return (EINVAL);
}
5378219089Spjd
/*
 * Generic "target is a directory" handler: always returns EISDIR.
 * Declared without a parameter list on purpose so it can be stored in
 * operation slots of any signature.
 */
static int
zfs_isdir()
{
	return (EISDIR);
}
5384219089Spjd/*
5385219089Spjd * Directory vnode operations template
5386219089Spjd */
5387219089Spjdvnodeops_t *zfs_dvnodeops;
5388219089Spjdconst fs_operation_def_t zfs_dvnodeops_template[] = {
5389219089Spjd	VOPNAME_OPEN,		{ .vop_open = zfs_open },
5390219089Spjd	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
5391219089Spjd	VOPNAME_READ,		{ .error = zfs_isdir },
5392219089Spjd	VOPNAME_WRITE,		{ .error = zfs_isdir },
5393219089Spjd	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
5394219089Spjd	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5395219089Spjd	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
5396219089Spjd	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5397219089Spjd	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
5398219089Spjd	VOPNAME_CREATE,		{ .vop_create = zfs_create },
5399219089Spjd	VOPNAME_REMOVE,		{ .vop_remove = zfs_remove },
5400219089Spjd	VOPNAME_LINK,		{ .vop_link = zfs_link },
5401219089Spjd	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
5402219089Spjd	VOPNAME_MKDIR,		{ .vop_mkdir = zfs_mkdir },
5403219089Spjd	VOPNAME_RMDIR,		{ .vop_rmdir = zfs_rmdir },
5404219089Spjd	VOPNAME_READDIR,	{ .vop_readdir = zfs_readdir },
5405219089Spjd	VOPNAME_SYMLINK,	{ .vop_symlink = zfs_symlink },
5406219089Spjd	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
5407219089Spjd	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5408219089Spjd	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5409219089Spjd	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
5410219089Spjd	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5411219089Spjd	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
5412219089Spjd	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
5413219089Spjd	VOPNAME_VNEVENT, 	{ .vop_vnevent = fs_vnevent_support },
5414219089Spjd	NULL,			NULL
5415219089Spjd};
5416219089Spjd
5417219089Spjd/*
5418219089Spjd * Regular file vnode operations template
5419219089Spjd */
5420219089Spjdvnodeops_t *zfs_fvnodeops;
5421219089Spjdconst fs_operation_def_t zfs_fvnodeops_template[] = {
5422219089Spjd	VOPNAME_OPEN,		{ .vop_open = zfs_open },
5423219089Spjd	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
5424219089Spjd	VOPNAME_READ,		{ .vop_read = zfs_read },
5425219089Spjd	VOPNAME_WRITE,		{ .vop_write = zfs_write },
5426219089Spjd	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
5427219089Spjd	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5428219089Spjd	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
5429219089Spjd	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5430219089Spjd	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
5431219089Spjd	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
5432219089Spjd	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
5433219089Spjd	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5434219089Spjd	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5435219089Spjd	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
5436219089Spjd	VOPNAME_FRLOCK,		{ .vop_frlock = zfs_frlock },
5437219089Spjd	VOPNAME_SPACE,		{ .vop_space = zfs_space },
5438219089Spjd	VOPNAME_GETPAGE,	{ .vop_getpage = zfs_getpage },
5439219089Spjd	VOPNAME_PUTPAGE,	{ .vop_putpage = zfs_putpage },
5440219089Spjd	VOPNAME_MAP,		{ .vop_map = zfs_map },
5441219089Spjd	VOPNAME_ADDMAP,		{ .vop_addmap = zfs_addmap },
5442219089Spjd	VOPNAME_DELMAP,		{ .vop_delmap = zfs_delmap },
5443219089Spjd	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5444219089Spjd	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
5445219089Spjd	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
5446219089Spjd	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5447219089Spjd	VOPNAME_REQZCBUF, 	{ .vop_reqzcbuf = zfs_reqzcbuf },
5448219089Spjd	VOPNAME_RETZCBUF, 	{ .vop_retzcbuf = zfs_retzcbuf },
5449219089Spjd	NULL,			NULL
5450219089Spjd};
5451219089Spjd
5452219089Spjd/*
5453219089Spjd * Symbolic link vnode operations template
5454219089Spjd */
5455219089Spjdvnodeops_t *zfs_symvnodeops;
5456219089Spjdconst fs_operation_def_t zfs_symvnodeops_template[] = {
5457219089Spjd	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5458219089Spjd	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
5459219089Spjd	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5460219089Spjd	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
5461219089Spjd	VOPNAME_READLINK,	{ .vop_readlink = zfs_readlink },
5462219089Spjd	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5463219089Spjd	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5464219089Spjd	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5465219089Spjd	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5466219089Spjd	NULL,			NULL
5467219089Spjd};
5468219089Spjd
5469219089Spjd/*
5470219089Spjd * special share hidden files vnode operations template
5471219089Spjd */
5472219089Spjdvnodeops_t *zfs_sharevnodeops;
5473219089Spjdconst fs_operation_def_t zfs_sharevnodeops_template[] = {
5474219089Spjd	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5475219089Spjd	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5476219089Spjd	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5477219089Spjd	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5478219089Spjd	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5479219089Spjd	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
5480219089Spjd	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
5481219089Spjd	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5482219089Spjd	NULL,			NULL
5483219089Spjd};
5484219089Spjd
5485219089Spjd/*
5486219089Spjd * Extended attribute directory vnode operations template
5487219089Spjd *	This template is identical to the directory vnodes
5488219089Spjd *	operation template except for restricted operations:
5489219089Spjd *		VOP_MKDIR()
5490219089Spjd *		VOP_SYMLINK()
5491219089Spjd * Note that there are other restrictions embedded in:
5492219089Spjd *	zfs_create()	- restrict type to VREG
5493219089Spjd *	zfs_link()	- no links into/out of attribute space
5494219089Spjd *	zfs_rename()	- no moves into/out of attribute space
5495219089Spjd */
5496219089Spjdvnodeops_t *zfs_xdvnodeops;
5497219089Spjdconst fs_operation_def_t zfs_xdvnodeops_template[] = {
5498219089Spjd	VOPNAME_OPEN,		{ .vop_open = zfs_open },
5499219089Spjd	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
5500219089Spjd	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
5501219089Spjd	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5502219089Spjd	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
5503219089Spjd	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5504219089Spjd	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
5505219089Spjd	VOPNAME_CREATE,		{ .vop_create = zfs_create },
5506219089Spjd	VOPNAME_REMOVE,		{ .vop_remove = zfs_remove },
5507219089Spjd	VOPNAME_LINK,		{ .vop_link = zfs_link },
5508219089Spjd	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
5509219089Spjd	VOPNAME_MKDIR,		{ .error = zfs_inval },
5510219089Spjd	VOPNAME_RMDIR,		{ .vop_rmdir = zfs_rmdir },
5511219089Spjd	VOPNAME_READDIR,	{ .vop_readdir = zfs_readdir },
5512219089Spjd	VOPNAME_SYMLINK,	{ .error = zfs_inval },
5513219089Spjd	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
5514219089Spjd	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5515219089Spjd	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5516219089Spjd	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
5517219089Spjd	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5518219089Spjd	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
5519219089Spjd	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
5520219089Spjd	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5521219089Spjd	NULL,			NULL
5522219089Spjd};
5523219089Spjd
5524219089Spjd/*
5525219089Spjd * Error vnode operations template
5526219089Spjd */
5527219089Spjdvnodeops_t *zfs_evnodeops;
5528219089Spjdconst fs_operation_def_t zfs_evnodeops_template[] = {
5529219089Spjd	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5530219089Spjd	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5531219089Spjd	NULL,			NULL
5532219089Spjd};
5533219089Spjd#endif	/* sun */
5534219089Spjd
5535219089Spjdstatic int
5536213673Spjdioflags(int ioflags)
5537213673Spjd{
5538213673Spjd	int flags = 0;
5539213673Spjd
5540213673Spjd	if (ioflags & IO_APPEND)
5541213673Spjd		flags |= FAPPEND;
5542213673Spjd	if (ioflags & IO_NDELAY)
5543213673Spjd        	flags |= FNONBLOCK;
5544213673Spjd	if (ioflags & IO_SYNC)
5545213673Spjd		flags |= (FSYNC | FDSYNC | FRSYNC);
5546213673Spjd
5547213673Spjd	return (flags);
5548213673Spjd}
5549213673Spjd
5550213673Spjdstatic int
5551213937Savgzfs_getpages(struct vnode *vp, vm_page_t *m, int count, int reqpage)
5552213937Savg{
5553213937Savg	znode_t *zp = VTOZ(vp);
5554213937Savg	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5555213937Savg	objset_t *os = zp->z_zfsvfs->z_os;
5556213937Savg	vm_page_t mreq;
5557213937Savg	vm_object_t object;
5558213937Savg	caddr_t va;
5559213937Savg	struct sf_buf *sf;
5560213937Savg	int i, error;
5561213937Savg	int pcount, size;
5562213937Savg
5563213937Savg	ZFS_ENTER(zfsvfs);
5564213937Savg	ZFS_VERIFY_ZP(zp);
5565213937Savg
5566213937Savg	pcount = round_page(count) / PAGE_SIZE;
5567213937Savg	mreq = m[reqpage];
5568213937Savg	object = mreq->object;
5569213937Savg	error = 0;
5570213937Savg
5571213937Savg	KASSERT(vp->v_object == object, ("mismatching object"));
5572213937Savg
5573213937Savg	VM_OBJECT_LOCK(object);
5574213937Savg
5575213937Savg	for (i = 0; i < pcount; i++) {
5576213937Savg		if (i != reqpage) {
5577213937Savg			vm_page_lock(m[i]);
5578213937Savg			vm_page_free(m[i]);
5579213937Savg			vm_page_unlock(m[i]);
5580213937Savg		}
5581213937Savg	}
5582213937Savg
5583213937Savg	if (mreq->valid) {
5584213937Savg		if (mreq->valid != VM_PAGE_BITS_ALL)
5585213937Savg			vm_page_zero_invalid(mreq, TRUE);
5586213937Savg		VM_OBJECT_UNLOCK(object);
5587213937Savg		ZFS_EXIT(zfsvfs);
5588213937Savg		return (VM_PAGER_OK);
5589213937Savg	}
5590213937Savg
5591213937Savg	PCPU_INC(cnt.v_vnodein);
5592213937Savg	PCPU_INC(cnt.v_vnodepgsin);
5593213937Savg
5594213937Savg	if (IDX_TO_OFF(mreq->pindex) >= object->un_pager.vnp.vnp_size) {
5595213937Savg		VM_OBJECT_UNLOCK(object);
5596213937Savg		ZFS_EXIT(zfsvfs);
5597213937Savg		return (VM_PAGER_BAD);
5598213937Savg	}
5599213937Savg
5600213937Savg	size = PAGE_SIZE;
5601213937Savg	if (IDX_TO_OFF(mreq->pindex) + size > object->un_pager.vnp.vnp_size)
5602213937Savg		size = object->un_pager.vnp.vnp_size - IDX_TO_OFF(mreq->pindex);
5603213937Savg
5604213937Savg	VM_OBJECT_UNLOCK(object);
5605213937Savg	va = zfs_map_page(mreq, &sf);
5606213937Savg	error = dmu_read(os, zp->z_id, IDX_TO_OFF(mreq->pindex),
5607213937Savg	    size, va, DMU_READ_PREFETCH);
5608213937Savg	if (size != PAGE_SIZE)
5609213937Savg		bzero(va + size, PAGE_SIZE - size);
5610213937Savg	zfs_unmap_page(sf);
5611213937Savg	VM_OBJECT_LOCK(object);
5612213937Savg
5613213937Savg	if (!error)
5614213937Savg		mreq->valid = VM_PAGE_BITS_ALL;
5615213937Savg	KASSERT(mreq->dirty == 0, ("zfs_getpages: page %p is dirty", mreq));
5616213937Savg
5617213937Savg	VM_OBJECT_UNLOCK(object);
5618213937Savg
5619213937Savg	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
5620213937Savg	ZFS_EXIT(zfsvfs);
5621213937Savg	return (error ? VM_PAGER_ERROR : VM_PAGER_OK);
5622213937Savg}
5623213937Savg
5624213937Savgstatic int
5625213937Savgzfs_freebsd_getpages(ap)
5626213937Savg	struct vop_getpages_args /* {
5627213937Savg		struct vnode *a_vp;
5628213937Savg		vm_page_t *a_m;
5629213937Savg		int a_count;
5630213937Savg		int a_reqpage;
5631213937Savg		vm_ooffset_t a_offset;
5632213937Savg	} */ *ap;
5633213937Savg{
5634213937Savg
5635213937Savg	return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_reqpage));
5636213937Savg}
5637213937Savg
5638213937Savgstatic int
5639168962Spjdzfs_freebsd_open(ap)
5640168962Spjd	struct vop_open_args /* {
5641168962Spjd		struct vnode *a_vp;
5642168962Spjd		int a_mode;
5643168962Spjd		struct ucred *a_cred;
5644168962Spjd		struct thread *a_td;
5645168962Spjd	} */ *ap;
5646168962Spjd{
5647168962Spjd	vnode_t	*vp = ap->a_vp;
5648168962Spjd	znode_t *zp = VTOZ(vp);
5649168962Spjd	int error;
5650168962Spjd
5651185029Spjd	error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
5652168962Spjd	if (error == 0)
5653219089Spjd		vnode_create_vobject(vp, zp->z_size, ap->a_td);
5654168962Spjd	return (error);
5655168962Spjd}
5656168962Spjd
5657168962Spjdstatic int
5658168962Spjdzfs_freebsd_close(ap)
5659168962Spjd	struct vop_close_args /* {
5660168962Spjd		struct vnode *a_vp;
5661168962Spjd		int  a_fflag;
5662168962Spjd		struct ucred *a_cred;
5663168962Spjd		struct thread *a_td;
5664168962Spjd	} */ *ap;
5665168962Spjd{
5666168962Spjd
5667185029Spjd	return (zfs_close(ap->a_vp, ap->a_fflag, 0, 0, ap->a_cred, NULL));
5668168962Spjd}
5669168962Spjd
5670168962Spjdstatic int
5671168962Spjdzfs_freebsd_ioctl(ap)
5672168962Spjd	struct vop_ioctl_args /* {
5673168962Spjd		struct vnode *a_vp;
5674168962Spjd		u_long a_command;
5675168962Spjd		caddr_t a_data;
5676168962Spjd		int a_fflag;
5677168962Spjd		struct ucred *cred;
5678168962Spjd		struct thread *td;
5679168962Spjd	} */ *ap;
5680168962Spjd{
5681168962Spjd
5682168978Spjd	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
5683185029Spjd	    ap->a_fflag, ap->a_cred, NULL, NULL));
5684168962Spjd}
5685168962Spjd
5686168962Spjdstatic int
5687168962Spjdzfs_freebsd_read(ap)
5688168962Spjd	struct vop_read_args /* {
5689168962Spjd		struct vnode *a_vp;
5690168962Spjd		struct uio *a_uio;
5691168962Spjd		int a_ioflag;
5692168962Spjd		struct ucred *a_cred;
5693168962Spjd	} */ *ap;
5694168962Spjd{
5695168962Spjd
5696213673Spjd	return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
5697213673Spjd	    ap->a_cred, NULL));
5698168962Spjd}
5699168962Spjd
5700168962Spjdstatic int
5701168962Spjdzfs_freebsd_write(ap)
5702168962Spjd	struct vop_write_args /* {
5703168962Spjd		struct vnode *a_vp;
5704168962Spjd		struct uio *a_uio;
5705168962Spjd		int a_ioflag;
5706168962Spjd		struct ucred *a_cred;
5707168962Spjd	} */ *ap;
5708168962Spjd{
5709168962Spjd
5710213673Spjd	return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
5711213673Spjd	    ap->a_cred, NULL));
5712168962Spjd}
5713168962Spjd
5714168962Spjdstatic int
5715168962Spjdzfs_freebsd_access(ap)
5716168962Spjd	struct vop_access_args /* {
5717168962Spjd		struct vnode *a_vp;
5718192689Strasz		accmode_t a_accmode;
5719168962Spjd		struct ucred *a_cred;
5720168962Spjd		struct thread *a_td;
5721168962Spjd	} */ *ap;
5722168962Spjd{
5723212002Sjh	vnode_t *vp = ap->a_vp;
5724212002Sjh	znode_t *zp = VTOZ(vp);
5725198703Spjd	accmode_t accmode;
5726198703Spjd	int error = 0;
5727168962Spjd
5728185172Spjd	/*
5729198703Spjd	 * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND,
5730185172Spjd	 */
5731198703Spjd	accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
5732198703Spjd	if (accmode != 0)
5733198703Spjd		error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL);
5734185172Spjd
5735198703Spjd	/*
5736198703Spjd	 * VADMIN has to be handled by vaccess().
5737198703Spjd	 */
5738198703Spjd	if (error == 0) {
5739198703Spjd		accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
5740198703Spjd		if (accmode != 0) {
5741219089Spjd			error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
5742219089Spjd			    zp->z_gid, accmode, ap->a_cred, NULL);
5743198703Spjd		}
5744185172Spjd	}
5745185172Spjd
5746212002Sjh	/*
5747212002Sjh	 * For VEXEC, ensure that at least one execute bit is set for
5748212002Sjh	 * non-directories.
5749212002Sjh	 */
5750212002Sjh	if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
5751219089Spjd	    (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
5752212002Sjh		error = EACCES;
5753219089Spjd	}
5754212002Sjh
5755198703Spjd	return (error);
5756168962Spjd}
5757168962Spjd
5758168962Spjdstatic int
5759168962Spjdzfs_freebsd_lookup(ap)
5760168962Spjd	struct vop_lookup_args /* {
5761168962Spjd		struct vnode *a_dvp;
5762168962Spjd		struct vnode **a_vpp;
5763168962Spjd		struct componentname *a_cnp;
5764168962Spjd	} */ *ap;
5765168962Spjd{
5766168962Spjd	struct componentname *cnp = ap->a_cnp;
5767168962Spjd	char nm[NAME_MAX + 1];
5768168962Spjd
5769168962Spjd	ASSERT(cnp->cn_namelen < sizeof(nm));
5770168962Spjd	strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
5771168962Spjd
5772168962Spjd	return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
5773185029Spjd	    cnp->cn_cred, cnp->cn_thread, 0));
5774168962Spjd}
5775168962Spjd
5776168962Spjdstatic int
5777168962Spjdzfs_freebsd_create(ap)
5778168962Spjd	struct vop_create_args /* {
5779168962Spjd		struct vnode *a_dvp;
5780168962Spjd		struct vnode **a_vpp;
5781168962Spjd		struct componentname *a_cnp;
5782168962Spjd		struct vattr *a_vap;
5783168962Spjd	} */ *ap;
5784168962Spjd{
5785168962Spjd	struct componentname *cnp = ap->a_cnp;
5786168962Spjd	vattr_t *vap = ap->a_vap;
5787168962Spjd	int mode;
5788168962Spjd
5789168962Spjd	ASSERT(cnp->cn_flags & SAVENAME);
5790168962Spjd
5791168962Spjd	vattr_init_mask(vap);
5792168962Spjd	mode = vap->va_mode & ALLPERMS;
5793168962Spjd
5794168962Spjd	return (zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
5795185029Spjd	    ap->a_vpp, cnp->cn_cred, cnp->cn_thread));
5796168962Spjd}
5797168962Spjd
5798168962Spjdstatic int
5799168962Spjdzfs_freebsd_remove(ap)
5800168962Spjd	struct vop_remove_args /* {
5801168962Spjd		struct vnode *a_dvp;
5802168962Spjd		struct vnode *a_vp;
5803168962Spjd		struct componentname *a_cnp;
5804168962Spjd	} */ *ap;
5805168962Spjd{
5806168962Spjd
5807168962Spjd	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
5808168962Spjd
5809168962Spjd	return (zfs_remove(ap->a_dvp, ap->a_cnp->cn_nameptr,
5810185029Spjd	    ap->a_cnp->cn_cred, NULL, 0));
5811168962Spjd}
5812168962Spjd
5813168962Spjdstatic int
5814168962Spjdzfs_freebsd_mkdir(ap)
5815168962Spjd	struct vop_mkdir_args /* {
5816168962Spjd		struct vnode *a_dvp;
5817168962Spjd		struct vnode **a_vpp;
5818168962Spjd		struct componentname *a_cnp;
5819168962Spjd		struct vattr *a_vap;
5820168962Spjd	} */ *ap;
5821168962Spjd{
5822168962Spjd	vattr_t *vap = ap->a_vap;
5823168962Spjd
5824168962Spjd	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
5825168962Spjd
5826168962Spjd	vattr_init_mask(vap);
5827168962Spjd
5828168962Spjd	return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
5829185029Spjd	    ap->a_cnp->cn_cred, NULL, 0, NULL));
5830168962Spjd}
5831168962Spjd
5832168962Spjdstatic int
5833168962Spjdzfs_freebsd_rmdir(ap)
5834168962Spjd	struct vop_rmdir_args /* {
5835168962Spjd		struct vnode *a_dvp;
5836168962Spjd		struct vnode *a_vp;
5837168962Spjd		struct componentname *a_cnp;
5838168962Spjd	} */ *ap;
5839168962Spjd{
5840168962Spjd	struct componentname *cnp = ap->a_cnp;
5841168962Spjd
5842168962Spjd	ASSERT(cnp->cn_flags & SAVENAME);
5843168962Spjd
5844185029Spjd	return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred, NULL, 0));
5845168962Spjd}
5846168962Spjd
5847168962Spjdstatic int
5848168962Spjdzfs_freebsd_readdir(ap)
5849168962Spjd	struct vop_readdir_args /* {
5850168962Spjd		struct vnode *a_vp;
5851168962Spjd		struct uio *a_uio;
5852168962Spjd		struct ucred *a_cred;
5853168962Spjd		int *a_eofflag;
5854168962Spjd		int *a_ncookies;
5855168962Spjd		u_long **a_cookies;
5856168962Spjd	} */ *ap;
5857168962Spjd{
5858168962Spjd
5859168962Spjd	return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
5860168962Spjd	    ap->a_ncookies, ap->a_cookies));
5861168962Spjd}
5862168962Spjd
5863168962Spjdstatic int
5864168962Spjdzfs_freebsd_fsync(ap)
5865168962Spjd	struct vop_fsync_args /* {
5866168962Spjd		struct vnode *a_vp;
5867168962Spjd		int a_waitfor;
5868168962Spjd		struct thread *a_td;
5869168962Spjd	} */ *ap;
5870168962Spjd{
5871168962Spjd
5872168962Spjd	vop_stdfsync(ap);
5873185029Spjd	return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL));
5874168962Spjd}
5875168962Spjd
5876168962Spjdstatic int
5877168962Spjdzfs_freebsd_getattr(ap)
5878168962Spjd	struct vop_getattr_args /* {
5879168962Spjd		struct vnode *a_vp;
5880168962Spjd		struct vattr *a_vap;
5881168962Spjd		struct ucred *a_cred;
5882168962Spjd	} */ *ap;
5883168962Spjd{
5884185029Spjd	vattr_t *vap = ap->a_vap;
5885185029Spjd	xvattr_t xvap;
5886185029Spjd	u_long fflags = 0;
5887185029Spjd	int error;
5888168962Spjd
5889185029Spjd	xva_init(&xvap);
5890185029Spjd	xvap.xva_vattr = *vap;
5891185029Spjd	xvap.xva_vattr.va_mask |= AT_XVATTR;
5892185029Spjd
5893185029Spjd	/* Convert chflags into ZFS-type flags. */
5894185029Spjd	/* XXX: what about SF_SETTABLE?. */
5895185029Spjd	XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
5896185029Spjd	XVA_SET_REQ(&xvap, XAT_APPENDONLY);
5897185029Spjd	XVA_SET_REQ(&xvap, XAT_NOUNLINK);
5898185029Spjd	XVA_SET_REQ(&xvap, XAT_NODUMP);
5899185029Spjd	error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
5900185029Spjd	if (error != 0)
5901185029Spjd		return (error);
5902185029Spjd
5903185029Spjd	/* Convert ZFS xattr into chflags. */
5904185029Spjd#define	FLAG_CHECK(fflag, xflag, xfield)	do {			\
5905185029Spjd	if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0)		\
5906185029Spjd		fflags |= (fflag);					\
5907185029Spjd} while (0)
5908185029Spjd	FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
5909185029Spjd	    xvap.xva_xoptattrs.xoa_immutable);
5910185029Spjd	FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
5911185029Spjd	    xvap.xva_xoptattrs.xoa_appendonly);
5912185029Spjd	FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
5913185029Spjd	    xvap.xva_xoptattrs.xoa_nounlink);
5914185029Spjd	FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
5915185029Spjd	    xvap.xva_xoptattrs.xoa_nodump);
5916185029Spjd#undef	FLAG_CHECK
5917185029Spjd	*vap = xvap.xva_vattr;
5918185029Spjd	vap->va_flags = fflags;
5919185029Spjd	return (0);
5920168962Spjd}
5921168962Spjd
5922168962Spjdstatic int
5923168962Spjdzfs_freebsd_setattr(ap)
5924168962Spjd	struct vop_setattr_args /* {
5925168962Spjd		struct vnode *a_vp;
5926168962Spjd		struct vattr *a_vap;
5927168962Spjd		struct ucred *a_cred;
5928168962Spjd	} */ *ap;
5929168962Spjd{
5930185172Spjd	vnode_t *vp = ap->a_vp;
5931168962Spjd	vattr_t *vap = ap->a_vap;
5932185172Spjd	cred_t *cred = ap->a_cred;
5933185029Spjd	xvattr_t xvap;
5934185029Spjd	u_long fflags;
5935185029Spjd	uint64_t zflags;
5936168962Spjd
5937168962Spjd	vattr_init_mask(vap);
5938170044Spjd	vap->va_mask &= ~AT_NOSET;
5939168962Spjd
5940185029Spjd	xva_init(&xvap);
5941185029Spjd	xvap.xva_vattr = *vap;
5942185029Spjd
5943219089Spjd	zflags = VTOZ(vp)->z_pflags;
5944185172Spjd
5945185029Spjd	if (vap->va_flags != VNOVAL) {
5946197683Sdelphij		zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
5947185172Spjd		int error;
5948185172Spjd
5949197683Sdelphij		if (zfsvfs->z_use_fuids == B_FALSE)
5950197683Sdelphij			return (EOPNOTSUPP);
5951197683Sdelphij
5952185029Spjd		fflags = vap->va_flags;
5953185029Spjd		if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_NODUMP)) != 0)
5954185029Spjd			return (EOPNOTSUPP);
5955185172Spjd		/*
5956185172Spjd		 * Unprivileged processes are not permitted to unset system
5957185172Spjd		 * flags, or modify flags if any system flags are set.
5958185172Spjd		 * Privileged non-jail processes may not modify system flags
5959185172Spjd		 * if securelevel > 0 and any existing system flags are set.
5960185172Spjd		 * Privileged jail processes behave like privileged non-jail
5961185172Spjd		 * processes if the security.jail.chflags_allowed sysctl is
5962185172Spjd		 * is non-zero; otherwise, they behave like unprivileged
5963185172Spjd		 * processes.
5964185172Spjd		 */
5965197861Spjd		if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
5966197861Spjd		    priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) {
5967185172Spjd			if (zflags &
5968185172Spjd			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
5969185172Spjd				error = securelevel_gt(cred, 0);
5970197861Spjd				if (error != 0)
5971185172Spjd					return (error);
5972185172Spjd			}
5973185172Spjd		} else {
5974197861Spjd			/*
5975197861Spjd			 * Callers may only modify the file flags on objects they
5976197861Spjd			 * have VADMIN rights for.
5977197861Spjd			 */
5978197861Spjd			if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0)
5979197861Spjd				return (error);
5980185172Spjd			if (zflags &
5981185172Spjd			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
5982185172Spjd				return (EPERM);
5983185172Spjd			}
5984185172Spjd			if (fflags &
5985185172Spjd			    (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
5986185172Spjd				return (EPERM);
5987185172Spjd			}
5988185172Spjd		}
5989185029Spjd
5990185029Spjd#define	FLAG_CHANGE(fflag, zflag, xflag, xfield)	do {		\
5991185029Spjd	if (((fflags & (fflag)) && !(zflags & (zflag))) ||		\
5992185029Spjd	    ((zflags & (zflag)) && !(fflags & (fflag)))) {		\
5993185029Spjd		XVA_SET_REQ(&xvap, (xflag));				\
5994185029Spjd		(xfield) = ((fflags & (fflag)) != 0);			\
5995185029Spjd	}								\
5996185029Spjd} while (0)
5997185029Spjd		/* Convert chflags into ZFS-type flags. */
5998185029Spjd		/* XXX: what about SF_SETTABLE?. */
5999185029Spjd		FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
6000185029Spjd		    xvap.xva_xoptattrs.xoa_immutable);
6001185029Spjd		FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
6002185029Spjd		    xvap.xva_xoptattrs.xoa_appendonly);
6003185029Spjd		FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
6004185029Spjd		    xvap.xva_xoptattrs.xoa_nounlink);
6005185029Spjd		FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
6006185172Spjd		    xvap.xva_xoptattrs.xoa_nodump);
6007185029Spjd#undef	FLAG_CHANGE
6008185029Spjd	}
6009185172Spjd	return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL));
6010168962Spjd}
6011168962Spjd
6012168962Spjdstatic int
6013168962Spjdzfs_freebsd_rename(ap)
6014168962Spjd	struct vop_rename_args  /* {
6015168962Spjd		struct vnode *a_fdvp;
6016168962Spjd		struct vnode *a_fvp;
6017168962Spjd		struct componentname *a_fcnp;
6018168962Spjd		struct vnode *a_tdvp;
6019168962Spjd		struct vnode *a_tvp;
6020168962Spjd		struct componentname *a_tcnp;
6021168962Spjd	} */ *ap;
6022168962Spjd{
6023168962Spjd	vnode_t *fdvp = ap->a_fdvp;
6024168962Spjd	vnode_t *fvp = ap->a_fvp;
6025168962Spjd	vnode_t *tdvp = ap->a_tdvp;
6026168962Spjd	vnode_t *tvp = ap->a_tvp;
6027168962Spjd	int error;
6028168962Spjd
6029192237Skmacy	ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
6030192237Skmacy	ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
6031168962Spjd
6032168962Spjd	error = zfs_rename(fdvp, ap->a_fcnp->cn_nameptr, tdvp,
6033185029Spjd	    ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred, NULL, 0);
6034168962Spjd
6035168962Spjd	if (tdvp == tvp)
6036168962Spjd		VN_RELE(tdvp);
6037168962Spjd	else
6038168962Spjd		VN_URELE(tdvp);
6039168962Spjd	if (tvp)
6040168962Spjd		VN_URELE(tvp);
6041168962Spjd	VN_RELE(fdvp);
6042168962Spjd	VN_RELE(fvp);
6043168962Spjd
6044168962Spjd	return (error);
6045168962Spjd}
6046168962Spjd
6047168962Spjdstatic int
6048168962Spjdzfs_freebsd_symlink(ap)
6049168962Spjd	struct vop_symlink_args /* {
6050168962Spjd		struct vnode *a_dvp;
6051168962Spjd		struct vnode **a_vpp;
6052168962Spjd		struct componentname *a_cnp;
6053168962Spjd		struct vattr *a_vap;
6054168962Spjd		char *a_target;
6055168962Spjd	} */ *ap;
6056168962Spjd{
6057168962Spjd	struct componentname *cnp = ap->a_cnp;
6058168962Spjd	vattr_t *vap = ap->a_vap;
6059168962Spjd
6060168962Spjd	ASSERT(cnp->cn_flags & SAVENAME);
6061168962Spjd
6062168962Spjd	vap->va_type = VLNK;	/* FreeBSD: Syscall only sets va_mode. */
6063168962Spjd	vattr_init_mask(vap);
6064168962Spjd
6065168962Spjd	return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
6066168962Spjd	    ap->a_target, cnp->cn_cred, cnp->cn_thread));
6067168962Spjd}
6068168962Spjd
6069168962Spjdstatic int
6070168962Spjdzfs_freebsd_readlink(ap)
6071168962Spjd	struct vop_readlink_args /* {
6072168962Spjd		struct vnode *a_vp;
6073168962Spjd		struct uio *a_uio;
6074168962Spjd		struct ucred *a_cred;
6075168962Spjd	} */ *ap;
6076168962Spjd{
6077168962Spjd
6078185029Spjd	return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
6079168962Spjd}
6080168962Spjd
6081168962Spjdstatic int
6082168962Spjdzfs_freebsd_link(ap)
6083168962Spjd	struct vop_link_args /* {
6084168962Spjd		struct vnode *a_tdvp;
6085168962Spjd		struct vnode *a_vp;
6086168962Spjd		struct componentname *a_cnp;
6087168962Spjd	} */ *ap;
6088168962Spjd{
6089168962Spjd	struct componentname *cnp = ap->a_cnp;
6090168962Spjd
6091168962Spjd	ASSERT(cnp->cn_flags & SAVENAME);
6092168962Spjd
6093185029Spjd	return (zfs_link(ap->a_tdvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
6094168962Spjd}
6095168962Spjd
6096168962Spjdstatic int
6097168962Spjdzfs_freebsd_inactive(ap)
6098169170Spjd	struct vop_inactive_args /* {
6099169170Spjd		struct vnode *a_vp;
6100169170Spjd		struct thread *a_td;
6101169170Spjd	} */ *ap;
6102168962Spjd{
6103168962Spjd	vnode_t *vp = ap->a_vp;
6104168962Spjd
6105185029Spjd	zfs_inactive(vp, ap->a_td->td_ucred, NULL);
6106168962Spjd	return (0);
6107168962Spjd}
6108168962Spjd
6109185029Spjdstatic void
6110185029Spjdzfs_reclaim_complete(void *arg, int pending)
6111185029Spjd{
6112185029Spjd	znode_t	*zp = arg;
6113185029Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
6114185029Spjd
6115197133Spjd	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
6116219089Spjd	if (zp->z_sa_hdl != NULL) {
6117197133Spjd		ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id);
6118197133Spjd		zfs_znode_dmu_fini(zp);
6119197133Spjd		ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
6120197133Spjd	}
6121185029Spjd	zfs_znode_free(zp);
6122197133Spjd	rw_exit(&zfsvfs->z_teardown_inactive_lock);
6123197133Spjd	/*
6124197133Spjd	 * If the file system is being unmounted, there is a process waiting
6125197133Spjd	 * for us, wake it up.
6126197133Spjd	 */
6127197133Spjd	if (zfsvfs->z_unmounted)
6128197133Spjd		wakeup_one(zfsvfs);
6129185029Spjd}
6130185029Spjd
6131168962Spjdstatic int
6132168962Spjdzfs_freebsd_reclaim(ap)
6133168962Spjd	struct vop_reclaim_args /* {
6134168962Spjd		struct vnode *a_vp;
6135168962Spjd		struct thread *a_td;
6136168962Spjd	} */ *ap;
6137168962Spjd{
6138169170Spjd	vnode_t	*vp = ap->a_vp;
6139168962Spjd	znode_t	*zp = VTOZ(vp);
6140197133Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
6141219089Spjd	boolean_t rlocked;
6142168962Spjd
6143219089Spjd	rlocked = rw_tryenter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
6144197133Spjd
6145169025Spjd	ASSERT(zp != NULL);
6146169025Spjd
6147168962Spjd	/*
6148168962Spjd	 * Destroy the vm object and flush associated pages.
6149168962Spjd	 */
6150168962Spjd	vnode_destroy_vobject(vp);
6151169025Spjd
6152169025Spjd	mutex_enter(&zp->z_lock);
6153197153Spjd	zp->z_vnode = NULL;
6154196301Spjd	mutex_exit(&zp->z_lock);
6155196301Spjd
6156219089Spjd	if (zp->z_unlinked) {
6157196301Spjd		;	/* Do nothing. */
6158219089Spjd	} else if (!rlocked) {
6159219089Spjd		TASK_INIT(&zp->z_task, 0, zfs_reclaim_complete, zp);
6160219089Spjd		taskqueue_enqueue(taskqueue_thread, &zp->z_task);
6161219089Spjd	} else if (zp->z_sa_hdl == NULL) {
6162196301Spjd		zfs_znode_free(zp);
6163219089Spjd	} else /* if (!zp->z_unlinked && zp->z_dbuf != NULL) */ {
6164185029Spjd		int locked;
6165185029Spjd
6166185029Spjd		locked = MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)) ? 2 :
6167185029Spjd		    ZFS_OBJ_HOLD_TRYENTER(zfsvfs, zp->z_id);
6168185029Spjd		if (locked == 0) {
6169185029Spjd			/*
6170185029Spjd			 * Lock can't be obtained due to deadlock possibility,
6171185029Spjd			 * so defer znode destruction.
6172185029Spjd			 */
6173185029Spjd			TASK_INIT(&zp->z_task, 0, zfs_reclaim_complete, zp);
6174185029Spjd			taskqueue_enqueue(taskqueue_thread, &zp->z_task);
6175185029Spjd		} else {
6176185029Spjd			zfs_znode_dmu_fini(zp);
6177185029Spjd			if (locked == 1)
6178185029Spjd				ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
6179185029Spjd			zfs_znode_free(zp);
6180185029Spjd		}
6181169025Spjd	}
6182168962Spjd	VI_LOCK(vp);
6183168962Spjd	vp->v_data = NULL;
6184171567Spjd	ASSERT(vp->v_holdcnt >= 1);
6185171316Sdfr	VI_UNLOCK(vp);
6186219089Spjd	if (rlocked)
6187219089Spjd		rw_exit(&zfsvfs->z_teardown_inactive_lock);
6188168962Spjd	return (0);
6189168962Spjd}
6190168962Spjd
6191168962Spjdstatic int
6192168962Spjdzfs_freebsd_fid(ap)
6193168962Spjd	struct vop_fid_args /* {
6194168962Spjd		struct vnode *a_vp;
6195168962Spjd		struct fid *a_fid;
6196168962Spjd	} */ *ap;
6197168962Spjd{
6198168962Spjd
6199185029Spjd	return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
6200168962Spjd}
6201168962Spjd
6202168962Spjdstatic int
6203168962Spjdzfs_freebsd_pathconf(ap)
6204168962Spjd	struct vop_pathconf_args /* {
6205168962Spjd		struct vnode *a_vp;
6206168962Spjd		int a_name;
6207168962Spjd		register_t *a_retval;
6208168962Spjd	} */ *ap;
6209168962Spjd{
6210168962Spjd	ulong_t val;
6211168962Spjd	int error;
6212168962Spjd
6213185029Spjd	error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL);
6214168962Spjd	if (error == 0)
6215168962Spjd		*ap->a_retval = val;
6216168962Spjd	else if (error == EOPNOTSUPP)
6217168962Spjd		error = vop_stdpathconf(ap);
6218168962Spjd	return (error);
6219168962Spjd}
6220168962Spjd
6221196949Straszstatic int
6222196949Straszzfs_freebsd_fifo_pathconf(ap)
6223196949Strasz	struct vop_pathconf_args /* {
6224196949Strasz		struct vnode *a_vp;
6225196949Strasz		int a_name;
6226196949Strasz		register_t *a_retval;
6227196949Strasz	} */ *ap;
6228196949Strasz{
6229196949Strasz
6230196949Strasz	switch (ap->a_name) {
6231196949Strasz	case _PC_ACL_EXTENDED:
6232196949Strasz	case _PC_ACL_NFS4:
6233196949Strasz	case _PC_ACL_PATH_MAX:
6234196949Strasz	case _PC_MAC_PRESENT:
6235196949Strasz		return (zfs_freebsd_pathconf(ap));
6236196949Strasz	default:
6237196949Strasz		return (fifo_specops.vop_pathconf(ap));
6238196949Strasz	}
6239196949Strasz}
6240196949Strasz
6241185029Spjd/*
6242185029Spjd * FreeBSD's extended attributes namespace defines file name prefix for ZFS'
6243185029Spjd * extended attribute name:
6244185029Spjd *
6245185029Spjd *	NAMESPACE	PREFIX
6246185029Spjd *	system		freebsd:system:
6247185029Spjd *	user		(none, can be used to access ZFS fsattr(5) attributes
6248185029Spjd *			created on Solaris)
6249185029Spjd */
6250185029Spjdstatic int
6251185029Spjdzfs_create_attrname(int attrnamespace, const char *name, char *attrname,
6252185029Spjd    size_t size)
6253185029Spjd{
6254185029Spjd	const char *namespace, *prefix, *suffix;
6255185029Spjd
6256185029Spjd	/* We don't allow '/' character in attribute name. */
6257185029Spjd	if (strchr(name, '/') != NULL)
6258185029Spjd		return (EINVAL);
6259185029Spjd	/* We don't allow attribute names that start with "freebsd:" string. */
6260185029Spjd	if (strncmp(name, "freebsd:", 8) == 0)
6261185029Spjd		return (EINVAL);
6262185029Spjd
6263185029Spjd	bzero(attrname, size);
6264185029Spjd
6265185029Spjd	switch (attrnamespace) {
6266185029Spjd	case EXTATTR_NAMESPACE_USER:
6267185029Spjd#if 0
6268185029Spjd		prefix = "freebsd:";
6269185029Spjd		namespace = EXTATTR_NAMESPACE_USER_STRING;
6270185029Spjd		suffix = ":";
6271185029Spjd#else
6272185029Spjd		/*
6273185029Spjd		 * This is the default namespace by which we can access all
6274185029Spjd		 * attributes created on Solaris.
6275185029Spjd		 */
6276185029Spjd		prefix = namespace = suffix = "";
6277185029Spjd#endif
6278185029Spjd		break;
6279185029Spjd	case EXTATTR_NAMESPACE_SYSTEM:
6280185029Spjd		prefix = "freebsd:";
6281185029Spjd		namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
6282185029Spjd		suffix = ":";
6283185029Spjd		break;
6284185029Spjd	case EXTATTR_NAMESPACE_EMPTY:
6285185029Spjd	default:
6286185029Spjd		return (EINVAL);
6287185029Spjd	}
6288185029Spjd	if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
6289185029Spjd	    name) >= size) {
6290185029Spjd		return (ENAMETOOLONG);
6291185029Spjd	}
6292185029Spjd	return (0);
6293185029Spjd}
6294185029Spjd
6295185029Spjd/*
6296185029Spjd * Vnode operating to retrieve a named extended attribute.
6297185029Spjd */
6298185029Spjdstatic int
6299185029Spjdzfs_getextattr(struct vop_getextattr_args *ap)
6300185029Spjd/*
6301185029Spjdvop_getextattr {
6302185029Spjd	IN struct vnode *a_vp;
6303185029Spjd	IN int a_attrnamespace;
6304185029Spjd	IN const char *a_name;
6305185029Spjd	INOUT struct uio *a_uio;
6306185029Spjd	OUT size_t *a_size;
6307185029Spjd	IN struct ucred *a_cred;
6308185029Spjd	IN struct thread *a_td;
6309185029Spjd};
6310185029Spjd*/
6311185029Spjd{
6312185029Spjd	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
6313185029Spjd	struct thread *td = ap->a_td;
6314185029Spjd	struct nameidata nd;
6315185029Spjd	char attrname[255];
6316185029Spjd	struct vattr va;
6317185029Spjd	vnode_t *xvp = NULL, *vp;
6318185029Spjd	int error, flags;
6319185029Spjd
6320195785Strasz	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
6321195785Strasz	    ap->a_cred, ap->a_td, VREAD);
6322195785Strasz	if (error != 0)
6323195785Strasz		return (error);
6324195785Strasz
6325185029Spjd	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
6326185029Spjd	    sizeof(attrname));
6327185029Spjd	if (error != 0)
6328185029Spjd		return (error);
6329185029Spjd
6330185029Spjd	ZFS_ENTER(zfsvfs);
6331185029Spjd
6332185029Spjd	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
6333185029Spjd	    LOOKUP_XATTR);
6334185029Spjd	if (error != 0) {
6335185029Spjd		ZFS_EXIT(zfsvfs);
6336185029Spjd		return (error);
6337185029Spjd	}
6338185029Spjd
6339185029Spjd	flags = FREAD;
6340185029Spjd	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, attrname,
6341185029Spjd	    xvp, td);
6342194586Skib	error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL);
6343185029Spjd	vp = nd.ni_vp;
6344185029Spjd	NDFREE(&nd, NDF_ONLY_PNBUF);
6345185029Spjd	if (error != 0) {
6346196303Spjd		ZFS_EXIT(zfsvfs);
6347195785Strasz		if (error == ENOENT)
6348195785Strasz			error = ENOATTR;
6349185029Spjd		return (error);
6350185029Spjd	}
6351185029Spjd
6352185029Spjd	if (ap->a_size != NULL) {
6353185029Spjd		error = VOP_GETATTR(vp, &va, ap->a_cred);
6354185029Spjd		if (error == 0)
6355185029Spjd			*ap->a_size = (size_t)va.va_size;
6356185029Spjd	} else if (ap->a_uio != NULL)
6357224605Smm		error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred);
6358185029Spjd
6359185029Spjd	VOP_UNLOCK(vp, 0);
6360185029Spjd	vn_close(vp, flags, ap->a_cred, td);
6361185029Spjd	ZFS_EXIT(zfsvfs);
6362185029Spjd
6363185029Spjd	return (error);
6364185029Spjd}
6365185029Spjd
6366185029Spjd/*
6367185029Spjd * Vnode operation to remove a named attribute.
6368185029Spjd */
6369185029Spjdint
6370185029Spjdzfs_deleteextattr(struct vop_deleteextattr_args *ap)
6371185029Spjd/*
6372185029Spjdvop_deleteextattr {
6373185029Spjd	IN struct vnode *a_vp;
6374185029Spjd	IN int a_attrnamespace;
6375185029Spjd	IN const char *a_name;
6376185029Spjd	IN struct ucred *a_cred;
6377185029Spjd	IN struct thread *a_td;
6378185029Spjd};
6379185029Spjd*/
6380185029Spjd{
6381185029Spjd	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
6382185029Spjd	struct thread *td = ap->a_td;
6383185029Spjd	struct nameidata nd;
6384185029Spjd	char attrname[255];
6385185029Spjd	struct vattr va;
6386185029Spjd	vnode_t *xvp = NULL, *vp;
6387185029Spjd	int error, flags;
6388185029Spjd
6389195785Strasz	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
6390195785Strasz	    ap->a_cred, ap->a_td, VWRITE);
6391195785Strasz	if (error != 0)
6392195785Strasz		return (error);
6393195785Strasz
6394185029Spjd	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
6395185029Spjd	    sizeof(attrname));
6396185029Spjd	if (error != 0)
6397185029Spjd		return (error);
6398185029Spjd
6399185029Spjd	ZFS_ENTER(zfsvfs);
6400185029Spjd
6401185029Spjd	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
6402185029Spjd	    LOOKUP_XATTR);
6403185029Spjd	if (error != 0) {
6404185029Spjd		ZFS_EXIT(zfsvfs);
6405185029Spjd		return (error);
6406185029Spjd	}
6407185029Spjd
6408185029Spjd	NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF | MPSAFE,
6409185029Spjd	    UIO_SYSSPACE, attrname, xvp, td);
6410185029Spjd	error = namei(&nd);
6411185029Spjd	vp = nd.ni_vp;
6412185029Spjd	NDFREE(&nd, NDF_ONLY_PNBUF);
6413185029Spjd	if (error != 0) {
6414196303Spjd		ZFS_EXIT(zfsvfs);
6415195785Strasz		if (error == ENOENT)
6416195785Strasz			error = ENOATTR;
6417185029Spjd		return (error);
6418185029Spjd	}
6419185029Spjd	error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
6420185029Spjd
6421185029Spjd	vput(nd.ni_dvp);
6422185029Spjd	if (vp == nd.ni_dvp)
6423185029Spjd		vrele(vp);
6424185029Spjd	else
6425185029Spjd		vput(vp);
6426185029Spjd	ZFS_EXIT(zfsvfs);
6427185029Spjd
6428185029Spjd	return (error);
6429185029Spjd}
6430185029Spjd
6431185029Spjd/*
6432185029Spjd * Vnode operation to set a named attribute.
6433185029Spjd */
6434185029Spjdstatic int
6435185029Spjdzfs_setextattr(struct vop_setextattr_args *ap)
6436185029Spjd/*
6437185029Spjdvop_setextattr {
6438185029Spjd	IN struct vnode *a_vp;
6439185029Spjd	IN int a_attrnamespace;
6440185029Spjd	IN const char *a_name;
6441185029Spjd	INOUT struct uio *a_uio;
6442185029Spjd	IN struct ucred *a_cred;
6443185029Spjd	IN struct thread *a_td;
6444185029Spjd};
6445185029Spjd*/
6446185029Spjd{
6447185029Spjd	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
6448185029Spjd	struct thread *td = ap->a_td;
6449185029Spjd	struct nameidata nd;
6450185029Spjd	char attrname[255];
6451185029Spjd	struct vattr va;
6452185029Spjd	vnode_t *xvp = NULL, *vp;
6453185029Spjd	int error, flags;
6454185029Spjd
6455195785Strasz	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
6456195785Strasz	    ap->a_cred, ap->a_td, VWRITE);
6457195785Strasz	if (error != 0)
6458195785Strasz		return (error);
6459195785Strasz
6460185029Spjd	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
6461185029Spjd	    sizeof(attrname));
6462185029Spjd	if (error != 0)
6463185029Spjd		return (error);
6464185029Spjd
6465185029Spjd	ZFS_ENTER(zfsvfs);
6466185029Spjd
6467185029Spjd	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
6468195785Strasz	    LOOKUP_XATTR | CREATE_XATTR_DIR);
6469185029Spjd	if (error != 0) {
6470185029Spjd		ZFS_EXIT(zfsvfs);
6471185029Spjd		return (error);
6472185029Spjd	}
6473185029Spjd
6474185029Spjd	flags = FFLAGS(O_WRONLY | O_CREAT);
6475185029Spjd	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, attrname,
6476185029Spjd	    xvp, td);
6477194586Skib	error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL);
6478185029Spjd	vp = nd.ni_vp;
6479185029Spjd	NDFREE(&nd, NDF_ONLY_PNBUF);
6480185029Spjd	if (error != 0) {
6481185029Spjd		ZFS_EXIT(zfsvfs);
6482185029Spjd		return (error);
6483185029Spjd	}
6484185029Spjd
6485185029Spjd	VATTR_NULL(&va);
6486185029Spjd	va.va_size = 0;
6487185029Spjd	error = VOP_SETATTR(vp, &va, ap->a_cred);
6488185029Spjd	if (error == 0)
6489185029Spjd		VOP_WRITE(vp, ap->a_uio, IO_UNIT | IO_SYNC, ap->a_cred);
6490185029Spjd
6491185029Spjd	VOP_UNLOCK(vp, 0);
6492185029Spjd	vn_close(vp, flags, ap->a_cred, td);
6493185029Spjd	ZFS_EXIT(zfsvfs);
6494185029Spjd
6495185029Spjd	return (error);
6496185029Spjd}
6497185029Spjd
6498185029Spjd/*
6499185029Spjd * Vnode operation to retrieve extended attributes on a vnode.
6500185029Spjd */
6501185029Spjdstatic int
6502185029Spjdzfs_listextattr(struct vop_listextattr_args *ap)
6503185029Spjd/*
6504185029Spjdvop_listextattr {
6505185029Spjd	IN struct vnode *a_vp;
6506185029Spjd	IN int a_attrnamespace;
6507185029Spjd	INOUT struct uio *a_uio;
6508185029Spjd	OUT size_t *a_size;
6509185029Spjd	IN struct ucred *a_cred;
6510185029Spjd	IN struct thread *a_td;
6511185029Spjd};
6512185029Spjd*/
6513185029Spjd{
6514185029Spjd	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
6515185029Spjd	struct thread *td = ap->a_td;
6516185029Spjd	struct nameidata nd;
6517185029Spjd	char attrprefix[16];
6518185029Spjd	u_char dirbuf[sizeof(struct dirent)];
6519185029Spjd	struct dirent *dp;
6520185029Spjd	struct iovec aiov;
6521185029Spjd	struct uio auio, *uio = ap->a_uio;
6522185029Spjd	size_t *sizep = ap->a_size;
6523185029Spjd	size_t plen;
6524185029Spjd	vnode_t *xvp = NULL, *vp;
6525185029Spjd	int done, error, eof, pos;
6526185029Spjd
6527195785Strasz	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
6528195785Strasz	    ap->a_cred, ap->a_td, VREAD);
6529196303Spjd	if (error != 0)
6530195785Strasz		return (error);
6531195785Strasz
6532185029Spjd	error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
6533185029Spjd	    sizeof(attrprefix));
6534185029Spjd	if (error != 0)
6535185029Spjd		return (error);
6536185029Spjd	plen = strlen(attrprefix);
6537185029Spjd
6538185029Spjd	ZFS_ENTER(zfsvfs);
6539185029Spjd
6540195822Strasz	if (sizep != NULL)
6541195822Strasz		*sizep = 0;
6542195822Strasz
6543185029Spjd	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
6544185029Spjd	    LOOKUP_XATTR);
6545185029Spjd	if (error != 0) {
6546196303Spjd		ZFS_EXIT(zfsvfs);
6547195785Strasz		/*
6548195785Strasz		 * ENOATTR means that the EA directory does not yet exist,
6549195785Strasz		 * i.e. there are no extended attributes there.
6550195785Strasz		 */
6551195785Strasz		if (error == ENOATTR)
6552195785Strasz			error = 0;
6553185029Spjd		return (error);
6554185029Spjd	}
6555185029Spjd
6556188588Sjhb	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED | MPSAFE,
6557188588Sjhb	    UIO_SYSSPACE, ".", xvp, td);
6558185029Spjd	error = namei(&nd);
6559185029Spjd	vp = nd.ni_vp;
6560185029Spjd	NDFREE(&nd, NDF_ONLY_PNBUF);
6561185029Spjd	if (error != 0) {
6562185029Spjd		ZFS_EXIT(zfsvfs);
6563185029Spjd		return (error);
6564185029Spjd	}
6565185029Spjd
6566185029Spjd	auio.uio_iov = &aiov;
6567185029Spjd	auio.uio_iovcnt = 1;
6568185029Spjd	auio.uio_segflg = UIO_SYSSPACE;
6569185029Spjd	auio.uio_td = td;
6570185029Spjd	auio.uio_rw = UIO_READ;
6571185029Spjd	auio.uio_offset = 0;
6572185029Spjd
6573185029Spjd	do {
6574185029Spjd		u_char nlen;
6575185029Spjd
6576185029Spjd		aiov.iov_base = (void *)dirbuf;
6577185029Spjd		aiov.iov_len = sizeof(dirbuf);
6578185029Spjd		auio.uio_resid = sizeof(dirbuf);
6579185029Spjd		error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
6580185029Spjd		done = sizeof(dirbuf) - auio.uio_resid;
6581185029Spjd		if (error != 0)
6582185029Spjd			break;
6583185029Spjd		for (pos = 0; pos < done;) {
6584185029Spjd			dp = (struct dirent *)(dirbuf + pos);
6585185029Spjd			pos += dp->d_reclen;
6586185029Spjd			/*
6587185029Spjd			 * XXX: Temporarily we also accept DT_UNKNOWN, as this
6588185029Spjd			 * is what we get when attribute was created on Solaris.
6589185029Spjd			 */
6590185029Spjd			if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
6591185029Spjd				continue;
6592185029Spjd			if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0)
6593185029Spjd				continue;
6594185029Spjd			else if (strncmp(dp->d_name, attrprefix, plen) != 0)
6595185029Spjd				continue;
6596185029Spjd			nlen = dp->d_namlen - plen;
6597185029Spjd			if (sizep != NULL)
6598185029Spjd				*sizep += 1 + nlen;
6599185029Spjd			else if (uio != NULL) {
6600185029Spjd				/*
6601185029Spjd				 * Format of extattr name entry is one byte for
6602185029Spjd				 * length and the rest for name.
6603185029Spjd				 */
6604185029Spjd				error = uiomove(&nlen, 1, uio->uio_rw, uio);
6605185029Spjd				if (error == 0) {
6606185029Spjd					error = uiomove(dp->d_name + plen, nlen,
6607185029Spjd					    uio->uio_rw, uio);
6608185029Spjd				}
6609185029Spjd				if (error != 0)
6610185029Spjd					break;
6611185029Spjd			}
6612185029Spjd		}
6613185029Spjd	} while (!eof && error == 0);
6614185029Spjd
6615185029Spjd	vput(vp);
6616185029Spjd	ZFS_EXIT(zfsvfs);
6617185029Spjd
6618185029Spjd	return (error);
6619185029Spjd}
6620185029Spjd
6621192800Straszint
6622192800Straszzfs_freebsd_getacl(ap)
6623192800Strasz	struct vop_getacl_args /* {
6624192800Strasz		struct vnode *vp;
6625192800Strasz		acl_type_t type;
6626192800Strasz		struct acl *aclp;
6627192800Strasz		struct ucred *cred;
6628192800Strasz		struct thread *td;
6629192800Strasz	} */ *ap;
6630192800Strasz{
6631192800Strasz	int		error;
6632192800Strasz	vsecattr_t      vsecattr;
6633192800Strasz
6634192800Strasz	if (ap->a_type != ACL_TYPE_NFS4)
6635197435Strasz		return (EINVAL);
6636192800Strasz
6637192800Strasz	vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
6638192800Strasz	if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL))
6639192800Strasz		return (error);
6640192800Strasz
6641192800Strasz	error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt);
6642196303Spjd	if (vsecattr.vsa_aclentp != NULL)
6643196303Spjd		kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);
6644192800Strasz
6645196303Spjd	return (error);
6646192800Strasz}
6647192800Strasz
6648192800Straszint
6649192800Straszzfs_freebsd_setacl(ap)
6650192800Strasz	struct vop_setacl_args /* {
6651192800Strasz		struct vnode *vp;
6652192800Strasz		acl_type_t type;
6653192800Strasz		struct acl *aclp;
6654192800Strasz		struct ucred *cred;
6655192800Strasz		struct thread *td;
6656192800Strasz	} */ *ap;
6657192800Strasz{
6658192800Strasz	int		error;
6659192800Strasz	vsecattr_t      vsecattr;
6660192800Strasz	int		aclbsize;	/* size of acl list in bytes */
6661192800Strasz	aclent_t	*aaclp;
6662192800Strasz
6663192800Strasz	if (ap->a_type != ACL_TYPE_NFS4)
6664197435Strasz		return (EINVAL);
6665192800Strasz
6666192800Strasz	if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
6667192800Strasz		return (EINVAL);
6668192800Strasz
6669192800Strasz	/*
6670196949Strasz	 * With NFSv4 ACLs, chmod(2) may need to add additional entries,
6671192800Strasz	 * splitting every entry into two and appending "canonical six"
6672192800Strasz	 * entries at the end.  Don't allow for setting an ACL that would
6673192800Strasz	 * cause chmod(2) to run out of ACL entries.
6674192800Strasz	 */
6675192800Strasz	if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
6676192800Strasz		return (ENOSPC);
6677192800Strasz
6678208030Strasz	error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
6679208030Strasz	if (error != 0)
6680208030Strasz		return (error);
6681208030Strasz
6682192800Strasz	vsecattr.vsa_mask = VSA_ACE;
6683192800Strasz	aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t);
6684192800Strasz	vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
6685192800Strasz	aaclp = vsecattr.vsa_aclentp;
6686192800Strasz	vsecattr.vsa_aclentsz = aclbsize;
6687192800Strasz
6688192800Strasz	aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
6689192800Strasz	error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL);
6690192800Strasz	kmem_free(aaclp, aclbsize);
6691192800Strasz
6692192800Strasz	return (error);
6693192800Strasz}
6694192800Strasz
6695192800Straszint
6696192800Straszzfs_freebsd_aclcheck(ap)
6697192800Strasz	struct vop_aclcheck_args /* {
6698192800Strasz		struct vnode *vp;
6699192800Strasz		acl_type_t type;
6700192800Strasz		struct acl *aclp;
6701192800Strasz		struct ucred *cred;
6702192800Strasz		struct thread *td;
6703192800Strasz	} */ *ap;
6704192800Strasz{
6705192800Strasz
6706192800Strasz	return (EOPNOTSUPP);
6707192800Strasz}
6708192800Strasz
6709168404Spjdstruct vop_vector zfs_vnodeops;
6710168404Spjdstruct vop_vector zfs_fifoops;
6711209962Smmstruct vop_vector zfs_shareops;
6712168404Spjd
6713168404Spjdstruct vop_vector zfs_vnodeops = {
6714185029Spjd	.vop_default =		&default_vnodeops,
6715185029Spjd	.vop_inactive =		zfs_freebsd_inactive,
6716185029Spjd	.vop_reclaim =		zfs_freebsd_reclaim,
6717185029Spjd	.vop_access =		zfs_freebsd_access,
6718168404Spjd#ifdef FREEBSD_NAMECACHE
6719185029Spjd	.vop_lookup =		vfs_cache_lookup,
6720185029Spjd	.vop_cachedlookup =	zfs_freebsd_lookup,
6721168404Spjd#else
6722185029Spjd	.vop_lookup =		zfs_freebsd_lookup,
6723168404Spjd#endif
6724185029Spjd	.vop_getattr =		zfs_freebsd_getattr,
6725185029Spjd	.vop_setattr =		zfs_freebsd_setattr,
6726185029Spjd	.vop_create =		zfs_freebsd_create,
6727185029Spjd	.vop_mknod =		zfs_freebsd_create,
6728185029Spjd	.vop_mkdir =		zfs_freebsd_mkdir,
6729185029Spjd	.vop_readdir =		zfs_freebsd_readdir,
6730185029Spjd	.vop_fsync =		zfs_freebsd_fsync,
6731185029Spjd	.vop_open =		zfs_freebsd_open,
6732185029Spjd	.vop_close =		zfs_freebsd_close,
6733185029Spjd	.vop_rmdir =		zfs_freebsd_rmdir,
6734185029Spjd	.vop_ioctl =		zfs_freebsd_ioctl,
6735185029Spjd	.vop_link =		zfs_freebsd_link,
6736185029Spjd	.vop_symlink =		zfs_freebsd_symlink,
6737185029Spjd	.vop_readlink =		zfs_freebsd_readlink,
6738185029Spjd	.vop_read =		zfs_freebsd_read,
6739185029Spjd	.vop_write =		zfs_freebsd_write,
6740185029Spjd	.vop_remove =		zfs_freebsd_remove,
6741185029Spjd	.vop_rename =		zfs_freebsd_rename,
6742185029Spjd	.vop_pathconf =		zfs_freebsd_pathconf,
6743185029Spjd	.vop_bmap =		VOP_EOPNOTSUPP,
6744185029Spjd	.vop_fid =		zfs_freebsd_fid,
6745185029Spjd	.vop_getextattr =	zfs_getextattr,
6746185029Spjd	.vop_deleteextattr =	zfs_deleteextattr,
6747185029Spjd	.vop_setextattr =	zfs_setextattr,
6748185029Spjd	.vop_listextattr =	zfs_listextattr,
6749192800Strasz	.vop_getacl =		zfs_freebsd_getacl,
6750192800Strasz	.vop_setacl =		zfs_freebsd_setacl,
6751192800Strasz	.vop_aclcheck =		zfs_freebsd_aclcheck,
6752213937Savg	.vop_getpages =		zfs_freebsd_getpages,
6753168404Spjd};
6754168404Spjd
6755169170Spjdstruct vop_vector zfs_fifoops = {
6756185029Spjd	.vop_default =		&fifo_specops,
6757200162Skib	.vop_fsync =		zfs_freebsd_fsync,
6758185029Spjd	.vop_access =		zfs_freebsd_access,
6759185029Spjd	.vop_getattr =		zfs_freebsd_getattr,
6760185029Spjd	.vop_inactive =		zfs_freebsd_inactive,
6761185029Spjd	.vop_read =		VOP_PANIC,
6762185029Spjd	.vop_reclaim =		zfs_freebsd_reclaim,
6763185029Spjd	.vop_setattr =		zfs_freebsd_setattr,
6764185029Spjd	.vop_write =		VOP_PANIC,
6765196949Strasz	.vop_pathconf = 	zfs_freebsd_fifo_pathconf,
6766185029Spjd	.vop_fid =		zfs_freebsd_fid,
6767192800Strasz	.vop_getacl =		zfs_freebsd_getacl,
6768192800Strasz	.vop_setacl =		zfs_freebsd_setacl,
6769192800Strasz	.vop_aclcheck =		zfs_freebsd_aclcheck,
6770168404Spjd};
6771209962Smm
6772209962Smm/*
6773209962Smm * special share hidden files vnode operations template
6774209962Smm */
6775209962Smmstruct vop_vector zfs_shareops = {
6776209962Smm	.vop_default =		&default_vnodeops,
6777209962Smm	.vop_access =		zfs_freebsd_access,
6778209962Smm	.vop_inactive =		zfs_freebsd_inactive,
6779209962Smm	.vop_reclaim =		zfs_freebsd_reclaim,
6780209962Smm	.vop_fid =		zfs_freebsd_fid,
6781209962Smm	.vop_pathconf =		zfs_freebsd_pathconf,
6782209962Smm};
6783