zfs_vnops.c revision 254982
1168404Spjd/*
2168404Spjd * CDDL HEADER START
3168404Spjd *
4168404Spjd * The contents of this file are subject to the terms of the
5168404Spjd * Common Development and Distribution License (the "License").
6168404Spjd * You may not use this file except in compliance with the License.
7168404Spjd *
8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9168404Spjd * or http://www.opensolaris.org/os/licensing.
10168404Spjd * See the License for the specific language governing permissions
11168404Spjd * and limitations under the License.
12168404Spjd *
13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each
14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15168404Spjd * If applicable, add the following below this CDDL HEADER, with the
16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying
17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
18168404Spjd *
19168404Spjd * CDDL HEADER END
20168404Spjd */
21168404Spjd/*
22212694Smm * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23249195Smm * Copyright (c) 2013 by Delphix. All rights reserved.
24254585Sdelphij * Copyright 2013 Nexenta Systems, Inc.  All rights reserved.
25168404Spjd */
26168404Spjd
27169195Spjd/* Portions Copyright 2007 Jeremy Teo */
28219089Spjd/* Portions Copyright 2010 Robert Milkowski */
29169195Spjd
30168404Spjd#include <sys/types.h>
31168404Spjd#include <sys/param.h>
32168404Spjd#include <sys/time.h>
33168404Spjd#include <sys/systm.h>
34168404Spjd#include <sys/sysmacros.h>
35168404Spjd#include <sys/resource.h>
36168404Spjd#include <sys/vfs.h>
37248084Sattilio#include <sys/vm.h>
38168404Spjd#include <sys/vnode.h>
39168404Spjd#include <sys/file.h>
40168404Spjd#include <sys/stat.h>
41168404Spjd#include <sys/kmem.h>
42168404Spjd#include <sys/taskq.h>
43168404Spjd#include <sys/uio.h>
44168404Spjd#include <sys/atomic.h>
45168404Spjd#include <sys/namei.h>
46168404Spjd#include <sys/mman.h>
47168404Spjd#include <sys/cmn_err.h>
48168404Spjd#include <sys/errno.h>
49168404Spjd#include <sys/unistd.h>
50168404Spjd#include <sys/zfs_dir.h>
51168404Spjd#include <sys/zfs_ioctl.h>
52168404Spjd#include <sys/fs/zfs.h>
53168404Spjd#include <sys/dmu.h>
54219089Spjd#include <sys/dmu_objset.h>
55168404Spjd#include <sys/spa.h>
56168404Spjd#include <sys/txg.h>
57168404Spjd#include <sys/dbuf.h>
58168404Spjd#include <sys/zap.h>
59219089Spjd#include <sys/sa.h>
60168404Spjd#include <sys/dirent.h>
61168962Spjd#include <sys/policy.h>
62168962Spjd#include <sys/sunddi.h>
63168404Spjd#include <sys/filio.h>
64209962Smm#include <sys/sid.h>
65168404Spjd#include <sys/zfs_ctldir.h>
66185029Spjd#include <sys/zfs_fuid.h>
67219089Spjd#include <sys/zfs_sa.h>
68168404Spjd#include <sys/dnlc.h>
69168404Spjd#include <sys/zfs_rlock.h>
70185029Spjd#include <sys/extdirent.h>
71185029Spjd#include <sys/kidmap.h>
72168404Spjd#include <sys/bio.h>
73168404Spjd#include <sys/buf.h>
74168404Spjd#include <sys/sf_buf.h>
75168404Spjd#include <sys/sched.h>
76192800Strasz#include <sys/acl.h>
77239077Smarius#include <vm/vm_param.h>
78215401Savg#include <vm/vm_pageout.h>
79168404Spjd
80168404Spjd/*
81168404Spjd * Programming rules.
82168404Spjd *
83168404Spjd * Each vnode op performs some logical unit of work.  To do this, the ZPL must
84168404Spjd * properly lock its in-core state, create a DMU transaction, do the work,
85168404Spjd * record this work in the intent log (ZIL), commit the DMU transaction,
86185029Spjd * and wait for the intent log to commit if it is a synchronous operation.
87185029Spjd * Moreover, the vnode ops must work in both normal and log replay context.
88168404Spjd * The ordering of events is important to avoid deadlocks and references
89168404Spjd * to freed memory.  The example below illustrates the following Big Rules:
90168404Spjd *
91251631Sdelphij *  (1)	A check must be made in each zfs thread for a mounted file system.
92168404Spjd *	This is done avoiding races using ZFS_ENTER(zfsvfs).
93251631Sdelphij *	A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
94251631Sdelphij *	must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
95251631Sdelphij *	can return EIO from the calling function.
96168404Spjd *
97168404Spjd *  (2)	VN_RELE() should always be the last thing except for zil_commit()
98168404Spjd *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
99168404Spjd *	First, if it's the last reference, the vnode/znode
100168404Spjd *	can be freed, so the zp may point to freed memory.  Second, the last
101168404Spjd *	reference will call zfs_zinactive(), which may induce a lot of work --
102168404Spjd *	pushing cached pages (which acquires range locks) and syncing out
103168404Spjd *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
104168404Spjd *	which could deadlock the system if you were already holding one.
105191900Skmacy *	If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
106168404Spjd *
107168404Spjd *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
108168404Spjd *	as they can span dmu_tx_assign() calls.
109168404Spjd *
110209962Smm *  (4)	Always pass TXG_NOWAIT as the second argument to dmu_tx_assign().
111168404Spjd *	This is critical because we don't want to block while holding locks.
112168404Spjd *	Note, in particular, that if a lock is sometimes acquired before
113168404Spjd *	the tx assigns, and sometimes after (e.g. z_lock), then failing to
114168404Spjd *	use a non-blocking assign can deadlock the system.  The scenario:
115168404Spjd *
116168404Spjd *	Thread A has grabbed a lock before calling dmu_tx_assign().
117168404Spjd *	Thread B is in an already-assigned tx, and blocks for this lock.
118168404Spjd *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
119168404Spjd *	forever, because the previous txg can't quiesce until B's tx commits.
120168404Spjd *
121168404Spjd *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
122168404Spjd *	then drop all locks, call dmu_tx_wait(), and try again.
123168404Spjd *
124168404Spjd *  (5)	If the operation succeeded, generate the intent log entry for it
125168404Spjd *	before dropping locks.  This ensures that the ordering of events
126168404Spjd *	in the intent log matches the order in which they actually occurred.
127251631Sdelphij *	During ZIL replay the zfs_log_* functions will update the sequence
128209962Smm *	number to indicate the zil transaction has replayed.
129168404Spjd *
130168404Spjd *  (6)	At the end of each vnode op, the DMU tx must always commit,
131168404Spjd *	regardless of whether there were any errors.
132168404Spjd *
133219089Spjd *  (7)	After dropping all locks, invoke zil_commit(zilog, foid)
134168404Spjd *	to ensure that synchronous semantics are provided when necessary.
135168404Spjd *
136168404Spjd * In general, this is how things should be ordered in each vnode op:
137168404Spjd *
138168404Spjd *	ZFS_ENTER(zfsvfs);		// exit if unmounted
139168404Spjd * top:
140168404Spjd *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may VN_HOLD())
141168404Spjd *	rw_enter(...);			// grab any other locks you need
142168404Spjd *	tx = dmu_tx_create(...);	// get DMU tx
143168404Spjd *	dmu_tx_hold_*();		// hold each object you might modify
144209962Smm *	error = dmu_tx_assign(tx, TXG_NOWAIT);	// try to assign
145168404Spjd *	if (error) {
146168404Spjd *		rw_exit(...);		// drop locks
147168404Spjd *		zfs_dirent_unlock(dl);	// unlock directory entry
148168404Spjd *		VN_RELE(...);		// release held vnodes
149209962Smm *		if (error == ERESTART) {
150168404Spjd *			dmu_tx_wait(tx);
151168404Spjd *			dmu_tx_abort(tx);
152168404Spjd *			goto top;
153168404Spjd *		}
154168404Spjd *		dmu_tx_abort(tx);	// abort DMU tx
155168404Spjd *		ZFS_EXIT(zfsvfs);	// finished in zfs
156168404Spjd *		return (error);		// really out of space
157168404Spjd *	}
158168404Spjd *	error = do_real_work();		// do whatever this VOP does
159168404Spjd *	if (error == 0)
160168404Spjd *		zfs_log_*(...);		// on success, make ZIL entry
161168404Spjd *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
162168404Spjd *	rw_exit(...);			// drop locks
163168404Spjd *	zfs_dirent_unlock(dl);		// unlock directory entry
164168404Spjd *	VN_RELE(...);			// release held vnodes
165219089Spjd *	zil_commit(zilog, foid);	// synchronous when necessary
166168404Spjd *	ZFS_EXIT(zfsvfs);		// finished in zfs
167168404Spjd *	return (error);			// done, report error
168168404Spjd */
169185029Spjd
170168404Spjd/* ARGSUSED */
171168404Spjdstatic int
172185029Spjdzfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
173168404Spjd{
174168962Spjd	znode_t	*zp = VTOZ(*vpp);
175209962Smm	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
176168404Spjd
177209962Smm	ZFS_ENTER(zfsvfs);
178209962Smm	ZFS_VERIFY_ZP(zp);
179209962Smm
180219089Spjd	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
181185029Spjd	    ((flag & FAPPEND) == 0)) {
182209962Smm		ZFS_EXIT(zfsvfs);
183249195Smm		return (SET_ERROR(EPERM));
184185029Spjd	}
185185029Spjd
186185029Spjd	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
187185029Spjd	    ZTOV(zp)->v_type == VREG &&
188219089Spjd	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
189209962Smm		if (fs_vscan(*vpp, cr, 0) != 0) {
190209962Smm			ZFS_EXIT(zfsvfs);
191249195Smm			return (SET_ERROR(EACCES));
192209962Smm		}
193209962Smm	}
194185029Spjd
195168404Spjd	/* Keep a count of the synchronous opens in the znode */
196168962Spjd	if (flag & (FSYNC | FDSYNC))
197168404Spjd		atomic_inc_32(&zp->z_sync_cnt);
198185029Spjd
199209962Smm	ZFS_EXIT(zfsvfs);
200168404Spjd	return (0);
201168404Spjd}
202168404Spjd
203168404Spjd/* ARGSUSED */
204168404Spjdstatic int
205185029Spjdzfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
206185029Spjd    caller_context_t *ct)
207168404Spjd{
208168962Spjd	znode_t	*zp = VTOZ(vp);
209209962Smm	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
210168404Spjd
211210470Smm	/*
212210470Smm	 * Clean up any locks held by this process on the vp.
213210470Smm	 */
214210470Smm	cleanlocks(vp, ddi_get_pid(), 0);
215210470Smm	cleanshares(vp, ddi_get_pid());
216210470Smm
217209962Smm	ZFS_ENTER(zfsvfs);
218209962Smm	ZFS_VERIFY_ZP(zp);
219209962Smm
220168404Spjd	/* Decrement the synchronous opens in the znode */
221185029Spjd	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
222168404Spjd		atomic_dec_32(&zp->z_sync_cnt);
223168404Spjd
224185029Spjd	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
225185029Spjd	    ZTOV(zp)->v_type == VREG &&
226219089Spjd	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
227185029Spjd		VERIFY(fs_vscan(vp, cr, 1) == 0);
228185029Spjd
229209962Smm	ZFS_EXIT(zfsvfs);
230168404Spjd	return (0);
231168404Spjd}
232168404Spjd
233168404Spjd/*
234168404Spjd * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
235168404Spjd * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
236168404Spjd */
237168404Spjdstatic int
238168978Spjdzfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
239168404Spjd{
240168404Spjd	znode_t	*zp = VTOZ(vp);
241168404Spjd	uint64_t noff = (uint64_t)*off; /* new offset */
242168404Spjd	uint64_t file_sz;
243168404Spjd	int error;
244168404Spjd	boolean_t hole;
245168404Spjd
246219089Spjd	file_sz = zp->z_size;
247168404Spjd	if (noff >= file_sz)  {
248249195Smm		return (SET_ERROR(ENXIO));
249168404Spjd	}
250168404Spjd
251168962Spjd	if (cmd == _FIO_SEEK_HOLE)
252168404Spjd		hole = B_TRUE;
253168404Spjd	else
254168404Spjd		hole = B_FALSE;
255168404Spjd
256168404Spjd	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
257168404Spjd
258168404Spjd	/* end of file? */
259168404Spjd	if ((error == ESRCH) || (noff > file_sz)) {
260168404Spjd		/*
261168404Spjd		 * Handle the virtual hole at the end of file.
262168404Spjd		 */
263168404Spjd		if (hole) {
264168404Spjd			*off = file_sz;
265168404Spjd			return (0);
266168404Spjd		}
267249195Smm		return (SET_ERROR(ENXIO));
268168404Spjd	}
269168404Spjd
270168404Spjd	if (noff < *off)
271168404Spjd		return (error);
272168404Spjd	*off = noff;
273168404Spjd	return (error);
274168404Spjd}
275168404Spjd
276168404Spjd/* ARGSUSED */
277168404Spjdstatic int
278168978Spjdzfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
279185029Spjd    int *rvalp, caller_context_t *ct)
280168404Spjd{
281168962Spjd	offset_t off;
282168962Spjd	int error;
283168962Spjd	zfsvfs_t *zfsvfs;
284185029Spjd	znode_t *zp;
285168404Spjd
286168404Spjd	switch (com) {
287185029Spjd	case _FIOFFS:
288168962Spjd		return (0);
289168404Spjd
290168962Spjd		/*
291168962Spjd		 * The following two ioctls are used by bfu.  Faking out,
292168962Spjd		 * necessary to avoid bfu errors.
293168962Spjd		 */
294185029Spjd	case _FIOGDIO:
295185029Spjd	case _FIOSDIO:
296168962Spjd		return (0);
297168962Spjd
298185029Spjd	case _FIO_SEEK_DATA:
299185029Spjd	case _FIO_SEEK_HOLE:
300233918Savg#ifdef sun
301168962Spjd		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
302249195Smm			return (SET_ERROR(EFAULT));
303233918Savg#else
304233918Savg		off = *(offset_t *)data;
305233918Savg#endif
306185029Spjd		zp = VTOZ(vp);
307185029Spjd		zfsvfs = zp->z_zfsvfs;
308168404Spjd		ZFS_ENTER(zfsvfs);
309185029Spjd		ZFS_VERIFY_ZP(zp);
310168404Spjd
311168404Spjd		/* offset parameter is in/out */
312168404Spjd		error = zfs_holey(vp, com, &off);
313168404Spjd		ZFS_EXIT(zfsvfs);
314168404Spjd		if (error)
315168404Spjd			return (error);
316233918Savg#ifdef sun
317168962Spjd		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
318249195Smm			return (SET_ERROR(EFAULT));
319233918Savg#else
320233918Savg		*(offset_t *)data = off;
321233918Savg#endif
322168404Spjd		return (0);
323168404Spjd	}
324249195Smm	return (SET_ERROR(ENOTTY));
325168404Spjd}
326168404Spjd
327209962Smmstatic vm_page_t
328253953Sattiliopage_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
329209962Smm{
330209962Smm	vm_object_t obj;
331209962Smm	vm_page_t pp;
332209962Smm
333209962Smm	obj = vp->v_object;
334248084Sattilio	zfs_vmobject_assert_wlocked(obj);
335209962Smm
336209962Smm	for (;;) {
337209962Smm		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
338246293Savg		    pp->valid) {
339254138Sattilio			if (vm_page_xbusied(pp)) {
340212652Savg				/*
341212652Savg				 * Reference the page before unlocking and
342212652Savg				 * sleeping so that the page daemon is less
343212652Savg				 * likely to reclaim it.
344212652Savg				 */
345225418Skib				vm_page_reference(pp);
346254138Sattilio				vm_page_lock(pp);
347254138Sattilio				zfs_vmobject_wunlock(obj);
348254138Sattilio				vm_page_busy_sleep(pp, "zfsmwb");
349254138Sattilio				zfs_vmobject_wlock(obj);
350209962Smm				continue;
351212652Savg			}
352254138Sattilio			vm_page_sbusy(pp);
353252337Sgavin		} else if (pp == NULL) {
354246293Savg			pp = vm_page_alloc(obj, OFF_TO_IDX(start),
355246293Savg			    VM_ALLOC_SYSTEM | VM_ALLOC_IFCACHED |
356254138Sattilio			    VM_ALLOC_SBUSY);
357252337Sgavin		} else {
358252337Sgavin			ASSERT(pp != NULL && !pp->valid);
359252337Sgavin			pp = NULL;
360209962Smm		}
361246293Savg
362246293Savg		if (pp != NULL) {
363246293Savg			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
364253953Sattilio			vm_object_pip_add(obj, 1);
365246293Savg			pmap_remove_write(pp);
366246293Savg			vm_page_clear_dirty(pp, off, nbytes);
367246293Savg		}
368209962Smm		break;
369209962Smm	}
370209962Smm	return (pp);
371209962Smm}
372209962Smm
373209962Smmstatic void
374253953Sattiliopage_unbusy(vm_page_t pp)
375209962Smm{
376209962Smm
377254138Sattilio	vm_page_sunbusy(pp);
378253953Sattilio	vm_object_pip_subtract(pp->object, 1);
379209962Smm}
380209962Smm
381253953Sattiliostatic vm_page_t
382253953Sattiliopage_hold(vnode_t *vp, int64_t start)
383253953Sattilio{
384253953Sattilio	vm_object_t obj;
385253953Sattilio	vm_page_t pp;
386253953Sattilio
387253953Sattilio	obj = vp->v_object;
388253953Sattilio	zfs_vmobject_assert_wlocked(obj);
389253953Sattilio
390253953Sattilio	for (;;) {
391253953Sattilio		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
392253953Sattilio		    pp->valid) {
393254138Sattilio			if (vm_page_xbusied(pp)) {
394253953Sattilio				/*
395253953Sattilio				 * Reference the page before unlocking and
396253953Sattilio				 * sleeping so that the page daemon is less
397253953Sattilio				 * likely to reclaim it.
398253953Sattilio				 */
399253953Sattilio				vm_page_reference(pp);
400254138Sattilio				vm_page_lock(pp);
401254138Sattilio				zfs_vmobject_wunlock(obj);
402254138Sattilio				vm_page_busy_sleep(pp, "zfsmwb");
403254138Sattilio				zfs_vmobject_wlock(obj);
404253953Sattilio				continue;
405253953Sattilio			}
406253953Sattilio
407253953Sattilio			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
408253953Sattilio			vm_page_lock(pp);
409253953Sattilio			vm_page_hold(pp);
410253953Sattilio			vm_page_unlock(pp);
411253953Sattilio
412253953Sattilio		} else
413253953Sattilio			pp = NULL;
414253953Sattilio		break;
415253953Sattilio	}
416253953Sattilio	return (pp);
417253953Sattilio}
418253953Sattilio
419253953Sattiliostatic void
420253953Sattiliopage_unhold(vm_page_t pp)
421253953Sattilio{
422253953Sattilio
423253953Sattilio	vm_page_lock(pp);
424253953Sattilio	vm_page_unhold(pp);
425253953Sattilio	vm_page_unlock(pp);
426253953Sattilio}
427253953Sattilio
428209962Smmstatic caddr_t
429209962Smmzfs_map_page(vm_page_t pp, struct sf_buf **sfp)
430209962Smm{
431209962Smm
432212951Savg	*sfp = sf_buf_alloc(pp, 0);
433209962Smm	return ((caddr_t)sf_buf_kva(*sfp));
434209962Smm}
435209962Smm
/*
 * Release a mapping established by zfs_map_page() by freeing its sf_buf.
 */
static void
zfs_unmap_page(struct sf_buf *sf)
{

	sf_buf_free(sf);
}
442209962Smm
443168404Spjd/*
444168404Spjd * When a file is memory mapped, we must keep the IO data synchronized
445168404Spjd * between the DMU cache and the memory mapped pages.  What this means:
446168404Spjd *
447168404Spjd * On Write:	If we find a memory mapped page, we write to *both*
448168404Spjd *		the page and the dmu buffer.
449168404Spjd */
450209962Smmstatic void
451209962Smmupdate_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
452209962Smm    int segflg, dmu_tx_t *tx)
453168404Spjd{
454168404Spjd	vm_object_t obj;
455168404Spjd	struct sf_buf *sf;
456246293Savg	caddr_t va;
457212655Savg	int off;
458168404Spjd
459168404Spjd	ASSERT(vp->v_mount != NULL);
460168404Spjd	obj = vp->v_object;
461168404Spjd	ASSERT(obj != NULL);
462168404Spjd
463168404Spjd	off = start & PAGEOFFSET;
464248084Sattilio	zfs_vmobject_wlock(obj);
465168404Spjd	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
466209962Smm		vm_page_t pp;
467246293Savg		int nbytes = imin(PAGESIZE - off, len);
468168404Spjd
469246293Savg		if (segflg == UIO_NOCOPY) {
470246293Savg			pp = vm_page_lookup(obj, OFF_TO_IDX(start));
471246293Savg			KASSERT(pp != NULL,
472246293Savg			    ("zfs update_pages: NULL page in putpages case"));
473246293Savg			KASSERT(off == 0,
474246293Savg			    ("zfs update_pages: unaligned data in putpages case"));
475246293Savg			KASSERT(pp->valid == VM_PAGE_BITS_ALL,
476246293Savg			    ("zfs update_pages: invalid page in putpages case"));
477254138Sattilio			KASSERT(vm_page_sbusied(pp),
478246293Savg			    ("zfs update_pages: unbusy page in putpages case"));
479246293Savg			KASSERT(!pmap_page_is_write_mapped(pp),
480246293Savg			    ("zfs update_pages: writable page in putpages case"));
481248084Sattilio			zfs_vmobject_wunlock(obj);
482168404Spjd
483246293Savg			va = zfs_map_page(pp, &sf);
484246293Savg			(void) dmu_write(os, oid, start, nbytes, va, tx);
485246293Savg			zfs_unmap_page(sf);
486246293Savg
487248084Sattilio			zfs_vmobject_wlock(obj);
488246293Savg			vm_page_undirty(pp);
489253953Sattilio		} else if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
490248084Sattilio			zfs_vmobject_wunlock(obj);
491246293Savg
492209962Smm			va = zfs_map_page(pp, &sf);
493246293Savg			(void) dmu_read(os, oid, start+off, nbytes,
494246293Savg			    va+off, DMU_READ_PREFETCH);;
495209962Smm			zfs_unmap_page(sf);
496246293Savg
497248084Sattilio			zfs_vmobject_wlock(obj);
498253953Sattilio			page_unbusy(pp);
499168404Spjd		}
500209962Smm		len -= nbytes;
501168404Spjd		off = 0;
502168404Spjd	}
503246293Savg	if (segflg != UIO_NOCOPY)
504246293Savg		vm_object_pip_wakeupn(obj, 0);
505248084Sattilio	zfs_vmobject_wunlock(obj);
506168404Spjd}
507168404Spjd
508168404Spjd/*
509219089Spjd * Read with UIO_NOCOPY flag means that sendfile(2) requests
510219089Spjd * ZFS to populate a range of page cache pages with data.
511219089Spjd *
512219089Spjd * NOTE: this function could be optimized to pre-allocate
513254138Sattilio * all pages in advance, drain exclusive busy on all of them,
514219089Spjd * map them into contiguous KVA region and populate them
515219089Spjd * in one single dmu_read() call.
516219089Spjd */
517219089Spjdstatic int
518219089Spjdmappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
519219089Spjd{
520219089Spjd	znode_t *zp = VTOZ(vp);
521219089Spjd	objset_t *os = zp->z_zfsvfs->z_os;
522219089Spjd	struct sf_buf *sf;
523219089Spjd	vm_object_t obj;
524219089Spjd	vm_page_t pp;
525219089Spjd	int64_t start;
526219089Spjd	caddr_t va;
527219089Spjd	int len = nbytes;
528219089Spjd	int off;
529219089Spjd	int error = 0;
530219089Spjd
531219089Spjd	ASSERT(uio->uio_segflg == UIO_NOCOPY);
532219089Spjd	ASSERT(vp->v_mount != NULL);
533219089Spjd	obj = vp->v_object;
534219089Spjd	ASSERT(obj != NULL);
535219089Spjd	ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);
536219089Spjd
537248084Sattilio	zfs_vmobject_wlock(obj);
538219089Spjd	for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
539219089Spjd		int bytes = MIN(PAGESIZE, len);
540219089Spjd
541254138Sattilio		pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY |
542254649Skib		    VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
543219089Spjd		if (pp->valid == 0) {
544248084Sattilio			zfs_vmobject_wunlock(obj);
545219089Spjd			va = zfs_map_page(pp, &sf);
546219089Spjd			error = dmu_read(os, zp->z_id, start, bytes, va,
547219089Spjd			    DMU_READ_PREFETCH);
548219089Spjd			if (bytes != PAGESIZE && error == 0)
549219089Spjd				bzero(va + bytes, PAGESIZE - bytes);
550219089Spjd			zfs_unmap_page(sf);
551248084Sattilio			zfs_vmobject_wlock(obj);
552254138Sattilio			vm_page_sunbusy(pp);
553219089Spjd			vm_page_lock(pp);
554219089Spjd			if (error) {
555253073Savg				if (pp->wire_count == 0 && pp->valid == 0 &&
556254138Sattilio				    !vm_page_busied(pp))
557253073Savg					vm_page_free(pp);
558219089Spjd			} else {
559219089Spjd				pp->valid = VM_PAGE_BITS_ALL;
560219089Spjd				vm_page_activate(pp);
561219089Spjd			}
562219089Spjd			vm_page_unlock(pp);
563254138Sattilio		} else
564254138Sattilio			vm_page_sunbusy(pp);
565219089Spjd		if (error)
566219089Spjd			break;
567219089Spjd		uio->uio_resid -= bytes;
568219089Spjd		uio->uio_offset += bytes;
569219089Spjd		len -= bytes;
570219089Spjd	}
571248084Sattilio	zfs_vmobject_wunlock(obj);
572219089Spjd	return (error);
573219089Spjd}
574219089Spjd
575219089Spjd/*
576168404Spjd * When a file is memory mapped, we must keep the IO data synchronized
577168404Spjd * between the DMU cache and the memory mapped pages.  What this means:
578168404Spjd *
579168404Spjd * On Read:	We "read" preferentially from memory mapped pages,
580168404Spjd *		else we default from the dmu buffer.
581168404Spjd *
582168404Spjd * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
583251631Sdelphij *	 the file is memory mapped.
584168404Spjd */
585168404Spjdstatic int
586168404Spjdmappedread(vnode_t *vp, int nbytes, uio_t *uio)
587168404Spjd{
588168404Spjd	znode_t *zp = VTOZ(vp);
589168404Spjd	objset_t *os = zp->z_zfsvfs->z_os;
590168404Spjd	vm_object_t obj;
591212655Savg	int64_t start;
592168926Spjd	caddr_t va;
593168404Spjd	int len = nbytes;
594212655Savg	int off;
595168404Spjd	int error = 0;
596168404Spjd
597168404Spjd	ASSERT(vp->v_mount != NULL);
598168404Spjd	obj = vp->v_object;
599168404Spjd	ASSERT(obj != NULL);
600168404Spjd
601168404Spjd	start = uio->uio_loffset;
602168404Spjd	off = start & PAGEOFFSET;
603248084Sattilio	zfs_vmobject_wlock(obj);
604168404Spjd	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
605219089Spjd		vm_page_t pp;
606219089Spjd		uint64_t bytes = MIN(PAGESIZE - off, len);
607168404Spjd
608253953Sattilio		if (pp = page_hold(vp, start)) {
609219089Spjd			struct sf_buf *sf;
610219089Spjd			caddr_t va;
611212652Savg
612248084Sattilio			zfs_vmobject_wunlock(obj);
613219089Spjd			va = zfs_map_page(pp, &sf);
614219089Spjd			error = uiomove(va + off, bytes, UIO_READ, uio);
615219089Spjd			zfs_unmap_page(sf);
616248084Sattilio			zfs_vmobject_wlock(obj);
617253953Sattilio			page_unhold(pp);
618219089Spjd		} else {
619248084Sattilio			zfs_vmobject_wunlock(obj);
620219089Spjd			error = dmu_read_uio(os, zp->z_id, uio, bytes);
621248084Sattilio			zfs_vmobject_wlock(obj);
622168404Spjd		}
623168404Spjd		len -= bytes;
624168404Spjd		off = 0;
625168404Spjd		if (error)
626168404Spjd			break;
627168404Spjd	}
628248084Sattilio	zfs_vmobject_wunlock(obj);
629168404Spjd	return (error);
630168404Spjd}
631168404Spjd
632168404Spjdoffset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
633168404Spjd
634168404Spjd/*
635168404Spjd * Read bytes from specified file into supplied buffer.
636168404Spjd *
637168404Spjd *	IN:	vp	- vnode of file to be read from.
638168404Spjd *		uio	- structure supplying read location, range info,
639168404Spjd *			  and return buffer.
640168404Spjd *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
641168404Spjd *		cr	- credentials of caller.
642185029Spjd *		ct	- caller context
643168404Spjd *
644168404Spjd *	OUT:	uio	- updated offset and range, buffer filled.
645168404Spjd *
646251631Sdelphij *	RETURN:	0 on success, error code on failure.
647168404Spjd *
648168404Spjd * Side Effects:
649168404Spjd *	vp - atime updated if byte count > 0
650168404Spjd */
651168404Spjd/* ARGSUSED */
652168404Spjdstatic int
653168962Spjdzfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
654168404Spjd{
655168404Spjd	znode_t		*zp = VTOZ(vp);
656168404Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
657185029Spjd	objset_t	*os;
658168404Spjd	ssize_t		n, nbytes;
659247187Smm	int		error = 0;
660168404Spjd	rl_t		*rl;
661219089Spjd	xuio_t		*xuio = NULL;
662168404Spjd
663168404Spjd	ZFS_ENTER(zfsvfs);
664185029Spjd	ZFS_VERIFY_ZP(zp);
665185029Spjd	os = zfsvfs->z_os;
666168404Spjd
667219089Spjd	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
668185029Spjd		ZFS_EXIT(zfsvfs);
669249195Smm		return (SET_ERROR(EACCES));
670185029Spjd	}
671185029Spjd
672168404Spjd	/*
673168404Spjd	 * Validate file offset
674168404Spjd	 */
675168404Spjd	if (uio->uio_loffset < (offset_t)0) {
676168404Spjd		ZFS_EXIT(zfsvfs);
677249195Smm		return (SET_ERROR(EINVAL));
678168404Spjd	}
679168404Spjd
680168404Spjd	/*
681168404Spjd	 * Fasttrack empty reads
682168404Spjd	 */
683168404Spjd	if (uio->uio_resid == 0) {
684168404Spjd		ZFS_EXIT(zfsvfs);
685168404Spjd		return (0);
686168404Spjd	}
687168404Spjd
688168404Spjd	/*
689168962Spjd	 * Check for mandatory locks
690168962Spjd	 */
691219089Spjd	if (MANDMODE(zp->z_mode)) {
692168962Spjd		if (error = chklock(vp, FREAD,
693168962Spjd		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
694168962Spjd			ZFS_EXIT(zfsvfs);
695168962Spjd			return (error);
696168962Spjd		}
697168962Spjd	}
698168962Spjd
699168962Spjd	/*
700168404Spjd	 * If we're in FRSYNC mode, sync out this znode before reading it.
701168404Spjd	 */
702224605Smm	if (zfsvfs->z_log &&
703224605Smm	    (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
704219089Spjd		zil_commit(zfsvfs->z_log, zp->z_id);
705168404Spjd
706168404Spjd	/*
707168404Spjd	 * Lock the range against changes.
708168404Spjd	 */
709168404Spjd	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
710168404Spjd
711168404Spjd	/*
712168404Spjd	 * If we are reading past end-of-file we can skip
713168404Spjd	 * to the end; but we might still need to set atime.
714168404Spjd	 */
715219089Spjd	if (uio->uio_loffset >= zp->z_size) {
716168404Spjd		error = 0;
717168404Spjd		goto out;
718168404Spjd	}
719168404Spjd
720219089Spjd	ASSERT(uio->uio_loffset < zp->z_size);
721219089Spjd	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
722168404Spjd
723219089Spjd#ifdef sun
724219089Spjd	if ((uio->uio_extflg == UIO_XUIO) &&
725219089Spjd	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
726219089Spjd		int nblk;
727219089Spjd		int blksz = zp->z_blksz;
728219089Spjd		uint64_t offset = uio->uio_loffset;
729219089Spjd
730219089Spjd		xuio = (xuio_t *)uio;
731219089Spjd		if ((ISP2(blksz))) {
732219089Spjd			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
733219089Spjd			    blksz)) / blksz;
734219089Spjd		} else {
735219089Spjd			ASSERT(offset + n <= blksz);
736219089Spjd			nblk = 1;
737219089Spjd		}
738219089Spjd		(void) dmu_xuio_init(xuio, nblk);
739219089Spjd
740219089Spjd		if (vn_has_cached_data(vp)) {
741219089Spjd			/*
742219089Spjd			 * For simplicity, we always allocate a full buffer
743219089Spjd			 * even if we only expect to read a portion of a block.
744219089Spjd			 */
745219089Spjd			while (--nblk >= 0) {
746219089Spjd				(void) dmu_xuio_add(xuio,
747219089Spjd				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
748219089Spjd				    blksz), 0, blksz);
749219089Spjd			}
750219089Spjd		}
751219089Spjd	}
752219089Spjd#endif	/* sun */
753219089Spjd
754168404Spjd	while (n > 0) {
755168404Spjd		nbytes = MIN(n, zfs_read_chunk_size -
756168404Spjd		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
757168404Spjd
758219089Spjd#ifdef __FreeBSD__
759219089Spjd		if (uio->uio_segflg == UIO_NOCOPY)
760219089Spjd			error = mappedread_sf(vp, nbytes, uio);
761219089Spjd		else
762219089Spjd#endif /* __FreeBSD__ */
763168404Spjd		if (vn_has_cached_data(vp))
764168404Spjd			error = mappedread(vp, nbytes, uio);
765168404Spjd		else
766168404Spjd			error = dmu_read_uio(os, zp->z_id, uio, nbytes);
767185029Spjd		if (error) {
768185029Spjd			/* convert checksum errors into IO errors */
769185029Spjd			if (error == ECKSUM)
770249195Smm				error = SET_ERROR(EIO);
771168404Spjd			break;
772185029Spjd		}
773168962Spjd
774168404Spjd		n -= nbytes;
775168404Spjd	}
776168404Spjdout:
777168404Spjd	zfs_range_unlock(rl);
778168404Spjd
779168404Spjd	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
780168404Spjd	ZFS_EXIT(zfsvfs);
781168404Spjd	return (error);
782168404Spjd}
783168404Spjd
784168404Spjd/*
785168404Spjd * Write the bytes to a file.
786168404Spjd *
787168404Spjd *	IN:	vp	- vnode of file to be written to.
788168404Spjd *		uio	- structure supplying write location, range info,
789168404Spjd *			  and data buffer.
790251631Sdelphij *		ioflag	- FAPPEND, FSYNC, and/or FDSYNC.  FAPPEND is
791251631Sdelphij *			  set if in append mode.
792168404Spjd *		cr	- credentials of caller.
793185029Spjd *		ct	- caller context (NFS/CIFS fem monitor only)
794168404Spjd *
795168404Spjd *	OUT:	uio	- updated offset and range.
796168404Spjd *
797251631Sdelphij *	RETURN:	0 on success, error code on failure.
798168404Spjd *
799168404Spjd * Timestamps:
800168404Spjd *	vp - ctime|mtime updated if byte count > 0
801168404Spjd */
802219089Spjd
803168404Spjd/* ARGSUSED */
804168404Spjdstatic int
805168962Spjdzfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
806168404Spjd{
807168404Spjd	znode_t		*zp = VTOZ(vp);
808168962Spjd	rlim64_t	limit = MAXOFFSET_T;
809168404Spjd	ssize_t		start_resid = uio->uio_resid;
810168404Spjd	ssize_t		tx_bytes;
811168404Spjd	uint64_t	end_size;
812168404Spjd	dmu_tx_t	*tx;
813168404Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
814185029Spjd	zilog_t		*zilog;
815168404Spjd	offset_t	woff;
816168404Spjd	ssize_t		n, nbytes;
817168404Spjd	rl_t		*rl;
818168404Spjd	int		max_blksz = zfsvfs->z_max_blksz;
819247187Smm	int		error = 0;
820209962Smm	arc_buf_t	*abuf;
821247187Smm	iovec_t		*aiov = NULL;
822219089Spjd	xuio_t		*xuio = NULL;
823219089Spjd	int		i_iov = 0;
824219089Spjd	int		iovcnt = uio->uio_iovcnt;
825219089Spjd	iovec_t		*iovp = uio->uio_iov;
826219089Spjd	int		write_eof;
827219089Spjd	int		count = 0;
828219089Spjd	sa_bulk_attr_t	bulk[4];
829219089Spjd	uint64_t	mtime[2], ctime[2];
830168404Spjd
831168404Spjd	/*
832168404Spjd	 * Fasttrack empty write
833168404Spjd	 */
834168404Spjd	n = start_resid;
835168404Spjd	if (n == 0)
836168404Spjd		return (0);
837168404Spjd
838168962Spjd	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
839168962Spjd		limit = MAXOFFSET_T;
840168962Spjd
841168404Spjd	ZFS_ENTER(zfsvfs);
842185029Spjd	ZFS_VERIFY_ZP(zp);
843168404Spjd
844219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
845219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
846219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
847219089Spjd	    &zp->z_size, 8);
848219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
849219089Spjd	    &zp->z_pflags, 8);
850219089Spjd
851168404Spjd	/*
852185029Spjd	 * If immutable or not appending then return EPERM
853185029Spjd	 */
854219089Spjd	if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
855219089Spjd	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
856219089Spjd	    (uio->uio_loffset < zp->z_size))) {
857185029Spjd		ZFS_EXIT(zfsvfs);
858249195Smm		return (SET_ERROR(EPERM));
859185029Spjd	}
860185029Spjd
861185029Spjd	zilog = zfsvfs->z_log;
862185029Spjd
863185029Spjd	/*
864219089Spjd	 * Validate file offset
865219089Spjd	 */
866219089Spjd	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
867219089Spjd	if (woff < 0) {
868219089Spjd		ZFS_EXIT(zfsvfs);
869249195Smm		return (SET_ERROR(EINVAL));
870219089Spjd	}
871219089Spjd
872219089Spjd	/*
873219089Spjd	 * Check for mandatory locks before calling zfs_range_lock()
874219089Spjd	 * in order to prevent a deadlock with locks set via fcntl().
875219089Spjd	 */
876219089Spjd	if (MANDMODE((mode_t)zp->z_mode) &&
877219089Spjd	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
878219089Spjd		ZFS_EXIT(zfsvfs);
879219089Spjd		return (error);
880219089Spjd	}
881219089Spjd
882219089Spjd#ifdef sun
883219089Spjd	/*
884168404Spjd	 * Pre-fault the pages to ensure slow (eg NFS) pages
885168404Spjd	 * don't hold up txg.
886219089Spjd	 * Skip this if uio contains loaned arc_buf.
887168404Spjd	 */
888219089Spjd	if ((uio->uio_extflg == UIO_XUIO) &&
889219089Spjd	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
890219089Spjd		xuio = (xuio_t *)uio;
891219089Spjd	else
892219089Spjd		uio_prefaultpages(MIN(n, max_blksz), uio);
893219089Spjd#endif	/* sun */
894168404Spjd
895168404Spjd	/*
896168404Spjd	 * If in append mode, set the io offset pointer to eof.
897168404Spjd	 */
898213673Spjd	if (ioflag & FAPPEND) {
899168404Spjd		/*
900219089Spjd		 * Obtain an appending range lock to guarantee file append
901219089Spjd		 * semantics.  We reset the write offset once we have the lock.
902168404Spjd		 */
903168404Spjd		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
904219089Spjd		woff = rl->r_off;
905168404Spjd		if (rl->r_len == UINT64_MAX) {
906219089Spjd			/*
907219089Spjd			 * We overlocked the file because this write will cause
908219089Spjd			 * the file block size to increase.
909219089Spjd			 * Note that zp_size cannot change with this lock held.
910219089Spjd			 */
911219089Spjd			woff = zp->z_size;
912168404Spjd		}
913219089Spjd		uio->uio_loffset = woff;
914168404Spjd	} else {
915168404Spjd		/*
916219089Spjd		 * Note that if the file block size will change as a result of
917219089Spjd		 * this write, then this range lock will lock the entire file
918219089Spjd		 * so that we can re-write the block safely.
919168404Spjd		 */
920168404Spjd		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
921168404Spjd	}
922168404Spjd
923235781Strasz	if (vn_rlimit_fsize(vp, uio, uio->uio_td)) {
924235781Strasz		zfs_range_unlock(rl);
925235781Strasz		ZFS_EXIT(zfsvfs);
926235781Strasz		return (EFBIG);
927235781Strasz	}
928235781Strasz
929168962Spjd	if (woff >= limit) {
930168962Spjd		zfs_range_unlock(rl);
931168962Spjd		ZFS_EXIT(zfsvfs);
932249195Smm		return (SET_ERROR(EFBIG));
933168962Spjd	}
934168962Spjd
935168962Spjd	if ((woff + n) > limit || woff > (limit - n))
936168962Spjd		n = limit - woff;
937168962Spjd
938219089Spjd	/* Will this write extend the file length? */
939219089Spjd	write_eof = (woff + n > zp->z_size);
940168404Spjd
941219089Spjd	end_size = MAX(zp->z_size, woff + n);
942219089Spjd
943168404Spjd	/*
944168404Spjd	 * Write the file in reasonable size chunks.  Each chunk is written
945168404Spjd	 * in a separate transaction; this keeps the intent log records small
946168404Spjd	 * and allows us to do more fine-grained space accounting.
947168404Spjd	 */
948168404Spjd	while (n > 0) {
949209962Smm		abuf = NULL;
950209962Smm		woff = uio->uio_loffset;
951209962Smmagain:
952219089Spjd		if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
953219089Spjd		    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
954209962Smm			if (abuf != NULL)
955209962Smm				dmu_return_arcbuf(abuf);
956249195Smm			error = SET_ERROR(EDQUOT);
957209962Smm			break;
958209962Smm		}
959209962Smm
960219089Spjd		if (xuio && abuf == NULL) {
961219089Spjd			ASSERT(i_iov < iovcnt);
962219089Spjd			aiov = &iovp[i_iov];
963219089Spjd			abuf = dmu_xuio_arcbuf(xuio, i_iov);
964219089Spjd			dmu_xuio_clear(xuio, i_iov);
965219089Spjd			DTRACE_PROBE3(zfs_cp_write, int, i_iov,
966219089Spjd			    iovec_t *, aiov, arc_buf_t *, abuf);
967219089Spjd			ASSERT((aiov->iov_base == abuf->b_data) ||
968219089Spjd			    ((char *)aiov->iov_base - (char *)abuf->b_data +
969219089Spjd			    aiov->iov_len == arc_buf_size(abuf)));
970219089Spjd			i_iov++;
971219089Spjd		} else if (abuf == NULL && n >= max_blksz &&
972219089Spjd		    woff >= zp->z_size &&
973209962Smm		    P2PHASE(woff, max_blksz) == 0 &&
974209962Smm		    zp->z_blksz == max_blksz) {
975219089Spjd			/*
976219089Spjd			 * This write covers a full block.  "Borrow" a buffer
977219089Spjd			 * from the dmu so that we can fill it before we enter
978219089Spjd			 * a transaction.  This avoids the possibility of
979219089Spjd			 * holding up the transaction if the data copy hangs
980219089Spjd			 * up on a pagefault (e.g., from an NFS server mapping).
981219089Spjd			 */
982209962Smm			size_t cbytes;
983209962Smm
984219089Spjd			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
985219089Spjd			    max_blksz);
986209962Smm			ASSERT(abuf != NULL);
987209962Smm			ASSERT(arc_buf_size(abuf) == max_blksz);
988209962Smm			if (error = uiocopy(abuf->b_data, max_blksz,
989209962Smm			    UIO_WRITE, uio, &cbytes)) {
990209962Smm				dmu_return_arcbuf(abuf);
991209962Smm				break;
992209962Smm			}
993209962Smm			ASSERT(cbytes == max_blksz);
994209962Smm		}
995209962Smm
996209962Smm		/*
997168404Spjd		 * Start a transaction.
998168404Spjd		 */
999168404Spjd		tx = dmu_tx_create(zfsvfs->z_os);
1000219089Spjd		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1001168404Spjd		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
1002219089Spjd		zfs_sa_upgrade_txholds(tx, zp);
1003209962Smm		error = dmu_tx_assign(tx, TXG_NOWAIT);
1004168404Spjd		if (error) {
1005209962Smm			if (error == ERESTART) {
1006168404Spjd				dmu_tx_wait(tx);
1007168404Spjd				dmu_tx_abort(tx);
1008209962Smm				goto again;
1009168404Spjd			}
1010168404Spjd			dmu_tx_abort(tx);
1011209962Smm			if (abuf != NULL)
1012209962Smm				dmu_return_arcbuf(abuf);
1013168404Spjd			break;
1014168404Spjd		}
1015168404Spjd
1016168404Spjd		/*
1017168404Spjd		 * If zfs_range_lock() over-locked we grow the blocksize
1018168404Spjd		 * and then reduce the lock range.  This will only happen
1019168404Spjd		 * on the first iteration since zfs_range_reduce() will
1020168404Spjd		 * shrink down r_len to the appropriate size.
1021168404Spjd		 */
1022168404Spjd		if (rl->r_len == UINT64_MAX) {
1023168404Spjd			uint64_t new_blksz;
1024168404Spjd
1025168404Spjd			if (zp->z_blksz > max_blksz) {
1026168404Spjd				ASSERT(!ISP2(zp->z_blksz));
1027168404Spjd				new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
1028168404Spjd			} else {
1029168404Spjd				new_blksz = MIN(end_size, max_blksz);
1030168404Spjd			}
1031168404Spjd			zfs_grow_blocksize(zp, new_blksz, tx);
1032168404Spjd			zfs_range_reduce(rl, woff, n);
1033168404Spjd		}
1034168404Spjd
1035168404Spjd		/*
1036168404Spjd		 * XXX - should we really limit each write to z_max_blksz?
1037168404Spjd		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
1038168404Spjd		 */
1039168404Spjd		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
1040168404Spjd
1041219089Spjd		if (woff + nbytes > zp->z_size)
1042168404Spjd			vnode_pager_setsize(vp, woff + nbytes);
1043168404Spjd
1044209962Smm		if (abuf == NULL) {
1045209962Smm			tx_bytes = uio->uio_resid;
1046219089Spjd			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
1047219089Spjd			    uio, nbytes, tx);
1048209962Smm			tx_bytes -= uio->uio_resid;
1049168404Spjd		} else {
1050209962Smm			tx_bytes = nbytes;
1051219089Spjd			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
1052219089Spjd			/*
1053219089Spjd			 * If this is not a full block write, but we are
1054219089Spjd			 * extending the file past EOF and this data starts
1055219089Spjd			 * block-aligned, use assign_arcbuf().  Otherwise,
1056219089Spjd			 * write via dmu_write().
1057219089Spjd			 */
1058219089Spjd			if (tx_bytes < max_blksz && (!write_eof ||
1059219089Spjd			    aiov->iov_base != abuf->b_data)) {
1060219089Spjd				ASSERT(xuio);
1061219089Spjd				dmu_write(zfsvfs->z_os, zp->z_id, woff,
1062219089Spjd				    aiov->iov_len, aiov->iov_base, tx);
1063219089Spjd				dmu_return_arcbuf(abuf);
1064219089Spjd				xuio_stat_wbuf_copied();
1065219089Spjd			} else {
1066219089Spjd				ASSERT(xuio || tx_bytes == max_blksz);
1067219089Spjd				dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
1068219089Spjd				    woff, abuf, tx);
1069219089Spjd			}
1070209962Smm			ASSERT(tx_bytes <= uio->uio_resid);
1071209962Smm			uioskip(uio, tx_bytes);
1072168404Spjd		}
1073212657Savg		if (tx_bytes && vn_has_cached_data(vp)) {
1074209962Smm			update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
1075209962Smm			    zp->z_id, uio->uio_segflg, tx);
1076209962Smm		}
1077209962Smm
1078209962Smm		/*
1079168404Spjd		 * If we made no progress, we're done.  If we made even
1080168404Spjd		 * partial progress, update the znode and ZIL accordingly.
1081168404Spjd		 */
1082168404Spjd		if (tx_bytes == 0) {
1083219089Spjd			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
1084219089Spjd			    (void *)&zp->z_size, sizeof (uint64_t), tx);
1085168404Spjd			dmu_tx_commit(tx);
1086168404Spjd			ASSERT(error != 0);
1087168404Spjd			break;
1088168404Spjd		}
1089168404Spjd
1090168404Spjd		/*
1091168404Spjd		 * Clear Set-UID/Set-GID bits on successful write if not
1092168404Spjd		 * privileged and at least one of the excute bits is set.
1093168404Spjd		 *
1094168404Spjd		 * It would be nice to to this after all writes have
1095168404Spjd		 * been done, but that would still expose the ISUID/ISGID
1096168404Spjd		 * to another app after the partial write is committed.
1097185029Spjd		 *
1098185029Spjd		 * Note: we don't call zfs_fuid_map_id() here because
1099185029Spjd		 * user 0 is not an ephemeral uid.
1100168404Spjd		 */
1101168404Spjd		mutex_enter(&zp->z_acl_lock);
1102219089Spjd		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
1103168404Spjd		    (S_IXUSR >> 6))) != 0 &&
1104219089Spjd		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
1105185029Spjd		    secpolicy_vnode_setid_retain(vp, cr,
1106219089Spjd		    (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
1107219089Spjd			uint64_t newmode;
1108219089Spjd			zp->z_mode &= ~(S_ISUID | S_ISGID);
1109219089Spjd			newmode = zp->z_mode;
1110219089Spjd			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
1111219089Spjd			    (void *)&newmode, sizeof (uint64_t), tx);
1112168404Spjd		}
1113168404Spjd		mutex_exit(&zp->z_acl_lock);
1114168404Spjd
1115219089Spjd		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
1116219089Spjd		    B_TRUE);
1117168404Spjd
1118168404Spjd		/*
1119168404Spjd		 * Update the file size (zp_size) if it has changed;
1120168404Spjd		 * account for possible concurrent updates.
1121168404Spjd		 */
1122219089Spjd		while ((end_size = zp->z_size) < uio->uio_loffset) {
1123219089Spjd			(void) atomic_cas_64(&zp->z_size, end_size,
1124168404Spjd			    uio->uio_loffset);
1125219089Spjd			ASSERT(error == 0);
1126219089Spjd		}
1127219089Spjd		/*
1128219089Spjd		 * If we are replaying and eof is non zero then force
1129219089Spjd		 * the file size to the specified eof. Note, there's no
1130219089Spjd		 * concurrency during replay.
1131219089Spjd		 */
1132219089Spjd		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
1133219089Spjd			zp->z_size = zfsvfs->z_replay_eof;
1134219089Spjd
1135219089Spjd		error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1136219089Spjd
1137168404Spjd		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
1138168404Spjd		dmu_tx_commit(tx);
1139168404Spjd
1140168404Spjd		if (error != 0)
1141168404Spjd			break;
1142168404Spjd		ASSERT(tx_bytes == nbytes);
1143168404Spjd		n -= nbytes;
1144219089Spjd
1145219089Spjd#ifdef sun
1146219089Spjd		if (!xuio && n > 0)
1147219089Spjd			uio_prefaultpages(MIN(n, max_blksz), uio);
1148219089Spjd#endif	/* sun */
1149168404Spjd	}
1150168404Spjd
1151168404Spjd	zfs_range_unlock(rl);
1152168404Spjd
1153168404Spjd	/*
1154168404Spjd	 * If we're in replay mode, or we made no progress, return error.
1155168404Spjd	 * Otherwise, it's at least a partial write, so it's successful.
1156168404Spjd	 */
1157209962Smm	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
1158168404Spjd		ZFS_EXIT(zfsvfs);
1159168404Spjd		return (error);
1160168404Spjd	}
1161168404Spjd
1162219089Spjd	if (ioflag & (FSYNC | FDSYNC) ||
1163219089Spjd	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1164219089Spjd		zil_commit(zilog, zp->z_id);
1165168404Spjd
1166168404Spjd	ZFS_EXIT(zfsvfs);
1167168404Spjd	return (0);
1168168404Spjd}
1169168404Spjd
1170168404Spjdvoid
1171219089Spjdzfs_get_done(zgd_t *zgd, int error)
1172168404Spjd{
1173219089Spjd	znode_t *zp = zgd->zgd_private;
1174219089Spjd	objset_t *os = zp->z_zfsvfs->z_os;
1175168404Spjd
1176219089Spjd	if (zgd->zgd_db)
1177219089Spjd		dmu_buf_rele(zgd->zgd_db, zgd);
1178219089Spjd
1179219089Spjd	zfs_range_unlock(zgd->zgd_rl);
1180219089Spjd
1181191900Skmacy	/*
1182191900Skmacy	 * Release the vnode asynchronously as we currently have the
1183191900Skmacy	 * txg stopped from syncing.
1184191900Skmacy	 */
1185219089Spjd	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1186219089Spjd
1187219089Spjd	if (error == 0 && zgd->zgd_bp)
1188219089Spjd		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
1189219089Spjd
1190168404Spjd	kmem_free(zgd, sizeof (zgd_t));
1191168404Spjd}
1192168404Spjd
1193214378Smm#ifdef DEBUG
1194214378Smmstatic int zil_fault_io = 0;
1195214378Smm#endif
1196214378Smm
1197168404Spjd/*
1198168404Spjd * Get data to generate a TX_WRITE intent log record.
1199168404Spjd */
1200168404Spjdint
1201168404Spjdzfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
1202168404Spjd{
1203168404Spjd	zfsvfs_t *zfsvfs = arg;
1204168404Spjd	objset_t *os = zfsvfs->z_os;
1205168404Spjd	znode_t *zp;
1206219089Spjd	uint64_t object = lr->lr_foid;
1207219089Spjd	uint64_t offset = lr->lr_offset;
1208219089Spjd	uint64_t size = lr->lr_length;
1209219089Spjd	blkptr_t *bp = &lr->lr_blkptr;
1210168404Spjd	dmu_buf_t *db;
1211168404Spjd	zgd_t *zgd;
1212168404Spjd	int error = 0;
1213168404Spjd
1214219089Spjd	ASSERT(zio != NULL);
1215219089Spjd	ASSERT(size != 0);
1216168404Spjd
1217168404Spjd	/*
1218168404Spjd	 * Nothing to do if the file has been removed
1219168404Spjd	 */
1220219089Spjd	if (zfs_zget(zfsvfs, object, &zp) != 0)
1221249195Smm		return (SET_ERROR(ENOENT));
1222168404Spjd	if (zp->z_unlinked) {
1223191900Skmacy		/*
1224191900Skmacy		 * Release the vnode asynchronously as we currently have the
1225191900Skmacy		 * txg stopped from syncing.
1226191900Skmacy		 */
1227196307Spjd		VN_RELE_ASYNC(ZTOV(zp),
1228196307Spjd		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1229249195Smm		return (SET_ERROR(ENOENT));
1230168404Spjd	}
1231168404Spjd
1232219089Spjd	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
1233219089Spjd	zgd->zgd_zilog = zfsvfs->z_log;
1234219089Spjd	zgd->zgd_private = zp;
1235219089Spjd
1236168404Spjd	/*
1237168404Spjd	 * Write records come in two flavors: immediate and indirect.
1238168404Spjd	 * For small writes it's cheaper to store the data with the
1239168404Spjd	 * log record (immediate); for large writes it's cheaper to
1240168404Spjd	 * sync the data and get a pointer to it (indirect) so that
1241168404Spjd	 * we don't have to write the data twice.
1242168404Spjd	 */
1243168404Spjd	if (buf != NULL) { /* immediate write */
1244219089Spjd		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
1245168404Spjd		/* test for truncation needs to be done while range locked */
1246219089Spjd		if (offset >= zp->z_size) {
1247249195Smm			error = SET_ERROR(ENOENT);
1248219089Spjd		} else {
1249219089Spjd			error = dmu_read(os, object, offset, size, buf,
1250219089Spjd			    DMU_READ_NO_PREFETCH);
1251168404Spjd		}
1252219089Spjd		ASSERT(error == 0 || error == ENOENT);
1253168404Spjd	} else { /* indirect write */
1254168404Spjd		/*
1255168404Spjd		 * Have to lock the whole block to ensure when it's
1256168404Spjd		 * written out and it's checksum is being calculated
1257168404Spjd		 * that no one can change the data. We need to re-check
1258168404Spjd		 * blocksize after we get the lock in case it's changed!
1259168404Spjd		 */
1260168404Spjd		for (;;) {
1261219089Spjd			uint64_t blkoff;
1262219089Spjd			size = zp->z_blksz;
1263219089Spjd			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
1264219089Spjd			offset -= blkoff;
1265219089Spjd			zgd->zgd_rl = zfs_range_lock(zp, offset, size,
1266219089Spjd			    RL_READER);
1267219089Spjd			if (zp->z_blksz == size)
1268168404Spjd				break;
1269219089Spjd			offset += blkoff;
1270219089Spjd			zfs_range_unlock(zgd->zgd_rl);
1271168404Spjd		}
1272168404Spjd		/* test for truncation needs to be done while range locked */
1273219089Spjd		if (lr->lr_offset >= zp->z_size)
1274249195Smm			error = SET_ERROR(ENOENT);
1275214378Smm#ifdef DEBUG
1276214378Smm		if (zil_fault_io) {
1277249195Smm			error = SET_ERROR(EIO);
1278214378Smm			zil_fault_io = 0;
1279214378Smm		}
1280214378Smm#endif
1281219089Spjd		if (error == 0)
1282219089Spjd			error = dmu_buf_hold(os, object, offset, zgd, &db,
1283219089Spjd			    DMU_READ_NO_PREFETCH);
1284214378Smm
1285209962Smm		if (error == 0) {
1286243524Smm			blkptr_t *obp = dmu_buf_get_blkptr(db);
1287243524Smm			if (obp) {
1288243524Smm				ASSERT(BP_IS_HOLE(bp));
1289243524Smm				*bp = *obp;
1290243524Smm			}
1291243524Smm
1292219089Spjd			zgd->zgd_db = db;
1293219089Spjd			zgd->zgd_bp = bp;
1294219089Spjd
1295219089Spjd			ASSERT(db->db_offset == offset);
1296219089Spjd			ASSERT(db->db_size == size);
1297219089Spjd
1298219089Spjd			error = dmu_sync(zio, lr->lr_common.lrc_txg,
1299219089Spjd			    zfs_get_done, zgd);
1300219089Spjd			ASSERT(error || lr->lr_length <= zp->z_blksz);
1301219089Spjd
1302209962Smm			/*
1303219089Spjd			 * On success, we need to wait for the write I/O
1304219089Spjd			 * initiated by dmu_sync() to complete before we can
1305219089Spjd			 * release this dbuf.  We will finish everything up
1306219089Spjd			 * in the zfs_get_done() callback.
1307209962Smm			 */
1308219089Spjd			if (error == 0)
1309219089Spjd				return (0);
1310209962Smm
1311219089Spjd			if (error == EALREADY) {
1312219089Spjd				lr->lr_common.lrc_txtype = TX_WRITE2;
1313219089Spjd				error = 0;
1314219089Spjd			}
1315209962Smm		}
1316168404Spjd	}
1317219089Spjd
1318219089Spjd	zfs_get_done(zgd, error);
1319219089Spjd
1320168404Spjd	return (error);
1321168404Spjd}
1322168404Spjd
1323168404Spjd/*ARGSUSED*/
1324168404Spjdstatic int
1325185029Spjdzfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
1326185029Spjd    caller_context_t *ct)
1327168404Spjd{
1328168404Spjd	znode_t *zp = VTOZ(vp);
1329168404Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1330168404Spjd	int error;
1331168404Spjd
1332168404Spjd	ZFS_ENTER(zfsvfs);
1333185029Spjd	ZFS_VERIFY_ZP(zp);
1334185029Spjd
1335185029Spjd	if (flag & V_ACE_MASK)
1336185029Spjd		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
1337185029Spjd	else
1338185029Spjd		error = zfs_zaccess_rwx(zp, mode, flag, cr);
1339185029Spjd
1340168404Spjd	ZFS_EXIT(zfsvfs);
1341168404Spjd	return (error);
1342168404Spjd}
1343168404Spjd
1344168404Spjd/*
1345211932Smm * If vnode is for a device return a specfs vnode instead.
1346211932Smm */
1347211932Smmstatic int
1348211932Smmspecvp_check(vnode_t **vpp, cred_t *cr)
1349211932Smm{
1350211932Smm	int error = 0;
1351211932Smm
1352211932Smm	if (IS_DEVVP(*vpp)) {
1353211932Smm		struct vnode *svp;
1354211932Smm
1355211932Smm		svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
1356211932Smm		VN_RELE(*vpp);
1357211932Smm		if (svp == NULL)
1358249195Smm			error = SET_ERROR(ENOSYS);
1359211932Smm		*vpp = svp;
1360211932Smm	}
1361211932Smm	return (error);
1362211932Smm}
1363211932Smm
1364211932Smm
1365211932Smm/*
1366168404Spjd * Lookup an entry in a directory, or an extended attribute directory.
1367168404Spjd * If it exists, return a held vnode reference for it.
1368168404Spjd *
1369168404Spjd *	IN:	dvp	- vnode of directory to search.
1370168404Spjd *		nm	- name of entry to lookup.
1371168404Spjd *		pnp	- full pathname to lookup [UNUSED].
1372168404Spjd *		flags	- LOOKUP_XATTR set if looking for an attribute.
1373168404Spjd *		rdir	- root directory vnode [UNUSED].
1374168404Spjd *		cr	- credentials of caller.
1375185029Spjd *		ct	- caller context
1376185029Spjd *		direntflags - directory lookup flags
1377185029Spjd *		realpnp - returned pathname.
1378168404Spjd *
1379168404Spjd *	OUT:	vpp	- vnode of located entry, NULL if not found.
1380168404Spjd *
1381251631Sdelphij *	RETURN:	0 on success, error code on failure.
1382168404Spjd *
1383168404Spjd * Timestamps:
1384168404Spjd *	NA
1385168404Spjd */
1386168404Spjd/* ARGSUSED */
1387168962Spjdstatic int
1388168962Spjdzfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
1389185029Spjd    int nameiop, cred_t *cr, kthread_t *td, int flags)
1390168404Spjd{
1391168962Spjd	znode_t *zdp = VTOZ(dvp);
1392168962Spjd	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1393211932Smm	int	error = 0;
1394185029Spjd	int *direntflags = NULL;
1395185029Spjd	void *realpnp = NULL;
1396168404Spjd
1397211932Smm	/* fast path */
1398211932Smm	if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
1399211932Smm
1400211932Smm		if (dvp->v_type != VDIR) {
1401249195Smm			return (SET_ERROR(ENOTDIR));
1402219089Spjd		} else if (zdp->z_sa_hdl == NULL) {
1403249195Smm			return (SET_ERROR(EIO));
1404211932Smm		}
1405211932Smm
1406211932Smm		if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
1407211932Smm			error = zfs_fastaccesschk_execute(zdp, cr);
1408211932Smm			if (!error) {
1409211932Smm				*vpp = dvp;
1410211932Smm				VN_HOLD(*vpp);
1411211932Smm				return (0);
1412211932Smm			}
1413211932Smm			return (error);
1414211932Smm		} else {
1415211932Smm			vnode_t *tvp = dnlc_lookup(dvp, nm);
1416211932Smm
1417211932Smm			if (tvp) {
1418211932Smm				error = zfs_fastaccesschk_execute(zdp, cr);
1419211932Smm				if (error) {
1420211932Smm					VN_RELE(tvp);
1421211932Smm					return (error);
1422211932Smm				}
1423211932Smm				if (tvp == DNLC_NO_VNODE) {
1424211932Smm					VN_RELE(tvp);
1425249195Smm					return (SET_ERROR(ENOENT));
1426211932Smm				} else {
1427211932Smm					*vpp = tvp;
1428211932Smm					return (specvp_check(vpp, cr));
1429211932Smm				}
1430211932Smm			}
1431211932Smm		}
1432211932Smm	}
1433211932Smm
1434211932Smm	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
1435211932Smm
1436168404Spjd	ZFS_ENTER(zfsvfs);
1437185029Spjd	ZFS_VERIFY_ZP(zdp);
1438168404Spjd
1439168404Spjd	*vpp = NULL;
1440168404Spjd
1441185029Spjd	if (flags & LOOKUP_XATTR) {
1442168404Spjd#ifdef TODO
1443168404Spjd		/*
1444168404Spjd		 * If the xattr property is off, refuse the lookup request.
1445168404Spjd		 */
1446168404Spjd		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
1447168404Spjd			ZFS_EXIT(zfsvfs);
1448249195Smm			return (SET_ERROR(EINVAL));
1449168404Spjd		}
1450185029Spjd#endif
1451168404Spjd
1452168404Spjd		/*
1453168404Spjd		 * We don't allow recursive attributes..
1454168404Spjd		 * Maybe someday we will.
1455168404Spjd		 */
1456219089Spjd		if (zdp->z_pflags & ZFS_XATTR) {
1457168404Spjd			ZFS_EXIT(zfsvfs);
1458249195Smm			return (SET_ERROR(EINVAL));
1459168404Spjd		}
1460168404Spjd
1461168404Spjd		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
1462168404Spjd			ZFS_EXIT(zfsvfs);
1463168404Spjd			return (error);
1464168404Spjd		}
1465168404Spjd
1466168404Spjd		/*
1467168404Spjd		 * Do we have permission to get into attribute directory?
1468168404Spjd		 */
1469168404Spjd
1470185029Spjd		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
1471185029Spjd		    B_FALSE, cr)) {
1472168404Spjd			VN_RELE(*vpp);
1473185029Spjd			*vpp = NULL;
1474168404Spjd		}
1475168404Spjd
1476168404Spjd		ZFS_EXIT(zfsvfs);
1477168404Spjd		return (error);
1478168404Spjd	}
1479168404Spjd
1480168404Spjd	if (dvp->v_type != VDIR) {
1481168404Spjd		ZFS_EXIT(zfsvfs);
1482249195Smm		return (SET_ERROR(ENOTDIR));
1483168404Spjd	}
1484168404Spjd
1485168404Spjd	/*
1486168404Spjd	 * Check accessibility of directory.
1487168404Spjd	 */
1488168404Spjd
1489185029Spjd	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
1490168404Spjd		ZFS_EXIT(zfsvfs);
1491168404Spjd		return (error);
1492168404Spjd	}
1493168404Spjd
1494185029Spjd	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
1495185029Spjd	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1496185029Spjd		ZFS_EXIT(zfsvfs);
1497249195Smm		return (SET_ERROR(EILSEQ));
1498185029Spjd	}
1499168404Spjd
1500185029Spjd	error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
1501211932Smm	if (error == 0)
1502211932Smm		error = specvp_check(vpp, cr);
1503168962Spjd
1504168404Spjd	/* Translate errors and add SAVENAME when needed. */
1505168404Spjd	if (cnp->cn_flags & ISLASTCN) {
1506168404Spjd		switch (nameiop) {
1507168404Spjd		case CREATE:
1508168404Spjd		case RENAME:
1509168404Spjd			if (error == ENOENT) {
1510168404Spjd				error = EJUSTRETURN;
1511168404Spjd				cnp->cn_flags |= SAVENAME;
1512168404Spjd				break;
1513168404Spjd			}
1514168404Spjd			/* FALLTHROUGH */
1515168404Spjd		case DELETE:
1516168404Spjd			if (error == 0)
1517168404Spjd				cnp->cn_flags |= SAVENAME;
1518168404Spjd			break;
1519168404Spjd		}
1520168404Spjd	}
1521168404Spjd	if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) {
1522169198Spjd		int ltype = 0;
1523169198Spjd
1524169198Spjd		if (cnp->cn_flags & ISDOTDOT) {
1525176559Sattilio			ltype = VOP_ISLOCKED(dvp);
1526175294Sattilio			VOP_UNLOCK(dvp, 0);
1527169198Spjd		}
1528206667Spjd		ZFS_EXIT(zfsvfs);
1529254711Savg		error = vn_lock(*vpp, cnp->cn_lkflags);
1530168962Spjd		if (cnp->cn_flags & ISDOTDOT)
1531175202Sattilio			vn_lock(dvp, ltype | LK_RETRY);
1532169172Spjd		if (error != 0) {
1533169172Spjd			VN_RELE(*vpp);
1534169172Spjd			*vpp = NULL;
1535169172Spjd			return (error);
1536169172Spjd		}
1537206667Spjd	} else {
1538206667Spjd		ZFS_EXIT(zfsvfs);
1539168404Spjd	}
1540168404Spjd
1541168404Spjd#ifdef FREEBSD_NAMECACHE
1542168404Spjd	/*
1543168404Spjd	 * Insert name into cache (as non-existent) if appropriate.
1544168404Spjd	 */
1545168404Spjd	if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
1546168404Spjd		cache_enter(dvp, *vpp, cnp);
1547169170Spjd	/*
1548169170Spjd	 * Insert name into cache if appropriate.
1549169170Spjd	 */
1550168404Spjd	if (error == 0 && (cnp->cn_flags & MAKEENTRY)) {
1551168404Spjd		if (!(cnp->cn_flags & ISLASTCN) ||
1552168404Spjd		    (nameiop != DELETE && nameiop != RENAME)) {
1553168404Spjd			cache_enter(dvp, *vpp, cnp);
1554168404Spjd		}
1555168404Spjd	}
1556168404Spjd#endif
1557168404Spjd
1558168404Spjd	return (error);
1559168404Spjd}
1560168404Spjd
1561168404Spjd/*
1562168404Spjd * Attempt to create a new entry in a directory.  If the entry
1563168404Spjd * already exists, truncate the file if permissible, else return
1564168404Spjd * an error.  Return the vp of the created or trunc'd file.
1565168404Spjd *
1566168404Spjd *	IN:	dvp	- vnode of directory to put new file entry in.
1567168404Spjd *		name	- name of new file entry.
1568168404Spjd *		vap	- attributes of new file.
1569168404Spjd *		excl	- flag indicating exclusive or non-exclusive mode.
1570168404Spjd *		mode	- mode to open file with.
1571168404Spjd *		cr	- credentials of caller.
1572168404Spjd *		flag	- large file flag [UNUSED].
1573185029Spjd *		ct	- caller context
1574185029Spjd *		vsecp 	- ACL to be set
1575168404Spjd *
1576168404Spjd *	OUT:	vpp	- vnode of created or trunc'd entry.
1577168404Spjd *
1578251631Sdelphij *	RETURN:	0 on success, error code on failure.
1579168404Spjd *
1580168404Spjd * Timestamps:
1581168404Spjd *	dvp - ctime|mtime updated if new entry created
1582168404Spjd *	 vp - ctime|mtime always, atime if new
1583168404Spjd */
1584185029Spjd
1585168404Spjd/* ARGSUSED */
1586168404Spjdstatic int
1587168962Spjdzfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
1588185029Spjd    vnode_t **vpp, cred_t *cr, kthread_t *td)
1589168404Spjd{
1590168404Spjd	znode_t		*zp, *dzp = VTOZ(dvp);
1591168404Spjd	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1592185029Spjd	zilog_t		*zilog;
1593185029Spjd	objset_t	*os;
1594168404Spjd	zfs_dirlock_t	*dl;
1595168404Spjd	dmu_tx_t	*tx;
1596168404Spjd	int		error;
1597209962Smm	ksid_t		*ksid;
1598209962Smm	uid_t		uid;
1599209962Smm	gid_t		gid = crgetgid(cr);
1600219089Spjd	zfs_acl_ids_t   acl_ids;
1601209962Smm	boolean_t	fuid_dirtied;
1602219089Spjd	boolean_t	have_acl = B_FALSE;
1603185029Spjd	void		*vsecp = NULL;
1604185029Spjd	int		flag = 0;
1605168404Spjd
1606185029Spjd	/*
1607185029Spjd	 * If we have an ephemeral id, ACL, or XVATTR then
1608185029Spjd	 * make sure file system is at proper version
1609185029Spjd	 */
1610185029Spjd
1611209962Smm	ksid = crgetsid(cr, KSID_OWNER);
1612209962Smm	if (ksid)
1613209962Smm		uid = ksid_getid(ksid);
1614209962Smm	else
1615209962Smm		uid = crgetuid(cr);
1616219089Spjd
1617185029Spjd	if (zfsvfs->z_use_fuids == B_FALSE &&
1618185029Spjd	    (vsecp || (vap->va_mask & AT_XVATTR) ||
1619219089Spjd	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1620249195Smm		return (SET_ERROR(EINVAL));
1621185029Spjd
1622168404Spjd	ZFS_ENTER(zfsvfs);
1623185029Spjd	ZFS_VERIFY_ZP(dzp);
1624185029Spjd	os = zfsvfs->z_os;
1625185029Spjd	zilog = zfsvfs->z_log;
1626168404Spjd
1627185029Spjd	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
1628185029Spjd	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1629185029Spjd		ZFS_EXIT(zfsvfs);
1630249195Smm		return (SET_ERROR(EILSEQ));
1631185029Spjd	}
1632185029Spjd
1633185029Spjd	if (vap->va_mask & AT_XVATTR) {
1634197861Spjd		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
1635185029Spjd		    crgetuid(cr), cr, vap->va_type)) != 0) {
1636185029Spjd			ZFS_EXIT(zfsvfs);
1637185029Spjd			return (error);
1638185029Spjd		}
1639185029Spjd	}
1640168404Spjdtop:
1641168404Spjd	*vpp = NULL;
1642168404Spjd
1643182905Strasz	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
1644182905Strasz		vap->va_mode &= ~S_ISVTX;
1645168404Spjd
1646168404Spjd	if (*name == '\0') {
1647168404Spjd		/*
1648168404Spjd		 * Null component name refers to the directory itself.
1649168404Spjd		 */
1650168404Spjd		VN_HOLD(dvp);
1651168404Spjd		zp = dzp;
1652168404Spjd		dl = NULL;
1653168404Spjd		error = 0;
1654168404Spjd	} else {
1655168404Spjd		/* possible VN_HOLD(zp) */
1656185029Spjd		int zflg = 0;
1657185029Spjd
1658185029Spjd		if (flag & FIGNORECASE)
1659185029Spjd			zflg |= ZCILOOK;
1660185029Spjd
1661185029Spjd		error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1662185029Spjd		    NULL, NULL);
1663185029Spjd		if (error) {
1664219089Spjd			if (have_acl)
1665219089Spjd				zfs_acl_ids_free(&acl_ids);
1666168404Spjd			if (strcmp(name, "..") == 0)
1667249195Smm				error = SET_ERROR(EISDIR);
1668168404Spjd			ZFS_EXIT(zfsvfs);
1669168404Spjd			return (error);
1670168404Spjd		}
1671168404Spjd	}
1672219089Spjd
1673185029Spjd	if (zp == NULL) {
1674185029Spjd		uint64_t txtype;
1675168404Spjd
1676168404Spjd		/*
1677168404Spjd		 * Create a new file object and update the directory
1678168404Spjd		 * to reference it.
1679168404Spjd		 */
1680185029Spjd		if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
1681219089Spjd			if (have_acl)
1682219089Spjd				zfs_acl_ids_free(&acl_ids);
1683168404Spjd			goto out;
1684168404Spjd		}
1685168404Spjd
1686168404Spjd		/*
1687168404Spjd		 * We only support the creation of regular files in
1688168404Spjd		 * extended attribute directories.
1689168404Spjd		 */
1690219089Spjd
1691219089Spjd		if ((dzp->z_pflags & ZFS_XATTR) &&
1692168404Spjd		    (vap->va_type != VREG)) {
1693219089Spjd			if (have_acl)
1694219089Spjd				zfs_acl_ids_free(&acl_ids);
1695249195Smm			error = SET_ERROR(EINVAL);
1696168404Spjd			goto out;
1697168404Spjd		}
1698168404Spjd
1699219089Spjd		if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
1700219089Spjd		    cr, vsecp, &acl_ids)) != 0)
1701219089Spjd			goto out;
1702219089Spjd		have_acl = B_TRUE;
1703209962Smm
1704209962Smm		if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1705211932Smm			zfs_acl_ids_free(&acl_ids);
1706249195Smm			error = SET_ERROR(EDQUOT);
1707209962Smm			goto out;
1708209962Smm		}
1709209962Smm
1710168404Spjd		tx = dmu_tx_create(os);
1711219089Spjd
1712219089Spjd		dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1713219089Spjd		    ZFS_SA_BASE_ATTR_SIZE);
1714219089Spjd
1715209962Smm		fuid_dirtied = zfsvfs->z_fuid_dirty;
1716209962Smm		if (fuid_dirtied)
1717209962Smm			zfs_fuid_txhold(zfsvfs, tx);
1718168404Spjd		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1719219089Spjd		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
1720219089Spjd		if (!zfsvfs->z_use_sa &&
1721219089Spjd		    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1722168404Spjd			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1723219089Spjd			    0, acl_ids.z_aclp->z_acl_bytes);
1724185029Spjd		}
1725209962Smm		error = dmu_tx_assign(tx, TXG_NOWAIT);
1726168404Spjd		if (error) {
1727168404Spjd			zfs_dirent_unlock(dl);
1728209962Smm			if (error == ERESTART) {
1729168404Spjd				dmu_tx_wait(tx);
1730168404Spjd				dmu_tx_abort(tx);
1731168404Spjd				goto top;
1732168404Spjd			}
1733219089Spjd			zfs_acl_ids_free(&acl_ids);
1734168404Spjd			dmu_tx_abort(tx);
1735168404Spjd			ZFS_EXIT(zfsvfs);
1736168404Spjd			return (error);
1737168404Spjd		}
1738219089Spjd		zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1739209962Smm
1740209962Smm		if (fuid_dirtied)
1741209962Smm			zfs_fuid_sync(zfsvfs, tx);
1742209962Smm
1743168404Spjd		(void) zfs_link_create(dl, zp, tx, ZNEW);
1744185029Spjd		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1745185029Spjd		if (flag & FIGNORECASE)
1746185029Spjd			txtype |= TX_CI;
1747185029Spjd		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1748209962Smm		    vsecp, acl_ids.z_fuidp, vap);
1749209962Smm		zfs_acl_ids_free(&acl_ids);
1750168404Spjd		dmu_tx_commit(tx);
1751168404Spjd	} else {
1752185029Spjd		int aflags = (flag & FAPPEND) ? V_APPEND : 0;
1753185029Spjd
1754219089Spjd		if (have_acl)
1755219089Spjd			zfs_acl_ids_free(&acl_ids);
1756219089Spjd		have_acl = B_FALSE;
1757219089Spjd
1758168404Spjd		/*
1759168404Spjd		 * A directory entry already exists for this name.
1760168404Spjd		 */
1761168404Spjd		/*
1762168962Spjd		 * Can't truncate an existing file if in exclusive mode.
1763168962Spjd		 */
1764168962Spjd		if (excl == EXCL) {
1765249195Smm			error = SET_ERROR(EEXIST);
1766168962Spjd			goto out;
1767168962Spjd		}
1768168962Spjd		/*
1769168404Spjd		 * Can't open a directory for writing.
1770168404Spjd		 */
1771168404Spjd		if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
1772249195Smm			error = SET_ERROR(EISDIR);
1773168404Spjd			goto out;
1774168404Spjd		}
1775168404Spjd		/*
1776168404Spjd		 * Verify requested access to file.
1777168404Spjd		 */
1778185029Spjd		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
1779168404Spjd			goto out;
1780168404Spjd		}
1781168404Spjd
1782168404Spjd		mutex_enter(&dzp->z_lock);
1783168404Spjd		dzp->z_seq++;
1784168404Spjd		mutex_exit(&dzp->z_lock);
1785168404Spjd
1786168404Spjd		/*
1787168404Spjd		 * Truncate regular files if requested.
1788168404Spjd		 */
1789168404Spjd		if ((ZTOV(zp)->v_type == VREG) &&
1790168404Spjd		    (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
1791185029Spjd			/* we can't hold any locks when calling zfs_freesp() */
1792185029Spjd			zfs_dirent_unlock(dl);
1793185029Spjd			dl = NULL;
1794168404Spjd			error = zfs_freesp(zp, 0, 0, mode, TRUE);
1795185029Spjd			if (error == 0) {
1796185029Spjd				vnevent_create(ZTOV(zp), ct);
1797168404Spjd			}
1798168404Spjd		}
1799168404Spjd	}
1800168404Spjdout:
1801168404Spjd	if (dl)
1802168404Spjd		zfs_dirent_unlock(dl);
1803168404Spjd
1804168404Spjd	if (error) {
1805168404Spjd		if (zp)
1806168404Spjd			VN_RELE(ZTOV(zp));
1807168962Spjd	} else {
1808168962Spjd		*vpp = ZTOV(zp);
1809211932Smm		error = specvp_check(vpp, cr);
1810168404Spjd	}
1811168404Spjd
1812219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1813219089Spjd		zil_commit(zilog, 0);
1814219089Spjd
1815168404Spjd	ZFS_EXIT(zfsvfs);
1816168404Spjd	return (error);
1817168404Spjd}
1818168404Spjd
1819168404Spjd/*
1820168404Spjd * Remove an entry from a directory.
1821168404Spjd *
1822168404Spjd *	IN:	dvp	- vnode of directory to remove entry from.
1823168404Spjd *		name	- name of entry to remove.
1824168404Spjd *		cr	- credentials of caller.
1825185029Spjd *		ct	- caller context
1826185029Spjd *		flags	- case flags (FIGNORECASE adds ZCILOOK to the lookup)
1827168404Spjd *
1828251631Sdelphij *	RETURN:	0 on success, error code on failure.
 *		EPERM if the entry is a directory (rmdir must be used).
 *
 * If dmu_tx_assign() fails with ERESTART, every lock and hold taken so
 * far is dropped and the whole operation is retried from "top".
 * When the objset has os_sync == ZFS_SYNC_ALWAYS, zil_commit() is called
 * before returning.
1829168404Spjd *
1830168404Spjd * Timestamps:
1831168404Spjd *	dvp - ctime|mtime
1832168404Spjd *	 vp - ctime (if nlink > 0)
1833168404Spjd */
1834219089Spjd
/*
 * Zero value that zfs_remove() writes into SA_ZPL_XATTR for znodes that
 * are not z_is_sa when their xattr directory is unlinked.
 */
1835219089Spjduint64_t null_xattr = 0;
1836219089Spjd
1837185029Spjd/*ARGSUSED*/
1838168404Spjdstatic int
1839185029Spjdzfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
1840185029Spjd    int flags)
1841168404Spjd{
1842168404Spjd	znode_t		*zp, *dzp = VTOZ(dvp);
1843219089Spjd	znode_t		*xzp;
1844168404Spjd	vnode_t		*vp;
1845168404Spjd	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1846185029Spjd	zilog_t		*zilog;
1847168962Spjd	uint64_t	acl_obj, xattr_obj;
1848219089Spjd	uint64_t 	xattr_obj_unlinked = 0;
1849219089Spjd	uint64_t	obj = 0;
1850168404Spjd	zfs_dirlock_t	*dl;
1851168404Spjd	dmu_tx_t	*tx;
1852168962Spjd	boolean_t	may_delete_now, delete_now = FALSE;
1853185029Spjd	boolean_t	unlinked, toobig = FALSE;
1854185029Spjd	uint64_t	txtype;
1855185029Spjd	pathname_t	*realnmp = NULL;
1856185029Spjd	pathname_t	realnm;
1857168404Spjd	int		error;
1858185029Spjd	int		zflg = ZEXISTS;
1859168404Spjd
1860168404Spjd	ZFS_ENTER(zfsvfs);
1861185029Spjd	ZFS_VERIFY_ZP(dzp);
1862185029Spjd	zilog = zfsvfs->z_log;
1863168404Spjd
	/*
	 * Case-insensitive removal: allocate realnm once (outside the
	 * retry loop) and hand it to zfs_dirent_lock(); it is later used
	 * for dnlc_remove().  Presumably it receives the entry's real
	 * name -- confirm against zfs_dirent_lock().
	 */
1864185029Spjd	if (flags & FIGNORECASE) {
1865185029Spjd		zflg |= ZCILOOK;
1866185029Spjd		pn_alloc(&realnm);
1867185029Spjd		realnmp = &realnm;
1868185029Spjd	}
1869185029Spjd
1870168404Spjdtop:
	/* Per-attempt state; reset here because ERESTART jumps back. */
1871219089Spjd	xattr_obj = 0;
1872219089Spjd	xzp = NULL;
1873168404Spjd	/*
1874168404Spjd	 * Attempt to lock directory; fail if entry doesn't exist.
1875168404Spjd	 */
1876185029Spjd	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1877185029Spjd	    NULL, realnmp)) {
1878185029Spjd		if (realnmp)
1879185029Spjd			pn_free(realnmp);
1880168404Spjd		ZFS_EXIT(zfsvfs);
1881168404Spjd		return (error);
1882168404Spjd	}
1883168404Spjd
1884168404Spjd	vp = ZTOV(zp);
1885168404Spjd
1886168962Spjd	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1887168404Spjd		goto out;
1888168962Spjd	}
1889168404Spjd
1890168962Spjd	/*
1891168962Spjd	 * Need to use rmdir for removing directories.
1892168962Spjd	 */
1893168962Spjd	if (vp->v_type == VDIR) {
1894249195Smm		error = SET_ERROR(EPERM);
1895168962Spjd		goto out;
1896168962Spjd	}
1897168962Spjd
1898185029Spjd	vnevent_remove(vp, dvp, name, ct);
1899168962Spjd
	/* Drop the name from the DNLC, using realnm when one was set up. */
1900185029Spjd	if (realnmp)
1901185029Spjd		dnlc_remove(dvp, realnmp->pn_buf);
1902185029Spjd	else
1903185029Spjd		dnlc_remove(dvp, name);
1904168404Spjd
	/*
	 * Under VI_LOCK, note whether ours is the only reference
	 * (v_count == 1) and the vnode has no cached data.  This is only
	 * a first guess; the same conditions are re-checked below once
	 * the link has been destroyed.
	 */
1905219089Spjd	VI_LOCK(vp);
1906219089Spjd	may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp);
1907219089Spjd	VI_UNLOCK(vp);
1908168962Spjd
1909168404Spjd	/*
1910168404Spjd	 * We may delete the znode now, or we may put it in the unlinked set;
1911168404Spjd	 * it depends on whether we're the last link, and on whether there are
1912168404Spjd	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
1913168404Spjd	 * allow for either case.
1914168404Spjd	 */
	/* Saved now because obj is logged after zp may have been deleted. */
1915219089Spjd	obj = zp->z_id;
1916168404Spjd	tx = dmu_tx_create(zfsvfs->z_os);
1917168404Spjd	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1918219089Spjd	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1919219089Spjd	zfs_sa_upgrade_txholds(tx, zp);
1920219089Spjd	zfs_sa_upgrade_txholds(tx, dzp);
1921185029Spjd	if (may_delete_now) {
1922185029Spjd		toobig =
1923219089Spjd		    zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
1924185029Spjd		/* if the file is too big, only hold_free a token amount */
1925185029Spjd		dmu_tx_hold_free(tx, zp->z_id, 0,
1926185029Spjd		    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
1927185029Spjd	}
1928168404Spjd
1929168404Spjd	/* are there any extended attributes? */
1930219089Spjd	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1931219089Spjd	    &xattr_obj, sizeof (xattr_obj));
1932219089Spjd	if (error == 0 && xattr_obj) {
1933219089Spjd		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
		/*
		 * NOTE(review): a zfs_zget() failure is only asserted on.
		 * If ASSERT0 compiles away in this build, xzp may still
		 * be NULL when it is dereferenced two lines below --
		 * confirm.
		 */
1934240415Smm		ASSERT0(error);
1935219089Spjd		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1936219089Spjd		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1937168404Spjd	}
1938168404Spjd
	/*
	 * Record the external ACL object (if any) under z_lock.  It is
	 * held for freeing only when an immediate delete is possible, and
	 * acl_obj is compared again after the link is destroyed.
	 */
1939219089Spjd	mutex_enter(&zp->z_lock);
1940219089Spjd	if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
1941168962Spjd		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
1942219089Spjd	mutex_exit(&zp->z_lock);
1943168962Spjd
1944168404Spjd	/* charge as an update -- would be nice not to charge at all */
1945168404Spjd	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1946168404Spjd
1947209962Smm	error = dmu_tx_assign(tx, TXG_NOWAIT);
1948168404Spjd	if (error) {
		/*
		 * Could not assign: drop the dirent lock and both vnode
		 * holds.  On ERESTART wait for the tx and retry from top
		 * (realnm stays allocated); otherwise clean up and fail.
		 */
1949168404Spjd		zfs_dirent_unlock(dl);
1950168962Spjd		VN_RELE(vp);
1951219089Spjd		if (xzp)
1952219089Spjd			VN_RELE(ZTOV(xzp));
1953209962Smm		if (error == ERESTART) {
1954168404Spjd			dmu_tx_wait(tx);
1955168404Spjd			dmu_tx_abort(tx);
1956168404Spjd			goto top;
1957168404Spjd		}
1958185029Spjd		if (realnmp)
1959185029Spjd			pn_free(realnmp);
1960168404Spjd		dmu_tx_abort(tx);
1961168404Spjd		ZFS_EXIT(zfsvfs);
1962168404Spjd		return (error);
1963168404Spjd	}
1964168404Spjd
1965168404Spjd	/*
1966168404Spjd	 * Remove the directory entry.
1967168404Spjd	 */
1968185029Spjd	error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
1969168404Spjd
1970168404Spjd	if (error) {
		/* The assigned tx still has to be committed before bailing. */
1971168404Spjd		dmu_tx_commit(tx);
1972168404Spjd		goto out;
1973168404Spjd	}
1974168404Spjd
1975219089Spjd	if (unlinked) {
1976219089Spjd
1977219089Spjd		/*
1978219089Spjd		 * Hold z_lock so that we can make sure that the ACL obj
1979219089Spjd		 * hasn't changed.  Could have been deleted due to
1980219089Spjd		 * zfs_sa_upgrade().
1981219089Spjd		 */
		/*
		 * z_lock is NOT released in this block: it is dropped in
		 * whichever of the two branches below runs (delete_now or
		 * unlinked).
		 */
1982219089Spjd		mutex_enter(&zp->z_lock);
1983168962Spjd		VI_LOCK(vp);
1984219089Spjd		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1985219089Spjd		    &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
		/*
		 * Delete immediately only if nothing changed since the
		 * tx holds were set up: still the sole reference, no
		 * cached data, not too big, and the same xattr and ACL
		 * objects.
		 */
1986185029Spjd		delete_now = may_delete_now && !toobig &&
1987168962Spjd		    vp->v_count == 1 && !vn_has_cached_data(vp) &&
1988219089Spjd		    xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) ==
1989219089Spjd		    acl_obj;
1990168962Spjd		VI_UNLOCK(vp);
1991168962Spjd	}
1992168962Spjd
1993168962Spjd	if (delete_now) {
		/* On __FreeBSD__ builds this branch deliberately panics. */
1994243270Savg#ifdef __FreeBSD__
1995243270Savg		panic("zfs_remove: delete_now branch taken");
1996243270Savg#endif
1997219089Spjd		if (xattr_obj_unlinked) {
			/*
			 * Unlink the xattr directory too: zero its link
			 * count, add it to the unlinked set, and clear
			 * the file's SA_ZPL_XATTR reference.
			 */
1998219089Spjd			ASSERT3U(xzp->z_links, ==, 2);
1999168962Spjd			mutex_enter(&xzp->z_lock);
2000168962Spjd			xzp->z_unlinked = 1;
2001219089Spjd			xzp->z_links = 0;
2002219089Spjd			error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
2003219089Spjd			    &xzp->z_links, sizeof (xzp->z_links), tx);
2004219089Spjd			ASSERT3U(error,  ==,  0);
2005168962Spjd			mutex_exit(&xzp->z_lock);
2006168962Spjd			zfs_unlinked_add(xzp, tx);
2007219089Spjd
2008219089Spjd			if (zp->z_is_sa)
2009219089Spjd				error = sa_remove(zp->z_sa_hdl,
2010219089Spjd				    SA_ZPL_XATTR(zfsvfs), tx);
2011219089Spjd			else
2012219089Spjd				error = sa_update(zp->z_sa_hdl,
2013219089Spjd				    SA_ZPL_XATTR(zfsvfs), &null_xattr,
2014219089Spjd				    sizeof (uint64_t), tx);
2015240415Smm			ASSERT0(error);
2016168962Spjd		}
		/*
		 * Drop our vnode reference by hand; this is why
		 * VN_RELE(vp) is skipped at out: when delete_now is set.
		 */
2017168962Spjd		VI_LOCK(vp);
2018168962Spjd		vp->v_count--;
2019240415Smm		ASSERT0(vp->v_count);
2020168962Spjd		VI_UNLOCK(vp);
2021168962Spjd		mutex_exit(&zp->z_lock);
2022168962Spjd		zfs_znode_delete(zp, tx);
2023168962Spjd	} else if (unlinked) {
		/* Deferred delete: queue the znode in the unlinked set. */
2024219089Spjd		mutex_exit(&zp->z_lock);
2025168404Spjd		zfs_unlinked_add(zp, tx);
2026243268Savg#ifdef __FreeBSD__
2027243268Savg		vp->v_vflag |= VV_NOSYNC;
2028243268Savg#endif
2029168962Spjd	}
2030168404Spjd
2031185029Spjd	txtype = TX_REMOVE;
2032185029Spjd	if (flags & FIGNORECASE)
2033185029Spjd		txtype |= TX_CI;
2034219089Spjd	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
2035168404Spjd
2036168404Spjd	dmu_tx_commit(tx);
2037168404Spjdout:
	/* Common exit: dl is held and xzp is either NULL or held. */
2038185029Spjd	if (realnmp)
2039185029Spjd		pn_free(realnmp);
2040185029Spjd
2041168404Spjd	zfs_dirent_unlock(dl);
2042168404Spjd
2043219089Spjd	if (!delete_now)
2044168962Spjd		VN_RELE(vp);
2045219089Spjd	if (xzp)
2046168962Spjd		VN_RELE(ZTOV(xzp));
2047168962Spjd
2048219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2049219089Spjd		zil_commit(zilog, 0);
2050219089Spjd
2051168404Spjd	ZFS_EXIT(zfsvfs);
2052168404Spjd	return (error);
2053168404Spjd}
2054168404Spjd
2055168404Spjd/*
2056168404Spjd * Create a new directory and insert it into dvp using the name
2057168404Spjd * provided.  Return a pointer to the inserted directory.
2058168404Spjd *
2059168404Spjd *	IN:	dvp	- vnode of directory to add subdir to.
2060168404Spjd *		dirname	- name of new directory.
2061168404Spjd *		vap	- attributes of new directory.
2062168404Spjd *		cr	- credentials of caller.
2063185029Spjd *		ct	- caller context
2064251631Sdelphij *		flags	- case flags
2065185029Spjd *		vsecp	- ACL to be set
2066168404Spjd *
2067168404Spjd *	OUT:	vpp	- vnode of created directory.
2068168404Spjd *
2069251631Sdelphij *	RETURN:	0 on success, error code on failure.
 *		EINVAL if the file system does not use FUIDs but an ACL,
 *		       XVATTR or ephemeral uid/gid is supplied, or if dvp
 *		       is an extended attribute directory.
 *		EILSEQ if z_utf8 is set and dirname fails u8_validate().
 *		EDQUOT if zfs_acl_ids_overquota() reports over quota.
2070168404Spjd *
2071168404Spjd * Timestamps:
2072168404Spjd *	dvp - ctime|mtime updated
2073168404Spjd *	 vp - ctime|mtime|atime updated
2074168404Spjd */
2075185029Spjd/*ARGSUSED*/
2076168404Spjdstatic int
2077185029Spjdzfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
2078185029Spjd    caller_context_t *ct, int flags, vsecattr_t *vsecp)
2079168404Spjd{
2080168404Spjd	znode_t		*zp, *dzp = VTOZ(dvp);
2081168404Spjd	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2082185029Spjd	zilog_t		*zilog;
2083168404Spjd	zfs_dirlock_t	*dl;
2084185029Spjd	uint64_t	txtype;
2085168404Spjd	dmu_tx_t	*tx;
2086168404Spjd	int		error;
2087185029Spjd	int		zf = ZNEW;
2088209962Smm	ksid_t		*ksid;
2089209962Smm	uid_t		uid;
2090209962Smm	gid_t		gid = crgetgid(cr);
2091219089Spjd	zfs_acl_ids_t   acl_ids;
2092209962Smm	boolean_t	fuid_dirtied;
2093168404Spjd
2094168404Spjd	ASSERT(vap->va_type == VDIR);
2095168404Spjd
2096185029Spjd	/*
2097185029Spjd	 * If we have an ephemeral id, ACL, or XVATTR then
2098185029Spjd	 * make sure file system is at proper version
2099185029Spjd	 */
2100185029Spjd
	/* Use the owner SID's id when the cred has one, else the plain uid. */
2101209962Smm	ksid = crgetsid(cr, KSID_OWNER);
2102209962Smm	if (ksid)
2103209962Smm		uid = ksid_getid(ksid);
2104209962Smm	else
2105209962Smm		uid = crgetuid(cr);
	/* This check runs before ZFS_ENTER, so it returns without ZFS_EXIT. */
2106185029Spjd	if (zfsvfs->z_use_fuids == B_FALSE &&
2107219089Spjd	    (vsecp || (vap->va_mask & AT_XVATTR) ||
2108219089Spjd	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
2109249195Smm		return (SET_ERROR(EINVAL));
2110185029Spjd
2111168404Spjd	ZFS_ENTER(zfsvfs);
2112185029Spjd	ZFS_VERIFY_ZP(dzp);
2113185029Spjd	zilog = zfsvfs->z_log;
2114168404Spjd
	/* Directories cannot be created inside an xattr directory. */
2115219089Spjd	if (dzp->z_pflags & ZFS_XATTR) {
2116168404Spjd		ZFS_EXIT(zfsvfs);
2117249195Smm		return (SET_ERROR(EINVAL));
2118168404Spjd	}
2119168404Spjd
2120185029Spjd	if (zfsvfs->z_utf8 && u8_validate(dirname,
2121185029Spjd	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2122185029Spjd		ZFS_EXIT(zfsvfs);
2123249195Smm		return (SET_ERROR(EILSEQ));
2124185029Spjd	}
2125185029Spjd	if (flags & FIGNORECASE)
2126185029Spjd		zf |= ZCILOOK;
2127185029Spjd
2128219089Spjd	if (vap->va_mask & AT_XVATTR) {
2129197861Spjd		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
2130185029Spjd		    crgetuid(cr), cr, vap->va_type)) != 0) {
2131185029Spjd			ZFS_EXIT(zfsvfs);
2132185029Spjd			return (error);
2133185029Spjd		}
2134219089Spjd	}
2135185029Spjd
	/*
	 * Build acl_ids once, before the retry label, so it survives an
	 * ERESTART retry.  Every other exit path below frees it with
	 * zfs_acl_ids_free().
	 */
2136219089Spjd	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
2137219089Spjd	    vsecp, &acl_ids)) != 0) {
2138219089Spjd		ZFS_EXIT(zfsvfs);
2139219089Spjd		return (error);
2140219089Spjd	}
2141168404Spjd	/*
2142168404Spjd	 * First make sure the new directory doesn't exist.
2143219089Spjd	 *
2144219089Spjd	 * Existence is checked first to make sure we don't return
2145219089Spjd	 * EACCES instead of EEXIST which can cause some applications
2146219089Spjd	 * to fail.
2147168404Spjd	 */
2148185029Spjdtop:
2149185029Spjd	*vpp = NULL;
2150185029Spjd
2151185029Spjd	if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
2152185029Spjd	    NULL, NULL)) {
2153219089Spjd		zfs_acl_ids_free(&acl_ids);
2154168404Spjd		ZFS_EXIT(zfsvfs);
2155168404Spjd		return (error);
2156168404Spjd	}
2157168404Spjd
2158185029Spjd	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
2159219089Spjd		zfs_acl_ids_free(&acl_ids);
2160168404Spjd		zfs_dirent_unlock(dl);
2161168404Spjd		ZFS_EXIT(zfsvfs);
2162168404Spjd		return (error);
2163168404Spjd	}
2164168404Spjd
2165209962Smm	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
2166211932Smm		zfs_acl_ids_free(&acl_ids);
2167209962Smm		zfs_dirent_unlock(dl);
2168209962Smm		ZFS_EXIT(zfsvfs);
2169249195Smm		return (SET_ERROR(EDQUOT));
2170209962Smm	}
2171209962Smm
2172168404Spjd	/*
2173168404Spjd	 * Add a new entry to the directory.
2174168404Spjd	 */
2175168404Spjd	tx = dmu_tx_create(zfsvfs->z_os);
2176168404Spjd	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
2177168404Spjd	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
	/* Sample z_fuid_dirty once so the txhold and the sync below agree. */
2178209962Smm	fuid_dirtied = zfsvfs->z_fuid_dirty;
2179209962Smm	if (fuid_dirtied)
2180209962Smm		zfs_fuid_txhold(zfsvfs, tx);
	/*
	 * Without SA, an ACL larger than ZFS_ACE_SPACE gets an extra
	 * write hold on a new object.
	 */
2181219089Spjd	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2182219089Spjd		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
2183219089Spjd		    acl_ids.z_aclp->z_acl_bytes);
2184219089Spjd	}
2185219089Spjd
2186219089Spjd	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
2187219089Spjd	    ZFS_SA_BASE_ATTR_SIZE);
2188219089Spjd
2189209962Smm	error = dmu_tx_assign(tx, TXG_NOWAIT);
2190168404Spjd	if (error) {
		/*
		 * Could not assign: drop the dirent lock.  On ERESTART
		 * wait and retry from top with acl_ids intact; otherwise
		 * free acl_ids and fail.
		 */
2191168404Spjd		zfs_dirent_unlock(dl);
2192209962Smm		if (error == ERESTART) {
2193168404Spjd			dmu_tx_wait(tx);
2194168404Spjd			dmu_tx_abort(tx);
2195168404Spjd			goto top;
2196168404Spjd		}
2197219089Spjd		zfs_acl_ids_free(&acl_ids);
2198168404Spjd		dmu_tx_abort(tx);
2199168404Spjd		ZFS_EXIT(zfsvfs);
2200168404Spjd		return (error);
2201168404Spjd	}
2202168404Spjd
2203168404Spjd	/*
2204168404Spjd	 * Create new node.
2205168404Spjd	 */
2206219089Spjd	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
2207168404Spjd
2208209962Smm	if (fuid_dirtied)
2209209962Smm		zfs_fuid_sync(zfsvfs, tx);
2210219089Spjd
2211168404Spjd	/*
2212168404Spjd	 * Now put new name in parent dir.
2213168404Spjd	 */
	/*
	 * NOTE(review): the zfs_link_create() result is discarded and the
	 * function returns 0 unconditionally from here on -- confirm it
	 * cannot fail at this point.
	 */
2214168404Spjd	(void) zfs_link_create(dl, zp, tx, ZNEW);
2215168404Spjd
2216168404Spjd	*vpp = ZTOV(zp);
2217168404Spjd
2218185029Spjd	txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
2219185029Spjd	if (flags & FIGNORECASE)
2220185029Spjd		txtype |= TX_CI;
2221209962Smm	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
2222209962Smm	    acl_ids.z_fuidp, vap);
2223185029Spjd
	/* acl_ids is freed only after zfs_log_create() has used z_fuidp. */
2224209962Smm	zfs_acl_ids_free(&acl_ids);
2225219089Spjd
2226168404Spjd	dmu_tx_commit(tx);
2227168404Spjd
2228168404Spjd	zfs_dirent_unlock(dl);
2229168404Spjd
2230219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2231219089Spjd		zil_commit(zilog, 0);
2232219089Spjd
2233168404Spjd	ZFS_EXIT(zfsvfs);
2234168404Spjd	return (0);
2235168404Spjd}
2236168404Spjd
2237168404Spjd/*
2238168404Spjd * Remove a directory subdir entry.  If the current working
2239168404Spjd * directory is the same as the subdir to be removed, the
2240168404Spjd * remove will fail.
2241168404Spjd *
2242168404Spjd *	IN:	dvp	- vnode of directory to remove from.
2243168404Spjd *		name	- name of directory to be removed.
2244168404Spjd *		cwd	- vnode of current working directory.
2245168404Spjd *		cr	- credentials of caller.
2246185029Spjd *		ct	- caller context
2247185029Spjd *		flags	- case flags
2248168404Spjd *
2249251631Sdelphij *	RETURN:	0 on success, error code on failure.
 *		ENOTDIR if the entry is not a directory.
 *		EINVAL if the entry is the caller's cwd.
 *
 * If dmu_tx_assign() fails with ERESTART, all locks and the vnode hold
 * are dropped and the operation is retried from "top".
2250168404Spjd *
2251168404Spjd * Timestamps:
2252168404Spjd *	dvp - ctime|mtime updated
2253168404Spjd */
2254185029Spjd/*ARGSUSED*/
2255168404Spjdstatic int
2256185029Spjdzfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
2257185029Spjd    caller_context_t *ct, int flags)
2258168404Spjd{
2259168404Spjd	znode_t		*dzp = VTOZ(dvp);
2260168404Spjd	znode_t		*zp;
2261168404Spjd	vnode_t		*vp;
2262168404Spjd	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2263185029Spjd	zilog_t		*zilog;
2264168404Spjd	zfs_dirlock_t	*dl;
2265168404Spjd	dmu_tx_t	*tx;
2266168404Spjd	int		error;
2267185029Spjd	int		zflg = ZEXISTS;
2268168404Spjd
2269168962Spjd	ZFS_ENTER(zfsvfs);
2270185029Spjd	ZFS_VERIFY_ZP(dzp);
2271185029Spjd	zilog = zfsvfs->z_log;
2272168404Spjd
2273185029Spjd	if (flags & FIGNORECASE)
2274185029Spjd		zflg |= ZCILOOK;
2275168404Spjdtop:
	/* Reset for each attempt; ERESTART jumps back here. */
2276168404Spjd	zp = NULL;
2277168404Spjd
2278168404Spjd	/*
2279168404Spjd	 * Attempt to lock directory; fail if entry doesn't exist.
2280168404Spjd	 */
2281185029Spjd	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
2282185029Spjd	    NULL, NULL)) {
2283168404Spjd		ZFS_EXIT(zfsvfs);
2284168404Spjd		return (error);
2285168404Spjd	}
2286168404Spjd
2287168404Spjd	vp = ZTOV(zp);
2288168404Spjd
2289168404Spjd	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
2290168404Spjd		goto out;
2291168404Spjd	}
2292168404Spjd
	/* Only directories may be removed through this entry point. */
2293168962Spjd	if (vp->v_type != VDIR) {
2294249195Smm		error = SET_ERROR(ENOTDIR);
2295168962Spjd		goto out;
2296168962Spjd	}
2297168962Spjd
	/* Refuse to remove the caller's current working directory. */
2298168962Spjd	if (vp == cwd) {
2299249195Smm		error = SET_ERROR(EINVAL);
2300168962Spjd		goto out;
2301168962Spjd	}
2302168962Spjd
2303185029Spjd	vnevent_rmdir(vp, dvp, name, ct);
2304168962Spjd
2305168404Spjd	/*
2306168404Spjd	 * Grab a lock on the directory to make sure that no one is
2307168404Spjd	 * trying to add (or lookup) entries while we are removing it.
2308168404Spjd	 */
2309168404Spjd	rw_enter(&zp->z_name_lock, RW_WRITER);
2310168404Spjd
2311168404Spjd	/*
2312168404Spjd	 * Grab a lock on the parent pointer to make sure we play well
2313168404Spjd	 * with the treewalk and directory rename code.
2314168404Spjd	 */
2315168404Spjd	rw_enter(&zp->z_parent_lock, RW_WRITER);
2316168404Spjd
2317168404Spjd	tx = dmu_tx_create(zfsvfs->z_os);
2318168404Spjd	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2319219089Spjd	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2320168404Spjd	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2321219089Spjd	zfs_sa_upgrade_txholds(tx, zp);
2322219089Spjd	zfs_sa_upgrade_txholds(tx, dzp);
2323209962Smm	error = dmu_tx_assign(tx, TXG_NOWAIT);
2324168404Spjd	if (error) {
		/*
		 * Could not assign: release the locks in reverse order of
		 * acquisition and drop the vnode hold.  On ERESTART wait
		 * and retry from top; otherwise fail.
		 */
2325168404Spjd		rw_exit(&zp->z_parent_lock);
2326168404Spjd		rw_exit(&zp->z_name_lock);
2327168404Spjd		zfs_dirent_unlock(dl);
2328168962Spjd		VN_RELE(vp);
2329209962Smm		if (error == ERESTART) {
2330168404Spjd			dmu_tx_wait(tx);
2331168404Spjd			dmu_tx_abort(tx);
2332168404Spjd			goto top;
2333168404Spjd		}
2334168404Spjd		dmu_tx_abort(tx);
2335168404Spjd		ZFS_EXIT(zfsvfs);
2336168404Spjd		return (error);
2337168404Spjd	}
2338168404Spjd
2339168404Spjd#ifdef FREEBSD_NAMECACHE
2340168404Spjd	cache_purge(dvp);
2341168404Spjd#endif
2342168404Spjd
2343185029Spjd	error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
2344168404Spjd
	/*
	 * The removal is logged only when the link was destroyed; the tx
	 * is committed either way.
	 */
2345185029Spjd	if (error == 0) {
2346185029Spjd		uint64_t txtype = TX_RMDIR;
2347185029Spjd		if (flags & FIGNORECASE)
2348185029Spjd			txtype |= TX_CI;
2349219089Spjd		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
2350185029Spjd	}
2351168404Spjd
2352168404Spjd	dmu_tx_commit(tx);
2353168404Spjd
2354168404Spjd	rw_exit(&zp->z_parent_lock);
2355168404Spjd	rw_exit(&zp->z_name_lock);
2356168404Spjd#ifdef FREEBSD_NAMECACHE
2357168404Spjd	cache_purge(vp);
2358168404Spjd#endif
2359168404Spjdout:
	/*
	 * Common exit.  Every path here holds dl but not z_name_lock or
	 * z_parent_lock.  The vp hold released below presumably came from
	 * zfs_dirent_lock() -- confirm.
	 */
2360168404Spjd	zfs_dirent_unlock(dl);
2361168404Spjd
2362168962Spjd	VN_RELE(vp);
2363168962Spjd
2364219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2365219089Spjd		zil_commit(zilog, 0);
2366219089Spjd
2367168404Spjd	ZFS_EXIT(zfsvfs);
2368168404Spjd	return (error);
2369168404Spjd}
2370168404Spjd
2371168404Spjd/*
2372168404Spjd * Read as many directory entries as will fit into the provided
2373168404Spjd * buffer from the given directory cursor position (specified in
2374251631Sdelphij * the uio structure).
2375168404Spjd *
2376168404Spjd *	IN:	vp	- vnode of directory to read.
2377168404Spjd *		uio	- structure supplying read location, range info,
2378168404Spjd *			  and return buffer.
2379168404Spjd *		cr	- credentials of caller.
2380185029Spjd *		ct	- caller context
2381185029Spjd *		flags	- case flags
2382168404Spjd *
2383168404Spjd *	OUT:	uio	- updated offset and range, buffer filled.
2384168404Spjd *		eofp	- set to true if end-of-file detected.
2385168404Spjd *
2386251631Sdelphij *	RETURN:	0 on success, error code on failure.
2387168404Spjd *
2388168404Spjd * Timestamps:
2389168404Spjd *	vp - atime updated
2390168404Spjd *
2391168404Spjd * Note that the low 4 bits of the cookie returned by zap is always zero.
2392168404Spjd * This allows us to use the low range for "special" directory entries:
2393168404Spjd * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
2394168404Spjd * we use the offset 2 for the '.zfs' directory.
2395168404Spjd */
2396168404Spjd/* ARGSUSED */
2397168404Spjdstatic int
2398168962Spjdzfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
2399168404Spjd{
2400168404Spjd	znode_t		*zp = VTOZ(vp);
2401168404Spjd	iovec_t		*iovp;
2402185029Spjd	edirent_t	*eodp;
2403168404Spjd	dirent64_t	*odp;
2404168404Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2405168404Spjd	objset_t	*os;
2406168404Spjd	caddr_t		outbuf;
2407168404Spjd	size_t		bufsize;
2408168404Spjd	zap_cursor_t	zc;
2409168404Spjd	zap_attribute_t	zap;
2410168404Spjd	uint_t		bytes_wanted;
2411168404Spjd	uint64_t	offset; /* must be unsigned; checks for < 1 */
2412219089Spjd	uint64_t	parent;
2413168404Spjd	int		local_eof;
2414168404Spjd	int		outcount;
2415168404Spjd	int		error;
2416168404Spjd	uint8_t		prefetch;
2417185029Spjd	boolean_t	check_sysattrs;
2418168404Spjd	uint8_t		type;
2419168962Spjd	int		ncooks;
2420168962Spjd	u_long		*cooks = NULL;
2421185029Spjd	int		flags = 0;
2422168404Spjd
2423168404Spjd	ZFS_ENTER(zfsvfs);
2424185029Spjd	ZFS_VERIFY_ZP(zp);
2425168404Spjd
2426219089Spjd	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
2427219089Spjd	    &parent, sizeof (parent))) != 0) {
2428219089Spjd		ZFS_EXIT(zfsvfs);
2429219089Spjd		return (error);
2430219089Spjd	}
2431219089Spjd
2432168404Spjd	/*
2433168404Spjd	 * If we are not given an eof variable,
2434168404Spjd	 * use a local one.
2435168404Spjd	 */
2436168404Spjd	if (eofp == NULL)
2437168404Spjd		eofp = &local_eof;
2438168404Spjd
2439168404Spjd	/*
2440168404Spjd	 * Check for valid iov_len.
2441168404Spjd	 */
2442168404Spjd	if (uio->uio_iov->iov_len <= 0) {
2443168404Spjd		ZFS_EXIT(zfsvfs);
2444249195Smm		return (SET_ERROR(EINVAL));
2445168404Spjd	}
2446168404Spjd
2447168404Spjd	/*
2448168404Spjd	 * Quit if directory has been removed (posix)
2449168404Spjd	 */
2450168404Spjd	if ((*eofp = zp->z_unlinked) != 0) {
2451168404Spjd		ZFS_EXIT(zfsvfs);
2452168404Spjd		return (0);
2453168404Spjd	}
2454168404Spjd
2455168404Spjd	error = 0;
2456168404Spjd	os = zfsvfs->z_os;
2457168404Spjd	offset = uio->uio_loffset;
2458168404Spjd	prefetch = zp->z_zn_prefetch;
2459168404Spjd
2460168404Spjd	/*
2461168404Spjd	 * Initialize the iterator cursor.
2462168404Spjd	 */
2463168404Spjd	if (offset <= 3) {
2464168404Spjd		/*
2465168404Spjd		 * Start iteration from the beginning of the directory.
2466168404Spjd		 */
2467168404Spjd		zap_cursor_init(&zc, os, zp->z_id);
2468168404Spjd	} else {
2469168404Spjd		/*
2470168404Spjd		 * The offset is a serialized cursor.
2471168404Spjd		 */
2472168404Spjd		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
2473168404Spjd	}
2474168404Spjd
2475168404Spjd	/*
2476168404Spjd	 * Get space to change directory entries into fs independent format.
2477168404Spjd	 */
2478168404Spjd	iovp = uio->uio_iov;
2479168404Spjd	bytes_wanted = iovp->iov_len;
2480168404Spjd	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
2481168404Spjd		bufsize = bytes_wanted;
2482168404Spjd		outbuf = kmem_alloc(bufsize, KM_SLEEP);
2483168404Spjd		odp = (struct dirent64 *)outbuf;
2484168404Spjd	} else {
2485168404Spjd		bufsize = bytes_wanted;
2486247187Smm		outbuf = NULL;
2487168404Spjd		odp = (struct dirent64 *)iovp->iov_base;
2488168404Spjd	}
2489185029Spjd	eodp = (struct edirent *)odp;
2490168404Spjd
2491169170Spjd	if (ncookies != NULL) {
2492168404Spjd		/*
2493168404Spjd		 * Minimum entry size is dirent size and 1 byte for a file name.
2494168404Spjd		 */
2495168962Spjd		ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
2496219404Spjd		cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
2497219404Spjd		*cookies = cooks;
2498168962Spjd		*ncookies = ncooks;
2499168404Spjd	}
2500185029Spjd	/*
2501185029Spjd	 * If this VFS supports the system attribute view interface; and
2502185029Spjd	 * we're looking at an extended attribute directory; and we care
2503185029Spjd	 * about normalization conflicts on this vfs; then we must check
2504185029Spjd	 * for normalization conflicts with the sysattr name space.
2505185029Spjd	 */
2506185029Spjd#ifdef TODO
2507185029Spjd	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
2508185029Spjd	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
2509185029Spjd	    (flags & V_RDDIR_ENTFLAGS);
2510185029Spjd#else
2511185029Spjd	check_sysattrs = 0;
2512185029Spjd#endif
2513168404Spjd
2514168404Spjd	/*
2515168404Spjd	 * Transform to file-system independent format
2516168404Spjd	 */
2517168404Spjd	outcount = 0;
2518168404Spjd	while (outcount < bytes_wanted) {
2519168404Spjd		ino64_t objnum;
2520168404Spjd		ushort_t reclen;
2521219089Spjd		off64_t *next = NULL;
2522168404Spjd
2523168404Spjd		/*
2524168404Spjd		 * Special case `.', `..', and `.zfs'.
2525168404Spjd		 */
2526168404Spjd		if (offset == 0) {
2527168404Spjd			(void) strcpy(zap.za_name, ".");
2528185029Spjd			zap.za_normalization_conflict = 0;
2529168404Spjd			objnum = zp->z_id;
2530169108Spjd			type = DT_DIR;
2531168404Spjd		} else if (offset == 1) {
2532168404Spjd			(void) strcpy(zap.za_name, "..");
2533185029Spjd			zap.za_normalization_conflict = 0;
2534219089Spjd			objnum = parent;
2535169108Spjd			type = DT_DIR;
2536168404Spjd		} else if (offset == 2 && zfs_show_ctldir(zp)) {
2537168404Spjd			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
2538185029Spjd			zap.za_normalization_conflict = 0;
2539168404Spjd			objnum = ZFSCTL_INO_ROOT;
2540169108Spjd			type = DT_DIR;
2541168404Spjd		} else {
2542168404Spjd			/*
2543168404Spjd			 * Grab next entry.
2544168404Spjd			 */
2545168404Spjd			if (error = zap_cursor_retrieve(&zc, &zap)) {
2546168404Spjd				if ((*eofp = (error == ENOENT)) != 0)
2547168404Spjd					break;
2548168404Spjd				else
2549168404Spjd					goto update;
2550168404Spjd			}
2551168404Spjd
2552168404Spjd			if (zap.za_integer_length != 8 ||
2553168404Spjd			    zap.za_num_integers != 1) {
2554168404Spjd				cmn_err(CE_WARN, "zap_readdir: bad directory "
2555168404Spjd				    "entry, obj = %lld, offset = %lld\n",
2556168404Spjd				    (u_longlong_t)zp->z_id,
2557168404Spjd				    (u_longlong_t)offset);
2558249195Smm				error = SET_ERROR(ENXIO);
2559168404Spjd				goto update;
2560168404Spjd			}
2561168404Spjd
2562168404Spjd			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
2563168404Spjd			/*
2564168404Spjd			 * MacOS X can extract the object type here such as:
2565168404Spjd			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2566168404Spjd			 */
2567168404Spjd			type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2568185029Spjd
2569185029Spjd			if (check_sysattrs && !zap.za_normalization_conflict) {
2570185029Spjd#ifdef TODO
2571185029Spjd				zap.za_normalization_conflict =
2572185029Spjd				    xattr_sysattr_casechk(zap.za_name);
2573185029Spjd#else
2574185029Spjd				panic("%s:%u: TODO", __func__, __LINE__);
2575185029Spjd#endif
2576185029Spjd			}
2577168404Spjd		}
2578168404Spjd
2579211932Smm		if (flags & V_RDDIR_ACCFILTER) {
2580211932Smm			/*
2581211932Smm			 * If we have no access at all, don't include
2582211932Smm			 * this entry in the returned information
2583211932Smm			 */
2584211932Smm			znode_t	*ezp;
2585211932Smm			if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
2586211932Smm				goto skip_entry;
2587211932Smm			if (!zfs_has_access(ezp, cr)) {
2588211932Smm				VN_RELE(ZTOV(ezp));
2589211932Smm				goto skip_entry;
2590211932Smm			}
2591211932Smm			VN_RELE(ZTOV(ezp));
2592211932Smm		}
2593211932Smm
2594185029Spjd		if (flags & V_RDDIR_ENTFLAGS)
2595185029Spjd			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
2596185029Spjd		else
2597185029Spjd			reclen = DIRENT64_RECLEN(strlen(zap.za_name));
2598185029Spjd
2599168404Spjd		/*
2600168404Spjd		 * Will this entry fit in the buffer?
2601168404Spjd		 */
2602168404Spjd		if (outcount + reclen > bufsize) {
2603168404Spjd			/*
2604168404Spjd			 * Did we manage to fit anything in the buffer?
2605168404Spjd			 */
2606168404Spjd			if (!outcount) {
2607249195Smm				error = SET_ERROR(EINVAL);
2608168404Spjd				goto update;
2609168404Spjd			}
2610168404Spjd			break;
2611168404Spjd		}
2612185029Spjd		if (flags & V_RDDIR_ENTFLAGS) {
2613185029Spjd			/*
2614185029Spjd			 * Add extended flag entry:
2615185029Spjd			 */
2616185029Spjd			eodp->ed_ino = objnum;
2617185029Spjd			eodp->ed_reclen = reclen;
2618185029Spjd			/* NOTE: ed_off is the offset for the *next* entry */
2619185029Spjd			next = &(eodp->ed_off);
2620185029Spjd			eodp->ed_eflags = zap.za_normalization_conflict ?
2621185029Spjd			    ED_CASE_CONFLICT : 0;
2622185029Spjd			(void) strncpy(eodp->ed_name, zap.za_name,
2623185029Spjd			    EDIRENT_NAMELEN(reclen));
2624185029Spjd			eodp = (edirent_t *)((intptr_t)eodp + reclen);
2625185029Spjd		} else {
2626185029Spjd			/*
2627185029Spjd			 * Add normal entry:
2628185029Spjd			 */
2629185029Spjd			odp->d_ino = objnum;
2630185029Spjd			odp->d_reclen = reclen;
2631185029Spjd			odp->d_namlen = strlen(zap.za_name);
2632185029Spjd			(void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
2633185029Spjd			odp->d_type = type;
2634185029Spjd			odp = (dirent64_t *)((intptr_t)odp + reclen);
2635185029Spjd		}
2636168404Spjd		outcount += reclen;
2637168404Spjd
2638168404Spjd		ASSERT(outcount <= bufsize);
2639168404Spjd
2640168404Spjd		/* Prefetch znode */
2641168404Spjd		if (prefetch)
2642168404Spjd			dmu_prefetch(os, objnum, 0, 0);
2643168404Spjd
2644211932Smm	skip_entry:
2645168404Spjd		/*
2646168404Spjd		 * Move to the next entry, fill in the previous offset.
2647168404Spjd		 */
2648168404Spjd		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2649168404Spjd			zap_cursor_advance(&zc);
2650168404Spjd			offset = zap_cursor_serialize(&zc);
2651168404Spjd		} else {
2652168404Spjd			offset += 1;
2653168404Spjd		}
2654219404Spjd
2655219404Spjd		if (cooks != NULL) {
2656219404Spjd			*cooks++ = offset;
2657219404Spjd			ncooks--;
2658219404Spjd			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
2659219404Spjd		}
2660168404Spjd	}
2661168404Spjd	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2662168404Spjd
2663168404Spjd	/* Subtract unused cookies */
2664168962Spjd	if (ncookies != NULL)
2665168962Spjd		*ncookies -= ncooks;
2666168404Spjd
2667168404Spjd	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
2668168404Spjd		iovp->iov_base += outcount;
2669168404Spjd		iovp->iov_len -= outcount;
2670168404Spjd		uio->uio_resid -= outcount;
2671168404Spjd	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
2672168404Spjd		/*
2673168404Spjd		 * Reset the pointer.
2674168404Spjd		 */
2675168404Spjd		offset = uio->uio_loffset;
2676168404Spjd	}
2677168404Spjd
2678168404Spjdupdate:
2679168404Spjd	zap_cursor_fini(&zc);
2680168404Spjd	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
2681168404Spjd		kmem_free(outbuf, bufsize);
2682168404Spjd
2683168404Spjd	if (error == ENOENT)
2684168404Spjd		error = 0;
2685168404Spjd
2686168404Spjd	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2687168404Spjd
2688168404Spjd	uio->uio_loffset = offset;
2689168404Spjd	ZFS_EXIT(zfsvfs);
2690169107Spjd	if (error != 0 && cookies != NULL) {
2691168962Spjd		free(*cookies, M_TEMP);
2692168962Spjd		*cookies = NULL;
2693168962Spjd		*ncookies = 0;
2694168404Spjd	}
2695168404Spjd	return (error);
2696168404Spjd}
2697168404Spjd
2698185029Spjdulong_t zfs_fsync_sync_cnt = 4;
2699185029Spjd
2700168404Spjdstatic int
2701185029Spjdzfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
2702168404Spjd{
2703168962Spjd	znode_t	*zp = VTOZ(vp);
2704168962Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2705168404Spjd
2706185029Spjd	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2707185029Spjd
2708219089Spjd	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
2709219089Spjd		ZFS_ENTER(zfsvfs);
2710219089Spjd		ZFS_VERIFY_ZP(zp);
2711219089Spjd		zil_commit(zfsvfs->z_log, zp->z_id);
2712219089Spjd		ZFS_EXIT(zfsvfs);
2713219089Spjd	}
2714168404Spjd	return (0);
2715168404Spjd}
2716168404Spjd
2717185029Spjd
2718168404Spjd/*
2719168404Spjd * Get the requested file attributes and place them in the provided
2720168404Spjd * vattr structure.
2721168404Spjd *
2722168404Spjd *	IN:	vp	- vnode of file.
2723168404Spjd *		vap	- va_mask identifies requested attributes.
2724185029Spjd *			  If AT_XVATTR set, then optional attrs are requested
2725185029Spjd *		flags	- ATTR_NOACLCHECK (CIFS server context)
2726168404Spjd *		cr	- credentials of caller.
2727185029Spjd *		ct	- caller context
2728168404Spjd *
2729168404Spjd *	OUT:	vap	- attribute values.
2730168404Spjd *
2731251631Sdelphij *	RETURN:	0 (always succeeds).
2732168404Spjd */
2733168404Spjd/* ARGSUSED */
2734168404Spjdstatic int
2735185029Spjdzfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2736185029Spjd    caller_context_t *ct)
2737168404Spjd{
2738168962Spjd	znode_t *zp = VTOZ(vp);
2739168962Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2740185029Spjd	int	error = 0;
2741168962Spjd	uint32_t blksize;
2742168962Spjd	u_longlong_t nblocks;
2743185029Spjd	uint64_t links;
2744224251Sdelphij	uint64_t mtime[2], ctime[2], crtime[2], rdev;
2745185029Spjd	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
2746185029Spjd	xoptattr_t *xoap = NULL;
2747185029Spjd	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2748224251Sdelphij	sa_bulk_attr_t bulk[4];
2749219089Spjd	int count = 0;
2750168404Spjd
2751168404Spjd	ZFS_ENTER(zfsvfs);
2752185029Spjd	ZFS_VERIFY_ZP(zp);
2753168404Spjd
2754219089Spjd	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
2755219089Spjd
2756219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
2757219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
2758243807Sdelphij	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
2759224251Sdelphij	if (vp->v_type == VBLK || vp->v_type == VCHR)
2760224251Sdelphij		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
2761224251Sdelphij		    &rdev, 8);
2762219089Spjd
2763219089Spjd	if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
2764219089Spjd		ZFS_EXIT(zfsvfs);
2765219089Spjd		return (error);
2766219089Spjd	}
2767219089Spjd
2768168404Spjd	/*
2769185029Spjd	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
2770185029Spjd	 * Also, if we are the owner don't bother, since owner should
2771185029Spjd	 * always be allowed to read basic attributes of file.
2772185029Spjd	 */
2773219089Spjd	if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
2774219089Spjd	    (vap->va_uid != crgetuid(cr))) {
2775185029Spjd		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
2776185029Spjd		    skipaclchk, cr)) {
2777185029Spjd			ZFS_EXIT(zfsvfs);
2778185029Spjd			return (error);
2779185029Spjd		}
2780185029Spjd	}
2781185029Spjd
2782185029Spjd	/*
2783168404Spjd	 * Return all attributes.  It's cheaper to provide the answer
2784168404Spjd	 * than to determine whether we were asked the question.
2785168404Spjd	 */
2786168404Spjd
2787209097Smm	mutex_enter(&zp->z_lock);
2788219089Spjd	vap->va_type = IFTOVT(zp->z_mode);
2789219089Spjd	vap->va_mode = zp->z_mode & ~S_IFMT;
2790224252Sdelphij#ifdef sun
2791224252Sdelphij	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
2792224252Sdelphij#else
2793224252Sdelphij	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
2794224252Sdelphij#endif
2795168404Spjd	vap->va_nodeid = zp->z_id;
2796185029Spjd	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
2797219089Spjd		links = zp->z_links + 1;
2798185029Spjd	else
2799219089Spjd		links = zp->z_links;
2800229425Sdim	vap->va_nlink = MIN(links, LINK_MAX);	/* nlink_t limit! */
2801219089Spjd	vap->va_size = zp->z_size;
2802224252Sdelphij#ifdef sun
2803224252Sdelphij	vap->va_rdev = vp->v_rdev;
2804224252Sdelphij#else
2805224251Sdelphij	if (vp->v_type == VBLK || vp->v_type == VCHR)
2806224251Sdelphij		vap->va_rdev = zfs_cmpldev(rdev);
2807224252Sdelphij#endif
2808168404Spjd	vap->va_seq = zp->z_seq;
2809168404Spjd	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */
2810168404Spjd
2811185029Spjd	/*
2812185029Spjd	 * Add in any requested optional attributes and the create time.
2813185029Spjd	 * Also set the corresponding bits in the returned attribute bitmap.
2814185029Spjd	 */
2815185029Spjd	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
2816185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
2817185029Spjd			xoap->xoa_archive =
2818219089Spjd			    ((zp->z_pflags & ZFS_ARCHIVE) != 0);
2819185029Spjd			XVA_SET_RTN(xvap, XAT_ARCHIVE);
2820185029Spjd		}
2821185029Spjd
2822185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
2823185029Spjd			xoap->xoa_readonly =
2824219089Spjd			    ((zp->z_pflags & ZFS_READONLY) != 0);
2825185029Spjd			XVA_SET_RTN(xvap, XAT_READONLY);
2826185029Spjd		}
2827185029Spjd
2828185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
2829185029Spjd			xoap->xoa_system =
2830219089Spjd			    ((zp->z_pflags & ZFS_SYSTEM) != 0);
2831185029Spjd			XVA_SET_RTN(xvap, XAT_SYSTEM);
2832185029Spjd		}
2833185029Spjd
2834185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
2835185029Spjd			xoap->xoa_hidden =
2836219089Spjd			    ((zp->z_pflags & ZFS_HIDDEN) != 0);
2837185029Spjd			XVA_SET_RTN(xvap, XAT_HIDDEN);
2838185029Spjd		}
2839185029Spjd
2840185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2841185029Spjd			xoap->xoa_nounlink =
2842219089Spjd			    ((zp->z_pflags & ZFS_NOUNLINK) != 0);
2843185029Spjd			XVA_SET_RTN(xvap, XAT_NOUNLINK);
2844185029Spjd		}
2845185029Spjd
2846185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2847185029Spjd			xoap->xoa_immutable =
2848219089Spjd			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
2849185029Spjd			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
2850185029Spjd		}
2851185029Spjd
2852185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2853185029Spjd			xoap->xoa_appendonly =
2854219089Spjd			    ((zp->z_pflags & ZFS_APPENDONLY) != 0);
2855185029Spjd			XVA_SET_RTN(xvap, XAT_APPENDONLY);
2856185029Spjd		}
2857185029Spjd
2858185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2859185029Spjd			xoap->xoa_nodump =
2860219089Spjd			    ((zp->z_pflags & ZFS_NODUMP) != 0);
2861185029Spjd			XVA_SET_RTN(xvap, XAT_NODUMP);
2862185029Spjd		}
2863185029Spjd
2864185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
2865185029Spjd			xoap->xoa_opaque =
2866219089Spjd			    ((zp->z_pflags & ZFS_OPAQUE) != 0);
2867185029Spjd			XVA_SET_RTN(xvap, XAT_OPAQUE);
2868185029Spjd		}
2869185029Spjd
2870185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2871185029Spjd			xoap->xoa_av_quarantined =
2872219089Spjd			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
2873185029Spjd			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
2874185029Spjd		}
2875185029Spjd
2876185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2877185029Spjd			xoap->xoa_av_modified =
2878219089Spjd			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
2879185029Spjd			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
2880185029Spjd		}
2881185029Spjd
2882185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
2883219089Spjd		    vp->v_type == VREG) {
2884219089Spjd			zfs_sa_get_scanstamp(zp, xvap);
2885185029Spjd		}
2886185029Spjd
2887185029Spjd		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
2888219089Spjd			uint64_t times[2];
2889219089Spjd
2890219089Spjd			(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
2891219089Spjd			    times, sizeof (times));
2892219089Spjd			ZFS_TIME_DECODE(&xoap->xoa_createtime, times);
2893185029Spjd			XVA_SET_RTN(xvap, XAT_CREATETIME);
2894185029Spjd		}
2895219089Spjd
2896219089Spjd		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2897219089Spjd			xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
2898219089Spjd			XVA_SET_RTN(xvap, XAT_REPARSE);
2899219089Spjd		}
2900219089Spjd		if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
2901219089Spjd			xoap->xoa_generation = zp->z_gen;
2902219089Spjd			XVA_SET_RTN(xvap, XAT_GEN);
2903219089Spjd		}
2904219089Spjd
2905219089Spjd		if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
2906219089Spjd			xoap->xoa_offline =
2907219089Spjd			    ((zp->z_pflags & ZFS_OFFLINE) != 0);
2908219089Spjd			XVA_SET_RTN(xvap, XAT_OFFLINE);
2909219089Spjd		}
2910219089Spjd
2911219089Spjd		if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
2912219089Spjd			xoap->xoa_sparse =
2913219089Spjd			    ((zp->z_pflags & ZFS_SPARSE) != 0);
2914219089Spjd			XVA_SET_RTN(xvap, XAT_SPARSE);
2915219089Spjd		}
2916185029Spjd	}
2917185029Spjd
2918219089Spjd	ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
2919219089Spjd	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
2920219089Spjd	ZFS_TIME_DECODE(&vap->va_ctime, ctime);
2921219089Spjd	ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
2922168404Spjd
2923168404Spjd	mutex_exit(&zp->z_lock);
2924168404Spjd
2925219089Spjd	sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
2926168404Spjd	vap->va_blksize = blksize;
2927168404Spjd	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */
2928168404Spjd
2929168404Spjd	if (zp->z_blksz == 0) {
2930168404Spjd		/*
2931168404Spjd		 * Block size hasn't been set; suggest maximal I/O transfers.
2932168404Spjd		 */
2933168404Spjd		vap->va_blksize = zfsvfs->z_max_blksz;
2934168404Spjd	}
2935168404Spjd
2936168404Spjd	ZFS_EXIT(zfsvfs);
2937168404Spjd	return (0);
2938168404Spjd}
2939168404Spjd
2940168404Spjd/*
2941168404Spjd * Set the file attributes to the values contained in the
2942168404Spjd * vattr structure.
2943168404Spjd *
2944168404Spjd *	IN:	vp	- vnode of file to be modified.
2945168404Spjd *		vap	- new attribute values.
2946185029Spjd *			  If AT_XVATTR set, then optional attrs are being set
2947168404Spjd *		flags	- ATTR_UTIME set if non-default time values provided.
2948185029Spjd *			- ATTR_NOACLCHECK (CIFS context only).
2949168404Spjd *		cr	- credentials of caller.
2950185029Spjd *		ct	- caller context
2951168404Spjd *
2952251631Sdelphij *	RETURN:	0 on success, error code on failure.
2953168404Spjd *
2954168404Spjd * Timestamps:
2955168404Spjd *	vp - ctime updated, mtime updated if size changed.
2956168404Spjd */
2957168404Spjd/* ARGSUSED */
2958168404Spjdstatic int
2959168962Spjdzfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2960251631Sdelphij    caller_context_t *ct)
2961168404Spjd{
2962185029Spjd	znode_t		*zp = VTOZ(vp);
2963168404Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2964185029Spjd	zilog_t		*zilog;
2965168404Spjd	dmu_tx_t	*tx;
2966168404Spjd	vattr_t		oldva;
2967209962Smm	xvattr_t	tmpxvattr;
2968168962Spjd	uint_t		mask = vap->va_mask;
2969247187Smm	uint_t		saved_mask = 0;
2970197831Spjd	uint64_t	saved_mode;
2971168404Spjd	int		trim_mask = 0;
2972168404Spjd	uint64_t	new_mode;
2973209962Smm	uint64_t	new_uid, new_gid;
2974219089Spjd	uint64_t	xattr_obj;
2975219089Spjd	uint64_t	mtime[2], ctime[2];
2976168404Spjd	znode_t		*attrzp;
2977168404Spjd	int		need_policy = FALSE;
2978219089Spjd	int		err, err2;
2979185029Spjd	zfs_fuid_info_t *fuidp = NULL;
2980185029Spjd	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
2981185029Spjd	xoptattr_t	*xoap;
2982219089Spjd	zfs_acl_t	*aclp;
2983185029Spjd	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2984219089Spjd	boolean_t	fuid_dirtied = B_FALSE;
2985219089Spjd	sa_bulk_attr_t	bulk[7], xattr_bulk[7];
2986219089Spjd	int		count = 0, xattr_count = 0;
2987168404Spjd
2988168404Spjd	if (mask == 0)
2989168404Spjd		return (0);
2990168404Spjd
2991168962Spjd	if (mask & AT_NOSET)
2992249195Smm		return (SET_ERROR(EINVAL));
2993168962Spjd
2994185029Spjd	ZFS_ENTER(zfsvfs);
2995185029Spjd	ZFS_VERIFY_ZP(zp);
2996185029Spjd
2997185029Spjd	zilog = zfsvfs->z_log;
2998185029Spjd
2999185029Spjd	/*
3000185029Spjd	 * Make sure that if we have ephemeral uid/gid or xvattr specified
3001185029Spjd	 * that file system is at proper version level
3002185029Spjd	 */
3003185029Spjd
3004185029Spjd	if (zfsvfs->z_use_fuids == B_FALSE &&
3005185029Spjd	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
3006185029Spjd	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
3007185029Spjd	    (mask & AT_XVATTR))) {
3008185029Spjd		ZFS_EXIT(zfsvfs);
3009249195Smm		return (SET_ERROR(EINVAL));
3010185029Spjd	}
3011185029Spjd
3012185029Spjd	if (mask & AT_SIZE && vp->v_type == VDIR) {
3013185029Spjd		ZFS_EXIT(zfsvfs);
3014249195Smm		return (SET_ERROR(EISDIR));
3015185029Spjd	}
3016168404Spjd
3017185029Spjd	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
3018185029Spjd		ZFS_EXIT(zfsvfs);
3019249195Smm		return (SET_ERROR(EINVAL));
3020185029Spjd	}
3021168404Spjd
3022185029Spjd	/*
3023185029Spjd	 * If this is an xvattr_t, then get a pointer to the structure of
3024185029Spjd	 * optional attributes.  If this is NULL, then we have a vattr_t.
3025185029Spjd	 */
3026185029Spjd	xoap = xva_getxoptattr(xvap);
3027168404Spjd
3028209962Smm	xva_init(&tmpxvattr);
3029209962Smm
3030185029Spjd	/*
3031185029Spjd	 * Immutable files can only alter immutable bit and atime
3032185029Spjd	 */
3033219089Spjd	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
3034185029Spjd	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
3035185029Spjd	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
3036185029Spjd		ZFS_EXIT(zfsvfs);
3037249195Smm		return (SET_ERROR(EPERM));
3038185029Spjd	}
3039185029Spjd
3040219089Spjd	if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
3041185029Spjd		ZFS_EXIT(zfsvfs);
3042249195Smm		return (SET_ERROR(EPERM));
3043185029Spjd	}
3044185029Spjd
3045185029Spjd	/*
3046185029Spjd	 * Verify timestamps doesn't overflow 32 bits.
3047185029Spjd	 * ZFS can handle large timestamps, but 32bit syscalls can't
3048185029Spjd	 * handle times greater than 2039.  This check should be removed
3049185029Spjd	 * once large timestamps are fully supported.
3050185029Spjd	 */
3051185029Spjd	if (mask & (AT_ATIME | AT_MTIME)) {
3052185029Spjd		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
3053185029Spjd		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
3054185029Spjd			ZFS_EXIT(zfsvfs);
3055249195Smm			return (SET_ERROR(EOVERFLOW));
3056185029Spjd		}
3057185029Spjd	}
3058185029Spjd
3059168404Spjdtop:
3060168404Spjd	attrzp = NULL;
3061219089Spjd	aclp = NULL;
3062168404Spjd
3063211932Smm	/* Can this be moved to before the top label? */
3064168404Spjd	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
3065168404Spjd		ZFS_EXIT(zfsvfs);
3066249195Smm		return (SET_ERROR(EROFS));
3067168404Spjd	}
3068168404Spjd
3069168404Spjd	/*
3070168404Spjd	 * First validate permissions
3071168404Spjd	 */
3072168404Spjd
3073168404Spjd	if (mask & AT_SIZE) {
3074168404Spjd		/*
3075168404Spjd		 * XXX - Note, we are not providing any open
3076168404Spjd		 * mode flags here (like FNDELAY), so we may
3077168404Spjd		 * block if there are locks present... this
3078168404Spjd		 * should be addressed in openat().
3079168404Spjd		 */
3080185029Spjd		/* XXX - would it be OK to generate a log record here? */
3081185029Spjd		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
3082168404Spjd		if (err) {
3083168404Spjd			ZFS_EXIT(zfsvfs);
3084168404Spjd			return (err);
3085168404Spjd		}
3086168404Spjd	}
3087168404Spjd
3088185029Spjd	if (mask & (AT_ATIME|AT_MTIME) ||
3089185029Spjd	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
3090185029Spjd	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
3091185029Spjd	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
3092219089Spjd	    XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
3093219089Spjd	    XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
3094185029Spjd	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
3095219089Spjd	    XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
3096185029Spjd		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
3097185029Spjd		    skipaclchk, cr);
3098219089Spjd	}
3099168404Spjd
3100168404Spjd	if (mask & (AT_UID|AT_GID)) {
3101168404Spjd		int	idmask = (mask & (AT_UID|AT_GID));
3102168404Spjd		int	take_owner;
3103168404Spjd		int	take_group;
3104168404Spjd
3105168404Spjd		/*
3106168404Spjd		 * NOTE: even if a new mode is being set,
3107168404Spjd		 * we may clear S_ISUID/S_ISGID bits.
3108168404Spjd		 */
3109168404Spjd
3110168404Spjd		if (!(mask & AT_MODE))
3111219089Spjd			vap->va_mode = zp->z_mode;
3112168404Spjd
3113168404Spjd		/*
3114168404Spjd		 * Take ownership or chgrp to group we are a member of
3115168404Spjd		 */
3116168404Spjd
3117168404Spjd		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
3118185029Spjd		take_group = (mask & AT_GID) &&
3119185029Spjd		    zfs_groupmember(zfsvfs, vap->va_gid, cr);
3120168404Spjd
3121168404Spjd		/*
3122168404Spjd		 * If both AT_UID and AT_GID are set then take_owner and
3123168404Spjd		 * take_group must both be set in order to allow taking
3124168404Spjd		 * ownership.
3125168404Spjd		 *
3126168404Spjd		 * Otherwise, send the check through secpolicy_vnode_setattr()
3127168404Spjd		 *
3128168404Spjd		 */
3129168404Spjd
3130168404Spjd		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
3131168404Spjd		    ((idmask == AT_UID) && take_owner) ||
3132168404Spjd		    ((idmask == AT_GID) && take_group)) {
3133185029Spjd			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
3134185029Spjd			    skipaclchk, cr) == 0) {
3135168404Spjd				/*
3136168404Spjd				 * Remove setuid/setgid for non-privileged users
3137168404Spjd				 */
3138185029Spjd				secpolicy_setid_clear(vap, vp, cr);
3139168404Spjd				trim_mask = (mask & (AT_UID|AT_GID));
3140168404Spjd			} else {
3141168404Spjd				need_policy =  TRUE;
3142168404Spjd			}
3143168404Spjd		} else {
3144168404Spjd			need_policy =  TRUE;
3145168404Spjd		}
3146168404Spjd	}
3147168404Spjd
3148168404Spjd	mutex_enter(&zp->z_lock);
3149219089Spjd	oldva.va_mode = zp->z_mode;
3150185029Spjd	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
3151185029Spjd	if (mask & AT_XVATTR) {
3152209962Smm		/*
3153209962Smm		 * Update xvattr mask to include only those attributes
3154209962Smm		 * that are actually changing.
3155209962Smm		 *
3156209962Smm		 * the bits will be restored prior to actually setting
3157209962Smm		 * the attributes so the caller thinks they were set.
3158209962Smm		 */
3159209962Smm		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
3160209962Smm			if (xoap->xoa_appendonly !=
3161219089Spjd			    ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
3162209962Smm				need_policy = TRUE;
3163209962Smm			} else {
3164209962Smm				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
3165209962Smm				XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
3166209962Smm			}
3167209962Smm		}
3168209962Smm
3169209962Smm		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
3170209962Smm			if (xoap->xoa_nounlink !=
3171219089Spjd			    ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
3172209962Smm				need_policy = TRUE;
3173209962Smm			} else {
3174209962Smm				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
3175209962Smm				XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
3176209962Smm			}
3177209962Smm		}
3178209962Smm
3179209962Smm		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
3180209962Smm			if (xoap->xoa_immutable !=
3181219089Spjd			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
3182209962Smm				need_policy = TRUE;
3183209962Smm			} else {
3184209962Smm				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
3185209962Smm				XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
3186209962Smm			}
3187209962Smm		}
3188209962Smm
3189209962Smm		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
3190209962Smm			if (xoap->xoa_nodump !=
3191219089Spjd			    ((zp->z_pflags & ZFS_NODUMP) != 0)) {
3192209962Smm				need_policy = TRUE;
3193209962Smm			} else {
3194209962Smm				XVA_CLR_REQ(xvap, XAT_NODUMP);
3195209962Smm				XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
3196209962Smm			}
3197209962Smm		}
3198209962Smm
3199209962Smm		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
3200209962Smm			if (xoap->xoa_av_modified !=
3201219089Spjd			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
3202209962Smm				need_policy = TRUE;
3203209962Smm			} else {
3204209962Smm				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
3205209962Smm				XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
3206209962Smm			}
3207209962Smm		}
3208209962Smm
3209209962Smm		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
3210209962Smm			if ((vp->v_type != VREG &&
3211209962Smm			    xoap->xoa_av_quarantined) ||
3212209962Smm			    xoap->xoa_av_quarantined !=
3213219089Spjd			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
3214209962Smm				need_policy = TRUE;
3215209962Smm			} else {
3216209962Smm				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
3217209962Smm				XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
3218209962Smm			}
3219209962Smm		}
3220209962Smm
3221219089Spjd		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
3222219089Spjd			mutex_exit(&zp->z_lock);
3223219089Spjd			ZFS_EXIT(zfsvfs);
3224249195Smm			return (SET_ERROR(EPERM));
3225219089Spjd		}
3226219089Spjd
3227209962Smm		if (need_policy == FALSE &&
3228209962Smm		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
3229209962Smm		    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
3230185029Spjd			need_policy = TRUE;
3231185029Spjd		}
3232185029Spjd	}
3233185029Spjd
3234168404Spjd	mutex_exit(&zp->z_lock);
3235168404Spjd
3236168404Spjd	if (mask & AT_MODE) {
3237185029Spjd		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
3238168962Spjd			err = secpolicy_setid_setsticky_clear(vp, vap,
3239168962Spjd			    &oldva, cr);
3240168962Spjd			if (err) {
3241168962Spjd				ZFS_EXIT(zfsvfs);
3242168962Spjd				return (err);
3243168962Spjd			}
3244168404Spjd			trim_mask |= AT_MODE;
3245168404Spjd		} else {
3246168404Spjd			need_policy = TRUE;
3247168404Spjd		}
3248168404Spjd	}
3249168404Spjd
3250168404Spjd	if (need_policy) {
3251168404Spjd		/*
3252168404Spjd		 * If trim_mask is set then take ownership
3253168404Spjd		 * has been granted or write_acl is present and user
3254168404Spjd		 * has the ability to modify mode.  In that case remove
3255168404Spjd		 * UID|GID and or MODE from mask so that
3256168404Spjd		 * secpolicy_vnode_setattr() doesn't revoke it.
3257168404Spjd		 */
3258168404Spjd
3259168404Spjd		if (trim_mask) {
3260168404Spjd			saved_mask = vap->va_mask;
3261168404Spjd			vap->va_mask &= ~trim_mask;
3262197831Spjd			if (trim_mask & AT_MODE) {
3263197831Spjd				/*
3264197831Spjd				 * Save the mode, as secpolicy_vnode_setattr()
3265197831Spjd				 * will overwrite it with ova.va_mode.
3266197831Spjd				 */
3267197831Spjd				saved_mode = vap->va_mode;
3268197831Spjd			}
3269168404Spjd		}
3270168404Spjd		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
3271185029Spjd		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
3272168404Spjd		if (err) {
3273168404Spjd			ZFS_EXIT(zfsvfs);
3274168404Spjd			return (err);
3275168404Spjd		}
3276168404Spjd
3277197831Spjd		if (trim_mask) {
3278168404Spjd			vap->va_mask |= saved_mask;
3279197831Spjd			if (trim_mask & AT_MODE) {
3280197831Spjd				/*
3281197831Spjd				 * Recover the mode after
3282197831Spjd				 * secpolicy_vnode_setattr().
3283197831Spjd				 */
3284197831Spjd				vap->va_mode = saved_mode;
3285197831Spjd			}
3286197831Spjd		}
3287168404Spjd	}
3288168404Spjd
3289168404Spjd	/*
3290168404Spjd	 * secpolicy_vnode_setattr, or take ownership may have
3291168404Spjd	 * changed va_mask
3292168404Spjd	 */
3293168404Spjd	mask = vap->va_mask;
3294168404Spjd
3295219089Spjd	if ((mask & (AT_UID | AT_GID))) {
3296219089Spjd		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
3297219089Spjd		    &xattr_obj, sizeof (xattr_obj));
3298168404Spjd
3299219089Spjd		if (err == 0 && xattr_obj) {
3300219089Spjd			err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
3301209962Smm			if (err)
3302219089Spjd				goto out2;
3303168404Spjd		}
3304209962Smm		if (mask & AT_UID) {
3305209962Smm			new_uid = zfs_fuid_create(zfsvfs,
3306209962Smm			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
3307219089Spjd			if (new_uid != zp->z_uid &&
3308219089Spjd			    zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
3309219089Spjd				if (attrzp)
3310219089Spjd					VN_RELE(ZTOV(attrzp));
3311249195Smm				err = SET_ERROR(EDQUOT);
3312219089Spjd				goto out2;
3313209962Smm			}
3314209962Smm		}
3315209962Smm
3316209962Smm		if (mask & AT_GID) {
3317209962Smm			new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
3318209962Smm			    cr, ZFS_GROUP, &fuidp);
3319219089Spjd			if (new_gid != zp->z_gid &&
3320219089Spjd			    zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
3321219089Spjd				if (attrzp)
3322219089Spjd					VN_RELE(ZTOV(attrzp));
3323249195Smm				err = SET_ERROR(EDQUOT);
3324219089Spjd				goto out2;
3325209962Smm			}
3326209962Smm		}
3327219089Spjd	}
3328219089Spjd	tx = dmu_tx_create(zfsvfs->z_os);
3329219089Spjd
3330219089Spjd	if (mask & AT_MODE) {
3331219089Spjd		uint64_t pmode = zp->z_mode;
3332219089Spjd		uint64_t acl_obj;
3333219089Spjd		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
3334219089Spjd
3335243560Smm		if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
3336243560Smm		    !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
3337249195Smm			err = SET_ERROR(EPERM);
3338243560Smm			goto out;
3339243560Smm		}
3340243560Smm
3341224174Smm		if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
3342224174Smm			goto out;
3343219089Spjd
3344219089Spjd		mutex_enter(&zp->z_lock);
3345219089Spjd		if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
3346219089Spjd			/*
3347219089Spjd			 * Are we upgrading ACL from old V0 format
3348219089Spjd			 * to V1 format?
3349219089Spjd			 */
3350219089Spjd			if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
3351219089Spjd			    zfs_znode_acl_version(zp) ==
3352219089Spjd			    ZFS_ACL_VERSION_INITIAL) {
3353219089Spjd				dmu_tx_hold_free(tx, acl_obj, 0,
3354219089Spjd				    DMU_OBJECT_END);
3355219089Spjd				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3356219089Spjd				    0, aclp->z_acl_bytes);
3357209962Smm			} else {
3358219089Spjd				dmu_tx_hold_write(tx, acl_obj, 0,
3359219089Spjd				    aclp->z_acl_bytes);
3360209962Smm			}
3361219089Spjd		} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3362219089Spjd			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3363219089Spjd			    0, aclp->z_acl_bytes);
3364209962Smm		}
3365219089Spjd		mutex_exit(&zp->z_lock);
3366219089Spjd		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3367219089Spjd	} else {
3368219089Spjd		if ((mask & AT_XVATTR) &&
3369219089Spjd		    XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3370219089Spjd			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3371219089Spjd		else
3372219089Spjd			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3373168404Spjd	}
3374168404Spjd
3375219089Spjd	if (attrzp) {
3376219089Spjd		dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3377219089Spjd	}
3378219089Spjd
3379219089Spjd	fuid_dirtied = zfsvfs->z_fuid_dirty;
3380219089Spjd	if (fuid_dirtied)
3381219089Spjd		zfs_fuid_txhold(zfsvfs, tx);
3382219089Spjd
3383219089Spjd	zfs_sa_upgrade_txholds(tx, zp);
3384219089Spjd
3385209962Smm	err = dmu_tx_assign(tx, TXG_NOWAIT);
3386168404Spjd	if (err) {
3387209962Smm		if (err == ERESTART)
3388168404Spjd			dmu_tx_wait(tx);
3389209962Smm		goto out;
3390168404Spjd	}
3391168404Spjd
3392219089Spjd	count = 0;
3393168404Spjd	/*
3394168404Spjd	 * Set each attribute requested.
3395168404Spjd	 * We group settings according to the locks they need to acquire.
3396168404Spjd	 *
3397168404Spjd	 * Note: you cannot set ctime directly, although it will be
3398168404Spjd	 * updated as a side-effect of calling this function.
3399168404Spjd	 */
3400168404Spjd
3401219089Spjd
3402219089Spjd	if (mask & (AT_UID|AT_GID|AT_MODE))
3403219089Spjd		mutex_enter(&zp->z_acl_lock);
3404168404Spjd	mutex_enter(&zp->z_lock);
3405168404Spjd
3406219089Spjd	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3407219089Spjd	    &zp->z_pflags, sizeof (zp->z_pflags));
3408219089Spjd
3409219089Spjd	if (attrzp) {
3410219089Spjd		if (mask & (AT_UID|AT_GID|AT_MODE))
3411219089Spjd			mutex_enter(&attrzp->z_acl_lock);
3412219089Spjd		mutex_enter(&attrzp->z_lock);
3413219089Spjd		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3414219089Spjd		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
3415219089Spjd		    sizeof (attrzp->z_pflags));
3416219089Spjd	}
3417219089Spjd
3418219089Spjd	if (mask & (AT_UID|AT_GID)) {
3419219089Spjd
3420219089Spjd		if (mask & AT_UID) {
3421219089Spjd			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
3422219089Spjd			    &new_uid, sizeof (new_uid));
3423219089Spjd			zp->z_uid = new_uid;
3424219089Spjd			if (attrzp) {
3425219089Spjd				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3426219089Spjd				    SA_ZPL_UID(zfsvfs), NULL, &new_uid,
3427219089Spjd				    sizeof (new_uid));
3428219089Spjd				attrzp->z_uid = new_uid;
3429219089Spjd			}
3430219089Spjd		}
3431219089Spjd
3432219089Spjd		if (mask & AT_GID) {
3433219089Spjd			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
3434219089Spjd			    NULL, &new_gid, sizeof (new_gid));
3435219089Spjd			zp->z_gid = new_gid;
3436219089Spjd			if (attrzp) {
3437219089Spjd				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3438219089Spjd				    SA_ZPL_GID(zfsvfs), NULL, &new_gid,
3439219089Spjd				    sizeof (new_gid));
3440219089Spjd				attrzp->z_gid = new_gid;
3441219089Spjd			}
3442219089Spjd		}
3443219089Spjd		if (!(mask & AT_MODE)) {
3444219089Spjd			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
3445219089Spjd			    NULL, &new_mode, sizeof (new_mode));
3446219089Spjd			new_mode = zp->z_mode;
3447219089Spjd		}
3448219089Spjd		err = zfs_acl_chown_setattr(zp);
3449219089Spjd		ASSERT(err == 0);
3450219089Spjd		if (attrzp) {
3451219089Spjd			err = zfs_acl_chown_setattr(attrzp);
3452219089Spjd			ASSERT(err == 0);
3453219089Spjd		}
3454219089Spjd	}
3455219089Spjd
3456168404Spjd	if (mask & AT_MODE) {
3457219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
3458219089Spjd		    &new_mode, sizeof (new_mode));
3459219089Spjd		zp->z_mode = new_mode;
3460219089Spjd		ASSERT3U((uintptr_t)aclp, !=, 0);
3461209962Smm		err = zfs_aclset_common(zp, aclp, cr, tx);
3462240415Smm		ASSERT0(err);
3463219089Spjd		if (zp->z_acl_cached)
3464219089Spjd			zfs_acl_free(zp->z_acl_cached);
3465211932Smm		zp->z_acl_cached = aclp;
3466211932Smm		aclp = NULL;
3467168404Spjd	}
3468168404Spjd
3469168404Spjd
3470219089Spjd	if (mask & AT_ATIME) {
3471219089Spjd		ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
3472219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
3473219089Spjd		    &zp->z_atime, sizeof (zp->z_atime));
3474168404Spjd	}
3475168404Spjd
3476219089Spjd	if (mask & AT_MTIME) {
3477219089Spjd		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
3478219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
3479219089Spjd		    mtime, sizeof (mtime));
3480168404Spjd	}
3481168404Spjd
3482185029Spjd	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
3483219089Spjd	if (mask & AT_SIZE && !(mask & AT_MTIME)) {
3484219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
3485219089Spjd		    NULL, mtime, sizeof (mtime));
3486219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3487219089Spjd		    &ctime, sizeof (ctime));
3488219089Spjd		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
3489219089Spjd		    B_TRUE);
3490219089Spjd	} else if (mask != 0) {
3491219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3492219089Spjd		    &ctime, sizeof (ctime));
3493219089Spjd		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
3494219089Spjd		    B_TRUE);
3495219089Spjd		if (attrzp) {
3496219089Spjd			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3497219089Spjd			    SA_ZPL_CTIME(zfsvfs), NULL,
3498219089Spjd			    &ctime, sizeof (ctime));
3499219089Spjd			zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
3500219089Spjd			    mtime, ctime, B_TRUE);
3501219089Spjd		}
3502219089Spjd	}
3503185029Spjd	/*
3504185029Spjd	 * Do this after setting timestamps to prevent timestamp
3505185029Spjd	 * update from toggling bit
3506185029Spjd	 */
3507168404Spjd
3508185029Spjd	if (xoap && (mask & AT_XVATTR)) {
3509209962Smm
3510209962Smm		/*
3511209962Smm		 * restore trimmed off masks
3512209962Smm		 * so that return masks can be set for caller.
3513209962Smm		 */
3514209962Smm
3515209962Smm		if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
3516209962Smm			XVA_SET_REQ(xvap, XAT_APPENDONLY);
3517209962Smm		}
3518209962Smm		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
3519209962Smm			XVA_SET_REQ(xvap, XAT_NOUNLINK);
3520209962Smm		}
3521209962Smm		if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
3522209962Smm			XVA_SET_REQ(xvap, XAT_IMMUTABLE);
3523209962Smm		}
3524209962Smm		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
3525209962Smm			XVA_SET_REQ(xvap, XAT_NODUMP);
3526209962Smm		}
3527209962Smm		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
3528209962Smm			XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
3529209962Smm		}
3530209962Smm		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
3531209962Smm			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
3532209962Smm		}
3533209962Smm
3534219089Spjd		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3535185029Spjd			ASSERT(vp->v_type == VREG);
3536185029Spjd
3537219089Spjd		zfs_xvattr_set(zp, xvap, tx);
3538185029Spjd	}
3539185029Spjd
3540209962Smm	if (fuid_dirtied)
3541209962Smm		zfs_fuid_sync(zfsvfs, tx);
3542209962Smm
3543168404Spjd	if (mask != 0)
3544185029Spjd		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3545168404Spjd
3546168404Spjd	mutex_exit(&zp->z_lock);
3547219089Spjd	if (mask & (AT_UID|AT_GID|AT_MODE))
3548219089Spjd		mutex_exit(&zp->z_acl_lock);
3549168404Spjd
3550219089Spjd	if (attrzp) {
3551219089Spjd		if (mask & (AT_UID|AT_GID|AT_MODE))
3552219089Spjd			mutex_exit(&attrzp->z_acl_lock);
3553219089Spjd		mutex_exit(&attrzp->z_lock);
3554219089Spjd	}
3555209962Smmout:
3556219089Spjd	if (err == 0 && attrzp) {
3557219089Spjd		err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
3558219089Spjd		    xattr_count, tx);
3559219089Spjd		ASSERT(err2 == 0);
3560219089Spjd	}
3561219089Spjd
3562168404Spjd	if (attrzp)
3563168404Spjd		VN_RELE(ZTOV(attrzp));
3564251631Sdelphij
3565211932Smm	if (aclp)
3566209962Smm		zfs_acl_free(aclp);
3567168404Spjd
3568209962Smm	if (fuidp) {
3569209962Smm		zfs_fuid_info_free(fuidp);
3570209962Smm		fuidp = NULL;
3571209962Smm	}
3572209962Smm
3573219089Spjd	if (err) {
3574209962Smm		dmu_tx_abort(tx);
3575219089Spjd		if (err == ERESTART)
3576219089Spjd			goto top;
3577219089Spjd	} else {
3578219089Spjd		err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
3579209962Smm		dmu_tx_commit(tx);
3580219089Spjd	}
3581209962Smm
3582219089Spjdout2:
3583219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3584219089Spjd		zil_commit(zilog, 0);
3585209962Smm
3586168404Spjd	ZFS_EXIT(zfsvfs);
3587168404Spjd	return (err);
3588168404Spjd}
3589168404Spjd
3590168404Spjdtypedef struct zfs_zlock {
3591168404Spjd	krwlock_t	*zl_rwlock;	/* lock we acquired */
3592168404Spjd	znode_t		*zl_znode;	/* znode we held */
3593168404Spjd	struct zfs_zlock *zl_next;	/* next in list */
3594168404Spjd} zfs_zlock_t;
3595168404Spjd
3596168404Spjd/*
3597168404Spjd * Drop locks and release vnodes that were held by zfs_rename_lock().
3598168404Spjd */
3599168404Spjdstatic void
3600168404Spjdzfs_rename_unlock(zfs_zlock_t **zlpp)
3601168404Spjd{
3602168404Spjd	zfs_zlock_t *zl;
3603168404Spjd
3604168404Spjd	while ((zl = *zlpp) != NULL) {
3605168404Spjd		if (zl->zl_znode != NULL)
3606168404Spjd			VN_RELE(ZTOV(zl->zl_znode));
3607168404Spjd		rw_exit(zl->zl_rwlock);
3608168404Spjd		*zlpp = zl->zl_next;
3609168404Spjd		kmem_free(zl, sizeof (*zl));
3610168404Spjd	}
3611168404Spjd}
3612168404Spjd
3613168404Spjd/*
3614168404Spjd * Search back through the directory tree, using the ".." entries.
3615168404Spjd * Lock each directory in the chain to prevent concurrent renames.
3616168404Spjd * Fail any attempt to move a directory into one of its own descendants.
3617168404Spjd * XXX - z_parent_lock can overlap with map or grow locks
3618168404Spjd */
3619168404Spjdstatic int
3620168404Spjdzfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
3621168404Spjd{
3622168404Spjd	zfs_zlock_t	*zl;
3623168404Spjd	znode_t		*zp = tdzp;
3624168404Spjd	uint64_t	rootid = zp->z_zfsvfs->z_root;
3625219089Spjd	uint64_t	oidp = zp->z_id;
3626168404Spjd	krwlock_t	*rwlp = &szp->z_parent_lock;
3627168404Spjd	krw_t		rw = RW_WRITER;
3628168404Spjd
3629168404Spjd	/*
3630168404Spjd	 * First pass write-locks szp and compares to zp->z_id.
3631168404Spjd	 * Later passes read-lock zp and compare to zp->z_parent.
3632168404Spjd	 */
3633168404Spjd	do {
3634168404Spjd		if (!rw_tryenter(rwlp, rw)) {
3635168404Spjd			/*
3636168404Spjd			 * Another thread is renaming in this path.
3637168404Spjd			 * Note that if we are a WRITER, we don't have any
3638168404Spjd			 * parent_locks held yet.
3639168404Spjd			 */
3640168404Spjd			if (rw == RW_READER && zp->z_id > szp->z_id) {
3641168404Spjd				/*
3642168404Spjd				 * Drop our locks and restart
3643168404Spjd				 */
3644168404Spjd				zfs_rename_unlock(&zl);
3645168404Spjd				*zlpp = NULL;
3646168404Spjd				zp = tdzp;
3647219089Spjd				oidp = zp->z_id;
3648168404Spjd				rwlp = &szp->z_parent_lock;
3649168404Spjd				rw = RW_WRITER;
3650168404Spjd				continue;
3651168404Spjd			} else {
3652168404Spjd				/*
3653168404Spjd				 * Wait for other thread to drop its locks
3654168404Spjd				 */
3655168404Spjd				rw_enter(rwlp, rw);
3656168404Spjd			}
3657168404Spjd		}
3658168404Spjd
3659168404Spjd		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
3660168404Spjd		zl->zl_rwlock = rwlp;
3661168404Spjd		zl->zl_znode = NULL;
3662168404Spjd		zl->zl_next = *zlpp;
3663168404Spjd		*zlpp = zl;
3664168404Spjd
3665219089Spjd		if (oidp == szp->z_id)		/* We're a descendant of szp */
3666249195Smm			return (SET_ERROR(EINVAL));
3667168404Spjd
3668219089Spjd		if (oidp == rootid)		/* We've hit the top */
3669168404Spjd			return (0);
3670168404Spjd
3671168404Spjd		if (rw == RW_READER) {		/* i.e. not the first pass */
3672219089Spjd			int error = zfs_zget(zp->z_zfsvfs, oidp, &zp);
3673168404Spjd			if (error)
3674168404Spjd				return (error);
3675168404Spjd			zl->zl_znode = zp;
3676168404Spjd		}
3677219089Spjd		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs),
3678219089Spjd		    &oidp, sizeof (oidp));
3679168404Spjd		rwlp = &zp->z_parent_lock;
3680168404Spjd		rw = RW_READER;
3681168404Spjd
3682168404Spjd	} while (zp->z_id != sdzp->z_id);
3683168404Spjd
3684168404Spjd	return (0);
3685168404Spjd}
3686168404Spjd
3687168404Spjd/*
3688168404Spjd * Move an entry from the provided source directory to the target
3689168404Spjd * directory.  Change the entry name as indicated.
3690168404Spjd *
3691168404Spjd *	IN:	sdvp	- Source directory containing the "old entry".
3692168404Spjd *		snm	- Old entry name.
3693168404Spjd *		tdvp	- Target directory to contain the "new entry".
3694168404Spjd *		tnm	- New entry name.
3695168404Spjd *		cr	- credentials of caller.
3696185029Spjd *		ct	- caller context
3697185029Spjd *		flags	- case flags
3698168404Spjd *
3699251631Sdelphij *	RETURN:	0 on success, error code on failure.
3700168404Spjd *
3701168404Spjd * Timestamps:
3702168404Spjd *	sdvp,tdvp - ctime|mtime updated
3703168404Spjd */
3704185029Spjd/*ARGSUSED*/
3705168404Spjdstatic int
3706185029Spjdzfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
3707185029Spjd    caller_context_t *ct, int flags)
3708168404Spjd{
3709168404Spjd	znode_t		*tdzp, *szp, *tzp;
3710168404Spjd	znode_t		*sdzp = VTOZ(sdvp);
3711168404Spjd	zfsvfs_t	*zfsvfs = sdzp->z_zfsvfs;
3712185029Spjd	zilog_t		*zilog;
3713168962Spjd	vnode_t		*realvp;
3714168404Spjd	zfs_dirlock_t	*sdl, *tdl;
3715168404Spjd	dmu_tx_t	*tx;
3716168404Spjd	zfs_zlock_t	*zl;
3717185029Spjd	int		cmp, serr, terr;
3718185029Spjd	int		error = 0;
3719185029Spjd	int		zflg = 0;
3720168404Spjd
3721168404Spjd	ZFS_ENTER(zfsvfs);
3722185029Spjd	ZFS_VERIFY_ZP(sdzp);
3723185029Spjd	zilog = zfsvfs->z_log;
3724168404Spjd
3725168962Spjd	/*
3726168962Spjd	 * Make sure we have the real vp for the target directory.
3727168962Spjd	 */
3728185029Spjd	if (VOP_REALVP(tdvp, &realvp, ct) == 0)
3729168962Spjd		tdvp = realvp;
3730168962Spjd
3731254585Sdelphij	tdzp = VTOZ(tdvp);
3732254585Sdelphij	ZFS_VERIFY_ZP(tdzp);
3733254585Sdelphij
3734254585Sdelphij	/*
3735254585Sdelphij	 * We check z_zfsvfs rather than v_vfsp here, because snapshots and the
3736254585Sdelphij	 * ctldir appear to have the same v_vfsp.
3737254585Sdelphij	 */
3738254585Sdelphij	if (tdzp->z_zfsvfs != zfsvfs || zfsctl_is_node(tdvp)) {
3739168404Spjd		ZFS_EXIT(zfsvfs);
3740249195Smm		return (SET_ERROR(EXDEV));
3741168404Spjd	}
3742168404Spjd
3743185029Spjd	if (zfsvfs->z_utf8 && u8_validate(tnm,
3744185029Spjd	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3745185029Spjd		ZFS_EXIT(zfsvfs);
3746249195Smm		return (SET_ERROR(EILSEQ));
3747185029Spjd	}
3748185029Spjd
3749185029Spjd	if (flags & FIGNORECASE)
3750185029Spjd		zflg |= ZCILOOK;
3751185029Spjd
3752168404Spjdtop:
3753168404Spjd	szp = NULL;
3754168404Spjd	tzp = NULL;
3755168404Spjd	zl = NULL;
3756168404Spjd
3757168404Spjd	/*
3758168404Spjd	 * This is to prevent the creation of links into attribute space
3759168404Spjd	 * by renaming a linked file into/outof an attribute directory.
3760168404Spjd	 * See the comment in zfs_link() for why this is considered bad.
3761168404Spjd	 */
3762219089Spjd	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
3763168962Spjd		ZFS_EXIT(zfsvfs);
3764249195Smm		return (SET_ERROR(EINVAL));
3765168404Spjd	}
3766168404Spjd
3767168404Spjd	/*
3768168404Spjd	 * Lock source and target directory entries.  To prevent deadlock,
3769168404Spjd	 * a lock ordering must be defined.  We lock the directory with
3770168404Spjd	 * the smallest object id first, or if it's a tie, the one with
3771168404Spjd	 * the lexically first name.
3772168404Spjd	 */
3773168404Spjd	if (sdzp->z_id < tdzp->z_id) {
3774168962Spjd		cmp = -1;
3775168962Spjd	} else if (sdzp->z_id > tdzp->z_id) {
3776168962Spjd		cmp = 1;
3777168962Spjd	} else {
3778185029Spjd		/*
3779185029Spjd		 * First compare the two name arguments without
3780185029Spjd		 * considering any case folding.
3781185029Spjd		 */
3782185029Spjd		int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
3783185029Spjd
3784185029Spjd		cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
3785185029Spjd		ASSERT(error == 0 || !zfsvfs->z_utf8);
3786168962Spjd		if (cmp == 0) {
3787168962Spjd			/*
3788168962Spjd			 * POSIX: "If the old argument and the new argument
3789168962Spjd			 * both refer to links to the same existing file,
3790168962Spjd			 * the rename() function shall return successfully
3791168962Spjd			 * and perform no other action."
3792168962Spjd			 */
3793168962Spjd			ZFS_EXIT(zfsvfs);
3794168962Spjd			return (0);
3795168962Spjd		}
3796185029Spjd		/*
3797185029Spjd		 * If the file system is case-folding, then we may
3798185029Spjd		 * have some more checking to do.  A case-folding file
3799185029Spjd		 * system is either supporting mixed case sensitivity
3800185029Spjd		 * access or is completely case-insensitive.  Note
3801185029Spjd		 * that the file system is always case preserving.
3802185029Spjd		 *
3803185029Spjd		 * In mixed sensitivity mode case sensitive behavior
3804185029Spjd		 * is the default.  FIGNORECASE must be used to
3805185029Spjd		 * explicitly request case insensitive behavior.
3806185029Spjd		 *
3807185029Spjd		 * If the source and target names provided differ only
3808185029Spjd		 * by case (e.g., a request to rename 'tim' to 'Tim'),
3809185029Spjd		 * we will treat this as a special case in the
3810185029Spjd		 * case-insensitive mode: as long as the source name
3811185029Spjd		 * is an exact match, we will allow this to proceed as
3812185029Spjd		 * a name-change request.
3813185029Spjd		 */
3814185029Spjd		if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
3815185029Spjd		    (zfsvfs->z_case == ZFS_CASE_MIXED &&
3816185029Spjd		    flags & FIGNORECASE)) &&
3817185029Spjd		    u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
3818185029Spjd		    &error) == 0) {
3819185029Spjd			/*
3820185029Spjd			 * case preserving rename request, require exact
3821185029Spjd			 * name matches
3822185029Spjd			 */
3823185029Spjd			zflg |= ZCIEXACT;
3824185029Spjd			zflg &= ~ZCILOOK;
3825185029Spjd		}
3826168962Spjd	}
3827185029Spjd
3828208131Smm	/*
3829208131Smm	 * If the source and destination directories are the same, we should
3830208131Smm	 * grab the z_name_lock of that directory only once.
3831208131Smm	 */
3832208131Smm	if (sdzp == tdzp) {
3833208131Smm		zflg |= ZHAVELOCK;
3834208131Smm		rw_enter(&sdzp->z_name_lock, RW_READER);
3835208131Smm	}
3836208131Smm
3837168962Spjd	if (cmp < 0) {
3838185029Spjd		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
3839185029Spjd		    ZEXISTS | zflg, NULL, NULL);
3840185029Spjd		terr = zfs_dirent_lock(&tdl,
3841185029Spjd		    tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
3842168962Spjd	} else {
3843185029Spjd		terr = zfs_dirent_lock(&tdl,
3844185029Spjd		    tdzp, tnm, &tzp, zflg, NULL, NULL);
3845185029Spjd		serr = zfs_dirent_lock(&sdl,
3846185029Spjd		    sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
3847185029Spjd		    NULL, NULL);
3848168404Spjd	}
3849168404Spjd
3850168962Spjd	if (serr) {
3851168404Spjd		/*
3852168404Spjd		 * Source entry invalid or not there.
3853168404Spjd		 */
3854168962Spjd		if (!terr) {
3855168404Spjd			zfs_dirent_unlock(tdl);
3856168962Spjd			if (tzp)
3857168962Spjd				VN_RELE(ZTOV(tzp));
3858168962Spjd		}
3859208131Smm
3860208131Smm		if (sdzp == tdzp)
3861208131Smm			rw_exit(&sdzp->z_name_lock);
3862208131Smm
3863219089Spjd		/*
3864219089Spjd		 * FreeBSD: In OpenSolaris they only check if rename source is
3865219089Spjd		 * ".." here, because "." is handled in their lookup. This is
3866219089Spjd		 * not the case for FreeBSD, so we check for "." explicitly.
3867219089Spjd		 */
3868168404Spjd		if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0)
3869249195Smm			serr = SET_ERROR(EINVAL);
3870168962Spjd		ZFS_EXIT(zfsvfs);
3871168962Spjd		return (serr);
3872168404Spjd	}
3873168404Spjd	if (terr) {
3874168404Spjd		zfs_dirent_unlock(sdl);
3875168962Spjd		VN_RELE(ZTOV(szp));
3876208131Smm
3877208131Smm		if (sdzp == tdzp)
3878208131Smm			rw_exit(&sdzp->z_name_lock);
3879208131Smm
3880168404Spjd		if (strcmp(tnm, "..") == 0)
3881249195Smm			terr = SET_ERROR(EINVAL);
3882168962Spjd		ZFS_EXIT(zfsvfs);
3883168962Spjd		return (terr);
3884168404Spjd	}
3885168404Spjd
3886168404Spjd	/*
3887168404Spjd	 * Must have write access at the source to remove the old entry
3888168404Spjd	 * and write access at the target to create the new entry.
3889168404Spjd	 * Note that if target and source are the same, this can be
3890168404Spjd	 * done in a single check.
3891168404Spjd	 */
3892168404Spjd
3893168404Spjd	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
3894168404Spjd		goto out;
3895168404Spjd
3896168962Spjd	if (ZTOV(szp)->v_type == VDIR) {
3897168404Spjd		/*
3898168404Spjd		 * Check to make sure rename is valid.
3899168404Spjd		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
3900168404Spjd		 */
3901168404Spjd		if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
3902168404Spjd			goto out;
3903168404Spjd	}
3904168404Spjd
3905168404Spjd	/*
3906168404Spjd	 * Does target exist?
3907168404Spjd	 */
3908168404Spjd	if (tzp) {
3909168404Spjd		/*
3910168404Spjd		 * Source and target must be the same type.
3911168404Spjd		 */
3912168962Spjd		if (ZTOV(szp)->v_type == VDIR) {
3913168962Spjd			if (ZTOV(tzp)->v_type != VDIR) {
3914249195Smm				error = SET_ERROR(ENOTDIR);
3915168404Spjd				goto out;
3916168404Spjd			}
3917168404Spjd		} else {
3918168962Spjd			if (ZTOV(tzp)->v_type == VDIR) {
3919249195Smm				error = SET_ERROR(EISDIR);
3920168404Spjd				goto out;
3921168404Spjd			}
3922168404Spjd		}
3923168404Spjd		/*
3924168404Spjd		 * POSIX dictates that when the source and target
3925168404Spjd		 * entries refer to the same file object, rename
3926168404Spjd		 * must do nothing and exit without error.
3927168404Spjd		 */
3928168404Spjd		if (szp->z_id == tzp->z_id) {
3929168404Spjd			error = 0;
3930168404Spjd			goto out;
3931168404Spjd		}
3932168404Spjd	}
3933168404Spjd
3934185029Spjd	vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
3935168962Spjd	if (tzp)
3936185029Spjd		vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);
3937168962Spjd
3938185029Spjd	/*
3939185029Spjd	 * notify the target directory if it is not the same
3940185029Spjd	 * as source directory.
3941185029Spjd	 */
3942185029Spjd	if (tdvp != sdvp) {
3943185029Spjd		vnevent_rename_dest_dir(tdvp, ct);
3944185029Spjd	}
3945185029Spjd
3946168404Spjd	tx = dmu_tx_create(zfsvfs->z_os);
3947219089Spjd	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3948219089Spjd	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
3949168404Spjd	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
3950168404Spjd	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
3951219089Spjd	if (sdzp != tdzp) {
3952219089Spjd		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
3953219089Spjd		zfs_sa_upgrade_txholds(tx, tdzp);
3954219089Spjd	}
3955219089Spjd	if (tzp) {
3956219089Spjd		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
3957219089Spjd		zfs_sa_upgrade_txholds(tx, tzp);
3958219089Spjd	}
3959219089Spjd
3960219089Spjd	zfs_sa_upgrade_txholds(tx, szp);
3961168404Spjd	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3962209962Smm	error = dmu_tx_assign(tx, TXG_NOWAIT);
3963168404Spjd	if (error) {
3964168404Spjd		if (zl != NULL)
3965168404Spjd			zfs_rename_unlock(&zl);
3966168404Spjd		zfs_dirent_unlock(sdl);
3967168404Spjd		zfs_dirent_unlock(tdl);
3968208131Smm
3969208131Smm		if (sdzp == tdzp)
3970208131Smm			rw_exit(&sdzp->z_name_lock);
3971208131Smm
3972168962Spjd		VN_RELE(ZTOV(szp));
3973168962Spjd		if (tzp)
3974168962Spjd			VN_RELE(ZTOV(tzp));
3975209962Smm		if (error == ERESTART) {
3976168404Spjd			dmu_tx_wait(tx);
3977168404Spjd			dmu_tx_abort(tx);
3978168404Spjd			goto top;
3979168404Spjd		}
3980168404Spjd		dmu_tx_abort(tx);
3981168962Spjd		ZFS_EXIT(zfsvfs);
3982168962Spjd		return (error);
3983168404Spjd	}
3984168404Spjd
3985168404Spjd	if (tzp)	/* Attempt to remove the existing target */
3986185029Spjd		error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
3987168404Spjd
3988168404Spjd	if (error == 0) {
3989168404Spjd		error = zfs_link_create(tdl, szp, tx, ZRENAMING);
3990168404Spjd		if (error == 0) {
3991219089Spjd			szp->z_pflags |= ZFS_AV_MODIFIED;
3992185029Spjd
3993219089Spjd			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
3994219089Spjd			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
3995240415Smm			ASSERT0(error);
3996219089Spjd
3997168404Spjd			error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
3998219089Spjd			if (error == 0) {
3999219089Spjd				zfs_log_rename(zilog, tx, TX_RENAME |
4000219089Spjd				    (flags & FIGNORECASE ? TX_CI : 0), sdzp,
4001219089Spjd				    sdl->dl_name, tdzp, tdl->dl_name, szp);
4002185029Spjd
4003219089Spjd				/*
4004219089Spjd				 * Update path information for the target vnode
4005219089Spjd				 */
4006219089Spjd				vn_renamepath(tdvp, ZTOV(szp), tnm,
4007219089Spjd				    strlen(tnm));
4008219089Spjd			} else {
4009219089Spjd				/*
4010219089Spjd				 * At this point, we have successfully created
4011219089Spjd				 * the target name, but have failed to remove
4012219089Spjd				 * the source name.  Since the create was done
4013219089Spjd				 * with the ZRENAMING flag, there are
4014219089Spjd				 * complications; for one, the link count is
4015219089Spjd				 * wrong.  The easiest way to deal with this
4016219089Spjd				 * is to remove the newly created target, and
4017219089Spjd				 * return the original error.  This must
4018219089Spjd				 * succeed; fortunately, it is very unlikely to
4019219089Spjd				 * fail, since we just created it.
4020219089Spjd				 */
4021219089Spjd				VERIFY3U(zfs_link_destroy(tdl, szp, tx,
4022219089Spjd				    ZRENAMING, NULL), ==, 0);
4023219089Spjd			}
4024168404Spjd		}
4025168404Spjd#ifdef FREEBSD_NAMECACHE
4026168404Spjd		if (error == 0) {
4027168404Spjd			cache_purge(sdvp);
4028168404Spjd			cache_purge(tdvp);
4029240829Spjd			cache_purge(ZTOV(szp));
4030240829Spjd			if (tzp)
4031240829Spjd				cache_purge(ZTOV(tzp));
4032168404Spjd		}
4033168404Spjd#endif
4034168404Spjd	}
4035168404Spjd
4036168404Spjd	dmu_tx_commit(tx);
4037168404Spjdout:
4038168404Spjd	if (zl != NULL)
4039168404Spjd		zfs_rename_unlock(&zl);
4040168404Spjd
4041168404Spjd	zfs_dirent_unlock(sdl);
4042168404Spjd	zfs_dirent_unlock(tdl);
4043168404Spjd
4044208131Smm	if (sdzp == tdzp)
4045208131Smm		rw_exit(&sdzp->z_name_lock);
4046208131Smm
4047219089Spjd
4048168962Spjd	VN_RELE(ZTOV(szp));
4049168404Spjd	if (tzp)
4050168962Spjd		VN_RELE(ZTOV(tzp));
4051168404Spjd
4052219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4053219089Spjd		zil_commit(zilog, 0);
4054219089Spjd
4055168404Spjd	ZFS_EXIT(zfsvfs);
4056168404Spjd
4057168404Spjd	return (error);
4058168404Spjd}
4059168404Spjd
4060168404Spjd/*
4061168404Spjd * Insert the indicated symbolic reference entry into the directory.
4062168404Spjd *
4063168404Spjd *	IN:	dvp	- Directory to contain new symbolic link.
4064168404Spjd *		link	- Name for new symlink entry.
4065168404Spjd *		vap	- Attributes of new entry.
4066168404Spjd *		cr	- credentials of caller.
4067185029Spjd *		ct	- caller context
4068185029Spjd *		flags	- case flags
4069168404Spjd *
4070251631Sdelphij *	RETURN:	0 on success, error code on failure.
4071168404Spjd *
4072168404Spjd * Timestamps:
4073168404Spjd *	dvp - ctime|mtime updated
4074168404Spjd */
4075185029Spjd/*ARGSUSED*/
4076168404Spjdstatic int
4077185029Spjdzfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
4078185029Spjd    cred_t *cr, kthread_t *td)
4079168404Spjd{
4080168404Spjd	znode_t		*zp, *dzp = VTOZ(dvp);
4081168404Spjd	zfs_dirlock_t	*dl;
4082168404Spjd	dmu_tx_t	*tx;
4083168404Spjd	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
4084185029Spjd	zilog_t		*zilog;
4085219089Spjd	uint64_t	len = strlen(link);
4086168404Spjd	int		error;
4087185029Spjd	int		zflg = ZNEW;
4088209962Smm	zfs_acl_ids_t	acl_ids;
4089209962Smm	boolean_t	fuid_dirtied;
4090219089Spjd	uint64_t	txtype = TX_SYMLINK;
4091185029Spjd	int		flags = 0;
4092168404Spjd
4093168962Spjd	ASSERT(vap->va_type == VLNK);
4094168404Spjd
4095168404Spjd	ZFS_ENTER(zfsvfs);
4096185029Spjd	ZFS_VERIFY_ZP(dzp);
4097185029Spjd	zilog = zfsvfs->z_log;
4098185029Spjd
4099185029Spjd	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
4100185029Spjd	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4101185029Spjd		ZFS_EXIT(zfsvfs);
4102249195Smm		return (SET_ERROR(EILSEQ));
4103185029Spjd	}
4104185029Spjd	if (flags & FIGNORECASE)
4105185029Spjd		zflg |= ZCILOOK;
4106168404Spjd
4107168404Spjd	if (len > MAXPATHLEN) {
4108168404Spjd		ZFS_EXIT(zfsvfs);
4109249195Smm		return (SET_ERROR(ENAMETOOLONG));
4110168404Spjd	}
4111168404Spjd
4112219089Spjd	if ((error = zfs_acl_ids_create(dzp, 0,
4113219089Spjd	    vap, cr, NULL, &acl_ids)) != 0) {
4114219089Spjd		ZFS_EXIT(zfsvfs);
4115219089Spjd		return (error);
4116219089Spjd	}
4117219089Spjdtop:
4118168404Spjd	/*
4119168404Spjd	 * Attempt to lock directory; fail if entry already exists.
4120168404Spjd	 */
4121185029Spjd	error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
4122185029Spjd	if (error) {
4123219089Spjd		zfs_acl_ids_free(&acl_ids);
4124168404Spjd		ZFS_EXIT(zfsvfs);
4125168404Spjd		return (error);
4126168404Spjd	}
4127168404Spjd
4128219089Spjd	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4129219089Spjd		zfs_acl_ids_free(&acl_ids);
4130219089Spjd		zfs_dirent_unlock(dl);
4131219089Spjd		ZFS_EXIT(zfsvfs);
4132219089Spjd		return (error);
4133219089Spjd	}
4134219089Spjd
4135209962Smm	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
4136209962Smm		zfs_acl_ids_free(&acl_ids);
4137209962Smm		zfs_dirent_unlock(dl);
4138209962Smm		ZFS_EXIT(zfsvfs);
4139249195Smm		return (SET_ERROR(EDQUOT));
4140209962Smm	}
4141168404Spjd	tx = dmu_tx_create(zfsvfs->z_os);
4142209962Smm	fuid_dirtied = zfsvfs->z_fuid_dirty;
4143168404Spjd	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
4144168404Spjd	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4145219089Spjd	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
4146219089Spjd	    ZFS_SA_BASE_ATTR_SIZE + len);
4147219089Spjd	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
4148219089Spjd	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
4149219089Spjd		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
4150219089Spjd		    acl_ids.z_aclp->z_acl_bytes);
4151219089Spjd	}
4152209962Smm	if (fuid_dirtied)
4153209962Smm		zfs_fuid_txhold(zfsvfs, tx);
4154209962Smm	error = dmu_tx_assign(tx, TXG_NOWAIT);
4155168404Spjd	if (error) {
4156168404Spjd		zfs_dirent_unlock(dl);
4157209962Smm		if (error == ERESTART) {
4158168404Spjd			dmu_tx_wait(tx);
4159168404Spjd			dmu_tx_abort(tx);
4160168404Spjd			goto top;
4161168404Spjd		}
4162219089Spjd		zfs_acl_ids_free(&acl_ids);
4163168404Spjd		dmu_tx_abort(tx);
4164168404Spjd		ZFS_EXIT(zfsvfs);
4165168404Spjd		return (error);
4166168404Spjd	}
4167168404Spjd
4168168404Spjd	/*
4169168404Spjd	 * Create a new object for the symlink.
4170219089Spjd	 * for version 4 ZPL datsets the symlink will be an SA attribute
4171168404Spjd	 */
4172219089Spjd	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
4173168404Spjd
4174219089Spjd	if (fuid_dirtied)
4175219089Spjd		zfs_fuid_sync(zfsvfs, tx);
4176209962Smm
4177219089Spjd	mutex_enter(&zp->z_lock);
4178219089Spjd	if (zp->z_is_sa)
4179219089Spjd		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
4180219089Spjd		    link, len, tx);
4181219089Spjd	else
4182219089Spjd		zfs_sa_symlink(zp, link, len, tx);
4183219089Spjd	mutex_exit(&zp->z_lock);
4184168404Spjd
4185219089Spjd	zp->z_size = len;
4186219089Spjd	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
4187219089Spjd	    &zp->z_size, sizeof (zp->z_size), tx);
4188168404Spjd	/*
4189168404Spjd	 * Insert the new object into the directory.
4190168404Spjd	 */
4191168404Spjd	(void) zfs_link_create(dl, zp, tx, ZNEW);
4192168404Spjd
4193219089Spjd	if (flags & FIGNORECASE)
4194219089Spjd		txtype |= TX_CI;
4195219089Spjd	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
4196219089Spjd	*vpp = ZTOV(zp);
4197219089Spjd
4198209962Smm	zfs_acl_ids_free(&acl_ids);
4199209962Smm
4200168404Spjd	dmu_tx_commit(tx);
4201168404Spjd
4202168404Spjd	zfs_dirent_unlock(dl);
4203168404Spjd
4204219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4205219089Spjd		zil_commit(zilog, 0);
4206219089Spjd
4207168404Spjd	ZFS_EXIT(zfsvfs);
4208168404Spjd	return (error);
4209168404Spjd}
4210168404Spjd
4211168404Spjd/*
4212168404Spjd * Return, in the buffer contained in the provided uio structure,
4213168404Spjd * the symbolic path referred to by vp.
4214168404Spjd *
4215168404Spjd *	IN:	vp	- vnode of symbolic link.
4216251631Sdelphij *		uio	- structure to contain the link path.
4217168404Spjd *		cr	- credentials of caller.
4218185029Spjd *		ct	- caller context
4219168404Spjd *
4220251631Sdelphij *	OUT:	uio	- structure containing the link path.
4221168404Spjd *
4222251631Sdelphij *	RETURN:	0 on success, error code on failure.
4223168404Spjd *
4224168404Spjd * Timestamps:
4225168404Spjd *	vp - atime updated
 *
 * Neither cr nor ct is referenced by the body.
4226168404Spjd */
4227168404Spjd/* ARGSUSED */
4228168404Spjdstatic int
4229185029Spjdzfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
4230168404Spjd{
4231168404Spjd	znode_t		*zp = VTOZ(vp);
4232168404Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4233168404Spjd	int		error;
4234168404Spjd
4235168404Spjd	ZFS_ENTER(zfsvfs);
4236185029Spjd	ZFS_VERIFY_ZP(zp);
4237168404Spjd
	/*
	 * z_lock is held across the read of the link target.  A znode
	 * flagged z_is_sa has the target read straight out of its
	 * SA_ZPL_SYMLINK attribute; any other znode goes through
	 * zfs_sa_readlink().
	 */
4238219089Spjd	mutex_enter(&zp->z_lock);
4239219089Spjd	if (zp->z_is_sa)
4240219089Spjd		error = sa_lookup_uio(zp->z_sa_hdl,
4241219089Spjd		    SA_ZPL_SYMLINK(zfsvfs), uio);
4242219089Spjd	else
4243219089Spjd		error = zfs_sa_readlink(zp, uio);
4244219089Spjd	mutex_exit(&zp->z_lock);
4245168404Spjd
	/* Invoked unconditionally, i.e. also when the read above failed. */
4246168404Spjd	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4247219089Spjd
4248168404Spjd	ZFS_EXIT(zfsvfs);
4249168404Spjd	return (error);
4250168404Spjd}
4251168404Spjd
4252168404Spjd/*
4253168404Spjd * Insert a new entry into directory tdvp referencing svp.
4254168404Spjd *
4255168404Spjd *	IN:	tdvp	- Directory to contain new entry.
4256168404Spjd *		svp	- vnode of new entry.
4257168404Spjd *		name	- name of new entry.
4258168404Spjd *		cr	- credentials of caller.
4259185029Spjd *		ct	- caller context
 *		flags	- FIGNORECASE adds ZCILOOK to the directory-entry
 *			  lock flags and TX_CI to the logged record type.
4260168404Spjd *
4261251631Sdelphij *	RETURN:	0 on success, error code on failure.
4262168404Spjd *
4263168404Spjd * Timestamps:
4264168404Spjd *	tdvp - ctime|mtime updated
4265168404Spjd *	 svp - ctime updated
4266168404Spjd */
4267168404Spjd/* ARGSUSED */
4268168404Spjdstatic int
4269185029Spjdzfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
4270185029Spjd    caller_context_t *ct, int flags)
4271168404Spjd{
4272168404Spjd	znode_t		*dzp = VTOZ(tdvp);
4273168404Spjd	znode_t		*tzp, *szp;
4274168404Spjd	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
4275185029Spjd	zilog_t		*zilog;
4276168404Spjd	zfs_dirlock_t	*dl;
4277168404Spjd	dmu_tx_t	*tx;
4278168962Spjd	vnode_t		*realvp;
4279168404Spjd	int		error;
4280185029Spjd	int		zf = ZNEW;
4281212694Smm	uint64_t	parent;
4282185029Spjd	uid_t		owner;
4283168404Spjd
4284168404Spjd	ASSERT(tdvp->v_type == VDIR);
4285168404Spjd
4286168404Spjd	ZFS_ENTER(zfsvfs);
4287185029Spjd	ZFS_VERIFY_ZP(dzp);
4288185029Spjd	zilog = zfsvfs->z_log;
4289168404Spjd
	/* If VOP_REALVP() succeeds, operate on the vnode it returns. */
4290185029Spjd	if (VOP_REALVP(svp, &realvp, ct) == 0)
4291168962Spjd		svp = realvp;
4292168962Spjd
4293212694Smm	/*
4294212694Smm	 * POSIX dictates that we return EPERM here.
4295212694Smm	 * Better choices include ENOTSUP or EISDIR.
4296212694Smm	 */
4297212694Smm	if (svp->v_type == VDIR) {
4298168404Spjd		ZFS_EXIT(zfsvfs);
4299249195Smm		return (SET_ERROR(EPERM));
4300212694Smm	}
4301212694Smm
4302254585Sdelphij	szp = VTOZ(svp);
4303254585Sdelphij	ZFS_VERIFY_ZP(szp);
4304254585Sdelphij
4305254585Sdelphij	/*
4306254585Sdelphij	 * We check z_zfsvfs rather than v_vfsp here, because snapshots and the
4307254585Sdelphij	 * ctldir appear to have the same v_vfsp.
4308254585Sdelphij	 */
4309254585Sdelphij	if (szp->z_zfsvfs != zfsvfs || zfsctl_is_node(svp)) {
4310212694Smm		ZFS_EXIT(zfsvfs);
4311249195Smm		return (SET_ERROR(EXDEV));
4312168404Spjd	}
4313212694Smm
4314212694Smm	/* Prevent links to .zfs/shares files */
4315212694Smm
4316219089Spjd	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
4317219089Spjd	    &parent, sizeof (uint64_t))) != 0) {
4318212694Smm		ZFS_EXIT(zfsvfs);
4319219089Spjd		return (error);
4320219089Spjd	}
4321219089Spjd	if (parent == zfsvfs->z_shares_dir) {
4322219089Spjd		ZFS_EXIT(zfsvfs);
4323249195Smm		return (SET_ERROR(EPERM));
4324212694Smm	}
4325212694Smm
	/* On a z_utf8 file system the new name must pass u8_validate(). */
4326185029Spjd	if (zfsvfs->z_utf8 && u8_validate(name,
4327185029Spjd	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4328185029Spjd		ZFS_EXIT(zfsvfs);
4329249195Smm		return (SET_ERROR(EILSEQ));
4330185029Spjd	}
4331185029Spjd	if (flags & FIGNORECASE)
4332185029Spjd		zf |= ZCILOOK;
4333185029Spjd
4334168404Spjd	/*
4335168404Spjd	 * We do not support links between attributes and non-attributes
4336168404Spjd	 * because of the potential security risk of creating links
4337168404Spjd	 * into "normal" file space in order to circumvent restrictions
4338168404Spjd	 * imposed in attribute space.
4339168404Spjd	 */
4340219089Spjd	if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
4341168404Spjd		ZFS_EXIT(zfsvfs);
4342249195Smm		return (SET_ERROR(EINVAL));
4343168404Spjd	}
4344168404Spjd
4345168404Spjd
	/*
	 * A caller who is not the mapped owner of the source file needs
	 * secpolicy_basic_link() to approve the link; otherwise EPERM.
	 */
4346219089Spjd	owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
4347219089Spjd	if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) {
4348168404Spjd		ZFS_EXIT(zfsvfs);
4349249195Smm		return (SET_ERROR(EPERM));
4350168404Spjd	}
4351168404Spjd
	/* The caller must be granted ACE_ADD_FILE on the target directory. */
4352185029Spjd	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4353168404Spjd		ZFS_EXIT(zfsvfs);
4354168404Spjd		return (error);
4355168404Spjd	}
4356168404Spjd
4357212694Smmtop:
4358168404Spjd	/*
4359168404Spjd	 * Attempt to lock directory; fail if entry already exists.
4360168404Spjd	 */
4361185029Spjd	error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
4362185029Spjd	if (error) {
4363168404Spjd		ZFS_EXIT(zfsvfs);
4364168404Spjd		return (error);
4365168404Spjd	}
4366168404Spjd
4367168404Spjd	tx = dmu_tx_create(zfsvfs->z_os);
4368219089Spjd	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
4369168404Spjd	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4370219089Spjd	zfs_sa_upgrade_txholds(tx, szp);
4371219089Spjd	zfs_sa_upgrade_txholds(tx, dzp);
4372209962Smm	error = dmu_tx_assign(tx, TXG_NOWAIT);
4373168404Spjd	if (error) {
		/*
		 * The dirent lock is dropped before waiting; on ERESTART
		 * the whole lock + tx sequence is retried from `top'.
		 */
4374168404Spjd		zfs_dirent_unlock(dl);
4375209962Smm		if (error == ERESTART) {
4376168404Spjd			dmu_tx_wait(tx);
4377168404Spjd			dmu_tx_abort(tx);
4378168404Spjd			goto top;
4379168404Spjd		}
4380168404Spjd		dmu_tx_abort(tx);
4381168404Spjd		ZFS_EXIT(zfsvfs);
4382168404Spjd		return (error);
4383168404Spjd	}
4384168404Spjd
4385168404Spjd	error = zfs_link_create(dl, szp, tx, 0);
4386168404Spjd
	/* Only a successful link is logged; the tx is committed either way. */
4387185029Spjd	if (error == 0) {
4388185029Spjd		uint64_t txtype = TX_LINK;
4389185029Spjd		if (flags & FIGNORECASE)
4390185029Spjd			txtype |= TX_CI;
4391185029Spjd		zfs_log_link(zilog, tx, txtype, dzp, szp, name);
4392185029Spjd	}
4393168404Spjd
4394168404Spjd	dmu_tx_commit(tx);
4395168404Spjd
4396168404Spjd	zfs_dirent_unlock(dl);
4397168404Spjd
4398185029Spjd	if (error == 0) {
4399185029Spjd		vnevent_link(svp, ct);
4400185029Spjd	}
4401185029Spjd
	/* With sync=always the intent log is committed before returning. */
4402219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4403219089Spjd		zil_commit(zilog, 0);
4404219089Spjd
4405168404Spjd	ZFS_EXIT(zfsvfs);
4406168404Spjd	return (error);
4407168404Spjd}
4408168404Spjd
4409219089Spjd#ifdef sun
4410219089Spjd/*
4411219089Spjd * zfs_null_putapage() is used when the file system has been force
4412219089Spjd * unmounted. It just drops the pages.
 *
 * The page list pp is completed with B_INVAL|B_FORCE|B_ERROR; vp, offp,
 * lenp, flags and cr are not referenced.  Always returns 0.
4413219089Spjd */
4414219089Spjd/* ARGSUSED */
4415219089Spjdstatic int
4416219089Spjdzfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
4417219089Spjd		size_t *lenp, int flags, cred_t *cr)
4418219089Spjd{
4419219089Spjd	pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR);
4420219089Spjd	return (0);
4421219089Spjd}
4422219089Spjd
4423219089Spjd/*
4424219089Spjd * Push a page out to disk, klustering if possible.
4425219089Spjd *
4426219089Spjd *	IN:	vp	- file to push page to.
4427219089Spjd *		pp	- page to push.
4428219089Spjd *		flags	- additional flags.
4429219089Spjd *		cr	- credentials of caller.
4430219089Spjd *
4431219089Spjd *	OUT:	offp	- start of range pushed.
4432219089Spjd *		lenp	- len of range pushed.
4433219089Spjd *
4434251631Sdelphij *	RETURN:	0 on success, error code on failure.
4435219089Spjd *
4436219089Spjd * NOTE: callers must have locked the page to be pushed.  On
4437219089Spjd * exit, the page (and all other pages in the kluster) must be
4438219089Spjd * unlocked.
 *
 * offp and lenp may each be NULL; they are only written when non-NULL.
 * On success the mtime/ctime/flags attributes staged in the bulk array
 * are written with sa_bulk_update() in the same transaction as the data.
4439219089Spjd */
4440219089Spjd/* ARGSUSED */
4441219089Spjdstatic int
4442219089Spjdzfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
4443219089Spjd		size_t *lenp, int flags, cred_t *cr)
4444219089Spjd{
4445219089Spjd	znode_t		*zp = VTOZ(vp);
4446219089Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4447219089Spjd	dmu_tx_t	*tx;
4448219089Spjd	u_offset_t	off, koff;
4449219089Spjd	size_t		len, klen;
4450219089Spjd	int		err;
4451219089Spjd
4452219089Spjd	off = pp->p_offset;
4453219089Spjd	len = PAGESIZE;
4454219089Spjd	/*
4455219089Spjd	 * If our blocksize is bigger than the page size, try to kluster
4456219089Spjd	 * multiple pages so that we write a full block (thus avoiding
4457219089Spjd	 * a read-modify-write).
4458219089Spjd	 */
4459219089Spjd	if (off < zp->z_size && zp->z_blksz > PAGESIZE) {
4460219089Spjd		klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
		/* A non-power-of-two kluster length starts at offset 0. */
4461219089Spjd		koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0;
4462219089Spjd		ASSERT(koff <= zp->z_size);
4463219089Spjd		if (koff + klen > zp->z_size)
4464219089Spjd			klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE);
4465219089Spjd		pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
4466219089Spjd	}
4467219089Spjd	ASSERT3U(btop(len), ==, btopr(len));
4468219089Spjd
4469219089Spjd	/*
4470219089Spjd	 * Can't push pages past end-of-file.
4471219089Spjd	 */
4472219089Spjd	if (off >= zp->z_size) {
4473219089Spjd		/* ignore all pages */
4474219089Spjd		err = 0;
4475219089Spjd		goto out;
4476219089Spjd	} else if (off + len > zp->z_size) {
4477219089Spjd		int npages = btopr(zp->z_size - off);
4478219089Spjd		page_t *trunc;
4479219089Spjd
4480219089Spjd		page_list_break(&pp, &trunc, npages);
4481219089Spjd		/* ignore pages past end of file */
4482219089Spjd		if (trunc)
4483219089Spjd			pvn_write_done(trunc, flags);
4484219089Spjd		len = zp->z_size - off;
4485219089Spjd	}
4486219089Spjd
	/* Refuse the write if either overquota check (B_FALSE/B_TRUE) trips. */
4487219089Spjd	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
4488219089Spjd	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
4489249195Smm		err = SET_ERROR(EDQUOT);
4490219089Spjd		goto out;
4491219089Spjd	}
4492219089Spjdtop:
4493219089Spjd	tx = dmu_tx_create(zfsvfs->z_os);
4494219089Spjd	dmu_tx_hold_write(tx, zp->z_id, off, len);
4495219089Spjd
4496219089Spjd	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4497219089Spjd	zfs_sa_upgrade_txholds(tx, zp);
4498219089Spjd	err = dmu_tx_assign(tx, TXG_NOWAIT);
4499219089Spjd	if (err != 0) {
		/* ERESTART: wait, discard this tx and build a fresh one. */
4500219089Spjd		if (err == ERESTART) {
4501219089Spjd			dmu_tx_wait(tx);
4502219089Spjd			dmu_tx_abort(tx);
4503219089Spjd			goto top;
4504219089Spjd		}
4505219089Spjd		dmu_tx_abort(tx);
4506219089Spjd		goto out;
4507219089Spjd	}
4508219089Spjd
	/*
	 * A block no larger than a page is written from a temporary
	 * mapping of the single page; otherwise the page list is handed
	 * to dmu_write_pages().
	 */
4509219089Spjd	if (zp->z_blksz <= PAGESIZE) {
4510219089Spjd		caddr_t va = zfs_map_page(pp, S_READ);
4511219089Spjd		ASSERT3U(len, <=, PAGESIZE);
4512219089Spjd		dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
4513219089Spjd		zfs_unmap_page(pp, va);
4514219089Spjd	} else {
4515219089Spjd		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
4516219089Spjd	}
4517219089Spjd
4518219089Spjd	if (err == 0) {
4519219089Spjd		uint64_t mtime[2], ctime[2];
4520219089Spjd		sa_bulk_attr_t bulk[3];
4521219089Spjd		int count = 0;
4522219089Spjd
4523219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
4524219089Spjd		    &mtime, 16);
4525219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
4526219089Spjd		    &ctime, 16);
4527219089Spjd		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
4528219089Spjd		    &zp->z_pflags, 8);
4529219089Spjd		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
4530219089Spjd		    B_TRUE);
		/*
		 * Hand the staged attributes to the SA layer.  Without
		 * this call the bulk array and the mtime/ctime locals
		 * set up above are never consumed, so the attribute
		 * update is never issued in this transaction.
		 */
		err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
		ASSERT(err == 0);
4531219089Spjd		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
4532219089Spjd	}
4533219089Spjd	dmu_tx_commit(tx);
4534219089Spjd
4535219089Spjdout:
	/* Complete the pages, flagging B_ERROR if anything above failed. */
4536219089Spjd	pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
4537219089Spjd	if (offp)
4538219089Spjd		*offp = off;
4539219089Spjd	if (lenp)
4540219089Spjd		*lenp = len;
4541219089Spjd
4542219089Spjd	return (err);
4543219089Spjd}
4544219089Spjd
4545219089Spjd/*
4546219089Spjd * Copy the portion of the file indicated from pages into the file.
4547219089Spjd * The pages are stored in a page list attached to the files vnode.
4548219089Spjd *
4549219089Spjd *	IN:	vp	- vnode of file to push page data to.
4550219089Spjd *		off	- position in file to put data.
4551219089Spjd *		len	- amount of data to write.
4552219089Spjd *		flags	- flags to control the operation.
4553219089Spjd *		cr	- credentials of caller.
4554219089Spjd *		ct	- caller context.
4555219089Spjd *
4556251631Sdelphij *	RETURN:	0 on success, error code on failure.
4557219089Spjd *
4558219089Spjd * Timestamps:
4559219089Spjd *	vp - ctime|mtime updated
4560219089Spjd */
4561185029Spjd/*ARGSUSED*/
4562219089Spjdstatic int
4563219089Spjdzfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
4564219089Spjd    caller_context_t *ct)
4565219089Spjd{
4566219089Spjd	znode_t		*zp = VTOZ(vp);
4567219089Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4568219089Spjd	page_t		*pp;
4569219089Spjd	size_t		io_len;
4570219089Spjd	u_offset_t	io_off;
4571219089Spjd	uint_t		blksz;
4572219089Spjd	rl_t		*rl;
4573219089Spjd	int		error = 0;
4574219089Spjd
4575219089Spjd	ZFS_ENTER(zfsvfs);
4576219089Spjd	ZFS_VERIFY_ZP(zp);
4577219089Spjd
4578219089Spjd	/*
4579219089Spjd	 * Align this request to the file block size in case we kluster.
4580219089Spjd	 * XXX - this can result in pretty aggressive locking, which can
4581219089Spjd	 * impact simultaneous read/write access.  One option might be
4582219089Spjd	 * to break up long requests (len == 0) into block-by-block
4583219089Spjd	 * operations to get narrower locking.
4584219089Spjd	 */
4585219089Spjd	blksz = zp->z_blksz;
4586219089Spjd	if (ISP2(blksz))
4587219089Spjd		io_off = P2ALIGN_TYPED(off, blksz, u_offset_t);
4588219089Spjd	else
4589219089Spjd		io_off = 0;
	/*
	 * io_len ends up 0 both for len == 0 and for a block size that
	 * is not a power of two; either way the whole-list path below
	 * is taken.
	 */
4590219089Spjd	if (len > 0 && ISP2(blksz))
4591219089Spjd		io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
4592219089Spjd	else
4593219089Spjd		io_len = 0;
4594219089Spjd
4595219089Spjd	if (io_len == 0) {
4596219089Spjd		/*
4597219089Spjd		 * Search the entire vp list for pages >= io_off.
4598219089Spjd		 */
4599219089Spjd		rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER);
4600219089Spjd		error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
4601219089Spjd		goto out;
4602219089Spjd	}
4603219089Spjd	rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER);
4604219089Spjd
4605219089Spjd	if (off > zp->z_size) {
4606219089Spjd		/* past end of file */
		/* This path returns without reaching the zil_commit() at `out'. */
4607219089Spjd		zfs_range_unlock(rl);
4608219089Spjd		ZFS_EXIT(zfsvfs);
4609219089Spjd		return (0);
4610219089Spjd	}
4611219089Spjd
	/* Clamp the range to the page-rounded end of file. */
4612219089Spjd	len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off);
4613219089Spjd
	/*
	 * Walk [io_off, io_off + len).  The step io_len is either the
	 * length zfs_putapage() reports back through its out parameter
	 * or PAGESIZE when no dirty page was found at io_off.
	 */
4614219089Spjd	for (off = io_off; io_off < off + len; io_off += io_len) {
		/*
		 * B_INVAL or a request without B_ASYNC uses page_lookup();
		 * otherwise page_lookup_nowait() is used.
		 */
4615219089Spjd		if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
4616219089Spjd			pp = page_lookup(vp, io_off,
4617219089Spjd			    (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
4618219089Spjd		} else {
4619219089Spjd			pp = page_lookup_nowait(vp, io_off,
4620219089Spjd			    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
4621219089Spjd		}
4622219089Spjd
4623219089Spjd		if (pp != NULL && pvn_getdirty(pp, flags)) {
4624219089Spjd			int err;
4625219089Spjd
4626219089Spjd			/*
4627219089Spjd			 * Found a dirty page to push
4628219089Spjd			 */
4629219089Spjd			err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr);
			/* Keep going; the most recent failure is returned. */
4630219089Spjd			if (err)
4631219089Spjd				error = err;
4632219089Spjd		} else {
4633219089Spjd			io_len = PAGESIZE;
4634219089Spjd		}
4635219089Spjd	}
4636219089Spjdout:
4637219089Spjd	zfs_range_unlock(rl);
	/* Commit the intent log unless B_ASYNC is set and sync != always. */
4638219089Spjd	if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4639219089Spjd		zil_commit(zfsvfs->z_log, zp->z_id);
4640219089Spjd	ZFS_EXIT(zfsvfs);
4641219089Spjd	return (error);
4642219089Spjd}
4643219089Spjd#endif	/* sun */
4644219089Spjd
/*
 * Inactive handler for a ZFS vnode.
 *
 * If the znode has no SA handle, or is marked unlinked, the vnode is
 * handed to vrecycle() and nothing else is done.  Otherwise a dirty
 * atime (z_atime_dirty) is written to the SA_ZPL_ATIME attribute in its
 * own transaction.
 *
 *	IN:	vp	- vnode being deactivated.
 *		cr	- credentials of caller (not referenced).
 *		ct	- caller context (not referenced).
 *
 * z_teardown_inactive_lock is held as reader for the duration and is
 * released before each vrecycle() call.
 */
4645219089Spjd/*ARGSUSED*/
4646168962Spjdvoid
4647185029Spjdzfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4648168404Spjd{
4649168962Spjd	znode_t	*zp = VTOZ(vp);
4650168962Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4651168962Spjd	int error;
4652168404Spjd
4653185029Spjd	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4654219089Spjd	if (zp->z_sa_hdl == NULL) {
4655185029Spjd		/*
4656185029Spjd		 * The fs has been unmounted, or we did a
4657185029Spjd		 * suspend/resume and this file no longer exists.
4658185029Spjd		 */
4659243520Savg		rw_exit(&zfsvfs->z_teardown_inactive_lock);
4660234607Strasz		vrecycle(vp);
4661243520Savg		return;
4662243520Savg	}
4663243520Savg
4664243520Savg	mutex_enter(&zp->z_lock);
4665243520Savg	if (zp->z_unlinked) {
4666243520Savg		/*
4667243520Savg		 * Fast path to recycle a vnode of a removed file.
4668243520Savg		 */
4669243520Savg		mutex_exit(&zp->z_lock);
4670185029Spjd		rw_exit(&zfsvfs->z_teardown_inactive_lock);
4671243520Savg		vrecycle(vp);
4672168962Spjd		return;
4673168404Spjd	}
4674243520Savg	mutex_exit(&zp->z_lock);
4675168404Spjd
	/* z_unlinked is re-read here without z_lock held. */
4676168404Spjd	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
4677168404Spjd		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
4678168404Spjd
4679219089Spjd		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4680219089Spjd		zfs_sa_upgrade_txholds(tx, zp);
4681168404Spjd		error = dmu_tx_assign(tx, TXG_WAIT);
4682168404Spjd		if (error) {
			/* Assignment failed: atime stays dirty, error is dropped. */
4683168404Spjd			dmu_tx_abort(tx);
4684168404Spjd		} else {
4685168404Spjd			mutex_enter(&zp->z_lock);
4686219089Spjd			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
4687219089Spjd			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
4688168404Spjd			zp->z_atime_dirty = 0;
4689168404Spjd			mutex_exit(&zp->z_lock);
4690168404Spjd			dmu_tx_commit(tx);
4691168404Spjd		}
4692168404Spjd	}
4693185029Spjd	rw_exit(&zfsvfs->z_teardown_inactive_lock);
4694168404Spjd}
4695168404Spjd
4696219089Spjd#ifdef sun
4697219089Spjd/*
4698219089Spjd * Bounds-check the seek operation.
4699219089Spjd *
4700219089Spjd *	IN:	vp	- vnode seeking within
4701219089Spjd *		ooff	- old file offset
4702219089Spjd *		noffp	- pointer to new file offset
4703219089Spjd *		ct	- caller context
4704219089Spjd *
4705251631Sdelphij *	RETURN:	0 on success, EINVAL if new offset invalid.
 *
 * ooff and ct are not referenced; *noffp is read but never modified.
4706219089Spjd */
4707219089Spjd/* ARGSUSED */
4708219089Spjdstatic int
4709219089Spjdzfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
4710219089Spjd    caller_context_t *ct)
4711219089Spjd{
	/* Directories are exempt from the offset range check. */
4712219089Spjd	if (vp->v_type == VDIR)
4713219089Spjd		return (0);
	/* Valid offsets lie in [0, MAXOFFSET_T]. */
4714219089Spjd	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
4715219089Spjd}
4716219089Spjd
4717219089Spjd/*
4718219089Spjd * Pre-filter the generic locking function to trap attempts to place
4719219089Spjd * a mandatory lock on a memory mapped file.
 *
 *	RETURN:	EAGAIN if the znode has a nonzero z_mapcnt and
 *		MANDMODE() holds for its mode; otherwise whatever
 *		fs_frlock() returns for the unchanged arguments.
4720219089Spjd */
4721219089Spjdstatic int
4722219089Spjdzfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
4723219089Spjd    flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct)
4724219089Spjd{
4725219089Spjd	znode_t *zp = VTOZ(vp);
4726219089Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4727219089Spjd
4728219089Spjd	ZFS_ENTER(zfsvfs);
4729219089Spjd	ZFS_VERIFY_ZP(zp);
4730219089Spjd
4731219089Spjd	/*
4732219089Spjd	 * We are following the UFS semantics with respect to mapcnt
4733219089Spjd	 * here: If we see that the file is mapped already, then we will
4734219089Spjd	 * return an error, but we don't worry about races between this
4735219089Spjd	 * function and zfs_map().
4736219089Spjd	 */
4737219089Spjd	if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) {
4738219089Spjd		ZFS_EXIT(zfsvfs);
4739249195Smm		return (SET_ERROR(EAGAIN));
4740219089Spjd	}
	/* ZFS_EXIT() is issued before delegating to the generic fs_frlock(). */
4741219089Spjd	ZFS_EXIT(zfsvfs);
4742219089Spjd	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
4743219089Spjd}
4744219089Spjd
4745219089Spjd/*
4746219089Spjd * If we can't find a page in the cache, we will create a new page
4747219089Spjd * and fill it with file data.  For efficiency, we may try to fill
4748219089Spjd * multiple pages at once (klustering) to fill up the supplied page
4749219089Spjd * list.  Note that the pages to be filled are held with an exclusive
4750219089Spjd * lock to prevent access by other threads while they are being filled.
 *
 *	IN:	vp	- vnode of file to read from.
 *		off	- file offset of the page wanted.
 *		seg	- segment, passed through to page creation.
 *		addr	- address, passed through to page creation.
 *		plsz	- length of the page list; a value of PAGESIZE
 *			  selects the single-page path.
 *		rw	- passed through to pvn_plist_init().
 *
 *	OUT:	pl	- page list; *pl is set to NULL when no page was
 *			  created.
 *
 *	RETURN:	0 on success, the dmu_read() error on failure (ECKSUM
 *		is reported as EIO).
4751219089Spjd */
4752219089Spjdstatic int
4753219089Spjdzfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
4754219089Spjd    caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
4755219089Spjd{
4756219089Spjd	znode_t *zp = VTOZ(vp);
4757219089Spjd	page_t *pp, *cur_pp;
4758219089Spjd	objset_t *os = zp->z_zfsvfs->z_os;
4759219089Spjd	u_offset_t io_off, total;
4760219089Spjd	size_t io_len;
4761219089Spjd	int err;
4762219089Spjd
4763219089Spjd	if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
4764219089Spjd		/*
4765219089Spjd		 * We only have a single page, don't bother klustering
4766219089Spjd		 */
4767219089Spjd		io_off = off;
4768219089Spjd		io_len = PAGESIZE;
4769219089Spjd		pp = page_create_va(vp, io_off, io_len,
4770219089Spjd		    PG_EXCL | PG_WAIT, seg, addr);
4771219089Spjd	} else {
4772219089Spjd		/*
4773219089Spjd		 * Try to find enough pages to fill the page list
4774219089Spjd		 */
4775219089Spjd		pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
4776219089Spjd		    &io_len, off, plsz, 0);
4777219089Spjd	}
4778219089Spjd	if (pp == NULL) {
4779219089Spjd		/*
4780219089Spjd		 * The page already exists, nothing to do here.
4781219089Spjd		 */
4782219089Spjd		*pl = NULL;
4783219089Spjd		return (0);
4784219089Spjd	}
4785219089Spjd
4786219089Spjd	/*
4787219089Spjd	 * Fill the pages in the kluster.
4788219089Spjd	 */
4789219089Spjd	cur_pp = pp;
4790219089Spjd	for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
4791219089Spjd		caddr_t va;
4792219089Spjd
		/* Map each page in turn and read one PAGESIZE chunk into it. */
4793219089Spjd		ASSERT3U(io_off, ==, cur_pp->p_offset);
4794219089Spjd		va = zfs_map_page(cur_pp, S_WRITE);
4795219089Spjd		err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
4796219089Spjd		    DMU_READ_PREFETCH);
4797219089Spjd		zfs_unmap_page(cur_pp, va);
4798219089Spjd		if (err) {
4799219089Spjd			/* On error, toss the entire kluster */
4800219089Spjd			pvn_read_done(pp, B_ERROR);
4801219089Spjd			/* convert checksum errors into IO errors */
4802219089Spjd			if (err == ECKSUM)
4803249195Smm				err = SET_ERROR(EIO);
4804219089Spjd			return (err);
4805219089Spjd		}
4806219089Spjd		cur_pp = cur_pp->p_next;
4807219089Spjd	}
4808219089Spjd
4809219089Spjd	/*
4810219089Spjd	 * Fill in the page list array from the kluster starting
4811219089Spjd	 * from the desired offset `off'.
4812219089Spjd	 * NOTE: the page list will always be null terminated.
4813219089Spjd	 */
4814219089Spjd	pvn_plist_init(pp, pl, plsz, off, io_len, rw);
4815219089Spjd	ASSERT(pl == NULL || (*pl)->p_offset == off);
4816219089Spjd
4817219089Spjd	return (0);
4818219089Spjd}
4819219089Spjd
4820219089Spjd/*
4821219089Spjd * Return pointers to the pages for the file region [off, off + len]
4822219089Spjd * in the pl array.  If plsz is greater than len, this function may
4823219089Spjd * also return page pointers from after the specified region
4824219089Spjd * (i.e. the region [off, off + plsz]).  These additional pages are
4825219089Spjd * only returned if they are already in the cache, or were created as
4826219089Spjd * part of a klustered read.
4827219089Spjd *
4828219089Spjd *	IN:	vp	- vnode of file to get data from.
4829219089Spjd *		off	- position in file to get data from.
4830219089Spjd *		len	- amount of data to retrieve.
4831219089Spjd *		plsz	- length of provided page list.
4832219089Spjd *		seg	- segment to obtain pages for.
4833219089Spjd *		addr	- virtual address of fault.
4834219089Spjd *		rw	- mode of created pages.
4835219089Spjd *		cr	- credentials of caller.
4836219089Spjd *		ct	- caller context.
4837219089Spjd *
4838219089Spjd *	OUT:	protp	- protection mode of created pages.
4839219089Spjd *		pl	- list of pages created.
4840219089Spjd *
4841251631Sdelphij *	RETURN:	0 on success, error code on failure.
4842219089Spjd *
4843219089Spjd * Timestamps:
4844219089Spjd *	vp - atime updated
4845219089Spjd */
4846219089Spjd/* ARGSUSED */
4847219089Spjdstatic int
4848219089Spjdzfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
4849251631Sdelphij    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
4850251631Sdelphij    enum seg_rw rw, cred_t *cr, caller_context_t *ct)
4851219089Spjd{
4852219089Spjd	znode_t		*zp = VTOZ(vp);
4853219089Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4854219089Spjd	page_t		**pl0 = pl;
4855219089Spjd	int		err = 0;
4856219089Spjd
4857219089Spjd	/* we do our own caching, faultahead is unnecessary */
	/* (this early return happens before ZFS_ENTER) */
4858219089Spjd	if (pl == NULL)
4859219089Spjd		return (0);
4860219089Spjd	else if (len > plsz)
4861219089Spjd		len = plsz;
4862219089Spjd	else
4863219089Spjd		len = P2ROUNDUP(len, PAGESIZE);
4864219089Spjd	ASSERT(plsz >= len);
4865219089Spjd
4866219089Spjd	ZFS_ENTER(zfsvfs);
4867219089Spjd	ZFS_VERIFY_ZP(zp);
4868219089Spjd
4869219089Spjd	if (protp)
4870219089Spjd		*protp = PROT_ALL;
4871219089Spjd
4872219089Spjd	/*
4873219089Spjd	 * Loop through the requested range [off, off + len) looking
4874219089Spjd	 * for pages.  If we don't find a page, we will need to create
4875219089Spjd	 * a new page and fill it with data from the file.
4876219089Spjd	 */
4877219089Spjd	while (len > 0) {
		/*
		 * A cached page is stored and NULL-terminated so the inner
		 * loop consumes exactly one entry.  Otherwise
		 * zfs_fillpage() fills pl; if it leaves *pl NULL the inner
		 * loop does nothing and the lookup is retried.
		 */
4878219089Spjd		if (*pl = page_lookup(vp, off, SE_SHARED))
4879219089Spjd			*(pl+1) = NULL;
4880219089Spjd		else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw))
4881219089Spjd			goto out;
		/* Advance off/addr/len/plsz past every page just obtained. */
4882219089Spjd		while (*pl) {
4883219089Spjd			ASSERT3U((*pl)->p_offset, ==, off);
4884219089Spjd			off += PAGESIZE;
4885219089Spjd			addr += PAGESIZE;
4886219089Spjd			if (len > 0) {
4887219089Spjd				ASSERT3U(len, >=, PAGESIZE);
4888219089Spjd				len -= PAGESIZE;
4889219089Spjd			}
4890219089Spjd			ASSERT3U(plsz, >=, PAGESIZE);
4891219089Spjd			plsz -= PAGESIZE;
4892219089Spjd			pl++;
4893219089Spjd		}
4894219089Spjd	}
4895219089Spjd
4896219089Spjd	/*
4897219089Spjd	 * Fill out the page array with any pages already in the cache.
4898219089Spjd	 */
4899219089Spjd	while (plsz > 0 &&
4900219089Spjd	    (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) {
4901219089Spjd			off += PAGESIZE;
4902219089Spjd			plsz -= PAGESIZE;
4903219089Spjd	}
4904219089Spjdout:
4905219089Spjd	if (err) {
4906219089Spjd		/*
4907219089Spjd		 * Release any pages we have previously locked.
4908219089Spjd		 */
4909219089Spjd		while (pl > pl0)
4910219089Spjd			page_unlock(*--pl);
4911219089Spjd	} else {
4912219089Spjd		ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4913219089Spjd	}
4914219089Spjd
	/* Terminate the list at the current position on both paths. */
4915219089Spjd	*pl = NULL;
4916219089Spjd
4917219089Spjd	ZFS_EXIT(zfsvfs);
4918219089Spjd	return (err);
4919219089Spjd}
4920219089Spjd
4921219089Spjd/*
4922219089Spjd * Request a memory map for a section of a file.  This code interacts
4923219089Spjd * with common code and the VM system as follows:
4924219089Spjd *
4925251631Sdelphij * - common code calls mmap(), which ends up in smmap_common()
4926251631Sdelphij * - this calls VOP_MAP(), which takes you into (say) zfs
4927251631Sdelphij * - zfs_map() calls as_map(), passing segvn_create() as the callback
4928251631Sdelphij * - segvn_create() creates the new segment and calls VOP_ADDMAP()
4929251631Sdelphij * - zfs_addmap() updates z_mapcnt
 *
 *	RETURN:	0 on success.  EPERM for a PROT_WRITE mapping of an
 *		immutable, read-only or append-only file; EACCES for a
 *		PROT_READ/PROT_EXEC mapping of an AV-quarantined file;
 *		ENOSYS if the vnode has VNOMAP set; ENXIO for a negative
 *		or overflowing offset/length; ENODEV if vp is not VREG;
 *		EAGAIN if MANDMODE() holds and vn_has_flocks() is true;
 *		otherwise the error from choose_addr() or as_map().
4930219089Spjd */
4931219089Spjd/*ARGSUSED*/
4932219089Spjdstatic int
4933219089Spjdzfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
4934219089Spjd    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4935219089Spjd    caller_context_t *ct)
4936219089Spjd{
4937219089Spjd	znode_t *zp = VTOZ(vp);
4938219089Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4939219089Spjd	segvn_crargs_t	vn_a;
4940219089Spjd	int		error;
4941219089Spjd
4942219089Spjd	ZFS_ENTER(zfsvfs);
4943219089Spjd	ZFS_VERIFY_ZP(zp);
4944219089Spjd
4945219089Spjd	if ((prot & PROT_WRITE) && (zp->z_pflags &
4946219089Spjd	    (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
4947219089Spjd		ZFS_EXIT(zfsvfs);
4948249195Smm		return (SET_ERROR(EPERM));
4949219089Spjd	}
4950219089Spjd
4951219089Spjd	if ((prot & (PROT_READ | PROT_EXEC)) &&
4952219089Spjd	    (zp->z_pflags & ZFS_AV_QUARANTINED)) {
4953219089Spjd		ZFS_EXIT(zfsvfs);
4954249195Smm		return (SET_ERROR(EACCES));
4955219089Spjd	}
4956219089Spjd
4957219089Spjd	if (vp->v_flag & VNOMAP) {
4958219089Spjd		ZFS_EXIT(zfsvfs);
4959249195Smm		return (SET_ERROR(ENOSYS));
4960219089Spjd	}
4961219089Spjd
	/* Reject a negative offset or a range that would pass MAXOFFSET_T. */
4962219089Spjd	if (off < 0 || len > MAXOFFSET_T - off) {
4963219089Spjd		ZFS_EXIT(zfsvfs);
4964249195Smm		return (SET_ERROR(ENXIO));
4965219089Spjd	}
4966219089Spjd
4967219089Spjd	if (vp->v_type != VREG) {
4968219089Spjd		ZFS_EXIT(zfsvfs);
4969249195Smm		return (SET_ERROR(ENODEV));
4970219089Spjd	}
4971219089Spjd
4972219089Spjd	/*
4973219089Spjd	 * If file is locked, disallow mapping.
4974219089Spjd	 */
4975219089Spjd	if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) {
4976219089Spjd		ZFS_EXIT(zfsvfs);
4977249195Smm		return (SET_ERROR(EAGAIN));
4978219089Spjd	}
4979219089Spjd
	/* The as range lock is held from address selection through as_map(). */
4980219089Spjd	as_rangelock(as);
4981219089Spjd	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
4982219089Spjd	if (error != 0) {
4983219089Spjd		as_rangeunlock(as);
4984219089Spjd		ZFS_EXIT(zfsvfs);
4985219089Spjd		return (error);
4986219089Spjd	}
4987219089Spjd
	/* flags is split: MAP_TYPE bits go to .type, the rest to .flags. */
4988219089Spjd	vn_a.vp = vp;
4989219089Spjd	vn_a.offset = (u_offset_t)off;
4990219089Spjd	vn_a.type = flags & MAP_TYPE;
4991219089Spjd	vn_a.prot = prot;
4992219089Spjd	vn_a.maxprot = maxprot;
4993219089Spjd	vn_a.cred = cr;
4994219089Spjd	vn_a.amp = NULL;
4995219089Spjd	vn_a.flags = flags & ~MAP_TYPE;
4996219089Spjd	vn_a.szc = 0;
4997219089Spjd	vn_a.lgrp_mem_policy_flags = 0;
4998219089Spjd
4999219089Spjd	error = as_map(as, *addrp, len, segvn_create, &vn_a);
5000219089Spjd
5001219089Spjd	as_rangeunlock(as);
5002219089Spjd	ZFS_EXIT(zfsvfs);
5003219089Spjd	return (error);
5004219089Spjd}
5005219089Spjd
/*
 * Account for a new mapping of vp: atomically add btopr(len) pages to
 * the znode's z_mapcnt.  Only vp and len are referenced.  Always
 * returns 0.
 */
5006219089Spjd/* ARGSUSED */
5007219089Spjdstatic int
5008219089Spjdzfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
5009219089Spjd    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
5010219089Spjd    caller_context_t *ct)
5011219089Spjd{
5012219089Spjd	uint64_t pages = btopr(len);
5013219089Spjd
5014219089Spjd	atomic_add_64(&VTOZ(vp)->z_mapcnt, pages);
5015219089Spjd	return (0);
5016219089Spjd}
5017219089Spjd
5018219089Spjd/*
5019219089Spjd * The reason we push dirty pages as part of zfs_delmap() is so that we get a
5020219089Spjd * more accurate mtime for the associated file.  Since we don't have a way of
5021219089Spjd * detecting when the data was actually modified, we have to resort to
5022219089Spjd * heuristics.  If an explicit msync() is done, then we mark the mtime when the
5023219089Spjd * last page is pushed.  The problem occurs when the msync() call is omitted,
5024219089Spjd * which is by far the most common case:
5025219089Spjd *
5026219089Spjd * 	open()
5027219089Spjd * 	mmap()
5028219089Spjd * 	<modify memory>
5029219089Spjd * 	munmap()
5030219089Spjd * 	close()
5031219089Spjd * 	<time lapse>
5032219089Spjd * 	putpage() via fsflush
5033219089Spjd *
5034219089Spjd * If we wait until fsflush to come along, we can have a modification time that
5035219089Spjd * is some arbitrary point in the future.  In order to prevent this in the
5036219089Spjd * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is
5037219089Spjd * torn down.
 *
 * Always returns 0; the result of the VOP_PUTPAGE() call is discarded.
5038219089Spjd */
5039219089Spjd/* ARGSUSED */
5040219089Spjdstatic int
5041219089Spjdzfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
5042219089Spjd    size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
5043219089Spjd    caller_context_t *ct)
5044219089Spjd{
5045219089Spjd	uint64_t pages = btopr(len);
5046219089Spjd
	/*
	 * Remove this mapping's pages from z_mapcnt.  -pages is
	 * evaluated in uint64_t (modulo 2^64), so the add acts as a
	 * subtraction.
	 */
5047219089Spjd	ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages);
5048219089Spjd	atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages);
5049219089Spjd
	/* Shared writable mapping with cached data: start an async push. */
5050219089Spjd	if ((flags & MAP_SHARED) && (prot & PROT_WRITE) &&
5051219089Spjd	    vn_has_cached_data(vp))
5052219089Spjd		(void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct);
5053219089Spjd
5054219089Spjd	return (0);
5055219089Spjd}
5056219089Spjd
5057219089Spjd/*
5058219089Spjd * Free or allocate space in a file.  Currently, this function only
5059219089Spjd * supports the `F_FREESP' command.  However, this command is somewhat
5060219089Spjd * misnamed, as its functionality includes the ability to allocate as
5061219089Spjd * well as free space.
5062219089Spjd *
5063219089Spjd *	IN:	vp	- vnode of file to free data in.
5064219089Spjd *		cmd	- action to take (only F_FREESP supported).
5065219089Spjd *		bfp	- section of file to free/alloc.
5066219089Spjd *		flag	- current file open mode flags.
5067219089Spjd *		offset	- current file offset.
5068219089Spjd *		cr	- credentials of caller [UNUSED].
5069219089Spjd *		ct	- caller context.
5070219089Spjd *
5071251631Sdelphij *	RETURN:	0 on success, error code on failure.
5072219089Spjd *
5073219089Spjd * Timestamps:
5074219089Spjd *	vp - ctime|mtime updated
5075219089Spjd */
5076219089Spjd/* ARGSUSED */
5077219089Spjdstatic int
5078219089Spjdzfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag,
5079219089Spjd    offset_t offset, cred_t *cr, caller_context_t *ct)
5080219089Spjd{
5081219089Spjd	znode_t		*zp = VTOZ(vp);
5082219089Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
5083219089Spjd	uint64_t	off, len;
5084219089Spjd	int		error;
5085219089Spjd
5086219089Spjd	ZFS_ENTER(zfsvfs);
5087219089Spjd	ZFS_VERIFY_ZP(zp);
5088219089Spjd
5089219089Spjd	if (cmd != F_FREESP) {
5090219089Spjd		ZFS_EXIT(zfsvfs);
5091249195Smm		return (SET_ERROR(EINVAL));
5092219089Spjd	}
5093219089Spjd
5094219089Spjd	if (error = convoff(vp, bfp, 0, offset)) {
5095219089Spjd		ZFS_EXIT(zfsvfs);
5096219089Spjd		return (error);
5097219089Spjd	}
5098219089Spjd
5099219089Spjd	if (bfp->l_len < 0) {
5100219089Spjd		ZFS_EXIT(zfsvfs);
5101249195Smm		return (SET_ERROR(EINVAL));
5102219089Spjd	}
5103219089Spjd
5104219089Spjd	off = bfp->l_start;
5105219089Spjd	len = bfp->l_len; /* 0 means from off to end of file */
5106219089Spjd
5107219089Spjd	error = zfs_freesp(zp, off, len, flag, TRUE);
5108219089Spjd
5109219089Spjd	ZFS_EXIT(zfsvfs);
5110219089Spjd	return (error);
5111219089Spjd}
5112219089Spjd#endif	/* sun */
5113219089Spjd
5114168404SpjdCTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
5115168404SpjdCTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
5116168404Spjd
5117185029Spjd/*ARGSUSED*/
5118168404Spjdstatic int
5119185029Spjdzfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
5120168404Spjd{
5121168404Spjd	znode_t		*zp = VTOZ(vp);
5122168404Spjd	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
5123185029Spjd	uint32_t	gen;
5124219089Spjd	uint64_t	gen64;
5125168404Spjd	uint64_t	object = zp->z_id;
5126168404Spjd	zfid_short_t	*zfid;
5127219089Spjd	int		size, i, error;
5128168404Spjd
5129168404Spjd	ZFS_ENTER(zfsvfs);
5130185029Spjd	ZFS_VERIFY_ZP(zp);
5131168404Spjd
5132219089Spjd	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
5133219089Spjd	    &gen64, sizeof (uint64_t))) != 0) {
5134219089Spjd		ZFS_EXIT(zfsvfs);
5135219089Spjd		return (error);
5136219089Spjd	}
5137219089Spjd
5138219089Spjd	gen = (uint32_t)gen64;
5139219089Spjd
5140168404Spjd	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
5141249195Smm
5142249195Smm#ifdef illumos
5143249195Smm	if (fidp->fid_len < size) {
5144249195Smm		fidp->fid_len = size;
5145249195Smm		ZFS_EXIT(zfsvfs);
5146249195Smm		return (SET_ERROR(ENOSPC));
5147249195Smm	}
5148249195Smm#else
5149168404Spjd	fidp->fid_len = size;
5150249195Smm#endif
5151168404Spjd
5152168404Spjd	zfid = (zfid_short_t *)fidp;
5153168404Spjd
5154168404Spjd	zfid->zf_len = size;
5155168404Spjd
5156168404Spjd	for (i = 0; i < sizeof (zfid->zf_object); i++)
5157168404Spjd		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
5158168404Spjd
5159168404Spjd	/* Must have a non-zero generation number to distinguish from .zfs */
5160168404Spjd	if (gen == 0)
5161168404Spjd		gen = 1;
5162168404Spjd	for (i = 0; i < sizeof (zfid->zf_gen); i++)
5163168404Spjd		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
5164168404Spjd
5165168404Spjd	if (size == LONG_FID_LEN) {
5166168404Spjd		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
5167169023Spjd		zfid_long_t	*zlfid;
5168168404Spjd
5169168404Spjd		zlfid = (zfid_long_t *)fidp;
5170168404Spjd
5171168404Spjd		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
5172168404Spjd			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
5173168404Spjd
5174168404Spjd		/* XXX - this should be the generation number for the objset */
5175168404Spjd		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
5176168404Spjd			zlfid->zf_setgen[i] = 0;
5177168404Spjd	}
5178168404Spjd
5179168404Spjd	ZFS_EXIT(zfsvfs);
5180168404Spjd	return (0);
5181168404Spjd}
5182168404Spjd
5183168404Spjdstatic int
5184185029Spjdzfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
5185185029Spjd    caller_context_t *ct)
5186168404Spjd{
5187168404Spjd	znode_t		*zp, *xzp;
5188168404Spjd	zfsvfs_t	*zfsvfs;
5189168404Spjd	zfs_dirlock_t	*dl;
5190168404Spjd	int		error;
5191168404Spjd
5192168404Spjd	switch (cmd) {
5193168404Spjd	case _PC_LINK_MAX:
5194168404Spjd		*valp = INT_MAX;
5195168404Spjd		return (0);
5196168404Spjd
5197168404Spjd	case _PC_FILESIZEBITS:
5198168404Spjd		*valp = 64;
5199168404Spjd		return (0);
5200219089Spjd#ifdef sun
5201168404Spjd	case _PC_XATTR_EXISTS:
5202168404Spjd		zp = VTOZ(vp);
5203168404Spjd		zfsvfs = zp->z_zfsvfs;
5204168404Spjd		ZFS_ENTER(zfsvfs);
5205185029Spjd		ZFS_VERIFY_ZP(zp);
5206168404Spjd		*valp = 0;
5207168404Spjd		error = zfs_dirent_lock(&dl, zp, "", &xzp,
5208185029Spjd		    ZXATTR | ZEXISTS | ZSHARED, NULL, NULL);
5209168404Spjd		if (error == 0) {
5210168404Spjd			zfs_dirent_unlock(dl);
5211168404Spjd			if (!zfs_dirempty(xzp))
5212168404Spjd				*valp = 1;
5213168404Spjd			VN_RELE(ZTOV(xzp));
5214168404Spjd		} else if (error == ENOENT) {
5215168404Spjd			/*
5216168404Spjd			 * If there aren't extended attributes, it's the
5217168404Spjd			 * same as having zero of them.
5218168404Spjd			 */
5219168404Spjd			error = 0;
5220168404Spjd		}
5221168404Spjd		ZFS_EXIT(zfsvfs);
5222168404Spjd		return (error);
5223168404Spjd
5224219089Spjd	case _PC_SATTR_ENABLED:
5225219089Spjd	case _PC_SATTR_EXISTS:
5226219089Spjd		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
5227219089Spjd		    (vp->v_type == VREG || vp->v_type == VDIR);
5228219089Spjd		return (0);
5229219089Spjd
5230219089Spjd	case _PC_ACCESS_FILTERING:
5231219089Spjd		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
5232219089Spjd		    vp->v_type == VDIR;
5233219089Spjd		return (0);
5234219089Spjd
5235219089Spjd	case _PC_ACL_ENABLED:
5236219089Spjd		*valp = _ACL_ACE_ENABLED;
5237219089Spjd		return (0);
5238219089Spjd#endif	/* sun */
5239219089Spjd	case _PC_MIN_HOLE_SIZE:
5240219089Spjd		*valp = (int)SPA_MINBLOCKSIZE;
5241219089Spjd		return (0);
5242219089Spjd#ifdef sun
5243219089Spjd	case _PC_TIMESTAMP_RESOLUTION:
5244219089Spjd		/* nanosecond timestamp resolution */
5245219089Spjd		*valp = 1L;
5246219089Spjd		return (0);
5247219089Spjd#endif	/* sun */
5248168404Spjd	case _PC_ACL_EXTENDED:
5249196949Strasz		*valp = 0;
5250168404Spjd		return (0);
5251168404Spjd
5252196949Strasz	case _PC_ACL_NFS4:
5253196949Strasz		*valp = 1;
5254196949Strasz		return (0);
5255196949Strasz
5256196949Strasz	case _PC_ACL_PATH_MAX:
5257196949Strasz		*valp = ACL_MAX_ENTRIES;
5258196949Strasz		return (0);
5259196949Strasz
5260168404Spjd	default:
5261168962Spjd		return (EOPNOTSUPP);
5262168404Spjd	}
5263168404Spjd}
5264168404Spjd
5265168404Spjd/*ARGSUSED*/
5266168404Spjdstatic int
5267185029Spjdzfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
5268185029Spjd    caller_context_t *ct)
5269168404Spjd{
5270168404Spjd	znode_t *zp = VTOZ(vp);
5271168404Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5272168404Spjd	int error;
5273185029Spjd	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
5274168404Spjd
5275168404Spjd	ZFS_ENTER(zfsvfs);
5276185029Spjd	ZFS_VERIFY_ZP(zp);
5277185029Spjd	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
5278168404Spjd	ZFS_EXIT(zfsvfs);
5279168404Spjd
5280168404Spjd	return (error);
5281168404Spjd}
5282168404Spjd
5283168404Spjd/*ARGSUSED*/
5284228685Spjdint
5285185029Spjdzfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
5286185029Spjd    caller_context_t *ct)
5287168404Spjd{
5288168404Spjd	znode_t *zp = VTOZ(vp);
5289168404Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5290168404Spjd	int error;
5291185029Spjd	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
5292219089Spjd	zilog_t	*zilog = zfsvfs->z_log;
5293168404Spjd
5294168404Spjd	ZFS_ENTER(zfsvfs);
5295185029Spjd	ZFS_VERIFY_ZP(zp);
5296219089Spjd
5297185029Spjd	error = zfs_setacl(zp, vsecp, skipaclchk, cr);
5298219089Spjd
5299219089Spjd	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
5300219089Spjd		zil_commit(zilog, 0);
5301219089Spjd
5302168404Spjd	ZFS_EXIT(zfsvfs);
5303168404Spjd	return (error);
5304168404Spjd}
5305168404Spjd
5306219089Spjd#ifdef sun
/*
 * Lower bound on the read size for which loaning out an arcbuf is
 * considered.  Must be a power of 2.
 */
int zcr_blksz_min = 1024;	/* 1K */
/*
 * When this is smaller than the file block size, an arcbuf may be loaned
 * out for a partial block read.  Must be a power of 2.
 */
int zcr_blksz_max = 131072;	/* 128K */
5317219089Spjd
/*
 * Prepare an xuio for zero-copy I/O on the file behind vp.
 *
 * UIO_WRITE: the request must be at least max_blksz bytes and the file's
 * block size must equal max_blksz.  Buffers are obtained with
 * dmu_request_arcbuf() and attached with dmu_xuio_add(): one for a partial
 * leading block (preamble), one per full block, and one for a partial
 * trailing block (postamble).
 * UIO_READ: the request is only validated here (clamped block size, bytes
 * remaining in the file, no cached pages); no buffer is requested in this
 * function.
 *
 * On success the uio is flagged UIO_XUIO, the direction is recorded in the
 * xuio and 0 is returned.  Requests that fail the checks below get EINVAL.
 */
5318219089Spjd/*ARGSUSED*/
5319168962Spjdstatic int
5320219089Spjdzfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr,
5321219089Spjd    caller_context_t *ct)
5322219089Spjd{
5323219089Spjd	znode_t	*zp = VTOZ(vp);
5324219089Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5325219089Spjd	int max_blksz = zfsvfs->z_max_blksz;
5326219089Spjd	uio_t *uio = &xuio->xu_uio;
5327219089Spjd	ssize_t size = uio->uio_resid;
5328219089Spjd	offset_t offset = uio->uio_loffset;
5329219089Spjd	int blksz;
5330219089Spjd	int fullblk, i;
5331219089Spjd	arc_buf_t *abuf;
5332219089Spjd	ssize_t maxsize;
5333219089Spjd	int preamble, postamble;
5334219089Spjd
	/* Checked before ZFS_ENTER, so no ZFS_EXIT is needed on this path. */
5335219089Spjd	if (xuio->xu_type != UIOTYPE_ZEROCOPY)
5336249195Smm		return (SET_ERROR(EINVAL));
5337219089Spjd
5338219089Spjd	ZFS_ENTER(zfsvfs);
5339219089Spjd	ZFS_VERIFY_ZP(zp);
5340219089Spjd	switch (ioflag) {
5341219089Spjd	case UIO_WRITE:
5342219089Spjd		/*
5343219089Spjd		 * Loan out an arc_buf for write if write size is bigger than
5344219089Spjd		 * max_blksz, and the file's block size is also max_blksz.
5345219089Spjd		 */
5346219089Spjd		blksz = max_blksz;
5347219089Spjd		if (size < blksz || zp->z_blksz != blksz) {
5348219089Spjd			ZFS_EXIT(zfsvfs);
5349249195Smm			return (SET_ERROR(EINVAL));
5350219089Spjd		}
5351219089Spjd		/*
5352219089Spjd		 * Caller requests buffers for write before knowing where the
5353219089Spjd		 * write offset might be (e.g. NFS TCP write).
5354219089Spjd		 */
5355219089Spjd		if (offset == -1) {
5356219089Spjd			preamble = 0;
5357219089Spjd		} else {
5358219089Spjd			preamble = P2PHASE(offset, blksz);
5359219089Spjd			if (preamble) {
				/*
				 * Turn the phase within the block into the
				 * byte count up to the end of that block and
				 * take those bytes out of size.
				 */
5360219089Spjd				preamble = blksz - preamble;
5361219089Spjd				size -= preamble;
5362219089Spjd			}
5363219089Spjd		}
5364219089Spjd
		/* Whatever does not fill a whole block at the tail. */
5365219089Spjd		postamble = P2PHASE(size, blksz);
5366219089Spjd		size -= postamble;
5367219089Spjd
		/* size is now a multiple of blksz. */
5368219089Spjd		fullblk = size / blksz;
5369219089Spjd		(void) dmu_xuio_init(xuio,
5370219089Spjd		    (preamble != 0) + fullblk + (postamble != 0));
5371219089Spjd		DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble,
5372219089Spjd		    int, postamble, int,
5373219089Spjd		    (preamble != 0) + fullblk + (postamble != 0));
5374219089Spjd
5375219089Spjd		/*
5376219089Spjd		 * Have to fix iov base/len for partial buffers.  They
5377219089Spjd		 * currently represent full arc_buf's.
5378219089Spjd		 */
5379219089Spjd		if (preamble) {
5380219089Spjd			/* data begins in the middle of the arc_buf */
5381219089Spjd			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5382219089Spjd			    blksz);
5383219089Spjd			ASSERT(abuf);
5384219089Spjd			(void) dmu_xuio_add(xuio, abuf,
5385219089Spjd			    blksz - preamble, preamble);
5386219089Spjd		}
5387219089Spjd
5388219089Spjd		for (i = 0; i < fullblk; i++) {
5389219089Spjd			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5390219089Spjd			    blksz);
5391219089Spjd			ASSERT(abuf);
5392219089Spjd			(void) dmu_xuio_add(xuio, abuf, 0, blksz);
5393219089Spjd		}
5394219089Spjd
5395219089Spjd		if (postamble) {
5396219089Spjd			/* data ends in the middle of the arc_buf */
5397219089Spjd			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5398219089Spjd			    blksz);
5399219089Spjd			ASSERT(abuf);
5400219089Spjd			(void) dmu_xuio_add(xuio, abuf, 0, postamble);
5401219089Spjd		}
5402219089Spjd		break;
5403219089Spjd	case UIO_READ:
5404219089Spjd		/*
5405219089Spjd		 * Loan out an arc_buf for read if the read size is larger than
5406219089Spjd		 * the current file block size.  Block alignment is not
5407219089Spjd		 * considered.  Partial arc_buf will be loaned out for read.
5408219089Spjd		 */
		/* Clamp the block size to [zcr_blksz_min, zcr_blksz_max]. */
5409219089Spjd		blksz = zp->z_blksz;
5410219089Spjd		if (blksz < zcr_blksz_min)
5411219089Spjd			blksz = zcr_blksz_min;
5412219089Spjd		if (blksz > zcr_blksz_max)
5413219089Spjd			blksz = zcr_blksz_max;
5414219089Spjd		/* avoid potential complexity of dealing with it */
5415219089Spjd		if (blksz > max_blksz) {
5416219089Spjd			ZFS_EXIT(zfsvfs);
5417249195Smm			return (SET_ERROR(EINVAL));
5418219089Spjd		}
5419219089Spjd
		/* Limit the request to the bytes left in the file. */
5420219089Spjd		maxsize = zp->z_size - uio->uio_loffset;
5421219089Spjd		if (size > maxsize)
5422219089Spjd			size = maxsize;
5423219089Spjd
5424219089Spjd		if (size < blksz || vn_has_cached_data(vp)) {
5425219089Spjd			ZFS_EXIT(zfsvfs);
5426249195Smm			return (SET_ERROR(EINVAL));
5427219089Spjd		}
5428219089Spjd		break;
5429219089Spjd	default:
5430219089Spjd		ZFS_EXIT(zfsvfs);
5431249195Smm		return (SET_ERROR(EINVAL));
5432219089Spjd	}
5433219089Spjd
	/* Mark the uio as an xuio and remember the transfer direction. */
5434219089Spjd	uio->uio_extflg = UIO_XUIO;
5435219089Spjd	XUIO_XUZC_RW(xuio) = ioflag;
5436219089Spjd	ZFS_EXIT(zfsvfs);
5437219089Spjd	return (0);
5438219089Spjd}
5439219089Spjd
5440219089Spjd/*ARGSUSED*/
5441219089Spjdstatic int
5442219089Spjdzfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct)
5443219089Spjd{
5444219089Spjd	int i;
5445219089Spjd	arc_buf_t *abuf;
5446219089Spjd	int ioflag = XUIO_XUZC_RW(xuio);
5447219089Spjd
5448219089Spjd	ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
5449219089Spjd
5450219089Spjd	i = dmu_xuio_cnt(xuio);
5451219089Spjd	while (i-- > 0) {
5452219089Spjd		abuf = dmu_xuio_arcbuf(xuio, i);
5453219089Spjd		/*
5454219089Spjd		 * if abuf == NULL, it must be a write buffer
5455219089Spjd		 * that has been returned in zfs_write().
5456219089Spjd		 */
5457219089Spjd		if (abuf)
5458219089Spjd			dmu_return_arcbuf(abuf);
5459219089Spjd		ASSERT(abuf || ioflag == UIO_WRITE);
5460219089Spjd	}
5461219089Spjd
5462219089Spjd	dmu_xuio_fini(xuio);
5463219089Spjd	return (0);
5464219089Spjd}
5465219089Spjd
/*
 * These are predeclared without a parameter list on purpose: the compiler
 * then treats them as "old style" declarations that carry no argument
 * information, so no type mismatch errors are raised by the
 * initializations that follow.
 */
static int zfs_inval();
static int zfs_isdir();

/* Operation stub that always fails with EINVAL. */
static int
zfs_inval()
{
	int err = SET_ERROR(EINVAL);

	return (err);
}

/* Operation stub that always fails with EISDIR. */
static int
zfs_isdir()
{
	int err = SET_ERROR(EISDIR);

	return (err);
}
5486219089Spjd/*
5487219089Spjd * Directory vnode operations template
5488219089Spjd */
5489219089Spjdvnodeops_t *zfs_dvnodeops;
5490219089Spjdconst fs_operation_def_t zfs_dvnodeops_template[] = {
5491219089Spjd	VOPNAME_OPEN,		{ .vop_open = zfs_open },
5492219089Spjd	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
5493219089Spjd	VOPNAME_READ,		{ .error = zfs_isdir },
5494219089Spjd	VOPNAME_WRITE,		{ .error = zfs_isdir },
5495219089Spjd	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
5496219089Spjd	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5497219089Spjd	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
5498219089Spjd	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5499219089Spjd	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
5500219089Spjd	VOPNAME_CREATE,		{ .vop_create = zfs_create },
5501219089Spjd	VOPNAME_REMOVE,		{ .vop_remove = zfs_remove },
5502219089Spjd	VOPNAME_LINK,		{ .vop_link = zfs_link },
5503219089Spjd	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
5504219089Spjd	VOPNAME_MKDIR,		{ .vop_mkdir = zfs_mkdir },
5505219089Spjd	VOPNAME_RMDIR,		{ .vop_rmdir = zfs_rmdir },
5506219089Spjd	VOPNAME_READDIR,	{ .vop_readdir = zfs_readdir },
5507219089Spjd	VOPNAME_SYMLINK,	{ .vop_symlink = zfs_symlink },
5508219089Spjd	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
5509219089Spjd	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5510219089Spjd	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5511219089Spjd	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
5512219089Spjd	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5513219089Spjd	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
5514219089Spjd	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
5515219089Spjd	VOPNAME_VNEVENT, 	{ .vop_vnevent = fs_vnevent_support },
5516219089Spjd	NULL,			NULL
5517219089Spjd};
5518219089Spjd
5519219089Spjd/*
5520219089Spjd * Regular file vnode operations template
5521219089Spjd */
5522219089Spjdvnodeops_t *zfs_fvnodeops;
5523219089Spjdconst fs_operation_def_t zfs_fvnodeops_template[] = {
5524219089Spjd	VOPNAME_OPEN,		{ .vop_open = zfs_open },
5525219089Spjd	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
5526219089Spjd	VOPNAME_READ,		{ .vop_read = zfs_read },
5527219089Spjd	VOPNAME_WRITE,		{ .vop_write = zfs_write },
5528219089Spjd	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
5529219089Spjd	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5530219089Spjd	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
5531219089Spjd	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5532219089Spjd	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
5533219089Spjd	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
5534219089Spjd	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
5535219089Spjd	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5536219089Spjd	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5537219089Spjd	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
5538219089Spjd	VOPNAME_FRLOCK,		{ .vop_frlock = zfs_frlock },
5539219089Spjd	VOPNAME_SPACE,		{ .vop_space = zfs_space },
5540219089Spjd	VOPNAME_GETPAGE,	{ .vop_getpage = zfs_getpage },
5541219089Spjd	VOPNAME_PUTPAGE,	{ .vop_putpage = zfs_putpage },
5542219089Spjd	VOPNAME_MAP,		{ .vop_map = zfs_map },
5543219089Spjd	VOPNAME_ADDMAP,		{ .vop_addmap = zfs_addmap },
5544219089Spjd	VOPNAME_DELMAP,		{ .vop_delmap = zfs_delmap },
5545219089Spjd	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5546219089Spjd	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
5547219089Spjd	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
5548219089Spjd	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5549219089Spjd	VOPNAME_REQZCBUF, 	{ .vop_reqzcbuf = zfs_reqzcbuf },
5550219089Spjd	VOPNAME_RETZCBUF, 	{ .vop_retzcbuf = zfs_retzcbuf },
5551219089Spjd	NULL,			NULL
5552219089Spjd};
5553219089Spjd
5554219089Spjd/*
5555219089Spjd * Symbolic link vnode operations template
5556219089Spjd */
5557219089Spjdvnodeops_t *zfs_symvnodeops;
5558219089Spjdconst fs_operation_def_t zfs_symvnodeops_template[] = {
5559219089Spjd	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5560219089Spjd	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
5561219089Spjd	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5562219089Spjd	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
5563219089Spjd	VOPNAME_READLINK,	{ .vop_readlink = zfs_readlink },
5564219089Spjd	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5565219089Spjd	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5566219089Spjd	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5567219089Spjd	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5568219089Spjd	NULL,			NULL
5569219089Spjd};
5570219089Spjd
5571219089Spjd/*
5572219089Spjd * special share hidden files vnode operations template
5573219089Spjd */
5574219089Spjdvnodeops_t *zfs_sharevnodeops;
5575219089Spjdconst fs_operation_def_t zfs_sharevnodeops_template[] = {
5576219089Spjd	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5577219089Spjd	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5578219089Spjd	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5579219089Spjd	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5580219089Spjd	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5581219089Spjd	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
5582219089Spjd	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
5583219089Spjd	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5584219089Spjd	NULL,			NULL
5585219089Spjd};
5586219089Spjd
5587219089Spjd/*
5588219089Spjd * Extended attribute directory vnode operations template
5589251631Sdelphij *
5590251631Sdelphij * This template is identical to the directory vnodes
5591251631Sdelphij * operation template except for restricted operations:
5592251631Sdelphij *	VOP_MKDIR()
5593251631Sdelphij *	VOP_SYMLINK()
5594251631Sdelphij *
5595219089Spjd * Note that there are other restrictions embedded in:
5596219089Spjd *	zfs_create()	- restrict type to VREG
5597219089Spjd *	zfs_link()	- no links into/out of attribute space
5598219089Spjd *	zfs_rename()	- no moves into/out of attribute space
5599219089Spjd */
5600219089Spjdvnodeops_t *zfs_xdvnodeops;
5601219089Spjdconst fs_operation_def_t zfs_xdvnodeops_template[] = {
5602219089Spjd	VOPNAME_OPEN,		{ .vop_open = zfs_open },
5603219089Spjd	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
5604219089Spjd	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
5605219089Spjd	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5606219089Spjd	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
5607219089Spjd	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5608219089Spjd	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
5609219089Spjd	VOPNAME_CREATE,		{ .vop_create = zfs_create },
5610219089Spjd	VOPNAME_REMOVE,		{ .vop_remove = zfs_remove },
5611219089Spjd	VOPNAME_LINK,		{ .vop_link = zfs_link },
5612219089Spjd	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
5613219089Spjd	VOPNAME_MKDIR,		{ .error = zfs_inval },
5614219089Spjd	VOPNAME_RMDIR,		{ .vop_rmdir = zfs_rmdir },
5615219089Spjd	VOPNAME_READDIR,	{ .vop_readdir = zfs_readdir },
5616219089Spjd	VOPNAME_SYMLINK,	{ .error = zfs_inval },
5617219089Spjd	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
5618219089Spjd	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5619219089Spjd	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5620219089Spjd	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
5621219089Spjd	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5622219089Spjd	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
5623219089Spjd	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
5624219089Spjd	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5625219089Spjd	NULL,			NULL
5626219089Spjd};
5627219089Spjd
5628219089Spjd/*
5629219089Spjd * Error vnode operations template
5630219089Spjd */
5631219089Spjdvnodeops_t *zfs_evnodeops;
5632219089Spjdconst fs_operation_def_t zfs_evnodeops_template[] = {
5633219089Spjd	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5634219089Spjd	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5635219089Spjd	NULL,			NULL
5636219089Spjd};
5637219089Spjd#endif	/* sun */
5638219089Spjd
5639219089Spjdstatic int
5640213673Spjdioflags(int ioflags)
5641213673Spjd{
5642213673Spjd	int flags = 0;
5643213673Spjd
5644213673Spjd	if (ioflags & IO_APPEND)
5645213673Spjd		flags |= FAPPEND;
5646213673Spjd	if (ioflags & IO_NDELAY)
5647213673Spjd        	flags |= FNONBLOCK;
5648213673Spjd	if (ioflags & IO_SYNC)
5649213673Spjd		flags |= (FSYNC | FDSYNC | FRSYNC);
5650213673Spjd
5651213673Spjd	return (flags);
5652213673Spjd}
5653213673Spjd
/*
 * Page-in handler: fill the pages of m[] that are kept for this request
 * with file data read through dmu_read().
 *
 *	vp	- vnode being paged in.
 *	m	- array of pages supplied by the caller.
 *	count	- turned into a page count via OFF_TO_IDX(round_page(count)).
 *	reqpage	- index in m[] of the page that must be returned.
 *
 * Pages of m[] outside the window [reqstart, reqstart + reqsize) are freed.
 * Returns zfs_vm_pagerret_ok on success (including the case where the
 * requested page already holds valid data), zfs_vm_pagerret_bad when the
 * requested page starts at or beyond the object's size, and
 * zfs_vm_pagerret_error when dmu_read() fails.
 */
5654213673Spjdstatic int
5655213937Savgzfs_getpages(struct vnode *vp, vm_page_t *m, int count, int reqpage)
5656213937Savg{
5657213937Savg	znode_t *zp = VTOZ(vp);
5658213937Savg	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5659213937Savg	objset_t *os = zp->z_zfsvfs->z_os;
5660243517Savg	vm_page_t mfirst, mlast, mreq;
5661213937Savg	vm_object_t object;
5662213937Savg	caddr_t va;
5663213937Savg	struct sf_buf *sf;
5664243517Savg	off_t startoff, endoff;
5665213937Savg	int i, error;
5666243517Savg	vm_pindex_t reqstart, reqend;
5667243517Savg	int pcount, lsize, reqsize, size;
5668213937Savg
5669213937Savg	ZFS_ENTER(zfsvfs);
5670213937Savg	ZFS_VERIFY_ZP(zp);
5671213937Savg
5672243517Savg	pcount = OFF_TO_IDX(round_page(count));
5673213937Savg	mreq = m[reqpage];
5674213937Savg	object = mreq->object;
5675213937Savg	error = 0;
5676213937Savg
5677213937Savg	KASSERT(vp->v_object == object, ("mismatching object"));
5678213937Savg
	/*
	 * With several pages and a block size above the page size, the
	 * window is the z_blksz-aligned range around the requested page,
	 * clipped to the pages present in m[].  Otherwise only the
	 * requested page is processed.
	 */
5679243517Savg	if (pcount > 1 && zp->z_blksz > PAGESIZE) {
5680243517Savg		startoff = rounddown(IDX_TO_OFF(mreq->pindex), zp->z_blksz);
5681243517Savg		reqstart = OFF_TO_IDX(round_page(startoff));
5682243517Savg		if (reqstart < m[0]->pindex)
5683243517Savg			reqstart = 0;
5684243517Savg		else
5685243517Savg			reqstart = reqstart - m[0]->pindex;
5686243517Savg		endoff = roundup(IDX_TO_OFF(mreq->pindex) + PAGE_SIZE,
5687243517Savg		    zp->z_blksz);
5688243517Savg		reqend = OFF_TO_IDX(trunc_page(endoff)) - 1;
5689243517Savg		if (reqend > m[pcount - 1]->pindex)
5690243517Savg			reqend = m[pcount - 1]->pindex;
5691243517Savg		reqsize = reqend - m[reqstart]->pindex + 1;
5692243517Savg		KASSERT(reqstart <= reqpage && reqpage < reqstart + reqsize,
5693243517Savg		    ("reqpage beyond [reqstart, reqstart + reqsize[ bounds"));
5694243517Savg	} else {
5695243517Savg		reqstart = reqpage;
5696243517Savg		reqsize = 1;
5697243517Savg	}
5698243517Savg	mfirst = m[reqstart];
5699243517Savg	mlast = m[reqstart + reqsize - 1];
5700243517Savg
5701248084Sattilio	zfs_vmobject_wlock(object);
5702213937Savg
	/* Free the pages that fall before and after the window. */
5703243517Savg	for (i = 0; i < reqstart; i++) {
5704243517Savg		vm_page_lock(m[i]);
5705243517Savg		vm_page_free(m[i]);
5706243517Savg		vm_page_unlock(m[i]);
5707213937Savg	}
5708243517Savg	for (i = reqstart + reqsize; i < pcount; i++) {
5709243517Savg		vm_page_lock(m[i]);
5710243517Savg		vm_page_free(m[i]);
5711243517Savg		vm_page_unlock(m[i]);
5712243517Savg	}
5713213937Savg
	/*
	 * Single-page request whose page already has valid bits: zero the
	 * invalid parts if it is only partially valid and return without
	 * reading.
	 */
5714243517Savg	if (mreq->valid && reqsize == 1) {
5715213937Savg		if (mreq->valid != VM_PAGE_BITS_ALL)
5716213937Savg			vm_page_zero_invalid(mreq, TRUE);
5717248084Sattilio		zfs_vmobject_wunlock(object);
5718213937Savg		ZFS_EXIT(zfsvfs);
5719248084Sattilio		return (zfs_vm_pagerret_ok);
5720213937Savg	}
5721213937Savg
5722213937Savg	PCPU_INC(cnt.v_vnodein);
5723243517Savg	PCPU_ADD(cnt.v_vnodepgsin, reqsize);
5724213937Savg
	/*
	 * The requested page starts at or past the object's size: free
	 * every other page of the window and report a bad request.
	 */
5725213937Savg	if (IDX_TO_OFF(mreq->pindex) >= object->un_pager.vnp.vnp_size) {
5726243517Savg		for (i = reqstart; i < reqstart + reqsize; i++) {
5727243517Savg			if (i != reqpage) {
5728243517Savg				vm_page_lock(m[i]);
5729243517Savg				vm_page_free(m[i]);
5730243517Savg				vm_page_unlock(m[i]);
5731243517Savg			}
5732243517Savg		}
5733248084Sattilio		zfs_vmobject_wunlock(object);
5734213937Savg		ZFS_EXIT(zfsvfs);
5735248084Sattilio		return (zfs_vm_pagerret_bad);
5736213937Savg	}
5737213937Savg
	/*
	 * lsize is the number of bytes to read into the last page of the
	 * window; it is shortened when that page extends past the object's
	 * size.
	 */
5738243517Savg	lsize = PAGE_SIZE;
5739243517Savg	if (IDX_TO_OFF(mlast->pindex) + lsize > object->un_pager.vnp.vnp_size)
5740243517Savg		lsize = object->un_pager.vnp.vnp_size - IDX_TO_OFF(mlast->pindex);
5741213937Savg
	/* The object lock is dropped around the dmu_read() calls. */
5742248084Sattilio	zfs_vmobject_wunlock(object);
5743243517Savg
5744243517Savg	for (i = reqstart; i < reqstart + reqsize; i++) {
5745243517Savg		size = PAGE_SIZE;
5746243517Savg		if (i == (reqstart + reqsize - 1))
5747243517Savg			size = lsize;
5748243517Savg		va = zfs_map_page(m[i], &sf);
5749243517Savg		error = dmu_read(os, zp->z_id, IDX_TO_OFF(m[i]->pindex),
5750243517Savg		    size, va, DMU_READ_PREFETCH);
		/* Zero the part of the page that was not read. */
5751243517Savg		if (size != PAGE_SIZE)
5752243517Savg			bzero(va + size, PAGE_SIZE - size);
5753243517Savg		zfs_unmap_page(sf);
5754243517Savg		if (error != 0)
5755243517Savg			break;
5756243517Savg	}
5757243517Savg
5758248084Sattilio	zfs_vmobject_wlock(object);
5759213937Savg
	/*
	 * Pages are marked fully valid only when every read succeeded.
	 * Every window page except the requested one is handed to
	 * vm_page_readahead_finish().
	 */
5760243517Savg	for (i = reqstart; i < reqstart + reqsize; i++) {
5761243763Savg		if (!error)
5762243763Savg			m[i]->valid = VM_PAGE_BITS_ALL;
5763243517Savg		KASSERT(m[i]->dirty == 0, ("zfs_getpages: page %p is dirty", m[i]));
5764243763Savg		if (i != reqpage)
5765243763Savg			vm_page_readahead_finish(m[i]);
5766243517Savg	}
5767243517Savg
5768248084Sattilio	zfs_vmobject_wunlock(object);
5769213937Savg
5770213937Savg	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
5771213937Savg	ZFS_EXIT(zfsvfs);
5772248084Sattilio	return (error ? zfs_vm_pagerret_error : zfs_vm_pagerret_ok);
5773213937Savg}
5774213937Savg
5775213937Savgstatic int
5776213937Savgzfs_freebsd_getpages(ap)
5777213937Savg	struct vop_getpages_args /* {
5778213937Savg		struct vnode *a_vp;
5779213937Savg		vm_page_t *a_m;
5780213937Savg		int a_count;
5781213937Savg		int a_reqpage;
5782213937Savg		vm_ooffset_t a_offset;
5783213937Savg	} */ *ap;
5784213937Savg{
5785213937Savg
5786213937Savg	return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_reqpage));
5787213937Savg}
5788213937Savg
5789213937Savgstatic int
5790243518Savgzfs_freebsd_bmap(ap)
5791243518Savg	struct vop_bmap_args /* {
5792243518Savg		struct vnode *a_vp;
5793243518Savg		daddr_t  a_bn;
5794243518Savg		struct bufobj **a_bop;
5795243518Savg		daddr_t *a_bnp;
5796243518Savg		int *a_runp;
5797243518Savg		int *a_runb;
5798243518Savg	} */ *ap;
5799243518Savg{
5800243518Savg
5801243518Savg	if (ap->a_bop != NULL)
5802243518Savg		*ap->a_bop = &ap->a_vp->v_bufobj;
5803243518Savg	if (ap->a_bnp != NULL)
5804243518Savg		*ap->a_bnp = ap->a_bn;
5805243518Savg	if (ap->a_runp != NULL)
5806243518Savg		*ap->a_runp = 0;
5807243518Savg	if (ap->a_runb != NULL)
5808243518Savg		*ap->a_runb = 0;
5809243518Savg
5810243518Savg	return (0);
5811243518Savg}
5812243518Savg
5813243518Savgstatic int
5814168962Spjdzfs_freebsd_open(ap)
5815168962Spjd	struct vop_open_args /* {
5816168962Spjd		struct vnode *a_vp;
5817168962Spjd		int a_mode;
5818168962Spjd		struct ucred *a_cred;
5819168962Spjd		struct thread *a_td;
5820168962Spjd	} */ *ap;
5821168962Spjd{
5822168962Spjd	vnode_t	*vp = ap->a_vp;
5823168962Spjd	znode_t *zp = VTOZ(vp);
5824168962Spjd	int error;
5825168962Spjd
5826185029Spjd	error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
5827168962Spjd	if (error == 0)
5828219089Spjd		vnode_create_vobject(vp, zp->z_size, ap->a_td);
5829168962Spjd	return (error);
5830168962Spjd}
5831168962Spjd
5832168962Spjdstatic int
5833168962Spjdzfs_freebsd_close(ap)
5834168962Spjd	struct vop_close_args /* {
5835168962Spjd		struct vnode *a_vp;
5836168962Spjd		int  a_fflag;
5837168962Spjd		struct ucred *a_cred;
5838168962Spjd		struct thread *a_td;
5839168962Spjd	} */ *ap;
5840168962Spjd{
5841168962Spjd
5842242566Savg	return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL));
5843168962Spjd}
5844168962Spjd
5845168962Spjdstatic int
5846168962Spjdzfs_freebsd_ioctl(ap)
5847168962Spjd	struct vop_ioctl_args /* {
5848168962Spjd		struct vnode *a_vp;
5849168962Spjd		u_long a_command;
5850168962Spjd		caddr_t a_data;
5851168962Spjd		int a_fflag;
5852168962Spjd		struct ucred *cred;
5853168962Spjd		struct thread *td;
5854168962Spjd	} */ *ap;
5855168962Spjd{
5856168962Spjd
5857168978Spjd	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
5858185029Spjd	    ap->a_fflag, ap->a_cred, NULL, NULL));
5859168962Spjd}
5860168962Spjd
5861168962Spjdstatic int
5862168962Spjdzfs_freebsd_read(ap)
5863168962Spjd	struct vop_read_args /* {
5864168962Spjd		struct vnode *a_vp;
5865168962Spjd		struct uio *a_uio;
5866168962Spjd		int a_ioflag;
5867168962Spjd		struct ucred *a_cred;
5868168962Spjd	} */ *ap;
5869168962Spjd{
5870168962Spjd
5871213673Spjd	return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
5872213673Spjd	    ap->a_cred, NULL));
5873168962Spjd}
5874168962Spjd
5875168962Spjdstatic int
5876168962Spjdzfs_freebsd_write(ap)
5877168962Spjd	struct vop_write_args /* {
5878168962Spjd		struct vnode *a_vp;
5879168962Spjd		struct uio *a_uio;
5880168962Spjd		int a_ioflag;
5881168962Spjd		struct ucred *a_cred;
5882168962Spjd	} */ *ap;
5883168962Spjd{
5884168962Spjd
5885213673Spjd	return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
5886213673Spjd	    ap->a_cred, NULL));
5887168962Spjd}
5888168962Spjd
5889168962Spjdstatic int
5890168962Spjdzfs_freebsd_access(ap)
5891168962Spjd	struct vop_access_args /* {
5892168962Spjd		struct vnode *a_vp;
5893192689Strasz		accmode_t a_accmode;
5894168962Spjd		struct ucred *a_cred;
5895168962Spjd		struct thread *a_td;
5896168962Spjd	} */ *ap;
5897168962Spjd{
5898212002Sjh	vnode_t *vp = ap->a_vp;
5899212002Sjh	znode_t *zp = VTOZ(vp);
5900198703Spjd	accmode_t accmode;
5901198703Spjd	int error = 0;
5902168962Spjd
5903185172Spjd	/*
5904198703Spjd	 * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND,
5905185172Spjd	 */
5906198703Spjd	accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
5907198703Spjd	if (accmode != 0)
5908198703Spjd		error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL);
5909185172Spjd
5910198703Spjd	/*
5911198703Spjd	 * VADMIN has to be handled by vaccess().
5912198703Spjd	 */
5913198703Spjd	if (error == 0) {
5914198703Spjd		accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
5915198703Spjd		if (accmode != 0) {
5916219089Spjd			error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
5917219089Spjd			    zp->z_gid, accmode, ap->a_cred, NULL);
5918198703Spjd		}
5919185172Spjd	}
5920185172Spjd
5921212002Sjh	/*
5922212002Sjh	 * For VEXEC, ensure that at least one execute bit is set for
5923212002Sjh	 * non-directories.
5924212002Sjh	 */
5925212002Sjh	if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
5926219089Spjd	    (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
5927212002Sjh		error = EACCES;
5928219089Spjd	}
5929212002Sjh
5930198703Spjd	return (error);
5931168962Spjd}
5932168962Spjd
5933168962Spjdstatic int
5934168962Spjdzfs_freebsd_lookup(ap)
5935168962Spjd	struct vop_lookup_args /* {
5936168962Spjd		struct vnode *a_dvp;
5937168962Spjd		struct vnode **a_vpp;
5938168962Spjd		struct componentname *a_cnp;
5939168962Spjd	} */ *ap;
5940168962Spjd{
5941168962Spjd	struct componentname *cnp = ap->a_cnp;
5942168962Spjd	char nm[NAME_MAX + 1];
5943168962Spjd
5944168962Spjd	ASSERT(cnp->cn_namelen < sizeof(nm));
5945168962Spjd	strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
5946168962Spjd
5947168962Spjd	return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
5948185029Spjd	    cnp->cn_cred, cnp->cn_thread, 0));
5949168962Spjd}
5950168962Spjd
5951168962Spjdstatic int
5952168962Spjdzfs_freebsd_create(ap)
5953168962Spjd	struct vop_create_args /* {
5954168962Spjd		struct vnode *a_dvp;
5955168962Spjd		struct vnode **a_vpp;
5956168962Spjd		struct componentname *a_cnp;
5957168962Spjd		struct vattr *a_vap;
5958168962Spjd	} */ *ap;
5959168962Spjd{
5960168962Spjd	struct componentname *cnp = ap->a_cnp;
5961168962Spjd	vattr_t *vap = ap->a_vap;
5962168962Spjd	int mode;
5963168962Spjd
5964168962Spjd	ASSERT(cnp->cn_flags & SAVENAME);
5965168962Spjd
5966168962Spjd	vattr_init_mask(vap);
5967168962Spjd	mode = vap->va_mode & ALLPERMS;
5968168962Spjd
5969168962Spjd	return (zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
5970185029Spjd	    ap->a_vpp, cnp->cn_cred, cnp->cn_thread));
5971168962Spjd}
5972168962Spjd
5973168962Spjdstatic int
5974168962Spjdzfs_freebsd_remove(ap)
5975168962Spjd	struct vop_remove_args /* {
5976168962Spjd		struct vnode *a_dvp;
5977168962Spjd		struct vnode *a_vp;
5978168962Spjd		struct componentname *a_cnp;
5979168962Spjd	} */ *ap;
5980168962Spjd{
5981168962Spjd
5982168962Spjd	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
5983168962Spjd
5984168962Spjd	return (zfs_remove(ap->a_dvp, ap->a_cnp->cn_nameptr,
5985185029Spjd	    ap->a_cnp->cn_cred, NULL, 0));
5986168962Spjd}
5987168962Spjd
5988168962Spjdstatic int
5989168962Spjdzfs_freebsd_mkdir(ap)
5990168962Spjd	struct vop_mkdir_args /* {
5991168962Spjd		struct vnode *a_dvp;
5992168962Spjd		struct vnode **a_vpp;
5993168962Spjd		struct componentname *a_cnp;
5994168962Spjd		struct vattr *a_vap;
5995168962Spjd	} */ *ap;
5996168962Spjd{
5997168962Spjd	vattr_t *vap = ap->a_vap;
5998168962Spjd
5999168962Spjd	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
6000168962Spjd
6001168962Spjd	vattr_init_mask(vap);
6002168962Spjd
6003168962Spjd	return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
6004185029Spjd	    ap->a_cnp->cn_cred, NULL, 0, NULL));
6005168962Spjd}
6006168962Spjd
6007168962Spjdstatic int
6008168962Spjdzfs_freebsd_rmdir(ap)
6009168962Spjd	struct vop_rmdir_args /* {
6010168962Spjd		struct vnode *a_dvp;
6011168962Spjd		struct vnode *a_vp;
6012168962Spjd		struct componentname *a_cnp;
6013168962Spjd	} */ *ap;
6014168962Spjd{
6015168962Spjd	struct componentname *cnp = ap->a_cnp;
6016168962Spjd
6017168962Spjd	ASSERT(cnp->cn_flags & SAVENAME);
6018168962Spjd
6019185029Spjd	return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred, NULL, 0));
6020168962Spjd}
6021168962Spjd
6022168962Spjdstatic int
6023168962Spjdzfs_freebsd_readdir(ap)
6024168962Spjd	struct vop_readdir_args /* {
6025168962Spjd		struct vnode *a_vp;
6026168962Spjd		struct uio *a_uio;
6027168962Spjd		struct ucred *a_cred;
6028168962Spjd		int *a_eofflag;
6029168962Spjd		int *a_ncookies;
6030168962Spjd		u_long **a_cookies;
6031168962Spjd	} */ *ap;
6032168962Spjd{
6033168962Spjd
6034168962Spjd	return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
6035168962Spjd	    ap->a_ncookies, ap->a_cookies));
6036168962Spjd}
6037168962Spjd
6038168962Spjdstatic int
6039168962Spjdzfs_freebsd_fsync(ap)
6040168962Spjd	struct vop_fsync_args /* {
6041168962Spjd		struct vnode *a_vp;
6042168962Spjd		int a_waitfor;
6043168962Spjd		struct thread *a_td;
6044168962Spjd	} */ *ap;
6045168962Spjd{
6046168962Spjd
6047168962Spjd	vop_stdfsync(ap);
6048185029Spjd	return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL));
6049168962Spjd}
6050168962Spjd
6051168962Spjdstatic int
6052168962Spjdzfs_freebsd_getattr(ap)
6053168962Spjd	struct vop_getattr_args /* {
6054168962Spjd		struct vnode *a_vp;
6055168962Spjd		struct vattr *a_vap;
6056168962Spjd		struct ucred *a_cred;
6057168962Spjd	} */ *ap;
6058168962Spjd{
6059185029Spjd	vattr_t *vap = ap->a_vap;
6060185029Spjd	xvattr_t xvap;
6061185029Spjd	u_long fflags = 0;
6062185029Spjd	int error;
6063168962Spjd
6064185029Spjd	xva_init(&xvap);
6065185029Spjd	xvap.xva_vattr = *vap;
6066185029Spjd	xvap.xva_vattr.va_mask |= AT_XVATTR;
6067185029Spjd
6068185029Spjd	/* Convert chflags into ZFS-type flags. */
6069185029Spjd	/* XXX: what about SF_SETTABLE?. */
6070185029Spjd	XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
6071185029Spjd	XVA_SET_REQ(&xvap, XAT_APPENDONLY);
6072185029Spjd	XVA_SET_REQ(&xvap, XAT_NOUNLINK);
6073185029Spjd	XVA_SET_REQ(&xvap, XAT_NODUMP);
6074254627Sken	XVA_SET_REQ(&xvap, XAT_READONLY);
6075254627Sken	XVA_SET_REQ(&xvap, XAT_ARCHIVE);
6076254627Sken	XVA_SET_REQ(&xvap, XAT_SYSTEM);
6077254627Sken	XVA_SET_REQ(&xvap, XAT_HIDDEN);
6078254627Sken	XVA_SET_REQ(&xvap, XAT_REPARSE);
6079254627Sken	XVA_SET_REQ(&xvap, XAT_OFFLINE);
6080254627Sken	XVA_SET_REQ(&xvap, XAT_SPARSE);
6081254627Sken
6082185029Spjd	error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
6083185029Spjd	if (error != 0)
6084185029Spjd		return (error);
6085185029Spjd
6086185029Spjd	/* Convert ZFS xattr into chflags. */
6087185029Spjd#define	FLAG_CHECK(fflag, xflag, xfield)	do {			\
6088185029Spjd	if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0)		\
6089185029Spjd		fflags |= (fflag);					\
6090185029Spjd} while (0)
6091185029Spjd	FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
6092185029Spjd	    xvap.xva_xoptattrs.xoa_immutable);
6093185029Spjd	FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
6094185029Spjd	    xvap.xva_xoptattrs.xoa_appendonly);
6095185029Spjd	FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
6096185029Spjd	    xvap.xva_xoptattrs.xoa_nounlink);
6097254627Sken	FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE,
6098254627Sken	    xvap.xva_xoptattrs.xoa_archive);
6099185029Spjd	FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
6100185029Spjd	    xvap.xva_xoptattrs.xoa_nodump);
6101254627Sken	FLAG_CHECK(UF_READONLY, XAT_READONLY,
6102254627Sken	    xvap.xva_xoptattrs.xoa_readonly);
6103254627Sken	FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM,
6104254627Sken	    xvap.xva_xoptattrs.xoa_system);
6105254627Sken	FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN,
6106254627Sken	    xvap.xva_xoptattrs.xoa_hidden);
6107254627Sken	FLAG_CHECK(UF_REPARSE, XAT_REPARSE,
6108254627Sken	    xvap.xva_xoptattrs.xoa_reparse);
6109254627Sken	FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE,
6110254627Sken	    xvap.xva_xoptattrs.xoa_offline);
6111254627Sken	FLAG_CHECK(UF_SPARSE, XAT_SPARSE,
6112254627Sken	    xvap.xva_xoptattrs.xoa_sparse);
6113254627Sken
6114185029Spjd#undef	FLAG_CHECK
6115185029Spjd	*vap = xvap.xva_vattr;
6116185029Spjd	vap->va_flags = fflags;
6117185029Spjd	return (0);
6118168962Spjd}
6119168962Spjd
6120168962Spjdstatic int
6121168962Spjdzfs_freebsd_setattr(ap)
6122168962Spjd	struct vop_setattr_args /* {
6123168962Spjd		struct vnode *a_vp;
6124168962Spjd		struct vattr *a_vap;
6125168962Spjd		struct ucred *a_cred;
6126168962Spjd	} */ *ap;
6127168962Spjd{
6128185172Spjd	vnode_t *vp = ap->a_vp;
6129168962Spjd	vattr_t *vap = ap->a_vap;
6130185172Spjd	cred_t *cred = ap->a_cred;
6131185029Spjd	xvattr_t xvap;
6132185029Spjd	u_long fflags;
6133185029Spjd	uint64_t zflags;
6134168962Spjd
6135168962Spjd	vattr_init_mask(vap);
6136170044Spjd	vap->va_mask &= ~AT_NOSET;
6137168962Spjd
6138185029Spjd	xva_init(&xvap);
6139185029Spjd	xvap.xva_vattr = *vap;
6140185029Spjd
6141219089Spjd	zflags = VTOZ(vp)->z_pflags;
6142185172Spjd
6143185029Spjd	if (vap->va_flags != VNOVAL) {
6144197683Sdelphij		zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
6145185172Spjd		int error;
6146185172Spjd
6147197683Sdelphij		if (zfsvfs->z_use_fuids == B_FALSE)
6148197683Sdelphij			return (EOPNOTSUPP);
6149197683Sdelphij
6150185029Spjd		fflags = vap->va_flags;
6151254627Sken		/*
6152254627Sken		 * XXX KDM
6153254627Sken		 * We need to figure out whether it makes sense to allow
6154254627Sken		 * UF_REPARSE through, since we don't really have other
6155254627Sken		 * facilities to handle reparse points and zfs_setattr()
6156254627Sken		 * doesn't currently allow setting that attribute anyway.
6157254627Sken		 */
6158254627Sken		if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE|
6159254627Sken		     UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE|
6160254627Sken		     UF_OFFLINE|UF_SPARSE)) != 0)
6161185029Spjd			return (EOPNOTSUPP);
6162185172Spjd		/*
6163185172Spjd		 * Unprivileged processes are not permitted to unset system
6164185172Spjd		 * flags, or modify flags if any system flags are set.
6165185172Spjd		 * Privileged non-jail processes may not modify system flags
6166185172Spjd		 * if securelevel > 0 and any existing system flags are set.
6167185172Spjd		 * Privileged jail processes behave like privileged non-jail
6168185172Spjd		 * processes if the security.jail.chflags_allowed sysctl is
6169185172Spjd		 * is non-zero; otherwise, they behave like unprivileged
6170185172Spjd		 * processes.
6171185172Spjd		 */
6172197861Spjd		if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
6173197861Spjd		    priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) {
6174185172Spjd			if (zflags &
6175185172Spjd			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
6176185172Spjd				error = securelevel_gt(cred, 0);
6177197861Spjd				if (error != 0)
6178185172Spjd					return (error);
6179185172Spjd			}
6180185172Spjd		} else {
6181197861Spjd			/*
6182197861Spjd			 * Callers may only modify the file flags on objects they
6183197861Spjd			 * have VADMIN rights for.
6184197861Spjd			 */
6185197861Spjd			if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0)
6186197861Spjd				return (error);
6187185172Spjd			if (zflags &
6188185172Spjd			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
6189185172Spjd				return (EPERM);
6190185172Spjd			}
6191185172Spjd			if (fflags &
6192185172Spjd			    (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
6193185172Spjd				return (EPERM);
6194185172Spjd			}
6195185172Spjd		}
6196185029Spjd
6197185029Spjd#define	FLAG_CHANGE(fflag, zflag, xflag, xfield)	do {		\
6198185029Spjd	if (((fflags & (fflag)) && !(zflags & (zflag))) ||		\
6199185029Spjd	    ((zflags & (zflag)) && !(fflags & (fflag)))) {		\
6200185029Spjd		XVA_SET_REQ(&xvap, (xflag));				\
6201185029Spjd		(xfield) = ((fflags & (fflag)) != 0);			\
6202185029Spjd	}								\
6203185029Spjd} while (0)
6204185029Spjd		/* Convert chflags into ZFS-type flags. */
6205185029Spjd		/* XXX: what about SF_SETTABLE?. */
6206185029Spjd		FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
6207185029Spjd		    xvap.xva_xoptattrs.xoa_immutable);
6208185029Spjd		FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
6209185029Spjd		    xvap.xva_xoptattrs.xoa_appendonly);
6210185029Spjd		FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
6211185029Spjd		    xvap.xva_xoptattrs.xoa_nounlink);
6212254627Sken		FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE,
6213254627Sken		    xvap.xva_xoptattrs.xoa_archive);
6214185029Spjd		FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
6215185172Spjd		    xvap.xva_xoptattrs.xoa_nodump);
6216254627Sken		FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY,
6217254627Sken		    xvap.xva_xoptattrs.xoa_readonly);
6218254627Sken		FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM,
6219254627Sken		    xvap.xva_xoptattrs.xoa_system);
6220254627Sken		FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN,
6221254627Sken		    xvap.xva_xoptattrs.xoa_hidden);
6222254627Sken		FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE,
6223254627Sken		    xvap.xva_xoptattrs.xoa_hidden);
6224254627Sken		FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE,
6225254627Sken		    xvap.xva_xoptattrs.xoa_offline);
6226254627Sken		FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE,
6227254627Sken		    xvap.xva_xoptattrs.xoa_sparse);
6228185029Spjd#undef	FLAG_CHANGE
6229185029Spjd	}
6230185172Spjd	return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL));
6231168962Spjd}
6232168962Spjd
6233168962Spjdstatic int
6234168962Spjdzfs_freebsd_rename(ap)
6235168962Spjd	struct vop_rename_args  /* {
6236168962Spjd		struct vnode *a_fdvp;
6237168962Spjd		struct vnode *a_fvp;
6238168962Spjd		struct componentname *a_fcnp;
6239168962Spjd		struct vnode *a_tdvp;
6240168962Spjd		struct vnode *a_tvp;
6241168962Spjd		struct componentname *a_tcnp;
6242168962Spjd	} */ *ap;
6243168962Spjd{
6244168962Spjd	vnode_t *fdvp = ap->a_fdvp;
6245168962Spjd	vnode_t *fvp = ap->a_fvp;
6246168962Spjd	vnode_t *tdvp = ap->a_tdvp;
6247168962Spjd	vnode_t *tvp = ap->a_tvp;
6248168962Spjd	int error;
6249168962Spjd
6250192237Skmacy	ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
6251192237Skmacy	ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
6252168962Spjd
6253254982Sdelphij	if (fdvp->v_mount == tdvp->v_mount)
6254254982Sdelphij		error = zfs_rename(fdvp, ap->a_fcnp->cn_nameptr, tdvp,
6255254982Sdelphij		    ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred, NULL, 0);
6256254982Sdelphij	else
6257254982Sdelphij		error = EXDEV;
6258168962Spjd
6259168962Spjd	if (tdvp == tvp)
6260168962Spjd		VN_RELE(tdvp);
6261168962Spjd	else
6262168962Spjd		VN_URELE(tdvp);
6263168962Spjd	if (tvp)
6264168962Spjd		VN_URELE(tvp);
6265168962Spjd	VN_RELE(fdvp);
6266168962Spjd	VN_RELE(fvp);
6267168962Spjd
6268168962Spjd	return (error);
6269168962Spjd}
6270168962Spjd
6271168962Spjdstatic int
6272168962Spjdzfs_freebsd_symlink(ap)
6273168962Spjd	struct vop_symlink_args /* {
6274168962Spjd		struct vnode *a_dvp;
6275168962Spjd		struct vnode **a_vpp;
6276168962Spjd		struct componentname *a_cnp;
6277168962Spjd		struct vattr *a_vap;
6278168962Spjd		char *a_target;
6279168962Spjd	} */ *ap;
6280168962Spjd{
6281168962Spjd	struct componentname *cnp = ap->a_cnp;
6282168962Spjd	vattr_t *vap = ap->a_vap;
6283168962Spjd
6284168962Spjd	ASSERT(cnp->cn_flags & SAVENAME);
6285168962Spjd
6286168962Spjd	vap->va_type = VLNK;	/* FreeBSD: Syscall only sets va_mode. */
6287168962Spjd	vattr_init_mask(vap);
6288168962Spjd
6289168962Spjd	return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
6290168962Spjd	    ap->a_target, cnp->cn_cred, cnp->cn_thread));
6291168962Spjd}
6292168962Spjd
6293168962Spjdstatic int
6294168962Spjdzfs_freebsd_readlink(ap)
6295168962Spjd	struct vop_readlink_args /* {
6296168962Spjd		struct vnode *a_vp;
6297168962Spjd		struct uio *a_uio;
6298168962Spjd		struct ucred *a_cred;
6299168962Spjd	} */ *ap;
6300168962Spjd{
6301168962Spjd
6302185029Spjd	return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
6303168962Spjd}
6304168962Spjd
6305168962Spjdstatic int
6306168962Spjdzfs_freebsd_link(ap)
6307168962Spjd	struct vop_link_args /* {
6308168962Spjd		struct vnode *a_tdvp;
6309168962Spjd		struct vnode *a_vp;
6310168962Spjd		struct componentname *a_cnp;
6311168962Spjd	} */ *ap;
6312168962Spjd{
6313168962Spjd	struct componentname *cnp = ap->a_cnp;
6314254982Sdelphij	vnode_t *vp = ap->a_vp;
6315254982Sdelphij	vnode_t *tdvp = ap->a_tdvp;
6316168962Spjd
6317254982Sdelphij	if (tdvp->v_mount != vp->v_mount)
6318254982Sdelphij		return (EXDEV);
6319254982Sdelphij
6320168962Spjd	ASSERT(cnp->cn_flags & SAVENAME);
6321168962Spjd
6322254982Sdelphij	return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
6323168962Spjd}
6324168962Spjd
6325168962Spjdstatic int
6326168962Spjdzfs_freebsd_inactive(ap)
6327169170Spjd	struct vop_inactive_args /* {
6328169170Spjd		struct vnode *a_vp;
6329169170Spjd		struct thread *a_td;
6330169170Spjd	} */ *ap;
6331168962Spjd{
6332168962Spjd	vnode_t *vp = ap->a_vp;
6333168962Spjd
6334185029Spjd	zfs_inactive(vp, ap->a_td->td_ucred, NULL);
6335168962Spjd	return (0);
6336168962Spjd}
6337168962Spjd
6338168962Spjdstatic int
6339168962Spjdzfs_freebsd_reclaim(ap)
6340168962Spjd	struct vop_reclaim_args /* {
6341168962Spjd		struct vnode *a_vp;
6342168962Spjd		struct thread *a_td;
6343168962Spjd	} */ *ap;
6344168962Spjd{
6345169170Spjd	vnode_t	*vp = ap->a_vp;
6346168962Spjd	znode_t	*zp = VTOZ(vp);
6347197133Spjd	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
6348168962Spjd
6349169025Spjd	ASSERT(zp != NULL);
6350169025Spjd
6351243520Savg	/* Destroy the vm object and flush associated pages. */
6352243520Savg	vnode_destroy_vobject(vp);
6353243520Savg
6354168962Spjd	/*
6355243520Savg	 * z_teardown_inactive_lock protects from a race with
6356243520Savg	 * zfs_znode_dmu_fini in zfsvfs_teardown during
6357243520Savg	 * force unmount.
6358168962Spjd	 */
6359243520Savg	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
6360243520Savg	if (zp->z_sa_hdl == NULL)
6361196301Spjd		zfs_znode_free(zp);
6362243520Savg	else
6363243520Savg		zfs_zinactive(zp);
6364243520Savg	rw_exit(&zfsvfs->z_teardown_inactive_lock);
6365185029Spjd
6366168962Spjd	vp->v_data = NULL;
6367168962Spjd	return (0);
6368168962Spjd}
6369168962Spjd
6370168962Spjdstatic int
6371168962Spjdzfs_freebsd_fid(ap)
6372168962Spjd	struct vop_fid_args /* {
6373168962Spjd		struct vnode *a_vp;
6374168962Spjd		struct fid *a_fid;
6375168962Spjd	} */ *ap;
6376168962Spjd{
6377168962Spjd
6378185029Spjd	return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
6379168962Spjd}
6380168962Spjd
6381168962Spjdstatic int
6382168962Spjdzfs_freebsd_pathconf(ap)
6383168962Spjd	struct vop_pathconf_args /* {
6384168962Spjd		struct vnode *a_vp;
6385168962Spjd		int a_name;
6386168962Spjd		register_t *a_retval;
6387168962Spjd	} */ *ap;
6388168962Spjd{
6389168962Spjd	ulong_t val;
6390168962Spjd	int error;
6391168962Spjd
6392185029Spjd	error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL);
6393168962Spjd	if (error == 0)
6394168962Spjd		*ap->a_retval = val;
6395168962Spjd	else if (error == EOPNOTSUPP)
6396168962Spjd		error = vop_stdpathconf(ap);
6397168962Spjd	return (error);
6398168962Spjd}
6399168962Spjd
6400196949Straszstatic int
6401196949Straszzfs_freebsd_fifo_pathconf(ap)
6402196949Strasz	struct vop_pathconf_args /* {
6403196949Strasz		struct vnode *a_vp;
6404196949Strasz		int a_name;
6405196949Strasz		register_t *a_retval;
6406196949Strasz	} */ *ap;
6407196949Strasz{
6408196949Strasz
6409196949Strasz	switch (ap->a_name) {
6410196949Strasz	case _PC_ACL_EXTENDED:
6411196949Strasz	case _PC_ACL_NFS4:
6412196949Strasz	case _PC_ACL_PATH_MAX:
6413196949Strasz	case _PC_MAC_PRESENT:
6414196949Strasz		return (zfs_freebsd_pathconf(ap));
6415196949Strasz	default:
6416196949Strasz		return (fifo_specops.vop_pathconf(ap));
6417196949Strasz	}
6418196949Strasz}
6419196949Strasz
6420185029Spjd/*
6421185029Spjd * FreeBSD's extended attributes namespace defines file name prefix for ZFS'
6422185029Spjd * extended attribute name:
6423185029Spjd *
6424185029Spjd *	NAMESPACE	PREFIX
6425185029Spjd *	system		freebsd:system:
6426185029Spjd *	user		(none, can be used to access ZFS fsattr(5) attributes
6427185029Spjd *			created on Solaris)
6428185029Spjd */
6429185029Spjdstatic int
6430185029Spjdzfs_create_attrname(int attrnamespace, const char *name, char *attrname,
6431185029Spjd    size_t size)
6432185029Spjd{
6433185029Spjd	const char *namespace, *prefix, *suffix;
6434185029Spjd
6435185029Spjd	/* We don't allow '/' character in attribute name. */
6436185029Spjd	if (strchr(name, '/') != NULL)
6437185029Spjd		return (EINVAL);
6438185029Spjd	/* We don't allow attribute names that start with "freebsd:" string. */
6439185029Spjd	if (strncmp(name, "freebsd:", 8) == 0)
6440185029Spjd		return (EINVAL);
6441185029Spjd
6442185029Spjd	bzero(attrname, size);
6443185029Spjd
6444185029Spjd	switch (attrnamespace) {
6445185029Spjd	case EXTATTR_NAMESPACE_USER:
6446185029Spjd#if 0
6447185029Spjd		prefix = "freebsd:";
6448185029Spjd		namespace = EXTATTR_NAMESPACE_USER_STRING;
6449185029Spjd		suffix = ":";
6450185029Spjd#else
6451185029Spjd		/*
6452185029Spjd		 * This is the default namespace by which we can access all
6453185029Spjd		 * attributes created on Solaris.
6454185029Spjd		 */
6455185029Spjd		prefix = namespace = suffix = "";
6456185029Spjd#endif
6457185029Spjd		break;
6458185029Spjd	case EXTATTR_NAMESPACE_SYSTEM:
6459185029Spjd		prefix = "freebsd:";
6460185029Spjd		namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
6461185029Spjd		suffix = ":";
6462185029Spjd		break;
6463185029Spjd	case EXTATTR_NAMESPACE_EMPTY:
6464185029Spjd	default:
6465185029Spjd		return (EINVAL);
6466185029Spjd	}
6467185029Spjd	if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
6468185029Spjd	    name) >= size) {
6469185029Spjd		return (ENAMETOOLONG);
6470185029Spjd	}
6471185029Spjd	return (0);
6472185029Spjd}
6473185029Spjd
6474185029Spjd/*
6475185029Spjd * Vnode operating to retrieve a named extended attribute.
6476185029Spjd */
6477185029Spjdstatic int
6478185029Spjdzfs_getextattr(struct vop_getextattr_args *ap)
6479185029Spjd/*
6480185029Spjdvop_getextattr {
6481185029Spjd	IN struct vnode *a_vp;
6482185029Spjd	IN int a_attrnamespace;
6483185029Spjd	IN const char *a_name;
6484185029Spjd	INOUT struct uio *a_uio;
6485185029Spjd	OUT size_t *a_size;
6486185029Spjd	IN struct ucred *a_cred;
6487185029Spjd	IN struct thread *a_td;
6488185029Spjd};
6489185029Spjd*/
6490185029Spjd{
6491185029Spjd	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
6492185029Spjd	struct thread *td = ap->a_td;
6493185029Spjd	struct nameidata nd;
6494185029Spjd	char attrname[255];
6495185029Spjd	struct vattr va;
6496185029Spjd	vnode_t *xvp = NULL, *vp;
6497185029Spjd	int error, flags;
6498185029Spjd
6499195785Strasz	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
6500195785Strasz	    ap->a_cred, ap->a_td, VREAD);
6501195785Strasz	if (error != 0)
6502195785Strasz		return (error);
6503195785Strasz
6504185029Spjd	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
6505185029Spjd	    sizeof(attrname));
6506185029Spjd	if (error != 0)
6507185029Spjd		return (error);
6508185029Spjd
6509185029Spjd	ZFS_ENTER(zfsvfs);
6510185029Spjd
6511185029Spjd	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
6512185029Spjd	    LOOKUP_XATTR);
6513185029Spjd	if (error != 0) {
6514185029Spjd		ZFS_EXIT(zfsvfs);
6515185029Spjd		return (error);
6516185029Spjd	}
6517185029Spjd
6518185029Spjd	flags = FREAD;
6519241896Skib	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
6520185029Spjd	    xvp, td);
6521194586Skib	error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL);
6522185029Spjd	vp = nd.ni_vp;
6523185029Spjd	NDFREE(&nd, NDF_ONLY_PNBUF);
6524185029Spjd	if (error != 0) {
6525196303Spjd		ZFS_EXIT(zfsvfs);
6526195785Strasz		if (error == ENOENT)
6527195785Strasz			error = ENOATTR;
6528185029Spjd		return (error);
6529185029Spjd	}
6530185029Spjd
6531185029Spjd	if (ap->a_size != NULL) {
6532185029Spjd		error = VOP_GETATTR(vp, &va, ap->a_cred);
6533185029Spjd		if (error == 0)
6534185029Spjd			*ap->a_size = (size_t)va.va_size;
6535185029Spjd	} else if (ap->a_uio != NULL)
6536224605Smm		error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred);
6537185029Spjd
6538185029Spjd	VOP_UNLOCK(vp, 0);
6539185029Spjd	vn_close(vp, flags, ap->a_cred, td);
6540185029Spjd	ZFS_EXIT(zfsvfs);
6541185029Spjd
6542185029Spjd	return (error);
6543185029Spjd}
6544185029Spjd
6545185029Spjd/*
6546185029Spjd * Vnode operation to remove a named attribute.
6547185029Spjd */
6548185029Spjdint
6549185029Spjdzfs_deleteextattr(struct vop_deleteextattr_args *ap)
6550185029Spjd/*
6551185029Spjdvop_deleteextattr {
6552185029Spjd	IN struct vnode *a_vp;
6553185029Spjd	IN int a_attrnamespace;
6554185029Spjd	IN const char *a_name;
6555185029Spjd	IN struct ucred *a_cred;
6556185029Spjd	IN struct thread *a_td;
6557185029Spjd};
6558185029Spjd*/
6559185029Spjd{
6560185029Spjd	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
6561185029Spjd	struct thread *td = ap->a_td;
6562185029Spjd	struct nameidata nd;
6563185029Spjd	char attrname[255];
6564185029Spjd	struct vattr va;
6565185029Spjd	vnode_t *xvp = NULL, *vp;
6566185029Spjd	int error, flags;
6567185029Spjd
6568195785Strasz	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
6569195785Strasz	    ap->a_cred, ap->a_td, VWRITE);
6570195785Strasz	if (error != 0)
6571195785Strasz		return (error);
6572195785Strasz
6573185029Spjd	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
6574185029Spjd	    sizeof(attrname));
6575185029Spjd	if (error != 0)
6576185029Spjd		return (error);
6577185029Spjd
6578185029Spjd	ZFS_ENTER(zfsvfs);
6579185029Spjd
6580185029Spjd	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
6581185029Spjd	    LOOKUP_XATTR);
6582185029Spjd	if (error != 0) {
6583185029Spjd		ZFS_EXIT(zfsvfs);
6584185029Spjd		return (error);
6585185029Spjd	}
6586185029Spjd
6587241896Skib	NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
6588185029Spjd	    UIO_SYSSPACE, attrname, xvp, td);
6589185029Spjd	error = namei(&nd);
6590185029Spjd	vp = nd.ni_vp;
6591185029Spjd	NDFREE(&nd, NDF_ONLY_PNBUF);
6592185029Spjd	if (error != 0) {
6593196303Spjd		ZFS_EXIT(zfsvfs);
6594195785Strasz		if (error == ENOENT)
6595195785Strasz			error = ENOATTR;
6596185029Spjd		return (error);
6597185029Spjd	}
6598185029Spjd	error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
6599185029Spjd
6600185029Spjd	vput(nd.ni_dvp);
6601185029Spjd	if (vp == nd.ni_dvp)
6602185029Spjd		vrele(vp);
6603185029Spjd	else
6604185029Spjd		vput(vp);
6605185029Spjd	ZFS_EXIT(zfsvfs);
6606185029Spjd
6607185029Spjd	return (error);
6608185029Spjd}
6609185029Spjd
6610185029Spjd/*
6611185029Spjd * Vnode operation to set a named attribute.
6612185029Spjd */
6613185029Spjdstatic int
6614185029Spjdzfs_setextattr(struct vop_setextattr_args *ap)
6615185029Spjd/*
6616185029Spjdvop_setextattr {
6617185029Spjd	IN struct vnode *a_vp;
6618185029Spjd	IN int a_attrnamespace;
6619185029Spjd	IN const char *a_name;
6620185029Spjd	INOUT struct uio *a_uio;
6621185029Spjd	IN struct ucred *a_cred;
6622185029Spjd	IN struct thread *a_td;
6623185029Spjd};
6624185029Spjd*/
6625185029Spjd{
6626185029Spjd	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
6627185029Spjd	struct thread *td = ap->a_td;
6628185029Spjd	struct nameidata nd;
6629185029Spjd	char attrname[255];
6630185029Spjd	struct vattr va;
6631185029Spjd	vnode_t *xvp = NULL, *vp;
6632185029Spjd	int error, flags;
6633185029Spjd
6634195785Strasz	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
6635195785Strasz	    ap->a_cred, ap->a_td, VWRITE);
6636195785Strasz	if (error != 0)
6637195785Strasz		return (error);
6638195785Strasz
6639185029Spjd	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
6640185029Spjd	    sizeof(attrname));
6641185029Spjd	if (error != 0)
6642185029Spjd		return (error);
6643185029Spjd
6644185029Spjd	ZFS_ENTER(zfsvfs);
6645185029Spjd
6646185029Spjd	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
6647195785Strasz	    LOOKUP_XATTR | CREATE_XATTR_DIR);
6648185029Spjd	if (error != 0) {
6649185029Spjd		ZFS_EXIT(zfsvfs);
6650185029Spjd		return (error);
6651185029Spjd	}
6652185029Spjd
6653185029Spjd	flags = FFLAGS(O_WRONLY | O_CREAT);
6654241896Skib	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
6655185029Spjd	    xvp, td);
6656194586Skib	error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL);
6657185029Spjd	vp = nd.ni_vp;
6658185029Spjd	NDFREE(&nd, NDF_ONLY_PNBUF);
6659185029Spjd	if (error != 0) {
6660185029Spjd		ZFS_EXIT(zfsvfs);
6661185029Spjd		return (error);
6662185029Spjd	}
6663185029Spjd
6664185029Spjd	VATTR_NULL(&va);
6665185029Spjd	va.va_size = 0;
6666185029Spjd	error = VOP_SETATTR(vp, &va, ap->a_cred);
6667185029Spjd	if (error == 0)
6668185029Spjd		VOP_WRITE(vp, ap->a_uio, IO_UNIT | IO_SYNC, ap->a_cred);
6669185029Spjd
6670185029Spjd	VOP_UNLOCK(vp, 0);
6671185029Spjd	vn_close(vp, flags, ap->a_cred, td);
6672185029Spjd	ZFS_EXIT(zfsvfs);
6673185029Spjd
6674185029Spjd	return (error);
6675185029Spjd}
6676185029Spjd
6677185029Spjd/*
6678185029Spjd * Vnode operation to retrieve extended attributes on a vnode.
6679185029Spjd */
6680185029Spjdstatic int
6681185029Spjdzfs_listextattr(struct vop_listextattr_args *ap)
6682185029Spjd/*
6683185029Spjdvop_listextattr {
6684185029Spjd	IN struct vnode *a_vp;
6685185029Spjd	IN int a_attrnamespace;
6686185029Spjd	INOUT struct uio *a_uio;
6687185029Spjd	OUT size_t *a_size;
6688185029Spjd	IN struct ucred *a_cred;
6689185029Spjd	IN struct thread *a_td;
6690185029Spjd};
6691185029Spjd*/
6692185029Spjd{
6693185029Spjd	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
6694185029Spjd	struct thread *td = ap->a_td;
6695185029Spjd	struct nameidata nd;
6696185029Spjd	char attrprefix[16];
6697185029Spjd	u_char dirbuf[sizeof(struct dirent)];
6698185029Spjd	struct dirent *dp;
6699185029Spjd	struct iovec aiov;
6700185029Spjd	struct uio auio, *uio = ap->a_uio;
6701185029Spjd	size_t *sizep = ap->a_size;
6702185029Spjd	size_t plen;
6703185029Spjd	vnode_t *xvp = NULL, *vp;
6704185029Spjd	int done, error, eof, pos;
6705185029Spjd
6706195785Strasz	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
6707195785Strasz	    ap->a_cred, ap->a_td, VREAD);
6708196303Spjd	if (error != 0)
6709195785Strasz		return (error);
6710195785Strasz
6711185029Spjd	error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
6712185029Spjd	    sizeof(attrprefix));
6713185029Spjd	if (error != 0)
6714185029Spjd		return (error);
6715185029Spjd	plen = strlen(attrprefix);
6716185029Spjd
6717185029Spjd	ZFS_ENTER(zfsvfs);
6718185029Spjd
6719195822Strasz	if (sizep != NULL)
6720195822Strasz		*sizep = 0;
6721195822Strasz
6722185029Spjd	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
6723185029Spjd	    LOOKUP_XATTR);
6724185029Spjd	if (error != 0) {
6725196303Spjd		ZFS_EXIT(zfsvfs);
6726195785Strasz		/*
6727195785Strasz		 * ENOATTR means that the EA directory does not yet exist,
6728195785Strasz		 * i.e. there are no extended attributes there.
6729195785Strasz		 */
6730195785Strasz		if (error == ENOATTR)
6731195785Strasz			error = 0;
6732185029Spjd		return (error);
6733185029Spjd	}
6734185029Spjd
6735241896Skib	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
6736188588Sjhb	    UIO_SYSSPACE, ".", xvp, td);
6737185029Spjd	error = namei(&nd);
6738185029Spjd	vp = nd.ni_vp;
6739185029Spjd	NDFREE(&nd, NDF_ONLY_PNBUF);
6740185029Spjd	if (error != 0) {
6741185029Spjd		ZFS_EXIT(zfsvfs);
6742185029Spjd		return (error);
6743185029Spjd	}
6744185029Spjd
6745185029Spjd	auio.uio_iov = &aiov;
6746185029Spjd	auio.uio_iovcnt = 1;
6747185029Spjd	auio.uio_segflg = UIO_SYSSPACE;
6748185029Spjd	auio.uio_td = td;
6749185029Spjd	auio.uio_rw = UIO_READ;
6750185029Spjd	auio.uio_offset = 0;
6751185029Spjd
6752185029Spjd	do {
6753185029Spjd		u_char nlen;
6754185029Spjd
6755185029Spjd		aiov.iov_base = (void *)dirbuf;
6756185029Spjd		aiov.iov_len = sizeof(dirbuf);
6757185029Spjd		auio.uio_resid = sizeof(dirbuf);
6758185029Spjd		error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
6759185029Spjd		done = sizeof(dirbuf) - auio.uio_resid;
6760185029Spjd		if (error != 0)
6761185029Spjd			break;
6762185029Spjd		for (pos = 0; pos < done;) {
6763185029Spjd			dp = (struct dirent *)(dirbuf + pos);
6764185029Spjd			pos += dp->d_reclen;
6765185029Spjd			/*
6766185029Spjd			 * XXX: Temporarily we also accept DT_UNKNOWN, as this
6767185029Spjd			 * is what we get when attribute was created on Solaris.
6768185029Spjd			 */
6769185029Spjd			if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
6770185029Spjd				continue;
6771185029Spjd			if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0)
6772185029Spjd				continue;
6773185029Spjd			else if (strncmp(dp->d_name, attrprefix, plen) != 0)
6774185029Spjd				continue;
6775185029Spjd			nlen = dp->d_namlen - plen;
6776185029Spjd			if (sizep != NULL)
6777185029Spjd				*sizep += 1 + nlen;
6778185029Spjd			else if (uio != NULL) {
6779185029Spjd				/*
6780185029Spjd				 * Format of extattr name entry is one byte for
6781185029Spjd				 * length and the rest for name.
6782185029Spjd				 */
6783185029Spjd				error = uiomove(&nlen, 1, uio->uio_rw, uio);
6784185029Spjd				if (error == 0) {
6785185029Spjd					error = uiomove(dp->d_name + plen, nlen,
6786185029Spjd					    uio->uio_rw, uio);
6787185029Spjd				}
6788185029Spjd				if (error != 0)
6789185029Spjd					break;
6790185029Spjd			}
6791185029Spjd		}
6792185029Spjd	} while (!eof && error == 0);
6793185029Spjd
6794185029Spjd	vput(vp);
6795185029Spjd	ZFS_EXIT(zfsvfs);
6796185029Spjd
6797185029Spjd	return (error);
6798185029Spjd}
6799185029Spjd
6800192800Straszint
6801192800Straszzfs_freebsd_getacl(ap)
6802192800Strasz	struct vop_getacl_args /* {
6803192800Strasz		struct vnode *vp;
6804192800Strasz		acl_type_t type;
6805192800Strasz		struct acl *aclp;
6806192800Strasz		struct ucred *cred;
6807192800Strasz		struct thread *td;
6808192800Strasz	} */ *ap;
6809192800Strasz{
6810192800Strasz	int		error;
6811192800Strasz	vsecattr_t      vsecattr;
6812192800Strasz
6813192800Strasz	if (ap->a_type != ACL_TYPE_NFS4)
6814197435Strasz		return (EINVAL);
6815192800Strasz
6816192800Strasz	vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
6817192800Strasz	if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL))
6818192800Strasz		return (error);
6819192800Strasz
6820192800Strasz	error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt);
6821196303Spjd	if (vsecattr.vsa_aclentp != NULL)
6822196303Spjd		kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);
6823192800Strasz
6824196303Spjd	return (error);
6825192800Strasz}
6826192800Strasz
6827192800Straszint
6828192800Straszzfs_freebsd_setacl(ap)
6829192800Strasz	struct vop_setacl_args /* {
6830192800Strasz		struct vnode *vp;
6831192800Strasz		acl_type_t type;
6832192800Strasz		struct acl *aclp;
6833192800Strasz		struct ucred *cred;
6834192800Strasz		struct thread *td;
6835192800Strasz	} */ *ap;
6836192800Strasz{
6837192800Strasz	int		error;
6838192800Strasz	vsecattr_t      vsecattr;
6839192800Strasz	int		aclbsize;	/* size of acl list in bytes */
6840192800Strasz	aclent_t	*aaclp;
6841192800Strasz
6842192800Strasz	if (ap->a_type != ACL_TYPE_NFS4)
6843197435Strasz		return (EINVAL);
6844192800Strasz
6845192800Strasz	if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
6846192800Strasz		return (EINVAL);
6847192800Strasz
6848192800Strasz	/*
6849196949Strasz	 * With NFSv4 ACLs, chmod(2) may need to add additional entries,
6850192800Strasz	 * splitting every entry into two and appending "canonical six"
6851192800Strasz	 * entries at the end.  Don't allow for setting an ACL that would
6852192800Strasz	 * cause chmod(2) to run out of ACL entries.
6853192800Strasz	 */
6854192800Strasz	if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
6855192800Strasz		return (ENOSPC);
6856192800Strasz
6857208030Strasz	error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
6858208030Strasz	if (error != 0)
6859208030Strasz		return (error);
6860208030Strasz
6861192800Strasz	vsecattr.vsa_mask = VSA_ACE;
6862192800Strasz	aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t);
6863192800Strasz	vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
6864192800Strasz	aaclp = vsecattr.vsa_aclentp;
6865192800Strasz	vsecattr.vsa_aclentsz = aclbsize;
6866192800Strasz
6867192800Strasz	aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
6868192800Strasz	error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL);
6869192800Strasz	kmem_free(aaclp, aclbsize);
6870192800Strasz
6871192800Strasz	return (error);
6872192800Strasz}
6873192800Strasz
/*
 * ACL check vnode operation.
 *
 * Not implemented here: the arguments are ignored and EOPNOTSUPP is always
 * returned.
 */
int
zfs_freebsd_aclcheck(struct vop_aclcheck_args *ap)
/*
vop_aclcheck {
	struct vnode *a_vp;
	acl_type_t a_type;
	struct acl *a_aclp;
	struct ucred *a_cred;
	struct thread *a_td;
};
*/
{

	return (EOPNOTSUPP);
}
6887192800Strasz
6888168404Spjdstruct vop_vector zfs_vnodeops;
6889168404Spjdstruct vop_vector zfs_fifoops;
6890209962Smmstruct vop_vector zfs_shareops;
6891168404Spjd
6892168404Spjdstruct vop_vector zfs_vnodeops = {
6893185029Spjd	.vop_default =		&default_vnodeops,
6894185029Spjd	.vop_inactive =		zfs_freebsd_inactive,
6895185029Spjd	.vop_reclaim =		zfs_freebsd_reclaim,
6896185029Spjd	.vop_access =		zfs_freebsd_access,
6897168404Spjd#ifdef FREEBSD_NAMECACHE
6898185029Spjd	.vop_lookup =		vfs_cache_lookup,
6899185029Spjd	.vop_cachedlookup =	zfs_freebsd_lookup,
6900168404Spjd#else
6901185029Spjd	.vop_lookup =		zfs_freebsd_lookup,
6902168404Spjd#endif
6903185029Spjd	.vop_getattr =		zfs_freebsd_getattr,
6904185029Spjd	.vop_setattr =		zfs_freebsd_setattr,
6905185029Spjd	.vop_create =		zfs_freebsd_create,
6906185029Spjd	.vop_mknod =		zfs_freebsd_create,
6907185029Spjd	.vop_mkdir =		zfs_freebsd_mkdir,
6908185029Spjd	.vop_readdir =		zfs_freebsd_readdir,
6909185029Spjd	.vop_fsync =		zfs_freebsd_fsync,
6910185029Spjd	.vop_open =		zfs_freebsd_open,
6911185029Spjd	.vop_close =		zfs_freebsd_close,
6912185029Spjd	.vop_rmdir =		zfs_freebsd_rmdir,
6913185029Spjd	.vop_ioctl =		zfs_freebsd_ioctl,
6914185029Spjd	.vop_link =		zfs_freebsd_link,
6915185029Spjd	.vop_symlink =		zfs_freebsd_symlink,
6916185029Spjd	.vop_readlink =		zfs_freebsd_readlink,
6917185029Spjd	.vop_read =		zfs_freebsd_read,
6918185029Spjd	.vop_write =		zfs_freebsd_write,
6919185029Spjd	.vop_remove =		zfs_freebsd_remove,
6920185029Spjd	.vop_rename =		zfs_freebsd_rename,
6921185029Spjd	.vop_pathconf =		zfs_freebsd_pathconf,
6922243518Savg	.vop_bmap =		zfs_freebsd_bmap,
6923185029Spjd	.vop_fid =		zfs_freebsd_fid,
6924185029Spjd	.vop_getextattr =	zfs_getextattr,
6925185029Spjd	.vop_deleteextattr =	zfs_deleteextattr,
6926185029Spjd	.vop_setextattr =	zfs_setextattr,
6927185029Spjd	.vop_listextattr =	zfs_listextattr,
6928192800Strasz	.vop_getacl =		zfs_freebsd_getacl,
6929192800Strasz	.vop_setacl =		zfs_freebsd_setacl,
6930192800Strasz	.vop_aclcheck =		zfs_freebsd_aclcheck,
6931213937Savg	.vop_getpages =		zfs_freebsd_getpages,
6932168404Spjd};
6933168404Spjd
6934169170Spjdstruct vop_vector zfs_fifoops = {
6935185029Spjd	.vop_default =		&fifo_specops,
6936200162Skib	.vop_fsync =		zfs_freebsd_fsync,
6937185029Spjd	.vop_access =		zfs_freebsd_access,
6938185029Spjd	.vop_getattr =		zfs_freebsd_getattr,
6939185029Spjd	.vop_inactive =		zfs_freebsd_inactive,
6940185029Spjd	.vop_read =		VOP_PANIC,
6941185029Spjd	.vop_reclaim =		zfs_freebsd_reclaim,
6942185029Spjd	.vop_setattr =		zfs_freebsd_setattr,
6943185029Spjd	.vop_write =		VOP_PANIC,
6944196949Strasz	.vop_pathconf = 	zfs_freebsd_fifo_pathconf,
6945185029Spjd	.vop_fid =		zfs_freebsd_fid,
6946192800Strasz	.vop_getacl =		zfs_freebsd_getacl,
6947192800Strasz	.vop_setacl =		zfs_freebsd_setacl,
6948192800Strasz	.vop_aclcheck =		zfs_freebsd_aclcheck,
6949168404Spjd};
6950209962Smm
6951209962Smm/*
6952209962Smm * special share hidden files vnode operations template
6953209962Smm */
6954209962Smmstruct vop_vector zfs_shareops = {
6955209962Smm	.vop_default =		&default_vnodeops,
6956209962Smm	.vop_access =		zfs_freebsd_access,
6957209962Smm	.vop_inactive =		zfs_freebsd_inactive,
6958209962Smm	.vop_reclaim =		zfs_freebsd_reclaim,
6959209962Smm	.vop_fid =		zfs_freebsd_fid,
6960209962Smm	.vop_pathconf =		zfs_freebsd_pathconf,
6961209962Smm};
6962