1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
24 * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
25 * Copyright (c) 2014 Integros [integros.com]
26 */
27
28/* Portions Copyright 2007 Jeremy Teo */
29/* Portions Copyright 2010 Robert Milkowski */
30
31#include <sys/types.h>
32#include <sys/param.h>
33#include <sys/time.h>
34#include <sys/systm.h>
35#include <sys/sysmacros.h>
36#include <sys/resource.h>
37#include <sys/vfs.h>
38#include <sys/vm.h>
39#include <sys/vnode.h>
40#include <sys/file.h>
41#include <sys/stat.h>
42#include <sys/kmem.h>
43#include <sys/taskq.h>
44#include <sys/uio.h>
45#include <sys/atomic.h>
46#include <sys/namei.h>
47#include <sys/mman.h>
48#include <sys/cmn_err.h>
49#include <sys/errno.h>
50#include <sys/unistd.h>
51#include <sys/zfs_dir.h>
52#include <sys/zfs_ioctl.h>
53#include <sys/fs/zfs.h>
54#include <sys/dmu.h>
55#include <sys/dmu_objset.h>
56#include <sys/spa.h>
57#include <sys/txg.h>
58#include <sys/dbuf.h>
59#include <sys/zap.h>
60#include <sys/sa.h>
61#include <sys/dirent.h>
62#include <sys/policy.h>
63#include <sys/sunddi.h>
64#include <sys/filio.h>
65#include <sys/sid.h>
66#include <sys/zfs_ctldir.h>
67#include <sys/zfs_fuid.h>
68#include <sys/zfs_sa.h>
69#include <sys/dnlc.h>
70#include <sys/zfs_rlock.h>
71#include <sys/buf.h>
72#include <sys/sched.h>
73#include <sys/acl.h>
74#include <sys/extdirent.h>
75
76#ifdef __FreeBSD__
77#include <sys/kidmap.h>
78#include <sys/bio.h>
79#include <vm/vm_param.h>
80#endif
81
82#ifdef __NetBSD__
83#include <dev/mm.h>
84#include <miscfs/fifofs/fifo.h>
85#include <miscfs/genfs/genfs.h>
86#include <miscfs/genfs/genfs_node.h>
87#include <uvm/uvm_extern.h>
88#include <sys/fstrans.h>
89#include <sys/malloc.h>
90
91uint_t zfs_putpage_key;
92#endif
93
94/*
95 * Programming rules.
96 *
97 * Each vnode op performs some logical unit of work.  To do this, the ZPL must
98 * properly lock its in-core state, create a DMU transaction, do the work,
99 * record this work in the intent log (ZIL), commit the DMU transaction,
100 * and wait for the intent log to commit if it is a synchronous operation.
101 * Moreover, the vnode ops must work in both normal and log replay context.
102 * The ordering of events is important to avoid deadlocks and references
103 * to freed memory.  The example below illustrates the following Big Rules:
104 *
105 *  (1)	A check must be made in each zfs thread for a mounted file system.
106 *	This is done avoiding races using ZFS_ENTER(zfsvfs).
107 *	A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
108 *	must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
109 *	can return EIO from the calling function.
110 *
111 *  (2)	VN_RELE() should always be the last thing except for zil_commit()
112 *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
113 *	First, if it's the last reference, the vnode/znode
114 *	can be freed, so the zp may point to freed memory.  Second, the last
115 *	reference will call zfs_zinactive(), which may induce a lot of work --
116 *	pushing cached pages (which acquires range locks) and syncing out
117 *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
118 *	which could deadlock the system if you were already holding one.
119 *	If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
120 *
121 *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
122 *	as they can span dmu_tx_assign() calls.
123 *
124 *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
125 *      dmu_tx_assign().  This is critical because we don't want to block
126 *      while holding locks.
127 *
128 *	If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT.  This
129 *	reduces lock contention and CPU usage when we must wait (note that if
130 *	throughput is constrained by the storage, nearly every transaction
131 *	must wait).
132 *
133 *      Note, in particular, that if a lock is sometimes acquired before
134 *      the tx assigns, and sometimes after (e.g. z_lock), then failing
135 *      to use a non-blocking assign can deadlock the system.  The scenario:
136 *
137 *	Thread A has grabbed a lock before calling dmu_tx_assign().
138 *	Thread B is in an already-assigned tx, and blocks for this lock.
139 *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
140 *	forever, because the previous txg can't quiesce until B's tx commits.
141 *
142 *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
143 *	then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
144 *	calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT,
145 *	to indicate that this operation has already called dmu_tx_wait().
146 *	This will ensure that we don't retry forever, waiting a short bit
147 *	each time.
148 *
149 *  (5)	If the operation succeeded, generate the intent log entry for it
150 *	before dropping locks.  This ensures that the ordering of events
151 *	in the intent log matches the order in which they actually occurred.
152 *	During ZIL replay the zfs_log_* functions will update the sequence
153 *	number to indicate the zil transaction has replayed.
154 *
155 *  (6)	At the end of each vnode op, the DMU tx must always commit,
156 *	regardless of whether there were any errors.
157 *
158 *  (7)	After dropping all locks, invoke zil_commit(zilog, foid)
159 *	to ensure that synchronous semantics are provided when necessary.
160 *
161 * In general, this is how things should be ordered in each vnode op:
162 *
163 *	ZFS_ENTER(zfsvfs);		// exit if unmounted
164 * top:
165 *	zfs_dirent_lookup(&dl, ...)	// lock directory entry (may VN_HOLD())
166 *	rw_enter(...);			// grab any other locks you need
167 *	tx = dmu_tx_create(...);	// get DMU tx
168 *	dmu_tx_hold_*();		// hold each object you might modify
169 *	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
170 *	if (error) {
171 *		rw_exit(...);		// drop locks
172 *		zfs_dirent_unlock(dl);	// unlock directory entry
173 *		VN_RELE(...);		// release held vnodes
174 *		if (error == ERESTART) {
175 *			waited = B_TRUE;
176 *			dmu_tx_wait(tx);
177 *			dmu_tx_abort(tx);
178 *			goto top;
179 *		}
180 *		dmu_tx_abort(tx);	// abort DMU tx
181 *		ZFS_EXIT(zfsvfs);	// finished in zfs
182 *		return (error);		// really out of space
183 *	}
184 *	error = do_real_work();		// do whatever this VOP does
185 *	if (error == 0)
186 *		zfs_log_*(...);		// on success, make ZIL entry
187 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
188 *	rw_exit(...);			// drop locks
189 *	zfs_dirent_unlock(dl);		// unlock directory entry
190 *	VN_RELE(...);			// release held vnodes
191 *	zil_commit(zilog, foid);	// synchronous when necessary
192 *	ZFS_EXIT(zfsvfs);		// finished in zfs
193 *	return (error);			// done, report error
194 */
195
196/* ARGSUSED */
197static int
198zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
199{
200	znode_t	*zp = VTOZ(*vpp);
201	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
202
203	ZFS_ENTER(zfsvfs);
204	ZFS_VERIFY_ZP(zp);
205
206	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
207	    ((flag & FAPPEND) == 0)) {
208		ZFS_EXIT(zfsvfs);
209		return (SET_ERROR(EPERM));
210	}
211
212	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
213	    ZTOV(zp)->v_type == VREG &&
214	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
215		if (fs_vscan(*vpp, cr, 0) != 0) {
216			ZFS_EXIT(zfsvfs);
217			return (SET_ERROR(EACCES));
218		}
219	}
220
221	/* Keep a count of the synchronous opens in the znode */
222	if (flag & (FSYNC | FDSYNC))
223		atomic_inc_32(&zp->z_sync_cnt);
224
225	ZFS_EXIT(zfsvfs);
226	return (0);
227}
228
229/* ARGSUSED */
230static int
231zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
232    caller_context_t *ct)
233{
234	znode_t	*zp = VTOZ(vp);
235	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
236
237	/*
238	 * Clean up any locks held by this process on the vp.
239	 */
240	cleanlocks(vp, ddi_get_pid(), 0);
241	cleanshares(vp, ddi_get_pid());
242
243	ZFS_ENTER(zfsvfs);
244	ZFS_VERIFY_ZP(zp);
245
246	/* Decrement the synchronous opens in the znode */
247	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
248		atomic_dec_32(&zp->z_sync_cnt);
249
250	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
251	    ZTOV(zp)->v_type == VREG &&
252	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
253		VERIFY(fs_vscan(vp, cr, 1) == 0);
254
255	ZFS_EXIT(zfsvfs);
256	return (0);
257}
258
259/*
260 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
261 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
262 */
263static int
264zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
265{
266	znode_t	*zp = VTOZ(vp);
267	uint64_t noff = (uint64_t)*off; /* new offset */
268	uint64_t file_sz;
269	int error;
270	boolean_t hole;
271
272	file_sz = zp->z_size;
273	if (noff >= file_sz)  {
274		return (SET_ERROR(ENXIO));
275	}
276
277	if (cmd == _FIO_SEEK_HOLE)
278		hole = B_TRUE;
279	else
280		hole = B_FALSE;
281
282	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
283
284	if (error == ESRCH)
285		return (SET_ERROR(ENXIO));
286
287	/*
288	 * We could find a hole that begins after the logical end-of-file,
289	 * because dmu_offset_next() only works on whole blocks.  If the
290	 * EOF falls mid-block, then indicate that the "virtual hole"
291	 * at the end of the file begins at the logical EOF, rather than
292	 * at the end of the last block.
293	 */
294	if (noff > file_sz) {
295		ASSERT(hole);
296		noff = file_sz;
297	}
298
299	if (noff < *off)
300		return (error);
301	*off = noff;
302	return (error);
303}
304
305/* ARGSUSED */
306static int
307zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
308    int *rvalp, caller_context_t *ct)
309{
310	offset_t off;
311	offset_t ndata;
312	dmu_object_info_t doi;
313	int error;
314	zfsvfs_t *zfsvfs;
315	znode_t *zp;
316
317	switch (com) {
318	case _FIOFFS:
319	{
320		return (0);
321
322		/*
323		 * The following two ioctls are used by bfu.  Faking out,
324		 * necessary to avoid bfu errors.
325		 */
326	}
327	case _FIOGDIO:
328	case _FIOSDIO:
329	{
330		return (0);
331	}
332
333	case _FIO_SEEK_DATA:
334	case _FIO_SEEK_HOLE:
335	{
336#ifdef illumos
337		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
338			return (SET_ERROR(EFAULT));
339#else
340		off = *(offset_t *)data;
341#endif
342		zp = VTOZ(vp);
343		zfsvfs = zp->z_zfsvfs;
344		ZFS_ENTER(zfsvfs);
345		ZFS_VERIFY_ZP(zp);
346
347		/* offset parameter is in/out */
348		error = zfs_holey(vp, com, &off);
349		ZFS_EXIT(zfsvfs);
350		if (error)
351			return (error);
352#ifdef illumos
353		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
354			return (SET_ERROR(EFAULT));
355#else
356		*(offset_t *)data = off;
357#endif
358		return (0);
359	}
360#ifdef illumos
361	case _FIO_COUNT_FILLED:
362	{
363		/*
364		 * _FIO_COUNT_FILLED adds a new ioctl command which
365		 * exposes the number of filled blocks in a
366		 * ZFS object.
367		 */
368		zp = VTOZ(vp);
369		zfsvfs = zp->z_zfsvfs;
370		ZFS_ENTER(zfsvfs);
371		ZFS_VERIFY_ZP(zp);
372
373		/*
374		 * Wait for all dirty blocks for this object
375		 * to get synced out to disk, and the DMU info
376		 * updated.
377		 */
378		error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
379		if (error) {
380			ZFS_EXIT(zfsvfs);
381			return (error);
382		}
383
384		/*
385		 * Retrieve fill count from DMU object.
386		 */
387		error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
388		if (error) {
389			ZFS_EXIT(zfsvfs);
390			return (error);
391		}
392
393		ndata = doi.doi_fill_count;
394
395		ZFS_EXIT(zfsvfs);
396		if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
397			return (SET_ERROR(EFAULT));
398		return (0);
399	}
400#endif
401	}
402	return (SET_ERROR(ENOTTY));
403}
404
405#ifdef __FreeBSD__
406static vm_page_t
407page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
408{
409	vm_object_t obj;
410	vm_page_t pp;
411	int64_t end;
412
413	/*
414	 * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
415	 * aligned boundaries, if the range is not aligned.  As a result a
416	 * DEV_BSIZE subrange with partially dirty data may get marked as clean.
417	 * It may happen that all DEV_BSIZE subranges are marked clean and thus
418	 * the whole page would be considred clean despite have some dirty data.
419	 * For this reason we should shrink the range to DEV_BSIZE aligned
420	 * boundaries before calling vm_page_clear_dirty.
421	 */
422	end = rounddown2(off + nbytes, DEV_BSIZE);
423	off = roundup2(off, DEV_BSIZE);
424	nbytes = end - off;
425
426	obj = vp->v_object;
427	zfs_vmobject_assert_wlocked(obj);
428
429	for (;;) {
430		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
431		    pp->valid) {
432			if (vm_page_xbusied(pp)) {
433				/*
434				 * Reference the page before unlocking and
435				 * sleeping so that the page daemon is less
436				 * likely to reclaim it.
437				 */
438				vm_page_reference(pp);
439				vm_page_lock(pp);
440				zfs_vmobject_wunlock(obj);
441				vm_page_busy_sleep(pp, "zfsmwb", true);
442				zfs_vmobject_wlock(obj);
443				continue;
444			}
445			vm_page_sbusy(pp);
446		} else if (pp != NULL) {
447			ASSERT(!pp->valid);
448			pp = NULL;
449		}
450
451		if (pp != NULL) {
452			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
453			vm_object_pip_add(obj, 1);
454			pmap_remove_write(pp);
455			if (nbytes != 0)
456				vm_page_clear_dirty(pp, off, nbytes);
457		}
458		break;
459	}
460	return (pp);
461}
462
463static void
464page_unbusy(vm_page_t pp)
465{
466
467	vm_page_sunbusy(pp);
468	vm_object_pip_subtract(pp->object, 1);
469}
470
471static vm_page_t
472page_hold(vnode_t *vp, int64_t start)
473{
474	vm_object_t obj;
475	vm_page_t pp;
476
477	obj = vp->v_object;
478	zfs_vmobject_assert_wlocked(obj);
479
480	for (;;) {
481		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
482		    pp->valid) {
483			if (vm_page_xbusied(pp)) {
484				/*
485				 * Reference the page before unlocking and
486				 * sleeping so that the page daemon is less
487				 * likely to reclaim it.
488				 */
489				vm_page_reference(pp);
490				vm_page_lock(pp);
491				zfs_vmobject_wunlock(obj);
492				vm_page_busy_sleep(pp, "zfsmwb", true);
493				zfs_vmobject_wlock(obj);
494				continue;
495			}
496
497			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
498			vm_page_lock(pp);
499			vm_page_hold(pp);
500			vm_page_unlock(pp);
501
502		} else
503			pp = NULL;
504		break;
505	}
506	return (pp);
507}
508
509static void
510page_unhold(vm_page_t pp)
511{
512
513	vm_page_lock(pp);
514	vm_page_unhold(pp);
515	vm_page_unlock(pp);
516}
517
518/*
519 * When a file is memory mapped, we must keep the IO data synchronized
520 * between the DMU cache and the memory mapped pages.  What this means:
521 *
522 * On Write:	If we find a memory mapped page, we write to *both*
523 *		the page and the dmu buffer.
524 */
525static void
526update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
527    int segflg, dmu_tx_t *tx)
528{
529	vm_object_t obj;
530	struct sf_buf *sf;
531	caddr_t va;
532	int off;
533
534	ASSERT(segflg != UIO_NOCOPY);
535	ASSERT(vp->v_mount != NULL);
536	obj = vp->v_object;
537	ASSERT(obj != NULL);
538
539	off = start & PAGEOFFSET;
540	zfs_vmobject_wlock(obj);
541	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
542		vm_page_t pp;
543		int nbytes = imin(PAGESIZE - off, len);
544
545		if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
546			zfs_vmobject_wunlock(obj);
547
548			va = zfs_map_page(pp, &sf);
549			(void) dmu_read(os, oid, start+off, nbytes,
550			    va+off, DMU_READ_PREFETCH);;
551			zfs_unmap_page(sf);
552
553			zfs_vmobject_wlock(obj);
554			page_unbusy(pp);
555		}
556		len -= nbytes;
557		off = 0;
558	}
559	vm_object_pip_wakeupn(obj, 0);
560	zfs_vmobject_wunlock(obj);
561}
562
563/*
564 * Read with UIO_NOCOPY flag means that sendfile(2) requests
565 * ZFS to populate a range of page cache pages with data.
566 *
567 * NOTE: this function could be optimized to pre-allocate
568 * all pages in advance, drain exclusive busy on all of them,
569 * map them into contiguous KVA region and populate them
570 * in one single dmu_read() call.
571 */
572static int
573mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
574{
575	znode_t *zp = VTOZ(vp);
576	objset_t *os = zp->z_zfsvfs->z_os;
577	struct sf_buf *sf;
578	vm_object_t obj;
579	vm_page_t pp;
580	int64_t start;
581	caddr_t va;
582	int len = nbytes;
583	int off;
584	int error = 0;
585
586	ASSERT(uio->uio_segflg == UIO_NOCOPY);
587	ASSERT(vp->v_mount != NULL);
588	obj = vp->v_object;
589	ASSERT(obj != NULL);
590	ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);
591
592	zfs_vmobject_wlock(obj);
593	for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
594		int bytes = MIN(PAGESIZE, len);
595
596		pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY |
597		    VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
598		if (pp->valid == 0) {
599			zfs_vmobject_wunlock(obj);
600			va = zfs_map_page(pp, &sf);
601			error = dmu_read(os, zp->z_id, start, bytes, va,
602			    DMU_READ_PREFETCH);
603			if (bytes != PAGESIZE && error == 0)
604				bzero(va + bytes, PAGESIZE - bytes);
605			zfs_unmap_page(sf);
606			zfs_vmobject_wlock(obj);
607			vm_page_sunbusy(pp);
608			vm_page_lock(pp);
609			if (error) {
610				if (pp->wire_count == 0 && pp->valid == 0 &&
611				    !vm_page_busied(pp))
612					vm_page_free(pp);
613			} else {
614				pp->valid = VM_PAGE_BITS_ALL;
615				vm_page_activate(pp);
616			}
617			vm_page_unlock(pp);
618		} else {
619			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
620			vm_page_sunbusy(pp);
621		}
622		if (error)
623			break;
624		uio->uio_resid -= bytes;
625		uio->uio_offset += bytes;
626		len -= bytes;
627	}
628	zfs_vmobject_wunlock(obj);
629	return (error);
630}
631
632/*
633 * When a file is memory mapped, we must keep the IO data synchronized
634 * between the DMU cache and the memory mapped pages.  What this means:
635 *
636 * On Read:	We "read" preferentially from memory mapped pages,
637 *		else we default from the dmu buffer.
638 *
639 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
640 *	 the file is memory mapped.
641 */
642static int
643mappedread(vnode_t *vp, int nbytes, uio_t *uio)
644{
645	znode_t *zp = VTOZ(vp);
646	vm_object_t obj;
647	int64_t start;
648	caddr_t va;
649	int len = nbytes;
650	int off;
651	int error = 0;
652
653	ASSERT(vp->v_mount != NULL);
654	obj = vp->v_object;
655	ASSERT(obj != NULL);
656
657	start = uio->uio_loffset;
658	off = start & PAGEOFFSET;
659	zfs_vmobject_wlock(obj);
660	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
661		vm_page_t pp;
662		uint64_t bytes = MIN(PAGESIZE - off, len);
663
664		if (pp = page_hold(vp, start)) {
665			struct sf_buf *sf;
666			caddr_t va;
667
668			zfs_vmobject_wunlock(obj);
669			va = zfs_map_page(pp, &sf);
670#ifdef illumos
671			error = uiomove(va + off, bytes, UIO_READ, uio);
672#else
673			error = vn_io_fault_uiomove(va + off, bytes, uio);
674#endif
675			zfs_unmap_page(sf);
676			zfs_vmobject_wlock(obj);
677			page_unhold(pp);
678		} else {
679			zfs_vmobject_wunlock(obj);
680			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
681			    uio, bytes);
682			zfs_vmobject_wlock(obj);
683		}
684		len -= bytes;
685		off = 0;
686		if (error)
687			break;
688	}
689	zfs_vmobject_wunlock(obj);
690	return (error);
691}
692#endif /* __FreeBSD__ */
693
694#ifdef __NetBSD__
695
696caddr_t
697zfs_map_page(page_t *pp, enum seg_rw rw)
698{
699	vaddr_t va;
700	int flags;
701
702#ifdef __HAVE_MM_MD_DIRECT_MAPPED_PHYS
703	if (mm_md_direct_mapped_phys(VM_PAGE_TO_PHYS(pp), &va))
704		return (caddr_t)va;
705#endif
706
707	flags = UVMPAGER_MAPIN_WAITOK |
708		(rw == S_READ ? UVMPAGER_MAPIN_WRITE : UVMPAGER_MAPIN_READ);
709	va = uvm_pagermapin(&pp, 1, flags);
710	return (caddr_t)va;
711}
712
713void
714zfs_unmap_page(page_t *pp, caddr_t addr)
715{
716
717#ifdef __HAVE_MM_MD_DIRECT_MAPPED_PHYS
718	vaddr_t va;
719
720	if (mm_md_direct_mapped_phys(VM_PAGE_TO_PHYS(pp), &va))
721		return;
722#endif
723	uvm_pagermapout((vaddr_t)addr, 1);
724}
725
726static int
727mappedread(vnode_t *vp, int nbytes, uio_t *uio)
728{
729	znode_t *zp = VTOZ(vp);
730	struct uvm_object *uobj = &vp->v_uobj;
731	krwlock_t *rw = uobj->vmobjlock;
732	int64_t start;
733	caddr_t va;
734	size_t len = nbytes;
735	int off;
736	int error = 0;
737	int npages, found;
738
739	start = uio->uio_loffset;
740	off = start & PAGEOFFSET;
741
742	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
743		page_t *pp;
744		uint64_t bytes = MIN(PAGESIZE - off, len);
745
746		pp = NULL;
747		npages = 1;
748		rw_enter(rw, RW_WRITER);
749		found = uvn_findpages(uobj, start, &npages, &pp, NULL,
750		    UFP_NOALLOC);
751		rw_exit(rw);
752
753		/* XXXNETBSD shouldn't access userspace with the page busy */
754		if (found) {
755			va = zfs_map_page(pp, S_READ);
756			error = uiomove(va + off, bytes, UIO_READ, uio);
757			zfs_unmap_page(pp, va);
758			rw_enter(rw, RW_WRITER);
759			uvm_page_unbusy(&pp, 1);
760			rw_exit(rw);
761		} else {
762			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
763			    uio, bytes);
764		}
765
766		len -= bytes;
767		off = 0;
768		if (error)
769			break;
770	}
771	return (error);
772}
773
774static void
775update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
776    int segflg, dmu_tx_t *tx)
777{
778	struct uvm_object *uobj = &vp->v_uobj;
779	krwlock_t *rw = uobj->vmobjlock;
780	caddr_t va;
781	int off, status;
782
783	ASSERT(vp->v_mount != NULL);
784
785	rw_enter(rw, RW_WRITER);
786
787	off = start & PAGEOFFSET;
788	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
789		page_t *pp;
790		int nbytes = MIN(PAGESIZE - off, len);
791		int npages, found;
792
793		pp = NULL;
794		npages = 1;
795		found = uvn_findpages(uobj, start, &npages, &pp, NULL,
796		    UFP_NOALLOC);
797		if (found) {
798			/*
799			 * We're about to zap the page's contents and don't
800			 * care about any existing modifications.  We must
801			 * keep track of any new modifications past this
802			 * point.  Clear the modified bit in the pmap, and
803			 * if the page is marked dirty revert to tracking
804			 * the modified bit.
805			 */
806			switch (uvm_pagegetdirty(pp)) {
807			case UVM_PAGE_STATUS_DIRTY:
808				/* Does pmap_clear_modify(). */
809				uvm_pagemarkdirty(pp, UVM_PAGE_STATUS_UNKNOWN);
810				break;
811			case UVM_PAGE_STATUS_UNKNOWN:
812				pmap_clear_modify(pp);
813				break;
814			case UVM_PAGE_STATUS_CLEAN:
815				/* Nothing to do. */
816				break;
817			}
818			rw_exit(rw);
819
820			va = zfs_map_page(pp, S_WRITE);
821			(void) dmu_read(os, oid, start + off, nbytes,
822			    va + off, DMU_READ_PREFETCH);
823			zfs_unmap_page(pp, va);
824
825			rw_enter(rw, RW_WRITER);
826			uvm_page_unbusy(&pp, 1);
827		}
828		len -= nbytes;
829		off = 0;
830	}
831	rw_exit(rw);
832}
833#endif /* __NetBSD__ */
834
835offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
836
837/*
838 * Read bytes from specified file into supplied buffer.
839 *
840 *	IN:	vp	- vnode of file to be read from.
841 *		uio	- structure supplying read location, range info,
842 *			  and return buffer.
843 *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
844 *		cr	- credentials of caller.
845 *		ct	- caller context
846 *
847 *	OUT:	uio	- updated offset and range, buffer filled.
848 *
849 *	RETURN:	0 on success, error code on failure.
850 *
851 * Side Effects:
852 *	vp - atime updated if byte count > 0
853 */
854/* ARGSUSED */
855static int
856zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
857{
858	znode_t		*zp = VTOZ(vp);
859	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
860	ssize_t		n, nbytes;
861	int		error = 0;
862	rl_t		*rl;
863	xuio_t		*xuio = NULL;
864
865	ZFS_ENTER(zfsvfs);
866	ZFS_VERIFY_ZP(zp);
867
868	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
869		ZFS_EXIT(zfsvfs);
870		return (SET_ERROR(EACCES));
871	}
872
873	/*
874	 * Validate file offset
875	 */
876	if (uio->uio_loffset < (offset_t)0) {
877		ZFS_EXIT(zfsvfs);
878		return (SET_ERROR(EINVAL));
879	}
880
881	/*
882	 * Fasttrack empty reads
883	 */
884	if (uio->uio_resid == 0) {
885		ZFS_EXIT(zfsvfs);
886		return (0);
887	}
888
889	/*
890	 * Check for mandatory locks
891	 */
892	if (MANDMODE(zp->z_mode)) {
893		if (error = chklock(vp, FREAD,
894		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
895			ZFS_EXIT(zfsvfs);
896			return (error);
897		}
898	}
899
900	/*
901	 * If we're in FRSYNC mode, sync out this znode before reading it.
902	 */
903	if (zfsvfs->z_log &&
904	    (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
905		zil_commit(zfsvfs->z_log, zp->z_id);
906
907	/*
908	 * Lock the range against changes.
909	 */
910	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
911
912	/*
913	 * If we are reading past end-of-file we can skip
914	 * to the end; but we might still need to set atime.
915	 */
916	if (uio->uio_loffset >= zp->z_size) {
917		error = 0;
918		goto out;
919	}
920
921	ASSERT(uio->uio_loffset < zp->z_size);
922	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
923
924#ifdef illumos
925	if ((uio->uio_extflg == UIO_XUIO) &&
926	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
927		int nblk;
928		int blksz = zp->z_blksz;
929		uint64_t offset = uio->uio_loffset;
930
931		xuio = (xuio_t *)uio;
932		if ((ISP2(blksz))) {
933			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
934			    blksz)) / blksz;
935		} else {
936			ASSERT(offset + n <= blksz);
937			nblk = 1;
938		}
939		(void) dmu_xuio_init(xuio, nblk);
940
941		if (vn_has_cached_data(vp)) {
942			/*
943			 * For simplicity, we always allocate a full buffer
944			 * even if we only expect to read a portion of a block.
945			 */
946			while (--nblk >= 0) {
947				(void) dmu_xuio_add(xuio,
948				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
949				    blksz), 0, blksz);
950			}
951		}
952	}
953#endif	/* illumos */
954
955	while (n > 0) {
956		nbytes = MIN(n, zfs_read_chunk_size -
957		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
958
959#ifdef __FreeBSD__
960		if (uio->uio_segflg == UIO_NOCOPY)
961			error = mappedread_sf(vp, nbytes, uio);
962		else
963#endif /* __FreeBSD__ */
964		if (vn_has_cached_data(vp)) {
965			error = mappedread(vp, nbytes, uio);
966		} else {
967			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
968			    uio, nbytes);
969		}
970		if (error) {
971			/* convert checksum errors into IO errors */
972			if (error == ECKSUM)
973				error = SET_ERROR(EIO);
974			break;
975		}
976
977		n -= nbytes;
978	}
979out:
980	zfs_range_unlock(rl);
981
982	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
983	ZFS_EXIT(zfsvfs);
984	return (error);
985}
986
987/*
988 * Write the bytes to a file.
989 *
990 *	IN:	vp	- vnode of file to be written to.
991 *		uio	- structure supplying write location, range info,
992 *			  and data buffer.
993 *		ioflag	- FAPPEND, FSYNC, and/or FDSYNC.  FAPPEND is
994 *			  set if in append mode.
995 *		cr	- credentials of caller.
996 *		ct	- caller context (NFS/CIFS fem monitor only)
997 *
998 *	OUT:	uio	- updated offset and range.
999 *
1000 *	RETURN:	0 on success, error code on failure.
1001 *
1002 * Timestamps:
1003 *	vp - ctime|mtime updated if byte count > 0
1004 */
1005
1006/* ARGSUSED */
1007static int
1008zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
1009{
1010	znode_t		*zp = VTOZ(vp);
1011	rlim64_t	limit = MAXOFFSET_T;
1012	ssize_t		start_resid = uio->uio_resid;
1013	ssize_t		tx_bytes;
1014	uint64_t	end_size;
1015	dmu_tx_t	*tx;
1016	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
1017	zilog_t		*zilog;
1018	offset_t	woff;
1019	ssize_t		n, nbytes;
1020	rl_t		*rl;
1021	int		max_blksz = zfsvfs->z_max_blksz;
1022	int		error = 0;
1023	arc_buf_t	*abuf;
1024	iovec_t		*aiov = NULL;
1025	xuio_t		*xuio = NULL;
1026	int		i_iov = 0;
1027	int		iovcnt = uio->uio_iovcnt;
1028	iovec_t		*iovp = uio->uio_iov;
1029	int		write_eof;
1030	int		count = 0;
1031	sa_bulk_attr_t	bulk[4];
1032	uint64_t	mtime[2], ctime[2];
1033	int		segflg;
1034
1035#ifdef __NetBSD__
1036	segflg = VMSPACE_IS_KERNEL_P(uio->uio_vmspace) ?
1037		UIO_SYSSPACE : UIO_USERSPACE;
1038#else
1039	segflg = uio->uio_segflg;
1040#endif
1041
1042	/*
1043	 * Fasttrack empty write
1044	 */
1045	n = start_resid;
1046	if (n == 0)
1047		return (0);
1048
1049	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
1050		limit = MAXOFFSET_T;
1051
1052	ZFS_ENTER(zfsvfs);
1053	ZFS_VERIFY_ZP(zp);
1054
1055	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
1056	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
1057	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
1058	    &zp->z_size, 8);
1059	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
1060	    &zp->z_pflags, 8);
1061
1062	/*
1063	 * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our
1064	 * callers might not be able to detect properly that we are read-only,
1065	 * so check it explicitly here.
1066	 */
1067	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
1068		ZFS_EXIT(zfsvfs);
1069		return (SET_ERROR(EROFS));
1070	}
1071
1072	/*
1073	 * If immutable or not appending then return EPERM
1074	 */
1075	if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
1076	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
1077	    (uio->uio_loffset < zp->z_size))) {
1078		ZFS_EXIT(zfsvfs);
1079		return (SET_ERROR(EPERM));
1080	}
1081
1082	zilog = zfsvfs->z_log;
1083
1084	/*
1085	 * Validate file offset
1086	 */
1087	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
1088	if (woff < 0) {
1089		ZFS_EXIT(zfsvfs);
1090		return (SET_ERROR(EINVAL));
1091	}
1092
1093	/*
1094	 * Check for mandatory locks before calling zfs_range_lock()
1095	 * in order to prevent a deadlock with locks set via fcntl().
1096	 */
1097	if (MANDMODE((mode_t)zp->z_mode) &&
1098	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
1099		ZFS_EXIT(zfsvfs);
1100		return (error);
1101	}
1102
1103#ifdef illumos
1104	/*
1105	 * Pre-fault the pages to ensure slow (eg NFS) pages
1106	 * don't hold up txg.
1107	 * Skip this if uio contains loaned arc_buf.
1108	 */
1109	if ((uio->uio_extflg == UIO_XUIO) &&
1110	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
1111		xuio = (xuio_t *)uio;
1112	else
1113		uio_prefaultpages(MIN(n, max_blksz), uio);
1114#endif
1115
1116	/*
1117	 * If in append mode, set the io offset pointer to eof.
1118	 */
1119	if (ioflag & FAPPEND) {
1120		/*
1121		 * Obtain an appending range lock to guarantee file append
1122		 * semantics.  We reset the write offset once we have the lock.
1123		 */
1124		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
1125		woff = rl->r_off;
1126		if (rl->r_len == UINT64_MAX) {
1127			/*
1128			 * We overlocked the file because this write will cause
1129			 * the file block size to increase.
1130			 * Note that zp_size cannot change with this lock held.
1131			 */
1132			woff = zp->z_size;
1133		}
1134		uio->uio_loffset = woff;
1135	} else {
1136		/*
1137		 * Note that if the file block size will change as a result of
1138		 * this write, then this range lock will lock the entire file
1139		 * so that we can re-write the block safely.
1140		 */
1141		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
1142	}
1143
1144#ifdef illumos
1145	if (woff >= limit) {
1146		zfs_range_unlock(rl);
1147		ZFS_EXIT(zfsvfs);
1148		return (SET_ERROR(EFBIG));
1149	}
1150
1151#endif
1152#ifdef __FreeBSD__
1153	if (vn_rlimit_fsize(vp, uio, uio->uio_td)) {
1154		zfs_range_unlock(rl);
1155		ZFS_EXIT(zfsvfs);
1156		return (SET_ERROR(EFBIG));
1157	}
1158#endif
1159#ifdef __NetBSD__
1160	/* XXXNETBSD we might need vn_rlimit_fsize() too here eventually */
1161#endif
1162
1163	if ((woff + n) > limit || woff > (limit - n))
1164		n = limit - woff;
1165
1166	/* Will this write extend the file length? */
1167	write_eof = (woff + n > zp->z_size);
1168
1169	end_size = MAX(zp->z_size, woff + n);
1170
1171	/*
1172	 * Write the file in reasonable size chunks.  Each chunk is written
1173	 * in a separate transaction; this keeps the intent log records small
1174	 * and allows us to do more fine-grained space accounting.
1175	 */
1176	while (n > 0) {
1177		abuf = NULL;
1178		woff = uio->uio_loffset;
1179		if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
1180		    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
1181			if (abuf != NULL)
1182				dmu_return_arcbuf(abuf);
1183			error = SET_ERROR(EDQUOT);
1184			break;
1185		}
1186
1187		if (xuio && abuf == NULL) {
1188			ASSERT(i_iov < iovcnt);
1189			aiov = &iovp[i_iov];
1190			abuf = dmu_xuio_arcbuf(xuio, i_iov);
1191			dmu_xuio_clear(xuio, i_iov);
1192			DTRACE_PROBE3(zfs_cp_write, int, i_iov,
1193			    iovec_t *, aiov, arc_buf_t *, abuf);
1194			ASSERT((aiov->iov_base == abuf->b_data) ||
1195			    ((char *)aiov->iov_base - (char *)abuf->b_data +
1196			    aiov->iov_len == arc_buf_size(abuf)));
1197			i_iov++;
1198		} else if (abuf == NULL && n >= max_blksz &&
1199		    woff >= zp->z_size &&
1200		    P2PHASE(woff, max_blksz) == 0 &&
1201		    zp->z_blksz == max_blksz) {
1202			/*
1203			 * This write covers a full block.  "Borrow" a buffer
1204			 * from the dmu so that we can fill it before we enter
1205			 * a transaction.  This avoids the possibility of
1206			 * holding up the transaction if the data copy hangs
1207			 * up on a pagefault (e.g., from an NFS server mapping).
1208			 */
1209#if defined(illumos) || defined(__NetBSD__)
1210			size_t cbytes;
1211#endif
1212
1213			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
1214			    max_blksz);
1215			ASSERT(abuf != NULL);
1216			ASSERT(arc_buf_size(abuf) == max_blksz);
1217#if defined(illumos) || defined(__NetBSD__)
1218			if (error = uiocopy(abuf->b_data, max_blksz,
1219			    UIO_WRITE, uio, &cbytes)) {
1220				dmu_return_arcbuf(abuf);
1221				break;
1222			}
1223			ASSERT(cbytes == max_blksz);
1224#endif
1225#ifdef __FreeBSD__
1226			ssize_t resid = uio->uio_resid;
1227
1228			error = vn_io_fault_uiomove(abuf->b_data, max_blksz, uio);
1229			if (error != 0) {
1230				uio->uio_offset -= resid - uio->uio_resid;
1231				uio->uio_resid = resid;
1232				dmu_return_arcbuf(abuf);
1233				break;
1234			}
1235#endif
1236		}
1237
1238		/*
1239		 * Start a transaction.
1240		 */
1241		tx = dmu_tx_create(zfsvfs->z_os);
1242		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1243		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
1244		zfs_sa_upgrade_txholds(tx, zp);
1245		error = dmu_tx_assign(tx, TXG_WAIT);
1246		if (error) {
1247			dmu_tx_abort(tx);
1248			if (abuf != NULL)
1249				dmu_return_arcbuf(abuf);
1250			break;
1251		}
1252
1253		/*
1254		 * If zfs_range_lock() over-locked we grow the blocksize
1255		 * and then reduce the lock range.  This will only happen
1256		 * on the first iteration since zfs_range_reduce() will
1257		 * shrink down r_len to the appropriate size.
1258		 */
1259		if (rl->r_len == UINT64_MAX) {
1260			uint64_t new_blksz;
1261
1262			if (zp->z_blksz > max_blksz) {
1263				/*
1264				 * File's blocksize is already larger than the
1265				 * "recordsize" property.  Only let it grow to
1266				 * the next power of 2.
1267				 */
1268				ASSERT(!ISP2(zp->z_blksz));
1269				new_blksz = MIN(end_size,
1270				    1 << highbit64(zp->z_blksz));
1271			} else {
1272				new_blksz = MIN(end_size, max_blksz);
1273			}
1274			zfs_grow_blocksize(zp, new_blksz, tx);
1275			zfs_range_reduce(rl, woff, n);
1276		}
1277
1278		/*
1279		 * XXX - should we really limit each write to z_max_blksz?
1280		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
1281		 */
1282		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
1283
1284		if (woff + nbytes > zp->z_size)
1285			vnode_pager_setsize(vp, woff + nbytes);
1286
1287		if (abuf == NULL) {
1288			tx_bytes = uio->uio_resid;
1289			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
1290			    uio, nbytes, tx);
1291			tx_bytes -= uio->uio_resid;
1292		} else {
1293			tx_bytes = nbytes;
1294			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
1295			/*
1296			 * If this is not a full block write, but we are
1297			 * extending the file past EOF and this data starts
1298			 * block-aligned, use assign_arcbuf().  Otherwise,
1299			 * write via dmu_write().
1300			 */
1301			if (tx_bytes < max_blksz && (!write_eof ||
1302			    aiov->iov_base != abuf->b_data)) {
1303				ASSERT(xuio);
1304				dmu_write(zfsvfs->z_os, zp->z_id, woff,
1305				    aiov->iov_len, aiov->iov_base, tx);
1306				dmu_return_arcbuf(abuf);
1307				xuio_stat_wbuf_copied();
1308			} else {
1309				ASSERT(xuio || tx_bytes == max_blksz);
1310				dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
1311				    woff, abuf, tx);
1312			}
1313#if defined(illumos) || defined(__NetBSD__)
1314			ASSERT(tx_bytes <= uio->uio_resid);
1315			uioskip(uio, tx_bytes);
1316#endif
1317		}
1318		if (tx_bytes && vn_has_cached_data(vp)) {
1319			update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
1320			    zp->z_id, segflg, tx);
1321		}
1322
1323		/*
1324		 * If we made no progress, we're done.  If we made even
1325		 * partial progress, update the znode and ZIL accordingly.
1326		 */
1327		if (tx_bytes == 0) {
1328			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
1329			    (void *)&zp->z_size, sizeof (uint64_t), tx);
1330			dmu_tx_commit(tx);
1331			ASSERT(error != 0);
1332			break;
1333		}
1334
1335		/*
1336		 * Clear Set-UID/Set-GID bits on successful write if not
1337		 * privileged and at least one of the excute bits is set.
1338		 *
1339		 * It would be nice to to this after all writes have
1340		 * been done, but that would still expose the ISUID/ISGID
1341		 * to another app after the partial write is committed.
1342		 *
1343		 * Note: we don't call zfs_fuid_map_id() here because
1344		 * user 0 is not an ephemeral uid.
1345		 */
1346		mutex_enter(&zp->z_acl_lock);
1347		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
1348		    (S_IXUSR >> 6))) != 0 &&
1349		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
1350		    secpolicy_vnode_setid_retain(vp, cr,
1351		    (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
1352			uint64_t newmode;
1353			zp->z_mode &= ~(S_ISUID | S_ISGID);
1354			newmode = zp->z_mode;
1355			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
1356			    (void *)&newmode, sizeof (uint64_t), tx);
1357#ifdef __NetBSD__
1358			cache_enter_id(vp, zp->z_mode, zp->z_uid, zp->z_gid,
1359			    true);
1360#endif
1361		}
1362		mutex_exit(&zp->z_acl_lock);
1363
1364		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
1365		    B_TRUE);
1366
1367		/*
1368		 * Update the file size (zp_size) if it has changed;
1369		 * account for possible concurrent updates.
1370		 */
1371		while ((end_size = zp->z_size) < uio->uio_loffset) {
1372			(void) atomic_cas_64(&zp->z_size, end_size,
1373			    uio->uio_loffset);
1374#ifdef illumos
1375			ASSERT(error == 0);
1376#else
1377			ASSERT(error == 0 || error == EFAULT);
1378#endif
1379		}
1380		/*
1381		 * If we are replaying and eof is non zero then force
1382		 * the file size to the specified eof. Note, there's no
1383		 * concurrency during replay.
1384		 */
1385		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
1386			zp->z_size = zfsvfs->z_replay_eof;
1387
1388		if (error == 0)
1389			error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1390		else
1391			(void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1392
1393		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
1394		dmu_tx_commit(tx);
1395
1396		if (error != 0)
1397			break;
1398		ASSERT(tx_bytes == nbytes);
1399		n -= nbytes;
1400
1401#ifdef illumos
1402		if (!xuio && n > 0)
1403			uio_prefaultpages(MIN(n, max_blksz), uio);
1404#endif
1405	}
1406
1407	zfs_range_unlock(rl);
1408
1409	/*
1410	 * If we're in replay mode, or we made no progress, return error.
1411	 * Otherwise, it's at least a partial write, so it's successful.
1412	 */
1413	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
1414		ZFS_EXIT(zfsvfs);
1415		return (error);
1416	}
1417
1418#ifdef __FreeBSD__
1419	/*
1420	 * EFAULT means that at least one page of the source buffer was not
1421	 * available.  VFS will re-try remaining I/O upon this error.
1422	 */
1423	if (error == EFAULT) {
1424		ZFS_EXIT(zfsvfs);
1425		return (error);
1426	}
1427#endif
1428
1429	if (ioflag & (FSYNC | FDSYNC) ||
1430	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1431		zil_commit(zilog, zp->z_id);
1432
1433	ZFS_EXIT(zfsvfs);
1434	return (0);
1435}
1436
1437void
1438zfs_get_done(zgd_t *zgd, int error)
1439{
1440	znode_t *zp = zgd->zgd_private;
1441	objset_t *os = zp->z_zfsvfs->z_os;
1442
1443	if (zgd->zgd_db)
1444		dmu_buf_rele(zgd->zgd_db, zgd);
1445
1446	zfs_range_unlock(zgd->zgd_rl);
1447
1448	/*
1449	 * Release the vnode asynchronously as we currently have the
1450	 * txg stopped from syncing.
1451	 */
1452	VN_RELE_CLEANER(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1453
1454	if (error == 0 && zgd->zgd_bp)
1455		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
1456
1457	kmem_free(zgd, sizeof (zgd_t));
1458}
1459
1460#ifdef DEBUG
1461static int zil_fault_io = 0;
1462#endif
1463
1464/*
1465 * Get data to generate a TX_WRITE intent log record.
1466 */
1467int
1468zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
1469{
1470	zfsvfs_t *zfsvfs = arg;
1471	objset_t *os = zfsvfs->z_os;
1472	znode_t *zp;
1473	uint64_t object = lr->lr_foid;
1474	uint64_t offset = lr->lr_offset;
1475	uint64_t size = lr->lr_length;
1476	blkptr_t *bp = &lr->lr_blkptr;
1477	dmu_buf_t *db;
1478	zgd_t *zgd;
1479	int error = 0;
1480
1481	ASSERT(zio != NULL);
1482	ASSERT(size != 0);
1483
1484	/*
1485	 * Nothing to do if the file has been removed
1486	 */
1487	if (zfs_zget_cleaner(zfsvfs, object, &zp) != 0)
1488		return (SET_ERROR(ENOENT));
1489	if (zp->z_unlinked) {
1490		/*
1491		 * Release the vnode asynchronously as we currently have the
1492		 * txg stopped from syncing.
1493		 */
1494		VN_RELE_CLEANER(ZTOV(zp),
1495		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1496		return (SET_ERROR(ENOENT));
1497	}
1498
1499	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
1500	zgd->zgd_zilog = zfsvfs->z_log;
1501	zgd->zgd_private = zp;
1502
1503	/*
1504	 * Write records come in two flavors: immediate and indirect.
1505	 * For small writes it's cheaper to store the data with the
1506	 * log record (immediate); for large writes it's cheaper to
1507	 * sync the data and get a pointer to it (indirect) so that
1508	 * we don't have to write the data twice.
1509	 */
1510	if (buf != NULL) { /* immediate write */
1511		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
1512		/* test for truncation needs to be done while range locked */
1513		if (offset >= zp->z_size) {
1514			error = SET_ERROR(ENOENT);
1515		} else {
1516			error = dmu_read(os, object, offset, size, buf,
1517			    DMU_READ_NO_PREFETCH);
1518		}
1519		ASSERT(error == 0 || error == ENOENT);
1520	} else { /* indirect write */
1521		/*
1522		 * Have to lock the whole block to ensure when it's
1523		 * written out and it's checksum is being calculated
1524		 * that no one can change the data. We need to re-check
1525		 * blocksize after we get the lock in case it's changed!
1526		 */
1527		for (;;) {
1528			uint64_t blkoff;
1529			size = zp->z_blksz;
1530			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
1531			offset -= blkoff;
1532			zgd->zgd_rl = zfs_range_lock(zp, offset, size,
1533			    RL_READER);
1534			if (zp->z_blksz == size)
1535				break;
1536			offset += blkoff;
1537			zfs_range_unlock(zgd->zgd_rl);
1538		}
1539		/* test for truncation needs to be done while range locked */
1540		if (lr->lr_offset >= zp->z_size)
1541			error = SET_ERROR(ENOENT);
1542#ifdef DEBUG
1543		if (zil_fault_io) {
1544			error = SET_ERROR(EIO);
1545			zil_fault_io = 0;
1546		}
1547#endif
1548		if (error == 0)
1549			error = dmu_buf_hold(os, object, offset, zgd, &db,
1550			    DMU_READ_NO_PREFETCH);
1551
1552		if (error == 0) {
1553			blkptr_t *obp = dmu_buf_get_blkptr(db);
1554			if (obp) {
1555				ASSERT(BP_IS_HOLE(bp));
1556				*bp = *obp;
1557			}
1558
1559			zgd->zgd_db = db;
1560			zgd->zgd_bp = bp;
1561
1562			ASSERT(db->db_offset == offset);
1563			ASSERT(db->db_size == size);
1564
1565			error = dmu_sync(zio, lr->lr_common.lrc_txg,
1566			    zfs_get_done, zgd);
1567			ASSERT(error || lr->lr_length <= zp->z_blksz);
1568
1569			/*
1570			 * On success, we need to wait for the write I/O
1571			 * initiated by dmu_sync() to complete before we can
1572			 * release this dbuf.  We will finish everything up
1573			 * in the zfs_get_done() callback.
1574			 */
1575			if (error == 0)
1576				return (0);
1577
1578			if (error == EALREADY) {
1579				lr->lr_common.lrc_txtype = TX_WRITE2;
1580				error = 0;
1581			}
1582		}
1583	}
1584
1585	zfs_get_done(zgd, error);
1586
1587	return (error);
1588}
1589
1590/*ARGSUSED*/
1591static int
1592zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
1593    caller_context_t *ct)
1594{
1595	znode_t *zp = VTOZ(vp);
1596	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1597	int error;
1598
1599	ZFS_ENTER(zfsvfs);
1600	ZFS_VERIFY_ZP(zp);
1601
1602	if (flag & V_ACE_MASK)
1603		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
1604	else
1605		error = zfs_zaccess_rwx(zp, mode, flag, cr);
1606
1607	ZFS_EXIT(zfsvfs);
1608	return (error);
1609}
1610
1611#ifdef __FreeBSD__
1612static int
1613zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
1614{
1615	int error;
1616
1617	*vpp = arg;
1618	error = vn_lock(*vpp, lkflags);
1619	if (error != 0)
1620		vrele(*vpp);
1621	return (error);
1622}
1623
1624static int
1625zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
1626{
1627	znode_t *zdp = VTOZ(dvp);
1628	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1629	int error;
1630	int ltype;
1631
1632	ASSERT_VOP_LOCKED(dvp, __func__);
1633#ifdef DIAGNOSTIC
1634	if ((zdp->z_pflags & ZFS_XATTR) == 0)
1635		VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
1636#endif
1637
1638	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
1639		ASSERT3P(dvp, ==, vp);
1640		vref(dvp);
1641		ltype = lkflags & LK_TYPE_MASK;
1642		if (ltype != VOP_ISLOCKED(dvp)) {
1643			if (ltype == LK_EXCLUSIVE)
1644				vn_lock(dvp, LK_UPGRADE | LK_RETRY);
1645			else /* if (ltype == LK_SHARED) */
1646				vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
1647
1648			/*
1649			 * Relock for the "." case could leave us with
1650			 * reclaimed vnode.
1651			 */
1652			if (dvp->v_iflag & VI_DOOMED) {
1653				vrele(dvp);
1654				return (SET_ERROR(ENOENT));
1655			}
1656		}
1657		return (0);
1658	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
1659		/*
1660		 * Note that in this case, dvp is the child vnode, and we
1661		 * are looking up the parent vnode - exactly reverse from
1662		 * normal operation.  Unlocking dvp requires some rather
1663		 * tricky unlock/relock dance to prevent mp from being freed;
1664		 * use vn_vget_ino_gen() which takes care of all that.
1665		 *
1666		 * XXX Note that there is a time window when both vnodes are
1667		 * unlocked.  It is possible, although highly unlikely, that
1668		 * during that window the parent-child relationship between
1669		 * the vnodes may change, for example, get reversed.
1670		 * In that case we would have a wrong lock order for the vnodes.
1671		 * All other filesystems seem to ignore this problem, so we
1672		 * do the same here.
1673		 * A potential solution could be implemented as follows:
1674		 * - using LK_NOWAIT when locking the second vnode and retrying
1675		 *   if necessary
1676		 * - checking that the parent-child relationship still holds
1677		 *   after locking both vnodes and retrying if it doesn't
1678		 */
1679		error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
1680		return (error);
1681	} else {
1682		error = vn_lock(vp, lkflags);
1683		if (error != 0)
1684			vrele(vp);
1685		return (error);
1686	}
1687}
1688
1689/*
1690 * Lookup an entry in a directory, or an extended attribute directory.
1691 * If it exists, return a held vnode reference for it.
1692 *
1693 *	IN:	dvp	- vnode of directory to search.
1694 *		nm	- name of entry to lookup.
1695 *		pnp	- full pathname to lookup [UNUSED].
1696 *		flags	- LOOKUP_XATTR set if looking for an attribute.
1697 *		rdir	- root directory vnode [UNUSED].
1698 *		cr	- credentials of caller.
1699 *		ct	- caller context
1700 *
1701 *	OUT:	vpp	- vnode of located entry, NULL if not found.
1702 *
1703 *	RETURN:	0 on success, error code on failure.
1704 *
1705 * Timestamps:
1706 *	NA
1707 */
1708/* ARGSUSED */
1709static int
1710zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
1711    int nameiop, cred_t *cr, kthread_t *td, int flags)
1712{
1713	znode_t *zdp = VTOZ(dvp);
1714	znode_t *zp;
1715	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1716	int	error = 0;
1717
1718	/* fast path (should be redundant with vfs namecache) */
1719	if (!(flags & LOOKUP_XATTR)) {
1720		if (dvp->v_type != VDIR) {
1721			return (SET_ERROR(ENOTDIR));
1722		} else if (zdp->z_sa_hdl == NULL) {
1723			return (SET_ERROR(EIO));
1724		}
1725	}
1726
1727	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
1728
1729	ZFS_ENTER(zfsvfs);
1730	ZFS_VERIFY_ZP(zdp);
1731
1732	*vpp = NULL;
1733
1734	if (flags & LOOKUP_XATTR) {
1735#ifdef TODO
1736		/*
1737		 * If the xattr property is off, refuse the lookup request.
1738		 */
1739		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
1740			ZFS_EXIT(zfsvfs);
1741			return (SET_ERROR(EINVAL));
1742		}
1743#endif
1744
1745		/*
1746		 * We don't allow recursive attributes..
1747		 * Maybe someday we will.
1748		 */
1749		if (zdp->z_pflags & ZFS_XATTR) {
1750			ZFS_EXIT(zfsvfs);
1751			return (SET_ERROR(EINVAL));
1752		}
1753
1754		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
1755			ZFS_EXIT(zfsvfs);
1756			return (error);
1757		}
1758
1759		/*
1760		 * Do we have permission to get into attribute directory?
1761		 */
1762		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
1763		    B_FALSE, cr)) {
1764			vrele(*vpp);
1765			*vpp = NULL;
1766		}
1767
1768		ZFS_EXIT(zfsvfs);
1769		return (error);
1770	}
1771
1772	/*
1773	 * Check accessibility of directory.
1774	 */
1775	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
1776		ZFS_EXIT(zfsvfs);
1777		return (error);
1778	}
1779
1780	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
1781	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1782		ZFS_EXIT(zfsvfs);
1783		return (SET_ERROR(EILSEQ));
1784	}
1785
1786
1787	/*
1788	 * First handle the special cases.
1789	 */
1790	if ((cnp->cn_flags & ISDOTDOT) != 0) {
1791		/*
1792		 * If we are a snapshot mounted under .zfs, return
1793		 * the vp for the snapshot directory.
1794		 */
1795		if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
1796			struct componentname cn;
1797			vnode_t *zfsctl_vp;
1798			int ltype;
1799
1800			ZFS_EXIT(zfsvfs);
1801			ltype = VOP_ISLOCKED(dvp);
1802			VOP_UNLOCK(dvp, 0);
1803			error = zfsctl_root(zfsvfs->z_parent, LK_SHARED,
1804			    &zfsctl_vp);
1805			if (error == 0) {
1806				cn.cn_nameptr = "snapshot";
1807				cn.cn_namelen = strlen(cn.cn_nameptr);
1808				cn.cn_nameiop = cnp->cn_nameiop;
1809				cn.cn_flags = cnp->cn_flags;
1810				cn.cn_lkflags = cnp->cn_lkflags;
1811				error = VOP_LOOKUP(zfsctl_vp, vpp, &cn);
1812				vput(zfsctl_vp);
1813			}
1814			vn_lock(dvp, ltype | LK_RETRY);
1815			return (error);
1816		}
1817	}
1818	if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
1819		ZFS_EXIT(zfsvfs);
1820		if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
1821			return (SET_ERROR(ENOTSUP));
1822		error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp);
1823		return (error);
1824	}
1825
1826	/*
1827	 * The loop is retry the lookup if the parent-child relationship
1828	 * changes during the dot-dot locking complexities.
1829	 */
1830	for (;;) {
1831		uint64_t parent;
1832
1833		error = zfs_dirlook(zdp, nm, &zp);
1834		if (error == 0)
1835			*vpp = ZTOV(zp);
1836
1837		ZFS_EXIT(zfsvfs);
1838		if (error != 0)
1839			break;
1840
1841		error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
1842		if (error != 0) {
1843			/*
1844			 * If we've got a locking error, then the vnode
1845			 * got reclaimed because of a force unmount.
1846			 * We never enter doomed vnodes into the name cache.
1847			 */
1848			*vpp = NULL;
1849			return (error);
1850		}
1851
1852		if ((cnp->cn_flags & ISDOTDOT) == 0)
1853			break;
1854
1855		ZFS_ENTER(zfsvfs);
1856		if (zdp->z_sa_hdl == NULL) {
1857			error = SET_ERROR(EIO);
1858		} else {
1859			error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
1860			    &parent, sizeof (parent));
1861		}
1862		if (error != 0) {
1863			ZFS_EXIT(zfsvfs);
1864			vput(ZTOV(zp));
1865			break;
1866		}
1867		if (zp->z_id == parent) {
1868			ZFS_EXIT(zfsvfs);
1869			break;
1870		}
1871		vput(ZTOV(zp));
1872	}
1873
1874out:
1875	if (error != 0)
1876		*vpp = NULL;
1877
1878	/* Translate errors and add SAVENAME when needed. */
1879	if (cnp->cn_flags & ISLASTCN) {
1880		switch (nameiop) {
1881		case CREATE:
1882		case RENAME:
1883			if (error == ENOENT) {
1884				error = EJUSTRETURN;
1885				cnp->cn_flags |= SAVENAME;
1886				break;
1887			}
1888			/* FALLTHROUGH */
1889		case DELETE:
1890			if (error == 0)
1891				cnp->cn_flags |= SAVENAME;
1892			break;
1893		}
1894	}
1895
1896	/* Insert name into cache (as non-existent) if appropriate. */
1897	if (zfsvfs->z_use_namecache &&
1898	    error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
1899		cache_enter(dvp, NULL, cnp);
1900
1901	/* Insert name into cache if appropriate. */
1902	if (zfsvfs->z_use_namecache &&
1903	    error == 0 && (cnp->cn_flags & MAKEENTRY)) {
1904		if (!(cnp->cn_flags & ISLASTCN) ||
1905		    (nameiop != DELETE && nameiop != RENAME)) {
1906			cache_enter(dvp, *vpp, cnp);
1907		}
1908	}
1909
1910	return (error);
1911}
1912#endif /* __FreeBSD__ */
1913
1914#ifdef __NetBSD__
1915/*
1916 * If vnode is for a device return a specfs vnode instead.
1917 */
1918static int
1919specvp_check(vnode_t **vpp, cred_t *cr)
1920{
1921	int error = 0;
1922
1923	if (IS_DEVVP(*vpp)) {
1924		struct vnode *svp;
1925
1926		svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
1927		VN_RELE(*vpp);
1928		if (svp == NULL)
1929			error = ENOSYS;
1930		*vpp = svp;
1931	}
1932	return (error);
1933}
1934
1935/*
1936 * Lookup an entry in a directory, or an extended attribute directory.
1937 * If it exists, return a held vnode reference for it.
1938 *
1939 *	IN:	dvp	- vnode of directory to search.
1940 *		nm	- name of entry to lookup.
1941 *		pnp	- full pathname to lookup [UNUSED].
1942 *		flags	- LOOKUP_XATTR set if looking for an attribute.
1943 *		rdir	- root directory vnode [UNUSED].
1944 *		cr	- credentials of caller.
1945 *		ct	- caller context
1946 *		direntflags - directory lookup flags
1947 *		realpnp - returned pathname.
1948 *
1949 *	OUT:	vpp	- vnode of located entry, NULL if not found.
1950 *
1951 *	RETURN:	0 if success
1952 *		error code if failure
1953 *
1954 * Timestamps:
1955 *	NA
1956 */
1957/* ARGSUSED */
1958static int
1959zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, int flags,
1960    struct componentname *cnp, int nameiop, cred_t *cr)
1961{
1962	znode_t *zdp = VTOZ(dvp);
1963	znode_t *zp;
1964	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1965	int	error = 0;
1966
1967	/* fast path */
1968	if (!(flags & LOOKUP_XATTR)) {
1969		if (dvp->v_type != VDIR) {
1970			return (ENOTDIR);
1971		} else if (zdp->z_sa_hdl == NULL) {
1972			return (SET_ERROR(EIO));
1973		}
1974
1975		if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
1976			error = zfs_fastaccesschk_execute(zdp, cr);
1977			if (!error) {
1978				*vpp = dvp;
1979				VN_HOLD(*vpp);
1980				return (0);
1981			}
1982			return (error);
1983		} else {
1984			vnode_t *tvp = dnlc_lookup(dvp, nm);
1985
1986			if (tvp) {
1987				error = zfs_fastaccesschk_execute(zdp, cr);
1988				if (error) {
1989					VN_RELE(tvp);
1990					return (error);
1991				}
1992				if (tvp == DNLC_NO_VNODE) {
1993					VN_RELE(tvp);
1994					return (ENOENT);
1995				} else {
1996					*vpp = tvp;
1997					return (specvp_check(vpp, cr));
1998				}
1999			}
2000		}
2001	}
2002
2003	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
2004
2005	ZFS_ENTER(zfsvfs);
2006	ZFS_VERIFY_ZP(zdp);
2007
2008	*vpp = NULL;
2009
2010	if (flags & LOOKUP_XATTR) {
2011#ifdef TODO
2012		/*
2013		 * If the xattr property is off, refuse the lookup request.
2014		 */
2015		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
2016			ZFS_EXIT(zfsvfs);
2017			return (EINVAL);
2018		}
2019#endif
2020
2021		/*
2022		 * We don't allow recursive attributes..
2023		 * Maybe someday we will.
2024		 */
2025		if (zdp->z_pflags & ZFS_XATTR) {
2026			ZFS_EXIT(zfsvfs);
2027			return (EINVAL);
2028		}
2029
2030		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
2031			ZFS_EXIT(zfsvfs);
2032			return (error);
2033		}
2034
2035		/*
2036		 * Do we have permission to get into attribute directory?
2037		 */
2038		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
2039		    B_FALSE, cr)) {
2040			VN_RELE(*vpp);
2041			*vpp = NULL;
2042		}
2043
2044		ZFS_EXIT(zfsvfs);
2045		return (error);
2046	}
2047
2048	if (dvp->v_type != VDIR) {
2049		ZFS_EXIT(zfsvfs);
2050		return (ENOTDIR);
2051	}
2052
2053	/*
2054	 * Check accessibility of directory.
2055	 */
2056	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
2057		ZFS_EXIT(zfsvfs);
2058		return (error);
2059	}
2060
2061	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
2062	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2063		ZFS_EXIT(zfsvfs);
2064		return (EILSEQ);
2065	}
2066
2067	/*
2068	 * First handle the special cases.
2069	 */
2070	if ((cnp->cn_flags & ISDOTDOT) != 0) {
2071		/*
2072		 * If we are a snapshot mounted under .zfs, return
2073		 * the vp for the snapshot directory.
2074		 */
2075		if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
2076			ZFS_EXIT(zfsvfs);
2077			error = zfsctl_snapshot(zfsvfs->z_parent, vpp);
2078
2079			return (error);
2080		}
2081	}
2082	if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
2083		ZFS_EXIT(zfsvfs);
2084		if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
2085			return (SET_ERROR(ENOTSUP));
2086		error = zfsctl_root(zfsvfs, vpp);
2087		return (error);
2088	}
2089
2090	error = zfs_dirlook(zdp, nm, &zp);
2091	if (error == 0) {
2092		*vpp = ZTOV(zp);
2093		error = specvp_check(vpp, cr);
2094	}
2095
2096	ZFS_EXIT(zfsvfs);
2097	return (error);
2098}
2099#endif
2100
2101/*
2102 * Attempt to create a new entry in a directory.  If the entry
2103 * already exists, truncate the file if permissible, else return
2104 * an error.  Return the vp of the created or trunc'd file.
2105 *
2106 *	IN:	dvp	- vnode of directory to put new file entry in.
2107 *		name	- name of new file entry.
2108 *		vap	- attributes of new file.
2109 *		excl	- flag indicating exclusive or non-exclusive mode.
2110 *		mode	- mode to open file with.
2111 *		cr	- credentials of caller.
2112 *		flag	- large file flag [UNUSED].
2113 *		ct	- caller context
2114 *		vsecp	- ACL to be set
2115 *
2116 *	OUT:	vpp	- vnode of created or trunc'd entry.
2117 *
2118 *	RETURN:	0 on success, error code on failure.
2119 *
2120 * Timestamps:
2121 *	dvp - ctime|mtime updated if new entry created
2122 *	 vp - ctime|mtime always, atime if new
2123 */
2124
2125/* ARGSUSED */
2126static int
2127zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
2128    vnode_t **vpp, cred_t *cr, kthread_t *td)
2129{
2130	znode_t		*zp, *dzp = VTOZ(dvp);
2131	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2132	zilog_t		*zilog;
2133	objset_t	*os;
2134	dmu_tx_t	*tx;
2135	int		error;
2136	ksid_t		*ksid;
2137	uid_t		uid;
2138	gid_t		gid = crgetgid(cr);
2139	zfs_acl_ids_t   acl_ids;
2140	boolean_t	fuid_dirtied;
2141	void		*vsecp = NULL;
2142	int		flag = 0;
2143	uint64_t	txtype;
2144
2145	/*
2146	 * If we have an ephemeral id, ACL, or XVATTR then
2147	 * make sure file system is at proper version
2148	 */
2149
2150	ksid = crgetsid(cr, KSID_OWNER);
2151	if (ksid)
2152		uid = ksid_getid(ksid);
2153	else
2154		uid = crgetuid(cr);
2155
2156	if (zfsvfs->z_use_fuids == B_FALSE &&
2157	    (vsecp || (vap->va_mask & AT_XVATTR) ||
2158	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
2159		return (SET_ERROR(EINVAL));
2160
2161	ZFS_ENTER(zfsvfs);
2162	ZFS_VERIFY_ZP(dzp);
2163	os = zfsvfs->z_os;
2164	zilog = zfsvfs->z_log;
2165
2166	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
2167	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2168		ZFS_EXIT(zfsvfs);
2169		return (SET_ERROR(EILSEQ));
2170	}
2171
2172	if (vap->va_mask & AT_XVATTR) {
2173		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
2174		    crgetuid(cr), cr, vap->va_type)) != 0) {
2175			ZFS_EXIT(zfsvfs);
2176			return (error);
2177		}
2178	}
2179
2180	*vpp = NULL;
2181
2182	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
2183		vap->va_mode &= ~S_ISVTX;
2184
2185	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
2186	if (error) {
2187		ZFS_EXIT(zfsvfs);
2188		return (error);
2189	}
2190	ASSERT3P(zp, ==, NULL);
2191
2192	/*
2193	 * Create a new file object and update the directory
2194	 * to reference it.
2195	 */
2196	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
2197		goto out;
2198	}
2199
2200	/*
2201	 * We only support the creation of regular files in
2202	 * extended attribute directories.
2203	 */
2204
2205	if ((dzp->z_pflags & ZFS_XATTR) &&
2206	    (vap->va_type != VREG)) {
2207		error = SET_ERROR(EINVAL);
2208		goto out;
2209	}
2210
2211	if ((error = zfs_acl_ids_create(dzp, 0, vap,
2212	    cr, vsecp, &acl_ids)) != 0)
2213		goto out;
2214
2215	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
2216		zfs_acl_ids_free(&acl_ids);
2217		error = SET_ERROR(EDQUOT);
2218		goto out;
2219	}
2220
2221	getnewvnode_reserve(1);
2222
2223	tx = dmu_tx_create(os);
2224
2225	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
2226	    ZFS_SA_BASE_ATTR_SIZE);
2227
2228	fuid_dirtied = zfsvfs->z_fuid_dirty;
2229	if (fuid_dirtied)
2230		zfs_fuid_txhold(zfsvfs, tx);
2231	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
2232	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
2233	if (!zfsvfs->z_use_sa &&
2234	    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2235		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2236		    0, acl_ids.z_aclp->z_acl_bytes);
2237	}
2238	error = dmu_tx_assign(tx, TXG_WAIT);
2239	if (error) {
2240		zfs_acl_ids_free(&acl_ids);
2241		dmu_tx_abort(tx);
2242		getnewvnode_drop_reserve();
2243		ZFS_EXIT(zfsvfs);
2244		return (error);
2245	}
2246	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
2247
2248	if (fuid_dirtied)
2249		zfs_fuid_sync(zfsvfs, tx);
2250
2251	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
2252	txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
2253	zfs_log_create(zilog, tx, txtype, dzp, zp, name,
2254	    vsecp, acl_ids.z_fuidp, vap);
2255	zfs_acl_ids_free(&acl_ids);
2256	dmu_tx_commit(tx);
2257
2258	getnewvnode_drop_reserve();
2259
2260out:
2261	if (error == 0) {
2262		*vpp = ZTOV(zp);
2263	}
2264
2265	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2266		zil_commit(zilog, 0);
2267
2268	ZFS_EXIT(zfsvfs);
2269	return (error);
2270}
2271
2272/*
2273 * Remove an entry from a directory.
2274 *
2275 *	IN:	dvp	- vnode of directory to remove entry from.
2276 *		name	- name of entry to remove.
2277 *		cr	- credentials of caller.
2278 *		ct	- caller context
2279 *		flags	- case flags
2280 *
2281 *	RETURN:	0 on success, error code on failure.
2282 *
2283 * Timestamps:
2284 *	dvp - ctime|mtime
2285 *	 vp - ctime (if nlink > 0)
2286 */
2287
2288/*ARGSUSED*/
2289static int
2290zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
2291{
2292	znode_t		*dzp = VTOZ(dvp);
2293	znode_t		*zp = VTOZ(vp);
2294	znode_t		*xzp;
2295	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2296	zilog_t		*zilog;
2297	uint64_t	acl_obj, xattr_obj;
2298	uint64_t	obj = 0;
2299	dmu_tx_t	*tx;
2300	boolean_t	unlinked, toobig = FALSE;
2301	uint64_t	txtype;
2302	int		error;
2303
2304	ZFS_ENTER(zfsvfs);
2305	ZFS_VERIFY_ZP(dzp);
2306	ZFS_VERIFY_ZP(zp);
2307	zilog = zfsvfs->z_log;
2308	zp = VTOZ(vp);
2309
2310	xattr_obj = 0;
2311	xzp = NULL;
2312
2313	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
2314		goto out;
2315	}
2316
2317	/*
2318	 * Need to use rmdir for removing directories.
2319	 */
2320	if (vp->v_type == VDIR) {
2321		error = SET_ERROR(EPERM);
2322		goto out;
2323	}
2324
2325	vnevent_remove(vp, dvp, name, ct);
2326
2327	obj = zp->z_id;
2328
2329	/* are there any extended attributes? */
2330	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
2331	    &xattr_obj, sizeof (xattr_obj));
2332	if (error == 0 && xattr_obj) {
2333		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
2334		ASSERT0(error);
2335	}
2336
2337	/*
2338	 * We may delete the znode now, or we may put it in the unlinked set;
2339	 * it depends on whether we're the last link, and on whether there are
2340	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
2341	 * allow for either case.
2342	 */
2343	tx = dmu_tx_create(zfsvfs->z_os);
2344	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2345	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2346	zfs_sa_upgrade_txholds(tx, zp);
2347	zfs_sa_upgrade_txholds(tx, dzp);
2348
2349	if (xzp) {
2350		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2351		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
2352	}
2353
2354	/* charge as an update -- would be nice not to charge at all */
2355	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2356
2357	/*
2358	 * Mark this transaction as typically resulting in a net free of space
2359	 */
2360	dmu_tx_mark_netfree(tx);
2361
2362	error = dmu_tx_assign(tx, TXG_WAIT);
2363	if (error) {
2364		dmu_tx_abort(tx);
2365		ZFS_EXIT(zfsvfs);
2366		return (error);
2367	}
2368
2369	/*
2370	 * Remove the directory entry.
2371	 */
2372	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);
2373
2374	if (error) {
2375		dmu_tx_commit(tx);
2376		goto out;
2377	}
2378
2379	if (unlinked) {
2380		zfs_unlinked_add(zp, tx);
2381		vp->v_vflag |= VV_NOSYNC;
2382	}
2383
2384	txtype = TX_REMOVE;
2385	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
2386
2387	dmu_tx_commit(tx);
2388out:
2389
2390	if (xzp)
2391		vrele(ZTOV(xzp));
2392
2393	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2394		zil_commit(zilog, 0);
2395
2396	ZFS_EXIT(zfsvfs);
2397	return (error);
2398}
2399
2400/*
2401 * Create a new directory and insert it into dvp using the name
2402 * provided.  Return a pointer to the inserted directory.
2403 *
2404 *	IN:	dvp	- vnode of directory to add subdir to.
2405 *		dirname	- name of new directory.
2406 *		vap	- attributes of new directory.
2407 *		cr	- credentials of caller.
2408 *		ct	- caller context
2409 *		flags	- case flags
2410 *		vsecp	- ACL to be set
2411 *
2412 *	OUT:	vpp	- vnode of created directory.
2413 *
2414 *	RETURN:	0 on success, error code on failure.
2415 *
2416 * Timestamps:
2417 *	dvp - ctime|mtime updated
2418 *	 vp - ctime|mtime|atime updated
2419 */
2420/*ARGSUSED*/
2421static int
2422zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
2423{
2424	znode_t		*zp, *dzp = VTOZ(dvp);
2425	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2426	zilog_t		*zilog;
2427	uint64_t	txtype;
2428	dmu_tx_t	*tx;
2429	int		error;
2430	ksid_t		*ksid;
2431	uid_t		uid;
2432	gid_t		gid = crgetgid(cr);
2433	zfs_acl_ids_t   acl_ids;
2434	boolean_t	fuid_dirtied;
2435
2436	ASSERT(vap->va_type == VDIR);
2437
2438	/*
2439	 * If we have an ephemeral id, ACL, or XVATTR then
2440	 * make sure file system is at proper version
2441	 */
2442
2443	ksid = crgetsid(cr, KSID_OWNER);
2444	if (ksid)
2445		uid = ksid_getid(ksid);
2446	else
2447		uid = crgetuid(cr);
2448	if (zfsvfs->z_use_fuids == B_FALSE &&
2449	    ((vap->va_mask & AT_XVATTR) ||
2450	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
2451		return (SET_ERROR(EINVAL));
2452
2453	ZFS_ENTER(zfsvfs);
2454	ZFS_VERIFY_ZP(dzp);
2455	zilog = zfsvfs->z_log;
2456
2457	if (dzp->z_pflags & ZFS_XATTR) {
2458		ZFS_EXIT(zfsvfs);
2459		return (SET_ERROR(EINVAL));
2460	}
2461
2462	if (zfsvfs->z_utf8 && u8_validate(dirname,
2463	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2464		ZFS_EXIT(zfsvfs);
2465		return (SET_ERROR(EILSEQ));
2466	}
2467
2468	if (vap->va_mask & AT_XVATTR) {
2469		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
2470		    crgetuid(cr), cr, vap->va_type)) != 0) {
2471			ZFS_EXIT(zfsvfs);
2472			return (error);
2473		}
2474	}
2475
2476	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
2477	    NULL, &acl_ids)) != 0) {
2478		ZFS_EXIT(zfsvfs);
2479		return (error);
2480	}
2481
2482	/*
2483	 * First make sure the new directory doesn't exist.
2484	 *
2485	 * Existence is checked first to make sure we don't return
2486	 * EACCES instead of EEXIST which can cause some applications
2487	 * to fail.
2488	 */
2489	*vpp = NULL;
2490
2491	if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) {
2492		zfs_acl_ids_free(&acl_ids);
2493		ZFS_EXIT(zfsvfs);
2494		return (error);
2495	}
2496	ASSERT3P(zp, ==, NULL);
2497
2498	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
2499		zfs_acl_ids_free(&acl_ids);
2500		ZFS_EXIT(zfsvfs);
2501		return (error);
2502	}
2503
2504	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
2505		zfs_acl_ids_free(&acl_ids);
2506		ZFS_EXIT(zfsvfs);
2507		return (SET_ERROR(EDQUOT));
2508	}
2509
2510	/*
2511	 * Add a new entry to the directory.
2512	 */
2513	getnewvnode_reserve(1);
2514	tx = dmu_tx_create(zfsvfs->z_os);
2515	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
2516	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2517	fuid_dirtied = zfsvfs->z_fuid_dirty;
2518	if (fuid_dirtied)
2519		zfs_fuid_txhold(zfsvfs, tx);
2520	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2521		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
2522		    acl_ids.z_aclp->z_acl_bytes);
2523	}
2524
2525	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
2526	    ZFS_SA_BASE_ATTR_SIZE);
2527
2528	error = dmu_tx_assign(tx, TXG_WAIT);
2529	if (error) {
2530		zfs_acl_ids_free(&acl_ids);
2531		dmu_tx_abort(tx);
2532		getnewvnode_drop_reserve();
2533		ZFS_EXIT(zfsvfs);
2534		return (error);
2535	}
2536
2537	/*
2538	 * Create new node.
2539	 */
2540	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
2541
2542	if (fuid_dirtied)
2543		zfs_fuid_sync(zfsvfs, tx);
2544
2545	/*
2546	 * Now put new name in parent dir.
2547	 */
2548	(void) zfs_link_create(dzp, dirname, zp, tx, ZNEW);
2549
2550	*vpp = ZTOV(zp);
2551
2552	txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
2553	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
2554	    acl_ids.z_fuidp, vap);
2555
2556	zfs_acl_ids_free(&acl_ids);
2557
2558	dmu_tx_commit(tx);
2559
2560	getnewvnode_drop_reserve();
2561
2562	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2563		zil_commit(zilog, 0);
2564
2565	ZFS_EXIT(zfsvfs);
2566	return (0);
2567}
2568
2569/*
2570 * Remove a directory subdir entry.  If the current working
2571 * directory is the same as the subdir to be removed, the
2572 * remove will fail.
2573 *
2574 *	IN:	dvp	- vnode of directory to remove from.
2575 *		name	- name of directory to be removed.
2576 *		cwd	- vnode of current working directory.
2577 *		cr	- credentials of caller.
2578 *		ct	- caller context
2579 *		flags	- case flags
2580 *
2581 *	RETURN:	0 on success, error code on failure.
2582 *
2583 * Timestamps:
2584 *	dvp - ctime|mtime updated
2585 */
2586/*ARGSUSED*/
2587static int
2588zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
2589{
2590	znode_t		*dzp = VTOZ(dvp);
2591	znode_t		*zp = VTOZ(vp);
2592	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2593	zilog_t		*zilog;
2594	dmu_tx_t	*tx;
2595	int		error;
2596
2597	ZFS_ENTER(zfsvfs);
2598	ZFS_VERIFY_ZP(dzp);
2599	ZFS_VERIFY_ZP(zp);
2600	zilog = zfsvfs->z_log;
2601
2602
2603	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
2604		goto out;
2605	}
2606
2607	if (vp->v_type != VDIR) {
2608		error = SET_ERROR(ENOTDIR);
2609		goto out;
2610	}
2611
2612	vnevent_rmdir(vp, dvp, name, ct);
2613
2614	tx = dmu_tx_create(zfsvfs->z_os);
2615	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2616	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2617	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2618	zfs_sa_upgrade_txholds(tx, zp);
2619	zfs_sa_upgrade_txholds(tx, dzp);
2620	dmu_tx_mark_netfree(tx);
2621	error = dmu_tx_assign(tx, TXG_WAIT);
2622	if (error) {
2623		dmu_tx_abort(tx);
2624		ZFS_EXIT(zfsvfs);
2625		return (error);
2626	}
2627
2628	cache_purge(dvp);
2629
2630	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);
2631
2632	if (error == 0) {
2633		uint64_t txtype = TX_RMDIR;
2634		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
2635	}
2636
2637	dmu_tx_commit(tx);
2638
2639	cache_purge(vp);
2640out:
2641	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2642		zil_commit(zilog, 0);
2643
2644	ZFS_EXIT(zfsvfs);
2645	return (error);
2646}
2647
2648/*
2649 * Read as many directory entries as will fit into the provided
2650 * buffer from the given directory cursor position (specified in
2651 * the uio structure).
2652 *
2653 *	IN:	vp	- vnode of directory to read.
2654 *		uio	- structure supplying read location, range info,
2655 *			  and return buffer.
2656 *		cr	- credentials of caller.
2657 *		ct	- caller context
2658 *		flags	- case flags
2659 *
2660 *	OUT:	uio	- updated offset and range, buffer filled.
2661 *		eofp	- set to true if end-of-file detected.
2662 *
2663 *	RETURN:	0 on success, error code on failure.
2664 *
2665 * Timestamps:
2666 *	vp - atime updated
2667 *
2668 * Note that the low 4 bits of the cookie returned by zap is always zero.
2669 * This allows us to use the low range for "special" directory entries:
2670 * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
2671 * we use the offset 2 for the '.zfs' directory.
2672 */
2673/* ARGSUSED */
2674static int
2675zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, off_t **cookies)
2676{
2677	znode_t		*zp = VTOZ(vp);
2678	iovec_t		*iovp;
2679	edirent_t	*eodp;
2680	dirent64_t	*odp;
2681	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2682	objset_t	*os;
2683	caddr_t		outbuf;
2684	size_t		bufsize;
2685	zap_cursor_t	zc;
2686	zap_attribute_t	zap;
2687	uint_t		bytes_wanted;
2688	uint64_t	offset; /* must be unsigned; checks for < 1 */
2689	uint64_t	parent;
2690	int		local_eof;
2691	int		outcount;
2692	int		error;
2693	uint8_t		prefetch;
2694	boolean_t	check_sysattrs;
2695	uint8_t		type;
2696	int		ncooks = 0;
2697	off_t		*cooks = NULL;
2698	int		flags = 0;
2699#ifdef __FreeBSD__
2700	boolean_t user = uio->uio_segflg != UIO_SYSSPACE;
2701#endif
2702#ifdef __NetBSD__
2703	boolean_t user = !VMSPACE_IS_KERNEL_P(uio->uio_vmspace);
2704#endif
2705
2706	ZFS_ENTER(zfsvfs);
2707	ZFS_VERIFY_ZP(zp);
2708
2709	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
2710	    &parent, sizeof (parent))) != 0) {
2711		ZFS_EXIT(zfsvfs);
2712		return (error);
2713	}
2714
2715	/*
2716	 * If we are not given an eof variable,
2717	 * use a local one.
2718	 */
2719	if (eofp == NULL)
2720		eofp = &local_eof;
2721
2722	/*
2723	 * Check for valid iov_len.
2724	 */
2725	if (uio->uio_iov->iov_len <= 0) {
2726		ZFS_EXIT(zfsvfs);
2727		return (SET_ERROR(EINVAL));
2728	}
2729
2730	/*
2731	 * Quit if directory has been removed (posix)
2732	 */
2733	if ((*eofp = zp->z_unlinked) != 0) {
2734		ZFS_EXIT(zfsvfs);
2735		return (0);
2736	}
2737
2738	error = 0;
2739	os = zfsvfs->z_os;
2740	offset = uio->uio_loffset;
2741	prefetch = zp->z_zn_prefetch;
2742
2743	/*
2744	 * Initialize the iterator cursor.
2745	 */
2746	if (offset <= 3) {
2747		/*
2748		 * Start iteration from the beginning of the directory.
2749		 */
2750		zap_cursor_init(&zc, os, zp->z_id);
2751	} else {
2752		/*
2753		 * The offset is a serialized cursor.
2754		 */
2755		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
2756	}
2757
2758	/*
2759	 * Get space to change directory entries into fs independent format.
2760	 */
2761	iovp = uio->uio_iov;
2762	bytes_wanted = iovp->iov_len;
2763	if (user || uio->uio_iovcnt != 1) {
2764		bufsize = bytes_wanted;
2765		outbuf = kmem_alloc(bufsize, KM_SLEEP);
2766		odp = (struct dirent64 *)outbuf;
2767	} else {
2768		bufsize = bytes_wanted;
2769		outbuf = NULL;
2770		odp = (struct dirent64 *)iovp->iov_base;
2771	}
2772	eodp = (struct edirent *)odp;
2773
2774	if (ncookies != NULL) {
2775		/*
2776		 * Minimum entry size is dirent size and 1 byte for a file name.
2777		 */
2778#ifdef __FreeBSD__
2779		ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
2780		cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
2781#endif
2782#ifdef __NetBSD__
2783		ncooks = uio->uio_resid / _DIRENT_MINSIZE(odp);
2784		cooks = malloc(ncooks * sizeof(off_t), M_TEMP, M_WAITOK);
2785#endif
2786		*cookies = cooks;
2787		*ncookies = ncooks;
2788	}
2789
2790	/*
2791	 * If this VFS supports the system attribute view interface; and
2792	 * we're looking at an extended attribute directory; and we care
2793	 * about normalization conflicts on this vfs; then we must check
2794	 * for normalization conflicts with the sysattr name space.
2795	 */
2796#ifdef TODO
2797	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
2798	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
2799	    (flags & V_RDDIR_ENTFLAGS);
2800#else
2801	check_sysattrs = 0;
2802#endif
2803
2804	/*
2805	 * Transform to file-system independent format
2806	 */
2807	outcount = 0;
2808	while (outcount < bytes_wanted) {
2809		ino64_t objnum;
2810		ushort_t reclen;
2811		off64_t *next = NULL;
2812
2813		/*
2814		 * Special case `.', `..', and `.zfs'.
2815		 */
2816		if (offset == 0) {
2817			(void) strcpy(zap.za_name, ".");
2818			zap.za_normalization_conflict = 0;
2819			objnum = zp->z_id;
2820			type = DT_DIR;
2821		} else if (offset == 1) {
2822			(void) strcpy(zap.za_name, "..");
2823			zap.za_normalization_conflict = 0;
2824			objnum = parent;
2825			type = DT_DIR;
2826		} else if (offset == 2 && zfs_show_ctldir(zp)) {
2827			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
2828			zap.za_normalization_conflict = 0;
2829			objnum = ZFSCTL_INO_ROOT;
2830			type = DT_DIR;
2831		} else {
2832			/*
2833			 * Grab next entry.
2834			 */
2835			if (error = zap_cursor_retrieve(&zc, &zap)) {
2836				if ((*eofp = (error == ENOENT)) != 0)
2837					break;
2838				else
2839					goto update;
2840			}
2841
2842			if (zap.za_integer_length != 8 ||
2843			    zap.za_num_integers != 1) {
2844				cmn_err(CE_WARN, "zap_readdir: bad directory "
2845				    "entry, obj = %lld, offset = %lld\n",
2846				    (u_longlong_t)zp->z_id,
2847				    (u_longlong_t)offset);
2848				error = SET_ERROR(ENXIO);
2849				goto update;
2850			}
2851
2852			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
2853			/*
2854			 * MacOS X can extract the object type here such as:
2855			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2856			 */
2857			type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2858
2859			if (check_sysattrs && !zap.za_normalization_conflict) {
2860#ifdef TODO
2861				zap.za_normalization_conflict =
2862				    xattr_sysattr_casechk(zap.za_name);
2863#else
2864				panic("%s:%u: TODO", __func__, __LINE__);
2865#endif
2866			}
2867		}
2868
2869		if (flags & V_RDDIR_ACCFILTER) {
2870			/*
2871			 * If we have no access at all, don't include
2872			 * this entry in the returned information
2873			 */
2874			znode_t	*ezp;
2875			if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
2876				goto skip_entry;
2877			if (!zfs_has_access(ezp, cr)) {
2878				vrele(ZTOV(ezp));
2879				goto skip_entry;
2880			}
2881			vrele(ZTOV(ezp));
2882		}
2883
2884		if (flags & V_RDDIR_ENTFLAGS)
2885			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
2886		else
2887			reclen = DIRENT64_RECLEN(strlen(zap.za_name));
2888
2889		/*
2890		 * Will this entry fit in the buffer?
2891		 */
2892		if (outcount + reclen > bufsize) {
2893			/*
2894			 * Did we manage to fit anything in the buffer?
2895			 */
2896			if (!outcount) {
2897				error = SET_ERROR(EINVAL);
2898				goto update;
2899			}
2900			break;
2901		}
2902		if (flags & V_RDDIR_ENTFLAGS) {
2903			/*
2904			 * Add extended flag entry:
2905			 */
2906			eodp->ed_ino = objnum;
2907			eodp->ed_reclen = reclen;
2908			/* NOTE: ed_off is the offset for the *next* entry */
2909			next = &(eodp->ed_off);
2910			eodp->ed_eflags = zap.za_normalization_conflict ?
2911			    ED_CASE_CONFLICT : 0;
2912			(void) strncpy(eodp->ed_name, zap.za_name,
2913			    EDIRENT_NAMELEN(reclen));
2914			eodp = (edirent_t *)((intptr_t)eodp + reclen);
2915		} else {
2916			/*
2917			 * Add normal entry:
2918			 */
2919			odp->d_ino = objnum;
2920			odp->d_reclen = reclen;
2921			odp->d_namlen = strlen(zap.za_name);
2922			(void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
2923			odp->d_type = type;
2924			odp = (dirent64_t *)((intptr_t)odp + reclen);
2925		}
2926		outcount += reclen;
2927
2928		ASSERT(outcount <= bufsize);
2929
2930		/* Prefetch znode */
2931		if (prefetch)
2932			dmu_prefetch(os, objnum, 0, 0, 0,
2933			    ZIO_PRIORITY_SYNC_READ);
2934
2935	skip_entry:
2936		/*
2937		 * Move to the next entry, fill in the previous offset.
2938		 */
2939		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2940			zap_cursor_advance(&zc);
2941			offset = zap_cursor_serialize(&zc);
2942		} else {
2943			offset += 1;
2944		}
2945
2946		if (cooks != NULL) {
2947			*cooks++ = offset;
2948			ncooks--;
2949#ifdef __FreeBSD__
2950			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
2951#endif
2952#ifdef __NetBSD__
2953			KASSERTMSG(ncooks >= 0, "ncooks=%d", ncooks);
2954#endif
2955		}
2956	}
2957	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2958
2959	/* Subtract unused cookies */
2960	if (ncookies != NULL)
2961		*ncookies -= ncooks;
2962
2963	if (!user && uio->uio_iovcnt == 1) {
2964		iovp->iov_base += outcount;
2965		iovp->iov_len -= outcount;
2966		uio->uio_resid -= outcount;
2967	} else if (error = uiomove(outbuf, (size_t)outcount, UIO_READ, uio)) {
2968		/*
2969		 * Reset the pointer.
2970		 */
2971		offset = uio->uio_loffset;
2972	}
2973
2974update:
2975	zap_cursor_fini(&zc);
2976	if (user || uio->uio_iovcnt != 1)
2977		kmem_free(outbuf, bufsize);
2978
2979	if (error == ENOENT)
2980		error = 0;
2981
2982	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2983
2984	uio->uio_loffset = offset;
2985	ZFS_EXIT(zfsvfs);
2986	if (error != 0 && cookies != NULL) {
2987#ifdef __FreeBSD__
2988		free(*cookies, M_TEMP);
2989#endif
2990#ifdef __NetBSD__
2991		kmem_free(*cookies, ncooks * sizeof(off_t));
2992#endif
2993		*cookies = NULL;
2994		*ncookies = 0;
2995	}
2996	return (error);
2997}
2998
2999ulong_t zfs_fsync_sync_cnt = 4;
3000
3001static int
3002zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
3003{
3004	znode_t	*zp = VTOZ(vp);
3005	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
3006
3007	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
3008
3009	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
3010		ZFS_ENTER(zfsvfs);
3011		ZFS_VERIFY_ZP(zp);
3012
3013#ifdef __NetBSD__
3014		if (!zp->z_unlinked)
3015#endif
3016		zil_commit(zfsvfs->z_log, zp->z_id);
3017		ZFS_EXIT(zfsvfs);
3018	}
3019	return (0);
3020}
3021
3022
3023/*
3024 * Get the requested file attributes and place them in the provided
3025 * vattr structure.
3026 *
3027 *	IN:	vp	- vnode of file.
3028 *		vap	- va_mask identifies requested attributes.
3029 *			  If AT_XVATTR set, then optional attrs are requested
3030 *		flags	- ATTR_NOACLCHECK (CIFS server context)
3031 *		cr	- credentials of caller.
3032 *		ct	- caller context
3033 *
3034 *	OUT:	vap	- attribute values.
3035 *
3036 *	RETURN:	0 (always succeeds).
3037 */
3038/* ARGSUSED */
3039static int
3040zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
3041    caller_context_t *ct)
3042{
3043	znode_t *zp = VTOZ(vp);
3044	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
3045	int	error = 0;
3046	uint32_t blksize;
3047	u_longlong_t nblocks;
3048	uint64_t links;
3049	uint64_t mtime[2], ctime[2], crtime[2], rdev;
3050	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
3051	xoptattr_t *xoap = NULL;
3052	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
3053	sa_bulk_attr_t bulk[4];
3054	int count = 0;
3055
3056	ZFS_ENTER(zfsvfs);
3057	ZFS_VERIFY_ZP(zp);
3058
3059	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
3060
3061	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
3062	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
3063	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
3064	if (vp->v_type == VBLK || vp->v_type == VCHR)
3065		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
3066		    &rdev, 8);
3067
3068	if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
3069		ZFS_EXIT(zfsvfs);
3070		return (error);
3071	}
3072
3073	/*
3074	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
3075	 * Also, if we are the owner don't bother, since owner should
3076	 * always be allowed to read basic attributes of file.
3077	 */
3078	if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
3079	    (vap->va_uid != crgetuid(cr))) {
3080		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
3081		    skipaclchk, cr)) {
3082			ZFS_EXIT(zfsvfs);
3083			return (error);
3084		}
3085	}
3086
3087	/*
3088	 * Return all attributes.  It's cheaper to provide the answer
3089	 * than to determine whether we were asked the question.
3090	 */
3091
3092	vap->va_type = IFTOVT(zp->z_mode);
3093	vap->va_mode = zp->z_mode & ~S_IFMT;
3094#ifdef illumos
3095	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
3096#endif
3097#ifdef __FreeBSD__
3098	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
3099	vap->va_nodeid = zp->z_id;
3100#endif
3101#ifdef __NetBSD__
3102	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid;
3103	vap->va_nodeid = zp->z_id;
3104	/*
3105	 * If we are a snapshot mounted under .zfs, return
3106	 * the object id of the snapshot to make getcwd happy.
3107	 */
3108	if (zp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
3109		vnode_t *cvp = vp->v_mount->mnt_vnodecovered;
3110
3111		if (cvp && zfsctl_is_node(cvp))
3112			vap->va_nodeid = dmu_objset_id(zfsvfs->z_os);
3113	}
3114#endif
3115	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
3116		links = zp->z_links + 1;
3117	else
3118		links = zp->z_links;
3119	/* XXX NetBSD: use LINK_MAX when that value matches 32-bit nlink_t */
3120	vap->va_nlink = MIN(links, UINT32_MAX);	/* nlink_t limit! */
3121	vap->va_size = zp->z_size;
3122#ifdef illumos
3123	vap->va_rdev = vp->v_rdev;
3124#else
3125	if (vp->v_type == VBLK || vp->v_type == VCHR)
3126		vap->va_rdev = zfs_cmpldev(rdev);
3127#endif
3128	vap->va_seq = zp->z_seq;
3129	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */
3130     	vap->va_filerev = zp->z_seq;
3131
3132	/*
3133	 * Add in any requested optional attributes and the create time.
3134	 * Also set the corresponding bits in the returned attribute bitmap.
3135	 */
3136	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
3137		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
3138			xoap->xoa_archive =
3139			    ((zp->z_pflags & ZFS_ARCHIVE) != 0);
3140			XVA_SET_RTN(xvap, XAT_ARCHIVE);
3141		}
3142
3143		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
3144			xoap->xoa_readonly =
3145			    ((zp->z_pflags & ZFS_READONLY) != 0);
3146			XVA_SET_RTN(xvap, XAT_READONLY);
3147		}
3148
3149		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
3150			xoap->xoa_system =
3151			    ((zp->z_pflags & ZFS_SYSTEM) != 0);
3152			XVA_SET_RTN(xvap, XAT_SYSTEM);
3153		}
3154
3155		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
3156			xoap->xoa_hidden =
3157			    ((zp->z_pflags & ZFS_HIDDEN) != 0);
3158			XVA_SET_RTN(xvap, XAT_HIDDEN);
3159		}
3160
3161		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
3162			xoap->xoa_nounlink =
3163			    ((zp->z_pflags & ZFS_NOUNLINK) != 0);
3164			XVA_SET_RTN(xvap, XAT_NOUNLINK);
3165		}
3166
3167		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
3168			xoap->xoa_immutable =
3169			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
3170			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
3171		}
3172
3173		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
3174			xoap->xoa_appendonly =
3175			    ((zp->z_pflags & ZFS_APPENDONLY) != 0);
3176			XVA_SET_RTN(xvap, XAT_APPENDONLY);
3177		}
3178
3179		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
3180			xoap->xoa_nodump =
3181			    ((zp->z_pflags & ZFS_NODUMP) != 0);
3182			XVA_SET_RTN(xvap, XAT_NODUMP);
3183		}
3184
3185		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
3186			xoap->xoa_opaque =
3187			    ((zp->z_pflags & ZFS_OPAQUE) != 0);
3188			XVA_SET_RTN(xvap, XAT_OPAQUE);
3189		}
3190
3191		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
3192			xoap->xoa_av_quarantined =
3193			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
3194			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
3195		}
3196
3197		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
3198			xoap->xoa_av_modified =
3199			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
3200			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
3201		}
3202
3203		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
3204		    vp->v_type == VREG) {
3205			zfs_sa_get_scanstamp(zp, xvap);
3206		}
3207
3208		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
3209			xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
3210			XVA_SET_RTN(xvap, XAT_REPARSE);
3211		}
3212		if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
3213			xoap->xoa_generation = zp->z_gen;
3214			XVA_SET_RTN(xvap, XAT_GEN);
3215		}
3216
3217		if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
3218			xoap->xoa_offline =
3219			    ((zp->z_pflags & ZFS_OFFLINE) != 0);
3220			XVA_SET_RTN(xvap, XAT_OFFLINE);
3221		}
3222
3223		if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
3224			xoap->xoa_sparse =
3225			    ((zp->z_pflags & ZFS_SPARSE) != 0);
3226			XVA_SET_RTN(xvap, XAT_SPARSE);
3227		}
3228	}
3229
3230	ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
3231	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
3232	ZFS_TIME_DECODE(&vap->va_ctime, ctime);
3233	ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
3234
3235
3236	sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
3237	vap->va_blksize = blksize;
3238	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */
3239
3240	if (zp->z_blksz == 0) {
3241		/*
3242		 * Block size hasn't been set; suggest maximal I/O transfers.
3243		 */
3244		vap->va_blksize = zfsvfs->z_max_blksz;
3245	}
3246
3247	ZFS_EXIT(zfsvfs);
3248	return (0);
3249}
3250
3251/*
3252 * Set the file attributes to the values contained in the
3253 * vattr structure.
3254 *
3255 *	IN:	vp	- vnode of file to be modified.
3256 *		vap	- new attribute values.
3257 *			  If AT_XVATTR set, then optional attrs are being set
3258 *		flags	- ATTR_UTIME set if non-default time values provided.
3259 *			- ATTR_NOACLCHECK (CIFS context only).
3260 *		cr	- credentials of caller.
3261 *		ct	- caller context
3262 *
3263 *	RETURN:	0 on success, error code on failure.
3264 *
3265 * Timestamps:
3266 *	vp - ctime updated, mtime updated if size changed.
3267 */
3268/* ARGSUSED */
3269static int
3270zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
3271    caller_context_t *ct)
3272{
3273	znode_t		*zp = VTOZ(vp);
3274	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
3275	zilog_t		*zilog;
3276	dmu_tx_t	*tx;
3277	vattr_t		oldva;
3278	xvattr_t	tmpxvattr;
3279	uint_t		mask = vap->va_mask;
3280	uint_t		saved_mask = 0;
3281	uint64_t	saved_mode;
3282	int		trim_mask = 0;
3283	uint64_t	new_mode;
3284	uint64_t	new_uid, new_gid;
3285	uint64_t	xattr_obj;
3286	uint64_t	mtime[2], ctime[2];
3287	znode_t		*attrzp;
3288	int		need_policy = FALSE;
3289	int		err, err2;
3290	zfs_fuid_info_t *fuidp = NULL;
3291	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
3292	xoptattr_t	*xoap;
3293	zfs_acl_t	*aclp;
3294	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
3295	boolean_t	fuid_dirtied = B_FALSE;
3296	sa_bulk_attr_t	bulk[7], xattr_bulk[7];
3297	int		count = 0, xattr_count = 0;
3298
3299	if (mask == 0)
3300		return (0);
3301
3302	if (mask & AT_NOSET)
3303		return (SET_ERROR(EINVAL));
3304
3305	ZFS_ENTER(zfsvfs);
3306	ZFS_VERIFY_ZP(zp);
3307
3308	zilog = zfsvfs->z_log;
3309
3310	/*
3311	 * Make sure that if we have ephemeral uid/gid or xvattr specified
3312	 * that file system is at proper version level
3313	 */
3314
3315	if (zfsvfs->z_use_fuids == B_FALSE &&
3316	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
3317	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
3318	    (mask & AT_XVATTR))) {
3319		ZFS_EXIT(zfsvfs);
3320		return (SET_ERROR(EINVAL));
3321	}
3322
3323	if (mask & AT_SIZE && vp->v_type == VDIR) {
3324		ZFS_EXIT(zfsvfs);
3325		return (SET_ERROR(EISDIR));
3326	}
3327
3328	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
3329		ZFS_EXIT(zfsvfs);
3330		return (SET_ERROR(EINVAL));
3331	}
3332
3333	/*
3334	 * If this is an xvattr_t, then get a pointer to the structure of
3335	 * optional attributes.  If this is NULL, then we have a vattr_t.
3336	 */
3337	xoap = xva_getxoptattr(xvap);
3338
3339	xva_init(&tmpxvattr);
3340
3341	/*
3342	 * Immutable files can only alter immutable bit and atime
3343	 */
3344	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
3345	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
3346	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
3347		ZFS_EXIT(zfsvfs);
3348		return (SET_ERROR(EPERM));
3349	}
3350
3351	if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
3352		ZFS_EXIT(zfsvfs);
3353		return (SET_ERROR(EPERM));
3354	}
3355
3356	/*
3357	 * Verify timestamps doesn't overflow 32 bits.
3358	 * ZFS can handle large timestamps, but 32bit syscalls can't
3359	 * handle times greater than 2039.  This check should be removed
3360	 * once large timestamps are fully supported.
3361	 */
3362	if (mask & (AT_ATIME | AT_MTIME)) {
3363		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
3364		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
3365			ZFS_EXIT(zfsvfs);
3366			return (SET_ERROR(EOVERFLOW));
3367		}
3368	}
3369	if (xoap && (mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME) &&
3370	    TIMESPEC_OVERFLOW(&vap->va_birthtime)) {
3371		ZFS_EXIT(zfsvfs);
3372		return (SET_ERROR(EOVERFLOW));
3373	}
3374
3375	attrzp = NULL;
3376	aclp = NULL;
3377
3378	/* Can this be moved to before the top label? */
3379	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
3380		ZFS_EXIT(zfsvfs);
3381		return (SET_ERROR(EROFS));
3382	}
3383
3384	/*
3385	 * First validate permissions
3386	 */
3387
3388	if (mask & AT_SIZE) {
3389		/*
3390		 * XXX - Note, we are not providing any open
3391		 * mode flags here (like FNDELAY), so we may
3392		 * block if there are locks present... this
3393		 * should be addressed in openat().
3394		 */
3395		/* XXX - would it be OK to generate a log record here? */
3396		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
3397		if (err) {
3398			ZFS_EXIT(zfsvfs);
3399			return (err);
3400		}
3401	}
3402
3403	if (mask & (AT_ATIME|AT_MTIME) ||
3404	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
3405	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
3406	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
3407	    XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
3408	    XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
3409	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
3410	    XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
3411		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
3412		    skipaclchk, cr);
3413	}
3414
3415	if (mask & (AT_UID|AT_GID)) {
3416		int	idmask = (mask & (AT_UID|AT_GID));
3417		int	take_owner;
3418		int	take_group;
3419
3420		/*
3421		 * NOTE: even if a new mode is being set,
3422		 * we may clear S_ISUID/S_ISGID bits.
3423		 */
3424
3425		if (!(mask & AT_MODE))
3426			vap->va_mode = zp->z_mode;
3427
3428		/*
3429		 * Take ownership or chgrp to group we are a member of
3430		 */
3431
3432		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
3433		take_group = (mask & AT_GID) &&
3434		    zfs_groupmember(zfsvfs, vap->va_gid, cr);
3435
3436		/*
3437		 * If both AT_UID and AT_GID are set then take_owner and
3438		 * take_group must both be set in order to allow taking
3439		 * ownership.
3440		 *
3441		 * Otherwise, send the check through secpolicy_vnode_setattr()
3442		 *
3443		 */
3444
3445		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
3446		    ((idmask == AT_UID) && take_owner) ||
3447		    ((idmask == AT_GID) && take_group)) {
3448			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
3449			    skipaclchk, cr) == 0) {
3450				/*
3451				 * Remove setuid/setgid for non-privileged users
3452				 */
3453				secpolicy_setid_clear(vap, vp, cr);
3454				trim_mask = (mask & (AT_UID|AT_GID));
3455			} else {
3456				need_policy =  TRUE;
3457			}
3458		} else {
3459			need_policy =  TRUE;
3460		}
3461	}
3462
3463	oldva.va_mode = zp->z_mode;
3464	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
3465	if (mask & AT_XVATTR) {
3466		/*
3467		 * Update xvattr mask to include only those attributes
3468		 * that are actually changing.
3469		 *
3470		 * the bits will be restored prior to actually setting
3471		 * the attributes so the caller thinks they were set.
3472		 */
3473		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
3474			if (xoap->xoa_appendonly !=
3475			    ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
3476				need_policy = TRUE;
3477			} else {
3478				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
3479				XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
3480			}
3481		}
3482
3483		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
3484			if (xoap->xoa_nounlink !=
3485			    ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
3486				need_policy = TRUE;
3487			} else {
3488				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
3489				XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
3490			}
3491		}
3492
3493		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
3494			if (xoap->xoa_immutable !=
3495			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
3496				need_policy = TRUE;
3497			} else {
3498				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
3499				XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
3500			}
3501		}
3502
3503		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
3504			if (xoap->xoa_nodump !=
3505			    ((zp->z_pflags & ZFS_NODUMP) != 0)) {
3506#if 0
3507				/*
3508                                 * XXXSB - zfs_netbsd_setattr()
3509                                 * has already checked if this
3510                                 * request is authorised, and our
3511                                 * secpolicy_xvattr() doesn't check
3512                                 * kauth chflags.  Fix this when we
3513                                 * migrate to openzfs.
3514				 */
3515				need_policy = TRUE;
3516#endif
3517			} else {
3518				XVA_CLR_REQ(xvap, XAT_NODUMP);
3519				XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
3520			}
3521		}
3522
3523		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
3524			if (xoap->xoa_av_modified !=
3525			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
3526				need_policy = TRUE;
3527			} else {
3528				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
3529				XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
3530			}
3531		}
3532
3533		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
3534			if ((vp->v_type != VREG &&
3535			    xoap->xoa_av_quarantined) ||
3536			    xoap->xoa_av_quarantined !=
3537			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
3538				need_policy = TRUE;
3539			} else {
3540				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
3541				XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
3542			}
3543		}
3544
3545		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
3546			ZFS_EXIT(zfsvfs);
3547			return (SET_ERROR(EPERM));
3548		}
3549
3550		if (need_policy == FALSE &&
3551		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
3552		    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
3553			need_policy = TRUE;
3554		}
3555	}
3556
3557	if (mask & AT_MODE) {
3558		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
3559			err = secpolicy_setid_setsticky_clear(vp, vap,
3560			    &oldva, cr);
3561			if (err) {
3562				ZFS_EXIT(zfsvfs);
3563				return (err);
3564			}
3565			trim_mask |= AT_MODE;
3566		} else {
3567			need_policy = TRUE;
3568		}
3569	}
3570
3571	if (need_policy) {
3572		/*
3573		 * If trim_mask is set then take ownership
3574		 * has been granted or write_acl is present and user
3575		 * has the ability to modify mode.  In that case remove
3576		 * UID|GID and or MODE from mask so that
3577		 * secpolicy_vnode_setattr() doesn't revoke it.
3578		 */
3579
3580		if (trim_mask) {
3581			saved_mask = vap->va_mask;
3582			vap->va_mask &= ~trim_mask;
3583			if (trim_mask & AT_MODE) {
3584				/*
3585				 * Save the mode, as secpolicy_vnode_setattr()
3586				 * will overwrite it with ova.va_mode.
3587				 */
3588				saved_mode = vap->va_mode;
3589			}
3590		}
3591		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
3592		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
3593		if (err) {
3594			ZFS_EXIT(zfsvfs);
3595			return (err);
3596		}
3597
3598		if (trim_mask) {
3599			vap->va_mask |= saved_mask;
3600			if (trim_mask & AT_MODE) {
3601				/*
3602				 * Recover the mode after
3603				 * secpolicy_vnode_setattr().
3604				 */
3605				vap->va_mode = saved_mode;
3606			}
3607		}
3608	}
3609
3610	/*
3611	 * secpolicy_vnode_setattr, or take ownership may have
3612	 * changed va_mask
3613	 */
3614	mask = vap->va_mask;
3615
3616	if ((mask & (AT_UID | AT_GID))) {
3617		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
3618		    &xattr_obj, sizeof (xattr_obj));
3619
3620		if (err == 0 && xattr_obj) {
3621			err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
3622			if (err == 0) {
3623				err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE);
3624				if (err != 0)
3625					vrele(ZTOV(attrzp));
3626			}
3627			if (err)
3628				goto out2;
3629		}
3630		if (mask & AT_UID) {
3631			new_uid = zfs_fuid_create(zfsvfs,
3632			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
3633			if (new_uid != zp->z_uid &&
3634			    zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
3635				if (attrzp)
3636					vput(ZTOV(attrzp));
3637				err = SET_ERROR(EDQUOT);
3638				goto out2;
3639			}
3640		}
3641
3642		if (mask & AT_GID) {
3643			new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
3644			    cr, ZFS_GROUP, &fuidp);
3645			if (new_gid != zp->z_gid &&
3646			    zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
3647				if (attrzp)
3648					vput(ZTOV(attrzp));
3649				err = SET_ERROR(EDQUOT);
3650				goto out2;
3651			}
3652		}
3653	}
3654	tx = dmu_tx_create(zfsvfs->z_os);
3655
3656	if (mask & AT_MODE) {
3657		uint64_t pmode = zp->z_mode;
3658		uint64_t acl_obj;
3659		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
3660
3661		if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
3662		    !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
3663			err = SET_ERROR(EPERM);
3664			goto out;
3665		}
3666
3667		if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
3668			goto out;
3669
3670		if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
3671			/*
3672			 * Are we upgrading ACL from old V0 format
3673			 * to V1 format?
3674			 */
3675			if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
3676			    zfs_znode_acl_version(zp) ==
3677			    ZFS_ACL_VERSION_INITIAL) {
3678				dmu_tx_hold_free(tx, acl_obj, 0,
3679				    DMU_OBJECT_END);
3680				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3681				    0, aclp->z_acl_bytes);
3682			} else {
3683				dmu_tx_hold_write(tx, acl_obj, 0,
3684				    aclp->z_acl_bytes);
3685			}
3686		} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3687			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3688			    0, aclp->z_acl_bytes);
3689		}
3690		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3691	} else {
3692		if ((mask & AT_XVATTR) &&
3693		    XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3694			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3695		else
3696			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3697	}
3698
3699	if (attrzp) {
3700		dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3701	}
3702
3703	fuid_dirtied = zfsvfs->z_fuid_dirty;
3704	if (fuid_dirtied)
3705		zfs_fuid_txhold(zfsvfs, tx);
3706
3707	zfs_sa_upgrade_txholds(tx, zp);
3708
3709	err = dmu_tx_assign(tx, TXG_WAIT);
3710	if (err)
3711		goto out;
3712
3713	count = 0;
3714	/*
3715	 * Set each attribute requested.
3716	 * We group settings according to the locks they need to acquire.
3717	 *
3718	 * Note: you cannot set ctime directly, although it will be
3719	 * updated as a side-effect of calling this function.
3720	 */
3721
3722	if (mask & (AT_UID|AT_GID|AT_MODE))
3723		mutex_enter(&zp->z_acl_lock);
3724
3725	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3726	    &zp->z_pflags, sizeof (zp->z_pflags));
3727
3728	if (attrzp) {
3729		if (mask & (AT_UID|AT_GID|AT_MODE))
3730			mutex_enter(&attrzp->z_acl_lock);
3731		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3732		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
3733		    sizeof (attrzp->z_pflags));
3734	}
3735
3736	if (mask & (AT_UID|AT_GID)) {
3737
3738		if (mask & AT_UID) {
3739			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
3740			    &new_uid, sizeof (new_uid));
3741			zp->z_uid = new_uid;
3742			if (attrzp) {
3743				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3744				    SA_ZPL_UID(zfsvfs), NULL, &new_uid,
3745				    sizeof (new_uid));
3746				attrzp->z_uid = new_uid;
3747			}
3748		}
3749
3750		if (mask & AT_GID) {
3751			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
3752			    NULL, &new_gid, sizeof (new_gid));
3753			zp->z_gid = new_gid;
3754			if (attrzp) {
3755				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3756				    SA_ZPL_GID(zfsvfs), NULL, &new_gid,
3757				    sizeof (new_gid));
3758				attrzp->z_gid = new_gid;
3759			}
3760		}
3761		if (!(mask & AT_MODE)) {
3762			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
3763			    NULL, &new_mode, sizeof (new_mode));
3764			new_mode = zp->z_mode;
3765		}
3766		err = zfs_acl_chown_setattr(zp);
3767		ASSERT(err == 0);
3768		if (attrzp) {
3769			err = zfs_acl_chown_setattr(attrzp);
3770			ASSERT(err == 0);
3771		}
3772	}
3773
3774	if (mask & AT_MODE) {
3775		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
3776		    &new_mode, sizeof (new_mode));
3777		zp->z_mode = new_mode;
3778		ASSERT3U((uintptr_t)aclp, !=, 0);
3779		err = zfs_aclset_common(zp, aclp, cr, tx);
3780		ASSERT0(err);
3781		if (zp->z_acl_cached)
3782			zfs_acl_free(zp->z_acl_cached);
3783		zp->z_acl_cached = aclp;
3784		aclp = NULL;
3785	}
3786
3787
3788	if (mask & AT_ATIME) {
3789		ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
3790		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
3791		    &zp->z_atime, sizeof (zp->z_atime));
3792	}
3793
3794	if (mask & AT_MTIME) {
3795		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
3796		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
3797		    mtime, sizeof (mtime));
3798	}
3799
3800	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
3801	if (mask & AT_SIZE && !(mask & AT_MTIME)) {
3802		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
3803		    NULL, mtime, sizeof (mtime));
3804		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3805		    &ctime, sizeof (ctime));
3806		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
3807		    B_TRUE);
3808	} else if (mask != 0) {
3809		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3810		    &ctime, sizeof (ctime));
3811		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
3812		    B_TRUE);
3813		if (attrzp) {
3814			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3815			    SA_ZPL_CTIME(zfsvfs), NULL,
3816			    &ctime, sizeof (ctime));
3817			zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
3818			    mtime, ctime, B_TRUE);
3819		}
3820	}
3821	/*
3822	 * Do this after setting timestamps to prevent timestamp
3823	 * update from toggling bit
3824	 */
3825
3826	if (xoap && (mask & AT_XVATTR)) {
3827
3828		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
3829			xoap->xoa_createtime = vap->va_birthtime;
3830		/*
3831		 * restore trimmed off masks
3832		 * so that return masks can be set for caller.
3833		 */
3834
3835		if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
3836			XVA_SET_REQ(xvap, XAT_APPENDONLY);
3837		}
3838		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
3839			XVA_SET_REQ(xvap, XAT_NOUNLINK);
3840		}
3841		if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
3842			XVA_SET_REQ(xvap, XAT_IMMUTABLE);
3843		}
3844		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
3845			XVA_SET_REQ(xvap, XAT_NODUMP);
3846		}
3847		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
3848			XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
3849		}
3850		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
3851			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
3852		}
3853
3854		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3855			ASSERT(vp->v_type == VREG);
3856
3857		zfs_xvattr_set(zp, xvap, tx);
3858	}
3859
3860	if (fuid_dirtied)
3861		zfs_fuid_sync(zfsvfs, tx);
3862
3863	if (mask != 0)
3864		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3865
3866	if (mask & (AT_UID|AT_GID|AT_MODE))
3867		mutex_exit(&zp->z_acl_lock);
3868
3869	if (attrzp) {
3870		if (mask & (AT_UID|AT_GID|AT_MODE))
3871			mutex_exit(&attrzp->z_acl_lock);
3872	}
3873out:
3874	if (err == 0 && attrzp) {
3875		err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
3876		    xattr_count, tx);
3877		ASSERT(err2 == 0);
3878	}
3879
3880	if (attrzp)
3881		vput(ZTOV(attrzp));
3882
3883	if (aclp)
3884		zfs_acl_free(aclp);
3885
3886	if (fuidp) {
3887		zfs_fuid_info_free(fuidp);
3888		fuidp = NULL;
3889	}
3890
3891	if (err) {
3892		dmu_tx_abort(tx);
3893	} else {
3894		err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
3895		dmu_tx_commit(tx);
3896	}
3897
3898out2:
3899	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3900		zil_commit(zilog, 0);
3901
3902	ZFS_EXIT(zfsvfs);
3903	return (err);
3904}
3905
3906/*
3907 * We acquire all but fdvp locks using non-blocking acquisitions.  If we
3908 * fail to acquire any lock in the path we will drop all held locks,
3909 * acquire the new lock in a blocking fashion, and then release it and
3910 * restart the rename.  This acquire/release step ensures that we do not
3911 * spin on a lock waiting for release.  On error release all vnode locks
3912 * and decrement references the way tmpfs_rename() would do.
3913 */
3914static int
3915zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
3916    struct vnode *tdvp, struct vnode **tvpp,
3917    const struct componentname *scnp, const struct componentname *tcnp)
3918{
3919	zfsvfs_t	*zfsvfs;
3920	struct vnode	*nvp, *svp, *tvp;
3921	znode_t		*sdzp, *tdzp, *szp, *tzp;
3922#ifdef __FreeBSD__
3923	const char	*snm = scnp->cn_nameptr;
3924	const char	*tnm = tcnp->cn_nameptr;
3925#endif
3926#ifdef __NetBSD__
3927	char *snm, *tnm;
3928#endif
3929	int error;
3930
3931#ifdef __FreeBSD__
3932	VOP_UNLOCK(tdvp, 0);
3933	if (*tvpp != NULL && *tvpp != tdvp)
3934		VOP_UNLOCK(*tvpp, 0);
3935#endif
3936
3937relock:
3938	error = vn_lock(sdvp, LK_EXCLUSIVE);
3939	if (error)
3940		goto out;
3941	sdzp = VTOZ(sdvp);
3942
3943#ifdef __NetBSD__
3944	if (tdvp == sdvp) {
3945	} else {
3946#endif
3947	error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
3948	if (error != 0) {
3949		VOP_UNLOCK(sdvp, 0);
3950		if (error != EBUSY)
3951			goto out;
3952		error = vn_lock(tdvp, LK_EXCLUSIVE);
3953		if (error)
3954			goto out;
3955		VOP_UNLOCK(tdvp, 0);
3956		goto relock;
3957	}
3958#ifdef __NetBSD__
3959	} /* end if (tdvp == sdvp) */
3960#endif
3961
3962	tdzp = VTOZ(tdvp);
3963
3964	/*
3965	 * Before using sdzp and tdzp we must ensure that they are live.
3966	 * As a porting legacy from illumos we have two things to worry
3967	 * about.  One is typical for FreeBSD and it is that the vnode is
3968	 * not reclaimed (doomed).  The other is that the znode is live.
3969	 * The current code can invalidate the znode without acquiring the
3970	 * corresponding vnode lock if the object represented by the znode
3971	 * and vnode is no longer valid after a rollback or receive operation.
3972	 * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock
3973	 * that protects the znodes from the invalidation.
3974	 */
3975	zfsvfs = sdzp->z_zfsvfs;
3976	ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
3977	ZFS_ENTER(zfsvfs);
3978
3979	/*
3980	 * We can not use ZFS_VERIFY_ZP() here because it could directly return
3981	 * bypassing the cleanup code in the case of an error.
3982	 */
3983	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
3984		ZFS_EXIT(zfsvfs);
3985		VOP_UNLOCK(sdvp, 0);
3986#ifdef __NetBSD__
3987		if (tdvp != sdvp)
3988#endif
3989		VOP_UNLOCK(tdvp, 0);
3990		error = SET_ERROR(EIO);
3991		goto out;
3992	}
3993
3994	/*
3995	 * Re-resolve svp to be certain it still exists and fetch the
3996	 * correct vnode.
3997	 */
3998#ifdef __NetBSD__
3999	/* ZFS wants a null-terminated name. */
4000	snm = PNBUF_GET();
4001	strlcpy(snm, scnp->cn_nameptr, scnp->cn_namelen + 1);
4002#endif
4003	error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS);
4004#ifdef __NetBSD__
4005	PNBUF_PUT(snm);
4006#endif
4007	if (error != 0) {
4008		/* Source entry invalid or not there. */
4009		ZFS_EXIT(zfsvfs);
4010		VOP_UNLOCK(sdvp, 0);
4011#ifdef __NetBSD__
4012		if (tdvp != sdvp)
4013#endif
4014		VOP_UNLOCK(tdvp, 0);
4015		if ((scnp->cn_flags & ISDOTDOT) != 0 ||
4016		    (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
4017			error = SET_ERROR(EINVAL);
4018		goto out;
4019	}
4020	svp = ZTOV(szp);
4021
4022	/*
4023	 * Re-resolve tvp, if it disappeared we just carry on.
4024	 */
4025#ifdef __NetBSD__
4026	/* ZFS wants a null-terminated name. */
4027	tnm = PNBUF_GET();
4028	strlcpy(tnm, tcnp->cn_nameptr, tcnp->cn_namelen + 1);
4029#endif
4030	error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0);
4031#ifdef __NetBSD__
4032	PNBUF_PUT(tnm);
4033#endif
4034	if (error != 0) {
4035		ZFS_EXIT(zfsvfs);
4036		VOP_UNLOCK(sdvp, 0);
4037#ifdef __NetBSD__
4038		if (tdvp != sdvp)
4039#endif
4040		VOP_UNLOCK(tdvp, 0);
4041		vrele(svp);
4042		if ((tcnp->cn_flags & ISDOTDOT) != 0)
4043			error = SET_ERROR(EINVAL);
4044		goto out;
4045	}
4046	if (tzp != NULL)
4047		tvp = ZTOV(tzp);
4048	else
4049		tvp = NULL;
4050
4051	/*
4052	 * At present the vnode locks must be acquired before z_teardown_lock,
4053	 * although it would be more logical to use the opposite order.
4054	 */
4055	ZFS_EXIT(zfsvfs);
4056
4057	/*
4058	 * Now try acquire locks on svp and tvp.
4059	 */
4060	nvp = svp;
4061	error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
4062	if (error != 0) {
4063		VOP_UNLOCK(sdvp, 0);
4064#ifdef __NetBSD__
4065		if (tdvp != sdvp)
4066#endif
4067		VOP_UNLOCK(tdvp, 0);
4068		if (tvp != NULL)
4069			vrele(tvp);
4070		if (error != EBUSY) {
4071			vrele(nvp);
4072			goto out;
4073		}
4074		error = vn_lock(nvp, LK_EXCLUSIVE);
4075		if (error != 0) {
4076			vrele(nvp);
4077			goto out;
4078		}
4079		VOP_UNLOCK(nvp, 0);
4080		/*
4081		 * Concurrent rename race.
4082		 * XXX ?
4083		 */
4084		if (nvp == tdvp) {
4085			vrele(nvp);
4086			error = SET_ERROR(EINVAL);
4087			goto out;
4088		}
4089#ifdef __NetBSD__
4090		if (*svpp != NULL)
4091#endif
4092		vrele(*svpp);
4093		*svpp = nvp;
4094		goto relock;
4095	}
4096#ifdef __NetBSD__
4097	if (*svpp != NULL)
4098#endif
4099	vrele(*svpp);
4100	*svpp = nvp;
4101
4102	if (*tvpp != NULL)
4103		vrele(*tvpp);
4104	*tvpp = NULL;
4105	if (tvp != NULL) {
4106		nvp = tvp;
4107
4108#ifdef __NetBSD__
4109		if (tvp == svp || tvp == sdvp) {
4110		} else {
4111#endif
4112		error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
4113		if (error != 0) {
4114			VOP_UNLOCK(sdvp, 0);
4115#ifdef __NetBSD__
4116			if (tdvp != sdvp)
4117#endif
4118			VOP_UNLOCK(tdvp, 0);
4119#ifdef __NetBSD__
4120			if (*svpp != tdvp)
4121#endif
4122			VOP_UNLOCK(*svpp, 0);
4123			if (error != EBUSY) {
4124				vrele(nvp);
4125				goto out;
4126			}
4127			error = vn_lock(nvp, LK_EXCLUSIVE);
4128			if (error != 0) {
4129				vrele(nvp);
4130				goto out;
4131			}
4132			vput(nvp);
4133			goto relock;
4134		}
4135#ifdef __NetBSD__
4136		} /* end if (tvp == svp || tvp == sdvp) */
4137#endif
4138
4139		*tvpp = nvp;
4140	}
4141
4142	KASSERT(VOP_ISLOCKED(sdvp) == LK_EXCLUSIVE);
4143	KASSERT(VOP_ISLOCKED(*svpp) == LK_EXCLUSIVE);
4144	KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
4145	KASSERT(*tvpp == NULL || VOP_ISLOCKED(*tvpp) == LK_EXCLUSIVE);
4146
4147	return (0);
4148
4149out:
4150	return (error);
4151}
4152
4153/*
4154 * Note that we must use VRELE_ASYNC in this function as it walks
4155 * up the directory tree and vrele may need to acquire an exclusive
4156 * lock if a last reference to a vnode is dropped.
4157 */
4158static int
4159zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
4160{
4161	zfsvfs_t	*zfsvfs;
4162	znode_t		*zp, *zp1;
4163	uint64_t	parent;
4164	int		error;
4165
4166	zfsvfs = tdzp->z_zfsvfs;
4167	if (tdzp == szp)
4168		return (SET_ERROR(EINVAL));
4169	if (tdzp == sdzp)
4170		return (0);
4171	if (tdzp->z_id == zfsvfs->z_root)
4172		return (0);
4173	zp = tdzp;
4174	for (;;) {
4175		ASSERT(!zp->z_unlinked);
4176		if ((error = sa_lookup(zp->z_sa_hdl,
4177		    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
4178			break;
4179
4180		if (parent == szp->z_id) {
4181			error = SET_ERROR(EINVAL);
4182			break;
4183		}
4184		if (parent == zfsvfs->z_root)
4185			break;
4186		if (parent == sdzp->z_id)
4187			break;
4188
4189		error = zfs_zget(zfsvfs, parent, &zp1);
4190		if (error != 0)
4191			break;
4192
4193		if (zp != tdzp)
4194			VN_RELE_ASYNC(ZTOV(zp),
4195			    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
4196		zp = zp1;
4197	}
4198
4199	if (error == ENOTDIR)
4200		panic("checkpath: .. not a directory\n");
4201	if (zp != tdzp)
4202		VN_RELE_ASYNC(ZTOV(zp),
4203		    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
4204	return (error);
4205}
4206
4207/*
4208 * Move an entry from the provided source directory to the target
4209 * directory.  Change the entry name as indicated.
4210 *
4211 *	IN:	sdvp	- Source directory containing the "old entry".
4212 *		snm	- Old entry name.
4213 *		tdvp	- Target directory to contain the "new entry".
4214 *		tnm	- New entry name.
4215 *		cr	- credentials of caller.
4216 *		ct	- caller context
4217 *		flags	- case flags
4218 *
4219 *	RETURN:	0 on success, error code on failure.
4220 *
4221 * Timestamps:
4222 *	sdvp,tdvp - ctime|mtime updated
4223 */
4224/*ARGSUSED*/
4225static int
4226zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
4227    vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
4228    cred_t *cr)
4229{
4230	zfsvfs_t	*zfsvfs;
4231	znode_t		*sdzp, *tdzp, *szp, *tzp;
4232	zilog_t		*zilog = NULL;
4233	dmu_tx_t	*tx;
4234#ifdef __FreeBSD__
4235	char		*snm = __UNCONST(scnp->cn_nameptr);
4236	char		*tnm = __UNCONST(tcnp->cn_nameptr);
4237#endif
4238#ifdef __NetBSD__
4239	char *snm, *tnm;
4240#endif
4241	int		error = 0;
4242
4243	/* Reject renames across filesystems. */
4244	if (((*svpp) != NULL && (*svpp)->v_mount != tdvp->v_mount) ||
4245	    ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
4246		error = SET_ERROR(EXDEV);
4247		goto out;
4248	}
4249
4250	if (zfsctl_is_node(tdvp)) {
4251		error = SET_ERROR(EXDEV);
4252		goto out;
4253	}
4254
4255	/*
4256	 * Lock all four vnodes to ensure safety and semantics of renaming.
4257	 */
4258	error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
4259	if (error != 0) {
4260		/* no vnodes are locked in the case of error here */
4261		return (error);
4262	}
4263
4264	tdzp = VTOZ(tdvp);
4265	sdzp = VTOZ(sdvp);
4266	zfsvfs = tdzp->z_zfsvfs;
4267	zilog = zfsvfs->z_log;
4268#ifdef __NetBSD__
4269	/* ZFS wants a null-terminated name. */
4270	snm = PNBUF_GET();
4271	strlcpy(snm, scnp->cn_nameptr, scnp->cn_namelen + 1);
4272	tnm = PNBUF_GET();
4273	strlcpy(tnm, tcnp->cn_nameptr, tcnp->cn_namelen + 1);
4274#endif
4275
4276	/*
4277	 * After we re-enter ZFS_ENTER() we will have to revalidate all
4278	 * znodes involved.
4279	 */
4280	ZFS_ENTER(zfsvfs);
4281
4282	if (zfsvfs->z_utf8 && u8_validate(tnm,
4283	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4284		error = SET_ERROR(EILSEQ);
4285		goto unlockout;
4286	}
4287
4288#ifndef __NetBSD__
4289	/* If source and target are the same file, there is nothing to do. */
4290	if ((*svpp) == (*tvpp)) {
4291		error = 0;
4292		goto unlockout;
4293	}
4294#endif
4295
4296	if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
4297	    ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
4298	    (*tvpp)->v_mountedhere != NULL)) {
4299		error = SET_ERROR(EXDEV);
4300		goto unlockout;
4301	}
4302
4303	/*
4304	 * We can not use ZFS_VERIFY_ZP() here because it could directly return
4305	 * bypassing the cleanup code in the case of an error.
4306	 */
4307	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
4308		error = SET_ERROR(EIO);
4309		goto unlockout;
4310	}
4311
4312	szp = VTOZ(*svpp);
4313	tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
4314	if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) {
4315		error = SET_ERROR(EIO);
4316		goto unlockout;
4317	}
4318
4319	/*
4320	 * This is to prevent the creation of links into attribute space
4321	 * by renaming a linked file into/outof an attribute directory.
4322	 * See the comment in zfs_link() for why this is considered bad.
4323	 */
4324	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
4325		error = SET_ERROR(EINVAL);
4326		goto unlockout;
4327	}
4328
4329	/*
4330	 * Must have write access at the source to remove the old entry
4331	 * and write access at the target to create the new entry.
4332	 * Note that if target and source are the same, this can be
4333	 * done in a single check.
4334	 */
4335	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
4336		goto unlockout;
4337
4338	if ((*svpp)->v_type == VDIR) {
4339		/*
4340		 * Avoid ".", "..", and aliases of "." for obvious reasons.
4341		 */
4342		if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
4343		    sdzp == szp ||
4344		    (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
4345			error = SET_ERROR(EINVAL);
4346			goto unlockout;
4347		}
4348
4349		/*
4350		 * Check to make sure rename is valid.
4351		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
4352		 */
4353		if (error = zfs_rename_check(szp, sdzp, tdzp))
4354			goto unlockout;
4355	}
4356
4357	/*
4358	 * Does target exist?
4359	 */
4360	if (tzp) {
4361		/*
4362		 * Source and target must be the same type.
4363		 */
4364		if ((*svpp)->v_type == VDIR) {
4365			if ((*tvpp)->v_type != VDIR) {
4366				error = SET_ERROR(ENOTDIR);
4367				goto unlockout;
4368			} else {
4369				cache_purge(tdvp);
4370				if (sdvp != tdvp)
4371					cache_purge(sdvp);
4372			}
4373		} else {
4374			if ((*tvpp)->v_type == VDIR) {
4375				error = SET_ERROR(EISDIR);
4376				goto unlockout;
4377			}
4378		}
4379
4380		/*
4381		 * POSIX dictates that when the source and target
4382		 * entries refer to the same file object, rename
4383		 * must do nothing and exit without error.
4384		 */
4385#ifndef __NetBSD__
4386		/*
4387		 * But on NetBSD we have a different system call to do
4388		 * this, posix_rename, which sorta kinda handles this
4389		 * case (modulo races), and our tests expect BSD
4390		 * semantics for rename, so we'll do that until we can
4391		 * push the choice between BSD and POSIX semantics into
4392		 * the VOP_RENAME protocol as a flag.
4393		 */
4394		if (szp->z_id == tzp->z_id) {
4395			error = 0;
4396			goto unlockout;
4397		}
4398#endif
4399	}
4400
4401	vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
4402	if (tzp)
4403		vnevent_rename_dest(*tvpp, tdvp, tnm, ct);
4404
4405	/*
4406	 * notify the target directory if it is not the same
4407	 * as source directory.
4408	 */
4409	if (tdvp != sdvp) {
4410		vnevent_rename_dest_dir(tdvp, ct);
4411	}
4412
4413	tx = dmu_tx_create(zfsvfs->z_os);
4414	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
4415	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
4416	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
4417	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
4418	if (sdzp != tdzp) {
4419		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
4420		zfs_sa_upgrade_txholds(tx, tdzp);
4421	}
4422	if (tzp) {
4423		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
4424		zfs_sa_upgrade_txholds(tx, tzp);
4425	}
4426
4427	zfs_sa_upgrade_txholds(tx, szp);
4428	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
4429	error = dmu_tx_assign(tx, TXG_WAIT);
4430	if (error) {
4431		dmu_tx_abort(tx);
4432		goto unlockout;
4433	}
4434
4435
4436	if (tzp && (tzp->z_id != szp->z_id))
4437		/* Attempt to remove the existing target */
4438		error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);
4439
4440	if (error == 0) {
4441		if (!tzp || (tzp->z_id != szp->z_id))
4442			error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
4443		if (error == 0) {
4444			szp->z_pflags |= ZFS_AV_MODIFIED;
4445
4446			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
4447			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
4448			ASSERT0(error);
4449
4450			error = zfs_link_destroy(sdzp, snm, szp, tx,
4451			    /* Kludge for BSD rename semantics.  */
4452			    tzp && tzp->z_id == szp->z_id ? 0: ZRENAMING, NULL);
4453			if (error == 0) {
4454				zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
4455				    snm, tdzp, tnm, szp);
4456
4457				/*
4458				 * Update path information for the target vnode
4459				 */
4460				vn_renamepath(tdvp, *svpp, tnm, strlen(tnm));
4461			} else {
4462				/*
4463				 * At this point, we have successfully created
4464				 * the target name, but have failed to remove
4465				 * the source name.  Since the create was done
4466				 * with the ZRENAMING flag, there are
4467				 * complications; for one, the link count is
4468				 * wrong.  The easiest way to deal with this
4469				 * is to remove the newly created target, and
4470				 * return the original error.  This must
4471				 * succeed; fortunately, it is very unlikely to
4472				 * fail, since we just created it.
4473				 */
4474				VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx,
4475				    ZRENAMING, NULL), ==, 0);
4476			}
4477		}
4478		if (error == 0) {
4479			cache_purge(*svpp);
4480			if (*tvpp != NULL)
4481				cache_purge(*tvpp);
4482			cache_purge_negative(tdvp);
4483#ifdef __NetBSD__
4484			if (*svpp == *tvpp) {
4485				VN_KNOTE(sdvp, NOTE_WRITE);
4486				VN_KNOTE(*svpp, (szp->z_links == 0 ?
4487				    NOTE_DELETE : NOTE_LINK));
4488			} else {
4489				genfs_rename_knote(sdvp, *svpp, tdvp, *tvpp,
4490				    tzp != NULL ? tzp->z_links : 0);
4491			}
4492#endif
4493		}
4494	}
4495
4496	dmu_tx_commit(tx);
4497
4498	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4499		zil_commit(zilog, 0);
4500
4501unlockout:			/* all 4 vnodes are locked, ZFS_ENTER called */
4502	ZFS_EXIT(zfsvfs);
4503
4504	VOP_UNLOCK(*svpp, 0);
4505	VOP_UNLOCK(sdvp, 0);
4506#ifdef __NetBSD__
4507	PNBUF_PUT(snm);
4508	PNBUF_PUT(tnm);
4509#endif
4510
4511	if (*tvpp != sdvp && *tvpp != *svpp)
4512	if (*tvpp != NULL)
4513		VOP_UNLOCK(*tvpp, 0);
4514	if (tdvp != sdvp && tdvp != *svpp)
4515	if (tdvp != *tvpp)
4516		VOP_UNLOCK(tdvp, 0);
4517
4518out:
4519	return (error);
4520}
4521
4522/*
4523 * Insert the indicated symbolic reference entry into the directory.
4524 *
4525 *	IN:	dvp	- Directory to contain new symbolic link.
4526 *		link	- Name for new symlink entry.
4527 *		vap	- Attributes of new entry.
4528 *		cr	- credentials of caller.
4529 *		ct	- caller context
4530 *		flags	- case flags
4531 *
4532 *	RETURN:	0 on success, error code on failure.
4533 *
4534 * Timestamps:
4535 *	dvp - ctime|mtime updated
4536 */
4537/*ARGSUSED*/
4538static int
4539zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
4540    cred_t *cr, kthread_t *td)
4541{
4542	znode_t		*zp, *dzp = VTOZ(dvp);
4543	dmu_tx_t	*tx;
4544	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
4545	zilog_t		*zilog;
4546	uint64_t	len = strlen(link);
4547	int		error;
4548	zfs_acl_ids_t	acl_ids;
4549	boolean_t	fuid_dirtied;
4550	uint64_t	txtype = TX_SYMLINK;
4551	int		flags = 0;
4552
4553	ASSERT(vap->va_type == VLNK);
4554
4555	ZFS_ENTER(zfsvfs);
4556	ZFS_VERIFY_ZP(dzp);
4557	zilog = zfsvfs->z_log;
4558
4559	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
4560	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4561		ZFS_EXIT(zfsvfs);
4562		return (SET_ERROR(EILSEQ));
4563	}
4564
4565	if (len > MAXPATHLEN) {
4566		ZFS_EXIT(zfsvfs);
4567		return (SET_ERROR(ENAMETOOLONG));
4568	}
4569
4570	if ((error = zfs_acl_ids_create(dzp, 0,
4571	    vap, cr, NULL, &acl_ids)) != 0) {
4572		ZFS_EXIT(zfsvfs);
4573		return (error);
4574	}
4575
4576	/*
4577	 * Attempt to lock directory; fail if entry already exists.
4578	 */
4579	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
4580	if (error) {
4581		zfs_acl_ids_free(&acl_ids);
4582		ZFS_EXIT(zfsvfs);
4583		return (error);
4584	}
4585
4586	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4587		zfs_acl_ids_free(&acl_ids);
4588		ZFS_EXIT(zfsvfs);
4589		return (error);
4590	}
4591
4592	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
4593		zfs_acl_ids_free(&acl_ids);
4594		ZFS_EXIT(zfsvfs);
4595		return (SET_ERROR(EDQUOT));
4596	}
4597
4598	getnewvnode_reserve(1);
4599	tx = dmu_tx_create(zfsvfs->z_os);
4600	fuid_dirtied = zfsvfs->z_fuid_dirty;
4601	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
4602	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4603	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
4604	    ZFS_SA_BASE_ATTR_SIZE + len);
4605	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
4606	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
4607		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
4608		    acl_ids.z_aclp->z_acl_bytes);
4609	}
4610	if (fuid_dirtied)
4611		zfs_fuid_txhold(zfsvfs, tx);
4612	error = dmu_tx_assign(tx, TXG_WAIT);
4613	if (error) {
4614		zfs_acl_ids_free(&acl_ids);
4615		dmu_tx_abort(tx);
4616		getnewvnode_drop_reserve();
4617		ZFS_EXIT(zfsvfs);
4618		return (error);
4619	}
4620
4621	/*
4622	 * Create a new object for the symlink.
4623	 * for version 4 ZPL datsets the symlink will be an SA attribute
4624	 */
4625	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
4626
4627	if (fuid_dirtied)
4628		zfs_fuid_sync(zfsvfs, tx);
4629
4630	if (zp->z_is_sa)
4631		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
4632		    link, len, tx);
4633	else
4634		zfs_sa_symlink(zp, link, len, tx);
4635
4636	zp->z_size = len;
4637	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
4638	    &zp->z_size, sizeof (zp->z_size), tx);
4639	/*
4640	 * Insert the new object into the directory.
4641	 */
4642	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
4643
4644	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
4645	*vpp = ZTOV(zp);
4646
4647	zfs_acl_ids_free(&acl_ids);
4648
4649	dmu_tx_commit(tx);
4650
4651	getnewvnode_drop_reserve();
4652
4653	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4654		zil_commit(zilog, 0);
4655
4656	ZFS_EXIT(zfsvfs);
4657	return (error);
4658}
4659
4660/*
4661 * Return, in the buffer contained in the provided uio structure,
4662 * the symbolic path referred to by vp.
4663 *
4664 *	IN:	vp	- vnode of symbolic link.
4665 *		uio	- structure to contain the link path.
4666 *		cr	- credentials of caller.
4667 *		ct	- caller context
4668 *
4669 *	OUT:	uio	- structure containing the link path.
4670 *
4671 *	RETURN:	0 on success, error code on failure.
4672 *
4673 * Timestamps:
4674 *	vp - atime updated
4675 */
4676/* ARGSUSED */
4677static int
4678zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
4679{
4680	znode_t		*zp = VTOZ(vp);
4681	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4682	int		error;
4683
4684	ZFS_ENTER(zfsvfs);
4685	ZFS_VERIFY_ZP(zp);
4686
4687	if (zp->z_is_sa)
4688		error = sa_lookup_uio(zp->z_sa_hdl,
4689		    SA_ZPL_SYMLINK(zfsvfs), uio);
4690	else
4691		error = zfs_sa_readlink(zp, uio);
4692
4693	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4694
4695	ZFS_EXIT(zfsvfs);
4696	return (error);
4697}
4698
4699/*
4700 * Insert a new entry into directory tdvp referencing svp.
4701 *
4702 *	IN:	tdvp	- Directory to contain new entry.
4703 *		svp	- vnode of new entry.
4704 *		name	- name of new entry.
4705 *		cr	- credentials of caller.
4706 *		ct	- caller context
4707 *
4708 *	RETURN:	0 on success, error code on failure.
4709 *
4710 * Timestamps:
4711 *	tdvp - ctime|mtime updated
4712 *	 svp - ctime updated
4713 */
4714/* ARGSUSED */
4715static int
4716zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
4717    caller_context_t *ct, int flags)
4718{
4719	znode_t		*dzp = VTOZ(tdvp);
4720	znode_t		*tzp, *szp;
4721	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
4722	zilog_t		*zilog;
4723	dmu_tx_t	*tx;
4724	int		error;
4725	uint64_t	parent;
4726	uid_t		owner;
4727
4728	ASSERT(tdvp->v_type == VDIR);
4729
4730	ZFS_ENTER(zfsvfs);
4731	ZFS_VERIFY_ZP(dzp);
4732	zilog = zfsvfs->z_log;
4733
4734	/*
4735	 * POSIX dictates that we return EPERM here.
4736	 * Better choices include ENOTSUP or EISDIR.
4737	 */
4738	if (svp->v_type == VDIR) {
4739		ZFS_EXIT(zfsvfs);
4740		return (SET_ERROR(EPERM));
4741	}
4742
4743	szp = VTOZ(svp);
4744	ZFS_VERIFY_ZP(szp);
4745
4746	if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) {
4747		ZFS_EXIT(zfsvfs);
4748		return (SET_ERROR(EPERM));
4749	}
4750
4751	/* Prevent links to .zfs/shares files */
4752
4753	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
4754	    &parent, sizeof (uint64_t))) != 0) {
4755		ZFS_EXIT(zfsvfs);
4756		return (error);
4757	}
4758	if (parent == zfsvfs->z_shares_dir) {
4759		ZFS_EXIT(zfsvfs);
4760		return (SET_ERROR(EPERM));
4761	}
4762
4763	if (zfsvfs->z_utf8 && u8_validate(name,
4764	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4765		ZFS_EXIT(zfsvfs);
4766		return (SET_ERROR(EILSEQ));
4767	}
4768
4769	/*
4770	 * We do not support links between attributes and non-attributes
4771	 * because of the potential security risk of creating links
4772	 * into "normal" file space in order to circumvent restrictions
4773	 * imposed in attribute space.
4774	 */
4775	if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
4776		ZFS_EXIT(zfsvfs);
4777		return (SET_ERROR(EINVAL));
4778	}
4779
4780
4781	owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
4782	if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) {
4783		ZFS_EXIT(zfsvfs);
4784		return (SET_ERROR(EPERM));
4785	}
4786
4787	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4788		ZFS_EXIT(zfsvfs);
4789		return (error);
4790	}
4791
4792	/*
4793	 * Attempt to lock directory; fail if entry already exists.
4794	 */
4795	error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW);
4796	if (error) {
4797		ZFS_EXIT(zfsvfs);
4798		return (error);
4799	}
4800
4801	tx = dmu_tx_create(zfsvfs->z_os);
4802	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
4803	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4804	zfs_sa_upgrade_txholds(tx, szp);
4805	zfs_sa_upgrade_txholds(tx, dzp);
4806	error = dmu_tx_assign(tx, TXG_WAIT);
4807	if (error) {
4808		dmu_tx_abort(tx);
4809		ZFS_EXIT(zfsvfs);
4810		return (error);
4811	}
4812
4813	error = zfs_link_create(dzp, name, szp, tx, 0);
4814
4815	if (error == 0) {
4816		uint64_t txtype = TX_LINK;
4817		zfs_log_link(zilog, tx, txtype, dzp, szp, name);
4818	}
4819
4820	dmu_tx_commit(tx);
4821
4822	if (error == 0) {
4823		vnevent_link(svp, ct);
4824	}
4825
4826	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4827		zil_commit(zilog, 0);
4828
4829	ZFS_EXIT(zfsvfs);
4830	return (error);
4831}
4832
4833
4834/*ARGSUSED*/
4835void
4836zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4837{
4838	znode_t	*zp = VTOZ(vp);
4839	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4840	int error;
4841
4842	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4843	if (zp->z_sa_hdl == NULL) {
4844		/*
4845		 * The fs has been unmounted, or we did a
4846		 * suspend/resume and this file no longer exists.
4847		 */
4848		rw_exit(&zfsvfs->z_teardown_inactive_lock);
4849		vrecycle(vp);
4850		return;
4851	}
4852
4853	if (zp->z_unlinked) {
4854		/*
4855		 * Fast path to recycle a vnode of a removed file.
4856		 */
4857		rw_exit(&zfsvfs->z_teardown_inactive_lock);
4858		vrecycle(vp);
4859		return;
4860	}
4861
4862	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
4863		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
4864
4865		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4866		zfs_sa_upgrade_txholds(tx, zp);
4867		error = dmu_tx_assign(tx, TXG_WAIT);
4868		if (error) {
4869			dmu_tx_abort(tx);
4870		} else {
4871			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
4872			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
4873			zp->z_atime_dirty = 0;
4874			dmu_tx_commit(tx);
4875		}
4876	}
4877	rw_exit(&zfsvfs->z_teardown_inactive_lock);
4878}
4879
4880
4881#ifdef __FreeBSD__
4882CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
4883CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
4884#endif
4885
4886/*ARGSUSED*/
4887static int
4888zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
4889{
4890	znode_t		*zp = VTOZ(vp);
4891	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4892	uint32_t	gen;
4893	uint64_t	gen64;
4894	uint64_t	object = zp->z_id;
4895	zfid_short_t	*zfid;
4896	int		size, i, error;
4897
4898	ZFS_ENTER(zfsvfs);
4899	ZFS_VERIFY_ZP(zp);
4900
4901	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
4902	    &gen64, sizeof (uint64_t))) != 0) {
4903		ZFS_EXIT(zfsvfs);
4904		return (error);
4905	}
4906
4907	gen = (uint32_t)gen64;
4908
4909	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
4910
4911#ifdef illumos
4912	if (fidp->fid_len < size) {
4913		fidp->fid_len = size;
4914		ZFS_EXIT(zfsvfs);
4915		return (SET_ERROR(ENOSPC));
4916	}
4917#else
4918	fidp->fid_len = size;
4919#endif
4920
4921	zfid = (zfid_short_t *)fidp;
4922
4923	zfid->zf_len = size;
4924
4925	for (i = 0; i < sizeof (zfid->zf_object); i++)
4926		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
4927
4928	/* Must have a non-zero generation number to distinguish from .zfs */
4929	if (gen == 0)
4930		gen = 1;
4931	for (i = 0; i < sizeof (zfid->zf_gen); i++)
4932		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
4933
4934	if (size == LONG_FID_LEN) {
4935		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
4936		zfid_long_t	*zlfid;
4937
4938		zlfid = (zfid_long_t *)fidp;
4939
4940		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
4941			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
4942
4943		/* XXX - this should be the generation number for the objset */
4944		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
4945			zlfid->zf_setgen[i] = 0;
4946	}
4947
4948	ZFS_EXIT(zfsvfs);
4949	return (0);
4950}
4951
4952static int
4953zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
4954    caller_context_t *ct)
4955{
4956	znode_t		*zp, *xzp;
4957	zfsvfs_t	*zfsvfs;
4958	int		error;
4959
4960	switch (cmd) {
4961	case _PC_LINK_MAX:
4962		*valp = INT_MAX;
4963		return (0);
4964
4965	case _PC_FILESIZEBITS:
4966		*valp = 64;
4967		return (0);
4968#ifdef illumos
4969	case _PC_XATTR_EXISTS:
4970		zp = VTOZ(vp);
4971		zfsvfs = zp->z_zfsvfs;
4972		ZFS_ENTER(zfsvfs);
4973		ZFS_VERIFY_ZP(zp);
4974		*valp = 0;
4975		error = zfs_dirent_lookup(zp, "", &xzp,
4976		    ZXATTR | ZEXISTS | ZSHARED);
4977		if (error == 0) {
4978			if (!zfs_dirempty(xzp))
4979				*valp = 1;
4980			vrele(ZTOV(xzp));
4981		} else if (error == ENOENT) {
4982			/*
4983			 * If there aren't extended attributes, it's the
4984			 * same as having zero of them.
4985			 */
4986			error = 0;
4987		}
4988		ZFS_EXIT(zfsvfs);
4989		return (error);
4990
4991	case _PC_SATTR_ENABLED:
4992	case _PC_SATTR_EXISTS:
4993		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
4994		    (vp->v_type == VREG || vp->v_type == VDIR);
4995		return (0);
4996
4997	case _PC_ACCESS_FILTERING:
4998		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
4999		    vp->v_type == VDIR;
5000		return (0);
5001
5002	case _PC_ACL_ENABLED:
5003		*valp = _ACL_ACE_ENABLED;
5004		return (0);
5005#endif	/* illumos */
5006	case _PC_MIN_HOLE_SIZE:
5007		*valp = (int)SPA_MINBLOCKSIZE;
5008		return (0);
5009#ifdef illumos
5010	case _PC_TIMESTAMP_RESOLUTION:
5011		/* nanosecond timestamp resolution */
5012		*valp = 1L;
5013		return (0);
5014#endif
5015	case _PC_ACL_EXTENDED:
5016		*valp = 0;
5017		return (0);
5018
5019#ifndef __NetBSD__
5020	case _PC_ACL_NFS4:
5021		*valp = 1;
5022		return (0);
5023
5024	case _PC_ACL_PATH_MAX:
5025		*valp = ACL_MAX_ENTRIES;
5026		return (0);
5027#endif
5028
5029	default:
5030		return (EOPNOTSUPP);
5031	}
5032}
5033
5034/*ARGSUSED*/
5035static int
5036zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
5037    caller_context_t *ct)
5038{
5039	znode_t *zp = VTOZ(vp);
5040	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5041	int error;
5042	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
5043
5044	ZFS_ENTER(zfsvfs);
5045	ZFS_VERIFY_ZP(zp);
5046	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
5047	ZFS_EXIT(zfsvfs);
5048
5049	return (error);
5050}
5051
5052/*ARGSUSED*/
5053int
5054zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
5055    caller_context_t *ct)
5056{
5057	znode_t *zp = VTOZ(vp);
5058	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5059	int error;
5060	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
5061	zilog_t	*zilog = zfsvfs->z_log;
5062
5063	ZFS_ENTER(zfsvfs);
5064	ZFS_VERIFY_ZP(zp);
5065
5066	error = zfs_setacl(zp, vsecp, skipaclchk, cr);
5067
5068	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
5069		zil_commit(zilog, 0);
5070
5071	ZFS_EXIT(zfsvfs);
5072	return (error);
5073}
5074
5075static int
5076ioflags(int ioflags)
5077{
5078	int flags = 0;
5079
5080	if (ioflags & IO_APPEND)
5081		flags |= FAPPEND;
5082	if (ioflags & IO_NDELAY)
5083		flags |= FNONBLOCK;
5084	if (ioflags & IO_SYNC)
5085		flags |= (FSYNC | FDSYNC | FRSYNC);
5086
5087	return (flags);
5088}
5089
5090#ifdef __NetBSD__
5091
5092static int
5093zfs_netbsd_open(void *v)
5094{
5095	struct vop_open_args *ap = v;
5096
5097	return (zfs_open(&ap->a_vp, ap->a_mode, ap->a_cred, NULL));
5098}
5099
5100static int
5101zfs_netbsd_close(void *v)
5102{
5103	struct vop_close_args *ap = v;
5104
5105	return (zfs_close(ap->a_vp, ap->a_fflag, 0, 0, ap->a_cred, NULL));
5106}
5107
5108static int
5109zfs_netbsd_ioctl(void *v)
5110{
5111	struct vop_ioctl_args *ap = v;
5112
5113	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
5114		ap->a_fflag, ap->a_cred, NULL, NULL));
5115}
5116
5117
5118static int
5119zfs_netbsd_read(void *v)
5120{
5121	struct vop_read_args *ap = v;
5122	vnode_t *vp = ap->a_vp;
5123	znode_t *zp = VTOZ(vp);
5124
5125	switch (vp->v_type) {
5126	case VBLK:
5127	case VCHR:
5128		ZFS_ACCESSTIME_STAMP(zp->z_zfsvfs, zp);
5129		return (VOCALL(spec_vnodeop_p, VOFFSET(vop_read), ap));
5130	case VFIFO:
5131		ZFS_ACCESSTIME_STAMP(zp->z_zfsvfs, zp);
5132		return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_read), ap));
5133	}
5134
5135	return (zfs_read(vp, ap->a_uio, ioflags(ap->a_ioflag), ap->a_cred, NULL));
5136}
5137
5138static int
5139zfs_netbsd_write(void *v)
5140{
5141	struct vop_write_args *ap = v;
5142	vnode_t *vp = ap->a_vp;
5143	znode_t *zp = VTOZ(vp);
5144	struct uio *uio = ap->a_uio;
5145	off_t osize = zp->z_size;
5146	int error, resid;
5147
5148	switch (vp->v_type) {
5149	case VBLK:
5150	case VCHR:
5151		GOP_MARKUPDATE(vp, GOP_UPDATE_MODIFIED);
5152		return (VOCALL(spec_vnodeop_p, VOFFSET(vop_write), ap));
5153	case VFIFO:
5154		GOP_MARKUPDATE(vp, GOP_UPDATE_MODIFIED);
5155		return (VOCALL(fifo_vnodeop_p, VOFFSET(vop_write), ap));
5156	}
5157
5158	resid = uio->uio_resid;
5159	error = zfs_write(vp, uio, ioflags(ap->a_ioflag), ap->a_cred, NULL);
5160
5161	return error;
5162}
5163
5164static int
5165zfs_netbsd_access(void *v)
5166{
5167	struct vop_access_args /* {
5168		struct vnode *a_vp;
5169		accmode_t a_accmode;
5170		kauth_cred_t a_cred;
5171	} */ *ap = v;
5172	vnode_t *vp = ap->a_vp;
5173	znode_t *zp = VTOZ(vp);
5174	accmode_t accmode;
5175	kauth_cred_t cred = ap->a_cred;
5176	int error = 0;
5177
5178	/*
5179	 * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND,
5180	 */
5181	accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
5182	if (accmode != 0)
5183		error = zfs_access(vp, accmode, 0, cred, NULL);
5184
5185	/*
5186	 * VADMIN has to be handled by kauth_authorize_vnode().
5187	 */
5188	if (error == 0) {
5189		accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
5190		if (accmode != 0) {
5191			error = kauth_authorize_vnode(cred,
5192			    KAUTH_ACCESS_ACTION(accmode, vp->v_type,
5193			    zp->z_mode & ALLPERMS), vp, NULL,
5194			    genfs_can_access(vp, cred, zp->z_uid,
5195			    zp->z_gid, zp->z_mode & ALLPERMS, NULL, accmode));
5196		}
5197	}
5198
5199	/*
5200	 * For VEXEC, ensure that at least one execute bit is set for
5201	 * non-directories.
5202	 */
5203	if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
5204	    (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
5205		error = EACCES;
5206	}
5207
5208	/* We expect EACCES as common error. */
5209	if (error == EPERM)
5210		error = EACCES;
5211
5212	return error;
5213}
5214
5215static int
5216zfs_netbsd_lookup(void *v)
5217{
5218	struct vop_lookup_v2_args /* {
5219		struct vnode *a_dvp;
5220		struct vnode **a_vpp;
5221		struct componentname *a_cnp;
5222	} */ *ap = v;
5223	struct vnode *dvp = ap->a_dvp;
5224	struct vnode **vpp = ap->a_vpp;
5225	struct componentname *cnp = ap->a_cnp;
5226	char *nm, short_nm[31];
5227	int error;
5228	int iswhiteout;
5229
5230	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5231
5232	*vpp = NULL;
5233
5234	/*
5235	 * Do an access check before the cache lookup.  zfs_lookup does
5236	 * an access check too, but it's too scary to contemplate
5237	 * injecting our namecache stuff into zfs internals.
5238	 *
5239	 * XXX Is this the correct access check?
5240	 */
5241	if ((error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred)) != 0)
5242		goto out;
5243
5244	/*
5245	 * Check the namecache before entering zfs_lookup.
5246	 * cache_lookup does the locking dance for us.
5247	 */
5248	if (cache_lookup(dvp, cnp->cn_nameptr, cnp->cn_namelen,
5249	    cnp->cn_nameiop, cnp->cn_flags, &iswhiteout, vpp)) {
5250		if (iswhiteout) {
5251			cnp->cn_flags |= ISWHITEOUT;
5252		}
5253		return *vpp == NULL ? ENOENT : 0;
5254	}
5255
5256	/*
5257	 * zfs_lookup wants a null-terminated component name, but namei
5258	 * gives us a pointer into the full pathname.
5259	 */
5260	ASSERT(cnp->cn_namelen < PATH_MAX - 1);
5261	if (cnp->cn_namelen + 1 > sizeof(short_nm))
5262		nm = PNBUF_GET();
5263	else
5264		nm = short_nm;
5265	(void)strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1);
5266
5267	error = zfs_lookup(dvp, nm, vpp, 0, cnp, cnp->cn_nameiop, cnp->cn_cred);
5268
5269	if (nm != short_nm)
5270		PNBUF_PUT(nm);
5271
5272	/*
5273	 * Translate errors to match our namei insanity.  Also, if the
5274	 * caller wants to create an entry here, it's apparently our
5275	 * responsibility as lookup to make sure that's permissible.
5276	 * Go figure.
5277	 */
5278	if (cnp->cn_flags & ISLASTCN) {
5279		switch (cnp->cn_nameiop) {
5280		case CREATE:
5281		case RENAME:
5282			if (error == ENOENT) {
5283				error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred);
5284				if (error)
5285					break;
5286				error = EJUSTRETURN;
5287				break;
5288			}
5289			break;
5290		case DELETE:
5291			if (error == 0) {
5292				error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred);
5293				if (error) {
5294					VN_RELE(*vpp);
5295					*vpp = NULL;
5296				}
5297			}
5298			break;
5299		}
5300	}
5301
5302	if (error) {
5303		KASSERT(*vpp == NULL);
5304		goto out;
5305	}
5306	KASSERT(*vpp != NULL);
5307
5308	if ((cnp->cn_namelen == 1) && (cnp->cn_nameptr[0] == '.')) {
5309		KASSERT(!(cnp->cn_flags & ISDOTDOT));
5310		KASSERT(dvp == *vpp);
5311	} else if ((cnp->cn_namelen == 2) &&
5312	    (cnp->cn_nameptr[0] == '.') &&
5313	    (cnp->cn_nameptr[1] == '.')) {
5314		KASSERT(cnp->cn_flags & ISDOTDOT);
5315	} else {
5316		KASSERT(!(cnp->cn_flags & ISDOTDOT));
5317	}
5318
5319out:
5320	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5321
5322	/*
5323	 * Insert name into cache if appropriate.
5324	 */
5325
5326	if (error == 0 || (error == ENOENT && cnp->cn_nameiop != CREATE))
5327		cache_enter(dvp, *vpp, cnp->cn_nameptr, cnp->cn_namelen,
5328		    cnp->cn_flags);
5329
5330	return (error);
5331}
5332
5333static int
5334zfs_netbsd_create(void *v)
5335{
5336	struct vop_create_v3_args /* {
5337		struct vnode *a_dvp;
5338		struct vnode **a_vpp;
5339		struct componentname *a_cnp;
5340		struct vattr *a_vap;
5341	} */ *ap = v;
5342	struct vnode *dvp = ap->a_dvp;
5343	struct vnode **vpp = ap->a_vpp;
5344	struct componentname *cnp = ap->a_cnp;
5345	struct vattr *vap = ap->a_vap;
5346	char *nm;
5347	int mode;
5348	int error;
5349
5350	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5351
5352	vattr_init_mask(vap);
5353	mode = vap->va_mode & ALLPERMS;
5354
5355	/* ZFS wants a null-terminated name. */
5356	nm = PNBUF_GET();
5357	(void)strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1);
5358
5359	/* XXX !EXCL is wrong here...  */
5360	error = zfs_create(dvp, nm, vap, !EXCL, mode, vpp, cnp->cn_cred, NULL);
5361
5362	PNBUF_PUT(nm);
5363
5364	KASSERT((error == 0) == (*vpp != NULL));
5365	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5366	if (*vpp != NULL)
5367		VOP_UNLOCK(*vpp, 0);
5368
5369	return (error);
5370}
5371
5372static int
5373zfs_netbsd_mknod(void *v)
5374{
5375	struct vop_mknod_v3_args /* {
5376		struct vnode *a_dvp;
5377		struct vnode **a_vpp;
5378		struct componentname *a_cnp;
5379		struct vattr *a_vap;
5380	} */ *ap = v;
5381	struct vnode *dvp = ap->a_dvp;
5382	struct vnode **vpp = ap->a_vpp;
5383	struct componentname *cnp = ap->a_cnp;
5384	struct vattr *vap = ap->a_vap;
5385	char *nm;
5386	int mode;
5387	int error;
5388
5389	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5390
5391	vattr_init_mask(vap);
5392	mode = vap->va_mode & ALLPERMS;
5393
5394	/* ZFS wants a null-terminated name. */
5395	nm = PNBUF_GET();
5396	(void)strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1);
5397
5398	/* XXX !EXCL is wrong here...  */
5399	error = zfs_create(dvp, nm, vap, !EXCL, mode, vpp, cnp->cn_cred, NULL);
5400
5401	PNBUF_PUT(nm);
5402
5403	KASSERT((error == 0) == (*vpp != NULL));
5404	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5405	if (*vpp != NULL)
5406		VOP_UNLOCK(*vpp, 0);
5407
5408	return (error);
5409}
5410
5411static int
5412zfs_netbsd_remove(void *v)
5413{
5414	struct vop_remove_v3_args /* {
5415		struct vnode *a_dvp;
5416		struct vnode *a_vp;
5417		struct componentname *a_cnp;
5418		nlink_t ctx_vp_new_nlink;
5419	} */ *ap = v;
5420	struct vnode *dvp = ap->a_dvp;
5421	struct vnode *vp = ap->a_vp;
5422	struct componentname *cnp = ap->a_cnp;
5423	char *nm;
5424	int error;
5425
5426	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5427	KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
5428
5429	/* ZFS wants a null-terminated name. */
5430	nm = PNBUF_GET();
5431	(void)strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1);
5432
5433	error = zfs_remove(dvp, vp, nm, cnp->cn_cred);
5434
5435	/*
5436	 * XXX Should update ctx_vp_new_nlink, but for now the
5437	 * XXX the kevent sent on "vp"  matches historical behavior.
5438	 */
5439
5440	PNBUF_PUT(nm);
5441	vput(vp);
5442	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5443	return (error);
5444}
5445
5446static int
5447zfs_netbsd_mkdir(void *v)
5448{
5449	struct vop_mkdir_v3_args /* {
5450		struct vnode *a_dvp;
5451		struct vnode **a_vpp;
5452		struct componentname *a_cnp;
5453		struct vattr *a_vap;
5454	} */ *ap = v;
5455	struct vnode *dvp = ap->a_dvp;
5456	struct vnode **vpp = ap->a_vpp;
5457	struct componentname *cnp = ap->a_cnp;
5458	struct vattr *vap = ap->a_vap;
5459	char *nm;
5460	int error;
5461
5462	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5463
5464	vattr_init_mask(vap);
5465
5466	/* ZFS wants a null-terminated name. */
5467	nm = PNBUF_GET();
5468	(void)strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1);
5469
5470	error = zfs_mkdir(dvp, nm, vap, vpp, cnp->cn_cred);
5471
5472	PNBUF_PUT(nm);
5473
5474	KASSERT((error == 0) == (*vpp != NULL));
5475	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5476	if (*vpp != NULL)
5477		VOP_UNLOCK(*vpp, 0);
5478
5479	return (error);
5480}
5481
5482static int
5483zfs_netbsd_rmdir(void *v)
5484{
5485	struct vop_rmdir_v2_args /* {
5486		struct vnode *a_dvp;
5487		struct vnode *a_vp;
5488		struct componentname *a_cnp;
5489	} */ *ap = v;
5490	struct vnode *dvp = ap->a_dvp;
5491	struct vnode *vp = ap->a_vp;
5492	struct componentname *cnp = ap->a_cnp;
5493	char *nm;
5494	int error;
5495
5496	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5497	KASSERT(VOP_ISLOCKED(vp) == LK_EXCLUSIVE);
5498
5499	/* ZFS wants a null-terminated name. */
5500	nm = PNBUF_GET();
5501	(void)strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1);
5502
5503	error = zfs_rmdir(dvp, vp, nm, cnp->cn_cred);
5504
5505	PNBUF_PUT(nm);
5506	vput(vp);
5507	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5508	return error;
5509}
5510
5511static int
5512zfs_netbsd_readdir(void *v)
5513{
5514	struct vop_readdir_args *ap = v;
5515
5516	return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
5517		ap->a_ncookies, ap->a_cookies));
5518}
5519
5520static int
5521zfs_netbsd_fsync(void *v)
5522{
5523	struct vop_fsync_args *ap = v;
5524
5525	return (zfs_fsync(ap->a_vp, ap->a_flags, ap->a_cred, NULL));
5526}
5527
5528static int
5529zfs_spec_fsync(void *v)
5530{
5531	struct vop_fsync_args *ap = v;
5532	int error;
5533
5534	error = spec_fsync(v);
5535	if (error)
5536		return error;
5537
5538	return (zfs_fsync(ap->a_vp, ap->a_flags, ap->a_cred, NULL));
5539}
5540
5541static int
5542zfs_netbsd_getattr(void *v)
5543{
5544	struct vop_getattr_args *ap = v;
5545	vattr_t *vap = ap->a_vap;
5546	xvattr_t xvap;
5547	u_long fflags = 0;
5548	int error;
5549
5550	xva_init(&xvap);
5551	xvap.xva_vattr = *vap;
5552	xvap.xva_vattr.va_mask |= AT_XVATTR;
5553
5554	/* Convert chflags into ZFS-type flags. */
5555	/* XXX: what about SF_SETTABLE?. */
5556	XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
5557	XVA_SET_REQ(&xvap, XAT_APPENDONLY);
5558	XVA_SET_REQ(&xvap, XAT_NOUNLINK);
5559	XVA_SET_REQ(&xvap, XAT_NODUMP);
5560	error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
5561	if (error != 0)
5562		return (error);
5563
5564	/* Convert ZFS xattr into chflags. */
5565#define	FLAG_CHECK(fflag, xflag, xfield)	do {			\
5566	if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0)		\
5567		fflags |= (fflag);					\
5568} while (0)
5569	FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
5570	    xvap.xva_xoptattrs.xoa_immutable);
5571	FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
5572	    xvap.xva_xoptattrs.xoa_appendonly);
5573	FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
5574	    xvap.xva_xoptattrs.xoa_nounlink);
5575	FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
5576	    xvap.xva_xoptattrs.xoa_nodump);
5577#undef	FLAG_CHECK
5578	*vap = xvap.xva_vattr;
5579	vap->va_flags = fflags;
5580	return (0);
5581}
5582
5583static int
5584zfs_netbsd_setattr(void *v)
5585{
5586	struct vop_setattr_args *ap = v;
5587	vnode_t *vp = ap->a_vp;
5588	vattr_t *vap = ap->a_vap;
5589	cred_t *cred = ap->a_cred;
5590	znode_t *zp = VTOZ(vp);
5591	xvattr_t xvap;
5592	kauth_action_t action;
5593	u_long fflags, sfflags = 0;
5594	uint64_t zflags;
5595	int error, flags = 0;
5596	bool changing_sysflags;
5597
5598	vattr_init_mask(vap);
5599	vap->va_mask &= ~AT_NOSET;
5600	if (ISSET(vap->va_vaflags, VA_UTIMES_NULL))
5601		flags |= ATTR_UTIME;
5602
5603	xva_init(&xvap);
5604	xvap.xva_vattr = *vap;
5605
5606	zflags = VTOZ(vp)->z_pflags;
5607
5608	/* Ignore size changes on device nodes. */
5609	if (vp->v_type == VBLK || vp->v_type == VCHR)
5610		xvap.xva_vattr.va_mask &= ~AT_SIZE;
5611	if (vap->va_flags != VNOVAL) {
5612		int error;
5613
5614		fflags = vap->va_flags;
5615		if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_NODUMP)) != 0)
5616			return (EOPNOTSUPP);
5617
5618#define	FLAG_CHANGE(fflag, zflag, xflag, xfield)	do {		\
5619	if (((fflags & (fflag)) && !(zflags & (zflag))) ||		\
5620	    ((zflags & (zflag)) && !(fflags & (fflag)))) {		\
5621		XVA_SET_REQ(&xvap, (xflag));				\
5622		(xfield) = ((fflags & (fflag)) != 0);			\
5623		if (((fflag) & SF_SETTABLE) != 0)			\
5624			sfflags |= (fflag);				\
5625	}								\
5626} while (0)
5627		/* Convert chflags into ZFS-type flags. */
5628		/* XXX: what about SF_SETTABLE?. */
5629		FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
5630		    xvap.xva_xoptattrs.xoa_immutable);
5631		FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
5632		    xvap.xva_xoptattrs.xoa_appendonly);
5633		FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
5634		    xvap.xva_xoptattrs.xoa_nounlink);
5635		FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
5636		    xvap.xva_xoptattrs.xoa_nodump);
5637#undef	FLAG_CHANGE
5638
5639		action = KAUTH_VNODE_WRITE_FLAGS;
5640		changing_sysflags = false;
5641
5642		if (zflags & (ZFS_IMMUTABLE|ZFS_APPENDONLY|ZFS_NOUNLINK)) {
5643			action |= KAUTH_VNODE_HAS_SYSFLAGS;
5644		}
5645		if (sfflags != 0) {
5646			action |= KAUTH_VNODE_WRITE_SYSFLAGS;
5647			changing_sysflags = true;
5648		}
5649
5650		error = kauth_authorize_vnode(cred, action, vp, NULL,
5651		    genfs_can_chflags(vp, cred, zp->z_uid, changing_sysflags));
5652		if (error)
5653			return error;
5654	}
5655
5656	if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL ||
5657	    vap->va_birthtime.tv_sec != VNOVAL) {
5658		error = kauth_authorize_vnode(cred, KAUTH_VNODE_WRITE_TIMES, vp,
5659		     NULL, genfs_can_chtimes(vp, cred, zp->z_uid,
5660		     vap->va_vaflags));
5661		if (error)
5662			return error;
5663	}
5664
5665	error = zfs_setattr(vp, (vattr_t *)&xvap, flags, cred, NULL);
5666	if (error)
5667		return error;
5668
5669	cache_enter_id(vp, zp->z_mode, zp->z_uid, zp->z_gid, true);
5670
5671	return error;
5672}
5673
5674static int
5675zfs_netbsd_rename(void *v)
5676{
5677	struct vop_rename_args /* {
5678		struct vnode *a_fdvp;
5679		struct vnode *a_fvp;
5680		struct componentname *a_fcnp;
5681		struct vnode *a_tdvp;
5682		struct vnode *a_tvp;
5683		struct componentname *a_tcnp;
5684	} */ *ap = v;
5685	vnode_t *fdvp = ap->a_fdvp;
5686	vnode_t *fvp = ap->a_fvp;
5687	struct componentname *fcnp = ap->a_fcnp;
5688	vnode_t *tdvp = ap->a_tdvp;
5689	vnode_t *tvp = ap->a_tvp;
5690	struct componentname *tcnp = ap->a_tcnp;
5691	kauth_cred_t cred;
5692	int error;
5693
5694	KASSERT(VOP_ISLOCKED(tdvp) == LK_EXCLUSIVE);
5695	KASSERT(tvp == NULL || VOP_ISLOCKED(tvp) == LK_EXCLUSIVE);
5696	KASSERT(fdvp->v_type == VDIR);
5697	KASSERT(tdvp->v_type == VDIR);
5698
5699	cred = fcnp->cn_cred;
5700
5701	/*
5702	 * XXX Want a better equality test.  `tcnp->cn_cred == cred'
5703	 * hoses p2k because puffs transmits the creds separately and
5704	 * allocates distinct but equivalent structures for them.
5705	 */
5706	KASSERT(kauth_cred_uidmatch(cred, tcnp->cn_cred));
5707
5708	/*
5709	 * Drop the insane locks.
5710	 */
5711	VOP_UNLOCK(tdvp, 0);
5712	if (tvp != NULL && tvp != tdvp)
5713		VOP_UNLOCK(tvp, 0);
5714
5715	/*
5716	 * Release the source and target nodes; zfs_rename will look
5717	 * them up again once the locking situation is sane.
5718	 */
5719	VN_RELE(fvp);
5720	if (tvp != NULL)
5721		VN_RELE(tvp);
5722	fvp = NULL;
5723	tvp = NULL;
5724
5725	/*
5726	 * Do the rename ZFSly.
5727	 */
5728	error = zfs_rename(fdvp, &fvp, fcnp, tdvp, &tvp, tcnp, cred);
5729
5730	/*
5731	 * Release the directories now too, because the VOP_RENAME
5732	 * protocol is insane.
5733	 */
5734
5735	VN_RELE(fdvp);
5736	VN_RELE(tdvp);
5737	if (fvp != NULL)
5738		VN_RELE(fvp);
5739	if (tvp != NULL)
5740		VN_RELE(tvp);
5741
5742	return (error);
5743}
5744
5745static int
5746zfs_netbsd_symlink(void *v)
5747{
5748	struct vop_symlink_v3_args /* {
5749		struct vnode *a_dvp;
5750		struct vnode **a_vpp;
5751		struct componentname *a_cnp;
5752		struct vattr *a_vap;
5753		char *a_target;
5754	} */ *ap = v;
5755	struct vnode *dvp = ap->a_dvp;
5756	struct vnode **vpp = ap->a_vpp;
5757	struct componentname *cnp = ap->a_cnp;
5758	struct vattr *vap = ap->a_vap;
5759	char *target = ap->a_target;
5760	char *nm;
5761	int error;
5762
5763	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5764
5765	vap->va_type = VLNK;	/* Netbsd: Syscall only sets va_mode. */
5766	vattr_init_mask(vap);
5767
5768	/* ZFS wants a null-terminated name. */
5769	nm = PNBUF_GET();
5770	(void)strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1);
5771
5772	error = zfs_symlink(dvp, vpp, nm, vap, target, cnp->cn_cred, 0);
5773
5774	PNBUF_PUT(nm);
5775	KASSERT((error == 0) == (*vpp != NULL));
5776	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5777	if (*vpp != NULL)
5778		VOP_UNLOCK(*vpp, 0);
5779
5780	return (error);
5781}
5782
5783static int
5784zfs_netbsd_readlink(void *v)
5785{
5786	struct vop_readlink_args *ap = v;
5787
5788	return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
5789}
5790
5791static int
5792zfs_netbsd_link(void *v)
5793{
5794	struct vop_link_v2_args /* {
5795		struct vnode *a_dvp;
5796		struct vnode *a_vp;
5797		struct componentname *a_cnp;
5798	} */ *ap = v;
5799	struct vnode *dvp = ap->a_dvp;
5800	struct vnode *vp = ap->a_vp;
5801	struct componentname *cnp = ap->a_cnp;
5802	char *nm;
5803	int error;
5804
5805	KASSERT(VOP_ISLOCKED(dvp) == LK_EXCLUSIVE);
5806
5807	/* ZFS wants a null-terminated name. */
5808	nm = PNBUF_GET();
5809	(void)strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1);
5810
5811	if ((error = vn_lock(vp, LK_EXCLUSIVE)) != 0) {
5812		/* XXX: No ABORTOP? */
5813		PNBUF_PUT(nm);
5814		return error;
5815	}
5816	error = kauth_authorize_vnode(cnp->cn_cred, KAUTH_VNODE_ADD_LINK, vp,
5817	    dvp, 0);
5818	if (error)
5819		goto out;
5820	error = zfs_link(dvp, vp, nm, cnp->cn_cred,
5821	    NULL, 0);
5822
5823out:
5824	PNBUF_PUT(nm);
5825	VOP_UNLOCK(vp, 0);
5826	return error;
5827}
5828
5829static int
5830zfs_netbsd_inactive(void *v)
5831{
5832	struct vop_inactive_v2_args *ap = v;
5833	vnode_t *vp = ap->a_vp;
5834	znode_t	*zp = VTOZ(vp);
5835
5836	/*
5837	 * NetBSD: nothing to do here, other than indicate if the
5838	 * vnode should be reclaimed.  No need to lock, if we race
5839	 * vrele() will call us again.
5840	 */
5841	*ap->a_recycle = (zp->z_unlinked != 0);
5842
5843	return (0);
5844}
5845
5846static int
5847zfs_netbsd_reclaim(void *v)
5848{
5849	struct vop_reclaim_v2_args /* {
5850		struct vnode *a_vp;
5851	} */ *ap = v;
5852	struct vnode *vp = ap->a_vp;
5853	znode_t	*zp;
5854	zfsvfs_t *zfsvfs;
5855	int error;
5856
5857	VOP_UNLOCK(vp, 0);
5858	zp = VTOZ(vp);
5859	zfsvfs = zp->z_zfsvfs;
5860
5861	KASSERTMSG(!vn_has_cached_data(vp), "vp %p", vp);
5862
5863	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
5864
5865	/*
5866	 * Process a deferred atime update.
5867	 */
5868	if (zp->z_atime_dirty && zp->z_unlinked == 0 && zp->z_sa_hdl != NULL) {
5869		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
5870
5871		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
5872		zfs_sa_upgrade_txholds(tx, zp);
5873		error = dmu_tx_assign(tx, TXG_WAIT);
5874		if (error) {
5875			dmu_tx_abort(tx);
5876		} else {
5877			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
5878			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
5879			zp->z_atime_dirty = 0;
5880			dmu_tx_commit(tx);
5881		}
5882	}
5883
5884	/*
5885	 * Operation zfs_znode.c::zfs_zget_cleaner() depends on this
5886	 * zil_commit() as a barrier to guarantee the znode cannot
5887	 * get freed before its log entries are resolved.
5888	 */
5889	if (zfsvfs->z_log)
5890		zil_commit(zfsvfs->z_log, zp->z_id);
5891
5892	if (zp->z_sa_hdl == NULL)
5893		zfs_znode_free(zp);
5894	else
5895		zfs_zinactive(zp);
5896	rw_exit(&zfsvfs->z_teardown_inactive_lock);
5897	return 0;
5898}
5899
5900static int
5901zfs_netbsd_fid(void *v)
5902{
5903	struct vop_fid_args *ap = v;
5904
5905	return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
5906}
5907
5908static int
5909zfs_netbsd_pathconf(void *v)
5910{
5911	struct vop_pathconf_args *ap = v;
5912	ulong_t val;
5913	int error;
5914
5915	error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->l_cred, NULL);
5916	if (error == 0)
5917		*ap->a_retval = val;
5918	else if (error == EOPNOTSUPP) {
5919		switch (ap->a_name) {
5920		case _PC_NAME_MAX:
5921			*ap->a_retval = NAME_MAX;
5922			return (0);
5923		case _PC_PATH_MAX:
5924			*ap->a_retval = PATH_MAX;
5925			return (0);
5926		case _PC_LINK_MAX:
5927			*ap->a_retval = LINK_MAX;
5928			return (0);
5929		case _PC_MAX_CANON:
5930			*ap->a_retval = MAX_CANON;
5931			return (0);
5932		case _PC_MAX_INPUT:
5933			*ap->a_retval = MAX_INPUT;
5934			return (0);
5935		case _PC_PIPE_BUF:
5936			*ap->a_retval = PIPE_BUF;
5937			return (0);
5938		case _PC_CHOWN_RESTRICTED:
5939			*ap->a_retval = 1;
5940			return (0);
5941		case _PC_NO_TRUNC:
5942			*ap->a_retval = 1;
5943			return (0);
5944		case _PC_VDISABLE:
5945			*ap->a_retval = _POSIX_VDISABLE;
5946			return (0);
5947		default:
5948			return (EINVAL);
5949		}
5950		/* NOTREACHED */
5951	}
5952	return (error);
5953}
5954
5955static int
5956zfs_netbsd_advlock(void *v)
5957{
5958	struct vop_advlock_args /* {
5959		struct vnode *a_vp;
5960		void *a_id;
5961		int a_op;
5962		struct flock *a_fl;
5963		int a_flags;
5964	} */ *ap = v;
5965	struct vnode *vp;
5966	struct znode *zp;
5967	struct zfsvfs *zfsvfs;
5968	int error;
5969
5970	vp = ap->a_vp;
5971	zp = VTOZ(vp);
5972	zfsvfs = zp->z_zfsvfs;
5973
5974	ZFS_ENTER(zfsvfs);
5975	ZFS_VERIFY_ZP(zp);
5976	error = lf_advlock(ap, &zp->z_lockf, zp->z_size);
5977	ZFS_EXIT(zfsvfs);
5978
5979	return error;
5980}
5981
5982static int
5983zfs_netbsd_getpages(void *v)
5984{
5985	struct vop_getpages_args /* {
5986		struct vnode *a_vp;
5987		voff_t a_offset;
5988		struct vm_page **a_m;
5989		int *a_count;
5990		int a_centeridx;
5991		vm_prot_t a_access_type;
5992		int a_advice;
5993		int a_flags;
5994	} */ * const ap = v;
5995
5996	vnode_t *const vp = ap->a_vp;
5997	const int flags = ap->a_flags;
5998	const bool async = (flags & PGO_SYNCIO) == 0;
5999	const bool memwrite = (ap->a_access_type & VM_PROT_WRITE) != 0;
6000
6001	struct uvm_object * const uobj = &vp->v_uobj;
6002	krwlock_t * const rw = uobj->vmobjlock;
6003	znode_t *zp = VTOZ(vp);
6004	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
6005	vfs_t *mp;
6006	struct vm_page *pg;
6007	caddr_t va;
6008	int npages = *ap->a_count, found, err = 0;
6009
6010	if (flags & PGO_LOCKED) {
6011 		uvn_findpages(uobj, ap->a_offset, &npages, ap->a_m, NULL,
6012		    UFP_NOWAIT | UFP_NOALLOC | UFP_NOBUSY |
6013		    (memwrite ? UFP_NORDONLY : 0));
6014		KASSERT(npages == *ap->a_count);
6015		if (memwrite) {
6016			KASSERT(rw_write_held(uobj->vmobjlock));
6017			for (int i = 0; i < npages; i++) {
6018				pg = ap->a_m[i];
6019				if (pg == NULL || pg == PGO_DONTCARE) {
6020					continue;
6021				}
6022				if (uvm_pagegetdirty(pg) ==
6023				    UVM_PAGE_STATUS_CLEAN) {
6024					uvm_pagemarkdirty(pg,
6025					    UVM_PAGE_STATUS_UNKNOWN);
6026				}
6027			}
6028		}
6029		return ap->a_m[ap->a_centeridx] == NULL ? EBUSY : 0;
6030	}
6031	rw_exit(rw);
6032
6033	if (async) {
6034		return 0;
6035	}
6036
6037	mp = vp->v_mount;
6038	fstrans_start(mp);
6039	if (vp->v_mount != mp) {
6040		fstrans_done(mp);
6041		return ENOENT;
6042	}
6043	ZFS_ENTER(zfsvfs);
6044	ZFS_VERIFY_ZP(zp);
6045
6046	rw_enter(rw, RW_WRITER);
6047	if (ap->a_offset + (npages << PAGE_SHIFT) > round_page(vp->v_size)) {
6048		rw_exit(rw);
6049		ZFS_EXIT(zfsvfs);
6050		fstrans_done(mp);
6051		return EINVAL;
6052	}
6053	uvn_findpages(uobj, ap->a_offset, &npages, ap->a_m, NULL, UFP_ALL);
6054	KASSERT(npages == *ap->a_count);
6055
6056	for (int i = 0; i < npages; i++) {
6057		pg = ap->a_m[i];
6058		if (pg->flags & PG_FAKE) {
6059			voff_t offset = pg->offset;
6060			KASSERT(pg->offset == ap->a_offset + (i << PAGE_SHIFT));
6061			rw_exit(rw);
6062
6063			va = zfs_map_page(pg, S_WRITE);
6064			err = dmu_read(zfsvfs->z_os, zp->z_id, offset,
6065			    PAGE_SIZE, va, DMU_READ_PREFETCH);
6066			zfs_unmap_page(pg, va);
6067
6068			if (err != 0) {
6069				uvm_aio_aiodone_pages(ap->a_m, npages, false, err);
6070				memset(ap->a_m, 0, sizeof(ap->a_m[0]) *
6071				    npages);
6072				break;
6073			}
6074			rw_enter(rw, RW_WRITER);
6075			pg->flags &= ~(PG_FAKE);
6076		}
6077
6078		if (memwrite && uvm_pagegetdirty(pg) == UVM_PAGE_STATUS_CLEAN) {
6079			/* For write faults, start dirtiness tracking. */
6080			uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_UNKNOWN);
6081		}
6082	}
6083	rw_exit(rw);
6084
6085	ZFS_EXIT(zfsvfs);
6086	fstrans_done(mp);
6087
6088	return (err);
6089}
6090
6091static int
6092zfs_putapage(vnode_t *vp, page_t **pp, int count, int flags)
6093{
6094	znode_t		*zp = VTOZ(vp);
6095	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
6096	dmu_tx_t	*tx;
6097	voff_t		off, koff;
6098	voff_t		len, klen;
6099	int		err;
6100
6101	bool *cleanedp;
6102	struct uvm_object *uobj = &vp->v_uobj;
6103	krwlock_t *rw = uobj->vmobjlock;
6104
6105	if (zp->z_sa_hdl == NULL) {
6106		err = 0;
6107		goto out;
6108	}
6109
6110	/*
6111	 * Calculate the length and assert that no whole pages are past EOF.
6112	 * This check is equivalent to "off + len <= round_page(zp->z_size)",
6113	 * with gyrations to avoid signed integer overflow.
6114	 */
6115
6116	off = pp[0]->offset;
6117	len = count * PAGESIZE;
6118	KASSERT(off <= zp->z_size);
6119	KASSERT(len <= round_page(zp->z_size));
6120	KASSERT(off <= round_page(zp->z_size) - len);
6121
6122	/*
6123	 * If EOF is within the last page, reduce len to avoid writing past
6124	 * the file size in the ZFS buffer.  Assert that
6125	 * "off + len <= zp->z_size", again avoiding signed integer overflow.
6126	 */
6127
6128	if (len > zp->z_size - off) {
6129		len = zp->z_size - off;
6130	}
6131	KASSERT(len <= zp->z_size);
6132	KASSERT(off <= zp->z_size - len);
6133
6134	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
6135	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
6136		err = SET_ERROR(EDQUOT);
6137		goto out;
6138	}
6139	tx = dmu_tx_create(zfsvfs->z_os);
6140	dmu_tx_hold_write(tx, zp->z_id, off, len);
6141
6142	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
6143	zfs_sa_upgrade_txholds(tx, zp);
6144	err = dmu_tx_assign(tx, TXG_WAIT);
6145	if (err != 0) {
6146		dmu_tx_abort(tx);
6147		goto out;
6148	}
6149
6150	if (zp->z_blksz <= PAGESIZE) {
6151		KASSERTMSG(count == 1, "vp %p pp %p count %d", vp, pp, count);
6152		caddr_t va = zfs_map_page(*pp, S_READ);
6153		ASSERT3U(len, <=, PAGESIZE);
6154		dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
6155		zfs_unmap_page(*pp, va);
6156	} else {
6157		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
6158	}
6159	cleanedp = tsd_get(zfs_putpage_key);
6160	*cleanedp = true;
6161
6162	if (err == 0) {
6163		uint64_t mtime[2], ctime[2];
6164		sa_bulk_attr_t bulk[3];
6165		int count = 0;
6166
6167		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
6168		    &mtime, 16);
6169		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
6170		    &ctime, 16);
6171		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
6172		    &zp->z_pflags, 8);
6173		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
6174		    B_TRUE);
6175		err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
6176		ASSERT0(err);
6177		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
6178	}
6179	dmu_tx_commit(tx);
6180
6181out:
6182	uvm_aio_aiodone_pages(pp, count, true, err);
6183	return (err);
6184}
6185
6186static void
6187zfs_netbsd_gop_markupdate(vnode_t *vp, int flags)
6188{
6189	znode_t		*zp = VTOZ(vp);
6190	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
6191	dmu_tx_t	*tx;
6192	sa_bulk_attr_t	bulk[2];
6193	uint64_t	mtime[2], ctime[2];
6194	int		count = 0, err;
6195
6196	KASSERT(flags == GOP_UPDATE_MODIFIED);
6197
6198	tx = dmu_tx_create(zfsvfs->z_os);
6199	err = dmu_tx_assign(tx, TXG_WAIT);
6200	if (err != 0) {
6201		dmu_tx_abort(tx);
6202		return;
6203	}
6204	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
6205	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
6206	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
6207	dmu_tx_commit(tx);
6208}
6209
6210static int
6211zfs_netbsd_putpages(void *v)
6212{
6213	struct vop_putpages_args /* {
6214		struct vnode *a_vp;
6215		voff_t a_offlo;
6216		voff_t a_offhi;
6217		int a_flags;
6218	} */ * const ap = v;
6219
6220	struct vnode *vp = ap->a_vp;
6221	voff_t offlo = ap->a_offlo;
6222	voff_t offhi = ap->a_offhi;
6223	int flags = ap->a_flags;
6224
6225	znode_t *zp = VTOZ(vp);
6226	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
6227	rl_t *rl = NULL;
6228	uint64_t len;
6229	int error;
6230	bool cleaned = false;
6231
6232	bool async = (flags & PGO_SYNCIO) == 0;
6233	bool cleaning = (flags & PGO_CLEANIT) != 0;
6234
6235	if (cleaning) {
6236		ASSERT((offlo & PAGE_MASK) == 0 && (offhi & PAGE_MASK) == 0);
6237		ASSERT(offlo < offhi || offhi == 0);
6238		if (offhi == 0)
6239			len = UINT64_MAX;
6240		else
6241			len = offhi - offlo;
6242		rw_exit(vp->v_uobj.vmobjlock);
6243		if (curlwp == uvm.pagedaemon_lwp) {
6244			error = fstrans_start_nowait(vp->v_mount);
6245			if (error)
6246				return error;
6247		} else {
6248			vfs_t *mp = vp->v_mount;
6249			fstrans_start(mp);
6250			if (vp->v_mount != mp) {
6251				fstrans_done(mp);
6252				ASSERT(!vn_has_cached_data(vp));
6253				return 0;
6254			}
6255		}
6256		/*
6257		 * Cannot use ZFS_ENTER() here as it returns with error
6258		 * if z_unmounted.  The next statement is equivalent.
6259		 */
6260		rrm_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
6261
6262		rl = zfs_range_lock(zp, offlo, len, RL_WRITER);
6263		rw_enter(vp->v_uobj.vmobjlock, RW_WRITER);
6264		tsd_set(zfs_putpage_key, &cleaned);
6265	}
6266	error = genfs_putpages(v);
6267	if (cleaning) {
6268		tsd_set(zfs_putpage_key, NULL);
6269		zfs_range_unlock(rl);
6270
6271		/*
6272		 * Only zil_commit() if we cleaned something.  This avoids
6273		 * deadlock if we're called from zfs_netbsd_setsize().
6274		 */
6275
6276		if (cleaned)
6277		if (!async || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
6278			zil_commit(zfsvfs->z_log, zp->z_id);
6279		ZFS_EXIT(zfsvfs);
6280		fstrans_done(vp->v_mount);
6281	}
6282	return error;
6283}
6284
6285/*
6286 * Restrict the putpages range to the ZFS block containing the offset.
6287 */
6288static void
6289zfs_netbsd_gop_putrange(struct vnode *vp, off_t off, off_t *lop, off_t *hip)
6290{
6291	znode_t *zp = VTOZ(vp);
6292
6293	*lop = trunc_page(rounddown2(off, zp->z_blksz));
6294	*hip = round_page(*lop + zp->z_blksz);
6295}
6296
6297void
6298zfs_netbsd_setsize(vnode_t *vp, off_t size)
6299{
6300	struct uvm_object *uobj = &vp->v_uobj;
6301	krwlock_t *rw = uobj->vmobjlock;
6302	page_t *pg;
6303	int count, pgoff;
6304	caddr_t va;
6305	off_t tsize;
6306
6307	uvm_vnp_setsize(vp, size);
6308	if (!vn_has_cached_data(vp))
6309		return;
6310
6311	tsize = trunc_page(size);
6312	if (tsize == size)
6313		return;
6314
6315	/*
6316	 * If there's a partial page, we need to zero the tail.
6317	 */
6318
6319	rw_enter(rw, RW_WRITER);
6320	count = 1;
6321	pg = NULL;
6322	if (uvn_findpages(uobj, tsize, &count, &pg, NULL, UFP_NOALLOC)) {
6323		va = zfs_map_page(pg, S_WRITE);
6324		pgoff = size - tsize;
6325		memset(va + pgoff, 0, PAGESIZE - pgoff);
6326		zfs_unmap_page(pg, va);
6327		uvm_page_unbusy(&pg, 1);
6328	}
6329
6330	rw_exit(rw);
6331}
6332
6333static int
6334zfs_netbsd_print(void *v)
6335{
6336	struct vop_print_args /* {
6337		struct vnode	*a_vp;
6338	} */ *ap = v;
6339	vnode_t	*vp;
6340	znode_t	*zp;
6341
6342	vp = ap->a_vp;
6343	zp = VTOZ(vp);
6344
6345	printf("\tino %" PRIu64 " size %" PRIu64 "\n",
6346	       zp->z_id, zp->z_size);
6347	return 0;
6348}
6349
6350const struct genfs_ops zfs_genfsops = {
6351        .gop_write = zfs_putapage,
6352	.gop_markupdate = zfs_netbsd_gop_markupdate,
6353	.gop_putrange = zfs_netbsd_gop_putrange,
6354};
6355
6356int (**zfs_vnodeop_p)(void *);
6357const struct vnodeopv_entry_desc zfs_vnodeop_entries[] = {
6358	{ &vop_default_desc,		vn_default_error },
6359	{ &vop_parsepath_desc,		genfs_parsepath },
6360	{ &vop_lookup_desc,		zfs_netbsd_lookup },
6361	{ &vop_create_desc,		zfs_netbsd_create },
6362	{ &vop_mknod_desc,		zfs_netbsd_mknod },
6363	{ &vop_open_desc,		zfs_netbsd_open },
6364	{ &vop_close_desc,		zfs_netbsd_close },
6365	{ &vop_access_desc,		zfs_netbsd_access },
6366	{ &vop_accessx_desc,		genfs_accessx },
6367	{ &vop_getattr_desc,		zfs_netbsd_getattr },
6368	{ &vop_setattr_desc,		zfs_netbsd_setattr },
6369	{ &vop_read_desc,		zfs_netbsd_read },
6370	{ &vop_write_desc,		zfs_netbsd_write },
6371	{ &vop_ioctl_desc,		zfs_netbsd_ioctl },
6372	{ &vop_poll_desc,		genfs_poll },
6373	{ &vop_kqfilter_desc,		genfs_kqfilter },
6374	{ &vop_revoke_desc,		genfs_revoke },
6375	{ &vop_fsync_desc,		zfs_netbsd_fsync },
6376	{ &vop_remove_desc,		zfs_netbsd_remove },
6377	{ &vop_link_desc,		zfs_netbsd_link },
6378	{ &vop_lock_desc,		genfs_lock },
6379	{ &vop_unlock_desc,		genfs_unlock },
6380	{ &vop_rename_desc,		zfs_netbsd_rename },
6381	{ &vop_mkdir_desc,		zfs_netbsd_mkdir },
6382	{ &vop_rmdir_desc,		zfs_netbsd_rmdir },
6383	{ &vop_symlink_desc,		zfs_netbsd_symlink },
6384	{ &vop_readdir_desc,		zfs_netbsd_readdir },
6385	{ &vop_readlink_desc,		zfs_netbsd_readlink },
6386	{ &vop_inactive_desc,		zfs_netbsd_inactive },
6387	{ &vop_reclaim_desc,		zfs_netbsd_reclaim },
6388	{ &vop_pathconf_desc,		zfs_netbsd_pathconf },
6389	{ &vop_seek_desc,		genfs_seek },
6390	{ &vop_getpages_desc,		zfs_netbsd_getpages },
6391	{ &vop_putpages_desc,		zfs_netbsd_putpages },
6392	{ &vop_mmap_desc,		genfs_mmap },
6393	{ &vop_islocked_desc,		genfs_islocked },
6394	{ &vop_advlock_desc,		zfs_netbsd_advlock },
6395	{ &vop_print_desc,		zfs_netbsd_print },
6396	{ &vop_fcntl_desc,		genfs_fcntl },
6397	{ NULL, NULL }
6398};
6399
6400const struct vnodeopv_desc zfs_vnodeop_opv_desc =
6401	{ &zfs_vnodeop_p, zfs_vnodeop_entries };
6402
6403int (**zfs_specop_p)(void *);
6404const struct vnodeopv_entry_desc zfs_specop_entries[] = {
6405	{ &vop_default_desc,		vn_default_error },
6406	GENFS_SPECOP_ENTRIES,
6407	{ &vop_close_desc,		spec_close },
6408	{ &vop_access_desc,		zfs_netbsd_access },
6409	{ &vop_accessx_desc,		genfs_accessx },
6410	{ &vop_getattr_desc,		zfs_netbsd_getattr },
6411	{ &vop_setattr_desc,		zfs_netbsd_setattr },
6412	{ &vop_read_desc,		/**/zfs_netbsd_read },
6413	{ &vop_write_desc,		/**/zfs_netbsd_write },
6414	{ &vop_fsync_desc,		zfs_spec_fsync },
6415	{ &vop_lock_desc,		genfs_lock },
6416	{ &vop_unlock_desc,		genfs_unlock },
6417	{ &vop_inactive_desc,		zfs_netbsd_inactive },
6418	{ &vop_reclaim_desc,		zfs_netbsd_reclaim },
6419	{ &vop_islocked_desc,		genfs_islocked },
6420	{ &vop_bwrite_desc,		vn_bwrite },
6421	{ &vop_print_desc,		zfs_netbsd_print },
6422	{ &vop_fcntl_desc,		genfs_fcntl },
6423	{ NULL, NULL }
6424};
6425
6426const struct vnodeopv_desc zfs_specop_opv_desc =
6427	{ &zfs_specop_p, zfs_specop_entries };
6428
6429int (**zfs_fifoop_p)(void *);
6430const struct vnodeopv_entry_desc zfs_fifoop_entries[] = {
6431	{ &vop_default_desc,		vn_default_error },
6432	GENFS_FIFOOP_ENTRIES,
6433	{ &vop_close_desc,		vn_fifo_bypass },
6434	{ &vop_access_desc,		zfs_netbsd_access },
6435	{ &vop_accessx_desc,		genfs_accessx },
6436	{ &vop_getattr_desc,		zfs_netbsd_getattr },
6437	{ &vop_setattr_desc,		zfs_netbsd_setattr },
6438	{ &vop_read_desc,		/**/zfs_netbsd_read },
6439	{ &vop_write_desc,		/**/zfs_netbsd_write },
6440	{ &vop_fsync_desc,		zfs_netbsd_fsync },
6441	{ &vop_lock_desc,		genfs_lock },
6442	{ &vop_unlock_desc,		genfs_unlock },
6443	{ &vop_inactive_desc,		zfs_netbsd_inactive },
6444	{ &vop_reclaim_desc,		zfs_netbsd_reclaim },
6445	{ &vop_islocked_desc,		genfs_islocked },
6446	{ &vop_bwrite_desc,		vn_bwrite },
6447	{ &vop_strategy_desc,		vn_fifo_bypass },
6448	{ &vop_print_desc,		zfs_netbsd_print },
6449	{ &vop_fcntl_desc,		genfs_fcntl },
6450	{ NULL, NULL }
6451};
6452
6453const struct vnodeopv_desc zfs_fifoop_opv_desc =
6454	{ &zfs_fifoop_p, zfs_fifoop_entries };
6455
6456#endif /* __NetBSD__ */
6457