1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
25 * Copyright (c) 2014 Integros [integros.com]
26 * Copyright 2017 Nexenta Systems, Inc.
27 */
28
29/* Portions Copyright 2007 Jeremy Teo */
30/* Portions Copyright 2010 Robert Milkowski */
31
32#include <sys/types.h>
33#include <sys/param.h>
34#include <sys/time.h>
35#include <sys/systm.h>
36#include <sys/sysmacros.h>
37#include <sys/resource.h>
38#include <sys/vfs.h>
39#include <sys/vm.h>
40#include <sys/vnode.h>
41#include <sys/file.h>
42#include <sys/stat.h>
43#include <sys/kmem.h>
44#include <sys/taskq.h>
45#include <sys/uio.h>
46#include <sys/atomic.h>
47#include <sys/namei.h>
48#include <sys/mman.h>
49#include <sys/cmn_err.h>
50#include <sys/errno.h>
51#include <sys/unistd.h>
52#include <sys/zfs_dir.h>
53#include <sys/zfs_ioctl.h>
54#include <sys/fs/zfs.h>
55#include <sys/dmu.h>
56#include <sys/dmu_objset.h>
57#include <sys/spa.h>
58#include <sys/txg.h>
59#include <sys/dbuf.h>
60#include <sys/zap.h>
61#include <sys/sa.h>
62#include <sys/dirent.h>
63#include <sys/policy.h>
64#include <sys/sunddi.h>
65#include <sys/filio.h>
66#include <sys/sid.h>
67#include <sys/zfs_ctldir.h>
68#include <sys/zfs_fuid.h>
69#include <sys/zfs_sa.h>
70#include <sys/zfs_rlock.h>
71#include <sys/extdirent.h>
72#include <sys/kidmap.h>
73#include <sys/bio.h>
74#include <sys/buf.h>
75#include <sys/sched.h>
76#include <sys/acl.h>
77#include <sys/vmmeter.h>
78#include <vm/vm_param.h>
79#include <sys/zil.h>
80
81/*
82 * Programming rules.
83 *
84 * Each vnode op performs some logical unit of work.  To do this, the ZPL must
85 * properly lock its in-core state, create a DMU transaction, do the work,
86 * record this work in the intent log (ZIL), commit the DMU transaction,
87 * and wait for the intent log to commit if it is a synchronous operation.
88 * Moreover, the vnode ops must work in both normal and log replay context.
89 * The ordering of events is important to avoid deadlocks and references
90 * to freed memory.  The example below illustrates the following Big Rules:
91 *
92 *  (1)	A check must be made in each zfs thread for a mounted file system.
93 *	This is done avoiding races using ZFS_ENTER(zfsvfs).
94 *	A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
95 *	must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
96 *	can return EIO from the calling function.
97 *
98 *  (2)	VN_RELE() should always be the last thing except for zil_commit()
99 *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
100 *	First, if it's the last reference, the vnode/znode
101 *	can be freed, so the zp may point to freed memory.  Second, the last
102 *	reference will call zfs_zinactive(), which may induce a lot of work --
103 *	pushing cached pages (which acquires range locks) and syncing out
104 *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
105 *	which could deadlock the system if you were already holding one.
106 *	If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
107 *
108 *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
109 *	as they can span dmu_tx_assign() calls.
110 *
111 *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
112 *      dmu_tx_assign().  This is critical because we don't want to block
113 *      while holding locks.
114 *
115 *	If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT.  This
116 *	reduces lock contention and CPU usage when we must wait (note that if
117 *	throughput is constrained by the storage, nearly every transaction
118 *	must wait).
119 *
120 *      Note, in particular, that if a lock is sometimes acquired before
121 *      the tx assigns, and sometimes after (e.g. z_lock), then failing
122 *      to use a non-blocking assign can deadlock the system.  The scenario:
123 *
124 *	Thread A has grabbed a lock before calling dmu_tx_assign().
125 *	Thread B is in an already-assigned tx, and blocks for this lock.
126 *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
127 *	forever, because the previous txg can't quiesce until B's tx commits.
128 *
129 *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
130 *	then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
131 *	calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
132 *	to indicate that this operation has already called dmu_tx_wait().
133 *	This will ensure that we don't retry forever, waiting a short bit
134 *	each time.
135 *
136 *  (5)	If the operation succeeded, generate the intent log entry for it
137 *	before dropping locks.  This ensures that the ordering of events
138 *	in the intent log matches the order in which they actually occurred.
139 *	During ZIL replay the zfs_log_* functions will update the sequence
140 *	number to indicate the zil transaction has replayed.
141 *
142 *  (6)	At the end of each vnode op, the DMU tx must always commit,
143 *	regardless of whether there were any errors.
144 *
145 *  (7)	After dropping all locks, invoke zil_commit(zilog, foid)
146 *	to ensure that synchronous semantics are provided when necessary.
147 *
148 * In general, this is how things should be ordered in each vnode op:
149 *
150 *	ZFS_ENTER(zfsvfs);		// exit if unmounted
151 * top:
152 *	zfs_dirent_lookup(&dl, ...)	// lock directory entry (may VN_HOLD())
153 *	rw_enter(...);			// grab any other locks you need
154 *	tx = dmu_tx_create(...);	// get DMU tx
155 *	dmu_tx_hold_*();		// hold each object you might modify
156 *	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
157 *	if (error) {
158 *		rw_exit(...);		// drop locks
159 *		zfs_dirent_unlock(dl);	// unlock directory entry
160 *		VN_RELE(...);		// release held vnodes
161 *		if (error == ERESTART) {
162 *			waited = B_TRUE;
163 *			dmu_tx_wait(tx);
164 *			dmu_tx_abort(tx);
165 *			goto top;
166 *		}
167 *		dmu_tx_abort(tx);	// abort DMU tx
168 *		ZFS_EXIT(zfsvfs);	// finished in zfs
169 *		return (error);		// really out of space
170 *	}
171 *	error = do_real_work();		// do whatever this VOP does
172 *	if (error == 0)
173 *		zfs_log_*(...);		// on success, make ZIL entry
174 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
175 *	rw_exit(...);			// drop locks
176 *	zfs_dirent_unlock(dl);		// unlock directory entry
177 *	VN_RELE(...);			// release held vnodes
178 *	zil_commit(zilog, foid);	// synchronous when necessary
179 *	ZFS_EXIT(zfsvfs);		// finished in zfs
180 *	return (error);			// done, report error
181 */
182
/* ARGSUSED */
/*
 * Open a file.  Enforces the append-only file flag, runs an optional
 * anti-virus scan on regular files, and counts synchronous opens so
 * later writes know whether FSYNC/FDSYNC semantics are in effect.
 */
static int
zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(*vpp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/*
	 * An append-only file may only be opened for writing if the
	 * caller also requested append mode.
	 */
	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
	    ((flag & FAPPEND) == 0)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	/*
	 * If virus scanning is enabled, scan non-empty regular files
	 * (outside .zfs) on open; deny access if the scan fails.
	 */
	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
		if (fs_vscan(*vpp, cr, 0) != 0) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EACCES));
		}
	}

	/* Keep a count of the synchronous opens in the znode */
	if (flag & (FSYNC | FDSYNC))
		atomic_inc_32(&zp->z_sync_cnt);

	ZFS_EXIT(zfsvfs);
	return (0);
}
215
/* ARGSUSED */
/*
 * Close a file.  Releases file/share locks held by the process,
 * balances the synchronous-open count taken in zfs_open(), and runs
 * an optional anti-virus scan on close.
 */
static int
zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
    caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	/*
	 * Clean up any locks held by this process on the vp.
	 */
	cleanlocks(vp, ddi_get_pid(), 0);
	cleanshares(vp, ddi_get_pid());

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Decrement the synchronous opens in the znode */
	/* count == 1 means this is the last close of this file handle. */
	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
		atomic_dec_32(&zp->z_sync_cnt);

	/*
	 * Mirror the open-time scan: re-scan non-empty, non-quarantined
	 * regular files (outside .zfs) when virus scanning is enabled.
	 */
	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
		VERIFY(fs_vscan(vp, cr, 1) == 0);

	ZFS_EXIT(zfsvfs);
	return (0);
}
245
246/*
247 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
248 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
249 */
250static int
251zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
252{
253	znode_t	*zp = VTOZ(vp);
254	uint64_t noff = (uint64_t)*off; /* new offset */
255	uint64_t file_sz;
256	int error;
257	boolean_t hole;
258
259	file_sz = zp->z_size;
260	if (noff >= file_sz)  {
261		return (SET_ERROR(ENXIO));
262	}
263
264	if (cmd == _FIO_SEEK_HOLE)
265		hole = B_TRUE;
266	else
267		hole = B_FALSE;
268
269	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
270
271	if (error == ESRCH)
272		return (SET_ERROR(ENXIO));
273
274	/*
275	 * We could find a hole that begins after the logical end-of-file,
276	 * because dmu_offset_next() only works on whole blocks.  If the
277	 * EOF falls mid-block, then indicate that the "virtual hole"
278	 * at the end of the file begins at the logical EOF, rather than
279	 * at the end of the last block.
280	 */
281	if (noff > file_sz) {
282		ASSERT(hole);
283		noff = file_sz;
284	}
285
286	if (noff < *off)
287		return (error);
288	*off = noff;
289	return (error);
290}
291
/* ARGSUSED */
/*
 * Handle the subset of ioctls the ZPL implements: filesystem flush
 * (no-op), the bfu direct-I/O stubs, hole/data seeking, and (illumos
 * only) the filled-block count query.  Unknown commands get ENOTTY.
 */
static int
zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
    int *rvalp, caller_context_t *ct)
{
	offset_t off;
	offset_t ndata;
	dmu_object_info_t doi;
	int error;
	zfsvfs_t *zfsvfs;
	znode_t *zp;

	switch (com) {
	case _FIOFFS:
	{
		/* "Flush file system" is a no-op for ZFS. */
		return (0);
	}

	/*
	 * The following two ioctls are used by bfu.  Faking out,
	 * necessary to avoid bfu errors.
	 */
	case _FIOGDIO:
	case _FIOSDIO:
	{
		return (0);
	}

	case _FIO_SEEK_DATA:
	case _FIO_SEEK_HOLE:
	{
#ifdef illumos
		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
			return (SET_ERROR(EFAULT));
#else
		/* On FreeBSD the argument is already in kernel space. */
		off = *(offset_t *)data;
#endif
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/* offset parameter is in/out */
		error = zfs_holey(vp, com, &off);
		ZFS_EXIT(zfsvfs);
		if (error)
			return (error);
#ifdef illumos
		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
			return (SET_ERROR(EFAULT));
#else
		*(offset_t *)data = off;
#endif
		return (0);
	}
#ifdef illumos
	case _FIO_COUNT_FILLED:
	{
		/*
		 * _FIO_COUNT_FILLED adds a new ioctl command which
		 * exposes the number of filled blocks in a
		 * ZFS object.
		 */
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/*
		 * Wait for all dirty blocks for this object
		 * to get synced out to disk, and the DMU info
		 * updated.
		 */
		error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Retrieve fill count from DMU object.
		 */
		error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		ndata = doi.doi_fill_count;

		ZFS_EXIT(zfsvfs);
		if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
			return (SET_ERROR(EFAULT));
		return (0);
	}
#endif
	}
	return (SET_ERROR(ENOTTY));
}
391
/*
 * Look up the valid page at offset "start" in vp's VM object and
 * shared-busy it in preparation for writing back [off, off + nbytes).
 * Returns NULL if no valid page is resident.  The caller must hold
 * the object write lock; it is dropped and re-acquired while sleeping
 * on an exclusively-busied page.  On success a paging-in-progress
 * reference is added to the object (released in page_unbusy()).
 */
static vm_page_t
page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
{
	vm_object_t obj;
	vm_page_t pp;
	int64_t end;

	/*
	 * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
	 * aligned boundaries, if the range is not aligned.  As a result a
	 * DEV_BSIZE subrange with partially dirty data may get marked as clean.
	 * It may happen that all DEV_BSIZE subranges are marked clean and thus
	 * the whole page would be considered clean despite having some dirty
	 * data.  For this reason we should shrink the range to DEV_BSIZE
	 * aligned boundaries before calling vm_page_clear_dirty.
	 */
	end = rounddown2(off + nbytes, DEV_BSIZE);
	off = roundup2(off, DEV_BSIZE);
	nbytes = end - off;

	obj = vp->v_object;
	zfs_vmobject_assert_wlocked(obj);

	for (;;) {
		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
		    pp->valid) {
			if (vm_page_xbusied(pp)) {
				/*
				 * Reference the page before unlocking and
				 * sleeping so that the page daemon is less
				 * likely to reclaim it.
				 */
				vm_page_reference(pp);
				vm_page_lock(pp);
				zfs_vmobject_wunlock(obj);
				vm_page_busy_sleep(pp, "zfsmwb", true);
				zfs_vmobject_wlock(obj);
				continue;
			}
			vm_page_sbusy(pp);
		} else if (pp != NULL) {
			/* A page exists but is not (fully) valid: skip it. */
			ASSERT(!pp->valid);
			pp = NULL;
		}

		if (pp != NULL) {
			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
			vm_object_pip_add(obj, 1);
			/* Block writers while the page is being updated. */
			pmap_remove_write(pp);
			if (nbytes != 0)
				vm_page_clear_dirty(pp, off, nbytes);
		}
		break;
	}
	return (pp);
}
448
/*
 * Undo page_busy(): drop the shared busy on the page and release the
 * paging-in-progress reference taken on its object.
 */
static void
page_unbusy(vm_page_t pp)
{

	vm_page_sunbusy(pp);
	vm_object_pip_subtract(pp->object, 1);
}
456
/*
 * Look up the valid page at offset "start" in vp's VM object and wire
 * it with a hold so it cannot be reclaimed while we copy from it.
 * Returns NULL if no valid page is resident.  The caller must hold
 * the object write lock; it is dropped and re-acquired while sleeping
 * on an exclusively-busied page.  Release with page_unhold().
 */
static vm_page_t
page_hold(vnode_t *vp, int64_t start)
{
	vm_object_t obj;
	vm_page_t pp;

	obj = vp->v_object;
	zfs_vmobject_assert_wlocked(obj);

	for (;;) {
		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
		    pp->valid) {
			if (vm_page_xbusied(pp)) {
				/*
				 * Reference the page before unlocking and
				 * sleeping so that the page daemon is less
				 * likely to reclaim it.
				 */
				vm_page_reference(pp);
				vm_page_lock(pp);
				zfs_vmobject_wunlock(obj);
				vm_page_busy_sleep(pp, "zfsmwb", true);
				zfs_vmobject_wlock(obj);
				continue;
			}

			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
			vm_page_lock(pp);
			vm_page_hold(pp);
			vm_page_unlock(pp);

		} else
			pp = NULL;
		break;
	}
	return (pp);
}
494
/*
 * Release the hold taken by page_hold().
 */
static void
page_unhold(vm_page_t pp)
{

	vm_page_lock(pp);
	vm_page_unhold(pp);
	vm_page_unlock(pp);
}
503
504/*
505 * When a file is memory mapped, we must keep the IO data synchronized
506 * between the DMU cache and the memory mapped pages.  What this means:
507 *
508 * On Write:	If we find a memory mapped page, we write to *both*
509 *		the page and the dmu buffer.
510 */
511static void
512update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
513    int segflg, dmu_tx_t *tx)
514{
515	vm_object_t obj;
516	struct sf_buf *sf;
517	caddr_t va;
518	int off;
519
520	ASSERT(segflg != UIO_NOCOPY);
521	ASSERT(vp->v_mount != NULL);
522	obj = vp->v_object;
523	ASSERT(obj != NULL);
524
525	off = start & PAGEOFFSET;
526	zfs_vmobject_wlock(obj);
527	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
528		vm_page_t pp;
529		int nbytes = imin(PAGESIZE - off, len);
530
531		if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
532			zfs_vmobject_wunlock(obj);
533
534			va = zfs_map_page(pp, &sf);
535			(void) dmu_read(os, oid, start+off, nbytes,
536			    va+off, DMU_READ_PREFETCH);;
537			zfs_unmap_page(sf);
538
539			zfs_vmobject_wlock(obj);
540			page_unbusy(pp);
541		}
542		len -= nbytes;
543		off = 0;
544	}
545	vm_object_pip_wakeupn(obj, 0);
546	zfs_vmobject_wunlock(obj);
547}
548
549/*
550 * Read with UIO_NOCOPY flag means that sendfile(2) requests
551 * ZFS to populate a range of page cache pages with data.
552 *
553 * NOTE: this function could be optimized to pre-allocate
554 * all pages in advance, drain exclusive busy on all of them,
555 * map them into contiguous KVA region and populate them
556 * in one single dmu_read() call.
557 */
558static int
559mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
560{
561	znode_t *zp = VTOZ(vp);
562	objset_t *os = zp->z_zfsvfs->z_os;
563	struct sf_buf *sf;
564	vm_object_t obj;
565	vm_page_t pp;
566	int64_t start;
567	caddr_t va;
568	int len = nbytes;
569	int off;
570	int error = 0;
571
572	ASSERT(uio->uio_segflg == UIO_NOCOPY);
573	ASSERT(vp->v_mount != NULL);
574	obj = vp->v_object;
575	ASSERT(obj != NULL);
576	ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);
577
578	zfs_vmobject_wlock(obj);
579	for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
580		int bytes = MIN(PAGESIZE, len);
581
582		pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY |
583		    VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
584		if (pp->valid == 0) {
585			zfs_vmobject_wunlock(obj);
586			va = zfs_map_page(pp, &sf);
587			error = dmu_read(os, zp->z_id, start, bytes, va,
588			    DMU_READ_PREFETCH);
589			if (bytes != PAGESIZE && error == 0)
590				bzero(va + bytes, PAGESIZE - bytes);
591			zfs_unmap_page(sf);
592			zfs_vmobject_wlock(obj);
593			vm_page_sunbusy(pp);
594			vm_page_lock(pp);
595			if (error) {
596				if (pp->wire_count == 0 && pp->valid == 0 &&
597				    !vm_page_busied(pp))
598					vm_page_free(pp);
599			} else {
600				pp->valid = VM_PAGE_BITS_ALL;
601				vm_page_activate(pp);
602			}
603			vm_page_unlock(pp);
604		} else {
605			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
606			vm_page_sunbusy(pp);
607		}
608		if (error)
609			break;
610		uio->uio_resid -= bytes;
611		uio->uio_offset += bytes;
612		len -= bytes;
613	}
614	zfs_vmobject_wunlock(obj);
615	return (error);
616}
617
618/*
619 * When a file is memory mapped, we must keep the IO data synchronized
620 * between the DMU cache and the memory mapped pages.  What this means:
621 *
622 * On Read:	We "read" preferentially from memory mapped pages,
623 *		else we default from the dmu buffer.
624 *
625 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
626 *	 the file is memory mapped.
627 */
628static int
629mappedread(vnode_t *vp, int nbytes, uio_t *uio)
630{
631	znode_t *zp = VTOZ(vp);
632	vm_object_t obj;
633	int64_t start;
634	caddr_t va;
635	int len = nbytes;
636	int off;
637	int error = 0;
638
639	ASSERT(vp->v_mount != NULL);
640	obj = vp->v_object;
641	ASSERT(obj != NULL);
642
643	start = uio->uio_loffset;
644	off = start & PAGEOFFSET;
645	zfs_vmobject_wlock(obj);
646	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
647		vm_page_t pp;
648		uint64_t bytes = MIN(PAGESIZE - off, len);
649
650		if (pp = page_hold(vp, start)) {
651			struct sf_buf *sf;
652			caddr_t va;
653
654			zfs_vmobject_wunlock(obj);
655			va = zfs_map_page(pp, &sf);
656#ifdef illumos
657			error = uiomove(va + off, bytes, UIO_READ, uio);
658#else
659			error = vn_io_fault_uiomove(va + off, bytes, uio);
660#endif
661			zfs_unmap_page(sf);
662			zfs_vmobject_wlock(obj);
663			page_unhold(pp);
664		} else {
665			zfs_vmobject_wunlock(obj);
666			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
667			    uio, bytes);
668			zfs_vmobject_wlock(obj);
669		}
670		len -= bytes;
671		off = 0;
672		if (error)
673			break;
674	}
675	zfs_vmobject_wunlock(obj);
676	return (error);
677}
678
679offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
680
681/*
682 * Read bytes from specified file into supplied buffer.
683 *
684 *	IN:	vp	- vnode of file to be read from.
685 *		uio	- structure supplying read location, range info,
686 *			  and return buffer.
687 *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
688 *		cr	- credentials of caller.
689 *		ct	- caller context
690 *
691 *	OUT:	uio	- updated offset and range, buffer filled.
692 *
693 *	RETURN:	0 on success, error code on failure.
694 *
695 * Side Effects:
696 *	vp - atime updated if byte count > 0
697 */
698/* ARGSUSED */
699static int
700zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
701{
702	znode_t		*zp = VTOZ(vp);
703	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
704	ssize_t		n, nbytes;
705	int		error = 0;
706	rl_t		*rl;
707	xuio_t		*xuio = NULL;
708
709	ZFS_ENTER(zfsvfs);
710	ZFS_VERIFY_ZP(zp);
711
712	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
713		ZFS_EXIT(zfsvfs);
714		return (SET_ERROR(EACCES));
715	}
716
717	/*
718	 * Validate file offset
719	 */
720	if (uio->uio_loffset < (offset_t)0) {
721		ZFS_EXIT(zfsvfs);
722		return (SET_ERROR(EINVAL));
723	}
724
725	/*
726	 * Fasttrack empty reads
727	 */
728	if (uio->uio_resid == 0) {
729		ZFS_EXIT(zfsvfs);
730		return (0);
731	}
732
733	/*
734	 * Check for mandatory locks
735	 */
736	if (MANDMODE(zp->z_mode)) {
737		if (error = chklock(vp, FREAD,
738		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
739			ZFS_EXIT(zfsvfs);
740			return (error);
741		}
742	}
743
744	/*
745	 * If we're in FRSYNC mode, sync out this znode before reading it.
746	 */
747	if (zfsvfs->z_log &&
748	    (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
749		zil_commit(zfsvfs->z_log, zp->z_id);
750
751	/*
752	 * Lock the range against changes.
753	 */
754	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
755
756	/*
757	 * If we are reading past end-of-file we can skip
758	 * to the end; but we might still need to set atime.
759	 */
760	if (uio->uio_loffset >= zp->z_size) {
761		error = 0;
762		goto out;
763	}
764
765	ASSERT(uio->uio_loffset < zp->z_size);
766	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
767
768#ifdef illumos
769	if ((uio->uio_extflg == UIO_XUIO) &&
770	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
771		int nblk;
772		int blksz = zp->z_blksz;
773		uint64_t offset = uio->uio_loffset;
774
775		xuio = (xuio_t *)uio;
776		if ((ISP2(blksz))) {
777			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
778			    blksz)) / blksz;
779		} else {
780			ASSERT(offset + n <= blksz);
781			nblk = 1;
782		}
783		(void) dmu_xuio_init(xuio, nblk);
784
785		if (vn_has_cached_data(vp)) {
786			/*
787			 * For simplicity, we always allocate a full buffer
788			 * even if we only expect to read a portion of a block.
789			 */
790			while (--nblk >= 0) {
791				(void) dmu_xuio_add(xuio,
792				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
793				    blksz), 0, blksz);
794			}
795		}
796	}
797#endif	/* illumos */
798
799	while (n > 0) {
800		nbytes = MIN(n, zfs_read_chunk_size -
801		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
802
803#ifdef __FreeBSD__
804		if (uio->uio_segflg == UIO_NOCOPY)
805			error = mappedread_sf(vp, nbytes, uio);
806		else
807#endif /* __FreeBSD__ */
808		if (vn_has_cached_data(vp)) {
809			error = mappedread(vp, nbytes, uio);
810		} else {
811			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
812			    uio, nbytes);
813		}
814		if (error) {
815			/* convert checksum errors into IO errors */
816			if (error == ECKSUM)
817				error = SET_ERROR(EIO);
818			break;
819		}
820
821		n -= nbytes;
822	}
823out:
824	zfs_range_unlock(rl);
825
826	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
827	ZFS_EXIT(zfsvfs);
828	return (error);
829}
830
831/*
832 * Write the bytes to a file.
833 *
834 *	IN:	vp	- vnode of file to be written to.
835 *		uio	- structure supplying write location, range info,
836 *			  and data buffer.
837 *		ioflag	- FAPPEND, FSYNC, and/or FDSYNC.  FAPPEND is
838 *			  set if in append mode.
839 *		cr	- credentials of caller.
840 *		ct	- caller context (NFS/CIFS fem monitor only)
841 *
842 *	OUT:	uio	- updated offset and range.
843 *
844 *	RETURN:	0 on success, error code on failure.
845 *
846 * Timestamps:
847 *	vp - ctime|mtime updated if byte count > 0
848 */
849
850/* ARGSUSED */
851static int
852zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
853{
854	znode_t		*zp = VTOZ(vp);
855	rlim64_t	limit = MAXOFFSET_T;
856	ssize_t		start_resid = uio->uio_resid;
857	ssize_t		tx_bytes;
858	uint64_t	end_size;
859	dmu_tx_t	*tx;
860	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
861	zilog_t		*zilog;
862	offset_t	woff;
863	ssize_t		n, nbytes;
864	rl_t		*rl;
865	int		max_blksz = zfsvfs->z_max_blksz;
866	int		error = 0;
867	arc_buf_t	*abuf;
868	iovec_t		*aiov = NULL;
869	xuio_t		*xuio = NULL;
870	int		i_iov = 0;
871	int		iovcnt = uio->uio_iovcnt;
872	iovec_t		*iovp = uio->uio_iov;
873	int		write_eof;
874	int		count = 0;
875	sa_bulk_attr_t	bulk[4];
876	uint64_t	mtime[2], ctime[2];
877
878	/*
879	 * Fasttrack empty write
880	 */
881	n = start_resid;
882	if (n == 0)
883		return (0);
884
885	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
886		limit = MAXOFFSET_T;
887
888	ZFS_ENTER(zfsvfs);
889	ZFS_VERIFY_ZP(zp);
890
891	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
892	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
893	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
894	    &zp->z_size, 8);
895	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
896	    &zp->z_pflags, 8);
897
898	/*
899	 * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our
900	 * callers might not be able to detect properly that we are read-only,
901	 * so check it explicitly here.
902	 */
903	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
904		ZFS_EXIT(zfsvfs);
905		return (SET_ERROR(EROFS));
906	}
907
908	/*
909	 * If immutable or not appending then return EPERM.
910	 * Intentionally allow ZFS_READONLY through here.
911	 * See zfs_zaccess_common()
912	 */
913	if ((zp->z_pflags & ZFS_IMMUTABLE) ||
914	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
915	    (uio->uio_loffset < zp->z_size))) {
916		ZFS_EXIT(zfsvfs);
917		return (SET_ERROR(EPERM));
918	}
919
920	zilog = zfsvfs->z_log;
921
922	/*
923	 * Validate file offset
924	 */
925	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
926	if (woff < 0) {
927		ZFS_EXIT(zfsvfs);
928		return (SET_ERROR(EINVAL));
929	}
930
931	/*
932	 * Check for mandatory locks before calling zfs_range_lock()
933	 * in order to prevent a deadlock with locks set via fcntl().
934	 */
935	if (MANDMODE((mode_t)zp->z_mode) &&
936	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
937		ZFS_EXIT(zfsvfs);
938		return (error);
939	}
940
941#ifdef illumos
942	/*
943	 * Pre-fault the pages to ensure slow (eg NFS) pages
944	 * don't hold up txg.
945	 * Skip this if uio contains loaned arc_buf.
946	 */
947	if ((uio->uio_extflg == UIO_XUIO) &&
948	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
949		xuio = (xuio_t *)uio;
950	else
951		uio_prefaultpages(MIN(n, max_blksz), uio);
952#endif
953
954	/*
955	 * If in append mode, set the io offset pointer to eof.
956	 */
957	if (ioflag & FAPPEND) {
958		/*
959		 * Obtain an appending range lock to guarantee file append
960		 * semantics.  We reset the write offset once we have the lock.
961		 */
962		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
963		woff = rl->r_off;
964		if (rl->r_len == UINT64_MAX) {
965			/*
966			 * We overlocked the file because this write will cause
967			 * the file block size to increase.
968			 * Note that zp_size cannot change with this lock held.
969			 */
970			woff = zp->z_size;
971		}
972		uio->uio_loffset = woff;
973	} else {
974		/*
975		 * Note that if the file block size will change as a result of
976		 * this write, then this range lock will lock the entire file
977		 * so that we can re-write the block safely.
978		 */
979		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
980	}
981
982	if (vn_rlimit_fsize(vp, uio, uio->uio_td)) {
983		zfs_range_unlock(rl);
984		ZFS_EXIT(zfsvfs);
985		return (EFBIG);
986	}
987
988	if (woff >= limit) {
989		zfs_range_unlock(rl);
990		ZFS_EXIT(zfsvfs);
991		return (SET_ERROR(EFBIG));
992	}
993
994	if ((woff + n) > limit || woff > (limit - n))
995		n = limit - woff;
996
997	/* Will this write extend the file length? */
998	write_eof = (woff + n > zp->z_size);
999
1000	end_size = MAX(zp->z_size, woff + n);
1001
1002	/*
1003	 * Write the file in reasonable size chunks.  Each chunk is written
1004	 * in a separate transaction; this keeps the intent log records small
1005	 * and allows us to do more fine-grained space accounting.
1006	 */
1007	while (n > 0) {
1008		abuf = NULL;
1009		woff = uio->uio_loffset;
1010		if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
1011		    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
1012			if (abuf != NULL)
1013				dmu_return_arcbuf(abuf);
1014			error = SET_ERROR(EDQUOT);
1015			break;
1016		}
1017
1018		if (xuio && abuf == NULL) {
1019			ASSERT(i_iov < iovcnt);
1020			aiov = &iovp[i_iov];
1021			abuf = dmu_xuio_arcbuf(xuio, i_iov);
1022			dmu_xuio_clear(xuio, i_iov);
1023			DTRACE_PROBE3(zfs_cp_write, int, i_iov,
1024			    iovec_t *, aiov, arc_buf_t *, abuf);
1025			ASSERT((aiov->iov_base == abuf->b_data) ||
1026			    ((char *)aiov->iov_base - (char *)abuf->b_data +
1027			    aiov->iov_len == arc_buf_size(abuf)));
1028			i_iov++;
1029		} else if (abuf == NULL && n >= max_blksz &&
1030		    woff >= zp->z_size &&
1031		    P2PHASE(woff, max_blksz) == 0 &&
1032		    zp->z_blksz == max_blksz) {
1033			/*
1034			 * This write covers a full block.  "Borrow" a buffer
1035			 * from the dmu so that we can fill it before we enter
1036			 * a transaction.  This avoids the possibility of
1037			 * holding up the transaction if the data copy hangs
1038			 * up on a pagefault (e.g., from an NFS server mapping).
1039			 */
1040			size_t cbytes;
1041
1042			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
1043			    max_blksz);
1044			ASSERT(abuf != NULL);
1045			ASSERT(arc_buf_size(abuf) == max_blksz);
1046			if (error = uiocopy(abuf->b_data, max_blksz,
1047			    UIO_WRITE, uio, &cbytes)) {
1048				dmu_return_arcbuf(abuf);
1049				break;
1050			}
1051			ASSERT(cbytes == max_blksz);
1052		}
1053
1054		/*
1055		 * Start a transaction.
1056		 */
1057		tx = dmu_tx_create(zfsvfs->z_os);
1058		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1059		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
1060		zfs_sa_upgrade_txholds(tx, zp);
1061		error = dmu_tx_assign(tx, TXG_WAIT);
1062		if (error) {
1063			dmu_tx_abort(tx);
1064			if (abuf != NULL)
1065				dmu_return_arcbuf(abuf);
1066			break;
1067		}
1068
1069		/*
1070		 * If zfs_range_lock() over-locked we grow the blocksize
1071		 * and then reduce the lock range.  This will only happen
1072		 * on the first iteration since zfs_range_reduce() will
1073		 * shrink down r_len to the appropriate size.
1074		 */
1075		if (rl->r_len == UINT64_MAX) {
1076			uint64_t new_blksz;
1077
1078			if (zp->z_blksz > max_blksz) {
1079				/*
1080				 * File's blocksize is already larger than the
1081				 * "recordsize" property.  Only let it grow to
1082				 * the next power of 2.
1083				 */
1084				ASSERT(!ISP2(zp->z_blksz));
1085				new_blksz = MIN(end_size,
1086				    1 << highbit64(zp->z_blksz));
1087			} else {
1088				new_blksz = MIN(end_size, max_blksz);
1089			}
1090			zfs_grow_blocksize(zp, new_blksz, tx);
1091			zfs_range_reduce(rl, woff, n);
1092		}
1093
1094		/*
1095		 * XXX - should we really limit each write to z_max_blksz?
1096		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
1097		 */
1098		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
1099
1100		if (woff + nbytes > zp->z_size)
1101			vnode_pager_setsize(vp, woff + nbytes);
1102
1103		if (abuf == NULL) {
1104			tx_bytes = uio->uio_resid;
1105			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
1106			    uio, nbytes, tx);
1107			tx_bytes -= uio->uio_resid;
1108		} else {
1109			tx_bytes = nbytes;
1110			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
1111			/*
1112			 * If this is not a full block write, but we are
1113			 * extending the file past EOF and this data starts
1114			 * block-aligned, use assign_arcbuf().  Otherwise,
1115			 * write via dmu_write().
1116			 */
1117			if (tx_bytes < max_blksz && (!write_eof ||
1118			    aiov->iov_base != abuf->b_data)) {
1119				ASSERT(xuio);
1120				dmu_write(zfsvfs->z_os, zp->z_id, woff,
1121				    aiov->iov_len, aiov->iov_base, tx);
1122				dmu_return_arcbuf(abuf);
1123				xuio_stat_wbuf_copied();
1124			} else {
1125				ASSERT(xuio || tx_bytes == max_blksz);
1126				dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
1127				    woff, abuf, tx);
1128			}
1129			ASSERT(tx_bytes <= uio->uio_resid);
1130			uioskip(uio, tx_bytes);
1131		}
1132		if (tx_bytes && vn_has_cached_data(vp)) {
1133			update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
1134			    zp->z_id, uio->uio_segflg, tx);
1135		}
1136
1137		/*
1138		 * If we made no progress, we're done.  If we made even
1139		 * partial progress, update the znode and ZIL accordingly.
1140		 */
1141		if (tx_bytes == 0) {
1142			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
1143			    (void *)&zp->z_size, sizeof (uint64_t), tx);
1144			dmu_tx_commit(tx);
1145			ASSERT(error != 0);
1146			break;
1147		}
1148
1149		/*
1150		 * Clear Set-UID/Set-GID bits on successful write if not
1151		 * privileged and at least one of the excute bits is set.
1152		 *
1153		 * It would be nice to to this after all writes have
1154		 * been done, but that would still expose the ISUID/ISGID
1155		 * to another app after the partial write is committed.
1156		 *
1157		 * Note: we don't call zfs_fuid_map_id() here because
1158		 * user 0 is not an ephemeral uid.
1159		 */
1160		mutex_enter(&zp->z_acl_lock);
1161		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
1162		    (S_IXUSR >> 6))) != 0 &&
1163		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
1164		    secpolicy_vnode_setid_retain(vp, cr,
1165		    (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
1166			uint64_t newmode;
1167			zp->z_mode &= ~(S_ISUID | S_ISGID);
1168			newmode = zp->z_mode;
1169			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
1170			    (void *)&newmode, sizeof (uint64_t), tx);
1171		}
1172		mutex_exit(&zp->z_acl_lock);
1173
1174		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
1175		    B_TRUE);
1176
1177		/*
1178		 * Update the file size (zp_size) if it has changed;
1179		 * account for possible concurrent updates.
1180		 */
1181		while ((end_size = zp->z_size) < uio->uio_loffset) {
1182			(void) atomic_cas_64(&zp->z_size, end_size,
1183			    uio->uio_loffset);
1184#ifdef illumos
1185			ASSERT(error == 0);
1186#else
1187			ASSERT(error == 0 || error == EFAULT);
1188#endif
1189		}
1190		/*
1191		 * If we are replaying and eof is non zero then force
1192		 * the file size to the specified eof. Note, there's no
1193		 * concurrency during replay.
1194		 */
1195		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
1196			zp->z_size = zfsvfs->z_replay_eof;
1197
1198		if (error == 0)
1199			error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1200		else
1201			(void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1202
1203		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
1204		dmu_tx_commit(tx);
1205
1206		if (error != 0)
1207			break;
1208		ASSERT(tx_bytes == nbytes);
1209		n -= nbytes;
1210
1211#ifdef illumos
1212		if (!xuio && n > 0)
1213			uio_prefaultpages(MIN(n, max_blksz), uio);
1214#endif
1215	}
1216
1217	zfs_range_unlock(rl);
1218
1219	/*
1220	 * If we're in replay mode, or we made no progress, return error.
1221	 * Otherwise, it's at least a partial write, so it's successful.
1222	 */
1223	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
1224		ZFS_EXIT(zfsvfs);
1225		return (error);
1226	}
1227
1228#ifdef __FreeBSD__
1229	/*
1230	 * EFAULT means that at least one page of the source buffer was not
1231	 * available.  VFS will re-try remaining I/O upon this error.
1232	 */
1233	if (error == EFAULT) {
1234		ZFS_EXIT(zfsvfs);
1235		return (error);
1236	}
1237#endif
1238
1239	if (ioflag & (FSYNC | FDSYNC) ||
1240	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1241		zil_commit(zilog, zp->z_id);
1242
1243	ZFS_EXIT(zfsvfs);
1244	return (0);
1245}
1246
/*
 * Completion callback for the indirect-write dmu_sync() issued from
 * zfs_get_data().  Releases everything that zfs_get_data() acquired:
 * the dbuf hold, the range lock, the vnode reference, and the zgd
 * itself.  Also called directly by zfs_get_data() on its error paths.
 */
void
zfs_get_done(zgd_t *zgd, int error)
{
	znode_t *zp = zgd->zgd_private;
	objset_t *os = zp->z_zfsvfs->z_os;

	/* Drop the dbuf hold taken for the indirect-write case, if any. */
	if (zgd->zgd_db)
		dmu_buf_rele(zgd->zgd_db, zgd);

	zfs_range_unlock(zgd->zgd_rl);

	/*
	 * Release the vnode asynchronously as we currently have the
	 * txg stopped from syncing.
	 */
	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));

	/* On success, record the just-written block in the log write block. */
	if (error == 0 && zgd->zgd_bp)
		zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);

	kmem_free(zgd, sizeof (zgd_t));
}
1269
#ifdef DEBUG
/*
 * Fault-injection knob for debug kernels: when set non-zero (e.g., from
 * a debugger), the next indirect write in zfs_get_data() fails with EIO
 * and the flag clears itself, exercising the ZIL error path.
 */
static int zil_fault_io = 0;
#endif
1273
1274/*
1275 * Get data to generate a TX_WRITE intent log record.
1276 */
1277int
1278zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
1279{
1280	zfsvfs_t *zfsvfs = arg;
1281	objset_t *os = zfsvfs->z_os;
1282	znode_t *zp;
1283	uint64_t object = lr->lr_foid;
1284	uint64_t offset = lr->lr_offset;
1285	uint64_t size = lr->lr_length;
1286	dmu_buf_t *db;
1287	zgd_t *zgd;
1288	int error = 0;
1289
1290	ASSERT3P(lwb, !=, NULL);
1291	ASSERT3P(zio, !=, NULL);
1292	ASSERT3U(size, !=, 0);
1293
1294	/*
1295	 * Nothing to do if the file has been removed
1296	 */
1297	if (zfs_zget(zfsvfs, object, &zp) != 0)
1298		return (SET_ERROR(ENOENT));
1299	if (zp->z_unlinked) {
1300		/*
1301		 * Release the vnode asynchronously as we currently have the
1302		 * txg stopped from syncing.
1303		 */
1304		VN_RELE_ASYNC(ZTOV(zp),
1305		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1306		return (SET_ERROR(ENOENT));
1307	}
1308
1309	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
1310	zgd->zgd_lwb = lwb;
1311	zgd->zgd_private = zp;
1312
1313	/*
1314	 * Write records come in two flavors: immediate and indirect.
1315	 * For small writes it's cheaper to store the data with the
1316	 * log record (immediate); for large writes it's cheaper to
1317	 * sync the data and get a pointer to it (indirect) so that
1318	 * we don't have to write the data twice.
1319	 */
1320	if (buf != NULL) { /* immediate write */
1321		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
1322		/* test for truncation needs to be done while range locked */
1323		if (offset >= zp->z_size) {
1324			error = SET_ERROR(ENOENT);
1325		} else {
1326			error = dmu_read(os, object, offset, size, buf,
1327			    DMU_READ_NO_PREFETCH);
1328		}
1329		ASSERT(error == 0 || error == ENOENT);
1330	} else { /* indirect write */
1331		/*
1332		 * Have to lock the whole block to ensure when it's
1333		 * written out and its checksum is being calculated
1334		 * that no one can change the data. We need to re-check
1335		 * blocksize after we get the lock in case it's changed!
1336		 */
1337		for (;;) {
1338			uint64_t blkoff;
1339			size = zp->z_blksz;
1340			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
1341			offset -= blkoff;
1342			zgd->zgd_rl = zfs_range_lock(zp, offset, size,
1343			    RL_READER);
1344			if (zp->z_blksz == size)
1345				break;
1346			offset += blkoff;
1347			zfs_range_unlock(zgd->zgd_rl);
1348		}
1349		/* test for truncation needs to be done while range locked */
1350		if (lr->lr_offset >= zp->z_size)
1351			error = SET_ERROR(ENOENT);
1352#ifdef DEBUG
1353		if (zil_fault_io) {
1354			error = SET_ERROR(EIO);
1355			zil_fault_io = 0;
1356		}
1357#endif
1358		if (error == 0)
1359			error = dmu_buf_hold(os, object, offset, zgd, &db,
1360			    DMU_READ_NO_PREFETCH);
1361
1362		if (error == 0) {
1363			blkptr_t *bp = &lr->lr_blkptr;
1364
1365			zgd->zgd_db = db;
1366			zgd->zgd_bp = bp;
1367
1368			ASSERT(db->db_offset == offset);
1369			ASSERT(db->db_size == size);
1370
1371			error = dmu_sync(zio, lr->lr_common.lrc_txg,
1372			    zfs_get_done, zgd);
1373			ASSERT(error || lr->lr_length <= size);
1374
1375			/*
1376			 * On success, we need to wait for the write I/O
1377			 * initiated by dmu_sync() to complete before we can
1378			 * release this dbuf.  We will finish everything up
1379			 * in the zfs_get_done() callback.
1380			 */
1381			if (error == 0)
1382				return (0);
1383
1384			if (error == EALREADY) {
1385				lr->lr_common.lrc_txtype = TX_WRITE2;
1386				/*
1387				 * TX_WRITE2 relies on the data previously
1388				 * written by the TX_WRITE that caused
1389				 * EALREADY.  We zero out the BP because
1390				 * it is the old, currently-on-disk BP,
1391				 * so there's no need to zio_flush() its
1392				 * vdevs (flushing would needlesly hurt
1393				 * performance, and doesn't work on
1394				 * indirect vdevs).
1395				 */
1396				zgd->zgd_bp = NULL;
1397				BP_ZERO(bp);
1398				error = 0;
1399			}
1400		}
1401	}
1402
1403	zfs_get_done(zgd, error);
1404
1405	return (error);
1406}
1407
1408/*ARGSUSED*/
1409static int
1410zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
1411    caller_context_t *ct)
1412{
1413	znode_t *zp = VTOZ(vp);
1414	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1415	int error;
1416
1417	ZFS_ENTER(zfsvfs);
1418	ZFS_VERIFY_ZP(zp);
1419
1420	if (flag & V_ACE_MASK)
1421		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
1422	else
1423		error = zfs_zaccess_rwx(zp, mode, flag, cr);
1424
1425	ZFS_EXIT(zfsvfs);
1426	return (error);
1427}
1428
/*
 * Callback for vn_vget_ino_gen() used by zfs_lookup_lock(): lock the
 * vnode passed in "arg" with the requested flags, dropping its
 * reference if the lock cannot be acquired.
 */
static int
zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
{
	struct vnode *vp = arg;
	int error;

	*vpp = vp;
	error = vn_lock(vp, lkflags);
	if (error != 0)
		vrele(vp);
	return (error);
}
1440
/*
 * Acquire the vnode lock on the result of a lookup, handling the three
 * distinct cases: "." (same vnode as the locked parent), ".." (reverse
 * lock order via vn_vget_ino_gen()), and an ordinary child.  On
 * failure the reference on "vp" is dropped.
 */
static int
zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
{
	znode_t *zdp = VTOZ(dvp);
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
	int error;
	int ltype;

	ASSERT_VOP_LOCKED(dvp, __func__);
#ifdef DIAGNOSTIC
	/* Sanity: teardown lock must not be held except for xattr dirs. */
	if ((zdp->z_pflags & ZFS_XATTR) == 0)
		VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
#endif

	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
		/* "." lookup: dvp and vp are the same, already locked. */
		ASSERT3P(dvp, ==, vp);
		vref(dvp);
		ltype = lkflags & LK_TYPE_MASK;
		if (ltype != VOP_ISLOCKED(dvp)) {
			/* Convert the existing lock to the requested type. */
			if (ltype == LK_EXCLUSIVE)
				vn_lock(dvp, LK_UPGRADE | LK_RETRY);
			else /* if (ltype == LK_SHARED) */
				vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);

			/*
			 * Relock for the "." case could leave us with
			 * reclaimed vnode.
			 */
			if (dvp->v_iflag & VI_DOOMED) {
				vrele(dvp);
				return (SET_ERROR(ENOENT));
			}
		}
		return (0);
	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
		/*
		 * Note that in this case, dvp is the child vnode, and we
		 * are looking up the parent vnode - exactly reverse from
		 * normal operation.  Unlocking dvp requires some rather
		 * tricky unlock/relock dance to prevent mp from being freed;
		 * use vn_vget_ino_gen() which takes care of all that.
		 *
		 * XXX Note that there is a time window when both vnodes are
		 * unlocked.  It is possible, although highly unlikely, that
		 * during that window the parent-child relationship between
		 * the vnodes may change, for example, get reversed.
		 * In that case we would have a wrong lock order for the vnodes.
		 * All other filesystems seem to ignore this problem, so we
		 * do the same here.
		 * A potential solution could be implemented as follows:
		 * - using LK_NOWAIT when locking the second vnode and retrying
		 *   if necessary
		 * - checking that the parent-child relationship still holds
		 *   after locking both vnodes and retrying if it doesn't
		 */
		error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
		return (error);
	} else {
		/* Ordinary child: parent is locked first, then the child. */
		error = vn_lock(vp, lkflags);
		if (error != 0)
			vrele(vp);
		return (error);
	}
}
1505
1506/*
1507 * Lookup an entry in a directory, or an extended attribute directory.
1508 * If it exists, return a held vnode reference for it.
1509 *
1510 *	IN:	dvp	- vnode of directory to search.
1511 *		nm	- name of entry to lookup.
1512 *		pnp	- full pathname to lookup [UNUSED].
1513 *		flags	- LOOKUP_XATTR set if looking for an attribute.
1514 *		rdir	- root directory vnode [UNUSED].
1515 *		cr	- credentials of caller.
1516 *		ct	- caller context
1517 *
1518 *	OUT:	vpp	- vnode of located entry, NULL if not found.
1519 *
1520 *	RETURN:	0 on success, error code on failure.
1521 *
1522 * Timestamps:
1523 *	NA
1524 */
1525/* ARGSUSED */
1526static int
1527zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
1528    int nameiop, cred_t *cr, kthread_t *td, int flags, boolean_t cached)
1529{
1530	znode_t *zdp = VTOZ(dvp);
1531	znode_t *zp;
1532	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1533	int	error = 0;
1534
1535	/*
1536	 * Fast path lookup, however we must skip DNLC lookup
1537	 * for case folding or normalizing lookups because the
1538	 * DNLC code only stores the passed in name.  This means
1539	 * creating 'a' and removing 'A' on a case insensitive
1540	 * file system would work, but DNLC still thinks 'a'
1541	 * exists and won't let you create it again on the next
1542	 * pass through fast path.
1543	 */
1544	if (!(flags & LOOKUP_XATTR)) {
1545		if (dvp->v_type != VDIR) {
1546			return (SET_ERROR(ENOTDIR));
1547		} else if (zdp->z_sa_hdl == NULL) {
1548			return (SET_ERROR(EIO));
1549		}
1550	}
1551
1552	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
1553
1554	ZFS_ENTER(zfsvfs);
1555	ZFS_VERIFY_ZP(zdp);
1556
1557	*vpp = NULL;
1558
1559	if (flags & LOOKUP_XATTR) {
1560#ifdef TODO
1561		/*
1562		 * If the xattr property is off, refuse the lookup request.
1563		 */
1564		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
1565			ZFS_EXIT(zfsvfs);
1566			return (SET_ERROR(EINVAL));
1567		}
1568#endif
1569
1570		/*
1571		 * We don't allow recursive attributes..
1572		 * Maybe someday we will.
1573		 */
1574		if (zdp->z_pflags & ZFS_XATTR) {
1575			ZFS_EXIT(zfsvfs);
1576			return (SET_ERROR(EINVAL));
1577		}
1578
1579		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
1580			ZFS_EXIT(zfsvfs);
1581			return (error);
1582		}
1583
1584		/*
1585		 * Do we have permission to get into attribute directory?
1586		 */
1587		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
1588		    B_FALSE, cr)) {
1589			vrele(*vpp);
1590			*vpp = NULL;
1591		}
1592
1593		ZFS_EXIT(zfsvfs);
1594		return (error);
1595	}
1596
1597	/*
1598	 * Check accessibility of directory.
1599	 */
1600	if (!cached) {
1601		if ((cnp->cn_flags & NOEXECCHECK) != 0) {
1602			cnp->cn_flags &= ~NOEXECCHECK;
1603		} else {
1604			error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr);
1605			if (error != 0) {
1606				ZFS_EXIT(zfsvfs);
1607				return (error);
1608			}
1609		}
1610	}
1611
1612	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
1613	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1614		ZFS_EXIT(zfsvfs);
1615		return (SET_ERROR(EILSEQ));
1616	}
1617
1618
1619	/*
1620	 * First handle the special cases.
1621	 */
1622	if ((cnp->cn_flags & ISDOTDOT) != 0) {
1623		/*
1624		 * If we are a snapshot mounted under .zfs, return
1625		 * the vp for the snapshot directory.
1626		 */
1627		if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
1628			struct componentname cn;
1629			vnode_t *zfsctl_vp;
1630			int ltype;
1631
1632			ZFS_EXIT(zfsvfs);
1633			ltype = VOP_ISLOCKED(dvp);
1634			VOP_UNLOCK(dvp, 0);
1635			error = zfsctl_root(zfsvfs->z_parent, LK_SHARED,
1636			    &zfsctl_vp);
1637			if (error == 0) {
1638				cn.cn_nameptr = "snapshot";
1639				cn.cn_namelen = strlen(cn.cn_nameptr);
1640				cn.cn_nameiop = cnp->cn_nameiop;
1641				cn.cn_flags = cnp->cn_flags & ~ISDOTDOT;
1642				cn.cn_lkflags = cnp->cn_lkflags;
1643				error = VOP_LOOKUP(zfsctl_vp, vpp, &cn);
1644				vput(zfsctl_vp);
1645			}
1646			vn_lock(dvp, ltype | LK_RETRY);
1647			return (error);
1648		}
1649	}
1650	if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
1651		ZFS_EXIT(zfsvfs);
1652		if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
1653			return (SET_ERROR(ENOTSUP));
1654		error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp);
1655		return (error);
1656	}
1657
1658	/*
1659	 * The loop is retry the lookup if the parent-child relationship
1660	 * changes during the dot-dot locking complexities.
1661	 */
1662	for (;;) {
1663		uint64_t parent;
1664
1665		error = zfs_dirlook(zdp, nm, &zp);
1666		if (error == 0)
1667			*vpp = ZTOV(zp);
1668
1669		ZFS_EXIT(zfsvfs);
1670		if (error != 0)
1671			break;
1672
1673		error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
1674		if (error != 0) {
1675			/*
1676			 * If we've got a locking error, then the vnode
1677			 * got reclaimed because of a force unmount.
1678			 * We never enter doomed vnodes into the name cache.
1679			 */
1680			*vpp = NULL;
1681			return (error);
1682		}
1683
1684		if ((cnp->cn_flags & ISDOTDOT) == 0)
1685			break;
1686
1687		ZFS_ENTER(zfsvfs);
1688		if (zdp->z_sa_hdl == NULL) {
1689			error = SET_ERROR(EIO);
1690		} else {
1691			error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
1692			    &parent, sizeof (parent));
1693		}
1694		if (error != 0) {
1695			ZFS_EXIT(zfsvfs);
1696			vput(ZTOV(zp));
1697			break;
1698		}
1699		if (zp->z_id == parent) {
1700			ZFS_EXIT(zfsvfs);
1701			break;
1702		}
1703		vput(ZTOV(zp));
1704	}
1705
1706out:
1707	if (error != 0)
1708		*vpp = NULL;
1709
1710	/* Translate errors and add SAVENAME when needed. */
1711	if (cnp->cn_flags & ISLASTCN) {
1712		switch (nameiop) {
1713		case CREATE:
1714		case RENAME:
1715			if (error == ENOENT) {
1716				error = EJUSTRETURN;
1717				cnp->cn_flags |= SAVENAME;
1718				break;
1719			}
1720			/* FALLTHROUGH */
1721		case DELETE:
1722			if (error == 0)
1723				cnp->cn_flags |= SAVENAME;
1724			break;
1725		}
1726	}
1727
1728	/* Insert name into cache (as non-existent) if appropriate. */
1729	if (zfsvfs->z_use_namecache &&
1730	    error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
1731		cache_enter(dvp, NULL, cnp);
1732
1733	/* Insert name into cache if appropriate. */
1734	if (zfsvfs->z_use_namecache &&
1735	    error == 0 && (cnp->cn_flags & MAKEENTRY)) {
1736		if (!(cnp->cn_flags & ISLASTCN) ||
1737		    (nameiop != DELETE && nameiop != RENAME)) {
1738			cache_enter(dvp, *vpp, cnp);
1739		}
1740	}
1741
1742	return (error);
1743}
1744
1745/*
1746 * Attempt to create a new entry in a directory.  If the entry
1747 * already exists, truncate the file if permissible, else return
1748 * an error.  Return the vp of the created or trunc'd file.
1749 *
1750 *	IN:	dvp	- vnode of directory to put new file entry in.
1751 *		name	- name of new file entry.
1752 *		vap	- attributes of new file.
1753 *		excl	- flag indicating exclusive or non-exclusive mode.
1754 *		mode	- mode to open file with.
1755 *		cr	- credentials of caller.
1756 *		flag	- large file flag [UNUSED].
1757 *		ct	- caller context
1758 *		vsecp	- ACL to be set
1759 *
1760 *	OUT:	vpp	- vnode of created or trunc'd entry.
1761 *
1762 *	RETURN:	0 on success, error code on failure.
1763 *
1764 * Timestamps:
1765 *	dvp - ctime|mtime updated if new entry created
1766 *	 vp - ctime|mtime always, atime if new
1767 */
1768
1769/* ARGSUSED */
1770static int
1771zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
1772    vnode_t **vpp, cred_t *cr, kthread_t *td)
1773{
1774	znode_t		*zp, *dzp = VTOZ(dvp);
1775	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1776	zilog_t		*zilog;
1777	objset_t	*os;
1778	dmu_tx_t	*tx;
1779	int		error;
1780	ksid_t		*ksid;
1781	uid_t		uid;
1782	gid_t		gid = crgetgid(cr);
1783	zfs_acl_ids_t   acl_ids;
1784	boolean_t	fuid_dirtied;
1785	void		*vsecp = NULL;
1786	int		flag = 0;
1787	uint64_t	txtype;
1788
1789	/*
1790	 * If we have an ephemeral id, ACL, or XVATTR then
1791	 * make sure file system is at proper version
1792	 */
1793
1794	ksid = crgetsid(cr, KSID_OWNER);
1795	if (ksid)
1796		uid = ksid_getid(ksid);
1797	else
1798		uid = crgetuid(cr);
1799
1800	if (zfsvfs->z_use_fuids == B_FALSE &&
1801	    (vsecp || (vap->va_mask & AT_XVATTR) ||
1802	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1803		return (SET_ERROR(EINVAL));
1804
1805	ZFS_ENTER(zfsvfs);
1806	ZFS_VERIFY_ZP(dzp);
1807	os = zfsvfs->z_os;
1808	zilog = zfsvfs->z_log;
1809
1810	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
1811	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1812		ZFS_EXIT(zfsvfs);
1813		return (SET_ERROR(EILSEQ));
1814	}
1815
1816	if (vap->va_mask & AT_XVATTR) {
1817		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
1818		    crgetuid(cr), cr, vap->va_type)) != 0) {
1819			ZFS_EXIT(zfsvfs);
1820			return (error);
1821		}
1822	}
1823
1824	*vpp = NULL;
1825
1826	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
1827		vap->va_mode &= ~S_ISVTX;
1828
1829	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
1830	if (error) {
1831		ZFS_EXIT(zfsvfs);
1832		return (error);
1833	}
1834	ASSERT3P(zp, ==, NULL);
1835
1836	/*
1837	 * Create a new file object and update the directory
1838	 * to reference it.
1839	 */
1840	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
1841		goto out;
1842	}
1843
1844	/*
1845	 * We only support the creation of regular files in
1846	 * extended attribute directories.
1847	 */
1848
1849	if ((dzp->z_pflags & ZFS_XATTR) &&
1850	    (vap->va_type != VREG)) {
1851		error = SET_ERROR(EINVAL);
1852		goto out;
1853	}
1854
1855	if ((error = zfs_acl_ids_create(dzp, 0, vap,
1856	    cr, vsecp, &acl_ids)) != 0)
1857		goto out;
1858
1859	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1860		zfs_acl_ids_free(&acl_ids);
1861		error = SET_ERROR(EDQUOT);
1862		goto out;
1863	}
1864
1865	getnewvnode_reserve(1);
1866
1867	tx = dmu_tx_create(os);
1868
1869	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1870	    ZFS_SA_BASE_ATTR_SIZE);
1871
1872	fuid_dirtied = zfsvfs->z_fuid_dirty;
1873	if (fuid_dirtied)
1874		zfs_fuid_txhold(zfsvfs, tx);
1875	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1876	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
1877	if (!zfsvfs->z_use_sa &&
1878	    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1879		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1880		    0, acl_ids.z_aclp->z_acl_bytes);
1881	}
1882	error = dmu_tx_assign(tx, TXG_WAIT);
1883	if (error) {
1884		zfs_acl_ids_free(&acl_ids);
1885		dmu_tx_abort(tx);
1886		getnewvnode_drop_reserve();
1887		ZFS_EXIT(zfsvfs);
1888		return (error);
1889	}
1890	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1891
1892	if (fuid_dirtied)
1893		zfs_fuid_sync(zfsvfs, tx);
1894
1895	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
1896	txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1897	zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1898	    vsecp, acl_ids.z_fuidp, vap);
1899	zfs_acl_ids_free(&acl_ids);
1900	dmu_tx_commit(tx);
1901
1902	getnewvnode_drop_reserve();
1903
1904out:
1905	if (error == 0) {
1906		*vpp = ZTOV(zp);
1907	}
1908
1909	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1910		zil_commit(zilog, 0);
1911
1912	ZFS_EXIT(zfsvfs);
1913	return (error);
1914}
1915
1916/*
1917 * Remove an entry from a directory.
1918 *
1919 *	IN:	dvp	- vnode of directory to remove entry from.
1920 *		name	- name of entry to remove.
1921 *		cr	- credentials of caller.
1922 *		ct	- caller context
1923 *		flags	- case flags
1924 *
1925 *	RETURN:	0 on success, error code on failure.
1926 *
1927 * Timestamps:
1928 *	dvp - ctime|mtime
1929 *	 vp - ctime (if nlink > 0)
1930 */
1931
1932/*ARGSUSED*/
1933static int
1934zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
1935{
1936	znode_t		*dzp = VTOZ(dvp);
1937	znode_t		*zp = VTOZ(vp);
1938	znode_t		*xzp;
1939	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1940	zilog_t		*zilog;
1941	uint64_t	acl_obj, xattr_obj;
1942	uint64_t	obj = 0;
1943	dmu_tx_t	*tx;
1944	boolean_t	unlinked, toobig = FALSE;
1945	uint64_t	txtype;
1946	int		error;
1947
1948	ZFS_ENTER(zfsvfs);
1949	ZFS_VERIFY_ZP(dzp);
1950	ZFS_VERIFY_ZP(zp);
1951	zilog = zfsvfs->z_log;
1952	zp = VTOZ(vp);
1953
1954	xattr_obj = 0;
1955	xzp = NULL;
1956
1957	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1958		goto out;
1959	}
1960
1961	/*
1962	 * Need to use rmdir for removing directories.
1963	 */
1964	if (vp->v_type == VDIR) {
1965		error = SET_ERROR(EPERM);
1966		goto out;
1967	}
1968
1969	vnevent_remove(vp, dvp, name, ct);
1970
1971	obj = zp->z_id;
1972
1973	/* are there any extended attributes? */
1974	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1975	    &xattr_obj, sizeof (xattr_obj));
1976	if (error == 0 && xattr_obj) {
1977		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1978		ASSERT0(error);
1979	}
1980
1981	/*
1982	 * We may delete the znode now, or we may put it in the unlinked set;
1983	 * it depends on whether we're the last link, and on whether there are
1984	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
1985	 * allow for either case.
1986	 */
1987	tx = dmu_tx_create(zfsvfs->z_os);
1988	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1989	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1990	zfs_sa_upgrade_txholds(tx, zp);
1991	zfs_sa_upgrade_txholds(tx, dzp);
1992
1993	if (xzp) {
1994		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1995		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1996	}
1997
1998	/* charge as an update -- would be nice not to charge at all */
1999	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2000
2001	/*
2002	 * Mark this transaction as typically resulting in a net free of space
2003	 */
2004	dmu_tx_mark_netfree(tx);
2005
2006	error = dmu_tx_assign(tx, TXG_WAIT);
2007	if (error) {
2008		dmu_tx_abort(tx);
2009		ZFS_EXIT(zfsvfs);
2010		return (error);
2011	}
2012
2013	/*
2014	 * Remove the directory entry.
2015	 */
2016	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);
2017
2018	if (error) {
2019		dmu_tx_commit(tx);
2020		goto out;
2021	}
2022
2023	if (unlinked) {
2024		zfs_unlinked_add(zp, tx);
2025		vp->v_vflag |= VV_NOSYNC;
2026	}
2027
2028	txtype = TX_REMOVE;
2029	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
2030
2031	dmu_tx_commit(tx);
2032out:
2033
2034	if (xzp)
2035		vrele(ZTOV(xzp));
2036
2037	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2038		zil_commit(zilog, 0);
2039
2040	ZFS_EXIT(zfsvfs);
2041	return (error);
2042}
2043
2044/*
2045 * Create a new directory and insert it into dvp using the name
2046 * provided.  Return a pointer to the inserted directory.
2047 *
2048 *	IN:	dvp	- vnode of directory to add subdir to.
2049 *		dirname	- name of new directory.
2050 *		vap	- attributes of new directory.
2051 *		cr	- credentials of caller.
2052 *		ct	- caller context
2053 *		flags	- case flags
2054 *		vsecp	- ACL to be set
2055 *
2056 *	OUT:	vpp	- vnode of created directory.
2057 *
2058 *	RETURN:	0 on success, error code on failure.
2059 *
2060 * Timestamps:
2061 *	dvp - ctime|mtime updated
2062 *	 vp - ctime|mtime|atime updated
2063 */
2064/*ARGSUSED*/
2065static int
2066zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
2067{
2068	znode_t		*zp, *dzp = VTOZ(dvp);
2069	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2070	zilog_t		*zilog;
2071	uint64_t	txtype;
2072	dmu_tx_t	*tx;
2073	int		error;
2074	ksid_t		*ksid;
2075	uid_t		uid;
2076	gid_t		gid = crgetgid(cr);
2077	zfs_acl_ids_t   acl_ids;
2078	boolean_t	fuid_dirtied;
2079
2080	ASSERT(vap->va_type == VDIR);
2081
2082	/*
2083	 * If we have an ephemeral id, ACL, or XVATTR then
2084	 * make sure file system is at proper version
2085	 */
2086
2087	ksid = crgetsid(cr, KSID_OWNER);
2088	if (ksid)
2089		uid = ksid_getid(ksid);
2090	else
2091		uid = crgetuid(cr);
2092	if (zfsvfs->z_use_fuids == B_FALSE &&
2093	    ((vap->va_mask & AT_XVATTR) ||
2094	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
2095		return (SET_ERROR(EINVAL));
2096
2097	ZFS_ENTER(zfsvfs);
2098	ZFS_VERIFY_ZP(dzp);
2099	zilog = zfsvfs->z_log;
2100
2101	if (dzp->z_pflags & ZFS_XATTR) {
2102		ZFS_EXIT(zfsvfs);
2103		return (SET_ERROR(EINVAL));
2104	}
2105
2106	if (zfsvfs->z_utf8 && u8_validate(dirname,
2107	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2108		ZFS_EXIT(zfsvfs);
2109		return (SET_ERROR(EILSEQ));
2110	}
2111
2112	if (vap->va_mask & AT_XVATTR) {
2113		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
2114		    crgetuid(cr), cr, vap->va_type)) != 0) {
2115			ZFS_EXIT(zfsvfs);
2116			return (error);
2117		}
2118	}
2119
2120	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
2121	    NULL, &acl_ids)) != 0) {
2122		ZFS_EXIT(zfsvfs);
2123		return (error);
2124	}
2125
2126	/*
2127	 * First make sure the new directory doesn't exist.
2128	 *
2129	 * Existence is checked first to make sure we don't return
2130	 * EACCES instead of EEXIST which can cause some applications
2131	 * to fail.
2132	 */
2133	*vpp = NULL;
2134
2135	if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) {
2136		zfs_acl_ids_free(&acl_ids);
2137		ZFS_EXIT(zfsvfs);
2138		return (error);
2139	}
2140	ASSERT3P(zp, ==, NULL);
2141
2142	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
2143		zfs_acl_ids_free(&acl_ids);
2144		ZFS_EXIT(zfsvfs);
2145		return (error);
2146	}
2147
2148	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
2149		zfs_acl_ids_free(&acl_ids);
2150		ZFS_EXIT(zfsvfs);
2151		return (SET_ERROR(EDQUOT));
2152	}
2153
2154	/*
2155	 * Add a new entry to the directory.
2156	 */
2157	getnewvnode_reserve(1);
2158	tx = dmu_tx_create(zfsvfs->z_os);
2159	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
2160	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2161	fuid_dirtied = zfsvfs->z_fuid_dirty;
2162	if (fuid_dirtied)
2163		zfs_fuid_txhold(zfsvfs, tx);
2164	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2165		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
2166		    acl_ids.z_aclp->z_acl_bytes);
2167	}
2168
2169	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
2170	    ZFS_SA_BASE_ATTR_SIZE);
2171
2172	error = dmu_tx_assign(tx, TXG_WAIT);
2173	if (error) {
2174		zfs_acl_ids_free(&acl_ids);
2175		dmu_tx_abort(tx);
2176		getnewvnode_drop_reserve();
2177		ZFS_EXIT(zfsvfs);
2178		return (error);
2179	}
2180
2181	/*
2182	 * Create new node.
2183	 */
2184	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
2185
2186	if (fuid_dirtied)
2187		zfs_fuid_sync(zfsvfs, tx);
2188
2189	/*
2190	 * Now put new name in parent dir.
2191	 */
2192	(void) zfs_link_create(dzp, dirname, zp, tx, ZNEW);
2193
2194	*vpp = ZTOV(zp);
2195
2196	txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
2197	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
2198	    acl_ids.z_fuidp, vap);
2199
2200	zfs_acl_ids_free(&acl_ids);
2201
2202	dmu_tx_commit(tx);
2203
2204	getnewvnode_drop_reserve();
2205
2206	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2207		zil_commit(zilog, 0);
2208
2209	ZFS_EXIT(zfsvfs);
2210	return (0);
2211}
2212
2213/*
2214 * Remove a directory subdir entry.  If the current working
2215 * directory is the same as the subdir to be removed, the
2216 * remove will fail.
2217 *
2218 *	IN:	dvp	- vnode of directory to remove from.
2219 *		name	- name of directory to be removed.
2220 *		cwd	- vnode of current working directory.
2221 *		cr	- credentials of caller.
2222 *		ct	- caller context
2223 *		flags	- case flags
2224 *
2225 *	RETURN:	0 on success, error code on failure.
2226 *
2227 * Timestamps:
2228 *	dvp - ctime|mtime updated
2229 */
2230/*ARGSUSED*/
2231static int
2232zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
2233{
2234	znode_t		*dzp = VTOZ(dvp);
2235	znode_t		*zp = VTOZ(vp);
2236	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2237	zilog_t		*zilog;
2238	dmu_tx_t	*tx;
2239	int		error;
2240
2241	ZFS_ENTER(zfsvfs);
2242	ZFS_VERIFY_ZP(dzp);
2243	ZFS_VERIFY_ZP(zp);
2244	zilog = zfsvfs->z_log;
2245
2246
2247	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
2248		goto out;
2249	}
2250
2251	if (vp->v_type != VDIR) {
2252		error = SET_ERROR(ENOTDIR);
2253		goto out;
2254	}
2255
2256	vnevent_rmdir(vp, dvp, name, ct);
2257
2258	tx = dmu_tx_create(zfsvfs->z_os);
2259	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2260	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2261	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2262	zfs_sa_upgrade_txholds(tx, zp);
2263	zfs_sa_upgrade_txholds(tx, dzp);
2264	dmu_tx_mark_netfree(tx);
2265	error = dmu_tx_assign(tx, TXG_WAIT);
2266	if (error) {
2267		dmu_tx_abort(tx);
2268		ZFS_EXIT(zfsvfs);
2269		return (error);
2270	}
2271
2272	cache_purge(dvp);
2273
2274	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);
2275
2276	if (error == 0) {
2277		uint64_t txtype = TX_RMDIR;
2278		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
2279	}
2280
2281	dmu_tx_commit(tx);
2282
2283	cache_purge(vp);
2284out:
2285	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2286		zil_commit(zilog, 0);
2287
2288	ZFS_EXIT(zfsvfs);
2289	return (error);
2290}
2291
2292/*
2293 * Read as many directory entries as will fit into the provided
2294 * buffer from the given directory cursor position (specified in
2295 * the uio structure).
2296 *
2297 *	IN:	vp	- vnode of directory to read.
2298 *		uio	- structure supplying read location, range info,
2299 *			  and return buffer.
2300 *		cr	- credentials of caller.
2301 *		ct	- caller context
2302 *		flags	- case flags
2303 *
2304 *	OUT:	uio	- updated offset and range, buffer filled.
2305 *		eofp	- set to true if end-of-file detected.
2306 *
2307 *	RETURN:	0 on success, error code on failure.
2308 *
2309 * Timestamps:
2310 *	vp - atime updated
2311 *
2312 * Note that the low 4 bits of the cookie returned by zap is always zero.
2313 * This allows us to use the low range for "special" directory entries:
2314 * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
2315 * we use the offset 2 for the '.zfs' directory.
2316 */
2317/* ARGSUSED */
2318static int
2319zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
2320{
2321	znode_t		*zp = VTOZ(vp);
2322	iovec_t		*iovp;
2323	edirent_t	*eodp;
2324	dirent64_t	*odp;
2325	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2326	objset_t	*os;
2327	caddr_t		outbuf;
2328	size_t		bufsize;
2329	zap_cursor_t	zc;
2330	zap_attribute_t	zap;
2331	uint_t		bytes_wanted;
2332	uint64_t	offset; /* must be unsigned; checks for < 1 */
2333	uint64_t	parent;
2334	int		local_eof;
2335	int		outcount;
2336	int		error;
2337	uint8_t		prefetch;
2338	boolean_t	check_sysattrs;
2339	uint8_t		type;
2340	int		ncooks;
2341	u_long		*cooks = NULL;
2342	int		flags = 0;
2343
2344	ZFS_ENTER(zfsvfs);
2345	ZFS_VERIFY_ZP(zp);
2346
2347	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
2348	    &parent, sizeof (parent))) != 0) {
2349		ZFS_EXIT(zfsvfs);
2350		return (error);
2351	}
2352
2353	/*
2354	 * If we are not given an eof variable,
2355	 * use a local one.
2356	 */
2357	if (eofp == NULL)
2358		eofp = &local_eof;
2359
2360	/*
2361	 * Check for valid iov_len.
2362	 */
2363	if (uio->uio_iov->iov_len <= 0) {
2364		ZFS_EXIT(zfsvfs);
2365		return (SET_ERROR(EINVAL));
2366	}
2367
2368	/*
2369	 * Quit if directory has been removed (posix)
2370	 */
2371	if ((*eofp = zp->z_unlinked) != 0) {
2372		ZFS_EXIT(zfsvfs);
2373		return (0);
2374	}
2375
2376	error = 0;
2377	os = zfsvfs->z_os;
2378	offset = uio->uio_loffset;
2379	prefetch = zp->z_zn_prefetch;
2380
2381	/*
2382	 * Initialize the iterator cursor.
2383	 */
2384	if (offset <= 3) {
2385		/*
2386		 * Start iteration from the beginning of the directory.
2387		 */
2388		zap_cursor_init(&zc, os, zp->z_id);
2389	} else {
2390		/*
2391		 * The offset is a serialized cursor.
2392		 */
2393		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
2394	}
2395
2396	/*
2397	 * Get space to change directory entries into fs independent format.
2398	 */
2399	iovp = uio->uio_iov;
2400	bytes_wanted = iovp->iov_len;
2401	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
2402		bufsize = bytes_wanted;
2403		outbuf = kmem_alloc(bufsize, KM_SLEEP);
2404		odp = (struct dirent64 *)outbuf;
2405	} else {
2406		bufsize = bytes_wanted;
2407		outbuf = NULL;
2408		odp = (struct dirent64 *)iovp->iov_base;
2409	}
2410	eodp = (struct edirent *)odp;
2411
2412	if (ncookies != NULL) {
2413		/*
2414		 * Minimum entry size is dirent size and 1 byte for a file name.
2415		 */
2416		ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
2417		cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
2418		*cookies = cooks;
2419		*ncookies = ncooks;
2420	}
2421	/*
2422	 * If this VFS supports the system attribute view interface; and
2423	 * we're looking at an extended attribute directory; and we care
2424	 * about normalization conflicts on this vfs; then we must check
2425	 * for normalization conflicts with the sysattr name space.
2426	 */
2427#ifdef TODO
2428	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
2429	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
2430	    (flags & V_RDDIR_ENTFLAGS);
2431#else
2432	check_sysattrs = 0;
2433#endif
2434
2435	/*
2436	 * Transform to file-system independent format
2437	 */
2438	outcount = 0;
2439	while (outcount < bytes_wanted) {
2440		ino64_t objnum;
2441		ushort_t reclen;
2442		off64_t *next = NULL;
2443
2444		/*
2445		 * Special case `.', `..', and `.zfs'.
2446		 */
2447		if (offset == 0) {
2448			(void) strcpy(zap.za_name, ".");
2449			zap.za_normalization_conflict = 0;
2450			objnum = zp->z_id;
2451			type = DT_DIR;
2452		} else if (offset == 1) {
2453			(void) strcpy(zap.za_name, "..");
2454			zap.za_normalization_conflict = 0;
2455			objnum = parent;
2456			type = DT_DIR;
2457		} else if (offset == 2 && zfs_show_ctldir(zp)) {
2458			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
2459			zap.za_normalization_conflict = 0;
2460			objnum = ZFSCTL_INO_ROOT;
2461			type = DT_DIR;
2462		} else {
2463			/*
2464			 * Grab next entry.
2465			 */
2466			if (error = zap_cursor_retrieve(&zc, &zap)) {
2467				if ((*eofp = (error == ENOENT)) != 0)
2468					break;
2469				else
2470					goto update;
2471			}
2472
2473			if (zap.za_integer_length != 8 ||
2474			    zap.za_num_integers != 1) {
2475				cmn_err(CE_WARN, "zap_readdir: bad directory "
2476				    "entry, obj = %lld, offset = %lld\n",
2477				    (u_longlong_t)zp->z_id,
2478				    (u_longlong_t)offset);
2479				error = SET_ERROR(ENXIO);
2480				goto update;
2481			}
2482
2483			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
2484			/*
2485			 * MacOS X can extract the object type here such as:
2486			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2487			 */
2488			type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2489
2490			if (check_sysattrs && !zap.za_normalization_conflict) {
2491#ifdef TODO
2492				zap.za_normalization_conflict =
2493				    xattr_sysattr_casechk(zap.za_name);
2494#else
2495				panic("%s:%u: TODO", __func__, __LINE__);
2496#endif
2497			}
2498		}
2499
2500		if (flags & V_RDDIR_ACCFILTER) {
2501			/*
2502			 * If we have no access at all, don't include
2503			 * this entry in the returned information
2504			 */
2505			znode_t	*ezp;
2506			if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
2507				goto skip_entry;
2508			if (!zfs_has_access(ezp, cr)) {
2509				vrele(ZTOV(ezp));
2510				goto skip_entry;
2511			}
2512			vrele(ZTOV(ezp));
2513		}
2514
2515		if (flags & V_RDDIR_ENTFLAGS)
2516			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
2517		else
2518			reclen = DIRENT64_RECLEN(strlen(zap.za_name));
2519
2520		/*
2521		 * Will this entry fit in the buffer?
2522		 */
2523		if (outcount + reclen > bufsize) {
2524			/*
2525			 * Did we manage to fit anything in the buffer?
2526			 */
2527			if (!outcount) {
2528				error = SET_ERROR(EINVAL);
2529				goto update;
2530			}
2531			break;
2532		}
2533		if (flags & V_RDDIR_ENTFLAGS) {
2534			/*
2535			 * Add extended flag entry:
2536			 */
2537			eodp->ed_ino = objnum;
2538			eodp->ed_reclen = reclen;
2539			/* NOTE: ed_off is the offset for the *next* entry */
2540			next = &(eodp->ed_off);
2541			eodp->ed_eflags = zap.za_normalization_conflict ?
2542			    ED_CASE_CONFLICT : 0;
2543			(void) strncpy(eodp->ed_name, zap.za_name,
2544			    EDIRENT_NAMELEN(reclen));
2545			eodp = (edirent_t *)((intptr_t)eodp + reclen);
2546		} else {
2547			/*
2548			 * Add normal entry:
2549			 */
2550			odp->d_ino = objnum;
2551			odp->d_reclen = reclen;
2552			odp->d_namlen = strlen(zap.za_name);
2553			(void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
2554			odp->d_type = type;
2555			dirent_terminate(odp);
2556			odp = (dirent64_t *)((intptr_t)odp + reclen);
2557		}
2558		outcount += reclen;
2559
2560		ASSERT(outcount <= bufsize);
2561
2562		/* Prefetch znode */
2563		if (prefetch)
2564			dmu_prefetch(os, objnum, 0, 0, 0,
2565			    ZIO_PRIORITY_SYNC_READ);
2566
2567	skip_entry:
2568		/*
2569		 * Move to the next entry, fill in the previous offset.
2570		 */
2571		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2572			zap_cursor_advance(&zc);
2573			offset = zap_cursor_serialize(&zc);
2574		} else {
2575			offset += 1;
2576		}
2577
2578		if (cooks != NULL) {
2579			*cooks++ = offset;
2580			ncooks--;
2581			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
2582		}
2583	}
2584	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2585
2586	/* Subtract unused cookies */
2587	if (ncookies != NULL)
2588		*ncookies -= ncooks;
2589
2590	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
2591		iovp->iov_base += outcount;
2592		iovp->iov_len -= outcount;
2593		uio->uio_resid -= outcount;
2594	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
2595		/*
2596		 * Reset the pointer.
2597		 */
2598		offset = uio->uio_loffset;
2599	}
2600
2601update:
2602	zap_cursor_fini(&zc);
2603	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
2604		kmem_free(outbuf, bufsize);
2605
2606	if (error == ENOENT)
2607		error = 0;
2608
2609	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2610
2611	uio->uio_loffset = offset;
2612	ZFS_EXIT(zfsvfs);
2613	if (error != 0 && cookies != NULL) {
2614		free(*cookies, M_TEMP);
2615		*cookies = NULL;
2616		*ncookies = 0;
2617	}
2618	return (error);
2619}
2620
2621ulong_t zfs_fsync_sync_cnt = 4;
2622
2623static int
2624zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
2625{
2626	znode_t	*zp = VTOZ(vp);
2627	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2628
2629	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2630
2631	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
2632		ZFS_ENTER(zfsvfs);
2633		ZFS_VERIFY_ZP(zp);
2634		zil_commit(zfsvfs->z_log, zp->z_id);
2635		ZFS_EXIT(zfsvfs);
2636	}
2637	return (0);
2638}
2639
2640
2641/*
2642 * Get the requested file attributes and place them in the provided
2643 * vattr structure.
2644 *
2645 *	IN:	vp	- vnode of file.
2646 *		vap	- va_mask identifies requested attributes.
2647 *			  If AT_XVATTR set, then optional attrs are requested
2648 *		flags	- ATTR_NOACLCHECK (CIFS server context)
2649 *		cr	- credentials of caller.
2650 *		ct	- caller context
2651 *
2652 *	OUT:	vap	- attribute values.
2653 *
2654 *	RETURN:	0 (always succeeds).
2655 */
2656/* ARGSUSED */
2657static int
2658zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2659    caller_context_t *ct)
2660{
2661	znode_t *zp = VTOZ(vp);
2662	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2663	int	error = 0;
2664	uint32_t blksize;
2665	u_longlong_t nblocks;
2666	uint64_t links;
2667	uint64_t mtime[2], ctime[2], crtime[2], rdev;
2668	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
2669	xoptattr_t *xoap = NULL;
2670	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2671	sa_bulk_attr_t bulk[4];
2672	int count = 0;
2673
2674	ZFS_ENTER(zfsvfs);
2675	ZFS_VERIFY_ZP(zp);
2676
2677	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
2678
2679	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
2680	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
2681	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
2682	if (vp->v_type == VBLK || vp->v_type == VCHR)
2683		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
2684		    &rdev, 8);
2685
2686	if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
2687		ZFS_EXIT(zfsvfs);
2688		return (error);
2689	}
2690
2691	/*
2692	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
2693	 * Also, if we are the owner don't bother, since owner should
2694	 * always be allowed to read basic attributes of file.
2695	 */
2696	if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
2697	    (vap->va_uid != crgetuid(cr))) {
2698		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
2699		    skipaclchk, cr)) {
2700			ZFS_EXIT(zfsvfs);
2701			return (error);
2702		}
2703	}
2704
2705	/*
2706	 * Return all attributes.  It's cheaper to provide the answer
2707	 * than to determine whether we were asked the question.
2708	 */
2709
2710	vap->va_type = IFTOVT(zp->z_mode);
2711	vap->va_mode = zp->z_mode & ~S_IFMT;
2712#ifdef illumos
2713	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
2714#else
2715	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
2716#endif
2717	vap->va_nodeid = zp->z_id;
2718	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
2719		links = zp->z_links + 1;
2720	else
2721		links = zp->z_links;
2722	vap->va_nlink = MIN(links, LINK_MAX);	/* nlink_t limit! */
2723	vap->va_size = zp->z_size;
2724#ifdef illumos
2725	vap->va_rdev = vp->v_rdev;
2726#else
2727	if (vp->v_type == VBLK || vp->v_type == VCHR)
2728		vap->va_rdev = zfs_cmpldev(rdev);
2729#endif
2730	vap->va_seq = zp->z_seq;
2731	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */
2732     	vap->va_filerev = zp->z_seq;
2733
2734	/*
2735	 * Add in any requested optional attributes and the create time.
2736	 * Also set the corresponding bits in the returned attribute bitmap.
2737	 */
2738	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
2739		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
2740			xoap->xoa_archive =
2741			    ((zp->z_pflags & ZFS_ARCHIVE) != 0);
2742			XVA_SET_RTN(xvap, XAT_ARCHIVE);
2743		}
2744
2745		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
2746			xoap->xoa_readonly =
2747			    ((zp->z_pflags & ZFS_READONLY) != 0);
2748			XVA_SET_RTN(xvap, XAT_READONLY);
2749		}
2750
2751		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
2752			xoap->xoa_system =
2753			    ((zp->z_pflags & ZFS_SYSTEM) != 0);
2754			XVA_SET_RTN(xvap, XAT_SYSTEM);
2755		}
2756
2757		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
2758			xoap->xoa_hidden =
2759			    ((zp->z_pflags & ZFS_HIDDEN) != 0);
2760			XVA_SET_RTN(xvap, XAT_HIDDEN);
2761		}
2762
2763		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2764			xoap->xoa_nounlink =
2765			    ((zp->z_pflags & ZFS_NOUNLINK) != 0);
2766			XVA_SET_RTN(xvap, XAT_NOUNLINK);
2767		}
2768
2769		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2770			xoap->xoa_immutable =
2771			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
2772			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
2773		}
2774
2775		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2776			xoap->xoa_appendonly =
2777			    ((zp->z_pflags & ZFS_APPENDONLY) != 0);
2778			XVA_SET_RTN(xvap, XAT_APPENDONLY);
2779		}
2780
2781		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2782			xoap->xoa_nodump =
2783			    ((zp->z_pflags & ZFS_NODUMP) != 0);
2784			XVA_SET_RTN(xvap, XAT_NODUMP);
2785		}
2786
2787		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
2788			xoap->xoa_opaque =
2789			    ((zp->z_pflags & ZFS_OPAQUE) != 0);
2790			XVA_SET_RTN(xvap, XAT_OPAQUE);
2791		}
2792
2793		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2794			xoap->xoa_av_quarantined =
2795			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
2796			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
2797		}
2798
2799		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2800			xoap->xoa_av_modified =
2801			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
2802			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
2803		}
2804
2805		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
2806		    vp->v_type == VREG) {
2807			zfs_sa_get_scanstamp(zp, xvap);
2808		}
2809
2810		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2811			xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
2812			XVA_SET_RTN(xvap, XAT_REPARSE);
2813		}
2814		if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
2815			xoap->xoa_generation = zp->z_gen;
2816			XVA_SET_RTN(xvap, XAT_GEN);
2817		}
2818
2819		if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
2820			xoap->xoa_offline =
2821			    ((zp->z_pflags & ZFS_OFFLINE) != 0);
2822			XVA_SET_RTN(xvap, XAT_OFFLINE);
2823		}
2824
2825		if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
2826			xoap->xoa_sparse =
2827			    ((zp->z_pflags & ZFS_SPARSE) != 0);
2828			XVA_SET_RTN(xvap, XAT_SPARSE);
2829		}
2830	}
2831
2832	ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
2833	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
2834	ZFS_TIME_DECODE(&vap->va_ctime, ctime);
2835	ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
2836
2837
2838	sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
2839	vap->va_blksize = blksize;
2840	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */
2841
2842	if (zp->z_blksz == 0) {
2843		/*
2844		 * Block size hasn't been set; suggest maximal I/O transfers.
2845		 */
2846		vap->va_blksize = zfsvfs->z_max_blksz;
2847	}
2848
2849	ZFS_EXIT(zfsvfs);
2850	return (0);
2851}
2852
2853/*
2854 * Set the file attributes to the values contained in the
2855 * vattr structure.
2856 *
2857 *	IN:	vp	- vnode of file to be modified.
2858 *		vap	- new attribute values.
2859 *			  If AT_XVATTR set, then optional attrs are being set
2860 *		flags	- ATTR_UTIME set if non-default time values provided.
2861 *			- ATTR_NOACLCHECK (CIFS context only).
2862 *		cr	- credentials of caller.
2863 *		ct	- caller context
2864 *
2865 *	RETURN:	0 on success, error code on failure.
2866 *
2867 * Timestamps:
2868 *	vp - ctime updated, mtime updated if size changed.
2869 */
2870/* ARGSUSED */
2871static int
2872zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2873    caller_context_t *ct)
2874{
2875	znode_t		*zp = VTOZ(vp);
2876	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2877	zilog_t		*zilog;
2878	dmu_tx_t	*tx;
2879	vattr_t		oldva;
2880	xvattr_t	tmpxvattr;
2881	uint_t		mask = vap->va_mask;
2882	uint_t		saved_mask = 0;
2883	uint64_t	saved_mode;
2884	int		trim_mask = 0;
2885	uint64_t	new_mode;
2886	uint64_t	new_uid, new_gid;
2887	uint64_t	xattr_obj;
2888	uint64_t	mtime[2], ctime[2];
2889	znode_t		*attrzp;
2890	int		need_policy = FALSE;
2891	int		err, err2;
2892	zfs_fuid_info_t *fuidp = NULL;
2893	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
2894	xoptattr_t	*xoap;
2895	zfs_acl_t	*aclp;
2896	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2897	boolean_t	fuid_dirtied = B_FALSE;
2898	sa_bulk_attr_t	bulk[7], xattr_bulk[7];
2899	int		count = 0, xattr_count = 0;
2900
2901	if (mask == 0)
2902		return (0);
2903
2904	if (mask & AT_NOSET)
2905		return (SET_ERROR(EINVAL));
2906
2907	ZFS_ENTER(zfsvfs);
2908	ZFS_VERIFY_ZP(zp);
2909
2910	zilog = zfsvfs->z_log;
2911
2912	/*
2913	 * Make sure that if we have ephemeral uid/gid or xvattr specified
2914	 * that file system is at proper version level
2915	 */
2916
2917	if (zfsvfs->z_use_fuids == B_FALSE &&
2918	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2919	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2920	    (mask & AT_XVATTR))) {
2921		ZFS_EXIT(zfsvfs);
2922		return (SET_ERROR(EINVAL));
2923	}
2924
2925	if (mask & AT_SIZE && vp->v_type == VDIR) {
2926		ZFS_EXIT(zfsvfs);
2927		return (SET_ERROR(EISDIR));
2928	}
2929
2930	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
2931		ZFS_EXIT(zfsvfs);
2932		return (SET_ERROR(EINVAL));
2933	}
2934
2935	/*
2936	 * If this is an xvattr_t, then get a pointer to the structure of
2937	 * optional attributes.  If this is NULL, then we have a vattr_t.
2938	 */
2939	xoap = xva_getxoptattr(xvap);
2940
2941	xva_init(&tmpxvattr);
2942
2943	/*
2944	 * Immutable files can only alter immutable bit and atime
2945	 */
2946	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
2947	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
2948	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2949		ZFS_EXIT(zfsvfs);
2950		return (SET_ERROR(EPERM));
2951	}
2952
2953	/*
2954	 * Note: ZFS_READONLY is handled in zfs_zaccess_common.
2955	 */
2956
2957	/*
2958	 * Verify timestamps doesn't overflow 32 bits.
2959	 * ZFS can handle large timestamps, but 32bit syscalls can't
2960	 * handle times greater than 2039.  This check should be removed
2961	 * once large timestamps are fully supported.
2962	 */
2963	if (mask & (AT_ATIME | AT_MTIME)) {
2964		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2965		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2966			ZFS_EXIT(zfsvfs);
2967			return (SET_ERROR(EOVERFLOW));
2968		}
2969	}
2970	if (xoap && (mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME) &&
2971	    TIMESPEC_OVERFLOW(&vap->va_birthtime)) {
2972		ZFS_EXIT(zfsvfs);
2973		return (SET_ERROR(EOVERFLOW));
2974	}
2975
2976	attrzp = NULL;
2977	aclp = NULL;
2978
2979	/* Can this be moved to before the top label? */
2980	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
2981		ZFS_EXIT(zfsvfs);
2982		return (SET_ERROR(EROFS));
2983	}
2984
2985	/*
2986	 * First validate permissions
2987	 */
2988
2989	if (mask & AT_SIZE) {
2990		/*
2991		 * XXX - Note, we are not providing any open
2992		 * mode flags here (like FNDELAY), so we may
2993		 * block if there are locks present... this
2994		 * should be addressed in openat().
2995		 */
2996		/* XXX - would it be OK to generate a log record here? */
2997		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2998		if (err) {
2999			ZFS_EXIT(zfsvfs);
3000			return (err);
3001		}
3002	}
3003
3004	if (mask & (AT_ATIME|AT_MTIME) ||
3005	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
3006	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
3007	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
3008	    XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
3009	    XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
3010	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
3011	    XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
3012		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
3013		    skipaclchk, cr);
3014	}
3015
3016	if (mask & (AT_UID|AT_GID)) {
3017		int	idmask = (mask & (AT_UID|AT_GID));
3018		int	take_owner;
3019		int	take_group;
3020
3021		/*
3022		 * NOTE: even if a new mode is being set,
3023		 * we may clear S_ISUID/S_ISGID bits.
3024		 */
3025
3026		if (!(mask & AT_MODE))
3027			vap->va_mode = zp->z_mode;
3028
3029		/*
3030		 * Take ownership or chgrp to group we are a member of
3031		 */
3032
3033		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
3034		take_group = (mask & AT_GID) &&
3035		    zfs_groupmember(zfsvfs, vap->va_gid, cr);
3036
3037		/*
3038		 * If both AT_UID and AT_GID are set then take_owner and
3039		 * take_group must both be set in order to allow taking
3040		 * ownership.
3041		 *
3042		 * Otherwise, send the check through secpolicy_vnode_setattr()
3043		 *
3044		 */
3045
3046		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
3047		    ((idmask == AT_UID) && take_owner) ||
3048		    ((idmask == AT_GID) && take_group)) {
3049			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
3050			    skipaclchk, cr) == 0) {
3051				/*
3052				 * Remove setuid/setgid for non-privileged users
3053				 */
3054				secpolicy_setid_clear(vap, vp, cr);
3055				trim_mask = (mask & (AT_UID|AT_GID));
3056			} else {
3057				need_policy =  TRUE;
3058			}
3059		} else {
3060			need_policy =  TRUE;
3061		}
3062	}
3063
3064	oldva.va_mode = zp->z_mode;
3065	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
3066	if (mask & AT_XVATTR) {
3067		/*
3068		 * Update xvattr mask to include only those attributes
3069		 * that are actually changing.
3070		 *
3071		 * the bits will be restored prior to actually setting
3072		 * the attributes so the caller thinks they were set.
3073		 */
3074		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
3075			if (xoap->xoa_appendonly !=
3076			    ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
3077				need_policy = TRUE;
3078			} else {
3079				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
3080				XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
3081			}
3082		}
3083
3084		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
3085			if (xoap->xoa_nounlink !=
3086			    ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
3087				need_policy = TRUE;
3088			} else {
3089				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
3090				XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
3091			}
3092		}
3093
3094		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
3095			if (xoap->xoa_immutable !=
3096			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
3097				need_policy = TRUE;
3098			} else {
3099				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
3100				XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
3101			}
3102		}
3103
3104		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
3105			if (xoap->xoa_nodump !=
3106			    ((zp->z_pflags & ZFS_NODUMP) != 0)) {
3107				need_policy = TRUE;
3108			} else {
3109				XVA_CLR_REQ(xvap, XAT_NODUMP);
3110				XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
3111			}
3112		}
3113
3114		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
3115			if (xoap->xoa_av_modified !=
3116			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
3117				need_policy = TRUE;
3118			} else {
3119				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
3120				XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
3121			}
3122		}
3123
3124		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
3125			if ((vp->v_type != VREG &&
3126			    xoap->xoa_av_quarantined) ||
3127			    xoap->xoa_av_quarantined !=
3128			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
3129				need_policy = TRUE;
3130			} else {
3131				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
3132				XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
3133			}
3134		}
3135
3136		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
3137			ZFS_EXIT(zfsvfs);
3138			return (SET_ERROR(EPERM));
3139		}
3140
3141		if (need_policy == FALSE &&
3142		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
3143		    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
3144			need_policy = TRUE;
3145		}
3146	}
3147
3148	if (mask & AT_MODE) {
3149		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
3150			err = secpolicy_setid_setsticky_clear(vp, vap,
3151			    &oldva, cr);
3152			if (err) {
3153				ZFS_EXIT(zfsvfs);
3154				return (err);
3155			}
3156			trim_mask |= AT_MODE;
3157		} else {
3158			need_policy = TRUE;
3159		}
3160	}
3161
3162	if (need_policy) {
3163		/*
3164		 * If trim_mask is set then take ownership
3165		 * has been granted or write_acl is present and user
3166		 * has the ability to modify mode.  In that case remove
3167		 * UID|GID and or MODE from mask so that
3168		 * secpolicy_vnode_setattr() doesn't revoke it.
3169		 */
3170
3171		if (trim_mask) {
3172			saved_mask = vap->va_mask;
3173			vap->va_mask &= ~trim_mask;
3174			if (trim_mask & AT_MODE) {
3175				/*
3176				 * Save the mode, as secpolicy_vnode_setattr()
3177				 * will overwrite it with ova.va_mode.
3178				 */
3179				saved_mode = vap->va_mode;
3180			}
3181		}
3182		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
3183		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
3184		if (err) {
3185			ZFS_EXIT(zfsvfs);
3186			return (err);
3187		}
3188
3189		if (trim_mask) {
3190			vap->va_mask |= saved_mask;
3191			if (trim_mask & AT_MODE) {
3192				/*
3193				 * Recover the mode after
3194				 * secpolicy_vnode_setattr().
3195				 */
3196				vap->va_mode = saved_mode;
3197			}
3198		}
3199	}
3200
3201	/*
3202	 * secpolicy_vnode_setattr, or take ownership may have
3203	 * changed va_mask
3204	 */
3205	mask = vap->va_mask;
3206
3207	if ((mask & (AT_UID | AT_GID))) {
3208		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
3209		    &xattr_obj, sizeof (xattr_obj));
3210
3211		if (err == 0 && xattr_obj) {
3212			err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
3213			if (err == 0) {
3214				err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE);
3215				if (err != 0)
3216					vrele(ZTOV(attrzp));
3217			}
3218			if (err)
3219				goto out2;
3220		}
3221		if (mask & AT_UID) {
3222			new_uid = zfs_fuid_create(zfsvfs,
3223			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
3224			if (new_uid != zp->z_uid &&
3225			    zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
3226				if (attrzp)
3227					vput(ZTOV(attrzp));
3228				err = SET_ERROR(EDQUOT);
3229				goto out2;
3230			}
3231		}
3232
3233		if (mask & AT_GID) {
3234			new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
3235			    cr, ZFS_GROUP, &fuidp);
3236			if (new_gid != zp->z_gid &&
3237			    zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
3238				if (attrzp)
3239					vput(ZTOV(attrzp));
3240				err = SET_ERROR(EDQUOT);
3241				goto out2;
3242			}
3243		}
3244	}
3245	tx = dmu_tx_create(zfsvfs->z_os);
3246
3247	if (mask & AT_MODE) {
3248		uint64_t pmode = zp->z_mode;
3249		uint64_t acl_obj;
3250		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
3251
3252		if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
3253		    !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
3254			err = SET_ERROR(EPERM);
3255			goto out;
3256		}
3257
3258		if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
3259			goto out;
3260
3261		if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
3262			/*
3263			 * Are we upgrading ACL from old V0 format
3264			 * to V1 format?
3265			 */
3266			if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
3267			    zfs_znode_acl_version(zp) ==
3268			    ZFS_ACL_VERSION_INITIAL) {
3269				dmu_tx_hold_free(tx, acl_obj, 0,
3270				    DMU_OBJECT_END);
3271				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3272				    0, aclp->z_acl_bytes);
3273			} else {
3274				dmu_tx_hold_write(tx, acl_obj, 0,
3275				    aclp->z_acl_bytes);
3276			}
3277		} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3278			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3279			    0, aclp->z_acl_bytes);
3280		}
3281		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3282	} else {
3283		if ((mask & AT_XVATTR) &&
3284		    XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3285			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3286		else
3287			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3288	}
3289
3290	if (attrzp) {
3291		dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3292	}
3293
3294	fuid_dirtied = zfsvfs->z_fuid_dirty;
3295	if (fuid_dirtied)
3296		zfs_fuid_txhold(zfsvfs, tx);
3297
3298	zfs_sa_upgrade_txholds(tx, zp);
3299
3300	err = dmu_tx_assign(tx, TXG_WAIT);
3301	if (err)
3302		goto out;
3303
3304	count = 0;
3305	/*
3306	 * Set each attribute requested.
3307	 * We group settings according to the locks they need to acquire.
3308	 *
3309	 * Note: you cannot set ctime directly, although it will be
3310	 * updated as a side-effect of calling this function.
3311	 */
3312
3313	if (mask & (AT_UID|AT_GID|AT_MODE))
3314		mutex_enter(&zp->z_acl_lock);
3315
3316	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3317	    &zp->z_pflags, sizeof (zp->z_pflags));
3318
3319	if (attrzp) {
3320		if (mask & (AT_UID|AT_GID|AT_MODE))
3321			mutex_enter(&attrzp->z_acl_lock);
3322		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3323		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
3324		    sizeof (attrzp->z_pflags));
3325	}
3326
3327	if (mask & (AT_UID|AT_GID)) {
3328
3329		if (mask & AT_UID) {
3330			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
3331			    &new_uid, sizeof (new_uid));
3332			zp->z_uid = new_uid;
3333			if (attrzp) {
3334				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3335				    SA_ZPL_UID(zfsvfs), NULL, &new_uid,
3336				    sizeof (new_uid));
3337				attrzp->z_uid = new_uid;
3338			}
3339		}
3340
3341		if (mask & AT_GID) {
3342			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
3343			    NULL, &new_gid, sizeof (new_gid));
3344			zp->z_gid = new_gid;
3345			if (attrzp) {
3346				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3347				    SA_ZPL_GID(zfsvfs), NULL, &new_gid,
3348				    sizeof (new_gid));
3349				attrzp->z_gid = new_gid;
3350			}
3351		}
3352		if (!(mask & AT_MODE)) {
3353			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
3354			    NULL, &new_mode, sizeof (new_mode));
3355			new_mode = zp->z_mode;
3356		}
3357		err = zfs_acl_chown_setattr(zp);
3358		ASSERT(err == 0);
3359		if (attrzp) {
3360			err = zfs_acl_chown_setattr(attrzp);
3361			ASSERT(err == 0);
3362		}
3363	}
3364
3365	if (mask & AT_MODE) {
3366		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
3367		    &new_mode, sizeof (new_mode));
3368		zp->z_mode = new_mode;
3369		ASSERT3U((uintptr_t)aclp, !=, 0);
3370		err = zfs_aclset_common(zp, aclp, cr, tx);
3371		ASSERT0(err);
3372		if (zp->z_acl_cached)
3373			zfs_acl_free(zp->z_acl_cached);
3374		zp->z_acl_cached = aclp;
3375		aclp = NULL;
3376	}
3377
3378
3379	if (mask & AT_ATIME) {
3380		ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
3381		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
3382		    &zp->z_atime, sizeof (zp->z_atime));
3383	}
3384
3385	if (mask & AT_MTIME) {
3386		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
3387		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
3388		    mtime, sizeof (mtime));
3389	}
3390
3391	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
3392	if (mask & AT_SIZE && !(mask & AT_MTIME)) {
3393		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
3394		    NULL, mtime, sizeof (mtime));
3395		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3396		    &ctime, sizeof (ctime));
3397		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
3398		    B_TRUE);
3399	} else if (mask != 0) {
3400		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3401		    &ctime, sizeof (ctime));
3402		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
3403		    B_TRUE);
3404		if (attrzp) {
3405			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3406			    SA_ZPL_CTIME(zfsvfs), NULL,
3407			    &ctime, sizeof (ctime));
3408			zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
3409			    mtime, ctime, B_TRUE);
3410		}
3411	}
3412	/*
3413	 * Do this after setting timestamps to prevent timestamp
3414	 * update from toggling bit
3415	 */
3416
3417	if (xoap && (mask & AT_XVATTR)) {
3418
3419		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
3420			xoap->xoa_createtime = vap->va_birthtime;
3421		/*
3422		 * restore trimmed off masks
3423		 * so that return masks can be set for caller.
3424		 */
3425
3426		if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
3427			XVA_SET_REQ(xvap, XAT_APPENDONLY);
3428		}
3429		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
3430			XVA_SET_REQ(xvap, XAT_NOUNLINK);
3431		}
3432		if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
3433			XVA_SET_REQ(xvap, XAT_IMMUTABLE);
3434		}
3435		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
3436			XVA_SET_REQ(xvap, XAT_NODUMP);
3437		}
3438		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
3439			XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
3440		}
3441		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
3442			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
3443		}
3444
3445		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3446			ASSERT(vp->v_type == VREG);
3447
3448		zfs_xvattr_set(zp, xvap, tx);
3449	}
3450
3451	if (fuid_dirtied)
3452		zfs_fuid_sync(zfsvfs, tx);
3453
3454	if (mask != 0)
3455		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3456
3457	if (mask & (AT_UID|AT_GID|AT_MODE))
3458		mutex_exit(&zp->z_acl_lock);
3459
3460	if (attrzp) {
3461		if (mask & (AT_UID|AT_GID|AT_MODE))
3462			mutex_exit(&attrzp->z_acl_lock);
3463	}
3464out:
3465	if (err == 0 && attrzp) {
3466		err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
3467		    xattr_count, tx);
3468		ASSERT(err2 == 0);
3469	}
3470
3471	if (attrzp)
3472		vput(ZTOV(attrzp));
3473
3474	if (aclp)
3475		zfs_acl_free(aclp);
3476
3477	if (fuidp) {
3478		zfs_fuid_info_free(fuidp);
3479		fuidp = NULL;
3480	}
3481
3482	if (err) {
3483		dmu_tx_abort(tx);
3484	} else {
3485		err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
3486		dmu_tx_commit(tx);
3487	}
3488
3489out2:
3490	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3491		zil_commit(zilog, 0);
3492
3493	ZFS_EXIT(zfsvfs);
3494	return (err);
3495}
3496
3497/*
3498 * We acquire all but fdvp locks using non-blocking acquisitions.  If we
3499 * fail to acquire any lock in the path we will drop all held locks,
3500 * acquire the new lock in a blocking fashion, and then release it and
3501 * restart the rename.  This acquire/release step ensures that we do not
3502 * spin on a lock waiting for release.  On error release all vnode locks
3503 * and decrement references the way tmpfs_rename() would do.
3504 */
static int
zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
    struct vnode *tdvp, struct vnode **tvpp,
    const struct componentname *scnp, const struct componentname *tcnp)
{
	zfsvfs_t	*zfsvfs;
	struct vnode	*nvp, *svp, *tvp;
	znode_t		*sdzp, *tdzp, *szp, *tzp;
	const char	*snm = scnp->cn_nameptr;
	const char	*tnm = tcnp->cn_nameptr;
	int error;

	/*
	 * The caller enters with tdvp (and *tvpp, if it exists) locked;
	 * drop those locks so that all four vnodes can be re-acquired
	 * below in a single deadlock-avoiding order.
	 */
	VOP_UNLOCK(tdvp, 0);
	if (*tvpp != NULL && *tvpp != tdvp)
		VOP_UNLOCK(*tvpp, 0);

relock:
	/* The source directory lock is the only one acquired blocking. */
	error = vn_lock(sdvp, LK_EXCLUSIVE);
	if (error)
		goto out;
	sdzp = VTOZ(sdvp);

	/* Try the target directory non-blocking to avoid lock-order deadlock. */
	error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
	if (error != 0) {
		VOP_UNLOCK(sdvp, 0);
		if (error != EBUSY)
			goto out;
		/*
		 * tdvp is busy: wait for it with a blocking acquire, then
		 * immediately drop it and restart so locks are always taken
		 * in the same order (see the function header comment).
		 */
		error = vn_lock(tdvp, LK_EXCLUSIVE);
		if (error)
			goto out;
		VOP_UNLOCK(tdvp, 0);
		goto relock;
	}
	tdzp = VTOZ(tdvp);

	/*
	 * Before using sdzp and tdzp we must ensure that they are live.
	 * As a porting legacy from illumos we have two things to worry
	 * about.  One is typical for FreeBSD and it is that the vnode is
	 * not reclaimed (doomed).  The other is that the znode is live.
	 * The current code can invalidate the znode without acquiring the
	 * corresponding vnode lock if the object represented by the znode
	 * and vnode is no longer valid after a rollback or receive operation.
	 * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock
	 * that protects the znodes from the invalidation.
	 */
	zfsvfs = sdzp->z_zfsvfs;
	ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
	ZFS_ENTER(zfsvfs);

	/*
	 * We can not use ZFS_VERIFY_ZP() here because it could directly return
	 * bypassing the cleanup code in the case of an error.
	 */
	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
		ZFS_EXIT(zfsvfs);
		VOP_UNLOCK(sdvp, 0);
		VOP_UNLOCK(tdvp, 0);
		error = SET_ERROR(EIO);
		goto out;
	}

	/*
	 * Re-resolve svp to be certain it still exists and fetch the
	 * correct vnode.
	 */
	error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS);
	if (error != 0) {
		/* Source entry invalid or not there. */
		ZFS_EXIT(zfsvfs);
		VOP_UNLOCK(sdvp, 0);
		VOP_UNLOCK(tdvp, 0);
		/* Renaming "." or ".." is reported as EINVAL. */
		if ((scnp->cn_flags & ISDOTDOT) != 0 ||
		    (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
			error = SET_ERROR(EINVAL);
		goto out;
	}
	svp = ZTOV(szp);

	/*
	 * Re-resolve tvp, if it disappeared we just carry on.
	 */
	error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0);
	if (error != 0) {
		ZFS_EXIT(zfsvfs);
		VOP_UNLOCK(sdvp, 0);
		VOP_UNLOCK(tdvp, 0);
		vrele(svp);
		if ((tcnp->cn_flags & ISDOTDOT) != 0)
			error = SET_ERROR(EINVAL);
		goto out;
	}
	if (tzp != NULL)
		tvp = ZTOV(tzp);
	else
		tvp = NULL;

	/*
	 * At present the vnode locks must be acquired before z_teardown_lock,
	 * although it would be more logical to use the opposite order.
	 */
	ZFS_EXIT(zfsvfs);

	/*
	 * Now try acquire locks on svp and tvp.
	 */
	nvp = svp;
	error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
	if (error != 0) {
		/* Back out everything held so far before waiting on svp. */
		VOP_UNLOCK(sdvp, 0);
		VOP_UNLOCK(tdvp, 0);
		if (tvp != NULL)
			vrele(tvp);
		if (error != EBUSY) {
			vrele(nvp);
			goto out;
		}
		/* Wait for svp, then drop it and restart (acquire/release step). */
		error = vn_lock(nvp, LK_EXCLUSIVE);
		if (error != 0) {
			vrele(nvp);
			goto out;
		}
		VOP_UNLOCK(nvp, 0);
		/*
		 * Concurrent rename race.
		 * XXX ?
		 */
		if (nvp == tdvp) {
			vrele(nvp);
			error = SET_ERROR(EINVAL);
			goto out;
		}
		vrele(*svpp);
		*svpp = nvp;
		goto relock;
	}
	/* Hand the freshly resolved, now-locked source vnode to the caller. */
	vrele(*svpp);
	*svpp = nvp;

	if (*tvpp != NULL)
		vrele(*tvpp);
	*tvpp = NULL;
	if (tvp != NULL) {
		nvp = tvp;
		error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
		if (error != 0) {
			VOP_UNLOCK(sdvp, 0);
			VOP_UNLOCK(tdvp, 0);
			VOP_UNLOCK(*svpp, 0);
			if (error != EBUSY) {
				vrele(nvp);
				goto out;
			}
			/* Same wait-then-restart dance for the target vnode. */
			error = vn_lock(nvp, LK_EXCLUSIVE);
			if (error != 0) {
				vrele(nvp);
				goto out;
			}
			vput(nvp);
			goto relock;
		}
		*tvpp = nvp;
	}

	return (0);

out:
	return (error);
}
3674
3675/*
3676 * Note that we must use VRELE_ASYNC in this function as it walks
3677 * up the directory tree and vrele may need to acquire an exclusive
3678 * lock if a last reference to a vnode is dropped.
3679 */
static int
zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
{
	zfsvfs_t	*zfsvfs;
	znode_t		*zp, *zp1;
	uint64_t	parent;
	int		error;

	zfsvfs = tdzp->z_zfsvfs;
	/* A directory cannot be renamed on top of itself. */
	if (tdzp == szp)
		return (SET_ERROR(EINVAL));
	/* Same parent directory: no ancestry walk is needed. */
	if (tdzp == sdzp)
		return (0);
	/* The filesystem root cannot be below the source. */
	if (tdzp->z_id == zfsvfs->z_root)
		return (0);
	zp = tdzp;
	/*
	 * Walk up the parent chain starting at tdzp.  If szp is ever
	 * found on that path, the target directory lies inside the
	 * source directory and the rename must be rejected.
	 */
	for (;;) {
		ASSERT(!zp->z_unlinked);
		if ((error = sa_lookup(zp->z_sa_hdl,
		    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
			break;

		if (parent == szp->z_id) {
			/* szp is an ancestor: /usr/a/b -> /usr/a/b/c/d. */
			error = SET_ERROR(EINVAL);
			break;
		}
		/* Reaching the root or the source's parent ends the walk. */
		if (parent == zfsvfs->z_root)
			break;
		if (parent == sdzp->z_id)
			break;

		error = zfs_zget(zfsvfs, parent, &zp1);
		if (error != 0)
			break;

		/* Drop the hold on the previous step (but never on tdzp). */
		if (zp != tdzp)
			VN_RELE_ASYNC(ZTOV(zp),
			    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
		zp = zp1;
	}

	if (error == ENOTDIR)
		panic("checkpath: .. not a directory\n");
	/* Release the hold acquired by the last zfs_zget(), if any. */
	if (zp != tdzp)
		VN_RELE_ASYNC(ZTOV(zp),
		    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
	return (error);
}
3728
3729/*
3730 * Move an entry from the provided source directory to the target
3731 * directory.  Change the entry name as indicated.
3732 *
3733 *	IN:	sdvp	- Source directory containing the "old entry".
3734 *		snm	- Old entry name.
3735 *		tdvp	- Target directory to contain the "new entry".
3736 *		tnm	- New entry name.
3737 *		cr	- credentials of caller.
3738 *		ct	- caller context
3739 *		flags	- case flags
3740 *
3741 *	RETURN:	0 on success, error code on failure.
3742 *
3743 * Timestamps:
3744 *	sdvp,tdvp - ctime|mtime updated
3745 */
/*ARGSUSED*/
static int
zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
    vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
    cred_t *cr)
{
	zfsvfs_t	*zfsvfs;
	znode_t		*sdzp, *tdzp, *szp, *tzp;
	zilog_t		*zilog = NULL;
	dmu_tx_t	*tx;
	char		*snm = scnp->cn_nameptr;
	char		*tnm = tcnp->cn_nameptr;
	int		error = 0;

	/* Reject renames across filesystems. */
	if ((*svpp)->v_mount != tdvp->v_mount ||
	    ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
		error = SET_ERROR(EXDEV);
		goto out;
	}

	/* The .zfs control tree is not a valid rename target either. */
	if (zfsctl_is_node(tdvp)) {
		error = SET_ERROR(EXDEV);
		goto out;
	}

	/*
	 * Lock all four vnodes to ensure safety and semantics of renaming.
	 */
	error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
	if (error != 0) {
		/* no vnodes are locked in the case of error here */
		return (error);
	}

	tdzp = VTOZ(tdvp);
	sdzp = VTOZ(sdvp);
	zfsvfs = tdzp->z_zfsvfs;
	zilog = zfsvfs->z_log;

	/*
	 * After we re-enter ZFS_ENTER() we will have to revalidate all
	 * znodes involved.
	 */
	ZFS_ENTER(zfsvfs);

	/* Reject target names that are not valid UTF-8 when required. */
	if (zfsvfs->z_utf8 && u8_validate(tnm,
	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		error = SET_ERROR(EILSEQ);
		goto unlockout;
	}

	/* If source and target are the same file, there is nothing to do. */
	if ((*svpp) == (*tvpp)) {
		error = 0;
		goto unlockout;
	}

	/* Refuse to move a mount point or rename over one. */
	if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
	    ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
	    (*tvpp)->v_mountedhere != NULL)) {
		error = SET_ERROR(EXDEV);
		goto unlockout;
	}

	/*
	 * We can not use ZFS_VERIFY_ZP() here because it could directly return
	 * bypassing the cleanup code in the case of an error.
	 */
	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
		error = SET_ERROR(EIO);
		goto unlockout;
	}

	/* Revalidate the source and (optional) target znodes too. */
	szp = VTOZ(*svpp);
	tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
	if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) {
		error = SET_ERROR(EIO);
		goto unlockout;
	}

	/*
	 * This is to prevent the creation of links into attribute space
	 * by renaming a linked file into/outof an attribute directory.
	 * See the comment in zfs_link() for why this is considered bad.
	 */
	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
		error = SET_ERROR(EINVAL);
		goto unlockout;
	}

	/*
	 * Must have write access at the source to remove the old entry
	 * and write access at the target to create the new entry.
	 * Note that if target and source are the same, this can be
	 * done in a single check.
	 */
	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
		goto unlockout;

	if ((*svpp)->v_type == VDIR) {
		/*
		 * Avoid ".", "..", and aliases of "." for obvious reasons.
		 */
		if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
		    sdzp == szp ||
		    (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
			error = EINVAL;
			goto unlockout;
		}

		/*
		 * Check to make sure rename is valid.
		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
		 */
		if (error = zfs_rename_check(szp, sdzp, tdzp))
			goto unlockout;
	}

	/*
	 * Does target exist?
	 */
	if (tzp) {
		/*
		 * Source and target must be the same type.
		 */
		if ((*svpp)->v_type == VDIR) {
			if ((*tvpp)->v_type != VDIR) {
				error = SET_ERROR(ENOTDIR);
				goto unlockout;
			} else {
				/* Drop stale name cache entries for both dirs. */
				cache_purge(tdvp);
				if (sdvp != tdvp)
					cache_purge(sdvp);
			}
		} else {
			if ((*tvpp)->v_type == VDIR) {
				error = SET_ERROR(EISDIR);
				goto unlockout;
			}
		}
	}

	/*
	 * NOTE(review): 'ct' is not declared in this function; these
	 * vnevent_*() calls presumably compile only because the macros
	 * are no-ops on this platform and never evaluate 'ct' — confirm.
	 */
	vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
	if (tzp)
		vnevent_rename_dest(*tvpp, tdvp, tnm, ct);

	/*
	 * notify the target directory if it is not the same
	 * as source directory.
	 */
	if (tdvp != sdvp) {
		vnevent_rename_dest_dir(tdvp, ct);
	}

	/*
	 * Reserve everything the rename may touch in one transaction:
	 * both directory ZAPs, the SAs of all involved znodes, and the
	 * unlinked set in case an overwritten target is destroyed.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
	if (sdzp != tdzp) {
		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, tdzp);
	}
	if (tzp) {
		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, tzp);
	}

	zfs_sa_upgrade_txholds(tx, szp);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		goto unlockout;
	}


	if (tzp)	/* Attempt to remove the existing target */
		error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);

	if (error == 0) {
		error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
		if (error == 0) {
			/* Flag the renamed file as modified (ZFS_AV_MODIFIED). */
			szp->z_pflags |= ZFS_AV_MODIFIED;

			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
			ASSERT0(error);

			error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING,
			    NULL);
			if (error == 0) {
				zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
				    snm, tdzp, tnm, szp);

				/*
				 * Update path information for the target vnode
				 */
				vn_renamepath(tdvp, *svpp, tnm, strlen(tnm));
			} else {
				/*
				 * At this point, we have successfully created
				 * the target name, but have failed to remove
				 * the source name.  Since the create was done
				 * with the ZRENAMING flag, there are
				 * complications; for one, the link count is
				 * wrong.  The easiest way to deal with this
				 * is to remove the newly created target, and
				 * return the original error.  This must
				 * succeed; fortunately, it is very unlikely to
				 * fail, since we just created it.
				 */
				VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx,
				    ZRENAMING, NULL), ==, 0);
			}
		}
		if (error == 0) {
			/* Invalidate name cache entries affected by the move. */
			cache_purge(*svpp);
			if (*tvpp != NULL)
				cache_purge(*tvpp);
			cache_purge_negative(tdvp);
		}
	}

	dmu_tx_commit(tx);

unlockout:			/* all 4 vnodes are locked, ZFS_ENTER called */
	ZFS_EXIT(zfsvfs);
	VOP_UNLOCK(*svpp, 0);
	VOP_UNLOCK(sdvp, 0);

out:				/* original two vnodes are locked */
	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	if (*tvpp != NULL)
		VOP_UNLOCK(*tvpp, 0);
	if (tdvp != *tvpp)
		VOP_UNLOCK(tdvp, 0);
	return (error);
}
3988
3989/*
3990 * Insert the indicated symbolic reference entry into the directory.
3991 *
3992 *	IN:	dvp	- Directory to contain new symbolic link.
3993 *		link	- Name for new symlink entry.
3994 *		vap	- Attributes of new entry.
3995 *		cr	- credentials of caller.
3996 *		ct	- caller context
3997 *		flags	- case flags
3998 *
3999 *	RETURN:	0 on success, error code on failure.
4000 *
4001 * Timestamps:
4002 *	dvp - ctime|mtime updated
4003 */
4004/*ARGSUSED*/
static int
zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
    cred_t *cr, kthread_t *td)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	uint64_t	len = strlen(link);
	int		error;
	zfs_acl_ids_t	acl_ids;
	boolean_t	fuid_dirtied;
	uint64_t	txtype = TX_SYMLINK;
	int		flags = 0;

	ASSERT(vap->va_type == VLNK);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	/* Reject entry names that are not valid UTF-8 when required. */
	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	/* The link target must fit in a path. */
	if (len > MAXPATHLEN) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENAMETOOLONG));
	}

	/* Set up the ACL ids the new symlink will be created with. */
	if ((error = zfs_acl_ids_create(dzp, 0,
	    vap, cr, NULL, &acl_ids)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Attempt to lock directory; fail if entry already exists.
	 */
	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/* Caller needs permission to add entries to the directory. */
	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EDQUOT));
	}

	/* Reserve a vnode up front; released via getnewvnode_drop_reserve(). */
	getnewvnode_reserve(1);
	tx = dmu_tx_create(zfsvfs->z_os);
	fuid_dirtied = zfsvfs->z_fuid_dirty;
	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE + len);
	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
	/* A large ACL that cannot live in the SA needs its own spill write. */
	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
		    acl_ids.z_aclp->z_acl_bytes);
	}
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		getnewvnode_drop_reserve();
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Create a new object for the symlink.
	 * for version 4 ZPL datasets the symlink will be an SA attribute
	 */
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	/* Store the target either in an SA attribute or in the file data. */
	if (zp->z_is_sa)
		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
		    link, len, tx);
	else
		zfs_sa_symlink(zp, link, len, tx);

	/* The symlink's size is the length of its target string. */
	zp->z_size = len;
	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
	    &zp->z_size, sizeof (zp->z_size), tx);
	/*
	 * Insert the new object into the directory.
	 */
	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);

	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
	*vpp = ZTOV(zp);

	zfs_acl_ids_free(&acl_ids);

	dmu_tx_commit(tx);

	getnewvnode_drop_reserve();

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}
4126
4127/*
4128 * Return, in the buffer contained in the provided uio structure,
4129 * the symbolic path referred to by vp.
4130 *
4131 *	IN:	vp	- vnode of symbolic link.
4132 *		uio	- structure to contain the link path.
4133 *		cr	- credentials of caller.
4134 *		ct	- caller context
4135 *
4136 *	OUT:	uio	- structure containing the link path.
4137 *
4138 *	RETURN:	0 on success, error code on failure.
4139 *
4140 * Timestamps:
4141 *	vp - atime updated
4142 */
4143/* ARGSUSED */
4144static int
4145zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
4146{
4147	znode_t		*zp = VTOZ(vp);
4148	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4149	int		error;
4150
4151	ZFS_ENTER(zfsvfs);
4152	ZFS_VERIFY_ZP(zp);
4153
4154	if (zp->z_is_sa)
4155		error = sa_lookup_uio(zp->z_sa_hdl,
4156		    SA_ZPL_SYMLINK(zfsvfs), uio);
4157	else
4158		error = zfs_sa_readlink(zp, uio);
4159
4160	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4161
4162	ZFS_EXIT(zfsvfs);
4163	return (error);
4164}
4165
4166/*
4167 * Insert a new entry into directory tdvp referencing svp.
4168 *
4169 *	IN:	tdvp	- Directory to contain new entry.
4170 *		svp	- vnode of new entry.
4171 *		name	- name of new entry.
4172 *		cr	- credentials of caller.
4173 *		ct	- caller context
4174 *
4175 *	RETURN:	0 on success, error code on failure.
4176 *
4177 * Timestamps:
4178 *	tdvp - ctime|mtime updated
4179 *	 svp - ctime updated
4180 */
4181/* ARGSUSED */
static int
zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
    caller_context_t *ct, int flags)
{
	znode_t		*dzp = VTOZ(tdvp);
	znode_t		*tzp, *szp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	dmu_tx_t	*tx;
	int		error;
	uint64_t	parent;
	uid_t		owner;

	ASSERT(tdvp->v_type == VDIR);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	/*
	 * POSIX dictates that we return EPERM here.
	 * Better choices include ENOTSUP or EISDIR.
	 */
	if (svp->v_type == VDIR) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	szp = VTOZ(svp);
	ZFS_VERIFY_ZP(szp);

	/* Immutable, append-only and read-only files cannot gain links. */
	if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	/* Prevent links to .zfs/shares files */

	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
	    &parent, sizeof (uint64_t))) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	if (parent == zfsvfs->z_shares_dir) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	/* Reject entry names that are not valid UTF-8 when required. */
	if (zfsvfs->z_utf8 && u8_validate(name,
	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	/*
	 * We do not support links between attributes and non-attributes
	 * because of the potential security risk of creating links
	 * into "normal" file space in order to circumvent restrictions
	 * imposed in attribute space.
	 */
	if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}


	/* Only the owner, or a caller passing the link policy check, may link. */
	owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
	if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	/* Caller needs permission to add entries to the target directory. */
	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Attempt to lock directory; fail if entry already exists.
	 */
	error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW);
	if (error) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/* One transaction covers the directory ZAP and both znodes' SAs. */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	zfs_sa_upgrade_txholds(tx, szp);
	zfs_sa_upgrade_txholds(tx, dzp);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error) {
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	error = zfs_link_create(dzp, name, szp, tx, 0);

	if (error == 0) {
		uint64_t txtype = TX_LINK;
		zfs_log_link(zilog, tx, txtype, dzp, szp, name);
	}

	dmu_tx_commit(tx);

	if (error == 0) {
		vnevent_link(svp, ct);
	}

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}
4299
4300
/*
 * Vnode inactivation handler.  If the znode is gone (unmount, or a
 * rollback/receive invalidated it) or the file was unlinked, recycle
 * the vnode; otherwise flush a pending atime update to the znode's SA.
 */
/*ARGSUSED*/
void
zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	/* Reader side of the teardown lock: serializes against unmount. */
	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
	if (zp->z_sa_hdl == NULL) {
		/*
		 * The fs has been unmounted, or we did a
		 * suspend/resume and this file no longer exists.
		 */
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
		vrecycle(vp);
		return;
	}

	if (zp->z_unlinked) {
		/*
		 * Fast path to recycle a vnode of a removed file.
		 */
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
		vrecycle(vp);
		return;
	}

	/*
	 * NOTE(review): the z_unlinked == 0 test looks redundant given the
	 * early return above — confirm there is no concurrent transition.
	 */
	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);

		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, zp);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			dmu_tx_abort(tx);
		} else {
			/* Push the cached atime to the SA and clear the flag. */
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
			zp->z_atime_dirty = 0;
			dmu_tx_commit(tx);
		}
	}
	rw_exit(&zfsvfs->z_teardown_inactive_lock);
}
4346
4347
4348CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
4349CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
4350
4351/*ARGSUSED*/
4352static int
4353zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
4354{
4355	znode_t		*zp = VTOZ(vp);
4356	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4357	uint32_t	gen;
4358	uint64_t	gen64;
4359	uint64_t	object = zp->z_id;
4360	zfid_short_t	*zfid;
4361	int		size, i, error;
4362
4363	ZFS_ENTER(zfsvfs);
4364	ZFS_VERIFY_ZP(zp);
4365
4366	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
4367	    &gen64, sizeof (uint64_t))) != 0) {
4368		ZFS_EXIT(zfsvfs);
4369		return (error);
4370	}
4371
4372	gen = (uint32_t)gen64;
4373
4374	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
4375
4376#ifdef illumos
4377	if (fidp->fid_len < size) {
4378		fidp->fid_len = size;
4379		ZFS_EXIT(zfsvfs);
4380		return (SET_ERROR(ENOSPC));
4381	}
4382#else
4383	fidp->fid_len = size;
4384#endif
4385
4386	zfid = (zfid_short_t *)fidp;
4387
4388	zfid->zf_len = size;
4389
4390	for (i = 0; i < sizeof (zfid->zf_object); i++)
4391		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
4392
4393	/* Must have a non-zero generation number to distinguish from .zfs */
4394	if (gen == 0)
4395		gen = 1;
4396	for (i = 0; i < sizeof (zfid->zf_gen); i++)
4397		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
4398
4399	if (size == LONG_FID_LEN) {
4400		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
4401		zfid_long_t	*zlfid;
4402
4403		zlfid = (zfid_long_t *)fidp;
4404
4405		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
4406			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
4407
4408		/* XXX - this should be the generation number for the objset */
4409		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
4410			zlfid->zf_setgen[i] = 0;
4411	}
4412
4413	ZFS_EXIT(zfsvfs);
4414	return (0);
4415}
4416
/*
 * Report filesystem limits and capabilities (VOP_PATHCONF).  Queries
 * that need no znode state are answered without taking ZFS_ENTER.
 */
static int
zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
    caller_context_t *ct)
{
	znode_t		*zp, *xzp;
	zfsvfs_t	*zfsvfs;
	int		error;

	switch (cmd) {
	case _PC_LINK_MAX:
		*valp = INT_MAX;
		return (0);

	case _PC_FILESIZEBITS:
		/* File sizes are reported in 64 bits. */
		*valp = 64;
		return (0);
#ifdef illumos
	case _PC_XATTR_EXISTS:
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);
		*valp = 0;
		/* An empty name with ZXATTR resolves the hidden xattr dir. */
		error = zfs_dirent_lookup(zp, "", &xzp,
		    ZXATTR | ZEXISTS | ZSHARED);
		if (error == 0) {
			if (!zfs_dirempty(xzp))
				*valp = 1;
			vrele(ZTOV(xzp));
		} else if (error == ENOENT) {
			/*
			 * If there aren't extended attributes, it's the
			 * same as having zero of them.
			 */
			error = 0;
		}
		ZFS_EXIT(zfsvfs);
		return (error);

	case _PC_SATTR_ENABLED:
	case _PC_SATTR_EXISTS:
		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
		    (vp->v_type == VREG || vp->v_type == VDIR);
		return (0);

	case _PC_ACCESS_FILTERING:
		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
		    vp->v_type == VDIR;
		return (0);

	case _PC_ACL_ENABLED:
		*valp = _ACL_ACE_ENABLED;
		return (0);
#endif	/* illumos */
	case _PC_MIN_HOLE_SIZE:
		/* Holes are only detectable at block granularity. */
		*valp = (int)SPA_MINBLOCKSIZE;
		return (0);
#ifdef illumos
	case _PC_TIMESTAMP_RESOLUTION:
		/* nanosecond timestamp resolution */
		*valp = 1L;
		return (0);
#endif
	case _PC_ACL_EXTENDED:
		/* POSIX.1e ACLs are not supported; NFSv4 ACLs are (below). */
		*valp = 0;
		return (0);

	case _PC_ACL_NFS4:
		*valp = 1;
		return (0);

	case _PC_ACL_PATH_MAX:
		*valp = ACL_MAX_ENTRIES;
		return (0);

	default:
		return (EOPNOTSUPP);
	}
}
4496
4497/*ARGSUSED*/
4498static int
4499zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4500    caller_context_t *ct)
4501{
4502	znode_t *zp = VTOZ(vp);
4503	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4504	int error;
4505	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4506
4507	ZFS_ENTER(zfsvfs);
4508	ZFS_VERIFY_ZP(zp);
4509	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
4510	ZFS_EXIT(zfsvfs);
4511
4512	return (error);
4513}
4514
4515/*ARGSUSED*/
4516int
4517zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4518    caller_context_t *ct)
4519{
4520	znode_t *zp = VTOZ(vp);
4521	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4522	int error;
4523	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4524	zilog_t	*zilog = zfsvfs->z_log;
4525
4526	ZFS_ENTER(zfsvfs);
4527	ZFS_VERIFY_ZP(zp);
4528
4529	error = zfs_setacl(zp, vsecp, skipaclchk, cr);
4530
4531	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4532		zil_commit(zilog, 0);
4533
4534	ZFS_EXIT(zfsvfs);
4535	return (error);
4536}
4537
/*
 * Page-in handler backing VOP_GETPAGES(9): read file data into the
 * busied pages in 'ma' and, optionally, into up to *rbehind read-behind
 * and *rahead read-ahead pages.  Returns a zfs_vm_pagerret_* status.
 */
static int
zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
    int *rahead)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	objset_t *os = zp->z_zfsvfs->z_os;
	rl_t *rl;
	vm_object_t object;
	off_t start, end, obj_size;
	uint_t blksz;
	int pgsin_b, pgsin_a;	/* pages read behind / ahead */
	int error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Byte range covered by the requested pages. */
	start = IDX_TO_OFF(ma[0]->pindex);
	end = IDX_TO_OFF(ma[count - 1]->pindex + 1);

	/*
	 * Lock a range covering all required and optional pages.
	 * Note that we need to handle the case of the block size growing.
	 */
	for (;;) {
		blksz = zp->z_blksz;
		rl = zfs_range_lock(zp, rounddown(start, blksz),
		    roundup(end, blksz) - rounddown(start, blksz), RL_READER);
		/* Retry if the block size changed before we got the lock. */
		if (blksz == zp->z_blksz)
			break;
		zfs_range_unlock(rl);
	}

	/* Fail if the last requested page starts at or beyond EOF. */
	object = ma[0]->object;
	zfs_vmobject_wlock(object);
	obj_size = object->un_pager.vnp.vnp_size;
	zfs_vmobject_wunlock(object);
	if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (zfs_vm_pagerret_bad);
	}

	/* Read-behind: pages between the block boundary and 'start'. */
	pgsin_b = 0;
	if (rbehind != NULL) {
		pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz));
		pgsin_b = MIN(*rbehind, pgsin_b);
	}

	/* Read-ahead: pages from 'end' to the block boundary, clamped at EOF. */
	pgsin_a = 0;
	if (rahead != NULL) {
		pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end);
		if (end + IDX_TO_OFF(pgsin_a) >= obj_size)
			pgsin_a = OFF_TO_IDX(round_page(obj_size) - end);
		pgsin_a = MIN(*rahead, pgsin_a);
	}

	/*
	 * NB: we need to pass the exact byte size of the data that we expect
	 * to read after accounting for the file size.  This is required because
	 * ZFS will panic if we request DMU to read beyond the end of the last
	 * allocated block.
	 */
	error = dmu_read_pages(os, zp->z_id, ma, count, &pgsin_b, &pgsin_a,
	    MIN(end, obj_size) - (end - PAGE_SIZE));

	zfs_range_unlock(rl);
	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);

	if (error != 0)
		return (zfs_vm_pagerret_error);

	/* Report how many optional pages were actually filled in. */
	PCPU_INC(cnt.v_vnodein);
	PCPU_ADD(cnt.v_vnodepgsin, count + pgsin_b + pgsin_a);
	if (rbehind != NULL)
		*rbehind = pgsin_b;
	if (rahead != NULL)
		*rahead = pgsin_a;
	return (zfs_vm_pagerret_ok);
}
4619
4620static int
4621zfs_freebsd_getpages(ap)
4622	struct vop_getpages_args /* {
4623		struct vnode *a_vp;
4624		vm_page_t *a_m;
4625		int a_count;
4626		int *a_rbehind;
4627		int *a_rahead;
4628	} */ *ap;
4629{
4630
4631	return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind,
4632	    ap->a_rahead));
4633}
4634
/*
 * Page-out handler backing VOP_PUTPAGES(9): write the pages in 'ma'
 * back to the file in a single DMU transaction.  Per-page status is
 * returned through 'rtvals'.
 */
static int
zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
    int *rtvals)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	rl_t		*rl;
	dmu_tx_t	*tx;
	struct sf_buf	*sf;
	vm_object_t	object;
	vm_page_t	m;
	caddr_t		va;
	size_t		tocopy;
	size_t		lo_len;
	vm_ooffset_t	lo_off;
	vm_ooffset_t	off;
	uint_t		blksz;
	int		ncount;
	int		pcount;
	int		err;
	int		i;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	object = vp->v_object;
	pcount = btoc(len);
	ncount = pcount;

	KASSERT(ma[0]->object == object, ("mismatching object"));
	KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length"));

	/* Assume failure until the pages are actually written out. */
	for (i = 0; i < pcount; i++)
		rtvals[i] = zfs_vm_pagerret_error;

	/* Range-lock whole blocks covering the byte span being written. */
	off = IDX_TO_OFF(ma[0]->pindex);
	blksz = zp->z_blksz;
	lo_off = rounddown(off, blksz);
	lo_len = roundup(len + (off - lo_off), blksz);
	rl = zfs_range_lock(zp, lo_off, lo_len, RL_WRITER);

	/*
	 * Clip the write to the file size; pages lying wholly past EOF
	 * are reported as zfs_vm_pagerret_bad.
	 */
	zfs_vmobject_wlock(object);
	if (len + off > object->un_pager.vnp.vnp_size) {
		if (object->un_pager.vnp.vnp_size > off) {
			int pgoff;

			len = object->un_pager.vnp.vnp_size - off;
			ncount = btoc(len);
			if ((pgoff = (int)len & PAGE_MASK) != 0) {
				/*
				 * If the object is locked and the following
				 * conditions hold, then the page's dirty
				 * field cannot be concurrently changed by a
				 * pmap operation.
				 */
				m = ma[ncount - 1];
				vm_page_assert_sbusied(m);
				KASSERT(!pmap_page_is_write_mapped(m),
				    ("zfs_putpages: page %p is not read-only", m));
				vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
				    pgoff);
			}
		} else {
			len = 0;
			ncount = 0;
		}
		if (ncount < pcount) {
			for (i = ncount; i < pcount; i++) {
				rtvals[i] = zfs_vm_pagerret_bad;
			}
		}
	}
	zfs_vmobject_wunlock(object);

	if (ncount == 0)
		goto out;

	/* Refuse the write if the file's owner or group is over quota. */
	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
		goto out;
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_write(tx, zp->z_id, off, len);

	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		goto out;
	}

	/*
	 * Copy page by page for small-block files; otherwise hand the
	 * whole page run to the DMU at once.
	 */
	if (zp->z_blksz < PAGE_SIZE) {
		for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
			tocopy = len > PAGE_SIZE ? PAGE_SIZE : len;
			va = zfs_map_page(ma[i], &sf);
			dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
			zfs_unmap_page(sf);
		}
	} else {
		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx);
	}

	if (err == 0) {
		uint64_t mtime[2], ctime[2];
		sa_bulk_attr_t bulk[3];
		int count = 0;

		/* Update mtime/ctime and log the write in the ZIL. */
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
		    &mtime, 16);
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
		    &ctime, 16);
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
		    &zp->z_pflags, 8);
		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
		    B_TRUE);
		err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
		ASSERT0(err);
		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);

		/* Mark every written page clean and report success. */
		zfs_vmobject_wlock(object);
		for (i = 0; i < ncount; i++) {
			rtvals[i] = zfs_vm_pagerret_ok;
			vm_page_undirty(ma[i]);
		}
		zfs_vmobject_wunlock(object);
		PCPU_INC(cnt.v_vnodeout);
		PCPU_ADD(cnt.v_vnodepgsout, ncount);
	}
	dmu_tx_commit(tx);

out:
	zfs_range_unlock(rl);
	/* Synchronous or invalidating page-outs must reach stable storage. */
	if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 ||
	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zfsvfs->z_log, zp->z_id);
	ZFS_EXIT(zfsvfs);
	return (rtvals[0]);
}
4775
4776int
4777zfs_freebsd_putpages(ap)
4778	struct vop_putpages_args /* {
4779		struct vnode *a_vp;
4780		vm_page_t *a_m;
4781		int a_count;
4782		int a_sync;
4783		int *a_rtvals;
4784	} */ *ap;
4785{
4786
4787	return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync,
4788	    ap->a_rtvals));
4789}
4790
4791static int
4792zfs_freebsd_bmap(ap)
4793	struct vop_bmap_args /* {
4794		struct vnode *a_vp;
4795		daddr_t  a_bn;
4796		struct bufobj **a_bop;
4797		daddr_t *a_bnp;
4798		int *a_runp;
4799		int *a_runb;
4800	} */ *ap;
4801{
4802
4803	if (ap->a_bop != NULL)
4804		*ap->a_bop = &ap->a_vp->v_bufobj;
4805	if (ap->a_bnp != NULL)
4806		*ap->a_bnp = ap->a_bn;
4807	if (ap->a_runp != NULL)
4808		*ap->a_runp = 0;
4809	if (ap->a_runb != NULL)
4810		*ap->a_runb = 0;
4811
4812	return (0);
4813}
4814
4815static int
4816zfs_freebsd_open(ap)
4817	struct vop_open_args /* {
4818		struct vnode *a_vp;
4819		int a_mode;
4820		struct ucred *a_cred;
4821		struct thread *a_td;
4822	} */ *ap;
4823{
4824	vnode_t	*vp = ap->a_vp;
4825	znode_t *zp = VTOZ(vp);
4826	int error;
4827
4828	error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
4829	if (error == 0)
4830		vnode_create_vobject(vp, zp->z_size, ap->a_td);
4831	return (error);
4832}
4833
4834static int
4835zfs_freebsd_close(ap)
4836	struct vop_close_args /* {
4837		struct vnode *a_vp;
4838		int  a_fflag;
4839		struct ucred *a_cred;
4840		struct thread *a_td;
4841	} */ *ap;
4842{
4843
4844	return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL));
4845}
4846
4847static int
4848zfs_freebsd_ioctl(ap)
4849	struct vop_ioctl_args /* {
4850		struct vnode *a_vp;
4851		u_long a_command;
4852		caddr_t a_data;
4853		int a_fflag;
4854		struct ucred *cred;
4855		struct thread *td;
4856	} */ *ap;
4857{
4858
4859	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
4860	    ap->a_fflag, ap->a_cred, NULL, NULL));
4861}
4862
4863static int
4864ioflags(int ioflags)
4865{
4866	int flags = 0;
4867
4868	if (ioflags & IO_APPEND)
4869		flags |= FAPPEND;
4870	if (ioflags & IO_NDELAY)
4871		flags |= FNONBLOCK;
4872	if (ioflags & IO_SYNC)
4873		flags |= (FSYNC | FDSYNC | FRSYNC);
4874
4875	return (flags);
4876}
4877
4878static int
4879zfs_freebsd_read(ap)
4880	struct vop_read_args /* {
4881		struct vnode *a_vp;
4882		struct uio *a_uio;
4883		int a_ioflag;
4884		struct ucred *a_cred;
4885	} */ *ap;
4886{
4887
4888	return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
4889	    ap->a_cred, NULL));
4890}
4891
4892static int
4893zfs_freebsd_write(ap)
4894	struct vop_write_args /* {
4895		struct vnode *a_vp;
4896		struct uio *a_uio;
4897		int a_ioflag;
4898		struct ucred *a_cred;
4899	} */ *ap;
4900{
4901
4902	return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
4903	    ap->a_cred, NULL));
4904}
4905
4906static int
4907zfs_freebsd_access(ap)
4908	struct vop_access_args /* {
4909		struct vnode *a_vp;
4910		accmode_t a_accmode;
4911		struct ucred *a_cred;
4912		struct thread *a_td;
4913	} */ *ap;
4914{
4915	vnode_t *vp = ap->a_vp;
4916	znode_t *zp = VTOZ(vp);
4917	accmode_t accmode;
4918	int error = 0;
4919
4920	/*
4921	 * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND,
4922	 */
4923	accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
4924	if (accmode != 0)
4925		error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL);
4926
4927	/*
4928	 * VADMIN has to be handled by vaccess().
4929	 */
4930	if (error == 0) {
4931		accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
4932		if (accmode != 0) {
4933			error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
4934			    zp->z_gid, accmode, ap->a_cred, NULL);
4935		}
4936	}
4937
4938	/*
4939	 * For VEXEC, ensure that at least one execute bit is set for
4940	 * non-directories.
4941	 */
4942	if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
4943	    (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
4944		error = EACCES;
4945	}
4946
4947	return (error);
4948}
4949
4950static int
4951zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached)
4952{
4953	struct componentname *cnp = ap->a_cnp;
4954	char nm[NAME_MAX + 1];
4955
4956	ASSERT(cnp->cn_namelen < sizeof(nm));
4957	strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
4958
4959	return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
4960	    cnp->cn_cred, cnp->cn_thread, 0, cached));
4961}
4962
4963static int
4964zfs_freebsd_cachedlookup(struct vop_cachedlookup_args *ap)
4965{
4966
4967	return (zfs_freebsd_lookup((struct vop_lookup_args *)ap, B_TRUE));
4968}
4969
4970static int
4971zfs_cache_lookup(ap)
4972	struct vop_lookup_args /* {
4973		struct vnode *a_dvp;
4974		struct vnode **a_vpp;
4975		struct componentname *a_cnp;
4976	} */ *ap;
4977{
4978	zfsvfs_t *zfsvfs;
4979
4980	zfsvfs = ap->a_dvp->v_mount->mnt_data;
4981	if (zfsvfs->z_use_namecache)
4982		return (vfs_cache_lookup(ap));
4983	else
4984		return (zfs_freebsd_lookup(ap, B_FALSE));
4985}
4986
4987static int
4988zfs_freebsd_create(ap)
4989	struct vop_create_args /* {
4990		struct vnode *a_dvp;
4991		struct vnode **a_vpp;
4992		struct componentname *a_cnp;
4993		struct vattr *a_vap;
4994	} */ *ap;
4995{
4996	zfsvfs_t *zfsvfs;
4997	struct componentname *cnp = ap->a_cnp;
4998	vattr_t *vap = ap->a_vap;
4999	int error, mode;
5000
5001	ASSERT(cnp->cn_flags & SAVENAME);
5002
5003	vattr_init_mask(vap);
5004	mode = vap->va_mode & ALLPERMS;
5005	zfsvfs = ap->a_dvp->v_mount->mnt_data;
5006
5007	error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
5008	    ap->a_vpp, cnp->cn_cred, cnp->cn_thread);
5009	if (zfsvfs->z_use_namecache &&
5010	    error == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
5011		cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
5012	return (error);
5013}
5014
5015static int
5016zfs_freebsd_remove(ap)
5017	struct vop_remove_args /* {
5018		struct vnode *a_dvp;
5019		struct vnode *a_vp;
5020		struct componentname *a_cnp;
5021	} */ *ap;
5022{
5023
5024	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
5025
5026	return (zfs_remove(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
5027	    ap->a_cnp->cn_cred));
5028}
5029
5030static int
5031zfs_freebsd_mkdir(ap)
5032	struct vop_mkdir_args /* {
5033		struct vnode *a_dvp;
5034		struct vnode **a_vpp;
5035		struct componentname *a_cnp;
5036		struct vattr *a_vap;
5037	} */ *ap;
5038{
5039	vattr_t *vap = ap->a_vap;
5040
5041	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
5042
5043	vattr_init_mask(vap);
5044
5045	return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
5046	    ap->a_cnp->cn_cred));
5047}
5048
5049static int
5050zfs_freebsd_rmdir(ap)
5051	struct vop_rmdir_args /* {
5052		struct vnode *a_dvp;
5053		struct vnode *a_vp;
5054		struct componentname *a_cnp;
5055	} */ *ap;
5056{
5057	struct componentname *cnp = ap->a_cnp;
5058
5059	ASSERT(cnp->cn_flags & SAVENAME);
5060
5061	return (zfs_rmdir(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
5062}
5063
5064static int
5065zfs_freebsd_readdir(ap)
5066	struct vop_readdir_args /* {
5067		struct vnode *a_vp;
5068		struct uio *a_uio;
5069		struct ucred *a_cred;
5070		int *a_eofflag;
5071		int *a_ncookies;
5072		u_long **a_cookies;
5073	} */ *ap;
5074{
5075
5076	return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
5077	    ap->a_ncookies, ap->a_cookies));
5078}
5079
5080static int
5081zfs_freebsd_fsync(ap)
5082	struct vop_fsync_args /* {
5083		struct vnode *a_vp;
5084		int a_waitfor;
5085		struct thread *a_td;
5086	} */ *ap;
5087{
5088
5089	vop_stdfsync(ap);
5090	return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL));
5091}
5092
/*
 * VOP_GETATTR(9): fetch attributes via zfs_getattr() with the extended
 * (xvattr) attribute set requested, then translate the returned ZFS
 * file attribute bits into FreeBSD chflags(2) bits in va_flags.
 */
static int
zfs_freebsd_getattr(ap)
	struct vop_getattr_args /* {
		struct vnode *a_vp;
		struct vattr *a_vap;
		struct ucred *a_cred;
	} */ *ap;
{
	vattr_t *vap = ap->a_vap;
	xvattr_t xvap;
	u_long fflags = 0;
	int error;

	xva_init(&xvap);
	xvap.xva_vattr = *vap;
	xvap.xva_vattr.va_mask |= AT_XVATTR;

	/* Convert chflags into ZFS-type flags. */
	/* XXX: what about SF_SETTABLE?. */
	XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
	XVA_SET_REQ(&xvap, XAT_APPENDONLY);
	XVA_SET_REQ(&xvap, XAT_NOUNLINK);
	XVA_SET_REQ(&xvap, XAT_NODUMP);
	XVA_SET_REQ(&xvap, XAT_READONLY);
	XVA_SET_REQ(&xvap, XAT_ARCHIVE);
	XVA_SET_REQ(&xvap, XAT_SYSTEM);
	XVA_SET_REQ(&xvap, XAT_HIDDEN);
	XVA_SET_REQ(&xvap, XAT_REPARSE);
	XVA_SET_REQ(&xvap, XAT_OFFLINE);
	XVA_SET_REQ(&xvap, XAT_SPARSE);

	error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
	if (error != 0)
		return (error);

	/* Convert ZFS xattr into chflags. */
#define	FLAG_CHECK(fflag, xflag, xfield)	do {			\
	if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0)		\
		fflags |= (fflag);					\
} while (0)
	FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
	    xvap.xva_xoptattrs.xoa_immutable);
	FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
	    xvap.xva_xoptattrs.xoa_appendonly);
	FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
	    xvap.xva_xoptattrs.xoa_nounlink);
	FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE,
	    xvap.xva_xoptattrs.xoa_archive);
	FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
	    xvap.xva_xoptattrs.xoa_nodump);
	FLAG_CHECK(UF_READONLY, XAT_READONLY,
	    xvap.xva_xoptattrs.xoa_readonly);
	FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM,
	    xvap.xva_xoptattrs.xoa_system);
	FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN,
	    xvap.xva_xoptattrs.xoa_hidden);
	FLAG_CHECK(UF_REPARSE, XAT_REPARSE,
	    xvap.xva_xoptattrs.xoa_reparse);
	FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE,
	    xvap.xva_xoptattrs.xoa_offline);
	FLAG_CHECK(UF_SPARSE, XAT_SPARSE,
	    xvap.xva_xoptattrs.xoa_sparse);

#undef	FLAG_CHECK
	/* Copy the plain attributes back and install the translated flags. */
	*vap = xvap.xva_vattr;
	vap->va_flags = fflags;
	return (0);
}
5161
/*
 * VOP_SETATTR(9): translate FreeBSD chflags(2) bits in va_flags into
 * ZFS extended attribute bits, enforce the system-flag security policy,
 * and forward the combined attribute set to zfs_setattr().
 */
static int
zfs_freebsd_setattr(ap)
	struct vop_setattr_args /* {
		struct vnode *a_vp;
		struct vattr *a_vap;
		struct ucred *a_cred;
	} */ *ap;
{
	vnode_t *vp = ap->a_vp;
	vattr_t *vap = ap->a_vap;
	cred_t *cred = ap->a_cred;
	xvattr_t xvap;
	u_long fflags;
	uint64_t zflags;

	vattr_init_mask(vap);
	/* Strip attributes that may never be set explicitly. */
	vap->va_mask &= ~AT_NOSET;

	xva_init(&xvap);
	xvap.xva_vattr = *vap;

	zflags = VTOZ(vp)->z_pflags;

	if (vap->va_flags != VNOVAL) {
		zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
		int error;

		if (zfsvfs->z_use_fuids == B_FALSE)
			return (EOPNOTSUPP);

		fflags = vap->va_flags;
		/*
		 * XXX KDM
		 * We need to figure out whether it makes sense to allow
		 * UF_REPARSE through, since we don't really have other
		 * facilities to handle reparse points and zfs_setattr()
		 * doesn't currently allow setting that attribute anyway.
		 */
		if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE|
		     UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE|
		     UF_OFFLINE|UF_SPARSE)) != 0)
			return (EOPNOTSUPP);
		/*
		 * Unprivileged processes are not permitted to unset system
		 * flags, or modify flags if any system flags are set.
		 * Privileged non-jail processes may not modify system flags
		 * if securelevel > 0 and any existing system flags are set.
		 * Privileged jail processes behave like privileged non-jail
		 * processes if the security.jail.chflags_allowed sysctl is
		 * non-zero; otherwise, they behave like unprivileged
		 * processes.
		 */
		if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
		    priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) {
			if (zflags &
			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
				error = securelevel_gt(cred, 0);
				if (error != 0)
					return (error);
			}
		} else {
			/*
			 * Callers may only modify the file flags on objects they
			 * have VADMIN rights for.
			 */
			if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0)
				return (error);
			if (zflags &
			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
				return (EPERM);
			}
			if (fflags &
			    (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
				return (EPERM);
			}
		}

/* Request a flag change only when the new value differs from the old. */
#define	FLAG_CHANGE(fflag, zflag, xflag, xfield)	do {		\
	if (((fflags & (fflag)) && !(zflags & (zflag))) ||		\
	    ((zflags & (zflag)) && !(fflags & (fflag)))) {		\
		XVA_SET_REQ(&xvap, (xflag));				\
		(xfield) = ((fflags & (fflag)) != 0);			\
	}								\
} while (0)
		/* Convert chflags into ZFS-type flags. */
		/* XXX: what about SF_SETTABLE?. */
		FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
		    xvap.xva_xoptattrs.xoa_immutable);
		FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
		    xvap.xva_xoptattrs.xoa_appendonly);
		FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
		    xvap.xva_xoptattrs.xoa_nounlink);
		FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE,
		    xvap.xva_xoptattrs.xoa_archive);
		FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
		    xvap.xva_xoptattrs.xoa_nodump);
		FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY,
		    xvap.xva_xoptattrs.xoa_readonly);
		FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM,
		    xvap.xva_xoptattrs.xoa_system);
		FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN,
		    xvap.xva_xoptattrs.xoa_hidden);
		FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE,
		    xvap.xva_xoptattrs.xoa_reparse);
		FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE,
		    xvap.xva_xoptattrs.xoa_offline);
		FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE,
		    xvap.xva_xoptattrs.xoa_sparse);
#undef	FLAG_CHANGE
	}
	/* A birth-time update travels as the XAT_CREATETIME extended attr. */
	if (vap->va_birthtime.tv_sec != VNOVAL) {
		xvap.xva_vattr.va_mask |= AT_XVATTR;
		XVA_SET_REQ(&xvap, XAT_CREATETIME);
	}
	return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL));
}
5278
5279static int
5280zfs_freebsd_rename(ap)
5281	struct vop_rename_args  /* {
5282		struct vnode *a_fdvp;
5283		struct vnode *a_fvp;
5284		struct componentname *a_fcnp;
5285		struct vnode *a_tdvp;
5286		struct vnode *a_tvp;
5287		struct componentname *a_tcnp;
5288	} */ *ap;
5289{
5290	vnode_t *fdvp = ap->a_fdvp;
5291	vnode_t *fvp = ap->a_fvp;
5292	vnode_t *tdvp = ap->a_tdvp;
5293	vnode_t *tvp = ap->a_tvp;
5294	int error;
5295
5296	ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
5297	ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
5298
5299	error = zfs_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
5300	    ap->a_tcnp, ap->a_fcnp->cn_cred);
5301
5302	vrele(fdvp);
5303	vrele(fvp);
5304	vrele(tdvp);
5305	if (tvp != NULL)
5306		vrele(tvp);
5307
5308	return (error);
5309}
5310
5311static int
5312zfs_freebsd_symlink(ap)
5313	struct vop_symlink_args /* {
5314		struct vnode *a_dvp;
5315		struct vnode **a_vpp;
5316		struct componentname *a_cnp;
5317		struct vattr *a_vap;
5318		char *a_target;
5319	} */ *ap;
5320{
5321	struct componentname *cnp = ap->a_cnp;
5322	vattr_t *vap = ap->a_vap;
5323
5324	ASSERT(cnp->cn_flags & SAVENAME);
5325
5326	vap->va_type = VLNK;	/* FreeBSD: Syscall only sets va_mode. */
5327	vattr_init_mask(vap);
5328
5329	return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
5330	    ap->a_target, cnp->cn_cred, cnp->cn_thread));
5331}
5332
5333static int
5334zfs_freebsd_readlink(ap)
5335	struct vop_readlink_args /* {
5336		struct vnode *a_vp;
5337		struct uio *a_uio;
5338		struct ucred *a_cred;
5339	} */ *ap;
5340{
5341
5342	return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
5343}
5344
5345static int
5346zfs_freebsd_link(ap)
5347	struct vop_link_args /* {
5348		struct vnode *a_tdvp;
5349		struct vnode *a_vp;
5350		struct componentname *a_cnp;
5351	} */ *ap;
5352{
5353	struct componentname *cnp = ap->a_cnp;
5354	vnode_t *vp = ap->a_vp;
5355	vnode_t *tdvp = ap->a_tdvp;
5356
5357	if (tdvp->v_mount != vp->v_mount)
5358		return (EXDEV);
5359
5360	ASSERT(cnp->cn_flags & SAVENAME);
5361
5362	return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
5363}
5364
5365static int
5366zfs_freebsd_inactive(ap)
5367	struct vop_inactive_args /* {
5368		struct vnode *a_vp;
5369		struct thread *a_td;
5370	} */ *ap;
5371{
5372	vnode_t *vp = ap->a_vp;
5373
5374	zfs_inactive(vp, ap->a_td->td_ucred, NULL);
5375	return (0);
5376}
5377
/*
 * VOP_RECLAIM(9): detach the znode from the vnode during vnode
 * recycling.  If the SA handle is already gone (torn down by a forced
 * unmount) the znode is freed directly; otherwise it goes through the
 * normal inactivation path.
 */
static int
zfs_freebsd_reclaim(ap)
	struct vop_reclaim_args /* {
		struct vnode *a_vp;
		struct thread *a_td;
	} */ *ap;
{
	vnode_t	*vp = ap->a_vp;
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ASSERT(zp != NULL);

	/* Destroy the vm object and flush associated pages. */
	vnode_destroy_vobject(vp);

	/*
	 * z_teardown_inactive_lock protects from a race with
	 * zfs_znode_dmu_fini in zfsvfs_teardown during
	 * force unmount.
	 */
	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
	if (zp->z_sa_hdl == NULL)
		zfs_znode_free(zp);
	else
		zfs_zinactive(zp);
	rw_exit(&zfsvfs->z_teardown_inactive_lock);

	/* Break the vnode-to-znode link; the vnode may now be reused. */
	vp->v_data = NULL;
	return (0);
}
5409
5410static int
5411zfs_freebsd_fid(ap)
5412	struct vop_fid_args /* {
5413		struct vnode *a_vp;
5414		struct fid *a_fid;
5415	} */ *ap;
5416{
5417
5418	return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
5419}
5420
5421static int
5422zfs_freebsd_pathconf(ap)
5423	struct vop_pathconf_args /* {
5424		struct vnode *a_vp;
5425		int a_name;
5426		register_t *a_retval;
5427	} */ *ap;
5428{
5429	ulong_t val;
5430	int error;
5431
5432	error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL);
5433	if (error == 0) {
5434		*ap->a_retval = val;
5435		return (error);
5436	}
5437	if (error != EOPNOTSUPP)
5438		return (error);
5439
5440	switch (ap->a_name) {
5441	case _PC_NAME_MAX:
5442		*ap->a_retval = NAME_MAX;
5443		return (0);
5444	case _PC_PIPE_BUF:
5445		if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) {
5446			*ap->a_retval = PIPE_BUF;
5447			return (0);
5448		}
5449		return (EINVAL);
5450	default:
5451		return (vop_stdpathconf(ap));
5452	}
5453}
5454
5455/*
5456 * FreeBSD's extended attributes namespace defines file name prefix for ZFS'
5457 * extended attribute name:
5458 *
5459 *	NAMESPACE	PREFIX
5460 *	system		freebsd:system:
5461 *	user		(none, can be used to access ZFS fsattr(5) attributes
5462 *			created on Solaris)
5463 */
5464static int
5465zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
5466    size_t size)
5467{
5468	const char *namespace, *prefix, *suffix;
5469
5470	/* We don't allow '/' character in attribute name. */
5471	if (strchr(name, '/') != NULL)
5472		return (EINVAL);
5473	/* We don't allow attribute names that start with "freebsd:" string. */
5474	if (strncmp(name, "freebsd:", 8) == 0)
5475		return (EINVAL);
5476
5477	bzero(attrname, size);
5478
5479	switch (attrnamespace) {
5480	case EXTATTR_NAMESPACE_USER:
5481#if 0
5482		prefix = "freebsd:";
5483		namespace = EXTATTR_NAMESPACE_USER_STRING;
5484		suffix = ":";
5485#else
5486		/*
5487		 * This is the default namespace by which we can access all
5488		 * attributes created on Solaris.
5489		 */
5490		prefix = namespace = suffix = "";
5491#endif
5492		break;
5493	case EXTATTR_NAMESPACE_SYSTEM:
5494		prefix = "freebsd:";
5495		namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
5496		suffix = ":";
5497		break;
5498	case EXTATTR_NAMESPACE_EMPTY:
5499	default:
5500		return (EINVAL);
5501	}
5502	if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
5503	    name) >= size) {
5504		return (ENAMETOOLONG);
5505	}
5506	return (0);
5507}
5508
5509/*
 * Vnode operation to retrieve a named extended attribute.
5511 */
5512static int
5513zfs_getextattr(struct vop_getextattr_args *ap)
5514/*
5515vop_getextattr {
5516	IN struct vnode *a_vp;
5517	IN int a_attrnamespace;
5518	IN const char *a_name;
5519	INOUT struct uio *a_uio;
5520	OUT size_t *a_size;
5521	IN struct ucred *a_cred;
5522	IN struct thread *a_td;
5523};
5524*/
5525{
5526	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5527	struct thread *td = ap->a_td;
5528	struct nameidata nd;
5529	char attrname[255];
5530	struct vattr va;
5531	vnode_t *xvp = NULL, *vp;
5532	int error, flags;
5533
5534	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5535	    ap->a_cred, ap->a_td, VREAD);
5536	if (error != 0)
5537		return (error);
5538
5539	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5540	    sizeof(attrname));
5541	if (error != 0)
5542		return (error);
5543
5544	ZFS_ENTER(zfsvfs);
5545
5546	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5547	    LOOKUP_XATTR, B_FALSE);
5548	if (error != 0) {
5549		ZFS_EXIT(zfsvfs);
5550		return (error);
5551	}
5552
5553	flags = FREAD;
5554	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
5555	    xvp, td);
5556	error = vn_open_cred(&nd, &flags, VN_OPEN_INVFS, 0, ap->a_cred, NULL);
5557	vp = nd.ni_vp;
5558	NDFREE(&nd, NDF_ONLY_PNBUF);
5559	if (error != 0) {
5560		ZFS_EXIT(zfsvfs);
5561		if (error == ENOENT)
5562			error = ENOATTR;
5563		return (error);
5564	}
5565
5566	if (ap->a_size != NULL) {
5567		error = VOP_GETATTR(vp, &va, ap->a_cred);
5568		if (error == 0)
5569			*ap->a_size = (size_t)va.va_size;
5570	} else if (ap->a_uio != NULL)
5571		error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred);
5572
5573	VOP_UNLOCK(vp, 0);
5574	vn_close(vp, flags, ap->a_cred, td);
5575	ZFS_EXIT(zfsvfs);
5576
5577	return (error);
5578}
5579
5580/*
5581 * Vnode operation to remove a named attribute.
5582 */
5583int
5584zfs_deleteextattr(struct vop_deleteextattr_args *ap)
5585/*
5586vop_deleteextattr {
5587	IN struct vnode *a_vp;
5588	IN int a_attrnamespace;
5589	IN const char *a_name;
5590	IN struct ucred *a_cred;
5591	IN struct thread *a_td;
5592};
5593*/
5594{
5595	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5596	struct thread *td = ap->a_td;
5597	struct nameidata nd;
5598	char attrname[255];
5599	struct vattr va;
5600	vnode_t *xvp = NULL, *vp;
5601	int error, flags;
5602
5603	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5604	    ap->a_cred, ap->a_td, VWRITE);
5605	if (error != 0)
5606		return (error);
5607
5608	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5609	    sizeof(attrname));
5610	if (error != 0)
5611		return (error);
5612
5613	ZFS_ENTER(zfsvfs);
5614
5615	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5616	    LOOKUP_XATTR, B_FALSE);
5617	if (error != 0) {
5618		ZFS_EXIT(zfsvfs);
5619		return (error);
5620	}
5621
5622	NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
5623	    UIO_SYSSPACE, attrname, xvp, td);
5624	error = namei(&nd);
5625	vp = nd.ni_vp;
5626	if (error != 0) {
5627		ZFS_EXIT(zfsvfs);
5628		NDFREE(&nd, NDF_ONLY_PNBUF);
5629		if (error == ENOENT)
5630			error = ENOATTR;
5631		return (error);
5632	}
5633
5634	error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
5635	NDFREE(&nd, NDF_ONLY_PNBUF);
5636
5637	vput(nd.ni_dvp);
5638	if (vp == nd.ni_dvp)
5639		vrele(vp);
5640	else
5641		vput(vp);
5642	ZFS_EXIT(zfsvfs);
5643
5644	return (error);
5645}
5646
5647/*
5648 * Vnode operation to set a named attribute.
5649 */
5650static int
5651zfs_setextattr(struct vop_setextattr_args *ap)
5652/*
5653vop_setextattr {
5654	IN struct vnode *a_vp;
5655	IN int a_attrnamespace;
5656	IN const char *a_name;
5657	INOUT struct uio *a_uio;
5658	IN struct ucred *a_cred;
5659	IN struct thread *a_td;
5660};
5661*/
5662{
5663	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5664	struct thread *td = ap->a_td;
5665	struct nameidata nd;
5666	char attrname[255];
5667	struct vattr va;
5668	vnode_t *xvp = NULL, *vp;
5669	int error, flags;
5670
5671	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5672	    ap->a_cred, ap->a_td, VWRITE);
5673	if (error != 0)
5674		return (error);
5675
5676	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5677	    sizeof(attrname));
5678	if (error != 0)
5679		return (error);
5680
5681	ZFS_ENTER(zfsvfs);
5682
5683	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5684	    LOOKUP_XATTR | CREATE_XATTR_DIR, B_FALSE);
5685	if (error != 0) {
5686		ZFS_EXIT(zfsvfs);
5687		return (error);
5688	}
5689
5690	flags = FFLAGS(O_WRONLY | O_CREAT);
5691	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
5692	    xvp, td);
5693	error = vn_open_cred(&nd, &flags, 0600, VN_OPEN_INVFS, ap->a_cred,
5694	    NULL);
5695	vp = nd.ni_vp;
5696	NDFREE(&nd, NDF_ONLY_PNBUF);
5697	if (error != 0) {
5698		ZFS_EXIT(zfsvfs);
5699		return (error);
5700	}
5701
5702	VATTR_NULL(&va);
5703	va.va_size = 0;
5704	error = VOP_SETATTR(vp, &va, ap->a_cred);
5705	if (error == 0)
5706		VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred);
5707
5708	VOP_UNLOCK(vp, 0);
5709	vn_close(vp, flags, ap->a_cred, td);
5710	ZFS_EXIT(zfsvfs);
5711
5712	return (error);
5713}
5714
5715/*
5716 * Vnode operation to retrieve extended attributes on a vnode.
5717 */
5718static int
5719zfs_listextattr(struct vop_listextattr_args *ap)
5720/*
5721vop_listextattr {
5722	IN struct vnode *a_vp;
5723	IN int a_attrnamespace;
5724	INOUT struct uio *a_uio;
5725	OUT size_t *a_size;
5726	IN struct ucred *a_cred;
5727	IN struct thread *a_td;
5728};
5729*/
5730{
5731	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5732	struct thread *td = ap->a_td;
5733	struct nameidata nd;
5734	char attrprefix[16];
5735	u_char dirbuf[sizeof(struct dirent)];
5736	struct dirent *dp;
5737	struct iovec aiov;
5738	struct uio auio, *uio = ap->a_uio;
5739	size_t *sizep = ap->a_size;
5740	size_t plen;
5741	vnode_t *xvp = NULL, *vp;
5742	int done, error, eof, pos;
5743
5744	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5745	    ap->a_cred, ap->a_td, VREAD);
5746	if (error != 0)
5747		return (error);
5748
5749	error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
5750	    sizeof(attrprefix));
5751	if (error != 0)
5752		return (error);
5753	plen = strlen(attrprefix);
5754
5755	ZFS_ENTER(zfsvfs);
5756
5757	if (sizep != NULL)
5758		*sizep = 0;
5759
5760	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5761	    LOOKUP_XATTR, B_FALSE);
5762	if (error != 0) {
5763		ZFS_EXIT(zfsvfs);
5764		/*
5765		 * ENOATTR means that the EA directory does not yet exist,
5766		 * i.e. there are no extended attributes there.
5767		 */
5768		if (error == ENOATTR)
5769			error = 0;
5770		return (error);
5771	}
5772
5773	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
5774	    UIO_SYSSPACE, ".", xvp, td);
5775	error = namei(&nd);
5776	vp = nd.ni_vp;
5777	NDFREE(&nd, NDF_ONLY_PNBUF);
5778	if (error != 0) {
5779		ZFS_EXIT(zfsvfs);
5780		return (error);
5781	}
5782
5783	auio.uio_iov = &aiov;
5784	auio.uio_iovcnt = 1;
5785	auio.uio_segflg = UIO_SYSSPACE;
5786	auio.uio_td = td;
5787	auio.uio_rw = UIO_READ;
5788	auio.uio_offset = 0;
5789
5790	do {
5791		u_char nlen;
5792
5793		aiov.iov_base = (void *)dirbuf;
5794		aiov.iov_len = sizeof(dirbuf);
5795		auio.uio_resid = sizeof(dirbuf);
5796		error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
5797		done = sizeof(dirbuf) - auio.uio_resid;
5798		if (error != 0)
5799			break;
5800		for (pos = 0; pos < done;) {
5801			dp = (struct dirent *)(dirbuf + pos);
5802			pos += dp->d_reclen;
5803			/*
5804			 * XXX: Temporarily we also accept DT_UNKNOWN, as this
5805			 * is what we get when attribute was created on Solaris.
5806			 */
5807			if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
5808				continue;
5809			if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0)
5810				continue;
5811			else if (strncmp(dp->d_name, attrprefix, plen) != 0)
5812				continue;
5813			nlen = dp->d_namlen - plen;
5814			if (sizep != NULL)
5815				*sizep += 1 + nlen;
5816			else if (uio != NULL) {
5817				/*
5818				 * Format of extattr name entry is one byte for
5819				 * length and the rest for name.
5820				 */
5821				error = uiomove(&nlen, 1, uio->uio_rw, uio);
5822				if (error == 0) {
5823					error = uiomove(dp->d_name + plen, nlen,
5824					    uio->uio_rw, uio);
5825				}
5826				if (error != 0)
5827					break;
5828			}
5829		}
5830	} while (!eof && error == 0);
5831
5832	vput(vp);
5833	ZFS_EXIT(zfsvfs);
5834
5835	return (error);
5836}
5837
5838int
5839zfs_freebsd_getacl(ap)
5840	struct vop_getacl_args /* {
5841		struct vnode *vp;
5842		acl_type_t type;
5843		struct acl *aclp;
5844		struct ucred *cred;
5845		struct thread *td;
5846	} */ *ap;
5847{
5848	int		error;
5849	vsecattr_t      vsecattr;
5850
5851	if (ap->a_type != ACL_TYPE_NFS4)
5852		return (EINVAL);
5853
5854	vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
5855	if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL))
5856		return (error);
5857
5858	error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt);
5859	if (vsecattr.vsa_aclentp != NULL)
5860		kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);
5861
5862	return (error);
5863}
5864
5865int
5866zfs_freebsd_setacl(ap)
5867	struct vop_setacl_args /* {
5868		struct vnode *vp;
5869		acl_type_t type;
5870		struct acl *aclp;
5871		struct ucred *cred;
5872		struct thread *td;
5873	} */ *ap;
5874{
5875	int		error;
5876	vsecattr_t      vsecattr;
5877	int		aclbsize;	/* size of acl list in bytes */
5878	aclent_t	*aaclp;
5879
5880	if (ap->a_type != ACL_TYPE_NFS4)
5881		return (EINVAL);
5882
5883	if (ap->a_aclp == NULL)
5884		return (EINVAL);
5885
5886	if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
5887		return (EINVAL);
5888
5889	/*
5890	 * With NFSv4 ACLs, chmod(2) may need to add additional entries,
5891	 * splitting every entry into two and appending "canonical six"
5892	 * entries at the end.  Don't allow for setting an ACL that would
5893	 * cause chmod(2) to run out of ACL entries.
5894	 */
5895	if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
5896		return (ENOSPC);
5897
5898	error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
5899	if (error != 0)
5900		return (error);
5901
5902	vsecattr.vsa_mask = VSA_ACE;
5903	aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t);
5904	vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
5905	aaclp = vsecattr.vsa_aclentp;
5906	vsecattr.vsa_aclentsz = aclbsize;
5907
5908	aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
5909	error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL);
5910	kmem_free(aaclp, aclbsize);
5911
5912	return (error);
5913}
5914
/*
 * ACL checking is not supported for ZFS vnodes; always fail with
 * EOPNOTSUPP.
 */
int
zfs_freebsd_aclcheck(struct vop_aclcheck_args *ap)
{

	return (EOPNOTSUPP);
}
5928
5929static int
5930zfs_vptocnp(struct vop_vptocnp_args *ap)
5931{
5932	vnode_t *covered_vp;
5933	vnode_t *vp = ap->a_vp;;
5934	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
5935	znode_t *zp = VTOZ(vp);
5936	int ltype;
5937	int error;
5938
5939	ZFS_ENTER(zfsvfs);
5940	ZFS_VERIFY_ZP(zp);
5941
5942	/*
5943	 * If we are a snapshot mounted under .zfs, run the operation
5944	 * on the covered vnode.
5945	 */
5946	if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) {
5947		char name[MAXNAMLEN + 1];
5948		znode_t *dzp;
5949		size_t len;
5950
5951		error = zfs_znode_parent_and_name(zp, &dzp, name);
5952		if (error == 0) {
5953			len = strlen(name);
5954			if (*ap->a_buflen < len)
5955				error = SET_ERROR(ENOMEM);
5956		}
5957		if (error == 0) {
5958			*ap->a_buflen -= len;
5959			bcopy(name, ap->a_buf + *ap->a_buflen, len);
5960			*ap->a_vpp = ZTOV(dzp);
5961		}
5962		ZFS_EXIT(zfsvfs);
5963		return (error);
5964	}
5965	ZFS_EXIT(zfsvfs);
5966
5967	covered_vp = vp->v_mount->mnt_vnodecovered;
5968	vhold(covered_vp);
5969	ltype = VOP_ISLOCKED(vp);
5970	VOP_UNLOCK(vp, 0);
5971	error = vget(covered_vp, LK_SHARED | LK_VNHELD, curthread);
5972	if (error == 0) {
5973		error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred,
5974		    ap->a_buf, ap->a_buflen);
5975		vput(covered_vp);
5976	}
5977	vn_lock(vp, ltype | LK_RETRY);
5978	if ((vp->v_iflag & VI_DOOMED) != 0)
5979		error = SET_ERROR(ENOENT);
5980	return (error);
5981}
5982
#ifdef DIAGNOSTIC
/*
 * Diagnostic wrapper around the standard vnode lock.  After a
 * (potentially blocking) lock acquisition on a non-doomed, non-xattr
 * ZFS vnode, verify that the filesystem teardown lock is not held by
 * this path.
 */
static int
zfs_lock(struct vop_lock1_args *ap)
{
	vnode_t *vp;
	znode_t *zp;
	int rc;

	rc = vop_stdlock(ap);
	/* Only check after a successful, possibly-sleeping acquisition. */
	if (rc != 0 || (ap->a_flags & LK_NOWAIT) != 0)
		return (rc);
	vp = ap->a_vp;
	zp = vp->v_data;
	if (vp->v_mount != NULL && (vp->v_iflag & VI_DOOMED) == 0 &&
	    zp != NULL && (zp->z_pflags & ZFS_XATTR) == 0)
		VERIFY(!RRM_LOCK_HELD(&zp->z_zfsvfs->z_teardown_lock));
	return (rc);
}
#endif
6008
/* Forward declarations of the operation vectors defined below. */
struct vop_vector zfs_vnodeops;
struct vop_vector zfs_fifoops;
struct vop_vector zfs_shareops;

/*
 * Vnode operations for regular ZFS files and directories.  Anything
 * not listed here falls through to default_vnodeops.
 */
struct vop_vector zfs_vnodeops = {
	.vop_default =		&default_vnodeops,
	.vop_inactive =		zfs_freebsd_inactive,
	.vop_reclaim =		zfs_freebsd_reclaim,
	.vop_access =		zfs_freebsd_access,
	.vop_lookup =		zfs_cache_lookup,
	.vop_cachedlookup =	zfs_freebsd_cachedlookup,
	.vop_getattr =		zfs_freebsd_getattr,
	.vop_setattr =		zfs_freebsd_setattr,
	.vop_create =		zfs_freebsd_create,
	.vop_mknod =		zfs_freebsd_create,
	.vop_mkdir =		zfs_freebsd_mkdir,
	.vop_readdir =		zfs_freebsd_readdir,
	.vop_fsync =		zfs_freebsd_fsync,
	.vop_open =		zfs_freebsd_open,
	.vop_close =		zfs_freebsd_close,
	.vop_rmdir =		zfs_freebsd_rmdir,
	.vop_ioctl =		zfs_freebsd_ioctl,
	.vop_link =		zfs_freebsd_link,
	.vop_symlink =		zfs_freebsd_symlink,
	.vop_readlink =		zfs_freebsd_readlink,
	.vop_read =		zfs_freebsd_read,
	.vop_write =		zfs_freebsd_write,
	.vop_remove =		zfs_freebsd_remove,
	.vop_rename =		zfs_freebsd_rename,
	.vop_pathconf =		zfs_freebsd_pathconf,
	.vop_bmap =		zfs_freebsd_bmap,
	.vop_fid =		zfs_freebsd_fid,
	.vop_getextattr =	zfs_getextattr,
	.vop_deleteextattr =	zfs_deleteextattr,
	.vop_setextattr =	zfs_setextattr,
	.vop_listextattr =	zfs_listextattr,
	.vop_getacl =		zfs_freebsd_getacl,
	.vop_setacl =		zfs_freebsd_setacl,
	.vop_aclcheck =		zfs_freebsd_aclcheck,
	.vop_getpages =		zfs_freebsd_getpages,
	.vop_putpages =		zfs_freebsd_putpages,
	.vop_vptocnp =		zfs_vptocnp,
#ifdef DIAGNOSTIC
	/* Debug-only lock wrapper that asserts teardown-lock ordering. */
	.vop_lock1 =		zfs_lock,
#endif
};
6055
/*
 * Vnode operations for ZFS FIFOs (named pipes).  Data transfer
 * defaults to fifo_specops; vop_read/vop_write are set to VOP_PANIC
 * because reaching them directly on a FIFO would be a bug.
 */
struct vop_vector zfs_fifoops = {
	.vop_default =		&fifo_specops,
	.vop_fsync =		zfs_freebsd_fsync,
	.vop_access =		zfs_freebsd_access,
	.vop_getattr =		zfs_freebsd_getattr,
	.vop_inactive =		zfs_freebsd_inactive,
	.vop_read =		VOP_PANIC,
	.vop_reclaim =		zfs_freebsd_reclaim,
	.vop_setattr =		zfs_freebsd_setattr,
	.vop_write =		VOP_PANIC,
	.vop_pathconf = 	zfs_freebsd_pathconf,
	.vop_fid =		zfs_freebsd_fid,
	.vop_getacl =		zfs_freebsd_getacl,
	.vop_setacl =		zfs_freebsd_setacl,
	.vop_aclcheck =		zfs_freebsd_aclcheck,
};
6072
6073/*
6074 * special share hidden files vnode operations template
6075 */
6076struct vop_vector zfs_shareops = {
6077	.vop_default =		&default_vnodeops,
6078	.vop_access =		zfs_freebsd_access,
6079	.vop_inactive =		zfs_freebsd_inactive,
6080	.vop_reclaim =		zfs_freebsd_reclaim,
6081	.vop_fid =		zfs_freebsd_fid,
6082	.vop_pathconf =		zfs_freebsd_pathconf,
6083};
6084