1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
25 * Copyright (c) 2014 Integros [integros.com]
26 * Copyright 2017 Nexenta Systems, Inc.
27 */
28
29/* Portions Copyright 2007 Jeremy Teo */
30/* Portions Copyright 2010 Robert Milkowski */
31
32#include <sys/types.h>
33#include <sys/param.h>
34#include <sys/time.h>
35#include <sys/systm.h>
36#include <sys/sysmacros.h>
37#include <sys/resource.h>
38#include <sys/vfs.h>
39#include <sys/vm.h>
40#include <sys/vnode.h>
41#include <sys/file.h>
42#include <sys/stat.h>
43#include <sys/kmem.h>
44#include <sys/taskq.h>
45#include <sys/uio.h>
46#include <sys/atomic.h>
47#include <sys/namei.h>
48#include <sys/mman.h>
49#include <sys/cmn_err.h>
50#include <sys/errno.h>
51#include <sys/unistd.h>
52#include <sys/zfs_dir.h>
53#include <sys/zfs_ioctl.h>
54#include <sys/fs/zfs.h>
55#include <sys/dmu.h>
56#include <sys/dmu_objset.h>
57#include <sys/spa.h>
58#include <sys/txg.h>
59#include <sys/dbuf.h>
60#include <sys/zap.h>
61#include <sys/sa.h>
62#include <sys/dirent.h>
63#include <sys/policy.h>
64#include <sys/sunddi.h>
65#include <sys/filio.h>
66#include <sys/sid.h>
67#include <sys/zfs_ctldir.h>
68#include <sys/zfs_fuid.h>
69#include <sys/zfs_sa.h>
70#include <sys/zfs_rlock.h>
71#include <sys/extdirent.h>
72#include <sys/kidmap.h>
73#include <sys/bio.h>
74#include <sys/buf.h>
75#include <sys/sched.h>
76#include <sys/acl.h>
77#include <sys/vmmeter.h>
78#include <vm/vm_param.h>
79#include <sys/zil.h>
80#include <sys/dataset_kstats.h>
81
82/*
83 * Programming rules.
84 *
85 * Each vnode op performs some logical unit of work.  To do this, the ZPL must
86 * properly lock its in-core state, create a DMU transaction, do the work,
87 * record this work in the intent log (ZIL), commit the DMU transaction,
88 * and wait for the intent log to commit if it is a synchronous operation.
89 * Moreover, the vnode ops must work in both normal and log replay context.
90 * The ordering of events is important to avoid deadlocks and references
91 * to freed memory.  The example below illustrates the following Big Rules:
92 *
93 *  (1)	A check must be made in each zfs thread for a mounted file system.
94 *	This is done avoiding races using ZFS_ENTER(zfsvfs).
95 *	A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
96 *	must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
97 *	can return EIO from the calling function.
98 *
99 *  (2)	VN_RELE() should always be the last thing except for zil_commit()
100 *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
101 *	First, if it's the last reference, the vnode/znode
102 *	can be freed, so the zp may point to freed memory.  Second, the last
103 *	reference will call zfs_zinactive(), which may induce a lot of work --
104 *	pushing cached pages (which acquires range locks) and syncing out
105 *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
106 *	which could deadlock the system if you were already holding one.
107 *	If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
108 *
109 *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
110 *	as they can span dmu_tx_assign() calls.
111 *
112 *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
113 *      dmu_tx_assign().  This is critical because we don't want to block
114 *      while holding locks.
115 *
116 *	If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT.  This
117 *	reduces lock contention and CPU usage when we must wait (note that if
118 *	throughput is constrained by the storage, nearly every transaction
119 *	must wait).
120 *
121 *      Note, in particular, that if a lock is sometimes acquired before
122 *      the tx assigns, and sometimes after (e.g. z_lock), then failing
123 *      to use a non-blocking assign can deadlock the system.  The scenario:
124 *
125 *	Thread A has grabbed a lock before calling dmu_tx_assign().
126 *	Thread B is in an already-assigned tx, and blocks for this lock.
127 *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
128 *	forever, because the previous txg can't quiesce until B's tx commits.
129 *
130 *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
131 *	then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
132 *	calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
133 *	to indicate that this operation has already called dmu_tx_wait().
134 *	This will ensure that we don't retry forever, waiting a short bit
135 *	each time.
136 *
137 *  (5)	If the operation succeeded, generate the intent log entry for it
138 *	before dropping locks.  This ensures that the ordering of events
139 *	in the intent log matches the order in which they actually occurred.
140 *	During ZIL replay the zfs_log_* functions will update the sequence
141 *	number to indicate the zil transaction has replayed.
142 *
143 *  (6)	At the end of each vnode op, the DMU tx must always commit,
144 *	regardless of whether there were any errors.
145 *
146 *  (7)	After dropping all locks, invoke zil_commit(zilog, foid)
147 *	to ensure that synchronous semantics are provided when necessary.
148 *
149 * In general, this is how things should be ordered in each vnode op:
150 *
151 *	ZFS_ENTER(zfsvfs);		// exit if unmounted
152 * top:
153 *	zfs_dirent_lookup(&dl, ...)	// lock directory entry (may VN_HOLD())
154 *	rw_enter(...);			// grab any other locks you need
155 *	tx = dmu_tx_create(...);	// get DMU tx
156 *	dmu_tx_hold_*();		// hold each object you might modify
157 *	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
158 *	if (error) {
159 *		rw_exit(...);		// drop locks
160 *		zfs_dirent_unlock(dl);	// unlock directory entry
161 *		VN_RELE(...);		// release held vnodes
162 *		if (error == ERESTART) {
163 *			waited = B_TRUE;
164 *			dmu_tx_wait(tx);
165 *			dmu_tx_abort(tx);
166 *			goto top;
167 *		}
168 *		dmu_tx_abort(tx);	// abort DMU tx
169 *		ZFS_EXIT(zfsvfs);	// finished in zfs
170 *		return (error);		// really out of space
171 *	}
172 *	error = do_real_work();		// do whatever this VOP does
173 *	if (error == 0)
174 *		zfs_log_*(...);		// on success, make ZIL entry
175 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
176 *	rw_exit(...);			// drop locks
177 *	zfs_dirent_unlock(dl);		// unlock directory entry
178 *	VN_RELE(...);			// release held vnodes
179 *	zil_commit(zilog, foid);	// synchronous when necessary
180 *	ZFS_EXIT(zfsvfs);		// finished in zfs
181 *	return (error);			// done, report error
182 */
183
184/* ARGSUSED */
185static int
186zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
187{
188	znode_t	*zp = VTOZ(*vpp);
189	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
190
191	ZFS_ENTER(zfsvfs);
192	ZFS_VERIFY_ZP(zp);
193
194	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
195	    ((flag & FAPPEND) == 0)) {
196		ZFS_EXIT(zfsvfs);
197		return (SET_ERROR(EPERM));
198	}
199
200	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
201	    ZTOV(zp)->v_type == VREG &&
202	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
203		if (fs_vscan(*vpp, cr, 0) != 0) {
204			ZFS_EXIT(zfsvfs);
205			return (SET_ERROR(EACCES));
206		}
207	}
208
209	/* Keep a count of the synchronous opens in the znode */
210	if (flag & (FSYNC | FDSYNC))
211		atomic_inc_32(&zp->z_sync_cnt);
212
213	ZFS_EXIT(zfsvfs);
214	return (0);
215}
216
217/* ARGSUSED */
218static int
219zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
220    caller_context_t *ct)
221{
222	znode_t	*zp = VTOZ(vp);
223	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
224
225	/*
226	 * Clean up any locks held by this process on the vp.
227	 */
228	cleanlocks(vp, ddi_get_pid(), 0);
229	cleanshares(vp, ddi_get_pid());
230
231	ZFS_ENTER(zfsvfs);
232	ZFS_VERIFY_ZP(zp);
233
234	/* Decrement the synchronous opens in the znode */
235	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
236		atomic_dec_32(&zp->z_sync_cnt);
237
238	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
239	    ZTOV(zp)->v_type == VREG &&
240	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
241		VERIFY(fs_vscan(vp, cr, 1) == 0);
242
243	ZFS_EXIT(zfsvfs);
244	return (0);
245}
246
247/*
248 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
249 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
250 */
251static int
252zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
253{
254	znode_t	*zp = VTOZ(vp);
255	uint64_t noff = (uint64_t)*off; /* new offset */
256	uint64_t file_sz;
257	int error;
258	boolean_t hole;
259
260	file_sz = zp->z_size;
261	if (noff >= file_sz)  {
262		return (SET_ERROR(ENXIO));
263	}
264
265	if (cmd == _FIO_SEEK_HOLE)
266		hole = B_TRUE;
267	else
268		hole = B_FALSE;
269
270	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
271
272	if (error == ESRCH)
273		return (SET_ERROR(ENXIO));
274
275	/*
276	 * We could find a hole that begins after the logical end-of-file,
277	 * because dmu_offset_next() only works on whole blocks.  If the
278	 * EOF falls mid-block, then indicate that the "virtual hole"
279	 * at the end of the file begins at the logical EOF, rather than
280	 * at the end of the last block.
281	 */
282	if (noff > file_sz) {
283		ASSERT(hole);
284		noff = file_sz;
285	}
286
287	if (noff < *off)
288		return (error);
289	*off = noff;
290	return (error);
291}
292
293/* ARGSUSED */
294static int
295zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
296    int *rvalp, caller_context_t *ct)
297{
298	offset_t off;
299	offset_t ndata;
300	dmu_object_info_t doi;
301	int error;
302	zfsvfs_t *zfsvfs;
303	znode_t *zp;
304
305	switch (com) {
306	case _FIOFFS:
307	{
308		return (0);
309
310		/*
311		 * The following two ioctls are used by bfu.  Faking out,
312		 * necessary to avoid bfu errors.
313		 */
314	}
315	case _FIOGDIO:
316	case _FIOSDIO:
317	{
318		return (0);
319	}
320
321	case _FIO_SEEK_DATA:
322	case _FIO_SEEK_HOLE:
323	{
324#ifdef illumos
325		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
326			return (SET_ERROR(EFAULT));
327#else
328		off = *(offset_t *)data;
329#endif
330		zp = VTOZ(vp);
331		zfsvfs = zp->z_zfsvfs;
332		ZFS_ENTER(zfsvfs);
333		ZFS_VERIFY_ZP(zp);
334
335		/* offset parameter is in/out */
336		error = zfs_holey(vp, com, &off);
337		ZFS_EXIT(zfsvfs);
338		if (error)
339			return (error);
340#ifdef illumos
341		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
342			return (SET_ERROR(EFAULT));
343#else
344		*(offset_t *)data = off;
345#endif
346		return (0);
347	}
348#ifdef illumos
349	case _FIO_COUNT_FILLED:
350	{
351		/*
352		 * _FIO_COUNT_FILLED adds a new ioctl command which
353		 * exposes the number of filled blocks in a
354		 * ZFS object.
355		 */
356		zp = VTOZ(vp);
357		zfsvfs = zp->z_zfsvfs;
358		ZFS_ENTER(zfsvfs);
359		ZFS_VERIFY_ZP(zp);
360
361		/*
362		 * Wait for all dirty blocks for this object
363		 * to get synced out to disk, and the DMU info
364		 * updated.
365		 */
366		error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
367		if (error) {
368			ZFS_EXIT(zfsvfs);
369			return (error);
370		}
371
372		/*
373		 * Retrieve fill count from DMU object.
374		 */
375		error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
376		if (error) {
377			ZFS_EXIT(zfsvfs);
378			return (error);
379		}
380
381		ndata = doi.doi_fill_count;
382
383		ZFS_EXIT(zfsvfs);
384		if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
385			return (SET_ERROR(EFAULT));
386		return (0);
387	}
388#endif
389	}
390	return (SET_ERROR(ENOTTY));
391}
392
393static vm_page_t
394page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
395{
396	vm_object_t obj;
397	vm_page_t pp;
398	int64_t end;
399
400	/*
401	 * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
402	 * aligned boundaries, if the range is not aligned.  As a result a
403	 * DEV_BSIZE subrange with partially dirty data may get marked as clean.
404	 * It may happen that all DEV_BSIZE subranges are marked clean and thus
405	 * the whole page would be considred clean despite have some dirty data.
406	 * For this reason we should shrink the range to DEV_BSIZE aligned
407	 * boundaries before calling vm_page_clear_dirty.
408	 */
409	end = rounddown2(off + nbytes, DEV_BSIZE);
410	off = roundup2(off, DEV_BSIZE);
411	nbytes = end - off;
412
413	obj = vp->v_object;
414	zfs_vmobject_assert_wlocked(obj);
415
416	for (;;) {
417		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
418		    pp->valid) {
419			if (vm_page_xbusied(pp)) {
420				/*
421				 * Reference the page before unlocking and
422				 * sleeping so that the page daemon is less
423				 * likely to reclaim it.
424				 */
425				vm_page_reference(pp);
426				vm_page_lock(pp);
427				zfs_vmobject_wunlock(obj);
428				vm_page_busy_sleep(pp, "zfsmwb", true);
429				zfs_vmobject_wlock(obj);
430				continue;
431			}
432			vm_page_sbusy(pp);
433		} else if (pp != NULL) {
434			ASSERT(!pp->valid);
435			pp = NULL;
436		}
437
438		if (pp != NULL) {
439			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
440			vm_object_pip_add(obj, 1);
441			pmap_remove_write(pp);
442			if (nbytes != 0)
443				vm_page_clear_dirty(pp, off, nbytes);
444		}
445		break;
446	}
447	return (pp);
448}
449
450static void
451page_unbusy(vm_page_t pp)
452{
453
454	vm_page_sunbusy(pp);
455	vm_object_pip_subtract(pp->object, 1);
456}
457
458static vm_page_t
459page_hold(vnode_t *vp, int64_t start)
460{
461	vm_object_t obj;
462	vm_page_t pp;
463
464	obj = vp->v_object;
465	zfs_vmobject_assert_wlocked(obj);
466
467	for (;;) {
468		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
469		    pp->valid) {
470			if (vm_page_xbusied(pp)) {
471				/*
472				 * Reference the page before unlocking and
473				 * sleeping so that the page daemon is less
474				 * likely to reclaim it.
475				 */
476				vm_page_reference(pp);
477				vm_page_lock(pp);
478				zfs_vmobject_wunlock(obj);
479				vm_page_busy_sleep(pp, "zfsmwb", true);
480				zfs_vmobject_wlock(obj);
481				continue;
482			}
483
484			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
485			vm_page_lock(pp);
486			vm_page_hold(pp);
487			vm_page_unlock(pp);
488
489		} else
490			pp = NULL;
491		break;
492	}
493	return (pp);
494}
495
496static void
497page_unhold(vm_page_t pp)
498{
499
500	vm_page_lock(pp);
501	vm_page_unhold(pp);
502	vm_page_unlock(pp);
503}
504
505/*
506 * When a file is memory mapped, we must keep the IO data synchronized
507 * between the DMU cache and the memory mapped pages.  What this means:
508 *
509 * On Write:	If we find a memory mapped page, we write to *both*
510 *		the page and the dmu buffer.
511 */
512static void
513update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid,
514    int segflg, dmu_tx_t *tx)
515{
516	vm_object_t obj;
517	struct sf_buf *sf;
518	caddr_t va;
519	int off;
520
521	ASSERT(segflg != UIO_NOCOPY);
522	ASSERT(vp->v_mount != NULL);
523	obj = vp->v_object;
524	ASSERT(obj != NULL);
525
526	off = start & PAGEOFFSET;
527	zfs_vmobject_wlock(obj);
528	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
529		vm_page_t pp;
530		int nbytes = imin(PAGESIZE - off, len);
531
532		if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
533			zfs_vmobject_wunlock(obj);
534
535			va = zfs_map_page(pp, &sf);
536			(void) dmu_read(os, oid, start+off, nbytes,
537			    va+off, DMU_READ_PREFETCH);;
538			zfs_unmap_page(sf);
539
540			zfs_vmobject_wlock(obj);
541			page_unbusy(pp);
542		}
543		len -= nbytes;
544		off = 0;
545	}
546	vm_object_pip_wakeupn(obj, 0);
547	zfs_vmobject_wunlock(obj);
548}
549
550/*
551 * Read with UIO_NOCOPY flag means that sendfile(2) requests
552 * ZFS to populate a range of page cache pages with data.
553 *
554 * NOTE: this function could be optimized to pre-allocate
555 * all pages in advance, drain exclusive busy on all of them,
556 * map them into contiguous KVA region and populate them
557 * in one single dmu_read() call.
558 */
559static int
560mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio)
561{
562	znode_t *zp = VTOZ(vp);
563	objset_t *os = zp->z_zfsvfs->z_os;
564	struct sf_buf *sf;
565	vm_object_t obj;
566	vm_page_t pp;
567	int64_t start;
568	caddr_t va;
569	int len = nbytes;
570	int off;
571	int error = 0;
572
573	ASSERT(uio->uio_segflg == UIO_NOCOPY);
574	ASSERT(vp->v_mount != NULL);
575	obj = vp->v_object;
576	ASSERT(obj != NULL);
577	ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);
578
579	zfs_vmobject_wlock(obj);
580	for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
581		int bytes = MIN(PAGESIZE, len);
582
583		pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY |
584		    VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
585		if (pp->valid == 0) {
586			zfs_vmobject_wunlock(obj);
587			va = zfs_map_page(pp, &sf);
588			error = dmu_read(os, zp->z_id, start, bytes, va,
589			    DMU_READ_PREFETCH);
590			if (bytes != PAGESIZE && error == 0)
591				bzero(va + bytes, PAGESIZE - bytes);
592			zfs_unmap_page(sf);
593			zfs_vmobject_wlock(obj);
594			vm_page_sunbusy(pp);
595			vm_page_lock(pp);
596			if (error) {
597				if (pp->wire_count == 0 && pp->valid == 0 &&
598				    !vm_page_busied(pp))
599					vm_page_free(pp);
600			} else {
601				pp->valid = VM_PAGE_BITS_ALL;
602				vm_page_activate(pp);
603			}
604			vm_page_unlock(pp);
605		} else {
606			ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
607			vm_page_sunbusy(pp);
608		}
609		if (error)
610			break;
611		uio->uio_resid -= bytes;
612		uio->uio_offset += bytes;
613		len -= bytes;
614	}
615	zfs_vmobject_wunlock(obj);
616	return (error);
617}
618
619/*
620 * When a file is memory mapped, we must keep the IO data synchronized
621 * between the DMU cache and the memory mapped pages.  What this means:
622 *
623 * On Read:	We "read" preferentially from memory mapped pages,
624 *		else we default from the dmu buffer.
625 *
626 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
627 *	 the file is memory mapped.
628 */
629static int
630mappedread(vnode_t *vp, int nbytes, uio_t *uio)
631{
632	znode_t *zp = VTOZ(vp);
633	vm_object_t obj;
634	int64_t start;
635	caddr_t va;
636	int len = nbytes;
637	int off;
638	int error = 0;
639
640	ASSERT(vp->v_mount != NULL);
641	obj = vp->v_object;
642	ASSERT(obj != NULL);
643
644	start = uio->uio_loffset;
645	off = start & PAGEOFFSET;
646	zfs_vmobject_wlock(obj);
647	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
648		vm_page_t pp;
649		uint64_t bytes = MIN(PAGESIZE - off, len);
650
651		if (pp = page_hold(vp, start)) {
652			struct sf_buf *sf;
653			caddr_t va;
654
655			zfs_vmobject_wunlock(obj);
656			va = zfs_map_page(pp, &sf);
657#ifdef illumos
658			error = uiomove(va + off, bytes, UIO_READ, uio);
659#else
660			error = vn_io_fault_uiomove(va + off, bytes, uio);
661#endif
662			zfs_unmap_page(sf);
663			zfs_vmobject_wlock(obj);
664			page_unhold(pp);
665		} else {
666			zfs_vmobject_wunlock(obj);
667			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
668			    uio, bytes);
669			zfs_vmobject_wlock(obj);
670		}
671		len -= bytes;
672		off = 0;
673		if (error)
674			break;
675	}
676	zfs_vmobject_wunlock(obj);
677	return (error);
678}
679
680offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
681
682/*
683 * Read bytes from specified file into supplied buffer.
684 *
685 *	IN:	vp	- vnode of file to be read from.
686 *		uio	- structure supplying read location, range info,
687 *			  and return buffer.
688 *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
689 *		cr	- credentials of caller.
690 *		ct	- caller context
691 *
692 *	OUT:	uio	- updated offset and range, buffer filled.
693 *
694 *	RETURN:	0 on success, error code on failure.
695 *
696 * Side Effects:
697 *	vp - atime updated if byte count > 0
698 */
699/* ARGSUSED */
700static int
701zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
702{
703	znode_t		*zp = VTOZ(vp);
704	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
705	ssize_t		n, nbytes, start_resid;
706	int		error = 0;
707	xuio_t		*xuio = NULL;
708	int64_t		nread;
709
710	ZFS_ENTER(zfsvfs);
711	ZFS_VERIFY_ZP(zp);
712
713	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
714		ZFS_EXIT(zfsvfs);
715		return (SET_ERROR(EACCES));
716	}
717
718	/*
719	 * Validate file offset
720	 */
721	if (uio->uio_loffset < (offset_t)0) {
722		ZFS_EXIT(zfsvfs);
723		return (SET_ERROR(EINVAL));
724	}
725
726	/*
727	 * Fasttrack empty reads
728	 */
729	if (uio->uio_resid == 0) {
730		ZFS_EXIT(zfsvfs);
731		return (0);
732	}
733
734	/*
735	 * Check for mandatory locks
736	 */
737	if (MANDMODE(zp->z_mode)) {
738		if (error = chklock(vp, FREAD,
739		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
740			ZFS_EXIT(zfsvfs);
741			return (error);
742		}
743	}
744
745	/*
746	 * If we're in FRSYNC mode, sync out this znode before reading it.
747	 */
748	if (zfsvfs->z_log &&
749	    (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
750		zil_commit(zfsvfs->z_log, zp->z_id);
751
752	/*
753	 * Lock the range against changes.
754	 */
755	locked_range_t *lr = rangelock_enter(&zp->z_rangelock,
756	    uio->uio_loffset, uio->uio_resid, RL_READER);
757
758	/*
759	 * If we are reading past end-of-file we can skip
760	 * to the end; but we might still need to set atime.
761	 */
762	if (uio->uio_loffset >= zp->z_size) {
763		error = 0;
764		goto out;
765	}
766
767	ASSERT(uio->uio_loffset < zp->z_size);
768	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
769	start_resid = n;
770
771#ifdef illumos
772	if ((uio->uio_extflg == UIO_XUIO) &&
773	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
774		int nblk;
775		int blksz = zp->z_blksz;
776		uint64_t offset = uio->uio_loffset;
777
778		xuio = (xuio_t *)uio;
779		if ((ISP2(blksz))) {
780			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
781			    blksz)) / blksz;
782		} else {
783			ASSERT(offset + n <= blksz);
784			nblk = 1;
785		}
786		(void) dmu_xuio_init(xuio, nblk);
787
788		if (vn_has_cached_data(vp)) {
789			/*
790			 * For simplicity, we always allocate a full buffer
791			 * even if we only expect to read a portion of a block.
792			 */
793			while (--nblk >= 0) {
794				(void) dmu_xuio_add(xuio,
795				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
796				    blksz), 0, blksz);
797			}
798		}
799	}
800#endif	/* illumos */
801
802	while (n > 0) {
803		nbytes = MIN(n, zfs_read_chunk_size -
804		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
805
806#ifdef __FreeBSD__
807		if (uio->uio_segflg == UIO_NOCOPY)
808			error = mappedread_sf(vp, nbytes, uio);
809		else
810#endif /* __FreeBSD__ */
811		if (vn_has_cached_data(vp)) {
812			error = mappedread(vp, nbytes, uio);
813		} else {
814			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
815			    uio, nbytes);
816		}
817		if (error) {
818			/* convert checksum errors into IO errors */
819			if (error == ECKSUM)
820				error = SET_ERROR(EIO);
821			break;
822		}
823
824		n -= nbytes;
825	}
826
827	nread = start_resid - n;
828	dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread);
829
830out:
831	rangelock_exit(lr);
832
833	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
834	ZFS_EXIT(zfsvfs);
835	return (error);
836}
837
838/*
839 * Write the bytes to a file.
840 *
841 *	IN:	vp	- vnode of file to be written to.
842 *		uio	- structure supplying write location, range info,
843 *			  and data buffer.
844 *		ioflag	- FAPPEND, FSYNC, and/or FDSYNC.  FAPPEND is
845 *			  set if in append mode.
846 *		cr	- credentials of caller.
847 *		ct	- caller context (NFS/CIFS fem monitor only)
848 *
849 *	OUT:	uio	- updated offset and range.
850 *
851 *	RETURN:	0 on success, error code on failure.
852 *
853 * Timestamps:
854 *	vp - ctime|mtime updated if byte count > 0
855 */
856
857/* ARGSUSED */
858static int
859zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
860{
861	znode_t		*zp = VTOZ(vp);
862	rlim64_t	limit = MAXOFFSET_T;
863	ssize_t		start_resid = uio->uio_resid;
864	ssize_t		tx_bytes;
865	uint64_t	end_size;
866	dmu_tx_t	*tx;
867	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
868	zilog_t		*zilog;
869	offset_t	woff;
870	ssize_t		n, nbytes;
871	int		max_blksz = zfsvfs->z_max_blksz;
872	int		error = 0;
873	arc_buf_t	*abuf;
874	iovec_t		*aiov = NULL;
875	xuio_t		*xuio = NULL;
876	int		i_iov = 0;
877	int		iovcnt = uio->uio_iovcnt;
878	iovec_t		*iovp = uio->uio_iov;
879	int		write_eof;
880	int		count = 0;
881	sa_bulk_attr_t	bulk[4];
882	uint64_t	mtime[2], ctime[2];
883	int64_t		nwritten;
884
885	/*
886	 * Fasttrack empty write
887	 */
888	n = start_resid;
889	if (n == 0)
890		return (0);
891
892	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
893		limit = MAXOFFSET_T;
894
895	ZFS_ENTER(zfsvfs);
896	ZFS_VERIFY_ZP(zp);
897
898	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
899	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
900	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
901	    &zp->z_size, 8);
902	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
903	    &zp->z_pflags, 8);
904
905	/*
906	 * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our
907	 * callers might not be able to detect properly that we are read-only,
908	 * so check it explicitly here.
909	 */
910	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
911		ZFS_EXIT(zfsvfs);
912		return (SET_ERROR(EROFS));
913	}
914
915	/*
916	 * If immutable or not appending then return EPERM.
917	 * Intentionally allow ZFS_READONLY through here.
918	 * See zfs_zaccess_common()
919	 */
920	if ((zp->z_pflags & ZFS_IMMUTABLE) ||
921	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
922	    (uio->uio_loffset < zp->z_size))) {
923		ZFS_EXIT(zfsvfs);
924		return (SET_ERROR(EPERM));
925	}
926
927	zilog = zfsvfs->z_log;
928
929	/*
930	 * Validate file offset
931	 */
932	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
933	if (woff < 0) {
934		ZFS_EXIT(zfsvfs);
935		return (SET_ERROR(EINVAL));
936	}
937
938	/*
939	 * Check for mandatory locks before calling rangelock_enter()
940	 * in order to prevent a deadlock with locks set via fcntl().
941	 */
942	if (MANDMODE((mode_t)zp->z_mode) &&
943	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
944		ZFS_EXIT(zfsvfs);
945		return (error);
946	}
947
948#ifdef illumos
949	/*
950	 * Pre-fault the pages to ensure slow (eg NFS) pages
951	 * don't hold up txg.
952	 * Skip this if uio contains loaned arc_buf.
953	 */
954	if ((uio->uio_extflg == UIO_XUIO) &&
955	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
956		xuio = (xuio_t *)uio;
957	else
958		uio_prefaultpages(MIN(n, max_blksz), uio);
959#endif
960
961	/*
962	 * If in append mode, set the io offset pointer to eof.
963	 */
964	locked_range_t *lr;
965	if (ioflag & FAPPEND) {
966		/*
967		 * Obtain an appending range lock to guarantee file append
968		 * semantics.  We reset the write offset once we have the lock.
969		 */
970		lr = rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND);
971		woff = lr->lr_offset;
972		if (lr->lr_length == UINT64_MAX) {
973			/*
974			 * We overlocked the file because this write will cause
975			 * the file block size to increase.
976			 * Note that zp_size cannot change with this lock held.
977			 */
978			woff = zp->z_size;
979		}
980		uio->uio_loffset = woff;
981	} else {
982		/*
983		 * Note that if the file block size will change as a result of
984		 * this write, then this range lock will lock the entire file
985		 * so that we can re-write the block safely.
986		 */
987		lr = rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
988	}
989
990	if (vn_rlimit_fsize(vp, uio, uio->uio_td)) {
991		rangelock_exit(lr);
992		ZFS_EXIT(zfsvfs);
993		return (EFBIG);
994	}
995
996	if (woff >= limit) {
997		rangelock_exit(lr);
998		ZFS_EXIT(zfsvfs);
999		return (SET_ERROR(EFBIG));
1000	}
1001
1002	if ((woff + n) > limit || woff > (limit - n))
1003		n = limit - woff;
1004
1005	/* Will this write extend the file length? */
1006	write_eof = (woff + n > zp->z_size);
1007
1008	end_size = MAX(zp->z_size, woff + n);
1009
1010	/*
1011	 * Write the file in reasonable size chunks.  Each chunk is written
1012	 * in a separate transaction; this keeps the intent log records small
1013	 * and allows us to do more fine-grained space accounting.
1014	 */
1015	while (n > 0) {
1016		abuf = NULL;
1017		woff = uio->uio_loffset;
1018		if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
1019		    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
1020			if (abuf != NULL)
1021				dmu_return_arcbuf(abuf);
1022			error = SET_ERROR(EDQUOT);
1023			break;
1024		}
1025
1026		if (xuio && abuf == NULL) {
1027			ASSERT(i_iov < iovcnt);
1028			aiov = &iovp[i_iov];
1029			abuf = dmu_xuio_arcbuf(xuio, i_iov);
1030			dmu_xuio_clear(xuio, i_iov);
1031			DTRACE_PROBE3(zfs_cp_write, int, i_iov,
1032			    iovec_t *, aiov, arc_buf_t *, abuf);
1033			ASSERT((aiov->iov_base == abuf->b_data) ||
1034			    ((char *)aiov->iov_base - (char *)abuf->b_data +
1035			    aiov->iov_len == arc_buf_size(abuf)));
1036			i_iov++;
1037		} else if (abuf == NULL && n >= max_blksz &&
1038		    woff >= zp->z_size &&
1039		    P2PHASE(woff, max_blksz) == 0 &&
1040		    zp->z_blksz == max_blksz) {
1041			/*
1042			 * This write covers a full block.  "Borrow" a buffer
1043			 * from the dmu so that we can fill it before we enter
1044			 * a transaction.  This avoids the possibility of
1045			 * holding up the transaction if the data copy hangs
1046			 * up on a pagefault (e.g., from an NFS server mapping).
1047			 */
1048			size_t cbytes;
1049
1050			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
1051			    max_blksz);
1052			ASSERT(abuf != NULL);
1053			ASSERT(arc_buf_size(abuf) == max_blksz);
1054			if (error = uiocopy(abuf->b_data, max_blksz,
1055			    UIO_WRITE, uio, &cbytes)) {
1056				dmu_return_arcbuf(abuf);
1057				break;
1058			}
1059			ASSERT(cbytes == max_blksz);
1060		}
1061
1062		/*
1063		 * Start a transaction.
1064		 */
1065		tx = dmu_tx_create(zfsvfs->z_os);
1066		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1067		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
1068		zfs_sa_upgrade_txholds(tx, zp);
1069		error = dmu_tx_assign(tx, TXG_WAIT);
1070		if (error) {
1071			dmu_tx_abort(tx);
1072			if (abuf != NULL)
1073				dmu_return_arcbuf(abuf);
1074			break;
1075		}
1076
1077		/*
1078		 * If rangelock_enter() over-locked we grow the blocksize
1079		 * and then reduce the lock range.  This will only happen
1080		 * on the first iteration since rangelock_reduce() will
1081		 * shrink down lr_length to the appropriate size.
1082		 */
1083		if (lr->lr_length == UINT64_MAX) {
1084			uint64_t new_blksz;
1085
1086			if (zp->z_blksz > max_blksz) {
1087				/*
1088				 * File's blocksize is already larger than the
1089				 * "recordsize" property.  Only let it grow to
1090				 * the next power of 2.
1091				 */
1092				ASSERT(!ISP2(zp->z_blksz));
1093				new_blksz = MIN(end_size,
1094				    1 << highbit64(zp->z_blksz));
1095			} else {
1096				new_blksz = MIN(end_size, max_blksz);
1097			}
1098			zfs_grow_blocksize(zp, new_blksz, tx);
1099			rangelock_reduce(lr, woff, n);
1100		}
1101
1102		/*
1103		 * XXX - should we really limit each write to z_max_blksz?
1104		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
1105		 */
1106		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
1107
1108		if (woff + nbytes > zp->z_size)
1109			vnode_pager_setsize(vp, woff + nbytes);
1110
1111		if (abuf == NULL) {
1112			tx_bytes = uio->uio_resid;
1113			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
1114			    uio, nbytes, tx);
1115			tx_bytes -= uio->uio_resid;
1116		} else {
1117			tx_bytes = nbytes;
1118			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
1119			/*
1120			 * If this is not a full block write, but we are
1121			 * extending the file past EOF and this data starts
1122			 * block-aligned, use assign_arcbuf().  Otherwise,
1123			 * write via dmu_write().
1124			 */
1125			if (tx_bytes < max_blksz && (!write_eof ||
1126			    aiov->iov_base != abuf->b_data)) {
1127				ASSERT(xuio);
1128				dmu_write(zfsvfs->z_os, zp->z_id, woff,
1129				    aiov->iov_len, aiov->iov_base, tx);
1130				dmu_return_arcbuf(abuf);
1131				xuio_stat_wbuf_copied();
1132			} else {
1133				ASSERT(xuio || tx_bytes == max_blksz);
1134				dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
1135				    woff, abuf, tx);
1136			}
1137			ASSERT(tx_bytes <= uio->uio_resid);
1138			uioskip(uio, tx_bytes);
1139		}
1140		if (tx_bytes && vn_has_cached_data(vp)) {
1141			update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
1142			    zp->z_id, uio->uio_segflg, tx);
1143		}
1144
1145		/*
1146		 * If we made no progress, we're done.  If we made even
1147		 * partial progress, update the znode and ZIL accordingly.
1148		 */
1149		if (tx_bytes == 0) {
1150			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
1151			    (void *)&zp->z_size, sizeof (uint64_t), tx);
1152			dmu_tx_commit(tx);
1153			ASSERT(error != 0);
1154			break;
1155		}
1156
1157		/*
1158		 * Clear Set-UID/Set-GID bits on successful write if not
1159		 * privileged and at least one of the excute bits is set.
1160		 *
1161		 * It would be nice to to this after all writes have
1162		 * been done, but that would still expose the ISUID/ISGID
1163		 * to another app after the partial write is committed.
1164		 *
1165		 * Note: we don't call zfs_fuid_map_id() here because
1166		 * user 0 is not an ephemeral uid.
1167		 */
1168		mutex_enter(&zp->z_acl_lock);
1169		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
1170		    (S_IXUSR >> 6))) != 0 &&
1171		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
1172		    secpolicy_vnode_setid_retain(vp, cr,
1173		    (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
1174			uint64_t newmode;
1175			zp->z_mode &= ~(S_ISUID | S_ISGID);
1176			newmode = zp->z_mode;
1177			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
1178			    (void *)&newmode, sizeof (uint64_t), tx);
1179		}
1180		mutex_exit(&zp->z_acl_lock);
1181
1182		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
1183		    B_TRUE);
1184
1185		/*
1186		 * Update the file size (zp_size) if it has changed;
1187		 * account for possible concurrent updates.
1188		 */
1189		while ((end_size = zp->z_size) < uio->uio_loffset) {
1190			(void) atomic_cas_64(&zp->z_size, end_size,
1191			    uio->uio_loffset);
1192#ifdef illumos
1193			ASSERT(error == 0);
1194#else
1195			ASSERT(error == 0 || error == EFAULT);
1196#endif
1197		}
1198		/*
1199		 * If we are replaying and eof is non zero then force
1200		 * the file size to the specified eof. Note, there's no
1201		 * concurrency during replay.
1202		 */
1203		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
1204			zp->z_size = zfsvfs->z_replay_eof;
1205
1206		if (error == 0)
1207			error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1208		else
1209			(void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1210
1211		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
1212		dmu_tx_commit(tx);
1213
1214		if (error != 0)
1215			break;
1216		ASSERT(tx_bytes == nbytes);
1217		n -= nbytes;
1218
1219#ifdef illumos
1220		if (!xuio && n > 0)
1221			uio_prefaultpages(MIN(n, max_blksz), uio);
1222#endif
1223	}
1224
1225	rangelock_exit(lr);
1226
1227	/*
1228	 * If we're in replay mode, or we made no progress, return error.
1229	 * Otherwise, it's at least a partial write, so it's successful.
1230	 */
1231	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
1232		ZFS_EXIT(zfsvfs);
1233		return (error);
1234	}
1235
1236#ifdef __FreeBSD__
1237	/*
1238	 * EFAULT means that at least one page of the source buffer was not
1239	 * available.  VFS will re-try remaining I/O upon this error.
1240	 */
1241	if (error == EFAULT) {
1242		ZFS_EXIT(zfsvfs);
1243		return (error);
1244	}
1245#endif
1246
1247	if (ioflag & (FSYNC | FDSYNC) ||
1248	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1249		zil_commit(zilog, zp->z_id);
1250
1251	nwritten = start_resid - uio->uio_resid;
1252	dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten);
1253
1254	ZFS_EXIT(zfsvfs);
1255	return (0);
1256}
1257
1258/* ARGSUSED */
1259void
1260zfs_get_done(zgd_t *zgd, int error)
1261{
1262	znode_t *zp = zgd->zgd_private;
1263	objset_t *os = zp->z_zfsvfs->z_os;
1264
1265	if (zgd->zgd_db)
1266		dmu_buf_rele(zgd->zgd_db, zgd);
1267
1268	rangelock_exit(zgd->zgd_lr);
1269
1270	/*
1271	 * Release the vnode asynchronously as we currently have the
1272	 * txg stopped from syncing.
1273	 */
1274	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1275
1276	kmem_free(zgd, sizeof (zgd_t));
1277}
1278
1279#ifdef DEBUG
1280static int zil_fault_io = 0;
1281#endif
1282
1283/*
1284 * Get data to generate a TX_WRITE intent log record.
1285 */
1286int
1287zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
1288{
1289	zfsvfs_t *zfsvfs = arg;
1290	objset_t *os = zfsvfs->z_os;
1291	znode_t *zp;
1292	uint64_t object = lr->lr_foid;
1293	uint64_t offset = lr->lr_offset;
1294	uint64_t size = lr->lr_length;
1295	dmu_buf_t *db;
1296	zgd_t *zgd;
1297	int error = 0;
1298
1299	ASSERT3P(lwb, !=, NULL);
1300	ASSERT3P(zio, !=, NULL);
1301	ASSERT3U(size, !=, 0);
1302
1303	/*
1304	 * Nothing to do if the file has been removed
1305	 */
1306	if (zfs_zget(zfsvfs, object, &zp) != 0)
1307		return (SET_ERROR(ENOENT));
1308	if (zp->z_unlinked) {
1309		/*
1310		 * Release the vnode asynchronously as we currently have the
1311		 * txg stopped from syncing.
1312		 */
1313		VN_RELE_ASYNC(ZTOV(zp),
1314		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1315		return (SET_ERROR(ENOENT));
1316	}
1317
1318	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
1319	zgd->zgd_lwb = lwb;
1320	zgd->zgd_private = zp;
1321
1322	/*
1323	 * Write records come in two flavors: immediate and indirect.
1324	 * For small writes it's cheaper to store the data with the
1325	 * log record (immediate); for large writes it's cheaper to
1326	 * sync the data and get a pointer to it (indirect) so that
1327	 * we don't have to write the data twice.
1328	 */
1329	if (buf != NULL) { /* immediate write */
1330		zgd->zgd_lr = rangelock_enter(&zp->z_rangelock,
1331		    offset, size, RL_READER);
1332		/* test for truncation needs to be done while range locked */
1333		if (offset >= zp->z_size) {
1334			error = SET_ERROR(ENOENT);
1335		} else {
1336			error = dmu_read(os, object, offset, size, buf,
1337			    DMU_READ_NO_PREFETCH);
1338		}
1339		ASSERT(error == 0 || error == ENOENT);
1340	} else { /* indirect write */
1341		/*
1342		 * Have to lock the whole block to ensure when it's
1343		 * written out and its checksum is being calculated
1344		 * that no one can change the data. We need to re-check
1345		 * blocksize after we get the lock in case it's changed!
1346		 */
1347		for (;;) {
1348			uint64_t blkoff;
1349			size = zp->z_blksz;
1350			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
1351			offset -= blkoff;
1352			zgd->zgd_lr = rangelock_enter(&zp->z_rangelock,
1353			    offset, size, RL_READER);
1354			if (zp->z_blksz == size)
1355				break;
1356			offset += blkoff;
1357			rangelock_exit(zgd->zgd_lr);
1358		}
1359		/* test for truncation needs to be done while range locked */
1360		if (lr->lr_offset >= zp->z_size)
1361			error = SET_ERROR(ENOENT);
1362#ifdef DEBUG
1363		if (zil_fault_io) {
1364			error = SET_ERROR(EIO);
1365			zil_fault_io = 0;
1366		}
1367#endif
1368		if (error == 0)
1369			error = dmu_buf_hold(os, object, offset, zgd, &db,
1370			    DMU_READ_NO_PREFETCH);
1371
1372		if (error == 0) {
1373			blkptr_t *bp = &lr->lr_blkptr;
1374
1375			zgd->zgd_db = db;
1376			zgd->zgd_bp = bp;
1377
1378			ASSERT(db->db_offset == offset);
1379			ASSERT(db->db_size == size);
1380
1381			error = dmu_sync(zio, lr->lr_common.lrc_txg,
1382			    zfs_get_done, zgd);
1383			ASSERT(error || lr->lr_length <= size);
1384
1385			/*
1386			 * On success, we need to wait for the write I/O
1387			 * initiated by dmu_sync() to complete before we can
1388			 * release this dbuf.  We will finish everything up
1389			 * in the zfs_get_done() callback.
1390			 */
1391			if (error == 0)
1392				return (0);
1393
1394			if (error == EALREADY) {
1395				lr->lr_common.lrc_txtype = TX_WRITE2;
1396				/*
1397				 * TX_WRITE2 relies on the data previously
1398				 * written by the TX_WRITE that caused
1399				 * EALREADY.  We zero out the BP because
1400				 * it is the old, currently-on-disk BP.
1401				 */
1402				zgd->zgd_bp = NULL;
1403				BP_ZERO(bp);
1404				error = 0;
1405			}
1406		}
1407	}
1408
1409	zfs_get_done(zgd, error);
1410
1411	return (error);
1412}
1413
1414/*ARGSUSED*/
1415static int
1416zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
1417    caller_context_t *ct)
1418{
1419	znode_t *zp = VTOZ(vp);
1420	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1421	int error;
1422
1423	ZFS_ENTER(zfsvfs);
1424	ZFS_VERIFY_ZP(zp);
1425
1426	if (flag & V_ACE_MASK)
1427		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
1428	else
1429		error = zfs_zaccess_rwx(zp, mode, flag, cr);
1430
1431	ZFS_EXIT(zfsvfs);
1432	return (error);
1433}
1434
1435static int
1436zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
1437{
1438	int error;
1439
1440	*vpp = arg;
1441	error = vn_lock(*vpp, lkflags);
1442	if (error != 0)
1443		vrele(*vpp);
1444	return (error);
1445}
1446
1447static int
1448zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
1449{
1450	znode_t *zdp = VTOZ(dvp);
1451	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1452	int error;
1453	int ltype;
1454
1455	ASSERT_VOP_LOCKED(dvp, __func__);
1456#ifdef DIAGNOSTIC
1457	if ((zdp->z_pflags & ZFS_XATTR) == 0)
1458		VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
1459#endif
1460
1461	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
1462		ASSERT3P(dvp, ==, vp);
1463		vref(dvp);
1464		ltype = lkflags & LK_TYPE_MASK;
1465		if (ltype != VOP_ISLOCKED(dvp)) {
1466			if (ltype == LK_EXCLUSIVE)
1467				vn_lock(dvp, LK_UPGRADE | LK_RETRY);
1468			else /* if (ltype == LK_SHARED) */
1469				vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
1470
1471			/*
1472			 * Relock for the "." case could leave us with
1473			 * reclaimed vnode.
1474			 */
1475			if (dvp->v_iflag & VI_DOOMED) {
1476				vrele(dvp);
1477				return (SET_ERROR(ENOENT));
1478			}
1479		}
1480		return (0);
1481	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
1482		/*
1483		 * Note that in this case, dvp is the child vnode, and we
1484		 * are looking up the parent vnode - exactly reverse from
1485		 * normal operation.  Unlocking dvp requires some rather
1486		 * tricky unlock/relock dance to prevent mp from being freed;
1487		 * use vn_vget_ino_gen() which takes care of all that.
1488		 *
1489		 * XXX Note that there is a time window when both vnodes are
1490		 * unlocked.  It is possible, although highly unlikely, that
1491		 * during that window the parent-child relationship between
1492		 * the vnodes may change, for example, get reversed.
1493		 * In that case we would have a wrong lock order for the vnodes.
1494		 * All other filesystems seem to ignore this problem, so we
1495		 * do the same here.
1496		 * A potential solution could be implemented as follows:
1497		 * - using LK_NOWAIT when locking the second vnode and retrying
1498		 *   if necessary
1499		 * - checking that the parent-child relationship still holds
1500		 *   after locking both vnodes and retrying if it doesn't
1501		 */
1502		error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
1503		return (error);
1504	} else {
1505		error = vn_lock(vp, lkflags);
1506		if (error != 0)
1507			vrele(vp);
1508		return (error);
1509	}
1510}
1511
1512/*
1513 * Lookup an entry in a directory, or an extended attribute directory.
1514 * If it exists, return a held vnode reference for it.
1515 *
1516 *	IN:	dvp	- vnode of directory to search.
1517 *		nm	- name of entry to lookup.
1518 *		pnp	- full pathname to lookup [UNUSED].
1519 *		flags	- LOOKUP_XATTR set if looking for an attribute.
1520 *		rdir	- root directory vnode [UNUSED].
1521 *		cr	- credentials of caller.
1522 *		ct	- caller context
1523 *
1524 *	OUT:	vpp	- vnode of located entry, NULL if not found.
1525 *
1526 *	RETURN:	0 on success, error code on failure.
1527 *
1528 * Timestamps:
1529 *	NA
1530 */
1531/* ARGSUSED */
1532static int
1533zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
1534    int nameiop, cred_t *cr, kthread_t *td, int flags, boolean_t cached)
1535{
1536	znode_t *zdp = VTOZ(dvp);
1537	znode_t *zp;
1538	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1539	int	error = 0;
1540
1541	/*
1542	 * Fast path lookup, however we must skip DNLC lookup
1543	 * for case folding or normalizing lookups because the
1544	 * DNLC code only stores the passed in name.  This means
1545	 * creating 'a' and removing 'A' on a case insensitive
1546	 * file system would work, but DNLC still thinks 'a'
1547	 * exists and won't let you create it again on the next
1548	 * pass through fast path.
1549	 */
1550	if (!(flags & LOOKUP_XATTR)) {
1551		if (dvp->v_type != VDIR) {
1552			return (SET_ERROR(ENOTDIR));
1553		} else if (zdp->z_sa_hdl == NULL) {
1554			return (SET_ERROR(EIO));
1555		}
1556	}
1557
1558	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
1559
1560	ZFS_ENTER(zfsvfs);
1561	ZFS_VERIFY_ZP(zdp);
1562
1563	*vpp = NULL;
1564
1565	if (flags & LOOKUP_XATTR) {
1566#ifdef TODO
1567		/*
1568		 * If the xattr property is off, refuse the lookup request.
1569		 */
1570		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
1571			ZFS_EXIT(zfsvfs);
1572			return (SET_ERROR(EINVAL));
1573		}
1574#endif
1575
1576		/*
1577		 * We don't allow recursive attributes..
1578		 * Maybe someday we will.
1579		 */
1580		if (zdp->z_pflags & ZFS_XATTR) {
1581			ZFS_EXIT(zfsvfs);
1582			return (SET_ERROR(EINVAL));
1583		}
1584
1585		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
1586			ZFS_EXIT(zfsvfs);
1587			return (error);
1588		}
1589
1590		/*
1591		 * Do we have permission to get into attribute directory?
1592		 */
1593		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
1594		    B_FALSE, cr)) {
1595			vrele(*vpp);
1596			*vpp = NULL;
1597		}
1598
1599		ZFS_EXIT(zfsvfs);
1600		return (error);
1601	}
1602
1603	/*
1604	 * Check accessibility of directory.
1605	 */
1606	if (!cached) {
1607		if ((cnp->cn_flags & NOEXECCHECK) != 0) {
1608			cnp->cn_flags &= ~NOEXECCHECK;
1609		} else {
1610			error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr);
1611			if (error != 0) {
1612				ZFS_EXIT(zfsvfs);
1613				return (error);
1614			}
1615		}
1616	}
1617
1618	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
1619	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1620		ZFS_EXIT(zfsvfs);
1621		return (SET_ERROR(EILSEQ));
1622	}
1623
1624
1625	/*
1626	 * First handle the special cases.
1627	 */
1628	if ((cnp->cn_flags & ISDOTDOT) != 0) {
1629		/*
1630		 * If we are a snapshot mounted under .zfs, return
1631		 * the vp for the snapshot directory.
1632		 */
1633		if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
1634			struct componentname cn;
1635			vnode_t *zfsctl_vp;
1636			int ltype;
1637
1638			ZFS_EXIT(zfsvfs);
1639			ltype = VOP_ISLOCKED(dvp);
1640			VOP_UNLOCK(dvp, 0);
1641			error = zfsctl_root(zfsvfs->z_parent, LK_SHARED,
1642			    &zfsctl_vp);
1643			if (error == 0) {
1644				cn.cn_nameptr = "snapshot";
1645				cn.cn_namelen = strlen(cn.cn_nameptr);
1646				cn.cn_nameiop = cnp->cn_nameiop;
1647				cn.cn_flags = cnp->cn_flags & ~ISDOTDOT;
1648				cn.cn_lkflags = cnp->cn_lkflags;
1649				error = VOP_LOOKUP(zfsctl_vp, vpp, &cn);
1650				vput(zfsctl_vp);
1651			}
1652			vn_lock(dvp, ltype | LK_RETRY);
1653			return (error);
1654		}
1655	}
1656	if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
1657		ZFS_EXIT(zfsvfs);
1658		if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
1659			return (SET_ERROR(ENOTSUP));
1660		error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp);
1661		return (error);
1662	}
1663
1664	/*
1665	 * The loop is retry the lookup if the parent-child relationship
1666	 * changes during the dot-dot locking complexities.
1667	 */
1668	for (;;) {
1669		uint64_t parent;
1670
1671		error = zfs_dirlook(zdp, nm, &zp);
1672		if (error == 0)
1673			*vpp = ZTOV(zp);
1674
1675		ZFS_EXIT(zfsvfs);
1676		if (error != 0)
1677			break;
1678
1679		error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
1680		if (error != 0) {
1681			/*
1682			 * If we've got a locking error, then the vnode
1683			 * got reclaimed because of a force unmount.
1684			 * We never enter doomed vnodes into the name cache.
1685			 */
1686			*vpp = NULL;
1687			return (error);
1688		}
1689
1690		if ((cnp->cn_flags & ISDOTDOT) == 0)
1691			break;
1692
1693		ZFS_ENTER(zfsvfs);
1694		if (zdp->z_sa_hdl == NULL) {
1695			error = SET_ERROR(EIO);
1696		} else {
1697			error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
1698			    &parent, sizeof (parent));
1699		}
1700		if (error != 0) {
1701			ZFS_EXIT(zfsvfs);
1702			vput(ZTOV(zp));
1703			break;
1704		}
1705		if (zp->z_id == parent) {
1706			ZFS_EXIT(zfsvfs);
1707			break;
1708		}
1709		vput(ZTOV(zp));
1710	}
1711
1712out:
1713	if (error != 0)
1714		*vpp = NULL;
1715
1716	/* Translate errors and add SAVENAME when needed. */
1717	if (cnp->cn_flags & ISLASTCN) {
1718		switch (nameiop) {
1719		case CREATE:
1720		case RENAME:
1721			if (error == ENOENT) {
1722				error = EJUSTRETURN;
1723				cnp->cn_flags |= SAVENAME;
1724				break;
1725			}
1726			/* FALLTHROUGH */
1727		case DELETE:
1728			if (error == 0)
1729				cnp->cn_flags |= SAVENAME;
1730			break;
1731		}
1732	}
1733
1734	/* Insert name into cache (as non-existent) if appropriate. */
1735	if (zfsvfs->z_use_namecache &&
1736	    error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
1737		cache_enter(dvp, NULL, cnp);
1738
1739	/* Insert name into cache if appropriate. */
1740	if (zfsvfs->z_use_namecache &&
1741	    error == 0 && (cnp->cn_flags & MAKEENTRY)) {
1742		if (!(cnp->cn_flags & ISLASTCN) ||
1743		    (nameiop != DELETE && nameiop != RENAME)) {
1744			cache_enter(dvp, *vpp, cnp);
1745		}
1746	}
1747
1748	return (error);
1749}
1750
1751/*
1752 * Attempt to create a new entry in a directory.  If the entry
1753 * already exists, truncate the file if permissible, else return
1754 * an error.  Return the vp of the created or trunc'd file.
1755 *
1756 *	IN:	dvp	- vnode of directory to put new file entry in.
1757 *		name	- name of new file entry.
1758 *		vap	- attributes of new file.
1759 *		excl	- flag indicating exclusive or non-exclusive mode.
1760 *		mode	- mode to open file with.
1761 *		cr	- credentials of caller.
1762 *		flag	- large file flag [UNUSED].
1763 *		ct	- caller context
1764 *		vsecp	- ACL to be set
1765 *
1766 *	OUT:	vpp	- vnode of created or trunc'd entry.
1767 *
1768 *	RETURN:	0 on success, error code on failure.
1769 *
1770 * Timestamps:
1771 *	dvp - ctime|mtime updated if new entry created
1772 *	 vp - ctime|mtime always, atime if new
1773 */
1774
1775/* ARGSUSED */
1776static int
1777zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
1778    vnode_t **vpp, cred_t *cr, kthread_t *td)
1779{
1780	znode_t		*zp, *dzp = VTOZ(dvp);
1781	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1782	zilog_t		*zilog;
1783	objset_t	*os;
1784	dmu_tx_t	*tx;
1785	int		error;
1786	ksid_t		*ksid;
1787	uid_t		uid;
1788	gid_t		gid = crgetgid(cr);
1789	zfs_acl_ids_t   acl_ids;
1790	boolean_t	fuid_dirtied;
1791	void		*vsecp = NULL;
1792	int		flag = 0;
1793	uint64_t	txtype;
1794
1795	/*
1796	 * If we have an ephemeral id, ACL, or XVATTR then
1797	 * make sure file system is at proper version
1798	 */
1799
1800	ksid = crgetsid(cr, KSID_OWNER);
1801	if (ksid)
1802		uid = ksid_getid(ksid);
1803	else
1804		uid = crgetuid(cr);
1805
1806	if (zfsvfs->z_use_fuids == B_FALSE &&
1807	    (vsecp || (vap->va_mask & AT_XVATTR) ||
1808	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1809		return (SET_ERROR(EINVAL));
1810
1811	ZFS_ENTER(zfsvfs);
1812	ZFS_VERIFY_ZP(dzp);
1813	os = zfsvfs->z_os;
1814	zilog = zfsvfs->z_log;
1815
1816	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
1817	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1818		ZFS_EXIT(zfsvfs);
1819		return (SET_ERROR(EILSEQ));
1820	}
1821
1822	if (vap->va_mask & AT_XVATTR) {
1823		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
1824		    crgetuid(cr), cr, vap->va_type)) != 0) {
1825			ZFS_EXIT(zfsvfs);
1826			return (error);
1827		}
1828	}
1829
1830	*vpp = NULL;
1831
1832	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
1833		vap->va_mode &= ~S_ISVTX;
1834
1835	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
1836	if (error) {
1837		ZFS_EXIT(zfsvfs);
1838		return (error);
1839	}
1840	ASSERT3P(zp, ==, NULL);
1841
1842	/*
1843	 * Create a new file object and update the directory
1844	 * to reference it.
1845	 */
1846	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
1847		goto out;
1848	}
1849
1850	/*
1851	 * We only support the creation of regular files in
1852	 * extended attribute directories.
1853	 */
1854
1855	if ((dzp->z_pflags & ZFS_XATTR) &&
1856	    (vap->va_type != VREG)) {
1857		error = SET_ERROR(EINVAL);
1858		goto out;
1859	}
1860
1861	if ((error = zfs_acl_ids_create(dzp, 0, vap,
1862	    cr, vsecp, &acl_ids)) != 0)
1863		goto out;
1864
1865	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1866		zfs_acl_ids_free(&acl_ids);
1867		error = SET_ERROR(EDQUOT);
1868		goto out;
1869	}
1870
1871	getnewvnode_reserve(1);
1872
1873	tx = dmu_tx_create(os);
1874
1875	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1876	    ZFS_SA_BASE_ATTR_SIZE);
1877
1878	fuid_dirtied = zfsvfs->z_fuid_dirty;
1879	if (fuid_dirtied)
1880		zfs_fuid_txhold(zfsvfs, tx);
1881	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1882	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
1883	if (!zfsvfs->z_use_sa &&
1884	    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1885		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1886		    0, acl_ids.z_aclp->z_acl_bytes);
1887	}
1888	error = dmu_tx_assign(tx, TXG_WAIT);
1889	if (error) {
1890		zfs_acl_ids_free(&acl_ids);
1891		dmu_tx_abort(tx);
1892		getnewvnode_drop_reserve();
1893		ZFS_EXIT(zfsvfs);
1894		return (error);
1895	}
1896	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1897
1898	if (fuid_dirtied)
1899		zfs_fuid_sync(zfsvfs, tx);
1900
1901	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
1902	txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1903	zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1904	    vsecp, acl_ids.z_fuidp, vap);
1905	zfs_acl_ids_free(&acl_ids);
1906	dmu_tx_commit(tx);
1907
1908	getnewvnode_drop_reserve();
1909
1910out:
1911	if (error == 0) {
1912		*vpp = ZTOV(zp);
1913	}
1914
1915	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1916		zil_commit(zilog, 0);
1917
1918	ZFS_EXIT(zfsvfs);
1919	return (error);
1920}
1921
1922/*
1923 * Remove an entry from a directory.
1924 *
1925 *	IN:	dvp	- vnode of directory to remove entry from.
1926 *		name	- name of entry to remove.
1927 *		cr	- credentials of caller.
1928 *		ct	- caller context
1929 *		flags	- case flags
1930 *
1931 *	RETURN:	0 on success, error code on failure.
1932 *
1933 * Timestamps:
1934 *	dvp - ctime|mtime
1935 *	 vp - ctime (if nlink > 0)
1936 */
1937
1938/*ARGSUSED*/
1939static int
1940zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
1941{
1942	znode_t		*dzp = VTOZ(dvp);
1943	znode_t		*zp = VTOZ(vp);
1944	znode_t		*xzp;
1945	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1946	zilog_t		*zilog;
1947	uint64_t	acl_obj, xattr_obj;
1948	uint64_t	obj = 0;
1949	dmu_tx_t	*tx;
1950	boolean_t	unlinked, toobig = FALSE;
1951	uint64_t	txtype;
1952	int		error;
1953
1954	ZFS_ENTER(zfsvfs);
1955	ZFS_VERIFY_ZP(dzp);
1956	ZFS_VERIFY_ZP(zp);
1957	zilog = zfsvfs->z_log;
1958	zp = VTOZ(vp);
1959
1960	xattr_obj = 0;
1961	xzp = NULL;
1962
1963	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1964		goto out;
1965	}
1966
1967	/*
1968	 * Need to use rmdir for removing directories.
1969	 */
1970	if (vp->v_type == VDIR) {
1971		error = SET_ERROR(EPERM);
1972		goto out;
1973	}
1974
1975	vnevent_remove(vp, dvp, name, ct);
1976
1977	obj = zp->z_id;
1978
1979	/* are there any extended attributes? */
1980	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1981	    &xattr_obj, sizeof (xattr_obj));
1982	if (error == 0 && xattr_obj) {
1983		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1984		ASSERT0(error);
1985	}
1986
1987	/*
1988	 * We may delete the znode now, or we may put it in the unlinked set;
1989	 * it depends on whether we're the last link, and on whether there are
1990	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
1991	 * allow for either case.
1992	 */
1993	tx = dmu_tx_create(zfsvfs->z_os);
1994	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1995	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1996	zfs_sa_upgrade_txholds(tx, zp);
1997	zfs_sa_upgrade_txholds(tx, dzp);
1998
1999	if (xzp) {
2000		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
2001		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
2002	}
2003
2004	/* charge as an update -- would be nice not to charge at all */
2005	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2006
2007	/*
2008	 * Mark this transaction as typically resulting in a net free of space
2009	 */
2010	dmu_tx_mark_netfree(tx);
2011
2012	error = dmu_tx_assign(tx, TXG_WAIT);
2013	if (error) {
2014		dmu_tx_abort(tx);
2015		ZFS_EXIT(zfsvfs);
2016		return (error);
2017	}
2018
2019	/*
2020	 * Remove the directory entry.
2021	 */
2022	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);
2023
2024	if (error) {
2025		dmu_tx_commit(tx);
2026		goto out;
2027	}
2028
2029	if (unlinked) {
2030		zfs_unlinked_add(zp, tx);
2031		vp->v_vflag |= VV_NOSYNC;
2032	}
2033
2034	txtype = TX_REMOVE;
2035	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
2036
2037	dmu_tx_commit(tx);
2038out:
2039
2040	if (xzp)
2041		vrele(ZTOV(xzp));
2042
2043	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2044		zil_commit(zilog, 0);
2045
2046	ZFS_EXIT(zfsvfs);
2047	return (error);
2048}
2049
2050/*
2051 * Create a new directory and insert it into dvp using the name
2052 * provided.  Return a pointer to the inserted directory.
2053 *
2054 *	IN:	dvp	- vnode of directory to add subdir to.
2055 *		dirname	- name of new directory.
2056 *		vap	- attributes of new directory.
2057 *		cr	- credentials of caller.
2058 *		ct	- caller context
2059 *		flags	- case flags
2060 *		vsecp	- ACL to be set
2061 *
2062 *	OUT:	vpp	- vnode of created directory.
2063 *
2064 *	RETURN:	0 on success, error code on failure.
2065 *
2066 * Timestamps:
2067 *	dvp - ctime|mtime updated
2068 *	 vp - ctime|mtime|atime updated
2069 */
2070/*ARGSUSED*/
2071static int
2072zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
2073{
2074	znode_t		*zp, *dzp = VTOZ(dvp);
2075	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2076	zilog_t		*zilog;
2077	uint64_t	txtype;
2078	dmu_tx_t	*tx;
2079	int		error;
2080	ksid_t		*ksid;
2081	uid_t		uid;
2082	gid_t		gid = crgetgid(cr);
2083	zfs_acl_ids_t   acl_ids;
2084	boolean_t	fuid_dirtied;
2085
2086	ASSERT(vap->va_type == VDIR);
2087
2088	/*
2089	 * If we have an ephemeral id, ACL, or XVATTR then
2090	 * make sure file system is at proper version
2091	 */
2092
2093	ksid = crgetsid(cr, KSID_OWNER);
2094	if (ksid)
2095		uid = ksid_getid(ksid);
2096	else
2097		uid = crgetuid(cr);
2098	if (zfsvfs->z_use_fuids == B_FALSE &&
2099	    ((vap->va_mask & AT_XVATTR) ||
2100	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
2101		return (SET_ERROR(EINVAL));
2102
2103	ZFS_ENTER(zfsvfs);
2104	ZFS_VERIFY_ZP(dzp);
2105	zilog = zfsvfs->z_log;
2106
2107	if (dzp->z_pflags & ZFS_XATTR) {
2108		ZFS_EXIT(zfsvfs);
2109		return (SET_ERROR(EINVAL));
2110	}
2111
2112	if (zfsvfs->z_utf8 && u8_validate(dirname,
2113	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
2114		ZFS_EXIT(zfsvfs);
2115		return (SET_ERROR(EILSEQ));
2116	}
2117
2118	if (vap->va_mask & AT_XVATTR) {
2119		if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
2120		    crgetuid(cr), cr, vap->va_type)) != 0) {
2121			ZFS_EXIT(zfsvfs);
2122			return (error);
2123		}
2124	}
2125
2126	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
2127	    NULL, &acl_ids)) != 0) {
2128		ZFS_EXIT(zfsvfs);
2129		return (error);
2130	}
2131
2132	/*
2133	 * First make sure the new directory doesn't exist.
2134	 *
2135	 * Existence is checked first to make sure we don't return
2136	 * EACCES instead of EEXIST which can cause some applications
2137	 * to fail.
2138	 */
2139	*vpp = NULL;
2140
2141	if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) {
2142		zfs_acl_ids_free(&acl_ids);
2143		ZFS_EXIT(zfsvfs);
2144		return (error);
2145	}
2146	ASSERT3P(zp, ==, NULL);
2147
2148	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
2149		zfs_acl_ids_free(&acl_ids);
2150		ZFS_EXIT(zfsvfs);
2151		return (error);
2152	}
2153
2154	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
2155		zfs_acl_ids_free(&acl_ids);
2156		ZFS_EXIT(zfsvfs);
2157		return (SET_ERROR(EDQUOT));
2158	}
2159
2160	/*
2161	 * Add a new entry to the directory.
2162	 */
2163	getnewvnode_reserve(1);
2164	tx = dmu_tx_create(zfsvfs->z_os);
2165	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
2166	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
2167	fuid_dirtied = zfsvfs->z_fuid_dirty;
2168	if (fuid_dirtied)
2169		zfs_fuid_txhold(zfsvfs, tx);
2170	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2171		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
2172		    acl_ids.z_aclp->z_acl_bytes);
2173	}
2174
2175	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
2176	    ZFS_SA_BASE_ATTR_SIZE);
2177
2178	error = dmu_tx_assign(tx, TXG_WAIT);
2179	if (error) {
2180		zfs_acl_ids_free(&acl_ids);
2181		dmu_tx_abort(tx);
2182		getnewvnode_drop_reserve();
2183		ZFS_EXIT(zfsvfs);
2184		return (error);
2185	}
2186
2187	/*
2188	 * Create new node.
2189	 */
2190	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
2191
2192	if (fuid_dirtied)
2193		zfs_fuid_sync(zfsvfs, tx);
2194
2195	/*
2196	 * Now put new name in parent dir.
2197	 */
2198	(void) zfs_link_create(dzp, dirname, zp, tx, ZNEW);
2199
2200	*vpp = ZTOV(zp);
2201
2202	txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
2203	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
2204	    acl_ids.z_fuidp, vap);
2205
2206	zfs_acl_ids_free(&acl_ids);
2207
2208	dmu_tx_commit(tx);
2209
2210	getnewvnode_drop_reserve();
2211
2212	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2213		zil_commit(zilog, 0);
2214
2215	ZFS_EXIT(zfsvfs);
2216	return (0);
2217}
2218
2219/*
2220 * Remove a directory subdir entry.  If the current working
2221 * directory is the same as the subdir to be removed, the
2222 * remove will fail.
2223 *
2224 *	IN:	dvp	- vnode of directory to remove from.
2225 *		name	- name of directory to be removed.
2226 *		cwd	- vnode of current working directory.
2227 *		cr	- credentials of caller.
2228 *		ct	- caller context
2229 *		flags	- case flags
2230 *
2231 *	RETURN:	0 on success, error code on failure.
2232 *
2233 * Timestamps:
2234 *	dvp - ctime|mtime updated
2235 */
2236/*ARGSUSED*/
2237static int
2238zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
2239{
2240	znode_t		*dzp = VTOZ(dvp);
2241	znode_t		*zp = VTOZ(vp);
2242	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2243	zilog_t		*zilog;
2244	dmu_tx_t	*tx;
2245	int		error;
2246
2247	ZFS_ENTER(zfsvfs);
2248	ZFS_VERIFY_ZP(dzp);
2249	ZFS_VERIFY_ZP(zp);
2250	zilog = zfsvfs->z_log;
2251
2252
2253	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
2254		goto out;
2255	}
2256
2257	if (vp->v_type != VDIR) {
2258		error = SET_ERROR(ENOTDIR);
2259		goto out;
2260	}
2261
2262	vnevent_rmdir(vp, dvp, name, ct);
2263
2264	tx = dmu_tx_create(zfsvfs->z_os);
2265	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2266	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2267	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2268	zfs_sa_upgrade_txholds(tx, zp);
2269	zfs_sa_upgrade_txholds(tx, dzp);
2270	dmu_tx_mark_netfree(tx);
2271	error = dmu_tx_assign(tx, TXG_WAIT);
2272	if (error) {
2273		dmu_tx_abort(tx);
2274		ZFS_EXIT(zfsvfs);
2275		return (error);
2276	}
2277
2278	cache_purge(dvp);
2279
2280	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);
2281
2282	if (error == 0) {
2283		uint64_t txtype = TX_RMDIR;
2284		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
2285	}
2286
2287	dmu_tx_commit(tx);
2288
2289	cache_purge(vp);
2290out:
2291	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2292		zil_commit(zilog, 0);
2293
2294	ZFS_EXIT(zfsvfs);
2295	return (error);
2296}
2297
2298/*
2299 * Read as many directory entries as will fit into the provided
2300 * buffer from the given directory cursor position (specified in
2301 * the uio structure).
2302 *
2303 *	IN:	vp	- vnode of directory to read.
2304 *		uio	- structure supplying read location, range info,
2305 *			  and return buffer.
2306 *		cr	- credentials of caller.
2307 *		ct	- caller context
2308 *		flags	- case flags
2309 *
2310 *	OUT:	uio	- updated offset and range, buffer filled.
2311 *		eofp	- set to true if end-of-file detected.
2312 *
2313 *	RETURN:	0 on success, error code on failure.
2314 *
2315 * Timestamps:
2316 *	vp - atime updated
2317 *
2318 * Note that the low 4 bits of the cookie returned by zap is always zero.
2319 * This allows us to use the low range for "special" directory entries:
2320 * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
2321 * we use the offset 2 for the '.zfs' directory.
2322 */
2323/* ARGSUSED */
2324static int
2325zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
2326{
2327	znode_t		*zp = VTOZ(vp);
2328	iovec_t		*iovp;
2329	edirent_t	*eodp;
2330	dirent64_t	*odp;
2331	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2332	objset_t	*os;
2333	caddr_t		outbuf;
2334	size_t		bufsize;
2335	zap_cursor_t	zc;
2336	zap_attribute_t	zap;
2337	uint_t		bytes_wanted;
2338	uint64_t	offset; /* must be unsigned; checks for < 1 */
2339	uint64_t	parent;
2340	int		local_eof;
2341	int		outcount;
2342	int		error;
2343	uint8_t		prefetch;
2344	boolean_t	check_sysattrs;
2345	uint8_t		type;
2346	int		ncooks;
2347	u_long		*cooks = NULL;
2348	int		flags = 0;
2349
2350	ZFS_ENTER(zfsvfs);
2351	ZFS_VERIFY_ZP(zp);
2352
2353	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
2354	    &parent, sizeof (parent))) != 0) {
2355		ZFS_EXIT(zfsvfs);
2356		return (error);
2357	}
2358
2359	/*
2360	 * If we are not given an eof variable,
2361	 * use a local one.
2362	 */
2363	if (eofp == NULL)
2364		eofp = &local_eof;
2365
2366	/*
2367	 * Check for valid iov_len.
2368	 */
2369	if (uio->uio_iov->iov_len <= 0) {
2370		ZFS_EXIT(zfsvfs);
2371		return (SET_ERROR(EINVAL));
2372	}
2373
2374	/*
2375	 * Quit if directory has been removed (posix)
2376	 */
2377	if ((*eofp = zp->z_unlinked) != 0) {
2378		ZFS_EXIT(zfsvfs);
2379		return (0);
2380	}
2381
2382	error = 0;
2383	os = zfsvfs->z_os;
2384	offset = uio->uio_loffset;
2385	prefetch = zp->z_zn_prefetch;
2386
2387	/*
2388	 * Initialize the iterator cursor.
2389	 */
2390	if (offset <= 3) {
2391		/*
2392		 * Start iteration from the beginning of the directory.
2393		 */
2394		zap_cursor_init(&zc, os, zp->z_id);
2395	} else {
2396		/*
2397		 * The offset is a serialized cursor.
2398		 */
2399		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
2400	}
2401
2402	/*
2403	 * Get space to change directory entries into fs independent format.
2404	 */
2405	iovp = uio->uio_iov;
2406	bytes_wanted = iovp->iov_len;
2407	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
2408		bufsize = bytes_wanted;
2409		outbuf = kmem_alloc(bufsize, KM_SLEEP);
2410		odp = (struct dirent64 *)outbuf;
2411	} else {
2412		bufsize = bytes_wanted;
2413		outbuf = NULL;
2414		odp = (struct dirent64 *)iovp->iov_base;
2415	}
2416	eodp = (struct edirent *)odp;
2417
2418	if (ncookies != NULL) {
2419		/*
2420		 * Minimum entry size is dirent size and 1 byte for a file name.
2421		 */
2422		ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
2423		cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
2424		*cookies = cooks;
2425		*ncookies = ncooks;
2426	}
2427	/*
2428	 * If this VFS supports the system attribute view interface; and
2429	 * we're looking at an extended attribute directory; and we care
2430	 * about normalization conflicts on this vfs; then we must check
2431	 * for normalization conflicts with the sysattr name space.
2432	 */
2433#ifdef TODO
2434	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
2435	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
2436	    (flags & V_RDDIR_ENTFLAGS);
2437#else
2438	check_sysattrs = 0;
2439#endif
2440
2441	/*
2442	 * Transform to file-system independent format
2443	 */
2444	outcount = 0;
2445	while (outcount < bytes_wanted) {
2446		ino64_t objnum;
2447		ushort_t reclen;
2448		off64_t *next = NULL;
2449
2450		/*
2451		 * Special case `.', `..', and `.zfs'.
2452		 */
2453		if (offset == 0) {
2454			(void) strcpy(zap.za_name, ".");
2455			zap.za_normalization_conflict = 0;
2456			objnum = zp->z_id;
2457			type = DT_DIR;
2458		} else if (offset == 1) {
2459			(void) strcpy(zap.za_name, "..");
2460			zap.za_normalization_conflict = 0;
2461			objnum = parent;
2462			type = DT_DIR;
2463		} else if (offset == 2 && zfs_show_ctldir(zp)) {
2464			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
2465			zap.za_normalization_conflict = 0;
2466			objnum = ZFSCTL_INO_ROOT;
2467			type = DT_DIR;
2468		} else {
2469			/*
2470			 * Grab next entry.
2471			 */
2472			if (error = zap_cursor_retrieve(&zc, &zap)) {
2473				if ((*eofp = (error == ENOENT)) != 0)
2474					break;
2475				else
2476					goto update;
2477			}
2478
2479			if (zap.za_integer_length != 8 ||
2480			    zap.za_num_integers != 1) {
2481				cmn_err(CE_WARN, "zap_readdir: bad directory "
2482				    "entry, obj = %lld, offset = %lld\n",
2483				    (u_longlong_t)zp->z_id,
2484				    (u_longlong_t)offset);
2485				error = SET_ERROR(ENXIO);
2486				goto update;
2487			}
2488
2489			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
2490			/*
2491			 * MacOS X can extract the object type here such as:
2492			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2493			 */
2494			type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2495
2496			if (check_sysattrs && !zap.za_normalization_conflict) {
2497#ifdef TODO
2498				zap.za_normalization_conflict =
2499				    xattr_sysattr_casechk(zap.za_name);
2500#else
2501				panic("%s:%u: TODO", __func__, __LINE__);
2502#endif
2503			}
2504		}
2505
2506		if (flags & V_RDDIR_ACCFILTER) {
2507			/*
2508			 * If we have no access at all, don't include
2509			 * this entry in the returned information
2510			 */
2511			znode_t	*ezp;
2512			if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
2513				goto skip_entry;
2514			if (!zfs_has_access(ezp, cr)) {
2515				vrele(ZTOV(ezp));
2516				goto skip_entry;
2517			}
2518			vrele(ZTOV(ezp));
2519		}
2520
2521		if (flags & V_RDDIR_ENTFLAGS)
2522			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
2523		else
2524			reclen = DIRENT64_RECLEN(strlen(zap.za_name));
2525
2526		/*
2527		 * Will this entry fit in the buffer?
2528		 */
2529		if (outcount + reclen > bufsize) {
2530			/*
2531			 * Did we manage to fit anything in the buffer?
2532			 */
2533			if (!outcount) {
2534				error = SET_ERROR(EINVAL);
2535				goto update;
2536			}
2537			break;
2538		}
2539		if (flags & V_RDDIR_ENTFLAGS) {
2540			/*
2541			 * Add extended flag entry:
2542			 */
2543			eodp->ed_ino = objnum;
2544			eodp->ed_reclen = reclen;
2545			/* NOTE: ed_off is the offset for the *next* entry. */
2546			next = &eodp->ed_off;
2547			eodp->ed_eflags = zap.za_normalization_conflict ?
2548			    ED_CASE_CONFLICT : 0;
2549			(void) strncpy(eodp->ed_name, zap.za_name,
2550			    EDIRENT_NAMELEN(reclen));
2551			eodp = (edirent_t *)((intptr_t)eodp + reclen);
2552		} else {
2553			/*
2554			 * Add normal entry:
2555			 */
2556			odp->d_ino = objnum;
2557			odp->d_reclen = reclen;
2558			odp->d_namlen = strlen(zap.za_name);
2559			/* NOTE: d_off is the offset for the *next* entry. */
2560			next = &odp->d_off;
2561			(void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
2562			odp->d_type = type;
2563			dirent_terminate(odp);
2564			odp = (dirent64_t *)((intptr_t)odp + reclen);
2565		}
2566		outcount += reclen;
2567
2568		ASSERT(outcount <= bufsize);
2569
2570		/* Prefetch znode */
2571		if (prefetch)
2572			dmu_prefetch(os, objnum, 0, 0, 0,
2573			    ZIO_PRIORITY_SYNC_READ);
2574
2575	skip_entry:
2576		/*
2577		 * Move to the next entry, fill in the previous offset.
2578		 */
2579		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2580			zap_cursor_advance(&zc);
2581			offset = zap_cursor_serialize(&zc);
2582		} else {
2583			offset += 1;
2584		}
2585
2586		/* Fill the offset right after advancing the cursor. */
2587		if (next != NULL)
2588			*next = offset;
2589		if (cooks != NULL) {
2590			*cooks++ = offset;
2591			ncooks--;
2592			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
2593		}
2594	}
2595	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2596
2597	/* Subtract unused cookies */
2598	if (ncookies != NULL)
2599		*ncookies -= ncooks;
2600
2601	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
2602		iovp->iov_base += outcount;
2603		iovp->iov_len -= outcount;
2604		uio->uio_resid -= outcount;
2605	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
2606		/*
2607		 * Reset the pointer.
2608		 */
2609		offset = uio->uio_loffset;
2610	}
2611
2612update:
2613	zap_cursor_fini(&zc);
2614	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
2615		kmem_free(outbuf, bufsize);
2616
2617	if (error == ENOENT)
2618		error = 0;
2619
2620	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2621
2622	uio->uio_loffset = offset;
2623	ZFS_EXIT(zfsvfs);
2624	if (error != 0 && cookies != NULL) {
2625		free(*cookies, M_TEMP);
2626		*cookies = NULL;
2627		*ncookies = 0;
2628	}
2629	return (error);
2630}
2631
2632ulong_t zfs_fsync_sync_cnt = 4;
2633
2634static int
2635zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
2636{
2637	znode_t	*zp = VTOZ(vp);
2638	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2639
2640	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2641
2642	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
2643		ZFS_ENTER(zfsvfs);
2644		ZFS_VERIFY_ZP(zp);
2645		zil_commit(zfsvfs->z_log, zp->z_id);
2646		ZFS_EXIT(zfsvfs);
2647	}
2648	return (0);
2649}
2650
2651
2652/*
2653 * Get the requested file attributes and place them in the provided
2654 * vattr structure.
2655 *
2656 *	IN:	vp	- vnode of file.
2657 *		vap	- va_mask identifies requested attributes.
2658 *			  If AT_XVATTR set, then optional attrs are requested
2659 *		flags	- ATTR_NOACLCHECK (CIFS server context)
2660 *		cr	- credentials of caller.
2661 *		ct	- caller context
2662 *
2663 *	OUT:	vap	- attribute values.
2664 *
2665 *	RETURN:	0 (always succeeds).
2666 */
2667/* ARGSUSED */
2668static int
2669zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2670    caller_context_t *ct)
2671{
2672	znode_t *zp = VTOZ(vp);
2673	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2674	int	error = 0;
2675	uint32_t blksize;
2676	u_longlong_t nblocks;
2677	uint64_t mtime[2], ctime[2], crtime[2], rdev;
2678	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
2679	xoptattr_t *xoap = NULL;
2680	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2681	sa_bulk_attr_t bulk[4];
2682	int count = 0;
2683
2684	ZFS_ENTER(zfsvfs);
2685	ZFS_VERIFY_ZP(zp);
2686
2687	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
2688
2689	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
2690	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
2691	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
2692	if (vp->v_type == VBLK || vp->v_type == VCHR)
2693		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
2694		    &rdev, 8);
2695
2696	if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
2697		ZFS_EXIT(zfsvfs);
2698		return (error);
2699	}
2700
2701	/*
2702	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
2703	 * Also, if we are the owner don't bother, since owner should
2704	 * always be allowed to read basic attributes of file.
2705	 */
2706	if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
2707	    (vap->va_uid != crgetuid(cr))) {
2708		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
2709		    skipaclchk, cr)) {
2710			ZFS_EXIT(zfsvfs);
2711			return (error);
2712		}
2713	}
2714
2715	/*
2716	 * Return all attributes.  It's cheaper to provide the answer
2717	 * than to determine whether we were asked the question.
2718	 */
2719
2720	vap->va_type = IFTOVT(zp->z_mode);
2721	vap->va_mode = zp->z_mode & ~S_IFMT;
2722#ifdef illumos
2723	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
2724#else
2725	vn_fsid(vp, vap);
2726#endif
2727	vap->va_nodeid = zp->z_id;
2728	vap->va_nlink = zp->z_links;
2729	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp) &&
2730	    zp->z_links < ZFS_LINK_MAX)
2731		vap->va_nlink++;
2732	vap->va_size = zp->z_size;
2733#ifdef illumos
2734	vap->va_rdev = vp->v_rdev;
2735#else
2736	if (vp->v_type == VBLK || vp->v_type == VCHR)
2737		vap->va_rdev = zfs_cmpldev(rdev);
2738#endif
2739	vap->va_seq = zp->z_seq;
2740	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */
2741     	vap->va_filerev = zp->z_seq;
2742
2743	/*
2744	 * Add in any requested optional attributes and the create time.
2745	 * Also set the corresponding bits in the returned attribute bitmap.
2746	 */
2747	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
2748		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
2749			xoap->xoa_archive =
2750			    ((zp->z_pflags & ZFS_ARCHIVE) != 0);
2751			XVA_SET_RTN(xvap, XAT_ARCHIVE);
2752		}
2753
2754		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
2755			xoap->xoa_readonly =
2756			    ((zp->z_pflags & ZFS_READONLY) != 0);
2757			XVA_SET_RTN(xvap, XAT_READONLY);
2758		}
2759
2760		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
2761			xoap->xoa_system =
2762			    ((zp->z_pflags & ZFS_SYSTEM) != 0);
2763			XVA_SET_RTN(xvap, XAT_SYSTEM);
2764		}
2765
2766		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
2767			xoap->xoa_hidden =
2768			    ((zp->z_pflags & ZFS_HIDDEN) != 0);
2769			XVA_SET_RTN(xvap, XAT_HIDDEN);
2770		}
2771
2772		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2773			xoap->xoa_nounlink =
2774			    ((zp->z_pflags & ZFS_NOUNLINK) != 0);
2775			XVA_SET_RTN(xvap, XAT_NOUNLINK);
2776		}
2777
2778		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2779			xoap->xoa_immutable =
2780			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
2781			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
2782		}
2783
2784		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2785			xoap->xoa_appendonly =
2786			    ((zp->z_pflags & ZFS_APPENDONLY) != 0);
2787			XVA_SET_RTN(xvap, XAT_APPENDONLY);
2788		}
2789
2790		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2791			xoap->xoa_nodump =
2792			    ((zp->z_pflags & ZFS_NODUMP) != 0);
2793			XVA_SET_RTN(xvap, XAT_NODUMP);
2794		}
2795
2796		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
2797			xoap->xoa_opaque =
2798			    ((zp->z_pflags & ZFS_OPAQUE) != 0);
2799			XVA_SET_RTN(xvap, XAT_OPAQUE);
2800		}
2801
2802		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2803			xoap->xoa_av_quarantined =
2804			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
2805			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
2806		}
2807
2808		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2809			xoap->xoa_av_modified =
2810			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
2811			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
2812		}
2813
2814		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
2815		    vp->v_type == VREG) {
2816			zfs_sa_get_scanstamp(zp, xvap);
2817		}
2818
2819		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2820			xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
2821			XVA_SET_RTN(xvap, XAT_REPARSE);
2822		}
2823		if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
2824			xoap->xoa_generation = zp->z_gen;
2825			XVA_SET_RTN(xvap, XAT_GEN);
2826		}
2827
2828		if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
2829			xoap->xoa_offline =
2830			    ((zp->z_pflags & ZFS_OFFLINE) != 0);
2831			XVA_SET_RTN(xvap, XAT_OFFLINE);
2832		}
2833
2834		if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
2835			xoap->xoa_sparse =
2836			    ((zp->z_pflags & ZFS_SPARSE) != 0);
2837			XVA_SET_RTN(xvap, XAT_SPARSE);
2838		}
2839	}
2840
2841	ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
2842	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
2843	ZFS_TIME_DECODE(&vap->va_ctime, ctime);
2844	ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
2845
2846
2847	sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
2848	vap->va_blksize = blksize;
2849	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */
2850
2851	if (zp->z_blksz == 0) {
2852		/*
2853		 * Block size hasn't been set; suggest maximal I/O transfers.
2854		 */
2855		vap->va_blksize = zfsvfs->z_max_blksz;
2856	}
2857
2858	ZFS_EXIT(zfsvfs);
2859	return (0);
2860}
2861
2862/*
2863 * Set the file attributes to the values contained in the
2864 * vattr structure.
2865 *
2866 *	IN:	vp	- vnode of file to be modified.
2867 *		vap	- new attribute values.
2868 *			  If AT_XVATTR set, then optional attrs are being set
2869 *		flags	- ATTR_UTIME set if non-default time values provided.
2870 *			- ATTR_NOACLCHECK (CIFS context only).
2871 *		cr	- credentials of caller.
2872 *		ct	- caller context
2873 *
2874 *	RETURN:	0 on success, error code on failure.
2875 *
2876 * Timestamps:
2877 *	vp - ctime updated, mtime updated if size changed.
2878 */
2879/* ARGSUSED */
2880static int
2881zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2882    caller_context_t *ct)
2883{
2884	znode_t		*zp = VTOZ(vp);
2885	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2886	zilog_t		*zilog;
2887	dmu_tx_t	*tx;
2888	vattr_t		oldva;
2889	xvattr_t	tmpxvattr;
2890	uint_t		mask = vap->va_mask;
2891	uint_t		saved_mask = 0;
2892	uint64_t	saved_mode;
2893	int		trim_mask = 0;
2894	uint64_t	new_mode;
2895	uint64_t	new_uid, new_gid;
2896	uint64_t	xattr_obj;
2897	uint64_t	mtime[2], ctime[2];
2898	znode_t		*attrzp;
2899	int		need_policy = FALSE;
2900	int		err, err2;
2901	zfs_fuid_info_t *fuidp = NULL;
2902	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
2903	xoptattr_t	*xoap;
2904	zfs_acl_t	*aclp;
2905	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2906	boolean_t	fuid_dirtied = B_FALSE;
2907	sa_bulk_attr_t	bulk[7], xattr_bulk[7];
2908	int		count = 0, xattr_count = 0;
2909
2910	if (mask == 0)
2911		return (0);
2912
2913	if (mask & AT_NOSET)
2914		return (SET_ERROR(EINVAL));
2915
2916	ZFS_ENTER(zfsvfs);
2917	ZFS_VERIFY_ZP(zp);
2918
2919	zilog = zfsvfs->z_log;
2920
2921	/*
2922	 * Make sure that if we have ephemeral uid/gid or xvattr specified
2923	 * that file system is at proper version level
2924	 */
2925
2926	if (zfsvfs->z_use_fuids == B_FALSE &&
2927	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2928	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2929	    (mask & AT_XVATTR))) {
2930		ZFS_EXIT(zfsvfs);
2931		return (SET_ERROR(EINVAL));
2932	}
2933
2934	if (mask & AT_SIZE && vp->v_type == VDIR) {
2935		ZFS_EXIT(zfsvfs);
2936		return (SET_ERROR(EISDIR));
2937	}
2938
2939	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
2940		ZFS_EXIT(zfsvfs);
2941		return (SET_ERROR(EINVAL));
2942	}
2943
2944	/*
2945	 * If this is an xvattr_t, then get a pointer to the structure of
2946	 * optional attributes.  If this is NULL, then we have a vattr_t.
2947	 */
2948	xoap = xva_getxoptattr(xvap);
2949
2950	xva_init(&tmpxvattr);
2951
2952	/*
2953	 * Immutable files can only alter immutable bit and atime
2954	 */
2955	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
2956	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
2957	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2958		ZFS_EXIT(zfsvfs);
2959		return (SET_ERROR(EPERM));
2960	}
2961
2962	/*
2963	 * Note: ZFS_READONLY is handled in zfs_zaccess_common.
2964	 */
2965
2966	/*
2967	 * Verify timestamps doesn't overflow 32 bits.
2968	 * ZFS can handle large timestamps, but 32bit syscalls can't
2969	 * handle times greater than 2039.  This check should be removed
2970	 * once large timestamps are fully supported.
2971	 */
2972	if (mask & (AT_ATIME | AT_MTIME)) {
2973		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2974		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2975			ZFS_EXIT(zfsvfs);
2976			return (SET_ERROR(EOVERFLOW));
2977		}
2978	}
2979	if (xoap && (mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME) &&
2980	    TIMESPEC_OVERFLOW(&vap->va_birthtime)) {
2981		ZFS_EXIT(zfsvfs);
2982		return (SET_ERROR(EOVERFLOW));
2983	}
2984
2985	attrzp = NULL;
2986	aclp = NULL;
2987
2988	/* Can this be moved to before the top label? */
2989	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
2990		ZFS_EXIT(zfsvfs);
2991		return (SET_ERROR(EROFS));
2992	}
2993
2994	/*
2995	 * First validate permissions
2996	 */
2997
2998	if (mask & AT_SIZE) {
2999		/*
3000		 * XXX - Note, we are not providing any open
3001		 * mode flags here (like FNDELAY), so we may
3002		 * block if there are locks present... this
3003		 * should be addressed in openat().
3004		 */
3005		/* XXX - would it be OK to generate a log record here? */
3006		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
3007		if (err) {
3008			ZFS_EXIT(zfsvfs);
3009			return (err);
3010		}
3011	}
3012
3013	if (mask & (AT_ATIME|AT_MTIME) ||
3014	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
3015	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
3016	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
3017	    XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
3018	    XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
3019	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
3020	    XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
3021		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
3022		    skipaclchk, cr);
3023	}
3024
3025	if (mask & (AT_UID|AT_GID)) {
3026		int	idmask = (mask & (AT_UID|AT_GID));
3027		int	take_owner;
3028		int	take_group;
3029
3030		/*
3031		 * NOTE: even if a new mode is being set,
3032		 * we may clear S_ISUID/S_ISGID bits.
3033		 */
3034
3035		if (!(mask & AT_MODE))
3036			vap->va_mode = zp->z_mode;
3037
3038		/*
3039		 * Take ownership or chgrp to group we are a member of
3040		 */
3041
3042		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
3043		take_group = (mask & AT_GID) &&
3044		    zfs_groupmember(zfsvfs, vap->va_gid, cr);
3045
3046		/*
3047		 * If both AT_UID and AT_GID are set then take_owner and
3048		 * take_group must both be set in order to allow taking
3049		 * ownership.
3050		 *
3051		 * Otherwise, send the check through secpolicy_vnode_setattr()
3052		 *
3053		 */
3054
3055		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
3056		    ((idmask == AT_UID) && take_owner) ||
3057		    ((idmask == AT_GID) && take_group)) {
3058			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
3059			    skipaclchk, cr) == 0) {
3060				/*
3061				 * Remove setuid/setgid for non-privileged users
3062				 */
3063				secpolicy_setid_clear(vap, vp, cr);
3064				trim_mask = (mask & (AT_UID|AT_GID));
3065			} else {
3066				need_policy =  TRUE;
3067			}
3068		} else {
3069			need_policy =  TRUE;
3070		}
3071	}
3072
3073	oldva.va_mode = zp->z_mode;
3074	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
3075	if (mask & AT_XVATTR) {
3076		/*
3077		 * Update xvattr mask to include only those attributes
3078		 * that are actually changing.
3079		 *
3080		 * the bits will be restored prior to actually setting
3081		 * the attributes so the caller thinks they were set.
3082		 */
3083		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
3084			if (xoap->xoa_appendonly !=
3085			    ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
3086				need_policy = TRUE;
3087			} else {
3088				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
3089				XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
3090			}
3091		}
3092
3093		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
3094			if (xoap->xoa_nounlink !=
3095			    ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
3096				need_policy = TRUE;
3097			} else {
3098				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
3099				XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
3100			}
3101		}
3102
3103		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
3104			if (xoap->xoa_immutable !=
3105			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
3106				need_policy = TRUE;
3107			} else {
3108				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
3109				XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
3110			}
3111		}
3112
3113		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
3114			if (xoap->xoa_nodump !=
3115			    ((zp->z_pflags & ZFS_NODUMP) != 0)) {
3116				need_policy = TRUE;
3117			} else {
3118				XVA_CLR_REQ(xvap, XAT_NODUMP);
3119				XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
3120			}
3121		}
3122
3123		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
3124			if (xoap->xoa_av_modified !=
3125			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
3126				need_policy = TRUE;
3127			} else {
3128				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
3129				XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
3130			}
3131		}
3132
3133		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
3134			if ((vp->v_type != VREG &&
3135			    xoap->xoa_av_quarantined) ||
3136			    xoap->xoa_av_quarantined !=
3137			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
3138				need_policy = TRUE;
3139			} else {
3140				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
3141				XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
3142			}
3143		}
3144
3145		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
3146			ZFS_EXIT(zfsvfs);
3147			return (SET_ERROR(EPERM));
3148		}
3149
3150		if (need_policy == FALSE &&
3151		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
3152		    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
3153			need_policy = TRUE;
3154		}
3155	}
3156
3157	if (mask & AT_MODE) {
3158		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
3159			err = secpolicy_setid_setsticky_clear(vp, vap,
3160			    &oldva, cr);
3161			if (err) {
3162				ZFS_EXIT(zfsvfs);
3163				return (err);
3164			}
3165			trim_mask |= AT_MODE;
3166		} else {
3167			need_policy = TRUE;
3168		}
3169	}
3170
3171	if (need_policy) {
3172		/*
3173		 * If trim_mask is set then take ownership
3174		 * has been granted or write_acl is present and user
3175		 * has the ability to modify mode.  In that case remove
3176		 * UID|GID and or MODE from mask so that
3177		 * secpolicy_vnode_setattr() doesn't revoke it.
3178		 */
3179
3180		if (trim_mask) {
3181			saved_mask = vap->va_mask;
3182			vap->va_mask &= ~trim_mask;
3183			if (trim_mask & AT_MODE) {
3184				/*
3185				 * Save the mode, as secpolicy_vnode_setattr()
3186				 * will overwrite it with ova.va_mode.
3187				 */
3188				saved_mode = vap->va_mode;
3189			}
3190		}
3191		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
3192		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
3193		if (err) {
3194			ZFS_EXIT(zfsvfs);
3195			return (err);
3196		}
3197
3198		if (trim_mask) {
3199			vap->va_mask |= saved_mask;
3200			if (trim_mask & AT_MODE) {
3201				/*
3202				 * Recover the mode after
3203				 * secpolicy_vnode_setattr().
3204				 */
3205				vap->va_mode = saved_mode;
3206			}
3207		}
3208	}
3209
3210	/*
3211	 * secpolicy_vnode_setattr, or take ownership may have
3212	 * changed va_mask
3213	 */
3214	mask = vap->va_mask;
3215
3216	if ((mask & (AT_UID | AT_GID))) {
3217		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
3218		    &xattr_obj, sizeof (xattr_obj));
3219
3220		if (err == 0 && xattr_obj) {
3221			err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
3222			if (err == 0) {
3223				err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE);
3224				if (err != 0)
3225					vrele(ZTOV(attrzp));
3226			}
3227			if (err)
3228				goto out2;
3229		}
3230		if (mask & AT_UID) {
3231			new_uid = zfs_fuid_create(zfsvfs,
3232			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
3233			if (new_uid != zp->z_uid &&
3234			    zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
3235				if (attrzp)
3236					vput(ZTOV(attrzp));
3237				err = SET_ERROR(EDQUOT);
3238				goto out2;
3239			}
3240		}
3241
3242		if (mask & AT_GID) {
3243			new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
3244			    cr, ZFS_GROUP, &fuidp);
3245			if (new_gid != zp->z_gid &&
3246			    zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
3247				if (attrzp)
3248					vput(ZTOV(attrzp));
3249				err = SET_ERROR(EDQUOT);
3250				goto out2;
3251			}
3252		}
3253	}
3254	tx = dmu_tx_create(zfsvfs->z_os);
3255
3256	if (mask & AT_MODE) {
3257		uint64_t pmode = zp->z_mode;
3258		uint64_t acl_obj;
3259		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
3260
3261		if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
3262		    !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
3263			err = SET_ERROR(EPERM);
3264			goto out;
3265		}
3266
3267		if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
3268			goto out;
3269
3270		if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
3271			/*
3272			 * Are we upgrading ACL from old V0 format
3273			 * to V1 format?
3274			 */
3275			if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
3276			    zfs_znode_acl_version(zp) ==
3277			    ZFS_ACL_VERSION_INITIAL) {
3278				dmu_tx_hold_free(tx, acl_obj, 0,
3279				    DMU_OBJECT_END);
3280				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3281				    0, aclp->z_acl_bytes);
3282			} else {
3283				dmu_tx_hold_write(tx, acl_obj, 0,
3284				    aclp->z_acl_bytes);
3285			}
3286		} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3287			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3288			    0, aclp->z_acl_bytes);
3289		}
3290		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3291	} else {
3292		if ((mask & AT_XVATTR) &&
3293		    XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3294			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3295		else
3296			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3297	}
3298
3299	if (attrzp) {
3300		dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3301	}
3302
3303	fuid_dirtied = zfsvfs->z_fuid_dirty;
3304	if (fuid_dirtied)
3305		zfs_fuid_txhold(zfsvfs, tx);
3306
3307	zfs_sa_upgrade_txholds(tx, zp);
3308
3309	err = dmu_tx_assign(tx, TXG_WAIT);
3310	if (err)
3311		goto out;
3312
3313	count = 0;
3314	/*
3315	 * Set each attribute requested.
3316	 * We group settings according to the locks they need to acquire.
3317	 *
3318	 * Note: you cannot set ctime directly, although it will be
3319	 * updated as a side-effect of calling this function.
3320	 */
3321
3322	if (mask & (AT_UID|AT_GID|AT_MODE))
3323		mutex_enter(&zp->z_acl_lock);
3324
3325	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3326	    &zp->z_pflags, sizeof (zp->z_pflags));
3327
3328	if (attrzp) {
3329		if (mask & (AT_UID|AT_GID|AT_MODE))
3330			mutex_enter(&attrzp->z_acl_lock);
3331		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3332		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
3333		    sizeof (attrzp->z_pflags));
3334	}
3335
3336	if (mask & (AT_UID|AT_GID)) {
3337
3338		if (mask & AT_UID) {
3339			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
3340			    &new_uid, sizeof (new_uid));
3341			zp->z_uid = new_uid;
3342			if (attrzp) {
3343				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3344				    SA_ZPL_UID(zfsvfs), NULL, &new_uid,
3345				    sizeof (new_uid));
3346				attrzp->z_uid = new_uid;
3347			}
3348		}
3349
3350		if (mask & AT_GID) {
3351			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
3352			    NULL, &new_gid, sizeof (new_gid));
3353			zp->z_gid = new_gid;
3354			if (attrzp) {
3355				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3356				    SA_ZPL_GID(zfsvfs), NULL, &new_gid,
3357				    sizeof (new_gid));
3358				attrzp->z_gid = new_gid;
3359			}
3360		}
3361		if (!(mask & AT_MODE)) {
3362			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
3363			    NULL, &new_mode, sizeof (new_mode));
3364			new_mode = zp->z_mode;
3365		}
3366		err = zfs_acl_chown_setattr(zp);
3367		ASSERT(err == 0);
3368		if (attrzp) {
3369			err = zfs_acl_chown_setattr(attrzp);
3370			ASSERT(err == 0);
3371		}
3372	}
3373
3374	if (mask & AT_MODE) {
3375		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
3376		    &new_mode, sizeof (new_mode));
3377		zp->z_mode = new_mode;
3378		ASSERT3U((uintptr_t)aclp, !=, 0);
3379		err = zfs_aclset_common(zp, aclp, cr, tx);
3380		ASSERT0(err);
3381		if (zp->z_acl_cached)
3382			zfs_acl_free(zp->z_acl_cached);
3383		zp->z_acl_cached = aclp;
3384		aclp = NULL;
3385	}
3386
3387
3388	if (mask & AT_ATIME) {
3389		ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
3390		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
3391		    &zp->z_atime, sizeof (zp->z_atime));
3392	}
3393
3394	if (mask & AT_MTIME) {
3395		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
3396		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
3397		    mtime, sizeof (mtime));
3398	}
3399
3400	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
3401	if (mask & AT_SIZE && !(mask & AT_MTIME)) {
3402		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
3403		    NULL, mtime, sizeof (mtime));
3404		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3405		    &ctime, sizeof (ctime));
3406		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
3407		    B_TRUE);
3408	} else if (mask != 0) {
3409		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3410		    &ctime, sizeof (ctime));
3411		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
3412		    B_TRUE);
3413		if (attrzp) {
3414			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3415			    SA_ZPL_CTIME(zfsvfs), NULL,
3416			    &ctime, sizeof (ctime));
3417			zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
3418			    mtime, ctime, B_TRUE);
3419		}
3420	}
3421	/*
3422	 * Do this after setting timestamps to prevent timestamp
3423	 * update from toggling bit
3424	 */
3425
3426	if (xoap && (mask & AT_XVATTR)) {
3427
3428		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
3429			xoap->xoa_createtime = vap->va_birthtime;
3430		/*
3431		 * restore trimmed off masks
3432		 * so that return masks can be set for caller.
3433		 */
3434
3435		if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
3436			XVA_SET_REQ(xvap, XAT_APPENDONLY);
3437		}
3438		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
3439			XVA_SET_REQ(xvap, XAT_NOUNLINK);
3440		}
3441		if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
3442			XVA_SET_REQ(xvap, XAT_IMMUTABLE);
3443		}
3444		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
3445			XVA_SET_REQ(xvap, XAT_NODUMP);
3446		}
3447		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
3448			XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
3449		}
3450		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
3451			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
3452		}
3453
3454		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3455			ASSERT(vp->v_type == VREG);
3456
3457		zfs_xvattr_set(zp, xvap, tx);
3458	}
3459
3460	if (fuid_dirtied)
3461		zfs_fuid_sync(zfsvfs, tx);
3462
3463	if (mask != 0)
3464		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3465
3466	if (mask & (AT_UID|AT_GID|AT_MODE))
3467		mutex_exit(&zp->z_acl_lock);
3468
3469	if (attrzp) {
3470		if (mask & (AT_UID|AT_GID|AT_MODE))
3471			mutex_exit(&attrzp->z_acl_lock);
3472	}
3473out:
3474	if (err == 0 && attrzp) {
3475		err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
3476		    xattr_count, tx);
3477		ASSERT(err2 == 0);
3478	}
3479
3480	if (attrzp)
3481		vput(ZTOV(attrzp));
3482
3483	if (aclp)
3484		zfs_acl_free(aclp);
3485
3486	if (fuidp) {
3487		zfs_fuid_info_free(fuidp);
3488		fuidp = NULL;
3489	}
3490
3491	if (err) {
3492		dmu_tx_abort(tx);
3493	} else {
3494		err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
3495		dmu_tx_commit(tx);
3496	}
3497
3498out2:
3499	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3500		zil_commit(zilog, 0);
3501
3502	ZFS_EXIT(zfsvfs);
3503	return (err);
3504}
3505
3506/*
3507 * We acquire all but fdvp locks using non-blocking acquisitions.  If we
3508 * fail to acquire any lock in the path we will drop all held locks,
3509 * acquire the new lock in a blocking fashion, and then release it and
3510 * restart the rename.  This acquire/release step ensures that we do not
3511 * spin on a lock waiting for release.  On error release all vnode locks
3512 * and decrement references the way tmpfs_rename() would do.
3513 */
3514static int
3515zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
3516    struct vnode *tdvp, struct vnode **tvpp,
3517    const struct componentname *scnp, const struct componentname *tcnp)
3518{
3519	zfsvfs_t	*zfsvfs;
3520	struct vnode	*nvp, *svp, *tvp;
3521	znode_t		*sdzp, *tdzp, *szp, *tzp;
3522	const char	*snm = scnp->cn_nameptr;
3523	const char	*tnm = tcnp->cn_nameptr;
3524	int error;
3525
3526	VOP_UNLOCK(tdvp, 0);
3527	if (*tvpp != NULL && *tvpp != tdvp)
3528		VOP_UNLOCK(*tvpp, 0);
3529
3530relock:
3531	error = vn_lock(sdvp, LK_EXCLUSIVE);
3532	if (error)
3533		goto out;
3534	sdzp = VTOZ(sdvp);
3535
3536	error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
3537	if (error != 0) {
3538		VOP_UNLOCK(sdvp, 0);
3539		if (error != EBUSY)
3540			goto out;
3541		error = vn_lock(tdvp, LK_EXCLUSIVE);
3542		if (error)
3543			goto out;
3544		VOP_UNLOCK(tdvp, 0);
3545		goto relock;
3546	}
3547	tdzp = VTOZ(tdvp);
3548
3549	/*
3550	 * Before using sdzp and tdzp we must ensure that they are live.
3551	 * As a porting legacy from illumos we have two things to worry
3552	 * about.  One is typical for FreeBSD and it is that the vnode is
3553	 * not reclaimed (doomed).  The other is that the znode is live.
3554	 * The current code can invalidate the znode without acquiring the
3555	 * corresponding vnode lock if the object represented by the znode
3556	 * and vnode is no longer valid after a rollback or receive operation.
3557	 * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock
3558	 * that protects the znodes from the invalidation.
3559	 */
3560	zfsvfs = sdzp->z_zfsvfs;
3561	ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
3562	ZFS_ENTER(zfsvfs);
3563
3564	/*
3565	 * We can not use ZFS_VERIFY_ZP() here because it could directly return
3566	 * bypassing the cleanup code in the case of an error.
3567	 */
3568	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
3569		ZFS_EXIT(zfsvfs);
3570		VOP_UNLOCK(sdvp, 0);
3571		VOP_UNLOCK(tdvp, 0);
3572		error = SET_ERROR(EIO);
3573		goto out;
3574	}
3575
3576	/*
3577	 * Re-resolve svp to be certain it still exists and fetch the
3578	 * correct vnode.
3579	 */
3580	error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS);
3581	if (error != 0) {
3582		/* Source entry invalid or not there. */
3583		ZFS_EXIT(zfsvfs);
3584		VOP_UNLOCK(sdvp, 0);
3585		VOP_UNLOCK(tdvp, 0);
3586		if ((scnp->cn_flags & ISDOTDOT) != 0 ||
3587		    (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
3588			error = SET_ERROR(EINVAL);
3589		goto out;
3590	}
3591	svp = ZTOV(szp);
3592
3593	/*
3594	 * Re-resolve tvp, if it disappeared we just carry on.
3595	 */
3596	error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0);
3597	if (error != 0) {
3598		ZFS_EXIT(zfsvfs);
3599		VOP_UNLOCK(sdvp, 0);
3600		VOP_UNLOCK(tdvp, 0);
3601		vrele(svp);
3602		if ((tcnp->cn_flags & ISDOTDOT) != 0)
3603			error = SET_ERROR(EINVAL);
3604		goto out;
3605	}
3606	if (tzp != NULL)
3607		tvp = ZTOV(tzp);
3608	else
3609		tvp = NULL;
3610
3611	/*
3612	 * At present the vnode locks must be acquired before z_teardown_lock,
3613	 * although it would be more logical to use the opposite order.
3614	 */
3615	ZFS_EXIT(zfsvfs);
3616
3617	/*
3618	 * Now try acquire locks on svp and tvp.
3619	 */
3620	nvp = svp;
3621	error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
3622	if (error != 0) {
3623		VOP_UNLOCK(sdvp, 0);
3624		VOP_UNLOCK(tdvp, 0);
3625		if (tvp != NULL)
3626			vrele(tvp);
3627		if (error != EBUSY) {
3628			vrele(nvp);
3629			goto out;
3630		}
3631		error = vn_lock(nvp, LK_EXCLUSIVE);
3632		if (error != 0) {
3633			vrele(nvp);
3634			goto out;
3635		}
3636		VOP_UNLOCK(nvp, 0);
3637		/*
3638		 * Concurrent rename race.
3639		 * XXX ?
3640		 */
3641		if (nvp == tdvp) {
3642			vrele(nvp);
3643			error = SET_ERROR(EINVAL);
3644			goto out;
3645		}
3646		vrele(*svpp);
3647		*svpp = nvp;
3648		goto relock;
3649	}
3650	vrele(*svpp);
3651	*svpp = nvp;
3652
3653	if (*tvpp != NULL)
3654		vrele(*tvpp);
3655	*tvpp = NULL;
3656	if (tvp != NULL) {
3657		nvp = tvp;
3658		error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
3659		if (error != 0) {
3660			VOP_UNLOCK(sdvp, 0);
3661			VOP_UNLOCK(tdvp, 0);
3662			VOP_UNLOCK(*svpp, 0);
3663			if (error != EBUSY) {
3664				vrele(nvp);
3665				goto out;
3666			}
3667			error = vn_lock(nvp, LK_EXCLUSIVE);
3668			if (error != 0) {
3669				vrele(nvp);
3670				goto out;
3671			}
3672			vput(nvp);
3673			goto relock;
3674		}
3675		*tvpp = nvp;
3676	}
3677
3678	return (0);
3679
3680out:
3681	return (error);
3682}
3683
3684/*
3685 * Note that we must use VRELE_ASYNC in this function as it walks
3686 * up the directory tree and vrele may need to acquire an exclusive
3687 * lock if a last reference to a vnode is dropped.
3688 */
3689static int
3690zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
3691{
3692	zfsvfs_t	*zfsvfs;
3693	znode_t		*zp, *zp1;
3694	uint64_t	parent;
3695	int		error;
3696
3697	zfsvfs = tdzp->z_zfsvfs;
3698	if (tdzp == szp)
3699		return (SET_ERROR(EINVAL));
3700	if (tdzp == sdzp)
3701		return (0);
3702	if (tdzp->z_id == zfsvfs->z_root)
3703		return (0);
3704	zp = tdzp;
3705	for (;;) {
3706		ASSERT(!zp->z_unlinked);
3707		if ((error = sa_lookup(zp->z_sa_hdl,
3708		    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
3709			break;
3710
3711		if (parent == szp->z_id) {
3712			error = SET_ERROR(EINVAL);
3713			break;
3714		}
3715		if (parent == zfsvfs->z_root)
3716			break;
3717		if (parent == sdzp->z_id)
3718			break;
3719
3720		error = zfs_zget(zfsvfs, parent, &zp1);
3721		if (error != 0)
3722			break;
3723
3724		if (zp != tdzp)
3725			VN_RELE_ASYNC(ZTOV(zp),
3726			    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
3727		zp = zp1;
3728	}
3729
3730	if (error == ENOTDIR)
3731		panic("checkpath: .. not a directory\n");
3732	if (zp != tdzp)
3733		VN_RELE_ASYNC(ZTOV(zp),
3734		    dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
3735	return (error);
3736}
3737
3738/*
3739 * Move an entry from the provided source directory to the target
3740 * directory.  Change the entry name as indicated.
3741 *
3742 *	IN:	sdvp	- Source directory containing the "old entry".
3743 *		snm	- Old entry name.
3744 *		tdvp	- Target directory to contain the "new entry".
3745 *		tnm	- New entry name.
3746 *		cr	- credentials of caller.
3747 *		ct	- caller context
3748 *		flags	- case flags
3749 *
3750 *	RETURN:	0 on success, error code on failure.
3751 *
3752 * Timestamps:
3753 *	sdvp,tdvp - ctime|mtime updated
3754 */
3755/*ARGSUSED*/
3756static int
3757zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
3758    vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
3759    cred_t *cr)
3760{
3761	zfsvfs_t	*zfsvfs;
3762	znode_t		*sdzp, *tdzp, *szp, *tzp;
3763	zilog_t		*zilog = NULL;
3764	dmu_tx_t	*tx;
3765	char		*snm = scnp->cn_nameptr;
3766	char		*tnm = tcnp->cn_nameptr;
3767	int		error = 0;
3768
3769	/* Reject renames across filesystems. */
3770	if ((*svpp)->v_mount != tdvp->v_mount ||
3771	    ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
3772		error = SET_ERROR(EXDEV);
3773		goto out;
3774	}
3775
3776	if (zfsctl_is_node(tdvp)) {
3777		error = SET_ERROR(EXDEV);
3778		goto out;
3779	}
3780
3781	/*
3782	 * Lock all four vnodes to ensure safety and semantics of renaming.
3783	 */
3784	error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
3785	if (error != 0) {
3786		/* no vnodes are locked in the case of error here */
3787		return (error);
3788	}
3789
3790	tdzp = VTOZ(tdvp);
3791	sdzp = VTOZ(sdvp);
3792	zfsvfs = tdzp->z_zfsvfs;
3793	zilog = zfsvfs->z_log;
3794
3795	/*
3796	 * After we re-enter ZFS_ENTER() we will have to revalidate all
3797	 * znodes involved.
3798	 */
3799	ZFS_ENTER(zfsvfs);
3800
3801	if (zfsvfs->z_utf8 && u8_validate(tnm,
3802	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3803		error = SET_ERROR(EILSEQ);
3804		goto unlockout;
3805	}
3806
3807	/* If source and target are the same file, there is nothing to do. */
3808	if ((*svpp) == (*tvpp)) {
3809		error = 0;
3810		goto unlockout;
3811	}
3812
3813	if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
3814	    ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
3815	    (*tvpp)->v_mountedhere != NULL)) {
3816		error = SET_ERROR(EXDEV);
3817		goto unlockout;
3818	}
3819
3820	/*
3821	 * We can not use ZFS_VERIFY_ZP() here because it could directly return
3822	 * bypassing the cleanup code in the case of an error.
3823	 */
3824	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
3825		error = SET_ERROR(EIO);
3826		goto unlockout;
3827	}
3828
3829	szp = VTOZ(*svpp);
3830	tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
3831	if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) {
3832		error = SET_ERROR(EIO);
3833		goto unlockout;
3834	}
3835
3836	/*
3837	 * This is to prevent the creation of links into attribute space
3838	 * by renaming a linked file into/outof an attribute directory.
3839	 * See the comment in zfs_link() for why this is considered bad.
3840	 */
3841	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
3842		error = SET_ERROR(EINVAL);
3843		goto unlockout;
3844	}
3845
3846	/*
3847	 * Must have write access at the source to remove the old entry
3848	 * and write access at the target to create the new entry.
3849	 * Note that if target and source are the same, this can be
3850	 * done in a single check.
3851	 */
3852	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
3853		goto unlockout;
3854
3855	if ((*svpp)->v_type == VDIR) {
3856		/*
3857		 * Avoid ".", "..", and aliases of "." for obvious reasons.
3858		 */
3859		if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
3860		    sdzp == szp ||
3861		    (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
3862			error = EINVAL;
3863			goto unlockout;
3864		}
3865
3866		/*
3867		 * Check to make sure rename is valid.
3868		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
3869		 */
3870		if (error = zfs_rename_check(szp, sdzp, tdzp))
3871			goto unlockout;
3872	}
3873
3874	/*
3875	 * Does target exist?
3876	 */
3877	if (tzp) {
3878		/*
3879		 * Source and target must be the same type.
3880		 */
3881		if ((*svpp)->v_type == VDIR) {
3882			if ((*tvpp)->v_type != VDIR) {
3883				error = SET_ERROR(ENOTDIR);
3884				goto unlockout;
3885			} else {
3886				cache_purge(tdvp);
3887				if (sdvp != tdvp)
3888					cache_purge(sdvp);
3889			}
3890		} else {
3891			if ((*tvpp)->v_type == VDIR) {
3892				error = SET_ERROR(EISDIR);
3893				goto unlockout;
3894			}
3895		}
3896	}
3897
3898	vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
3899	if (tzp)
3900		vnevent_rename_dest(*tvpp, tdvp, tnm, ct);
3901
3902	/*
3903	 * notify the target directory if it is not the same
3904	 * as source directory.
3905	 */
3906	if (tdvp != sdvp) {
3907		vnevent_rename_dest_dir(tdvp, ct);
3908	}
3909
3910	tx = dmu_tx_create(zfsvfs->z_os);
3911	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3912	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
3913	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
3914	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
3915	if (sdzp != tdzp) {
3916		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
3917		zfs_sa_upgrade_txholds(tx, tdzp);
3918	}
3919	if (tzp) {
3920		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
3921		zfs_sa_upgrade_txholds(tx, tzp);
3922	}
3923
3924	zfs_sa_upgrade_txholds(tx, szp);
3925	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3926	error = dmu_tx_assign(tx, TXG_WAIT);
3927	if (error) {
3928		dmu_tx_abort(tx);
3929		goto unlockout;
3930	}
3931
3932
3933	if (tzp)	/* Attempt to remove the existing target */
3934		error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);
3935
3936	if (error == 0) {
3937		error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
3938		if (error == 0) {
3939			szp->z_pflags |= ZFS_AV_MODIFIED;
3940
3941			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
3942			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
3943			ASSERT0(error);
3944
3945			error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING,
3946			    NULL);
3947			if (error == 0) {
3948				zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
3949				    snm, tdzp, tnm, szp);
3950
3951				/*
3952				 * Update path information for the target vnode
3953				 */
3954				vn_renamepath(tdvp, *svpp, tnm, strlen(tnm));
3955			} else {
3956				/*
3957				 * At this point, we have successfully created
3958				 * the target name, but have failed to remove
3959				 * the source name.  Since the create was done
3960				 * with the ZRENAMING flag, there are
3961				 * complications; for one, the link count is
3962				 * wrong.  The easiest way to deal with this
3963				 * is to remove the newly created target, and
3964				 * return the original error.  This must
3965				 * succeed; fortunately, it is very unlikely to
3966				 * fail, since we just created it.
3967				 */
3968				VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx,
3969				    ZRENAMING, NULL), ==, 0);
3970			}
3971		}
3972		if (error == 0) {
3973			cache_purge(*svpp);
3974			if (*tvpp != NULL)
3975				cache_purge(*tvpp);
3976			cache_purge_negative(tdvp);
3977		}
3978	}
3979
3980	dmu_tx_commit(tx);
3981
3982unlockout:			/* all 4 vnodes are locked, ZFS_ENTER called */
3983	ZFS_EXIT(zfsvfs);
3984	VOP_UNLOCK(*svpp, 0);
3985	VOP_UNLOCK(sdvp, 0);
3986
3987out:				/* original two vnodes are locked */
3988	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3989		zil_commit(zilog, 0);
3990
3991	if (*tvpp != NULL)
3992		VOP_UNLOCK(*tvpp, 0);
3993	if (tdvp != *tvpp)
3994		VOP_UNLOCK(tdvp, 0);
3995	return (error);
3996}
3997
3998/*
3999 * Insert the indicated symbolic reference entry into the directory.
4000 *
4001 *	IN:	dvp	- Directory to contain new symbolic link.
4002 *		link	- Name for new symlink entry.
4003 *		vap	- Attributes of new entry.
4004 *		cr	- credentials of caller.
4005 *		ct	- caller context
4006 *		flags	- case flags
4007 *
4008 *	RETURN:	0 on success, error code on failure.
4009 *
4010 * Timestamps:
4011 *	dvp - ctime|mtime updated
4012 */
4013/*ARGSUSED*/
4014static int
4015zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
4016    cred_t *cr, kthread_t *td)
4017{
4018	znode_t		*zp, *dzp = VTOZ(dvp);
4019	dmu_tx_t	*tx;
4020	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
4021	zilog_t		*zilog;
4022	uint64_t	len = strlen(link);
4023	int		error;
4024	zfs_acl_ids_t	acl_ids;
4025	boolean_t	fuid_dirtied;
4026	uint64_t	txtype = TX_SYMLINK;
4027	int		flags = 0;
4028
4029	ASSERT(vap->va_type == VLNK);
4030
4031	ZFS_ENTER(zfsvfs);
4032	ZFS_VERIFY_ZP(dzp);
4033	zilog = zfsvfs->z_log;
4034
4035	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
4036	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4037		ZFS_EXIT(zfsvfs);
4038		return (SET_ERROR(EILSEQ));
4039	}
4040
4041	if (len > MAXPATHLEN) {
4042		ZFS_EXIT(zfsvfs);
4043		return (SET_ERROR(ENAMETOOLONG));
4044	}
4045
4046	if ((error = zfs_acl_ids_create(dzp, 0,
4047	    vap, cr, NULL, &acl_ids)) != 0) {
4048		ZFS_EXIT(zfsvfs);
4049		return (error);
4050	}
4051
4052	/*
4053	 * Attempt to lock directory; fail if entry already exists.
4054	 */
4055	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
4056	if (error) {
4057		zfs_acl_ids_free(&acl_ids);
4058		ZFS_EXIT(zfsvfs);
4059		return (error);
4060	}
4061
4062	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4063		zfs_acl_ids_free(&acl_ids);
4064		ZFS_EXIT(zfsvfs);
4065		return (error);
4066	}
4067
4068	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
4069		zfs_acl_ids_free(&acl_ids);
4070		ZFS_EXIT(zfsvfs);
4071		return (SET_ERROR(EDQUOT));
4072	}
4073
4074	getnewvnode_reserve(1);
4075	tx = dmu_tx_create(zfsvfs->z_os);
4076	fuid_dirtied = zfsvfs->z_fuid_dirty;
4077	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
4078	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4079	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
4080	    ZFS_SA_BASE_ATTR_SIZE + len);
4081	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
4082	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
4083		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
4084		    acl_ids.z_aclp->z_acl_bytes);
4085	}
4086	if (fuid_dirtied)
4087		zfs_fuid_txhold(zfsvfs, tx);
4088	error = dmu_tx_assign(tx, TXG_WAIT);
4089	if (error) {
4090		zfs_acl_ids_free(&acl_ids);
4091		dmu_tx_abort(tx);
4092		getnewvnode_drop_reserve();
4093		ZFS_EXIT(zfsvfs);
4094		return (error);
4095	}
4096
4097	/*
4098	 * Create a new object for the symlink.
4099	 * for version 4 ZPL datsets the symlink will be an SA attribute
4100	 */
4101	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
4102
4103	if (fuid_dirtied)
4104		zfs_fuid_sync(zfsvfs, tx);
4105
4106	if (zp->z_is_sa)
4107		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
4108		    link, len, tx);
4109	else
4110		zfs_sa_symlink(zp, link, len, tx);
4111
4112	zp->z_size = len;
4113	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
4114	    &zp->z_size, sizeof (zp->z_size), tx);
4115	/*
4116	 * Insert the new object into the directory.
4117	 */
4118	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
4119
4120	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
4121	*vpp = ZTOV(zp);
4122
4123	zfs_acl_ids_free(&acl_ids);
4124
4125	dmu_tx_commit(tx);
4126
4127	getnewvnode_drop_reserve();
4128
4129	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4130		zil_commit(zilog, 0);
4131
4132	ZFS_EXIT(zfsvfs);
4133	return (error);
4134}
4135
4136/*
4137 * Return, in the buffer contained in the provided uio structure,
4138 * the symbolic path referred to by vp.
4139 *
4140 *	IN:	vp	- vnode of symbolic link.
4141 *		uio	- structure to contain the link path.
4142 *		cr	- credentials of caller.
4143 *		ct	- caller context
4144 *
4145 *	OUT:	uio	- structure containing the link path.
4146 *
4147 *	RETURN:	0 on success, error code on failure.
4148 *
4149 * Timestamps:
4150 *	vp - atime updated
4151 */
4152/* ARGSUSED */
4153static int
4154zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
4155{
4156	znode_t		*zp = VTOZ(vp);
4157	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4158	int		error;
4159
4160	ZFS_ENTER(zfsvfs);
4161	ZFS_VERIFY_ZP(zp);
4162
4163	if (zp->z_is_sa)
4164		error = sa_lookup_uio(zp->z_sa_hdl,
4165		    SA_ZPL_SYMLINK(zfsvfs), uio);
4166	else
4167		error = zfs_sa_readlink(zp, uio);
4168
4169	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4170
4171	ZFS_EXIT(zfsvfs);
4172	return (error);
4173}
4174
4175/*
4176 * Insert a new entry into directory tdvp referencing svp.
4177 *
4178 *	IN:	tdvp	- Directory to contain new entry.
4179 *		svp	- vnode of new entry.
4180 *		name	- name of new entry.
4181 *		cr	- credentials of caller.
4182 *		ct	- caller context
4183 *
4184 *	RETURN:	0 on success, error code on failure.
4185 *
4186 * Timestamps:
4187 *	tdvp - ctime|mtime updated
4188 *	 svp - ctime updated
4189 */
4190/* ARGSUSED */
4191static int
4192zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
4193    caller_context_t *ct, int flags)
4194{
4195	znode_t		*dzp = VTOZ(tdvp);
4196	znode_t		*tzp, *szp;
4197	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
4198	zilog_t		*zilog;
4199	dmu_tx_t	*tx;
4200	int		error;
4201	uint64_t	parent;
4202	uid_t		owner;
4203
4204	ASSERT(tdvp->v_type == VDIR);
4205
4206	ZFS_ENTER(zfsvfs);
4207	ZFS_VERIFY_ZP(dzp);
4208	zilog = zfsvfs->z_log;
4209
4210	/*
4211	 * POSIX dictates that we return EPERM here.
4212	 * Better choices include ENOTSUP or EISDIR.
4213	 */
4214	if (svp->v_type == VDIR) {
4215		ZFS_EXIT(zfsvfs);
4216		return (SET_ERROR(EPERM));
4217	}
4218
4219	szp = VTOZ(svp);
4220	ZFS_VERIFY_ZP(szp);
4221
4222	if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) {
4223		ZFS_EXIT(zfsvfs);
4224		return (SET_ERROR(EPERM));
4225	}
4226
4227	/* Prevent links to .zfs/shares files */
4228
4229	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
4230	    &parent, sizeof (uint64_t))) != 0) {
4231		ZFS_EXIT(zfsvfs);
4232		return (error);
4233	}
4234	if (parent == zfsvfs->z_shares_dir) {
4235		ZFS_EXIT(zfsvfs);
4236		return (SET_ERROR(EPERM));
4237	}
4238
4239	if (zfsvfs->z_utf8 && u8_validate(name,
4240	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4241		ZFS_EXIT(zfsvfs);
4242		return (SET_ERROR(EILSEQ));
4243	}
4244
4245	/*
4246	 * We do not support links between attributes and non-attributes
4247	 * because of the potential security risk of creating links
4248	 * into "normal" file space in order to circumvent restrictions
4249	 * imposed in attribute space.
4250	 */
4251	if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
4252		ZFS_EXIT(zfsvfs);
4253		return (SET_ERROR(EINVAL));
4254	}
4255
4256
4257	owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
4258	if (owner != crgetuid(cr) && secpolicy_basic_link(svp, cr) != 0) {
4259		ZFS_EXIT(zfsvfs);
4260		return (SET_ERROR(EPERM));
4261	}
4262
4263	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4264		ZFS_EXIT(zfsvfs);
4265		return (error);
4266	}
4267
4268	/*
4269	 * Attempt to lock directory; fail if entry already exists.
4270	 */
4271	error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW);
4272	if (error) {
4273		ZFS_EXIT(zfsvfs);
4274		return (error);
4275	}
4276
4277	tx = dmu_tx_create(zfsvfs->z_os);
4278	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
4279	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4280	zfs_sa_upgrade_txholds(tx, szp);
4281	zfs_sa_upgrade_txholds(tx, dzp);
4282	error = dmu_tx_assign(tx, TXG_WAIT);
4283	if (error) {
4284		dmu_tx_abort(tx);
4285		ZFS_EXIT(zfsvfs);
4286		return (error);
4287	}
4288
4289	error = zfs_link_create(dzp, name, szp, tx, 0);
4290
4291	if (error == 0) {
4292		uint64_t txtype = TX_LINK;
4293		zfs_log_link(zilog, tx, txtype, dzp, szp, name);
4294	}
4295
4296	dmu_tx_commit(tx);
4297
4298	if (error == 0) {
4299		vnevent_link(svp, ct);
4300	}
4301
4302	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4303		zil_commit(zilog, 0);
4304
4305	ZFS_EXIT(zfsvfs);
4306	return (error);
4307}
4308
4309
4310/*ARGSUSED*/
4311void
4312zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4313{
4314	znode_t	*zp = VTOZ(vp);
4315	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4316	int error;
4317
4318	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4319	if (zp->z_sa_hdl == NULL) {
4320		/*
4321		 * The fs has been unmounted, or we did a
4322		 * suspend/resume and this file no longer exists.
4323		 */
4324		rw_exit(&zfsvfs->z_teardown_inactive_lock);
4325		vrecycle(vp);
4326		return;
4327	}
4328
4329	if (zp->z_unlinked) {
4330		/*
4331		 * Fast path to recycle a vnode of a removed file.
4332		 */
4333		rw_exit(&zfsvfs->z_teardown_inactive_lock);
4334		vrecycle(vp);
4335		return;
4336	}
4337
4338	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
4339		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
4340
4341		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4342		zfs_sa_upgrade_txholds(tx, zp);
4343		error = dmu_tx_assign(tx, TXG_WAIT);
4344		if (error) {
4345			dmu_tx_abort(tx);
4346		} else {
4347			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
4348			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
4349			zp->z_atime_dirty = 0;
4350			dmu_tx_commit(tx);
4351		}
4352	}
4353	rw_exit(&zfsvfs->z_teardown_inactive_lock);
4354}
4355
4356
4357CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
4358CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
4359
4360/*ARGSUSED*/
4361static int
4362zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
4363{
4364	znode_t		*zp = VTOZ(vp);
4365	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4366	uint32_t	gen;
4367	uint64_t	gen64;
4368	uint64_t	object = zp->z_id;
4369	zfid_short_t	*zfid;
4370	int		size, i, error;
4371
4372	ZFS_ENTER(zfsvfs);
4373	ZFS_VERIFY_ZP(zp);
4374
4375	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
4376	    &gen64, sizeof (uint64_t))) != 0) {
4377		ZFS_EXIT(zfsvfs);
4378		return (error);
4379	}
4380
4381	gen = (uint32_t)gen64;
4382
4383	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
4384
4385#ifdef illumos
4386	if (fidp->fid_len < size) {
4387		fidp->fid_len = size;
4388		ZFS_EXIT(zfsvfs);
4389		return (SET_ERROR(ENOSPC));
4390	}
4391#else
4392	fidp->fid_len = size;
4393#endif
4394
4395	zfid = (zfid_short_t *)fidp;
4396
4397	zfid->zf_len = size;
4398
4399	for (i = 0; i < sizeof (zfid->zf_object); i++)
4400		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
4401
4402	/* Must have a non-zero generation number to distinguish from .zfs */
4403	if (gen == 0)
4404		gen = 1;
4405	for (i = 0; i < sizeof (zfid->zf_gen); i++)
4406		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
4407
4408	if (size == LONG_FID_LEN) {
4409		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
4410		zfid_long_t	*zlfid;
4411
4412		zlfid = (zfid_long_t *)fidp;
4413
4414		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
4415			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
4416
4417		/* XXX - this should be the generation number for the objset */
4418		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
4419			zlfid->zf_setgen[i] = 0;
4420	}
4421
4422	ZFS_EXIT(zfsvfs);
4423	return (0);
4424}
4425
4426static int
4427zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
4428    caller_context_t *ct)
4429{
4430	znode_t		*zp, *xzp;
4431	zfsvfs_t	*zfsvfs;
4432	int		error;
4433
4434	switch (cmd) {
4435	case _PC_LINK_MAX:
4436		*valp = MIN(LONG_MAX, ZFS_LINK_MAX);
4437		return (0);
4438
4439	case _PC_FILESIZEBITS:
4440		*valp = 64;
4441		return (0);
4442#ifdef illumos
4443	case _PC_XATTR_EXISTS:
4444		zp = VTOZ(vp);
4445		zfsvfs = zp->z_zfsvfs;
4446		ZFS_ENTER(zfsvfs);
4447		ZFS_VERIFY_ZP(zp);
4448		*valp = 0;
4449		error = zfs_dirent_lookup(zp, "", &xzp,
4450		    ZXATTR | ZEXISTS | ZSHARED);
4451		if (error == 0) {
4452			if (!zfs_dirempty(xzp))
4453				*valp = 1;
4454			vrele(ZTOV(xzp));
4455		} else if (error == ENOENT) {
4456			/*
4457			 * If there aren't extended attributes, it's the
4458			 * same as having zero of them.
4459			 */
4460			error = 0;
4461		}
4462		ZFS_EXIT(zfsvfs);
4463		return (error);
4464
4465	case _PC_SATTR_ENABLED:
4466	case _PC_SATTR_EXISTS:
4467		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
4468		    (vp->v_type == VREG || vp->v_type == VDIR);
4469		return (0);
4470
4471	case _PC_ACCESS_FILTERING:
4472		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
4473		    vp->v_type == VDIR;
4474		return (0);
4475
4476	case _PC_ACL_ENABLED:
4477		*valp = _ACL_ACE_ENABLED;
4478		return (0);
4479#endif	/* illumos */
4480	case _PC_MIN_HOLE_SIZE:
4481		*valp = (int)SPA_MINBLOCKSIZE;
4482		return (0);
4483#ifdef illumos
4484	case _PC_TIMESTAMP_RESOLUTION:
4485		/* nanosecond timestamp resolution */
4486		*valp = 1L;
4487		return (0);
4488#endif
4489	case _PC_ACL_EXTENDED:
4490		*valp = 0;
4491		return (0);
4492
4493	case _PC_ACL_NFS4:
4494		*valp = 1;
4495		return (0);
4496
4497	case _PC_ACL_PATH_MAX:
4498		*valp = ACL_MAX_ENTRIES;
4499		return (0);
4500
4501	default:
4502		return (EOPNOTSUPP);
4503	}
4504}
4505
4506/*ARGSUSED*/
4507static int
4508zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4509    caller_context_t *ct)
4510{
4511	znode_t *zp = VTOZ(vp);
4512	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4513	int error;
4514	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4515
4516	ZFS_ENTER(zfsvfs);
4517	ZFS_VERIFY_ZP(zp);
4518	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
4519	ZFS_EXIT(zfsvfs);
4520
4521	return (error);
4522}
4523
4524/*ARGSUSED*/
4525int
4526zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4527    caller_context_t *ct)
4528{
4529	znode_t *zp = VTOZ(vp);
4530	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4531	int error;
4532	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4533	zilog_t	*zilog = zfsvfs->z_log;
4534
4535	ZFS_ENTER(zfsvfs);
4536	ZFS_VERIFY_ZP(zp);
4537
4538	error = zfs_setacl(zp, vsecp, skipaclchk, cr);
4539
4540	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4541		zil_commit(zilog, 0);
4542
4543	ZFS_EXIT(zfsvfs);
4544	return (error);
4545}
4546
4547static int
4548zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
4549    int *rahead)
4550{
4551	znode_t *zp = VTOZ(vp);
4552	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4553	objset_t *os = zp->z_zfsvfs->z_os;
4554	locked_range_t *lr;
4555	vm_object_t object;
4556	off_t start, end, obj_size;
4557	uint_t blksz;
4558	int pgsin_b, pgsin_a;
4559	int error;
4560
4561	ZFS_ENTER(zfsvfs);
4562	ZFS_VERIFY_ZP(zp);
4563
4564	start = IDX_TO_OFF(ma[0]->pindex);
4565	end = IDX_TO_OFF(ma[count - 1]->pindex + 1);
4566
4567	/*
4568	 * Lock a range covering all required and optional pages.
4569	 * Note that we need to handle the case of the block size growing.
4570	 */
4571	for (;;) {
4572		blksz = zp->z_blksz;
4573		lr = rangelock_enter(&zp->z_rangelock, rounddown(start, blksz),
4574		    roundup(end, blksz) - rounddown(start, blksz), RL_READER);
4575		if (blksz == zp->z_blksz)
4576			break;
4577		rangelock_exit(lr);
4578	}
4579
4580	object = ma[0]->object;
4581	zfs_vmobject_wlock(object);
4582	obj_size = object->un_pager.vnp.vnp_size;
4583	zfs_vmobject_wunlock(object);
4584	if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) {
4585		rangelock_exit(lr);
4586		ZFS_EXIT(zfsvfs);
4587		return (zfs_vm_pagerret_bad);
4588	}
4589
4590	pgsin_b = 0;
4591	if (rbehind != NULL) {
4592		pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz));
4593		pgsin_b = MIN(*rbehind, pgsin_b);
4594	}
4595
4596	pgsin_a = 0;
4597	if (rahead != NULL) {
4598		pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end);
4599		if (end + IDX_TO_OFF(pgsin_a) >= obj_size)
4600			pgsin_a = OFF_TO_IDX(round_page(obj_size) - end);
4601		pgsin_a = MIN(*rahead, pgsin_a);
4602	}
4603
4604	/*
4605	 * NB: we need to pass the exact byte size of the data that we expect
4606	 * to read after accounting for the file size.  This is required because
4607	 * ZFS will panic if we request DMU to read beyond the end of the last
4608	 * allocated block.
4609	 */
4610	error = dmu_read_pages(os, zp->z_id, ma, count, &pgsin_b, &pgsin_a,
4611	    MIN(end, obj_size) - (end - PAGE_SIZE));
4612
4613	rangelock_exit(lr);
4614	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4615	ZFS_EXIT(zfsvfs);
4616
4617	if (error != 0)
4618		return (zfs_vm_pagerret_error);
4619
4620	VM_CNT_INC(v_vnodein);
4621	VM_CNT_ADD(v_vnodepgsin, count + pgsin_b + pgsin_a);
4622	if (rbehind != NULL)
4623		*rbehind = pgsin_b;
4624	if (rahead != NULL)
4625		*rahead = pgsin_a;
4626	return (zfs_vm_pagerret_ok);
4627}
4628
4629static int
4630zfs_freebsd_getpages(ap)
4631	struct vop_getpages_args /* {
4632		struct vnode *a_vp;
4633		vm_page_t *a_m;
4634		int a_count;
4635		int *a_rbehind;
4636		int *a_rahead;
4637	} */ *ap;
4638{
4639
4640	return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind,
4641	    ap->a_rahead));
4642}
4643
4644static int
4645zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
4646    int *rtvals)
4647{
4648	znode_t		*zp = VTOZ(vp);
4649	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4650	locked_range_t	*lr;
4651	dmu_tx_t	*tx;
4652	struct sf_buf	*sf;
4653	vm_object_t	object;
4654	vm_page_t	m;
4655	caddr_t		va;
4656	size_t		tocopy;
4657	size_t		lo_len;
4658	vm_ooffset_t	lo_off;
4659	vm_ooffset_t	off;
4660	uint_t		blksz;
4661	int		ncount;
4662	int		pcount;
4663	int		err;
4664	int		i;
4665
4666	ZFS_ENTER(zfsvfs);
4667	ZFS_VERIFY_ZP(zp);
4668
4669	object = vp->v_object;
4670	pcount = btoc(len);
4671	ncount = pcount;
4672
4673	KASSERT(ma[0]->object == object, ("mismatching object"));
4674	KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length"));
4675
4676	for (i = 0; i < pcount; i++)
4677		rtvals[i] = zfs_vm_pagerret_error;
4678
4679	off = IDX_TO_OFF(ma[0]->pindex);
4680	blksz = zp->z_blksz;
4681	lo_off = rounddown(off, blksz);
4682	lo_len = roundup(len + (off - lo_off), blksz);
4683	lr = rangelock_enter(&zp->z_rangelock, lo_off, lo_len, RL_WRITER);
4684
4685	zfs_vmobject_wlock(object);
4686	if (len + off > object->un_pager.vnp.vnp_size) {
4687		if (object->un_pager.vnp.vnp_size > off) {
4688			int pgoff;
4689
4690			len = object->un_pager.vnp.vnp_size - off;
4691			ncount = btoc(len);
4692			if ((pgoff = (int)len & PAGE_MASK) != 0) {
4693				/*
4694				 * If the object is locked and the following
4695				 * conditions hold, then the page's dirty
4696				 * field cannot be concurrently changed by a
4697				 * pmap operation.
4698				 */
4699				m = ma[ncount - 1];
4700				vm_page_assert_sbusied(m);
4701				KASSERT(!pmap_page_is_write_mapped(m),
4702				    ("zfs_putpages: page %p is not read-only", m));
4703				vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
4704				    pgoff);
4705			}
4706		} else {
4707			len = 0;
4708			ncount = 0;
4709		}
4710		if (ncount < pcount) {
4711			for (i = ncount; i < pcount; i++) {
4712				rtvals[i] = zfs_vm_pagerret_bad;
4713			}
4714		}
4715	}
4716	zfs_vmobject_wunlock(object);
4717
4718	if (ncount == 0)
4719		goto out;
4720
4721	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
4722	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
4723		goto out;
4724	}
4725
4726	tx = dmu_tx_create(zfsvfs->z_os);
4727	dmu_tx_hold_write(tx, zp->z_id, off, len);
4728
4729	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4730	zfs_sa_upgrade_txholds(tx, zp);
4731	err = dmu_tx_assign(tx, TXG_WAIT);
4732	if (err != 0) {
4733		dmu_tx_abort(tx);
4734		goto out;
4735	}
4736
4737	if (zp->z_blksz < PAGE_SIZE) {
4738		for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
4739			tocopy = len > PAGE_SIZE ? PAGE_SIZE : len;
4740			va = zfs_map_page(ma[i], &sf);
4741			dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
4742			zfs_unmap_page(sf);
4743		}
4744	} else {
4745		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx);
4746	}
4747
4748	if (err == 0) {
4749		uint64_t mtime[2], ctime[2];
4750		sa_bulk_attr_t bulk[3];
4751		int count = 0;
4752
4753		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
4754		    &mtime, 16);
4755		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
4756		    &ctime, 16);
4757		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
4758		    &zp->z_pflags, 8);
4759		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
4760		    B_TRUE);
4761		err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
4762		ASSERT0(err);
4763		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
4764
4765		zfs_vmobject_wlock(object);
4766		for (i = 0; i < ncount; i++) {
4767			rtvals[i] = zfs_vm_pagerret_ok;
4768			vm_page_undirty(ma[i]);
4769		}
4770		zfs_vmobject_wunlock(object);
4771		VM_CNT_INC(v_vnodeout);
4772		VM_CNT_ADD(v_vnodepgsout, ncount);
4773	}
4774	dmu_tx_commit(tx);
4775
4776out:
4777	rangelock_exit(lr);
4778	if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 ||
4779	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4780		zil_commit(zfsvfs->z_log, zp->z_id);
4781	ZFS_EXIT(zfsvfs);
4782	return (rtvals[0]);
4783}
4784
4785int
4786zfs_freebsd_putpages(ap)
4787	struct vop_putpages_args /* {
4788		struct vnode *a_vp;
4789		vm_page_t *a_m;
4790		int a_count;
4791		int a_sync;
4792		int *a_rtvals;
4793	} */ *ap;
4794{
4795
4796	return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync,
4797	    ap->a_rtvals));
4798}
4799
4800static int
4801zfs_freebsd_bmap(ap)
4802	struct vop_bmap_args /* {
4803		struct vnode *a_vp;
4804		daddr_t  a_bn;
4805		struct bufobj **a_bop;
4806		daddr_t *a_bnp;
4807		int *a_runp;
4808		int *a_runb;
4809	} */ *ap;
4810{
4811
4812	if (ap->a_bop != NULL)
4813		*ap->a_bop = &ap->a_vp->v_bufobj;
4814	if (ap->a_bnp != NULL)
4815		*ap->a_bnp = ap->a_bn;
4816	if (ap->a_runp != NULL)
4817		*ap->a_runp = 0;
4818	if (ap->a_runb != NULL)
4819		*ap->a_runb = 0;
4820
4821	return (0);
4822}
4823
4824static int
4825zfs_freebsd_open(ap)
4826	struct vop_open_args /* {
4827		struct vnode *a_vp;
4828		int a_mode;
4829		struct ucred *a_cred;
4830		struct thread *a_td;
4831	} */ *ap;
4832{
4833	vnode_t	*vp = ap->a_vp;
4834	znode_t *zp = VTOZ(vp);
4835	int error;
4836
4837	error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
4838	if (error == 0)
4839		vnode_create_vobject(vp, zp->z_size, ap->a_td);
4840	return (error);
4841}
4842
4843static int
4844zfs_freebsd_close(ap)
4845	struct vop_close_args /* {
4846		struct vnode *a_vp;
4847		int  a_fflag;
4848		struct ucred *a_cred;
4849		struct thread *a_td;
4850	} */ *ap;
4851{
4852
4853	return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred, NULL));
4854}
4855
4856static int
4857zfs_freebsd_ioctl(ap)
4858	struct vop_ioctl_args /* {
4859		struct vnode *a_vp;
4860		u_long a_command;
4861		caddr_t a_data;
4862		int a_fflag;
4863		struct ucred *cred;
4864		struct thread *td;
4865	} */ *ap;
4866{
4867
4868	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
4869	    ap->a_fflag, ap->a_cred, NULL, NULL));
4870}
4871
4872static int
4873ioflags(int ioflags)
4874{
4875	int flags = 0;
4876
4877	if (ioflags & IO_APPEND)
4878		flags |= FAPPEND;
4879	if (ioflags & IO_NDELAY)
4880		flags |= FNONBLOCK;
4881	if (ioflags & IO_SYNC)
4882		flags |= (FSYNC | FDSYNC | FRSYNC);
4883
4884	return (flags);
4885}
4886
4887static int
4888zfs_freebsd_read(ap)
4889	struct vop_read_args /* {
4890		struct vnode *a_vp;
4891		struct uio *a_uio;
4892		int a_ioflag;
4893		struct ucred *a_cred;
4894	} */ *ap;
4895{
4896
4897	return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
4898	    ap->a_cred, NULL));
4899}
4900
4901static int
4902zfs_freebsd_write(ap)
4903	struct vop_write_args /* {
4904		struct vnode *a_vp;
4905		struct uio *a_uio;
4906		int a_ioflag;
4907		struct ucred *a_cred;
4908	} */ *ap;
4909{
4910
4911	return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
4912	    ap->a_cred, NULL));
4913}
4914
4915static int
4916zfs_freebsd_access(ap)
4917	struct vop_access_args /* {
4918		struct vnode *a_vp;
4919		accmode_t a_accmode;
4920		struct ucred *a_cred;
4921		struct thread *a_td;
4922	} */ *ap;
4923{
4924	vnode_t *vp = ap->a_vp;
4925	znode_t *zp = VTOZ(vp);
4926	accmode_t accmode;
4927	int error = 0;
4928
4929	if (ap->a_accmode == VEXEC) {
4930		if (zfs_freebsd_fastaccesschk_execute(ap->a_vp, ap->a_cred) == 0)
4931			return (0);
4932	}
4933
4934	/*
4935	 * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND,
4936	 */
4937	accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
4938	if (accmode != 0)
4939		error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL);
4940
4941	/*
4942	 * VADMIN has to be handled by vaccess().
4943	 */
4944	if (error == 0) {
4945		accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
4946		if (accmode != 0) {
4947			error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
4948			    zp->z_gid, accmode, ap->a_cred, NULL);
4949		}
4950	}
4951
4952	/*
4953	 * For VEXEC, ensure that at least one execute bit is set for
4954	 * non-directories.
4955	 */
4956	if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR &&
4957	    (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) {
4958		error = EACCES;
4959	}
4960
4961	return (error);
4962}
4963
4964static int
4965zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached)
4966{
4967	struct componentname *cnp = ap->a_cnp;
4968	char nm[NAME_MAX + 1];
4969
4970	ASSERT(cnp->cn_namelen < sizeof(nm));
4971	strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
4972
4973	return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
4974	    cnp->cn_cred, cnp->cn_thread, 0, cached));
4975}
4976
4977static int
4978zfs_freebsd_cachedlookup(struct vop_cachedlookup_args *ap)
4979{
4980
4981	return (zfs_freebsd_lookup((struct vop_lookup_args *)ap, B_TRUE));
4982}
4983
4984static int
4985zfs_cache_lookup(ap)
4986	struct vop_lookup_args /* {
4987		struct vnode *a_dvp;
4988		struct vnode **a_vpp;
4989		struct componentname *a_cnp;
4990	} */ *ap;
4991{
4992	zfsvfs_t *zfsvfs;
4993
4994	zfsvfs = ap->a_dvp->v_mount->mnt_data;
4995	if (zfsvfs->z_use_namecache)
4996		return (vfs_cache_lookup(ap));
4997	else
4998		return (zfs_freebsd_lookup(ap, B_FALSE));
4999}
5000
5001static int
5002zfs_freebsd_create(ap)
5003	struct vop_create_args /* {
5004		struct vnode *a_dvp;
5005		struct vnode **a_vpp;
5006		struct componentname *a_cnp;
5007		struct vattr *a_vap;
5008	} */ *ap;
5009{
5010	zfsvfs_t *zfsvfs;
5011	struct componentname *cnp = ap->a_cnp;
5012	vattr_t *vap = ap->a_vap;
5013	int error, mode;
5014
5015	ASSERT(cnp->cn_flags & SAVENAME);
5016
5017	vattr_init_mask(vap);
5018	mode = vap->va_mode & ALLPERMS;
5019	zfsvfs = ap->a_dvp->v_mount->mnt_data;
5020
5021	error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
5022	    ap->a_vpp, cnp->cn_cred, cnp->cn_thread);
5023	if (zfsvfs->z_use_namecache &&
5024	    error == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
5025		cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
5026	return (error);
5027}
5028
5029static int
5030zfs_freebsd_remove(ap)
5031	struct vop_remove_args /* {
5032		struct vnode *a_dvp;
5033		struct vnode *a_vp;
5034		struct componentname *a_cnp;
5035	} */ *ap;
5036{
5037
5038	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
5039
5040	return (zfs_remove(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
5041	    ap->a_cnp->cn_cred));
5042}
5043
5044static int
5045zfs_freebsd_mkdir(ap)
5046	struct vop_mkdir_args /* {
5047		struct vnode *a_dvp;
5048		struct vnode **a_vpp;
5049		struct componentname *a_cnp;
5050		struct vattr *a_vap;
5051	} */ *ap;
5052{
5053	vattr_t *vap = ap->a_vap;
5054
5055	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
5056
5057	vattr_init_mask(vap);
5058
5059	return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
5060	    ap->a_cnp->cn_cred));
5061}
5062
5063static int
5064zfs_freebsd_rmdir(ap)
5065	struct vop_rmdir_args /* {
5066		struct vnode *a_dvp;
5067		struct vnode *a_vp;
5068		struct componentname *a_cnp;
5069	} */ *ap;
5070{
5071	struct componentname *cnp = ap->a_cnp;
5072
5073	ASSERT(cnp->cn_flags & SAVENAME);
5074
5075	return (zfs_rmdir(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
5076}
5077
5078static int
5079zfs_freebsd_readdir(ap)
5080	struct vop_readdir_args /* {
5081		struct vnode *a_vp;
5082		struct uio *a_uio;
5083		struct ucred *a_cred;
5084		int *a_eofflag;
5085		int *a_ncookies;
5086		u_long **a_cookies;
5087	} */ *ap;
5088{
5089
5090	return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
5091	    ap->a_ncookies, ap->a_cookies));
5092}
5093
5094static int
5095zfs_freebsd_fsync(ap)
5096	struct vop_fsync_args /* {
5097		struct vnode *a_vp;
5098		int a_waitfor;
5099		struct thread *a_td;
5100	} */ *ap;
5101{
5102
5103	vop_stdfsync(ap);
5104	return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL));
5105}
5106
5107static int
5108zfs_freebsd_getattr(ap)
5109	struct vop_getattr_args /* {
5110		struct vnode *a_vp;
5111		struct vattr *a_vap;
5112		struct ucred *a_cred;
5113	} */ *ap;
5114{
5115	vattr_t *vap = ap->a_vap;
5116	xvattr_t xvap;
5117	u_long fflags = 0;
5118	int error;
5119
5120	xva_init(&xvap);
5121	xvap.xva_vattr = *vap;
5122	xvap.xva_vattr.va_mask |= AT_XVATTR;
5123
5124	/* Convert chflags into ZFS-type flags. */
5125	/* XXX: what about SF_SETTABLE?. */
5126	XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
5127	XVA_SET_REQ(&xvap, XAT_APPENDONLY);
5128	XVA_SET_REQ(&xvap, XAT_NOUNLINK);
5129	XVA_SET_REQ(&xvap, XAT_NODUMP);
5130	XVA_SET_REQ(&xvap, XAT_READONLY);
5131	XVA_SET_REQ(&xvap, XAT_ARCHIVE);
5132	XVA_SET_REQ(&xvap, XAT_SYSTEM);
5133	XVA_SET_REQ(&xvap, XAT_HIDDEN);
5134	XVA_SET_REQ(&xvap, XAT_REPARSE);
5135	XVA_SET_REQ(&xvap, XAT_OFFLINE);
5136	XVA_SET_REQ(&xvap, XAT_SPARSE);
5137
5138	error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
5139	if (error != 0)
5140		return (error);
5141
5142	/* Convert ZFS xattr into chflags. */
5143#define	FLAG_CHECK(fflag, xflag, xfield)	do {			\
5144	if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0)		\
5145		fflags |= (fflag);					\
5146} while (0)
5147	FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
5148	    xvap.xva_xoptattrs.xoa_immutable);
5149	FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
5150	    xvap.xva_xoptattrs.xoa_appendonly);
5151	FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
5152	    xvap.xva_xoptattrs.xoa_nounlink);
5153	FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE,
5154	    xvap.xva_xoptattrs.xoa_archive);
5155	FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
5156	    xvap.xva_xoptattrs.xoa_nodump);
5157	FLAG_CHECK(UF_READONLY, XAT_READONLY,
5158	    xvap.xva_xoptattrs.xoa_readonly);
5159	FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM,
5160	    xvap.xva_xoptattrs.xoa_system);
5161	FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN,
5162	    xvap.xva_xoptattrs.xoa_hidden);
5163	FLAG_CHECK(UF_REPARSE, XAT_REPARSE,
5164	    xvap.xva_xoptattrs.xoa_reparse);
5165	FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE,
5166	    xvap.xva_xoptattrs.xoa_offline);
5167	FLAG_CHECK(UF_SPARSE, XAT_SPARSE,
5168	    xvap.xva_xoptattrs.xoa_sparse);
5169
5170#undef	FLAG_CHECK
5171	*vap = xvap.xva_vattr;
5172	vap->va_flags = fflags;
5173	return (0);
5174}
5175
5176static int
5177zfs_freebsd_setattr(ap)
5178	struct vop_setattr_args /* {
5179		struct vnode *a_vp;
5180		struct vattr *a_vap;
5181		struct ucred *a_cred;
5182	} */ *ap;
5183{
5184	vnode_t *vp = ap->a_vp;
5185	vattr_t *vap = ap->a_vap;
5186	cred_t *cred = ap->a_cred;
5187	xvattr_t xvap;
5188	u_long fflags;
5189	uint64_t zflags;
5190
5191	vattr_init_mask(vap);
5192	vap->va_mask &= ~AT_NOSET;
5193
5194	xva_init(&xvap);
5195	xvap.xva_vattr = *vap;
5196
5197	zflags = VTOZ(vp)->z_pflags;
5198
5199	if (vap->va_flags != VNOVAL) {
5200		zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs;
5201		int error;
5202
5203		if (zfsvfs->z_use_fuids == B_FALSE)
5204			return (EOPNOTSUPP);
5205
5206		fflags = vap->va_flags;
5207		/*
5208		 * XXX KDM
5209		 * We need to figure out whether it makes sense to allow
5210		 * UF_REPARSE through, since we don't really have other
5211		 * facilities to handle reparse points and zfs_setattr()
5212		 * doesn't currently allow setting that attribute anyway.
5213		 */
5214		if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE|
5215		     UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE|
5216		     UF_OFFLINE|UF_SPARSE)) != 0)
5217			return (EOPNOTSUPP);
5218		/*
5219		 * Unprivileged processes are not permitted to unset system
5220		 * flags, or modify flags if any system flags are set.
5221		 * Privileged non-jail processes may not modify system flags
5222		 * if securelevel > 0 and any existing system flags are set.
5223		 * Privileged jail processes behave like privileged non-jail
5224		 * processes if the PR_ALLOW_CHFLAGS permission bit is set;
5225		 * otherwise, they behave like unprivileged processes.
5226		 */
5227		if (secpolicy_fs_owner(vp->v_mount, cred) == 0 ||
5228		    priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) {
5229			if (zflags &
5230			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
5231				error = securelevel_gt(cred, 0);
5232				if (error != 0)
5233					return (error);
5234			}
5235		} else {
5236			/*
5237			 * Callers may only modify the file flags on objects they
5238			 * have VADMIN rights for.
5239			 */
5240			if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0)
5241				return (error);
5242			if (zflags &
5243			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
5244				return (EPERM);
5245			}
5246			if (fflags &
5247			    (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
5248				return (EPERM);
5249			}
5250		}
5251
5252#define	FLAG_CHANGE(fflag, zflag, xflag, xfield)	do {		\
5253	if (((fflags & (fflag)) && !(zflags & (zflag))) ||		\
5254	    ((zflags & (zflag)) && !(fflags & (fflag)))) {		\
5255		XVA_SET_REQ(&xvap, (xflag));				\
5256		(xfield) = ((fflags & (fflag)) != 0);			\
5257	}								\
5258} while (0)
5259		/* Convert chflags into ZFS-type flags. */
5260		/* XXX: what about SF_SETTABLE?. */
5261		FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
5262		    xvap.xva_xoptattrs.xoa_immutable);
5263		FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
5264		    xvap.xva_xoptattrs.xoa_appendonly);
5265		FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
5266		    xvap.xva_xoptattrs.xoa_nounlink);
5267		FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE,
5268		    xvap.xva_xoptattrs.xoa_archive);
5269		FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
5270		    xvap.xva_xoptattrs.xoa_nodump);
5271		FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY,
5272		    xvap.xva_xoptattrs.xoa_readonly);
5273		FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM,
5274		    xvap.xva_xoptattrs.xoa_system);
5275		FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN,
5276		    xvap.xva_xoptattrs.xoa_hidden);
5277		FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE,
5278		    xvap.xva_xoptattrs.xoa_reparse);
5279		FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE,
5280		    xvap.xva_xoptattrs.xoa_offline);
5281		FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE,
5282		    xvap.xva_xoptattrs.xoa_sparse);
5283#undef	FLAG_CHANGE
5284	}
5285	if (vap->va_birthtime.tv_sec != VNOVAL) {
5286		xvap.xva_vattr.va_mask |= AT_XVATTR;
5287		XVA_SET_REQ(&xvap, XAT_CREATETIME);
5288	}
5289	return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL));
5290}
5291
5292static int
5293zfs_freebsd_rename(ap)
5294	struct vop_rename_args  /* {
5295		struct vnode *a_fdvp;
5296		struct vnode *a_fvp;
5297		struct componentname *a_fcnp;
5298		struct vnode *a_tdvp;
5299		struct vnode *a_tvp;
5300		struct componentname *a_tcnp;
5301	} */ *ap;
5302{
5303	vnode_t *fdvp = ap->a_fdvp;
5304	vnode_t *fvp = ap->a_fvp;
5305	vnode_t *tdvp = ap->a_tdvp;
5306	vnode_t *tvp = ap->a_tvp;
5307	int error;
5308
5309	ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
5310	ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
5311
5312	error = zfs_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
5313	    ap->a_tcnp, ap->a_fcnp->cn_cred);
5314
5315	vrele(fdvp);
5316	vrele(fvp);
5317	vrele(tdvp);
5318	if (tvp != NULL)
5319		vrele(tvp);
5320
5321	return (error);
5322}
5323
5324static int
5325zfs_freebsd_symlink(ap)
5326	struct vop_symlink_args /* {
5327		struct vnode *a_dvp;
5328		struct vnode **a_vpp;
5329		struct componentname *a_cnp;
5330		struct vattr *a_vap;
5331		char *a_target;
5332	} */ *ap;
5333{
5334	struct componentname *cnp = ap->a_cnp;
5335	vattr_t *vap = ap->a_vap;
5336
5337	ASSERT(cnp->cn_flags & SAVENAME);
5338
5339	vap->va_type = VLNK;	/* FreeBSD: Syscall only sets va_mode. */
5340	vattr_init_mask(vap);
5341
5342	return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
5343	    ap->a_target, cnp->cn_cred, cnp->cn_thread));
5344}
5345
5346static int
5347zfs_freebsd_readlink(ap)
5348	struct vop_readlink_args /* {
5349		struct vnode *a_vp;
5350		struct uio *a_uio;
5351		struct ucred *a_cred;
5352	} */ *ap;
5353{
5354
5355	return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
5356}
5357
5358static int
5359zfs_freebsd_link(ap)
5360	struct vop_link_args /* {
5361		struct vnode *a_tdvp;
5362		struct vnode *a_vp;
5363		struct componentname *a_cnp;
5364	} */ *ap;
5365{
5366	struct componentname *cnp = ap->a_cnp;
5367	vnode_t *vp = ap->a_vp;
5368	vnode_t *tdvp = ap->a_tdvp;
5369
5370	if (tdvp->v_mount != vp->v_mount)
5371		return (EXDEV);
5372
5373	ASSERT(cnp->cn_flags & SAVENAME);
5374
5375	return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
5376}
5377
5378static int
5379zfs_freebsd_inactive(ap)
5380	struct vop_inactive_args /* {
5381		struct vnode *a_vp;
5382		struct thread *a_td;
5383	} */ *ap;
5384{
5385	vnode_t *vp = ap->a_vp;
5386
5387	zfs_inactive(vp, ap->a_td->td_ucred, NULL);
5388	return (0);
5389}
5390
5391static int
5392zfs_freebsd_reclaim(ap)
5393	struct vop_reclaim_args /* {
5394		struct vnode *a_vp;
5395		struct thread *a_td;
5396	} */ *ap;
5397{
5398	vnode_t	*vp = ap->a_vp;
5399	znode_t	*zp = VTOZ(vp);
5400	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5401
5402	ASSERT(zp != NULL);
5403
5404	/* Destroy the vm object and flush associated pages. */
5405	vnode_destroy_vobject(vp);
5406
5407	/*
5408	 * z_teardown_inactive_lock protects from a race with
5409	 * zfs_znode_dmu_fini in zfsvfs_teardown during
5410	 * force unmount.
5411	 */
5412	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
5413	if (zp->z_sa_hdl == NULL)
5414		zfs_znode_free(zp);
5415	else
5416		zfs_zinactive(zp);
5417	rw_exit(&zfsvfs->z_teardown_inactive_lock);
5418
5419	vp->v_data = NULL;
5420	return (0);
5421}
5422
5423static int
5424zfs_freebsd_fid(ap)
5425	struct vop_fid_args /* {
5426		struct vnode *a_vp;
5427		struct fid *a_fid;
5428	} */ *ap;
5429{
5430
5431	return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
5432}
5433
5434static int
5435zfs_freebsd_pathconf(ap)
5436	struct vop_pathconf_args /* {
5437		struct vnode *a_vp;
5438		int a_name;
5439		register_t *a_retval;
5440	} */ *ap;
5441{
5442	ulong_t val;
5443	int error;
5444
5445	error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL);
5446	if (error == 0) {
5447		*ap->a_retval = val;
5448		return (error);
5449	}
5450	if (error != EOPNOTSUPP)
5451		return (error);
5452
5453	switch (ap->a_name) {
5454	case _PC_NAME_MAX:
5455		*ap->a_retval = NAME_MAX;
5456		return (0);
5457	case _PC_PIPE_BUF:
5458		if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) {
5459			*ap->a_retval = PIPE_BUF;
5460			return (0);
5461		}
5462		return (EINVAL);
5463	default:
5464		return (vop_stdpathconf(ap));
5465	}
5466}
5467
5468/*
5469 * FreeBSD's extended attributes namespace defines file name prefix for ZFS'
5470 * extended attribute name:
5471 *
5472 *	NAMESPACE	PREFIX
5473 *	system		freebsd:system:
5474 *	user		(none, can be used to access ZFS fsattr(5) attributes
5475 *			created on Solaris)
5476 */
5477static int
5478zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
5479    size_t size)
5480{
5481	const char *namespace, *prefix, *suffix;
5482
5483	/* We don't allow '/' character in attribute name. */
5484	if (strchr(name, '/') != NULL)
5485		return (EINVAL);
5486	/* We don't allow attribute names that start with "freebsd:" string. */
5487	if (strncmp(name, "freebsd:", 8) == 0)
5488		return (EINVAL);
5489
5490	bzero(attrname, size);
5491
5492	switch (attrnamespace) {
5493	case EXTATTR_NAMESPACE_USER:
5494#if 0
5495		prefix = "freebsd:";
5496		namespace = EXTATTR_NAMESPACE_USER_STRING;
5497		suffix = ":";
5498#else
5499		/*
5500		 * This is the default namespace by which we can access all
5501		 * attributes created on Solaris.
5502		 */
5503		prefix = namespace = suffix = "";
5504#endif
5505		break;
5506	case EXTATTR_NAMESPACE_SYSTEM:
5507		prefix = "freebsd:";
5508		namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
5509		suffix = ":";
5510		break;
5511	case EXTATTR_NAMESPACE_EMPTY:
5512	default:
5513		return (EINVAL);
5514	}
5515	if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
5516	    name) >= size) {
5517		return (ENAMETOOLONG);
5518	}
5519	return (0);
5520}
5521
5522/*
5523 * Vnode operating to retrieve a named extended attribute.
5524 */
5525static int
5526zfs_getextattr(struct vop_getextattr_args *ap)
5527/*
5528vop_getextattr {
5529	IN struct vnode *a_vp;
5530	IN int a_attrnamespace;
5531	IN const char *a_name;
5532	INOUT struct uio *a_uio;
5533	OUT size_t *a_size;
5534	IN struct ucred *a_cred;
5535	IN struct thread *a_td;
5536};
5537*/
5538{
5539	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5540	struct thread *td = ap->a_td;
5541	struct nameidata nd;
5542	char attrname[255];
5543	struct vattr va;
5544	vnode_t *xvp = NULL, *vp;
5545	int error, flags;
5546
5547	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5548	    ap->a_cred, ap->a_td, VREAD);
5549	if (error != 0)
5550		return (error);
5551
5552	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5553	    sizeof(attrname));
5554	if (error != 0)
5555		return (error);
5556
5557	ZFS_ENTER(zfsvfs);
5558
5559	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5560	    LOOKUP_XATTR, B_FALSE);
5561	if (error != 0) {
5562		ZFS_EXIT(zfsvfs);
5563		return (error);
5564	}
5565
5566	flags = FREAD;
5567	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
5568	    xvp, td);
5569	error = vn_open_cred(&nd, &flags, VN_OPEN_INVFS, 0, ap->a_cred, NULL);
5570	vp = nd.ni_vp;
5571	NDFREE(&nd, NDF_ONLY_PNBUF);
5572	if (error != 0) {
5573		ZFS_EXIT(zfsvfs);
5574		if (error == ENOENT)
5575			error = ENOATTR;
5576		return (error);
5577	}
5578
5579	if (ap->a_size != NULL) {
5580		error = VOP_GETATTR(vp, &va, ap->a_cred);
5581		if (error == 0)
5582			*ap->a_size = (size_t)va.va_size;
5583	} else if (ap->a_uio != NULL)
5584		error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred);
5585
5586	VOP_UNLOCK(vp, 0);
5587	vn_close(vp, flags, ap->a_cred, td);
5588	ZFS_EXIT(zfsvfs);
5589
5590	return (error);
5591}
5592
5593/*
5594 * Vnode operation to remove a named attribute.
5595 */
5596int
5597zfs_deleteextattr(struct vop_deleteextattr_args *ap)
5598/*
5599vop_deleteextattr {
5600	IN struct vnode *a_vp;
5601	IN int a_attrnamespace;
5602	IN const char *a_name;
5603	IN struct ucred *a_cred;
5604	IN struct thread *a_td;
5605};
5606*/
5607{
5608	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5609	struct thread *td = ap->a_td;
5610	struct nameidata nd;
5611	char attrname[255];
5612	struct vattr va;
5613	vnode_t *xvp = NULL, *vp;
5614	int error, flags;
5615
5616	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5617	    ap->a_cred, ap->a_td, VWRITE);
5618	if (error != 0)
5619		return (error);
5620
5621	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5622	    sizeof(attrname));
5623	if (error != 0)
5624		return (error);
5625
5626	ZFS_ENTER(zfsvfs);
5627
5628	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5629	    LOOKUP_XATTR, B_FALSE);
5630	if (error != 0) {
5631		ZFS_EXIT(zfsvfs);
5632		return (error);
5633	}
5634
5635	NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
5636	    UIO_SYSSPACE, attrname, xvp, td);
5637	error = namei(&nd);
5638	vp = nd.ni_vp;
5639	if (error != 0) {
5640		ZFS_EXIT(zfsvfs);
5641		NDFREE(&nd, NDF_ONLY_PNBUF);
5642		if (error == ENOENT)
5643			error = ENOATTR;
5644		return (error);
5645	}
5646
5647	error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
5648	NDFREE(&nd, NDF_ONLY_PNBUF);
5649
5650	vput(nd.ni_dvp);
5651	if (vp == nd.ni_dvp)
5652		vrele(vp);
5653	else
5654		vput(vp);
5655	ZFS_EXIT(zfsvfs);
5656
5657	return (error);
5658}
5659
5660/*
5661 * Vnode operation to set a named attribute.
5662 */
5663static int
5664zfs_setextattr(struct vop_setextattr_args *ap)
5665/*
5666vop_setextattr {
5667	IN struct vnode *a_vp;
5668	IN int a_attrnamespace;
5669	IN const char *a_name;
5670	INOUT struct uio *a_uio;
5671	IN struct ucred *a_cred;
5672	IN struct thread *a_td;
5673};
5674*/
5675{
5676	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5677	struct thread *td = ap->a_td;
5678	struct nameidata nd;
5679	char attrname[255];
5680	struct vattr va;
5681	vnode_t *xvp = NULL, *vp;
5682	int error, flags;
5683
5684	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5685	    ap->a_cred, ap->a_td, VWRITE);
5686	if (error != 0)
5687		return (error);
5688
5689	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
5690	    sizeof(attrname));
5691	if (error != 0)
5692		return (error);
5693
5694	ZFS_ENTER(zfsvfs);
5695
5696	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5697	    LOOKUP_XATTR | CREATE_XATTR_DIR, B_FALSE);
5698	if (error != 0) {
5699		ZFS_EXIT(zfsvfs);
5700		return (error);
5701	}
5702
5703	flags = FFLAGS(O_WRONLY | O_CREAT);
5704	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
5705	    xvp, td);
5706	error = vn_open_cred(&nd, &flags, 0600, VN_OPEN_INVFS, ap->a_cred,
5707	    NULL);
5708	vp = nd.ni_vp;
5709	NDFREE(&nd, NDF_ONLY_PNBUF);
5710	if (error != 0) {
5711		ZFS_EXIT(zfsvfs);
5712		return (error);
5713	}
5714
5715	VATTR_NULL(&va);
5716	va.va_size = 0;
5717	error = VOP_SETATTR(vp, &va, ap->a_cred);
5718	if (error == 0)
5719		VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred);
5720
5721	VOP_UNLOCK(vp, 0);
5722	vn_close(vp, flags, ap->a_cred, td);
5723	ZFS_EXIT(zfsvfs);
5724
5725	return (error);
5726}
5727
5728/*
5729 * Vnode operation to retrieve extended attributes on a vnode.
5730 */
5731static int
5732zfs_listextattr(struct vop_listextattr_args *ap)
5733/*
5734vop_listextattr {
5735	IN struct vnode *a_vp;
5736	IN int a_attrnamespace;
5737	INOUT struct uio *a_uio;
5738	OUT size_t *a_size;
5739	IN struct ucred *a_cred;
5740	IN struct thread *a_td;
5741};
5742*/
5743{
5744	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
5745	struct thread *td = ap->a_td;
5746	struct nameidata nd;
5747	char attrprefix[16];
5748	u_char dirbuf[sizeof(struct dirent)];
5749	struct dirent *dp;
5750	struct iovec aiov;
5751	struct uio auio, *uio = ap->a_uio;
5752	size_t *sizep = ap->a_size;
5753	size_t plen;
5754	vnode_t *xvp = NULL, *vp;
5755	int done, error, eof, pos;
5756
5757	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
5758	    ap->a_cred, ap->a_td, VREAD);
5759	if (error != 0)
5760		return (error);
5761
5762	error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
5763	    sizeof(attrprefix));
5764	if (error != 0)
5765		return (error);
5766	plen = strlen(attrprefix);
5767
5768	ZFS_ENTER(zfsvfs);
5769
5770	if (sizep != NULL)
5771		*sizep = 0;
5772
5773	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
5774	    LOOKUP_XATTR, B_FALSE);
5775	if (error != 0) {
5776		ZFS_EXIT(zfsvfs);
5777		/*
5778		 * ENOATTR means that the EA directory does not yet exist,
5779		 * i.e. there are no extended attributes there.
5780		 */
5781		if (error == ENOATTR)
5782			error = 0;
5783		return (error);
5784	}
5785
5786	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
5787	    UIO_SYSSPACE, ".", xvp, td);
5788	error = namei(&nd);
5789	vp = nd.ni_vp;
5790	NDFREE(&nd, NDF_ONLY_PNBUF);
5791	if (error != 0) {
5792		ZFS_EXIT(zfsvfs);
5793		return (error);
5794	}
5795
5796	auio.uio_iov = &aiov;
5797	auio.uio_iovcnt = 1;
5798	auio.uio_segflg = UIO_SYSSPACE;
5799	auio.uio_td = td;
5800	auio.uio_rw = UIO_READ;
5801	auio.uio_offset = 0;
5802
5803	do {
5804		u_char nlen;
5805
5806		aiov.iov_base = (void *)dirbuf;
5807		aiov.iov_len = sizeof(dirbuf);
5808		auio.uio_resid = sizeof(dirbuf);
5809		error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
5810		done = sizeof(dirbuf) - auio.uio_resid;
5811		if (error != 0)
5812			break;
5813		for (pos = 0; pos < done;) {
5814			dp = (struct dirent *)(dirbuf + pos);
5815			pos += dp->d_reclen;
5816			/*
5817			 * XXX: Temporarily we also accept DT_UNKNOWN, as this
5818			 * is what we get when attribute was created on Solaris.
5819			 */
5820			if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
5821				continue;
5822			if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0)
5823				continue;
5824			else if (strncmp(dp->d_name, attrprefix, plen) != 0)
5825				continue;
5826			nlen = dp->d_namlen - plen;
5827			if (sizep != NULL)
5828				*sizep += 1 + nlen;
5829			else if (uio != NULL) {
5830				/*
5831				 * Format of extattr name entry is one byte for
5832				 * length and the rest for name.
5833				 */
5834				error = uiomove(&nlen, 1, uio->uio_rw, uio);
5835				if (error == 0) {
5836					error = uiomove(dp->d_name + plen, nlen,
5837					    uio->uio_rw, uio);
5838				}
5839				if (error != 0)
5840					break;
5841			}
5842		}
5843	} while (!eof && error == 0);
5844
5845	vput(vp);
5846	ZFS_EXIT(zfsvfs);
5847
5848	return (error);
5849}
5850
5851int
5852zfs_freebsd_getacl(ap)
5853	struct vop_getacl_args /* {
5854		struct vnode *vp;
5855		acl_type_t type;
5856		struct acl *aclp;
5857		struct ucred *cred;
5858		struct thread *td;
5859	} */ *ap;
5860{
5861	int		error;
5862	vsecattr_t      vsecattr;
5863
5864	if (ap->a_type != ACL_TYPE_NFS4)
5865		return (EINVAL);
5866
5867	vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
5868	if (error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL))
5869		return (error);
5870
5871	error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt);
5872	if (vsecattr.vsa_aclentp != NULL)
5873		kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);
5874
5875	return (error);
5876}
5877
5878int
5879zfs_freebsd_setacl(ap)
5880	struct vop_setacl_args /* {
5881		struct vnode *vp;
5882		acl_type_t type;
5883		struct acl *aclp;
5884		struct ucred *cred;
5885		struct thread *td;
5886	} */ *ap;
5887{
5888	int		error;
5889	vsecattr_t      vsecattr;
5890	int		aclbsize;	/* size of acl list in bytes */
5891	aclent_t	*aaclp;
5892
5893	if (ap->a_type != ACL_TYPE_NFS4)
5894		return (EINVAL);
5895
5896	if (ap->a_aclp == NULL)
5897		return (EINVAL);
5898
5899	if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
5900		return (EINVAL);
5901
5902	/*
5903	 * With NFSv4 ACLs, chmod(2) may need to add additional entries,
5904	 * splitting every entry into two and appending "canonical six"
5905	 * entries at the end.  Don't allow for setting an ACL that would
5906	 * cause chmod(2) to run out of ACL entries.
5907	 */
5908	if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
5909		return (ENOSPC);
5910
5911	error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR);
5912	if (error != 0)
5913		return (error);
5914
5915	vsecattr.vsa_mask = VSA_ACE;
5916	aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t);
5917	vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
5918	aaclp = vsecattr.vsa_aclentp;
5919	vsecattr.vsa_aclentsz = aclbsize;
5920
5921	aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
5922	error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL);
5923	kmem_free(aaclp, aclbsize);
5924
5925	return (error);
5926}
5927
5928int
5929zfs_freebsd_aclcheck(ap)
5930	struct vop_aclcheck_args /* {
5931		struct vnode *vp;
5932		acl_type_t type;
5933		struct acl *aclp;
5934		struct ucred *cred;
5935		struct thread *td;
5936	} */ *ap;
5937{
5938
5939	return (EOPNOTSUPP);
5940}
5941
5942static int
5943zfs_vptocnp(struct vop_vptocnp_args *ap)
5944{
5945	vnode_t *covered_vp;
5946	vnode_t *vp = ap->a_vp;;
5947	zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
5948	znode_t *zp = VTOZ(vp);
5949	int ltype;
5950	int error;
5951
5952	ZFS_ENTER(zfsvfs);
5953	ZFS_VERIFY_ZP(zp);
5954
5955	/*
5956	 * If we are a snapshot mounted under .zfs, run the operation
5957	 * on the covered vnode.
5958	 */
5959	if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) {
5960		char name[MAXNAMLEN + 1];
5961		znode_t *dzp;
5962		size_t len;
5963
5964		error = zfs_znode_parent_and_name(zp, &dzp, name);
5965		if (error == 0) {
5966			len = strlen(name);
5967			if (*ap->a_buflen < len)
5968				error = SET_ERROR(ENOMEM);
5969		}
5970		if (error == 0) {
5971			*ap->a_buflen -= len;
5972			bcopy(name, ap->a_buf + *ap->a_buflen, len);
5973			*ap->a_vpp = ZTOV(dzp);
5974		}
5975		ZFS_EXIT(zfsvfs);
5976		return (error);
5977	}
5978	ZFS_EXIT(zfsvfs);
5979
5980	covered_vp = vp->v_mount->mnt_vnodecovered;
5981	vhold(covered_vp);
5982	ltype = VOP_ISLOCKED(vp);
5983	VOP_UNLOCK(vp, 0);
5984	error = vget(covered_vp, LK_SHARED | LK_VNHELD, curthread);
5985	if (error == 0) {
5986		error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred,
5987		    ap->a_buf, ap->a_buflen);
5988		vput(covered_vp);
5989	}
5990	vn_lock(vp, ltype | LK_RETRY);
5991	if ((vp->v_iflag & VI_DOOMED) != 0)
5992		error = SET_ERROR(ENOENT);
5993	return (error);
5994}
5995
5996#ifdef DIAGNOSTIC
5997static int
5998zfs_lock(ap)
5999	struct vop_lock1_args /* {
6000		struct vnode *a_vp;
6001		int a_flags;
6002		char *file;
6003		int line;
6004	} */ *ap;
6005{
6006	vnode_t *vp;
6007	znode_t *zp;
6008	int err;
6009
6010	err = vop_stdlock(ap);
6011	if (err == 0 && (ap->a_flags & LK_NOWAIT) == 0) {
6012		vp = ap->a_vp;
6013		zp = vp->v_data;
6014		if (vp->v_mount != NULL && (vp->v_iflag & VI_DOOMED) == 0 &&
6015		    zp != NULL && (zp->z_pflags & ZFS_XATTR) == 0)
6016			VERIFY(!RRM_LOCK_HELD(&zp->z_zfsvfs->z_teardown_lock));
6017	}
6018	return (err);
6019}
6020#endif
6021
6022struct vop_vector zfs_vnodeops;
6023struct vop_vector zfs_fifoops;
6024struct vop_vector zfs_shareops;
6025
6026struct vop_vector zfs_vnodeops = {
6027	.vop_default =		&default_vnodeops,
6028	.vop_inactive =		zfs_freebsd_inactive,
6029	.vop_reclaim =		zfs_freebsd_reclaim,
6030	.vop_access =		zfs_freebsd_access,
6031	.vop_allocate =		VOP_EINVAL,
6032	.vop_lookup =		zfs_cache_lookup,
6033	.vop_cachedlookup =	zfs_freebsd_cachedlookup,
6034	.vop_getattr =		zfs_freebsd_getattr,
6035	.vop_setattr =		zfs_freebsd_setattr,
6036	.vop_create =		zfs_freebsd_create,
6037	.vop_mknod =		zfs_freebsd_create,
6038	.vop_mkdir =		zfs_freebsd_mkdir,
6039	.vop_readdir =		zfs_freebsd_readdir,
6040	.vop_fsync =		zfs_freebsd_fsync,
6041	.vop_open =		zfs_freebsd_open,
6042	.vop_close =		zfs_freebsd_close,
6043	.vop_rmdir =		zfs_freebsd_rmdir,
6044	.vop_ioctl =		zfs_freebsd_ioctl,
6045	.vop_link =		zfs_freebsd_link,
6046	.vop_symlink =		zfs_freebsd_symlink,
6047	.vop_readlink =		zfs_freebsd_readlink,
6048	.vop_read =		zfs_freebsd_read,
6049	.vop_write =		zfs_freebsd_write,
6050	.vop_remove =		zfs_freebsd_remove,
6051	.vop_rename =		zfs_freebsd_rename,
6052	.vop_pathconf =		zfs_freebsd_pathconf,
6053	.vop_bmap =		zfs_freebsd_bmap,
6054	.vop_fid =		zfs_freebsd_fid,
6055	.vop_getextattr =	zfs_getextattr,
6056	.vop_deleteextattr =	zfs_deleteextattr,
6057	.vop_setextattr =	zfs_setextattr,
6058	.vop_listextattr =	zfs_listextattr,
6059	.vop_getacl =		zfs_freebsd_getacl,
6060	.vop_setacl =		zfs_freebsd_setacl,
6061	.vop_aclcheck =		zfs_freebsd_aclcheck,
6062	.vop_getpages =		zfs_freebsd_getpages,
6063	.vop_putpages =		zfs_freebsd_putpages,
6064	.vop_vptocnp =		zfs_vptocnp,
6065#ifdef DIAGNOSTIC
6066	.vop_lock1 =		zfs_lock,
6067#endif
6068};
6069
6070struct vop_vector zfs_fifoops = {
6071	.vop_default =		&fifo_specops,
6072	.vop_fsync =		zfs_freebsd_fsync,
6073	.vop_access =		zfs_freebsd_access,
6074	.vop_getattr =		zfs_freebsd_getattr,
6075	.vop_inactive =		zfs_freebsd_inactive,
6076	.vop_read =		VOP_PANIC,
6077	.vop_reclaim =		zfs_freebsd_reclaim,
6078	.vop_setattr =		zfs_freebsd_setattr,
6079	.vop_write =		VOP_PANIC,
6080	.vop_pathconf = 	zfs_freebsd_pathconf,
6081	.vop_fid =		zfs_freebsd_fid,
6082	.vop_getacl =		zfs_freebsd_getacl,
6083	.vop_setacl =		zfs_freebsd_setacl,
6084	.vop_aclcheck =		zfs_freebsd_aclcheck,
6085};
6086
6087/*
6088 * special share hidden files vnode operations template
6089 */
6090struct vop_vector zfs_shareops = {
6091	.vop_default =		&default_vnodeops,
6092	.vop_access =		zfs_freebsd_access,
6093	.vop_inactive =		zfs_freebsd_inactive,
6094	.vop_reclaim =		zfs_freebsd_reclaim,
6095	.vop_fid =		zfs_freebsd_fid,
6096	.vop_pathconf =		zfs_freebsd_pathconf,
6097};
6098