zfs_vnops.c revision 196299
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26/* Portions Copyright 2007 Jeremy Teo */
27
28#include <sys/types.h>
29#include <sys/param.h>
30#include <sys/time.h>
31#include <sys/systm.h>
32#include <sys/sysmacros.h>
33#include <sys/resource.h>
34#include <sys/vfs.h>
35#include <sys/vnode.h>
36#include <sys/file.h>
37#include <sys/stat.h>
38#include <sys/kmem.h>
39#include <sys/taskq.h>
40#include <sys/uio.h>
41#include <sys/atomic.h>
42#include <sys/namei.h>
43#include <sys/mman.h>
44#include <sys/cmn_err.h>
45#include <sys/errno.h>
46#include <sys/unistd.h>
47#include <sys/zfs_dir.h>
48#include <sys/zfs_ioctl.h>
49#include <sys/fs/zfs.h>
50#include <sys/dmu.h>
51#include <sys/spa.h>
52#include <sys/txg.h>
53#include <sys/dbuf.h>
54#include <sys/zap.h>
55#include <sys/dirent.h>
56#include <sys/policy.h>
57#include <sys/sunddi.h>
58#include <sys/filio.h>
59#include <sys/zfs_ctldir.h>
60#include <sys/zfs_fuid.h>
61#include <sys/dnlc.h>
62#include <sys/zfs_rlock.h>
63#include <sys/extdirent.h>
64#include <sys/kidmap.h>
65#include <sys/bio.h>
66#include <sys/buf.h>
67#include <sys/sf_buf.h>
68#include <sys/sched.h>
69#include <sys/acl.h>
70
71/*
72 * Programming rules.
73 *
74 * Each vnode op performs some logical unit of work.  To do this, the ZPL must
75 * properly lock its in-core state, create a DMU transaction, do the work,
76 * record this work in the intent log (ZIL), commit the DMU transaction,
77 * and wait for the intent log to commit if it is a synchronous operation.
78 * Moreover, the vnode ops must work in both normal and log replay context.
79 * The ordering of events is important to avoid deadlocks and references
80 * to freed memory.  The example below illustrates the following Big Rules:
81 *
82 *  (1) A check must be made in each zfs thread for a mounted file system.
83 *	This is done, avoiding races, by using ZFS_ENTER(zfsvfs).
84 *      A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
85 *      must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
86 *      can return EIO from the calling function.
87 *
88 *  (2)	VN_RELE() should always be the last thing except for zil_commit()
89 *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
90 *	First, if it's the last reference, the vnode/znode
91 *	can be freed, so the zp may point to freed memory.  Second, the last
92 *	reference will call zfs_zinactive(), which may induce a lot of work --
93 *	pushing cached pages (which acquires range locks) and syncing out
94 *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
95 *	which could deadlock the system if you were already holding one.
96 *	If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
97 *
98 *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
99 *	as they can span dmu_tx_assign() calls.
100 *
101 *  (4)	Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign().
102 *	In normal operation, this will be TXG_NOWAIT.  During ZIL replay,
103 *	it will be a specific txg.  Either way, dmu_tx_assign() never blocks.
104 *	This is critical because we don't want to block while holding locks.
105 *	Note, in particular, that if a lock is sometimes acquired before
106 *	the tx assigns, and sometimes after (e.g. z_lock), then failing to
107 *	use a non-blocking assign can deadlock the system.  The scenario:
108 *
109 *	Thread A has grabbed a lock before calling dmu_tx_assign().
110 *	Thread B is in an already-assigned tx, and blocks for this lock.
111 *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
112 *	forever, because the previous txg can't quiesce until B's tx commits.
113 *
114 *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
115 *	then drop all locks, call dmu_tx_wait(), and try again.
116 *
117 *  (5)	If the operation succeeded, generate the intent log entry for it
118 *	before dropping locks.  This ensures that the ordering of events
119 *	in the intent log matches the order in which they actually occurred.
120 *
121 *  (6)	At the end of each vnode op, the DMU tx must always commit,
122 *	regardless of whether there were any errors.
123 *
124 *  (7)	After dropping all locks, invoke zil_commit(zilog, seq, foid)
125 *	to ensure that synchronous semantics are provided when necessary.
126 *
127 * In general, this is how things should be ordered in each vnode op:
128 *
129 *	ZFS_ENTER(zfsvfs);		// exit if unmounted
130 * top:
131 *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may VN_HOLD())
132 *	rw_enter(...);			// grab any other locks you need
133 *	tx = dmu_tx_create(...);	// get DMU tx
134 *	dmu_tx_hold_*();		// hold each object you might modify
135 *	error = dmu_tx_assign(tx, zfsvfs->z_assign);	// try to assign
136 *	if (error) {
137 *		rw_exit(...);		// drop locks
138 *		zfs_dirent_unlock(dl);	// unlock directory entry
139 *		VN_RELE(...);		// release held vnodes
140 *		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
141 *			dmu_tx_wait(tx);
142 *			dmu_tx_abort(tx);
143 *			goto top;
144 *		}
145 *		dmu_tx_abort(tx);	// abort DMU tx
146 *		ZFS_EXIT(zfsvfs);	// finished in zfs
147 *		return (error);		// really out of space
148 *	}
149 *	error = do_real_work();		// do whatever this VOP does
150 *	if (error == 0)
151 *		zfs_log_*(...);		// on success, make ZIL entry
152 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
153 *	rw_exit(...);			// drop locks
154 *	zfs_dirent_unlock(dl);		// unlock directory entry
155 *	VN_RELE(...);			// release held vnodes
156 *	zil_commit(zilog, seq, foid);	// synchronous when necessary
157 *	ZFS_EXIT(zfsvfs);		// finished in zfs
158 *	return (error);			// done, report error
159 */
160
161/* ARGSUSED */
162static int
163zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
164{
165	znode_t	*zp = VTOZ(*vpp);
166
167	if ((flag & FWRITE) && (zp->z_phys->zp_flags & ZFS_APPENDONLY) &&
168	    ((flag & FAPPEND) == 0)) {
169		return (EPERM);
170	}
171
172	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
173	    ZTOV(zp)->v_type == VREG &&
174	    !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
175	    zp->z_phys->zp_size > 0)
176		if (fs_vscan(*vpp, cr, 0) != 0)
177			return (EACCES);
178
179	/* Keep a count of the synchronous opens in the znode */
180	if (flag & (FSYNC | FDSYNC))
181		atomic_inc_32(&zp->z_sync_cnt);
182
183	return (0);
184}
185
186/* ARGSUSED */
187static int
188zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
189    caller_context_t *ct)
190{
191	znode_t	*zp = VTOZ(vp);
192
193	/* Decrement the synchronous opens in the znode */
194	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
195		atomic_dec_32(&zp->z_sync_cnt);
196
197	/*
198	 * Clean up any locks held by this process on the vp.
199	 */
200	cleanlocks(vp, ddi_get_pid(), 0);
201	cleanshares(vp, ddi_get_pid());
202
203	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
204	    ZTOV(zp)->v_type == VREG &&
205	    !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) &&
206	    zp->z_phys->zp_size > 0)
207		VERIFY(fs_vscan(vp, cr, 1) == 0);
208
209	return (0);
210}
211
212/*
213 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
214 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
215 */
216static int
217zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
218{
219	znode_t	*zp = VTOZ(vp);
220	uint64_t noff = (uint64_t)*off; /* new offset */
221	uint64_t file_sz;
222	int error;
223	boolean_t hole;
224
225	file_sz = zp->z_phys->zp_size;
226	if (noff >= file_sz)  {
227		return (ENXIO);
228	}
229
230	if (cmd == _FIO_SEEK_HOLE)
231		hole = B_TRUE;
232	else
233		hole = B_FALSE;
234
235	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
236
237	/* end of file? */
238	if ((error == ESRCH) || (noff > file_sz)) {
239		/*
240		 * Handle the virtual hole at the end of file.
241		 */
242		if (hole) {
243			*off = file_sz;
244			return (0);
245		}
246		return (ENXIO);
247	}
248
249	if (noff < *off)
250		return (error);
251	*off = noff;
252	return (error);
253}
254
255/* ARGSUSED */
256static int
257zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
258    int *rvalp, caller_context_t *ct)
259{
260	offset_t off;
261	int error;
262	zfsvfs_t *zfsvfs;
263	znode_t *zp;
264
265	switch (com) {
266	case _FIOFFS:
267		return (0);
268
269		/*
270		 * The following two ioctls are used by bfu.  Faking them out
271		 * is necessary to avoid bfu errors.
272		 */
273	case _FIOGDIO:
274	case _FIOSDIO:
275		return (0);
276
277	case _FIO_SEEK_DATA:
278	case _FIO_SEEK_HOLE:
279		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
280			return (EFAULT);
281
282		zp = VTOZ(vp);
283		zfsvfs = zp->z_zfsvfs;
284		ZFS_ENTER(zfsvfs);
285		ZFS_VERIFY_ZP(zp);
286
287		/* offset parameter is in/out */
288		error = zfs_holey(vp, com, &off);
289		ZFS_EXIT(zfsvfs);
290		if (error)
291			return (error);
292		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
293			return (EFAULT);
294		return (0);
295	}
296	return (ENOTTY);
297}
298
299/*
300 * When a file is memory mapped, we must keep the IO data synchronized
301 * between the DMU cache and the memory mapped pages.  What this means:
302 *
303 * On Write:	If we find a memory mapped page, we write to *both*
304 *		the page and the dmu buffer.
305 *
306 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
307 *	the file is memory mapped.
308 */
309static int
310mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx)
311{
312	znode_t *zp = VTOZ(vp);
313	objset_t *os = zp->z_zfsvfs->z_os;
314	vm_object_t obj;
315	vm_page_t m;
316	struct sf_buf *sf;
317	int64_t start, off;
318	int len = nbytes;
319	int error = 0;
320	uint64_t dirbytes;
321
322	ASSERT(vp->v_mount != NULL);
323	obj = vp->v_object;
324	ASSERT(obj != NULL);
325
326	start = uio->uio_loffset;
327	off = start & PAGEOFFSET;
328	dirbytes = 0;
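	/*
	 * dirbytes accumulates the part of the request that has no valid
	 * resident page in the VM object; those bytes are pushed to the
	 * DMU in batches via dmu_write_uio(), either just before copying
	 * into a cached page or once more after the loop.
	 */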
329	VM_OBJECT_LOCK(obj);
330	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
331		uint64_t bytes = MIN(PAGESIZE - off, len);
332		uint64_t fsize;
333
334again:
335		if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
336		    vm_page_is_valid(m, (vm_offset_t)off, bytes)) {
337			uint64_t woff;
338			caddr_t va;
339
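			/*
			 * A busy page means vm_page_sleep_if_busy() slept and
			 * dropped the object lock, so re-look the page up.
			 */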
340			if (vm_page_sleep_if_busy(m, FALSE, "zfsmwb"))
341				goto again;
342			fsize = obj->un_pager.vnp.vnp_size;
343			vm_page_busy(m);
344			vm_page_lock_queues();
345			vm_page_undirty(m);
346			vm_page_unlock_queues();
347			VM_OBJECT_UNLOCK(obj);
348			if (dirbytes > 0) {
349				error = dmu_write_uio(os, zp->z_id, uio,
350				    dirbytes, tx);
351				dirbytes = 0;
352			}
353			if (error == 0) {
354				sched_pin();
355				sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
356				va = (caddr_t)sf_buf_kva(sf);
357				woff = uio->uio_loffset - off;
358				error = uiomove(va + off, bytes, UIO_WRITE, uio);
359				/*
360				 * The uiomove() above could have been partially
361				 * successful, that's why we call dmu_write()
362				 * below unconditionally. The page was marked
363				 * non-dirty above and we would lose the changes
364				 * without doing so. If the uiomove() failed
365				 * entirely, well, we just write what we got
366				 * before one more time.
367				 */
368				dmu_write(os, zp->z_id, woff,
369				    MIN(PAGESIZE, fsize - woff), va, tx);
370				sf_buf_free(sf);
371				sched_unpin();
372			}
373			VM_OBJECT_LOCK(obj);
374			vm_page_wakeup(m);
375		} else {
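			/*
			 * This page is written straight to the DMU below, so
			 * discard any stale copy the VM may still hold in its
			 * cache queue for this index.
			 */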
376			if (__predict_false(obj->cache != NULL)) {
377				vm_page_cache_free(obj, OFF_TO_IDX(start),
378				    OFF_TO_IDX(start) + 1);
379			}
380			dirbytes += bytes;
381		}
382		len -= bytes;
383		off = 0;
384		if (error)
385			break;
386	}
387	VM_OBJECT_UNLOCK(obj);
388	if (error == 0 && dirbytes > 0)
389		error = dmu_write_uio(os, zp->z_id, uio, dirbytes, tx);
390	return (error);
391}
392
393/*
394 * When a file is memory mapped, we must keep the IO data synchronized
395 * between the DMU cache and the memory mapped pages.  What this means:
396 *
397 * On Read:	We "read" preferentially from memory mapped pages,
398 *		else we default from the dmu buffer.
399 *
400 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
401 *	the file is memory mapped.
402 */
403static int
404mappedread(vnode_t *vp, int nbytes, uio_t *uio)
405{
406	znode_t *zp = VTOZ(vp);
407	objset_t *os = zp->z_zfsvfs->z_os;
408	vm_object_t obj;
409	vm_page_t m;
410	struct sf_buf *sf;
411	int64_t start, off;
412	caddr_t va;
413	int len = nbytes;
414	int error = 0;
415	uint64_t dirbytes;
416
417	ASSERT(vp->v_mount != NULL);
418	obj = vp->v_object;
419	ASSERT(obj != NULL);
420
421	start = uio->uio_loffset;
422	off = start & PAGEOFFSET;
423	dirbytes = 0;
424	VM_OBJECT_LOCK(obj);
425	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
426		uint64_t bytes = MIN(PAGESIZE - off, len);
427
428again:
429		if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
430		    vm_page_is_valid(m, (vm_offset_t)off, bytes)) {
431			if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb"))
432				goto again;
433			vm_page_busy(m);
434			VM_OBJECT_UNLOCK(obj);
435			if (dirbytes > 0) {
436				error = dmu_read_uio(os, zp->z_id, uio,
437				    dirbytes);
438				dirbytes = 0;
439			}
440			if (error == 0) {
441				sched_pin();
442				sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
443				va = (caddr_t)sf_buf_kva(sf);
444				error = uiomove(va + off, bytes, UIO_READ, uio);
445				sf_buf_free(sf);
446				sched_unpin();
447			}
448			VM_OBJECT_LOCK(obj);
449			vm_page_wakeup(m);
450		} else if (m != NULL && uio->uio_segflg == UIO_NOCOPY) {
451			/*
452			 * The code below is here to make sendfile(2) work
453			 * correctly with ZFS. As pointed out by ups@
454			 * sendfile(2) should be changed to use VOP_GETPAGES(),
455			 * but that would pessimize the performance of sendfile/UFS;
456			 * that's why I handle this special case in the ZFS code.
457			 */
458			if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb"))
459				goto again;
460			vm_page_busy(m);
461			VM_OBJECT_UNLOCK(obj);
462			if (dirbytes > 0) {
463				error = dmu_read_uio(os, zp->z_id, uio,
464				    dirbytes);
465				dirbytes = 0;
466			}
467			if (error == 0) {
468				sched_pin();
469				sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
470				va = (caddr_t)sf_buf_kva(sf);
471				error = dmu_read(os, zp->z_id, start + off,
472				    bytes, (void *)(va + off));
473				sf_buf_free(sf);
474				sched_unpin();
475			}
476			VM_OBJECT_LOCK(obj);
477			vm_page_wakeup(m);
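			/*
			 * dmu_read() copied directly into the page above (no
			 * uiomove() for UIO_NOCOPY), so account for the bytes
			 * consumed by hand.
			 */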
478			if (error == 0)
479				uio->uio_resid -= bytes;
480		} else {
481			dirbytes += bytes;
482		}
483		len -= bytes;
484		off = 0;
485		if (error)
486			break;
487	}
488	VM_OBJECT_UNLOCK(obj);
489	if (error == 0 && dirbytes > 0)
490		error = dmu_read_uio(os, zp->z_id, uio, dirbytes);
491	return (error);
492}
493
494offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
495
496/*
497 * Read bytes from specified file into supplied buffer.
498 *
499 *	IN:	vp	- vnode of file to be read from.
500 *		uio	- structure supplying read location, range info,
501 *			  and return buffer.
502 *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
503 *		cr	- credentials of caller.
504 *		ct	- caller context
505 *
506 *	OUT:	uio	- updated offset and range, buffer filled.
507 *
508 *	RETURN:	0 if success
509 *		error code if failure
510 *
511 * Side Effects:
512 *	vp - atime updated if byte count > 0
513 */
514/* ARGSUSED */
515static int
516zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
517{
518	znode_t		*zp = VTOZ(vp);
519	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
520	objset_t	*os;
521	ssize_t		n, nbytes;
522	int		error;
523	rl_t		*rl;
524
525	ZFS_ENTER(zfsvfs);
526	ZFS_VERIFY_ZP(zp);
527	os = zfsvfs->z_os;
528
529	if (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) {
530		ZFS_EXIT(zfsvfs);
531		return (EACCES);
532	}
533
534	/*
535	 * Validate file offset
536	 */
537	if (uio->uio_loffset < (offset_t)0) {
538		ZFS_EXIT(zfsvfs);
539		return (EINVAL);
540	}
541
542	/*
543	 * Fasttrack empty reads
544	 */
545	if (uio->uio_resid == 0) {
546		ZFS_EXIT(zfsvfs);
547		return (0);
548	}
549
550	/*
551	 * Check for mandatory locks
552	 */
553	if (MANDMODE((mode_t)zp->z_phys->zp_mode)) {
554		if (error = chklock(vp, FREAD,
555		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
556			ZFS_EXIT(zfsvfs);
557			return (error);
558		}
559	}
560
561	/*
562	 * If we're in FRSYNC mode, sync out this znode before reading it.
563	 */
564	if (ioflag & FRSYNC)
565		zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
566
567	/*
568	 * Lock the range against changes.
569	 */
570	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
571
572	/*
573	 * If we are reading past end-of-file we can skip
574	 * to the end; but we might still need to set atime.
575	 */
576	if (uio->uio_loffset >= zp->z_phys->zp_size) {
577		error = 0;
578		goto out;
579	}
580
581	ASSERT(uio->uio_loffset < zp->z_phys->zp_size);
582	n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset);
583
584	while (n > 0) {
585		nbytes = MIN(n, zfs_read_chunk_size -
586		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
587
588		if (vn_has_cached_data(vp))
589			error = mappedread(vp, nbytes, uio);
590		else
591			error = dmu_read_uio(os, zp->z_id, uio, nbytes);
592		if (error) {
593			/* convert checksum errors into IO errors */
594			if (error == ECKSUM)
595				error = EIO;
596			break;
597		}
598
599		n -= nbytes;
600	}
601
602out:
603	zfs_range_unlock(rl);
604
605	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
606	ZFS_EXIT(zfsvfs);
607	return (error);
608}
609
610/*
611 * Fault in the pages of the first n bytes specified by the uio structure.
612 * 1 byte in each page is touched and the uio struct is unmodified.
613 * Any error will exit this routine as this is only a best
614 * attempt to get the pages resident. This is a copy of ufs_trans_touch().
615 */
616static void
617zfs_prefault_write(ssize_t n, struct uio *uio)
618{
619	struct iovec *iov;
620	ulong_t cnt, incr;
621	caddr_t p;
622
623	if (uio->uio_segflg != UIO_USERSPACE)
624		return;
625
626	iov = uio->uio_iov;
627
628	while (n) {
629		cnt = MIN(iov->iov_len, n);
630		if (cnt == 0) {
631			/* empty iov entry */
632			iov++;
633			continue;
634		}
635		n -= cnt;
636		/*
637		 * touch each page in this segment.
638		 */
639		p = iov->iov_base;
640		while (cnt) {
641			if (fubyte(p) == -1)
642				return;
643			incr = MIN(cnt, PAGESIZE);
644			p += incr;
645			cnt -= incr;
646		}
647		/*
648		 * touch the last byte in case it straddles a page.
649		 */
650		p--;
651		if (fubyte(p) == -1)
652			return;
653		iov++;
654	}
655}
656
657/*
658 * Write the bytes to a file.
659 *
660 *	IN:	vp	- vnode of file to be written to.
661 *		uio	- structure supplying write location, range info,
662 *			  and data buffer.
663 *		ioflag	- IO_APPEND flag set if in append mode.
664 *		cr	- credentials of caller.
665 *		ct	- caller context (NFS/CIFS fem monitor only)
666 *
667 *	OUT:	uio	- updated offset and range.
668 *
669 *	RETURN:	0 if success
670 *		error code if failure
671 *
672 * Timestamps:
673 *	vp - ctime|mtime updated if byte count > 0
674 */
675/* ARGSUSED */
676static int
677zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
678{
679	znode_t		*zp = VTOZ(vp);
680	rlim64_t	limit = MAXOFFSET_T;
681	ssize_t		start_resid = uio->uio_resid;
682	ssize_t		tx_bytes;
683	uint64_t	end_size;
684	dmu_tx_t	*tx;
685	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
686	zilog_t		*zilog;
687	offset_t	woff;
688	ssize_t		n, nbytes;
689	rl_t		*rl;
690	int		max_blksz = zfsvfs->z_max_blksz;
691	uint64_t	pflags;
692	int		error;
693
694	/*
695	 * Fasttrack empty write
696	 */
697	n = start_resid;
698	if (n == 0)
699		return (0);
700
701	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
702		limit = MAXOFFSET_T;
703
704	ZFS_ENTER(zfsvfs);
705	ZFS_VERIFY_ZP(zp);
706
707	/*
708	 * If immutable or not appending then return EPERM
709	 */
710	pflags = zp->z_phys->zp_flags;
711	if ((pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
712	    ((pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
713	    (uio->uio_loffset < zp->z_phys->zp_size))) {
714		ZFS_EXIT(zfsvfs);
715		return (EPERM);
716	}
717
718	zilog = zfsvfs->z_log;
719
720	/*
721	 * Pre-fault the pages to ensure slow (e.g. NFS) pages
722	 * don't hold up txg.
723	 */
724	zfs_prefault_write(n, uio);
725
726	/*
727	 * If in append mode, set the io offset pointer to eof.
728	 */
729	if (ioflag & IO_APPEND) {
730		/*
731		 * Range lock for a file append:
732		 * The value for the start of range will be determined by
733		 * zfs_range_lock() (to guarantee append semantics).
734		 * If this write will cause the block size to increase,
735		 * zfs_range_lock() will lock the entire file, so we must
736		 * later reduce the range after we grow the block size.
737		 */
738		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
739		if (rl->r_len == UINT64_MAX) {
740			/* overlocked, zp_size can't change */
741			woff = uio->uio_loffset = zp->z_phys->zp_size;
742		} else {
743			woff = uio->uio_loffset = rl->r_off;
744		}
745	} else {
746		woff = uio->uio_loffset;
747		/*
748		 * Validate file offset
749		 */
750		if (woff < 0) {
751			ZFS_EXIT(zfsvfs);
752			return (EINVAL);
753		}
754
755		/*
756		 * If we need to grow the block size then zfs_range_lock()
757		 * will lock a wider range than we request here.
758		 * Later after growing the block size we reduce the range.
759		 */
760		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
761	}
762
763	if (woff >= limit) {
764		zfs_range_unlock(rl);
765		ZFS_EXIT(zfsvfs);
766		return (EFBIG);
767	}
768
769	if ((woff + n) > limit || woff > (limit - n))
770		n = limit - woff;
771
772	/*
773	 * Check for mandatory locks
774	 */
775	if (MANDMODE((mode_t)zp->z_phys->zp_mode) &&
776	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
777		zfs_range_unlock(rl);
778		ZFS_EXIT(zfsvfs);
779		return (error);
780	}
781	end_size = MAX(zp->z_phys->zp_size, woff + n);
782
783	/*
784	 * Write the file in reasonable size chunks.  Each chunk is written
785	 * in a separate transaction; this keeps the intent log records small
786	 * and allows us to do more fine-grained space accounting.
787	 */
788	while (n > 0) {
789		/*
790		 * Start a transaction.
791		 */
792		woff = uio->uio_loffset;
793		tx = dmu_tx_create(zfsvfs->z_os);
794		dmu_tx_hold_bonus(tx, zp->z_id);
795		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
796		error = dmu_tx_assign(tx, zfsvfs->z_assign);
797		if (error) {
798			if (error == ERESTART &&
799			    zfsvfs->z_assign == TXG_NOWAIT) {
800				dmu_tx_wait(tx);
801				dmu_tx_abort(tx);
802				continue;
803			}
804			dmu_tx_abort(tx);
805			break;
806		}
807
808		/*
809		 * If zfs_range_lock() over-locked we grow the blocksize
810		 * and then reduce the lock range.  This will only happen
811		 * on the first iteration since zfs_range_reduce() will
812		 * shrink down r_len to the appropriate size.
813		 */
814		if (rl->r_len == UINT64_MAX) {
815			uint64_t new_blksz;
816
817			if (zp->z_blksz > max_blksz) {
818				ASSERT(!ISP2(zp->z_blksz));
819				new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
820			} else {
821				new_blksz = MIN(end_size, max_blksz);
822			}
823			zfs_grow_blocksize(zp, new_blksz, tx);
824			zfs_range_reduce(rl, woff, n);
825		}
826
827		/*
828		 * XXX - should we really limit each write to z_max_blksz?
829		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
830		 */
831		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
832
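		/*
		 * Tell the VM system about the grown file size up front so
		 * the vnode's page cache stays consistent with the range we
		 * are about to write.
		 */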
833		if (woff + nbytes > zp->z_phys->zp_size)
834			vnode_pager_setsize(vp, woff + nbytes);
835
836		rw_enter(&zp->z_map_lock, RW_READER);
837
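		/*
		 * Remember the residual count so that, after the copy below,
		 * tx_bytes reflects how much this pass actually wrote (the
		 * copy may make only partial progress).
		 */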
838		tx_bytes = uio->uio_resid;
839		if (vn_has_cached_data(vp)) {
840			rw_exit(&zp->z_map_lock);
841			error = mappedwrite(vp, nbytes, uio, tx);
842		} else {
843			error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
844			    uio, nbytes, tx);
845			rw_exit(&zp->z_map_lock);
846		}
847		tx_bytes -= uio->uio_resid;
848
849		/*
850		 * If we made no progress, we're done.  If we made even
851		 * partial progress, update the znode and ZIL accordingly.
852		 */
853		if (tx_bytes == 0) {
854			dmu_tx_commit(tx);
855			ASSERT(error != 0);
856			break;
857		}
858
859		/*
860		 * Clear Set-UID/Set-GID bits on successful write if not
861		 * privileged and at least one of the execute bits is set.
862		 *
863		 * It would be nice to do this after all writes have
864		 * been done, but that would still expose the ISUID/ISGID
865		 * to another app after the partial write is committed.
866		 *
867		 * Note: we don't call zfs_fuid_map_id() here because
868		 * user 0 is not an ephemeral uid.
869		 */
870		mutex_enter(&zp->z_acl_lock);
871		if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) |
872		    (S_IXUSR >> 6))) != 0 &&
873		    (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 &&
874		    secpolicy_vnode_setid_retain(vp, cr,
875		    (zp->z_phys->zp_mode & S_ISUID) != 0 &&
876		    zp->z_phys->zp_uid == 0) != 0) {
877			zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID);
878		}
879		mutex_exit(&zp->z_acl_lock);
880
881		/*
882		 * Update time stamp.  NOTE: This marks the bonus buffer as
883		 * dirty, so we don't have to do it again for zp_size.
884		 */
885		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
886
887		/*
888		 * Update the file size (zp_size) if it has changed;
889		 * account for possible concurrent updates.
890		 */
891		while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset)
892			(void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
893			    uio->uio_loffset);
894		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
895		dmu_tx_commit(tx);
896
897		if (error != 0)
898			break;
899		ASSERT(tx_bytes == nbytes);
900		n -= nbytes;
901	}
902
903	zfs_range_unlock(rl);
904
905	/*
906	 * If we're in replay mode, or we made no progress, return error.
907	 * Otherwise, it's at least a partial write, so it's successful.
908	 */
909	if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) {
910		ZFS_EXIT(zfsvfs);
911		return (error);
912	}
913
914	if (ioflag & (FSYNC | FDSYNC))
915		zil_commit(zilog, zp->z_last_itx, zp->z_id);
916
917	ZFS_EXIT(zfsvfs);
918	return (0);
919}
920
921void
922zfs_get_done(dmu_buf_t *db, void *vzgd)
923{
924	zgd_t *zgd = (zgd_t *)vzgd;
925	rl_t *rl = zgd->zgd_rl;
926	vnode_t *vp = ZTOV(rl->r_zp);
927	int vfslocked;
928
929	vfslocked = VFS_LOCK_GIANT(vp->v_vfsp);
930	dmu_buf_rele(db, vzgd);
931	zfs_range_unlock(rl);
932	/*
933	 * Release the vnode asynchronously as we currently have the
934	 * txg stopped from syncing.
935	 */
936	VN_RELE_ASYNC(vp, NULL);
937	zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
938	kmem_free(zgd, sizeof (zgd_t));
939	VFS_UNLOCK_GIANT(vfslocked);
940}
941
942/*
943 * Get data to generate a TX_WRITE intent log record.
944 */
945int
946zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
947{
948	zfsvfs_t *zfsvfs = arg;
949	objset_t *os = zfsvfs->z_os;
950	znode_t *zp;
951	uint64_t off = lr->lr_offset;
952	dmu_buf_t *db;
953	rl_t *rl;
954	zgd_t *zgd;
955	int dlen = lr->lr_length;		/* length of user data */
956	int error = 0;
957
958	ASSERT(zio);
959	ASSERT(dlen != 0);
960
961	/*
962	 * Nothing to do if the file has been removed
963	 */
964	if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
965		return (ENOENT);
966	if (zp->z_unlinked) {
967		/*
968		 * Release the vnode asynchronously as we currently have the
969		 * txg stopped from syncing.
970		 */
971		VN_RELE_ASYNC(ZTOV(zp), NULL);
972
973		return (ENOENT);
974	}
975
976	/*
977	 * Write records come in two flavors: immediate and indirect.
978	 * For small writes it's cheaper to store the data with the
979	 * log record (immediate); for large writes it's cheaper to
980	 * sync the data and get a pointer to it (indirect) so that
981	 * we don't have to write the data twice.
982	 */
983	if (buf != NULL) { /* immediate write */
984		rl = zfs_range_lock(zp, off, dlen, RL_READER);
985		/* test for truncation needs to be done while range locked */
986		if (off >= zp->z_phys->zp_size) {
987			error = ENOENT;
988			goto out;
989		}
990		VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf));
991	} else { /* indirect write */
992		uint64_t boff; /* block starting offset */
993
994		/*
995		 * Have to lock the whole block to ensure when it's
996		 * written out and its checksum is being calculated
997		 * that no one can change the data. We need to re-check
998		 * blocksize after we get the lock in case it's changed!
999		 */
1000		for (;;) {
1001			if (ISP2(zp->z_blksz)) {
1002				boff = P2ALIGN_TYPED(off, zp->z_blksz,
1003				    uint64_t);
1004			} else {
1005				boff = 0;
1006			}
1007			dlen = zp->z_blksz;
1008			rl = zfs_range_lock(zp, boff, dlen, RL_READER);
1009			if (zp->z_blksz == dlen)
1010				break;
1011			zfs_range_unlock(rl);
1012		}
1013		/* test for truncation needs to be done while range locked */
1014		if (off >= zp->z_phys->zp_size) {
1015			error = ENOENT;
1016			goto out;
1017		}
1018		zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
1019		zgd->zgd_rl = rl;
1020		zgd->zgd_zilog = zfsvfs->z_log;
1021		zgd->zgd_bp = &lr->lr_blkptr;
1022		VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db));
1023		ASSERT(boff == db->db_offset);
1024		lr->lr_blkoff = off - boff;
1025		error = dmu_sync(zio, db, &lr->lr_blkptr,
1026		    lr->lr_common.lrc_txg, zfs_get_done, zgd);
1027		ASSERT((error && error != EINPROGRESS) ||
1028		    lr->lr_length <= zp->z_blksz);
1029		if (error == 0)
1030			zil_add_block(zfsvfs->z_log, &lr->lr_blkptr);
1031		/*
1032		 * If we get EINPROGRESS, then we need to wait for a
1033		 * write IO initiated by dmu_sync() to complete before
1034		 * we can release this dbuf.  We will finish everything
1035		 * up in the zfs_get_done() callback.
1036		 */
1037		if (error == EINPROGRESS)
1038			return (0);
1039		dmu_buf_rele(db, zgd);
1040		kmem_free(zgd, sizeof (zgd_t));
1041	}
1042out:
1043	zfs_range_unlock(rl);
1044	/*
1045	 * Release the vnode asynchronously as we currently have the
1046	 * txg stopped from syncing.
1047	 */
1048	VN_RELE_ASYNC(ZTOV(zp), NULL);
1049	return (error);
1050}
1051
1052/*ARGSUSED*/
1053static int
1054zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
1055    caller_context_t *ct)
1056{
1057	znode_t *zp = VTOZ(vp);
1058	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1059	int error;
1060
1061	ZFS_ENTER(zfsvfs);
1062	ZFS_VERIFY_ZP(zp);
1063
1064	if (flag & V_ACE_MASK)
1065		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
1066	else
1067		error = zfs_zaccess_rwx(zp, mode, flag, cr);
1068
1069	ZFS_EXIT(zfsvfs);
1070	return (error);
1071}
1072
1073/*
1074 * Lookup an entry in a directory, or an extended attribute directory.
1075 * If it exists, return a held vnode reference for it.
1076 *
1077 *	IN:	dvp	- vnode of directory to search.
1078 *		nm	- name of entry to lookup.
1079 *		pnp	- full pathname to lookup [UNUSED].
1080 *		flags	- LOOKUP_XATTR set if looking for an attribute.
1081 *		rdir	- root directory vnode [UNUSED].
1082 *		cr	- credentials of caller.
1083 *		ct	- caller context
1084 *		direntflags - directory lookup flags
1085 *		realpnp - returned pathname.
1086 *
1087 *	OUT:	vpp	- vnode of located entry, NULL if not found.
1088 *
1089 *	RETURN:	0 if success
1090 *		error code if failure
1091 *
1092 * Timestamps:
1093 *	NA
1094 */
1095/* ARGSUSED */
1096static int
1097zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
1098    int nameiop, cred_t *cr, kthread_t *td, int flags)
1099{
1100	znode_t *zdp = VTOZ(dvp);
1101	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1102	int	error;
1103	int *direntflags = NULL;
1104	void *realpnp = NULL;
1105
1106	ZFS_ENTER(zfsvfs);
1107	ZFS_VERIFY_ZP(zdp);
1108
1109	*vpp = NULL;
1110
1111	if (flags & LOOKUP_XATTR) {
1112#ifdef TODO
1113		/*
1114		 * If the xattr property is off, refuse the lookup request.
1115		 */
1116		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
1117			ZFS_EXIT(zfsvfs);
1118			return (EINVAL);
1119		}
1120#endif
1121
1122		/*
1123		 * We don't allow recursive attributes.
1124		 * Maybe someday we will.
1125		 */
1126		if (zdp->z_phys->zp_flags & ZFS_XATTR) {
1127			ZFS_EXIT(zfsvfs);
1128			return (EINVAL);
1129		}
1130
1131		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
1132			ZFS_EXIT(zfsvfs);
1133			return (error);
1134		}
1135
1136		/*
1137		 * Do we have permission to get into attribute directory?
1138		 */
1139
1140		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
1141		    B_FALSE, cr)) {
1142			VN_RELE(*vpp);
1143			*vpp = NULL;
1144		}
1145
1146		ZFS_EXIT(zfsvfs);
1147		return (error);
1148	}
1149
1150	if (dvp->v_type != VDIR) {
1151		ZFS_EXIT(zfsvfs);
1152		return (ENOTDIR);
1153	}
1154
1155	/*
1156	 * Check accessibility of directory.
1157	 */
1158
1159	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
1160		ZFS_EXIT(zfsvfs);
1161		return (error);
1162	}
1163
1164	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
1165	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1166		ZFS_EXIT(zfsvfs);
1167		return (EILSEQ);
1168	}
1169
1170	error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
1171	if (error == 0) {
1172		/*
1173		 * Convert device special files
1174		 */
1175		if (IS_DEVVP(*vpp)) {
1176			vnode_t	*svp;
1177
1178			svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
1179			VN_RELE(*vpp);
1180			if (svp == NULL)
1181				error = ENOSYS;
1182			else
1183				*vpp = svp;
1184		}
1185	}
1186
1187	ZFS_EXIT(zfsvfs);
1188
1189	/* Translate errors and add SAVENAME when needed. */
1190	if (cnp->cn_flags & ISLASTCN) {
1191		switch (nameiop) {
1192		case CREATE:
1193		case RENAME:
1194			if (error == ENOENT) {
1195				error = EJUSTRETURN;
1196				cnp->cn_flags |= SAVENAME;
1197				break;
1198			}
1199			/* FALLTHROUGH */
1200		case DELETE:
1201			if (error == 0)
1202				cnp->cn_flags |= SAVENAME;
1203			break;
1204		}
1205	}
1206	if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) {
1207		int ltype = 0;
1208
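		/*
		 * For ".." the parent must be unlocked before locking the
		 * child vnode to avoid a lock-order reversal; its lock state
		 * is restored afterwards.
		 */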
1209		if (cnp->cn_flags & ISDOTDOT) {
1210			ltype = VOP_ISLOCKED(dvp);
1211			VOP_UNLOCK(dvp, 0);
1212		}
1213		error = vn_lock(*vpp, cnp->cn_lkflags);
1214		if (cnp->cn_flags & ISDOTDOT)
1215			vn_lock(dvp, ltype | LK_RETRY);
1216		if (error != 0) {
1217			VN_RELE(*vpp);
1218			*vpp = NULL;
1219			return (error);
1220		}
1221	}
1222
1223#ifdef FREEBSD_NAMECACHE
1224	/*
1225	 * Insert name into cache (as non-existent) if appropriate.
1226	 */
1227	if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
1228		cache_enter(dvp, *vpp, cnp);
1229	/*
1230	 * Insert name into cache if appropriate.
1231	 */
1232	if (error == 0 && (cnp->cn_flags & MAKEENTRY)) {
1233		if (!(cnp->cn_flags & ISLASTCN) ||
1234		    (nameiop != DELETE && nameiop != RENAME)) {
1235			cache_enter(dvp, *vpp, cnp);
1236		}
1237	}
1238#endif
1239
1240	return (error);
1241}
1242
1243/*
1244 * Attempt to create a new entry in a directory.  If the entry
1245 * already exists, truncate the file if permissible, else return
1246 * an error.  Return the vp of the created or trunc'd file.
1247 *
1248 *	IN:	dvp	- vnode of directory to put new file entry in.
1249 *		name	- name of new file entry.
1250 *		vap	- attributes of new file.
1251 *		excl	- flag indicating exclusive or non-exclusive mode.
1252 *		mode	- mode to open file with.
1253 *		cr	- credentials of caller.
1254 *		flag	- large file flag [UNUSED].
1255 *		ct	- caller context
1256 *		vsecp 	- ACL to be set
1257 *
1258 *	OUT:	vpp	- vnode of created or trunc'd entry.
1259 *
1260 *	RETURN:	0 if success
1261 *		error code if failure
1262 *
1263 * Timestamps:
1264 *	dvp - ctime|mtime updated if new entry created
1265 *	 vp - ctime|mtime always, atime if new
1266 */
1267
1268/* ARGSUSED */
1269static int
1270zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
1271    vnode_t **vpp, cred_t *cr, kthread_t *td)
1272{
1273	znode_t		*zp, *dzp = VTOZ(dvp);
1274	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1275	zilog_t		*zilog;
1276	objset_t	*os;
1277	zfs_dirlock_t	*dl;
1278	dmu_tx_t	*tx;
1279	int		error;
1280	zfs_acl_t	*aclp = NULL;
1281	zfs_fuid_info_t *fuidp = NULL;
1282	void		*vsecp = NULL;
1283	int		flag = 0;
1284
1285	/*
1286	 * If we have an ephemeral id, ACL, or XVATTR then
1287	 * make sure the file system is at the proper version
1288	 */
1289
1290	if (zfsvfs->z_use_fuids == B_FALSE &&
1291	    (vsecp || (vap->va_mask & AT_XVATTR) ||
1292	    IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))))
1293		return (EINVAL);
1294
1295	ZFS_ENTER(zfsvfs);
1296	ZFS_VERIFY_ZP(dzp);
1297	os = zfsvfs->z_os;
1298	zilog = zfsvfs->z_log;
1299
1300	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
1301	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1302		ZFS_EXIT(zfsvfs);
1303		return (EILSEQ);
1304	}
1305
1306	if (vap->va_mask & AT_XVATTR) {
1307		if ((error = secpolicy_xvattr((xvattr_t *)vap,
1308		    crgetuid(cr), cr, vap->va_type)) != 0) {
1309			ZFS_EXIT(zfsvfs);
1310			return (error);
1311		}
1312	}
1313top:
1314	*vpp = NULL;
1315
1316	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
1317		vap->va_mode &= ~S_ISVTX;
1318
1319	if (*name == '\0') {
1320		/*
1321		 * Null component name refers to the directory itself.
1322		 */
1323		VN_HOLD(dvp);
1324		zp = dzp;
1325		dl = NULL;
1326		error = 0;
1327	} else {
1328		/* possible VN_HOLD(zp) */
1329		int zflg = 0;
1330
1331		if (flag & FIGNORECASE)
1332			zflg |= ZCILOOK;
1333
1334		error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1335		    NULL, NULL);
1336		if (error) {
1337			if (strcmp(name, "..") == 0)
1338				error = EISDIR;
1339			ZFS_EXIT(zfsvfs);
1340			if (aclp)
1341				zfs_acl_free(aclp);
1342			return (error);
1343		}
1344	}
1345	if (vsecp && aclp == NULL) {
1346		error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp);
1347		if (error) {
1348			ZFS_EXIT(zfsvfs);
1349			if (dl)
1350				zfs_dirent_unlock(dl);
1351			return (error);
1352		}
1353	}
1354
1355	if (zp == NULL) {
1356		uint64_t txtype;
1357
1358		/*
1359		 * Create a new file object and update the directory
1360		 * to reference it.
1361		 */
1362		if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
1363			goto out;
1364		}
1365
1366		/*
1367		 * We only support the creation of regular files in
1368		 * extended attribute directories.
1369		 */
1370		if ((dzp->z_phys->zp_flags & ZFS_XATTR) &&
1371		    (vap->va_type != VREG)) {
1372			error = EINVAL;
1373			goto out;
1374		}
1375
1376		tx = dmu_tx_create(os);
1377		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
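		/*
		 * If ephemeral (e.g. SID-based) ids or an ACL carrying FUIDs
		 * are involved, also reserve room for a FUID table update.
		 */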
1378		if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(crgetuid(cr)) ||
1379		    IS_EPHEMERAL(crgetgid(cr))) {
1380			if (zfsvfs->z_fuid_obj == 0) {
1381				dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
1382				dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
1383				    FUID_SIZE_ESTIMATE(zfsvfs));
1384				dmu_tx_hold_zap(tx, MASTER_NODE_OBJ,
1385				    FALSE, NULL);
1386			} else {
1387				dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
1388				dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
1389				    FUID_SIZE_ESTIMATE(zfsvfs));
1390			}
1391		}
1392		dmu_tx_hold_bonus(tx, dzp->z_id);
1393		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1394		if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp) {
1395			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1396			    0, SPA_MAXBLOCKSIZE);
1397		}
1398		error = dmu_tx_assign(tx, zfsvfs->z_assign);
1399		if (error) {
1400			zfs_dirent_unlock(dl);
1401			if (error == ERESTART &&
1402			    zfsvfs->z_assign == TXG_NOWAIT) {
1403				dmu_tx_wait(tx);
1404				dmu_tx_abort(tx);
1405				goto top;
1406			}
1407			dmu_tx_abort(tx);
1408			ZFS_EXIT(zfsvfs);
1409			if (aclp)
1410				zfs_acl_free(aclp);
1411			return (error);
1412		}
1413		zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp);
1414		(void) zfs_link_create(dl, zp, tx, ZNEW);
1415		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1416		if (flag & FIGNORECASE)
1417			txtype |= TX_CI;
1418		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1419		    vsecp, fuidp, vap);
1420		if (fuidp)
1421			zfs_fuid_info_free(fuidp);
1422		dmu_tx_commit(tx);
1423	} else {
1424		int aflags = (flag & FAPPEND) ? V_APPEND : 0;
1425
1426		/*
1427		 * A directory entry already exists for this name.
1428		 */
1429		/*
1430		 * Can't truncate an existing file if in exclusive mode.
1431		 */
1432		if (excl == EXCL) {
1433			error = EEXIST;
1434			goto out;
1435		}
1436		/*
1437		 * Can't open a directory for writing.
1438		 */
1439		if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
1440			error = EISDIR;
1441			goto out;
1442		}
1443		/*
1444		 * Verify requested access to file.
1445		 */
1446		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
1447			goto out;
1448		}
1449
1450		mutex_enter(&dzp->z_lock);
1451		dzp->z_seq++;
1452		mutex_exit(&dzp->z_lock);
1453
1454		/*
1455		 * Truncate regular files if requested.
1456		 */
1457		if ((ZTOV(zp)->v_type == VREG) &&
1458		    (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
1459			/* we can't hold any locks when calling zfs_freesp() */
1460			zfs_dirent_unlock(dl);
1461			dl = NULL;
1462			error = zfs_freesp(zp, 0, 0, mode, TRUE);
1463			if (error == 0) {
1464				vnevent_create(ZTOV(zp), ct);
1465			}
1466		}
1467	}
1468out:
1469	if (dl)
1470		zfs_dirent_unlock(dl);
1471
1472	if (error) {
1473		if (zp)
1474			VN_RELE(ZTOV(zp));
1475	} else {
1476		*vpp = ZTOV(zp);
1477		/*
1478		 * If vnode is for a device return a specfs vnode instead.
1479		 */
1480		if (IS_DEVVP(*vpp)) {
1481			struct vnode *svp;
1482
1483			svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
1484			VN_RELE(*vpp);
1485			if (svp == NULL) {
1486				error = ENOSYS;
1487			}
1488			*vpp = svp;
1489		}
1490	}
1491	if (aclp)
1492		zfs_acl_free(aclp);
1493
1494	ZFS_EXIT(zfsvfs);
1495	return (error);
1496}
1497
1498/*
1499 * Remove an entry from a directory.
1500 *
1501 *	IN:	dvp	- vnode of directory to remove entry from.
1502 *		name	- name of entry to remove.
1503 *		cr	- credentials of caller.
1504 *		ct	- caller context
1505 *		flags	- case flags
1506 *
1507 *	RETURN:	0 if success
1508 *		error code if failure
1509 *
1510 * Timestamps:
1511 *	dvp - ctime|mtime
1512 *	 vp - ctime (if nlink > 0)
1513 */
1514/*ARGSUSED*/
1515static int
1516zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
1517    int flags)
1518{
1519	znode_t		*zp, *dzp = VTOZ(dvp);
1520	znode_t		*xzp = NULL;
1521	vnode_t		*vp;
1522	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1523	zilog_t		*zilog;
1524	uint64_t	acl_obj, xattr_obj;
1525	zfs_dirlock_t	*dl;
1526	dmu_tx_t	*tx;
1527	boolean_t	may_delete_now, delete_now = FALSE;
1528	boolean_t	unlinked, toobig = FALSE;
1529	uint64_t	txtype;
1530	pathname_t	*realnmp = NULL;
1531	pathname_t	realnm;
1532	int		error;
1533	int		zflg = ZEXISTS;
1534
1535	ZFS_ENTER(zfsvfs);
1536	ZFS_VERIFY_ZP(dzp);
1537	zilog = zfsvfs->z_log;
1538
1539	if (flags & FIGNORECASE) {
1540		zflg |= ZCILOOK;
1541		pn_alloc(&realnm);
1542		realnmp = &realnm;
1543	}
1544
1545top:
1546	/*
1547	 * Attempt to lock directory; fail if entry doesn't exist.
1548	 */
1549	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1550	    NULL, realnmp)) {
1551		if (realnmp)
1552			pn_free(realnmp);
1553		ZFS_EXIT(zfsvfs);
1554		return (error);
1555	}
1556
1557	vp = ZTOV(zp);
1558
1559	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1560		goto out;
1561	}
1562
1563	/*
1564	 * Need to use rmdir for removing directories.
1565	 */
1566	if (vp->v_type == VDIR) {
1567		error = EPERM;
1568		goto out;
1569	}
1570
1571	vnevent_remove(vp, dvp, name, ct);
1572
1573	if (realnmp)
1574		dnlc_remove(dvp, realnmp->pn_buf);
1575	else
1576		dnlc_remove(dvp, name);
1577
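	/*
	 * The immediate-delete optimization is disabled here: with
	 * may_delete_now forced to FALSE, a removed file always goes
	 * through the unlinked set and is reclaimed later.
	 */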
1578	may_delete_now = FALSE;
1579
1580	/*
1581	 * We may delete the znode now, or we may put it in the unlinked set;
1582	 * it depends on whether we're the last link, and on whether there are
1583	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
1584	 * allow for either case.
1585	 */
1586	tx = dmu_tx_create(zfsvfs->z_os);
1587	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1588	dmu_tx_hold_bonus(tx, zp->z_id);
1589	if (may_delete_now) {
1590		toobig =
1591		    zp->z_phys->zp_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
1592		/* if the file is too big, only hold_free a token amount */
1593		dmu_tx_hold_free(tx, zp->z_id, 0,
1594		    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
1595	}
1596
1597	/* are there any extended attributes? */
1598	if ((xattr_obj = zp->z_phys->zp_xattr) != 0) {
1599		/* XXX - do we need this if we are deleting? */
1600		dmu_tx_hold_bonus(tx, xattr_obj);
1601	}
1602
1603	/* are there any additional acls? */
1604	if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 &&
1605	    may_delete_now)
1606		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
1607
1608	/* charge as an update -- would be nice not to charge at all */
1609	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1610
1611	error = dmu_tx_assign(tx, zfsvfs->z_assign);
1612	if (error) {
1613		zfs_dirent_unlock(dl);
1614		VN_RELE(vp);
1615		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
1616			dmu_tx_wait(tx);
1617			dmu_tx_abort(tx);
1618			goto top;
1619		}
1620		if (realnmp)
1621			pn_free(realnmp);
1622		dmu_tx_abort(tx);
1623		ZFS_EXIT(zfsvfs);
1624		return (error);
1625	}
1626
1627	/*
1628	 * Remove the directory entry.
1629	 */
1630	error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
1631
1632	if (error) {
1633		dmu_tx_commit(tx);
1634		goto out;
1635	}
1636
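	/*
	 * The "0 &&" keeps the delete_now path compiled out; see the note
	 * above where may_delete_now is forced to FALSE.
	 */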
1637	if (0 && unlinked) {
1638		VI_LOCK(vp);
1639		delete_now = may_delete_now && !toobig &&
1640		    vp->v_count == 1 && !vn_has_cached_data(vp) &&
1641		    zp->z_phys->zp_xattr == xattr_obj &&
1642		    zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj;
1643		VI_UNLOCK(vp);
1644	}
1645
1646	if (delete_now) {
1647		if (zp->z_phys->zp_xattr) {
1648			error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
1649			ASSERT3U(error, ==, 0);
1650			ASSERT3U(xzp->z_phys->zp_links, ==, 2);
1651			dmu_buf_will_dirty(xzp->z_dbuf, tx);
1652			mutex_enter(&xzp->z_lock);
1653			xzp->z_unlinked = 1;
1654			xzp->z_phys->zp_links = 0;
1655			mutex_exit(&xzp->z_lock);
1656			zfs_unlinked_add(xzp, tx);
1657			zp->z_phys->zp_xattr = 0; /* probably unnecessary */
1658		}
1659		mutex_enter(&zp->z_lock);
1660		VI_LOCK(vp);
1661		vp->v_count--;
1662		ASSERT3U(vp->v_count, ==, 0);
1663		VI_UNLOCK(vp);
1664		mutex_exit(&zp->z_lock);
1665		zfs_znode_delete(zp, tx);
1666	} else if (unlinked) {
1667		zfs_unlinked_add(zp, tx);
1668	}
1669
1670	txtype = TX_REMOVE;
1671	if (flags & FIGNORECASE)
1672		txtype |= TX_CI;
1673	zfs_log_remove(zilog, tx, txtype, dzp, name);
1674
1675	dmu_tx_commit(tx);
1676out:
1677	if (realnmp)
1678		pn_free(realnmp);
1679
1680	zfs_dirent_unlock(dl);
1681
1682	if (!delete_now) {
1683		VN_RELE(vp);
1684	} else if (xzp) {
1685		/* this rele is delayed to prevent nesting transactions */
1686		VN_RELE(ZTOV(xzp));
1687	}
1688
1689	ZFS_EXIT(zfsvfs);
1690	return (error);
1691}
1692
1693/*
1694 * Create a new directory and insert it into dvp using the name
1695 * provided.  Return a pointer to the inserted directory.
1696 *
1697 *	IN:	dvp	- vnode of directory to add subdir to.
1698 *		dirname	- name of new directory.
1699 *		vap	- attributes of new directory.
1700 *		cr	- credentials of caller.
1701 *		ct	- caller context
1702 *		vsecp	- ACL to be set
1703 *
1704 *	OUT:	vpp	- vnode of created directory.
1705 *
1706 *	RETURN:	0 if success
1707 *		error code if failure
1708 *
1709 * Timestamps:
1710 *	dvp - ctime|mtime updated
1711 *	 vp - ctime|mtime|atime updated
1712 */
1713/*ARGSUSED*/
1714static int
1715zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
1716    caller_context_t *ct, int flags, vsecattr_t *vsecp)
1717{
1718	znode_t		*zp, *dzp = VTOZ(dvp);
1719	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1720	zilog_t		*zilog;
1721	zfs_dirlock_t	*dl;
1722	uint64_t	txtype;
1723	dmu_tx_t	*tx;
1724	int		error;
1725	zfs_acl_t	*aclp = NULL;
1726	zfs_fuid_info_t	*fuidp = NULL;
1727	int		zf = ZNEW;
1728
1729	ASSERT(vap->va_type == VDIR);
1730
1731	/*
1732	 * If we have an ephemeral id, ACL, or XVATTR then
1733	 * make sure the file system is at the proper version
1734	 */
1735
1736	if (zfsvfs->z_use_fuids == B_FALSE &&
1737	    (vsecp || (vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(crgetuid(cr))||
1738	    IS_EPHEMERAL(crgetgid(cr))))
1739		return (EINVAL);
1740
1741	ZFS_ENTER(zfsvfs);
1742	ZFS_VERIFY_ZP(dzp);
1743	zilog = zfsvfs->z_log;
1744
1745	if (dzp->z_phys->zp_flags & ZFS_XATTR) {
1746		ZFS_EXIT(zfsvfs);
1747		return (EINVAL);
1748	}
1749
1750	if (zfsvfs->z_utf8 && u8_validate(dirname,
1751	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1752		ZFS_EXIT(zfsvfs);
1753		return (EILSEQ);
1754	}
1755	if (flags & FIGNORECASE)
1756		zf |= ZCILOOK;
1757
1758	if (vap->va_mask & AT_XVATTR)
1759		if ((error = secpolicy_xvattr((xvattr_t *)vap,
1760		    crgetuid(cr), cr, vap->va_type)) != 0) {
1761			ZFS_EXIT(zfsvfs);
1762			return (error);
1763		}
1764
1765	/*
1766	 * First make sure the new directory doesn't exist.
1767	 */
1768top:
1769	*vpp = NULL;
1770
1771	if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
1772	    NULL, NULL)) {
1773		ZFS_EXIT(zfsvfs);
1774		return (error);
1775	}
1776
1777	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
1778		zfs_dirent_unlock(dl);
1779		ZFS_EXIT(zfsvfs);
1780		return (error);
1781	}
1782
1783	if (vsecp && aclp == NULL) {
1784		error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp);
1785		if (error) {
1786			zfs_dirent_unlock(dl);
1787			ZFS_EXIT(zfsvfs);
1788			return (error);
1789		}
1790	}
1791	/*
1792	 * Add a new entry to the directory.
1793	 */
1794	tx = dmu_tx_create(zfsvfs->z_os);
1795	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
1796	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
1797	if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(crgetuid(cr)) ||
1798	    IS_EPHEMERAL(crgetgid(cr))) {
1799		if (zfsvfs->z_fuid_obj == 0) {
1800			dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
1801			dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
1802			    FUID_SIZE_ESTIMATE(zfsvfs));
1803			dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
1804		} else {
1805			dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
1806			dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
1807			    FUID_SIZE_ESTIMATE(zfsvfs));
1808		}
1809	}
1810	if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp)
1811		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1812		    0, SPA_MAXBLOCKSIZE);
1813	error = dmu_tx_assign(tx, zfsvfs->z_assign);
1814	if (error) {
1815		zfs_dirent_unlock(dl);
1816		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
1817			dmu_tx_wait(tx);
1818			dmu_tx_abort(tx);
1819			goto top;
1820		}
1821		dmu_tx_abort(tx);
1822		ZFS_EXIT(zfsvfs);
1823		if (aclp)
1824			zfs_acl_free(aclp);
1825		return (error);
1826	}
1827
1828	/*
1829	 * Create new node.
1830	 */
1831	zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp);
1832
1833	if (aclp)
1834		zfs_acl_free(aclp);
1835
1836	/*
1837	 * Now put new name in parent dir.
1838	 */
1839	(void) zfs_link_create(dl, zp, tx, ZNEW);
1840
1841	*vpp = ZTOV(zp);
1842
1843	txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
1844	if (flags & FIGNORECASE)
1845		txtype |= TX_CI;
1846	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, fuidp, vap);
1847
1848	if (fuidp)
1849		zfs_fuid_info_free(fuidp);
1850	dmu_tx_commit(tx);
1851
1852	zfs_dirent_unlock(dl);
1853
1854	ZFS_EXIT(zfsvfs);
1855	return (0);
1856}
1857
1858/*
1859 * Remove a directory subdir entry.  If the current working
1860 * directory is the same as the subdir to be removed, the
1861 * remove will fail.
1862 *
1863 *	IN:	dvp	- vnode of directory to remove from.
1864 *		name	- name of directory to be removed.
1865 *		cwd	- vnode of current working directory.
1866 *		cr	- credentials of caller.
1867 *		ct	- caller context
1868 *		flags	- case flags
1869 *
1870 *	RETURN:	0 if success
1871 *		error code if failure
1872 *
1873 * Timestamps:
1874 *	dvp - ctime|mtime updated
1875 */
1876/*ARGSUSED*/
1877static int
1878zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
1879    caller_context_t *ct, int flags)
1880{
1881	znode_t		*dzp = VTOZ(dvp);
1882	znode_t		*zp;
1883	vnode_t		*vp;
1884	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1885	zilog_t		*zilog;
1886	zfs_dirlock_t	*dl;
1887	dmu_tx_t	*tx;
1888	int		error;
1889	int		zflg = ZEXISTS;
1890
1891	ZFS_ENTER(zfsvfs);
1892	ZFS_VERIFY_ZP(dzp);
1893	zilog = zfsvfs->z_log;
1894
1895	if (flags & FIGNORECASE)
1896		zflg |= ZCILOOK;
1897top:
1898	zp = NULL;
1899
1900	/*
1901	 * Attempt to lock directory; fail if entry doesn't exist.
1902	 */
1903	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1904	    NULL, NULL)) {
1905		ZFS_EXIT(zfsvfs);
1906		return (error);
1907	}
1908
1909	vp = ZTOV(zp);
1910
1911	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1912		goto out;
1913	}
1914
1915	if (vp->v_type != VDIR) {
1916		error = ENOTDIR;
1917		goto out;
1918	}
1919
1920	if (vp == cwd) {
1921		error = EINVAL;
1922		goto out;
1923	}
1924
1925	vnevent_rmdir(vp, dvp, name, ct);
1926
1927	/*
1928	 * Grab a lock on the directory to make sure that no one is
1929	 * trying to add (or look up) entries while we are removing it.
1930	 */
1931	rw_enter(&zp->z_name_lock, RW_WRITER);
1932
1933	/*
1934	 * Grab a lock on the parent pointer to make sure we play well
1935	 * with the treewalk and directory rename code.
1936	 */
1937	rw_enter(&zp->z_parent_lock, RW_WRITER);
1938
1939	tx = dmu_tx_create(zfsvfs->z_os);
1940	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1941	dmu_tx_hold_bonus(tx, zp->z_id);
1942	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1943	error = dmu_tx_assign(tx, zfsvfs->z_assign);
1944	if (error) {
1945		rw_exit(&zp->z_parent_lock);
1946		rw_exit(&zp->z_name_lock);
1947		zfs_dirent_unlock(dl);
1948		VN_RELE(vp);
1949		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
1950			dmu_tx_wait(tx);
1951			dmu_tx_abort(tx);
1952			goto top;
1953		}
1954		dmu_tx_abort(tx);
1955		ZFS_EXIT(zfsvfs);
1956		return (error);
1957	}
1958
1959#ifdef FREEBSD_NAMECACHE
1960	cache_purge(dvp);
1961#endif
1962
1963	error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
1964
1965	if (error == 0) {
1966		uint64_t txtype = TX_RMDIR;
1967		if (flags & FIGNORECASE)
1968			txtype |= TX_CI;
1969		zfs_log_remove(zilog, tx, txtype, dzp, name);
1970	}
1971
1972	dmu_tx_commit(tx);
1973
1974	rw_exit(&zp->z_parent_lock);
1975	rw_exit(&zp->z_name_lock);
1976#ifdef FREEBSD_NAMECACHE
1977	cache_purge(vp);
1978#endif
1979out:
1980	zfs_dirent_unlock(dl);
1981
1982	VN_RELE(vp);
1983
1984	ZFS_EXIT(zfsvfs);
1985	return (error);
1986}
1987
1988/*
1989 * Read as many directory entries as will fit into the provided
1990 * buffer from the given directory cursor position (specified in
1991 * the uio structure).
1992 *
1993 *	IN:	vp	- vnode of directory to read.
1994 *		uio	- structure supplying read location, range info,
1995 *			  and return buffer.
1996 *		cr	- credentials of caller.
1997 *		ct	- caller context
1998 *		flags	- case flags
1999 *
2000 *	OUT:	uio	- updated offset and range, buffer filled.
2001 *		eofp	- set to true if end-of-file detected.
2002 *
2003 *	RETURN:	0 if success
2004 *		error code if failure
2005 *
2006 * Timestamps:
2007 *	vp - atime updated
2008 *
2009 * Note that the low 4 bits of the cookie returned by zap are always zero.
2010 * This allows us to use the low range for "special" directory entries:
2011 * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
2012 * we use the offset 2 for the '.zfs' directory.
2013 */
2014/* ARGSUSED */
2015static int
2016zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
2017{
2018	znode_t		*zp = VTOZ(vp);
2019	iovec_t		*iovp;
2020	edirent_t	*eodp;
2021	dirent64_t	*odp;
2022	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2023	objset_t	*os;
2024	caddr_t		outbuf;
2025	size_t		bufsize;
2026	zap_cursor_t	zc;
2027	zap_attribute_t	zap;
2028	uint_t		bytes_wanted;
2029	uint64_t	offset; /* must be unsigned; checks for < 1 */
2030	int		local_eof;
2031	int		outcount;
2032	int		error;
2033	uint8_t		prefetch;
2034	boolean_t	check_sysattrs;
2035	uint8_t		type;
2036	int		ncooks;
2037	u_long		*cooks = NULL;
2038	int		flags = 0;
2039
2040	ZFS_ENTER(zfsvfs);
2041	ZFS_VERIFY_ZP(zp);
2042
2043	/*
2044	 * If we are not given an eof variable,
2045	 * use a local one.
2046	 */
2047	if (eofp == NULL)
2048		eofp = &local_eof;
2049
2050	/*
2051	 * Check for valid iov_len.
2052	 */
2053	if (uio->uio_iov->iov_len <= 0) {
2054		ZFS_EXIT(zfsvfs);
2055		return (EINVAL);
2056	}
2057
2058	/*
2059	 * Quit if the directory has been removed (POSIX).
2060	 */
2061	if ((*eofp = zp->z_unlinked) != 0) {
2062		ZFS_EXIT(zfsvfs);
2063		return (0);
2064	}
2065
2066	error = 0;
2067	os = zfsvfs->z_os;
2068	offset = uio->uio_loffset;
2069	prefetch = zp->z_zn_prefetch;
2070
2071	/*
2072	 * Initialize the iterator cursor.
2073	 */
2074	if (offset <= 3) {
2075		/*
2076		 * Start iteration from the beginning of the directory.
2077		 */
2078		zap_cursor_init(&zc, os, zp->z_id);
2079	} else {
2080		/*
2081		 * The offset is a serialized cursor.
2082		 */
2083		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
2084	}
2085
2086	/*
2087	 * Get space to change directory entries into fs independent format.
2088	 */
2089	iovp = uio->uio_iov;
2090	bytes_wanted = iovp->iov_len;
2091	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
2092		bufsize = bytes_wanted;
2093		outbuf = kmem_alloc(bufsize, KM_SLEEP);
2094		odp = (struct dirent64 *)outbuf;
2095	} else {
2096		bufsize = bytes_wanted;
2097		odp = (struct dirent64 *)iovp->iov_base;
2098	}
2099	eodp = (struct edirent *)odp;
2100
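	/*
	 * FreeBSD's VOP_READDIR may be asked (e.g., by the NFS server) for
	 * an array of cookies, one per entry returned, so that a directory
	 * scan can be resumed later.  Pre-allocate the array using a
	 * worst-case estimate based on the minimum record size.
	 */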
2101	if (ncookies != NULL) {
2102		/*
2103		 * Minimum entry size is the dirent structure without its name buffer, plus 1 byte for the name.
2104		 */
2105		ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
2106		cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
2107		*cookies = cooks;
2108		*ncookies = ncooks;
2109	}
2110	/*
2111	 * If this VFS supports the system attribute view interface; and
2112	 * we're looking at an extended attribute directory; and we care
2113	 * about normalization conflicts on this vfs; then we must check
2114	 * for normalization conflicts with the sysattr name space.
2115	 */
2116#ifdef TODO
2117	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
2118	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
2119	    (flags & V_RDDIR_ENTFLAGS);
2120#else
2121	check_sysattrs = 0;
2122#endif
2123
2124	/*
2125	 * Transform to file-system independent format
2126	 */
2127	outcount = 0;
2128	while (outcount < bytes_wanted) {
2129		ino64_t objnum;
2130		ushort_t reclen;
2131		off64_t *next;
2132
2133		/*
2134		 * Special case `.', `..', and `.zfs'.
2135		 */
2136		if (offset == 0) {
2137			(void) strcpy(zap.za_name, ".");
2138			zap.za_normalization_conflict = 0;
2139			objnum = zp->z_id;
2140			type = DT_DIR;
2141		} else if (offset == 1) {
2142			(void) strcpy(zap.za_name, "..");
2143			zap.za_normalization_conflict = 0;
2144			objnum = zp->z_phys->zp_parent;
2145			type = DT_DIR;
2146		} else if (offset == 2 && zfs_show_ctldir(zp)) {
2147			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
2148			zap.za_normalization_conflict = 0;
2149			objnum = ZFSCTL_INO_ROOT;
2150			type = DT_DIR;
2151		} else {
2152			/*
2153			 * Grab next entry.
2154			 */
2155			if (error = zap_cursor_retrieve(&zc, &zap)) {
2156				if ((*eofp = (error == ENOENT)) != 0)
2157					break;
2158				else
2159					goto update;
2160			}
2161
2162			if (zap.za_integer_length != 8 ||
2163			    zap.za_num_integers != 1) {
2164				cmn_err(CE_WARN, "zap_readdir: bad directory "
2165				    "entry, obj = %lld, offset = %lld\n",
2166				    (u_longlong_t)zp->z_id,
2167				    (u_longlong_t)offset);
2168				error = ENXIO;
2169				goto update;
2170			}
2171
2172			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
2173			/*
2174			 * MacOS X can extract the object type here, e.g.:
2175			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2176			 */
2177			type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2178
2179			if (check_sysattrs && !zap.za_normalization_conflict) {
2180#ifdef TODO
2181				zap.za_normalization_conflict =
2182				    xattr_sysattr_casechk(zap.za_name);
2183#else
2184				panic("%s:%u: TODO", __func__, __LINE__);
2185#endif
2186			}
2187		}
2188
2189		if (flags & V_RDDIR_ENTFLAGS)
2190			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
2191		else
2192			reclen = DIRENT64_RECLEN(strlen(zap.za_name));
2193
2194		/*
2195		 * Will this entry fit in the buffer?
2196		 */
2197		if (outcount + reclen > bufsize) {
2198			/*
2199			 * Did we manage to fit anything in the buffer?
2200			 */
2201			if (!outcount) {
2202				error = EINVAL;
2203				goto update;
2204			}
2205			break;
2206		}
2207		if (flags & V_RDDIR_ENTFLAGS) {
2208			/*
2209			 * Add extended flag entry:
2210			 */
2211			eodp->ed_ino = objnum;
2212			eodp->ed_reclen = reclen;
2213			/* NOTE: ed_off is the offset for the *next* entry */
2214			next = &(eodp->ed_off);
2215			eodp->ed_eflags = zap.za_normalization_conflict ?
2216			    ED_CASE_CONFLICT : 0;
2217			(void) strncpy(eodp->ed_name, zap.za_name,
2218			    EDIRENT_NAMELEN(reclen));
2219			eodp = (edirent_t *)((intptr_t)eodp + reclen);
2220		} else {
2221			/*
2222			 * Add normal entry:
2223			 */
2224			odp->d_ino = objnum;
2225			odp->d_reclen = reclen;
2226			odp->d_namlen = strlen(zap.za_name);
2227			(void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
2228			odp->d_type = type;
2229			odp = (dirent64_t *)((intptr_t)odp + reclen);
2230		}
2231		outcount += reclen;
2232
2233		ASSERT(outcount <= bufsize);
2234
2235		/* Prefetch znode */
2236		if (prefetch)
2237			dmu_prefetch(os, objnum, 0, 0);
2238
2239		/*
2240		 * Move to the next entry, fill in the previous offset.
2241		 */
2242		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2243			zap_cursor_advance(&zc);
2244			offset = zap_cursor_serialize(&zc);
2245		} else {
2246			offset += 1;
2247		}
2248
2249		if (cooks != NULL) {
2250			*cooks++ = offset;
2251			ncooks--;
2252			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
2253		}
2254	}
2255	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2256
2257	/* Subtract unused cookies */
2258	if (ncookies != NULL)
2259		*ncookies -= ncooks;
2260
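	/*
	 * If the entries were formatted directly into the caller's single
	 * kernel-space buffer, just advance the uio past them; otherwise
	 * copy them out of the temporary buffer allocated above.
	 */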
2261	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
2262		iovp->iov_base += outcount;
2263		iovp->iov_len -= outcount;
2264		uio->uio_resid -= outcount;
2265	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
2266		/*
2267		 * Reset the pointer.
2268		 */
2269		offset = uio->uio_loffset;
2270	}
2271
2272update:
2273	zap_cursor_fini(&zc);
2274	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
2275		kmem_free(outbuf, bufsize);
2276
2277	if (error == ENOENT)
2278		error = 0;
2279
2280	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2281
2282	uio->uio_loffset = offset;
2283	ZFS_EXIT(zfsvfs);
2284	if (error != 0 && cookies != NULL) {
2285		free(*cookies, M_TEMP);
2286		*cookies = NULL;
2287		*ncookies = 0;
2288	}
2289	return (error);
2290}
2291
2292ulong_t zfs_fsync_sync_cnt = 4;
2293
2294static int
2295zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
2296{
2297	znode_t	*zp = VTOZ(vp);
2298	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2299
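	/*
	 * Record (in thread-specific data) that this thread just issued an
	 * fsync; the counter acts as a hint to the ZIL logging code about
	 * how to handle the next few writes from this thread.
	 */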
2300	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2301
2302	ZFS_ENTER(zfsvfs);
2303	ZFS_VERIFY_ZP(zp);
2304	zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
2305	ZFS_EXIT(zfsvfs);
2306	return (0);
2307}
2308
2309
2310/*
2311 * Get the requested file attributes and place them in the provided
2312 * vattr structure.
2313 *
2314 *	IN:	vp	- vnode of file.
2315 *		vap	- va_mask identifies requested attributes.
2316 *			  If AT_XVATTR set, then optional attrs are requested
2317 *		flags	- ATTR_NOACLCHECK (CIFS server context)
2318 *		cr	- credentials of caller.
2319 *		ct	- caller context
2320 *
2321 *	OUT:	vap	- attribute values.
2322 *
2323 *	RETURN:	0 (always succeeds)
2324 */
2325/* ARGSUSED */
2326static int
2327zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2328    caller_context_t *ct)
2329{
2330	znode_t *zp = VTOZ(vp);
2331	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2332	znode_phys_t *pzp;
2333	int	error = 0;
2334	uint32_t blksize;
2335	u_longlong_t nblocks;
2336	uint64_t links;
2337	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
2338	xoptattr_t *xoap = NULL;
2339	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2340
2341	ZFS_ENTER(zfsvfs);
2342	ZFS_VERIFY_ZP(zp);
2343	pzp = zp->z_phys;
2344
2345	mutex_enter(&zp->z_lock);
2346
2347	/*
2348	 * If the ACL is trivial, don't bother looking for ACE_READ_ATTRIBUTES.
2349	 * Also, if we are the owner, don't bother, since the owner should
2350	 * always be allowed to read the basic attributes of the file.
2351	 */
2352	if (!(pzp->zp_flags & ZFS_ACL_TRIVIAL) &&
2353	    (pzp->zp_uid != crgetuid(cr))) {
2354		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
2355		    skipaclchk, cr)) {
2356			mutex_exit(&zp->z_lock);
2357			ZFS_EXIT(zfsvfs);
2358			return (error);
2359		}
2360	}
2361
2362	/*
2363	 * Return all attributes.  It's cheaper to provide the answer
2364	 * than to determine whether we were asked the question.
2365	 */
2366
2367	vap->va_type = IFTOVT(pzp->zp_mode);
2368	vap->va_mode = pzp->zp_mode & ~S_IFMT;
2369	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
2370//	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
2371	vap->va_nodeid = zp->z_id;
2372	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
2373		links = pzp->zp_links + 1;
2374	else
2375		links = pzp->zp_links;
2376	vap->va_nlink = MIN(links, UINT32_MAX);	/* nlink_t limit! */
2377	vap->va_size = pzp->zp_size;
2378	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
2379	vap->va_rdev = zfs_cmpldev(pzp->zp_rdev);
2380	vap->va_seq = zp->z_seq;
2381	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */
2382
2383	/*
2384	 * Add in any requested optional attributes and the create time.
2385	 * Also set the corresponding bits in the returned attribute bitmap.
2386	 */
2387	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
2388		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
2389			xoap->xoa_archive =
2390			    ((pzp->zp_flags & ZFS_ARCHIVE) != 0);
2391			XVA_SET_RTN(xvap, XAT_ARCHIVE);
2392		}
2393
2394		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
2395			xoap->xoa_readonly =
2396			    ((pzp->zp_flags & ZFS_READONLY) != 0);
2397			XVA_SET_RTN(xvap, XAT_READONLY);
2398		}
2399
2400		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
2401			xoap->xoa_system =
2402			    ((pzp->zp_flags & ZFS_SYSTEM) != 0);
2403			XVA_SET_RTN(xvap, XAT_SYSTEM);
2404		}
2405
2406		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
2407			xoap->xoa_hidden =
2408			    ((pzp->zp_flags & ZFS_HIDDEN) != 0);
2409			XVA_SET_RTN(xvap, XAT_HIDDEN);
2410		}
2411
2412		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2413			xoap->xoa_nounlink =
2414			    ((pzp->zp_flags & ZFS_NOUNLINK) != 0);
2415			XVA_SET_RTN(xvap, XAT_NOUNLINK);
2416		}
2417
2418		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2419			xoap->xoa_immutable =
2420			    ((pzp->zp_flags & ZFS_IMMUTABLE) != 0);
2421			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
2422		}
2423
2424		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2425			xoap->xoa_appendonly =
2426			    ((pzp->zp_flags & ZFS_APPENDONLY) != 0);
2427			XVA_SET_RTN(xvap, XAT_APPENDONLY);
2428		}
2429
2430		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2431			xoap->xoa_nodump =
2432			    ((pzp->zp_flags & ZFS_NODUMP) != 0);
2433			XVA_SET_RTN(xvap, XAT_NODUMP);
2434		}
2435
2436		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
2437			xoap->xoa_opaque =
2438			    ((pzp->zp_flags & ZFS_OPAQUE) != 0);
2439			XVA_SET_RTN(xvap, XAT_OPAQUE);
2440		}
2441
2442		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2443			xoap->xoa_av_quarantined =
2444			    ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0);
2445			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
2446		}
2447
2448		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2449			xoap->xoa_av_modified =
2450			    ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0);
2451			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
2452		}
2453
2454		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
2455		    vp->v_type == VREG &&
2456		    (pzp->zp_flags & ZFS_BONUS_SCANSTAMP)) {
2457			size_t len;
2458			dmu_object_info_t doi;
2459
2460			/*
2461			 * Only VREG files have anti-virus scanstamps, so we
2462			 * won't conflict with symlinks in the bonus buffer.
2463			 */
2464			dmu_object_info_from_db(zp->z_dbuf, &doi);
2465			len = sizeof (xoap->xoa_av_scanstamp) +
2466			    sizeof (znode_phys_t);
2467			if (len <= doi.doi_bonus_size) {
2468				/*
2469				 * pzp points to the start of the
2470				 * znode_phys_t. pzp + 1 points to the
2471				 * first byte after the znode_phys_t.
2472				 */
2473				(void) memcpy(xoap->xoa_av_scanstamp,
2474				    pzp + 1,
2475				    sizeof (xoap->xoa_av_scanstamp));
2476				XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
2477			}
2478		}
2479
2480		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
2481			ZFS_TIME_DECODE(&xoap->xoa_createtime, pzp->zp_crtime);
2482			XVA_SET_RTN(xvap, XAT_CREATETIME);
2483		}
2484	}
2485
2486	ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime);
2487	ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime);
2488	ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime);
2489	ZFS_TIME_DECODE(&vap->va_birthtime, pzp->zp_crtime);
2490
2491	mutex_exit(&zp->z_lock);
2492
2493	dmu_object_size_from_db(zp->z_dbuf, &blksize, &nblocks);
2494	vap->va_blksize = blksize;
2495	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */
2496
2497	if (zp->z_blksz == 0) {
2498		/*
2499		 * Block size hasn't been set; suggest maximal I/O transfers.
2500		 */
2501		vap->va_blksize = zfsvfs->z_max_blksz;
2502	}
2503
2504	ZFS_EXIT(zfsvfs);
2505	return (0);
2506}
2507
2508/*
2509 * Set the file attributes to the values contained in the
2510 * vattr structure.
2511 *
2512 *	IN:	vp	- vnode of file to be modified.
2513 *		vap	- new attribute values.
2514 *			  If AT_XVATTR set, then optional attrs are being set
2515 *		flags	- ATTR_UTIME set if non-default time values provided.
2516 *			- ATTR_NOACLCHECK (CIFS context only).
2517 *		cr	- credentials of caller.
2518 *		ct	- caller context
2519 *
2520 *	RETURN:	0 if success
2521 *		error code if failure
2522 *
2523 * Timestamps:
2524 *	vp - ctime updated, mtime updated if size changed.
2525 */
2526/* ARGSUSED */
2527static int
2528zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2529	caller_context_t *ct)
2530{
2531	znode_t		*zp = VTOZ(vp);
2532	znode_phys_t	*pzp;
2533	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2534	zilog_t		*zilog;
2535	dmu_tx_t	*tx;
2536	vattr_t		oldva;
2537	uint_t		mask = vap->va_mask;
2538	uint_t		saved_mask;
2539	int		trim_mask = 0;
2540	uint64_t	new_mode;
2541	znode_t		*attrzp;
2542	int		need_policy = FALSE;
2543	int		err;
2544	zfs_fuid_info_t *fuidp = NULL;
2545	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
2546	xoptattr_t	*xoap;
2547	zfs_acl_t	*aclp = NULL;
2548	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2549
2550	if (mask == 0)
2551		return (0);
2552
2553	if (mask & AT_NOSET)
2554		return (EINVAL);
2555
2556	ZFS_ENTER(zfsvfs);
2557	ZFS_VERIFY_ZP(zp);
2558
2559	pzp = zp->z_phys;
2560	zilog = zfsvfs->z_log;
2561
2562	/*
2563	 * Make sure that if we have an ephemeral uid/gid or an xvattr specified,
2564	 * the file system is at the proper version level.
2565	 */
2566
2567	if (zfsvfs->z_use_fuids == B_FALSE &&
2568	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2569	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2570	    (mask & AT_XVATTR))) {
2571		ZFS_EXIT(zfsvfs);
2572		return (EINVAL);
2573	}
2574
2575	if (mask & AT_SIZE && vp->v_type == VDIR) {
2576		ZFS_EXIT(zfsvfs);
2577		return (EISDIR);
2578	}
2579
2580	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
2581		ZFS_EXIT(zfsvfs);
2582		return (EINVAL);
2583	}
2584
2585	/*
2586	 * If this is an xvattr_t, then get a pointer to the structure of
2587	 * optional attributes.  If this is NULL, then we have a vattr_t.
2588	 */
2589	xoap = xva_getxoptattr(xvap);
2590
2591	/*
2592	 * On immutable files, only the immutable bit and atime may be altered.
2593	 */
2594	if ((pzp->zp_flags & ZFS_IMMUTABLE) &&
2595	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
2596	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2597		ZFS_EXIT(zfsvfs);
2598		return (EPERM);
2599	}
2600
2601	if ((mask & AT_SIZE) && (pzp->zp_flags & ZFS_READONLY)) {
2602		ZFS_EXIT(zfsvfs);
2603		return (EPERM);
2604	}
2605
2606	/*
2607	 * Verify that the timestamps don't overflow 32 bits.
2608	 * ZFS can handle large timestamps, but 32-bit syscalls can't
2609	 * handle times greater than 2039.  This check should be removed
2610	 * once large timestamps are fully supported.
2611	 */
2612	if (mask & (AT_ATIME | AT_MTIME)) {
2613		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2614		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2615			ZFS_EXIT(zfsvfs);
2616			return (EOVERFLOW);
2617		}
2618	}
2619
2620top:
2621	attrzp = NULL;
2622
2623	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
2624		ZFS_EXIT(zfsvfs);
2625		return (EROFS);
2626	}
2627
2628	/*
2629	 * First validate permissions
2630	 */
2631
2632	if (mask & AT_SIZE) {
2633		err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
2634		if (err) {
2635			ZFS_EXIT(zfsvfs);
2636			return (err);
2637		}
2638		/*
2639		 * XXX - Note, we are not providing any open
2640		 * mode flags here (like FNDELAY), so we may
2641		 * block if there are locks present... this
2642		 * should be addressed in openat().
2643		 */
2644		/* XXX - would it be OK to generate a log record here? */
2645		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2646		if (err) {
2647			ZFS_EXIT(zfsvfs);
2648			return (err);
2649		}
2650	}
2651
2652	if (mask & (AT_ATIME|AT_MTIME) ||
2653	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2654	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2655	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2656	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2657	    XVA_ISSET_REQ(xvap, XAT_SYSTEM))))
2658		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
2659		    skipaclchk, cr);
2660
2661	if (mask & (AT_UID|AT_GID)) {
2662		int	idmask = (mask & (AT_UID|AT_GID));
2663		int	take_owner;
2664		int	take_group;
2665
2666		/*
2667		 * NOTE: even if a new mode is being set,
2668		 * we may clear S_ISUID/S_ISGID bits.
2669		 */
2670
2671		if (!(mask & AT_MODE))
2672			vap->va_mode = pzp->zp_mode;
2673
2674		/*
2675		 * Take ownership or chgrp to group we are a member of
2676		 */
2677
2678		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
2679		take_group = (mask & AT_GID) &&
2680		    zfs_groupmember(zfsvfs, vap->va_gid, cr);
2681
2682		/*
2683		 * If both AT_UID and AT_GID are set then take_owner and
2684		 * take_group must both be set in order to allow taking
2685		 * ownership.
2686		 *
2687		 * Otherwise, send the check through secpolicy_vnode_setattr()
2688		 *
2689		 */
2690
2691		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
2692		    ((idmask == AT_UID) && take_owner) ||
2693		    ((idmask == AT_GID) && take_group)) {
2694			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
2695			    skipaclchk, cr) == 0) {
2696				/*
2697				 * Remove setuid/setgid for non-privileged users
2698				 */
2699				secpolicy_setid_clear(vap, vp, cr);
2700				trim_mask = (mask & (AT_UID|AT_GID));
2701			} else {
2702				need_policy =  TRUE;
2703			}
2704		} else {
2705			need_policy =  TRUE;
2706		}
2707	}
2708
2709	mutex_enter(&zp->z_lock);
2710	oldva.va_mode = pzp->zp_mode;
2711	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
2712	if (mask & AT_XVATTR) {
2713		if ((need_policy == FALSE) &&
2714		    (XVA_ISSET_REQ(xvap, XAT_APPENDONLY) &&
2715		    xoap->xoa_appendonly !=
2716		    ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) ||
2717		    (XVA_ISSET_REQ(xvap, XAT_NOUNLINK) &&
2718		    xoap->xoa_nounlink !=
2719		    ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) ||
2720		    (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE) &&
2721		    xoap->xoa_immutable !=
2722		    ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) ||
2723		    (XVA_ISSET_REQ(xvap, XAT_NODUMP) &&
2724		    xoap->xoa_nodump !=
2725		    ((pzp->zp_flags & ZFS_NODUMP) != 0)) ||
2726		    (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED) &&
2727		    xoap->xoa_av_modified !=
2728		    ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) ||
2729		    ((XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED) &&
2730		    ((vp->v_type != VREG && xoap->xoa_av_quarantined) ||
2731		    xoap->xoa_av_quarantined !=
2732		    ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)))) ||
2733		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
2734		    (XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
2735			need_policy = TRUE;
2736		}
2737	}
2738
2739	mutex_exit(&zp->z_lock);
2740
2741	if (mask & AT_MODE) {
2742		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
2743			err = secpolicy_setid_setsticky_clear(vp, vap,
2744			    &oldva, cr);
2745			if (err) {
2746				ZFS_EXIT(zfsvfs);
2747				return (err);
2748			}
2749			trim_mask |= AT_MODE;
2750		} else {
2751			need_policy = TRUE;
2752		}
2753	}
2754
2755	if (need_policy) {
2756		/*
2757		 * If trim_mask is set, then take-ownership
2758		 * has been granted or write_acl is present and the user
2759		 * has the ability to modify the mode.  In that case remove
2760		 * UID|GID and/or MODE from the mask so that
2761		 * secpolicy_vnode_setattr() doesn't revoke it.
2762		 */
2763
2764		if (trim_mask) {
2765			saved_mask = vap->va_mask;
2766			vap->va_mask &= ~trim_mask;
2767		}
2768		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
2769		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
2770		if (err) {
2771			ZFS_EXIT(zfsvfs);
2772			return (err);
2773		}
2774
2775		if (trim_mask)
2776			vap->va_mask |= saved_mask;
2777	}
2778
2779	/*
2780	 * secpolicy_vnode_setattr() or the take-ownership path may have
2781	 * changed va_mask.
2782	 */
2783	mask = vap->va_mask;
2784
2785	tx = dmu_tx_create(zfsvfs->z_os);
2786	dmu_tx_hold_bonus(tx, zp->z_id);
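	/*
	 * Setting an ephemeral (SID-based) uid or gid may require creating
	 * or growing the on-disk FUID table, so hold the objects that such
	 * an update could touch.
	 */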
2787	if (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2788	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid))) {
2789		if (zfsvfs->z_fuid_obj == 0) {
2790			dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
2791			dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
2792			    FUID_SIZE_ESTIMATE(zfsvfs));
2793			dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
2794		} else {
2795			dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
2796			dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
2797			    FUID_SIZE_ESTIMATE(zfsvfs));
2798		}
2799	}
2800
2801	if (mask & AT_MODE) {
2802		uint64_t pmode = pzp->zp_mode;
2803
2804		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
2805
2806		if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) {
2807			dmu_tx_abort(tx);
2808			ZFS_EXIT(zfsvfs);
2809			return (err);
2810		}
2811		if (pzp->zp_acl.z_acl_extern_obj) {
2812			/* Are we upgrading the ACL from the old V0 format to the new V1? */
2813			if (zfsvfs->z_version <= ZPL_VERSION_FUID &&
2814			    pzp->zp_acl.z_acl_version ==
2815			    ZFS_ACL_VERSION_INITIAL) {
2816				dmu_tx_hold_free(tx,
2817				    pzp->zp_acl.z_acl_extern_obj, 0,
2818				    DMU_OBJECT_END);
2819				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2820				    0, aclp->z_acl_bytes);
2821			} else {
2822				dmu_tx_hold_write(tx,
2823				    pzp->zp_acl.z_acl_extern_obj, 0,
2824				    aclp->z_acl_bytes);
2825			}
2826		} else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2827			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2828			    0, aclp->z_acl_bytes);
2829		}
2830	}
2831
2832	if ((mask & (AT_UID | AT_GID)) && pzp->zp_xattr != 0) {
2833		err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp);
2834		if (err) {
2835			dmu_tx_abort(tx);
2836			ZFS_EXIT(zfsvfs);
2837			if (aclp)
2838				zfs_acl_free(aclp);
2839			return (err);
2840		}
2841		dmu_tx_hold_bonus(tx, attrzp->z_id);
2842	}
2843
2844	err = dmu_tx_assign(tx, zfsvfs->z_assign);
2845	if (err) {
2846		if (attrzp)
2847			VN_RELE(ZTOV(attrzp));
2848
2849		if (aclp) {
2850			zfs_acl_free(aclp);
2851			aclp = NULL;
2852		}
2853
2854		if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
2855			dmu_tx_wait(tx);
2856			dmu_tx_abort(tx);
2857			goto top;
2858		}
2859		dmu_tx_abort(tx);
2860		ZFS_EXIT(zfsvfs);
2861		return (err);
2862	}
2863
2864	dmu_buf_will_dirty(zp->z_dbuf, tx);
2865
2866	/*
2867	 * Set each attribute requested.
2868	 * We group settings according to the locks they need to acquire.
2869	 *
2870	 * Note: you cannot set ctime directly, although it will be
2871	 * updated as a side-effect of calling this function.
2872	 */
2873
2874	mutex_enter(&zp->z_lock);
2875
2876	if (mask & AT_MODE) {
2877		mutex_enter(&zp->z_acl_lock);
2878		zp->z_phys->zp_mode = new_mode;
2879		err = zfs_aclset_common(zp, aclp, cr, &fuidp, tx);
2880		ASSERT3U(err, ==, 0);
2881		mutex_exit(&zp->z_acl_lock);
2882	}
2883
2884	if (attrzp)
2885		mutex_enter(&attrzp->z_lock);
2886
2887	if (mask & AT_UID) {
2888		pzp->zp_uid = zfs_fuid_create(zfsvfs,
2889		    vap->va_uid, cr, ZFS_OWNER, tx, &fuidp);
2890		if (attrzp) {
2891			attrzp->z_phys->zp_uid = zfs_fuid_create(zfsvfs,
2892			    vap->va_uid,  cr, ZFS_OWNER, tx, &fuidp);
2893		}
2894	}
2895
2896	if (mask & AT_GID) {
2897		pzp->zp_gid = zfs_fuid_create(zfsvfs, vap->va_gid,
2898		    cr, ZFS_GROUP, tx, &fuidp);
2899		if (attrzp)
2900			attrzp->z_phys->zp_gid = zfs_fuid_create(zfsvfs,
2901			    vap->va_gid, cr, ZFS_GROUP, tx, &fuidp);
2902	}
2903
2904	if (aclp)
2905		zfs_acl_free(aclp);
2906
2907	if (attrzp)
2908		mutex_exit(&attrzp->z_lock);
2909
2910	if (mask & AT_ATIME)
2911		ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
2912
2913	if (mask & AT_MTIME)
2914		ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
2915
2916	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
2917	if (mask & AT_SIZE)
2918		zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx);
2919	else if (mask != 0)
2920		zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
2921	/*
2922	 * Do this after setting the timestamps to prevent the timestamp
2923	 * update from toggling the bit.
2924	 */
2925
2926	if (xoap && (mask & AT_XVATTR)) {
2927		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
2928			size_t len;
2929			dmu_object_info_t doi;
2930
2931			ASSERT(vp->v_type == VREG);
2932
2933			/* Grow the bonus buffer if necessary. */
2934			dmu_object_info_from_db(zp->z_dbuf, &doi);
2935			len = sizeof (xoap->xoa_av_scanstamp) +
2936			    sizeof (znode_phys_t);
2937			if (len > doi.doi_bonus_size)
2938				VERIFY(dmu_set_bonus(zp->z_dbuf, len, tx) == 0);
2939		}
2940		zfs_xvattr_set(zp, xvap);
2941	}
2942
2943	if (mask != 0)
2944		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
2945
2946	if (fuidp)
2947		zfs_fuid_info_free(fuidp);
2948	mutex_exit(&zp->z_lock);
2949
2950	if (attrzp)
2951		VN_RELE(ZTOV(attrzp));
2952
2953	dmu_tx_commit(tx);
2954
2955	ZFS_EXIT(zfsvfs);
2956	return (err);
2957}
2958
2959typedef struct zfs_zlock {
2960	krwlock_t	*zl_rwlock;	/* lock we acquired */
2961	znode_t		*zl_znode;	/* znode we held */
2962	struct zfs_zlock *zl_next;	/* next in list */
2963} zfs_zlock_t;
2964
2965/*
2966 * Drop locks and release vnodes that were held by zfs_rename_lock().
2967 */
2968static void
2969zfs_rename_unlock(zfs_zlock_t **zlpp)
2970{
2971	zfs_zlock_t *zl;
2972
2973	while ((zl = *zlpp) != NULL) {
2974		if (zl->zl_znode != NULL)
2975			VN_RELE(ZTOV(zl->zl_znode));
2976		rw_exit(zl->zl_rwlock);
2977		*zlpp = zl->zl_next;
2978		kmem_free(zl, sizeof (*zl));
2979	}
2980}
2981
2982/*
2983 * Search back through the directory tree, using the ".." entries.
2984 * Lock each directory in the chain to prevent concurrent renames.
2985 * Fail any attempt to move a directory into one of its own descendants.
2986 * XXX - z_parent_lock can overlap with map or grow locks
2987 */
2988static int
2989zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
2990{
2991	zfs_zlock_t	*zl;
2992	znode_t		*zp = tdzp;
2993	uint64_t	rootid = zp->z_zfsvfs->z_root;
2994	uint64_t	*oidp = &zp->z_id;
2995	krwlock_t	*rwlp = &szp->z_parent_lock;
2996	krw_t		rw = RW_WRITER;
2997
2998	/*
2999	 * First pass write-locks szp and compares to zp->z_id.
3000	 * Later passes read-lock zp and compare to zp->z_parent.
3001	 */
3002	do {
3003		if (!rw_tryenter(rwlp, rw)) {
3004			/*
3005			 * Another thread is renaming in this path.
3006			 * Note that if we are a WRITER, we don't have any
3007			 * parent_locks held yet.
3008			 */
3009			if (rw == RW_READER && zp->z_id > szp->z_id) {
3010				/*
3011				 * Drop our locks and restart
3012				 */
3013				zfs_rename_unlock(&zl);
3014				*zlpp = NULL;
3015				zp = tdzp;
3016				oidp = &zp->z_id;
3017				rwlp = &szp->z_parent_lock;
3018				rw = RW_WRITER;
3019				continue;
3020			} else {
3021				/*
3022				 * Wait for other thread to drop its locks
3023				 */
3024				rw_enter(rwlp, rw);
3025			}
3026		}
3027
3028		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
3029		zl->zl_rwlock = rwlp;
3030		zl->zl_znode = NULL;
3031		zl->zl_next = *zlpp;
3032		*zlpp = zl;
3033
3034		if (*oidp == szp->z_id)		/* We're a descendant of szp */
3035			return (EINVAL);
3036
3037		if (*oidp == rootid)		/* We've hit the top */
3038			return (0);
3039
3040		if (rw == RW_READER) {		/* i.e. not the first pass */
3041			int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp);
3042			if (error)
3043				return (error);
3044			zl->zl_znode = zp;
3045		}
3046		oidp = &zp->z_phys->zp_parent;
3047		rwlp = &zp->z_parent_lock;
3048		rw = RW_READER;
3049
3050	} while (zp->z_id != sdzp->z_id);
3051
3052	return (0);
3053}
3054
3055/*
3056 * Move an entry from the provided source directory to the target
3057 * directory.  Change the entry name as indicated.
3058 *
3059 *	IN:	sdvp	- Source directory containing the "old entry".
3060 *		snm	- Old entry name.
3061 *		tdvp	- Target directory to contain the "new entry".
3062 *		tnm	- New entry name.
3063 *		cr	- credentials of caller.
3064 *		ct	- caller context
3065 *		flags	- case flags
3066 *
3067 *	RETURN:	0 if success
3068 *		error code if failure
3069 *
3070 * Timestamps:
3071 *	sdvp,tdvp - ctime|mtime updated
3072 */
3073/*ARGSUSED*/
3074static int
3075zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
3076    caller_context_t *ct, int flags)
3077{
3078	znode_t		*tdzp, *szp, *tzp;
3079	znode_t		*sdzp = VTOZ(sdvp);
3080	zfsvfs_t	*zfsvfs = sdzp->z_zfsvfs;
3081	zilog_t		*zilog;
3082	vnode_t		*realvp;
3083	zfs_dirlock_t	*sdl, *tdl;
3084	dmu_tx_t	*tx;
3085	zfs_zlock_t	*zl;
3086	int		cmp, serr, terr;
3087	int		error = 0;
3088	int		zflg = 0;
3089
3090	ZFS_ENTER(zfsvfs);
3091	ZFS_VERIFY_ZP(sdzp);
3092	zilog = zfsvfs->z_log;
3093
3094	/*
3095	 * Make sure we have the real vp for the target directory.
3096	 */
3097	if (VOP_REALVP(tdvp, &realvp, ct) == 0)
3098		tdvp = realvp;
3099
3100	if (tdvp->v_vfsp != sdvp->v_vfsp) {
3101		ZFS_EXIT(zfsvfs);
3102		return (EXDEV);
3103	}
3104
3105	tdzp = VTOZ(tdvp);
3106	ZFS_VERIFY_ZP(tdzp);
3107	if (zfsvfs->z_utf8 && u8_validate(tnm,
3108	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3109		ZFS_EXIT(zfsvfs);
3110		return (EILSEQ);
3111	}
3112
3113	if (flags & FIGNORECASE)
3114		zflg |= ZCILOOK;
3115
3116top:
3117	szp = NULL;
3118	tzp = NULL;
3119	zl = NULL;
3120
3121	/*
3122	 * This is to prevent the creation of links into attribute space
3123	 * by renaming a linked file into/out of an attribute directory.
3124	 * See the comment in zfs_link() for why this is considered bad.
3125	 */
3126	if ((tdzp->z_phys->zp_flags & ZFS_XATTR) !=
3127	    (sdzp->z_phys->zp_flags & ZFS_XATTR)) {
3128		ZFS_EXIT(zfsvfs);
3129		return (EINVAL);
3130	}
3131
3132	/*
3133	 * Lock source and target directory entries.  To prevent deadlock,
3134	 * a lock ordering must be defined.  We lock the directory with
3135	 * the smallest object id first, or if it's a tie, the one with
3136	 * the lexically first name.
3137	 */
3138	if (sdzp->z_id < tdzp->z_id) {
3139		cmp = -1;
3140	} else if (sdzp->z_id > tdzp->z_id) {
3141		cmp = 1;
3142	} else {
3143		/*
3144		 * First compare the two name arguments without
3145		 * considering any case folding.
3146		 */
3147		int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
3148
3149		cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
3150		ASSERT(error == 0 || !zfsvfs->z_utf8);
3151		if (cmp == 0) {
3152			/*
3153			 * POSIX: "If the old argument and the new argument
3154			 * both refer to links to the same existing file,
3155			 * the rename() function shall return successfully
3156			 * and perform no other action."
3157			 */
3158			ZFS_EXIT(zfsvfs);
3159			return (0);
3160		}
3161		/*
3162		 * If the file system is case-folding, then we may
3163		 * have some more checking to do.  A case-folding file
3164		 * system is either supporting mixed case sensitivity
3165		 * access or is completely case-insensitive.  Note
3166		 * that the file system is always case preserving.
3167		 *
3168		 * In mixed sensitivity mode case sensitive behavior
3169		 * is the default.  FIGNORECASE must be used to
3170		 * explicitly request case insensitive behavior.
3171		 *
3172		 * If the source and target names provided differ only
3173		 * by case (e.g., a request to rename 'tim' to 'Tim'),
3174		 * we will treat this as a special case in the
3175		 * case-insensitive mode: as long as the source name
3176		 * is an exact match, we will allow this to proceed as
3177		 * a name-change request.
3178		 */
3179		if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
3180		    (zfsvfs->z_case == ZFS_CASE_MIXED &&
3181		    flags & FIGNORECASE)) &&
3182		    u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
3183		    &error) == 0) {
3184			/*
3185			 * Case-preserving rename request; require exact
3186			 * name matches.
3187			 */
3188			zflg |= ZCIEXACT;
3189			zflg &= ~ZCILOOK;
3190		}
3191	}
3192
3193	if (cmp < 0) {
3194		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
3195		    ZEXISTS | zflg, NULL, NULL);
3196		terr = zfs_dirent_lock(&tdl,
3197		    tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
3198	} else {
3199		terr = zfs_dirent_lock(&tdl,
3200		    tdzp, tnm, &tzp, zflg, NULL, NULL);
3201		serr = zfs_dirent_lock(&sdl,
3202		    sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
3203		    NULL, NULL);
3204	}
3205
3206	if (serr) {
3207		/*
3208		 * Source entry invalid or not there.
3209		 */
3210		if (!terr) {
3211			zfs_dirent_unlock(tdl);
3212			if (tzp)
3213				VN_RELE(ZTOV(tzp));
3214		}
3215		if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0)
3216			serr = EINVAL;
3217		ZFS_EXIT(zfsvfs);
3218		return (serr);
3219	}
3220	if (terr) {
3221		zfs_dirent_unlock(sdl);
3222		VN_RELE(ZTOV(szp));
3223		if (strcmp(tnm, "..") == 0)
3224			terr = EINVAL;
3225		ZFS_EXIT(zfsvfs);
3226		return (terr);
3227	}
3228
3229	/*
3230	 * Must have write access at the source to remove the old entry
3231	 * and write access at the target to create the new entry.
3232	 * Note that if target and source are the same, this can be
3233	 * done in a single check.
3234	 */
3235
3236	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
3237		goto out;
3238
3239	if (ZTOV(szp)->v_type == VDIR) {
3240		/*
3241		 * Check to make sure rename is valid.
3242		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
3243		 */
3244		if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
3245			goto out;
3246	}
3247
3248	/*
3249	 * Does target exist?
3250	 */
3251	if (tzp) {
3252		/*
3253		 * Source and target must be the same type.
3254		 */
3255		if (ZTOV(szp)->v_type == VDIR) {
3256			if (ZTOV(tzp)->v_type != VDIR) {
3257				error = ENOTDIR;
3258				goto out;
3259			}
3260		} else {
3261			if (ZTOV(tzp)->v_type == VDIR) {
3262				error = EISDIR;
3263				goto out;
3264			}
3265		}
3266		/*
3267		 * POSIX dictates that when the source and target
3268		 * entries refer to the same file object, rename
3269		 * must do nothing and exit without error.
3270		 */
3271		if (szp->z_id == tzp->z_id) {
3272			error = 0;
3273			goto out;
3274		}
3275	}
3276
3277	vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
3278	if (tzp)
3279		vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);
3280
3281	/*
3282	 * Notify the target directory if it is not the same
3283	 * as the source directory.
3284	 */
3285	if (tdvp != sdvp) {
3286		vnevent_rename_dest_dir(tdvp, ct);
3287	}
3288
3289	tx = dmu_tx_create(zfsvfs->z_os);
3290	dmu_tx_hold_bonus(tx, szp->z_id);	/* nlink changes */
3291	dmu_tx_hold_bonus(tx, sdzp->z_id);	/* nlink changes */
3292	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
3293	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
3294	if (sdzp != tdzp)
3295		dmu_tx_hold_bonus(tx, tdzp->z_id);	/* nlink changes */
3296	if (tzp)
3297		dmu_tx_hold_bonus(tx, tzp->z_id);	/* parent changes */
3298	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3299	error = dmu_tx_assign(tx, zfsvfs->z_assign);
3300	if (error) {
3301		if (zl != NULL)
3302			zfs_rename_unlock(&zl);
3303		zfs_dirent_unlock(sdl);
3304		zfs_dirent_unlock(tdl);
3305		VN_RELE(ZTOV(szp));
3306		if (tzp)
3307			VN_RELE(ZTOV(tzp));
3308		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
3309			dmu_tx_wait(tx);
3310			dmu_tx_abort(tx);
3311			goto top;
3312		}
3313		dmu_tx_abort(tx);
3314		ZFS_EXIT(zfsvfs);
3315		return (error);
3316	}
3317
3318	if (tzp)	/* Attempt to remove the existing target */
3319		error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
3320
3321	if (error == 0) {
3322		error = zfs_link_create(tdl, szp, tx, ZRENAMING);
3323		if (error == 0) {
3324			szp->z_phys->zp_flags |= ZFS_AV_MODIFIED;
3325
3326			error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
3327			ASSERT(error == 0);
3328
3329			zfs_log_rename(zilog, tx,
3330			    TX_RENAME | (flags & FIGNORECASE ? TX_CI : 0),
3331			    sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp);
3332
3333			/* Update path information for the target vnode */
3334			vn_renamepath(tdvp, ZTOV(szp), tnm, strlen(tnm));
3335		}
3336#ifdef FREEBSD_NAMECACHE
3337		if (error == 0) {
3338			cache_purge(sdvp);
3339			cache_purge(tdvp);
3340		}
3341#endif
3342	}
3343
3344	dmu_tx_commit(tx);
3345out:
3346	if (zl != NULL)
3347		zfs_rename_unlock(&zl);
3348
3349	zfs_dirent_unlock(sdl);
3350	zfs_dirent_unlock(tdl);
3351
3352	VN_RELE(ZTOV(szp));
3353	if (tzp)
3354		VN_RELE(ZTOV(tzp));
3355
3356	ZFS_EXIT(zfsvfs);
3357
3358	return (error);
3359}
3360
3361/*
3362 * Insert the indicated symbolic reference entry into the directory.
3363 *
3364 *	IN:	dvp	- Directory to contain new symbolic link.
3365 *		name	- Name for new symlink entry.
3366 *		vap	- Attributes of new entry.
3367 *		link	- Target path of new symlink.
3368 *		cr	- credentials of caller.
3369 *		ct	- caller context
3370 *		flags	- case flags
3371 *
3372 *	RETURN:	0 if success
3373 *		error code if failure
3374 *
3375 * Timestamps:
3376 *	dvp - ctime|mtime updated
3377 */
3378/*ARGSUSED*/
3379static int
3380zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link,
3381    cred_t *cr, kthread_t *td)
3382{
3383	znode_t		*zp, *dzp = VTOZ(dvp);
3384	zfs_dirlock_t	*dl;
3385	dmu_tx_t	*tx;
3386	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
3387	zilog_t		*zilog;
3388	int		len = strlen(link);
3389	int		error;
3390	int		zflg = ZNEW;
3391	zfs_fuid_info_t *fuidp = NULL;
3392	int		flags = 0;
3393
3394	ASSERT(vap->va_type == VLNK);
3395
3396	ZFS_ENTER(zfsvfs);
3397	ZFS_VERIFY_ZP(dzp);
3398	zilog = zfsvfs->z_log;
3399
3400	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
3401	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3402		ZFS_EXIT(zfsvfs);
3403		return (EILSEQ);
3404	}
3405	if (flags & FIGNORECASE)
3406		zflg |= ZCILOOK;
3407top:
3408	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
3409		ZFS_EXIT(zfsvfs);
3410		return (error);
3411	}
3412
3413	if (len > MAXPATHLEN) {
3414		ZFS_EXIT(zfsvfs);
3415		return (ENAMETOOLONG);
3416	}
3417
3418	/*
3419	 * Attempt to lock directory; fail if entry already exists.
3420	 */
3421	error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
3422	if (error) {
3423		ZFS_EXIT(zfsvfs);
3424		return (error);
3425	}
3426
3427	tx = dmu_tx_create(zfsvfs->z_os);
3428	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
3429	dmu_tx_hold_bonus(tx, dzp->z_id);
3430	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3431	if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
3432		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE);
3433	if (IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))) {
3434		if (zfsvfs->z_fuid_obj == 0) {
3435			dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
3436			dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
3437			    FUID_SIZE_ESTIMATE(zfsvfs));
3438			dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL);
3439		} else {
3440			dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj);
3441			dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0,
3442			    FUID_SIZE_ESTIMATE(zfsvfs));
3443		}
3444	}
3445	error = dmu_tx_assign(tx, zfsvfs->z_assign);
3446	if (error) {
3447		zfs_dirent_unlock(dl);
3448		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
3449			dmu_tx_wait(tx);
3450			dmu_tx_abort(tx);
3451			goto top;
3452		}
3453		dmu_tx_abort(tx);
3454		ZFS_EXIT(zfsvfs);
3455		return (error);
3456	}
3457
3458	dmu_buf_will_dirty(dzp->z_dbuf, tx);
3459
3460	/*
3461	 * Create a new object for the symlink.
3462	 * Put the link content into bonus buffer if it will fit;
3463	 * otherwise, store it just like any other file data.
3464	 */
3465	if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) {
3466		zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, NULL, &fuidp);
3467		if (len != 0)
3468			bcopy(link, zp->z_phys + 1, len);
3469	} else {
3470		dmu_buf_t *dbp;
3471
3472		zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, NULL, &fuidp);
3473		/*
3474		 * Nothing can access the znode yet, so no locking is needed
3475		 * for growing the znode's blocksize.
3476		 */
3477		zfs_grow_blocksize(zp, len, tx);
3478
3479		VERIFY(0 == dmu_buf_hold(zfsvfs->z_os,
3480		    zp->z_id, 0, FTAG, &dbp));
3481		dmu_buf_will_dirty(dbp, tx);
3482
3483		ASSERT3U(len, <=, dbp->db_size);
3484		bcopy(link, dbp->db_data, len);
3485		dmu_buf_rele(dbp, FTAG);
3486	}
3487	zp->z_phys->zp_size = len;
3488
3489	/*
3490	 * Insert the new object into the directory.
3491	 */
3492	(void) zfs_link_create(dl, zp, tx, ZNEW);
3493out:
3494	if (error == 0) {
3495		uint64_t txtype = TX_SYMLINK;
3496		if (flags & FIGNORECASE)
3497			txtype |= TX_CI;
3498		zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
3499		*vpp = ZTOV(zp);
3500	}
3501	if (fuidp)
3502		zfs_fuid_info_free(fuidp);
3503
3504	dmu_tx_commit(tx);
3505
3506	zfs_dirent_unlock(dl);
3507
3508	ZFS_EXIT(zfsvfs);
3509	return (error);
3510}
3511
3512/*
3513 * Return, in the buffer contained in the provided uio structure,
3514 * the symbolic path referred to by vp.
3515 *
3516 *	IN:	vp	- vnode of symbolic link.
3517 *		uoip	- structure to contain the link path.
3518 *		uio	- structure to contain the link path.
3519 *		ct	- caller context
3520 *
3521 *	OUT:	uio	- structure to contain the link path.
3522 *
3523 *	RETURN:	0 if success
3524 *		error code if failure
3525 *
3526 * Timestamps:
3527 *	vp - atime updated
3528 */
3529/* ARGSUSED */
3530static int
3531zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
3532{
3533	znode_t		*zp = VTOZ(vp);
3534	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
3535	size_t		bufsz;
3536	int		error;
3537
3538	ZFS_ENTER(zfsvfs);
3539	ZFS_VERIFY_ZP(zp);
3540
3541	bufsz = (size_t)zp->z_phys->zp_size;
3542	if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) {
3543		error = uiomove(zp->z_phys + 1,
3544		    MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
3545	} else {
3546		dmu_buf_t *dbp;
3547		error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp);
3548		if (error) {
3549			ZFS_EXIT(zfsvfs);
3550			return (error);
3551		}
3552		error = uiomove(dbp->db_data,
3553		    MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
3554		dmu_buf_rele(dbp, FTAG);
3555	}
3556
3557	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
3558	ZFS_EXIT(zfsvfs);
3559	return (error);
3560}
3561
3562/*
3563 * Insert a new entry into directory tdvp referencing svp.
3564 *
3565 *	IN:	tdvp	- Directory to contain new entry.
3566 *		svp	- vnode of new entry.
3567 *		name	- name of new entry.
3568 *		cr	- credentials of caller.
3569 *		ct	- caller context
3570 *
3571 *	RETURN:	0 if success
3572 *		error code if failure
3573 *
3574 * Timestamps:
3575 *	tdvp - ctime|mtime updated
3576 *	 svp - ctime updated
3577 */
3578/* ARGSUSED */
3579static int
3580zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
3581    caller_context_t *ct, int flags)
3582{
3583	znode_t		*dzp = VTOZ(tdvp);
3584	znode_t		*tzp, *szp;
3585	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
3586	zilog_t		*zilog;
3587	zfs_dirlock_t	*dl;
3588	dmu_tx_t	*tx;
3589	vnode_t		*realvp;
3590	int		error;
3591	int		zf = ZNEW;
3592	uid_t		owner;
3593
3594	ASSERT(tdvp->v_type == VDIR);
3595
3596	ZFS_ENTER(zfsvfs);
3597	ZFS_VERIFY_ZP(dzp);
3598	zilog = zfsvfs->z_log;
3599
3600	if (VOP_REALVP(svp, &realvp, ct) == 0)
3601		svp = realvp;
3602
3603	if (svp->v_vfsp != tdvp->v_vfsp) {
3604		ZFS_EXIT(zfsvfs);
3605		return (EXDEV);
3606	}
3607	szp = VTOZ(svp);
3608	ZFS_VERIFY_ZP(szp);
3609
3610	if (zfsvfs->z_utf8 && u8_validate(name,
3611	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3612		ZFS_EXIT(zfsvfs);
3613		return (EILSEQ);
3614	}
3615	if (flags & FIGNORECASE)
3616		zf |= ZCILOOK;
3617
3618top:
3619	/*
3620	 * We do not support links between attributes and non-attributes
3621	 * because of the potential security risk of creating links
3622	 * into "normal" file space in order to circumvent restrictions
3623	 * imposed in attribute space.
3624	 */
3625	if ((szp->z_phys->zp_flags & ZFS_XATTR) !=
3626	    (dzp->z_phys->zp_flags & ZFS_XATTR)) {
3627		ZFS_EXIT(zfsvfs);
3628		return (EINVAL);
3629	}
3630
3631	/*
3632	 * POSIX dictates that we return EPERM here.
3633	 * Better choices include ENOTSUP or EISDIR.
3634	 */
3635	if (svp->v_type == VDIR) {
3636		ZFS_EXIT(zfsvfs);
3637		return (EPERM);
3638	}
3639
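	/*
	 * Hard-linking a file you do not own requires privilege; the
	 * check is delegated to secpolicy_basic_link().
	 */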
3640	owner = zfs_fuid_map_id(zfsvfs, szp->z_phys->zp_uid, cr, ZFS_OWNER);
3641	if (owner != crgetuid(cr) &&
3642	    secpolicy_basic_link(svp, cr) != 0) {
3643		ZFS_EXIT(zfsvfs);
3644		return (EPERM);
3645	}
3646
3647	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
3648		ZFS_EXIT(zfsvfs);
3649		return (error);
3650	}
3651
3652	/*
3653	 * Attempt to lock directory; fail if entry already exists.
3654	 */
3655	error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
3656	if (error) {
3657		ZFS_EXIT(zfsvfs);
3658		return (error);
3659	}
3660
3661	tx = dmu_tx_create(zfsvfs->z_os);
3662	dmu_tx_hold_bonus(tx, szp->z_id);
3663	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3664	error = dmu_tx_assign(tx, zfsvfs->z_assign);
3665	if (error) {
3666		zfs_dirent_unlock(dl);
3667		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
3668			dmu_tx_wait(tx);
3669			dmu_tx_abort(tx);
3670			goto top;
3671		}
3672		dmu_tx_abort(tx);
3673		ZFS_EXIT(zfsvfs);
3674		return (error);
3675	}
3676
3677	error = zfs_link_create(dl, szp, tx, 0);
3678
3679	if (error == 0) {
3680		uint64_t txtype = TX_LINK;
3681		if (flags & FIGNORECASE)
3682			txtype |= TX_CI;
3683		zfs_log_link(zilog, tx, txtype, dzp, szp, name);
3684	}
3685
3686	dmu_tx_commit(tx);
3687
3688	zfs_dirent_unlock(dl);
3689
3690	if (error == 0) {
3691		vnevent_link(svp, ct);
3692	}
3693
3694	ZFS_EXIT(zfsvfs);
3695	return (error);
3696}
3697
3698/*ARGSUSED*/
3699void
3700zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
3701{
3702	znode_t	*zp = VTOZ(vp);
3703	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
3704	int error;
3705
3706	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
3707	if (zp->z_dbuf == NULL) {
3708		/*
3709		 * The fs has been unmounted, or we did a
3710		 * suspend/resume and this file no longer exists.
3711		 */
3712		VI_LOCK(vp);
3713		vp->v_count = 0; /* count arrives as 1 */
3714		VI_UNLOCK(vp);
3715		vrecycle(vp, curthread);
3716		rw_exit(&zfsvfs->z_teardown_inactive_lock);
3717		return;
3718	}
3719
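	/*
	 * If an atime update is still pending in core and the file has not
	 * been unlinked, push it out in its own transaction so the update
	 * is not lost when the znode is reclaimed.
	 */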
3720	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
3721		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
3722
3723		dmu_tx_hold_bonus(tx, zp->z_id);
3724		error = dmu_tx_assign(tx, TXG_WAIT);
3725		if (error) {
3726			dmu_tx_abort(tx);
3727		} else {
3728			dmu_buf_will_dirty(zp->z_dbuf, tx);
3729			mutex_enter(&zp->z_lock);
3730			zp->z_atime_dirty = 0;
3731			mutex_exit(&zp->z_lock);
3732			dmu_tx_commit(tx);
3733		}
3734	}
3735
3736	zfs_zinactive(zp);
3737	rw_exit(&zfsvfs->z_teardown_inactive_lock);
3738}
3739
3740CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
3741CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
3742
3743/*ARGSUSED*/
3744static int
3745zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
3746{
3747	znode_t		*zp = VTOZ(vp);
3748	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
3749	uint32_t	gen;
3750	uint64_t	object = zp->z_id;
3751	zfid_short_t	*zfid;
3752	int		size, i;
3753
3754	ZFS_ENTER(zfsvfs);
3755	ZFS_VERIFY_ZP(zp);
3756	gen = (uint32_t)zp->z_gen;
3757
3758	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
3759	fidp->fid_len = size;
3760
3761	zfid = (zfid_short_t *)fidp;
3762
3763	zfid->zf_len = size;
3764
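	/*
	 * Pack the object number (and, below, the generation) into the
	 * fid as little-endian byte arrays; the file handle to vnode
	 * translation reverses this packing when a handle comes back,
	 * e.g. from the NFS server.
	 */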
3765	for (i = 0; i < sizeof (zfid->zf_object); i++)
3766		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
3767
3768	/* Must have a non-zero generation number to distinguish from .zfs */
3769	if (gen == 0)
3770		gen = 1;
3771	for (i = 0; i < sizeof (zfid->zf_gen); i++)
3772		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
3773
3774	if (size == LONG_FID_LEN) {
3775		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
3776		zfid_long_t	*zlfid;
3777
3778		zlfid = (zfid_long_t *)fidp;
3779
3780		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
3781			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
3782
3783		/* XXX - this should be the generation number for the objset */
3784		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
3785			zlfid->zf_setgen[i] = 0;
3786	}
3787
3788	ZFS_EXIT(zfsvfs);
3789	return (0);
3790}
3791
3792static int
3793zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
3794    caller_context_t *ct)
3795{
3796	znode_t		*zp, *xzp;
3797	zfsvfs_t	*zfsvfs;
3798	zfs_dirlock_t	*dl;
3799	int		error;
3800
3801	switch (cmd) {
3802	case _PC_LINK_MAX:
3803		*valp = INT_MAX;
3804		return (0);
3805
3806	case _PC_FILESIZEBITS:
3807		*valp = 64;
3808		return (0);
3809
3810#if 0
3811	case _PC_XATTR_EXISTS:
3812		zp = VTOZ(vp);
3813		zfsvfs = zp->z_zfsvfs;
3814		ZFS_ENTER(zfsvfs);
3815		ZFS_VERIFY_ZP(zp);
3816		*valp = 0;
3817		error = zfs_dirent_lock(&dl, zp, "", &xzp,
3818		    ZXATTR | ZEXISTS | ZSHARED, NULL, NULL);
3819		if (error == 0) {
3820			zfs_dirent_unlock(dl);
3821			if (!zfs_dirempty(xzp))
3822				*valp = 1;
3823			VN_RELE(ZTOV(xzp));
3824		} else if (error == ENOENT) {
3825			/*
3826			 * If there aren't extended attributes, it's the
3827			 * same as having zero of them.
3828			 */
3829			error = 0;
3830		}
3831		ZFS_EXIT(zfsvfs);
3832		return (error);
3833#endif
3834
3835	case _PC_ACL_EXTENDED:
3836		*valp = 0;	/* TODO */
3837		return (0);
3838
3839	case _PC_MIN_HOLE_SIZE:
3840		*valp = (int)SPA_MINBLOCKSIZE;
3841		return (0);
3842
3843	default:
3844		return (EOPNOTSUPP);
3845	}
3846}
3847
3848/*ARGSUSED*/
3849static int
3850zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
3851    caller_context_t *ct)
3852{
3853	znode_t *zp = VTOZ(vp);
3854	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
3855	int error;
3856	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
3857
3858	ZFS_ENTER(zfsvfs);
3859	ZFS_VERIFY_ZP(zp);
3860	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
3861	ZFS_EXIT(zfsvfs);
3862
3863	return (error);
3864}
3865
3866/*ARGSUSED*/
3867static int
3868zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
3869    caller_context_t *ct)
3870{
3871	znode_t *zp = VTOZ(vp);
3872	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
3873	int error;
3874	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
3875
3876	ZFS_ENTER(zfsvfs);
3877	ZFS_VERIFY_ZP(zp);
3878	error = zfs_setacl(zp, vsecp, skipaclchk, cr);
3879	ZFS_EXIT(zfsvfs);
3880	return (error);
3881}
3882
3883static int
3884zfs_freebsd_open(ap)
3885	struct vop_open_args /* {
3886		struct vnode *a_vp;
3887		int a_mode;
3888		struct ucred *a_cred;
3889		struct thread *a_td;
3890	} */ *ap;
3891{
3892	vnode_t	*vp = ap->a_vp;
3893	znode_t *zp = VTOZ(vp);
3894	int error;
3895
3896	error = zfs_open(&vp, ap->a_mode, ap->a_cred, NULL);
3897	if (error == 0)
3898		vnode_create_vobject(vp, zp->z_phys->zp_size, ap->a_td);
3899	return (error);
3900}
3901
3902static int
3903zfs_freebsd_close(ap)
3904	struct vop_close_args /* {
3905		struct vnode *a_vp;
3906		int  a_fflag;
3907		struct ucred *a_cred;
3908		struct thread *a_td;
3909	} */ *ap;
3910{
3911
3912	return (zfs_close(ap->a_vp, ap->a_fflag, 0, 0, ap->a_cred, NULL));
3913}
3914
3915static int
3916zfs_freebsd_ioctl(ap)
3917	struct vop_ioctl_args /* {
3918		struct vnode *a_vp;
3919		u_long a_command;
3920		caddr_t a_data;
3921		int a_fflag;
3922		struct ucred *cred;
3923		struct thread *td;
3924	} */ *ap;
3925{
3926
3927	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
3928	    ap->a_fflag, ap->a_cred, NULL, NULL));
3929}
3930
3931static int
3932zfs_freebsd_read(ap)
3933	struct vop_read_args /* {
3934		struct vnode *a_vp;
3935		struct uio *a_uio;
3936		int a_ioflag;
3937		struct ucred *a_cred;
3938	} */ *ap;
3939{
3940
3941	return (zfs_read(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL));
3942}
3943
3944static int
3945zfs_freebsd_write(ap)
3946	struct vop_write_args /* {
3947		struct vnode *a_vp;
3948		struct uio *a_uio;
3949		int a_ioflag;
3950		struct ucred *a_cred;
3951	} */ *ap;
3952{
3953
3954	return (zfs_write(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL));
3955}
3956
3957static int
3958zfs_freebsd_access(ap)
3959	struct vop_access_args /* {
3960		struct vnode *a_vp;
3961		accmode_t a_accmode;
3962		struct ucred *a_cred;
3963		struct thread *a_td;
3964	} */ *ap;
3965{
3966
3967	/*
3968	 * ZFS itself only knows about VREAD, VWRITE and VEXEC; the rest
3969	 * we have to handle by calling vaccess().
3970	 */
3971	if ((ap->a_accmode & ~(VREAD|VWRITE|VEXEC)) != 0) {
3972		vnode_t *vp = ap->a_vp;
3973		znode_t *zp = VTOZ(vp);
3974		znode_phys_t *zphys = zp->z_phys;
3975
3976		return (vaccess(vp->v_type, zphys->zp_mode, zphys->zp_uid,
3977		    zphys->zp_gid, ap->a_accmode, ap->a_cred, NULL));
3978	}
3979
3980	return (zfs_access(ap->a_vp, ap->a_accmode, 0, ap->a_cred, NULL));
3981}
3982
3983static int
3984zfs_freebsd_lookup(ap)
3985	struct vop_lookup_args /* {
3986		struct vnode *a_dvp;
3987		struct vnode **a_vpp;
3988		struct componentname *a_cnp;
3989	} */ *ap;
3990{
3991	struct componentname *cnp = ap->a_cnp;
3992	char nm[NAME_MAX + 1];
3993
3994	ASSERT(cnp->cn_namelen < sizeof(nm));
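	/*
	 * The component name supplied by namei is not guaranteed to be
	 * NUL-terminated at cn_namelen, so copy it into a local buffer
	 * before handing it to zfs_lookup().
	 */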
3995	strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
3996
3997	return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
3998	    cnp->cn_cred, cnp->cn_thread, 0));
3999}
4000
4001static int
4002zfs_freebsd_create(ap)
4003	struct vop_create_args /* {
4004		struct vnode *a_dvp;
4005		struct vnode **a_vpp;
4006		struct componentname *a_cnp;
4007		struct vattr *a_vap;
4008	} */ *ap;
4009{
4010	struct componentname *cnp = ap->a_cnp;
4011	vattr_t *vap = ap->a_vap;
4012	int mode;
4013
4014	ASSERT(cnp->cn_flags & SAVENAME);
4015
4016	vattr_init_mask(vap);
4017	mode = vap->va_mode & ALLPERMS;
4018
4019	return (zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
4020	    ap->a_vpp, cnp->cn_cred, cnp->cn_thread));
4021}
4022
4023static int
4024zfs_freebsd_remove(ap)
4025	struct vop_remove_args /* {
4026		struct vnode *a_dvp;
4027		struct vnode *a_vp;
4028		struct componentname *a_cnp;
4029	} */ *ap;
4030{
4031
4032	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
4033
4034	return (zfs_remove(ap->a_dvp, ap->a_cnp->cn_nameptr,
4035	    ap->a_cnp->cn_cred, NULL, 0));
4036}
4037
4038static int
4039zfs_freebsd_mkdir(ap)
4040	struct vop_mkdir_args /* {
4041		struct vnode *a_dvp;
4042		struct vnode **a_vpp;
4043		struct componentname *a_cnp;
4044		struct vattr *a_vap;
4045	} */ *ap;
4046{
4047	vattr_t *vap = ap->a_vap;
4048
4049	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
4050
4051	vattr_init_mask(vap);
4052
4053	return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
4054	    ap->a_cnp->cn_cred, NULL, 0, NULL));
4055}
4056
4057static int
4058zfs_freebsd_rmdir(ap)
4059	struct vop_rmdir_args /* {
4060		struct vnode *a_dvp;
4061		struct vnode *a_vp;
4062		struct componentname *a_cnp;
4063	} */ *ap;
4064{
4065	struct componentname *cnp = ap->a_cnp;
4066
4067	ASSERT(cnp->cn_flags & SAVENAME);
4068
4069	return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred, NULL, 0));
4070}
4071
4072static int
4073zfs_freebsd_readdir(ap)
4074	struct vop_readdir_args /* {
4075		struct vnode *a_vp;
4076		struct uio *a_uio;
4077		struct ucred *a_cred;
4078		int *a_eofflag;
4079		int *a_ncookies;
4080		u_long **a_cookies;
4081	} */ *ap;
4082{
4083
4084	return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
4085	    ap->a_ncookies, ap->a_cookies));
4086}
4087
4088static int
4089zfs_freebsd_fsync(ap)
4090	struct vop_fsync_args /* {
4091		struct vnode *a_vp;
4092		int a_waitfor;
4093		struct thread *a_td;
4094	} */ *ap;
4095{
4096
4097	vop_stdfsync(ap);
4098	return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL));
4099}
4100
4101static int
4102zfs_freebsd_getattr(ap)
4103	struct vop_getattr_args /* {
4104		struct vnode *a_vp;
4105		struct vattr *a_vap;
4106		struct ucred *a_cred;
4107		struct thread *a_td;
4108	} */ *ap;
4109{
4110	vattr_t *vap = ap->a_vap;
4111	xvattr_t xvap;
4112	u_long fflags = 0;
4113	int error;
4114
4115	xva_init(&xvap);
4116	xvap.xva_vattr = *vap;
4117	xvap.xva_vattr.va_mask |= AT_XVATTR;
4118
4119	/* Request the ZFS flags that are mapped to chflags(2) flags below. */
4120	/* XXX: what about SF_SETTABLE? */
4121	XVA_SET_REQ(&xvap, XAT_IMMUTABLE);
4122	XVA_SET_REQ(&xvap, XAT_APPENDONLY);
4123	XVA_SET_REQ(&xvap, XAT_NOUNLINK);
4124	XVA_SET_REQ(&xvap, XAT_NODUMP);
4125	error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
4126	if (error != 0)
4127		return (error);
4128
4129	/* Convert ZFS xattr into chflags. */
4130#define	FLAG_CHECK(fflag, xflag, xfield)	do {			\
4131	if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0)		\
4132		fflags |= (fflag);					\
4133} while (0)
4134	FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE,
4135	    xvap.xva_xoptattrs.xoa_immutable);
4136	FLAG_CHECK(SF_APPEND, XAT_APPENDONLY,
4137	    xvap.xva_xoptattrs.xoa_appendonly);
4138	FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
4139	    xvap.xva_xoptattrs.xoa_nounlink);
4140	FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
4141	    xvap.xva_xoptattrs.xoa_nodump);
4142#undef	FLAG_CHECK
4143	*vap = xvap.xva_vattr;
4144	vap->va_flags = fflags;
4145	return (0);
4146}
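
/*
 * Illustrative userland sketch (not part of this file's build): the
 * FLAG_CHECK translation above is what makes a ZFS-immutable file show up
 * with SF_IMMUTABLE in st_flags ("schg" in ls -lo output).  The path below
 * is hypothetical; <sys/stat.h> and <stdio.h> are assumed.
 */
#if 0
static void
show_immutable_example(void)
{
	struct stat sb;

	if (stat("/tank/protected.conf", &sb) == 0 &&
	    (sb.st_flags & SF_IMMUTABLE) != 0)
		printf("immutable (schg) is set\n");
}
#endif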
4147
4148static int
4149zfs_freebsd_setattr(ap)
4150	struct vop_setattr_args /* {
4151		struct vnode *a_vp;
4152		struct vattr *a_vap;
4153		struct ucred *a_cred;
4154		struct thread *a_td;
4155	} */ *ap;
4156{
4157	vnode_t *vp = ap->a_vp;
4158	vattr_t *vap = ap->a_vap;
4159	cred_t *cred = ap->a_cred;
4160	xvattr_t xvap;
4161	u_long fflags;
4162	uint64_t zflags;
4163
4164	vattr_init_mask(vap);
4165	vap->va_mask &= ~AT_NOSET;
4166
4167	xva_init(&xvap);
4168	xvap.xva_vattr = *vap;
4169
4170	zflags = VTOZ(vp)->z_phys->zp_flags;
4171
4172	if (vap->va_flags != VNOVAL) {
4173		int error;
4174
4175		fflags = vap->va_flags;
4176		if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_NODUMP)) != 0)
4177			return (EOPNOTSUPP);
4178		/*
4179		 * Callers may only modify the file flags on objects they
4180		 * have VADMIN rights for.
4181		 */
4182		if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0)
4183			return (error);
4184		/*
4185		 * Unprivileged processes are not permitted to unset system
4186		 * flags, or modify flags if any system flags are set.
4187		 * Privileged non-jail processes may not modify system flags
4188		 * if securelevel > 0 and any existing system flags are set.
4189		 * Privileged jail processes behave like privileged non-jail
4190		 * processes if the security.jail.chflags_allowed sysctl
4191		 * is non-zero; otherwise, they behave like unprivileged
4192		 * processes.
4193		 */
4194		if (priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0) == 0) {
4195			if (zflags &
4196			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
4197				error = securelevel_gt(cred, 0);
4198				if (error)
4199					return (error);
4200			}
4201		} else {
4202			if (zflags &
4203			    (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) {
4204				return (EPERM);
4205			}
4206			if (fflags &
4207			    (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) {
4208				return (EPERM);
4209			}
4210		}
4211
4212#define	FLAG_CHANGE(fflag, zflag, xflag, xfield)	do {		\
4213	if (((fflags & (fflag)) && !(zflags & (zflag))) ||		\
4214	    ((zflags & (zflag)) && !(fflags & (fflag)))) {		\
4215		XVA_SET_REQ(&xvap, (xflag));				\
4216		(xfield) = ((fflags & (fflag)) != 0);			\
4217	}								\
4218} while (0)
4219		/* Convert chflags into ZFS-type flags. */
4220		/* XXX: what about SF_SETTABLE? */
4221		FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE,
4222		    xvap.xva_xoptattrs.xoa_immutable);
4223		FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY,
4224		    xvap.xva_xoptattrs.xoa_appendonly);
4225		FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
4226		    xvap.xva_xoptattrs.xoa_nounlink);
4227		FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
4228		    xvap.xva_xoptattrs.xoa_nodump);
4229#undef	FLAG_CHANGE
4230	}
4231	return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL));
4232}
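
/*
 * Illustrative userland sketch (not part of this file's build): a
 * chflags(2) call arrives here as SF_IMMUTABLE in va_flags, and FLAG_CHANGE
 * above requests XAT_IMMUTABLE, which zfs_setattr() records as the
 * ZFS_IMMUTABLE flag.  The path below is hypothetical; the usual userland
 * headers (<sys/stat.h>, <unistd.h>, <err.h>) are assumed.
 */
#if 0
static void
set_immutable_example(void)
{
	if (chflags("/tank/protected.conf", SF_IMMUTABLE) == -1)
		err(1, "chflags");
}
#endif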
4233
4234static int
4235zfs_freebsd_rename(ap)
4236	struct vop_rename_args  /* {
4237		struct vnode *a_fdvp;
4238		struct vnode *a_fvp;
4239		struct componentname *a_fcnp;
4240		struct vnode *a_tdvp;
4241		struct vnode *a_tvp;
4242		struct componentname *a_tcnp;
4243	} */ *ap;
4244{
4245	vnode_t *fdvp = ap->a_fdvp;
4246	vnode_t *fvp = ap->a_fvp;
4247	vnode_t *tdvp = ap->a_tdvp;
4248	vnode_t *tvp = ap->a_tvp;
4249	int error;
4250
4251	ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
4252	ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
4253
4254	error = zfs_rename(fdvp, ap->a_fcnp->cn_nameptr, tdvp,
4255	    ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred, NULL, 0);
4256
4257	if (tdvp == tvp)
4258		VN_RELE(tdvp);
4259	else
4260		VN_URELE(tdvp);
4261	if (tvp)
4262		VN_URELE(tvp);
4263	VN_RELE(fdvp);
4264	VN_RELE(fvp);
4265
4266	return (error);
4267}
4268
4269static int
4270zfs_freebsd_symlink(ap)
4271	struct vop_symlink_args /* {
4272		struct vnode *a_dvp;
4273		struct vnode **a_vpp;
4274		struct componentname *a_cnp;
4275		struct vattr *a_vap;
4276		char *a_target;
4277	} */ *ap;
4278{
4279	struct componentname *cnp = ap->a_cnp;
4280	vattr_t *vap = ap->a_vap;
4281
4282	ASSERT(cnp->cn_flags & SAVENAME);
4283
4284	vap->va_type = VLNK;	/* FreeBSD: Syscall only sets va_mode. */
4285	vattr_init_mask(vap);
4286
4287	return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
4288	    ap->a_target, cnp->cn_cred, cnp->cn_thread));
4289}
4290
4291static int
4292zfs_freebsd_readlink(ap)
4293	struct vop_readlink_args /* {
4294		struct vnode *a_vp;
4295		struct uio *a_uio;
4296		struct ucred *a_cred;
4297	} */ *ap;
4298{
4299
4300	return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL));
4301}
4302
4303static int
4304zfs_freebsd_link(ap)
4305	struct vop_link_args /* {
4306		struct vnode *a_tdvp;
4307		struct vnode *a_vp;
4308		struct componentname *a_cnp;
4309	} */ *ap;
4310{
4311	struct componentname *cnp = ap->a_cnp;
4312
4313	ASSERT(cnp->cn_flags & SAVENAME);
4314
4315	return (zfs_link(ap->a_tdvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
4316}
4317
4318static int
4319zfs_freebsd_inactive(ap)
4320	struct vop_inactive_args /* {
4321		struct vnode *a_vp;
4322		struct thread *a_td;
4323	} */ *ap;
4324{
4325	vnode_t *vp = ap->a_vp;
4326
4327	zfs_inactive(vp, ap->a_td->td_ucred, NULL);
4328	return (0);
4329}
4330
4331static void
4332zfs_reclaim_complete(void *arg, int pending)
4333{
4334	znode_t	*zp = arg;
4335	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4336
4337	ZFS_LOG(1, "zp=%p", zp);
4338	ZFS_OBJ_HOLD_ENTER(zfsvfs, zp->z_id);
4339	zfs_znode_dmu_fini(zp);
4340	ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
4341	zfs_znode_free(zp);
4342}
4343
4344static int
4345zfs_freebsd_reclaim(ap)
4346	struct vop_reclaim_args /* {
4347		struct vnode *a_vp;
4348		struct thread *a_td;
4349	} */ *ap;
4350{
4351	vnode_t	*vp = ap->a_vp;
4352	znode_t	*zp = VTOZ(vp);
4353	zfsvfs_t *zfsvfs;
4354
4355	ASSERT(zp != NULL);
4356
4357	/*
4358	 * Destroy the vm object and flush associated pages.
4359	 */
4360	vnode_destroy_vobject(vp);
4361
4362	mutex_enter(&zp->z_lock);
4363	ASSERT(zp->z_phys);
4364	ZTOV(zp) = NULL;
4365	if (!zp->z_unlinked) {
4366		int locked;
4367
4368		zfsvfs = zp->z_zfsvfs;
4369		mutex_exit(&zp->z_lock);
4370		locked = MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)) ? 2 :
4371		    ZFS_OBJ_HOLD_TRYENTER(zfsvfs, zp->z_id);
4372		if (locked == 0) {
4373			/*
4374			 * Lock can't be obtained due to deadlock possibility,
4375			 * so defer znode destruction.
4376			 */
4377			TASK_INIT(&zp->z_task, 0, zfs_reclaim_complete, zp);
4378			taskqueue_enqueue(taskqueue_thread, &zp->z_task);
4379		} else {
4380			zfs_znode_dmu_fini(zp);
4381			if (locked == 1)
4382				ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
4383			zfs_znode_free(zp);
4384		}
4385	} else {
4386		mutex_exit(&zp->z_lock);
4387	}
4388	VI_LOCK(vp);
4389	vp->v_data = NULL;
4390	ASSERT(vp->v_holdcnt >= 1);
4391	VI_UNLOCK(vp);
4392	return (0);
4393}
4394
4395static int
4396zfs_freebsd_fid(ap)
4397	struct vop_fid_args /* {
4398		struct vnode *a_vp;
4399		struct fid *a_fid;
4400	} */ *ap;
4401{
4402
4403	return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
4404}
4405
4406static int
4407zfs_freebsd_pathconf(ap)
4408	struct vop_pathconf_args /* {
4409		struct vnode *a_vp;
4410		int a_name;
4411		register_t *a_retval;
4412	} */ *ap;
4413{
4414	ulong_t val;
4415	int error;
4416
4417	error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL);
4418	if (error == 0)
4419		*ap->a_retval = val;
4420	else if (error == EOPNOTSUPP)
4421		error = vop_stdpathconf(ap);
4422	return (error);
4423}
4424
4425/*
4426 * Each FreeBSD extended attribute namespace maps to a file name prefix for
4427 * the corresponding ZFS extended attribute name:
4428 *
4429 *	NAMESPACE	PREFIX
4430 *	system		freebsd:system:
4431 *	user		(none, can be used to access ZFS fsattr(5) attributes
4432 *			created on Solaris)
4433 */
4434static int
4435zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
4436    size_t size)
4437{
4438	const char *namespace, *prefix, *suffix;
4439
4440	/* We don't allow '/' character in attribute name. */
4441	if (strchr(name, '/') != NULL)
4442		return (EINVAL);
4443	/* We don't allow attribute names that start with "freebsd:" string. */
4444	if (strncmp(name, "freebsd:", 8) == 0)
4445		return (EINVAL);
4446
4447	bzero(attrname, size);
4448
4449	switch (attrnamespace) {
4450	case EXTATTR_NAMESPACE_USER:
4451#if 0
4452		prefix = "freebsd:";
4453		namespace = EXTATTR_NAMESPACE_USER_STRING;
4454		suffix = ":";
4455#else
4456		/*
4457		 * This is the default namespace by which we can access all
4458		 * attributes created on Solaris.
4459		 */
4460		prefix = namespace = suffix = "";
4461#endif
4462		break;
4463	case EXTATTR_NAMESPACE_SYSTEM:
4464		prefix = "freebsd:";
4465		namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
4466		suffix = ":";
4467		break;
4468	case EXTATTR_NAMESPACE_EMPTY:
4469	default:
4470		return (EINVAL);
4471	}
4472	if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
4473	    name) >= size) {
4474		return (ENAMETOOLONG);
4475	}
4476	return (0);
4477}
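
/*
 * Illustrative sketch (disabled): the mappings implied by the table above.
 * The attribute name "md5" is made up for the example.
 */
#if 0
static void
attrname_mapping_example(void)
{
	char attrname[255];

	/* User namespace: the name passes through unchanged -> "md5". */
	(void)zfs_create_attrname(EXTATTR_NAMESPACE_USER, "md5",
	    attrname, sizeof(attrname));

	/* System namespace: gets prefixed -> "freebsd:system:md5". */
	(void)zfs_create_attrname(EXTATTR_NAMESPACE_SYSTEM, "md5",
	    attrname, sizeof(attrname));
}
#endif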
4478
4479/*
4480 * Vnode operating to retrieve a named extended attribute.
4481 */
4482static int
4483zfs_getextattr(struct vop_getextattr_args *ap)
4484/*
4485vop_getextattr {
4486	IN struct vnode *a_vp;
4487	IN int a_attrnamespace;
4488	IN const char *a_name;
4489	INOUT struct uio *a_uio;
4490	OUT size_t *a_size;
4491	IN struct ucred *a_cred;
4492	IN struct thread *a_td;
4493};
4494*/
4495{
4496	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
4497	struct thread *td = ap->a_td;
4498	struct nameidata nd;
4499	char attrname[255];
4500	struct vattr va;
4501	vnode_t *xvp = NULL, *vp;
4502	int error, flags;
4503
4504	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
4505	    ap->a_cred, ap->a_td, VREAD);
4506	if (error != 0)
4507		return (error);
4508
4509	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
4510	    sizeof(attrname));
4511	if (error != 0)
4512		return (error);
4513
4514	ZFS_ENTER(zfsvfs);
4515
4516	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
4517	    LOOKUP_XATTR);
4518	if (error != 0) {
4519		ZFS_EXIT(zfsvfs);
4520		return (error);
4521	}
4522
4523	flags = FREAD;
4524	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, attrname,
4525	    xvp, td);
4526	error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL);
4527	vp = nd.ni_vp;
4528	NDFREE(&nd, NDF_ONLY_PNBUF);
4529	if (error != 0) {
4530		if (error == ENOENT)
4531			error = ENOATTR;
4532		ZFS_EXIT(zfsvfs);
4533		return (error);
4534	}
4535
4536	if (ap->a_size != NULL) {
4537		error = VOP_GETATTR(vp, &va, ap->a_cred);
4538		if (error == 0)
4539			*ap->a_size = (size_t)va.va_size;
4540	} else if (ap->a_uio != NULL)
4541		error = VOP_READ(vp, ap->a_uio, IO_UNIT | IO_SYNC, ap->a_cred);
4542
4543	VOP_UNLOCK(vp, 0);
4544	vn_close(vp, flags, ap->a_cred, td);
4545	ZFS_EXIT(zfsvfs);
4546
4547	return (error);
4548}
4549
4550/*
4551 * Vnode operation to remove a named attribute.
4552 */
4553int
4554zfs_deleteextattr(struct vop_deleteextattr_args *ap)
4555/*
4556vop_deleteextattr {
4557	IN struct vnode *a_vp;
4558	IN int a_attrnamespace;
4559	IN const char *a_name;
4560	IN struct ucred *a_cred;
4561	IN struct thread *a_td;
4562};
4563*/
4564{
4565	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
4566	struct thread *td = ap->a_td;
4567	struct nameidata nd;
4568	char attrname[255];
4569	struct vattr va;
4570	vnode_t *xvp = NULL, *vp;
4571	int error, flags;
4572
4573	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
4574	    ap->a_cred, ap->a_td, VWRITE);
4575	if (error != 0)
4576		return (error);
4577
4578	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
4579	    sizeof(attrname));
4580	if (error != 0)
4581		return (error);
4582
4583	ZFS_ENTER(zfsvfs);
4584
4585	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
4586	    LOOKUP_XATTR);
4587	if (error != 0) {
4588		ZFS_EXIT(zfsvfs);
4589		return (error);
4590	}
4591
4592	NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF | MPSAFE,
4593	    UIO_SYSSPACE, attrname, xvp, td);
4594	error = namei(&nd);
4595	vp = nd.ni_vp;
4596	NDFREE(&nd, NDF_ONLY_PNBUF);
4597	if (error != 0) {
4598		if (error == ENOENT)
4599			error = ENOATTR;
4600		ZFS_EXIT(zfsvfs);
4601		return (error);
4602	}
4603	error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
4604
4605	vput(nd.ni_dvp);
4606	if (vp == nd.ni_dvp)
4607		vrele(vp);
4608	else
4609		vput(vp);
4610	ZFS_EXIT(zfsvfs);
4611
4612	return (error);
4613}
4614
4615/*
4616 * Vnode operation to set a named attribute.
4617 */
4618static int
4619zfs_setextattr(struct vop_setextattr_args *ap)
4620/*
4621vop_setextattr {
4622	IN struct vnode *a_vp;
4623	IN int a_attrnamespace;
4624	IN const char *a_name;
4625	INOUT struct uio *a_uio;
4626	IN struct ucred *a_cred;
4627	IN struct thread *a_td;
4628};
4629*/
4630{
4631	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
4632	struct thread *td = ap->a_td;
4633	struct nameidata nd;
4634	char attrname[255];
4635	struct vattr va;
4636	vnode_t *xvp = NULL, *vp;
4637	int error, flags;
4638
4639	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
4640	    ap->a_cred, ap->a_td, VWRITE);
4641	if (error != 0)
4642		return (error);
4643
4644	error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname,
4645	    sizeof(attrname));
4646	if (error != 0)
4647		return (error);
4648
4649	ZFS_ENTER(zfsvfs);
4650
4651	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
4652	    LOOKUP_XATTR | CREATE_XATTR_DIR);
4653	if (error != 0) {
4654		ZFS_EXIT(zfsvfs);
4655		return (error);
4656	}
4657
4658	flags = FFLAGS(O_WRONLY | O_CREAT);
4659	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, attrname,
4660	    xvp, td);
4661	error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL);
4662	vp = nd.ni_vp;
4663	NDFREE(&nd, NDF_ONLY_PNBUF);
4664	if (error != 0) {
4665		ZFS_EXIT(zfsvfs);
4666		return (error);
4667	}
4668
4669	VATTR_NULL(&va);
4670	va.va_size = 0;
4671	error = VOP_SETATTR(vp, &va, ap->a_cred);
4672	if (error == 0)
4673		error = VOP_WRITE(vp, ap->a_uio, IO_UNIT | IO_SYNC, ap->a_cred);
4674
4675	VOP_UNLOCK(vp, 0);
4676	vn_close(vp, flags, ap->a_cred, td);
4677	ZFS_EXIT(zfsvfs);
4678
4679	return (error);
4680}
4681
4682/*
4683 * Vnode operation to list the extended attributes of a vnode.
4684 */
4685static int
4686zfs_listextattr(struct vop_listextattr_args *ap)
4687/*
4688vop_listextattr {
4689	IN struct vnode *a_vp;
4690	IN int a_attrnamespace;
4691	INOUT struct uio *a_uio;
4692	OUT size_t *a_size;
4693	IN struct ucred *a_cred;
4694	IN struct thread *a_td;
4695};
4696*/
4697{
4698	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
4699	struct thread *td = ap->a_td;
4700	struct nameidata nd;
4701	char attrprefix[16];
4702	u_char dirbuf[sizeof(struct dirent)];
4703	struct dirent *dp;
4704	struct iovec aiov;
4705	struct uio auio, *uio = ap->a_uio;
4706	size_t *sizep = ap->a_size;
4707	size_t plen;
4708	vnode_t *xvp = NULL, *vp;
4709	int done, error, eof, pos;
4710
4711	error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace,
4712	    ap->a_cred, ap->a_td, VREAD);
4713	if (error)
4714		return (error);
4715
4716	error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix,
4717	    sizeof(attrprefix));
4718	if (error != 0)
4719		return (error);
4720	plen = strlen(attrprefix);
4721
4722	ZFS_ENTER(zfsvfs);
4723
4724	if (sizep != NULL)
4725		*sizep = 0;
4726
4727	error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td,
4728	    LOOKUP_XATTR);
4729	if (error != 0) {
4730		/*
4731		 * ENOATTR means that the EA directory does not yet exist,
4732		 * i.e. there are no extended attributes there.
4733		 */
4734		if (error == ENOATTR)
4735			error = 0;
4736		ZFS_EXIT(zfsvfs);
4737		return (error);
4738	}
4739
4740	NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED | MPSAFE,
4741	    UIO_SYSSPACE, ".", xvp, td);
4742	error = namei(&nd);
4743	vp = nd.ni_vp;
4744	NDFREE(&nd, NDF_ONLY_PNBUF);
4745	if (error != 0) {
4746		ZFS_EXIT(zfsvfs);
4747		return (error);
4748	}
4749
4750	auio.uio_iov = &aiov;
4751	auio.uio_iovcnt = 1;
4752	auio.uio_segflg = UIO_SYSSPACE;
4753	auio.uio_td = td;
4754	auio.uio_rw = UIO_READ;
4755	auio.uio_offset = 0;
4756
4757	do {
4758		u_char nlen;
4759
4760		aiov.iov_base = (void *)dirbuf;
4761		aiov.iov_len = sizeof(dirbuf);
4762		auio.uio_resid = sizeof(dirbuf);
4763		error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL);
4764		done = sizeof(dirbuf) - auio.uio_resid;
4765		if (error != 0)
4766			break;
4767		for (pos = 0; pos < done;) {
4768			dp = (struct dirent *)(dirbuf + pos);
4769			pos += dp->d_reclen;
4770			/*
4771			 * XXX: Temporarily we also accept DT_UNKNOWN, as this
4772			 * is what we get when the attribute was created on Solaris.
4773			 */
4774			if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN)
4775				continue;
4776			if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0)
4777				continue;
4778			else if (strncmp(dp->d_name, attrprefix, plen) != 0)
4779				continue;
4780			nlen = dp->d_namlen - plen;
4781			if (sizep != NULL)
4782				*sizep += 1 + nlen;
4783			else if (uio != NULL) {
4784				/*
4785				 * An extattr name entry is one length byte
4786				 * followed by the name (no NUL terminator).
4787				 */
4788				error = uiomove(&nlen, 1, uio->uio_rw, uio);
4789				if (error == 0) {
4790					error = uiomove(dp->d_name + plen, nlen,
4791					    uio->uio_rw, uio);
4792				}
4793				if (error != 0)
4794					break;
4795			}
4796		}
4797	} while (!eof && error == 0);
4798
4799	vput(vp);
4800	ZFS_EXIT(zfsvfs);
4801
4802	return (error);
4803}
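
/*
 * Illustrative userland sketch (not part of this file's build): consuming
 * the list produced above through extattr_list_file(2), where each entry is
 * one length byte followed by that many name bytes without a NUL terminator.
 * The path is hypothetical; <sys/extattr.h>, <string.h> and <stdio.h> are
 * assumed.
 */
#if 0
static void
list_user_extattrs_example(const char *path)
{
	char buf[1024], name[256];
	ssize_t len;
	u_char nlen;
	int pos;

	len = extattr_list_file(path, EXTATTR_NAMESPACE_USER, buf, sizeof(buf));
	for (pos = 0; pos < len; pos += 1 + nlen) {
		nlen = (u_char)buf[pos];
		memcpy(name, buf + pos + 1, nlen);
		name[nlen] = '\0';
		printf("%s\n", name);
	}
}
#endif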
4804
4805int
4806zfs_freebsd_getacl(ap)
4807	struct vop_getacl_args /* {
4808		struct vnode *vp;
4809		acl_type_t type;
4810		struct acl *aclp;
4811		struct ucred *cred;
4812		struct thread *td;
4813	} */ *ap;
4814{
4815	int		error;
4816	vsecattr_t      vsecattr;
4817
4818	if (ap->a_type != ACL_TYPE_NFS4)
4819		return (EOPNOTSUPP);
4820
4821	vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT;
4822	error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL);
4823	if (error != 0)
4824		return (error);
4824
4825	error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt);
4826        if (vsecattr.vsa_aclentp != NULL)
4827                kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz);
4828
4829        return (error);
4830}
4831
4832int
4833zfs_freebsd_setacl(ap)
4834	struct vop_setacl_args /* {
4835		struct vnode *vp;
4836		acl_type_t type;
4837		struct acl *aclp;
4838		struct ucred *cred;
4839		struct thread *td;
4840	} */ *ap;
4841{
4842	int		error;
4843	vsecattr_t      vsecattr;
4844	int		aclbsize;	/* size of acl list in bytes */
4845	aclent_t	*aaclp;
4846
4847	if (ap->a_type != ACL_TYPE_NFS4)
4848		return (EOPNOTSUPP);
4849
4850	if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
4851		return (EINVAL);
4852
4853	/*
4854	 * With NFS4 ACLs, chmod(2) may need to add entries, splitting every
4855	 * existing entry into two and appending the "canonical six" entries
4856	 * at the end.  Don't allow setting an ACL that would cause chmod(2)
4857	 * to run out of ACL entries.
4858	 */
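	/*
	 * Worked example (assuming ACL_MAX_ENTRIES is 254 on this branch):
	 * an ACL of 124 entries still fits after expansion, since
	 * 124 * 2 + 6 = 254, while 125 entries would need 256 slots and is
	 * rejected here with ENOSPC.
	 */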
4859	if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES)
4860		return (ENOSPC);
4861
4862	vsecattr.vsa_mask = VSA_ACE;
4863	aclbsize = ap->a_aclp->acl_cnt * sizeof(ace_t);
4864	vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP);
4865	aaclp = vsecattr.vsa_aclentp;
4866	vsecattr.vsa_aclentsz = aclbsize;
4867
4868	aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp);
4869	error = zfs_setsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL);
4870	kmem_free(aaclp, aclbsize);
4871
4872	return (error);
4873}
4874
4875int
4876zfs_freebsd_aclcheck(ap)
4877	struct vop_aclcheck_args /* {
4878		struct vnode *vp;
4879		acl_type_t type;
4880		struct acl *aclp;
4881		struct ucred *cred;
4882		struct thread *td;
4883	} */ *ap;
4884{
4885
4886	return (EOPNOTSUPP);
4887}
4888
4889struct vop_vector zfs_vnodeops;
4890struct vop_vector zfs_fifoops;
4891
4892struct vop_vector zfs_vnodeops = {
4893	.vop_default =		&default_vnodeops,
4894	.vop_inactive =		zfs_freebsd_inactive,
4895	.vop_reclaim =		zfs_freebsd_reclaim,
4896	.vop_access =		zfs_freebsd_access,
4897#ifdef FREEBSD_NAMECACHE
4898	.vop_lookup =		vfs_cache_lookup,
4899	.vop_cachedlookup =	zfs_freebsd_lookup,
4900#else
4901	.vop_lookup =		zfs_freebsd_lookup,
4902#endif
4903	.vop_getattr =		zfs_freebsd_getattr,
4904	.vop_setattr =		zfs_freebsd_setattr,
4905	.vop_create =		zfs_freebsd_create,
4906	.vop_mknod =		zfs_freebsd_create,
4907	.vop_mkdir =		zfs_freebsd_mkdir,
4908	.vop_readdir =		zfs_freebsd_readdir,
4909	.vop_fsync =		zfs_freebsd_fsync,
4910	.vop_open =		zfs_freebsd_open,
4911	.vop_close =		zfs_freebsd_close,
4912	.vop_rmdir =		zfs_freebsd_rmdir,
4913	.vop_ioctl =		zfs_freebsd_ioctl,
4914	.vop_link =		zfs_freebsd_link,
4915	.vop_symlink =		zfs_freebsd_symlink,
4916	.vop_readlink =		zfs_freebsd_readlink,
4917	.vop_read =		zfs_freebsd_read,
4918	.vop_write =		zfs_freebsd_write,
4919	.vop_remove =		zfs_freebsd_remove,
4920	.vop_rename =		zfs_freebsd_rename,
4921	.vop_pathconf =		zfs_freebsd_pathconf,
4922	.vop_bmap =		VOP_EOPNOTSUPP,
4923	.vop_fid =		zfs_freebsd_fid,
4924	.vop_getextattr =	zfs_getextattr,
4925	.vop_deleteextattr =	zfs_deleteextattr,
4926	.vop_setextattr =	zfs_setextattr,
4927	.vop_listextattr =	zfs_listextattr,
4928#ifdef notyet
4929	.vop_getacl =		zfs_freebsd_getacl,
4930	.vop_setacl =		zfs_freebsd_setacl,
4931	.vop_aclcheck =		zfs_freebsd_aclcheck,
4932#endif
4933};
4934
4935struct vop_vector zfs_fifoops = {
4936	.vop_default =		&fifo_specops,
4937	.vop_fsync =		VOP_PANIC,
4938	.vop_access =		zfs_freebsd_access,
4939	.vop_getattr =		zfs_freebsd_getattr,
4940	.vop_inactive =		zfs_freebsd_inactive,
4941	.vop_read =		VOP_PANIC,
4942	.vop_reclaim =		zfs_freebsd_reclaim,
4943	.vop_setattr =		zfs_freebsd_setattr,
4944	.vop_write =		VOP_PANIC,
4945	.vop_fid =		zfs_freebsd_fid,
4946#ifdef notyet
4947	.vop_getacl =		zfs_freebsd_getacl,
4948	.vop_setacl =		zfs_freebsd_setacl,
4949	.vop_aclcheck =		zfs_freebsd_aclcheck,
4950#endif
4951};
4952