zfs_vnops.c revision 177633
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26/* Portions Copyright 2007 Jeremy Teo */
27
28#pragma ident	"%Z%%M%	%I%	%E% SMI"
29
30#include <sys/types.h>
31#include <sys/param.h>
32#include <sys/time.h>
33#include <sys/systm.h>
34#include <sys/sysmacros.h>
35#include <sys/resource.h>
36#include <sys/vfs.h>
37#include <sys/vnode.h>
38#include <sys/file.h>
39#include <sys/stat.h>
40#include <sys/kmem.h>
41#include <sys/taskq.h>
42#include <sys/uio.h>
43#include <sys/atomic.h>
44#include <sys/namei.h>
45#include <sys/mman.h>
46#include <sys/cmn_err.h>
47#include <sys/errno.h>
48#include <sys/unistd.h>
49#include <sys/zfs_vfsops.h>
50#include <sys/zfs_dir.h>
51#include <sys/zfs_acl.h>
52#include <sys/zfs_ioctl.h>
53#include <sys/fs/zfs.h>
54#include <sys/dmu.h>
55#include <sys/spa.h>
56#include <sys/txg.h>
57#include <sys/dbuf.h>
58#include <sys/zap.h>
59#include <sys/dirent.h>
60#include <sys/policy.h>
61#include <sys/sunddi.h>
62#include <sys/filio.h>
63#include <sys/zfs_ctldir.h>
64#include <sys/dnlc.h>
65#include <sys/zfs_rlock.h>
66#include <sys/bio.h>
67#include <sys/buf.h>
68#include <sys/sf_buf.h>
69#include <sys/sched.h>
70
71/*
72 * Programming rules.
73 *
74 * Each vnode op performs some logical unit of work.  To do this, the ZPL must
75 * properly lock its in-core state, create a DMU transaction, do the work,
76 * record this work in the intent log (ZIL), commit the DMU transaction,
77 * and wait for the intent log to commit if it is a synchronous operation.
78 * Moreover, the vnode ops must work in both normal and log replay context.
79 * The ordering of events is important to avoid deadlocks and references
80 * to freed memory.  The example below illustrates the following Big Rules:
81 *
82 *  (1) A check must be made in each zfs thread for a mounted file system.
83 *	This is done avoiding races using ZFS_ENTER(zfsvfs).
84 *	A ZFS_EXIT(zfsvfs) is needed before all returns.
85 *
86 *  (2)	VN_RELE() should always be the last thing except for zil_commit()
87 *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
88 *	First, if it's the last reference, the vnode/znode
89 *	can be freed, so the zp may point to freed memory.  Second, the last
90 *	reference will call zfs_zinactive(), which may induce a lot of work --
91 *	pushing cached pages (which acquires range locks) and syncing out
92 *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
93 *	which could deadlock the system if you were already holding one.
94 *
95 *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
96 *	as they can span dmu_tx_assign() calls.
97 *
98 *  (4)	Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign().
99 *	In normal operation, this will be TXG_NOWAIT.  During ZIL replay,
100 *	it will be a specific txg.  Either way, dmu_tx_assign() never blocks.
101 *	This is critical because we don't want to block while holding locks.
102 *	Note, in particular, that if a lock is sometimes acquired before
103 *	the tx assigns, and sometimes after (e.g. z_lock), then failing to
104 *	use a non-blocking assign can deadlock the system.  The scenario:
105 *
106 *	Thread A has grabbed a lock before calling dmu_tx_assign().
107 *	Thread B is in an already-assigned tx, and blocks for this lock.
108 *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
109 *	forever, because the previous txg can't quiesce until B's tx commits.
110 *
111 *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
112 *	then drop all locks, call dmu_tx_wait(), and try again.
113 *
114 *  (5)	If the operation succeeded, generate the intent log entry for it
115 *	before dropping locks.  This ensures that the ordering of events
116 *	in the intent log matches the order in which they actually occurred.
117 *
118 *  (6)	At the end of each vnode op, the DMU tx must always commit,
119 *	regardless of whether there were any errors.
120 *
121 *  (7)	After dropping all locks, invoke zil_commit(zilog, seq, foid)
122 *	to ensure that synchronous semantics are provided when necessary.
123 *
124 * In general, this is how things should be ordered in each vnode op:
125 *
126 *	ZFS_ENTER(zfsvfs);		// exit if unmounted
127 * top:
128 *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may VN_HOLD())
129 *	rw_enter(...);			// grab any other locks you need
130 *	tx = dmu_tx_create(...);	// get DMU tx
131 *	dmu_tx_hold_*();		// hold each object you might modify
132 *	error = dmu_tx_assign(tx, zfsvfs->z_assign);	// try to assign
133 *	if (error) {
134 *		rw_exit(...);		// drop locks
135 *		zfs_dirent_unlock(dl);	// unlock directory entry
136 *		VN_RELE(...);		// release held vnodes
137 *		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
138 *			dmu_tx_wait(tx);
139 *			dmu_tx_abort(tx);
140 *			goto top;
141 *		}
142 *		dmu_tx_abort(tx);	// abort DMU tx
143 *		ZFS_EXIT(zfsvfs);	// finished in zfs
144 *		return (error);		// really out of space
145 *	}
146 *	error = do_real_work();		// do whatever this VOP does
147 *	if (error == 0)
148 *		zfs_log_*(...);		// on success, make ZIL entry
149 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
150 *	rw_exit(...);			// drop locks
151 *	zfs_dirent_unlock(dl);		// unlock directory entry
152 *	VN_RELE(...);			// release held vnodes
153 *	zil_commit(zilog, seq, foid);	// synchronous when necessary
154 *	ZFS_EXIT(zfsvfs);		// finished in zfs
155 *	return (error);			// done, report error
156 */
157/* ARGSUSED */
158static int
159zfs_open(vnode_t **vpp, int flag, cred_t *cr)
160{
161	znode_t	*zp = VTOZ(*vpp);
162
163	/* Keep a count of the synchronous opens in the znode */
164	if (flag & (FSYNC | FDSYNC))
165		atomic_inc_32(&zp->z_sync_cnt);
166	return (0);
167}
168
169/* ARGSUSED */
170static int
171zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
172{
173	znode_t	*zp = VTOZ(vp);
174
175	/* Decrement the synchronous opens in the znode */
176	if (flag & (FSYNC | FDSYNC))
177		atomic_dec_32(&zp->z_sync_cnt);
178
179	/*
180	 * Clean up any locks held by this process on the vp.
181	 */
182	cleanlocks(vp, ddi_get_pid(), 0);
183	cleanshares(vp, ddi_get_pid());
184
185	return (0);
186}
187
188/*
189 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
190 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
191 */
192static int
193zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
194{
195	znode_t	*zp = VTOZ(vp);
196	uint64_t noff = (uint64_t)*off; /* new offset */
197	uint64_t file_sz;
198	int error;
199	boolean_t hole;
200
201	file_sz = zp->z_phys->zp_size;
202	if (noff >= file_sz)  {
203		return (ENXIO);
204	}
205
206	if (cmd == _FIO_SEEK_HOLE)
207		hole = B_TRUE;
208	else
209		hole = B_FALSE;
210
211	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
212
213	/* end of file? */
214	if ((error == ESRCH) || (noff > file_sz)) {
215		/*
216		 * Handle the virtual hole at the end of file.
217		 */
218		if (hole) {
219			*off = file_sz;
220			return (0);
221		}
222		return (ENXIO);
223	}
224
225	if (noff < *off)
226		return (error);
227	*off = noff;
228	return (error);
229}
230
231/* ARGSUSED */
232static int
233zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
234    int *rvalp)
235{
236	offset_t off;
237	int error;
238	zfsvfs_t *zfsvfs;
239
240	switch (com) {
241	    case _FIOFFS:
242		return (0);
243
244		/*
245		 * The following two ioctls are used by bfu.  Faking out,
246		 * necessary to avoid bfu errors.
247		 */
248	    case _FIOGDIO:
249	    case _FIOSDIO:
250		return (0);
251
252	    case _FIO_SEEK_DATA:
253	    case _FIO_SEEK_HOLE:
254		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
255			return (EFAULT);
256
257		zfsvfs = VTOZ(vp)->z_zfsvfs;
258		ZFS_ENTER(zfsvfs);
259
260		/* offset parameter is in/out */
261		error = zfs_holey(vp, com, &off);
262		ZFS_EXIT(zfsvfs);
263		if (error)
264			return (error);
265		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
266			return (EFAULT);
267		return (0);
268	}
269	return (ENOTTY);
270}
271
272/*
273 * When a file is memory mapped, we must keep the IO data synchronized
274 * between the DMU cache and the memory mapped pages.  What this means:
275 *
276 * On Write:	If we find a memory mapped page, we write to *both*
277 *		the page and the dmu buffer.
278 *
279 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
280 *	the file is memory mapped.
281 */
282static int
283mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx)
284{
285	znode_t *zp = VTOZ(vp);
286	objset_t *os = zp->z_zfsvfs->z_os;
287	vm_object_t obj;
288	vm_page_t m;
289	struct sf_buf *sf;
290	int64_t start, off;
291	int len = nbytes;
292	int error = 0;
293	uint64_t dirbytes;
294
295	ASSERT(vp->v_mount != NULL);
296	obj = vp->v_object;
297	ASSERT(obj != NULL);
298
299	start = uio->uio_loffset;
300	off = start & PAGEOFFSET;
301	dirbytes = 0;
302	VM_OBJECT_LOCK(obj);
303	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
304		uint64_t bytes = MIN(PAGESIZE - off, len);
305		uint64_t fsize;
306
307again:
308		if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
309		    vm_page_is_valid(m, (vm_offset_t)off, bytes)) {
310			uint64_t woff;
311			caddr_t va;
312
313			if (vm_page_sleep_if_busy(m, FALSE, "zfsmwb"))
314				goto again;
315			fsize = obj->un_pager.vnp.vnp_size;
316			vm_page_busy(m);
317			vm_page_lock_queues();
318			vm_page_undirty(m);
319			vm_page_unlock_queues();
320			VM_OBJECT_UNLOCK(obj);
321			if (dirbytes > 0) {
322				error = dmu_write_uio(os, zp->z_id, uio,
323				    dirbytes, tx);
324				dirbytes = 0;
325			}
326			if (error == 0) {
327				sched_pin();
328				sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
329				va = (caddr_t)sf_buf_kva(sf);
330				woff = uio->uio_loffset - off;
331				error = uiomove(va + off, bytes, UIO_WRITE, uio);
332				/*
333				 * The uiomove() above could have been partially
334				 * successful, that's why we call dmu_write()
335				 * below unconditionally. The page was marked
336				 * non-dirty above and we would lose the changes
337				 * without doing so. If the uiomove() failed
338				 * entirely, well, we just write what we got
339				 * before one more time.
340				 */
341				dmu_write(os, zp->z_id, woff,
342				    MIN(PAGESIZE, fsize - woff), va, tx);
343				sf_buf_free(sf);
344				sched_unpin();
345			}
346			VM_OBJECT_LOCK(obj);
347			vm_page_wakeup(m);
348		} else {
349			if (__predict_false(obj->cache != NULL)) {
350				vm_page_cache_free(obj, OFF_TO_IDX(start),
351				    OFF_TO_IDX(start) + 1);
352			}
353			dirbytes += bytes;
354		}
355		len -= bytes;
356		off = 0;
357		if (error)
358			break;
359	}
360	VM_OBJECT_UNLOCK(obj);
361	if (error == 0 && dirbytes > 0)
362		error = dmu_write_uio(os, zp->z_id, uio, dirbytes, tx);
363	return (error);
364}
365
366/*
367 * When a file is memory mapped, we must keep the IO data synchronized
368 * between the DMU cache and the memory mapped pages.  What this means:
369 *
370 * On Read:	We "read" preferentially from memory mapped pages,
371 *		else we default from the dmu buffer.
372 *
373 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
374 *	the file is memory mapped.
375 */
376static int
377mappedread(vnode_t *vp, int nbytes, uio_t *uio)
378{
379	znode_t *zp = VTOZ(vp);
380	objset_t *os = zp->z_zfsvfs->z_os;
381	vm_object_t obj;
382	vm_page_t m;
383	struct sf_buf *sf;
384	int64_t start, off;
385	caddr_t va;
386	int len = nbytes;
387	int error = 0;
388	uint64_t dirbytes;
389
390	ASSERT(vp->v_mount != NULL);
391	obj = vp->v_object;
392	ASSERT(obj != NULL);
393
394	start = uio->uio_loffset;
395	off = start & PAGEOFFSET;
396	dirbytes = 0;
397	VM_OBJECT_LOCK(obj);
398	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
399		uint64_t bytes = MIN(PAGESIZE - off, len);
400
401again:
402		if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
403		    vm_page_is_valid(m, (vm_offset_t)off, bytes)) {
404			if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb"))
405				goto again;
406			vm_page_busy(m);
407			VM_OBJECT_UNLOCK(obj);
408			if (dirbytes > 0) {
409				error = dmu_read_uio(os, zp->z_id, uio,
410				    dirbytes);
411				dirbytes = 0;
412			}
413			if (error == 0) {
414				sched_pin();
415				sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
416				va = (caddr_t)sf_buf_kva(sf);
417				error = uiomove(va + off, bytes, UIO_READ, uio);
418				sf_buf_free(sf);
419				sched_unpin();
420			}
421			VM_OBJECT_LOCK(obj);
422			vm_page_wakeup(m);
423		} else if (m != NULL && uio->uio_segflg == UIO_NOCOPY) {
424			/*
425			 * The code below is here to make sendfile(2) work
426			 * correctly with ZFS. As pointed out by ups@
427			 * sendfile(2) should be changed to use VOP_GETPAGES(),
428			 * but it pessimize performance of sendfile/UFS, that's
429			 * why I handle this special case in ZFS code.
430			 */
431			if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb"))
432				goto again;
433			vm_page_busy(m);
434			VM_OBJECT_UNLOCK(obj);
435			if (dirbytes > 0) {
436				error = dmu_read_uio(os, zp->z_id, uio,
437				    dirbytes);
438				dirbytes = 0;
439			}
440			if (error == 0) {
441				sched_pin();
442				sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
443				va = (caddr_t)sf_buf_kva(sf);
444				error = dmu_read(os, zp->z_id, start + off,
445				    bytes, (void *)(va + off));
446				sf_buf_free(sf);
447				sched_unpin();
448			}
449			VM_OBJECT_LOCK(obj);
450			vm_page_wakeup(m);
451			if (error == 0)
452				uio->uio_resid -= bytes;
453		} else {
454			dirbytes += bytes;
455		}
456		len -= bytes;
457		off = 0;
458		if (error)
459			break;
460	}
461	VM_OBJECT_UNLOCK(obj);
462	if (error == 0 && dirbytes > 0)
463		error = dmu_read_uio(os, zp->z_id, uio, dirbytes);
464	return (error);
465}
466
467offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
468
469/*
470 * Read bytes from specified file into supplied buffer.
471 *
472 *	IN:	vp	- vnode of file to be read from.
473 *		uio	- structure supplying read location, range info,
474 *			  and return buffer.
475 *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
476 *		cr	- credentials of caller.
477 *
478 *	OUT:	uio	- updated offset and range, buffer filled.
479 *
480 *	RETURN:	0 if success
481 *		error code if failure
482 *
483 * Side Effects:
484 *	vp - atime updated if byte count > 0
485 */
486/* ARGSUSED */
487static int
488zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
489{
490	znode_t		*zp = VTOZ(vp);
491	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
492	objset_t	*os = zfsvfs->z_os;
493	ssize_t		n, nbytes;
494	int		error;
495	rl_t		*rl;
496
497	ZFS_ENTER(zfsvfs);
498
499	/*
500	 * Validate file offset
501	 */
502	if (uio->uio_loffset < (offset_t)0) {
503		ZFS_EXIT(zfsvfs);
504		return (EINVAL);
505	}
506
507	/*
508	 * Fasttrack empty reads
509	 */
510	if (uio->uio_resid == 0) {
511		ZFS_EXIT(zfsvfs);
512		return (0);
513	}
514
515	/*
516	 * Check for mandatory locks
517	 */
518	if (MANDMODE((mode_t)zp->z_phys->zp_mode)) {
519		if (error = chklock(vp, FREAD,
520		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
521			ZFS_EXIT(zfsvfs);
522			return (error);
523		}
524	}
525
526	/*
527	 * If we're in FRSYNC mode, sync out this znode before reading it.
528	 */
529	if (ioflag & FRSYNC)
530		zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
531
532	/*
533	 * Lock the range against changes.
534	 */
535	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
536
537	/*
538	 * If we are reading past end-of-file we can skip
539	 * to the end; but we might still need to set atime.
540	 */
541	if (uio->uio_loffset >= zp->z_phys->zp_size) {
542		error = 0;
543		goto out;
544	}
545
546	ASSERT(uio->uio_loffset < zp->z_phys->zp_size);
547	n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset);
548
549	while (n > 0) {
550		nbytes = MIN(n, zfs_read_chunk_size -
551		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
552
553		if (vn_has_cached_data(vp))
554			error = mappedread(vp, nbytes, uio);
555		else
556			error = dmu_read_uio(os, zp->z_id, uio, nbytes);
557		if (error)
558			break;
559
560		n -= nbytes;
561	}
562
563out:
564	zfs_range_unlock(rl);
565
566	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
567	ZFS_EXIT(zfsvfs);
568	return (error);
569}
570
571/*
572 * Fault in the pages of the first n bytes specified by the uio structure.
573 * 1 byte in each page is touched and the uio struct is unmodified.
574 * Any error will exit this routine as this is only a best
575 * attempt to get the pages resident. This is a copy of ufs_trans_touch().
576 */
577static void
578zfs_prefault_write(ssize_t n, struct uio *uio)
579{
580	struct iovec *iov;
581	ulong_t cnt, incr;
582	caddr_t p;
583
584	if (uio->uio_segflg != UIO_USERSPACE)
585		return;
586
587	iov = uio->uio_iov;
588
589	while (n) {
590		cnt = MIN(iov->iov_len, n);
591		if (cnt == 0) {
592			/* empty iov entry */
593			iov++;
594			continue;
595		}
596		n -= cnt;
597		/*
598		 * touch each page in this segment.
599		 */
600		p = iov->iov_base;
601		while (cnt) {
602			if (fubyte(p) == -1)
603				return;
604			incr = MIN(cnt, PAGESIZE);
605			p += incr;
606			cnt -= incr;
607		}
608		/*
609		 * touch the last byte in case it straddles a page.
610		 */
611		p--;
612		if (fubyte(p) == -1)
613			return;
614		iov++;
615	}
616}
617
618/*
619 * Write the bytes to a file.
620 *
621 *	IN:	vp	- vnode of file to be written to.
622 *		uio	- structure supplying write location, range info,
623 *			  and data buffer.
624 *		ioflag	- IO_APPEND flag set if in append mode.
625 *		cr	- credentials of caller.
626 *
627 *	OUT:	uio	- updated offset and range.
628 *
629 *	RETURN:	0 if success
630 *		error code if failure
631 *
632 * Timestamps:
633 *	vp - ctime|mtime updated if byte count > 0
634 */
635/* ARGSUSED */
636static int
637zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
638{
639	znode_t		*zp = VTOZ(vp);
640	rlim64_t	limit = MAXOFFSET_T;
641	ssize_t		start_resid = uio->uio_resid;
642	ssize_t		tx_bytes;
643	uint64_t	end_size;
644	dmu_tx_t	*tx;
645	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
646	zilog_t		*zilog = zfsvfs->z_log;
647	offset_t	woff;
648	ssize_t		n, nbytes;
649	rl_t		*rl;
650	int		max_blksz = zfsvfs->z_max_blksz;
651	int		error;
652
653	/*
654	 * Fasttrack empty write
655	 */
656	n = start_resid;
657	if (n == 0)
658		return (0);
659
660	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
661		limit = MAXOFFSET_T;
662
663	ZFS_ENTER(zfsvfs);
664
665	/*
666	 * Pre-fault the pages to ensure slow (eg NFS) pages
667	 * don't hold up txg.
668	 */
669	zfs_prefault_write(n, uio);
670
671	/*
672	 * If in append mode, set the io offset pointer to eof.
673	 */
674	if (ioflag & IO_APPEND) {
675		/*
676		 * Range lock for a file append:
677		 * The value for the start of range will be determined by
678		 * zfs_range_lock() (to guarantee append semantics).
679		 * If this write will cause the block size to increase,
680		 * zfs_range_lock() will lock the entire file, so we must
681		 * later reduce the range after we grow the block size.
682		 */
683		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
684		if (rl->r_len == UINT64_MAX) {
685			/* overlocked, zp_size can't change */
686			woff = uio->uio_loffset = zp->z_phys->zp_size;
687		} else {
688			woff = uio->uio_loffset = rl->r_off;
689		}
690	} else {
691		woff = uio->uio_loffset;
692		/*
693		 * Validate file offset
694		 */
695		if (woff < 0) {
696			ZFS_EXIT(zfsvfs);
697			return (EINVAL);
698		}
699
700		/*
701		 * If we need to grow the block size then zfs_range_lock()
702		 * will lock a wider range than we request here.
703		 * Later after growing the block size we reduce the range.
704		 */
705		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
706	}
707
708	if (woff >= limit) {
709		zfs_range_unlock(rl);
710		ZFS_EXIT(zfsvfs);
711		return (EFBIG);
712	}
713
714	if ((woff + n) > limit || woff > (limit - n))
715		n = limit - woff;
716
717	/*
718	 * Check for mandatory locks
719	 */
720	if (MANDMODE((mode_t)zp->z_phys->zp_mode) &&
721	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
722		zfs_range_unlock(rl);
723		ZFS_EXIT(zfsvfs);
724		return (error);
725	}
726	end_size = MAX(zp->z_phys->zp_size, woff + n);
727
728	/*
729	 * Write the file in reasonable size chunks.  Each chunk is written
730	 * in a separate transaction; this keeps the intent log records small
731	 * and allows us to do more fine-grained space accounting.
732	 */
733	while (n > 0) {
734		/*
735		 * Start a transaction.
736		 */
737		woff = uio->uio_loffset;
738		tx = dmu_tx_create(zfsvfs->z_os);
739		dmu_tx_hold_bonus(tx, zp->z_id);
740		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
741		error = dmu_tx_assign(tx, zfsvfs->z_assign);
742		if (error) {
743			if (error == ERESTART &&
744			    zfsvfs->z_assign == TXG_NOWAIT) {
745				dmu_tx_wait(tx);
746				dmu_tx_abort(tx);
747				continue;
748			}
749			dmu_tx_abort(tx);
750			break;
751		}
752
753		/*
754		 * If zfs_range_lock() over-locked we grow the blocksize
755		 * and then reduce the lock range.  This will only happen
756		 * on the first iteration since zfs_range_reduce() will
757		 * shrink down r_len to the appropriate size.
758		 */
759		if (rl->r_len == UINT64_MAX) {
760			uint64_t new_blksz;
761
762			if (zp->z_blksz > max_blksz) {
763				ASSERT(!ISP2(zp->z_blksz));
764				new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
765			} else {
766				new_blksz = MIN(end_size, max_blksz);
767			}
768			zfs_grow_blocksize(zp, new_blksz, tx);
769			zfs_range_reduce(rl, woff, n);
770		}
771
772		/*
773		 * XXX - should we really limit each write to z_max_blksz?
774		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
775		 */
776		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
777
778		if (woff + nbytes > zp->z_phys->zp_size)
779			vnode_pager_setsize(vp, woff + nbytes);
780
781		rw_enter(&zp->z_map_lock, RW_READER);
782
783		tx_bytes = uio->uio_resid;
784		if (vn_has_cached_data(vp)) {
785			rw_exit(&zp->z_map_lock);
786			error = mappedwrite(vp, nbytes, uio, tx);
787		} else {
788			error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
789			    uio, nbytes, tx);
790			rw_exit(&zp->z_map_lock);
791		}
792		tx_bytes -= uio->uio_resid;
793
794		/*
795		 * If we made no progress, we're done.  If we made even
796		 * partial progress, update the znode and ZIL accordingly.
797		 */
798		if (tx_bytes == 0) {
799			dmu_tx_commit(tx);
800			ASSERT(error != 0);
801			break;
802		}
803
804		/*
805		 * Clear Set-UID/Set-GID bits on successful write if not
806		 * privileged and at least one of the excute bits is set.
807		 *
808		 * It would be nice to to this after all writes have
809		 * been done, but that would still expose the ISUID/ISGID
810		 * to another app after the partial write is committed.
811		 */
812		mutex_enter(&zp->z_acl_lock);
813		if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) |
814		    (S_IXUSR >> 6))) != 0 &&
815		    (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 &&
816		    secpolicy_vnode_setid_retain(cr,
817		    (zp->z_phys->zp_mode & S_ISUID) != 0 &&
818		    zp->z_phys->zp_uid == 0) != 0) {
819			    zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID);
820		}
821		mutex_exit(&zp->z_acl_lock);
822
823		/*
824		 * Update time stamp.  NOTE: This marks the bonus buffer as
825		 * dirty, so we don't have to do it again for zp_size.
826		 */
827		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
828
829		/*
830		 * Update the file size (zp_size) if it has changed;
831		 * account for possible concurrent updates.
832		 */
833		while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset)
834			(void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
835			    uio->uio_loffset);
836		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
837		dmu_tx_commit(tx);
838
839		if (error != 0)
840			break;
841		ASSERT(tx_bytes == nbytes);
842		n -= nbytes;
843	}
844
845	zfs_range_unlock(rl);
846
847	/*
848	 * If we're in replay mode, or we made no progress, return error.
849	 * Otherwise, it's at least a partial write, so it's successful.
850	 */
851	if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) {
852		ZFS_EXIT(zfsvfs);
853		return (error);
854	}
855
856	if (ioflag & (FSYNC | FDSYNC))
857		zil_commit(zilog, zp->z_last_itx, zp->z_id);
858
859	ZFS_EXIT(zfsvfs);
860	return (0);
861}
862
863void
864zfs_get_done(dmu_buf_t *db, void *vzgd)
865{
866	zgd_t *zgd = (zgd_t *)vzgd;
867	rl_t *rl = zgd->zgd_rl;
868	vnode_t *vp = ZTOV(rl->r_zp);
869	int vfslocked;
870
871	vfslocked = VFS_LOCK_GIANT(vp->v_vfsp);
872	dmu_buf_rele(db, vzgd);
873	zfs_range_unlock(rl);
874	VN_RELE(vp);
875	zil_add_vdev(zgd->zgd_zilog, DVA_GET_VDEV(BP_IDENTITY(zgd->zgd_bp)));
876	kmem_free(zgd, sizeof (zgd_t));
877	VFS_UNLOCK_GIANT(vfslocked);
878}
879
880/*
881 * Get data to generate a TX_WRITE intent log record.
882 */
883int
884zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
885{
886	zfsvfs_t *zfsvfs = arg;
887	objset_t *os = zfsvfs->z_os;
888	znode_t *zp;
889	uint64_t off = lr->lr_offset;
890	dmu_buf_t *db;
891	rl_t *rl;
892	zgd_t *zgd;
893	int dlen = lr->lr_length;		/* length of user data */
894	int error = 0;
895
896	ASSERT(zio);
897	ASSERT(dlen != 0);
898
899	/*
900	 * Nothing to do if the file has been removed
901	 */
902	if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
903		return (ENOENT);
904	if (zp->z_unlinked) {
905		VN_RELE(ZTOV(zp));
906		return (ENOENT);
907	}
908
909	/*
910	 * Write records come in two flavors: immediate and indirect.
911	 * For small writes it's cheaper to store the data with the
912	 * log record (immediate); for large writes it's cheaper to
913	 * sync the data and get a pointer to it (indirect) so that
914	 * we don't have to write the data twice.
915	 */
916	if (buf != NULL) { /* immediate write */
917		rl = zfs_range_lock(zp, off, dlen, RL_READER);
918		/* test for truncation needs to be done while range locked */
919		if (off >= zp->z_phys->zp_size) {
920			error = ENOENT;
921			goto out;
922		}
923		VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf));
924	} else { /* indirect write */
925		uint64_t boff; /* block starting offset */
926
927		/*
928		 * Have to lock the whole block to ensure when it's
929		 * written out and it's checksum is being calculated
930		 * that no one can change the data. We need to re-check
931		 * blocksize after we get the lock in case it's changed!
932		 */
933		for (;;) {
934			if (ISP2(zp->z_blksz)) {
935				boff = P2ALIGN_TYPED(off, zp->z_blksz,
936				    uint64_t);
937			} else {
938				boff = 0;
939			}
940			dlen = zp->z_blksz;
941			rl = zfs_range_lock(zp, boff, dlen, RL_READER);
942			if (zp->z_blksz == dlen)
943				break;
944			zfs_range_unlock(rl);
945		}
946		/* test for truncation needs to be done while range locked */
947		if (off >= zp->z_phys->zp_size) {
948			error = ENOENT;
949			goto out;
950		}
951		zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
952		zgd->zgd_rl = rl;
953		zgd->zgd_zilog = zfsvfs->z_log;
954		zgd->zgd_bp = &lr->lr_blkptr;
955		VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db));
956		ASSERT(boff == db->db_offset);
957		lr->lr_blkoff = off - boff;
958		error = dmu_sync(zio, db, &lr->lr_blkptr,
959		    lr->lr_common.lrc_txg, zfs_get_done, zgd);
960		ASSERT(error == EEXIST || lr->lr_length <= zp->z_blksz);
961		if (error == 0) {
962			zil_add_vdev(zfsvfs->z_log,
963			    DVA_GET_VDEV(BP_IDENTITY(&lr->lr_blkptr)));
964		}
965		/*
966		 * If we get EINPROGRESS, then we need to wait for a
967		 * write IO initiated by dmu_sync() to complete before
968		 * we can release this dbuf.  We will finish everything
969		 * up in the zfs_get_done() callback.
970		 */
971		if (error == EINPROGRESS)
972			return (0);
973		dmu_buf_rele(db, zgd);
974		kmem_free(zgd, sizeof (zgd_t));
975	}
976out:
977	zfs_range_unlock(rl);
978	VN_RELE(ZTOV(zp));
979	return (error);
980}
981
982/*ARGSUSED*/
983static int
984zfs_access(vnode_t *vp, int mode, int flags, cred_t *cr)
985{
986	znode_t *zp = VTOZ(vp);
987	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
988	int error;
989
990	ZFS_ENTER(zfsvfs);
991	error = zfs_zaccess_rwx(zp, mode, cr);
992	ZFS_EXIT(zfsvfs);
993	return (error);
994}
995
996/*
997 * Lookup an entry in a directory, or an extended attribute directory.
998 * If it exists, return a held vnode reference for it.
999 *
1000 *	IN:	dvp	- vnode of directory to search.
1001 *		nm	- name of entry to lookup.
1002 *		pnp	- full pathname to lookup [UNUSED].
1003 *		flags	- LOOKUP_XATTR set if looking for an attribute.
1004 *		rdir	- root directory vnode [UNUSED].
1005 *		cr	- credentials of caller.
1006 *
1007 *	OUT:	vpp	- vnode of located entry, NULL if not found.
1008 *
1009 *	RETURN:	0 if success
1010 *		error code if failure
1011 *
1012 * Timestamps:
1013 *	NA
1014 */
1015/* ARGSUSED */
1016static int
1017zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
1018    int nameiop, cred_t *cr, kthread_t *td)
1019{
1020
1021	znode_t *zdp = VTOZ(dvp);
1022	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1023	int	error;
1024
1025	ZFS_ENTER(zfsvfs);
1026
1027	*vpp = NULL;
1028
1029#ifdef TODO
1030	if (flags & LOOKUP_XATTR) {
1031		/*
1032		 * If the xattr property is off, refuse the lookup request.
1033		 */
1034		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
1035			ZFS_EXIT(zfsvfs);
1036			return (EINVAL);
1037		}
1038
1039		/*
1040		 * We don't allow recursive attributes..
1041		 * Maybe someday we will.
1042		 */
1043		if (zdp->z_phys->zp_flags & ZFS_XATTR) {
1044			ZFS_EXIT(zfsvfs);
1045			return (EINVAL);
1046		}
1047
1048		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
1049			ZFS_EXIT(zfsvfs);
1050			return (error);
1051		}
1052
1053		/*
1054		 * Do we have permission to get into attribute directory?
1055		 */
1056
1057		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, cr)) {
1058			VN_RELE(*vpp);
1059		}
1060
1061		ZFS_EXIT(zfsvfs);
1062		return (error);
1063	}
1064#endif	/* TODO */
1065
1066	if (dvp->v_type != VDIR) {
1067		ZFS_EXIT(zfsvfs);
1068		return (ENOTDIR);
1069	}
1070
1071	/*
1072	 * Check accessibility of directory.
1073	 */
1074
1075	if (error = zfs_zaccess(zdp, ACE_EXECUTE, cr)) {
1076		ZFS_EXIT(zfsvfs);
1077		return (error);
1078	}
1079
1080	if ((error = zfs_dirlook(zdp, nm, vpp)) == 0) {
1081
1082		/*
1083		 * Convert device special files
1084		 */
1085		if (IS_DEVVP(*vpp)) {
1086			vnode_t	*svp;
1087
1088			svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
1089			VN_RELE(*vpp);
1090			if (svp == NULL)
1091				error = ENOSYS;
1092			else
1093				*vpp = svp;
1094		}
1095	}
1096
1097	ZFS_EXIT(zfsvfs);
1098
1099	/* Translate errors and add SAVENAME when needed. */
1100	if (cnp->cn_flags & ISLASTCN) {
1101		switch (nameiop) {
1102		case CREATE:
1103		case RENAME:
1104			if (error == ENOENT) {
1105				error = EJUSTRETURN;
1106				cnp->cn_flags |= SAVENAME;
1107				break;
1108			}
1109			/* FALLTHROUGH */
1110		case DELETE:
1111			if (error == 0)
1112				cnp->cn_flags |= SAVENAME;
1113			break;
1114		}
1115	}
1116	if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) {
1117		int ltype = 0;
1118
1119		if (cnp->cn_flags & ISDOTDOT) {
1120			ltype = VOP_ISLOCKED(dvp);
1121			VOP_UNLOCK(dvp, 0);
1122		}
1123		error = vn_lock(*vpp, cnp->cn_lkflags);
1124		if (cnp->cn_flags & ISDOTDOT)
1125			vn_lock(dvp, ltype | LK_RETRY);
1126		if (error != 0) {
1127			VN_RELE(*vpp);
1128			*vpp = NULL;
1129			return (error);
1130		}
1131	}
1132
1133#ifdef FREEBSD_NAMECACHE
1134	/*
1135	 * Insert name into cache (as non-existent) if appropriate.
1136	 */
1137	if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
1138		cache_enter(dvp, *vpp, cnp);
1139	/*
1140	 * Insert name into cache if appropriate.
1141	 */
1142	if (error == 0 && (cnp->cn_flags & MAKEENTRY)) {
1143		if (!(cnp->cn_flags & ISLASTCN) ||
1144		    (nameiop != DELETE && nameiop != RENAME)) {
1145			cache_enter(dvp, *vpp, cnp);
1146		}
1147	}
1148#endif
1149
1150	return (error);
1151}
1152
1153/*
1154 * Attempt to create a new entry in a directory.  If the entry
1155 * already exists, truncate the file if permissible, else return
1156 * an error.  Return the vp of the created or trunc'd file.
1157 *
1158 *	IN:	dvp	- vnode of directory to put new file entry in.
1159 *		name	- name of new file entry.
1160 *		vap	- attributes of new file.
1161 *		excl	- flag indicating exclusive or non-exclusive mode.
1162 *		mode	- mode to open file with.
1163 *		cr	- credentials of caller.
1164 *		flag	- large file flag [UNUSED].
1165 *
1166 *	OUT:	vpp	- vnode of created or trunc'd entry.
1167 *
1168 *	RETURN:	0 if success
1169 *		error code if failure
1170 *
1171 * Timestamps:
1172 *	dvp - ctime|mtime updated if new entry created
1173 *	 vp - ctime|mtime always, atime if new
1174 */
1175/* ARGSUSED */
1176static int
1177zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
1178    vnode_t **vpp, cred_t *cr)
1179{
1180	znode_t		*zp, *dzp = VTOZ(dvp);
1181	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1182	zilog_t		*zilog = zfsvfs->z_log;
1183	objset_t	*os = zfsvfs->z_os;
1184	zfs_dirlock_t	*dl;
1185	dmu_tx_t	*tx;
1186	int		error;
1187	uint64_t	zoid;
1188
1189	ZFS_ENTER(zfsvfs);
1190
1191top:
1192	*vpp = NULL;
1193
1194	if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr))
1195		vap->va_mode &= ~VSVTX;
1196
1197	if (*name == '\0') {
1198		/*
1199		 * Null component name refers to the directory itself.
1200		 */
1201		VN_HOLD(dvp);
1202		zp = dzp;
1203		dl = NULL;
1204		error = 0;
1205	} else {
1206		/* possible VN_HOLD(zp) */
1207		if (error = zfs_dirent_lock(&dl, dzp, name, &zp, 0)) {
1208			if (strcmp(name, "..") == 0)
1209				error = EISDIR;
1210			ZFS_EXIT(zfsvfs);
1211			return (error);
1212		}
1213	}
1214
1215	zoid = zp ? zp->z_id : -1ULL;
1216
1217	if (zp == NULL) {
1218		/*
1219		 * Create a new file object and update the directory
1220		 * to reference it.
1221		 */
1222		if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
1223			goto out;
1224		}
1225
1226		/*
1227		 * We only support the creation of regular files in
1228		 * extended attribute directories.
1229		 */
1230		if ((dzp->z_phys->zp_flags & ZFS_XATTR) &&
1231		    (vap->va_type != VREG)) {
1232			error = EINVAL;
1233			goto out;
1234		}
1235
1236		tx = dmu_tx_create(os);
1237		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
1238		dmu_tx_hold_bonus(tx, dzp->z_id);
1239		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1240		if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
1241			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1242			    0, SPA_MAXBLOCKSIZE);
1243		error = dmu_tx_assign(tx, zfsvfs->z_assign);
1244		if (error) {
1245			zfs_dirent_unlock(dl);
1246			if (error == ERESTART &&
1247			    zfsvfs->z_assign == TXG_NOWAIT) {
1248				dmu_tx_wait(tx);
1249				dmu_tx_abort(tx);
1250				goto top;
1251			}
1252			dmu_tx_abort(tx);
1253			ZFS_EXIT(zfsvfs);
1254			return (error);
1255		}
1256		zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);
1257		ASSERT(zp->z_id == zoid);
1258		(void) zfs_link_create(dl, zp, tx, ZNEW);
1259		zfs_log_create(zilog, tx, TX_CREATE, dzp, zp, name);
1260		dmu_tx_commit(tx);
1261	} else {
1262		/*
1263		 * A directory entry already exists for this name.
1264		 */
1265		/*
1266		 * Can't truncate an existing file if in exclusive mode.
1267		 */
1268		if (excl == EXCL) {
1269			error = EEXIST;
1270			goto out;
1271		}
1272		/*
1273		 * Can't open a directory for writing.
1274		 */
1275		if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
1276			error = EISDIR;
1277			goto out;
1278		}
1279		/*
1280		 * Verify requested access to file.
1281		 */
1282		if (mode && (error = zfs_zaccess_rwx(zp, mode, cr))) {
1283			goto out;
1284		}
1285
1286		mutex_enter(&dzp->z_lock);
1287		dzp->z_seq++;
1288		mutex_exit(&dzp->z_lock);
1289
1290		/*
1291		 * Truncate regular files if requested.
1292		 */
1293		if ((ZTOV(zp)->v_type == VREG) &&
1294		    (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
1295			error = zfs_freesp(zp, 0, 0, mode, TRUE);
1296			if (error == ERESTART &&
1297			    zfsvfs->z_assign == TXG_NOWAIT) {
1298				/* NB: we already did dmu_tx_wait() */
1299				zfs_dirent_unlock(dl);
1300				VN_RELE(ZTOV(zp));
1301				goto top;
1302			}
1303		}
1304	}
1305out:
1306
1307	if (error == 0) {
1308		*vpp = ZTOV(zp);
1309		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
1310	}
1311
1312	if (dl)
1313		zfs_dirent_unlock(dl);
1314
1315	if (error) {
1316		if (zp)
1317			VN_RELE(ZTOV(zp));
1318	} else {
1319		*vpp = ZTOV(zp);
1320		/*
1321		 * If vnode is for a device return a specfs vnode instead.
1322		 */
1323		if (IS_DEVVP(*vpp)) {
1324			struct vnode *svp;
1325
1326			svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
1327			VN_RELE(*vpp);
1328			if (svp == NULL) {
1329				error = ENOSYS;
1330			}
1331			*vpp = svp;
1332		}
1333	}
1334
1335	ZFS_EXIT(zfsvfs);
1336	return (error);
1337}
1338
1339/*
1340 * Remove an entry from a directory.
1341 *
1342 *	IN:	dvp	- vnode of directory to remove entry from.
1343 *		name	- name of entry to remove.
1344 *		cr	- credentials of caller.
1345 *
1346 *	RETURN:	0 if success
1347 *		error code if failure
1348 *
1349 * Timestamps:
1350 *	dvp - ctime|mtime
1351 *	 vp - ctime (if nlink > 0)
1352 */
1353static int
1354zfs_remove(vnode_t *dvp, char *name, cred_t *cr)
1355{
1356	znode_t		*zp, *dzp = VTOZ(dvp);
1357	znode_t		*xzp = NULL;
1358	vnode_t		*vp;
1359	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1360	zilog_t		*zilog = zfsvfs->z_log;
1361	uint64_t	acl_obj, xattr_obj;
1362	zfs_dirlock_t	*dl;
1363	dmu_tx_t	*tx;
1364	boolean_t	may_delete_now, delete_now = FALSE;
1365	boolean_t	unlinked;
1366	int		error;
1367
1368	ZFS_ENTER(zfsvfs);
1369
1370top:
1371	/*
1372	 * Attempt to lock directory; fail if entry doesn't exist.
1373	 */
1374	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) {
1375		ZFS_EXIT(zfsvfs);
1376		return (error);
1377	}
1378
1379	vp = ZTOV(zp);
1380
1381	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1382		goto out;
1383	}
1384
1385	/*
1386	 * Need to use rmdir for removing directories.
1387	 */
1388	if (vp->v_type == VDIR) {
1389		error = EPERM;
1390		goto out;
1391	}
1392
1393	vnevent_remove(vp);
1394
1395	dnlc_remove(dvp, name);
1396
1397	may_delete_now = FALSE;
1398
1399	/*
1400	 * We may delete the znode now, or we may put it in the unlinked set;
1401	 * it depends on whether we're the last link, and on whether there are
1402	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
1403	 * allow for either case.
1404	 */
1405	tx = dmu_tx_create(zfsvfs->z_os);
1406	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1407	dmu_tx_hold_bonus(tx, zp->z_id);
1408	if (may_delete_now)
1409		dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
1410
1411	/* are there any extended attributes? */
1412	if ((xattr_obj = zp->z_phys->zp_xattr) != 0) {
1413		/* XXX - do we need this if we are deleting? */
1414		dmu_tx_hold_bonus(tx, xattr_obj);
1415	}
1416
1417	/* are there any additional acls */
1418	if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 &&
1419	    may_delete_now)
1420		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
1421
1422	/* charge as an update -- would be nice not to charge at all */
1423	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1424
1425	error = dmu_tx_assign(tx, zfsvfs->z_assign);
1426	if (error) {
1427		zfs_dirent_unlock(dl);
1428		VN_RELE(vp);
1429		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
1430			dmu_tx_wait(tx);
1431			dmu_tx_abort(tx);
1432			goto top;
1433		}
1434		dmu_tx_abort(tx);
1435		ZFS_EXIT(zfsvfs);
1436		return (error);
1437	}
1438
1439	/*
1440	 * Remove the directory entry.
1441	 */
1442	error = zfs_link_destroy(dl, zp, tx, 0, &unlinked);
1443
1444	if (error) {
1445		dmu_tx_commit(tx);
1446		goto out;
1447	}
1448
1449	if (0 && unlinked) {
1450		VI_LOCK(vp);
1451		delete_now = may_delete_now &&
1452		    vp->v_count == 1 && !vn_has_cached_data(vp) &&
1453		    zp->z_phys->zp_xattr == xattr_obj &&
1454		    zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj;
1455		VI_UNLOCK(vp);
1456	}
1457
1458	if (delete_now) {
1459		if (zp->z_phys->zp_xattr) {
1460			error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
1461			ASSERT3U(error, ==, 0);
1462			ASSERT3U(xzp->z_phys->zp_links, ==, 2);
1463			dmu_buf_will_dirty(xzp->z_dbuf, tx);
1464			mutex_enter(&xzp->z_lock);
1465			xzp->z_unlinked = 1;
1466			xzp->z_phys->zp_links = 0;
1467			mutex_exit(&xzp->z_lock);
1468			zfs_unlinked_add(xzp, tx);
1469			zp->z_phys->zp_xattr = 0; /* probably unnecessary */
1470		}
1471		mutex_enter(&zp->z_lock);
1472		VI_LOCK(vp);
1473		vp->v_count--;
1474		ASSERT3U(vp->v_count, ==, 0);
1475		VI_UNLOCK(vp);
1476		mutex_exit(&zp->z_lock);
1477		zfs_znode_delete(zp, tx);
1478		VFS_RELE(zfsvfs->z_vfs);
1479	} else if (unlinked) {
1480		zfs_unlinked_add(zp, tx);
1481	}
1482
1483	zfs_log_remove(zilog, tx, TX_REMOVE, dzp, name);
1484
1485	dmu_tx_commit(tx);
1486out:
1487	zfs_dirent_unlock(dl);
1488
1489	if (!delete_now) {
1490		VN_RELE(vp);
1491	} else if (xzp) {
1492		/* this rele delayed to prevent nesting transactions */
1493		VN_RELE(ZTOV(xzp));
1494	}
1495
1496	ZFS_EXIT(zfsvfs);
1497	return (error);
1498}
1499
1500/*
1501 * Create a new directory and insert it into dvp using the name
1502 * provided.  Return a pointer to the inserted directory.
1503 *
1504 *	IN:	dvp	- vnode of directory to add subdir to.
1505 *		dirname	- name of new directory.
1506 *		vap	- attributes of new directory.
1507 *		cr	- credentials of caller.
1508 *
1509 *	OUT:	vpp	- vnode of created directory.
1510 *
1511 *	RETURN:	0 if success
1512 *		error code if failure
1513 *
1514 * Timestamps:
1515 *	dvp - ctime|mtime updated
1516 *	 vp - ctime|mtime|atime updated
1517 */
1518static int
1519zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
1520{
1521	znode_t		*zp, *dzp = VTOZ(dvp);
1522	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1523	zilog_t		*zilog = zfsvfs->z_log;
1524	zfs_dirlock_t	*dl;
1525	uint64_t	zoid = 0;
1526	dmu_tx_t	*tx;
1527	int		error;
1528
1529	ASSERT(vap->va_type == VDIR);
1530
1531	ZFS_ENTER(zfsvfs);
1532
1533	if (dzp->z_phys->zp_flags & ZFS_XATTR) {
1534		ZFS_EXIT(zfsvfs);
1535		return (EINVAL);
1536	}
1537top:
1538	*vpp = NULL;
1539
1540	/*
1541	 * First make sure the new directory doesn't exist.
1542	 */
1543	if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, ZNEW)) {
1544		ZFS_EXIT(zfsvfs);
1545		return (error);
1546	}
1547
1548	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, cr)) {
1549		zfs_dirent_unlock(dl);
1550		ZFS_EXIT(zfsvfs);
1551		return (error);
1552	}
1553
1554	/*
1555	 * Add a new entry to the directory.
1556	 */
1557	tx = dmu_tx_create(zfsvfs->z_os);
1558	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
1559	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
1560	if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
1561		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1562		    0, SPA_MAXBLOCKSIZE);
1563	error = dmu_tx_assign(tx, zfsvfs->z_assign);
1564	if (error) {
1565		zfs_dirent_unlock(dl);
1566		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
1567			dmu_tx_wait(tx);
1568			dmu_tx_abort(tx);
1569			goto top;
1570		}
1571		dmu_tx_abort(tx);
1572		ZFS_EXIT(zfsvfs);
1573		return (error);
1574	}
1575
1576	/*
1577	 * Create new node.
1578	 */
1579	zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);
1580
1581	/*
1582	 * Now put new name in parent dir.
1583	 */
1584	(void) zfs_link_create(dl, zp, tx, ZNEW);
1585
1586	*vpp = ZTOV(zp);
1587
1588	zfs_log_create(zilog, tx, TX_MKDIR, dzp, zp, dirname);
1589	dmu_tx_commit(tx);
1590
1591	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
1592
1593	zfs_dirent_unlock(dl);
1594
1595	ZFS_EXIT(zfsvfs);
1596	return (0);
1597}
1598
1599/*
1600 * Remove a directory subdir entry.  If the current working
1601 * directory is the same as the subdir to be removed, the
1602 * remove will fail.
1603 *
1604 *	IN:	dvp	- vnode of directory to remove from.
1605 *		name	- name of directory to be removed.
1606 *		cwd	- vnode of current working directory.
1607 *		cr	- credentials of caller.
1608 *
1609 *	RETURN:	0 if success
1610 *		error code if failure
1611 *
1612 * Timestamps:
1613 *	dvp - ctime|mtime updated
1614 */
1615static int
1616zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr)
1617{
1618	znode_t		*dzp = VTOZ(dvp);
1619	znode_t		*zp;
1620	vnode_t		*vp;
1621	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1622	zilog_t		*zilog = zfsvfs->z_log;
1623	zfs_dirlock_t	*dl;
1624	dmu_tx_t	*tx;
1625	int		error;
1626
1627	ZFS_ENTER(zfsvfs);
1628
1629top:
1630	zp = NULL;
1631
1632	/*
1633	 * Attempt to lock directory; fail if entry doesn't exist.
1634	 */
1635	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) {
1636		ZFS_EXIT(zfsvfs);
1637		return (error);
1638	}
1639
1640	vp = ZTOV(zp);
1641
1642	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1643		goto out;
1644	}
1645
1646	if (vp->v_type != VDIR) {
1647		error = ENOTDIR;
1648		goto out;
1649	}
1650
1651	if (vp == cwd) {
1652		error = EINVAL;
1653		goto out;
1654	}
1655
1656	vnevent_rmdir(vp);
1657
1658	/*
1659	 * Grab a lock on the directory to make sure that noone is
1660	 * trying to add (or lookup) entries while we are removing it.
1661	 */
1662	rw_enter(&zp->z_name_lock, RW_WRITER);
1663
1664	/*
1665	 * Grab a lock on the parent pointer to make sure we play well
1666	 * with the treewalk and directory rename code.
1667	 */
1668	rw_enter(&zp->z_parent_lock, RW_WRITER);
1669
1670	tx = dmu_tx_create(zfsvfs->z_os);
1671	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1672	dmu_tx_hold_bonus(tx, zp->z_id);
1673	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1674	error = dmu_tx_assign(tx, zfsvfs->z_assign);
1675	if (error) {
1676		rw_exit(&zp->z_parent_lock);
1677		rw_exit(&zp->z_name_lock);
1678		zfs_dirent_unlock(dl);
1679		VN_RELE(vp);
1680		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
1681			dmu_tx_wait(tx);
1682			dmu_tx_abort(tx);
1683			goto top;
1684		}
1685		dmu_tx_abort(tx);
1686		ZFS_EXIT(zfsvfs);
1687		return (error);
1688	}
1689
1690#ifdef FREEBSD_NAMECACHE
1691	cache_purge(dvp);
1692#endif
1693
1694	error = zfs_link_destroy(dl, zp, tx, 0, NULL);
1695
1696	if (error == 0)
1697		zfs_log_remove(zilog, tx, TX_RMDIR, dzp, name);
1698
1699	dmu_tx_commit(tx);
1700
1701	rw_exit(&zp->z_parent_lock);
1702	rw_exit(&zp->z_name_lock);
1703#ifdef FREEBSD_NAMECACHE
1704	cache_purge(vp);
1705#endif
1706out:
1707	zfs_dirent_unlock(dl);
1708
1709	VN_RELE(vp);
1710
1711	ZFS_EXIT(zfsvfs);
1712	return (error);
1713}
1714
1715/*
1716 * Read as many directory entries as will fit into the provided
1717 * buffer from the given directory cursor position (specified in
1718 * the uio structure.
1719 *
1720 *	IN:	vp	- vnode of directory to read.
1721 *		uio	- structure supplying read location, range info,
1722 *			  and return buffer.
1723 *		cr	- credentials of caller.
1724 *
1725 *	OUT:	uio	- updated offset and range, buffer filled.
1726 *		eofp	- set to true if end-of-file detected.
1727 *
1728 *	RETURN:	0 if success
1729 *		error code if failure
1730 *
1731 * Timestamps:
1732 *	vp - atime updated
1733 *
1734 * Note that the low 4 bits of the cookie returned by zap is always zero.
1735 * This allows us to use the low range for "special" directory entries:
1736 * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
1737 * we use the offset 2 for the '.zfs' directory.
1738 */
1739/* ARGSUSED */
1740static int
1741zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
1742{
1743	znode_t		*zp = VTOZ(vp);
1744	iovec_t		*iovp;
1745	dirent64_t	*odp;
1746	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
1747	objset_t	*os;
1748	caddr_t		outbuf;
1749	size_t		bufsize;
1750	zap_cursor_t	zc;
1751	zap_attribute_t	zap;
1752	uint_t		bytes_wanted;
1753	uint64_t	offset; /* must be unsigned; checks for < 1 */
1754	int		local_eof;
1755	int		outcount;
1756	int		error;
1757	uint8_t		prefetch;
1758	uint8_t		type;
1759	int		ncooks;
1760	u_long		*cooks = NULL;
1761
1762	ZFS_ENTER(zfsvfs);
1763
1764	/*
1765	 * If we are not given an eof variable,
1766	 * use a local one.
1767	 */
1768	if (eofp == NULL)
1769		eofp = &local_eof;
1770
1771	/*
1772	 * Check for valid iov_len.
1773	 */
1774	if (uio->uio_iov->iov_len <= 0) {
1775		ZFS_EXIT(zfsvfs);
1776		return (EINVAL);
1777	}
1778
1779	/*
1780	 * Quit if directory has been removed (posix)
1781	 */
1782	if ((*eofp = zp->z_unlinked) != 0) {
1783		ZFS_EXIT(zfsvfs);
1784		return (0);
1785	}
1786
1787	error = 0;
1788	os = zfsvfs->z_os;
1789	offset = uio->uio_loffset;
1790	prefetch = zp->z_zn_prefetch;
1791
1792	/*
1793	 * Initialize the iterator cursor.
1794	 */
1795	if (offset <= 3) {
1796		/*
1797		 * Start iteration from the beginning of the directory.
1798		 */
1799		zap_cursor_init(&zc, os, zp->z_id);
1800	} else {
1801		/*
1802		 * The offset is a serialized cursor.
1803		 */
1804		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
1805	}
1806
1807	/*
1808	 * Get space to change directory entries into fs independent format.
1809	 */
1810	iovp = uio->uio_iov;
1811	bytes_wanted = iovp->iov_len;
1812	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
1813		bufsize = bytes_wanted;
1814		outbuf = kmem_alloc(bufsize, KM_SLEEP);
1815		odp = (struct dirent64 *)outbuf;
1816	} else {
1817		bufsize = bytes_wanted;
1818		odp = (struct dirent64 *)iovp->iov_base;
1819	}
1820
1821	if (ncookies != NULL) {
1822		/*
1823		 * Minimum entry size is dirent size and 1 byte for a file name.
1824		 */
1825		ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
1826		cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
1827		*cookies = cooks;
1828		*ncookies = ncooks;
1829	}
1830
1831	/*
1832	 * Transform to file-system independent format
1833	 */
1834	outcount = 0;
1835	while (outcount < bytes_wanted) {
1836		ino64_t objnum;
1837		ushort_t reclen;
1838
1839		/*
1840		 * Special case `.', `..', and `.zfs'.
1841		 */
1842		if (offset == 0) {
1843			(void) strcpy(zap.za_name, ".");
1844			objnum = zp->z_id;
1845			type = DT_DIR;
1846		} else if (offset == 1) {
1847			(void) strcpy(zap.za_name, "..");
1848			objnum = zp->z_phys->zp_parent;
1849			type = DT_DIR;
1850		} else if (offset == 2 && zfs_show_ctldir(zp)) {
1851			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
1852			objnum = ZFSCTL_INO_ROOT;
1853			type = DT_DIR;
1854		} else {
1855			/*
1856			 * Grab next entry.
1857			 */
1858			if (error = zap_cursor_retrieve(&zc, &zap)) {
1859				if ((*eofp = (error == ENOENT)) != 0)
1860					break;
1861				else
1862					goto update;
1863			}
1864
1865			if (zap.za_integer_length != 8 ||
1866			    zap.za_num_integers != 1) {
1867				cmn_err(CE_WARN, "zap_readdir: bad directory "
1868				    "entry, obj = %lld, offset = %lld\n",
1869				    (u_longlong_t)zp->z_id,
1870				    (u_longlong_t)offset);
1871				error = ENXIO;
1872				goto update;
1873			}
1874
1875			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
1876			/*
1877			 * MacOS X can extract the object type here such as:
1878			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
1879			 */
1880			type = ZFS_DIRENT_TYPE(zap.za_first_integer);
1881		}
1882		reclen = DIRENT64_RECLEN(strlen(zap.za_name));
1883
1884		/*
1885		 * Will this entry fit in the buffer?
1886		 */
1887		if (outcount + reclen > bufsize) {
1888			/*
1889			 * Did we manage to fit anything in the buffer?
1890			 */
1891			if (!outcount) {
1892				error = EINVAL;
1893				goto update;
1894			}
1895			break;
1896		}
1897		/*
1898		 * Add this entry:
1899		 */
1900		odp->d_ino = objnum;
1901		odp->d_reclen = reclen;
1902		odp->d_namlen = strlen(zap.za_name);
1903		(void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
1904		odp->d_type = type;
1905		outcount += reclen;
1906		odp = (dirent64_t *)((intptr_t)odp + reclen);
1907
1908		ASSERT(outcount <= bufsize);
1909
1910		/* Prefetch znode */
1911		if (prefetch)
1912			dmu_prefetch(os, objnum, 0, 0);
1913
1914		/*
1915		 * Move to the next entry, fill in the previous offset.
1916		 */
1917		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
1918			zap_cursor_advance(&zc);
1919			offset = zap_cursor_serialize(&zc);
1920		} else {
1921			offset += 1;
1922		}
1923
1924		if (cooks != NULL) {
1925			*cooks++ = offset;
1926			ncooks--;
1927			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
1928		}
1929	}
1930	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
1931
1932	/* Subtract unused cookies */
1933	if (ncookies != NULL)
1934		*ncookies -= ncooks;
1935
1936	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
1937		iovp->iov_base += outcount;
1938		iovp->iov_len -= outcount;
1939		uio->uio_resid -= outcount;
1940	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
1941		/*
1942		 * Reset the pointer.
1943		 */
1944		offset = uio->uio_loffset;
1945	}
1946
1947update:
1948	zap_cursor_fini(&zc);
1949	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
1950		kmem_free(outbuf, bufsize);
1951
1952	if (error == ENOENT)
1953		error = 0;
1954
1955	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
1956
1957	uio->uio_loffset = offset;
1958	ZFS_EXIT(zfsvfs);
1959	if (error != 0 && cookies != NULL) {
1960		free(*cookies, M_TEMP);
1961		*cookies = NULL;
1962		*ncookies = 0;
1963	}
1964	return (error);
1965}
1966
1967static int
1968zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr)
1969{
1970	znode_t	*zp = VTOZ(vp);
1971	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1972
1973	ZFS_ENTER(zfsvfs);
1974	zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
1975	ZFS_EXIT(zfsvfs);
1976	return (0);
1977}
1978
1979/*
1980 * Get the requested file attributes and place them in the provided
1981 * vattr structure.
1982 *
1983 *	IN:	vp	- vnode of file.
1984 *		vap	- va_mask identifies requested attributes.
1985 *		flags	- [UNUSED]
1986 *		cr	- credentials of caller.
1987 *
1988 *	OUT:	vap	- attribute values.
1989 *
1990 *	RETURN:	0 (always succeeds)
1991 */
1992/* ARGSUSED */
1993static int
1994zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
1995{
1996	znode_t *zp = VTOZ(vp);
1997	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1998	znode_phys_t *pzp = zp->z_phys;
1999	uint32_t blksize;
2000	u_longlong_t nblocks;
2001	int	error;
2002
2003	ZFS_ENTER(zfsvfs);
2004
2005	/*
2006	 * Return all attributes.  It's cheaper to provide the answer
2007	 * than to determine whether we were asked the question.
2008	 */
2009	mutex_enter(&zp->z_lock);
2010
2011	vap->va_type = IFTOVT(pzp->zp_mode);
2012	vap->va_mode = pzp->zp_mode & ~S_IFMT;
2013	vap->va_uid = zp->z_phys->zp_uid;
2014	vap->va_gid = zp->z_phys->zp_gid;
2015	vap->va_nodeid = zp->z_id;
2016	vap->va_nlink = MIN(pzp->zp_links, UINT32_MAX);	/* nlink_t limit! */
2017	vap->va_size = pzp->zp_size;
2018	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
2019	vap->va_rdev = zfs_cmpldev(pzp->zp_rdev);
2020	vap->va_seq = zp->z_seq;
2021	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */
2022
2023	ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime);
2024	ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime);
2025	ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime);
2026	ZFS_TIME_DECODE(&vap->va_birthtime, pzp->zp_crtime);
2027
2028	/*
2029	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
2030	 * Also, if we are the owner don't bother, since owner should
2031	 * always be allowed to read basic attributes of file.
2032	 */
2033	if (!(zp->z_phys->zp_flags & ZFS_ACL_TRIVIAL) &&
2034	    (zp->z_phys->zp_uid != crgetuid(cr))) {
2035		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, cr)) {
2036			mutex_exit(&zp->z_lock);
2037			ZFS_EXIT(zfsvfs);
2038			return (error);
2039		}
2040	}
2041
2042	mutex_exit(&zp->z_lock);
2043
2044	dmu_object_size_from_db(zp->z_dbuf, &blksize, &nblocks);
2045	vap->va_blksize = blksize;
2046	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */
2047
2048	if (zp->z_blksz == 0) {
2049		/*
2050		 * Block size hasn't been set; suggest maximal I/O transfers.
2051		 */
2052		vap->va_blksize = zfsvfs->z_max_blksz;
2053	}
2054
2055	ZFS_EXIT(zfsvfs);
2056	return (0);
2057}
2058
2059/*
2060 * Set the file attributes to the values contained in the
2061 * vattr structure.
2062 *
2063 *	IN:	vp	- vnode of file to be modified.
2064 *		vap	- new attribute values.
2065 *		flags	- ATTR_UTIME set if non-default time values provided.
2066 *		cr	- credentials of caller.
2067 *
2068 *	RETURN:	0 if success
2069 *		error code if failure
2070 *
2071 * Timestamps:
2072 *	vp - ctime updated, mtime updated if size changed.
2073 */
2074/* ARGSUSED */
2075static int
2076zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2077	caller_context_t *ct)
2078{
2079	struct znode	*zp = VTOZ(vp);
2080	znode_phys_t	*pzp = zp->z_phys;
2081	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2082	zilog_t		*zilog = zfsvfs->z_log;
2083	dmu_tx_t	*tx;
2084	vattr_t		oldva;
2085	uint_t		mask = vap->va_mask;
2086	uint_t		saved_mask;
2087	int		trim_mask = 0;
2088	uint64_t	new_mode;
2089	znode_t		*attrzp;
2090	int		need_policy = FALSE;
2091	int		err;
2092
2093	if (mask == 0)
2094		return (0);
2095
2096	if (mask & AT_NOSET)
2097		return (EINVAL);
2098
2099	if (mask & AT_SIZE && vp->v_type == VDIR)
2100		return (EISDIR);
2101
2102	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO)
2103		return (EINVAL);
2104
2105	ZFS_ENTER(zfsvfs);
2106
2107top:
2108	attrzp = NULL;
2109
2110	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
2111		ZFS_EXIT(zfsvfs);
2112		return (EROFS);
2113	}
2114
2115	/*
2116	 * First validate permissions
2117	 */
2118
2119	if (mask & AT_SIZE) {
2120		err = zfs_zaccess(zp, ACE_WRITE_DATA, cr);
2121		if (err) {
2122			ZFS_EXIT(zfsvfs);
2123			return (err);
2124		}
2125		/*
2126		 * XXX - Note, we are not providing any open
2127		 * mode flags here (like FNDELAY), so we may
2128		 * block if there are locks present... this
2129		 * should be addressed in openat().
2130		 */
2131		do {
2132			err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2133			/* NB: we already did dmu_tx_wait() if necessary */
2134		} while (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT);
2135		if (err) {
2136			ZFS_EXIT(zfsvfs);
2137			return (err);
2138		}
2139	}
2140
2141	if (mask & (AT_ATIME|AT_MTIME))
2142		need_policy = zfs_zaccess_v4_perm(zp, ACE_WRITE_ATTRIBUTES, cr);
2143
2144	if (mask & (AT_UID|AT_GID)) {
2145		int	idmask = (mask & (AT_UID|AT_GID));
2146		int	take_owner;
2147		int	take_group;
2148
2149		/*
2150		 * NOTE: even if a new mode is being set,
2151		 * we may clear S_ISUID/S_ISGID bits.
2152		 */
2153
2154		if (!(mask & AT_MODE))
2155			vap->va_mode = pzp->zp_mode;
2156
2157		/*
2158		 * Take ownership or chgrp to group we are a member of
2159		 */
2160
2161		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
2162		take_group = (mask & AT_GID) && groupmember(vap->va_gid, cr);
2163
2164		/*
2165		 * If both AT_UID and AT_GID are set then take_owner and
2166		 * take_group must both be set in order to allow taking
2167		 * ownership.
2168		 *
2169		 * Otherwise, send the check through secpolicy_vnode_setattr()
2170		 *
2171		 */
2172
2173		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
2174		    ((idmask == AT_UID) && take_owner) ||
2175		    ((idmask == AT_GID) && take_group)) {
2176			if (zfs_zaccess_v4_perm(zp, ACE_WRITE_OWNER, cr) == 0) {
2177				/*
2178				 * Remove setuid/setgid for non-privileged users
2179				 */
2180				secpolicy_setid_clear(vap, cr);
2181				trim_mask = (mask & (AT_UID|AT_GID));
2182			} else {
2183				need_policy =  TRUE;
2184			}
2185		} else {
2186			need_policy =  TRUE;
2187		}
2188	}
2189
2190	mutex_enter(&zp->z_lock);
2191	oldva.va_mode = pzp->zp_mode;
2192	oldva.va_uid = zp->z_phys->zp_uid;
2193	oldva.va_gid = zp->z_phys->zp_gid;
2194	mutex_exit(&zp->z_lock);
2195
2196	if (mask & AT_MODE) {
2197		if (zfs_zaccess_v4_perm(zp, ACE_WRITE_ACL, cr) == 0) {
2198			err = secpolicy_setid_setsticky_clear(vp, vap,
2199			    &oldva, cr);
2200			if (err) {
2201				ZFS_EXIT(zfsvfs);
2202				return (err);
2203			}
2204			trim_mask |= AT_MODE;
2205		} else {
2206			need_policy = TRUE;
2207		}
2208	}
2209
2210	if (need_policy) {
2211		/*
2212		 * If trim_mask is set then take ownership
2213		 * has been granted or write_acl is present and user
2214		 * has the ability to modify mode.  In that case remove
2215		 * UID|GID and or MODE from mask so that
2216		 * secpolicy_vnode_setattr() doesn't revoke it.
2217		 */
2218
2219		if (trim_mask) {
2220			saved_mask = vap->va_mask;
2221			vap->va_mask &= ~trim_mask;
2222
2223		}
2224		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
2225		    (int (*)(void *, int, cred_t *))zfs_zaccess_rwx, zp);
2226		if (err) {
2227			ZFS_EXIT(zfsvfs);
2228			return (err);
2229		}
2230
2231		if (trim_mask)
2232			vap->va_mask |= saved_mask;
2233	}
2234
2235	/*
2236	 * secpolicy_vnode_setattr, or take ownership may have
2237	 * changed va_mask
2238	 */
2239	mask = vap->va_mask;
2240
2241	tx = dmu_tx_create(zfsvfs->z_os);
2242	dmu_tx_hold_bonus(tx, zp->z_id);
2243
2244	if (mask & AT_MODE) {
2245		uint64_t pmode = pzp->zp_mode;
2246
2247		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
2248
2249		if (zp->z_phys->zp_acl.z_acl_extern_obj)
2250			dmu_tx_hold_write(tx,
2251			    pzp->zp_acl.z_acl_extern_obj, 0, SPA_MAXBLOCKSIZE);
2252		else
2253			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2254			    0, ZFS_ACL_SIZE(MAX_ACL_SIZE));
2255	}
2256
2257	if ((mask & (AT_UID | AT_GID)) && zp->z_phys->zp_xattr != 0) {
2258		err = zfs_zget(zp->z_zfsvfs, zp->z_phys->zp_xattr, &attrzp);
2259		if (err) {
2260			dmu_tx_abort(tx);
2261			ZFS_EXIT(zfsvfs);
2262			return (err);
2263		}
2264		dmu_tx_hold_bonus(tx, attrzp->z_id);
2265	}
2266
2267	err = dmu_tx_assign(tx, zfsvfs->z_assign);
2268	if (err) {
2269		if (attrzp)
2270			VN_RELE(ZTOV(attrzp));
2271		if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
2272			dmu_tx_wait(tx);
2273			dmu_tx_abort(tx);
2274			goto top;
2275		}
2276		dmu_tx_abort(tx);
2277		ZFS_EXIT(zfsvfs);
2278		return (err);
2279	}
2280
2281	dmu_buf_will_dirty(zp->z_dbuf, tx);
2282
2283	/*
2284	 * Set each attribute requested.
2285	 * We group settings according to the locks they need to acquire.
2286	 *
2287	 * Note: you cannot set ctime directly, although it will be
2288	 * updated as a side-effect of calling this function.
2289	 */
2290
2291	mutex_enter(&zp->z_lock);
2292
2293	if (mask & AT_MODE) {
2294		err = zfs_acl_chmod_setattr(zp, new_mode, tx);
2295		ASSERT3U(err, ==, 0);
2296	}
2297
2298	if (attrzp)
2299		mutex_enter(&attrzp->z_lock);
2300
2301	if (mask & AT_UID) {
2302		zp->z_phys->zp_uid = (uint64_t)vap->va_uid;
2303		if (attrzp) {
2304			attrzp->z_phys->zp_uid = (uint64_t)vap->va_uid;
2305		}
2306	}
2307
2308	if (mask & AT_GID) {
2309		zp->z_phys->zp_gid = (uint64_t)vap->va_gid;
2310		if (attrzp)
2311			attrzp->z_phys->zp_gid = (uint64_t)vap->va_gid;
2312	}
2313
2314	if (attrzp)
2315		mutex_exit(&attrzp->z_lock);
2316
2317	if (mask & AT_ATIME)
2318		ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
2319
2320	if (mask & AT_MTIME)
2321		ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
2322
2323	if (mask & AT_SIZE)
2324		zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx);
2325	else if (mask != 0)
2326		zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
2327
2328	if (mask != 0)
2329		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask);
2330
2331	mutex_exit(&zp->z_lock);
2332
2333	if (attrzp)
2334		VN_RELE(ZTOV(attrzp));
2335
2336	dmu_tx_commit(tx);
2337
2338	ZFS_EXIT(zfsvfs);
2339	return (err);
2340}
2341
2342typedef struct zfs_zlock {
2343	krwlock_t	*zl_rwlock;	/* lock we acquired */
2344	znode_t		*zl_znode;	/* znode we held */
2345	struct zfs_zlock *zl_next;	/* next in list */
2346} zfs_zlock_t;
2347
2348/*
2349 * Drop locks and release vnodes that were held by zfs_rename_lock().
2350 */
2351static void
2352zfs_rename_unlock(zfs_zlock_t **zlpp)
2353{
2354	zfs_zlock_t *zl;
2355
2356	while ((zl = *zlpp) != NULL) {
2357		if (zl->zl_znode != NULL)
2358			VN_RELE(ZTOV(zl->zl_znode));
2359		rw_exit(zl->zl_rwlock);
2360		*zlpp = zl->zl_next;
2361		kmem_free(zl, sizeof (*zl));
2362	}
2363}
2364
2365/*
2366 * Search back through the directory tree, using the ".." entries.
2367 * Lock each directory in the chain to prevent concurrent renames.
2368 * Fail any attempt to move a directory into one of its own descendants.
2369 * XXX - z_parent_lock can overlap with map or grow locks
2370 */
2371static int
2372zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
2373{
2374	zfs_zlock_t	*zl;
2375	znode_t		*zp = tdzp;
2376	uint64_t	rootid = zp->z_zfsvfs->z_root;
2377	uint64_t	*oidp = &zp->z_id;
2378	krwlock_t	*rwlp = &szp->z_parent_lock;
2379	krw_t		rw = RW_WRITER;
2380
2381	/*
2382	 * First pass write-locks szp and compares to zp->z_id.
2383	 * Later passes read-lock zp and compare to zp->z_parent.
2384	 */
2385	do {
2386		if (!rw_tryenter(rwlp, rw)) {
2387			/*
2388			 * Another thread is renaming in this path.
2389			 * Note that if we are a WRITER, we don't have any
2390			 * parent_locks held yet.
2391			 */
2392			if (rw == RW_READER && zp->z_id > szp->z_id) {
2393				/*
2394				 * Drop our locks and restart
2395				 */
2396				zfs_rename_unlock(&zl);
2397				*zlpp = NULL;
2398				zp = tdzp;
2399				oidp = &zp->z_id;
2400				rwlp = &szp->z_parent_lock;
2401				rw = RW_WRITER;
2402				continue;
2403			} else {
2404				/*
2405				 * Wait for other thread to drop its locks
2406				 */
2407				rw_enter(rwlp, rw);
2408			}
2409		}
2410
2411		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
2412		zl->zl_rwlock = rwlp;
2413		zl->zl_znode = NULL;
2414		zl->zl_next = *zlpp;
2415		*zlpp = zl;
2416
2417		if (*oidp == szp->z_id)		/* We're a descendant of szp */
2418			return (EINVAL);
2419
2420		if (*oidp == rootid)		/* We've hit the top */
2421			return (0);
2422
2423		if (rw == RW_READER) {		/* i.e. not the first pass */
2424			int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp);
2425			if (error)
2426				return (error);
2427			zl->zl_znode = zp;
2428		}
2429		oidp = &zp->z_phys->zp_parent;
2430		rwlp = &zp->z_parent_lock;
2431		rw = RW_READER;
2432
2433	} while (zp->z_id != sdzp->z_id);
2434
2435	return (0);
2436}
2437
2438/*
2439 * Move an entry from the provided source directory to the target
2440 * directory.  Change the entry name as indicated.
2441 *
2442 *	IN:	sdvp	- Source directory containing the "old entry".
2443 *		snm	- Old entry name.
2444 *		tdvp	- Target directory to contain the "new entry".
2445 *		tnm	- New entry name.
2446 *		cr	- credentials of caller.
2447 *
2448 *	RETURN:	0 if success
2449 *		error code if failure
2450 *
2451 * Timestamps:
2452 *	sdvp,tdvp - ctime|mtime updated
2453 */
2454static int
2455zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr)
2456{
2457	znode_t		*tdzp, *szp, *tzp;
2458	znode_t		*sdzp = VTOZ(sdvp);
2459	zfsvfs_t	*zfsvfs = sdzp->z_zfsvfs;
2460	zilog_t		*zilog = zfsvfs->z_log;
2461	vnode_t		*realvp;
2462	zfs_dirlock_t	*sdl, *tdl;
2463	dmu_tx_t	*tx;
2464	zfs_zlock_t	*zl;
2465	int		cmp, serr, terr, error;
2466
2467	ZFS_ENTER(zfsvfs);
2468
2469	/*
2470	 * Make sure we have the real vp for the target directory.
2471	 */
2472	if (VOP_REALVP(tdvp, &realvp) == 0)
2473		tdvp = realvp;
2474
2475	if (tdvp->v_vfsp != sdvp->v_vfsp) {
2476		ZFS_EXIT(zfsvfs);
2477		return (EXDEV);
2478	}
2479
2480	tdzp = VTOZ(tdvp);
2481top:
2482	szp = NULL;
2483	tzp = NULL;
2484	zl = NULL;
2485
2486	/*
2487	 * This is to prevent the creation of links into attribute space
2488	 * by renaming a linked file into/outof an attribute directory.
2489	 * See the comment in zfs_link() for why this is considered bad.
2490	 */
2491	if ((tdzp->z_phys->zp_flags & ZFS_XATTR) !=
2492	    (sdzp->z_phys->zp_flags & ZFS_XATTR)) {
2493		ZFS_EXIT(zfsvfs);
2494		return (EINVAL);
2495	}
2496
2497	/*
2498	 * Lock source and target directory entries.  To prevent deadlock,
2499	 * a lock ordering must be defined.  We lock the directory with
2500	 * the smallest object id first, or if it's a tie, the one with
2501	 * the lexically first name.
2502	 */
2503	if (sdzp->z_id < tdzp->z_id) {
2504		cmp = -1;
2505	} else if (sdzp->z_id > tdzp->z_id) {
2506		cmp = 1;
2507	} else {
2508		cmp = strcmp(snm, tnm);
2509		if (cmp == 0) {
2510			/*
2511			 * POSIX: "If the old argument and the new argument
2512			 * both refer to links to the same existing file,
2513			 * the rename() function shall return successfully
2514			 * and perform no other action."
2515			 */
2516			ZFS_EXIT(zfsvfs);
2517			return (0);
2518		}
2519	}
2520	if (cmp < 0) {
2521		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS);
2522		terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0);
2523	} else {
2524		terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0);
2525		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS);
2526	}
2527
2528	if (serr) {
2529		/*
2530		 * Source entry invalid or not there.
2531		 */
2532		if (!terr) {
2533			zfs_dirent_unlock(tdl);
2534			if (tzp)
2535				VN_RELE(ZTOV(tzp));
2536		}
2537		if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0)
2538			serr = EINVAL;
2539		ZFS_EXIT(zfsvfs);
2540		return (serr);
2541	}
2542	if (terr) {
2543		zfs_dirent_unlock(sdl);
2544		VN_RELE(ZTOV(szp));
2545		if (strcmp(tnm, "..") == 0)
2546			terr = EINVAL;
2547		ZFS_EXIT(zfsvfs);
2548		return (terr);
2549	}
2550
2551	/*
2552	 * Must have write access at the source to remove the old entry
2553	 * and write access at the target to create the new entry.
2554	 * Note that if target and source are the same, this can be
2555	 * done in a single check.
2556	 */
2557
2558	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
2559		goto out;
2560
2561	if (ZTOV(szp)->v_type == VDIR) {
2562		/*
2563		 * Check to make sure rename is valid.
2564		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
2565		 */
2566		if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
2567			goto out;
2568	}
2569
2570	/*
2571	 * Does target exist?
2572	 */
2573	if (tzp) {
2574		/*
2575		 * Source and target must be the same type.
2576		 */
2577		if (ZTOV(szp)->v_type == VDIR) {
2578			if (ZTOV(tzp)->v_type != VDIR) {
2579				error = ENOTDIR;
2580				goto out;
2581			}
2582		} else {
2583			if (ZTOV(tzp)->v_type == VDIR) {
2584				error = EISDIR;
2585				goto out;
2586			}
2587		}
2588		/*
2589		 * POSIX dictates that when the source and target
2590		 * entries refer to the same file object, rename
2591		 * must do nothing and exit without error.
2592		 */
2593		if (szp->z_id == tzp->z_id) {
2594			error = 0;
2595			goto out;
2596		}
2597	}
2598
2599	vnevent_rename_src(ZTOV(szp));
2600	if (tzp)
2601		vnevent_rename_dest(ZTOV(tzp));
2602
2603	tx = dmu_tx_create(zfsvfs->z_os);
2604	dmu_tx_hold_bonus(tx, szp->z_id);	/* nlink changes */
2605	dmu_tx_hold_bonus(tx, sdzp->z_id);	/* nlink changes */
2606	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
2607	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
2608	if (sdzp != tdzp)
2609		dmu_tx_hold_bonus(tx, tdzp->z_id);	/* nlink changes */
2610	if (tzp)
2611		dmu_tx_hold_bonus(tx, tzp->z_id);	/* parent changes */
2612	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2613	error = dmu_tx_assign(tx, zfsvfs->z_assign);
2614	if (error) {
2615		if (zl != NULL)
2616			zfs_rename_unlock(&zl);
2617		zfs_dirent_unlock(sdl);
2618		zfs_dirent_unlock(tdl);
2619		VN_RELE(ZTOV(szp));
2620		if (tzp)
2621			VN_RELE(ZTOV(tzp));
2622		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
2623			dmu_tx_wait(tx);
2624			dmu_tx_abort(tx);
2625			goto top;
2626		}
2627		dmu_tx_abort(tx);
2628		ZFS_EXIT(zfsvfs);
2629		return (error);
2630	}
2631
2632	if (tzp)	/* Attempt to remove the existing target */
2633		error = zfs_link_destroy(tdl, tzp, tx, 0, NULL);
2634
2635	if (error == 0) {
2636		error = zfs_link_create(tdl, szp, tx, ZRENAMING);
2637		if (error == 0) {
2638			error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
2639			ASSERT(error == 0);
2640			zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
2641			    sdl->dl_name, tdzp, tdl->dl_name, szp);
2642		}
2643#ifdef FREEBSD_NAMECACHE
2644		if (error == 0) {
2645			cache_purge(sdvp);
2646			cache_purge(tdvp);
2647		}
2648#endif
2649	}
2650
2651	dmu_tx_commit(tx);
2652out:
2653	if (zl != NULL)
2654		zfs_rename_unlock(&zl);
2655
2656	zfs_dirent_unlock(sdl);
2657	zfs_dirent_unlock(tdl);
2658
2659	VN_RELE(ZTOV(szp));
2660	if (tzp)
2661		VN_RELE(ZTOV(tzp));
2662
2663	ZFS_EXIT(zfsvfs);
2664
2665	return (error);
2666}
2667
2668/*
2669 * Insert the indicated symbolic reference entry into the directory.
2670 *
2671 *	IN:	dvp	- Directory to contain new symbolic link.
2672 *		link	- Name for new symlink entry.
2673 *		vap	- Attributes of new entry.
2674 *		target	- Target path of new symlink.
2675 *		cr	- credentials of caller.
2676 *
2677 *	RETURN:	0 if success
2678 *		error code if failure
2679 *
2680 * Timestamps:
2681 *	dvp - ctime|mtime updated
2682 */
2683static int
2684zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, cred_t *cr, kthread_t *td)
2685{
2686	znode_t		*zp, *dzp = VTOZ(dvp);
2687	zfs_dirlock_t	*dl;
2688	dmu_tx_t	*tx;
2689	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2690	zilog_t		*zilog = zfsvfs->z_log;
2691	uint64_t	zoid;
2692	int		len = strlen(link);
2693	int		error;
2694
2695	ASSERT(vap->va_type == VLNK);
2696
2697	ZFS_ENTER(zfsvfs);
2698top:
2699	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
2700		ZFS_EXIT(zfsvfs);
2701		return (error);
2702	}
2703
2704	if (len > MAXPATHLEN) {
2705		ZFS_EXIT(zfsvfs);
2706		return (ENAMETOOLONG);
2707	}
2708
2709	/*
2710	 * Attempt to lock directory; fail if entry already exists.
2711	 */
2712	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZNEW)) {
2713		ZFS_EXIT(zfsvfs);
2714		return (error);
2715	}
2716
2717	tx = dmu_tx_create(zfsvfs->z_os);
2718	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
2719	dmu_tx_hold_bonus(tx, dzp->z_id);
2720	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
2721	if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
2722		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE);
2723	error = dmu_tx_assign(tx, zfsvfs->z_assign);
2724	if (error) {
2725		zfs_dirent_unlock(dl);
2726		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
2727			dmu_tx_wait(tx);
2728			dmu_tx_abort(tx);
2729			goto top;
2730		}
2731		dmu_tx_abort(tx);
2732		ZFS_EXIT(zfsvfs);
2733		return (error);
2734	}
2735
2736	dmu_buf_will_dirty(dzp->z_dbuf, tx);
2737
2738	/*
2739	 * Create a new object for the symlink.
2740	 * Put the link content into bonus buffer if it will fit;
2741	 * otherwise, store it just like any other file data.
2742	 */
2743	zoid = 0;
2744	if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) {
2745		zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, len);
2746		if (len != 0)
2747			bcopy(link, zp->z_phys + 1, len);
2748	} else {
2749		dmu_buf_t *dbp;
2750
2751		zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);
2752
2753		/*
2754		 * Nothing can access the znode yet so no locking needed
2755		 * for growing the znode's blocksize.
2756		 */
2757		zfs_grow_blocksize(zp, len, tx);
2758
2759		VERIFY(0 == dmu_buf_hold(zfsvfs->z_os, zoid, 0, FTAG, &dbp));
2760		dmu_buf_will_dirty(dbp, tx);
2761
2762		ASSERT3U(len, <=, dbp->db_size);
2763		bcopy(link, dbp->db_data, len);
2764		dmu_buf_rele(dbp, FTAG);
2765	}
2766	zp->z_phys->zp_size = len;
2767
2768	/*
2769	 * Insert the new object into the directory.
2770	 */
2771	(void) zfs_link_create(dl, zp, tx, ZNEW);
2772out:
2773	if (error == 0) {
2774		zfs_log_symlink(zilog, tx, TX_SYMLINK, dzp, zp, name, link);
2775		*vpp = ZTOV(zp);
2776		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
2777	}
2778
2779	dmu_tx_commit(tx);
2780
2781	zfs_dirent_unlock(dl);
2782
2783	ZFS_EXIT(zfsvfs);
2784	return (error);
2785}
2786
2787/*
2788 * Return, in the buffer contained in the provided uio structure,
2789 * the symbolic path referred to by vp.
2790 *
2791 *	IN:	vp	- vnode of symbolic link.
2792 *		uoip	- structure to contain the link path.
2793 *		cr	- credentials of caller.
2794 *
2795 *	OUT:	uio	- structure to contain the link path.
2796 *
2797 *	RETURN:	0 if success
2798 *		error code if failure
2799 *
2800 * Timestamps:
2801 *	vp - atime updated
2802 */
2803/* ARGSUSED */
2804static int
2805zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr)
2806{
2807	znode_t		*zp = VTOZ(vp);
2808	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2809	size_t		bufsz;
2810	int		error;
2811
2812	ZFS_ENTER(zfsvfs);
2813
2814	bufsz = (size_t)zp->z_phys->zp_size;
2815	if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) {
2816		error = uiomove(zp->z_phys + 1,
2817		    MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
2818	} else {
2819		dmu_buf_t *dbp;
2820		error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp);
2821		if (error) {
2822			ZFS_EXIT(zfsvfs);
2823			return (error);
2824		}
2825		error = uiomove(dbp->db_data,
2826		    MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
2827		dmu_buf_rele(dbp, FTAG);
2828	}
2829
2830	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2831	ZFS_EXIT(zfsvfs);
2832	return (error);
2833}
2834
2835/*
2836 * Insert a new entry into directory tdvp referencing svp.
2837 *
2838 *	IN:	tdvp	- Directory to contain new entry.
2839 *		svp	- vnode of new entry.
2840 *		name	- name of new entry.
2841 *		cr	- credentials of caller.
2842 *
2843 *	RETURN:	0 if success
2844 *		error code if failure
2845 *
2846 * Timestamps:
2847 *	tdvp - ctime|mtime updated
2848 *	 svp - ctime updated
2849 */
2850/* ARGSUSED */
2851static int
2852zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr)
2853{
2854	znode_t		*dzp = VTOZ(tdvp);
2855	znode_t		*tzp, *szp;
2856	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2857	zilog_t		*zilog = zfsvfs->z_log;
2858	zfs_dirlock_t	*dl;
2859	dmu_tx_t	*tx;
2860	vnode_t		*realvp;
2861	int		error;
2862
2863	ASSERT(tdvp->v_type == VDIR);
2864
2865	ZFS_ENTER(zfsvfs);
2866
2867	if (VOP_REALVP(svp, &realvp) == 0)
2868		svp = realvp;
2869
2870	if (svp->v_vfsp != tdvp->v_vfsp) {
2871		ZFS_EXIT(zfsvfs);
2872		return (EXDEV);
2873	}
2874
2875	szp = VTOZ(svp);
2876top:
2877	/*
2878	 * We do not support links between attributes and non-attributes
2879	 * because of the potential security risk of creating links
2880	 * into "normal" file space in order to circumvent restrictions
2881	 * imposed in attribute space.
2882	 */
2883	if ((szp->z_phys->zp_flags & ZFS_XATTR) !=
2884	    (dzp->z_phys->zp_flags & ZFS_XATTR)) {
2885		ZFS_EXIT(zfsvfs);
2886		return (EINVAL);
2887	}
2888
2889	/*
2890	 * POSIX dictates that we return EPERM here.
2891	 * Better choices include ENOTSUP or EISDIR.
2892	 */
2893	if (svp->v_type == VDIR) {
2894		ZFS_EXIT(zfsvfs);
2895		return (EPERM);
2896	}
2897
2898	if ((uid_t)szp->z_phys->zp_uid != crgetuid(cr) &&
2899	    secpolicy_basic_link(cr) != 0) {
2900		ZFS_EXIT(zfsvfs);
2901		return (EPERM);
2902	}
2903
2904	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
2905		ZFS_EXIT(zfsvfs);
2906		return (error);
2907	}
2908
2909	/*
2910	 * Attempt to lock directory; fail if entry already exists.
2911	 */
2912	if (error = zfs_dirent_lock(&dl, dzp, name, &tzp, ZNEW)) {
2913		ZFS_EXIT(zfsvfs);
2914		return (error);
2915	}
2916
2917	tx = dmu_tx_create(zfsvfs->z_os);
2918	dmu_tx_hold_bonus(tx, szp->z_id);
2919	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
2920	error = dmu_tx_assign(tx, zfsvfs->z_assign);
2921	if (error) {
2922		zfs_dirent_unlock(dl);
2923		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
2924			dmu_tx_wait(tx);
2925			dmu_tx_abort(tx);
2926			goto top;
2927		}
2928		dmu_tx_abort(tx);
2929		ZFS_EXIT(zfsvfs);
2930		return (error);
2931	}
2932
2933	error = zfs_link_create(dl, szp, tx, 0);
2934
2935	if (error == 0)
2936		zfs_log_link(zilog, tx, TX_LINK, dzp, szp, name);
2937
2938	dmu_tx_commit(tx);
2939
2940	zfs_dirent_unlock(dl);
2941
2942	ZFS_EXIT(zfsvfs);
2943	return (error);
2944}
2945
2946void
2947zfs_inactive(vnode_t *vp, cred_t *cr)
2948{
2949	znode_t	*zp = VTOZ(vp);
2950	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2951	int error;
2952
2953	rw_enter(&zfsvfs->z_um_lock, RW_READER);
2954	if (zfsvfs->z_unmounted2) {
2955		ASSERT(zp->z_dbuf_held == 0);
2956
2957		mutex_enter(&zp->z_lock);
2958		VI_LOCK(vp);
2959		vp->v_count = 0; /* count arrives as 1 */
2960		VI_UNLOCK(vp);
2961		if (zp->z_dbuf == NULL) {
2962			mutex_exit(&zp->z_lock);
2963			zfs_znode_free(zp);
2964		} else {
2965			mutex_exit(&zp->z_lock);
2966		}
2967		rw_exit(&zfsvfs->z_um_lock);
2968		VFS_RELE(zfsvfs->z_vfs);
2969		return;
2970	}
2971
2972	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
2973		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
2974
2975		dmu_tx_hold_bonus(tx, zp->z_id);
2976		error = dmu_tx_assign(tx, TXG_WAIT);
2977		if (error) {
2978			dmu_tx_abort(tx);
2979		} else {
2980			dmu_buf_will_dirty(zp->z_dbuf, tx);
2981			mutex_enter(&zp->z_lock);
2982			zp->z_atime_dirty = 0;
2983			mutex_exit(&zp->z_lock);
2984			dmu_tx_commit(tx);
2985		}
2986	}
2987
2988	zfs_zinactive(zp);
2989	rw_exit(&zfsvfs->z_um_lock);
2990}
2991
2992CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
2993CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
2994
2995static int
2996zfs_fid(vnode_t *vp, fid_t *fidp)
2997{
2998	znode_t		*zp = VTOZ(vp);
2999	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
3000	uint32_t	gen = (uint32_t)zp->z_phys->zp_gen;
3001	uint64_t	object = zp->z_id;
3002	zfid_short_t	*zfid;
3003	int		size, i;
3004
3005	ZFS_ENTER(zfsvfs);
3006
3007	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
3008	fidp->fid_len = size;
3009
3010	zfid = (zfid_short_t *)fidp;
3011
3012	zfid->zf_len = size;
3013
3014	for (i = 0; i < sizeof (zfid->zf_object); i++)
3015		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
3016
3017	/* Must have a non-zero generation number to distinguish from .zfs */
3018	if (gen == 0)
3019		gen = 1;
3020	for (i = 0; i < sizeof (zfid->zf_gen); i++)
3021		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
3022
3023	if (size == LONG_FID_LEN) {
3024		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
3025		zfid_long_t	*zlfid;
3026
3027		zlfid = (zfid_long_t *)fidp;
3028
3029		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
3030			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
3031
3032		/* XXX - this should be the generation number for the objset */
3033		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
3034			zlfid->zf_setgen[i] = 0;
3035	}
3036
3037	ZFS_EXIT(zfsvfs);
3038	return (0);
3039}
3040
3041static int
3042zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr)
3043{
3044	znode_t		*zp, *xzp;
3045	zfsvfs_t	*zfsvfs;
3046	zfs_dirlock_t	*dl;
3047	int		error;
3048
3049	switch (cmd) {
3050	case _PC_LINK_MAX:
3051		*valp = INT_MAX;
3052		return (0);
3053
3054	case _PC_FILESIZEBITS:
3055		*valp = 64;
3056		return (0);
3057
3058#if 0
3059	case _PC_XATTR_EXISTS:
3060		zp = VTOZ(vp);
3061		zfsvfs = zp->z_zfsvfs;
3062		ZFS_ENTER(zfsvfs);
3063		*valp = 0;
3064		error = zfs_dirent_lock(&dl, zp, "", &xzp,
3065		    ZXATTR | ZEXISTS | ZSHARED);
3066		if (error == 0) {
3067			zfs_dirent_unlock(dl);
3068			if (!zfs_dirempty(xzp))
3069				*valp = 1;
3070			VN_RELE(ZTOV(xzp));
3071		} else if (error == ENOENT) {
3072			/*
3073			 * If there aren't extended attributes, it's the
3074			 * same as having zero of them.
3075			 */
3076			error = 0;
3077		}
3078		ZFS_EXIT(zfsvfs);
3079		return (error);
3080#endif
3081
3082	case _PC_ACL_EXTENDED:
3083		*valp = 0;	/* TODO */
3084		return (0);
3085
3086	case _PC_MIN_HOLE_SIZE:
3087		*valp = (int)SPA_MINBLOCKSIZE;
3088		return (0);
3089
3090	default:
3091		return (EOPNOTSUPP);
3092	}
3093}
3094
#ifdef TODO
/*
 * Fetch the ACL of vp into *vsecp by calling zfs_getacl() between
 * ZFS_ENTER and ZFS_EXIT.  flag is not referenced.  Only compiled when
 * TODO is defined.
 */
/*ARGSUSED*/
static int
zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	int		rc;

	ZFS_ENTER(zfsvfs);
	rc = zfs_getacl(zp, vsecp, cr);
	ZFS_EXIT(zfsvfs);

	return (rc);
}
#endif	/* TODO */
3111
#ifdef TODO
/*
 * Store the ACL described by *vsecp on vp by calling zfs_setacl() between
 * ZFS_ENTER and ZFS_EXIT.  flag is not referenced.  Only compiled when
 * TODO is defined.
 */
/*ARGSUSED*/
static int
zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	int		rc;

	ZFS_ENTER(zfsvfs);
	rc = zfs_setacl(zp, vsecp, cr);
	ZFS_EXIT(zfsvfs);

	return (rc);
}
#endif	/* TODO */
3127
3128static int
3129zfs_freebsd_open(ap)
3130	struct vop_open_args /* {
3131		struct vnode *a_vp;
3132		int a_mode;
3133		struct ucred *a_cred;
3134		struct thread *a_td;
3135	} */ *ap;
3136{
3137	vnode_t	*vp = ap->a_vp;
3138	znode_t *zp = VTOZ(vp);
3139	int error;
3140
3141	error = zfs_open(&vp, ap->a_mode, ap->a_cred);
3142	if (error == 0)
3143		vnode_create_vobject(vp, zp->z_phys->zp_size, ap->a_td);
3144	return (error);
3145}
3146
3147static int
3148zfs_freebsd_close(ap)
3149	struct vop_close_args /* {
3150		struct vnode *a_vp;
3151		int  a_fflag;
3152		struct ucred *a_cred;
3153		struct thread *a_td;
3154	} */ *ap;
3155{
3156
3157	return (zfs_close(ap->a_vp, ap->a_fflag, 0, 0, ap->a_cred));
3158}
3159
3160static int
3161zfs_freebsd_ioctl(ap)
3162	struct vop_ioctl_args /* {
3163		struct vnode *a_vp;
3164		u_long a_command;
3165		caddr_t a_data;
3166		int a_fflag;
3167		struct ucred *cred;
3168		struct thread *td;
3169	} */ *ap;
3170{
3171
3172	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
3173	    ap->a_fflag, ap->a_cred, NULL));
3174}
3175
3176static int
3177zfs_freebsd_read(ap)
3178	struct vop_read_args /* {
3179		struct vnode *a_vp;
3180		struct uio *a_uio;
3181		int a_ioflag;
3182		struct ucred *a_cred;
3183	} */ *ap;
3184{
3185
3186	return (zfs_read(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL));
3187}
3188
3189static int
3190zfs_freebsd_write(ap)
3191	struct vop_write_args /* {
3192		struct vnode *a_vp;
3193		struct uio *a_uio;
3194		int a_ioflag;
3195		struct ucred *a_cred;
3196	} */ *ap;
3197{
3198
3199	return (zfs_write(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL));
3200}
3201
3202static int
3203zfs_freebsd_access(ap)
3204	struct vop_access_args /* {
3205		struct vnode *a_vp;
3206		int  a_mode;
3207		struct ucred *a_cred;
3208		struct thread *a_td;
3209	} */ *ap;
3210{
3211
3212	return (zfs_access(ap->a_vp, ap->a_mode, 0, ap->a_cred));
3213}
3214
3215static int
3216zfs_freebsd_lookup(ap)
3217	struct vop_lookup_args /* {
3218		struct vnode *a_dvp;
3219		struct vnode **a_vpp;
3220		struct componentname *a_cnp;
3221	} */ *ap;
3222{
3223	struct componentname *cnp = ap->a_cnp;
3224	char nm[NAME_MAX + 1];
3225
3226	ASSERT(cnp->cn_namelen < sizeof(nm));
3227	strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
3228
3229	return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
3230	    cnp->cn_cred, cnp->cn_thread));
3231}
3232
3233static int
3234zfs_freebsd_create(ap)
3235	struct vop_create_args /* {
3236		struct vnode *a_dvp;
3237		struct vnode **a_vpp;
3238		struct componentname *a_cnp;
3239		struct vattr *a_vap;
3240	} */ *ap;
3241{
3242	struct componentname *cnp = ap->a_cnp;
3243	vattr_t *vap = ap->a_vap;
3244	int mode;
3245
3246	ASSERT(cnp->cn_flags & SAVENAME);
3247
3248	vattr_init_mask(vap);
3249	mode = vap->va_mode & ALLPERMS;
3250
3251	return (zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
3252	    ap->a_vpp, cnp->cn_cred));
3253}
3254
3255static int
3256zfs_freebsd_remove(ap)
3257	struct vop_remove_args /* {
3258		struct vnode *a_dvp;
3259		struct vnode *a_vp;
3260		struct componentname *a_cnp;
3261	} */ *ap;
3262{
3263
3264	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
3265
3266	return (zfs_remove(ap->a_dvp, ap->a_cnp->cn_nameptr,
3267	    ap->a_cnp->cn_cred));
3268}
3269
3270static int
3271zfs_freebsd_mkdir(ap)
3272	struct vop_mkdir_args /* {
3273		struct vnode *a_dvp;
3274		struct vnode **a_vpp;
3275		struct componentname *a_cnp;
3276		struct vattr *a_vap;
3277	} */ *ap;
3278{
3279	vattr_t *vap = ap->a_vap;
3280
3281	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
3282
3283	vattr_init_mask(vap);
3284
3285	return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
3286	    ap->a_cnp->cn_cred));
3287}
3288
3289static int
3290zfs_freebsd_rmdir(ap)
3291	struct vop_rmdir_args /* {
3292		struct vnode *a_dvp;
3293		struct vnode *a_vp;
3294		struct componentname *a_cnp;
3295	} */ *ap;
3296{
3297	struct componentname *cnp = ap->a_cnp;
3298
3299	ASSERT(cnp->cn_flags & SAVENAME);
3300
3301	return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred));
3302}
3303
3304static int
3305zfs_freebsd_readdir(ap)
3306	struct vop_readdir_args /* {
3307		struct vnode *a_vp;
3308		struct uio *a_uio;
3309		struct ucred *a_cred;
3310		int *a_eofflag;
3311		int *a_ncookies;
3312		u_long **a_cookies;
3313	} */ *ap;
3314{
3315
3316	return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
3317	    ap->a_ncookies, ap->a_cookies));
3318}
3319
3320static int
3321zfs_freebsd_fsync(ap)
3322	struct vop_fsync_args /* {
3323		struct vnode *a_vp;
3324		int a_waitfor;
3325		struct thread *a_td;
3326	} */ *ap;
3327{
3328
3329	vop_stdfsync(ap);
3330	return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred));
3331}
3332
3333static int
3334zfs_freebsd_getattr(ap)
3335	struct vop_getattr_args /* {
3336		struct vnode *a_vp;
3337		struct vattr *a_vap;
3338		struct ucred *a_cred;
3339		struct thread *a_td;
3340	} */ *ap;
3341{
3342
3343	return (zfs_getattr(ap->a_vp, ap->a_vap, 0, ap->a_cred));
3344}
3345
3346static int
3347zfs_freebsd_setattr(ap)
3348	struct vop_setattr_args /* {
3349		struct vnode *a_vp;
3350		struct vattr *a_vap;
3351		struct ucred *a_cred;
3352		struct thread *a_td;
3353	} */ *ap;
3354{
3355	vattr_t *vap = ap->a_vap;
3356
3357	/* No support for FreeBSD's chflags(2). */
3358	if (vap->va_flags != VNOVAL)
3359		return (EOPNOTSUPP);
3360
3361	vattr_init_mask(vap);
3362	vap->va_mask &= ~AT_NOSET;
3363
3364	return (zfs_setattr(ap->a_vp, vap, 0, ap->a_cred, NULL));
3365}
3366
3367static int
3368zfs_freebsd_rename(ap)
3369	struct vop_rename_args  /* {
3370		struct vnode *a_fdvp;
3371		struct vnode *a_fvp;
3372		struct componentname *a_fcnp;
3373		struct vnode *a_tdvp;
3374		struct vnode *a_tvp;
3375		struct componentname *a_tcnp;
3376	} */ *ap;
3377{
3378	vnode_t *fdvp = ap->a_fdvp;
3379	vnode_t *fvp = ap->a_fvp;
3380	vnode_t *tdvp = ap->a_tdvp;
3381	vnode_t *tvp = ap->a_tvp;
3382	int error;
3383
3384	ASSERT(ap->a_fcnp->cn_flags & SAVENAME);
3385	ASSERT(ap->a_tcnp->cn_flags & SAVENAME);
3386
3387	error = zfs_rename(fdvp, ap->a_fcnp->cn_nameptr, tdvp,
3388	    ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred);
3389
3390	if (tdvp == tvp)
3391		VN_RELE(tdvp);
3392	else
3393		VN_URELE(tdvp);
3394	if (tvp)
3395		VN_URELE(tvp);
3396	VN_RELE(fdvp);
3397	VN_RELE(fvp);
3398
3399	return (error);
3400}
3401
3402static int
3403zfs_freebsd_symlink(ap)
3404	struct vop_symlink_args /* {
3405		struct vnode *a_dvp;
3406		struct vnode **a_vpp;
3407		struct componentname *a_cnp;
3408		struct vattr *a_vap;
3409		char *a_target;
3410	} */ *ap;
3411{
3412	struct componentname *cnp = ap->a_cnp;
3413	vattr_t *vap = ap->a_vap;
3414
3415	ASSERT(cnp->cn_flags & SAVENAME);
3416
3417	vap->va_type = VLNK;	/* FreeBSD: Syscall only sets va_mode. */
3418	vattr_init_mask(vap);
3419
3420	return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
3421	    ap->a_target, cnp->cn_cred, cnp->cn_thread));
3422}
3423
3424static int
3425zfs_freebsd_readlink(ap)
3426	struct vop_readlink_args /* {
3427		struct vnode *a_vp;
3428		struct uio *a_uio;
3429		struct ucred *a_cred;
3430	} */ *ap;
3431{
3432
3433	return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred));
3434}
3435
3436static int
3437zfs_freebsd_link(ap)
3438	struct vop_link_args /* {
3439		struct vnode *a_tdvp;
3440		struct vnode *a_vp;
3441		struct componentname *a_cnp;
3442	} */ *ap;
3443{
3444	struct componentname *cnp = ap->a_cnp;
3445
3446	ASSERT(cnp->cn_flags & SAVENAME);
3447
3448	return (zfs_link(ap->a_tdvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
3449}
3450
3451static int
3452zfs_freebsd_inactive(ap)
3453	struct vop_inactive_args /* {
3454		struct vnode *a_vp;
3455		struct thread *a_td;
3456	} */ *ap;
3457{
3458	vnode_t *vp = ap->a_vp;
3459
3460	zfs_inactive(vp, ap->a_td->td_ucred);
3461	return (0);
3462}
3463
3464static int
3465zfs_freebsd_reclaim(ap)
3466	struct vop_reclaim_args /* {
3467		struct vnode *a_vp;
3468		struct thread *a_td;
3469	} */ *ap;
3470{
3471	vnode_t	*vp = ap->a_vp;
3472	znode_t	*zp = VTOZ(vp);
3473	zfsvfs_t *zfsvfs;
3474	int rele = 1;
3475
3476	ASSERT(zp != NULL);
3477
3478	/*
3479	 * Destroy the vm object and flush associated pages.
3480	 */
3481	vnode_destroy_vobject(vp);
3482
3483	mutex_enter(&zp->z_lock);
3484	ASSERT(zp->z_phys);
3485	ASSERT(zp->z_dbuf_held);
3486	zfsvfs = zp->z_zfsvfs;
3487	if (!zp->z_unlinked) {
3488		zp->z_dbuf_held = 0;
3489		ZTOV(zp) = NULL;
3490		mutex_exit(&zp->z_lock);
3491		dmu_buf_rele(zp->z_dbuf, NULL);
3492	} else {
3493		mutex_exit(&zp->z_lock);
3494	}
3495	VI_LOCK(vp);
3496	if (vp->v_count > 0)
3497		rele = 0;
3498	vp->v_data = NULL;
3499	ASSERT(vp->v_holdcnt >= 1);
3500	VI_UNLOCK(vp);
3501	if (!zp->z_unlinked && rele)
3502		VFS_RELE(zfsvfs->z_vfs);
3503	return (0);
3504}
3505
3506static int
3507zfs_freebsd_fid(ap)
3508	struct vop_fid_args /* {
3509		struct vnode *a_vp;
3510		struct fid *a_fid;
3511	} */ *ap;
3512{
3513
3514	return (zfs_fid(ap->a_vp, (void *)ap->a_fid));
3515}
3516
3517static int
3518zfs_freebsd_pathconf(ap)
3519	struct vop_pathconf_args /* {
3520		struct vnode *a_vp;
3521		int a_name;
3522		register_t *a_retval;
3523	} */ *ap;
3524{
3525	ulong_t val;
3526	int error;
3527
3528	error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred);
3529	if (error == 0)
3530		*ap->a_retval = val;
3531	else if (error == EOPNOTSUPP)
3532		error = vop_stdpathconf(ap);
3533	return (error);
3534}
3535
3536/*
3537 * Advisory record locking support
3538 */
3539static int
3540zfs_freebsd_advlock(ap)
3541	struct vop_advlock_args /* {
3542		struct vnode *a_vp;
3543		caddr_t  a_id;
3544		int  a_op;
3545		struct flock *a_fl;
3546		int  a_flags;
3547	} */ *ap;
3548{
3549	znode_t	*zp = VTOZ(ap->a_vp);
3550
3551	return (lf_advlock(ap, &(zp->z_lockf), zp->z_phys->zp_size));
3552}
3553
3554/*
3555 * Advisory record locking support
3556 */
3557static int
3558zfs_freebsd_advlockasync(ap)
3559	struct vop_advlockasync_args /* {
3560		struct vnode *a_vp;
3561		caddr_t  a_id;
3562		int  a_op;
3563		struct flock *a_fl;
3564		int  a_flags;
3565		struct task *a_task;
3566	} */ *ap;
3567{
3568	znode_t	*zp = VTOZ(ap->a_vp);
3569
3570	return (lf_advlockasync(ap, &(zp->z_lockf), zp->z_phys->zp_size));
3571}
3572
3573struct vop_vector zfs_vnodeops;
3574struct vop_vector zfs_fifoops;
3575
3576struct vop_vector zfs_vnodeops = {
3577	.vop_default =	&default_vnodeops,
3578	.vop_inactive =	zfs_freebsd_inactive,
3579	.vop_reclaim =	zfs_freebsd_reclaim,
3580	.vop_access =	zfs_freebsd_access,
3581#ifdef FREEBSD_NAMECACHE
3582	.vop_lookup =	vfs_cache_lookup,
3583	.vop_cachedlookup = zfs_freebsd_lookup,
3584#else
3585	.vop_lookup =	zfs_freebsd_lookup,
3586#endif
3587	.vop_getattr =	zfs_freebsd_getattr,
3588	.vop_setattr =	zfs_freebsd_setattr,
3589	.vop_create =	zfs_freebsd_create,
3590	.vop_mknod =	zfs_freebsd_create,
3591	.vop_mkdir =	zfs_freebsd_mkdir,
3592	.vop_readdir =	zfs_freebsd_readdir,
3593	.vop_fsync =	zfs_freebsd_fsync,
3594	.vop_open =	zfs_freebsd_open,
3595	.vop_close =	zfs_freebsd_close,
3596	.vop_rmdir =	zfs_freebsd_rmdir,
3597	.vop_ioctl =	zfs_freebsd_ioctl,
3598	.vop_link =	zfs_freebsd_link,
3599	.vop_symlink =	zfs_freebsd_symlink,
3600	.vop_readlink =	zfs_freebsd_readlink,
3601	.vop_read =	zfs_freebsd_read,
3602	.vop_write =	zfs_freebsd_write,
3603	.vop_remove =	zfs_freebsd_remove,
3604	.vop_rename =	zfs_freebsd_rename,
3605	.vop_advlock =	zfs_freebsd_advlock,
3606	.vop_advlockasync = zfs_freebsd_advlockasync,
3607	.vop_pathconf =	zfs_freebsd_pathconf,
3608	.vop_bmap =	VOP_EOPNOTSUPP,
3609	.vop_fid =	zfs_freebsd_fid,
3610};
3611
3612struct vop_vector zfs_fifoops = {
3613	.vop_default =	&fifo_specops,
3614	.vop_fsync =	VOP_PANIC,
3615	.vop_access =	zfs_freebsd_access,
3616	.vop_getattr =	zfs_freebsd_getattr,
3617	.vop_inactive =	zfs_freebsd_inactive,
3618	.vop_read =	VOP_PANIC,
3619	.vop_reclaim =	zfs_freebsd_reclaim,
3620	.vop_setattr =	zfs_freebsd_setattr,
3621	.vop_write =	VOP_PANIC,
3622	.vop_fid =	zfs_freebsd_fid,
3623};
3624