zfs_vnops.c revision 175202
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26/* Portions Copyright 2007 Jeremy Teo */
27
28#pragma ident	"%Z%%M%	%I%	%E% SMI"
29
30#include <sys/types.h>
31#include <sys/param.h>
32#include <sys/time.h>
33#include <sys/systm.h>
34#include <sys/sysmacros.h>
35#include <sys/resource.h>
36#include <sys/vfs.h>
37#include <sys/vnode.h>
38#include <sys/file.h>
39#include <sys/stat.h>
40#include <sys/kmem.h>
41#include <sys/taskq.h>
42#include <sys/uio.h>
43#include <sys/atomic.h>
44#include <sys/namei.h>
45#include <sys/mman.h>
46#include <sys/cmn_err.h>
47#include <sys/errno.h>
48#include <sys/unistd.h>
49#include <sys/zfs_vfsops.h>
50#include <sys/zfs_dir.h>
51#include <sys/zfs_acl.h>
52#include <sys/zfs_ioctl.h>
53#include <sys/fs/zfs.h>
54#include <sys/dmu.h>
55#include <sys/spa.h>
56#include <sys/txg.h>
57#include <sys/dbuf.h>
58#include <sys/zap.h>
59#include <sys/dirent.h>
60#include <sys/policy.h>
61#include <sys/sunddi.h>
62#include <sys/filio.h>
63#include <sys/zfs_ctldir.h>
64#include <sys/dnlc.h>
65#include <sys/zfs_rlock.h>
66#include <sys/bio.h>
67#include <sys/buf.h>
68#include <sys/sf_buf.h>
69#include <sys/sched.h>
70
71/*
72 * Programming rules.
73 *
74 * Each vnode op performs some logical unit of work.  To do this, the ZPL must
75 * properly lock its in-core state, create a DMU transaction, do the work,
76 * record this work in the intent log (ZIL), commit the DMU transaction,
77 * and wait for the intent log to commit if it is a synchronous operation.
78 * Moreover, the vnode ops must work in both normal and log replay context.
79 * The ordering of events is important to avoid deadlocks and references
80 * to freed memory.  The example below illustrates the following Big Rules:
81 *
82 *  (1) A check must be made in each zfs thread for a mounted file system.
83 *	This is done avoiding races using ZFS_ENTER(zfsvfs).
84 *	A ZFS_EXIT(zfsvfs) is needed before all returns.
85 *
86 *  (2)	VN_RELE() should always be the last thing except for zil_commit()
87 *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
88 *	First, if it's the last reference, the vnode/znode
89 *	can be freed, so the zp may point to freed memory.  Second, the last
90 *	reference will call zfs_zinactive(), which may induce a lot of work --
91 *	pushing cached pages (which acquires range locks) and syncing out
92 *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
93 *	which could deadlock the system if you were already holding one.
94 *
95 *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
96 *	as they can span dmu_tx_assign() calls.
97 *
98 *  (4)	Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign().
99 *	In normal operation, this will be TXG_NOWAIT.  During ZIL replay,
100 *	it will be a specific txg.  Either way, dmu_tx_assign() never blocks.
101 *	This is critical because we don't want to block while holding locks.
102 *	Note, in particular, that if a lock is sometimes acquired before
103 *	the tx assigns, and sometimes after (e.g. z_lock), then failing to
104 *	use a non-blocking assign can deadlock the system.  The scenario:
105 *
106 *	Thread A has grabbed a lock before calling dmu_tx_assign().
107 *	Thread B is in an already-assigned tx, and blocks for this lock.
108 *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
109 *	forever, because the previous txg can't quiesce until B's tx commits.
110 *
111 *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
112 *	then drop all locks, call dmu_tx_wait(), and try again.
113 *
114 *  (5)	If the operation succeeded, generate the intent log entry for it
115 *	before dropping locks.  This ensures that the ordering of events
116 *	in the intent log matches the order in which they actually occurred.
117 *
118 *  (6)	At the end of each vnode op, the DMU tx must always commit,
119 *	regardless of whether there were any errors.
120 *
121 *  (7)	After dropping all locks, invoke zil_commit(zilog, seq, foid)
122 *	to ensure that synchronous semantics are provided when necessary.
123 *
124 * In general, this is how things should be ordered in each vnode op:
125 *
126 *	ZFS_ENTER(zfsvfs);		// exit if unmounted
127 * top:
128 *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may VN_HOLD())
129 *	rw_enter(...);			// grab any other locks you need
130 *	tx = dmu_tx_create(...);	// get DMU tx
131 *	dmu_tx_hold_*();		// hold each object you might modify
132 *	error = dmu_tx_assign(tx, zfsvfs->z_assign);	// try to assign
133 *	if (error) {
134 *		rw_exit(...);		// drop locks
135 *		zfs_dirent_unlock(dl);	// unlock directory entry
136 *		VN_RELE(...);		// release held vnodes
137 *		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
138 *			dmu_tx_wait(tx);
139 *			dmu_tx_abort(tx);
140 *			goto top;
141 *		}
142 *		dmu_tx_abort(tx);	// abort DMU tx
143 *		ZFS_EXIT(zfsvfs);	// finished in zfs
144 *		return (error);		// really out of space
145 *	}
146 *	error = do_real_work();		// do whatever this VOP does
147 *	if (error == 0)
148 *		zfs_log_*(...);		// on success, make ZIL entry
149 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
150 *	rw_exit(...);			// drop locks
151 *	zfs_dirent_unlock(dl);		// unlock directory entry
152 *	VN_RELE(...);			// release held vnodes
153 *	zil_commit(zilog, seq, foid);	// synchronous when necessary
154 *	ZFS_EXIT(zfsvfs);		// finished in zfs
155 *	return (error);			// done, report error
156 */
157/* ARGSUSED */
158static int
159zfs_open(vnode_t **vpp, int flag, cred_t *cr)
160{
161	znode_t	*zp = VTOZ(*vpp);
162
163	/* Keep a count of the synchronous opens in the znode */
164	if (flag & (FSYNC | FDSYNC))
165		atomic_inc_32(&zp->z_sync_cnt);
166	return (0);
167}
168
169/* ARGSUSED */
170static int
171zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
172{
173	znode_t	*zp = VTOZ(vp);
174
175	/* Decrement the synchronous opens in the znode */
176	if (flag & (FSYNC | FDSYNC))
177		atomic_dec_32(&zp->z_sync_cnt);
178
179	/*
180	 * Clean up any locks held by this process on the vp.
181	 */
182	cleanlocks(vp, ddi_get_pid(), 0);
183	cleanshares(vp, ddi_get_pid());
184
185	return (0);
186}
187
188/*
189 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
190 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
191 */
192static int
193zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
194{
195	znode_t	*zp = VTOZ(vp);
196	uint64_t noff = (uint64_t)*off; /* new offset */
197	uint64_t file_sz;
198	int error;
199	boolean_t hole;
200
201	file_sz = zp->z_phys->zp_size;
202	if (noff >= file_sz)  {
203		return (ENXIO);
204	}
205
206	if (cmd == _FIO_SEEK_HOLE)
207		hole = B_TRUE;
208	else
209		hole = B_FALSE;
210
211	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
212
213	/* end of file? */
214	if ((error == ESRCH) || (noff > file_sz)) {
215		/*
216		 * Handle the virtual hole at the end of file.
217		 */
218		if (hole) {
219			*off = file_sz;
220			return (0);
221		}
222		return (ENXIO);
223	}
224
225	if (noff < *off)
226		return (error);
227	*off = noff;
228	return (error);
229}
230
231/* ARGSUSED */
232static int
233zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
234    int *rvalp)
235{
236	offset_t off;
237	int error;
238	zfsvfs_t *zfsvfs;
239
240	switch (com) {
241	    case _FIOFFS:
242		return (0);
243
244		/*
245		 * The following two ioctls are used by bfu.  Faking out,
246		 * necessary to avoid bfu errors.
247		 */
248	    case _FIOGDIO:
249	    case _FIOSDIO:
250		return (0);
251
252	    case _FIO_SEEK_DATA:
253	    case _FIO_SEEK_HOLE:
254		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
255			return (EFAULT);
256
257		zfsvfs = VTOZ(vp)->z_zfsvfs;
258		ZFS_ENTER(zfsvfs);
259
260		/* offset parameter is in/out */
261		error = zfs_holey(vp, com, &off);
262		ZFS_EXIT(zfsvfs);
263		if (error)
264			return (error);
265		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
266			return (EFAULT);
267		return (0);
268	}
269	return (ENOTTY);
270}
271
272/*
273 * When a file is memory mapped, we must keep the IO data synchronized
274 * between the DMU cache and the memory mapped pages.  What this means:
275 *
276 * On Write:	If we find a memory mapped page, we write to *both*
277 *		the page and the dmu buffer.
278 *
279 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
280 *	the file is memory mapped.
281 */
282static int
283mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx)
284{
285	znode_t *zp = VTOZ(vp);
286	objset_t *os = zp->z_zfsvfs->z_os;
287	vm_object_t obj;
288	vm_page_t m;
289	struct sf_buf *sf;
290	int64_t start, off;
291	int len = nbytes;
292	int error = 0;
293	uint64_t dirbytes;
294
295	ASSERT(vp->v_mount != NULL);
296	obj = vp->v_object;
297	ASSERT(obj != NULL);
298
299	start = uio->uio_loffset;
300	off = start & PAGEOFFSET;
301	dirbytes = 0;
302	VM_OBJECT_LOCK(obj);
303	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
304		uint64_t bytes = MIN(PAGESIZE - off, len);
305		uint64_t fsize;
306
307again:
308		if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
309		    vm_page_is_valid(m, (vm_offset_t)off, bytes)) {
310			uint64_t woff;
311			caddr_t va;
312
313			if (vm_page_sleep_if_busy(m, FALSE, "zfsmwb"))
314				goto again;
315			fsize = obj->un_pager.vnp.vnp_size;
316			vm_page_busy(m);
317			vm_page_lock_queues();
318			vm_page_undirty(m);
319			vm_page_unlock_queues();
320			VM_OBJECT_UNLOCK(obj);
321			if (dirbytes > 0) {
322				error = dmu_write_uio(os, zp->z_id, uio,
323				    dirbytes, tx);
324				dirbytes = 0;
325			}
326			if (error == 0) {
327				sched_pin();
328				sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
329				va = (caddr_t)sf_buf_kva(sf);
330				woff = uio->uio_loffset - off;
331				error = uiomove(va + off, bytes, UIO_WRITE, uio);
332				/*
333				 * The uiomove() above could have been partially
334				 * successful, that's why we call dmu_write()
335				 * below unconditionally. The page was marked
336				 * non-dirty above and we would lose the changes
337				 * without doing so. If the uiomove() failed
338				 * entirely, well, we just write what we got
339				 * before one more time.
340				 */
341				dmu_write(os, zp->z_id, woff,
342				    MIN(PAGESIZE, fsize - woff), va, tx);
343				sf_buf_free(sf);
344				sched_unpin();
345			}
346			VM_OBJECT_LOCK(obj);
347			vm_page_wakeup(m);
348		} else {
349			dirbytes += bytes;
350		}
351		len -= bytes;
352		off = 0;
353		if (error)
354			break;
355	}
356	VM_OBJECT_UNLOCK(obj);
357	if (error == 0 && dirbytes > 0)
358		error = dmu_write_uio(os, zp->z_id, uio, dirbytes, tx);
359	return (error);
360}
361
362/*
363 * When a file is memory mapped, we must keep the IO data synchronized
364 * between the DMU cache and the memory mapped pages.  What this means:
365 *
366 * On Read:	We "read" preferentially from memory mapped pages,
367 *		else we default from the dmu buffer.
368 *
369 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
370 *	the file is memory mapped.
371 */
372static int
373mappedread(vnode_t *vp, int nbytes, uio_t *uio)
374{
375	znode_t *zp = VTOZ(vp);
376	objset_t *os = zp->z_zfsvfs->z_os;
377	vm_object_t obj;
378	vm_page_t m;
379	struct sf_buf *sf;
380	int64_t start, off;
381	caddr_t va;
382	int len = nbytes;
383	int error = 0;
384	uint64_t dirbytes;
385
386	ASSERT(vp->v_mount != NULL);
387	obj = vp->v_object;
388	ASSERT(obj != NULL);
389
390	start = uio->uio_loffset;
391	off = start & PAGEOFFSET;
392	dirbytes = 0;
393	VM_OBJECT_LOCK(obj);
394	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
395		uint64_t bytes = MIN(PAGESIZE - off, len);
396
397again:
398		if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
399		    vm_page_is_valid(m, (vm_offset_t)off, bytes)) {
400			if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb"))
401				goto again;
402			vm_page_busy(m);
403			VM_OBJECT_UNLOCK(obj);
404			if (dirbytes > 0) {
405				error = dmu_read_uio(os, zp->z_id, uio,
406				    dirbytes);
407				dirbytes = 0;
408			}
409			if (error == 0) {
410				sched_pin();
411				sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
412				va = (caddr_t)sf_buf_kva(sf);
413				error = uiomove(va + off, bytes, UIO_READ, uio);
414				sf_buf_free(sf);
415				sched_unpin();
416			}
417			VM_OBJECT_LOCK(obj);
418			vm_page_wakeup(m);
419		} else if (m != NULL && uio->uio_segflg == UIO_NOCOPY) {
420			/*
421			 * The code below is here to make sendfile(2) work
422			 * correctly with ZFS. As pointed out by ups@
423			 * sendfile(2) should be changed to use VOP_GETPAGES(),
424			 * but it pessimize performance of sendfile/UFS, that's
425			 * why I handle this special case in ZFS code.
426			 */
427			if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb"))
428				goto again;
429			vm_page_busy(m);
430			VM_OBJECT_UNLOCK(obj);
431			if (dirbytes > 0) {
432				error = dmu_read_uio(os, zp->z_id, uio,
433				    dirbytes);
434				dirbytes = 0;
435			}
436			if (error == 0) {
437				sched_pin();
438				sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
439				va = (caddr_t)sf_buf_kva(sf);
440				error = dmu_read(os, zp->z_id, start + off,
441				    bytes, (void *)(va + off));
442				sf_buf_free(sf);
443				sched_unpin();
444			}
445			VM_OBJECT_LOCK(obj);
446			vm_page_wakeup(m);
447			if (error == 0)
448				uio->uio_resid -= bytes;
449		} else {
450			dirbytes += bytes;
451		}
452		len -= bytes;
453		off = 0;
454		if (error)
455			break;
456	}
457	VM_OBJECT_UNLOCK(obj);
458	if (error == 0 && dirbytes > 0)
459		error = dmu_read_uio(os, zp->z_id, uio, dirbytes);
460	return (error);
461}
462
463offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
464
465/*
466 * Read bytes from specified file into supplied buffer.
467 *
468 *	IN:	vp	- vnode of file to be read from.
469 *		uio	- structure supplying read location, range info,
470 *			  and return buffer.
471 *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
472 *		cr	- credentials of caller.
473 *
474 *	OUT:	uio	- updated offset and range, buffer filled.
475 *
476 *	RETURN:	0 if success
477 *		error code if failure
478 *
479 * Side Effects:
480 *	vp - atime updated if byte count > 0
481 */
482/* ARGSUSED */
483static int
484zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
485{
486	znode_t		*zp = VTOZ(vp);
487	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
488	objset_t	*os = zfsvfs->z_os;
489	ssize_t		n, nbytes;
490	int		error;
491	rl_t		*rl;
492
493	ZFS_ENTER(zfsvfs);
494
495	/*
496	 * Validate file offset
497	 */
498	if (uio->uio_loffset < (offset_t)0) {
499		ZFS_EXIT(zfsvfs);
500		return (EINVAL);
501	}
502
503	/*
504	 * Fasttrack empty reads
505	 */
506	if (uio->uio_resid == 0) {
507		ZFS_EXIT(zfsvfs);
508		return (0);
509	}
510
511	/*
512	 * Check for mandatory locks
513	 */
514	if (MANDMODE((mode_t)zp->z_phys->zp_mode)) {
515		if (error = chklock(vp, FREAD,
516		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
517			ZFS_EXIT(zfsvfs);
518			return (error);
519		}
520	}
521
522	/*
523	 * If we're in FRSYNC mode, sync out this znode before reading it.
524	 */
525	if (ioflag & FRSYNC)
526		zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
527
528	/*
529	 * Lock the range against changes.
530	 */
531	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
532
533	/*
534	 * If we are reading past end-of-file we can skip
535	 * to the end; but we might still need to set atime.
536	 */
537	if (uio->uio_loffset >= zp->z_phys->zp_size) {
538		error = 0;
539		goto out;
540	}
541
542	ASSERT(uio->uio_loffset < zp->z_phys->zp_size);
543	n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset);
544
545	while (n > 0) {
546		nbytes = MIN(n, zfs_read_chunk_size -
547		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
548
549		if (vn_has_cached_data(vp))
550			error = mappedread(vp, nbytes, uio);
551		else
552			error = dmu_read_uio(os, zp->z_id, uio, nbytes);
553		if (error)
554			break;
555
556		n -= nbytes;
557	}
558
559out:
560	zfs_range_unlock(rl);
561
562	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
563	ZFS_EXIT(zfsvfs);
564	return (error);
565}
566
567/*
568 * Fault in the pages of the first n bytes specified by the uio structure.
569 * 1 byte in each page is touched and the uio struct is unmodified.
570 * Any error will exit this routine as this is only a best
571 * attempt to get the pages resident. This is a copy of ufs_trans_touch().
572 */
573static void
574zfs_prefault_write(ssize_t n, struct uio *uio)
575{
576	struct iovec *iov;
577	ulong_t cnt, incr;
578	caddr_t p;
579
580	if (uio->uio_segflg != UIO_USERSPACE)
581		return;
582
583	iov = uio->uio_iov;
584
585	while (n) {
586		cnt = MIN(iov->iov_len, n);
587		if (cnt == 0) {
588			/* empty iov entry */
589			iov++;
590			continue;
591		}
592		n -= cnt;
593		/*
594		 * touch each page in this segment.
595		 */
596		p = iov->iov_base;
597		while (cnt) {
598			if (fubyte(p) == -1)
599				return;
600			incr = MIN(cnt, PAGESIZE);
601			p += incr;
602			cnt -= incr;
603		}
604		/*
605		 * touch the last byte in case it straddles a page.
606		 */
607		p--;
608		if (fubyte(p) == -1)
609			return;
610		iov++;
611	}
612}
613
614/*
615 * Write the bytes to a file.
616 *
617 *	IN:	vp	- vnode of file to be written to.
618 *		uio	- structure supplying write location, range info,
619 *			  and data buffer.
620 *		ioflag	- IO_APPEND flag set if in append mode.
621 *		cr	- credentials of caller.
622 *
623 *	OUT:	uio	- updated offset and range.
624 *
625 *	RETURN:	0 if success
626 *		error code if failure
627 *
628 * Timestamps:
629 *	vp - ctime|mtime updated if byte count > 0
630 */
631/* ARGSUSED */
632static int
633zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
634{
635	znode_t		*zp = VTOZ(vp);
636	rlim64_t	limit = MAXOFFSET_T;
637	ssize_t		start_resid = uio->uio_resid;
638	ssize_t		tx_bytes;
639	uint64_t	end_size;
640	dmu_tx_t	*tx;
641	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
642	zilog_t		*zilog = zfsvfs->z_log;
643	offset_t	woff;
644	ssize_t		n, nbytes;
645	rl_t		*rl;
646	int		max_blksz = zfsvfs->z_max_blksz;
647	int		error;
648
649	/*
650	 * Fasttrack empty write
651	 */
652	n = start_resid;
653	if (n == 0)
654		return (0);
655
656	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
657		limit = MAXOFFSET_T;
658
659	ZFS_ENTER(zfsvfs);
660
661	/*
662	 * Pre-fault the pages to ensure slow (eg NFS) pages
663	 * don't hold up txg.
664	 */
665	zfs_prefault_write(n, uio);
666
667	/*
668	 * If in append mode, set the io offset pointer to eof.
669	 */
670	if (ioflag & IO_APPEND) {
671		/*
672		 * Range lock for a file append:
673		 * The value for the start of range will be determined by
674		 * zfs_range_lock() (to guarantee append semantics).
675		 * If this write will cause the block size to increase,
676		 * zfs_range_lock() will lock the entire file, so we must
677		 * later reduce the range after we grow the block size.
678		 */
679		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
680		if (rl->r_len == UINT64_MAX) {
681			/* overlocked, zp_size can't change */
682			woff = uio->uio_loffset = zp->z_phys->zp_size;
683		} else {
684			woff = uio->uio_loffset = rl->r_off;
685		}
686	} else {
687		woff = uio->uio_loffset;
688		/*
689		 * Validate file offset
690		 */
691		if (woff < 0) {
692			ZFS_EXIT(zfsvfs);
693			return (EINVAL);
694		}
695
696		/*
697		 * If we need to grow the block size then zfs_range_lock()
698		 * will lock a wider range than we request here.
699		 * Later after growing the block size we reduce the range.
700		 */
701		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
702	}
703
704	if (woff >= limit) {
705		zfs_range_unlock(rl);
706		ZFS_EXIT(zfsvfs);
707		return (EFBIG);
708	}
709
710	if ((woff + n) > limit || woff > (limit - n))
711		n = limit - woff;
712
713	/*
714	 * Check for mandatory locks
715	 */
716	if (MANDMODE((mode_t)zp->z_phys->zp_mode) &&
717	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
718		zfs_range_unlock(rl);
719		ZFS_EXIT(zfsvfs);
720		return (error);
721	}
722	end_size = MAX(zp->z_phys->zp_size, woff + n);
723
724	/*
725	 * Write the file in reasonable size chunks.  Each chunk is written
726	 * in a separate transaction; this keeps the intent log records small
727	 * and allows us to do more fine-grained space accounting.
728	 */
729	while (n > 0) {
730		/*
731		 * Start a transaction.
732		 */
733		woff = uio->uio_loffset;
734		tx = dmu_tx_create(zfsvfs->z_os);
735		dmu_tx_hold_bonus(tx, zp->z_id);
736		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
737		error = dmu_tx_assign(tx, zfsvfs->z_assign);
738		if (error) {
739			if (error == ERESTART &&
740			    zfsvfs->z_assign == TXG_NOWAIT) {
741				dmu_tx_wait(tx);
742				dmu_tx_abort(tx);
743				continue;
744			}
745			dmu_tx_abort(tx);
746			break;
747		}
748
749		/*
750		 * If zfs_range_lock() over-locked we grow the blocksize
751		 * and then reduce the lock range.  This will only happen
752		 * on the first iteration since zfs_range_reduce() will
753		 * shrink down r_len to the appropriate size.
754		 */
755		if (rl->r_len == UINT64_MAX) {
756			uint64_t new_blksz;
757
758			if (zp->z_blksz > max_blksz) {
759				ASSERT(!ISP2(zp->z_blksz));
760				new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
761			} else {
762				new_blksz = MIN(end_size, max_blksz);
763			}
764			zfs_grow_blocksize(zp, new_blksz, tx);
765			zfs_range_reduce(rl, woff, n);
766		}
767
768		/*
769		 * XXX - should we really limit each write to z_max_blksz?
770		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
771		 */
772		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
773
774		if (woff + nbytes > zp->z_phys->zp_size)
775			vnode_pager_setsize(vp, woff + nbytes);
776
777		rw_enter(&zp->z_map_lock, RW_READER);
778
779		tx_bytes = uio->uio_resid;
780		if (vn_has_cached_data(vp)) {
781			rw_exit(&zp->z_map_lock);
782			error = mappedwrite(vp, nbytes, uio, tx);
783		} else {
784			error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
785			    uio, nbytes, tx);
786			rw_exit(&zp->z_map_lock);
787		}
788		tx_bytes -= uio->uio_resid;
789
790		/*
791		 * If we made no progress, we're done.  If we made even
792		 * partial progress, update the znode and ZIL accordingly.
793		 */
794		if (tx_bytes == 0) {
795			dmu_tx_commit(tx);
796			ASSERT(error != 0);
797			break;
798		}
799
800		/*
801		 * Clear Set-UID/Set-GID bits on successful write if not
802		 * privileged and at least one of the excute bits is set.
803		 *
804		 * It would be nice to to this after all writes have
805		 * been done, but that would still expose the ISUID/ISGID
806		 * to another app after the partial write is committed.
807		 */
808		mutex_enter(&zp->z_acl_lock);
809		if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) |
810		    (S_IXUSR >> 6))) != 0 &&
811		    (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 &&
812		    secpolicy_vnode_setid_retain(cr,
813		    (zp->z_phys->zp_mode & S_ISUID) != 0 &&
814		    zp->z_phys->zp_uid == 0) != 0) {
815			    zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID);
816		}
817		mutex_exit(&zp->z_acl_lock);
818
819		/*
820		 * Update time stamp.  NOTE: This marks the bonus buffer as
821		 * dirty, so we don't have to do it again for zp_size.
822		 */
823		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
824
825		/*
826		 * Update the file size (zp_size) if it has changed;
827		 * account for possible concurrent updates.
828		 */
829		while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset)
830			(void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
831			    uio->uio_loffset);
832		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
833		dmu_tx_commit(tx);
834
835		if (error != 0)
836			break;
837		ASSERT(tx_bytes == nbytes);
838		n -= nbytes;
839	}
840
841	zfs_range_unlock(rl);
842
843	/*
844	 * If we're in replay mode, or we made no progress, return error.
845	 * Otherwise, it's at least a partial write, so it's successful.
846	 */
847	if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) {
848		ZFS_EXIT(zfsvfs);
849		return (error);
850	}
851
852	if (ioflag & (FSYNC | FDSYNC))
853		zil_commit(zilog, zp->z_last_itx, zp->z_id);
854
855	ZFS_EXIT(zfsvfs);
856	return (0);
857}
858
859void
860zfs_get_done(dmu_buf_t *db, void *vzgd)
861{
862	zgd_t *zgd = (zgd_t *)vzgd;
863	rl_t *rl = zgd->zgd_rl;
864	vnode_t *vp = ZTOV(rl->r_zp);
865	int vfslocked;
866
867	vfslocked = VFS_LOCK_GIANT(vp->v_vfsp);
868	dmu_buf_rele(db, vzgd);
869	zfs_range_unlock(rl);
870	VN_RELE(vp);
871	zil_add_vdev(zgd->zgd_zilog, DVA_GET_VDEV(BP_IDENTITY(zgd->zgd_bp)));
872	kmem_free(zgd, sizeof (zgd_t));
873	VFS_UNLOCK_GIANT(vfslocked);
874}
875
876/*
877 * Get data to generate a TX_WRITE intent log record.
878 */
879int
880zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
881{
882	zfsvfs_t *zfsvfs = arg;
883	objset_t *os = zfsvfs->z_os;
884	znode_t *zp;
885	uint64_t off = lr->lr_offset;
886	dmu_buf_t *db;
887	rl_t *rl;
888	zgd_t *zgd;
889	int dlen = lr->lr_length;		/* length of user data */
890	int error = 0;
891
892	ASSERT(zio);
893	ASSERT(dlen != 0);
894
895	/*
896	 * Nothing to do if the file has been removed
897	 */
898	if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
899		return (ENOENT);
900	if (zp->z_unlinked) {
901		VN_RELE(ZTOV(zp));
902		return (ENOENT);
903	}
904
905	/*
906	 * Write records come in two flavors: immediate and indirect.
907	 * For small writes it's cheaper to store the data with the
908	 * log record (immediate); for large writes it's cheaper to
909	 * sync the data and get a pointer to it (indirect) so that
910	 * we don't have to write the data twice.
911	 */
912	if (buf != NULL) { /* immediate write */
913		rl = zfs_range_lock(zp, off, dlen, RL_READER);
914		/* test for truncation needs to be done while range locked */
915		if (off >= zp->z_phys->zp_size) {
916			error = ENOENT;
917			goto out;
918		}
919		VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf));
920	} else { /* indirect write */
921		uint64_t boff; /* block starting offset */
922
923		/*
924		 * Have to lock the whole block to ensure when it's
925		 * written out and it's checksum is being calculated
926		 * that no one can change the data. We need to re-check
927		 * blocksize after we get the lock in case it's changed!
928		 */
929		for (;;) {
930			if (ISP2(zp->z_blksz)) {
931				boff = P2ALIGN_TYPED(off, zp->z_blksz,
932				    uint64_t);
933			} else {
934				boff = 0;
935			}
936			dlen = zp->z_blksz;
937			rl = zfs_range_lock(zp, boff, dlen, RL_READER);
938			if (zp->z_blksz == dlen)
939				break;
940			zfs_range_unlock(rl);
941		}
942		/* test for truncation needs to be done while range locked */
943		if (off >= zp->z_phys->zp_size) {
944			error = ENOENT;
945			goto out;
946		}
947		zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
948		zgd->zgd_rl = rl;
949		zgd->zgd_zilog = zfsvfs->z_log;
950		zgd->zgd_bp = &lr->lr_blkptr;
951		VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db));
952		ASSERT(boff == db->db_offset);
953		lr->lr_blkoff = off - boff;
954		error = dmu_sync(zio, db, &lr->lr_blkptr,
955		    lr->lr_common.lrc_txg, zfs_get_done, zgd);
956		ASSERT(error == EEXIST || lr->lr_length <= zp->z_blksz);
957		if (error == 0) {
958			zil_add_vdev(zfsvfs->z_log,
959			    DVA_GET_VDEV(BP_IDENTITY(&lr->lr_blkptr)));
960		}
961		/*
962		 * If we get EINPROGRESS, then we need to wait for a
963		 * write IO initiated by dmu_sync() to complete before
964		 * we can release this dbuf.  We will finish everything
965		 * up in the zfs_get_done() callback.
966		 */
967		if (error == EINPROGRESS)
968			return (0);
969		dmu_buf_rele(db, zgd);
970		kmem_free(zgd, sizeof (zgd_t));
971	}
972out:
973	zfs_range_unlock(rl);
974	VN_RELE(ZTOV(zp));
975	return (error);
976}
977
978/*ARGSUSED*/
979static int
980zfs_access(vnode_t *vp, int mode, int flags, cred_t *cr)
981{
982	znode_t *zp = VTOZ(vp);
983	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
984	int error;
985
986	ZFS_ENTER(zfsvfs);
987	error = zfs_zaccess_rwx(zp, mode, cr);
988	ZFS_EXIT(zfsvfs);
989	return (error);
990}
991
992/*
993 * Lookup an entry in a directory, or an extended attribute directory.
994 * If it exists, return a held vnode reference for it.
995 *
996 *	IN:	dvp	- vnode of directory to search.
997 *		nm	- name of entry to lookup.
998 *		pnp	- full pathname to lookup [UNUSED].
999 *		flags	- LOOKUP_XATTR set if looking for an attribute.
1000 *		rdir	- root directory vnode [UNUSED].
1001 *		cr	- credentials of caller.
1002 *
1003 *	OUT:	vpp	- vnode of located entry, NULL if not found.
1004 *
1005 *	RETURN:	0 if success
1006 *		error code if failure
1007 *
1008 * Timestamps:
1009 *	NA
1010 */
1011/* ARGSUSED */
1012static int
1013zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
1014    int nameiop, cred_t *cr, kthread_t *td)
1015{
1016
1017	znode_t *zdp = VTOZ(dvp);
1018	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1019	int	error;
1020
1021	ZFS_ENTER(zfsvfs);
1022
1023	*vpp = NULL;
1024
1025#ifdef TODO
1026	if (flags & LOOKUP_XATTR) {
1027		/*
1028		 * If the xattr property is off, refuse the lookup request.
1029		 */
1030		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
1031			ZFS_EXIT(zfsvfs);
1032			return (EINVAL);
1033		}
1034
1035		/*
1036		 * We don't allow recursive attributes..
1037		 * Maybe someday we will.
1038		 */
1039		if (zdp->z_phys->zp_flags & ZFS_XATTR) {
1040			ZFS_EXIT(zfsvfs);
1041			return (EINVAL);
1042		}
1043
1044		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
1045			ZFS_EXIT(zfsvfs);
1046			return (error);
1047		}
1048
1049		/*
1050		 * Do we have permission to get into attribute directory?
1051		 */
1052
1053		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, cr)) {
1054			VN_RELE(*vpp);
1055		}
1056
1057		ZFS_EXIT(zfsvfs);
1058		return (error);
1059	}
1060#endif	/* TODO */
1061
1062	if (dvp->v_type != VDIR) {
1063		ZFS_EXIT(zfsvfs);
1064		return (ENOTDIR);
1065	}
1066
1067	/*
1068	 * Check accessibility of directory.
1069	 */
1070
1071	if (error = zfs_zaccess(zdp, ACE_EXECUTE, cr)) {
1072		ZFS_EXIT(zfsvfs);
1073		return (error);
1074	}
1075
1076	if ((error = zfs_dirlook(zdp, nm, vpp)) == 0) {
1077
1078		/*
1079		 * Convert device special files
1080		 */
1081		if (IS_DEVVP(*vpp)) {
1082			vnode_t	*svp;
1083
1084			svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
1085			VN_RELE(*vpp);
1086			if (svp == NULL)
1087				error = ENOSYS;
1088			else
1089				*vpp = svp;
1090		}
1091	}
1092
1093	ZFS_EXIT(zfsvfs);
1094
1095	/* Translate errors and add SAVENAME when needed. */
1096	if (cnp->cn_flags & ISLASTCN) {
1097		switch (nameiop) {
1098		case CREATE:
1099		case RENAME:
1100			if (error == ENOENT) {
1101				error = EJUSTRETURN;
1102				cnp->cn_flags |= SAVENAME;
1103				break;
1104			}
1105			/* FALLTHROUGH */
1106		case DELETE:
1107			if (error == 0)
1108				cnp->cn_flags |= SAVENAME;
1109			break;
1110		}
1111	}
1112	if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) {
1113		int ltype = 0;
1114
1115		if (cnp->cn_flags & ISDOTDOT) {
1116			ltype = VOP_ISLOCKED(dvp, td);
1117			VOP_UNLOCK(dvp, 0, td);
1118		}
1119		error = vn_lock(*vpp, cnp->cn_lkflags);
1120		if (cnp->cn_flags & ISDOTDOT)
1121			vn_lock(dvp, ltype | LK_RETRY);
1122		if (error != 0) {
1123			VN_RELE(*vpp);
1124			*vpp = NULL;
1125			return (error);
1126		}
1127	}
1128
1129#ifdef FREEBSD_NAMECACHE
1130	/*
1131	 * Insert name into cache (as non-existent) if appropriate.
1132	 */
1133	if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
1134		cache_enter(dvp, *vpp, cnp);
1135	/*
1136	 * Insert name into cache if appropriate.
1137	 */
1138	if (error == 0 && (cnp->cn_flags & MAKEENTRY)) {
1139		if (!(cnp->cn_flags & ISLASTCN) ||
1140		    (nameiop != DELETE && nameiop != RENAME)) {
1141			cache_enter(dvp, *vpp, cnp);
1142		}
1143	}
1144#endif
1145
1146	return (error);
1147}
1148
1149/*
1150 * Attempt to create a new entry in a directory.  If the entry
1151 * already exists, truncate the file if permissible, else return
1152 * an error.  Return the vp of the created or trunc'd file.
 * On success the returned vnode has been locked with LK_EXCLUSIVE.
1153 *
1154 *	IN:	dvp	- vnode of directory to put new file entry in.
1155 *		name	- name of new file entry.
1156 *		vap	- attributes of new file.
1157 *		excl	- flag indicating exclusive or non-exclusive mode.
1158 *		mode	- mode to open file with.
1159 *		cr	- credentials of caller.
1160 *		td	- calling thread [UNUSED].
1161 *
1162 *	OUT:	vpp	- vnode of created or trunc'd entry.
1163 *
1164 *	RETURN:	0 if success
1165 *		error code if failure
1166 *
1167 * Timestamps:
1168 *	dvp - ctime|mtime updated if new entry created
1169 *	 vp - ctime|mtime always, atime if new
1170 */
1171/* ARGSUSED */
1172static int
1173zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
1174    vnode_t **vpp, cred_t *cr, kthread_t *td)
1175{
1176	znode_t		*zp, *dzp = VTOZ(dvp);
1177	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1178	zilog_t		*zilog = zfsvfs->z_log;
1179	objset_t	*os = zfsvfs->z_os;
1180	zfs_dirlock_t	*dl;
1181	dmu_tx_t	*tx;
1182	int		error;
1183	uint64_t	zoid;
1184
1185	ZFS_ENTER(zfsvfs);
1186
	/*
	 * Retry point: re-entered after an ERESTART from dmu_tx_assign()
	 * or zfs_freesp() when z_assign is TXG_NOWAIT.  Both retry paths
	 * drop the directory entry lock before jumping back here.
	 */
1187top:
1188	*vpp = NULL;
1189
	/* Callers that fail the sticky-bit policy check lose VSVTX. */
1190	if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr))
1191		vap->va_mode &= ~VSVTX;
1192
1193	if (*name == '\0') {
1194		/*
1195		 * Null component name refers to the directory itself.
1196		 */
1197		VN_HOLD(dvp);
1198		zp = dzp;
1199		dl = NULL;
1200		error = 0;
1201	} else {
1202		/* possible VN_HOLD(zp) */
1203		if (error = zfs_dirent_lock(&dl, dzp, name, &zp, 0)) {
1204			if (strcmp(name, "..") == 0)
1205				error = EISDIR;
1206			ZFS_EXIT(zfsvfs);
1207			return (error);
1208		}
1209	}
1210
	/*
	 * -1ULL stands in for "no existing object"; zfs_mknode() below is
	 * handed &zoid and the ASSERT after it expects zoid to equal the
	 * new znode's id.
	 */
1211	zoid = zp ? zp->z_id : -1ULL;
1212
1213	if (zp == NULL) {
1214		/*
1215		 * Create a new file object and update the directory
1216		 * to reference it.
1217		 */
1218		if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
1219			goto out;
1220		}
1221
1222		/*
1223		 * We only support the creation of regular files in
1224		 * extended attribute directories.
1225		 */
1226		if ((dzp->z_phys->zp_flags & ZFS_XATTR) &&
1227		    (vap->va_type != VREG)) {
1228			error = EINVAL;
1229			goto out;
1230		}
1231
		/*
		 * Declare everything the transaction may touch: the new
		 * object's bonus buffer, the parent's bonus buffer, the
		 * parent's ZAP entry for this name and, when the parent
		 * has ZFS_INHERIT_ACE set, a write to the new object.
		 */
1232		tx = dmu_tx_create(os);
1233		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
1234		dmu_tx_hold_bonus(tx, dzp->z_id);
1235		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1236		if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
1237			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1238			    0, SPA_MAXBLOCKSIZE);
1239		error = dmu_tx_assign(tx, zfsvfs->z_assign);
1240		if (error) {
			/*
			 * The directory entry lock is released before
			 * waiting so the retry starts from scratch.
			 */
1241			zfs_dirent_unlock(dl);
1242			if (error == ERESTART &&
1243			    zfsvfs->z_assign == TXG_NOWAIT) {
1244				dmu_tx_wait(tx);
1245				dmu_tx_abort(tx);
1246				goto top;
1247			}
1248			dmu_tx_abort(tx);
1249			ZFS_EXIT(zfsvfs);
1250			return (error);
1251		}
1252		zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);
1253		ASSERT(zp->z_id == zoid);
		/* NOTE(review): the zfs_link_create() result is discarded. */
1254		(void) zfs_link_create(dl, zp, tx, ZNEW);
1255		zfs_log_create(zilog, tx, TX_CREATE, dzp, zp, name);
1256		dmu_tx_commit(tx);
1257	} else {
1258		/*
1259		 * A directory entry already exists for this name.
1260		 */
1261		/*
1262		 * Can't truncate an existing file if in exclusive mode.
1263		 */
1264		if (excl == EXCL) {
1265			error = EEXIST;
1266			goto out;
1267		}
1268		/*
1269		 * Can't open a directory for writing.
1270		 */
1271		if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
1272			error = EISDIR;
1273			goto out;
1274		}
1275		/*
1276		 * Verify requested access to file.
1277		 */
1278		if (mode && (error = zfs_zaccess_rwx(zp, mode, cr))) {
1279			goto out;
1280		}
1281
		/* Bump the parent directory's sequence number under z_lock. */
1282		mutex_enter(&dzp->z_lock);
1283		dzp->z_seq++;
1284		mutex_exit(&dzp->z_lock);
1285
1286		/*
1287		 * Truncate regular files if requested.
1288		 */
1289		if ((ZTOV(zp)->v_type == VREG) &&
1290		    (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
1291			error = zfs_freesp(zp, 0, 0, mode, TRUE);
1292			if (error == ERESTART &&
1293			    zfsvfs->z_assign == TXG_NOWAIT) {
1294				/* NB: we already did dmu_tx_wait() */
1295				zfs_dirent_unlock(dl);
1296				VN_RELE(ZTOV(zp));
1297				goto top;
1298			}
1299		}
1300	}
1301out:
1302
	/*
	 * On success the vnode is locked exclusively while the directory
	 * entry lock (if any) is still held; the entry lock is dropped
	 * afterwards.
	 */
1303	if (error == 0) {
1304		*vpp = ZTOV(zp);
1305		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
1306	}
1307
	/* dl is NULL in the null-name case, where no entry lock was taken. */
1308	if (dl)
1309		zfs_dirent_unlock(dl);
1310
1311	if (error) {
		/*
		 * Drop the hold on the existing entry (or on dvp in the
		 * null-name case); zp is NULL when no entry was found.
		 */
1312		if (zp)
1313			VN_RELE(ZTOV(zp));
1314	} else {
		/* Same value as stored in the success branch above. */
1315		*vpp = ZTOV(zp);
1316		/*
1317		 * If vnode is for a device return a specfs vnode instead.
1318		 */
1319		if (IS_DEVVP(*vpp)) {
1320			struct vnode *svp;
1321
1322			svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
1323			VN_RELE(*vpp);
1324			if (svp == NULL) {
1325				error = ENOSYS;
1326			}
1327			*vpp = svp;
1328		}
1329	}
1330
1331	ZFS_EXIT(zfsvfs);
1332	return (error);
1333}
1334
1335/*
1336 * Remove an entry from a directory.
 * Directories are refused with EPERM; they must be removed via rmdir.
1337 *
1338 *	IN:	dvp	- vnode of directory to remove entry from.
1339 *		name	- name of entry to remove.
1340 *		cr	- credentials of caller.
1341 *
1342 *	RETURN:	0 if success
1343 *		error code if failure
1344 *
1345 * Timestamps:
1346 *	dvp - ctime|mtime
1347 *	 vp - ctime (if nlink > 0)
1348 */
1349static int
1350zfs_remove(vnode_t *dvp, char *name, cred_t *cr)
1351{
1352	znode_t		*zp, *dzp = VTOZ(dvp);
1353	znode_t		*xzp = NULL;
1354	vnode_t		*vp;
1355	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1356	zilog_t		*zilog = zfsvfs->z_log;
1357	uint64_t	acl_obj, xattr_obj;
1358	zfs_dirlock_t	*dl;
1359	dmu_tx_t	*tx;
1360	boolean_t	may_delete_now, delete_now = FALSE;
1361	boolean_t	unlinked;
1362	int		error;
1363
1364	ZFS_ENTER(zfsvfs);
1365
	/*
	 * Retry point: re-entered after dmu_tx_wait() when dmu_tx_assign()
	 * returns ERESTART under TXG_NOWAIT.  The entry lock and the vnode
	 * hold are dropped before jumping back here.
	 */
1366top:
1367	/*
1368	 * Attempt to lock directory; fail if entry doesn't exist.
1369	 */
1370	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) {
1371		ZFS_EXIT(zfsvfs);
1372		return (error);
1373	}
1374
1375	vp = ZTOV(zp);
1376
1377	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1378		goto out;
1379	}
1380
1381	/*
1382	 * Need to use rmdir for removing directories.
1383	 */
1384	if (vp->v_type == VDIR) {
1385		error = EPERM;
1386		goto out;
1387	}
1388
1389	vnevent_remove(vp);
1390
1391	dnlc_remove(dvp, name);
1392
	/*
	 * Immediate deletion is switched off in this code: may_delete_now
	 * is forced to FALSE here and the delete_now computation further
	 * down is short-circuited with "0 &&", so delete_now keeps its
	 * initial FALSE value and an unlinked znode always goes to the
	 * unlinked set.
	 */
1393	may_delete_now = FALSE;
1394
1395	/*
1396	 * We may delete the znode now, or we may put it in the unlinked set;
1397	 * it depends on whether we're the last link, and on whether there are
1398	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
1399	 * allow for either case.
1400	 */
1401	tx = dmu_tx_create(zfsvfs->z_os);
1402	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1403	dmu_tx_hold_bonus(tx, zp->z_id);
1404	if (may_delete_now)
1405		dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
1406
1407	/* are there any extended attributes? */
1408	if ((xattr_obj = zp->z_phys->zp_xattr) != 0) {
1409		/* XXX - do we need this if we are deleting? */
1410		dmu_tx_hold_bonus(tx, xattr_obj);
1411	}
1412
1413	/* are there any additional acls */
1414	if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 &&
1415	    may_delete_now)
1416		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
1417
1418	/* charge as an update -- would be nice not to charge at all */
1419	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1420
1421	error = dmu_tx_assign(tx, zfsvfs->z_assign);
1422	if (error) {
		/* Undo the lock and hold taken above before retrying/failing. */
1423		zfs_dirent_unlock(dl);
1424		VN_RELE(vp);
1425		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
1426			dmu_tx_wait(tx);
1427			dmu_tx_abort(tx);
1428			goto top;
1429		}
1430		dmu_tx_abort(tx);
1431		ZFS_EXIT(zfsvfs);
1432		return (error);
1433	}
1434
1435	/*
1436	 * Remove the directory entry.
1437	 */
1438	error = zfs_link_destroy(dl, zp, tx, 0, &unlinked);
1439
1440	if (error) {
		/* The assigned tx is committed (not aborted) on this path. */
1441		dmu_tx_commit(tx);
1442		goto out;
1443	}
1444
	/* Never entered: the condition is disabled with a constant 0. */
1445	if (0 && unlinked) {
1446		VI_LOCK(vp);
1447		delete_now = may_delete_now &&
1448		    vp->v_count == 1 && !vn_has_cached_data(vp) &&
1449		    zp->z_phys->zp_xattr == xattr_obj &&
1450		    zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj;
1451		VI_UNLOCK(vp);
1452	}
1453
	/*
	 * With delete_now stuck at FALSE the first branch is not taken;
	 * an entry whose last link went away is added to the unlinked set.
	 */
1454	if (delete_now) {
1455		if (zp->z_phys->zp_xattr) {
1456			error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
1457			ASSERT3U(error, ==, 0);
1458			ASSERT3U(xzp->z_phys->zp_links, ==, 2);
1459			dmu_buf_will_dirty(xzp->z_dbuf, tx);
1460			mutex_enter(&xzp->z_lock);
1461			xzp->z_unlinked = 1;
1462			xzp->z_phys->zp_links = 0;
1463			mutex_exit(&xzp->z_lock);
1464			zfs_unlinked_add(xzp, tx);
1465			zp->z_phys->zp_xattr = 0; /* probably unnecessary */
1466		}
1467		mutex_enter(&zp->z_lock);
1468		VI_LOCK(vp);
1469		vp->v_count--;
1470		ASSERT3U(vp->v_count, ==, 0);
1471		VI_UNLOCK(vp);
1472		mutex_exit(&zp->z_lock);
1473		zfs_znode_delete(zp, tx);
1474		VFS_RELE(zfsvfs->z_vfs);
1475	} else if (unlinked) {
1476		zfs_unlinked_add(zp, tx);
1477	}
1478
1479	zfs_log_remove(zilog, tx, TX_REMOVE, dzp, name);
1480
1481	dmu_tx_commit(tx);
	/* Common exit: also reached by the early permission/type failures. */
1482out:
1483	zfs_dirent_unlock(dl);
1484
1485	if (!delete_now) {
1486		VN_RELE(vp);
1487	} else if (xzp) {
1488		/* this rele delayed to prevent nesting transactions */
1489		VN_RELE(ZTOV(xzp));
1490	}
1491
1492	ZFS_EXIT(zfsvfs);
1493	return (error);
1494}
1495
1496/*
1497 * Create a new directory and insert it into dvp using the name
1498 * provided.  Return a pointer to the inserted directory.
1499 *
1500 *	IN:	dvp	- vnode of directory to add subdir to.
1501 *		dirname	- name of new directory.
1502 *		vap	- attributes of new directory.
1503 *		cr	- credentials of caller.
1504 *
1505 *	OUT:	vpp	- vnode of created directory.
1506 *
1507 *	RETURN:	0 if success
1508 *		error code if failure
1509 *
1510 * Timestamps:
1511 *	dvp - ctime|mtime updated
1512 *	 vp - ctime|mtime|atime updated
1513 */
1514static int
1515zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
1516{
1517	znode_t		*zp, *dzp = VTOZ(dvp);
1518	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1519	zilog_t		*zilog = zfsvfs->z_log;
1520	zfs_dirlock_t	*dl;
1521	uint64_t	zoid = 0;
1522	dmu_tx_t	*tx;
1523	int		error;
1524
1525	ASSERT(vap->va_type == VDIR);
1526
1527	ZFS_ENTER(zfsvfs);
1528
1529	if (dzp->z_phys->zp_flags & ZFS_XATTR) {
1530		ZFS_EXIT(zfsvfs);
1531		return (EINVAL);
1532	}
1533top:
1534	*vpp = NULL;
1535
1536	/*
1537	 * First make sure the new directory doesn't exist.
1538	 */
1539	if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, ZNEW)) {
1540		ZFS_EXIT(zfsvfs);
1541		return (error);
1542	}
1543
1544	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, cr)) {
1545		zfs_dirent_unlock(dl);
1546		ZFS_EXIT(zfsvfs);
1547		return (error);
1548	}
1549
1550	/*
1551	 * Add a new entry to the directory.
1552	 */
1553	tx = dmu_tx_create(zfsvfs->z_os);
1554	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
1555	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
1556	if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
1557		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1558		    0, SPA_MAXBLOCKSIZE);
1559	error = dmu_tx_assign(tx, zfsvfs->z_assign);
1560	if (error) {
1561		zfs_dirent_unlock(dl);
1562		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
1563			dmu_tx_wait(tx);
1564			dmu_tx_abort(tx);
1565			goto top;
1566		}
1567		dmu_tx_abort(tx);
1568		ZFS_EXIT(zfsvfs);
1569		return (error);
1570	}
1571
1572	/*
1573	 * Create new node.
1574	 */
1575	zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);
1576
1577	/*
1578	 * Now put new name in parent dir.
1579	 */
1580	(void) zfs_link_create(dl, zp, tx, ZNEW);
1581
1582	*vpp = ZTOV(zp);
1583
1584	zfs_log_create(zilog, tx, TX_MKDIR, dzp, zp, dirname);
1585	dmu_tx_commit(tx);
1586
1587	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
1588
1589	zfs_dirent_unlock(dl);
1590
1591	ZFS_EXIT(zfsvfs);
1592	return (0);
1593}
1594
1595/*
1596 * Remove a directory subdir entry.  If the current working
1597 * directory is the same as the subdir to be removed, the
1598 * remove will fail.
1599 *
1600 *	IN:	dvp	- vnode of directory to remove from.
1601 *		name	- name of directory to be removed.
1602 *		cwd	- vnode of current working directory.
1603 *		cr	- credentials of caller.
1604 *
1605 *	RETURN:	0 if success
1606 *		error code if failure
1607 *
1608 * Timestamps:
1609 *	dvp - ctime|mtime updated
1610 */
1611static int
1612zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr)
1613{
1614	znode_t		*dzp = VTOZ(dvp);
1615	znode_t		*zp;
1616	vnode_t		*vp;
1617	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1618	zilog_t		*zilog = zfsvfs->z_log;
1619	zfs_dirlock_t	*dl;
1620	dmu_tx_t	*tx;
1621	int		error;
1622
1623	ZFS_ENTER(zfsvfs);
1624
1625top:
1626	zp = NULL;
1627
1628	/*
1629	 * Attempt to lock directory; fail if entry doesn't exist.
1630	 */
1631	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) {
1632		ZFS_EXIT(zfsvfs);
1633		return (error);
1634	}
1635
1636	vp = ZTOV(zp);
1637
1638	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1639		goto out;
1640	}
1641
1642	if (vp->v_type != VDIR) {
1643		error = ENOTDIR;
1644		goto out;
1645	}
1646
1647	if (vp == cwd) {
1648		error = EINVAL;
1649		goto out;
1650	}
1651
1652	vnevent_rmdir(vp);
1653
1654	/*
1655	 * Grab a lock on the directory to make sure that noone is
1656	 * trying to add (or lookup) entries while we are removing it.
1657	 */
1658	rw_enter(&zp->z_name_lock, RW_WRITER);
1659
1660	/*
1661	 * Grab a lock on the parent pointer to make sure we play well
1662	 * with the treewalk and directory rename code.
1663	 */
1664	rw_enter(&zp->z_parent_lock, RW_WRITER);
1665
1666	tx = dmu_tx_create(zfsvfs->z_os);
1667	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1668	dmu_tx_hold_bonus(tx, zp->z_id);
1669	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1670	error = dmu_tx_assign(tx, zfsvfs->z_assign);
1671	if (error) {
1672		rw_exit(&zp->z_parent_lock);
1673		rw_exit(&zp->z_name_lock);
1674		zfs_dirent_unlock(dl);
1675		VN_RELE(vp);
1676		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
1677			dmu_tx_wait(tx);
1678			dmu_tx_abort(tx);
1679			goto top;
1680		}
1681		dmu_tx_abort(tx);
1682		ZFS_EXIT(zfsvfs);
1683		return (error);
1684	}
1685
1686#ifdef FREEBSD_NAMECACHE
1687	cache_purge(dvp);
1688#endif
1689
1690	error = zfs_link_destroy(dl, zp, tx, 0, NULL);
1691
1692	if (error == 0)
1693		zfs_log_remove(zilog, tx, TX_RMDIR, dzp, name);
1694
1695	dmu_tx_commit(tx);
1696
1697	rw_exit(&zp->z_parent_lock);
1698	rw_exit(&zp->z_name_lock);
1699#ifdef FREEBSD_NAMECACHE
1700	cache_purge(vp);
1701#endif
1702out:
1703	zfs_dirent_unlock(dl);
1704
1705	VN_RELE(vp);
1706
1707	ZFS_EXIT(zfsvfs);
1708	return (error);
1709}
1710
1711/*
1712 * Read as many directory entries as will fit into the provided
1713 * buffer from the given directory cursor position (specified in
1714 * the uio structure.
1715 *
1716 *	IN:	vp	- vnode of directory to read.
1717 *		uio	- structure supplying read location, range info,
1718 *			  and return buffer.
1719 *		cr	- credentials of caller.
1720 *
1721 *	OUT:	uio	- updated offset and range, buffer filled.
1722 *		eofp	- set to true if end-of-file detected.
1723 *
1724 *	RETURN:	0 if success
1725 *		error code if failure
1726 *
1727 * Timestamps:
1728 *	vp - atime updated
1729 *
1730 * Note that the low 4 bits of the cookie returned by zap is always zero.
1731 * This allows us to use the low range for "special" directory entries:
1732 * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
1733 * we use the offset 2 for the '.zfs' directory.
1734 */
1735/* ARGSUSED */
1736static int
1737zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
1738{
1739	znode_t		*zp = VTOZ(vp);
1740	iovec_t		*iovp;
1741	dirent64_t	*odp;
1742	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
1743	objset_t	*os;
1744	caddr_t		outbuf;
1745	size_t		bufsize;
1746	zap_cursor_t	zc;
1747	zap_attribute_t	zap;
1748	uint_t		bytes_wanted;
1749	uint64_t	offset; /* must be unsigned; checks for < 1 */
1750	int		local_eof;
1751	int		outcount;
1752	int		error;
1753	uint8_t		prefetch;
1754	uint8_t		type;
1755	int		ncooks;
1756	u_long		*cooks = NULL;
1757
1758	ZFS_ENTER(zfsvfs);
1759
1760	/*
1761	 * If we are not given an eof variable,
1762	 * use a local one.
1763	 */
1764	if (eofp == NULL)
1765		eofp = &local_eof;
1766
1767	/*
1768	 * Check for valid iov_len.
1769	 */
1770	if (uio->uio_iov->iov_len <= 0) {
1771		ZFS_EXIT(zfsvfs);
1772		return (EINVAL);
1773	}
1774
1775	/*
1776	 * Quit if directory has been removed (posix)
1777	 */
1778	if ((*eofp = zp->z_unlinked) != 0) {
1779		ZFS_EXIT(zfsvfs);
1780		return (0);
1781	}
1782
1783	error = 0;
1784	os = zfsvfs->z_os;
1785	offset = uio->uio_loffset;
1786	prefetch = zp->z_zn_prefetch;
1787
1788	/*
1789	 * Initialize the iterator cursor.
1790	 */
1791	if (offset <= 3) {
1792		/*
1793		 * Start iteration from the beginning of the directory.
1794		 */
1795		zap_cursor_init(&zc, os, zp->z_id);
1796	} else {
1797		/*
1798		 * The offset is a serialized cursor.
1799		 */
1800		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
1801	}
1802
1803	/*
1804	 * Get space to change directory entries into fs independent format.
1805	 */
1806	iovp = uio->uio_iov;
1807	bytes_wanted = iovp->iov_len;
1808	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
1809		bufsize = bytes_wanted;
1810		outbuf = kmem_alloc(bufsize, KM_SLEEP);
1811		odp = (struct dirent64 *)outbuf;
1812	} else {
1813		bufsize = bytes_wanted;
1814		odp = (struct dirent64 *)iovp->iov_base;
1815	}
1816
1817	if (ncookies != NULL) {
1818		/*
1819		 * Minimum entry size is dirent size and 1 byte for a file name.
1820		 */
1821		ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
1822		cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
1823		*cookies = cooks;
1824		*ncookies = ncooks;
1825	}
1826
1827	/*
1828	 * Transform to file-system independent format
1829	 */
1830	outcount = 0;
1831	while (outcount < bytes_wanted) {
1832		ino64_t objnum;
1833		ushort_t reclen;
1834
1835		/*
1836		 * Special case `.', `..', and `.zfs'.
1837		 */
1838		if (offset == 0) {
1839			(void) strcpy(zap.za_name, ".");
1840			objnum = zp->z_id;
1841			type = DT_DIR;
1842		} else if (offset == 1) {
1843			(void) strcpy(zap.za_name, "..");
1844			objnum = zp->z_phys->zp_parent;
1845			type = DT_DIR;
1846		} else if (offset == 2 && zfs_show_ctldir(zp)) {
1847			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
1848			objnum = ZFSCTL_INO_ROOT;
1849			type = DT_DIR;
1850		} else {
1851			/*
1852			 * Grab next entry.
1853			 */
1854			if (error = zap_cursor_retrieve(&zc, &zap)) {
1855				if ((*eofp = (error == ENOENT)) != 0)
1856					break;
1857				else
1858					goto update;
1859			}
1860
1861			if (zap.za_integer_length != 8 ||
1862			    zap.za_num_integers != 1) {
1863				cmn_err(CE_WARN, "zap_readdir: bad directory "
1864				    "entry, obj = %lld, offset = %lld\n",
1865				    (u_longlong_t)zp->z_id,
1866				    (u_longlong_t)offset);
1867				error = ENXIO;
1868				goto update;
1869			}
1870
1871			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
1872			/*
1873			 * MacOS X can extract the object type here such as:
1874			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
1875			 */
1876			type = ZFS_DIRENT_TYPE(zap.za_first_integer);
1877		}
1878		reclen = DIRENT64_RECLEN(strlen(zap.za_name));
1879
1880		/*
1881		 * Will this entry fit in the buffer?
1882		 */
1883		if (outcount + reclen > bufsize) {
1884			/*
1885			 * Did we manage to fit anything in the buffer?
1886			 */
1887			if (!outcount) {
1888				error = EINVAL;
1889				goto update;
1890			}
1891			break;
1892		}
1893		/*
1894		 * Add this entry:
1895		 */
1896		odp->d_ino = objnum;
1897		odp->d_reclen = reclen;
1898		odp->d_namlen = strlen(zap.za_name);
1899		(void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
1900		odp->d_type = type;
1901		outcount += reclen;
1902		odp = (dirent64_t *)((intptr_t)odp + reclen);
1903
1904		ASSERT(outcount <= bufsize);
1905
1906		/* Prefetch znode */
1907		if (prefetch)
1908			dmu_prefetch(os, objnum, 0, 0);
1909
1910		/*
1911		 * Move to the next entry, fill in the previous offset.
1912		 */
1913		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
1914			zap_cursor_advance(&zc);
1915			offset = zap_cursor_serialize(&zc);
1916		} else {
1917			offset += 1;
1918		}
1919
1920		if (cooks != NULL) {
1921			*cooks++ = offset;
1922			ncooks--;
1923			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
1924		}
1925	}
1926	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
1927
1928	/* Subtract unused cookies */
1929	if (ncookies != NULL)
1930		*ncookies -= ncooks;
1931
1932	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
1933		iovp->iov_base += outcount;
1934		iovp->iov_len -= outcount;
1935		uio->uio_resid -= outcount;
1936	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
1937		/*
1938		 * Reset the pointer.
1939		 */
1940		offset = uio->uio_loffset;
1941	}
1942
1943update:
1944	zap_cursor_fini(&zc);
1945	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
1946		kmem_free(outbuf, bufsize);
1947
1948	if (error == ENOENT)
1949		error = 0;
1950
1951	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
1952
1953	uio->uio_loffset = offset;
1954	ZFS_EXIT(zfsvfs);
1955	if (error != 0 && cookies != NULL) {
1956		free(*cookies, M_TEMP);
1957		*cookies = NULL;
1958		*ncookies = 0;
1959	}
1960	return (error);
1961}
1962
1963static int
1964zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr)
1965{
1966	znode_t	*zp = VTOZ(vp);
1967	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1968
1969	ZFS_ENTER(zfsvfs);
1970	zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
1971	ZFS_EXIT(zfsvfs);
1972	return (0);
1973}
1974
1975/*
1976 * Get the requested file attributes and place them in the provided
1977 * vattr structure.
1978 *
1979 *	IN:	vp	- vnode of file.
1980 *		vap	- va_mask identifies requested attributes.
1981 *		flags	- [UNUSED]
1982 *		cr	- credentials of caller.
1983 *
1984 *	OUT:	vap	- attribute values.
1985 *
1986 *	RETURN:	0 (always succeeds)
1987 */
1988/* ARGSUSED */
1989static int
1990zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
1991{
1992	znode_t *zp = VTOZ(vp);
1993	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1994	znode_phys_t *pzp = zp->z_phys;
1995	uint32_t blksize;
1996	u_longlong_t nblocks;
1997	int	error;
1998
1999	ZFS_ENTER(zfsvfs);
2000
2001	/*
2002	 * Return all attributes.  It's cheaper to provide the answer
2003	 * than to determine whether we were asked the question.
2004	 */
2005	mutex_enter(&zp->z_lock);
2006
2007	vap->va_type = IFTOVT(pzp->zp_mode);
2008	vap->va_mode = pzp->zp_mode & ~S_IFMT;
2009	vap->va_uid = zp->z_phys->zp_uid;
2010	vap->va_gid = zp->z_phys->zp_gid;
2011	vap->va_nodeid = zp->z_id;
2012	vap->va_nlink = MIN(pzp->zp_links, UINT32_MAX);	/* nlink_t limit! */
2013	vap->va_size = pzp->zp_size;
2014	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
2015	vap->va_rdev = zfs_cmpldev(pzp->zp_rdev);
2016	vap->va_seq = zp->z_seq;
2017	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */
2018
2019	ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime);
2020	ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime);
2021	ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime);
2022	ZFS_TIME_DECODE(&vap->va_birthtime, pzp->zp_crtime);
2023
2024	/*
2025	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
2026	 * Also, if we are the owner don't bother, since owner should
2027	 * always be allowed to read basic attributes of file.
2028	 */
2029	if (!(zp->z_phys->zp_flags & ZFS_ACL_TRIVIAL) &&
2030	    (zp->z_phys->zp_uid != crgetuid(cr))) {
2031		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, cr)) {
2032			mutex_exit(&zp->z_lock);
2033			ZFS_EXIT(zfsvfs);
2034			return (error);
2035		}
2036	}
2037
2038	mutex_exit(&zp->z_lock);
2039
2040	dmu_object_size_from_db(zp->z_dbuf, &blksize, &nblocks);
2041	vap->va_blksize = blksize;
2042	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */
2043
2044	if (zp->z_blksz == 0) {
2045		/*
2046		 * Block size hasn't been set; suggest maximal I/O transfers.
2047		 */
2048		vap->va_blksize = zfsvfs->z_max_blksz;
2049	}
2050
2051	ZFS_EXIT(zfsvfs);
2052	return (0);
2053}
2054
2055/*
2056 * Set the file attributes to the values contained in the
2057 * vattr structure.
2058 *
2059 *	IN:	vp	- vnode of file to be modified.
2060 *		vap	- new attribute values.
2061 *		flags	- ATTR_UTIME set if non-default time values provided.
2062 *		cr	- credentials of caller.
 *		ct	- caller context [UNUSED].
2063 *
2064 *	RETURN:	0 if success
2065 *		error code if failure
2066 *
2067 * Timestamps:
2068 *	vp - ctime updated, mtime updated if size changed.
2069 */
2070/* ARGSUSED */
2071static int
2072zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2073	caller_context_t *ct)
2074{
2075	struct znode	*zp = VTOZ(vp);
2076	znode_phys_t	*pzp = zp->z_phys;
2077	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2078	zilog_t		*zilog = zfsvfs->z_log;
2079	dmu_tx_t	*tx;
2080	vattr_t		oldva;
2081	uint_t		mask = vap->va_mask;
2082	uint_t		saved_mask;
2083	int		trim_mask = 0;
2084	uint64_t	new_mode;
2085	znode_t		*attrzp;
2086	int		need_policy = FALSE;
2087	int		err;
2088
	/* Argument checks that are made before entering the filesystem. */
2089	if (mask == 0)
2090		return (0);
2091
2092	if (mask & AT_NOSET)
2093		return (EINVAL);
2094
2095	if (mask & AT_SIZE && vp->v_type == VDIR)
2096		return (EISDIR);
2097
2098	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO)
2099		return (EINVAL);
2100
2101	ZFS_ENTER(zfsvfs);
2102
	/*
	 * Retry point: re-entered after dmu_tx_wait() when dmu_tx_assign()
	 * returns ERESTART under TXG_NOWAIT.  attrzp is reset because the
	 * failure path has already released it.
	 */
2103top:
2104	attrzp = NULL;
2105
2106	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
2107		ZFS_EXIT(zfsvfs);
2108		return (EROFS);
2109	}
2110
2111	/*
2112	 * First validate permissions
2113	 */
2114
2115	if (mask & AT_SIZE) {
2116		err = zfs_zaccess(zp, ACE_WRITE_DATA, cr);
2117		if (err) {
2118			ZFS_EXIT(zfsvfs);
2119			return (err);
2120		}
2121		/*
2122		 * XXX - Note, we are not providing any open
2123		 * mode flags here (like FNDELAY), so we may
2124		 * block if there are locks present... this
2125		 * should be addressed in openat().
2126		 */
		/*
		 * The size change is applied right here through
		 * zfs_freesp(), before the attribute transaction below
		 * is created.
		 */
2127		do {
2128			err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2129			/* NB: we already did dmu_tx_wait() if necessary */
2130		} while (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT);
2131		if (err) {
2132			ZFS_EXIT(zfsvfs);
2133			return (err);
2134		}
2135	}
2136
	/* A non-zero result here defers the decision to the policy check. */
2137	if (mask & (AT_ATIME|AT_MTIME))
2138		need_policy = zfs_zaccess_v4_perm(zp, ACE_WRITE_ATTRIBUTES, cr);
2139
2140	if (mask & (AT_UID|AT_GID)) {
2141		int	idmask = (mask & (AT_UID|AT_GID));
2142		int	take_owner;
2143		int	take_group;
2144
2145		/*
2146		 * NOTE: even if a new mode is being set,
2147		 * we may clear S_ISUID/S_ISGID bits.
2148		 */
2149
2150		if (!(mask & AT_MODE))
2151			vap->va_mode = pzp->zp_mode;
2152
2153		/*
2154		 * Take ownership or chgrp to group we are a member of
2155		 */
2156
2157		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
2158		take_group = (mask & AT_GID) && groupmember(vap->va_gid, cr);
2159
2160		/*
2161		 * If both AT_UID and AT_GID are set then take_owner and
2162		 * take_group must both be set in order to allow taking
2163		 * ownership.
2164		 *
2165		 * Otherwise, send the check through secpolicy_vnode_setattr()
2166		 *
2167		 */
2168
2169		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
2170		    ((idmask == AT_UID) && take_owner) ||
2171		    ((idmask == AT_GID) && take_group)) {
2172			if (zfs_zaccess_v4_perm(zp, ACE_WRITE_OWNER, cr) == 0) {
2173				/*
2174				 * Remove setuid/setgid for non-privileged users
2175				 */
2176				secpolicy_setid_clear(vap, cr);
2177				trim_mask = (mask & (AT_UID|AT_GID));
2178			} else {
2179				need_policy =  TRUE;
2180			}
2181		} else {
2182			need_policy =  TRUE;
2183		}
2184	}
2185
	/*
	 * Snapshot the current mode/uid/gid under z_lock; oldva is what
	 * the secpolicy_*() calls below are given as the "old" attributes.
	 */
2186	mutex_enter(&zp->z_lock);
2187	oldva.va_mode = pzp->zp_mode;
2188	oldva.va_uid = zp->z_phys->zp_uid;
2189	oldva.va_gid = zp->z_phys->zp_gid;
2190	mutex_exit(&zp->z_lock);
2191
2192	if (mask & AT_MODE) {
2193		if (zfs_zaccess_v4_perm(zp, ACE_WRITE_ACL, cr) == 0) {
2194			err = secpolicy_setid_setsticky_clear(vp, vap,
2195			    &oldva, cr);
2196			if (err) {
2197				ZFS_EXIT(zfsvfs);
2198				return (err);
2199			}
2200			trim_mask |= AT_MODE;
2201		} else {
2202			need_policy = TRUE;
2203		}
2204	}
2205
2206	if (need_policy) {
2207		/*
2208		 * If trim_mask is set then take ownership
2209		 * has been granted or write_acl is present and user
2210		 * has the ability to modify mode.  In that case remove
2211		 * UID|GID and or MODE from mask so that
2212		 * secpolicy_vnode_setattr() doesn't revoke it.
2213		 */
2214
2215		if (trim_mask) {
2216			saved_mask = vap->va_mask;
2217			vap->va_mask &= ~trim_mask;
2218
2219		}
2220		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
2221		    (int (*)(void *, int, cred_t *))zfs_zaccess_rwx, zp);
2222		if (err) {
			/* NOTE(review): va_mask is left trimmed on this path. */
2223			ZFS_EXIT(zfsvfs);
2224			return (err);
2225		}
2226
		/* Put back the bits that were hidden from the policy check. */
2227		if (trim_mask)
2228			vap->va_mask |= saved_mask;
2229	}
2230
2231	/*
2232	 * secpolicy_vnode_setattr, or take ownership may have
2233	 * changed va_mask
2234	 */
2235	mask = vap->va_mask;
2236
	/*
	 * Declare what the transaction will touch: always the znode's
	 * bonus buffer; for a mode change also a write to the external
	 * ACL object if one exists, otherwise to a new object --
	 * presumably so zfs_acl_chmod_setattr() can rewrite the ACL.
	 */
2237	tx = dmu_tx_create(zfsvfs->z_os);
2238	dmu_tx_hold_bonus(tx, zp->z_id);
2239
2240	if (mask & AT_MODE) {
2241		uint64_t pmode = pzp->zp_mode;
2242
		/* Keep the file-type bits, take everything else from vap. */
2243		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
2244
2245		if (zp->z_phys->zp_acl.z_acl_extern_obj)
2246			dmu_tx_hold_write(tx,
2247			    pzp->zp_acl.z_acl_extern_obj, 0, SPA_MAXBLOCKSIZE);
2248		else
2249			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2250			    0, ZFS_ACL_SIZE(MAX_ACL_SIZE));
2251	}
2252
	/*
	 * An owner/group change is mirrored onto the znode referenced by
	 * zp_xattr (if any), so hold that znode and its bonus buffer too.
	 */
2253	if ((mask & (AT_UID | AT_GID)) && zp->z_phys->zp_xattr != 0) {
2254		err = zfs_zget(zp->z_zfsvfs, zp->z_phys->zp_xattr, &attrzp);
2255		if (err) {
2256			dmu_tx_abort(tx);
2257			ZFS_EXIT(zfsvfs);
2258			return (err);
2259		}
2260		dmu_tx_hold_bonus(tx, attrzp->z_id);
2261	}
2262
2263	err = dmu_tx_assign(tx, zfsvfs->z_assign);
2264	if (err) {
2265		if (attrzp)
2266			VN_RELE(ZTOV(attrzp));
2267		if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
2268			dmu_tx_wait(tx);
2269			dmu_tx_abort(tx);
2270			goto top;
2271		}
2272		dmu_tx_abort(tx);
2273		ZFS_EXIT(zfsvfs);
2274		return (err);
2275	}
2276
2277	dmu_buf_will_dirty(zp->z_dbuf, tx);
2278
2279	/*
2280	 * Set each attribute requested.
2281	 * We group settings according to the locks they need to acquire.
2282	 *
2283	 * Note: you cannot set ctime directly, although it will be
2284	 * updated as a side-effect of calling this function.
2285	 */
2286
2287	mutex_enter(&zp->z_lock);
2288
2289	if (mask & AT_MODE) {
2290		err = zfs_acl_chmod_setattr(zp, new_mode, tx);
2291		ASSERT3U(err, ==, 0);
2292	}
2293
	/* attrzp's z_lock is taken while zp's z_lock is already held. */
2294	if (attrzp)
2295		mutex_enter(&attrzp->z_lock);
2296
2297	if (mask & AT_UID) {
2298		zp->z_phys->zp_uid = (uint64_t)vap->va_uid;
2299		if (attrzp) {
2300			attrzp->z_phys->zp_uid = (uint64_t)vap->va_uid;
2301		}
2302	}
2303
2304	if (mask & AT_GID) {
2305		zp->z_phys->zp_gid = (uint64_t)vap->va_gid;
2306		if (attrzp)
2307			attrzp->z_phys->zp_gid = (uint64_t)vap->va_gid;
2308	}
2309
2310	if (attrzp)
2311		mutex_exit(&attrzp->z_lock);
2312
2313	if (mask & AT_ATIME)
2314		ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
2315
2316	if (mask & AT_MTIME)
2317		ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
2318
	/* A size change stamps as CONTENT_MODIFIED, anything else as STATE_CHANGED. */
2319	if (mask & AT_SIZE)
2320		zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx);
2321	else if (mask != 0)
2322		zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
2323
2324	if (mask != 0)
2325		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask);
2326
2327	mutex_exit(&zp->z_lock);
2328
2329	if (attrzp)
2330		VN_RELE(ZTOV(attrzp));
2331
2332	dmu_tx_commit(tx);
2333
2334	ZFS_EXIT(zfsvfs);
	/* err is 0 here unless zfs_acl_chmod_setattr() reported a failure. */
2335	return (err);
2336}
2337
/*
 * One element of the singly linked list that zfs_rename_lock() builds
 * and zfs_rename_unlock() tears down.  Each element remembers one
 * rwlock that was entered and, optionally, one znode whose vnode is
 * held, so that both can be released later.
 */
typedef struct zfs_zlock {
	krwlock_t	*zl_rwlock;	/* lock we acquired */
	znode_t		*zl_znode;	/* znode we held */
	struct zfs_zlock *zl_next;	/* next in list */
} zfs_zlock_t;
2343
2344/*
2345 * Drop locks and release vnodes that were held by zfs_rename_lock().
2346 */
2347static void
2348zfs_rename_unlock(zfs_zlock_t **zlpp)
2349{
2350	zfs_zlock_t *zl;
2351
2352	while ((zl = *zlpp) != NULL) {
2353		if (zl->zl_znode != NULL)
2354			VN_RELE(ZTOV(zl->zl_znode));
2355		rw_exit(zl->zl_rwlock);
2356		*zlpp = zl->zl_next;
2357		kmem_free(zl, sizeof (*zl));
2358	}
2359}
2360
/*
 * Search back through the directory tree, using the ".." entries.
 * Lock each directory in the chain to prevent concurrent renames.
 * Fail any attempt to move a directory into one of its own descendants.
 * XXX - z_parent_lock can overlap with map or grow locks
 *
 *	IN:	szp	- znode being renamed; its z_parent_lock is entered
 *			  as writer on the first pass.
 *		tdzp	- target directory; the upward walk starts here.
 *		sdzp	- source directory; the walk ends once it is reached.
 *
 *	OUT:	zlpp	- head of the list of locks entered / znodes held.
 *			  Elements are pushed before the error checks, so the
 *			  list may be non-empty on a non-zero return as well;
 *			  zfs_rename() hands it to zfs_rename_unlock() on
 *			  every path.
 *
 *	RETURN:	0 if the walk reached sdzp or the root directory
 *		EINVAL if tdzp is szp itself or one of its descendants
 *		error code from zfs_zget() if a parent could not be obtained
 */
static int
zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
{
	zfs_zlock_t	*zl;
	znode_t		*zp = tdzp;
	uint64_t	rootid = zp->z_zfsvfs->z_root;
	uint64_t	*oidp = &zp->z_id;
	krwlock_t	*rwlp = &szp->z_parent_lock;
	krw_t		rw = RW_WRITER;

	/*
	 * First pass write-locks szp and compares to zp->z_id.
	 * Later passes read-lock zp and compare to zp->z_parent.
	 */
	do {
		if (!rw_tryenter(rwlp, rw)) {
			/*
			 * Another thread is renaming in this path.
			 * Note that if we are a WRITER, we don't have any
			 * parent_locks held yet.
			 */
			if (rw == RW_READER && zp->z_id > szp->z_id) {
				/*
				 * Drop our locks and restart
				 *
				 * rw == RW_READER means at least one pass has
				 * completed, so zl is the element pushed last,
				 * i.e. the current list head (*zlpp).  The
				 * "continue" jumps to the loop condition with
				 * the state reset to that of the first pass.
				 */
				zfs_rename_unlock(&zl);
				*zlpp = NULL;
				zp = tdzp;
				oidp = &zp->z_id;
				rwlp = &szp->z_parent_lock;
				rw = RW_WRITER;
				continue;
			} else {
				/*
				 * Wait for other thread to drop its locks
				 */
				rw_enter(rwlp, rw);
			}
		}

		/* Record the lock just entered at the head of the list. */
		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
		zl->zl_rwlock = rwlp;
		zl->zl_znode = NULL;
		zl->zl_next = *zlpp;
		*zlpp = zl;

		if (*oidp == szp->z_id)		/* We're a descendant of szp */
			return (EINVAL);

		if (*oidp == rootid)		/* We've hit the top */
			return (0);

		if (rw == RW_READER) {		/* i.e. not the first pass */
			/*
			 * Step up to the parent named by *oidp.  The znode
			 * is remembered in the list element so that
			 * zfs_rename_unlock() releases its vnode.
			 */
			int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp);
			if (error)
				return (error);
			zl->zl_znode = zp;
		}
		/* Next pass: read-lock zp and examine its parent id. */
		oidp = &zp->z_phys->zp_parent;
		rwlp = &zp->z_parent_lock;
		rw = RW_READER;

	} while (zp->z_id != sdzp->z_id);

	return (0);
}
2433
/*
 * Move an entry from the provided source directory to the target
 * directory.  Change the entry name as indicated.
 *
 *	IN:	sdvp	- Source directory containing the "old entry".
 *		snm	- Old entry name.
 *		tdvp	- Target directory to contain the "new entry".
 *		tnm	- New entry name.
 *		cr	- credentials of caller.
 *
 *	RETURN:	0 if success
 *		error code if failure; among those produced directly here:
 *		EXDEV	- sdvp and tdvp are on different file systems
 *		EINVAL	- rename across the xattr / non-xattr boundary,
 *			  snm is "." or "..", tnm is "..", or a directory
 *			  would be moved beneath itself
 *		ENOTDIR	- source is a directory, existing target is not
 *		EISDIR	- source is not a directory, existing target is
 *
 * Timestamps:
 *	sdvp,tdvp - ctime|mtime updated
 */
static int
zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr)
{
	znode_t		*tdzp, *szp, *tzp;
	znode_t		*sdzp = VTOZ(sdvp);
	zfsvfs_t	*zfsvfs = sdzp->z_zfsvfs;
	zilog_t		*zilog = zfsvfs->z_log;
	vnode_t		*realvp;
	zfs_dirlock_t	*sdl, *tdl;
	dmu_tx_t	*tx;
	zfs_zlock_t	*zl;
	int		cmp, serr, terr, error;

	ZFS_ENTER(zfsvfs);

	/*
	 * Make sure we have the real vp for the target directory.
	 */
	if (VOP_REALVP(tdvp, &realvp) == 0)
		tdvp = realvp;

	if (tdvp->v_vfsp != sdvp->v_vfsp) {
		ZFS_EXIT(zfsvfs);
		return (EXDEV);
	}

	tdzp = VTOZ(tdvp);
top:
	/*
	 * Restart point after dmu_tx_assign() asked for a retry; all
	 * per-attempt state is reset here.
	 */
	szp = NULL;
	tzp = NULL;
	zl = NULL;

	/*
	 * This is to prevent the creation of links into attribute space
	 * by renaming a linked file into/outof an attribute directory.
	 * See the comment in zfs_link() for why this is considered bad.
	 */
	if ((tdzp->z_phys->zp_flags & ZFS_XATTR) !=
	    (sdzp->z_phys->zp_flags & ZFS_XATTR)) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/*
	 * Lock source and target directory entries.  To prevent deadlock,
	 * a lock ordering must be defined.  We lock the directory with
	 * the smallest object id first, or if it's a tie, the one with
	 * the lexically first name.
	 */
	if (sdzp->z_id < tdzp->z_id) {
		cmp = -1;
	} else if (sdzp->z_id > tdzp->z_id) {
		cmp = 1;
	} else {
		cmp = strcmp(snm, tnm);
		if (cmp == 0) {
			/*
			 * POSIX: "If the old argument and the new argument
			 * both refer to links to the same existing file,
			 * the rename() function shall return successfully
			 * and perform no other action."
			 */
			ZFS_EXIT(zfsvfs);
			return (0);
		}
	}
	if (cmp < 0) {
		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS);
		terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0);
	} else {
		terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0);
		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS);
	}

	if (serr) {
		/*
		 * Source entry invalid or not there.
		 */
		if (!terr) {
			/* Only the target side was locked; undo it. */
			zfs_dirent_unlock(tdl);
			if (tzp)
				VN_RELE(ZTOV(tzp));
		}
		if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0)
			serr = EINVAL;
		ZFS_EXIT(zfsvfs);
		return (serr);
	}
	if (terr) {
		/* Only the source side was locked; undo it. */
		zfs_dirent_unlock(sdl);
		VN_RELE(ZTOV(szp));
		if (strcmp(tnm, "..") == 0)
			terr = EINVAL;
		ZFS_EXIT(zfsvfs);
		return (terr);
	}

	/*
	 * Must have write access at the source to remove the old entry
	 * and write access at the target to create the new entry.
	 * Note that if target and source are the same, this can be
	 * done in a single check.
	 */

	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
		goto out;

	if (ZTOV(szp)->v_type == VDIR) {
		/*
		 * Check to make sure rename is valid.
		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
		 */
		if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
			goto out;
	}

	/*
	 * Does target exist?
	 */
	if (tzp) {
		/*
		 * Source and target must be the same type.
		 */
		if (ZTOV(szp)->v_type == VDIR) {
			if (ZTOV(tzp)->v_type != VDIR) {
				error = ENOTDIR;
				goto out;
			}
		} else {
			if (ZTOV(tzp)->v_type == VDIR) {
				error = EISDIR;
				goto out;
			}
		}
		/*
		 * POSIX dictates that when the source and target
		 * entries refer to the same file object, rename
		 * must do nothing and exit without error.
		 */
		if (szp->z_id == tzp->z_id) {
			error = 0;
			goto out;
		}
	}

	vnevent_rename_src(ZTOV(szp));
	if (tzp)
		vnevent_rename_dest(ZTOV(tzp));

	/*
	 * Declare everything the transaction may touch before assigning it.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_bonus(tx, szp->z_id);	/* nlink changes */
	dmu_tx_hold_bonus(tx, sdzp->z_id);	/* nlink changes */
	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
	if (sdzp != tdzp)
		dmu_tx_hold_bonus(tx, tdzp->z_id);	/* nlink changes */
	if (tzp)
		dmu_tx_hold_bonus(tx, tzp->z_id);	/* parent changes */
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	error = dmu_tx_assign(tx, zfsvfs->z_assign);
	if (error) {
		/*
		 * Release everything acquired since "top" so that either
		 * the retry starts from a clean state or the failure
		 * return leaves nothing held.
		 */
		if (zl != NULL)
			zfs_rename_unlock(&zl);
		zfs_dirent_unlock(sdl);
		zfs_dirent_unlock(tdl);
		VN_RELE(ZTOV(szp));
		if (tzp)
			VN_RELE(ZTOV(tzp));
		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (tzp)	/* Attempt to remove the existing target */
		error = zfs_link_destroy(tdl, tzp, tx, 0, NULL);

	if (error == 0) {
		/*
		 * Create the new name first, then remove the old one; the
		 * rename is logged only once both steps have been issued.
		 */
		error = zfs_link_create(tdl, szp, tx, ZRENAMING);
		if (error == 0) {
			error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
			ASSERT(error == 0);
			zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
			    sdl->dl_name, tdzp, tdl->dl_name, szp);
		}
#ifdef FREEBSD_NAMECACHE
		if (error == 0) {
			cache_purge(sdvp);
			cache_purge(tdvp);
		}
#endif
	}

	dmu_tx_commit(tx);
out:
	/*
	 * Common exit.  Every path that reaches this label holds both
	 * directory-entry locks and a reference on szp; zl and tzp are
	 * released only if they were set.
	 */
	if (zl != NULL)
		zfs_rename_unlock(&zl);

	zfs_dirent_unlock(sdl);
	zfs_dirent_unlock(tdl);

	VN_RELE(ZTOV(szp));
	if (tzp)
		VN_RELE(ZTOV(tzp));

	ZFS_EXIT(zfsvfs);

	return (error);
}
2663
/*
 * Insert the indicated symbolic reference entry into the directory.
 *
 *	IN:	dvp	- Directory to contain new symbolic link.
 *		name	- Name for new symlink entry.
 *		vap	- Attributes of new entry (va_type must be VLNK).
 *		link	- Target path stored in the new symlink.
 *		cr	- credentials of caller.
 *		td	- calling thread (not referenced in this function).
 *
 *	OUT:	vpp	- on success, the vnode of the new symlink, returned
 *			  locked exclusively.
 *
 *	RETURN:	0 if success
 *		ENAMETOOLONG if the target path is longer than MAXPATHLEN
 *		error code if failure
 *
 * Timestamps:
 *	dvp - ctime|mtime updated
 */
static int
zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, cred_t *cr, kthread_t *td)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog = zfsvfs->z_log;
	uint64_t	zoid;
	int		len = strlen(link);
	int		error;

	ASSERT(vap->va_type == VLNK);

	ZFS_ENTER(zfsvfs);
top:
	/* Restart point after dmu_tx_assign() asked for a retry. */
	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (len > MAXPATHLEN) {
		ZFS_EXIT(zfsvfs);
		return (ENAMETOOLONG);
	}

	/*
	 * Attempt to lock directory; fail if entry already exists.
	 */
	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZNEW)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
	dmu_tx_hold_bonus(tx, dzp->z_id);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE);
	error = dmu_tx_assign(tx, zfsvfs->z_assign);
	if (error) {
		/* Drop the directory-entry lock before retrying or failing. */
		zfs_dirent_unlock(dl);
		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	dmu_buf_will_dirty(dzp->z_dbuf, tx);

	/*
	 * Create a new object for the symlink.
	 * Put the link content into bonus buffer if it will fit;
	 * otherwise, store it just like any other file data.
	 */
	zoid = 0;
	if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) {
		zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, len);
		/* The link text is placed directly after the znode_phys_t. */
		if (len != 0)
			bcopy(link, zp->z_phys + 1, len);
	} else {
		dmu_buf_t *dbp;

		zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);

		/*
		 * Nothing can access the znode yet so no locking needed
		 * for growing the znode's blocksize.
		 */
		zfs_grow_blocksize(zp, len, tx);

		VERIFY(0 == dmu_buf_hold(zfsvfs->z_os, zoid, 0, FTAG, &dbp));
		dmu_buf_will_dirty(dbp, tx);

		ASSERT3U(len, <=, dbp->db_size);
		bcopy(link, dbp->db_data, len);
		dmu_buf_rele(dbp, FTAG);
	}
	zp->z_phys->zp_size = len;

	/*
	 * Insert the new object into the directory.
	 */
	(void) zfs_link_create(dl, zp, tx, ZNEW);
out:
	/*
	 * NOTE: no goto in this function targets "out".  "error" still holds
	 * the zero returned by dmu_tx_assign() here (the result of
	 * zfs_link_create() above is discarded), so the test below is
	 * always true on this path.
	 */
	if (error == 0) {
		zfs_log_symlink(zilog, tx, TX_SYMLINK, dzp, zp, name, link);
		*vpp = ZTOV(zp);
		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
	}

	dmu_tx_commit(tx);

	zfs_dirent_unlock(dl);

	ZFS_EXIT(zfsvfs);
	return (error);
}
2782
2783/*
2784 * Return, in the buffer contained in the provided uio structure,
2785 * the symbolic path referred to by vp.
2786 *
2787 *	IN:	vp	- vnode of symbolic link.
2788 *		uoip	- structure to contain the link path.
2789 *		cr	- credentials of caller.
2790 *
2791 *	OUT:	uio	- structure to contain the link path.
2792 *
2793 *	RETURN:	0 if success
2794 *		error code if failure
2795 *
2796 * Timestamps:
2797 *	vp - atime updated
2798 */
2799/* ARGSUSED */
2800static int
2801zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr)
2802{
2803	znode_t		*zp = VTOZ(vp);
2804	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2805	size_t		bufsz;
2806	int		error;
2807
2808	ZFS_ENTER(zfsvfs);
2809
2810	bufsz = (size_t)zp->z_phys->zp_size;
2811	if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) {
2812		error = uiomove(zp->z_phys + 1,
2813		    MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
2814	} else {
2815		dmu_buf_t *dbp;
2816		error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp);
2817		if (error) {
2818			ZFS_EXIT(zfsvfs);
2819			return (error);
2820		}
2821		error = uiomove(dbp->db_data,
2822		    MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
2823		dmu_buf_rele(dbp, FTAG);
2824	}
2825
2826	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2827	ZFS_EXIT(zfsvfs);
2828	return (error);
2829}
2830
/*
 * Insert a new entry into directory tdvp referencing svp.
 *
 *	IN:	tdvp	- Directory to contain new entry.
 *		svp	- vnode of new entry.
 *		name	- name of new entry.
 *		cr	- credentials of caller.
 *
 *	RETURN:	0 if success
 *		EXDEV if svp and tdvp are on different file systems
 *		EINVAL if exactly one of svp / tdvp is in attribute space
 *		EPERM if svp is a directory, or the caller does not own
 *		      svp and secpolicy_basic_link() refuses
 *		error code if failure
 *
 * Timestamps:
 *	tdvp - ctime|mtime updated
 *	 svp - ctime updated
 */
/* ARGSUSED */
static int
zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr)
{
	znode_t		*dzp = VTOZ(tdvp);
	znode_t		*tzp, *szp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog = zfsvfs->z_log;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	vnode_t		*realvp;
	int		error;

	ASSERT(tdvp->v_type == VDIR);

	ZFS_ENTER(zfsvfs);

	/* Operate on the real vnode behind svp, if there is one. */
	if (VOP_REALVP(svp, &realvp) == 0)
		svp = realvp;

	if (svp->v_vfsp != tdvp->v_vfsp) {
		ZFS_EXIT(zfsvfs);
		return (EXDEV);
	}

	szp = VTOZ(svp);
top:
	/* Restart point after dmu_tx_assign() asked for a retry. */
	/*
	 * We do not support links between attributes and non-attributes
	 * because of the potential security risk of creating links
	 * into "normal" file space in order to circumvent restrictions
	 * imposed in attribute space.
	 */
	if ((szp->z_phys->zp_flags & ZFS_XATTR) !=
	    (dzp->z_phys->zp_flags & ZFS_XATTR)) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/*
	 * POSIX dictates that we return EPERM here.
	 * Better choices include ENOTSUP or EISDIR.
	 */
	if (svp->v_type == VDIR) {
		ZFS_EXIT(zfsvfs);
		return (EPERM);
	}

	/* Non-owners need the basic-link privilege. */
	if ((uid_t)szp->z_phys->zp_uid != crgetuid(cr) &&
	    secpolicy_basic_link(cr) != 0) {
		ZFS_EXIT(zfsvfs);
		return (EPERM);
	}

	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Attempt to lock directory; fail if entry already exists.
	 */
	/* tzp is only an output slot for the call; it is not used below. */
	if (error = zfs_dirent_lock(&dl, dzp, name, &tzp, ZNEW)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_bonus(tx, szp->z_id);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	error = dmu_tx_assign(tx, zfsvfs->z_assign);
	if (error) {
		/* Drop the directory-entry lock before retrying or failing. */
		zfs_dirent_unlock(dl);
		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	error = zfs_link_create(dl, szp, tx, 0);

	/* Log the link only if it was actually created. */
	if (error == 0)
		zfs_log_link(zilog, tx, TX_LINK, dzp, szp, name);

	dmu_tx_commit(tx);

	zfs_dirent_unlock(dl);

	ZFS_EXIT(zfsvfs);
	return (error);
}
2941
/*
 * Called when the last use of vp goes away.
 *
 * If the file system has already been torn down (z_unmounted2 is set)
 * the vnode count is forced to zero, the znode is freed when it no
 * longer has a dbuf, and the hold on the VFS is dropped.  Otherwise a
 * pending access-time update is pushed out in its own transaction
 * (unless the file has been unlinked) and the znode is handed to
 * zfs_zinactive().
 *
 * z_um_lock is held as reader for the duration and dropped on both
 * exit paths.  The cr argument is not referenced.
 */
void
zfs_inactive(vnode_t *vp, cred_t *cr)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	rw_enter(&zfsvfs->z_um_lock, RW_READER);
	if (zfsvfs->z_unmounted2) {
		ASSERT(zp->z_dbuf_held == 0);

		mutex_enter(&zp->z_lock);
		VI_LOCK(vp);
		vp->v_count = 0; /* count arrives as 1 */
		VI_UNLOCK(vp);
		/* z_lock is dropped before the znode itself is freed. */
		if (zp->z_dbuf == NULL) {
			mutex_exit(&zp->z_lock);
			zfs_znode_free(zp);
		} else {
			mutex_exit(&zp->z_lock);
		}
		rw_exit(&zfsvfs->z_um_lock);
		VFS_RELE(zfsvfs->z_vfs);
		return;
	}

	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);

		dmu_tx_hold_bonus(tx, zp->z_id);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			/* Could not get a transaction; the flag stays set. */
			dmu_tx_abort(tx);
		} else {
			dmu_buf_will_dirty(zp->z_dbuf, tx);
			mutex_enter(&zp->z_lock);
			zp->z_atime_dirty = 0;
			mutex_exit(&zp->z_lock);
			dmu_tx_commit(tx);
		}
	}

	zfs_zinactive(zp);
	rw_exit(&zfsvfs->z_um_lock);
}
2987
/*
 * zfs_fid() writes a zfid_short or zfid_long through a pointer to the
 * caller's struct fid, so both layouts must fit inside struct fid.
 */
CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
2990
2991static int
2992zfs_fid(vnode_t *vp, fid_t *fidp)
2993{
2994	znode_t		*zp = VTOZ(vp);
2995	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2996	uint32_t	gen = (uint32_t)zp->z_phys->zp_gen;
2997	uint64_t	object = zp->z_id;
2998	zfid_short_t	*zfid;
2999	int		size, i;
3000
3001	ZFS_ENTER(zfsvfs);
3002
3003	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
3004	fidp->fid_len = size;
3005
3006	zfid = (zfid_short_t *)fidp;
3007
3008	zfid->zf_len = size;
3009
3010	for (i = 0; i < sizeof (zfid->zf_object); i++)
3011		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
3012
3013	/* Must have a non-zero generation number to distinguish from .zfs */
3014	if (gen == 0)
3015		gen = 1;
3016	for (i = 0; i < sizeof (zfid->zf_gen); i++)
3017		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
3018
3019	if (size == LONG_FID_LEN) {
3020		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
3021		zfid_long_t	*zlfid;
3022
3023		zlfid = (zfid_long_t *)fidp;
3024
3025		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
3026			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
3027
3028		/* XXX - this should be the generation number for the objset */
3029		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
3030			zlfid->zf_setgen[i] = 0;
3031	}
3032
3033	ZFS_EXIT(zfsvfs);
3034	return (0);
3035}
3036
3037static int
3038zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr)
3039{
3040	znode_t		*zp, *xzp;
3041	zfsvfs_t	*zfsvfs;
3042	zfs_dirlock_t	*dl;
3043	int		error;
3044
3045	switch (cmd) {
3046	case _PC_LINK_MAX:
3047		*valp = INT_MAX;
3048		return (0);
3049
3050	case _PC_FILESIZEBITS:
3051		*valp = 64;
3052		return (0);
3053
3054#if 0
3055	case _PC_XATTR_EXISTS:
3056		zp = VTOZ(vp);
3057		zfsvfs = zp->z_zfsvfs;
3058		ZFS_ENTER(zfsvfs);
3059		*valp = 0;
3060		error = zfs_dirent_lock(&dl, zp, "", &xzp,
3061		    ZXATTR | ZEXISTS | ZSHARED);
3062		if (error == 0) {
3063			zfs_dirent_unlock(dl);
3064			if (!zfs_dirempty(xzp))
3065				*valp = 1;
3066			VN_RELE(ZTOV(xzp));
3067		} else if (error == ENOENT) {
3068			/*
3069			 * If there aren't extended attributes, it's the
3070			 * same as having zero of them.
3071			 */
3072			error = 0;
3073		}
3074		ZFS_EXIT(zfsvfs);
3075		return (error);
3076#endif
3077
3078	case _PC_ACL_EXTENDED:
3079		*valp = 0;	/* TODO */
3080		return (0);
3081
3082	case _PC_MIN_HOLE_SIZE:
3083		*valp = (int)SPA_MINBLOCKSIZE;
3084		return (0);
3085
3086	default:
3087		return (EOPNOTSUPP);
3088	}
3089}
3090
#ifdef TODO
/*
 * Fetch the ACL of vp into vsecp by calling zfs_getacl() between
 * ZFS_ENTER() and ZFS_EXIT().  The flag argument is not used.
 * Compiled only when TODO is defined.
 */
/*ARGSUSED*/
static int
zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr)
{
	znode_t *znode = VTOZ(vp);
	zfsvfs_t *fs = znode->z_zfsvfs;
	int rc;

	ZFS_ENTER(fs);
	rc = zfs_getacl(znode, vsecp, cr);
	ZFS_EXIT(fs);

	return (rc);
}
#endif	/* TODO */
3107
#ifdef TODO
/*
 * Store the ACL described by vsecp on vp by calling zfs_setacl() between
 * ZFS_ENTER() and ZFS_EXIT().  The flag argument is not used.
 * Compiled only when TODO is defined.
 */
/*ARGSUSED*/
static int
zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr)
{
	znode_t *znode = VTOZ(vp);
	zfsvfs_t *fs = znode->z_zfsvfs;
	int rc;

	ZFS_ENTER(fs);
	rc = zfs_setacl(znode, vsecp, cr);
	ZFS_EXIT(fs);
	return (rc);
}
#endif	/* TODO */
3123
3124static int
3125zfs_freebsd_open(ap)
3126	struct vop_open_args /* {
3127		struct vnode *a_vp;
3128		int a_mode;
3129		struct ucred *a_cred;
3130		struct thread *a_td;
3131	} */ *ap;
3132{
3133	vnode_t	*vp = ap->a_vp;
3134	znode_t *zp = VTOZ(vp);
3135	int error;
3136
3137	error = zfs_open(&vp, ap->a_mode, ap->a_cred);
3138	if (error == 0)
3139		vnode_create_vobject(vp, zp->z_phys->zp_size, ap->a_td);
3140	return (error);
3141}
3142
3143static int
3144zfs_freebsd_close(ap)
3145	struct vop_close_args /* {
3146		struct vnode *a_vp;
3147		int  a_fflag;
3148		struct ucred *a_cred;
3149		struct thread *a_td;
3150	} */ *ap;
3151{
3152
3153	return (zfs_close(ap->a_vp, ap->a_fflag, 0, 0, ap->a_cred));
3154}
3155
3156static int
3157zfs_freebsd_ioctl(ap)
3158	struct vop_ioctl_args /* {
3159		struct vnode *a_vp;
3160		u_long a_command;
3161		caddr_t a_data;
3162		int a_fflag;
3163		struct ucred *cred;
3164		struct thread *td;
3165	} */ *ap;
3166{
3167
3168	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
3169	    ap->a_fflag, ap->a_cred, NULL));
3170}
3171
3172static int
3173zfs_freebsd_read(ap)
3174	struct vop_read_args /* {
3175		struct vnode *a_vp;
3176		struct uio *a_uio;
3177		int a_ioflag;
3178		struct ucred *a_cred;
3179	} */ *ap;
3180{
3181
3182	return (zfs_read(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL));
3183}
3184
3185static int
3186zfs_freebsd_write(ap)
3187	struct vop_write_args /* {
3188		struct vnode *a_vp;
3189		struct uio *a_uio;
3190		int a_ioflag;
3191		struct ucred *a_cred;
3192	} */ *ap;
3193{
3194
3195	return (zfs_write(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL));
3196}
3197
3198static int
3199zfs_freebsd_access(ap)
3200	struct vop_access_args /* {
3201		struct vnode *a_vp;
3202		int  a_mode;
3203		struct ucred *a_cred;
3204		struct thread *a_td;
3205	} */ *ap;
3206{
3207
3208	return (zfs_access(ap->a_vp, ap->a_mode, 0, ap->a_cred));
3209}
3210
3211static int
3212zfs_freebsd_lookup(ap)
3213	struct vop_lookup_args /* {
3214		struct vnode *a_dvp;
3215		struct vnode **a_vpp;
3216		struct componentname *a_cnp;
3217	} */ *ap;
3218{
3219	struct componentname *cnp = ap->a_cnp;
3220	char nm[NAME_MAX + 1];
3221
3222	ASSERT(cnp->cn_namelen < sizeof(nm));
3223	strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
3224
3225	return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
3226	    cnp->cn_cred, cnp->cn_thread));
3227}
3228
3229static int
3230zfs_freebsd_create(ap)
3231	struct vop_create_args /* {
3232		struct vnode *a_dvp;
3233		struct vnode **a_vpp;
3234		struct componentname *a_cnp;
3235		struct vattr *a_vap;
3236	} */ *ap;
3237{
3238	struct componentname *cnp = ap->a_cnp;
3239	vattr_t *vap = ap->a_vap;
3240	int mode;
3241
3242	ASSERT(cnp->cn_flags & SAVENAME);
3243
3244	vattr_init_mask(vap);
3245	mode = vap->va_mode & ALLPERMS;
3246
3247	return (zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
3248	    ap->a_vpp, cnp->cn_cred, cnp->cn_thread));
3249}
3250
3251static int
3252zfs_freebsd_remove(ap)
3253	struct vop_remove_args /* {
3254		struct vnode *a_dvp;
3255		struct vnode *a_vp;
3256		struct componentname *a_cnp;
3257	} */ *ap;
3258{
3259
3260	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
3261
3262	return (zfs_remove(ap->a_dvp, ap->a_cnp->cn_nameptr,
3263	    ap->a_cnp->cn_cred));
3264}
3265
3266static int
3267zfs_freebsd_mkdir(ap)
3268	struct vop_mkdir_args /* {
3269		struct vnode *a_dvp;
3270		struct vnode **a_vpp;
3271		struct componentname *a_cnp;
3272		struct vattr *a_vap;
3273	} */ *ap;
3274{
3275	vattr_t *vap = ap->a_vap;
3276
3277	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
3278
3279	vattr_init_mask(vap);
3280
3281	return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
3282	    ap->a_cnp->cn_cred));
3283}
3284
3285static int
3286zfs_freebsd_rmdir(ap)
3287	struct vop_rmdir_args /* {
3288		struct vnode *a_dvp;
3289		struct vnode *a_vp;
3290		struct componentname *a_cnp;
3291	} */ *ap;
3292{
3293	struct componentname *cnp = ap->a_cnp;
3294
3295	ASSERT(cnp->cn_flags & SAVENAME);
3296
3297	return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred));
3298}
3299
3300static int
3301zfs_freebsd_readdir(ap)
3302	struct vop_readdir_args /* {
3303		struct vnode *a_vp;
3304		struct uio *a_uio;
3305		struct ucred *a_cred;
3306		int *a_eofflag;
3307		int *a_ncookies;
3308		u_long **a_cookies;
3309	} */ *ap;
3310{
3311
3312	return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
3313	    ap->a_ncookies, ap->a_cookies));
3314}
3315
3316static int
3317zfs_freebsd_fsync(ap)
3318	struct vop_fsync_args /* {
3319		struct vnode *a_vp;
3320		int a_waitfor;
3321		struct thread *a_td;
3322	} */ *ap;
3323{
3324
3325	vop_stdfsync(ap);
3326	return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred));
3327}
3328
3329static int
3330zfs_freebsd_getattr(ap)
3331	struct vop_getattr_args /* {
3332		struct vnode *a_vp;
3333		struct vattr *a_vap;
3334		struct ucred *a_cred;
3335		struct thread *a_td;
3336	} */ *ap;
3337{
3338
3339	return (zfs_getattr(ap->a_vp, ap->a_vap, 0, ap->a_cred));
3340}
3341
3342static int
3343zfs_freebsd_setattr(ap)
3344	struct vop_setattr_args /* {
3345		struct vnode *a_vp;
3346		struct vattr *a_vap;
3347		struct ucred *a_cred;
3348		struct thread *a_td;
3349	} */ *ap;
3350{
3351	vattr_t *vap = ap->a_vap;
3352
3353	/* No support for FreeBSD's chflags(2). */
3354	if (vap->va_flags != VNOVAL)
3355		return (EOPNOTSUPP);
3356
3357	vattr_init_mask(vap);
3358	vap->va_mask &= ~AT_NOSET;
3359
3360	return (zfs_setattr(ap->a_vp, vap, 0, ap->a_cred, NULL));
3361}
3362
3363static int
3364zfs_freebsd_rename(ap)
3365	struct vop_rename_args  /* {
3366		struct vnode *a_fdvp;
3367		struct vnode *a_fvp;
3368		struct componentname *a_fcnp;
3369		struct vnode *a_tdvp;
3370		struct vnode *a_tvp;
3371		struct componentname *a_tcnp;
3372	} */ *ap;
3373{
3374	vnode_t *fdvp = ap->a_fdvp;
3375	vnode_t *fvp = ap->a_fvp;
3376	vnode_t *tdvp = ap->a_tdvp;
3377	vnode_t *tvp = ap->a_tvp;
3378	int error;
3379
3380	ASSERT(ap->a_fcnp->cn_flags & SAVENAME);
3381	ASSERT(ap->a_tcnp->cn_flags & SAVENAME);
3382
3383	error = zfs_rename(fdvp, ap->a_fcnp->cn_nameptr, tdvp,
3384	    ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred);
3385
3386	if (tdvp == tvp)
3387		VN_RELE(tdvp);
3388	else
3389		VN_URELE(tdvp);
3390	if (tvp)
3391		VN_URELE(tvp);
3392	VN_RELE(fdvp);
3393	VN_RELE(fvp);
3394
3395	return (error);
3396}
3397
3398static int
3399zfs_freebsd_symlink(ap)
3400	struct vop_symlink_args /* {
3401		struct vnode *a_dvp;
3402		struct vnode **a_vpp;
3403		struct componentname *a_cnp;
3404		struct vattr *a_vap;
3405		char *a_target;
3406	} */ *ap;
3407{
3408	struct componentname *cnp = ap->a_cnp;
3409	vattr_t *vap = ap->a_vap;
3410
3411	ASSERT(cnp->cn_flags & SAVENAME);
3412
3413	vap->va_type = VLNK;	/* FreeBSD: Syscall only sets va_mode. */
3414	vattr_init_mask(vap);
3415
3416	return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
3417	    ap->a_target, cnp->cn_cred, cnp->cn_thread));
3418}
3419
3420static int
3421zfs_freebsd_readlink(ap)
3422	struct vop_readlink_args /* {
3423		struct vnode *a_vp;
3424		struct uio *a_uio;
3425		struct ucred *a_cred;
3426	} */ *ap;
3427{
3428
3429	return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred));
3430}
3431
3432static int
3433zfs_freebsd_link(ap)
3434	struct vop_link_args /* {
3435		struct vnode *a_tdvp;
3436		struct vnode *a_vp;
3437		struct componentname *a_cnp;
3438	} */ *ap;
3439{
3440	struct componentname *cnp = ap->a_cnp;
3441
3442	ASSERT(cnp->cn_flags & SAVENAME);
3443
3444	return (zfs_link(ap->a_tdvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
3445}
3446
3447static int
3448zfs_freebsd_inactive(ap)
3449	struct vop_inactive_args /* {
3450		struct vnode *a_vp;
3451		struct thread *a_td;
3452	} */ *ap;
3453{
3454	vnode_t *vp = ap->a_vp;
3455
3456	zfs_inactive(vp, ap->a_td->td_ucred);
3457	return (0);
3458}
3459
/*
 * FreeBSD reclaim entry point: detach the znode from a vnode that is
 * being recycled.
 *
 * The vnode's VM object is destroyed first.  If the file has not been
 * unlinked, the znode's dbuf hold is dropped and its back pointer to
 * the vnode is cleared.  The vnode's v_data is always cleared.  The VFS
 * hold is released only for a file that is not unlinked and whose
 * v_count was 0 when checked under the vnode interlock.
 * Always returns 0.
 */
static int
zfs_freebsd_reclaim(ap)
	struct vop_reclaim_args /* {
		struct vnode *a_vp;
		struct thread *a_td;
	} */ *ap;
{
	vnode_t	*vp = ap->a_vp;
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs;
	int rele = 1;	/* cleared below if the vnode still has users */

	ASSERT(zp != NULL);

	/*
	 * Destroy the vm object and flush associated pages.
	 */
	vnode_destroy_vobject(vp);

	mutex_enter(&zp->z_lock);
	ASSERT(zp->z_phys);
	ASSERT(zp->z_dbuf_held);
	zfsvfs = zp->z_zfsvfs;
	if (!zp->z_unlinked) {
		/*
		 * Mark the hold as gone and detach the vnode while z_lock
		 * is held; the dbuf itself is released after the lock is
		 * dropped.
		 */
		zp->z_dbuf_held = 0;
		ZTOV(zp) = NULL;
		mutex_exit(&zp->z_lock);
		dmu_buf_rele(zp->z_dbuf, NULL);
	} else {
		/* Unlinked: the znode keeps its dbuf hold and vnode pointer. */
		mutex_exit(&zp->z_lock);
	}
	VI_LOCK(vp);
	if (vp->v_count > 0)
		rele = 0;
	vp->v_data = NULL;
	ASSERT(vp->v_holdcnt >= 1);
	VI_UNLOCK(vp);
	if (!zp->z_unlinked && rele)
		VFS_RELE(zfsvfs->z_vfs);
	return (0);
}
3501
3502static int
3503zfs_freebsd_fid(ap)
3504	struct vop_fid_args /* {
3505		struct vnode *a_vp;
3506		struct fid *a_fid;
3507	} */ *ap;
3508{
3509
3510	return (zfs_fid(ap->a_vp, (void *)ap->a_fid));
3511}
3512
3513static int
3514zfs_freebsd_pathconf(ap)
3515	struct vop_pathconf_args /* {
3516		struct vnode *a_vp;
3517		int a_name;
3518		register_t *a_retval;
3519	} */ *ap;
3520{
3521	ulong_t val;
3522	int error;
3523
3524	error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred);
3525	if (error == 0)
3526		*ap->a_retval = val;
3527	else if (error == EOPNOTSUPP)
3528		error = vop_stdpathconf(ap);
3529	return (error);
3530}
3531
3532/*
3533 * Advisory record locking support
3534 */
3535static int
3536zfs_freebsd_advlock(ap)
3537	struct vop_advlock_args /* {
3538		struct vnode *a_vp;
3539		caddr_t  a_id;
3540		int  a_op;
3541		struct flock *a_fl;
3542		int  a_flags;
3543	} */ *ap;
3544{
3545	znode_t	*zp = VTOZ(ap->a_vp);
3546
3547	return (lf_advlock(ap, &(zp->z_lockf), zp->z_phys->zp_size));
3548}
3549
/* Tentative declarations; both tables are defined below. */
struct vop_vector zfs_vnodeops;
struct vop_vector zfs_fifoops;

/*
 * Vnode operations table for ZFS files and directories.  Operations
 * not listed here fall through to default_vnodeops.
 */
struct vop_vector zfs_vnodeops = {
	.vop_default =	&default_vnodeops,
	.vop_inactive =	zfs_freebsd_inactive,
	.vop_reclaim =	zfs_freebsd_reclaim,
	.vop_access =	zfs_freebsd_access,
	/*
	 * With FREEBSD_NAMECACHE, lookups go through vfs_cache_lookup()
	 * and zfs_freebsd_lookup() is installed as the cached-lookup
	 * operation instead of the direct lookup.
	 */
#ifdef FREEBSD_NAMECACHE
	.vop_lookup =	vfs_cache_lookup,
	.vop_cachedlookup = zfs_freebsd_lookup,
#else
	.vop_lookup =	zfs_freebsd_lookup,
#endif
	.vop_getattr =	zfs_freebsd_getattr,
	.vop_setattr =	zfs_freebsd_setattr,
	.vop_create =	zfs_freebsd_create,
	/* mknod shares the create implementation. */
	.vop_mknod =	zfs_freebsd_create,
	.vop_mkdir =	zfs_freebsd_mkdir,
	.vop_readdir =	zfs_freebsd_readdir,
	.vop_fsync =	zfs_freebsd_fsync,
	.vop_open =	zfs_freebsd_open,
	.vop_close =	zfs_freebsd_close,
	.vop_rmdir =	zfs_freebsd_rmdir,
	.vop_ioctl =	zfs_freebsd_ioctl,
	.vop_link =	zfs_freebsd_link,
	.vop_symlink =	zfs_freebsd_symlink,
	.vop_readlink =	zfs_freebsd_readlink,
	.vop_read =	zfs_freebsd_read,
	.vop_write =	zfs_freebsd_write,
	.vop_remove =	zfs_freebsd_remove,
	.vop_rename =	zfs_freebsd_rename,
	.vop_advlock =	zfs_freebsd_advlock,
	.vop_pathconf =	zfs_freebsd_pathconf,
	.vop_bmap =	VOP_EOPNOTSUPP,
	.vop_fid =	zfs_freebsd_fid,
};
3587
/*
 * Vnode operations table for FIFOs stored on ZFS.  Attribute, access,
 * fid and lifecycle operations use the ZFS implementations; read, write
 * and fsync are set to VOP_PANIC; everything else falls through to
 * fifo_specops.
 */
struct vop_vector zfs_fifoops = {
	.vop_default =	&fifo_specops,
	.vop_fsync =	VOP_PANIC,
	.vop_access =	zfs_freebsd_access,
	.vop_getattr =	zfs_freebsd_getattr,
	.vop_inactive =	zfs_freebsd_inactive,
	.vop_read =	VOP_PANIC,
	.vop_reclaim =	zfs_freebsd_reclaim,
	.vop_setattr =	zfs_freebsd_setattr,
	.vop_write =	VOP_PANIC,
	.vop_fid =	zfs_freebsd_fid,
};
3600