zfs_vnops.c revision 169057
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident	"%Z%%M%	%I%	%E% SMI"
27
28#include <sys/types.h>
29#include <sys/param.h>
30#include <sys/time.h>
31#include <sys/systm.h>
32#include <sys/sysmacros.h>
33#include <sys/resource.h>
34#include <sys/vfs.h>
35#include <sys/vnode.h>
36#include <sys/file.h>
37#include <sys/stat.h>
38#include <sys/kmem.h>
39#include <sys/taskq.h>
40#include <sys/uio.h>
41#include <sys/atomic.h>
42#include <sys/namei.h>
43#include <sys/mman.h>
44#include <sys/cmn_err.h>
45#include <sys/errno.h>
46#include <sys/unistd.h>
47#include <sys/zfs_vfsops.h>
48#include <sys/zfs_dir.h>
49#include <sys/zfs_acl.h>
50#include <sys/zfs_ioctl.h>
51#include <sys/fs/zfs.h>
52#include <sys/dmu.h>
53#include <sys/spa.h>
54#include <sys/txg.h>
55#include <sys/dbuf.h>
56#include <sys/zap.h>
57#include <sys/dirent.h>
58#include <sys/policy.h>
59#include <sys/sunddi.h>
60#include <sys/filio.h>
61#include <sys/zfs_ctldir.h>
62#include <sys/dnlc.h>
63#include <sys/zfs_rlock.h>
64#include <sys/bio.h>
65#include <sys/buf.h>
66#include <sys/sf_buf.h>
67#include <sys/sched.h>
68
69/*
70 * Programming rules.
71 *
72 * Each vnode op performs some logical unit of work.  To do this, the ZPL must
73 * properly lock its in-core state, create a DMU transaction, do the work,
74 * record this work in the intent log (ZIL), commit the DMU transaction,
 * and wait for the intent log to commit if it is a synchronous operation.
 * Moreover, the vnode ops must work in both normal and log replay context.
77 * The ordering of events is important to avoid deadlocks and references
78 * to freed memory.  The example below illustrates the following Big Rules:
79 *
80 *  (1) A check must be made in each zfs thread for a mounted file system.
81 *	This is done avoiding races using ZFS_ENTER(zfsvfs).
82 *	A ZFS_EXIT(zfsvfs) is needed before all returns.
83 *
84 *  (2)	VN_RELE() should always be the last thing except for zil_commit()
85 *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
86 *	First, if it's the last reference, the vnode/znode
87 *	can be freed, so the zp may point to freed memory.  Second, the last
88 *	reference will call zfs_zinactive(), which may induce a lot of work --
89 *	pushing cached pages (which acquires range locks) and syncing out
90 *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
91 *	which could deadlock the system if you were already holding one.
92 *
93 *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
94 *	as they can span dmu_tx_assign() calls.
95 *
96 *  (4)	Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign().
97 *	In normal operation, this will be TXG_NOWAIT.  During ZIL replay,
98 *	it will be a specific txg.  Either way, dmu_tx_assign() never blocks.
99 *	This is critical because we don't want to block while holding locks.
100 *	Note, in particular, that if a lock is sometimes acquired before
101 *	the tx assigns, and sometimes after (e.g. z_lock), then failing to
102 *	use a non-blocking assign can deadlock the system.  The scenario:
103 *
104 *	Thread A has grabbed a lock before calling dmu_tx_assign().
105 *	Thread B is in an already-assigned tx, and blocks for this lock.
106 *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
107 *	forever, because the previous txg can't quiesce until B's tx commits.
108 *
109 *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
110 *	then drop all locks, call dmu_tx_wait(), and try again.
111 *
112 *  (5)	If the operation succeeded, generate the intent log entry for it
113 *	before dropping locks.  This ensures that the ordering of events
114 *	in the intent log matches the order in which they actually occurred.
115 *
116 *  (6)	At the end of each vnode op, the DMU tx must always commit,
117 *	regardless of whether there were any errors.
118 *
119 *  (7)	After dropping all locks, invoke zil_commit(zilog, seq, foid)
120 *	to ensure that synchronous semantics are provided when necessary.
121 *
122 * In general, this is how things should be ordered in each vnode op:
123 *
124 *	ZFS_ENTER(zfsvfs);		// exit if unmounted
125 * top:
126 *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may VN_HOLD())
127 *	rw_enter(...);			// grab any other locks you need
128 *	tx = dmu_tx_create(...);	// get DMU tx
129 *	dmu_tx_hold_*();		// hold each object you might modify
130 *	error = dmu_tx_assign(tx, zfsvfs->z_assign);	// try to assign
131 *	if (error) {
132 *		rw_exit(...);		// drop locks
133 *		zfs_dirent_unlock(dl);	// unlock directory entry
134 *		VN_RELE(...);		// release held vnodes
135 *		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
136 *			dmu_tx_wait(tx);
137 *			dmu_tx_abort(tx);
138 *			goto top;
139 *		}
140 *		dmu_tx_abort(tx);	// abort DMU tx
141 *		ZFS_EXIT(zfsvfs);	// finished in zfs
142 *		return (error);		// really out of space
143 *	}
144 *	error = do_real_work();		// do whatever this VOP does
145 *	if (error == 0)
146 *		zfs_log_*(...);		// on success, make ZIL entry
147 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
148 *	rw_exit(...);			// drop locks
149 *	zfs_dirent_unlock(dl);		// unlock directory entry
150 *	VN_RELE(...);			// release held vnodes
151 *	zil_commit(zilog, seq, foid);	// synchronous when necessary
152 *	ZFS_EXIT(zfsvfs);		// finished in zfs
153 *	return (error);			// done, report error
154 */
155/* ARGSUSED */
156static int
157zfs_open(vnode_t **vpp, int flag, cred_t *cr)
158{
159	znode_t	*zp = VTOZ(*vpp);
160
161	/* Keep a count of the synchronous opens in the znode */
162	if (flag & (FSYNC | FDSYNC))
163		atomic_inc_32(&zp->z_sync_cnt);
164	return (0);
165}
166
167/* ARGSUSED */
168static int
169zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
170{
171	znode_t	*zp = VTOZ(vp);
172
173	/* Decrement the synchronous opens in the znode */
174	if (flag & (FSYNC | FDSYNC))
175		atomic_dec_32(&zp->z_sync_cnt);
176
177	/*
178	 * Clean up any locks held by this process on the vp.
179	 */
180	cleanlocks(vp, ddi_get_pid(), 0);
181	cleanshares(vp, ddi_get_pid());
182
183	return (0);
184}
185
186/*
187 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
188 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
189 */
190static int
191zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
192{
193	znode_t	*zp = VTOZ(vp);
194	uint64_t noff = (uint64_t)*off; /* new offset */
195	uint64_t file_sz;
196	int error;
197	boolean_t hole;
198
199	file_sz = zp->z_phys->zp_size;
200	if (noff >= file_sz)  {
201		return (ENXIO);
202	}
203
204	if (cmd == _FIO_SEEK_HOLE)
205		hole = B_TRUE;
206	else
207		hole = B_FALSE;
208
209	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
210
211	/* end of file? */
212	if ((error == ESRCH) || (noff > file_sz)) {
213		/*
214		 * Handle the virtual hole at the end of file.
215		 */
216		if (hole) {
217			*off = file_sz;
218			return (0);
219		}
220		return (ENXIO);
221	}
222
223	if (noff < *off)
224		return (error);
225	*off = noff;
226	return (error);
227}
228
229/* ARGSUSED */
230static int
231zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
232    int *rvalp)
233{
234	offset_t off;
235	int error;
236	zfsvfs_t *zfsvfs;
237
238	switch (com) {
239	    case _FIOFFS:
240		return (0);
241
242		/*
243		 * The following two ioctls are used by bfu.  Faking out,
244		 * necessary to avoid bfu errors.
245		 */
246	    case _FIOGDIO:
247	    case _FIOSDIO:
248		return (0);
249
250	    case _FIO_SEEK_DATA:
251	    case _FIO_SEEK_HOLE:
252		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
253			return (EFAULT);
254
255		zfsvfs = VTOZ(vp)->z_zfsvfs;
256		ZFS_ENTER(zfsvfs);
257
258		/* offset parameter is in/out */
259		error = zfs_holey(vp, com, &off);
260		ZFS_EXIT(zfsvfs);
261		if (error)
262			return (error);
263		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
264			return (EFAULT);
265		return (0);
266	}
267	return (ENOTTY);
268}
269
/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Write:	If we find a memory mapped page, we write to *both*
 *		the page and the dmu buffer.
 *
 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 *	the file is memory mapped.
 *
 *	IN:	vp	- vnode being written; its v_object supplies the pages.
 *		nbytes	- number of bytes to take from uio.
 *		uio	- source data and current file offset (uio_loffset).
 *		tx	- DMU transaction handed to the DMU write calls.
 *
 *	RETURN:	0 if success
 *		error from uiomove() or dmu_write_uio() if failure
 */
static int
mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx)
{
	znode_t *zp = VTOZ(vp);
	objset_t *os = zp->z_zfsvfs->z_os;
	vm_object_t obj;
	vm_page_t m;
	struct sf_buf *sf;
	int64_t start, off;
	int len = nbytes;
	int error = 0;

	ASSERT(vp->v_mount != NULL);
	obj = vp->v_object;
	ASSERT(obj != NULL);

	/*
	 * "start" walks the file one page at a time; "off" is the offset
	 * inside the first page and is reset to 0 after the first pass.
	 */
	start = uio->uio_loffset;
	off = start & PAGEOFFSET;
	VM_OBJECT_LOCK(obj);
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		/* Never handle more than the rest of the current page. */
		uint64_t bytes = MIN(PAGESIZE - off, len);

again:
		if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
		    vm_page_is_valid(m, (vm_offset_t)off, bytes)) {
			uint64_t woff, dmubytes;
			caddr_t va;

			/*
			 * The page is resident and valid for this range.
			 * If somebody else has it busy, wait and look it
			 * up again.
			 */
			if (vm_page_sleep_if_busy(m, FALSE, "zfsmwb"))
				goto again;
			woff = uio->uio_loffset;
			/*
			 * The DMU write below covers the page from its
			 * start (woff - off), clamped to the pager's idea
			 * of the object size.
			 */
			dmubytes = MIN(PAGESIZE,
			    obj->un_pager.vnp.vnp_size - (woff - off));
			vm_page_busy(m);
			vm_page_lock_queues();
			vm_page_undirty(m);
			vm_page_unlock_queues();
			/*
			 * The object lock is dropped around the copy and
			 * the DMU write; the page stays busied meanwhile.
			 */
			VM_OBJECT_UNLOCK(obj);
			sched_pin();
			sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
			va = (caddr_t)sf_buf_kva(sf);
			/* Copy the user data into the page ... */
			error = uiomove(va + off, bytes, UIO_WRITE, uio);
			/*
			 * ... and push the page contents into the DMU.
			 * NOTE(review): this runs even when uiomove()
			 * returned an error -- confirm that is intended.
			 */
			dmu_write(os, zp->z_id, woff - off, dmubytes, va, tx);
			sf_buf_free(sf);
			sched_unpin();
			VM_OBJECT_LOCK(obj);
			vm_page_wakeup(m);
		} else {
			/*
			 * No valid resident page for this range: write
			 * straight from the uio into the DMU.
			 */
			VM_OBJECT_UNLOCK(obj);
			error = dmu_write_uio(os, zp->z_id, uio, bytes, tx);
			VM_OBJECT_LOCK(obj);
		}
		len -= bytes;
		off = 0;
		if (error)
			break;
	}
	VM_OBJECT_UNLOCK(obj);
	return (error);
}
340
341/*
342 * When a file is memory mapped, we must keep the IO data synchronized
343 * between the DMU cache and the memory mapped pages.  What this means:
344 *
345 * On Read:	We "read" preferentially from memory mapped pages,
346 *		else we default from the dmu buffer.
347 *
348 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
349 *	the file is memory mapped.
350 */
351static int
352mappedread(vnode_t *vp, int nbytes, uio_t *uio)
353{
354	znode_t *zp = VTOZ(vp);
355	objset_t *os = zp->z_zfsvfs->z_os;
356	vm_object_t obj;
357	vm_page_t m;
358	struct sf_buf *sf;
359	int64_t start, off;
360	caddr_t va;
361	int len = nbytes;
362	int error = 0;
363
364	ASSERT(vp->v_mount != NULL);
365	obj = vp->v_object;
366	ASSERT(obj != NULL);
367
368	start = uio->uio_loffset;
369	off = start & PAGEOFFSET;
370	VM_OBJECT_LOCK(obj);
371	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
372		uint64_t bytes = MIN(PAGESIZE - off, len);
373
374again:
375		if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
376		    vm_page_is_valid(m, (vm_offset_t)off, bytes)) {
377			if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb"))
378				goto again;
379			vm_page_busy(m);
380			VM_OBJECT_UNLOCK(obj);
381			sched_pin();
382			sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
383			va = (caddr_t)sf_buf_kva(sf);
384			error = uiomove(va + off, bytes, UIO_READ, uio);
385			sf_buf_free(sf);
386			sched_unpin();
387			VM_OBJECT_LOCK(obj);
388			vm_page_wakeup(m);
389		} else if (m != NULL && uio->uio_segflg == UIO_NOCOPY) {
390			/*
391			 * The code below is here to make sendfile(2) work
392			 * correctly with ZFS. As pointed out by ups@
393			 * sendfile(2) should be changed to use VOP_GETPAGES(),
394			 * but it pessimize performance of sendfile/UFS, that's
395			 * why I handle this special case in ZFS code.
396			 */
397			if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb"))
398				goto again;
399			vm_page_busy(m);
400			VM_OBJECT_UNLOCK(obj);
401			sched_pin();
402			sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
403			va = (caddr_t)sf_buf_kva(sf);
404			error = dmu_read(os, zp->z_id, start + off, bytes,
405			    (void *)(va + off));
406			sf_buf_free(sf);
407			sched_unpin();
408			VM_OBJECT_LOCK(obj);
409			vm_page_wakeup(m);
410			uio->uio_resid -= bytes;
411		} else {
412			VM_OBJECT_UNLOCK(obj);
413			error = dmu_read_uio(os, zp->z_id, uio, bytes);
414			VM_OBJECT_LOCK(obj);
415		}
416		len -= bytes;
417		off = 0;
418		if (error)
419			break;
420	}
421	VM_OBJECT_UNLOCK(obj);
422	return (error);
423}
424
425offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
426
427/*
428 * Read bytes from specified file into supplied buffer.
429 *
430 *	IN:	vp	- vnode of file to be read from.
431 *		uio	- structure supplying read location, range info,
432 *			  and return buffer.
433 *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
434 *		cr	- credentials of caller.
435 *
436 *	OUT:	uio	- updated offset and range, buffer filled.
437 *
438 *	RETURN:	0 if success
439 *		error code if failure
440 *
441 * Side Effects:
442 *	vp - atime updated if byte count > 0
443 */
444/* ARGSUSED */
445static int
446zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
447{
448	znode_t		*zp = VTOZ(vp);
449	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
450	objset_t	*os = zfsvfs->z_os;
451	ssize_t		n, nbytes;
452	int		error;
453	rl_t		*rl;
454
455	ZFS_ENTER(zfsvfs);
456
457	/*
458	 * Validate file offset
459	 */
460	if (uio->uio_loffset < (offset_t)0) {
461		ZFS_EXIT(zfsvfs);
462		return (EINVAL);
463	}
464
465	/*
466	 * Fasttrack empty reads
467	 */
468	if (uio->uio_resid == 0) {
469		ZFS_EXIT(zfsvfs);
470		return (0);
471	}
472
473	/*
474	 * Check for mandatory locks
475	 */
476	if (MANDMODE((mode_t)zp->z_phys->zp_mode)) {
477		if (error = chklock(vp, FREAD,
478		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
479			ZFS_EXIT(zfsvfs);
480			return (error);
481		}
482	}
483
484	/*
485	 * If we're in FRSYNC mode, sync out this znode before reading it.
486	 */
487	if (ioflag & FRSYNC)
488		zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
489
490	/*
491	 * Lock the range against changes.
492	 */
493	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
494
495	/*
496	 * If we are reading past end-of-file we can skip
497	 * to the end; but we might still need to set atime.
498	 */
499	if (uio->uio_loffset >= zp->z_phys->zp_size) {
500		error = 0;
501		goto out;
502	}
503
504	ASSERT(uio->uio_loffset < zp->z_phys->zp_size);
505	n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset);
506
507	while (n > 0) {
508		nbytes = MIN(n, zfs_read_chunk_size -
509		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
510
511		if (vn_has_cached_data(vp))
512			error = mappedread(vp, nbytes, uio);
513		else
514			error = dmu_read_uio(os, zp->z_id, uio, nbytes);
515		if (error)
516			break;
517
518		n -= nbytes;
519	}
520
521out:
522	zfs_range_unlock(rl);
523
524	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
525	ZFS_EXIT(zfsvfs);
526	return (error);
527}
528
529/*
530 * Fault in the pages of the first n bytes specified by the uio structure.
531 * 1 byte in each page is touched and the uio struct is unmodified.
532 * Any error will exit this routine as this is only a best
533 * attempt to get the pages resident. This is a copy of ufs_trans_touch().
534 */
535static void
536zfs_prefault_write(ssize_t n, struct uio *uio)
537{
538	struct iovec *iov;
539	ulong_t cnt, incr;
540	caddr_t p;
541
542	if (uio->uio_segflg != UIO_USERSPACE)
543		return;
544
545	iov = uio->uio_iov;
546
547	while (n) {
548		cnt = MIN(iov->iov_len, n);
549		if (cnt == 0) {
550			/* empty iov entry */
551			iov++;
552			continue;
553		}
554		n -= cnt;
555		/*
556		 * touch each page in this segment.
557		 */
558		p = iov->iov_base;
559		while (cnt) {
560			if (fubyte(p) == -1)
561				return;
562			incr = MIN(cnt, PAGESIZE);
563			p += incr;
564			cnt -= incr;
565		}
566		/*
567		 * touch the last byte in case it straddles a page.
568		 */
569		p--;
570		if (fubyte(p) == -1)
571			return;
572		iov++;
573	}
574}
575
/*
 * Write the bytes to a file.
 *
 *	IN:	vp	- vnode of file to be written to.
 *		uio	- structure supplying write location, range info,
 *			  and data buffer.
 *		ioflag	- IO_APPEND flag set if in append mode; FSYNC or
 *			  FDSYNC request a zil_commit() before returning.
 *		cr	- credentials of caller.
 *		ct	- caller context, handed to chklock().
 *
 *	OUT:	uio	- updated offset and range.
 *
 *	RETURN:	0 if success (including a partial write outside of
 *		  log replay)
 *		error code if failure
 *
 * Timestamps:
 *	vp - ctime|mtime updated if byte count > 0
 */
/* ARGSUSED */
static int
zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	rlim64_t	limit = MAXOFFSET_T;
	ssize_t		start_resid = uio->uio_resid;
	ssize_t		tx_bytes;
	uint64_t	end_size;
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	zilog_t		*zilog = zfsvfs->z_log;
	offset_t	woff;
	ssize_t		n, nbytes;
	rl_t		*rl;
	int		max_blksz = zfsvfs->z_max_blksz;
	int		error;

	/*
	 * Fasttrack empty write
	 */
	n = start_resid;
	if (n == 0)
		return (0);

	/*
	 * NOTE(review): limit is initialized to MAXOFFSET_T above and not
	 * changed before this test, so the clamp looks like a no-op here.
	 */
	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
		limit = MAXOFFSET_T;

	ZFS_ENTER(zfsvfs);

	/*
	 * Pre-fault the pages to ensure slow (eg NFS) pages
	 * don't hold up txg.
	 */
	zfs_prefault_write(n, uio);

	/*
	 * If in append mode, set the io offset pointer to eof.
	 */
	if (ioflag & IO_APPEND) {
		/*
		 * Range lock for a file append:
		 * The value for the start of range will be determined by
		 * zfs_range_lock() (to guarantee append semantics).
		 * If this write will cause the block size to increase,
		 * zfs_range_lock() will lock the entire file, so we must
		 * later reduce the range after we grow the block size.
		 */
		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
		if (rl->r_len == UINT64_MAX) {
			/* overlocked, zp_size can't change */
			woff = uio->uio_loffset = zp->z_phys->zp_size;
		} else {
			woff = uio->uio_loffset = rl->r_off;
		}
	} else {
		woff = uio->uio_loffset;
		/*
		 * Validate file offset
		 */
		if (woff < 0) {
			/* No range lock has been taken yet on this path. */
			ZFS_EXIT(zfsvfs);
			return (EINVAL);
		}

		/*
		 * If we need to grow the block size then zfs_range_lock()
		 * will lock a wider range than we request here.
		 * Later after growing the block size we reduce the range.
		 */
		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
	}

	/* From here on every exit must drop the range lock. */
	if (woff >= limit) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (EFBIG);
	}

	/* Trim the request so that it ends at the offset limit. */
	if ((woff + n) > limit || woff > (limit - n))
		n = limit - woff;

	/*
	 * Check for mandatory locks
	 */
	if (MANDMODE((mode_t)zp->z_phys->zp_mode) &&
	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	/* File size once the whole write has gone through. */
	end_size = MAX(zp->z_phys->zp_size, woff + n);

	/*
	 * Write the file in reasonable size chunks.  Each chunk is written
	 * in a separate transaction; this keeps the intent log records small
	 * and allows us to do more fine-grained space accounting.
	 */
	while (n > 0) {
		/*
		 * Start a transaction.
		 */
		woff = uio->uio_loffset;
		tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_bonus(tx, zp->z_id);
		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
		error = dmu_tx_assign(tx, zfsvfs->z_assign);
		if (error) {
			/*
			 * ERESTART under TXG_NOWAIT: wait, then retry the
			 * same chunk (n and the uio are unchanged).  The
			 * range lock stays held across the retry.
			 */
			if (error == ERESTART &&
			    zfsvfs->z_assign == TXG_NOWAIT) {
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				continue;
			}
			dmu_tx_abort(tx);
			break;
		}

		/*
		 * If zfs_range_lock() over-locked we grow the blocksize
		 * and then reduce the lock range.  This will only happen
		 * on the first iteration since zfs_range_reduce() will
		 * shrink down r_len to the appropriate size.
		 */
		if (rl->r_len == UINT64_MAX) {
			uint64_t new_blksz;

			if (zp->z_blksz > max_blksz) {
				ASSERT(!ISP2(zp->z_blksz));
				new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
			} else {
				new_blksz = MIN(end_size, max_blksz);
			}
			zfs_grow_blocksize(zp, new_blksz, tx);
			zfs_range_reduce(rl, woff, n);
		}

		/*
		 * XXX - should we really limit each write to z_max_blksz?
		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
		 */
		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
		rw_enter(&zp->z_map_lock, RW_READER);

		/* Tell the pager about the new size before writing past it. */
		if (woff + nbytes > zp->z_phys->zp_size)
			vnode_pager_setsize(vp, woff + nbytes);

		/*
		 * tx_bytes ends up as the number of bytes this chunk
		 * actually consumed from the uio.  Note that z_map_lock is
		 * dropped before mappedwrite() but held across the plain
		 * dmu_write_uio().
		 */
		tx_bytes = uio->uio_resid;
		if (vn_has_cached_data(vp)) {
			rw_exit(&zp->z_map_lock);
			error = mappedwrite(vp, nbytes, uio, tx);
		} else {
			error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
			    uio, nbytes, tx);
			rw_exit(&zp->z_map_lock);
		}
		tx_bytes -= uio->uio_resid;

		/*
		 * If we made no progress, we're done.  If we made even
		 * partial progress, update the znode and ZIL accordingly.
		 */
		if (tx_bytes == 0) {
			dmu_tx_commit(tx);
			ASSERT(error != 0);
			break;
		}

		/*
		 * Clear Set-UID/Set-GID bits on successful write if not
		 * privileged and at least one of the execute bits is set.
		 *
		 * It would be nice to do this after all writes have
		 * been done, but that would still expose the ISUID/ISGID
		 * to another app after the partial write is committed.
		 */
		mutex_enter(&zp->z_acl_lock);
		if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) |
		    (S_IXUSR >> 6))) != 0 &&
		    (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 &&
		    secpolicy_vnode_setid_retain(cr,
		    (zp->z_phys->zp_mode & S_ISUID) != 0 &&
		    zp->z_phys->zp_uid == 0) != 0) {
			    zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID);
		}
		mutex_exit(&zp->z_acl_lock);

		/*
		 * Update time stamp.  NOTE: This marks the bonus buffer as
		 * dirty, so we don't have to do it again for zp_size.
		 */
		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);

		/*
		 * Update the file size (zp_size) if it has changed;
		 * account for possible concurrent updates.
		 */
		while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset)
			(void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
			    uio->uio_loffset);
		/* Log the bytes actually written before committing the tx. */
		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
		dmu_tx_commit(tx);

		if (error != 0)
			break;
		ASSERT(tx_bytes == nbytes);
		n -= nbytes;
	}

	zfs_range_unlock(rl);

	/*
	 * If we're in replay mode, or we made no progress, return error.
	 * Otherwise, it's at least a partial write, so it's successful.
	 */
	if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/* Synchronous semantics requested: wait for the intent log. */
	if (ioflag & (FSYNC | FDSYNC))
		zil_commit(zilog, zp->z_last_itx, zp->z_id);

	ZFS_EXIT(zfsvfs);
	return (0);
}
819
820void
821zfs_get_done(dmu_buf_t *db, void *vzgd)
822{
823	zgd_t *zgd = (zgd_t *)vzgd;
824	rl_t *rl = zgd->zgd_rl;
825	vnode_t *vp = ZTOV(rl->r_zp);
826	int vfslocked;
827
828	vfslocked = VFS_LOCK_GIANT(vp->v_vfsp);
829	dmu_buf_rele(db, vzgd);
830	zfs_range_unlock(rl);
831	VN_RELE(vp);
832	zil_add_vdev(zgd->zgd_zilog, DVA_GET_VDEV(BP_IDENTITY(zgd->zgd_bp)));
833	kmem_free(zgd, sizeof (zgd_t));
834	VFS_UNLOCK_GIANT(vfslocked);
835}
836
/*
 * Get data to generate a TX_WRITE intent log record.
 *
 *	IN:	arg	- the zfsvfs_t of the file system.
 *		lr	- write log record; supplies object id, offset
 *			  and length, and receives block offset/pointer
 *			  for indirect writes.
 *		buf	- destination for the data of an immediate write,
 *			  or NULL to request an indirect write.
 *		zio	- zio handed to dmu_sync() for indirect writes.
 *
 *	RETURN:	0 if success (for an indirect write still in progress
 *		  the cleanup is finished later by zfs_get_done())
 *		ENOENT if the file is gone, unlinked, or the offset is
 *		  at or beyond the current file size
 *		otherwise the error from dmu_sync()
 */
int
zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
	zfsvfs_t *zfsvfs = arg;
	objset_t *os = zfsvfs->z_os;
	znode_t *zp;
	uint64_t off = lr->lr_offset;
	dmu_buf_t *db;
	rl_t *rl;
	zgd_t *zgd;
	int dlen = lr->lr_length;		/* length of user data */
	int error = 0;

	ASSERT(zio);
	ASSERT(dlen != 0);

	/*
	 * Nothing to do if the file has been removed
	 */
	if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
		return (ENOENT);
	if (zp->z_unlinked) {
		VN_RELE(ZTOV(zp));
		return (ENOENT);
	}

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) { /* immediate write */
		rl = zfs_range_lock(zp, off, dlen, RL_READER);
		/* test for truncation needs to be done while range locked */
		if (off >= zp->z_phys->zp_size) {
			error = ENOENT;
			goto out;
		}
		VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf));
	} else { /* indirect write */
		uint64_t boff; /* block starting offset */

		/*
		 * Have to lock the whole block to ensure when it's
		 * written out and its checksum is being calculated
		 * that no one can change the data. We need to re-check
		 * blocksize after we get the lock in case it's changed!
		 */
		for (;;) {
			/*
			 * With a power-of-two block size the block is the
			 * aligned one containing "off"; otherwise the lock
			 * starts at offset 0.
			 */
			if (ISP2(zp->z_blksz)) {
				boff = P2ALIGN_TYPED(off, zp->z_blksz,
				    uint64_t);
			} else {
				boff = 0;
			}
			dlen = zp->z_blksz;
			rl = zfs_range_lock(zp, boff, dlen, RL_READER);
			/* Block size unchanged while locking: done. */
			if (zp->z_blksz == dlen)
				break;
			zfs_range_unlock(rl);
		}
		/* test for truncation needs to be done while range locked */
		if (off >= zp->z_phys->zp_size) {
			error = ENOENT;
			goto out;
		}
		/*
		 * The zgd carries what zfs_get_done() needs in order to
		 * clean up; it is also used as the tag for the dbuf hold.
		 */
		zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
		zgd->zgd_rl = rl;
		zgd->zgd_zilog = zfsvfs->z_log;
		zgd->zgd_bp = &lr->lr_blkptr;
		VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db));
		ASSERT(boff == db->db_offset);
		lr->lr_blkoff = off - boff;
		error = dmu_sync(zio, db, &lr->lr_blkptr,
		    lr->lr_common.lrc_txg, zfs_get_done, zgd);
		ASSERT(error == EEXIST || lr->lr_length <= zp->z_blksz);
		if (error == 0) {
			zil_add_vdev(zfsvfs->z_log,
			    DVA_GET_VDEV(BP_IDENTITY(&lr->lr_blkptr)));
		}
		/*
		 * If we get EINPROGRESS, then we need to wait for a
		 * write IO initiated by dmu_sync() to complete before
		 * we can release this dbuf.  We will finish everything
		 * up in the zfs_get_done() callback.
		 */
		if (error == EINPROGRESS)
			return (0);
		/* Otherwise release the dbuf hold and the zgd right here. */
		dmu_buf_rele(db, zgd);
		kmem_free(zgd, sizeof (zgd_t));
	}
out:
	/* Common exit: drop the range lock and the zfs_zget() reference. */
	zfs_range_unlock(rl);
	VN_RELE(ZTOV(zp));
	return (error);
}
938
939/*ARGSUSED*/
940static int
941zfs_access(vnode_t *vp, int mode, int flags, cred_t *cr)
942{
943	znode_t *zp = VTOZ(vp);
944	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
945	int error;
946
947	ZFS_ENTER(zfsvfs);
948	error = zfs_zaccess_rwx(zp, mode, cr);
949	ZFS_EXIT(zfsvfs);
950	return (error);
951}
952
953/*
954 * Lookup an entry in a directory, or an extended attribute directory.
955 * If it exists, return a held vnode reference for it.
956 *
957 *	IN:	dvp	- vnode of directory to search.
958 *		nm	- name of entry to lookup.
959 *		pnp	- full pathname to lookup [UNUSED].
960 *		flags	- LOOKUP_XATTR set if looking for an attribute.
961 *		rdir	- root directory vnode [UNUSED].
962 *		cr	- credentials of caller.
963 *
964 *	OUT:	vpp	- vnode of located entry, NULL if not found.
965 *
966 *	RETURN:	0 if success
967 *		error code if failure
968 *
969 * Timestamps:
970 *	NA
971 */
972/* ARGSUSED */
973static int
974zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
975     int nameiop, cred_t *cr, kthread_t *td)
976{
977
978	znode_t *zdp = VTOZ(dvp);
979	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
980	int	error;
981
982	ZFS_ENTER(zfsvfs);
983
984	*vpp = NULL;
985
986#ifdef TODO
987	if (flags & LOOKUP_XATTR) {
988		/*
989		 * If the xattr property is off, refuse the lookup request.
990		 */
991		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
992			ZFS_EXIT(zfsvfs);
993			return (EINVAL);
994		}
995
996		/*
997		 * We don't allow recursive attributes..
998		 * Maybe someday we will.
999		 */
1000		if (zdp->z_phys->zp_flags & ZFS_XATTR) {
1001			ZFS_EXIT(zfsvfs);
1002			return (EINVAL);
1003		}
1004
1005		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
1006			ZFS_EXIT(zfsvfs);
1007			return (error);
1008		}
1009
1010		/*
1011		 * Do we have permission to get into attribute directory?
1012		 */
1013
1014		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, cr)) {
1015			VN_RELE(*vpp);
1016		}
1017
1018		ZFS_EXIT(zfsvfs);
1019		return (error);
1020	}
1021#endif	/* TODO */
1022
1023	if (dvp->v_type != VDIR) {
1024		ZFS_EXIT(zfsvfs);
1025		return (ENOTDIR);
1026	}
1027
1028	/*
1029	 * Check accessibility of directory.
1030	 */
1031
1032	if (error = zfs_zaccess(zdp, ACE_EXECUTE, cr)) {
1033		ZFS_EXIT(zfsvfs);
1034		return (error);
1035	}
1036
1037	if ((error = zfs_dirlook(zdp, nm, vpp)) == 0) {
1038
1039		/*
1040		 * Convert device special files
1041		 */
1042		if (IS_DEVVP(*vpp)) {
1043			vnode_t	*svp;
1044
1045			svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
1046			VN_RELE(*vpp);
1047			if (svp == NULL)
1048				error = ENOSYS;
1049			else
1050				*vpp = svp;
1051		}
1052	}
1053
1054	ZFS_EXIT(zfsvfs);
1055
1056	/* Translate errors and add SAVENAME when needed. */
1057	if (cnp->cn_flags & ISLASTCN) {
1058		switch (nameiop) {
1059		case CREATE:
1060		case RENAME:
1061			if (error == ENOENT) {
1062				error = EJUSTRETURN;
1063				cnp->cn_flags |= SAVENAME;
1064				break;
1065			}
1066			/* FALLTHROUGH */
1067		case DELETE:
1068			if (error == 0)
1069				cnp->cn_flags |= SAVENAME;
1070			break;
1071		}
1072	}
1073	if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) {
1074		if (cnp->cn_flags & ISDOTDOT)
1075			VOP_UNLOCK(dvp, 0, td);
1076		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);
1077		if (cnp->cn_flags & ISDOTDOT)
1078			vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td);
1079	}
1080
1081#ifdef FREEBSD_NAMECACHE
1082	/*
1083	 * Insert name into cache (as non-existent) if appropriate.
1084	 */
1085	if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
1086		cache_enter(dvp, *vpp, cnp);
1087        /*
1088         * Insert name into cache if appropriate.
1089         */
1090	if (error == 0 && (cnp->cn_flags & MAKEENTRY)) {
1091		if (!(cnp->cn_flags & ISLASTCN) ||
1092		    (nameiop != DELETE && nameiop != RENAME)) {
1093			cache_enter(dvp, *vpp, cnp);
1094		}
1095	}
1096#endif
1097
1098	return (error);
1099}
1100
1101/*
1102 * Attempt to create a new entry in a directory.  If the entry
1103 * already exists, truncate the file if permissible, else return
1104 * an error.  Return the vp of the created or trunc'd file.
1105 *
1106 *	IN:	dvp	- vnode of directory to put new file entry in.
1107 *		name	- name of new file entry.
1108 *		vap	- attributes of new file.
1109 *		excl	- flag indicating exclusive or non-exclusive mode.
1110 *		mode	- mode to open file with.
1111 *		cr	- credentials of caller.
1112 *		td	- thread passed to vn_lock() when locking the result.
1113 *
1114 *	OUT:	vpp	- vnode of created or trunc'd entry.
 *			  Cleared to NULL on entry; on success the vnode is
 *			  handed back locked with LK_EXCLUSIVE.
1115 *
1116 *	RETURN:	0 if success
1117 *		error code if failure
1118 *
1119 * Timestamps:
1120 *	dvp - ctime|mtime updated if new entry created
1121 *	 vp - ctime|mtime always, atime if new
1122 */
1123/* ARGSUSED */
1124static int
1125zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
1126    vnode_t **vpp, cred_t *cr, kthread_t *td)
1127{
1128	znode_t		*zp, *dzp = VTOZ(dvp);
1129	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1130	zilog_t		*zilog = zfsvfs->z_log;
1131	objset_t	*os = zfsvfs->z_os;
1132	zfs_dirlock_t	*dl;
1133	dmu_tx_t	*tx;
1134	int		error;
1135	uint64_t	zoid;
1136
1137	ZFS_ENTER(zfsvfs);
1138
	/*
	 * Retry point: both ERESTART paths below (dmu_tx_assign() and
	 * zfs_freesp()) drop the dirent lock and jump back here.
	 */
1139top:
1140	*vpp = NULL;
1141
	/* Drop the sticky bit if the policy check does not permit it. */
1142	if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr))
1143		vap->va_mode &= ~VSVTX;
1144
1145	if (*name == '\0') {
1146		/*
1147		 * Null component name refers to the directory itself.
1148		 */
1149		VN_HOLD(dvp);
1150		zp = dzp;
1151		dl = NULL;
1152		error = 0;
1153	} else {
1154		/* possible VN_HOLD(zp) */
1155		if (error = zfs_dirent_lock(&dl, dzp, name, &zp, 0)) {
1156			if (strcmp(name, "..") == 0)
1157				error = EISDIR;
1158			ZFS_EXIT(zfsvfs);
1159			return (error);
1160		}
1161	}
1162
	/*
	 * -1ULL is only a placeholder for the "no existing entry" case;
	 * zfs_mknode() is handed &zoid below and the ASSERT after it checks
	 * that zoid matches the new znode's id.
	 */
1163	zoid = zp ? zp->z_id : -1ULL;
1164
1165	if (zp == NULL) {
1166		/*
1167		 * Create a new file object and update the directory
1168		 * to reference it.
1169		 */
1170		if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
1171			goto out;
1172		}
1173
1174		/*
1175		 * We only support the creation of regular files in
1176		 * extended attribute directories.
1177		 */
1178		if ((dzp->z_phys->zp_flags & ZFS_XATTR) &&
1179		    (vap->va_type != VREG)) {
1180			error = EINVAL;
1181			goto out;
1182		}
1183
1184		tx = dmu_tx_create(os);
1185		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
1186		dmu_tx_hold_bonus(tx, dzp->z_id);
1187		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1188		if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
1189			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1190			    0, SPA_MAXBLOCKSIZE);
1191		error = dmu_tx_assign(tx, zfsvfs->z_assign);
1192		if (error) {
			/*
			 * The dirent lock is released before waiting so the
			 * retry re-acquires it from scratch at "top".
			 */
1193			zfs_dirent_unlock(dl);
1194			if (error == ERESTART &&
1195			    zfsvfs->z_assign == TXG_NOWAIT) {
1196				dmu_tx_wait(tx);
1197				dmu_tx_abort(tx);
1198				goto top;
1199			}
1200			dmu_tx_abort(tx);
1201			ZFS_EXIT(zfsvfs);
1202			return (error);
1203		}
1204		zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);
1205		ASSERT(zp->z_id == zoid);
1206		(void) zfs_link_create(dl, zp, tx, ZNEW);
1207		zfs_log_create(zilog, tx, TX_CREATE, dzp, zp, name);
1208		dmu_tx_commit(tx);
1209	} else {
1210		/*
1211		 * A directory entry already exists for this name.
1212		 */
1213		/*
1214		 * Can't truncate an existing file if in exclusive mode.
1215		 */
1216		if (excl == EXCL) {
1217			error = EEXIST;
1218			goto out;
1219		}
1220		/*
1221		 * Can't open a directory for writing.
1222		 */
1223		if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
1224			error = EISDIR;
1225			goto out;
1226		}
1227		/*
1228		 * Verify requested access to file.
1229		 */
1230		if (mode && (error = zfs_zaccess_rwx(zp, mode, cr))) {
1231			goto out;
1232		}
1233
		/* Bump the directory's sequence number under its z_lock. */
1234		mutex_enter(&dzp->z_lock);
1235		dzp->z_seq++;
1236		mutex_exit(&dzp->z_lock);
1237
1238		/*
1239		 * Truncate regular files if requested.
1240		 */
1241		if ((ZTOV(zp)->v_type == VREG) &&
1242		    (zp->z_phys->zp_size != 0) &&
1243		    (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
1244			error = zfs_freesp(zp, 0, 0, mode, TRUE);
1245			if (error == ERESTART &&
1246			    zfsvfs->z_assign == TXG_NOWAIT) {
1247				/* NB: we already did dmu_tx_wait() */
1248				zfs_dirent_unlock(dl);
1249				VN_RELE(ZTOV(zp));
1250				goto top;
1251			}
1252		}
1253	}
1254out:
1255
	/*
	 * On success the vnode is locked for the caller before the dirent
	 * lock is dropped below.
	 */
1256	if (error == 0) {
1257		*vpp = ZTOV(zp);
1258		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);
1259	}
1260
	/* dl is NULL when the empty-name (directory itself) path was taken. */
1261	if (dl)
1262		zfs_dirent_unlock(dl);
1263
1264	if (error) {
		/* Drop the hold taken by VN_HOLD() or zfs_dirent_lock(). */
1265		if (zp)
1266			VN_RELE(ZTOV(zp));
1267	} else {
		/*
		 * NOTE(review): *vpp already holds ZTOV(zp) from the block
		 * above, so this assignment is redundant.
		 */
1268		*vpp = ZTOV(zp);
1269		/*
1270		 * If vnode is for a device return a specfs vnode instead.
1271		 */
1272		if (IS_DEVVP(*vpp)) {
1273			struct vnode *svp;
1274
1275			svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
1276			VN_RELE(*vpp);
			/*
			 * When specvp() yields NULL, ENOSYS is returned and
			 * *vpp is left NULL by the assignment below.
			 */
1277			if (svp == NULL) {
1278				error = ENOSYS;
1279			}
1280			*vpp = svp;
1281		}
1282	}
1283
1284	ZFS_EXIT(zfsvfs);
1285	return (error);
1286}
1287
1288/*
1289 * Remove an entry from a directory.
1290 *
1291 *	IN:	dvp	- vnode of directory to remove entry from.
1292 *		name	- name of entry to remove.
1293 *		cr	- credentials of caller.
1294 *
1295 *	RETURN:	0 if success
1296 *		error code if failure
 *		(EPERM when the entry is a directory)
1297 *
1298 * Timestamps:
1299 *	dvp - ctime|mtime
1300 *	 vp - ctime (if nlink > 0)
 *
 * NOTE: in this revision the "delete the znode right away" path is
 * switched off: may_delete_now is forced to FALSE and the delete_now
 * evaluation is guarded by "0 &&", so delete_now always stays FALSE and
 * an unlinked znode is always handed to zfs_unlinked_add().
1301 */
1302static int
1303zfs_remove(vnode_t *dvp, char *name, cred_t *cr)
1304{
1305	znode_t		*zp, *dzp = VTOZ(dvp);
1306	znode_t		*xzp = NULL;
1307	vnode_t		*vp;
1308	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1309	zilog_t		*zilog = zfsvfs->z_log;
1310	uint64_t	acl_obj, xattr_obj;
1311	zfs_dirlock_t	*dl;
1312	dmu_tx_t	*tx;
1313	boolean_t	may_delete_now, delete_now = FALSE;
1314	boolean_t	unlinked;
1315	int		error;
1316
1317	ZFS_ENTER(zfsvfs);
1318
	/* Retry point for the ERESTART path of dmu_tx_assign() below. */
1319top:
1320	/*
1321	 * Attempt to lock directory; fail if entry doesn't exist.
1322	 */
1323	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) {
1324		ZFS_EXIT(zfsvfs);
1325		return (error);
1326	}
1327
1328	vp = ZTOV(zp);
1329
1330	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1331		goto out;
1332	}
1333
1334	/*
1335	 * Need to use rmdir for removing directories.
1336	 */
1337	if (vp->v_type == VDIR) {
1338		error = EPERM;
1339		goto out;
1340	}
1341
1342	vnevent_remove(vp);
1343
1344	dnlc_remove(dvp, name);
1345
	/*
	 * Forced off: with this FALSE neither dmu_tx_hold_free() call below
	 * is ever made.
	 */
1346	may_delete_now = FALSE;
1347
1348	/*
1349	 * We may delete the znode now, or we may put it in the unlinked set;
1350	 * it depends on whether we're the last link, and on whether there are
1351	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
1352	 * allow for either case.
1353	 */
1354	tx = dmu_tx_create(zfsvfs->z_os);
1355	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1356	dmu_tx_hold_bonus(tx, zp->z_id);
1357	if (may_delete_now)
1358		dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
1359
1360	/* are there any extended attributes? */
1361	if ((xattr_obj = zp->z_phys->zp_xattr) != 0) {
1362		/* XXX - do we need this if we are deleting? */
1363		dmu_tx_hold_bonus(tx, xattr_obj);
1364	}
1365
1366	/* are there any additional acls */
1367	if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 &&
1368	    may_delete_now)
1369		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
1370
1371	/* charge as an update -- would be nice not to charge at all */
1372	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1373
1374	error = dmu_tx_assign(tx, zfsvfs->z_assign);
1375	if (error) {
		/*
		 * Both the dirent lock and the vnode hold are dropped here,
		 * so this path returns directly instead of going to "out".
		 */
1376		zfs_dirent_unlock(dl);
1377		VN_RELE(vp);
1378		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
1379			dmu_tx_wait(tx);
1380			dmu_tx_abort(tx);
1381			goto top;
1382		}
1383		dmu_tx_abort(tx);
1384		ZFS_EXIT(zfsvfs);
1385		return (error);
1386	}
1387
1388	/*
1389	 * Remove the directory entry.
1390	 */
	/* presumably zfs_link_destroy() fills in "unlinked" -- confirm */
1391	error = zfs_link_destroy(dl, zp, tx, 0, &unlinked);
1392
1393	if (error) {
1394		dmu_tx_commit(tx);
1395		goto out;
1396	}
1397
	/* Disabled by the constant 0: delete_now keeps its initial FALSE. */
1398	if (0 && unlinked) {
1399		VI_LOCK(vp);
1400		delete_now = may_delete_now &&
1401		    vp->v_count == 1 && !vn_has_cached_data(vp) &&
1402		    zp->z_phys->zp_xattr == xattr_obj &&
1403		    zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj;
1404		VI_UNLOCK(vp);
1405	}
1406
	/* Not taken while the block above is disabled. */
1407	if (delete_now) {
1408		if (zp->z_phys->zp_xattr) {
1409			error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
1410			ASSERT3U(error, ==, 0);
1411			ASSERT3U(xzp->z_phys->zp_links, ==, 2);
1412			dmu_buf_will_dirty(xzp->z_dbuf, tx);
1413			mutex_enter(&xzp->z_lock);
1414			xzp->z_unlinked = 1;
1415			xzp->z_phys->zp_links = 0;
1416			mutex_exit(&xzp->z_lock);
1417			zfs_unlinked_add(xzp, tx);
1418			zp->z_phys->zp_xattr = 0; /* probably unnecessary */
1419		}
1420		mutex_enter(&zp->z_lock);
1421		VI_LOCK(vp);
1422		vp->v_count--;
1423		ASSERT3U(vp->v_count, ==, 0);
1424		VI_UNLOCK(vp);
1425		mutex_exit(&zp->z_lock);
1426		zfs_znode_delete(zp, tx);
1427		VFS_RELE(zfsvfs->z_vfs);
1428	} else if (unlinked) {
1429		zfs_unlinked_add(zp, tx);
1430	}
1431
1432	zfs_log_remove(zilog, tx, TX_REMOVE, dzp, name);
1433
1434	dmu_tx_commit(tx);
1435out:
1436	zfs_dirent_unlock(dl);
1437
	/*
	 * When delete_now is set the vnode count was already decremented
	 * by hand above, so no VN_RELE(vp) is issued in that case.
	 */
1438	if (!delete_now) {
1439		VN_RELE(vp);
1440	} else if (xzp) {
1441		/* this rele delayed to prevent nesting transactions */
1442		VN_RELE(ZTOV(xzp));
1443	}
1444
1445	ZFS_EXIT(zfsvfs);
1446	return (error);
1447}
1448
1449/*
1450 * Create a new directory and insert it into dvp using the name
1451 * provided.  Return a pointer to the inserted directory.
1452 *
1453 *	IN:	dvp	- vnode of directory to add subdir to.
1454 *		dirname	- name of new directory.
1455 *		vap	- attributes of new directory.
 *			  (va_type is asserted to be VDIR)
1456 *		cr	- credentials of caller.
1457 *
1458 *	OUT:	vpp	- vnode of created directory.
 *			  Returned locked with LK_EXCLUSIVE on behalf of
 *			  curthread; set to NULL before the entry is created.
1459 *
1460 *	RETURN:	0 if success
1461 *		error code if failure
 *		(EINVAL when dvp is an extended attribute directory)
1462 *
1463 * Timestamps:
1464 *	dvp - ctime|mtime updated
1465 *	 vp - ctime|mtime|atime updated
1466 */
1467static int
1468zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
1469{
1470	znode_t		*zp, *dzp = VTOZ(dvp);
1471	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1472	zilog_t		*zilog = zfsvfs->z_log;
1473	zfs_dirlock_t	*dl;
1474	uint64_t	zoid = 0;
1475	dmu_tx_t	*tx;
1476	int		error;
1477
1478	ASSERT(vap->va_type == VDIR);
1479
1480	ZFS_ENTER(zfsvfs);
1481
	/* Sub-directories are refused inside extended attribute directories. */
1482	if (dzp->z_phys->zp_flags & ZFS_XATTR) {
1483		ZFS_EXIT(zfsvfs);
1484		return (EINVAL);
1485	}
	/* Retry point for the ERESTART path of dmu_tx_assign() below. */
1486top:
1487	*vpp = NULL;
1488
1489	/*
1490	 * First make sure the new directory doesn't exist.
1491	 */
1492	if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, ZNEW)) {
1493		ZFS_EXIT(zfsvfs);
1494		return (error);
1495	}
1496
1497	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, cr)) {
1498		zfs_dirent_unlock(dl);
1499		ZFS_EXIT(zfsvfs);
1500		return (error);
1501	}
1502
1503	/*
1504	 * Add a new entry to the directory.
1505	 */
1506	tx = dmu_tx_create(zfsvfs->z_os);
1507	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
1508	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
1509	if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
1510		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1511		    0, SPA_MAXBLOCKSIZE);
1512	error = dmu_tx_assign(tx, zfsvfs->z_assign);
1513	if (error) {
		/* Drop the dirent lock before waiting; "top" re-takes it. */
1514		zfs_dirent_unlock(dl);
1515		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
1516			dmu_tx_wait(tx);
1517			dmu_tx_abort(tx);
1518			goto top;
1519		}
1520		dmu_tx_abort(tx);
1521		ZFS_EXIT(zfsvfs);
1522		return (error);
1523	}
1524
1525	/*
1526	 * Create new node.
1527	 */
1528	zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);
1529
1530	/*
1531	 * Now put new name in parent dir.
1532	 */
1533	(void) zfs_link_create(dl, zp, tx, ZNEW);
1534
1535	*vpp = ZTOV(zp);
1536
1537	zfs_log_create(zilog, tx, TX_MKDIR, dzp, zp, dirname);
1538	dmu_tx_commit(tx);
1539
	/*
	 * The new vnode is locked for the caller after the transaction is
	 * committed and before the dirent lock is released.
	 */
1540	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, curthread);
1541
1542	zfs_dirent_unlock(dl);
1543
1544	ZFS_EXIT(zfsvfs);
1545	return (0);
1546}
1547
1548/*
1549 * Remove a directory subdir entry.  If the current working
1550 * directory is the same as the subdir to be removed, the
1551 * remove will fail.
1552 *
1553 *	IN:	dvp	- vnode of directory to remove from.
1554 *		name	- name of directory to be removed.
1555 *		cwd	- vnode of current working directory.
1556 *		cr	- credentials of caller.
1557 *
1558 *	RETURN:	0 if success
1559 *		error code if failure
 *		(ENOTDIR if the entry is not a directory, EINVAL if it
 *		is the same vnode as cwd)
1560 *
1561 * Timestamps:
1562 *	dvp - ctime|mtime updated
1563 */
1564static int
1565zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr)
1566{
1567	znode_t		*dzp = VTOZ(dvp);
1568	znode_t		*zp;
1569	vnode_t		*vp;
1570	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1571	zilog_t		*zilog = zfsvfs->z_log;
1572	zfs_dirlock_t	*dl;
1573	dmu_tx_t	*tx;
1574	int		error;
1575
1576	ZFS_ENTER(zfsvfs);
1577
	/* Retry point for the ERESTART path of dmu_tx_assign() below. */
1578top:
1579	zp = NULL;
1580
1581	/*
1582	 * Attempt to lock directory; fail if entry doesn't exist.
1583	 */
1584	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) {
1585		ZFS_EXIT(zfsvfs);
1586		return (error);
1587	}
1588
1589	vp = ZTOV(zp);
1590
1591	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1592		goto out;
1593	}
1594
1595	if (vp->v_type != VDIR) {
1596		error = ENOTDIR;
1597		goto out;
1598	}
1599
1600	if (vp == cwd) {
1601		error = EINVAL;
1602		goto out;
1603	}
1604
1605	vnevent_rmdir(vp);
1606
1607	/*
1608	 * Grab a lock on the directory to make sure that no one is
1609	 * trying to add (or lookup) entries while we are removing it.
1610	 */
1611	rw_enter(&zp->z_name_lock, RW_WRITER);
1612
1613	/*
1614	 * Grab a lock on the parent pointer to make sure we play well
1615	 * with the treewalk and directory rename code.
1616	 */
1617	rw_enter(&zp->z_parent_lock, RW_WRITER);
1618
1619	tx = dmu_tx_create(zfsvfs->z_os);
1620	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1621	dmu_tx_hold_bonus(tx, zp->z_id);
1622	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1623	error = dmu_tx_assign(tx, zfsvfs->z_assign);
1624	if (error) {
		/*
		 * Everything acquired so far is released in reverse order
		 * (parent lock, name lock, dirent lock, vnode hold) before
		 * either retrying or returning.
		 */
1625		rw_exit(&zp->z_parent_lock);
1626		rw_exit(&zp->z_name_lock);
1627		zfs_dirent_unlock(dl);
1628		VN_RELE(vp);
1629		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
1630			dmu_tx_wait(tx);
1631			dmu_tx_abort(tx);
1632			goto top;
1633		}
1634		dmu_tx_abort(tx);
1635		ZFS_EXIT(zfsvfs);
1636		return (error);
1637	}
1638
1639#ifdef FREEBSD_NAMECACHE
1640	cache_purge(dvp);
1641#endif
1642
1643	error = zfs_link_destroy(dl, zp, tx, 0, NULL);
1644
	/* The removal is only logged when the link was actually destroyed. */
1645	if (error == 0)
1646		zfs_log_remove(zilog, tx, TX_RMDIR, dzp, name);
1647
	/* The transaction is committed whether or not the destroy succeeded. */
1648	dmu_tx_commit(tx);
1649
1650	rw_exit(&zp->z_parent_lock);
1651	rw_exit(&zp->z_name_lock);
1652#ifdef FREEBSD_NAMECACHE
1653	cache_purge(vp);
1654#endif
1655out:
1656	zfs_dirent_unlock(dl);
1657
1658	VN_RELE(vp);
1659
1660	ZFS_EXIT(zfsvfs);
1661	return (error);
1662}
1663
1664/*
1665 * Read as many directory entries as will fit into the provided
1666 * buffer from the given directory cursor position (specified in
1667 * the uio structure.
1668 *
1669 *	IN:	vp	- vnode of directory to read.
1670 *		uio	- structure supplying read location, range info,
1671 *			  and return buffer.
1672 *		cr	- credentials of caller.
1673 *
1674 *	OUT:	uio	- updated offset and range, buffer filled.
1675 *		eofp	- set to true if end-of-file detected.
1676 *
1677 *	RETURN:	0 if success
1678 *		error code if failure
1679 *
1680 * Timestamps:
1681 *	vp - atime updated
1682 *
1683 * Note that the low 4 bits of the cookie returned by zap is always zero.
1684 * This allows us to use the low range for "special" directory entries:
1685 * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
1686 * we use the offset 2 for the '.zfs' directory.
1687 */
1688/* ARGSUSED */
1689static int
1690zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
1691{
1692	znode_t		*zp = VTOZ(vp);
1693	iovec_t		*iovp;
1694	dirent64_t	*odp;
1695	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
1696	objset_t	*os;
1697	caddr_t		outbuf;
1698	size_t		bufsize;
1699	zap_cursor_t	zc;
1700	zap_attribute_t	zap;
1701	uint_t		bytes_wanted;
1702	uint64_t	offset; /* must be unsigned; checks for < 1 */
1703	int		local_eof;
1704	int		outcount;
1705	int		error;
1706	uint8_t		prefetch;
1707	uint8_t		type;
1708	int		ncooks;
1709	u_long		*cooks = NULL;
1710
1711	ZFS_ENTER(zfsvfs);
1712
1713	/*
1714	 * If we are not given an eof variable,
1715	 * use a local one.
1716	 */
1717	if (eofp == NULL)
1718		eofp = &local_eof;
1719
1720	/*
1721	 * Check for valid iov_len.
1722	 */
1723	if (uio->uio_iov->iov_len <= 0) {
1724		ZFS_EXIT(zfsvfs);
1725		return (EINVAL);
1726	}
1727
1728	/*
1729	 * Quit if directory has been removed (posix)
1730	 */
1731	if ((*eofp = zp->z_unlinked) != 0) {
1732		ZFS_EXIT(zfsvfs);
1733		return (0);
1734	}
1735
1736	error = 0;
1737	os = zfsvfs->z_os;
1738	offset = uio->uio_loffset;
1739	prefetch = zp->z_zn_prefetch;
1740
1741	/*
1742	 * Initialize the iterator cursor.
1743	 */
1744	if (offset <= 3) {
1745		/*
1746		 * Start iteration from the beginning of the directory.
1747		 */
1748		zap_cursor_init(&zc, os, zp->z_id);
1749	} else {
1750		/*
1751		 * The offset is a serialized cursor.
1752		 */
1753		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
1754	}
1755
1756	/*
1757	 * Get space to change directory entries into fs independent format.
1758	 */
1759	iovp = uio->uio_iov;
1760	bytes_wanted = iovp->iov_len;
1761	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
1762		bufsize = bytes_wanted;
1763		outbuf = kmem_alloc(bufsize, KM_SLEEP);
1764		odp = (struct dirent64 *)outbuf;
1765	} else {
1766		bufsize = bytes_wanted;
1767		odp = (struct dirent64 *)iovp->iov_base;
1768	}
1769
1770	if (ncookies != NULL) {
1771		/*
1772		 * Minimum entry size is dirent size and 1 byte for a file name.
1773		 */
1774		ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
1775		cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
1776		*cookies = cooks;
1777		*ncookies = ncooks;
1778	}
1779
1780	/*
1781	 * Transform to file-system independent format
1782	 */
1783	outcount = 0;
1784	while (outcount < bytes_wanted) {
1785		ino64_t objnum;
1786		ushort_t reclen;
1787
1788		/*
1789		 * Special case `.', `..', and `.zfs'.
1790		 */
1791		if (offset == 0) {
1792			(void) strcpy(zap.za_name, ".");
1793			objnum = zp->z_id;
1794		} else if (offset == 1) {
1795			(void) strcpy(zap.za_name, "..");
1796			objnum = zp->z_phys->zp_parent;
1797		} else if (offset == 2 && zfs_show_ctldir(zp)) {
1798			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
1799			objnum = ZFSCTL_INO_ROOT;
1800		} else {
1801			/*
1802			 * Grab next entry.
1803			 */
1804			if (error = zap_cursor_retrieve(&zc, &zap)) {
1805				if ((*eofp = (error == ENOENT)) != 0)
1806					break;
1807				else
1808					goto update;
1809			}
1810
1811			if (zap.za_integer_length != 8 ||
1812			    zap.za_num_integers != 1) {
1813				cmn_err(CE_WARN, "zap_readdir: bad directory "
1814				    "entry, obj = %lld, offset = %lld\n",
1815				    (u_longlong_t)zp->z_id,
1816				    (u_longlong_t)offset);
1817				error = ENXIO;
1818				goto update;
1819			}
1820
1821			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
1822			/*
1823			 * MacOS X can extract the object type here such as:
1824			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
1825			 */
1826			type = ZFS_DIRENT_TYPE(zap.za_first_integer);
1827		}
1828		reclen = DIRENT64_RECLEN(strlen(zap.za_name));
1829
1830		/*
1831		 * Will this entry fit in the buffer?
1832		 */
1833		if (outcount + reclen > bufsize) {
1834			/*
1835			 * Did we manage to fit anything in the buffer?
1836			 */
1837			if (!outcount) {
1838				error = EINVAL;
1839				goto update;
1840			}
1841			break;
1842		}
1843		/*
1844		 * Add this entry:
1845		 */
1846		odp->d_ino = objnum;
1847		odp->d_reclen = reclen;
1848		odp->d_namlen = strlen(zap.za_name);
1849		(void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
1850		odp->d_type = type;
1851		outcount += reclen;
1852		odp = (dirent64_t *)((intptr_t)odp + reclen);
1853
1854		ASSERT(outcount <= bufsize);
1855
1856		/* Prefetch znode */
1857		if (prefetch)
1858			dmu_prefetch(os, objnum, 0, 0);
1859
1860		/*
1861		 * Move to the next entry, fill in the previous offset.
1862		 */
1863		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
1864			zap_cursor_advance(&zc);
1865			offset = zap_cursor_serialize(&zc);
1866		} else {
1867			offset += 1;
1868		}
1869
1870		if (cooks != NULL) {
1871			*cooks++ = offset;
1872			ncooks--;
1873			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
1874		}
1875	}
1876	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
1877
1878	/* Subtract unused cookies */
1879	if (ncookies != NULL)
1880		*ncookies -= ncooks;
1881
1882	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
1883		iovp->iov_base += outcount;
1884		iovp->iov_len -= outcount;
1885		uio->uio_resid -= outcount;
1886	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
1887		/*
1888		 * Reset the pointer.
1889		 */
1890		offset = uio->uio_loffset;
1891	}
1892
1893update:
1894	zap_cursor_fini(&zc);
1895	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
1896		kmem_free(outbuf, bufsize);
1897
1898	if (error == ENOENT)
1899		error = 0;
1900
1901	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
1902
1903	uio->uio_loffset = offset;
1904	ZFS_EXIT(zfsvfs);
1905	if (error != 0) {
1906		free(*cookies, M_TEMP);
1907		*cookies = NULL;
1908		*ncookies = 0;
1909	}
1910	return (error);
1911}
1912
1913static int
1914zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr)
1915{
1916	znode_t	*zp = VTOZ(vp);
1917	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1918
1919	ZFS_ENTER(zfsvfs);
1920	zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
1921	ZFS_EXIT(zfsvfs);
1922	return (0);
1923}
1924
1925/*
1926 * Get the requested file attributes and place them in the provided
1927 * vattr structure.
1928 *
1929 *	IN:	vp	- vnode of file.
1930 *		vap	- va_mask identifies requested attributes.
1931 *		flags	- [UNUSED]
1932 *		cr	- credentials of caller.
1933 *
1934 *	OUT:	vap	- attribute values.
1935 *
1936 *	RETURN:	0 if success
 *		error code from zfs_zaccess() when the ACE_READ_ATTRIBUTES
 *		check is performed and fails (most of vap has already been
 *		filled in at that point)
1937 */
1938/* ARGSUSED */
1939static int
1940zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
1941{
1942	znode_t *zp = VTOZ(vp);
1943	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1944	znode_phys_t *pzp = zp->z_phys;
1945	uint32_t blksize;
1946	u_longlong_t nblocks;
1947	int	error;
1948
1949	ZFS_ENTER(zfsvfs);
1950
1951	/*
1952	 * Return all attributes.  It's cheaper to provide the answer
1953	 * than to determine whether we were asked the question.
1954	 */
1955	mutex_enter(&zp->z_lock);
1956
	/* zp_mode carries both the file type bits and the permission bits. */
1957	vap->va_type = IFTOVT(pzp->zp_mode);
1958	vap->va_mode = pzp->zp_mode & ~S_IFMT;
1959	vap->va_uid = zp->z_phys->zp_uid;
1960	vap->va_gid = zp->z_phys->zp_gid;
1961	vap->va_nodeid = zp->z_id;
1962	vap->va_nlink = MIN(pzp->zp_links, UINT32_MAX);	/* nlink_t limit! */
1963	vap->va_size = pzp->zp_size;
1964	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
1965	vap->va_rdev = zfs_cmpldev(pzp->zp_rdev);
1966	vap->va_seq = zp->z_seq;
1967	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */
1968
1969	ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime);
1970	ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime);
1971	ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime);
1972	ZFS_TIME_DECODE(&vap->va_birthtime, pzp->zp_crtime);
1973
1974	/*
1975	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
1976	 * Also, if we are the owner don't bother, since owner should
1977	 * always be allowed to read basic attributes of file.
	 *
	 * Note the check runs with zp->z_lock still held; the lock is
	 * dropped on the failure path before returning.
1978	 */
1979	if (!(zp->z_phys->zp_flags & ZFS_ACL_TRIVIAL) &&
1980	    (zp->z_phys->zp_uid != crgetuid(cr))) {
1981		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, cr)) {
1982			mutex_exit(&zp->z_lock);
1983			ZFS_EXIT(zfsvfs);
1984			return (error);
1985		}
1986	}
1987
1988	mutex_exit(&zp->z_lock);
1989
	/* Block size and block count are read after z_lock is released. */
1990	dmu_object_size_from_db(zp->z_dbuf, &blksize, &nblocks);
1991	vap->va_blksize = blksize;
1992	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */
1993
1994	if (zp->z_blksz == 0) {
1995		/*
1996		 * Block size hasn't been set; suggest maximal I/O transfers.
1997		 */
1998		vap->va_blksize = zfsvfs->z_max_blksz;
1999	}
2000
2001	ZFS_EXIT(zfsvfs);
2002	return (0);
2003}
2004
2005/*
2006 * Set the file attributes to the values contained in the
2007 * vattr structure.
2008 *
2009 *	IN:	vp	- vnode of file to be modified.
2010 *		vap	- new attribute values.
 *			  (va_mode and va_mask may be modified here)
2011 *		flags	- ATTR_UTIME set if non-default time values provided.
2012 *		cr	- credentials of caller.
 *		ct	- caller context; not referenced in this function.
2013 *
2014 *	RETURN:	0 if success
2015 *		error code if failure
2016 *
2017 * Timestamps:
2018 *	vp - ctime updated, mtime updated if size changed.
2019 */
2020/* ARGSUSED */
2021static int
2022zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2023	caller_context_t *ct)
2024{
2025	struct znode	*zp = VTOZ(vp);
2026	znode_phys_t	*pzp = zp->z_phys;
2027	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2028	zilog_t		*zilog = zfsvfs->z_log;
2029	dmu_tx_t	*tx;
2030	vattr_t		oldva;
2031	uint_t		mask = vap->va_mask;
2032	uint_t		saved_mask;
2033	int		trim_mask = 0;
2034	uint64_t	new_mode;
2035	znode_t		*attrzp;
2036	int		need_policy = FALSE;
2037	int		err;
2038
	/* Cheap argument checks that need no filesystem state. */
2039	if (mask == 0)
2040		return (0);
2041
2042	if (mask & AT_NOSET)
2043		return (EINVAL);
2044
2045	if (mask & AT_SIZE && vp->v_type == VDIR)
2046		return (EISDIR);
2047
2048	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO)
2049		return (EINVAL);
2050
2051	ZFS_ENTER(zfsvfs);
2052
	/*
	 * Retry point for the ERESTART path of dmu_tx_assign() below.  Note
	 * that everything from here on, including the AT_SIZE truncation and
	 * the permission checks, is executed again on a retry.
	 */
2053top:
2054	attrzp = NULL;
2055
2056	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
2057		ZFS_EXIT(zfsvfs);
2058		return (EROFS);
2059	}
2060
2061	/*
2062	 * First validate permissions
2063	 */
2064
2065	if (mask & AT_SIZE) {
2066		err = zfs_zaccess(zp, ACE_WRITE_DATA, cr);
2067		if (err) {
2068			ZFS_EXIT(zfsvfs);
2069			return (err);
2070		}
2071		/*
2072		 * XXX - Note, we are not providing any open
2073		 * mode flags here (like FNDELAY), so we may
2074		 * block if there are locks present... this
2075		 * should be addressed in openat().
2076		 */
		/*
		 * The size change is carried out right here by zfs_freesp(),
		 * not in the transaction created further down.
		 */
2077		do {
2078			err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2079			/* NB: we already did dmu_tx_wait() if necessary */
2080		} while (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT);
2081		if (err) {
2082			ZFS_EXIT(zfsvfs);
2083			return (err);
2084		}
2085	}
2086
	/* A non-zero result from the ACL check defers to the policy check. */
2087	if (mask & (AT_ATIME|AT_MTIME))
2088		need_policy = zfs_zaccess_v4_perm(zp, ACE_WRITE_ATTRIBUTES, cr);
2089
2090	if (mask & (AT_UID|AT_GID)) {
2091		int	idmask = (mask & (AT_UID|AT_GID));
2092		int	take_owner;
2093		int	take_group;
2094
2095		/*
2096		 * NOTE: even if a new mode is being set,
2097		 * we may clear S_ISUID/S_ISGID bits.
2098		 */
2099
2100		if (!(mask & AT_MODE))
2101			vap->va_mode = pzp->zp_mode;
2102
2103		/*
2104		 * Take ownership or chgrp to group we are a member of
2105		 */
2106
2107		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
2108		take_group = (mask & AT_GID) && groupmember(vap->va_gid, cr);
2109
2110		/*
2111		 * If both AT_UID and AT_GID are set then take_owner and
2112		 * take_group must both be set in order to allow taking
2113		 * ownership.
2114		 *
2115		 * Otherwise, send the check through secpolicy_vnode_setattr()
2116		 *
2117		 */
2118
2119		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
2120		    ((idmask == AT_UID) && take_owner) ||
2121		    ((idmask == AT_GID) && take_group)) {
2122			if (zfs_zaccess_v4_perm(zp, ACE_WRITE_OWNER, cr) == 0) {
2123				/*
2124				 * Remove setuid/setgid for non-privileged users
2125				 */
2126				secpolicy_setid_clear(vap, cr);
2127				trim_mask = (mask & (AT_UID|AT_GID));
2128			} else {
2129				need_policy =  TRUE;
2130			}
2131		} else {
2132			need_policy =  TRUE;
2133		}
2134	}
2135
	/* Snapshot the current mode/uid/gid for the policy routines below. */
2136	mutex_enter(&zp->z_lock);
2137	oldva.va_mode = pzp->zp_mode;
2138	oldva.va_uid = zp->z_phys->zp_uid;
2139	oldva.va_gid = zp->z_phys->zp_gid;
2140	mutex_exit(&zp->z_lock);
2141
2142	if (mask & AT_MODE) {
2143		if (zfs_zaccess_v4_perm(zp, ACE_WRITE_ACL, cr) == 0) {
2144			err = secpolicy_setid_setsticky_clear(vp, vap,
2145			    &oldva, cr);
2146			if (err) {
2147				ZFS_EXIT(zfsvfs);
2148				return (err);
2149			}
2150			trim_mask |= AT_MODE;
2151		} else {
2152			need_policy = TRUE;
2153		}
2154	}
2155
2156	if (need_policy) {
2157		/*
2158		 * If trim_mask is set then take ownership
2159		 * has been granted or write_acl is present and user
2160		 * has the ability to modify mode.  In that case remove
2161		 * UID|GID and or MODE from mask so that
2162		 * secpolicy_vnode_setattr() doesn't revoke it.
2163		 */
2164
		/*
		 * saved_mask is only written, and only read back below, when
		 * trim_mask is non-zero.
		 */
2165		if (trim_mask) {
2166			saved_mask = vap->va_mask;
2167			vap->va_mask &= ~trim_mask;
2168
2169		}
2170		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
2171		    (int (*)(void *, int, cred_t *))zfs_zaccess_rwx, zp);
		/*
		 * NOTE(review): on this error return the trimmed bits are
		 * not put back into vap->va_mask.
		 */
2172		if (err) {
2173			ZFS_EXIT(zfsvfs);
2174			return (err);
2175		}
2176
2177		if (trim_mask)
2178			vap->va_mask |= saved_mask;
2179	}
2180
2181	/*
2182	 * secpolicy_vnode_setattr, or take ownership may have
2183	 * changed va_mask
2184	 */
2185	mask = vap->va_mask;
2186
2187	tx = dmu_tx_create(zfsvfs->z_os);
2188	dmu_tx_hold_bonus(tx, zp->z_id);
2189
2190	if (mask & AT_MODE) {
2191		uint64_t pmode = pzp->zp_mode;
2192
		/* Keep the existing file type bits; take the rest from vap. */
2193		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
2194
2195		if (zp->z_phys->zp_acl.z_acl_extern_obj)
2196			dmu_tx_hold_write(tx,
2197			    pzp->zp_acl.z_acl_extern_obj, 0, SPA_MAXBLOCKSIZE);
2198		else
2199			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2200			    0, ZFS_ACL_SIZE(MAX_ACL_SIZE));
2201	}
2202
	/*
	 * An owner/group change is mirrored onto the znode referenced by
	 * zp_xattr, so that znode is looked up (and held) here.
	 */
2203	if ((mask & (AT_UID | AT_GID)) && zp->z_phys->zp_xattr != 0) {
2204		err = zfs_zget(zp->z_zfsvfs, zp->z_phys->zp_xattr, &attrzp);
2205		if (err) {
2206			dmu_tx_abort(tx);
2207			ZFS_EXIT(zfsvfs);
2208			return (err);
2209		}
2210		dmu_tx_hold_bonus(tx, attrzp->z_id);
2211	}
2212
2213	err = dmu_tx_assign(tx, zfsvfs->z_assign);
2214	if (err) {
		/* Drop the xattr znode hold; "top" resets attrzp to NULL. */
2215		if (attrzp)
2216			VN_RELE(ZTOV(attrzp));
2217		if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
2218			dmu_tx_wait(tx);
2219			dmu_tx_abort(tx);
2220			goto top;
2221		}
2222		dmu_tx_abort(tx);
2223		ZFS_EXIT(zfsvfs);
2224		return (err);
2225	}
2226
2227	dmu_buf_will_dirty(zp->z_dbuf, tx);
2228
2229	/*
2230	 * Set each attribute requested.
2231	 * We group settings according to the locks they need to acquire.
2232	 *
2233	 * Note: you cannot set ctime directly, although it will be
2234	 * updated as a side-effect of calling this function.
2235	 */
2236
2237	mutex_enter(&zp->z_lock);
2238
2239	if (mask & AT_MODE) {
2240		err = zfs_acl_chmod_setattr(zp, new_mode, tx);
2241		ASSERT3U(err, ==, 0);
2242	}
2243
	/* attrzp->z_lock is taken while zp->z_lock is already held. */
2244	if (attrzp)
2245		mutex_enter(&attrzp->z_lock);
2246
2247	if (mask & AT_UID) {
2248		zp->z_phys->zp_uid = (uint64_t)vap->va_uid;
2249		if (attrzp) {
2250			attrzp->z_phys->zp_uid = (uint64_t)vap->va_uid;
2251		}
2252	}
2253
2254	if (mask & AT_GID) {
2255		zp->z_phys->zp_gid = (uint64_t)vap->va_gid;
2256		if (attrzp)
2257			attrzp->z_phys->zp_gid = (uint64_t)vap->va_gid;
2258	}
2259
2260	if (attrzp)
2261		mutex_exit(&attrzp->z_lock);
2262
2263	if (mask & AT_ATIME)
2264		ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
2265
2266	if (mask & AT_MTIME)
2267		ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
2268
2269	if (mask & AT_SIZE)
2270		zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx);
2271	else if (mask != 0)
2272		zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
2273
2274	if (mask != 0)
2275		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask);
2276
2277	mutex_exit(&zp->z_lock);
2278
2279	if (attrzp)
2280		VN_RELE(ZTOV(attrzp));
2281
2282	dmu_tx_commit(tx);
2283
2284	ZFS_EXIT(zfsvfs);
	/*
	 * err is the successful (zero) dmu_tx_assign() result here, or the
	 * zfs_acl_chmod_setattr() result when AT_MODE was applied.
	 */
2285	return (err);
2286}
2287
/*
 * One entry per parent lock taken by zfs_rename_lock().  Entries are
 * pushed onto the head of a singly linked list and are released and
 * freed by zfs_rename_unlock().
 */
2288typedef struct zfs_zlock {
2289	krwlock_t	*zl_rwlock;	/* lock we acquired */
2290	znode_t		*zl_znode;	/* znode we held */
					/* (NULL if no znode hold was taken) */
2291	struct zfs_zlock *zl_next;	/* next in list */
2292} zfs_zlock_t;
2293
2294/*
2295 * Drop locks and release vnodes that were held by zfs_rename_lock().
2296 */
2297static void
2298zfs_rename_unlock(zfs_zlock_t **zlpp)
2299{
2300	zfs_zlock_t *zl;
2301
2302	while ((zl = *zlpp) != NULL) {
2303		if (zl->zl_znode != NULL)
2304			VN_RELE(ZTOV(zl->zl_znode));
2305		rw_exit(zl->zl_rwlock);
2306		*zlpp = zl->zl_next;
2307		kmem_free(zl, sizeof (*zl));
2308	}
2309}
2310
2311/*
2312 * Search back through the directory tree, using the ".." entries.
2313 * Lock each directory in the chain to prevent concurrent renames.
2314 * Fail any attempt to move a directory into one of its own descendants.
2315 * XXX - z_parent_lock can overlap with map or grow locks
 *
 *	IN:	szp	- znode being renamed.
 *		tdzp	- target directory znode; the walk starts here.
 *		sdzp	- source directory znode; the walk stops when it
 *			  is reached.
 *
 *	OUT:	zlpp	- head of the list of locks taken.  Entries already
 *			  pushed stay on the list when an error is returned
 *			  (presumably the caller passes it to
 *			  zfs_rename_unlock() -- confirm against caller).
 *
 *	RETURN:	0 if success
 *		EINVAL if the target lies beneath szp
 *		error code from zfs_zget() otherwise
2316 */
2317static int
2318zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
2319{
2320	zfs_zlock_t	*zl;
2321	znode_t		*zp = tdzp;
2322	uint64_t	rootid = zp->z_zfsvfs->z_root;
2323	uint64_t	*oidp = &zp->z_id;
2324	krwlock_t	*rwlp = &szp->z_parent_lock;
2325	krw_t		rw = RW_WRITER;
2326
2327	/*
2328	 * First pass write-locks szp and compares to zp->z_id.
2329	 * Later passes read-lock zp and compare to zp->z_parent.
2330	 */
2331	do {
2332		if (!rw_tryenter(rwlp, rw)) {
2333			/*
2334			 * Another thread is renaming in this path.
2335			 * Note that if we are a WRITER, we don't have any
2336			 * parent_locks held yet.
2337			 */
2338			if (rw == RW_READER && zp->z_id > szp->z_id) {
2339				/*
2340				 * Drop our locks and restart
2341				 */
				/*
				 * rw is only RW_READER after at least one
				 * full pass, so zl is the entry pushed last,
				 * i.e. the current list head.  Unlocking via
				 * &zl therefore releases the whole list; the
				 * caller's head is then cleared by hand.
				 * The "continue" re-evaluates the loop
				 * condition with zp reset to tdzp.
				 */
2342				zfs_rename_unlock(&zl);
2343				*zlpp = NULL;
2344				zp = tdzp;
2345				oidp = &zp->z_id;
2346				rwlp = &szp->z_parent_lock;
2347				rw = RW_WRITER;
2348				continue;
2349			} else {
2350				/*
2351				 * Wait for other thread to drop its locks
2352				 */
2353				rw_enter(rwlp, rw);
2354			}
2355		}
2356
		/* Record the lock just taken at the head of the list. */
2357		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
2358		zl->zl_rwlock = rwlp;
2359		zl->zl_znode = NULL;
2360		zl->zl_next = *zlpp;
2361		*zlpp = zl;
2362
2363		if (*oidp == szp->z_id)		/* We're a descendant of szp */
2364			return (EINVAL);
2365
2366		if (*oidp == rootid)		/* We've hit the top */
2367			return (0);
2368
2369		if (rw == RW_READER) {		/* i.e. not the first pass */
2370			int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp);
2371			if (error)
2372				return (error);
			/* Remember the hold so zfs_rename_unlock() drops it. */
2373			zl->zl_znode = zp;
2374		}
		/* Next pass: read-lock zp and examine its parent object id. */
2375		oidp = &zp->z_phys->zp_parent;
2376		rwlp = &zp->z_parent_lock;
2377		rw = RW_READER;
2378
2379	} while (zp->z_id != sdzp->z_id);
2380
2381	return (0);
2382}
2383
2384/*
2385 * Move an entry from the provided source directory to the target
2386 * directory.  Change the entry name as indicated.
2387 *
2388 *	IN:	sdvp	- Source directory containing the "old entry".
2389 *		snm	- Old entry name.
2390 *		tdvp	- Target directory to contain the "new entry".
2391 *		tnm	- New entry name.
2392 *		cr	- credentials of caller.
2393 *
2394 *	RETURN:	0 if success
2395 *		error code if failure
2396 *
2397 * Timestamps:
2398 *	sdvp,tdvp - ctime|mtime updated
2399 */
2400static int
2401zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr)
2402{
2403	znode_t		*tdzp, *szp, *tzp;
2404	znode_t		*sdzp = VTOZ(sdvp);
2405	zfsvfs_t	*zfsvfs = sdzp->z_zfsvfs;
2406	zilog_t		*zilog = zfsvfs->z_log;
2407	vnode_t		*realvp;
2408	zfs_dirlock_t	*sdl, *tdl;
2409	dmu_tx_t	*tx;
2410	zfs_zlock_t	*zl;
2411	int		cmp, serr, terr, error;
2412
2413	ZFS_ENTER(zfsvfs);
2414
2415	/*
2416	 * Make sure we have the real vp for the target directory.
2417	 */
2418	if (VOP_REALVP(tdvp, &realvp) == 0)
2419		tdvp = realvp;
2420
2421	if (tdvp->v_vfsp != sdvp->v_vfsp) {
2422		ZFS_EXIT(zfsvfs);
2423		return (EXDEV);
2424	}
2425
2426	tdzp = VTOZ(tdvp);
2427top:
2428	szp = NULL;
2429	tzp = NULL;
2430	zl = NULL;
2431
2432	/*
2433	 * This is to prevent the creation of links into attribute space
2434	 * by renaming a linked file into/outof an attribute directory.
2435	 * See the comment in zfs_link() for why this is considered bad.
2436	 */
2437	if ((tdzp->z_phys->zp_flags & ZFS_XATTR) !=
2438	    (sdzp->z_phys->zp_flags & ZFS_XATTR)) {
2439		ZFS_EXIT(zfsvfs);
2440		return (EINVAL);
2441	}
2442
2443	/*
2444	 * Lock source and target directory entries.  To prevent deadlock,
2445	 * a lock ordering must be defined.  We lock the directory with
2446	 * the smallest object id first, or if it's a tie, the one with
2447	 * the lexically first name.
2448	 */
2449	if (sdzp->z_id < tdzp->z_id) {
2450		cmp = -1;
2451	} else if (sdzp->z_id > tdzp->z_id) {
2452		cmp = 1;
2453	} else {
2454		cmp = strcmp(snm, tnm);
2455		if (cmp == 0) {
2456			/*
2457			 * POSIX: "If the old argument and the new argument
2458			 * both refer to links to the same existing file,
2459			 * the rename() function shall return successfully
2460			 * and perform no other action."
2461			 */
2462			ZFS_EXIT(zfsvfs);
2463			return (0);
2464		}
2465	}
2466	if (cmp < 0) {
2467		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS);
2468		terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0);
2469	} else {
2470		terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0);
2471		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS);
2472	}
2473
2474	if (serr) {
2475		/*
2476		 * Source entry invalid or not there.
2477		 */
2478		if (!terr) {
2479			zfs_dirent_unlock(tdl);
2480			if (tzp)
2481				VN_RELE(ZTOV(tzp));
2482		}
2483		if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0)
2484			serr = EINVAL;
2485		ZFS_EXIT(zfsvfs);
2486		return (serr);
2487	}
2488	if (terr) {
2489		zfs_dirent_unlock(sdl);
2490		VN_RELE(ZTOV(szp));
2491		if (strcmp(tnm, "..") == 0)
2492			terr = EINVAL;
2493		ZFS_EXIT(zfsvfs);
2494		return (terr);
2495	}
2496
2497	/*
2498	 * Must have write access at the source to remove the old entry
2499	 * and write access at the target to create the new entry.
2500	 * Note that if target and source are the same, this can be
2501	 * done in a single check.
2502	 */
2503
2504	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
2505		goto out;
2506
2507	if (ZTOV(szp)->v_type == VDIR) {
2508		/*
2509		 * Check to make sure rename is valid.
2510		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
2511		 */
2512		if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
2513			goto out;
2514	}
2515
2516	/*
2517	 * Does target exist?
2518	 */
2519	if (tzp) {
2520		/*
2521		 * Source and target must be the same type.
2522		 */
2523		if (ZTOV(szp)->v_type == VDIR) {
2524			if (ZTOV(tzp)->v_type != VDIR) {
2525				error = ENOTDIR;
2526				goto out;
2527			}
2528		} else {
2529			if (ZTOV(tzp)->v_type == VDIR) {
2530				error = EISDIR;
2531				goto out;
2532			}
2533		}
2534		/*
2535		 * POSIX dictates that when the source and target
2536		 * entries refer to the same file object, rename
2537		 * must do nothing and exit without error.
2538		 */
2539		if (szp->z_id == tzp->z_id) {
2540			error = 0;
2541			goto out;
2542		}
2543	}
2544
2545	vnevent_rename_src(ZTOV(szp));
2546	if (tzp)
2547		vnevent_rename_dest(ZTOV(tzp));
2548
2549	tx = dmu_tx_create(zfsvfs->z_os);
2550	dmu_tx_hold_bonus(tx, szp->z_id);	/* nlink changes */
2551	dmu_tx_hold_bonus(tx, sdzp->z_id);	/* nlink changes */
2552	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
2553	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
2554	if (sdzp != tdzp)
2555		dmu_tx_hold_bonus(tx, tdzp->z_id);	/* nlink changes */
2556	if (tzp)
2557		dmu_tx_hold_bonus(tx, tzp->z_id);	/* parent changes */
2558	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2559	error = dmu_tx_assign(tx, zfsvfs->z_assign);
2560	if (error) {
2561		if (zl != NULL)
2562			zfs_rename_unlock(&zl);
2563		zfs_dirent_unlock(sdl);
2564		zfs_dirent_unlock(tdl);
2565		VN_RELE(ZTOV(szp));
2566		if (tzp)
2567			VN_RELE(ZTOV(tzp));
2568		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
2569			dmu_tx_wait(tx);
2570			dmu_tx_abort(tx);
2571			goto top;
2572		}
2573		dmu_tx_abort(tx);
2574		ZFS_EXIT(zfsvfs);
2575		return (error);
2576	}
2577
2578	if (tzp)	/* Attempt to remove the existing target */
2579		error = zfs_link_destroy(tdl, tzp, tx, 0, NULL);
2580
2581	if (error == 0) {
2582		error = zfs_link_create(tdl, szp, tx, ZRENAMING);
2583		if (error == 0) {
2584			error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
2585			ASSERT(error == 0);
2586			zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
2587			    sdl->dl_name, tdzp, tdl->dl_name, szp);
2588		}
2589#ifdef FREEBSD_NAMECACHE
2590		if (error == 0) {
2591			cache_purge(sdvp);
2592			cache_purge(tdvp);
2593		}
2594#endif
2595	}
2596
2597	dmu_tx_commit(tx);
2598out:
2599	if (zl != NULL)
2600		zfs_rename_unlock(&zl);
2601
2602	zfs_dirent_unlock(sdl);
2603	zfs_dirent_unlock(tdl);
2604
2605	VN_RELE(ZTOV(szp));
2606	if (tzp)
2607		VN_RELE(ZTOV(tzp));
2608
2609	ZFS_EXIT(zfsvfs);
2610
2611	return (error);
2612}
2613
2614/*
2615 * Insert the indicated symbolic reference entry into the directory.
2616 *
2617 *	IN:	dvp	- Directory to contain new symbolic link.
2618 *		link	- Name for new symlink entry.
2619 *		vap	- Attributes of new entry.
2620 *		target	- Target path of new symlink.
2621 *		cr	- credentials of caller.
2622 *
2623 *	RETURN:	0 if success
2624 *		error code if failure
2625 *
2626 * Timestamps:
2627 *	dvp - ctime|mtime updated
2628 */
2629static int
2630zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, cred_t *cr, kthread_t *td)
2631{
2632	znode_t		*zp, *dzp = VTOZ(dvp);
2633	zfs_dirlock_t	*dl;
2634	dmu_tx_t	*tx;
2635	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2636	zilog_t		*zilog = zfsvfs->z_log;
2637	uint64_t	zoid;
2638	int		len = strlen(link);
2639	int		error;
2640
2641	ASSERT(vap->va_type == VLNK);
2642
2643	ZFS_ENTER(zfsvfs);
2644top:
2645	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
2646		ZFS_EXIT(zfsvfs);
2647		return (error);
2648	}
2649
2650	if (len > MAXPATHLEN) {
2651		ZFS_EXIT(zfsvfs);
2652		return (ENAMETOOLONG);
2653	}
2654
2655	/*
2656	 * Attempt to lock directory; fail if entry already exists.
2657	 */
2658	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZNEW)) {
2659		ZFS_EXIT(zfsvfs);
2660		return (error);
2661	}
2662
2663	tx = dmu_tx_create(zfsvfs->z_os);
2664	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
2665	dmu_tx_hold_bonus(tx, dzp->z_id);
2666	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
2667	if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
2668		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE);
2669	error = dmu_tx_assign(tx, zfsvfs->z_assign);
2670	if (error) {
2671		zfs_dirent_unlock(dl);
2672		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
2673			dmu_tx_wait(tx);
2674			dmu_tx_abort(tx);
2675			goto top;
2676		}
2677		dmu_tx_abort(tx);
2678		ZFS_EXIT(zfsvfs);
2679		return (error);
2680	}
2681
2682	dmu_buf_will_dirty(dzp->z_dbuf, tx);
2683
2684	/*
2685	 * Create a new object for the symlink.
2686	 * Put the link content into bonus buffer if it will fit;
2687	 * otherwise, store it just like any other file data.
2688	 */
2689	zoid = 0;
2690	if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) {
2691		zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, len);
2692		if (len != 0)
2693			bcopy(link, zp->z_phys + 1, len);
2694	} else {
2695		dmu_buf_t *dbp;
2696
2697		zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);
2698
2699		/*
2700		 * Nothing can access the znode yet so no locking needed
2701		 * for growing the znode's blocksize.
2702		 */
2703		zfs_grow_blocksize(zp, len, tx);
2704
2705		VERIFY(0 == dmu_buf_hold(zfsvfs->z_os, zoid, 0, FTAG, &dbp));
2706		dmu_buf_will_dirty(dbp, tx);
2707
2708		ASSERT3U(len, <=, dbp->db_size);
2709		bcopy(link, dbp->db_data, len);
2710		dmu_buf_rele(dbp, FTAG);
2711	}
2712	zp->z_phys->zp_size = len;
2713
2714	/*
2715	 * Insert the new object into the directory.
2716	 */
2717	(void) zfs_link_create(dl, zp, tx, ZNEW);
2718out:
2719	if (error == 0) {
2720		zfs_log_symlink(zilog, tx, TX_SYMLINK, dzp, zp, name, link);
2721		*vpp = ZTOV(zp);
2722		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);
2723	}
2724
2725	dmu_tx_commit(tx);
2726
2727	zfs_dirent_unlock(dl);
2728
2729	ZFS_EXIT(zfsvfs);
2730	return (error);
2731}
2732
2733/*
2734 * Return, in the buffer contained in the provided uio structure,
2735 * the symbolic path referred to by vp.
2736 *
2737 *	IN:	vp	- vnode of symbolic link.
2738 *		uoip	- structure to contain the link path.
2739 *		cr	- credentials of caller.
2740 *
2741 *	OUT:	uio	- structure to contain the link path.
2742 *
2743 *	RETURN:	0 if success
2744 *		error code if failure
2745 *
2746 * Timestamps:
2747 *	vp - atime updated
2748 */
2749/* ARGSUSED */
2750static int
2751zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr)
2752{
2753	znode_t		*zp = VTOZ(vp);
2754	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2755	size_t		bufsz;
2756	int		error;
2757
2758	ZFS_ENTER(zfsvfs);
2759
2760	bufsz = (size_t)zp->z_phys->zp_size;
2761	if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) {
2762		error = uiomove(zp->z_phys + 1,
2763		    MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
2764	} else {
2765		dmu_buf_t *dbp;
2766		error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp);
2767		if (error) {
2768			ZFS_EXIT(zfsvfs);
2769			return (error);
2770		}
2771		error = uiomove(dbp->db_data,
2772		    MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
2773		dmu_buf_rele(dbp, FTAG);
2774	}
2775
2776	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2777	ZFS_EXIT(zfsvfs);
2778	return (error);
2779}
2780
2781/*
2782 * Insert a new entry into directory tdvp referencing svp.
2783 *
2784 *	IN:	tdvp	- Directory to contain new entry.
2785 *		svp	- vnode of new entry.
2786 *		name	- name of new entry.
2787 *		cr	- credentials of caller.
2788 *
2789 *	RETURN:	0 if success
2790 *		error code if failure
2791 *
2792 * Timestamps:
2793 *	tdvp - ctime|mtime updated
2794 *	 svp - ctime updated
2795 */
2796/* ARGSUSED */
2797static int
2798zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr)
2799{
2800	znode_t		*dzp = VTOZ(tdvp);
2801	znode_t		*tzp, *szp;
2802	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2803	zilog_t		*zilog = zfsvfs->z_log;
2804	zfs_dirlock_t	*dl;
2805	dmu_tx_t	*tx;
2806	vnode_t		*realvp;
2807	int		error;
2808
2809	ASSERT(tdvp->v_type == VDIR);
2810
2811	ZFS_ENTER(zfsvfs);
2812
2813	if (VOP_REALVP(svp, &realvp) == 0)
2814		svp = realvp;
2815
2816	if (svp->v_vfsp != tdvp->v_vfsp) {
2817		ZFS_EXIT(zfsvfs);
2818		return (EXDEV);
2819	}
2820
2821	szp = VTOZ(svp);
2822top:
2823	/*
2824	 * We do not support links between attributes and non-attributes
2825	 * because of the potential security risk of creating links
2826	 * into "normal" file space in order to circumvent restrictions
2827	 * imposed in attribute space.
2828	 */
2829	if ((szp->z_phys->zp_flags & ZFS_XATTR) !=
2830	    (dzp->z_phys->zp_flags & ZFS_XATTR)) {
2831		ZFS_EXIT(zfsvfs);
2832		return (EINVAL);
2833	}
2834
2835	/*
2836	 * POSIX dictates that we return EPERM here.
2837	 * Better choices include ENOTSUP or EISDIR.
2838	 */
2839	if (svp->v_type == VDIR) {
2840		ZFS_EXIT(zfsvfs);
2841		return (EPERM);
2842	}
2843
2844	if ((uid_t)szp->z_phys->zp_uid != crgetuid(cr) &&
2845	    secpolicy_basic_link(cr) != 0) {
2846		ZFS_EXIT(zfsvfs);
2847		return (EPERM);
2848	}
2849
2850	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
2851		ZFS_EXIT(zfsvfs);
2852		return (error);
2853	}
2854
2855	/*
2856	 * Attempt to lock directory; fail if entry already exists.
2857	 */
2858	if (error = zfs_dirent_lock(&dl, dzp, name, &tzp, ZNEW)) {
2859		ZFS_EXIT(zfsvfs);
2860		return (error);
2861	}
2862
2863	tx = dmu_tx_create(zfsvfs->z_os);
2864	dmu_tx_hold_bonus(tx, szp->z_id);
2865	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
2866	error = dmu_tx_assign(tx, zfsvfs->z_assign);
2867	if (error) {
2868		zfs_dirent_unlock(dl);
2869		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
2870			dmu_tx_wait(tx);
2871			dmu_tx_abort(tx);
2872			goto top;
2873		}
2874		dmu_tx_abort(tx);
2875		ZFS_EXIT(zfsvfs);
2876		return (error);
2877	}
2878
2879	error = zfs_link_create(dl, szp, tx, 0);
2880
2881	if (error == 0)
2882		zfs_log_link(zilog, tx, TX_LINK, dzp, szp, name);
2883
2884	dmu_tx_commit(tx);
2885
2886	zfs_dirent_unlock(dl);
2887
2888	ZFS_EXIT(zfsvfs);
2889	return (error);
2890}
2891
2892void
2893zfs_inactive(vnode_t *vp, cred_t *cr)
2894{
2895	znode_t	*zp = VTOZ(vp);
2896	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2897	int error;
2898
2899	rw_enter(&zfsvfs->z_um_lock, RW_READER);
2900	if (zfsvfs->z_unmounted2) {
2901		ASSERT(zp->z_dbuf_held == 0);
2902
2903		mutex_enter(&zp->z_lock);
2904		VI_LOCK(vp);
2905		vp->v_count = 0; /* count arrives as 1 */
2906		VI_UNLOCK(vp);
2907		if (zp->z_dbuf == NULL) {
2908			mutex_exit(&zp->z_lock);
2909			zfs_znode_free(zp);
2910		} else {
2911			mutex_exit(&zp->z_lock);
2912		}
2913		rw_exit(&zfsvfs->z_um_lock);
2914		VFS_RELE(zfsvfs->z_vfs);
2915		return;
2916	}
2917
2918	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
2919		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
2920
2921		dmu_tx_hold_bonus(tx, zp->z_id);
2922		error = dmu_tx_assign(tx, TXG_WAIT);
2923		if (error) {
2924			dmu_tx_abort(tx);
2925		} else {
2926			dmu_buf_will_dirty(zp->z_dbuf, tx);
2927			mutex_enter(&zp->z_lock);
2928			zp->z_atime_dirty = 0;
2929			mutex_exit(&zp->z_lock);
2930			dmu_tx_commit(tx);
2931		}
2932	}
2933
2934	zfs_zinactive(zp);
2935	rw_exit(&zfsvfs->z_um_lock);
2936}
2937
2938CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
2939CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
2940
2941static int
2942zfs_fid(vnode_t *vp, fid_t *fidp)
2943{
2944	znode_t		*zp = VTOZ(vp);
2945	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2946	uint32_t	gen = (uint32_t)zp->z_phys->zp_gen;
2947	uint64_t	object = zp->z_id;
2948	zfid_short_t	*zfid;
2949	int		size, i;
2950
2951	ZFS_ENTER(zfsvfs);
2952
2953	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
2954	fidp->fid_len = size;
2955
2956	zfid = (zfid_short_t *)fidp;
2957
2958	zfid->zf_len = size;
2959
2960	for (i = 0; i < sizeof (zfid->zf_object); i++)
2961		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
2962
2963	/* Must have a non-zero generation number to distinguish from .zfs */
2964	if (gen == 0)
2965		gen = 1;
2966	for (i = 0; i < sizeof (zfid->zf_gen); i++)
2967		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
2968
2969	if (size == LONG_FID_LEN) {
2970		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
2971		zfid_long_t	*zlfid;
2972
2973		zlfid = (zfid_long_t *)fidp;
2974
2975		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
2976			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
2977
2978		/* XXX - this should be the generation number for the objset */
2979		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
2980			zlfid->zf_setgen[i] = 0;
2981	}
2982
2983	ZFS_EXIT(zfsvfs);
2984	return (0);
2985}
2986
2987static int
2988zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr)
2989{
2990	znode_t		*zp, *xzp;
2991	zfsvfs_t	*zfsvfs;
2992	zfs_dirlock_t	*dl;
2993	int		error;
2994
2995	switch (cmd) {
2996	case _PC_LINK_MAX:
2997		*valp = INT_MAX;
2998		return (0);
2999
3000	case _PC_FILESIZEBITS:
3001		*valp = 64;
3002		return (0);
3003
3004#if 0
3005	case _PC_XATTR_EXISTS:
3006		zp = VTOZ(vp);
3007		zfsvfs = zp->z_zfsvfs;
3008		ZFS_ENTER(zfsvfs);
3009		*valp = 0;
3010		error = zfs_dirent_lock(&dl, zp, "", &xzp,
3011		    ZXATTR | ZEXISTS | ZSHARED);
3012		if (error == 0) {
3013			zfs_dirent_unlock(dl);
3014			if (!zfs_dirempty(xzp))
3015				*valp = 1;
3016			VN_RELE(ZTOV(xzp));
3017		} else if (error == ENOENT) {
3018			/*
3019			 * If there aren't extended attributes, it's the
3020			 * same as having zero of them.
3021			 */
3022			error = 0;
3023		}
3024		ZFS_EXIT(zfsvfs);
3025		return (error);
3026#endif
3027
3028	case _PC_ACL_EXTENDED:
3029		*valp = 0;	/* TODO */
3030		return (0);
3031
3032	case _PC_MIN_HOLE_SIZE:
3033		*valp = (int)SPA_MINBLOCKSIZE;
3034		return (0);
3035
3036	default:
3037		return (EOPNOTSUPP);
3038	}
3039}
3040
#ifdef TODO
/*
 * Fetch the ACL of vp into vsecp by calling zfs_getacl() between
 * ZFS_ENTER and ZFS_EXIT.  Returns whatever zfs_getacl() returns.
 */
/*ARGSUSED*/
static int
zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	int		rc;

	ZFS_ENTER(zfsvfs);
	rc = zfs_getacl(zp, vsecp, cr);
	ZFS_EXIT(zfsvfs);

	return (rc);
}
#endif	/* TODO */
3057
#ifdef TODO
/*
 * Install the ACL described by vsecp on vp by calling zfs_setacl()
 * between ZFS_ENTER and ZFS_EXIT.  Returns whatever zfs_setacl()
 * returns.
 */
/*ARGSUSED*/
static int
zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	int		rc;

	ZFS_ENTER(zfsvfs);
	rc = zfs_setacl(zp, vsecp, cr);
	ZFS_EXIT(zfsvfs);

	return (rc);
}
#endif	/* TODO */
3073
3074static int
3075zfs_freebsd_open(ap)
3076	struct vop_open_args /* {
3077		struct vnode *a_vp;
3078		int a_mode;
3079		struct ucred *a_cred;
3080		struct thread *a_td;
3081	} */ *ap;
3082{
3083	vnode_t	*vp = ap->a_vp;
3084	znode_t *zp = VTOZ(vp);
3085	int error;
3086
3087	error = zfs_open(&vp, ap->a_mode, ap->a_cred);
3088	if (error == 0)
3089		vnode_create_vobject(vp, zp->z_phys->zp_size, ap->a_td);
3090	return (error);
3091}
3092
3093static int
3094zfs_freebsd_close(ap)
3095	struct vop_close_args /* {
3096		struct vnode *a_vp;
3097		int  a_fflag;
3098		struct ucred *a_cred;
3099		struct thread *a_td;
3100	} */ *ap;
3101{
3102
3103	return (zfs_close(ap->a_vp, ap->a_fflag, 0, 0, ap->a_cred));
3104}
3105
3106static int
3107zfs_freebsd_ioctl(ap)
3108	struct vop_ioctl_args /* {
3109		struct vnode *a_vp;
3110		u_long a_command;
3111		caddr_t a_data;
3112		int a_fflag;
3113		struct ucred *cred;
3114		struct thread *td;
3115	} */ *ap;
3116{
3117
3118	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
3119	    ap->a_fflag, ap->a_cred, NULL));
3120}
3121
3122static int
3123zfs_freebsd_read(ap)
3124	struct vop_read_args /* {
3125		struct vnode *a_vp;
3126		struct uio *a_uio;
3127		int a_ioflag;
3128		struct ucred *a_cred;
3129	} */ *ap;
3130{
3131
3132	return (zfs_read(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL));
3133}
3134
3135static int
3136zfs_freebsd_write(ap)
3137	struct vop_write_args /* {
3138		struct vnode *a_vp;
3139		struct uio *a_uio;
3140		int a_ioflag;
3141		struct ucred *a_cred;
3142	} */ *ap;
3143{
3144
3145	return (zfs_write(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL));
3146}
3147
3148static int
3149zfs_freebsd_access(ap)
3150	struct vop_access_args /* {
3151		struct vnode *a_vp;
3152		int  a_mode;
3153		struct ucred *a_cred;
3154		struct thread *a_td;
3155	} */ *ap;
3156{
3157
3158	return (zfs_access(ap->a_vp, ap->a_mode, 0, ap->a_cred));
3159}
3160
3161static int
3162zfs_freebsd_lookup(ap)
3163	struct vop_lookup_args /* {
3164		struct vnode *a_dvp;
3165		struct vnode **a_vpp;
3166		struct componentname *a_cnp;
3167	} */ *ap;
3168{
3169	struct componentname *cnp = ap->a_cnp;
3170	char nm[NAME_MAX + 1];
3171
3172	ASSERT(cnp->cn_namelen < sizeof(nm));
3173	strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
3174
3175	return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
3176	    cnp->cn_cred, cnp->cn_thread));
3177}
3178
3179static int
3180zfs_freebsd_create(ap)
3181	struct vop_create_args /* {
3182		struct vnode *a_dvp;
3183		struct vnode **a_vpp;
3184		struct componentname *a_cnp;
3185		struct vattr *a_vap;
3186	} */ *ap;
3187{
3188	struct componentname *cnp = ap->a_cnp;
3189	vattr_t *vap = ap->a_vap;
3190	int mode;
3191
3192	ASSERT(cnp->cn_flags & SAVENAME);
3193
3194	vattr_init_mask(vap);
3195	mode = vap->va_mode & ALLPERMS;
3196
3197	return (zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
3198	    ap->a_vpp, cnp->cn_cred, cnp->cn_thread));
3199}
3200
3201static int
3202zfs_freebsd_remove(ap)
3203	struct vop_remove_args /* {
3204		struct vnode *a_dvp;
3205		struct vnode *a_vp;
3206		struct componentname *a_cnp;
3207	} */ *ap;
3208{
3209
3210	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
3211
3212	return (zfs_remove(ap->a_dvp, ap->a_cnp->cn_nameptr,
3213	    ap->a_cnp->cn_cred));
3214}
3215
3216static int
3217zfs_freebsd_mkdir(ap)
3218	struct vop_mkdir_args /* {
3219		struct vnode *a_dvp;
3220		struct vnode **a_vpp;
3221		struct componentname *a_cnp;
3222		struct vattr *a_vap;
3223	} */ *ap;
3224{
3225	vattr_t *vap = ap->a_vap;
3226
3227	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
3228
3229	vattr_init_mask(vap);
3230
3231	return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
3232	    ap->a_cnp->cn_cred));
3233}
3234
3235static int
3236zfs_freebsd_rmdir(ap)
3237	struct vop_rmdir_args /* {
3238		struct vnode *a_dvp;
3239		struct vnode *a_vp;
3240		struct componentname *a_cnp;
3241	} */ *ap;
3242{
3243	struct componentname *cnp = ap->a_cnp;
3244
3245	ASSERT(cnp->cn_flags & SAVENAME);
3246
3247	return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred));
3248}
3249
3250static int
3251zfs_freebsd_readdir(ap)
3252	struct vop_readdir_args /* {
3253		struct vnode *a_vp;
3254		struct uio *a_uio;
3255		struct ucred *a_cred;
3256		int *a_eofflag;
3257		int *a_ncookies;
3258		u_long **a_cookies;
3259	} */ *ap;
3260{
3261
3262	return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
3263	    ap->a_ncookies, ap->a_cookies));
3264}
3265
3266static int
3267zfs_freebsd_fsync(ap)
3268	struct vop_fsync_args /* {
3269		struct vnode *a_vp;
3270		int a_waitfor;
3271		struct thread *a_td;
3272	} */ *ap;
3273{
3274
3275	vop_stdfsync(ap);
3276	return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred));
3277}
3278
3279static int
3280zfs_freebsd_getattr(ap)
3281	struct vop_getattr_args /* {
3282		struct vnode *a_vp;
3283		struct vattr *a_vap;
3284		struct ucred *a_cred;
3285		struct thread *a_td;
3286	} */ *ap;
3287{
3288
3289	return (zfs_getattr(ap->a_vp, ap->a_vap, 0, ap->a_cred));
3290}
3291
3292static int
3293zfs_freebsd_setattr(ap)
3294	struct vop_setattr_args /* {
3295		struct vnode *a_vp;
3296		struct vattr *a_vap;
3297		struct ucred *a_cred;
3298		struct thread *a_td;
3299	} */ *ap;
3300{
3301	vattr_t *vap = ap->a_vap;
3302
3303	/* No support for FreeBSD's chflags(2). */
3304	if (vap->va_flags != VNOVAL)
3305		return (EOPNOTSUPP);
3306
3307	vattr_init_mask(vap);
3308
3309	return (zfs_setattr(ap->a_vp, vap, 0, ap->a_cred, NULL));
3310}
3311
3312static int
3313zfs_freebsd_rename(ap)
3314	struct vop_rename_args  /* {
3315		struct vnode *a_fdvp;
3316		struct vnode *a_fvp;
3317		struct componentname *a_fcnp;
3318		struct vnode *a_tdvp;
3319		struct vnode *a_tvp;
3320		struct componentname *a_tcnp;
3321	} */ *ap;
3322{
3323	vnode_t *fdvp = ap->a_fdvp;
3324	vnode_t *fvp = ap->a_fvp;
3325	vnode_t *tdvp = ap->a_tdvp;
3326	vnode_t *tvp = ap->a_tvp;
3327	int error;
3328
3329	ASSERT(ap->a_fcnp->cn_flags & SAVENAME);
3330	ASSERT(ap->a_tcnp->cn_flags & SAVENAME);
3331
3332	error = zfs_rename(fdvp, ap->a_fcnp->cn_nameptr, tdvp,
3333	    ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred);
3334
3335	if (tdvp == tvp)
3336		VN_RELE(tdvp);
3337	else
3338		VN_URELE(tdvp);
3339	if (tvp)
3340		VN_URELE(tvp);
3341	VN_RELE(fdvp);
3342	VN_RELE(fvp);
3343
3344	return (error);
3345}
3346
3347static int
3348zfs_freebsd_symlink(ap)
3349	struct vop_symlink_args /* {
3350		struct vnode *a_dvp;
3351		struct vnode **a_vpp;
3352		struct componentname *a_cnp;
3353		struct vattr *a_vap;
3354		char *a_target;
3355	} */ *ap;
3356{
3357	struct componentname *cnp = ap->a_cnp;
3358	vattr_t *vap = ap->a_vap;
3359
3360	ASSERT(cnp->cn_flags & SAVENAME);
3361
3362	vap->va_type = VLNK;	/* FreeBSD: Syscall only sets va_mode. */
3363	vattr_init_mask(vap);
3364
3365	return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
3366	    ap->a_target, cnp->cn_cred, cnp->cn_thread));
3367}
3368
3369static int
3370zfs_freebsd_readlink(ap)
3371	struct vop_readlink_args /* {
3372		struct vnode *a_vp;
3373		struct uio *a_uio;
3374		struct ucred *a_cred;
3375	} */ *ap;
3376{
3377
3378	return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred));
3379}
3380
3381static int
3382zfs_freebsd_link(ap)
3383	struct vop_link_args /* {
3384		struct vnode *a_tdvp;
3385		struct vnode *a_vp;
3386		struct componentname *a_cnp;
3387	} */ *ap;
3388{
3389	struct componentname *cnp = ap->a_cnp;
3390
3391	ASSERT(cnp->cn_flags & SAVENAME);
3392
3393	return (zfs_link(ap->a_tdvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
3394}
3395
3396static int
3397zfs_freebsd_inactive(ap)
3398        struct vop_inactive_args /* {
3399                struct vnode *a_vp;
3400                struct thread *a_td;
3401        } */ *ap;
3402{
3403	vnode_t *vp = ap->a_vp;
3404
3405	zfs_inactive(vp, ap->a_td->td_ucred);
3406	return (0);
3407}
3408
3409static int
3410zfs_freebsd_reclaim(ap)
3411	struct vop_reclaim_args /* {
3412		struct vnode *a_vp;
3413		struct thread *a_td;
3414	} */ *ap;
3415{
3416        vnode_t	*vp = ap->a_vp;
3417	znode_t	*zp = VTOZ(vp);
3418	zfsvfs_t *zfsvfs;
3419	int rele = 1;
3420
3421	ASSERT(zp != NULL);
3422
3423	/*
3424	 * Destroy the vm object and flush associated pages.
3425	 */
3426	vnode_destroy_vobject(vp);
3427
3428	mutex_enter(&zp->z_lock);
3429	ASSERT(zp->z_phys);
3430	ASSERT(zp->z_dbuf_held);
3431	zfsvfs = zp->z_zfsvfs;
3432	if (!zp->z_unlinked) {
3433		zp->z_dbuf_held = 0;
3434		ZTOV(zp) = NULL;
3435		mutex_exit(&zp->z_lock);
3436		dmu_buf_rele(zp->z_dbuf, NULL);
3437	} else {
3438		mutex_exit(&zp->z_lock);
3439	}
3440	VI_LOCK(vp);
3441	if (vp->v_count > 0)
3442		rele = 0;
3443	vp->v_data = NULL;
3444	ASSERT(vp->v_holdcnt > 1);
3445	vdropl(vp);
3446	if (!zp->z_unlinked && rele)
3447		VFS_RELE(zfsvfs->z_vfs);
3448	return (0);
3449}
3450
3451static int
3452zfs_freebsd_fid(ap)
3453	struct vop_fid_args /* {
3454		struct vnode *a_vp;
3455		struct fid *a_fid;
3456	} */ *ap;
3457{
3458
3459	return (zfs_fid(ap->a_vp, (void *)ap->a_fid));
3460}
3461
3462static int
3463zfs_freebsd_pathconf(ap)
3464	struct vop_pathconf_args /* {
3465		struct vnode *a_vp;
3466		int a_name;
3467		register_t *a_retval;
3468	} */ *ap;
3469{
3470	ulong_t val;
3471	int error;
3472
3473	error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred);
3474	if (error == 0)
3475		*ap->a_retval = val;
3476	else if (error == EOPNOTSUPP)
3477		error = vop_stdpathconf(ap);
3478	return (error);
3479}
3480
3481/*
3482 * Advisory record locking support
3483 */
3484static int
3485zfs_freebsd_advlock(ap)
3486	struct vop_advlock_args /* {
3487		struct vnode *a_vp;
3488		caddr_t  a_id;
3489		int  a_op;
3490		struct flock *a_fl;
3491		int  a_flags;
3492	} */ *ap;
3493{
3494	znode_t	*zp = VTOZ(ap->a_vp);
3495
3496	return (lf_advlock(ap, &(zp->z_lockf), zp->z_phys->zp_size));
3497}
3498
3499struct vop_vector zfs_vnodeops;
3500struct vop_vector zfs_fifoops;
3501
3502struct vop_vector zfs_vnodeops = {
3503	.vop_default =	&default_vnodeops,
3504	.vop_inactive =	zfs_freebsd_inactive,
3505	.vop_reclaim =	zfs_freebsd_reclaim,
3506	.vop_access =	zfs_freebsd_access,
3507#ifdef FREEBSD_NAMECACHE
3508	.vop_lookup =	vfs_cache_lookup,
3509	.vop_cachedlookup = zfs_freebsd_lookup,
3510#else
3511	.vop_lookup =	zfs_freebsd_lookup,
3512#endif
3513	.vop_getattr =	zfs_freebsd_getattr,
3514	.vop_setattr =	zfs_freebsd_setattr,
3515	.vop_create =	zfs_freebsd_create,
3516	.vop_mknod =	zfs_freebsd_create,
3517	.vop_mkdir =	zfs_freebsd_mkdir,
3518	.vop_readdir =	zfs_freebsd_readdir,
3519	.vop_fsync =	zfs_freebsd_fsync,
3520	.vop_open =	zfs_freebsd_open,
3521	.vop_close =	zfs_freebsd_close,
3522	.vop_rmdir =	zfs_freebsd_rmdir,
3523	.vop_ioctl =	zfs_freebsd_ioctl,
3524	.vop_link =	zfs_freebsd_link,
3525	.vop_symlink =	zfs_freebsd_symlink,
3526	.vop_readlink =	zfs_freebsd_readlink,
3527	.vop_read =	zfs_freebsd_read,
3528	.vop_write =	zfs_freebsd_write,
3529	.vop_remove =	zfs_freebsd_remove,
3530	.vop_rename =	zfs_freebsd_rename,
3531	.vop_advlock =	zfs_freebsd_advlock,
3532	.vop_pathconf =	zfs_freebsd_pathconf,
3533	.vop_bmap =	VOP_EOPNOTSUPP,
3534	.vop_fid =	zfs_freebsd_fid,
3535};
3536
3537struct vop_vector zfs_fifoops = {
3538	.vop_default =	&fifo_specops,
3539	.vop_fsync =	VOP_PANIC,
3540	.vop_access =	zfs_freebsd_access,
3541	.vop_getattr =	zfs_freebsd_getattr,
3542	.vop_inactive =	zfs_freebsd_inactive,
3543	.vop_read =	VOP_PANIC,
3544	.vop_reclaim =	zfs_freebsd_reclaim,
3545	.vop_setattr =	zfs_freebsd_setattr,
3546	.vop_write =	VOP_PANIC,
3547	.vop_fid =	zfs_freebsd_fid,
3548};
3549