zfs_vnops.c revision 169195
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26/* Portions Copyright 2007 Jeremy Teo */
27
28#pragma ident	"%Z%%M%	%I%	%E% SMI"
29
30#include <sys/types.h>
31#include <sys/param.h>
32#include <sys/time.h>
33#include <sys/systm.h>
34#include <sys/sysmacros.h>
35#include <sys/resource.h>
36#include <sys/vfs.h>
37#include <sys/vnode.h>
38#include <sys/file.h>
39#include <sys/stat.h>
40#include <sys/kmem.h>
41#include <sys/taskq.h>
42#include <sys/uio.h>
43#include <sys/atomic.h>
44#include <sys/namei.h>
45#include <sys/mman.h>
46#include <sys/cmn_err.h>
47#include <sys/errno.h>
48#include <sys/unistd.h>
49#include <sys/zfs_vfsops.h>
50#include <sys/zfs_dir.h>
51#include <sys/zfs_acl.h>
52#include <sys/zfs_ioctl.h>
53#include <sys/fs/zfs.h>
54#include <sys/dmu.h>
55#include <sys/spa.h>
56#include <sys/txg.h>
57#include <sys/dbuf.h>
58#include <sys/zap.h>
59#include <sys/dirent.h>
60#include <sys/policy.h>
61#include <sys/sunddi.h>
62#include <sys/filio.h>
63#include <sys/zfs_ctldir.h>
64#include <sys/dnlc.h>
65#include <sys/zfs_rlock.h>
66#include <sys/bio.h>
67#include <sys/buf.h>
68#include <sys/sf_buf.h>
69#include <sys/sched.h>
70
71/*
72 * Programming rules.
73 *
74 * Each vnode op performs some logical unit of work.  To do this, the ZPL must
75 * properly lock its in-core state, create a DMU transaction, do the work,
76 * record this work in the intent log (ZIL), commit the DMU transaction,
77 * and wait for the intent log to commit if it is a synchronous operation.
78 * Moreover, the vnode ops must work in both normal and log replay context.
79 * The ordering of events is important to avoid deadlocks and references
80 * to freed memory.  The example below illustrates the following Big Rules:
81 *
82 *  (1) A check must be made in each zfs thread for a mounted file system.
83 *	This is done avoiding races using ZFS_ENTER(zfsvfs).
84 *	A ZFS_EXIT(zfsvfs) is needed before all returns.
85 *
86 *  (2)	VN_RELE() should always be the last thing except for zil_commit()
87 *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
88 *	First, if it's the last reference, the vnode/znode
89 *	can be freed, so the zp may point to freed memory.  Second, the last
90 *	reference will call zfs_zinactive(), which may induce a lot of work --
91 *	pushing cached pages (which acquires range locks) and syncing out
92 *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
93 *	which could deadlock the system if you were already holding one.
94 *
95 *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
96 *	as they can span dmu_tx_assign() calls.
97 *
98 *  (4)	Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign().
99 *	In normal operation, this will be TXG_NOWAIT.  During ZIL replay,
100 *	it will be a specific txg.  Either way, dmu_tx_assign() never blocks.
101 *	This is critical because we don't want to block while holding locks.
102 *	Note, in particular, that if a lock is sometimes acquired before
103 *	the tx assigns, and sometimes after (e.g. z_lock), then failing to
104 *	use a non-blocking assign can deadlock the system.  The scenario:
105 *
106 *	Thread A has grabbed a lock before calling dmu_tx_assign().
107 *	Thread B is in an already-assigned tx, and blocks for this lock.
108 *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
109 *	forever, because the previous txg can't quiesce until B's tx commits.
110 *
111 *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
112 *	then drop all locks, call dmu_tx_wait(), and try again.
113 *
114 *  (5)	If the operation succeeded, generate the intent log entry for it
115 *	before dropping locks.  This ensures that the ordering of events
116 *	in the intent log matches the order in which they actually occurred.
117 *
118 *  (6)	At the end of each vnode op, the DMU tx must always commit,
119 *	regardless of whether there were any errors.
120 *
121 *  (7)	After dropping all locks, invoke zil_commit(zilog, seq, foid)
122 *	to ensure that synchronous semantics are provided when necessary.
123 *
124 * In general, this is how things should be ordered in each vnode op:
125 *
126 *	ZFS_ENTER(zfsvfs);		// exit if unmounted
127 * top:
128 *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may VN_HOLD())
129 *	rw_enter(...);			// grab any other locks you need
130 *	tx = dmu_tx_create(...);	// get DMU tx
131 *	dmu_tx_hold_*();		// hold each object you might modify
132 *	error = dmu_tx_assign(tx, zfsvfs->z_assign);	// try to assign
133 *	if (error) {
134 *		rw_exit(...);		// drop locks
135 *		zfs_dirent_unlock(dl);	// unlock directory entry
136 *		VN_RELE(...);		// release held vnodes
137 *		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
138 *			dmu_tx_wait(tx);
139 *			dmu_tx_abort(tx);
140 *			goto top;
141 *		}
142 *		dmu_tx_abort(tx);	// abort DMU tx
143 *		ZFS_EXIT(zfsvfs);	// finished in zfs
144 *		return (error);		// really out of space
145 *	}
146 *	error = do_real_work();		// do whatever this VOP does
147 *	if (error == 0)
148 *		zfs_log_*(...);		// on success, make ZIL entry
149 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
150 *	rw_exit(...);			// drop locks
151 *	zfs_dirent_unlock(dl);		// unlock directory entry
152 *	VN_RELE(...);			// release held vnodes
153 *	zil_commit(zilog, seq, foid);	// synchronous when necessary
154 *	ZFS_EXIT(zfsvfs);		// finished in zfs
155 *	return (error);			// done, report error
156 */
157/* ARGSUSED */
158static int
159zfs_open(vnode_t **vpp, int flag, cred_t *cr)
160{
161	znode_t	*zp = VTOZ(*vpp);
162
163	/* Keep a count of the synchronous opens in the znode */
164	if (flag & (FSYNC | FDSYNC))
165		atomic_inc_32(&zp->z_sync_cnt);
166	return (0);
167}
168
169/* ARGSUSED */
170static int
171zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
172{
173	znode_t	*zp = VTOZ(vp);
174
175	/* Decrement the synchronous opens in the znode */
176	if (flag & (FSYNC | FDSYNC))
177		atomic_dec_32(&zp->z_sync_cnt);
178
179	/*
180	 * Clean up any locks held by this process on the vp.
181	 */
182	cleanlocks(vp, ddi_get_pid(), 0);
183	cleanshares(vp, ddi_get_pid());
184
185	return (0);
186}
187
188/*
189 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
190 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
191 */
192static int
193zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
194{
195	znode_t	*zp = VTOZ(vp);
196	uint64_t noff = (uint64_t)*off; /* new offset */
197	uint64_t file_sz;
198	int error;
199	boolean_t hole;
200
201	file_sz = zp->z_phys->zp_size;
202	if (noff >= file_sz)  {
203		return (ENXIO);
204	}
205
206	if (cmd == _FIO_SEEK_HOLE)
207		hole = B_TRUE;
208	else
209		hole = B_FALSE;
210
211	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
212
213	/* end of file? */
214	if ((error == ESRCH) || (noff > file_sz)) {
215		/*
216		 * Handle the virtual hole at the end of file.
217		 */
218		if (hole) {
219			*off = file_sz;
220			return (0);
221		}
222		return (ENXIO);
223	}
224
225	if (noff < *off)
226		return (error);
227	*off = noff;
228	return (error);
229}
230
231/* ARGSUSED */
232static int
233zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
234    int *rvalp)
235{
236	offset_t off;
237	int error;
238	zfsvfs_t *zfsvfs;
239
240	switch (com) {
241	    case _FIOFFS:
242		return (0);
243
244		/*
245		 * The following two ioctls are used by bfu.  Faking out,
246		 * necessary to avoid bfu errors.
247		 */
248	    case _FIOGDIO:
249	    case _FIOSDIO:
250		return (0);
251
252	    case _FIO_SEEK_DATA:
253	    case _FIO_SEEK_HOLE:
254		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
255			return (EFAULT);
256
257		zfsvfs = VTOZ(vp)->z_zfsvfs;
258		ZFS_ENTER(zfsvfs);
259
260		/* offset parameter is in/out */
261		error = zfs_holey(vp, com, &off);
262		ZFS_EXIT(zfsvfs);
263		if (error)
264			return (error);
265		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
266			return (EFAULT);
267		return (0);
268	}
269	return (ENOTTY);
270}
271
272/*
273 * When a file is memory mapped, we must keep the IO data synchronized
274 * between the DMU cache and the memory mapped pages.  What this means:
275 *
276 * On Write:	If we find a memory mapped page, we write to *both*
277 *		the page and the dmu buffer.
278 *
279 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
280 *	the file is memory mapped.
281 */
282static int
283mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx)
284{
285	znode_t *zp = VTOZ(vp);
286	objset_t *os = zp->z_zfsvfs->z_os;
287	vm_object_t obj;
288	vm_page_t m;
289	struct sf_buf *sf;
290	int64_t start, off;
291	int len = nbytes;
292	int error = 0;
293	uint64_t dirbytes;
294
295	ASSERT(vp->v_mount != NULL);
296	obj = vp->v_object;
297	ASSERT(obj != NULL);
298
299	start = uio->uio_loffset;
300	off = start & PAGEOFFSET;
301	dirbytes = 0;
302	VM_OBJECT_LOCK(obj);
303	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
304		uint64_t bytes = MIN(PAGESIZE - off, len);
305		uint64_t fsize;
306
307again:
308		if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
309		    vm_page_is_valid(m, (vm_offset_t)off, bytes)) {
310			uint64_t woff;
311			caddr_t va;
312
313			if (vm_page_sleep_if_busy(m, FALSE, "zfsmwb"))
314				goto again;
315			fsize = obj->un_pager.vnp.vnp_size;
316			vm_page_busy(m);
317			vm_page_lock_queues();
318			vm_page_undirty(m);
319			vm_page_unlock_queues();
320			VM_OBJECT_UNLOCK(obj);
321			if (dirbytes > 0) {
322				error = dmu_write_uio(os, zp->z_id, uio,
323				    dirbytes, tx);
324				dirbytes = 0;
325			}
326			if (error == 0) {
327				sched_pin();
328				sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
329				va = (caddr_t)sf_buf_kva(sf);
330				woff = uio->uio_loffset - off;
331				error = uiomove(va + off, bytes, UIO_WRITE, uio);
332				/*
333				 * The uiomove() above could have been partially
334				 * successful, that's why we call dmu_write()
335				 * below unconditionally. The page was marked
336				 * non-dirty above and we would lose the changes
337				 * without doing so. If the uiomove() failed
338				 * entirely, well, we just write what we got
339				 * before one more time.
340				 */
341				dmu_write(os, zp->z_id, woff,
342				    MIN(PAGESIZE, fsize - woff), va, tx);
343				sf_buf_free(sf);
344				sched_unpin();
345			}
346			VM_OBJECT_LOCK(obj);
347			vm_page_wakeup(m);
348		} else {
349			dirbytes += bytes;
350		}
351		len -= bytes;
352		off = 0;
353		if (error)
354			break;
355	}
356	VM_OBJECT_UNLOCK(obj);
357	if (error == 0 && dirbytes > 0)
358		error = dmu_write_uio(os, zp->z_id, uio, dirbytes, tx);
359	return (error);
360}
361
362/*
363 * When a file is memory mapped, we must keep the IO data synchronized
364 * between the DMU cache and the memory mapped pages.  What this means:
365 *
366 * On Read:	We "read" preferentially from memory mapped pages,
367 *		else we default from the dmu buffer.
368 *
369 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
370 *	the file is memory mapped.
371 */
372static int
373mappedread(vnode_t *vp, int nbytes, uio_t *uio)
374{
375	znode_t *zp = VTOZ(vp);
376	objset_t *os = zp->z_zfsvfs->z_os;
377	vm_object_t obj;
378	vm_page_t m;
379	struct sf_buf *sf;
380	int64_t start, off;
381	caddr_t va;
382	int len = nbytes;
383	int error = 0;
384	uint64_t dirbytes;
385
386	ASSERT(vp->v_mount != NULL);
387	obj = vp->v_object;
388	ASSERT(obj != NULL);
389
390	start = uio->uio_loffset;
391	off = start & PAGEOFFSET;
392	dirbytes = 0;
393	VM_OBJECT_LOCK(obj);
394	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
395		uint64_t bytes = MIN(PAGESIZE - off, len);
396
397again:
398		if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
399		    vm_page_is_valid(m, (vm_offset_t)off, bytes)) {
400			if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb"))
401				goto again;
402			vm_page_busy(m);
403			VM_OBJECT_UNLOCK(obj);
404			if (dirbytes > 0) {
405				error = dmu_read_uio(os, zp->z_id, uio,
406				    dirbytes);
407				dirbytes = 0;
408			}
409			if (error == 0) {
410				sched_pin();
411				sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
412				va = (caddr_t)sf_buf_kva(sf);
413				error = uiomove(va + off, bytes, UIO_READ, uio);
414				sf_buf_free(sf);
415				sched_unpin();
416			}
417			VM_OBJECT_LOCK(obj);
418			vm_page_wakeup(m);
419		} else if (m != NULL && uio->uio_segflg == UIO_NOCOPY) {
420			/*
421			 * The code below is here to make sendfile(2) work
422			 * correctly with ZFS. As pointed out by ups@
423			 * sendfile(2) should be changed to use VOP_GETPAGES(),
424			 * but it pessimize performance of sendfile/UFS, that's
425			 * why I handle this special case in ZFS code.
426			 */
427			if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb"))
428				goto again;
429			vm_page_busy(m);
430			VM_OBJECT_UNLOCK(obj);
431			if (dirbytes > 0) {
432				error = dmu_read_uio(os, zp->z_id, uio,
433				    dirbytes);
434				dirbytes = 0;
435			}
436			if (error == 0) {
437				sched_pin();
438				sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
439				va = (caddr_t)sf_buf_kva(sf);
440				error = dmu_read(os, zp->z_id, start + off,
441				    bytes, (void *)(va + off));
442				sf_buf_free(sf);
443				sched_unpin();
444			}
445			VM_OBJECT_LOCK(obj);
446			vm_page_wakeup(m);
447			if (error == 0)
448				uio->uio_resid -= bytes;
449		} else {
450			dirbytes += bytes;
451		}
452		len -= bytes;
453		off = 0;
454		if (error)
455			break;
456	}
457	VM_OBJECT_UNLOCK(obj);
458	if (error == 0 && dirbytes > 0)
459		error = dmu_read_uio(os, zp->z_id, uio, dirbytes);
460	return (error);
461}
462
463offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
464
465/*
466 * Read bytes from specified file into supplied buffer.
467 *
468 *	IN:	vp	- vnode of file to be read from.
469 *		uio	- structure supplying read location, range info,
470 *			  and return buffer.
471 *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
472 *		cr	- credentials of caller.
473 *
474 *	OUT:	uio	- updated offset and range, buffer filled.
475 *
476 *	RETURN:	0 if success
477 *		error code if failure
478 *
479 * Side Effects:
480 *	vp - atime updated if byte count > 0
481 */
482/* ARGSUSED */
483static int
484zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
485{
486	znode_t		*zp = VTOZ(vp);
487	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
488	objset_t	*os = zfsvfs->z_os;
489	ssize_t		n, nbytes;
490	int		error;
491	rl_t		*rl;
492
493	ZFS_ENTER(zfsvfs);
494
495	/*
496	 * Validate file offset
497	 */
498	if (uio->uio_loffset < (offset_t)0) {
499		ZFS_EXIT(zfsvfs);
500		return (EINVAL);
501	}
502
503	/*
504	 * Fasttrack empty reads
505	 */
506	if (uio->uio_resid == 0) {
507		ZFS_EXIT(zfsvfs);
508		return (0);
509	}
510
511	/*
512	 * Check for mandatory locks
513	 */
514	if (MANDMODE((mode_t)zp->z_phys->zp_mode)) {
515		if (error = chklock(vp, FREAD,
516		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
517			ZFS_EXIT(zfsvfs);
518			return (error);
519		}
520	}
521
522	/*
523	 * If we're in FRSYNC mode, sync out this znode before reading it.
524	 */
525	if (ioflag & FRSYNC)
526		zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
527
528	/*
529	 * Lock the range against changes.
530	 */
531	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
532
533	/*
534	 * If we are reading past end-of-file we can skip
535	 * to the end; but we might still need to set atime.
536	 */
537	if (uio->uio_loffset >= zp->z_phys->zp_size) {
538		error = 0;
539		goto out;
540	}
541
542	ASSERT(uio->uio_loffset < zp->z_phys->zp_size);
543	n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset);
544
545	while (n > 0) {
546		nbytes = MIN(n, zfs_read_chunk_size -
547		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
548
549		if (vn_has_cached_data(vp))
550			error = mappedread(vp, nbytes, uio);
551		else
552			error = dmu_read_uio(os, zp->z_id, uio, nbytes);
553		if (error)
554			break;
555
556		n -= nbytes;
557	}
558
559out:
560	zfs_range_unlock(rl);
561
562	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
563	ZFS_EXIT(zfsvfs);
564	return (error);
565}
566
567/*
568 * Fault in the pages of the first n bytes specified by the uio structure.
569 * 1 byte in each page is touched and the uio struct is unmodified.
570 * Any error will exit this routine as this is only a best
571 * attempt to get the pages resident. This is a copy of ufs_trans_touch().
572 */
573static void
574zfs_prefault_write(ssize_t n, struct uio *uio)
575{
576	struct iovec *iov;
577	ulong_t cnt, incr;
578	caddr_t p;
579
580	if (uio->uio_segflg != UIO_USERSPACE)
581		return;
582
583	iov = uio->uio_iov;
584
585	while (n) {
586		cnt = MIN(iov->iov_len, n);
587		if (cnt == 0) {
588			/* empty iov entry */
589			iov++;
590			continue;
591		}
592		n -= cnt;
593		/*
594		 * touch each page in this segment.
595		 */
596		p = iov->iov_base;
597		while (cnt) {
598			if (fubyte(p) == -1)
599				return;
600			incr = MIN(cnt, PAGESIZE);
601			p += incr;
602			cnt -= incr;
603		}
604		/*
605		 * touch the last byte in case it straddles a page.
606		 */
607		p--;
608		if (fubyte(p) == -1)
609			return;
610		iov++;
611	}
612}
613
614/*
615 * Write the bytes to a file.
616 *
617 *	IN:	vp	- vnode of file to be written to.
618 *		uio	- structure supplying write location, range info,
619 *			  and data buffer.
620 *		ioflag	- IO_APPEND flag set if in append mode.
621 *		cr	- credentials of caller.
622 *
623 *	OUT:	uio	- updated offset and range.
624 *
625 *	RETURN:	0 if success
626 *		error code if failure
627 *
628 * Timestamps:
629 *	vp - ctime|mtime updated if byte count > 0
630 */
631/* ARGSUSED */
632static int
633zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
634{
635	znode_t		*zp = VTOZ(vp);
636	rlim64_t	limit = MAXOFFSET_T;
637	ssize_t		start_resid = uio->uio_resid;
638	ssize_t		tx_bytes;
639	uint64_t	end_size;
640	dmu_tx_t	*tx;
641	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
642	zilog_t		*zilog = zfsvfs->z_log;
643	offset_t	woff;
644	ssize_t		n, nbytes;
645	rl_t		*rl;
646	int		max_blksz = zfsvfs->z_max_blksz;
647	int		error;
648
649	/*
650	 * Fasttrack empty write
651	 */
652	n = start_resid;
653	if (n == 0)
654		return (0);
655
656	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
657		limit = MAXOFFSET_T;
658
659	ZFS_ENTER(zfsvfs);
660
661	/*
662	 * Pre-fault the pages to ensure slow (eg NFS) pages
663	 * don't hold up txg.
664	 */
665	zfs_prefault_write(n, uio);
666
667	/*
668	 * If in append mode, set the io offset pointer to eof.
669	 */
670	if (ioflag & IO_APPEND) {
671		/*
672		 * Range lock for a file append:
673		 * The value for the start of range will be determined by
674		 * zfs_range_lock() (to guarantee append semantics).
675		 * If this write will cause the block size to increase,
676		 * zfs_range_lock() will lock the entire file, so we must
677		 * later reduce the range after we grow the block size.
678		 */
679		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
680		if (rl->r_len == UINT64_MAX) {
681			/* overlocked, zp_size can't change */
682			woff = uio->uio_loffset = zp->z_phys->zp_size;
683		} else {
684			woff = uio->uio_loffset = rl->r_off;
685		}
686	} else {
687		woff = uio->uio_loffset;
688		/*
689		 * Validate file offset
690		 */
691		if (woff < 0) {
692			ZFS_EXIT(zfsvfs);
693			return (EINVAL);
694		}
695
696		/*
697		 * If we need to grow the block size then zfs_range_lock()
698		 * will lock a wider range than we request here.
699		 * Later after growing the block size we reduce the range.
700		 */
701		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
702	}
703
704	if (woff >= limit) {
705		zfs_range_unlock(rl);
706		ZFS_EXIT(zfsvfs);
707		return (EFBIG);
708	}
709
710	if ((woff + n) > limit || woff > (limit - n))
711		n = limit - woff;
712
713	/*
714	 * Check for mandatory locks
715	 */
716	if (MANDMODE((mode_t)zp->z_phys->zp_mode) &&
717	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
718		zfs_range_unlock(rl);
719		ZFS_EXIT(zfsvfs);
720		return (error);
721	}
722	end_size = MAX(zp->z_phys->zp_size, woff + n);
723
724	/*
725	 * Write the file in reasonable size chunks.  Each chunk is written
726	 * in a separate transaction; this keeps the intent log records small
727	 * and allows us to do more fine-grained space accounting.
728	 */
729	while (n > 0) {
730		/*
731		 * Start a transaction.
732		 */
733		woff = uio->uio_loffset;
734		tx = dmu_tx_create(zfsvfs->z_os);
735		dmu_tx_hold_bonus(tx, zp->z_id);
736		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
737		error = dmu_tx_assign(tx, zfsvfs->z_assign);
738		if (error) {
739			if (error == ERESTART &&
740			    zfsvfs->z_assign == TXG_NOWAIT) {
741				dmu_tx_wait(tx);
742				dmu_tx_abort(tx);
743				continue;
744			}
745			dmu_tx_abort(tx);
746			break;
747		}
748
749		/*
750		 * If zfs_range_lock() over-locked we grow the blocksize
751		 * and then reduce the lock range.  This will only happen
752		 * on the first iteration since zfs_range_reduce() will
753		 * shrink down r_len to the appropriate size.
754		 */
755		if (rl->r_len == UINT64_MAX) {
756			uint64_t new_blksz;
757
758			if (zp->z_blksz > max_blksz) {
759				ASSERT(!ISP2(zp->z_blksz));
760				new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
761			} else {
762				new_blksz = MIN(end_size, max_blksz);
763			}
764			zfs_grow_blocksize(zp, new_blksz, tx);
765			zfs_range_reduce(rl, woff, n);
766		}
767
768		/*
769		 * XXX - should we really limit each write to z_max_blksz?
770		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
771		 */
772		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
773		rw_enter(&zp->z_map_lock, RW_READER);
774
775		if (woff + nbytes > zp->z_phys->zp_size)
776			vnode_pager_setsize(vp, woff + nbytes);
777
778		tx_bytes = uio->uio_resid;
779		if (vn_has_cached_data(vp)) {
780			rw_exit(&zp->z_map_lock);
781			error = mappedwrite(vp, nbytes, uio, tx);
782		} else {
783			error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
784			    uio, nbytes, tx);
785			rw_exit(&zp->z_map_lock);
786		}
787		tx_bytes -= uio->uio_resid;
788
789		/*
790		 * If we made no progress, we're done.  If we made even
791		 * partial progress, update the znode and ZIL accordingly.
792		 */
793		if (tx_bytes == 0) {
794			dmu_tx_commit(tx);
795			ASSERT(error != 0);
796			break;
797		}
798
799		/*
800		 * Clear Set-UID/Set-GID bits on successful write if not
801		 * privileged and at least one of the excute bits is set.
802		 *
803		 * It would be nice to to this after all writes have
804		 * been done, but that would still expose the ISUID/ISGID
805		 * to another app after the partial write is committed.
806		 */
807		mutex_enter(&zp->z_acl_lock);
808		if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) |
809		    (S_IXUSR >> 6))) != 0 &&
810		    (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 &&
811		    secpolicy_vnode_setid_retain(cr,
812		    (zp->z_phys->zp_mode & S_ISUID) != 0 &&
813		    zp->z_phys->zp_uid == 0) != 0) {
814			    zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID);
815		}
816		mutex_exit(&zp->z_acl_lock);
817
818		/*
819		 * Update time stamp.  NOTE: This marks the bonus buffer as
820		 * dirty, so we don't have to do it again for zp_size.
821		 */
822		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
823
824		/*
825		 * Update the file size (zp_size) if it has changed;
826		 * account for possible concurrent updates.
827		 */
828		while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset)
829			(void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
830			    uio->uio_loffset);
831		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
832		dmu_tx_commit(tx);
833
834		if (error != 0)
835			break;
836		ASSERT(tx_bytes == nbytes);
837		n -= nbytes;
838	}
839
840	zfs_range_unlock(rl);
841
842	/*
843	 * If we're in replay mode, or we made no progress, return error.
844	 * Otherwise, it's at least a partial write, so it's successful.
845	 */
846	if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) {
847		ZFS_EXIT(zfsvfs);
848		return (error);
849	}
850
851	if (ioflag & (FSYNC | FDSYNC))
852		zil_commit(zilog, zp->z_last_itx, zp->z_id);
853
854	ZFS_EXIT(zfsvfs);
855	return (0);
856}
857
858void
859zfs_get_done(dmu_buf_t *db, void *vzgd)
860{
861	zgd_t *zgd = (zgd_t *)vzgd;
862	rl_t *rl = zgd->zgd_rl;
863	vnode_t *vp = ZTOV(rl->r_zp);
864	int vfslocked;
865
866	vfslocked = VFS_LOCK_GIANT(vp->v_vfsp);
867	dmu_buf_rele(db, vzgd);
868	zfs_range_unlock(rl);
869	VN_RELE(vp);
870	zil_add_vdev(zgd->zgd_zilog, DVA_GET_VDEV(BP_IDENTITY(zgd->zgd_bp)));
871	kmem_free(zgd, sizeof (zgd_t));
872	VFS_UNLOCK_GIANT(vfslocked);
873}
874
875/*
876 * Get data to generate a TX_WRITE intent log record.
877 */
878int
879zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
880{
881	zfsvfs_t *zfsvfs = arg;
882	objset_t *os = zfsvfs->z_os;
883	znode_t *zp;
884	uint64_t off = lr->lr_offset;
885	dmu_buf_t *db;
886	rl_t *rl;
887	zgd_t *zgd;
888	int dlen = lr->lr_length;		/* length of user data */
889	int error = 0;
890
891	ASSERT(zio);
892	ASSERT(dlen != 0);
893
894	/*
895	 * Nothing to do if the file has been removed
896	 */
897	if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
898		return (ENOENT);
899	if (zp->z_unlinked) {
900		VN_RELE(ZTOV(zp));
901		return (ENOENT);
902	}
903
904	/*
905	 * Write records come in two flavors: immediate and indirect.
906	 * For small writes it's cheaper to store the data with the
907	 * log record (immediate); for large writes it's cheaper to
908	 * sync the data and get a pointer to it (indirect) so that
909	 * we don't have to write the data twice.
910	 */
911	if (buf != NULL) { /* immediate write */
912		rl = zfs_range_lock(zp, off, dlen, RL_READER);
913		/* test for truncation needs to be done while range locked */
914		if (off >= zp->z_phys->zp_size) {
915			error = ENOENT;
916			goto out;
917		}
918		VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf));
919	} else { /* indirect write */
920		uint64_t boff; /* block starting offset */
921
922		/*
923		 * Have to lock the whole block to ensure when it's
924		 * written out and it's checksum is being calculated
925		 * that no one can change the data. We need to re-check
926		 * blocksize after we get the lock in case it's changed!
927		 */
928		for (;;) {
929			if (ISP2(zp->z_blksz)) {
930				boff = P2ALIGN_TYPED(off, zp->z_blksz,
931				    uint64_t);
932			} else {
933				boff = 0;
934			}
935			dlen = zp->z_blksz;
936			rl = zfs_range_lock(zp, boff, dlen, RL_READER);
937			if (zp->z_blksz == dlen)
938				break;
939			zfs_range_unlock(rl);
940		}
941		/* test for truncation needs to be done while range locked */
942		if (off >= zp->z_phys->zp_size) {
943			error = ENOENT;
944			goto out;
945		}
946		zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
947		zgd->zgd_rl = rl;
948		zgd->zgd_zilog = zfsvfs->z_log;
949		zgd->zgd_bp = &lr->lr_blkptr;
950		VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db));
951		ASSERT(boff == db->db_offset);
952		lr->lr_blkoff = off - boff;
953		error = dmu_sync(zio, db, &lr->lr_blkptr,
954		    lr->lr_common.lrc_txg, zfs_get_done, zgd);
955		ASSERT(error == EEXIST || lr->lr_length <= zp->z_blksz);
956		if (error == 0) {
957			zil_add_vdev(zfsvfs->z_log,
958			    DVA_GET_VDEV(BP_IDENTITY(&lr->lr_blkptr)));
959		}
960		/*
961		 * If we get EINPROGRESS, then we need to wait for a
962		 * write IO initiated by dmu_sync() to complete before
963		 * we can release this dbuf.  We will finish everything
964		 * up in the zfs_get_done() callback.
965		 */
966		if (error == EINPROGRESS)
967			return (0);
968		dmu_buf_rele(db, zgd);
969		kmem_free(zgd, sizeof (zgd_t));
970	}
971out:
972	zfs_range_unlock(rl);
973	VN_RELE(ZTOV(zp));
974	return (error);
975}
976
977/*ARGSUSED*/
978static int
979zfs_access(vnode_t *vp, int mode, int flags, cred_t *cr)
980{
981	znode_t *zp = VTOZ(vp);
982	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
983	int error;
984
985	ZFS_ENTER(zfsvfs);
986	error = zfs_zaccess_rwx(zp, mode, cr);
987	ZFS_EXIT(zfsvfs);
988	return (error);
989}
990
991/*
992 * Lookup an entry in a directory, or an extended attribute directory.
993 * If it exists, return a held vnode reference for it.
994 *
995 *	IN:	dvp	- vnode of directory to search.
996 *		nm	- name of entry to lookup.
997 *		pnp	- full pathname to lookup [UNUSED].
998 *		flags	- LOOKUP_XATTR set if looking for an attribute.
999 *		rdir	- root directory vnode [UNUSED].
1000 *		cr	- credentials of caller.
1001 *
1002 *	OUT:	vpp	- vnode of located entry, NULL if not found.
1003 *
1004 *	RETURN:	0 if success
1005 *		error code if failure
1006 *
1007 * Timestamps:
1008 *	NA
1009 */
1010/* ARGSUSED */
1011static int
1012zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
1013    int nameiop, cred_t *cr, kthread_t *td)
1014{
1015
1016	znode_t *zdp = VTOZ(dvp);
1017	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1018	int	error;
1019
1020	ZFS_ENTER(zfsvfs);
1021
1022	*vpp = NULL;
1023
1024#ifdef TODO
1025	if (flags & LOOKUP_XATTR) {
1026		/*
1027		 * If the xattr property is off, refuse the lookup request.
1028		 */
1029		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
1030			ZFS_EXIT(zfsvfs);
1031			return (EINVAL);
1032		}
1033
1034		/*
1035		 * We don't allow recursive attributes..
1036		 * Maybe someday we will.
1037		 */
1038		if (zdp->z_phys->zp_flags & ZFS_XATTR) {
1039			ZFS_EXIT(zfsvfs);
1040			return (EINVAL);
1041		}
1042
1043		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
1044			ZFS_EXIT(zfsvfs);
1045			return (error);
1046		}
1047
1048		/*
1049		 * Do we have permission to get into attribute directory?
1050		 */
1051
1052		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, cr)) {
1053			VN_RELE(*vpp);
1054		}
1055
1056		ZFS_EXIT(zfsvfs);
1057		return (error);
1058	}
1059#endif	/* TODO */
1060
1061	if (dvp->v_type != VDIR) {
1062		ZFS_EXIT(zfsvfs);
1063		return (ENOTDIR);
1064	}
1065
1066	/*
1067	 * Check accessibility of directory.
1068	 */
1069
1070	if (error = zfs_zaccess(zdp, ACE_EXECUTE, cr)) {
1071		ZFS_EXIT(zfsvfs);
1072		return (error);
1073	}
1074
1075	if ((error = zfs_dirlook(zdp, nm, vpp)) == 0) {
1076
1077		/*
1078		 * Convert device special files
1079		 */
1080		if (IS_DEVVP(*vpp)) {
1081			vnode_t	*svp;
1082
1083			svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
1084			VN_RELE(*vpp);
1085			if (svp == NULL)
1086				error = ENOSYS;
1087			else
1088				*vpp = svp;
1089		}
1090	}
1091
1092	ZFS_EXIT(zfsvfs);
1093
1094	/* Translate errors and add SAVENAME when needed. */
1095	if (cnp->cn_flags & ISLASTCN) {
1096		switch (nameiop) {
1097		case CREATE:
1098		case RENAME:
1099			if (error == ENOENT) {
1100				error = EJUSTRETURN;
1101				cnp->cn_flags |= SAVENAME;
1102				break;
1103			}
1104			/* FALLTHROUGH */
1105		case DELETE:
1106			if (error == 0)
1107				cnp->cn_flags |= SAVENAME;
1108			break;
1109		}
1110	}
1111	if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) {
1112		if (cnp->cn_flags & ISDOTDOT)
1113			VOP_UNLOCK(dvp, 0, td);
1114		error = vn_lock(*vpp, LK_EXCLUSIVE, td);
1115		if (cnp->cn_flags & ISDOTDOT)
1116			vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td);
1117		if (error != 0) {
1118			VN_RELE(*vpp);
1119			*vpp = NULL;
1120			return (error);
1121		}
1122	}
1123
1124#ifdef FREEBSD_NAMECACHE
1125	/*
1126	 * Insert name into cache (as non-existent) if appropriate.
1127	 */
1128	if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
1129		cache_enter(dvp, *vpp, cnp);
1130	/*
1131	 * Insert name into cache if appropriate.
1132	 */
1133	if (error == 0 && (cnp->cn_flags & MAKEENTRY)) {
1134		if (!(cnp->cn_flags & ISLASTCN) ||
1135		    (nameiop != DELETE && nameiop != RENAME)) {
1136			cache_enter(dvp, *vpp, cnp);
1137		}
1138	}
1139#endif
1140
1141	return (error);
1142}
1143
1144/*
1145 * Attempt to create a new entry in a directory.  If the entry
1146 * already exists, truncate the file if permissible, else return
1147 * an error.  Return the vp of the created or trunc'd file.
1148 *
1149 *	IN:	dvp	- vnode of directory to put new file entry in.
1150 *		name	- name of new file entry.
1151 *		vap	- attributes of new file.
1152 *		excl	- flag indicating exclusive or non-exclusive mode.
1153 *		mode	- mode to open file with.
1154 *		cr	- credentials of caller.
1155 *		flag	- large file flag [UNUSED].
1156 *
1157 *	OUT:	vpp	- vnode of created or trunc'd entry.
1158 *
1159 *	RETURN:	0 if success
1160 *		error code if failure
1161 *
1162 * Timestamps:
1163 *	dvp - ctime|mtime updated if new entry created
1164 *	 vp - ctime|mtime always, atime if new
1165 */
1166/* ARGSUSED */
1167static int
1168zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
1169    vnode_t **vpp, cred_t *cr, kthread_t *td)
1170{
1171	znode_t		*zp, *dzp = VTOZ(dvp);
1172	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1173	zilog_t		*zilog = zfsvfs->z_log;
1174	objset_t	*os = zfsvfs->z_os;
1175	zfs_dirlock_t	*dl;
1176	dmu_tx_t	*tx;
1177	int		error;
1178	uint64_t	zoid;
1179
1180	ZFS_ENTER(zfsvfs);
1181
1182top:
1183	*vpp = NULL;
1184
1185	if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr))
1186		vap->va_mode &= ~VSVTX;
1187
1188	if (*name == '\0') {
1189		/*
1190		 * Null component name refers to the directory itself.
1191		 */
1192		VN_HOLD(dvp);
1193		zp = dzp;
1194		dl = NULL;
1195		error = 0;
1196	} else {
1197		/* possible VN_HOLD(zp) */
1198		if (error = zfs_dirent_lock(&dl, dzp, name, &zp, 0)) {
1199			if (strcmp(name, "..") == 0)
1200				error = EISDIR;
1201			ZFS_EXIT(zfsvfs);
1202			return (error);
1203		}
1204	}
1205
1206	zoid = zp ? zp->z_id : -1ULL;
1207
1208	if (zp == NULL) {
1209		/*
1210		 * Create a new file object and update the directory
1211		 * to reference it.
1212		 */
1213		if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
1214			goto out;
1215		}
1216
1217		/*
1218		 * We only support the creation of regular files in
1219		 * extended attribute directories.
1220		 */
1221		if ((dzp->z_phys->zp_flags & ZFS_XATTR) &&
1222		    (vap->va_type != VREG)) {
1223			error = EINVAL;
1224			goto out;
1225		}
1226
1227		tx = dmu_tx_create(os);
1228		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
1229		dmu_tx_hold_bonus(tx, dzp->z_id);
1230		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1231		if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
1232			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1233			    0, SPA_MAXBLOCKSIZE);
1234		error = dmu_tx_assign(tx, zfsvfs->z_assign);
1235		if (error) {
1236			zfs_dirent_unlock(dl);
1237			if (error == ERESTART &&
1238			    zfsvfs->z_assign == TXG_NOWAIT) {
1239				dmu_tx_wait(tx);
1240				dmu_tx_abort(tx);
1241				goto top;
1242			}
1243			dmu_tx_abort(tx);
1244			ZFS_EXIT(zfsvfs);
1245			return (error);
1246		}
1247		zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);
1248		ASSERT(zp->z_id == zoid);
1249		(void) zfs_link_create(dl, zp, tx, ZNEW);
1250		zfs_log_create(zilog, tx, TX_CREATE, dzp, zp, name);
1251		dmu_tx_commit(tx);
1252	} else {
1253		/*
1254		 * A directory entry already exists for this name.
1255		 */
1256		/*
1257		 * Can't truncate an existing file if in exclusive mode.
1258		 */
1259		if (excl == EXCL) {
1260			error = EEXIST;
1261			goto out;
1262		}
1263		/*
1264		 * Can't open a directory for writing.
1265		 */
1266		if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
1267			error = EISDIR;
1268			goto out;
1269		}
1270		/*
1271		 * Verify requested access to file.
1272		 */
1273		if (mode && (error = zfs_zaccess_rwx(zp, mode, cr))) {
1274			goto out;
1275		}
1276
1277		mutex_enter(&dzp->z_lock);
1278		dzp->z_seq++;
1279		mutex_exit(&dzp->z_lock);
1280
1281		/*
1282		 * Truncate regular files if requested.
1283		 */
1284		if ((ZTOV(zp)->v_type == VREG) &&
1285		    (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
1286			error = zfs_freesp(zp, 0, 0, mode, TRUE);
1287			if (error == ERESTART &&
1288			    zfsvfs->z_assign == TXG_NOWAIT) {
1289				/* NB: we already did dmu_tx_wait() */
1290				zfs_dirent_unlock(dl);
1291				VN_RELE(ZTOV(zp));
1292				goto top;
1293			}
1294		}
1295	}
1296out:
1297
1298	if (error == 0) {
1299		*vpp = ZTOV(zp);
1300		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);
1301	}
1302
1303	if (dl)
1304		zfs_dirent_unlock(dl);
1305
1306	if (error) {
1307		if (zp)
1308			VN_RELE(ZTOV(zp));
1309	} else {
1310		*vpp = ZTOV(zp);
1311		/*
1312		 * If vnode is for a device return a specfs vnode instead.
1313		 */
1314		if (IS_DEVVP(*vpp)) {
1315			struct vnode *svp;
1316
1317			svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
1318			VN_RELE(*vpp);
1319			if (svp == NULL) {
1320				error = ENOSYS;
1321			}
1322			*vpp = svp;
1323		}
1324	}
1325
1326	ZFS_EXIT(zfsvfs);
1327	return (error);
1328}
1329
1330/*
1331 * Remove an entry from a directory.
1332 *
1333 *	IN:	dvp	- vnode of directory to remove entry from.
1334 *		name	- name of entry to remove.
1335 *		cr	- credentials of caller.
1336 *
1337 *	RETURN:	0 if success
1338 *		error code if failure
1339 *
1340 * Timestamps:
1341 *	dvp - ctime|mtime
1342 *	 vp - ctime (if nlink > 0)
1343 */
1344static int
1345zfs_remove(vnode_t *dvp, char *name, cred_t *cr)
1346{
1347	znode_t		*zp, *dzp = VTOZ(dvp);
1348	znode_t		*xzp = NULL;
1349	vnode_t		*vp;
1350	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1351	zilog_t		*zilog = zfsvfs->z_log;
1352	uint64_t	acl_obj, xattr_obj;
1353	zfs_dirlock_t	*dl;
1354	dmu_tx_t	*tx;
1355	boolean_t	may_delete_now, delete_now = FALSE;
1356	boolean_t	unlinked;
1357	int		error;
1358
1359	ZFS_ENTER(zfsvfs);
1360
1361top:
1362	/*
1363	 * Attempt to lock directory; fail if entry doesn't exist.
1364	 */
1365	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) {
1366		ZFS_EXIT(zfsvfs);
1367		return (error);
1368	}
1369
1370	vp = ZTOV(zp);
1371
1372	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1373		goto out;
1374	}
1375
1376	/*
1377	 * Need to use rmdir for removing directories.
1378	 */
1379	if (vp->v_type == VDIR) {
1380		error = EPERM;
1381		goto out;
1382	}
1383
1384	vnevent_remove(vp);
1385
1386	dnlc_remove(dvp, name);
1387
1388	may_delete_now = FALSE;
1389
1390	/*
1391	 * We may delete the znode now, or we may put it in the unlinked set;
1392	 * it depends on whether we're the last link, and on whether there are
1393	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
1394	 * allow for either case.
1395	 */
1396	tx = dmu_tx_create(zfsvfs->z_os);
1397	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1398	dmu_tx_hold_bonus(tx, zp->z_id);
1399	if (may_delete_now)
1400		dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
1401
1402	/* are there any extended attributes? */
1403	if ((xattr_obj = zp->z_phys->zp_xattr) != 0) {
1404		/* XXX - do we need this if we are deleting? */
1405		dmu_tx_hold_bonus(tx, xattr_obj);
1406	}
1407
1408	/* are there any additional acls */
1409	if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 &&
1410	    may_delete_now)
1411		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
1412
1413	/* charge as an update -- would be nice not to charge at all */
1414	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1415
1416	error = dmu_tx_assign(tx, zfsvfs->z_assign);
1417	if (error) {
1418		zfs_dirent_unlock(dl);
1419		VN_RELE(vp);
1420		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
1421			dmu_tx_wait(tx);
1422			dmu_tx_abort(tx);
1423			goto top;
1424		}
1425		dmu_tx_abort(tx);
1426		ZFS_EXIT(zfsvfs);
1427		return (error);
1428	}
1429
1430	/*
1431	 * Remove the directory entry.
1432	 */
1433	error = zfs_link_destroy(dl, zp, tx, 0, &unlinked);
1434
1435	if (error) {
1436		dmu_tx_commit(tx);
1437		goto out;
1438	}
1439
1440	if (0 && unlinked) {
1441		VI_LOCK(vp);
1442		delete_now = may_delete_now &&
1443		    vp->v_count == 1 && !vn_has_cached_data(vp) &&
1444		    zp->z_phys->zp_xattr == xattr_obj &&
1445		    zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj;
1446		VI_UNLOCK(vp);
1447	}
1448
1449	if (delete_now) {
1450		if (zp->z_phys->zp_xattr) {
1451			error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
1452			ASSERT3U(error, ==, 0);
1453			ASSERT3U(xzp->z_phys->zp_links, ==, 2);
1454			dmu_buf_will_dirty(xzp->z_dbuf, tx);
1455			mutex_enter(&xzp->z_lock);
1456			xzp->z_unlinked = 1;
1457			xzp->z_phys->zp_links = 0;
1458			mutex_exit(&xzp->z_lock);
1459			zfs_unlinked_add(xzp, tx);
1460			zp->z_phys->zp_xattr = 0; /* probably unnecessary */
1461		}
1462		mutex_enter(&zp->z_lock);
1463		VI_LOCK(vp);
1464		vp->v_count--;
1465		ASSERT3U(vp->v_count, ==, 0);
1466		VI_UNLOCK(vp);
1467		mutex_exit(&zp->z_lock);
1468		zfs_znode_delete(zp, tx);
1469		VFS_RELE(zfsvfs->z_vfs);
1470	} else if (unlinked) {
1471		zfs_unlinked_add(zp, tx);
1472	}
1473
1474	zfs_log_remove(zilog, tx, TX_REMOVE, dzp, name);
1475
1476	dmu_tx_commit(tx);
1477out:
1478	zfs_dirent_unlock(dl);
1479
1480	if (!delete_now) {
1481		VN_RELE(vp);
1482	} else if (xzp) {
1483		/* this rele delayed to prevent nesting transactions */
1484		VN_RELE(ZTOV(xzp));
1485	}
1486
1487	ZFS_EXIT(zfsvfs);
1488	return (error);
1489}
1490
1491/*
1492 * Create a new directory and insert it into dvp using the name
1493 * provided.  Return a pointer to the inserted directory.
1494 *
1495 *	IN:	dvp	- vnode of directory to add subdir to.
1496 *		dirname	- name of new directory.
1497 *		vap	- attributes of new directory.
1498 *		cr	- credentials of caller.
1499 *
1500 *	OUT:	vpp	- vnode of created directory.
1501 *
1502 *	RETURN:	0 if success
1503 *		error code if failure
1504 *
1505 * Timestamps:
1506 *	dvp - ctime|mtime updated
1507 *	 vp - ctime|mtime|atime updated
1508 */
1509static int
1510zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
1511{
1512	znode_t		*zp, *dzp = VTOZ(dvp);
1513	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1514	zilog_t		*zilog = zfsvfs->z_log;
1515	zfs_dirlock_t	*dl;
1516	uint64_t	zoid = 0;
1517	dmu_tx_t	*tx;
1518	int		error;
1519
1520	ASSERT(vap->va_type == VDIR);
1521
1522	ZFS_ENTER(zfsvfs);
1523
1524	if (dzp->z_phys->zp_flags & ZFS_XATTR) {
1525		ZFS_EXIT(zfsvfs);
1526		return (EINVAL);
1527	}
1528top:
1529	*vpp = NULL;
1530
1531	/*
1532	 * First make sure the new directory doesn't exist.
1533	 */
1534	if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, ZNEW)) {
1535		ZFS_EXIT(zfsvfs);
1536		return (error);
1537	}
1538
1539	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, cr)) {
1540		zfs_dirent_unlock(dl);
1541		ZFS_EXIT(zfsvfs);
1542		return (error);
1543	}
1544
1545	/*
1546	 * Add a new entry to the directory.
1547	 */
1548	tx = dmu_tx_create(zfsvfs->z_os);
1549	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
1550	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
1551	if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
1552		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1553		    0, SPA_MAXBLOCKSIZE);
1554	error = dmu_tx_assign(tx, zfsvfs->z_assign);
1555	if (error) {
1556		zfs_dirent_unlock(dl);
1557		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
1558			dmu_tx_wait(tx);
1559			dmu_tx_abort(tx);
1560			goto top;
1561		}
1562		dmu_tx_abort(tx);
1563		ZFS_EXIT(zfsvfs);
1564		return (error);
1565	}
1566
1567	/*
1568	 * Create new node.
1569	 */
1570	zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);
1571
1572	/*
1573	 * Now put new name in parent dir.
1574	 */
1575	(void) zfs_link_create(dl, zp, tx, ZNEW);
1576
1577	*vpp = ZTOV(zp);
1578
1579	zfs_log_create(zilog, tx, TX_MKDIR, dzp, zp, dirname);
1580	dmu_tx_commit(tx);
1581
1582	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, curthread);
1583
1584	zfs_dirent_unlock(dl);
1585
1586	ZFS_EXIT(zfsvfs);
1587	return (0);
1588}
1589
1590/*
1591 * Remove a directory subdir entry.  If the current working
1592 * directory is the same as the subdir to be removed, the
1593 * remove will fail.
1594 *
1595 *	IN:	dvp	- vnode of directory to remove from.
1596 *		name	- name of directory to be removed.
1597 *		cwd	- vnode of current working directory.
1598 *		cr	- credentials of caller.
1599 *
1600 *	RETURN:	0 if success
1601 *		error code if failure
1602 *
1603 * Timestamps:
1604 *	dvp - ctime|mtime updated
1605 */
1606static int
1607zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr)
1608{
1609	znode_t		*dzp = VTOZ(dvp);
1610	znode_t		*zp;
1611	vnode_t		*vp;
1612	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1613	zilog_t		*zilog = zfsvfs->z_log;
1614	zfs_dirlock_t	*dl;
1615	dmu_tx_t	*tx;
1616	int		error;
1617
1618	ZFS_ENTER(zfsvfs);
1619
1620top:
1621	zp = NULL;
1622
1623	/*
1624	 * Attempt to lock directory; fail if entry doesn't exist.
1625	 */
1626	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) {
1627		ZFS_EXIT(zfsvfs);
1628		return (error);
1629	}
1630
1631	vp = ZTOV(zp);
1632
1633	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1634		goto out;
1635	}
1636
1637	if (vp->v_type != VDIR) {
1638		error = ENOTDIR;
1639		goto out;
1640	}
1641
1642	if (vp == cwd) {
1643		error = EINVAL;
1644		goto out;
1645	}
1646
1647	vnevent_rmdir(vp);
1648
1649	/*
1650	 * Grab a lock on the directory to make sure that noone is
1651	 * trying to add (or lookup) entries while we are removing it.
1652	 */
1653	rw_enter(&zp->z_name_lock, RW_WRITER);
1654
1655	/*
1656	 * Grab a lock on the parent pointer to make sure we play well
1657	 * with the treewalk and directory rename code.
1658	 */
1659	rw_enter(&zp->z_parent_lock, RW_WRITER);
1660
1661	tx = dmu_tx_create(zfsvfs->z_os);
1662	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1663	dmu_tx_hold_bonus(tx, zp->z_id);
1664	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1665	error = dmu_tx_assign(tx, zfsvfs->z_assign);
1666	if (error) {
1667		rw_exit(&zp->z_parent_lock);
1668		rw_exit(&zp->z_name_lock);
1669		zfs_dirent_unlock(dl);
1670		VN_RELE(vp);
1671		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
1672			dmu_tx_wait(tx);
1673			dmu_tx_abort(tx);
1674			goto top;
1675		}
1676		dmu_tx_abort(tx);
1677		ZFS_EXIT(zfsvfs);
1678		return (error);
1679	}
1680
1681#ifdef FREEBSD_NAMECACHE
1682	cache_purge(dvp);
1683#endif
1684
1685	error = zfs_link_destroy(dl, zp, tx, 0, NULL);
1686
1687	if (error == 0)
1688		zfs_log_remove(zilog, tx, TX_RMDIR, dzp, name);
1689
1690	dmu_tx_commit(tx);
1691
1692	rw_exit(&zp->z_parent_lock);
1693	rw_exit(&zp->z_name_lock);
1694#ifdef FREEBSD_NAMECACHE
1695	cache_purge(vp);
1696#endif
1697out:
1698	zfs_dirent_unlock(dl);
1699
1700	VN_RELE(vp);
1701
1702	ZFS_EXIT(zfsvfs);
1703	return (error);
1704}
1705
1706/*
1707 * Read as many directory entries as will fit into the provided
1708 * buffer from the given directory cursor position (specified in
1709 * the uio structure.
1710 *
1711 *	IN:	vp	- vnode of directory to read.
1712 *		uio	- structure supplying read location, range info,
1713 *			  and return buffer.
1714 *		cr	- credentials of caller.
1715 *
1716 *	OUT:	uio	- updated offset and range, buffer filled.
1717 *		eofp	- set to true if end-of-file detected.
1718 *
1719 *	RETURN:	0 if success
1720 *		error code if failure
1721 *
1722 * Timestamps:
1723 *	vp - atime updated
1724 *
1725 * Note that the low 4 bits of the cookie returned by zap is always zero.
1726 * This allows us to use the low range for "special" directory entries:
1727 * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
1728 * we use the offset 2 for the '.zfs' directory.
1729 */
1730/* ARGSUSED */
1731static int
1732zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
1733{
1734	znode_t		*zp = VTOZ(vp);
1735	iovec_t		*iovp;
1736	dirent64_t	*odp;
1737	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
1738	objset_t	*os;
1739	caddr_t		outbuf;
1740	size_t		bufsize;
1741	zap_cursor_t	zc;
1742	zap_attribute_t	zap;
1743	uint_t		bytes_wanted;
1744	uint64_t	offset; /* must be unsigned; checks for < 1 */
1745	int		local_eof;
1746	int		outcount;
1747	int		error;
1748	uint8_t		prefetch;
1749	uint8_t		type;
1750	int		ncooks;
1751	u_long		*cooks = NULL;
1752
1753	ZFS_ENTER(zfsvfs);
1754
1755	/*
1756	 * If we are not given an eof variable,
1757	 * use a local one.
1758	 */
1759	if (eofp == NULL)
1760		eofp = &local_eof;
1761
1762	/*
1763	 * Check for valid iov_len.
1764	 */
1765	if (uio->uio_iov->iov_len <= 0) {
1766		ZFS_EXIT(zfsvfs);
1767		return (EINVAL);
1768	}
1769
1770	/*
1771	 * Quit if directory has been removed (posix)
1772	 */
1773	if ((*eofp = zp->z_unlinked) != 0) {
1774		ZFS_EXIT(zfsvfs);
1775		return (0);
1776	}
1777
1778	error = 0;
1779	os = zfsvfs->z_os;
1780	offset = uio->uio_loffset;
1781	prefetch = zp->z_zn_prefetch;
1782
1783	/*
1784	 * Initialize the iterator cursor.
1785	 */
1786	if (offset <= 3) {
1787		/*
1788		 * Start iteration from the beginning of the directory.
1789		 */
1790		zap_cursor_init(&zc, os, zp->z_id);
1791	} else {
1792		/*
1793		 * The offset is a serialized cursor.
1794		 */
1795		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
1796	}
1797
1798	/*
1799	 * Get space to change directory entries into fs independent format.
1800	 */
1801	iovp = uio->uio_iov;
1802	bytes_wanted = iovp->iov_len;
1803	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
1804		bufsize = bytes_wanted;
1805		outbuf = kmem_alloc(bufsize, KM_SLEEP);
1806		odp = (struct dirent64 *)outbuf;
1807	} else {
1808		bufsize = bytes_wanted;
1809		odp = (struct dirent64 *)iovp->iov_base;
1810	}
1811
1812	if (ncookies != NULL) {
1813		/*
1814		 * Minimum entry size is dirent size and 1 byte for a file name.
1815		 */
1816		ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
1817		cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
1818		*cookies = cooks;
1819		*ncookies = ncooks;
1820	}
1821
1822	/*
1823	 * Transform to file-system independent format
1824	 */
1825	outcount = 0;
1826	while (outcount < bytes_wanted) {
1827		ino64_t objnum;
1828		ushort_t reclen;
1829
1830		/*
1831		 * Special case `.', `..', and `.zfs'.
1832		 */
1833		if (offset == 0) {
1834			(void) strcpy(zap.za_name, ".");
1835			objnum = zp->z_id;
1836			type = DT_DIR;
1837		} else if (offset == 1) {
1838			(void) strcpy(zap.za_name, "..");
1839			objnum = zp->z_phys->zp_parent;
1840			type = DT_DIR;
1841		} else if (offset == 2 && zfs_show_ctldir(zp)) {
1842			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
1843			objnum = ZFSCTL_INO_ROOT;
1844			type = DT_DIR;
1845		} else {
1846			/*
1847			 * Grab next entry.
1848			 */
1849			if (error = zap_cursor_retrieve(&zc, &zap)) {
1850				if ((*eofp = (error == ENOENT)) != 0)
1851					break;
1852				else
1853					goto update;
1854			}
1855
1856			if (zap.za_integer_length != 8 ||
1857			    zap.za_num_integers != 1) {
1858				cmn_err(CE_WARN, "zap_readdir: bad directory "
1859				    "entry, obj = %lld, offset = %lld\n",
1860				    (u_longlong_t)zp->z_id,
1861				    (u_longlong_t)offset);
1862				error = ENXIO;
1863				goto update;
1864			}
1865
1866			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
1867			/*
1868			 * MacOS X can extract the object type here such as:
1869			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
1870			 */
1871			type = ZFS_DIRENT_TYPE(zap.za_first_integer);
1872		}
1873		reclen = DIRENT64_RECLEN(strlen(zap.za_name));
1874
1875		/*
1876		 * Will this entry fit in the buffer?
1877		 */
1878		if (outcount + reclen > bufsize) {
1879			/*
1880			 * Did we manage to fit anything in the buffer?
1881			 */
1882			if (!outcount) {
1883				error = EINVAL;
1884				goto update;
1885			}
1886			break;
1887		}
1888		/*
1889		 * Add this entry:
1890		 */
1891		odp->d_ino = objnum;
1892		odp->d_reclen = reclen;
1893		odp->d_namlen = strlen(zap.za_name);
1894		(void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
1895		odp->d_type = type;
1896		outcount += reclen;
1897		odp = (dirent64_t *)((intptr_t)odp + reclen);
1898
1899		ASSERT(outcount <= bufsize);
1900
1901		/* Prefetch znode */
1902		if (prefetch)
1903			dmu_prefetch(os, objnum, 0, 0);
1904
1905		/*
1906		 * Move to the next entry, fill in the previous offset.
1907		 */
1908		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
1909			zap_cursor_advance(&zc);
1910			offset = zap_cursor_serialize(&zc);
1911		} else {
1912			offset += 1;
1913		}
1914
1915		if (cooks != NULL) {
1916			*cooks++ = offset;
1917			ncooks--;
1918			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
1919		}
1920	}
1921	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
1922
1923	/* Subtract unused cookies */
1924	if (ncookies != NULL)
1925		*ncookies -= ncooks;
1926
1927	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
1928		iovp->iov_base += outcount;
1929		iovp->iov_len -= outcount;
1930		uio->uio_resid -= outcount;
1931	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
1932		/*
1933		 * Reset the pointer.
1934		 */
1935		offset = uio->uio_loffset;
1936	}
1937
1938update:
1939	zap_cursor_fini(&zc);
1940	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
1941		kmem_free(outbuf, bufsize);
1942
1943	if (error == ENOENT)
1944		error = 0;
1945
1946	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
1947
1948	uio->uio_loffset = offset;
1949	ZFS_EXIT(zfsvfs);
1950	if (error != 0 && cookies != NULL) {
1951		free(*cookies, M_TEMP);
1952		*cookies = NULL;
1953		*ncookies = 0;
1954	}
1955	return (error);
1956}
1957
1958static int
1959zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr)
1960{
1961	znode_t	*zp = VTOZ(vp);
1962	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1963
1964	ZFS_ENTER(zfsvfs);
1965	zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
1966	ZFS_EXIT(zfsvfs);
1967	return (0);
1968}
1969
1970/*
1971 * Get the requested file attributes and place them in the provided
1972 * vattr structure.
1973 *
1974 *	IN:	vp	- vnode of file.
1975 *		vap	- va_mask identifies requested attributes.
1976 *		flags	- [UNUSED]
1977 *		cr	- credentials of caller.
1978 *
1979 *	OUT:	vap	- attribute values.
1980 *
1981 *	RETURN:	0 (always succeeds)
1982 */
1983/* ARGSUSED */
1984static int
1985zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
1986{
1987	znode_t *zp = VTOZ(vp);
1988	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1989	znode_phys_t *pzp = zp->z_phys;
1990	uint32_t blksize;
1991	u_longlong_t nblocks;
1992	int	error;
1993
1994	ZFS_ENTER(zfsvfs);
1995
1996	/*
1997	 * Return all attributes.  It's cheaper to provide the answer
1998	 * than to determine whether we were asked the question.
1999	 */
2000	mutex_enter(&zp->z_lock);
2001
2002	vap->va_type = IFTOVT(pzp->zp_mode);
2003	vap->va_mode = pzp->zp_mode & ~S_IFMT;
2004	vap->va_uid = zp->z_phys->zp_uid;
2005	vap->va_gid = zp->z_phys->zp_gid;
2006	vap->va_nodeid = zp->z_id;
2007	vap->va_nlink = MIN(pzp->zp_links, UINT32_MAX);	/* nlink_t limit! */
2008	vap->va_size = pzp->zp_size;
2009	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
2010	vap->va_rdev = zfs_cmpldev(pzp->zp_rdev);
2011	vap->va_seq = zp->z_seq;
2012	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */
2013
2014	ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime);
2015	ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime);
2016	ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime);
2017	ZFS_TIME_DECODE(&vap->va_birthtime, pzp->zp_crtime);
2018
2019	/*
2020	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
2021	 * Also, if we are the owner don't bother, since owner should
2022	 * always be allowed to read basic attributes of file.
2023	 */
2024	if (!(zp->z_phys->zp_flags & ZFS_ACL_TRIVIAL) &&
2025	    (zp->z_phys->zp_uid != crgetuid(cr))) {
2026		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, cr)) {
2027			mutex_exit(&zp->z_lock);
2028			ZFS_EXIT(zfsvfs);
2029			return (error);
2030		}
2031	}
2032
2033	mutex_exit(&zp->z_lock);
2034
2035	dmu_object_size_from_db(zp->z_dbuf, &blksize, &nblocks);
2036	vap->va_blksize = blksize;
2037	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */
2038
2039	if (zp->z_blksz == 0) {
2040		/*
2041		 * Block size hasn't been set; suggest maximal I/O transfers.
2042		 */
2043		vap->va_blksize = zfsvfs->z_max_blksz;
2044	}
2045
2046	ZFS_EXIT(zfsvfs);
2047	return (0);
2048}
2049
2050/*
2051 * Set the file attributes to the values contained in the
2052 * vattr structure.
2053 *
2054 *	IN:	vp	- vnode of file to be modified.
2055 *		vap	- new attribute values.
2056 *		flags	- ATTR_UTIME set if non-default time values provided.
2057 *		cr	- credentials of caller.
2058 *
2059 *	RETURN:	0 if success
2060 *		error code if failure
2061 *
2062 * Timestamps:
2063 *	vp - ctime updated, mtime updated if size changed.
2064 */
2065/* ARGSUSED */
2066static int
2067zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2068	caller_context_t *ct)
2069{
2070	struct znode	*zp = VTOZ(vp);
2071	znode_phys_t	*pzp = zp->z_phys;
2072	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2073	zilog_t		*zilog = zfsvfs->z_log;
2074	dmu_tx_t	*tx;
2075	vattr_t		oldva;
2076	uint_t		mask = vap->va_mask;
2077	uint_t		saved_mask;
2078	int		trim_mask = 0;
2079	uint64_t	new_mode;
2080	znode_t		*attrzp;
2081	int		need_policy = FALSE;
2082	int		err;
2083
2084	if (mask == 0)
2085		return (0);
2086
2087	if (mask & AT_NOSET)
2088		return (EINVAL);
2089
2090	if (mask & AT_SIZE && vp->v_type == VDIR)
2091		return (EISDIR);
2092
2093	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO)
2094		return (EINVAL);
2095
2096	ZFS_ENTER(zfsvfs);
2097
2098top:
2099	attrzp = NULL;
2100
2101	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
2102		ZFS_EXIT(zfsvfs);
2103		return (EROFS);
2104	}
2105
2106	/*
2107	 * First validate permissions
2108	 */
2109
2110	if (mask & AT_SIZE) {
2111		err = zfs_zaccess(zp, ACE_WRITE_DATA, cr);
2112		if (err) {
2113			ZFS_EXIT(zfsvfs);
2114			return (err);
2115		}
2116		/*
2117		 * XXX - Note, we are not providing any open
2118		 * mode flags here (like FNDELAY), so we may
2119		 * block if there are locks present... this
2120		 * should be addressed in openat().
2121		 */
2122		do {
2123			err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2124			/* NB: we already did dmu_tx_wait() if necessary */
2125		} while (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT);
2126		if (err) {
2127			ZFS_EXIT(zfsvfs);
2128			return (err);
2129		}
2130	}
2131
2132	if (mask & (AT_ATIME|AT_MTIME))
2133		need_policy = zfs_zaccess_v4_perm(zp, ACE_WRITE_ATTRIBUTES, cr);
2134
2135	if (mask & (AT_UID|AT_GID)) {
2136		int	idmask = (mask & (AT_UID|AT_GID));
2137		int	take_owner;
2138		int	take_group;
2139
2140		/*
2141		 * NOTE: even if a new mode is being set,
2142		 * we may clear S_ISUID/S_ISGID bits.
2143		 */
2144
2145		if (!(mask & AT_MODE))
2146			vap->va_mode = pzp->zp_mode;
2147
2148		/*
2149		 * Take ownership or chgrp to group we are a member of
2150		 */
2151
2152		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
2153		take_group = (mask & AT_GID) && groupmember(vap->va_gid, cr);
2154
2155		/*
2156		 * If both AT_UID and AT_GID are set then take_owner and
2157		 * take_group must both be set in order to allow taking
2158		 * ownership.
2159		 *
2160		 * Otherwise, send the check through secpolicy_vnode_setattr()
2161		 *
2162		 */
2163
2164		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
2165		    ((idmask == AT_UID) && take_owner) ||
2166		    ((idmask == AT_GID) && take_group)) {
2167			if (zfs_zaccess_v4_perm(zp, ACE_WRITE_OWNER, cr) == 0) {
2168				/*
2169				 * Remove setuid/setgid for non-privileged users
2170				 */
2171				secpolicy_setid_clear(vap, cr);
2172				trim_mask = (mask & (AT_UID|AT_GID));
2173			} else {
2174				need_policy =  TRUE;
2175			}
2176		} else {
2177			need_policy =  TRUE;
2178		}
2179	}
2180
2181	mutex_enter(&zp->z_lock);
2182	oldva.va_mode = pzp->zp_mode;
2183	oldva.va_uid = zp->z_phys->zp_uid;
2184	oldva.va_gid = zp->z_phys->zp_gid;
2185	mutex_exit(&zp->z_lock);
2186
2187	if (mask & AT_MODE) {
2188		if (zfs_zaccess_v4_perm(zp, ACE_WRITE_ACL, cr) == 0) {
2189			err = secpolicy_setid_setsticky_clear(vp, vap,
2190			    &oldva, cr);
2191			if (err) {
2192				ZFS_EXIT(zfsvfs);
2193				return (err);
2194			}
2195			trim_mask |= AT_MODE;
2196		} else {
2197			need_policy = TRUE;
2198		}
2199	}
2200
2201	if (need_policy) {
2202		/*
2203		 * If trim_mask is set then take ownership
2204		 * has been granted or write_acl is present and user
2205		 * has the ability to modify mode.  In that case remove
2206		 * UID|GID and or MODE from mask so that
2207		 * secpolicy_vnode_setattr() doesn't revoke it.
2208		 */
2209
2210		if (trim_mask) {
2211			saved_mask = vap->va_mask;
2212			vap->va_mask &= ~trim_mask;
2213
2214		}
2215		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
2216		    (int (*)(void *, int, cred_t *))zfs_zaccess_rwx, zp);
2217		if (err) {
2218			ZFS_EXIT(zfsvfs);
2219			return (err);
2220		}
2221
2222		if (trim_mask)
2223			vap->va_mask |= saved_mask;
2224	}
2225
2226	/*
2227	 * secpolicy_vnode_setattr, or take ownership may have
2228	 * changed va_mask
2229	 */
2230	mask = vap->va_mask;
2231
2232	tx = dmu_tx_create(zfsvfs->z_os);
2233	dmu_tx_hold_bonus(tx, zp->z_id);
2234
2235	if (mask & AT_MODE) {
2236		uint64_t pmode = pzp->zp_mode;
2237
2238		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
2239
2240		if (zp->z_phys->zp_acl.z_acl_extern_obj)
2241			dmu_tx_hold_write(tx,
2242			    pzp->zp_acl.z_acl_extern_obj, 0, SPA_MAXBLOCKSIZE);
2243		else
2244			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2245			    0, ZFS_ACL_SIZE(MAX_ACL_SIZE));
2246	}
2247
2248	if ((mask & (AT_UID | AT_GID)) && zp->z_phys->zp_xattr != 0) {
2249		err = zfs_zget(zp->z_zfsvfs, zp->z_phys->zp_xattr, &attrzp);
2250		if (err) {
2251			dmu_tx_abort(tx);
2252			ZFS_EXIT(zfsvfs);
2253			return (err);
2254		}
2255		dmu_tx_hold_bonus(tx, attrzp->z_id);
2256	}
2257
2258	err = dmu_tx_assign(tx, zfsvfs->z_assign);
2259	if (err) {
2260		if (attrzp)
2261			VN_RELE(ZTOV(attrzp));
2262		if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
2263			dmu_tx_wait(tx);
2264			dmu_tx_abort(tx);
2265			goto top;
2266		}
2267		dmu_tx_abort(tx);
2268		ZFS_EXIT(zfsvfs);
2269		return (err);
2270	}
2271
2272	dmu_buf_will_dirty(zp->z_dbuf, tx);
2273
2274	/*
2275	 * Set each attribute requested.
2276	 * We group settings according to the locks they need to acquire.
2277	 *
2278	 * Note: you cannot set ctime directly, although it will be
2279	 * updated as a side-effect of calling this function.
2280	 */
2281
2282	mutex_enter(&zp->z_lock);
2283
2284	if (mask & AT_MODE) {
2285		err = zfs_acl_chmod_setattr(zp, new_mode, tx);
2286		ASSERT3U(err, ==, 0);
2287	}
2288
2289	if (attrzp)
2290		mutex_enter(&attrzp->z_lock);
2291
2292	if (mask & AT_UID) {
2293		zp->z_phys->zp_uid = (uint64_t)vap->va_uid;
2294		if (attrzp) {
2295			attrzp->z_phys->zp_uid = (uint64_t)vap->va_uid;
2296		}
2297	}
2298
2299	if (mask & AT_GID) {
2300		zp->z_phys->zp_gid = (uint64_t)vap->va_gid;
2301		if (attrzp)
2302			attrzp->z_phys->zp_gid = (uint64_t)vap->va_gid;
2303	}
2304
2305	if (attrzp)
2306		mutex_exit(&attrzp->z_lock);
2307
2308	if (mask & AT_ATIME)
2309		ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
2310
2311	if (mask & AT_MTIME)
2312		ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
2313
2314	if (mask & AT_SIZE)
2315		zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx);
2316	else if (mask != 0)
2317		zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
2318
2319	if (mask != 0)
2320		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask);
2321
2322	mutex_exit(&zp->z_lock);
2323
2324	if (attrzp)
2325		VN_RELE(ZTOV(attrzp));
2326
2327	dmu_tx_commit(tx);
2328
2329	ZFS_EXIT(zfsvfs);
2330	return (err);
2331}
2332
/*
 * One node in the singly linked list of parent locks built by
 * zfs_rename_lock() and torn down by zfs_rename_unlock().
 * zl_znode may be NULL when no znode hold accompanies the lock.
 */
typedef struct zfs_zlock {
	krwlock_t	*zl_rwlock;	/* lock we acquired */
	znode_t		*zl_znode;	/* znode we held */
	struct zfs_zlock *zl_next;	/* next in list */
} zfs_zlock_t;
2338
2339/*
2340 * Drop locks and release vnodes that were held by zfs_rename_lock().
2341 */
2342static void
2343zfs_rename_unlock(zfs_zlock_t **zlpp)
2344{
2345	zfs_zlock_t *zl;
2346
2347	while ((zl = *zlpp) != NULL) {
2348		if (zl->zl_znode != NULL)
2349			VN_RELE(ZTOV(zl->zl_znode));
2350		rw_exit(zl->zl_rwlock);
2351		*zlpp = zl->zl_next;
2352		kmem_free(zl, sizeof (*zl));
2353	}
2354}
2355
/*
 * Search back through the directory tree, using the ".." entries.
 * Lock each directory in the chain to prevent concurrent renames.
 * Fail any attempt to move a directory into one of its own descendants.
 * XXX - z_parent_lock can overlap with map or grow locks
 *
 *	IN:	szp	- znode being renamed.
 *		tdzp	- target directory znode (start of the upward walk).
 *		sdzp	- source directory znode (walk stops when reached).
 *
 *	OUT:	zlpp	- list of locks (and znode holds) acquired.  Entries
 *			  already pushed stay on the list when an error is
 *			  returned; the caller releases them with
 *			  zfs_rename_unlock().
 *
 *	RETURN:	0 if the walk reached sdzp or the root
 *		EINVAL if tdzp is szp or lies beneath it
 *		error code from zfs_zget() otherwise
 */
static int
zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
{
	zfs_zlock_t	*zl;
	znode_t		*zp = tdzp;
	uint64_t	rootid = zp->z_zfsvfs->z_root;
	uint64_t	*oidp = &zp->z_id;
	krwlock_t	*rwlp = &szp->z_parent_lock;
	krw_t		rw = RW_WRITER;

	/*
	 * First pass write-locks szp and compares to zp->z_id.
	 * Later passes read-lock zp and compare to zp->z_parent.
	 */
	do {
		if (!rw_tryenter(rwlp, rw)) {
			/*
			 * Another thread is renaming in this path.
			 * Note that if we are a WRITER, we don't have any
			 * parent_locks held yet.
			 */
			if (rw == RW_READER && zp->z_id > szp->z_id) {
				/*
				 * Drop our locks and restart
				 */
				/*
				 * zl is the most recently pushed entry, i.e.
				 * the current list head, so this frees the
				 * whole list; *zlpp is then reset by hand.
				 */
				zfs_rename_unlock(&zl);
				*zlpp = NULL;
				zp = tdzp;
				oidp = &zp->z_id;
				rwlp = &szp->z_parent_lock;
				rw = RW_WRITER;
				/* continue re-tests the do/while condition */
				continue;
			} else {
				/*
				 * Wait for other thread to drop its locks
				 */
				rw_enter(rwlp, rw);
			}
		}

		/* Record the lock just taken at the head of the list. */
		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
		zl->zl_rwlock = rwlp;
		zl->zl_znode = NULL;
		zl->zl_next = *zlpp;
		*zlpp = zl;

		if (*oidp == szp->z_id)		/* We're a descendant of szp */
			return (EINVAL);

		if (*oidp == rootid)		/* We've hit the top */
			return (0);

		if (rw == RW_READER) {		/* i.e. not the first pass */
			int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp);
			if (error)
				return (error);
			/* Remember the hold so the unlock path releases it. */
			zl->zl_znode = zp;
		}
		/* Step up: next compare uses zp's parent, next lock is zp's. */
		oidp = &zp->z_phys->zp_parent;
		rwlp = &zp->z_parent_lock;
		rw = RW_READER;

	} while (zp->z_id != sdzp->z_id);

	return (0);
}
2428
/*
 * Move an entry from the provided source directory to the target
 * directory.  Change the entry name as indicated.
 *
 *	IN:	sdvp	- Source directory containing the "old entry".
 *		snm	- Old entry name.
 *		tdvp	- Target directory to contain the "new entry".
 *		tnm	- New entry name.
 *		cr	- credentials of caller.
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	sdvp,tdvp - ctime|mtime updated
 */
static int
zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr)
{
	znode_t		*tdzp, *szp, *tzp;
	znode_t		*sdzp = VTOZ(sdvp);
	zfsvfs_t	*zfsvfs = sdzp->z_zfsvfs;
	zilog_t		*zilog = zfsvfs->z_log;
	vnode_t		*realvp;
	zfs_dirlock_t	*sdl, *tdl;
	dmu_tx_t	*tx;
	zfs_zlock_t	*zl;
	int		cmp, serr, terr, error;

	ZFS_ENTER(zfsvfs);

	/*
	 * Make sure we have the real vp for the target directory.
	 */
	if (VOP_REALVP(tdvp, &realvp) == 0)
		tdvp = realvp;

	/* Checked before VTOZ(tdvp): both must be on the same vfs. */
	if (tdvp->v_vfsp != sdvp->v_vfsp) {
		ZFS_EXIT(zfsvfs);
		return (EXDEV);
	}

	tdzp = VTOZ(tdvp);
top:
	/* Re-entered via "goto top" after an ERESTART from dmu_tx_assign(). */
	szp = NULL;
	tzp = NULL;
	zl = NULL;

	/*
	 * This is to prevent the creation of links into attribute space
	 * by renaming a linked file into/outof an attribute directory.
	 * See the comment in zfs_link() for why this is considered bad.
	 */
	if ((tdzp->z_phys->zp_flags & ZFS_XATTR) !=
	    (sdzp->z_phys->zp_flags & ZFS_XATTR)) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/*
	 * Lock source and target directory entries.  To prevent deadlock,
	 * a lock ordering must be defined.  We lock the directory with
	 * the smallest object id first, or if it's a tie, the one with
	 * the lexically first name.
	 */
	if (sdzp->z_id < tdzp->z_id) {
		cmp = -1;
	} else if (sdzp->z_id > tdzp->z_id) {
		cmp = 1;
	} else {
		cmp = strcmp(snm, tnm);
		if (cmp == 0) {
			/*
			 * POSIX: "If the old argument and the new argument
			 * both refer to links to the same existing file,
			 * the rename() function shall return successfully
			 * and perform no other action."
			 */
			ZFS_EXIT(zfsvfs);
			return (0);
		}
	}
	if (cmp < 0) {
		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS);
		terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0);
	} else {
		terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0);
		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS);
	}

	if (serr) {
		/*
		 * Source entry invalid or not there.
		 */
		if (!terr) {
			zfs_dirent_unlock(tdl);
			if (tzp)
				VN_RELE(ZTOV(tzp));
		}
		/* Renaming "." or ".." is reported as EINVAL instead. */
		if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0)
			serr = EINVAL;
		ZFS_EXIT(zfsvfs);
		return (serr);
	}
	if (terr) {
		zfs_dirent_unlock(sdl);
		VN_RELE(ZTOV(szp));
		/* A target name of ".." is reported as EINVAL instead. */
		if (strcmp(tnm, "..") == 0)
			terr = EINVAL;
		ZFS_EXIT(zfsvfs);
		return (terr);
	}

	/*
	 * Must have write access at the source to remove the old entry
	 * and write access at the target to create the new entry.
	 * Note that if target and source are the same, this can be
	 * done in a single check.
	 */

	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
		goto out;

	if (ZTOV(szp)->v_type == VDIR) {
		/*
		 * Check to make sure rename is valid.
		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
		 */
		if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
			goto out;
	}

	/*
	 * Does target exist?
	 */
	if (tzp) {
		/*
		 * Source and target must be the same type.
		 */
		if (ZTOV(szp)->v_type == VDIR) {
			if (ZTOV(tzp)->v_type != VDIR) {
				error = ENOTDIR;
				goto out;
			}
		} else {
			if (ZTOV(tzp)->v_type == VDIR) {
				error = EISDIR;
				goto out;
			}
		}
		/*
		 * POSIX dictates that when the source and target
		 * entries refer to the same file object, rename
		 * must do nothing and exit without error.
		 */
		if (szp->z_id == tzp->z_id) {
			error = 0;
			goto out;
		}
	}

	vnevent_rename_src(ZTOV(szp));
	if (tzp)
		vnevent_rename_dest(ZTOV(tzp));

	/* Declare every object the transaction may touch before assigning. */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_bonus(tx, szp->z_id);	/* nlink changes */
	dmu_tx_hold_bonus(tx, sdzp->z_id);	/* nlink changes */
	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
	if (sdzp != tdzp)
		dmu_tx_hold_bonus(tx, tdzp->z_id);	/* nlink changes */
	if (tzp)
		dmu_tx_hold_bonus(tx, tzp->z_id);	/* parent changes */
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	error = dmu_tx_assign(tx, zfsvfs->z_assign);
	if (error) {
		/*
		 * Release everything acquired since "top" so that a
		 * retry starts from a clean state.
		 */
		if (zl != NULL)
			zfs_rename_unlock(&zl);
		zfs_dirent_unlock(sdl);
		zfs_dirent_unlock(tdl);
		VN_RELE(ZTOV(szp));
		if (tzp)
			VN_RELE(ZTOV(tzp));
		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (tzp)	/* Attempt to remove the existing target */
		error = zfs_link_destroy(tdl, tzp, tx, 0, NULL);

	if (error == 0) {
		error = zfs_link_create(tdl, szp, tx, ZRENAMING);
		if (error == 0) {
			error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
			ASSERT(error == 0);
			zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
			    sdl->dl_name, tdzp, tdl->dl_name, szp);
		}
#ifdef FREEBSD_NAMECACHE
		if (error == 0) {
			cache_purge(sdvp);
			cache_purge(tdvp);
		}
#endif
	}

	/* The transaction is committed whether or not the link ops failed. */
	dmu_tx_commit(tx);
out:
	/* Common exit: drop parent locks, dirent locks and znode holds. */
	if (zl != NULL)
		zfs_rename_unlock(&zl);

	zfs_dirent_unlock(sdl);
	zfs_dirent_unlock(tdl);

	VN_RELE(ZTOV(szp));
	if (tzp)
		VN_RELE(ZTOV(tzp));

	ZFS_EXIT(zfsvfs);

	return (error);
}
2658
2659/*
2660 * Insert the indicated symbolic reference entry into the directory.
2661 *
2662 *	IN:	dvp	- Directory to contain new symbolic link.
2663 *		link	- Name for new symlink entry.
2664 *		vap	- Attributes of new entry.
2665 *		target	- Target path of new symlink.
2666 *		cr	- credentials of caller.
2667 *
2668 *	RETURN:	0 if success
2669 *		error code if failure
2670 *
2671 * Timestamps:
2672 *	dvp - ctime|mtime updated
2673 */
2674static int
2675zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, cred_t *cr, kthread_t *td)
2676{
2677	znode_t		*zp, *dzp = VTOZ(dvp);
2678	zfs_dirlock_t	*dl;
2679	dmu_tx_t	*tx;
2680	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2681	zilog_t		*zilog = zfsvfs->z_log;
2682	uint64_t	zoid;
2683	int		len = strlen(link);
2684	int		error;
2685
2686	ASSERT(vap->va_type == VLNK);
2687
2688	ZFS_ENTER(zfsvfs);
2689top:
2690	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
2691		ZFS_EXIT(zfsvfs);
2692		return (error);
2693	}
2694
2695	if (len > MAXPATHLEN) {
2696		ZFS_EXIT(zfsvfs);
2697		return (ENAMETOOLONG);
2698	}
2699
2700	/*
2701	 * Attempt to lock directory; fail if entry already exists.
2702	 */
2703	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZNEW)) {
2704		ZFS_EXIT(zfsvfs);
2705		return (error);
2706	}
2707
2708	tx = dmu_tx_create(zfsvfs->z_os);
2709	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
2710	dmu_tx_hold_bonus(tx, dzp->z_id);
2711	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
2712	if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
2713		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE);
2714	error = dmu_tx_assign(tx, zfsvfs->z_assign);
2715	if (error) {
2716		zfs_dirent_unlock(dl);
2717		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
2718			dmu_tx_wait(tx);
2719			dmu_tx_abort(tx);
2720			goto top;
2721		}
2722		dmu_tx_abort(tx);
2723		ZFS_EXIT(zfsvfs);
2724		return (error);
2725	}
2726
2727	dmu_buf_will_dirty(dzp->z_dbuf, tx);
2728
2729	/*
2730	 * Create a new object for the symlink.
2731	 * Put the link content into bonus buffer if it will fit;
2732	 * otherwise, store it just like any other file data.
2733	 */
2734	zoid = 0;
2735	if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) {
2736		zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, len);
2737		if (len != 0)
2738			bcopy(link, zp->z_phys + 1, len);
2739	} else {
2740		dmu_buf_t *dbp;
2741
2742		zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);
2743
2744		/*
2745		 * Nothing can access the znode yet so no locking needed
2746		 * for growing the znode's blocksize.
2747		 */
2748		zfs_grow_blocksize(zp, len, tx);
2749
2750		VERIFY(0 == dmu_buf_hold(zfsvfs->z_os, zoid, 0, FTAG, &dbp));
2751		dmu_buf_will_dirty(dbp, tx);
2752
2753		ASSERT3U(len, <=, dbp->db_size);
2754		bcopy(link, dbp->db_data, len);
2755		dmu_buf_rele(dbp, FTAG);
2756	}
2757	zp->z_phys->zp_size = len;
2758
2759	/*
2760	 * Insert the new object into the directory.
2761	 */
2762	(void) zfs_link_create(dl, zp, tx, ZNEW);
2763out:
2764	if (error == 0) {
2765		zfs_log_symlink(zilog, tx, TX_SYMLINK, dzp, zp, name, link);
2766		*vpp = ZTOV(zp);
2767		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);
2768	}
2769
2770	dmu_tx_commit(tx);
2771
2772	zfs_dirent_unlock(dl);
2773
2774	ZFS_EXIT(zfsvfs);
2775	return (error);
2776}
2777
2778/*
2779 * Return, in the buffer contained in the provided uio structure,
2780 * the symbolic path referred to by vp.
2781 *
2782 *	IN:	vp	- vnode of symbolic link.
2783 *		uoip	- structure to contain the link path.
2784 *		cr	- credentials of caller.
2785 *
2786 *	OUT:	uio	- structure to contain the link path.
2787 *
2788 *	RETURN:	0 if success
2789 *		error code if failure
2790 *
2791 * Timestamps:
2792 *	vp - atime updated
2793 */
2794/* ARGSUSED */
2795static int
2796zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr)
2797{
2798	znode_t		*zp = VTOZ(vp);
2799	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2800	size_t		bufsz;
2801	int		error;
2802
2803	ZFS_ENTER(zfsvfs);
2804
2805	bufsz = (size_t)zp->z_phys->zp_size;
2806	if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) {
2807		error = uiomove(zp->z_phys + 1,
2808		    MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
2809	} else {
2810		dmu_buf_t *dbp;
2811		error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp);
2812		if (error) {
2813			ZFS_EXIT(zfsvfs);
2814			return (error);
2815		}
2816		error = uiomove(dbp->db_data,
2817		    MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
2818		dmu_buf_rele(dbp, FTAG);
2819	}
2820
2821	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2822	ZFS_EXIT(zfsvfs);
2823	return (error);
2824}
2825
2826/*
2827 * Insert a new entry into directory tdvp referencing svp.
2828 *
2829 *	IN:	tdvp	- Directory to contain new entry.
2830 *		svp	- vnode of new entry.
2831 *		name	- name of new entry.
2832 *		cr	- credentials of caller.
2833 *
2834 *	RETURN:	0 if success
2835 *		error code if failure
2836 *
2837 * Timestamps:
2838 *	tdvp - ctime|mtime updated
2839 *	 svp - ctime updated
2840 */
2841/* ARGSUSED */
2842static int
2843zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr)
2844{
2845	znode_t		*dzp = VTOZ(tdvp);
2846	znode_t		*tzp, *szp;
2847	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2848	zilog_t		*zilog = zfsvfs->z_log;
2849	zfs_dirlock_t	*dl;
2850	dmu_tx_t	*tx;
2851	vnode_t		*realvp;
2852	int		error;
2853
2854	ASSERT(tdvp->v_type == VDIR);
2855
2856	ZFS_ENTER(zfsvfs);
2857
2858	if (VOP_REALVP(svp, &realvp) == 0)
2859		svp = realvp;
2860
2861	if (svp->v_vfsp != tdvp->v_vfsp) {
2862		ZFS_EXIT(zfsvfs);
2863		return (EXDEV);
2864	}
2865
2866	szp = VTOZ(svp);
2867top:
2868	/*
2869	 * We do not support links between attributes and non-attributes
2870	 * because of the potential security risk of creating links
2871	 * into "normal" file space in order to circumvent restrictions
2872	 * imposed in attribute space.
2873	 */
2874	if ((szp->z_phys->zp_flags & ZFS_XATTR) !=
2875	    (dzp->z_phys->zp_flags & ZFS_XATTR)) {
2876		ZFS_EXIT(zfsvfs);
2877		return (EINVAL);
2878	}
2879
2880	/*
2881	 * POSIX dictates that we return EPERM here.
2882	 * Better choices include ENOTSUP or EISDIR.
2883	 */
2884	if (svp->v_type == VDIR) {
2885		ZFS_EXIT(zfsvfs);
2886		return (EPERM);
2887	}
2888
2889	if ((uid_t)szp->z_phys->zp_uid != crgetuid(cr) &&
2890	    secpolicy_basic_link(cr) != 0) {
2891		ZFS_EXIT(zfsvfs);
2892		return (EPERM);
2893	}
2894
2895	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
2896		ZFS_EXIT(zfsvfs);
2897		return (error);
2898	}
2899
2900	/*
2901	 * Attempt to lock directory; fail if entry already exists.
2902	 */
2903	if (error = zfs_dirent_lock(&dl, dzp, name, &tzp, ZNEW)) {
2904		ZFS_EXIT(zfsvfs);
2905		return (error);
2906	}
2907
2908	tx = dmu_tx_create(zfsvfs->z_os);
2909	dmu_tx_hold_bonus(tx, szp->z_id);
2910	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
2911	error = dmu_tx_assign(tx, zfsvfs->z_assign);
2912	if (error) {
2913		zfs_dirent_unlock(dl);
2914		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
2915			dmu_tx_wait(tx);
2916			dmu_tx_abort(tx);
2917			goto top;
2918		}
2919		dmu_tx_abort(tx);
2920		ZFS_EXIT(zfsvfs);
2921		return (error);
2922	}
2923
2924	error = zfs_link_create(dl, szp, tx, 0);
2925
2926	if (error == 0)
2927		zfs_log_link(zilog, tx, TX_LINK, dzp, szp, name);
2928
2929	dmu_tx_commit(tx);
2930
2931	zfs_dirent_unlock(dl);
2932
2933	ZFS_EXIT(zfsvfs);
2934	return (error);
2935}
2936
2937void
2938zfs_inactive(vnode_t *vp, cred_t *cr)
2939{
2940	znode_t	*zp = VTOZ(vp);
2941	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2942	int error;
2943
2944	rw_enter(&zfsvfs->z_um_lock, RW_READER);
2945	if (zfsvfs->z_unmounted2) {
2946		ASSERT(zp->z_dbuf_held == 0);
2947
2948		mutex_enter(&zp->z_lock);
2949		VI_LOCK(vp);
2950		vp->v_count = 0; /* count arrives as 1 */
2951		VI_UNLOCK(vp);
2952		if (zp->z_dbuf == NULL) {
2953			mutex_exit(&zp->z_lock);
2954			zfs_znode_free(zp);
2955		} else {
2956			mutex_exit(&zp->z_lock);
2957		}
2958		rw_exit(&zfsvfs->z_um_lock);
2959		VFS_RELE(zfsvfs->z_vfs);
2960		return;
2961	}
2962
2963	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
2964		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
2965
2966		dmu_tx_hold_bonus(tx, zp->z_id);
2967		error = dmu_tx_assign(tx, TXG_WAIT);
2968		if (error) {
2969			dmu_tx_abort(tx);
2970		} else {
2971			dmu_buf_will_dirty(zp->z_dbuf, tx);
2972			mutex_enter(&zp->z_lock);
2973			zp->z_atime_dirty = 0;
2974			mutex_exit(&zp->z_lock);
2975			dmu_tx_commit(tx);
2976		}
2977	}
2978
2979	zfs_zinactive(zp);
2980	rw_exit(&zfsvfs->z_um_lock);
2981}
2982
/*
 * zfs_fid() writes zfid_short/zfid_long layouts through a fid_t pointer,
 * so both layouts must fit inside struct fid.  Checked at compile time.
 */
CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
2985
2986static int
2987zfs_fid(vnode_t *vp, fid_t *fidp)
2988{
2989	znode_t		*zp = VTOZ(vp);
2990	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2991	uint32_t	gen = (uint32_t)zp->z_phys->zp_gen;
2992	uint64_t	object = zp->z_id;
2993	zfid_short_t	*zfid;
2994	int		size, i;
2995
2996	ZFS_ENTER(zfsvfs);
2997
2998	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
2999	fidp->fid_len = size;
3000
3001	zfid = (zfid_short_t *)fidp;
3002
3003	zfid->zf_len = size;
3004
3005	for (i = 0; i < sizeof (zfid->zf_object); i++)
3006		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
3007
3008	/* Must have a non-zero generation number to distinguish from .zfs */
3009	if (gen == 0)
3010		gen = 1;
3011	for (i = 0; i < sizeof (zfid->zf_gen); i++)
3012		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
3013
3014	if (size == LONG_FID_LEN) {
3015		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
3016		zfid_long_t	*zlfid;
3017
3018		zlfid = (zfid_long_t *)fidp;
3019
3020		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
3021			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
3022
3023		/* XXX - this should be the generation number for the objset */
3024		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
3025			zlfid->zf_setgen[i] = 0;
3026	}
3027
3028	ZFS_EXIT(zfsvfs);
3029	return (0);
3030}
3031
3032static int
3033zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr)
3034{
3035	znode_t		*zp, *xzp;
3036	zfsvfs_t	*zfsvfs;
3037	zfs_dirlock_t	*dl;
3038	int		error;
3039
3040	switch (cmd) {
3041	case _PC_LINK_MAX:
3042		*valp = INT_MAX;
3043		return (0);
3044
3045	case _PC_FILESIZEBITS:
3046		*valp = 64;
3047		return (0);
3048
3049#if 0
3050	case _PC_XATTR_EXISTS:
3051		zp = VTOZ(vp);
3052		zfsvfs = zp->z_zfsvfs;
3053		ZFS_ENTER(zfsvfs);
3054		*valp = 0;
3055		error = zfs_dirent_lock(&dl, zp, "", &xzp,
3056		    ZXATTR | ZEXISTS | ZSHARED);
3057		if (error == 0) {
3058			zfs_dirent_unlock(dl);
3059			if (!zfs_dirempty(xzp))
3060				*valp = 1;
3061			VN_RELE(ZTOV(xzp));
3062		} else if (error == ENOENT) {
3063			/*
3064			 * If there aren't extended attributes, it's the
3065			 * same as having zero of them.
3066			 */
3067			error = 0;
3068		}
3069		ZFS_EXIT(zfsvfs);
3070		return (error);
3071#endif
3072
3073	case _PC_ACL_EXTENDED:
3074		*valp = 0;	/* TODO */
3075		return (0);
3076
3077	case _PC_MIN_HOLE_SIZE:
3078		*valp = (int)SPA_MINBLOCKSIZE;
3079		return (0);
3080
3081	default:
3082		return (EOPNOTSUPP);
3083	}
3084}
3085
#ifdef TODO
/*
 * Fetch the ACL of vp into vsecp by calling zfs_getacl() between
 * ZFS_ENTER and ZFS_EXIT.  Compiled only when TODO is defined.
 */
/*ARGSUSED*/
static int
zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	int		rc;

	ZFS_ENTER(zfsvfs);
	rc = zfs_getacl(zp, vsecp, cr);
	ZFS_EXIT(zfsvfs);
	return (rc);
}
#endif	/* TODO */
3102
#ifdef TODO
/*
 * Apply the ACL in vsecp to vp by calling zfs_setacl() between
 * ZFS_ENTER and ZFS_EXIT.  Compiled only when TODO is defined.
 */
/*ARGSUSED*/
static int
zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	int		rc;

	ZFS_ENTER(zfsvfs);
	rc = zfs_setacl(zp, vsecp, cr);
	ZFS_EXIT(zfsvfs);
	return (rc);
}
#endif	/* TODO */
3118
3119static int
3120zfs_freebsd_open(ap)
3121	struct vop_open_args /* {
3122		struct vnode *a_vp;
3123		int a_mode;
3124		struct ucred *a_cred;
3125		struct thread *a_td;
3126	} */ *ap;
3127{
3128	vnode_t	*vp = ap->a_vp;
3129	znode_t *zp = VTOZ(vp);
3130	int error;
3131
3132	error = zfs_open(&vp, ap->a_mode, ap->a_cred);
3133	if (error == 0)
3134		vnode_create_vobject(vp, zp->z_phys->zp_size, ap->a_td);
3135	return (error);
3136}
3137
3138static int
3139zfs_freebsd_close(ap)
3140	struct vop_close_args /* {
3141		struct vnode *a_vp;
3142		int  a_fflag;
3143		struct ucred *a_cred;
3144		struct thread *a_td;
3145	} */ *ap;
3146{
3147
3148	return (zfs_close(ap->a_vp, ap->a_fflag, 0, 0, ap->a_cred));
3149}
3150
3151static int
3152zfs_freebsd_ioctl(ap)
3153	struct vop_ioctl_args /* {
3154		struct vnode *a_vp;
3155		u_long a_command;
3156		caddr_t a_data;
3157		int a_fflag;
3158		struct ucred *cred;
3159		struct thread *td;
3160	} */ *ap;
3161{
3162
3163	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
3164	    ap->a_fflag, ap->a_cred, NULL));
3165}
3166
3167static int
3168zfs_freebsd_read(ap)
3169	struct vop_read_args /* {
3170		struct vnode *a_vp;
3171		struct uio *a_uio;
3172		int a_ioflag;
3173		struct ucred *a_cred;
3174	} */ *ap;
3175{
3176
3177	return (zfs_read(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL));
3178}
3179
3180static int
3181zfs_freebsd_write(ap)
3182	struct vop_write_args /* {
3183		struct vnode *a_vp;
3184		struct uio *a_uio;
3185		int a_ioflag;
3186		struct ucred *a_cred;
3187	} */ *ap;
3188{
3189
3190	return (zfs_write(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL));
3191}
3192
3193static int
3194zfs_freebsd_access(ap)
3195	struct vop_access_args /* {
3196		struct vnode *a_vp;
3197		int  a_mode;
3198		struct ucred *a_cred;
3199		struct thread *a_td;
3200	} */ *ap;
3201{
3202
3203	return (zfs_access(ap->a_vp, ap->a_mode, 0, ap->a_cred));
3204}
3205
3206static int
3207zfs_freebsd_lookup(ap)
3208	struct vop_lookup_args /* {
3209		struct vnode *a_dvp;
3210		struct vnode **a_vpp;
3211		struct componentname *a_cnp;
3212	} */ *ap;
3213{
3214	struct componentname *cnp = ap->a_cnp;
3215	char nm[NAME_MAX + 1];
3216
3217	ASSERT(cnp->cn_namelen < sizeof(nm));
3218	strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));
3219
3220	return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
3221	    cnp->cn_cred, cnp->cn_thread));
3222}
3223
3224static int
3225zfs_freebsd_create(ap)
3226	struct vop_create_args /* {
3227		struct vnode *a_dvp;
3228		struct vnode **a_vpp;
3229		struct componentname *a_cnp;
3230		struct vattr *a_vap;
3231	} */ *ap;
3232{
3233	struct componentname *cnp = ap->a_cnp;
3234	vattr_t *vap = ap->a_vap;
3235	int mode;
3236
3237	ASSERT(cnp->cn_flags & SAVENAME);
3238
3239	vattr_init_mask(vap);
3240	mode = vap->va_mode & ALLPERMS;
3241
3242	return (zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
3243	    ap->a_vpp, cnp->cn_cred, cnp->cn_thread));
3244}
3245
3246static int
3247zfs_freebsd_remove(ap)
3248	struct vop_remove_args /* {
3249		struct vnode *a_dvp;
3250		struct vnode *a_vp;
3251		struct componentname *a_cnp;
3252	} */ *ap;
3253{
3254
3255	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
3256
3257	return (zfs_remove(ap->a_dvp, ap->a_cnp->cn_nameptr,
3258	    ap->a_cnp->cn_cred));
3259}
3260
3261static int
3262zfs_freebsd_mkdir(ap)
3263	struct vop_mkdir_args /* {
3264		struct vnode *a_dvp;
3265		struct vnode **a_vpp;
3266		struct componentname *a_cnp;
3267		struct vattr *a_vap;
3268	} */ *ap;
3269{
3270	vattr_t *vap = ap->a_vap;
3271
3272	ASSERT(ap->a_cnp->cn_flags & SAVENAME);
3273
3274	vattr_init_mask(vap);
3275
3276	return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
3277	    ap->a_cnp->cn_cred));
3278}
3279
3280static int
3281zfs_freebsd_rmdir(ap)
3282	struct vop_rmdir_args /* {
3283		struct vnode *a_dvp;
3284		struct vnode *a_vp;
3285		struct componentname *a_cnp;
3286	} */ *ap;
3287{
3288	struct componentname *cnp = ap->a_cnp;
3289
3290	ASSERT(cnp->cn_flags & SAVENAME);
3291
3292	return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred));
3293}
3294
3295static int
3296zfs_freebsd_readdir(ap)
3297	struct vop_readdir_args /* {
3298		struct vnode *a_vp;
3299		struct uio *a_uio;
3300		struct ucred *a_cred;
3301		int *a_eofflag;
3302		int *a_ncookies;
3303		u_long **a_cookies;
3304	} */ *ap;
3305{
3306
3307	return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
3308	    ap->a_ncookies, ap->a_cookies));
3309}
3310
3311static int
3312zfs_freebsd_fsync(ap)
3313	struct vop_fsync_args /* {
3314		struct vnode *a_vp;
3315		int a_waitfor;
3316		struct thread *a_td;
3317	} */ *ap;
3318{
3319
3320	vop_stdfsync(ap);
3321	return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred));
3322}
3323
3324static int
3325zfs_freebsd_getattr(ap)
3326	struct vop_getattr_args /* {
3327		struct vnode *a_vp;
3328		struct vattr *a_vap;
3329		struct ucred *a_cred;
3330		struct thread *a_td;
3331	} */ *ap;
3332{
3333
3334	return (zfs_getattr(ap->a_vp, ap->a_vap, 0, ap->a_cred));
3335}
3336
3337static int
3338zfs_freebsd_setattr(ap)
3339	struct vop_setattr_args /* {
3340		struct vnode *a_vp;
3341		struct vattr *a_vap;
3342		struct ucred *a_cred;
3343		struct thread *a_td;
3344	} */ *ap;
3345{
3346	vattr_t *vap = ap->a_vap;
3347
3348	/* No support for FreeBSD's chflags(2). */
3349	if (vap->va_flags != VNOVAL)
3350		return (EOPNOTSUPP);
3351
3352	vattr_init_mask(vap);
3353
3354	return (zfs_setattr(ap->a_vp, vap, 0, ap->a_cred, NULL));
3355}
3356
3357static int
3358zfs_freebsd_rename(ap)
3359	struct vop_rename_args  /* {
3360		struct vnode *a_fdvp;
3361		struct vnode *a_fvp;
3362		struct componentname *a_fcnp;
3363		struct vnode *a_tdvp;
3364		struct vnode *a_tvp;
3365		struct componentname *a_tcnp;
3366	} */ *ap;
3367{
3368	vnode_t *fdvp = ap->a_fdvp;
3369	vnode_t *fvp = ap->a_fvp;
3370	vnode_t *tdvp = ap->a_tdvp;
3371	vnode_t *tvp = ap->a_tvp;
3372	int error;
3373
3374	ASSERT(ap->a_fcnp->cn_flags & SAVENAME);
3375	ASSERT(ap->a_tcnp->cn_flags & SAVENAME);
3376
3377	error = zfs_rename(fdvp, ap->a_fcnp->cn_nameptr, tdvp,
3378	    ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred);
3379
3380	if (tdvp == tvp)
3381		VN_RELE(tdvp);
3382	else
3383		VN_URELE(tdvp);
3384	if (tvp)
3385		VN_URELE(tvp);
3386	VN_RELE(fdvp);
3387	VN_RELE(fvp);
3388
3389	return (error);
3390}
3391
3392static int
3393zfs_freebsd_symlink(ap)
3394	struct vop_symlink_args /* {
3395		struct vnode *a_dvp;
3396		struct vnode **a_vpp;
3397		struct componentname *a_cnp;
3398		struct vattr *a_vap;
3399		char *a_target;
3400	} */ *ap;
3401{
3402	struct componentname *cnp = ap->a_cnp;
3403	vattr_t *vap = ap->a_vap;
3404
3405	ASSERT(cnp->cn_flags & SAVENAME);
3406
3407	vap->va_type = VLNK;	/* FreeBSD: Syscall only sets va_mode. */
3408	vattr_init_mask(vap);
3409
3410	return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
3411	    ap->a_target, cnp->cn_cred, cnp->cn_thread));
3412}
3413
3414static int
3415zfs_freebsd_readlink(ap)
3416	struct vop_readlink_args /* {
3417		struct vnode *a_vp;
3418		struct uio *a_uio;
3419		struct ucred *a_cred;
3420	} */ *ap;
3421{
3422
3423	return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred));
3424}
3425
3426static int
3427zfs_freebsd_link(ap)
3428	struct vop_link_args /* {
3429		struct vnode *a_tdvp;
3430		struct vnode *a_vp;
3431		struct componentname *a_cnp;
3432	} */ *ap;
3433{
3434	struct componentname *cnp = ap->a_cnp;
3435
3436	ASSERT(cnp->cn_flags & SAVENAME);
3437
3438	return (zfs_link(ap->a_tdvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
3439}
3440
3441static int
3442zfs_freebsd_inactive(ap)
3443	struct vop_inactive_args /* {
3444		struct vnode *a_vp;
3445		struct thread *a_td;
3446	} */ *ap;
3447{
3448	vnode_t *vp = ap->a_vp;
3449
3450	zfs_inactive(vp, ap->a_td->td_ucred);
3451	return (0);
3452}
3453
3454static int
3455zfs_freebsd_reclaim(ap)
3456	struct vop_reclaim_args /* {
3457		struct vnode *a_vp;
3458		struct thread *a_td;
3459	} */ *ap;
3460{
3461	vnode_t	*vp = ap->a_vp;
3462	znode_t	*zp = VTOZ(vp);
3463	zfsvfs_t *zfsvfs;
3464	int rele = 1;
3465
3466	ASSERT(zp != NULL);
3467
3468	/*
3469	 * Destroy the vm object and flush associated pages.
3470	 */
3471	vnode_destroy_vobject(vp);
3472
3473	mutex_enter(&zp->z_lock);
3474	ASSERT(zp->z_phys);
3475	ASSERT(zp->z_dbuf_held);
3476	zfsvfs = zp->z_zfsvfs;
3477	if (!zp->z_unlinked) {
3478		zp->z_dbuf_held = 0;
3479		ZTOV(zp) = NULL;
3480		mutex_exit(&zp->z_lock);
3481		dmu_buf_rele(zp->z_dbuf, NULL);
3482	} else {
3483		mutex_exit(&zp->z_lock);
3484	}
3485	VI_LOCK(vp);
3486	if (vp->v_count > 0)
3487		rele = 0;
3488	vp->v_data = NULL;
3489	ASSERT(vp->v_holdcnt > 1);
3490	vdropl(vp);
3491	if (!zp->z_unlinked && rele)
3492		VFS_RELE(zfsvfs->z_vfs);
3493	return (0);
3494}
3495
3496static int
3497zfs_freebsd_fid(ap)
3498	struct vop_fid_args /* {
3499		struct vnode *a_vp;
3500		struct fid *a_fid;
3501	} */ *ap;
3502{
3503
3504	return (zfs_fid(ap->a_vp, (void *)ap->a_fid));
3505}
3506
3507static int
3508zfs_freebsd_pathconf(ap)
3509	struct vop_pathconf_args /* {
3510		struct vnode *a_vp;
3511		int a_name;
3512		register_t *a_retval;
3513	} */ *ap;
3514{
3515	ulong_t val;
3516	int error;
3517
3518	error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred);
3519	if (error == 0)
3520		*ap->a_retval = val;
3521	else if (error == EOPNOTSUPP)
3522		error = vop_stdpathconf(ap);
3523	return (error);
3524}
3525
3526/*
3527 * Advisory record locking support
3528 */
3529static int
3530zfs_freebsd_advlock(ap)
3531	struct vop_advlock_args /* {
3532		struct vnode *a_vp;
3533		caddr_t  a_id;
3534		int  a_op;
3535		struct flock *a_fl;
3536		int  a_flags;
3537	} */ *ap;
3538{
3539	znode_t	*zp = VTOZ(ap->a_vp);
3540
3541	return (lf_advlock(ap, &(zp->z_lockf), zp->z_phys->zp_size));
3542}
3543
/* Tentative declarations; both vectors are defined in full below. */
struct vop_vector zfs_vnodeops;
struct vop_vector zfs_fifoops;

/*
 * Vnode operation vector for ZFS.  Each entry points at one of the
 * zfs_freebsd_* wrappers above; vop_default names default_vnodeops as
 * the fallback vector.  With FREEBSD_NAMECACHE defined, lookups go
 * through vfs_cache_lookup with zfs_freebsd_lookup as the cached-lookup
 * handler; otherwise zfs_freebsd_lookup is installed directly.
 */
struct vop_vector zfs_vnodeops = {
	.vop_default =	&default_vnodeops,
	.vop_inactive =	zfs_freebsd_inactive,
	.vop_reclaim =	zfs_freebsd_reclaim,
	.vop_access =	zfs_freebsd_access,
#ifdef FREEBSD_NAMECACHE
	.vop_lookup =	vfs_cache_lookup,
	.vop_cachedlookup = zfs_freebsd_lookup,
#else
	.vop_lookup =	zfs_freebsd_lookup,
#endif
	.vop_getattr =	zfs_freebsd_getattr,
	.vop_setattr =	zfs_freebsd_setattr,
	.vop_create =	zfs_freebsd_create,
	.vop_mknod =	zfs_freebsd_create,	/* shares the create handler */
	.vop_mkdir =	zfs_freebsd_mkdir,
	.vop_readdir =	zfs_freebsd_readdir,
	.vop_fsync =	zfs_freebsd_fsync,
	.vop_open =	zfs_freebsd_open,
	.vop_close =	zfs_freebsd_close,
	.vop_rmdir =	zfs_freebsd_rmdir,
	.vop_ioctl =	zfs_freebsd_ioctl,
	.vop_link =	zfs_freebsd_link,
	.vop_symlink =	zfs_freebsd_symlink,
	.vop_readlink =	zfs_freebsd_readlink,
	.vop_read =	zfs_freebsd_read,
	.vop_write =	zfs_freebsd_write,
	.vop_remove =	zfs_freebsd_remove,
	.vop_rename =	zfs_freebsd_rename,
	.vop_advlock =	zfs_freebsd_advlock,
	.vop_pathconf =	zfs_freebsd_pathconf,
	.vop_bmap =	VOP_EOPNOTSUPP,
	.vop_fid =	zfs_freebsd_fid,
};

/*
 * Operation vector with fifo_specops as the fallback (presumably used
 * for FIFO vnodes on ZFS - confirm at the point of assignment).  Only
 * attribute, access, lifecycle and fid operations are handled by ZFS;
 * fsync, read and write are wired to VOP_PANIC.
 */
struct vop_vector zfs_fifoops = {
	.vop_default =	&fifo_specops,
	.vop_fsync =	VOP_PANIC,
	.vop_access =	zfs_freebsd_access,
	.vop_getattr =	zfs_freebsd_getattr,
	.vop_inactive =	zfs_freebsd_inactive,
	.vop_read =	VOP_PANIC,
	.vop_reclaim =	zfs_freebsd_reclaim,
	.vop_setattr =	zfs_freebsd_setattr,
	.vop_write =	VOP_PANIC,
	.vop_fid =	zfs_freebsd_fid,
};
3594