zfs_vnops.c revision 169167
190075Sobrien/*
290075Sobrien * CDDL HEADER START
390075Sobrien *
490075Sobrien * The contents of this file are subject to the terms of the
590075Sobrien * Common Development and Distribution License (the "License").
690075Sobrien * You may not use this file except in compliance with the License.
790075Sobrien *
890075Sobrien * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
990075Sobrien * or http://www.opensolaris.org/os/licensing.
1090075Sobrien * See the License for the specific language governing permissions
1190075Sobrien * and limitations under the License.
1290075Sobrien *
1390075Sobrien * When distributing Covered Code, include this CDDL HEADER in each
1490075Sobrien * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
1590075Sobrien * If applicable, add the following below this CDDL HEADER, with the
1690075Sobrien * fields enclosed by brackets "[]" replaced with your own identifying
1790075Sobrien * information: Portions Copyright [yyyy] [name of copyright owner]
1890075Sobrien *
1990075Sobrien * CDDL HEADER END
2090075Sobrien */
2190075Sobrien/*
2290075Sobrien * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
2390075Sobrien * Use is subject to license terms.
24169689Skan */
25169689Skan
2690075Sobrien#pragma ident	"%Z%%M%	%I%	%E% SMI"
2790075Sobrien
28117395Skan#include <sys/types.h>
2990075Sobrien#include <sys/param.h>
30169689Skan#include <sys/time.h>
31117395Skan#include <sys/systm.h>
3290075Sobrien#include <sys/sysmacros.h>
33117395Skan#include <sys/resource.h>
3490075Sobrien#include <sys/vfs.h>
3590075Sobrien#include <sys/vnode.h>
3690075Sobrien#include <sys/file.h>
3790075Sobrien#include <sys/stat.h>
3890075Sobrien#include <sys/kmem.h>
3990075Sobrien#include <sys/taskq.h>
4090075Sobrien#include <sys/uio.h>
4190075Sobrien#include <sys/atomic.h>
4290075Sobrien#include <sys/namei.h>
4390075Sobrien#include <sys/mman.h>
4490075Sobrien#include <sys/cmn_err.h>
4590075Sobrien#include <sys/errno.h>
4690075Sobrien#include <sys/unistd.h>
4790075Sobrien#include <sys/zfs_vfsops.h>
4890075Sobrien#include <sys/zfs_dir.h>
49117395Skan#include <sys/zfs_acl.h>
50117395Skan#include <sys/zfs_ioctl.h>
51169689Skan#include <sys/fs/zfs.h>
52117395Skan#include <sys/dmu.h>
53117395Skan#include <sys/spa.h>
54117395Skan#include <sys/txg.h>
55117395Skan#include <sys/dbuf.h>
56117395Skan#include <sys/zap.h>
57117395Skan#include <sys/dirent.h>
58169689Skan#include <sys/policy.h>
5990075Sobrien#include <sys/sunddi.h>
6090075Sobrien#include <sys/filio.h>
6190075Sobrien#include <sys/zfs_ctldir.h>
6290075Sobrien#include <sys/dnlc.h>
63169689Skan#include <sys/zfs_rlock.h>
64169689Skan#include <sys/bio.h>
65169689Skan#include <sys/buf.h>
6690075Sobrien#include <sys/sf_buf.h>
6790075Sobrien#include <sys/sched.h>
68117395Skan
6990075Sobrien/*
7090075Sobrien * Programming rules.
7190075Sobrien *
7290075Sobrien * Each vnode op performs some logical unit of work.  To do this, the ZPL must
7390075Sobrien * properly lock its in-core state, create a DMU transaction, do the work,
7490075Sobrien * record this work in the intent log (ZIL), commit the DMU transaction,
7590075Sobrien * and wait for the intent log to commit if it is a synchronous operation.
7690075Sobrien * Moreover, the vnode ops must work in both normal and log replay context.
7790075Sobrien * The ordering of events is important to avoid deadlocks and references
7890075Sobrien * to freed memory.  The example below illustrates the following Big Rules:
7990075Sobrien *
8090075Sobrien *  (1) A check must be made in each zfs thread for a mounted file system.
8190075Sobrien *	This is done avoiding races using ZFS_ENTER(zfsvfs).
8290075Sobrien *	A ZFS_EXIT(zfsvfs) is needed before all returns.
8390075Sobrien *
8490075Sobrien *  (2)	VN_RELE() should always be the last thing except for zil_commit()
8590075Sobrien *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
8690075Sobrien *	First, if it's the last reference, the vnode/znode
8790075Sobrien *	can be freed, so the zp may point to freed memory.  Second, the last
8890075Sobrien *	reference will call zfs_zinactive(), which may induce a lot of work --
8990075Sobrien *	pushing cached pages (which acquires range locks) and syncing out
9090075Sobrien *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
9190075Sobrien *	which could deadlock the system if you were already holding one.
9290075Sobrien *
9390075Sobrien *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
9490075Sobrien *	as they can span dmu_tx_assign() calls.
9590075Sobrien *
9690075Sobrien *  (4)	Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign().
9790075Sobrien *	In normal operation, this will be TXG_NOWAIT.  During ZIL replay,
9890075Sobrien *	it will be a specific txg.  Either way, dmu_tx_assign() never blocks.
9990075Sobrien *	This is critical because we don't want to block while holding locks.
10090075Sobrien *	Note, in particular, that if a lock is sometimes acquired before
10190075Sobrien *	the tx assigns, and sometimes after (e.g. z_lock), then failing to
102132718Skan *	use a non-blocking assign can deadlock the system.  The scenario:
10390075Sobrien *
10490075Sobrien *	Thread A has grabbed a lock before calling dmu_tx_assign().
105169689Skan *	Thread B is in an already-assigned tx, and blocks for this lock.
10690075Sobrien *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
10790075Sobrien *	forever, because the previous txg can't quiesce until B's tx commits.
10890075Sobrien *
109169689Skan *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
110169689Skan *	then drop all locks, call dmu_tx_wait(), and try again.
111169689Skan *
11290075Sobrien *  (5)	If the operation succeeded, generate the intent log entry for it
11390075Sobrien *	before dropping locks.  This ensures that the ordering of events
11490075Sobrien *	in the intent log matches the order in which they actually occurred.
11590075Sobrien *
11690075Sobrien *  (6)	At the end of each vnode op, the DMU tx must always commit,
11790075Sobrien *	regardless of whether there were any errors.
118117395Skan *
11990075Sobrien *  (7)	After dropping all locks, invoke zil_commit(zilog, seq, foid)
12090075Sobrien *	to ensure that synchronous semantics are provided when necessary.
12190075Sobrien *
12290075Sobrien * In general, this is how things should be ordered in each vnode op:
12390075Sobrien *
12490075Sobrien *	ZFS_ENTER(zfsvfs);		// exit if unmounted
12590075Sobrien * top:
12690075Sobrien *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may VN_HOLD())
12790075Sobrien *	rw_enter(...);			// grab any other locks you need
12890075Sobrien *	tx = dmu_tx_create(...);	// get DMU tx
129169689Skan *	dmu_tx_hold_*();		// hold each object you might modify
13090075Sobrien *	error = dmu_tx_assign(tx, zfsvfs->z_assign);	// try to assign
13190075Sobrien *	if (error) {
13290075Sobrien *		rw_exit(...);		// drop locks
13390075Sobrien *		zfs_dirent_unlock(dl);	// unlock directory entry
13490075Sobrien *		VN_RELE(...);		// release held vnodes
135132718Skan *		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
13690075Sobrien *			dmu_tx_wait(tx);
13790075Sobrien *			dmu_tx_abort(tx);
138169689Skan *			goto top;
13990075Sobrien *		}
14090075Sobrien *		dmu_tx_abort(tx);	// abort DMU tx
141169689Skan *		ZFS_EXIT(zfsvfs);	// finished in zfs
142169689Skan *		return (error);		// really out of space
14390075Sobrien *	}
144169689Skan *	error = do_real_work();		// do whatever this VOP does
14590075Sobrien *	if (error == 0)
14690075Sobrien *		zfs_log_*(...);		// on success, make ZIL entry
14790075Sobrien *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
14890075Sobrien *	rw_exit(...);			// drop locks
14990075Sobrien *	zfs_dirent_unlock(dl);		// unlock directory entry
15090075Sobrien *	VN_RELE(...);			// release held vnodes
151117395Skan *	zil_commit(zilog, seq, foid);	// synchronous when necessary
15290075Sobrien *	ZFS_EXIT(zfsvfs);		// finished in zfs
15390075Sobrien *	return (error);			// done, report error
15490075Sobrien */
15590075Sobrien/* ARGSUSED */
15690075Sobrienstatic int
15790075Sobrienzfs_open(vnode_t **vpp, int flag, cred_t *cr)
15890075Sobrien{
15990075Sobrien	znode_t	*zp = VTOZ(*vpp);
16090075Sobrien
16190075Sobrien	/* Keep a count of the synchronous opens in the znode */
16290075Sobrien	if (flag & (FSYNC | FDSYNC))
16390075Sobrien		atomic_inc_32(&zp->z_sync_cnt);
16490075Sobrien	return (0);
16590075Sobrien}
16690075Sobrien
16790075Sobrien/* ARGSUSED */
16890075Sobrienstatic int
16990075Sobrienzfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
17090075Sobrien{
17190075Sobrien	znode_t	*zp = VTOZ(vp);
17290075Sobrien
17390075Sobrien	/* Decrement the synchronous opens in the znode */
17490075Sobrien	if (flag & (FSYNC | FDSYNC))
17590075Sobrien		atomic_dec_32(&zp->z_sync_cnt);
17690075Sobrien
17790075Sobrien	/*
17890075Sobrien	 * Clean up any locks held by this process on the vp.
17990075Sobrien	 */
180169689Skan	cleanlocks(vp, ddi_get_pid(), 0);
181169689Skan	cleanshares(vp, ddi_get_pid());
18290075Sobrien
18390075Sobrien	return (0);
18490075Sobrien}
18590075Sobrien
18690075Sobrien/*
18790075Sobrien * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
18890075Sobrien * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
18990075Sobrien */
190static int
191zfs_holey(vnode_t *vp, u_long cmd, offset_t *off)
192{
193	znode_t	*zp = VTOZ(vp);
194	uint64_t noff = (uint64_t)*off; /* new offset */
195	uint64_t file_sz;
196	int error;
197	boolean_t hole;
198
199	file_sz = zp->z_phys->zp_size;
200	if (noff >= file_sz)  {
201		return (ENXIO);
202	}
203
204	if (cmd == _FIO_SEEK_HOLE)
205		hole = B_TRUE;
206	else
207		hole = B_FALSE;
208
209	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
210
211	/* end of file? */
212	if ((error == ESRCH) || (noff > file_sz)) {
213		/*
214		 * Handle the virtual hole at the end of file.
215		 */
216		if (hole) {
217			*off = file_sz;
218			return (0);
219		}
220		return (ENXIO);
221	}
222
223	if (noff < *off)
224		return (error);
225	*off = noff;
226	return (error);
227}
228
229/* ARGSUSED */
230static int
231zfs_ioctl(vnode_t *vp, u_long com, intptr_t data, int flag, cred_t *cred,
232    int *rvalp)
233{
234	offset_t off;
235	int error;
236	zfsvfs_t *zfsvfs;
237
238	switch (com) {
239	    case _FIOFFS:
240		return (0);
241
242		/*
243		 * The following two ioctls are used by bfu.  Faking out,
244		 * necessary to avoid bfu errors.
245		 */
246	    case _FIOGDIO:
247	    case _FIOSDIO:
248		return (0);
249
250	    case _FIO_SEEK_DATA:
251	    case _FIO_SEEK_HOLE:
252		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
253			return (EFAULT);
254
255		zfsvfs = VTOZ(vp)->z_zfsvfs;
256		ZFS_ENTER(zfsvfs);
257
258		/* offset parameter is in/out */
259		error = zfs_holey(vp, com, &off);
260		ZFS_EXIT(zfsvfs);
261		if (error)
262			return (error);
263		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
264			return (EFAULT);
265		return (0);
266	}
267	return (ENOTTY);
268}
269
/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Write:	If we find a memory mapped page, we write to *both*
 *		the page and the dmu buffer.
 *
 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 *	the file is memory mapped.
 */
static int
mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx)
{
	znode_t *zp = VTOZ(vp);
	objset_t *os = zp->z_zfsvfs->z_os;
	vm_object_t obj;
	vm_page_t m;
	struct sf_buf *sf;
	int64_t start, off;
	int len = nbytes;
	int error = 0;
	uint64_t dirbytes;	/* run of bytes with no resident page */

	ASSERT(vp->v_mount != NULL);
	obj = vp->v_object;
	ASSERT(obj != NULL);

	start = uio->uio_loffset;
	off = start & PAGEOFFSET;	/* offset within the first page */
	dirbytes = 0;
	VM_OBJECT_LOCK(obj);
	/* Walk the request one page at a time. */
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		uint64_t bytes = MIN(PAGESIZE - off, len);
		uint64_t fsize;

again:
		if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
		    vm_page_is_valid(m, (vm_offset_t)off, bytes)) {
			uint64_t woff;
			caddr_t va;

			/* Slept on a busy page: the lookup must be redone. */
			if (vm_page_sleep_if_busy(m, FALSE, "zfsmwb"))
				goto again;
			fsize = obj->un_pager.vnp.vnp_size;
			vm_page_busy(m);
			vm_page_lock_queues();
			vm_page_undirty(m);
			vm_page_unlock_queues();
			VM_OBJECT_UNLOCK(obj);
			/*
			 * First flush any accumulated non-resident run so the
			 * uio is consumed strictly in file order.
			 */
			if (dirbytes > 0) {
				error = dmu_write_uio(os, zp->z_id, uio,
				    dirbytes, tx);
				dirbytes = 0;
			}
			if (error == 0) {
				sched_pin();
				sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
				va = (caddr_t)sf_buf_kva(sf);
				woff = uio->uio_loffset - off;
				error = uiomove(va + off, bytes, UIO_WRITE, uio);
				/*
				 * The uiomove() above could have been partially
				 * successful, that's why we call dmu_write()
				 * below unconditionally. The page was marked
				 * non-dirty above and we would lose the changes
				 * without doing so. If the uiomove() failed
				 * entirely, well, we just write what we got
				 * before one more time.
				 */
				dmu_write(os, zp->z_id, woff,
				    MIN(PAGESIZE, fsize - woff), va, tx);
				sf_buf_free(sf);
				sched_unpin();
			}
			VM_OBJECT_LOCK(obj);
			vm_page_wakeup(m);
		} else {
			/* No resident page: defer to a batched dmu_write_uio(). */
			dirbytes += bytes;
		}
		len -= bytes;
		off = 0;	/* only the first page can start mid-page */
		if (error)
			break;
	}
	VM_OBJECT_UNLOCK(obj);
	/* Write out any trailing non-resident run. */
	if (error == 0 && dirbytes > 0)
		error = dmu_write_uio(os, zp->z_id, uio, dirbytes, tx);
	return (error);
}
359
/*
 * When a file is memory mapped, we must keep the IO data synchronized
 * between the DMU cache and the memory mapped pages.  What this means:
 *
 * On Read:	We "read" preferentially from memory mapped pages,
 *		else we default from the dmu buffer.
 *
 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 *	the file is memory mapped.
 */
static int
mappedread(vnode_t *vp, int nbytes, uio_t *uio)
{
	znode_t *zp = VTOZ(vp);
	objset_t *os = zp->z_zfsvfs->z_os;
	vm_object_t obj;
	vm_page_t m;
	struct sf_buf *sf;
	int64_t start, off;
	caddr_t va;
	int len = nbytes;
	int error = 0;
	uint64_t dirbytes;	/* run of bytes with no resident page */

	ASSERT(vp->v_mount != NULL);
	obj = vp->v_object;
	ASSERT(obj != NULL);

	start = uio->uio_loffset;
	off = start & PAGEOFFSET;	/* offset within the first page */
	dirbytes = 0;
	VM_OBJECT_LOCK(obj);
	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
		uint64_t bytes = MIN(PAGESIZE - off, len);

again:
		if ((m = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
		    vm_page_is_valid(m, (vm_offset_t)off, bytes)) {
			/* Resident, valid page: copy straight out of it. */
			if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb"))
				goto again;
			vm_page_busy(m);
			VM_OBJECT_UNLOCK(obj);
			/*
			 * First satisfy any accumulated non-resident run so
			 * the uio is consumed strictly in file order.
			 */
			if (dirbytes > 0) {
				error = dmu_read_uio(os, zp->z_id, uio,
				    dirbytes);
				dirbytes = 0;
			}
			if (error == 0) {
				sched_pin();
				sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
				va = (caddr_t)sf_buf_kva(sf);
				error = uiomove(va + off, bytes, UIO_READ, uio);
				sf_buf_free(sf);
				sched_unpin();
			}
			VM_OBJECT_LOCK(obj);
			vm_page_wakeup(m);
		} else if (m != NULL && uio->uio_segflg == UIO_NOCOPY) {
			/*
			 * The code below is here to make sendfile(2) work
			 * correctly with ZFS. As pointed out by ups@
			 * sendfile(2) should be changed to use VOP_GETPAGES(),
			 * but that would pessimize performance of sendfile/UFS,
			 * that's why this special case is handled in ZFS code.
			 */
			if (vm_page_sleep_if_busy(m, FALSE, "zfsmrb"))
				goto again;
			vm_page_busy(m);
			VM_OBJECT_UNLOCK(obj);
			if (dirbytes > 0) {
				error = dmu_read_uio(os, zp->z_id, uio,
				    dirbytes);
				dirbytes = 0;
			}
			if (error == 0) {
				sched_pin();
				sf = sf_buf_alloc(m, SFB_CPUPRIVATE);
				va = (caddr_t)sf_buf_kva(sf);
				/* Fill the page contents from the DMU. */
				error = dmu_read(os, zp->z_id, start + off,
				    bytes, (void *)(va + off));
				sf_buf_free(sf);
				sched_unpin();
			}
			VM_OBJECT_LOCK(obj);
			vm_page_wakeup(m);
			/* UIO_NOCOPY does no uiomove(); adjust resid by hand. */
			if (error == 0)
				uio->uio_resid -= bytes;
		} else {
			/* No resident page: defer to a batched dmu_read_uio(). */
			dirbytes += bytes;
		}
		len -= bytes;
		off = 0;	/* only the first page can start mid-page */
		if (error)
			break;
	}
	VM_OBJECT_UNLOCK(obj);
	/* Read any trailing non-resident run. */
	if (error == 0 && dirbytes > 0)
		error = dmu_read_uio(os, zp->z_id, uio, dirbytes);
	return (error);
}
460
offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */

/*
 * Read bytes from specified file into supplied buffer.
 *
 *	IN:	vp	- vnode of file to be read from.
 *		uio	- structure supplying read location, range info,
 *			  and return buffer.
 *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
 *		cr	- credentials of caller.
 *
 *	OUT:	uio	- updated offset and range, buffer filled.
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Side Effects:
 *	vp - atime updated if byte count > 0
 */
/* ARGSUSED */
static int
zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	objset_t	*os = zfsvfs->z_os;
	ssize_t		n, nbytes;
	int		error;
	rl_t		*rl;

	ZFS_ENTER(zfsvfs);

	/*
	 * Validate file offset
	 */
	if (uio->uio_loffset < (offset_t)0) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/*
	 * Fasttrack empty reads
	 */
	if (uio->uio_resid == 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	/*
	 * Check for mandatory locks
	 */
	if (MANDMODE((mode_t)zp->z_phys->zp_mode)) {
		if (error = chklock(vp, FREAD,
		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * If we're in FRSYNC mode, sync out this znode before reading it.
	 */
	if (ioflag & FRSYNC)
		zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);

	/*
	 * Lock the range against changes.
	 */
	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);

	/*
	 * If we are reading past end-of-file we can skip
	 * to the end; but we might still need to set atime.
	 */
	if (uio->uio_loffset >= zp->z_phys->zp_size) {
		error = 0;
		goto out;
	}

	ASSERT(uio->uio_loffset < zp->z_phys->zp_size);
	n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset);

	/*
	 * Read in chunks aligned to zfs_read_chunk_size so each
	 * dmu_read_uio()/mappedread() call stays reasonably sized.
	 */
	while (n > 0) {
		nbytes = MIN(n, zfs_read_chunk_size -
		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));

		/* Prefer mapped pages when the vnode has cached data. */
		if (vn_has_cached_data(vp))
			error = mappedread(vp, nbytes, uio);
		else
			error = dmu_read_uio(os, zp->z_id, uio, nbytes);
		if (error)
			break;

		n -= nbytes;
	}

out:
	zfs_range_unlock(rl);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);
	return (error);
}
564
565/*
566 * Fault in the pages of the first n bytes specified by the uio structure.
567 * 1 byte in each page is touched and the uio struct is unmodified.
568 * Any error will exit this routine as this is only a best
569 * attempt to get the pages resident. This is a copy of ufs_trans_touch().
570 */
571static void
572zfs_prefault_write(ssize_t n, struct uio *uio)
573{
574	struct iovec *iov;
575	ulong_t cnt, incr;
576	caddr_t p;
577
578	if (uio->uio_segflg != UIO_USERSPACE)
579		return;
580
581	iov = uio->uio_iov;
582
583	while (n) {
584		cnt = MIN(iov->iov_len, n);
585		if (cnt == 0) {
586			/* empty iov entry */
587			iov++;
588			continue;
589		}
590		n -= cnt;
591		/*
592		 * touch each page in this segment.
593		 */
594		p = iov->iov_base;
595		while (cnt) {
596			if (fubyte(p) == -1)
597				return;
598			incr = MIN(cnt, PAGESIZE);
599			p += incr;
600			cnt -= incr;
601		}
602		/*
603		 * touch the last byte in case it straddles a page.
604		 */
605		p--;
606		if (fubyte(p) == -1)
607			return;
608		iov++;
609	}
610}
611
/*
 * Write the bytes to a file.
 *
 *	IN:	vp	- vnode of file to be written to.
 *		uio	- structure supplying write location, range info,
 *			  and data buffer.
 *		ioflag	- IO_APPEND flag set if in append mode.
 *		cr	- credentials of caller.
 *
 *	OUT:	uio	- updated offset and range.
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	vp - ctime|mtime updated if byte count > 0
 */
/* ARGSUSED */
static int
zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	rlim64_t	limit = MAXOFFSET_T;
	ssize_t		start_resid = uio->uio_resid;
	ssize_t		tx_bytes;	/* bytes consumed by this tx's chunk */
	uint64_t	end_size;
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	zilog_t		*zilog = zfsvfs->z_log;
	offset_t	woff;
	ssize_t		n, nbytes;
	rl_t		*rl;
	int		max_blksz = zfsvfs->z_max_blksz;
	int		error;

	/*
	 * Fasttrack empty write
	 */
	n = start_resid;
	if (n == 0)
		return (0);

	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
		limit = MAXOFFSET_T;

	ZFS_ENTER(zfsvfs);

	/*
	 * Pre-fault the pages to ensure slow (eg NFS) pages
	 * don't hold up txg.
	 */
	zfs_prefault_write(n, uio);

	/*
	 * If in append mode, set the io offset pointer to eof.
	 */
	if (ioflag & IO_APPEND) {
		/*
		 * Range lock for a file append:
		 * The value for the start of range will be determined by
		 * zfs_range_lock() (to guarantee append semantics).
		 * If this write will cause the block size to increase,
		 * zfs_range_lock() will lock the entire file, so we must
		 * later reduce the range after we grow the block size.
		 */
		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
		if (rl->r_len == UINT64_MAX) {
			/* overlocked, zp_size can't change */
			woff = uio->uio_loffset = zp->z_phys->zp_size;
		} else {
			woff = uio->uio_loffset = rl->r_off;
		}
	} else {
		woff = uio->uio_loffset;
		/*
		 * Validate file offset
		 */
		if (woff < 0) {
			ZFS_EXIT(zfsvfs);
			return (EINVAL);
		}

		/*
		 * If we need to grow the block size then zfs_range_lock()
		 * will lock a wider range than we request here.
		 * Later after growing the block size we reduce the range.
		 */
		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
	}

	if (woff >= limit) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (EFBIG);
	}

	/* Clamp the byte count so the write cannot extend past the limit. */
	if ((woff + n) > limit || woff > (limit - n))
		n = limit - woff;

	/*
	 * Check for mandatory locks
	 */
	if (MANDMODE((mode_t)zp->z_phys->zp_mode) &&
	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	end_size = MAX(zp->z_phys->zp_size, woff + n);

	/*
	 * Write the file in reasonable size chunks.  Each chunk is written
	 * in a separate transaction; this keeps the intent log records small
	 * and allows us to do more fine-grained space accounting.
	 */
	while (n > 0) {
		/*
		 * Start a transaction.
		 */
		woff = uio->uio_loffset;
		tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_bonus(tx, zp->z_id);
		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
		error = dmu_tx_assign(tx, zfsvfs->z_assign);
		if (error) {
			/* Per Big Rule (4): wait, then retry the chunk. */
			if (error == ERESTART &&
			    zfsvfs->z_assign == TXG_NOWAIT) {
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				continue;
			}
			dmu_tx_abort(tx);
			break;
		}

		/*
		 * If zfs_range_lock() over-locked we grow the blocksize
		 * and then reduce the lock range.  This will only happen
		 * on the first iteration since zfs_range_reduce() will
		 * shrink down r_len to the appropriate size.
		 */
		if (rl->r_len == UINT64_MAX) {
			uint64_t new_blksz;

			if (zp->z_blksz > max_blksz) {
				ASSERT(!ISP2(zp->z_blksz));
				new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
			} else {
				new_blksz = MIN(end_size, max_blksz);
			}
			zfs_grow_blocksize(zp, new_blksz, tx);
			zfs_range_reduce(rl, woff, n);
		}

		/*
		 * XXX - should we really limit each write to z_max_blksz?
		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
		 */
		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
		rw_enter(&zp->z_map_lock, RW_READER);

		/* Grow the vnode pager size before writing past old EOF. */
		if (woff + nbytes > zp->z_phys->zp_size)
			vnode_pager_setsize(vp, woff + nbytes);

		tx_bytes = uio->uio_resid;
		if (vn_has_cached_data(vp)) {
			rw_exit(&zp->z_map_lock);
			error = mappedwrite(vp, nbytes, uio, tx);
		} else {
			error = dmu_write_uio(zfsvfs->z_os, zp->z_id,
			    uio, nbytes, tx);
			rw_exit(&zp->z_map_lock);
		}
		tx_bytes -= uio->uio_resid;

		/*
		 * If we made no progress, we're done.  If we made even
		 * partial progress, update the znode and ZIL accordingly.
		 */
		if (tx_bytes == 0) {
			dmu_tx_commit(tx);
			ASSERT(error != 0);
			break;
		}

		/*
		 * Clear Set-UID/Set-GID bits on successful write if not
		 * privileged and at least one of the execute bits is set.
		 *
		 * It would be nice to do this after all writes have
		 * been done, but that would still expose the ISUID/ISGID
		 * to another app after the partial write is committed.
		 */
		mutex_enter(&zp->z_acl_lock);
		if ((zp->z_phys->zp_mode & (S_IXUSR | (S_IXUSR >> 3) |
		    (S_IXUSR >> 6))) != 0 &&
		    (zp->z_phys->zp_mode & (S_ISUID | S_ISGID)) != 0 &&
		    secpolicy_vnode_setid_retain(cr,
		    (zp->z_phys->zp_mode & S_ISUID) != 0 &&
		    zp->z_phys->zp_uid == 0) != 0) {
			    zp->z_phys->zp_mode &= ~(S_ISUID | S_ISGID);
		}
		mutex_exit(&zp->z_acl_lock);

		/*
		 * Update time stamp.  NOTE: This marks the bonus buffer as
		 * dirty, so we don't have to do it again for zp_size.
		 */
		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);

		/*
		 * Update the file size (zp_size) if it has changed;
		 * account for possible concurrent updates.
		 */
		while ((end_size = zp->z_phys->zp_size) < uio->uio_loffset)
			(void) atomic_cas_64(&zp->z_phys->zp_size, end_size,
			    uio->uio_loffset);
		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
		dmu_tx_commit(tx);

		if (error != 0)
			break;
		ASSERT(tx_bytes == nbytes);
		n -= nbytes;
	}

	zfs_range_unlock(rl);

	/*
	 * If we're in replay mode, or we made no progress, return error.
	 * Otherwise, it's at least a partial write, so it's successful.
	 */
	if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/* Synchronous open (FSYNC/FDSYNC): commit the log before returning. */
	if (ioflag & (FSYNC | FDSYNC))
		zil_commit(zilog, zp->z_last_itx, zp->z_id);

	ZFS_EXIT(zfsvfs);
	return (0);
}
855
/*
 * Completion callback for dmu_sync() writes issued by zfs_get_data():
 * releases the dbuf, the range lock and the vnode hold taken there,
 * records the vdev written to with the ZIL, and frees the zgd
 * bookkeeping structure.
 */
void
zfs_get_done(dmu_buf_t *db, void *vzgd)
{
	zgd_t *zgd = (zgd_t *)vzgd;
	rl_t *rl = zgd->zgd_rl;
	vnode_t *vp = ZTOV(rl->r_zp);
	int vfslocked;

	vfslocked = VFS_LOCK_GIANT(vp->v_vfsp);
	dmu_buf_rele(db, vzgd);
	zfs_range_unlock(rl);
	VN_RELE(vp);
	/* Tell the ZIL which vdev holds the block it points to. */
	zil_add_vdev(zgd->zgd_zilog, DVA_GET_VDEV(BP_IDENTITY(zgd->zgd_bp)));
	kmem_free(zgd, sizeof (zgd_t));
	VFS_UNLOCK_GIANT(vfslocked);
}
872
/*
 * Get data to generate a TX_WRITE intent log record.
 */
int
zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
	zfsvfs_t *zfsvfs = arg;
	objset_t *os = zfsvfs->z_os;
	znode_t *zp;
	uint64_t off = lr->lr_offset;
	dmu_buf_t *db;
	rl_t *rl;
	zgd_t *zgd;
	int dlen = lr->lr_length;		/* length of user data */
	int error = 0;

	ASSERT(zio);
	ASSERT(dlen != 0);

	/*
	 * Nothing to do if the file has been removed
	 */
	if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0)
		return (ENOENT);
	if (zp->z_unlinked) {
		VN_RELE(ZTOV(zp));
		return (ENOENT);
	}

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) { /* immediate write */
		rl = zfs_range_lock(zp, off, dlen, RL_READER);
		/* test for truncation needs to be done while range locked */
		if (off >= zp->z_phys->zp_size) {
			error = ENOENT;
			goto out;
		}
		VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf));
	} else { /* indirect write */
		uint64_t boff; /* block starting offset */

		/*
		 * Have to lock the whole block to ensure when it's
		 * written out and its checksum is being calculated
		 * that no one can change the data. We need to re-check
		 * blocksize after we get the lock in case it's changed!
		 */
		for (;;) {
			if (ISP2(zp->z_blksz)) {
				boff = P2ALIGN_TYPED(off, zp->z_blksz,
				    uint64_t);
			} else {
				/*
				 * Non-power-of-2 block size: lock from the
				 * start of the file (presumably a single
				 * block covers it — see ASSERT below).
				 */
				boff = 0;
			}
			dlen = zp->z_blksz;
			rl = zfs_range_lock(zp, boff, dlen, RL_READER);
			if (zp->z_blksz == dlen)
				break;
			/* Block size changed while waiting; retry. */
			zfs_range_unlock(rl);
		}
		/* test for truncation needs to be done while range locked */
		if (off >= zp->z_phys->zp_size) {
			error = ENOENT;
			goto out;
		}
		zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP);
		zgd->zgd_rl = rl;
		zgd->zgd_zilog = zfsvfs->z_log;
		zgd->zgd_bp = &lr->lr_blkptr;
		VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db));
		ASSERT(boff == db->db_offset);
		lr->lr_blkoff = off - boff;
		error = dmu_sync(zio, db, &lr->lr_blkptr,
		    lr->lr_common.lrc_txg, zfs_get_done, zgd);
		ASSERT(error == EEXIST || lr->lr_length <= zp->z_blksz);
		if (error == 0) {
			zil_add_vdev(zfsvfs->z_log,
			    DVA_GET_VDEV(BP_IDENTITY(&lr->lr_blkptr)));
		}
		/*
		 * If we get EINPROGRESS, then we need to wait for a
		 * write IO initiated by dmu_sync() to complete before
		 * we can release this dbuf.  We will finish everything
		 * up in the zfs_get_done() callback.
		 */
		if (error == EINPROGRESS)
			return (0);
		dmu_buf_rele(db, zgd);
		kmem_free(zgd, sizeof (zgd_t));
	}
out:
	zfs_range_unlock(rl);
	VN_RELE(ZTOV(zp));
	return (error);
}
974
/*
 * Check access permissions on a vnode.
 *
 *	IN:	vp	- vnode to check.
 *		mode	- requested access mode bits.
 *		flags	- unused here.
 *		cr	- credentials of caller.
 *
 *	RETURN:	0 if access is allowed, error code otherwise.
 */
/*ARGSUSED*/
static int
zfs_access(vnode_t *vp, int mode, int flags, cred_t *cr)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	ZFS_ENTER(zfsvfs);
	/* The actual mode/ACL evaluation lives in zfs_zaccess_rwx(). */
	error = zfs_zaccess_rwx(zp, mode, cr);
	ZFS_EXIT(zfsvfs);
	return (error);
}
988
989/*
990 * Lookup an entry in a directory, or an extended attribute directory.
991 * If it exists, return a held vnode reference for it.
992 *
993 *	IN:	dvp	- vnode of directory to search.
994 *		nm	- name of entry to lookup.
995 *		pnp	- full pathname to lookup [UNUSED].
996 *		flags	- LOOKUP_XATTR set if looking for an attribute.
997 *		rdir	- root directory vnode [UNUSED].
998 *		cr	- credentials of caller.
999 *
1000 *	OUT:	vpp	- vnode of located entry, NULL if not found.
1001 *
1002 *	RETURN:	0 if success
1003 *		error code if failure
1004 *
1005 * Timestamps:
1006 *	NA
1007 */
/* ARGSUSED */
static int
zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
     int nameiop, cred_t *cr, kthread_t *td)
{

	znode_t *zdp = VTOZ(dvp);
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
	int	error;

	ZFS_ENTER(zfsvfs);

	*vpp = NULL;

	/*
	 * Extended attribute directory lookups are not yet ported to
	 * FreeBSD (the Solaris code below references `flags', which is
	 * not in this function's signature).
	 */
#ifdef TODO
	if (flags & LOOKUP_XATTR) {
		/*
		 * If the xattr property is off, refuse the lookup request.
		 */
		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
			ZFS_EXIT(zfsvfs);
			return (EINVAL);
		}

		/*
		 * We don't allow recursive attributes.
		 * Maybe someday we will.
		 */
		if (zdp->z_phys->zp_flags & ZFS_XATTR) {
			ZFS_EXIT(zfsvfs);
			return (EINVAL);
		}

		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Do we have permission to get into attribute directory?
		 */

		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, cr)) {
			VN_RELE(*vpp);
		}

		ZFS_EXIT(zfsvfs);
		return (error);
	}
#endif	/* TODO */

	if (dvp->v_type != VDIR) {
		ZFS_EXIT(zfsvfs);
		return (ENOTDIR);
	}

	/*
	 * Check accessibility of directory.
	 */

	if (error = zfs_zaccess(zdp, ACE_EXECUTE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if ((error = zfs_dirlook(zdp, nm, vpp)) == 0) {

		/*
		 * Convert device special files
		 */
		if (IS_DEVVP(*vpp)) {
			vnode_t	*svp;

			svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
			VN_RELE(*vpp);
			if (svp == NULL)
				error = ENOSYS;
			else
				*vpp = svp;
		}
	}

	ZFS_EXIT(zfsvfs);

	/* Translate errors and add SAVENAME when needed. */
	if (cnp->cn_flags & ISLASTCN) {
		switch (nameiop) {
		case CREATE:
		case RENAME:
			if (error == ENOENT) {
				/*
				 * ENOENT on the last component of a CREATE
				 * or RENAME is not an error to namei: report
				 * EJUSTRETURN and keep the name buffer for
				 * the upcoming create/rename VOP.
				 */
				error = EJUSTRETURN;
				cnp->cn_flags |= SAVENAME;
				break;
			}
			/* FALLTHROUGH */
		case DELETE:
			if (error == 0)
				cnp->cn_flags |= SAVENAME;
			break;
		}
	}
	if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) {
		/*
		 * Lock the resulting vnode.  For a ".." lookup we must
		 * drop the child directory's lock before locking the
		 * parent (and then re-take it) to preserve the
		 * parent-before-child lock order and avoid deadlock.
		 */
		if (cnp->cn_flags & ISDOTDOT)
			VOP_UNLOCK(dvp, 0, td);
		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);
		if (cnp->cn_flags & ISDOTDOT)
			vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td);
	}

#ifdef FREEBSD_NAMECACHE
	/*
	 * Insert name into cache (as non-existent) if appropriate.
	 */
	if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
		cache_enter(dvp, *vpp, cnp);
	/*
	 * Insert name into cache if appropriate.
	 */
	if (error == 0 && (cnp->cn_flags & MAKEENTRY)) {
		if (!(cnp->cn_flags & ISLASTCN) ||
		    (nameiop != DELETE && nameiop != RENAME)) {
			cache_enter(dvp, *vpp, cnp);
		}
	}
#endif

	return (error);
}
1136
1137/*
1138 * Attempt to create a new entry in a directory.  If the entry
1139 * already exists, truncate the file if permissible, else return
1140 * an error.  Return the vp of the created or trunc'd file.
1141 *
1142 *	IN:	dvp	- vnode of directory to put new file entry in.
1143 *		name	- name of new file entry.
1144 *		vap	- attributes of new file.
1145 *		excl	- flag indicating exclusive or non-exclusive mode.
1146 *		mode	- mode to open file with.
1147 *		cr	- credentials of caller.
1148 *		flag	- large file flag [UNUSED].
1149 *
1150 *	OUT:	vpp	- vnode of created or trunc'd entry.
1151 *
1152 *	RETURN:	0 if success
1153 *		error code if failure
1154 *
1155 * Timestamps:
1156 *	dvp - ctime|mtime updated if new entry created
1157 *	 vp - ctime|mtime always, atime if new
1158 */
/* ARGSUSED */
static int
zfs_create(vnode_t *dvp, char *name, vattr_t *vap, int excl, int mode,
    vnode_t **vpp, cred_t *cr, kthread_t *td)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog = zfsvfs->z_log;
	objset_t	*os = zfsvfs->z_os;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	int		error;
	uint64_t	zoid;

	ZFS_ENTER(zfsvfs);

	/*
	 * Restart point: we come back here after dropping the dirent
	 * lock and waiting when dmu_tx_assign() returns ERESTART.
	 */
top:
	*vpp = NULL;

	/* Strip the sticky bit if the caller may not set it. */
	if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr))
		vap->va_mode &= ~VSVTX;

	if (*name == '\0') {
		/*
		 * Null component name refers to the directory itself.
		 */
		VN_HOLD(dvp);
		zp = dzp;
		dl = NULL;
		error = 0;
	} else {
		/* possible VN_HOLD(zp) */
		if (error = zfs_dirent_lock(&dl, dzp, name, &zp, 0)) {
			if (strcmp(name, "..") == 0)
				error = EISDIR;
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	zoid = zp ? zp->z_id : -1ULL;

	if (zp == NULL) {
		/*
		 * Create a new file object and update the directory
		 * to reference it.
		 */
		if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
			goto out;
		}

		/*
		 * We only support the creation of regular files in
		 * extended attribute directories.
		 */
		if ((dzp->z_phys->zp_flags & ZFS_XATTR) &&
		    (vap->va_type != VREG)) {
			error = EINVAL;
			goto out;
		}

		tx = dmu_tx_create(os);
		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
		dmu_tx_hold_bonus(tx, dzp->z_id);
		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
		/* Inherited ACLs may need extra space for the new node. */
		if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
			    0, SPA_MAXBLOCKSIZE);
		error = dmu_tx_assign(tx, zfsvfs->z_assign);
		if (error) {
			/* Drop the lock before waiting, then retry. */
			zfs_dirent_unlock(dl);
			if (error == ERESTART &&
			    zfsvfs->z_assign == TXG_NOWAIT) {
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				goto top;
			}
			dmu_tx_abort(tx);
			ZFS_EXIT(zfsvfs);
			return (error);
		}
		zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);
		ASSERT(zp->z_id == zoid);
		(void) zfs_link_create(dl, zp, tx, ZNEW);
		zfs_log_create(zilog, tx, TX_CREATE, dzp, zp, name);
		dmu_tx_commit(tx);
	} else {
		/*
		 * A directory entry already exists for this name.
		 */
		/*
		 * Can't truncate an existing file if in exclusive mode.
		 */
		if (excl == EXCL) {
			error = EEXIST;
			goto out;
		}
		/*
		 * Can't open a directory for writing.
		 */
		if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
			error = EISDIR;
			goto out;
		}
		/*
		 * Verify requested access to file.
		 */
		if (mode && (error = zfs_zaccess_rwx(zp, mode, cr))) {
			goto out;
		}

		/* Bump the directory sequence number (attr cache hint). */
		mutex_enter(&dzp->z_lock);
		dzp->z_seq++;
		mutex_exit(&dzp->z_lock);

		/*
		 * Truncate regular files if requested.
		 */
		if ((ZTOV(zp)->v_type == VREG) &&
		    (zp->z_phys->zp_size != 0) &&
		    (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
			error = zfs_freesp(zp, 0, 0, mode, TRUE);
			if (error == ERESTART &&
			    zfsvfs->z_assign == TXG_NOWAIT) {
				/* NB: we already did dmu_tx_wait() */
				zfs_dirent_unlock(dl);
				VN_RELE(ZTOV(zp));
				goto top;
			}
		}
	}
out:

	if (error == 0) {
		*vpp = ZTOV(zp);
		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);
	}

	if (dl)
		zfs_dirent_unlock(dl);

	if (error) {
		if (zp)
			VN_RELE(ZTOV(zp));
	} else {
		/*
		 * NOTE(review): *vpp was already set (and locked) above
		 * when error == 0; this reassignment is redundant but
		 * harmless.
		 */
		*vpp = ZTOV(zp);
		/*
		 * If vnode is for a device return a specfs vnode instead.
		 */
		if (IS_DEVVP(*vpp)) {
			struct vnode *svp;

			svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
			VN_RELE(*vpp);
			if (svp == NULL) {
				error = ENOSYS;
			}
			*vpp = svp;
		}
	}

	ZFS_EXIT(zfsvfs);
	return (error);
}
1323
1324/*
1325 * Remove an entry from a directory.
1326 *
1327 *	IN:	dvp	- vnode of directory to remove entry from.
1328 *		name	- name of entry to remove.
1329 *		cr	- credentials of caller.
1330 *
1331 *	RETURN:	0 if success
1332 *		error code if failure
1333 *
1334 * Timestamps:
1335 *	dvp - ctime|mtime
1336 *	 vp - ctime (if nlink > 0)
1337 */
static int
zfs_remove(vnode_t *dvp, char *name, cred_t *cr)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	znode_t		*xzp = NULL;
	vnode_t		*vp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog = zfsvfs->z_log;
	uint64_t	acl_obj, xattr_obj;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	boolean_t	may_delete_now, delete_now = FALSE;
	boolean_t	unlinked;
	int		error;

	ZFS_ENTER(zfsvfs);

	/* Restart point for dmu_tx_assign() ERESTART retries. */
top:
	/*
	 * Attempt to lock directory; fail if entry doesn't exist.
	 */
	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	vp = ZTOV(zp);

	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
		goto out;
	}

	/*
	 * Need to use rmdir for removing directories.
	 */
	if (vp->v_type == VDIR) {
		error = EPERM;
		goto out;
	}

	vnevent_remove(vp);

	dnlc_remove(dvp, name);

	/*
	 * NOTE(review): the "delete immediately" optimization is
	 * hard-disabled here (upstream computes may_delete_now from the
	 * vnode hold count).  With may_delete_now forced FALSE, the
	 * conditional dmu_tx_hold_free() calls below never fire and
	 * delete_now stays FALSE, so an unlinked znode always goes on
	 * the unlinked set instead — presumably a FreeBSD-port
	 * workaround; confirm before re-enabling.
	 */
	may_delete_now = FALSE;

	/*
	 * We may delete the znode now, or we may put it in the unlinked set;
	 * it depends on whether we're the last link, and on whether there are
	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
	 * allow for either case.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_bonus(tx, zp->z_id);
	if (may_delete_now)
		dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);

	/* are there any extended attributes? */
	if ((xattr_obj = zp->z_phys->zp_xattr) != 0) {
		/* XXX - do we need this if we are deleting? */
		dmu_tx_hold_bonus(tx, xattr_obj);
	}

	/* are there any additional acls */
	if ((acl_obj = zp->z_phys->zp_acl.z_acl_extern_obj) != 0 &&
	    may_delete_now)
		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);

	/* charge as an update -- would be nice not to charge at all */
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);

	error = dmu_tx_assign(tx, zfsvfs->z_assign);
	if (error) {
		/* Drop locks/holds before a possible wait-and-retry. */
		zfs_dirent_unlock(dl);
		VN_RELE(vp);
		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Remove the directory entry.
	 */
	error = zfs_link_destroy(dl, zp, tx, 0, &unlinked);

	if (error) {
		dmu_tx_commit(tx);
		goto out;
	}

	/*
	 * NOTE(review): "0 &&" makes this block unreachable (see the
	 * may_delete_now note above); delete_now remains FALSE.
	 */
	if (0 && unlinked) {
		VI_LOCK(vp);
		delete_now = may_delete_now &&
		    vp->v_count == 1 && !vn_has_cached_data(vp) &&
		    zp->z_phys->zp_xattr == xattr_obj &&
		    zp->z_phys->zp_acl.z_acl_extern_obj == acl_obj;
		VI_UNLOCK(vp);
	}

	if (delete_now) {
		/* Detach and unlink any extended attribute directory. */
		if (zp->z_phys->zp_xattr) {
			error = zfs_zget(zfsvfs, zp->z_phys->zp_xattr, &xzp);
			ASSERT3U(error, ==, 0);
			ASSERT3U(xzp->z_phys->zp_links, ==, 2);
			dmu_buf_will_dirty(xzp->z_dbuf, tx);
			mutex_enter(&xzp->z_lock);
			xzp->z_unlinked = 1;
			xzp->z_phys->zp_links = 0;
			mutex_exit(&xzp->z_lock);
			zfs_unlinked_add(xzp, tx);
			zp->z_phys->zp_xattr = 0; /* probably unnecessary */
		}
		mutex_enter(&zp->z_lock);
		VI_LOCK(vp);
		vp->v_count--;
		ASSERT3U(vp->v_count, ==, 0);
		VI_UNLOCK(vp);
		mutex_exit(&zp->z_lock);
		zfs_znode_delete(zp, tx);
		VFS_RELE(zfsvfs->z_vfs);
	} else if (unlinked) {
		/* Defer destruction: queue on the unlinked set. */
		zfs_unlinked_add(zp, tx);
	}

	zfs_log_remove(zilog, tx, TX_REMOVE, dzp, name);

	dmu_tx_commit(tx);
out:
	zfs_dirent_unlock(dl);

	if (!delete_now) {
		VN_RELE(vp);
	} else if (xzp) {
		/* this rele delayed to prevent nesting transactions */
		VN_RELE(ZTOV(xzp));
	}

	ZFS_EXIT(zfsvfs);
	return (error);
}
1484
1485/*
1486 * Create a new directory and insert it into dvp using the name
1487 * provided.  Return a pointer to the inserted directory.
1488 *
1489 *	IN:	dvp	- vnode of directory to add subdir to.
1490 *		dirname	- name of new directory.
1491 *		vap	- attributes of new directory.
1492 *		cr	- credentials of caller.
1493 *
1494 *	OUT:	vpp	- vnode of created directory.
1495 *
1496 *	RETURN:	0 if success
1497 *		error code if failure
1498 *
1499 * Timestamps:
1500 *	dvp - ctime|mtime updated
1501 *	 vp - ctime|mtime|atime updated
1502 */
static int
zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog = zfsvfs->z_log;
	zfs_dirlock_t	*dl;
	uint64_t	zoid = 0;
	dmu_tx_t	*tx;
	int		error;

	ASSERT(vap->va_type == VDIR);

	ZFS_ENTER(zfsvfs);

	/* Subdirectories are not allowed inside xattr directories. */
	if (dzp->z_phys->zp_flags & ZFS_XATTR) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}
	/* Restart point for dmu_tx_assign() ERESTART retries. */
top:
	*vpp = NULL;

	/*
	 * First make sure the new directory doesn't exist.
	 */
	if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, ZNEW)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, cr)) {
		zfs_dirent_unlock(dl);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Add a new entry to the directory.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
	/* Inherited ACLs may need extra space for the new directory. */
	if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
		    0, SPA_MAXBLOCKSIZE);
	error = dmu_tx_assign(tx, zfsvfs->z_assign);
	if (error) {
		zfs_dirent_unlock(dl);
		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Create new node.
	 */
	zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);

	/*
	 * Now put new name in parent dir.
	 */
	(void) zfs_link_create(dl, zp, tx, ZNEW);

	*vpp = ZTOV(zp);

	zfs_log_create(zilog, tx, TX_MKDIR, dzp, zp, dirname);
	dmu_tx_commit(tx);

	/* Return the new directory locked, per FreeBSD VOP convention. */
	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, curthread);

	zfs_dirent_unlock(dl);

	ZFS_EXIT(zfsvfs);
	return (0);
}
1583
1584/*
1585 * Remove a directory subdir entry.  If the current working
1586 * directory is the same as the subdir to be removed, the
1587 * remove will fail.
1588 *
1589 *	IN:	dvp	- vnode of directory to remove from.
1590 *		name	- name of directory to be removed.
1591 *		cwd	- vnode of current working directory.
1592 *		cr	- credentials of caller.
1593 *
1594 *	RETURN:	0 if success
1595 *		error code if failure
1596 *
1597 * Timestamps:
1598 *	dvp - ctime|mtime updated
1599 */
static int
zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr)
{
	znode_t		*dzp = VTOZ(dvp);
	znode_t		*zp;
	vnode_t		*vp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog = zfsvfs->z_log;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	int		error;

	ZFS_ENTER(zfsvfs);

	/* Restart point for dmu_tx_assign() ERESTART retries. */
top:
	zp = NULL;

	/*
	 * Attempt to lock directory; fail if entry doesn't exist.
	 */
	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZEXISTS)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	vp = ZTOV(zp);

	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
		goto out;
	}

	if (vp->v_type != VDIR) {
		error = ENOTDIR;
		goto out;
	}

	/* Refuse to remove the caller's current working directory. */
	if (vp == cwd) {
		error = EINVAL;
		goto out;
	}

	vnevent_rmdir(vp);

	/*
	 * Grab a lock on the directory to make sure that noone is
	 * trying to add (or lookup) entries while we are removing it.
	 */
	rw_enter(&zp->z_name_lock, RW_WRITER);

	/*
	 * Grab a lock on the parent pointer to make sure we play well
	 * with the treewalk and directory rename code.
	 */
	rw_enter(&zp->z_parent_lock, RW_WRITER);

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_bonus(tx, zp->z_id);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	error = dmu_tx_assign(tx, zfsvfs->z_assign);
	if (error) {
		/* Unwind locks/holds before a possible wait-and-retry. */
		rw_exit(&zp->z_parent_lock);
		rw_exit(&zp->z_name_lock);
		zfs_dirent_unlock(dl);
		VN_RELE(vp);
		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

#ifdef FREEBSD_NAMECACHE
	cache_purge(dvp);
#endif

	error = zfs_link_destroy(dl, zp, tx, 0, NULL);

	if (error == 0)
		zfs_log_remove(zilog, tx, TX_RMDIR, dzp, name);

	dmu_tx_commit(tx);

	rw_exit(&zp->z_parent_lock);
	rw_exit(&zp->z_name_lock);
#ifdef FREEBSD_NAMECACHE
	cache_purge(vp);
#endif
out:
	zfs_dirent_unlock(dl);

	VN_RELE(vp);

	ZFS_EXIT(zfsvfs);
	return (error);
}
1699
1700/*
1701 * Read as many directory entries as will fit into the provided
1702 * buffer from the given directory cursor position (specified in
 * the uio structure).
1704 *
1705 *	IN:	vp	- vnode of directory to read.
1706 *		uio	- structure supplying read location, range info,
1707 *			  and return buffer.
1708 *		cr	- credentials of caller.
1709 *
1710 *	OUT:	uio	- updated offset and range, buffer filled.
1711 *		eofp	- set to true if end-of-file detected.
1712 *
1713 *	RETURN:	0 if success
1714 *		error code if failure
1715 *
1716 * Timestamps:
1717 *	vp - atime updated
1718 *
1719 * Note that the low 4 bits of the cookie returned by zap is always zero.
1720 * This allows us to use the low range for "special" directory entries:
1721 * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
1722 * we use the offset 2 for the '.zfs' directory.
1723 */
/* ARGSUSED */
static int
zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, u_long **cookies)
{
	znode_t		*zp = VTOZ(vp);
	iovec_t		*iovp;
	dirent64_t	*odp;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	objset_t	*os;
	caddr_t		outbuf;
	size_t		bufsize;
	zap_cursor_t	zc;
	zap_attribute_t	zap;
	uint_t		bytes_wanted;
	uint64_t	offset; /* must be unsigned; checks for < 1 */
	int		local_eof;
	int		outcount;
	int		error;
	uint8_t		prefetch;
	uint8_t		type;
	int		ncooks;
	u_long		*cooks = NULL;

	ZFS_ENTER(zfsvfs);

	/*
	 * If we are not given an eof variable,
	 * use a local one.
	 */
	if (eofp == NULL)
		eofp = &local_eof;

	/*
	 * Check for valid iov_len.
	 */
	if (uio->uio_iov->iov_len <= 0) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/*
	 * Quit if directory has been removed (posix)
	 */
	if ((*eofp = zp->z_unlinked) != 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	error = 0;
	os = zfsvfs->z_os;
	offset = uio->uio_loffset;
	prefetch = zp->z_zn_prefetch;

	/*
	 * Initialize the iterator cursor.  Offsets 0-3 are reserved for
	 * the synthetic ".", "..", and ".zfs" entries (see the block
	 * comment above); anything larger is a serialized ZAP cursor.
	 */
	if (offset <= 3) {
		/*
		 * Start iteration from the beginning of the directory.
		 */
		zap_cursor_init(&zc, os, zp->z_id);
	} else {
		/*
		 * The offset is a serialized cursor.
		 */
		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
	}

	/*
	 * Get space to change directory entries into fs independent format.
	 * For a single kernel-space iovec we can fill the caller's buffer
	 * directly; otherwise stage into a temporary buffer and uiomove it.
	 */
	iovp = uio->uio_iov;
	bytes_wanted = iovp->iov_len;
	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
		bufsize = bytes_wanted;
		outbuf = kmem_alloc(bufsize, KM_SLEEP);
		odp = (struct dirent64 *)outbuf;
	} else {
		bufsize = bytes_wanted;
		odp = (struct dirent64 *)iovp->iov_base;
	}

	if (ncookies != NULL) {
		/*
		 * Minimum entry size is dirent size and 1 byte for a file name.
		 */
		ncooks = uio->uio_resid / (sizeof(struct dirent) - sizeof(((struct dirent *)NULL)->d_name) + 1);
		cooks = malloc(ncooks * sizeof(u_long), M_TEMP, M_WAITOK);
		*cookies = cooks;
		*ncookies = ncooks;
	}

	/*
	 * Transform to file-system independent format
	 */
	outcount = 0;
	while (outcount < bytes_wanted) {
		ino64_t objnum;
		ushort_t reclen;

		/*
		 * Special case `.', `..', and `.zfs'.
		 */
		if (offset == 0) {
			(void) strcpy(zap.za_name, ".");
			objnum = zp->z_id;
			type = DT_DIR;
		} else if (offset == 1) {
			(void) strcpy(zap.za_name, "..");
			objnum = zp->z_phys->zp_parent;
			type = DT_DIR;
		} else if (offset == 2 && zfs_show_ctldir(zp)) {
			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
			objnum = ZFSCTL_INO_ROOT;
			type = DT_DIR;
		} else {
			/*
			 * Grab next entry.
			 */
			if (error = zap_cursor_retrieve(&zc, &zap)) {
				/* ENOENT from the cursor means end of dir. */
				if ((*eofp = (error == ENOENT)) != 0)
					break;
				else
					goto update;
			}

			if (zap.za_integer_length != 8 ||
			    zap.za_num_integers != 1) {
				cmn_err(CE_WARN, "zap_readdir: bad directory "
				    "entry, obj = %lld, offset = %lld\n",
				    (u_longlong_t)zp->z_id,
				    (u_longlong_t)offset);
				error = ENXIO;
				goto update;
			}

			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
			/*
			 * MacOS X can extract the object type here such as:
			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
			 */
			type = ZFS_DIRENT_TYPE(zap.za_first_integer);
		}
		reclen = DIRENT64_RECLEN(strlen(zap.za_name));

		/*
		 * Will this entry fit in the buffer?
		 */
		if (outcount + reclen > bufsize) {
			/*
			 * Did we manage to fit anything in the buffer?
			 */
			if (!outcount) {
				error = EINVAL;
				goto update;
			}
			break;
		}
		/*
		 * Add this entry:
		 */
		odp->d_ino = objnum;
		odp->d_reclen = reclen;
		odp->d_namlen = strlen(zap.za_name);
		(void) strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
		odp->d_type = type;
		outcount += reclen;
		odp = (dirent64_t *)((intptr_t)odp + reclen);

		ASSERT(outcount <= bufsize);

		/* Prefetch znode */
		if (prefetch)
			dmu_prefetch(os, objnum, 0, 0);

		/*
		 * Move to the next entry, fill in the previous offset.
		 */
		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
			zap_cursor_advance(&zc);
			offset = zap_cursor_serialize(&zc);
		} else {
			offset += 1;
		}

		/* Record the next-entry offset as this entry's cookie. */
		if (cooks != NULL) {
			*cooks++ = offset;
			ncooks--;
			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
		}
	}
	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */

	/* Subtract unused cookies */
	if (ncookies != NULL)
		*ncookies -= ncooks;

	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
		/* We filled the caller's buffer in place; just advance it. */
		iovp->iov_base += outcount;
		iovp->iov_len -= outcount;
		uio->uio_resid -= outcount;
	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
		/*
		 * Reset the pointer.
		 */
		offset = uio->uio_loffset;
	}

update:
	zap_cursor_fini(&zc);
	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
		kmem_free(outbuf, bufsize);

	if (error == ENOENT)
		error = 0;

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);

	uio->uio_loffset = offset;
	ZFS_EXIT(zfsvfs);
	/* On error, release the cookie array we handed to the caller. */
	if (error != 0 && cookies != NULL) {
		free(*cookies, M_TEMP);
		*cookies = NULL;
		*ncookies = 0;
	}
	return (error);
}
1951
1952static int
1953zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr)
1954{
1955	znode_t	*zp = VTOZ(vp);
1956	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1957
1958	ZFS_ENTER(zfsvfs);
1959	zil_commit(zfsvfs->z_log, zp->z_last_itx, zp->z_id);
1960	ZFS_EXIT(zfsvfs);
1961	return (0);
1962}
1963
1964/*
1965 * Get the requested file attributes and place them in the provided
1966 * vattr structure.
1967 *
1968 *	IN:	vp	- vnode of file.
1969 *		vap	- va_mask identifies requested attributes.
1970 *		flags	- [UNUSED]
1971 *		cr	- credentials of caller.
1972 *
1973 *	OUT:	vap	- attribute values.
1974 *
1975 *	RETURN:	0 (always succeeds)
1976 */
/* ARGSUSED */
static int
zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	znode_phys_t *pzp = zp->z_phys;
	uint32_t blksize;
	u_longlong_t nblocks;
	int	error;

	ZFS_ENTER(zfsvfs);

	/*
	 * Return all attributes.  It's cheaper to provide the answer
	 * than to determine whether we were asked the question.
	 */
	mutex_enter(&zp->z_lock);

	vap->va_type = IFTOVT(pzp->zp_mode);
	vap->va_mode = pzp->zp_mode & ~S_IFMT;
	vap->va_uid = zp->z_phys->zp_uid;
	vap->va_gid = zp->z_phys->zp_gid;
	vap->va_nodeid = zp->z_id;
	vap->va_nlink = MIN(pzp->zp_links, UINT32_MAX);	/* nlink_t limit! */
	vap->va_size = pzp->zp_size;
	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
	vap->va_rdev = zfs_cmpldev(pzp->zp_rdev);
	vap->va_seq = zp->z_seq;
	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */

	ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime);
	ZFS_TIME_DECODE(&vap->va_mtime, pzp->zp_mtime);
	ZFS_TIME_DECODE(&vap->va_ctime, pzp->zp_ctime);
	ZFS_TIME_DECODE(&vap->va_birthtime, pzp->zp_crtime);

	/*
	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
	 * Also, if we are the owner don't bother, since owner should
	 * always be allowed to read basic attributes of file.
	 */
	if (!(zp->z_phys->zp_flags & ZFS_ACL_TRIVIAL) &&
	    (zp->z_phys->zp_uid != crgetuid(cr))) {
		/* Access denied: drop z_lock before bailing out. */
		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, cr)) {
			mutex_exit(&zp->z_lock);
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	mutex_exit(&zp->z_lock);

	/* Space accounting comes from the dbuf, outside z_lock. */
	dmu_object_size_from_db(zp->z_dbuf, &blksize, &nblocks);
	vap->va_blksize = blksize;
	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */

	if (zp->z_blksz == 0) {
		/*
		 * Block size hasn't been set; suggest maximal I/O transfers.
		 */
		vap->va_blksize = zfsvfs->z_max_blksz;
	}

	ZFS_EXIT(zfsvfs);
	return (0);
}
2043
2044/*
2045 * Set the file attributes to the values contained in the
2046 * vattr structure.
2047 *
2048 *	IN:	vp	- vnode of file to be modified.
2049 *		vap	- new attribute values.
2050 *		flags	- ATTR_UTIME set if non-default time values provided.
2051 *		cr	- credentials of caller.
2052 *
2053 *	RETURN:	0 if success
2054 *		error code if failure
2055 *
2056 * Timestamps:
2057 *	vp - ctime updated, mtime updated if size changed.
2058 */
2059/* ARGSUSED */
2060static int
2061zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2062	caller_context_t *ct)
2063{
2064	struct znode	*zp = VTOZ(vp);
2065	znode_phys_t	*pzp = zp->z_phys;
2066	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2067	zilog_t		*zilog = zfsvfs->z_log;
2068	dmu_tx_t	*tx;
2069	vattr_t		oldva;
2070	uint_t		mask = vap->va_mask;
2071	uint_t		saved_mask;
2072	int		trim_mask = 0;
2073	uint64_t	new_mode;
2074	znode_t		*attrzp;
2075	int		need_policy = FALSE;
2076	int		err;
2077
2078	if (mask == 0)
2079		return (0);
2080
2081	if (mask & AT_NOSET)
2082		return (EINVAL);
2083
2084	if (mask & AT_SIZE && vp->v_type == VDIR)
2085		return (EISDIR);
2086
2087	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO)
2088		return (EINVAL);
2089
2090	ZFS_ENTER(zfsvfs);
2091
2092top:
2093	attrzp = NULL;
2094
2095	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
2096		ZFS_EXIT(zfsvfs);
2097		return (EROFS);
2098	}
2099
2100	/*
2101	 * First validate permissions
2102	 */
2103
2104	if (mask & AT_SIZE) {
2105		err = zfs_zaccess(zp, ACE_WRITE_DATA, cr);
2106		if (err) {
2107			ZFS_EXIT(zfsvfs);
2108			return (err);
2109		}
2110		/*
2111		 * XXX - Note, we are not providing any open
2112		 * mode flags here (like FNDELAY), so we may
2113		 * block if there are locks present... this
2114		 * should be addressed in openat().
2115		 */
2116		do {
2117			err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2118			/* NB: we already did dmu_tx_wait() if necessary */
2119		} while (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT);
2120		if (err) {
2121			ZFS_EXIT(zfsvfs);
2122			return (err);
2123		}
2124	}
2125
2126	if (mask & (AT_ATIME|AT_MTIME))
2127		need_policy = zfs_zaccess_v4_perm(zp, ACE_WRITE_ATTRIBUTES, cr);
2128
2129	if (mask & (AT_UID|AT_GID)) {
2130		int	idmask = (mask & (AT_UID|AT_GID));
2131		int	take_owner;
2132		int	take_group;
2133
2134		/*
2135		 * NOTE: even if a new mode is being set,
2136		 * we may clear S_ISUID/S_ISGID bits.
2137		 */
2138
2139		if (!(mask & AT_MODE))
2140			vap->va_mode = pzp->zp_mode;
2141
2142		/*
2143		 * Take ownership or chgrp to group we are a member of
2144		 */
2145
2146		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
2147		take_group = (mask & AT_GID) && groupmember(vap->va_gid, cr);
2148
2149		/*
2150		 * If both AT_UID and AT_GID are set then take_owner and
2151		 * take_group must both be set in order to allow taking
2152		 * ownership.
2153		 *
2154		 * Otherwise, send the check through secpolicy_vnode_setattr()
2155		 *
2156		 */
2157
2158		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
2159		    ((idmask == AT_UID) && take_owner) ||
2160		    ((idmask == AT_GID) && take_group)) {
2161			if (zfs_zaccess_v4_perm(zp, ACE_WRITE_OWNER, cr) == 0) {
2162				/*
2163				 * Remove setuid/setgid for non-privileged users
2164				 */
2165				secpolicy_setid_clear(vap, cr);
2166				trim_mask = (mask & (AT_UID|AT_GID));
2167			} else {
2168				need_policy =  TRUE;
2169			}
2170		} else {
2171			need_policy =  TRUE;
2172		}
2173	}
2174
2175	mutex_enter(&zp->z_lock);
2176	oldva.va_mode = pzp->zp_mode;
2177	oldva.va_uid = zp->z_phys->zp_uid;
2178	oldva.va_gid = zp->z_phys->zp_gid;
2179	mutex_exit(&zp->z_lock);
2180
2181	if (mask & AT_MODE) {
2182		if (zfs_zaccess_v4_perm(zp, ACE_WRITE_ACL, cr) == 0) {
2183			err = secpolicy_setid_setsticky_clear(vp, vap,
2184			    &oldva, cr);
2185			if (err) {
2186				ZFS_EXIT(zfsvfs);
2187				return (err);
2188			}
2189			trim_mask |= AT_MODE;
2190		} else {
2191			need_policy = TRUE;
2192		}
2193	}
2194
2195	if (need_policy) {
2196		/*
2197		 * If trim_mask is set then take ownership
2198		 * has been granted or write_acl is present and user
2199		 * has the ability to modify mode.  In that case remove
2200		 * UID|GID and or MODE from mask so that
2201		 * secpolicy_vnode_setattr() doesn't revoke it.
2202		 */
2203
2204		if (trim_mask) {
2205			saved_mask = vap->va_mask;
2206			vap->va_mask &= ~trim_mask;
2207
2208		}
2209		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
2210		    (int (*)(void *, int, cred_t *))zfs_zaccess_rwx, zp);
2211		if (err) {
2212			ZFS_EXIT(zfsvfs);
2213			return (err);
2214		}
2215
2216		if (trim_mask)
2217			vap->va_mask |= saved_mask;
2218	}
2219
2220	/*
2221	 * secpolicy_vnode_setattr, or take ownership may have
2222	 * changed va_mask
2223	 */
2224	mask = vap->va_mask;
2225
2226	tx = dmu_tx_create(zfsvfs->z_os);
2227	dmu_tx_hold_bonus(tx, zp->z_id);
2228
2229	if (mask & AT_MODE) {
2230		uint64_t pmode = pzp->zp_mode;
2231
2232		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
2233
2234		if (zp->z_phys->zp_acl.z_acl_extern_obj)
2235			dmu_tx_hold_write(tx,
2236			    pzp->zp_acl.z_acl_extern_obj, 0, SPA_MAXBLOCKSIZE);
2237		else
2238			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2239			    0, ZFS_ACL_SIZE(MAX_ACL_SIZE));
2240	}
2241
2242	if ((mask & (AT_UID | AT_GID)) && zp->z_phys->zp_xattr != 0) {
2243		err = zfs_zget(zp->z_zfsvfs, zp->z_phys->zp_xattr, &attrzp);
2244		if (err) {
2245			dmu_tx_abort(tx);
2246			ZFS_EXIT(zfsvfs);
2247			return (err);
2248		}
2249		dmu_tx_hold_bonus(tx, attrzp->z_id);
2250	}
2251
2252	err = dmu_tx_assign(tx, zfsvfs->z_assign);
2253	if (err) {
2254		if (attrzp)
2255			VN_RELE(ZTOV(attrzp));
2256		if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
2257			dmu_tx_wait(tx);
2258			dmu_tx_abort(tx);
2259			goto top;
2260		}
2261		dmu_tx_abort(tx);
2262		ZFS_EXIT(zfsvfs);
2263		return (err);
2264	}
2265
2266	dmu_buf_will_dirty(zp->z_dbuf, tx);
2267
2268	/*
2269	 * Set each attribute requested.
2270	 * We group settings according to the locks they need to acquire.
2271	 *
2272	 * Note: you cannot set ctime directly, although it will be
2273	 * updated as a side-effect of calling this function.
2274	 */
2275
2276	mutex_enter(&zp->z_lock);
2277
2278	if (mask & AT_MODE) {
2279		err = zfs_acl_chmod_setattr(zp, new_mode, tx);
2280		ASSERT3U(err, ==, 0);
2281	}
2282
2283	if (attrzp)
2284		mutex_enter(&attrzp->z_lock);
2285
2286	if (mask & AT_UID) {
2287		zp->z_phys->zp_uid = (uint64_t)vap->va_uid;
2288		if (attrzp) {
2289			attrzp->z_phys->zp_uid = (uint64_t)vap->va_uid;
2290		}
2291	}
2292
2293	if (mask & AT_GID) {
2294		zp->z_phys->zp_gid = (uint64_t)vap->va_gid;
2295		if (attrzp)
2296			attrzp->z_phys->zp_gid = (uint64_t)vap->va_gid;
2297	}
2298
2299	if (attrzp)
2300		mutex_exit(&attrzp->z_lock);
2301
2302	if (mask & AT_ATIME)
2303		ZFS_TIME_ENCODE(&vap->va_atime, pzp->zp_atime);
2304
2305	if (mask & AT_MTIME)
2306		ZFS_TIME_ENCODE(&vap->va_mtime, pzp->zp_mtime);
2307
2308	if (mask & AT_SIZE)
2309		zfs_time_stamper_locked(zp, CONTENT_MODIFIED, tx);
2310	else if (mask != 0)
2311		zfs_time_stamper_locked(zp, STATE_CHANGED, tx);
2312
2313	if (mask != 0)
2314		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask);
2315
2316	mutex_exit(&zp->z_lock);
2317
2318	if (attrzp)
2319		VN_RELE(ZTOV(attrzp));
2320
2321	dmu_tx_commit(tx);
2322
2323	ZFS_EXIT(zfsvfs);
2324	return (err);
2325}
2326
/*
 * One entry in the lock chain built by zfs_rename_lock(): records a
 * rwlock we acquired (and the znode we hold, if any) so that
 * zfs_rename_unlock() can unwind the chain.
 */
typedef struct zfs_zlock {
	krwlock_t	*zl_rwlock;	/* lock we acquired */
	znode_t		*zl_znode;	/* znode we held */
	struct zfs_zlock *zl_next;	/* next in list */
} zfs_zlock_t;
2332
2333/*
2334 * Drop locks and release vnodes that were held by zfs_rename_lock().
2335 */
2336static void
2337zfs_rename_unlock(zfs_zlock_t **zlpp)
2338{
2339	zfs_zlock_t *zl;
2340
2341	while ((zl = *zlpp) != NULL) {
2342		if (zl->zl_znode != NULL)
2343			VN_RELE(ZTOV(zl->zl_znode));
2344		rw_exit(zl->zl_rwlock);
2345		*zlpp = zl->zl_next;
2346		kmem_free(zl, sizeof (*zl));
2347	}
2348}
2349
2350/*
2351 * Search back through the directory tree, using the ".." entries.
2352 * Lock each directory in the chain to prevent concurrent renames.
2353 * Fail any attempt to move a directory into one of its own descendants.
2354 * XXX - z_parent_lock can overlap with map or grow locks
2355 */
2356static int
2357zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
2358{
2359	zfs_zlock_t	*zl;
2360	znode_t		*zp = tdzp;
2361	uint64_t	rootid = zp->z_zfsvfs->z_root;
2362	uint64_t	*oidp = &zp->z_id;
2363	krwlock_t	*rwlp = &szp->z_parent_lock;
2364	krw_t		rw = RW_WRITER;
2365
2366	/*
2367	 * First pass write-locks szp and compares to zp->z_id.
2368	 * Later passes read-lock zp and compare to zp->z_parent.
2369	 */
2370	do {
2371		if (!rw_tryenter(rwlp, rw)) {
2372			/*
2373			 * Another thread is renaming in this path.
2374			 * Note that if we are a WRITER, we don't have any
2375			 * parent_locks held yet.
2376			 */
2377			if (rw == RW_READER && zp->z_id > szp->z_id) {
2378				/*
2379				 * Drop our locks and restart
2380				 */
2381				zfs_rename_unlock(&zl);
2382				*zlpp = NULL;
2383				zp = tdzp;
2384				oidp = &zp->z_id;
2385				rwlp = &szp->z_parent_lock;
2386				rw = RW_WRITER;
2387				continue;
2388			} else {
2389				/*
2390				 * Wait for other thread to drop its locks
2391				 */
2392				rw_enter(rwlp, rw);
2393			}
2394		}
2395
2396		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
2397		zl->zl_rwlock = rwlp;
2398		zl->zl_znode = NULL;
2399		zl->zl_next = *zlpp;
2400		*zlpp = zl;
2401
2402		if (*oidp == szp->z_id)		/* We're a descendant of szp */
2403			return (EINVAL);
2404
2405		if (*oidp == rootid)		/* We've hit the top */
2406			return (0);
2407
2408		if (rw == RW_READER) {		/* i.e. not the first pass */
2409			int error = zfs_zget(zp->z_zfsvfs, *oidp, &zp);
2410			if (error)
2411				return (error);
2412			zl->zl_znode = zp;
2413		}
2414		oidp = &zp->z_phys->zp_parent;
2415		rwlp = &zp->z_parent_lock;
2416		rw = RW_READER;
2417
2418	} while (zp->z_id != sdzp->z_id);
2419
2420	return (0);
2421}
2422
2423/*
2424 * Move an entry from the provided source directory to the target
2425 * directory.  Change the entry name as indicated.
2426 *
2427 *	IN:	sdvp	- Source directory containing the "old entry".
2428 *		snm	- Old entry name.
2429 *		tdvp	- Target directory to contain the "new entry".
2430 *		tnm	- New entry name.
2431 *		cr	- credentials of caller.
2432 *
2433 *	RETURN:	0 if success
2434 *		error code if failure
2435 *
2436 * Timestamps:
2437 *	sdvp,tdvp - ctime|mtime updated
2438 */
2439static int
2440zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr)
2441{
2442	znode_t		*tdzp, *szp, *tzp;
2443	znode_t		*sdzp = VTOZ(sdvp);
2444	zfsvfs_t	*zfsvfs = sdzp->z_zfsvfs;
2445	zilog_t		*zilog = zfsvfs->z_log;
2446	vnode_t		*realvp;
2447	zfs_dirlock_t	*sdl, *tdl;
2448	dmu_tx_t	*tx;
2449	zfs_zlock_t	*zl;
2450	int		cmp, serr, terr, error;
2451
2452	ZFS_ENTER(zfsvfs);
2453
2454	/*
2455	 * Make sure we have the real vp for the target directory.
2456	 */
2457	if (VOP_REALVP(tdvp, &realvp) == 0)
2458		tdvp = realvp;
2459
2460	if (tdvp->v_vfsp != sdvp->v_vfsp) {
2461		ZFS_EXIT(zfsvfs);
2462		return (EXDEV);
2463	}
2464
2465	tdzp = VTOZ(tdvp);
2466top:
2467	szp = NULL;
2468	tzp = NULL;
2469	zl = NULL;
2470
2471	/*
2472	 * This is to prevent the creation of links into attribute space
2473	 * by renaming a linked file into/outof an attribute directory.
2474	 * See the comment in zfs_link() for why this is considered bad.
2475	 */
2476	if ((tdzp->z_phys->zp_flags & ZFS_XATTR) !=
2477	    (sdzp->z_phys->zp_flags & ZFS_XATTR)) {
2478		ZFS_EXIT(zfsvfs);
2479		return (EINVAL);
2480	}
2481
2482	/*
2483	 * Lock source and target directory entries.  To prevent deadlock,
2484	 * a lock ordering must be defined.  We lock the directory with
2485	 * the smallest object id first, or if it's a tie, the one with
2486	 * the lexically first name.
2487	 */
2488	if (sdzp->z_id < tdzp->z_id) {
2489		cmp = -1;
2490	} else if (sdzp->z_id > tdzp->z_id) {
2491		cmp = 1;
2492	} else {
2493		cmp = strcmp(snm, tnm);
2494		if (cmp == 0) {
2495			/*
2496			 * POSIX: "If the old argument and the new argument
2497			 * both refer to links to the same existing file,
2498			 * the rename() function shall return successfully
2499			 * and perform no other action."
2500			 */
2501			ZFS_EXIT(zfsvfs);
2502			return (0);
2503		}
2504	}
2505	if (cmp < 0) {
2506		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS);
2507		terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0);
2508	} else {
2509		terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, 0);
2510		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS);
2511	}
2512
2513	if (serr) {
2514		/*
2515		 * Source entry invalid or not there.
2516		 */
2517		if (!terr) {
2518			zfs_dirent_unlock(tdl);
2519			if (tzp)
2520				VN_RELE(ZTOV(tzp));
2521		}
2522		if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0)
2523			serr = EINVAL;
2524		ZFS_EXIT(zfsvfs);
2525		return (serr);
2526	}
2527	if (terr) {
2528		zfs_dirent_unlock(sdl);
2529		VN_RELE(ZTOV(szp));
2530		if (strcmp(tnm, "..") == 0)
2531			terr = EINVAL;
2532		ZFS_EXIT(zfsvfs);
2533		return (terr);
2534	}
2535
2536	/*
2537	 * Must have write access at the source to remove the old entry
2538	 * and write access at the target to create the new entry.
2539	 * Note that if target and source are the same, this can be
2540	 * done in a single check.
2541	 */
2542
2543	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
2544		goto out;
2545
2546	if (ZTOV(szp)->v_type == VDIR) {
2547		/*
2548		 * Check to make sure rename is valid.
2549		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
2550		 */
2551		if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
2552			goto out;
2553	}
2554
2555	/*
2556	 * Does target exist?
2557	 */
2558	if (tzp) {
2559		/*
2560		 * Source and target must be the same type.
2561		 */
2562		if (ZTOV(szp)->v_type == VDIR) {
2563			if (ZTOV(tzp)->v_type != VDIR) {
2564				error = ENOTDIR;
2565				goto out;
2566			}
2567		} else {
2568			if (ZTOV(tzp)->v_type == VDIR) {
2569				error = EISDIR;
2570				goto out;
2571			}
2572		}
2573		/*
2574		 * POSIX dictates that when the source and target
2575		 * entries refer to the same file object, rename
2576		 * must do nothing and exit without error.
2577		 */
2578		if (szp->z_id == tzp->z_id) {
2579			error = 0;
2580			goto out;
2581		}
2582	}
2583
2584	vnevent_rename_src(ZTOV(szp));
2585	if (tzp)
2586		vnevent_rename_dest(ZTOV(tzp));
2587
2588	tx = dmu_tx_create(zfsvfs->z_os);
2589	dmu_tx_hold_bonus(tx, szp->z_id);	/* nlink changes */
2590	dmu_tx_hold_bonus(tx, sdzp->z_id);	/* nlink changes */
2591	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
2592	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
2593	if (sdzp != tdzp)
2594		dmu_tx_hold_bonus(tx, tdzp->z_id);	/* nlink changes */
2595	if (tzp)
2596		dmu_tx_hold_bonus(tx, tzp->z_id);	/* parent changes */
2597	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2598	error = dmu_tx_assign(tx, zfsvfs->z_assign);
2599	if (error) {
2600		if (zl != NULL)
2601			zfs_rename_unlock(&zl);
2602		zfs_dirent_unlock(sdl);
2603		zfs_dirent_unlock(tdl);
2604		VN_RELE(ZTOV(szp));
2605		if (tzp)
2606			VN_RELE(ZTOV(tzp));
2607		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
2608			dmu_tx_wait(tx);
2609			dmu_tx_abort(tx);
2610			goto top;
2611		}
2612		dmu_tx_abort(tx);
2613		ZFS_EXIT(zfsvfs);
2614		return (error);
2615	}
2616
2617	if (tzp)	/* Attempt to remove the existing target */
2618		error = zfs_link_destroy(tdl, tzp, tx, 0, NULL);
2619
2620	if (error == 0) {
2621		error = zfs_link_create(tdl, szp, tx, ZRENAMING);
2622		if (error == 0) {
2623			error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
2624			ASSERT(error == 0);
2625			zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
2626			    sdl->dl_name, tdzp, tdl->dl_name, szp);
2627		}
2628#ifdef FREEBSD_NAMECACHE
2629		if (error == 0) {
2630			cache_purge(sdvp);
2631			cache_purge(tdvp);
2632		}
2633#endif
2634	}
2635
2636	dmu_tx_commit(tx);
2637out:
2638	if (zl != NULL)
2639		zfs_rename_unlock(&zl);
2640
2641	zfs_dirent_unlock(sdl);
2642	zfs_dirent_unlock(tdl);
2643
2644	VN_RELE(ZTOV(szp));
2645	if (tzp)
2646		VN_RELE(ZTOV(tzp));
2647
2648	ZFS_EXIT(zfsvfs);
2649
2650	return (error);
2651}
2652
2653/*
2654 * Insert the indicated symbolic reference entry into the directory.
2655 *
2656 *	IN:	dvp	- Directory to contain new symbolic link.
2657 *		link	- Name for new symlink entry.
2658 *		vap	- Attributes of new entry.
2659 *		target	- Target path of new symlink.
2660 *		cr	- credentials of caller.
2661 *
2662 *	RETURN:	0 if success
2663 *		error code if failure
2664 *
2665 * Timestamps:
2666 *	dvp - ctime|mtime updated
2667 */
2668static int
2669zfs_symlink(vnode_t *dvp, vnode_t **vpp, char *name, vattr_t *vap, char *link, cred_t *cr, kthread_t *td)
2670{
2671	znode_t		*zp, *dzp = VTOZ(dvp);
2672	zfs_dirlock_t	*dl;
2673	dmu_tx_t	*tx;
2674	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2675	zilog_t		*zilog = zfsvfs->z_log;
2676	uint64_t	zoid;
2677	int		len = strlen(link);
2678	int		error;
2679
2680	ASSERT(vap->va_type == VLNK);
2681
2682	ZFS_ENTER(zfsvfs);
2683top:
2684	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
2685		ZFS_EXIT(zfsvfs);
2686		return (error);
2687	}
2688
2689	if (len > MAXPATHLEN) {
2690		ZFS_EXIT(zfsvfs);
2691		return (ENAMETOOLONG);
2692	}
2693
2694	/*
2695	 * Attempt to lock directory; fail if entry already exists.
2696	 */
2697	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, ZNEW)) {
2698		ZFS_EXIT(zfsvfs);
2699		return (error);
2700	}
2701
2702	tx = dmu_tx_create(zfsvfs->z_os);
2703	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
2704	dmu_tx_hold_bonus(tx, dzp->z_id);
2705	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
2706	if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
2707		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE);
2708	error = dmu_tx_assign(tx, zfsvfs->z_assign);
2709	if (error) {
2710		zfs_dirent_unlock(dl);
2711		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
2712			dmu_tx_wait(tx);
2713			dmu_tx_abort(tx);
2714			goto top;
2715		}
2716		dmu_tx_abort(tx);
2717		ZFS_EXIT(zfsvfs);
2718		return (error);
2719	}
2720
2721	dmu_buf_will_dirty(dzp->z_dbuf, tx);
2722
2723	/*
2724	 * Create a new object for the symlink.
2725	 * Put the link content into bonus buffer if it will fit;
2726	 * otherwise, store it just like any other file data.
2727	 */
2728	zoid = 0;
2729	if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) {
2730		zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, len);
2731		if (len != 0)
2732			bcopy(link, zp->z_phys + 1, len);
2733	} else {
2734		dmu_buf_t *dbp;
2735
2736		zfs_mknode(dzp, vap, &zoid, tx, cr, 0, &zp, 0);
2737
2738		/*
2739		 * Nothing can access the znode yet so no locking needed
2740		 * for growing the znode's blocksize.
2741		 */
2742		zfs_grow_blocksize(zp, len, tx);
2743
2744		VERIFY(0 == dmu_buf_hold(zfsvfs->z_os, zoid, 0, FTAG, &dbp));
2745		dmu_buf_will_dirty(dbp, tx);
2746
2747		ASSERT3U(len, <=, dbp->db_size);
2748		bcopy(link, dbp->db_data, len);
2749		dmu_buf_rele(dbp, FTAG);
2750	}
2751	zp->z_phys->zp_size = len;
2752
2753	/*
2754	 * Insert the new object into the directory.
2755	 */
2756	(void) zfs_link_create(dl, zp, tx, ZNEW);
2757out:
2758	if (error == 0) {
2759		zfs_log_symlink(zilog, tx, TX_SYMLINK, dzp, zp, name, link);
2760		*vpp = ZTOV(zp);
2761		vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, td);
2762	}
2763
2764	dmu_tx_commit(tx);
2765
2766	zfs_dirent_unlock(dl);
2767
2768	ZFS_EXIT(zfsvfs);
2769	return (error);
2770}
2771
2772/*
2773 * Return, in the buffer contained in the provided uio structure,
2774 * the symbolic path referred to by vp.
2775 *
2776 *	IN:	vp	- vnode of symbolic link.
2777 *		uoip	- structure to contain the link path.
2778 *		cr	- credentials of caller.
2779 *
2780 *	OUT:	uio	- structure to contain the link path.
2781 *
2782 *	RETURN:	0 if success
2783 *		error code if failure
2784 *
2785 * Timestamps:
2786 *	vp - atime updated
2787 */
2788/* ARGSUSED */
2789static int
2790zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr)
2791{
2792	znode_t		*zp = VTOZ(vp);
2793	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2794	size_t		bufsz;
2795	int		error;
2796
2797	ZFS_ENTER(zfsvfs);
2798
2799	bufsz = (size_t)zp->z_phys->zp_size;
2800	if (bufsz + sizeof (znode_phys_t) <= zp->z_dbuf->db_size) {
2801		error = uiomove(zp->z_phys + 1,
2802		    MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
2803	} else {
2804		dmu_buf_t *dbp;
2805		error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp);
2806		if (error) {
2807			ZFS_EXIT(zfsvfs);
2808			return (error);
2809		}
2810		error = uiomove(dbp->db_data,
2811		    MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
2812		dmu_buf_rele(dbp, FTAG);
2813	}
2814
2815	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2816	ZFS_EXIT(zfsvfs);
2817	return (error);
2818}
2819
2820/*
2821 * Insert a new entry into directory tdvp referencing svp.
2822 *
2823 *	IN:	tdvp	- Directory to contain new entry.
2824 *		svp	- vnode of new entry.
2825 *		name	- name of new entry.
2826 *		cr	- credentials of caller.
2827 *
2828 *	RETURN:	0 if success
2829 *		error code if failure
2830 *
2831 * Timestamps:
2832 *	tdvp - ctime|mtime updated
2833 *	 svp - ctime updated
2834 */
2835/* ARGSUSED */
2836static int
2837zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr)
2838{
2839	znode_t		*dzp = VTOZ(tdvp);
2840	znode_t		*tzp, *szp;
2841	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
2842	zilog_t		*zilog = zfsvfs->z_log;
2843	zfs_dirlock_t	*dl;
2844	dmu_tx_t	*tx;
2845	vnode_t		*realvp;
2846	int		error;
2847
2848	ASSERT(tdvp->v_type == VDIR);
2849
2850	ZFS_ENTER(zfsvfs);
2851
2852	if (VOP_REALVP(svp, &realvp) == 0)
2853		svp = realvp;
2854
2855	if (svp->v_vfsp != tdvp->v_vfsp) {
2856		ZFS_EXIT(zfsvfs);
2857		return (EXDEV);
2858	}
2859
2860	szp = VTOZ(svp);
2861top:
2862	/*
2863	 * We do not support links between attributes and non-attributes
2864	 * because of the potential security risk of creating links
2865	 * into "normal" file space in order to circumvent restrictions
2866	 * imposed in attribute space.
2867	 */
2868	if ((szp->z_phys->zp_flags & ZFS_XATTR) !=
2869	    (dzp->z_phys->zp_flags & ZFS_XATTR)) {
2870		ZFS_EXIT(zfsvfs);
2871		return (EINVAL);
2872	}
2873
2874	/*
2875	 * POSIX dictates that we return EPERM here.
2876	 * Better choices include ENOTSUP or EISDIR.
2877	 */
2878	if (svp->v_type == VDIR) {
2879		ZFS_EXIT(zfsvfs);
2880		return (EPERM);
2881	}
2882
2883	if ((uid_t)szp->z_phys->zp_uid != crgetuid(cr) &&
2884	    secpolicy_basic_link(cr) != 0) {
2885		ZFS_EXIT(zfsvfs);
2886		return (EPERM);
2887	}
2888
2889	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, cr)) {
2890		ZFS_EXIT(zfsvfs);
2891		return (error);
2892	}
2893
2894	/*
2895	 * Attempt to lock directory; fail if entry already exists.
2896	 */
2897	if (error = zfs_dirent_lock(&dl, dzp, name, &tzp, ZNEW)) {
2898		ZFS_EXIT(zfsvfs);
2899		return (error);
2900	}
2901
2902	tx = dmu_tx_create(zfsvfs->z_os);
2903	dmu_tx_hold_bonus(tx, szp->z_id);
2904	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
2905	error = dmu_tx_assign(tx, zfsvfs->z_assign);
2906	if (error) {
2907		zfs_dirent_unlock(dl);
2908		if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
2909			dmu_tx_wait(tx);
2910			dmu_tx_abort(tx);
2911			goto top;
2912		}
2913		dmu_tx_abort(tx);
2914		ZFS_EXIT(zfsvfs);
2915		return (error);
2916	}
2917
2918	error = zfs_link_create(dl, szp, tx, 0);
2919
2920	if (error == 0)
2921		zfs_log_link(zilog, tx, TX_LINK, dzp, szp, name);
2922
2923	dmu_tx_commit(tx);
2924
2925	zfs_dirent_unlock(dl);
2926
2927	ZFS_EXIT(zfsvfs);
2928	return (error);
2929}
2930
/*
 * Called when the vnode's reference count drops to zero: push any
 * dirty atime, then let zfs_zinactive() tear down or cache the znode.
 */
void
zfs_inactive(vnode_t *vp, cred_t *cr)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	rw_enter(&zfsvfs->z_um_lock, RW_READER);
	if (zfsvfs->z_unmounted2) {
		/* Filesystem already unmounted: just free the znode. */
		ASSERT(zp->z_dbuf_held == 0);

		mutex_enter(&zp->z_lock);
		VI_LOCK(vp);
		vp->v_count = 0; /* count arrives as 1 */
		VI_UNLOCK(vp);
		if (zp->z_dbuf == NULL) {
			mutex_exit(&zp->z_lock);
			zfs_znode_free(zp);
		} else {
			mutex_exit(&zp->z_lock);
		}
		rw_exit(&zfsvfs->z_um_lock);
		VFS_RELE(zfsvfs->z_vfs);
		return;
	}

	/* Write back a dirty atime before the znode goes inactive. */
	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);

		dmu_tx_hold_bonus(tx, zp->z_id);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			/* Best effort: losing an atime update is acceptable. */
			dmu_tx_abort(tx);
		} else {
			dmu_buf_will_dirty(zp->z_dbuf, tx);
			mutex_enter(&zp->z_lock);
			zp->z_atime_dirty = 0;
			mutex_exit(&zp->z_lock);
			dmu_tx_commit(tx);
		}
	}

	zfs_zinactive(zp);
	rw_exit(&zfsvfs->z_um_lock);
}
2976
/* ZFS file IDs must fit inside the generic struct fid used by the VFS layer. */
CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
2979
2980static int
2981zfs_fid(vnode_t *vp, fid_t *fidp)
2982{
2983	znode_t		*zp = VTOZ(vp);
2984	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2985	uint32_t	gen = (uint32_t)zp->z_phys->zp_gen;
2986	uint64_t	object = zp->z_id;
2987	zfid_short_t	*zfid;
2988	int		size, i;
2989
2990	ZFS_ENTER(zfsvfs);
2991
2992	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
2993	fidp->fid_len = size;
2994
2995	zfid = (zfid_short_t *)fidp;
2996
2997	zfid->zf_len = size;
2998
2999	for (i = 0; i < sizeof (zfid->zf_object); i++)
3000		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
3001
3002	/* Must have a non-zero generation number to distinguish from .zfs */
3003	if (gen == 0)
3004		gen = 1;
3005	for (i = 0; i < sizeof (zfid->zf_gen); i++)
3006		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
3007
3008	if (size == LONG_FID_LEN) {
3009		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
3010		zfid_long_t	*zlfid;
3011
3012		zlfid = (zfid_long_t *)fidp;
3013
3014		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
3015			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
3016
3017		/* XXX - this should be the generation number for the objset */
3018		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
3019			zlfid->zf_setgen[i] = 0;
3020	}
3021
3022	ZFS_EXIT(zfsvfs);
3023	return (0);
3024}
3025
/*
 * Report filesystem limits/capabilities for pathconf(2).
 * Returns EOPNOTSUPP for names the caller should resolve generically.
 */
static int
zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr)
{
	znode_t		*zp, *xzp;
	zfsvfs_t	*zfsvfs;
	zfs_dirlock_t	*dl;
	int		error;

	switch (cmd) {
	case _PC_LINK_MAX:
		*valp = INT_MAX;
		return (0);

	case _PC_FILESIZEBITS:
		*valp = 64;
		return (0);

/* Disabled: extended-attribute probe (not wired up on this platform). */
#if 0
	case _PC_XATTR_EXISTS:
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		*valp = 0;
		error = zfs_dirent_lock(&dl, zp, "", &xzp,
		    ZXATTR | ZEXISTS | ZSHARED);
		if (error == 0) {
			zfs_dirent_unlock(dl);
			if (!zfs_dirempty(xzp))
				*valp = 1;
			VN_RELE(ZTOV(xzp));
		} else if (error == ENOENT) {
			/*
			 * If there aren't extended attributes, it's the
			 * same as having zero of them.
			 */
			error = 0;
		}
		ZFS_EXIT(zfsvfs);
		return (error);
#endif

	case _PC_ACL_EXTENDED:
		*valp = 0;	/* TODO */
		return (0);

	case _PC_MIN_HOLE_SIZE:
		*valp = (int)SPA_MINBLOCKSIZE;
		return (0);

	default:
		/* Unknown name: let the caller fall back to defaults. */
		return (EOPNOTSUPP);
	}
}
3079
#ifdef TODO
/* Fetch the file's ACL via zfs_getacl() (not yet enabled on this platform). */
/*ARGSUSED*/
static int
zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	ZFS_ENTER(zfsvfs);
	error = zfs_getacl(zp, vsecp, cr);
	ZFS_EXIT(zfsvfs);

	return (error);
}
#endif	/* TODO */
3096
#ifdef TODO
/* Store the file's ACL via zfs_setacl() (not yet enabled on this platform). */
/*ARGSUSED*/
static int
zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	ZFS_ENTER(zfsvfs);
	error = zfs_setacl(zp, vsecp, cr);
	ZFS_EXIT(zfsvfs);
	return (error);
}
#endif	/* TODO */
3112
/*
 * VOP_OPEN glue: call zfs_open() and, on success, ensure the vnode has
 * a VM object sized to the file so mmap/paging work.
 */
static int
zfs_freebsd_open(ap)
	struct vop_open_args /* {
		struct vnode *a_vp;
		int a_mode;
		struct ucred *a_cred;
		struct thread *a_td;
	} */ *ap;
{
	vnode_t	*vp = ap->a_vp;
	znode_t *zp = VTOZ(vp);
	int error;

	error = zfs_open(&vp, ap->a_mode, ap->a_cred);
	if (error == 0)
		vnode_create_vobject(vp, zp->z_phys->zp_size, ap->a_td);
	return (error);
}
3131
/* VOP_CLOSE glue: count and offset are not tracked here, so pass zeros. */
static int
zfs_freebsd_close(ap)
	struct vop_close_args /* {
		struct vnode *a_vp;
		int  a_fflag;
		struct ucred *a_cred;
		struct thread *a_td;
	} */ *ap;
{

	return (zfs_close(ap->a_vp, ap->a_fflag, 0, 0, ap->a_cred));
}
3144
/* VOP_IOCTL glue: forward to zfs_ioctl(), casting the data pointer. */
static int
zfs_freebsd_ioctl(ap)
	struct vop_ioctl_args /* {
		struct vnode *a_vp;
		u_long a_command;
		caddr_t a_data;
		int a_fflag;
		struct ucred *cred;
		struct thread *td;
	} */ *ap;
{

	return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data,
	    ap->a_fflag, ap->a_cred, NULL));
}
3160
/* VOP_READ glue: forward to zfs_read(). */
static int
zfs_freebsd_read(ap)
	struct vop_read_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{

	return (zfs_read(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL));
}
3173
/* VOP_WRITE glue: forward to zfs_write(). */
static int
zfs_freebsd_write(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{

	return (zfs_write(ap->a_vp, ap->a_uio, ap->a_ioflag, ap->a_cred, NULL));
}
3186
/* VOP_ACCESS glue: forward to zfs_access() with no extra flags. */
static int
zfs_freebsd_access(ap)
	struct vop_access_args /* {
		struct vnode *a_vp;
		int  a_mode;
		struct ucred *a_cred;
		struct thread *a_td;
	} */ *ap;
{

	return (zfs_access(ap->a_vp, ap->a_mode, 0, ap->a_cred));
}
3199
/*
 * VOP_LOOKUP glue: copy the component name into a NUL-terminated
 * buffer (cn_nameptr is bounded by cn_namelen, not NUL-terminated)
 * before handing it to zfs_lookup().
 */
static int
zfs_freebsd_lookup(ap)
	struct vop_lookup_args /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
	} */ *ap;
{
	struct componentname *cnp = ap->a_cnp;
	char nm[NAME_MAX + 1];

	ASSERT(cnp->cn_namelen < sizeof(nm));
	strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof(nm)));

	return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop,
	    cnp->cn_cred, cnp->cn_thread));
}
3217
/*
 * VOP_CREATE glue: populate va_mask from the vattr, mask the mode to
 * permission bits, and call zfs_create() non-exclusively.
 */
static int
zfs_freebsd_create(ap)
	struct vop_create_args /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
		struct vattr *a_vap;
	} */ *ap;
{
	struct componentname *cnp = ap->a_cnp;
	vattr_t *vap = ap->a_vap;
	int mode;

	ASSERT(cnp->cn_flags & SAVENAME);

	vattr_init_mask(vap);
	mode = vap->va_mode & ALLPERMS;

	return (zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
	    ap->a_vpp, cnp->cn_cred, cnp->cn_thread));
}
3239
/* VOP_REMOVE glue: forward the saved component name to zfs_remove(). */
static int
zfs_freebsd_remove(ap)
	struct vop_remove_args /* {
		struct vnode *a_dvp;
		struct vnode *a_vp;
		struct componentname *a_cnp;
	} */ *ap;
{

	ASSERT(ap->a_cnp->cn_flags & SAVENAME);

	return (zfs_remove(ap->a_dvp, ap->a_cnp->cn_nameptr,
	    ap->a_cnp->cn_cred));
}
3254
/* VOP_MKDIR glue: populate va_mask and forward to zfs_mkdir(). */
static int
zfs_freebsd_mkdir(ap)
	struct vop_mkdir_args /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
		struct vattr *a_vap;
	} */ *ap;
{
	vattr_t *vap = ap->a_vap;

	ASSERT(ap->a_cnp->cn_flags & SAVENAME);

	vattr_init_mask(vap);

	return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
	    ap->a_cnp->cn_cred));
}
3273
/* VOP_RMDIR glue: forward the saved component name to zfs_rmdir(). */
static int
zfs_freebsd_rmdir(ap)
	struct vop_rmdir_args /* {
		struct vnode *a_dvp;
		struct vnode *a_vp;
		struct componentname *a_cnp;
	} */ *ap;
{
	struct componentname *cnp = ap->a_cnp;

	ASSERT(cnp->cn_flags & SAVENAME);

	return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred));
}
3288
/* VOP_READDIR glue: forward to zfs_readdir(), including NFS cookies. */
static int
zfs_freebsd_readdir(ap)
	struct vop_readdir_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		struct ucred *a_cred;
		int *a_eofflag;
		int *a_ncookies;
		u_long **a_cookies;
	} */ *ap;
{

	return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag,
	    ap->a_ncookies, ap->a_cookies));
}
3304
/*
 * VOP_FSYNC glue: run the standard buffer flush first, then let
 * zfs_fsync() commit the file's ZIL records.
 */
static int
zfs_freebsd_fsync(ap)
	struct vop_fsync_args /* {
		struct vnode *a_vp;
		int a_waitfor;
		struct thread *a_td;
	} */ *ap;
{

	vop_stdfsync(ap);
	return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred));
}
3317
/* VOP_GETATTR glue: forward to zfs_getattr() with no flags. */
static int
zfs_freebsd_getattr(ap)
	struct vop_getattr_args /* {
		struct vnode *a_vp;
		struct vattr *a_vap;
		struct ucred *a_cred;
		struct thread *a_td;
	} */ *ap;
{

	return (zfs_getattr(ap->a_vp, ap->a_vap, 0, ap->a_cred));
}
3330
/*
 * VOP_SETATTR glue: reject chflags(2) requests, populate va_mask,
 * and forward to zfs_setattr().
 */
static int
zfs_freebsd_setattr(ap)
	struct vop_setattr_args /* {
		struct vnode *a_vp;
		struct vattr *a_vap;
		struct ucred *a_cred;
		struct thread *a_td;
	} */ *ap;
{
	vattr_t *vap = ap->a_vap;

	/* No support for FreeBSD's chflags(2). */
	if (vap->va_flags != VNOVAL)
		return (EOPNOTSUPP);

	vattr_init_mask(vap);

	return (zfs_setattr(ap->a_vp, vap, 0, ap->a_cred, NULL));
}
3350
/*
 * VOP_RENAME glue: call zfs_rename(), then release the vnode
 * references/locks the VFS handed us (VN_URELE unlocks and releases;
 * tdvp may equal tvp, in which case it is only referenced, not locked).
 */
static int
zfs_freebsd_rename(ap)
	struct vop_rename_args  /* {
		struct vnode *a_fdvp;
		struct vnode *a_fvp;
		struct componentname *a_fcnp;
		struct vnode *a_tdvp;
		struct vnode *a_tvp;
		struct componentname *a_tcnp;
	} */ *ap;
{
	vnode_t *fdvp = ap->a_fdvp;
	vnode_t *fvp = ap->a_fvp;
	vnode_t *tdvp = ap->a_tdvp;
	vnode_t *tvp = ap->a_tvp;
	int error;

	ASSERT(ap->a_fcnp->cn_flags & SAVENAME);
	ASSERT(ap->a_tcnp->cn_flags & SAVENAME);

	error = zfs_rename(fdvp, ap->a_fcnp->cn_nameptr, tdvp,
	    ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred);

	if (tdvp == tvp)
		VN_RELE(tdvp);
	else
		VN_URELE(tdvp);
	if (tvp)
		VN_URELE(tvp);
	VN_RELE(fdvp);
	VN_RELE(fvp);

	return (error);
}
3385
/*
 * VOP_SYMLINK glue: fill in va_type (the syscall only sets va_mode),
 * populate va_mask, and forward to zfs_symlink().
 */
static int
zfs_freebsd_symlink(ap)
	struct vop_symlink_args /* {
		struct vnode *a_dvp;
		struct vnode **a_vpp;
		struct componentname *a_cnp;
		struct vattr *a_vap;
		char *a_target;
	} */ *ap;
{
	struct componentname *cnp = ap->a_cnp;
	vattr_t *vap = ap->a_vap;

	ASSERT(cnp->cn_flags & SAVENAME);

	vap->va_type = VLNK;	/* FreeBSD: Syscall only sets va_mode. */
	vattr_init_mask(vap);

	return (zfs_symlink(ap->a_dvp, ap->a_vpp, cnp->cn_nameptr, vap,
	    ap->a_target, cnp->cn_cred, cnp->cn_thread));
}
3407
/* VOP_READLINK glue: forward to zfs_readlink(). */
static int
zfs_freebsd_readlink(ap)
	struct vop_readlink_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		struct ucred *a_cred;
	} */ *ap;
{

	return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred));
}
3419
/* VOP_LINK glue: forward the saved component name to zfs_link(). */
static int
zfs_freebsd_link(ap)
	struct vop_link_args /* {
		struct vnode *a_tdvp;
		struct vnode *a_vp;
		struct componentname *a_cnp;
	} */ *ap;
{
	struct componentname *cnp = ap->a_cnp;

	ASSERT(cnp->cn_flags & SAVENAME);

	return (zfs_link(ap->a_tdvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
}
3434
/* VOP_INACTIVE glue: forward to zfs_inactive() with the thread's creds. */
static int
zfs_freebsd_inactive(ap)
        struct vop_inactive_args /* {
                struct vnode *a_vp;
                struct thread *a_td;
        } */ *ap;
{
	vnode_t *vp = ap->a_vp;

	zfs_inactive(vp, ap->a_td->td_ucred);
	return (0);
}
3447
/*
 * VOP_RECLAIM: detach the znode from the vnode.  Tears down the VM
 * object, releases the znode's dbuf (unless the file is unlinked and
 * still needs deletion processing), and drops the vnode's v_data.
 */
static int
zfs_freebsd_reclaim(ap)
	struct vop_reclaim_args /* {
		struct vnode *a_vp;
		struct thread *a_td;
	} */ *ap;
{
        vnode_t	*vp = ap->a_vp;
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs;
	int rele = 1;

	ASSERT(zp != NULL);

	/*
	 * Destroy the vm object and flush associated pages.
	 */
	vnode_destroy_vobject(vp);

	mutex_enter(&zp->z_lock);
	ASSERT(zp->z_phys);
	ASSERT(zp->z_dbuf_held);
	zfsvfs = zp->z_zfsvfs;
	if (!zp->z_unlinked) {
		/* Normal case: release the dbuf and unhook the vnode. */
		zp->z_dbuf_held = 0;
		ZTOV(zp) = NULL;
		mutex_exit(&zp->z_lock);
		dmu_buf_rele(zp->z_dbuf, NULL);
	} else {
		/* Unlinked file: leave the dbuf for deletion processing. */
		mutex_exit(&zp->z_lock);
	}
	VI_LOCK(vp);
	/* NOTE(review): a nonzero v_count here suppresses the VFS_RELE — confirm pairing. */
	if (vp->v_count > 0)
		rele = 0;
	vp->v_data = NULL;
	ASSERT(vp->v_holdcnt > 1);
	vdropl(vp);
	if (!zp->z_unlinked && rele)
		VFS_RELE(zfsvfs->z_vfs);
	return (0);
}
3489
/* VOP_FID glue: forward to zfs_fid(). */
static int
zfs_freebsd_fid(ap)
	struct vop_fid_args /* {
		struct vnode *a_vp;
		struct fid *a_fid;
	} */ *ap;
{

	return (zfs_fid(ap->a_vp, (void *)ap->a_fid));
}
3500
/*
 * VOP_PATHCONF glue: query zfs_pathconf(), falling back to the generic
 * vop_stdpathconf() for names ZFS does not implement.
 */
static int
zfs_freebsd_pathconf(ap)
	struct vop_pathconf_args /* {
		struct vnode *a_vp;
		int a_name;
		register_t *a_retval;
	} */ *ap;
{
	ulong_t val;
	int error;

	error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred);
	if (error == 0)
		*ap->a_retval = val;
	else if (error == EOPNOTSUPP)
		error = vop_stdpathconf(ap);
	return (error);
}
3519
3520/*
3521 * Advisory record locking support
3522 */
3523static int
3524zfs_freebsd_advlock(ap)
3525	struct vop_advlock_args /* {
3526		struct vnode *a_vp;
3527		caddr_t  a_id;
3528		int  a_op;
3529		struct flock *a_fl;
3530		int  a_flags;
3531	} */ *ap;
3532{
3533	znode_t	*zp = VTOZ(ap->a_vp);
3534
3535	return (lf_advlock(ap, &(zp->z_lockf), zp->z_phys->zp_size));
3536}
3537
3538struct vop_vector zfs_vnodeops;
3539struct vop_vector zfs_fifoops;
3540
3541struct vop_vector zfs_vnodeops = {
3542	.vop_default =	&default_vnodeops,
3543	.vop_inactive =	zfs_freebsd_inactive,
3544	.vop_reclaim =	zfs_freebsd_reclaim,
3545	.vop_access =	zfs_freebsd_access,
3546#ifdef FREEBSD_NAMECACHE
3547	.vop_lookup =	vfs_cache_lookup,
3548	.vop_cachedlookup = zfs_freebsd_lookup,
3549#else
3550	.vop_lookup =	zfs_freebsd_lookup,
3551#endif
3552	.vop_getattr =	zfs_freebsd_getattr,
3553	.vop_setattr =	zfs_freebsd_setattr,
3554	.vop_create =	zfs_freebsd_create,
3555	.vop_mknod =	zfs_freebsd_create,
3556	.vop_mkdir =	zfs_freebsd_mkdir,
3557	.vop_readdir =	zfs_freebsd_readdir,
3558	.vop_fsync =	zfs_freebsd_fsync,
3559	.vop_open =	zfs_freebsd_open,
3560	.vop_close =	zfs_freebsd_close,
3561	.vop_rmdir =	zfs_freebsd_rmdir,
3562	.vop_ioctl =	zfs_freebsd_ioctl,
3563	.vop_link =	zfs_freebsd_link,
3564	.vop_symlink =	zfs_freebsd_symlink,
3565	.vop_readlink =	zfs_freebsd_readlink,
3566	.vop_read =	zfs_freebsd_read,
3567	.vop_write =	zfs_freebsd_write,
3568	.vop_remove =	zfs_freebsd_remove,
3569	.vop_rename =	zfs_freebsd_rename,
3570	.vop_advlock =	zfs_freebsd_advlock,
3571	.vop_pathconf =	zfs_freebsd_pathconf,
3572	.vop_bmap =	VOP_EOPNOTSUPP,
3573	.vop_fid =	zfs_freebsd_fid,
3574};
3575
3576struct vop_vector zfs_fifoops = {
3577	.vop_default =	&fifo_specops,
3578	.vop_fsync =	VOP_PANIC,
3579	.vop_access =	zfs_freebsd_access,
3580	.vop_getattr =	zfs_freebsd_getattr,
3581	.vop_inactive =	zfs_freebsd_inactive,
3582	.vop_read =	VOP_PANIC,
3583	.vop_reclaim =	zfs_freebsd_reclaim,
3584	.vop_setattr =	zfs_freebsd_setattr,
3585	.vop_write =	VOP_PANIC,
3586	.vop_fid =	zfs_freebsd_fid,
3587};
3588