1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25/* Portions Copyright 2007 Jeremy Teo */
26/* Portions Copyright 2010 Robert Milkowski */
27
28#include <sys/types.h>
29#include <sys/param.h>
30#include <sys/time.h>
31#include <sys/systm.h>
32#include <sys/sysmacros.h>
33#include <sys/resource.h>
34#include <sys/vfs.h>
35#include <sys/vfs_opreg.h>
36#include <sys/vnode.h>
37#include <sys/file.h>
38#include <sys/stat.h>
39#include <sys/kmem.h>
40#include <sys/taskq.h>
41#include <sys/uio.h>
42#include <sys/vmsystm.h>
43#include <sys/atomic.h>
44#include <sys/vm.h>
45#include <vm/seg_vn.h>
46#include <vm/pvn.h>
47#include <vm/as.h>
48#include <vm/kpm.h>
49#include <vm/seg_kpm.h>
50#include <sys/mman.h>
51#include <sys/pathname.h>
52#include <sys/cmn_err.h>
53#include <sys/errno.h>
54#include <sys/unistd.h>
55#include <sys/zfs_dir.h>
56#include <sys/zfs_acl.h>
57#include <sys/zfs_ioctl.h>
58#include <sys/fs/zfs.h>
59#include <sys/dmu.h>
60#include <sys/dmu_objset.h>
61#include <sys/spa.h>
62#include <sys/txg.h>
63#include <sys/dbuf.h>
64#include <sys/zap.h>
65#include <sys/sa.h>
66#include <sys/dirent.h>
67#include <sys/policy.h>
68#include <sys/sunddi.h>
69#include <sys/filio.h>
70#include <sys/sid.h>
71#include "fs/fs_subr.h"
72#include <sys/zfs_ctldir.h>
73#include <sys/zfs_fuid.h>
74#include <sys/zfs_sa.h>
75#include <sys/dnlc.h>
76#include <sys/zfs_rlock.h>
77#include <sys/extdirent.h>
78#include <sys/kidmap.h>
79#include <sys/cred.h>
80#include <sys/attr.h>
81
82/*
83 * Programming rules.
84 *
85 * Each vnode op performs some logical unit of work.  To do this, the ZPL must
86 * properly lock its in-core state, create a DMU transaction, do the work,
87 * record this work in the intent log (ZIL), commit the DMU transaction,
88 * and wait for the intent log to commit if it is a synchronous operation.
89 * Moreover, the vnode ops must work in both normal and log replay context.
90 * The ordering of events is important to avoid deadlocks and references
91 * to freed memory.  The example below illustrates the following Big Rules:
92 *
 *  (1) A check must be made in each zfs thread for a mounted file system.
 *	This is done, avoiding races, by using ZFS_ENTER(zfsvfs).
 *	A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
 *	must be checked with ZFS_VERIFY_ZP(zp).  Both ZFS_ENTER() and
 *	ZFS_VERIFY_ZP() can return EIO from the calling function.
98 *
99 *  (2)	VN_RELE() should always be the last thing except for zil_commit()
100 *	(if necessary) and ZFS_EXIT(). This is for 3 reasons:
101 *	First, if it's the last reference, the vnode/znode
102 *	can be freed, so the zp may point to freed memory.  Second, the last
103 *	reference will call zfs_zinactive(), which may induce a lot of work --
104 *	pushing cached pages (which acquires range locks) and syncing out
105 *	cached atime changes.  Third, zfs_zinactive() may require a new tx,
106 *	which could deadlock the system if you were already holding one.
107 *	If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
108 *
109 *  (3)	All range locks must be grabbed before calling dmu_tx_assign(),
110 *	as they can span dmu_tx_assign() calls.
111 *
112 *  (4)	Always pass TXG_NOWAIT as the second argument to dmu_tx_assign().
113 *	This is critical because we don't want to block while holding locks.
114 *	Note, in particular, that if a lock is sometimes acquired before
115 *	the tx assigns, and sometimes after (e.g. z_lock), then failing to
116 *	use a non-blocking assign can deadlock the system.  The scenario:
117 *
118 *	Thread A has grabbed a lock before calling dmu_tx_assign().
119 *	Thread B is in an already-assigned tx, and blocks for this lock.
120 *	Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
121 *	forever, because the previous txg can't quiesce until B's tx commits.
122 *
123 *	If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
124 *	then drop all locks, call dmu_tx_wait(), and try again.
125 *
126 *  (5)	If the operation succeeded, generate the intent log entry for it
127 *	before dropping locks.  This ensures that the ordering of events
128 *	in the intent log matches the order in which they actually occurred.
129 *      During ZIL replay the zfs_log_* functions will update the sequence
130 *	number to indicate the zil transaction has replayed.
131 *
132 *  (6)	At the end of each vnode op, the DMU tx must always commit,
133 *	regardless of whether there were any errors.
134 *
135 *  (7)	After dropping all locks, invoke zil_commit(zilog, foid)
136 *	to ensure that synchronous semantics are provided when necessary.
137 *
138 * In general, this is how things should be ordered in each vnode op:
139 *
140 *	ZFS_ENTER(zfsvfs);		// exit if unmounted
141 * top:
142 *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may VN_HOLD())
143 *	rw_enter(...);			// grab any other locks you need
144 *	tx = dmu_tx_create(...);	// get DMU tx
145 *	dmu_tx_hold_*();		// hold each object you might modify
146 *	error = dmu_tx_assign(tx, TXG_NOWAIT);	// try to assign
147 *	if (error) {
148 *		rw_exit(...);		// drop locks
149 *		zfs_dirent_unlock(dl);	// unlock directory entry
150 *		VN_RELE(...);		// release held vnodes
151 *		if (error == ERESTART) {
152 *			dmu_tx_wait(tx);
153 *			dmu_tx_abort(tx);
154 *			goto top;
155 *		}
156 *		dmu_tx_abort(tx);	// abort DMU tx
157 *		ZFS_EXIT(zfsvfs);	// finished in zfs
158 *		return (error);		// really out of space
159 *	}
160 *	error = do_real_work();		// do whatever this VOP does
161 *	if (error == 0)
162 *		zfs_log_*(...);		// on success, make ZIL entry
163 *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
164 *	rw_exit(...);			// drop locks
165 *	zfs_dirent_unlock(dl);		// unlock directory entry
166 *	VN_RELE(...);			// release held vnodes
167 *	zil_commit(zilog, foid);	// synchronous when necessary
168 *	ZFS_EXIT(zfsvfs);		// finished in zfs
169 *	return (error);			// done, report error
170 */
171
172/* ARGSUSED */
173static int
174zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
175{
176	znode_t	*zp = VTOZ(*vpp);
177	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
178
179	ZFS_ENTER(zfsvfs);
180	ZFS_VERIFY_ZP(zp);
181
182	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
183	    ((flag & FAPPEND) == 0)) {
184		ZFS_EXIT(zfsvfs);
185		return (EPERM);
186	}
187
188	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
189	    ZTOV(zp)->v_type == VREG &&
190	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
191		if (fs_vscan(*vpp, cr, 0) != 0) {
192			ZFS_EXIT(zfsvfs);
193			return (EACCES);
194		}
195	}
196
197	/* Keep a count of the synchronous opens in the znode */
198	if (flag & (FSYNC | FDSYNC))
199		atomic_inc_32(&zp->z_sync_cnt);
200
201	ZFS_EXIT(zfsvfs);
202	return (0);
203}
204
205/* ARGSUSED */
206static int
207zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
208    caller_context_t *ct)
209{
210	znode_t	*zp = VTOZ(vp);
211	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
212
213	/*
214	 * Clean up any locks held by this process on the vp.
215	 */
216	cleanlocks(vp, ddi_get_pid(), 0);
217	cleanshares(vp, ddi_get_pid());
218
219	ZFS_ENTER(zfsvfs);
220	ZFS_VERIFY_ZP(zp);
221
222	/* Decrement the synchronous opens in the znode */
223	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
224		atomic_dec_32(&zp->z_sync_cnt);
225
226	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
227	    ZTOV(zp)->v_type == VREG &&
228	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
229		VERIFY(fs_vscan(vp, cr, 1) == 0);
230
231	ZFS_EXIT(zfsvfs);
232	return (0);
233}
234
235/*
236 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
237 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
238 */
239static int
240zfs_holey(vnode_t *vp, int cmd, offset_t *off)
241{
242	znode_t	*zp = VTOZ(vp);
243	uint64_t noff = (uint64_t)*off; /* new offset */
244	uint64_t file_sz;
245	int error;
246	boolean_t hole;
247
248	file_sz = zp->z_size;
249	if (noff >= file_sz)  {
250		return (ENXIO);
251	}
252
253	if (cmd == _FIO_SEEK_HOLE)
254		hole = B_TRUE;
255	else
256		hole = B_FALSE;
257
258	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
259
260	/* end of file? */
261	if ((error == ESRCH) || (noff > file_sz)) {
262		/*
263		 * Handle the virtual hole at the end of file.
264		 */
265		if (hole) {
266			*off = file_sz;
267			return (0);
268		}
269		return (ENXIO);
270	}
271
272	if (noff < *off)
273		return (error);
274	*off = noff;
275	return (error);
276}
277
278/* ARGSUSED */
279static int
280zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred,
281    int *rvalp, caller_context_t *ct)
282{
283	offset_t off;
284	int error;
285	zfsvfs_t *zfsvfs;
286	znode_t *zp;
287
288	switch (com) {
289	case _FIOFFS:
290		return (zfs_sync(vp->v_vfsp, 0, cred));
291
292		/*
293		 * The following two ioctls are used by bfu.  Faking out,
294		 * necessary to avoid bfu errors.
295		 */
296	case _FIOGDIO:
297	case _FIOSDIO:
298		return (0);
299
300	case _FIO_SEEK_DATA:
301	case _FIO_SEEK_HOLE:
302		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
303			return (EFAULT);
304
305		zp = VTOZ(vp);
306		zfsvfs = zp->z_zfsvfs;
307		ZFS_ENTER(zfsvfs);
308		ZFS_VERIFY_ZP(zp);
309
310		/* offset parameter is in/out */
311		error = zfs_holey(vp, com, &off);
312		ZFS_EXIT(zfsvfs);
313		if (error)
314			return (error);
315		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
316			return (EFAULT);
317		return (0);
318	}
319	return (ENOTTY);
320}
321
322/*
323 * Utility functions to map and unmap a single physical page.  These
324 * are used to manage the mappable copies of ZFS file data, and therefore
325 * do not update ref/mod bits.
326 */
327caddr_t
328zfs_map_page(page_t *pp, enum seg_rw rw)
329{
330	if (kpm_enable)
331		return (hat_kpm_mapin(pp, 0));
332	ASSERT(rw == S_READ || rw == S_WRITE);
333	return (ppmapin(pp, PROT_READ | ((rw == S_WRITE) ? PROT_WRITE : 0),
334	    (caddr_t)-1));
335}
336
337void
338zfs_unmap_page(page_t *pp, caddr_t addr)
339{
340	if (kpm_enable) {
341		hat_kpm_mapout(pp, 0, addr);
342	} else {
343		ppmapout(addr);
344	}
345}
346
347/*
348 * When a file is memory mapped, we must keep the IO data synchronized
349 * between the DMU cache and the memory mapped pages.  What this means:
350 *
351 * On Write:	If we find a memory mapped page, we write to *both*
352 *		the page and the dmu buffer.
353 */
354static void
355update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid)
356{
357	int64_t	off;
358
359	off = start & PAGEOFFSET;
360	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
361		page_t *pp;
362		uint64_t nbytes = MIN(PAGESIZE - off, len);
363
364		if (pp = page_lookup(vp, start, SE_SHARED)) {
365			caddr_t va;
366
367			va = zfs_map_page(pp, S_WRITE);
368			(void) dmu_read(os, oid, start+off, nbytes, va+off,
369			    DMU_READ_PREFETCH);
370			zfs_unmap_page(pp, va);
371			page_unlock(pp);
372		}
373		len -= nbytes;
374		off = 0;
375	}
376}
377
378/*
379 * When a file is memory mapped, we must keep the IO data synchronized
380 * between the DMU cache and the memory mapped pages.  What this means:
381 *
382 * On Read:	We "read" preferentially from memory mapped pages,
383 *		else we default from the dmu buffer.
384 *
385 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
386 *	the file is memory mapped.
387 */
388static int
389mappedread(vnode_t *vp, int nbytes, uio_t *uio)
390{
391	znode_t *zp = VTOZ(vp);
392	objset_t *os = zp->z_zfsvfs->z_os;
393	int64_t	start, off;
394	int len = nbytes;
395	int error = 0;
396
397	start = uio->uio_loffset;
398	off = start & PAGEOFFSET;
399	for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
400		page_t *pp;
401		uint64_t bytes = MIN(PAGESIZE - off, len);
402
403		if (pp = page_lookup(vp, start, SE_SHARED)) {
404			caddr_t va;
405
406			va = zfs_map_page(pp, S_READ);
407			error = uiomove(va + off, bytes, UIO_READ, uio);
408			zfs_unmap_page(pp, va);
409			page_unlock(pp);
410		} else {
411			error = dmu_read_uio(os, zp->z_id, uio, bytes);
412		}
413		len -= bytes;
414		off = 0;
415		if (error)
416			break;
417	}
418	return (error);
419}
420
/*
 * Size, in bytes, of the chunks zfs_read() splits a request into.  Each
 * pass of its read loop transfers at most up to the next multiple of this
 * value, so no single mappedread()/dmu_read_uio() call crosses a chunk
 * boundary.
 */
offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
422
423/*
424 * Read bytes from specified file into supplied buffer.
425 *
426 *	IN:	vp	- vnode of file to be read from.
427 *		uio	- structure supplying read location, range info,
428 *			  and return buffer.
429 *		ioflag	- SYNC flags; used to provide FRSYNC semantics.
430 *		cr	- credentials of caller.
431 *		ct	- caller context
432 *
433 *	OUT:	uio	- updated offset and range, buffer filled.
434 *
435 *	RETURN:	0 if success
436 *		error code if failure
437 *
438 * Side Effects:
439 *	vp - atime updated if byte count > 0
440 */
441/* ARGSUSED */
442static int
443zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
444{
445	znode_t		*zp = VTOZ(vp);
446	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
447	objset_t	*os;
448	ssize_t		n, nbytes;
449	int		error;
450	rl_t		*rl;
451	xuio_t		*xuio = NULL;
452
453	ZFS_ENTER(zfsvfs);
454	ZFS_VERIFY_ZP(zp);
455	os = zfsvfs->z_os;
456
457	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
458		ZFS_EXIT(zfsvfs);
459		return (EACCES);
460	}
461
462	/*
463	 * Validate file offset
464	 */
465	if (uio->uio_loffset < (offset_t)0) {
466		ZFS_EXIT(zfsvfs);
467		return (EINVAL);
468	}
469
470	/*
471	 * Fasttrack empty reads
472	 */
473	if (uio->uio_resid == 0) {
474		ZFS_EXIT(zfsvfs);
475		return (0);
476	}
477
478	/*
479	 * Check for mandatory locks
480	 */
481	if (MANDMODE(zp->z_mode)) {
482		if (error = chklock(vp, FREAD,
483		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
484			ZFS_EXIT(zfsvfs);
485			return (error);
486		}
487	}
488
489	/*
490	 * If we're in FRSYNC mode, sync out this znode before reading it.
491	 */
492	if (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
493		zil_commit(zfsvfs->z_log, zp->z_id);
494
495	/*
496	 * Lock the range against changes.
497	 */
498	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
499
500	/*
501	 * If we are reading past end-of-file we can skip
502	 * to the end; but we might still need to set atime.
503	 */
504	if (uio->uio_loffset >= zp->z_size) {
505		error = 0;
506		goto out;
507	}
508
509	ASSERT(uio->uio_loffset < zp->z_size);
510	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
511
512	if ((uio->uio_extflg == UIO_XUIO) &&
513	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
514		int nblk;
515		int blksz = zp->z_blksz;
516		uint64_t offset = uio->uio_loffset;
517
518		xuio = (xuio_t *)uio;
519		if ((ISP2(blksz))) {
520			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
521			    blksz)) / blksz;
522		} else {
523			ASSERT(offset + n <= blksz);
524			nblk = 1;
525		}
526		(void) dmu_xuio_init(xuio, nblk);
527
528		if (vn_has_cached_data(vp)) {
529			/*
530			 * For simplicity, we always allocate a full buffer
531			 * even if we only expect to read a portion of a block.
532			 */
533			while (--nblk >= 0) {
534				(void) dmu_xuio_add(xuio,
535				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
536				    blksz), 0, blksz);
537			}
538		}
539	}
540
541	while (n > 0) {
542		nbytes = MIN(n, zfs_read_chunk_size -
543		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
544
545		if (vn_has_cached_data(vp))
546			error = mappedread(vp, nbytes, uio);
547		else
548			error = dmu_read_uio(os, zp->z_id, uio, nbytes);
549		if (error) {
550			/* convert checksum errors into IO errors */
551			if (error == ECKSUM)
552				error = EIO;
553			break;
554		}
555
556		n -= nbytes;
557	}
558out:
559	zfs_range_unlock(rl);
560
561	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
562	ZFS_EXIT(zfsvfs);
563	return (error);
564}
565
/*
 * Write the bytes to a file.
 *
 *	IN:	vp	- vnode of file to be written to.
 *		uio	- structure supplying write location, range info,
 *			  and data buffer.
 *		ioflag	- FAPPEND flag set if in append mode.
 *			  FSYNC/FDSYNC cause a zil_commit() before returning.
 *		cr	- credentials of caller.
 *		ct	- caller context (NFS/CIFS fem monitor only)
 *
 *	OUT:	uio	- updated offset and range.
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Errors generated directly by this function:
 *	EPERM	- file is immutable or read-only, or is append-only and a
 *		  non-append write starts below the current file size
 *	EINVAL	- negative write offset
 *	EFBIG	- write offset is at or beyond the file size limit
 *	EDQUOT	- zfs_owner_overquota() reports the owner over quota
 *
 * The request is written in chunks that never cross a z_max_blksz
 * boundary, each chunk in its own DMU transaction.  If any bytes were
 * consumed from the uio and we are not replaying the intent log, the
 * function returns 0 even when a later chunk failed (partial write).
 *
 * Timestamps:
 *	vp - ctime|mtime updated if byte count > 0
 */

/* ARGSUSED */
static int
zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	rlim64_t	limit = uio->uio_llimit;
	ssize_t		start_resid = uio->uio_resid;
	ssize_t		tx_bytes;
	uint64_t	end_size;
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	zilog_t		*zilog;
	offset_t	woff;
	ssize_t		n, nbytes;
	rl_t		*rl;
	int		max_blksz = zfsvfs->z_max_blksz;
	int		error;
	arc_buf_t	*abuf;
	iovec_t		*aiov;
	xuio_t		*xuio = NULL;
	int		i_iov = 0;
	int		iovcnt = uio->uio_iovcnt;
	iovec_t		*iovp = uio->uio_iov;
	int		write_eof;
	int		count = 0;
	sa_bulk_attr_t	bulk[4];
	uint64_t	mtime[2], ctime[2];

	/*
	 * Fasttrack empty write
	 */
	n = start_resid;
	if (n == 0)
		return (0);

	/*
	 * Treat an unlimited or oversized limit as the largest valid offset.
	 */
	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
		limit = MAXOFFSET_T;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/*
	 * Describe the attributes (mtime, ctime, size, flags) that are
	 * pushed out with sa_bulk_update() after each chunk is written.
	 */
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
	    &zp->z_size, 8);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
	    &zp->z_pflags, 8);

	/*
	 * Return EPERM if the file is immutable or read-only, or if it is
	 * append-only and this non-append write starts below the current
	 * end of file.
	 */
	if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
	    (uio->uio_loffset < zp->z_size))) {
		ZFS_EXIT(zfsvfs);
		return (EPERM);
	}

	zilog = zfsvfs->z_log;

	/*
	 * Validate file offset
	 */
	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
	if (woff < 0) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/*
	 * Check for mandatory locks before calling zfs_range_lock()
	 * in order to prevent a deadlock with locks set via fcntl().
	 */
	if (MANDMODE((mode_t)zp->z_mode) &&
	    (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Pre-fault the pages to ensure slow (eg NFS) pages
	 * don't hold up txg.
	 * Skip this if uio contains loaned arc_buf.
	 */
	if ((uio->uio_extflg == UIO_XUIO) &&
	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
		xuio = (xuio_t *)uio;
	else
		uio_prefaultpages(MIN(n, max_blksz), uio);

	/*
	 * If in append mode, set the io offset pointer to eof.
	 */
	if (ioflag & FAPPEND) {
		/*
		 * Obtain an appending range lock to guarantee file append
		 * semantics.  We reset the write offset once we have the lock.
		 */
		rl = zfs_range_lock(zp, 0, n, RL_APPEND);
		woff = rl->r_off;
		if (rl->r_len == UINT64_MAX) {
			/*
			 * We overlocked the file because this write will cause
			 * the file block size to increase.
			 * Note that zp_size cannot change with this lock held.
			 */
			woff = zp->z_size;
		}
		uio->uio_loffset = woff;
	} else {
		/*
		 * Note that if the file block size will change as a result of
		 * this write, then this range lock will lock the entire file
		 * so that we can re-write the block safely.
		 */
		rl = zfs_range_lock(zp, woff, n, RL_WRITER);
	}

	/*
	 * The starting offset is already at or past the limit: nothing
	 * can be written.
	 */
	if (woff >= limit) {
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (EFBIG);
	}

	/* Shorten the write so that it ends at the limit. */
	if ((woff + n) > limit || woff > (limit - n))
		n = limit - woff;

	/* Will this write extend the file length? */
	write_eof = (woff + n > zp->z_size);

	end_size = MAX(zp->z_size, woff + n);

	/*
	 * Write the file in reasonable size chunks.  Each chunk is written
	 * in a separate transaction; this keeps the intent log records small
	 * and allows us to do more fine-grained space accounting.
	 */
	while (n > 0) {
		abuf = NULL;
		woff = uio->uio_loffset;
		/*
		 * "again" is re-entered after dmu_tx_assign() returns
		 * ERESTART; an arc buffer already held in abuf is kept
		 * across the retry.
		 */
again:
		/*
		 * Stop with EDQUOT when the owner is over quota.
		 * NOTE(review): the B_FALSE/B_TRUE calls presumably check
		 * the user and group quotas respectively -- confirm against
		 * zfs_owner_overquota().
		 */
		if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
		    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
			if (abuf != NULL)
				dmu_return_arcbuf(abuf);
			error = EDQUOT;
			break;
		}

		if (xuio && abuf == NULL) {
			/*
			 * Zero-copy write: take the arc buffer for the next
			 * iovec out of the xuio.
			 */
			ASSERT(i_iov < iovcnt);
			aiov = &iovp[i_iov];
			abuf = dmu_xuio_arcbuf(xuio, i_iov);
			dmu_xuio_clear(xuio, i_iov);
			DTRACE_PROBE3(zfs_cp_write, int, i_iov,
			    iovec_t *, aiov, arc_buf_t *, abuf);
			ASSERT((aiov->iov_base == abuf->b_data) ||
			    ((char *)aiov->iov_base - (char *)abuf->b_data +
			    aiov->iov_len == arc_buf_size(abuf)));
			i_iov++;
		} else if (abuf == NULL && n >= max_blksz &&
		    woff >= zp->z_size &&
		    P2PHASE(woff, max_blksz) == 0 &&
		    zp->z_blksz == max_blksz) {
			/*
			 * This write covers a full block.  "Borrow" a buffer
			 * from the dmu so that we can fill it before we enter
			 * a transaction.  This avoids the possibility of
			 * holding up the transaction if the data copy hangs
			 * up on a pagefault (e.g., from an NFS server mapping).
			 */
			size_t cbytes;

			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
			    max_blksz);
			ASSERT(abuf != NULL);
			ASSERT(arc_buf_size(abuf) == max_blksz);
			if (error = uiocopy(abuf->b_data, max_blksz,
			    UIO_WRITE, uio, &cbytes)) {
				dmu_return_arcbuf(abuf);
				break;
			}
			ASSERT(cbytes == max_blksz);
		}

		/*
		 * Start a transaction.
		 */
		tx = dmu_tx_create(zfsvfs->z_os);
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
		zfs_sa_upgrade_txholds(tx, zp);
		error = dmu_tx_assign(tx, TXG_NOWAIT);
		if (error) {
			/*
			 * ERESTART: wait, abort this tx and retry the chunk.
			 * Any other error aborts the tx, gives back the
			 * arc buffer (if any) and ends the write loop.
			 */
			if (error == ERESTART) {
				dmu_tx_wait(tx);
				dmu_tx_abort(tx);
				goto again;
			}
			dmu_tx_abort(tx);
			if (abuf != NULL)
				dmu_return_arcbuf(abuf);
			break;
		}

		/*
		 * If zfs_range_lock() over-locked we grow the blocksize
		 * and then reduce the lock range.  This will only happen
		 * on the first iteration since zfs_range_reduce() will
		 * shrink down r_len to the appropriate size.
		 */
		if (rl->r_len == UINT64_MAX) {
			uint64_t new_blksz;

			if (zp->z_blksz > max_blksz) {
				ASSERT(!ISP2(zp->z_blksz));
				new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
			} else {
				new_blksz = MIN(end_size, max_blksz);
			}
			zfs_grow_blocksize(zp, new_blksz, tx);
			zfs_range_reduce(rl, woff, n);
		}

		/*
		 * XXX - should we really limit each write to z_max_blksz?
		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
		 */
		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));

		if (abuf == NULL) {
			/*
			 * No arc buffer in hand: copy from the uio inside
			 * the transaction and measure progress by how much
			 * uio_resid dropped.
			 */
			tx_bytes = uio->uio_resid;
			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes, tx);
			tx_bytes -= uio->uio_resid;
		} else {
			tx_bytes = nbytes;
			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
			/*
			 * If this is not a full block write, but we are
			 * extending the file past EOF and this data starts
			 * block-aligned, use assign_arcbuf().  Otherwise,
			 * write via dmu_write().
			 */
			if (tx_bytes < max_blksz && (!write_eof ||
			    aiov->iov_base != abuf->b_data)) {
				ASSERT(xuio);
				dmu_write(zfsvfs->z_os, zp->z_id, woff,
				    aiov->iov_len, aiov->iov_base, tx);
				dmu_return_arcbuf(abuf);
				xuio_stat_wbuf_copied();
			} else {
				ASSERT(xuio || tx_bytes == max_blksz);
				dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
				    woff, abuf, tx);
			}
			/* The data did not pass through the uio; advance it. */
			ASSERT(tx_bytes <= uio->uio_resid);
			uioskip(uio, tx_bytes);
		}
		/*
		 * Bring any cached pages of this vnode in line with the
		 * data just written.
		 */
		if (tx_bytes && vn_has_cached_data(vp)) {
			update_pages(vp, woff,
			    tx_bytes, zfsvfs->z_os, zp->z_id);
		}

		/*
		 * If we made no progress, we're done.  If we made even
		 * partial progress, update the znode and ZIL accordingly.
		 */
		if (tx_bytes == 0) {
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
			    (void *)&zp->z_size, sizeof (uint64_t), tx);
			dmu_tx_commit(tx);
			ASSERT(error != 0);
			break;
		}

		/*
		 * Clear Set-UID/Set-GID bits on successful write if not
		 * privileged and at least one of the execute bits is set.
		 *
		 * It would be nice to do this after all writes have
		 * been done, but that would still expose the ISUID/ISGID
		 * to another app after the partial write is committed.
		 *
		 * Note: we don't call zfs_fuid_map_id() here because
		 * user 0 is not an ephemeral uid.
		 */
		mutex_enter(&zp->z_acl_lock);
		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
		    (S_IXUSR >> 6))) != 0 &&
		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
		    secpolicy_vnode_setid_retain(cr,
		    (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
			uint64_t newmode;
			zp->z_mode &= ~(S_ISUID | S_ISGID);
			newmode = zp->z_mode;
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
			    (void *)&newmode, sizeof (uint64_t), tx);
		}
		mutex_exit(&zp->z_acl_lock);

		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
		    B_TRUE);

		/*
		 * Update the file size (zp_size) if it has changed;
		 * account for possible concurrent updates.
		 */
		while ((end_size = zp->z_size) < uio->uio_loffset) {
			(void) atomic_cas_64(&zp->z_size, end_size,
			    uio->uio_loffset);
			ASSERT(error == 0);
		}
		/*
		 * If we are replaying and eof is non zero then force
		 * the file size to the specified eof. Note, there's no
		 * concurrency during replay.
		 */
		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
			zp->z_size = zfsvfs->z_replay_eof;

		/* Push mtime, ctime, size and flags out in this tx. */
		error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);

		/* Log the chunk in the ZIL before the tx is committed. */
		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
		dmu_tx_commit(tx);

		if (error != 0)
			break;
		ASSERT(tx_bytes == nbytes);
		n -= nbytes;

		/* Pre-fault the pages for the next chunk, as done up front. */
		if (!xuio && n > 0)
			uio_prefaultpages(MIN(n, max_blksz), uio);
	}

	zfs_range_unlock(rl);

	/*
	 * If we're in replay mode, or we made no progress, return error.
	 * Otherwise, it's at least a partial write, so it's successful.
	 */
	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Synchronous write (or sync=always): wait for the intent log to
	 * commit before reporting success.
	 */
	if (ioflag & (FSYNC | FDSYNC) ||
	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, zp->z_id);

	ZFS_EXIT(zfsvfs);
	return (0);
}
938
939void
940zfs_get_done(zgd_t *zgd, int error)
941{
942	znode_t *zp = zgd->zgd_private;
943	objset_t *os = zp->z_zfsvfs->z_os;
944
945	if (zgd->zgd_db)
946		dmu_buf_rele(zgd->zgd_db, zgd);
947
948	zfs_range_unlock(zgd->zgd_rl);
949
950	/*
951	 * Release the vnode asynchronously as we currently have the
952	 * txg stopped from syncing.
953	 */
954	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
955
956	if (error == 0 && zgd->zgd_bp)
957		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
958
959	kmem_free(zgd, sizeof (zgd_t));
960}
961
#ifdef DEBUG
/*
 * Debug-only fault injection switch.  When non-zero, the next indirect
 * write processed by zfs_get_data() fails with EIO, and the switch is
 * reset to zero.
 */
static int zil_fault_io = 0;
#endif
965
/*
 * Get data to generate a TX_WRITE intent log record.
 *
 *	IN:	arg	- the zfsvfs_t the record belongs to.
 *		lr	- write log record; supplies object, offset and
 *			  length, and receives the block pointer for an
 *			  indirect write.
 *		buf	- destination for the data of an immediate write,
 *			  or NULL to request an indirect write.
 *		zio	- parent zio handed to dmu_sync() for an indirect
 *			  write.
 *
 *	RETURN:	0 if success
 *		ENOENT if the object cannot be obtained, has been unlinked,
 *		or the record starts at or beyond the current file size
 *		other error code if failure
 *
 * On every path except a successful dmu_sync(), cleanup is performed here
 * by calling zfs_get_done() directly; after a successful dmu_sync() the
 * zgd is handed to zfs_get_done() as the completion callback instead.
 */
int
zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
	zfsvfs_t *zfsvfs = arg;
	objset_t *os = zfsvfs->z_os;
	znode_t *zp;
	uint64_t object = lr->lr_foid;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;
	blkptr_t *bp = &lr->lr_blkptr;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error = 0;

	ASSERT(zio != NULL);
	ASSERT(size != 0);

	/*
	 * Nothing to do if the file has been removed
	 */
	if (zfs_zget(zfsvfs, object, &zp) != 0)
		return (ENOENT);
	if (zp->z_unlinked) {
		/*
		 * Release the vnode asynchronously as we currently have the
		 * txg stopped from syncing.
		 */
		VN_RELE_ASYNC(ZTOV(zp),
		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
		return (ENOENT);
	}

	/*
	 * The zgd carries everything zfs_get_done() needs in order to
	 * clean up: the zilog, the znode, and later the range lock, dbuf
	 * and block pointer.
	 */
	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
	zgd->zgd_zilog = zfsvfs->z_log;
	zgd->zgd_private = zp;

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) { /* immediate write */
		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
		/* test for truncation needs to be done while range locked */
		if (offset >= zp->z_size) {
			error = ENOENT;
		} else {
			error = dmu_read(os, object, offset, size, buf,
			    DMU_READ_NO_PREFETCH);
		}
		ASSERT(error == 0 || error == ENOENT);
	} else { /* indirect write */
		/*
		 * Have to lock the whole block to ensure when it's
		 * written out and its checksum is being calculated
		 * that no one can change the data. We need to re-check
		 * blocksize after we get the lock in case it's changed!
		 */
		for (;;) {
			uint64_t blkoff;
			size = zp->z_blksz;
			/*
			 * Round offset down to the start of its block; a
			 * non-power-of-2 block size is treated as a single
			 * block starting at offset 0.
			 */
			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
			offset -= blkoff;
			zgd->zgd_rl = zfs_range_lock(zp, offset, size,
			    RL_READER);
			if (zp->z_blksz == size)
				break;
			/*
			 * The block size changed while we waited for the
			 * lock: restore offset, drop the lock and retry.
			 */
			offset += blkoff;
			zfs_range_unlock(zgd->zgd_rl);
		}
		/* test for truncation needs to be done while range locked */
		if (lr->lr_offset >= zp->z_size)
			error = ENOENT;
#ifdef DEBUG
		/* One-shot injected failure, armed via zil_fault_io. */
		if (zil_fault_io) {
			error = EIO;
			zil_fault_io = 0;
		}
#endif
		if (error == 0)
			error = dmu_buf_hold(os, object, offset, zgd, &db,
			    DMU_READ_NO_PREFETCH);

		if (error == 0) {
			zgd->zgd_db = db;
			zgd->zgd_bp = bp;

			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    zfs_get_done, zgd);
			ASSERT(error || lr->lr_length <= zp->z_blksz);

			/*
			 * On success, we need to wait for the write I/O
			 * initiated by dmu_sync() to complete before we can
			 * release this dbuf.  We will finish everything up
			 * in the zfs_get_done() callback.
			 */
			if (error == 0)
				return (0);

			/*
			 * EALREADY from dmu_sync() is not treated as a
			 * failure: the record is retyped to TX_WRITE2 and
			 * success is reported.
			 * NOTE(review): presumably the block has already
			 * been written out -- confirm against dmu_sync().
			 */
			if (error == EALREADY) {
				lr->lr_common.lrc_txtype = TX_WRITE2;
				error = 0;
			}
		}
	}

	/*
	 * Reached on every path except a successful dmu_sync(): release
	 * the holds and free the zgd now.
	 */
	zfs_get_done(zgd, error);

	return (error);
}
1085
1086/*ARGSUSED*/
1087static int
1088zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
1089    caller_context_t *ct)
1090{
1091	znode_t *zp = VTOZ(vp);
1092	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1093	int error;
1094
1095	ZFS_ENTER(zfsvfs);
1096	ZFS_VERIFY_ZP(zp);
1097
1098	if (flag & V_ACE_MASK)
1099		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
1100	else
1101		error = zfs_zaccess_rwx(zp, mode, flag, cr);
1102
1103	ZFS_EXIT(zfsvfs);
1104	return (error);
1105}
1106
1107/*
1108 * If vnode is for a device return a specfs vnode instead.
1109 */
1110static int
1111specvp_check(vnode_t **vpp, cred_t *cr)
1112{
1113	int error = 0;
1114
1115	if (IS_DEVVP(*vpp)) {
1116		struct vnode *svp;
1117
1118		svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
1119		VN_RELE(*vpp);
1120		if (svp == NULL)
1121			error = ENOSYS;
1122		*vpp = svp;
1123	}
1124	return (error);
1125}
1126
1127
1128/*
1129 * Lookup an entry in a directory, or an extended attribute directory.
1130 * If it exists, return a held vnode reference for it.
1131 *
 * When neither LOOKUP_XATTR nor FIGNORECASE is set, a fast path runs
 * before ZFS_ENTER(): "" and "." resolve to dvp itself, and any other
 * name is first tried with dnlc_lookup().  Only when dnlc_lookup()
 * returns NULL does the lookup continue on the slow path, which ends
 * in zfs_dirlook().
 *
1132 *	IN:	dvp	- vnode of directory to search.
1133 *		nm	- name of entry to lookup.
1134 *		pnp	- full pathname to lookup [UNUSED].
1135 *		flags	- LOOKUP_XATTR set if looking for an attribute.
 *			  FIGNORECASE also disables the fast path; flags
 *			  is passed on to zfs_get_xattrdir()/zfs_dirlook().
1136 *		rdir	- root directory vnode [UNUSED].
1137 *		cr	- credentials of caller.
1138 *		ct	- caller context
1139 *		direntflags - directory lookup flags
 *			  (only forwarded to zfs_dirlook()).
1140 *		realpnp - returned pathname.
 *			  (only forwarded to zfs_dirlook()).
1141 *
1142 *	OUT:	vpp	- vnode of located entry, NULL if not found.
 *			  On the fast path *vpp is not written when an
 *			  error is returned.
1143 *
1144 *	RETURN:	0 if success
1145 *		error code if failure
 *		(ENOTDIR, EIO, ENOENT, EINVAL and EILSEQ are generated
 *		here; other codes come from the routines called)
1146 *
1147 * Timestamps:
1148 *	NA
1149 */
1150/* ARGSUSED */
1151static int
1152zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
1153    int flags, vnode_t *rdir, cred_t *cr,  caller_context_t *ct,
1154    int *direntflags, pathname_t *realpnp)
1155{
1156	znode_t *zdp = VTOZ(dvp);
1157	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1158	int	error = 0;
1159
1160	/* fast path */
1161	if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
1162
		/* dvp must be a directory whose SA handle is still set */
1163		if (dvp->v_type != VDIR) {
1164			return (ENOTDIR);
1165		} else if (zdp->z_sa_hdl == NULL) {
1166			return (EIO);
1167		}
1168
		/* "" and "." name the directory itself */
1169		if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
1170			error = zfs_fastaccesschk_execute(zdp, cr);
1171			if (!error) {
1172				*vpp = dvp;
1173				VN_HOLD(*vpp);
1174				return (0);
1175			}
1176			return (error);
1177		} else {
1178			vnode_t *tvp = dnlc_lookup(dvp, nm);
1179
			/*
			 * A non-NULL tvp is released again (VN_RELE) on
			 * every path that does not hand it to the caller.
			 */
1180			if (tvp) {
1181				error = zfs_fastaccesschk_execute(zdp, cr);
1182				if (error) {
1183					VN_RELE(tvp);
1184					return (error);
1185				}
				/* DNLC_NO_VNODE is reported as ENOENT */
1186				if (tvp == DNLC_NO_VNODE) {
1187					VN_RELE(tvp);
1188					return (ENOENT);
1189				} else {
					/* device vnodes get swapped here */
1190					*vpp = tvp;
1191					return (specvp_check(vpp, cr));
1192				}
1193			}
1194		}
1195	}
1196
	/* Reached when the fast path did not apply or dnlc_lookup() missed */
1197	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
1198
1199	ZFS_ENTER(zfsvfs);
1200	ZFS_VERIFY_ZP(zdp);
1201
1202	*vpp = NULL;
1203
1204	if (flags & LOOKUP_XATTR) {
1205		/*
1206		 * If the xattr property is off, refuse the lookup request.
1207		 */
1208		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
1209			ZFS_EXIT(zfsvfs);
1210			return (EINVAL);
1211		}
1212
1213		/*
1214		 * We don't allow recursive attributes..
1215		 * Maybe someday we will.
1216		 */
1217		if (zdp->z_pflags & ZFS_XATTR) {
1218			ZFS_EXIT(zfsvfs);
1219			return (EINVAL);
1220		}
1221
1222		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
1223			ZFS_EXIT(zfsvfs);
1224			return (error);
1225		}
1226
1227		/*
1228		 * Do we have permission to get into attribute directory?
 *		 * If not, the vnode just obtained is released and *vpp
 *		 * is reset to NULL before the error is returned.
1229		 */
1230
1231		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
1232		    B_FALSE, cr)) {
1233			VN_RELE(*vpp);
1234			*vpp = NULL;
1235		}
1236
1237		ZFS_EXIT(zfsvfs);
1238		return (error);
1239	}
1240
1241	if (dvp->v_type != VDIR) {
1242		ZFS_EXIT(zfsvfs);
1243		return (ENOTDIR);
1244	}
1245
1246	/*
1247	 * Check accessibility of directory.
1248	 */
1249
1250	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
1251		ZFS_EXIT(zfsvfs);
1252		return (error);
1253	}
1254
	/*
	 * When z_utf8 is set, a name that fails u8_validate() is rejected
	 * with EILSEQ; the value u8_validate() stores in error is not used.
	 */
1255	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
1256	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1257		ZFS_EXIT(zfsvfs);
1258		return (EILSEQ);
1259	}
1260
1261	error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
1262	if (error == 0)
1263		error = specvp_check(vpp, cr);
1264
1265	ZFS_EXIT(zfsvfs);
1266	return (error);
1267}
1268
1269/*
1270 * Attempt to create a new entry in a directory.  If the entry
1271 * already exists, truncate the file if permissible, else return
1272 * an error.  Return the vp of the created or trunc'd file.
1273 *
1274 *	IN:	dvp	- vnode of directory to put new file entry in.
1275 *		name	- name of new file entry.
 *			  An empty name refers to dvp itself.
1276 *		vap	- attributes of new file.
1277 *		excl	- flag indicating exclusive or non-exclusive mode.
1278 *		mode	- mode to open file with.
1279 *		cr	- credentials of caller.
1280 *		flag	- file flags; FIGNORECASE and FAPPEND are examined.
1281 *		ct	- caller context
1282 *		vsecp 	- ACL to be set
1283 *
1284 *	OUT:	vpp	- vnode of created or trunc'd entry.
1285 *
1286 *	RETURN:	0 if success
1287 *		error code if failure
 *		(EINVAL, EILSEQ, EISDIR, EEXIST and EDQUOT are generated
 *		here; other codes come from the routines called)
1288 *
1289 * Timestamps:
1290 *	dvp - ctime|mtime updated if new entry created
1291 *	 vp - ctime|mtime always, atime if new
1292 */
1293
1294/* ARGSUSED */
1295static int
1296zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl,
1297    int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct,
1298    vsecattr_t *vsecp)
1299{
1300	znode_t		*zp, *dzp = VTOZ(dvp);
1301	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1302	zilog_t		*zilog;
1303	objset_t	*os;
1304	zfs_dirlock_t	*dl;
1305	dmu_tx_t	*tx;
1306	int		error;
1307	ksid_t		*ksid;
1308	uid_t		uid;
1309	gid_t		gid = crgetgid(cr);
1310	zfs_acl_ids_t   acl_ids;
1311	boolean_t	fuid_dirtied;
	/* B_TRUE once acl_ids has been filled in by zfs_acl_ids_create() */
1312	boolean_t	have_acl = B_FALSE;
1313
1314	/*
1315	 * If we have an ephemeral id, ACL, or XVATTR then
1316	 * make sure file system is at proper version
1317	 */
1318
1319	ksid = crgetsid(cr, KSID_OWNER);
1320	if (ksid)
1321		uid = ksid_getid(ksid);
1322	else
1323		uid = crgetuid(cr);
1324
1325	if (zfsvfs->z_use_fuids == B_FALSE &&
1326	    (vsecp || (vap->va_mask & AT_XVATTR) ||
1327	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1328		return (EINVAL);
1329
1330	ZFS_ENTER(zfsvfs);
1331	ZFS_VERIFY_ZP(dzp);
1332	os = zfsvfs->z_os;
1333	zilog = zfsvfs->z_log;
1334
	/*
	 * When z_utf8 is set, a name that fails u8_validate() is rejected
	 * with EILSEQ.
	 */
1335	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
1336	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1337		ZFS_EXIT(zfsvfs);
1338		return (EILSEQ);
1339	}
1340
1341	if (vap->va_mask & AT_XVATTR) {
1342		if ((error = secpolicy_xvattr((xvattr_t *)vap,
1343		    crgetuid(cr), cr, vap->va_type)) != 0) {
1344			ZFS_EXIT(zfsvfs);
1345			return (error);
1346		}
1347	}
	/*
	 * Retry point: re-entered after dmu_tx_assign() returns ERESTART.
	 * have_acl/acl_ids carry over, so zfs_acl_ids_create() runs at
	 * most once across retries.
	 */
1348top:
1349	*vpp = NULL;
1350
	/*
	 * Clear VSVTX from the requested mode when
	 * secpolicy_vnode_stky_modify() returns non-zero.
	 */
1351	if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr))
1352		vap->va_mode &= ~VSVTX;
1353
1354	if (*name == '\0') {
1355		/*
1356		 * Null component name refers to the directory itself.
1357		 */
1358		VN_HOLD(dvp);
1359		zp = dzp;
1360		dl = NULL;
1361		error = 0;
1362	} else {
1363		/* possible VN_HOLD(zp) */
1364		int zflg = 0;
1365
1366		if (flag & FIGNORECASE)
1367			zflg |= ZCILOOK;
1368
1369		error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1370		    NULL, NULL);
1371		if (error) {
1372			if (have_acl)
1373				zfs_acl_ids_free(&acl_ids);
			/* any lock failure on ".." is reported as EISDIR */
1374			if (strcmp(name, "..") == 0)
1375				error = EISDIR;
1376			ZFS_EXIT(zfsvfs);
1377			return (error);
1378		}
1379	}
1380
	/* zp == NULL: no existing znode was returned for this name */
1381	if (zp == NULL) {
1382		uint64_t txtype;
1383
1384		/*
1385		 * Create a new file object and update the directory
1386		 * to reference it.
1387		 */
1388		if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
1389			if (have_acl)
1390				zfs_acl_ids_free(&acl_ids);
1391			goto out;
1392		}
1393
1394		/*
1395		 * We only support the creation of regular files in
1396		 * extended attribute directories.
1397		 */
1398
1399		if ((dzp->z_pflags & ZFS_XATTR) &&
1400		    (vap->va_type != VREG)) {
1401			if (have_acl)
1402				zfs_acl_ids_free(&acl_ids);
1403			error = EINVAL;
1404			goto out;
1405		}
1406
		/* Build the ACL ids only on the first pass through top */
1407		if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
1408		    cr, vsecp, &acl_ids)) != 0)
1409			goto out;
1410		have_acl = B_TRUE;
1411
1412		if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1413			zfs_acl_ids_free(&acl_ids);
1414			error = EDQUOT;
1415			goto out;
1416		}
1417
		/* Declare everything the transaction may touch */
1418		tx = dmu_tx_create(os);
1419
1420		dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1421		    ZFS_SA_BASE_ATTR_SIZE);
1422
1423		fuid_dirtied = zfsvfs->z_fuid_dirty;
1424		if (fuid_dirtied)
1425			zfs_fuid_txhold(zfsvfs, tx);
1426		dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1427		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
1428		if (!zfsvfs->z_use_sa &&
1429		    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1430			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1431			    0, acl_ids.z_aclp->z_acl_bytes);
1432		}
1433		error = dmu_tx_assign(tx, TXG_NOWAIT);
1434		if (error) {
			/*
			 * The dirent lock is dropped before waiting or
			 * returning.  On ERESTART, acl_ids is kept and the
			 * whole attempt restarts from top; any other error
			 * frees acl_ids and returns directly (not via out).
			 */
1435			zfs_dirent_unlock(dl);
1436			if (error == ERESTART) {
1437				dmu_tx_wait(tx);
1438				dmu_tx_abort(tx);
1439				goto top;
1440			}
1441			zfs_acl_ids_free(&acl_ids);
1442			dmu_tx_abort(tx);
1443			ZFS_EXIT(zfsvfs);
1444			return (error);
1445		}
1446		zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1447
1448		if (fuid_dirtied)
1449			zfs_fuid_sync(zfsvfs, tx);
1450
1451		(void) zfs_link_create(dl, zp, tx, ZNEW);
1452		txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1453		if (flag & FIGNORECASE)
1454			txtype |= TX_CI;
1455		zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1456		    vsecp, acl_ids.z_fuidp, vap);
1457		zfs_acl_ids_free(&acl_ids);
1458		dmu_tx_commit(tx);
1459	} else {
1460		int aflags = (flag & FAPPEND) ? V_APPEND : 0;
1461
		/* ACL ids left over from an earlier pass are not needed */
1462		if (have_acl)
1463			zfs_acl_ids_free(&acl_ids);
1464		have_acl = B_FALSE;
1465
1466		/*
1467		 * A directory entry already exists for this name.
1468		 */
1469		/*
1470		 * Can't truncate an existing file if in exclusive mode.
1471		 */
1472		if (excl == EXCL) {
1473			error = EEXIST;
1474			goto out;
1475		}
1476		/*
1477		 * Can't open a directory for writing.
1478		 */
1479		if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
1480			error = EISDIR;
1481			goto out;
1482		}
1483		/*
1484		 * Verify requested access to file.
1485		 */
1486		if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
1487			goto out;
1488		}
1489
		/* The parent's z_seq is bumped under its z_lock */
1490		mutex_enter(&dzp->z_lock);
1491		dzp->z_seq++;
1492		mutex_exit(&dzp->z_lock);
1493
1494		/*
1495		 * Truncate regular files if requested.
1496		 */
1497		if ((ZTOV(zp)->v_type == VREG) &&
1498		    (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
1499			/* we can't hold any locks when calling zfs_freesp() */
1500			zfs_dirent_unlock(dl);
1501			dl = NULL;
1502			error = zfs_freesp(zp, 0, 0, mode, TRUE);
1503			if (error == 0) {
1504				vnevent_create(ZTOV(zp), ct);
1505			}
1506		}
1507	}
	/*
	 * Common exit: drop the dirent lock if it is still held, release
	 * the znode hold on error, otherwise hand the vnode back through
	 * specvp_check().
	 */
1508out:
1509
1510	if (dl)
1511		zfs_dirent_unlock(dl);
1512
1513	if (error) {
1514		if (zp)
1515			VN_RELE(ZTOV(zp));
1516	} else {
1517		*vpp = ZTOV(zp);
1518		error = specvp_check(vpp, cr);
1519	}
1520
	/* With os_sync == ZFS_SYNC_ALWAYS the ZIL is committed before return */
1521	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1522		zil_commit(zilog, 0);
1523
1524	ZFS_EXIT(zfsvfs);
1525	return (error);
1526}
1527
1528/*
1529 * Remove an entry from a directory.
1530 *
1531 *	IN:	dvp	- vnode of directory to remove entry from.
1532 *		name	- name of entry to remove.
1533 *		cr	- credentials of caller.
1534 *		ct	- caller context
1535 *		flags	- case flags
 *			  (FIGNORECASE adds ZCILOOK to the dirent lock
 *			  flags and TX_CI to the logged txtype)
1536 *
1537 *	RETURN:	0 if success
1538 *		error code if failure
 *		(EPERM is returned here when the entry is a directory)
1539 *
1540 * Timestamps:
1541 *	dvp - ctime|mtime
1542 *	 vp - ctime (if nlink > 0)
1543 */
1544
/*
 * Zero value stored as the SA_ZPL_XATTR attribute by zfs_remove() when the
 * znode is not an SA znode (!z_is_sa) and its xattr object is unlinked.
 */
1545uint64_t null_xattr = 0;
1546
1547/*ARGSUSED*/
1548static int
1549zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
1550    int flags)
1551{
1552	znode_t		*zp, *dzp = VTOZ(dvp);
1553	znode_t		*xzp;
1554	vnode_t		*vp;
1555	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1556	zilog_t		*zilog;
1557	uint64_t	acl_obj, xattr_obj;
1558	uint64_t 	xattr_obj_unlinked = 0;
1559	uint64_t	obj = 0;
1560	zfs_dirlock_t	*dl;
1561	dmu_tx_t	*tx;
1562	boolean_t	may_delete_now, delete_now = FALSE;
1563	boolean_t	unlinked, toobig = FALSE;
1564	uint64_t	txtype;
1565	pathname_t	*realnmp = NULL;
1566	pathname_t	realnm;
1567	int		error;
1568	int		zflg = ZEXISTS;
1569
1570	ZFS_ENTER(zfsvfs);
1571	ZFS_VERIFY_ZP(dzp);
1572	zilog = zfsvfs->z_log;
1573
	/*
	 * For case-insensitive removal a pathname buffer is allocated;
	 * it is handed to zfs_dirent_lock() and its pn_buf is later used
	 * for dnlc_remove().  Every exit path frees it with pn_free().
	 */
1574	if (flags & FIGNORECASE) {
1575		zflg |= ZCILOOK;
1576		pn_alloc(&realnm);
1577		realnmp = &realnm;
1578	}
1579
	/* Retry point after dmu_tx_assign() returns ERESTART */
1580top:
1581	xattr_obj = 0;
1582	xzp = NULL;
1583	/*
1584	 * Attempt to lock directory; fail if entry doesn't exist.
1585	 */
1586	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1587	    NULL, realnmp)) {
1588		if (realnmp)
1589			pn_free(realnmp);
1590		ZFS_EXIT(zfsvfs);
1591		return (error);
1592	}
1593
1594	vp = ZTOV(zp);
1595
1596	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1597		goto out;
1598	}
1599
1600	/*
1601	 * Need to use rmdir for removing directories.
1602	 */
1603	if (vp->v_type == VDIR) {
1604		error = EPERM;
1605		goto out;
1606	}
1607
1608	vnevent_remove(vp, dvp, name, ct);
1609
1610	if (realnmp)
1611		dnlc_remove(dvp, realnmp->pn_buf);
1612	else
1613		dnlc_remove(dvp, name);
1614
	/*
	 * First guess, taken under v_lock: immediate deletion is only
	 * considered when v_count is 1 and the vnode has no cached data.
	 * The same conditions are re-checked below once the entry is gone.
	 */
1615	mutex_enter(&vp->v_lock);
1616	may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp);
1617	mutex_exit(&vp->v_lock);
1618
1619	/*
1620	 * We may delete the znode now, or we may put it in the unlinked set;
1621	 * it depends on whether we're the last link, and on whether there are
1622	 * other holds on the vnode.  So we dmu_tx_hold() the right things to
1623	 * allow for either case.
1624	 */
1625	obj = zp->z_id;
1626	tx = dmu_tx_create(zfsvfs->z_os);
1627	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1628	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1629	zfs_sa_upgrade_txholds(tx, zp);
1630	zfs_sa_upgrade_txholds(tx, dzp);
1631	if (may_delete_now) {
1632		toobig =
1633		    zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
1634		/* if the file is too big, only hold_free a token amount */
1635		dmu_tx_hold_free(tx, zp->z_id, 0,
1636		    (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
1637	}
1638
1639	/* are there any extended attributes? */
1640	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1641	    &xattr_obj, sizeof (xattr_obj));
1642	if (error == 0 && xattr_obj) {
1643		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
		/*
		 * NOTE(review): the zfs_zget() result is only checked by
		 * ASSERT3U.  If that check is compiled out and zfs_zget()
		 * fails, xzp would presumably still be NULL when it is
		 * dereferenced two lines below -- confirm.
		 */
1644		ASSERT3U(error, ==, 0);
1645		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1646		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1647	}
1648
1649	mutex_enter(&zp->z_lock);
1650	if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
1651		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
1652	mutex_exit(&zp->z_lock);
1653
1654	/* charge as an update -- would be nice not to charge at all */
1655	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1656
1657	error = dmu_tx_assign(tx, TXG_NOWAIT);
1658	if (error) {
		/*
		 * Undo everything taken since top (dirent lock, holds on
		 * vp and xzp).  On ERESTART retry from top; otherwise free
		 * the pathname buffer and return directly (not via out).
		 */
1659		zfs_dirent_unlock(dl);
1660		VN_RELE(vp);
1661		if (xzp)
1662			VN_RELE(ZTOV(xzp));
1663		if (error == ERESTART) {
1664			dmu_tx_wait(tx);
1665			dmu_tx_abort(tx);
1666			goto top;
1667		}
1668		if (realnmp)
1669			pn_free(realnmp);
1670		dmu_tx_abort(tx);
1671		ZFS_EXIT(zfsvfs);
1672		return (error);
1673	}
1674
1675	/*
1676	 * Remove the directory entry.
1677	 */
1678	error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
1679
1680	if (error) {
1681		dmu_tx_commit(tx);
1682		goto out;
1683	}
1684
1685	if (unlinked) {
1686
1687		/*
1688		 * Hold z_lock so that we can make sure that the ACL obj
1689		 * hasn't changed.  Could have been deleted due to
1690		 * zfs_sa_upgrade().
 *		 *
 *		 * z_lock stays held when this block ends; it is
 *		 * released in the delete_now branch or in the
 *		 * "else if (unlinked)" branch below.
1691		 */
1692		mutex_enter(&zp->z_lock);
1693		mutex_enter(&vp->v_lock);
1694		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1695		    &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
		/*
		 * Delete immediately only if the earlier guess still holds
		 * and neither the xattr object nor the external ACL object
		 * changed since the tx holds were taken.
		 */
1696		delete_now = may_delete_now && !toobig &&
1697		    vp->v_count == 1 && !vn_has_cached_data(vp) &&
1698		    xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) ==
1699		    acl_obj;
1700		mutex_exit(&vp->v_lock);
1701	}
1702
1703	if (delete_now) {
1704		if (xattr_obj_unlinked) {
			/*
			 * Mark the xattr znode unlinked with a link count
			 * of 0, add it to the unlinked set, and drop the
			 * reference to it from zp.
			 */
1705			ASSERT3U(xzp->z_links, ==, 2);
1706			mutex_enter(&xzp->z_lock);
1707			xzp->z_unlinked = 1;
1708			xzp->z_links = 0;
1709			error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
1710			    &xzp->z_links, sizeof (xzp->z_links), tx);
1711			ASSERT3U(error,  ==,  0);
1712			mutex_exit(&xzp->z_lock);
1713			zfs_unlinked_add(xzp, tx);
1714
1715			if (zp->z_is_sa)
1716				error = sa_remove(zp->z_sa_hdl,
1717				    SA_ZPL_XATTR(zfsvfs), tx);
1718			else
1719				error = sa_update(zp->z_sa_hdl,
1720				    SA_ZPL_XATTR(zfsvfs), &null_xattr,
1721				    sizeof (uint64_t), tx);
1722			ASSERT3U(error, ==, 0);
1723		}
		/*
		 * Our hold is the only one: drop it by hand instead of
		 * VN_RELE() (which is skipped at out for delete_now), then
		 * delete the znode within this transaction.
		 */
1724		mutex_enter(&vp->v_lock);
1725		vp->v_count--;
1726		ASSERT3U(vp->v_count, ==, 0);
1727		mutex_exit(&vp->v_lock);
1728		mutex_exit(&zp->z_lock);
1729		zfs_znode_delete(zp, tx);
1730	} else if (unlinked) {
1731		mutex_exit(&zp->z_lock);
1732		zfs_unlinked_add(zp, tx);
1733	}
1734
1735	txtype = TX_REMOVE;
1736	if (flags & FIGNORECASE)
1737		txtype |= TX_CI;
1738	zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
1739
1740	dmu_tx_commit(tx);
	/* Common exit for both success and the error paths that goto out */
1741out:
1742	if (realnmp)
1743		pn_free(realnmp);
1744
1745	zfs_dirent_unlock(dl);
1746
1747	if (!delete_now)
1748		VN_RELE(vp);
1749	if (xzp)
1750		VN_RELE(ZTOV(xzp));
1751
1752	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1753		zil_commit(zilog, 0);
1754
1755	ZFS_EXIT(zfsvfs);
1756	return (error);
1757}
1758
1759/*
1760 * Create a new directory and insert it into dvp using the name
1761 * provided.  Return a pointer to the inserted directory.
1762 *
1763 *	IN:	dvp	- vnode of directory to add subdir to.
1764 *		dirname	- name of new directory.
1765 *		vap	- attributes of new directory.
 *			  va_type must be VDIR (asserted).
1766 *		cr	- credentials of caller.
1767 *		ct	- caller context
 *		flags	- FIGNORECASE adds ZCILOOK to the dirent lock
 *			  flags and TX_CI to the logged txtype.
1768 *		vsecp	- ACL to be set
1769 *
1770 *	OUT:	vpp	- vnode of created directory.
1771 *
1772 *	RETURN:	0 if success
1773 *		error code if failure
 *		(EINVAL, EILSEQ and EDQUOT are generated here; other
 *		codes come from the routines called)
1774 *
1775 * Timestamps:
1776 *	dvp - ctime|mtime updated
1777 *	 vp - ctime|mtime|atime updated
1778 */
1779/*ARGSUSED*/
1780static int
1781zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
1782    caller_context_t *ct, int flags, vsecattr_t *vsecp)
1783{
1784	znode_t		*zp, *dzp = VTOZ(dvp);
1785	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1786	zilog_t		*zilog;
1787	zfs_dirlock_t	*dl;
1788	uint64_t	txtype;
1789	dmu_tx_t	*tx;
1790	int		error;
1791	int		zf = ZNEW;
1792	ksid_t		*ksid;
1793	uid_t		uid;
1794	gid_t		gid = crgetgid(cr);
1795	zfs_acl_ids_t   acl_ids;
1796	boolean_t	fuid_dirtied;
1797
1798	ASSERT(vap->va_type == VDIR);
1799
1800	/*
1801	 * If we have an ephemeral id, ACL, or XVATTR then
1802	 * make sure file system is at proper version
1803	 */
1804
1805	ksid = crgetsid(cr, KSID_OWNER);
1806	if (ksid)
1807		uid = ksid_getid(ksid);
1808	else
1809		uid = crgetuid(cr);
1810	if (zfsvfs->z_use_fuids == B_FALSE &&
1811	    (vsecp || (vap->va_mask & AT_XVATTR) ||
1812	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1813		return (EINVAL);
1814
1815	ZFS_ENTER(zfsvfs);
1816	ZFS_VERIFY_ZP(dzp);
1817	zilog = zfsvfs->z_log;
1818
	/* Directories cannot be created when the parent has ZFS_XATTR set */
1819	if (dzp->z_pflags & ZFS_XATTR) {
1820		ZFS_EXIT(zfsvfs);
1821		return (EINVAL);
1822	}
1823
	/*
	 * When z_utf8 is set, a name that fails u8_validate() is rejected
	 * with EILSEQ.
	 */
1824	if (zfsvfs->z_utf8 && u8_validate(dirname,
1825	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1826		ZFS_EXIT(zfsvfs);
1827		return (EILSEQ);
1828	}
1829	if (flags & FIGNORECASE)
1830		zf |= ZCILOOK;
1831
1832	if (vap->va_mask & AT_XVATTR) {
1833		if ((error = secpolicy_xvattr((xvattr_t *)vap,
1834		    crgetuid(cr), cr, vap->va_type)) != 0) {
1835			ZFS_EXIT(zfsvfs);
1836			return (error);
1837		}
1838	}
1839
	/*
	 * acl_ids is built once, before the retry label, and must be
	 * released with zfs_acl_ids_free() on every exit path from here on.
	 */
1840	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
1841	    vsecp, &acl_ids)) != 0) {
1842		ZFS_EXIT(zfsvfs);
1843		return (error);
1844	}
1845	/*
1846	 * First make sure the new directory doesn't exist.
1847	 *
1848	 * Existence is checked first to make sure we don't return
1849	 * EACCES instead of EEXIST which can cause some applications
1850	 * to fail.
1851	 */
	/* Retry point after dmu_tx_assign() returns ERESTART */
1852top:
1853	*vpp = NULL;
1854
1855	if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
1856	    NULL, NULL)) {
1857		zfs_acl_ids_free(&acl_ids);
1858		ZFS_EXIT(zfsvfs);
1859		return (error);
1860	}
1861
1862	if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
1863		zfs_acl_ids_free(&acl_ids);
1864		zfs_dirent_unlock(dl);
1865		ZFS_EXIT(zfsvfs);
1866		return (error);
1867	}
1868
1869	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1870		zfs_acl_ids_free(&acl_ids);
1871		zfs_dirent_unlock(dl);
1872		ZFS_EXIT(zfsvfs);
1873		return (EDQUOT);
1874	}
1875
1876	/*
1877	 * Add a new entry to the directory.
1878	 */
1879	tx = dmu_tx_create(zfsvfs->z_os);
1880	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
1881	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
1882	fuid_dirtied = zfsvfs->z_fuid_dirty;
1883	if (fuid_dirtied)
1884		zfs_fuid_txhold(zfsvfs, tx);
1885	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1886		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
1887		    acl_ids.z_aclp->z_acl_bytes);
1888	}
1889
1890	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1891	    ZFS_SA_BASE_ATTR_SIZE);
1892
1893	error = dmu_tx_assign(tx, TXG_NOWAIT);
1894	if (error) {
		/*
		 * The dirent lock is dropped first.  On ERESTART acl_ids
		 * is kept and the attempt restarts from top; any other
		 * error frees acl_ids and returns.
		 */
1895		zfs_dirent_unlock(dl);
1896		if (error == ERESTART) {
1897			dmu_tx_wait(tx);
1898			dmu_tx_abort(tx);
1899			goto top;
1900		}
1901		zfs_acl_ids_free(&acl_ids);
1902		dmu_tx_abort(tx);
1903		ZFS_EXIT(zfsvfs);
1904		return (error);
1905	}
1906
1907	/*
1908	 * Create new node.
1909	 */
1910	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1911
1912	if (fuid_dirtied)
1913		zfs_fuid_sync(zfsvfs, tx);
1914
1915	/*
1916	 * Now put new name in parent dir.
1917	 */
1918	(void) zfs_link_create(dl, zp, tx, ZNEW);
1919
	/* The new znode's vnode is handed to the caller as is */
1920	*vpp = ZTOV(zp);
1921
1922	txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
1923	if (flags & FIGNORECASE)
1924		txtype |= TX_CI;
1925	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
1926	    acl_ids.z_fuidp, vap);
1927
1928	zfs_acl_ids_free(&acl_ids);
1929
1930	dmu_tx_commit(tx);
1931
1932	zfs_dirent_unlock(dl);
1933
	/* With os_sync == ZFS_SYNC_ALWAYS the ZIL is committed before return */
1934	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1935		zil_commit(zilog, 0);
1936
1937	ZFS_EXIT(zfsvfs);
1938	return (0);
1939}
1940
1941/*
1942 * Remove a directory subdir entry.  If the current working
1943 * directory is the same as the subdir to be removed, the
1944 * remove will fail.
1945 *
1946 *	IN:	dvp	- vnode of directory to remove from.
1947 *		name	- name of directory to be removed.
1948 *		cwd	- vnode of current working directory.
1949 *		cr	- credentials of caller.
1950 *		ct	- caller context
1951 *		flags	- case flags
 *			  (FIGNORECASE adds ZCILOOK to the dirent lock
 *			  flags and TX_CI to the logged txtype)
1952 *
1953 *	RETURN:	0 if success
1954 *		error code if failure
 *		(ENOTDIR when the entry is not a directory and EINVAL
 *		when it is cwd are generated here; other codes come
 *		from the routines called)
1955 *
1956 * Timestamps:
1957 *	dvp - ctime|mtime updated
1958 */
1959/*ARGSUSED*/
1960static int
1961zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
1962    caller_context_t *ct, int flags)
1963{
1964	znode_t		*dzp = VTOZ(dvp);
1965	znode_t		*zp;
1966	vnode_t		*vp;
1967	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
1968	zilog_t		*zilog;
1969	zfs_dirlock_t	*dl;
1970	dmu_tx_t	*tx;
1971	int		error;
1972	int		zflg = ZEXISTS;
1973
1974	ZFS_ENTER(zfsvfs);
1975	ZFS_VERIFY_ZP(dzp);
1976	zilog = zfsvfs->z_log;
1977
1978	if (flags & FIGNORECASE)
1979		zflg |= ZCILOOK;
	/* Retry point after dmu_tx_assign() returns ERESTART */
1980top:
1981	zp = NULL;
1982
1983	/*
1984	 * Attempt to lock directory; fail if entry doesn't exist.
1985	 */
1986	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1987	    NULL, NULL)) {
1988		ZFS_EXIT(zfsvfs);
1989		return (error);
1990	}
1991
1992	vp = ZTOV(zp);
1993
1994	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1995		goto out;
1996	}
1997
1998	if (vp->v_type != VDIR) {
1999		error = ENOTDIR;
2000		goto out;
2001	}
2002
	/* Refuse to remove the caller's current working directory */
2003	if (vp == cwd) {
2004		error = EINVAL;
2005		goto out;
2006	}
2007
2008	vnevent_rmdir(vp, dvp, name, ct);
2009
2010	/*
2011	 * Grab a lock on the directory to make sure that no one is
2012	 * trying to add (or lookup) entries while we are removing it.
2013	 */
2014	rw_enter(&zp->z_name_lock, RW_WRITER);
2015
2016	/*
2017	 * Grab a lock on the parent pointer to make sure we play well
2018	 * with the treewalk and directory rename code.
2019	 */
2020	rw_enter(&zp->z_parent_lock, RW_WRITER);
2021
2022	tx = dmu_tx_create(zfsvfs->z_os);
2023	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2024	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2025	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2026	zfs_sa_upgrade_txholds(tx, zp);
2027	zfs_sa_upgrade_txholds(tx, dzp);
2028	error = dmu_tx_assign(tx, TXG_NOWAIT);
2029	if (error) {
		/*
		 * Release, in reverse order of acquisition, everything
		 * taken since top.  On ERESTART retry from top; otherwise
		 * return directly (not via out).
		 */
2030		rw_exit(&zp->z_parent_lock);
2031		rw_exit(&zp->z_name_lock);
2032		zfs_dirent_unlock(dl);
2033		VN_RELE(vp);
2034		if (error == ERESTART) {
2035			dmu_tx_wait(tx);
2036			dmu_tx_abort(tx);
2037			goto top;
2038		}
2039		dmu_tx_abort(tx);
2040		ZFS_EXIT(zfsvfs);
2041		return (error);
2042	}
2043
2044	error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
2045
	/* The removal is logged only when zfs_link_destroy() succeeded */
2046	if (error == 0) {
2047		uint64_t txtype = TX_RMDIR;
2048		if (flags & FIGNORECASE)
2049			txtype |= TX_CI;
2050		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
2051	}
2052
	/* The transaction is committed whether or not the destroy failed */
2053	dmu_tx_commit(tx);
2054
2055	rw_exit(&zp->z_parent_lock);
2056	rw_exit(&zp->z_name_lock);
	/*
	 * Common exit; the early "goto out" paths arrive here before either
	 * rwlock has been taken.
	 */
2057out:
2058	zfs_dirent_unlock(dl);
2059
2060	VN_RELE(vp);
2061
2062	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2063		zil_commit(zilog, 0);
2064
2065	ZFS_EXIT(zfsvfs);
2066	return (error);
2067}
2068
2069/*
2070 * Read as many directory entries as will fit into the provided
2071 * buffer from the given directory cursor position (specified in
2072 * the uio structure).
2073 *
2074 *	IN:	vp	- vnode of directory to read.
2075 *		uio	- structure supplying read location, range info,
2076 *			  and return buffer.
2077 *		cr	- credentials of caller.
2078 *		ct	- caller context
2079 *		flags	- V_RDDIR_ENTFLAGS selects edirent_t records
 *			  instead of dirent64_t; V_RDDIR_ACCFILTER omits
 *			  entries for which zfs_has_access() is false.
2080 *
2081 *	OUT:	uio	- updated offset and range, buffer filled.
2082 *		eofp	- set to true if end-of-file detected.
 *			  May be NULL, in which case a local is used.
2083 *
2084 *	RETURN:	0 if success
2085 *		error code if failure
 *		(EINVAL for an empty iovec or a buffer too small for
 *		the first entry, and ENXIO for a malformed directory
 *		entry, are generated here)
2086 *
2087 * Timestamps:
2088 *	vp - atime updated
2089 *
2090 * Note that the low 4 bits of the cookie returned by zap is always zero.
2091 * This allows us to use the low range for "special" directory entries:
2092 * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
2093 * we use the offset 2 for the '.zfs' directory.
2094 */
2095/* ARGSUSED */
2096static int
2097zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp,
2098    caller_context_t *ct, int flags)
2099{
2100	znode_t		*zp = VTOZ(vp);
2101	iovec_t		*iovp;
	/* eodp and odp walk the same output buffer; flags picks which is used */
2102	edirent_t	*eodp;
2103	dirent64_t	*odp;
2104	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2105	objset_t	*os;
2106	caddr_t		outbuf;
2107	size_t		bufsize;
2108	zap_cursor_t	zc;
2109	zap_attribute_t	zap;
2110	uint_t		bytes_wanted;
2111	uint64_t	offset; /* must be unsigned; checks for < 1 */
2112	uint64_t	parent;
2113	int		local_eof;
2114	int		outcount;
2115	int		error;
2116	uint8_t		prefetch;
2117	boolean_t	check_sysattrs;
2118
2119	ZFS_ENTER(zfsvfs);
2120	ZFS_VERIFY_ZP(zp);
2121
	/* The parent object number is reported as the inode of ".." */
2122	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
2123	    &parent, sizeof (parent))) != 0) {
2124		ZFS_EXIT(zfsvfs);
2125		return (error);
2126	}
2127
2128	/*
2129	 * If we are not given an eof variable,
2130	 * use a local one.
2131	 */
2132	if (eofp == NULL)
2133		eofp = &local_eof;
2134
2135	/*
2136	 * Check for valid iov_len.
 *	 * NOTE(review): if iov_len is an unsigned type this test only
 *	 * catches a length of zero -- confirm.
2137	 */
2138	if (uio->uio_iov->iov_len <= 0) {
2139		ZFS_EXIT(zfsvfs);
2140		return (EINVAL);
2141	}
2142
2143	/*
2144	 * Quit if directory has been removed (posix)
2145	 */
2146	if ((*eofp = zp->z_unlinked) != 0) {
2147		ZFS_EXIT(zfsvfs);
2148		return (0);
2149	}
2150
2151	error = 0;
2152	os = zfsvfs->z_os;
2153	offset = uio->uio_loffset;
2154	prefetch = zp->z_zn_prefetch;
2155
2156	/*
2157	 * Initialize the iterator cursor.
2158	 */
2159	if (offset <= 3) {
2160		/*
2161		 * Start iteration from the beginning of the directory.
2162		 */
2163		zap_cursor_init(&zc, os, zp->z_id);
2164	} else {
2165		/*
2166		 * The offset is a serialized cursor.
2167		 */
2168		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
2169	}
2170
2171	/*
2172	 * Get space to change directory entries into fs independent format.
 *	 *
 *	 * Unless the request is a single system-space iovec, entries
 *	 * are built in a temporary kmem buffer and copied out with
 *	 * uiomove() afterwards; otherwise they are written straight
 *	 * into iov_base.
2173	 */
2174	iovp = uio->uio_iov;
2175	bytes_wanted = iovp->iov_len;
2176	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
2177		bufsize = bytes_wanted;
2178		outbuf = kmem_alloc(bufsize, KM_SLEEP);
2179		odp = (struct dirent64 *)outbuf;
2180	} else {
2181		bufsize = bytes_wanted;
2182		odp = (struct dirent64 *)iovp->iov_base;
2183	}
2184	eodp = (struct edirent *)odp;
2185
2186	/*
2187	 * If this VFS supports the system attribute view interface; and
2188	 * we're looking at an extended attribute directory; and we care
2189	 * about normalization conflicts on this vfs; then we must check
2190	 * for normalization conflicts with the sysattr name space.
2191	 */
2192	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
2193	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
2194	    (flags & V_RDDIR_ENTFLAGS);
2195
2196	/*
2197	 * Transform to file-system independent format
2198	 */
2199	outcount = 0;
2200	while (outcount < bytes_wanted) {
2201		ino64_t objnum;
2202		ushort_t reclen;
		/* Points at the d_off/ed_off of the record emitted, if any */
2203		off64_t *next = NULL;
2204
2205		/*
2206		 * Special case `.', `..', and `.zfs'.
2207		 */
2208		if (offset == 0) {
2209			(void) strcpy(zap.za_name, ".");
2210			zap.za_normalization_conflict = 0;
2211			objnum = zp->z_id;
2212		} else if (offset == 1) {
2213			(void) strcpy(zap.za_name, "..");
2214			zap.za_normalization_conflict = 0;
2215			objnum = parent;
2216		} else if (offset == 2 && zfs_show_ctldir(zp)) {
2217			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
2218			zap.za_normalization_conflict = 0;
2219			objnum = ZFSCTL_INO_ROOT;
2220		} else {
2221			/*
2222			 * Grab next entry.
 *			 * ENOENT from the cursor sets *eofp and ends
 *			 * the loop normally; any other error skips
 *			 * the copy-out and jumps to update.
2223			 */
2224			if (error = zap_cursor_retrieve(&zc, &zap)) {
2225				if ((*eofp = (error == ENOENT)) != 0)
2226					break;
2227				else
2228					goto update;
2229			}
2230
			/* Entries must be exactly one 8-byte integer */
2231			if (zap.za_integer_length != 8 ||
2232			    zap.za_num_integers != 1) {
2233				cmn_err(CE_WARN, "zap_readdir: bad directory "
2234				    "entry, obj = %lld, offset = %lld\n",
2235				    (u_longlong_t)zp->z_id,
2236				    (u_longlong_t)offset);
2237				error = ENXIO;
2238				goto update;
2239			}
2240
2241			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
2242			/*
2243			 * MacOS X can extract the object type here such as:
2244			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2245			 */
2246
2247			if (check_sysattrs && !zap.za_normalization_conflict) {
2248				zap.za_normalization_conflict =
2249				    xattr_sysattr_casechk(zap.za_name);
2250			}
2251		}
2252
		/* This filter is applied to the special entries above as well */
2253		if (flags & V_RDDIR_ACCFILTER) {
2254			/*
2255			 * If we have no access at all, don't include
2256			 * this entry in the returned information
2257			 */
2258			znode_t	*ezp;
2259			if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
2260				goto skip_entry;
2261			if (!zfs_has_access(ezp, cr)) {
2262				VN_RELE(ZTOV(ezp));
2263				goto skip_entry;
2264			}
2265			VN_RELE(ZTOV(ezp));
2266		}
2267
2268		if (flags & V_RDDIR_ENTFLAGS)
2269			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
2270		else
2271			reclen = DIRENT64_RECLEN(strlen(zap.za_name));
2272
2273		/*
2274		 * Will this entry fit in the buffer?
2275		 */
2276		if (outcount + reclen > bufsize) {
2277			/*
2278			 * Did we manage to fit anything in the buffer?
2279			 */
2280			if (!outcount) {
2281				error = EINVAL;
2282				goto update;
2283			}
2284			break;
2285		}
2286		if (flags & V_RDDIR_ENTFLAGS) {
2287			/*
2288			 * Add extended flag entry:
2289			 */
2290			eodp->ed_ino = objnum;
2291			eodp->ed_reclen = reclen;
2292			/* NOTE: ed_off is the offset for the *next* entry */
2293			next = &(eodp->ed_off);
2294			eodp->ed_eflags = zap.za_normalization_conflict ?
2295			    ED_CASE_CONFLICT : 0;
2296			(void) strncpy(eodp->ed_name, zap.za_name,
2297			    EDIRENT_NAMELEN(reclen));
2298			eodp = (edirent_t *)((intptr_t)eodp + reclen);
2299		} else {
2300			/*
2301			 * Add normal entry:
2302			 */
2303			odp->d_ino = objnum;
2304			odp->d_reclen = reclen;
2305			/* NOTE: d_off is the offset for the *next* entry */
2306			next = &(odp->d_off);
2307			(void) strncpy(odp->d_name, zap.za_name,
2308			    DIRENT64_NAMELEN(reclen));
2309			odp = (dirent64_t *)((intptr_t)odp + reclen);
2310		}
2311		outcount += reclen;
2312
2313		ASSERT(outcount <= bufsize);
2314
2315		/* Prefetch znode */
2316		if (prefetch)
2317			dmu_prefetch(os, objnum, 0, 0);
2318
2319	skip_entry:
2320		/*
2321		 * Move to the next entry, fill in the previous offset.
 *		 * The special entries step the offset by one; everything
 *		 * else advances the zap cursor and uses its serialized
 *		 * form.  next is still NULL when the entry was skipped,
 *		 * so nothing is written in that case.
2322		 */
2323		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2324			zap_cursor_advance(&zc);
2325			offset = zap_cursor_serialize(&zc);
2326		} else {
2327			offset += 1;
2328		}
2329		if (next)
2330			*next = offset;
2331	}
2332	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2333
	/*
	 * Direct case: the records are already in the caller's buffer, so
	 * only the iovec/resid bookkeeping is advanced.  Otherwise copy the
	 * temporary buffer out; if that fails, offset is reset so that
	 * uio_loffset is left unchanged below.
	 */
2334	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
2335		iovp->iov_base += outcount;
2336		iovp->iov_len -= outcount;
2337		uio->uio_resid -= outcount;
2338	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
2339		/*
2340		 * Reset the pointer.
2341		 */
2342		offset = uio->uio_loffset;
2343	}
2344
	/* Error paths inside the loop jump here, skipping the copy-out */
2345update:
2346	zap_cursor_fini(&zc);
	/* Same condition as the kmem_alloc() above */
2347	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
2348		kmem_free(outbuf, bufsize);
2349
	/* ENOENT is not reported to the caller as an error */
2350	if (error == ENOENT)
2351		error = 0;
2352
2353	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2354
2355	uio->uio_loffset = offset;
2356	ZFS_EXIT(zfsvfs);
2357	return (error);
2358}
2359
2360ulong_t zfs_fsync_sync_cnt = 4;
2361
2362static int
2363zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
2364{
2365	znode_t	*zp = VTOZ(vp);
2366	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2367
2368	/*
2369	 * Regardless of whether this is required for standards conformance,
2370	 * this is the logical behavior when fsync() is called on a file with
2371	 * dirty pages.  We use B_ASYNC since the ZIL transactions are already
2372	 * going to be pushed out as part of the zil_commit().
2373	 */
2374	if (vn_has_cached_data(vp) && !(syncflag & FNODSYNC) &&
2375	    (vp->v_type == VREG) && !(IS_SWAPVP(vp)))
2376		(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_ASYNC, cr, ct);
2377
2378	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2379
2380	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
2381		ZFS_ENTER(zfsvfs);
2382		ZFS_VERIFY_ZP(zp);
2383		zil_commit(zfsvfs->z_log, zp->z_id);
2384		ZFS_EXIT(zfsvfs);
2385	}
2386	return (0);
2387}
2388
2389
2390/*
2391 * Get the requested file attributes and place them in the provided
2392 * vattr structure.
2393 *
2394 *	IN:	vp	- vnode of file.
2395 *		vap	- va_mask identifies requested attributes.
2396 *			  If AT_XVATTR set, then optional attrs are requested
2397 *		flags	- ATTR_NOACLCHECK (CIFS server context)
2398 *		cr	- credentials of caller.
2399 *		ct	- caller context
2400 *
2401 *	OUT:	vap	- attribute values.
2402 *
2403 *	RETURN:	0 (always succeeds)
2404 */
2405/* ARGSUSED */
2406static int
2407zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2408    caller_context_t *ct)
2409{
2410	znode_t *zp = VTOZ(vp);
2411	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2412	int	error = 0;
2413	uint64_t links;
2414	uint64_t mtime[2], ctime[2];
2415	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
2416	xoptattr_t *xoap = NULL;
2417	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2418	sa_bulk_attr_t bulk[2];
2419	int count = 0;
2420
2421	ZFS_ENTER(zfsvfs);
2422	ZFS_VERIFY_ZP(zp);
2423
2424	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
2425
2426	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
2427	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
2428
2429	if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
2430		ZFS_EXIT(zfsvfs);
2431		return (error);
2432	}
2433
2434	/*
2435	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
2436	 * Also, if we are the owner don't bother, since owner should
2437	 * always be allowed to read basic attributes of file.
2438	 */
2439	if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
2440	    (vap->va_uid != crgetuid(cr))) {
2441		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
2442		    skipaclchk, cr)) {
2443			ZFS_EXIT(zfsvfs);
2444			return (error);
2445		}
2446	}
2447
2448	/*
2449	 * Return all attributes.  It's cheaper to provide the answer
2450	 * than to determine whether we were asked the question.
2451	 */
2452
2453	mutex_enter(&zp->z_lock);
2454	vap->va_type = vp->v_type;
2455	vap->va_mode = zp->z_mode & MODEMASK;
2456	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
2457	vap->va_nodeid = zp->z_id;
2458	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
2459		links = zp->z_links + 1;
2460	else
2461		links = zp->z_links;
2462	vap->va_nlink = MIN(links, UINT32_MAX);	/* nlink_t limit! */
2463	vap->va_size = zp->z_size;
2464	vap->va_rdev = vp->v_rdev;
2465	vap->va_seq = zp->z_seq;
2466
2467	/*
2468	 * Add in any requested optional attributes and the create time.
2469	 * Also set the corresponding bits in the returned attribute bitmap.
2470	 */
2471	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
2472		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
2473			xoap->xoa_archive =
2474			    ((zp->z_pflags & ZFS_ARCHIVE) != 0);
2475			XVA_SET_RTN(xvap, XAT_ARCHIVE);
2476		}
2477
2478		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
2479			xoap->xoa_readonly =
2480			    ((zp->z_pflags & ZFS_READONLY) != 0);
2481			XVA_SET_RTN(xvap, XAT_READONLY);
2482		}
2483
2484		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
2485			xoap->xoa_system =
2486			    ((zp->z_pflags & ZFS_SYSTEM) != 0);
2487			XVA_SET_RTN(xvap, XAT_SYSTEM);
2488		}
2489
2490		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
2491			xoap->xoa_hidden =
2492			    ((zp->z_pflags & ZFS_HIDDEN) != 0);
2493			XVA_SET_RTN(xvap, XAT_HIDDEN);
2494		}
2495
2496		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2497			xoap->xoa_nounlink =
2498			    ((zp->z_pflags & ZFS_NOUNLINK) != 0);
2499			XVA_SET_RTN(xvap, XAT_NOUNLINK);
2500		}
2501
2502		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2503			xoap->xoa_immutable =
2504			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
2505			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
2506		}
2507
2508		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2509			xoap->xoa_appendonly =
2510			    ((zp->z_pflags & ZFS_APPENDONLY) != 0);
2511			XVA_SET_RTN(xvap, XAT_APPENDONLY);
2512		}
2513
2514		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2515			xoap->xoa_nodump =
2516			    ((zp->z_pflags & ZFS_NODUMP) != 0);
2517			XVA_SET_RTN(xvap, XAT_NODUMP);
2518		}
2519
2520		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
2521			xoap->xoa_opaque =
2522			    ((zp->z_pflags & ZFS_OPAQUE) != 0);
2523			XVA_SET_RTN(xvap, XAT_OPAQUE);
2524		}
2525
2526		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2527			xoap->xoa_av_quarantined =
2528			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
2529			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
2530		}
2531
2532		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2533			xoap->xoa_av_modified =
2534			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
2535			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
2536		}
2537
2538		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
2539		    vp->v_type == VREG) {
2540			zfs_sa_get_scanstamp(zp, xvap);
2541		}
2542
2543		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
2544			uint64_t times[2];
2545
2546			(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
2547			    times, sizeof (times));
2548			ZFS_TIME_DECODE(&xoap->xoa_createtime, times);
2549			XVA_SET_RTN(xvap, XAT_CREATETIME);
2550		}
2551
2552		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2553			xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
2554			XVA_SET_RTN(xvap, XAT_REPARSE);
2555		}
2556		if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
2557			xoap->xoa_generation = zp->z_gen;
2558			XVA_SET_RTN(xvap, XAT_GEN);
2559		}
2560
2561		if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
2562			xoap->xoa_offline =
2563			    ((zp->z_pflags & ZFS_OFFLINE) != 0);
2564			XVA_SET_RTN(xvap, XAT_OFFLINE);
2565		}
2566
2567		if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
2568			xoap->xoa_sparse =
2569			    ((zp->z_pflags & ZFS_SPARSE) != 0);
2570			XVA_SET_RTN(xvap, XAT_SPARSE);
2571		}
2572	}
2573
2574	ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
2575	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
2576	ZFS_TIME_DECODE(&vap->va_ctime, ctime);
2577
2578	mutex_exit(&zp->z_lock);
2579
2580	sa_object_size(zp->z_sa_hdl, &vap->va_blksize, &vap->va_nblocks);
2581
2582	if (zp->z_blksz == 0) {
2583		/*
2584		 * Block size hasn't been set; suggest maximal I/O transfers.
2585		 */
2586		vap->va_blksize = zfsvfs->z_max_blksz;
2587	}
2588
2589	ZFS_EXIT(zfsvfs);
2590	return (0);
2591}
2592
2593/*
2594 * Set the file attributes to the values contained in the
2595 * vattr structure.
2596 *
2597 *	IN:	vp	- vnode of file to be modified.
2598 *		vap	- new attribute values.
2599 *			  If AT_XVATTR set, then optional attrs are being set
2600 *		flags	- ATTR_UTIME set if non-default time values provided.
2601 *			- ATTR_NOACLCHECK (CIFS context only).
2602 *		cr	- credentials of caller.
2603 *		ct	- caller context
2604 *
2605 *	RETURN:	0 if success
2606 *		error code if failure
2607 *
2608 * Timestamps:
2609 *	vp - ctime updated, mtime updated if size changed.
2610 */
2611/* ARGSUSED */
2612static int
2613zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2614	caller_context_t *ct)
2615{
2616	znode_t		*zp = VTOZ(vp);
2617	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
2618	zilog_t		*zilog;
2619	dmu_tx_t	*tx;
2620	vattr_t		oldva;
2621	xvattr_t	tmpxvattr;
2622	uint_t		mask = vap->va_mask;
2623	uint_t		saved_mask;
2624	int		trim_mask = 0;
2625	uint64_t	new_mode;
2626	uint64_t	new_uid, new_gid;
2627	uint64_t	xattr_obj;
2628	uint64_t	mtime[2], ctime[2];
2629	znode_t		*attrzp;
2630	int		need_policy = FALSE;
2631	int		err, err2;
2632	zfs_fuid_info_t *fuidp = NULL;
2633	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
2634	xoptattr_t	*xoap;
2635	zfs_acl_t	*aclp;
2636	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2637	boolean_t	fuid_dirtied = B_FALSE;
2638	sa_bulk_attr_t	bulk[7], xattr_bulk[7];
2639	int		count = 0, xattr_count = 0;
2640
2641	if (mask == 0)
2642		return (0);
2643
2644	if (mask & AT_NOSET)
2645		return (EINVAL);
2646
2647	ZFS_ENTER(zfsvfs);
2648	ZFS_VERIFY_ZP(zp);
2649
2650	zilog = zfsvfs->z_log;
2651
2652	/*
2653	 * Make sure that if we have ephemeral uid/gid or xvattr specified
2654	 * that file system is at proper version level
2655	 */
2656
2657	if (zfsvfs->z_use_fuids == B_FALSE &&
2658	    (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2659	    ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2660	    (mask & AT_XVATTR))) {
2661		ZFS_EXIT(zfsvfs);
2662		return (EINVAL);
2663	}
2664
2665	if (mask & AT_SIZE && vp->v_type == VDIR) {
2666		ZFS_EXIT(zfsvfs);
2667		return (EISDIR);
2668	}
2669
2670	if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
2671		ZFS_EXIT(zfsvfs);
2672		return (EINVAL);
2673	}
2674
2675	/*
2676	 * If this is an xvattr_t, then get a pointer to the structure of
2677	 * optional attributes.  If this is NULL, then we have a vattr_t.
2678	 */
2679	xoap = xva_getxoptattr(xvap);
2680
2681	xva_init(&tmpxvattr);
2682
2683	/*
2684	 * Immutable files can only alter immutable bit and atime
2685	 */
2686	if ((zp->z_pflags & ZFS_IMMUTABLE) &&
2687	    ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
2688	    ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2689		ZFS_EXIT(zfsvfs);
2690		return (EPERM);
2691	}
2692
2693	if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
2694		ZFS_EXIT(zfsvfs);
2695		return (EPERM);
2696	}
2697
2698	/*
2699	 * Verify timestamps doesn't overflow 32 bits.
2700	 * ZFS can handle large timestamps, but 32bit syscalls can't
2701	 * handle times greater than 2039.  This check should be removed
2702	 * once large timestamps are fully supported.
2703	 */
2704	if (mask & (AT_ATIME | AT_MTIME)) {
2705		if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2706		    ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2707			ZFS_EXIT(zfsvfs);
2708			return (EOVERFLOW);
2709		}
2710	}
2711
2712top:
2713	attrzp = NULL;
2714	aclp = NULL;
2715
2716	/* Can this be moved to before the top label? */
2717	if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
2718		ZFS_EXIT(zfsvfs);
2719		return (EROFS);
2720	}
2721
2722	/*
2723	 * First validate permissions
2724	 */
2725
2726	if (mask & AT_SIZE) {
2727		err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
2728		if (err) {
2729			ZFS_EXIT(zfsvfs);
2730			return (err);
2731		}
2732		/*
2733		 * XXX - Note, we are not providing any open
2734		 * mode flags here (like FNDELAY), so we may
2735		 * block if there are locks present... this
2736		 * should be addressed in openat().
2737		 */
2738		/* XXX - would it be OK to generate a log record here? */
2739		err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2740		if (err) {
2741			ZFS_EXIT(zfsvfs);
2742			return (err);
2743		}
2744	}
2745
2746	if (mask & (AT_ATIME|AT_MTIME) ||
2747	    ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2748	    XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2749	    XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2750	    XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
2751	    XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
2752	    XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2753	    XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
2754		need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
2755		    skipaclchk, cr);
2756	}
2757
2758	if (mask & (AT_UID|AT_GID)) {
2759		int	idmask = (mask & (AT_UID|AT_GID));
2760		int	take_owner;
2761		int	take_group;
2762
2763		/*
2764		 * NOTE: even if a new mode is being set,
2765		 * we may clear S_ISUID/S_ISGID bits.
2766		 */
2767
2768		if (!(mask & AT_MODE))
2769			vap->va_mode = zp->z_mode;
2770
2771		/*
2772		 * Take ownership or chgrp to group we are a member of
2773		 */
2774
2775		take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
2776		take_group = (mask & AT_GID) &&
2777		    zfs_groupmember(zfsvfs, vap->va_gid, cr);
2778
2779		/*
2780		 * If both AT_UID and AT_GID are set then take_owner and
2781		 * take_group must both be set in order to allow taking
2782		 * ownership.
2783		 *
2784		 * Otherwise, send the check through secpolicy_vnode_setattr()
2785		 *
2786		 */
2787
2788		if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
2789		    ((idmask == AT_UID) && take_owner) ||
2790		    ((idmask == AT_GID) && take_group)) {
2791			if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
2792			    skipaclchk, cr) == 0) {
2793				/*
2794				 * Remove setuid/setgid for non-privileged users
2795				 */
2796				secpolicy_setid_clear(vap, cr);
2797				trim_mask = (mask & (AT_UID|AT_GID));
2798			} else {
2799				need_policy =  TRUE;
2800			}
2801		} else {
2802			need_policy =  TRUE;
2803		}
2804	}
2805
2806	mutex_enter(&zp->z_lock);
2807	oldva.va_mode = zp->z_mode;
2808	zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
2809	if (mask & AT_XVATTR) {
2810		/*
2811		 * Update xvattr mask to include only those attributes
2812		 * that are actually changing.
2813		 *
2814		 * the bits will be restored prior to actually setting
2815		 * the attributes so the caller thinks they were set.
2816		 */
2817		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2818			if (xoap->xoa_appendonly !=
2819			    ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
2820				need_policy = TRUE;
2821			} else {
2822				XVA_CLR_REQ(xvap, XAT_APPENDONLY);
2823				XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
2824			}
2825		}
2826
2827		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2828			if (xoap->xoa_nounlink !=
2829			    ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
2830				need_policy = TRUE;
2831			} else {
2832				XVA_CLR_REQ(xvap, XAT_NOUNLINK);
2833				XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
2834			}
2835		}
2836
2837		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2838			if (xoap->xoa_immutable !=
2839			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
2840				need_policy = TRUE;
2841			} else {
2842				XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
2843				XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
2844			}
2845		}
2846
2847		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2848			if (xoap->xoa_nodump !=
2849			    ((zp->z_pflags & ZFS_NODUMP) != 0)) {
2850				need_policy = TRUE;
2851			} else {
2852				XVA_CLR_REQ(xvap, XAT_NODUMP);
2853				XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
2854			}
2855		}
2856
2857		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2858			if (xoap->xoa_av_modified !=
2859			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
2860				need_policy = TRUE;
2861			} else {
2862				XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
2863				XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
2864			}
2865		}
2866
2867		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2868			if ((vp->v_type != VREG &&
2869			    xoap->xoa_av_quarantined) ||
2870			    xoap->xoa_av_quarantined !=
2871			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
2872				need_policy = TRUE;
2873			} else {
2874				XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
2875				XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
2876			}
2877		}
2878
2879		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2880			mutex_exit(&zp->z_lock);
2881			ZFS_EXIT(zfsvfs);
2882			return (EPERM);
2883		}
2884
2885		if (need_policy == FALSE &&
2886		    (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
2887		    XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
2888			need_policy = TRUE;
2889		}
2890	}
2891
2892	mutex_exit(&zp->z_lock);
2893
2894	if (mask & AT_MODE) {
2895		if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
2896			err = secpolicy_setid_setsticky_clear(vp, vap,
2897			    &oldva, cr);
2898			if (err) {
2899				ZFS_EXIT(zfsvfs);
2900				return (err);
2901			}
2902			trim_mask |= AT_MODE;
2903		} else {
2904			need_policy = TRUE;
2905		}
2906	}
2907
2908	if (need_policy) {
2909		/*
2910		 * If trim_mask is set then take ownership
2911		 * has been granted or write_acl is present and user
2912		 * has the ability to modify mode.  In that case remove
2913		 * UID|GID and or MODE from mask so that
2914		 * secpolicy_vnode_setattr() doesn't revoke it.
2915		 */
2916
2917		if (trim_mask) {
2918			saved_mask = vap->va_mask;
2919			vap->va_mask &= ~trim_mask;
2920		}
2921		err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
2922		    (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
2923		if (err) {
2924			ZFS_EXIT(zfsvfs);
2925			return (err);
2926		}
2927
2928		if (trim_mask)
2929			vap->va_mask |= saved_mask;
2930	}
2931
2932	/*
2933	 * secpolicy_vnode_setattr, or take ownership may have
2934	 * changed va_mask
2935	 */
2936	mask = vap->va_mask;
2937
2938	if ((mask & (AT_UID | AT_GID))) {
2939		err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
2940		    &xattr_obj, sizeof (xattr_obj));
2941
2942		if (err == 0 && xattr_obj) {
2943			err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
2944			if (err)
2945				goto out2;
2946		}
2947		if (mask & AT_UID) {
2948			new_uid = zfs_fuid_create(zfsvfs,
2949			    (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
2950			if (new_uid != zp->z_uid &&
2951			    zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
2952				if (attrzp)
2953					VN_RELE(ZTOV(attrzp));
2954				err = EDQUOT;
2955				goto out2;
2956			}
2957		}
2958
2959		if (mask & AT_GID) {
2960			new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
2961			    cr, ZFS_GROUP, &fuidp);
2962			if (new_gid != zp->z_gid &&
2963			    zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
2964				if (attrzp)
2965					VN_RELE(ZTOV(attrzp));
2966				err = EDQUOT;
2967				goto out2;
2968			}
2969		}
2970	}
2971	tx = dmu_tx_create(zfsvfs->z_os);
2972
2973	if (mask & AT_MODE) {
2974		uint64_t pmode = zp->z_mode;
2975		uint64_t acl_obj;
2976		new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
2977
2978		zfs_acl_chmod_setattr(zp, &aclp, new_mode);
2979
2980		mutex_enter(&zp->z_lock);
2981		if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
2982			/*
2983			 * Are we upgrading ACL from old V0 format
2984			 * to V1 format?
2985			 */
2986			if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
2987			    zfs_znode_acl_version(zp) ==
2988			    ZFS_ACL_VERSION_INITIAL) {
2989				dmu_tx_hold_free(tx, acl_obj, 0,
2990				    DMU_OBJECT_END);
2991				dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2992				    0, aclp->z_acl_bytes);
2993			} else {
2994				dmu_tx_hold_write(tx, acl_obj, 0,
2995				    aclp->z_acl_bytes);
2996			}
2997		} else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
2998			dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
2999			    0, aclp->z_acl_bytes);
3000		}
3001		mutex_exit(&zp->z_lock);
3002		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3003	} else {
3004		if ((mask & AT_XVATTR) &&
3005		    XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3006			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3007		else
3008			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3009	}
3010
3011	if (attrzp) {
3012		dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3013	}
3014
3015	fuid_dirtied = zfsvfs->z_fuid_dirty;
3016	if (fuid_dirtied)
3017		zfs_fuid_txhold(zfsvfs, tx);
3018
3019	zfs_sa_upgrade_txholds(tx, zp);
3020
3021	err = dmu_tx_assign(tx, TXG_NOWAIT);
3022	if (err) {
3023		if (err == ERESTART)
3024			dmu_tx_wait(tx);
3025		goto out;
3026	}
3027
3028	count = 0;
3029	/*
3030	 * Set each attribute requested.
3031	 * We group settings according to the locks they need to acquire.
3032	 *
3033	 * Note: you cannot set ctime directly, although it will be
3034	 * updated as a side-effect of calling this function.
3035	 */
3036
3037
3038	if (mask & (AT_UID|AT_GID|AT_MODE))
3039		mutex_enter(&zp->z_acl_lock);
3040	mutex_enter(&zp->z_lock);
3041
3042	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3043	    &zp->z_pflags, sizeof (zp->z_pflags));
3044
3045	if (attrzp) {
3046		if (mask & (AT_UID|AT_GID|AT_MODE))
3047			mutex_enter(&attrzp->z_acl_lock);
3048		mutex_enter(&attrzp->z_lock);
3049		SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3050		    SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
3051		    sizeof (attrzp->z_pflags));
3052	}
3053
3054	if (mask & (AT_UID|AT_GID)) {
3055
3056		if (mask & AT_UID) {
3057			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
3058			    &new_uid, sizeof (new_uid));
3059			zp->z_uid = new_uid;
3060			if (attrzp) {
3061				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3062				    SA_ZPL_UID(zfsvfs), NULL, &new_uid,
3063				    sizeof (new_uid));
3064				attrzp->z_uid = new_uid;
3065			}
3066		}
3067
3068		if (mask & AT_GID) {
3069			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
3070			    NULL, &new_gid, sizeof (new_gid));
3071			zp->z_gid = new_gid;
3072			if (attrzp) {
3073				SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3074				    SA_ZPL_GID(zfsvfs), NULL, &new_gid,
3075				    sizeof (new_gid));
3076				attrzp->z_gid = new_gid;
3077			}
3078		}
3079		if (!(mask & AT_MODE)) {
3080			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
3081			    NULL, &new_mode, sizeof (new_mode));
3082			new_mode = zp->z_mode;
3083		}
3084		err = zfs_acl_chown_setattr(zp);
3085		ASSERT(err == 0);
3086		if (attrzp) {
3087			err = zfs_acl_chown_setattr(attrzp);
3088			ASSERT(err == 0);
3089		}
3090	}
3091
3092	if (mask & AT_MODE) {
3093		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
3094		    &new_mode, sizeof (new_mode));
3095		zp->z_mode = new_mode;
3096		ASSERT3U((uintptr_t)aclp, !=, NULL);
3097		err = zfs_aclset_common(zp, aclp, cr, tx);
3098		ASSERT3U(err, ==, 0);
3099		if (zp->z_acl_cached)
3100			zfs_acl_free(zp->z_acl_cached);
3101		zp->z_acl_cached = aclp;
3102		aclp = NULL;
3103	}
3104
3105
3106	if (mask & AT_ATIME) {
3107		ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
3108		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
3109		    &zp->z_atime, sizeof (zp->z_atime));
3110	}
3111
3112	if (mask & AT_MTIME) {
3113		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
3114		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
3115		    mtime, sizeof (mtime));
3116	}
3117
3118	/* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
3119	if (mask & AT_SIZE && !(mask & AT_MTIME)) {
3120		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
3121		    NULL, mtime, sizeof (mtime));
3122		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3123		    &ctime, sizeof (ctime));
3124		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
3125		    B_TRUE);
3126	} else if (mask != 0) {
3127		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3128		    &ctime, sizeof (ctime));
3129		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
3130		    B_TRUE);
3131		if (attrzp) {
3132			SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3133			    SA_ZPL_CTIME(zfsvfs), NULL,
3134			    &ctime, sizeof (ctime));
3135			zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
3136			    mtime, ctime, B_TRUE);
3137		}
3138	}
3139	/*
3140	 * Do this after setting timestamps to prevent timestamp
3141	 * update from toggling bit
3142	 */
3143
3144	if (xoap && (mask & AT_XVATTR)) {
3145
3146		/*
3147		 * restore trimmed off masks
3148		 * so that return masks can be set for caller.
3149		 */
3150
3151		if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
3152			XVA_SET_REQ(xvap, XAT_APPENDONLY);
3153		}
3154		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
3155			XVA_SET_REQ(xvap, XAT_NOUNLINK);
3156		}
3157		if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
3158			XVA_SET_REQ(xvap, XAT_IMMUTABLE);
3159		}
3160		if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
3161			XVA_SET_REQ(xvap, XAT_NODUMP);
3162		}
3163		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
3164			XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
3165		}
3166		if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
3167			XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
3168		}
3169
3170		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3171			ASSERT(vp->v_type == VREG);
3172
3173		zfs_xvattr_set(zp, xvap, tx);
3174	}
3175
3176	if (fuid_dirtied)
3177		zfs_fuid_sync(zfsvfs, tx);
3178
3179	if (mask != 0)
3180		zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3181
3182	mutex_exit(&zp->z_lock);
3183	if (mask & (AT_UID|AT_GID|AT_MODE))
3184		mutex_exit(&zp->z_acl_lock);
3185
3186	if (attrzp) {
3187		if (mask & (AT_UID|AT_GID|AT_MODE))
3188			mutex_exit(&attrzp->z_acl_lock);
3189		mutex_exit(&attrzp->z_lock);
3190	}
3191out:
3192	if (err == 0 && attrzp) {
3193		err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
3194		    xattr_count, tx);
3195		ASSERT(err2 == 0);
3196	}
3197
3198	if (attrzp)
3199		VN_RELE(ZTOV(attrzp));
3200	if (aclp)
3201		zfs_acl_free(aclp);
3202
3203	if (fuidp) {
3204		zfs_fuid_info_free(fuidp);
3205		fuidp = NULL;
3206	}
3207
3208	if (err) {
3209		dmu_tx_abort(tx);
3210		if (err == ERESTART)
3211			goto top;
3212	} else {
3213		err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
3214		dmu_tx_commit(tx);
3215	}
3216
3217out2:
3218	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3219		zil_commit(zilog, 0);
3220
3221	ZFS_EXIT(zfsvfs);
3222	return (err);
3223}
3224
/*
 * One entry in the singly linked list of parent locks that
 * zfs_rename_lock() builds (newest entry first) and that
 * zfs_rename_unlock() tears down.  zl_znode is NULL when the entry
 * holds no znode reference.
 */
typedef struct zfs_zlock {
	krwlock_t	*zl_rwlock;	/* lock we acquired */
	znode_t		*zl_znode;	/* znode we held */
	struct zfs_zlock *zl_next;	/* next in list */
} zfs_zlock_t;
3230
3231/*
3232 * Drop locks and release vnodes that were held by zfs_rename_lock().
3233 */
3234static void
3235zfs_rename_unlock(zfs_zlock_t **zlpp)
3236{
3237	zfs_zlock_t *zl;
3238
3239	while ((zl = *zlpp) != NULL) {
3240		if (zl->zl_znode != NULL)
3241			VN_RELE(ZTOV(zl->zl_znode));
3242		rw_exit(zl->zl_rwlock);
3243		*zlpp = zl->zl_next;
3244		kmem_free(zl, sizeof (*zl));
3245	}
3246}
3247
3248/*
3249 * Search back through the directory tree, using the ".." entries.
3250 * Lock each directory in the chain to prevent concurrent renames.
3251 * Fail any attempt to move a directory into one of its own descendants.
3252 * XXX - z_parent_lock can overlap with map or grow locks
3253 */
3254static int
3255zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
3256{
3257	zfs_zlock_t	*zl;
3258	znode_t		*zp = tdzp;
3259	uint64_t	rootid = zp->z_zfsvfs->z_root;
3260	uint64_t	oidp = zp->z_id;
3261	krwlock_t	*rwlp = &szp->z_parent_lock;
3262	krw_t		rw = RW_WRITER;
3263
3264	/*
3265	 * First pass write-locks szp and compares to zp->z_id.
3266	 * Later passes read-lock zp and compare to zp->z_parent.
3267	 */
3268	do {
3269		if (!rw_tryenter(rwlp, rw)) {
3270			/*
3271			 * Another thread is renaming in this path.
3272			 * Note that if we are a WRITER, we don't have any
3273			 * parent_locks held yet.
3274			 */
3275			if (rw == RW_READER && zp->z_id > szp->z_id) {
3276				/*
3277				 * Drop our locks and restart
3278				 */
3279				zfs_rename_unlock(&zl);
3280				*zlpp = NULL;
3281				zp = tdzp;
3282				oidp = zp->z_id;
3283				rwlp = &szp->z_parent_lock;
3284				rw = RW_WRITER;
3285				continue;
3286			} else {
3287				/*
3288				 * Wait for other thread to drop its locks
3289				 */
3290				rw_enter(rwlp, rw);
3291			}
3292		}
3293
3294		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
3295		zl->zl_rwlock = rwlp;
3296		zl->zl_znode = NULL;
3297		zl->zl_next = *zlpp;
3298		*zlpp = zl;
3299
3300		if (oidp == szp->z_id)		/* We're a descendant of szp */
3301			return (EINVAL);
3302
3303		if (oidp == rootid)		/* We've hit the top */
3304			return (0);
3305
3306		if (rw == RW_READER) {		/* i.e. not the first pass */
3307			int error = zfs_zget(zp->z_zfsvfs, oidp, &zp);
3308			if (error)
3309				return (error);
3310			zl->zl_znode = zp;
3311		}
3312		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs),
3313		    &oidp, sizeof (oidp));
3314		rwlp = &zp->z_parent_lock;
3315		rw = RW_READER;
3316
3317	} while (zp->z_id != sdzp->z_id);
3318
3319	return (0);
3320}
3321
3322/*
3323 * Move an entry from the provided source directory to the target
3324 * directory.  Change the entry name as indicated.
3325 *
3326 *	IN:	sdvp	- Source directory containing the "old entry".
3327 *		snm	- Old entry name.
3328 *		tdvp	- Target directory to contain the "new entry".
3329 *		tnm	- New entry name.
3330 *		cr	- credentials of caller.
3331 *		ct	- caller context
3332 *		flags	- case flags
3333 *
3334 *	RETURN:	0 if success
3335 *		error code if failure
3336 *
3337 * Timestamps:
3338 *	sdvp,tdvp - ctime|mtime updated
3339 */
/*ARGSUSED*/
static int
zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
    caller_context_t *ct, int flags)
{
	znode_t		*tdzp, *szp, *tzp;
	znode_t		*sdzp = VTOZ(sdvp);
	zfsvfs_t	*zfsvfs = sdzp->z_zfsvfs;
	zilog_t		*zilog;
	vnode_t		*realvp;
	zfs_dirlock_t	*sdl, *tdl;
	dmu_tx_t	*tx;
	zfs_zlock_t	*zl;
	int		cmp, serr, terr;
	int		error = 0;
	int		zflg = 0;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(sdzp);
	zilog = zfsvfs->z_log;

	/*
	 * Make sure we have the real vp for the target directory.
	 */
	if (VOP_REALVP(tdvp, &realvp, ct) == 0)
		tdvp = realvp;

	/*
	 * The source and target directories must live on the same vfs,
	 * and the target may not be a zfsctl node; otherwise EXDEV.
	 */
	if (tdvp->v_vfsp != sdvp->v_vfsp || zfsctl_is_node(tdvp)) {
		ZFS_EXIT(zfsvfs);
		return (EXDEV);
	}

	tdzp = VTOZ(tdvp);
	ZFS_VERIFY_ZP(tdzp);
	/*
	 * When z_utf8 is set, a target name that fails validation is
	 * rejected with EILSEQ.
	 */
	if (zfsvfs->z_utf8 && u8_validate(tnm,
	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (EILSEQ);
	}

	if (flags & FIGNORECASE)
		zflg |= ZCILOOK;

	/*
	 * Retry point: we jump back here when dmu_tx_assign() reports
	 * ERESTART, after every lock and vnode hold taken below has
	 * been released.
	 */
top:
	szp = NULL;
	tzp = NULL;
	zl = NULL;

	/*
	 * This is to prevent the creation of links into attribute space
	 * by renaming a linked file into/outof an attribute directory.
	 * See the comment in zfs_link() for why this is considered bad.
	 */
	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}

	/*
	 * Lock source and target directory entries.  To prevent deadlock,
	 * a lock ordering must be defined.  We lock the directory with
	 * the smallest object id first, or if it's a tie, the one with
	 * the lexically first name.
	 */
	if (sdzp->z_id < tdzp->z_id) {
		cmp = -1;
	} else if (sdzp->z_id > tdzp->z_id) {
		cmp = 1;
	} else {
		/*
		 * First compare the two name arguments without
		 * considering any case folding.
		 */
		int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);

		cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
		ASSERT(error == 0 || !zfsvfs->z_utf8);
		if (cmp == 0) {
			/*
			 * POSIX: "If the old argument and the new argument
			 * both refer to links to the same existing file,
			 * the rename() function shall return successfully
			 * and perform no other action."
			 */
			ZFS_EXIT(zfsvfs);
			return (0);
		}
		/*
		 * If the file system is case-folding, then we may
		 * have some more checking to do.  A case-folding file
		 * system is either supporting mixed case sensitivity
		 * access or is completely case-insensitive.  Note
		 * that the file system is always case preserving.
		 *
		 * In mixed sensitivity mode case sensitive behavior
		 * is the default.  FIGNORECASE must be used to
		 * explicitly request case insensitive behavior.
		 *
		 * If the source and target names provided differ only
		 * by case (e.g., a request to rename 'tim' to 'Tim'),
		 * we will treat this as a special case in the
		 * case-insensitive mode: as long as the source name
		 * is an exact match, we will allow this to proceed as
		 * a name-change request.
		 */
		if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
		    (zfsvfs->z_case == ZFS_CASE_MIXED &&
		    flags & FIGNORECASE)) &&
		    u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
		    &error) == 0) {
			/*
			 * case preserving rename request, require exact
			 * name matches
			 */
			zflg |= ZCIEXACT;
			zflg &= ~ZCILOOK;
		}
	}

	/*
	 * If the source and destination directories are the same, we should
	 * grab the z_name_lock of that directory only once.
	 */
	if (sdzp == tdzp) {
		zflg |= ZHAVELOCK;
		rw_enter(&sdzp->z_name_lock, RW_READER);
	}

	/*
	 * Take the two dirent locks in the order decided above.  The
	 * source lookup always carries ZEXISTS; whichever lock is taken
	 * second is additionally passed ZRENAMING.
	 */
	if (cmp < 0) {
		serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
		    ZEXISTS | zflg, NULL, NULL);
		terr = zfs_dirent_lock(&tdl,
		    tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
	} else {
		terr = zfs_dirent_lock(&tdl,
		    tdzp, tnm, &tzp, zflg, NULL, NULL);
		serr = zfs_dirent_lock(&sdl,
		    sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
		    NULL, NULL);
	}

	if (serr) {
		/*
		 * Source entry invalid or not there.
		 */
		if (!terr) {
			zfs_dirent_unlock(tdl);
			if (tzp)
				VN_RELE(ZTOV(tzp));
		}

		if (sdzp == tdzp)
			rw_exit(&sdzp->z_name_lock);

		/* A source name of ".." is always reported as EINVAL. */
		if (strcmp(snm, "..") == 0)
			serr = EINVAL;
		ZFS_EXIT(zfsvfs);
		return (serr);
	}
	if (terr) {
		/*
		 * The source was locked successfully but the target was
		 * not: undo the source lock and hold before returning.
		 */
		zfs_dirent_unlock(sdl);
		VN_RELE(ZTOV(szp));

		if (sdzp == tdzp)
			rw_exit(&sdzp->z_name_lock);

		/* A target name of ".." is always reported as EINVAL. */
		if (strcmp(tnm, "..") == 0)
			terr = EINVAL;
		ZFS_EXIT(zfsvfs);
		return (terr);
	}

	/*
	 * Must have write access at the source to remove the old entry
	 * and write access at the target to create the new entry.
	 * Note that if target and source are the same, this can be
	 * done in a single check.
	 */

	if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
		goto out;

	if (ZTOV(szp)->v_type == VDIR) {
		/*
		 * Check to make sure rename is valid.
		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
		 */
		if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
			goto out;
	}

	/*
	 * Does target exist?
	 */
	if (tzp) {
		/*
		 * Source and target must be the same type.
		 */
		if (ZTOV(szp)->v_type == VDIR) {
			if (ZTOV(tzp)->v_type != VDIR) {
				error = ENOTDIR;
				goto out;
			}
		} else {
			if (ZTOV(tzp)->v_type == VDIR) {
				error = EISDIR;
				goto out;
			}
		}
		/*
		 * POSIX dictates that when the source and target
		 * entries refer to the same file object, rename
		 * must do nothing and exit without error.
		 */
		if (szp->z_id == tzp->z_id) {
			error = 0;
			goto out;
		}
	}

	/*
	 * Post vnode events for the source and, if one exists, the
	 * target being replaced.
	 */
	vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
	if (tzp)
		vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);

	/*
	 * notify the target directory if it is not the same
	 * as source directory.
	 */
	if (tdvp != sdvp) {
		vnevent_rename_dest_dir(tdvp, ct);
	}

	/*
	 * Build the transaction: holds on the SA handles of the source
	 * znode and source directory, on both directory ZAPs, on the
	 * target directory and existing target (when distinct/present),
	 * and on the unlinked-object ZAP.
	 */
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
	if (sdzp != tdzp) {
		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, tdzp);
	}
	if (tzp) {
		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, tzp);
	}

	zfs_sa_upgrade_txholds(tx, szp);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	error = dmu_tx_assign(tx, TXG_NOWAIT);
	if (error) {
		/*
		 * Assignment failed.  Release everything acquired since
		 * "top" first; on ERESTART we then wait on the tx and
		 * start over, otherwise the error is returned.
		 */
		if (zl != NULL)
			zfs_rename_unlock(&zl);
		zfs_dirent_unlock(sdl);
		zfs_dirent_unlock(tdl);

		if (sdzp == tdzp)
			rw_exit(&sdzp->z_name_lock);

		VN_RELE(ZTOV(szp));
		if (tzp)
			VN_RELE(ZTOV(tzp));
		if (error == ERESTART) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (tzp)	/* Attempt to remove the existing target */
		error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);

	if (error == 0) {
		/* Create the new name, then remove the old one. */
		error = zfs_link_create(tdl, szp, tx, ZRENAMING);
		if (error == 0) {
			szp->z_pflags |= ZFS_AV_MODIFIED;

			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
			ASSERT3U(error, ==, 0);

			error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
			if (error == 0) {
				zfs_log_rename(zilog, tx, TX_RENAME |
				    (flags & FIGNORECASE ? TX_CI : 0), sdzp,
				    sdl->dl_name, tdzp, tdl->dl_name, szp);

				/*
				 * Update path information for the target vnode
				 */
				vn_renamepath(tdvp, ZTOV(szp), tnm,
				    strlen(tnm));
			} else {
				/*
				 * At this point, we have successfully created
				 * the target name, but have failed to remove
				 * the source name.  Since the create was done
				 * with the ZRENAMING flag, there are
				 * complications; for one, the link count is
				 * wrong.  The easiest way to deal with this
				 * is to remove the newly created target, and
				 * return the original error.  This must
				 * succeed; fortunately, it is very unlikely to
				 * fail, since we just created it.
				 */
				VERIFY3U(zfs_link_destroy(tdl, szp, tx,
				    ZRENAMING, NULL), ==, 0);
			}
		}
	}

	dmu_tx_commit(tx);
	/*
	 * Common exit for both the committed path and the pre-transaction
	 * failures above: drop the rename lock (if taken), both dirent
	 * locks, the shared name lock and the vnode holds.
	 */
out:
	if (zl != NULL)
		zfs_rename_unlock(&zl);

	zfs_dirent_unlock(sdl);
	zfs_dirent_unlock(tdl);

	if (sdzp == tdzp)
		rw_exit(&sdzp->z_name_lock);


	VN_RELE(ZTOV(szp));
	if (tzp)
		VN_RELE(ZTOV(tzp));

	/* With sync=always the intent log is committed before returning. */
	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}
3675
/*
 * Insert the indicated symbolic reference entry into the directory.
 *
 *	IN:	dvp	- Directory to contain new symbolic link.
 *		name	- Name for new symlink entry.
 *		vap	- Attributes of new entry.
 *		link	- Target path of new symlink.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		flags	- case flags
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	dvp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr,
    caller_context_t *ct, int flags)
{
	znode_t		*zp, *dzp = VTOZ(dvp);
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	uint64_t	len = strlen(link);
	int		error;
	int		zflg = ZNEW;
	zfs_acl_ids_t	acl_ids;
	boolean_t	fuid_dirtied;
	uint64_t	txtype = TX_SYMLINK;

	ASSERT(vap->va_type == VLNK);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	/*
	 * When z_utf8 is set, an entry name that fails validation is
	 * rejected with EILSEQ.
	 */
	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (EILSEQ);
	}
	if (flags & FIGNORECASE)
		zflg |= ZCILOOK;

	/* The link target may be at most MAXPATHLEN bytes long. */
	if (len > MAXPATHLEN) {
		ZFS_EXIT(zfsvfs);
		return (ENAMETOOLONG);
	}

	/*
	 * The ACL ids are created once, before the retry label, and are
	 * freed on every exit path below.
	 */
	if ((error = zfs_acl_ids_create(dzp, 0,
	    vap, cr, NULL, &acl_ids)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}
top:
	/*
	 * Attempt to lock directory; fail if entry already exists.
	 */
	error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
	if (error) {
		zfs_acl_ids_free(&acl_ids);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
		zfs_acl_ids_free(&acl_ids);
		zfs_dirent_unlock(dl);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
		zfs_acl_ids_free(&acl_ids);
		zfs_dirent_unlock(dl);
		ZFS_EXIT(zfsvfs);
		return (EDQUOT);
	}
	tx = dmu_tx_create(zfsvfs->z_os);
	fuid_dirtied = zfsvfs->z_fuid_dirty;
	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
	    ZFS_SA_BASE_ATTR_SIZE + len);
	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
		    acl_ids.z_aclp->z_acl_bytes);
	}
	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);
	error = dmu_tx_assign(tx, TXG_NOWAIT);
	if (error) {
		/*
		 * Drop the dirent lock before waiting.  On ERESTART the
		 * ACL ids are kept for the retry; on any other error
		 * they are freed and the error is returned.
		 */
		zfs_dirent_unlock(dl);
		if (error == ERESTART) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		zfs_acl_ids_free(&acl_ids);
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Create a new object for the symlink.
	 * for version 4 ZPL datasets the symlink will be an SA attribute
	 */
	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);

	/*
	 * Store the target path: as the SA symlink attribute when the
	 * znode is SA-based, otherwise through zfs_sa_symlink().
	 */
	mutex_enter(&zp->z_lock);
	if (zp->z_is_sa)
		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
		    link, len, tx);
	else
		zfs_sa_symlink(zp, link, len, tx);
	mutex_exit(&zp->z_lock);

	/* The size of the symlink is the length of its target path. */
	zp->z_size = len;
	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
	    &zp->z_size, sizeof (zp->z_size), tx);
	/*
	 * Insert the new object into the directory.
	 */
	(void) zfs_link_create(dl, zp, tx, ZNEW);

	if (flags & FIGNORECASE)
		txtype |= TX_CI;
	zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);

	zfs_acl_ids_free(&acl_ids);

	dmu_tx_commit(tx);

	zfs_dirent_unlock(dl);

	VN_RELE(ZTOV(zp));

	/* With sync=always the intent log is committed before returning. */
	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}
3828
3829/*
3830 * Return, in the buffer contained in the provided uio structure,
3831 * the symbolic path referred to by vp.
3832 *
3833 *	IN:	vp	- vnode of symbolic link.
3834 *		uoip	- structure to contain the link path.
3835 *		cr	- credentials of caller.
3836 *		ct	- caller context
3837 *
3838 *	OUT:	uio	- structure to contain the link path.
3839 *
3840 *	RETURN:	0 if success
3841 *		error code if failure
3842 *
3843 * Timestamps:
3844 *	vp - atime updated
3845 */
3846/* ARGSUSED */
3847static int
3848zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
3849{
3850	znode_t		*zp = VTOZ(vp);
3851	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
3852	int		error;
3853
3854	ZFS_ENTER(zfsvfs);
3855	ZFS_VERIFY_ZP(zp);
3856
3857	mutex_enter(&zp->z_lock);
3858	if (zp->z_is_sa)
3859		error = sa_lookup_uio(zp->z_sa_hdl,
3860		    SA_ZPL_SYMLINK(zfsvfs), uio);
3861	else
3862		error = zfs_sa_readlink(zp, uio);
3863	mutex_exit(&zp->z_lock);
3864
3865	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
3866
3867	ZFS_EXIT(zfsvfs);
3868	return (error);
3869}
3870
/*
 * Insert a new entry into directory tdvp referencing svp.
 *
 *	IN:	tdvp	- Directory to contain new entry.
 *		svp	- vnode of new entry.
 *		name	- name of new entry.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		flags	- case flags (FIGNORECASE is honored)
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	tdvp - ctime|mtime updated
 *	 svp - ctime updated
 */
/* ARGSUSED */
static int
zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
    caller_context_t *ct, int flags)
{
	znode_t		*dzp = VTOZ(tdvp);
	znode_t		*tzp, *szp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	vnode_t		*realvp;
	int		error;
	int		zf = ZNEW;
	uint64_t	parent;
	uid_t		owner;

	ASSERT(tdvp->v_type == VDIR);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	/* Operate on the real vnode behind svp when one is reported. */
	if (VOP_REALVP(svp, &realvp, ct) == 0)
		svp = realvp;

	/*
	 * POSIX dictates that we return EPERM here.
	 * Better choices include ENOTSUP or EISDIR.
	 */
	if (svp->v_type == VDIR) {
		ZFS_EXIT(zfsvfs);
		return (EPERM);
	}

	/*
	 * The source must live on the same vfs as the target directory
	 * and may not be a zfsctl node; otherwise EXDEV.
	 */
	if (svp->v_vfsp != tdvp->v_vfsp || zfsctl_is_node(svp)) {
		ZFS_EXIT(zfsvfs);
		return (EXDEV);
	}

	szp = VTOZ(svp);
	ZFS_VERIFY_ZP(szp);

	/* Prevent links to .zfs/shares files */

	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
	    &parent, sizeof (uint64_t))) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}
	if (parent == zfsvfs->z_shares_dir) {
		ZFS_EXIT(zfsvfs);
		return (EPERM);
	}

	/*
	 * When z_utf8 is set, a name that fails validation is rejected
	 * with EILSEQ.
	 */
	if (zfsvfs->z_utf8 && u8_validate(name,
	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (EILSEQ);
	}
	if (flags & FIGNORECASE)
		zf |= ZCILOOK;

	/*
	 * We do not support links between attributes and non-attributes
	 * because of the potential security risk of creating links
	 * into "normal" file space in order to circumvent restrictions
	 * imposed in attribute space.
	 */
	if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
		ZFS_EXIT(zfsvfs);
		return (EINVAL);
	}


	/*
	 * A caller that is not the owner of the source file must pass
	 * the secpolicy_basic_link() check.
	 */
	owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
	if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
		ZFS_EXIT(zfsvfs);
		return (EPERM);
	}

	if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * Retry point: we jump back here when dmu_tx_assign() reports
	 * ERESTART, after the dirent lock has been dropped.
	 */
top:
	/*
	 * Attempt to lock directory; fail if entry already exists.
	 */
	error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
	if (error) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
	zfs_sa_upgrade_txholds(tx, szp);
	zfs_sa_upgrade_txholds(tx, dzp);
	error = dmu_tx_assign(tx, TXG_NOWAIT);
	if (error) {
		zfs_dirent_unlock(dl);
		if (error == ERESTART) {
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	error = zfs_link_create(dl, szp, tx, 0);

	/* Only a successfully created link is logged. */
	if (error == 0) {
		uint64_t txtype = TX_LINK;
		if (flags & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_link(zilog, tx, txtype, dzp, szp, name);
	}

	/* The transaction is committed whether or not the link succeeded. */
	dmu_tx_commit(tx);

	zfs_dirent_unlock(dl);

	if (error == 0) {
		vnevent_link(svp, ct);
	}

	/* With sync=always the intent log is committed before returning. */
	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}
4024
4025/*
4026 * zfs_null_putapage() is used when the file system has been force
4027 * unmounted. It just drops the pages.
4028 */
4029/* ARGSUSED */
4030static int
4031zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
4032		size_t *lenp, int flags, cred_t *cr)
4033{
4034	pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR);
4035	return (0);
4036}
4037
4038/*
4039 * Push a page out to disk, klustering if possible.
4040 *
4041 *	IN:	vp	- file to push page to.
4042 *		pp	- page to push.
4043 *		flags	- additional flags.
4044 *		cr	- credentials of caller.
4045 *
4046 *	OUT:	offp	- start of range pushed.
4047 *		lenp	- len of range pushed.
4048 *
4049 *	RETURN:	0 if success
4050 *		error code if failure
4051 *
4052 * NOTE: callers must have locked the page to be pushed.  On
4053 * exit, the page (and all other pages in the kluster) must be
4054 * unlocked.
4055 */
4056/* ARGSUSED */
4057static int
4058zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
4059		size_t *lenp, int flags, cred_t *cr)
4060{
4061	znode_t		*zp = VTOZ(vp);
4062	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4063	dmu_tx_t	*tx;
4064	u_offset_t	off, koff;
4065	size_t		len, klen;
4066	int		err;
4067
4068	off = pp->p_offset;
4069	len = PAGESIZE;
4070	/*
4071	 * If our blocksize is bigger than the page size, try to kluster
4072	 * multiple pages so that we write a full block (thus avoiding
4073	 * a read-modify-write).
4074	 */
4075	if (off < zp->z_size && zp->z_blksz > PAGESIZE) {
4076		klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
4077		koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0;
4078		ASSERT(koff <= zp->z_size);
4079		if (koff + klen > zp->z_size)
4080			klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE);
4081		pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
4082	}
4083	ASSERT3U(btop(len), ==, btopr(len));
4084
4085	/*
4086	 * Can't push pages past end-of-file.
4087	 */
4088	if (off >= zp->z_size) {
4089		/* ignore all pages */
4090		err = 0;
4091		goto out;
4092	} else if (off + len > zp->z_size) {
4093		int npages = btopr(zp->z_size - off);
4094		page_t *trunc;
4095
4096		page_list_break(&pp, &trunc, npages);
4097		/* ignore pages past end of file */
4098		if (trunc)
4099			pvn_write_done(trunc, flags);
4100		len = zp->z_size - off;
4101	}
4102
4103	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
4104	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
4105		err = EDQUOT;
4106		goto out;
4107	}
4108top:
4109	tx = dmu_tx_create(zfsvfs->z_os);
4110	dmu_tx_hold_write(tx, zp->z_id, off, len);
4111
4112	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4113	zfs_sa_upgrade_txholds(tx, zp);
4114	err = dmu_tx_assign(tx, TXG_NOWAIT);
4115	if (err != 0) {
4116		if (err == ERESTART) {
4117			dmu_tx_wait(tx);
4118			dmu_tx_abort(tx);
4119			goto top;
4120		}
4121		dmu_tx_abort(tx);
4122		goto out;
4123	}
4124
4125	if (zp->z_blksz <= PAGESIZE) {
4126		caddr_t va = zfs_map_page(pp, S_READ);
4127		ASSERT3U(len, <=, PAGESIZE);
4128		dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
4129		zfs_unmap_page(pp, va);
4130	} else {
4131		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
4132	}
4133
4134	if (err == 0) {
4135		uint64_t mtime[2], ctime[2];
4136		sa_bulk_attr_t bulk[3];
4137		int count = 0;
4138
4139		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
4140		    &mtime, 16);
4141		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
4142		    &ctime, 16);
4143		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
4144		    &zp->z_pflags, 8);
4145		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
4146		    B_TRUE);
4147		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
4148	}
4149	dmu_tx_commit(tx);
4150
4151out:
4152	pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
4153	if (offp)
4154		*offp = off;
4155	if (lenp)
4156		*lenp = len;
4157
4158	return (err);
4159}
4160
/*
 * Copy the portion of the file indicated from pages into the file.
 * The pages are stored in a page list attached to the files vnode.
 *
 *	IN:	vp	- vnode of file to push page data to.
 *		off	- position in file to put data.
 *		len	- amount of data to write.
 *		flags	- flags to control the operation.
 *		cr	- credentials of caller.
 *		ct	- caller context.
 *
 *	RETURN:	0 if success
 *		error code if failure
 *
 * Timestamps:
 *	vp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
    caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	page_t		*pp;
	size_t		io_len;
	u_offset_t	io_off;
	uint_t		blksz;
	rl_t		*rl;
	int		error = 0;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/*
	 * Align this request to the file block size in case we kluster.
	 * XXX - this can result in pretty aggressive locking, which can
	 * impact simultaneous read/write access.  One option might be
	 * to break up long requests (len == 0) into block-by-block
	 * operations to get narrower locking.
	 */
	blksz = zp->z_blksz;
	if (ISP2(blksz))
		io_off = P2ALIGN_TYPED(off, blksz, u_offset_t);
	else
		io_off = 0;
	/*
	 * io_len stays 0 both for a len == 0 request and whenever the
	 * block size is not a power of two; either way the whole page
	 * list from io_off onward is searched below.
	 */
	if (len > 0 && ISP2(blksz))
		io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
	else
		io_len = 0;

	if (io_len == 0) {
		/*
		 * Search the entire vp list for pages >= io_off.
		 */
		rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER);
		error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
		goto out;
	}
	rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER);

	if (off > zp->z_size) {
		/* past end of file */
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	/* Clamp the range to the page-rounded end of the file. */
	len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off);

	/*
	 * Walk the range.  zfs_putapage() rewrites io_off and io_len
	 * with the range it pushed, so the loop advances past whatever
	 * was klustered; when no dirty page is found we step one page.
	 */
	for (off = io_off; io_off < off + len; io_off += io_len) {
		if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
			pp = page_lookup(vp, io_off,
			    (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
		} else {
			pp = page_lookup_nowait(vp, io_off,
			    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
		}

		if (pp != NULL && pvn_getdirty(pp, flags)) {
			int err;

			/*
			 * Found a dirty page to push
			 */
			err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr);
			/* Remember the failure but keep pushing. */
			if (err)
				error = err;
		} else {
			io_len = PAGESIZE;
		}
	}
out:
	zfs_range_unlock(rl);
	/*
	 * Commit the intent log for this object unless the request was
	 * asynchronous and the dataset is not sync=always.
	 */
	if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zfsvfs->z_log, zp->z_id);
	ZFS_EXIT(zfsvfs);
	return (error);
}
4260
4261/*ARGSUSED*/
4262void
4263zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4264{
4265	znode_t	*zp = VTOZ(vp);
4266	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4267	int error;
4268
4269	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4270	if (zp->z_sa_hdl == NULL) {
4271		/*
4272		 * The fs has been unmounted, or we did a
4273		 * suspend/resume and this file no longer exists.
4274		 */
4275		if (vn_has_cached_data(vp)) {
4276			(void) pvn_vplist_dirty(vp, 0, zfs_null_putapage,
4277			    B_INVAL, cr);
4278		}
4279
4280		mutex_enter(&zp->z_lock);
4281		mutex_enter(&vp->v_lock);
4282		ASSERT(vp->v_count == 1);
4283		vp->v_count = 0;
4284		mutex_exit(&vp->v_lock);
4285		mutex_exit(&zp->z_lock);
4286		rw_exit(&zfsvfs->z_teardown_inactive_lock);
4287		zfs_znode_free(zp);
4288		return;
4289	}
4290
4291	/*
4292	 * Attempt to push any data in the page cache.  If this fails
4293	 * we will get kicked out later in zfs_zinactive().
4294	 */
4295	if (vn_has_cached_data(vp)) {
4296		(void) pvn_vplist_dirty(vp, 0, zfs_putapage, B_INVAL|B_ASYNC,
4297		    cr);
4298	}
4299
4300	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
4301		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
4302
4303		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4304		zfs_sa_upgrade_txholds(tx, zp);
4305		error = dmu_tx_assign(tx, TXG_WAIT);
4306		if (error) {
4307			dmu_tx_abort(tx);
4308		} else {
4309			mutex_enter(&zp->z_lock);
4310			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
4311			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
4312			zp->z_atime_dirty = 0;
4313			mutex_exit(&zp->z_lock);
4314			dmu_tx_commit(tx);
4315		}
4316	}
4317
4318	zfs_zinactive(zp);
4319	rw_exit(&zfsvfs->z_teardown_inactive_lock);
4320}
4321
4322/*
4323 * Bounds-check the seek operation.
4324 *
4325 *	IN:	vp	- vnode seeking within
4326 *		ooff	- old file offset
4327 *		noffp	- pointer to new file offset
4328 *		ct	- caller context
4329 *
4330 *	RETURN:	0 if success
4331 *		EINVAL if new offset invalid
4332 */
4333/* ARGSUSED */
4334static int
4335zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
4336    caller_context_t *ct)
4337{
4338	if (vp->v_type == VDIR)
4339		return (0);
4340	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
4341}
4342
4343/*
4344 * Pre-filter the generic locking function to trap attempts to place
4345 * a mandatory lock on a memory mapped file.
4346 */
4347static int
4348zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
4349    flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct)
4350{
4351	znode_t *zp = VTOZ(vp);
4352	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4353
4354	ZFS_ENTER(zfsvfs);
4355	ZFS_VERIFY_ZP(zp);
4356
4357	/*
4358	 * We are following the UFS semantics with respect to mapcnt
4359	 * here: If we see that the file is mapped already, then we will
4360	 * return an error, but we don't worry about races between this
4361	 * function and zfs_map().
4362	 */
4363	if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) {
4364		ZFS_EXIT(zfsvfs);
4365		return (EAGAIN);
4366	}
4367	ZFS_EXIT(zfsvfs);
4368	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
4369}
4370
4371/*
4372 * If we can't find a page in the cache, we will create a new page
4373 * and fill it with file data.  For efficiency, we may try to fill
4374 * multiple pages at once (klustering) to fill up the supplied page
4375 * list.  Note that the pages to be filled are held with an exclusive
4376 * lock to prevent access by other threads while they are being filled.
4377 */
4378static int
4379zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
4380    caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
4381{
4382	znode_t *zp = VTOZ(vp);
4383	page_t *pp, *cur_pp;
4384	objset_t *os = zp->z_zfsvfs->z_os;
4385	u_offset_t io_off, total;
4386	size_t io_len;
4387	int err;
4388
4389	if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
4390		/*
4391		 * We only have a single page, don't bother klustering
4392		 */
4393		io_off = off;
4394		io_len = PAGESIZE;
4395		pp = page_create_va(vp, io_off, io_len,
4396		    PG_EXCL | PG_WAIT, seg, addr);
4397	} else {
4398		/*
4399		 * Try to find enough pages to fill the page list
4400		 */
4401		pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
4402		    &io_len, off, plsz, 0);
4403	}
4404	if (pp == NULL) {
4405		/*
4406		 * The page already exists, nothing to do here.
4407		 */
4408		*pl = NULL;
4409		return (0);
4410	}
4411
4412	/*
4413	 * Fill the pages in the kluster.
4414	 */
4415	cur_pp = pp;
4416	for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
4417		caddr_t va;
4418
4419		ASSERT3U(io_off, ==, cur_pp->p_offset);
4420		va = zfs_map_page(cur_pp, S_WRITE);
4421		err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
4422		    DMU_READ_PREFETCH);
4423		zfs_unmap_page(cur_pp, va);
4424		if (err) {
4425			/* On error, toss the entire kluster */
4426			pvn_read_done(pp, B_ERROR);
4427			/* convert checksum errors into IO errors */
4428			if (err == ECKSUM)
4429				err = EIO;
4430			return (err);
4431		}
4432		cur_pp = cur_pp->p_next;
4433	}
4434
4435	/*
4436	 * Fill in the page list array from the kluster starting
4437	 * from the desired offset `off'.
4438	 * NOTE: the page list will always be null terminated.
4439	 */
4440	pvn_plist_init(pp, pl, plsz, off, io_len, rw);
4441	ASSERT(pl == NULL || (*pl)->p_offset == off);
4442
4443	return (0);
4444}
4445
4446/*
4447 * Return pointers to the pages for the file region [off, off + len]
4448 * in the pl array.  If plsz is greater than len, this function may
4449 * also return page pointers from after the specified region
4450 * (i.e. the region [off, off + plsz]).  These additional pages are
4451 * only returned if they are already in the cache, or were created as
4452 * part of a klustered read.
4453 *
4454 *	IN:	vp	- vnode of file to get data from.
4455 *		off	- position in file to get data from.
4456 *		len	- amount of data to retrieve.
4457 *		plsz	- length of provided page list.
4458 *		seg	- segment to obtain pages for.
4459 *		addr	- virtual address of fault.
4460 *		rw	- mode of created pages.
4461 *		cr	- credentials of caller.
4462 *		ct	- caller context.
4463 *
4464 *	OUT:	protp	- protection mode of created pages.
4465 *		pl	- list of pages created.
4466 *
4467 *	RETURN:	0 if success
4468 *		error code if failure
4469 *
4470 * Timestamps:
4471 *	vp - atime updated
4472 */
4473/* ARGSUSED */
4474static int
4475zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
4476	page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
4477	enum seg_rw rw, cred_t *cr, caller_context_t *ct)
4478{
4479	znode_t		*zp = VTOZ(vp);
4480	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4481	page_t		**pl0 = pl;
4482	int		err = 0;
4483
4484	/* we do our own caching, faultahead is unnecessary */
4485	if (pl == NULL)
4486		return (0);
4487	else if (len > plsz)
4488		len = plsz;
4489	else
4490		len = P2ROUNDUP(len, PAGESIZE);
4491	ASSERT(plsz >= len);
4492
4493	ZFS_ENTER(zfsvfs);
4494	ZFS_VERIFY_ZP(zp);
4495
4496	if (protp)
4497		*protp = PROT_ALL;
4498
4499	/*
4500	 * Loop through the requested range [off, off + len) looking
4501	 * for pages.  If we don't find a page, we will need to create
4502	 * a new page and fill it with data from the file.
4503	 */
4504	while (len > 0) {
4505		if (*pl = page_lookup(vp, off, SE_SHARED))
4506			*(pl+1) = NULL;
4507		else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw))
4508			goto out;
4509		while (*pl) {
4510			ASSERT3U((*pl)->p_offset, ==, off);
4511			off += PAGESIZE;
4512			addr += PAGESIZE;
4513			if (len > 0) {
4514				ASSERT3U(len, >=, PAGESIZE);
4515				len -= PAGESIZE;
4516			}
4517			ASSERT3U(plsz, >=, PAGESIZE);
4518			plsz -= PAGESIZE;
4519			pl++;
4520		}
4521	}
4522
4523	/*
4524	 * Fill out the page array with any pages already in the cache.
4525	 */
4526	while (plsz > 0 &&
4527	    (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) {
4528			off += PAGESIZE;
4529			plsz -= PAGESIZE;
4530	}
4531out:
4532	if (err) {
4533		/*
4534		 * Release any pages we have previously locked.
4535		 */
4536		while (pl > pl0)
4537			page_unlock(*--pl);
4538	} else {
4539		ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4540	}
4541
4542	*pl = NULL;
4543
4544	ZFS_EXIT(zfsvfs);
4545	return (err);
4546}
4547
4548/*
4549 * Request a memory map for a section of a file.  This code interacts
4550 * with common code and the VM system as follows:
4551 *
4552 *	common code calls mmap(), which ends up in smmap_common()
4553 *
4554 *	this calls VOP_MAP(), which takes you into (say) zfs
4555 *
4556 *	zfs_map() calls as_map(), passing segvn_create() as the callback
4557 *
4558 *	segvn_create() creates the new segment and calls VOP_ADDMAP()
4559 *
4560 *	zfs_addmap() updates z_mapcnt
4561 */
4562/*ARGSUSED*/
4563static int
4564zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
4565    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4566    caller_context_t *ct)
4567{
4568	znode_t *zp = VTOZ(vp);
4569	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4570	segvn_crargs_t	vn_a;
4571	int		error;
4572
4573	ZFS_ENTER(zfsvfs);
4574	ZFS_VERIFY_ZP(zp);
4575
4576	if ((prot & PROT_WRITE) && (zp->z_pflags &
4577	    (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
4578		ZFS_EXIT(zfsvfs);
4579		return (EPERM);
4580	}
4581
4582	if ((prot & (PROT_READ | PROT_EXEC)) &&
4583	    (zp->z_pflags & ZFS_AV_QUARANTINED)) {
4584		ZFS_EXIT(zfsvfs);
4585		return (EACCES);
4586	}
4587
4588	if (vp->v_flag & VNOMAP) {
4589		ZFS_EXIT(zfsvfs);
4590		return (ENOSYS);
4591	}
4592
4593	if (off < 0 || len > MAXOFFSET_T - off) {
4594		ZFS_EXIT(zfsvfs);
4595		return (ENXIO);
4596	}
4597
4598	if (vp->v_type != VREG) {
4599		ZFS_EXIT(zfsvfs);
4600		return (ENODEV);
4601	}
4602
4603	/*
4604	 * If file is locked, disallow mapping.
4605	 */
4606	if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) {
4607		ZFS_EXIT(zfsvfs);
4608		return (EAGAIN);
4609	}
4610
4611	as_rangelock(as);
4612	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
4613	if (error != 0) {
4614		as_rangeunlock(as);
4615		ZFS_EXIT(zfsvfs);
4616		return (error);
4617	}
4618
4619	vn_a.vp = vp;
4620	vn_a.offset = (u_offset_t)off;
4621	vn_a.type = flags & MAP_TYPE;
4622	vn_a.prot = prot;
4623	vn_a.maxprot = maxprot;
4624	vn_a.cred = cr;
4625	vn_a.amp = NULL;
4626	vn_a.flags = flags & ~MAP_TYPE;
4627	vn_a.szc = 0;
4628	vn_a.lgrp_mem_policy_flags = 0;
4629
4630	error = as_map(as, *addrp, len, segvn_create, &vn_a);
4631
4632	as_rangeunlock(as);
4633	ZFS_EXIT(zfsvfs);
4634	return (error);
4635}
4636
4637/* ARGSUSED */
4638static int
4639zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4640    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4641    caller_context_t *ct)
4642{
4643	uint64_t pages = btopr(len);
4644
4645	atomic_add_64(&VTOZ(vp)->z_mapcnt, pages);
4646	return (0);
4647}
4648
4649/*
4650 * The reason we push dirty pages as part of zfs_delmap() is so that we get a
4651 * more accurate mtime for the associated file.  Since we don't have a way of
4652 * detecting when the data was actually modified, we have to resort to
4653 * heuristics.  If an explicit msync() is done, then we mark the mtime when the
4654 * last page is pushed.  The problem occurs when the msync() call is omitted,
4655 * which by far the most common case:
4656 *
4657 * 	open()
4658 * 	mmap()
4659 * 	<modify memory>
4660 * 	munmap()
4661 * 	close()
4662 * 	<time lapse>
4663 * 	putpage() via fsflush
4664 *
4665 * If we wait until fsflush to come along, we can have a modification time that
4666 * is some arbitrary point in the future.  In order to prevent this in the
4667 * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is
4668 * torn down.
4669 */
4670/* ARGSUSED */
4671static int
4672zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4673    size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
4674    caller_context_t *ct)
4675{
4676	uint64_t pages = btopr(len);
4677
4678	ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages);
4679	atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages);
4680
4681	if ((flags & MAP_SHARED) && (prot & PROT_WRITE) &&
4682	    vn_has_cached_data(vp))
4683		(void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct);
4684
4685	return (0);
4686}
4687
4688/*
4689 * Free or allocate space in a file.  Currently, this function only
4690 * supports the `F_FREESP' command.  However, this command is somewhat
4691 * misnamed, as its functionality includes the ability to allocate as
4692 * well as free space.
4693 *
4694 *	IN:	vp	- vnode of file to free data in.
4695 *		cmd	- action to take (only F_FREESP supported).
4696 *		bfp	- section of file to free/alloc.
4697 *		flag	- current file open mode flags.
4698 *		offset	- current file offset.
4699 *		cr	- credentials of caller [UNUSED].
4700 *		ct	- caller context.
4701 *
4702 *	RETURN:	0 if success
4703 *		error code if failure
4704 *
4705 * Timestamps:
4706 *	vp - ctime|mtime updated
4707 */
4708/* ARGSUSED */
4709static int
4710zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag,
4711    offset_t offset, cred_t *cr, caller_context_t *ct)
4712{
4713	znode_t		*zp = VTOZ(vp);
4714	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4715	uint64_t	off, len;
4716	int		error;
4717
4718	ZFS_ENTER(zfsvfs);
4719	ZFS_VERIFY_ZP(zp);
4720
4721	if (cmd != F_FREESP) {
4722		ZFS_EXIT(zfsvfs);
4723		return (EINVAL);
4724	}
4725
4726	if (error = convoff(vp, bfp, 0, offset)) {
4727		ZFS_EXIT(zfsvfs);
4728		return (error);
4729	}
4730
4731	if (bfp->l_len < 0) {
4732		ZFS_EXIT(zfsvfs);
4733		return (EINVAL);
4734	}
4735
4736	off = bfp->l_start;
4737	len = bfp->l_len; /* 0 means from off to end of file */
4738
4739	error = zfs_freesp(zp, off, len, flag, TRUE);
4740
4741	ZFS_EXIT(zfsvfs);
4742	return (error);
4743}
4744
4745/*ARGSUSED*/
4746static int
4747zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
4748{
4749	znode_t		*zp = VTOZ(vp);
4750	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
4751	uint32_t	gen;
4752	uint64_t	gen64;
4753	uint64_t	object = zp->z_id;
4754	zfid_short_t	*zfid;
4755	int		size, i, error;
4756
4757	ZFS_ENTER(zfsvfs);
4758	ZFS_VERIFY_ZP(zp);
4759
4760	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
4761	    &gen64, sizeof (uint64_t))) != 0) {
4762		ZFS_EXIT(zfsvfs);
4763		return (error);
4764	}
4765
4766	gen = (uint32_t)gen64;
4767
4768	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
4769	if (fidp->fid_len < size) {
4770		fidp->fid_len = size;
4771		ZFS_EXIT(zfsvfs);
4772		return (ENOSPC);
4773	}
4774
4775	zfid = (zfid_short_t *)fidp;
4776
4777	zfid->zf_len = size;
4778
4779	for (i = 0; i < sizeof (zfid->zf_object); i++)
4780		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
4781
4782	/* Must have a non-zero generation number to distinguish from .zfs */
4783	if (gen == 0)
4784		gen = 1;
4785	for (i = 0; i < sizeof (zfid->zf_gen); i++)
4786		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
4787
4788	if (size == LONG_FID_LEN) {
4789		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
4790		zfid_long_t	*zlfid;
4791
4792		zlfid = (zfid_long_t *)fidp;
4793
4794		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
4795			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
4796
4797		/* XXX - this should be the generation number for the objset */
4798		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
4799			zlfid->zf_setgen[i] = 0;
4800	}
4801
4802	ZFS_EXIT(zfsvfs);
4803	return (0);
4804}
4805
4806static int
4807zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
4808    caller_context_t *ct)
4809{
4810	znode_t		*zp, *xzp;
4811	zfsvfs_t	*zfsvfs;
4812	zfs_dirlock_t	*dl;
4813	int		error;
4814
4815	switch (cmd) {
4816	case _PC_LINK_MAX:
4817		*valp = ULONG_MAX;
4818		return (0);
4819
4820	case _PC_FILESIZEBITS:
4821		*valp = 64;
4822		return (0);
4823
4824	case _PC_XATTR_EXISTS:
4825		zp = VTOZ(vp);
4826		zfsvfs = zp->z_zfsvfs;
4827		ZFS_ENTER(zfsvfs);
4828		ZFS_VERIFY_ZP(zp);
4829		*valp = 0;
4830		error = zfs_dirent_lock(&dl, zp, "", &xzp,
4831		    ZXATTR | ZEXISTS | ZSHARED, NULL, NULL);
4832		if (error == 0) {
4833			zfs_dirent_unlock(dl);
4834			if (!zfs_dirempty(xzp))
4835				*valp = 1;
4836			VN_RELE(ZTOV(xzp));
4837		} else if (error == ENOENT) {
4838			/*
4839			 * If there aren't extended attributes, it's the
4840			 * same as having zero of them.
4841			 */
4842			error = 0;
4843		}
4844		ZFS_EXIT(zfsvfs);
4845		return (error);
4846
4847	case _PC_SATTR_ENABLED:
4848	case _PC_SATTR_EXISTS:
4849		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
4850		    (vp->v_type == VREG || vp->v_type == VDIR);
4851		return (0);
4852
4853	case _PC_ACCESS_FILTERING:
4854		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
4855		    vp->v_type == VDIR;
4856		return (0);
4857
4858	case _PC_ACL_ENABLED:
4859		*valp = _ACL_ACE_ENABLED;
4860		return (0);
4861
4862	case _PC_MIN_HOLE_SIZE:
4863		*valp = (ulong_t)SPA_MINBLOCKSIZE;
4864		return (0);
4865
4866	case _PC_TIMESTAMP_RESOLUTION:
4867		/* nanosecond timestamp resolution */
4868		*valp = 1L;
4869		return (0);
4870
4871	default:
4872		return (fs_pathconf(vp, cmd, valp, cr, ct));
4873	}
4874}
4875
4876/*ARGSUSED*/
4877static int
4878zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4879    caller_context_t *ct)
4880{
4881	znode_t *zp = VTOZ(vp);
4882	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4883	int error;
4884	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4885
4886	ZFS_ENTER(zfsvfs);
4887	ZFS_VERIFY_ZP(zp);
4888	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
4889	ZFS_EXIT(zfsvfs);
4890
4891	return (error);
4892}
4893
4894/*ARGSUSED*/
4895static int
4896zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
4897    caller_context_t *ct)
4898{
4899	znode_t *zp = VTOZ(vp);
4900	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4901	int error;
4902	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
4903	zilog_t	*zilog = zfsvfs->z_log;
4904
4905	ZFS_ENTER(zfsvfs);
4906	ZFS_VERIFY_ZP(zp);
4907
4908	error = zfs_setacl(zp, vsecp, skipaclchk, cr);
4909
4910	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4911		zil_commit(zilog, 0);
4912
4913	ZFS_EXIT(zfsvfs);
4914	return (error);
4915}
4916
/*
 * Tunables for zero-copy reads.  Both values must be a power of 2.
 *
 * zcr_blksz_min: the smallest read for which loaning out an arcbuf may be
 *                considered
 * zcr_blksz_max: if set to less than the file block size, allows loaning
 *                out an arcbuf for a partial block read
 */
int zcr_blksz_min = 1024;	/* 1K   (1 << 10) */
int zcr_blksz_max = 131072;	/* 128K (1 << 17) */
4926
4927/*ARGSUSED*/
4928static int
4929zfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr,
4930    caller_context_t *ct)
4931{
4932	znode_t	*zp = VTOZ(vp);
4933	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4934	int max_blksz = zfsvfs->z_max_blksz;
4935	uio_t *uio = &xuio->xu_uio;
4936	ssize_t size = uio->uio_resid;
4937	offset_t offset = uio->uio_loffset;
4938	int blksz;
4939	int fullblk, i;
4940	arc_buf_t *abuf;
4941	ssize_t maxsize;
4942	int preamble, postamble;
4943
4944	if (xuio->xu_type != UIOTYPE_ZEROCOPY)
4945		return (EINVAL);
4946
4947	ZFS_ENTER(zfsvfs);
4948	ZFS_VERIFY_ZP(zp);
4949	switch (ioflag) {
4950	case UIO_WRITE:
4951		/*
4952		 * Loan out an arc_buf for write if write size is bigger than
4953		 * max_blksz, and the file's block size is also max_blksz.
4954		 */
4955		blksz = max_blksz;
4956		if (size < blksz || zp->z_blksz != blksz) {
4957			ZFS_EXIT(zfsvfs);
4958			return (EINVAL);
4959		}
4960		/*
4961		 * Caller requests buffers for write before knowing where the
4962		 * write offset might be (e.g. NFS TCP write).
4963		 */
4964		if (offset == -1) {
4965			preamble = 0;
4966		} else {
4967			preamble = P2PHASE(offset, blksz);
4968			if (preamble) {
4969				preamble = blksz - preamble;
4970				size -= preamble;
4971			}
4972		}
4973
4974		postamble = P2PHASE(size, blksz);
4975		size -= postamble;
4976
4977		fullblk = size / blksz;
4978		(void) dmu_xuio_init(xuio,
4979		    (preamble != 0) + fullblk + (postamble != 0));
4980		DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble,
4981		    int, postamble, int,
4982		    (preamble != 0) + fullblk + (postamble != 0));
4983
4984		/*
4985		 * Have to fix iov base/len for partial buffers.  They
4986		 * currently represent full arc_buf's.
4987		 */
4988		if (preamble) {
4989			/* data begins in the middle of the arc_buf */
4990			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
4991			    blksz);
4992			ASSERT(abuf);
4993			(void) dmu_xuio_add(xuio, abuf,
4994			    blksz - preamble, preamble);
4995		}
4996
4997		for (i = 0; i < fullblk; i++) {
4998			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
4999			    blksz);
5000			ASSERT(abuf);
5001			(void) dmu_xuio_add(xuio, abuf, 0, blksz);
5002		}
5003
5004		if (postamble) {
5005			/* data ends in the middle of the arc_buf */
5006			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5007			    blksz);
5008			ASSERT(abuf);
5009			(void) dmu_xuio_add(xuio, abuf, 0, postamble);
5010		}
5011		break;
5012	case UIO_READ:
5013		/*
5014		 * Loan out an arc_buf for read if the read size is larger than
5015		 * the current file block size.  Block alignment is not
5016		 * considered.  Partial arc_buf will be loaned out for read.
5017		 */
5018		blksz = zp->z_blksz;
5019		if (blksz < zcr_blksz_min)
5020			blksz = zcr_blksz_min;
5021		if (blksz > zcr_blksz_max)
5022			blksz = zcr_blksz_max;
5023		/* avoid potential complexity of dealing with it */
5024		if (blksz > max_blksz) {
5025			ZFS_EXIT(zfsvfs);
5026			return (EINVAL);
5027		}
5028
5029		maxsize = zp->z_size - uio->uio_loffset;
5030		if (size > maxsize)
5031			size = maxsize;
5032
5033		if (size < blksz || vn_has_cached_data(vp)) {
5034			ZFS_EXIT(zfsvfs);
5035			return (EINVAL);
5036		}
5037		break;
5038	default:
5039		ZFS_EXIT(zfsvfs);
5040		return (EINVAL);
5041	}
5042
5043	uio->uio_extflg = UIO_XUIO;
5044	XUIO_XUZC_RW(xuio) = ioflag;
5045	ZFS_EXIT(zfsvfs);
5046	return (0);
5047}
5048
5049/*ARGSUSED*/
5050static int
5051zfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct)
5052{
5053	int i;
5054	arc_buf_t *abuf;
5055	int ioflag = XUIO_XUZC_RW(xuio);
5056
5057	ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
5058
5059	i = dmu_xuio_cnt(xuio);
5060	while (i-- > 0) {
5061		abuf = dmu_xuio_arcbuf(xuio, i);
5062		/*
5063		 * if abuf == NULL, it must be a write buffer
5064		 * that has been returned in zfs_write().
5065		 */
5066		if (abuf)
5067			dmu_return_arcbuf(abuf);
5068		ASSERT(abuf || ioflag == UIO_WRITE);
5069	}
5070
5071	dmu_xuio_fini(xuio);
5072	return (0);
5073}
5074
/*
 * These are predeclared without a parameter list on purpose: the compiler
 * then treats them as "old style" function declarations that carry no
 * argument information, so the vnode operation templates that follow can
 * install them in any slot without type mismatch errors.
 */
static int zfs_inval(), zfs_isdir();

/*
 * Stand-in for an operation that is not valid: always fails with EINVAL.
 */
static int
zfs_inval()
{
	return (EINVAL);
}

/*
 * Stand-in for an operation refused on a directory: always fails with
 * EISDIR.
 */
static int
zfs_isdir()
{
	return (EISDIR);
}
5095/*
5096 * Directory vnode operations template
5097 */
5098vnodeops_t *zfs_dvnodeops;
5099const fs_operation_def_t zfs_dvnodeops_template[] = {
5100	VOPNAME_OPEN,		{ .vop_open = zfs_open },
5101	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
5102	VOPNAME_READ,		{ .error = zfs_isdir },
5103	VOPNAME_WRITE,		{ .error = zfs_isdir },
5104	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
5105	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5106	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
5107	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5108	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
5109	VOPNAME_CREATE,		{ .vop_create = zfs_create },
5110	VOPNAME_REMOVE,		{ .vop_remove = zfs_remove },
5111	VOPNAME_LINK,		{ .vop_link = zfs_link },
5112	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
5113	VOPNAME_MKDIR,		{ .vop_mkdir = zfs_mkdir },
5114	VOPNAME_RMDIR,		{ .vop_rmdir = zfs_rmdir },
5115	VOPNAME_READDIR,	{ .vop_readdir = zfs_readdir },
5116	VOPNAME_SYMLINK,	{ .vop_symlink = zfs_symlink },
5117	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
5118	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5119	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5120	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
5121	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5122	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
5123	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
5124	VOPNAME_VNEVENT, 	{ .vop_vnevent = fs_vnevent_support },
5125	NULL,			NULL
5126};
5127
5128/*
5129 * Regular file vnode operations template
5130 */
5131vnodeops_t *zfs_fvnodeops;
5132const fs_operation_def_t zfs_fvnodeops_template[] = {
5133	VOPNAME_OPEN,		{ .vop_open = zfs_open },
5134	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
5135	VOPNAME_READ,		{ .vop_read = zfs_read },
5136	VOPNAME_WRITE,		{ .vop_write = zfs_write },
5137	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
5138	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5139	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
5140	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5141	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
5142	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
5143	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
5144	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5145	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5146	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
5147	VOPNAME_FRLOCK,		{ .vop_frlock = zfs_frlock },
5148	VOPNAME_SPACE,		{ .vop_space = zfs_space },
5149	VOPNAME_GETPAGE,	{ .vop_getpage = zfs_getpage },
5150	VOPNAME_PUTPAGE,	{ .vop_putpage = zfs_putpage },
5151	VOPNAME_MAP,		{ .vop_map = zfs_map },
5152	VOPNAME_ADDMAP,		{ .vop_addmap = zfs_addmap },
5153	VOPNAME_DELMAP,		{ .vop_delmap = zfs_delmap },
5154	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5155	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
5156	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
5157	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5158	VOPNAME_REQZCBUF, 	{ .vop_reqzcbuf = zfs_reqzcbuf },
5159	VOPNAME_RETZCBUF, 	{ .vop_retzcbuf = zfs_retzcbuf },
5160	NULL,			NULL
5161};
5162
5163/*
5164 * Symbolic link vnode operations template
5165 */
5166vnodeops_t *zfs_symvnodeops;
5167const fs_operation_def_t zfs_symvnodeops_template[] = {
5168	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5169	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
5170	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5171	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
5172	VOPNAME_READLINK,	{ .vop_readlink = zfs_readlink },
5173	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5174	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5175	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5176	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5177	NULL,			NULL
5178};
5179
5180/*
5181 * special share hidden files vnode operations template
5182 */
5183vnodeops_t *zfs_sharevnodeops;
5184const fs_operation_def_t zfs_sharevnodeops_template[] = {
5185	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5186	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5187	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5188	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5189	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5190	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
5191	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
5192	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5193	NULL,			NULL
5194};
5195
5196/*
5197 * Extended attribute directory vnode operations template
5198 *	This template is identical to the directory vnodes
5199 *	operation template except for restricted operations:
5200 *		VOP_MKDIR()
5201 *		VOP_SYMLINK()
5202 * Note that there are other restrictions embedded in:
5203 *	zfs_create()	- restrict type to VREG
5204 *	zfs_link()	- no links into/out of attribute space
5205 *	zfs_rename()	- no moves into/out of attribute space
5206 */
5207vnodeops_t *zfs_xdvnodeops;
5208const fs_operation_def_t zfs_xdvnodeops_template[] = {
5209	VOPNAME_OPEN,		{ .vop_open = zfs_open },
5210	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
5211	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
5212	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
5213	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
5214	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
5215	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
5216	VOPNAME_CREATE,		{ .vop_create = zfs_create },
5217	VOPNAME_REMOVE,		{ .vop_remove = zfs_remove },
5218	VOPNAME_LINK,		{ .vop_link = zfs_link },
5219	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
5220	VOPNAME_MKDIR,		{ .error = zfs_inval },
5221	VOPNAME_RMDIR,		{ .vop_rmdir = zfs_rmdir },
5222	VOPNAME_READDIR,	{ .vop_readdir = zfs_readdir },
5223	VOPNAME_SYMLINK,	{ .error = zfs_inval },
5224	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
5225	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5226	VOPNAME_FID,		{ .vop_fid = zfs_fid },
5227	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
5228	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5229	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
5230	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
5231	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
5232	NULL,			NULL
5233};
5234
5235/*
5236 * Error vnode operations template
5237 */
5238vnodeops_t *zfs_evnodeops;
5239const fs_operation_def_t zfs_evnodeops_template[] = {
5240	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
5241	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
5242	NULL,			NULL
5243};
5244