zfs_znode.c revision 224251
11558Srgrimes/*
250476Speter * CDDL HEADER START
31558Srgrimes *
412481Speter * The contents of this file are subject to the terms of the
51558Srgrimes * Common Development and Distribution License (the "License").
641061Sbde * You may not use this file except in compliance with the License.
774448Ssos *
841061Sbde * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
939271Sphk * or http://www.opensolaris.org/os/licensing.
1039255Sgibbs * See the License for the specific language governing permissions
1138653Sgpalmer * and limitations under the License.
1238653Sgpalmer *
1355980Speter * When distributing Covered Code, include this CDDL HEADER in each
1485380Sjlemon * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
1543859Sobrien * If applicable, add the following below this CDDL HEADER, with the
1638653Sgpalmer * fields enclosed by brackets "[]" replaced with your own identifying
1738653Sgpalmer * information: Portions Copyright [yyyy] [name of copyright owner]
1838653Sgpalmer *
1938653Sgpalmer * CDDL HEADER END
2038653Sgpalmer */
2138653Sgpalmer/*
2279458Sobrien * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
2379457Sobrien */
2438653Sgpalmer
2538653Sgpalmer/* Portions Copyright 2007 Jeremy Teo */
2669800Stomsoft
2738653Sgpalmer#ifdef _KERNEL
2838653Sgpalmer#include <sys/types.h>
2956815Sshin#include <sys/param.h>
3038653Sgpalmer#include <sys/time.h>
3178978Sroam#include <sys/systm.h>
3238653Sgpalmer#include <sys/sysmacros.h>
3338653Sgpalmer#include <sys/resource.h>
3438653Sgpalmer#include <sys/mntent.h>
3538843Sjb#include <sys/u8_textprep.h>
3638653Sgpalmer#include <sys/dsl_dataset.h>
3770450Sphk#include <sys/vfs.h>
3878448Sdd#include <sys/vnode.h>
3938653Sgpalmer#include <sys/file.h>
4038653Sgpalmer#include <sys/kmem.h>
4138653Sgpalmer#include <sys/errno.h>
4238653Sgpalmer#include <sys/unistd.h>
4377577Sru#include <sys/atomic.h>
4438653Sgpalmer#include <sys/zfs_dir.h>
4543557Ssemenu#include <sys/zfs_acl.h>
4677042Sru#include <sys/zfs_ioctl.h>
4777042Sru#include <sys/zfs_rlock.h>
4838653Sgpalmer#include <sys/zfs_fuid.h>
4994658Sscottl#include <sys/dnode.h>
5077042Sru#include <sys/fs/zfs.h>
5177042Sru#include <sys/kidmap.h>
5238653Sgpalmer#endif /* _KERNEL */
5344690Sbrian
5438653Sgpalmer#include <sys/dmu.h>
5538653Sgpalmer#include <sys/refcount.h>
5638653Sgpalmer#include <sys/stat.h>
5738653Sgpalmer#include <sys/zap.h>
5838653Sgpalmer#include <sys/zfs_znode.h>
5938653Sgpalmer#include <sys/sa.h>
6055163Sshin#include <sys/zfs_sa.h>
6138653Sgpalmer#include <sys/zfs_stat.h>
6298187Sgordon#include <sys/refcount.h>
6338653Sgpalmer
6438653Sgpalmer#include "zfs_prop.h"
6538653Sgpalmer#include "zfs_comutil.h"
6638653Sgpalmer
6755163Sshin/* Used by fstat(1). */
6893651SmarcelSYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, 0, sizeof(znode_t),
6938653Sgpalmer    "sizeof(znode_t)");
7038653Sgpalmer
/*
 * Statistics gathering is controlled by ZNODE_STATS.  A DEBUG build switches
 * it on automatically.  ZNODE_STAT_ADD() increments the named counter when
 * statistics are enabled and expands to nothing otherwise.
 */
#if defined(DEBUG)
#define	ZNODE_STATS
#endif	/* DEBUG */

#if !defined(ZNODE_STATS)
#define	ZNODE_STAT_ADD(stat)			/* nothing */
#else
#define	ZNODE_STAT_ADD(stat)			((stat)++)
#endif	/* !ZNODE_STATS */
8492868Sru
/*
 * Functions needed by userland (i.e. libzpool) are deliberately kept outside
 * of #ifdef _KERNEL; the remaining functions depend on kernel facilities
 * (such as VFS logic) that will not compile easily in userland.
 */
9097951Sgordon#ifdef _KERNEL
9186032Speter/*
9286032Speter * Needed to close a small window in zfs_znode_move() that allows the zfsvfs to
9386032Speter * be freed before it can be safely accessed.
9486032Speter */
9544317Sjkhkrwlock_t zfsvfs_lock;
9686032Speter
9744317Sjkhstatic kmem_cache_t *znode_cache = NULL;
9885954Speter
9985954Speter/*ARGSUSED*/
10038458Sjbstatic void
10138458Sjbznode_evict_error(dmu_buf_t *dbuf, void *user_ptr)
1021558Srgrimes{
103	/*
104	 * We should never drop all dbuf refs without first clearing
105	 * the eviction callback.
106	 */
107	panic("evicting znode %p\n", user_ptr);
108}
109
/* Vnode operation vectors that are defined outside of this file. */
extern struct vop_vector zfs_shareops;
extern struct vop_vector zfs_fifoops;
extern struct vop_vector zfs_vnodeops;
113
114/*
115 * XXX: We cannot use this function as a cache constructor, because
116 *      there is one global cache for all file systems and we need
117 *      to pass vfsp here, which is not possible, because argument
118 *      'cdrarg' is defined at kmem_cache_create() time.
119 */
120/*ARGSUSED*/
121static int
122zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
123{
124	znode_t *zp = buf;
125	vnode_t *vp;
126	vfs_t *vfsp = arg;
127	int error;
128
129	POINTER_INVALIDATE(&zp->z_zfsvfs);
130	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
131
132	if (vfsp != NULL) {
133		error = getnewvnode("zfs", vfsp, &zfs_vnodeops, &vp);
134		if (error != 0 && (kmflags & KM_NOSLEEP))
135			return (-1);
136		ASSERT(error == 0);
137		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
138		zp->z_vnode = vp;
139		vp->v_data = (caddr_t)zp;
140		VN_LOCK_AREC(vp);
141		VN_LOCK_ASHARE(vp);
142	} else {
143		zp->z_vnode = NULL;
144	}
145
146	list_link_init(&zp->z_link_node);
147
148	mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
149	rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
150	rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
151	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
152
153	mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
154	avl_create(&zp->z_range_avl, zfs_range_compare,
155	    sizeof (rl_t), offsetof(rl_t, r_node));
156
157	zp->z_dirlocks = NULL;
158	zp->z_acl_cached = NULL;
159	zp->z_moved = 0;
160	return (0);
161}
162
163/*ARGSUSED*/
164static void
165zfs_znode_cache_destructor(void *buf, void *arg)
166{
167	znode_t *zp = buf;
168
169	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
170	ASSERT(ZTOV(zp) == NULL);
171	vn_free(ZTOV(zp));
172	ASSERT(!list_link_active(&zp->z_link_node));
173	mutex_destroy(&zp->z_lock);
174	rw_destroy(&zp->z_parent_lock);
175	rw_destroy(&zp->z_name_lock);
176	mutex_destroy(&zp->z_acl_lock);
177	avl_destroy(&zp->z_range_avl);
178	mutex_destroy(&zp->z_range_lock);
179
180	ASSERT(zp->z_dirlocks == NULL);
181	ASSERT(zp->z_acl_cached == NULL);
182}
183
#ifdef	ZNODE_STATS
/*
 * Counters recording why zfs_znode_move() declined to move a znode.  Each
 * one is bumped through ZNODE_STAT_ADD() on the matching early return.
 */
struct znode_move_stats {
	uint64_t zms_zfsvfs_invalid;	/* z_zfsvfs pointer was not valid */
	uint64_t zms_zfsvfs_recheck1;	/* z_zfsvfs changed (zfsvfs_lock) */
	uint64_t zms_zfsvfs_unmounted;	/* file system was unmounted */
	uint64_t zms_zfsvfs_recheck2;	/* z_zfsvfs changed (z_znodes_lock) */
	uint64_t zms_obj_held;		/* object hold could not be taken */
	uint64_t zms_vnode_locked;	/* vnode lock could not be taken */
	uint64_t zms_not_only_dnlc;	/* vnode has non-DNLC references */
};

static struct znode_move_stats znode_move_stats;
#endif	/* ZNODE_STATS */
195
196#ifdef sun
197static void
198zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
199{
200	vnode_t *vp;
201
202	/* Copy fields. */
203	nzp->z_zfsvfs = ozp->z_zfsvfs;
204
205	/* Swap vnodes. */
206	vp = nzp->z_vnode;
207	nzp->z_vnode = ozp->z_vnode;
208	ozp->z_vnode = vp; /* let destructor free the overwritten vnode */
209	ZTOV(ozp)->v_data = ozp;
210	ZTOV(nzp)->v_data = nzp;
211
212	nzp->z_id = ozp->z_id;
213	ASSERT(ozp->z_dirlocks == NULL); /* znode not in use */
214	ASSERT(avl_numnodes(&ozp->z_range_avl) == 0);
215	nzp->z_unlinked = ozp->z_unlinked;
216	nzp->z_atime_dirty = ozp->z_atime_dirty;
217	nzp->z_zn_prefetch = ozp->z_zn_prefetch;
218	nzp->z_blksz = ozp->z_blksz;
219	nzp->z_seq = ozp->z_seq;
220	nzp->z_mapcnt = ozp->z_mapcnt;
221	nzp->z_gen = ozp->z_gen;
222	nzp->z_sync_cnt = ozp->z_sync_cnt;
223	nzp->z_is_sa = ozp->z_is_sa;
224	nzp->z_sa_hdl = ozp->z_sa_hdl;
225	bcopy(ozp->z_atime, nzp->z_atime, sizeof (uint64_t) * 2);
226	nzp->z_links = ozp->z_links;
227	nzp->z_size = ozp->z_size;
228	nzp->z_pflags = ozp->z_pflags;
229	nzp->z_uid = ozp->z_uid;
230	nzp->z_gid = ozp->z_gid;
231	nzp->z_mode = ozp->z_mode;
232
233	/*
234	 * Since this is just an idle znode and kmem is already dealing with
235	 * memory pressure, release any cached ACL.
236	 */
237	if (ozp->z_acl_cached) {
238		zfs_acl_free(ozp->z_acl_cached);
239		ozp->z_acl_cached = NULL;
240	}
241
242	sa_set_userp(nzp->z_sa_hdl, nzp);
243
244	/*
245	 * Invalidate the original znode by clearing fields that provide a
246	 * pointer back to the znode. Set the low bit of the vfs pointer to
247	 * ensure that zfs_znode_move() recognizes the znode as invalid in any
248	 * subsequent callback.
249	 */
250	ozp->z_sa_hdl = NULL;
251	POINTER_INVALIDATE(&ozp->z_zfsvfs);
252
253	/*
254	 * Mark the znode.
255	 */
256	nzp->z_moved = 1;
257	ozp->z_moved = (uint8_t)-1;
258}
259
260/*ARGSUSED*/
261static kmem_cbrc_t
262zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg)
263{
264	znode_t *ozp = buf, *nzp = newbuf;
265	zfsvfs_t *zfsvfs;
266	vnode_t *vp;
267
268	/*
269	 * The znode is on the file system's list of known znodes if the vfs
270	 * pointer is valid. We set the low bit of the vfs pointer when freeing
271	 * the znode to invalidate it, and the memory patterns written by kmem
272	 * (baddcafe and deadbeef) set at least one of the two low bits. A newly
273	 * created znode sets the vfs pointer last of all to indicate that the
274	 * znode is known and in a valid state to be moved by this function.
275	 */
276	zfsvfs = ozp->z_zfsvfs;
277	if (!POINTER_IS_VALID(zfsvfs)) {
278		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_invalid);
279		return (KMEM_CBRC_DONT_KNOW);
280	}
281
282	/*
283	 * Close a small window in which it's possible that the filesystem could
284	 * be unmounted and freed, and zfsvfs, though valid in the previous
285	 * statement, could point to unrelated memory by the time we try to
286	 * prevent the filesystem from being unmounted.
287	 */
288	rw_enter(&zfsvfs_lock, RW_WRITER);
289	if (zfsvfs != ozp->z_zfsvfs) {
290		rw_exit(&zfsvfs_lock);
291		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck1);
292		return (KMEM_CBRC_DONT_KNOW);
293	}
294
295	/*
296	 * If the znode is still valid, then so is the file system. We know that
297	 * no valid file system can be freed while we hold zfsvfs_lock, so we
298	 * can safely ensure that the filesystem is not and will not be
299	 * unmounted. The next statement is equivalent to ZFS_ENTER().
300	 */
301	rrw_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
302	if (zfsvfs->z_unmounted) {
303		ZFS_EXIT(zfsvfs);
304		rw_exit(&zfsvfs_lock);
305		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted);
306		return (KMEM_CBRC_DONT_KNOW);
307	}
308	rw_exit(&zfsvfs_lock);
309
310	mutex_enter(&zfsvfs->z_znodes_lock);
311	/*
312	 * Recheck the vfs pointer in case the znode was removed just before
313	 * acquiring the lock.
314	 */
315	if (zfsvfs != ozp->z_zfsvfs) {
316		mutex_exit(&zfsvfs->z_znodes_lock);
317		ZFS_EXIT(zfsvfs);
318		ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck2);
319		return (KMEM_CBRC_DONT_KNOW);
320	}
321
322	/*
323	 * At this point we know that as long as we hold z_znodes_lock, the
324	 * znode cannot be freed and fields within the znode can be safely
325	 * accessed. Now, prevent a race with zfs_zget().
326	 */
327	if (ZFS_OBJ_HOLD_TRYENTER(zfsvfs, ozp->z_id) == 0) {
328		mutex_exit(&zfsvfs->z_znodes_lock);
329		ZFS_EXIT(zfsvfs);
330		ZNODE_STAT_ADD(znode_move_stats.zms_obj_held);
331		return (KMEM_CBRC_LATER);
332	}
333
334	vp = ZTOV(ozp);
335	if (mutex_tryenter(&vp->v_lock) == 0) {
336		ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
337		mutex_exit(&zfsvfs->z_znodes_lock);
338		ZFS_EXIT(zfsvfs);
339		ZNODE_STAT_ADD(znode_move_stats.zms_vnode_locked);
340		return (KMEM_CBRC_LATER);
341	}
342
343	/* Only move znodes that are referenced _only_ by the DNLC. */
344	if (vp->v_count != 1 || !vn_in_dnlc(vp)) {
345		mutex_exit(&vp->v_lock);
346		ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
347		mutex_exit(&zfsvfs->z_znodes_lock);
348		ZFS_EXIT(zfsvfs);
349		ZNODE_STAT_ADD(znode_move_stats.zms_not_only_dnlc);
350		return (KMEM_CBRC_LATER);
351	}
352
353	/*
354	 * The znode is known and in a valid state to move. We're holding the
355	 * locks needed to execute the critical section.
356	 */
357	zfs_znode_move_impl(ozp, nzp);
358	mutex_exit(&vp->v_lock);
359	ZFS_OBJ_HOLD_EXIT(zfsvfs, ozp->z_id);
360
361	list_link_replace(&ozp->z_link_node, &nzp->z_link_node);
362	mutex_exit(&zfsvfs->z_znodes_lock);
363	ZFS_EXIT(zfsvfs);
364
365	return (KMEM_CBRC_YES);
366}
367#endif /* sun */
368
369void
370zfs_znode_init(void)
371{
372	/*
373	 * Initialize zcache
374	 */
375	rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL);
376	ASSERT(znode_cache == NULL);
377	znode_cache = kmem_cache_create("zfs_znode_cache",
378	    sizeof (znode_t), 0, /* zfs_znode_cache_constructor */ NULL,
379	    zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
380	kmem_cache_set_move(znode_cache, zfs_znode_move);
381}
382
383void
384zfs_znode_fini(void)
385{
386#ifdef sun
387	/*
388	 * Cleanup vfs & vnode ops
389	 */
390	zfs_remove_op_tables();
391#endif	/* sun */
392
393	/*
394	 * Cleanup zcache
395	 */
396	if (znode_cache)
397		kmem_cache_destroy(znode_cache);
398	znode_cache = NULL;
399	rw_destroy(&zfsvfs_lock);
400}
401
402#ifdef sun
/*
 * Vnode operation tables.  zfs_create_op_tables() fills them in and
 * zfs_remove_op_tables() frees them and resets them to NULL.
 */
struct vnodeops *zfs_dvnodeops;
struct vnodeops *zfs_fvnodeops;
struct vnodeops *zfs_symvnodeops;
struct vnodeops *zfs_xdvnodeops;
struct vnodeops *zfs_evnodeops;
struct vnodeops *zfs_sharevnodeops;
409
410void
411zfs_remove_op_tables()
412{
413	/*
414	 * Remove vfs ops
415	 */
416	ASSERT(zfsfstype);
417	(void) vfs_freevfsops_by_type(zfsfstype);
418	zfsfstype = 0;
419
420	/*
421	 * Remove vnode ops
422	 */
423	if (zfs_dvnodeops)
424		vn_freevnodeops(zfs_dvnodeops);
425	if (zfs_fvnodeops)
426		vn_freevnodeops(zfs_fvnodeops);
427	if (zfs_symvnodeops)
428		vn_freevnodeops(zfs_symvnodeops);
429	if (zfs_xdvnodeops)
430		vn_freevnodeops(zfs_xdvnodeops);
431	if (zfs_evnodeops)
432		vn_freevnodeops(zfs_evnodeops);
433	if (zfs_sharevnodeops)
434		vn_freevnodeops(zfs_sharevnodeops);
435
436	zfs_dvnodeops = NULL;
437	zfs_fvnodeops = NULL;
438	zfs_symvnodeops = NULL;
439	zfs_xdvnodeops = NULL;
440	zfs_evnodeops = NULL;
441	zfs_sharevnodeops = NULL;
442}
443
444extern const fs_operation_def_t zfs_dvnodeops_template[];
445extern const fs_operation_def_t zfs_fvnodeops_template[];
446extern const fs_operation_def_t zfs_xdvnodeops_template[];
447extern const fs_operation_def_t zfs_symvnodeops_template[];
448extern const fs_operation_def_t zfs_evnodeops_template[];
449extern const fs_operation_def_t zfs_sharevnodeops_template[];
450
451int
452zfs_create_op_tables()
453{
454	int error;
455
456	/*
457	 * zfs_dvnodeops can be set if mod_remove() calls mod_installfs()
458	 * due to a failure to remove the the 2nd modlinkage (zfs_modldrv).
459	 * In this case we just return as the ops vectors are already set up.
460	 */
461	if (zfs_dvnodeops)
462		return (0);
463
464	error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template,
465	    &zfs_dvnodeops);
466	if (error)
467		return (error);
468
469	error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template,
470	    &zfs_fvnodeops);
471	if (error)
472		return (error);
473
474	error = vn_make_ops(MNTTYPE_ZFS, zfs_symvnodeops_template,
475	    &zfs_symvnodeops);
476	if (error)
477		return (error);
478
479	error = vn_make_ops(MNTTYPE_ZFS, zfs_xdvnodeops_template,
480	    &zfs_xdvnodeops);
481	if (error)
482		return (error);
483
484	error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template,
485	    &zfs_evnodeops);
486	if (error)
487		return (error);
488
489	error = vn_make_ops(MNTTYPE_ZFS, zfs_sharevnodeops_template,
490	    &zfs_sharevnodeops);
491
492	return (error);
493}
494#endif	/* sun */
495
496int
497zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
498{
499	zfs_acl_ids_t acl_ids;
500	vattr_t vattr;
501	znode_t *sharezp;
502	vnode_t *vp, vnode;
503	znode_t *zp;
504	int error;
505
506	vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
507	vattr.va_type = VDIR;
508	vattr.va_mode = S_IFDIR|0555;
509	vattr.va_uid = crgetuid(kcred);
510	vattr.va_gid = crgetgid(kcred);
511
512	sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP);
513	zfs_znode_cache_constructor(sharezp, zfsvfs->z_parent->z_vfs, 0);
514	ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs));
515	sharezp->z_moved = 0;
516	sharezp->z_unlinked = 0;
517	sharezp->z_atime_dirty = 0;
518	sharezp->z_zfsvfs = zfsvfs;
519	sharezp->z_is_sa = zfsvfs->z_use_sa;
520
521	sharezp->z_vnode = &vnode;
522	vnode.v_data = sharezp;
523
524	vp = ZTOV(sharezp);
525	vp->v_type = VDIR;
526
527	VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
528	    kcred, NULL, &acl_ids));
529	zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids);
530	ASSERT3P(zp, ==, sharezp);
531	POINTER_INVALIDATE(&sharezp->z_zfsvfs);
532	error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
533	    ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx);
534	zfsvfs->z_shares_dir = sharezp->z_id;
535
536	zfs_acl_ids_free(&acl_ids);
537	ZTOV(sharezp)->v_data = NULL;
538	ZTOV(sharezp)->v_count = 0;
539	ZTOV(sharezp)->v_holdcnt = 0;
540	zp->z_vnode = NULL;
541	sa_handle_destroy(sharezp->z_sa_hdl);
542	sharezp->z_vnode = NULL;
543	kmem_cache_free(znode_cache, sharezp);
544
545	return (error);
546}
547
/*
 * A few device-number constants that must be available in both 64-bit and
 * 32-bit environments.  Each is defined here only when the system headers
 * have not already provided it.
 */
#if !defined(NBITSMINOR64)
#define	NBITSMINOR64	32
#endif
#if !defined(MAXMAJ64)
#define	MAXMAJ64	0xffffffffUL
#endif
#if !defined(MAXMIN64)
#define	MAXMIN64	0xffffffffUL
#endif
561
562/*
563 * Create special expldev for ZFS private use.
564 * Can't use standard expldev since it doesn't do
565 * what we want.  The standard expldev() takes a
566 * dev32_t in LP64 and expands it to a long dev_t.
567 * We need an interface that takes a dev32_t in ILP32
568 * and expands it to a long dev_t.
569 */
570static uint64_t
571zfs_expldev(dev_t dev)
572{
573	return (((uint64_t)major(dev) << NBITSMINOR64) | minor(dev));
574}
575/*
576 * Special cmpldev for ZFS private use.
577 * Can't use standard cmpldev since it takes
578 * a long dev_t and compresses it to dev32_t in
579 * LP64.  We need to do a compaction of a long dev_t
580 * to a dev32_t in ILP32.
581 */
582dev_t
583zfs_cmpldev(uint64_t dev)
584{
585	return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64)));
586}
587
588static void
589zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp,
590    dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl)
591{
592	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
593	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));
594
595	mutex_enter(&zp->z_lock);
596
597	ASSERT(zp->z_sa_hdl == NULL);
598	ASSERT(zp->z_acl_cached == NULL);
599	if (sa_hdl == NULL) {
600		VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp,
601		    SA_HDL_SHARED, &zp->z_sa_hdl));
602	} else {
603		zp->z_sa_hdl = sa_hdl;
604		sa_set_userp(sa_hdl, zp);
605	}
606
607	zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;
608
609	/*
610	 * Slap on VROOT if we are the root znode
611	 */
612	if (zp->z_id == zfsvfs->z_root)
613		ZTOV(zp)->v_flag |= VROOT;
614
615	mutex_exit(&zp->z_lock);
616	vn_exists(ZTOV(zp));
617}
618
619void
620zfs_znode_dmu_fini(znode_t *zp)
621{
622	ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) ||
623	    zp->z_unlinked ||
624	    RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock));
625
626	sa_handle_destroy(zp->z_sa_hdl);
627	zp->z_sa_hdl = NULL;
628}
629
630/*
631 * Construct a new znode/vnode and intialize.
632 *
633 * This does not do a call to dmu_set_user() that is
634 * up to the caller to do, in case you don't want to
635 * return the znode
636 */
637static znode_t *
638zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz,
639    dmu_object_type_t obj_type, sa_handle_t *hdl)
640{
641	znode_t	*zp;
642	vnode_t *vp;
643	uint64_t mode;
644	uint64_t parent;
645	sa_bulk_attr_t bulk[9];
646	int count = 0;
647
648	zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
649	zfs_znode_cache_constructor(zp, zfsvfs->z_parent->z_vfs, 0);
650
651	ASSERT(zp->z_dirlocks == NULL);
652	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
653	zp->z_moved = 0;
654
655	/*
656	 * Defer setting z_zfsvfs until the znode is ready to be a candidate for
657	 * the zfs_znode_move() callback.
658	 */
659	zp->z_sa_hdl = NULL;
660	zp->z_unlinked = 0;
661	zp->z_atime_dirty = 0;
662	zp->z_mapcnt = 0;
663	zp->z_id = db->db_object;
664	zp->z_blksz = blksz;
665	zp->z_seq = 0x7A4653;
666	zp->z_sync_cnt = 0;
667
668	vp = ZTOV(zp);
669
670	zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl);
671
672	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
673	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &zp->z_gen, 8);
674	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
675	    &zp->z_size, 8);
676	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
677	    &zp->z_links, 8);
678	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
679	    &zp->z_pflags, 8);
680	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8);
681	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
682	    &zp->z_atime, 16);
683	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
684	    &zp->z_uid, 8);
685	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
686	    &zp->z_gid, 8);
687
688	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0) {
689		if (hdl == NULL)
690			sa_handle_destroy(zp->z_sa_hdl);
691		kmem_cache_free(znode_cache, zp);
692		return (NULL);
693	}
694
695	zp->z_mode = mode;
696
697	vp->v_type = IFTOVT((mode_t)mode);
698
699	switch (vp->v_type) {
700	case VDIR:
701		zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
702		break;
703#ifdef sun
704	case VBLK:
705	case VCHR:
706		{
707			uint64_t rdev;
708			VERIFY(sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zfsvfs),
709			    &rdev, sizeof (rdev)) == 0);
710
711			vp->v_rdev = zfs_cmpldev(rdev);
712		}
713		break;
714#endif	/* sun */
715	case VFIFO:
716		vp->v_op = &zfs_fifoops;
717		break;
718	case VREG:
719		if (parent == zfsvfs->z_shares_dir) {
720			ASSERT(zp->z_uid == 0 && zp->z_gid == 0);
721			vp->v_op = &zfs_shareops;
722		}
723		break;
724	}
725	if (vp->v_type != VFIFO)
726		VN_LOCK_ASHARE(vp);
727
728	mutex_enter(&zfsvfs->z_znodes_lock);
729	list_insert_tail(&zfsvfs->z_all_znodes, zp);
730	membar_producer();
731	/*
732	 * Everything else must be valid before assigning z_zfsvfs makes the
733	 * znode eligible for zfs_znode_move().
734	 */
735	zp->z_zfsvfs = zfsvfs;
736	mutex_exit(&zfsvfs->z_znodes_lock);
737
738	VFS_HOLD(zfsvfs->z_vfs);
739	return (zp);
740}
741
742static uint64_t empty_xattr;
743static uint64_t pad[4];
744static zfs_acl_phys_t acl_phys;
745/*
746 * Create a new DMU object to hold a zfs znode.
747 *
748 *	IN:	dzp	- parent directory for new znode
749 *		vap	- file attributes for new znode
750 *		tx	- dmu transaction id for zap operations
751 *		cr	- credentials of caller
752 *		flag	- flags:
753 *			  IS_ROOT_NODE	- new object will be root
754 *			  IS_XATTR	- new object is an attribute
755 *		bonuslen - length of bonus buffer
756 *		setaclp  - File/Dir initial ACL
757 *		fuidp	 - Tracks fuid allocation.
758 *
759 *	OUT:	zpp	- allocated znode
760 *
761 */
762void
763zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
764    uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids)
765{
766	uint64_t	crtime[2], atime[2], mtime[2], ctime[2];
767	uint64_t	mode, size, links, parent, pflags;
768	uint64_t	dzp_pflags = 0;
769	uint64_t	rdev = 0;
770	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
771	dmu_buf_t	*db;
772	timestruc_t	now;
773	uint64_t	gen, obj;
774	int		err;
775	int		bonuslen;
776	sa_handle_t	*sa_hdl;
777	dmu_object_type_t obj_type;
778	sa_bulk_attr_t	sa_attrs[ZPL_END];
779	int		cnt = 0;
780	zfs_acl_locator_cb_t locate = { 0 };
781
782	ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
783
784	if (zfsvfs->z_replay) {
785		obj = vap->va_nodeid;
786		now = vap->va_ctime;		/* see zfs_replay_create() */
787		gen = vap->va_nblocks;		/* ditto */
788	} else {
789		obj = 0;
790		gethrestime(&now);
791		gen = dmu_tx_get_txg(tx);
792	}
793
794	obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE;
795	bonuslen = (obj_type == DMU_OT_SA) ?
796	    DN_MAX_BONUSLEN : ZFS_OLD_ZNODE_PHYS_SIZE;
797
798	/*
799	 * Create a new DMU object.
800	 */
801	/*
802	 * There's currently no mechanism for pre-reading the blocks that will
803	 * be needed to allocate a new object, so we accept the small chance
804	 * that there will be an i/o error and we will fail one of the
805	 * assertions below.
806	 */
807	if (vap->va_type == VDIR) {
808		if (zfsvfs->z_replay) {
809			err = zap_create_claim_norm(zfsvfs->z_os, obj,
810			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
811			    obj_type, bonuslen, tx);
812			ASSERT3U(err, ==, 0);
813		} else {
814			obj = zap_create_norm(zfsvfs->z_os,
815			    zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
816			    obj_type, bonuslen, tx);
817		}
818	} else {
819		if (zfsvfs->z_replay) {
820			err = dmu_object_claim(zfsvfs->z_os, obj,
821			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
822			    obj_type, bonuslen, tx);
823			ASSERT3U(err, ==, 0);
824		} else {
825			obj = dmu_object_alloc(zfsvfs->z_os,
826			    DMU_OT_PLAIN_FILE_CONTENTS, 0,
827			    obj_type, bonuslen, tx);
828		}
829	}
830
831	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
832	VERIFY(0 == sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
833
834	/*
835	 * If this is the root, fix up the half-initialized parent pointer
836	 * to reference the just-allocated physical data area.
837	 */
838	if (flag & IS_ROOT_NODE) {
839		dzp->z_id = obj;
840	} else {
841		dzp_pflags = dzp->z_pflags;
842	}
843
844	/*
845	 * If parent is an xattr, so am I.
846	 */
847	if (dzp_pflags & ZFS_XATTR) {
848		flag |= IS_XATTR;
849	}
850
851	if (zfsvfs->z_use_fuids)
852		pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED;
853	else
854		pflags = 0;
855
856	if (vap->va_type == VDIR) {
857		size = 2;		/* contents ("." and "..") */
858		links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1;
859	} else {
860		size = links = 0;
861	}
862
863	if (vap->va_type == VBLK || vap->va_type == VCHR) {
864		rdev = zfs_expldev(vap->va_rdev);
865	}
866
867	parent = dzp->z_id;
868	mode = acl_ids->z_mode;
869	if (flag & IS_XATTR)
870		pflags |= ZFS_XATTR;
871
872	/*
873	 * No execs denied will be deterimed when zfs_mode_compute() is called.
874	 */
875	pflags |= acl_ids->z_aclp->z_hints &
876	    (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT|
877	    ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED);
878
879	ZFS_TIME_ENCODE(&now, crtime);
880	ZFS_TIME_ENCODE(&now, ctime);
881
882	if (vap->va_mask & AT_ATIME) {
883		ZFS_TIME_ENCODE(&vap->va_atime, atime);
884	} else {
885		ZFS_TIME_ENCODE(&now, atime);
886	}
887
888	if (vap->va_mask & AT_MTIME) {
889		ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
890	} else {
891		ZFS_TIME_ENCODE(&now, mtime);
892	}
893
894	/* Now add in all of the "SA" attributes */
895	VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED,
896	    &sa_hdl));
897
898	/*
899	 * Setup the array of attributes to be replaced/set on the new file
900	 *
901	 * order for  DMU_OT_ZNODE is critical since it needs to be constructed
902	 * in the old znode_phys_t format.  Don't change this ordering
903	 */
904
905	if (obj_type == DMU_OT_ZNODE) {
906		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
907		    NULL, &atime, 16);
908		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
909		    NULL, &mtime, 16);
910		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
911		    NULL, &ctime, 16);
912		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
913		    NULL, &crtime, 16);
914		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
915		    NULL, &gen, 8);
916		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
917		    NULL, &mode, 8);
918		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
919		    NULL, &size, 8);
920		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
921		    NULL, &parent, 8);
922	} else {
923		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs),
924		    NULL, &mode, 8);
925		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs),
926		    NULL, &size, 8);
927		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs),
928		    NULL, &gen, 8);
929		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
930		    &acl_ids->z_fuid, 8);
931		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
932		    &acl_ids->z_fgid, 8);
933		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs),
934		    NULL, &parent, 8);
935		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
936		    NULL, &pflags, 8);
937		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs),
938		    NULL, &atime, 16);
939		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs),
940		    NULL, &mtime, 16);
941		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs),
942		    NULL, &ctime, 16);
943		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs),
944		    NULL, &crtime, 16);
945	}
946
947	SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
948
949	if (obj_type == DMU_OT_ZNODE) {
950		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL,
951		    &empty_xattr, 8);
952	}
953	if (obj_type == DMU_OT_ZNODE ||
954	    (vap->va_type == VBLK || vap->va_type == VCHR)) {
955		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs),
956		    NULL, &rdev, 8);
957
958	}
959	if (obj_type == DMU_OT_ZNODE) {
960		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs),
961		    NULL, &pflags, 8);
962		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL,
963		    &acl_ids->z_fuid, 8);
964		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL,
965		    &acl_ids->z_fgid, 8);
966		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad,
967		    sizeof (uint64_t) * 4);
968		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
969		    &acl_phys, sizeof (zfs_acl_phys_t));
970	} else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) {
971		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
972		    &acl_ids->z_aclp->z_acl_count, 8);
973		locate.cb_aclp = acl_ids->z_aclp;
974		SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs),
975		    zfs_acl_data_locator, &locate,
976		    acl_ids->z_aclp->z_acl_bytes);
977		mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
978		    acl_ids->z_fuid, acl_ids->z_fgid);
979	}
980
981	VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);
982
983	if (!(flag & IS_ROOT_NODE)) {
984		*zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl);
985		ASSERT(*zpp != NULL);
986	} else {
987		/*
988		 * If we are creating the root node, the "parent" we
989		 * passed in is the znode for the root.
990		 */
991		*zpp = dzp;
992
993		(*zpp)->z_sa_hdl = sa_hdl;
994	}
995
996	(*zpp)->z_pflags = pflags;
997	(*zpp)->z_mode = mode;
998
999	if (vap->va_mask & AT_XVATTR)
1000		zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx);
1001
1002	if (obj_type == DMU_OT_ZNODE ||
1003	    acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
1004		err = zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx);
1005		ASSERT3P(err, ==, 0);
1006	}
1007	if (!(flag & IS_ROOT_NODE)) {
1008		vnode_t *vp;
1009
1010		vp = ZTOV(*zpp);
1011		vp->v_vflag |= VV_FORCEINSMQ;
1012		err = insmntque(vp, zfsvfs->z_vfs);
1013		vp->v_vflag &= ~VV_FORCEINSMQ;
1014		KASSERT(err == 0, ("insmntque() failed: error %d", err));
1015	}
1016	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
1017}
1018
1019/*
1020 * zfs_xvattr_set only updates the in-core attributes
1021 * it is assumed the caller will be doing an sa_bulk_update
1022 * to push the changes out
1023 */
1024void
1025zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
1026{
1027	xoptattr_t *xoap;
1028
1029	xoap = xva_getxoptattr(xvap);
1030	ASSERT(xoap);
1031
1032	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
1033		uint64_t times[2];
1034		ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
1035		(void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs),
1036		    &times, sizeof (times), tx);
1037		XVA_SET_RTN(xvap, XAT_CREATETIME);
1038	}
1039	if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
1040		ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
1041		    zp->z_pflags, tx);
1042		XVA_SET_RTN(xvap, XAT_READONLY);
1043	}
1044	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
1045		ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
1046		    zp->z_pflags, tx);
1047		XVA_SET_RTN(xvap, XAT_HIDDEN);
1048	}
1049	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
1050		ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
1051		    zp->z_pflags, tx);
1052		XVA_SET_RTN(xvap, XAT_SYSTEM);
1053	}
1054	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
1055		ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
1056		    zp->z_pflags, tx);
1057		XVA_SET_RTN(xvap, XAT_ARCHIVE);
1058	}
1059	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
1060		ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
1061		    zp->z_pflags, tx);
1062		XVA_SET_RTN(xvap, XAT_IMMUTABLE);
1063	}
1064	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
1065		ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
1066		    zp->z_pflags, tx);
1067		XVA_SET_RTN(xvap, XAT_NOUNLINK);
1068	}
1069	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
1070		ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
1071		    zp->z_pflags, tx);
1072		XVA_SET_RTN(xvap, XAT_APPENDONLY);
1073	}
1074	if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
1075		ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
1076		    zp->z_pflags, tx);
1077		XVA_SET_RTN(xvap, XAT_NODUMP);
1078	}
1079	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
1080		ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
1081		    zp->z_pflags, tx);
1082		XVA_SET_RTN(xvap, XAT_OPAQUE);
1083	}
1084	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
1085		ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
1086		    xoap->xoa_av_quarantined, zp->z_pflags, tx);
1087		XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
1088	}
1089	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
1090		ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
1091		    zp->z_pflags, tx);
1092		XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
1093	}
1094	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
1095		zfs_sa_set_scanstamp(zp, xvap, tx);
1096		XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
1097	}
1098	if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
1099		ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
1100		    zp->z_pflags, tx);
1101		XVA_SET_RTN(xvap, XAT_REPARSE);
1102	}
1103	if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
1104		ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
1105		    zp->z_pflags, tx);
1106		XVA_SET_RTN(xvap, XAT_OFFLINE);
1107	}
1108	if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
1109		ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
1110		    zp->z_pflags, tx);
1111		XVA_SET_RTN(xvap, XAT_SPARSE);
1112	}
1113}
1114
1115int
1116zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
1117{
1118	dmu_object_info_t doi;
1119	dmu_buf_t	*db;
1120	znode_t		*zp;
1121	int err;
1122	sa_handle_t	*hdl;
1123	int first = 1;
1124
1125	*zpp = NULL;
1126
1127again:
1128	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
1129
1130	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
1131	if (err) {
1132		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1133		return (err);
1134	}
1135
1136	dmu_object_info_from_db(db, &doi);
1137	if (doi.doi_bonus_type != DMU_OT_SA &&
1138	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
1139	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
1140	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
1141		sa_buf_rele(db, NULL);
1142		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1143		return (EINVAL);
1144	}
1145
1146	hdl = dmu_buf_get_user(db);
1147	if (hdl != NULL) {
1148		zp  = sa_get_userdata(hdl);
1149
1150
1151		/*
1152		 * Since "SA" does immediate eviction we
1153		 * should never find a sa handle that doesn't
1154		 * know about the znode.
1155		 */
1156
1157		ASSERT3P(zp, !=, NULL);
1158
1159		mutex_enter(&zp->z_lock);
1160		ASSERT3U(zp->z_id, ==, obj_num);
1161		if (zp->z_unlinked) {
1162			err = ENOENT;
1163		} else {
1164			vnode_t *vp;
1165			int dying = 0;
1166
1167			vp = ZTOV(zp);
1168			if (vp == NULL)
1169				dying = 1;
1170			else {
1171				VN_HOLD(vp);
1172				if ((vp->v_iflag & VI_DOOMED) != 0) {
1173					dying = 1;
1174					/*
1175					 * Don't VN_RELE() vnode here, because
1176					 * it can call vn_lock() which creates
1177					 * LOR between vnode lock and znode
1178					 * lock. We will VN_RELE() the vnode
1179					 * after droping znode lock.
1180					 */
1181				}
1182			}
1183			if (dying) {
1184				if (first) {
1185					ZFS_LOG(1, "dying znode detected (zp=%p)", zp);
1186					first = 0;
1187				}
1188				/*
1189				 * znode is dying so we can't reuse it, we must
1190				 * wait until destruction is completed.
1191				 */
1192				sa_buf_rele(db, NULL);
1193				mutex_exit(&zp->z_lock);
1194				ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1195				if (vp != NULL)
1196					VN_RELE(vp);
1197				tsleep(zp, 0, "zcollide", 1);
1198				goto again;
1199			}
1200			*zpp = zp;
1201			err = 0;
1202		}
1203		sa_buf_rele(db, NULL);
1204		mutex_exit(&zp->z_lock);
1205		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1206		return (err);
1207	}
1208
1209	/*
1210	 * Not found create new znode/vnode
1211	 * but only if file exists.
1212	 *
1213	 * There is a small window where zfs_vget() could
1214	 * find this object while a file create is still in
1215	 * progress.  This is checked for in zfs_znode_alloc()
1216	 *
1217	 * if zfs_znode_alloc() fails it will drop the hold on the
1218	 * bonus buffer.
1219	 */
1220	zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size,
1221	    doi.doi_bonus_type, NULL);
1222	if (zp == NULL) {
1223		err = ENOENT;
1224	} else {
1225		*zpp = zp;
1226	}
1227	if (err == 0) {
1228		vnode_t *vp = ZTOV(zp);
1229
1230		err = insmntque(vp, zfsvfs->z_vfs);
1231		if (err == 0)
1232			VOP_UNLOCK(vp, 0);
1233		else {
1234			zp->z_vnode = NULL;
1235			zfs_znode_dmu_fini(zp);
1236			zfs_znode_free(zp);
1237			*zpp = NULL;
1238		}
1239	}
1240	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1241	return (err);
1242}
1243
1244int
1245zfs_rezget(znode_t *zp)
1246{
1247	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1248	dmu_object_info_t doi;
1249	dmu_buf_t *db;
1250	uint64_t obj_num = zp->z_id;
1251	uint64_t mode, size;
1252	sa_bulk_attr_t bulk[8];
1253	int err;
1254	int count = 0;
1255	uint64_t gen;
1256
1257	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
1258
1259	mutex_enter(&zp->z_acl_lock);
1260	if (zp->z_acl_cached) {
1261		zfs_acl_free(zp->z_acl_cached);
1262		zp->z_acl_cached = NULL;
1263	}
1264
1265	mutex_exit(&zp->z_acl_lock);
1266	ASSERT(zp->z_sa_hdl == NULL);
1267	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
1268	if (err) {
1269		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1270		return (err);
1271	}
1272
1273	dmu_object_info_from_db(db, &doi);
1274	if (doi.doi_bonus_type != DMU_OT_SA &&
1275	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
1276	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
1277	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
1278		sa_buf_rele(db, NULL);
1279		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1280		return (EINVAL);
1281	}
1282
1283	zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL);
1284	size = zp->z_size;
1285
1286	/* reload cached values */
1287	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
1288	    &gen, sizeof (gen));
1289	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
1290	    &zp->z_size, sizeof (zp->z_size));
1291	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
1292	    &zp->z_links, sizeof (zp->z_links));
1293	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
1294	    &zp->z_pflags, sizeof (zp->z_pflags));
1295	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
1296	    &zp->z_atime, sizeof (zp->z_atime));
1297	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
1298	    &zp->z_uid, sizeof (zp->z_uid));
1299	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
1300	    &zp->z_gid, sizeof (zp->z_gid));
1301	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
1302	    &mode, sizeof (mode));
1303
1304	if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) {
1305		zfs_znode_dmu_fini(zp);
1306		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1307		return (EIO);
1308	}
1309
1310	zp->z_mode = mode;
1311
1312	if (gen != zp->z_gen) {
1313		zfs_znode_dmu_fini(zp);
1314		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1315		return (EIO);
1316	}
1317
1318	/*
1319	 * XXXPJD: Not sure how is that possible, but under heavy
1320	 * zfs recv -F load it happens that z_gen is the same, but
1321	 * vnode type is different than znode type. This would mean
1322	 * that for example regular file was replaced with directory
1323	 * which has the same object number.
1324	 */
1325	if (ZTOV(zp) != NULL &&
1326	    ZTOV(zp)->v_type != IFTOVT((mode_t)zp->z_mode)) {
1327		zfs_znode_dmu_fini(zp);
1328		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1329		return (EIO);
1330	}
1331
1332	zp->z_unlinked = (zp->z_links == 0);
1333	zp->z_blksz = doi.doi_data_block_size;
1334	if (zp->z_size != size && ZTOV(zp) != NULL)
1335		vnode_pager_setsize(ZTOV(zp), zp->z_size);
1336
1337	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
1338
1339	return (0);
1340}
1341
1342void
1343zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
1344{
1345	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1346	objset_t *os = zfsvfs->z_os;
1347	uint64_t obj = zp->z_id;
1348	uint64_t acl_obj = zfs_external_acl(zp);
1349
1350	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
1351	if (acl_obj) {
1352		VERIFY(!zp->z_is_sa);
1353		VERIFY(0 == dmu_object_free(os, acl_obj, tx));
1354	}
1355	VERIFY(0 == dmu_object_free(os, obj, tx));
1356	zfs_znode_dmu_fini(zp);
1357	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
1358	zfs_znode_free(zp);
1359}
1360
1361void
1362zfs_zinactive(znode_t *zp)
1363{
1364	vnode_t	*vp = ZTOV(zp);
1365	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1366	uint64_t z_id = zp->z_id;
1367	int vfslocked;
1368
1369	ASSERT(zp->z_sa_hdl);
1370
1371	/*
1372	 * Don't allow a zfs_zget() while were trying to release this znode
1373	 */
1374	ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
1375
1376	mutex_enter(&zp->z_lock);
1377	VI_LOCK(vp);
1378	if (vp->v_count > 0) {
1379		/*
1380		 * If the hold count is greater than zero, somebody has
1381		 * obtained a new reference on this znode while we were
1382		 * processing it here, so we are done.
1383		 */
1384		VI_UNLOCK(vp);
1385		mutex_exit(&zp->z_lock);
1386		ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
1387		return;
1388	}
1389	VI_UNLOCK(vp);
1390
1391	/*
1392	 * If this was the last reference to a file with no links,
1393	 * remove the file from the file system.
1394	 */
1395	if (zp->z_unlinked) {
1396		mutex_exit(&zp->z_lock);
1397		ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
1398		ASSERT(vp->v_count == 0);
1399		vrecycle(vp, curthread);
1400		vfslocked = VFS_LOCK_GIANT(zfsvfs->z_vfs);
1401		zfs_rmnode(zp);
1402		VFS_UNLOCK_GIANT(vfslocked);
1403		return;
1404	}
1405
1406	mutex_exit(&zp->z_lock);
1407	ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
1408}
1409
1410void
1411zfs_znode_free(znode_t *zp)
1412{
1413	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1414
1415	ASSERT(ZTOV(zp) == NULL);
1416	ASSERT(zp->z_sa_hdl == NULL);
1417	mutex_enter(&zfsvfs->z_znodes_lock);
1418	POINTER_INVALIDATE(&zp->z_zfsvfs);
1419	list_remove(&zfsvfs->z_all_znodes, zp);
1420	mutex_exit(&zfsvfs->z_znodes_lock);
1421
1422	if (zp->z_acl_cached) {
1423		zfs_acl_free(zp->z_acl_cached);
1424		zp->z_acl_cached = NULL;
1425	}
1426
1427	kmem_cache_free(znode_cache, zp);
1428
1429	VFS_RELE(zfsvfs->z_vfs);
1430}
1431
1432void
1433zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
1434    uint64_t ctime[2], boolean_t have_tx)
1435{
1436	timestruc_t	now;
1437
1438	gethrestime(&now);
1439
1440	if (have_tx) {	/* will sa_bulk_update happen really soon? */
1441		zp->z_atime_dirty = 0;
1442		zp->z_seq++;
1443	} else {
1444		zp->z_atime_dirty = 1;
1445	}
1446
1447	if (flag & AT_ATIME) {
1448		ZFS_TIME_ENCODE(&now, zp->z_atime);
1449	}
1450
1451	if (flag & AT_MTIME) {
1452		ZFS_TIME_ENCODE(&now, mtime);
1453		if (zp->z_zfsvfs->z_use_fuids) {
1454			zp->z_pflags |= (ZFS_ARCHIVE |
1455			    ZFS_AV_MODIFIED);
1456		}
1457	}
1458
1459	if (flag & AT_CTIME) {
1460		ZFS_TIME_ENCODE(&now, ctime);
1461		if (zp->z_zfsvfs->z_use_fuids)
1462			zp->z_pflags |= ZFS_ARCHIVE;
1463	}
1464}
1465
1466/*
1467 * Grow the block size for a file.
1468 *
1469 *	IN:	zp	- znode of file to free data in.
1470 *		size	- requested block size
1471 *		tx	- open transaction.
1472 *
1473 * NOTE: this function assumes that the znode is write locked.
1474 */
1475void
1476zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
1477{
1478	int		error;
1479	u_longlong_t	dummy;
1480
1481	if (size <= zp->z_blksz)
1482		return;
1483	/*
1484	 * If the file size is already greater than the current blocksize,
1485	 * we will not grow.  If there is more than one block in a file,
1486	 * the blocksize cannot change.
1487	 */
1488	if (zp->z_blksz && zp->z_size > zp->z_blksz)
1489		return;
1490
1491	error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
1492	    size, 0, tx);
1493
1494	if (error == ENOTSUP)
1495		return;
1496	ASSERT3U(error, ==, 0);
1497
1498	/* What blocksize did we actually get? */
1499	dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
1500}
1501
#ifdef sun
/*
 * Placeholder putpage callback for situations in which
 * pvn_vplist_dirty() must *not* call back into the file system, e.g.
 * when truncating a file, where the pages being "thrown away" don't
 * need to be written out.  It is never expected to run: it asserts
 * and otherwise reports success.
 */
/* ARGSUSED */
static int
zfs_no_putpage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
    int flags, cred_t *cr)
{
	ASSERT(0);
	return (0);
}
#endif	/* sun */
1517
1518/*
1519 * Increase the file length
1520 *
1521 *	IN:	zp	- znode of file to free data in.
1522 *		end	- new end-of-file
1523 *
1524 * 	RETURN:	0 if success
1525 *		error code if failure
1526 */
1527static int
1528zfs_extend(znode_t *zp, uint64_t end)
1529{
1530	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1531	dmu_tx_t *tx;
1532	rl_t *rl;
1533	uint64_t newblksz;
1534	int error;
1535
1536	/*
1537	 * We will change zp_size, lock the whole file.
1538	 */
1539	rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
1540
1541	/*
1542	 * Nothing to do if file already at desired length.
1543	 */
1544	if (end <= zp->z_size) {
1545		zfs_range_unlock(rl);
1546		return (0);
1547	}
1548top:
1549	tx = dmu_tx_create(zfsvfs->z_os);
1550	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1551	zfs_sa_upgrade_txholds(tx, zp);
1552	if (end > zp->z_blksz &&
1553	    (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
1554		/*
1555		 * We are growing the file past the current block size.
1556		 */
1557		if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
1558			ASSERT(!ISP2(zp->z_blksz));
1559			newblksz = MIN(end, SPA_MAXBLOCKSIZE);
1560		} else {
1561			newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
1562		}
1563		dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
1564	} else {
1565		newblksz = 0;
1566	}
1567
1568	error = dmu_tx_assign(tx, TXG_NOWAIT);
1569	if (error) {
1570		if (error == ERESTART) {
1571			dmu_tx_wait(tx);
1572			dmu_tx_abort(tx);
1573			goto top;
1574		}
1575		dmu_tx_abort(tx);
1576		zfs_range_unlock(rl);
1577		return (error);
1578	}
1579
1580	if (newblksz)
1581		zfs_grow_blocksize(zp, newblksz, tx);
1582
1583	zp->z_size = end;
1584
1585	VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs),
1586	    &zp->z_size, sizeof (zp->z_size), tx));
1587
1588	vnode_pager_setsize(ZTOV(zp), end);
1589
1590	zfs_range_unlock(rl);
1591
1592	dmu_tx_commit(tx);
1593
1594	return (0);
1595}
1596
1597/*
1598 * Free space in a file.
1599 *
1600 *	IN:	zp	- znode of file to free data in.
1601 *		off	- start of section to free.
1602 *		len	- length of section to free.
1603 *
1604 * 	RETURN:	0 if success
1605 *		error code if failure
1606 */
1607static int
1608zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
1609{
1610	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1611	rl_t *rl;
1612	int error;
1613
1614	/*
1615	 * Lock the range being freed.
1616	 */
1617	rl = zfs_range_lock(zp, off, len, RL_WRITER);
1618
1619	/*
1620	 * Nothing to do if file already at desired length.
1621	 */
1622	if (off >= zp->z_size) {
1623		zfs_range_unlock(rl);
1624		return (0);
1625	}
1626
1627	if (off + len > zp->z_size)
1628		len = zp->z_size - off;
1629
1630	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len);
1631
1632	if (error == 0) {
1633		/*
1634		 * In FreeBSD we cannot free block in the middle of a file,
1635		 * but only at the end of a file, so this code path should
1636		 * never happen.
1637		 */
1638		vnode_pager_setsize(ZTOV(zp), off);
1639	}
1640
1641	zfs_range_unlock(rl);
1642
1643	return (error);
1644}
1645
1646/*
1647 * Truncate a file
1648 *
1649 *	IN:	zp	- znode of file to free data in.
1650 *		end	- new end-of-file.
1651 *
1652 * 	RETURN:	0 if success
1653 *		error code if failure
1654 */
1655static int
1656zfs_trunc(znode_t *zp, uint64_t end)
1657{
1658	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1659	vnode_t *vp = ZTOV(zp);
1660	dmu_tx_t *tx;
1661	rl_t *rl;
1662	int error;
1663	sa_bulk_attr_t bulk[2];
1664	int count = 0;
1665
1666	/*
1667	 * We will change zp_size, lock the whole file.
1668	 */
1669	rl = zfs_range_lock(zp, 0, UINT64_MAX, RL_WRITER);
1670
1671	/*
1672	 * Nothing to do if file already at desired length.
1673	 */
1674	if (end >= zp->z_size) {
1675		zfs_range_unlock(rl);
1676		return (0);
1677	}
1678
1679	error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end,  -1);
1680	if (error) {
1681		zfs_range_unlock(rl);
1682		return (error);
1683	}
1684top:
1685	tx = dmu_tx_create(zfsvfs->z_os);
1686	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1687	zfs_sa_upgrade_txholds(tx, zp);
1688	error = dmu_tx_assign(tx, TXG_NOWAIT);
1689	if (error) {
1690		if (error == ERESTART) {
1691			dmu_tx_wait(tx);
1692			dmu_tx_abort(tx);
1693			goto top;
1694		}
1695		dmu_tx_abort(tx);
1696		zfs_range_unlock(rl);
1697		return (error);
1698	}
1699
1700	zp->z_size = end;
1701	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
1702	    NULL, &zp->z_size, sizeof (zp->z_size));
1703
1704	if (end == 0) {
1705		zp->z_pflags &= ~ZFS_SPARSE;
1706		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
1707		    NULL, &zp->z_pflags, 8);
1708	}
1709	VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0);
1710
1711	dmu_tx_commit(tx);
1712
1713	/*
1714	 * Clear any mapped pages in the truncated region.  This has to
1715	 * happen outside of the transaction to avoid the possibility of
1716	 * a deadlock with someone trying to push a page that we are
1717	 * about to invalidate.
1718	 */
1719	vnode_pager_setsize(vp, end);
1720
1721	zfs_range_unlock(rl);
1722
1723	return (0);
1724}
1725
1726/*
1727 * Free space in a file
1728 *
1729 *	IN:	zp	- znode of file to free data in.
1730 *		off	- start of range
1731 *		len	- end of range (0 => EOF)
1732 *		flag	- current file open mode flags.
1733 *		log	- TRUE if this action should be logged
1734 *
1735 * 	RETURN:	0 if success
1736 *		error code if failure
1737 */
1738int
1739zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
1740{
1741	vnode_t *vp = ZTOV(zp);
1742	dmu_tx_t *tx;
1743	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1744	zilog_t *zilog = zfsvfs->z_log;
1745	uint64_t mode;
1746	uint64_t mtime[2], ctime[2];
1747	sa_bulk_attr_t bulk[3];
1748	int count = 0;
1749	int error;
1750
1751	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode,
1752	    sizeof (mode))) != 0)
1753		return (error);
1754
1755	if (off > zp->z_size) {
1756		error =  zfs_extend(zp, off+len);
1757		if (error == 0 && log)
1758			goto log;
1759		else
1760			return (error);
1761	}
1762
1763	/*
1764	 * Check for any locks in the region to be freed.
1765	 */
1766
1767	if (MANDLOCK(vp, (mode_t)mode)) {
1768		uint64_t length = (len ? len : zp->z_size - off);
1769		if (error = chklock(vp, FWRITE, off, length, flag, NULL))
1770			return (error);
1771	}
1772
1773	if (len == 0) {
1774		error = zfs_trunc(zp, off);
1775	} else {
1776		if ((error = zfs_free_range(zp, off, len)) == 0 &&
1777		    off + len > zp->z_size)
1778			error = zfs_extend(zp, off+len);
1779	}
1780	if (error || !log)
1781		return (error);
1782log:
1783	tx = dmu_tx_create(zfsvfs->z_os);
1784	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1785	zfs_sa_upgrade_txholds(tx, zp);
1786	error = dmu_tx_assign(tx, TXG_NOWAIT);
1787	if (error) {
1788		if (error == ERESTART) {
1789			dmu_tx_wait(tx);
1790			dmu_tx_abort(tx);
1791			goto log;
1792		}
1793		dmu_tx_abort(tx);
1794		return (error);
1795	}
1796
1797	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16);
1798	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16);
1799	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
1800	    NULL, &zp->z_pflags, 8);
1801	zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
1802	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
1803	ASSERT(error == 0);
1804
1805	zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len);
1806
1807	dmu_tx_commit(tx);
1808	return (0);
1809}
1810
1811void
1812zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
1813{
1814	zfsvfs_t	zfsvfs;
1815	uint64_t	moid, obj, sa_obj, version;
1816	uint64_t	sense = ZFS_CASE_SENSITIVE;
1817	uint64_t	norm = 0;
1818	nvpair_t	*elem;
1819	int		error;
1820	int		i;
1821	znode_t		*rootzp = NULL;
1822	vnode_t		vnode;
1823	vattr_t		vattr;
1824	znode_t		*zp;
1825	zfs_acl_ids_t	acl_ids;
1826
1827	/*
1828	 * First attempt to create master node.
1829	 */
1830	/*
1831	 * In an empty objset, there are no blocks to read and thus
1832	 * there can be no i/o errors (which we assert below).
1833	 */
1834	moid = MASTER_NODE_OBJ;
1835	error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
1836	    DMU_OT_NONE, 0, tx);
1837	ASSERT(error == 0);
1838
1839	/*
1840	 * Set starting attributes.
1841	 */
1842	version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os)));
1843	elem = NULL;
1844	while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) {
1845		/* For the moment we expect all zpl props to be uint64_ts */
1846		uint64_t val;
1847		char *name;
1848
1849		ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64);
1850		VERIFY(nvpair_value_uint64(elem, &val) == 0);
1851		name = nvpair_name(elem);
1852		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) {
1853			if (val < version)
1854				version = val;
1855		} else {
1856			error = zap_update(os, moid, name, 8, 1, &val, tx);
1857		}
1858		ASSERT(error == 0);
1859		if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0)
1860			norm = val;
1861		else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0)
1862			sense = val;
1863	}
1864	ASSERT(version != 0);
1865	error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx);
1866
1867	/*
1868	 * Create zap object used for SA attribute registration
1869	 */
1870
1871	if (version >= ZPL_VERSION_SA) {
1872		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
1873		    DMU_OT_NONE, 0, tx);
1874		error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
1875		ASSERT(error == 0);
1876	} else {
1877		sa_obj = 0;
1878	}
1879	/*
1880	 * Create a delete queue.
1881	 */
1882	obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx);
1883
1884	error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx);
1885	ASSERT(error == 0);
1886
1887	/*
1888	 * Create root znode.  Create minimal znode/vnode/zfsvfs
1889	 * to allow zfs_mknode to work.
1890	 */
1891	VATTR_NULL(&vattr);
1892	vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE;
1893	vattr.va_type = VDIR;
1894	vattr.va_mode = S_IFDIR|0755;
1895	vattr.va_uid = crgetuid(cr);
1896	vattr.va_gid = crgetgid(cr);
1897
1898	bzero(&zfsvfs, sizeof (zfsvfs_t));
1899
1900	rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
1901	zfs_znode_cache_constructor(rootzp, NULL, 0);
1902	ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
1903	rootzp->z_moved = 0;
1904	rootzp->z_unlinked = 0;
1905	rootzp->z_atime_dirty = 0;
1906	rootzp->z_is_sa = USE_SA(version, os);
1907
1908	vnode.v_type = VDIR;
1909	vnode.v_data = rootzp;
1910	rootzp->z_vnode = &vnode;
1911
1912	zfsvfs.z_os = os;
1913	zfsvfs.z_parent = &zfsvfs;
1914	zfsvfs.z_version = version;
1915	zfsvfs.z_use_fuids = USE_FUIDS(version, os);
1916	zfsvfs.z_use_sa = USE_SA(version, os);
1917	zfsvfs.z_norm = norm;
1918
1919	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
1920	    &zfsvfs.z_attr_table);
1921
1922	ASSERT(error == 0);
1923
1924	/*
1925	 * Fold case on file systems that are always or sometimes case
1926	 * insensitive.
1927	 */
1928	if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
1929		zfsvfs.z_norm |= U8_TEXTPREP_TOUPPER;
1930
1931	mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
1932	list_create(&zfsvfs.z_all_znodes, sizeof (znode_t),
1933	    offsetof(znode_t, z_link_node));
1934
1935	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1936		mutex_init(&zfsvfs.z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
1937
1938	rootzp->z_zfsvfs = &zfsvfs;
1939	VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
1940	    cr, NULL, &acl_ids));
1941	zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
1942	ASSERT3P(zp, ==, rootzp);
1943	error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx);
1944	ASSERT(error == 0);
1945	zfs_acl_ids_free(&acl_ids);
1946	POINTER_INVALIDATE(&rootzp->z_zfsvfs);
1947
1948	sa_handle_destroy(rootzp->z_sa_hdl);
1949	rootzp->z_vnode = NULL;
1950	kmem_cache_free(znode_cache, rootzp);
1951
1952	/*
1953	 * Create shares directory
1954	 */
1955
1956	error = zfs_create_share_dir(&zfsvfs, tx);
1957
1958	ASSERT(error == 0);
1959
1960	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
1961		mutex_destroy(&zfsvfs.z_hold_mtx[i]);
1962}
1963
1964#endif /* _KERNEL */
1965
1966static int
1967zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table)
1968{
1969	uint64_t sa_obj = 0;
1970	int error;
1971
1972	error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj);
1973	if (error != 0 && error != ENOENT)
1974		return (error);
1975
1976	error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table);
1977	return (error);
1978}
1979
1980static int
1981zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp,
1982    dmu_buf_t **db, void *tag)
1983{
1984	dmu_object_info_t doi;
1985	int error;
1986
1987	if ((error = sa_buf_hold(osp, obj, tag, db)) != 0)
1988		return (error);
1989
1990	dmu_object_info_from_db(*db, &doi);
1991	if ((doi.doi_bonus_type != DMU_OT_SA &&
1992	    doi.doi_bonus_type != DMU_OT_ZNODE) ||
1993	    doi.doi_bonus_type == DMU_OT_ZNODE &&
1994	    doi.doi_bonus_size < sizeof (znode_phys_t)) {
1995		sa_buf_rele(*db, tag);
1996		return (ENOTSUP);
1997	}
1998
1999	error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp);
2000	if (error != 0) {
2001		sa_buf_rele(*db, tag);
2002		return (error);
2003	}
2004
2005	return (0);
2006}
2007
2008void
2009zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag)
2010{
2011	sa_handle_destroy(hdl);
2012	sa_buf_rele(db, tag);
2013}
2014
2015/*
2016 * Given an object number, return its parent object number and whether
2017 * or not the object is an extended attribute directory.
2018 */
2019static int
2020zfs_obj_to_pobj(sa_handle_t *hdl, sa_attr_type_t *sa_table, uint64_t *pobjp,
2021    int *is_xattrdir)
2022{
2023	uint64_t parent;
2024	uint64_t pflags;
2025	uint64_t mode;
2026	sa_bulk_attr_t bulk[3];
2027	int count = 0;
2028	int error;
2029
2030	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL,
2031	    &parent, sizeof (parent));
2032	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL,
2033	    &pflags, sizeof (pflags));
2034	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
2035	    &mode, sizeof (mode));
2036
2037	if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0)
2038		return (error);
2039
2040	*pobjp = parent;
2041	*is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode);
2042
2043	return (0);
2044}
2045
2046/*
2047 * Given an object number, return some zpl level statistics
2048 */
2049static int
2050zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table,
2051    zfs_stat_t *sb)
2052{
2053	sa_bulk_attr_t bulk[4];
2054	int count = 0;
2055
2056	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
2057	    &sb->zs_mode, sizeof (sb->zs_mode));
2058	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL,
2059	    &sb->zs_gen, sizeof (sb->zs_gen));
2060	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL,
2061	    &sb->zs_links, sizeof (sb->zs_links));
2062	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL,
2063	    &sb->zs_ctime, sizeof (sb->zs_ctime));
2064
2065	return (sa_bulk_lookup(hdl, bulk, count));
2066}
2067
2068static int
2069zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
2070    sa_attr_type_t *sa_table, char *buf, int len)
2071{
2072	sa_handle_t *sa_hdl;
2073	sa_handle_t *prevhdl = NULL;
2074	dmu_buf_t *prevdb = NULL;
2075	dmu_buf_t *sa_db = NULL;
2076	char *path = buf + len - 1;
2077	int error;
2078
2079	*path = '\0';
2080	sa_hdl = hdl;
2081
2082	for (;;) {
2083		uint64_t pobj;
2084		char component[MAXNAMELEN + 2];
2085		size_t complen;
2086		int is_xattrdir;
2087
2088		if (prevdb)
2089			zfs_release_sa_handle(prevhdl, prevdb, FTAG);
2090
2091		if ((error = zfs_obj_to_pobj(sa_hdl, sa_table, &pobj,
2092		    &is_xattrdir)) != 0)
2093			break;
2094
2095		if (pobj == obj) {
2096			if (path[0] != '/')
2097				*--path = '/';
2098			break;
2099		}
2100
2101		component[0] = '/';
2102		if (is_xattrdir) {
2103			(void) sprintf(component + 1, "<xattrdir>");
2104		} else {
2105			error = zap_value_search(osp, pobj, obj,
2106			    ZFS_DIRENT_OBJ(-1ULL), component + 1);
2107			if (error != 0)
2108				break;
2109		}
2110
2111		complen = strlen(component);
2112		path -= complen;
2113		ASSERT(path >= buf);
2114		bcopy(component, path, complen);
2115		obj = pobj;
2116
2117		if (sa_hdl != hdl) {
2118			prevhdl = sa_hdl;
2119			prevdb = sa_db;
2120		}
2121		error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG);
2122		if (error != 0) {
2123			sa_hdl = prevhdl;
2124			sa_db = prevdb;
2125			break;
2126		}
2127	}
2128
2129	if (sa_hdl != NULL && sa_hdl != hdl) {
2130		ASSERT(sa_db != NULL);
2131		zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
2132	}
2133
2134	if (error == 0)
2135		(void) memmove(buf, path, buf + len - path);
2136
2137	return (error);
2138}
2139
2140int
2141zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len)
2142{
2143	sa_attr_type_t *sa_table;
2144	sa_handle_t *hdl;
2145	dmu_buf_t *db;
2146	int error;
2147
2148	error = zfs_sa_setup(osp, &sa_table);
2149	if (error != 0)
2150		return (error);
2151
2152	error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
2153	if (error != 0)
2154		return (error);
2155
2156	error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
2157
2158	zfs_release_sa_handle(hdl, db, FTAG);
2159	return (error);
2160}
2161
2162int
2163zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb,
2164    char *buf, int len)
2165{
2166	char *path = buf + len - 1;
2167	sa_attr_type_t *sa_table;
2168	sa_handle_t *hdl;
2169	dmu_buf_t *db;
2170	int error;
2171
2172	*path = '\0';
2173
2174	error = zfs_sa_setup(osp, &sa_table);
2175	if (error != 0)
2176		return (error);
2177
2178	error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG);
2179	if (error != 0)
2180		return (error);
2181
2182	error = zfs_obj_to_stats_impl(hdl, sa_table, sb);
2183	if (error != 0) {
2184		zfs_release_sa_handle(hdl, db, FTAG);
2185		return (error);
2186	}
2187
2188	error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len);
2189
2190	zfs_release_sa_handle(hdl, db, FTAG);
2191	return (error);
2192}
2193