1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
25 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
26 * Copyright 2023 RackTop Systems, Inc.
27 */
28
29#include <sys/zfs_context.h>
30#include <sys/types.h>
31#include <sys/param.h>
32#include <sys/sysmacros.h>
33#include <sys/dmu.h>
34#include <sys/dmu_impl.h>
35#include <sys/dmu_objset.h>
36#include <sys/dmu_tx.h>
37#include <sys/dbuf.h>
38#include <sys/dnode.h>
39#include <sys/zap.h>
40#include <sys/sa.h>
41#include <sys/sunddi.h>
42#include <sys/sa_impl.h>
43#include <sys/errno.h>
44#include <sys/zfs_context.h>
45
46#ifdef _KERNEL
47#include <sys/zfs_znode.h>
48#endif
49
50/*
51 * ZFS System attributes:
52 *
53 * A generic mechanism to allow for arbitrary attributes
54 * to be stored in a dnode.  The data will be stored in the bonus buffer of
55 * the dnode and if necessary a special "spill" block will be used to handle
56 * overflow situations.  The spill block will be sized to fit the data
57 * from 512 - 128K.  When a spill block is used the BP (blkptr_t) for the
58 * spill block is stored at the end of the current bonus buffer.  Any
59 * attributes that would be in the way of the blkptr_t will be relocated
60 * into the spill block.
61 *
62 * Attribute registration:
63 *
64 * Stored persistently on a per dataset basis
65 * a mapping between attribute "string" names and their actual attribute
66 * numeric values, length, and byteswap function.  The names are only used
67 * during registration.  All  attributes are known by their unique attribute
68 * id value.  If an attribute can have a variable size then the value
69 * 0 will be used to indicate this.
70 *
71 * Attribute Layout:
72 *
73 * Attribute layouts are a way to compactly store multiple attributes, but
74 * without taking the overhead associated with managing each attribute
75 * individually.  Since you will typically have the same set of attributes
76 * stored in the same order a single table will be used to represent that
77 * layout.  The ZPL for example will usually have only about 10 different
78 * layouts (regular files, device files, symlinks,
79 * regular files + scanstamp, files/dir with extended attributes, and then
80 * you have the possibility of all of those minus ACL, because it would
81 * be kicked out into the spill block)
82 *
83 * Layouts are simply an array of the attributes and their
84 * ordering i.e. [0, 1, 4, 5, 2]
85 *
86 * Each distinct layout is given a unique layout number and that is what's
87 * stored in the header at the beginning of the SA data buffer.
88 *
89 * A layout only covers a single dbuf (bonus or spill).  If a set of
90 * attributes is split up between the bonus buffer and a spill buffer then
91 * two different layouts will be used.  This allows us to byteswap the
92 * spill without looking at the bonus buffer and keeps the on disk format of
93 * the bonus and spill buffer the same.
94 *
95 * Adding a single attribute will cause the entire set of attributes to
96 * be rewritten and could result in a new layout number being constructed
97 * as part of the rewrite if no such layout exists for the new set of
98 * attributes.  The new attribute will be appended to the end of the already
99 * existing attributes.
100 *
101 * Both the attribute registration and attribute layout information are
 * stored in normal ZAP attributes.  There should be a small number of
103 * known layouts and the set of attributes is assumed to typically be quite
104 * small.
105 *
106 * The registered attributes and layout "table" information is maintained
107 * in core and a special "sa_os_t" is attached to the objset_t.
108 *
109 * A special interface is provided to allow for quickly applying
110 * a large set of attributes at once.  sa_replace_all_by_template() is
111 * used to set an array of attributes.  This is used by the ZPL when
112 * creating a brand new file.  The template that is passed into the function
113 * specifies the attribute, size for variable length attributes, location of
114 * data and special "data locator" function if the data isn't in a contiguous
115 * location.
116 *
117 * Byteswap implications:
118 *
119 * Since the SA attributes are not entirely self describing we can't do
120 * the normal byteswap processing.  The special ZAP layout attribute and
121 * attribute registration attributes define the byteswap function and the
122 * size of the attributes, unless it is variable sized.
123 * The normal ZFS byteswapping infrastructure assumes you don't need
124 * to read any objects in order to do the necessary byteswapping.  Whereas
125 * SA attributes can only be properly byteswapped if the dataset is opened
126 * and the layout/attribute ZAP attributes are available.  Because of this
127 * the SA attributes will be byteswapped when they are first accessed by
128 * the SA code that will read the SA data.
129 */
130
/*
 * Callback type invoked once per attribute while walking an SA buffer.
 * Going by the parameter names it receives the buffer header, the address
 * of the attribute data, the attribute type, the attribute length, an
 * index into the header's length array and an opaque user pointer.
 * NOTE(review): the meaning of the unnamed boolean_t is not visible in
 * this part of the file -- confirm against the iterator that calls it.
 */
typedef void (sa_iterfunc_t)(void *hdr, void *addr, sa_attr_type_t,
    uint16_t length, int length_idx, boolean_t, void *userp);
133
/*
 * Forward declarations of file-local (static) helpers that are referenced
 * before their definitions.
 */
static int sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype);
static void sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab);
static sa_idx_tab_t *sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype,
    sa_hdr_phys_t *hdr);
static void sa_idx_tab_rele(objset_t *os, void *arg);
static void sa_copy_data(sa_data_locator_t *func, void *start, void *target,
    int buflen);
static int sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
    sa_data_op_t action, sa_data_locator_t *locator, void *datastart,
    uint16_t buflen, dmu_tx_t *tx);
144
/*
 * Byteswap routines available to system attributes.
 * NOTE(review): presumably indexed by an attribute's sa_byteswap value
 * (SA_UINT64_ARRAY, SA_UINT8_ARRAY, ...), in which case the entry order
 * must match that enumeration -- confirm before reordering.
 */
static arc_byteswap_func_t sa_bswap_table[] = {
	byteswap_uint64_array,
	byteswap_uint32_array,
	byteswap_uint16_array,
	byteswap_uint8_array,
	zfs_acl_byteswap,
};
152
/*
 * Copy l bytes of attribute data from s to t.
 *
 * f is an optional data locator; when it is non-NULL the copy is always
 * delegated to sa_copy_data().  When unaligned access is cheap, the 8- and
 * 16-byte cases without a locator are open-coded as 64-bit loads/stores and
 * every other length falls back to memcpy().
 *
 * Every argument is parenthesized so arbitrary expressions can be passed
 * safely.  Arguments may still be evaluated more than once, so they must
 * be free of side effects.
 */
#ifdef HAVE_EFFICIENT_UNALIGNED_ACCESS
#define	SA_COPY_DATA(f, s, t, l)				\
do {								\
	if ((f) == NULL) {					\
		if ((l) == 8) {					\
			*(uint64_t *)(t) = *(uint64_t *)(s);	\
		} else if ((l) == 16) {				\
			*(uint64_t *)(t) = *(uint64_t *)(s);	\
			*(uint64_t *)((uintptr_t)(t) + 8) =	\
			    *(uint64_t *)((uintptr_t)(s) + 8);	\
		} else {					\
			memcpy((t), (s), (l));			\
		}						\
	} else {						\
		sa_copy_data((f), (s), (t), (l));		\
	}							\
} while (0)
#else
#define	SA_COPY_DATA(f, s, t, l)	sa_copy_data((f), (s), (t), (l))
#endif
173
/*
 * This table is fixed and cannot be changed.  Its purpose is to
 * allow the SA code to work with both old/new ZPL file systems.
 * It contains the list of legacy attributes.  These attributes aren't
 * stored in the "attribute" registry zap objects, since older ZPL file systems
 * won't have the registry.  Only objsets of type ZFS_TYPE_FILESYSTEM will
 * use this static table.
 *
 * Each entry reads as { name, length in bytes, byteswap type, attribute
 * number }; the attribute numbers run 0..15 in table order, which
 * sa_attr_table_setup() relies on when it copies entry i into slot i of
 * the in-core attribute table.
 */
static const sa_attr_reg_t sa_legacy_attrs[] = {
	{"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0},
	{"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1},
	{"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2},
	{"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3},
	{"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4},
	{"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5},
	{"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6},
	{"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7},
	{"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8},
	{"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9},
	{"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10},
	{"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11},
	{"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12},
	{"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13},
	{"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14},
	{"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15},
};
200
/*
 * This is only used for objects of type DMU_OT_ZNODE.
 *
 * It lists the sixteen legacy attribute numbers (see sa_legacy_attrs) in
 * ascending order.
 */
static const sa_attr_type_t sa_legacy_zpl_layout[] = {
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
};
207
/*
 * Special dummy layout used for buffers with no attributes.
 * The array needs at least one element to be a valid definition.
 * NOTE(review): presumably registered with an attribute count of zero so
 * the single entry is never read -- confirm where the layout is added.
 */
static const sa_attr_type_t sa_dummy_zpl_layout[] = { 0 };
212
/* Number of entries in the fixed sa_legacy_attrs table. */
static const size_t sa_legacy_attr_count = ARRAY_SIZE(sa_legacy_attrs);
/*
 * kmem cache of sa_handle_t-sized objects; created by sa_cache_init() and
 * torn down by sa_cache_fini().
 */
static kmem_cache_t *sa_cache = NULL;
215
216static int
217sa_cache_constructor(void *buf, void *unused, int kmflag)
218{
219	(void) unused, (void) kmflag;
220	sa_handle_t *hdl = buf;
221
222	mutex_init(&hdl->sa_lock, NULL, MUTEX_DEFAULT, NULL);
223	return (0);
224}
225
226static void
227sa_cache_destructor(void *buf, void *unused)
228{
229	(void) unused;
230	sa_handle_t *hdl = buf;
231	mutex_destroy(&hdl->sa_lock);
232}
233
234void
235sa_cache_init(void)
236{
237	sa_cache = kmem_cache_create("sa_cache",
238	    sizeof (sa_handle_t), 0, sa_cache_constructor,
239	    sa_cache_destructor, NULL, NULL, NULL, 0);
240}
241
242void
243sa_cache_fini(void)
244{
245	if (sa_cache)
246		kmem_cache_destroy(sa_cache);
247}
248
249static int
250layout_num_compare(const void *arg1, const void *arg2)
251{
252	const sa_lot_t *node1 = (const sa_lot_t *)arg1;
253	const sa_lot_t *node2 = (const sa_lot_t *)arg2;
254
255	return (TREE_CMP(node1->lot_num, node2->lot_num));
256}
257
258static int
259layout_hash_compare(const void *arg1, const void *arg2)
260{
261	const sa_lot_t *node1 = (const sa_lot_t *)arg1;
262	const sa_lot_t *node2 = (const sa_lot_t *)arg2;
263
264	int cmp = TREE_CMP(node1->lot_hash, node2->lot_hash);
265	if (likely(cmp))
266		return (cmp);
267
268	return (TREE_CMP(node1->lot_instance, node2->lot_instance));
269}
270
271static boolean_t
272sa_layout_equal(sa_lot_t *tbf, sa_attr_type_t *attrs, int count)
273{
274	int i;
275
276	if (count != tbf->lot_attr_count)
277		return (1);
278
279	for (i = 0; i != count; i++) {
280		if (attrs[i] != tbf->lot_attrs[i])
281			return (1);
282	}
283	return (0);
284}
285
/*
 * Hash contribution of a single attribute: the zfs_crc64_table entry
 * selected by the low byte of the bitwise complement of attr.  The
 * argument is parenthesized so compound expressions hash the intended value.
 */
#define	SA_ATTR_HASH(attr) (zfs_crc64_table[(-1ULL ^ (attr)) & 0xFF])
287
288static uint64_t
289sa_layout_info_hash(const sa_attr_type_t *attrs, int attr_count)
290{
291	uint64_t crc = -1ULL;
292
293	for (int i = 0; i != attr_count; i++)
294		crc ^= SA_ATTR_HASH(attrs[i]);
295
296	return (crc);
297}
298
299static int
300sa_get_spill(sa_handle_t *hdl)
301{
302	int rc;
303	if (hdl->sa_spill == NULL) {
304		if ((rc = dmu_spill_hold_existing(hdl->sa_bonus, NULL,
305		    &hdl->sa_spill)) == 0)
306			VERIFY(0 == sa_build_index(hdl, SA_SPILL));
307	} else {
308		rc = 0;
309	}
310
311	return (rc);
312}
313
314/*
315 * Main attribute lookup/update function
316 * returns 0 for success or non zero for failures
317 *
318 * Operates on bulk array, first failure will abort further processing
319 */
320static int
321sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count,
322    sa_data_op_t data_op, dmu_tx_t *tx)
323{
324	sa_os_t *sa = hdl->sa_os->os_sa;
325	int i;
326	int error = 0;
327	sa_buf_type_t buftypes;
328
329	buftypes = 0;
330
331	ASSERT(count > 0);
332	for (i = 0; i != count; i++) {
333		ASSERT(bulk[i].sa_attr <= hdl->sa_os->os_sa->sa_num_attrs);
334
335		bulk[i].sa_addr = NULL;
336		/* First check the bonus buffer */
337
338		if (hdl->sa_bonus_tab && TOC_ATTR_PRESENT(
339		    hdl->sa_bonus_tab->sa_idx_tab[bulk[i].sa_attr])) {
340			SA_ATTR_INFO(sa, hdl->sa_bonus_tab,
341			    SA_GET_HDR(hdl, SA_BONUS),
342			    bulk[i].sa_attr, bulk[i], SA_BONUS, hdl);
343			if (tx && !(buftypes & SA_BONUS)) {
344				dmu_buf_will_dirty(hdl->sa_bonus, tx);
345				buftypes |= SA_BONUS;
346			}
347		}
348		if (bulk[i].sa_addr == NULL &&
349		    ((error = sa_get_spill(hdl)) == 0)) {
350			if (TOC_ATTR_PRESENT(
351			    hdl->sa_spill_tab->sa_idx_tab[bulk[i].sa_attr])) {
352				SA_ATTR_INFO(sa, hdl->sa_spill_tab,
353				    SA_GET_HDR(hdl, SA_SPILL),
354				    bulk[i].sa_attr, bulk[i], SA_SPILL, hdl);
355				if (tx && !(buftypes & SA_SPILL) &&
356				    bulk[i].sa_size == bulk[i].sa_length) {
357					dmu_buf_will_dirty(hdl->sa_spill, tx);
358					buftypes |= SA_SPILL;
359				}
360			}
361		}
362		if (error && error != ENOENT) {
363			return ((error == ECKSUM) ? EIO : error);
364		}
365
366		switch (data_op) {
367		case SA_LOOKUP:
368			if (bulk[i].sa_addr == NULL)
369				return (SET_ERROR(ENOENT));
370			if (bulk[i].sa_data) {
371				SA_COPY_DATA(bulk[i].sa_data_func,
372				    bulk[i].sa_addr, bulk[i].sa_data,
373				    MIN(bulk[i].sa_size, bulk[i].sa_length));
374			}
375			continue;
376
377		case SA_UPDATE:
378			/* existing rewrite of attr */
379			if (bulk[i].sa_addr &&
380			    bulk[i].sa_size == bulk[i].sa_length) {
381				SA_COPY_DATA(bulk[i].sa_data_func,
382				    bulk[i].sa_data, bulk[i].sa_addr,
383				    bulk[i].sa_length);
384				continue;
385			} else if (bulk[i].sa_addr) { /* attr size change */
386				error = sa_modify_attrs(hdl, bulk[i].sa_attr,
387				    SA_REPLACE, bulk[i].sa_data_func,
388				    bulk[i].sa_data, bulk[i].sa_length, tx);
389			} else { /* adding new attribute */
390				error = sa_modify_attrs(hdl, bulk[i].sa_attr,
391				    SA_ADD, bulk[i].sa_data_func,
392				    bulk[i].sa_data, bulk[i].sa_length, tx);
393			}
394			if (error)
395				return (error);
396			break;
397		default:
398			break;
399		}
400	}
401	return (error);
402}
403
404static sa_lot_t *
405sa_add_layout_entry(objset_t *os, const sa_attr_type_t *attrs, int attr_count,
406    uint64_t lot_num, uint64_t hash, boolean_t zapadd, dmu_tx_t *tx)
407{
408	sa_os_t *sa = os->os_sa;
409	sa_lot_t *tb, *findtb;
410	int i;
411	avl_index_t loc;
412
413	ASSERT(MUTEX_HELD(&sa->sa_lock));
414	tb = kmem_zalloc(sizeof (sa_lot_t), KM_SLEEP);
415	tb->lot_attr_count = attr_count;
416	tb->lot_attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count,
417	    KM_SLEEP);
418	memcpy(tb->lot_attrs, attrs, sizeof (sa_attr_type_t) * attr_count);
419	tb->lot_num = lot_num;
420	tb->lot_hash = hash;
421	tb->lot_instance = 0;
422
423	if (zapadd) {
424		char attr_name[8];
425
426		if (sa->sa_layout_attr_obj == 0) {
427			sa->sa_layout_attr_obj = zap_create_link(os,
428			    DMU_OT_SA_ATTR_LAYOUTS,
429			    sa->sa_master_obj, SA_LAYOUTS, tx);
430		}
431
432		(void) snprintf(attr_name, sizeof (attr_name),
433		    "%d", (int)lot_num);
434		VERIFY(0 == zap_update(os, os->os_sa->sa_layout_attr_obj,
435		    attr_name, 2, attr_count, attrs, tx));
436	}
437
438	list_create(&tb->lot_idx_tab, sizeof (sa_idx_tab_t),
439	    offsetof(sa_idx_tab_t, sa_next));
440
441	for (i = 0; i != attr_count; i++) {
442		if (sa->sa_attr_table[tb->lot_attrs[i]].sa_length == 0)
443			tb->lot_var_sizes++;
444	}
445
446	avl_add(&sa->sa_layout_num_tree, tb);
447
448	/* verify we don't have a hash collision */
449	if ((findtb = avl_find(&sa->sa_layout_hash_tree, tb, &loc)) != NULL) {
450		for (; findtb && findtb->lot_hash == hash;
451		    findtb = AVL_NEXT(&sa->sa_layout_hash_tree, findtb)) {
452			if (findtb->lot_instance != tb->lot_instance)
453				break;
454			tb->lot_instance++;
455		}
456	}
457	avl_add(&sa->sa_layout_hash_tree, tb);
458	return (tb);
459}
460
461static void
462sa_find_layout(objset_t *os, uint64_t hash, sa_attr_type_t *attrs,
463    int count, dmu_tx_t *tx, sa_lot_t **lot)
464{
465	sa_lot_t *tb, tbsearch;
466	avl_index_t loc;
467	sa_os_t *sa = os->os_sa;
468	boolean_t found = B_FALSE;
469
470	mutex_enter(&sa->sa_lock);
471	tbsearch.lot_hash = hash;
472	tbsearch.lot_instance = 0;
473	tb = avl_find(&sa->sa_layout_hash_tree, &tbsearch, &loc);
474	if (tb) {
475		for (; tb && tb->lot_hash == hash;
476		    tb = AVL_NEXT(&sa->sa_layout_hash_tree, tb)) {
477			if (sa_layout_equal(tb, attrs, count) == 0) {
478				found = B_TRUE;
479				break;
480			}
481		}
482	}
483	if (!found) {
484		tb = sa_add_layout_entry(os, attrs, count,
485		    avl_numnodes(&sa->sa_layout_num_tree), hash, B_TRUE, tx);
486	}
487	mutex_exit(&sa->sa_lock);
488	*lot = tb;
489}
490
491static int
492sa_resize_spill(sa_handle_t *hdl, uint32_t size, dmu_tx_t *tx)
493{
494	int error;
495	uint32_t blocksize;
496
497	if (size == 0) {
498		blocksize = SPA_MINBLOCKSIZE;
499	} else if (size > SPA_OLD_MAXBLOCKSIZE) {
500		ASSERT(0);
501		return (SET_ERROR(EFBIG));
502	} else {
503		blocksize = P2ROUNDUP_TYPED(size, SPA_MINBLOCKSIZE, uint32_t);
504	}
505
506	error = dbuf_spill_set_blksz(hdl->sa_spill, blocksize, tx);
507	ASSERT(error == 0);
508	return (error);
509}
510
511static void
512sa_copy_data(sa_data_locator_t *func, void *datastart, void *target, int buflen)
513{
514	if (func == NULL) {
515		memcpy(target, datastart, buflen);
516	} else {
517		boolean_t start;
518		int bytes;
519		void *dataptr;
520		void *saptr = target;
521		uint32_t length;
522
523		start = B_TRUE;
524		bytes = 0;
525		while (bytes < buflen) {
526			func(&dataptr, &length, buflen, start, datastart);
527			memcpy(saptr, dataptr, length);
528			saptr = (void *)((caddr_t)saptr + length);
529			bytes += length;
530			start = B_FALSE;
531		}
532	}
533}
534
/*
 * Determine several different values pertaining to system attribute
 * buffers.
 *
 * Return the size of the sa_hdr_phys_t header for the buffer. Each
 * variable length attribute except the first contributes two bytes to
 * the header size, which is then rounded up to an 8-byte boundary.
 * For a DMU_OT_ZNODE bonus buffer the base header size is 0.
 *
 * Input parameters:
 *
 *  sa - SA state of the objset; supplies the registered attribute
 *  lengths and the sa_force_spill setting.
 *
 *  attr_desc, attr_count - The attributes to be laid out, in order.
 *
 *  db - The buffer being sized; only its bonus type is consulted here.
 *
 *  buftype - SA_BONUS or SA_SPILL.
 *
 *  full_space - Capacity of the buffer in bytes; must be 8-byte aligned.
 *
 * The following output parameters are also computed.
 *
 *  index - The index of the first attribute in attr_desc that will
 *  spill over. Only valid if will_spill is set.
 *
 *  total - The total number of bytes of all system attributes described
 *  in attr_desc.
 *
 *  will_spill - Set when spilling is necessary. It is only set when
 *  the buftype is SA_BONUS.
 *
 * When buftype is SA_BONUS and sa_force_spill is set, nothing is sized:
 * the function returns 0 with *total = 0, *index = 0 and *will_spill set.
 */
static int
sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count,
    dmu_buf_t *db, sa_buf_type_t buftype, int full_space, int *index,
    int *total, boolean_t *will_spill)
{
	int var_size_count = 0;
	int i;
	int hdrsize;
	int extra_hdrsize;

	/* Forced spill: everything goes to the spill block from index 0. */
	if (buftype == SA_BONUS && sa->sa_force_spill) {
		*total = 0;
		*index = 0;
		*will_spill = B_TRUE;
		return (0);
	}

	*index = -1;
	*total = 0;
	*will_spill = B_FALSE;

	extra_hdrsize = 0;
	hdrsize = (SA_BONUSTYPE_FROM_DB(db) == DMU_OT_ZNODE) ? 0 :
	    sizeof (sa_hdr_phys_t);

	ASSERT(IS_P2ALIGNED(full_space, 8));

	for (i = 0; i != attr_count; i++) {
		boolean_t is_var_sz, might_spill_here;
		int tmp_hdrsize;

		/* Each attribute starts on an 8-byte boundary. */
		*total = P2ROUNDUP(*total, 8);
		*total += attr_desc[i].sa_length;
		/* Once spilling is certain only the total is still needed. */
		if (*will_spill)
			continue;

		is_var_sz = (SA_REGISTERED_LEN(sa, attr_desc[i].sa_attr) == 0);
		if (is_var_sz)
			var_size_count++;

		/*
		 * Calculate what the SA header size would be if this
		 * attribute doesn't spill.
		 */
		tmp_hdrsize = hdrsize + ((is_var_sz && var_size_count > 1) ?
		    sizeof (uint16_t) : 0);

		/*
		 * Check whether this attribute spans into the space
		 * that would be used by the spill block pointer should
		 * a spill block be needed.
		 */
		might_spill_here =
		    buftype == SA_BONUS && *index == -1 &&
		    (*total + P2ROUNDUP(tmp_hdrsize, 8)) >
		    (full_space - sizeof (blkptr_t));

		if (is_var_sz && var_size_count > 1) {
			if (buftype == SA_SPILL ||
			    tmp_hdrsize + *total < full_space) {
				/*
				 * Record the extra header size in case this
				 * increase needs to be reversed due to
				 * spill-over.
				 */
				hdrsize = tmp_hdrsize;
				if (*index != -1 || might_spill_here)
					extra_hdrsize += sizeof (uint16_t);
			} else {
				/* The grown header no longer fits: spill. */
				ASSERT(buftype == SA_BONUS);
				if (*index == -1)
					*index = i;
				*will_spill = B_TRUE;
				continue;
			}
		}

		/*
		 * Store index of where spill *could* occur. Then
		 * continue to count the remaining attribute sizes. The
		 * sum is used later for sizing bonus and spill buffer.
		 */
		if (might_spill_here)
			*index = i;

		if ((*total + P2ROUNDUP(hdrsize, 8)) > full_space &&
		    buftype == SA_BONUS)
			*will_spill = B_TRUE;
	}

	/*
	 * Length slots that were added for attributes at or after the
	 * spill point do not belong in this buffer's header.
	 */
	if (*will_spill)
		hdrsize -= extra_hdrsize;

	hdrsize = P2ROUNDUP(hdrsize, 8);
	return (hdrsize);
}
650
/*
 * Bytes a buffer must provide: attribute payload plus its SA header.
 * Both arguments are parenthesized so compound expressions add correctly.
 */
#define	BUF_SPACE_NEEDED(total, header) ((total) + (header))
652
/*
 * Find layout that corresponds to ordering of attributes
 * If not found a new layout number is created and added to
 * persistent layout tables.
 *
 * Writes the attributes described by attr_desc[0..attr_count) into the
 * handle's bonus buffer, continuing in the spill buffer from the index
 * reported by sa_find_sizes() when spilling is required.  For each
 * attribute, sa_addr and sa_size in attr_desc are updated to describe
 * where it was placed.  The bonus buffer is dirtied and resized, the
 * spill buffer is held/dirtied/grown as needed, the index tables of the
 * handle are rebuilt, and a spill block that is no longer needed is
 * removed.
 *
 * Returns 0 on success, or EFBIG when the attribute data for the bonus
 * or spill buffer exceeds SPA_OLD_MAXBLOCKSIZE.  Failures of the
 * underlying DMU/index operations trip VERIFY.
 */
static int
sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count,
    dmu_tx_t *tx)
{
	sa_os_t *sa = hdl->sa_os->os_sa;
	uint64_t hash;
	sa_buf_type_t buftype;
	sa_hdr_phys_t *sahdr;
	void *data_start;
	sa_attr_type_t *attrs, *attrs_start;
	int i, lot_count;
	int dnodesize;
	int spill_idx;
	int hdrsize;
	int spillhdrsize = 0;
	int used;
	dmu_object_type_t bonustype;
	sa_lot_t *lot;
	int len_idx;
	int spill_used;
	int bonuslen;
	boolean_t spilling;

	dmu_buf_will_dirty(hdl->sa_bonus, tx);
	bonustype = SA_BONUSTYPE_FROM_DB(hdl->sa_bonus);
	dmu_object_dnsize_from_db(hdl->sa_bonus, &dnodesize);
	bonuslen = DN_BONUS_SIZE(dnodesize);

	/* first determine bonus header size and sum of all attributes */
	hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus,
	    SA_BONUS, bonuslen, &spill_idx, &used, &spilling);

	if (used > SPA_OLD_MAXBLOCKSIZE)
		return (SET_ERROR(EFBIG));

	/*
	 * When spilling, the bonus length is capped so that room remains
	 * for the spill block pointer at the end of the bonus buffer.
	 */
	VERIFY0(dmu_set_bonus(hdl->sa_bonus, spilling ?
	    MIN(bonuslen - sizeof (blkptr_t), used + hdrsize) :
	    used + hdrsize, tx));

	ASSERT((bonustype == DMU_OT_ZNODE && spilling == 0) ||
	    bonustype == DMU_OT_SA);

	/* setup and size spill buffer when needed */
	if (spilling) {
		boolean_t dummy;

		if (hdl->sa_spill == NULL) {
			VERIFY(dmu_spill_hold_by_bonus(hdl->sa_bonus, 0, NULL,
			    &hdl->sa_spill) == 0);
		}
		dmu_buf_will_dirty(hdl->sa_spill, tx);

		/* Size only the attributes from spill_idx onwards. */
		spillhdrsize = sa_find_sizes(sa, &attr_desc[spill_idx],
		    attr_count - spill_idx, hdl->sa_spill, SA_SPILL,
		    hdl->sa_spill->db_size, &i, &spill_used, &dummy);

		if (spill_used > SPA_OLD_MAXBLOCKSIZE)
			return (SET_ERROR(EFBIG));

		/* Grow the spill block if the current one is too small. */
		if (BUF_SPACE_NEEDED(spill_used, spillhdrsize) >
		    hdl->sa_spill->db_size)
			VERIFY(0 == sa_resize_spill(hdl,
			    BUF_SPACE_NEEDED(spill_used, spillhdrsize), tx));
	}

	/* setup starting pointers to lay down data */
	data_start = (void *)((uintptr_t)hdl->sa_bonus->db_data + hdrsize);
	sahdr = (sa_hdr_phys_t *)hdl->sa_bonus->db_data;
	buftype = SA_BONUS;

	/*
	 * attrs records the attribute ids in order; attrs_start/lot_count
	 * delimit the part that belongs to the buffer currently being filled.
	 */
	attrs_start = attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count,
	    KM_SLEEP);
	lot_count = 0;

	for (i = 0, len_idx = 0, hash = -1ULL; i != attr_count; i++) {
		uint16_t length;

		ASSERT(IS_P2ALIGNED(data_start, 8));
		attrs[i] = attr_desc[i].sa_attr;
		/* A registered length of 0 means variable sized. */
		length = SA_REGISTERED_LEN(sa, attrs[i]);
		if (length == 0)
			length = attr_desc[i].sa_length;

		if (spilling && i == spill_idx) { /* switch to spill buffer */
			VERIFY(bonustype == DMU_OT_SA);
			/* Finish the bonus buffer's layout and header. */
			if (buftype == SA_BONUS && !sa->sa_force_spill) {
				sa_find_layout(hdl->sa_os, hash, attrs_start,
				    lot_count, tx, &lot);
				SA_SET_HDR(sahdr, lot->lot_num, hdrsize);
			}

			/* Restart hash, length index and layout for spill. */
			buftype = SA_SPILL;
			hash = -1ULL;
			len_idx = 0;

			sahdr = (sa_hdr_phys_t *)hdl->sa_spill->db_data;
			sahdr->sa_magic = SA_MAGIC;
			data_start = (void *)((uintptr_t)sahdr +
			    spillhdrsize);
			attrs_start = &attrs[i];
			lot_count = 0;
		}
		hash ^= SA_ATTR_HASH(attrs[i]);
		attr_desc[i].sa_addr = data_start;
		attr_desc[i].sa_size = length;
		SA_COPY_DATA(attr_desc[i].sa_data_func, attr_desc[i].sa_data,
		    data_start, length);
		/* Variable sized attributes record their length in the hdr. */
		if (sa->sa_attr_table[attrs[i]].sa_length == 0) {
			sahdr->sa_lengths[len_idx++] = length;
		}
		data_start = (void *)P2ROUNDUP(((uintptr_t)data_start +
		    length), 8);
		lot_count++;
	}

	/* Layout of the last buffer that was filled (bonus or spill). */
	sa_find_layout(hdl->sa_os, hash, attrs_start, lot_count, tx, &lot);

	/*
	 * Verify that old znodes always have layout number 0.
	 * Must be DMU_OT_SA for arbitrary layouts
	 */
	VERIFY((bonustype == DMU_OT_ZNODE && lot->lot_num == 0) ||
	    (bonustype == DMU_OT_SA && lot->lot_num > 1));

	if (bonustype == DMU_OT_SA) {
		SA_SET_HDR(sahdr, lot->lot_num,
		    buftype == SA_BONUS ? hdrsize : spillhdrsize);
	}

	kmem_free(attrs, sizeof (sa_attr_type_t) * attr_count);
	/* Drop the stale index tables and rebuild them from the new data. */
	if (hdl->sa_bonus_tab) {
		sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab);
		hdl->sa_bonus_tab = NULL;
	}
	if (!sa->sa_force_spill)
		VERIFY(0 == sa_build_index(hdl, SA_BONUS));
	if (hdl->sa_spill) {
		sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab);
		if (!spilling) {
			/*
			 * remove spill block that is no longer needed.
			 */
			dmu_buf_rele(hdl->sa_spill, NULL);
			hdl->sa_spill = NULL;
			hdl->sa_spill_tab = NULL;
			VERIFY(0 == dmu_rm_spill(hdl->sa_os,
			    sa_handle_object(hdl), tx));
		} else {
			VERIFY(0 == sa_build_index(hdl, SA_SPILL));
		}
	}

	return (0);
}
812
813static void
814sa_free_attr_table(sa_os_t *sa)
815{
816	int i;
817
818	if (sa->sa_attr_table == NULL)
819		return;
820
821	for (i = 0; i != sa->sa_num_attrs; i++) {
822		if (sa->sa_attr_table[i].sa_name)
823			kmem_free(sa->sa_attr_table[i].sa_name,
824			    strlen(sa->sa_attr_table[i].sa_name) + 1);
825	}
826
827	kmem_free(sa->sa_attr_table,
828	    sizeof (sa_attr_table_t) * sa->sa_num_attrs);
829
830	sa->sa_attr_table = NULL;
831}
832
/*
 * Build the in-core attribute tables for an objset.
 *
 *  os        - objset whose os_sa state is being populated.
 *  reg_attrs - attributes the caller wants to use.
 *  count     - number of entries in reg_attrs.
 *
 * Two tables are produced:
 *
 *  sa->sa_user_table - count entries; entry i is the attribute number
 *  assigned to reg_attrs[i].
 *
 *  sa->sa_attr_table - sa->sa_num_attrs entries indexed by attribute
 *  number, filled from the on-disk registry ZAP (when one exists), the
 *  legacy ZPL attributes (DMU_OST_ZFS objsets only) and finally the
 *  requested attributes.
 *
 * sa->sa_need_attr_registration is set when the table holds more
 * attributes than were found in the registry.
 *
 * Returns 0 on success.  On failure both tables are freed and a nonzero
 * error is returned.
 */
static int
sa_attr_table_setup(objset_t *os, const sa_attr_reg_t *reg_attrs, int count)
{
	sa_os_t *sa = os->os_sa;
	uint64_t sa_attr_count = 0;
	uint64_t sa_reg_count = 0;
	int error = 0;
	uint64_t attr_value;
	sa_attr_table_t *tb;
	zap_cursor_t zc;
	zap_attribute_t za;
	int registered_count = 0;
	int i;
	dmu_objset_type_t ostype = dmu_objset_type(os);

	sa->sa_user_table =
	    kmem_zalloc(count * sizeof (sa_attr_type_t), KM_SLEEP);
	sa->sa_user_table_sz = count * sizeof (sa_attr_type_t);

	if (sa->sa_reg_attr_obj != 0) {
		error = zap_count(os, sa->sa_reg_attr_obj,
		    &sa_attr_count);

		/*
		 * Make sure we retrieved a count and that it isn't zero
		 */
		if (error || (error == 0 && sa_attr_count == 0)) {
			if (error == 0)
				error = SET_ERROR(EINVAL);
			goto bail;
		}
		sa_reg_count = sa_attr_count;
	}

	/* No registry yet: the legacy attributes occupy the first numbers. */
	if (ostype == DMU_OST_ZFS && sa_attr_count == 0)
		sa_attr_count += sa_legacy_attr_count;

	/* Allocate attribute numbers for attributes that aren't registered */
	for (i = 0; i != count; i++) {
		boolean_t found = B_FALSE;
		int j;

		/* Legacy attributes keep their fixed numbers. */
		if (ostype == DMU_OST_ZFS) {
			for (j = 0; j != sa_legacy_attr_count; j++) {
				if (strcmp(reg_attrs[i].sa_name,
				    sa_legacy_attrs[j].sa_name) == 0) {
					sa->sa_user_table[i] =
					    sa_legacy_attrs[j].sa_attr;
					found = B_TRUE;
				}
			}
		}
		if (found)
			continue;

		if (sa->sa_reg_attr_obj)
			error = zap_lookup(os, sa->sa_reg_attr_obj,
			    reg_attrs[i].sa_name, 8, 1, &attr_value);
		else
			error = SET_ERROR(ENOENT);
		switch (error) {
		case ENOENT:
			/* Not registered: hand out the next free number. */
			sa->sa_user_table[i] = (sa_attr_type_t)sa_attr_count;
			sa_attr_count++;
			break;
		case 0:
			/* Already registered: reuse its number. */
			sa->sa_user_table[i] = ATTR_NUM(attr_value);
			break;
		default:
			goto bail;
		}
	}

	sa->sa_num_attrs = sa_attr_count;
	tb = sa->sa_attr_table =
	    kmem_zalloc(sizeof (sa_attr_table_t) * sa_attr_count, KM_SLEEP);

	/*
	 * Attribute table is constructed from requested attribute list,
	 * previously foreign registered attributes, and also the legacy
	 * ZPL set of attributes.
	 */

	if (sa->sa_reg_attr_obj) {
		/* Each registry entry encodes number, length and byteswap. */
		for (zap_cursor_init(&zc, os, sa->sa_reg_attr_obj);
		    (error = zap_cursor_retrieve(&zc, &za)) == 0;
		    zap_cursor_advance(&zc)) {
			uint64_t value;
			value  = za.za_first_integer;

			registered_count++;
			tb[ATTR_NUM(value)].sa_attr = ATTR_NUM(value);
			tb[ATTR_NUM(value)].sa_length = ATTR_LENGTH(value);
			tb[ATTR_NUM(value)].sa_byteswap = ATTR_BSWAP(value);
			tb[ATTR_NUM(value)].sa_registered = B_TRUE;

			if (tb[ATTR_NUM(value)].sa_name) {
				continue;
			}
			tb[ATTR_NUM(value)].sa_name =
			    kmem_zalloc(strlen(za.za_name) +1, KM_SLEEP);
			(void) strlcpy(tb[ATTR_NUM(value)].sa_name, za.za_name,
			    strlen(za.za_name) +1);
		}
		zap_cursor_fini(&zc);
		/*
		 * Make sure we processed the correct number of registered
		 * attributes
		 */
		if (registered_count != sa_reg_count) {
			ASSERT(error != 0);
			goto bail;
		}

	}

	/* Fill in any legacy attribute the registry did not provide. */
	if (ostype == DMU_OST_ZFS) {
		for (i = 0; i != sa_legacy_attr_count; i++) {
			if (tb[i].sa_name)
				continue;
			tb[i].sa_attr = sa_legacy_attrs[i].sa_attr;
			tb[i].sa_length = sa_legacy_attrs[i].sa_length;
			tb[i].sa_byteswap = sa_legacy_attrs[i].sa_byteswap;
			tb[i].sa_registered = B_FALSE;
			tb[i].sa_name =
			    kmem_zalloc(strlen(sa_legacy_attrs[i].sa_name) +1,
			    KM_SLEEP);
			(void) strlcpy(tb[i].sa_name,
			    sa_legacy_attrs[i].sa_name,
			    strlen(sa_legacy_attrs[i].sa_name) + 1);
		}
	}

	/* Finally add the requested attributes that are still unnamed. */
	for (i = 0; i != count; i++) {
		sa_attr_type_t attr_id;

		attr_id = sa->sa_user_table[i];
		if (tb[attr_id].sa_name)
			continue;

		tb[attr_id].sa_length = reg_attrs[i].sa_length;
		tb[attr_id].sa_byteswap = reg_attrs[i].sa_byteswap;
		tb[attr_id].sa_attr = attr_id;
		tb[attr_id].sa_name =
		    kmem_zalloc(strlen(reg_attrs[i].sa_name) + 1, KM_SLEEP);
		(void) strlcpy(tb[attr_id].sa_name, reg_attrs[i].sa_name,
		    strlen(reg_attrs[i].sa_name) + 1);
	}

	sa->sa_need_attr_registration =
	    (sa_attr_count != registered_count);

	return (0);
bail:
	/* Undo everything allocated above; error must already be set. */
	kmem_free(sa->sa_user_table, count * sizeof (sa_attr_type_t));
	sa->sa_user_table = NULL;
	sa_free_attr_table(sa);
	ASSERT(error != 0);
	return (error);
}
993
994int
995sa_setup(objset_t *os, uint64_t sa_obj, const sa_attr_reg_t *reg_attrs,
996    int count, sa_attr_type_t **user_table)
997{
998	zap_cursor_t zc;
999	zap_attribute_t za;
1000	sa_os_t *sa;
1001	dmu_objset_type_t ostype = dmu_objset_type(os);
1002	sa_attr_type_t *tb;
1003	int error;
1004
1005	mutex_enter(&os->os_user_ptr_lock);
1006	if (os->os_sa) {
1007		mutex_enter(&os->os_sa->sa_lock);
1008		mutex_exit(&os->os_user_ptr_lock);
1009		tb = os->os_sa->sa_user_table;
1010		mutex_exit(&os->os_sa->sa_lock);
1011		*user_table = tb;
1012		return (0);
1013	}
1014
1015	sa = kmem_zalloc(sizeof (sa_os_t), KM_SLEEP);
1016	mutex_init(&sa->sa_lock, NULL, MUTEX_NOLOCKDEP, NULL);
1017	sa->sa_master_obj = sa_obj;
1018
1019	os->os_sa = sa;
1020	mutex_enter(&sa->sa_lock);
1021	mutex_exit(&os->os_user_ptr_lock);
1022	avl_create(&sa->sa_layout_num_tree, layout_num_compare,
1023	    sizeof (sa_lot_t), offsetof(sa_lot_t, lot_num_node));
1024	avl_create(&sa->sa_layout_hash_tree, layout_hash_compare,
1025	    sizeof (sa_lot_t), offsetof(sa_lot_t, lot_hash_node));
1026
1027	if (sa_obj) {
1028		error = zap_lookup(os, sa_obj, SA_LAYOUTS,
1029		    8, 1, &sa->sa_layout_attr_obj);
1030		if (error != 0 && error != ENOENT)
1031			goto fail;
1032		error = zap_lookup(os, sa_obj, SA_REGISTRY,
1033		    8, 1, &sa->sa_reg_attr_obj);
1034		if (error != 0 && error != ENOENT)
1035			goto fail;
1036	}
1037
1038	if ((error = sa_attr_table_setup(os, reg_attrs, count)) != 0)
1039		goto fail;
1040
1041	if (sa->sa_layout_attr_obj != 0) {
1042		uint64_t layout_count;
1043
1044		error = zap_count(os, sa->sa_layout_attr_obj,
1045		    &layout_count);
1046
1047		/*
1048		 * Layout number count should be > 0
1049		 */
1050		if (error || (error == 0 && layout_count == 0)) {
1051			if (error == 0)
1052				error = SET_ERROR(EINVAL);
1053			goto fail;
1054		}
1055
1056		for (zap_cursor_init(&zc, os, sa->sa_layout_attr_obj);
1057		    (error = zap_cursor_retrieve(&zc, &za)) == 0;
1058		    zap_cursor_advance(&zc)) {
1059			sa_attr_type_t *lot_attrs;
1060			uint64_t lot_num;
1061
1062			lot_attrs = kmem_zalloc(sizeof (sa_attr_type_t) *
1063			    za.za_num_integers, KM_SLEEP);
1064
1065			if ((error = (zap_lookup(os, sa->sa_layout_attr_obj,
1066			    za.za_name, 2, za.za_num_integers,
1067			    lot_attrs))) != 0) {
1068				kmem_free(lot_attrs, sizeof (sa_attr_type_t) *
1069				    za.za_num_integers);
1070				break;
1071			}
1072			VERIFY0(ddi_strtoull(za.za_name, NULL, 10,
1073			    (unsigned long long *)&lot_num));
1074
1075			(void) sa_add_layout_entry(os, lot_attrs,
1076			    za.za_num_integers, lot_num,
1077			    sa_layout_info_hash(lot_attrs,
1078			    za.za_num_integers), B_FALSE, NULL);
1079			kmem_free(lot_attrs, sizeof (sa_attr_type_t) *
1080			    za.za_num_integers);
1081		}
1082		zap_cursor_fini(&zc);
1083
1084		/*
1085		 * Make sure layout count matches number of entries added
1086		 * to AVL tree
1087		 */
1088		if (avl_numnodes(&sa->sa_layout_num_tree) != layout_count) {
1089			ASSERT(error != 0);
1090			goto fail;
1091		}
1092	}
1093
1094	/* Add special layout number for old ZNODES */
1095	if (ostype == DMU_OST_ZFS) {
1096		(void) sa_add_layout_entry(os, sa_legacy_zpl_layout,
1097		    sa_legacy_attr_count, 0,
1098		    sa_layout_info_hash(sa_legacy_zpl_layout,
1099		    sa_legacy_attr_count), B_FALSE, NULL);
1100
1101		(void) sa_add_layout_entry(os, sa_dummy_zpl_layout, 0, 1,
1102		    0, B_FALSE, NULL);
1103	}
1104	*user_table = os->os_sa->sa_user_table;
1105	mutex_exit(&sa->sa_lock);
1106	return (0);
1107fail:
1108	os->os_sa = NULL;
1109	sa_free_attr_table(sa);
1110	if (sa->sa_user_table)
1111		kmem_free(sa->sa_user_table, sa->sa_user_table_sz);
1112	mutex_exit(&sa->sa_lock);
1113	avl_destroy(&sa->sa_layout_hash_tree);
1114	avl_destroy(&sa->sa_layout_num_tree);
1115	mutex_destroy(&sa->sa_lock);
1116	kmem_free(sa, sizeof (sa_os_t));
1117	return ((error == ECKSUM) ? EIO : error);
1118}
1119
1120void
1121sa_tear_down(objset_t *os)
1122{
1123	sa_os_t *sa = os->os_sa;
1124	sa_lot_t *layout;
1125	void *cookie;
1126
1127	kmem_free(sa->sa_user_table, sa->sa_user_table_sz);
1128
1129	/* Free up attr table */
1130
1131	sa_free_attr_table(sa);
1132
1133	cookie = NULL;
1134	while ((layout =
1135	    avl_destroy_nodes(&sa->sa_layout_hash_tree, &cookie))) {
1136		sa_idx_tab_t *tab;
1137		while ((tab = list_head(&layout->lot_idx_tab))) {
1138			ASSERT(zfs_refcount_count(&tab->sa_refcount));
1139			sa_idx_tab_rele(os, tab);
1140		}
1141	}
1142
1143	cookie = NULL;
1144	while ((layout = avl_destroy_nodes(&sa->sa_layout_num_tree, &cookie))) {
1145		kmem_free(layout->lot_attrs,
1146		    sizeof (sa_attr_type_t) * layout->lot_attr_count);
1147		kmem_free(layout, sizeof (sa_lot_t));
1148	}
1149
1150	avl_destroy(&sa->sa_layout_hash_tree);
1151	avl_destroy(&sa->sa_layout_num_tree);
1152	mutex_destroy(&sa->sa_lock);
1153
1154	kmem_free(sa, sizeof (sa_os_t));
1155	os->os_sa = NULL;
1156}
1157
1158static void
1159sa_build_idx_tab(void *hdr, void *attr_addr, sa_attr_type_t attr,
1160    uint16_t length, int length_idx, boolean_t var_length, void *userp)
1161{
1162	sa_idx_tab_t *idx_tab = userp;
1163
1164	if (var_length) {
1165		ASSERT(idx_tab->sa_variable_lengths);
1166		idx_tab->sa_variable_lengths[length_idx] = length;
1167	}
1168	TOC_ATTR_ENCODE(idx_tab->sa_idx_tab[attr], length_idx,
1169	    (uint32_t)((uintptr_t)attr_addr - (uintptr_t)hdr));
1170}
1171
1172static void
1173sa_attr_iter(objset_t *os, sa_hdr_phys_t *hdr, dmu_object_type_t type,
1174    sa_iterfunc_t func, sa_lot_t *tab, void *userp)
1175{
1176	void *data_start;
1177	sa_lot_t *tb = tab;
1178	sa_lot_t search;
1179	avl_index_t loc;
1180	sa_os_t *sa = os->os_sa;
1181	int i;
1182	uint16_t *length_start = NULL;
1183	uint8_t length_idx = 0;
1184
1185	if (tab == NULL) {
1186		search.lot_num = SA_LAYOUT_NUM(hdr, type);
1187		tb = avl_find(&sa->sa_layout_num_tree, &search, &loc);
1188		ASSERT(tb);
1189	}
1190
1191	if (IS_SA_BONUSTYPE(type)) {
1192		data_start = (void *)P2ROUNDUP(((uintptr_t)hdr +
1193		    offsetof(sa_hdr_phys_t, sa_lengths) +
1194		    (sizeof (uint16_t) * tb->lot_var_sizes)), 8);
1195		length_start = hdr->sa_lengths;
1196	} else {
1197		data_start = hdr;
1198	}
1199
1200	for (i = 0; i != tb->lot_attr_count; i++) {
1201		int attr_length, reg_length;
1202		uint8_t idx_len;
1203
1204		reg_length = sa->sa_attr_table[tb->lot_attrs[i]].sa_length;
1205		IMPLY(reg_length == 0, IS_SA_BONUSTYPE(type));
1206		if (reg_length) {
1207			attr_length = reg_length;
1208			idx_len = 0;
1209		} else {
1210			attr_length = length_start[length_idx];
1211			idx_len = length_idx++;
1212		}
1213
1214		func(hdr, data_start, tb->lot_attrs[i], attr_length,
1215		    idx_len, reg_length == 0 ? B_TRUE : B_FALSE, userp);
1216
1217		data_start = (void *)P2ROUNDUP(((uintptr_t)data_start +
1218		    attr_length), 8);
1219	}
1220}
1221
1222static void
1223sa_byteswap_cb(void *hdr, void *attr_addr, sa_attr_type_t attr,
1224    uint16_t length, int length_idx, boolean_t variable_length, void *userp)
1225{
1226	(void) hdr, (void) length_idx, (void) variable_length;
1227	sa_handle_t *hdl = userp;
1228	sa_os_t *sa = hdl->sa_os->os_sa;
1229
1230	sa_bswap_table[sa->sa_attr_table[attr].sa_byteswap](attr_addr, length);
1231}
1232
1233static void
1234sa_byteswap(sa_handle_t *hdl, sa_buf_type_t buftype)
1235{
1236	sa_hdr_phys_t *sa_hdr_phys = SA_GET_HDR(hdl, buftype);
1237	dmu_buf_impl_t *db;
1238	int num_lengths = 1;
1239	int i;
1240	sa_os_t *sa __maybe_unused = hdl->sa_os->os_sa;
1241
1242	ASSERT(MUTEX_HELD(&sa->sa_lock));
1243	if (sa_hdr_phys->sa_magic == SA_MAGIC)
1244		return;
1245
1246	db = SA_GET_DB(hdl, buftype);
1247
1248	if (buftype == SA_SPILL) {
1249		arc_release(db->db_buf, NULL);
1250		arc_buf_thaw(db->db_buf);
1251	}
1252
1253	sa_hdr_phys->sa_magic = BSWAP_32(sa_hdr_phys->sa_magic);
1254	sa_hdr_phys->sa_layout_info = BSWAP_16(sa_hdr_phys->sa_layout_info);
1255
1256	/*
1257	 * Determine number of variable lengths in header
1258	 * The standard 8 byte header has one for free and a
1259	 * 16 byte header would have 4 + 1;
1260	 */
1261	if (SA_HDR_SIZE(sa_hdr_phys) > 8)
1262		num_lengths += (SA_HDR_SIZE(sa_hdr_phys) - 8) >> 1;
1263	for (i = 0; i != num_lengths; i++)
1264		sa_hdr_phys->sa_lengths[i] =
1265		    BSWAP_16(sa_hdr_phys->sa_lengths[i]);
1266
1267	sa_attr_iter(hdl->sa_os, sa_hdr_phys, DMU_OT_SA,
1268	    sa_byteswap_cb, NULL, hdl);
1269
1270	if (buftype == SA_SPILL)
1271		arc_buf_freeze(((dmu_buf_impl_t *)hdl->sa_spill)->db_buf);
1272}
1273
1274static int
1275sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype)
1276{
1277	sa_hdr_phys_t *sa_hdr_phys;
1278	dmu_buf_impl_t *db = SA_GET_DB(hdl, buftype);
1279	dmu_object_type_t bonustype = SA_BONUSTYPE_FROM_DB(db);
1280	sa_os_t *sa = hdl->sa_os->os_sa;
1281	sa_idx_tab_t *idx_tab;
1282
1283	sa_hdr_phys = SA_GET_HDR(hdl, buftype);
1284
1285	mutex_enter(&sa->sa_lock);
1286
1287	/* Do we need to byteswap? */
1288
1289	/* only check if not old znode */
1290	if (IS_SA_BONUSTYPE(bonustype) && sa_hdr_phys->sa_magic != SA_MAGIC &&
1291	    sa_hdr_phys->sa_magic != 0) {
1292		if (BSWAP_32(sa_hdr_phys->sa_magic) != SA_MAGIC) {
1293			mutex_exit(&sa->sa_lock);
1294			zfs_dbgmsg("Buffer Header: %x != SA_MAGIC:%x "
1295			    "object=%#llx\n", sa_hdr_phys->sa_magic, SA_MAGIC,
1296			    (u_longlong_t)db->db.db_object);
1297			return (SET_ERROR(EIO));
1298		}
1299		sa_byteswap(hdl, buftype);
1300	}
1301
1302	idx_tab = sa_find_idx_tab(hdl->sa_os, bonustype, sa_hdr_phys);
1303
1304	if (buftype == SA_BONUS)
1305		hdl->sa_bonus_tab = idx_tab;
1306	else
1307		hdl->sa_spill_tab = idx_tab;
1308
1309	mutex_exit(&sa->sa_lock);
1310	return (0);
1311}
1312
/*
 * Synchronous eviction callback registered for shared SA handles.
 * It unconditionally panics.
 */
static void
sa_evict_sync(void *dbu)
{
	(void) dbu;	/* argument intentionally unused */

	panic("evicting sa dbuf\n");
}
1319
1320static void
1321sa_idx_tab_rele(objset_t *os, void *arg)
1322{
1323	sa_os_t *sa = os->os_sa;
1324	sa_idx_tab_t *idx_tab = arg;
1325
1326	if (idx_tab == NULL)
1327		return;
1328
1329	mutex_enter(&sa->sa_lock);
1330	if (zfs_refcount_remove(&idx_tab->sa_refcount, NULL) == 0) {
1331		list_remove(&idx_tab->sa_layout->lot_idx_tab, idx_tab);
1332		if (idx_tab->sa_variable_lengths)
1333			kmem_free(idx_tab->sa_variable_lengths,
1334			    sizeof (uint16_t) *
1335			    idx_tab->sa_layout->lot_var_sizes);
1336		zfs_refcount_destroy(&idx_tab->sa_refcount);
1337		kmem_free(idx_tab->sa_idx_tab,
1338		    sizeof (uint32_t) * sa->sa_num_attrs);
1339		kmem_free(idx_tab, sizeof (sa_idx_tab_t));
1340	}
1341	mutex_exit(&sa->sa_lock);
1342}
1343
1344static void
1345sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab)
1346{
1347	sa_os_t *sa __maybe_unused = os->os_sa;
1348
1349	ASSERT(MUTEX_HELD(&sa->sa_lock));
1350	(void) zfs_refcount_add(&idx_tab->sa_refcount, NULL);
1351}
1352
1353void
1354sa_spill_rele(sa_handle_t *hdl)
1355{
1356	mutex_enter(&hdl->sa_lock);
1357	if (hdl->sa_spill) {
1358		sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab);
1359		dmu_buf_rele(hdl->sa_spill, NULL);
1360		hdl->sa_spill = NULL;
1361		hdl->sa_spill_tab = NULL;
1362	}
1363	mutex_exit(&hdl->sa_lock);
1364}
1365
1366void
1367sa_handle_destroy(sa_handle_t *hdl)
1368{
1369	dmu_buf_t *db = hdl->sa_bonus;
1370
1371	mutex_enter(&hdl->sa_lock);
1372	(void) dmu_buf_remove_user(db, &hdl->sa_dbu);
1373
1374	if (hdl->sa_bonus_tab)
1375		sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab);
1376
1377	if (hdl->sa_spill_tab)
1378		sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab);
1379
1380	dmu_buf_rele(hdl->sa_bonus, NULL);
1381
1382	if (hdl->sa_spill)
1383		dmu_buf_rele(hdl->sa_spill, NULL);
1384	mutex_exit(&hdl->sa_lock);
1385
1386	kmem_cache_free(sa_cache, hdl);
1387}
1388
1389int
1390sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp,
1391    sa_handle_type_t hdl_type, sa_handle_t **handlepp)
1392{
1393	int error = 0;
1394	sa_handle_t *handle = NULL;
1395#ifdef ZFS_DEBUG
1396	dmu_object_info_t doi;
1397
1398	dmu_object_info_from_db(db, &doi);
1399	ASSERT(doi.doi_bonus_type == DMU_OT_SA ||
1400	    doi.doi_bonus_type == DMU_OT_ZNODE);
1401#endif
1402	/* find handle, if it exists */
1403	/* if one doesn't exist then create a new one, and initialize it */
1404
1405	if (hdl_type == SA_HDL_SHARED)
1406		handle = dmu_buf_get_user(db);
1407
1408	if (handle == NULL) {
1409		sa_handle_t *winner = NULL;
1410
1411		handle = kmem_cache_alloc(sa_cache, KM_SLEEP);
1412		handle->sa_dbu.dbu_evict_func_sync = NULL;
1413		handle->sa_dbu.dbu_evict_func_async = NULL;
1414		handle->sa_userp = userp;
1415		handle->sa_bonus = db;
1416		handle->sa_os = os;
1417		handle->sa_spill = NULL;
1418		handle->sa_bonus_tab = NULL;
1419		handle->sa_spill_tab = NULL;
1420
1421		error = sa_build_index(handle, SA_BONUS);
1422
1423		if (hdl_type == SA_HDL_SHARED) {
1424			dmu_buf_init_user(&handle->sa_dbu, sa_evict_sync, NULL,
1425			    NULL);
1426			winner = dmu_buf_set_user_ie(db, &handle->sa_dbu);
1427		}
1428
1429		if (winner != NULL) {
1430			kmem_cache_free(sa_cache, handle);
1431			handle = winner;
1432		}
1433	}
1434	*handlepp = handle;
1435
1436	return (error);
1437}
1438
1439int
1440sa_handle_get(objset_t *objset, uint64_t objid, void *userp,
1441    sa_handle_type_t hdl_type, sa_handle_t **handlepp)
1442{
1443	dmu_buf_t *db;
1444	int error;
1445
1446	if ((error = dmu_bonus_hold(objset, objid, NULL, &db)))
1447		return (error);
1448
1449	return (sa_handle_get_from_db(objset, db, userp, hdl_type,
1450	    handlepp));
1451}
1452
1453int
1454sa_buf_hold(objset_t *objset, uint64_t obj_num, const void *tag, dmu_buf_t **db)
1455{
1456	return (dmu_bonus_hold(objset, obj_num, tag, db));
1457}
1458
1459void
1460sa_buf_rele(dmu_buf_t *db, const void *tag)
1461{
1462	dmu_buf_rele(db, tag);
1463}
1464
1465static int
1466sa_lookup_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count)
1467{
1468	ASSERT(hdl);
1469	ASSERT(MUTEX_HELD(&hdl->sa_lock));
1470	return (sa_attr_op(hdl, bulk, count, SA_LOOKUP, NULL));
1471}
1472
1473static int
1474sa_lookup_locked(sa_handle_t *hdl, sa_attr_type_t attr, void *buf,
1475    uint32_t buflen)
1476{
1477	int error;
1478	sa_bulk_attr_t bulk;
1479
1480	VERIFY3U(buflen, <=, SA_ATTR_MAX_LEN);
1481
1482	bulk.sa_attr = attr;
1483	bulk.sa_data = buf;
1484	bulk.sa_length = buflen;
1485	bulk.sa_data_func = NULL;
1486
1487	ASSERT(hdl);
1488	error = sa_lookup_impl(hdl, &bulk, 1);
1489	return (error);
1490}
1491
1492int
1493sa_lookup(sa_handle_t *hdl, sa_attr_type_t attr, void *buf, uint32_t buflen)
1494{
1495	int error;
1496
1497	mutex_enter(&hdl->sa_lock);
1498	error = sa_lookup_locked(hdl, attr, buf, buflen);
1499	mutex_exit(&hdl->sa_lock);
1500
1501	return (error);
1502}
1503
1504#ifdef _KERNEL
1505int
1506sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, zfs_uio_t *uio)
1507{
1508	int error;
1509	sa_bulk_attr_t bulk;
1510
1511	bulk.sa_data = NULL;
1512	bulk.sa_attr = attr;
1513	bulk.sa_data_func = NULL;
1514
1515	ASSERT(hdl);
1516
1517	mutex_enter(&hdl->sa_lock);
1518	if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) == 0) {
1519		error = zfs_uiomove((void *)bulk.sa_addr, MIN(bulk.sa_size,
1520		    zfs_uio_resid(uio)), UIO_READ, uio);
1521	}
1522	mutex_exit(&hdl->sa_lock);
1523	return (error);
1524}
1525
1526/*
1527 * For the existed object that is upgraded from old system, its ondisk layout
1528 * has no slot for the project ID attribute. But quota accounting logic needs
1529 * to access related slots by offset directly. So we need to adjust these old
1530 * objects' layout to make the project ID to some unified and fixed offset.
1531 */
1532int
1533sa_add_projid(sa_handle_t *hdl, dmu_tx_t *tx, uint64_t projid)
1534{
1535	znode_t *zp = sa_get_userdata(hdl);
1536	dmu_buf_t *db = sa_get_db(hdl);
1537	zfsvfs_t *zfsvfs = ZTOZSB(zp);
1538	int count = 0, err = 0;
1539	sa_bulk_attr_t *bulk, *attrs;
1540	zfs_acl_locator_cb_t locate = { 0 };
1541	uint64_t uid, gid, mode, rdev, xattr = 0, parent, gen, links;
1542	uint64_t crtime[2], mtime[2], ctime[2], atime[2];
1543	zfs_acl_phys_t znode_acl = { 0 };
1544	char scanstamp[AV_SCANSTAMP_SZ];
1545
1546	if (zp->z_acl_cached == NULL) {
1547		zfs_acl_t *aclp;
1548
1549		mutex_enter(&zp->z_acl_lock);
1550		err = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE);
1551		mutex_exit(&zp->z_acl_lock);
1552		if (err != 0 && err != ENOENT)
1553			return (err);
1554	}
1555
1556	bulk = kmem_zalloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
1557	attrs = kmem_zalloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP);
1558	mutex_enter(&hdl->sa_lock);
1559	mutex_enter(&zp->z_lock);
1560
1561	err = sa_lookup_locked(hdl, SA_ZPL_PROJID(zfsvfs), &projid,
1562	    sizeof (uint64_t));
1563	if (unlikely(err == 0))
1564		/* Someone has added project ID attr by race. */
1565		err = EEXIST;
1566	if (err != ENOENT)
1567		goto out;
1568
1569	/* First do a bulk query of the attributes that aren't cached */
1570	if (zp->z_is_sa) {
1571		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
1572		    &mode, 8);
1573		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
1574		    &gen, 8);
1575		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
1576		    &uid, 8);
1577		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
1578		    &gid, 8);
1579		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
1580		    &parent, 8);
1581		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
1582		    &atime, 16);
1583		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
1584		    &mtime, 16);
1585		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
1586		    &ctime, 16);
1587		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL,
1588		    &crtime, 16);
1589		if (Z_ISBLK(ZTOTYPE(zp)) || Z_ISCHR(ZTOTYPE(zp)))
1590			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
1591			    &rdev, 8);
1592	} else {
1593		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
1594		    &atime, 16);
1595		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
1596		    &mtime, 16);
1597		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
1598		    &ctime, 16);
1599		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL,
1600		    &crtime, 16);
1601		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL,
1602		    &gen, 8);
1603		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
1604		    &mode, 8);
1605		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
1606		    &parent, 8);
1607		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_XATTR(zfsvfs), NULL,
1608		    &xattr, 8);
1609		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
1610		    &rdev, 8);
1611		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
1612		    &uid, 8);
1613		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL,
1614		    &gid, 8);
1615		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL,
1616		    &znode_acl, 88);
1617	}
1618	err = sa_bulk_lookup_locked(hdl, bulk, count);
1619	if (err != 0)
1620		goto out;
1621
1622	err = sa_lookup_locked(hdl, SA_ZPL_XATTR(zfsvfs), &xattr, 8);
1623	if (err != 0 && err != ENOENT)
1624		goto out;
1625
1626	zp->z_projid = projid;
1627	zp->z_pflags |= ZFS_PROJID;
1628	links = ZTONLNK(zp);
1629	count = 0;
1630	err = 0;
1631
1632	SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8);
1633	SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_SIZE(zfsvfs), NULL,
1634	    &zp->z_size, 8);
1635	SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_GEN(zfsvfs), NULL, &gen, 8);
1636	SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8);
1637	SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8);
1638	SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8);
1639	SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_FLAGS(zfsvfs), NULL,
1640	    &zp->z_pflags, 8);
1641	SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16);
1642	SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
1643	SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
1644	SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_CRTIME(zfsvfs), NULL,
1645	    &crtime, 16);
1646	SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
1647	SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_PROJID(zfsvfs), NULL, &projid, 8);
1648
1649	if (Z_ISBLK(ZTOTYPE(zp)) || Z_ISCHR(ZTOTYPE(zp)))
1650		SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_RDEV(zfsvfs), NULL,
1651		    &rdev, 8);
1652
1653	if (zp->z_acl_cached != NULL) {
1654		SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_DACL_COUNT(zfsvfs), NULL,
1655		    &zp->z_acl_cached->z_acl_count, 8);
1656		if (zp->z_acl_cached->z_version < ZFS_ACL_VERSION_FUID)
1657			zfs_acl_xform(zp, zp->z_acl_cached, CRED());
1658		locate.cb_aclp = zp->z_acl_cached;
1659		SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_DACL_ACES(zfsvfs),
1660		    zfs_acl_data_locator, &locate,
1661		    zp->z_acl_cached->z_acl_bytes);
1662	}
1663
1664	if (xattr)
1665		SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_XATTR(zfsvfs), NULL,
1666		    &xattr, 8);
1667
1668	if (zp->z_pflags & ZFS_BONUS_SCANSTAMP) {
1669		memcpy(scanstamp,
1670		    (caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE,
1671		    AV_SCANSTAMP_SZ);
1672		SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_SCANSTAMP(zfsvfs), NULL,
1673		    scanstamp, AV_SCANSTAMP_SZ);
1674		zp->z_pflags &= ~ZFS_BONUS_SCANSTAMP;
1675	}
1676
1677	VERIFY(dmu_set_bonustype(db, DMU_OT_SA, tx) == 0);
1678	VERIFY(sa_replace_all_by_template_locked(hdl, attrs, count, tx) == 0);
1679	if (znode_acl.z_acl_extern_obj) {
1680		VERIFY(0 == dmu_object_free(zfsvfs->z_os,
1681		    znode_acl.z_acl_extern_obj, tx));
1682	}
1683
1684	zp->z_is_sa = B_TRUE;
1685
1686out:
1687	mutex_exit(&zp->z_lock);
1688	mutex_exit(&hdl->sa_lock);
1689	kmem_free(attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
1690	kmem_free(bulk, sizeof (sa_bulk_attr_t) * ZPL_END);
1691	return (err);
1692}
1693#endif
1694
1695static sa_idx_tab_t *
1696sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, sa_hdr_phys_t *hdr)
1697{
1698	sa_idx_tab_t *idx_tab;
1699	sa_os_t *sa = os->os_sa;
1700	sa_lot_t *tb, search;
1701	avl_index_t loc;
1702
1703	/*
1704	 * Deterimine layout number.  If SA node and header == 0 then
1705	 * force the index table to the dummy "1" empty layout.
1706	 *
1707	 * The layout number would only be zero for a newly created file
1708	 * that has not added any attributes yet, or with crypto enabled which
1709	 * doesn't write any attributes to the bonus buffer.
1710	 */
1711
1712	search.lot_num = SA_LAYOUT_NUM(hdr, bonustype);
1713
1714	tb = avl_find(&sa->sa_layout_num_tree, &search, &loc);
1715
1716	/* Verify header size is consistent with layout information */
1717	ASSERT(tb);
1718	ASSERT((IS_SA_BONUSTYPE(bonustype) &&
1719	    SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb)) || !IS_SA_BONUSTYPE(bonustype) ||
1720	    (IS_SA_BONUSTYPE(bonustype) && hdr->sa_layout_info == 0));
1721
1722	/*
1723	 * See if any of the already existing TOC entries can be reused?
1724	 */
1725
1726	for (idx_tab = list_head(&tb->lot_idx_tab); idx_tab;
1727	    idx_tab = list_next(&tb->lot_idx_tab, idx_tab)) {
1728		boolean_t valid_idx = B_TRUE;
1729		int i;
1730
1731		if (tb->lot_var_sizes != 0 &&
1732		    idx_tab->sa_variable_lengths != NULL) {
1733			for (i = 0; i != tb->lot_var_sizes; i++) {
1734				if (hdr->sa_lengths[i] !=
1735				    idx_tab->sa_variable_lengths[i]) {
1736					valid_idx = B_FALSE;
1737					break;
1738				}
1739			}
1740		}
1741		if (valid_idx) {
1742			sa_idx_tab_hold(os, idx_tab);
1743			return (idx_tab);
1744		}
1745	}
1746
1747	/* No such luck, create a new entry */
1748	idx_tab = kmem_zalloc(sizeof (sa_idx_tab_t), KM_SLEEP);
1749	idx_tab->sa_idx_tab =
1750	    kmem_zalloc(sizeof (uint32_t) * sa->sa_num_attrs, KM_SLEEP);
1751	idx_tab->sa_layout = tb;
1752	zfs_refcount_create(&idx_tab->sa_refcount);
1753	if (tb->lot_var_sizes)
1754		idx_tab->sa_variable_lengths = kmem_alloc(sizeof (uint16_t) *
1755		    tb->lot_var_sizes, KM_SLEEP);
1756
1757	sa_attr_iter(os, hdr, bonustype, sa_build_idx_tab,
1758	    tb, idx_tab);
1759	sa_idx_tab_hold(os, idx_tab);   /* one hold for consumer */
1760	sa_idx_tab_hold(os, idx_tab);	/* one for layout */
1761	list_insert_tail(&tb->lot_idx_tab, idx_tab);
1762	return (idx_tab);
1763}
1764
1765void
1766sa_default_locator(void **dataptr, uint32_t *len, uint32_t total_len,
1767    boolean_t start, void *userdata)
1768{
1769	ASSERT(start);
1770
1771	*dataptr = userdata;
1772	*len = total_len;
1773}
1774
1775static void
1776sa_attr_register_sync(sa_handle_t *hdl, dmu_tx_t *tx)
1777{
1778	uint64_t attr_value = 0;
1779	sa_os_t *sa = hdl->sa_os->os_sa;
1780	sa_attr_table_t *tb = sa->sa_attr_table;
1781	int i;
1782
1783	mutex_enter(&sa->sa_lock);
1784
1785	if (!sa->sa_need_attr_registration || sa->sa_master_obj == 0) {
1786		mutex_exit(&sa->sa_lock);
1787		return;
1788	}
1789
1790	if (sa->sa_reg_attr_obj == 0) {
1791		sa->sa_reg_attr_obj = zap_create_link(hdl->sa_os,
1792		    DMU_OT_SA_ATTR_REGISTRATION,
1793		    sa->sa_master_obj, SA_REGISTRY, tx);
1794	}
1795	for (i = 0; i != sa->sa_num_attrs; i++) {
1796		if (sa->sa_attr_table[i].sa_registered)
1797			continue;
1798		ATTR_ENCODE(attr_value, tb[i].sa_attr, tb[i].sa_length,
1799		    tb[i].sa_byteswap);
1800		VERIFY(0 == zap_update(hdl->sa_os, sa->sa_reg_attr_obj,
1801		    tb[i].sa_name, 8, 1, &attr_value, tx));
1802		tb[i].sa_registered = B_TRUE;
1803	}
1804	sa->sa_need_attr_registration = B_FALSE;
1805	mutex_exit(&sa->sa_lock);
1806}
1807
1808/*
1809 * Replace all attributes with attributes specified in template.
1810 * If dnode had a spill buffer then those attributes will be
1811 * also be replaced, possibly with just an empty spill block
1812 *
1813 * This interface is intended to only be used for bulk adding of
1814 * attributes for a new file.  It will also be used by the ZPL
1815 * when converting and old formatted znode to native SA support.
1816 */
1817int
1818sa_replace_all_by_template_locked(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc,
1819    int attr_count, dmu_tx_t *tx)
1820{
1821	sa_os_t *sa = hdl->sa_os->os_sa;
1822
1823	if (sa->sa_need_attr_registration)
1824		sa_attr_register_sync(hdl, tx);
1825	return (sa_build_layouts(hdl, attr_desc, attr_count, tx));
1826}
1827
1828int
1829sa_replace_all_by_template(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc,
1830    int attr_count, dmu_tx_t *tx)
1831{
1832	int error;
1833
1834	mutex_enter(&hdl->sa_lock);
1835	error = sa_replace_all_by_template_locked(hdl, attr_desc,
1836	    attr_count, tx);
1837	mutex_exit(&hdl->sa_lock);
1838	return (error);
1839}
1840
1841/*
1842 * Add/remove a single attribute or replace a variable-sized attribute value
1843 * with a value of a different size, and then rewrite the entire set
1844 * of attributes.
1845 * Same-length attribute value replacement (including fixed-length attributes)
1846 * is handled more efficiently by the upper layers.
1847 */
1848static int
1849sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr,
1850    sa_data_op_t action, sa_data_locator_t *locator, void *datastart,
1851    uint16_t buflen, dmu_tx_t *tx)
1852{
1853	sa_os_t *sa = hdl->sa_os->os_sa;
1854	dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
1855	dnode_t *dn;
1856	sa_bulk_attr_t *attr_desc;
1857	void *old_data[2];
1858	int bonus_attr_count = 0;
1859	int bonus_data_size = 0;
1860	int spill_data_size = 0;
1861	int spill_attr_count = 0;
1862	int error;
1863	uint16_t length, reg_length;
1864	int i, j, k, length_idx;
1865	sa_hdr_phys_t *hdr;
1866	sa_idx_tab_t *idx_tab;
1867	int attr_count;
1868	int count;
1869
1870	ASSERT(MUTEX_HELD(&hdl->sa_lock));
1871
1872	/* First make of copy of the old data */
1873
1874	DB_DNODE_ENTER(db);
1875	dn = DB_DNODE(db);
1876	if (dn->dn_bonuslen != 0) {
1877		bonus_data_size = hdl->sa_bonus->db_size;
1878		old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP);
1879		memcpy(old_data[0], hdl->sa_bonus->db_data,
1880		    hdl->sa_bonus->db_size);
1881		bonus_attr_count = hdl->sa_bonus_tab->sa_layout->lot_attr_count;
1882	} else {
1883		old_data[0] = NULL;
1884	}
1885	DB_DNODE_EXIT(db);
1886
1887	/* Bring spill buffer online if it isn't currently */
1888
1889	if ((error = sa_get_spill(hdl)) == 0) {
1890		spill_data_size = hdl->sa_spill->db_size;
1891		old_data[1] = vmem_alloc(spill_data_size, KM_SLEEP);
1892		memcpy(old_data[1], hdl->sa_spill->db_data,
1893		    hdl->sa_spill->db_size);
1894		spill_attr_count =
1895		    hdl->sa_spill_tab->sa_layout->lot_attr_count;
1896	} else if (error && error != ENOENT) {
1897		if (old_data[0])
1898			kmem_free(old_data[0], bonus_data_size);
1899		return (error);
1900	} else {
1901		old_data[1] = NULL;
1902	}
1903
1904	/* build descriptor of all attributes */
1905
1906	attr_count = bonus_attr_count + spill_attr_count;
1907	if (action == SA_ADD)
1908		attr_count++;
1909	else if (action == SA_REMOVE)
1910		attr_count--;
1911
1912	attr_desc = kmem_zalloc(sizeof (sa_bulk_attr_t) * attr_count, KM_SLEEP);
1913
1914	/*
1915	 * loop through bonus and spill buffer if it exists, and
1916	 * build up new attr_descriptor to reset the attributes
1917	 */
1918	k = j = 0;
1919	count = bonus_attr_count;
1920	hdr = SA_GET_HDR(hdl, SA_BONUS);
1921	idx_tab = SA_IDX_TAB_GET(hdl, SA_BONUS);
1922	for (; ; k++) {
1923		/*
1924		 * Iterate over each attribute in layout.  Fetch the
1925		 * size of variable-length attributes needing rewrite
1926		 * from sa_lengths[].
1927		 */
1928		for (i = 0, length_idx = 0; i != count; i++) {
1929			sa_attr_type_t attr;
1930
1931			attr = idx_tab->sa_layout->lot_attrs[i];
1932			reg_length = SA_REGISTERED_LEN(sa, attr);
1933			if (reg_length == 0) {
1934				length = hdr->sa_lengths[length_idx];
1935				length_idx++;
1936			} else {
1937				length = reg_length;
1938			}
1939			if (attr == newattr) {
1940				/*
1941				 * There is nothing to do for SA_REMOVE,
1942				 * so it is just skipped.
1943				 */
1944				if (action == SA_REMOVE)
1945					continue;
1946
1947				/*
1948				 * Duplicate attributes are not allowed, so the
1949				 * action can not be SA_ADD here.
1950				 */
1951				ASSERT3S(action, ==, SA_REPLACE);
1952
1953				/*
1954				 * Only a variable-sized attribute can be
1955				 * replaced here, and its size must be changing.
1956				 */
1957				ASSERT3U(reg_length, ==, 0);
1958				ASSERT3U(length, !=, buflen);
1959				SA_ADD_BULK_ATTR(attr_desc, j, attr,
1960				    locator, datastart, buflen);
1961			} else {
1962				SA_ADD_BULK_ATTR(attr_desc, j, attr,
1963				    NULL, (void *)
1964				    (TOC_OFF(idx_tab->sa_idx_tab[attr]) +
1965				    (uintptr_t)old_data[k]), length);
1966			}
1967		}
1968		if (k == 0 && hdl->sa_spill) {
1969			hdr = SA_GET_HDR(hdl, SA_SPILL);
1970			idx_tab = SA_IDX_TAB_GET(hdl, SA_SPILL);
1971			count = spill_attr_count;
1972		} else {
1973			break;
1974		}
1975	}
1976	if (action == SA_ADD) {
1977		reg_length = SA_REGISTERED_LEN(sa, newattr);
1978		IMPLY(reg_length != 0, reg_length == buflen);
1979		SA_ADD_BULK_ATTR(attr_desc, j, newattr, locator,
1980		    datastart, buflen);
1981	}
1982	ASSERT3U(j, ==, attr_count);
1983
1984	error = sa_build_layouts(hdl, attr_desc, attr_count, tx);
1985
1986	if (old_data[0])
1987		kmem_free(old_data[0], bonus_data_size);
1988	if (old_data[1])
1989		vmem_free(old_data[1], spill_data_size);
1990	kmem_free(attr_desc, sizeof (sa_bulk_attr_t) * attr_count);
1991
1992	return (error);
1993}
1994
1995static int
1996sa_bulk_update_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count,
1997    dmu_tx_t *tx)
1998{
1999	int error;
2000	sa_os_t *sa = hdl->sa_os->os_sa;
2001	dmu_object_type_t bonustype;
2002	dmu_buf_t *saved_spill;
2003
2004	ASSERT(hdl);
2005	ASSERT(MUTEX_HELD(&hdl->sa_lock));
2006
2007	bonustype = SA_BONUSTYPE_FROM_DB(SA_GET_DB(hdl, SA_BONUS));
2008	saved_spill = hdl->sa_spill;
2009
2010	/* sync out registration table if necessary */
2011	if (sa->sa_need_attr_registration)
2012		sa_attr_register_sync(hdl, tx);
2013
2014	error = sa_attr_op(hdl, bulk, count, SA_UPDATE, tx);
2015	if (error == 0 && !IS_SA_BONUSTYPE(bonustype) && sa->sa_update_cb)
2016		sa->sa_update_cb(hdl, tx);
2017
2018	/*
2019	 * If saved_spill is NULL and current sa_spill is not NULL that
2020	 * means we increased the refcount of the spill buffer through
2021	 * sa_get_spill() or dmu_spill_hold_by_dnode().  Therefore we
2022	 * must release the hold before calling dmu_tx_commit() to avoid
2023	 * making a copy of this buffer in dbuf_sync_leaf() due to the
2024	 * reference count now being greater than 1.
2025	 */
2026	if (!saved_spill && hdl->sa_spill) {
2027		if (hdl->sa_spill_tab) {
2028			sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab);
2029			hdl->sa_spill_tab = NULL;
2030		}
2031
2032		dmu_buf_rele(hdl->sa_spill, NULL);
2033		hdl->sa_spill = NULL;
2034	}
2035
2036	return (error);
2037}
2038
2039/*
2040 * update or add new attribute
2041 */
2042int
2043sa_update(sa_handle_t *hdl, sa_attr_type_t type,
2044    void *buf, uint32_t buflen, dmu_tx_t *tx)
2045{
2046	int error;
2047	sa_bulk_attr_t bulk;
2048
2049	VERIFY3U(buflen, <=, SA_ATTR_MAX_LEN);
2050
2051	bulk.sa_attr = type;
2052	bulk.sa_data_func = NULL;
2053	bulk.sa_length = buflen;
2054	bulk.sa_data = buf;
2055
2056	mutex_enter(&hdl->sa_lock);
2057	error = sa_bulk_update_impl(hdl, &bulk, 1, tx);
2058	mutex_exit(&hdl->sa_lock);
2059	return (error);
2060}
2061
2062/*
2063 * Return size of an attribute
2064 */
2065
2066int
2067sa_size(sa_handle_t *hdl, sa_attr_type_t attr, int *size)
2068{
2069	sa_bulk_attr_t bulk;
2070	int error;
2071
2072	bulk.sa_data = NULL;
2073	bulk.sa_attr = attr;
2074	bulk.sa_data_func = NULL;
2075
2076	ASSERT(hdl);
2077	mutex_enter(&hdl->sa_lock);
2078	if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) != 0) {
2079		mutex_exit(&hdl->sa_lock);
2080		return (error);
2081	}
2082	*size = bulk.sa_size;
2083
2084	mutex_exit(&hdl->sa_lock);
2085	return (0);
2086}
2087
2088int
2089sa_bulk_lookup_locked(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count)
2090{
2091	ASSERT(hdl);
2092	ASSERT(MUTEX_HELD(&hdl->sa_lock));
2093	return (sa_lookup_impl(hdl, attrs, count));
2094}
2095
2096int
2097sa_bulk_lookup(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count)
2098{
2099	int error;
2100
2101	ASSERT(hdl);
2102	mutex_enter(&hdl->sa_lock);
2103	error = sa_bulk_lookup_locked(hdl, attrs, count);
2104	mutex_exit(&hdl->sa_lock);
2105	return (error);
2106}
2107
2108int
2109sa_bulk_update(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count, dmu_tx_t *tx)
2110{
2111	int error;
2112
2113	ASSERT(hdl);
2114	mutex_enter(&hdl->sa_lock);
2115	error = sa_bulk_update_impl(hdl, attrs, count, tx);
2116	mutex_exit(&hdl->sa_lock);
2117	return (error);
2118}
2119
2120int
2121sa_remove(sa_handle_t *hdl, sa_attr_type_t attr, dmu_tx_t *tx)
2122{
2123	int error;
2124
2125	mutex_enter(&hdl->sa_lock);
2126	error = sa_modify_attrs(hdl, attr, SA_REMOVE, NULL,
2127	    NULL, 0, tx);
2128	mutex_exit(&hdl->sa_lock);
2129	return (error);
2130}
2131
2132void
2133sa_object_info(sa_handle_t *hdl, dmu_object_info_t *doi)
2134{
2135	dmu_object_info_from_db(hdl->sa_bonus, doi);
2136}
2137
2138void
2139sa_object_size(sa_handle_t *hdl, uint32_t *blksize, u_longlong_t *nblocks)
2140{
2141	dmu_object_size_from_db(hdl->sa_bonus,
2142	    blksize, nblocks);
2143}
2144
2145void
2146sa_set_userp(sa_handle_t *hdl, void *ptr)
2147{
2148	hdl->sa_userp = ptr;
2149}
2150
2151dmu_buf_t *
2152sa_get_db(sa_handle_t *hdl)
2153{
2154	return (hdl->sa_bonus);
2155}
2156
2157void *
2158sa_get_userdata(sa_handle_t *hdl)
2159{
2160	return (hdl->sa_userp);
2161}
2162
2163void
2164sa_register_update_callback_locked(objset_t *os, sa_update_cb_t *func)
2165{
2166	ASSERT(MUTEX_HELD(&os->os_sa->sa_lock));
2167	os->os_sa->sa_update_cb = func;
2168}
2169
2170void
2171sa_register_update_callback(objset_t *os, sa_update_cb_t *func)
2172{
2173
2174	mutex_enter(&os->os_sa->sa_lock);
2175	sa_register_update_callback_locked(os, func);
2176	mutex_exit(&os->os_sa->sa_lock);
2177}
2178
2179uint64_t
2180sa_handle_object(sa_handle_t *hdl)
2181{
2182	return (hdl->sa_bonus->db_object);
2183}
2184
2185boolean_t
2186sa_enabled(objset_t *os)
2187{
2188	return (os->os_sa == NULL);
2189}
2190
2191int
2192sa_set_sa_object(objset_t *os, uint64_t sa_object)
2193{
2194	sa_os_t *sa = os->os_sa;
2195
2196	if (sa->sa_master_obj)
2197		return (1);
2198
2199	sa->sa_master_obj = sa_object;
2200
2201	return (0);
2202}
2203
2204int
2205sa_hdrsize(void *arg)
2206{
2207	sa_hdr_phys_t *hdr = arg;
2208
2209	return (SA_HDR_SIZE(hdr));
2210}
2211
2212void
2213sa_handle_lock(sa_handle_t *hdl)
2214{
2215	ASSERT(hdl);
2216	mutex_enter(&hdl->sa_lock);
2217}
2218
2219void
2220sa_handle_unlock(sa_handle_t *hdl)
2221{
2222	ASSERT(hdl);
2223	mutex_exit(&hdl->sa_lock);
2224}
2225
#ifdef _KERNEL
/*
 * Symbols exported with EXPORT_SYMBOL() when built with _KERNEL defined.
 * Kept in alphabetical order.
 */
EXPORT_SYMBOL(sa_add_projid);
EXPORT_SYMBOL(sa_buf_hold);
EXPORT_SYMBOL(sa_buf_rele);
EXPORT_SYMBOL(sa_bulk_lookup);
EXPORT_SYMBOL(sa_bulk_lookup_locked);
EXPORT_SYMBOL(sa_bulk_update);
EXPORT_SYMBOL(sa_cache_fini);
EXPORT_SYMBOL(sa_cache_init);
EXPORT_SYMBOL(sa_enabled);
EXPORT_SYMBOL(sa_get_db);
EXPORT_SYMBOL(sa_get_userdata);
EXPORT_SYMBOL(sa_handle_destroy);
EXPORT_SYMBOL(sa_handle_get);
EXPORT_SYMBOL(sa_handle_get_from_db);
EXPORT_SYMBOL(sa_handle_lock);
EXPORT_SYMBOL(sa_handle_object);
EXPORT_SYMBOL(sa_handle_unlock);
EXPORT_SYMBOL(sa_hdrsize);
EXPORT_SYMBOL(sa_lookup);
EXPORT_SYMBOL(sa_lookup_uio);
EXPORT_SYMBOL(sa_object_info);
EXPORT_SYMBOL(sa_object_size);
EXPORT_SYMBOL(sa_register_update_callback);
EXPORT_SYMBOL(sa_remove);
EXPORT_SYMBOL(sa_replace_all_by_template);
EXPORT_SYMBOL(sa_replace_all_by_template_locked);
EXPORT_SYMBOL(sa_set_sa_object);
EXPORT_SYMBOL(sa_set_userp);
EXPORT_SYMBOL(sa_setup);
EXPORT_SYMBOL(sa_size);
EXPORT_SYMBOL(sa_spill_rele);
EXPORT_SYMBOL(sa_update);
#endif /* _KERNEL */
2260