1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
25 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
26 * Copyright 2017 Nexenta Systems, Inc.
27 */
28
29#include <sys/zio.h>
30#include <sys/spa.h>
31#include <sys/dmu.h>
32#include <sys/zfs_context.h>
33#include <sys/zap.h>
34#include <sys/zap_impl.h>
35#include <sys/zap_leaf.h>
36#include <sys/btree.h>
37#include <sys/arc.h>
38#include <sys/dmu_objset.h>
39
40#ifdef _KERNEL
41#include <sys/sunddi.h>
42#endif
43
/*
 * Upper bound on the block size of a microzap.  zap_lockdir_impl() compares
 * the would-be grown block size against this value and, when it would be
 * exceeded, upgrades the object via mzap_upgrade() instead of growing it.
 * Initialized to MZAP_MAX_BLKSZ.
 */
int zap_micro_max_size = MZAP_MAX_BLKSZ;

/* Forward declaration; defined later in this file. */
static int mzap_upgrade(zap_t **zapp,
    const void *tag, dmu_tx_t *tx, zap_flags_t flags);
48
49uint64_t
50zap_getflags(zap_t *zap)
51{
52	if (zap->zap_ismicro)
53		return (0);
54	return (zap_f_phys(zap)->zap_flags);
55}
56
57int
58zap_hashbits(zap_t *zap)
59{
60	if (zap_getflags(zap) & ZAP_FLAG_HASH64)
61		return (48);
62	else
63		return (28);
64}
65
66uint32_t
67zap_maxcd(zap_t *zap)
68{
69	if (zap_getflags(zap) & ZAP_FLAG_HASH64)
70		return ((1<<16)-1);
71	else
72		return (-1U);
73}
74
75static uint64_t
76zap_hash(zap_name_t *zn)
77{
78	zap_t *zap = zn->zn_zap;
79	uint64_t h = 0;
80
81	if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) {
82		ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY);
83		h = *(uint64_t *)zn->zn_key_orig;
84	} else {
85		h = zap->zap_salt;
86		ASSERT(h != 0);
87		ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
88
89		if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
90			const uint64_t *wp = zn->zn_key_norm;
91
92			ASSERT(zn->zn_key_intlen == 8);
93			for (int i = 0; i < zn->zn_key_norm_numints;
94			    wp++, i++) {
95				uint64_t word = *wp;
96
97				for (int j = 0; j < 8; j++) {
98					h = (h >> 8) ^
99					    zfs_crc64_table[(h ^ word) & 0xFF];
100					word >>= NBBY;
101				}
102			}
103		} else {
104			const uint8_t *cp = zn->zn_key_norm;
105
106			/*
107			 * We previously stored the terminating null on
108			 * disk, but didn't hash it, so we need to
109			 * continue to not hash it.  (The
110			 * zn_key_*_numints includes the terminating
111			 * null for non-binary keys.)
112			 */
113			int len = zn->zn_key_norm_numints - 1;
114
115			ASSERT(zn->zn_key_intlen == 1);
116			for (int i = 0; i < len; cp++, i++) {
117				h = (h >> 8) ^
118				    zfs_crc64_table[(h ^ *cp) & 0xFF];
119			}
120		}
121	}
122	/*
123	 * Don't use all 64 bits, since we need some in the cookie for
124	 * the collision differentiator.  We MUST use the high bits,
125	 * since those are the ones that we first pay attention to when
126	 * choosing the bucket.
127	 */
128	h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1);
129
130	return (h);
131}
132
133static int
134zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags)
135{
136	ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY));
137
138	size_t inlen = strlen(name) + 1;
139	size_t outlen = ZAP_MAXNAMELEN;
140
141	int err = 0;
142	(void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen,
143	    normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID,
144	    U8_UNICODE_LATEST, &err);
145
146	return (err);
147}
148
149boolean_t
150zap_match(zap_name_t *zn, const char *matchname)
151{
152	ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY));
153
154	if (zn->zn_matchtype & MT_NORMALIZE) {
155		char norm[ZAP_MAXNAMELEN];
156
157		if (zap_normalize(zn->zn_zap, matchname, norm,
158		    zn->zn_normflags) != 0)
159			return (B_FALSE);
160
161		return (strcmp(zn->zn_key_norm, norm) == 0);
162	} else {
163		return (strcmp(zn->zn_key_orig, matchname) == 0);
164	}
165}
166
167static zap_name_t *
168zap_name_alloc(zap_t *zap)
169{
170	zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
171	zn->zn_zap = zap;
172	return (zn);
173}
174
175void
176zap_name_free(zap_name_t *zn)
177{
178	kmem_free(zn, sizeof (zap_name_t));
179}
180
181static int
182zap_name_init_str(zap_name_t *zn, const char *key, matchtype_t mt)
183{
184	zap_t *zap = zn->zn_zap;
185
186	zn->zn_key_intlen = sizeof (*key);
187	zn->zn_key_orig = key;
188	zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1;
189	zn->zn_matchtype = mt;
190	zn->zn_normflags = zap->zap_normflags;
191
192	/*
193	 * If we're dealing with a case sensitive lookup on a mixed or
194	 * insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup
195	 * will fold case to all caps overriding the lookup request.
196	 */
197	if (mt & MT_MATCH_CASE)
198		zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER;
199
200	if (zap->zap_normflags) {
201		/*
202		 * We *must* use zap_normflags because this normalization is
203		 * what the hash is computed from.
204		 */
205		if (zap_normalize(zap, key, zn->zn_normbuf,
206		    zap->zap_normflags) != 0)
207			return (SET_ERROR(ENOTSUP));
208		zn->zn_key_norm = zn->zn_normbuf;
209		zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
210	} else {
211		if (mt != 0)
212			return (SET_ERROR(ENOTSUP));
213		zn->zn_key_norm = zn->zn_key_orig;
214		zn->zn_key_norm_numints = zn->zn_key_orig_numints;
215	}
216
217	zn->zn_hash = zap_hash(zn);
218
219	if (zap->zap_normflags != zn->zn_normflags) {
220		/*
221		 * We *must* use zn_normflags because this normalization is
222		 * what the matching is based on.  (Not the hash!)
223		 */
224		if (zap_normalize(zap, key, zn->zn_normbuf,
225		    zn->zn_normflags) != 0)
226			return (SET_ERROR(ENOTSUP));
227		zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
228	}
229
230	return (0);
231}
232
233zap_name_t *
234zap_name_alloc_str(zap_t *zap, const char *key, matchtype_t mt)
235{
236	zap_name_t *zn = zap_name_alloc(zap);
237	if (zap_name_init_str(zn, key, mt) != 0) {
238		zap_name_free(zn);
239		return (NULL);
240	}
241	return (zn);
242}
243
244static zap_name_t *
245zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints)
246{
247	zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
248
249	ASSERT(zap->zap_normflags == 0);
250	zn->zn_zap = zap;
251	zn->zn_key_intlen = sizeof (*key);
252	zn->zn_key_orig = zn->zn_key_norm = key;
253	zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints;
254	zn->zn_matchtype = 0;
255
256	zn->zn_hash = zap_hash(zn);
257	return (zn);
258}
259
260static void
261mzap_byteswap(mzap_phys_t *buf, size_t size)
262{
263	buf->mz_block_type = BSWAP_64(buf->mz_block_type);
264	buf->mz_salt = BSWAP_64(buf->mz_salt);
265	buf->mz_normflags = BSWAP_64(buf->mz_normflags);
266	int max = (size / MZAP_ENT_LEN) - 1;
267	for (int i = 0; i < max; i++) {
268		buf->mz_chunk[i].mze_value =
269		    BSWAP_64(buf->mz_chunk[i].mze_value);
270		buf->mz_chunk[i].mze_cd =
271		    BSWAP_32(buf->mz_chunk[i].mze_cd);
272	}
273}
274
275void
276zap_byteswap(void *buf, size_t size)
277{
278	uint64_t block_type = *(uint64_t *)buf;
279
280	if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) {
281		/* ASSERT(magic == ZAP_LEAF_MAGIC); */
282		mzap_byteswap(buf, size);
283	} else {
284		fzap_byteswap(buf, size);
285	}
286}
287
288__attribute__((always_inline)) inline
289static int
290mze_compare(const void *arg1, const void *arg2)
291{
292	const mzap_ent_t *mze1 = arg1;
293	const mzap_ent_t *mze2 = arg2;
294
295	return (TREE_CMP((uint64_t)(mze1->mze_hash) << 32 | mze1->mze_cd,
296	    (uint64_t)(mze2->mze_hash) << 32 | mze2->mze_cd));
297}
298
299ZFS_BTREE_FIND_IN_BUF_FUNC(mze_find_in_buf, mzap_ent_t,
300    mze_compare)
301
302static void
303mze_insert(zap_t *zap, uint16_t chunkid, uint64_t hash)
304{
305	mzap_ent_t mze;
306
307	ASSERT(zap->zap_ismicro);
308	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
309
310	mze.mze_chunkid = chunkid;
311	ASSERT0(hash & 0xffffffff);
312	mze.mze_hash = hash >> 32;
313	ASSERT3U(MZE_PHYS(zap, &mze)->mze_cd, <=, 0xffff);
314	mze.mze_cd = (uint16_t)MZE_PHYS(zap, &mze)->mze_cd;
315	ASSERT(MZE_PHYS(zap, &mze)->mze_name[0] != 0);
316	zfs_btree_add(&zap->zap_m.zap_tree, &mze);
317}
318
319static mzap_ent_t *
320mze_find(zap_name_t *zn, zfs_btree_index_t *idx)
321{
322	mzap_ent_t mze_tofind;
323	mzap_ent_t *mze;
324	zfs_btree_t *tree = &zn->zn_zap->zap_m.zap_tree;
325
326	ASSERT(zn->zn_zap->zap_ismicro);
327	ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock));
328
329	ASSERT0(zn->zn_hash & 0xffffffff);
330	mze_tofind.mze_hash = zn->zn_hash >> 32;
331	mze_tofind.mze_cd = 0;
332
333	mze = zfs_btree_find(tree, &mze_tofind, idx);
334	if (mze == NULL)
335		mze = zfs_btree_next(tree, idx, idx);
336	for (; mze && mze->mze_hash == mze_tofind.mze_hash;
337	    mze = zfs_btree_next(tree, idx, idx)) {
338		ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd);
339		if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name))
340			return (mze);
341	}
342
343	return (NULL);
344}
345
346static uint32_t
347mze_find_unused_cd(zap_t *zap, uint64_t hash)
348{
349	mzap_ent_t mze_tofind;
350	zfs_btree_index_t idx;
351	zfs_btree_t *tree = &zap->zap_m.zap_tree;
352
353	ASSERT(zap->zap_ismicro);
354	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
355
356	ASSERT0(hash & 0xffffffff);
357	hash >>= 32;
358	mze_tofind.mze_hash = hash;
359	mze_tofind.mze_cd = 0;
360
361	uint32_t cd = 0;
362	for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx);
363	    mze && mze->mze_hash == hash;
364	    mze = zfs_btree_next(tree, &idx, &idx)) {
365		if (mze->mze_cd != cd)
366			break;
367		cd++;
368	}
369
370	return (cd);
371}
372
/*
 * Number of fatzap leaf chunks budgeted for one microzap entry: one chunk
 * (presumably the leaf entry itself -- confirm against zap_leaf.h), plus the
 * array chunks needed for a name of MZAP_NAME_LEN bytes, plus the array
 * chunks needed for a single uint64_t value.
 *
 * NOTE(review): this comment previously gave the maximum as 4 chunks
 * (3 chunks for names + 1 chunk for value), which does not account for the
 * leading "1" term below.
 */
#define	MZAP_ENT_CHUNKS	(1 + ZAP_LEAF_ARRAY_NCHUNKS(MZAP_NAME_LEN) + \
	ZAP_LEAF_ARRAY_NCHUNKS(sizeof (uint64_t)))
379
380/*
381 * Check if the current entry keeps the colliding entries under the fatzap leaf
382 * size.
383 */
384static boolean_t
385mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash)
386{
387	zap_t *zap = zn->zn_zap;
388	mzap_ent_t mze_tofind;
389	zfs_btree_index_t idx;
390	zfs_btree_t *tree = &zap->zap_m.zap_tree;
391	uint32_t mzap_ents = 0;
392
393	ASSERT0(hash & 0xffffffff);
394	hash >>= 32;
395	mze_tofind.mze_hash = hash;
396	mze_tofind.mze_cd = 0;
397
398	for (mzap_ent_t *mze = zfs_btree_find(tree, &mze_tofind, &idx);
399	    mze && mze->mze_hash == hash;
400	    mze = zfs_btree_next(tree, &idx, &idx)) {
401		mzap_ents++;
402	}
403
404	/* Include the new entry being added */
405	mzap_ents++;
406
407	return (ZAP_LEAF_NUMCHUNKS_DEF > (mzap_ents * MZAP_ENT_CHUNKS));
408}
409
410static void
411mze_destroy(zap_t *zap)
412{
413	zfs_btree_clear(&zap->zap_m.zap_tree);
414	zfs_btree_destroy(&zap->zap_m.zap_tree);
415}
416
/*
 * Build the in-core zap_t for the ZAP whose first block is "db" and attach
 * it to the dbuf as its user.
 *
 * Returns the newly built zap_t, or, if dmu_buf_set_user() hands back an
 * already-attached user, that one (the local zap_t is then freed).  Returns
 * NULL if the block is neither a microzap (ZBT_MICRO) nor a header block
 * (ZBT_HEADER) carrying ZAP_MAGIC.
 *
 * The new zap_t's rwlock is held as writer while it is being set up and is
 * released before returning.
 */
static zap_t *
mzap_open(dmu_buf_t *db)
{
	zap_t *winner;
	/* The first two 64-bit words of the block: block type and magic. */
	uint64_t *zap_hdr = (uint64_t *)db->db_data;
	uint64_t zap_block_type = zap_hdr[0];
	uint64_t zap_magic = zap_hdr[1];

	ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));

	zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
	rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, NULL);
	rw_enter(&zap->zap_rwlock, RW_WRITER);
	zap->zap_objset = dmu_buf_get_objset(db);
	zap->zap_object = db->db_object;
	zap->zap_dbuf = db;

	if (zap_block_type != ZBT_MICRO) {
		mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, MUTEX_DEFAULT,
		    0);
		zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1;
		if (zap_block_type != ZBT_HEADER || zap_magic != ZAP_MAGIC) {
			winner = NULL;	/* No actual winner here... */
			goto handle_winner;
		}
	} else {
		zap->zap_ismicro = TRUE;
	}

	/*
	 * Make sure that zap_ismicro is set before we let others see
	 * it, because zap_lockdir() checks zap_ismicro without the lock
	 * held.
	 */
	dmu_buf_init_user(&zap->zap_dbu, zap_evict_sync, NULL, &zap->zap_dbuf);
	winner = dmu_buf_set_user(db, &zap->zap_dbu);

	/* A user was already attached: discard ours and return that one. */
	if (winner != NULL)
		goto handle_winner;

	if (zap->zap_ismicro) {
		zap->zap_salt = zap_m_phys(zap)->mz_salt;
		zap->zap_normflags = zap_m_phys(zap)->mz_normflags;
		zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;

		/*
		 * Reduce B-tree leaf from 4KB to 512 bytes to reduce memmove()
		 * overhead on massive inserts below.  It still allows to store
		 * 62 entries before we have to add 2KB B-tree core node.
		 */
		zfs_btree_create_custom(&zap->zap_m.zap_tree, mze_compare,
		    mze_find_in_buf, sizeof (mzap_ent_t), 512);

		/*
		 * Index every chunk that is in use (non-empty name), hashing
		 * its name with a single reusable zap_name_t.
		 */
		zap_name_t *zn = zap_name_alloc(zap);
		for (uint16_t i = 0; i < zap->zap_m.zap_num_chunks; i++) {
			mzap_ent_phys_t *mze =
			    &zap_m_phys(zap)->mz_chunk[i];
			if (mze->mze_name[0]) {
				zap->zap_m.zap_num_entries++;
				/*
				 * NOTE(review): the return value is ignored;
				 * if initialization fails, zn_hash is not
				 * updated for this entry.
				 */
				zap_name_init_str(zn, mze->mze_name, 0);
				mze_insert(zap, i, zn->zn_hash);
			}
		}
		zap_name_free(zn);
	} else {
		zap->zap_salt = zap_f_phys(zap)->zap_salt;
		zap->zap_normflags = zap_f_phys(zap)->zap_normflags;

		ASSERT3U(sizeof (struct zap_leaf_header), ==,
		    2*ZAP_LEAF_CHUNKSIZE);

		/*
		 * The embedded pointer table should not overlap the
		 * other members.
		 */
		ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
		    &zap_f_phys(zap)->zap_salt);

		/*
		 * The embedded pointer table should end at the end of
		 * the block
		 */
		ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
		    1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
		    (uintptr_t)zap_f_phys(zap), ==,
		    zap->zap_dbuf->db_size);
	}
	rw_exit(&zap->zap_rwlock);
	return (zap);

handle_winner:
	/*
	 * Our zap_t is not the one that will be used: undo everything set up
	 * above and return "winner" (NULL when the block was rejected).
	 */
	rw_exit(&zap->zap_rwlock);
	rw_destroy(&zap->zap_rwlock);
	if (!zap->zap_ismicro)
		mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
	kmem_free(zap, sizeof (zap_t));
	return (winner);
}
515
/*
 * Lock the ZAP stored in dnode "dn" whose first block is "db", returning
 * the locked zap_t in *zapp.
 *
 * This routine "consumes" the caller's hold on the dbuf, which must
 * have the specified tag.
 *
 * lti is the requested lock type.  If fatreader is set and the zap is a
 * fat zap, RW_READER is taken regardless of lti.  When the lock ends up
 * being held as writer the dbuf is dirtied in tx.  If "adding" is set, tx is
 * non-NULL and the microzap has no free chunk left, the block is grown by
 * SPA_MINBLOCKSIZE, or, when that would exceed zap_micro_max_size, the zap
 * is upgraded with mzap_upgrade().
 *
 * Returns 0 with the zap's rwlock held, EINVAL if the object is not of a
 * ZAP type, EIO if mzap_open() rejects the on-disk block, or the error from
 * mzap_upgrade() (in which case the lock has been dropped).
 */
static int
zap_lockdir_impl(dnode_t *dn, dmu_buf_t *db, const void *tag, dmu_tx_t *tx,
    krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
{
	ASSERT0(db->db_offset);
	objset_t *os = dmu_buf_get_objset(db);
	uint64_t obj = db->db_object;
	dmu_object_info_t doi;

	*zapp = NULL;

	dmu_object_info_from_dnode(dn, &doi);
	if (DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP)
		return (SET_ERROR(EINVAL));

	/* Reuse the zap_t attached to the dbuf, or build one now. */
	zap_t *zap = dmu_buf_get_user(db);
	if (zap == NULL) {
		zap = mzap_open(db);
		if (zap == NULL) {
			/*
			 * mzap_open() didn't like what it saw on-disk.
			 * Check for corruption!
			 */
			return (SET_ERROR(EIO));
		}
	}

	/*
	 * We're checking zap_ismicro without the lock held, in order to
	 * tell what type of lock we want.  Once we have some sort of
	 * lock, see if it really is the right type.  In practice this
	 * can only be different if it was upgraded from micro to fat,
	 * and micro wanted WRITER but fat only needs READER.
	 */
	krw_t lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
	rw_enter(&zap->zap_rwlock, lt);
	if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
		/* it was upgraded, now we only need reader */
		ASSERT(lt == RW_WRITER);
		ASSERT(RW_READER ==
		    ((!zap->zap_ismicro && fatreader) ? RW_READER : lti));
		rw_downgrade(&zap->zap_rwlock);
		lt = RW_READER;
	}

	zap->zap_objset = os;
	zap->zap_dnode = dn;

	if (lt == RW_WRITER)
		dmu_buf_will_dirty(db, tx);

	ASSERT3P(zap->zap_dbuf, ==, db);

	ASSERT(!zap->zap_ismicro ||
	    zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks);
	/* A full microzap that is about to gain an entry needs more room. */
	if (zap->zap_ismicro && tx && adding &&
	    zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
		uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
		if (newsz > zap_micro_max_size) {
			dprintf("upgrading obj %llu: num_entries=%u\n",
			    (u_longlong_t)obj, zap->zap_m.zap_num_entries);
			*zapp = zap;
			int err = mzap_upgrade(zapp, tag, tx, 0);
			/* On failure the caller gets no lock back. */
			if (err != 0)
				rw_exit(&zap->zap_rwlock);
			return (err);
		}
		VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx));
		/* Recompute the chunk count from the dbuf size. */
		zap->zap_m.zap_num_chunks =
		    db->db_size / MZAP_ENT_LEN - 1;
	}

	*zapp = zap;
	return (0);
}
595
596static int
597zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx,
598    krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
599    zap_t **zapp)
600{
601	dmu_buf_t *db;
602	int err;
603
604	err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
605	if (err != 0)
606		return (err);
607	err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp);
608	if (err != 0)
609		dmu_buf_rele(db, tag);
610	else
611		VERIFY(dnode_add_ref(dn, tag));
612	return (err);
613}
614
615int
616zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
617    krw_t lti, boolean_t fatreader, boolean_t adding, const void *tag,
618    zap_t **zapp)
619{
620	dnode_t *dn;
621	dmu_buf_t *db;
622	int err;
623
624	err = dnode_hold(os, obj, tag, &dn);
625	if (err != 0)
626		return (err);
627	err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
628	if (err != 0) {
629		dnode_rele(dn, tag);
630		return (err);
631	}
632	err = zap_lockdir_impl(dn, db, tag, tx, lti, fatreader, adding, zapp);
633	if (err != 0) {
634		dmu_buf_rele(db, tag);
635		dnode_rele(dn, tag);
636	}
637	return (err);
638}
639
/*
 * Release a zap_t obtained from one of the zap_lockdir*() routines: drop
 * its rwlock, then release the dnode hold and the dbuf hold taken with
 * "tag".
 */
void
zap_unlockdir(zap_t *zap, const void *tag)
{
	rw_exit(&zap->zap_rwlock);
	dnode_rele(zap->zap_dnode, tag);
	dmu_buf_rele(zap->zap_dbuf, tag);
}
647
/*
 * Convert the microzap *zapp into a fat zap.
 *
 * The microzap block is copied aside, the in-core index is destroyed,
 * fzap_upgrade() is called, and every in-use entry from the copy is
 * re-added with fzap_add_cd() using its original cd.  When flags is 0 the
 * object's block size is first set to 1 << fzap_default_block_shift.
 *
 * The caller must hold the zap's rwlock as writer.  *zapp is updated because
 * fzap_add_cd() may change the zap.  Returns 0, or the error from
 * dmu_object_set_blocksize() (in which case nothing has been changed).
 */
static int
mzap_upgrade(zap_t **zapp, const void *tag, dmu_tx_t *tx, zap_flags_t flags)
{
	int err = 0;
	zap_t *zap = *zapp;

	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));

	/* Snapshot the microzap block before fzap_upgrade() is called. */
	int sz = zap->zap_dbuf->db_size;
	mzap_phys_t *mzp = vmem_alloc(sz, KM_SLEEP);
	memcpy(mzp, zap->zap_dbuf->db_data, sz);
	int nchunks = zap->zap_m.zap_num_chunks;

	if (!flags) {
		err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
		    1ULL << fzap_default_block_shift, 0, tx);
		if (err != 0) {
			vmem_free(mzp, sz);
			return (err);
		}
	}

	dprintf("upgrading obj=%llu with %u chunks\n",
	    (u_longlong_t)zap->zap_object, nchunks);
	/* XXX destroy the tree later, so we can use the stored hash value */
	mze_destroy(zap);

	fzap_upgrade(zap, tx, flags);

	/* Re-insert every in-use chunk from the saved copy. */
	zap_name_t *zn = zap_name_alloc(zap);
	for (int i = 0; i < nchunks; i++) {
		mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
		if (mze->mze_name[0] == 0)
			continue;
		dprintf("adding %s=%llu\n",
		    mze->mze_name, (u_longlong_t)mze->mze_value);
		/* NOTE(review): the return value is not checked here. */
		zap_name_init_str(zn, mze->mze_name, 0);
		/* If we fail here, we would end up losing entries */
		VERIFY0(fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd,
		    tag, tx));
		zap = zn->zn_zap;	/* fzap_add_cd() may change zap */
	}
	zap_name_free(zn);
	vmem_free(mzp, sz);
	*zapp = zap;
	return (0);
}
695
696/*
697 * The "normflags" determine the behavior of the matchtype_t which is
698 * passed to zap_lookup_norm().  Names which have the same normalized
699 * version will be stored with the same hash value, and therefore we can
700 * perform normalization-insensitive lookups.  We can be Unicode form-
701 * insensitive and/or case-insensitive.  The following flags are valid for
702 * "normflags":
703 *
704 * U8_TEXTPREP_NFC
705 * U8_TEXTPREP_NFD
706 * U8_TEXTPREP_NFKC
707 * U8_TEXTPREP_NFKD
708 * U8_TEXTPREP_TOUPPER
709 *
710 * The *_NF* (Normalization Form) flags are mutually exclusive; at most one
711 * of them may be supplied.
712 */
713void
714mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, dmu_tx_t *tx)
715{
716	dmu_buf_t *db;
717
718	VERIFY0(dmu_buf_hold_by_dnode(dn, 0, FTAG, &db, DMU_READ_NO_PREFETCH));
719
720	dmu_buf_will_dirty(db, tx);
721	mzap_phys_t *zp = db->db_data;
722	zp->mz_block_type = ZBT_MICRO;
723	zp->mz_salt =
724	    ((uintptr_t)db ^ (uintptr_t)tx ^ (dn->dn_object << 1)) | 1ULL;
725	zp->mz_normflags = normflags;
726
727	if (flags != 0) {
728		zap_t *zap;
729		/* Only fat zap supports flags; upgrade immediately. */
730		VERIFY(dnode_add_ref(dn, FTAG));
731		VERIFY0(zap_lockdir_impl(dn, db, FTAG, tx, RW_WRITER,
732		    B_FALSE, B_FALSE, &zap));
733		VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags));
734		zap_unlockdir(zap, FTAG);
735	} else {
736		dmu_buf_rele(db, FTAG);
737	}
738}
739
740static uint64_t
741zap_create_impl(objset_t *os, int normflags, zap_flags_t flags,
742    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
743    dmu_object_type_t bonustype, int bonuslen, int dnodesize,
744    dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
745{
746	uint64_t obj;
747
748	ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
749
750	if (allocated_dnode == NULL) {
751		dnode_t *dn;
752		obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift,
753		    indirect_blockshift, bonustype, bonuslen, dnodesize,
754		    &dn, FTAG, tx);
755		mzap_create_impl(dn, normflags, flags, tx);
756		dnode_rele(dn, FTAG);
757	} else {
758		obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift,
759		    indirect_blockshift, bonustype, bonuslen, dnodesize,
760		    allocated_dnode, tag, tx);
761		mzap_create_impl(*allocated_dnode, normflags, flags, tx);
762	}
763
764	return (obj);
765}
766
767int
768zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
769    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
770{
771	return (zap_create_claim_dnsize(os, obj, ot, bonustype, bonuslen,
772	    0, tx));
773}
774
775int
776zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot,
777    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
778{
779	return (zap_create_claim_norm_dnsize(os, obj,
780	    0, ot, bonustype, bonuslen, dnodesize, tx));
781}
782
783int
784zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags,
785    dmu_object_type_t ot,
786    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
787{
788	return (zap_create_claim_norm_dnsize(os, obj, normflags, ot, bonustype,
789	    bonuslen, 0, tx));
790}
791
792int
793zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags,
794    dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen,
795    int dnodesize, dmu_tx_t *tx)
796{
797	dnode_t *dn;
798	int error;
799
800	ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
801	error = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen,
802	    dnodesize, tx);
803	if (error != 0)
804		return (error);
805
806	error = dnode_hold(os, obj, FTAG, &dn);
807	if (error != 0)
808		return (error);
809
810	mzap_create_impl(dn, normflags, 0, tx);
811
812	dnode_rele(dn, FTAG);
813
814	return (0);
815}
816
817uint64_t
818zap_create(objset_t *os, dmu_object_type_t ot,
819    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
820{
821	return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx));
822}
823
824uint64_t
825zap_create_dnsize(objset_t *os, dmu_object_type_t ot,
826    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
827{
828	return (zap_create_norm_dnsize(os, 0, ot, bonustype, bonuslen,
829	    dnodesize, tx));
830}
831
832uint64_t
833zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot,
834    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
835{
836	return (zap_create_norm_dnsize(os, normflags, ot, bonustype, bonuslen,
837	    0, tx));
838}
839
840uint64_t
841zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot,
842    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
843{
844	return (zap_create_impl(os, normflags, 0, ot, 0, 0,
845	    bonustype, bonuslen, dnodesize, NULL, NULL, tx));
846}
847
848uint64_t
849zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
850    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
851    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
852{
853	return (zap_create_flags_dnsize(os, normflags, flags, ot,
854	    leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx));
855}
856
857uint64_t
858zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags,
859    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
860    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
861{
862	return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift,
863	    indirect_blockshift, bonustype, bonuslen, dnodesize, NULL, NULL,
864	    tx));
865}
866
867/*
868 * Create a zap object and return a pointer to the newly allocated dnode via
869 * the allocated_dnode argument.  The returned dnode will be held and the
870 * caller is responsible for releasing the hold by calling dnode_rele().
871 */
872uint64_t
873zap_create_hold(objset_t *os, int normflags, zap_flags_t flags,
874    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
875    dmu_object_type_t bonustype, int bonuslen, int dnodesize,
876    dnode_t **allocated_dnode, const void *tag, dmu_tx_t *tx)
877{
878	return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift,
879	    indirect_blockshift, bonustype, bonuslen, dnodesize,
880	    allocated_dnode, tag, tx));
881}
882
883int
884zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
885{
886	/*
887	 * dmu_object_free will free the object number and free the
888	 * data.  Freeing the data will cause our pageout function to be
889	 * called, which will destroy our data (zap_leaf_t's and zap_t).
890	 */
891
892	return (dmu_object_free(os, zapobj, tx));
893}
894
895void
896zap_evict_sync(void *dbu)
897{
898	zap_t *zap = dbu;
899
900	rw_destroy(&zap->zap_rwlock);
901
902	if (zap->zap_ismicro)
903		mze_destroy(zap);
904	else
905		mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
906
907	kmem_free(zap, sizeof (zap_t));
908}
909
910int
911zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
912{
913	zap_t *zap;
914
915	int err =
916	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
917	if (err != 0)
918		return (err);
919	if (!zap->zap_ismicro) {
920		err = fzap_count(zap, count);
921	} else {
922		*count = zap->zap_m.zap_num_entries;
923	}
924	zap_unlockdir(zap, FTAG);
925	return (err);
926}
927
928/*
929 * zn may be NULL; if not specified, it will be computed if needed.
930 * See also the comment above zap_entry_normalization_conflict().
931 */
932static boolean_t
933mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze,
934    zfs_btree_index_t *idx)
935{
936	boolean_t allocdzn = B_FALSE;
937	mzap_ent_t *other;
938	zfs_btree_index_t oidx;
939
940	if (zap->zap_normflags == 0)
941		return (B_FALSE);
942
943	for (other = zfs_btree_prev(&zap->zap_m.zap_tree, idx, &oidx);
944	    other && other->mze_hash == mze->mze_hash;
945	    other = zfs_btree_prev(&zap->zap_m.zap_tree, &oidx, &oidx)) {
946
947		if (zn == NULL) {
948			zn = zap_name_alloc_str(zap,
949			    MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE);
950			allocdzn = B_TRUE;
951		}
952		if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
953			if (allocdzn)
954				zap_name_free(zn);
955			return (B_TRUE);
956		}
957	}
958
959	for (other = zfs_btree_next(&zap->zap_m.zap_tree, idx, &oidx);
960	    other && other->mze_hash == mze->mze_hash;
961	    other = zfs_btree_next(&zap->zap_m.zap_tree, &oidx, &oidx)) {
962
963		if (zn == NULL) {
964			zn = zap_name_alloc_str(zap,
965			    MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE);
966			allocdzn = B_TRUE;
967		}
968		if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
969			if (allocdzn)
970				zap_name_free(zn);
971			return (B_TRUE);
972		}
973	}
974
975	if (allocdzn)
976		zap_name_free(zn);
977	return (B_FALSE);
978}
979
980/*
981 * Routines for manipulating attributes.
982 */
983
984int
985zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
986    uint64_t integer_size, uint64_t num_integers, void *buf)
987{
988	return (zap_lookup_norm(os, zapobj, name, integer_size,
989	    num_integers, buf, 0, NULL, 0, NULL));
990}
991
992static int
993zap_lookup_impl(zap_t *zap, const char *name,
994    uint64_t integer_size, uint64_t num_integers, void *buf,
995    matchtype_t mt, char *realname, int rn_len,
996    boolean_t *ncp)
997{
998	int err = 0;
999
1000	zap_name_t *zn = zap_name_alloc_str(zap, name, mt);
1001	if (zn == NULL)
1002		return (SET_ERROR(ENOTSUP));
1003
1004	if (!zap->zap_ismicro) {
1005		err = fzap_lookup(zn, integer_size, num_integers, buf,
1006		    realname, rn_len, ncp);
1007	} else {
1008		zfs_btree_index_t idx;
1009		mzap_ent_t *mze = mze_find(zn, &idx);
1010		if (mze == NULL) {
1011			err = SET_ERROR(ENOENT);
1012		} else {
1013			if (num_integers < 1) {
1014				err = SET_ERROR(EOVERFLOW);
1015			} else if (integer_size != 8) {
1016				err = SET_ERROR(EINVAL);
1017			} else {
1018				*(uint64_t *)buf =
1019				    MZE_PHYS(zap, mze)->mze_value;
1020				if (realname != NULL)
1021					(void) strlcpy(realname,
1022					    MZE_PHYS(zap, mze)->mze_name,
1023					    rn_len);
1024				if (ncp) {
1025					*ncp = mzap_normalization_conflict(zap,
1026					    zn, mze, &idx);
1027				}
1028			}
1029		}
1030	}
1031	zap_name_free(zn);
1032	return (err);
1033}
1034
1035int
1036zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
1037    uint64_t integer_size, uint64_t num_integers, void *buf,
1038    matchtype_t mt, char *realname, int rn_len,
1039    boolean_t *ncp)
1040{
1041	zap_t *zap;
1042
1043	int err =
1044	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1045	if (err != 0)
1046		return (err);
1047	err = zap_lookup_impl(zap, name, integer_size,
1048	    num_integers, buf, mt, realname, rn_len, ncp);
1049	zap_unlockdir(zap, FTAG);
1050	return (err);
1051}
1052
1053int
1054zap_prefetch(objset_t *os, uint64_t zapobj, const char *name)
1055{
1056	zap_t *zap;
1057	int err;
1058	zap_name_t *zn;
1059
1060	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1061	if (err)
1062		return (err);
1063	zn = zap_name_alloc_str(zap, name, 0);
1064	if (zn == NULL) {
1065		zap_unlockdir(zap, FTAG);
1066		return (SET_ERROR(ENOTSUP));
1067	}
1068
1069	fzap_prefetch(zn);
1070	zap_name_free(zn);
1071	zap_unlockdir(zap, FTAG);
1072	return (err);
1073}
1074
1075int
1076zap_lookup_by_dnode(dnode_t *dn, const char *name,
1077    uint64_t integer_size, uint64_t num_integers, void *buf)
1078{
1079	return (zap_lookup_norm_by_dnode(dn, name, integer_size,
1080	    num_integers, buf, 0, NULL, 0, NULL));
1081}
1082
1083int
1084zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
1085    uint64_t integer_size, uint64_t num_integers, void *buf,
1086    matchtype_t mt, char *realname, int rn_len,
1087    boolean_t *ncp)
1088{
1089	zap_t *zap;
1090
1091	int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
1092	    FTAG, &zap);
1093	if (err != 0)
1094		return (err);
1095	err = zap_lookup_impl(zap, name, integer_size,
1096	    num_integers, buf, mt, realname, rn_len, ncp);
1097	zap_unlockdir(zap, FTAG);
1098	return (err);
1099}
1100
1101int
1102zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1103    int key_numints)
1104{
1105	zap_t *zap;
1106
1107	int err =
1108	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1109	if (err != 0)
1110		return (err);
1111	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1112	if (zn == NULL) {
1113		zap_unlockdir(zap, FTAG);
1114		return (SET_ERROR(ENOTSUP));
1115	}
1116
1117	fzap_prefetch(zn);
1118	zap_name_free(zn);
1119	zap_unlockdir(zap, FTAG);
1120	return (err);
1121}
1122
1123int
1124zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1125    int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
1126{
1127	zap_t *zap;
1128
1129	int err =
1130	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1131	if (err != 0)
1132		return (err);
1133	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1134	if (zn == NULL) {
1135		zap_unlockdir(zap, FTAG);
1136		return (SET_ERROR(ENOTSUP));
1137	}
1138
1139	err = fzap_lookup(zn, integer_size, num_integers, buf,
1140	    NULL, 0, NULL);
1141	zap_name_free(zn);
1142	zap_unlockdir(zap, FTAG);
1143	return (err);
1144}
1145
1146int
1147zap_contains(objset_t *os, uint64_t zapobj, const char *name)
1148{
1149	int err = zap_lookup_norm(os, zapobj, name, 0,
1150	    0, NULL, 0, NULL, 0, NULL);
1151	if (err == EOVERFLOW || err == EINVAL)
1152		err = 0; /* found, but skipped reading the value */
1153	return (err);
1154}
1155
1156int
1157zap_length(objset_t *os, uint64_t zapobj, const char *name,
1158    uint64_t *integer_size, uint64_t *num_integers)
1159{
1160	zap_t *zap;
1161
1162	int err =
1163	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1164	if (err != 0)
1165		return (err);
1166	zap_name_t *zn = zap_name_alloc_str(zap, name, 0);
1167	if (zn == NULL) {
1168		zap_unlockdir(zap, FTAG);
1169		return (SET_ERROR(ENOTSUP));
1170	}
1171	if (!zap->zap_ismicro) {
1172		err = fzap_length(zn, integer_size, num_integers);
1173	} else {
1174		zfs_btree_index_t idx;
1175		mzap_ent_t *mze = mze_find(zn, &idx);
1176		if (mze == NULL) {
1177			err = SET_ERROR(ENOENT);
1178		} else {
1179			if (integer_size)
1180				*integer_size = 8;
1181			if (num_integers)
1182				*num_integers = 1;
1183		}
1184	}
1185	zap_name_free(zn);
1186	zap_unlockdir(zap, FTAG);
1187	return (err);
1188}
1189
1190int
1191zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1192    int key_numints, uint64_t *integer_size, uint64_t *num_integers)
1193{
1194	zap_t *zap;
1195
1196	int err =
1197	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1198	if (err != 0)
1199		return (err);
1200	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1201	if (zn == NULL) {
1202		zap_unlockdir(zap, FTAG);
1203		return (SET_ERROR(ENOTSUP));
1204	}
1205	err = fzap_length(zn, integer_size, num_integers);
1206	zap_name_free(zn);
1207	zap_unlockdir(zap, FTAG);
1208	return (err);
1209}
1210
1211static void
1212mzap_addent(zap_name_t *zn, uint64_t value)
1213{
1214	zap_t *zap = zn->zn_zap;
1215	uint16_t start = zap->zap_m.zap_alloc_next;
1216
1217	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
1218
1219#ifdef ZFS_DEBUG
1220	for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) {
1221		mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
1222		ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0);
1223	}
1224#endif
1225
1226	uint32_t cd = mze_find_unused_cd(zap, zn->zn_hash);
1227	/* given the limited size of the microzap, this can't happen */
1228	ASSERT(cd < zap_maxcd(zap));
1229
1230again:
1231	for (uint16_t i = start; i < zap->zap_m.zap_num_chunks; i++) {
1232		mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
1233		if (mze->mze_name[0] == 0) {
1234			mze->mze_value = value;
1235			mze->mze_cd = cd;
1236			(void) strlcpy(mze->mze_name, zn->zn_key_orig,
1237			    sizeof (mze->mze_name));
1238			zap->zap_m.zap_num_entries++;
1239			zap->zap_m.zap_alloc_next = i+1;
1240			if (zap->zap_m.zap_alloc_next ==
1241			    zap->zap_m.zap_num_chunks)
1242				zap->zap_m.zap_alloc_next = 0;
1243			mze_insert(zap, i, zn->zn_hash);
1244			return;
1245		}
1246	}
1247	if (start != 0) {
1248		start = 0;
1249		goto again;
1250	}
1251	cmn_err(CE_PANIC, "out of entries!");
1252}
1253
1254static int
1255zap_add_impl(zap_t *zap, const char *key,
1256    int integer_size, uint64_t num_integers,
1257    const void *val, dmu_tx_t *tx, const void *tag)
1258{
1259	const uint64_t *intval = val;
1260	int err = 0;
1261
1262	zap_name_t *zn = zap_name_alloc_str(zap, key, 0);
1263	if (zn == NULL) {
1264		zap_unlockdir(zap, tag);
1265		return (SET_ERROR(ENOTSUP));
1266	}
1267	if (!zap->zap_ismicro) {
1268		err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
1269		zap = zn->zn_zap;	/* fzap_add() may change zap */
1270	} else if (integer_size != 8 || num_integers != 1 ||
1271	    strlen(key) >= MZAP_NAME_LEN ||
1272	    !mze_canfit_fzap_leaf(zn, zn->zn_hash)) {
1273		err = mzap_upgrade(&zn->zn_zap, tag, tx, 0);
1274		if (err == 0) {
1275			err = fzap_add(zn, integer_size, num_integers, val,
1276			    tag, tx);
1277		}
1278		zap = zn->zn_zap;	/* fzap_add() may change zap */
1279	} else {
1280		zfs_btree_index_t idx;
1281		if (mze_find(zn, &idx) != NULL) {
1282			err = SET_ERROR(EEXIST);
1283		} else {
1284			mzap_addent(zn, *intval);
1285		}
1286	}
1287	ASSERT(zap == zn->zn_zap);
1288	zap_name_free(zn);
1289	if (zap != NULL)	/* may be NULL if fzap_add() failed */
1290		zap_unlockdir(zap, tag);
1291	return (err);
1292}
1293
1294int
1295zap_add(objset_t *os, uint64_t zapobj, const char *key,
1296    int integer_size, uint64_t num_integers,
1297    const void *val, dmu_tx_t *tx)
1298{
1299	zap_t *zap;
1300	int err;
1301
1302	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1303	if (err != 0)
1304		return (err);
1305	err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
1306	/* zap_add_impl() calls zap_unlockdir() */
1307	return (err);
1308}
1309
1310int
1311zap_add_by_dnode(dnode_t *dn, const char *key,
1312    int integer_size, uint64_t num_integers,
1313    const void *val, dmu_tx_t *tx)
1314{
1315	zap_t *zap;
1316	int err;
1317
1318	err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1319	if (err != 0)
1320		return (err);
1321	err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
1322	/* zap_add_impl() calls zap_unlockdir() */
1323	return (err);
1324}
1325
1326static int
1327zap_add_uint64_impl(zap_t *zap, const uint64_t *key,
1328    int key_numints, int integer_size, uint64_t num_integers,
1329    const void *val, dmu_tx_t *tx, const void *tag)
1330{
1331	int err;
1332
1333	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1334	if (zn == NULL) {
1335		zap_unlockdir(zap, tag);
1336		return (SET_ERROR(ENOTSUP));
1337	}
1338	err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
1339	zap = zn->zn_zap;	/* fzap_add() may change zap */
1340	zap_name_free(zn);
1341	if (zap != NULL)	/* may be NULL if fzap_add() failed */
1342		zap_unlockdir(zap, tag);
1343	return (err);
1344}
1345
1346int
1347zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1348    int key_numints, int integer_size, uint64_t num_integers,
1349    const void *val, dmu_tx_t *tx)
1350{
1351	zap_t *zap;
1352
1353	int err =
1354	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1355	if (err != 0)
1356		return (err);
1357	err = zap_add_uint64_impl(zap, key, key_numints,
1358	    integer_size, num_integers, val, tx, FTAG);
1359	/* zap_add_uint64_impl() calls zap_unlockdir() */
1360	return (err);
1361}
1362
1363int
1364zap_add_uint64_by_dnode(dnode_t *dn, const uint64_t *key,
1365    int key_numints, int integer_size, uint64_t num_integers,
1366    const void *val, dmu_tx_t *tx)
1367{
1368	zap_t *zap;
1369
1370	int err =
1371	    zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1372	if (err != 0)
1373		return (err);
1374	err = zap_add_uint64_impl(zap, key, key_numints,
1375	    integer_size, num_integers, val, tx, FTAG);
1376	/* zap_add_uint64_impl() calls zap_unlockdir() */
1377	return (err);
1378}
1379
1380int
1381zap_update(objset_t *os, uint64_t zapobj, const char *name,
1382    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
1383{
1384	zap_t *zap;
1385	const uint64_t *intval = val;
1386
1387	int err =
1388	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1389	if (err != 0)
1390		return (err);
1391	zap_name_t *zn = zap_name_alloc_str(zap, name, 0);
1392	if (zn == NULL) {
1393		zap_unlockdir(zap, FTAG);
1394		return (SET_ERROR(ENOTSUP));
1395	}
1396	if (!zap->zap_ismicro) {
1397		err = fzap_update(zn, integer_size, num_integers, val,
1398		    FTAG, tx);
1399		zap = zn->zn_zap;	/* fzap_update() may change zap */
1400	} else if (integer_size != 8 || num_integers != 1 ||
1401	    strlen(name) >= MZAP_NAME_LEN) {
1402		dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
1403		    (u_longlong_t)zapobj, integer_size,
1404		    (u_longlong_t)num_integers, name);
1405		err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0);
1406		if (err == 0) {
1407			err = fzap_update(zn, integer_size, num_integers,
1408			    val, FTAG, tx);
1409		}
1410		zap = zn->zn_zap;	/* fzap_update() may change zap */
1411	} else {
1412		zfs_btree_index_t idx;
1413		mzap_ent_t *mze = mze_find(zn, &idx);
1414		if (mze != NULL) {
1415			MZE_PHYS(zap, mze)->mze_value = *intval;
1416		} else {
1417			mzap_addent(zn, *intval);
1418		}
1419	}
1420	ASSERT(zap == zn->zn_zap);
1421	zap_name_free(zn);
1422	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
1423		zap_unlockdir(zap, FTAG);
1424	return (err);
1425}
1426
1427static int
1428zap_update_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
1429    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx,
1430    const void *tag)
1431{
1432	int err;
1433
1434	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1435	if (zn == NULL) {
1436		zap_unlockdir(zap, tag);
1437		return (SET_ERROR(ENOTSUP));
1438	}
1439	err = fzap_update(zn, integer_size, num_integers, val, tag, tx);
1440	zap = zn->zn_zap;	/* fzap_update() may change zap */
1441	zap_name_free(zn);
1442	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
1443		zap_unlockdir(zap, tag);
1444	return (err);
1445}
1446
1447int
1448zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1449    int key_numints, int integer_size, uint64_t num_integers, const void *val,
1450    dmu_tx_t *tx)
1451{
1452	zap_t *zap;
1453
1454	int err =
1455	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1456	if (err != 0)
1457		return (err);
1458	err = zap_update_uint64_impl(zap, key, key_numints,
1459	    integer_size, num_integers, val, tx, FTAG);
1460	/* zap_update_uint64_impl() calls zap_unlockdir() */
1461	return (err);
1462}
1463
1464int
1465zap_update_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints,
1466    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
1467{
1468	zap_t *zap;
1469
1470	int err =
1471	    zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
1472	if (err != 0)
1473		return (err);
1474	err = zap_update_uint64_impl(zap, key, key_numints,
1475	    integer_size, num_integers, val, tx, FTAG);
1476	/* zap_update_uint64_impl() calls zap_unlockdir() */
1477	return (err);
1478}
1479
1480int
1481zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
1482{
1483	return (zap_remove_norm(os, zapobj, name, 0, tx));
1484}
1485
1486static int
1487zap_remove_impl(zap_t *zap, const char *name,
1488    matchtype_t mt, dmu_tx_t *tx)
1489{
1490	int err = 0;
1491
1492	zap_name_t *zn = zap_name_alloc_str(zap, name, mt);
1493	if (zn == NULL)
1494		return (SET_ERROR(ENOTSUP));
1495	if (!zap->zap_ismicro) {
1496		err = fzap_remove(zn, tx);
1497	} else {
1498		zfs_btree_index_t idx;
1499		mzap_ent_t *mze = mze_find(zn, &idx);
1500		if (mze == NULL) {
1501			err = SET_ERROR(ENOENT);
1502		} else {
1503			zap->zap_m.zap_num_entries--;
1504			memset(MZE_PHYS(zap, mze), 0, sizeof (mzap_ent_phys_t));
1505			zfs_btree_remove_idx(&zap->zap_m.zap_tree, &idx);
1506		}
1507	}
1508	zap_name_free(zn);
1509	return (err);
1510}
1511
1512int
1513zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
1514    matchtype_t mt, dmu_tx_t *tx)
1515{
1516	zap_t *zap;
1517	int err;
1518
1519	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
1520	if (err)
1521		return (err);
1522	err = zap_remove_impl(zap, name, mt, tx);
1523	zap_unlockdir(zap, FTAG);
1524	return (err);
1525}
1526
1527int
1528zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx)
1529{
1530	zap_t *zap;
1531	int err;
1532
1533	err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
1534	if (err)
1535		return (err);
1536	err = zap_remove_impl(zap, name, 0, tx);
1537	zap_unlockdir(zap, FTAG);
1538	return (err);
1539}
1540
1541static int
1542zap_remove_uint64_impl(zap_t *zap, const uint64_t *key, int key_numints,
1543    dmu_tx_t *tx, const void *tag)
1544{
1545	int err;
1546
1547	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
1548	if (zn == NULL) {
1549		zap_unlockdir(zap, tag);
1550		return (SET_ERROR(ENOTSUP));
1551	}
1552	err = fzap_remove(zn, tx);
1553	zap_name_free(zn);
1554	zap_unlockdir(zap, tag);
1555	return (err);
1556}
1557
1558int
1559zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
1560    int key_numints, dmu_tx_t *tx)
1561{
1562	zap_t *zap;
1563
1564	int err =
1565	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
1566	if (err != 0)
1567		return (err);
1568	err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG);
1569	/* zap_remove_uint64_impl() calls zap_unlockdir() */
1570	return (err);
1571}
1572
1573int
1574zap_remove_uint64_by_dnode(dnode_t *dn, const uint64_t *key, int key_numints,
1575    dmu_tx_t *tx)
1576{
1577	zap_t *zap;
1578
1579	int err =
1580	    zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
1581	if (err != 0)
1582		return (err);
1583	err = zap_remove_uint64_impl(zap, key, key_numints, tx, FTAG);
1584	/* zap_remove_uint64_impl() calls zap_unlockdir() */
1585	return (err);
1586}
1587
1588/*
1589 * Routines for iterating over the attributes.
1590 */
1591
1592static void
1593zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
1594    uint64_t serialized, boolean_t prefetch)
1595{
1596	zc->zc_objset = os;
1597	zc->zc_zap = NULL;
1598	zc->zc_leaf = NULL;
1599	zc->zc_zapobj = zapobj;
1600	zc->zc_serialized = serialized;
1601	zc->zc_hash = 0;
1602	zc->zc_cd = 0;
1603	zc->zc_prefetch = prefetch;
1604}
1605void
1606zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
1607    uint64_t serialized)
1608{
1609	zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE);
1610}
1611
1612/*
1613 * Initialize a cursor at the beginning of the ZAP object.  The entire
1614 * ZAP object will be prefetched.
1615 */
1616void
1617zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
1618{
1619	zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE);
1620}
1621
1622/*
1623 * Initialize a cursor at the beginning, but request that we not prefetch
1624 * the entire ZAP object.
1625 */
1626void
1627zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
1628{
1629	zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE);
1630}
1631
1632void
1633zap_cursor_fini(zap_cursor_t *zc)
1634{
1635	if (zc->zc_zap) {
1636		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
1637		zap_unlockdir(zc->zc_zap, NULL);
1638		zc->zc_zap = NULL;
1639	}
1640	if (zc->zc_leaf) {
1641		rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
1642		zap_put_leaf(zc->zc_leaf);
1643		zc->zc_leaf = NULL;
1644	}
1645	zc->zc_objset = NULL;
1646}
1647
1648uint64_t
1649zap_cursor_serialize(zap_cursor_t *zc)
1650{
1651	if (zc->zc_hash == -1ULL)
1652		return (-1ULL);
1653	if (zc->zc_zap == NULL)
1654		return (zc->zc_serialized);
1655	ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0);
1656	ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap));
1657
1658	/*
1659	 * We want to keep the high 32 bits of the cursor zero if we can, so
1660	 * that 32-bit programs can access this.  So usually use a small
1661	 * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits
1662	 * of the cursor.
1663	 *
1664	 * [ collision differentiator | zap_hashbits()-bit hash value ]
1665	 */
1666	return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) |
1667	    ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap)));
1668}
1669
1670int
1671zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
1672{
1673	int err;
1674
1675	if (zc->zc_hash == -1ULL)
1676		return (SET_ERROR(ENOENT));
1677
1678	if (zc->zc_zap == NULL) {
1679		int hb;
1680		err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
1681		    RW_READER, TRUE, FALSE, NULL, &zc->zc_zap);
1682		if (err != 0)
1683			return (err);
1684
1685		/*
1686		 * To support zap_cursor_init_serialized, advance, retrieve,
1687		 * we must add to the existing zc_cd, which may already
1688		 * be 1 due to the zap_cursor_advance.
1689		 */
1690		ASSERT(zc->zc_hash == 0);
1691		hb = zap_hashbits(zc->zc_zap);
1692		zc->zc_hash = zc->zc_serialized << (64 - hb);
1693		zc->zc_cd += zc->zc_serialized >> hb;
1694		if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */
1695			zc->zc_cd = 0;
1696	} else {
1697		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
1698	}
1699	if (!zc->zc_zap->zap_ismicro) {
1700		err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
1701	} else {
1702		zfs_btree_index_t idx;
1703		mzap_ent_t mze_tofind;
1704
1705		mze_tofind.mze_hash = zc->zc_hash >> 32;
1706		mze_tofind.mze_cd = zc->zc_cd;
1707
1708		mzap_ent_t *mze = zfs_btree_find(&zc->zc_zap->zap_m.zap_tree,
1709		    &mze_tofind, &idx);
1710		if (mze == NULL) {
1711			mze = zfs_btree_next(&zc->zc_zap->zap_m.zap_tree,
1712			    &idx, &idx);
1713		}
1714		if (mze) {
1715			mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze);
1716			ASSERT3U(mze->mze_cd, ==, mzep->mze_cd);
1717			za->za_normalization_conflict =
1718			    mzap_normalization_conflict(zc->zc_zap, NULL,
1719			    mze, &idx);
1720			za->za_integer_length = 8;
1721			za->za_num_integers = 1;
1722			za->za_first_integer = mzep->mze_value;
1723			(void) strlcpy(za->za_name, mzep->mze_name,
1724			    sizeof (za->za_name));
1725			zc->zc_hash = (uint64_t)mze->mze_hash << 32;
1726			zc->zc_cd = mze->mze_cd;
1727			err = 0;
1728		} else {
1729			zc->zc_hash = -1ULL;
1730			err = SET_ERROR(ENOENT);
1731		}
1732	}
1733	rw_exit(&zc->zc_zap->zap_rwlock);
1734	return (err);
1735}
1736
1737void
1738zap_cursor_advance(zap_cursor_t *zc)
1739{
1740	if (zc->zc_hash == -1ULL)
1741		return;
1742	zc->zc_cd++;
1743}
1744
1745int
1746zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
1747{
1748	zap_t *zap;
1749
1750	int err =
1751	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
1752	if (err != 0)
1753		return (err);
1754
1755	memset(zs, 0, sizeof (zap_stats_t));
1756
1757	if (zap->zap_ismicro) {
1758		zs->zs_blocksize = zap->zap_dbuf->db_size;
1759		zs->zs_num_entries = zap->zap_m.zap_num_entries;
1760		zs->zs_num_blocks = 1;
1761	} else {
1762		fzap_get_stats(zap, zs);
1763	}
1764	zap_unlockdir(zap, FTAG);
1765	return (0);
1766}
1767
#if defined(_KERNEL)
/* Kernel builds only: ZAP entry points passed to EXPORT_SYMBOL(). */
EXPORT_SYMBOL(zap_create);
EXPORT_SYMBOL(zap_create_dnsize);
EXPORT_SYMBOL(zap_create_norm);
EXPORT_SYMBOL(zap_create_norm_dnsize);
EXPORT_SYMBOL(zap_create_flags);
EXPORT_SYMBOL(zap_create_flags_dnsize);
EXPORT_SYMBOL(zap_create_claim);
EXPORT_SYMBOL(zap_create_claim_norm);
EXPORT_SYMBOL(zap_create_claim_norm_dnsize);
EXPORT_SYMBOL(zap_create_hold);
EXPORT_SYMBOL(zap_destroy);
EXPORT_SYMBOL(zap_lookup);
EXPORT_SYMBOL(zap_lookup_by_dnode);
EXPORT_SYMBOL(zap_lookup_norm);
EXPORT_SYMBOL(zap_lookup_uint64);
EXPORT_SYMBOL(zap_contains);
EXPORT_SYMBOL(zap_prefetch);
EXPORT_SYMBOL(zap_prefetch_uint64);
EXPORT_SYMBOL(zap_add);
EXPORT_SYMBOL(zap_add_by_dnode);
EXPORT_SYMBOL(zap_add_uint64);
EXPORT_SYMBOL(zap_add_uint64_by_dnode);
EXPORT_SYMBOL(zap_update);
EXPORT_SYMBOL(zap_update_uint64);
EXPORT_SYMBOL(zap_update_uint64_by_dnode);
EXPORT_SYMBOL(zap_length);
EXPORT_SYMBOL(zap_length_uint64);
EXPORT_SYMBOL(zap_remove);
EXPORT_SYMBOL(zap_remove_by_dnode);
EXPORT_SYMBOL(zap_remove_norm);
EXPORT_SYMBOL(zap_remove_uint64);
EXPORT_SYMBOL(zap_remove_uint64_by_dnode);
EXPORT_SYMBOL(zap_count);
EXPORT_SYMBOL(zap_value_search);
EXPORT_SYMBOL(zap_join);
EXPORT_SYMBOL(zap_join_increment);
EXPORT_SYMBOL(zap_add_int);
EXPORT_SYMBOL(zap_remove_int);
EXPORT_SYMBOL(zap_lookup_int);
EXPORT_SYMBOL(zap_increment_int);
EXPORT_SYMBOL(zap_add_int_key);
EXPORT_SYMBOL(zap_lookup_int_key);
EXPORT_SYMBOL(zap_increment);
EXPORT_SYMBOL(zap_cursor_init);
EXPORT_SYMBOL(zap_cursor_fini);
EXPORT_SYMBOL(zap_cursor_retrieve);
EXPORT_SYMBOL(zap_cursor_advance);
EXPORT_SYMBOL(zap_cursor_serialize);
EXPORT_SYMBOL(zap_cursor_init_serialized);
EXPORT_SYMBOL(zap_get_stats);

/*
 * Register the zap_micro_max_size global (defined near the top of this
 * file) as an INT module parameter with the ZMOD_RW flag.
 */
/* CSTYLED */
ZFS_MODULE_PARAM(zfs, , zap_micro_max_size, INT, ZMOD_RW,
	"Maximum micro ZAP size, before converting to a fat ZAP, in bytes");
#endif /* _KERNEL */
1824