/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 * Copyright 2017 Nexenta Systems, Inc.
 */

#include <sys/zio.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/zfs_context.h>
#include <sys/zap.h>
#include <sys/zap_impl.h>
#include <sys/zap_leaf.h>
#include <sys/avl.h>
#include <sys/arc.h>
#include <sys/dmu_objset.h>

#ifdef _KERNEL
#include <sys/sunddi.h>
#endif

extern inline mzap_phys_t *zap_m_phys(zap_t *zap);

static int mzap_upgrade(zap_t **zapp,
    void *tag, dmu_tx_t *tx, zap_flags_t flags);

uint64_t
zap_getflags(zap_t *zap)
{
	if (zap->zap_ismicro)
		return (0);
	return (zap_f_phys(zap)->zap_flags);
}

int
zap_hashbits(zap_t *zap)
{
	if (zap_getflags(zap) & ZAP_FLAG_HASH64)
		return (48);
	else
		return (28);
}

uint32_t
zap_maxcd(zap_t *zap)
{
	if (zap_getflags(zap) & ZAP_FLAG_HASH64)
		return ((1<<16)-1);
	else
		return (-1U);
}

static uint64_t
zap_hash(zap_name_t *zn)
{
	zap_t *zap = zn->zn_zap;
	uint64_t h = 0;

	if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) {
		ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY);
		h = *(uint64_t *)zn->zn_key_orig;
	} else {
		h = zap->zap_salt;
		ASSERT(h != 0);
		ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);

		if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
			const uint64_t *wp = zn->zn_key_norm;

			ASSERT(zn->zn_key_intlen == 8);
			for (int i = 0; i < zn->zn_key_norm_numints;
			    wp++, i++) {
				uint64_t word = *wp;

				for (int j = 0; j < zn->zn_key_intlen; j++) {
					h = (h >> 8) ^
					    zfs_crc64_table[(h ^ word) & 0xFF];
					word >>= NBBY;
				}
			}
		} else {
			const uint8_t *cp = zn->zn_key_norm;

			/*
			 * We previously stored the terminating null on
			 * disk, but didn't hash it, so we need to
			 * continue to not hash it.  (The
			 * zn_key_*_numints includes the terminating
			 * null for non-binary keys.)
			 */
			int len = zn->zn_key_norm_numints - 1;

			ASSERT(zn->zn_key_intlen == 1);
			for (int i = 0; i < len; cp++, i++) {
				h = (h >> 8) ^
				    zfs_crc64_table[(h ^ *cp) & 0xFF];
			}
		}
	}
	/*
	 * Don't use all 64 bits, since we need some in the cookie for
	 * the collision differentiator.  We MUST use the high bits,
	 * since those are the ones that we first pay attention to when
	 * choosing the bucket.
	 */
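	/*
	 * Illustrative example (not part of the original code): with the
	 * default 28-bit hash, zap_hashbits() == 28, so the mask below
	 * clears the low 36 bits, e.g. 0x123456789abcdef0 becomes
	 * 0x1234567000000000.
	 */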
	h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1);

	return (h);
}

static int
zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags)
{
	ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY));

	size_t inlen = strlen(name) + 1;
	size_t outlen = ZAP_MAXNAMELEN;

	int err = 0;
	(void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen,
	    normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID,
	    U8_UNICODE_LATEST, &err);

	return (err);
}
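
/*
 * Illustrative note (not from the original source): with
 * U8_TEXTPREP_TOUPPER in normflags, a name such as "Foo" is expected to
 * normalize to "FOO", which is what makes case-insensitive matching and
 * hashing possible.
 */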

boolean_t
zap_match(zap_name_t *zn, const char *matchname)
{
	ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY));

	if (zn->zn_matchtype & MT_NORMALIZE) {
		char norm[ZAP_MAXNAMELEN];

		if (zap_normalize(zn->zn_zap, matchname, norm,
		    zn->zn_normflags) != 0)
			return (B_FALSE);

		return (strcmp(zn->zn_key_norm, norm) == 0);
	} else {
		return (strcmp(zn->zn_key_orig, matchname) == 0);
	}
}

void
zap_name_free(zap_name_t *zn)
{
	kmem_free(zn, sizeof (zap_name_t));
}

zap_name_t *
zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt)
{
	zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);

	zn->zn_zap = zap;
	zn->zn_key_intlen = sizeof (*key);
	zn->zn_key_orig = key;
	zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1;
	zn->zn_matchtype = mt;
	zn->zn_normflags = zap->zap_normflags;

	/*
	 * If we're dealing with a case-sensitive lookup on a mixed or
	 * case-insensitive fs, remove U8_TEXTPREP_TOUPPER, or the lookup
	 * will fold case to all caps, overriding the lookup request.
	 */
	if (mt & MT_MATCH_CASE)
		zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER;

	if (zap->zap_normflags) {
		/*
		 * We *must* use zap_normflags because this normalization is
		 * what the hash is computed from.
		 */
		if (zap_normalize(zap, key, zn->zn_normbuf,
		    zap->zap_normflags) != 0) {
			zap_name_free(zn);
			return (NULL);
		}
		zn->zn_key_norm = zn->zn_normbuf;
		zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
	} else {
		if (mt != 0) {
			zap_name_free(zn);
			return (NULL);
		}
		zn->zn_key_norm = zn->zn_key_orig;
		zn->zn_key_norm_numints = zn->zn_key_orig_numints;
	}

	zn->zn_hash = zap_hash(zn);

	if (zap->zap_normflags != zn->zn_normflags) {
		/*
		 * We *must* use zn_normflags because this normalization is
		 * what the matching is based on.  (Not the hash!)
		 */
		if (zap_normalize(zap, key, zn->zn_normbuf,
		    zn->zn_normflags) != 0) {
			zap_name_free(zn);
			return (NULL);
		}
		zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1;
	}

	return (zn);
}

static zap_name_t *
zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints)
{
	zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);

	ASSERT(zap->zap_normflags == 0);
	zn->zn_zap = zap;
	zn->zn_key_intlen = sizeof (*key);
	zn->zn_key_orig = zn->zn_key_norm = key;
	zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints;
	zn->zn_matchtype = 0;

	zn->zn_hash = zap_hash(zn);
	return (zn);
}

static void
mzap_byteswap(mzap_phys_t *buf, size_t size)
{
	buf->mz_block_type = BSWAP_64(buf->mz_block_type);
	buf->mz_salt = BSWAP_64(buf->mz_salt);
	buf->mz_normflags = BSWAP_64(buf->mz_normflags);
	int max = (size / MZAP_ENT_LEN) - 1;
	for (int i = 0; i < max; i++) {
		buf->mz_chunk[i].mze_value =
		    BSWAP_64(buf->mz_chunk[i].mze_value);
		buf->mz_chunk[i].mze_cd =
		    BSWAP_32(buf->mz_chunk[i].mze_cd);
	}
}

void
zap_byteswap(void *buf, size_t size)
{
	uint64_t block_type = *(uint64_t *)buf;

	if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) {
		/* ASSERT(magic == ZAP_LEAF_MAGIC); */
		mzap_byteswap(buf, size);
	} else {
		fzap_byteswap(buf, size);
	}
}

static int
mze_compare(const void *arg1, const void *arg2)
{
	const mzap_ent_t *mze1 = arg1;
	const mzap_ent_t *mze2 = arg2;

	int cmp = TREE_CMP(mze1->mze_hash, mze2->mze_hash);
	if (likely(cmp))
		return (cmp);

	return (TREE_CMP(mze1->mze_cd, mze2->mze_cd));
}

static void
mze_insert(zap_t *zap, int chunkid, uint64_t hash)
{
	ASSERT(zap->zap_ismicro);
	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));

	mzap_ent_t *mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP);
	mze->mze_chunkid = chunkid;
	mze->mze_hash = hash;
	mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd;
	ASSERT(MZE_PHYS(zap, mze)->mze_name[0] != 0);
	avl_add(&zap->zap_m.zap_avl, mze);
}

static mzap_ent_t *
mze_find(zap_name_t *zn)
{
	mzap_ent_t mze_tofind;
	mzap_ent_t *mze;
	avl_index_t idx;
	avl_tree_t *avl = &zn->zn_zap->zap_m.zap_avl;

	ASSERT(zn->zn_zap->zap_ismicro);
	ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock));

	mze_tofind.mze_hash = zn->zn_hash;
	mze_tofind.mze_cd = 0;

	mze = avl_find(avl, &mze_tofind, &idx);
	if (mze == NULL)
		mze = avl_nearest(avl, idx, AVL_AFTER);
	for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) {
		ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd);
		if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name))
			return (mze);
	}

	return (NULL);
}

static uint32_t
mze_find_unused_cd(zap_t *zap, uint64_t hash)
{
	mzap_ent_t mze_tofind;
	avl_index_t idx;
	avl_tree_t *avl = &zap->zap_m.zap_avl;

	ASSERT(zap->zap_ismicro);
	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));

	mze_tofind.mze_hash = hash;
	mze_tofind.mze_cd = 0;

	uint32_t cd = 0;
	for (mzap_ent_t *mze = avl_find(avl, &mze_tofind, &idx);
	    mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
		if (mze->mze_cd != cd)
			break;
		cd++;
	}

	return (cd);
}
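
/*
 * Example of the search above (illustrative only): if entries with this
 * hash already use cd values 0 and 1, the loop returns 2; if only cd
 * values 1 and 2 are in use, it returns 0.
 */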

/*
 * Each mzap entry requires at most 4 array chunks in a fatzap leaf:
 * 3 chunks for the name plus 1 chunk for the value.  The leading 1
 * below accounts for the leaf entry chunk itself.
 */
#define	MZAP_ENT_CHUNKS	(1 + ZAP_LEAF_ARRAY_NCHUNKS(MZAP_NAME_LEN) + \
	ZAP_LEAF_ARRAY_NCHUNKS(sizeof (uint64_t)))
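
/*
 * Worked example (assuming the usual on-disk constants, i.e.
 * MZAP_NAME_LEN == 50 and ZAP_LEAF_ARRAY_BYTES == 21): the name needs
 * ceil(50 / 21) == 3 array chunks and the value ceil(8 / 21) == 1, so
 * MZAP_ENT_CHUNKS evaluates to 1 + 3 + 1 == 5 leaf chunks per entry.
 */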

/*
 * Check whether, if this entry were added, all of the entries that
 * collide with its hash would still fit within a single fatzap leaf.
 */
static boolean_t
mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash)
{
	zap_t *zap = zn->zn_zap;
	mzap_ent_t mze_tofind;
	mzap_ent_t *mze;
	avl_index_t idx;
	avl_tree_t *avl = &zap->zap_m.zap_avl;
	uint32_t mzap_ents = 0;

	mze_tofind.mze_hash = hash;
	mze_tofind.mze_cd = 0;

	for (mze = avl_find(avl, &mze_tofind, &idx);
	    mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
		mzap_ents++;
	}

	/* Include the new entry being added */
	mzap_ents++;

	return (ZAP_LEAF_NUMCHUNKS_DEF > (mzap_ents * MZAP_ENT_CHUNKS));
}

static void
mze_remove(zap_t *zap, mzap_ent_t *mze)
{
	ASSERT(zap->zap_ismicro);
	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));

	avl_remove(&zap->zap_m.zap_avl, mze);
	kmem_free(mze, sizeof (mzap_ent_t));
}

static void
mze_destroy(zap_t *zap)
{
	mzap_ent_t *mze;
	void *avlcookie = NULL;

	while ((mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie)))
		kmem_free(mze, sizeof (mzap_ent_t));
	avl_destroy(&zap->zap_m.zap_avl);
}

static zap_t *
mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
{
	zap_t *winner;
	uint64_t *zap_hdr = (uint64_t *)db->db_data;
	uint64_t zap_block_type = zap_hdr[0];
	uint64_t zap_magic = zap_hdr[1];

	ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));

	zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
	rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, NULL);
	rw_enter(&zap->zap_rwlock, RW_WRITER);
	zap->zap_objset = os;
	zap->zap_object = obj;
	zap->zap_dbuf = db;

	if (zap_block_type != ZBT_MICRO) {
		mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, MUTEX_DEFAULT,
		    0);
		zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1;
		if (zap_block_type != ZBT_HEADER || zap_magic != ZAP_MAGIC) {
			winner = NULL;	/* No actual winner here... */
			goto handle_winner;
		}
	} else {
		zap->zap_ismicro = TRUE;
	}

	/*
	 * Make sure that zap_ismicro is set before we let others see
	 * it, because zap_lockdir() checks zap_ismicro without the lock
	 * held.
	 */
	dmu_buf_init_user(&zap->zap_dbu, zap_evict_sync, NULL, &zap->zap_dbuf);
	winner = dmu_buf_set_user(db, &zap->zap_dbu);

	if (winner != NULL)
		goto handle_winner;

	if (zap->zap_ismicro) {
		zap->zap_salt = zap_m_phys(zap)->mz_salt;
		zap->zap_normflags = zap_m_phys(zap)->mz_normflags;
		zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
		avl_create(&zap->zap_m.zap_avl, mze_compare,
		    sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node));

		for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) {
			mzap_ent_phys_t *mze =
			    &zap_m_phys(zap)->mz_chunk[i];
			if (mze->mze_name[0]) {
				zap_name_t *zn;

				zap->zap_m.zap_num_entries++;
				zn = zap_name_alloc(zap, mze->mze_name, 0);
				mze_insert(zap, i, zn->zn_hash);
				zap_name_free(zn);
			}
		}
	} else {
		zap->zap_salt = zap_f_phys(zap)->zap_salt;
		zap->zap_normflags = zap_f_phys(zap)->zap_normflags;

		ASSERT3U(sizeof (struct zap_leaf_header), ==,
		    2*ZAP_LEAF_CHUNKSIZE);

		/*
		 * The embedded pointer table should not overlap the
		 * other members.
		 */
		ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
		    &zap_f_phys(zap)->zap_salt);

		/*
		 * The embedded pointer table should end at the end of
		 * the block.
		 */
		ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
		    1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
		    (uintptr_t)zap_f_phys(zap), ==,
		    zap->zap_dbuf->db_size);
	}
	rw_exit(&zap->zap_rwlock);
	return (zap);

handle_winner:
	rw_exit(&zap->zap_rwlock);
	rw_destroy(&zap->zap_rwlock);
	if (!zap->zap_ismicro)
		mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
	kmem_free(zap, sizeof (zap_t));
	return (winner);
}

/*
 * This routine "consumes" the caller's hold on the dbuf, which must
 * have the specified tag.
 */
static int
zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx,
    krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
{
	ASSERT0(db->db_offset);
	objset_t *os = dmu_buf_get_objset(db);
	uint64_t obj = db->db_object;
	dmu_object_info_t doi;

	*zapp = NULL;

	dmu_object_info_from_db(db, &doi);
	if (DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP)
		return (SET_ERROR(EINVAL));

	zap_t *zap = dmu_buf_get_user(db);
	if (zap == NULL) {
		zap = mzap_open(os, obj, db);
		if (zap == NULL) {
			/*
			 * mzap_open() didn't like what it saw on-disk.
			 * Check for corruption!
			 */
			return (SET_ERROR(EIO));
		}
	}

	/*
	 * We're checking zap_ismicro without the lock held, in order to
	 * tell what type of lock we want.  Once we have some sort of
	 * lock, see if it really is the right type.  In practice this
	 * can only be different if it was upgraded from micro to fat,
	 * and micro wanted WRITER but fat only needs READER.
	 */
	krw_t lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
	rw_enter(&zap->zap_rwlock, lt);
	if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
		/* it was upgraded, now we only need reader */
		ASSERT(lt == RW_WRITER);
		ASSERT(RW_READER ==
		    ((!zap->zap_ismicro && fatreader) ? RW_READER : lti));
		rw_downgrade(&zap->zap_rwlock);
		lt = RW_READER;
	}

	zap->zap_objset = os;

	if (lt == RW_WRITER)
		dmu_buf_will_dirty(db, tx);

	ASSERT3P(zap->zap_dbuf, ==, db);

	ASSERT(!zap->zap_ismicro ||
	    zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks);
	if (zap->zap_ismicro && tx && adding &&
	    zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
		uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
		if (newsz > MZAP_MAX_BLKSZ) {
			dprintf("upgrading obj %llu: num_entries=%u\n",
			    obj, zap->zap_m.zap_num_entries);
			*zapp = zap;
			int err = mzap_upgrade(zapp, tag, tx, 0);
			if (err != 0)
				rw_exit(&zap->zap_rwlock);
			return (err);
		}
		VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx));
		zap->zap_m.zap_num_chunks =
		    db->db_size / MZAP_ENT_LEN - 1;
	}

	*zapp = zap;
	return (0);
}

static int
zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx,
    krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp)
{
	dmu_buf_t *db;

	int err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
	if (err != 0) {
		return (err);
	}
#ifdef ZFS_DEBUG
	{
		dmu_object_info_t doi;
		dmu_object_info_from_db(db, &doi);
		ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
	}
#endif

	err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp);
	if (err != 0) {
		dmu_buf_rele(db, tag);
	}
	return (err);
}

int
zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
    krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp)
{
	dmu_buf_t *db;

	int err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH);
	if (err != 0)
		return (err);
#ifdef ZFS_DEBUG
	{
		dmu_object_info_t doi;
		dmu_object_info_from_db(db, &doi);
		ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
	}
#endif
	err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp);
	if (err != 0)
		dmu_buf_rele(db, tag);
	return (err);
}

void
zap_unlockdir(zap_t *zap, void *tag)
{
	rw_exit(&zap->zap_rwlock);
	dmu_buf_rele(zap->zap_dbuf, tag);
}

static int
mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags)
{
	int err = 0;
	zap_t *zap = *zapp;

	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));

	int sz = zap->zap_dbuf->db_size;
	mzap_phys_t *mzp = vmem_alloc(sz, KM_SLEEP);
	bcopy(zap->zap_dbuf->db_data, mzp, sz);
	int nchunks = zap->zap_m.zap_num_chunks;

	if (!flags) {
		err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
		    1ULL << fzap_default_block_shift, 0, tx);
		if (err != 0) {
			vmem_free(mzp, sz);
			return (err);
		}
	}

	dprintf("upgrading obj=%llu with %u chunks\n",
	    zap->zap_object, nchunks);
	/* XXX destroy the avl later, so we can use the stored hash value */
	mze_destroy(zap);

	fzap_upgrade(zap, tx, flags);

	for (int i = 0; i < nchunks; i++) {
		mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
		if (mze->mze_name[0] == 0)
			continue;
		dprintf("adding %s=%llu\n",
		    mze->mze_name, mze->mze_value);
		zap_name_t *zn = zap_name_alloc(zap, mze->mze_name, 0);
		/* If we fail here, we would end up losing entries */
		VERIFY0(fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd,
		    tag, tx));
		zap = zn->zn_zap;	/* fzap_add_cd() may change zap */
		zap_name_free(zn);
	}
	vmem_free(mzp, sz);
	*zapp = zap;
	return (0);
}

/*
 * The "normflags" determine the behavior of the matchtype_t which is
 * passed to zap_lookup_norm().  Names which have the same normalized
 * version will be stored with the same hash value, and therefore we can
 * perform normalization-insensitive lookups.  We can be Unicode form-
 * insensitive and/or case-insensitive.  The following flags are valid for
 * "normflags":
 *
 * U8_TEXTPREP_NFC
 * U8_TEXTPREP_NFD
 * U8_TEXTPREP_NFKC
 * U8_TEXTPREP_NFKD
 * U8_TEXTPREP_TOUPPER
 *
 * The *_NF* (Normalization Form) flags are mutually exclusive; at most one
 * of them may be supplied.
 */
void
mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, dmu_tx_t *tx)
{
	dmu_buf_t *db;

	VERIFY0(dmu_buf_hold_by_dnode(dn, 0, FTAG, &db, DMU_READ_NO_PREFETCH));

	dmu_buf_will_dirty(db, tx);
	mzap_phys_t *zp = db->db_data;
	zp->mz_block_type = ZBT_MICRO;
	zp->mz_salt =
	    ((uintptr_t)db ^ (uintptr_t)tx ^ (dn->dn_object << 1)) | 1ULL;
	zp->mz_normflags = normflags;

	if (flags != 0) {
		zap_t *zap;
		/* Only fat zap supports flags; upgrade immediately. */
		VERIFY0(zap_lockdir_impl(db, FTAG, tx, RW_WRITER,
		    B_FALSE, B_FALSE, &zap));
		VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags));
		zap_unlockdir(zap, FTAG);
	} else {
		dmu_buf_rele(db, FTAG);
	}
}

static uint64_t
zap_create_impl(objset_t *os, int normflags, zap_flags_t flags,
    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize,
    dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx)
{
	uint64_t obj;

	ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);

	if (allocated_dnode == NULL) {
		dnode_t *dn;
		obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift,
		    indirect_blockshift, bonustype, bonuslen, dnodesize,
		    &dn, FTAG, tx);
		mzap_create_impl(dn, normflags, flags, tx);
		dnode_rele(dn, FTAG);
	} else {
		obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift,
		    indirect_blockshift, bonustype, bonuslen, dnodesize,
		    allocated_dnode, tag, tx);
		mzap_create_impl(*allocated_dnode, normflags, flags, tx);
	}

	return (obj);
}

int
zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (zap_create_claim_dnsize(os, obj, ot, bonustype, bonuslen,
	    0, tx));
}

int
zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	return (zap_create_claim_norm_dnsize(os, obj,
	    0, ot, bonustype, bonuslen, dnodesize, tx));
}

int
zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags,
    dmu_object_type_t ot,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (zap_create_claim_norm_dnsize(os, obj, normflags, ot, bonustype,
	    bonuslen, 0, tx));
}

int
zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags,
    dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen,
    int dnodesize, dmu_tx_t *tx)
{
	dnode_t *dn;
	int error;

	ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP);
	error = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen,
	    dnodesize, tx);
	if (error != 0)
		return (error);

	error = dnode_hold(os, obj, FTAG, &dn);
	if (error != 0)
		return (error);

	mzap_create_impl(dn, normflags, 0, tx);

	dnode_rele(dn, FTAG);

	return (0);
}

uint64_t
zap_create(objset_t *os, dmu_object_type_t ot,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx));
}

uint64_t
zap_create_dnsize(objset_t *os, dmu_object_type_t ot,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	return (zap_create_norm_dnsize(os, 0, ot, bonustype, bonuslen,
	    dnodesize, tx));
}

uint64_t
zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (zap_create_norm_dnsize(os, normflags, ot, bonustype, bonuslen,
	    0, tx));
}
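
/*
 * Illustrative usage sketch (not taken from any caller): creating a ZAP
 * whose lookups may be case-insensitive could look like
 *
 *	uint64_t obj = zap_create_norm(os, U8_TEXTPREP_TOUPPER,
 *	    DMU_OT_DIRECTORY_CONTENTS, DMU_OT_NONE, 0, tx);
 *
 * after which zap_lookup_norm() with MT_NORMALIZE matches names that
 * differ only in case.
 */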

uint64_t
zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	return (zap_create_impl(os, normflags, 0, ot, 0, 0,
	    bonustype, bonuslen, dnodesize, NULL, NULL, tx));
}

uint64_t
zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (zap_create_flags_dnsize(os, normflags, flags, ot,
	    leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx));
}

uint64_t
zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags,
    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx)
{
	return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift,
	    indirect_blockshift, bonustype, bonuslen, dnodesize, NULL, NULL,
	    tx));
}

/*
 * Create a zap object and return a pointer to the newly allocated dnode via
 * the allocated_dnode argument.  The returned dnode will be held and the
 * caller is responsible for releasing the hold by calling dnode_rele().
 */
uint64_t
zap_create_hold(objset_t *os, int normflags, zap_flags_t flags,
    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
    dmu_object_type_t bonustype, int bonuslen, int dnodesize,
    dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx)
{
	return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift,
	    indirect_blockshift, bonustype, bonuslen, dnodesize,
	    allocated_dnode, tag, tx));
}

int
zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
{
	/*
	 * dmu_object_free will free the object number and free the
	 * data.  Freeing the data will cause our pageout function to be
	 * called, which will destroy our data (zap_leaf_t's and zap_t).
	 */

	return (dmu_object_free(os, zapobj, tx));
}

void
zap_evict_sync(void *dbu)
{
	zap_t *zap = dbu;

	rw_destroy(&zap->zap_rwlock);

	if (zap->zap_ismicro)
		mze_destroy(zap);
	else
		mutex_destroy(&zap->zap_f.zap_num_entries_mtx);

	kmem_free(zap, sizeof (zap_t));
}

int
zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	if (!zap->zap_ismicro) {
		err = fzap_count(zap, count);
	} else {
		*count = zap->zap_m.zap_num_entries;
	}
	zap_unlockdir(zap, FTAG);
	return (err);
}

/*
 * zn may be NULL; if not specified, it will be computed if needed.
 * See also the comment above zap_entry_normalization_conflict().
 */
static boolean_t
mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze)
{
	int direction = AVL_BEFORE;
	boolean_t allocdzn = B_FALSE;

	if (zap->zap_normflags == 0)
		return (B_FALSE);

again:
	for (mzap_ent_t *other = avl_walk(&zap->zap_m.zap_avl, mze, direction);
	    other && other->mze_hash == mze->mze_hash;
	    other = avl_walk(&zap->zap_m.zap_avl, other, direction)) {

		if (zn == NULL) {
			zn = zap_name_alloc(zap, MZE_PHYS(zap, mze)->mze_name,
			    MT_NORMALIZE);
			allocdzn = B_TRUE;
		}
		if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
			if (allocdzn)
				zap_name_free(zn);
			return (B_TRUE);
		}
	}

	if (direction == AVL_BEFORE) {
		direction = AVL_AFTER;
		goto again;
	}

	if (allocdzn)
		zap_name_free(zn);
	return (B_FALSE);
}

/*
 * Routines for manipulating attributes.
 */

int
zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
    uint64_t integer_size, uint64_t num_integers, void *buf)
{
	return (zap_lookup_norm(os, zapobj, name, integer_size,
	    num_integers, buf, 0, NULL, 0, NULL));
}

static int
zap_lookup_impl(zap_t *zap, const char *name,
    uint64_t integer_size, uint64_t num_integers, void *buf,
    matchtype_t mt, char *realname, int rn_len,
    boolean_t *ncp)
{
	int err = 0;

	zap_name_t *zn = zap_name_alloc(zap, name, mt);
	if (zn == NULL)
		return (SET_ERROR(ENOTSUP));

	if (!zap->zap_ismicro) {
		err = fzap_lookup(zn, integer_size, num_integers, buf,
		    realname, rn_len, ncp);
	} else {
		mzap_ent_t *mze = mze_find(zn);
		if (mze == NULL) {
			err = SET_ERROR(ENOENT);
		} else {
			if (num_integers < 1) {
				err = SET_ERROR(EOVERFLOW);
			} else if (integer_size != 8) {
				err = SET_ERROR(EINVAL);
			} else {
				*(uint64_t *)buf =
				    MZE_PHYS(zap, mze)->mze_value;
				(void) strlcpy(realname,
				    MZE_PHYS(zap, mze)->mze_name, rn_len);
				if (ncp) {
					*ncp = mzap_normalization_conflict(zap,
					    zn, mze);
				}
			}
		}
	}
	zap_name_free(zn);
	return (err);
}

int
zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
    uint64_t integer_size, uint64_t num_integers, void *buf,
    matchtype_t mt, char *realname, int rn_len,
    boolean_t *ncp)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_lookup_impl(zap, name, integer_size,
	    num_integers, buf, mt, realname, rn_len, ncp);
	zap_unlockdir(zap, FTAG);
	return (err);
}

int
zap_prefetch(objset_t *os, uint64_t zapobj, const char *name)
{
	zap_t *zap;
	int err;
	zap_name_t *zn;

	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err)
		return (err);
	zn = zap_name_alloc(zap, name, 0);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}

	fzap_prefetch(zn);
	zap_name_free(zn);
	zap_unlockdir(zap, FTAG);
	return (err);
}

int
zap_lookup_by_dnode(dnode_t *dn, const char *name,
    uint64_t integer_size, uint64_t num_integers, void *buf)
{
	return (zap_lookup_norm_by_dnode(dn, name, integer_size,
	    num_integers, buf, 0, NULL, 0, NULL));
}

int
zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
    uint64_t integer_size, uint64_t num_integers, void *buf,
    matchtype_t mt, char *realname, int rn_len,
    boolean_t *ncp)
{
	zap_t *zap;

	int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
	    FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_lookup_impl(zap, name, integer_size,
	    num_integers, buf, mt, realname, rn_len, ncp);
	zap_unlockdir(zap, FTAG);
	return (err);
}

int
zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
    int key_numints)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}

	fzap_prefetch(zn);
	zap_name_free(zn);
	zap_unlockdir(zap, FTAG);
	return (err);
}

int
zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
    int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}

	err = fzap_lookup(zn, integer_size, num_integers, buf,
	    NULL, 0, NULL);
	zap_name_free(zn);
	zap_unlockdir(zap, FTAG);
	return (err);
}

int
zap_contains(objset_t *os, uint64_t zapobj, const char *name)
{
	int err = zap_lookup_norm(os, zapobj, name, 0,
	    0, NULL, 0, NULL, 0, NULL);
	if (err == EOVERFLOW || err == EINVAL)
		err = 0; /* found, but skipped reading the value */
	return (err);
}
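
/*
 * Illustrative usage (hypothetical name, not from the original source):
 *
 *	error = zap_contains(os, zapobj, "myprop");
 *
 * returns 0 if the entry exists, ENOENT if it does not, and another
 * errno on failure; the value itself is never read.
 */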

int
zap_length(objset_t *os, uint64_t zapobj, const char *name,
    uint64_t *integer_size, uint64_t *num_integers)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	zap_name_t *zn = zap_name_alloc(zap, name, 0);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}
	if (!zap->zap_ismicro) {
		err = fzap_length(zn, integer_size, num_integers);
	} else {
		mzap_ent_t *mze = mze_find(zn);
		if (mze == NULL) {
			err = SET_ERROR(ENOENT);
		} else {
			if (integer_size)
				*integer_size = 8;
			if (num_integers)
				*num_integers = 1;
		}
	}
	zap_name_free(zn);
	zap_unlockdir(zap, FTAG);
	return (err);
}

int
zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
    int key_numints, uint64_t *integer_size, uint64_t *num_integers)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}
	err = fzap_length(zn, integer_size, num_integers);
	zap_name_free(zn);
	zap_unlockdir(zap, FTAG);
	return (err);
}

static void
mzap_addent(zap_name_t *zn, uint64_t value)
{
	zap_t *zap = zn->zn_zap;
	int start = zap->zap_m.zap_alloc_next;

	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));

#ifdef ZFS_DEBUG
	for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) {
		mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
		ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0);
	}
#endif

	uint32_t cd = mze_find_unused_cd(zap, zn->zn_hash);
	/* given the limited size of the microzap, this can't happen */
	ASSERT(cd < zap_maxcd(zap));

again:
	for (int i = start; i < zap->zap_m.zap_num_chunks; i++) {
		mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
		if (mze->mze_name[0] == 0) {
			mze->mze_value = value;
			mze->mze_cd = cd;
			(void) strlcpy(mze->mze_name, zn->zn_key_orig,
			    sizeof (mze->mze_name));
			zap->zap_m.zap_num_entries++;
			zap->zap_m.zap_alloc_next = i+1;
			if (zap->zap_m.zap_alloc_next ==
			    zap->zap_m.zap_num_chunks)
				zap->zap_m.zap_alloc_next = 0;
			mze_insert(zap, i, zn->zn_hash);
			return;
		}
	}
	if (start != 0) {
		start = 0;
		goto again;
	}
	cmn_err(CE_PANIC, "out of entries!");
}

static int
zap_add_impl(zap_t *zap, const char *key,
    int integer_size, uint64_t num_integers,
    const void *val, dmu_tx_t *tx, void *tag)
{
	const uint64_t *intval = val;
	int err = 0;

	zap_name_t *zn = zap_name_alloc(zap, key, 0);
	if (zn == NULL) {
		zap_unlockdir(zap, tag);
		return (SET_ERROR(ENOTSUP));
	}
	if (!zap->zap_ismicro) {
		err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
		zap = zn->zn_zap;	/* fzap_add() may change zap */
	} else if (integer_size != 8 || num_integers != 1 ||
	    strlen(key) >= MZAP_NAME_LEN ||
	    !mze_canfit_fzap_leaf(zn, zn->zn_hash)) {
		err = mzap_upgrade(&zn->zn_zap, tag, tx, 0);
		if (err == 0) {
			err = fzap_add(zn, integer_size, num_integers, val,
			    tag, tx);
		}
		zap = zn->zn_zap;	/* fzap_add() may change zap */
	} else {
		if (mze_find(zn) != NULL) {
			err = SET_ERROR(EEXIST);
		} else {
			mzap_addent(zn, *intval);
		}
	}
	ASSERT(zap == zn->zn_zap);
	zap_name_free(zn);
	if (zap != NULL)	/* may be NULL if fzap_add() failed */
		zap_unlockdir(zap, tag);
	return (err);
}

int
zap_add(objset_t *os, uint64_t zapobj, const char *key,
    int integer_size, uint64_t num_integers,
    const void *val, dmu_tx_t *tx)
{
	zap_t *zap;
	int err;

	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
	/* zap_add_impl() calls zap_unlockdir() */
	return (err);
}
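
/*
 * Illustrative usage sketch (hypothetical name and value, not from the
 * original source): adding a single 64-bit value might look like
 *
 *	uint64_t val = 123;
 *	error = zap_add(os, zapobj, "myprop", sizeof (uint64_t), 1,
 *	    &val, tx);
 */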

int
zap_add_by_dnode(dnode_t *dn, const char *key,
    int integer_size, uint64_t num_integers,
    const void *val, dmu_tx_t *tx)
{
	zap_t *zap;
	int err;

	err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
	if (err != 0)
		return (err);
	err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
	/* zap_add_impl() calls zap_unlockdir() */
	return (err);
}

int
zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
    int key_numints, int integer_size, uint64_t num_integers,
    const void *val, dmu_tx_t *tx)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
	if (err != 0)
		return (err);
	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}
	err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx);
	zap = zn->zn_zap;	/* fzap_add() may change zap */
	zap_name_free(zn);
	if (zap != NULL)	/* may be NULL if fzap_add() failed */
		zap_unlockdir(zap, FTAG);
	return (err);
}

int
zap_update(objset_t *os, uint64_t zapobj, const char *name,
    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
{
	zap_t *zap;
	const uint64_t *intval = val;

	int err =
	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
	if (err != 0)
		return (err);
	zap_name_t *zn = zap_name_alloc(zap, name, 0);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}
	if (!zap->zap_ismicro) {
		err = fzap_update(zn, integer_size, num_integers, val,
		    FTAG, tx);
		zap = zn->zn_zap;	/* fzap_update() may change zap */
	} else if (integer_size != 8 || num_integers != 1 ||
	    strlen(name) >= MZAP_NAME_LEN) {
		dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
		    zapobj, integer_size, num_integers, name);
		err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0);
		if (err == 0) {
			err = fzap_update(zn, integer_size, num_integers,
			    val, FTAG, tx);
		}
		zap = zn->zn_zap;	/* fzap_update() may change zap */
	} else {
		mzap_ent_t *mze = mze_find(zn);
		if (mze != NULL) {
			MZE_PHYS(zap, mze)->mze_value = *intval;
		} else {
			mzap_addent(zn, *intval);
		}
	}
	ASSERT(zap == zn->zn_zap);
	zap_name_free(zn);
	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
		zap_unlockdir(zap, FTAG);
	return (err);
}

int
zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
    int key_numints,
    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
	if (err != 0)
		return (err);
	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}
	err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx);
	zap = zn->zn_zap;	/* fzap_update() may change zap */
	zap_name_free(zn);
	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
		zap_unlockdir(zap, FTAG);
	return (err);
}

int
zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
{
	return (zap_remove_norm(os, zapobj, name, 0, tx));
}

static int
zap_remove_impl(zap_t *zap, const char *name,
    matchtype_t mt, dmu_tx_t *tx)
{
	int err = 0;

	zap_name_t *zn = zap_name_alloc(zap, name, mt);
	if (zn == NULL)
		return (SET_ERROR(ENOTSUP));
	if (!zap->zap_ismicro) {
		err = fzap_remove(zn, tx);
	} else {
		mzap_ent_t *mze = mze_find(zn);
		if (mze == NULL) {
			err = SET_ERROR(ENOENT);
		} else {
			zap->zap_m.zap_num_entries--;
			bzero(&zap_m_phys(zap)->mz_chunk[mze->mze_chunkid],
			    sizeof (mzap_ent_phys_t));
			mze_remove(zap, mze);
		}
	}
	zap_name_free(zn);
	return (err);
}

int
zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
    matchtype_t mt, dmu_tx_t *tx)
{
	zap_t *zap;
	int err;

	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
	if (err)
		return (err);
	err = zap_remove_impl(zap, name, mt, tx);
	zap_unlockdir(zap, FTAG);
	return (err);
}

int
zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx)
{
	zap_t *zap;
	int err;

	err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
	if (err)
		return (err);
	err = zap_remove_impl(zap, name, 0, tx);
	zap_unlockdir(zap, FTAG);
	return (err);
}

int
zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
    int key_numints, dmu_tx_t *tx)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);
	zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}
	err = fzap_remove(zn, tx);
	zap_name_free(zn);
	zap_unlockdir(zap, FTAG);
	return (err);
}

/*
 * Routines for iterating over the attributes.
 */

static void
zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
    uint64_t serialized, boolean_t prefetch)
{
	zc->zc_objset = os;
	zc->zc_zap = NULL;
	zc->zc_leaf = NULL;
	zc->zc_zapobj = zapobj;
	zc->zc_serialized = serialized;
	zc->zc_hash = 0;
	zc->zc_cd = 0;
	zc->zc_prefetch = prefetch;
}

void
zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
    uint64_t serialized)
{
	zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE);
}

/*
 * Initialize a cursor at the beginning of the ZAP object.  The entire
 * ZAP object will be prefetched.
 */
void
zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
{
	zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE);
}
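
/*
 * Typical iteration pattern (illustrative sketch, not from the original
 * source):
 *
 *	zap_cursor_t zc;
 *	zap_attribute_t za;
 *
 *	for (zap_cursor_init(&zc, os, zapobj);
 *	    zap_cursor_retrieve(&zc, &za) == 0;
 *	    zap_cursor_advance(&zc)) {
 *		... consume za.za_name and za.za_first_integer ...
 *	}
 *	zap_cursor_fini(&zc);
 */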

/*
 * Initialize a cursor at the beginning, but request that we not prefetch
 * the entire ZAP object.
 */
void
zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
{
	zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE);
}

void
zap_cursor_fini(zap_cursor_t *zc)
{
	if (zc->zc_zap) {
		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
		zap_unlockdir(zc->zc_zap, NULL);
		zc->zc_zap = NULL;
	}
	if (zc->zc_leaf) {
		rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
		zap_put_leaf(zc->zc_leaf);
		zc->zc_leaf = NULL;
	}
	zc->zc_objset = NULL;
}

uint64_t
zap_cursor_serialize(zap_cursor_t *zc)
{
	if (zc->zc_hash == -1ULL)
		return (-1ULL);
	if (zc->zc_zap == NULL)
		return (zc->zc_serialized);
	ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0);
	ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap));

	/*
	 * We want to keep the high 32 bits of the cursor zero if we can, so
	 * that 32-bit programs can access this.  So usually use a small
	 * (28-bit) hash value so we can fit 4 bits of cd into the low 32 bits
	 * of the cursor.
	 *
	 * [ collision differentiator | zap_hashbits()-bit hash value ]
	 */
	return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) |
	    ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap)));
}
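
/*
 * Worked example (illustrative only): with the default 28-bit hash, a
 * cursor at zc_hash == 0x1234567000000000 and zc_cd == 2 serializes to
 * (0x1234567000000000 >> 36) | (2 << 28) == 0x21234567.
 */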

int
zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
{
	int err;

	if (zc->zc_hash == -1ULL)
		return (SET_ERROR(ENOENT));

	if (zc->zc_zap == NULL) {
		int hb;
		err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
		    RW_READER, TRUE, FALSE, NULL, &zc->zc_zap);
		if (err != 0)
			return (err);

		/*
		 * To support zap_cursor_init_serialized, advance, retrieve,
		 * we must add to the existing zc_cd, which may already
		 * be 1 due to the zap_cursor_advance.
		 */
		ASSERT(zc->zc_hash == 0);
		hb = zap_hashbits(zc->zc_zap);
		zc->zc_hash = zc->zc_serialized << (64 - hb);
		zc->zc_cd += zc->zc_serialized >> hb;
		if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */
			zc->zc_cd = 0;
	} else {
		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
	}
	if (!zc->zc_zap->zap_ismicro) {
		err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
	} else {
		avl_index_t idx;
		mzap_ent_t mze_tofind;

		mze_tofind.mze_hash = zc->zc_hash;
		mze_tofind.mze_cd = zc->zc_cd;

		mzap_ent_t *mze =
		    avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx);
		if (mze == NULL) {
			mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl,
			    idx, AVL_AFTER);
		}
		if (mze) {
			mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze);
			ASSERT3U(mze->mze_cd, ==, mzep->mze_cd);
			za->za_normalization_conflict =
			    mzap_normalization_conflict(zc->zc_zap, NULL, mze);
			za->za_integer_length = 8;
			za->za_num_integers = 1;
			za->za_first_integer = mzep->mze_value;
			(void) strlcpy(za->za_name, mzep->mze_name,
			    sizeof (za->za_name));
			zc->zc_hash = mze->mze_hash;
			zc->zc_cd = mze->mze_cd;
			err = 0;
		} else {
			zc->zc_hash = -1ULL;
			err = SET_ERROR(ENOENT);
		}
	}
	rw_exit(&zc->zc_zap->zap_rwlock);
	return (err);
}

void
zap_cursor_advance(zap_cursor_t *zc)
{
	if (zc->zc_hash == -1ULL)
		return;
	zc->zc_cd++;
}

int
zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
{
	zap_t *zap;

	int err =
	    zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err != 0)
		return (err);

	bzero(zs, sizeof (zap_stats_t));

	if (zap->zap_ismicro) {
		zs->zs_blocksize = zap->zap_dbuf->db_size;
		zs->zs_num_entries = zap->zap_m.zap_num_entries;
		zs->zs_num_blocks = 1;
	} else {
		fzap_get_stats(zap, zs);
	}
	zap_unlockdir(zap, FTAG);
	return (0);
}

#if defined(_KERNEL)
EXPORT_SYMBOL(zap_create);
EXPORT_SYMBOL(zap_create_dnsize);
EXPORT_SYMBOL(zap_create_norm);
EXPORT_SYMBOL(zap_create_norm_dnsize);
EXPORT_SYMBOL(zap_create_flags);
EXPORT_SYMBOL(zap_create_flags_dnsize);
EXPORT_SYMBOL(zap_create_claim);
EXPORT_SYMBOL(zap_create_claim_norm);
EXPORT_SYMBOL(zap_create_claim_norm_dnsize);
EXPORT_SYMBOL(zap_create_hold);
EXPORT_SYMBOL(zap_destroy);
EXPORT_SYMBOL(zap_lookup);
EXPORT_SYMBOL(zap_lookup_by_dnode);
EXPORT_SYMBOL(zap_lookup_norm);
EXPORT_SYMBOL(zap_lookup_uint64);
EXPORT_SYMBOL(zap_contains);
EXPORT_SYMBOL(zap_prefetch);
EXPORT_SYMBOL(zap_prefetch_uint64);
EXPORT_SYMBOL(zap_add);
EXPORT_SYMBOL(zap_add_by_dnode);
EXPORT_SYMBOL(zap_add_uint64);
EXPORT_SYMBOL(zap_update);
EXPORT_SYMBOL(zap_update_uint64);
EXPORT_SYMBOL(zap_length);
EXPORT_SYMBOL(zap_length_uint64);
EXPORT_SYMBOL(zap_remove);
EXPORT_SYMBOL(zap_remove_by_dnode);
EXPORT_SYMBOL(zap_remove_norm);
EXPORT_SYMBOL(zap_remove_uint64);
EXPORT_SYMBOL(zap_count);
EXPORT_SYMBOL(zap_value_search);
EXPORT_SYMBOL(zap_join);
EXPORT_SYMBOL(zap_join_increment);
EXPORT_SYMBOL(zap_add_int);
EXPORT_SYMBOL(zap_remove_int);
EXPORT_SYMBOL(zap_lookup_int);
EXPORT_SYMBOL(zap_increment_int);
EXPORT_SYMBOL(zap_add_int_key);
EXPORT_SYMBOL(zap_lookup_int_key);
EXPORT_SYMBOL(zap_increment);
EXPORT_SYMBOL(zap_cursor_init);
EXPORT_SYMBOL(zap_cursor_fini);
EXPORT_SYMBOL(zap_cursor_retrieve);
EXPORT_SYMBOL(zap_cursor_advance);
EXPORT_SYMBOL(zap_cursor_serialize);
EXPORT_SYMBOL(zap_cursor_init_serialized);
EXPORT_SYMBOL(zap_get_stats);
#endif