zap_micro.c revision 197172
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident	"%Z%%M%	%I%	%E% SMI"
27
28#include <sys/spa.h>
29#include <sys/dmu.h>
30#include <sys/zfs_context.h>
31#include <sys/zap.h>
32#include <sys/refcount.h>
33#include <sys/zap_impl.h>
34#include <sys/zap_leaf.h>
35#include <sys/avl.h>
36
37#ifdef _KERNEL
38#include <sys/sunddi.h>
39#endif
40
41static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx);
42
43
44static uint64_t
45zap_hash(zap_t *zap, const char *normname)
46{
47	const uint8_t *cp;
48	uint8_t c;
49	uint64_t crc = zap->zap_salt;
50
51	/* NB: name must already be normalized, if necessary */
52
53	ASSERT(crc != 0);
54	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
55	for (cp = (const uint8_t *)normname; (c = *cp) != '\0'; cp++) {
56		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF];
57	}
58
59	/*
60	 * Only use 28 bits, since we need 4 bits in the cookie for the
61	 * collision differentiator.  We MUST use the high bits, since
62	 * those are the ones that we first pay attention to when
63	 * chosing the bucket.
64	 */
65	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
66
67	return (crc);
68}
69
70static int
71zap_normalize(zap_t *zap, const char *name, char *namenorm)
72{
73	size_t inlen, outlen;
74	int err;
75
76	inlen = strlen(name) + 1;
77	outlen = ZAP_MAXNAMELEN;
78
79	err = 0;
80	(void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen,
81	    zap->zap_normflags | U8_TEXTPREP_IGNORE_NULL, U8_UNICODE_LATEST,
82	    &err);
83
84	return (err);
85}
86
87boolean_t
88zap_match(zap_name_t *zn, const char *matchname)
89{
90	if (zn->zn_matchtype == MT_FIRST) {
91		char norm[ZAP_MAXNAMELEN];
92
93		if (zap_normalize(zn->zn_zap, matchname, norm) != 0)
94			return (B_FALSE);
95
96		return (strcmp(zn->zn_name_norm, norm) == 0);
97	} else {
98		/* MT_BEST or MT_EXACT */
99		return (strcmp(zn->zn_name_orij, matchname) == 0);
100	}
101}
102
103void
104zap_name_free(zap_name_t *zn)
105{
106	kmem_free(zn, sizeof (zap_name_t));
107}
108
109/* XXX combine this with zap_lockdir()? */
110zap_name_t *
111zap_name_alloc(zap_t *zap, const char *name, matchtype_t mt)
112{
113	zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
114
115	zn->zn_zap = zap;
116	zn->zn_name_orij = name;
117	zn->zn_matchtype = mt;
118	if (zap->zap_normflags) {
119		if (zap_normalize(zap, name, zn->zn_normbuf) != 0) {
120			zap_name_free(zn);
121			return (NULL);
122		}
123		zn->zn_name_norm = zn->zn_normbuf;
124	} else {
125		if (mt != MT_EXACT) {
126			zap_name_free(zn);
127			return (NULL);
128		}
129		zn->zn_name_norm = zn->zn_name_orij;
130	}
131
132	zn->zn_hash = zap_hash(zap, zn->zn_name_norm);
133	return (zn);
134}
135
136static void
137mzap_byteswap(mzap_phys_t *buf, size_t size)
138{
139	int i, max;
140	buf->mz_block_type = BSWAP_64(buf->mz_block_type);
141	buf->mz_salt = BSWAP_64(buf->mz_salt);
142	buf->mz_normflags = BSWAP_64(buf->mz_normflags);
143	max = (size / MZAP_ENT_LEN) - 1;
144	for (i = 0; i < max; i++) {
145		buf->mz_chunk[i].mze_value =
146		    BSWAP_64(buf->mz_chunk[i].mze_value);
147		buf->mz_chunk[i].mze_cd =
148		    BSWAP_32(buf->mz_chunk[i].mze_cd);
149	}
150}
151
152void
153zap_byteswap(void *buf, size_t size)
154{
155	uint64_t block_type;
156
157	block_type = *(uint64_t *)buf;
158
159	if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) {
160		/* ASSERT(magic == ZAP_LEAF_MAGIC); */
161		mzap_byteswap(buf, size);
162	} else {
163		fzap_byteswap(buf, size);
164	}
165}
166
167static int
168mze_compare(const void *arg1, const void *arg2)
169{
170	const mzap_ent_t *mze1 = arg1;
171	const mzap_ent_t *mze2 = arg2;
172
173	if (mze1->mze_hash > mze2->mze_hash)
174		return (+1);
175	if (mze1->mze_hash < mze2->mze_hash)
176		return (-1);
177	if (mze1->mze_phys.mze_cd > mze2->mze_phys.mze_cd)
178		return (+1);
179	if (mze1->mze_phys.mze_cd < mze2->mze_phys.mze_cd)
180		return (-1);
181	return (0);
182}
183
184static int
185mze_insert(zap_t *zap, int chunkid, uint64_t hash, mzap_ent_phys_t *mzep)
186{
187	mzap_ent_t *mze;
188	avl_index_t idx;
189
190	ASSERT(zap->zap_ismicro);
191	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
192	ASSERT(mzep->mze_cd < ZAP_MAXCD);
193
194	mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP);
195	mze->mze_chunkid = chunkid;
196	mze->mze_hash = hash;
197	mze->mze_phys = *mzep;
198	if (avl_find(&zap->zap_m.zap_avl, mze, &idx) != NULL) {
199		kmem_free(mze, sizeof (mzap_ent_t));
200		return (EEXIST);
201	}
202	avl_insert(&zap->zap_m.zap_avl, mze, idx);
203	return (0);
204}
205
206static mzap_ent_t *
207mze_find(zap_name_t *zn)
208{
209	mzap_ent_t mze_tofind;
210	mzap_ent_t *mze;
211	avl_index_t idx;
212	avl_tree_t *avl = &zn->zn_zap->zap_m.zap_avl;
213
214	ASSERT(zn->zn_zap->zap_ismicro);
215	ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock));
216
217	if (strlen(zn->zn_name_norm) >= sizeof (mze_tofind.mze_phys.mze_name))
218		return (NULL);
219
220	mze_tofind.mze_hash = zn->zn_hash;
221	mze_tofind.mze_phys.mze_cd = 0;
222
223again:
224	mze = avl_find(avl, &mze_tofind, &idx);
225	if (mze == NULL)
226		mze = avl_nearest(avl, idx, AVL_AFTER);
227	for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) {
228		if (zap_match(zn, mze->mze_phys.mze_name))
229			return (mze);
230	}
231	if (zn->zn_matchtype == MT_BEST) {
232		zn->zn_matchtype = MT_FIRST;
233		goto again;
234	}
235	return (NULL);
236}
237
238static uint32_t
239mze_find_unused_cd(zap_t *zap, uint64_t hash)
240{
241	mzap_ent_t mze_tofind;
242	mzap_ent_t *mze;
243	avl_index_t idx;
244	avl_tree_t *avl = &zap->zap_m.zap_avl;
245	uint32_t cd;
246
247	ASSERT(zap->zap_ismicro);
248	ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
249
250	mze_tofind.mze_hash = hash;
251	mze_tofind.mze_phys.mze_cd = 0;
252
253	cd = 0;
254	for (mze = avl_find(avl, &mze_tofind, &idx);
255	    mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) {
256		if (mze->mze_phys.mze_cd != cd)
257			break;
258		cd++;
259	}
260
261	return (cd);
262}
263
264static void
265mze_remove(zap_t *zap, mzap_ent_t *mze)
266{
267	ASSERT(zap->zap_ismicro);
268	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
269
270	avl_remove(&zap->zap_m.zap_avl, mze);
271	kmem_free(mze, sizeof (mzap_ent_t));
272}
273
274static void
275mze_destroy(zap_t *zap)
276{
277	mzap_ent_t *mze;
278	void *avlcookie = NULL;
279
280	while (mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie))
281		kmem_free(mze, sizeof (mzap_ent_t));
282	avl_destroy(&zap->zap_m.zap_avl);
283}
284
285static zap_t *
286mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db)
287{
288	zap_t *winner;
289	zap_t *zap;
290	int i;
291
292	ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t));
293
294	zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP);
295	rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, 0);
296	rw_enter(&zap->zap_rwlock, RW_WRITER);
297	zap->zap_objset = os;
298	zap->zap_object = obj;
299	zap->zap_dbuf = db;
300
301	if (*(uint64_t *)db->db_data != ZBT_MICRO) {
302		mutex_init(&zap->zap_f.zap_num_entries_mtx, NULL,
303		    MUTEX_DEFAULT, 0);
304		zap->zap_f.zap_block_shift = highbit(db->db_size) - 1;
305	} else {
306		zap->zap_ismicro = TRUE;
307	}
308
309	/*
310	 * Make sure that zap_ismicro is set before we let others see
311	 * it, because zap_lockdir() checks zap_ismicro without the lock
312	 * held.
313	 */
314	winner = dmu_buf_set_user(db, zap, &zap->zap_m.zap_phys, zap_evict);
315
316	if (winner != NULL) {
317		rw_exit(&zap->zap_rwlock);
318		rw_destroy(&zap->zap_rwlock);
319		if (!zap->zap_ismicro)
320			mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
321		kmem_free(zap, sizeof (zap_t));
322		return (winner);
323	}
324
325	if (zap->zap_ismicro) {
326		zap->zap_salt = zap->zap_m.zap_phys->mz_salt;
327		zap->zap_normflags = zap->zap_m.zap_phys->mz_normflags;
328		zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
329		avl_create(&zap->zap_m.zap_avl, mze_compare,
330		    sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node));
331
332		for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
333			mzap_ent_phys_t *mze =
334			    &zap->zap_m.zap_phys->mz_chunk[i];
335			if (mze->mze_name[0]) {
336				zap_name_t *zn;
337
338				zn = zap_name_alloc(zap, mze->mze_name,
339				    MT_EXACT);
340				if (mze_insert(zap, i, zn->zn_hash, mze) == 0)
341					zap->zap_m.zap_num_entries++;
342				else {
343					printf("ZFS WARNING: Duplicated ZAP "
344					    "entry detected (%s).\n",
345					    mze->mze_name);
346				}
347				zap_name_free(zn);
348			}
349		}
350	} else {
351		zap->zap_salt = zap->zap_f.zap_phys->zap_salt;
352		zap->zap_normflags = zap->zap_f.zap_phys->zap_normflags;
353
354		ASSERT3U(sizeof (struct zap_leaf_header), ==,
355		    2*ZAP_LEAF_CHUNKSIZE);
356
357		/*
358		 * The embedded pointer table should not overlap the
359		 * other members.
360		 */
361		ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
362		    &zap->zap_f.zap_phys->zap_salt);
363
364		/*
365		 * The embedded pointer table should end at the end of
366		 * the block
367		 */
368		ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
369		    1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
370		    (uintptr_t)zap->zap_f.zap_phys, ==,
371		    zap->zap_dbuf->db_size);
372	}
373	rw_exit(&zap->zap_rwlock);
374	return (zap);
375}
376
377int
378zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
379    krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
380{
381	zap_t *zap;
382	dmu_buf_t *db;
383	krw_t lt;
384	int err;
385
386	*zapp = NULL;
387
388	err = dmu_buf_hold(os, obj, 0, NULL, &db);
389	if (err)
390		return (err);
391
392#ifdef ZFS_DEBUG
393	{
394		dmu_object_info_t doi;
395		dmu_object_info_from_db(db, &doi);
396		ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap);
397	}
398#endif
399
400	zap = dmu_buf_get_user(db);
401	if (zap == NULL)
402		zap = mzap_open(os, obj, db);
403
404	/*
405	 * We're checking zap_ismicro without the lock held, in order to
406	 * tell what type of lock we want.  Once we have some sort of
407	 * lock, see if it really is the right type.  In practice this
408	 * can only be different if it was upgraded from micro to fat,
409	 * and micro wanted WRITER but fat only needs READER.
410	 */
411	lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
412	rw_enter(&zap->zap_rwlock, lt);
413	if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
414		/* it was upgraded, now we only need reader */
415		ASSERT(lt == RW_WRITER);
416		ASSERT(RW_READER ==
417		    (!zap->zap_ismicro && fatreader) ? RW_READER : lti);
418		rw_downgrade(&zap->zap_rwlock);
419		lt = RW_READER;
420	}
421
422	zap->zap_objset = os;
423
424	if (lt == RW_WRITER)
425		dmu_buf_will_dirty(db, tx);
426
427	ASSERT3P(zap->zap_dbuf, ==, db);
428
429	ASSERT(!zap->zap_ismicro ||
430	    zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks);
431	if (zap->zap_ismicro && tx && adding &&
432	    zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) {
433		uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE;
434		if (newsz > MZAP_MAX_BLKSZ) {
435			dprintf("upgrading obj %llu: num_entries=%u\n",
436			    obj, zap->zap_m.zap_num_entries);
437			*zapp = zap;
438			return (mzap_upgrade(zapp, tx));
439		}
440		err = dmu_object_set_blocksize(os, obj, newsz, 0, tx);
441		ASSERT3U(err, ==, 0);
442		zap->zap_m.zap_num_chunks =
443		    db->db_size / MZAP_ENT_LEN - 1;
444	}
445
446	*zapp = zap;
447	return (0);
448}
449
450void
451zap_unlockdir(zap_t *zap)
452{
453	rw_exit(&zap->zap_rwlock);
454	dmu_buf_rele(zap->zap_dbuf, NULL);
455}
456
457static int
458mzap_upgrade(zap_t **zapp, dmu_tx_t *tx)
459{
460	mzap_phys_t *mzp;
461	int i, sz, nchunks, err;
462	zap_t *zap = *zapp;
463
464	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
465
466	sz = zap->zap_dbuf->db_size;
467	mzp = kmem_alloc(sz, KM_SLEEP);
468	bcopy(zap->zap_dbuf->db_data, mzp, sz);
469	nchunks = zap->zap_m.zap_num_chunks;
470
471	err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
472	    1ULL << fzap_default_block_shift, 0, tx);
473	if (err) {
474		kmem_free(mzp, sz);
475		return (err);
476	}
477
478	dprintf("upgrading obj=%llu with %u chunks\n",
479	    zap->zap_object, nchunks);
480	/* XXX destroy the avl later, so we can use the stored hash value */
481	mze_destroy(zap);
482
483	fzap_upgrade(zap, tx);
484
485	for (i = 0; i < nchunks; i++) {
486		int err;
487		mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
488		zap_name_t *zn;
489		if (mze->mze_name[0] == 0)
490			continue;
491		dprintf("adding %s=%llu\n",
492		    mze->mze_name, mze->mze_value);
493		zn = zap_name_alloc(zap, mze->mze_name, MT_EXACT);
494		err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, tx);
495		zap = zn->zn_zap;	/* fzap_add_cd() may change zap */
496		zap_name_free(zn);
497		if (err)
498			break;
499	}
500	kmem_free(mzp, sz);
501	*zapp = zap;
502	return (err);
503}
504
505static void
506mzap_create_impl(objset_t *os, uint64_t obj, int normflags, dmu_tx_t *tx)
507{
508	dmu_buf_t *db;
509	mzap_phys_t *zp;
510
511	VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db));
512
513#ifdef ZFS_DEBUG
514	{
515		dmu_object_info_t doi;
516		dmu_object_info_from_db(db, &doi);
517		ASSERT(dmu_ot[doi.doi_type].ot_byteswap == zap_byteswap);
518	}
519#endif
520
521	dmu_buf_will_dirty(db, tx);
522	zp = db->db_data;
523	zp->mz_block_type = ZBT_MICRO;
524	zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL;
525	zp->mz_normflags = normflags;
526	dmu_buf_rele(db, FTAG);
527}
528
529int
530zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot,
531    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
532{
533	return (zap_create_claim_norm(os, obj,
534	    0, ot, bonustype, bonuslen, tx));
535}
536
537int
538zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags,
539    dmu_object_type_t ot,
540    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
541{
542	int err;
543
544	err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx);
545	if (err != 0)
546		return (err);
547	mzap_create_impl(os, obj, normflags, tx);
548	return (0);
549}
550
551uint64_t
552zap_create(objset_t *os, dmu_object_type_t ot,
553    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
554{
555	return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx));
556}
557
558uint64_t
559zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot,
560    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
561{
562	uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
563
564	mzap_create_impl(os, obj, normflags, tx);
565	return (obj);
566}
567
568int
569zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
570{
571	/*
572	 * dmu_object_free will free the object number and free the
573	 * data.  Freeing the data will cause our pageout function to be
574	 * called, which will destroy our data (zap_leaf_t's and zap_t).
575	 */
576
577	return (dmu_object_free(os, zapobj, tx));
578}
579
580_NOTE(ARGSUSED(0))
581void
582zap_evict(dmu_buf_t *db, void *vzap)
583{
584	zap_t *zap = vzap;
585
586	rw_destroy(&zap->zap_rwlock);
587
588	if (zap->zap_ismicro)
589		mze_destroy(zap);
590	else
591		mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
592
593	kmem_free(zap, sizeof (zap_t));
594}
595
596int
597zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
598{
599	zap_t *zap;
600	int err;
601
602	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
603	if (err)
604		return (err);
605	if (!zap->zap_ismicro) {
606		err = fzap_count(zap, count);
607	} else {
608		*count = zap->zap_m.zap_num_entries;
609	}
610	zap_unlockdir(zap);
611	return (err);
612}
613
614/*
615 * zn may be NULL; if not specified, it will be computed if needed.
616 * See also the comment above zap_entry_normalization_conflict().
617 */
618static boolean_t
619mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze)
620{
621	mzap_ent_t *other;
622	int direction = AVL_BEFORE;
623	boolean_t allocdzn = B_FALSE;
624
625	if (zap->zap_normflags == 0)
626		return (B_FALSE);
627
628again:
629	for (other = avl_walk(&zap->zap_m.zap_avl, mze, direction);
630	    other && other->mze_hash == mze->mze_hash;
631	    other = avl_walk(&zap->zap_m.zap_avl, other, direction)) {
632
633		if (zn == NULL) {
634			zn = zap_name_alloc(zap, mze->mze_phys.mze_name,
635			    MT_FIRST);
636			allocdzn = B_TRUE;
637		}
638		if (zap_match(zn, other->mze_phys.mze_name)) {
639			if (allocdzn)
640				zap_name_free(zn);
641			return (B_TRUE);
642		}
643	}
644
645	if (direction == AVL_BEFORE) {
646		direction = AVL_AFTER;
647		goto again;
648	}
649
650	if (allocdzn)
651		zap_name_free(zn);
652	return (B_FALSE);
653}
654
655/*
656 * Routines for manipulating attributes.
657 */
658
659int
660zap_lookup(objset_t *os, uint64_t zapobj, const char *name,
661    uint64_t integer_size, uint64_t num_integers, void *buf)
662{
663	return (zap_lookup_norm(os, zapobj, name, integer_size,
664	    num_integers, buf, MT_EXACT, NULL, 0, NULL));
665}
666
667int
668zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
669    uint64_t integer_size, uint64_t num_integers, void *buf,
670    matchtype_t mt, char *realname, int rn_len,
671    boolean_t *ncp)
672{
673	zap_t *zap;
674	int err;
675	mzap_ent_t *mze;
676	zap_name_t *zn;
677
678	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
679	if (err)
680		return (err);
681	zn = zap_name_alloc(zap, name, mt);
682	if (zn == NULL) {
683		zap_unlockdir(zap);
684		return (ENOTSUP);
685	}
686
687	if (!zap->zap_ismicro) {
688		err = fzap_lookup(zn, integer_size, num_integers, buf,
689		    realname, rn_len, ncp);
690	} else {
691		mze = mze_find(zn);
692		if (mze == NULL) {
693			err = ENOENT;
694		} else {
695			if (num_integers < 1) {
696				err = EOVERFLOW;
697			} else if (integer_size != 8) {
698				err = EINVAL;
699			} else {
700				*(uint64_t *)buf = mze->mze_phys.mze_value;
701				(void) strlcpy(realname,
702				    mze->mze_phys.mze_name, rn_len);
703				if (ncp) {
704					*ncp = mzap_normalization_conflict(zap,
705					    zn, mze);
706				}
707			}
708		}
709	}
710	zap_name_free(zn);
711	zap_unlockdir(zap);
712	return (err);
713}
714
715int
716zap_length(objset_t *os, uint64_t zapobj, const char *name,
717    uint64_t *integer_size, uint64_t *num_integers)
718{
719	zap_t *zap;
720	int err;
721	mzap_ent_t *mze;
722	zap_name_t *zn;
723
724	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
725	if (err)
726		return (err);
727	zn = zap_name_alloc(zap, name, MT_EXACT);
728	if (zn == NULL) {
729		zap_unlockdir(zap);
730		return (ENOTSUP);
731	}
732	if (!zap->zap_ismicro) {
733		err = fzap_length(zn, integer_size, num_integers);
734	} else {
735		mze = mze_find(zn);
736		if (mze == NULL) {
737			err = ENOENT;
738		} else {
739			if (integer_size)
740				*integer_size = 8;
741			if (num_integers)
742				*num_integers = 1;
743		}
744	}
745	zap_name_free(zn);
746	zap_unlockdir(zap);
747	return (err);
748}
749
750static void
751mzap_addent(zap_name_t *zn, uint64_t value)
752{
753	int i;
754	zap_t *zap = zn->zn_zap;
755	int start = zap->zap_m.zap_alloc_next;
756	uint32_t cd;
757
758	dprintf("obj=%llu %s=%llu\n", zap->zap_object,
759	    zn->zn_name_orij, value);
760	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
761
762#ifdef ZFS_DEBUG
763	for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
764		mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i];
765		ASSERT(strcmp(zn->zn_name_orij, mze->mze_name) != 0);
766	}
767#endif
768
769	cd = mze_find_unused_cd(zap, zn->zn_hash);
770	/* given the limited size of the microzap, this can't happen */
771	ASSERT(cd != ZAP_MAXCD);
772
773again:
774	for (i = start; i < zap->zap_m.zap_num_chunks; i++) {
775		mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i];
776		if (mze->mze_name[0] == 0) {
777			mze->mze_value = value;
778			mze->mze_cd = cd;
779			(void) strcpy(mze->mze_name, zn->zn_name_orij);
780			zap->zap_m.zap_num_entries++;
781			zap->zap_m.zap_alloc_next = i+1;
782			if (zap->zap_m.zap_alloc_next ==
783			    zap->zap_m.zap_num_chunks)
784				zap->zap_m.zap_alloc_next = 0;
785			VERIFY(0 == mze_insert(zap, i, zn->zn_hash, mze));
786			return;
787		}
788	}
789	if (start != 0) {
790		start = 0;
791		goto again;
792	}
793	ASSERT(!"out of entries!");
794}
795
796int
797zap_add(objset_t *os, uint64_t zapobj, const char *name,
798    int integer_size, uint64_t num_integers,
799    const void *val, dmu_tx_t *tx)
800{
801	zap_t *zap;
802	int err;
803	mzap_ent_t *mze;
804	const uint64_t *intval = val;
805	zap_name_t *zn;
806
807	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
808	if (err)
809		return (err);
810	zn = zap_name_alloc(zap, name, MT_EXACT);
811	if (zn == NULL) {
812		zap_unlockdir(zap);
813		return (ENOTSUP);
814	}
815	if (!zap->zap_ismicro) {
816		err = fzap_add(zn, integer_size, num_integers, val, tx);
817		zap = zn->zn_zap;	/* fzap_add() may change zap */
818	} else if (integer_size != 8 || num_integers != 1 ||
819	    strlen(name) >= MZAP_NAME_LEN) {
820		dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
821		    zapobj, integer_size, num_integers, name);
822		err = mzap_upgrade(&zn->zn_zap, tx);
823		if (err == 0)
824			err = fzap_add(zn, integer_size, num_integers, val, tx);
825		zap = zn->zn_zap;	/* fzap_add() may change zap */
826	} else {
827		mze = mze_find(zn);
828		if (mze != NULL) {
829			err = EEXIST;
830		} else {
831			mzap_addent(zn, *intval);
832		}
833	}
834	ASSERT(zap == zn->zn_zap);
835	zap_name_free(zn);
836	if (zap != NULL)	/* may be NULL if fzap_add() failed */
837		zap_unlockdir(zap);
838	return (err);
839}
840
841int
842zap_update(objset_t *os, uint64_t zapobj, const char *name,
843    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
844{
845	zap_t *zap;
846	mzap_ent_t *mze;
847	const uint64_t *intval = val;
848	zap_name_t *zn;
849	int err;
850
851	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
852	if (err)
853		return (err);
854	zn = zap_name_alloc(zap, name, MT_EXACT);
855	if (zn == NULL) {
856		zap_unlockdir(zap);
857		return (ENOTSUP);
858	}
859	if (!zap->zap_ismicro) {
860		err = fzap_update(zn, integer_size, num_integers, val, tx);
861		zap = zn->zn_zap;	/* fzap_update() may change zap */
862	} else if (integer_size != 8 || num_integers != 1 ||
863	    strlen(name) >= MZAP_NAME_LEN) {
864		dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
865		    zapobj, integer_size, num_integers, name);
866		err = mzap_upgrade(&zn->zn_zap, tx);
867		if (err == 0)
868			err = fzap_update(zn, integer_size, num_integers,
869			    val, tx);
870		zap = zn->zn_zap;	/* fzap_update() may change zap */
871	} else {
872		mze = mze_find(zn);
873		if (mze != NULL) {
874			mze->mze_phys.mze_value = *intval;
875			zap->zap_m.zap_phys->mz_chunk
876			    [mze->mze_chunkid].mze_value = *intval;
877		} else {
878			mzap_addent(zn, *intval);
879		}
880	}
881	ASSERT(zap == zn->zn_zap);
882	zap_name_free(zn);
883	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
884		zap_unlockdir(zap);
885	return (err);
886}
887
888int
889zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
890{
891	return (zap_remove_norm(os, zapobj, name, MT_EXACT, tx));
892}
893
894int
895zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
896    matchtype_t mt, dmu_tx_t *tx)
897{
898	zap_t *zap;
899	int err;
900	mzap_ent_t *mze;
901	zap_name_t *zn;
902
903	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap);
904	if (err)
905		return (err);
906	zn = zap_name_alloc(zap, name, mt);
907	if (zn == NULL) {
908		zap_unlockdir(zap);
909		return (ENOTSUP);
910	}
911	if (!zap->zap_ismicro) {
912		err = fzap_remove(zn, tx);
913	} else {
914		mze = mze_find(zn);
915		if (mze == NULL) {
916			err = ENOENT;
917		} else {
918			zap->zap_m.zap_num_entries--;
919			bzero(&zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid],
920			    sizeof (mzap_ent_phys_t));
921			mze_remove(zap, mze);
922		}
923	}
924	zap_name_free(zn);
925	zap_unlockdir(zap);
926	return (err);
927}
928
929/*
930 * Routines for iterating over the attributes.
931 */
932
933/*
934 * We want to keep the high 32 bits of the cursor zero if we can, so
935 * that 32-bit programs can access this.  So use a small hash value so
936 * we can fit 4 bits of cd into the 32-bit cursor.
937 *
938 * [ 4 zero bits | 32-bit collision differentiator | 28-bit hash value ]
939 */
940void
941zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
942    uint64_t serialized)
943{
944	zc->zc_objset = os;
945	zc->zc_zap = NULL;
946	zc->zc_leaf = NULL;
947	zc->zc_zapobj = zapobj;
948	if (serialized == -1ULL) {
949		zc->zc_hash = -1ULL;
950		zc->zc_cd = 0;
951	} else {
952		zc->zc_hash = serialized << (64-ZAP_HASHBITS);
953		zc->zc_cd = serialized >> ZAP_HASHBITS;
954		if (zc->zc_cd >= ZAP_MAXCD) /* corrupt serialized */
955			zc->zc_cd = 0;
956	}
957}
958
959void
960zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
961{
962	zap_cursor_init_serialized(zc, os, zapobj, 0);
963}
964
965void
966zap_cursor_fini(zap_cursor_t *zc)
967{
968	if (zc->zc_zap) {
969		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
970		zap_unlockdir(zc->zc_zap);
971		zc->zc_zap = NULL;
972	}
973	if (zc->zc_leaf) {
974		rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
975		zap_put_leaf(zc->zc_leaf);
976		zc->zc_leaf = NULL;
977	}
978	zc->zc_objset = NULL;
979}
980
981uint64_t
982zap_cursor_serialize(zap_cursor_t *zc)
983{
984	if (zc->zc_hash == -1ULL)
985		return (-1ULL);
986	ASSERT((zc->zc_hash & (ZAP_MAXCD-1)) == 0);
987	ASSERT(zc->zc_cd < ZAP_MAXCD);
988	return ((zc->zc_hash >> (64-ZAP_HASHBITS)) |
989	    ((uint64_t)zc->zc_cd << ZAP_HASHBITS));
990}
991
992int
993zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
994{
995	int err;
996	avl_index_t idx;
997	mzap_ent_t mze_tofind;
998	mzap_ent_t *mze;
999
1000	if (zc->zc_hash == -1ULL)
1001		return (ENOENT);
1002
1003	if (zc->zc_zap == NULL) {
1004		err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
1005		    RW_READER, TRUE, FALSE, &zc->zc_zap);
1006		if (err)
1007			return (err);
1008	} else {
1009		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
1010	}
1011	if (!zc->zc_zap->zap_ismicro) {
1012		err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
1013	} else {
1014		err = ENOENT;
1015
1016		mze_tofind.mze_hash = zc->zc_hash;
1017		mze_tofind.mze_phys.mze_cd = zc->zc_cd;
1018
1019		mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx);
1020		if (mze == NULL) {
1021			mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl,
1022			    idx, AVL_AFTER);
1023		}
1024		if (mze) {
1025			ASSERT(0 == bcmp(&mze->mze_phys,
1026			    &zc->zc_zap->zap_m.zap_phys->mz_chunk
1027			    [mze->mze_chunkid], sizeof (mze->mze_phys)));
1028
1029			za->za_normalization_conflict =
1030			    mzap_normalization_conflict(zc->zc_zap, NULL, mze);
1031			za->za_integer_length = 8;
1032			za->za_num_integers = 1;
1033			za->za_first_integer = mze->mze_phys.mze_value;
1034			(void) strcpy(za->za_name, mze->mze_phys.mze_name);
1035			zc->zc_hash = mze->mze_hash;
1036			zc->zc_cd = mze->mze_phys.mze_cd;
1037			err = 0;
1038		} else {
1039			zc->zc_hash = -1ULL;
1040		}
1041	}
1042	rw_exit(&zc->zc_zap->zap_rwlock);
1043	return (err);
1044}
1045
1046void
1047zap_cursor_advance(zap_cursor_t *zc)
1048{
1049	if (zc->zc_hash == -1ULL)
1050		return;
1051	zc->zc_cd++;
1052	if (zc->zc_cd >= ZAP_MAXCD) {
1053		zc->zc_cd = 0;
1054		zc->zc_hash += 1ULL<<(64-ZAP_HASHBITS);
1055		if (zc->zc_hash == 0) /* EOF */
1056			zc->zc_hash = -1ULL;
1057	}
1058}
1059
1060int
1061zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs)
1062{
1063	int err;
1064	zap_t *zap;
1065
1066	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
1067	if (err)
1068		return (err);
1069
1070	bzero(zs, sizeof (zap_stats_t));
1071
1072	if (zap->zap_ismicro) {
1073		zs->zs_blocksize = zap->zap_dbuf->db_size;
1074		zs->zs_num_entries = zap->zap_m.zap_num_entries;
1075		zs->zs_num_blocks = 1;
1076	} else {
1077		fzap_get_stats(zap, zs);
1078	}
1079	zap_unlockdir(zap);
1080	return (0);
1081}
1082