zap_leaf.c revision 219089
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * The 512-byte leaf is broken into 32 16-byte chunks.
 * Chunk number n means l_chunk[n], even though the header precedes it.
 * The names are stored null-terminated.
 */

#include <sys/zio.h>
#include <sys/spa.h>
#include <sys/dmu.h>
#include <sys/zfs_context.h>
#include <sys/fs/zfs.h>
#include <sys/zap.h>
#include <sys/zap_impl.h>
#include <sys/zap_leaf.h>
#include <sys/arc.h>

static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry);

#define	CHAIN_END 0xffff /* end of the chunk chain */

/* half the (current) minimum block size */
#define	MAX_ARRAY_BYTES (8<<10)

#define	LEAF_HASH(l, h) \
	((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \
	((h) >> (64 - ZAP_LEAF_HASH_SHIFT(l)-(l)->l_phys->l_hdr.lh_prefix_len)))

#define	LEAF_HASH_ENTPTR(l, h) (&(l)->l_phys->l_hash[LEAF_HASH(l, h)])

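/* Set 'n' bytes beginning at 'a' to the byte value 'c', like memset(). */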
static void
zap_memset(void *a, int c, size_t n)
{
	char *cp = a;
	char *cpend = cp + n;

	while (cp < cpend)
		*cp++ = c;
}

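/* Store a 'len'-byte (1/2/4/8) native-endian integer 'value' at 'addr'. */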
static void
stv(int len, void *addr, uint64_t value)
{
	switch (len) {
	case 1:
		*(uint8_t *)addr = value;
		return;
	case 2:
		*(uint16_t *)addr = value;
		return;
	case 4:
		*(uint32_t *)addr = value;
		return;
	case 8:
		*(uint64_t *)addr = value;
		return;
	}
	ASSERT(!"bad int len");
}

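/* Load a 'len'-byte (1/2/4/8) native-endian integer from 'addr'. */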
static uint64_t
ldv(int len, const void *addr)
{
	switch (len) {
	case 1:
		return (*(uint8_t *)addr);
	case 2:
		return (*(uint16_t *)addr);
	case 4:
		return (*(uint32_t *)addr);
	case 8:
		return (*(uint64_t *)addr);
	}
	ASSERT(!"bad int len");
	return (0xFEEDFACEDEADBEEFULL);
}

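/*
 * Byteswap an entire leaf block: the header, the hash table, and then
 * each chunk according to its type (entry, free, or array).
 */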
void
zap_leaf_byteswap(zap_leaf_phys_t *buf, int size)
{
	int i;
	zap_leaf_t l;
	l.l_bs = highbit(size)-1;
	l.l_phys = buf;

	buf->l_hdr.lh_block_type =	BSWAP_64(buf->l_hdr.lh_block_type);
	buf->l_hdr.lh_prefix =		BSWAP_64(buf->l_hdr.lh_prefix);
	buf->l_hdr.lh_magic =		BSWAP_32(buf->l_hdr.lh_magic);
	buf->l_hdr.lh_nfree =		BSWAP_16(buf->l_hdr.lh_nfree);
	buf->l_hdr.lh_nentries =	BSWAP_16(buf->l_hdr.lh_nentries);
	buf->l_hdr.lh_prefix_len =	BSWAP_16(buf->l_hdr.lh_prefix_len);
	buf->l_hdr.lh_freelist =	BSWAP_16(buf->l_hdr.lh_freelist);

	for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++)
		buf->l_hash[i] = BSWAP_16(buf->l_hash[i]);

	for (i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) {
		zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i);
		struct zap_leaf_entry *le;

		switch (lc->l_free.lf_type) {
		case ZAP_CHUNK_ENTRY:
			le = &lc->l_entry;

			le->le_type =		BSWAP_8(le->le_type);
			le->le_value_intlen =	BSWAP_8(le->le_value_intlen);
			le->le_next =		BSWAP_16(le->le_next);
			le->le_name_chunk =	BSWAP_16(le->le_name_chunk);
			le->le_name_numints =	BSWAP_16(le->le_name_numints);
			le->le_value_chunk =	BSWAP_16(le->le_value_chunk);
			le->le_value_numints =	BSWAP_16(le->le_value_numints);
			le->le_cd =		BSWAP_32(le->le_cd);
			le->le_hash =		BSWAP_64(le->le_hash);
			break;
		case ZAP_CHUNK_FREE:
			lc->l_free.lf_type =	BSWAP_8(lc->l_free.lf_type);
			lc->l_free.lf_next =	BSWAP_16(lc->l_free.lf_next);
			break;
		case ZAP_CHUNK_ARRAY:
			lc->l_array.la_type =	BSWAP_8(lc->l_array.la_type);
			lc->l_array.la_next =	BSWAP_16(lc->l_array.la_next);
			/* la_array doesn't need swapping */
			break;
		default:
			ASSERT(!"bad leaf type");
		}
	}
}

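/*
 * Initialize an empty leaf: zero the header, point every hash bucket at
 * CHAIN_END, and thread all chunks onto the freelist.
 */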
void
zap_leaf_init(zap_leaf_t *l, boolean_t sort)
{
	int i;

	l->l_bs = highbit(l->l_dbuf->db_size)-1;
	zap_memset(&l->l_phys->l_hdr, 0, sizeof (struct zap_leaf_header));
	zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l));
	for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
		ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE;
		ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1;
	}
	ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next = CHAIN_END;
	l->l_phys->l_hdr.lh_block_type = ZBT_LEAF;
	l->l_phys->l_hdr.lh_magic = ZAP_LEAF_MAGIC;
	l->l_phys->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
	if (sort)
		l->l_phys->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
}

/*
 * Routines which manipulate leaf chunks (l_chunk[]).
 */

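/* Pop the first chunk off the leaf's freelist and return its index. */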
static uint16_t
zap_leaf_chunk_alloc(zap_leaf_t *l)
{
	int chunk;

	ASSERT(l->l_phys->l_hdr.lh_nfree > 0);

	chunk = l->l_phys->l_hdr.lh_freelist;
	ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
	ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE);

	l->l_phys->l_hdr.lh_freelist = ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next;

	l->l_phys->l_hdr.lh_nfree--;

	return (chunk);
}

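/* Push the chunk back onto the head of the leaf's freelist. */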
static void
zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk)
{
	struct zap_leaf_free *zlf = &ZAP_LEAF_CHUNK(l, chunk).l_free;
	ASSERT3U(l->l_phys->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l));
	ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
	ASSERT(zlf->lf_type != ZAP_CHUNK_FREE);

	zlf->lf_type = ZAP_CHUNK_FREE;
	zlf->lf_next = l->l_phys->l_hdr.lh_freelist;
	bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */
	l->l_phys->l_hdr.lh_freelist = chunk;

	l->l_phys->l_hdr.lh_nfree++;
}

/*
 * Routines which manipulate leaf arrays (zap_leaf_array type chunks).
 */

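/*
 * Copy 'num_integers' integers of 'integer_size' bytes each from 'buf' into
 * a newly allocated chain of ZAP_CHUNK_ARRAY chunks, storing each integer
 * byte-by-byte in big-endian order.  Returns the index of the first chunk.
 */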
static uint16_t
zap_leaf_array_create(zap_leaf_t *l, const char *buf,
    int integer_size, int num_integers)
{
	uint16_t chunk_head;
	uint16_t *chunkp = &chunk_head;
	int byten = 0;
	uint64_t value;
	int shift = (integer_size-1)*8;
	int len = num_integers;

	ASSERT3U(num_integers * integer_size, <, MAX_ARRAY_BYTES);

	while (len > 0) {
		uint16_t chunk = zap_leaf_chunk_alloc(l);
		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
		int i;

		la->la_type = ZAP_CHUNK_ARRAY;
		for (i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) {
			if (byten == 0)
				value = ldv(integer_size, buf);
			la->la_array[i] = value >> shift;
			value <<= 8;
			if (++byten == integer_size) {
				byten = 0;
				buf += integer_size;
				if (--len == 0)
					break;
			}
		}

		*chunkp = chunk;
		chunkp = &la->la_next;
	}
	*chunkp = CHAIN_END;

	return (chunk_head);
}

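/* Free the chunk chain at *chunkp and reset *chunkp to CHAIN_END. */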
static void
zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp)
{
	uint16_t chunk = *chunkp;

	*chunkp = CHAIN_END;

	while (chunk != CHAIN_END) {
		int nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next;
		ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==,
		    ZAP_CHUNK_ARRAY);
		zap_leaf_chunk_free(l, chunk);
		chunk = nextchunk;
	}
}

/* array_len and buf_len are in integers, not bytes */
static void
zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
    int array_int_len, int array_len, int buf_int_len, uint64_t buf_len,
    void *buf)
{
	int len = MIN(array_len, buf_len);
	int byten = 0;
	uint64_t value = 0;
	char *p = buf;

	ASSERT3U(array_int_len, <=, buf_int_len);

	/* Fast path for one 8-byte integer */
	if (array_int_len == 8 && buf_int_len == 8 && len == 1) {
		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
		uint8_t *ip = la->la_array;
		uint64_t *buf64 = buf;

		*buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 |
		    (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 |
		    (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 |
		    (uint64_t)ip[6] << 8 | (uint64_t)ip[7];
		return;
	}

	/* Fast path for an array of 1-byte integers (e.g. the entry name) */
	if (array_int_len == 1 && buf_int_len == 1 &&
	    buf_len > array_len + ZAP_LEAF_ARRAY_BYTES) {
		while (chunk != CHAIN_END) {
			struct zap_leaf_array *la =
			    &ZAP_LEAF_CHUNK(l, chunk).l_array;
			bcopy(la->la_array, p, ZAP_LEAF_ARRAY_BYTES);
			p += ZAP_LEAF_ARRAY_BYTES;
			chunk = la->la_next;
		}
		return;
	}

	while (len > 0) {
		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
		int i;

		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
		for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
			value = (value << 8) | la->la_array[i];
			byten++;
			if (byten == array_int_len) {
				stv(buf_int_len, p, value);
				byten = 0;
				len--;
				if (len == 0)
					return;
				p += buf_int_len;
			}
		}
		chunk = la->la_next;
	}
}

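/*
 * Return B_TRUE if the name stored in the chunk chain matches the key in
 * 'zn', honoring zn's match type (exact bytes, or normalized via
 * zap_match() for MT_FIRST).
 */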
static boolean_t
zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn,
    int chunk, int array_numints)
{
	int bseen = 0;

	if (zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY) {
		uint64_t *thiskey;
		boolean_t match;

		ASSERT(zn->zn_key_intlen == sizeof (*thiskey));
		thiskey = kmem_alloc(array_numints * sizeof (*thiskey),
		    KM_SLEEP);

		zap_leaf_array_read(l, chunk, sizeof (*thiskey), array_numints,
		    sizeof (*thiskey), array_numints, thiskey);
		match = bcmp(thiskey, zn->zn_key_orig,
		    array_numints * sizeof (*thiskey)) == 0;
		kmem_free(thiskey, array_numints * sizeof (*thiskey));
		return (match);
	}

	ASSERT(zn->zn_key_intlen == 1);
	if (zn->zn_matchtype == MT_FIRST) {
		char *thisname = kmem_alloc(array_numints, KM_SLEEP);
		boolean_t match;

		zap_leaf_array_read(l, chunk, sizeof (char), array_numints,
		    sizeof (char), array_numints, thisname);
		match = zap_match(zn, thisname);
		kmem_free(thisname, array_numints);
		return (match);
	}

	/*
	 * Fast path for exact matching.
	 * First check that the lengths match, so that we don't read
	 * past the end of the zn_key_orig array.
	 */
	if (array_numints != zn->zn_key_orig_numints)
		return (B_FALSE);
	while (bseen < array_numints) {
		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
		int toread = MIN(array_numints - bseen, ZAP_LEAF_ARRAY_BYTES);
		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
		if (bcmp(la->la_array, (char *)zn->zn_key_orig + bseen, toread))
			break;
		chunk = la->la_next;
		bseen += toread;
	}
	return (bseen == array_numints);
}

/*
 * Routines which manipulate leaf entries.
 */

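/*
 * Find the entry named by 'zn' in this leaf.  On success, fill in 'zeh'
 * and return 0; otherwise return ENOENT.
 */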
int
zap_leaf_lookup(zap_leaf_t *l, zap_name_t *zn, zap_entry_handle_t *zeh)
{
	uint16_t *chunkp;
	struct zap_leaf_entry *le;

	ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);

again:
	for (chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash);
	    *chunkp != CHAIN_END; chunkp = &le->le_next) {
		uint16_t chunk = *chunkp;
		le = ZAP_LEAF_ENTRY(l, chunk);

		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
		ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);

		if (le->le_hash != zn->zn_hash)
			continue;

		/*
		 * NB: the entry chain is always sorted by cd on
		 * normalized zap objects, so this will find the
		 * lowest-cd match for MT_FIRST.
		 */
		ASSERT(zn->zn_matchtype == MT_EXACT ||
		    (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED));
		if (zap_leaf_array_match(l, zn, le->le_name_chunk,
		    le->le_name_numints)) {
			zeh->zeh_num_integers = le->le_value_numints;
			zeh->zeh_integer_size = le->le_value_intlen;
			zeh->zeh_cd = le->le_cd;
			zeh->zeh_hash = le->le_hash;
			zeh->zeh_chunkp = chunkp;
			zeh->zeh_leaf = l;
			return (0);
		}
	}

	/*
	 * NB: we could of course do this in one pass, but that would be
	 * a pain.  We'll see if MT_BEST is even used much.
	 */
	if (zn->zn_matchtype == MT_BEST) {
		zn->zn_matchtype = MT_FIRST;
		goto again;
	}

	return (ENOENT);
}

/* Return (h1,cd1 >= h2,cd2) */
#define	HCD_GTEQ(h1, cd1, h2, cd2) \
	((h1 > h2) ? TRUE : ((h1 == h2 && cd1 >= cd2) ? TRUE : FALSE))

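/*
 * Find the entry with the smallest (hash, cd) that is greater than or
 * equal to (h, cd).  Returns ENOENT if the leaf has no such entry.
 */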
int
zap_leaf_lookup_closest(zap_leaf_t *l,
    uint64_t h, uint32_t cd, zap_entry_handle_t *zeh)
{
	uint16_t chunk;
	uint64_t besth = -1ULL;
	uint32_t bestcd = -1U;
	uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1;
	uint16_t lh;
	struct zap_leaf_entry *le;

	ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);

	for (lh = LEAF_HASH(l, h); lh <= bestlh; lh++) {
		for (chunk = l->l_phys->l_hash[lh];
		    chunk != CHAIN_END; chunk = le->le_next) {
			le = ZAP_LEAF_ENTRY(l, chunk);

			ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
			ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);

			if (HCD_GTEQ(le->le_hash, le->le_cd, h, cd) &&
			    HCD_GTEQ(besth, bestcd, le->le_hash, le->le_cd)) {
				ASSERT3U(bestlh, >=, lh);
				bestlh = lh;
				besth = le->le_hash;
				bestcd = le->le_cd;

				zeh->zeh_num_integers = le->le_value_numints;
				zeh->zeh_integer_size = le->le_value_intlen;
				zeh->zeh_cd = le->le_cd;
				zeh->zeh_hash = le->le_hash;
				zeh->zeh_fakechunk = chunk;
				zeh->zeh_chunkp = &zeh->zeh_fakechunk;
				zeh->zeh_leaf = l;
			}
		}
	}

	return (bestcd == -1U ? ENOENT : 0);
}

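/*
 * Copy the entry's value into 'buf'.  Returns EINVAL if the value's
 * integers are wider than 'integer_size', and EOVERFLOW if the value
 * contains more than 'num_integers' integers (what fits is still copied).
 */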
int
zap_entry_read(const zap_entry_handle_t *zeh,
    uint8_t integer_size, uint64_t num_integers, void *buf)
{
	struct zap_leaf_entry *le =
	    ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);

	if (le->le_value_intlen > integer_size)
		return (EINVAL);

	zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk,
	    le->le_value_intlen, le->le_value_numints,
	    integer_size, num_integers, buf);

	if (zeh->zeh_num_integers > num_integers)
		return (EOVERFLOW);
	return (0);
}

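/* Copy the entry's name into 'buf'; EOVERFLOW if it exceeds 'buflen'. */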
int
zap_entry_read_name(zap_t *zap, const zap_entry_handle_t *zeh, uint16_t buflen,
    char *buf)
{
	struct zap_leaf_entry *le =
	    ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);

	if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
		zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 8,
		    le->le_name_numints, 8, buflen / 8, buf);
	} else {
		zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1,
		    le->le_name_numints, 1, buflen, buf);
	}
	if (le->le_name_numints > buflen)
		return (EOVERFLOW);
	return (0);
}

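/*
 * Replace the entry's value.  Returns EAGAIN if the leaf lacks enough
 * free chunks to hold the new value; the old value is untouched in that
 * case.
 */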
int
zap_entry_update(zap_entry_handle_t *zeh,
    uint8_t integer_size, uint64_t num_integers, const void *buf)
{
	int delta_chunks;
	zap_leaf_t *l = zeh->zeh_leaf;
	struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp);

	delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) -
	    ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen);

	if ((int)l->l_phys->l_hdr.lh_nfree < delta_chunks)
		return (EAGAIN);

	zap_leaf_array_free(l, &le->le_value_chunk);
	le->le_value_chunk =
	    zap_leaf_array_create(l, buf, integer_size, num_integers);
	le->le_value_numints = num_integers;
	le->le_value_intlen = integer_size;
	return (0);
}

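/*
 * Remove the entry: free its name and value chunk chains, unlink it from
 * its hash chain, and return the entry chunk to the freelist.
 */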
void
zap_entry_remove(zap_entry_handle_t *zeh)
{
	uint16_t entry_chunk;
	struct zap_leaf_entry *le;
	zap_leaf_t *l = zeh->zeh_leaf;

	ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk);

	entry_chunk = *zeh->zeh_chunkp;
	le = ZAP_LEAF_ENTRY(l, entry_chunk);
	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);

	zap_leaf_array_free(l, &le->le_name_chunk);
	zap_leaf_array_free(l, &le->le_value_chunk);

	*zeh->zeh_chunkp = le->le_next;
	zap_leaf_chunk_free(l, entry_chunk);

	l->l_phys->l_hdr.lh_nentries--;
}

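/*
 * Create a new entry for 'zn' with the given value.  If 'cd' is
 * ZAP_NEED_CD, assign the lowest collision differentiator not already in
 * use for this hash value.  Returns E2BIG if the entry could never fit in
 * a leaf, EAGAIN if this particular leaf is out of free chunks.
 */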
int
zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd,
    uint8_t integer_size, uint64_t num_integers, const void *buf,
    zap_entry_handle_t *zeh)
{
	uint16_t chunk;
	uint16_t *chunkp;
	struct zap_leaf_entry *le;
	uint64_t valuelen;
	int numchunks;
	uint64_t h = zn->zn_hash;

	valuelen = integer_size * num_integers;

	numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints *
	    zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen);
	if (numchunks > ZAP_LEAF_NUMCHUNKS(l))
		return (E2BIG);

	if (cd == ZAP_NEED_CD) {
		/* find the lowest unused cd */
		if (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) {
			cd = 0;

			for (chunk = *LEAF_HASH_ENTPTR(l, h);
			    chunk != CHAIN_END; chunk = le->le_next) {
				le = ZAP_LEAF_ENTRY(l, chunk);
				if (le->le_cd > cd)
					break;
				if (le->le_hash == h) {
					ASSERT3U(cd, ==, le->le_cd);
					cd++;
				}
			}
		} else {
			/* old unsorted format; do it the O(n^2) way */
			for (cd = 0; ; cd++) {
				for (chunk = *LEAF_HASH_ENTPTR(l, h);
				    chunk != CHAIN_END; chunk = le->le_next) {
					le = ZAP_LEAF_ENTRY(l, chunk);
					if (le->le_hash == h &&
					    le->le_cd == cd) {
						break;
					}
				}
				/* If this cd is not in use, we are good. */
				if (chunk == CHAIN_END)
					break;
			}
		}
		/*
		 * We would run out of space in a block before we could
		 * store enough entries to run out of CD values.
		 */
		ASSERT3U(cd, <, zap_maxcd(zn->zn_zap));
	}

	if (l->l_phys->l_hdr.lh_nfree < numchunks)
		return (EAGAIN);

	/* make the entry */
	chunk = zap_leaf_chunk_alloc(l);
	le = ZAP_LEAF_ENTRY(l, chunk);
	le->le_type = ZAP_CHUNK_ENTRY;
	le->le_name_chunk = zap_leaf_array_create(l, zn->zn_key_orig,
	    zn->zn_key_intlen, zn->zn_key_orig_numints);
	le->le_name_numints = zn->zn_key_orig_numints;
	le->le_value_chunk =
	    zap_leaf_array_create(l, buf, integer_size, num_integers);
	le->le_value_numints = num_integers;
	le->le_value_intlen = integer_size;
	le->le_hash = h;
	le->le_cd = cd;

	/* link it into the hash chain */
	/* XXX if we did the search above, we could just use that */
	chunkp = zap_leaf_rehash_entry(l, chunk);

	l->l_phys->l_hdr.lh_nentries++;

	zeh->zeh_leaf = l;
	zeh->zeh_num_integers = num_integers;
	zeh->zeh_integer_size = le->le_value_intlen;
	zeh->zeh_cd = le->le_cd;
	zeh->zeh_hash = le->le_hash;
	zeh->zeh_chunkp = chunkp;

	return (0);
}

/*
 * Determine if there is another entry with the same normalized form.
 * For performance purposes, either zn or name must be provided (the
 * other can be NULL).  Note, there usually won't be any hash
 * conflicts, in which case we don't need the concatenated/normalized
 * form of the name.  But all callers have one of these on hand anyway,
 * so might as well take advantage.  A cleaner but slower interface
 * would accept neither argument, and compute the normalized name as
 * needed (using zap_name_alloc(zap_entry_read_name(zeh))).
 */
boolean_t
zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn,
    const char *name, zap_t *zap)
{
	uint64_t chunk;
	struct zap_leaf_entry *le;
	boolean_t allocdzn = B_FALSE;

	if (zap->zap_normflags == 0)
		return (B_FALSE);

	for (chunk = *LEAF_HASH_ENTPTR(zeh->zeh_leaf, zeh->zeh_hash);
	    chunk != CHAIN_END; chunk = le->le_next) {
		le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, chunk);
		if (le->le_hash != zeh->zeh_hash)
			continue;
		if (le->le_cd == zeh->zeh_cd)
			continue;

		if (zn == NULL) {
			zn = zap_name_alloc(zap, name, MT_FIRST);
			allocdzn = B_TRUE;
		}
		if (zap_leaf_array_match(zeh->zeh_leaf, zn,
		    le->le_name_chunk, le->le_name_numints)) {
			if (allocdzn)
				zap_name_free(zn);
			return (B_TRUE);
		}
	}
	if (allocdzn)
		zap_name_free(zn);
	return (B_FALSE);
}

/*
 * Routines for transferring entries between leaves.
 */

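/*
 * Link the entry chunk 'entry' into the hash chain for its hash value,
 * keeping the chain sorted by cd, and return the address of the pointer
 * that now refers to it.
 */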
static uint16_t *
zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry)
{
	struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry);
	struct zap_leaf_entry *le2;
	uint16_t *chunkp;

	/*
	 * Keep the entry chain sorted by cd.
	 * NB: this will not cause problems for unsorted leaves, though
	 * it is unnecessary there.
	 */
	for (chunkp = LEAF_HASH_ENTPTR(l, le->le_hash);
	    *chunkp != CHAIN_END; chunkp = &le2->le_next) {
		le2 = ZAP_LEAF_ENTRY(l, *chunkp);
		if (le2->le_cd > le->le_cd)
			break;
	}

	le->le_next = *chunkp;
	*chunkp = entry;
	return (chunkp);
}

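/*
 * Copy a chunk chain from leaf 'l' into newly allocated chunks in 'nl',
 * freeing the old chunks, and return the head of the new chain.
 */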
static uint16_t
zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl)
{
	uint16_t new_chunk;
	uint16_t *nchunkp = &new_chunk;

	while (chunk != CHAIN_END) {
		uint16_t nchunk = zap_leaf_chunk_alloc(nl);
		struct zap_leaf_array *nla =
		    &ZAP_LEAF_CHUNK(nl, nchunk).l_array;
		struct zap_leaf_array *la =
		    &ZAP_LEAF_CHUNK(l, chunk).l_array;
		int nextchunk = la->la_next;

		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
		ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l));

		*nla = *la; /* structure assignment */

		zap_leaf_chunk_free(l, chunk);
		chunk = nextchunk;
		*nchunkp = nchunk;
		nchunkp = &nla->la_next;
	}
	*nchunkp = CHAIN_END;
	return (new_chunk);
}

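/*
 * Move the entry at chunk 'entry', with its name and value arrays, from
 * leaf 'l' to leaf 'nl', and rehash it into nl's hash table.
 */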
static void
zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl)
{
	struct zap_leaf_entry *le, *nle;
	uint16_t chunk;

	le = ZAP_LEAF_ENTRY(l, entry);
	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);

	chunk = zap_leaf_chunk_alloc(nl);
	nle = ZAP_LEAF_ENTRY(nl, chunk);
	*nle = *le; /* structure assignment */

	(void) zap_leaf_rehash_entry(nl, chunk);

	nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl);
	nle->le_value_chunk =
	    zap_leaf_transfer_array(l, le->le_value_chunk, nl);

	zap_leaf_chunk_free(l, entry);

	l->l_phys->l_hdr.lh_nentries--;
	nl->l_phys->l_hdr.lh_nentries++;
}

/*
 * Transfer the entries whose hash prefix ends in 1 to the new leaf.
 */
void
zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort)
{
	int i;
	int bit = 64 - 1 - l->l_phys->l_hdr.lh_prefix_len;

	/* set new prefix and prefix_len */
	l->l_phys->l_hdr.lh_prefix <<= 1;
	l->l_phys->l_hdr.lh_prefix_len++;
	nl->l_phys->l_hdr.lh_prefix = l->l_phys->l_hdr.lh_prefix | 1;
	nl->l_phys->l_hdr.lh_prefix_len = l->l_phys->l_hdr.lh_prefix_len;

	/* break existing hash chains */
	zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l));

	if (sort)
		l->l_phys->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;

	/*
	 * Transfer entries whose hash bit 'bit' is set to nl; rehash
	 * the remaining entries.
	 *
	 * NB: We could find entries via the hashtable instead. That
	 * would be O(hashents+numents) rather than O(numblks+numents),
	 * but this accesses memory more sequentially, and when we're
	 * called, the block is usually pretty full.
	 */
	for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
		struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i);
		if (le->le_type != ZAP_CHUNK_ENTRY)
			continue;

		if (le->le_hash & (1ULL << bit))
			zap_leaf_transfer_entry(l, i, nl);
		else
			(void) zap_leaf_rehash_entry(l, i);
	}
}

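/*
 * Accumulate histogram data about this leaf into 'zs': pointer-table
 * coverage, entry counts, block fullness, bucket chain lengths, and
 * chunks consumed per entry.
 */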
void
zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
{
	int i, n;

	n = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift -
	    l->l_phys->l_hdr.lh_prefix_len;
	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
	zs->zs_leafs_with_2n_pointers[n]++;

	n = l->l_phys->l_hdr.lh_nentries/5;
	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
	zs->zs_blocks_with_n5_entries[n]++;

	n = ((1<<FZAP_BLOCK_SHIFT(zap)) -
	    l->l_phys->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 /
	    (1<<FZAP_BLOCK_SHIFT(zap));
	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
	zs->zs_blocks_n_tenths_full[n]++;

	for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) {
		int nentries = 0;
		int chunk = l->l_phys->l_hash[i];

		while (chunk != CHAIN_END) {
			struct zap_leaf_entry *le =
			    ZAP_LEAF_ENTRY(l, chunk);

			n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_numints) +
			    ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints *
			    le->le_value_intlen);
			n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
			zs->zs_entries_using_n_chunks[n]++;

			chunk = le->le_next;
			nentries++;
		}

		n = nentries;
		n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
		zs->zs_buckets_with_n_entries[n]++;
	}
}