zap_leaf.c revision 247187
1168404Spjd/*
2168404Spjd * CDDL HEADER START
3168404Spjd *
4168404Spjd * The contents of this file are subject to the terms of the
5168404Spjd * Common Development and Distribution License (the "License").
6168404Spjd * You may not use this file except in compliance with the License.
7168404Spjd *
8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9168404Spjd * or http://www.opensolaris.org/os/licensing.
10168404Spjd * See the License for the specific language governing permissions
11168404Spjd * and limitations under the License.
12168404Spjd *
13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each
14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15168404Spjd * If applicable, add the following below this CDDL HEADER, with the
16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying
17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
18168404Spjd *
19168404Spjd * CDDL HEADER END
20168404Spjd */
21168404Spjd/*
22219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23168404Spjd */
24168404Spjd
25168404Spjd/*
26168404Spjd * The 512-byte leaf is broken into 32 16-byte chunks.
27168404Spjd * chunk number n means l_chunk[n], even though the header precedes it.
28168404Spjd * the names are stored null-terminated.
29168404Spjd */
30168404Spjd
31219089Spjd#include <sys/zio.h>
32219089Spjd#include <sys/spa.h>
33219089Spjd#include <sys/dmu.h>
34168404Spjd#include <sys/zfs_context.h>
35219089Spjd#include <sys/fs/zfs.h>
36168404Spjd#include <sys/zap.h>
37168404Spjd#include <sys/zap_impl.h>
38168404Spjd#include <sys/zap_leaf.h>
39219089Spjd#include <sys/arc.h>
40168404Spjd
41185029Spjdstatic uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry);
42185029Spjd
43168404Spjd#define	CHAIN_END 0xffff /* end of the chunk chain */
44168404Spjd
45168404Spjd/* half the (current) minimum block size */
46168404Spjd#define	MAX_ARRAY_BYTES (8<<10)
47168404Spjd
48168404Spjd#define	LEAF_HASH(l, h) \
49168404Spjd	((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \
50168404Spjd	((h) >> (64 - ZAP_LEAF_HASH_SHIFT(l)-(l)->l_phys->l_hdr.lh_prefix_len)))
51168404Spjd
52168404Spjd#define	LEAF_HASH_ENTPTR(l, h) (&(l)->l_phys->l_hash[LEAF_HASH(l, h)])
53168404Spjd
54168404Spjd
55168404Spjdstatic void
56168404Spjdzap_memset(void *a, int c, size_t n)
57168404Spjd{
58168404Spjd	char *cp = a;
59168404Spjd	char *cpend = cp + n;
60168404Spjd
61168404Spjd	while (cp < cpend)
62168404Spjd		*cp++ = c;
63168404Spjd}
64168404Spjd
65168404Spjdstatic void
66168404Spjdstv(int len, void *addr, uint64_t value)
67168404Spjd{
68168404Spjd	switch (len) {
69168404Spjd	case 1:
70168404Spjd		*(uint8_t *)addr = value;
71168404Spjd		return;
72168404Spjd	case 2:
73168404Spjd		*(uint16_t *)addr = value;
74168404Spjd		return;
75168404Spjd	case 4:
76168404Spjd		*(uint32_t *)addr = value;
77168404Spjd		return;
78168404Spjd	case 8:
79168404Spjd		*(uint64_t *)addr = value;
80168404Spjd		return;
81168404Spjd	}
82168404Spjd	ASSERT(!"bad int len");
83168404Spjd}
84168404Spjd
85168404Spjdstatic uint64_t
86168404Spjdldv(int len, const void *addr)
87168404Spjd{
88168404Spjd	switch (len) {
89168404Spjd	case 1:
90168404Spjd		return (*(uint8_t *)addr);
91168404Spjd	case 2:
92168404Spjd		return (*(uint16_t *)addr);
93168404Spjd	case 4:
94168404Spjd		return (*(uint32_t *)addr);
95168404Spjd	case 8:
96168404Spjd		return (*(uint64_t *)addr);
97168404Spjd	}
98168404Spjd	ASSERT(!"bad int len");
99168404Spjd	return (0xFEEDFACEDEADBEEFULL);
100168404Spjd}
101168404Spjd
102168404Spjdvoid
103168404Spjdzap_leaf_byteswap(zap_leaf_phys_t *buf, int size)
104168404Spjd{
105168404Spjd	int i;
106168404Spjd	zap_leaf_t l;
107168404Spjd	l.l_bs = highbit(size)-1;
108168404Spjd	l.l_phys = buf;
109168404Spjd
110168404Spjd	buf->l_hdr.lh_block_type = 	BSWAP_64(buf->l_hdr.lh_block_type);
111168404Spjd	buf->l_hdr.lh_prefix = 		BSWAP_64(buf->l_hdr.lh_prefix);
112168404Spjd	buf->l_hdr.lh_magic = 		BSWAP_32(buf->l_hdr.lh_magic);
113168404Spjd	buf->l_hdr.lh_nfree = 		BSWAP_16(buf->l_hdr.lh_nfree);
114168404Spjd	buf->l_hdr.lh_nentries = 	BSWAP_16(buf->l_hdr.lh_nentries);
115168404Spjd	buf->l_hdr.lh_prefix_len = 	BSWAP_16(buf->l_hdr.lh_prefix_len);
116168404Spjd	buf->l_hdr.lh_freelist = 	BSWAP_16(buf->l_hdr.lh_freelist);
117168404Spjd
118168404Spjd	for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++)
119168404Spjd		buf->l_hash[i] = BSWAP_16(buf->l_hash[i]);
120168404Spjd
121168404Spjd	for (i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) {
122168404Spjd		zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i);
123168404Spjd		struct zap_leaf_entry *le;
124168404Spjd
125168404Spjd		switch (lc->l_free.lf_type) {
126168404Spjd		case ZAP_CHUNK_ENTRY:
127168404Spjd			le = &lc->l_entry;
128168404Spjd
129168404Spjd			le->le_type =		BSWAP_8(le->le_type);
130219089Spjd			le->le_value_intlen =	BSWAP_8(le->le_value_intlen);
131168404Spjd			le->le_next =		BSWAP_16(le->le_next);
132168404Spjd			le->le_name_chunk =	BSWAP_16(le->le_name_chunk);
133219089Spjd			le->le_name_numints =	BSWAP_16(le->le_name_numints);
134168404Spjd			le->le_value_chunk =	BSWAP_16(le->le_value_chunk);
135219089Spjd			le->le_value_numints =	BSWAP_16(le->le_value_numints);
136168404Spjd			le->le_cd =		BSWAP_32(le->le_cd);
137168404Spjd			le->le_hash =		BSWAP_64(le->le_hash);
138168404Spjd			break;
139168404Spjd		case ZAP_CHUNK_FREE:
140168404Spjd			lc->l_free.lf_type =	BSWAP_8(lc->l_free.lf_type);
141168404Spjd			lc->l_free.lf_next =	BSWAP_16(lc->l_free.lf_next);
142168404Spjd			break;
143168404Spjd		case ZAP_CHUNK_ARRAY:
144168404Spjd			lc->l_array.la_type =	BSWAP_8(lc->l_array.la_type);
145168404Spjd			lc->l_array.la_next =	BSWAP_16(lc->l_array.la_next);
146168404Spjd			/* la_array doesn't need swapping */
147168404Spjd			break;
148168404Spjd		default:
149168404Spjd			ASSERT(!"bad leaf type");
150168404Spjd		}
151168404Spjd	}
152168404Spjd}
153168404Spjd
154168404Spjdvoid
155185029Spjdzap_leaf_init(zap_leaf_t *l, boolean_t sort)
156168404Spjd{
157168404Spjd	int i;
158168404Spjd
159168404Spjd	l->l_bs = highbit(l->l_dbuf->db_size)-1;
160168404Spjd	zap_memset(&l->l_phys->l_hdr, 0, sizeof (struct zap_leaf_header));
161168404Spjd	zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l));
162168404Spjd	for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
163168404Spjd		ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE;
164168404Spjd		ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1;
165168404Spjd	}
166168404Spjd	ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next = CHAIN_END;
167168404Spjd	l->l_phys->l_hdr.lh_block_type = ZBT_LEAF;
168168404Spjd	l->l_phys->l_hdr.lh_magic = ZAP_LEAF_MAGIC;
169168404Spjd	l->l_phys->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
170185029Spjd	if (sort)
171185029Spjd		l->l_phys->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
172168404Spjd}
173168404Spjd
174168404Spjd/*
175168404Spjd * Routines which manipulate leaf chunks (l_chunk[]).
176168404Spjd */
177168404Spjd
178168404Spjdstatic uint16_t
179168404Spjdzap_leaf_chunk_alloc(zap_leaf_t *l)
180168404Spjd{
181168404Spjd	int chunk;
182168404Spjd
183168404Spjd	ASSERT(l->l_phys->l_hdr.lh_nfree > 0);
184168404Spjd
185168404Spjd	chunk = l->l_phys->l_hdr.lh_freelist;
186168404Spjd	ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
187168404Spjd	ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE);
188168404Spjd
189168404Spjd	l->l_phys->l_hdr.lh_freelist = ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next;
190168404Spjd
191168404Spjd	l->l_phys->l_hdr.lh_nfree--;
192168404Spjd
193168404Spjd	return (chunk);
194168404Spjd}
195168404Spjd
196168404Spjdstatic void
197168404Spjdzap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk)
198168404Spjd{
199168404Spjd	struct zap_leaf_free *zlf = &ZAP_LEAF_CHUNK(l, chunk).l_free;
200168404Spjd	ASSERT3U(l->l_phys->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l));
201168404Spjd	ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
202168404Spjd	ASSERT(zlf->lf_type != ZAP_CHUNK_FREE);
203168404Spjd
204168404Spjd	zlf->lf_type = ZAP_CHUNK_FREE;
205168404Spjd	zlf->lf_next = l->l_phys->l_hdr.lh_freelist;
206168404Spjd	bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */
207168404Spjd	l->l_phys->l_hdr.lh_freelist = chunk;
208168404Spjd
209168404Spjd	l->l_phys->l_hdr.lh_nfree++;
210168404Spjd}
211168404Spjd
212168404Spjd/*
213168404Spjd * Routines which manipulate leaf arrays (zap_leaf_array type chunks).
214168404Spjd */
215168404Spjd
216168404Spjdstatic uint16_t
217168404Spjdzap_leaf_array_create(zap_leaf_t *l, const char *buf,
218219089Spjd    int integer_size, int num_integers)
219168404Spjd{
220168404Spjd	uint16_t chunk_head;
221168404Spjd	uint16_t *chunkp = &chunk_head;
222168404Spjd	int byten = 0;
223247187Smm	uint64_t value = 0;
224168404Spjd	int shift = (integer_size-1)*8;
225168404Spjd	int len = num_integers;
226168404Spjd
227168404Spjd	ASSERT3U(num_integers * integer_size, <, MAX_ARRAY_BYTES);
228168404Spjd
229168404Spjd	while (len > 0) {
230168404Spjd		uint16_t chunk = zap_leaf_chunk_alloc(l);
231168404Spjd		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
232168404Spjd		int i;
233168404Spjd
234168404Spjd		la->la_type = ZAP_CHUNK_ARRAY;
235168404Spjd		for (i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) {
236168404Spjd			if (byten == 0)
237168404Spjd				value = ldv(integer_size, buf);
238168404Spjd			la->la_array[i] = value >> shift;
239168404Spjd			value <<= 8;
240168404Spjd			if (++byten == integer_size) {
241168404Spjd				byten = 0;
242168404Spjd				buf += integer_size;
243168404Spjd				if (--len == 0)
244168404Spjd					break;
245168404Spjd			}
246168404Spjd		}
247168404Spjd
248168404Spjd		*chunkp = chunk;
249168404Spjd		chunkp = &la->la_next;
250168404Spjd	}
251168404Spjd	*chunkp = CHAIN_END;
252168404Spjd
253168404Spjd	return (chunk_head);
254168404Spjd}
255168404Spjd
256168404Spjdstatic void
257168404Spjdzap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp)
258168404Spjd{
259168404Spjd	uint16_t chunk = *chunkp;
260168404Spjd
261168404Spjd	*chunkp = CHAIN_END;
262168404Spjd
263168404Spjd	while (chunk != CHAIN_END) {
264168404Spjd		int nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next;
265168404Spjd		ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==,
266168404Spjd		    ZAP_CHUNK_ARRAY);
267168404Spjd		zap_leaf_chunk_free(l, chunk);
268168404Spjd		chunk = nextchunk;
269168404Spjd	}
270168404Spjd}
271168404Spjd
272168404Spjd/* array_len and buf_len are in integers, not bytes */
273168404Spjdstatic void
274168404Spjdzap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
275168404Spjd    int array_int_len, int array_len, int buf_int_len, uint64_t buf_len,
276219089Spjd    void *buf)
277168404Spjd{
278168404Spjd	int len = MIN(array_len, buf_len);
279168404Spjd	int byten = 0;
280168404Spjd	uint64_t value = 0;
281219089Spjd	char *p = buf;
282168404Spjd
283168404Spjd	ASSERT3U(array_int_len, <=, buf_int_len);
284168404Spjd
285168404Spjd	/* Fast path for one 8-byte integer */
286168404Spjd	if (array_int_len == 8 && buf_int_len == 8 && len == 1) {
287168404Spjd		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
288168404Spjd		uint8_t *ip = la->la_array;
289219089Spjd		uint64_t *buf64 = buf;
290168404Spjd
291168404Spjd		*buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 |
292168404Spjd		    (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 |
293168404Spjd		    (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 |
294168404Spjd		    (uint64_t)ip[6] << 8 | (uint64_t)ip[7];
295168404Spjd		return;
296168404Spjd	}
297168404Spjd
298168404Spjd	/* Fast path for an array of 1-byte integers (eg. the entry name) */
299168404Spjd	if (array_int_len == 1 && buf_int_len == 1 &&
300168404Spjd	    buf_len > array_len + ZAP_LEAF_ARRAY_BYTES) {
301168404Spjd		while (chunk != CHAIN_END) {
302168404Spjd			struct zap_leaf_array *la =
303168404Spjd			    &ZAP_LEAF_CHUNK(l, chunk).l_array;
304219089Spjd			bcopy(la->la_array, p, ZAP_LEAF_ARRAY_BYTES);
305219089Spjd			p += ZAP_LEAF_ARRAY_BYTES;
306168404Spjd			chunk = la->la_next;
307168404Spjd		}
308168404Spjd		return;
309168404Spjd	}
310168404Spjd
311168404Spjd	while (len > 0) {
312168404Spjd		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
313168404Spjd		int i;
314168404Spjd
315168404Spjd		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
316168404Spjd		for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
317168404Spjd			value = (value << 8) | la->la_array[i];
318168404Spjd			byten++;
319168404Spjd			if (byten == array_int_len) {
320219089Spjd				stv(buf_int_len, p, value);
321168404Spjd				byten = 0;
322168404Spjd				len--;
323168404Spjd				if (len == 0)
324168404Spjd					return;
325219089Spjd				p += buf_int_len;
326168404Spjd			}
327168404Spjd		}
328168404Spjd		chunk = la->la_next;
329168404Spjd	}
330168404Spjd}
331168404Spjd
332185029Spjdstatic boolean_t
333219089Spjdzap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn,
334219089Spjd    int chunk, int array_numints)
335168404Spjd{
336168404Spjd	int bseen = 0;
337168404Spjd
338219089Spjd	if (zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY) {
339219089Spjd		uint64_t *thiskey;
340219089Spjd		boolean_t match;
341219089Spjd
342219089Spjd		ASSERT(zn->zn_key_intlen == sizeof (*thiskey));
343219089Spjd		thiskey = kmem_alloc(array_numints * sizeof (*thiskey),
344219089Spjd		    KM_SLEEP);
345219089Spjd
346219089Spjd		zap_leaf_array_read(l, chunk, sizeof (*thiskey), array_numints,
347219089Spjd		    sizeof (*thiskey), array_numints, thiskey);
348219089Spjd		match = bcmp(thiskey, zn->zn_key_orig,
349219089Spjd		    array_numints * sizeof (*thiskey)) == 0;
350219089Spjd		kmem_free(thiskey, array_numints * sizeof (*thiskey));
351219089Spjd		return (match);
352219089Spjd	}
353219089Spjd
354219089Spjd	ASSERT(zn->zn_key_intlen == 1);
355185029Spjd	if (zn->zn_matchtype == MT_FIRST) {
356219089Spjd		char *thisname = kmem_alloc(array_numints, KM_SLEEP);
357185029Spjd		boolean_t match;
358185029Spjd
359219089Spjd		zap_leaf_array_read(l, chunk, sizeof (char), array_numints,
360219089Spjd		    sizeof (char), array_numints, thisname);
361185029Spjd		match = zap_match(zn, thisname);
362219089Spjd		kmem_free(thisname, array_numints);
363185029Spjd		return (match);
364185029Spjd	}
365185029Spjd
366219089Spjd	/*
367219089Spjd	 * Fast path for exact matching.
368219089Spjd	 * First check that the lengths match, so that we don't read
369219089Spjd	 * past the end of the zn_key_orig array.
370219089Spjd	 */
371219089Spjd	if (array_numints != zn->zn_key_orig_numints)
372219089Spjd		return (B_FALSE);
373219089Spjd	while (bseen < array_numints) {
374168404Spjd		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
375219089Spjd		int toread = MIN(array_numints - bseen, ZAP_LEAF_ARRAY_BYTES);
376168404Spjd		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
377219089Spjd		if (bcmp(la->la_array, (char *)zn->zn_key_orig + bseen, toread))
378168404Spjd			break;
379168404Spjd		chunk = la->la_next;
380168404Spjd		bseen += toread;
381168404Spjd	}
382219089Spjd	return (bseen == array_numints);
383168404Spjd}
384168404Spjd
385168404Spjd/*
386168404Spjd * Routines which manipulate leaf entries.
387168404Spjd */
388168404Spjd
389168404Spjdint
390185029Spjdzap_leaf_lookup(zap_leaf_t *l, zap_name_t *zn, zap_entry_handle_t *zeh)
391168404Spjd{
392168404Spjd	uint16_t *chunkp;
393168404Spjd	struct zap_leaf_entry *le;
394168404Spjd
395168404Spjd	ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
396168404Spjd
397185029Spjdagain:
398185029Spjd	for (chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash);
399168404Spjd	    *chunkp != CHAIN_END; chunkp = &le->le_next) {
400168404Spjd		uint16_t chunk = *chunkp;
401168404Spjd		le = ZAP_LEAF_ENTRY(l, chunk);
402168404Spjd
403168404Spjd		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
404168404Spjd		ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
405168404Spjd
406185029Spjd		if (le->le_hash != zn->zn_hash)
407168404Spjd			continue;
408168404Spjd
409185029Spjd		/*
410185029Spjd		 * NB: the entry chain is always sorted by cd on
411185029Spjd		 * normalized zap objects, so this will find the
412185029Spjd		 * lowest-cd match for MT_FIRST.
413185029Spjd		 */
414185029Spjd		ASSERT(zn->zn_matchtype == MT_EXACT ||
415185029Spjd		    (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED));
416185029Spjd		if (zap_leaf_array_match(l, zn, le->le_name_chunk,
417219089Spjd		    le->le_name_numints)) {
418219089Spjd			zeh->zeh_num_integers = le->le_value_numints;
419219089Spjd			zeh->zeh_integer_size = le->le_value_intlen;
420168404Spjd			zeh->zeh_cd = le->le_cd;
421168404Spjd			zeh->zeh_hash = le->le_hash;
422168404Spjd			zeh->zeh_chunkp = chunkp;
423168404Spjd			zeh->zeh_leaf = l;
424168404Spjd			return (0);
425168404Spjd		}
426168404Spjd	}
427168404Spjd
428185029Spjd	/*
429185029Spjd	 * NB: we could of course do this in one pass, but that would be
430185029Spjd	 * a pain.  We'll see if MT_BEST is even used much.
431185029Spjd	 */
432185029Spjd	if (zn->zn_matchtype == MT_BEST) {
433185029Spjd		zn->zn_matchtype = MT_FIRST;
434185029Spjd		goto again;
435185029Spjd	}
436185029Spjd
437168404Spjd	return (ENOENT);
438168404Spjd}
439168404Spjd
440168404Spjd/* Return (h1,cd1 >= h2,cd2) */
441168404Spjd#define	HCD_GTEQ(h1, cd1, h2, cd2) \
442168404Spjd	((h1 > h2) ? TRUE : ((h1 == h2 && cd1 >= cd2) ? TRUE : FALSE))
443168404Spjd
444168404Spjdint
445168404Spjdzap_leaf_lookup_closest(zap_leaf_t *l,
446168404Spjd    uint64_t h, uint32_t cd, zap_entry_handle_t *zeh)
447168404Spjd{
448168404Spjd	uint16_t chunk;
449168404Spjd	uint64_t besth = -1ULL;
450219089Spjd	uint32_t bestcd = -1U;
451168404Spjd	uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1;
452168404Spjd	uint16_t lh;
453168404Spjd	struct zap_leaf_entry *le;
454168404Spjd
455168404Spjd	ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
456168404Spjd
457168404Spjd	for (lh = LEAF_HASH(l, h); lh <= bestlh; lh++) {
458168404Spjd		for (chunk = l->l_phys->l_hash[lh];
459168404Spjd		    chunk != CHAIN_END; chunk = le->le_next) {
460168404Spjd			le = ZAP_LEAF_ENTRY(l, chunk);
461168404Spjd
462168404Spjd			ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
463168404Spjd			ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
464168404Spjd
465168404Spjd			if (HCD_GTEQ(le->le_hash, le->le_cd, h, cd) &&
466168404Spjd			    HCD_GTEQ(besth, bestcd, le->le_hash, le->le_cd)) {
467168404Spjd				ASSERT3U(bestlh, >=, lh);
468168404Spjd				bestlh = lh;
469168404Spjd				besth = le->le_hash;
470168404Spjd				bestcd = le->le_cd;
471168404Spjd
472219089Spjd				zeh->zeh_num_integers = le->le_value_numints;
473219089Spjd				zeh->zeh_integer_size = le->le_value_intlen;
474168404Spjd				zeh->zeh_cd = le->le_cd;
475168404Spjd				zeh->zeh_hash = le->le_hash;
476168404Spjd				zeh->zeh_fakechunk = chunk;
477168404Spjd				zeh->zeh_chunkp = &zeh->zeh_fakechunk;
478168404Spjd				zeh->zeh_leaf = l;
479168404Spjd			}
480168404Spjd		}
481168404Spjd	}
482168404Spjd
483219089Spjd	return (bestcd == -1U ? ENOENT : 0);
484168404Spjd}
485168404Spjd
486168404Spjdint
487168404Spjdzap_entry_read(const zap_entry_handle_t *zeh,
488168404Spjd    uint8_t integer_size, uint64_t num_integers, void *buf)
489168404Spjd{
490168404Spjd	struct zap_leaf_entry *le =
491168404Spjd	    ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
492168404Spjd	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
493168404Spjd
494219089Spjd	if (le->le_value_intlen > integer_size)
495168404Spjd		return (EINVAL);
496168404Spjd
497219089Spjd	zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk,
498219089Spjd	    le->le_value_intlen, le->le_value_numints,
499219089Spjd	    integer_size, num_integers, buf);
500168404Spjd
501168404Spjd	if (zeh->zeh_num_integers > num_integers)
502168404Spjd		return (EOVERFLOW);
503168404Spjd	return (0);
504168404Spjd
505168404Spjd}
506168404Spjd
507168404Spjdint
508219089Spjdzap_entry_read_name(zap_t *zap, const zap_entry_handle_t *zeh, uint16_t buflen,
509219089Spjd    char *buf)
510168404Spjd{
511168404Spjd	struct zap_leaf_entry *le =
512168404Spjd	    ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
513168404Spjd	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
514168404Spjd
515219089Spjd	if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
516219089Spjd		zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 8,
517219089Spjd		    le->le_name_numints, 8, buflen / 8, buf);
518219089Spjd	} else {
519219089Spjd		zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1,
520219089Spjd		    le->le_name_numints, 1, buflen, buf);
521219089Spjd	}
522219089Spjd	if (le->le_name_numints > buflen)
523168404Spjd		return (EOVERFLOW);
524168404Spjd	return (0);
525168404Spjd}
526168404Spjd
527168404Spjdint
528168404Spjdzap_entry_update(zap_entry_handle_t *zeh,
529168404Spjd	uint8_t integer_size, uint64_t num_integers, const void *buf)
530168404Spjd{
531168404Spjd	int delta_chunks;
532168404Spjd	zap_leaf_t *l = zeh->zeh_leaf;
533168404Spjd	struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp);
534168404Spjd
535168404Spjd	delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) -
536219089Spjd	    ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen);
537168404Spjd
538168404Spjd	if ((int)l->l_phys->l_hdr.lh_nfree < delta_chunks)
539168404Spjd		return (EAGAIN);
540168404Spjd
541168404Spjd	zap_leaf_array_free(l, &le->le_value_chunk);
542168404Spjd	le->le_value_chunk =
543168404Spjd	    zap_leaf_array_create(l, buf, integer_size, num_integers);
544219089Spjd	le->le_value_numints = num_integers;
545219089Spjd	le->le_value_intlen = integer_size;
546168404Spjd	return (0);
547168404Spjd}
548168404Spjd
549168404Spjdvoid
550168404Spjdzap_entry_remove(zap_entry_handle_t *zeh)
551168404Spjd{
552168404Spjd	uint16_t entry_chunk;
553168404Spjd	struct zap_leaf_entry *le;
554168404Spjd	zap_leaf_t *l = zeh->zeh_leaf;
555168404Spjd
556168404Spjd	ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk);
557168404Spjd
558168404Spjd	entry_chunk = *zeh->zeh_chunkp;
559168404Spjd	le = ZAP_LEAF_ENTRY(l, entry_chunk);
560168404Spjd	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
561168404Spjd
562168404Spjd	zap_leaf_array_free(l, &le->le_name_chunk);
563168404Spjd	zap_leaf_array_free(l, &le->le_value_chunk);
564168404Spjd
565168404Spjd	*zeh->zeh_chunkp = le->le_next;
566168404Spjd	zap_leaf_chunk_free(l, entry_chunk);
567168404Spjd
568168404Spjd	l->l_phys->l_hdr.lh_nentries--;
569168404Spjd}
570168404Spjd
571168404Spjdint
572219089Spjdzap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd,
573168404Spjd    uint8_t integer_size, uint64_t num_integers, const void *buf,
574168404Spjd    zap_entry_handle_t *zeh)
575168404Spjd{
576168404Spjd	uint16_t chunk;
577168404Spjd	uint16_t *chunkp;
578168404Spjd	struct zap_leaf_entry *le;
579219089Spjd	uint64_t valuelen;
580168404Spjd	int numchunks;
581219089Spjd	uint64_t h = zn->zn_hash;
582168404Spjd
583168404Spjd	valuelen = integer_size * num_integers;
584168404Spjd
585219089Spjd	numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints *
586219089Spjd	    zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen);
587168404Spjd	if (numchunks > ZAP_LEAF_NUMCHUNKS(l))
588168404Spjd		return (E2BIG);
589168404Spjd
590219089Spjd	if (cd == ZAP_NEED_CD) {
591185029Spjd		/* find the lowest unused cd */
592185029Spjd		if (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) {
593185029Spjd			cd = 0;
594185029Spjd
595168404Spjd			for (chunk = *LEAF_HASH_ENTPTR(l, h);
596168404Spjd			    chunk != CHAIN_END; chunk = le->le_next) {
597168404Spjd				le = ZAP_LEAF_ENTRY(l, chunk);
598185029Spjd				if (le->le_cd > cd)
599168404Spjd					break;
600185029Spjd				if (le->le_hash == h) {
601185029Spjd					ASSERT3U(cd, ==, le->le_cd);
602185029Spjd					cd++;
603168404Spjd				}
604168404Spjd			}
605185029Spjd		} else {
606185029Spjd			/* old unsorted format; do it the O(n^2) way */
607219089Spjd			for (cd = 0; ; cd++) {
608185029Spjd				for (chunk = *LEAF_HASH_ENTPTR(l, h);
609185029Spjd				    chunk != CHAIN_END; chunk = le->le_next) {
610185029Spjd					le = ZAP_LEAF_ENTRY(l, chunk);
611185029Spjd					if (le->le_hash == h &&
612185029Spjd					    le->le_cd == cd) {
613185029Spjd						break;
614185029Spjd					}
615185029Spjd				}
616185029Spjd				/* If this cd is not in use, we are good. */
617185029Spjd				if (chunk == CHAIN_END)
618185029Spjd					break;
619185029Spjd			}
620168404Spjd		}
621185029Spjd		/*
622219089Spjd		 * We would run out of space in a block before we could
623219089Spjd		 * store enough entries to run out of CD values.
624185029Spjd		 */
625219089Spjd		ASSERT3U(cd, <, zap_maxcd(zn->zn_zap));
626168404Spjd	}
627168404Spjd
628168404Spjd	if (l->l_phys->l_hdr.lh_nfree < numchunks)
629168404Spjd		return (EAGAIN);
630168404Spjd
631168404Spjd	/* make the entry */
632168404Spjd	chunk = zap_leaf_chunk_alloc(l);
633168404Spjd	le = ZAP_LEAF_ENTRY(l, chunk);
634168404Spjd	le->le_type = ZAP_CHUNK_ENTRY;
635219089Spjd	le->le_name_chunk = zap_leaf_array_create(l, zn->zn_key_orig,
636219089Spjd	    zn->zn_key_intlen, zn->zn_key_orig_numints);
637219089Spjd	le->le_name_numints = zn->zn_key_orig_numints;
638168404Spjd	le->le_value_chunk =
639168404Spjd	    zap_leaf_array_create(l, buf, integer_size, num_integers);
640219089Spjd	le->le_value_numints = num_integers;
641219089Spjd	le->le_value_intlen = integer_size;
642168404Spjd	le->le_hash = h;
643168404Spjd	le->le_cd = cd;
644168404Spjd
645168404Spjd	/* link it into the hash chain */
646185029Spjd	/* XXX if we did the search above, we could just use that */
647185029Spjd	chunkp = zap_leaf_rehash_entry(l, chunk);
648168404Spjd
649168404Spjd	l->l_phys->l_hdr.lh_nentries++;
650168404Spjd
651168404Spjd	zeh->zeh_leaf = l;
652168404Spjd	zeh->zeh_num_integers = num_integers;
653219089Spjd	zeh->zeh_integer_size = le->le_value_intlen;
654168404Spjd	zeh->zeh_cd = le->le_cd;
655168404Spjd	zeh->zeh_hash = le->le_hash;
656168404Spjd	zeh->zeh_chunkp = chunkp;
657168404Spjd
658168404Spjd	return (0);
659168404Spjd}
660168404Spjd
661168404Spjd/*
662185029Spjd * Determine if there is another entry with the same normalized form.
663185029Spjd * For performance purposes, either zn or name must be provided (the
664185029Spjd * other can be NULL).  Note, there usually won't be any hash
665185029Spjd * conflicts, in which case we don't need the concatenated/normalized
666185029Spjd * form of the name.  But all callers have one of these on hand anyway,
667185029Spjd * so might as well take advantage.  A cleaner but slower interface
668185029Spjd * would accept neither argument, and compute the normalized name as
669185029Spjd * needed (using zap_name_alloc(zap_entry_read_name(zeh))).
670185029Spjd */
671185029Spjdboolean_t
672185029Spjdzap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn,
673185029Spjd    const char *name, zap_t *zap)
674185029Spjd{
675185029Spjd	uint64_t chunk;
676185029Spjd	struct zap_leaf_entry *le;
677185029Spjd	boolean_t allocdzn = B_FALSE;
678185029Spjd
679185029Spjd	if (zap->zap_normflags == 0)
680185029Spjd		return (B_FALSE);
681185029Spjd
682185029Spjd	for (chunk = *LEAF_HASH_ENTPTR(zeh->zeh_leaf, zeh->zeh_hash);
683185029Spjd	    chunk != CHAIN_END; chunk = le->le_next) {
684185029Spjd		le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, chunk);
685185029Spjd		if (le->le_hash != zeh->zeh_hash)
686185029Spjd			continue;
687185029Spjd		if (le->le_cd == zeh->zeh_cd)
688185029Spjd			continue;
689185029Spjd
690185029Spjd		if (zn == NULL) {
691185029Spjd			zn = zap_name_alloc(zap, name, MT_FIRST);
692185029Spjd			allocdzn = B_TRUE;
693185029Spjd		}
694185029Spjd		if (zap_leaf_array_match(zeh->zeh_leaf, zn,
695219089Spjd		    le->le_name_chunk, le->le_name_numints)) {
696185029Spjd			if (allocdzn)
697185029Spjd				zap_name_free(zn);
698185029Spjd			return (B_TRUE);
699185029Spjd		}
700185029Spjd	}
701185029Spjd	if (allocdzn)
702185029Spjd		zap_name_free(zn);
703185029Spjd	return (B_FALSE);
704185029Spjd}
705185029Spjd
706185029Spjd/*
707168404Spjd * Routines for transferring entries between leafs.
708168404Spjd */
709168404Spjd
710185029Spjdstatic uint16_t *
711168404Spjdzap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry)
712168404Spjd{
713168404Spjd	struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry);
714185029Spjd	struct zap_leaf_entry *le2;
715185029Spjd	uint16_t *chunkp;
716185029Spjd
717185029Spjd	/*
718185029Spjd	 * keep the entry chain sorted by cd
719185029Spjd	 * NB: this will not cause problems for unsorted leafs, though
720185029Spjd	 * it is unnecessary there.
721185029Spjd	 */
722185029Spjd	for (chunkp = LEAF_HASH_ENTPTR(l, le->le_hash);
723185029Spjd	    *chunkp != CHAIN_END; chunkp = &le2->le_next) {
724185029Spjd		le2 = ZAP_LEAF_ENTRY(l, *chunkp);
725185029Spjd		if (le2->le_cd > le->le_cd)
726185029Spjd			break;
727185029Spjd	}
728185029Spjd
729185029Spjd	le->le_next = *chunkp;
730185029Spjd	*chunkp = entry;
731185029Spjd	return (chunkp);
732168404Spjd}
733168404Spjd
734168404Spjdstatic uint16_t
735168404Spjdzap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl)
736168404Spjd{
737168404Spjd	uint16_t new_chunk;
738168404Spjd	uint16_t *nchunkp = &new_chunk;
739168404Spjd
740168404Spjd	while (chunk != CHAIN_END) {
741168404Spjd		uint16_t nchunk = zap_leaf_chunk_alloc(nl);
742168404Spjd		struct zap_leaf_array *nla =
743168404Spjd		    &ZAP_LEAF_CHUNK(nl, nchunk).l_array;
744168404Spjd		struct zap_leaf_array *la =
745168404Spjd		    &ZAP_LEAF_CHUNK(l, chunk).l_array;
746168404Spjd		int nextchunk = la->la_next;
747168404Spjd
748168404Spjd		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
749168404Spjd		ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l));
750168404Spjd
751168404Spjd		*nla = *la; /* structure assignment */
752168404Spjd
753168404Spjd		zap_leaf_chunk_free(l, chunk);
754168404Spjd		chunk = nextchunk;
755168404Spjd		*nchunkp = nchunk;
756168404Spjd		nchunkp = &nla->la_next;
757168404Spjd	}
758168404Spjd	*nchunkp = CHAIN_END;
759168404Spjd	return (new_chunk);
760168404Spjd}
761168404Spjd
762168404Spjdstatic void
763168404Spjdzap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl)
764168404Spjd{
765168404Spjd	struct zap_leaf_entry *le, *nle;
766168404Spjd	uint16_t chunk;
767168404Spjd
768168404Spjd	le = ZAP_LEAF_ENTRY(l, entry);
769168404Spjd	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
770168404Spjd
771168404Spjd	chunk = zap_leaf_chunk_alloc(nl);
772168404Spjd	nle = ZAP_LEAF_ENTRY(nl, chunk);
773168404Spjd	*nle = *le; /* structure assignment */
774168404Spjd
775185029Spjd	(void) zap_leaf_rehash_entry(nl, chunk);
776168404Spjd
777168404Spjd	nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl);
778168404Spjd	nle->le_value_chunk =
779168404Spjd	    zap_leaf_transfer_array(l, le->le_value_chunk, nl);
780168404Spjd
781168404Spjd	zap_leaf_chunk_free(l, entry);
782168404Spjd
783168404Spjd	l->l_phys->l_hdr.lh_nentries--;
784168404Spjd	nl->l_phys->l_hdr.lh_nentries++;
785168404Spjd}
786168404Spjd
787168404Spjd/*
788168404Spjd * Transfer the entries whose hash prefix ends in 1 to the new leaf.
789168404Spjd */
790168404Spjdvoid
791185029Spjdzap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort)
792168404Spjd{
793168404Spjd	int i;
794168404Spjd	int bit = 64 - 1 - l->l_phys->l_hdr.lh_prefix_len;
795168404Spjd
796168404Spjd	/* set new prefix and prefix_len */
797168404Spjd	l->l_phys->l_hdr.lh_prefix <<= 1;
798168404Spjd	l->l_phys->l_hdr.lh_prefix_len++;
799168404Spjd	nl->l_phys->l_hdr.lh_prefix = l->l_phys->l_hdr.lh_prefix | 1;
800168404Spjd	nl->l_phys->l_hdr.lh_prefix_len = l->l_phys->l_hdr.lh_prefix_len;
801168404Spjd
802168404Spjd	/* break existing hash chains */
803168404Spjd	zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l));
804168404Spjd
805185029Spjd	if (sort)
806185029Spjd		l->l_phys->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
807185029Spjd
808168404Spjd	/*
809168404Spjd	 * Transfer entries whose hash bit 'bit' is set to nl; rehash
810168404Spjd	 * the remaining entries
811168404Spjd	 *
812168404Spjd	 * NB: We could find entries via the hashtable instead. That
813168404Spjd	 * would be O(hashents+numents) rather than O(numblks+numents),
814168404Spjd	 * but this accesses memory more sequentially, and when we're
815168404Spjd	 * called, the block is usually pretty full.
816168404Spjd	 */
817168404Spjd	for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
818168404Spjd		struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i);
819168404Spjd		if (le->le_type != ZAP_CHUNK_ENTRY)
820168404Spjd			continue;
821168404Spjd
822168404Spjd		if (le->le_hash & (1ULL << bit))
823168404Spjd			zap_leaf_transfer_entry(l, i, nl);
824168404Spjd		else
825185029Spjd			(void) zap_leaf_rehash_entry(l, i);
826168404Spjd	}
827168404Spjd}
828168404Spjd
829168404Spjdvoid
830168404Spjdzap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
831168404Spjd{
832168404Spjd	int i, n;
833168404Spjd
834168404Spjd	n = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift -
835168404Spjd	    l->l_phys->l_hdr.lh_prefix_len;
836168404Spjd	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
837168404Spjd	zs->zs_leafs_with_2n_pointers[n]++;
838168404Spjd
839168404Spjd
840168404Spjd	n = l->l_phys->l_hdr.lh_nentries/5;
841168404Spjd	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
842168404Spjd	zs->zs_blocks_with_n5_entries[n]++;
843168404Spjd
844168404Spjd	n = ((1<<FZAP_BLOCK_SHIFT(zap)) -
845168404Spjd	    l->l_phys->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 /
846168404Spjd	    (1<<FZAP_BLOCK_SHIFT(zap));
847168404Spjd	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
848168404Spjd	zs->zs_blocks_n_tenths_full[n]++;
849168404Spjd
850168404Spjd	for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) {
851168404Spjd		int nentries = 0;
852168404Spjd		int chunk = l->l_phys->l_hash[i];
853168404Spjd
854168404Spjd		while (chunk != CHAIN_END) {
855168404Spjd			struct zap_leaf_entry *le =
856168404Spjd			    ZAP_LEAF_ENTRY(l, chunk);
857168404Spjd
858219089Spjd			n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_numints) +
859219089Spjd			    ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints *
860219089Spjd			    le->le_value_intlen);
861168404Spjd			n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
862168404Spjd			zs->zs_entries_using_n_chunks[n]++;
863168404Spjd
864168404Spjd			chunk = le->le_next;
865168404Spjd			nentries++;
866168404Spjd		}
867168404Spjd
868168404Spjd		n = nentries;
869168404Spjd		n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
870168404Spjd		zs->zs_buckets_with_n_entries[n]++;
871168404Spjd	}
872168404Spjd}
873