zap_leaf.c revision 249195
1168404Spjd/*
2168404Spjd * CDDL HEADER START
3168404Spjd *
4168404Spjd * The contents of this file are subject to the terms of the
5168404Spjd * Common Development and Distribution License (the "License").
6168404Spjd * You may not use this file except in compliance with the License.
7168404Spjd *
8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9168404Spjd * or http://www.opensolaris.org/os/licensing.
10168404Spjd * See the License for the specific language governing permissions
11168404Spjd * and limitations under the License.
12168404Spjd *
13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each
14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15168404Spjd * If applicable, add the following below this CDDL HEADER, with the
16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying
17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
18168404Spjd *
19168404Spjd * CDDL HEADER END
20168404Spjd */
21168404Spjd/*
22219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23249195Smm * Copyright (c) 2013 by Delphix. All rights reserved.
24168404Spjd */
25168404Spjd
26168404Spjd/*
27168404Spjd * The 512-byte leaf is broken into 32 16-byte chunks.
28168404Spjd * chunk number n means l_chunk[n], even though the header precedes it.
29168404Spjd * the names are stored null-terminated.
30168404Spjd */
31168404Spjd
32219089Spjd#include <sys/zio.h>
33219089Spjd#include <sys/spa.h>
34219089Spjd#include <sys/dmu.h>
35168404Spjd#include <sys/zfs_context.h>
36219089Spjd#include <sys/fs/zfs.h>
37168404Spjd#include <sys/zap.h>
38168404Spjd#include <sys/zap_impl.h>
39168404Spjd#include <sys/zap_leaf.h>
40219089Spjd#include <sys/arc.h>
41168404Spjd
42185029Spjdstatic uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry);
43185029Spjd
44168404Spjd#define	CHAIN_END 0xffff /* end of the chunk chain */
45168404Spjd
46168404Spjd/* half the (current) minimum block size */
47168404Spjd#define	MAX_ARRAY_BYTES (8<<10)
48168404Spjd
49168404Spjd#define	LEAF_HASH(l, h) \
50168404Spjd	((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \
51168404Spjd	((h) >> (64 - ZAP_LEAF_HASH_SHIFT(l)-(l)->l_phys->l_hdr.lh_prefix_len)))
52168404Spjd
53168404Spjd#define	LEAF_HASH_ENTPTR(l, h) (&(l)->l_phys->l_hash[LEAF_HASH(l, h)])
54168404Spjd
55168404Spjd
56168404Spjdstatic void
57168404Spjdzap_memset(void *a, int c, size_t n)
58168404Spjd{
59168404Spjd	char *cp = a;
60168404Spjd	char *cpend = cp + n;
61168404Spjd
62168404Spjd	while (cp < cpend)
63168404Spjd		*cp++ = c;
64168404Spjd}
65168404Spjd
66168404Spjdstatic void
67168404Spjdstv(int len, void *addr, uint64_t value)
68168404Spjd{
69168404Spjd	switch (len) {
70168404Spjd	case 1:
71168404Spjd		*(uint8_t *)addr = value;
72168404Spjd		return;
73168404Spjd	case 2:
74168404Spjd		*(uint16_t *)addr = value;
75168404Spjd		return;
76168404Spjd	case 4:
77168404Spjd		*(uint32_t *)addr = value;
78168404Spjd		return;
79168404Spjd	case 8:
80168404Spjd		*(uint64_t *)addr = value;
81168404Spjd		return;
82168404Spjd	}
83168404Spjd	ASSERT(!"bad int len");
84168404Spjd}
85168404Spjd
86168404Spjdstatic uint64_t
87168404Spjdldv(int len, const void *addr)
88168404Spjd{
89168404Spjd	switch (len) {
90168404Spjd	case 1:
91168404Spjd		return (*(uint8_t *)addr);
92168404Spjd	case 2:
93168404Spjd		return (*(uint16_t *)addr);
94168404Spjd	case 4:
95168404Spjd		return (*(uint32_t *)addr);
96168404Spjd	case 8:
97168404Spjd		return (*(uint64_t *)addr);
98168404Spjd	}
99168404Spjd	ASSERT(!"bad int len");
100168404Spjd	return (0xFEEDFACEDEADBEEFULL);
101168404Spjd}
102168404Spjd
103168404Spjdvoid
104168404Spjdzap_leaf_byteswap(zap_leaf_phys_t *buf, int size)
105168404Spjd{
106168404Spjd	int i;
107168404Spjd	zap_leaf_t l;
108168404Spjd	l.l_bs = highbit(size)-1;
109168404Spjd	l.l_phys = buf;
110168404Spjd
111168404Spjd	buf->l_hdr.lh_block_type = 	BSWAP_64(buf->l_hdr.lh_block_type);
112168404Spjd	buf->l_hdr.lh_prefix = 		BSWAP_64(buf->l_hdr.lh_prefix);
113168404Spjd	buf->l_hdr.lh_magic = 		BSWAP_32(buf->l_hdr.lh_magic);
114168404Spjd	buf->l_hdr.lh_nfree = 		BSWAP_16(buf->l_hdr.lh_nfree);
115168404Spjd	buf->l_hdr.lh_nentries = 	BSWAP_16(buf->l_hdr.lh_nentries);
116168404Spjd	buf->l_hdr.lh_prefix_len = 	BSWAP_16(buf->l_hdr.lh_prefix_len);
117168404Spjd	buf->l_hdr.lh_freelist = 	BSWAP_16(buf->l_hdr.lh_freelist);
118168404Spjd
119168404Spjd	for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++)
120168404Spjd		buf->l_hash[i] = BSWAP_16(buf->l_hash[i]);
121168404Spjd
122168404Spjd	for (i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) {
123168404Spjd		zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i);
124168404Spjd		struct zap_leaf_entry *le;
125168404Spjd
126168404Spjd		switch (lc->l_free.lf_type) {
127168404Spjd		case ZAP_CHUNK_ENTRY:
128168404Spjd			le = &lc->l_entry;
129168404Spjd
130168404Spjd			le->le_type =		BSWAP_8(le->le_type);
131219089Spjd			le->le_value_intlen =	BSWAP_8(le->le_value_intlen);
132168404Spjd			le->le_next =		BSWAP_16(le->le_next);
133168404Spjd			le->le_name_chunk =	BSWAP_16(le->le_name_chunk);
134219089Spjd			le->le_name_numints =	BSWAP_16(le->le_name_numints);
135168404Spjd			le->le_value_chunk =	BSWAP_16(le->le_value_chunk);
136219089Spjd			le->le_value_numints =	BSWAP_16(le->le_value_numints);
137168404Spjd			le->le_cd =		BSWAP_32(le->le_cd);
138168404Spjd			le->le_hash =		BSWAP_64(le->le_hash);
139168404Spjd			break;
140168404Spjd		case ZAP_CHUNK_FREE:
141168404Spjd			lc->l_free.lf_type =	BSWAP_8(lc->l_free.lf_type);
142168404Spjd			lc->l_free.lf_next =	BSWAP_16(lc->l_free.lf_next);
143168404Spjd			break;
144168404Spjd		case ZAP_CHUNK_ARRAY:
145168404Spjd			lc->l_array.la_type =	BSWAP_8(lc->l_array.la_type);
146168404Spjd			lc->l_array.la_next =	BSWAP_16(lc->l_array.la_next);
147168404Spjd			/* la_array doesn't need swapping */
148168404Spjd			break;
149168404Spjd		default:
150168404Spjd			ASSERT(!"bad leaf type");
151168404Spjd		}
152168404Spjd	}
153168404Spjd}
154168404Spjd
155168404Spjdvoid
156185029Spjdzap_leaf_init(zap_leaf_t *l, boolean_t sort)
157168404Spjd{
158168404Spjd	int i;
159168404Spjd
160168404Spjd	l->l_bs = highbit(l->l_dbuf->db_size)-1;
161168404Spjd	zap_memset(&l->l_phys->l_hdr, 0, sizeof (struct zap_leaf_header));
162168404Spjd	zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l));
163168404Spjd	for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
164168404Spjd		ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE;
165168404Spjd		ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1;
166168404Spjd	}
167168404Spjd	ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next = CHAIN_END;
168168404Spjd	l->l_phys->l_hdr.lh_block_type = ZBT_LEAF;
169168404Spjd	l->l_phys->l_hdr.lh_magic = ZAP_LEAF_MAGIC;
170168404Spjd	l->l_phys->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
171185029Spjd	if (sort)
172185029Spjd		l->l_phys->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
173168404Spjd}
174168404Spjd
175168404Spjd/*
176168404Spjd * Routines which manipulate leaf chunks (l_chunk[]).
177168404Spjd */
178168404Spjd
179168404Spjdstatic uint16_t
180168404Spjdzap_leaf_chunk_alloc(zap_leaf_t *l)
181168404Spjd{
182168404Spjd	int chunk;
183168404Spjd
184168404Spjd	ASSERT(l->l_phys->l_hdr.lh_nfree > 0);
185168404Spjd
186168404Spjd	chunk = l->l_phys->l_hdr.lh_freelist;
187168404Spjd	ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
188168404Spjd	ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE);
189168404Spjd
190168404Spjd	l->l_phys->l_hdr.lh_freelist = ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next;
191168404Spjd
192168404Spjd	l->l_phys->l_hdr.lh_nfree--;
193168404Spjd
194168404Spjd	return (chunk);
195168404Spjd}
196168404Spjd
197168404Spjdstatic void
198168404Spjdzap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk)
199168404Spjd{
200168404Spjd	struct zap_leaf_free *zlf = &ZAP_LEAF_CHUNK(l, chunk).l_free;
201168404Spjd	ASSERT3U(l->l_phys->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l));
202168404Spjd	ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
203168404Spjd	ASSERT(zlf->lf_type != ZAP_CHUNK_FREE);
204168404Spjd
205168404Spjd	zlf->lf_type = ZAP_CHUNK_FREE;
206168404Spjd	zlf->lf_next = l->l_phys->l_hdr.lh_freelist;
207168404Spjd	bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */
208168404Spjd	l->l_phys->l_hdr.lh_freelist = chunk;
209168404Spjd
210168404Spjd	l->l_phys->l_hdr.lh_nfree++;
211168404Spjd}
212168404Spjd
213168404Spjd/*
214168404Spjd * Routines which manipulate leaf arrays (zap_leaf_array type chunks).
215168404Spjd */
216168404Spjd
217168404Spjdstatic uint16_t
218168404Spjdzap_leaf_array_create(zap_leaf_t *l, const char *buf,
219219089Spjd    int integer_size, int num_integers)
220168404Spjd{
221168404Spjd	uint16_t chunk_head;
222168404Spjd	uint16_t *chunkp = &chunk_head;
223168404Spjd	int byten = 0;
224247187Smm	uint64_t value = 0;
225168404Spjd	int shift = (integer_size-1)*8;
226168404Spjd	int len = num_integers;
227168404Spjd
228168404Spjd	ASSERT3U(num_integers * integer_size, <, MAX_ARRAY_BYTES);
229168404Spjd
230168404Spjd	while (len > 0) {
231168404Spjd		uint16_t chunk = zap_leaf_chunk_alloc(l);
232168404Spjd		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
233168404Spjd		int i;
234168404Spjd
235168404Spjd		la->la_type = ZAP_CHUNK_ARRAY;
236168404Spjd		for (i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) {
237168404Spjd			if (byten == 0)
238168404Spjd				value = ldv(integer_size, buf);
239168404Spjd			la->la_array[i] = value >> shift;
240168404Spjd			value <<= 8;
241168404Spjd			if (++byten == integer_size) {
242168404Spjd				byten = 0;
243168404Spjd				buf += integer_size;
244168404Spjd				if (--len == 0)
245168404Spjd					break;
246168404Spjd			}
247168404Spjd		}
248168404Spjd
249168404Spjd		*chunkp = chunk;
250168404Spjd		chunkp = &la->la_next;
251168404Spjd	}
252168404Spjd	*chunkp = CHAIN_END;
253168404Spjd
254168404Spjd	return (chunk_head);
255168404Spjd}
256168404Spjd
257168404Spjdstatic void
258168404Spjdzap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp)
259168404Spjd{
260168404Spjd	uint16_t chunk = *chunkp;
261168404Spjd
262168404Spjd	*chunkp = CHAIN_END;
263168404Spjd
264168404Spjd	while (chunk != CHAIN_END) {
265168404Spjd		int nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next;
266168404Spjd		ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==,
267168404Spjd		    ZAP_CHUNK_ARRAY);
268168404Spjd		zap_leaf_chunk_free(l, chunk);
269168404Spjd		chunk = nextchunk;
270168404Spjd	}
271168404Spjd}
272168404Spjd
273168404Spjd/* array_len and buf_len are in integers, not bytes */
274168404Spjdstatic void
275168404Spjdzap_leaf_array_read(zap_leaf_t *l, uint16_t chunk,
276168404Spjd    int array_int_len, int array_len, int buf_int_len, uint64_t buf_len,
277219089Spjd    void *buf)
278168404Spjd{
279168404Spjd	int len = MIN(array_len, buf_len);
280168404Spjd	int byten = 0;
281168404Spjd	uint64_t value = 0;
282219089Spjd	char *p = buf;
283168404Spjd
284168404Spjd	ASSERT3U(array_int_len, <=, buf_int_len);
285168404Spjd
286168404Spjd	/* Fast path for one 8-byte integer */
287168404Spjd	if (array_int_len == 8 && buf_int_len == 8 && len == 1) {
288168404Spjd		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
289168404Spjd		uint8_t *ip = la->la_array;
290219089Spjd		uint64_t *buf64 = buf;
291168404Spjd
292168404Spjd		*buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 |
293168404Spjd		    (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 |
294168404Spjd		    (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 |
295168404Spjd		    (uint64_t)ip[6] << 8 | (uint64_t)ip[7];
296168404Spjd		return;
297168404Spjd	}
298168404Spjd
299168404Spjd	/* Fast path for an array of 1-byte integers (eg. the entry name) */
300168404Spjd	if (array_int_len == 1 && buf_int_len == 1 &&
301168404Spjd	    buf_len > array_len + ZAP_LEAF_ARRAY_BYTES) {
302168404Spjd		while (chunk != CHAIN_END) {
303168404Spjd			struct zap_leaf_array *la =
304168404Spjd			    &ZAP_LEAF_CHUNK(l, chunk).l_array;
305219089Spjd			bcopy(la->la_array, p, ZAP_LEAF_ARRAY_BYTES);
306219089Spjd			p += ZAP_LEAF_ARRAY_BYTES;
307168404Spjd			chunk = la->la_next;
308168404Spjd		}
309168404Spjd		return;
310168404Spjd	}
311168404Spjd
312168404Spjd	while (len > 0) {
313168404Spjd		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
314168404Spjd		int i;
315168404Spjd
316168404Spjd		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
317168404Spjd		for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
318168404Spjd			value = (value << 8) | la->la_array[i];
319168404Spjd			byten++;
320168404Spjd			if (byten == array_int_len) {
321219089Spjd				stv(buf_int_len, p, value);
322168404Spjd				byten = 0;
323168404Spjd				len--;
324168404Spjd				if (len == 0)
325168404Spjd					return;
326219089Spjd				p += buf_int_len;
327168404Spjd			}
328168404Spjd		}
329168404Spjd		chunk = la->la_next;
330168404Spjd	}
331168404Spjd}
332168404Spjd
333185029Spjdstatic boolean_t
334219089Spjdzap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn,
335219089Spjd    int chunk, int array_numints)
336168404Spjd{
337168404Spjd	int bseen = 0;
338168404Spjd
339219089Spjd	if (zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY) {
340219089Spjd		uint64_t *thiskey;
341219089Spjd		boolean_t match;
342219089Spjd
343219089Spjd		ASSERT(zn->zn_key_intlen == sizeof (*thiskey));
344219089Spjd		thiskey = kmem_alloc(array_numints * sizeof (*thiskey),
345219089Spjd		    KM_SLEEP);
346219089Spjd
347219089Spjd		zap_leaf_array_read(l, chunk, sizeof (*thiskey), array_numints,
348219089Spjd		    sizeof (*thiskey), array_numints, thiskey);
349219089Spjd		match = bcmp(thiskey, zn->zn_key_orig,
350219089Spjd		    array_numints * sizeof (*thiskey)) == 0;
351219089Spjd		kmem_free(thiskey, array_numints * sizeof (*thiskey));
352219089Spjd		return (match);
353219089Spjd	}
354219089Spjd
355219089Spjd	ASSERT(zn->zn_key_intlen == 1);
356185029Spjd	if (zn->zn_matchtype == MT_FIRST) {
357219089Spjd		char *thisname = kmem_alloc(array_numints, KM_SLEEP);
358185029Spjd		boolean_t match;
359185029Spjd
360219089Spjd		zap_leaf_array_read(l, chunk, sizeof (char), array_numints,
361219089Spjd		    sizeof (char), array_numints, thisname);
362185029Spjd		match = zap_match(zn, thisname);
363219089Spjd		kmem_free(thisname, array_numints);
364185029Spjd		return (match);
365185029Spjd	}
366185029Spjd
367219089Spjd	/*
368219089Spjd	 * Fast path for exact matching.
369219089Spjd	 * First check that the lengths match, so that we don't read
370219089Spjd	 * past the end of the zn_key_orig array.
371219089Spjd	 */
372219089Spjd	if (array_numints != zn->zn_key_orig_numints)
373219089Spjd		return (B_FALSE);
374219089Spjd	while (bseen < array_numints) {
375168404Spjd		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array;
376219089Spjd		int toread = MIN(array_numints - bseen, ZAP_LEAF_ARRAY_BYTES);
377168404Spjd		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
378219089Spjd		if (bcmp(la->la_array, (char *)zn->zn_key_orig + bseen, toread))
379168404Spjd			break;
380168404Spjd		chunk = la->la_next;
381168404Spjd		bseen += toread;
382168404Spjd	}
383219089Spjd	return (bseen == array_numints);
384168404Spjd}
385168404Spjd
386168404Spjd/*
387168404Spjd * Routines which manipulate leaf entries.
388168404Spjd */
389168404Spjd
390168404Spjdint
391185029Spjdzap_leaf_lookup(zap_leaf_t *l, zap_name_t *zn, zap_entry_handle_t *zeh)
392168404Spjd{
393168404Spjd	uint16_t *chunkp;
394168404Spjd	struct zap_leaf_entry *le;
395168404Spjd
396168404Spjd	ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
397168404Spjd
398185029Spjdagain:
399185029Spjd	for (chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash);
400168404Spjd	    *chunkp != CHAIN_END; chunkp = &le->le_next) {
401168404Spjd		uint16_t chunk = *chunkp;
402168404Spjd		le = ZAP_LEAF_ENTRY(l, chunk);
403168404Spjd
404168404Spjd		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
405168404Spjd		ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
406168404Spjd
407185029Spjd		if (le->le_hash != zn->zn_hash)
408168404Spjd			continue;
409168404Spjd
410185029Spjd		/*
411185029Spjd		 * NB: the entry chain is always sorted by cd on
412185029Spjd		 * normalized zap objects, so this will find the
413185029Spjd		 * lowest-cd match for MT_FIRST.
414185029Spjd		 */
415185029Spjd		ASSERT(zn->zn_matchtype == MT_EXACT ||
416185029Spjd		    (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED));
417185029Spjd		if (zap_leaf_array_match(l, zn, le->le_name_chunk,
418219089Spjd		    le->le_name_numints)) {
419219089Spjd			zeh->zeh_num_integers = le->le_value_numints;
420219089Spjd			zeh->zeh_integer_size = le->le_value_intlen;
421168404Spjd			zeh->zeh_cd = le->le_cd;
422168404Spjd			zeh->zeh_hash = le->le_hash;
423168404Spjd			zeh->zeh_chunkp = chunkp;
424168404Spjd			zeh->zeh_leaf = l;
425168404Spjd			return (0);
426168404Spjd		}
427168404Spjd	}
428168404Spjd
429185029Spjd	/*
430185029Spjd	 * NB: we could of course do this in one pass, but that would be
431185029Spjd	 * a pain.  We'll see if MT_BEST is even used much.
432185029Spjd	 */
433185029Spjd	if (zn->zn_matchtype == MT_BEST) {
434185029Spjd		zn->zn_matchtype = MT_FIRST;
435185029Spjd		goto again;
436185029Spjd	}
437185029Spjd
438249195Smm	return (SET_ERROR(ENOENT));
439168404Spjd}
440168404Spjd
441168404Spjd/* Return (h1,cd1 >= h2,cd2) */
442168404Spjd#define	HCD_GTEQ(h1, cd1, h2, cd2) \
443168404Spjd	((h1 > h2) ? TRUE : ((h1 == h2 && cd1 >= cd2) ? TRUE : FALSE))
444168404Spjd
445168404Spjdint
446168404Spjdzap_leaf_lookup_closest(zap_leaf_t *l,
447168404Spjd    uint64_t h, uint32_t cd, zap_entry_handle_t *zeh)
448168404Spjd{
449168404Spjd	uint16_t chunk;
450168404Spjd	uint64_t besth = -1ULL;
451219089Spjd	uint32_t bestcd = -1U;
452168404Spjd	uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1;
453168404Spjd	uint16_t lh;
454168404Spjd	struct zap_leaf_entry *le;
455168404Spjd
456168404Spjd	ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
457168404Spjd
458168404Spjd	for (lh = LEAF_HASH(l, h); lh <= bestlh; lh++) {
459168404Spjd		for (chunk = l->l_phys->l_hash[lh];
460168404Spjd		    chunk != CHAIN_END; chunk = le->le_next) {
461168404Spjd			le = ZAP_LEAF_ENTRY(l, chunk);
462168404Spjd
463168404Spjd			ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
464168404Spjd			ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
465168404Spjd
466168404Spjd			if (HCD_GTEQ(le->le_hash, le->le_cd, h, cd) &&
467168404Spjd			    HCD_GTEQ(besth, bestcd, le->le_hash, le->le_cd)) {
468168404Spjd				ASSERT3U(bestlh, >=, lh);
469168404Spjd				bestlh = lh;
470168404Spjd				besth = le->le_hash;
471168404Spjd				bestcd = le->le_cd;
472168404Spjd
473219089Spjd				zeh->zeh_num_integers = le->le_value_numints;
474219089Spjd				zeh->zeh_integer_size = le->le_value_intlen;
475168404Spjd				zeh->zeh_cd = le->le_cd;
476168404Spjd				zeh->zeh_hash = le->le_hash;
477168404Spjd				zeh->zeh_fakechunk = chunk;
478168404Spjd				zeh->zeh_chunkp = &zeh->zeh_fakechunk;
479168404Spjd				zeh->zeh_leaf = l;
480168404Spjd			}
481168404Spjd		}
482168404Spjd	}
483168404Spjd
484219089Spjd	return (bestcd == -1U ? ENOENT : 0);
485168404Spjd}
486168404Spjd
487168404Spjdint
488168404Spjdzap_entry_read(const zap_entry_handle_t *zeh,
489168404Spjd    uint8_t integer_size, uint64_t num_integers, void *buf)
490168404Spjd{
491168404Spjd	struct zap_leaf_entry *le =
492168404Spjd	    ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
493168404Spjd	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
494168404Spjd
495219089Spjd	if (le->le_value_intlen > integer_size)
496249195Smm		return (SET_ERROR(EINVAL));
497168404Spjd
498219089Spjd	zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk,
499219089Spjd	    le->le_value_intlen, le->le_value_numints,
500219089Spjd	    integer_size, num_integers, buf);
501168404Spjd
502168404Spjd	if (zeh->zeh_num_integers > num_integers)
503249195Smm		return (SET_ERROR(EOVERFLOW));
504168404Spjd	return (0);
505168404Spjd
506168404Spjd}
507168404Spjd
508168404Spjdint
509219089Spjdzap_entry_read_name(zap_t *zap, const zap_entry_handle_t *zeh, uint16_t buflen,
510219089Spjd    char *buf)
511168404Spjd{
512168404Spjd	struct zap_leaf_entry *le =
513168404Spjd	    ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp);
514168404Spjd	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
515168404Spjd
516219089Spjd	if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
517219089Spjd		zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 8,
518219089Spjd		    le->le_name_numints, 8, buflen / 8, buf);
519219089Spjd	} else {
520219089Spjd		zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1,
521219089Spjd		    le->le_name_numints, 1, buflen, buf);
522219089Spjd	}
523219089Spjd	if (le->le_name_numints > buflen)
524249195Smm		return (SET_ERROR(EOVERFLOW));
525168404Spjd	return (0);
526168404Spjd}
527168404Spjd
528168404Spjdint
529168404Spjdzap_entry_update(zap_entry_handle_t *zeh,
530168404Spjd	uint8_t integer_size, uint64_t num_integers, const void *buf)
531168404Spjd{
532168404Spjd	int delta_chunks;
533168404Spjd	zap_leaf_t *l = zeh->zeh_leaf;
534168404Spjd	struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp);
535168404Spjd
536168404Spjd	delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) -
537219089Spjd	    ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen);
538168404Spjd
539168404Spjd	if ((int)l->l_phys->l_hdr.lh_nfree < delta_chunks)
540249195Smm		return (SET_ERROR(EAGAIN));
541168404Spjd
542168404Spjd	zap_leaf_array_free(l, &le->le_value_chunk);
543168404Spjd	le->le_value_chunk =
544168404Spjd	    zap_leaf_array_create(l, buf, integer_size, num_integers);
545219089Spjd	le->le_value_numints = num_integers;
546219089Spjd	le->le_value_intlen = integer_size;
547168404Spjd	return (0);
548168404Spjd}
549168404Spjd
550168404Spjdvoid
551168404Spjdzap_entry_remove(zap_entry_handle_t *zeh)
552168404Spjd{
553168404Spjd	uint16_t entry_chunk;
554168404Spjd	struct zap_leaf_entry *le;
555168404Spjd	zap_leaf_t *l = zeh->zeh_leaf;
556168404Spjd
557168404Spjd	ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk);
558168404Spjd
559168404Spjd	entry_chunk = *zeh->zeh_chunkp;
560168404Spjd	le = ZAP_LEAF_ENTRY(l, entry_chunk);
561168404Spjd	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
562168404Spjd
563168404Spjd	zap_leaf_array_free(l, &le->le_name_chunk);
564168404Spjd	zap_leaf_array_free(l, &le->le_value_chunk);
565168404Spjd
566168404Spjd	*zeh->zeh_chunkp = le->le_next;
567168404Spjd	zap_leaf_chunk_free(l, entry_chunk);
568168404Spjd
569168404Spjd	l->l_phys->l_hdr.lh_nentries--;
570168404Spjd}
571168404Spjd
572168404Spjdint
573219089Spjdzap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd,
574168404Spjd    uint8_t integer_size, uint64_t num_integers, const void *buf,
575168404Spjd    zap_entry_handle_t *zeh)
576168404Spjd{
577168404Spjd	uint16_t chunk;
578168404Spjd	uint16_t *chunkp;
579168404Spjd	struct zap_leaf_entry *le;
580219089Spjd	uint64_t valuelen;
581168404Spjd	int numchunks;
582219089Spjd	uint64_t h = zn->zn_hash;
583168404Spjd
584168404Spjd	valuelen = integer_size * num_integers;
585168404Spjd
586219089Spjd	numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints *
587219089Spjd	    zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen);
588168404Spjd	if (numchunks > ZAP_LEAF_NUMCHUNKS(l))
589168404Spjd		return (E2BIG);
590168404Spjd
591219089Spjd	if (cd == ZAP_NEED_CD) {
592185029Spjd		/* find the lowest unused cd */
593185029Spjd		if (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) {
594185029Spjd			cd = 0;
595185029Spjd
596168404Spjd			for (chunk = *LEAF_HASH_ENTPTR(l, h);
597168404Spjd			    chunk != CHAIN_END; chunk = le->le_next) {
598168404Spjd				le = ZAP_LEAF_ENTRY(l, chunk);
599185029Spjd				if (le->le_cd > cd)
600168404Spjd					break;
601185029Spjd				if (le->le_hash == h) {
602185029Spjd					ASSERT3U(cd, ==, le->le_cd);
603185029Spjd					cd++;
604168404Spjd				}
605168404Spjd			}
606185029Spjd		} else {
607185029Spjd			/* old unsorted format; do it the O(n^2) way */
608219089Spjd			for (cd = 0; ; cd++) {
609185029Spjd				for (chunk = *LEAF_HASH_ENTPTR(l, h);
610185029Spjd				    chunk != CHAIN_END; chunk = le->le_next) {
611185029Spjd					le = ZAP_LEAF_ENTRY(l, chunk);
612185029Spjd					if (le->le_hash == h &&
613185029Spjd					    le->le_cd == cd) {
614185029Spjd						break;
615185029Spjd					}
616185029Spjd				}
617185029Spjd				/* If this cd is not in use, we are good. */
618185029Spjd				if (chunk == CHAIN_END)
619185029Spjd					break;
620185029Spjd			}
621168404Spjd		}
622185029Spjd		/*
623219089Spjd		 * We would run out of space in a block before we could
624219089Spjd		 * store enough entries to run out of CD values.
625185029Spjd		 */
626219089Spjd		ASSERT3U(cd, <, zap_maxcd(zn->zn_zap));
627168404Spjd	}
628168404Spjd
629168404Spjd	if (l->l_phys->l_hdr.lh_nfree < numchunks)
630249195Smm		return (SET_ERROR(EAGAIN));
631168404Spjd
632168404Spjd	/* make the entry */
633168404Spjd	chunk = zap_leaf_chunk_alloc(l);
634168404Spjd	le = ZAP_LEAF_ENTRY(l, chunk);
635168404Spjd	le->le_type = ZAP_CHUNK_ENTRY;
636219089Spjd	le->le_name_chunk = zap_leaf_array_create(l, zn->zn_key_orig,
637219089Spjd	    zn->zn_key_intlen, zn->zn_key_orig_numints);
638219089Spjd	le->le_name_numints = zn->zn_key_orig_numints;
639168404Spjd	le->le_value_chunk =
640168404Spjd	    zap_leaf_array_create(l, buf, integer_size, num_integers);
641219089Spjd	le->le_value_numints = num_integers;
642219089Spjd	le->le_value_intlen = integer_size;
643168404Spjd	le->le_hash = h;
644168404Spjd	le->le_cd = cd;
645168404Spjd
646168404Spjd	/* link it into the hash chain */
647185029Spjd	/* XXX if we did the search above, we could just use that */
648185029Spjd	chunkp = zap_leaf_rehash_entry(l, chunk);
649168404Spjd
650168404Spjd	l->l_phys->l_hdr.lh_nentries++;
651168404Spjd
652168404Spjd	zeh->zeh_leaf = l;
653168404Spjd	zeh->zeh_num_integers = num_integers;
654219089Spjd	zeh->zeh_integer_size = le->le_value_intlen;
655168404Spjd	zeh->zeh_cd = le->le_cd;
656168404Spjd	zeh->zeh_hash = le->le_hash;
657168404Spjd	zeh->zeh_chunkp = chunkp;
658168404Spjd
659168404Spjd	return (0);
660168404Spjd}
661168404Spjd
662168404Spjd/*
663185029Spjd * Determine if there is another entry with the same normalized form.
664185029Spjd * For performance purposes, either zn or name must be provided (the
665185029Spjd * other can be NULL).  Note, there usually won't be any hash
666185029Spjd * conflicts, in which case we don't need the concatenated/normalized
667185029Spjd * form of the name.  But all callers have one of these on hand anyway,
668185029Spjd * so might as well take advantage.  A cleaner but slower interface
669185029Spjd * would accept neither argument, and compute the normalized name as
670185029Spjd * needed (using zap_name_alloc(zap_entry_read_name(zeh))).
671185029Spjd */
672185029Spjdboolean_t
673185029Spjdzap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn,
674185029Spjd    const char *name, zap_t *zap)
675185029Spjd{
676185029Spjd	uint64_t chunk;
677185029Spjd	struct zap_leaf_entry *le;
678185029Spjd	boolean_t allocdzn = B_FALSE;
679185029Spjd
680185029Spjd	if (zap->zap_normflags == 0)
681185029Spjd		return (B_FALSE);
682185029Spjd
683185029Spjd	for (chunk = *LEAF_HASH_ENTPTR(zeh->zeh_leaf, zeh->zeh_hash);
684185029Spjd	    chunk != CHAIN_END; chunk = le->le_next) {
685185029Spjd		le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, chunk);
686185029Spjd		if (le->le_hash != zeh->zeh_hash)
687185029Spjd			continue;
688185029Spjd		if (le->le_cd == zeh->zeh_cd)
689185029Spjd			continue;
690185029Spjd
691185029Spjd		if (zn == NULL) {
692185029Spjd			zn = zap_name_alloc(zap, name, MT_FIRST);
693185029Spjd			allocdzn = B_TRUE;
694185029Spjd		}
695185029Spjd		if (zap_leaf_array_match(zeh->zeh_leaf, zn,
696219089Spjd		    le->le_name_chunk, le->le_name_numints)) {
697185029Spjd			if (allocdzn)
698185029Spjd				zap_name_free(zn);
699185029Spjd			return (B_TRUE);
700185029Spjd		}
701185029Spjd	}
702185029Spjd	if (allocdzn)
703185029Spjd		zap_name_free(zn);
704185029Spjd	return (B_FALSE);
705185029Spjd}
706185029Spjd
707185029Spjd/*
708168404Spjd * Routines for transferring entries between leafs.
709168404Spjd */
710168404Spjd
711185029Spjdstatic uint16_t *
712168404Spjdzap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry)
713168404Spjd{
714168404Spjd	struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry);
715185029Spjd	struct zap_leaf_entry *le2;
716185029Spjd	uint16_t *chunkp;
717185029Spjd
718185029Spjd	/*
719185029Spjd	 * keep the entry chain sorted by cd
720185029Spjd	 * NB: this will not cause problems for unsorted leafs, though
721185029Spjd	 * it is unnecessary there.
722185029Spjd	 */
723185029Spjd	for (chunkp = LEAF_HASH_ENTPTR(l, le->le_hash);
724185029Spjd	    *chunkp != CHAIN_END; chunkp = &le2->le_next) {
725185029Spjd		le2 = ZAP_LEAF_ENTRY(l, *chunkp);
726185029Spjd		if (le2->le_cd > le->le_cd)
727185029Spjd			break;
728185029Spjd	}
729185029Spjd
730185029Spjd	le->le_next = *chunkp;
731185029Spjd	*chunkp = entry;
732185029Spjd	return (chunkp);
733168404Spjd}
734168404Spjd
735168404Spjdstatic uint16_t
736168404Spjdzap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl)
737168404Spjd{
738168404Spjd	uint16_t new_chunk;
739168404Spjd	uint16_t *nchunkp = &new_chunk;
740168404Spjd
741168404Spjd	while (chunk != CHAIN_END) {
742168404Spjd		uint16_t nchunk = zap_leaf_chunk_alloc(nl);
743168404Spjd		struct zap_leaf_array *nla =
744168404Spjd		    &ZAP_LEAF_CHUNK(nl, nchunk).l_array;
745168404Spjd		struct zap_leaf_array *la =
746168404Spjd		    &ZAP_LEAF_CHUNK(l, chunk).l_array;
747168404Spjd		int nextchunk = la->la_next;
748168404Spjd
749168404Spjd		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
750168404Spjd		ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l));
751168404Spjd
752168404Spjd		*nla = *la; /* structure assignment */
753168404Spjd
754168404Spjd		zap_leaf_chunk_free(l, chunk);
755168404Spjd		chunk = nextchunk;
756168404Spjd		*nchunkp = nchunk;
757168404Spjd		nchunkp = &nla->la_next;
758168404Spjd	}
759168404Spjd	*nchunkp = CHAIN_END;
760168404Spjd	return (new_chunk);
761168404Spjd}
762168404Spjd
763168404Spjdstatic void
764168404Spjdzap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl)
765168404Spjd{
766168404Spjd	struct zap_leaf_entry *le, *nle;
767168404Spjd	uint16_t chunk;
768168404Spjd
769168404Spjd	le = ZAP_LEAF_ENTRY(l, entry);
770168404Spjd	ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY);
771168404Spjd
772168404Spjd	chunk = zap_leaf_chunk_alloc(nl);
773168404Spjd	nle = ZAP_LEAF_ENTRY(nl, chunk);
774168404Spjd	*nle = *le; /* structure assignment */
775168404Spjd
776185029Spjd	(void) zap_leaf_rehash_entry(nl, chunk);
777168404Spjd
778168404Spjd	nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl);
779168404Spjd	nle->le_value_chunk =
780168404Spjd	    zap_leaf_transfer_array(l, le->le_value_chunk, nl);
781168404Spjd
782168404Spjd	zap_leaf_chunk_free(l, entry);
783168404Spjd
784168404Spjd	l->l_phys->l_hdr.lh_nentries--;
785168404Spjd	nl->l_phys->l_hdr.lh_nentries++;
786168404Spjd}
787168404Spjd
788168404Spjd/*
789168404Spjd * Transfer the entries whose hash prefix ends in 1 to the new leaf.
790168404Spjd */
791168404Spjdvoid
792185029Spjdzap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort)
793168404Spjd{
794168404Spjd	int i;
795168404Spjd	int bit = 64 - 1 - l->l_phys->l_hdr.lh_prefix_len;
796168404Spjd
797168404Spjd	/* set new prefix and prefix_len */
798168404Spjd	l->l_phys->l_hdr.lh_prefix <<= 1;
799168404Spjd	l->l_phys->l_hdr.lh_prefix_len++;
800168404Spjd	nl->l_phys->l_hdr.lh_prefix = l->l_phys->l_hdr.lh_prefix | 1;
801168404Spjd	nl->l_phys->l_hdr.lh_prefix_len = l->l_phys->l_hdr.lh_prefix_len;
802168404Spjd
803168404Spjd	/* break existing hash chains */
804168404Spjd	zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l));
805168404Spjd
806185029Spjd	if (sort)
807185029Spjd		l->l_phys->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
808185029Spjd
809168404Spjd	/*
810168404Spjd	 * Transfer entries whose hash bit 'bit' is set to nl; rehash
811168404Spjd	 * the remaining entries
812168404Spjd	 *
813168404Spjd	 * NB: We could find entries via the hashtable instead. That
814168404Spjd	 * would be O(hashents+numents) rather than O(numblks+numents),
815168404Spjd	 * but this accesses memory more sequentially, and when we're
816168404Spjd	 * called, the block is usually pretty full.
817168404Spjd	 */
818168404Spjd	for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
819168404Spjd		struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i);
820168404Spjd		if (le->le_type != ZAP_CHUNK_ENTRY)
821168404Spjd			continue;
822168404Spjd
823168404Spjd		if (le->le_hash & (1ULL << bit))
824168404Spjd			zap_leaf_transfer_entry(l, i, nl);
825168404Spjd		else
826185029Spjd			(void) zap_leaf_rehash_entry(l, i);
827168404Spjd	}
828168404Spjd}
829168404Spjd
830168404Spjdvoid
831168404Spjdzap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
832168404Spjd{
833168404Spjd	int i, n;
834168404Spjd
835168404Spjd	n = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift -
836168404Spjd	    l->l_phys->l_hdr.lh_prefix_len;
837168404Spjd	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
838168404Spjd	zs->zs_leafs_with_2n_pointers[n]++;
839168404Spjd
840168404Spjd
841168404Spjd	n = l->l_phys->l_hdr.lh_nentries/5;
842168404Spjd	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
843168404Spjd	zs->zs_blocks_with_n5_entries[n]++;
844168404Spjd
845168404Spjd	n = ((1<<FZAP_BLOCK_SHIFT(zap)) -
846168404Spjd	    l->l_phys->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 /
847168404Spjd	    (1<<FZAP_BLOCK_SHIFT(zap));
848168404Spjd	n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
849168404Spjd	zs->zs_blocks_n_tenths_full[n]++;
850168404Spjd
851168404Spjd	for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) {
852168404Spjd		int nentries = 0;
853168404Spjd		int chunk = l->l_phys->l_hash[i];
854168404Spjd
855168404Spjd		while (chunk != CHAIN_END) {
856168404Spjd			struct zap_leaf_entry *le =
857168404Spjd			    ZAP_LEAF_ENTRY(l, chunk);
858168404Spjd
859219089Spjd			n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_numints) +
860219089Spjd			    ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints *
861219089Spjd			    le->le_value_intlen);
862168404Spjd			n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
863168404Spjd			zs->zs_entries_using_n_chunks[n]++;
864168404Spjd
865168404Spjd			chunk = le->le_next;
866168404Spjd			nentries++;
867168404Spjd		}
868168404Spjd
869168404Spjd		n = nentries;
870168404Spjd		n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
871168404Spjd		zs->zs_buckets_with_n_entries[n]++;
872168404Spjd	}
873168404Spjd}
874