subr_blist.c revision 323665
1/*-
2 * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
3 * Redistribution and use in source and binary forms, with or without
4 * modification, are permitted provided that the following conditions
5 * are met:
6 * 1. Redistributions of source code must retain the above copyright
7 *    notice, this list of conditions and the following disclaimer.
8 * 2. Redistributions in binary form must reproduce the above copyright
9 *    notice, this list of conditions and the following disclaimer in the
10 *    documentation and/or other materials provided with the distribution.
11 * 4. Neither the name of the University nor the names of its contributors
12 *    may be used to endorse or promote products derived from this software
13 *    without specific prior written permission.
14 *
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
16 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
19 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
21 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
24 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27/*
28 * BLIST.C -	Bitmap allocator/deallocator, using a radix tree with hinting
29 *
30 *	This module implements a general bitmap allocator/deallocator.  The
31 *	allocator eats around 2 bits per 'block'.  The module does not
32 *	try to interpret the meaning of a 'block' other than to return
33 *	SWAPBLK_NONE on an allocation failure.
34 *
35 *	A radix tree is used to maintain the bitmap.  Two radix constants are
36 *	involved:  One for the bitmaps contained in the leaf nodes (typically
37 *	64), and one for the meta nodes (typically 16).  Both meta and leaf
38 *	nodes have a hint field.  This field gives us a hint as to the largest
39 *	free contiguous range of blocks under the node.  It may contain a
40 *	value that is too high, but will never contain a value that is too
41 *	low.  When the radix tree is searched, allocation failures in subtrees
42 *	update the hint.
43 *
44 *	The radix tree also implements two collapsed states for meta nodes:
45 *	the ALL-ALLOCATED state and the ALL-FREE state.  If a meta node is
46 *	in either of these two states, all information contained underneath
47 *	the node is considered stale.  These states are used to optimize
48 *	allocation and freeing operations.
49 *
50 * 	The hinting greatly increases code efficiency for allocations while
51 *	the general radix structure optimizes both allocations and frees.  The
52 *	radix tree should be able to operate well no matter how much
53 *	fragmentation there is and no matter how large a bitmap is used.
54 *
55 *	The blist code wires all necessary memory at creation time.  Neither
56 *	allocations nor frees require interaction with the memory subsystem.
57 *	The non-blocking features of the blist code are used in the swap code
58 *	(vm/swap_pager.c).
59 *
60 *	LAYOUT: The radix tree is laid out recursively using a
61 *	linear array.  Each meta node is immediately followed (laid out
62 *	sequentially in memory) by BLIST_META_RADIX lower level nodes.  This
63 *	is a recursive structure but one that can be easily scanned through
64 *	a very simple 'skip' calculation.  In order to support large radixes,
65 *	portions of the tree may reside outside our memory allocation.  We
66 *	handle this with an early-termination optimization (when bighint is
67 *	set to -1) on the scan.  The memory allocation is only large enough
68 *	to cover the number of blocks requested at creation time even if it
69 *	must be encompassed in larger root-node radix.
70 *
71 *	NOTE: the allocator cannot currently allocate more than
72 *	BLIST_BMAP_RADIX blocks per call.  It will panic with 'allocation too
73 *	large' if you try.  This is an area that could use improvement.  The
74 *	radix is large enough that this restriction does not effect the swap
75 *	system, though.  Currently only the allocation code is affected by
76 *	this algorithmic unfeature.  The freeing code can handle arbitrary
77 *	ranges.
78 *
79 *	This code can be compiled stand-alone for debugging.
80 */
81
82#include <sys/cdefs.h>
83__FBSDID("$FreeBSD: stable/11/sys/kern/subr_blist.c 323665 2017-09-17 04:15:12Z alc $");
84
85#ifdef _KERNEL
86
87#include <sys/param.h>
88#include <sys/systm.h>
89#include <sys/lock.h>
90#include <sys/kernel.h>
91#include <sys/blist.h>
92#include <sys/malloc.h>
93#include <sys/proc.h>
94#include <sys/mutex.h>
95
96#else
97
98#ifndef BLIST_NO_DEBUG
99#define BLIST_DEBUG
100#endif
101
102#include <sys/types.h>
103#include <sys/malloc.h>
104#include <stdio.h>
105#include <string.h>
106#include <stdlib.h>
107#include <stdarg.h>
108#include <stdbool.h>
109
110#define	bitcount64(x)	__bitcount64((uint64_t)(x))
111#define malloc(a,b,c)	calloc(a, 1)
112#define free(a,b)	free(a)
113
114#include <sys/blist.h>
115
116void panic(const char *ctl, ...);
117
118#endif
119
120/*
121 * static support functions
122 */
123static daddr_t	blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count,
124		    daddr_t cursor);
125static daddr_t	blst_meta_alloc(blmeta_t *scan, daddr_t blk, daddr_t count,
126		    daddr_t radix, daddr_t skip, daddr_t cursor);
127static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
128static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
129		    daddr_t radix, daddr_t skip, daddr_t blk);
130static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
131		    daddr_t skip, blist_t dest, daddr_t count);
132static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
133static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
134		    daddr_t radix, daddr_t skip, daddr_t blk);
135static daddr_t	blst_radix_init(blmeta_t *scan, daddr_t radix, daddr_t skip,
136		    daddr_t count);
137#ifndef _KERNEL
138static void	blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
139		    daddr_t skip, int tab);
140#endif
141
142#ifdef _KERNEL
143static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
144#endif
145
146/*
147 * blist_create() - create a blist capable of handling up to the specified
148 *		    number of blocks
149 *
150 *	blocks - must be greater than 0
151 * 	flags  - malloc flags
152 *
153 *	The smallest blist consists of a single leaf node capable of
154 *	managing BLIST_BMAP_RADIX blocks.
155 */
156blist_t
157blist_create(daddr_t blocks, int flags)
158{
159	blist_t bl;
160	daddr_t nodes, radix, skip;
161
162	/*
163	 * Calculate radix and skip field used for scanning.
164	 */
165	radix = BLIST_BMAP_RADIX;
166	skip = 0;
167	while (radix < blocks) {
168		radix *= BLIST_META_RADIX;
169		skip = (skip + 1) * BLIST_META_RADIX;
170	}
171	nodes = 1 + blst_radix_init(NULL, radix, skip, blocks);
172
173	bl = malloc(sizeof(struct blist), M_SWAP, flags);
174	if (bl == NULL)
175		return (NULL);
176
177	bl->bl_blocks = blocks;
178	bl->bl_radix = radix;
179	bl->bl_skip = skip;
180	bl->bl_cursor = 0;
181	bl->bl_root = malloc(nodes * sizeof(blmeta_t), M_SWAP, flags);
182	if (bl->bl_root == NULL) {
183		free(bl, M_SWAP);
184		return (NULL);
185	}
186	blst_radix_init(bl->bl_root, radix, skip, blocks);
187
188#if defined(BLIST_DEBUG)
189	printf(
190		"BLIST representing %lld blocks (%lld MB of swap)"
191		", requiring %lldK of ram\n",
192		(long long)bl->bl_blocks,
193		(long long)bl->bl_blocks * 4 / 1024,
194		(long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
195	);
196	printf("BLIST raw radix tree contains %lld records\n",
197	    (long long)nodes);
198#endif
199
200	return (bl);
201}
202
203void
204blist_destroy(blist_t bl)
205{
206	free(bl->bl_root, M_SWAP);
207	free(bl, M_SWAP);
208}
209
210/*
211 * blist_alloc() -   reserve space in the block bitmap.  Return the base
212 *		     of a contiguous region or SWAPBLK_NONE if space could
213 *		     not be allocated.
214 */
215daddr_t
216blist_alloc(blist_t bl, daddr_t count)
217{
218	daddr_t blk;
219
220	/*
221	 * This loop iterates at most twice.  An allocation failure in the
222	 * first iteration leads to a second iteration only if the cursor was
223	 * non-zero.  When the cursor is zero, an allocation failure will
224	 * reduce the hint, stopping further iterations.
225	 */
226	while (count <= bl->bl_root->bm_bighint) {
227		blk = blst_meta_alloc(bl->bl_root, 0, count, bl->bl_radix,
228		    bl->bl_skip, bl->bl_cursor);
229		if (blk != SWAPBLK_NONE) {
230			bl->bl_cursor = blk + count;
231			return (blk);
232		} else if (bl->bl_cursor != 0)
233			bl->bl_cursor = 0;
234	}
235	return (SWAPBLK_NONE);
236}
237
238/*
239 * blist_avail() -	return the number of free blocks.
240 */
241daddr_t
242blist_avail(blist_t bl)
243{
244
245	if (bl->bl_radix == BLIST_BMAP_RADIX)
246		return (bitcount64(bl->bl_root->u.bmu_bitmap));
247	else
248		return (bl->bl_root->u.bmu_avail);
249}
250
251/*
252 * blist_free() -	free up space in the block bitmap.  Return the base
253 *		     	of a contiguous region.  Panic if an inconsistancy is
254 *			found.
255 */
256void
257blist_free(blist_t bl, daddr_t blkno, daddr_t count)
258{
259
260	blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix, bl->bl_skip, 0);
261}
262
263/*
264 * blist_fill() -	mark a region in the block bitmap as off-limits
265 *			to the allocator (i.e. allocate it), ignoring any
266 *			existing allocations.  Return the number of blocks
267 *			actually filled that were free before the call.
268 */
269daddr_t
270blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
271{
272
273	return (blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix,
274	    bl->bl_skip, 0));
275}
276
277/*
278 * blist_resize() -	resize an existing radix tree to handle the
279 *			specified number of blocks.  This will reallocate
280 *			the tree and transfer the previous bitmap to the new
281 *			one.  When extending the tree you can specify whether
282 *			the new blocks are to left allocated or freed.
283 */
284void
285blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
286{
287    blist_t newbl = blist_create(count, flags);
288    blist_t save = *pbl;
289
290    *pbl = newbl;
291    if (count > save->bl_blocks)
292	    count = save->bl_blocks;
293    blst_copy(save->bl_root, 0, save->bl_radix, save->bl_skip, newbl, count);
294
295    /*
296     * If resizing upwards, should we free the new space or not?
297     */
298    if (freenew && count < newbl->bl_blocks) {
299	    blist_free(newbl, count, newbl->bl_blocks - count);
300    }
301    blist_destroy(save);
302}
303
304#ifdef BLIST_DEBUG
305
306/*
307 * blist_print()    - dump radix tree
308 */
309void
310blist_print(blist_t bl)
311{
312	printf("BLIST {\n");
313	blst_radix_print(bl->bl_root, 0, bl->bl_radix, bl->bl_skip, 4);
314	printf("}\n");
315}
316
317#endif
318
319/************************************************************************
320 *			  ALLOCATION SUPPORT FUNCTIONS			*
321 ************************************************************************
322 *
323 *	These support functions do all the actual work.  They may seem
324 *	rather longish, but that's because I've commented them up.  The
325 *	actual code is straight forward.
326 *
327 */
328
329/*
330 * blist_leaf_alloc() -	allocate at a leaf in the radix tree (a bitmap).
331 *
332 *	This is the core of the allocator and is optimized for the
333 *	BLIST_BMAP_RADIX block allocation case.  Otherwise, execution
334 *	time is proportional to log2(count) + log2(BLIST_BMAP_RADIX).
335 */
336static daddr_t
337blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count, daddr_t cursor)
338{
339	u_daddr_t mask;
340	int count1, hi, lo, mid, num_shifts, range1, range_ext;
341
342	if (count == BLIST_BMAP_RADIX) {
343		/*
344		 * Optimize allocation of BLIST_BMAP_RADIX bits.  If this wasn't
345		 * a special case, then forming the final value of 'mask' below
346		 * would require special handling to avoid an invalid left shift
347		 * when count equals the number of bits in mask.
348		 */
349		if (~scan->u.bmu_bitmap != 0) {
350			scan->bm_bighint = BLIST_BMAP_RADIX - 1;
351			return (SWAPBLK_NONE);
352		}
353		if (cursor != blk)
354			return (SWAPBLK_NONE);
355		scan->u.bmu_bitmap = 0;
356		scan->bm_bighint = 0;
357		return (blk);
358	}
359	range1 = 0;
360	count1 = count - 1;
361	num_shifts = fls(count1);
362	mask = scan->u.bmu_bitmap;
363	while (mask != 0 && num_shifts > 0) {
364		/*
365		 * If bit i is set in mask, then bits in [i, i+range1] are set
366		 * in scan->u.bmu_bitmap.  The value of range1 is equal to
367		 * count1 >> num_shifts.  Grow range and reduce num_shifts to 0,
368		 * while preserving these invariants.  The updates to mask leave
369		 * fewer bits set, but each bit that remains set represents a
370		 * longer string of consecutive bits set in scan->u.bmu_bitmap.
371		 */
372		num_shifts--;
373		range_ext = range1 + ((count1 >> num_shifts) & 1);
374		mask &= mask >> range_ext;
375		range1 += range_ext;
376	}
377	if (mask == 0) {
378		/*
379		 * Update bighint.  There is no allocation bigger than range1
380		 * available in this leaf.
381		 */
382		scan->bm_bighint = range1;
383		return (SWAPBLK_NONE);
384	}
385
386	/*
387	 * Discard any candidates that appear before the cursor.
388	 */
389	lo = cursor - blk;
390	mask &= ~(u_daddr_t)0 << lo;
391
392	if (mask == 0)
393		return (SWAPBLK_NONE);
394
395	/*
396	 * The least significant set bit in mask marks the start of the first
397	 * available range of sufficient size.  Clear all the bits but that one,
398	 * and then perform a binary search to find its position.
399	 */
400	mask &= -mask;
401	hi = BLIST_BMAP_RADIX - count1;
402	while (lo + 1 < hi) {
403		mid = (lo + hi) >> 1;
404		if ((mask >> mid) != 0)
405			lo = mid;
406		else
407			hi = mid;
408	}
409
410	/*
411	 * Set in mask exactly the bits being allocated, and clear them from
412	 * the set of available bits.
413	 */
414	mask = (mask << count) - mask;
415	scan->u.bmu_bitmap &= ~mask;
416	return (blk + lo);
417}
418
419/*
420 * blist_meta_alloc() -	allocate at a meta in the radix tree.
421 *
422 *	Attempt to allocate at a meta node.  If we can't, we update
423 *	bighint and return a failure.  Updating bighint optimize future
424 *	calls that hit this node.  We have to check for our collapse cases
425 *	and we have a few optimizations strewn in as well.
426 */
427static daddr_t
428blst_meta_alloc(blmeta_t *scan, daddr_t blk, daddr_t count, daddr_t radix,
429    daddr_t skip, daddr_t cursor)
430{
431	daddr_t i, next_skip, r;
432	int child;
433	bool scan_from_start;
434
435	if (radix == BLIST_BMAP_RADIX)
436		return (blst_leaf_alloc(scan, blk, count, cursor));
437	if (scan->u.bmu_avail < count) {
438		/*
439		 * The meta node's hint must be too large if the allocation
440		 * exceeds the number of free blocks.  Reduce the hint, and
441		 * return failure.
442		 */
443		scan->bm_bighint = scan->u.bmu_avail;
444		return (SWAPBLK_NONE);
445	}
446	next_skip = skip / BLIST_META_RADIX;
447
448	/*
449	 * An ALL-FREE meta node requires special handling before allocating
450	 * any of its blocks.
451	 */
452	if (scan->u.bmu_avail == radix) {
453		radix /= BLIST_META_RADIX;
454
455		/*
456		 * Reinitialize each of the meta node's children.  An ALL-FREE
457		 * meta node cannot have a terminator in any subtree.
458		 */
459		for (i = 1; i <= skip; i += next_skip) {
460			if (next_skip == 1)
461				scan[i].u.bmu_bitmap = (u_daddr_t)-1;
462			else
463				scan[i].u.bmu_avail = radix;
464			scan[i].bm_bighint = radix;
465		}
466	} else {
467		radix /= BLIST_META_RADIX;
468	}
469
470	if (count > radix) {
471		/*
472		 * The allocation exceeds the number of blocks that are
473		 * managed by a subtree of this meta node.
474		 */
475		panic("allocation too large");
476	}
477	scan_from_start = cursor == blk;
478	child = (cursor - blk) / radix;
479	blk += child * radix;
480	for (i = 1 + child * next_skip; i <= skip; i += next_skip) {
481		if (count <= scan[i].bm_bighint) {
482			/*
483			 * The allocation might fit in the i'th subtree.
484			 */
485			r = blst_meta_alloc(&scan[i], blk, count, radix,
486			    next_skip - 1, cursor > blk ? cursor : blk);
487			if (r != SWAPBLK_NONE) {
488				scan->u.bmu_avail -= count;
489				return (r);
490			}
491		} else if (scan[i].bm_bighint == (daddr_t)-1) {
492			/*
493			 * Terminator
494			 */
495			break;
496		}
497		blk += radix;
498	}
499
500	/*
501	 * We couldn't allocate count in this subtree, update bighint.
502	 */
503	if (scan_from_start && scan->bm_bighint >= count)
504		scan->bm_bighint = count - 1;
505
506	return (SWAPBLK_NONE);
507}
508
509/*
510 * BLST_LEAF_FREE() -	free allocated block from leaf bitmap
511 *
512 */
513static void
514blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
515{
516	/*
517	 * free some data in this bitmap
518	 *
519	 * e.g.
520	 *	0000111111111110000
521	 *          \_________/\__/
522	 *		v        n
523	 */
524	int n = blk & (BLIST_BMAP_RADIX - 1);
525	u_daddr_t mask;
526
527	mask = ((u_daddr_t)-1 << n) &
528	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));
529
530	if (scan->u.bmu_bitmap & mask)
531		panic("blst_radix_free: freeing free block");
532	scan->u.bmu_bitmap |= mask;
533
534	/*
535	 * We could probably do a better job here.  We are required to make
536	 * bighint at least as large as the biggest contiguous block of
537	 * data.  If we just shoehorn it, a little extra overhead will
538	 * be incured on the next allocation (but only that one typically).
539	 */
540	scan->bm_bighint = BLIST_BMAP_RADIX;
541}
542
543/*
544 * BLST_META_FREE() - free allocated blocks from radix tree meta info
545 *
546 *	This support routine frees a range of blocks from the bitmap.
547 *	The range must be entirely enclosed by this radix node.  If a
548 *	meta node, we break the range down recursively to free blocks
549 *	in subnodes (which means that this code can free an arbitrary
550 *	range whereas the allocation code cannot allocate an arbitrary
551 *	range).
552 */
553static void
554blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, daddr_t radix,
555    daddr_t skip, daddr_t blk)
556{
557	daddr_t i, next_skip, v;
558	int child;
559
560	if (scan->bm_bighint == (daddr_t)-1)
561		panic("freeing invalid range");
562	if (radix == BLIST_BMAP_RADIX)
563		return (blst_leaf_free(scan, freeBlk, count));
564	next_skip = skip / BLIST_META_RADIX;
565
566	if (scan->u.bmu_avail == 0) {
567		/*
568		 * ALL-ALLOCATED special case, with possible
569		 * shortcut to ALL-FREE special case.
570		 */
571		scan->u.bmu_avail = count;
572		scan->bm_bighint = count;
573
574		if (count != radix)  {
575			for (i = 1; i <= skip; i += next_skip) {
576				if (scan[i].bm_bighint == (daddr_t)-1)
577					break;
578				scan[i].bm_bighint = 0;
579				if (next_skip == 1) {
580					scan[i].u.bmu_bitmap = 0;
581				} else {
582					scan[i].u.bmu_avail = 0;
583				}
584			}
585			/* fall through */
586		}
587	} else {
588		scan->u.bmu_avail += count;
589		/* scan->bm_bighint = radix; */
590	}
591
592	/*
593	 * ALL-FREE special case.
594	 */
595
596	if (scan->u.bmu_avail == radix)
597		return;
598	if (scan->u.bmu_avail > radix)
599		panic("blst_meta_free: freeing already free blocks (%lld) %lld/%lld",
600		    (long long)count, (long long)scan->u.bmu_avail,
601		    (long long)radix);
602
603	/*
604	 * Break the free down into its components
605	 */
606
607	radix /= BLIST_META_RADIX;
608
609	child = (freeBlk - blk) / radix;
610	blk += child * radix;
611	i = 1 + child * next_skip;
612	while (i <= skip && blk < freeBlk + count) {
613		v = blk + radix - freeBlk;
614		if (v > count)
615			v = count;
616		blst_meta_free(&scan[i], freeBlk, v, radix, next_skip - 1, blk);
617		if (scan->bm_bighint < scan[i].bm_bighint)
618			scan->bm_bighint = scan[i].bm_bighint;
619		count -= v;
620		freeBlk += v;
621		blk += radix;
622		i += next_skip;
623	}
624}
625
626/*
627 * BLIST_RADIX_COPY() - copy one radix tree to another
628 *
629 *	Locates free space in the source tree and frees it in the destination
630 *	tree.  The space may not already be free in the destination.
631 */
632static void
633blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, daddr_t skip,
634    blist_t dest, daddr_t count)
635{
636	daddr_t i, next_skip;
637
638	/*
639	 * Leaf node
640	 */
641
642	if (radix == BLIST_BMAP_RADIX) {
643		u_daddr_t v = scan->u.bmu_bitmap;
644
645		if (v == (u_daddr_t)-1) {
646			blist_free(dest, blk, count);
647		} else if (v != 0) {
648			int i;
649
650			for (i = 0; i < BLIST_BMAP_RADIX && i < count; ++i) {
651				if (v & ((u_daddr_t)1 << i))
652					blist_free(dest, blk + i, 1);
653			}
654		}
655		return;
656	}
657
658	/*
659	 * Meta node
660	 */
661
662	if (scan->u.bmu_avail == 0) {
663		/*
664		 * Source all allocated, leave dest allocated
665		 */
666		return;
667	}
668	if (scan->u.bmu_avail == radix) {
669		/*
670		 * Source all free, free entire dest
671		 */
672		if (count < radix)
673			blist_free(dest, blk, count);
674		else
675			blist_free(dest, blk, radix);
676		return;
677	}
678
679
680	radix /= BLIST_META_RADIX;
681	next_skip = skip / BLIST_META_RADIX;
682
683	for (i = 1; count && i <= skip; i += next_skip) {
684		if (scan[i].bm_bighint == (daddr_t)-1)
685			break;
686
687		if (count >= radix) {
688			blst_copy(&scan[i], blk, radix, next_skip - 1, dest,
689			    radix);
690			count -= radix;
691		} else {
692			if (count) {
693				blst_copy(&scan[i], blk, radix, next_skip - 1,
694				    dest, count);
695			}
696			count = 0;
697		}
698		blk += radix;
699	}
700}
701
702/*
703 * BLST_LEAF_FILL() -	allocate specific blocks in leaf bitmap
704 *
705 *	This routine allocates all blocks in the specified range
706 *	regardless of any existing allocations in that range.  Returns
707 *	the number of blocks allocated by the call.
708 */
709static daddr_t
710blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
711{
712	int n = blk & (BLIST_BMAP_RADIX - 1);
713	daddr_t nblks;
714	u_daddr_t mask;
715
716	mask = ((u_daddr_t)-1 << n) &
717	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));
718
719	/* Count the number of blocks that we are allocating. */
720	nblks = bitcount64(scan->u.bmu_bitmap & mask);
721
722	scan->u.bmu_bitmap &= ~mask;
723	return (nblks);
724}
725
726/*
727 * BLIST_META_FILL() -	allocate specific blocks at a meta node
728 *
729 *	This routine allocates the specified range of blocks,
730 *	regardless of any existing allocations in the range.  The
731 *	range must be within the extent of this node.  Returns the
732 *	number of blocks allocated by the call.
733 */
734static daddr_t
735blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, daddr_t radix,
736    daddr_t skip, daddr_t blk)
737{
738	daddr_t i, nblks, next_skip, v;
739	int child;
740
741	if (scan->bm_bighint == (daddr_t)-1)
742		panic("filling invalid range");
743	if (count > radix) {
744		/*
745		 * The allocation exceeds the number of blocks that are
746		 * managed by this node.
747		 */
748		panic("fill too large");
749	}
750	if (radix == BLIST_BMAP_RADIX)
751		return (blst_leaf_fill(scan, allocBlk, count));
752	if (count == radix || scan->u.bmu_avail == 0)  {
753		/*
754		 * ALL-ALLOCATED special case
755		 */
756		nblks = scan->u.bmu_avail;
757		scan->u.bmu_avail = 0;
758		scan->bm_bighint = 0;
759		return (nblks);
760	}
761	next_skip = skip / BLIST_META_RADIX;
762
763	/*
764	 * An ALL-FREE meta node requires special handling before allocating
765	 * any of its blocks.
766	 */
767	if (scan->u.bmu_avail == radix) {
768		radix /= BLIST_META_RADIX;
769
770		/*
771		 * Reinitialize each of the meta node's children.  An ALL-FREE
772		 * meta node cannot have a terminator in any subtree.
773		 */
774		for (i = 1; i <= skip; i += next_skip) {
775			if (next_skip == 1)
776				scan[i].u.bmu_bitmap = (u_daddr_t)-1;
777			else
778				scan[i].u.bmu_avail = radix;
779			scan[i].bm_bighint = radix;
780		}
781	} else {
782		radix /= BLIST_META_RADIX;
783	}
784
785	nblks = 0;
786	child = (allocBlk - blk) / radix;
787	blk += child * radix;
788	i = 1 + child * next_skip;
789	while (i <= skip && blk < allocBlk + count) {
790		v = blk + radix - allocBlk;
791		if (v > count)
792			v = count;
793		nblks += blst_meta_fill(&scan[i], allocBlk, v, radix,
794		    next_skip - 1, blk);
795		count -= v;
796		allocBlk += v;
797		blk += radix;
798		i += next_skip;
799	}
800	scan->u.bmu_avail -= nblks;
801	return (nblks);
802}
803
804/*
805 * BLST_RADIX_INIT() - initialize radix tree
806 *
807 *	Initialize our meta structures and bitmaps and calculate the exact
808 *	amount of space required to manage 'count' blocks - this space may
809 *	be considerably less than the calculated radix due to the large
810 *	RADIX values we use.
811 */
812static daddr_t
813blst_radix_init(blmeta_t *scan, daddr_t radix, daddr_t skip, daddr_t count)
814{
815	daddr_t i, memindex, next_skip;
816
817	memindex = 0;
818
819	/*
820	 * Leaf node
821	 */
822
823	if (radix == BLIST_BMAP_RADIX) {
824		if (scan) {
825			scan->bm_bighint = 0;
826			scan->u.bmu_bitmap = 0;
827		}
828		return (memindex);
829	}
830
831	/*
832	 * Meta node.  If allocating the entire object we can special
833	 * case it.  However, we need to figure out how much memory
834	 * is required to manage 'count' blocks, so we continue on anyway.
835	 */
836
837	if (scan) {
838		scan->bm_bighint = 0;
839		scan->u.bmu_avail = 0;
840	}
841
842	radix /= BLIST_META_RADIX;
843	next_skip = skip / BLIST_META_RADIX;
844
845	for (i = 1; i <= skip; i += next_skip) {
846		if (count >= radix) {
847			/*
848			 * Allocate the entire object
849			 */
850			memindex = i +
851			    blst_radix_init(((scan) ? &scan[i] : NULL), radix,
852			    next_skip - 1, radix);
853			count -= radix;
854		} else if (count > 0) {
855			/*
856			 * Allocate a partial object
857			 */
858			memindex = i +
859			    blst_radix_init(((scan) ? &scan[i] : NULL), radix,
860			    next_skip - 1, count);
861			count = 0;
862		} else {
863			/*
864			 * Add terminator and break out
865			 */
866			if (scan)
867				scan[i].bm_bighint = (daddr_t)-1;
868			break;
869		}
870	}
871	if (memindex < i)
872		memindex = i;
873	return (memindex);
874}
875
876#ifdef BLIST_DEBUG
877
878static void
879blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, daddr_t skip,
880    int tab)
881{
882	daddr_t i, next_skip;
883
884	if (radix == BLIST_BMAP_RADIX) {
885		printf(
886		    "%*.*s(%08llx,%lld): bitmap %016llx big=%lld\n",
887		    tab, tab, "",
888		    (long long)blk, (long long)radix,
889		    (long long)scan->u.bmu_bitmap,
890		    (long long)scan->bm_bighint
891		);
892		return;
893	}
894
895	if (scan->u.bmu_avail == 0) {
896		printf(
897		    "%*.*s(%08llx,%lld) ALL ALLOCATED\n",
898		    tab, tab, "",
899		    (long long)blk,
900		    (long long)radix
901		);
902		return;
903	}
904	if (scan->u.bmu_avail == radix) {
905		printf(
906		    "%*.*s(%08llx,%lld) ALL FREE\n",
907		    tab, tab, "",
908		    (long long)blk,
909		    (long long)radix
910		);
911		return;
912	}
913
914	printf(
915	    "%*.*s(%08llx,%lld): subtree (%lld/%lld) big=%lld {\n",
916	    tab, tab, "",
917	    (long long)blk, (long long)radix,
918	    (long long)scan->u.bmu_avail,
919	    (long long)radix,
920	    (long long)scan->bm_bighint
921	);
922
923	radix /= BLIST_META_RADIX;
924	next_skip = skip / BLIST_META_RADIX;
925	tab += 4;
926
927	for (i = 1; i <= skip; i += next_skip) {
928		if (scan[i].bm_bighint == (daddr_t)-1) {
929			printf(
930			    "%*.*s(%08llx,%lld): Terminator\n",
931			    tab, tab, "",
932			    (long long)blk, (long long)radix
933			);
934			break;
935		}
936		blst_radix_print(&scan[i], blk, radix, next_skip - 1, tab);
937		blk += radix;
938	}
939	tab -= 4;
940
941	printf(
942	    "%*.*s}\n",
943	    tab, tab, ""
944	);
945}
946
947#endif
948
949#ifdef BLIST_DEBUG
950
951int
952main(int ac, char **av)
953{
954	int size = 1024;
955	int i;
956	blist_t bl;
957
958	for (i = 1; i < ac; ++i) {
959		const char *ptr = av[i];
960		if (*ptr != '-') {
961			size = strtol(ptr, NULL, 0);
962			continue;
963		}
964		ptr += 2;
965		fprintf(stderr, "Bad option: %s\n", ptr - 2);
966		exit(1);
967	}
968	bl = blist_create(size, M_WAITOK);
969	blist_free(bl, 0, size);
970
971	for (;;) {
972		char buf[1024];
973		long long da = 0;
974		long long count = 0;
975
976		printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
977		    (long long)size, (long long)bl->bl_radix);
978		fflush(stdout);
979		if (fgets(buf, sizeof(buf), stdin) == NULL)
980			break;
981		switch(buf[0]) {
982		case 'r':
983			if (sscanf(buf + 1, "%lld", &count) == 1) {
984				blist_resize(&bl, count, 1, M_WAITOK);
985			} else {
986				printf("?\n");
987			}
988		case 'p':
989			blist_print(bl);
990			break;
991		case 'a':
992			if (sscanf(buf + 1, "%lld", &count) == 1) {
993				daddr_t blk = blist_alloc(bl, count);
994				printf("    R=%08llx\n", (long long)blk);
995			} else {
996				printf("?\n");
997			}
998			break;
999		case 'f':
1000			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
1001				blist_free(bl, da, count);
1002			} else {
1003				printf("?\n");
1004			}
1005			break;
1006		case 'l':
1007			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
1008				printf("    n=%jd\n",
1009				    (intmax_t)blist_fill(bl, da, count));
1010			} else {
1011				printf("?\n");
1012			}
1013			break;
1014		case '?':
1015		case 'h':
1016			puts(
1017			    "p          -print\n"
1018			    "a %d       -allocate\n"
1019			    "f %x %d    -free\n"
1020			    "l %x %d    -fill\n"
1021			    "r %d       -resize\n"
1022			    "h/?        -help"
1023			);
1024			break;
1025		default:
1026			printf("?\n");
1027			break;
1028		}
1029	}
1030	return(0);
1031}
1032
1033void
1034panic(const char *ctl, ...)
1035{
1036	va_list va;
1037
1038	va_start(va, ctl);
1039	vfprintf(stderr, ctl, va);
1040	fprintf(stderr, "\n");
1041	va_end(va);
1042	exit(1);
1043}
1044
1045#endif
1046