/*-
 * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * BLIST.C -	Bitmap allocator/deallocator, using a radix tree with hinting
 *
 *	This module implements a general bitmap allocator/deallocator.  The
 *	allocator eats around 2 bits per 'block'.  The module does not
 *	try to interpret the meaning of a 'block' other than to return
 *	SWAPBLK_NONE on an allocation failure.
 *
 *	A radix tree controls access to pieces of the bitmap, and includes
 *	auxiliary information at each interior node about the availability of
 *	contiguous free blocks in the subtree rooted at that node.  Two radix
 *	constants are involved: one for the size of the bitmaps contained in the
 *	leaf nodes (BLIST_BMAP_RADIX), and one for the number of descendants of
 *	each of the meta (interior) nodes (BLIST_META_RADIX).  Each subtree is
 *	associated with a range of blocks.  The root of any subtree stores a
 *	hint field that defines an upper bound on the size of the largest
 *	allocation that can begin in the associated block range.  A hint is an
 *	upper bound on a potential allocation, but not necessarily a tight upper
 *	bound.
 *
 *	The radix tree also implements two collapsed states for meta nodes:
 *	the ALL-ALLOCATED state and the ALL-FREE state.  If a meta node is
 *	in either of these two states, all information contained underneath
 *	the node is considered stale.  These states are used to optimize
 *	allocation and freeing operations.
 *
 * 	The hinting greatly increases code efficiency for allocations while
 *	the general radix structure optimizes both allocations and frees.  The
 *	radix tree should be able to operate well no matter how much
 *	fragmentation there is and no matter how large a bitmap is used.
 *
 *	The blist code wires all necessary memory at creation time.  Neither
 *	allocations nor frees require interaction with the memory subsystem.
 *	The non-blocking features of the blist code are used in the swap code
 *	(vm/swap_pager.c).
 *
 *	LAYOUT: The radix tree is laid out recursively using a
 *	linear array.  Each meta node is immediately followed (laid out
 *	sequentially in memory) by BLIST_META_RADIX lower-level nodes.  This
 *	is a recursive structure but one that can be easily scanned through
 *	a very simple 'skip' calculation.  In order to support large radixes,
 *	portions of the tree may reside outside our memory allocation.  We
 *	handle this with an early-termination optimization (when bighint is
 *	set to -1) on the scan.  The memory allocation is only large enough
 *	to cover the number of blocks requested at creation time even if it
 *	must be encompassed by a larger root-node radix.
 *
 *	NOTE: the allocator cannot currently allocate more than
 *	BLIST_BMAP_RADIX blocks per call.  It will panic with 'allocation too
 *	large' if you try.  This is an area that could use improvement.  The
 *	radix is large enough that this restriction does not affect the swap
 *	system, though.  Currently only the allocation code is affected by
 *	this algorithmic unfeature.  The freeing code can handle arbitrary
 *	ranges.
 *
 *	This code can be compiled stand-alone for debugging.
 */
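/*
 * As a concrete sketch of the LAYOUT note above (assuming the typical
 * constants BLIST_BMAP_RADIX == 64 and BLIST_META_RADIX == 16), a tree
 * covering 1024 blocks occupies 17 array slots: a meta node at index 0
 * followed by its 16 leaves at indices 1..16, each leaf holding a 64-bit
 * bitmap for 64 blocks:
 *
 *	index:    0      1      2       3     ...    16
 *	node:   meta   leaf   leaf    leaf    ...   leaf
 *	blocks: 0-1023 0-63  64-127  128-191  ...  960-1023
 *
 * With one more level (radix 16384), each child of the root is itself a
 * 17-slot subtree, so sibling meta nodes sit 17 slots apart; that stride
 * is the 'skip' value computed by radix_to_skip() below.
 */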

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: stable/11/sys/kern/subr_blist.c 324384 2017-10-07 16:55:44Z alc $");

#ifdef _KERNEL

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/blist.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <sys/proc.h>
#include <sys/mutex.h>

#else

#ifndef BLIST_NO_DEBUG
#define BLIST_DEBUG
#endif

#include <sys/types.h>
#include <sys/malloc.h>
#include <sys/sbuf.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <stdarg.h>
#include <stdbool.h>

#define	bitcount64(x)	__bitcount64((uint64_t)(x))
#define malloc(a,b,c)	calloc(a, 1)
#define free(a,b)	free(a)
static __inline int imax(int a, int b) { return (a > b ? a : b); }

#include <sys/blist.h>

void panic(const char *ctl, ...);

#endif

/*
 * static support functions
 */
static daddr_t	blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count);
static daddr_t	blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count,
		    u_daddr_t radix);
static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
		    u_daddr_t radix);
static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
		    blist_t dest, daddr_t count);
static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
		    u_daddr_t radix);
static daddr_t	blst_radix_init(blmeta_t *scan, daddr_t radix, daddr_t count);
#ifndef _KERNEL
static void	blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
		    int tab);
#endif

#ifdef _KERNEL
static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
#endif

_Static_assert(BLIST_BMAP_RADIX % BLIST_META_RADIX == 0,
    "radix divisibility error");
#define	BLIST_BMAP_MASK	(BLIST_BMAP_RADIX - 1)
#define	BLIST_META_MASK	(BLIST_META_RADIX - 1)

/*
 * For a subtree that can represent the state of up to 'radix' blocks, the
 * number of leaf nodes of the subtree is L=radix/BLIST_BMAP_RADIX.  If 'm'
 * is short for BLIST_META_RADIX, then for a tree of height h with L=m**h
 * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h,
 * or, equivalently, (m**(h+1)-1)/(m-1).  This quantity is called 'skip'
 * in the 'meta' functions that process subtrees.  Since integer division
 * discards remainders, we can express this computation as
 * skip = (m * m**h) / (m - 1)
 * skip = (m * (radix / BLIST_BMAP_RADIX)) / (m - 1)
 * and since m divides BLIST_BMAP_RADIX, we can simplify further to
 * skip = (radix / (BLIST_BMAP_RADIX / m)) / (m - 1)
 * skip = radix / ((BLIST_BMAP_RADIX / m) * (m - 1))
 * so that simple integer division by a constant can safely be used for the
 * calculation.
 */
static inline daddr_t
radix_to_skip(daddr_t radix)
{

	return (radix /
	    ((BLIST_BMAP_RADIX / BLIST_META_RADIX) * BLIST_META_MASK));
}
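/*
 * Plugging in the typical constants (BLIST_BMAP_RADIX == 64,
 * BLIST_META_RADIX == 16, so m == 16) gives a quick sanity check of the
 * derivation above:
 *
 *	radix_to_skip(1024)  == 1024 / ((64 / 16) * 15) == 1024 / 60  == 17
 *	radix_to_skip(16384) == 16384 / 60              == 273
 *
 * which match the expected node counts 1 + 16 == 17 (height 1) and
 * 1 + 16 + 256 == 273 (height 2); the fractional remainder discarded by
 * the integer division is always less than 1.
 */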

/*
 * Use binary search, or a faster method, to find the 1 bit in a u_daddr_t.
 * Assumes that the argument has only one bit set.
 */
static inline int
bitpos(u_daddr_t mask)
{
	int hi, lo, mid;

	switch (sizeof(mask)) {
#ifdef HAVE_INLINE_FFSLL
	case sizeof(long long):
		return (ffsll(mask) - 1);
#endif
	default:
		lo = 0;
		hi = BLIST_BMAP_RADIX;
		while (lo + 1 < hi) {
			mid = (lo + hi) >> 1;
			if ((mask >> mid) != 0)
				lo = mid;
			else
				hi = mid;
		}
		return (lo);
	}
}
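/*
 * For example, the binary-search fallback applied to mask == 1 << 6 (with
 * BLIST_BMAP_RADIX == 64, an assumption used throughout these examples)
 * narrows [lo, hi) as follows:
 *
 *	[0, 64) -> mid 32, mask >> 32 == 0 -> [0, 32)
 *	[0, 32) -> mid 16, mask >> 16 == 0 -> [0, 16)
 *	[0, 16) -> mid  8, mask >>  8 == 0 -> [0,  8)
 *	[0,  8) -> mid  4, mask >>  4 != 0 -> [4,  8)
 *	[4,  8) -> mid  6, mask >>  6 != 0 -> [6,  8)
 *	[6,  8) -> mid  7, mask >>  7 == 0 -> [6,  7)
 *
 * leaving lo == 6, the position of the set bit.
 */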

/*
 * blist_create() - create a blist capable of handling up to the specified
 *		    number of blocks
 *
 *	blocks - must be greater than 0
 * 	flags  - malloc flags
 *
 *	The smallest blist consists of a single leaf node capable of
 *	managing BLIST_BMAP_RADIX blocks.
 */
blist_t
blist_create(daddr_t blocks, int flags)
{
	blist_t bl;
	daddr_t nodes, radix;

	/*
	 * Calculate the radix field used for scanning.
	 */
	radix = BLIST_BMAP_RADIX;
	while (radix < blocks) {
		radix *= BLIST_META_RADIX;
	}
	nodes = 1 + blst_radix_init(NULL, radix, blocks);

	bl = malloc(sizeof(struct blist), M_SWAP, flags);
	if (bl == NULL)
		return (NULL);

	bl->bl_blocks = blocks;
	bl->bl_radix = radix;
	bl->bl_cursor = 0;
	bl->bl_root = malloc(nodes * sizeof(blmeta_t), M_SWAP, flags);
	if (bl->bl_root == NULL) {
		free(bl, M_SWAP);
		return (NULL);
	}
	blst_radix_init(bl->bl_root, radix, blocks);

#if defined(BLIST_DEBUG)
	printf(
		"BLIST representing %lld blocks (%lld MB of swap)"
		", requiring %lldK of ram\n",
		(long long)bl->bl_blocks,
		(long long)bl->bl_blocks * 4 / 1024,
		(long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
	);
	printf("BLIST raw radix tree contains %lld records\n",
	    (long long)nodes);
#endif

	return (bl);
}
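/*
 * For instance (again assuming BLIST_BMAP_RADIX == 64 and
 * BLIST_META_RADIX == 16), blist_create(2048, M_WAITOK) grows the radix
 * 64 -> 1024 -> 16384 before it covers 2048 blocks.  The tree is sized for
 * a 16384-block radix, but blst_radix_init() only lays out nodes for the
 * first 2048 blocks and plants a terminator after them, so the allocation
 * stays proportional to the requested size rather than the rounded-up
 * radix.
 */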

void
blist_destroy(blist_t bl)
{
	free(bl->bl_root, M_SWAP);
	free(bl, M_SWAP);
}

/*
 * blist_alloc() -   reserve space in the block bitmap.  Return the base
 *		     of a contiguous region or SWAPBLK_NONE if space could
 *		     not be allocated.
 */
daddr_t
blist_alloc(blist_t bl, daddr_t count)
{
	daddr_t blk;

	/*
	 * This loop iterates at most twice.  An allocation failure in the
	 * first iteration leads to a second iteration only if the cursor was
	 * non-zero.  When the cursor is zero, an allocation failure will
	 * reduce the hint, stopping further iterations.
	 */
	while (count <= bl->bl_root->bm_bighint) {
		blk = blst_meta_alloc(bl->bl_root, bl->bl_cursor, count,
		    bl->bl_radix);
		if (blk != SWAPBLK_NONE) {
			bl->bl_cursor = blk + count;
			if (bl->bl_cursor == bl->bl_blocks)
				bl->bl_cursor = 0;
			return (blk);
		} else if (bl->bl_cursor != 0)
			bl->bl_cursor = 0;
	}
	return (SWAPBLK_NONE);
}
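/*
 * Example of the two-pass behavior: with 1024 blocks, a cursor at 990, and
 * a request for 64 blocks, the first pass only considers allocations
 * beginning at or after block 990 and fails, since only 34 blocks remain
 * there.  The cursor is then reset to 0 and the second pass scans the
 * whole tree.  Only if that pass also fails is the root hint lowered,
 * which terminates the loop.
 */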

/*
 * blist_avail() -	return the number of free blocks.
 */
daddr_t
blist_avail(blist_t bl)
{

	if (bl->bl_radix == BLIST_BMAP_RADIX)
		return (bitcount64(bl->bl_root->u.bmu_bitmap));
	else
		return (bl->bl_root->u.bmu_avail);
}

/*
 * blist_free() -	free up space in the block bitmap.  Panic if an
 *			inconsistency is found.
 */
void
blist_free(blist_t bl, daddr_t blkno, daddr_t count)
{

	blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix);
}

/*
 * blist_fill() -	mark a region in the block bitmap as off-limits
 *			to the allocator (i.e. allocate it), ignoring any
 *			existing allocations.  Return the number of blocks
 *			actually filled that were free before the call.
 */
daddr_t
blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
{

	return (blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix));
}

/*
 * blist_resize() -	resize an existing radix tree to handle the
 *			specified number of blocks.  This will reallocate
 *			the tree and transfer the previous bitmap to the new
 *			one.  When extending the tree you can specify whether
 *			the new blocks are to be left allocated or freed.
 */
void
blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags)
{
	blist_t newbl = blist_create(count, flags);
	blist_t save = *pbl;

	*pbl = newbl;
	if (count > save->bl_blocks)
		count = save->bl_blocks;
	blst_copy(save->bl_root, 0, save->bl_radix, newbl, count);

	/*
	 * If resizing upwards, should we free the new space or not?
	 */
	if (freenew && count < newbl->bl_blocks) {
		blist_free(newbl, count, newbl->bl_blocks - count);
	}
	blist_destroy(save);
}

#ifdef BLIST_DEBUG

/*
 * blist_print()    - dump radix tree
 */
void
blist_print(blist_t bl)
{
	printf("BLIST cursor = %08jx {\n", (uintmax_t)bl->bl_cursor);
	blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4);
	printf("}\n");
}

#endif

static const u_daddr_t fib[] = {
	1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584,
	4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811,
	514229, 832040, 1346269, 2178309, 3524578,
};

/*
 * Use 'gap' to describe a maximal range of unallocated blocks/bits.
 */
struct gap_stats {
	daddr_t	start;		/* current gap start, or SWAPBLK_NONE */
	daddr_t	num;		/* number of gaps observed */
	daddr_t	max;		/* largest gap size */
	daddr_t	avg;		/* average gap size */
	daddr_t	err;		/* sum - num * avg */
	daddr_t	histo[nitems(fib)]; /* # gaps in each size range */
	int	max_bucket;	/* last histo elt with nonzero val */
};

/*
 * gap_stats_counting()    - is the state 'counting 1 bits', as opposed to
 *                           'skipping 0 bits'?
 */
static inline bool
gap_stats_counting(const struct gap_stats *stats)
{

	return (stats->start != SWAPBLK_NONE);
}

/*
 * init_gap_stats()    - initialize stats on gap sizes
 */
static inline void
init_gap_stats(struct gap_stats *stats)
{

	bzero(stats, sizeof(*stats));
	stats->start = SWAPBLK_NONE;
}

/*
 * update_gap_stats()    - update stats on gap sizes
 */
static void
update_gap_stats(struct gap_stats *stats, daddr_t posn)
{
	daddr_t size;
	int hi, lo, mid;

	if (!gap_stats_counting(stats)) {
		stats->start = posn;
		return;
	}
	size = posn - stats->start;
	stats->start = SWAPBLK_NONE;
	if (size > stats->max)
		stats->max = size;

	/*
	 * Find the Fibonacci range that contains size,
	 * expecting to find it in an early range.
	 */
	lo = 0;
	hi = 1;
	while (hi < nitems(fib) && fib[hi] <= size) {
		lo = hi;
		hi *= 2;
	}
	if (hi >= nitems(fib))
		hi = nitems(fib);
	while (lo + 1 != hi) {
		mid = (lo + hi) >> 1;
		if (fib[mid] <= size)
			lo = mid;
		else
			hi = mid;
	}
	stats->histo[lo]++;
	if (lo > stats->max_bucket)
		stats->max_bucket = lo;
	stats->err += size - stats->avg;
	stats->num++;
	stats->avg += stats->err / stats->num;
	stats->err %= stats->num;
}
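/*
 * The last four statements maintain a running integer mean without a
 * separate sum: 'err' accumulates the exact difference between the true
 * sum and num * avg.  For gap sizes 3, 4, 8 in that order:
 *
 *	size 3: err = 3, num = 1, avg += 3/1 -> avg = 3, err %= 1 -> 0
 *	size 4: err = 1, num = 2, avg += 1/2 -> avg = 3, err %= 2 -> 1
 *	size 8: err = 6, num = 3, avg += 6/3 -> avg = 5, err %= 3 -> 0
 *
 * and indeed (3 + 4 + 8) / 3 == 5.
 */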

/*
 * dump_gap_stats()    - print stats on gap sizes
 */
static inline void
dump_gap_stats(const struct gap_stats *stats, struct sbuf *s)
{
	int i;

	sbuf_printf(s, "number of maximal free ranges: %jd\n",
	    (intmax_t)stats->num);
	sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max);
	sbuf_printf(s, "average maximal free range size: %jd\n",
	    (intmax_t)stats->avg);
	sbuf_printf(s, "number of maximal free ranges of different sizes:\n");
	sbuf_printf(s, "               count  |  size range\n");
	sbuf_printf(s, "               -----  |  ----------\n");
	for (i = 0; i < stats->max_bucket; i++) {
		if (stats->histo[i] != 0) {
			sbuf_printf(s, "%20jd  |  ",
			    (intmax_t)stats->histo[i]);
			if (fib[i] != fib[i + 1] - 1)
				sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
				    (intmax_t)fib[i + 1] - 1);
			else
				sbuf_printf(s, "%jd\n", (intmax_t)fib[i]);
		}
	}
	sbuf_printf(s, "%20jd  |  ", (intmax_t)stats->histo[i]);
	if (stats->histo[i] > 1)
		sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i],
		    (intmax_t)stats->max);
	else
		sbuf_printf(s, "%jd\n", (intmax_t)stats->max);
}

/*
 * blist_stats()    - dump radix tree stats
 */
void
blist_stats(blist_t bl, struct sbuf *s)
{
	struct gap_stats gstats;
	struct gap_stats *stats = &gstats;
	daddr_t i, nodes, radix;
	u_daddr_t bit, diff, mask;

	init_gap_stats(stats);
	nodes = 0;
	i = bl->bl_radix;
	while (i < bl->bl_radix + bl->bl_blocks) {
		/*
		 * Find max size subtree starting at i.
		 */
		radix = BLIST_BMAP_RADIX;
		while (((i / radix) & BLIST_META_MASK) == 0)
			radix *= BLIST_META_RADIX;

		/*
		 * Check for skippable subtrees starting at i.
		 */
		while (radix > BLIST_BMAP_RADIX) {
			if (bl->bl_root[nodes].u.bmu_avail == 0) {
				if (gap_stats_counting(stats))
					update_gap_stats(stats, i);
				break;
			}
			if (bl->bl_root[nodes].u.bmu_avail == radix) {
				if (!gap_stats_counting(stats))
					update_gap_stats(stats, i);
				break;
			}

			/*
			 * Skip subtree root.
			 */
			nodes++;
			radix /= BLIST_META_RADIX;
		}
		if (radix == BLIST_BMAP_RADIX) {
			/*
			 * Scan leaf.
			 */
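			/*
			 * diff = mask ^ (mask << 1) marks every bit position
			 * whose value differs from that of the position below
			 * it, i.e., every boundary between a free run and an
			 * allocated run.  Within an 8-bit window, for example,
			 * mask 00111100 gives diff 01000100: bit 2 starts the
			 * free run and bit 6 ends it.  XORing diff with 1
			 * corrects the boundary at bit 0 when a free run is
			 * already in progress from the previous leaf, since
			 * bit 0 has no lower neighbor inside this leaf.
			 */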
			mask = bl->bl_root[nodes].u.bmu_bitmap;
			diff = mask ^ (mask << 1);
			if (gap_stats_counting(stats))
				diff ^= 1;
			while (diff != 0) {
				bit = diff & -diff;
				update_gap_stats(stats, i + bitpos(bit));
				diff ^= bit;
			}
		}
		nodes += radix_to_skip(radix);
		i += radix;
	}
	update_gap_stats(stats, i);
	dump_gap_stats(stats, s);
}

/************************************************************************
 *			  ALLOCATION SUPPORT FUNCTIONS			*
 ************************************************************************
 *
 *	These support functions do all the actual work.  They may seem
 *	rather longish, but that's because I've commented them up.  The
 *	actual code is straightforward.
 *
 */

/*
 * blst_leaf_alloc() -	allocate at a leaf in the radix tree (a bitmap).
 *
 *	This is the core of the allocator and is optimized for the
 *	BLIST_BMAP_RADIX block allocation case.  Otherwise, execution
 *	time is proportional to log2(count) + bitpos time.
 */
static daddr_t
blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count)
{
	u_daddr_t mask;
	int count1, hi, lo, num_shifts, range1, range_ext;

	range1 = 0;
	count1 = count - 1;
	num_shifts = fls(count1);
	mask = scan->u.bmu_bitmap;
	while ((-mask & ~mask) != 0 && num_shifts > 0) {
		/*
		 * If bit i is set in mask, then bits in [i, i+range1] are set
		 * in scan->u.bmu_bitmap.  The value of range1 is equal to
		 * count1 >> num_shifts.  Grow range1 and reduce num_shifts to
		 * 0, while preserving these invariants.  The updates to mask
		 * leave fewer bits set, but each bit that remains set
		 * represents a longer string of consecutive bits set in
		 * scan->u.bmu_bitmap.  If more updates to mask cannot clear
		 * more bits, because mask is partitioned with all 0 bits
		 * preceding all 1 bits, the loop terminates immediately.
		 */
		num_shifts--;
		range_ext = range1 + ((count1 >> num_shifts) & 1);
		/*
		 * mask is a signed quantity for the shift because when it is
		 * shifted right, the sign bit should be copied; when the last
		 * block of the leaf is free, pretend, for a while, that all the
		 * blocks that follow it are also free.
		 */
		mask &= (daddr_t)mask >> range_ext;
		range1 += range_ext;
	}
	if (mask == 0) {
		/*
		 * Update bighint.  There is no allocation bigger than range1
		 * starting in this leaf.
		 */
		scan->bm_bighint = range1;
		return (SWAPBLK_NONE);
	}

	/* Discard any candidates that appear before blk. */
	mask &= (u_daddr_t)-1 << (blk & BLIST_BMAP_MASK);
	if (mask == 0)
		return (SWAPBLK_NONE);

	/*
	 * The least significant set bit in mask marks the start of the first
	 * available range of sufficient size.  Clear all the bits but that one,
	 * and then find its position.
	 */
	mask &= -mask;
	lo = bitpos(mask);

	hi = lo + count;
	if (hi > BLIST_BMAP_RADIX) {
		/*
		 * An allocation within this leaf is impossible, so a
		 * successful allocation depends on the next leaf providing
		 * some of the blocks.
		 */
		if (((blk / BLIST_BMAP_RADIX + 1) & BLIST_META_MASK) == 0) {
			/*
			 * The next leaf has a different meta-node parent, so it
			 * is not necessarily initialized.  Update bighint,
			 * comparing the range found at the end of mask to the
			 * largest earlier range that could have been made to
			 * vanish in the initial processing of mask.
			 */
			scan->bm_bighint = imax(BLIST_BMAP_RADIX - lo, range1);
			return (SWAPBLK_NONE);
		}
		hi -= BLIST_BMAP_RADIX;
		if (((scan[1].u.bmu_bitmap + 1) & ~((u_daddr_t)-1 << hi)) != 0) {
			/*
			 * The next leaf doesn't have enough free blocks at the
			 * beginning to complete the spanning allocation.  The
			 * hint cannot be updated, because the same allocation
			 * request could be satisfied later, by this leaf, if
			 * the state of the next leaf changes, and without any
			 * changes to this leaf.
			 */
			return (SWAPBLK_NONE);
		}
		/* Clear the first 'hi' bits in the next leaf, allocating them. */
		scan[1].u.bmu_bitmap &= (u_daddr_t)-1 << hi;
		hi = BLIST_BMAP_RADIX;
	}

	/* Set the bits of mask at position 'lo' and higher. */
	mask = -mask;
	if (hi == BLIST_BMAP_RADIX) {
		/*
		 * Update bighint.  There is no allocation bigger than range1
		 * available in this leaf after this allocation completes.
		 */
		scan->bm_bighint = range1;
	} else {
		/* Clear the bits of mask at position 'hi' and higher. */
		mask &= (u_daddr_t)-1 >> (BLIST_BMAP_RADIX - hi);
		/* If this allocation uses all the bits, clear the hint. */
		if (mask == scan->u.bmu_bitmap)
			scan->bm_bighint = 0;
	}
	/* Clear the allocated bits from this leaf. */
	scan->u.bmu_bitmap &= ~mask;
	return ((blk & ~BLIST_BMAP_MASK) + lo);
}
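/*
 * A short trace of the folding loop above, for count == 3 within an 8-bit
 * window: count1 == 2, so num_shifts == fls(2) == 2.  Starting from
 * mask == 01110110 (free blocks at positions 1-2 and 4-6):
 *
 *	pass 1: range_ext = 1, mask &= mask >> 1  ->  00110010
 *	        (each set bit now marks a free run of length >= 2)
 *	pass 2: range_ext = 1, mask &= mask >> 1  ->  00010000
 *	        (bit 4 marks the only free run of length >= 3)
 *
 * Only fls(count - 1) folding steps are ever needed, which is what makes
 * the execution time proportional to log2(count) as noted above.
 */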

/*
 * blst_meta_alloc() -	allocate at a meta node in the radix tree.
 *
 *	Attempt to allocate at a meta node.  If we can't, we update
 *	bighint and return a failure.  Updating bighint optimizes future
 *	calls that hit this node.  We have to check for our collapse cases
 *	and we have a few optimizations strewn in as well.
 */
static daddr_t
blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, i, next_skip, r, skip;
	int child;
	bool scan_from_start;

	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_alloc(scan, cursor, count));
	if (scan->u.bmu_avail < count) {
		/*
		 * The meta node's hint must be too large if the allocation
		 * exceeds the number of free blocks.  Reduce the hint, and
		 * return failure.
		 */
		scan->bm_bighint = scan->u.bmu_avail;
		return (SWAPBLK_NONE);
	}
	blk = cursor & -radix;
	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;

	/*
	 * An ALL-FREE meta node requires special handling before allocating
	 * any of its blocks.
	 */
	if (scan->u.bmu_avail == radix) {
		radix /= BLIST_META_RADIX;

		/*
		 * Reinitialize each of the meta node's children.  An ALL-FREE
		 * meta node cannot have a terminator in any subtree.
		 */
		for (i = 1; i < skip; i += next_skip) {
			if (next_skip == 1)
				scan[i].u.bmu_bitmap = (u_daddr_t)-1;
			else
				scan[i].u.bmu_avail = radix;
			scan[i].bm_bighint = radix;
		}
	} else {
		radix /= BLIST_META_RADIX;
	}

	if (count > radix) {
		/*
		 * The allocation exceeds the number of blocks that are
		 * managed by a subtree of this meta node.
		 */
		panic("allocation too large");
	}
	scan_from_start = cursor == blk;
	child = (cursor - blk) / radix;
	blk += child * radix;
	for (i = 1 + child * next_skip; i < skip; i += next_skip) {
		if (count <= scan[i].bm_bighint) {
			/*
			 * The allocation might fit beginning in the i'th subtree.
			 */
			r = blst_meta_alloc(&scan[i],
			    cursor > blk ? cursor : blk, count, radix);
			if (r != SWAPBLK_NONE) {
				scan->u.bmu_avail -= count;
				return (r);
			}
		} else if (scan[i].bm_bighint == (daddr_t)-1) {
			/*
			 * Terminator
			 */
			break;
		}
		blk += radix;
	}

	/*
	 * We couldn't allocate count in this subtree, update bighint.
	 */
	if (scan_from_start && scan->bm_bighint >= count)
		scan->bm_bighint = count - 1;

	return (SWAPBLK_NONE);
}
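/*
 * To illustrate the index arithmetic above with the typical constants
 * (BLIST_BMAP_RADIX == 64, BLIST_META_RADIX == 16): at a meta node of
 * radix 16384, skip == 273 and next_skip == 17, so child j begins at
 * array offset 1 + j * 17 and covers blocks [j * 1024, (j + 1) * 1024).
 * A cursor of 2500 selects child == (2500 - 0) / 1024 == 2, i.e. the
 * subtree at offset 35 covering blocks 2048-3071; earlier children are
 * skipped entirely rather than descended into.
 */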

/*
 * BLST_LEAF_FREE() -	free allocated block from leaf bitmap
 *
 */
static void
blst_leaf_free(blmeta_t *scan, daddr_t blk, int count)
{
	u_daddr_t mask;
	int n;

	/*
	 * free some data in this bitmap
	 * mask=0000111111111110000
	 *          \_________/\__/
	 *		count   n
	 */
	n = blk & BLIST_BMAP_MASK;
	mask = ((u_daddr_t)-1 << n) &
	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));
	if (scan->u.bmu_bitmap & mask)
		panic("freeing free block");
	scan->u.bmu_bitmap |= mask;

	/*
	 * We could probably do a better job here.  We are required to make
	 * bighint at least as large as the biggest contiguous block of
	 * data.  If we just shoehorn it, a little extra overhead will
	 * be incurred on the next allocation (but only that one typically).
	 */
	scan->bm_bighint = BLIST_BMAP_RADIX;
}

/*
 * BLST_META_FREE() - free allocated blocks from radix tree meta info
 *
 *	This support routine frees a range of blocks from the bitmap.
 *	The range must be entirely enclosed by this radix node.  If a
 *	meta node, we break the range down recursively to free blocks
 *	in subnodes (which means that this code can free an arbitrary
 *	range whereas the allocation code cannot allocate an arbitrary
 *	range).
 */
static void
blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, i, next_skip, skip, v;
	int child;

	if (scan->bm_bighint == (daddr_t)-1)
		panic("freeing invalid range");
	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_free(scan, freeBlk, count));
	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;

	if (scan->u.bmu_avail == 0) {
		/*
		 * ALL-ALLOCATED special case, with possible
		 * shortcut to ALL-FREE special case.
		 */
		scan->u.bmu_avail = count;
		scan->bm_bighint = count;

		if (count != radix)  {
			for (i = 1; i < skip; i += next_skip) {
				if (scan[i].bm_bighint == (daddr_t)-1)
					break;
				scan[i].bm_bighint = 0;
				if (next_skip == 1) {
					scan[i].u.bmu_bitmap = 0;
				} else {
					scan[i].u.bmu_avail = 0;
				}
			}
			/* fall through */
		}
	} else {
		scan->u.bmu_avail += count;
		/* scan->bm_bighint = radix; */
	}

	/*
	 * ALL-FREE special case.
	 */

	if (scan->u.bmu_avail == radix)
		return;
	if (scan->u.bmu_avail > radix)
		panic("blst_meta_free: freeing already free blocks (%lld) %lld/%lld",
		    (long long)count, (long long)scan->u.bmu_avail,
		    (long long)radix);

	/*
	 * Break the free down into its components
	 */

	blk = freeBlk & -radix;
	radix /= BLIST_META_RADIX;

	child = (freeBlk - blk) / radix;
	blk += child * radix;
	i = 1 + child * next_skip;
	while (i < skip && blk < freeBlk + count) {
		v = blk + radix - freeBlk;
		if (v > count)
			v = count;
		blst_meta_free(&scan[i], freeBlk, v, radix);
		if (scan->bm_bighint < scan[i].bm_bighint)
			scan->bm_bighint = scan[i].bm_bighint;
		count -= v;
		freeBlk += v;
		blk += radix;
		i += next_skip;
	}
}
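/*
 * The decomposition loop above clips the freed range against each child in
 * turn.  For example, freeing blocks [100, 300) under a radix-1024 meta
 * node whose children are 64-block leaves: the first iteration starts at
 * child 1 (blk == 64) and frees v == 64 + 64 - 100 == 28 blocks, then
 * children 2 and 3 free 64 blocks each, and child 4 frees the remaining
 * 44, for 28 + 64 + 64 + 44 == 200 blocks in total.
 */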

/*
 * BLST_COPY() - copy one radix tree to another
 *
 *	Locates free space in the source tree and frees it in the destination
 *	tree.  The space must not already be free in the destination.
 */
static void
blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest,
    daddr_t count)
{
	daddr_t i, next_skip, skip;

	/*
	 * Leaf node
	 */

	if (radix == BLIST_BMAP_RADIX) {
		u_daddr_t v = scan->u.bmu_bitmap;

		if (v == (u_daddr_t)-1) {
			blist_free(dest, blk, count);
		} else if (v != 0) {
			int i;

			for (i = 0; i < BLIST_BMAP_RADIX && i < count; ++i) {
				if (v & ((u_daddr_t)1 << i))
					blist_free(dest, blk + i, 1);
			}
		}
		return;
	}

	/*
	 * Meta node
	 */

	if (scan->u.bmu_avail == 0) {
		/*
		 * Source all allocated, leave dest allocated
		 */
		return;
	}
	if (scan->u.bmu_avail == radix) {
		/*
		 * Source all free, free entire dest
		 */
		if (count < radix)
			blist_free(dest, blk, count);
		else
			blist_free(dest, blk, radix);
		return;
	}

	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;
	radix /= BLIST_META_RADIX;

	for (i = 1; count && i < skip; i += next_skip) {
		if (scan[i].bm_bighint == (daddr_t)-1)
			break;

		if (count >= radix) {
			blst_copy(&scan[i], blk, radix, dest, radix);
			count -= radix;
		} else {
			if (count) {
				blst_copy(&scan[i], blk, radix, dest, count);
			}
			count = 0;
		}
		blk += radix;
	}
}

/*
 * BLST_LEAF_FILL() -	allocate specific blocks in leaf bitmap
 *
 *	This routine allocates all blocks in the specified range
 *	regardless of any existing allocations in that range.  Returns
 *	the number of blocks allocated by the call.
 */
static daddr_t
blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
{
	daddr_t nblks;
	u_daddr_t mask;
	int n;

	n = blk & BLIST_BMAP_MASK;
	mask = ((u_daddr_t)-1 << n) &
	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));

	/* Count the number of blocks that we are allocating. */
	nblks = bitcount64(scan->u.bmu_bitmap & mask);

	scan->u.bmu_bitmap &= ~mask;
	return (nblks);
}

/*
 * BLST_META_FILL() -	allocate specific blocks at a meta node
 *
 *	This routine allocates the specified range of blocks,
 *	regardless of any existing allocations in the range.  The
 *	range must be within the extent of this node.  Returns the
 *	number of blocks allocated by the call.
 */
static daddr_t
blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, i, nblks, next_skip, skip, v;
	int child;

	if (scan->bm_bighint == (daddr_t)-1)
		panic("filling invalid range");
	if (count > radix) {
		/*
		 * The allocation exceeds the number of blocks that are
		 * managed by this node.
		 */
		panic("fill too large");
	}
	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_fill(scan, allocBlk, count));
	if (count == radix || scan->u.bmu_avail == 0)  {
		/*
		 * ALL-ALLOCATED special case
		 */
		nblks = scan->u.bmu_avail;
		scan->u.bmu_avail = 0;
		scan->bm_bighint = 0;
		return (nblks);
	}
	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;
	blk = allocBlk & -radix;

	/*
	 * An ALL-FREE meta node requires special handling before allocating
	 * any of its blocks.
	 */
	if (scan->u.bmu_avail == radix) {
		radix /= BLIST_META_RADIX;

		/*
		 * Reinitialize each of the meta node's children.  An ALL-FREE
		 * meta node cannot have a terminator in any subtree.
		 */
		for (i = 1; i < skip; i += next_skip) {
			if (next_skip == 1)
				scan[i].u.bmu_bitmap = (u_daddr_t)-1;
			else
				scan[i].u.bmu_avail = radix;
			scan[i].bm_bighint = radix;
		}
	} else {
		radix /= BLIST_META_RADIX;
	}

	nblks = 0;
	child = (allocBlk - blk) / radix;
	blk += child * radix;
	i = 1 + child * next_skip;
	while (i < skip && blk < allocBlk + count) {
		v = blk + radix - allocBlk;
		if (v > count)
			v = count;
		nblks += blst_meta_fill(&scan[i], allocBlk, v, radix);
		count -= v;
		allocBlk += v;
		blk += radix;
		i += next_skip;
	}
	scan->u.bmu_avail -= nblks;
	return (nblks);
}

/*
 * BLST_RADIX_INIT() - initialize radix tree
 *
 *	Initialize our meta structures and bitmaps and calculate the exact
 *	amount of space required to manage 'count' blocks - this space may
 *	be considerably less than the calculated radix due to the large
 *	RADIX values we use.
 */
static daddr_t
blst_radix_init(blmeta_t *scan, daddr_t radix, daddr_t count)
{
	daddr_t i, memindex, next_skip, skip;

	memindex = 0;

	/*
	 * Leaf node
	 */

	if (radix == BLIST_BMAP_RADIX) {
		if (scan) {
			scan->bm_bighint = 0;
			scan->u.bmu_bitmap = 0;
		}
		return (memindex);
	}

	/*
	 * Meta node.  If allocating the entire object we can special
	 * case it.  However, we need to figure out how much memory
	 * is required to manage 'count' blocks, so we continue on anyway.
	 */

	if (scan) {
		scan->bm_bighint = 0;
		scan->u.bmu_avail = 0;
	}

	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;
	radix /= BLIST_META_RADIX;

	for (i = 1; i < skip; i += next_skip) {
		if (count >= radix) {
			/*
			 * Allocate the entire object
			 */
			memindex = i +
			    blst_radix_init(((scan) ? &scan[i] : NULL), radix,
			    radix);
			count -= radix;
		} else if (count > 0) {
			/*
			 * Allocate a partial object
			 */
			memindex = i +
			    blst_radix_init(((scan) ? &scan[i] : NULL), radix,
			    count);
			count = 0;
		} else {
			/*
			 * Add terminator and break out.  Make terminator bitmap
			 * zero to avoid a spanning leaf allocation that
			 * includes the terminator.
			 */
			if (scan) {
				scan[i].bm_bighint = (daddr_t)-1;
				scan[i].u.bmu_bitmap = 0;
			}
			break;
		}
	}
	if (memindex < i)
		memindex = i;
	return (memindex);
}
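/*
 * A small worked case (typical constants again): blst_radix_init(NULL,
 * 1024, 100) visits leaf 1 (covering 64 blocks, count drops to 36), leaf 2
 * (covering the remaining 36, count drops to 0), and then writes a
 * terminator at index 3 and stops.  It returns 3, so blist_create()
 * allocates 1 + 3 == 4 nodes: the root, two live leaves, and the
 * terminator, instead of the 17 nodes a full radix-1024 tree would need.
 */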

#ifdef BLIST_DEBUG

static void
blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab)
{
	daddr_t i, next_skip, skip;

	if (radix == BLIST_BMAP_RADIX) {
		printf(
		    "%*.*s(%08llx,%lld): bitmap %016llx big=%lld\n",
		    tab, tab, "",
		    (long long)blk, (long long)radix,
		    (long long)scan->u.bmu_bitmap,
		    (long long)scan->bm_bighint
		);
		return;
	}

	if (scan->u.bmu_avail == 0) {
		printf(
		    "%*.*s(%08llx,%lld) ALL ALLOCATED\n",
		    tab, tab, "",
		    (long long)blk,
		    (long long)radix
		);
		return;
	}
	if (scan->u.bmu_avail == radix) {
		printf(
		    "%*.*s(%08llx,%lld) ALL FREE\n",
		    tab, tab, "",
		    (long long)blk,
		    (long long)radix
		);
		return;
	}

	printf(
	    "%*.*s(%08llx,%lld): subtree (%lld/%lld) big=%lld {\n",
	    tab, tab, "",
	    (long long)blk, (long long)radix,
	    (long long)scan->u.bmu_avail,
	    (long long)radix,
	    (long long)scan->bm_bighint
	);

	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;
	radix /= BLIST_META_RADIX;
	tab += 4;

	for (i = 1; i < skip; i += next_skip) {
		if (scan[i].bm_bighint == (daddr_t)-1) {
			printf(
			    "%*.*s(%08llx,%lld): Terminator\n",
			    tab, tab, "",
			    (long long)blk, (long long)radix
			);
			break;
		}
		blst_radix_print(&scan[i], blk, radix, tab);
		blk += radix;
	}
	tab -= 4;

	printf(
	    "%*.*s}\n",
	    tab, tab, ""
	);
}

#endif

#ifdef BLIST_DEBUG

int
main(int ac, char **av)
{
	int size = 1024;
	int i;
	blist_t bl;
	struct sbuf *s;

	for (i = 1; i < ac; ++i) {
		const char *ptr = av[i];
		if (*ptr != '-') {
			size = strtol(ptr, NULL, 0);
			continue;
		}
		ptr += 2;
		fprintf(stderr, "Bad option: %s\n", ptr - 2);
		exit(1);
	}
	bl = blist_create(size, M_WAITOK);
	blist_free(bl, 0, size);

	for (;;) {
		char buf[1024];
		long long da = 0;
		long long count = 0;

		printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
		    (long long)size, (long long)bl->bl_radix);
		fflush(stdout);
		if (fgets(buf, sizeof(buf), stdin) == NULL)
			break;
		switch (buf[0]) {
		case 'r':
			if (sscanf(buf + 1, "%lld", &count) == 1) {
				blist_resize(&bl, count, 1, M_WAITOK);
				size = count;
			} else {
				printf("?\n");
			}
			/* FALLTHROUGH: print the resized tree */
		case 'p':
			blist_print(bl);
			break;
		case 's':
			s = sbuf_new_auto();
			blist_stats(bl, s);
			sbuf_finish(s);
			printf("%s", sbuf_data(s));
			sbuf_delete(s);
			break;
		case 'a':
			if (sscanf(buf + 1, "%lld", &count) == 1) {
				daddr_t blk = blist_alloc(bl, count);
				printf("    R=%08llx\n", (long long)blk);
			} else {
				printf("?\n");
			}
			break;
		case 'f':
			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
				blist_free(bl, da, count);
			} else {
				printf("?\n");
			}
			break;
		case 'l':
			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
				printf("    n=%jd\n",
				    (intmax_t)blist_fill(bl, da, count));
			} else {
				printf("?\n");
			}
			break;
		case '?':
		case 'h':
			puts(
			    "p          -print\n"
			    "s          -stats\n"
			    "a %d       -allocate\n"
			    "f %x %d    -free\n"
			    "l %x %d    -fill\n"
			    "r %d       -resize\n"
			    "h/?        -help"
			);
			break;
		default:
			printf("?\n");
			break;
		}
	}
	return (0);
}

void
panic(const char *ctl, ...)
{
	va_list va;

	va_start(va, ctl);
	vfprintf(stderr, ctl, va);
	fprintf(stderr, "\n");
	va_end(va);
	exit(1);
}

#endif
