/*-
 * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * BLIST.C -	Bitmap allocator/deallocator, using a radix tree with hinting
 *
 *	This module implements a general bitmap allocator/deallocator.  The
 *	allocator eats around 2 bits per 'block'.  The module does not
 *	try to interpret the meaning of a 'block' other than to return
 *	SWAPBLK_NONE on an allocation failure.
 *
 *	A radix tree is used to maintain the bitmap.  Two radix constants are
 *	involved:  One for the bitmaps contained in the leaf nodes (typically
 *	32), and one for the meta nodes (typically 16).  Both meta and leaf
 *	nodes have a hint field.  This field gives us a hint as to the largest
 *	free contiguous range of blocks under the node.  It may contain a
 *	value that is too high, but will never contain a value that is too
 *	low.  When the radix tree is searched, allocation failures in subtrees
 *	update the hint.
 *
 *	The radix tree also implements two collapsed states for meta nodes:
 *	the ALL-ALLOCATED state and the ALL-FREE state.  If a meta node is
 *	in either of these two states, all information contained underneath
 *	the node is considered stale.  These states are used to optimize
 *	allocation and freeing operations.
 *
 *	The hinting greatly increases code efficiency for allocations while
 *	the general radix structure optimizes both allocations and frees.  The
 *	radix tree should be able to operate well no matter how much
 *	fragmentation there is and no matter how large a bitmap is used.
 *
 *	Unlike the rlist code, the blist code wires all necessary memory at
 *	creation time.  Neither allocations nor frees require interaction with
 *	the memory subsystem.  In contrast, the rlist code may allocate memory
 *	on an rlist_free() call.  The non-blocking features of the blist code
 *	are used to great advantage in the swap code (vm/nswap_pager.c).  The
 *	rlist code uses a little less overall memory than the blist code (but
 *	due to swap interleaving not all that much less), but the blist code
 *	scales much, much better.
 *
 *	LAYOUT: The radix tree is laid out recursively using a
 *	linear array.  Each meta node is immediately followed (laid out
 *	sequentially in memory) by BLIST_META_RADIX lower level nodes.  This
 *	is a recursive structure but one that can be easily scanned through
 *	a very simple 'skip' calculation.  In order to support large radixes,
 *	portions of the tree may reside outside our memory allocation.  We
 *	handle this with an early-termination optimization (when bighint is
 *	set to -1) on the scan.  The memory allocation is only large enough
 *	to cover the number of blocks requested at creation time even if it
 *	must be encompassed in a larger root-node radix.
 *
 *	NOTE: the allocator cannot currently allocate more than
 *	BLIST_BMAP_RADIX blocks per call.  It will panic with 'allocation too
 *	large' if you try.  This is an area that could use improvement.  The
 *	radix is large enough that this restriction does not affect the swap
 *	system, though.  Currently only the allocation code is affected by
 *	this algorithmic unfeature.  The freeing code can handle arbitrary
 *	ranges.
 *
 *	This code can be compiled stand-alone for debugging.
 */
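
/*
 * Typical usage, shown here as a sketch rather than code taken from an
 * actual consumer of this API (the block counts are arbitrary).  Note
 * that a freshly created blist starts out all-allocated and must be
 * explicitly freed before blocks can be handed out; the standalone
 * main() at the bottom of this file follows the same pattern.
 *
 *	blist_t bl = blist_create(1024);	// manage 1024 blocks
 *	blist_free(bl, 0, 1024);		// mark them all free
 *	daddr_t blk = blist_alloc(bl, 16);	// 16 contiguous blocks
 *	if (blk != SWAPBLK_NONE)
 *		blist_free(bl, blk, 16);	// give them back
 *	blist_destroy(bl);
 */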

#include <sys/cdefs.h>
__FBSDID("$FreeBSD: head/sys/kern/subr_blist.c 118848 2003-08-12 23:24:05Z imp $");

#ifdef _KERNEL

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/kernel.h>
#include <sys/blist.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/mutex.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_page.h>

#else

#ifndef BLIST_NO_DEBUG
#define BLIST_DEBUG
#endif

#define SWAPBLK_NONE ((daddr_t)-1)

#include <sys/types.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <stdarg.h>

#define malloc(a,b,c)	calloc(a, 1)
#define free(a,b)	free(a)

typedef unsigned int u_daddr_t;

#include <sys/blist.h>

void panic(const char *ctl, ...);

#endif

/*
 * static support functions
 */

static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count);
static daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t blk,
				daddr_t count, daddr_t radix, int skip);
static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count,
					daddr_t radix, int skip, daddr_t blk);
static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix,
				daddr_t skip, blist_t dest, daddr_t count);
static int blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
static int blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
				daddr_t radix, int skip, daddr_t blk);
static daddr_t	blst_radix_init(blmeta_t *scan, daddr_t radix,
						int skip, daddr_t count);
#ifndef _KERNEL
static void	blst_radix_print(blmeta_t *scan, daddr_t blk,
					daddr_t radix, int skip, int tab);
#endif

#ifdef _KERNEL
static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space");
#endif

/*
 * blist_create() - create a blist capable of handling up to the specified
 *		    number of blocks
 *
 *	blocks must be greater than 0
 *
 *	The smallest blist consists of a single leaf node capable of
 *	managing BLIST_BMAP_RADIX blocks.
 */

blist_t
blist_create(daddr_t blocks)
{
	blist_t bl;
	int radix;
	int skip = 0;

	/*
	 * Calculate radix and skip field used for scanning.
	 */
	radix = BLIST_BMAP_RADIX;

	while (radix < blocks) {
		radix *= BLIST_META_RADIX;
		skip = (skip + 1) * BLIST_META_RADIX;
	}
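
	/*
	 * Worked example (a sketch, assuming BLIST_BMAP_RADIX == 32 and
	 * BLIST_META_RADIX == 16): for blocks == 100000 the loop runs
	 * three times, leaving radix = 32 * 16 * 16 * 16 = 131072 and
	 * skip = (((0+1)*16 + 1)*16 + 1)*16 = 4368.  radix is the block
	 * span of the root node; skip counts the array slots its
	 * descendants may occupy in a fully populated tree.
	 */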

	bl = malloc(sizeof(struct blist), M_SWAP, M_WAITOK | M_ZERO);

	bl->bl_blocks = blocks;
	bl->bl_radix = radix;
	bl->bl_skip = skip;
	bl->bl_rootblks = 1 +
	    blst_radix_init(NULL, bl->bl_radix, bl->bl_skip, blocks);
	bl->bl_root = malloc(sizeof(blmeta_t) * bl->bl_rootblks, M_SWAP, M_WAITOK);

#if defined(BLIST_DEBUG)
	printf(
		"BLIST representing %lld blocks (%lld MB of swap)"
		", requiring %lldK of ram\n",
		(long long)bl->bl_blocks,
		(long long)bl->bl_blocks * 4 / 1024,
		(long long)(bl->bl_rootblks * sizeof(blmeta_t) + 1023) / 1024
	);
	printf("BLIST raw radix tree contains %lld records\n",
	    (long long)bl->bl_rootblks);
#endif
	blst_radix_init(bl->bl_root, bl->bl_radix, bl->bl_skip, blocks);

	return(bl);
}

void
blist_destroy(blist_t bl)
{
	free(bl->bl_root, M_SWAP);
	free(bl, M_SWAP);
}

/*
 * blist_alloc() - reserve space in the block bitmap.  Return the base
 *		     of a contiguous region or SWAPBLK_NONE if space could
 *		     not be allocated.
 */

daddr_t
blist_alloc(blist_t bl, daddr_t count)
{
	daddr_t blk = SWAPBLK_NONE;

	if (bl) {
		if (bl->bl_radix == BLIST_BMAP_RADIX)
			blk = blst_leaf_alloc(bl->bl_root, 0, count);
		else
			blk = blst_meta_alloc(bl->bl_root, 0, count, bl->bl_radix, bl->bl_skip);
		if (blk != SWAPBLK_NONE)
			bl->bl_free -= count;
	}
	return(blk);
}

/*
 * blist_free() -	free up space in the block bitmap.  The range must
 *			have been previously allocated.  Panic if an
 *			inconsistency is found.
 */

void
blist_free(blist_t bl, daddr_t blkno, daddr_t count)
{
	if (bl) {
		if (bl->bl_radix == BLIST_BMAP_RADIX)
			blst_leaf_free(bl->bl_root, blkno, count);
		else
			blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix, bl->bl_skip, 0);
		bl->bl_free += count;
	}
}

/*
 * blist_fill() -	mark a region in the block bitmap as off-limits
 *			to the allocator (i.e. allocate it), ignoring any
 *			existing allocations.  Return the number of blocks
 *			actually filled that were free before the call.
 */

int
blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
{
	int filled;

	if (bl) {
		if (bl->bl_radix == BLIST_BMAP_RADIX)
			filled = blst_leaf_fill(bl->bl_root, blkno, count);
		else
			filled = blst_meta_fill(bl->bl_root, blkno, count,
			    bl->bl_radix, bl->bl_skip, 0);
		bl->bl_free -= filled;
		return filled;
	} else
		return 0;
}

/*
 * blist_resize() -	resize an existing radix tree to handle the
 *			specified number of blocks.  This will reallocate
 *			the tree and transfer the previous bitmap to the new
 *			one.  When extending the tree you can specify whether
 *			the new blocks are to be left allocated or freed.
 */

void
blist_resize(blist_t *pbl, daddr_t count, int freenew)
{
    blist_t newbl = blist_create(count);
    blist_t save = *pbl;

    *pbl = newbl;
    if (count > save->bl_blocks)
	    count = save->bl_blocks;
    blst_copy(save->bl_root, 0, save->bl_radix, save->bl_skip, newbl, count);

    /*
     * If resizing upwards, should we free the new space or not?
     */
    if (freenew && count < newbl->bl_blocks) {
	    blist_free(newbl, count, newbl->bl_blocks - count);
    }
    blist_destroy(save);
}

#ifdef BLIST_DEBUG

/*
 * blist_print()    - dump radix tree
 */

void
blist_print(blist_t bl)
{
	printf("BLIST {\n");
	blst_radix_print(bl->bl_root, 0, bl->bl_radix, bl->bl_skip, 4);
	printf("}\n");
}

#endif

/************************************************************************
 *			  ALLOCATION SUPPORT FUNCTIONS			*
 ************************************************************************
 *
 *	These support functions do all the actual work.  They may seem
 *	rather longish, but that's because I've commented them up.  The
 *	actual code is straightforward.
 *
 */

/*
 * blst_leaf_alloc() -	allocate at a leaf in the radix tree (a bitmap).
 *
 *	This is the core of the allocator and is optimized for the 1 block
 *	and the BLIST_BMAP_RADIX block allocation cases.  Other cases are
 *	somewhat slower.  The 1 block allocation case is log2 and extremely
 *	quick.
 */

static daddr_t
blst_leaf_alloc(
	blmeta_t *scan,
	daddr_t blk,
	int count
) {
	u_daddr_t orig = scan->u.bmu_bitmap;

	if (orig == 0) {
		/*
		 * Optimize bitmap all-allocated case.  Also, count = 1
		 * case assumes at least 1 bit is free in the bitmap, so
		 * we have to take care of this case here.
		 */
		scan->bm_bighint = 0;
		return(SWAPBLK_NONE);
	}
	if (count == 1) {
		/*
		 * Optimized code to allocate one bit out of the bitmap
		 */
		u_daddr_t mask;
		int j = BLIST_BMAP_RADIX/2;
		int r = 0;

		mask = (u_daddr_t)-1 >> (BLIST_BMAP_RADIX/2);

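		/*
		 * Binary-search for the lowest set (free) bit: whenever
		 * the low j bits of the remaining word are all zero, the
		 * free bit must lie above them, so shift them away and
		 * credit j to r.  j halves each pass, so this takes
		 * log2(BLIST_BMAP_RADIX) iterations.
		 */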
		while (j) {
			if ((orig & mask) == 0) {
			    r += j;
			    orig >>= j;
			}
			j >>= 1;
			mask >>= j;
		}
		scan->u.bmu_bitmap &= ~((u_daddr_t)1 << r);
		return(blk + r);
	}
	if (count <= BLIST_BMAP_RADIX) {
		/*
		 * non-optimized code to allocate N bits out of the bitmap.
		 * The more bits, the faster the code runs.  It will run
		 * the slowest allocating 2 bits, but since there aren't any
		 * memory ops in the core loop (or shouldn't be, anyway),
		 * you probably won't notice the difference.
		 */
		int j;
		int n = BLIST_BMAP_RADIX - count;
		u_daddr_t mask;

		mask = (u_daddr_t)-1 >> n;

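		/*
		 * Slide a window of 'count' one bits from the low end of
		 * the word upward; the first position where every bit
		 * under the mask is free satisfies the allocation.
		 */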
		for (j = 0; j <= n; ++j) {
			if ((orig & mask) == mask) {
				scan->u.bmu_bitmap &= ~mask;
				return(blk + j);
			}
			mask = (mask << 1);
		}
	}
	/*
	 * We couldn't allocate count in this subtree, update bighint.
	 */
	scan->bm_bighint = count - 1;
	return(SWAPBLK_NONE);
}

/*
 * blst_meta_alloc() -	allocate at a meta node in the radix tree.
 *
 *	Attempt to allocate at a meta node.  If we can't, we update
 *	bighint and return a failure.  Updating bighint optimizes future
 *	calls that hit this node.  We have to check for our collapse cases
 *	and we have a few optimizations strewn in as well.
 */

static daddr_t
blst_meta_alloc(
	blmeta_t *scan,
	daddr_t blk,
	daddr_t count,
	daddr_t radix,
	int skip
) {
	int i;
	int next_skip = ((u_int)skip / BLIST_META_RADIX);
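	/*
	 * next_skip is the stride between children in the linear array:
	 * each child subtree (its root node plus all descendants) spans
	 * next_skip slots, and the children sit at indices 1,
	 * 1 + next_skip, 1 + 2 * next_skip, ... relative to this node.
	 */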

	if (scan->u.bmu_avail == 0)  {
		/*
		 * ALL-ALLOCATED special case
		 */
		scan->bm_bighint = count;
		return(SWAPBLK_NONE);
	}

	if (scan->u.bmu_avail == radix) {
		radix /= BLIST_META_RADIX;

		/*
		 * ALL-FREE special case, initialize the
		 * uninitialized sublevel.
		 */
		for (i = 1; i <= skip; i += next_skip) {
			if (scan[i].bm_bighint == (daddr_t)-1)
				break;
			if (next_skip == 1) {
				scan[i].u.bmu_bitmap = (u_daddr_t)-1;
				scan[i].bm_bighint = BLIST_BMAP_RADIX;
			} else {
				scan[i].bm_bighint = radix;
				scan[i].u.bmu_avail = radix;
			}
		}
	} else {
		radix /= BLIST_META_RADIX;
	}

	for (i = 1; i <= skip; i += next_skip) {
		if (count <= scan[i].bm_bighint) {
			/*
			 * count fits in object
			 */
			daddr_t r;
			if (next_skip == 1) {
				r = blst_leaf_alloc(&scan[i], blk, count);
			} else {
				r = blst_meta_alloc(&scan[i], blk, count, radix, next_skip - 1);
			}
			if (r != SWAPBLK_NONE) {
				scan->u.bmu_avail -= count;
				if (scan->bm_bighint > scan->u.bmu_avail)
					scan->bm_bighint = scan->u.bmu_avail;
				return(r);
			}
		} else if (scan[i].bm_bighint == (daddr_t)-1) {
			/*
			 * Terminator
			 */
			break;
		} else if (count > radix) {
			/*
			 * count does not fit in object even if it were
			 * completely free.
			 */
			panic("blist_meta_alloc: allocation too large");
		}
		blk += radix;
	}

	/*
	 * We couldn't allocate count in this subtree, update bighint.
	 */
	if (scan->bm_bighint >= count)
		scan->bm_bighint = count - 1;
	return(SWAPBLK_NONE);
}

/*
 * BLST_LEAF_FREE() -	free allocated block from leaf bitmap
 *
 */

static void
blst_leaf_free(
	blmeta_t *scan,
	daddr_t blk,
	int count
) {
	/*
	 * free some data in this bitmap
	 *
	 * e.g.
	 *	0000111111111110000
	 *          \_________/\__/
	 *		v        n
	 */
	int n = blk & (BLIST_BMAP_RADIX - 1);
	u_daddr_t mask;

	mask = ((u_daddr_t)-1 << n) &
	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));
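
	/*
	 * Worked example of the mask (a sketch, assuming
	 * BLIST_BMAP_RADIX == 32): for n == 4 and count == 3 this is
	 * 0xfffffff0 & 0x0000007f == 0x00000070, i.e. exactly bits
	 * 4..6, the blocks being freed.  blst_leaf_fill() below builds
	 * the same mask.
	 */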

	if (scan->u.bmu_bitmap & mask)
		panic("blst_radix_free: freeing free block");
	scan->u.bmu_bitmap |= mask;

	/*
	 * We could probably do a better job here.  We are required to make
	 * bighint at least as large as the biggest contiguous block of
	 * data.  If we just shoehorn it, a little extra overhead will
	 * be incurred on the next allocation (but only that one typically).
	 */
	scan->bm_bighint = BLIST_BMAP_RADIX;
}

/*
 * BLST_META_FREE() - free allocated blocks from radix tree meta info
 *
 *	This support routine frees a range of blocks from the bitmap.
 *	The range must be entirely enclosed by this radix node.  If a
 *	meta node, we break the range down recursively to free blocks
 *	in subnodes (which means that this code can free an arbitrary
 *	range whereas the allocation code cannot allocate an arbitrary
 *	range).
 */

static void
blst_meta_free(
	blmeta_t *scan,
	daddr_t freeBlk,
	daddr_t count,
	daddr_t radix,
	int skip,
	daddr_t blk
) {
	int i;
	int next_skip = ((u_int)skip / BLIST_META_RADIX);

#if 0
	printf("FREE (%llx,%lld) FROM (%llx,%lld)\n",
	    (long long)freeBlk, (long long)count,
	    (long long)blk, (long long)radix
	);
#endif

	if (scan->u.bmu_avail == 0) {
		/*
		 * ALL-ALLOCATED special case, with possible
		 * shortcut to ALL-FREE special case.
		 */
		scan->u.bmu_avail = count;
		scan->bm_bighint = count;

		if (count != radix)  {
			for (i = 1; i <= skip; i += next_skip) {
				if (scan[i].bm_bighint == (daddr_t)-1)
					break;
				scan[i].bm_bighint = 0;
				if (next_skip == 1) {
					scan[i].u.bmu_bitmap = 0;
				} else {
					scan[i].u.bmu_avail = 0;
				}
			}
			/* fall through */
		}
	} else {
		scan->u.bmu_avail += count;
		/* scan->bm_bighint = radix; */
	}

	/*
	 * ALL-FREE special case.
	 */

	if (scan->u.bmu_avail == radix)
		return;
	if (scan->u.bmu_avail > radix)
		panic("blst_meta_free: freeing already free blocks (%lld) %lld/%lld",
		    (long long)count, (long long)scan->u.bmu_avail,
		    (long long)radix);

	/*
	 * Break the free down into its components
	 */

	radix /= BLIST_META_RADIX;

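	/*
	 * Locate the first child covering freeBlk: each child spans
	 * 'radix' blocks and occupies 'next_skip' array slots, so child
	 * ordinal i converts to array index i * next_skip + 1.
	 */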
	i = (freeBlk - blk) / radix;
	blk += i * radix;
	i = i * next_skip + 1;

	while (i <= skip && blk < freeBlk + count) {
		daddr_t v;

		v = blk + radix - freeBlk;
		if (v > count)
			v = count;

		if (scan->bm_bighint == (daddr_t)-1)
			panic("blst_meta_free: freeing unexpected range");

		if (next_skip == 1) {
			blst_leaf_free(&scan[i], freeBlk, v);
		} else {
			blst_meta_free(&scan[i], freeBlk, v, radix, next_skip - 1, blk);
		}
		if (scan->bm_bighint < scan[i].bm_bighint)
		    scan->bm_bighint = scan[i].bm_bighint;
		count -= v;
		freeBlk += v;
		blk += radix;
		i += next_skip;
	}
}

/*
 * BLST_COPY() - copy one radix tree to another
 *
 *	Locates free space in the source tree and frees it in the destination
 *	tree.  The space must not already be free in the destination.
 */

static void blst_copy(
	blmeta_t *scan,
	daddr_t blk,
	daddr_t radix,
	daddr_t skip,
	blist_t dest,
	daddr_t count
) {
	int next_skip;
	int i;

	/*
	 * Leaf node
	 */

	if (radix == BLIST_BMAP_RADIX) {
		u_daddr_t v = scan->u.bmu_bitmap;

		if (v == (u_daddr_t)-1) {
			blist_free(dest, blk, count);
		} else if (v != 0) {
			int i;

			for (i = 0; i < BLIST_BMAP_RADIX && i < count; ++i) {
				if (v & ((u_daddr_t)1 << i))
					blist_free(dest, blk + i, 1);
			}
		}
		return;
	}

	/*
	 * Meta node
	 */

	if (scan->u.bmu_avail == 0) {
		/*
		 * Source all allocated, leave dest allocated
		 */
		return;
	}
	if (scan->u.bmu_avail == radix) {
		/*
		 * Source all free, free entire dest
		 */
		if (count < radix)
			blist_free(dest, blk, count);
		else
			blist_free(dest, blk, radix);
		return;
	}

	radix /= BLIST_META_RADIX;
	next_skip = ((u_int)skip / BLIST_META_RADIX);

	for (i = 1; count && i <= skip; i += next_skip) {
		if (scan[i].bm_bighint == (daddr_t)-1)
			break;

		if (count >= radix) {
			blst_copy(
			    &scan[i],
			    blk,
			    radix,
			    next_skip - 1,
			    dest,
			    radix
			);
			count -= radix;
		} else {
			if (count) {
				blst_copy(
				    &scan[i],
				    blk,
				    radix,
				    next_skip - 1,
				    dest,
				    count
				);
			}
			count = 0;
		}
		blk += radix;
	}
}

/*
 * BLST_LEAF_FILL() -	allocate specific blocks in leaf bitmap
 *
 *	This routine allocates all blocks in the specified range
 *	regardless of any existing allocations in that range.  Returns
 *	the number of blocks allocated by the call.
 */

static int
blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
{
	int n = blk & (BLIST_BMAP_RADIX - 1);
	int nblks;
	u_daddr_t mask, bitmap;

	mask = ((u_daddr_t)-1 << n) &
	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));

	/*
	 * Count the number of blocks we're about to allocate: each pass
	 * of "bitmap &= bitmap - 1" clears the lowest set bit, so the
	 * loop iterates once per currently free block under the mask.
	 */
	bitmap = scan->u.bmu_bitmap & mask;
	for (nblks = 0; bitmap != 0; nblks++)
		bitmap &= bitmap - 1;

	scan->u.bmu_bitmap &= ~mask;
	return nblks;
}

/*
 * BLST_META_FILL() -	allocate specific blocks at a meta node
 *
 *	This routine allocates the specified range of blocks,
 *	regardless of any existing allocations in the range.  The
 *	range must be within the extent of this node.  Returns the
 *	number of blocks allocated by the call.
 */
static int
blst_meta_fill(
	blmeta_t *scan,
	daddr_t allocBlk,
	daddr_t count,
	daddr_t radix,
	int skip,
	daddr_t blk
) {
	int i;
	int next_skip = ((u_int)skip / BLIST_META_RADIX);
	int nblks = 0;

	if (count == radix || scan->u.bmu_avail == 0)  {
		/*
		 * ALL-ALLOCATED special case
		 */
		nblks = scan->u.bmu_avail;
		scan->u.bmu_avail = 0;
		scan->bm_bighint = count;
		return nblks;
	}

	if (scan->u.bmu_avail == radix) {
		radix /= BLIST_META_RADIX;

		/*
		 * ALL-FREE special case, initialize sublevel
		 */
		for (i = 1; i <= skip; i += next_skip) {
			if (scan[i].bm_bighint == (daddr_t)-1)
				break;
			if (next_skip == 1) {
				scan[i].u.bmu_bitmap = (u_daddr_t)-1;
				scan[i].bm_bighint = BLIST_BMAP_RADIX;
			} else {
				scan[i].bm_bighint = radix;
				scan[i].u.bmu_avail = radix;
			}
		}
	} else {
		radix /= BLIST_META_RADIX;
	}

	if (count > radix)
		panic("blist_meta_fill: allocation too large");

	i = (allocBlk - blk) / radix;
	blk += i * radix;
	i = i * next_skip + 1;

	while (i <= skip && blk < allocBlk + count) {
		daddr_t v;

		v = blk + radix - allocBlk;
		if (v > count)
			v = count;

		if (scan->bm_bighint == (daddr_t)-1)
			panic("blst_meta_fill: filling unexpected range");

		if (next_skip == 1) {
			nblks += blst_leaf_fill(&scan[i], allocBlk, v);
		} else {
			nblks += blst_meta_fill(&scan[i], allocBlk, v,
			    radix, next_skip - 1, blk);
		}
		count -= v;
		allocBlk += v;
		blk += radix;
		i += next_skip;
	}
	scan->u.bmu_avail -= nblks;
	return nblks;
}

/*
 * BLST_RADIX_INIT() - initialize radix tree
 *
 *	Initialize our meta structures and bitmaps and calculate the exact
 *	amount of space required to manage 'count' blocks - this space may
 *	be considerably less than the calculated radix due to the large
 *	RADIX values we use.
 */

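/*
 * Worked sizing example (a sketch, assuming BLIST_BMAP_RADIX == 32 and
 * BLIST_META_RADIX == 16): blist_create(1024) computes radix = 8192 and
 * skip = 272, yet this routine returns 35, so only 36 blmeta_t slots
 * (the root plus 35) are allocated rather than the 273 a fully
 * populated root subtree would need.  A terminator (bighint == -1)
 * stops scans at the edge of the allocated region.
 */
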
static daddr_t
blst_radix_init(blmeta_t *scan, daddr_t radix, int skip, daddr_t count)
{
	int i;
	int next_skip;
	daddr_t memindex = 0;

	/*
	 * Leaf node
	 */

	if (radix == BLIST_BMAP_RADIX) {
		if (scan) {
			scan->bm_bighint = 0;
			scan->u.bmu_bitmap = 0;
		}
		return(memindex);
	}

	/*
	 * Meta node.  If allocating the entire object we can special
	 * case it.  However, we need to figure out how much memory
	 * is required to manage 'count' blocks, so we continue on anyway.
	 */

	if (scan) {
		scan->bm_bighint = 0;
		scan->u.bmu_avail = 0;
	}

	radix /= BLIST_META_RADIX;
	next_skip = ((u_int)skip / BLIST_META_RADIX);

	for (i = 1; i <= skip; i += next_skip) {
		if (count >= radix) {
			/*
			 * Allocate the entire object
			 */
			memindex = i + blst_radix_init(
			    ((scan) ? &scan[i] : NULL),
			    radix,
			    next_skip - 1,
			    radix
			);
			count -= radix;
		} else if (count > 0) {
			/*
			 * Allocate a partial object
			 */
			memindex = i + blst_radix_init(
			    ((scan) ? &scan[i] : NULL),
			    radix,
			    next_skip - 1,
			    count
			);
			count = 0;
		} else {
			/*
			 * Add terminator and break out
			 */
			if (scan)
				scan[i].bm_bighint = (daddr_t)-1;
			break;
		}
	}
	if (memindex < i)
		memindex = i;
	return(memindex);
}

#ifdef BLIST_DEBUG

static void
blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int skip, int tab)
{
	int i;
	int next_skip;

	if (radix == BLIST_BMAP_RADIX) {
		printf(
		    "%*.*s(%08llx,%lld): bitmap %08llx big=%lld\n",
		    tab, tab, "",
		    (long long)blk, (long long)radix,
		    (long long)scan->u.bmu_bitmap,
		    (long long)scan->bm_bighint
		);
		return;
	}

	if (scan->u.bmu_avail == 0) {
		printf(
		    "%*.*s(%08llx,%lld) ALL ALLOCATED\n",
		    tab, tab, "",
		    (long long)blk,
		    (long long)radix
		);
		return;
	}
	if (scan->u.bmu_avail == radix) {
		printf(
		    "%*.*s(%08llx,%lld) ALL FREE\n",
		    tab, tab, "",
		    (long long)blk,
		    (long long)radix
		);
		return;
	}

	printf(
	    "%*.*s(%08llx,%lld): subtree (%lld/%lld) big=%lld {\n",
	    tab, tab, "",
	    (long long)blk, (long long)radix,
	    (long long)scan->u.bmu_avail,
	    (long long)radix,
	    (long long)scan->bm_bighint
	);

	radix /= BLIST_META_RADIX;
	next_skip = ((u_int)skip / BLIST_META_RADIX);
	tab += 4;

	for (i = 1; i <= skip; i += next_skip) {
		if (scan[i].bm_bighint == (daddr_t)-1) {
			printf(
			    "%*.*s(%08llx,%lld): Terminator\n",
			    tab, tab, "",
			    (long long)blk, (long long)radix
			);
			break;
		}
		blst_radix_print(
		    &scan[i],
		    blk,
		    radix,
		    next_skip - 1,
		    tab
		);
		blk += radix;
	}
	tab -= 4;

	printf(
	    "%*.*s}\n",
	    tab, tab, ""
	);
}

#endif

#ifdef BLIST_DEBUG

int
main(int ac, char **av)
{
	int size = 1024;
	int i;
	blist_t bl;

	for (i = 1; i < ac; ++i) {
		const char *ptr = av[i];
		if (*ptr != '-') {
			size = strtol(ptr, NULL, 0);
			continue;
		}
		ptr += 2;
		fprintf(stderr, "Bad option: %s\n", ptr - 2);
		exit(1);
	}
	bl = blist_create(size);
	blist_free(bl, 0, size);

	for (;;) {
		char buf[1024];
		daddr_t da = 0;
		daddr_t count = 0;

		printf("%lld/%lld/%lld> ", (long long)bl->bl_free,
		    (long long)size, (long long)bl->bl_radix);
		fflush(stdout);
		if (fgets(buf, sizeof(buf), stdin) == NULL)
			break;
		switch(buf[0]) {
		case 'r':
			if (sscanf(buf + 1, "%lld", &count) == 1) {
				blist_resize(&bl, count, 1);
			} else {
				printf("?\n");
			}
			/* FALLTHROUGH: print the tree after a resize */
		case 'p':
			blist_print(bl);
			break;
		case 'a':
			if (sscanf(buf + 1, "%lld", &count) == 1) {
				daddr_t blk = blist_alloc(bl, count);
				printf("    R=%08llx\n", (long long)blk);
			} else {
				printf("?\n");
			}
			break;
		case 'f':
			if (sscanf(buf + 1, "%llx %lld",
			    (long long *)&da, (long long *)&count) == 2) {
				blist_free(bl, da, count);
			} else {
				printf("?\n");
			}
			break;
		case 'l':
			if (sscanf(buf + 1, "%llx %lld",
			    (long long *)&da, (long long *)&count) == 2) {
				printf("    n=%d\n",
				    blist_fill(bl, da, count));
			} else {
				printf("?\n");
			}
			break;
		case '?':
		case 'h':
			puts(
			    "p          -print\n"
			    "a %d       -allocate\n"
			    "f %x %d    -free\n"
			    "l %x %d    -fill\n"
			    "r %d       -resize\n"
			    "h/?        -help"
			);
			break;
		default:
			printf("?\n");
			break;
		}
	}
	return(0);
}

void
panic(const char *ctl, ...)
{
	va_list va;

	va_start(va, ctl);
	vfprintf(stderr, ctl, va);
	fprintf(stderr, "\n");
	va_end(va);
	exit(1);
}

#endif