/* subr_blist.c — FreeBSD revision 178792 */
1204076Spjd/*- 2204076Spjd * Copyright (c) 1998 Matthew Dillon. All Rights Reserved. 3204076Spjd * Redistribution and use in source and binary forms, with or without 4204076Spjd * modification, are permitted provided that the following conditions 5204076Spjd * are met: 6204076Spjd * 1. Redistributions of source code must retain the above copyright 7204076Spjd * notice, this list of conditions and the following disclaimer. 8204076Spjd * 2. Redistributions in binary form must reproduce the above copyright 9204076Spjd * notice, this list of conditions and the following disclaimer in the 10204076Spjd * documentation and/or other materials provided with the distribution. 11204076Spjd * 4. Neither the name of the University nor the names of its contributors 12204076Spjd * may be used to endorse or promote products derived from this software 13204076Spjd * without specific prior written permission. 14204076Spjd * 15204076Spjd * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS 16204076Spjd * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17204076Spjd * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18204076Spjd * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY 19204076Spjd * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20204076Spjd * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 21204076Spjd * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22204076Spjd * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 23204076Spjd * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 24204076Spjd * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25204076Spjd * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26204076Spjd */ 27204076Spjd/* 28204076Spjd * BLIST.C - Bitmap allocator/deallocator, using a radix tree with hinting 29204076Spjd * 30204076Spjd * This module implements a general bitmap allocator/deallocator. 
The 31204076Spjd * allocator eats around 2 bits per 'block'. The module does not 32204076Spjd * try to interpret the meaning of a 'block' other then to return 33204076Spjd * SWAPBLK_NONE on an allocation failure. 34204076Spjd * 35204076Spjd * A radix tree is used to maintain the bitmap. Two radix constants are 36204076Spjd * involved: One for the bitmaps contained in the leaf nodes (typically 37204076Spjd * 32), and one for the meta nodes (typically 16). Both meta and leaf 38204076Spjd * nodes have a hint field. This field gives us a hint as to the largest 39204076Spjd * free contiguous range of blocks under the node. It may contain a 40204076Spjd * value that is too high, but will never contain a value that is too 41204076Spjd * low. When the radix tree is searched, allocation failures in subtrees 42204076Spjd * update the hint. 43204076Spjd * 44204076Spjd * The radix tree also implements two collapsed states for meta nodes: 45204076Spjd * the ALL-ALLOCATED state and the ALL-FREE state. If a meta node is 46204076Spjd * in either of these two states, all information contained underneath 47204076Spjd * the node is considered stale. These states are used to optimize 48204076Spjd * allocation and freeing operations. 49204076Spjd * 50204076Spjd * The hinting greatly increases code efficiency for allocations while 51204076Spjd * the general radix structure optimizes both allocations and frees. The 52204076Spjd * radix tree should be able to operate well no matter how much 53204076Spjd * fragmentation there is and no matter how large a bitmap is used. 54204076Spjd * 55204076Spjd * Unlike the rlist code, the blist code wires all necessary memory at 56204076Spjd * creation time. Neither allocations nor frees require interaction with 57204076Spjd * the memory subsystem. In contrast, the rlist code may allocate memory 58204076Spjd * on an rlist_free() call. 
The non-blocking features of the blist code 59204076Spjd * are used to great advantage in the swap code (vm/nswap_pager.c). The 60204076Spjd * rlist code uses a little less overall memory then the blist code (but 61204076Spjd * due to swap interleaving not all that much less), but the blist code 62204076Spjd * scales much, much better. 63204076Spjd * 64204076Spjd * LAYOUT: The radix tree is layed out recursively using a 65204076Spjd * linear array. Each meta node is immediately followed (layed out 66204076Spjd * sequentially in memory) by BLIST_META_RADIX lower level nodes. This 67204076Spjd * is a recursive structure but one that can be easily scanned through 68204076Spjd * a very simple 'skip' calculation. In order to support large radixes, 69204076Spjd * portions of the tree may reside outside our memory allocation. We 70204076Spjd * handle this with an early-termination optimization (when bighint is 71204076Spjd * set to -1) on the scan. The memory allocation is only large enough 72204076Spjd * to cover the number of blocks requested at creation time even if it 73204076Spjd * must be encompassed in larger root-node radix. 74204076Spjd * 75204076Spjd * NOTE: the allocator cannot currently allocate more then 76204076Spjd * BLIST_BMAP_RADIX blocks per call. It will panic with 'allocation too 77204076Spjd * large' if you try. This is an area that could use improvement. The 78204076Spjd * radix is large enough that this restriction does not effect the swap 79204076Spjd * system, though. Currently only the allocation code is effected by 80204076Spjd * this algorithmic unfeature. The freeing code can handle arbitrary 81204076Spjd * ranges. 82204076Spjd * 83204076Spjd * This code can be compiled stand-alone for debugging. 
84204076Spjd */ 85204076Spjd 86204076Spjd#include <sys/cdefs.h> 87204076Spjd__FBSDID("$FreeBSD: head/sys/kern/subr_blist.c 178792 2008-05-05 19:48:54Z kmacy $"); 88204076Spjd 89204076Spjd#ifdef _KERNEL 90204076Spjd 91204076Spjd#include <sys/param.h> 92204076Spjd#include <sys/systm.h> 93204076Spjd#include <sys/lock.h> 94204076Spjd#include <sys/kernel.h> 95204076Spjd#include <sys/blist.h> 96204076Spjd#include <sys/malloc.h> 97204076Spjd#include <sys/proc.h> 98204076Spjd#include <sys/mutex.h> 99204076Spjd 100204076Spjd#else 101204076Spjd 102204076Spjd#ifndef BLIST_NO_DEBUG 103204076Spjd#define BLIST_DEBUG 104204076Spjd#endif 105204076Spjd 106204076Spjd#define SWAPBLK_NONE ((daddr_t)-1) 107204076Spjd 108204076Spjd#include <sys/types.h> 109204076Spjd#include <stdio.h> 110204076Spjd#include <string.h> 111204076Spjd#include <stdlib.h> 112204076Spjd#include <stdarg.h> 113204076Spjd 114204076Spjd#define malloc(a,b,c) calloc(a, 1) 115204076Spjd#define free(a,b) free(a) 116204076Spjd 117204076Spjdtypedef unsigned int u_daddr_t; 118204076Spjd 119204076Spjd#include <sys/blist.h> 120204076Spjd 121204076Spjdvoid panic(const char *ctl, ...); 122204076Spjd 123204076Spjd#endif 124204076Spjd 125204076Spjd/* 126204076Spjd * static support functions 127204076Spjd */ 128204076Spjd 129204076Spjdstatic daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count); 130204076Spjdstatic daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t blk, 131204076Spjd daddr_t count, daddr_t radix, int skip); 132204076Spjdstatic void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count); 133204076Spjdstatic void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, 134204076Spjd daddr_t radix, int skip, daddr_t blk); 135204076Spjdstatic void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, 136204076Spjd daddr_t skip, blist_t dest, daddr_t count); 137204076Spjdstatic int blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count); 138204076Spjdstatic int blst_meta_fill(blmeta_t *scan, daddr_t 
allocBlk, daddr_t count, 139204076Spjd daddr_t radix, int skip, daddr_t blk); 140204076Spjdstatic daddr_t blst_radix_init(blmeta_t *scan, daddr_t radix, 141204076Spjd int skip, daddr_t count); 142204076Spjd#ifndef _KERNEL 143204076Spjdstatic void blst_radix_print(blmeta_t *scan, daddr_t blk, 144204076Spjd daddr_t radix, int skip, int tab); 145204076Spjd#endif 146204076Spjd 147204076Spjd#ifdef _KERNEL 148204076Spjdstatic MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space"); 149204076Spjd#endif 150204076Spjd 151204076Spjd/* 152204076Spjd * blist_create() - create a blist capable of handling up to the specified 153204076Spjd * number of blocks 154204076Spjd * 155204076Spjd * blocks - must be greater then 0 156204076Spjd * flags - malloc flags 157204076Spjd * 158204076Spjd * The smallest blist consists of a single leaf node capable of 159204076Spjd * managing BLIST_BMAP_RADIX blocks. 160204076Spjd */ 161204076Spjd 162204076Spjdblist_t 163204076Spjdblist_create(daddr_t blocks, int flags) 164204076Spjd{ 165204076Spjd blist_t bl; 166204076Spjd int radix; 167204076Spjd int skip = 0; 168204076Spjd 169204076Spjd /* 170204076Spjd * Calculate radix and skip field used for scanning. 
171204076Spjd */ 172204076Spjd radix = BLIST_BMAP_RADIX; 173204076Spjd 174204076Spjd while (radix < blocks) { 175204076Spjd radix *= BLIST_META_RADIX; 176204076Spjd skip = (skip + 1) * BLIST_META_RADIX; 177204076Spjd } 178204076Spjd 179204076Spjd bl = malloc(sizeof(struct blist), M_SWAP, flags | M_ZERO); 180204076Spjd 181204076Spjd bl->bl_blocks = blocks; 182204076Spjd bl->bl_radix = radix; 183204076Spjd bl->bl_skip = skip; 184204076Spjd bl->bl_rootblks = 1 + 185204076Spjd blst_radix_init(NULL, bl->bl_radix, bl->bl_skip, blocks); 186204076Spjd bl->bl_root = malloc(sizeof(blmeta_t) * bl->bl_rootblks, M_SWAP, flags); 187204076Spjd 188204076Spjd#if defined(BLIST_DEBUG) 189204076Spjd printf( 190204076Spjd "BLIST representing %lld blocks (%lld MB of swap)" 191204076Spjd ", requiring %lldK of ram\n", 192204076Spjd (long long)bl->bl_blocks, 193204076Spjd (long long)bl->bl_blocks * 4 / 1024, 194204076Spjd (long long)(bl->bl_rootblks * sizeof(blmeta_t) + 1023) / 1024 195204076Spjd ); 196204076Spjd printf("BLIST raw radix tree contains %lld records\n", 197204076Spjd (long long)bl->bl_rootblks); 198204076Spjd#endif 199204076Spjd blst_radix_init(bl->bl_root, bl->bl_radix, bl->bl_skip, blocks); 200204076Spjd 201204076Spjd return(bl); 202204076Spjd} 203204076Spjd 204204076Spjdvoid 205204076Spjdblist_destroy(blist_t bl) 206204076Spjd{ 207204076Spjd free(bl->bl_root, M_SWAP); 208204076Spjd free(bl, M_SWAP); 209204076Spjd} 210204076Spjd 211204076Spjd/* 212204076Spjd * blist_alloc() - reserve space in the block bitmap. Return the base 213204076Spjd * of a contiguous region or SWAPBLK_NONE if space could 214204076Spjd * not be allocated. 
215204076Spjd */ 216204076Spjd 217204076Spjddaddr_t 218204076Spjdblist_alloc(blist_t bl, daddr_t count) 219204076Spjd{ 220204076Spjd daddr_t blk = SWAPBLK_NONE; 221204076Spjd 222204076Spjd if (bl) { 223204076Spjd if (bl->bl_radix == BLIST_BMAP_RADIX) 224204076Spjd blk = blst_leaf_alloc(bl->bl_root, 0, count); 225204076Spjd else 226204076Spjd blk = blst_meta_alloc(bl->bl_root, 0, count, bl->bl_radix, bl->bl_skip); 227204076Spjd if (blk != SWAPBLK_NONE) 228204076Spjd bl->bl_free -= count; 229204076Spjd } 230204076Spjd return(blk); 231204076Spjd} 232204076Spjd 233204076Spjd/* 234204076Spjd * blist_free() - free up space in the block bitmap. Return the base 235204076Spjd * of a contiguous region. Panic if an inconsistancy is 236204076Spjd * found. 237204076Spjd */ 238204076Spjd 239204076Spjdvoid 240204076Spjdblist_free(blist_t bl, daddr_t blkno, daddr_t count) 241204076Spjd{ 242204076Spjd if (bl) { 243204076Spjd if (bl->bl_radix == BLIST_BMAP_RADIX) 244204076Spjd blst_leaf_free(bl->bl_root, blkno, count); 245204076Spjd else 246204076Spjd blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix, bl->bl_skip, 0); 247204076Spjd bl->bl_free += count; 248204076Spjd } 249204076Spjd} 250204076Spjd 251204076Spjd/* 252204076Spjd * blist_fill() - mark a region in the block bitmap as off-limits 253204076Spjd * to the allocator (i.e. allocate it), ignoring any 254204076Spjd * existing allocations. Return the number of blocks 255204076Spjd * actually filled that were free before the call. 
256204076Spjd */ 257204076Spjd 258204076Spjdint 259204076Spjdblist_fill(blist_t bl, daddr_t blkno, daddr_t count) 260204076Spjd{ 261204076Spjd int filled; 262204076Spjd 263204076Spjd if (bl) { 264204076Spjd if (bl->bl_radix == BLIST_BMAP_RADIX) 265204076Spjd filled = blst_leaf_fill(bl->bl_root, blkno, count); 266204076Spjd else 267204076Spjd filled = blst_meta_fill(bl->bl_root, blkno, count, 268204076Spjd bl->bl_radix, bl->bl_skip, 0); 269204076Spjd bl->bl_free -= filled; 270204076Spjd return filled; 271204076Spjd } else 272204076Spjd return 0; 273204076Spjd} 274204076Spjd 275204076Spjd/* 276204076Spjd * blist_resize() - resize an existing radix tree to handle the 277204076Spjd * specified number of blocks. This will reallocate 278204076Spjd * the tree and transfer the previous bitmap to the new 279204076Spjd * one. When extending the tree you can specify whether 280204076Spjd * the new blocks are to left allocated or freed. 281204076Spjd */ 282204076Spjd 283204076Spjdvoid 284204076Spjdblist_resize(blist_t *pbl, daddr_t count, int freenew, int flags) 285204076Spjd{ 286204076Spjd blist_t newbl = blist_create(count, flags); 287204076Spjd blist_t save = *pbl; 288204076Spjd 289204076Spjd *pbl = newbl; 290204076Spjd if (count > save->bl_blocks) 291204076Spjd count = save->bl_blocks; 292204076Spjd blst_copy(save->bl_root, 0, save->bl_radix, save->bl_skip, newbl, count); 293204076Spjd 294204076Spjd /* 295204076Spjd * If resizing upwards, should we free the new space or not? 
296204076Spjd */ 297204076Spjd if (freenew && count < newbl->bl_blocks) { 298204076Spjd blist_free(newbl, count, newbl->bl_blocks - count); 299204076Spjd } 300204076Spjd blist_destroy(save); 301204076Spjd} 302204076Spjd 303204076Spjd#ifdef BLIST_DEBUG 304204076Spjd 305204076Spjd/* 306204076Spjd * blist_print() - dump radix tree 307204076Spjd */ 308204076Spjd 309204076Spjdvoid 310204076Spjdblist_print(blist_t bl) 311204076Spjd{ 312204076Spjd printf("BLIST {\n"); 313204076Spjd blst_radix_print(bl->bl_root, 0, bl->bl_radix, bl->bl_skip, 4); 314204076Spjd printf("}\n"); 315204076Spjd} 316204076Spjd 317204076Spjd#endif 318204076Spjd 319204076Spjd/************************************************************************ 320204076Spjd * ALLOCATION SUPPORT FUNCTIONS * 321204076Spjd ************************************************************************ 322204076Spjd * 323204076Spjd * These support functions do all the actual work. They may seem 324204076Spjd * rather longish, but that's because I've commented them up. The 325204076Spjd * actual code is straight forward. 326204076Spjd * 327204076Spjd */ 328204076Spjd 329204076Spjd/* 330204076Spjd * blist_leaf_alloc() - allocate at a leaf in the radix tree (a bitmap). 331204076Spjd * 332204076Spjd * This is the core of the allocator and is optimized for the 1 block 333204076Spjd * and the BLIST_BMAP_RADIX block allocation cases. Other cases are 334204076Spjd * somewhat slower. The 1 block allocation case is log2 and extremely 335204076Spjd * quick. 336204076Spjd */ 337204076Spjd 338204076Spjdstatic daddr_t 339204076Spjdblst_leaf_alloc( 340204076Spjd blmeta_t *scan, 341204076Spjd daddr_t blk, 342204076Spjd int count 343204076Spjd) { 344204076Spjd u_daddr_t orig = scan->u.bmu_bitmap; 345204076Spjd 346204076Spjd if (orig == 0) { 347204076Spjd /* 348204076Spjd * Optimize bitmap all-allocated case. 
Also, count = 1 349204076Spjd * case assumes at least 1 bit is free in the bitmap, so 350204076Spjd * we have to take care of this case here. 351204076Spjd */ 352204076Spjd scan->bm_bighint = 0; 353204076Spjd return(SWAPBLK_NONE); 354204076Spjd } 355204076Spjd if (count == 1) { 356204076Spjd /* 357204076Spjd * Optimized code to allocate one bit out of the bitmap 358204076Spjd */ 359204076Spjd u_daddr_t mask; 360204076Spjd int j = BLIST_BMAP_RADIX/2; 361204076Spjd int r = 0; 362204076Spjd 363204076Spjd mask = (u_daddr_t)-1 >> (BLIST_BMAP_RADIX/2); 364204076Spjd 365204076Spjd while (j) { 366204076Spjd if ((orig & mask) == 0) { 367204076Spjd r += j; 368204076Spjd orig >>= j; 369204076Spjd } 370204076Spjd j >>= 1; 371204076Spjd mask >>= j; 372204076Spjd } 373204076Spjd scan->u.bmu_bitmap &= ~(1 << r); 374204076Spjd return(blk + r); 375204076Spjd } 376204076Spjd if (count <= BLIST_BMAP_RADIX) { 377204076Spjd /* 378204076Spjd * non-optimized code to allocate N bits out of the bitmap. 379204076Spjd * The more bits, the faster the code runs. It will run 380204076Spjd * the slowest allocating 2 bits, but since there aren't any 381204076Spjd * memory ops in the core loop (or shouldn't be, anyway), 382204076Spjd * you probably won't notice the difference. 383204076Spjd */ 384204076Spjd int j; 385204076Spjd int n = BLIST_BMAP_RADIX - count; 386204076Spjd u_daddr_t mask; 387204076Spjd 388204076Spjd mask = (u_daddr_t)-1 >> n; 389204076Spjd 390204076Spjd for (j = 0; j <= n; ++j) { 391204076Spjd if ((orig & mask) == mask) { 392204076Spjd scan->u.bmu_bitmap &= ~mask; 393204076Spjd return(blk + j); 394204076Spjd } 395204076Spjd mask = (mask << 1); 396204076Spjd } 397204076Spjd } 398204076Spjd /* 399204076Spjd * We couldn't allocate count in this subtree, update bighint. 400204076Spjd */ 401204076Spjd scan->bm_bighint = count - 1; 402204076Spjd return(SWAPBLK_NONE); 403204076Spjd} 404204076Spjd 405204076Spjd/* 406204076Spjd * blist_meta_alloc() - allocate at a meta in the radix tree. 
407204076Spjd * 408204076Spjd * Attempt to allocate at a meta node. If we can't, we update 409204076Spjd * bighint and return a failure. Updating bighint optimize future 410204076Spjd * calls that hit this node. We have to check for our collapse cases 411204076Spjd * and we have a few optimizations strewn in as well. 412204076Spjd */ 413204076Spjd 414204076Spjdstatic daddr_t 415204076Spjdblst_meta_alloc( 416204076Spjd blmeta_t *scan, 417204076Spjd daddr_t blk, 418204076Spjd daddr_t count, 419204076Spjd daddr_t radix, 420204076Spjd int skip 421204076Spjd) { 422204076Spjd int i; 423204076Spjd int next_skip = ((u_int)skip / BLIST_META_RADIX); 424204076Spjd 425204076Spjd if (scan->u.bmu_avail == 0) { 426204076Spjd /* 427204076Spjd * ALL-ALLOCATED special case 428204076Spjd */ 429204076Spjd scan->bm_bighint = count; 430204076Spjd return(SWAPBLK_NONE); 431204076Spjd } 432204076Spjd 433217965Spjd if (scan->u.bmu_avail == radix) { 434204076Spjd radix /= BLIST_META_RADIX; 435204076Spjd 436210909Sdougb /* 437204076Spjd * ALL-FREE special case, initialize uninitialize 438204076Spjd * sublevel. 
439204076Spjd */ 440204076Spjd for (i = 1; i <= skip; i += next_skip) { 441204076Spjd if (scan[i].bm_bighint == (daddr_t)-1) 442204076Spjd break; 443204076Spjd if (next_skip == 1) { 444204076Spjd scan[i].u.bmu_bitmap = (u_daddr_t)-1; 445204076Spjd scan[i].bm_bighint = BLIST_BMAP_RADIX; 446204076Spjd } else { 447204076Spjd scan[i].bm_bighint = radix; 448204076Spjd scan[i].u.bmu_avail = radix; 449204076Spjd } 450204076Spjd } 451204076Spjd } else { 452204076Spjd radix /= BLIST_META_RADIX; 453204076Spjd } 454204076Spjd 455204076Spjd for (i = 1; i <= skip; i += next_skip) { 456204076Spjd if (count <= scan[i].bm_bighint) { 457204076Spjd /* 458204076Spjd * count fits in object 459204076Spjd */ 460204076Spjd daddr_t r; 461204076Spjd if (next_skip == 1) { 462204076Spjd r = blst_leaf_alloc(&scan[i], blk, count); 463204076Spjd } else { 464204076Spjd r = blst_meta_alloc(&scan[i], blk, count, radix, next_skip - 1); 465204076Spjd } 466204076Spjd if (r != SWAPBLK_NONE) { 467204076Spjd scan->u.bmu_avail -= count; 468204076Spjd if (scan->bm_bighint > scan->u.bmu_avail) 469204076Spjd scan->bm_bighint = scan->u.bmu_avail; 470204076Spjd return(r); 471204076Spjd } 472204076Spjd } else if (scan[i].bm_bighint == (daddr_t)-1) { 473204076Spjd /* 474204076Spjd * Terminator 475204076Spjd */ 476204076Spjd break; 477204076Spjd } else if (count > radix) { 478204076Spjd /* 479204076Spjd * count does not fit in object even if it were 480204076Spjd * complete free. 481204076Spjd */ 482204076Spjd panic("blist_meta_alloc: allocation too large"); 483204076Spjd } 484204076Spjd blk += radix; 485204076Spjd } 486204076Spjd 487204076Spjd /* 488204076Spjd * We couldn't allocate count in this subtree, update bighint. 
489204076Spjd */ 490218201Sbz if (scan->bm_bighint >= count) 491204076Spjd scan->bm_bighint = count - 1; 492204076Spjd return(SWAPBLK_NONE); 493204076Spjd} 494218215Spjd 495218215Spjd/* 496218215Spjd * BLST_LEAF_FREE() - free allocated block from leaf bitmap 497218215Spjd * 498218215Spjd */ 499204076Spjd 500204076Spjdstatic void 501204076Spjdblst_leaf_free( 502204076Spjd blmeta_t *scan, 503204076Spjd daddr_t blk, 504204076Spjd int count 505204076Spjd) { 506204076Spjd /* 507204076Spjd * free some data in this bitmap 508204076Spjd * 509204076Spjd * e.g. 510204076Spjd * 0000111111111110000 511204076Spjd * \_________/\__/ 512204076Spjd * v n 513204076Spjd */ 514204076Spjd int n = blk & (BLIST_BMAP_RADIX - 1); 515204076Spjd u_daddr_t mask; 516204076Spjd 517204076Spjd mask = ((u_daddr_t)-1 << n) & 518204076Spjd ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n)); 519204076Spjd 520204076Spjd if (scan->u.bmu_bitmap & mask) 521204076Spjd panic("blst_radix_free: freeing free block"); 522204076Spjd scan->u.bmu_bitmap |= mask; 523204076Spjd 524204076Spjd /* 525204076Spjd * We could probably do a better job here. We are required to make 526204076Spjd * bighint at least as large as the biggest contiguous block of 527204076Spjd * data. If we just shoehorn it, a little extra overhead will 528204076Spjd * be incured on the next allocation (but only that one typically). 529204076Spjd */ 530204076Spjd scan->bm_bighint = BLIST_BMAP_RADIX; 531204076Spjd} 532204076Spjd 533/* 534 * BLST_META_FREE() - free allocated blocks from radix tree meta info 535 * 536 * This support routine frees a range of blocks from the bitmap. 537 * The range must be entirely enclosed by this radix node. If a 538 * meta node, we break the range down recursively to free blocks 539 * in subnodes (which means that this code can free an arbitrary 540 * range whereas the allocation code cannot allocate an arbitrary 541 * range). 
542 */ 543 544static void 545blst_meta_free( 546 blmeta_t *scan, 547 daddr_t freeBlk, 548 daddr_t count, 549 daddr_t radix, 550 int skip, 551 daddr_t blk 552) { 553 int i; 554 int next_skip = ((u_int)skip / BLIST_META_RADIX); 555 556#if 0 557 printf("FREE (%llx,%lld) FROM (%llx,%lld)\n", 558 (long long)freeBlk, (long long)count, 559 (long long)blk, (long long)radix 560 ); 561#endif 562 563 if (scan->u.bmu_avail == 0) { 564 /* 565 * ALL-ALLOCATED special case, with possible 566 * shortcut to ALL-FREE special case. 567 */ 568 scan->u.bmu_avail = count; 569 scan->bm_bighint = count; 570 571 if (count != radix) { 572 for (i = 1; i <= skip; i += next_skip) { 573 if (scan[i].bm_bighint == (daddr_t)-1) 574 break; 575 scan[i].bm_bighint = 0; 576 if (next_skip == 1) { 577 scan[i].u.bmu_bitmap = 0; 578 } else { 579 scan[i].u.bmu_avail = 0; 580 } 581 } 582 /* fall through */ 583 } 584 } else { 585 scan->u.bmu_avail += count; 586 /* scan->bm_bighint = radix; */ 587 } 588 589 /* 590 * ALL-FREE special case. 
591 */ 592 593 if (scan->u.bmu_avail == radix) 594 return; 595 if (scan->u.bmu_avail > radix) 596 panic("blst_meta_free: freeing already free blocks (%lld) %lld/%lld", 597 (long long)count, (long long)scan->u.bmu_avail, 598 (long long)radix); 599 600 /* 601 * Break the free down into its components 602 */ 603 604 radix /= BLIST_META_RADIX; 605 606 i = (freeBlk - blk) / radix; 607 blk += i * radix; 608 i = i * next_skip + 1; 609 610 while (i <= skip && blk < freeBlk + count) { 611 daddr_t v; 612 613 v = blk + radix - freeBlk; 614 if (v > count) 615 v = count; 616 617 if (scan->bm_bighint == (daddr_t)-1) 618 panic("blst_meta_free: freeing unexpected range"); 619 620 if (next_skip == 1) { 621 blst_leaf_free(&scan[i], freeBlk, v); 622 } else { 623 blst_meta_free(&scan[i], freeBlk, v, radix, next_skip - 1, blk); 624 } 625 if (scan->bm_bighint < scan[i].bm_bighint) 626 scan->bm_bighint = scan[i].bm_bighint; 627 count -= v; 628 freeBlk += v; 629 blk += radix; 630 i += next_skip; 631 } 632} 633 634/* 635 * BLIST_RADIX_COPY() - copy one radix tree to another 636 * 637 * Locates free space in the source tree and frees it in the destination 638 * tree. The space may not already be free in the destination. 
639 */ 640 641static void blst_copy( 642 blmeta_t *scan, 643 daddr_t blk, 644 daddr_t radix, 645 daddr_t skip, 646 blist_t dest, 647 daddr_t count 648) { 649 int next_skip; 650 int i; 651 652 /* 653 * Leaf node 654 */ 655 656 if (radix == BLIST_BMAP_RADIX) { 657 u_daddr_t v = scan->u.bmu_bitmap; 658 659 if (v == (u_daddr_t)-1) { 660 blist_free(dest, blk, count); 661 } else if (v != 0) { 662 int i; 663 664 for (i = 0; i < BLIST_BMAP_RADIX && i < count; ++i) { 665 if (v & (1 << i)) 666 blist_free(dest, blk + i, 1); 667 } 668 } 669 return; 670 } 671 672 /* 673 * Meta node 674 */ 675 676 if (scan->u.bmu_avail == 0) { 677 /* 678 * Source all allocated, leave dest allocated 679 */ 680 return; 681 } 682 if (scan->u.bmu_avail == radix) { 683 /* 684 * Source all free, free entire dest 685 */ 686 if (count < radix) 687 blist_free(dest, blk, count); 688 else 689 blist_free(dest, blk, radix); 690 return; 691 } 692 693 694 radix /= BLIST_META_RADIX; 695 next_skip = ((u_int)skip / BLIST_META_RADIX); 696 697 for (i = 1; count && i <= skip; i += next_skip) { 698 if (scan[i].bm_bighint == (daddr_t)-1) 699 break; 700 701 if (count >= radix) { 702 blst_copy( 703 &scan[i], 704 blk, 705 radix, 706 next_skip - 1, 707 dest, 708 radix 709 ); 710 count -= radix; 711 } else { 712 if (count) { 713 blst_copy( 714 &scan[i], 715 blk, 716 radix, 717 next_skip - 1, 718 dest, 719 count 720 ); 721 } 722 count = 0; 723 } 724 blk += radix; 725 } 726} 727 728/* 729 * BLST_LEAF_FILL() - allocate specific blocks in leaf bitmap 730 * 731 * This routine allocates all blocks in the specified range 732 * regardless of any existing allocations in that range. Returns 733 * the number of blocks allocated by the call. 
734 */ 735 736static int 737blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count) 738{ 739 int n = blk & (BLIST_BMAP_RADIX - 1); 740 int nblks; 741 u_daddr_t mask, bitmap; 742 743 mask = ((u_daddr_t)-1 << n) & 744 ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n)); 745 746 /* Count the number of blocks we're about to allocate */ 747 bitmap = scan->u.bmu_bitmap & mask; 748 for (nblks = 0; bitmap != 0; nblks++) 749 bitmap &= bitmap - 1; 750 751 scan->u.bmu_bitmap &= ~mask; 752 return nblks; 753} 754 755/* 756 * BLIST_META_FILL() - allocate specific blocks at a meta node 757 * 758 * This routine allocates the specified range of blocks, 759 * regardless of any existing allocations in the range. The 760 * range must be within the extent of this node. Returns the 761 * number of blocks allocated by the call. 762 */ 763static int 764blst_meta_fill( 765 blmeta_t *scan, 766 daddr_t allocBlk, 767 daddr_t count, 768 daddr_t radix, 769 int skip, 770 daddr_t blk 771) { 772 int i; 773 int next_skip = ((u_int)skip / BLIST_META_RADIX); 774 int nblks = 0; 775 776 if (count == radix || scan->u.bmu_avail == 0) { 777 /* 778 * ALL-ALLOCATED special case 779 */ 780 nblks = scan->u.bmu_avail; 781 scan->u.bmu_avail = 0; 782 scan->bm_bighint = count; 783 return nblks; 784 } 785 786 if (scan->u.bmu_avail == radix) { 787 radix /= BLIST_META_RADIX; 788 789 /* 790 * ALL-FREE special case, initialize sublevel 791 */ 792 for (i = 1; i <= skip; i += next_skip) { 793 if (scan[i].bm_bighint == (daddr_t)-1) 794 break; 795 if (next_skip == 1) { 796 scan[i].u.bmu_bitmap = (u_daddr_t)-1; 797 scan[i].bm_bighint = BLIST_BMAP_RADIX; 798 } else { 799 scan[i].bm_bighint = radix; 800 scan[i].u.bmu_avail = radix; 801 } 802 } 803 } else { 804 radix /= BLIST_META_RADIX; 805 } 806 807 if (count > radix) 808 panic("blist_meta_fill: allocation too large"); 809 810 i = (allocBlk - blk) / radix; 811 blk += i * radix; 812 i = i * next_skip + 1; 813 814 while (i <= skip && blk < allocBlk + count) { 815 daddr_t v; 816 
817 v = blk + radix - allocBlk; 818 if (v > count) 819 v = count; 820 821 if (scan->bm_bighint == (daddr_t)-1) 822 panic("blst_meta_fill: filling unexpected range"); 823 824 if (next_skip == 1) { 825 nblks += blst_leaf_fill(&scan[i], allocBlk, v); 826 } else { 827 nblks += blst_meta_fill(&scan[i], allocBlk, v, 828 radix, next_skip - 1, blk); 829 } 830 count -= v; 831 allocBlk += v; 832 blk += radix; 833 i += next_skip; 834 } 835 scan->u.bmu_avail -= nblks; 836 return nblks; 837} 838 839/* 840 * BLST_RADIX_INIT() - initialize radix tree 841 * 842 * Initialize our meta structures and bitmaps and calculate the exact 843 * amount of space required to manage 'count' blocks - this space may 844 * be considerably less then the calculated radix due to the large 845 * RADIX values we use. 846 */ 847 848static daddr_t 849blst_radix_init(blmeta_t *scan, daddr_t radix, int skip, daddr_t count) 850{ 851 int i; 852 int next_skip; 853 daddr_t memindex = 0; 854 855 /* 856 * Leaf node 857 */ 858 859 if (radix == BLIST_BMAP_RADIX) { 860 if (scan) { 861 scan->bm_bighint = 0; 862 scan->u.bmu_bitmap = 0; 863 } 864 return(memindex); 865 } 866 867 /* 868 * Meta node. If allocating the entire object we can special 869 * case it. However, we need to figure out how much memory 870 * is required to manage 'count' blocks, so we continue on anyway. 871 */ 872 873 if (scan) { 874 scan->bm_bighint = 0; 875 scan->u.bmu_avail = 0; 876 } 877 878 radix /= BLIST_META_RADIX; 879 next_skip = ((u_int)skip / BLIST_META_RADIX); 880 881 for (i = 1; i <= skip; i += next_skip) { 882 if (count >= radix) { 883 /* 884 * Allocate the entire object 885 */ 886 memindex = i + blst_radix_init( 887 ((scan) ? &scan[i] : NULL), 888 radix, 889 next_skip - 1, 890 radix 891 ); 892 count -= radix; 893 } else if (count > 0) { 894 /* 895 * Allocate a partial object 896 */ 897 memindex = i + blst_radix_init( 898 ((scan) ? 
&scan[i] : NULL), 899 radix, 900 next_skip - 1, 901 count 902 ); 903 count = 0; 904 } else { 905 /* 906 * Add terminator and break out 907 */ 908 if (scan) 909 scan[i].bm_bighint = (daddr_t)-1; 910 break; 911 } 912 } 913 if (memindex < i) 914 memindex = i; 915 return(memindex); 916} 917 918#ifdef BLIST_DEBUG 919 920static void 921blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int skip, int tab) 922{ 923 int i; 924 int next_skip; 925 int lastState = 0; 926 927 if (radix == BLIST_BMAP_RADIX) { 928 printf( 929 "%*.*s(%08llx,%lld): bitmap %08llx big=%lld\n", 930 tab, tab, "", 931 (long long)blk, (long long)radix, 932 (long long)scan->u.bmu_bitmap, 933 (long long)scan->bm_bighint 934 ); 935 return; 936 } 937 938 if (scan->u.bmu_avail == 0) { 939 printf( 940 "%*.*s(%08llx,%lld) ALL ALLOCATED\n", 941 tab, tab, "", 942 (long long)blk, 943 (long long)radix 944 ); 945 return; 946 } 947 if (scan->u.bmu_avail == radix) { 948 printf( 949 "%*.*s(%08llx,%lld) ALL FREE\n", 950 tab, tab, "", 951 (long long)blk, 952 (long long)radix 953 ); 954 return; 955 } 956 957 printf( 958 "%*.*s(%08llx,%lld): subtree (%lld/%lld) big=%lld {\n", 959 tab, tab, "", 960 (long long)blk, (long long)radix, 961 (long long)scan->u.bmu_avail, 962 (long long)radix, 963 (long long)scan->bm_bighint 964 ); 965 966 radix /= BLIST_META_RADIX; 967 next_skip = ((u_int)skip / BLIST_META_RADIX); 968 tab += 4; 969 970 for (i = 1; i <= skip; i += next_skip) { 971 if (scan[i].bm_bighint == (daddr_t)-1) { 972 printf( 973 "%*.*s(%08llx,%lld): Terminator\n", 974 tab, tab, "", 975 (long long)blk, (long long)radix 976 ); 977 lastState = 0; 978 break; 979 } 980 blst_radix_print( 981 &scan[i], 982 blk, 983 radix, 984 next_skip - 1, 985 tab 986 ); 987 blk += radix; 988 } 989 tab -= 4; 990 991 printf( 992 "%*.*s}\n", 993 tab, tab, "" 994 ); 995} 996 997#endif 998 999#ifdef BLIST_DEBUG 1000 1001int 1002main(int ac, char **av) 1003{ 1004 int size = 1024; 1005 int i; 1006 blist_t bl; 1007 1008 for (i = 1; i < ac; 
++i) { 1009 const char *ptr = av[i]; 1010 if (*ptr != '-') { 1011 size = strtol(ptr, NULL, 0); 1012 continue; 1013 } 1014 ptr += 2; 1015 fprintf(stderr, "Bad option: %s\n", ptr - 2); 1016 exit(1); 1017 } 1018 bl = blist_create(size, M_WAITOK); 1019 blist_free(bl, 0, size); 1020 1021 for (;;) { 1022 char buf[1024]; 1023 daddr_t da = 0; 1024 daddr_t count = 0; 1025 1026 1027 printf("%lld/%lld/%lld> ", (long long)bl->bl_free, 1028 (long long)size, (long long)bl->bl_radix); 1029 fflush(stdout); 1030 if (fgets(buf, sizeof(buf), stdin) == NULL) 1031 break; 1032 switch(buf[0]) { 1033 case 'r': 1034 if (sscanf(buf + 1, "%lld", &count) == 1) { 1035 blist_resize(&bl, count, 1); 1036 } else { 1037 printf("?\n"); 1038 } 1039 case 'p': 1040 blist_print(bl); 1041 break; 1042 case 'a': 1043 if (sscanf(buf + 1, "%lld", &count) == 1) { 1044 daddr_t blk = blist_alloc(bl, count); 1045 printf(" R=%08llx\n", (long long)blk); 1046 } else { 1047 printf("?\n"); 1048 } 1049 break; 1050 case 'f': 1051 if (sscanf(buf + 1, "%llx %lld", 1052 (long long *)&da, (long long *)&count) == 2) { 1053 blist_free(bl, da, count); 1054 } else { 1055 printf("?\n"); 1056 } 1057 break; 1058 case 'l': 1059 if (sscanf(buf + 1, "%llx %lld", 1060 (long long *)&da, (long long *)&count) == 2) { 1061 printf(" n=%d\n", 1062 blist_fill(bl, da, count)); 1063 } else { 1064 printf("?\n"); 1065 } 1066 break; 1067 case '?': 1068 case 'h': 1069 puts( 1070 "p -print\n" 1071 "a %d -allocate\n" 1072 "f %x %d -free\n" 1073 "l %x %d -fill\n" 1074 "r %d -resize\n" 1075 "h/? -help" 1076 ); 1077 break; 1078 default: 1079 printf("?\n"); 1080 break; 1081 } 1082 } 1083 return(0); 1084} 1085 1086void 1087panic(const char *ctl, ...) 1088{ 1089 va_list va; 1090 1091 va_start(va, ctl); 1092 vfprintf(stderr, ctl, va); 1093 fprintf(stderr, "\n"); 1094 va_end(va); 1095 exit(1); 1096} 1097 1098#endif 1099 1100