1/*- 2 * Copyright (c) 1998 Matthew Dillon. All Rights Reserved. 3 * Redistribution and use in source and binary forms, with or without 4 * modification, are permitted provided that the following conditions 5 * are met: 6 * 1. Redistributions of source code must retain the above copyright 7 * notice, this list of conditions and the following disclaimer. 8 * 2. Redistributions in binary form must reproduce the above copyright 9 * notice, this list of conditions and the following disclaimer in the 10 * documentation and/or other materials provided with the distribution. 11 * 4. Neither the name of the University nor the names of its contributors 12 * may be used to endorse or promote products derived from this software 13 * without specific prior written permission. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS 16 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY 19 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE 21 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 22 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 23 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 24 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 */ 27/* 28 * BLIST.C - Bitmap allocator/deallocator, using a radix tree with hinting 29 * 30 * This module implements a general bitmap allocator/deallocator. The 31 * allocator eats around 2 bits per 'block'. The module does not 32 * try to interpret the meaning of a 'block' other than to return 33 * SWAPBLK_NONE on an allocation failure. 
 *
 *	A radix tree controls access to pieces of the bitmap, and includes
 *	auxiliary information at each interior node about the availability of
 *	contiguous free blocks in the subtree rooted at that node.  Two radix
 *	constants are involved: one for the size of the bitmaps contained in the
 *	leaf nodes (BLIST_BMAP_RADIX), and one for the number of descendants of
 *	each of the meta (interior) nodes (BLIST_META_RADIX).  Each subtree is
 *	associated with a range of blocks.  The root of any subtree stores a
 *	hint field that defines an upper bound on the size of the largest
 *	allocation that can begin in the associated block range.  A hint is an
 *	upper bound on a potential allocation, but not necessarily a tight upper
 *	bound.
 *
 *	The radix tree also implements two collapsed states for meta nodes:
 *	the ALL-ALLOCATED state and the ALL-FREE state.  If a meta node is
 *	in either of these two states, all information contained underneath
 *	the node is considered stale.  These states are used to optimize
 *	allocation and freeing operations.
 *
 *	The hinting greatly increases code efficiency for allocations while
 *	the general radix structure optimizes both allocations and frees.  The
 *	radix tree should be able to operate well no matter how much
 *	fragmentation there is and no matter how large a bitmap is used.
 *
 *	The blist code wires all necessary memory at creation time.  Neither
 *	allocations nor frees require interaction with the memory subsystem.
 *	The non-blocking features of the blist code are used in the swap code
 *	(vm/swap_pager.c).
 *
 *	LAYOUT: The radix tree is laid out recursively using a
 *	linear array.  Each meta node is immediately followed (laid out
 *	sequentially in memory) by BLIST_META_RADIX lower level nodes.  This
 *	is a recursive structure but one that can be easily scanned through
 *	a very simple 'skip' calculation.
In order to support large radixes, 68 * portions of the tree may reside outside our memory allocation. We 69 * handle this with an early-termination optimization (when bighint is 70 * set to -1) on the scan. The memory allocation is only large enough 71 * to cover the number of blocks requested at creation time even if it 72 * must be encompassed in larger root-node radix. 73 * 74 * NOTE: the allocator cannot currently allocate more than 75 * BLIST_BMAP_RADIX blocks per call. It will panic with 'allocation too 76 * large' if you try. This is an area that could use improvement. The 77 * radix is large enough that this restriction does not effect the swap 78 * system, though. Currently only the allocation code is affected by 79 * this algorithmic unfeature. The freeing code can handle arbitrary 80 * ranges. 81 * 82 * This code can be compiled stand-alone for debugging. 83 */ 84 85#include <sys/cdefs.h> 86__FBSDID("$FreeBSD: stable/11/sys/kern/subr_blist.c 324384 2017-10-07 16:55:44Z alc $"); 87 88#ifdef _KERNEL 89 90#include <sys/param.h> 91#include <sys/systm.h> 92#include <sys/lock.h> 93#include <sys/kernel.h> 94#include <sys/blist.h> 95#include <sys/malloc.h> 96#include <sys/sbuf.h> 97#include <sys/proc.h> 98#include <sys/mutex.h> 99 100#else 101 102#ifndef BLIST_NO_DEBUG 103#define BLIST_DEBUG 104#endif 105 106#include <sys/types.h> 107#include <sys/malloc.h> 108#include <sys/sbuf.h> 109#include <stdio.h> 110#include <string.h> 111#include <stdlib.h> 112#include <stdarg.h> 113#include <stdbool.h> 114 115#define bitcount64(x) __bitcount64((uint64_t)(x)) 116#define malloc(a,b,c) calloc(a, 1) 117#define free(a,b) free(a) 118static __inline int imax(int a, int b) { return (a > b ? 
a : b); } 119 120#include <sys/blist.h> 121 122void panic(const char *ctl, ...); 123 124#endif 125 126/* 127 * static support functions 128 */ 129static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count); 130static daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count, 131 u_daddr_t radix); 132static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count); 133static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, 134 u_daddr_t radix); 135static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, 136 blist_t dest, daddr_t count); 137static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count); 138static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, 139 u_daddr_t radix); 140static daddr_t blst_radix_init(blmeta_t *scan, daddr_t radix, daddr_t count); 141#ifndef _KERNEL 142static void blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, 143 int tab); 144#endif 145 146#ifdef _KERNEL 147static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space"); 148#endif 149 150_Static_assert(BLIST_BMAP_RADIX % BLIST_META_RADIX == 0, 151 "radix divisibility error"); 152#define BLIST_BMAP_MASK (BLIST_BMAP_RADIX - 1) 153#define BLIST_META_MASK (BLIST_META_RADIX - 1) 154 155/* 156 * For a subtree that can represent the state of up to 'radix' blocks, the 157 * number of leaf nodes of the subtree is L=radix/BLIST_BMAP_RADIX. If 'm' 158 * is short for BLIST_META_RADIX, then for a tree of height h with L=m**h 159 * leaf nodes, the total number of tree nodes is 1 + m + m**2 + ... + m**h, 160 * or, equivalently, (m**(h+1)-1)/(m-1). This quantity is called 'skip' 161 * in the 'meta' functions that process subtrees. 
Since integer division 162 * discards remainders, we can express this computation as 163 * skip = (m * m**h) / (m - 1) 164 * skip = (m * (radix / BLIST_BMAP_RADIX)) / (m - 1) 165 * and since m divides BLIST_BMAP_RADIX, we can simplify further to 166 * skip = (radix / (BLIST_BMAP_RADIX / m)) / (m - 1) 167 * skip = radix / ((BLIST_BMAP_RADIX / m) * (m - 1)) 168 * so that simple integer division by a constant can safely be used for the 169 * calculation. 170 */ 171static inline daddr_t 172radix_to_skip(daddr_t radix) 173{ 174 175 return (radix / 176 ((BLIST_BMAP_RADIX / BLIST_META_RADIX) * BLIST_META_MASK)); 177} 178 179/* 180 * Use binary search, or a faster method, to find the 1 bit in a u_daddr_t. 181 * Assumes that the argument has only one bit set. 182 */ 183static inline int 184bitpos(u_daddr_t mask) 185{ 186 int hi, lo, mid; 187 188 switch (sizeof(mask)) { 189#ifdef HAVE_INLINE_FFSLL 190 case sizeof(long long): 191 return (ffsll(mask) - 1); 192#endif 193 default: 194 lo = 0; 195 hi = BLIST_BMAP_RADIX; 196 while (lo + 1 < hi) { 197 mid = (lo + hi) >> 1; 198 if ((mask >> mid) != 0) 199 lo = mid; 200 else 201 hi = mid; 202 } 203 return (lo); 204 } 205} 206 207/* 208 * blist_create() - create a blist capable of handling up to the specified 209 * number of blocks 210 * 211 * blocks - must be greater than 0 212 * flags - malloc flags 213 * 214 * The smallest blist consists of a single leaf node capable of 215 * managing BLIST_BMAP_RADIX blocks. 216 */ 217blist_t 218blist_create(daddr_t blocks, int flags) 219{ 220 blist_t bl; 221 daddr_t nodes, radix; 222 223 /* 224 * Calculate the radix field used for scanning. 
225 */ 226 radix = BLIST_BMAP_RADIX; 227 while (radix < blocks) { 228 radix *= BLIST_META_RADIX; 229 } 230 nodes = 1 + blst_radix_init(NULL, radix, blocks); 231 232 bl = malloc(sizeof(struct blist), M_SWAP, flags); 233 if (bl == NULL) 234 return (NULL); 235 236 bl->bl_blocks = blocks; 237 bl->bl_radix = radix; 238 bl->bl_cursor = 0; 239 bl->bl_root = malloc(nodes * sizeof(blmeta_t), M_SWAP, flags); 240 if (bl->bl_root == NULL) { 241 free(bl, M_SWAP); 242 return (NULL); 243 } 244 blst_radix_init(bl->bl_root, radix, blocks); 245 246#if defined(BLIST_DEBUG) 247 printf( 248 "BLIST representing %lld blocks (%lld MB of swap)" 249 ", requiring %lldK of ram\n", 250 (long long)bl->bl_blocks, 251 (long long)bl->bl_blocks * 4 / 1024, 252 (long long)(nodes * sizeof(blmeta_t) + 1023) / 1024 253 ); 254 printf("BLIST raw radix tree contains %lld records\n", 255 (long long)nodes); 256#endif 257 258 return (bl); 259} 260 261void 262blist_destroy(blist_t bl) 263{ 264 free(bl->bl_root, M_SWAP); 265 free(bl, M_SWAP); 266} 267 268/* 269 * blist_alloc() - reserve space in the block bitmap. Return the base 270 * of a contiguous region or SWAPBLK_NONE if space could 271 * not be allocated. 272 */ 273daddr_t 274blist_alloc(blist_t bl, daddr_t count) 275{ 276 daddr_t blk; 277 278 /* 279 * This loop iterates at most twice. An allocation failure in the 280 * first iteration leads to a second iteration only if the cursor was 281 * non-zero. When the cursor is zero, an allocation failure will 282 * reduce the hint, stopping further iterations. 283 */ 284 while (count <= bl->bl_root->bm_bighint) { 285 blk = blst_meta_alloc(bl->bl_root, bl->bl_cursor, count, 286 bl->bl_radix); 287 if (blk != SWAPBLK_NONE) { 288 bl->bl_cursor = blk + count; 289 if (bl->bl_cursor == bl->bl_blocks) 290 bl->bl_cursor = 0; 291 return (blk); 292 } else if (bl->bl_cursor != 0) 293 bl->bl_cursor = 0; 294 } 295 return (SWAPBLK_NONE); 296} 297 298/* 299 * blist_avail() - return the number of free blocks. 
300 */ 301daddr_t 302blist_avail(blist_t bl) 303{ 304 305 if (bl->bl_radix == BLIST_BMAP_RADIX) 306 return (bitcount64(bl->bl_root->u.bmu_bitmap)); 307 else 308 return (bl->bl_root->u.bmu_avail); 309} 310 311/* 312 * blist_free() - free up space in the block bitmap. Return the base 313 * of a contiguous region. Panic if an inconsistancy is 314 * found. 315 */ 316void 317blist_free(blist_t bl, daddr_t blkno, daddr_t count) 318{ 319 320 blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix); 321} 322 323/* 324 * blist_fill() - mark a region in the block bitmap as off-limits 325 * to the allocator (i.e. allocate it), ignoring any 326 * existing allocations. Return the number of blocks 327 * actually filled that were free before the call. 328 */ 329daddr_t 330blist_fill(blist_t bl, daddr_t blkno, daddr_t count) 331{ 332 333 return (blst_meta_fill(bl->bl_root, blkno, count, bl->bl_radix)); 334} 335 336/* 337 * blist_resize() - resize an existing radix tree to handle the 338 * specified number of blocks. This will reallocate 339 * the tree and transfer the previous bitmap to the new 340 * one. When extending the tree you can specify whether 341 * the new blocks are to left allocated or freed. 342 */ 343void 344blist_resize(blist_t *pbl, daddr_t count, int freenew, int flags) 345{ 346 blist_t newbl = blist_create(count, flags); 347 blist_t save = *pbl; 348 349 *pbl = newbl; 350 if (count > save->bl_blocks) 351 count = save->bl_blocks; 352 blst_copy(save->bl_root, 0, save->bl_radix, newbl, count); 353 354 /* 355 * If resizing upwards, should we free the new space or not? 
356 */ 357 if (freenew && count < newbl->bl_blocks) { 358 blist_free(newbl, count, newbl->bl_blocks - count); 359 } 360 blist_destroy(save); 361} 362 363#ifdef BLIST_DEBUG 364 365/* 366 * blist_print() - dump radix tree 367 */ 368void 369blist_print(blist_t bl) 370{ 371 printf("BLIST cursor = %08jx {\n", (uintmax_t)bl->bl_cursor); 372 blst_radix_print(bl->bl_root, 0, bl->bl_radix, 4); 373 printf("}\n"); 374} 375 376#endif 377 378static const u_daddr_t fib[] = { 379 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584, 380 4181, 6765, 10946, 17711, 28657, 46368, 75025, 121393, 196418, 317811, 381 514229, 832040, 1346269, 2178309, 3524578, 382}; 383 384/* 385 * Use 'gap' to describe a maximal range of unallocated blocks/bits. 386 */ 387struct gap_stats { 388 daddr_t start; /* current gap start, or SWAPBLK_NONE */ 389 daddr_t num; /* number of gaps observed */ 390 daddr_t max; /* largest gap size */ 391 daddr_t avg; /* average gap size */ 392 daddr_t err; /* sum - num * avg */ 393 daddr_t histo[nitems(fib)]; /* # gaps in each size range */ 394 int max_bucket; /* last histo elt with nonzero val */ 395}; 396 397/* 398 * gap_stats_counting() - is the state 'counting 1 bits'? 399 * or 'skipping 0 bits'? 
400 */ 401static inline bool 402gap_stats_counting(const struct gap_stats *stats) 403{ 404 405 return (stats->start != SWAPBLK_NONE); 406} 407 408/* 409 * init_gap_stats() - initialize stats on gap sizes 410 */ 411static inline void 412init_gap_stats(struct gap_stats *stats) 413{ 414 415 bzero(stats, sizeof(*stats)); 416 stats->start = SWAPBLK_NONE; 417} 418 419/* 420 * update_gap_stats() - update stats on gap sizes 421 */ 422static void 423update_gap_stats(struct gap_stats *stats, daddr_t posn) 424{ 425 daddr_t size; 426 int hi, lo, mid; 427 428 if (!gap_stats_counting(stats)) { 429 stats->start = posn; 430 return; 431 } 432 size = posn - stats->start; 433 stats->start = SWAPBLK_NONE; 434 if (size > stats->max) 435 stats->max = size; 436 437 /* 438 * Find the fibonacci range that contains size, 439 * expecting to find it in an early range. 440 */ 441 lo = 0; 442 hi = 1; 443 while (hi < nitems(fib) && fib[hi] <= size) { 444 lo = hi; 445 hi *= 2; 446 } 447 if (hi >= nitems(fib)) 448 hi = nitems(fib); 449 while (lo + 1 != hi) { 450 mid = (lo + hi) >> 1; 451 if (fib[mid] <= size) 452 lo = mid; 453 else 454 hi = mid; 455 } 456 stats->histo[lo]++; 457 if (lo > stats->max_bucket) 458 stats->max_bucket = lo; 459 stats->err += size - stats->avg; 460 stats->num++; 461 stats->avg += stats->err / stats->num; 462 stats->err %= stats->num; 463} 464 465/* 466 * dump_gap_stats() - print stats on gap sizes 467 */ 468static inline void 469dump_gap_stats(const struct gap_stats *stats, struct sbuf *s) 470{ 471 int i; 472 473 sbuf_printf(s, "number of maximal free ranges: %jd\n", 474 (intmax_t)stats->num); 475 sbuf_printf(s, "largest free range: %jd\n", (intmax_t)stats->max); 476 sbuf_printf(s, "average maximal free range size: %jd\n", 477 (intmax_t)stats->avg); 478 sbuf_printf(s, "number of maximal free ranges of different sizes:\n"); 479 sbuf_printf(s, " count | size range\n"); 480 sbuf_printf(s, " ----- | ----------\n"); 481 for (i = 0; i < stats->max_bucket; i++) { 482 if 
(stats->histo[i] != 0) { 483 sbuf_printf(s, "%20jd | ", 484 (intmax_t)stats->histo[i]); 485 if (fib[i] != fib[i + 1] - 1) 486 sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i], 487 (intmax_t)fib[i + 1] - 1); 488 else 489 sbuf_printf(s, "%jd\n", (intmax_t)fib[i]); 490 } 491 } 492 sbuf_printf(s, "%20jd | ", (intmax_t)stats->histo[i]); 493 if (stats->histo[i] > 1) 494 sbuf_printf(s, "%jd to %jd\n", (intmax_t)fib[i], 495 (intmax_t)stats->max); 496 else 497 sbuf_printf(s, "%jd\n", (intmax_t)stats->max); 498} 499 500/* 501 * blist_stats() - dump radix tree stats 502 */ 503void 504blist_stats(blist_t bl, struct sbuf *s) 505{ 506 struct gap_stats gstats; 507 struct gap_stats *stats = &gstats; 508 daddr_t i, nodes, radix; 509 u_daddr_t bit, diff, mask; 510 511 init_gap_stats(stats); 512 nodes = 0; 513 i = bl->bl_radix; 514 while (i < bl->bl_radix + bl->bl_blocks) { 515 /* 516 * Find max size subtree starting at i. 517 */ 518 radix = BLIST_BMAP_RADIX; 519 while (((i / radix) & BLIST_META_MASK) == 0) 520 radix *= BLIST_META_RADIX; 521 522 /* 523 * Check for skippable subtrees starting at i. 524 */ 525 while (radix > BLIST_BMAP_RADIX) { 526 if (bl->bl_root[nodes].u.bmu_avail == 0) { 527 if (gap_stats_counting(stats)) 528 update_gap_stats(stats, i); 529 break; 530 } 531 if (bl->bl_root[nodes].u.bmu_avail == radix) { 532 if (!gap_stats_counting(stats)) 533 update_gap_stats(stats, i); 534 break; 535 } 536 537 /* 538 * Skip subtree root. 539 */ 540 nodes++; 541 radix /= BLIST_META_RADIX; 542 } 543 if (radix == BLIST_BMAP_RADIX) { 544 /* 545 * Scan leaf. 
546 */ 547 mask = bl->bl_root[nodes].u.bmu_bitmap; 548 diff = mask ^ (mask << 1); 549 if (gap_stats_counting(stats)) 550 diff ^= 1; 551 while (diff != 0) { 552 bit = diff & -diff; 553 update_gap_stats(stats, i + bitpos(bit)); 554 diff ^= bit; 555 } 556 } 557 nodes += radix_to_skip(radix); 558 i += radix; 559 } 560 update_gap_stats(stats, i); 561 dump_gap_stats(stats, s); 562} 563 564/************************************************************************ 565 * ALLOCATION SUPPORT FUNCTIONS * 566 ************************************************************************ 567 * 568 * These support functions do all the actual work. They may seem 569 * rather longish, but that's because I've commented them up. The 570 * actual code is straight forward. 571 * 572 */ 573 574/* 575 * blist_leaf_alloc() - allocate at a leaf in the radix tree (a bitmap). 576 * 577 * This is the core of the allocator and is optimized for the 578 * BLIST_BMAP_RADIX block allocation case. Otherwise, execution 579 * time is proportional to log2(count) + bitpos time. 580 */ 581static daddr_t 582blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count) 583{ 584 u_daddr_t mask; 585 int count1, hi, lo, num_shifts, range1, range_ext; 586 587 range1 = 0; 588 count1 = count - 1; 589 num_shifts = fls(count1); 590 mask = scan->u.bmu_bitmap; 591 while ((-mask & ~mask) != 0 && num_shifts > 0) { 592 /* 593 * If bit i is set in mask, then bits in [i, i+range1] are set 594 * in scan->u.bmu_bitmap. The value of range1 is equal to 595 * count1 >> num_shifts. Grow range and reduce num_shifts to 0, 596 * while preserving these invariants. The updates to mask leave 597 * fewer bits set, but each bit that remains set represents a 598 * longer string of consecutive bits set in scan->u.bmu_bitmap. 599 * If more updates to mask cannot clear more bits, because mask 600 * is partitioned with all 0 bits preceding all 1 bits, the loop 601 * terminates immediately. 
602 */ 603 num_shifts--; 604 range_ext = range1 + ((count1 >> num_shifts) & 1); 605 /* 606 * mask is a signed quantity for the shift because when it is 607 * shifted right, the sign bit should copied; when the last 608 * block of the leaf is free, pretend, for a while, that all the 609 * blocks that follow it are also free. 610 */ 611 mask &= (daddr_t)mask >> range_ext; 612 range1 += range_ext; 613 } 614 if (mask == 0) { 615 /* 616 * Update bighint. There is no allocation bigger than range1 617 * starting in this leaf. 618 */ 619 scan->bm_bighint = range1; 620 return (SWAPBLK_NONE); 621 } 622 623 /* Discard any candidates that appear before blk. */ 624 mask &= (u_daddr_t)-1 << (blk & BLIST_BMAP_MASK); 625 if (mask == 0) 626 return (SWAPBLK_NONE); 627 628 /* 629 * The least significant set bit in mask marks the start of the first 630 * available range of sufficient size. Clear all the bits but that one, 631 * and then find its position. 632 */ 633 mask &= -mask; 634 lo = bitpos(mask); 635 636 hi = lo + count; 637 if (hi > BLIST_BMAP_RADIX) { 638 /* 639 * An allocation within this leaf is impossible, so a successful 640 * allocation depends on the next leaf providing some of the blocks. 641 */ 642 if (((blk / BLIST_BMAP_RADIX + 1) & BLIST_META_MASK) == 0) { 643 /* 644 * The next leaf has a different meta-node parent, so it 645 * is not necessarily initialized. Update bighint, 646 * comparing the range found at the end of mask to the 647 * largest earlier range that could have been made to 648 * vanish in the initial processing of mask. 649 */ 650 scan->bm_bighint = imax(BLIST_BMAP_RADIX - lo, range1); 651 return (SWAPBLK_NONE); 652 } 653 hi -= BLIST_BMAP_RADIX; 654 if (((scan[1].u.bmu_bitmap + 1) & ~((u_daddr_t)-1 << hi)) != 0) { 655 /* 656 * The next leaf doesn't have enough free blocks at the 657 * beginning to complete the spanning allocation. 
The 658 * hint cannot be updated, because the same allocation 659 * request could be satisfied later, by this leaf, if 660 * the state of the next leaf changes, and without any 661 * changes to this leaf. 662 */ 663 return (SWAPBLK_NONE); 664 } 665 /* Clear the first 'hi' bits in the next leaf, allocating them. */ 666 scan[1].u.bmu_bitmap &= (u_daddr_t)-1 << hi; 667 hi = BLIST_BMAP_RADIX; 668 } 669 670 /* Set the bits of mask at position 'lo' and higher. */ 671 mask = -mask; 672 if (hi == BLIST_BMAP_RADIX) { 673 /* 674 * Update bighint. There is no allocation bigger than range1 675 * available in this leaf after this allocation completes. 676 */ 677 scan->bm_bighint = range1; 678 } else { 679 /* Clear the bits of mask at position 'hi' and higher. */ 680 mask &= (u_daddr_t)-1 >> (BLIST_BMAP_RADIX - hi); 681 /* If this allocation uses all the bits, clear the hint. */ 682 if (mask == scan->u.bmu_bitmap) 683 scan->bm_bighint = 0; 684 } 685 /* Clear the allocated bits from this leaf. */ 686 scan->u.bmu_bitmap &= ~mask; 687 return ((blk & ~BLIST_BMAP_MASK) + lo); 688} 689 690/* 691 * blist_meta_alloc() - allocate at a meta in the radix tree. 692 * 693 * Attempt to allocate at a meta node. If we can't, we update 694 * bighint and return a failure. Updating bighint optimize future 695 * calls that hit this node. We have to check for our collapse cases 696 * and we have a few optimizations strewn in as well. 697 */ 698static daddr_t 699blst_meta_alloc(blmeta_t *scan, daddr_t cursor, daddr_t count, u_daddr_t radix) 700{ 701 daddr_t blk, i, next_skip, r, skip; 702 int child; 703 bool scan_from_start; 704 705 if (radix == BLIST_BMAP_RADIX) 706 return (blst_leaf_alloc(scan, cursor, count)); 707 if (scan->u.bmu_avail < count) { 708 /* 709 * The meta node's hint must be too large if the allocation 710 * exceeds the number of free blocks. Reduce the hint, and 711 * return failure. 
712 */ 713 scan->bm_bighint = scan->u.bmu_avail; 714 return (SWAPBLK_NONE); 715 } 716 blk = cursor & -radix; 717 skip = radix_to_skip(radix); 718 next_skip = skip / BLIST_META_RADIX; 719 720 /* 721 * An ALL-FREE meta node requires special handling before allocating 722 * any of its blocks. 723 */ 724 if (scan->u.bmu_avail == radix) { 725 radix /= BLIST_META_RADIX; 726 727 /* 728 * Reinitialize each of the meta node's children. An ALL-FREE 729 * meta node cannot have a terminator in any subtree. 730 */ 731 for (i = 1; i < skip; i += next_skip) { 732 if (next_skip == 1) 733 scan[i].u.bmu_bitmap = (u_daddr_t)-1; 734 else 735 scan[i].u.bmu_avail = radix; 736 scan[i].bm_bighint = radix; 737 } 738 } else { 739 radix /= BLIST_META_RADIX; 740 } 741 742 if (count > radix) { 743 /* 744 * The allocation exceeds the number of blocks that are 745 * managed by a subtree of this meta node. 746 */ 747 panic("allocation too large"); 748 } 749 scan_from_start = cursor == blk; 750 child = (cursor - blk) / radix; 751 blk += child * radix; 752 for (i = 1 + child * next_skip; i < skip; i += next_skip) { 753 if (count <= scan[i].bm_bighint) { 754 /* 755 * The allocation might fit beginning in the i'th subtree. 756 */ 757 r = blst_meta_alloc(&scan[i], 758 cursor > blk ? cursor : blk, count, radix); 759 if (r != SWAPBLK_NONE) { 760 scan->u.bmu_avail -= count; 761 return (r); 762 } 763 } else if (scan[i].bm_bighint == (daddr_t)-1) { 764 /* 765 * Terminator 766 */ 767 break; 768 } 769 blk += radix; 770 } 771 772 /* 773 * We couldn't allocate count in this subtree, update bighint. 
774 */ 775 if (scan_from_start && scan->bm_bighint >= count) 776 scan->bm_bighint = count - 1; 777 778 return (SWAPBLK_NONE); 779} 780 781/* 782 * BLST_LEAF_FREE() - free allocated block from leaf bitmap 783 * 784 */ 785static void 786blst_leaf_free(blmeta_t *scan, daddr_t blk, int count) 787{ 788 u_daddr_t mask; 789 int n; 790 791 /* 792 * free some data in this bitmap 793 * mask=0000111111111110000 794 * \_________/\__/ 795 * count n 796 */ 797 n = blk & BLIST_BMAP_MASK; 798 mask = ((u_daddr_t)-1 << n) & 799 ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n)); 800 if (scan->u.bmu_bitmap & mask) 801 panic("freeing free block"); 802 scan->u.bmu_bitmap |= mask; 803 804 /* 805 * We could probably do a better job here. We are required to make 806 * bighint at least as large as the biggest contiguous block of 807 * data. If we just shoehorn it, a little extra overhead will 808 * be incured on the next allocation (but only that one typically). 809 */ 810 scan->bm_bighint = BLIST_BMAP_RADIX; 811} 812 813/* 814 * BLST_META_FREE() - free allocated blocks from radix tree meta info 815 * 816 * This support routine frees a range of blocks from the bitmap. 817 * The range must be entirely enclosed by this radix node. If a 818 * meta node, we break the range down recursively to free blocks 819 * in subnodes (which means that this code can free an arbitrary 820 * range whereas the allocation code cannot allocate an arbitrary 821 * range). 822 */ 823static void 824blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, u_daddr_t radix) 825{ 826 daddr_t blk, i, next_skip, skip, v; 827 int child; 828 829 if (scan->bm_bighint == (daddr_t)-1) 830 panic("freeing invalid range"); 831 if (radix == BLIST_BMAP_RADIX) 832 return (blst_leaf_free(scan, freeBlk, count)); 833 skip = radix_to_skip(radix); 834 next_skip = skip / BLIST_META_RADIX; 835 836 if (scan->u.bmu_avail == 0) { 837 /* 838 * ALL-ALLOCATED special case, with possible 839 * shortcut to ALL-FREE special case. 
840 */ 841 scan->u.bmu_avail = count; 842 scan->bm_bighint = count; 843 844 if (count != radix) { 845 for (i = 1; i < skip; i += next_skip) { 846 if (scan[i].bm_bighint == (daddr_t)-1) 847 break; 848 scan[i].bm_bighint = 0; 849 if (next_skip == 1) { 850 scan[i].u.bmu_bitmap = 0; 851 } else { 852 scan[i].u.bmu_avail = 0; 853 } 854 } 855 /* fall through */ 856 } 857 } else { 858 scan->u.bmu_avail += count; 859 /* scan->bm_bighint = radix; */ 860 } 861 862 /* 863 * ALL-FREE special case. 864 */ 865 866 if (scan->u.bmu_avail == radix) 867 return; 868 if (scan->u.bmu_avail > radix) 869 panic("blst_meta_free: freeing already free blocks (%lld) %lld/%lld", 870 (long long)count, (long long)scan->u.bmu_avail, 871 (long long)radix); 872 873 /* 874 * Break the free down into its components 875 */ 876 877 blk = freeBlk & -radix; 878 radix /= BLIST_META_RADIX; 879 880 child = (freeBlk - blk) / radix; 881 blk += child * radix; 882 i = 1 + child * next_skip; 883 while (i < skip && blk < freeBlk + count) { 884 v = blk + radix - freeBlk; 885 if (v > count) 886 v = count; 887 blst_meta_free(&scan[i], freeBlk, v, radix); 888 if (scan->bm_bighint < scan[i].bm_bighint) 889 scan->bm_bighint = scan[i].bm_bighint; 890 count -= v; 891 freeBlk += v; 892 blk += radix; 893 i += next_skip; 894 } 895} 896 897/* 898 * BLIST_RADIX_COPY() - copy one radix tree to another 899 * 900 * Locates free space in the source tree and frees it in the destination 901 * tree. The space may not already be free in the destination. 
902 */ 903static void 904blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, blist_t dest, 905 daddr_t count) 906{ 907 daddr_t i, next_skip, skip; 908 909 /* 910 * Leaf node 911 */ 912 913 if (radix == BLIST_BMAP_RADIX) { 914 u_daddr_t v = scan->u.bmu_bitmap; 915 916 if (v == (u_daddr_t)-1) { 917 blist_free(dest, blk, count); 918 } else if (v != 0) { 919 int i; 920 921 for (i = 0; i < BLIST_BMAP_RADIX && i < count; ++i) { 922 if (v & ((u_daddr_t)1 << i)) 923 blist_free(dest, blk + i, 1); 924 } 925 } 926 return; 927 } 928 929 /* 930 * Meta node 931 */ 932 933 if (scan->u.bmu_avail == 0) { 934 /* 935 * Source all allocated, leave dest allocated 936 */ 937 return; 938 } 939 if (scan->u.bmu_avail == radix) { 940 /* 941 * Source all free, free entire dest 942 */ 943 if (count < radix) 944 blist_free(dest, blk, count); 945 else 946 blist_free(dest, blk, radix); 947 return; 948 } 949 950 951 skip = radix_to_skip(radix); 952 next_skip = skip / BLIST_META_RADIX; 953 radix /= BLIST_META_RADIX; 954 955 for (i = 1; count && i < skip; i += next_skip) { 956 if (scan[i].bm_bighint == (daddr_t)-1) 957 break; 958 959 if (count >= radix) { 960 blst_copy(&scan[i], blk, radix, dest, radix); 961 count -= radix; 962 } else { 963 if (count) { 964 blst_copy(&scan[i], blk, radix, dest, count); 965 } 966 count = 0; 967 } 968 blk += radix; 969 } 970} 971 972/* 973 * BLST_LEAF_FILL() - allocate specific blocks in leaf bitmap 974 * 975 * This routine allocates all blocks in the specified range 976 * regardless of any existing allocations in that range. Returns 977 * the number of blocks allocated by the call. 978 */ 979static daddr_t 980blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count) 981{ 982 daddr_t nblks; 983 u_daddr_t mask; 984 int n; 985 986 n = blk & BLIST_BMAP_MASK; 987 mask = ((u_daddr_t)-1 << n) & 988 ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n)); 989 990 /* Count the number of blocks that we are allocating. 
 */
	/* Count only the blocks that were actually free before this fill. */
	nblks = bitcount64(scan->u.bmu_bitmap & mask);

	scan->u.bmu_bitmap &= ~mask;
	return (nblks);
}

/*
 * BLIST_META_FILL() -	allocate specific blocks at a meta node
 *
 *	This routine allocates the specified range of blocks,
 *	regardless of any existing allocations in the range.  The
 *	range must be within the extent of this node.  Returns the
 *	number of blocks allocated by the call.
 */
static daddr_t
blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, u_daddr_t radix)
{
	daddr_t blk, i, nblks, next_skip, skip, v;
	int child;

	/* A bighint of -1 marks a terminator node; filling it is a bug. */
	if (scan->bm_bighint == (daddr_t)-1)
		panic("filling invalid range");
	if (count > radix) {
		/*
		 * The allocation exceeds the number of blocks that are
		 * managed by this node.
		 */
		panic("fill too large");
	}
	if (radix == BLIST_BMAP_RADIX)
		return (blst_leaf_fill(scan, allocBlk, count));
	if (count == radix || scan->u.bmu_avail == 0) {
		/*
		 * ALL-ALLOCATED special case: either the fill covers this
		 * entire subtree, or nothing here is free anyway.  Collapse
		 * the node; whatever was still available is now allocated.
		 */
		nblks = scan->u.bmu_avail;
		scan->u.bmu_avail = 0;
		scan->bm_bighint = 0;
		return (nblks);
	}
	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;
	/*
	 * Round allocBlk down to this subtree's base block.  This relies on
	 * radix being a power of 2, so -radix is the alignment mask.
	 */
	blk = allocBlk & -radix;

	/*
	 * An ALL-FREE meta node requires special handling before allocating
	 * any of its blocks.
	 */
	if (scan->u.bmu_avail == radix) {
		radix /= BLIST_META_RADIX;

		/*
		 * Reinitialize each of the meta node's children.  An ALL-FREE
		 * meta node cannot have a terminator in any subtree.
		 */
		for (i = 1; i < skip; i += next_skip) {
			if (next_skip == 1)
				scan[i].u.bmu_bitmap = (u_daddr_t)-1;
			else
				scan[i].u.bmu_avail = radix;
			scan[i].bm_bighint = radix;
		}
	} else {
		radix /= BLIST_META_RADIX;
	}

	nblks = 0;
	/* Locate the first child whose range intersects [allocBlk, +count). */
	child = (allocBlk - blk) / radix;
	blk += child * radix;
	i = 1 + child * next_skip;
	/* Walk the children, filling the portion of the range each covers. */
	while (i < skip && blk < allocBlk + count) {
		/* v = number of requested blocks that fall in this child. */
		v = blk + radix - allocBlk;
		if (v > count)
			v = count;
		nblks += blst_meta_fill(&scan[i], allocBlk, v, radix);
		count -= v;
		allocBlk += v;
		blk += radix;
		i += next_skip;
	}
	/* Only the blocks that were actually free reduce availability. */
	scan->u.bmu_avail -= nblks;
	return (nblks);
}

/*
 * BLST_RADIX_INIT() - initialize radix tree
 *
 *	Initialize our meta structures and bitmaps and calculate the exact
 *	amount of space required to manage 'count' blocks - this space may
 *	be considerably less than the calculated radix due to the large
 *	RADIX values we use.
 *
 *	May be called with scan == NULL purely to size the tree: in that
 *	case nothing is written and only the node count is returned.
 */
static daddr_t
blst_radix_init(blmeta_t *scan, daddr_t radix, daddr_t count)
{
	daddr_t i, memindex, next_skip, skip;

	memindex = 0;

	/*
	 * Leaf node
	 */

	if (radix == BLIST_BMAP_RADIX) {
		if (scan) {
			scan->bm_bighint = 0;
			scan->u.bmu_bitmap = 0;
		}
		return (memindex);
	}

	/*
	 * Meta node.  If allocating the entire object we can special
	 * case it.  However, we need to figure out how much memory
	 * is required to manage 'count' blocks, so we continue on anyway.
	 */

	if (scan) {
		scan->bm_bighint = 0;
		scan->u.bmu_avail = 0;
	}

	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;
	radix /= BLIST_META_RADIX;

	for (i = 1; i < skip; i += next_skip) {
		if (count >= radix) {
			/*
			 * Allocate the entire object
			 */
			memindex = i +
			    blst_radix_init(((scan) ? &scan[i] : NULL), radix,
			    radix);
			count -= radix;
		} else if (count > 0) {
			/*
			 * Allocate a partial object
			 */
			memindex = i +
			    blst_radix_init(((scan) ? &scan[i] : NULL), radix,
			    count);
			count = 0;
		} else {
			/*
			 * Add terminator and break out.  Make terminator bitmap
			 * zero to avoid a spanning leaf allocation that
			 * includes the terminator.
			 */
			if (scan) {
				scan[i].bm_bighint = (daddr_t)-1;
				scan[i].u.bmu_bitmap = 0;
			}
			break;
		}
	}
	/* Account for the last child visited even if it added no nodes. */
	if (memindex < i)
		memindex = i;
	return (memindex);
}

#ifdef BLIST_DEBUG

/*
 * BLST_RADIX_PRINT() - dump the subtree rooted at 'scan' for debugging.
 *
 *	Recursively prints each node, taking the collapsed ALL-ALLOCATED
 *	and ALL-FREE states as stopping points (the information below such
 *	nodes is stale and is not printed).  'tab' is the indentation depth.
 */
static void
blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int tab)
{
	daddr_t i, next_skip, skip;

	if (radix == BLIST_BMAP_RADIX) {
		printf(
		    "%*.*s(%08llx,%lld): bitmap %016llx big=%lld\n",
		    tab, tab, "",
		    (long long)blk, (long long)radix,
		    (long long)scan->u.bmu_bitmap,
		    (long long)scan->bm_bighint
		);
		return;
	}

	if (scan->u.bmu_avail == 0) {
		printf(
		    "%*.*s(%08llx,%lld) ALL ALLOCATED\n",
		    tab, tab, "",
		    (long long)blk,
		    (long long)radix
		);
		return;
	}
	if (scan->u.bmu_avail == radix) {
		printf(
		    "%*.*s(%08llx,%lld) ALL FREE\n",
		    tab, tab, "",
		    (long long)blk,
		    (long long)radix
		);
		return;
	}

	printf(
	    "%*.*s(%08llx,%lld): subtree (%lld/%lld) big=%lld {\n",
	    tab, tab, "",
	    (long long)blk, (long long)radix,
	    (long long)scan->u.bmu_avail,
	    (long long)radix,
	    (long long)scan->bm_bighint
	);

	skip = radix_to_skip(radix);
	next_skip = skip / BLIST_META_RADIX;
	radix /= BLIST_META_RADIX;
	tab += 4;

	for (i = 1; i < skip; i += next_skip) {
		/* A bighint of -1 marks the terminator; nothing follows it. */
		if (scan[i].bm_bighint == (daddr_t)-1) {
			printf(
			    "%*.*s(%08llx,%lld): Terminator\n",
			    tab, tab, "",
			    (long long)blk, (long long)radix
			);
			break;
		}
		blst_radix_print(&scan[i], blk, radix, tab);
		blk += radix;
	}
	tab -= 4;

	printf(
	    "%*.*s}\n",
	    tab, tab, ""
	);
}

#endif

#ifdef BLIST_DEBUG

/*
 * Interactive userland test harness: reads one-letter commands from stdin
 * to print, resize, allocate, free, and fill a blist.  Built only when
 * BLIST_DEBUG is defined (i.e., when compiling this file as a standalone
 * program rather than as part of the kernel).
 */
int
main(int ac, char **av)
{
	int size = 1024;
	int i;
	blist_t bl;
	struct sbuf *s;

	/* A bare numeric argument sets the initial blist size. */
	for (i = 1; i < ac; ++i) {
		const char *ptr = av[i];
		if (*ptr != '-') {
			size = strtol(ptr, NULL, 0);
			continue;
		}
		ptr += 2;
		fprintf(stderr, "Bad option: %s\n", ptr - 2);
		exit(1);
	}
	bl = blist_create(size, M_WAITOK);
	/* A freshly created blist starts all-allocated; free everything. */
	blist_free(bl, 0, size);

	for (;;) {
		char buf[1024];
		long long da = 0;
		long long count = 0;

		/* Prompt shows: available / total size / tree radix. */
		printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
		    (long long)size, (long long)bl->bl_radix);
		fflush(stdout);
		if (fgets(buf, sizeof(buf), stdin) == NULL)
			break;
		switch(buf[0]) {
		case 'r':
			if (sscanf(buf + 1, "%lld", &count) == 1) {
				blist_resize(&bl, count, 1, M_WAITOK);
			} else {
				printf("?\n");
			}
			/* FALLTHROUGH: print the tree after a resize. */
		case 'p':
			blist_print(bl);
			break;
		case 's':
			s = sbuf_new_auto();
			blist_stats(bl, s);
			sbuf_finish(s);
			printf("%s", sbuf_data(s));
			sbuf_delete(s);
			break;
		case 'a':
			if (sscanf(buf + 1, "%lld", &count) == 1) {
				daddr_t blk = blist_alloc(bl, count);
				printf("    R=%08llx\n", (long long)blk);
			} else {
				printf("?\n");
			}
			break;
		case 'f':
			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
				blist_free(bl, da, count);
			} else {
				printf("?\n");
			}
			break;
		case 'l':
			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
				printf("    n=%jd\n",
				    (intmax_t)blist_fill(bl, da, count));
			} else {
				printf("?\n");
			}
			break;
		case '?':
		case 'h':
			puts(
			    "p -print\n"
			    "s -stats\n"
			    "a %d -allocate\n"
			    "f %x %d -free\n"
			    "l %x %d -fill\n"
			    "r %d -resize\n"
			    "h/? -help"
			);
			break;
		default:
			printf("?\n");
			break;
		}
	}
	return(0);
}

/*
 * Userland stand-in for the kernel panic(): report the formatted message
 * on stderr and exit with a failure status.
 */
void
panic(const char *ctl, ...)
{
	va_list va;

	va_start(va, ctl);
	vfprintf(stderr, ctl, va);
	fprintf(stderr, "\n");
	va_end(va);
	exit(1);
}

#endif