// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2020 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_btree.h"
#include "xfs_trace.h"
#include "xfs_btree_staging.h"

/*
 * Staging Cursors and Fake Roots for Btrees
 * =========================================
 *
 * A staging btree cursor is a special type of btree cursor that callers must
 * use to construct a new btree index using the btree bulk loader code. The
 * bulk loading code uses the staging btree cursor to abstract the details of
 * initializing new btree blocks and filling them with records or key/ptr
 * pairs. Regular btree operations (e.g. queries and modifications) are not
 * supported with staging cursors, and callers must not invoke them.
 *
 * Fake root structures contain all the information about a btree that is
 * under construction by the bulk loading code. Staging btree cursors point
 * to fake root structures instead of the usual AG header or inode structure.
 *
 * Callers are expected to initialize a fake root structure and pass it into
 * the _stage_cursor function for a specific btree type. When bulk loading is
 * complete, callers should call the _commit_staged_btree function for that
 * specific btree type to commit the new btree into the filesystem.
 */

/*
 * Bulk Loading for AG Btrees
 * ==========================
 *
 * For a btree rooted in an AG header, pass an xbtree_afakeroot structure to
 * the staging cursor. Callers should initialize this to zero.
 *
 * The _stage_cursor() function for a specific btree type should call
 * xfs_btree_stage_afakeroot to set up the in-memory cursor as a staging
 * cursor. The corresponding _commit_staged_btree() function should log the
 * new root and call xfs_btree_commit_afakeroot() to transform the staging
 * cursor into a regular btree cursor.
 */

/*
 * Initialize an AG-rooted btree cursor with the given AG btree fake root.
 */
void
xfs_btree_stage_afakeroot(
	struct xfs_btree_cur	*cur,
	struct xbtree_afakeroot	*afake)
{
	ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
	ASSERT(cur->bc_ops->type != XFS_BTREE_TYPE_INODE);
	ASSERT(cur->bc_tp == NULL);

	cur->bc_ag.afake = afake;
	cur->bc_nlevels = afake->af_levels;
	cur->bc_flags |= XFS_BTREE_STAGING;
}

/*
 * Transform an AG-rooted staging btree cursor back into a regular cursor by
 * substituting a real btree root for the fake one and restoring normal btree
 * cursor ops. The caller must log the btree root change prior to calling
 * this.
 */
void
xfs_btree_commit_afakeroot(
	struct xfs_btree_cur	*cur,
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp)
{
	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
	ASSERT(cur->bc_tp == NULL);

	trace_xfs_btree_commit_afakeroot(cur);

	cur->bc_ag.afake = NULL;
	cur->bc_ag.agbp = agbp;
	cur->bc_flags &= ~XFS_BTREE_STAGING;
	cur->bc_tp = tp;
}
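
/*
 * An illustrative sketch of the AG btree staging flow described above.
 * xfs_fakebt_stage_cursor() is a hypothetical stand-in for a btree-specific
 * _stage_cursor function; the commit call is the real API:
 *
 *	struct xbtree_afakeroot	afake = { 0 };
 *	struct xfs_btree_cur	*cur;
 *
 *	cur = xfs_fakebt_stage_cursor(mp, &afake, pag);
 *	// ...bulk load the new btree with xfs_btree_bload()...
 *	// log the new root in the AG header, then commit the cursor:
 *	xfs_btree_commit_afakeroot(cur, tp, agbp);
 */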

/*
 * Bulk Loading for Inode-Rooted Btrees
 * ====================================
 *
 * For a btree rooted in an inode fork, pass an xbtree_ifakeroot structure to
 * the staging cursor. This structure should be initialized as follows:
 *
 * - if_fork_size field should be set to the number of bytes available to the
 *   fork in the inode.
 *
 * - if_fork should point to a freshly allocated struct xfs_ifork.
 *
 * - if_format should be set to the appropriate fork type (e.g.
 *   XFS_DINODE_FMT_BTREE).
 *
 * All other fields must be zero.
 *
 * The _stage_cursor() function for a specific btree type should call
 * xfs_btree_stage_ifakeroot to set up the in-memory cursor as a staging
 * cursor. The corresponding _commit_staged_btree() function should log the
 * new root and call xfs_btree_commit_ifakeroot() to transform the staging
 * cursor into a regular btree cursor.
 */

/*
 * Initialize an inode-rooted btree cursor with the given inode btree fake
 * root.
 */
void
xfs_btree_stage_ifakeroot(
	struct xfs_btree_cur		*cur,
	struct xbtree_ifakeroot		*ifake)
{
	ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
	ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
	ASSERT(cur->bc_tp == NULL);

	cur->bc_ino.ifake = ifake;
	cur->bc_nlevels = ifake->if_levels;
	cur->bc_ino.forksize = ifake->if_fork_size;
	cur->bc_flags |= XFS_BTREE_STAGING;
}

/*
 * Transform an inode-rooted staging btree cursor back into a regular cursor
 * by substituting a real btree root for the fake one and restoring normal
 * btree cursor ops. The caller must log the btree root change prior to
 * calling this.
 */
void
xfs_btree_commit_ifakeroot(
	struct xfs_btree_cur	*cur,
	struct xfs_trans	*tp,
	int			whichfork)
{
	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
	ASSERT(cur->bc_tp == NULL);

	trace_xfs_btree_commit_ifakeroot(cur);

	cur->bc_ino.ifake = NULL;
	cur->bc_ino.whichfork = whichfork;
	cur->bc_flags &= ~XFS_BTREE_STAGING;
	cur->bc_tp = tp;
}
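
/*
 * An illustrative sketch of staging an inode-rooted btree per the rules
 * above. xfs_fakebt_stage_cursor() is again a hypothetical stand-in for a
 * btree-specific _stage_cursor function, and the fork size/cache helpers
 * shown are assumptions about the caller's environment:
 *
 *	struct xbtree_ifakeroot	ifake = { 0 };
 *
 *	ifake.if_fork_size = xfs_inode_fork_size(ip, whichfork);
 *	ifake.if_fork = kmem_cache_zalloc(xfs_ifork_cache, GFP_KERNEL);
 *	ifake.if_format = XFS_DINODE_FMT_BTREE;
 *
 *	cur = xfs_fakebt_stage_cursor(mp, ip, &ifake);
 *	// ...bulk load, log the new root, then commit:
 *	xfs_btree_commit_ifakeroot(cur, tp, whichfork);
 */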

/*
 * Bulk Loading of Staged Btrees
 * =============================
 *
 * This interface is used with a staged btree cursor to create a totally new
 * btree with a large number of records (i.e. more than what would fit in a
 * single root block). When the creation is complete, the new root can be
 * linked atomically into the filesystem by committing the staged cursor.
 *
 * Creation of a new btree proceeds roughly as follows:
 *
 * The first step is to initialize an appropriate fake btree root structure
 * and then construct a staged btree cursor. Refer to the block comments
 * about "Bulk Loading for AG Btrees" and "Bulk Loading for Inode-Rooted
 * Btrees" for more information about how to do this.
 *
 * The second step is to initialize a struct xfs_btree_bload context as
 * documented in the structure definition.
 *
 * The third step is to call xfs_btree_bload_compute_geometry to compute the
 * height of and the number of blocks needed to construct the btree. See the
 * section "Computing the Geometry of the New Btree" for details about this
 * computation.
 *
 * In step four, the caller must allocate xfs_btree_bload.nr_blocks blocks
 * and save them for later use by ->claim_block(). Bulk loading requires all
 * blocks to be allocated beforehand to avoid ENOSPC failures midway through
 * a rebuild, and to minimize seek distances of the new btree.
 *
 * Step five is to call xfs_btree_bload() to start constructing the btree.
 *
 * The final step is to commit the staging btree cursor, which logs the new
 * btree root and turns the staging cursor into a regular cursor. The caller
 * is responsible for cleaning up the previous btree blocks, if any.
 */
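
/*
 * A condensed sketch of the sequence above. The my_* callbacks and the
 * block preallocation step are caller-specific stand-ins; the xfs_btree_*
 * calls and xfs_btree_bload fields are the real interface:
 *
 *	struct xfs_btree_bload	bbl = {
 *		.get_records	= my_get_records,
 *		.claim_block	= my_claim_block,
 *		.leaf_slack	= -1,	// negative: let the loader choose
 *		.node_slack	= -1,
 *	};
 *
 *	error = xfs_btree_bload_compute_geometry(cur, &bbl, nr_records);
 *	if (error)
 *		return error;
 *	// ...allocate bbl.nr_blocks blocks for my_claim_block() to hand out...
 *	error = xfs_btree_bload(cur, &bbl, &my_private_data);
 *	if (error)
 *		return error;
 *	// ...log the new root and commit the staged cursor...
 */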

/*
 * Computing the Geometry of the New Btree
 * =======================================
 *
 * The number of items placed in each btree block is computed via the
 * following algorithm: For leaf levels, the number of items for the level
 * is nr_records in the bload structure. For node levels, the number of
 * items for the level is the number of blocks in the next lower level of
 * the tree. For each level, the desired number of items per block is
 * defined as:
 *
 *	desired = max(minrecs, maxrecs - slack factor)
 *
 * The number of blocks for the level is defined to be:
 *
 *	blocks = floor(nr_items / desired)
 *
 * Note this is rounded down so that the npb calculation below will never
 * fall below minrecs. The number of items that will actually be loaded
 * into each btree block is defined as:
 *
 *	npb = nr_items / blocks
 *
 * Some of the leftmost blocks in the level will contain one extra record as
 * needed to handle uneven division. If the number of records in any block
 * would exceed maxrecs for that level, blocks is incremented and npb is
 * recalculated.
 *
 * In other words, we compute the number of blocks needed to satisfy a given
 * loading level, then spread the items as evenly as possible.
 *
 * The height and number of fs blocks required to create the btree are
 * computed and returned via btree_height and nr_blocks.
 */
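
/*
 * A worked example of the computation above, using made-up numbers: suppose
 * a leaf level must hold nr_records = 1000, with maxrecs = 10, minrecs = 5,
 * and a leaf slack factor of 2. Then:
 *
 *	desired = max(5, 10 - 2) = 8
 *	blocks  = floor(1000 / 8) = 125
 *	npb     = 1000 / 125 = 8, remainder 0
 *
 * so each of the 125 leaf blocks receives exactly 8 records. With
 * nr_records = 1003 instead, npb stays 8 with blocks_with_extra = 3, so the
 * three leftmost leaves hold 9 records each (3 * 9 + 122 * 8 = 1003).
 */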

/*
 * Put a btree block that we're loading onto the ordered list and release it.
 * The btree blocks will be written to disk when bulk loading is finished.
 * If we reach the dirty buffer threshold, flush them to disk before
 * continuing.
 */
static int
xfs_btree_bload_drop_buf(
	struct xfs_btree_bload		*bbl,
	struct list_head		*buffers_list,
	struct xfs_buf			**bpp)
{
	struct xfs_buf			*bp = *bpp;
	int				error;

	if (!bp)
		return 0;

	/*
	 * Mark this buffer XBF_DONE (i.e. uptodate) so that a subsequent
	 * xfs_buf_read will not pointlessly reread the contents from the
	 * disk.
	 */
	bp->b_flags |= XBF_DONE;

	xfs_buf_delwri_queue_here(bp, buffers_list);
	xfs_buf_relse(bp);
	*bpp = NULL;
	bbl->nr_dirty++;

	if (!bbl->max_dirty || bbl->nr_dirty < bbl->max_dirty)
		return 0;

	error = xfs_buf_delwri_submit(buffers_list);
	if (error)
		return error;

	bbl->nr_dirty = 0;
	return 0;
}

/*
 * Allocate and initialize one btree block for bulk loading.
 *
 * The new btree block will have its level and numrecs fields set to the
 * values of the level and nr_this_block parameters, respectively.
 *
 * The caller should ensure that ptrp, bpp, and blockp refer to the left
 * sibling of the new block, if there is any. On exit, ptrp, bpp, and blockp
 * will all point to the new block.
 */
STATIC int
xfs_btree_bload_prep_block(
	struct xfs_btree_cur		*cur,
	struct xfs_btree_bload		*bbl,
	struct list_head		*buffers_list,
	unsigned int			level,
	unsigned int			nr_this_block,
	union xfs_btree_ptr		*ptrp, /* in/out */
	struct xfs_buf			**bpp, /* in/out */
	struct xfs_btree_block		**blockp, /* in/out */
	void				*priv)
{
	union xfs_btree_ptr		new_ptr;
	struct xfs_buf			*new_bp;
	struct xfs_btree_block		*new_block;
	int				ret;

	if (xfs_btree_at_iroot(cur, level)) {
		struct xfs_ifork	*ifp = xfs_btree_ifork_ptr(cur);
		size_t			new_size;

		ASSERT(*bpp == NULL);

		/* Allocate a new incore btree root block. */
		new_size = bbl->iroot_size(cur, level, nr_this_block, priv);
		ifp->if_broot = kzalloc(new_size, GFP_KERNEL | __GFP_NOFAIL);
		ifp->if_broot_bytes = (int)new_size;

		/* Initialize it and send it out. */
		xfs_btree_init_block(cur->bc_mp, ifp->if_broot, cur->bc_ops,
				level, nr_this_block, cur->bc_ino.ip->i_ino);

		*bpp = NULL;
		*blockp = ifp->if_broot;
		xfs_btree_set_ptr_null(cur, ptrp);
		return 0;
	}

	/* Claim one of the caller's preallocated blocks. */
	xfs_btree_set_ptr_null(cur, &new_ptr);
	ret = bbl->claim_block(cur, &new_ptr, priv);
	if (ret)
		return ret;

	ASSERT(!xfs_btree_ptr_is_null(cur, &new_ptr));

	ret = xfs_btree_get_buf_block(cur, &new_ptr, &new_block, &new_bp);
	if (ret)
		return ret;

	/*
	 * The previous block (if any) is the left sibling of the new block,
	 * so set its right sibling pointer to the new block and drop it.
	 */
	if (*blockp)
		xfs_btree_set_sibling(cur, *blockp, &new_ptr, XFS_BB_RIGHTSIB);

	ret = xfs_btree_bload_drop_buf(bbl, buffers_list, bpp);
	if (ret)
		return ret;

	/* Initialize the new btree block. */
	xfs_btree_init_block_cur(cur, new_bp, level, nr_this_block);
	xfs_btree_set_sibling(cur, new_block, ptrp, XFS_BB_LEFTSIB);

	/* Set the out parameters. */
	*bpp = new_bp;
	*blockp = new_block;
	xfs_btree_copy_ptrs(cur, ptrp, &new_ptr, 1);
	return 0;
}
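
/*
 * A sketch of a ->claim_block implementation for an AG btree, assuming a
 * hypothetical my_rebuild structure in which the caller stashed the blocks
 * it preallocated during step four. Short-form pointers hold big-endian AG
 * block numbers, matching the af_root conversion done at commit time:
 *
 *	struct my_rebuild {
 *		xfs_agblock_t		*free_agbnos;
 *		unsigned int		next_agbno;
 *	};
 *
 *	STATIC int
 *	my_claim_block(struct xfs_btree_cur *cur, union xfs_btree_ptr *ptr,
 *			void *priv)
 *	{
 *		struct my_rebuild	*mr = priv;
 *
 *		// Hand out the preallocated blocks in order.
 *		ptr->s = cpu_to_be32(mr->free_agbnos[mr->next_agbno++]);
 *		return 0;
 *	}
 */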

/* Load one leaf block. */
STATIC int
xfs_btree_bload_leaf(
	struct xfs_btree_cur		*cur,
	unsigned int			recs_this_block,
	xfs_btree_bload_get_records_fn	get_records,
	struct xfs_btree_block		*block,
	void				*priv)
{
	unsigned int			j = 1;
	int				ret;

	/* Fill the leaf block with records. */
	while (j <= recs_this_block) {
		ret = get_records(cur, j, block, recs_this_block - j + 1,
				priv);
		if (ret < 0)
			return ret;
		j += ret;
	}

	return 0;
}
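
/*
 * A sketch of a ->get_records implementation that formats one record per
 * call out of an in-memory array, reusing the hypothetical my_rebuild
 * context from the ->claim_block sketch above and assuming an allocation
 * btree record type. Returning 1 tells xfs_btree_bload_leaf that one
 * record was placed; a real implementation could batch up to nr_wanted:
 *
 *	STATIC int
 *	my_get_records(struct xfs_btree_cur *cur, unsigned int idx,
 *			struct xfs_btree_block *block, unsigned int nr_wanted,
 *			void *priv)
 *	{
 *		struct my_rebuild	*mr = priv;
 *		union xfs_btree_rec	*block_rec;
 *
 *		// stage the next record in the cursor...
 *		cur->bc_rec.a = mr->records[mr->next_record++];
 *		// ...and format it into slot idx of the new leaf block
 *		block_rec = xfs_btree_rec_addr(cur, idx, block);
 *		cur->bc_ops->init_rec_from_cur(cur, block_rec);
 *		return 1;
 *	}
 */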

/*
 * Load one node block with key/ptr pairs.
 *
 * child_ptr must point to a block within the next level down in the tree.
 * A key/ptr entry pointing to the block referenced by child_ptr will be
 * created in the new node block. On exit, child_ptr points to the next
 * block on the child level that needs processing.
 */
STATIC int
xfs_btree_bload_node(
	struct xfs_btree_cur	*cur,
	unsigned int		recs_this_block,
	union xfs_btree_ptr	*child_ptr,
	struct xfs_btree_block	*block)
{
	unsigned int		j;
	int			ret;

	/* Fill the node block with keys and pointers. */
	for (j = 1; j <= recs_this_block; j++) {
		union xfs_btree_key	child_key;
		union xfs_btree_ptr	*block_ptr;
		union xfs_btree_key	*block_key;
		struct xfs_btree_block	*child_block;
		struct xfs_buf		*child_bp;

		ASSERT(!xfs_btree_ptr_is_null(cur, child_ptr));

		/*
		 * Read the lower-level block in case the buffer for it has
		 * been reclaimed. LRU refs will be set on the block, which
		 * is desirable if the new btree commits.
		 */
		ret = xfs_btree_read_buf_block(cur, child_ptr, 0,
				&child_block, &child_bp);
		if (ret)
			return ret;

		block_ptr = xfs_btree_ptr_addr(cur, j, block);
		xfs_btree_copy_ptrs(cur, block_ptr, child_ptr, 1);

		block_key = xfs_btree_key_addr(cur, j, block);
		xfs_btree_get_keys(cur, child_block, &child_key);
		xfs_btree_copy_keys(cur, block_key, &child_key, 1);

		xfs_btree_get_sibling(cur, child_block, child_ptr,
				XFS_BB_RIGHTSIB);
		xfs_buf_relse(child_bp);
	}

	return 0;
}

/*
 * Compute the maximum number of records (or keyptrs) per block that we want
 * to install at this level in the btree. Caller is responsible for having
 * set @cur->bc_ino.forksize to the desired fork size, if appropriate.
 */
STATIC unsigned int
xfs_btree_bload_max_npb(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_bload	*bbl,
	unsigned int		level)
{
	unsigned int		ret;

	if (level == cur->bc_nlevels - 1 && cur->bc_ops->get_dmaxrecs)
		return cur->bc_ops->get_dmaxrecs(cur, level);

	ret = cur->bc_ops->get_maxrecs(cur, level);
	if (level == 0)
		ret -= bbl->leaf_slack;
	else
		ret -= bbl->node_slack;
	return ret;
}

/*
 * Compute the desired number of records (or keyptrs) per block that we want
 * to install at this level in the btree, which must be somewhere between
 * minrecs and max_npb. The caller is free to install fewer records per
 * block.
 */
STATIC unsigned int
xfs_btree_bload_desired_npb(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_bload	*bbl,
	unsigned int		level)
{
	unsigned int		npb = xfs_btree_bload_max_npb(cur, bbl, level);

	/* Root blocks are not subject to minrecs rules. */
	if (level == cur->bc_nlevels - 1)
		return max(1U, npb);

	return max_t(unsigned int, cur->bc_ops->get_minrecs(cur, level), npb);
}

/*
 * Compute the number of records to be stored in each block at this level
 * and the number of blocks for this level. For leaf levels, we must
 * populate an empty root block even if there are no records, so we have to
 * have at least one block.
 */
STATIC void
xfs_btree_bload_level_geometry(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_bload	*bbl,
	unsigned int		level,
	uint64_t		nr_this_level,
	unsigned int		*avg_per_block,
	uint64_t		*blocks,
	uint64_t		*blocks_with_extra)
{
	uint64_t		npb;
	uint64_t		dontcare;
	unsigned int		desired_npb;
	unsigned int		maxnr;

	/*
	 * Compute the absolute maximum number of records that we can store
	 * in the ondisk block or inode root.
	 */
	if (cur->bc_ops->get_dmaxrecs)
		maxnr = cur->bc_ops->get_dmaxrecs(cur, level);
	else
		maxnr = cur->bc_ops->get_maxrecs(cur, level);

	/*
	 * Compute the number of blocks we need to fill each block with the
	 * desired number of records/keyptrs per block. Because desired_npb
	 * could be minrecs, we use regular integer division (which rounds
	 * the block count down) so that in the next step the effective # of
	 * items per block will never be less than desired_npb.
	 */
	desired_npb = xfs_btree_bload_desired_npb(cur, bbl, level);
	*blocks = div64_u64_rem(nr_this_level, desired_npb, &dontcare);
	*blocks = max(1ULL, *blocks);

	/*
	 * Compute the number of records that we will actually put in each
	 * block, assuming that we want to spread the records evenly between
	 * the blocks. Take care that the effective # of items per block
	 * (npb) won't exceed maxrecs even for the blocks that get an extra
	 * record, since desired_npb could be maxrecs, and in the previous
	 * step we rounded the block count down.
	 */
	npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra);
	if (npb > maxnr || (npb == maxnr && *blocks_with_extra > 0)) {
		(*blocks)++;
		npb = div64_u64_rem(nr_this_level, *blocks,
				blocks_with_extra);
	}

	*avg_per_block = min_t(uint64_t, npb, nr_this_level);

	trace_xfs_btree_bload_level_geometry(cur, level, nr_this_level,
			*avg_per_block, desired_npb, *blocks,
			*blocks_with_extra);
}

/*
 * Ensure a slack value is appropriate for the btree.
 *
 * If the slack value is negative, set slack so that we fill the block to
 * halfway between minrecs and maxrecs. Make sure the slack is never so
 * large that we can underflow minrecs.
 */
static void
xfs_btree_bload_ensure_slack(
	struct xfs_btree_cur	*cur,
	int			*slack,
	int			level)
{
	int			maxr;
	int			minr;

	maxr = cur->bc_ops->get_maxrecs(cur, level);
	minr = cur->bc_ops->get_minrecs(cur, level);

	/*
	 * If slack is negative, automatically set slack so that we load the
	 * btree block approximately halfway between minrecs and maxrecs.
	 * Generally, this will net us 75% loading.
	 */
	if (*slack < 0)
		*slack = maxr - ((maxr + minr) >> 1);

	*slack = min(*slack, maxr - minr);
}
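
/*
 * For example, with maxrecs = 100 and minrecs = 50, a negative slack value
 * is replaced with 100 - ((100 + 50) / 2) = 25, so the desired load becomes
 * max(50, 100 - 25) = 75 records per block, i.e. roughly 75% full.
 */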

/*
 * Prepare a btree cursor for a bulk load operation by computing the
 * geometry fields in bbl. Caller must ensure that the btree cursor is a
 * staging cursor. This function can be called multiple times.
 */
int
xfs_btree_bload_compute_geometry(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_bload	*bbl,
	uint64_t		nr_records)
{
	uint64_t		nr_blocks = 0;
	uint64_t		nr_this_level;

	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);

	/*
	 * Make sure that the slack values make sense for traditional leaf
	 * and node blocks. Inode-rooted btrees will return different minrecs
	 * and maxrecs values for the root block (level == bc_nlevels - 1).
	 * We're checking levels 0 and 1 here, so set bc_nlevels such that
	 * the btree code doesn't interpret either as the root level.
	 */
	cur->bc_nlevels = cur->bc_maxlevels - 1;
	xfs_btree_bload_ensure_slack(cur, &bbl->leaf_slack, 0);
	xfs_btree_bload_ensure_slack(cur, &bbl->node_slack, 1);

	bbl->nr_records = nr_this_level = nr_records;
	for (cur->bc_nlevels = 1; cur->bc_nlevels <= cur->bc_maxlevels;) {
		uint64_t	level_blocks;
		uint64_t	dontcare64;
		unsigned int	level = cur->bc_nlevels - 1;
		unsigned int	avg_per_block;

		xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
				&avg_per_block, &level_blocks, &dontcare64);

		if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
			/*
			 * If all the items we want to store at this level
			 * would fit in the inode root block, then we have
			 * our btree root and are done.
			 *
			 * Note that bmap btrees forbid records in the root.
			 */
			if (level != 0 && nr_this_level <= avg_per_block) {
				nr_blocks++;
				break;
			}

			/*
			 * Otherwise, we have to store all the items for this
			 * level in traditional btree blocks and therefore
			 * need another level of btree to point to those
			 * blocks.
			 *
			 * We have to re-compute the geometry for each level
			 * of an inode-rooted btree because the geometry
			 * differs between a btree root in an inode fork and
			 * a traditional btree block.
			 *
			 * This distinction is made in the btree code based
			 * on whether level == bc_nlevels - 1. Based on the
			 * previous root block size check against the root
			 * block geometry, we know that we aren't yet ready
			 * to populate the root. Increment bc_nlevels and
			 * recalculate the geometry for a traditional
			 * block-based btree level.
			 */
			cur->bc_nlevels++;
			ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
			xfs_btree_bload_level_geometry(cur, bbl, level,
					nr_this_level, &avg_per_block,
					&level_blocks, &dontcare64);
		} else {
			/*
			 * If all the items we want to store at this level
			 * would fit in a single root block, we're done.
			 */
			if (nr_this_level <= avg_per_block) {
				nr_blocks++;
				break;
			}

			/* Otherwise, we need another level of btree. */
			cur->bc_nlevels++;
			ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
		}

		nr_blocks += level_blocks;
		nr_this_level = level_blocks;
	}

	if (cur->bc_nlevels > cur->bc_maxlevels)
		return -EOVERFLOW;

	bbl->btree_height = cur->bc_nlevels;
	if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
		bbl->nr_blocks = nr_blocks - 1;
	else
		bbl->nr_blocks = nr_blocks;
	return 0;
}

/* Bulk load a btree given the parameters and geometry established in bbl. */
int
xfs_btree_bload(
	struct xfs_btree_cur		*cur,
	struct xfs_btree_bload		*bbl,
	void				*priv)
{
	struct list_head		buffers_list;
	union xfs_btree_ptr		child_ptr;
	union xfs_btree_ptr		ptr;
	struct xfs_buf			*bp = NULL;
	struct xfs_btree_block		*block = NULL;
	uint64_t			nr_this_level = bbl->nr_records;
	uint64_t			blocks;
	uint64_t			i;
	uint64_t			blocks_with_extra;
	uint64_t			total_blocks = 0;
	unsigned int			avg_per_block;
	unsigned int			level = 0;
	int				ret;

	ASSERT(cur->bc_flags & XFS_BTREE_STAGING);

	INIT_LIST_HEAD(&buffers_list);
	cur->bc_nlevels = bbl->btree_height;
	xfs_btree_set_ptr_null(cur, &child_ptr);
	xfs_btree_set_ptr_null(cur, &ptr);
	bbl->nr_dirty = 0;

	xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
			&avg_per_block, &blocks, &blocks_with_extra);

	/* Load each leaf block. */
	for (i = 0; i < blocks; i++) {
		unsigned int		nr_this_block = avg_per_block;

		/*
		 * Due to rounding, btree blocks will not be evenly populated
		 * in most cases. blocks_with_extra tells us how many blocks
		 * will receive an extra record to distribute the excess
		 * across the current level as evenly as possible.
		 */
		if (i < blocks_with_extra)
			nr_this_block++;

		ret = xfs_btree_bload_prep_block(cur, bbl, &buffers_list,
				level, nr_this_block, &ptr, &bp, &block,
				priv);
		if (ret)
			goto out;

		trace_xfs_btree_bload_block(cur, level, i, blocks, &ptr,
				nr_this_block);

		ret = xfs_btree_bload_leaf(cur, nr_this_block,
				bbl->get_records, block, priv);
		if (ret)
			goto out;

		/*
		 * Record the leftmost leaf pointer so we know where to start
		 * with the first node level.
		 */
		if (i == 0)
			xfs_btree_copy_ptrs(cur, &child_ptr, &ptr, 1);
	}
	total_blocks += blocks;

	ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp);
	if (ret)
		goto out;

	/* Populate the internal btree nodes. */
	for (level = 1; level < cur->bc_nlevels; level++) {
		union xfs_btree_ptr	first_ptr;

		nr_this_level = blocks;
		block = NULL;
		xfs_btree_set_ptr_null(cur, &ptr);

		xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
				&avg_per_block, &blocks, &blocks_with_extra);

		/* Load each node block. */
		for (i = 0; i < blocks; i++) {
			unsigned int	nr_this_block = avg_per_block;

			if (i < blocks_with_extra)
				nr_this_block++;

			ret = xfs_btree_bload_prep_block(cur, bbl,
					&buffers_list, level, nr_this_block,
					&ptr, &bp, &block, priv);
			if (ret)
				goto out;

			trace_xfs_btree_bload_block(cur, level, i, blocks,
					&ptr, nr_this_block);

			ret = xfs_btree_bload_node(cur, nr_this_block,
					&child_ptr, block);
			if (ret)
				goto out;

			/*
			 * Record the leftmost node pointer so that we know
			 * where to start the next node level above this one.
			 */
			if (i == 0)
				xfs_btree_copy_ptrs(cur, &first_ptr, &ptr, 1);
		}
		total_blocks += blocks;

		ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp);
		if (ret)
			goto out;

		xfs_btree_copy_ptrs(cur, &child_ptr, &first_ptr, 1);
	}

	/* Initialize the new root. */
	if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
		ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
		cur->bc_ino.ifake->if_levels = cur->bc_nlevels;
		cur->bc_ino.ifake->if_blocks = total_blocks - 1;
	} else {
		cur->bc_ag.afake->af_root = be32_to_cpu(ptr.s);
		cur->bc_ag.afake->af_levels = cur->bc_nlevels;
		cur->bc_ag.afake->af_blocks = total_blocks;
	}

	/*
	 * Write the new blocks to disk. If the ordered list isn't empty
	 * after that, then something went wrong and we have to fail. This
	 * should never happen, but we'll check anyway.
	 */
	ret = xfs_buf_delwri_submit(&buffers_list);
	if (ret)
		goto out;
	if (!list_empty(&buffers_list)) {
		ASSERT(list_empty(&buffers_list));
		ret = -EIO;
	}

out:
	xfs_buf_delwri_cancel(&buffers_list);
	if (bp)
		xfs_buf_relse(bp);
	return ret;
}