 * ==========================================================================
 * Metaslab classes
 * ==========================================================================
 */
metaslab_class_t *
metaslab_class_create(spa_t *spa, space_map_ops_t *ops)
{
	metaslab_class_t *mc;

	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);

	mc->mc_spa = spa;
	mc->mc_rotor = NULL;
	mc->mc_ops = ops;

	return (mc);
}

void
metaslab_class_destroy(metaslab_class_t *mc)
{
	ASSERT(mc->mc_rotor == NULL);
	ASSERT(mc->mc_alloc == 0);
	ASSERT(mc->mc_deferred == 0);
	ASSERT(mc->mc_space == 0);
	ASSERT(mc->mc_dspace == 0);

	kmem_free(mc, sizeof (metaslab_class_t));
}

int
metaslab_class_validate(metaslab_class_t *mc)
{
	metaslab_group_t *mg;
	vdev_t *vd;

	/*
	 * Must hold one of the spa_config locks.
	 */
	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));

	if ((mg = mc->mc_rotor) == NULL)
		return (0);

	do {
		vd = mg->mg_vd;
		ASSERT(vd->vdev_mg != NULL);
		ASSERT3P(vd->vdev_top, ==, vd);
		ASSERT3P(mg->mg_class, ==, mc);
		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
	} while ((mg = mg->mg_next) != mc->mc_rotor);

	return (0);
}

void
metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
    int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
{
	atomic_add_64(&mc->mc_alloc, alloc_delta);
	atomic_add_64(&mc->mc_deferred, defer_delta);
	atomic_add_64(&mc->mc_space, space_delta);
	atomic_add_64(&mc->mc_dspace, dspace_delta);
}

uint64_t
metaslab_class_get_alloc(metaslab_class_t *mc)
{
	return (mc->mc_alloc);
}

uint64_t
metaslab_class_get_deferred(metaslab_class_t *mc)
{
	return (mc->mc_deferred);
}

uint64_t
metaslab_class_get_space(metaslab_class_t *mc)
{
	return (mc->mc_space);
}

uint64_t
metaslab_class_get_dspace(metaslab_class_t *mc)
{
	return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
}

/*
 * ==========================================================================
 * Metaslab groups
 * ==========================================================================
 */
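
/*
 * The tree below is kept in descending weight order (the comparator
 * returns 1 when m1 is the lighter of the two), so avl_first() always
 * yields the most desirable metaslab in a group. Equal weights are
 * broken by starting offset to keep every node unique.
 */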
static int
metaslab_compare(const void *x1, const void *x2)
{
	const metaslab_t *m1 = x1;
	const metaslab_t *m2 = x2;

	if (m1->ms_weight < m2->ms_weight)
		return (1);
	if (m1->ms_weight > m2->ms_weight)
		return (-1);

	/*
	 * If the weights are identical, use the offset to force uniqueness.
	 */
	if (m1->ms_map.sm_start < m2->ms_map.sm_start)
		return (-1);
	if (m1->ms_map.sm_start > m2->ms_map.sm_start)
		return (1);

	ASSERT3P(m1, ==, m2);

	return (0);
}

metaslab_group_t *
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
{
	metaslab_group_t *mg;

	mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP);
	mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&mg->mg_metaslab_tree, metaslab_compare,
	    sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node));
	mg->mg_vd = vd;
	mg->mg_class = mc;
	mg->mg_activation_count = 0;

	return (mg);
}

void
metaslab_group_destroy(metaslab_group_t *mg)
{
	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	/*
	 * We may have gone below zero with the activation count
	 * either because we never activated in the first place or
	 * because we're done, and possibly removing the vdev.
	 */
	ASSERT(mg->mg_activation_count <= 0);

	avl_destroy(&mg->mg_metaslab_tree);
	mutex_destroy(&mg->mg_lock);
	kmem_free(mg, sizeof (metaslab_group_t));
}

void
metaslab_group_activate(metaslab_group_t *mg)
{
	metaslab_class_t *mc = mg->mg_class;
	metaslab_group_t *mgprev, *mgnext;

	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));

	ASSERT(mc->mc_rotor != mg);
	ASSERT(mg->mg_prev == NULL);
	ASSERT(mg->mg_next == NULL);
	ASSERT(mg->mg_activation_count <= 0);

	if (++mg->mg_activation_count <= 0)
		return;

	mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);

	if ((mgprev = mc->mc_rotor) == NULL) {
		mg->mg_prev = mg;
		mg->mg_next = mg;
	} else {
		mgnext = mgprev->mg_next;
		mg->mg_prev = mgprev;
		mg->mg_next = mgnext;
		mgprev->mg_next = mg;
		mgnext->mg_prev = mg;
	}
	mc->mc_rotor = mg;
}

void
metaslab_group_passivate(metaslab_group_t *mg)
{
	metaslab_class_t *mc = mg->mg_class;
	metaslab_group_t *mgprev, *mgnext;

	ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER));

	if (--mg->mg_activation_count != 0) {
		ASSERT(mc->mc_rotor != mg);
		ASSERT(mg->mg_prev == NULL);
		ASSERT(mg->mg_next == NULL);
		ASSERT(mg->mg_activation_count < 0);
		return;
	}

	mgprev = mg->mg_prev;
	mgnext = mg->mg_next;

	if (mg == mgnext) {
		mc->mc_rotor = NULL;
	} else {
		mc->mc_rotor = mgnext;
		mgprev->mg_next = mgnext;
		mgnext->mg_prev = mgprev;
	}

	mg->mg_prev = NULL;
	mg->mg_next = NULL;
}
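
/*
 * Note that the rotor is a circular, doubly linked list of the class's
 * active groups: metaslab_group_activate() splices a group in and makes
 * it the new rotor, metaslab_group_passivate() splices it back out, and
 * mc_rotor is NULL only while no group is active.
 */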

static void
metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
{
	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == NULL);
	msp->ms_group = mg;
	msp->ms_weight = 0;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
}

static void
metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
{
	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_group = NULL;
	mutex_exit(&mg->mg_lock);
}

static void
metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
{
	/*
	 * Although in principle the weight can be any value, in
	 * practice we do not use values in the range [1, 510].
	 */
	ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
	ASSERT(MUTEX_HELD(&msp->ms_lock));

	mutex_enter(&mg->mg_lock);
	ASSERT(msp->ms_group == mg);
	avl_remove(&mg->mg_metaslab_tree, msp);
	msp->ms_weight = weight;
	avl_add(&mg->mg_metaslab_tree, msp);
	mutex_exit(&mg->mg_lock);
}

/*
 * ==========================================================================
 * Common allocator routines
 * ==========================================================================
 */
static int
metaslab_segsize_compare(const void *x1, const void *x2)
{
	const space_seg_t *s1 = x1;
	const space_seg_t *s2 = x2;
	uint64_t ss_size1 = s1->ss_end - s1->ss_start;
	uint64_t ss_size2 = s2->ss_end - s2->ss_start;

	if (ss_size1 < ss_size2)
		return (-1);
	if (ss_size1 > ss_size2)
		return (1);

	if (s1->ss_start < s2->ss_start)
		return (-1);
	if (s1->ss_start > s2->ss_start)
		return (1);

	return (0);
}

/*
 * This is a helper function that can be used by the allocator to find
 * a suitable block to allocate. This will search the specified AVL
 * tree looking for a block that matches the specified criteria.
 */
static uint64_t
metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
    uint64_t align)
{
	space_seg_t *ss, ssearch;
	avl_index_t where;

	ssearch.ss_start = *cursor;
	ssearch.ss_end = *cursor + size;

	ss = avl_find(t, &ssearch, &where);
	if (ss == NULL)
		ss = avl_nearest(t, where, AVL_AFTER);

	while (ss != NULL) {
		uint64_t offset = P2ROUNDUP(ss->ss_start, align);

		if (offset + size <= ss->ss_end) {
			*cursor = offset + size;
			return (offset);
		}
		ss = AVL_NEXT(t, ss);
	}

	/*
	 * If we know we've searched the whole map (*cursor == 0), give up.
	 * Otherwise, reset the cursor to the beginning and try again.
	 */
	if (*cursor == 0)
		return (-1ULL);

	*cursor = 0;
	return (metaslab_block_picker(t, cursor, size, align));
}

static void
metaslab_pp_load(space_map_t *sm)
{
	space_seg_t *ss;

	ASSERT(sm->sm_ppd == NULL);
	sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);

	sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
	avl_create(sm->sm_pp_root, metaslab_segsize_compare,
	    sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));

	for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
		avl_add(sm->sm_pp_root, ss);
}

static void
metaslab_pp_unload(space_map_t *sm)
{
	void *cookie = NULL;

	kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
	sm->sm_ppd = NULL;

	while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
		/* tear down the tree */
	}

	avl_destroy(sm->sm_pp_root);
	kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
	sm->sm_pp_root = NULL;
}
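
/*
 * The sm_ppd and sm_pp_root state set up above is shared by all of the
 * allocators below: sm_ppd is an array of 64 cursors (the ff/df/ndf
 * allocators index it by highbit() of the alignment or size, cdf uses
 * only the first two slots), and sm_pp_root mirrors the space map's
 * segments sorted by size, so avl_last() is always the largest free
 * segment.
 */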

/* ARGSUSED */
static void
metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size)
{
	/* No need to update cursor */
}

/* ARGSUSED */
static void
metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size)
{
	/* No need to update cursor */
}

/*
 * Return the maximum contiguous segment within the metaslab.
 */
uint64_t
metaslab_pp_maxsize(space_map_t *sm)
{
	avl_tree_t *t = sm->sm_pp_root;
	space_seg_t *ss;

	if (t == NULL || (ss = avl_last(t)) == NULL)
		return (0ULL);

	return (ss->ss_end - ss->ss_start);
}

/*
 * ==========================================================================
 * The first-fit block allocator
 * ==========================================================================
 */
static uint64_t
metaslab_ff_alloc(space_map_t *sm, uint64_t size)
{
	avl_tree_t *t = &sm->sm_root;
	uint64_t align = size & -size;
	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;

	return (metaslab_block_picker(t, cursor, size, align));
}
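
/*
 * First-fit keeps one cursor per power-of-two size class: align is the
 * largest power of two dividing the request (a 4K request, for example,
 * uses the cursor at index highbit(4096) - 1 = 12), so back-to-back
 * allocations of the same size tend to be laid out contiguously. The
 * cursor wraps around once (see metaslab_block_picker()) before the
 * allocation fails.
 */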

/* ARGSUSED */
boolean_t
metaslab_ff_fragmented(space_map_t *sm)
{
	return (B_TRUE);
}

static space_map_ops_t metaslab_ff_ops = {
	metaslab_pp_load,
	metaslab_pp_unload,
	metaslab_ff_alloc,
	metaslab_pp_claim,
	metaslab_pp_free,
	metaslab_pp_maxsize,
	metaslab_ff_fragmented
};

/*
 * ==========================================================================
 * Dynamic block allocator -
 * Uses the first fit allocation scheme until space gets low and then
 * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
 * and metaslab_df_free_pct to determine when to switch the allocation scheme.
 * ==========================================================================
 */
static uint64_t
metaslab_df_alloc(space_map_t *sm, uint64_t size)
{
	avl_tree_t *t = &sm->sm_root;
	uint64_t align = size & -size;
	uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
	uint64_t max_size = metaslab_pp_maxsize(sm);
	int free_pct = sm->sm_space * 100 / sm->sm_size;

	ASSERT(MUTEX_HELD(sm->sm_lock));
	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));

	if (max_size < size)
		return (-1ULL);

	/*
	 * If we're running low on space switch to using the size
	 * sorted AVL tree (best-fit).
	 */
	if (max_size < metaslab_df_alloc_threshold ||
	    free_pct < metaslab_df_free_pct) {
		t = sm->sm_pp_root;
		*cursor = 0;
	}

	return (metaslab_block_picker(t, cursor, size, 1ULL));
}

static boolean_t
metaslab_df_fragmented(space_map_t *sm)
{
	uint64_t max_size = metaslab_pp_maxsize(sm);
	int free_pct = sm->sm_space * 100 / sm->sm_size;

	if (max_size >= metaslab_df_alloc_threshold &&
	    free_pct >= metaslab_df_free_pct)
		return (B_FALSE);

	return (B_TRUE);
}

static space_map_ops_t metaslab_df_ops = {
	metaslab_pp_load,
	metaslab_pp_unload,
	metaslab_df_alloc,
	metaslab_pp_claim,
	metaslab_pp_free,
	metaslab_pp_maxsize,
	metaslab_df_fragmented
};

/*
 * ==========================================================================
 * Other experimental allocators
 * ==========================================================================
 */
static uint64_t
metaslab_cdf_alloc(space_map_t *sm, uint64_t size)
{
	avl_tree_t *t = &sm->sm_root;
	uint64_t *cursor = (uint64_t *)sm->sm_ppd;
	uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1;
	uint64_t max_size = metaslab_pp_maxsize(sm);
	uint64_t rsize = size;
	uint64_t offset = 0;

	ASSERT(MUTEX_HELD(sm->sm_lock));
	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));

	if (max_size < size)
		return (-1ULL);

	ASSERT3U(*extent_end, >=, *cursor);

	/*
	 * If we're running low on space switch to using the size
	 * sorted AVL tree (best-fit).
	 */
	if ((*cursor + size) > *extent_end) {

		t = sm->sm_pp_root;
		*cursor = *extent_end = 0;

		if (max_size > 2 * SPA_MAXBLOCKSIZE)
			rsize = MIN(metaslab_min_alloc_size, max_size);
		offset = metaslab_block_picker(t, extent_end, rsize, 1ULL);
		if (offset != -1)
			*cursor = offset + size;
	} else {
		offset = metaslab_block_picker(t, cursor, rsize, 1ULL);
	}
	ASSERT3U(*cursor, <=, *extent_end);
	return (offset);
}

static boolean_t
metaslab_cdf_fragmented(space_map_t *sm)
{
	uint64_t max_size = metaslab_pp_maxsize(sm);

	if (max_size > (metaslab_min_alloc_size * 10))
		return (B_FALSE);
	return (B_TRUE);
}

static space_map_ops_t metaslab_cdf_ops = {
	metaslab_pp_load,
	metaslab_pp_unload,
	metaslab_cdf_alloc,
	metaslab_pp_claim,
	metaslab_pp_free,
	metaslab_pp_maxsize,
	metaslab_cdf_fragmented
};

uint64_t metaslab_ndf_clump_shift = 4;
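
/*
 * The ndf allocator below also keeps a cursor per size class (indexed
 * by highbit(size)). If the segment at the cursor can't hold the
 * request, it falls back to the size-sorted tree, looks for a free
 * segment of roughly 1 << (hbit + metaslab_ndf_clump_shift) bytes (or
 * the next larger one), and restarts the cursor inside it.
 */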

static uint64_t
metaslab_ndf_alloc(space_map_t *sm, uint64_t size)
{
	avl_tree_t *t = &sm->sm_root;
	avl_index_t where;
	space_seg_t *ss, ssearch;
	uint64_t hbit = highbit(size);
	uint64_t *cursor = (uint64_t *)sm->sm_ppd + hbit - 1;
	uint64_t max_size = metaslab_pp_maxsize(sm);

	ASSERT(MUTEX_HELD(sm->sm_lock));
	ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));

	if (max_size < size)
		return (-1ULL);

	ssearch.ss_start = *cursor;
	ssearch.ss_end = *cursor + size;

	ss = avl_find(t, &ssearch, &where);
	if (ss == NULL || (ss->ss_start + size > ss->ss_end)) {
		t = sm->sm_pp_root;

		ssearch.ss_start = 0;
		ssearch.ss_end = MIN(max_size,
		    1ULL << (hbit + metaslab_ndf_clump_shift));
		ss = avl_find(t, &ssearch, &where);
		if (ss == NULL)
			ss = avl_nearest(t, where, AVL_AFTER);
		ASSERT(ss != NULL);
	}

	if (ss != NULL) {
		if (ss->ss_start + size <= ss->ss_end) {
			*cursor = ss->ss_start + size;
			return (ss->ss_start);
		}
	}
	return (-1ULL);
}

static boolean_t
metaslab_ndf_fragmented(space_map_t *sm)
{
	uint64_t max_size = metaslab_pp_maxsize(sm);

	if (max_size > (metaslab_min_alloc_size << metaslab_ndf_clump_shift))
		return (B_FALSE);
	return (B_TRUE);
}

static space_map_ops_t metaslab_ndf_ops = {
	metaslab_pp_load,
	metaslab_pp_unload,
	metaslab_ndf_alloc,
	metaslab_pp_claim,
	metaslab_pp_free,
	metaslab_pp_maxsize,
	metaslab_ndf_fragmented
};

space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
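
/*
 * The dynamic (df) allocator is the default ops vector; ff, cdf and ndf
 * above are the experimental alternatives that zfs_metaslab_ops can
 * presumably be pointed at instead when a different allocation policy
 * is wanted.
 */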

/*
 * ==========================================================================
 * Metaslabs
 * ==========================================================================
 */
metaslab_t *
metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
    uint64_t start, uint64_t size, uint64_t txg)
{
	vdev_t *vd = mg->mg_vd;
	metaslab_t *msp;

	msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
	mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);

	msp->ms_smo_syncing = *smo;

	/*
	 * We create the main space map here, but we don't create the
	 * allocmaps and freemaps until metaslab_sync_done().  This serves
	 * two purposes: it allows metaslab_sync_done() to detect the
	 * addition of new space; and for debugging, it ensures that we'd
	 * data fault on any attempt to use this metaslab before it's ready.
	 */
	space_map_create(&msp->ms_map, start, size,
	    vd->vdev_ashift, &msp->ms_lock);

	metaslab_group_add(mg, msp);

	if (metaslab_debug && smo->smo_object != 0) {
		mutex_enter(&msp->ms_lock);
		VERIFY(space_map_load(&msp->ms_map, mg->mg_class->mc_ops,
		    SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0);
		mutex_exit(&msp->ms_lock);
	}

	/*
	 * If we're opening an existing pool (txg == 0) or creating
	 * a new one (txg == TXG_INITIAL), all space is available now.
	 * If we're adding space to an existing pool, the new space
	 * does not become available until after this txg has synced.
	 */
	if (txg <= TXG_INITIAL)
		metaslab_sync_done(msp, 0);

	if (txg != 0) {
		vdev_dirty(vd, 0, NULL, txg);
		vdev_dirty(vd, VDD_METASLAB, msp, txg);
	}

	return (msp);
}

void
metaslab_fini(metaslab_t *msp)
{
	metaslab_group_t *mg = msp->ms_group;

	vdev_space_update(mg->mg_vd,
	    -msp->ms_smo.smo_alloc, 0, -msp->ms_map.sm_size);

	metaslab_group_remove(mg, msp);

	mutex_enter(&msp->ms_lock);

	space_map_unload(&msp->ms_map);
	space_map_destroy(&msp->ms_map);

	for (int t = 0; t < TXG_SIZE; t++) {
		space_map_destroy(&msp->ms_allocmap[t]);
		space_map_destroy(&msp->ms_freemap[t]);
	}

	for (int t = 0; t < TXG_DEFER_SIZE; t++)
		space_map_destroy(&msp->ms_defermap[t]);

	ASSERT0(msp->ms_deferspace);

	mutex_exit(&msp->ms_lock);
	mutex_destroy(&msp->ms_lock);

	kmem_free(msp, sizeof (metaslab_t));
}

#define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
#define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
#define	METASLAB_ACTIVE_MASK		\
	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)

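/*
 * A metaslab's weight is its free space scaled by where it sits on the
 * vdev: the outermost (lowest) metaslab is worth about 2x its free
 * space and the innermost about 1x, so one halfway out is worth roughly
 * 1.5x. Metaslabs at or below the group's bonus area are scaled up
 * again by metaslab_smo_bonus_pct, and the two activation bits above
 * are carried over so that active metaslabs keep sorting ahead of
 * inactive ones.
 */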
static uint64_t
metaslab_weight(metaslab_t *msp)
{
	metaslab_group_t *mg = msp->ms_group;
	space_map_t *sm = &msp->ms_map;
	space_map_obj_t *smo = &msp->ms_smo;
	vdev_t *vd = mg->mg_vd;
	uint64_t weight, space;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	/*
	 * The baseline weight is the metaslab's free space.
	 */
	space = sm->sm_size - smo->smo_alloc;
	weight = space;

	/*
	 * Modern disks have uniform bit density and constant angular velocity.
	 * Therefore, the outer recording zones are faster (higher bandwidth)
	 * than the inner zones by the ratio of outer to inner track diameter,
	 * which is typically around 2:1. We account for this by assigning
	 * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
	 * In effect, this means that we'll select the metaslab with the most
	 * free bandwidth rather than simply the one with the most free space.
	 */
	weight = 2 * weight -
	    ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count;
	ASSERT(weight >= space && weight <= 2 * space);

	/*
	 * For locality, assign higher weight to metaslabs which have
	 * a lower offset than what we've already activated.
	 */
	if (sm->sm_start <= mg->mg_bonus_area)
		weight *= (metaslab_smo_bonus_pct / 100);
	ASSERT(weight >= space &&
	    weight <= 2 * (metaslab_smo_bonus_pct / 100) * space);

	if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) {
		/*
		 * If this metaslab is one we're actively using, adjust its
		 * weight to make it preferable to any inactive metaslab so
		 * we'll polish it off.
		 */
		weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
	}
	return (weight);
}

static void
metaslab_prefetch(metaslab_group_t *mg)
{
	spa_t *spa = mg->mg_vd->vdev_spa;
	metaslab_t *msp;
	avl_tree_t *t = &mg->mg_metaslab_tree;
	int m;

	mutex_enter(&mg->mg_lock);

	/*
	 * Prefetch the next potential metaslabs
	 */
	for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) {
		space_map_t *sm = &msp->ms_map;
		space_map_obj_t *smo = &msp->ms_smo;

		/* If we have reached our prefetch limit then we're done */
		if (m >= metaslab_prefetch_limit)
			break;

		if (!sm->sm_loaded && smo->smo_object != 0) {
			mutex_exit(&mg->mg_lock);
			dmu_prefetch(spa_meta_objset(spa), smo->smo_object,
			    0ULL, smo->smo_objsize);
			mutex_enter(&mg->mg_lock);
		}
	}
	mutex_exit(&mg->mg_lock);
}

static int
metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
{
	metaslab_group_t *mg = msp->ms_group;
	space_map_t *sm = &msp->ms_map;
	space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;

	ASSERT(MUTEX_HELD(&msp->ms_lock));

	if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
		space_map_load_wait(sm);
		if (!sm->sm_loaded) {
			space_map_obj_t *smo = &msp->ms_smo;

			int error = space_map_load(sm, sm_ops, SM_FREE, smo,
			    spa_meta_objset(msp->ms_group->mg_vd->vdev_spa));
			if (error) {
				metaslab_group_sort(msp->ms_group, msp, 0);
				return (error);
			}
			for (int t = 0; t < TXG_DEFER_SIZE; t++)
				space_map_walk(&msp->ms_defermap[t],
				    space_map_claim, sm);

		}

		/*
		 * Track the bonus area as we activate new metaslabs.
		 */
		if (sm->sm_start > mg->mg_bonus_area) {
			mutex_enter(&mg->mg_lock);
			mg->mg_bonus_area = sm->sm_start;
			mutex_exit(&mg->mg_lock);
		}

		metaslab_group_sort(msp->ms_group, msp,
		    msp->ms_weight | activation_weight);
	}
	ASSERT(sm->sm_loaded);
	ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);

	return (0);
}

static void
metaslab_passivate(metaslab_t *msp, uint64_t size)
{
	/*
	 * If size < SPA_MINBLOCKSIZE, then we will not allocate from
	 * this metaslab again.  In that case, it had better be empty,
	 * or we would be leaving space on the table.
	 */
	ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map.sm_space == 0);
	metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
	ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
}
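
/*
 * Activation and passivation bracket every allocation: activate loads
 * the on-disk free map if necessary (claiming back any still-deferred
 * frees so they can't be handed out yet) and ORs the activation bit
 * into the weight; passivate clears those bits and clamps the weight to
 * the size passed in. metaslab_group_alloc() passes the largest
 * remaining free segment, so a picked-over metaslab re-sorts by what it
 * can still deliver.
 */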

/*
 * Write a metaslab to disk in the context of the specified transaction group.
 */
void
metaslab_sync(metaslab_t *msp, uint64_t txg)
{
	vdev_t *vd = msp->ms_group->mg_vd;
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa_meta_objset(spa);
	space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK];
	space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK];
	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
	space_map_t *sm = &msp->ms_map;
	space_map_obj_t *smo = &msp->ms_smo_syncing;
	dmu_buf_t *db;
	dmu_tx_t *tx;

	ASSERT(!vd->vdev_ishole);

	if (allocmap->sm_space == 0 && freemap->sm_space == 0)
		return;

	/*
	 * The only state that can actually be changing concurrently with
	 * metaslab_sync() is the metaslab's ms_map.  No other thread can
	 * be modifying this txg's allocmap, freemap, freed_map, or smo.
	 * Therefore, we only hold ms_lock to satisfy space_map ASSERTs.
	 * We drop it whenever we call into the DMU, because the DMU
	 * can call down to us (e.g. via zio_free()) at any time.
	 */

	tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);

	if (smo->smo_object == 0) {
		ASSERT(smo->smo_objsize == 0);
		ASSERT(smo->smo_alloc == 0);
		smo->smo_object = dmu_object_alloc(mos,
		    DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
		    DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
		ASSERT(smo->smo_object != 0);
		dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
		    (sm->sm_start >> vd->vdev_ms_shift),
		    sizeof (uint64_t), &smo->smo_object, tx);
	}

	mutex_enter(&msp->ms_lock);

	space_map_walk(freemap, space_map_add, freed_map);

	if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >=
	    2 * sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) {
		/*
		 * The in-core space map representation is twice as compact
		 * as the on-disk one, so it's time to condense the latter
		 * by generating a pure allocmap from first principles.
		 *
		 * This metaslab is 100% allocated,
		 * minus the content of the in-core map (sm),
		 * minus what's been freed this txg (freed_map),
		 * minus deferred frees (ms_defermap[]),
		 * minus allocations from txgs in the future
		 * (because they haven't been committed yet).
		 */
		space_map_vacate(allocmap, NULL, NULL);
		space_map_vacate(freemap, NULL, NULL);

		space_map_add(allocmap, allocmap->sm_start, allocmap->sm_size);

		space_map_walk(sm, space_map_remove, allocmap);
		space_map_walk(freed_map, space_map_remove, allocmap);

		for (int t = 0; t < TXG_DEFER_SIZE; t++)
			space_map_walk(&msp->ms_defermap[t],
			    space_map_remove, allocmap);

		for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
			space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK],
			    space_map_remove, allocmap);

		mutex_exit(&msp->ms_lock);
		space_map_truncate(smo, mos, tx);
		mutex_enter(&msp->ms_lock);
	}

	space_map_sync(allocmap, SM_ALLOC, smo, mos, tx);
	space_map_sync(freemap, SM_FREE, smo, mos, tx);

	mutex_exit(&msp->ms_lock);

	VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
	dmu_buf_will_dirty(db, tx);
	ASSERT3U(db->db_size, >=, sizeof (*smo));
	bcopy(smo, db->db_data, sizeof (*smo));
	dmu_buf_rele(db, FTAG);

	dmu_tx_commit(tx);
}
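
/*
 * Taken together, the sync path moves space through a short pipeline:
 * allocations and frees accumulate in ms_allocmap/ms_freemap[txg &
 * TXG_MASK], metaslab_sync() writes them to the space map object and
 * folds the frees into the freed map, and metaslab_sync_done() below
 * shifts freed space into ms_defermap. Freed space only rejoins the
 * allocatable ms_map when its defer slot is recycled, TXG_DEFER_SIZE
 * transaction groups later.
 */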

/*
 * Called after a transaction group has completely synced to mark
 * all of the metaslab's free space as usable.
 */
void
metaslab_sync_done(metaslab_t *msp, uint64_t txg)
{
	space_map_obj_t *smo = &msp->ms_smo;
	space_map_obj_t *smosync = &msp->ms_smo_syncing;
	space_map_t *sm = &msp->ms_map;
	space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
	space_map_t *defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
	metaslab_group_t *mg = msp->ms_group;
	vdev_t *vd = mg->mg_vd;
	int64_t alloc_delta, defer_delta;

	ASSERT(!vd->vdev_ishole);

	mutex_enter(&msp->ms_lock);

	/*
	 * If this metaslab is just becoming available, initialize its
	 * allocmaps and freemaps and add its capacity to the vdev.
	 */
	if (freed_map->sm_size == 0) {
		for (int t = 0; t < TXG_SIZE; t++) {
			space_map_create(&msp->ms_allocmap[t], sm->sm_start,
			    sm->sm_size, sm->sm_shift, sm->sm_lock);
			space_map_create(&msp->ms_freemap[t], sm->sm_start,
			    sm->sm_size, sm->sm_shift, sm->sm_lock);
		}

		for (int t = 0; t < TXG_DEFER_SIZE; t++)
			space_map_create(&msp->ms_defermap[t], sm->sm_start,
			    sm->sm_size, sm->sm_shift, sm->sm_lock);

		vdev_space_update(vd, 0, 0, sm->sm_size);
	}

	alloc_delta = smosync->smo_alloc - smo->smo_alloc;
	defer_delta = freed_map->sm_space - defer_map->sm_space;

	vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);

	ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0);
	ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0);

	/*
	 * If there's a space_map_load() in progress, wait for it to complete
	 * so that we have a consistent view of the in-core space map.
	 * Then, add defer_map (oldest deferred frees) to this map and
	 * transfer freed_map (this txg's frees) to defer_map.
	 */
	space_map_load_wait(sm);
	space_map_vacate(defer_map, sm->sm_loaded ? space_map_free : NULL, sm);
	space_map_vacate(freed_map, space_map_add, defer_map);

	*smo = *smosync;

	msp->ms_deferspace += defer_delta;
	ASSERT3S(msp->ms_deferspace, >=, 0);
	ASSERT3S(msp->ms_deferspace, <=, sm->sm_size);
	if (msp->ms_deferspace != 0) {
		/*
		 * Keep syncing this metaslab until all deferred frees
		 * are back in circulation.
		 */
		vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
	}

	/*
	 * If the map is loaded but no longer active, evict it as soon as all
	 * future allocations have synced.  (If we unloaded it now and then
	 * loaded a moment later, the map wouldn't reflect those allocations.)
	 */
	if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
		int evictable = 1;

		for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
			if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space)
				evictable = 0;

		if (evictable && !metaslab_debug)
			space_map_unload(sm);
	}

	metaslab_group_sort(mg, msp, metaslab_weight(msp));

	mutex_exit(&msp->ms_lock);
}

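/*
 * Called after a sync pass to re-weigh the metaslabs at or below the
 * bonus area, forget the group's accumulated allocation failures, and
 * kick off prefetch of the next candidate space maps.
 */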
void
metaslab_sync_reassess(metaslab_group_t *mg)
{
	vdev_t *vd = mg->mg_vd;
	int64_t failures = mg->mg_alloc_failures;

	/*
	 * Re-evaluate all metaslabs which have lower offsets than the
	 * bonus area.
	 */
	for (int m = 0; m < vd->vdev_ms_count; m++) {
		metaslab_t *msp = vd->vdev_ms[m];

		if (msp->ms_map.sm_start > mg->mg_bonus_area)
			break;

		mutex_enter(&msp->ms_lock);
		metaslab_group_sort(mg, msp, metaslab_weight(msp));
		mutex_exit(&msp->ms_lock);
	}

	atomic_add_64(&mg->mg_alloc_failures, -failures);

	/*
	 * Prefetch the next potential metaslabs
	 */
	metaslab_prefetch(mg);
}

static uint64_t
metaslab_distance(metaslab_t *msp, dva_t *dva)
{
	uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
	uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
	uint64_t start = msp->ms_map.sm_start >> ms_shift;

	if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
		return (1ULL << 63);

	if (offset < start)
		return ((start - offset) << ms_shift);
	if (offset > start)
		return ((offset - start) << ms_shift);
	return (0);
}

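/*
 * Walk the group's tree from the heaviest metaslab down, skipping any
 * whose weight cannot cover asize and, when allocating a ditto copy,
 * any that sit too close to the DVAs already chosen. The winner is
 * activated and space_map_alloc() is retried until an extent is found;
 * the return value is its offset within the vdev, or -1ULL on failure.
 */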
static uint64_t
metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
    uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags)
{
	spa_t *spa = mg->mg_vd->vdev_spa;
	metaslab_t *msp = NULL;
	uint64_t offset = -1ULL;
	avl_tree_t *t = &mg->mg_metaslab_tree;
	uint64_t activation_weight;
	uint64_t target_distance;
	int i;

	activation_weight = METASLAB_WEIGHT_PRIMARY;
	for (i = 0; i < d; i++) {
		if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
			activation_weight = METASLAB_WEIGHT_SECONDARY;
			break;
		}
	}

	for (;;) {
		boolean_t was_active;

		mutex_enter(&mg->mg_lock);
		for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
			if (msp->ms_weight < asize) {
				spa_dbgmsg(spa, "%s: failed to meet weight "
				    "requirement: vdev %llu, txg %llu, mg %p, "
				    "msp %p, psize %llu, asize %llu, "
				    "failures %llu, weight %llu",
				    spa_name(spa), mg->mg_vd->vdev_id, txg,
				    mg, msp, psize, asize,
				    mg->mg_alloc_failures, msp->ms_weight);
				mutex_exit(&mg->mg_lock);
				return (-1ULL);
			}
			was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
			if (activation_weight == METASLAB_WEIGHT_PRIMARY)
				break;

			target_distance = min_distance +
			    (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);

			for (i = 0; i < d; i++)
				if (metaslab_distance(msp, &dva[i]) <
				    target_distance)
					break;
			if (i == d)
				break;
		}
		mutex_exit(&mg->mg_lock);
		if (msp == NULL)
			return (-1ULL);

		/*
		 * If we've already reached the allowable number of failed
		 * allocation attempts on this metaslab group then we
		 * consider skipping it. We skip it only if we're allowed
		 * to "fast" gang, the physical size is larger than
		 * a gang block, and we're attempting to allocate from
		 * the primary metaslab.
		 */
		if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
		    CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
			spa_dbgmsg(spa, "%s: skipping metaslab group: "
			    "vdev %llu, txg %llu, mg %p, psize %llu, "
			    "asize %llu, failures %llu", spa_name(spa),
			    mg->mg_vd->vdev_id, txg, mg, psize, asize,
			    mg->mg_alloc_failures);
			return (-1ULL);
		}

		mutex_enter(&msp->ms_lock);

		/*
		 * Ensure that the metaslab we have selected is still
		 * capable of handling our request. It's possible that
		 * another thread may have changed the weight while we
		 * were blocked on the metaslab lock.
		 */
		if (msp->ms_weight < asize || (was_active &&
		    !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
		    activation_weight == METASLAB_WEIGHT_PRIMARY)) {
			mutex_exit(&msp->ms_lock);
			continue;
		}

		if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) &&
		    activation_weight == METASLAB_WEIGHT_PRIMARY) {
			metaslab_passivate(msp,
			    msp->ms_weight & ~METASLAB_ACTIVE_MASK);
			mutex_exit(&msp->ms_lock);
			continue;
		}

		if (metaslab_activate(msp, activation_weight) != 0) {
			mutex_exit(&msp->ms_lock);
			continue;
		}

		if ((offset = space_map_alloc(&msp->ms_map, asize)) != -1ULL)
			break;

		atomic_inc_64(&mg->mg_alloc_failures);

		metaslab_passivate(msp, space_map_maxsize(&msp->ms_map));

		mutex_exit(&msp->ms_lock);
	}

	if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
		vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);

	space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, asize);

	mutex_exit(&msp->ms_lock);

	return (offset);
}
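
/*
 * Each failed space_map_alloc() above bumps mg_alloc_failures; once the
 * count passes zfs_mg_alloc_failures, gang-capable allocations larger
 * than SPA_GANGBLOCKSIZE skip this group entirely (when targeting a
 * primary metaslab) until metaslab_sync_reassess() knocks the count
 * back down.
 */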

/*
 * Allocate a block for the specified i/o.
 */
static int
metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
    dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags)
{
	metaslab_group_t *mg, *rotor;
	vdev_t *vd;
	int dshift = 3;
	int all_zero;
	int zio_lock = B_FALSE;
	boolean_t allocatable;
	uint64_t offset = -1ULL;
	uint64_t asize;
	uint64_t distance;

	ASSERT(!DVA_IS_VALID(&dva[d]));

	/*
	 * For testing, make some blocks above a certain size be gang blocks.
	 */
	if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0)
		return (ENOSPC);

	/*
	 * Start at the rotor and loop through all mgs until we find something.
	 * Note that there's no locking on mc_rotor or mc_aliquot because
	 * nothing actually breaks if we miss a few updates -- we just won't
	 * allocate quite as evenly.  It all balances out over time.
	 *
	 * If we are doing ditto or log blocks, try to spread them across
	 * consecutive vdevs.  If we're forced to reuse a vdev before we've
	 * allocated all of our ditto blocks, then try and spread them out on
	 * that vdev as much as possible.  If it turns out to not be possible,
	 * gradually lower our standards until anything becomes acceptable.
	 * Also, allocating on consecutive vdevs (as opposed to random vdevs)
	 * gives us hope of containing our fault domains to something we're
	 * able to reason about.  Otherwise, any two top-level vdev failures
	 * will guarantee the loss of data.  With consecutive allocation,
	 * only two adjacent top-level vdev failures will result in data loss.
	 *
	 * If we are doing gang blocks (hintdva is non-NULL), try to keep
	 * ourselves on the same vdev as our gang block header.  That
	 * way, we can hope for locality in vdev_cache, plus it makes our
	 * fault domains something tractable.
	 */
	if (hintdva) {
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d]));

		/*
		 * It's possible the vdev we're using as the hint no
		 * longer exists (i.e. removed). Consult the rotor when
		 * all else fails.
		 */
		if (vd != NULL) {
			mg = vd->vdev_mg;

			if (flags & METASLAB_HINTBP_AVOID &&
			    mg->mg_next != NULL)
				mg = mg->mg_next;
		} else {
			mg = mc->mc_rotor;
		}
	} else if (d != 0) {
		vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1]));
		mg = vd->vdev_mg->mg_next;
	} else {
		mg = mc->mc_rotor;
	}

	/*
	 * If the hint put us into the wrong metaslab class, or into a
	 * metaslab group that has been passivated, just follow the rotor.
	 */
	if (mg->mg_class != mc || mg->mg_activation_count <= 0)
		mg = mc->mc_rotor;

	rotor = mg;
top:
	all_zero = B_TRUE;
	do {
		ASSERT(mg->mg_activation_count == 1);

		vd = mg->mg_vd;

		/*
		 * Don't allocate from faulted devices.
		 */
		if (zio_lock) {
			spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
			allocatable = vdev_allocatable(vd);
			spa_config_exit(spa, SCL_ZIO, FTAG);
		} else {
			allocatable = vdev_allocatable(vd);
		}
		if (!allocatable)
			goto next;

		/*
		 * Avoid writing single-copy data to a failing vdev
1060 */ 1061 if (freed_map->sm_size == 0) { 1062 for (int t = 0; t < TXG_SIZE; t++) { 1063 space_map_create(&msp->ms_allocmap[t], sm->sm_start, 1064 sm->sm_size, sm->sm_shift, sm->sm_lock); 1065 space_map_create(&msp->ms_freemap[t], sm->sm_start, 1066 sm->sm_size, sm->sm_shift, sm->sm_lock); 1067 } 1068 1069 for (int t = 0; t < TXG_DEFER_SIZE; t++) 1070 space_map_create(&msp->ms_defermap[t], sm->sm_start, 1071 sm->sm_size, sm->sm_shift, sm->sm_lock); 1072 1073 vdev_space_update(vd, 0, 0, sm->sm_size); 1074 } 1075 1076 alloc_delta = smosync->smo_alloc - smo->smo_alloc; 1077 defer_delta = freed_map->sm_space - defer_map->sm_space; 1078 1079 vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); 1080 1081 ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0); 1082 ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0); 1083 1084 /* 1085 * If there's a space_map_load() in progress, wait for it to complete 1086 * so that we have a consistent view of the in-core space map. 1087 * Then, add defer_map (oldest deferred frees) to this map and 1088 * transfer freed_map (this txg's frees) to defer_map. 1089 */ 1090 space_map_load_wait(sm); 1091 space_map_vacate(defer_map, sm->sm_loaded ? space_map_free : NULL, sm); 1092 space_map_vacate(freed_map, space_map_add, defer_map); 1093 1094 *smo = *smosync; 1095 1096 msp->ms_deferspace += defer_delta; 1097 ASSERT3S(msp->ms_deferspace, >=, 0); 1098 ASSERT3S(msp->ms_deferspace, <=, sm->sm_size); 1099 if (msp->ms_deferspace != 0) { 1100 /* 1101 * Keep syncing this metaslab until all deferred frees 1102 * are back in circulation. 1103 */ 1104 vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); 1105 } 1106 1107 /* 1108 * If the map is loaded but no longer active, evict it as soon as all 1109 * future allocations have synced. (If we unloaded it now and then 1110 * loaded a moment later, the map wouldn't reflect those allocations.) 1111 */ 1112 if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { 1113 int evictable = 1; 1114 1115 for (int t = 1; t < TXG_CONCURRENT_STATES; t++) 1116 if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space) 1117 evictable = 0; 1118 1119 if (evictable && !metaslab_debug) 1120 space_map_unload(sm); 1121 } 1122 1123 metaslab_group_sort(mg, msp, metaslab_weight(msp)); 1124 1125 mutex_exit(&msp->ms_lock); 1126} 1127 1128void 1129metaslab_sync_reassess(metaslab_group_t *mg) 1130{ 1131 vdev_t *vd = mg->mg_vd; 1132 int64_t failures = mg->mg_alloc_failures; 1133 1134 /* 1135 * Re-evaluate all metaslabs which have lower offsets than the 1136 * bonus area. 
1137 */ 1138 for (int m = 0; m < vd->vdev_ms_count; m++) { 1139 metaslab_t *msp = vd->vdev_ms[m]; 1140 1141 if (msp->ms_map.sm_start > mg->mg_bonus_area) 1142 break; 1143 1144 mutex_enter(&msp->ms_lock); 1145 metaslab_group_sort(mg, msp, metaslab_weight(msp)); 1146 mutex_exit(&msp->ms_lock); 1147 } 1148 1149 atomic_add_64(&mg->mg_alloc_failures, -failures); 1150 1151 /* 1152 * Prefetch the next potential metaslabs 1153 */ 1154 metaslab_prefetch(mg); 1155} 1156 1157static uint64_t 1158metaslab_distance(metaslab_t *msp, dva_t *dva) 1159{ 1160 uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift; 1161 uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift; 1162 uint64_t start = msp->ms_map.sm_start >> ms_shift; 1163 1164 if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) 1165 return (1ULL << 63); 1166 1167 if (offset < start) 1168 return ((start - offset) << ms_shift); 1169 if (offset > start) 1170 return ((offset - start) << ms_shift); 1171 return (0); 1172} 1173 1174static uint64_t 1175metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize, 1176 uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags) 1177{ 1178 spa_t *spa = mg->mg_vd->vdev_spa; 1179 metaslab_t *msp = NULL; 1180 uint64_t offset = -1ULL; 1181 avl_tree_t *t = &mg->mg_metaslab_tree; 1182 uint64_t activation_weight; 1183 uint64_t target_distance; 1184 int i; 1185 1186 activation_weight = METASLAB_WEIGHT_PRIMARY; 1187 for (i = 0; i < d; i++) { 1188 if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { 1189 activation_weight = METASLAB_WEIGHT_SECONDARY; 1190 break; 1191 } 1192 } 1193 1194 for (;;) { 1195 boolean_t was_active; 1196 1197 mutex_enter(&mg->mg_lock); 1198 for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { 1199 if (msp->ms_weight < asize) { 1200 spa_dbgmsg(spa, "%s: failed to meet weight " 1201 "requirement: vdev %llu, txg %llu, mg %p, " 1202 "msp %p, psize %llu, asize %llu, " 1203 "failures %llu, weight %llu", 1204 spa_name(spa), mg->mg_vd->vdev_id, txg, 1205 mg, msp, psize, asize, 1206 mg->mg_alloc_failures, msp->ms_weight); 1207 mutex_exit(&mg->mg_lock); 1208 return (-1ULL); 1209 } 1210 was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; 1211 if (activation_weight == METASLAB_WEIGHT_PRIMARY) 1212 break; 1213 1214 target_distance = min_distance + 1215 (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1); 1216 1217 for (i = 0; i < d; i++) 1218 if (metaslab_distance(msp, &dva[i]) < 1219 target_distance) 1220 break; 1221 if (i == d) 1222 break; 1223 } 1224 mutex_exit(&mg->mg_lock); 1225 if (msp == NULL) 1226 return (-1ULL); 1227 1228 /* 1229 * If we've already reached the allowable number of failed 1230 * allocation attempts on this metaslab group then we 1231 * consider skipping it. We skip it only if we're allowed 1232 * to "fast" gang, the physical size is larger than 1233 * a gang block, and we're attempting to allocate from 1234 * the primary metaslab. 1235 */ 1236 if (mg->mg_alloc_failures > zfs_mg_alloc_failures && 1237 CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE && 1238 activation_weight == METASLAB_WEIGHT_PRIMARY) { 1239 spa_dbgmsg(spa, "%s: skipping metaslab group: " 1240 "vdev %llu, txg %llu, mg %p, psize %llu, " 1241 "asize %llu, failures %llu", spa_name(spa), 1242 mg->mg_vd->vdev_id, txg, mg, psize, asize, 1243 mg->mg_alloc_failures); 1244 return (-1ULL); 1245 } 1246 1247 mutex_enter(&msp->ms_lock); 1248 1249 /* 1250 * Ensure that the metaslab we have selected is still 1251 * capable of handling our request. 
It's possible that 1252 * another thread may have changed the weight while we 1253 * were blocked on the metaslab lock. 1254 */ 1255 if (msp->ms_weight < asize || (was_active && 1256 !(msp->ms_weight & METASLAB_ACTIVE_MASK) && 1257 activation_weight == METASLAB_WEIGHT_PRIMARY)) { 1258 mutex_exit(&msp->ms_lock); 1259 continue; 1260 } 1261 1262 if ((msp->ms_weight & METASLAB_WEIGHT_SECONDARY) && 1263 activation_weight == METASLAB_WEIGHT_PRIMARY) { 1264 metaslab_passivate(msp, 1265 msp->ms_weight & ~METASLAB_ACTIVE_MASK); 1266 mutex_exit(&msp->ms_lock); 1267 continue; 1268 } 1269 1270 if (metaslab_activate(msp, activation_weight) != 0) { 1271 mutex_exit(&msp->ms_lock); 1272 continue; 1273 } 1274 1275 if ((offset = space_map_alloc(&msp->ms_map, asize)) != -1ULL) 1276 break; 1277 1278 atomic_inc_64(&mg->mg_alloc_failures); 1279 1280 metaslab_passivate(msp, space_map_maxsize(&msp->ms_map)); 1281 1282 mutex_exit(&msp->ms_lock); 1283 } 1284 1285 if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) 1286 vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); 1287 1288 space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, asize); 1289 1290 mutex_exit(&msp->ms_lock); 1291 1292 return (offset); 1293} 1294 1295/* 1296 * Allocate a block for the specified i/o. 1297 */ 1298static int 1299metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, 1300 dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags) 1301{ 1302 metaslab_group_t *mg, *rotor; 1303 vdev_t *vd; 1304 int dshift = 3; 1305 int all_zero; 1306 int zio_lock = B_FALSE; 1307 boolean_t allocatable; 1308 uint64_t offset = -1ULL; 1309 uint64_t asize; 1310 uint64_t distance; 1311 1312 ASSERT(!DVA_IS_VALID(&dva[d])); 1313 1314 /* 1315 * For testing, make some blocks above a certain size be gang blocks. 1316 */ 1317 if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) 1318 return (ENOSPC); 1319 1320 /* 1321 * Start at the rotor and loop through all mgs until we find something. 1322 * Note that there's no locking on mc_rotor or mc_aliquot because 1323 * nothing actually breaks if we miss a few updates -- we just won't 1324 * allocate quite as evenly. It all balances out over time. 1325 * 1326 * If we are doing ditto or log blocks, try to spread them across 1327 * consecutive vdevs. If we're forced to reuse a vdev before we've 1328 * allocated all of our ditto blocks, then try and spread them out on 1329 * that vdev as much as possible. If it turns out to not be possible, 1330 * gradually lower our standards until anything becomes acceptable. 1331 * Also, allocating on consecutive vdevs (as opposed to random vdevs) 1332 * gives us hope of containing our fault domains to something we're 1333 * able to reason about. Otherwise, any two top-level vdev failures 1334 * will guarantee the loss of data. With consecutive allocation, 1335 * only two adjacent top-level vdev failures will result in data loss. 1336 * 1337 * If we are doing gang blocks (hintdva is non-NULL), try to keep 1338 * ourselves on the same vdev as our gang block header. That 1339 * way, we can hope for locality in vdev_cache, plus it makes our 1340 * fault domains something tractable. 1341 */ 1342 if (hintdva) { 1343 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); 1344 1345 /* 1346 * It's possible the vdev we're using as the hint no 1347 * longer exists (i.e. removed). Consult the rotor when 1348 * all else fails. 
1349 */ 1350 if (vd != NULL) { 1351 mg = vd->vdev_mg; 1352 1353 if (flags & METASLAB_HINTBP_AVOID && 1354 mg->mg_next != NULL) 1355 mg = mg->mg_next; 1356 } else { 1357 mg = mc->mc_rotor; 1358 } 1359 } else if (d != 0) { 1360 vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); 1361 mg = vd->vdev_mg->mg_next; 1362 } else { 1363 mg = mc->mc_rotor; 1364 } 1365 1366 /* 1367 * If the hint put us into the wrong metaslab class, or into a 1368 * metaslab group that has been passivated, just follow the rotor. 1369 */ 1370 if (mg->mg_class != mc || mg->mg_activation_count <= 0) 1371 mg = mc->mc_rotor; 1372 1373 rotor = mg; 1374top: 1375 all_zero = B_TRUE; 1376 do { 1377 ASSERT(mg->mg_activation_count == 1); 1378 1379 vd = mg->mg_vd; 1380 1381 /* 1382 * Don't allocate from faulted devices. 1383 */ 1384 if (zio_lock) { 1385 spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); 1386 allocatable = vdev_allocatable(vd); 1387 spa_config_exit(spa, SCL_ZIO, FTAG); 1388 } else { 1389 allocatable = vdev_allocatable(vd); 1390 } 1391 if (!allocatable) 1392 goto next; 1393 1394 /* 1395 * Avoid writing single-copy data to a failing vdev
1390 all_zero = B_FALSE; 1391 goto next; 1392 } 1393 1394 ASSERT(mg->mg_class == mc); 1395 1396 distance = vd->vdev_asize >> dshift; 1397 if (distance <= (1ULL << vd->vdev_ms_shift)) 1398 distance = 0; 1399 else 1400 all_zero = B_FALSE; 1401 1402 asize = vdev_psize_to_asize(vd, psize); 1403 ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); 1404 1405 offset = metaslab_group_alloc(mg, psize, asize, txg, distance, 1406 dva, d, flags); 1407 if (offset != -1ULL) { 1408 /* 1409 * If we've just selected this metaslab group, 1410 * figure out whether the corresponding vdev is 1411 * over- or under-used relative to the pool, 1412 * and set an allocation bias to even it out. 1413 */ 1414 if (mc->mc_aliquot == 0) { 1415 vdev_stat_t *vs = &vd->vdev_stat; 1416 int64_t vu, cu; 1417 1418 vu = (vs->vs_alloc * 100) / (vs->vs_space + 1); 1419 cu = (mc->mc_alloc * 100) / (mc->mc_space + 1); 1420 1421 /* 1422 * Calculate how much more or less we should 1423 * try to allocate from this device during 1424 * this iteration around the rotor. 1425 * For example, if a device is 80% full 1426 * and the pool is 20% full then we should 1427 * reduce allocations by 60% on this device. 1428 * 1429 * mg_bias = (20 - 80) * 512K / 100 = -307K 1430 * 1431 * This reduces allocations by 307K for this 1432 * iteration. 1433 */ 1434 mg->mg_bias = ((cu - vu) * 1435 (int64_t)mg->mg_aliquot) / 100; 1436 } 1437 1438 if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= 1439 mg->mg_aliquot + mg->mg_bias) { 1440 mc->mc_rotor = mg->mg_next; 1441 mc->mc_aliquot = 0; 1442 } 1443 1444 DVA_SET_VDEV(&dva[d], vd->vdev_id); 1445 DVA_SET_OFFSET(&dva[d], offset); 1446 DVA_SET_GANG(&dva[d], !!(flags & METASLAB_GANG_HEADER)); 1447 DVA_SET_ASIZE(&dva[d], asize); 1448 1449 return (0); 1450 } 1451next: 1452 mc->mc_rotor = mg->mg_next; 1453 mc->mc_aliquot = 0; 1454 } while ((mg = mg->mg_next) != rotor); 1455 1456 if (!all_zero) { 1457 dshift++; 1458 ASSERT(dshift < 64); 1459 goto top; 1460 } 1461 1462 if (!allocatable && !zio_lock) { 1463 dshift = 3; 1464 zio_lock = B_TRUE; 1465 goto top; 1466 } 1467 1468 bzero(&dva[d], sizeof (dva_t)); 1469 1470 return (ENOSPC); 1471} 1472 1473/* 1474 * Free the block represented by DVA in the context of the specified 1475 * transaction group. 
1476 */ 1477static void 1478metaslab_free_dva(spa_t *spa, const dva_t *dva, uint64_t txg, boolean_t now) 1479{ 1480 uint64_t vdev = DVA_GET_VDEV(dva); 1481 uint64_t offset = DVA_GET_OFFSET(dva); 1482 uint64_t size = DVA_GET_ASIZE(dva); 1483 vdev_t *vd; 1484 metaslab_t *msp; 1485 1486 ASSERT(DVA_IS_VALID(dva)); 1487 1488 if (txg > spa_freeze_txg(spa)) 1489 return; 1490 1491 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 1492 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { 1493 cmn_err(CE_WARN, "metaslab_free_dva(): bad DVA %llu:%llu", 1494 (u_longlong_t)vdev, (u_longlong_t)offset); 1495 ASSERT(0); 1496 return; 1497 } 1498 1499 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 1500 1501 if (DVA_GET_GANG(dva)) 1502 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 1503 1504 mutex_enter(&msp->ms_lock); 1505 1506 if (now) { 1507 space_map_remove(&msp->ms_allocmap[txg & TXG_MASK], 1508 offset, size); 1509 space_map_free(&msp->ms_map, offset, size); 1510 } else { 1511 if (msp->ms_freemap[txg & TXG_MASK].sm_space == 0) 1512 vdev_dirty(vd, VDD_METASLAB, msp, txg); 1513 space_map_add(&msp->ms_freemap[txg & TXG_MASK], offset, size); 1514 } 1515 1516 mutex_exit(&msp->ms_lock); 1517} 1518 1519/* 1520 * Intent log support: upon opening the pool after a crash, notify the SPA 1521 * of blocks that the intent log has allocated for immediate write, but 1522 * which are still considered free by the SPA because the last transaction 1523 * group didn't commit yet. 1524 */ 1525static int 1526metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) 1527{ 1528 uint64_t vdev = DVA_GET_VDEV(dva); 1529 uint64_t offset = DVA_GET_OFFSET(dva); 1530 uint64_t size = DVA_GET_ASIZE(dva); 1531 vdev_t *vd; 1532 metaslab_t *msp; 1533 int error = 0; 1534 1535 ASSERT(DVA_IS_VALID(dva)); 1536 1537 if ((vd = vdev_lookup_top(spa, vdev)) == NULL || 1538 (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) 1539 return (ENXIO); 1540 1541 msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; 1542 1543 if (DVA_GET_GANG(dva)) 1544 size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); 1545 1546 mutex_enter(&msp->ms_lock); 1547 1548 if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded) 1549 error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); 1550 1551 if (error == 0 && !space_map_contains(&msp->ms_map, offset, size)) 1552 error = ENOENT; 1553 1554 if (error || txg == 0) { /* txg == 0 indicates dry run */ 1555 mutex_exit(&msp->ms_lock); 1556 return (error); 1557 } 1558 1559 space_map_claim(&msp->ms_map, offset, size); 1560 1561 if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ 1562 if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) 1563 vdev_dirty(vd, VDD_METASLAB, msp, txg); 1564 space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); 1565 } 1566 1567 mutex_exit(&msp->ms_lock); 1568 1569 return (0); 1570} 1571 1572int 1573metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, 1574 int ndvas, uint64_t txg, blkptr_t *hintbp, int flags) 1575{ 1576 dva_t *dva = bp->blk_dva; 1577 dva_t *hintdva = hintbp->blk_dva; 1578 int error = 0; 1579 1580 ASSERT(bp->blk_birth == 0); 1581 ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); 1582 1583 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 1584 1585 if (mc->mc_rotor == NULL) { /* no vdevs in this class */ 1586 spa_config_exit(spa, SCL_ALLOC, FTAG); 1587 return (ENOSPC); 1588 } 1589 1590 ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); 1591 ASSERT(BP_GET_NDVAS(bp) == 0); 1592 ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); 1593 1594 
for (int d = 0; d < ndvas; d++) { 1595 error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, 1596 txg, flags); 1597 if (error) { 1598 for (d--; d >= 0; d--) { 1599 metaslab_free_dva(spa, &dva[d], txg, B_TRUE); 1600 bzero(&dva[d], sizeof (dva_t)); 1601 } 1602 spa_config_exit(spa, SCL_ALLOC, FTAG); 1603 return (error); 1604 } 1605 } 1606 ASSERT(error == 0); 1607 ASSERT(BP_GET_NDVAS(bp) == ndvas); 1608 1609 spa_config_exit(spa, SCL_ALLOC, FTAG); 1610 1611 BP_SET_BIRTH(bp, txg, txg); 1612 1613 return (0); 1614} 1615 1616void 1617metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) 1618{ 1619 const dva_t *dva = bp->blk_dva; 1620 int ndvas = BP_GET_NDVAS(bp); 1621 1622 ASSERT(!BP_IS_HOLE(bp)); 1623 ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); 1624 1625 spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); 1626 1627 for (int d = 0; d < ndvas; d++) 1628 metaslab_free_dva(spa, &dva[d], txg, now); 1629 1630 spa_config_exit(spa, SCL_FREE, FTAG); 1631} 1632 1633int 1634metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) 1635{ 1636 const dva_t *dva = bp->blk_dva; 1637 int ndvas = BP_GET_NDVAS(bp); 1638 int error = 0; 1639 1640 ASSERT(!BP_IS_HOLE(bp)); 1641 1642 if (txg != 0) { 1643 /* 1644 * First do a dry run to make sure all DVAs are claimable, 1645 * so we don't have to unwind from partial failures below. 1646 */ 1647 if ((error = metaslab_claim(spa, bp, 0)) != 0) 1648 return (error); 1649 } 1650 1651 spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); 1652 1653 for (int d = 0; d < ndvas; d++) 1654 if ((error = metaslab_claim_dva(spa, &dva[d], txg)) != 0) 1655 break; 1656 1657 spa_config_exit(spa, SCL_ALLOC, FTAG); 1658 1659 ASSERT(error == 0 || txg == 0); 1660 1661 return (error); 1662}
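/*
 * Editorial sketch (not part of the original metaslab.c): the comments in
 * metaslab_weight() and metaslab_alloc_dva() above describe two small bits
 * of arithmetic -- the 2x-to-1x "free bandwidth" weighting of metaslabs and
 * the per-group allocation bias applied around the rotor.  The standalone
 * program below reproduces that arithmetic with the example figures from
 * those comments so it can be compiled and run outside the kernel.
 * Identifiers such as ms_index, ms_count and aliquot are illustrative
 * stand-ins, not ZFS interfaces.
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/*
	 * Free-bandwidth weighting: the baseline weight is the metaslab's
	 * free space; metaslabs at lower offsets (outer tracks) are scaled
	 * up to 2x, tapering to 1x at the far end of the vdev.  This mirrors
	 * weight = 2 * weight - ((sm_start >> ms_shift) * weight) / ms_count.
	 */
	uint64_t space = 1ULL << 30;	/* 1 GB free, illustrative */
	uint64_t ms_count = 200;	/* metaslabs on the vdev, illustrative */

	for (uint64_t ms_index = 0; ms_index < ms_count;
	    ms_index += ms_count / 4) {
		uint64_t weight = 2 * space - (ms_index * space) / ms_count;
		printf("metaslab %3llu: weight %llu (%.2fx free space)\n",
		    (unsigned long long)ms_index, (unsigned long long)weight,
		    (double)weight / (double)space);
	}

	/*
	 * Allocation bias: if a vdev is 80% full while its class is only
	 * 20% full, mg_bias = (cu - vu) * aliquot / 100 trims this group's
	 * share of the rotor for the current pass -- about -307K for a
	 * 512K aliquot, matching the worked example in metaslab_alloc_dva().
	 */
	int64_t vu = 80;		/* vdev utilization, percent */
	int64_t cu = 20;		/* class utilization, percent */
	int64_t aliquot = 512 * 1024;	/* 512K aliquot */
	int64_t mg_bias = ((cu - vu) * aliquot) / 100;

	printf("mg_bias = %lld bytes\n", (long long)mg_bias);

	return (0);
}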