1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25/* 26 * Copyright (c) 2012 by Delphix. All rights reserved. 27 */ 28 29#include <sys/zfs_context.h> 30#include <sys/spa.h> 31#include <sys/dmu.h> 32#include <sys/zio.h> 33#include <sys/space_map.h> 34 35SYSCTL_DECL(_vfs_zfs); 36static int space_map_last_hope; 37TUNABLE_INT("vfs.zfs.space_map_last_hope", &space_map_last_hope); 38SYSCTL_INT(_vfs_zfs, OID_AUTO, space_map_last_hope, CTLFLAG_RDTUN, 39 &space_map_last_hope, 0, 40 "If kernel panic in space_map code on pool import, import the pool in readonly mode and backup all your data before trying this option."); 41 42static kmem_cache_t *space_seg_cache; 43 44void 45space_map_init(void) 46{ 47 ASSERT(space_seg_cache == NULL); 48 space_seg_cache = kmem_cache_create("space_seg_cache", 49 sizeof (space_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0); 50} 51 52void 53space_map_fini(void) 54{ 55 kmem_cache_destroy(space_seg_cache); 56 space_seg_cache = NULL; 57} 58 59/* 60 * Space map routines. 61 * NOTE: caller is responsible for all locking. 62 */ 63static int 64space_map_seg_compare(const void *x1, const void *x2) 65{ 66 const space_seg_t *s1 = x1; 67 const space_seg_t *s2 = x2; 68 69 if (s1->ss_start < s2->ss_start) { 70 if (s1->ss_end > s2->ss_start) 71 return (0); 72 return (-1); 73 } 74 if (s1->ss_start > s2->ss_start) { 75 if (s1->ss_start < s2->ss_end) 76 return (0); 77 return (1); 78 } 79 return (0); 80} 81 82void 83space_map_create(space_map_t *sm, uint64_t start, uint64_t size, uint8_t shift, 84 kmutex_t *lp) 85{ 86 bzero(sm, sizeof (*sm)); 87 88 cv_init(&sm->sm_load_cv, NULL, CV_DEFAULT, NULL); 89 90 avl_create(&sm->sm_root, space_map_seg_compare, 91 sizeof (space_seg_t), offsetof(struct space_seg, ss_node)); 92 93 sm->sm_start = start; 94 sm->sm_size = size; 95 sm->sm_shift = shift; 96 sm->sm_lock = lp; 97} 98 99void 100space_map_destroy(space_map_t *sm) 101{ 102 ASSERT(!sm->sm_loaded && !sm->sm_loading); 103 VERIFY0(sm->sm_space); 104 avl_destroy(&sm->sm_root); 105 cv_destroy(&sm->sm_load_cv); 106} 107 108void 109space_map_add(space_map_t *sm, uint64_t start, uint64_t size) 110{ 111 avl_index_t where; 112 space_seg_t *ss_before, *ss_after, *ss; 113 uint64_t end = start + size; 114 int merge_before, merge_after; 115 116 ASSERT(MUTEX_HELD(sm->sm_lock)); 117 VERIFY(!sm->sm_condensing); 118 VERIFY(size != 0); 119 VERIFY3U(start, >=, sm->sm_start); 120 VERIFY3U(end, <=, sm->sm_start + sm->sm_size); 121 VERIFY(sm->sm_space + size <= sm->sm_size); 122 VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0); 123 VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0); 124again: 125 ss = space_map_find(sm, start, size, &where); 126 if (ss != NULL) { 127 zfs_panic_recover("zfs: allocating allocated segment" 128 "(offset=%llu size=%llu)\n", 129 (longlong_t)start, (longlong_t)size); 130 return; 131 } 132 if (ss != NULL && space_map_last_hope) { 133 uint64_t sstart, ssize; 134 135 if (ss->ss_start > start) 136 sstart = ss->ss_start; 137 else 138 sstart = start; 139 if (ss->ss_end > end) 140 ssize = end - sstart; 141 else 142 ssize = ss->ss_end - sstart; 143 ZFS_LOG(0, 144 "Removing colliding space_map range (start=%ju end=%ju). Good luck!", 145 (uintmax_t)sstart, (uintmax_t)(sstart + ssize)); 146 space_map_remove(sm, sstart, ssize); 147 goto again; 148 } 149 150 /* Make sure we don't overlap with either of our neighbors */ 151 VERIFY(ss == NULL); 152 153 ss_before = avl_nearest(&sm->sm_root, where, AVL_BEFORE); 154 ss_after = avl_nearest(&sm->sm_root, where, AVL_AFTER); 155 156 merge_before = (ss_before != NULL && ss_before->ss_end == start); 157 merge_after = (ss_after != NULL && ss_after->ss_start == end); 158 159 if (merge_before && merge_after) { 160 avl_remove(&sm->sm_root, ss_before); 161 if (sm->sm_pp_root) { 162 avl_remove(sm->sm_pp_root, ss_before); 163 avl_remove(sm->sm_pp_root, ss_after); 164 } 165 ss_after->ss_start = ss_before->ss_start; 166 kmem_cache_free(space_seg_cache, ss_before); 167 ss = ss_after; 168 } else if (merge_before) { 169 ss_before->ss_end = end; 170 if (sm->sm_pp_root) 171 avl_remove(sm->sm_pp_root, ss_before); 172 ss = ss_before; 173 } else if (merge_after) { 174 ss_after->ss_start = start; 175 if (sm->sm_pp_root) 176 avl_remove(sm->sm_pp_root, ss_after); 177 ss = ss_after; 178 } else { 179 ss = kmem_cache_alloc(space_seg_cache, KM_SLEEP); 180 ss->ss_start = start; 181 ss->ss_end = end; 182 avl_insert(&sm->sm_root, ss, where); 183 } 184 185 if (sm->sm_pp_root) 186 avl_add(sm->sm_pp_root, ss); 187 188 sm->sm_space += size; 189} 190 191void 192space_map_remove(space_map_t *sm, uint64_t start, uint64_t size) 193{ 194#ifdef illumos 195 avl_index_t where; 196#endif 197 space_seg_t *ss, *newseg; 198 uint64_t end = start + size; 199 int left_over, right_over; 200 201 VERIFY(!sm->sm_condensing); 202#ifdef illumos 203 ss = space_map_find(sm, start, size, &where); 204#else 205 ss = space_map_find(sm, start, size, NULL); 206#endif 207 208 /* Make sure we completely overlap with someone */ 209 if (ss == NULL) { 210 zfs_panic_recover("zfs: freeing free segment " 211 "(offset=%llu size=%llu)", 212 (longlong_t)start, (longlong_t)size); 213 return; 214 } 215 VERIFY3U(ss->ss_start, <=, start); 216 VERIFY3U(ss->ss_end, >=, end); 217 VERIFY(sm->sm_space - size < sm->sm_size); 218 219 left_over = (ss->ss_start != start); 220 right_over = (ss->ss_end != end); 221 222 if (sm->sm_pp_root) 223 avl_remove(sm->sm_pp_root, ss); 224 225 if (left_over && right_over) { 226 newseg = kmem_cache_alloc(space_seg_cache, KM_SLEEP); 227 newseg->ss_start = end; 228 newseg->ss_end = ss->ss_end; 229 ss->ss_end = start; 230 avl_insert_here(&sm->sm_root, newseg, ss, AVL_AFTER); 231 if (sm->sm_pp_root) 232 avl_add(sm->sm_pp_root, newseg); 233 } else if (left_over) { 234 ss->ss_end = start; 235 } else if (right_over) { 236 ss->ss_start = end; 237 } else { 238 avl_remove(&sm->sm_root, ss); 239 kmem_cache_free(space_seg_cache, ss); 240 ss = NULL; 241 } 242 243 if (sm->sm_pp_root && ss != NULL) 244 avl_add(sm->sm_pp_root, ss); 245 246 sm->sm_space -= size; 247} 248 249space_seg_t * 250space_map_find(space_map_t *sm, uint64_t start, uint64_t size, 251 avl_index_t *wherep) 252{ 253 space_seg_t ssearch, *ss; 254 255 ASSERT(MUTEX_HELD(sm->sm_lock)); 256 VERIFY(size != 0); 257 VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0); 258 VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0); 259 260 ssearch.ss_start = start; 261 ssearch.ss_end = start + size; 262 ss = avl_find(&sm->sm_root, &ssearch, wherep); 263 264 if (ss != NULL && ss->ss_start <= start && ss->ss_end >= start + size) 265 return (ss); 266 return (NULL); 267} 268 269boolean_t 270space_map_contains(space_map_t *sm, uint64_t start, uint64_t size) 271{ 272 avl_index_t where; 273 274 return (space_map_find(sm, start, size, &where) != 0); 275} 276 277void 278space_map_swap(space_map_t **msrc, space_map_t **mdst) 279{ 280 space_map_t *sm; 281 282 ASSERT(MUTEX_HELD((*msrc)->sm_lock)); 283 ASSERT0((*mdst)->sm_space); 284 ASSERT0(avl_numnodes(&(*mdst)->sm_root)); 285 286 sm = *msrc; 287 *msrc = *mdst; 288 *mdst = sm; 289} 290 291void 292space_map_vacate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest) 293{ 294 space_seg_t *ss; 295 void *cookie = NULL; 296 297 ASSERT(MUTEX_HELD(sm->sm_lock)); 298 299 while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) { 300 if (func != NULL) 301 func(mdest, ss->ss_start, ss->ss_end - ss->ss_start); 302 kmem_cache_free(space_seg_cache, ss); 303 } 304 sm->sm_space = 0; 305} 306 307void 308space_map_walk(space_map_t *sm, space_map_func_t *func, space_map_t *mdest) 309{ 310 space_seg_t *ss; 311 312 ASSERT(MUTEX_HELD(sm->sm_lock)); 313 314 for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) 315 func(mdest, ss->ss_start, ss->ss_end - ss->ss_start); 316} 317 318/* 319 * Wait for any in-progress space_map_load() to complete. 320 */ 321void 322space_map_load_wait(space_map_t *sm) 323{ 324 ASSERT(MUTEX_HELD(sm->sm_lock)); 325 326 while (sm->sm_loading) { 327 ASSERT(!sm->sm_loaded); 328 cv_wait(&sm->sm_load_cv, sm->sm_lock); 329 } 330} 331 332/* 333 * Note: space_map_load() will drop sm_lock across dmu_read() calls. 334 * The caller must be OK with this. 335 */ 336int 337space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype, 338 space_map_obj_t *smo, objset_t *os) 339{ 340 uint64_t *entry, *entry_map, *entry_map_end; 341 uint64_t bufsize, size, offset, end, space; 342 uint64_t mapstart = sm->sm_start; 343 int error = 0; 344 345 ASSERT(MUTEX_HELD(sm->sm_lock)); 346 ASSERT(!sm->sm_loaded); 347 ASSERT(!sm->sm_loading); 348 349 sm->sm_loading = B_TRUE; 350 end = smo->smo_objsize; 351 space = smo->smo_alloc; 352 353 ASSERT(sm->sm_ops == NULL); 354 VERIFY0(sm->sm_space); 355 356 if (maptype == SM_FREE) { 357 space_map_add(sm, sm->sm_start, sm->sm_size); 358 space = sm->sm_size - space; 359 } 360 361 bufsize = 1ULL << SPACE_MAP_BLOCKSHIFT; 362 entry_map = zio_buf_alloc(bufsize); 363 364 mutex_exit(sm->sm_lock); 365 if (end > bufsize) 366 dmu_prefetch(os, smo->smo_object, bufsize, end - bufsize); 367 mutex_enter(sm->sm_lock); 368 369 for (offset = 0; offset < end; offset += bufsize) { 370 size = MIN(end - offset, bufsize); 371 VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0); 372 VERIFY(size != 0); 373 374 dprintf("object=%llu offset=%llx size=%llx\n", 375 smo->smo_object, offset, size); 376 377 mutex_exit(sm->sm_lock); 378 error = dmu_read(os, smo->smo_object, offset, size, entry_map, 379 DMU_READ_PREFETCH); 380 mutex_enter(sm->sm_lock); 381 if (error != 0) 382 break; 383 384 entry_map_end = entry_map + (size / sizeof (uint64_t)); 385 for (entry = entry_map; entry < entry_map_end; entry++) { 386 uint64_t e = *entry; 387 388 if (SM_DEBUG_DECODE(e)) /* Skip debug entries */ 389 continue; 390 391 (SM_TYPE_DECODE(e) == maptype ? 392 space_map_add : space_map_remove)(sm, 393 (SM_OFFSET_DECODE(e) << sm->sm_shift) + mapstart, 394 SM_RUN_DECODE(e) << sm->sm_shift); 395 } 396 } 397 398 if (error == 0) { 399 VERIFY3U(sm->sm_space, ==, space); 400 401 sm->sm_loaded = B_TRUE; 402 sm->sm_ops = ops; 403 if (ops != NULL) 404 ops->smop_load(sm); 405 } else { 406 space_map_vacate(sm, NULL, NULL); 407 } 408 409 zio_buf_free(entry_map, bufsize); 410 411 sm->sm_loading = B_FALSE; 412 413 cv_broadcast(&sm->sm_load_cv); 414 415 return (error); 416} 417 418void 419space_map_unload(space_map_t *sm) 420{ 421 ASSERT(MUTEX_HELD(sm->sm_lock)); 422 423 if (sm->sm_loaded && sm->sm_ops != NULL) 424 sm->sm_ops->smop_unload(sm); 425 426 sm->sm_loaded = B_FALSE; 427 sm->sm_ops = NULL; 428 429 space_map_vacate(sm, NULL, NULL); 430} 431 432uint64_t 433space_map_maxsize(space_map_t *sm) 434{ 435 ASSERT(sm->sm_ops != NULL); 436 return (sm->sm_ops->smop_max(sm)); 437} 438 439uint64_t 440space_map_alloc(space_map_t *sm, uint64_t size) 441{ 442 uint64_t start; 443 444 start = sm->sm_ops->smop_alloc(sm, size); 445 if (start != -1ULL) 446 space_map_remove(sm, start, size); 447 return (start); 448} 449 450void 451space_map_claim(space_map_t *sm, uint64_t start, uint64_t size) 452{ 453 sm->sm_ops->smop_claim(sm, start, size); 454 space_map_remove(sm, start, size); 455} 456 457void 458space_map_free(space_map_t *sm, uint64_t start, uint64_t size) 459{ 460 space_map_add(sm, start, size); 461 sm->sm_ops->smop_free(sm, start, size); 462} 463 464/* 465 * Note: space_map_sync() will drop sm_lock across dmu_write() calls. 466 */ 467void 468space_map_sync(space_map_t *sm, uint8_t maptype, 469 space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx) 470{ 471 spa_t *spa = dmu_objset_spa(os); 472 avl_tree_t *t = &sm->sm_root; 473 space_seg_t *ss; 474 uint64_t bufsize, start, size, run_len, total, sm_space, nodes; 475 uint64_t *entry, *entry_map, *entry_map_end; 476 477 ASSERT(MUTEX_HELD(sm->sm_lock)); 478 479 if (sm->sm_space == 0) 480 return; 481 482 dprintf("object %4llu, txg %llu, pass %d, %c, count %lu, space %llx\n", 483 smo->smo_object, dmu_tx_get_txg(tx), spa_sync_pass(spa), 484 maptype == SM_ALLOC ? 'A' : 'F', avl_numnodes(&sm->sm_root), 485 sm->sm_space); 486 487 if (maptype == SM_ALLOC) 488 smo->smo_alloc += sm->sm_space; 489 else 490 smo->smo_alloc -= sm->sm_space; 491 492 bufsize = (8 + avl_numnodes(&sm->sm_root)) * sizeof (uint64_t); 493 bufsize = MIN(bufsize, 1ULL << SPACE_MAP_BLOCKSHIFT); 494 entry_map = zio_buf_alloc(bufsize); 495 entry_map_end = entry_map + (bufsize / sizeof (uint64_t)); 496 entry = entry_map; 497 498 *entry++ = SM_DEBUG_ENCODE(1) | 499 SM_DEBUG_ACTION_ENCODE(maptype) | 500 SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(spa)) | 501 SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx)); 502 503 total = 0; 504 nodes = avl_numnodes(&sm->sm_root); 505 sm_space = sm->sm_space; 506 for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss)) { 507 size = ss->ss_end - ss->ss_start; 508 start = (ss->ss_start - sm->sm_start) >> sm->sm_shift; 509 510 total += size; 511 size >>= sm->sm_shift; 512 513 while (size) { 514 run_len = MIN(size, SM_RUN_MAX); 515 516 if (entry == entry_map_end) { 517 mutex_exit(sm->sm_lock); 518 dmu_write(os, smo->smo_object, smo->smo_objsize, 519 bufsize, entry_map, tx); 520 mutex_enter(sm->sm_lock); 521 smo->smo_objsize += bufsize; 522 entry = entry_map; 523 } 524 525 *entry++ = SM_OFFSET_ENCODE(start) | 526 SM_TYPE_ENCODE(maptype) | 527 SM_RUN_ENCODE(run_len); 528 529 start += run_len; 530 size -= run_len; 531 } 532 } 533 534 if (entry != entry_map) { 535 size = (entry - entry_map) * sizeof (uint64_t); 536 mutex_exit(sm->sm_lock); 537 dmu_write(os, smo->smo_object, smo->smo_objsize, 538 size, entry_map, tx); 539 mutex_enter(sm->sm_lock); 540 smo->smo_objsize += size; 541 } 542 543 /* 544 * Ensure that the space_map's accounting wasn't changed 545 * while we were in the middle of writing it out. 546 */ 547 VERIFY3U(nodes, ==, avl_numnodes(&sm->sm_root)); 548 VERIFY3U(sm->sm_space, ==, sm_space); 549 VERIFY3U(sm->sm_space, ==, total); 550 551 zio_buf_free(entry_map, bufsize); 552} 553 554void 555space_map_truncate(space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx) 556{ 557 VERIFY(dmu_free_range(os, smo->smo_object, 0, -1ULL, tx) == 0); 558 559 smo->smo_objsize = 0; 560 smo->smo_alloc = 0; 561} 562 563/* 564 * Space map reference trees. 565 * 566 * A space map is a collection of integers. Every integer is either 567 * in the map, or it's not. A space map reference tree generalizes 568 * the idea: it allows its members to have arbitrary reference counts, 569 * as opposed to the implicit reference count of 0 or 1 in a space map. 570 * This representation comes in handy when computing the union or 571 * intersection of multiple space maps. For example, the union of 572 * N space maps is the subset of the reference tree with refcnt >= 1. 573 * The intersection of N space maps is the subset with refcnt >= N. 574 * 575 * [It's very much like a Fourier transform. Unions and intersections 576 * are hard to perform in the 'space map domain', so we convert the maps 577 * into the 'reference count domain', where it's trivial, then invert.] 578 * 579 * vdev_dtl_reassess() uses computations of this form to determine 580 * DTL_MISSING and DTL_OUTAGE for interior vdevs -- e.g. a RAID-Z vdev 581 * has an outage wherever refcnt >= vdev_nparity + 1, and a mirror vdev 582 * has an outage wherever refcnt >= vdev_children. 583 */ 584static int 585space_map_ref_compare(const void *x1, const void *x2) 586{ 587 const space_ref_t *sr1 = x1; 588 const space_ref_t *sr2 = x2; 589 590 if (sr1->sr_offset < sr2->sr_offset) 591 return (-1); 592 if (sr1->sr_offset > sr2->sr_offset) 593 return (1); 594 595 if (sr1 < sr2) 596 return (-1); 597 if (sr1 > sr2) 598 return (1); 599 600 return (0); 601} 602 603void 604space_map_ref_create(avl_tree_t *t) 605{ 606 avl_create(t, space_map_ref_compare, 607 sizeof (space_ref_t), offsetof(space_ref_t, sr_node)); 608} 609 610void 611space_map_ref_destroy(avl_tree_t *t) 612{ 613 space_ref_t *sr; 614 void *cookie = NULL; 615 616 while ((sr = avl_destroy_nodes(t, &cookie)) != NULL) 617 kmem_free(sr, sizeof (*sr)); 618 619 avl_destroy(t); 620} 621 622static void 623space_map_ref_add_node(avl_tree_t *t, uint64_t offset, int64_t refcnt) 624{ 625 space_ref_t *sr; 626 627 sr = kmem_alloc(sizeof (*sr), KM_SLEEP); 628 sr->sr_offset = offset; 629 sr->sr_refcnt = refcnt; 630 631 avl_add(t, sr); 632} 633 634void 635space_map_ref_add_seg(avl_tree_t *t, uint64_t start, uint64_t end, 636 int64_t refcnt) 637{ 638 space_map_ref_add_node(t, start, refcnt); 639 space_map_ref_add_node(t, end, -refcnt); 640} 641 642/* 643 * Convert (or add) a space map into a reference tree. 644 */ 645void 646space_map_ref_add_map(avl_tree_t *t, space_map_t *sm, int64_t refcnt) 647{ 648 space_seg_t *ss; 649 650 ASSERT(MUTEX_HELD(sm->sm_lock)); 651 652 for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) 653 space_map_ref_add_seg(t, ss->ss_start, ss->ss_end, refcnt); 654} 655 656/* 657 * Convert a reference tree into a space map. The space map will contain 658 * all members of the reference tree for which refcnt >= minref. 659 */ 660void 661space_map_ref_generate_map(avl_tree_t *t, space_map_t *sm, int64_t minref) 662{ 663 uint64_t start = -1ULL; 664 int64_t refcnt = 0; 665 space_ref_t *sr; 666 667 ASSERT(MUTEX_HELD(sm->sm_lock)); 668 669 space_map_vacate(sm, NULL, NULL); 670 671 for (sr = avl_first(t); sr != NULL; sr = AVL_NEXT(t, sr)) { 672 refcnt += sr->sr_refcnt; 673 if (refcnt >= minref) { 674 if (start == -1ULL) { 675 start = sr->sr_offset; 676 } 677 } else { 678 if (start != -1ULL) { 679 uint64_t end = sr->sr_offset; 680 ASSERT(start <= end); 681 if (end > start) 682 space_map_add(sm, start, end - start); 683 start = -1ULL; 684 } 685 } 686 } 687 ASSERT(refcnt == 0); 688 ASSERT(start == -1ULL); 689} 690