/* ddt.c revision 243674 */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/ddt.h>
#include <sys/zap.h>
#include <sys/dmu_tx.h>
#include <sys/arc.h>
#include <sys/dsl_pool.h>
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/dsl_scan.h>

/*
 * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
 */
int zfs_dedup_prefetch = 1;

SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, dedup, CTLFLAG_RW, 0, "ZFS DEDUP");
TUNABLE_INT("vfs.zfs.dedup.prefetch", &zfs_dedup_prefetch);
SYSCTL_INT(_vfs_zfs_dedup, OID_AUTO, prefetch, CTLFLAG_RW, &zfs_dedup_prefetch,
    0, "Enable/disable prefetching of dedup-ed blocks which are going to be freed");

/*
 * Per-type on-disk DDT implementations; currently only the ZAP-backed
 * variant exists (indexed by enum ddt_type).
 */
static const ddt_ops_t *ddt_ops[DDT_TYPES] = {
	&ddt_zap_ops,
};

/* Human-readable names for each DDT class, indexed by enum ddt_class. */
static const char *ddt_class_name[DDT_CLASSES] = {
	"ditto",
	"duplicate",
	"unique",
};

/*
 * Create the on-disk object backing one (type, class) table, link it into
 * the pool directory under its canonical name, and record its (current,
 * in-core) histogram in the pool's DDT statistics ZAP.
 */
static void
ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
    dmu_tx_t *tx)
{
	spa_t *spa = ddt->ddt_spa;
	objset_t *os = ddt->ddt_os;
	uint64_t *objectp = &ddt->ddt_object[type][class];
	/* Let the backing store pre-hash keys when the checksum dedups. */
	boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_dedup;
	char name[DDT_NAMELEN];

	ddt_object_name(ddt, type, class, name);

	ASSERT(*objectp == 0);
	VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0);
	ASSERT(*objectp != 0);

	VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name,
	    sizeof (uint64_t), 1, objectp, tx) == 0);

	VERIFY(zap_add(os, spa->spa_ddt_stat_object, name,
	    sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
	    &ddt->ddt_histogram[type][class], tx) == 0);
}

/*
 * Tear down an empty (type, class) table: remove its pool-directory and
 * statistics entries, destroy the backing object, and clear the cached
 * object stats.  The table must contain no entries and have an empty
 * histogram (asserted below).
 */
static void
ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
    dmu_tx_t *tx)
{
	spa_t *spa = ddt->ddt_spa;
	objset_t *os = ddt->ddt_os;
	uint64_t *objectp = &ddt->ddt_object[type][class];
	char name[DDT_NAMELEN];

	ddt_object_name(ddt, type, class, name);

	ASSERT(*objectp != 0);
	ASSERT(ddt_object_count(ddt, type, class) == 0);
	ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
	VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0);
	VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0);
	VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0);
	bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t));

	*objectp = 0;
}

/*
 * Load one (type, class) table from disk: resolve its object number via the
 * pool directory, read its histogram from the statistics ZAP, and seed the
 * cached object statistics (count, disk space, memory-resident size).
 * Returns 0 on success or an errno (e.g. ENOENT if the table doesn't exist).
 */
static int
ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
{
	ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
	dmu_object_info_t doi;
	char name[DDT_NAMELEN];
	int error;

	ddt_object_name(ddt, type, class, name);

	error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name,
	    sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);

	if (error)
		return (error);

	error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
	    sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
	    &ddt->ddt_histogram[type][class]);

	/*
	 * Seed the cached statistics.
	 */
	VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);

	ddo->ddo_count = ddt_object_count(ddt, type, class);
	ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
	ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;

	/*
	 * The histogram lookup is expected to succeed whenever the table's
	 * directory entry exists (they are created together above).
	 */
	ASSERT(error == 0);
	return (error);
}

/*
 * Write the in-core histogram for one (type, class) table out to the
 * statistics ZAP and refresh the cached object statistics.
 */
static void
ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
    dmu_tx_t *tx)
{
	ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
	dmu_object_info_t doi;
	char name[DDT_NAMELEN];

	ddt_object_name(ddt, type, class, name);

	VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
	    sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
	    &ddt->ddt_histogram[type][class], tx) == 0);

	/*
	 * Cache DDT statistics; this is the only time they'll change.
	 */
	VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);

	ddo->ddo_count = ddt_object_count(ddt, type, class);
	ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
	ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
}

/*
 * Look up dde's key in one (type, class) table; fills in dde on success.
 * Returns ENOENT if the table doesn't exist or the key isn't present.
 */
static int
ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
    ddt_entry_t *dde)
{
	if (!ddt_object_exists(ddt, type, class))
		return (ENOENT);

	return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
	    ddt->ddt_object[type][class], dde));
}

/* Issue a prefetch for dde's key in one (type, class) table, if it exists. */
static void
ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
    ddt_entry_t *dde)
{
	if (!ddt_object_exists(ddt, type, class))
		return;

	ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os,
	    ddt->ddt_object[type][class], dde);
}

/* Write (insert or overwrite) dde into an existing (type, class) table. */
int
ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
    ddt_entry_t *dde, dmu_tx_t *tx)
{
	ASSERT(ddt_object_exists(ddt, type, class));

	return (ddt_ops[type]->ddt_op_update(ddt->ddt_os,
	    ddt->ddt_object[type][class], dde, tx));
}

/* Remove dde from an existing (type, class) table. */
static int
ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
    ddt_entry_t *dde, dmu_tx_t *tx)
{
	ASSERT(ddt_object_exists(ddt, type, class));

	return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os,
	    ddt->ddt_object[type][class], dde, tx));
}

/*
 * Fetch the next entry from a (type, class) table into dde, advancing the
 * opaque cursor *walk.  Returns the backend's errno (ENOENT at end of table).
 */
int
ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
    uint64_t *walk, ddt_entry_t *dde)
{
	ASSERT(ddt_object_exists(ddt, type, class));

	return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
	    ddt->ddt_object[type][class], dde, walk));
}

/* Return the number of entries in an existing (type, class) table. */
uint64_t
ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
{
	ASSERT(ddt_object_exists(ddt, type, class));

	return (ddt_ops[type]->ddt_op_count(ddt->ddt_os,
	    ddt->ddt_object[type][class]));
}

/*
 * Get DMU object info for a (type, class) table; ENOENT if it doesn't exist.
 */
int
ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
    dmu_object_info_t *doi)
{
	if (!ddt_object_exists(ddt, type, class))
		return (ENOENT);

	return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class],
	    doi));
}

/* A table exists iff its cached object number is nonzero. */
boolean_t
ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
{
	return (!!ddt->ddt_object[type][class]);
}

/*
 * Format the canonical ZAP name for a (checksum, type, class) table into
 * name[], which the callers size as DDT_NAMELEN.
 * NOTE(review): sprintf assumes the DMU_POOL_DDT expansion always fits in
 * DDT_NAMELEN — presumably guaranteed by the template; confirm if changed.
 */
void
ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
    char *name)
{
	(void) sprintf(name, DMU_POOL_DDT,
	    zio_checksum_table[ddt->ddt_checksum].ci_name,
	    ddt_ops[type]->ddt_op_name, ddt_class_name[class]);
}

/* Copy a phys variant's DVAs into bp and set its birth txgs. */
void
ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg)
{
	ASSERT(txg != 0);

	for (int d = 0; d < SPA_DVAS_PER_BP; d++)
		bp->blk_dva[d] = ddp->ddp_dva[d];
	BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth);
}

/*
 * Construct a complete block pointer from a DDT key (and, if ddp is
 * non-NULL, that phys variant's DVAs and birth time).
 */
void
ddt_bp_create(enum zio_checksum checksum,
    const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp)
{
	BP_ZERO(bp);

	if (ddp != NULL)
		ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth);

	bp->blk_cksum = ddk->ddk_cksum;
	bp->blk_fill = 1;

	BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk));
	BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk));
	BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk));
	BP_SET_CHECKSUM(bp, checksum);
	BP_SET_TYPE(bp, DMU_OT_DEDUP);
	BP_SET_LEVEL(bp, 0);
	BP_SET_DEDUP(bp, 0);
	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
}

/* Derive a DDT key (checksum + lsize/psize/compress) from a block pointer. */
void
ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp)
{
	ddk->ddk_cksum = bp->blk_cksum;
	ddk->ddk_prop = 0;

	DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp));
	DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp));
	DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp));
}

/* Populate an (empty) phys variant from bp's DVAs and physical birth txg. */
void
ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp)
{
	ASSERT(ddp->ddp_phys_birth == 0);

	for (int d = 0; d < SPA_DVAS_PER_BP; d++)
		ddp->ddp_dva[d] = bp->blk_dva[d];
	ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp);
}

/* Reset a phys variant to the empty (never-written) state. */
void
ddt_phys_clear(ddt_phys_t *ddp)
{
	bzero(ddp, sizeof (*ddp));
}

/* Take one reference on a phys variant. */
void
ddt_phys_addref(ddt_phys_t *ddp)
{
	ddp->ddp_refcnt++;
}

/* Drop one reference on a phys variant; must not already be zero. */
void
ddt_phys_decref(ddt_phys_t *ddp)
{
	ASSERT((int64_t)ddp->ddp_refcnt > 0);
	ddp->ddp_refcnt--;
}

/*
 * Free the block described by (ddk, ddp) in the given txg and clear the
 * phys variant.
 */
void
ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg)
{
	blkptr_t blk;

	ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
	ddt_phys_clear(ddp);
	zio_free(ddt->ddt_spa, txg, &blk);
}

/*
 * Find the phys variant of dde that matches bp (same first DVA and same
 * physical birth txg), or NULL if none matches.
 */
ddt_phys_t *
ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp)
{
	ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys;

	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
		if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) &&
		    BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth)
			return (ddp);
	}
	return (NULL);
}

/*
 * Sum the refcounts of the SINGLE..TRIPLE phys variants (the DITTO variant
 * is deliberately excluded).
 */
uint64_t
ddt_phys_total_refcnt(const ddt_entry_t *dde)
{
	uint64_t refcnt = 0;

	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++)
		refcnt += dde->dde_phys[p].ddp_refcnt;

	return (refcnt);
}

/*
 * Compute the ddt_stat_t contribution of one entry: block/size totals for
 * each allocated phys variant, plus referenced totals scaled by refcount.
 */
static void
ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
{
	spa_t *spa = ddt->ddt_spa;
	ddt_phys_t *ddp = dde->dde_phys;
	ddt_key_t *ddk = &dde->dde_key;
	uint64_t lsize = DDK_GET_LSIZE(ddk);
	uint64_t psize = DDK_GET_PSIZE(ddk);

	bzero(dds, sizeof (*dds));

	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
		uint64_t dsize = 0;
		uint64_t refcnt = ddp->ddp_refcnt;

		/* Skip variants that were never allocated. */
		if (ddp->ddp_phys_birth == 0)
			continue;

		for (int d = 0; d < SPA_DVAS_PER_BP; d++)
			dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]);

		dds->dds_blocks += 1;
		dds->dds_lsize += lsize;
		dds->dds_psize += psize;
		dds->dds_dsize += dsize;

		dds->dds_ref_blocks += refcnt;
		dds->dds_ref_lsize += lsize * refcnt;
		dds->dds_ref_psize += psize * refcnt;
		dds->dds_ref_dsize += dsize * refcnt;
	}
}

/*
 * Fieldwise add (neg == 0) or subtract (neg == -1ULL) src into dst,
 * treating both as flat arrays of uint64_t.  (x ^ -1) - (-1) == -x,
 * so the XOR trick below negates each field when subtracting.
 */
void
ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg)
{
	const uint64_t *s = (const uint64_t *)src;
	uint64_t *d = (uint64_t *)dst;
	uint64_t *d_end = (uint64_t *)(dst + 1);

	ASSERT(neg == 0 || neg == -1ULL);	/* add or subtract */

	while (d < d_end)
		*d++ += (*s++ ^ neg) - neg;
}

/*
 * Fold dde's stats into its (type, class) histogram, bucketed by
 * log2(reference count).  neg selects add vs. subtract (see ddt_stat_add).
 */
static void
ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)
{
	ddt_stat_t dds;
	ddt_histogram_t *ddh;
	int bucket;

	ddt_stat_generate(ddt, dde, &dds);

	bucket = highbit(dds.dds_ref_blocks) - 1;
	ASSERT(bucket >= 0);

	ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];

	ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg);
}

/* Accumulate all 64 buckets of src into dst. */
void
ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src)
{
	for (int h = 0; h < 64; h++)
		ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0);
}

/* Collapse a histogram's 64 buckets into a single ddt_stat_t total. */
void
ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh)
{
	bzero(dds, sizeof (*dds));

	for (int h = 0; h < 64; h++)
		ddt_stat_add(dds, &ddh->ddh_stat[h], 0);
}

/* B_TRUE iff every word of the histogram is zero. */
boolean_t
ddt_histogram_empty(const ddt_histogram_t *ddh)
{
	const uint64_t *s = (const uint64_t *)ddh;
	const uint64_t *s_end = (const uint64_t *)(ddh + 1);

	while (s < s_end)
		if (*s++ != 0)
			return (B_FALSE);

	return (B_TRUE);
}

/*
 * Aggregate the cached per-table object statistics across every checksum,
 * type, and class, then convert the space totals into per-entry averages.
 */
void
ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
{
	/* Sum the statistics we cached in ddt_object_sync(). */
	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
		ddt_t *ddt = spa->spa_ddt[c];
		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
			for (enum ddt_class class = 0; class < DDT_CLASSES;
			    class++) {
				ddt_object_t *ddo =
				    &ddt->ddt_object_stats[type][class];
				ddo_total->ddo_count += ddo->ddo_count;
				ddo_total->ddo_dspace += ddo->ddo_dspace;
				ddo_total->ddo_mspace += ddo->ddo_mspace;
			}
		}
	}

	/* ... and compute the averages. */
	if (ddo_total->ddo_count != 0) {
		ddo_total->ddo_dspace /= ddo_total->ddo_count;
		ddo_total->ddo_mspace /= ddo_total->ddo_count;
	}
}

/* Sum the cached histograms of every table in the pool into ddh. */
void
ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
{
	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
		ddt_t *ddt = spa->spa_ddt[c];
		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
			for (enum ddt_class class = 0; class < DDT_CLASSES;
			    class++) {
				ddt_histogram_add(ddh,
				    &ddt->ddt_histogram_cache[type][class]);
			}
		}
	}
}

/* Collapse the pool-wide dedup histogram into a single ddt_stat_t total. */
void
ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total)
{
	ddt_histogram_t *ddh_total;

	/* Heap-allocate: ddt_histogram_t is large (64 buckets). */
	ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
	ddt_get_dedup_histogram(spa, ddh_total);
	ddt_histogram_stat(dds_total, ddh_total);
	kmem_free(ddh_total, sizeof (ddt_histogram_t));
}

/* Space saved by dedup: referenced disk size minus actual disk size. */
uint64_t
ddt_get_dedup_dspace(spa_t *spa)
{
	ddt_stat_t dds_total = { 0 };

	ddt_get_dedup_stats(spa, &dds_total);
	return (dds_total.dds_ref_dsize - dds_total.dds_dsize);
}

/* Pool dedup ratio scaled by 100 (100 == no dedup benefit). */
uint64_t
ddt_get_pool_dedup_ratio(spa_t *spa)
{
	ddt_stat_t dds_total = { 0 };

	ddt_get_dedup_stats(spa, &dds_total);
	if (dds_total.dds_dsize == 0)
		return (100);

	return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize);
}

/*
 * Decide how many additional copies of this block we should write, based
 * on the total (committed + pending + caller's) reference count relative
 * to the dedupditto threshold.  Returns the number of copies still needed
 * beyond those already present.
 */
int
ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref)
{
	spa_t *spa = ddt->ddt_spa;
	uint64_t total_refcnt = 0;
	uint64_t ditto = spa->spa_dedup_ditto;
	int total_copies = 0;
	int desired_copies = 0;

	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
		ddt_phys_t *ddp = &dde->dde_phys[p];
		zio_t *zio = dde->dde_lead_zio[p];
		uint64_t refcnt = ddp->ddp_refcnt;	/* committed refs */
		if (zio != NULL)
			refcnt += zio->io_parent_count;	/* pending refs */
		if (ddp == ddp_willref)
			refcnt++;			/* caller's ref */
		if (refcnt != 0) {
			total_refcnt += refcnt;
			/* p doubles as the copy count of that variant */
			total_copies += p;
		}
	}

	/* ditto == 0 means "disabled"; clamp so ditto * ditto can't wrap. */
	if (ditto == 0 || ditto > UINT32_MAX)
		ditto = UINT32_MAX;

	if (total_refcnt >= 1)
		desired_copies++;
	if (total_refcnt >= ditto)
		desired_copies++;
	if (total_refcnt >= ditto * ditto)
		desired_copies++;

	return (MAX(desired_copies, total_copies) - total_copies);
}

/*
 * Number of valid DVAs in the entry's DITTO phys variant, with one DVA
 * discounted when the block is a gang block (its first DVA is the gang
 * header, hence the 0 - DVA_GET_GANG() starting value).
 */
int
ddt_ditto_copies_present(ddt_entry_t *dde)
{
	ddt_phys_t *ddp = &dde->dde_phys[DDT_PHYS_DITTO];
	dva_t *dva = ddp->ddp_dva;
	int copies = 0 - DVA_GET_GANG(dva);

	for (int d = 0; d < SPA_DVAS_PER_BP; d++, dva++)
		if (DVA_IS_VALID(dva))
			copies++;

	ASSERT(copies >= 0 && copies < SPA_DVAS_PER_BP);

	return (copies);
}

/*
 * Compress s_len bytes of src into dst using ZLE, prefixing one version
 * byte that encodes the host byte order and the compression function used.
 * Falls back to no compression when ZLE achieves no reduction.  Returns
 * the total output length including the version byte.
 */
size_t
ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len)
{
	uchar_t *version = dst++;
	int cpfunc = ZIO_COMPRESS_ZLE;
	zio_compress_info_t *ci = &zio_compress_table[cpfunc];
	size_t c_len;

	ASSERT(d_len >= s_len + 1);	/* no compression plus version byte */

	c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level);

	if (c_len == s_len) {
		cpfunc = ZIO_COMPRESS_OFF;
		bcopy(src, dst, s_len);
	}

	*version = (ZFS_HOST_BYTEORDER & DDT_COMPRESS_BYTEORDER_MASK) | cpfunc;

	return (c_len + 1);
}

/*
 * Inverse of ddt_compress(): read the version byte, decompress (or copy)
 * the payload, and byteswap the result if it was written on a host of the
 * opposite byte order.
 */
void
ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len)
{
	uchar_t version = *src++;
	int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK;
	zio_compress_info_t *ci = &zio_compress_table[cpfunc];

	if (ci->ci_decompress != NULL)
		(void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level);
	else
		bcopy(src, dst, d_len);

	if ((version ^ ZFS_HOST_BYTEORDER) & DDT_COMPRESS_BYTEORDER_MASK)
		byteswap_uint64_array(dst, d_len);
}

/* Return the pool's DDT for an explicit checksum function. */
ddt_t *
ddt_select_by_checksum(spa_t *spa, enum zio_checksum c)
{
	return (spa->spa_ddt[c]);
}

/* Return the pool's DDT for the checksum function recorded in bp. */
ddt_t *
ddt_select(spa_t *spa, const blkptr_t *bp)
{
	return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
}

/* Acquire the DDT lock. */
void
ddt_enter(ddt_t *ddt)
{
	mutex_enter(&ddt->ddt_lock);
}

/* Release the DDT lock. */
void
ddt_exit(ddt_t *ddt)
{
	mutex_exit(&ddt->ddt_lock);
}

/* Allocate a zeroed in-core DDT entry for the given key. */
static ddt_entry_t *
ddt_alloc(const ddt_key_t *ddk)
{
	ddt_entry_t *dde;

	dde = kmem_zalloc(sizeof (ddt_entry_t), KM_SLEEP);
	cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);

	dde->dde_key = *ddk;

	return (dde);
}

/*
 * Free an in-core DDT entry.  The entry must not be mid-load and must have
 * no lead zios; any attached repair buffer is freed with it.
 */
static void
ddt_free(ddt_entry_t *dde)
{
	ASSERT(!dde->dde_loading);

	for (int p = 0; p < DDT_PHYS_TYPES; p++)
		ASSERT(dde->dde_lead_zio[p] == NULL);

	if (dde->dde_repair_data != NULL)
		zio_buf_free(dde->dde_repair_data,
		    DDK_GET_PSIZE(&dde->dde_key));

	cv_destroy(&dde->dde_cv);
	kmem_free(dde, sizeof (*dde));
}

/* Remove dde from the in-core tree and free it.  Caller holds ddt_lock. */
void
ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
{
	ASSERT(MUTEX_HELD(&ddt->ddt_lock));

	avl_remove(&ddt->ddt_tree, dde);
	ddt_free(dde);
}

/*
 * Find (or, if add is set, create) the in-core entry for bp's key.  If the
 * entry is not yet loaded, drop the DDT lock and search every on-disk
 * (type, class) table; concurrent lookups for the same key wait on the
 * entry's cv while one thread performs the load.  Returns the entry with
 * dde_loaded set, or NULL when not found and add is B_FALSE.
 * Caller holds ddt_lock.
 */
ddt_entry_t *
ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
{
	ddt_entry_t *dde, dde_search;
	enum ddt_type type;
	enum ddt_class class;
	avl_index_t where;
	int error;

	ASSERT(MUTEX_HELD(&ddt->ddt_lock));

	ddt_key_fill(&dde_search.dde_key, bp);

	dde = avl_find(&ddt->ddt_tree, &dde_search, &where);
	if (dde == NULL) {
		if (!add)
			return (NULL);
		dde = ddt_alloc(&dde_search.dde_key);
		avl_insert(&ddt->ddt_tree, dde, where);
	}

	/* Another thread is loading this entry from disk; wait for it. */
	while (dde->dde_loading)
		cv_wait(&dde->dde_cv, &ddt->ddt_lock);

	if (dde->dde_loaded)
		return (dde);

	dde->dde_loading = B_TRUE;

	/* Drop the lock for the on-disk search; dde_loading guards us. */
	ddt_exit(ddt);

	error = ENOENT;

	for (type = 0; type < DDT_TYPES; type++) {
		for (class = 0; class < DDT_CLASSES; class++) {
			error = ddt_object_lookup(ddt, type, class, dde);
			if (error != ENOENT)
				break;
		}
		if (error != ENOENT)
			break;
	}

	ASSERT(error == 0 || error == ENOENT);

	ddt_enter(ddt);

	ASSERT(dde->dde_loaded == B_FALSE);
	ASSERT(dde->dde_loading == B_TRUE);

	dde->dde_type = type;	/* will be DDT_TYPES if no entry found */
	dde->dde_class = class;	/* will be DDT_CLASSES if no entry found */
	dde->dde_loaded = B_TRUE;
	dde->dde_loading = B_FALSE;

	/* Found on disk: subtract its stats; they're re-added at sync. */
	if (error == 0)
		ddt_stat_update(ddt, dde, -1ULL);

	cv_broadcast(&dde->dde_cv);

	return (dde);
}

/*
 * Prefetch bp's DDT entry from every (type, class) table, if dedup
 * prefetching is enabled and bp is a dedup block.
 */
void
ddt_prefetch(spa_t *spa, const blkptr_t *bp)
{
	ddt_t *ddt;
	ddt_entry_t dde;

	if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp))
		return;

	/*
	 * We only remove the DDT once all tables are empty and only
	 * prefetch dedup blocks when there are entries in the DDT.
	 * Thus no locking is required as the DDT can't disappear on us.
	 */
	ddt = ddt_select(spa, bp);
	ddt_key_fill(&dde.dde_key, bp);

	for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
		for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
			ddt_object_prefetch(ddt, type, class, &dde);
		}
	}
}

/*
 * AVL comparator for DDT entries: lexicographic compare of the key words.
 */
int
ddt_entry_compare(const void *x1, const void *x2)
{
	const ddt_entry_t *dde1 = x1;
	const ddt_entry_t *dde2 = x2;
	const uint64_t *u1 = (const uint64_t *)&dde1->dde_key;
	const uint64_t *u2 = (const uint64_t *)&dde2->dde_key;

	for (int i = 0; i < DDT_KEY_WORDS; i++) {
		if (u1[i] < u2[i])
			return (-1);
		if (u1[i] > u2[i])
			return (1);
	}

	return (0);
}

/* Allocate and initialize the in-core DDT for one checksum function. */
static ddt_t *
ddt_table_alloc(spa_t *spa, enum zio_checksum c)
{
	ddt_t *ddt;

	ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP);

	mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&ddt->ddt_tree, ddt_entry_compare,
	    sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
	avl_create(&ddt->ddt_repair_tree, ddt_entry_compare,
	    sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
	ddt->ddt_checksum = c;
	ddt->ddt_spa = spa;
	ddt->ddt_os = spa->spa_meta_objset;

	return (ddt);
}

/* Destroy an (empty) in-core DDT and free it. */
static void
ddt_table_free(ddt_t *ddt)
{
	ASSERT(avl_numnodes(&ddt->ddt_tree) == 0);
	ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0);
	avl_destroy(&ddt->ddt_tree);
	avl_destroy(&ddt->ddt_repair_tree);
	mutex_destroy(&ddt->ddt_lock);
	kmem_free(ddt, sizeof (*ddt));
}

/* Create in-core DDTs for every checksum function in the pool. */
void
ddt_create(spa_t *spa)
{
	spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM;

	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)
		spa->spa_ddt[c] = ddt_table_alloc(spa, c);
}

/*
 * Create the in-core DDTs and load every existing on-disk table along with
 * its histogram.  A pool with no DDT stats object (never dedup'ed) is not
 * an error.  Returns 0 or an errno from the underlying lookups.
 */
int
ddt_load(spa_t *spa)
{
	int error;

	ddt_create(spa);

	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
	    &spa->spa_ddt_stat_object);

	if (error)
		return (error == ENOENT ? 0 : error);

	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
		ddt_t *ddt = spa->spa_ddt[c];
		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
			for (enum ddt_class class = 0; class < DDT_CLASSES;
			    class++) {
				error = ddt_object_load(ddt, type, class);
				if (error != 0 && error != ENOENT)
					return (error);
			}
		}

		/*
		 * Seed the cached histograms.
		 */
		bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
		    sizeof (ddt->ddt_histogram));
	}

	return (0);
}

/* Free every in-core DDT in the pool. */
void
ddt_unload(spa_t *spa)
{
	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
		if (spa->spa_ddt[c]) {
			ddt_table_free(spa->spa_ddt[c]);
			spa->spa_ddt[c] = NULL;
		}
	}
}

/*
 * Return B_TRUE if bp's DDT entry could live in a class <= max_class.
 * UNIQUE (the lowest-priority class) trivially contains everything;
 * otherwise search the on-disk tables up through max_class.
 */
boolean_t
ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp)
{
	ddt_t *ddt;
	ddt_entry_t dde;

	if (!BP_GET_DEDUP(bp))
		return (B_FALSE);

	if (max_class == DDT_CLASS_UNIQUE)
		return (B_TRUE);

	ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)];

	ddt_key_fill(&dde.dde_key, bp);

	for (enum ddt_type type = 0; type < DDT_TYPES; type++)
		for (enum ddt_class class = 0; class <= max_class; class++)
			if (ddt_object_lookup(ddt, type, class, &dde) == 0)
				return (B_TRUE);

	return (B_FALSE);
}

/*
 * Begin repair of the block at bp: allocate an entry and fill it from the
 * first non-UNIQUE table that knows the key.  If no such entry exists the
 * phys array is zeroed, so the later repair pass will find nothing to match
 * and effectively skip it.
 */
ddt_entry_t *
ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
{
	ddt_key_t ddk;
	ddt_entry_t *dde;

	ddt_key_fill(&ddk, bp);

	dde = ddt_alloc(&ddk);

	for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
		for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
			/*
			 * We can only do repair if there are multiple copies
			 * of the block.  For anything in the UNIQUE class,
			 * there's definitely only one copy, so don't even try.
			 */
			if (class != DDT_CLASS_UNIQUE &&
			    ddt_object_lookup(ddt, type, class, dde) == 0)
				return (dde);
		}
	}

	bzero(dde->dde_phys, sizeof (dde->dde_phys));

	return (dde);
}

/*
 * Finish repair of an entry: if it carries repair data and the pool is
 * writeable, queue it (once) on the repair tree for the next sync pass;
 * otherwise free it.
 */
void
ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
{
	avl_index_t where;

	ddt_enter(ddt);

	if (dde->dde_repair_data != NULL && spa_writeable(ddt->ddt_spa) &&
	    avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
		avl_insert(&ddt->ddt_repair_tree, dde, where);
	else
		ddt_free(dde);

	ddt_exit(ddt);
}

/* Zio done callback: free the repair entry once all rewrites complete. */
static void
ddt_repair_entry_done(zio_t *zio)
{
	ddt_entry_t *rdde = zio->io_private;

	ddt_free(rdde);
}

/*
 * Rewrite the repair data over every phys variant of dde that matches the
 * corresponding variant of the repair entry rdde (same birth and DVAs).
 * All rewrites hang off a null zio whose done callback frees rdde.
 */
static void
ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
{
	ddt_phys_t *ddp = dde->dde_phys;
	ddt_phys_t *rddp = rdde->dde_phys;
	ddt_key_t *ddk = &dde->dde_key;
	ddt_key_t *rddk = &rdde->dde_key;
	zio_t *zio;
	blkptr_t blk;

	zio = zio_null(rio, rio->io_spa, NULL,
	    ddt_repair_entry_done, rdde, rio->io_flags);

	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) {
		if (ddp->ddp_phys_birth == 0 ||
		    ddp->ddp_phys_birth != rddp->ddp_phys_birth ||
		    bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva)))
			continue;
		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
		zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
		    rdde->dde_repair_data, DDK_GET_PSIZE(rddk), NULL, NULL,
		    ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL));
	}

	zio_nowait(zio);
}

/*
 * Drain the repair tree, issuing a repair for each queued entry.  Only runs
 * on the first sync pass.  The DDT lock is dropped around each repair since
 * the entry has already been removed from the tree.
 */
static void
ddt_repair_table(ddt_t *ddt, zio_t *rio)
{
	spa_t *spa = ddt->ddt_spa;
	ddt_entry_t *dde, *rdde_next, *rdde;
	avl_tree_t *t = &ddt->ddt_repair_tree;
	blkptr_t blk;

	if (spa_sync_pass(spa) > 1)
		return;

	ddt_enter(ddt);
	for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) {
		rdde_next = AVL_NEXT(t, rdde);
		avl_remove(&ddt->ddt_repair_tree, rdde);
		ddt_exit(ddt);
		ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk);
		dde = ddt_repair_start(ddt, &blk);
		ddt_repair_entry(ddt, dde, rdde, rio);
		ddt_repair_done(ddt, dde);
		ddt_enter(ddt);
	}
	ddt_exit(ddt);
}

/*
 * Sync one dirty entry: free dead phys variants (and an unneeded DITTO
 * copy), recompute the entry's class from its total refcount, move it
 * between on-disk tables if its (type, class) changed, update statistics,
 * and re-scan the bp now if its class dropped (so traversal can't miss it).
 */
static void
ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
{
	dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
	ddt_phys_t *ddp = dde->dde_phys;
	ddt_key_t *ddk = &dde->dde_key;
	enum ddt_type otype = dde->dde_type;
	enum ddt_type ntype = DDT_TYPE_CURRENT;
	enum ddt_class oclass = dde->dde_class;
	enum ddt_class nclass;
	uint64_t total_refcnt = 0;

	ASSERT(dde->dde_loaded);
	ASSERT(!dde->dde_loading);

	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
		ASSERT(dde->dde_lead_zio[p] == NULL);
		ASSERT((int64_t)ddp->ddp_refcnt >= 0);
		if (ddp->ddp_phys_birth == 0) {
			ASSERT(ddp->ddp_refcnt == 0);
			continue;
		}
		if (p == DDT_PHYS_DITTO) {
			/* Drop the ditto copy if it's no longer warranted. */
			if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0)
				ddt_phys_free(ddt, ddk, ddp, txg);
			continue;
		}
		if (ddp->ddp_refcnt == 0)
			ddt_phys_free(ddt, ddk, ddp, txg);
		total_refcnt += ddp->ddp_refcnt;
	}

	if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0)
		nclass = DDT_CLASS_DITTO;
	else if (total_refcnt > 1)
		nclass = DDT_CLASS_DUPLICATE;
	else
		nclass = DDT_CLASS_UNIQUE;

	/* otype == DDT_TYPES means the entry was never on disk. */
	if (otype != DDT_TYPES &&
	    (otype != ntype || oclass != nclass || total_refcnt == 0)) {
		VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0);
		ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT);
	}

	if (total_refcnt != 0) {
		dde->dde_type = ntype;
		dde->dde_class = nclass;
		ddt_stat_update(ddt, dde, 0);
		if (!ddt_object_exists(ddt, ntype, nclass))
			ddt_object_create(ddt, ntype, nclass, tx);
		VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0);

		/*
		 * If the class changes, the order that we scan this bp
		 * changes.  If it decreases, we could miss it, so
		 * scan it right now.  (This covers both class changing
		 * while we are doing ddt_walk(), and when we are
		 * traversing.)
		 */
		if (nclass < oclass) {
			dsl_scan_ddt_entry(dp->dp_scan,
			    ddt->ddt_checksum, dde, tx);
		}
	}
}

/*
 * Sync one DDT: create the stats object on first use, write out every
 * dirty in-core entry, sync each table's histogram, destroy this type's
 * tables if they've all gone empty, and refresh the cached histograms.
 */
static void
ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
{
	spa_t *spa = ddt->ddt_spa;
	ddt_entry_t *dde;
	void *cookie = NULL;

	if (avl_numnodes(&ddt->ddt_tree) == 0)
		return;

	ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP);

	if (spa->spa_ddt_stat_object == 0) {
		spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os,
		    DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_DDT_STATS, tx);
	}

	while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
		ddt_sync_entry(ddt, dde, tx, txg);
		ddt_free(dde);
	}

	for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
		uint64_t count = 0;
		for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
			if (ddt_object_exists(ddt, type, class)) {
				ddt_object_sync(ddt, type, class, tx);
				count += ddt_object_count(ddt, type, class);
			}
		}
		/* Destroy this type's tables only when all are empty. */
		for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
			if (count == 0 && ddt_object_exists(ddt, type, class))
				ddt_object_destroy(ddt, type, class, tx);
		}
	}

	bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
	    sizeof (ddt->ddt_histogram));
}

/*
 * Per-txg DDT sync entry point: sync every checksum's table and run any
 * queued repairs under a single root zio and assigned transaction.
 */
void
ddt_sync(spa_t *spa, uint64_t txg)
{
	dmu_tx_t *tx;
	zio_t *rio = zio_root(spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);

	ASSERT(spa_syncing_txg(spa) == txg);

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);

	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
		ddt_t *ddt = spa->spa_ddt[c];
		if (ddt == NULL)
			continue;
		ddt_sync_table(ddt, tx, txg);
		ddt_repair_table(ddt, rio);
	}

	(void) zio_wait(rio);

	dmu_tx_commit(tx);
}

/*
 * Resumable walk over every entry in every DDT, ordered by class, then
 * type, then checksum, then table cursor — all tracked in *ddb so the
 * caller can continue where it left off.  Returns 0 with an entry in *dde,
 * ENOENT when the walk is complete, or another errno on failure.
 */
int
ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)
{
	do {
		do {
			do {
				ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum];
				int error = ENOENT;
				if (ddt_object_exists(ddt, ddb->ddb_type,
				    ddb->ddb_class)) {
					error = ddt_object_walk(ddt,
					    ddb->ddb_type, ddb->ddb_class,
					    &ddb->ddb_cursor, dde);
				}
				dde->dde_type = ddb->ddb_type;
				dde->dde_class = ddb->ddb_class;
				if (error == 0)
					return (0);
				if (error != ENOENT)
					return (error);
				ddb->ddb_cursor = 0;
			} while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS);
			ddb->ddb_checksum = 0;
		} while (++ddb->ddb_type < DDT_TYPES);
		ddb->ddb_type = 0;
	} while (++ddb->ddb_class < DDT_CLASSES);

	return (ENOENT);
}