1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22/* 23 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. 24 * Copyright (c) 2013 by Delphix. All rights reserved. 25 */ 26 27#include <sys/zfs_context.h> 28#include <sys/spa.h> 29#include <sys/spa_impl.h> 30#include <sys/zio.h> 31#include <sys/ddt.h> 32#include <sys/zap.h> 33#include <sys/dmu_tx.h> 34#include <sys/arc.h> 35#include <sys/dsl_pool.h> 36#include <sys/zio_checksum.h> 37#include <sys/zio_compress.h> 38#include <sys/dsl_scan.h> 39 40/* 41 * Enable/disable prefetching of dedup-ed blocks which are going to be freed. 42 */ 43int zfs_dedup_prefetch = 1; 44 45SYSCTL_DECL(_vfs_zfs); 46SYSCTL_NODE(_vfs_zfs, OID_AUTO, dedup, CTLFLAG_RW, 0, "ZFS DEDUP"); 47TUNABLE_INT("vfs.zfs.dedup.prefetch", &zfs_dedup_prefetch); 48SYSCTL_INT(_vfs_zfs_dedup, OID_AUTO, prefetch, CTLFLAG_RW, &zfs_dedup_prefetch, 49 0, "Enable/disable prefetching of dedup-ed blocks which are going to be freed"); 50 51static const ddt_ops_t *ddt_ops[DDT_TYPES] = { 52 &ddt_zap_ops, 53}; 54 55static const char *ddt_class_name[DDT_CLASSES] = { 56 "ditto", 57 "duplicate", 58 "unique", 59}; 60 61static void 62ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 63 dmu_tx_t *tx) 64{ 65 spa_t *spa = ddt->ddt_spa; 66 objset_t *os = ddt->ddt_os; 67 uint64_t *objectp = &ddt->ddt_object[type][class]; 68 boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_dedup; 69 char name[DDT_NAMELEN]; 70 71 ddt_object_name(ddt, type, class, name); 72 73 ASSERT(*objectp == 0); 74 VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0); 75 ASSERT(*objectp != 0); 76 77 VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name, 78 sizeof (uint64_t), 1, objectp, tx) == 0); 79 80 VERIFY(zap_add(os, spa->spa_ddt_stat_object, name, 81 sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), 82 &ddt->ddt_histogram[type][class], tx) == 0); 83} 84 85static void 86ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 87 dmu_tx_t *tx) 88{ 89 spa_t *spa = ddt->ddt_spa; 90 objset_t *os = ddt->ddt_os; 91 uint64_t *objectp = &ddt->ddt_object[type][class]; 92 uint64_t count; 93 char name[DDT_NAMELEN]; 94 95 ddt_object_name(ddt, type, class, name); 96 97 ASSERT(*objectp != 0); 98 VERIFY(ddt_object_count(ddt, type, class, &count) == 0 && count == 0); 99 ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class])); 100 VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0); 101 VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0); 102 VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0); 103 bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t)); 104 105 *objectp = 0; 106} 107 108static int 109ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class) 110{ 111 ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; 112 dmu_object_info_t doi; 113 uint64_t count; 114 char name[DDT_NAMELEN]; 115 int error; 116 117 ddt_object_name(ddt, type, class, name); 118 119 error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, 120 sizeof (uint64_t), 1, &ddt->ddt_object[type][class]); 121 122 if (error) 123 return (error); 124 125 error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, 126 sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), 127 &ddt->ddt_histogram[type][class]); 128 129 /* 130 * Seed the cached statistics. 131 */ 132 VERIFY(ddt_object_info(ddt, type, class, &doi) == 0); 133 134 error = ddt_object_count(ddt, type, class, &count); 135 if (error) 136 return error; 137 138 ddo->ddo_count = count; 139 ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; 140 ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; 141 142 ASSERT(error == 0); 143 return (error); 144} 145 146static void 147ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 148 dmu_tx_t *tx) 149{ 150 ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; 151 dmu_object_info_t doi; 152 uint64_t count; 153 char name[DDT_NAMELEN]; 154 155 ddt_object_name(ddt, type, class, name); 156 157 VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, 158 sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), 159 &ddt->ddt_histogram[type][class], tx) == 0); 160 161 /* 162 * Cache DDT statistics; this is the only time they'll change. 163 */ 164 VERIFY(ddt_object_info(ddt, type, class, &doi) == 0); 165 VERIFY(ddt_object_count(ddt, type, class, &count) == 0); 166 167 ddo->ddo_count = count; 168 ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; 169 ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; 170} 171 172static int 173ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 174 ddt_entry_t *dde) 175{ 176 if (!ddt_object_exists(ddt, type, class)) 177 return (SET_ERROR(ENOENT)); 178 179 return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os, 180 ddt->ddt_object[type][class], dde)); 181} 182 183static void 184ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 185 ddt_entry_t *dde) 186{ 187 if (!ddt_object_exists(ddt, type, class)) 188 return; 189 190 ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os, 191 ddt->ddt_object[type][class], dde); 192} 193 194int 195ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 196 ddt_entry_t *dde, dmu_tx_t *tx) 197{ 198 ASSERT(ddt_object_exists(ddt, type, class)); 199 200 return (ddt_ops[type]->ddt_op_update(ddt->ddt_os, 201 ddt->ddt_object[type][class], dde, tx)); 202} 203 204static int 205ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 206 ddt_entry_t *dde, dmu_tx_t *tx) 207{ 208 ASSERT(ddt_object_exists(ddt, type, class)); 209 210 return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os, 211 ddt->ddt_object[type][class], dde, tx)); 212} 213 214int 215ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 216 uint64_t *walk, ddt_entry_t *dde) 217{ 218 ASSERT(ddt_object_exists(ddt, type, class)); 219 220 return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os, 221 ddt->ddt_object[type][class], dde, walk)); 222} 223 224int 225ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class, uint64_t *count) 226{ 227 ASSERT(ddt_object_exists(ddt, type, class)); 228 229 return (ddt_ops[type]->ddt_op_count(ddt->ddt_os, 230 ddt->ddt_object[type][class], count)); 231} 232 233int 234ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 235 dmu_object_info_t *doi) 236{ 237 if (!ddt_object_exists(ddt, type, class)) 238 return (SET_ERROR(ENOENT)); 239 240 return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class], 241 doi)); 242} 243 244boolean_t 245ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class) 246{ 247 return (!!ddt->ddt_object[type][class]); 248} 249 250void 251ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class, 252 char *name) 253{ 254 (void) sprintf(name, DMU_POOL_DDT, 255 zio_checksum_table[ddt->ddt_checksum].ci_name, 256 ddt_ops[type]->ddt_op_name, ddt_class_name[class]); 257} 258 259void 260ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg) 261{ 262 ASSERT(txg != 0); 263 264 for (int d = 0; d < SPA_DVAS_PER_BP; d++) 265 bp->blk_dva[d] = ddp->ddp_dva[d]; 266 BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth); 267} 268 269void 270ddt_bp_create(enum zio_checksum checksum, 271 const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp) 272{ 273 BP_ZERO(bp); 274 275 if (ddp != NULL) 276 ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth); 277 278 bp->blk_cksum = ddk->ddk_cksum; 279 bp->blk_fill = 1; 280 281 BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk)); 282 BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk)); 283 BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk)); 284 BP_SET_CHECKSUM(bp, checksum); 285 BP_SET_TYPE(bp, DMU_OT_DEDUP); 286 BP_SET_LEVEL(bp, 0); 287 BP_SET_DEDUP(bp, 0); 288 BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); 289} 290 291void 292ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp) 293{ 294 ddk->ddk_cksum = bp->blk_cksum; 295 ddk->ddk_prop = 0; 296 297 DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp)); 298 DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp)); 299 DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp)); 300} 301 302void 303ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp) 304{ 305 ASSERT(ddp->ddp_phys_birth == 0); 306 307 for (int d = 0; d < SPA_DVAS_PER_BP; d++) 308 ddp->ddp_dva[d] = bp->blk_dva[d]; 309 ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp); 310} 311 312void 313ddt_phys_clear(ddt_phys_t *ddp) 314{ 315 bzero(ddp, sizeof (*ddp)); 316} 317 318void 319ddt_phys_addref(ddt_phys_t *ddp) 320{ 321 ddp->ddp_refcnt++; 322} 323 324void 325ddt_phys_decref(ddt_phys_t *ddp) 326{ 327 ASSERT((int64_t)ddp->ddp_refcnt > 0); 328 ddp->ddp_refcnt--; 329} 330 331void 332ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg) 333{ 334 blkptr_t blk; 335 336 ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); 337 ddt_phys_clear(ddp); 338 zio_free(ddt->ddt_spa, txg, &blk); 339} 340 341ddt_phys_t * 342ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp) 343{ 344 ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys; 345 346 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 347 if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) && 348 BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth) 349 return (ddp); 350 } 351 return (NULL); 352} 353 354uint64_t 355ddt_phys_total_refcnt(const ddt_entry_t *dde) 356{ 357 uint64_t refcnt = 0; 358 359 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) 360 refcnt += dde->dde_phys[p].ddp_refcnt; 361 362 return (refcnt); 363} 364 365static void 366ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds) 367{ 368 spa_t *spa = ddt->ddt_spa; 369 ddt_phys_t *ddp = dde->dde_phys; 370 ddt_key_t *ddk = &dde->dde_key; 371 uint64_t lsize = DDK_GET_LSIZE(ddk); 372 uint64_t psize = DDK_GET_PSIZE(ddk); 373 374 bzero(dds, sizeof (*dds)); 375 376 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 377 uint64_t dsize = 0; 378 uint64_t refcnt = ddp->ddp_refcnt; 379 380 if (ddp->ddp_phys_birth == 0) 381 continue; 382 383 for (int d = 0; d < SPA_DVAS_PER_BP; d++) 384 dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]); 385 386 dds->dds_blocks += 1; 387 dds->dds_lsize += lsize; 388 dds->dds_psize += psize; 389 dds->dds_dsize += dsize; 390 391 dds->dds_ref_blocks += refcnt; 392 dds->dds_ref_lsize += lsize * refcnt; 393 dds->dds_ref_psize += psize * refcnt; 394 dds->dds_ref_dsize += dsize * refcnt; 395 } 396} 397 398void 399ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg) 400{ 401 const uint64_t *s = (const uint64_t *)src; 402 uint64_t *d = (uint64_t *)dst; 403 uint64_t *d_end = (uint64_t *)(dst + 1); 404 405 ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */ 406 407 while (d < d_end) 408 *d++ += (*s++ ^ neg) - neg; 409} 410 411static void 412ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg) 413{ 414 ddt_stat_t dds; 415 ddt_histogram_t *ddh; 416 int bucket; 417 418 ddt_stat_generate(ddt, dde, &dds); 419 420 bucket = highbit(dds.dds_ref_blocks) - 1; 421 ASSERT(bucket >= 0); 422 423 ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class]; 424 425 ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg); 426} 427 428void 429ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src) 430{ 431 for (int h = 0; h < 64; h++) 432 ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0); 433} 434 435void 436ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh) 437{ 438 bzero(dds, sizeof (*dds)); 439 440 for (int h = 0; h < 64; h++) 441 ddt_stat_add(dds, &ddh->ddh_stat[h], 0); 442} 443 444boolean_t 445ddt_histogram_empty(const ddt_histogram_t *ddh) 446{ 447 const uint64_t *s = (const uint64_t *)ddh; 448 const uint64_t *s_end = (const uint64_t *)(ddh + 1); 449 450 while (s < s_end) 451 if (*s++ != 0) 452 return (B_FALSE); 453 454 return (B_TRUE); 455} 456 457void 458ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total) 459{ 460 /* Sum the statistics we cached in ddt_object_sync(). */ 461 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { 462 ddt_t *ddt = spa->spa_ddt[c]; 463 for (enum ddt_type type = 0; type < DDT_TYPES; type++) { 464 for (enum ddt_class class = 0; class < DDT_CLASSES; 465 class++) { 466 ddt_object_t *ddo = 467 &ddt->ddt_object_stats[type][class]; 468 ddo_total->ddo_count += ddo->ddo_count; 469 ddo_total->ddo_dspace += ddo->ddo_dspace; 470 ddo_total->ddo_mspace += ddo->ddo_mspace; 471 } 472 } 473 } 474 475 /* ... and compute the averages. */ 476 if (ddo_total->ddo_count != 0) { 477 ddo_total->ddo_dspace /= ddo_total->ddo_count; 478 ddo_total->ddo_mspace /= ddo_total->ddo_count; 479 } 480} 481 482void 483ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh) 484{ 485 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { 486 ddt_t *ddt = spa->spa_ddt[c]; 487 for (enum ddt_type type = 0; type < DDT_TYPES; type++) { 488 for (enum ddt_class class = 0; class < DDT_CLASSES; 489 class++) { 490 ddt_histogram_add(ddh, 491 &ddt->ddt_histogram_cache[type][class]); 492 } 493 } 494 } 495} 496 497void 498ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total) 499{ 500 ddt_histogram_t *ddh_total; 501 502 ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); 503 ddt_get_dedup_histogram(spa, ddh_total); 504 ddt_histogram_stat(dds_total, ddh_total); 505 kmem_free(ddh_total, sizeof (ddt_histogram_t)); 506} 507 508uint64_t 509ddt_get_dedup_dspace(spa_t *spa) 510{ 511 ddt_stat_t dds_total = { 0 }; 512 513 ddt_get_dedup_stats(spa, &dds_total); 514 return (dds_total.dds_ref_dsize - dds_total.dds_dsize); 515} 516 517uint64_t 518ddt_get_pool_dedup_ratio(spa_t *spa) 519{ 520 ddt_stat_t dds_total = { 0 }; 521 522 ddt_get_dedup_stats(spa, &dds_total); 523 if (dds_total.dds_dsize == 0) 524 return (100); 525 526 return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize); 527} 528 529int 530ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref) 531{ 532 spa_t *spa = ddt->ddt_spa; 533 uint64_t total_refcnt = 0; 534 uint64_t ditto = spa->spa_dedup_ditto; 535 int total_copies = 0; 536 int desired_copies = 0; 537 538 for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { 539 ddt_phys_t *ddp = &dde->dde_phys[p]; 540 zio_t *zio = dde->dde_lead_zio[p]; 541 uint64_t refcnt = ddp->ddp_refcnt; /* committed refs */ 542 if (zio != NULL) 543 refcnt += zio->io_parent_count; /* pending refs */ 544 if (ddp == ddp_willref) 545 refcnt++; /* caller's ref */ 546 if (refcnt != 0) { 547 total_refcnt += refcnt; 548 total_copies += p; 549 } 550 } 551 552 if (ditto == 0 || ditto > UINT32_MAX) 553 ditto = UINT32_MAX; 554 555 if (total_refcnt >= 1) 556 desired_copies++; 557 if (total_refcnt >= ditto) 558 desired_copies++; 559 if (total_refcnt >= ditto * ditto) 560 desired_copies++; 561 562 return (MAX(desired_copies, total_copies) - total_copies); 563} 564 565int 566ddt_ditto_copies_present(ddt_entry_t *dde) 567{ 568 ddt_phys_t *ddp = &dde->dde_phys[DDT_PHYS_DITTO]; 569 dva_t *dva = ddp->ddp_dva; 570 int copies = 0 - DVA_GET_GANG(dva); 571 572 for (int d = 0; d < SPA_DVAS_PER_BP; d++, dva++) 573 if (DVA_IS_VALID(dva)) 574 copies++; 575 576 ASSERT(copies >= 0 && copies < SPA_DVAS_PER_BP); 577 578 return (copies); 579} 580 581size_t 582ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len) 583{ 584 uchar_t *version = dst++; 585 int cpfunc = ZIO_COMPRESS_ZLE; 586 zio_compress_info_t *ci = &zio_compress_table[cpfunc]; 587 size_t c_len; 588 589 ASSERT(d_len >= s_len + 1); /* no compression plus version byte */ 590 591 c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level); 592 593 if (c_len == s_len) { 594 cpfunc = ZIO_COMPRESS_OFF; 595 bcopy(src, dst, s_len); 596 } 597 598 *version = (ZFS_HOST_BYTEORDER & DDT_COMPRESS_BYTEORDER_MASK) | cpfunc; 599 600 return (c_len + 1); 601} 602 603void 604ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len) 605{ 606 uchar_t version = *src++; 607 int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK; 608 zio_compress_info_t *ci = &zio_compress_table[cpfunc]; 609 610 if (ci->ci_decompress != NULL) 611 (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level); 612 else 613 bcopy(src, dst, d_len); 614 615 if ((version ^ ZFS_HOST_BYTEORDER) & DDT_COMPRESS_BYTEORDER_MASK) 616 byteswap_uint64_array(dst, d_len); 617} 618 619ddt_t * 620ddt_select_by_checksum(spa_t *spa, enum zio_checksum c) 621{ 622 return (spa->spa_ddt[c]); 623} 624 625ddt_t * 626ddt_select(spa_t *spa, const blkptr_t *bp) 627{ 628 return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]); 629} 630 631void 632ddt_enter(ddt_t *ddt) 633{ 634 mutex_enter(&ddt->ddt_lock); 635} 636 637void 638ddt_exit(ddt_t *ddt) 639{ 640 mutex_exit(&ddt->ddt_lock); 641} 642 643static ddt_entry_t * 644ddt_alloc(const ddt_key_t *ddk) 645{ 646 ddt_entry_t *dde; 647 648 dde = kmem_zalloc(sizeof (ddt_entry_t), KM_SLEEP); 649 cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL); 650 651 dde->dde_key = *ddk; 652 653 return (dde); 654} 655 656static void 657ddt_free(ddt_entry_t *dde) 658{ 659 ASSERT(!dde->dde_loading); 660 661 for (int p = 0; p < DDT_PHYS_TYPES; p++) 662 ASSERT(dde->dde_lead_zio[p] == NULL); 663 664 if (dde->dde_repair_data != NULL) 665 zio_buf_free(dde->dde_repair_data, 666 DDK_GET_PSIZE(&dde->dde_key)); 667 668 cv_destroy(&dde->dde_cv); 669 kmem_free(dde, sizeof (*dde)); 670} 671 672void 673ddt_remove(ddt_t *ddt, ddt_entry_t *dde) 674{ 675 ASSERT(MUTEX_HELD(&ddt->ddt_lock)); 676 677 avl_remove(&ddt->ddt_tree, dde); 678 ddt_free(dde); 679} 680 681ddt_entry_t * 682ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) 683{ 684 ddt_entry_t *dde, dde_search; 685 enum ddt_type type; 686 enum ddt_class class; 687 avl_index_t where; 688 int error; 689 690 ASSERT(MUTEX_HELD(&ddt->ddt_lock)); 691 692 ddt_key_fill(&dde_search.dde_key, bp); 693 694 dde = avl_find(&ddt->ddt_tree, &dde_search, &where); 695 if (dde == NULL) { 696 if (!add) 697 return (NULL); 698 dde = ddt_alloc(&dde_search.dde_key); 699 avl_insert(&ddt->ddt_tree, dde, where); 700 } 701 702 while (dde->dde_loading) 703 cv_wait(&dde->dde_cv, &ddt->ddt_lock); 704 705 if (dde->dde_loaded) 706 return (dde); 707 708 dde->dde_loading = B_TRUE; 709 710 ddt_exit(ddt); 711 712 error = ENOENT; 713 714 for (type = 0; type < DDT_TYPES; type++) { 715 for (class = 0; class < DDT_CLASSES; class++) { 716 error = ddt_object_lookup(ddt, type, class, dde); 717 if (error != ENOENT) 718 break; 719 } 720 if (error != ENOENT) 721 break; 722 } 723 724 ASSERT(error == 0 || error == ENOENT); 725 726 ddt_enter(ddt); 727 728 ASSERT(dde->dde_loaded == B_FALSE); 729 ASSERT(dde->dde_loading == B_TRUE); 730 731 dde->dde_type = type; /* will be DDT_TYPES if no entry found */ 732 dde->dde_class = class; /* will be DDT_CLASSES if no entry found */ 733 dde->dde_loaded = B_TRUE; 734 dde->dde_loading = B_FALSE; 735 736 if (error == 0) 737 ddt_stat_update(ddt, dde, -1ULL); 738 739 cv_broadcast(&dde->dde_cv); 740 741 return (dde); 742} 743 744void 745ddt_prefetch(spa_t *spa, const blkptr_t *bp) 746{ 747 ddt_t *ddt; 748 ddt_entry_t dde; 749 750 if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp)) 751 return; 752 753 /* 754 * We only remove the DDT once all tables are empty and only 755 * prefetch dedup blocks when there are entries in the DDT. 756 * Thus no locking is required as the DDT can't disappear on us. 757 */ 758 ddt = ddt_select(spa, bp); 759 ddt_key_fill(&dde.dde_key, bp); 760 761 for (enum ddt_type type = 0; type < DDT_TYPES; type++) { 762 for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { 763 ddt_object_prefetch(ddt, type, class, &dde); 764 } 765 } 766} 767 768int 769ddt_entry_compare(const void *x1, const void *x2) 770{ 771 const ddt_entry_t *dde1 = x1; 772 const ddt_entry_t *dde2 = x2; 773 const uint64_t *u1 = (const uint64_t *)&dde1->dde_key; 774 const uint64_t *u2 = (const uint64_t *)&dde2->dde_key; 775 776 for (int i = 0; i < DDT_KEY_WORDS; i++) { 777 if (u1[i] < u2[i]) 778 return (-1); 779 if (u1[i] > u2[i]) 780 return (1); 781 } 782 783 return (0); 784} 785 786static ddt_t * 787ddt_table_alloc(spa_t *spa, enum zio_checksum c) 788{ 789 ddt_t *ddt; 790 791 ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP); 792 793 mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL); 794 avl_create(&ddt->ddt_tree, ddt_entry_compare, 795 sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); 796 avl_create(&ddt->ddt_repair_tree, ddt_entry_compare, 797 sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); 798 ddt->ddt_checksum = c; 799 ddt->ddt_spa = spa; 800 ddt->ddt_os = spa->spa_meta_objset; 801 802 return (ddt); 803} 804 805static void 806ddt_table_free(ddt_t *ddt) 807{ 808 ASSERT(avl_numnodes(&ddt->ddt_tree) == 0); 809 ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0); 810 avl_destroy(&ddt->ddt_tree); 811 avl_destroy(&ddt->ddt_repair_tree); 812 mutex_destroy(&ddt->ddt_lock); 813 kmem_free(ddt, sizeof (*ddt)); 814} 815 816void 817ddt_create(spa_t *spa) 818{ 819 spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM; 820 821 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) 822 spa->spa_ddt[c] = ddt_table_alloc(spa, c); 823} 824 825int 826ddt_load(spa_t *spa) 827{ 828 int error; 829 830 ddt_create(spa); 831 832 error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, 833 DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, 834 &spa->spa_ddt_stat_object); 835 836 if (error) 837 return (error == ENOENT ? 0 : error); 838 839 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { 840 ddt_t *ddt = spa->spa_ddt[c]; 841 for (enum ddt_type type = 0; type < DDT_TYPES; type++) { 842 for (enum ddt_class class = 0; class < DDT_CLASSES; 843 class++) { 844 error = ddt_object_load(ddt, type, class); 845 if (error != 0 && error != ENOENT) 846 return (error); 847 } 848 } 849 850 /* 851 * Seed the cached histograms. 852 */ 853 bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache, 854 sizeof (ddt->ddt_histogram)); 855 } 856 857 return (0); 858} 859 860void 861ddt_unload(spa_t *spa) 862{ 863 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { 864 if (spa->spa_ddt[c]) { 865 ddt_table_free(spa->spa_ddt[c]); 866 spa->spa_ddt[c] = NULL; 867 } 868 } 869} 870 871boolean_t 872ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp) 873{ 874 ddt_t *ddt; 875 ddt_entry_t dde; 876 877 if (!BP_GET_DEDUP(bp)) 878 return (B_FALSE); 879 880 if (max_class == DDT_CLASS_UNIQUE) 881 return (B_TRUE); 882 883 ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)]; 884 885 ddt_key_fill(&dde.dde_key, bp); 886 887 for (enum ddt_type type = 0; type < DDT_TYPES; type++) 888 for (enum ddt_class class = 0; class <= max_class; class++) 889 if (ddt_object_lookup(ddt, type, class, &dde) == 0) 890 return (B_TRUE); 891 892 return (B_FALSE); 893} 894 895ddt_entry_t * 896ddt_repair_start(ddt_t *ddt, const blkptr_t *bp) 897{ 898 ddt_key_t ddk; 899 ddt_entry_t *dde; 900 901 ddt_key_fill(&ddk, bp); 902 903 dde = ddt_alloc(&ddk); 904 905 for (enum ddt_type type = 0; type < DDT_TYPES; type++) { 906 for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { 907 /* 908 * We can only do repair if there are multiple copies 909 * of the block. For anything in the UNIQUE class, 910 * there's definitely only one copy, so don't even try. 911 */ 912 if (class != DDT_CLASS_UNIQUE && 913 ddt_object_lookup(ddt, type, class, dde) == 0) 914 return (dde); 915 } 916 } 917 918 bzero(dde->dde_phys, sizeof (dde->dde_phys)); 919 920 return (dde); 921} 922 923void 924ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde) 925{ 926 avl_index_t where; 927 928 ddt_enter(ddt); 929 930 if (dde->dde_repair_data != NULL && spa_writeable(ddt->ddt_spa) && 931 avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL) 932 avl_insert(&ddt->ddt_repair_tree, dde, where); 933 else 934 ddt_free(dde); 935 936 ddt_exit(ddt); 937} 938 939static void 940ddt_repair_entry_done(zio_t *zio) 941{ 942 ddt_entry_t *rdde = zio->io_private; 943 944 ddt_free(rdde); 945} 946 947static void 948ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio) 949{ 950 ddt_phys_t *ddp = dde->dde_phys; 951 ddt_phys_t *rddp = rdde->dde_phys; 952 ddt_key_t *ddk = &dde->dde_key; 953 ddt_key_t *rddk = &rdde->dde_key; 954 zio_t *zio; 955 blkptr_t blk; 956 957 zio = zio_null(rio, rio->io_spa, NULL, 958 ddt_repair_entry_done, rdde, rio->io_flags); 959 960 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) { 961 if (ddp->ddp_phys_birth == 0 || 962 ddp->ddp_phys_birth != rddp->ddp_phys_birth || 963 bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva))) 964 continue; 965 ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); 966 zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk, 967 rdde->dde_repair_data, DDK_GET_PSIZE(rddk), NULL, NULL, 968 ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL)); 969 } 970 971 zio_nowait(zio); 972} 973 974static void 975ddt_repair_table(ddt_t *ddt, zio_t *rio) 976{ 977 spa_t *spa = ddt->ddt_spa; 978 ddt_entry_t *dde, *rdde_next, *rdde; 979 avl_tree_t *t = &ddt->ddt_repair_tree; 980 blkptr_t blk; 981 982 if (spa_sync_pass(spa) > 1) 983 return; 984 985 ddt_enter(ddt); 986 for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) { 987 rdde_next = AVL_NEXT(t, rdde); 988 avl_remove(&ddt->ddt_repair_tree, rdde); 989 ddt_exit(ddt); 990 ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk); 991 dde = ddt_repair_start(ddt, &blk); 992 ddt_repair_entry(ddt, dde, rdde, rio); 993 ddt_repair_done(ddt, dde); 994 ddt_enter(ddt); 995 } 996 ddt_exit(ddt); 997} 998 999static void 1000ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) 1001{ 1002 dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool; 1003 ddt_phys_t *ddp = dde->dde_phys; 1004 ddt_key_t *ddk = &dde->dde_key; 1005 enum ddt_type otype = dde->dde_type; 1006 enum ddt_type ntype = DDT_TYPE_CURRENT; 1007 enum ddt_class oclass = dde->dde_class; 1008 enum ddt_class nclass; 1009 uint64_t total_refcnt = 0; 1010 1011 ASSERT(dde->dde_loaded); 1012 ASSERT(!dde->dde_loading); 1013 1014 for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { 1015 ASSERT(dde->dde_lead_zio[p] == NULL); 1016 ASSERT((int64_t)ddp->ddp_refcnt >= 0); 1017 if (ddp->ddp_phys_birth == 0) { 1018 ASSERT(ddp->ddp_refcnt == 0); 1019 continue; 1020 } 1021 if (p == DDT_PHYS_DITTO) { 1022 if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0) 1023 ddt_phys_free(ddt, ddk, ddp, txg); 1024 continue; 1025 } 1026 if (ddp->ddp_refcnt == 0) 1027 ddt_phys_free(ddt, ddk, ddp, txg); 1028 total_refcnt += ddp->ddp_refcnt; 1029 } 1030 1031 if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0) 1032 nclass = DDT_CLASS_DITTO; 1033 else if (total_refcnt > 1) 1034 nclass = DDT_CLASS_DUPLICATE; 1035 else 1036 nclass = DDT_CLASS_UNIQUE; 1037 1038 if (otype != DDT_TYPES && 1039 (otype != ntype || oclass != nclass || total_refcnt == 0)) { 1040 VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0); 1041 ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT); 1042 } 1043 1044 if (total_refcnt != 0) { 1045 dde->dde_type = ntype; 1046 dde->dde_class = nclass; 1047 ddt_stat_update(ddt, dde, 0); 1048 if (!ddt_object_exists(ddt, ntype, nclass)) 1049 ddt_object_create(ddt, ntype, nclass, tx); 1050 VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0); 1051 1052 /* 1053 * If the class changes, the order that we scan this bp 1054 * changes. If it decreases, we could miss it, so 1055 * scan it right now. (This covers both class changing 1056 * while we are doing ddt_walk(), and when we are 1057 * traversing.) 1058 */ 1059 if (nclass < oclass) { 1060 dsl_scan_ddt_entry(dp->dp_scan, 1061 ddt->ddt_checksum, dde, tx); 1062 } 1063 } 1064} 1065 1066static void 1067ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) 1068{ 1069 spa_t *spa = ddt->ddt_spa; 1070 ddt_entry_t *dde; 1071 void *cookie = NULL; 1072 1073 if (avl_numnodes(&ddt->ddt_tree) == 0) 1074 return; 1075 1076 ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP); 1077 1078 if (spa->spa_ddt_stat_object == 0) { 1079 spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os, 1080 DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT, 1081 DMU_POOL_DDT_STATS, tx); 1082 } 1083 1084 while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { 1085 ddt_sync_entry(ddt, dde, tx, txg); 1086 ddt_free(dde); 1087 } 1088 1089 for (enum ddt_type type = 0; type < DDT_TYPES; type++) { 1090 uint64_t add, count = 0; 1091 for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { 1092 if (ddt_object_exists(ddt, type, class)) { 1093 ddt_object_sync(ddt, type, class, tx); 1094 VERIFY(ddt_object_count(ddt, type, class, 1095 &add) == 0); 1096 count += add; 1097 } 1098 } 1099 for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { 1100 if (count == 0 && ddt_object_exists(ddt, type, class)) 1101 ddt_object_destroy(ddt, type, class, tx); 1102 } 1103 } 1104 1105 bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache, 1106 sizeof (ddt->ddt_histogram)); 1107} 1108 1109void 1110ddt_sync(spa_t *spa, uint64_t txg) 1111{ 1112 dmu_tx_t *tx; 1113 zio_t *rio = zio_root(spa, NULL, NULL, 1114 ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); 1115 1116 ASSERT(spa_syncing_txg(spa) == txg); 1117 1118 tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); 1119 1120 for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { 1121 ddt_t *ddt = spa->spa_ddt[c]; 1122 if (ddt == NULL) 1123 continue; 1124 ddt_sync_table(ddt, tx, txg); 1125 ddt_repair_table(ddt, rio); 1126 } 1127 1128 (void) zio_wait(rio); 1129 1130 dmu_tx_commit(tx); 1131} 1132 1133int 1134ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde) 1135{ 1136 do { 1137 do { 1138 do { 1139 ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum]; 1140 int error = ENOENT; 1141 if (ddt_object_exists(ddt, ddb->ddb_type, 1142 ddb->ddb_class)) { 1143 error = ddt_object_walk(ddt, 1144 ddb->ddb_type, ddb->ddb_class, 1145 &ddb->ddb_cursor, dde); 1146 } 1147 dde->dde_type = ddb->ddb_type; 1148 dde->dde_class = ddb->ddb_class; 1149 if (error == 0) 1150 return (0); 1151 if (error != ENOENT) 1152 return (error); 1153 ddb->ddb_cursor = 0; 1154 } while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS); 1155 ddb->ddb_checksum = 0; 1156 } while (++ddb->ddb_type < DDT_TYPES); 1157 ddb->ddb_type = 0; 1158 } while (++ddb->ddb_class < DDT_CLASSES); 1159 1160 return (SET_ERROR(ENOENT)); 1161} 1162