ddt.c revision 243674
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2012 by Delphix. All rights reserved.
25 */
26
27#include <sys/zfs_context.h>
28#include <sys/spa.h>
29#include <sys/spa_impl.h>
30#include <sys/zio.h>
31#include <sys/ddt.h>
32#include <sys/zap.h>
33#include <sys/dmu_tx.h>
34#include <sys/arc.h>
35#include <sys/dsl_pool.h>
36#include <sys/zio_checksum.h>
37#include <sys/zio_compress.h>
38#include <sys/dsl_scan.h>
39
40/*
41 * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
42 */
43int zfs_dedup_prefetch = 1;
44
45SYSCTL_DECL(_vfs_zfs);
46SYSCTL_NODE(_vfs_zfs, OID_AUTO, dedup, CTLFLAG_RW, 0, "ZFS DEDUP");
47TUNABLE_INT("vfs.zfs.dedup.prefetch", &zfs_dedup_prefetch);
48SYSCTL_INT(_vfs_zfs_dedup, OID_AUTO, prefetch, CTLFLAG_RW, &zfs_dedup_prefetch,
49    0, "Enable/disable prefetching of dedup-ed blocks which are going to be freed");
50
51static const ddt_ops_t *ddt_ops[DDT_TYPES] = {
52	&ddt_zap_ops,
53};
54
55static const char *ddt_class_name[DDT_CLASSES] = {
56	"ditto",
57	"duplicate",
58	"unique",
59};
60
61static void
62ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
63    dmu_tx_t *tx)
64{
65	spa_t *spa = ddt->ddt_spa;
66	objset_t *os = ddt->ddt_os;
67	uint64_t *objectp = &ddt->ddt_object[type][class];
68	boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_dedup;
69	char name[DDT_NAMELEN];
70
71	ddt_object_name(ddt, type, class, name);
72
73	ASSERT(*objectp == 0);
74	VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0);
75	ASSERT(*objectp != 0);
76
77	VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name,
78	    sizeof (uint64_t), 1, objectp, tx) == 0);
79
80	VERIFY(zap_add(os, spa->spa_ddt_stat_object, name,
81	    sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
82	    &ddt->ddt_histogram[type][class], tx) == 0);
83}
84
85static void
86ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
87    dmu_tx_t *tx)
88{
89	spa_t *spa = ddt->ddt_spa;
90	objset_t *os = ddt->ddt_os;
91	uint64_t *objectp = &ddt->ddt_object[type][class];
92	char name[DDT_NAMELEN];
93
94	ddt_object_name(ddt, type, class, name);
95
96	ASSERT(*objectp != 0);
97	ASSERT(ddt_object_count(ddt, type, class) == 0);
98	ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
99	VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0);
100	VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0);
101	VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0);
102	bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t));
103
104	*objectp = 0;
105}
106
107static int
108ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
109{
110	ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
111	dmu_object_info_t doi;
112	char name[DDT_NAMELEN];
113	int error;
114
115	ddt_object_name(ddt, type, class, name);
116
117	error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name,
118	    sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
119
120	if (error)
121		return (error);
122
123	error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
124	    sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
125	    &ddt->ddt_histogram[type][class]);
126
127	/*
128	 * Seed the cached statistics.
129	 */
130	VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
131
132	ddo->ddo_count = ddt_object_count(ddt, type, class);
133	ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
134	ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
135
136	ASSERT(error == 0);
137	return (error);
138}
139
140static void
141ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
142    dmu_tx_t *tx)
143{
144	ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
145	dmu_object_info_t doi;
146	char name[DDT_NAMELEN];
147
148	ddt_object_name(ddt, type, class, name);
149
150	VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
151	    sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
152	    &ddt->ddt_histogram[type][class], tx) == 0);
153
154	/*
155	 * Cache DDT statistics; this is the only time they'll change.
156	 */
157	VERIFY(ddt_object_info(ddt, type, class, &doi) == 0);
158
159	ddo->ddo_count = ddt_object_count(ddt, type, class);
160	ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
161	ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
162}
163
164static int
165ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
166    ddt_entry_t *dde)
167{
168	if (!ddt_object_exists(ddt, type, class))
169		return (ENOENT);
170
171	return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os,
172	    ddt->ddt_object[type][class], dde));
173}
174
175static void
176ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
177    ddt_entry_t *dde)
178{
179	if (!ddt_object_exists(ddt, type, class))
180		return;
181
182	ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os,
183	    ddt->ddt_object[type][class], dde);
184}
185
186int
187ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
188    ddt_entry_t *dde, dmu_tx_t *tx)
189{
190	ASSERT(ddt_object_exists(ddt, type, class));
191
192	return (ddt_ops[type]->ddt_op_update(ddt->ddt_os,
193	    ddt->ddt_object[type][class], dde, tx));
194}
195
196static int
197ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
198    ddt_entry_t *dde, dmu_tx_t *tx)
199{
200	ASSERT(ddt_object_exists(ddt, type, class));
201
202	return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os,
203	    ddt->ddt_object[type][class], dde, tx));
204}
205
206int
207ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
208    uint64_t *walk, ddt_entry_t *dde)
209{
210	ASSERT(ddt_object_exists(ddt, type, class));
211
212	return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os,
213	    ddt->ddt_object[type][class], dde, walk));
214}
215
216uint64_t
217ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
218{
219	ASSERT(ddt_object_exists(ddt, type, class));
220
221	return (ddt_ops[type]->ddt_op_count(ddt->ddt_os,
222	    ddt->ddt_object[type][class]));
223}
224
225int
226ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
227    dmu_object_info_t *doi)
228{
229	if (!ddt_object_exists(ddt, type, class))
230		return (ENOENT);
231
232	return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class],
233	    doi));
234}
235
236boolean_t
237ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
238{
239	return (!!ddt->ddt_object[type][class]);
240}
241
242void
243ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
244    char *name)
245{
246	(void) sprintf(name, DMU_POOL_DDT,
247	    zio_checksum_table[ddt->ddt_checksum].ci_name,
248	    ddt_ops[type]->ddt_op_name, ddt_class_name[class]);
249}
250
251void
252ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg)
253{
254	ASSERT(txg != 0);
255
256	for (int d = 0; d < SPA_DVAS_PER_BP; d++)
257		bp->blk_dva[d] = ddp->ddp_dva[d];
258	BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth);
259}
260
261void
262ddt_bp_create(enum zio_checksum checksum,
263    const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp)
264{
265	BP_ZERO(bp);
266
267	if (ddp != NULL)
268		ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth);
269
270	bp->blk_cksum = ddk->ddk_cksum;
271	bp->blk_fill = 1;
272
273	BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk));
274	BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk));
275	BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk));
276	BP_SET_CHECKSUM(bp, checksum);
277	BP_SET_TYPE(bp, DMU_OT_DEDUP);
278	BP_SET_LEVEL(bp, 0);
279	BP_SET_DEDUP(bp, 0);
280	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
281}
282
283void
284ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp)
285{
286	ddk->ddk_cksum = bp->blk_cksum;
287	ddk->ddk_prop = 0;
288
289	DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp));
290	DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp));
291	DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp));
292}
293
294void
295ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp)
296{
297	ASSERT(ddp->ddp_phys_birth == 0);
298
299	for (int d = 0; d < SPA_DVAS_PER_BP; d++)
300		ddp->ddp_dva[d] = bp->blk_dva[d];
301	ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp);
302}
303
304void
305ddt_phys_clear(ddt_phys_t *ddp)
306{
307	bzero(ddp, sizeof (*ddp));
308}
309
310void
311ddt_phys_addref(ddt_phys_t *ddp)
312{
313	ddp->ddp_refcnt++;
314}
315
316void
317ddt_phys_decref(ddt_phys_t *ddp)
318{
319	ASSERT((int64_t)ddp->ddp_refcnt > 0);
320	ddp->ddp_refcnt--;
321}
322
323void
324ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg)
325{
326	blkptr_t blk;
327
328	ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
329	ddt_phys_clear(ddp);
330	zio_free(ddt->ddt_spa, txg, &blk);
331}
332
333ddt_phys_t *
334ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp)
335{
336	ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys;
337
338	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
339		if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) &&
340		    BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth)
341			return (ddp);
342	}
343	return (NULL);
344}
345
346uint64_t
347ddt_phys_total_refcnt(const ddt_entry_t *dde)
348{
349	uint64_t refcnt = 0;
350
351	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++)
352		refcnt += dde->dde_phys[p].ddp_refcnt;
353
354	return (refcnt);
355}
356
357static void
358ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds)
359{
360	spa_t *spa = ddt->ddt_spa;
361	ddt_phys_t *ddp = dde->dde_phys;
362	ddt_key_t *ddk = &dde->dde_key;
363	uint64_t lsize = DDK_GET_LSIZE(ddk);
364	uint64_t psize = DDK_GET_PSIZE(ddk);
365
366	bzero(dds, sizeof (*dds));
367
368	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
369		uint64_t dsize = 0;
370		uint64_t refcnt = ddp->ddp_refcnt;
371
372		if (ddp->ddp_phys_birth == 0)
373			continue;
374
375		for (int d = 0; d < SPA_DVAS_PER_BP; d++)
376			dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]);
377
378		dds->dds_blocks += 1;
379		dds->dds_lsize += lsize;
380		dds->dds_psize += psize;
381		dds->dds_dsize += dsize;
382
383		dds->dds_ref_blocks += refcnt;
384		dds->dds_ref_lsize += lsize * refcnt;
385		dds->dds_ref_psize += psize * refcnt;
386		dds->dds_ref_dsize += dsize * refcnt;
387	}
388}
389
390void
391ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg)
392{
393	const uint64_t *s = (const uint64_t *)src;
394	uint64_t *d = (uint64_t *)dst;
395	uint64_t *d_end = (uint64_t *)(dst + 1);
396
397	ASSERT(neg == 0 || neg == -1ULL);	/* add or subtract */
398
399	while (d < d_end)
400		*d++ += (*s++ ^ neg) - neg;
401}
402
403static void
404ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg)
405{
406	ddt_stat_t dds;
407	ddt_histogram_t *ddh;
408	int bucket;
409
410	ddt_stat_generate(ddt, dde, &dds);
411
412	bucket = highbit(dds.dds_ref_blocks) - 1;
413	ASSERT(bucket >= 0);
414
415	ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
416
417	ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg);
418}
419
420void
421ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src)
422{
423	for (int h = 0; h < 64; h++)
424		ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0);
425}
426
427void
428ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh)
429{
430	bzero(dds, sizeof (*dds));
431
432	for (int h = 0; h < 64; h++)
433		ddt_stat_add(dds, &ddh->ddh_stat[h], 0);
434}
435
436boolean_t
437ddt_histogram_empty(const ddt_histogram_t *ddh)
438{
439	const uint64_t *s = (const uint64_t *)ddh;
440	const uint64_t *s_end = (const uint64_t *)(ddh + 1);
441
442	while (s < s_end)
443		if (*s++ != 0)
444			return (B_FALSE);
445
446	return (B_TRUE);
447}
448
449void
450ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total)
451{
452	/* Sum the statistics we cached in ddt_object_sync(). */
453	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
454		ddt_t *ddt = spa->spa_ddt[c];
455		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
456			for (enum ddt_class class = 0; class < DDT_CLASSES;
457			    class++) {
458				ddt_object_t *ddo =
459				    &ddt->ddt_object_stats[type][class];
460				ddo_total->ddo_count += ddo->ddo_count;
461				ddo_total->ddo_dspace += ddo->ddo_dspace;
462				ddo_total->ddo_mspace += ddo->ddo_mspace;
463			}
464		}
465	}
466
467	/* ... and compute the averages. */
468	if (ddo_total->ddo_count != 0) {
469		ddo_total->ddo_dspace /= ddo_total->ddo_count;
470		ddo_total->ddo_mspace /= ddo_total->ddo_count;
471	}
472}
473
474void
475ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh)
476{
477	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
478		ddt_t *ddt = spa->spa_ddt[c];
479		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
480			for (enum ddt_class class = 0; class < DDT_CLASSES;
481			    class++) {
482				ddt_histogram_add(ddh,
483				    &ddt->ddt_histogram_cache[type][class]);
484			}
485		}
486	}
487}
488
489void
490ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total)
491{
492	ddt_histogram_t *ddh_total;
493
494	ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP);
495	ddt_get_dedup_histogram(spa, ddh_total);
496	ddt_histogram_stat(dds_total, ddh_total);
497	kmem_free(ddh_total, sizeof (ddt_histogram_t));
498}
499
500uint64_t
501ddt_get_dedup_dspace(spa_t *spa)
502{
503	ddt_stat_t dds_total = { 0 };
504
505	ddt_get_dedup_stats(spa, &dds_total);
506	return (dds_total.dds_ref_dsize - dds_total.dds_dsize);
507}
508
509uint64_t
510ddt_get_pool_dedup_ratio(spa_t *spa)
511{
512	ddt_stat_t dds_total = { 0 };
513
514	ddt_get_dedup_stats(spa, &dds_total);
515	if (dds_total.dds_dsize == 0)
516		return (100);
517
518	return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize);
519}
520
521int
522ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref)
523{
524	spa_t *spa = ddt->ddt_spa;
525	uint64_t total_refcnt = 0;
526	uint64_t ditto = spa->spa_dedup_ditto;
527	int total_copies = 0;
528	int desired_copies = 0;
529
530	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
531		ddt_phys_t *ddp = &dde->dde_phys[p];
532		zio_t *zio = dde->dde_lead_zio[p];
533		uint64_t refcnt = ddp->ddp_refcnt;	/* committed refs */
534		if (zio != NULL)
535			refcnt += zio->io_parent_count;	/* pending refs */
536		if (ddp == ddp_willref)
537			refcnt++;			/* caller's ref */
538		if (refcnt != 0) {
539			total_refcnt += refcnt;
540			total_copies += p;
541		}
542	}
543
544	if (ditto == 0 || ditto > UINT32_MAX)
545		ditto = UINT32_MAX;
546
547	if (total_refcnt >= 1)
548		desired_copies++;
549	if (total_refcnt >= ditto)
550		desired_copies++;
551	if (total_refcnt >= ditto * ditto)
552		desired_copies++;
553
554	return (MAX(desired_copies, total_copies) - total_copies);
555}
556
557int
558ddt_ditto_copies_present(ddt_entry_t *dde)
559{
560	ddt_phys_t *ddp = &dde->dde_phys[DDT_PHYS_DITTO];
561	dva_t *dva = ddp->ddp_dva;
562	int copies = 0 - DVA_GET_GANG(dva);
563
564	for (int d = 0; d < SPA_DVAS_PER_BP; d++, dva++)
565		if (DVA_IS_VALID(dva))
566			copies++;
567
568	ASSERT(copies >= 0 && copies < SPA_DVAS_PER_BP);
569
570	return (copies);
571}
572
573size_t
574ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len)
575{
576	uchar_t *version = dst++;
577	int cpfunc = ZIO_COMPRESS_ZLE;
578	zio_compress_info_t *ci = &zio_compress_table[cpfunc];
579	size_t c_len;
580
581	ASSERT(d_len >= s_len + 1);	/* no compression plus version byte */
582
583	c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level);
584
585	if (c_len == s_len) {
586		cpfunc = ZIO_COMPRESS_OFF;
587		bcopy(src, dst, s_len);
588	}
589
590	*version = (ZFS_HOST_BYTEORDER & DDT_COMPRESS_BYTEORDER_MASK) | cpfunc;
591
592	return (c_len + 1);
593}
594
595void
596ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len)
597{
598	uchar_t version = *src++;
599	int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK;
600	zio_compress_info_t *ci = &zio_compress_table[cpfunc];
601
602	if (ci->ci_decompress != NULL)
603		(void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level);
604	else
605		bcopy(src, dst, d_len);
606
607	if ((version ^ ZFS_HOST_BYTEORDER) & DDT_COMPRESS_BYTEORDER_MASK)
608		byteswap_uint64_array(dst, d_len);
609}
610
611ddt_t *
612ddt_select_by_checksum(spa_t *spa, enum zio_checksum c)
613{
614	return (spa->spa_ddt[c]);
615}
616
617ddt_t *
618ddt_select(spa_t *spa, const blkptr_t *bp)
619{
620	return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]);
621}
622
623void
624ddt_enter(ddt_t *ddt)
625{
626	mutex_enter(&ddt->ddt_lock);
627}
628
629void
630ddt_exit(ddt_t *ddt)
631{
632	mutex_exit(&ddt->ddt_lock);
633}
634
635static ddt_entry_t *
636ddt_alloc(const ddt_key_t *ddk)
637{
638	ddt_entry_t *dde;
639
640	dde = kmem_zalloc(sizeof (ddt_entry_t), KM_SLEEP);
641	cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL);
642
643	dde->dde_key = *ddk;
644
645	return (dde);
646}
647
648static void
649ddt_free(ddt_entry_t *dde)
650{
651	ASSERT(!dde->dde_loading);
652
653	for (int p = 0; p < DDT_PHYS_TYPES; p++)
654		ASSERT(dde->dde_lead_zio[p] == NULL);
655
656	if (dde->dde_repair_data != NULL)
657		zio_buf_free(dde->dde_repair_data,
658		    DDK_GET_PSIZE(&dde->dde_key));
659
660	cv_destroy(&dde->dde_cv);
661	kmem_free(dde, sizeof (*dde));
662}
663
664void
665ddt_remove(ddt_t *ddt, ddt_entry_t *dde)
666{
667	ASSERT(MUTEX_HELD(&ddt->ddt_lock));
668
669	avl_remove(&ddt->ddt_tree, dde);
670	ddt_free(dde);
671}
672
673ddt_entry_t *
674ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add)
675{
676	ddt_entry_t *dde, dde_search;
677	enum ddt_type type;
678	enum ddt_class class;
679	avl_index_t where;
680	int error;
681
682	ASSERT(MUTEX_HELD(&ddt->ddt_lock));
683
684	ddt_key_fill(&dde_search.dde_key, bp);
685
686	dde = avl_find(&ddt->ddt_tree, &dde_search, &where);
687	if (dde == NULL) {
688		if (!add)
689			return (NULL);
690		dde = ddt_alloc(&dde_search.dde_key);
691		avl_insert(&ddt->ddt_tree, dde, where);
692	}
693
694	while (dde->dde_loading)
695		cv_wait(&dde->dde_cv, &ddt->ddt_lock);
696
697	if (dde->dde_loaded)
698		return (dde);
699
700	dde->dde_loading = B_TRUE;
701
702	ddt_exit(ddt);
703
704	error = ENOENT;
705
706	for (type = 0; type < DDT_TYPES; type++) {
707		for (class = 0; class < DDT_CLASSES; class++) {
708			error = ddt_object_lookup(ddt, type, class, dde);
709			if (error != ENOENT)
710				break;
711		}
712		if (error != ENOENT)
713			break;
714	}
715
716	ASSERT(error == 0 || error == ENOENT);
717
718	ddt_enter(ddt);
719
720	ASSERT(dde->dde_loaded == B_FALSE);
721	ASSERT(dde->dde_loading == B_TRUE);
722
723	dde->dde_type = type;	/* will be DDT_TYPES if no entry found */
724	dde->dde_class = class;	/* will be DDT_CLASSES if no entry found */
725	dde->dde_loaded = B_TRUE;
726	dde->dde_loading = B_FALSE;
727
728	if (error == 0)
729		ddt_stat_update(ddt, dde, -1ULL);
730
731	cv_broadcast(&dde->dde_cv);
732
733	return (dde);
734}
735
736void
737ddt_prefetch(spa_t *spa, const blkptr_t *bp)
738{
739	ddt_t *ddt;
740	ddt_entry_t dde;
741
742	if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp))
743		return;
744
745	/*
746	 * We only remove the DDT once all tables are empty and only
747	 * prefetch dedup blocks when there are entries in the DDT.
748	 * Thus no locking is required as the DDT can't disappear on us.
749	 */
750	ddt = ddt_select(spa, bp);
751	ddt_key_fill(&dde.dde_key, bp);
752
753	for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
754		for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
755			ddt_object_prefetch(ddt, type, class, &dde);
756		}
757	}
758}
759
760int
761ddt_entry_compare(const void *x1, const void *x2)
762{
763	const ddt_entry_t *dde1 = x1;
764	const ddt_entry_t *dde2 = x2;
765	const uint64_t *u1 = (const uint64_t *)&dde1->dde_key;
766	const uint64_t *u2 = (const uint64_t *)&dde2->dde_key;
767
768	for (int i = 0; i < DDT_KEY_WORDS; i++) {
769		if (u1[i] < u2[i])
770			return (-1);
771		if (u1[i] > u2[i])
772			return (1);
773	}
774
775	return (0);
776}
777
778static ddt_t *
779ddt_table_alloc(spa_t *spa, enum zio_checksum c)
780{
781	ddt_t *ddt;
782
783	ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP);
784
785	mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL);
786	avl_create(&ddt->ddt_tree, ddt_entry_compare,
787	    sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
788	avl_create(&ddt->ddt_repair_tree, ddt_entry_compare,
789	    sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node));
790	ddt->ddt_checksum = c;
791	ddt->ddt_spa = spa;
792	ddt->ddt_os = spa->spa_meta_objset;
793
794	return (ddt);
795}
796
797static void
798ddt_table_free(ddt_t *ddt)
799{
800	ASSERT(avl_numnodes(&ddt->ddt_tree) == 0);
801	ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0);
802	avl_destroy(&ddt->ddt_tree);
803	avl_destroy(&ddt->ddt_repair_tree);
804	mutex_destroy(&ddt->ddt_lock);
805	kmem_free(ddt, sizeof (*ddt));
806}
807
808void
809ddt_create(spa_t *spa)
810{
811	spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM;
812
813	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++)
814		spa->spa_ddt[c] = ddt_table_alloc(spa, c);
815}
816
817int
818ddt_load(spa_t *spa)
819{
820	int error;
821
822	ddt_create(spa);
823
824	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
825	    DMU_POOL_DDT_STATS, sizeof (uint64_t), 1,
826	    &spa->spa_ddt_stat_object);
827
828	if (error)
829		return (error == ENOENT ? 0 : error);
830
831	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
832		ddt_t *ddt = spa->spa_ddt[c];
833		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
834			for (enum ddt_class class = 0; class < DDT_CLASSES;
835			    class++) {
836				error = ddt_object_load(ddt, type, class);
837				if (error != 0 && error != ENOENT)
838					return (error);
839			}
840		}
841
842		/*
843		 * Seed the cached histograms.
844		 */
845		bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
846		    sizeof (ddt->ddt_histogram));
847	}
848
849	return (0);
850}
851
852void
853ddt_unload(spa_t *spa)
854{
855	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
856		if (spa->spa_ddt[c]) {
857			ddt_table_free(spa->spa_ddt[c]);
858			spa->spa_ddt[c] = NULL;
859		}
860	}
861}
862
863boolean_t
864ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp)
865{
866	ddt_t *ddt;
867	ddt_entry_t dde;
868
869	if (!BP_GET_DEDUP(bp))
870		return (B_FALSE);
871
872	if (max_class == DDT_CLASS_UNIQUE)
873		return (B_TRUE);
874
875	ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)];
876
877	ddt_key_fill(&dde.dde_key, bp);
878
879	for (enum ddt_type type = 0; type < DDT_TYPES; type++)
880		for (enum ddt_class class = 0; class <= max_class; class++)
881			if (ddt_object_lookup(ddt, type, class, &dde) == 0)
882				return (B_TRUE);
883
884	return (B_FALSE);
885}
886
887ddt_entry_t *
888ddt_repair_start(ddt_t *ddt, const blkptr_t *bp)
889{
890	ddt_key_t ddk;
891	ddt_entry_t *dde;
892
893	ddt_key_fill(&ddk, bp);
894
895	dde = ddt_alloc(&ddk);
896
897	for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
898		for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
899			/*
900			 * We can only do repair if there are multiple copies
901			 * of the block.  For anything in the UNIQUE class,
902			 * there's definitely only one copy, so don't even try.
903			 */
904			if (class != DDT_CLASS_UNIQUE &&
905			    ddt_object_lookup(ddt, type, class, dde) == 0)
906				return (dde);
907		}
908	}
909
910	bzero(dde->dde_phys, sizeof (dde->dde_phys));
911
912	return (dde);
913}
914
915void
916ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde)
917{
918	avl_index_t where;
919
920	ddt_enter(ddt);
921
922	if (dde->dde_repair_data != NULL && spa_writeable(ddt->ddt_spa) &&
923	    avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL)
924		avl_insert(&ddt->ddt_repair_tree, dde, where);
925	else
926		ddt_free(dde);
927
928	ddt_exit(ddt);
929}
930
931static void
932ddt_repair_entry_done(zio_t *zio)
933{
934	ddt_entry_t *rdde = zio->io_private;
935
936	ddt_free(rdde);
937}
938
939static void
940ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio)
941{
942	ddt_phys_t *ddp = dde->dde_phys;
943	ddt_phys_t *rddp = rdde->dde_phys;
944	ddt_key_t *ddk = &dde->dde_key;
945	ddt_key_t *rddk = &rdde->dde_key;
946	zio_t *zio;
947	blkptr_t blk;
948
949	zio = zio_null(rio, rio->io_spa, NULL,
950	    ddt_repair_entry_done, rdde, rio->io_flags);
951
952	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) {
953		if (ddp->ddp_phys_birth == 0 ||
954		    ddp->ddp_phys_birth != rddp->ddp_phys_birth ||
955		    bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva)))
956			continue;
957		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
958		zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk,
959		    rdde->dde_repair_data, DDK_GET_PSIZE(rddk), NULL, NULL,
960		    ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL));
961	}
962
963	zio_nowait(zio);
964}
965
966static void
967ddt_repair_table(ddt_t *ddt, zio_t *rio)
968{
969	spa_t *spa = ddt->ddt_spa;
970	ddt_entry_t *dde, *rdde_next, *rdde;
971	avl_tree_t *t = &ddt->ddt_repair_tree;
972	blkptr_t blk;
973
974	if (spa_sync_pass(spa) > 1)
975		return;
976
977	ddt_enter(ddt);
978	for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) {
979		rdde_next = AVL_NEXT(t, rdde);
980		avl_remove(&ddt->ddt_repair_tree, rdde);
981		ddt_exit(ddt);
982		ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk);
983		dde = ddt_repair_start(ddt, &blk);
984		ddt_repair_entry(ddt, dde, rdde, rio);
985		ddt_repair_done(ddt, dde);
986		ddt_enter(ddt);
987	}
988	ddt_exit(ddt);
989}
990
991static void
992ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg)
993{
994	dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool;
995	ddt_phys_t *ddp = dde->dde_phys;
996	ddt_key_t *ddk = &dde->dde_key;
997	enum ddt_type otype = dde->dde_type;
998	enum ddt_type ntype = DDT_TYPE_CURRENT;
999	enum ddt_class oclass = dde->dde_class;
1000	enum ddt_class nclass;
1001	uint64_t total_refcnt = 0;
1002
1003	ASSERT(dde->dde_loaded);
1004	ASSERT(!dde->dde_loading);
1005
1006	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
1007		ASSERT(dde->dde_lead_zio[p] == NULL);
1008		ASSERT((int64_t)ddp->ddp_refcnt >= 0);
1009		if (ddp->ddp_phys_birth == 0) {
1010			ASSERT(ddp->ddp_refcnt == 0);
1011			continue;
1012		}
1013		if (p == DDT_PHYS_DITTO) {
1014			if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0)
1015				ddt_phys_free(ddt, ddk, ddp, txg);
1016			continue;
1017		}
1018		if (ddp->ddp_refcnt == 0)
1019			ddt_phys_free(ddt, ddk, ddp, txg);
1020		total_refcnt += ddp->ddp_refcnt;
1021	}
1022
1023	if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0)
1024		nclass = DDT_CLASS_DITTO;
1025	else if (total_refcnt > 1)
1026		nclass = DDT_CLASS_DUPLICATE;
1027	else
1028		nclass = DDT_CLASS_UNIQUE;
1029
1030	if (otype != DDT_TYPES &&
1031	    (otype != ntype || oclass != nclass || total_refcnt == 0)) {
1032		VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0);
1033		ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT);
1034	}
1035
1036	if (total_refcnt != 0) {
1037		dde->dde_type = ntype;
1038		dde->dde_class = nclass;
1039		ddt_stat_update(ddt, dde, 0);
1040		if (!ddt_object_exists(ddt, ntype, nclass))
1041			ddt_object_create(ddt, ntype, nclass, tx);
1042		VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0);
1043
1044		/*
1045		 * If the class changes, the order that we scan this bp
1046		 * changes.  If it decreases, we could miss it, so
1047		 * scan it right now.  (This covers both class changing
1048		 * while we are doing ddt_walk(), and when we are
1049		 * traversing.)
1050		 */
1051		if (nclass < oclass) {
1052			dsl_scan_ddt_entry(dp->dp_scan,
1053			    ddt->ddt_checksum, dde, tx);
1054		}
1055	}
1056}
1057
1058static void
1059ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
1060{
1061	spa_t *spa = ddt->ddt_spa;
1062	ddt_entry_t *dde;
1063	void *cookie = NULL;
1064
1065	if (avl_numnodes(&ddt->ddt_tree) == 0)
1066		return;
1067
1068	ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP);
1069
1070	if (spa->spa_ddt_stat_object == 0) {
1071		spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os,
1072		    DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT,
1073		    DMU_POOL_DDT_STATS, tx);
1074	}
1075
1076	while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) {
1077		ddt_sync_entry(ddt, dde, tx, txg);
1078		ddt_free(dde);
1079	}
1080
1081	for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
1082		uint64_t count = 0;
1083		for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
1084			if (ddt_object_exists(ddt, type, class)) {
1085				ddt_object_sync(ddt, type, class, tx);
1086				count += ddt_object_count(ddt, type, class);
1087			}
1088		}
1089		for (enum ddt_class class = 0; class < DDT_CLASSES; class++) {
1090			if (count == 0 && ddt_object_exists(ddt, type, class))
1091				ddt_object_destroy(ddt, type, class, tx);
1092		}
1093	}
1094
1095	bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache,
1096	    sizeof (ddt->ddt_histogram));
1097}
1098
1099void
1100ddt_sync(spa_t *spa, uint64_t txg)
1101{
1102	dmu_tx_t *tx;
1103	zio_t *rio = zio_root(spa, NULL, NULL,
1104	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
1105
1106	ASSERT(spa_syncing_txg(spa) == txg);
1107
1108	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
1109
1110	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
1111		ddt_t *ddt = spa->spa_ddt[c];
1112		if (ddt == NULL)
1113			continue;
1114		ddt_sync_table(ddt, tx, txg);
1115		ddt_repair_table(ddt, rio);
1116	}
1117
1118	(void) zio_wait(rio);
1119
1120	dmu_tx_commit(tx);
1121}
1122
1123int
1124ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)
1125{
1126	do {
1127		do {
1128			do {
1129				ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum];
1130				int error = ENOENT;
1131				if (ddt_object_exists(ddt, ddb->ddb_type,
1132				    ddb->ddb_class)) {
1133					error = ddt_object_walk(ddt,
1134					    ddb->ddb_type, ddb->ddb_class,
1135					    &ddb->ddb_cursor, dde);
1136				}
1137				dde->dde_type = ddb->ddb_type;
1138				dde->dde_class = ddb->ddb_class;
1139				if (error == 0)
1140					return (0);
1141				if (error != ENOENT)
1142					return (error);
1143				ddb->ddb_cursor = 0;
1144			} while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS);
1145			ddb->ddb_checksum = 0;
1146		} while (++ddb->ddb_type < DDT_TYPES);
1147		ddb->ddb_type = 0;
1148	} while (++ddb->ddb_class < DDT_CLASSES);
1149
1150	return (ENOENT);
1151}
1152