1/*
2 * CDDL HEADER START
3 *
4 * This file and its contents are supplied under the terms of the
5 * Common Development and Distribution License ("CDDL"), version 1.0.
6 * You may only use this file in accordance with the terms of version
7 * 1.0 of the CDDL.
8 *
9 * A full copy of the text of the CDDL should have accompanied this
10 * source.  A copy of the CDDL is also available via the Internet at
11 * http://www.illumos.org/license/CDDL.
12 *
13 * CDDL HEADER END
14 */
15
16/*
17 * Copyright (c) 2020 by Delphix. All rights reserved.
18 */
19
20#include <assert.h>
21#include <cityhash.h>
22#include <ctype.h>
23#include <errno.h>
24#include <fcntl.h>
25#include <libzfs_impl.h>
26#include <libzfs.h>
27#include <libzutil.h>
28#include <stddef.h>
29#include <stdio.h>
30#include <stdlib.h>
31#include <strings.h>
32#include <umem.h>
33#include <unistd.h>
34#include <sys/debug.h>
35#include <sys/stat.h>
36#include <sys/zfs_ioctl.h>
37#include <sys/zio_checksum.h>
38#include "zfs_fletcher.h"
39#include "zstream.h"
40
41
42#define	MAX_RDT_PHYSMEM_PERCENT		20
43#define	SMALLEST_POSSIBLE_MAX_RDT_MB		128
44
45typedef struct redup_entry {
46	struct redup_entry	*rde_next;
47	uint64_t rde_guid;
48	uint64_t rde_object;
49	uint64_t rde_offset;
50	uint64_t rde_stream_offset;
51} redup_entry_t;
52
53typedef struct redup_table {
54	redup_entry_t	**redup_hash_array;
55	umem_cache_t	*ddecache;
56	uint64_t	ddt_count;
57	int		numhashbits;
58} redup_table_t;
59
60int
61highbit64(uint64_t i)
62{
63	if (i == 0)
64		return (0);
65
66	return (NBBY * sizeof (uint64_t) - __builtin_clzll(i));
67}
68
69static void *
70safe_calloc(size_t n)
71{
72	void *rv = calloc(1, n);
73	if (rv == NULL) {
74		fprintf(stderr,
75		    "Error: could not allocate %u bytes of memory\n",
76		    (int)n);
77		exit(1);
78	}
79	return (rv);
80}
81
82/*
83 * Safe version of fread(), exits on error.
84 */
85static int
86sfread(void *buf, size_t size, FILE *fp)
87{
88	int rv = fread(buf, size, 1, fp);
89	if (rv == 0 && ferror(fp)) {
90		(void) fprintf(stderr, "Error while reading file: %s\n",
91		    strerror(errno));
92		exit(1);
93	}
94	return (rv);
95}
96
97/*
98 * Safe version of pread(), exits on error.
99 */
100static void
101spread(int fd, void *buf, size_t count, off_t offset)
102{
103	ssize_t err = pread(fd, buf, count, offset);
104	if (err == -1) {
105		(void) fprintf(stderr,
106		    "Error while reading file: %s\n",
107		    strerror(errno));
108		exit(1);
109	} else if (err != count) {
110		(void) fprintf(stderr,
111		    "Error while reading file: short read\n");
112		exit(1);
113	}
114}
115
116static int
117dump_record(dmu_replay_record_t *drr, void *payload, int payload_len,
118    zio_cksum_t *zc, int outfd)
119{
120	assert(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum)
121	    == sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
122	fletcher_4_incremental_native(drr,
123	    offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc);
124	if (drr->drr_type != DRR_BEGIN) {
125		assert(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u.
126		    drr_checksum.drr_checksum));
127		drr->drr_u.drr_checksum.drr_checksum = *zc;
128	}
129	fletcher_4_incremental_native(&drr->drr_u.drr_checksum.drr_checksum,
130	    sizeof (zio_cksum_t), zc);
131	if (write(outfd, drr, sizeof (*drr)) == -1)
132		return (errno);
133	if (payload_len != 0) {
134		fletcher_4_incremental_native(payload, payload_len, zc);
135		if (write(outfd, payload, payload_len) == -1)
136			return (errno);
137	}
138	return (0);
139}
140
141static void
142rdt_insert(redup_table_t *rdt,
143    uint64_t guid, uint64_t object, uint64_t offset, uint64_t stream_offset)
144{
145	uint64_t ch = cityhash4(guid, object, offset, 0);
146	uint64_t hashcode = BF64_GET(ch, 0, rdt->numhashbits);
147	redup_entry_t **rdepp;
148
149	rdepp = &(rdt->redup_hash_array[hashcode]);
150	redup_entry_t *rde = umem_cache_alloc(rdt->ddecache, UMEM_NOFAIL);
151	rde->rde_next = *rdepp;
152	rde->rde_guid = guid;
153	rde->rde_object = object;
154	rde->rde_offset = offset;
155	rde->rde_stream_offset = stream_offset;
156	*rdepp = rde;
157	rdt->ddt_count++;
158}
159
160static void
161rdt_lookup(redup_table_t *rdt,
162    uint64_t guid, uint64_t object, uint64_t offset,
163    uint64_t *stream_offsetp)
164{
165	uint64_t ch = cityhash4(guid, object, offset, 0);
166	uint64_t hashcode = BF64_GET(ch, 0, rdt->numhashbits);
167
168	for (redup_entry_t *rde = rdt->redup_hash_array[hashcode];
169	    rde != NULL; rde = rde->rde_next) {
170		if (rde->rde_guid == guid &&
171		    rde->rde_object == object &&
172		    rde->rde_offset == offset) {
173			*stream_offsetp = rde->rde_stream_offset;
174			return;
175		}
176	}
177	assert(!"could not find expected redup table entry");
178}
179
180/*
181 * Convert a dedup stream (generated by "zfs send -D") to a
182 * non-deduplicated stream.  The entire infd will be converted, including
183 * any substreams in a stream package (generated by "zfs send -RD"). The
184 * infd must be seekable.
185 */
186static void
187zfs_redup_stream(int infd, int outfd, boolean_t verbose)
188{
189	int bufsz = SPA_MAXBLOCKSIZE;
190	dmu_replay_record_t thedrr = { 0 };
191	dmu_replay_record_t *drr = &thedrr;
192	redup_table_t rdt;
193	zio_cksum_t stream_cksum;
194	uint64_t numbuckets;
195	uint64_t num_records = 0;
196	uint64_t num_write_byref_records = 0;
197
198#ifdef _ILP32
199	uint64_t max_rde_size = SMALLEST_POSSIBLE_MAX_RDT_MB << 20;
200#else
201	uint64_t physmem = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE);
202	uint64_t max_rde_size =
203	    MAX((physmem * MAX_RDT_PHYSMEM_PERCENT) / 100,
204	    SMALLEST_POSSIBLE_MAX_RDT_MB << 20);
205#endif
206
207	numbuckets = max_rde_size / (sizeof (redup_entry_t));
208
209	/*
210	 * numbuckets must be a power of 2.  Increase number to
211	 * a power of 2 if necessary.
212	 */
213	if (!ISP2(numbuckets))
214		numbuckets = 1ULL << highbit64(numbuckets);
215
216	rdt.redup_hash_array =
217	    safe_calloc(numbuckets * sizeof (redup_entry_t *));
218	rdt.ddecache = umem_cache_create("rde", sizeof (redup_entry_t), 0,
219	    NULL, NULL, NULL, NULL, NULL, 0);
220	rdt.numhashbits = highbit64(numbuckets) - 1;
221	rdt.ddt_count = 0;
222
223	char *buf = safe_calloc(bufsz);
224	FILE *ofp = fdopen(infd, "r");
225	long offset = ftell(ofp);
226	while (sfread(drr, sizeof (*drr), ofp) != 0) {
227		num_records++;
228
229		/*
230		 * We need to regenerate the checksum.
231		 */
232		if (drr->drr_type != DRR_BEGIN) {
233			bzero(&drr->drr_u.drr_checksum.drr_checksum,
234			    sizeof (drr->drr_u.drr_checksum.drr_checksum));
235		}
236
237		uint64_t payload_size = 0;
238		switch (drr->drr_type) {
239		case DRR_BEGIN:
240		{
241			struct drr_begin *drrb = &drr->drr_u.drr_begin;
242			int fflags;
243			ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
244
245			assert(drrb->drr_magic == DMU_BACKUP_MAGIC);
246
247			/* clear the DEDUP feature flag for this stream */
248			fflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
249			fflags &= ~(DMU_BACKUP_FEATURE_DEDUP |
250			    DMU_BACKUP_FEATURE_DEDUPPROPS);
251			/* cppcheck-suppress syntaxError */
252			DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags);
253
254			int sz = drr->drr_payloadlen;
255			if (sz != 0) {
256				if (sz > bufsz) {
257					free(buf);
258					buf = safe_calloc(sz);
259					bufsz = sz;
260				}
261				(void) sfread(buf, sz, ofp);
262			}
263			payload_size = sz;
264			break;
265		}
266
267		case DRR_END:
268		{
269			struct drr_end *drre = &drr->drr_u.drr_end;
270			/*
271			 * Use the recalculated checksum, unless this is
272			 * the END record of a stream package, which has
273			 * no checksum.
274			 */
275			if (!ZIO_CHECKSUM_IS_ZERO(&drre->drr_checksum))
276				drre->drr_checksum = stream_cksum;
277			break;
278		}
279
280		case DRR_OBJECT:
281		{
282			struct drr_object *drro = &drr->drr_u.drr_object;
283
284			if (drro->drr_bonuslen > 0) {
285				payload_size = DRR_OBJECT_PAYLOAD_SIZE(drro);
286				(void) sfread(buf, payload_size, ofp);
287			}
288			break;
289		}
290
291		case DRR_SPILL:
292		{
293			struct drr_spill *drrs = &drr->drr_u.drr_spill;
294			payload_size = DRR_SPILL_PAYLOAD_SIZE(drrs);
295			(void) sfread(buf, payload_size, ofp);
296			break;
297		}
298
299		case DRR_WRITE_BYREF:
300		{
301			struct drr_write_byref drrwb =
302			    drr->drr_u.drr_write_byref;
303
304			num_write_byref_records++;
305
306			/*
307			 * Look up in hash table by drrwb->drr_refguid,
308			 * drr_refobject, drr_refoffset.  Replace this
309			 * record with the found WRITE record, but with
310			 * drr_object,drr_offset,drr_toguid replaced with ours.
311			 */
312			uint64_t stream_offset = 0;
313			rdt_lookup(&rdt, drrwb.drr_refguid,
314			    drrwb.drr_refobject, drrwb.drr_refoffset,
315			    &stream_offset);
316
317			spread(infd, drr, sizeof (*drr), stream_offset);
318
319			assert(drr->drr_type == DRR_WRITE);
320			struct drr_write *drrw = &drr->drr_u.drr_write;
321			assert(drrw->drr_toguid == drrwb.drr_refguid);
322			assert(drrw->drr_object == drrwb.drr_refobject);
323			assert(drrw->drr_offset == drrwb.drr_refoffset);
324
325			payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
326			spread(infd, buf, payload_size,
327			    stream_offset + sizeof (*drr));
328
329			drrw->drr_toguid = drrwb.drr_toguid;
330			drrw->drr_object = drrwb.drr_object;
331			drrw->drr_offset = drrwb.drr_offset;
332			break;
333		}
334
335		case DRR_WRITE:
336		{
337			struct drr_write *drrw = &drr->drr_u.drr_write;
338			payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
339			(void) sfread(buf, payload_size, ofp);
340
341			rdt_insert(&rdt, drrw->drr_toguid,
342			    drrw->drr_object, drrw->drr_offset, offset);
343			break;
344		}
345
346		case DRR_WRITE_EMBEDDED:
347		{
348			struct drr_write_embedded *drrwe =
349			    &drr->drr_u.drr_write_embedded;
350			payload_size =
351			    P2ROUNDUP((uint64_t)drrwe->drr_psize, 8);
352			(void) sfread(buf, payload_size, ofp);
353			break;
354		}
355
356		case DRR_FREEOBJECTS:
357		case DRR_FREE:
358		case DRR_OBJECT_RANGE:
359			break;
360
361		default:
362			(void) fprintf(stderr, "INVALID record type 0x%x\n",
363			    drr->drr_type);
364			/* should never happen, so assert */
365			assert(B_FALSE);
366		}
367
368		if (feof(ofp)) {
369			fprintf(stderr, "Error: unexpected end-of-file\n");
370			exit(1);
371		}
372		if (ferror(ofp)) {
373			fprintf(stderr, "Error while reading file: %s\n",
374			    strerror(errno));
375			exit(1);
376		}
377
378		/*
379		 * We need to recalculate the checksum, and it needs to be
380		 * initially zero to do that.  BEGIN records don't have
381		 * a checksum.
382		 */
383		if (drr->drr_type != DRR_BEGIN) {
384			bzero(&drr->drr_u.drr_checksum.drr_checksum,
385			    sizeof (drr->drr_u.drr_checksum.drr_checksum));
386		}
387		if (dump_record(drr, buf, payload_size,
388		    &stream_cksum, outfd) != 0)
389			break;
390		if (drr->drr_type == DRR_END) {
391			/*
392			 * Typically the END record is either the last
393			 * thing in the stream, or it is followed
394			 * by a BEGIN record (which also zeros the checksum).
395			 * However, a stream package ends with two END
396			 * records.  The last END record's checksum starts
397			 * from zero.
398			 */
399			ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
400		}
401		offset = ftell(ofp);
402	}
403
404	if (verbose) {
405		char mem_str[16];
406		zfs_nicenum(rdt.ddt_count * sizeof (redup_entry_t),
407		    mem_str, sizeof (mem_str));
408		fprintf(stderr, "converted stream with %llu total records, "
409		    "including %llu dedup records, using %sB memory.\n",
410		    (long long)num_records,
411		    (long long)num_write_byref_records,
412		    mem_str);
413	}
414
415	umem_cache_destroy(rdt.ddecache);
416	free(rdt.redup_hash_array);
417	free(buf);
418	(void) fclose(ofp);
419}
420
421int
422zstream_do_redup(int argc, char *argv[])
423{
424	boolean_t verbose = B_FALSE;
425	int c;
426
427	while ((c = getopt(argc, argv, "v")) != -1) {
428		switch (c) {
429		case 'v':
430			verbose = B_TRUE;
431			break;
432		case '?':
433			(void) fprintf(stderr, "invalid option '%c'\n",
434			    optopt);
435			zstream_usage();
436			break;
437		}
438	}
439
440	argc -= optind;
441	argv += optind;
442
443	if (argc != 1)
444		zstream_usage();
445
446	const char *filename = argv[0];
447
448	if (isatty(STDOUT_FILENO)) {
449		(void) fprintf(stderr,
450		    "Error: Stream can not be written to a terminal.\n"
451		    "You must redirect standard output.\n");
452		return (1);
453	}
454
455	int fd = open(filename, O_RDONLY);
456	if (fd == -1) {
457		(void) fprintf(stderr,
458		    "Error while opening file '%s': %s\n",
459		    filename, strerror(errno));
460		exit(1);
461	}
462
463	fletcher_4_init();
464	zfs_redup_stream(fd, STDOUT_FILENO, verbose);
465	fletcher_4_fini();
466
467	close(fd);
468
469	return (0);
470}
471