1168404Spjd/*
2168404Spjd * CDDL HEADER START
3168404Spjd *
4168404Spjd * The contents of this file are subject to the terms of the
5168404Spjd * Common Development and Distribution License (the "License").
6168404Spjd * You may not use this file except in compliance with the License.
7168404Spjd *
8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9168404Spjd * or http://www.opensolaris.org/os/licensing.
10168404Spjd * See the License for the specific language governing permissions
11168404Spjd * and limitations under the License.
12168404Spjd *
13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each
14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15168404Spjd * If applicable, add the following below this CDDL HEADER, with the
16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying
17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
18168404Spjd *
19168404Spjd * CDDL HEADER END
20168404Spjd */
21168404Spjd/*
22219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23290765Smav * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
24255750Sdelphij * Copyright (c) 2013, Joyent, Inc. All rights reserved.
25290757Smav * Copyright 2013 Saso Kiselkov. All rights reserved.
26168404Spjd */
27168404Spjd
28168404Spjd#include <sys/zfs_context.h>
29168404Spjd#include <sys/spa.h>
30290757Smav#include <sys/spa_impl.h>
31168404Spjd#include <sys/zio.h>
32168404Spjd#include <sys/zio_checksum.h>
33219089Spjd#include <sys/zil.h>
34219089Spjd#include <zfs_fletcher.h>
35168404Spjd
36168404Spjd/*
37168404Spjd * Checksum vectors.
38168404Spjd *
39168404Spjd * In the SPA, everything is checksummed.  We support checksum vectors
40168404Spjd * for three distinct reasons:
41168404Spjd *
42168404Spjd *   1. Different kinds of data need different levels of protection.
43168404Spjd *	For SPA metadata, we always want a very strong checksum.
44168404Spjd *	For user data, we let users make the trade-off between speed
45168404Spjd *	and checksum strength.
46168404Spjd *
47168404Spjd *   2. Cryptographic hash and MAC algorithms are an area of active research.
48168404Spjd *	It is likely that in future hash functions will be at least as strong
49168404Spjd *	as current best-of-breed, and may be substantially faster as well.
50168404Spjd *	We want the ability to take advantage of these new hashes as soon as
51168404Spjd *	they become available.
52168404Spjd *
53168404Spjd *   3. If someone develops hardware that can compute a strong hash quickly,
54168404Spjd *	we want the ability to take advantage of that hardware.
55168404Spjd *
56168404Spjd * Of course, we don't want a checksum upgrade to invalidate existing
57219089Spjd * data, so we store the checksum *function* in eight bits of the bp.
58219089Spjd * This gives us room for up to 256 different checksum functions.
59168404Spjd *
60168404Spjd * When writing a block, we always checksum it with the latest-and-greatest
61168404Spjd * checksum function of the appropriate strength.  When reading a block,
62168404Spjd * we compare the expected checksum against the actual checksum, which we
63219089Spjd * compute via the checksum function specified by BP_GET_CHECKSUM(bp).
64290757Smav *
65290757Smav * SALTED CHECKSUMS
66290757Smav *
67290757Smav * To enable the use of less secure hash algorithms with dedup, we
68290757Smav * introduce the notion of salted checksums (MACs, really).  A salted
69290757Smav * checksum is fed both a random 256-bit value (the salt) and the data
70290757Smav * to be checksummed.  This salt is kept secret (stored on the pool, but
71290757Smav * never shown to the user).  Thus even if an attacker knew of collision
72290757Smav * weaknesses in the hash algorithm, they won't be able to mount a known
73290757Smav * plaintext attack on the DDT, since the actual hash value cannot be
74290757Smav * known ahead of time.  How the salt is used is algorithm-specific
75290757Smav * (some might simply prefix it to the data block, others might need to
76290757Smav * utilize a full-blown HMAC).  On disk the salt is stored in a ZAP
77290757Smav * object in the MOS (DMU_POOL_CHECKSUM_SALT).
78290757Smav *
79290757Smav * CONTEXT TEMPLATES
80290757Smav *
81290757Smav * Some hashing algorithms need to perform a substantial amount of
82290757Smav * initialization work (e.g. salted checksums above may need to pre-hash
83290757Smav * the salt) before being able to process data.  Performing this
84290757Smav * redundant work for each block would be wasteful, so we instead allow
85290757Smav * a checksum algorithm to do the work once (the first time it's used)
86290757Smav * and then keep this pre-initialized context as a template inside the
87290757Smav * spa_t (spa_cksum_tmpls).  If the zio_checksum_info_t contains
88290757Smav * non-NULL ci_tmpl_init and ci_tmpl_free callbacks, they are used to
89290757Smav * construct and destruct the pre-initialized checksum context.  The
90290757Smav * pre-initialized context is then reused during each checksum
91290757Smav * invocation and passed to the checksum function.
92168404Spjd */
93168404Spjd
94168404Spjd/*ARGSUSED*/
95168404Spjdstatic void
96290757Smavzio_checksum_off(const void *buf, uint64_t size,
97290757Smav    const void *ctx_template, zio_cksum_t *zcp)
98168404Spjd{
99168404Spjd	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
100168404Spjd}
101168404Spjd
102168404Spjdzio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
103290757Smav	{{NULL, NULL}, NULL, NULL, 0, "inherit"},
104290757Smav	{{NULL, NULL}, NULL, NULL, 0, "on"},
105290757Smav	{{zio_checksum_off,		zio_checksum_off},
106290757Smav	    NULL, NULL, 0, "off"},
107290757Smav	{{zio_checksum_SHA256,		zio_checksum_SHA256},
108290757Smav	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
109290757Smav	    "label"},
110290757Smav	{{zio_checksum_SHA256,		zio_checksum_SHA256},
111290757Smav	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
112290757Smav	    "gang_header"},
113290757Smav	{{fletcher_2_native,		fletcher_2_byteswap},
114290757Smav	    NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
115290757Smav	{{fletcher_2_native,		fletcher_2_byteswap},
116290757Smav	    NULL, NULL, 0, "fletcher2"},
117290757Smav	{{fletcher_4_native,		fletcher_4_byteswap},
118290757Smav	    NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"},
119290757Smav	{{zio_checksum_SHA256,		zio_checksum_SHA256},
120290757Smav	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
121290757Smav	    ZCHECKSUM_FLAG_NOPWRITE, "sha256"},
122290757Smav	{{fletcher_4_native,		fletcher_4_byteswap},
123290757Smav	    NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
124290757Smav	{{zio_checksum_off,		zio_checksum_off},
125290757Smav	    NULL, NULL, 0, "noparity"},
126290757Smav#ifdef illumos
127290757Smav	{{zio_checksum_SHA512_native,	zio_checksum_SHA512_byteswap},
128290757Smav	    NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
129290757Smav	    ZCHECKSUM_FLAG_NOPWRITE, "sha512"},
130290757Smav	{{zio_checksum_skein_native,	zio_checksum_skein_byteswap},
131290757Smav	    zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
132290757Smav	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
133290757Smav	    ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
134290757Smav	{{zio_checksum_edonr_native,	zio_checksum_edonr_byteswap},
135290757Smav	    zio_checksum_edonr_tmpl_init, zio_checksum_edonr_tmpl_free,
136290757Smav	    ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED |
137290757Smav	    ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
138290757Smav#endif
139168404Spjd};
140168404Spjd
141297114Smav/*
142297114Smav * The flag corresponding to the "verify" in dedup=[checksum,]verify
143297114Smav * must be cleared first, so callers should use ZIO_CHECKSUM_MASK.
144297114Smav */
145290757Smavspa_feature_t
146290757Smavzio_checksum_to_feature(enum zio_checksum cksum)
147290757Smav{
148290757Smav#ifdef illumos
149297114Smav	VERIFY((cksum & ~ZIO_CHECKSUM_MASK) == 0);
150297114Smav
151290757Smav	switch (cksum) {
152290757Smav	case ZIO_CHECKSUM_SHA512:
153290757Smav		return (SPA_FEATURE_SHA512);
154290757Smav	case ZIO_CHECKSUM_SKEIN:
155290757Smav		return (SPA_FEATURE_SKEIN);
156290757Smav	case ZIO_CHECKSUM_EDONR:
157290757Smav		return (SPA_FEATURE_EDONR);
158290757Smav	}
159290757Smav#endif
160290757Smav	return (SPA_FEATURE_NONE);
161290757Smav}
162290757Smav
163219089Spjdenum zio_checksum
164219089Spjdzio_checksum_select(enum zio_checksum child, enum zio_checksum parent)
165168404Spjd{
166168404Spjd	ASSERT(child < ZIO_CHECKSUM_FUNCTIONS);
167168404Spjd	ASSERT(parent < ZIO_CHECKSUM_FUNCTIONS);
168168404Spjd	ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON);
169168404Spjd
170168404Spjd	if (child == ZIO_CHECKSUM_INHERIT)
171168404Spjd		return (parent);
172168404Spjd
173168404Spjd	if (child == ZIO_CHECKSUM_ON)
174168404Spjd		return (ZIO_CHECKSUM_ON_VALUE);
175168404Spjd
176168404Spjd	return (child);
177168404Spjd}
178168404Spjd
179219089Spjdenum zio_checksum
180219089Spjdzio_checksum_dedup_select(spa_t *spa, enum zio_checksum child,
181219089Spjd    enum zio_checksum parent)
182219089Spjd{
183219089Spjd	ASSERT((child & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS);
184219089Spjd	ASSERT((parent & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS);
185219089Spjd	ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON);
186219089Spjd
187219089Spjd	if (child == ZIO_CHECKSUM_INHERIT)
188219089Spjd		return (parent);
189219089Spjd
190219089Spjd	if (child == ZIO_CHECKSUM_ON)
191219089Spjd		return (spa_dedup_checksum(spa));
192219089Spjd
193219089Spjd	if (child == (ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY))
194219089Spjd		return (spa_dedup_checksum(spa) | ZIO_CHECKSUM_VERIFY);
195219089Spjd
196290757Smav	ASSERT((zio_checksum_table[child & ZIO_CHECKSUM_MASK].ci_flags &
197290757Smav	    ZCHECKSUM_FLAG_DEDUP) ||
198219089Spjd	    (child & ZIO_CHECKSUM_VERIFY) || child == ZIO_CHECKSUM_OFF);
199219089Spjd
200219089Spjd	return (child);
201219089Spjd}
202219089Spjd
203168404Spjd/*
204185029Spjd * Set the external verifier for a gang block based on <vdev, offset, txg>,
205185029Spjd * a tuple which is guaranteed to be unique for the life of the pool.
206185029Spjd */
207185029Spjdstatic void
208185029Spjdzio_checksum_gang_verifier(zio_cksum_t *zcp, blkptr_t *bp)
209185029Spjd{
210185029Spjd	dva_t *dva = BP_IDENTITY(bp);
211219089Spjd	uint64_t txg = BP_PHYSICAL_BIRTH(bp);
212185029Spjd
213185029Spjd	ASSERT(BP_IS_GANG(bp));
214185029Spjd
215185029Spjd	ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0);
216185029Spjd}
217185029Spjd
218185029Spjd/*
219185029Spjd * Set the external verifier for a label block based on its offset.
220185029Spjd * The vdev is implicit, and the txg is unknowable at pool open time --
221185029Spjd * hence the logic in vdev_uberblock_load() to find the most recent copy.
222185029Spjd */
223185029Spjdstatic void
224185029Spjdzio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset)
225185029Spjd{
226185029Spjd	ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0);
227185029Spjd}
228185029Spjd
229185029Spjd/*
230290757Smav * Calls the template init function of a checksum which supports context
231290757Smav * templates and installs the template into the spa_t.
232290757Smav */
233290757Smavstatic void
234290757Smavzio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
235290757Smav{
236290757Smav	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
237290757Smav
238290757Smav	if (ci->ci_tmpl_init == NULL)
239290757Smav		return;
240290757Smav	if (spa->spa_cksum_tmpls[checksum] != NULL)
241290757Smav		return;
242290757Smav
243290757Smav	VERIFY(ci->ci_tmpl_free != NULL);
244290757Smav	mutex_enter(&spa->spa_cksum_tmpls_lock);
245290757Smav	if (spa->spa_cksum_tmpls[checksum] == NULL) {
246290757Smav		spa->spa_cksum_tmpls[checksum] =
247290757Smav		    ci->ci_tmpl_init(&spa->spa_cksum_salt);
248290757Smav		VERIFY(spa->spa_cksum_tmpls[checksum] != NULL);
249290757Smav	}
250290757Smav	mutex_exit(&spa->spa_cksum_tmpls_lock);
251290757Smav}
252290757Smav
253290757Smav/*
254168404Spjd * Generate the checksum.
255168404Spjd */
256168404Spjdvoid
257185029Spjdzio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
258290765Smav    void *data, uint64_t size)
259168404Spjd{
260185029Spjd	blkptr_t *bp = zio->io_bp;
261185029Spjd	uint64_t offset = zio->io_offset;
262168404Spjd	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
263219089Spjd	zio_cksum_t cksum;
264290757Smav	spa_t *spa = zio->io_spa;
265168404Spjd
266185029Spjd	ASSERT((uint_t)checksum < ZIO_CHECKSUM_FUNCTIONS);
267168404Spjd	ASSERT(ci->ci_func[0] != NULL);
268168404Spjd
269290757Smav	zio_checksum_template_init(checksum, spa);
270290757Smav
271290757Smav	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
272219089Spjd		zio_eck_t *eck;
273219089Spjd
274219089Spjd		if (checksum == ZIO_CHECKSUM_ZILOG2) {
275219089Spjd			zil_chain_t *zilc = data;
276219089Spjd
277219089Spjd			size = P2ROUNDUP_TYPED(zilc->zc_nused, ZIL_MIN_BLKSZ,
278219089Spjd			    uint64_t);
279219089Spjd			eck = &zilc->zc_eck;
280219089Spjd		} else {
281219089Spjd			eck = (zio_eck_t *)((char *)data + size) - 1;
282219089Spjd		}
283185029Spjd		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
284219089Spjd			zio_checksum_gang_verifier(&eck->zec_cksum, bp);
285185029Spjd		else if (checksum == ZIO_CHECKSUM_LABEL)
286219089Spjd			zio_checksum_label_verifier(&eck->zec_cksum, offset);
287185029Spjd		else
288219089Spjd			bp->blk_cksum = eck->zec_cksum;
289219089Spjd		eck->zec_magic = ZEC_MAGIC;
290290757Smav		ci->ci_func[0](data, size, spa->spa_cksum_tmpls[checksum],
291290757Smav		    &cksum);
292219089Spjd		eck->zec_cksum = cksum;
293168404Spjd	} else {
294290757Smav		ci->ci_func[0](data, size, spa->spa_cksum_tmpls[checksum],
295290757Smav		    &bp->blk_cksum);
296168404Spjd	}
297168404Spjd}
298168404Spjd
299168404Spjdint
300307266Smavzio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum,
301307266Smav    void *data, uint64_t size, uint64_t offset, zio_bad_cksum_t *info)
302168404Spjd{
303307266Smav	zio_checksum_info_t *ci = &zio_checksum_table[checksum];
304307266Smav	zio_cksum_t actual_cksum, expected_cksum;
305185029Spjd	int byteswap;
306168404Spjd
307168404Spjd	if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
308249195Smm		return (SET_ERROR(EINVAL));
309168404Spjd
310290757Smav	zio_checksum_template_init(checksum, spa);
311290757Smav
312290757Smav	if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
313219089Spjd		zio_eck_t *eck;
314307266Smav		zio_cksum_t verifier;
315219089Spjd
316219089Spjd		if (checksum == ZIO_CHECKSUM_ZILOG2) {
317219089Spjd			zil_chain_t *zilc = data;
318219089Spjd			uint64_t nused;
319219089Spjd
320219089Spjd			eck = &zilc->zc_eck;
321219089Spjd			if (eck->zec_magic == ZEC_MAGIC)
322219089Spjd				nused = zilc->zc_nused;
323219089Spjd			else if (eck->zec_magic == BSWAP_64(ZEC_MAGIC))
324219089Spjd				nused = BSWAP_64(zilc->zc_nused);
325219089Spjd			else
326249195Smm				return (SET_ERROR(ECKSUM));
327219089Spjd
328219089Spjd			if (nused > size)
329249195Smm				return (SET_ERROR(ECKSUM));
330219089Spjd
331219089Spjd			size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t);
332219089Spjd		} else {
333219089Spjd			eck = (zio_eck_t *)((char *)data + size) - 1;
334219089Spjd		}
335219089Spjd
336168404Spjd		if (checksum == ZIO_CHECKSUM_GANG_HEADER)
337185029Spjd			zio_checksum_gang_verifier(&verifier, bp);
338185029Spjd		else if (checksum == ZIO_CHECKSUM_LABEL)
339185029Spjd			zio_checksum_label_verifier(&verifier, offset);
340185029Spjd		else
341185029Spjd			verifier = bp->blk_cksum;
342168404Spjd
343219089Spjd		byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC));
344185029Spjd
345185029Spjd		if (byteswap)
346185029Spjd			byteswap_uint64_array(&verifier, sizeof (zio_cksum_t));
347185029Spjd
348219089Spjd		expected_cksum = eck->zec_cksum;
349219089Spjd		eck->zec_cksum = verifier;
350290757Smav		ci->ci_func[byteswap](data, size,
351290757Smav		    spa->spa_cksum_tmpls[checksum], &actual_cksum);
352219089Spjd		eck->zec_cksum = expected_cksum;
353185029Spjd
354307266Smav		if (byteswap) {
355168404Spjd			byteswap_uint64_array(&expected_cksum,
356168404Spjd			    sizeof (zio_cksum_t));
357307266Smav		}
358168404Spjd	} else {
359185029Spjd		byteswap = BP_SHOULD_BYTESWAP(bp);
360185029Spjd		expected_cksum = bp->blk_cksum;
361290757Smav		ci->ci_func[byteswap](data, size,
362290757Smav		    spa->spa_cksum_tmpls[checksum], &actual_cksum);
363168404Spjd	}
364168404Spjd
365307266Smav	if (info != NULL) {
366307266Smav		info->zbc_expected = expected_cksum;
367307266Smav		info->zbc_actual = actual_cksum;
368307266Smav		info->zbc_checksum_name = ci->ci_name;
369307266Smav		info->zbc_byteswapped = byteswap;
370307266Smav		info->zbc_injected = 0;
371307266Smav		info->zbc_has_cksum = 1;
372307266Smav	}
373219089Spjd
374185029Spjd	if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum))
375249195Smm		return (SET_ERROR(ECKSUM));
376168404Spjd
377307266Smav	return (0);
378307266Smav}
379307266Smav
380307266Smavint
381307266Smavzio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
382307266Smav{
383307266Smav	blkptr_t *bp = zio->io_bp;
384307266Smav	uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum :
385307266Smav	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
386307266Smav	int error;
387307266Smav	uint64_t size = (bp == NULL ? zio->io_size :
388307266Smav	    (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
389307266Smav	uint64_t offset = zio->io_offset;
390307266Smav	void *data = zio->io_data;
391307266Smav	spa_t *spa = zio->io_spa;
392307266Smav
393307266Smav	error = zio_checksum_error_impl(spa, bp, checksum, data, size,
394307266Smav	    offset, info);
395307266Smav	if (error != 0 && zio_injection_enabled && !zio->io_error &&
396219089Spjd	    (error = zio_handle_fault_injection(zio, ECKSUM)) != 0) {
397168404Spjd
398219089Spjd		info->zbc_injected = 1;
399219089Spjd		return (error);
400219089Spjd	}
401307266Smav	return (error);
402168404Spjd}
403290757Smav
404290757Smav/*
405290757Smav * Called by a spa_t that's about to be deallocated. This steps through
406290757Smav * all of the checksum context templates and deallocates any that were
407290757Smav * initialized using the algorithm-specific template init function.
408290757Smav */
409290757Smavvoid
410290757Smavzio_checksum_templates_free(spa_t *spa)
411290757Smav{
412290757Smav	for (enum zio_checksum checksum = 0;
413290757Smav	    checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
414290757Smav		if (spa->spa_cksum_tmpls[checksum] != NULL) {
415290757Smav			zio_checksum_info_t *ci = &zio_checksum_table[checksum];
416290757Smav
417290757Smav			VERIFY(ci->ci_tmpl_free != NULL);
418290757Smav			ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
419290757Smav			spa->spa_cksum_tmpls[checksum] = NULL;
420290757Smav		}
421290757Smav	}
422290757Smav}
423