1168404Spjd/*
2168404Spjd * CDDL HEADER START
3168404Spjd *
4168404Spjd * The contents of this file are subject to the terms of the
5168404Spjd * Common Development and Distribution License (the "License").
6168404Spjd * You may not use this file except in compliance with the License.
7168404Spjd *
8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9168404Spjd * or http://www.opensolaris.org/os/licensing.
10168404Spjd * See the License for the specific language governing permissions
11168404Spjd * and limitations under the License.
12168404Spjd *
13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each
14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15168404Spjd * If applicable, add the following below this CDDL HEADER, with the
16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying
17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
18168404Spjd *
19168404Spjd * CDDL HEADER END
20168404Spjd */
21168404Spjd
22168404Spjd/*
23219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24249643Smm * Copyright (c) 2013 by Delphix. All rights reserved.
25262089Savg * Copyright (c) 2013, Joyent, Inc. All rights reserved.
26168404Spjd */
27168404Spjd
28168404Spjd#include <sys/zfs_context.h>
29168404Spjd#include <sys/spa.h>
30168404Spjd#include <sys/vdev_impl.h>
31262089Savg#ifdef illumos
32262089Savg#include <sys/vdev_disk.h>
33262089Savg#endif
34262089Savg#include <sys/vdev_file.h>
35262089Savg#include <sys/vdev_raidz.h>
36168404Spjd#include <sys/zio.h>
37168404Spjd#include <sys/zio_checksum.h>
38168404Spjd#include <sys/fs/zfs.h>
39168404Spjd#include <sys/fm/fs/zfs.h>
40262089Savg#include <sys/bio.h>
41168404Spjd
42168404Spjd/*
43168404Spjd * Virtual device vector for RAID-Z.
44168404Spjd *
45219089Spjd * This vdev supports single, double, and triple parity. For single parity,
46219089Spjd * we use a simple XOR of all the data columns. For double or triple parity,
47219089Spjd * we use a special case of Reed-Solomon coding. This extends the
48219089Spjd * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
49219089Spjd * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
50219089Spjd * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
51219089Spjd * former is also based. The latter is designed to provide higher performance
52219089Spjd * for writes.
53168404Spjd *
54219089Spjd * Note that the Plank paper claimed to support arbitrary N+M, but was then
55219089Spjd * amended six years later identifying a critical flaw that invalidates its
56219089Spjd * claims. Nevertheless, the technique can be adapted to work for up to
57219089Spjd * triple parity. For additional parity, the amendment "Note: Correction to
58219089Spjd * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
59219089Spjd * is viable, but the additional complexity means that write performance will
60219089Spjd * suffer.
61219089Spjd *
62219089Spjd * All of the methods above operate on a Galois field, defined over the
63219089Spjd * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements
64219089Spjd * can be expressed with a single byte. Briefly, the operations on the
65219089Spjd * field are defined as follows:
66219089Spjd *
67168404Spjd *   o addition (+) is represented by a bitwise XOR
68168404Spjd *   o subtraction (-) is therefore identical to addition: A + B = A - B
69168404Spjd *   o multiplication of A by 2 is defined by the following bitwise expression:
70252751Sdelphij *
71168404Spjd *	(A * 2)_7 = A_6
72168404Spjd *	(A * 2)_6 = A_5
73168404Spjd *	(A * 2)_5 = A_4
74168404Spjd *	(A * 2)_4 = A_3 + A_7
75168404Spjd *	(A * 2)_3 = A_2 + A_7
76168404Spjd *	(A * 2)_2 = A_1 + A_7
77168404Spjd *	(A * 2)_1 = A_0
78168404Spjd *	(A * 2)_0 = A_7
79168404Spjd *
80168404Spjd * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
81219089Spjd * As an aside, this multiplication is derived from the error correcting
82219089Spjd * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
83168404Spjd *
84168404Spjd * Observe that any number in the field (except for 0) can be expressed as a
85168404Spjd * power of 2 -- a generator for the field. We store a table of the powers of
86168404Spjd * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
87168404Spjd * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
88219089Spjd * than field addition). The inverse of a field element A (A^-1) is therefore
89219089Spjd * A ^ (255 - 1) = A^254.
90168404Spjd *
91219089Spjd * The up-to-three parity columns, P, Q, R over several data columns,
92219089Spjd * D_0, ... D_n-1, can be expressed by field operations:
93168404Spjd *
94168404Spjd *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
95168404Spjd *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
96168404Spjd *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
97219089Spjd *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
98219089Spjd *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
99168404Spjd *
100219089Spjd * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival
101219089Spjd * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
102219089Spjd * independent coefficients. (There are no additional coefficients that have
103219089Spjd * this property which is why the uncorrected Plank method breaks down.)
104219089Spjd *
105219089Spjd * See the reconstruction code below for how P, Q and R can used individually
106219089Spjd * or in concert to recover missing data columns.
107168404Spjd */
108168404Spjd
109168404Spjdtypedef struct raidz_col {
110168404Spjd	uint64_t rc_devidx;		/* child device index for I/O */
111168404Spjd	uint64_t rc_offset;		/* device offset */
112168404Spjd	uint64_t rc_size;		/* I/O size */
113168404Spjd	void *rc_data;			/* I/O data */
114219089Spjd	void *rc_gdata;			/* used to store the "good" version */
115168404Spjd	int rc_error;			/* I/O error for this device */
116168404Spjd	uint8_t rc_tried;		/* Did we attempt this I/O column? */
117168404Spjd	uint8_t rc_skipped;		/* Did we skip this I/O column? */
118168404Spjd} raidz_col_t;
119168404Spjd
120168404Spjdtypedef struct raidz_map {
121219089Spjd	uint64_t rm_cols;		/* Regular column count */
122219089Spjd	uint64_t rm_scols;		/* Count including skipped columns */
123168404Spjd	uint64_t rm_bigcols;		/* Number of oversized columns */
124168404Spjd	uint64_t rm_asize;		/* Actual total I/O size */
125168404Spjd	uint64_t rm_missingdata;	/* Count of missing data devices */
126168404Spjd	uint64_t rm_missingparity;	/* Count of missing parity devices */
127168404Spjd	uint64_t rm_firstdatacol;	/* First data column/parity count */
128219089Spjd	uint64_t rm_nskip;		/* Skipped sectors for padding */
129252751Sdelphij	uint64_t rm_skipstart;		/* Column index of padding start */
130219089Spjd	void *rm_datacopy;		/* rm_asize-buffer of copied data */
131219089Spjd	uintptr_t rm_reports;		/* # of referencing checksum reports */
132219089Spjd	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
133219089Spjd	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
134168404Spjd	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
135168404Spjd} raidz_map_t;
136168404Spjd
137168404Spjd#define	VDEV_RAIDZ_P		0
138168404Spjd#define	VDEV_RAIDZ_Q		1
139219089Spjd#define	VDEV_RAIDZ_R		2
140168404Spjd
141219089Spjd#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
142219089Spjd#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
143168404Spjd
144219089Spjd/*
145219089Spjd * We provide a mechanism to perform the field multiplication operation on a
146219089Spjd * 64-bit value all at once rather than a byte at a time. This works by
147219089Spjd * creating a mask from the top bit in each byte and using that to
148219089Spjd * conditionally apply the XOR of 0x1d.
149219089Spjd */
150219089Spjd#define	VDEV_RAIDZ_64MUL_2(x, mask) \
151219089Spjd{ \
152219089Spjd	(mask) = (x) & 0x8080808080808080ULL; \
153219089Spjd	(mask) = ((mask) << 1) - ((mask) >> 7); \
154219089Spjd	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
155219089Spjd	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
156219089Spjd}
157168404Spjd
158219089Spjd#define	VDEV_RAIDZ_64MUL_4(x, mask) \
159219089Spjd{ \
160219089Spjd	VDEV_RAIDZ_64MUL_2((x), mask); \
161219089Spjd	VDEV_RAIDZ_64MUL_2((x), mask); \
162219089Spjd}
163219089Spjd
164262089Savg#define	VDEV_LABEL_OFFSET(x)	(x + VDEV_LABEL_START_SIZE)
165262089Savg
166168404Spjd/*
167219089Spjd * Force reconstruction to use the general purpose method.
168219089Spjd */
169219089Spjdint vdev_raidz_default_to_general;
170219089Spjd
171252751Sdelphij/* Powers of 2 in the Galois field defined above. */
172168404Spjdstatic const uint8_t vdev_raidz_pow2[256] = {
173168404Spjd	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
174168404Spjd	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
175168404Spjd	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
176168404Spjd	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
177168404Spjd	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
178168404Spjd	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
179168404Spjd	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
180168404Spjd	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
181168404Spjd	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
182168404Spjd	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
183168404Spjd	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
184168404Spjd	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
185168404Spjd	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
186168404Spjd	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
187168404Spjd	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
188168404Spjd	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
189168404Spjd	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
190168404Spjd	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
191168404Spjd	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
192168404Spjd	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
193168404Spjd	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
194168404Spjd	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
195168404Spjd	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
196168404Spjd	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
197168404Spjd	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
198168404Spjd	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
199168404Spjd	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
200168404Spjd	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
201168404Spjd	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
202168404Spjd	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
203168404Spjd	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
204168404Spjd	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
205168404Spjd};
206252751Sdelphij/* Logs of 2 in the Galois field defined above. */
207168404Spjdstatic const uint8_t vdev_raidz_log2[256] = {
208168404Spjd	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
209168404Spjd	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
210168404Spjd	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
211168404Spjd	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
212168404Spjd	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
213168404Spjd	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
214168404Spjd	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
215168404Spjd	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
216168404Spjd	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
217168404Spjd	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
218168404Spjd	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
219168404Spjd	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
220168404Spjd	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
221168404Spjd	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
222168404Spjd	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
223168404Spjd	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
224168404Spjd	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
225168404Spjd	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
226168404Spjd	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
227168404Spjd	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
228168404Spjd	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
229168404Spjd	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
230168404Spjd	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
231168404Spjd	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
232168404Spjd	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
233168404Spjd	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
234168404Spjd	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
235168404Spjd	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
236168404Spjd	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
237168404Spjd	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
238168404Spjd	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
239168404Spjd	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
240168404Spjd};
241168404Spjd
242219089Spjdstatic void vdev_raidz_generate_parity(raidz_map_t *rm);
243219089Spjd
244168404Spjd/*
245168404Spjd * Multiply a given number by 2 raised to the given power.
246168404Spjd */
247168404Spjdstatic uint8_t
248168404Spjdvdev_raidz_exp2(uint_t a, int exp)
249168404Spjd{
250168404Spjd	if (a == 0)
251168404Spjd		return (0);
252168404Spjd
253168404Spjd	ASSERT(exp >= 0);
254168404Spjd	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
255168404Spjd
256168404Spjd	exp += vdev_raidz_log2[a];
257168404Spjd	if (exp > 255)
258168404Spjd		exp -= 255;
259168404Spjd
260168404Spjd	return (vdev_raidz_pow2[exp]);
261168404Spjd}
262168404Spjd
263185029Spjdstatic void
264219089Spjdvdev_raidz_map_free(raidz_map_t *rm)
265185029Spjd{
266185029Spjd	int c;
267219089Spjd	size_t size;
268185029Spjd
269219089Spjd	for (c = 0; c < rm->rm_firstdatacol; c++) {
270251419Ssmh		if (rm->rm_col[c].rc_data != NULL)
271251419Ssmh			zio_buf_free(rm->rm_col[c].rc_data,
272251419Ssmh			    rm->rm_col[c].rc_size);
273185029Spjd
274219089Spjd		if (rm->rm_col[c].rc_gdata != NULL)
275219089Spjd			zio_buf_free(rm->rm_col[c].rc_gdata,
276219089Spjd			    rm->rm_col[c].rc_size);
277219089Spjd	}
278219089Spjd
279219089Spjd	size = 0;
280219089Spjd	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
281219089Spjd		size += rm->rm_col[c].rc_size;
282219089Spjd
283219089Spjd	if (rm->rm_datacopy != NULL)
284219089Spjd		zio_buf_free(rm->rm_datacopy, size);
285219089Spjd
286219089Spjd	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
287185029Spjd}
288185029Spjd
289219089Spjdstatic void
290219089Spjdvdev_raidz_map_free_vsd(zio_t *zio)
291219089Spjd{
292219089Spjd	raidz_map_t *rm = zio->io_vsd;
293219089Spjd
294243674Smm	ASSERT0(rm->rm_freed);
295219089Spjd	rm->rm_freed = 1;
296219089Spjd
297219089Spjd	if (rm->rm_reports == 0)
298219089Spjd		vdev_raidz_map_free(rm);
299219089Spjd}
300219089Spjd
301219089Spjd/*ARGSUSED*/
302219089Spjdstatic void
303219089Spjdvdev_raidz_cksum_free(void *arg, size_t ignored)
304219089Spjd{
305219089Spjd	raidz_map_t *rm = arg;
306219089Spjd
307219089Spjd	ASSERT3U(rm->rm_reports, >, 0);
308219089Spjd
309219089Spjd	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
310219089Spjd		vdev_raidz_map_free(rm);
311219089Spjd}
312219089Spjd
313219089Spjdstatic void
314219089Spjdvdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
315219089Spjd{
316219089Spjd	raidz_map_t *rm = zcr->zcr_cbdata;
317219089Spjd	size_t c = zcr->zcr_cbinfo;
318219089Spjd	size_t x;
319219089Spjd
320219089Spjd	const char *good = NULL;
321219089Spjd	const char *bad = rm->rm_col[c].rc_data;
322219089Spjd
323219089Spjd	if (good_data == NULL) {
324219089Spjd		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
325219089Spjd		return;
326219089Spjd	}
327219089Spjd
328219089Spjd	if (c < rm->rm_firstdatacol) {
329219089Spjd		/*
330219089Spjd		 * The first time through, calculate the parity blocks for
331219089Spjd		 * the good data (this relies on the fact that the good
332219089Spjd		 * data never changes for a given logical ZIO)
333219089Spjd		 */
334219089Spjd		if (rm->rm_col[0].rc_gdata == NULL) {
335219089Spjd			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
336219089Spjd			char *buf;
337219089Spjd
338219089Spjd			/*
339219089Spjd			 * Set up the rm_col[]s to generate the parity for
340219089Spjd			 * good_data, first saving the parity bufs and
341219089Spjd			 * replacing them with buffers to hold the result.
342219089Spjd			 */
343219089Spjd			for (x = 0; x < rm->rm_firstdatacol; x++) {
344219089Spjd				bad_parity[x] = rm->rm_col[x].rc_data;
345219089Spjd				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
346219089Spjd				    zio_buf_alloc(rm->rm_col[x].rc_size);
347219089Spjd			}
348219089Spjd
349219089Spjd			/* fill in the data columns from good_data */
350219089Spjd			buf = (char *)good_data;
351219089Spjd			for (; x < rm->rm_cols; x++) {
352219089Spjd				rm->rm_col[x].rc_data = buf;
353219089Spjd				buf += rm->rm_col[x].rc_size;
354219089Spjd			}
355219089Spjd
356219089Spjd			/*
357219089Spjd			 * Construct the parity from the good data.
358219089Spjd			 */
359219089Spjd			vdev_raidz_generate_parity(rm);
360219089Spjd
361219089Spjd			/* restore everything back to its original state */
362219089Spjd			for (x = 0; x < rm->rm_firstdatacol; x++)
363219089Spjd				rm->rm_col[x].rc_data = bad_parity[x];
364219089Spjd
365219089Spjd			buf = rm->rm_datacopy;
366219089Spjd			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
367219089Spjd				rm->rm_col[x].rc_data = buf;
368219089Spjd				buf += rm->rm_col[x].rc_size;
369219089Spjd			}
370219089Spjd		}
371219089Spjd
372219089Spjd		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
373219089Spjd		good = rm->rm_col[c].rc_gdata;
374219089Spjd	} else {
375219089Spjd		/* adjust good_data to point at the start of our column */
376219089Spjd		good = good_data;
377219089Spjd
378219089Spjd		for (x = rm->rm_firstdatacol; x < c; x++)
379219089Spjd			good += rm->rm_col[x].rc_size;
380219089Spjd	}
381219089Spjd
382219089Spjd	/* we drop the ereport if it ends up that the data was good */
383219089Spjd	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
384219089Spjd}
385219089Spjd
386219089Spjd/*
387219089Spjd * Invoked indirectly by zfs_ereport_start_checksum(), called
388219089Spjd * below when our read operation fails completely.  The main point
389219089Spjd * is to keep a copy of everything we read from disk, so that at
390219089Spjd * vdev_raidz_cksum_finish() time we can compare it with the good data.
391219089Spjd */
392219089Spjdstatic void
393219089Spjdvdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
394219089Spjd{
395219089Spjd	size_t c = (size_t)(uintptr_t)arg;
396219089Spjd	caddr_t buf;
397219089Spjd
398219089Spjd	raidz_map_t *rm = zio->io_vsd;
399219089Spjd	size_t size;
400219089Spjd
401219089Spjd	/* set up the report and bump the refcount  */
402219089Spjd	zcr->zcr_cbdata = rm;
403219089Spjd	zcr->zcr_cbinfo = c;
404219089Spjd	zcr->zcr_finish = vdev_raidz_cksum_finish;
405219089Spjd	zcr->zcr_free = vdev_raidz_cksum_free;
406219089Spjd
407219089Spjd	rm->rm_reports++;
408219089Spjd	ASSERT3U(rm->rm_reports, >, 0);
409219089Spjd
410219089Spjd	if (rm->rm_datacopy != NULL)
411219089Spjd		return;
412219089Spjd
413219089Spjd	/*
414219089Spjd	 * It's the first time we're called for this raidz_map_t, so we need
415219089Spjd	 * to copy the data aside; there's no guarantee that our zio's buffer
416219089Spjd	 * won't be re-used for something else.
417219089Spjd	 *
418219089Spjd	 * Our parity data is already in separate buffers, so there's no need
419219089Spjd	 * to copy them.
420219089Spjd	 */
421219089Spjd
422219089Spjd	size = 0;
423219089Spjd	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
424219089Spjd		size += rm->rm_col[c].rc_size;
425219089Spjd
426219089Spjd	buf = rm->rm_datacopy = zio_buf_alloc(size);
427219089Spjd
428219089Spjd	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
429219089Spjd		raidz_col_t *col = &rm->rm_col[c];
430219089Spjd
431219089Spjd		bcopy(col->rc_data, buf, col->rc_size);
432219089Spjd		col->rc_data = buf;
433219089Spjd
434219089Spjd		buf += col->rc_size;
435219089Spjd	}
436219089Spjd	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
437219089Spjd}
438219089Spjd
439219089Spjdstatic const zio_vsd_ops_t vdev_raidz_vsd_ops = {
440219089Spjd	vdev_raidz_map_free_vsd,
441219089Spjd	vdev_raidz_cksum_report
442219089Spjd};
443219089Spjd
444252749Sdelphij/*
445252749Sdelphij * Divides the IO evenly across all child vdevs; usually, dcols is
446252749Sdelphij * the number of children in the target vdev.
447252749Sdelphij */
448168404Spjdstatic raidz_map_t *
449262089Savgvdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, boolean_t dofree,
450262089Savg    uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
451168404Spjd{
452168404Spjd	raidz_map_t *rm;
453252749Sdelphij	/* The starting RAIDZ (parent) vdev sector of the block. */
454262089Savg	uint64_t b = offset >> unit_shift;
455252749Sdelphij	/* The zio's size in units of the vdev's minimum sector size. */
456262089Savg	uint64_t s = size >> unit_shift;
457252749Sdelphij	/* The first column for this stripe. */
458168404Spjd	uint64_t f = b % dcols;
459252749Sdelphij	/* The starting byte offset on each child vdev. */
460168404Spjd	uint64_t o = (b / dcols) << unit_shift;
461219089Spjd	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
462168404Spjd
463252749Sdelphij	/*
464252749Sdelphij	 * "Quotient": The number of data sectors for this stripe on all but
465252749Sdelphij	 * the "big column" child vdevs that also contain "remainder" data.
466252749Sdelphij	 */
467168404Spjd	q = s / (dcols - nparity);
468252749Sdelphij
469252749Sdelphij	/*
470252749Sdelphij	 * "Remainder": The number of partial stripe data sectors in this I/O.
471252749Sdelphij	 * This will add a sector to some, but not all, child vdevs.
472252749Sdelphij	 */
473168404Spjd	r = s - q * (dcols - nparity);
474252749Sdelphij
475252749Sdelphij	/* The number of "big columns" - those which contain remainder data. */
476168404Spjd	bc = (r == 0 ? 0 : r + nparity);
477252749Sdelphij
478252749Sdelphij	/*
479252749Sdelphij	 * The total number of data and parity sectors associated with
480252749Sdelphij	 * this I/O.
481252749Sdelphij	 */
482219089Spjd	tot = s + nparity * (q + (r == 0 ? 0 : 1));
483168404Spjd
484252749Sdelphij	/* acols: The columns that will be accessed. */
485252749Sdelphij	/* scols: The columns that will be accessed or skipped. */
486219089Spjd	if (q == 0) {
487252749Sdelphij		/* Our I/O request doesn't span all child vdevs. */
488219089Spjd		acols = bc;
489219089Spjd		scols = MIN(dcols, roundup(bc, nparity + 1));
490219089Spjd	} else {
491219089Spjd		acols = dcols;
492219089Spjd		scols = dcols;
493219089Spjd	}
494168404Spjd
495219089Spjd	ASSERT3U(acols, <=, scols);
496168404Spjd
497219089Spjd	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
498219089Spjd
499168404Spjd	rm->rm_cols = acols;
500219089Spjd	rm->rm_scols = scols;
501168404Spjd	rm->rm_bigcols = bc;
502219089Spjd	rm->rm_skipstart = bc;
503168404Spjd	rm->rm_missingdata = 0;
504168404Spjd	rm->rm_missingparity = 0;
505168404Spjd	rm->rm_firstdatacol = nparity;
506219089Spjd	rm->rm_datacopy = NULL;
507219089Spjd	rm->rm_reports = 0;
508219089Spjd	rm->rm_freed = 0;
509219089Spjd	rm->rm_ecksuminjected = 0;
510168404Spjd
511219089Spjd	asize = 0;
512219089Spjd
513219089Spjd	for (c = 0; c < scols; c++) {
514168404Spjd		col = f + c;
515168404Spjd		coff = o;
516168404Spjd		if (col >= dcols) {
517168404Spjd			col -= dcols;
518168404Spjd			coff += 1ULL << unit_shift;
519168404Spjd		}
520168404Spjd		rm->rm_col[c].rc_devidx = col;
521168404Spjd		rm->rm_col[c].rc_offset = coff;
522168404Spjd		rm->rm_col[c].rc_data = NULL;
523219089Spjd		rm->rm_col[c].rc_gdata = NULL;
524168404Spjd		rm->rm_col[c].rc_error = 0;
525168404Spjd		rm->rm_col[c].rc_tried = 0;
526168404Spjd		rm->rm_col[c].rc_skipped = 0;
527219089Spjd
528219089Spjd		if (c >= acols)
529219089Spjd			rm->rm_col[c].rc_size = 0;
530219089Spjd		else if (c < bc)
531219089Spjd			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
532219089Spjd		else
533219089Spjd			rm->rm_col[c].rc_size = q << unit_shift;
534219089Spjd
535219089Spjd		asize += rm->rm_col[c].rc_size;
536168404Spjd	}
537168404Spjd
538219089Spjd	ASSERT3U(asize, ==, tot << unit_shift);
539219089Spjd	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
540219089Spjd	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
541219089Spjd	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
542219089Spjd	ASSERT3U(rm->rm_nskip, <=, nparity);
543168404Spjd
544262089Savg	if (!dofree) {
545251419Ssmh		for (c = 0; c < rm->rm_firstdatacol; c++) {
546251419Ssmh			rm->rm_col[c].rc_data =
547251419Ssmh			    zio_buf_alloc(rm->rm_col[c].rc_size);
548251419Ssmh		}
549168404Spjd
550262089Savg		rm->rm_col[c].rc_data = data;
551168404Spjd
552251419Ssmh		for (c = c + 1; c < acols; c++) {
553251419Ssmh			rm->rm_col[c].rc_data =
554251419Ssmh			    (char *)rm->rm_col[c - 1].rc_data +
555251419Ssmh			    rm->rm_col[c - 1].rc_size;
556251419Ssmh		}
557251419Ssmh	}
558168404Spjd
559168404Spjd	/*
560168404Spjd	 * If all data stored spans all columns, there's a danger that parity
561168404Spjd	 * will always be on the same device and, since parity isn't read
562168404Spjd	 * during normal operation, that that device's I/O bandwidth won't be
563168404Spjd	 * used effectively. We therefore switch the parity every 1MB.
564168404Spjd	 *
565168404Spjd	 * ... at least that was, ostensibly, the theory. As a practical
566168404Spjd	 * matter unless we juggle the parity between all devices evenly, we
567168404Spjd	 * won't see any benefit. Further, occasional writes that aren't a
568168404Spjd	 * multiple of the LCM of the number of children and the minimum
569168404Spjd	 * stripe width are sufficient to avoid pessimal behavior.
570168404Spjd	 * Unfortunately, this decision created an implicit on-disk format
571168404Spjd	 * requirement that we need to support for all eternity, but only
572168404Spjd	 * for single-parity RAID-Z.
573219089Spjd	 *
574219089Spjd	 * If we intend to skip a sector in the zeroth column for padding
575219089Spjd	 * we must make sure to note this swap. We will never intend to
576219089Spjd	 * skip the first column since at least one data and one parity
577219089Spjd	 * column must appear in each row.
578168404Spjd	 */
579168404Spjd	ASSERT(rm->rm_cols >= 2);
580168404Spjd	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
581168404Spjd
582262089Savg	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
583168404Spjd		devidx = rm->rm_col[0].rc_devidx;
584168404Spjd		o = rm->rm_col[0].rc_offset;
585168404Spjd		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
586168404Spjd		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
587168404Spjd		rm->rm_col[1].rc_devidx = devidx;
588168404Spjd		rm->rm_col[1].rc_offset = o;
589219089Spjd
590219089Spjd		if (rm->rm_skipstart == 0)
591219089Spjd			rm->rm_skipstart = 1;
592168404Spjd	}
593168404Spjd
594168404Spjd	return (rm);
595168404Spjd}
596168404Spjd
597168404Spjdstatic void
598168404Spjdvdev_raidz_generate_parity_p(raidz_map_t *rm)
599168404Spjd{
600168404Spjd	uint64_t *p, *src, pcount, ccount, i;
601168404Spjd	int c;
602168404Spjd
603168404Spjd	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
604168404Spjd
605168404Spjd	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
606168404Spjd		src = rm->rm_col[c].rc_data;
607168404Spjd		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
608168404Spjd		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
609168404Spjd
610168404Spjd		if (c == rm->rm_firstdatacol) {
611168404Spjd			ASSERT(ccount == pcount);
612219089Spjd			for (i = 0; i < ccount; i++, src++, p++) {
613168404Spjd				*p = *src;
614168404Spjd			}
615168404Spjd		} else {
616168404Spjd			ASSERT(ccount <= pcount);
617219089Spjd			for (i = 0; i < ccount; i++, src++, p++) {
618168404Spjd				*p ^= *src;
619168404Spjd			}
620168404Spjd		}
621168404Spjd	}
622168404Spjd}
623168404Spjd
624168404Spjdstatic void
625168404Spjdvdev_raidz_generate_parity_pq(raidz_map_t *rm)
626168404Spjd{
627219089Spjd	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
628168404Spjd	int c;
629168404Spjd
630219089Spjd	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
631168404Spjd	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
632168404Spjd	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
633168404Spjd
634168404Spjd	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
635168404Spjd		src = rm->rm_col[c].rc_data;
636168404Spjd		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
637168404Spjd		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
638168404Spjd
639219089Spjd		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
640219089Spjd
641168404Spjd		if (c == rm->rm_firstdatacol) {
642219089Spjd			ASSERT(ccnt == pcnt || ccnt == 0);
643219089Spjd			for (i = 0; i < ccnt; i++, src++, p++, q++) {
644219089Spjd				*p = *src;
645168404Spjd				*q = *src;
646219089Spjd			}
647219089Spjd			for (; i < pcnt; i++, src++, p++, q++) {
648219089Spjd				*p = 0;
649219089Spjd				*q = 0;
650219089Spjd			}
651219089Spjd		} else {
652219089Spjd			ASSERT(ccnt <= pcnt);
653219089Spjd
654219089Spjd			/*
655219089Spjd			 * Apply the algorithm described above by multiplying
656219089Spjd			 * the previous result and adding in the new value.
657219089Spjd			 */
658219089Spjd			for (i = 0; i < ccnt; i++, src++, p++, q++) {
659219089Spjd				*p ^= *src;
660219089Spjd
661219089Spjd				VDEV_RAIDZ_64MUL_2(*q, mask);
662219089Spjd				*q ^= *src;
663219089Spjd			}
664219089Spjd
665219089Spjd			/*
666219089Spjd			 * Treat short columns as though they are full of 0s.
667219089Spjd			 * Note that there's therefore nothing needed for P.
668219089Spjd			 */
669219089Spjd			for (; i < pcnt; i++, q++) {
670219089Spjd				VDEV_RAIDZ_64MUL_2(*q, mask);
671219089Spjd			}
672219089Spjd		}
673219089Spjd	}
674219089Spjd}
675219089Spjd
676219089Spjdstatic void
677219089Spjdvdev_raidz_generate_parity_pqr(raidz_map_t *rm)
678219089Spjd{
679219089Spjd	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
680219089Spjd	int c;
681219089Spjd
682219089Spjd	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
683219089Spjd	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
684219089Spjd	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
685219089Spjd	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
686219089Spjd	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
687219089Spjd
688219089Spjd	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
689219089Spjd		src = rm->rm_col[c].rc_data;
690219089Spjd		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
691219089Spjd		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
692219089Spjd		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
693219089Spjd
694219089Spjd		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
695219089Spjd
696219089Spjd		if (c == rm->rm_firstdatacol) {
697219089Spjd			ASSERT(ccnt == pcnt || ccnt == 0);
698219089Spjd			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
699168404Spjd				*p = *src;
700219089Spjd				*q = *src;
701219089Spjd				*r = *src;
702168404Spjd			}
703219089Spjd			for (; i < pcnt; i++, src++, p++, q++, r++) {
704219089Spjd				*p = 0;
705168404Spjd				*q = 0;
706219089Spjd				*r = 0;
707168404Spjd			}
708168404Spjd		} else {
709219089Spjd			ASSERT(ccnt <= pcnt);
710168404Spjd
711168404Spjd			/*
712219089Spjd			 * Apply the algorithm described above by multiplying
713219089Spjd			 * the previous result and adding in the new value.
714168404Spjd			 */
715219089Spjd			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
716219089Spjd				*p ^= *src;
717219089Spjd
718219089Spjd				VDEV_RAIDZ_64MUL_2(*q, mask);
719168404Spjd				*q ^= *src;
720219089Spjd
721219089Spjd				VDEV_RAIDZ_64MUL_4(*r, mask);
722219089Spjd				*r ^= *src;
723168404Spjd			}
724168404Spjd
725168404Spjd			/*
726168404Spjd			 * Treat short columns as though they are full of 0s.
727219089Spjd			 * Note that there's therefore nothing needed for P.
728168404Spjd			 */
729219089Spjd			for (; i < pcnt; i++, q++, r++) {
730219089Spjd				VDEV_RAIDZ_64MUL_2(*q, mask);
731219089Spjd				VDEV_RAIDZ_64MUL_4(*r, mask);
732168404Spjd			}
733168404Spjd		}
734168404Spjd	}
735168404Spjd}
736168404Spjd
737219089Spjd/*
738219089Spjd * Generate RAID parity in the first virtual columns according to the number of
739219089Spjd * parity columns available.
740219089Spjd */
741168404Spjdstatic void
742219089Spjdvdev_raidz_generate_parity(raidz_map_t *rm)
743168404Spjd{
744219089Spjd	switch (rm->rm_firstdatacol) {
745219089Spjd	case 1:
746219089Spjd		vdev_raidz_generate_parity_p(rm);
747219089Spjd		break;
748219089Spjd	case 2:
749219089Spjd		vdev_raidz_generate_parity_pq(rm);
750219089Spjd		break;
751219089Spjd	case 3:
752219089Spjd		vdev_raidz_generate_parity_pqr(rm);
753219089Spjd		break;
754219089Spjd	default:
755219089Spjd		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
756219089Spjd	}
757219089Spjd}
758219089Spjd
759219089Spjdstatic int
760219089Spjdvdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
761219089Spjd{
762168404Spjd	uint64_t *dst, *src, xcount, ccount, count, i;
763219089Spjd	int x = tgts[0];
764168404Spjd	int c;
765168404Spjd
766219089Spjd	ASSERT(ntgts == 1);
767219089Spjd	ASSERT(x >= rm->rm_firstdatacol);
768219089Spjd	ASSERT(x < rm->rm_cols);
769219089Spjd
770168404Spjd	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
771168404Spjd	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
772168404Spjd	ASSERT(xcount > 0);
773168404Spjd
774168404Spjd	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
775168404Spjd	dst = rm->rm_col[x].rc_data;
776168404Spjd	for (i = 0; i < xcount; i++, dst++, src++) {
777168404Spjd		*dst = *src;
778168404Spjd	}
779168404Spjd
780168404Spjd	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
781168404Spjd		src = rm->rm_col[c].rc_data;
782168404Spjd		dst = rm->rm_col[x].rc_data;
783168404Spjd
784168404Spjd		if (c == x)
785168404Spjd			continue;
786168404Spjd
787168404Spjd		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
788168404Spjd		count = MIN(ccount, xcount);
789168404Spjd
790168404Spjd		for (i = 0; i < count; i++, dst++, src++) {
791168404Spjd			*dst ^= *src;
792168404Spjd		}
793168404Spjd	}
794219089Spjd
795219089Spjd	return (1 << VDEV_RAIDZ_P);
796168404Spjd}
797168404Spjd
798219089Spjdstatic int
799219089Spjdvdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
800168404Spjd{
801168404Spjd	uint64_t *dst, *src, xcount, ccount, count, mask, i;
802168404Spjd	uint8_t *b;
803219089Spjd	int x = tgts[0];
804168404Spjd	int c, j, exp;
805168404Spjd
806219089Spjd	ASSERT(ntgts == 1);
807219089Spjd
808168404Spjd	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
809168404Spjd	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
810168404Spjd
811168404Spjd	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
812168404Spjd		src = rm->rm_col[c].rc_data;
813168404Spjd		dst = rm->rm_col[x].rc_data;
814168404Spjd
815168404Spjd		if (c == x)
816168404Spjd			ccount = 0;
817168404Spjd		else
818168404Spjd			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
819168404Spjd
820168404Spjd		count = MIN(ccount, xcount);
821168404Spjd
822168404Spjd		if (c == rm->rm_firstdatacol) {
823168404Spjd			for (i = 0; i < count; i++, dst++, src++) {
824168404Spjd				*dst = *src;
825168404Spjd			}
826168404Spjd			for (; i < xcount; i++, dst++) {
827168404Spjd				*dst = 0;
828168404Spjd			}
829168404Spjd
830168404Spjd		} else {
831168404Spjd			for (i = 0; i < count; i++, dst++, src++) {
832219089Spjd				VDEV_RAIDZ_64MUL_2(*dst, mask);
833168404Spjd				*dst ^= *src;
834168404Spjd			}
835168404Spjd
836168404Spjd			for (; i < xcount; i++, dst++) {
837219089Spjd				VDEV_RAIDZ_64MUL_2(*dst, mask);
838168404Spjd			}
839168404Spjd		}
840168404Spjd	}
841168404Spjd
842168404Spjd	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
843168404Spjd	dst = rm->rm_col[x].rc_data;
844168404Spjd	exp = 255 - (rm->rm_cols - 1 - x);
845168404Spjd
846168404Spjd	for (i = 0; i < xcount; i++, dst++, src++) {
847168404Spjd		*dst ^= *src;
848168404Spjd		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
849168404Spjd			*b = vdev_raidz_exp2(*b, exp);
850168404Spjd		}
851168404Spjd	}
852219089Spjd
853219089Spjd	return (1 << VDEV_RAIDZ_Q);
854168404Spjd}
855168404Spjd
856219089Spjdstatic int
857219089Spjdvdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
858168404Spjd{
859168404Spjd	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
860168404Spjd	void *pdata, *qdata;
861168404Spjd	uint64_t xsize, ysize, i;
862219089Spjd	int x = tgts[0];
863219089Spjd	int y = tgts[1];
864168404Spjd
865219089Spjd	ASSERT(ntgts == 2);
866168404Spjd	ASSERT(x < y);
867168404Spjd	ASSERT(x >= rm->rm_firstdatacol);
868168404Spjd	ASSERT(y < rm->rm_cols);
869168404Spjd
870168404Spjd	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
871168404Spjd
872168404Spjd	/*
873168404Spjd	 * Move the parity data aside -- we're going to compute parity as
874168404Spjd	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
875168404Spjd	 * reuse the parity generation mechanism without trashing the actual
876168404Spjd	 * parity so we make those columns appear to be full of zeros by
877168404Spjd	 * setting their lengths to zero.
878168404Spjd	 */
879168404Spjd	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
880168404Spjd	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
881168404Spjd	xsize = rm->rm_col[x].rc_size;
882168404Spjd	ysize = rm->rm_col[y].rc_size;
883168404Spjd
884168404Spjd	rm->rm_col[VDEV_RAIDZ_P].rc_data =
885168404Spjd	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
886168404Spjd	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
887168404Spjd	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
888168404Spjd	rm->rm_col[x].rc_size = 0;
889168404Spjd	rm->rm_col[y].rc_size = 0;
890168404Spjd
891168404Spjd	vdev_raidz_generate_parity_pq(rm);
892168404Spjd
893168404Spjd	rm->rm_col[x].rc_size = xsize;
894168404Spjd	rm->rm_col[y].rc_size = ysize;
895168404Spjd
896168404Spjd	p = pdata;
897168404Spjd	q = qdata;
898168404Spjd	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
899168404Spjd	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
900168404Spjd	xd = rm->rm_col[x].rc_data;
901168404Spjd	yd = rm->rm_col[y].rc_data;
902168404Spjd
903168404Spjd	/*
904168404Spjd	 * We now have:
905168404Spjd	 *	Pxy = P + D_x + D_y
906168404Spjd	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
907168404Spjd	 *
908168404Spjd	 * We can then solve for D_x:
909168404Spjd	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
910168404Spjd	 * where
911168404Spjd	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
912168404Spjd	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
913168404Spjd	 *
914168404Spjd	 * With D_x in hand, we can easily solve for D_y:
915168404Spjd	 *	D_y = P + Pxy + D_x
916168404Spjd	 */
917168404Spjd
918168404Spjd	a = vdev_raidz_pow2[255 + x - y];
919168404Spjd	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
920168404Spjd	tmp = 255 - vdev_raidz_log2[a ^ 1];
921168404Spjd
922168404Spjd	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
923168404Spjd	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
924168404Spjd
925168404Spjd	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
926168404Spjd		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
927168404Spjd		    vdev_raidz_exp2(*q ^ *qxy, bexp);
928168404Spjd
929168404Spjd		if (i < ysize)
930168404Spjd			*yd = *p ^ *pxy ^ *xd;
931168404Spjd	}
932168404Spjd
933168404Spjd	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
934168404Spjd	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
935168404Spjd	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
936168404Spjd	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
937168404Spjd
938168404Spjd	/*
939168404Spjd	 * Restore the saved parity data.
940168404Spjd	 */
941168404Spjd	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
942168404Spjd	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
943219089Spjd
944219089Spjd	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
945168404Spjd}
946168404Spjd
947219089Spjd/* BEGIN CSTYLED */
948219089Spjd/*
949219089Spjd * In the general case of reconstruction, we must solve the system of linear
950219089Spjd * equations defined by the coeffecients used to generate parity as well as
951219089Spjd * the contents of the data and parity disks. This can be expressed with
952219089Spjd * vectors for the original data (D) and the actual data (d) and parity (p)
953219089Spjd * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
954219089Spjd *
955219089Spjd *            __   __                     __     __
956219089Spjd *            |     |         __     __   |  p_0  |
957219089Spjd *            |  V  |         |  D_0  |   | p_m-1 |
958219089Spjd *            |     |    x    |   :   | = |  d_0  |
959219089Spjd *            |  I  |         | D_n-1 |   |   :   |
960219089Spjd *            |     |         ~~     ~~   | d_n-1 |
961219089Spjd *            ~~   ~~                     ~~     ~~
962219089Spjd *
963219089Spjd * I is simply a square identity matrix of size n, and V is a vandermonde
964219089Spjd * matrix defined by the coeffecients we chose for the various parity columns
965219089Spjd * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
966219089Spjd * computation as well as linear separability.
967219089Spjd *
968219089Spjd *      __               __               __     __
969219089Spjd *      |   1   ..  1 1 1 |               |  p_0  |
970219089Spjd *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
971219089Spjd *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
972219089Spjd *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
973219089Spjd *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
974219089Spjd *      |   :       : : : |   |   :   |   |  d_2  |
975219089Spjd *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
976219089Spjd *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
977219089Spjd *      |   0   ..  0 0 1 |               | d_n-1 |
978219089Spjd *      ~~               ~~               ~~     ~~
979219089Spjd *
980219089Spjd * Note that I, V, d, and p are known. To compute D, we must invert the
981219089Spjd * matrix and use the known data and parity values to reconstruct the unknown
982219089Spjd * data values. We begin by removing the rows in V|I and d|p that correspond
983219089Spjd * to failed or missing columns; we then make V|I square (n x n) and d|p
984219089Spjd * sized n by removing rows corresponding to unused parity from the bottom up
985219089Spjd * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
986219089Spjd * using Gauss-Jordan elimination. In the example below we use m=3 parity
987219089Spjd * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
988219089Spjd *           __                               __
989219089Spjd *           |  1   1   1   1   1   1   1   1  |
990219089Spjd *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
991219089Spjd *           |  19 205 116  29  64  16  4   1  |      / /
992219089Spjd *           |  1   0   0   0   0   0   0   0  |     / /
993219089Spjd *           |  0   1   0   0   0   0   0   0  | <--' /
994219089Spjd *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
995219089Spjd *           |  0   0   0   1   0   0   0   0  |
996219089Spjd *           |  0   0   0   0   1   0   0   0  |
997219089Spjd *           |  0   0   0   0   0   1   0   0  |
998219089Spjd *           |  0   0   0   0   0   0   1   0  |
999219089Spjd *           |  0   0   0   0   0   0   0   1  |
1000219089Spjd *           ~~                               ~~
1001219089Spjd *           __                               __
1002219089Spjd *           |  1   1   1   1   1   1   1   1  |
1003219089Spjd *           |  19 205 116  29  64  16  4   1  |
1004219089Spjd *           |  1   0   0   0   0   0   0   0  |
1005262089Savg *  (V|I)' = |  0   0   0   1   0   0   0   0  |
1006219089Spjd *           |  0   0   0   0   1   0   0   0  |
1007219089Spjd *           |  0   0   0   0   0   1   0   0  |
1008219089Spjd *           |  0   0   0   0   0   0   1   0  |
1009219089Spjd *           |  0   0   0   0   0   0   0   1  |
1010219089Spjd *           ~~                               ~~
1011219089Spjd *
1012219089Spjd * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1013219089Spjd * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1014219089Spjd * matrix is not singular.
1015219089Spjd * __                                                                 __
1016219089Spjd * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1017219089Spjd * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1018219089Spjd * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1019219089Spjd * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1020219089Spjd * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1021219089Spjd * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1022219089Spjd * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1023219089Spjd * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1024219089Spjd * ~~                                                                 ~~
1025219089Spjd * __                                                                 __
1026219089Spjd * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1027219089Spjd * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1028219089Spjd * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1029219089Spjd * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1030219089Spjd * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1031219089Spjd * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1032219089Spjd * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1033219089Spjd * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1034219089Spjd * ~~                                                                 ~~
1035219089Spjd * __                                                                 __
1036219089Spjd * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1037219089Spjd * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1038219089Spjd * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1039219089Spjd * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1040219089Spjd * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1041219089Spjd * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1042219089Spjd * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1043219089Spjd * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1044219089Spjd * ~~                                                                 ~~
1045219089Spjd * __                                                                 __
1046219089Spjd * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1047219089Spjd * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1048219089Spjd * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1049219089Spjd * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1050219089Spjd * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1051219089Spjd * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1052219089Spjd * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1053219089Spjd * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1054219089Spjd * ~~                                                                 ~~
1055219089Spjd * __                                                                 __
1056219089Spjd * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1057219089Spjd * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1058219089Spjd * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1059219089Spjd * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1060219089Spjd * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1061219089Spjd * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1062219089Spjd * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1063219089Spjd * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1064219089Spjd * ~~                                                                 ~~
1065219089Spjd * __                                                                 __
1066219089Spjd * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1067219089Spjd * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1068219089Spjd * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1069219089Spjd * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1070219089Spjd * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1071219089Spjd * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1072219089Spjd * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1073219089Spjd * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1074219089Spjd * ~~                                                                 ~~
1075219089Spjd *                   __                               __
1076219089Spjd *                   |  0   0   1   0   0   0   0   0  |
1077219089Spjd *                   | 167 100  5   41 159 169 217 208 |
1078219089Spjd *                   | 166 100  4   40 158 168 216 209 |
1079219089Spjd *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1080219089Spjd *                   |  0   0   0   0   1   0   0   0  |
1081219089Spjd *                   |  0   0   0   0   0   1   0   0  |
1082219089Spjd *                   |  0   0   0   0   0   0   1   0  |
1083219089Spjd *                   |  0   0   0   0   0   0   0   1  |
1084219089Spjd *                   ~~                               ~~
1085219089Spjd *
1086219089Spjd * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1087219089Spjd * of the missing data.
1088219089Spjd *
1089219089Spjd * As is apparent from the example above, the only non-trivial rows in the
1090219089Spjd * inverse matrix correspond to the data disks that we're trying to
1091219089Spjd * reconstruct. Indeed, those are the only rows we need as the others would
1092219089Spjd * only be useful for reconstructing data known or assumed to be valid. For
1093219089Spjd * that reason, we only build the coefficients in the rows that correspond to
1094219089Spjd * targeted columns.
1095219089Spjd */
1096219089Spjd/* END CSTYLED */
1097168404Spjd
1098219089Spjdstatic void
1099219089Spjdvdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1100219089Spjd    uint8_t **rows)
1101219089Spjd{
1102219089Spjd	int i, j;
1103219089Spjd	int pow;
1104219089Spjd
1105219089Spjd	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1106219089Spjd
1107219089Spjd	/*
1108219089Spjd	 * Fill in the missing rows of interest.
1109219089Spjd	 */
1110219089Spjd	for (i = 0; i < nmap; i++) {
1111219089Spjd		ASSERT3S(0, <=, map[i]);
1112219089Spjd		ASSERT3S(map[i], <=, 2);
1113219089Spjd
1114219089Spjd		pow = map[i] * n;
1115219089Spjd		if (pow > 255)
1116219089Spjd			pow -= 255;
1117219089Spjd		ASSERT(pow <= 255);
1118219089Spjd
1119219089Spjd		for (j = 0; j < n; j++) {
1120219089Spjd			pow -= map[i];
1121219089Spjd			if (pow < 0)
1122219089Spjd				pow += 255;
1123219089Spjd			rows[i][j] = vdev_raidz_pow2[pow];
1124219089Spjd		}
1125219089Spjd	}
1126219089Spjd}
1127219089Spjd
1128219089Spjdstatic void
1129219089Spjdvdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1130219089Spjd    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1131219089Spjd{
1132219089Spjd	int i, j, ii, jj;
1133219089Spjd	uint8_t log;
1134219089Spjd
1135219089Spjd	/*
1136219089Spjd	 * Assert that the first nmissing entries from the array of used
1137219089Spjd	 * columns correspond to parity columns and that subsequent entries
1138219089Spjd	 * correspond to data columns.
1139219089Spjd	 */
1140219089Spjd	for (i = 0; i < nmissing; i++) {
1141219089Spjd		ASSERT3S(used[i], <, rm->rm_firstdatacol);
1142219089Spjd	}
1143219089Spjd	for (; i < n; i++) {
1144219089Spjd		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1145219089Spjd	}
1146219089Spjd
1147219089Spjd	/*
1148219089Spjd	 * First initialize the storage where we'll compute the inverse rows.
1149219089Spjd	 */
1150219089Spjd	for (i = 0; i < nmissing; i++) {
1151219089Spjd		for (j = 0; j < n; j++) {
1152219089Spjd			invrows[i][j] = (i == j) ? 1 : 0;
1153219089Spjd		}
1154219089Spjd	}
1155219089Spjd
1156219089Spjd	/*
1157219089Spjd	 * Subtract all trivial rows from the rows of consequence.
1158219089Spjd	 */
1159219089Spjd	for (i = 0; i < nmissing; i++) {
1160219089Spjd		for (j = nmissing; j < n; j++) {
1161219089Spjd			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1162219089Spjd			jj = used[j] - rm->rm_firstdatacol;
1163219089Spjd			ASSERT3S(jj, <, n);
1164219089Spjd			invrows[i][j] = rows[i][jj];
1165219089Spjd			rows[i][jj] = 0;
1166219089Spjd		}
1167219089Spjd	}
1168219089Spjd
1169219089Spjd	/*
1170219089Spjd	 * For each of the rows of interest, we must normalize it and subtract
1171219089Spjd	 * a multiple of it from the other rows.
1172219089Spjd	 */
1173219089Spjd	for (i = 0; i < nmissing; i++) {
1174219089Spjd		for (j = 0; j < missing[i]; j++) {
1175243674Smm			ASSERT0(rows[i][j]);
1176219089Spjd		}
1177219089Spjd		ASSERT3U(rows[i][missing[i]], !=, 0);
1178219089Spjd
1179219089Spjd		/*
1180219089Spjd		 * Compute the inverse of the first element and multiply each
1181219089Spjd		 * element in the row by that value.
1182219089Spjd		 */
1183219089Spjd		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1184219089Spjd
1185219089Spjd		for (j = 0; j < n; j++) {
1186219089Spjd			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1187219089Spjd			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1188219089Spjd		}
1189219089Spjd
1190219089Spjd		for (ii = 0; ii < nmissing; ii++) {
1191219089Spjd			if (i == ii)
1192219089Spjd				continue;
1193219089Spjd
1194219089Spjd			ASSERT3U(rows[ii][missing[i]], !=, 0);
1195219089Spjd
1196219089Spjd			log = vdev_raidz_log2[rows[ii][missing[i]]];
1197219089Spjd
1198219089Spjd			for (j = 0; j < n; j++) {
1199219089Spjd				rows[ii][j] ^=
1200219089Spjd				    vdev_raidz_exp2(rows[i][j], log);
1201219089Spjd				invrows[ii][j] ^=
1202219089Spjd				    vdev_raidz_exp2(invrows[i][j], log);
1203219089Spjd			}
1204219089Spjd		}
1205219089Spjd	}
1206219089Spjd
1207219089Spjd	/*
1208219089Spjd	 * Verify that the data that is left in the rows are properly part of
1209219089Spjd	 * an identity matrix.
1210219089Spjd	 */
1211219089Spjd	for (i = 0; i < nmissing; i++) {
1212219089Spjd		for (j = 0; j < n; j++) {
1213219089Spjd			if (j == missing[i]) {
1214219089Spjd				ASSERT3U(rows[i][j], ==, 1);
1215219089Spjd			} else {
1216243674Smm				ASSERT0(rows[i][j]);
1217219089Spjd			}
1218219089Spjd		}
1219219089Spjd	}
1220219089Spjd}
1221219089Spjd
1222219089Spjdstatic void
1223219089Spjdvdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1224219089Spjd    int *missing, uint8_t **invrows, const uint8_t *used)
1225219089Spjd{
1226219089Spjd	int i, j, x, cc, c;
1227219089Spjd	uint8_t *src;
1228219089Spjd	uint64_t ccount;
1229219089Spjd	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1230219089Spjd	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1231248369Smm	uint8_t log = 0;
1232248369Smm	uint8_t val;
1233219089Spjd	int ll;
1234219089Spjd	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1235219089Spjd	uint8_t *p, *pp;
1236219089Spjd	size_t psize;
1237219089Spjd
1238219089Spjd	psize = sizeof (invlog[0][0]) * n * nmissing;
1239219089Spjd	p = kmem_alloc(psize, KM_SLEEP);
1240219089Spjd
1241219089Spjd	for (pp = p, i = 0; i < nmissing; i++) {
1242219089Spjd		invlog[i] = pp;
1243219089Spjd		pp += n;
1244219089Spjd	}
1245219089Spjd
1246219089Spjd	for (i = 0; i < nmissing; i++) {
1247219089Spjd		for (j = 0; j < n; j++) {
1248219089Spjd			ASSERT3U(invrows[i][j], !=, 0);
1249219089Spjd			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1250219089Spjd		}
1251219089Spjd	}
1252219089Spjd
1253219089Spjd	for (i = 0; i < n; i++) {
1254219089Spjd		c = used[i];
1255219089Spjd		ASSERT3U(c, <, rm->rm_cols);
1256219089Spjd
1257219089Spjd		src = rm->rm_col[c].rc_data;
1258219089Spjd		ccount = rm->rm_col[c].rc_size;
1259219089Spjd		for (j = 0; j < nmissing; j++) {
1260219089Spjd			cc = missing[j] + rm->rm_firstdatacol;
1261219089Spjd			ASSERT3U(cc, >=, rm->rm_firstdatacol);
1262219089Spjd			ASSERT3U(cc, <, rm->rm_cols);
1263219089Spjd			ASSERT3U(cc, !=, c);
1264219089Spjd
1265219089Spjd			dst[j] = rm->rm_col[cc].rc_data;
1266219089Spjd			dcount[j] = rm->rm_col[cc].rc_size;
1267219089Spjd		}
1268219089Spjd
1269219089Spjd		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1270219089Spjd
1271219089Spjd		for (x = 0; x < ccount; x++, src++) {
1272219089Spjd			if (*src != 0)
1273219089Spjd				log = vdev_raidz_log2[*src];
1274219089Spjd
1275219089Spjd			for (cc = 0; cc < nmissing; cc++) {
1276219089Spjd				if (x >= dcount[cc])
1277219089Spjd					continue;
1278219089Spjd
1279219089Spjd				if (*src == 0) {
1280219089Spjd					val = 0;
1281219089Spjd				} else {
1282219089Spjd					if ((ll = log + invlog[cc][i]) >= 255)
1283219089Spjd						ll -= 255;
1284219089Spjd					val = vdev_raidz_pow2[ll];
1285219089Spjd				}
1286219089Spjd
1287219089Spjd				if (i == 0)
1288219089Spjd					dst[cc][x] = val;
1289219089Spjd				else
1290219089Spjd					dst[cc][x] ^= val;
1291219089Spjd			}
1292219089Spjd		}
1293219089Spjd	}
1294219089Spjd
1295219089Spjd	kmem_free(p, psize);
1296219089Spjd}
1297219089Spjd
1298168404Spjdstatic int
1299219089Spjdvdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1300219089Spjd{
1301219089Spjd	int n, i, c, t, tt;
1302219089Spjd	int nmissing_rows;
1303219089Spjd	int missing_rows[VDEV_RAIDZ_MAXPARITY];
1304219089Spjd	int parity_map[VDEV_RAIDZ_MAXPARITY];
1305219089Spjd
1306219089Spjd	uint8_t *p, *pp;
1307219089Spjd	size_t psize;
1308219089Spjd
1309219089Spjd	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1310219089Spjd	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1311219089Spjd	uint8_t *used;
1312219089Spjd
1313219089Spjd	int code = 0;
1314219089Spjd
1315219089Spjd
1316219089Spjd	n = rm->rm_cols - rm->rm_firstdatacol;
1317219089Spjd
1318219089Spjd	/*
1319219089Spjd	 * Figure out which data columns are missing.
1320219089Spjd	 */
1321219089Spjd	nmissing_rows = 0;
1322219089Spjd	for (t = 0; t < ntgts; t++) {
1323219089Spjd		if (tgts[t] >= rm->rm_firstdatacol) {
1324219089Spjd			missing_rows[nmissing_rows++] =
1325219089Spjd			    tgts[t] - rm->rm_firstdatacol;
1326219089Spjd		}
1327219089Spjd	}
1328219089Spjd
1329219089Spjd	/*
1330219089Spjd	 * Figure out which parity columns to use to help generate the missing
1331219089Spjd	 * data columns.
1332219089Spjd	 */
1333219089Spjd	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1334219089Spjd		ASSERT(tt < ntgts);
1335219089Spjd		ASSERT(c < rm->rm_firstdatacol);
1336219089Spjd
1337219089Spjd		/*
1338219089Spjd		 * Skip any targeted parity columns.
1339219089Spjd		 */
1340219089Spjd		if (c == tgts[tt]) {
1341219089Spjd			tt++;
1342219089Spjd			continue;
1343219089Spjd		}
1344219089Spjd
1345219089Spjd		code |= 1 << c;
1346219089Spjd
1347219089Spjd		parity_map[i] = c;
1348219089Spjd		i++;
1349219089Spjd	}
1350219089Spjd
1351219089Spjd	ASSERT(code != 0);
1352219089Spjd	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1353219089Spjd
1354219089Spjd	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1355219089Spjd	    nmissing_rows * n + sizeof (used[0]) * n;
1356219089Spjd	p = kmem_alloc(psize, KM_SLEEP);
1357219089Spjd
1358219089Spjd	for (pp = p, i = 0; i < nmissing_rows; i++) {
1359219089Spjd		rows[i] = pp;
1360219089Spjd		pp += n;
1361219089Spjd		invrows[i] = pp;
1362219089Spjd		pp += n;
1363219089Spjd	}
1364219089Spjd	used = pp;
1365219089Spjd
1366219089Spjd	for (i = 0; i < nmissing_rows; i++) {
1367219089Spjd		used[i] = parity_map[i];
1368219089Spjd	}
1369219089Spjd
1370219089Spjd	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1371219089Spjd		if (tt < nmissing_rows &&
1372219089Spjd		    c == missing_rows[tt] + rm->rm_firstdatacol) {
1373219089Spjd			tt++;
1374219089Spjd			continue;
1375219089Spjd		}
1376219089Spjd
1377219089Spjd		ASSERT3S(i, <, n);
1378219089Spjd		used[i] = c;
1379219089Spjd		i++;
1380219089Spjd	}
1381219089Spjd
1382219089Spjd	/*
1383219089Spjd	 * Initialize the interesting rows of the matrix.
1384219089Spjd	 */
1385219089Spjd	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1386219089Spjd
1387219089Spjd	/*
1388219089Spjd	 * Invert the matrix.
1389219089Spjd	 */
1390219089Spjd	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1391219089Spjd	    invrows, used);
1392219089Spjd
1393219089Spjd	/*
1394219089Spjd	 * Reconstruct the missing data using the generated matrix.
1395219089Spjd	 */
1396219089Spjd	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1397219089Spjd	    invrows, used);
1398219089Spjd
1399219089Spjd	kmem_free(p, psize);
1400219089Spjd
1401219089Spjd	return (code);
1402219089Spjd}
1403219089Spjd
1404219089Spjdstatic int
1405219089Spjdvdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1406219089Spjd{
1407219089Spjd	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1408219089Spjd	int ntgts;
1409219089Spjd	int i, c;
1410219089Spjd	int code;
1411219089Spjd	int nbadparity, nbaddata;
1412219089Spjd	int parity_valid[VDEV_RAIDZ_MAXPARITY];
1413219089Spjd
1414219089Spjd	/*
1415219089Spjd	 * The tgts list must already be sorted.
1416219089Spjd	 */
1417219089Spjd	for (i = 1; i < nt; i++) {
1418219089Spjd		ASSERT(t[i] > t[i - 1]);
1419219089Spjd	}
1420219089Spjd
1421219089Spjd	nbadparity = rm->rm_firstdatacol;
1422219089Spjd	nbaddata = rm->rm_cols - nbadparity;
1423219089Spjd	ntgts = 0;
1424219089Spjd	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1425219089Spjd		if (c < rm->rm_firstdatacol)
1426219089Spjd			parity_valid[c] = B_FALSE;
1427219089Spjd
1428219089Spjd		if (i < nt && c == t[i]) {
1429219089Spjd			tgts[ntgts++] = c;
1430219089Spjd			i++;
1431219089Spjd		} else if (rm->rm_col[c].rc_error != 0) {
1432219089Spjd			tgts[ntgts++] = c;
1433219089Spjd		} else if (c >= rm->rm_firstdatacol) {
1434219089Spjd			nbaddata--;
1435219089Spjd		} else {
1436219089Spjd			parity_valid[c] = B_TRUE;
1437219089Spjd			nbadparity--;
1438219089Spjd		}
1439219089Spjd	}
1440219089Spjd
1441219089Spjd	ASSERT(ntgts >= nt);
1442219089Spjd	ASSERT(nbaddata >= 0);
1443219089Spjd	ASSERT(nbaddata + nbadparity == ntgts);
1444219089Spjd
1445219089Spjd	dt = &tgts[nbadparity];
1446219089Spjd
1447219089Spjd	/*
1448219089Spjd	 * See if we can use any of our optimized reconstruction routines.
1449219089Spjd	 */
1450219089Spjd	if (!vdev_raidz_default_to_general) {
1451219089Spjd		switch (nbaddata) {
1452219089Spjd		case 1:
1453219089Spjd			if (parity_valid[VDEV_RAIDZ_P])
1454219089Spjd				return (vdev_raidz_reconstruct_p(rm, dt, 1));
1455219089Spjd
1456219089Spjd			ASSERT(rm->rm_firstdatacol > 1);
1457219089Spjd
1458219089Spjd			if (parity_valid[VDEV_RAIDZ_Q])
1459219089Spjd				return (vdev_raidz_reconstruct_q(rm, dt, 1));
1460219089Spjd
1461219089Spjd			ASSERT(rm->rm_firstdatacol > 2);
1462219089Spjd			break;
1463219089Spjd
1464219089Spjd		case 2:
1465219089Spjd			ASSERT(rm->rm_firstdatacol > 1);
1466219089Spjd
1467219089Spjd			if (parity_valid[VDEV_RAIDZ_P] &&
1468219089Spjd			    parity_valid[VDEV_RAIDZ_Q])
1469219089Spjd				return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1470219089Spjd
1471219089Spjd			ASSERT(rm->rm_firstdatacol > 2);
1472219089Spjd
1473219089Spjd			break;
1474219089Spjd		}
1475219089Spjd	}
1476219089Spjd
1477219089Spjd	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1478219089Spjd	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1479219089Spjd	ASSERT(code > 0);
1480219089Spjd	return (code);
1481219089Spjd}
1482219089Spjd
1483219089Spjdstatic int
1484236839Smmvdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1485262081Savg    uint64_t *logical_ashift, uint64_t *physical_ashift)
1486168404Spjd{
1487168404Spjd	vdev_t *cvd;
1488168404Spjd	uint64_t nparity = vd->vdev_nparity;
1489219089Spjd	int c;
1490168404Spjd	int lasterror = 0;
1491168404Spjd	int numerrors = 0;
1492168404Spjd
1493168404Spjd	ASSERT(nparity > 0);
1494168404Spjd
1495168404Spjd	if (nparity > VDEV_RAIDZ_MAXPARITY ||
1496168404Spjd	    vd->vdev_children < nparity + 1) {
1497168404Spjd		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1498249643Smm		return (SET_ERROR(EINVAL));
1499168404Spjd	}
1500168404Spjd
1501219089Spjd	vdev_open_children(vd);
1502219089Spjd
1503168404Spjd	for (c = 0; c < vd->vdev_children; c++) {
1504168404Spjd		cvd = vd->vdev_child[c];
1505168404Spjd
1506219089Spjd		if (cvd->vdev_open_error != 0) {
1507219089Spjd			lasterror = cvd->vdev_open_error;
1508168404Spjd			numerrors++;
1509168404Spjd			continue;
1510168404Spjd		}
1511168404Spjd
1512168404Spjd		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1513236839Smm		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1514262081Savg		*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
1515262081Savg		*physical_ashift = MAX(*physical_ashift,
1516262081Savg		    cvd->vdev_physical_ashift);
1517168404Spjd	}
1518168404Spjd
1519168404Spjd	*asize *= vd->vdev_children;
1520236839Smm	*max_asize *= vd->vdev_children;
1521168404Spjd
1522168404Spjd	if (numerrors > nparity) {
1523168404Spjd		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1524168404Spjd		return (lasterror);
1525168404Spjd	}
1526168404Spjd
1527168404Spjd	return (0);
1528168404Spjd}
1529168404Spjd
1530168404Spjdstatic void
1531168404Spjdvdev_raidz_close(vdev_t *vd)
1532168404Spjd{
1533168404Spjd	int c;
1534168404Spjd
1535168404Spjd	for (c = 0; c < vd->vdev_children; c++)
1536168404Spjd		vdev_close(vd->vdev_child[c]);
1537168404Spjd}
1538168404Spjd
1539262089Savg#ifdef illumos
1540262089Savg/*
1541262089Savg * Handle a read or write I/O to a RAID-Z dump device.
1542262089Savg *
1543262089Savg * The dump device is in a unique situation compared to other ZFS datasets:
1544262089Savg * writing to this device should be as simple and fast as possible.  In
1545262089Savg * addition, durability matters much less since the dump will be extracted
1546262089Savg * once the machine reboots.  For that reason, this function eschews parity for
1547262089Savg * performance and simplicity.  The dump device uses the checksum setting
1548262089Savg * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
1549262089Savg * dataset.
1550262089Savg *
1551262089Savg * Blocks of size 128 KB have been preallocated for this volume.  I/Os less than
1552262089Savg * 128 KB will not fill an entire block; in addition, they may not be properly
1553262089Savg * aligned.  In that case, this function uses the preallocated 128 KB block and
1554262089Savg * omits reading or writing any "empty" portions of that block, as opposed to
1555262089Savg * allocating a fresh appropriately-sized block.
1556262089Savg *
1557262089Savg * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
1558262089Savg *
1559262089Savg *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
1560262089Savg *
1561262089Savg * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
1562262089Savg * allocated which spans all five child vdevs.  8 KB of data would be written to
1563262089Savg * each of four vdevs, with the fifth containing the parity bits.
1564262089Savg *
1565262089Savg *       parity    data     data     data     data
1566262089Savg *     |   PP   |   XX   |   XX   |   XX   |   XX   |
1567262089Savg *         ^        ^        ^        ^        ^
1568262089Savg *         |        |        |        |        |
1569262089Savg *   8 KB parity    ------8 KB data blocks------
1570262089Savg *
1571262089Savg * However, when writing to the dump device, the behavior is different:
1572262089Savg *
1573262089Savg *     vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
1574262089Savg *
1575262089Savg * Unlike the normal RAID-Z case in which the block is allocated based on the
1576262089Savg * I/O size, reads and writes here always use a 128 KB logical I/O size.  If the
1577262089Savg * I/O size is less than 128 KB, only the actual portions of data are written.
1578262089Savg * In this example the data is written to the third data vdev since that vdev
1579262089Savg * contains the offset [64 KB, 96 KB).
1580262089Savg *
1581262089Savg *       parity    data     data     data     data
1582262089Savg *     |        |        |        |   XX   |        |
1583262089Savg *                                    ^
1584262089Savg *                                    |
1585262089Savg *                             32 KB data block
1586262089Savg *
1587262089Savg * As a result, an individual I/O may not span all child vdevs; moreover, a
1588262089Savg * small I/O may only operate on a single child vdev.
1589262089Savg *
1590262089Savg * Note that since there are no parity bits calculated or written, this format
1591262089Savg * remains the same no matter how many parity bits are used in a normal RAID-Z
1592262089Savg * stripe.  On a RAID-Z3 configuration with seven child vdevs, the example above
1593262089Savg * would look like:
1594262089Savg *
1595262089Savg *       parity   parity   parity    data     data     data     data
1596262089Savg *     |        |        |        |        |        |   XX   |        |
1597262089Savg *                                                      ^
1598262089Savg *                                                      |
1599262089Savg *                                               32 KB data block
1600262089Savg */
1601262089Savgint
1602262089Savgvdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
1603262089Savg    uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
1604262089Savg{
1605262089Savg	vdev_t *tvd = vd->vdev_top;
1606262089Savg	vdev_t *cvd;
1607262089Savg	raidz_map_t *rm;
1608262089Savg	raidz_col_t *rc;
1609262089Savg	int c, err = 0;
1610262089Savg
1611262089Savg	uint64_t start, end, colstart, colend;
1612262089Savg	uint64_t coloffset, colsize, colskip;
1613262089Savg
1614262089Savg	int flags = doread ? BIO_READ : BIO_WRITE;
1615262089Savg
1616262089Savg#ifdef	_KERNEL
1617262089Savg
1618262089Savg	/*
1619262089Savg	 * Don't write past the end of the block
1620262089Savg	 */
1621262089Savg	VERIFY3U(offset + size, <=, origoffset + SPA_MAXBLOCKSIZE);
1622262089Savg
1623262089Savg	start = offset;
1624262089Savg	end = start + size;
1625262089Savg
1626262089Savg	/*
1627262089Savg	 * Allocate a RAID-Z map for this block.  Note that this block starts
1628262089Savg	 * from the "original" offset, this is, the offset of the extent which
1629262089Savg	 * contains the requisite offset of the data being read or written.
1630262089Savg	 *
1631262089Savg	 * Even if this I/O operation doesn't span the full block size, let's
1632262089Savg	 * treat the on-disk format as if the only blocks are the complete 128
1633262089Savg	 * KB size.
1634262089Savg	 */
1635262089Savg	rm = vdev_raidz_map_alloc(data - (offset - origoffset),
1636262089Savg	    SPA_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift, vd->vdev_children,
1637262089Savg	    vd->vdev_nparity);
1638262089Savg
1639262089Savg	coloffset = origoffset;
1640262089Savg
1641262089Savg	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
1642262089Savg	    c++, coloffset += rc->rc_size) {
1643262089Savg		rc = &rm->rm_col[c];
1644262089Savg		cvd = vd->vdev_child[rc->rc_devidx];
1645262089Savg
1646262089Savg		/*
1647262089Savg		 * Find the start and end of this column in the RAID-Z map,
1648262089Savg		 * keeping in mind that the stated size and offset of the
1649262089Savg		 * operation may not fill the entire column for this vdev.
1650262089Savg		 *
1651262089Savg		 * If any portion of the data spans this column, issue the
1652262089Savg		 * appropriate operation to the vdev.
1653262089Savg		 */
1654262089Savg		if (coloffset + rc->rc_size <= start)
1655262089Savg			continue;
1656262089Savg		if (coloffset >= end)
1657262089Savg			continue;
1658262089Savg
1659262089Savg		colstart = MAX(coloffset, start);
1660262089Savg		colend = MIN(end, coloffset + rc->rc_size);
1661262089Savg		colsize = colend - colstart;
1662262089Savg		colskip = colstart - coloffset;
1663262089Savg
1664262089Savg		VERIFY3U(colsize, <=, rc->rc_size);
1665262089Savg		VERIFY3U(colskip, <=, rc->rc_size);
1666262089Savg
1667262089Savg		/*
1668262089Savg		 * Note that the child vdev will have a vdev label at the start
1669262089Savg		 * of its range of offsets, hence the need for
1670262089Savg		 * VDEV_LABEL_OFFSET().  See zio_vdev_child_io() for another
1671262089Savg		 * example of why this calculation is needed.
1672262089Savg		 */
1673262089Savg		if ((err = vdev_disk_physio(cvd,
1674262089Savg		    ((char *)rc->rc_data) + colskip, colsize,
1675262089Savg		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
1676262089Savg		    flags, isdump)) != 0)
1677262089Savg			break;
1678262089Savg	}
1679262089Savg
1680262089Savg	vdev_raidz_map_free(rm);
1681262089Savg#endif	/* KERNEL */
1682262089Savg
1683262089Savg	return (err);
1684262089Savg}
1685262089Savg#endif
1686262089Savg
1687168404Spjdstatic uint64_t
1688168404Spjdvdev_raidz_asize(vdev_t *vd, uint64_t psize)
1689168404Spjd{
1690168404Spjd	uint64_t asize;
1691168404Spjd	uint64_t ashift = vd->vdev_top->vdev_ashift;
1692168404Spjd	uint64_t cols = vd->vdev_children;
1693168404Spjd	uint64_t nparity = vd->vdev_nparity;
1694168404Spjd
1695168404Spjd	asize = ((psize - 1) >> ashift) + 1;
1696168404Spjd	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1697168404Spjd	asize = roundup(asize, nparity + 1) << ashift;
1698168404Spjd
1699168404Spjd	return (asize);
1700168404Spjd}
1701168404Spjd
1702168404Spjdstatic void
1703168404Spjdvdev_raidz_child_done(zio_t *zio)
1704168404Spjd{
1705168404Spjd	raidz_col_t *rc = zio->io_private;
1706168404Spjd
1707168404Spjd	rc->rc_error = zio->io_error;
1708168404Spjd	rc->rc_tried = 1;
1709168404Spjd	rc->rc_skipped = 0;
1710168404Spjd}
1711168404Spjd
1712252749Sdelphij/*
1713252749Sdelphij * Start an IO operation on a RAIDZ VDev
1714252749Sdelphij *
1715252749Sdelphij * Outline:
1716252749Sdelphij * - For write operations:
1717252749Sdelphij *   1. Generate the parity data
1718252749Sdelphij *   2. Create child zio write operations to each column's vdev, for both
1719252749Sdelphij *      data and parity.
1720252749Sdelphij *   3. If the column skips any sectors for padding, create optional dummy
1721252749Sdelphij *      write zio children for those areas to improve aggregation continuity.
1722252749Sdelphij * - For read operations:
1723252749Sdelphij *   1. Create child zio read operations to each data column's vdev to read
1724252749Sdelphij *      the range of data required for zio.
1725252749Sdelphij *   2. If this is a scrub or resilver operation, or if any of the data
1726252749Sdelphij *      vdevs have had errors, then create zio read operations to the parity
1727252749Sdelphij *      columns' VDevs as well.
1728252749Sdelphij */
1729185029Spjdstatic int
1730168404Spjdvdev_raidz_io_start(zio_t *zio)
1731168404Spjd{
1732168404Spjd	vdev_t *vd = zio->io_vd;
1733168404Spjd	vdev_t *tvd = vd->vdev_top;
1734168404Spjd	vdev_t *cvd;
1735168404Spjd	raidz_map_t *rm;
1736168404Spjd	raidz_col_t *rc;
1737219089Spjd	int c, i;
1738168404Spjd
1739262089Savg	rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset,
1740262089Savg	    zio->io_type == ZIO_TYPE_FREE,
1741262089Savg	    tvd->vdev_ashift, vd->vdev_children,
1742168404Spjd	    vd->vdev_nparity);
1743168404Spjd
1744262089Savg	zio->io_vsd = rm;
1745262089Savg	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
1746262089Savg
1747168404Spjd	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1748168404Spjd
1749251419Ssmh	if (zio->io_type == ZIO_TYPE_FREE) {
1750251419Ssmh		for (c = 0; c < rm->rm_cols; c++) {
1751251419Ssmh			rc = &rm->rm_col[c];
1752251419Ssmh			cvd = vd->vdev_child[rc->rc_devidx];
1753251419Ssmh			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1754251419Ssmh			    rc->rc_offset, rc->rc_data, rc->rc_size,
1755251419Ssmh			    zio->io_type, zio->io_priority, 0,
1756251419Ssmh			    vdev_raidz_child_done, rc));
1757251419Ssmh		}
1758251419Ssmh		return (ZIO_PIPELINE_CONTINUE);
1759251419Ssmh	}
1760251419Ssmh
1761168404Spjd	if (zio->io_type == ZIO_TYPE_WRITE) {
1762219089Spjd		vdev_raidz_generate_parity(rm);
1763168404Spjd
1764168404Spjd		for (c = 0; c < rm->rm_cols; c++) {
1765168404Spjd			rc = &rm->rm_col[c];
1766168404Spjd			cvd = vd->vdev_child[rc->rc_devidx];
1767168404Spjd			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1768168404Spjd			    rc->rc_offset, rc->rc_data, rc->rc_size,
1769185029Spjd			    zio->io_type, zio->io_priority, 0,
1770168404Spjd			    vdev_raidz_child_done, rc));
1771168404Spjd		}
1772185029Spjd
1773219089Spjd		/*
1774219089Spjd		 * Generate optional I/Os for any skipped sectors to improve
1775219089Spjd		 * aggregation contiguity.
1776219089Spjd		 */
1777219089Spjd		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1778219089Spjd			ASSERT(c <= rm->rm_scols);
1779219089Spjd			if (c == rm->rm_scols)
1780219089Spjd				c = 0;
1781219089Spjd			rc = &rm->rm_col[c];
1782219089Spjd			cvd = vd->vdev_child[rc->rc_devidx];
1783219089Spjd			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1784219089Spjd			    rc->rc_offset + rc->rc_size, NULL,
1785219089Spjd			    1 << tvd->vdev_ashift,
1786219089Spjd			    zio->io_type, zio->io_priority,
1787219089Spjd			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1788219089Spjd		}
1789219089Spjd
1790185029Spjd		return (ZIO_PIPELINE_CONTINUE);
1791168404Spjd	}
1792168404Spjd
1793168404Spjd	ASSERT(zio->io_type == ZIO_TYPE_READ);
1794168404Spjd
1795168404Spjd	/*
1796168404Spjd	 * Iterate over the columns in reverse order so that we hit the parity
1797219089Spjd	 * last -- any errors along the way will force us to read the parity.
1798168404Spjd	 */
1799168404Spjd	for (c = rm->rm_cols - 1; c >= 0; c--) {
1800168404Spjd		rc = &rm->rm_col[c];
1801168404Spjd		cvd = vd->vdev_child[rc->rc_devidx];
1802185029Spjd		if (!vdev_readable(cvd)) {
1803168404Spjd			if (c >= rm->rm_firstdatacol)
1804168404Spjd				rm->rm_missingdata++;
1805168404Spjd			else
1806168404Spjd				rm->rm_missingparity++;
1807249643Smm			rc->rc_error = SET_ERROR(ENXIO);
1808168404Spjd			rc->rc_tried = 1;	/* don't even try */
1809168404Spjd			rc->rc_skipped = 1;
1810168404Spjd			continue;
1811168404Spjd		}
1812219089Spjd		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1813168404Spjd			if (c >= rm->rm_firstdatacol)
1814168404Spjd				rm->rm_missingdata++;
1815168404Spjd			else
1816168404Spjd				rm->rm_missingparity++;
1817249643Smm			rc->rc_error = SET_ERROR(ESTALE);
1818168404Spjd			rc->rc_skipped = 1;
1819168404Spjd			continue;
1820168404Spjd		}
1821168404Spjd		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
1822209095Smm		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1823168404Spjd			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1824168404Spjd			    rc->rc_offset, rc->rc_data, rc->rc_size,
1825185029Spjd			    zio->io_type, zio->io_priority, 0,
1826168404Spjd			    vdev_raidz_child_done, rc));
1827168404Spjd		}
1828168404Spjd	}
1829168404Spjd
1830185029Spjd	return (ZIO_PIPELINE_CONTINUE);
1831168404Spjd}
1832168404Spjd
1833219089Spjd
1834168404Spjd/*
1835168404Spjd * Report a checksum error for a child of a RAID-Z device.
1836168404Spjd */
1837168404Spjdstatic void
1838219089Spjdraidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1839168404Spjd{
1840168404Spjd	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1841168404Spjd
1842168404Spjd	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1843219089Spjd		zio_bad_cksum_t zbc;
1844219089Spjd		raidz_map_t *rm = zio->io_vsd;
1845219089Spjd
1846168404Spjd		mutex_enter(&vd->vdev_stat_lock);
1847168404Spjd		vd->vdev_stat.vs_checksum_errors++;
1848168404Spjd		mutex_exit(&vd->vdev_stat_lock);
1849219089Spjd
1850219089Spjd		zbc.zbc_has_cksum = 0;
1851219089Spjd		zbc.zbc_injected = rm->rm_ecksuminjected;
1852219089Spjd
1853219089Spjd		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1854219089Spjd		    rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1855219089Spjd		    &zbc);
1856168404Spjd	}
1857219089Spjd}
1858168404Spjd
1859219089Spjd/*
1860219089Spjd * We keep track of whether or not there were any injected errors, so that
1861219089Spjd * any ereports we generate can note it.
1862219089Spjd */
1863219089Spjdstatic int
1864219089Spjdraidz_checksum_verify(zio_t *zio)
1865219089Spjd{
1866219089Spjd	zio_bad_cksum_t zbc;
1867219089Spjd	raidz_map_t *rm = zio->io_vsd;
1868219089Spjd
1869219089Spjd	int ret = zio_checksum_error(zio, &zbc);
1870219089Spjd	if (ret != 0 && zbc.zbc_injected != 0)
1871219089Spjd		rm->rm_ecksuminjected = 1;
1872219089Spjd
1873219089Spjd	return (ret);
1874168404Spjd}
1875168404Spjd
1876168404Spjd/*
1877168404Spjd * Generate the parity from the data columns. If we tried and were able to
1878168404Spjd * read the parity without error, verify that the generated parity matches the
1879168404Spjd * data we read. If it doesn't, we fire off a checksum error. Return the
1880168404Spjd * number such failures.
1881168404Spjd */
1882168404Spjdstatic int
1883168404Spjdraidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1884168404Spjd{
1885168404Spjd	void *orig[VDEV_RAIDZ_MAXPARITY];
1886168404Spjd	int c, ret = 0;
1887168404Spjd	raidz_col_t *rc;
1888168404Spjd
1889262089Savg	blkptr_t *bp = zio->io_bp;
1890262089Savg	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
1891262089Savg	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
1892262089Savg
1893262089Savg	if (checksum == ZIO_CHECKSUM_NOPARITY)
1894262089Savg		return (ret);
1895262089Savg
1896168404Spjd	for (c = 0; c < rm->rm_firstdatacol; c++) {
1897168404Spjd		rc = &rm->rm_col[c];
1898168404Spjd		if (!rc->rc_tried || rc->rc_error != 0)
1899168404Spjd			continue;
1900168404Spjd		orig[c] = zio_buf_alloc(rc->rc_size);
1901168404Spjd		bcopy(rc->rc_data, orig[c], rc->rc_size);
1902168404Spjd	}
1903168404Spjd
1904219089Spjd	vdev_raidz_generate_parity(rm);
1905168404Spjd
1906168404Spjd	for (c = 0; c < rm->rm_firstdatacol; c++) {
1907168404Spjd		rc = &rm->rm_col[c];
1908168404Spjd		if (!rc->rc_tried || rc->rc_error != 0)
1909168404Spjd			continue;
1910168404Spjd		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1911219089Spjd			raidz_checksum_error(zio, rc, orig[c]);
1912249643Smm			rc->rc_error = SET_ERROR(ECKSUM);
1913168404Spjd			ret++;
1914168404Spjd		}
1915168404Spjd		zio_buf_free(orig[c], rc->rc_size);
1916168404Spjd	}
1917168404Spjd
1918168404Spjd	return (ret);
1919168404Spjd}
1920168404Spjd
1921219089Spjd/*
1922219089Spjd * Keep statistics on all the ways that we used parity to correct data.
1923219089Spjd */
1924219089Spjdstatic uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1925168404Spjd
1926185029Spjdstatic int
1927185029Spjdvdev_raidz_worst_error(raidz_map_t *rm)
1928185029Spjd{
1929185029Spjd	int error = 0;
1930185029Spjd
1931185029Spjd	for (int c = 0; c < rm->rm_cols; c++)
1932185029Spjd		error = zio_worst_error(error, rm->rm_col[c].rc_error);
1933185029Spjd
1934185029Spjd	return (error);
1935185029Spjd}
1936185029Spjd
1937219089Spjd/*
1938219089Spjd * Iterate over all combinations of bad data and attempt a reconstruction.
1939219089Spjd * Note that the algorithm below is non-optimal because it doesn't take into
1940219089Spjd * account how reconstruction is actually performed. For example, with
1941219089Spjd * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1942219089Spjd * is targeted as invalid as if columns 1 and 4 are targeted since in both
1943219089Spjd * cases we'd only use parity information in column 0.
1944219089Spjd */
1945219089Spjdstatic int
1946219089Spjdvdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1947219089Spjd{
1948219089Spjd	raidz_map_t *rm = zio->io_vsd;
1949219089Spjd	raidz_col_t *rc;
1950219089Spjd	void *orig[VDEV_RAIDZ_MAXPARITY];
1951219089Spjd	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1952219089Spjd	int *tgts = &tstore[1];
1953219089Spjd	int current, next, i, c, n;
1954219089Spjd	int code, ret = 0;
1955219089Spjd
1956219089Spjd	ASSERT(total_errors < rm->rm_firstdatacol);
1957219089Spjd
1958219089Spjd	/*
1959219089Spjd	 * This simplifies one edge condition.
1960219089Spjd	 */
1961219089Spjd	tgts[-1] = -1;
1962219089Spjd
1963219089Spjd	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1964219089Spjd		/*
1965219089Spjd		 * Initialize the targets array by finding the first n columns
1966219089Spjd		 * that contain no error.
1967219089Spjd		 *
1968219089Spjd		 * If there were no data errors, we need to ensure that we're
1969219089Spjd		 * always explicitly attempting to reconstruct at least one
1970219089Spjd		 * data column. To do this, we simply push the highest target
1971219089Spjd		 * up into the data columns.
1972219089Spjd		 */
1973219089Spjd		for (c = 0, i = 0; i < n; i++) {
1974219089Spjd			if (i == n - 1 && data_errors == 0 &&
1975219089Spjd			    c < rm->rm_firstdatacol) {
1976219089Spjd				c = rm->rm_firstdatacol;
1977219089Spjd			}
1978219089Spjd
1979219089Spjd			while (rm->rm_col[c].rc_error != 0) {
1980219089Spjd				c++;
1981219089Spjd				ASSERT3S(c, <, rm->rm_cols);
1982219089Spjd			}
1983219089Spjd
1984219089Spjd			tgts[i] = c++;
1985219089Spjd		}
1986219089Spjd
1987219089Spjd		/*
1988219089Spjd		 * Setting tgts[n] simplifies the other edge condition.
1989219089Spjd		 */
1990219089Spjd		tgts[n] = rm->rm_cols;
1991219089Spjd
1992219089Spjd		/*
1993219089Spjd		 * These buffers were allocated in previous iterations.
1994219089Spjd		 */
1995219089Spjd		for (i = 0; i < n - 1; i++) {
1996219089Spjd			ASSERT(orig[i] != NULL);
1997219089Spjd		}
1998219089Spjd
1999219089Spjd		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
2000219089Spjd
2001219089Spjd		current = 0;
2002219089Spjd		next = tgts[current];
2003219089Spjd
2004219089Spjd		while (current != n) {
2005219089Spjd			tgts[current] = next;
2006219089Spjd			current = 0;
2007219089Spjd
2008219089Spjd			/*
2009219089Spjd			 * Save off the original data that we're going to
2010219089Spjd			 * attempt to reconstruct.
2011219089Spjd			 */
2012219089Spjd			for (i = 0; i < n; i++) {
2013219089Spjd				ASSERT(orig[i] != NULL);
2014219089Spjd				c = tgts[i];
2015219089Spjd				ASSERT3S(c, >=, 0);
2016219089Spjd				ASSERT3S(c, <, rm->rm_cols);
2017219089Spjd				rc = &rm->rm_col[c];
2018219089Spjd				bcopy(rc->rc_data, orig[i], rc->rc_size);
2019219089Spjd			}
2020219089Spjd
2021219089Spjd			/*
2022219089Spjd			 * Attempt a reconstruction and exit the outer loop on
2023219089Spjd			 * success.
2024219089Spjd			 */
2025219089Spjd			code = vdev_raidz_reconstruct(rm, tgts, n);
2026219089Spjd			if (raidz_checksum_verify(zio) == 0) {
2027219089Spjd				atomic_inc_64(&raidz_corrected[code]);
2028219089Spjd
2029219089Spjd				for (i = 0; i < n; i++) {
2030219089Spjd					c = tgts[i];
2031219089Spjd					rc = &rm->rm_col[c];
2032219089Spjd					ASSERT(rc->rc_error == 0);
2033219089Spjd					if (rc->rc_tried)
2034219089Spjd						raidz_checksum_error(zio, rc,
2035219089Spjd						    orig[i]);
2036249643Smm					rc->rc_error = SET_ERROR(ECKSUM);
2037219089Spjd				}
2038219089Spjd
2039219089Spjd				ret = code;
2040219089Spjd				goto done;
2041219089Spjd			}
2042219089Spjd
2043219089Spjd			/*
2044219089Spjd			 * Restore the original data.
2045219089Spjd			 */
2046219089Spjd			for (i = 0; i < n; i++) {
2047219089Spjd				c = tgts[i];
2048219089Spjd				rc = &rm->rm_col[c];
2049219089Spjd				bcopy(orig[i], rc->rc_data, rc->rc_size);
2050219089Spjd			}
2051219089Spjd
2052219089Spjd			do {
2053219089Spjd				/*
2054219089Spjd				 * Find the next valid column after the current
2055219089Spjd				 * position..
2056219089Spjd				 */
2057219089Spjd				for (next = tgts[current] + 1;
2058219089Spjd				    next < rm->rm_cols &&
2059219089Spjd				    rm->rm_col[next].rc_error != 0; next++)
2060219089Spjd					continue;
2061219089Spjd
2062219089Spjd				ASSERT(next <= tgts[current + 1]);
2063219089Spjd
2064219089Spjd				/*
2065219089Spjd				 * If that spot is available, we're done here.
2066219089Spjd				 */
2067219089Spjd				if (next != tgts[current + 1])
2068219089Spjd					break;
2069219089Spjd
2070219089Spjd				/*
2071219089Spjd				 * Otherwise, find the next valid column after
2072219089Spjd				 * the previous position.
2073219089Spjd				 */
2074219089Spjd				for (c = tgts[current - 1] + 1;
2075219089Spjd				    rm->rm_col[c].rc_error != 0; c++)
2076219089Spjd					continue;
2077219089Spjd
2078219089Spjd				tgts[current] = c;
2079219089Spjd				current++;
2080219089Spjd
2081219089Spjd			} while (current != n);
2082219089Spjd		}
2083219089Spjd	}
2084219089Spjd	n--;
2085219089Spjddone:
2086219089Spjd	for (i = 0; i < n; i++) {
2087219089Spjd		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
2088219089Spjd	}
2089219089Spjd
2090219089Spjd	return (ret);
2091219089Spjd}
2092219089Spjd
2093252749Sdelphij/*
2094252749Sdelphij * Complete an IO operation on a RAIDZ VDev
2095252749Sdelphij *
2096252749Sdelphij * Outline:
2097252749Sdelphij * - For write operations:
2098252749Sdelphij *   1. Check for errors on the child IOs.
2099252749Sdelphij *   2. Return, setting an error code if too few child VDevs were written
2100252749Sdelphij *      to reconstruct the data later.  Note that partial writes are
2101252749Sdelphij *      considered successful if they can be reconstructed at all.
2102252749Sdelphij * - For read operations:
2103252749Sdelphij *   1. Check for errors on the child IOs.
2104252749Sdelphij *   2. If data errors occurred:
2105252749Sdelphij *      a. Try to reassemble the data from the parity available.
2106252749Sdelphij *      b. If we haven't yet read the parity drives, read them now.
2107252749Sdelphij *      c. If all parity drives have been read but the data still doesn't
2108252749Sdelphij *         reassemble with a correct checksum, then try combinatorial
2109252749Sdelphij *         reconstruction.
2110252749Sdelphij *      d. If that doesn't work, return an error.
2111252749Sdelphij *   3. If there were unexpected errors or this is a resilver operation,
2112252749Sdelphij *      rewrite the vdevs that had errors.
2113252749Sdelphij */
2114168404Spjdstatic void
2115168404Spjdvdev_raidz_io_done(zio_t *zio)
2116168404Spjd{
2117168404Spjd	vdev_t *vd = zio->io_vd;
2118168404Spjd	vdev_t *cvd;
2119168404Spjd	raidz_map_t *rm = zio->io_vsd;
2120219089Spjd	raidz_col_t *rc;
2121168404Spjd	int unexpected_errors = 0;
2122168404Spjd	int parity_errors = 0;
2123168404Spjd	int parity_untried = 0;
2124168404Spjd	int data_errors = 0;
2125185029Spjd	int total_errors = 0;
2126219089Spjd	int n, c;
2127219089Spjd	int tgts[VDEV_RAIDZ_MAXPARITY];
2128219089Spjd	int code;
2129168404Spjd
2130168404Spjd	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
2131168404Spjd
2132168404Spjd	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
2133168404Spjd	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
2134168404Spjd
2135168404Spjd	for (c = 0; c < rm->rm_cols; c++) {
2136168404Spjd		rc = &rm->rm_col[c];
2137168404Spjd
2138168404Spjd		if (rc->rc_error) {
2139185029Spjd			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
2140168404Spjd
2141168404Spjd			if (c < rm->rm_firstdatacol)
2142168404Spjd				parity_errors++;
2143168404Spjd			else
2144168404Spjd				data_errors++;
2145168404Spjd
2146168404Spjd			if (!rc->rc_skipped)
2147168404Spjd				unexpected_errors++;
2148168404Spjd
2149185029Spjd			total_errors++;
2150168404Spjd		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
2151168404Spjd			parity_untried++;
2152168404Spjd		}
2153168404Spjd	}
2154168404Spjd
2155168404Spjd	if (zio->io_type == ZIO_TYPE_WRITE) {
2156168404Spjd		/*
2157185029Spjd		 * XXX -- for now, treat partial writes as a success.
2158185029Spjd		 * (If we couldn't write enough columns to reconstruct
2159185029Spjd		 * the data, the I/O failed.  Otherwise, good enough.)
2160185029Spjd		 *
2161185029Spjd		 * Now that we support write reallocation, it would be better
2162185029Spjd		 * to treat partial failure as real failure unless there are
2163185029Spjd		 * no non-degraded top-level vdevs left, and not update DTLs
2164185029Spjd		 * if we intend to reallocate.
2165168404Spjd		 */
2166168404Spjd		/* XXPOLICY */
2167185029Spjd		if (total_errors > rm->rm_firstdatacol)
2168185029Spjd			zio->io_error = vdev_raidz_worst_error(rm);
2169168404Spjd
2170168404Spjd		return;
2171251419Ssmh	} else if (zio->io_type == ZIO_TYPE_FREE) {
2172251419Ssmh		return;
2173168404Spjd	}
2174168404Spjd
2175168404Spjd	ASSERT(zio->io_type == ZIO_TYPE_READ);
2176168404Spjd	/*
2177168404Spjd	 * There are three potential phases for a read:
2178168404Spjd	 *	1. produce valid data from the columns read
2179168404Spjd	 *	2. read all disks and try again
2180168404Spjd	 *	3. perform combinatorial reconstruction
2181168404Spjd	 *
2182168404Spjd	 * Each phase is progressively both more expensive and less likely to
2183168404Spjd	 * occur. If we encounter more errors than we can repair or all phases
2184168404Spjd	 * fail, we have no choice but to return an error.
2185168404Spjd	 */
2186168404Spjd
2187168404Spjd	/*
2188168404Spjd	 * If the number of errors we saw was correctable -- less than or equal
2189168404Spjd	 * to the number of parity disks read -- attempt to produce data that
2190168404Spjd	 * has a valid checksum. Naturally, this case applies in the absence of
2191168404Spjd	 * any errors.
2192168404Spjd	 */
2193185029Spjd	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
2194219089Spjd		if (data_errors == 0) {
2195219089Spjd			if (raidz_checksum_verify(zio) == 0) {
2196168738Spjd				/*
2197168738Spjd				 * If we read parity information (unnecessarily
2198168738Spjd				 * as it happens since no reconstruction was
2199168738Spjd				 * needed) regenerate and verify the parity.
2200168738Spjd				 * We also regenerate parity when resilvering
2201168738Spjd				 * so we can write it out to the failed device
2202168738Spjd				 * later.
2203168738Spjd				 */
2204168404Spjd				if (parity_errors + parity_untried <
2205168738Spjd				    rm->rm_firstdatacol ||
2206168738Spjd				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
2207168404Spjd					n = raidz_parity_verify(zio, rm);
2208168404Spjd					unexpected_errors += n;
2209168404Spjd					ASSERT(parity_errors + n <=
2210168404Spjd					    rm->rm_firstdatacol);
2211168404Spjd				}
2212168404Spjd				goto done;
2213168404Spjd			}
2214219089Spjd		} else {
2215168404Spjd			/*
2216168404Spjd			 * We either attempt to read all the parity columns or
2217168404Spjd			 * none of them. If we didn't try to read parity, we
2218168404Spjd			 * wouldn't be here in the correctable case. There must
2219168404Spjd			 * also have been fewer parity errors than parity
2220168404Spjd			 * columns or, again, we wouldn't be in this code path.
2221168404Spjd			 */
2222168404Spjd			ASSERT(parity_untried == 0);
2223168404Spjd			ASSERT(parity_errors < rm->rm_firstdatacol);
2224168404Spjd
2225168404Spjd			/*
2226219089Spjd			 * Identify the data columns that reported an error.
2227168404Spjd			 */
2228219089Spjd			n = 0;
2229168404Spjd			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
2230168404Spjd				rc = &rm->rm_col[c];
2231219089Spjd				if (rc->rc_error != 0) {
2232219089Spjd					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2233219089Spjd					tgts[n++] = c;
2234219089Spjd				}
2235168404Spjd			}
2236168404Spjd
2237219089Spjd			ASSERT(rm->rm_firstdatacol >= n);
2238168404Spjd
2239219089Spjd			code = vdev_raidz_reconstruct(rm, tgts, n);
2240168404Spjd
2241219089Spjd			if (raidz_checksum_verify(zio) == 0) {
2242219089Spjd				atomic_inc_64(&raidz_corrected[code]);
2243219089Spjd
2244168404Spjd				/*
2245219089Spjd				 * If we read more parity disks than were used
2246219089Spjd				 * for reconstruction, confirm that the other
2247219089Spjd				 * parity disks produced correct data. This
2248219089Spjd				 * routine is suboptimal in that it regenerates
2249219089Spjd				 * the parity that we already used in addition
2250219089Spjd				 * to the parity that we're attempting to
2251219089Spjd				 * verify, but this should be a relatively
2252219089Spjd				 * uncommon case, and can be optimized if it
2253219089Spjd				 * becomes a problem. Note that we regenerate
2254219089Spjd				 * parity when resilvering so we can write it
2255219089Spjd				 * out to failed devices later.
2256168404Spjd				 */
2257219089Spjd				if (parity_errors < rm->rm_firstdatacol - n ||
2258168738Spjd				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
2259168404Spjd					n = raidz_parity_verify(zio, rm);
2260168404Spjd					unexpected_errors += n;
2261168404Spjd					ASSERT(parity_errors + n <=
2262168404Spjd					    rm->rm_firstdatacol);
2263168404Spjd				}
2264168404Spjd
2265168404Spjd				goto done;
2266168404Spjd			}
2267168404Spjd		}
2268168404Spjd	}
2269168404Spjd
2270168404Spjd	/*
2271168404Spjd	 * This isn't a typical situation -- either we got a read error or
2272168404Spjd	 * a child silently returned bad data. Read every block so we can
2273168404Spjd	 * try again with as much data and parity as we can track down. If
2274168404Spjd	 * we've already been through once before, all children will be marked
2275168404Spjd	 * as tried so we'll proceed to combinatorial reconstruction.
2276168404Spjd	 */
2277168404Spjd	unexpected_errors = 1;
2278168404Spjd	rm->rm_missingdata = 0;
2279168404Spjd	rm->rm_missingparity = 0;
2280168404Spjd
2281168404Spjd	for (c = 0; c < rm->rm_cols; c++) {
2282168404Spjd		if (rm->rm_col[c].rc_tried)
2283168404Spjd			continue;
2284168404Spjd
2285168404Spjd		zio_vdev_io_redone(zio);
2286168404Spjd		do {
2287168404Spjd			rc = &rm->rm_col[c];
2288168404Spjd			if (rc->rc_tried)
2289168404Spjd				continue;
2290168404Spjd			zio_nowait(zio_vdev_child_io(zio, NULL,
2291168404Spjd			    vd->vdev_child[rc->rc_devidx],
2292168404Spjd			    rc->rc_offset, rc->rc_data, rc->rc_size,
2293185029Spjd			    zio->io_type, zio->io_priority, 0,
2294168404Spjd			    vdev_raidz_child_done, rc));
2295168404Spjd		} while (++c < rm->rm_cols);
2296185029Spjd
2297168404Spjd		return;
2298168404Spjd	}
2299168404Spjd
2300168404Spjd	/*
2301168404Spjd	 * At this point we've attempted to reconstruct the data given the
2302168404Spjd	 * errors we detected, and we've attempted to read all columns. There
2303168404Spjd	 * must, therefore, be one or more additional problems -- silent errors
2304168404Spjd	 * resulting in invalid data rather than explicit I/O errors resulting
2305219089Spjd	 * in absent data. We check if there is enough additional data to
2306219089Spjd	 * possibly reconstruct the data and then perform combinatorial
2307219089Spjd	 * reconstruction over all possible combinations. If that fails,
2308219089Spjd	 * we're cooked.
2309168404Spjd	 */
2310219089Spjd	if (total_errors > rm->rm_firstdatacol) {
2311185029Spjd		zio->io_error = vdev_raidz_worst_error(rm);
2312168404Spjd
2313219089Spjd	} else if (total_errors < rm->rm_firstdatacol &&
2314219089Spjd	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2315168404Spjd		/*
2316219089Spjd		 * If we didn't use all the available parity for the
2317219089Spjd		 * combinatorial reconstruction, verify that the remaining
2318219089Spjd		 * parity is correct.
2319168404Spjd		 */
2320219089Spjd		if (code != (1 << rm->rm_firstdatacol) - 1)
2321219089Spjd			(void) raidz_parity_verify(zio, rm);
2322219089Spjd	} else {
2323168404Spjd		/*
2324219089Spjd		 * We're here because either:
2325219089Spjd		 *
2326219089Spjd		 *	total_errors == rm_first_datacol, or
2327219089Spjd		 *	vdev_raidz_combrec() failed
2328219089Spjd		 *
2329219089Spjd		 * In either case, there is enough bad data to prevent
2330219089Spjd		 * reconstruction.
2331219089Spjd		 *
2332219089Spjd		 * Start checksum ereports for all children which haven't
2333219089Spjd		 * failed, and the IO wasn't speculative.
2334168404Spjd		 */
2335249643Smm		zio->io_error = SET_ERROR(ECKSUM);
2336168404Spjd
2337219089Spjd		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2338219089Spjd			for (c = 0; c < rm->rm_cols; c++) {
2339219089Spjd				rc = &rm->rm_col[c];
2340219089Spjd				if (rc->rc_error == 0) {
2341219089Spjd					zio_bad_cksum_t zbc;
2342219089Spjd					zbc.zbc_has_cksum = 0;
2343219089Spjd					zbc.zbc_injected =
2344219089Spjd					    rm->rm_ecksuminjected;
2345168404Spjd
2346219089Spjd					zfs_ereport_start_checksum(
2347219089Spjd					    zio->io_spa,
2348219089Spjd					    vd->vdev_child[rc->rc_devidx],
2349219089Spjd					    zio, rc->rc_offset, rc->rc_size,
2350219089Spjd					    (void *)(uintptr_t)c, &zbc);
2351168404Spjd				}
2352168404Spjd			}
2353168404Spjd		}
2354168404Spjd	}
2355168404Spjd
2356168404Spjddone:
2357168404Spjd	zio_checksum_verified(zio);
2358168404Spjd
2359209962Smm	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2360168404Spjd	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2361168404Spjd		/*
2362168404Spjd		 * Use the good data we have in hand to repair damaged children.
2363168404Spjd		 */
2364168404Spjd		for (c = 0; c < rm->rm_cols; c++) {
2365168404Spjd			rc = &rm->rm_col[c];
2366168404Spjd			cvd = vd->vdev_child[rc->rc_devidx];
2367168404Spjd
2368168404Spjd			if (rc->rc_error == 0)
2369168404Spjd				continue;
2370168404Spjd
2371185029Spjd			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2372168404Spjd			    rc->rc_offset, rc->rc_data, rc->rc_size,
2373260764Savg			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2374209962Smm			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2375209962Smm			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2376168404Spjd		}
2377168404Spjd	}
2378168404Spjd}
2379168404Spjd
2380168404Spjdstatic void
2381168404Spjdvdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2382168404Spjd{
2383168404Spjd	if (faulted > vd->vdev_nparity)
2384168404Spjd		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2385168404Spjd		    VDEV_AUX_NO_REPLICAS);
2386168404Spjd	else if (degraded + faulted != 0)
2387168404Spjd		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2388168404Spjd	else
2389168404Spjd		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2390168404Spjd}
2391168404Spjd
2392168404Spjdvdev_ops_t vdev_raidz_ops = {
2393168404Spjd	vdev_raidz_open,
2394168404Spjd	vdev_raidz_close,
2395168404Spjd	vdev_raidz_asize,
2396168404Spjd	vdev_raidz_io_start,
2397168404Spjd	vdev_raidz_io_done,
2398168404Spjd	vdev_raidz_state_change,
2399219089Spjd	NULL,
2400219089Spjd	NULL,
2401168404Spjd	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
2402168404Spjd	B_FALSE			/* not a leaf vdev */
2403168404Spjd};
2404