1168404Spjd/*
2168404Spjd * CDDL HEADER START
3168404Spjd *
4168404Spjd * The contents of this file are subject to the terms of the
5168404Spjd * Common Development and Distribution License (the "License").
6168404Spjd * You may not use this file except in compliance with the License.
7168404Spjd *
8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9168404Spjd * or http://www.opensolaris.org/os/licensing.
10168404Spjd * See the License for the specific language governing permissions
11168404Spjd * and limitations under the License.
12168404Spjd *
13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each
14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15168404Spjd * If applicable, add the following below this CDDL HEADER, with the
16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying
17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
18168404Spjd *
19168404Spjd * CDDL HEADER END
20168404Spjd */
21168404Spjd
22168404Spjd/*
23219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24249195Smm * Copyright (c) 2013 by Delphix. All rights reserved.
25255750Sdelphij * Copyright (c) 2013, Joyent, Inc. All rights reserved.
26168404Spjd */
27168404Spjd
28168404Spjd#include <sys/zfs_context.h>
29168404Spjd#include <sys/spa.h>
30168404Spjd#include <sys/vdev_impl.h>
31255750Sdelphij#ifdef illumos
32255750Sdelphij#include <sys/vdev_disk.h>
33255750Sdelphij#endif
34255750Sdelphij#include <sys/vdev_file.h>
35255750Sdelphij#include <sys/vdev_raidz.h>
36168404Spjd#include <sys/zio.h>
37168404Spjd#include <sys/zio_checksum.h>
38168404Spjd#include <sys/fs/zfs.h>
39168404Spjd#include <sys/fm/fs/zfs.h>
40255750Sdelphij#include <sys/bio.h>
41168404Spjd
42168404Spjd/*
43168404Spjd * Virtual device vector for RAID-Z.
44168404Spjd *
45219089Spjd * This vdev supports single, double, and triple parity. For single parity,
46219089Spjd * we use a simple XOR of all the data columns. For double or triple parity,
47219089Spjd * we use a special case of Reed-Solomon coding. This extends the
48219089Spjd * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
49219089Spjd * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
50219089Spjd * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
51219089Spjd * former is also based. The latter is designed to provide higher performance
52219089Spjd * for writes.
53168404Spjd *
54219089Spjd * Note that the Plank paper claimed to support arbitrary N+M, but was then
55219089Spjd * amended six years later identifying a critical flaw that invalidates its
56219089Spjd * claims. Nevertheless, the technique can be adapted to work for up to
57219089Spjd * triple parity. For additional parity, the amendment "Note: Correction to
58219089Spjd * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
59219089Spjd * is viable, but the additional complexity means that write performance will
60219089Spjd * suffer.
61219089Spjd *
62219089Spjd * All of the methods above operate on a Galois field, defined over the
63219089Spjd * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements
64219089Spjd * can be expressed with a single byte. Briefly, the operations on the
65219089Spjd * field are defined as follows:
66219089Spjd *
67168404Spjd *   o addition (+) is represented by a bitwise XOR
68168404Spjd *   o subtraction (-) is therefore identical to addition: A + B = A - B
69168404Spjd *   o multiplication of A by 2 is defined by the following bitwise expression:
70251631Sdelphij *
71168404Spjd *	(A * 2)_7 = A_6
72168404Spjd *	(A * 2)_6 = A_5
73168404Spjd *	(A * 2)_5 = A_4
74168404Spjd *	(A * 2)_4 = A_3 + A_7
75168404Spjd *	(A * 2)_3 = A_2 + A_7
76168404Spjd *	(A * 2)_2 = A_1 + A_7
77168404Spjd *	(A * 2)_1 = A_0
78168404Spjd *	(A * 2)_0 = A_7
79168404Spjd *
80168404Spjd * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
81219089Spjd * As an aside, this multiplication is derived from the error correcting
82219089Spjd * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
83168404Spjd *
84168404Spjd * Observe that any number in the field (except for 0) can be expressed as a
85168404Spjd * power of 2 -- a generator for the field. We store a table of the powers of
86168404Spjd * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
87168404Spjd * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
88219089Spjd * than field addition). The inverse of a field element A (A^-1) is therefore
89219089Spjd * A ^ (255 - 1) = A^254.
90168404Spjd *
91219089Spjd * The up-to-three parity columns, P, Q, R over several data columns,
92219089Spjd * D_0, ... D_n-1, can be expressed by field operations:
93168404Spjd *
94168404Spjd *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
95168404Spjd *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
96168404Spjd *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
97219089Spjd *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
98219089Spjd *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
99168404Spjd *
100219089Spjd * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival
101219089Spjd * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
102219089Spjd * independent coefficients. (There are no additional coefficients that have
103219089Spjd * this property which is why the uncorrected Plank method breaks down.)
104219089Spjd *
105219089Spjd * See the reconstruction code below for how P, Q and R can used individually
106219089Spjd * or in concert to recover missing data columns.
107168404Spjd */
108168404Spjd
109168404Spjdtypedef struct raidz_col {
110168404Spjd	uint64_t rc_devidx;		/* child device index for I/O */
111168404Spjd	uint64_t rc_offset;		/* device offset */
112168404Spjd	uint64_t rc_size;		/* I/O size */
113168404Spjd	void *rc_data;			/* I/O data */
114219089Spjd	void *rc_gdata;			/* used to store the "good" version */
115168404Spjd	int rc_error;			/* I/O error for this device */
116168404Spjd	uint8_t rc_tried;		/* Did we attempt this I/O column? */
117168404Spjd	uint8_t rc_skipped;		/* Did we skip this I/O column? */
118168404Spjd} raidz_col_t;
119168404Spjd
120168404Spjdtypedef struct raidz_map {
121219089Spjd	uint64_t rm_cols;		/* Regular column count */
122219089Spjd	uint64_t rm_scols;		/* Count including skipped columns */
123168404Spjd	uint64_t rm_bigcols;		/* Number of oversized columns */
124168404Spjd	uint64_t rm_asize;		/* Actual total I/O size */
125168404Spjd	uint64_t rm_missingdata;	/* Count of missing data devices */
126168404Spjd	uint64_t rm_missingparity;	/* Count of missing parity devices */
127168404Spjd	uint64_t rm_firstdatacol;	/* First data column/parity count */
128219089Spjd	uint64_t rm_nskip;		/* Skipped sectors for padding */
129251631Sdelphij	uint64_t rm_skipstart;		/* Column index of padding start */
130219089Spjd	void *rm_datacopy;		/* rm_asize-buffer of copied data */
131219089Spjd	uintptr_t rm_reports;		/* # of referencing checksum reports */
132219089Spjd	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
133219089Spjd	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
134168404Spjd	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
135168404Spjd} raidz_map_t;
136168404Spjd
137168404Spjd#define	VDEV_RAIDZ_P		0
138168404Spjd#define	VDEV_RAIDZ_Q		1
139219089Spjd#define	VDEV_RAIDZ_R		2
140168404Spjd
141219089Spjd#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
142219089Spjd#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
143168404Spjd
144219089Spjd/*
145219089Spjd * We provide a mechanism to perform the field multiplication operation on a
146219089Spjd * 64-bit value all at once rather than a byte at a time. This works by
147219089Spjd * creating a mask from the top bit in each byte and using that to
148219089Spjd * conditionally apply the XOR of 0x1d.
149219089Spjd */
150219089Spjd#define	VDEV_RAIDZ_64MUL_2(x, mask) \
151219089Spjd{ \
152219089Spjd	(mask) = (x) & 0x8080808080808080ULL; \
153219089Spjd	(mask) = ((mask) << 1) - ((mask) >> 7); \
154219089Spjd	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
155219089Spjd	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
156219089Spjd}
157168404Spjd
158219089Spjd#define	VDEV_RAIDZ_64MUL_4(x, mask) \
159219089Spjd{ \
160219089Spjd	VDEV_RAIDZ_64MUL_2((x), mask); \
161219089Spjd	VDEV_RAIDZ_64MUL_2((x), mask); \
162219089Spjd}
163219089Spjd
164255750Sdelphij#define	VDEV_LABEL_OFFSET(x)	(x + VDEV_LABEL_START_SIZE)
165255750Sdelphij
166168404Spjd/*
167219089Spjd * Force reconstruction to use the general purpose method.
168219089Spjd */
169219089Spjdint vdev_raidz_default_to_general;
170219089Spjd
171251631Sdelphij/* Powers of 2 in the Galois field defined above. */
172168404Spjdstatic const uint8_t vdev_raidz_pow2[256] = {
173168404Spjd	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
174168404Spjd	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
175168404Spjd	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
176168404Spjd	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
177168404Spjd	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
178168404Spjd	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
179168404Spjd	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
180168404Spjd	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
181168404Spjd	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
182168404Spjd	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
183168404Spjd	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
184168404Spjd	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
185168404Spjd	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
186168404Spjd	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
187168404Spjd	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
188168404Spjd	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
189168404Spjd	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
190168404Spjd	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
191168404Spjd	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
192168404Spjd	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
193168404Spjd	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
194168404Spjd	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
195168404Spjd	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
196168404Spjd	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
197168404Spjd	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
198168404Spjd	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
199168404Spjd	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
200168404Spjd	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
201168404Spjd	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
202168404Spjd	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
203168404Spjd	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
204168404Spjd	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
205168404Spjd};
206251631Sdelphij/* Logs of 2 in the Galois field defined above. */
207168404Spjdstatic const uint8_t vdev_raidz_log2[256] = {
208168404Spjd	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
209168404Spjd	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
210168404Spjd	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
211168404Spjd	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
212168404Spjd	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
213168404Spjd	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
214168404Spjd	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
215168404Spjd	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
216168404Spjd	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
217168404Spjd	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
218168404Spjd	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
219168404Spjd	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
220168404Spjd	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
221168404Spjd	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
222168404Spjd	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
223168404Spjd	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
224168404Spjd	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
225168404Spjd	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
226168404Spjd	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
227168404Spjd	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
228168404Spjd	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
229168404Spjd	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
230168404Spjd	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
231168404Spjd	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
232168404Spjd	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
233168404Spjd	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
234168404Spjd	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
235168404Spjd	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
236168404Spjd	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
237168404Spjd	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
238168404Spjd	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
239168404Spjd	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
240168404Spjd};
241168404Spjd
242219089Spjdstatic void vdev_raidz_generate_parity(raidz_map_t *rm);
243219089Spjd
244168404Spjd/*
245168404Spjd * Multiply a given number by 2 raised to the given power.
246168404Spjd */
247168404Spjdstatic uint8_t
248168404Spjdvdev_raidz_exp2(uint_t a, int exp)
249168404Spjd{
250168404Spjd	if (a == 0)
251168404Spjd		return (0);
252168404Spjd
253168404Spjd	ASSERT(exp >= 0);
254168404Spjd	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
255168404Spjd
256168404Spjd	exp += vdev_raidz_log2[a];
257168404Spjd	if (exp > 255)
258168404Spjd		exp -= 255;
259168404Spjd
260168404Spjd	return (vdev_raidz_pow2[exp]);
261168404Spjd}
262168404Spjd
263185029Spjdstatic void
264219089Spjdvdev_raidz_map_free(raidz_map_t *rm)
265185029Spjd{
266185029Spjd	int c;
267219089Spjd	size_t size;
268185029Spjd
269219089Spjd	for (c = 0; c < rm->rm_firstdatacol; c++) {
270240868Spjd		if (rm->rm_col[c].rc_data != NULL)
271240868Spjd			zio_buf_free(rm->rm_col[c].rc_data,
272240868Spjd			    rm->rm_col[c].rc_size);
273185029Spjd
274219089Spjd		if (rm->rm_col[c].rc_gdata != NULL)
275219089Spjd			zio_buf_free(rm->rm_col[c].rc_gdata,
276219089Spjd			    rm->rm_col[c].rc_size);
277219089Spjd	}
278219089Spjd
279219089Spjd	size = 0;
280219089Spjd	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
281219089Spjd		size += rm->rm_col[c].rc_size;
282219089Spjd
283219089Spjd	if (rm->rm_datacopy != NULL)
284219089Spjd		zio_buf_free(rm->rm_datacopy, size);
285219089Spjd
286219089Spjd	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
287185029Spjd}
288185029Spjd
289219089Spjdstatic void
290219089Spjdvdev_raidz_map_free_vsd(zio_t *zio)
291219089Spjd{
292219089Spjd	raidz_map_t *rm = zio->io_vsd;
293219089Spjd
294240415Smm	ASSERT0(rm->rm_freed);
295219089Spjd	rm->rm_freed = 1;
296219089Spjd
297219089Spjd	if (rm->rm_reports == 0)
298219089Spjd		vdev_raidz_map_free(rm);
299219089Spjd}
300219089Spjd
301219089Spjd/*ARGSUSED*/
302219089Spjdstatic void
303219089Spjdvdev_raidz_cksum_free(void *arg, size_t ignored)
304219089Spjd{
305219089Spjd	raidz_map_t *rm = arg;
306219089Spjd
307219089Spjd	ASSERT3U(rm->rm_reports, >, 0);
308219089Spjd
309219089Spjd	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
310219089Spjd		vdev_raidz_map_free(rm);
311219089Spjd}
312219089Spjd
313219089Spjdstatic void
314219089Spjdvdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
315219089Spjd{
316219089Spjd	raidz_map_t *rm = zcr->zcr_cbdata;
317219089Spjd	size_t c = zcr->zcr_cbinfo;
318219089Spjd	size_t x;
319219089Spjd
320219089Spjd	const char *good = NULL;
321219089Spjd	const char *bad = rm->rm_col[c].rc_data;
322219089Spjd
323219089Spjd	if (good_data == NULL) {
324219089Spjd		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
325219089Spjd		return;
326219089Spjd	}
327219089Spjd
328219089Spjd	if (c < rm->rm_firstdatacol) {
329219089Spjd		/*
330219089Spjd		 * The first time through, calculate the parity blocks for
331219089Spjd		 * the good data (this relies on the fact that the good
332219089Spjd		 * data never changes for a given logical ZIO)
333219089Spjd		 */
334219089Spjd		if (rm->rm_col[0].rc_gdata == NULL) {
335219089Spjd			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
336219089Spjd			char *buf;
337219089Spjd
338219089Spjd			/*
339219089Spjd			 * Set up the rm_col[]s to generate the parity for
340219089Spjd			 * good_data, first saving the parity bufs and
341219089Spjd			 * replacing them with buffers to hold the result.
342219089Spjd			 */
343219089Spjd			for (x = 0; x < rm->rm_firstdatacol; x++) {
344219089Spjd				bad_parity[x] = rm->rm_col[x].rc_data;
345219089Spjd				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
346219089Spjd				    zio_buf_alloc(rm->rm_col[x].rc_size);
347219089Spjd			}
348219089Spjd
349219089Spjd			/* fill in the data columns from good_data */
350219089Spjd			buf = (char *)good_data;
351219089Spjd			for (; x < rm->rm_cols; x++) {
352219089Spjd				rm->rm_col[x].rc_data = buf;
353219089Spjd				buf += rm->rm_col[x].rc_size;
354219089Spjd			}
355219089Spjd
356219089Spjd			/*
357219089Spjd			 * Construct the parity from the good data.
358219089Spjd			 */
359219089Spjd			vdev_raidz_generate_parity(rm);
360219089Spjd
361219089Spjd			/* restore everything back to its original state */
362219089Spjd			for (x = 0; x < rm->rm_firstdatacol; x++)
363219089Spjd				rm->rm_col[x].rc_data = bad_parity[x];
364219089Spjd
365219089Spjd			buf = rm->rm_datacopy;
366219089Spjd			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
367219089Spjd				rm->rm_col[x].rc_data = buf;
368219089Spjd				buf += rm->rm_col[x].rc_size;
369219089Spjd			}
370219089Spjd		}
371219089Spjd
372219089Spjd		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
373219089Spjd		good = rm->rm_col[c].rc_gdata;
374219089Spjd	} else {
375219089Spjd		/* adjust good_data to point at the start of our column */
376219089Spjd		good = good_data;
377219089Spjd
378219089Spjd		for (x = rm->rm_firstdatacol; x < c; x++)
379219089Spjd			good += rm->rm_col[x].rc_size;
380219089Spjd	}
381219089Spjd
382219089Spjd	/* we drop the ereport if it ends up that the data was good */
383219089Spjd	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
384219089Spjd}
385219089Spjd
386219089Spjd/*
387219089Spjd * Invoked indirectly by zfs_ereport_start_checksum(), called
388219089Spjd * below when our read operation fails completely.  The main point
389219089Spjd * is to keep a copy of everything we read from disk, so that at
390219089Spjd * vdev_raidz_cksum_finish() time we can compare it with the good data.
391219089Spjd */
392219089Spjdstatic void
393219089Spjdvdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
394219089Spjd{
395219089Spjd	size_t c = (size_t)(uintptr_t)arg;
396219089Spjd	caddr_t buf;
397219089Spjd
398219089Spjd	raidz_map_t *rm = zio->io_vsd;
399219089Spjd	size_t size;
400219089Spjd
401219089Spjd	/* set up the report and bump the refcount  */
402219089Spjd	zcr->zcr_cbdata = rm;
403219089Spjd	zcr->zcr_cbinfo = c;
404219089Spjd	zcr->zcr_finish = vdev_raidz_cksum_finish;
405219089Spjd	zcr->zcr_free = vdev_raidz_cksum_free;
406219089Spjd
407219089Spjd	rm->rm_reports++;
408219089Spjd	ASSERT3U(rm->rm_reports, >, 0);
409219089Spjd
410219089Spjd	if (rm->rm_datacopy != NULL)
411219089Spjd		return;
412219089Spjd
413219089Spjd	/*
414219089Spjd	 * It's the first time we're called for this raidz_map_t, so we need
415219089Spjd	 * to copy the data aside; there's no guarantee that our zio's buffer
416219089Spjd	 * won't be re-used for something else.
417219089Spjd	 *
418219089Spjd	 * Our parity data is already in separate buffers, so there's no need
419219089Spjd	 * to copy them.
420219089Spjd	 */
421219089Spjd
422219089Spjd	size = 0;
423219089Spjd	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
424219089Spjd		size += rm->rm_col[c].rc_size;
425219089Spjd
426219089Spjd	buf = rm->rm_datacopy = zio_buf_alloc(size);
427219089Spjd
428219089Spjd	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
429219089Spjd		raidz_col_t *col = &rm->rm_col[c];
430219089Spjd
431219089Spjd		bcopy(col->rc_data, buf, col->rc_size);
432219089Spjd		col->rc_data = buf;
433219089Spjd
434219089Spjd		buf += col->rc_size;
435219089Spjd	}
436219089Spjd	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
437219089Spjd}
438219089Spjd
439219089Spjdstatic const zio_vsd_ops_t vdev_raidz_vsd_ops = {
440219089Spjd	vdev_raidz_map_free_vsd,
441219089Spjd	vdev_raidz_cksum_report
442219089Spjd};
443219089Spjd
444251629Sdelphij/*
445251629Sdelphij * Divides the IO evenly across all child vdevs; usually, dcols is
446251629Sdelphij * the number of children in the target vdev.
447251629Sdelphij */
448168404Spjdstatic raidz_map_t *
449255750Sdelphijvdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, boolean_t dofree,
450255750Sdelphij    uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
451168404Spjd{
452168404Spjd	raidz_map_t *rm;
453251629Sdelphij	/* The starting RAIDZ (parent) vdev sector of the block. */
454255750Sdelphij	uint64_t b = offset >> unit_shift;
455251629Sdelphij	/* The zio's size in units of the vdev's minimum sector size. */
456255750Sdelphij	uint64_t s = size >> unit_shift;
457251629Sdelphij	/* The first column for this stripe. */
458168404Spjd	uint64_t f = b % dcols;
459251629Sdelphij	/* The starting byte offset on each child vdev. */
460168404Spjd	uint64_t o = (b / dcols) << unit_shift;
461219089Spjd	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
462168404Spjd
463251629Sdelphij	/*
464251629Sdelphij	 * "Quotient": The number of data sectors for this stripe on all but
465251629Sdelphij	 * the "big column" child vdevs that also contain "remainder" data.
466251629Sdelphij	 */
467168404Spjd	q = s / (dcols - nparity);
468251629Sdelphij
469251629Sdelphij	/*
470251629Sdelphij	 * "Remainder": The number of partial stripe data sectors in this I/O.
471251629Sdelphij	 * This will add a sector to some, but not all, child vdevs.
472251629Sdelphij	 */
473168404Spjd	r = s - q * (dcols - nparity);
474251629Sdelphij
475251629Sdelphij	/* The number of "big columns" - those which contain remainder data. */
476168404Spjd	bc = (r == 0 ? 0 : r + nparity);
477251629Sdelphij
478251629Sdelphij	/*
479251629Sdelphij	 * The total number of data and parity sectors associated with
480251629Sdelphij	 * this I/O.
481251629Sdelphij	 */
482219089Spjd	tot = s + nparity * (q + (r == 0 ? 0 : 1));
483168404Spjd
484251629Sdelphij	/* acols: The columns that will be accessed. */
485251629Sdelphij	/* scols: The columns that will be accessed or skipped. */
486219089Spjd	if (q == 0) {
487251629Sdelphij		/* Our I/O request doesn't span all child vdevs. */
488219089Spjd		acols = bc;
489219089Spjd		scols = MIN(dcols, roundup(bc, nparity + 1));
490219089Spjd	} else {
491219089Spjd		acols = dcols;
492219089Spjd		scols = dcols;
493219089Spjd	}
494168404Spjd
495219089Spjd	ASSERT3U(acols, <=, scols);
496168404Spjd
497219089Spjd	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
498219089Spjd
499168404Spjd	rm->rm_cols = acols;
500219089Spjd	rm->rm_scols = scols;
501168404Spjd	rm->rm_bigcols = bc;
502219089Spjd	rm->rm_skipstart = bc;
503168404Spjd	rm->rm_missingdata = 0;
504168404Spjd	rm->rm_missingparity = 0;
505168404Spjd	rm->rm_firstdatacol = nparity;
506219089Spjd	rm->rm_datacopy = NULL;
507219089Spjd	rm->rm_reports = 0;
508219089Spjd	rm->rm_freed = 0;
509219089Spjd	rm->rm_ecksuminjected = 0;
510168404Spjd
511219089Spjd	asize = 0;
512219089Spjd
513219089Spjd	for (c = 0; c < scols; c++) {
514168404Spjd		col = f + c;
515168404Spjd		coff = o;
516168404Spjd		if (col >= dcols) {
517168404Spjd			col -= dcols;
518168404Spjd			coff += 1ULL << unit_shift;
519168404Spjd		}
520168404Spjd		rm->rm_col[c].rc_devidx = col;
521168404Spjd		rm->rm_col[c].rc_offset = coff;
522168404Spjd		rm->rm_col[c].rc_data = NULL;
523219089Spjd		rm->rm_col[c].rc_gdata = NULL;
524168404Spjd		rm->rm_col[c].rc_error = 0;
525168404Spjd		rm->rm_col[c].rc_tried = 0;
526168404Spjd		rm->rm_col[c].rc_skipped = 0;
527219089Spjd
528219089Spjd		if (c >= acols)
529219089Spjd			rm->rm_col[c].rc_size = 0;
530219089Spjd		else if (c < bc)
531219089Spjd			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
532219089Spjd		else
533219089Spjd			rm->rm_col[c].rc_size = q << unit_shift;
534219089Spjd
535219089Spjd		asize += rm->rm_col[c].rc_size;
536168404Spjd	}
537168404Spjd
538219089Spjd	ASSERT3U(asize, ==, tot << unit_shift);
539219089Spjd	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
540219089Spjd	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
541219089Spjd	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
542219089Spjd	ASSERT3U(rm->rm_nskip, <=, nparity);
543168404Spjd
544255750Sdelphij	if (!dofree) {
545240868Spjd		for (c = 0; c < rm->rm_firstdatacol; c++) {
546240868Spjd			rm->rm_col[c].rc_data =
547240868Spjd			    zio_buf_alloc(rm->rm_col[c].rc_size);
548240868Spjd		}
549168404Spjd
550255750Sdelphij		rm->rm_col[c].rc_data = data;
551168404Spjd
552240868Spjd		for (c = c + 1; c < acols; c++) {
553240868Spjd			rm->rm_col[c].rc_data =
554240868Spjd			    (char *)rm->rm_col[c - 1].rc_data +
555240868Spjd			    rm->rm_col[c - 1].rc_size;
556240868Spjd		}
557240868Spjd	}
558168404Spjd
559168404Spjd	/*
560168404Spjd	 * If all data stored spans all columns, there's a danger that parity
561168404Spjd	 * will always be on the same device and, since parity isn't read
562168404Spjd	 * during normal operation, that that device's I/O bandwidth won't be
563168404Spjd	 * used effectively. We therefore switch the parity every 1MB.
564168404Spjd	 *
565168404Spjd	 * ... at least that was, ostensibly, the theory. As a practical
566168404Spjd	 * matter unless we juggle the parity between all devices evenly, we
567168404Spjd	 * won't see any benefit. Further, occasional writes that aren't a
568168404Spjd	 * multiple of the LCM of the number of children and the minimum
569168404Spjd	 * stripe width are sufficient to avoid pessimal behavior.
570168404Spjd	 * Unfortunately, this decision created an implicit on-disk format
571168404Spjd	 * requirement that we need to support for all eternity, but only
572168404Spjd	 * for single-parity RAID-Z.
573219089Spjd	 *
574219089Spjd	 * If we intend to skip a sector in the zeroth column for padding
575219089Spjd	 * we must make sure to note this swap. We will never intend to
576219089Spjd	 * skip the first column since at least one data and one parity
577219089Spjd	 * column must appear in each row.
578168404Spjd	 */
579168404Spjd	ASSERT(rm->rm_cols >= 2);
580168404Spjd	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
581168404Spjd
582255750Sdelphij	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
583168404Spjd		devidx = rm->rm_col[0].rc_devidx;
584168404Spjd		o = rm->rm_col[0].rc_offset;
585168404Spjd		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
586168404Spjd		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
587168404Spjd		rm->rm_col[1].rc_devidx = devidx;
588168404Spjd		rm->rm_col[1].rc_offset = o;
589219089Spjd
590219089Spjd		if (rm->rm_skipstart == 0)
591219089Spjd			rm->rm_skipstart = 1;
592168404Spjd	}
593168404Spjd
594168404Spjd	return (rm);
595168404Spjd}
596168404Spjd
597168404Spjdstatic void
598168404Spjdvdev_raidz_generate_parity_p(raidz_map_t *rm)
599168404Spjd{
600168404Spjd	uint64_t *p, *src, pcount, ccount, i;
601168404Spjd	int c;
602168404Spjd
603168404Spjd	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
604168404Spjd
605168404Spjd	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
606168404Spjd		src = rm->rm_col[c].rc_data;
607168404Spjd		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
608168404Spjd		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
609168404Spjd
610168404Spjd		if (c == rm->rm_firstdatacol) {
611168404Spjd			ASSERT(ccount == pcount);
612219089Spjd			for (i = 0; i < ccount; i++, src++, p++) {
613168404Spjd				*p = *src;
614168404Spjd			}
615168404Spjd		} else {
616168404Spjd			ASSERT(ccount <= pcount);
617219089Spjd			for (i = 0; i < ccount; i++, src++, p++) {
618168404Spjd				*p ^= *src;
619168404Spjd			}
620168404Spjd		}
621168404Spjd	}
622168404Spjd}
623168404Spjd
624168404Spjdstatic void
625168404Spjdvdev_raidz_generate_parity_pq(raidz_map_t *rm)
626168404Spjd{
627219089Spjd	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
628168404Spjd	int c;
629168404Spjd
630219089Spjd	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
631168404Spjd	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
632168404Spjd	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
633168404Spjd
634168404Spjd	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
635168404Spjd		src = rm->rm_col[c].rc_data;
636168404Spjd		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
637168404Spjd		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
638168404Spjd
639219089Spjd		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
640219089Spjd
641168404Spjd		if (c == rm->rm_firstdatacol) {
642219089Spjd			ASSERT(ccnt == pcnt || ccnt == 0);
643219089Spjd			for (i = 0; i < ccnt; i++, src++, p++, q++) {
644219089Spjd				*p = *src;
645168404Spjd				*q = *src;
646219089Spjd			}
647219089Spjd			for (; i < pcnt; i++, src++, p++, q++) {
648219089Spjd				*p = 0;
649219089Spjd				*q = 0;
650219089Spjd			}
651219089Spjd		} else {
652219089Spjd			ASSERT(ccnt <= pcnt);
653219089Spjd
654219089Spjd			/*
655219089Spjd			 * Apply the algorithm described above by multiplying
656219089Spjd			 * the previous result and adding in the new value.
657219089Spjd			 */
658219089Spjd			for (i = 0; i < ccnt; i++, src++, p++, q++) {
659219089Spjd				*p ^= *src;
660219089Spjd
661219089Spjd				VDEV_RAIDZ_64MUL_2(*q, mask);
662219089Spjd				*q ^= *src;
663219089Spjd			}
664219089Spjd
665219089Spjd			/*
666219089Spjd			 * Treat short columns as though they are full of 0s.
667219089Spjd			 * Note that there's therefore nothing needed for P.
668219089Spjd			 */
669219089Spjd			for (; i < pcnt; i++, q++) {
670219089Spjd				VDEV_RAIDZ_64MUL_2(*q, mask);
671219089Spjd			}
672219089Spjd		}
673219089Spjd	}
674219089Spjd}
675219089Spjd
676219089Spjdstatic void
677219089Spjdvdev_raidz_generate_parity_pqr(raidz_map_t *rm)
678219089Spjd{
679219089Spjd	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
680219089Spjd	int c;
681219089Spjd
682219089Spjd	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
683219089Spjd	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
684219089Spjd	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
685219089Spjd	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
686219089Spjd	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
687219089Spjd
688219089Spjd	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
689219089Spjd		src = rm->rm_col[c].rc_data;
690219089Spjd		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
691219089Spjd		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
692219089Spjd		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
693219089Spjd
694219089Spjd		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
695219089Spjd
696219089Spjd		if (c == rm->rm_firstdatacol) {
697219089Spjd			ASSERT(ccnt == pcnt || ccnt == 0);
698219089Spjd			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
699168404Spjd				*p = *src;
700219089Spjd				*q = *src;
701219089Spjd				*r = *src;
702168404Spjd			}
703219089Spjd			for (; i < pcnt; i++, src++, p++, q++, r++) {
704219089Spjd				*p = 0;
705168404Spjd				*q = 0;
706219089Spjd				*r = 0;
707168404Spjd			}
708168404Spjd		} else {
709219089Spjd			ASSERT(ccnt <= pcnt);
710168404Spjd
711168404Spjd			/*
712219089Spjd			 * Apply the algorithm described above by multiplying
713219089Spjd			 * the previous result and adding in the new value.
714168404Spjd			 */
715219089Spjd			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
716219089Spjd				*p ^= *src;
717219089Spjd
718219089Spjd				VDEV_RAIDZ_64MUL_2(*q, mask);
719168404Spjd				*q ^= *src;
720219089Spjd
721219089Spjd				VDEV_RAIDZ_64MUL_4(*r, mask);
722219089Spjd				*r ^= *src;
723168404Spjd			}
724168404Spjd
725168404Spjd			/*
726168404Spjd			 * Treat short columns as though they are full of 0s.
727219089Spjd			 * Note that there's therefore nothing needed for P.
728168404Spjd			 */
729219089Spjd			for (; i < pcnt; i++, q++, r++) {
730219089Spjd				VDEV_RAIDZ_64MUL_2(*q, mask);
731219089Spjd				VDEV_RAIDZ_64MUL_4(*r, mask);
732168404Spjd			}
733168404Spjd		}
734168404Spjd	}
735168404Spjd}
736168404Spjd
737219089Spjd/*
738219089Spjd * Generate RAID parity in the first virtual columns according to the number of
739219089Spjd * parity columns available.
740219089Spjd */
741168404Spjdstatic void
742219089Spjdvdev_raidz_generate_parity(raidz_map_t *rm)
743168404Spjd{
744219089Spjd	switch (rm->rm_firstdatacol) {
745219089Spjd	case 1:
746219089Spjd		vdev_raidz_generate_parity_p(rm);
747219089Spjd		break;
748219089Spjd	case 2:
749219089Spjd		vdev_raidz_generate_parity_pq(rm);
750219089Spjd		break;
751219089Spjd	case 3:
752219089Spjd		vdev_raidz_generate_parity_pqr(rm);
753219089Spjd		break;
754219089Spjd	default:
755219089Spjd		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
756219089Spjd	}
757219089Spjd}
758219089Spjd
759219089Spjdstatic int
760219089Spjdvdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
761219089Spjd{
762168404Spjd	uint64_t *dst, *src, xcount, ccount, count, i;
763219089Spjd	int x = tgts[0];
764168404Spjd	int c;
765168404Spjd
766219089Spjd	ASSERT(ntgts == 1);
767219089Spjd	ASSERT(x >= rm->rm_firstdatacol);
768219089Spjd	ASSERT(x < rm->rm_cols);
769219089Spjd
770168404Spjd	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
771168404Spjd	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
772168404Spjd	ASSERT(xcount > 0);
773168404Spjd
774168404Spjd	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
775168404Spjd	dst = rm->rm_col[x].rc_data;
776168404Spjd	for (i = 0; i < xcount; i++, dst++, src++) {
777168404Spjd		*dst = *src;
778168404Spjd	}
779168404Spjd
780168404Spjd	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
781168404Spjd		src = rm->rm_col[c].rc_data;
782168404Spjd		dst = rm->rm_col[x].rc_data;
783168404Spjd
784168404Spjd		if (c == x)
785168404Spjd			continue;
786168404Spjd
787168404Spjd		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
788168404Spjd		count = MIN(ccount, xcount);
789168404Spjd
790168404Spjd		for (i = 0; i < count; i++, dst++, src++) {
791168404Spjd			*dst ^= *src;
792168404Spjd		}
793168404Spjd	}
794219089Spjd
795219089Spjd	return (1 << VDEV_RAIDZ_P);
796168404Spjd}
797168404Spjd
798219089Spjdstatic int
799219089Spjdvdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
800168404Spjd{
801168404Spjd	uint64_t *dst, *src, xcount, ccount, count, mask, i;
802168404Spjd	uint8_t *b;
803219089Spjd	int x = tgts[0];
804168404Spjd	int c, j, exp;
805168404Spjd
806219089Spjd	ASSERT(ntgts == 1);
807219089Spjd
808168404Spjd	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
809168404Spjd	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
810168404Spjd
811168404Spjd	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
812168404Spjd		src = rm->rm_col[c].rc_data;
813168404Spjd		dst = rm->rm_col[x].rc_data;
814168404Spjd
815168404Spjd		if (c == x)
816168404Spjd			ccount = 0;
817168404Spjd		else
818168404Spjd			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
819168404Spjd
820168404Spjd		count = MIN(ccount, xcount);
821168404Spjd
822168404Spjd		if (c == rm->rm_firstdatacol) {
823168404Spjd			for (i = 0; i < count; i++, dst++, src++) {
824168404Spjd				*dst = *src;
825168404Spjd			}
826168404Spjd			for (; i < xcount; i++, dst++) {
827168404Spjd				*dst = 0;
828168404Spjd			}
829168404Spjd
830168404Spjd		} else {
831168404Spjd			for (i = 0; i < count; i++, dst++, src++) {
832219089Spjd				VDEV_RAIDZ_64MUL_2(*dst, mask);
833168404Spjd				*dst ^= *src;
834168404Spjd			}
835168404Spjd
836168404Spjd			for (; i < xcount; i++, dst++) {
837219089Spjd				VDEV_RAIDZ_64MUL_2(*dst, mask);
838168404Spjd			}
839168404Spjd		}
840168404Spjd	}
841168404Spjd
842168404Spjd	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
843168404Spjd	dst = rm->rm_col[x].rc_data;
844168404Spjd	exp = 255 - (rm->rm_cols - 1 - x);
845168404Spjd
846168404Spjd	for (i = 0; i < xcount; i++, dst++, src++) {
847168404Spjd		*dst ^= *src;
848168404Spjd		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
849168404Spjd			*b = vdev_raidz_exp2(*b, exp);
850168404Spjd		}
851168404Spjd	}
852219089Spjd
853219089Spjd	return (1 << VDEV_RAIDZ_Q);
854168404Spjd}
855168404Spjd
856219089Spjdstatic int
857219089Spjdvdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
858168404Spjd{
859168404Spjd	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
860168404Spjd	void *pdata, *qdata;
861168404Spjd	uint64_t xsize, ysize, i;
862219089Spjd	int x = tgts[0];
863219089Spjd	int y = tgts[1];
864168404Spjd
865219089Spjd	ASSERT(ntgts == 2);
866168404Spjd	ASSERT(x < y);
867168404Spjd	ASSERT(x >= rm->rm_firstdatacol);
868168404Spjd	ASSERT(y < rm->rm_cols);
869168404Spjd
870168404Spjd	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
871168404Spjd
872168404Spjd	/*
873168404Spjd	 * Move the parity data aside -- we're going to compute parity as
874168404Spjd	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
875168404Spjd	 * reuse the parity generation mechanism without trashing the actual
876168404Spjd	 * parity so we make those columns appear to be full of zeros by
877168404Spjd	 * setting their lengths to zero.
878168404Spjd	 */
879168404Spjd	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
880168404Spjd	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
881168404Spjd	xsize = rm->rm_col[x].rc_size;
882168404Spjd	ysize = rm->rm_col[y].rc_size;
883168404Spjd
884168404Spjd	rm->rm_col[VDEV_RAIDZ_P].rc_data =
885168404Spjd	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
886168404Spjd	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
887168404Spjd	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
888168404Spjd	rm->rm_col[x].rc_size = 0;
889168404Spjd	rm->rm_col[y].rc_size = 0;
890168404Spjd
891168404Spjd	vdev_raidz_generate_parity_pq(rm);
892168404Spjd
893168404Spjd	rm->rm_col[x].rc_size = xsize;
894168404Spjd	rm->rm_col[y].rc_size = ysize;
895168404Spjd
896168404Spjd	p = pdata;
897168404Spjd	q = qdata;
898168404Spjd	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
899168404Spjd	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
900168404Spjd	xd = rm->rm_col[x].rc_data;
901168404Spjd	yd = rm->rm_col[y].rc_data;
902168404Spjd
903168404Spjd	/*
904168404Spjd	 * We now have:
905168404Spjd	 *	Pxy = P + D_x + D_y
906168404Spjd	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
907168404Spjd	 *
908168404Spjd	 * We can then solve for D_x:
909168404Spjd	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
910168404Spjd	 * where
911168404Spjd	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
912168404Spjd	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
913168404Spjd	 *
914168404Spjd	 * With D_x in hand, we can easily solve for D_y:
915168404Spjd	 *	D_y = P + Pxy + D_x
916168404Spjd	 */
917168404Spjd
918168404Spjd	a = vdev_raidz_pow2[255 + x - y];
919168404Spjd	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
920168404Spjd	tmp = 255 - vdev_raidz_log2[a ^ 1];
921168404Spjd
922168404Spjd	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
923168404Spjd	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
924168404Spjd
925168404Spjd	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
926168404Spjd		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
927168404Spjd		    vdev_raidz_exp2(*q ^ *qxy, bexp);
928168404Spjd
929168404Spjd		if (i < ysize)
930168404Spjd			*yd = *p ^ *pxy ^ *xd;
931168404Spjd	}
932168404Spjd
933168404Spjd	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
934168404Spjd	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
935168404Spjd	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
936168404Spjd	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
937168404Spjd
938168404Spjd	/*
939168404Spjd	 * Restore the saved parity data.
940168404Spjd	 */
941168404Spjd	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
942168404Spjd	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
943219089Spjd
944219089Spjd	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
945168404Spjd}
946168404Spjd
947219089Spjd/* BEGIN CSTYLED */
948219089Spjd/*
949219089Spjd * In the general case of reconstruction, we must solve the system of linear
950219089Spjd * equations defined by the coeffecients used to generate parity as well as
951219089Spjd * the contents of the data and parity disks. This can be expressed with
952219089Spjd * vectors for the original data (D) and the actual data (d) and parity (p)
953219089Spjd * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
954219089Spjd *
955219089Spjd *            __   __                     __     __
956219089Spjd *            |     |         __     __   |  p_0  |
957219089Spjd *            |  V  |         |  D_0  |   | p_m-1 |
958219089Spjd *            |     |    x    |   :   | = |  d_0  |
959219089Spjd *            |  I  |         | D_n-1 |   |   :   |
960219089Spjd *            |     |         ~~     ~~   | d_n-1 |
961219089Spjd *            ~~   ~~                     ~~     ~~
962219089Spjd *
963219089Spjd * I is simply a square identity matrix of size n, and V is a vandermonde
964219089Spjd * matrix defined by the coeffecients we chose for the various parity columns
965219089Spjd * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
966219089Spjd * computation as well as linear separability.
967219089Spjd *
968219089Spjd *      __               __               __     __
969219089Spjd *      |   1   ..  1 1 1 |               |  p_0  |
970219089Spjd *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
971219089Spjd *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
972219089Spjd *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
973219089Spjd *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
974219089Spjd *      |   :       : : : |   |   :   |   |  d_2  |
975219089Spjd *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
976219089Spjd *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
977219089Spjd *      |   0   ..  0 0 1 |               | d_n-1 |
978219089Spjd *      ~~               ~~               ~~     ~~
979219089Spjd *
980219089Spjd * Note that I, V, d, and p are known. To compute D, we must invert the
981219089Spjd * matrix and use the known data and parity values to reconstruct the unknown
982219089Spjd * data values. We begin by removing the rows in V|I and d|p that correspond
983219089Spjd * to failed or missing columns; we then make V|I square (n x n) and d|p
984219089Spjd * sized n by removing rows corresponding to unused parity from the bottom up
985219089Spjd * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
986219089Spjd * using Gauss-Jordan elimination. In the example below we use m=3 parity
987219089Spjd * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
988219089Spjd *           __                               __
989219089Spjd *           |  1   1   1   1   1   1   1   1  |
990219089Spjd *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
991219089Spjd *           |  19 205 116  29  64  16  4   1  |      / /
992219089Spjd *           |  1   0   0   0   0   0   0   0  |     / /
993219089Spjd *           |  0   1   0   0   0   0   0   0  | <--' /
994219089Spjd *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
995219089Spjd *           |  0   0   0   1   0   0   0   0  |
996219089Spjd *           |  0   0   0   0   1   0   0   0  |
997219089Spjd *           |  0   0   0   0   0   1   0   0  |
998219089Spjd *           |  0   0   0   0   0   0   1   0  |
999219089Spjd *           |  0   0   0   0   0   0   0   1  |
1000219089Spjd *           ~~                               ~~
1001219089Spjd *           __                               __
1002219089Spjd *           |  1   1   1   1   1   1   1   1  |
1003219089Spjd *           |  19 205 116  29  64  16  4   1  |
1004219089Spjd *           |  1   0   0   0   0   0   0   0  |
1005255750Sdelphij *  (V|I)' = |  0   0   0   1   0   0   0   0  |
1006219089Spjd *           |  0   0   0   0   1   0   0   0  |
1007219089Spjd *           |  0   0   0   0   0   1   0   0  |
1008219089Spjd *           |  0   0   0   0   0   0   1   0  |
1009219089Spjd *           |  0   0   0   0   0   0   0   1  |
1010219089Spjd *           ~~                               ~~
1011219089Spjd *
1012219089Spjd * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1013219089Spjd * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1014219089Spjd * matrix is not singular.
1015219089Spjd * __                                                                 __
1016219089Spjd * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1017219089Spjd * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1018219089Spjd * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1019219089Spjd * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1020219089Spjd * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1021219089Spjd * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1022219089Spjd * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1023219089Spjd * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1024219089Spjd * ~~                                                                 ~~
1025219089Spjd * __                                                                 __
1026219089Spjd * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1027219089Spjd * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1028219089Spjd * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1029219089Spjd * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1030219089Spjd * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1031219089Spjd * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1032219089Spjd * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1033219089Spjd * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1034219089Spjd * ~~                                                                 ~~
1035219089Spjd * __                                                                 __
1036219089Spjd * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1037219089Spjd * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1038219089Spjd * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1039219089Spjd * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1040219089Spjd * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1041219089Spjd * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1042219089Spjd * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1043219089Spjd * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1044219089Spjd * ~~                                                                 ~~
1045219089Spjd * __                                                                 __
1046219089Spjd * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1047219089Spjd * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1048219089Spjd * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1049219089Spjd * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1050219089Spjd * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1051219089Spjd * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1052219089Spjd * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1053219089Spjd * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1054219089Spjd * ~~                                                                 ~~
1055219089Spjd * __                                                                 __
1056219089Spjd * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1057219089Spjd * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1058219089Spjd * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1059219089Spjd * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1060219089Spjd * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1061219089Spjd * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1062219089Spjd * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1063219089Spjd * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1064219089Spjd * ~~                                                                 ~~
1065219089Spjd * __                                                                 __
1066219089Spjd * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1067219089Spjd * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1068219089Spjd * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1069219089Spjd * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1070219089Spjd * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1071219089Spjd * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1072219089Spjd * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1073219089Spjd * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1074219089Spjd * ~~                                                                 ~~
1075219089Spjd *                   __                               __
1076219089Spjd *                   |  0   0   1   0   0   0   0   0  |
1077219089Spjd *                   | 167 100  5   41 159 169 217 208 |
1078219089Spjd *                   | 166 100  4   40 158 168 216 209 |
1079219089Spjd *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1080219089Spjd *                   |  0   0   0   0   1   0   0   0  |
1081219089Spjd *                   |  0   0   0   0   0   1   0   0  |
1082219089Spjd *                   |  0   0   0   0   0   0   1   0  |
1083219089Spjd *                   |  0   0   0   0   0   0   0   1  |
1084219089Spjd *                   ~~                               ~~
1085219089Spjd *
1086219089Spjd * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1087219089Spjd * of the missing data.
1088219089Spjd *
1089219089Spjd * As is apparent from the example above, the only non-trivial rows in the
1090219089Spjd * inverse matrix correspond to the data disks that we're trying to
1091219089Spjd * reconstruct. Indeed, those are the only rows we need as the others would
1092219089Spjd * only be useful for reconstructing data known or assumed to be valid. For
1093219089Spjd * that reason, we only build the coefficients in the rows that correspond to
1094219089Spjd * targeted columns.
1095219089Spjd */
1096219089Spjd/* END CSTYLED */
1097168404Spjd
1098219089Spjdstatic void
1099219089Spjdvdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1100219089Spjd    uint8_t **rows)
1101219089Spjd{
1102219089Spjd	int i, j;
1103219089Spjd	int pow;
1104219089Spjd
1105219089Spjd	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1106219089Spjd
1107219089Spjd	/*
1108219089Spjd	 * Fill in the missing rows of interest.
1109219089Spjd	 */
1110219089Spjd	for (i = 0; i < nmap; i++) {
1111219089Spjd		ASSERT3S(0, <=, map[i]);
1112219089Spjd		ASSERT3S(map[i], <=, 2);
1113219089Spjd
1114219089Spjd		pow = map[i] * n;
1115219089Spjd		if (pow > 255)
1116219089Spjd			pow -= 255;
1117219089Spjd		ASSERT(pow <= 255);
1118219089Spjd
1119219089Spjd		for (j = 0; j < n; j++) {
1120219089Spjd			pow -= map[i];
1121219089Spjd			if (pow < 0)
1122219089Spjd				pow += 255;
1123219089Spjd			rows[i][j] = vdev_raidz_pow2[pow];
1124219089Spjd		}
1125219089Spjd	}
1126219089Spjd}
1127219089Spjd
1128219089Spjdstatic void
1129219089Spjdvdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1130219089Spjd    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1131219089Spjd{
1132219089Spjd	int i, j, ii, jj;
1133219089Spjd	uint8_t log;
1134219089Spjd
1135219089Spjd	/*
1136219089Spjd	 * Assert that the first nmissing entries from the array of used
1137219089Spjd	 * columns correspond to parity columns and that subsequent entries
1138219089Spjd	 * correspond to data columns.
1139219089Spjd	 */
1140219089Spjd	for (i = 0; i < nmissing; i++) {
1141219089Spjd		ASSERT3S(used[i], <, rm->rm_firstdatacol);
1142219089Spjd	}
1143219089Spjd	for (; i < n; i++) {
1144219089Spjd		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1145219089Spjd	}
1146219089Spjd
1147219089Spjd	/*
1148219089Spjd	 * First initialize the storage where we'll compute the inverse rows.
1149219089Spjd	 */
1150219089Spjd	for (i = 0; i < nmissing; i++) {
1151219089Spjd		for (j = 0; j < n; j++) {
1152219089Spjd			invrows[i][j] = (i == j) ? 1 : 0;
1153219089Spjd		}
1154219089Spjd	}
1155219089Spjd
1156219089Spjd	/*
1157219089Spjd	 * Subtract all trivial rows from the rows of consequence.
1158219089Spjd	 */
1159219089Spjd	for (i = 0; i < nmissing; i++) {
1160219089Spjd		for (j = nmissing; j < n; j++) {
1161219089Spjd			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1162219089Spjd			jj = used[j] - rm->rm_firstdatacol;
1163219089Spjd			ASSERT3S(jj, <, n);
1164219089Spjd			invrows[i][j] = rows[i][jj];
1165219089Spjd			rows[i][jj] = 0;
1166219089Spjd		}
1167219089Spjd	}
1168219089Spjd
1169219089Spjd	/*
1170219089Spjd	 * For each of the rows of interest, we must normalize it and subtract
1171219089Spjd	 * a multiple of it from the other rows.
1172219089Spjd	 */
1173219089Spjd	for (i = 0; i < nmissing; i++) {
1174219089Spjd		for (j = 0; j < missing[i]; j++) {
1175240415Smm			ASSERT0(rows[i][j]);
1176219089Spjd		}
1177219089Spjd		ASSERT3U(rows[i][missing[i]], !=, 0);
1178219089Spjd
1179219089Spjd		/*
1180219089Spjd		 * Compute the inverse of the first element and multiply each
1181219089Spjd		 * element in the row by that value.
1182219089Spjd		 */
1183219089Spjd		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1184219089Spjd
1185219089Spjd		for (j = 0; j < n; j++) {
1186219089Spjd			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1187219089Spjd			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1188219089Spjd		}
1189219089Spjd
1190219089Spjd		for (ii = 0; ii < nmissing; ii++) {
1191219089Spjd			if (i == ii)
1192219089Spjd				continue;
1193219089Spjd
1194219089Spjd			ASSERT3U(rows[ii][missing[i]], !=, 0);
1195219089Spjd
1196219089Spjd			log = vdev_raidz_log2[rows[ii][missing[i]]];
1197219089Spjd
1198219089Spjd			for (j = 0; j < n; j++) {
1199219089Spjd				rows[ii][j] ^=
1200219089Spjd				    vdev_raidz_exp2(rows[i][j], log);
1201219089Spjd				invrows[ii][j] ^=
1202219089Spjd				    vdev_raidz_exp2(invrows[i][j], log);
1203219089Spjd			}
1204219089Spjd		}
1205219089Spjd	}
1206219089Spjd
1207219089Spjd	/*
1208219089Spjd	 * Verify that the data that is left in the rows are properly part of
1209219089Spjd	 * an identity matrix.
1210219089Spjd	 */
1211219089Spjd	for (i = 0; i < nmissing; i++) {
1212219089Spjd		for (j = 0; j < n; j++) {
1213219089Spjd			if (j == missing[i]) {
1214219089Spjd				ASSERT3U(rows[i][j], ==, 1);
1215219089Spjd			} else {
1216240415Smm				ASSERT0(rows[i][j]);
1217219089Spjd			}
1218219089Spjd		}
1219219089Spjd	}
1220219089Spjd}
1221219089Spjd
1222219089Spjdstatic void
1223219089Spjdvdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1224219089Spjd    int *missing, uint8_t **invrows, const uint8_t *used)
1225219089Spjd{
1226219089Spjd	int i, j, x, cc, c;
1227219089Spjd	uint8_t *src;
1228219089Spjd	uint64_t ccount;
1229219089Spjd	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1230219089Spjd	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1231247187Smm	uint8_t log = 0;
1232247187Smm	uint8_t val;
1233219089Spjd	int ll;
1234219089Spjd	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1235219089Spjd	uint8_t *p, *pp;
1236219089Spjd	size_t psize;
1237219089Spjd
1238219089Spjd	psize = sizeof (invlog[0][0]) * n * nmissing;
1239219089Spjd	p = kmem_alloc(psize, KM_SLEEP);
1240219089Spjd
1241219089Spjd	for (pp = p, i = 0; i < nmissing; i++) {
1242219089Spjd		invlog[i] = pp;
1243219089Spjd		pp += n;
1244219089Spjd	}
1245219089Spjd
1246219089Spjd	for (i = 0; i < nmissing; i++) {
1247219089Spjd		for (j = 0; j < n; j++) {
1248219089Spjd			ASSERT3U(invrows[i][j], !=, 0);
1249219089Spjd			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1250219089Spjd		}
1251219089Spjd	}
1252219089Spjd
1253219089Spjd	for (i = 0; i < n; i++) {
1254219089Spjd		c = used[i];
1255219089Spjd		ASSERT3U(c, <, rm->rm_cols);
1256219089Spjd
1257219089Spjd		src = rm->rm_col[c].rc_data;
1258219089Spjd		ccount = rm->rm_col[c].rc_size;
1259219089Spjd		for (j = 0; j < nmissing; j++) {
1260219089Spjd			cc = missing[j] + rm->rm_firstdatacol;
1261219089Spjd			ASSERT3U(cc, >=, rm->rm_firstdatacol);
1262219089Spjd			ASSERT3U(cc, <, rm->rm_cols);
1263219089Spjd			ASSERT3U(cc, !=, c);
1264219089Spjd
1265219089Spjd			dst[j] = rm->rm_col[cc].rc_data;
1266219089Spjd			dcount[j] = rm->rm_col[cc].rc_size;
1267219089Spjd		}
1268219089Spjd
1269219089Spjd		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1270219089Spjd
1271219089Spjd		for (x = 0; x < ccount; x++, src++) {
1272219089Spjd			if (*src != 0)
1273219089Spjd				log = vdev_raidz_log2[*src];
1274219089Spjd
1275219089Spjd			for (cc = 0; cc < nmissing; cc++) {
1276219089Spjd				if (x >= dcount[cc])
1277219089Spjd					continue;
1278219089Spjd
1279219089Spjd				if (*src == 0) {
1280219089Spjd					val = 0;
1281219089Spjd				} else {
1282219089Spjd					if ((ll = log + invlog[cc][i]) >= 255)
1283219089Spjd						ll -= 255;
1284219089Spjd					val = vdev_raidz_pow2[ll];
1285219089Spjd				}
1286219089Spjd
1287219089Spjd				if (i == 0)
1288219089Spjd					dst[cc][x] = val;
1289219089Spjd				else
1290219089Spjd					dst[cc][x] ^= val;
1291219089Spjd			}
1292219089Spjd		}
1293219089Spjd	}
1294219089Spjd
1295219089Spjd	kmem_free(p, psize);
1296219089Spjd}
1297219089Spjd
1298168404Spjdstatic int
1299219089Spjdvdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1300219089Spjd{
1301219089Spjd	int n, i, c, t, tt;
1302219089Spjd	int nmissing_rows;
1303219089Spjd	int missing_rows[VDEV_RAIDZ_MAXPARITY];
1304219089Spjd	int parity_map[VDEV_RAIDZ_MAXPARITY];
1305219089Spjd
1306219089Spjd	uint8_t *p, *pp;
1307219089Spjd	size_t psize;
1308219089Spjd
1309219089Spjd	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1310219089Spjd	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1311219089Spjd	uint8_t *used;
1312219089Spjd
1313219089Spjd	int code = 0;
1314219089Spjd
1315219089Spjd
1316219089Spjd	n = rm->rm_cols - rm->rm_firstdatacol;
1317219089Spjd
1318219089Spjd	/*
1319219089Spjd	 * Figure out which data columns are missing.
1320219089Spjd	 */
1321219089Spjd	nmissing_rows = 0;
1322219089Spjd	for (t = 0; t < ntgts; t++) {
1323219089Spjd		if (tgts[t] >= rm->rm_firstdatacol) {
1324219089Spjd			missing_rows[nmissing_rows++] =
1325219089Spjd			    tgts[t] - rm->rm_firstdatacol;
1326219089Spjd		}
1327219089Spjd	}
1328219089Spjd
1329219089Spjd	/*
1330219089Spjd	 * Figure out which parity columns to use to help generate the missing
1331219089Spjd	 * data columns.
1332219089Spjd	 */
1333219089Spjd	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1334219089Spjd		ASSERT(tt < ntgts);
1335219089Spjd		ASSERT(c < rm->rm_firstdatacol);
1336219089Spjd
1337219089Spjd		/*
1338219089Spjd		 * Skip any targeted parity columns.
1339219089Spjd		 */
1340219089Spjd		if (c == tgts[tt]) {
1341219089Spjd			tt++;
1342219089Spjd			continue;
1343219089Spjd		}
1344219089Spjd
1345219089Spjd		code |= 1 << c;
1346219089Spjd
1347219089Spjd		parity_map[i] = c;
1348219089Spjd		i++;
1349219089Spjd	}
1350219089Spjd
1351219089Spjd	ASSERT(code != 0);
1352219089Spjd	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1353219089Spjd
1354219089Spjd	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1355219089Spjd	    nmissing_rows * n + sizeof (used[0]) * n;
1356219089Spjd	p = kmem_alloc(psize, KM_SLEEP);
1357219089Spjd
1358219089Spjd	for (pp = p, i = 0; i < nmissing_rows; i++) {
1359219089Spjd		rows[i] = pp;
1360219089Spjd		pp += n;
1361219089Spjd		invrows[i] = pp;
1362219089Spjd		pp += n;
1363219089Spjd	}
1364219089Spjd	used = pp;
1365219089Spjd
1366219089Spjd	for (i = 0; i < nmissing_rows; i++) {
1367219089Spjd		used[i] = parity_map[i];
1368219089Spjd	}
1369219089Spjd
1370219089Spjd	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1371219089Spjd		if (tt < nmissing_rows &&
1372219089Spjd		    c == missing_rows[tt] + rm->rm_firstdatacol) {
1373219089Spjd			tt++;
1374219089Spjd			continue;
1375219089Spjd		}
1376219089Spjd
1377219089Spjd		ASSERT3S(i, <, n);
1378219089Spjd		used[i] = c;
1379219089Spjd		i++;
1380219089Spjd	}
1381219089Spjd
1382219089Spjd	/*
1383219089Spjd	 * Initialize the interesting rows of the matrix.
1384219089Spjd	 */
1385219089Spjd	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1386219089Spjd
1387219089Spjd	/*
1388219089Spjd	 * Invert the matrix.
1389219089Spjd	 */
1390219089Spjd	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1391219089Spjd	    invrows, used);
1392219089Spjd
1393219089Spjd	/*
1394219089Spjd	 * Reconstruct the missing data using the generated matrix.
1395219089Spjd	 */
1396219089Spjd	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1397219089Spjd	    invrows, used);
1398219089Spjd
1399219089Spjd	kmem_free(p, psize);
1400219089Spjd
1401219089Spjd	return (code);
1402219089Spjd}
1403219089Spjd
1404219089Spjdstatic int
1405219089Spjdvdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1406219089Spjd{
1407219089Spjd	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1408219089Spjd	int ntgts;
1409219089Spjd	int i, c;
1410219089Spjd	int code;
1411219089Spjd	int nbadparity, nbaddata;
1412219089Spjd	int parity_valid[VDEV_RAIDZ_MAXPARITY];
1413219089Spjd
1414219089Spjd	/*
1415219089Spjd	 * The tgts list must already be sorted.
1416219089Spjd	 */
1417219089Spjd	for (i = 1; i < nt; i++) {
1418219089Spjd		ASSERT(t[i] > t[i - 1]);
1419219089Spjd	}
1420219089Spjd
1421219089Spjd	nbadparity = rm->rm_firstdatacol;
1422219089Spjd	nbaddata = rm->rm_cols - nbadparity;
1423219089Spjd	ntgts = 0;
1424219089Spjd	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1425219089Spjd		if (c < rm->rm_firstdatacol)
1426219089Spjd			parity_valid[c] = B_FALSE;
1427219089Spjd
1428219089Spjd		if (i < nt && c == t[i]) {
1429219089Spjd			tgts[ntgts++] = c;
1430219089Spjd			i++;
1431219089Spjd		} else if (rm->rm_col[c].rc_error != 0) {
1432219089Spjd			tgts[ntgts++] = c;
1433219089Spjd		} else if (c >= rm->rm_firstdatacol) {
1434219089Spjd			nbaddata--;
1435219089Spjd		} else {
1436219089Spjd			parity_valid[c] = B_TRUE;
1437219089Spjd			nbadparity--;
1438219089Spjd		}
1439219089Spjd	}
1440219089Spjd
1441219089Spjd	ASSERT(ntgts >= nt);
1442219089Spjd	ASSERT(nbaddata >= 0);
1443219089Spjd	ASSERT(nbaddata + nbadparity == ntgts);
1444219089Spjd
1445219089Spjd	dt = &tgts[nbadparity];
1446219089Spjd
1447219089Spjd	/*
1448219089Spjd	 * See if we can use any of our optimized reconstruction routines.
1449219089Spjd	 */
1450219089Spjd	if (!vdev_raidz_default_to_general) {
1451219089Spjd		switch (nbaddata) {
1452219089Spjd		case 1:
1453219089Spjd			if (parity_valid[VDEV_RAIDZ_P])
1454219089Spjd				return (vdev_raidz_reconstruct_p(rm, dt, 1));
1455219089Spjd
1456219089Spjd			ASSERT(rm->rm_firstdatacol > 1);
1457219089Spjd
1458219089Spjd			if (parity_valid[VDEV_RAIDZ_Q])
1459219089Spjd				return (vdev_raidz_reconstruct_q(rm, dt, 1));
1460219089Spjd
1461219089Spjd			ASSERT(rm->rm_firstdatacol > 2);
1462219089Spjd			break;
1463219089Spjd
1464219089Spjd		case 2:
1465219089Spjd			ASSERT(rm->rm_firstdatacol > 1);
1466219089Spjd
1467219089Spjd			if (parity_valid[VDEV_RAIDZ_P] &&
1468219089Spjd			    parity_valid[VDEV_RAIDZ_Q])
1469219089Spjd				return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1470219089Spjd
1471219089Spjd			ASSERT(rm->rm_firstdatacol > 2);
1472219089Spjd
1473219089Spjd			break;
1474219089Spjd		}
1475219089Spjd	}
1476219089Spjd
1477219089Spjd	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1478219089Spjd	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1479219089Spjd	ASSERT(code > 0);
1480219089Spjd	return (code);
1481219089Spjd}
1482219089Spjd
1483219089Spjdstatic int
1484236155Smmvdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1485254591Sgibbs    uint64_t *logical_ashift, uint64_t *physical_ashift)
1486168404Spjd{
1487168404Spjd	vdev_t *cvd;
1488168404Spjd	uint64_t nparity = vd->vdev_nparity;
1489219089Spjd	int c;
1490168404Spjd	int lasterror = 0;
1491168404Spjd	int numerrors = 0;
1492168404Spjd
1493168404Spjd	ASSERT(nparity > 0);
1494168404Spjd
1495168404Spjd	if (nparity > VDEV_RAIDZ_MAXPARITY ||
1496168404Spjd	    vd->vdev_children < nparity + 1) {
1497168404Spjd		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1498249195Smm		return (SET_ERROR(EINVAL));
1499168404Spjd	}
1500168404Spjd
1501219089Spjd	vdev_open_children(vd);
1502219089Spjd
1503168404Spjd	for (c = 0; c < vd->vdev_children; c++) {
1504168404Spjd		cvd = vd->vdev_child[c];
1505168404Spjd
1506219089Spjd		if (cvd->vdev_open_error != 0) {
1507219089Spjd			lasterror = cvd->vdev_open_error;
1508168404Spjd			numerrors++;
1509168404Spjd			continue;
1510168404Spjd		}
1511168404Spjd
1512168404Spjd		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1513236155Smm		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1514254591Sgibbs		*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
1515254591Sgibbs		*physical_ashift = MAX(*physical_ashift,
1516254591Sgibbs		    cvd->vdev_physical_ashift);
1517168404Spjd	}
1518168404Spjd
1519168404Spjd	*asize *= vd->vdev_children;
1520236155Smm	*max_asize *= vd->vdev_children;
1521168404Spjd
1522168404Spjd	if (numerrors > nparity) {
1523168404Spjd		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1524168404Spjd		return (lasterror);
1525168404Spjd	}
1526168404Spjd
1527168404Spjd	return (0);
1528168404Spjd}
1529168404Spjd
1530168404Spjdstatic void
1531168404Spjdvdev_raidz_close(vdev_t *vd)
1532168404Spjd{
1533168404Spjd	int c;
1534168404Spjd
1535168404Spjd	for (c = 0; c < vd->vdev_children; c++)
1536168404Spjd		vdev_close(vd->vdev_child[c]);
1537168404Spjd}
1538168404Spjd
1539255750Sdelphij#ifdef illumos
1540255750Sdelphij/*
1541255750Sdelphij * Handle a read or write I/O to a RAID-Z dump device.
1542255750Sdelphij *
1543255750Sdelphij * The dump device is in a unique situation compared to other ZFS datasets:
1544255750Sdelphij * writing to this device should be as simple and fast as possible.  In
1545255750Sdelphij * addition, durability matters much less since the dump will be extracted
1546255750Sdelphij * once the machine reboots.  For that reason, this function eschews parity for
1547255750Sdelphij * performance and simplicity.  The dump device uses the checksum setting
1548255750Sdelphij * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
1549255750Sdelphij * dataset.
1550255750Sdelphij *
1551255750Sdelphij * Blocks of size 128 KB have been preallocated for this volume.  I/Os less than
1552255750Sdelphij * 128 KB will not fill an entire block; in addition, they may not be properly
1553255750Sdelphij * aligned.  In that case, this function uses the preallocated 128 KB block and
1554255750Sdelphij * omits reading or writing any "empty" portions of that block, as opposed to
1555255750Sdelphij * allocating a fresh appropriately-sized block.
1556255750Sdelphij *
1557255750Sdelphij * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
1558255750Sdelphij *
1559255750Sdelphij *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
1560255750Sdelphij *
1561255750Sdelphij * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
1562255750Sdelphij * allocated which spans all five child vdevs.  8 KB of data would be written to
1563255750Sdelphij * each of four vdevs, with the fifth containing the parity bits.
1564255750Sdelphij *
1565255750Sdelphij *       parity    data     data     data     data
1566255750Sdelphij *     |   PP   |   XX   |   XX   |   XX   |   XX   |
1567255750Sdelphij *         ^        ^        ^        ^        ^
1568255750Sdelphij *         |        |        |        |        |
1569255750Sdelphij *   8 KB parity    ------8 KB data blocks------
1570255750Sdelphij *
1571255750Sdelphij * However, when writing to the dump device, the behavior is different:
1572255750Sdelphij *
1573255750Sdelphij *     vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
1574255750Sdelphij *
1575255750Sdelphij * Unlike the normal RAID-Z case in which the block is allocated based on the
1576255750Sdelphij * I/O size, reads and writes here always use a 128 KB logical I/O size.  If the
1577255750Sdelphij * I/O size is less than 128 KB, only the actual portions of data are written.
1578255750Sdelphij * In this example the data is written to the third data vdev since that vdev
1579255750Sdelphij * contains the offset [64 KB, 96 KB).
1580255750Sdelphij *
1581255750Sdelphij *       parity    data     data     data     data
1582255750Sdelphij *     |        |        |        |   XX   |        |
1583255750Sdelphij *                                    ^
1584255750Sdelphij *                                    |
1585255750Sdelphij *                             32 KB data block
1586255750Sdelphij *
1587255750Sdelphij * As a result, an individual I/O may not span all child vdevs; moreover, a
1588255750Sdelphij * small I/O may only operate on a single child vdev.
1589255750Sdelphij *
1590255750Sdelphij * Note that since there are no parity bits calculated or written, this format
1591255750Sdelphij * remains the same no matter how many parity bits are used in a normal RAID-Z
1592255750Sdelphij * stripe.  On a RAID-Z3 configuration with seven child vdevs, the example above
1593255750Sdelphij * would look like:
1594255750Sdelphij *
1595255750Sdelphij *       parity   parity   parity    data     data     data     data
1596255750Sdelphij *     |        |        |        |        |        |   XX   |        |
1597255750Sdelphij *                                                      ^
1598255750Sdelphij *                                                      |
1599255750Sdelphij *                                               32 KB data block
1600255750Sdelphij */
1601255750Sdelphijint
1602255750Sdelphijvdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
1603255750Sdelphij    uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
1604255750Sdelphij{
1605255750Sdelphij	vdev_t *tvd = vd->vdev_top;
1606255750Sdelphij	vdev_t *cvd;
1607255750Sdelphij	raidz_map_t *rm;
1608255750Sdelphij	raidz_col_t *rc;
1609255750Sdelphij	int c, err = 0;
1610255750Sdelphij
1611255750Sdelphij	uint64_t start, end, colstart, colend;
1612255750Sdelphij	uint64_t coloffset, colsize, colskip;
1613255750Sdelphij
1614255750Sdelphij	int flags = doread ? BIO_READ : BIO_WRITE;
1615255750Sdelphij
1616255750Sdelphij#ifdef	_KERNEL
1617255750Sdelphij
1618255750Sdelphij	/*
1619255750Sdelphij	 * Don't write past the end of the block
1620255750Sdelphij	 */
1621255750Sdelphij	VERIFY3U(offset + size, <=, origoffset + SPA_MAXBLOCKSIZE);
1622255750Sdelphij
1623255750Sdelphij	start = offset;
1624255750Sdelphij	end = start + size;
1625255750Sdelphij
1626255750Sdelphij	/*
1627255750Sdelphij	 * Allocate a RAID-Z map for this block.  Note that this block starts
1628255750Sdelphij	 * from the "original" offset, this is, the offset of the extent which
1629255750Sdelphij	 * contains the requisite offset of the data being read or written.
1630255750Sdelphij	 *
1631255750Sdelphij	 * Even if this I/O operation doesn't span the full block size, let's
1632255750Sdelphij	 * treat the on-disk format as if the only blocks are the complete 128
1633255750Sdelphij	 * KB size.
1634255750Sdelphij	 */
1635255750Sdelphij	rm = vdev_raidz_map_alloc(data - (offset - origoffset),
1636255750Sdelphij	    SPA_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift, vd->vdev_children,
1637255750Sdelphij	    vd->vdev_nparity);
1638255750Sdelphij
1639255750Sdelphij	coloffset = origoffset;
1640255750Sdelphij
1641255750Sdelphij	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
1642255750Sdelphij	    c++, coloffset += rc->rc_size) {
1643255750Sdelphij		rc = &rm->rm_col[c];
1644255750Sdelphij		cvd = vd->vdev_child[rc->rc_devidx];
1645255750Sdelphij
1646255750Sdelphij		/*
1647255750Sdelphij		 * Find the start and end of this column in the RAID-Z map,
1648255750Sdelphij		 * keeping in mind that the stated size and offset of the
1649255750Sdelphij		 * operation may not fill the entire column for this vdev.
1650255750Sdelphij		 *
1651255750Sdelphij		 * If any portion of the data spans this column, issue the
1652255750Sdelphij		 * appropriate operation to the vdev.
1653255750Sdelphij		 */
1654255750Sdelphij		if (coloffset + rc->rc_size <= start)
1655255750Sdelphij			continue;
1656255750Sdelphij		if (coloffset >= end)
1657255750Sdelphij			continue;
1658255750Sdelphij
1659255750Sdelphij		colstart = MAX(coloffset, start);
1660255750Sdelphij		colend = MIN(end, coloffset + rc->rc_size);
1661255750Sdelphij		colsize = colend - colstart;
1662255750Sdelphij		colskip = colstart - coloffset;
1663255750Sdelphij
1664255750Sdelphij		VERIFY3U(colsize, <=, rc->rc_size);
1665255750Sdelphij		VERIFY3U(colskip, <=, rc->rc_size);
1666255750Sdelphij
1667255750Sdelphij		/*
1668255750Sdelphij		 * Note that the child vdev will have a vdev label at the start
1669255750Sdelphij		 * of its range of offsets, hence the need for
1670255750Sdelphij		 * VDEV_LABEL_OFFSET().  See zio_vdev_child_io() for another
1671255750Sdelphij		 * example of why this calculation is needed.
1672255750Sdelphij		 */
1673255750Sdelphij		if ((err = vdev_disk_physio(cvd,
1674255750Sdelphij		    ((char *)rc->rc_data) + colskip, colsize,
1675255750Sdelphij		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
1676255750Sdelphij		    flags, isdump)) != 0)
1677255750Sdelphij			break;
1678255750Sdelphij	}
1679255750Sdelphij
1680255750Sdelphij	vdev_raidz_map_free(rm);
1681255750Sdelphij#endif	/* KERNEL */
1682255750Sdelphij
1683255750Sdelphij	return (err);
1684255750Sdelphij}
1685255750Sdelphij#endif
1686255750Sdelphij
1687168404Spjdstatic uint64_t
1688168404Spjdvdev_raidz_asize(vdev_t *vd, uint64_t psize)
1689168404Spjd{
1690168404Spjd	uint64_t asize;
1691168404Spjd	uint64_t ashift = vd->vdev_top->vdev_ashift;
1692168404Spjd	uint64_t cols = vd->vdev_children;
1693168404Spjd	uint64_t nparity = vd->vdev_nparity;
1694168404Spjd
1695168404Spjd	asize = ((psize - 1) >> ashift) + 1;
1696168404Spjd	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1697168404Spjd	asize = roundup(asize, nparity + 1) << ashift;
1698168404Spjd
1699168404Spjd	return (asize);
1700168404Spjd}
1701168404Spjd
1702168404Spjdstatic void
1703168404Spjdvdev_raidz_child_done(zio_t *zio)
1704168404Spjd{
1705168404Spjd	raidz_col_t *rc = zio->io_private;
1706168404Spjd
1707168404Spjd	rc->rc_error = zio->io_error;
1708168404Spjd	rc->rc_tried = 1;
1709168404Spjd	rc->rc_skipped = 0;
1710168404Spjd}
1711168404Spjd
1712251629Sdelphij/*
1713251629Sdelphij * Start an IO operation on a RAIDZ VDev
1714251629Sdelphij *
1715251629Sdelphij * Outline:
1716251629Sdelphij * - For write operations:
1717251629Sdelphij *   1. Generate the parity data
1718251629Sdelphij *   2. Create child zio write operations to each column's vdev, for both
1719251629Sdelphij *      data and parity.
1720251629Sdelphij *   3. If the column skips any sectors for padding, create optional dummy
1721251629Sdelphij *      write zio children for those areas to improve aggregation continuity.
1722251629Sdelphij * - For read operations:
1723251629Sdelphij *   1. Create child zio read operations to each data column's vdev to read
1724251629Sdelphij *      the range of data required for zio.
1725251629Sdelphij *   2. If this is a scrub or resilver operation, or if any of the data
1726251629Sdelphij *      vdevs have had errors, then create zio read operations to the parity
1727251629Sdelphij *      columns' VDevs as well.
1728251629Sdelphij */
1729185029Spjdstatic int
1730168404Spjdvdev_raidz_io_start(zio_t *zio)
1731168404Spjd{
1732168404Spjd	vdev_t *vd = zio->io_vd;
1733168404Spjd	vdev_t *tvd = vd->vdev_top;
1734168404Spjd	vdev_t *cvd;
1735168404Spjd	raidz_map_t *rm;
1736168404Spjd	raidz_col_t *rc;
1737219089Spjd	int c, i;
1738168404Spjd
1739255750Sdelphij	rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset,
1740255750Sdelphij	    zio->io_type == ZIO_TYPE_FREE,
1741255750Sdelphij	    tvd->vdev_ashift, vd->vdev_children,
1742168404Spjd	    vd->vdev_nparity);
1743168404Spjd
1744255750Sdelphij	zio->io_vsd = rm;
1745255750Sdelphij	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
1746255750Sdelphij
1747168404Spjd	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1748168404Spjd
1749240868Spjd	if (zio->io_type == ZIO_TYPE_FREE) {
1750240868Spjd		for (c = 0; c < rm->rm_cols; c++) {
1751240868Spjd			rc = &rm->rm_col[c];
1752240868Spjd			cvd = vd->vdev_child[rc->rc_devidx];
1753240868Spjd			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1754240868Spjd			    rc->rc_offset, rc->rc_data, rc->rc_size,
1755240868Spjd			    zio->io_type, zio->io_priority, 0,
1756240868Spjd			    vdev_raidz_child_done, rc));
1757240868Spjd		}
1758270312Ssmh
1759270312Ssmh		zio_interrupt(zio);
1760270312Ssmh		return (ZIO_PIPELINE_STOP);
1761240868Spjd	}
1762240868Spjd
1763168404Spjd	if (zio->io_type == ZIO_TYPE_WRITE) {
1764219089Spjd		vdev_raidz_generate_parity(rm);
1765168404Spjd
1766168404Spjd		for (c = 0; c < rm->rm_cols; c++) {
1767168404Spjd			rc = &rm->rm_col[c];
1768168404Spjd			cvd = vd->vdev_child[rc->rc_devidx];
1769168404Spjd			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1770168404Spjd			    rc->rc_offset, rc->rc_data, rc->rc_size,
1771185029Spjd			    zio->io_type, zio->io_priority, 0,
1772168404Spjd			    vdev_raidz_child_done, rc));
1773168404Spjd		}
1774185029Spjd
1775219089Spjd		/*
1776219089Spjd		 * Generate optional I/Os for any skipped sectors to improve
1777219089Spjd		 * aggregation contiguity.
1778219089Spjd		 */
1779219089Spjd		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1780219089Spjd			ASSERT(c <= rm->rm_scols);
1781219089Spjd			if (c == rm->rm_scols)
1782219089Spjd				c = 0;
1783219089Spjd			rc = &rm->rm_col[c];
1784219089Spjd			cvd = vd->vdev_child[rc->rc_devidx];
1785219089Spjd			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1786219089Spjd			    rc->rc_offset + rc->rc_size, NULL,
1787219089Spjd			    1 << tvd->vdev_ashift,
1788219089Spjd			    zio->io_type, zio->io_priority,
1789219089Spjd			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1790219089Spjd		}
1791219089Spjd
1792270312Ssmh		zio_interrupt(zio);
1793270312Ssmh		return (ZIO_PIPELINE_STOP);
1794168404Spjd	}
1795168404Spjd
1796168404Spjd	ASSERT(zio->io_type == ZIO_TYPE_READ);
1797168404Spjd
1798168404Spjd	/*
1799168404Spjd	 * Iterate over the columns in reverse order so that we hit the parity
1800219089Spjd	 * last -- any errors along the way will force us to read the parity.
1801168404Spjd	 */
1802168404Spjd	for (c = rm->rm_cols - 1; c >= 0; c--) {
1803168404Spjd		rc = &rm->rm_col[c];
1804168404Spjd		cvd = vd->vdev_child[rc->rc_devidx];
1805185029Spjd		if (!vdev_readable(cvd)) {
1806168404Spjd			if (c >= rm->rm_firstdatacol)
1807168404Spjd				rm->rm_missingdata++;
1808168404Spjd			else
1809168404Spjd				rm->rm_missingparity++;
1810249195Smm			rc->rc_error = SET_ERROR(ENXIO);
1811168404Spjd			rc->rc_tried = 1;	/* don't even try */
1812168404Spjd			rc->rc_skipped = 1;
1813168404Spjd			continue;
1814168404Spjd		}
1815219089Spjd		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1816168404Spjd			if (c >= rm->rm_firstdatacol)
1817168404Spjd				rm->rm_missingdata++;
1818168404Spjd			else
1819168404Spjd				rm->rm_missingparity++;
1820249195Smm			rc->rc_error = SET_ERROR(ESTALE);
1821168404Spjd			rc->rc_skipped = 1;
1822168404Spjd			continue;
1823168404Spjd		}
1824168404Spjd		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
1825209095Smm		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1826168404Spjd			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1827168404Spjd			    rc->rc_offset, rc->rc_data, rc->rc_size,
1828185029Spjd			    zio->io_type, zio->io_priority, 0,
1829168404Spjd			    vdev_raidz_child_done, rc));
1830168404Spjd		}
1831168404Spjd	}
1832168404Spjd
1833270312Ssmh	zio_interrupt(zio);
1834270312Ssmh	return (ZIO_PIPELINE_STOP);
1835168404Spjd}
1836168404Spjd
1837219089Spjd
1838168404Spjd/*
1839168404Spjd * Report a checksum error for a child of a RAID-Z device.
1840168404Spjd */
1841168404Spjdstatic void
1842219089Spjdraidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1843168404Spjd{
1844168404Spjd	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1845168404Spjd
1846168404Spjd	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1847219089Spjd		zio_bad_cksum_t zbc;
1848219089Spjd		raidz_map_t *rm = zio->io_vsd;
1849219089Spjd
1850168404Spjd		mutex_enter(&vd->vdev_stat_lock);
1851168404Spjd		vd->vdev_stat.vs_checksum_errors++;
1852168404Spjd		mutex_exit(&vd->vdev_stat_lock);
1853219089Spjd
1854219089Spjd		zbc.zbc_has_cksum = 0;
1855219089Spjd		zbc.zbc_injected = rm->rm_ecksuminjected;
1856219089Spjd
1857219089Spjd		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1858219089Spjd		    rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1859219089Spjd		    &zbc);
1860168404Spjd	}
1861219089Spjd}
1862168404Spjd
1863219089Spjd/*
1864219089Spjd * We keep track of whether or not there were any injected errors, so that
1865219089Spjd * any ereports we generate can note it.
1866219089Spjd */
1867219089Spjdstatic int
1868219089Spjdraidz_checksum_verify(zio_t *zio)
1869219089Spjd{
1870219089Spjd	zio_bad_cksum_t zbc;
1871219089Spjd	raidz_map_t *rm = zio->io_vsd;
1872219089Spjd
1873219089Spjd	int ret = zio_checksum_error(zio, &zbc);
1874219089Spjd	if (ret != 0 && zbc.zbc_injected != 0)
1875219089Spjd		rm->rm_ecksuminjected = 1;
1876219089Spjd
1877219089Spjd	return (ret);
1878168404Spjd}
1879168404Spjd
1880168404Spjd/*
1881168404Spjd * Generate the parity from the data columns. If we tried and were able to
1882168404Spjd * read the parity without error, verify that the generated parity matches the
1883168404Spjd * data we read. If it doesn't, we fire off a checksum error. Return the
1884168404Spjd * number such failures.
1885168404Spjd */
1886168404Spjdstatic int
1887168404Spjdraidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1888168404Spjd{
1889168404Spjd	void *orig[VDEV_RAIDZ_MAXPARITY];
1890168404Spjd	int c, ret = 0;
1891168404Spjd	raidz_col_t *rc;
1892168404Spjd
1893255750Sdelphij	blkptr_t *bp = zio->io_bp;
1894255750Sdelphij	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
1895255750Sdelphij	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
1896255750Sdelphij
1897255750Sdelphij	if (checksum == ZIO_CHECKSUM_NOPARITY)
1898255750Sdelphij		return (ret);
1899255750Sdelphij
1900168404Spjd	for (c = 0; c < rm->rm_firstdatacol; c++) {
1901168404Spjd		rc = &rm->rm_col[c];
1902168404Spjd		if (!rc->rc_tried || rc->rc_error != 0)
1903168404Spjd			continue;
1904168404Spjd		orig[c] = zio_buf_alloc(rc->rc_size);
1905168404Spjd		bcopy(rc->rc_data, orig[c], rc->rc_size);
1906168404Spjd	}
1907168404Spjd
1908219089Spjd	vdev_raidz_generate_parity(rm);
1909168404Spjd
1910168404Spjd	for (c = 0; c < rm->rm_firstdatacol; c++) {
1911168404Spjd		rc = &rm->rm_col[c];
1912168404Spjd		if (!rc->rc_tried || rc->rc_error != 0)
1913168404Spjd			continue;
1914168404Spjd		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1915219089Spjd			raidz_checksum_error(zio, rc, orig[c]);
1916249195Smm			rc->rc_error = SET_ERROR(ECKSUM);
1917168404Spjd			ret++;
1918168404Spjd		}
1919168404Spjd		zio_buf_free(orig[c], rc->rc_size);
1920168404Spjd	}
1921168404Spjd
1922168404Spjd	return (ret);
1923168404Spjd}
1924168404Spjd
1925219089Spjd/*
1926219089Spjd * Keep statistics on all the ways that we used parity to correct data.
1927219089Spjd */
1928219089Spjdstatic uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1929168404Spjd
1930185029Spjdstatic int
1931185029Spjdvdev_raidz_worst_error(raidz_map_t *rm)
1932185029Spjd{
1933185029Spjd	int error = 0;
1934185029Spjd
1935185029Spjd	for (int c = 0; c < rm->rm_cols; c++)
1936185029Spjd		error = zio_worst_error(error, rm->rm_col[c].rc_error);
1937185029Spjd
1938185029Spjd	return (error);
1939185029Spjd}
1940185029Spjd
1941219089Spjd/*
1942219089Spjd * Iterate over all combinations of bad data and attempt a reconstruction.
1943219089Spjd * Note that the algorithm below is non-optimal because it doesn't take into
1944219089Spjd * account how reconstruction is actually performed. For example, with
1945219089Spjd * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1946219089Spjd * is targeted as invalid as if columns 1 and 4 are targeted since in both
1947219089Spjd * cases we'd only use parity information in column 0.
1948219089Spjd */
1949219089Spjdstatic int
1950219089Spjdvdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1951219089Spjd{
1952219089Spjd	raidz_map_t *rm = zio->io_vsd;
1953219089Spjd	raidz_col_t *rc;
1954219089Spjd	void *orig[VDEV_RAIDZ_MAXPARITY];
1955219089Spjd	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1956219089Spjd	int *tgts = &tstore[1];
1957219089Spjd	int current, next, i, c, n;
1958219089Spjd	int code, ret = 0;
1959219089Spjd
1960219089Spjd	ASSERT(total_errors < rm->rm_firstdatacol);
1961219089Spjd
1962219089Spjd	/*
1963219089Spjd	 * This simplifies one edge condition.
1964219089Spjd	 */
1965219089Spjd	tgts[-1] = -1;
1966219089Spjd
1967219089Spjd	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1968219089Spjd		/*
1969219089Spjd		 * Initialize the targets array by finding the first n columns
1970219089Spjd		 * that contain no error.
1971219089Spjd		 *
1972219089Spjd		 * If there were no data errors, we need to ensure that we're
1973219089Spjd		 * always explicitly attempting to reconstruct at least one
1974219089Spjd		 * data column. To do this, we simply push the highest target
1975219089Spjd		 * up into the data columns.
1976219089Spjd		 */
1977219089Spjd		for (c = 0, i = 0; i < n; i++) {
1978219089Spjd			if (i == n - 1 && data_errors == 0 &&
1979219089Spjd			    c < rm->rm_firstdatacol) {
1980219089Spjd				c = rm->rm_firstdatacol;
1981219089Spjd			}
1982219089Spjd
1983219089Spjd			while (rm->rm_col[c].rc_error != 0) {
1984219089Spjd				c++;
1985219089Spjd				ASSERT3S(c, <, rm->rm_cols);
1986219089Spjd			}
1987219089Spjd
1988219089Spjd			tgts[i] = c++;
1989219089Spjd		}
1990219089Spjd
1991219089Spjd		/*
1992219089Spjd		 * Setting tgts[n] simplifies the other edge condition.
1993219089Spjd		 */
1994219089Spjd		tgts[n] = rm->rm_cols;
1995219089Spjd
1996219089Spjd		/*
1997219089Spjd		 * These buffers were allocated in previous iterations.
1998219089Spjd		 */
1999219089Spjd		for (i = 0; i < n - 1; i++) {
2000219089Spjd			ASSERT(orig[i] != NULL);
2001219089Spjd		}
2002219089Spjd
2003219089Spjd		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
2004219089Spjd
2005219089Spjd		current = 0;
2006219089Spjd		next = tgts[current];
2007219089Spjd
2008219089Spjd		while (current != n) {
2009219089Spjd			tgts[current] = next;
2010219089Spjd			current = 0;
2011219089Spjd
2012219089Spjd			/*
2013219089Spjd			 * Save off the original data that we're going to
2014219089Spjd			 * attempt to reconstruct.
2015219089Spjd			 */
2016219089Spjd			for (i = 0; i < n; i++) {
2017219089Spjd				ASSERT(orig[i] != NULL);
2018219089Spjd				c = tgts[i];
2019219089Spjd				ASSERT3S(c, >=, 0);
2020219089Spjd				ASSERT3S(c, <, rm->rm_cols);
2021219089Spjd				rc = &rm->rm_col[c];
2022219089Spjd				bcopy(rc->rc_data, orig[i], rc->rc_size);
2023219089Spjd			}
2024219089Spjd
2025219089Spjd			/*
2026219089Spjd			 * Attempt a reconstruction and exit the outer loop on
2027219089Spjd			 * success.
2028219089Spjd			 */
2029219089Spjd			code = vdev_raidz_reconstruct(rm, tgts, n);
2030219089Spjd			if (raidz_checksum_verify(zio) == 0) {
2031219089Spjd				atomic_inc_64(&raidz_corrected[code]);
2032219089Spjd
2033219089Spjd				for (i = 0; i < n; i++) {
2034219089Spjd					c = tgts[i];
2035219089Spjd					rc = &rm->rm_col[c];
2036219089Spjd					ASSERT(rc->rc_error == 0);
2037219089Spjd					if (rc->rc_tried)
2038219089Spjd						raidz_checksum_error(zio, rc,
2039219089Spjd						    orig[i]);
2040249195Smm					rc->rc_error = SET_ERROR(ECKSUM);
2041219089Spjd				}
2042219089Spjd
2043219089Spjd				ret = code;
2044219089Spjd				goto done;
2045219089Spjd			}
2046219089Spjd
2047219089Spjd			/*
2048219089Spjd			 * Restore the original data.
2049219089Spjd			 */
2050219089Spjd			for (i = 0; i < n; i++) {
2051219089Spjd				c = tgts[i];
2052219089Spjd				rc = &rm->rm_col[c];
2053219089Spjd				bcopy(orig[i], rc->rc_data, rc->rc_size);
2054219089Spjd			}
2055219089Spjd
2056219089Spjd			do {
2057219089Spjd				/*
2058219089Spjd				 * Find the next valid column after the current
2059219089Spjd				 * position..
2060219089Spjd				 */
2061219089Spjd				for (next = tgts[current] + 1;
2062219089Spjd				    next < rm->rm_cols &&
2063219089Spjd				    rm->rm_col[next].rc_error != 0; next++)
2064219089Spjd					continue;
2065219089Spjd
2066219089Spjd				ASSERT(next <= tgts[current + 1]);
2067219089Spjd
2068219089Spjd				/*
2069219089Spjd				 * If that spot is available, we're done here.
2070219089Spjd				 */
2071219089Spjd				if (next != tgts[current + 1])
2072219089Spjd					break;
2073219089Spjd
2074219089Spjd				/*
2075219089Spjd				 * Otherwise, find the next valid column after
2076219089Spjd				 * the previous position.
2077219089Spjd				 */
2078219089Spjd				for (c = tgts[current - 1] + 1;
2079219089Spjd				    rm->rm_col[c].rc_error != 0; c++)
2080219089Spjd					continue;
2081219089Spjd
2082219089Spjd				tgts[current] = c;
2083219089Spjd				current++;
2084219089Spjd
2085219089Spjd			} while (current != n);
2086219089Spjd		}
2087219089Spjd	}
2088219089Spjd	n--;
2089219089Spjddone:
2090219089Spjd	for (i = 0; i < n; i++) {
2091219089Spjd		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
2092219089Spjd	}
2093219089Spjd
2094219089Spjd	return (ret);
2095219089Spjd}
2096219089Spjd
2097251629Sdelphij/*
2098251629Sdelphij * Complete an IO operation on a RAIDZ VDev
2099251629Sdelphij *
2100251629Sdelphij * Outline:
2101251629Sdelphij * - For write operations:
2102251629Sdelphij *   1. Check for errors on the child IOs.
2103251629Sdelphij *   2. Return, setting an error code if too few child VDevs were written
2104251629Sdelphij *      to reconstruct the data later.  Note that partial writes are
2105251629Sdelphij *      considered successful if they can be reconstructed at all.
2106251629Sdelphij * - For read operations:
2107251629Sdelphij *   1. Check for errors on the child IOs.
2108251629Sdelphij *   2. If data errors occurred:
2109251629Sdelphij *      a. Try to reassemble the data from the parity available.
2110251629Sdelphij *      b. If we haven't yet read the parity drives, read them now.
2111251629Sdelphij *      c. If all parity drives have been read but the data still doesn't
2112251629Sdelphij *         reassemble with a correct checksum, then try combinatorial
2113251629Sdelphij *         reconstruction.
2114251629Sdelphij *      d. If that doesn't work, return an error.
2115251629Sdelphij *   3. If there were unexpected errors or this is a resilver operation,
2116251629Sdelphij *      rewrite the vdevs that had errors.
2117251629Sdelphij */
2118168404Spjdstatic void
2119168404Spjdvdev_raidz_io_done(zio_t *zio)
2120168404Spjd{
2121168404Spjd	vdev_t *vd = zio->io_vd;
2122168404Spjd	vdev_t *cvd;
2123168404Spjd	raidz_map_t *rm = zio->io_vsd;
2124219089Spjd	raidz_col_t *rc;
2125168404Spjd	int unexpected_errors = 0;
2126168404Spjd	int parity_errors = 0;
2127168404Spjd	int parity_untried = 0;
2128168404Spjd	int data_errors = 0;
2129185029Spjd	int total_errors = 0;
2130219089Spjd	int n, c;
2131219089Spjd	int tgts[VDEV_RAIDZ_MAXPARITY];
2132219089Spjd	int code;
2133168404Spjd
2134168404Spjd	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
2135168404Spjd
2136168404Spjd	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
2137168404Spjd	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
2138168404Spjd
2139168404Spjd	for (c = 0; c < rm->rm_cols; c++) {
2140168404Spjd		rc = &rm->rm_col[c];
2141168404Spjd
2142168404Spjd		if (rc->rc_error) {
2143185029Spjd			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
2144168404Spjd
2145168404Spjd			if (c < rm->rm_firstdatacol)
2146168404Spjd				parity_errors++;
2147168404Spjd			else
2148168404Spjd				data_errors++;
2149168404Spjd
2150168404Spjd			if (!rc->rc_skipped)
2151168404Spjd				unexpected_errors++;
2152168404Spjd
2153185029Spjd			total_errors++;
2154168404Spjd		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
2155168404Spjd			parity_untried++;
2156168404Spjd		}
2157168404Spjd	}
2158168404Spjd
2159168404Spjd	if (zio->io_type == ZIO_TYPE_WRITE) {
2160168404Spjd		/*
2161185029Spjd		 * XXX -- for now, treat partial writes as a success.
2162185029Spjd		 * (If we couldn't write enough columns to reconstruct
2163185029Spjd		 * the data, the I/O failed.  Otherwise, good enough.)
2164185029Spjd		 *
2165185029Spjd		 * Now that we support write reallocation, it would be better
2166185029Spjd		 * to treat partial failure as real failure unless there are
2167185029Spjd		 * no non-degraded top-level vdevs left, and not update DTLs
2168185029Spjd		 * if we intend to reallocate.
2169168404Spjd		 */
2170168404Spjd		/* XXPOLICY */
2171185029Spjd		if (total_errors > rm->rm_firstdatacol)
2172185029Spjd			zio->io_error = vdev_raidz_worst_error(rm);
2173168404Spjd
2174168404Spjd		return;
2175240868Spjd	} else if (zio->io_type == ZIO_TYPE_FREE) {
2176240868Spjd		return;
2177168404Spjd	}
2178168404Spjd
2179168404Spjd	ASSERT(zio->io_type == ZIO_TYPE_READ);
2180168404Spjd	/*
2181168404Spjd	 * There are three potential phases for a read:
2182168404Spjd	 *	1. produce valid data from the columns read
2183168404Spjd	 *	2. read all disks and try again
2184168404Spjd	 *	3. perform combinatorial reconstruction
2185168404Spjd	 *
2186168404Spjd	 * Each phase is progressively both more expensive and less likely to
2187168404Spjd	 * occur. If we encounter more errors than we can repair or all phases
2188168404Spjd	 * fail, we have no choice but to return an error.
2189168404Spjd	 */
2190168404Spjd
2191168404Spjd	/*
2192168404Spjd	 * If the number of errors we saw was correctable -- less than or equal
2193168404Spjd	 * to the number of parity disks read -- attempt to produce data that
2194168404Spjd	 * has a valid checksum. Naturally, this case applies in the absence of
2195168404Spjd	 * any errors.
2196168404Spjd	 */
2197185029Spjd	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
2198219089Spjd		if (data_errors == 0) {
2199219089Spjd			if (raidz_checksum_verify(zio) == 0) {
2200168738Spjd				/*
2201168738Spjd				 * If we read parity information (unnecessarily
2202168738Spjd				 * as it happens since no reconstruction was
2203168738Spjd				 * needed) regenerate and verify the parity.
2204168738Spjd				 * We also regenerate parity when resilvering
2205168738Spjd				 * so we can write it out to the failed device
2206168738Spjd				 * later.
2207168738Spjd				 */
2208168404Spjd				if (parity_errors + parity_untried <
2209168738Spjd				    rm->rm_firstdatacol ||
2210168738Spjd				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
2211168404Spjd					n = raidz_parity_verify(zio, rm);
2212168404Spjd					unexpected_errors += n;
2213168404Spjd					ASSERT(parity_errors + n <=
2214168404Spjd					    rm->rm_firstdatacol);
2215168404Spjd				}
2216168404Spjd				goto done;
2217168404Spjd			}
2218219089Spjd		} else {
2219168404Spjd			/*
2220168404Spjd			 * We either attempt to read all the parity columns or
2221168404Spjd			 * none of them. If we didn't try to read parity, we
2222168404Spjd			 * wouldn't be here in the correctable case. There must
2223168404Spjd			 * also have been fewer parity errors than parity
2224168404Spjd			 * columns or, again, we wouldn't be in this code path.
2225168404Spjd			 */
2226168404Spjd			ASSERT(parity_untried == 0);
2227168404Spjd			ASSERT(parity_errors < rm->rm_firstdatacol);
2228168404Spjd
2229168404Spjd			/*
2230219089Spjd			 * Identify the data columns that reported an error.
2231168404Spjd			 */
2232219089Spjd			n = 0;
2233168404Spjd			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
2234168404Spjd				rc = &rm->rm_col[c];
2235219089Spjd				if (rc->rc_error != 0) {
2236219089Spjd					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2237219089Spjd					tgts[n++] = c;
2238219089Spjd				}
2239168404Spjd			}
2240168404Spjd
2241219089Spjd			ASSERT(rm->rm_firstdatacol >= n);
2242168404Spjd
2243219089Spjd			code = vdev_raidz_reconstruct(rm, tgts, n);
2244168404Spjd
2245219089Spjd			if (raidz_checksum_verify(zio) == 0) {
2246219089Spjd				atomic_inc_64(&raidz_corrected[code]);
2247219089Spjd
2248168404Spjd				/*
2249219089Spjd				 * If we read more parity disks than were used
2250219089Spjd				 * for reconstruction, confirm that the other
2251219089Spjd				 * parity disks produced correct data. This
2252219089Spjd				 * routine is suboptimal in that it regenerates
2253219089Spjd				 * the parity that we already used in addition
2254219089Spjd				 * to the parity that we're attempting to
2255219089Spjd				 * verify, but this should be a relatively
2256219089Spjd				 * uncommon case, and can be optimized if it
2257219089Spjd				 * becomes a problem. Note that we regenerate
2258219089Spjd				 * parity when resilvering so we can write it
2259219089Spjd				 * out to failed devices later.
2260168404Spjd				 */
2261219089Spjd				if (parity_errors < rm->rm_firstdatacol - n ||
2262168738Spjd				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
2263168404Spjd					n = raidz_parity_verify(zio, rm);
2264168404Spjd					unexpected_errors += n;
2265168404Spjd					ASSERT(parity_errors + n <=
2266168404Spjd					    rm->rm_firstdatacol);
2267168404Spjd				}
2268168404Spjd
2269168404Spjd				goto done;
2270168404Spjd			}
2271168404Spjd		}
2272168404Spjd	}
2273168404Spjd
2274168404Spjd	/*
2275168404Spjd	 * This isn't a typical situation -- either we got a read error or
2276168404Spjd	 * a child silently returned bad data. Read every block so we can
2277168404Spjd	 * try again with as much data and parity as we can track down. If
2278168404Spjd	 * we've already been through once before, all children will be marked
2279168404Spjd	 * as tried so we'll proceed to combinatorial reconstruction.
2280168404Spjd	 */
2281168404Spjd	unexpected_errors = 1;
2282168404Spjd	rm->rm_missingdata = 0;
2283168404Spjd	rm->rm_missingparity = 0;
2284168404Spjd
2285168404Spjd	for (c = 0; c < rm->rm_cols; c++) {
2286168404Spjd		if (rm->rm_col[c].rc_tried)
2287168404Spjd			continue;
2288168404Spjd
2289168404Spjd		zio_vdev_io_redone(zio);
2290168404Spjd		do {
2291168404Spjd			rc = &rm->rm_col[c];
2292168404Spjd			if (rc->rc_tried)
2293168404Spjd				continue;
2294168404Spjd			zio_nowait(zio_vdev_child_io(zio, NULL,
2295168404Spjd			    vd->vdev_child[rc->rc_devidx],
2296168404Spjd			    rc->rc_offset, rc->rc_data, rc->rc_size,
2297185029Spjd			    zio->io_type, zio->io_priority, 0,
2298168404Spjd			    vdev_raidz_child_done, rc));
2299168404Spjd		} while (++c < rm->rm_cols);
2300185029Spjd
2301168404Spjd		return;
2302168404Spjd	}
2303168404Spjd
2304168404Spjd	/*
2305168404Spjd	 * At this point we've attempted to reconstruct the data given the
2306168404Spjd	 * errors we detected, and we've attempted to read all columns. There
2307168404Spjd	 * must, therefore, be one or more additional problems -- silent errors
2308168404Spjd	 * resulting in invalid data rather than explicit I/O errors resulting
2309219089Spjd	 * in absent data. We check if there is enough additional data to
2310219089Spjd	 * possibly reconstruct the data and then perform combinatorial
2311219089Spjd	 * reconstruction over all possible combinations. If that fails,
2312219089Spjd	 * we're cooked.
2313168404Spjd	 */
2314219089Spjd	if (total_errors > rm->rm_firstdatacol) {
2315185029Spjd		zio->io_error = vdev_raidz_worst_error(rm);
2316168404Spjd
2317219089Spjd	} else if (total_errors < rm->rm_firstdatacol &&
2318219089Spjd	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2319168404Spjd		/*
2320219089Spjd		 * If we didn't use all the available parity for the
2321219089Spjd		 * combinatorial reconstruction, verify that the remaining
2322219089Spjd		 * parity is correct.
2323168404Spjd		 */
2324219089Spjd		if (code != (1 << rm->rm_firstdatacol) - 1)
2325219089Spjd			(void) raidz_parity_verify(zio, rm);
2326219089Spjd	} else {
2327168404Spjd		/*
2328219089Spjd		 * We're here because either:
2329219089Spjd		 *
2330219089Spjd		 *	total_errors == rm_first_datacol, or
2331219089Spjd		 *	vdev_raidz_combrec() failed
2332219089Spjd		 *
2333219089Spjd		 * In either case, there is enough bad data to prevent
2334219089Spjd		 * reconstruction.
2335219089Spjd		 *
2336219089Spjd		 * Start checksum ereports for all children which haven't
2337219089Spjd		 * failed, and the IO wasn't speculative.
2338168404Spjd		 */
2339249195Smm		zio->io_error = SET_ERROR(ECKSUM);
2340168404Spjd
2341219089Spjd		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2342219089Spjd			for (c = 0; c < rm->rm_cols; c++) {
2343219089Spjd				rc = &rm->rm_col[c];
2344219089Spjd				if (rc->rc_error == 0) {
2345219089Spjd					zio_bad_cksum_t zbc;
2346219089Spjd					zbc.zbc_has_cksum = 0;
2347219089Spjd					zbc.zbc_injected =
2348219089Spjd					    rm->rm_ecksuminjected;
2349168404Spjd
2350219089Spjd					zfs_ereport_start_checksum(
2351219089Spjd					    zio->io_spa,
2352219089Spjd					    vd->vdev_child[rc->rc_devidx],
2353219089Spjd					    zio, rc->rc_offset, rc->rc_size,
2354219089Spjd					    (void *)(uintptr_t)c, &zbc);
2355168404Spjd				}
2356168404Spjd			}
2357168404Spjd		}
2358168404Spjd	}
2359168404Spjd
2360168404Spjddone:
2361168404Spjd	zio_checksum_verified(zio);
2362168404Spjd
2363209962Smm	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2364168404Spjd	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2365168404Spjd		/*
2366168404Spjd		 * Use the good data we have in hand to repair damaged children.
2367168404Spjd		 */
2368168404Spjd		for (c = 0; c < rm->rm_cols; c++) {
2369168404Spjd			rc = &rm->rm_col[c];
2370168404Spjd			cvd = vd->vdev_child[rc->rc_devidx];
2371168404Spjd
2372168404Spjd			if (rc->rc_error == 0)
2373168404Spjd				continue;
2374168404Spjd
2375185029Spjd			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2376168404Spjd			    rc->rc_offset, rc->rc_data, rc->rc_size,
2377260763Savg			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2378209962Smm			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2379209962Smm			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2380168404Spjd		}
2381168404Spjd	}
2382168404Spjd}
2383168404Spjd
2384168404Spjdstatic void
2385168404Spjdvdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2386168404Spjd{
2387168404Spjd	if (faulted > vd->vdev_nparity)
2388168404Spjd		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2389168404Spjd		    VDEV_AUX_NO_REPLICAS);
2390168404Spjd	else if (degraded + faulted != 0)
2391168404Spjd		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2392168404Spjd	else
2393168404Spjd		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2394168404Spjd}
2395168404Spjd
2396168404Spjdvdev_ops_t vdev_raidz_ops = {
2397168404Spjd	vdev_raidz_open,
2398168404Spjd	vdev_raidz_close,
2399168404Spjd	vdev_raidz_asize,
2400168404Spjd	vdev_raidz_io_start,
2401168404Spjd	vdev_raidz_io_done,
2402168404Spjd	vdev_raidz_state_change,
2403219089Spjd	NULL,
2404219089Spjd	NULL,
2405168404Spjd	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
2406168404Spjd	B_FALSE			/* not a leaf vdev */
2407168404Spjd};
2408