1168404Spjd/*
2168404Spjd * CDDL HEADER START
3168404Spjd *
4168404Spjd * The contents of this file are subject to the terms of the
5168404Spjd * Common Development and Distribution License (the "License").
6168404Spjd * You may not use this file except in compliance with the License.
7168404Spjd *
8168404Spjd * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9168404Spjd * or http://www.opensolaris.org/os/licensing.
10168404Spjd * See the License for the specific language governing permissions
11168404Spjd * and limitations under the License.
12168404Spjd *
13168404Spjd * When distributing Covered Code, include this CDDL HEADER in each
14168404Spjd * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15168404Spjd * If applicable, add the following below this CDDL HEADER, with the
16168404Spjd * fields enclosed by brackets "[]" replaced with your own identifying
17168404Spjd * information: Portions Copyright [yyyy] [name of copyright owner]
18168404Spjd *
19168404Spjd * CDDL HEADER END
20168404Spjd */
21168404Spjd
22168404Spjd/*
23219089Spjd * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24297078Smav * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
25255750Sdelphij * Copyright (c) 2013, Joyent, Inc. All rights reserved.
26297112Smav * Copyright (c) 2014 Integros [integros.com]
27168404Spjd */
28168404Spjd
29168404Spjd#include <sys/zfs_context.h>
30168404Spjd#include <sys/spa.h>
31168404Spjd#include <sys/vdev_impl.h>
32255750Sdelphij#ifdef illumos
33255750Sdelphij#include <sys/vdev_disk.h>
34255750Sdelphij#endif
35255750Sdelphij#include <sys/vdev_file.h>
36255750Sdelphij#include <sys/vdev_raidz.h>
37168404Spjd#include <sys/zio.h>
38168404Spjd#include <sys/zio_checksum.h>
39168404Spjd#include <sys/fs/zfs.h>
40168404Spjd#include <sys/fm/fs/zfs.h>
41255750Sdelphij#include <sys/bio.h>
42168404Spjd
43168404Spjd/*
44168404Spjd * Virtual device vector for RAID-Z.
45168404Spjd *
46219089Spjd * This vdev supports single, double, and triple parity. For single parity,
47219089Spjd * we use a simple XOR of all the data columns. For double or triple parity,
48219089Spjd * we use a special case of Reed-Solomon coding. This extends the
49219089Spjd * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
50219089Spjd * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
51219089Spjd * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
52219089Spjd * former is also based. The latter is designed to provide higher performance
53219089Spjd * for writes.
54168404Spjd *
55219089Spjd * Note that the Plank paper claimed to support arbitrary N+M, but was then
56219089Spjd * amended six years later identifying a critical flaw that invalidates its
57219089Spjd * claims. Nevertheless, the technique can be adapted to work for up to
58219089Spjd * triple parity. For additional parity, the amendment "Note: Correction to
59219089Spjd * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
60219089Spjd * is viable, but the additional complexity means that write performance will
61219089Spjd * suffer.
62219089Spjd *
63219089Spjd * All of the methods above operate on a Galois field, defined over the
64219089Spjd * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements
65219089Spjd * can be expressed with a single byte. Briefly, the operations on the
66219089Spjd * field are defined as follows:
67219089Spjd *
68168404Spjd *   o addition (+) is represented by a bitwise XOR
69168404Spjd *   o subtraction (-) is therefore identical to addition: A + B = A - B
70168404Spjd *   o multiplication of A by 2 is defined by the following bitwise expression:
71251631Sdelphij *
72168404Spjd *	(A * 2)_7 = A_6
73168404Spjd *	(A * 2)_6 = A_5
74168404Spjd *	(A * 2)_5 = A_4
75168404Spjd *	(A * 2)_4 = A_3 + A_7
76168404Spjd *	(A * 2)_3 = A_2 + A_7
77168404Spjd *	(A * 2)_2 = A_1 + A_7
78168404Spjd *	(A * 2)_1 = A_0
79168404Spjd *	(A * 2)_0 = A_7
80168404Spjd *
81168404Spjd * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
82219089Spjd * As an aside, this multiplication is derived from the error correcting
83219089Spjd * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
84168404Spjd *
85168404Spjd * Observe that any number in the field (except for 0) can be expressed as a
86168404Spjd * power of 2 -- a generator for the field. We store a table of the powers of
87168404Spjd * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
88168404Spjd * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
89219089Spjd * than field addition). The inverse of a field element A (A^-1) is therefore
90219089Spjd * A ^ (255 - 1) = A^254.
91168404Spjd *
92219089Spjd * The up-to-three parity columns, P, Q, R over several data columns,
93219089Spjd * D_0, ... D_n-1, can be expressed by field operations:
94168404Spjd *
95168404Spjd *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
96168404Spjd *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
97168404Spjd *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
98219089Spjd *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
99219089Spjd *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
100168404Spjd *
101219089Spjd * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival
102219089Spjd * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
103219089Spjd * independent coefficients. (There are no additional coefficients that have
104219089Spjd * this property which is why the uncorrected Plank method breaks down.)
105219089Spjd *
106219089Spjd * See the reconstruction code below for how P, Q and R can used individually
107219089Spjd * or in concert to recover missing data columns.
108168404Spjd */
109168404Spjd
110168404Spjdtypedef struct raidz_col {
111168404Spjd	uint64_t rc_devidx;		/* child device index for I/O */
112168404Spjd	uint64_t rc_offset;		/* device offset */
113168404Spjd	uint64_t rc_size;		/* I/O size */
114168404Spjd	void *rc_data;			/* I/O data */
115219089Spjd	void *rc_gdata;			/* used to store the "good" version */
116168404Spjd	int rc_error;			/* I/O error for this device */
117168404Spjd	uint8_t rc_tried;		/* Did we attempt this I/O column? */
118168404Spjd	uint8_t rc_skipped;		/* Did we skip this I/O column? */
119168404Spjd} raidz_col_t;
120168404Spjd
121168404Spjdtypedef struct raidz_map {
122219089Spjd	uint64_t rm_cols;		/* Regular column count */
123219089Spjd	uint64_t rm_scols;		/* Count including skipped columns */
124168404Spjd	uint64_t rm_bigcols;		/* Number of oversized columns */
125168404Spjd	uint64_t rm_asize;		/* Actual total I/O size */
126168404Spjd	uint64_t rm_missingdata;	/* Count of missing data devices */
127168404Spjd	uint64_t rm_missingparity;	/* Count of missing parity devices */
128168404Spjd	uint64_t rm_firstdatacol;	/* First data column/parity count */
129219089Spjd	uint64_t rm_nskip;		/* Skipped sectors for padding */
130251631Sdelphij	uint64_t rm_skipstart;		/* Column index of padding start */
131219089Spjd	void *rm_datacopy;		/* rm_asize-buffer of copied data */
132219089Spjd	uintptr_t rm_reports;		/* # of referencing checksum reports */
133219089Spjd	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
134219089Spjd	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
135168404Spjd	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
136168404Spjd} raidz_map_t;
137168404Spjd
138168404Spjd#define	VDEV_RAIDZ_P		0
139168404Spjd#define	VDEV_RAIDZ_Q		1
140219089Spjd#define	VDEV_RAIDZ_R		2
141168404Spjd
142219089Spjd#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
143219089Spjd#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
144168404Spjd
145219089Spjd/*
146219089Spjd * We provide a mechanism to perform the field multiplication operation on a
147219089Spjd * 64-bit value all at once rather than a byte at a time. This works by
148219089Spjd * creating a mask from the top bit in each byte and using that to
149219089Spjd * conditionally apply the XOR of 0x1d.
150219089Spjd */
151219089Spjd#define	VDEV_RAIDZ_64MUL_2(x, mask) \
152219089Spjd{ \
153219089Spjd	(mask) = (x) & 0x8080808080808080ULL; \
154219089Spjd	(mask) = ((mask) << 1) - ((mask) >> 7); \
155219089Spjd	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
156219089Spjd	    ((mask) & 0x1d1d1d1d1d1d1d1d); \
157219089Spjd}
158168404Spjd
159219089Spjd#define	VDEV_RAIDZ_64MUL_4(x, mask) \
160219089Spjd{ \
161219089Spjd	VDEV_RAIDZ_64MUL_2((x), mask); \
162219089Spjd	VDEV_RAIDZ_64MUL_2((x), mask); \
163219089Spjd}
164219089Spjd
165255750Sdelphij#define	VDEV_LABEL_OFFSET(x)	(x + VDEV_LABEL_START_SIZE)
166255750Sdelphij
167168404Spjd/*
168219089Spjd * Force reconstruction to use the general purpose method.
169219089Spjd */
170219089Spjdint vdev_raidz_default_to_general;
171219089Spjd
172251631Sdelphij/* Powers of 2 in the Galois field defined above. */
173168404Spjdstatic const uint8_t vdev_raidz_pow2[256] = {
174168404Spjd	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
175168404Spjd	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
176168404Spjd	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
177168404Spjd	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
178168404Spjd	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
179168404Spjd	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
180168404Spjd	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
181168404Spjd	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
182168404Spjd	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
183168404Spjd	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
184168404Spjd	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
185168404Spjd	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
186168404Spjd	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
187168404Spjd	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
188168404Spjd	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
189168404Spjd	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
190168404Spjd	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
191168404Spjd	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
192168404Spjd	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
193168404Spjd	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
194168404Spjd	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
195168404Spjd	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
196168404Spjd	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
197168404Spjd	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
198168404Spjd	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
199168404Spjd	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
200168404Spjd	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
201168404Spjd	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
202168404Spjd	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
203168404Spjd	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
204168404Spjd	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
205168404Spjd	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
206168404Spjd};
207251631Sdelphij/* Logs of 2 in the Galois field defined above. */
208168404Spjdstatic const uint8_t vdev_raidz_log2[256] = {
209168404Spjd	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
210168404Spjd	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
211168404Spjd	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
212168404Spjd	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
213168404Spjd	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
214168404Spjd	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
215168404Spjd	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
216168404Spjd	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
217168404Spjd	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
218168404Spjd	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
219168404Spjd	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
220168404Spjd	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
221168404Spjd	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
222168404Spjd	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
223168404Spjd	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
224168404Spjd	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
225168404Spjd	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
226168404Spjd	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
227168404Spjd	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
228168404Spjd	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
229168404Spjd	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
230168404Spjd	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
231168404Spjd	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
232168404Spjd	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
233168404Spjd	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
234168404Spjd	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
235168404Spjd	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
236168404Spjd	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
237168404Spjd	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
238168404Spjd	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
239168404Spjd	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
240168404Spjd	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
241168404Spjd};
242168404Spjd
243219089Spjdstatic void vdev_raidz_generate_parity(raidz_map_t *rm);
244219089Spjd
245168404Spjd/*
246168404Spjd * Multiply a given number by 2 raised to the given power.
247168404Spjd */
248168404Spjdstatic uint8_t
249168404Spjdvdev_raidz_exp2(uint_t a, int exp)
250168404Spjd{
251168404Spjd	if (a == 0)
252168404Spjd		return (0);
253168404Spjd
254168404Spjd	ASSERT(exp >= 0);
255168404Spjd	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
256168404Spjd
257168404Spjd	exp += vdev_raidz_log2[a];
258168404Spjd	if (exp > 255)
259168404Spjd		exp -= 255;
260168404Spjd
261168404Spjd	return (vdev_raidz_pow2[exp]);
262168404Spjd}
263168404Spjd
264185029Spjdstatic void
265219089Spjdvdev_raidz_map_free(raidz_map_t *rm)
266185029Spjd{
267185029Spjd	int c;
268219089Spjd	size_t size;
269185029Spjd
270219089Spjd	for (c = 0; c < rm->rm_firstdatacol; c++) {
271240868Spjd		if (rm->rm_col[c].rc_data != NULL)
272240868Spjd			zio_buf_free(rm->rm_col[c].rc_data,
273240868Spjd			    rm->rm_col[c].rc_size);
274185029Spjd
275219089Spjd		if (rm->rm_col[c].rc_gdata != NULL)
276219089Spjd			zio_buf_free(rm->rm_col[c].rc_gdata,
277219089Spjd			    rm->rm_col[c].rc_size);
278219089Spjd	}
279219089Spjd
280219089Spjd	size = 0;
281219089Spjd	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
282219089Spjd		size += rm->rm_col[c].rc_size;
283219089Spjd
284219089Spjd	if (rm->rm_datacopy != NULL)
285219089Spjd		zio_buf_free(rm->rm_datacopy, size);
286219089Spjd
287219089Spjd	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
288185029Spjd}
289185029Spjd
290219089Spjdstatic void
291219089Spjdvdev_raidz_map_free_vsd(zio_t *zio)
292219089Spjd{
293219089Spjd	raidz_map_t *rm = zio->io_vsd;
294219089Spjd
295240415Smm	ASSERT0(rm->rm_freed);
296219089Spjd	rm->rm_freed = 1;
297219089Spjd
298219089Spjd	if (rm->rm_reports == 0)
299219089Spjd		vdev_raidz_map_free(rm);
300219089Spjd}
301219089Spjd
302219089Spjd/*ARGSUSED*/
303219089Spjdstatic void
304219089Spjdvdev_raidz_cksum_free(void *arg, size_t ignored)
305219089Spjd{
306219089Spjd	raidz_map_t *rm = arg;
307219089Spjd
308219089Spjd	ASSERT3U(rm->rm_reports, >, 0);
309219089Spjd
310219089Spjd	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
311219089Spjd		vdev_raidz_map_free(rm);
312219089Spjd}
313219089Spjd
314219089Spjdstatic void
315219089Spjdvdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
316219089Spjd{
317219089Spjd	raidz_map_t *rm = zcr->zcr_cbdata;
318219089Spjd	size_t c = zcr->zcr_cbinfo;
319219089Spjd	size_t x;
320219089Spjd
321219089Spjd	const char *good = NULL;
322219089Spjd	const char *bad = rm->rm_col[c].rc_data;
323219089Spjd
324219089Spjd	if (good_data == NULL) {
325219089Spjd		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
326219089Spjd		return;
327219089Spjd	}
328219089Spjd
329219089Spjd	if (c < rm->rm_firstdatacol) {
330219089Spjd		/*
331219089Spjd		 * The first time through, calculate the parity blocks for
332219089Spjd		 * the good data (this relies on the fact that the good
333219089Spjd		 * data never changes for a given logical ZIO)
334219089Spjd		 */
335219089Spjd		if (rm->rm_col[0].rc_gdata == NULL) {
336219089Spjd			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
337219089Spjd			char *buf;
338219089Spjd
339219089Spjd			/*
340219089Spjd			 * Set up the rm_col[]s to generate the parity for
341219089Spjd			 * good_data, first saving the parity bufs and
342219089Spjd			 * replacing them with buffers to hold the result.
343219089Spjd			 */
344219089Spjd			for (x = 0; x < rm->rm_firstdatacol; x++) {
345219089Spjd				bad_parity[x] = rm->rm_col[x].rc_data;
346219089Spjd				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
347219089Spjd				    zio_buf_alloc(rm->rm_col[x].rc_size);
348219089Spjd			}
349219089Spjd
350219089Spjd			/* fill in the data columns from good_data */
351219089Spjd			buf = (char *)good_data;
352219089Spjd			for (; x < rm->rm_cols; x++) {
353219089Spjd				rm->rm_col[x].rc_data = buf;
354219089Spjd				buf += rm->rm_col[x].rc_size;
355219089Spjd			}
356219089Spjd
357219089Spjd			/*
358219089Spjd			 * Construct the parity from the good data.
359219089Spjd			 */
360219089Spjd			vdev_raidz_generate_parity(rm);
361219089Spjd
362219089Spjd			/* restore everything back to its original state */
363219089Spjd			for (x = 0; x < rm->rm_firstdatacol; x++)
364219089Spjd				rm->rm_col[x].rc_data = bad_parity[x];
365219089Spjd
366219089Spjd			buf = rm->rm_datacopy;
367219089Spjd			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
368219089Spjd				rm->rm_col[x].rc_data = buf;
369219089Spjd				buf += rm->rm_col[x].rc_size;
370219089Spjd			}
371219089Spjd		}
372219089Spjd
373219089Spjd		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
374219089Spjd		good = rm->rm_col[c].rc_gdata;
375219089Spjd	} else {
376219089Spjd		/* adjust good_data to point at the start of our column */
377219089Spjd		good = good_data;
378219089Spjd
379219089Spjd		for (x = rm->rm_firstdatacol; x < c; x++)
380219089Spjd			good += rm->rm_col[x].rc_size;
381219089Spjd	}
382219089Spjd
383219089Spjd	/* we drop the ereport if it ends up that the data was good */
384219089Spjd	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
385219089Spjd}
386219089Spjd
387219089Spjd/*
388219089Spjd * Invoked indirectly by zfs_ereport_start_checksum(), called
389219089Spjd * below when our read operation fails completely.  The main point
390219089Spjd * is to keep a copy of everything we read from disk, so that at
391219089Spjd * vdev_raidz_cksum_finish() time we can compare it with the good data.
392219089Spjd */
393219089Spjdstatic void
394219089Spjdvdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
395219089Spjd{
396219089Spjd	size_t c = (size_t)(uintptr_t)arg;
397219089Spjd	caddr_t buf;
398219089Spjd
399219089Spjd	raidz_map_t *rm = zio->io_vsd;
400219089Spjd	size_t size;
401219089Spjd
402219089Spjd	/* set up the report and bump the refcount  */
403219089Spjd	zcr->zcr_cbdata = rm;
404219089Spjd	zcr->zcr_cbinfo = c;
405219089Spjd	zcr->zcr_finish = vdev_raidz_cksum_finish;
406219089Spjd	zcr->zcr_free = vdev_raidz_cksum_free;
407219089Spjd
408219089Spjd	rm->rm_reports++;
409219089Spjd	ASSERT3U(rm->rm_reports, >, 0);
410219089Spjd
411219089Spjd	if (rm->rm_datacopy != NULL)
412219089Spjd		return;
413219089Spjd
414219089Spjd	/*
415219089Spjd	 * It's the first time we're called for this raidz_map_t, so we need
416219089Spjd	 * to copy the data aside; there's no guarantee that our zio's buffer
417219089Spjd	 * won't be re-used for something else.
418219089Spjd	 *
419219089Spjd	 * Our parity data is already in separate buffers, so there's no need
420219089Spjd	 * to copy them.
421219089Spjd	 */
422219089Spjd
423219089Spjd	size = 0;
424219089Spjd	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
425219089Spjd		size += rm->rm_col[c].rc_size;
426219089Spjd
427219089Spjd	buf = rm->rm_datacopy = zio_buf_alloc(size);
428219089Spjd
429219089Spjd	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
430219089Spjd		raidz_col_t *col = &rm->rm_col[c];
431219089Spjd
432219089Spjd		bcopy(col->rc_data, buf, col->rc_size);
433219089Spjd		col->rc_data = buf;
434219089Spjd
435219089Spjd		buf += col->rc_size;
436219089Spjd	}
437219089Spjd	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
438219089Spjd}
439219089Spjd
440219089Spjdstatic const zio_vsd_ops_t vdev_raidz_vsd_ops = {
441219089Spjd	vdev_raidz_map_free_vsd,
442219089Spjd	vdev_raidz_cksum_report
443219089Spjd};
444219089Spjd
445251629Sdelphij/*
446251629Sdelphij * Divides the IO evenly across all child vdevs; usually, dcols is
447251629Sdelphij * the number of children in the target vdev.
448251629Sdelphij */
449168404Spjdstatic raidz_map_t *
450255750Sdelphijvdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, boolean_t dofree,
451255750Sdelphij    uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
452168404Spjd{
453168404Spjd	raidz_map_t *rm;
454251629Sdelphij	/* The starting RAIDZ (parent) vdev sector of the block. */
455255750Sdelphij	uint64_t b = offset >> unit_shift;
456251629Sdelphij	/* The zio's size in units of the vdev's minimum sector size. */
457255750Sdelphij	uint64_t s = size >> unit_shift;
458251629Sdelphij	/* The first column for this stripe. */
459168404Spjd	uint64_t f = b % dcols;
460251629Sdelphij	/* The starting byte offset on each child vdev. */
461168404Spjd	uint64_t o = (b / dcols) << unit_shift;
462219089Spjd	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
463168404Spjd
464251629Sdelphij	/*
465251629Sdelphij	 * "Quotient": The number of data sectors for this stripe on all but
466251629Sdelphij	 * the "big column" child vdevs that also contain "remainder" data.
467251629Sdelphij	 */
468168404Spjd	q = s / (dcols - nparity);
469251629Sdelphij
470251629Sdelphij	/*
471251629Sdelphij	 * "Remainder": The number of partial stripe data sectors in this I/O.
472251629Sdelphij	 * This will add a sector to some, but not all, child vdevs.
473251629Sdelphij	 */
474168404Spjd	r = s - q * (dcols - nparity);
475251629Sdelphij
476251629Sdelphij	/* The number of "big columns" - those which contain remainder data. */
477168404Spjd	bc = (r == 0 ? 0 : r + nparity);
478251629Sdelphij
479251629Sdelphij	/*
480251629Sdelphij	 * The total number of data and parity sectors associated with
481251629Sdelphij	 * this I/O.
482251629Sdelphij	 */
483219089Spjd	tot = s + nparity * (q + (r == 0 ? 0 : 1));
484168404Spjd
485251629Sdelphij	/* acols: The columns that will be accessed. */
486251629Sdelphij	/* scols: The columns that will be accessed or skipped. */
487219089Spjd	if (q == 0) {
488251629Sdelphij		/* Our I/O request doesn't span all child vdevs. */
489219089Spjd		acols = bc;
490219089Spjd		scols = MIN(dcols, roundup(bc, nparity + 1));
491219089Spjd	} else {
492219089Spjd		acols = dcols;
493219089Spjd		scols = dcols;
494219089Spjd	}
495168404Spjd
496219089Spjd	ASSERT3U(acols, <=, scols);
497168404Spjd
498219089Spjd	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
499219089Spjd
500168404Spjd	rm->rm_cols = acols;
501219089Spjd	rm->rm_scols = scols;
502168404Spjd	rm->rm_bigcols = bc;
503219089Spjd	rm->rm_skipstart = bc;
504168404Spjd	rm->rm_missingdata = 0;
505168404Spjd	rm->rm_missingparity = 0;
506168404Spjd	rm->rm_firstdatacol = nparity;
507219089Spjd	rm->rm_datacopy = NULL;
508219089Spjd	rm->rm_reports = 0;
509219089Spjd	rm->rm_freed = 0;
510219089Spjd	rm->rm_ecksuminjected = 0;
511168404Spjd
512219089Spjd	asize = 0;
513219089Spjd
514219089Spjd	for (c = 0; c < scols; c++) {
515168404Spjd		col = f + c;
516168404Spjd		coff = o;
517168404Spjd		if (col >= dcols) {
518168404Spjd			col -= dcols;
519168404Spjd			coff += 1ULL << unit_shift;
520168404Spjd		}
521168404Spjd		rm->rm_col[c].rc_devidx = col;
522168404Spjd		rm->rm_col[c].rc_offset = coff;
523168404Spjd		rm->rm_col[c].rc_data = NULL;
524219089Spjd		rm->rm_col[c].rc_gdata = NULL;
525168404Spjd		rm->rm_col[c].rc_error = 0;
526168404Spjd		rm->rm_col[c].rc_tried = 0;
527168404Spjd		rm->rm_col[c].rc_skipped = 0;
528219089Spjd
529219089Spjd		if (c >= acols)
530219089Spjd			rm->rm_col[c].rc_size = 0;
531219089Spjd		else if (c < bc)
532219089Spjd			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
533219089Spjd		else
534219089Spjd			rm->rm_col[c].rc_size = q << unit_shift;
535219089Spjd
536219089Spjd		asize += rm->rm_col[c].rc_size;
537168404Spjd	}
538168404Spjd
539219089Spjd	ASSERT3U(asize, ==, tot << unit_shift);
540219089Spjd	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
541219089Spjd	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
542219089Spjd	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
543219089Spjd	ASSERT3U(rm->rm_nskip, <=, nparity);
544168404Spjd
545255750Sdelphij	if (!dofree) {
546240868Spjd		for (c = 0; c < rm->rm_firstdatacol; c++) {
547240868Spjd			rm->rm_col[c].rc_data =
548240868Spjd			    zio_buf_alloc(rm->rm_col[c].rc_size);
549240868Spjd		}
550168404Spjd
551255750Sdelphij		rm->rm_col[c].rc_data = data;
552168404Spjd
553240868Spjd		for (c = c + 1; c < acols; c++) {
554240868Spjd			rm->rm_col[c].rc_data =
555240868Spjd			    (char *)rm->rm_col[c - 1].rc_data +
556240868Spjd			    rm->rm_col[c - 1].rc_size;
557240868Spjd		}
558240868Spjd	}
559168404Spjd
560168404Spjd	/*
561168404Spjd	 * If all data stored spans all columns, there's a danger that parity
562168404Spjd	 * will always be on the same device and, since parity isn't read
563168404Spjd	 * during normal operation, that that device's I/O bandwidth won't be
564168404Spjd	 * used effectively. We therefore switch the parity every 1MB.
565168404Spjd	 *
566168404Spjd	 * ... at least that was, ostensibly, the theory. As a practical
567168404Spjd	 * matter unless we juggle the parity between all devices evenly, we
568168404Spjd	 * won't see any benefit. Further, occasional writes that aren't a
569168404Spjd	 * multiple of the LCM of the number of children and the minimum
570168404Spjd	 * stripe width are sufficient to avoid pessimal behavior.
571168404Spjd	 * Unfortunately, this decision created an implicit on-disk format
572168404Spjd	 * requirement that we need to support for all eternity, but only
573168404Spjd	 * for single-parity RAID-Z.
574219089Spjd	 *
575219089Spjd	 * If we intend to skip a sector in the zeroth column for padding
576219089Spjd	 * we must make sure to note this swap. We will never intend to
577219089Spjd	 * skip the first column since at least one data and one parity
578219089Spjd	 * column must appear in each row.
579168404Spjd	 */
580168404Spjd	ASSERT(rm->rm_cols >= 2);
581168404Spjd	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
582168404Spjd
583255750Sdelphij	if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
584168404Spjd		devidx = rm->rm_col[0].rc_devidx;
585168404Spjd		o = rm->rm_col[0].rc_offset;
586168404Spjd		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
587168404Spjd		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
588168404Spjd		rm->rm_col[1].rc_devidx = devidx;
589168404Spjd		rm->rm_col[1].rc_offset = o;
590219089Spjd
591219089Spjd		if (rm->rm_skipstart == 0)
592219089Spjd			rm->rm_skipstart = 1;
593168404Spjd	}
594168404Spjd
595168404Spjd	return (rm);
596168404Spjd}
597168404Spjd
598168404Spjdstatic void
599168404Spjdvdev_raidz_generate_parity_p(raidz_map_t *rm)
600168404Spjd{
601168404Spjd	uint64_t *p, *src, pcount, ccount, i;
602168404Spjd	int c;
603168404Spjd
604168404Spjd	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
605168404Spjd
606168404Spjd	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
607168404Spjd		src = rm->rm_col[c].rc_data;
608168404Spjd		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
609168404Spjd		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
610168404Spjd
611168404Spjd		if (c == rm->rm_firstdatacol) {
612168404Spjd			ASSERT(ccount == pcount);
613219089Spjd			for (i = 0; i < ccount; i++, src++, p++) {
614168404Spjd				*p = *src;
615168404Spjd			}
616168404Spjd		} else {
617168404Spjd			ASSERT(ccount <= pcount);
618219089Spjd			for (i = 0; i < ccount; i++, src++, p++) {
619168404Spjd				*p ^= *src;
620168404Spjd			}
621168404Spjd		}
622168404Spjd	}
623168404Spjd}
624168404Spjd
625168404Spjdstatic void
626168404Spjdvdev_raidz_generate_parity_pq(raidz_map_t *rm)
627168404Spjd{
628219089Spjd	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
629168404Spjd	int c;
630168404Spjd
631219089Spjd	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
632168404Spjd	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
633168404Spjd	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
634168404Spjd
635168404Spjd	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
636168404Spjd		src = rm->rm_col[c].rc_data;
637168404Spjd		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
638168404Spjd		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
639168404Spjd
640219089Spjd		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
641219089Spjd
642168404Spjd		if (c == rm->rm_firstdatacol) {
643219089Spjd			ASSERT(ccnt == pcnt || ccnt == 0);
644219089Spjd			for (i = 0; i < ccnt; i++, src++, p++, q++) {
645219089Spjd				*p = *src;
646168404Spjd				*q = *src;
647219089Spjd			}
648219089Spjd			for (; i < pcnt; i++, src++, p++, q++) {
649219089Spjd				*p = 0;
650219089Spjd				*q = 0;
651219089Spjd			}
652219089Spjd		} else {
653219089Spjd			ASSERT(ccnt <= pcnt);
654219089Spjd
655219089Spjd			/*
656219089Spjd			 * Apply the algorithm described above by multiplying
657219089Spjd			 * the previous result and adding in the new value.
658219089Spjd			 */
659219089Spjd			for (i = 0; i < ccnt; i++, src++, p++, q++) {
660219089Spjd				*p ^= *src;
661219089Spjd
662219089Spjd				VDEV_RAIDZ_64MUL_2(*q, mask);
663219089Spjd				*q ^= *src;
664219089Spjd			}
665219089Spjd
666219089Spjd			/*
667219089Spjd			 * Treat short columns as though they are full of 0s.
668219089Spjd			 * Note that there's therefore nothing needed for P.
669219089Spjd			 */
670219089Spjd			for (; i < pcnt; i++, q++) {
671219089Spjd				VDEV_RAIDZ_64MUL_2(*q, mask);
672219089Spjd			}
673219089Spjd		}
674219089Spjd	}
675219089Spjd}
676219089Spjd
677219089Spjdstatic void
678219089Spjdvdev_raidz_generate_parity_pqr(raidz_map_t *rm)
679219089Spjd{
680219089Spjd	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
681219089Spjd	int c;
682219089Spjd
683219089Spjd	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
684219089Spjd	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
685219089Spjd	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
686219089Spjd	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
687219089Spjd	    rm->rm_col[VDEV_RAIDZ_R].rc_size);
688219089Spjd
689219089Spjd	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
690219089Spjd		src = rm->rm_col[c].rc_data;
691219089Spjd		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
692219089Spjd		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
693219089Spjd		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
694219089Spjd
695219089Spjd		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
696219089Spjd
697219089Spjd		if (c == rm->rm_firstdatacol) {
698219089Spjd			ASSERT(ccnt == pcnt || ccnt == 0);
699219089Spjd			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
700168404Spjd				*p = *src;
701219089Spjd				*q = *src;
702219089Spjd				*r = *src;
703168404Spjd			}
704219089Spjd			for (; i < pcnt; i++, src++, p++, q++, r++) {
705219089Spjd				*p = 0;
706168404Spjd				*q = 0;
707219089Spjd				*r = 0;
708168404Spjd			}
709168404Spjd		} else {
710219089Spjd			ASSERT(ccnt <= pcnt);
711168404Spjd
712168404Spjd			/*
713219089Spjd			 * Apply the algorithm described above by multiplying
714219089Spjd			 * the previous result and adding in the new value.
715168404Spjd			 */
716219089Spjd			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
717219089Spjd				*p ^= *src;
718219089Spjd
719219089Spjd				VDEV_RAIDZ_64MUL_2(*q, mask);
720168404Spjd				*q ^= *src;
721219089Spjd
722219089Spjd				VDEV_RAIDZ_64MUL_4(*r, mask);
723219089Spjd				*r ^= *src;
724168404Spjd			}
725168404Spjd
726168404Spjd			/*
727168404Spjd			 * Treat short columns as though they are full of 0s.
728219089Spjd			 * Note that there's therefore nothing needed for P.
729168404Spjd			 */
730219089Spjd			for (; i < pcnt; i++, q++, r++) {
731219089Spjd				VDEV_RAIDZ_64MUL_2(*q, mask);
732219089Spjd				VDEV_RAIDZ_64MUL_4(*r, mask);
733168404Spjd			}
734168404Spjd		}
735168404Spjd	}
736168404Spjd}
737168404Spjd
738219089Spjd/*
739219089Spjd * Generate RAID parity in the first virtual columns according to the number of
740219089Spjd * parity columns available.
741219089Spjd */
742168404Spjdstatic void
743219089Spjdvdev_raidz_generate_parity(raidz_map_t *rm)
744168404Spjd{
745219089Spjd	switch (rm->rm_firstdatacol) {
746219089Spjd	case 1:
747219089Spjd		vdev_raidz_generate_parity_p(rm);
748219089Spjd		break;
749219089Spjd	case 2:
750219089Spjd		vdev_raidz_generate_parity_pq(rm);
751219089Spjd		break;
752219089Spjd	case 3:
753219089Spjd		vdev_raidz_generate_parity_pqr(rm);
754219089Spjd		break;
755219089Spjd	default:
756219089Spjd		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
757219089Spjd	}
758219089Spjd}
759219089Spjd
760219089Spjdstatic int
761219089Spjdvdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
762219089Spjd{
763168404Spjd	uint64_t *dst, *src, xcount, ccount, count, i;
764219089Spjd	int x = tgts[0];
765168404Spjd	int c;
766168404Spjd
767219089Spjd	ASSERT(ntgts == 1);
768219089Spjd	ASSERT(x >= rm->rm_firstdatacol);
769219089Spjd	ASSERT(x < rm->rm_cols);
770219089Spjd
771168404Spjd	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
772168404Spjd	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
773168404Spjd	ASSERT(xcount > 0);
774168404Spjd
775168404Spjd	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
776168404Spjd	dst = rm->rm_col[x].rc_data;
777168404Spjd	for (i = 0; i < xcount; i++, dst++, src++) {
778168404Spjd		*dst = *src;
779168404Spjd	}
780168404Spjd
781168404Spjd	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
782168404Spjd		src = rm->rm_col[c].rc_data;
783168404Spjd		dst = rm->rm_col[x].rc_data;
784168404Spjd
785168404Spjd		if (c == x)
786168404Spjd			continue;
787168404Spjd
788168404Spjd		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
789168404Spjd		count = MIN(ccount, xcount);
790168404Spjd
791168404Spjd		for (i = 0; i < count; i++, dst++, src++) {
792168404Spjd			*dst ^= *src;
793168404Spjd		}
794168404Spjd	}
795219089Spjd
796219089Spjd	return (1 << VDEV_RAIDZ_P);
797168404Spjd}
798168404Spjd
799219089Spjdstatic int
800219089Spjdvdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
801168404Spjd{
802168404Spjd	uint64_t *dst, *src, xcount, ccount, count, mask, i;
803168404Spjd	uint8_t *b;
804219089Spjd	int x = tgts[0];
805168404Spjd	int c, j, exp;
806168404Spjd
807219089Spjd	ASSERT(ntgts == 1);
808219089Spjd
809168404Spjd	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
810168404Spjd	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
811168404Spjd
812168404Spjd	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
813168404Spjd		src = rm->rm_col[c].rc_data;
814168404Spjd		dst = rm->rm_col[x].rc_data;
815168404Spjd
816168404Spjd		if (c == x)
817168404Spjd			ccount = 0;
818168404Spjd		else
819168404Spjd			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
820168404Spjd
821168404Spjd		count = MIN(ccount, xcount);
822168404Spjd
823168404Spjd		if (c == rm->rm_firstdatacol) {
824168404Spjd			for (i = 0; i < count; i++, dst++, src++) {
825168404Spjd				*dst = *src;
826168404Spjd			}
827168404Spjd			for (; i < xcount; i++, dst++) {
828168404Spjd				*dst = 0;
829168404Spjd			}
830168404Spjd
831168404Spjd		} else {
832168404Spjd			for (i = 0; i < count; i++, dst++, src++) {
833219089Spjd				VDEV_RAIDZ_64MUL_2(*dst, mask);
834168404Spjd				*dst ^= *src;
835168404Spjd			}
836168404Spjd
837168404Spjd			for (; i < xcount; i++, dst++) {
838219089Spjd				VDEV_RAIDZ_64MUL_2(*dst, mask);
839168404Spjd			}
840168404Spjd		}
841168404Spjd	}
842168404Spjd
843168404Spjd	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
844168404Spjd	dst = rm->rm_col[x].rc_data;
845168404Spjd	exp = 255 - (rm->rm_cols - 1 - x);
846168404Spjd
847168404Spjd	for (i = 0; i < xcount; i++, dst++, src++) {
848168404Spjd		*dst ^= *src;
849168404Spjd		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
850168404Spjd			*b = vdev_raidz_exp2(*b, exp);
851168404Spjd		}
852168404Spjd	}
853219089Spjd
854219089Spjd	return (1 << VDEV_RAIDZ_Q);
855168404Spjd}
856168404Spjd
857219089Spjdstatic int
858219089Spjdvdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
859168404Spjd{
860168404Spjd	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
861168404Spjd	void *pdata, *qdata;
862168404Spjd	uint64_t xsize, ysize, i;
863219089Spjd	int x = tgts[0];
864219089Spjd	int y = tgts[1];
865168404Spjd
866219089Spjd	ASSERT(ntgts == 2);
867168404Spjd	ASSERT(x < y);
868168404Spjd	ASSERT(x >= rm->rm_firstdatacol);
869168404Spjd	ASSERT(y < rm->rm_cols);
870168404Spjd
871168404Spjd	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
872168404Spjd
873168404Spjd	/*
874168404Spjd	 * Move the parity data aside -- we're going to compute parity as
875168404Spjd	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
876168404Spjd	 * reuse the parity generation mechanism without trashing the actual
877168404Spjd	 * parity so we make those columns appear to be full of zeros by
878168404Spjd	 * setting their lengths to zero.
879168404Spjd	 */
880168404Spjd	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
881168404Spjd	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
882168404Spjd	xsize = rm->rm_col[x].rc_size;
883168404Spjd	ysize = rm->rm_col[y].rc_size;
884168404Spjd
885168404Spjd	rm->rm_col[VDEV_RAIDZ_P].rc_data =
886168404Spjd	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
887168404Spjd	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
888168404Spjd	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
889168404Spjd	rm->rm_col[x].rc_size = 0;
890168404Spjd	rm->rm_col[y].rc_size = 0;
891168404Spjd
892168404Spjd	vdev_raidz_generate_parity_pq(rm);
893168404Spjd
894168404Spjd	rm->rm_col[x].rc_size = xsize;
895168404Spjd	rm->rm_col[y].rc_size = ysize;
896168404Spjd
897168404Spjd	p = pdata;
898168404Spjd	q = qdata;
899168404Spjd	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
900168404Spjd	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
901168404Spjd	xd = rm->rm_col[x].rc_data;
902168404Spjd	yd = rm->rm_col[y].rc_data;
903168404Spjd
904168404Spjd	/*
905168404Spjd	 * We now have:
906168404Spjd	 *	Pxy = P + D_x + D_y
907168404Spjd	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
908168404Spjd	 *
909168404Spjd	 * We can then solve for D_x:
910168404Spjd	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
911168404Spjd	 * where
912168404Spjd	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
913168404Spjd	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
914168404Spjd	 *
915168404Spjd	 * With D_x in hand, we can easily solve for D_y:
916168404Spjd	 *	D_y = P + Pxy + D_x
917168404Spjd	 */
918168404Spjd
919168404Spjd	a = vdev_raidz_pow2[255 + x - y];
920168404Spjd	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
921168404Spjd	tmp = 255 - vdev_raidz_log2[a ^ 1];
922168404Spjd
923168404Spjd	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
924168404Spjd	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
925168404Spjd
926168404Spjd	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
927168404Spjd		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
928168404Spjd		    vdev_raidz_exp2(*q ^ *qxy, bexp);
929168404Spjd
930168404Spjd		if (i < ysize)
931168404Spjd			*yd = *p ^ *pxy ^ *xd;
932168404Spjd	}
933168404Spjd
934168404Spjd	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
935168404Spjd	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
936168404Spjd	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
937168404Spjd	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
938168404Spjd
939168404Spjd	/*
940168404Spjd	 * Restore the saved parity data.
941168404Spjd	 */
942168404Spjd	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
943168404Spjd	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
944219089Spjd
945219089Spjd	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
946168404Spjd}
947168404Spjd
948219089Spjd/* BEGIN CSTYLED */
949219089Spjd/*
950219089Spjd * In the general case of reconstruction, we must solve the system of linear
951219089Spjd * equations defined by the coeffecients used to generate parity as well as
952219089Spjd * the contents of the data and parity disks. This can be expressed with
953219089Spjd * vectors for the original data (D) and the actual data (d) and parity (p)
954219089Spjd * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
955219089Spjd *
956219089Spjd *            __   __                     __     __
957219089Spjd *            |     |         __     __   |  p_0  |
958219089Spjd *            |  V  |         |  D_0  |   | p_m-1 |
959219089Spjd *            |     |    x    |   :   | = |  d_0  |
960219089Spjd *            |  I  |         | D_n-1 |   |   :   |
961219089Spjd *            |     |         ~~     ~~   | d_n-1 |
962219089Spjd *            ~~   ~~                     ~~     ~~
963219089Spjd *
964219089Spjd * I is simply a square identity matrix of size n, and V is a vandermonde
965219089Spjd * matrix defined by the coeffecients we chose for the various parity columns
966219089Spjd * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
967219089Spjd * computation as well as linear separability.
968219089Spjd *
969219089Spjd *      __               __               __     __
970219089Spjd *      |   1   ..  1 1 1 |               |  p_0  |
971219089Spjd *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
972219089Spjd *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
973219089Spjd *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
974219089Spjd *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
975219089Spjd *      |   :       : : : |   |   :   |   |  d_2  |
976219089Spjd *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
977219089Spjd *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
978219089Spjd *      |   0   ..  0 0 1 |               | d_n-1 |
979219089Spjd *      ~~               ~~               ~~     ~~
980219089Spjd *
981219089Spjd * Note that I, V, d, and p are known. To compute D, we must invert the
982219089Spjd * matrix and use the known data and parity values to reconstruct the unknown
983219089Spjd * data values. We begin by removing the rows in V|I and d|p that correspond
984219089Spjd * to failed or missing columns; we then make V|I square (n x n) and d|p
985219089Spjd * sized n by removing rows corresponding to unused parity from the bottom up
986219089Spjd * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
987219089Spjd * using Gauss-Jordan elimination. In the example below we use m=3 parity
988219089Spjd * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
989219089Spjd *           __                               __
990219089Spjd *           |  1   1   1   1   1   1   1   1  |
991219089Spjd *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
992219089Spjd *           |  19 205 116  29  64  16  4   1  |      / /
993219089Spjd *           |  1   0   0   0   0   0   0   0  |     / /
994219089Spjd *           |  0   1   0   0   0   0   0   0  | <--' /
995219089Spjd *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
996219089Spjd *           |  0   0   0   1   0   0   0   0  |
997219089Spjd *           |  0   0   0   0   1   0   0   0  |
998219089Spjd *           |  0   0   0   0   0   1   0   0  |
999219089Spjd *           |  0   0   0   0   0   0   1   0  |
1000219089Spjd *           |  0   0   0   0   0   0   0   1  |
1001219089Spjd *           ~~                               ~~
1002219089Spjd *           __                               __
1003219089Spjd *           |  1   1   1   1   1   1   1   1  |
1004219089Spjd *           |  19 205 116  29  64  16  4   1  |
1005219089Spjd *           |  1   0   0   0   0   0   0   0  |
1006255750Sdelphij *  (V|I)' = |  0   0   0   1   0   0   0   0  |
1007219089Spjd *           |  0   0   0   0   1   0   0   0  |
1008219089Spjd *           |  0   0   0   0   0   1   0   0  |
1009219089Spjd *           |  0   0   0   0   0   0   1   0  |
1010219089Spjd *           |  0   0   0   0   0   0   0   1  |
1011219089Spjd *           ~~                               ~~
1012219089Spjd *
1013219089Spjd * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1014219089Spjd * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1015219089Spjd * matrix is not singular.
1016219089Spjd * __                                                                 __
1017219089Spjd * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1018219089Spjd * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1019219089Spjd * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1020219089Spjd * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1021219089Spjd * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1022219089Spjd * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1023219089Spjd * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1024219089Spjd * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1025219089Spjd * ~~                                                                 ~~
1026219089Spjd * __                                                                 __
1027219089Spjd * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1028219089Spjd * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1029219089Spjd * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1030219089Spjd * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1031219089Spjd * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1032219089Spjd * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1033219089Spjd * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1034219089Spjd * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1035219089Spjd * ~~                                                                 ~~
1036219089Spjd * __                                                                 __
1037219089Spjd * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1038219089Spjd * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1039219089Spjd * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1040219089Spjd * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1041219089Spjd * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1042219089Spjd * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1043219089Spjd * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1044219089Spjd * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1045219089Spjd * ~~                                                                 ~~
1046219089Spjd * __                                                                 __
1047219089Spjd * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1048219089Spjd * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1049219089Spjd * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1050219089Spjd * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1051219089Spjd * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1052219089Spjd * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1053219089Spjd * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1054219089Spjd * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1055219089Spjd * ~~                                                                 ~~
1056219089Spjd * __                                                                 __
1057219089Spjd * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1058219089Spjd * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1059219089Spjd * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1060219089Spjd * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1061219089Spjd * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1062219089Spjd * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1063219089Spjd * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1064219089Spjd * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1065219089Spjd * ~~                                                                 ~~
1066219089Spjd * __                                                                 __
1067219089Spjd * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1068219089Spjd * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1069219089Spjd * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1070219089Spjd * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1071219089Spjd * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1072219089Spjd * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1073219089Spjd * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1074219089Spjd * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1075219089Spjd * ~~                                                                 ~~
1076219089Spjd *                   __                               __
1077219089Spjd *                   |  0   0   1   0   0   0   0   0  |
1078219089Spjd *                   | 167 100  5   41 159 169 217 208 |
1079219089Spjd *                   | 166 100  4   40 158 168 216 209 |
1080219089Spjd *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1081219089Spjd *                   |  0   0   0   0   1   0   0   0  |
1082219089Spjd *                   |  0   0   0   0   0   1   0   0  |
1083219089Spjd *                   |  0   0   0   0   0   0   1   0  |
1084219089Spjd *                   |  0   0   0   0   0   0   0   1  |
1085219089Spjd *                   ~~                               ~~
1086219089Spjd *
1087219089Spjd * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1088219089Spjd * of the missing data.
1089219089Spjd *
1090219089Spjd * As is apparent from the example above, the only non-trivial rows in the
1091219089Spjd * inverse matrix correspond to the data disks that we're trying to
1092219089Spjd * reconstruct. Indeed, those are the only rows we need as the others would
1093219089Spjd * only be useful for reconstructing data known or assumed to be valid. For
1094219089Spjd * that reason, we only build the coefficients in the rows that correspond to
1095219089Spjd * targeted columns.
1096219089Spjd */
1097219089Spjd/* END CSTYLED */
1098168404Spjd
1099219089Spjdstatic void
1100219089Spjdvdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1101219089Spjd    uint8_t **rows)
1102219089Spjd{
1103219089Spjd	int i, j;
1104219089Spjd	int pow;
1105219089Spjd
1106219089Spjd	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1107219089Spjd
1108219089Spjd	/*
1109219089Spjd	 * Fill in the missing rows of interest.
1110219089Spjd	 */
1111219089Spjd	for (i = 0; i < nmap; i++) {
1112219089Spjd		ASSERT3S(0, <=, map[i]);
1113219089Spjd		ASSERT3S(map[i], <=, 2);
1114219089Spjd
1115219089Spjd		pow = map[i] * n;
1116219089Spjd		if (pow > 255)
1117219089Spjd			pow -= 255;
1118219089Spjd		ASSERT(pow <= 255);
1119219089Spjd
1120219089Spjd		for (j = 0; j < n; j++) {
1121219089Spjd			pow -= map[i];
1122219089Spjd			if (pow < 0)
1123219089Spjd				pow += 255;
1124219089Spjd			rows[i][j] = vdev_raidz_pow2[pow];
1125219089Spjd		}
1126219089Spjd	}
1127219089Spjd}
1128219089Spjd
1129219089Spjdstatic void
1130219089Spjdvdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1131219089Spjd    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1132219089Spjd{
1133219089Spjd	int i, j, ii, jj;
1134219089Spjd	uint8_t log;
1135219089Spjd
1136219089Spjd	/*
1137219089Spjd	 * Assert that the first nmissing entries from the array of used
1138219089Spjd	 * columns correspond to parity columns and that subsequent entries
1139219089Spjd	 * correspond to data columns.
1140219089Spjd	 */
1141219089Spjd	for (i = 0; i < nmissing; i++) {
1142219089Spjd		ASSERT3S(used[i], <, rm->rm_firstdatacol);
1143219089Spjd	}
1144219089Spjd	for (; i < n; i++) {
1145219089Spjd		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1146219089Spjd	}
1147219089Spjd
1148219089Spjd	/*
1149219089Spjd	 * First initialize the storage where we'll compute the inverse rows.
1150219089Spjd	 */
1151219089Spjd	for (i = 0; i < nmissing; i++) {
1152219089Spjd		for (j = 0; j < n; j++) {
1153219089Spjd			invrows[i][j] = (i == j) ? 1 : 0;
1154219089Spjd		}
1155219089Spjd	}
1156219089Spjd
1157219089Spjd	/*
1158219089Spjd	 * Subtract all trivial rows from the rows of consequence.
1159219089Spjd	 */
1160219089Spjd	for (i = 0; i < nmissing; i++) {
1161219089Spjd		for (j = nmissing; j < n; j++) {
1162219089Spjd			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1163219089Spjd			jj = used[j] - rm->rm_firstdatacol;
1164219089Spjd			ASSERT3S(jj, <, n);
1165219089Spjd			invrows[i][j] = rows[i][jj];
1166219089Spjd			rows[i][jj] = 0;
1167219089Spjd		}
1168219089Spjd	}
1169219089Spjd
1170219089Spjd	/*
1171219089Spjd	 * For each of the rows of interest, we must normalize it and subtract
1172219089Spjd	 * a multiple of it from the other rows.
1173219089Spjd	 */
1174219089Spjd	for (i = 0; i < nmissing; i++) {
1175219089Spjd		for (j = 0; j < missing[i]; j++) {
1176240415Smm			ASSERT0(rows[i][j]);
1177219089Spjd		}
1178219089Spjd		ASSERT3U(rows[i][missing[i]], !=, 0);
1179219089Spjd
1180219089Spjd		/*
1181219089Spjd		 * Compute the inverse of the first element and multiply each
1182219089Spjd		 * element in the row by that value.
1183219089Spjd		 */
1184219089Spjd		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1185219089Spjd
1186219089Spjd		for (j = 0; j < n; j++) {
1187219089Spjd			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1188219089Spjd			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1189219089Spjd		}
1190219089Spjd
1191219089Spjd		for (ii = 0; ii < nmissing; ii++) {
1192219089Spjd			if (i == ii)
1193219089Spjd				continue;
1194219089Spjd
1195219089Spjd			ASSERT3U(rows[ii][missing[i]], !=, 0);
1196219089Spjd
1197219089Spjd			log = vdev_raidz_log2[rows[ii][missing[i]]];
1198219089Spjd
1199219089Spjd			for (j = 0; j < n; j++) {
1200219089Spjd				rows[ii][j] ^=
1201219089Spjd				    vdev_raidz_exp2(rows[i][j], log);
1202219089Spjd				invrows[ii][j] ^=
1203219089Spjd				    vdev_raidz_exp2(invrows[i][j], log);
1204219089Spjd			}
1205219089Spjd		}
1206219089Spjd	}
1207219089Spjd
1208219089Spjd	/*
1209219089Spjd	 * Verify that the data that is left in the rows are properly part of
1210219089Spjd	 * an identity matrix.
1211219089Spjd	 */
1212219089Spjd	for (i = 0; i < nmissing; i++) {
1213219089Spjd		for (j = 0; j < n; j++) {
1214219089Spjd			if (j == missing[i]) {
1215219089Spjd				ASSERT3U(rows[i][j], ==, 1);
1216219089Spjd			} else {
1217240415Smm				ASSERT0(rows[i][j]);
1218219089Spjd			}
1219219089Spjd		}
1220219089Spjd	}
1221219089Spjd}
1222219089Spjd
1223219089Spjdstatic void
1224219089Spjdvdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1225219089Spjd    int *missing, uint8_t **invrows, const uint8_t *used)
1226219089Spjd{
1227219089Spjd	int i, j, x, cc, c;
1228219089Spjd	uint8_t *src;
1229219089Spjd	uint64_t ccount;
1230219089Spjd	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1231219089Spjd	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1232247187Smm	uint8_t log = 0;
1233247187Smm	uint8_t val;
1234219089Spjd	int ll;
1235219089Spjd	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1236219089Spjd	uint8_t *p, *pp;
1237219089Spjd	size_t psize;
1238219089Spjd
1239219089Spjd	psize = sizeof (invlog[0][0]) * n * nmissing;
1240219089Spjd	p = kmem_alloc(psize, KM_SLEEP);
1241219089Spjd
1242219089Spjd	for (pp = p, i = 0; i < nmissing; i++) {
1243219089Spjd		invlog[i] = pp;
1244219089Spjd		pp += n;
1245219089Spjd	}
1246219089Spjd
1247219089Spjd	for (i = 0; i < nmissing; i++) {
1248219089Spjd		for (j = 0; j < n; j++) {
1249219089Spjd			ASSERT3U(invrows[i][j], !=, 0);
1250219089Spjd			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1251219089Spjd		}
1252219089Spjd	}
1253219089Spjd
1254219089Spjd	for (i = 0; i < n; i++) {
1255219089Spjd		c = used[i];
1256219089Spjd		ASSERT3U(c, <, rm->rm_cols);
1257219089Spjd
1258219089Spjd		src = rm->rm_col[c].rc_data;
1259219089Spjd		ccount = rm->rm_col[c].rc_size;
1260219089Spjd		for (j = 0; j < nmissing; j++) {
1261219089Spjd			cc = missing[j] + rm->rm_firstdatacol;
1262219089Spjd			ASSERT3U(cc, >=, rm->rm_firstdatacol);
1263219089Spjd			ASSERT3U(cc, <, rm->rm_cols);
1264219089Spjd			ASSERT3U(cc, !=, c);
1265219089Spjd
1266219089Spjd			dst[j] = rm->rm_col[cc].rc_data;
1267219089Spjd			dcount[j] = rm->rm_col[cc].rc_size;
1268219089Spjd		}
1269219089Spjd
1270219089Spjd		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1271219089Spjd
1272219089Spjd		for (x = 0; x < ccount; x++, src++) {
1273219089Spjd			if (*src != 0)
1274219089Spjd				log = vdev_raidz_log2[*src];
1275219089Spjd
1276219089Spjd			for (cc = 0; cc < nmissing; cc++) {
1277219089Spjd				if (x >= dcount[cc])
1278219089Spjd					continue;
1279219089Spjd
1280219089Spjd				if (*src == 0) {
1281219089Spjd					val = 0;
1282219089Spjd				} else {
1283219089Spjd					if ((ll = log + invlog[cc][i]) >= 255)
1284219089Spjd						ll -= 255;
1285219089Spjd					val = vdev_raidz_pow2[ll];
1286219089Spjd				}
1287219089Spjd
1288219089Spjd				if (i == 0)
1289219089Spjd					dst[cc][x] = val;
1290219089Spjd				else
1291219089Spjd					dst[cc][x] ^= val;
1292219089Spjd			}
1293219089Spjd		}
1294219089Spjd	}
1295219089Spjd
1296219089Spjd	kmem_free(p, psize);
1297219089Spjd}
1298219089Spjd
1299168404Spjdstatic int
1300219089Spjdvdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1301219089Spjd{
1302219089Spjd	int n, i, c, t, tt;
1303219089Spjd	int nmissing_rows;
1304219089Spjd	int missing_rows[VDEV_RAIDZ_MAXPARITY];
1305219089Spjd	int parity_map[VDEV_RAIDZ_MAXPARITY];
1306219089Spjd
1307219089Spjd	uint8_t *p, *pp;
1308219089Spjd	size_t psize;
1309219089Spjd
1310219089Spjd	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1311219089Spjd	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1312219089Spjd	uint8_t *used;
1313219089Spjd
1314219089Spjd	int code = 0;
1315219089Spjd
1316219089Spjd
1317219089Spjd	n = rm->rm_cols - rm->rm_firstdatacol;
1318219089Spjd
1319219089Spjd	/*
1320219089Spjd	 * Figure out which data columns are missing.
1321219089Spjd	 */
1322219089Spjd	nmissing_rows = 0;
1323219089Spjd	for (t = 0; t < ntgts; t++) {
1324219089Spjd		if (tgts[t] >= rm->rm_firstdatacol) {
1325219089Spjd			missing_rows[nmissing_rows++] =
1326219089Spjd			    tgts[t] - rm->rm_firstdatacol;
1327219089Spjd		}
1328219089Spjd	}
1329219089Spjd
1330219089Spjd	/*
1331219089Spjd	 * Figure out which parity columns to use to help generate the missing
1332219089Spjd	 * data columns.
1333219089Spjd	 */
1334219089Spjd	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1335219089Spjd		ASSERT(tt < ntgts);
1336219089Spjd		ASSERT(c < rm->rm_firstdatacol);
1337219089Spjd
1338219089Spjd		/*
1339219089Spjd		 * Skip any targeted parity columns.
1340219089Spjd		 */
1341219089Spjd		if (c == tgts[tt]) {
1342219089Spjd			tt++;
1343219089Spjd			continue;
1344219089Spjd		}
1345219089Spjd
1346219089Spjd		code |= 1 << c;
1347219089Spjd
1348219089Spjd		parity_map[i] = c;
1349219089Spjd		i++;
1350219089Spjd	}
1351219089Spjd
1352219089Spjd	ASSERT(code != 0);
1353219089Spjd	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1354219089Spjd
1355219089Spjd	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1356219089Spjd	    nmissing_rows * n + sizeof (used[0]) * n;
1357219089Spjd	p = kmem_alloc(psize, KM_SLEEP);
1358219089Spjd
1359219089Spjd	for (pp = p, i = 0; i < nmissing_rows; i++) {
1360219089Spjd		rows[i] = pp;
1361219089Spjd		pp += n;
1362219089Spjd		invrows[i] = pp;
1363219089Spjd		pp += n;
1364219089Spjd	}
1365219089Spjd	used = pp;
1366219089Spjd
1367219089Spjd	for (i = 0; i < nmissing_rows; i++) {
1368219089Spjd		used[i] = parity_map[i];
1369219089Spjd	}
1370219089Spjd
1371219089Spjd	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1372219089Spjd		if (tt < nmissing_rows &&
1373219089Spjd		    c == missing_rows[tt] + rm->rm_firstdatacol) {
1374219089Spjd			tt++;
1375219089Spjd			continue;
1376219089Spjd		}
1377219089Spjd
1378219089Spjd		ASSERT3S(i, <, n);
1379219089Spjd		used[i] = c;
1380219089Spjd		i++;
1381219089Spjd	}
1382219089Spjd
1383219089Spjd	/*
1384219089Spjd	 * Initialize the interesting rows of the matrix.
1385219089Spjd	 */
1386219089Spjd	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1387219089Spjd
1388219089Spjd	/*
1389219089Spjd	 * Invert the matrix.
1390219089Spjd	 */
1391219089Spjd	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1392219089Spjd	    invrows, used);
1393219089Spjd
1394219089Spjd	/*
1395219089Spjd	 * Reconstruct the missing data using the generated matrix.
1396219089Spjd	 */
1397219089Spjd	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1398219089Spjd	    invrows, used);
1399219089Spjd
1400219089Spjd	kmem_free(p, psize);
1401219089Spjd
1402219089Spjd	return (code);
1403219089Spjd}
1404219089Spjd
1405219089Spjdstatic int
1406219089Spjdvdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1407219089Spjd{
1408219089Spjd	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1409219089Spjd	int ntgts;
1410219089Spjd	int i, c;
1411219089Spjd	int code;
1412219089Spjd	int nbadparity, nbaddata;
1413219089Spjd	int parity_valid[VDEV_RAIDZ_MAXPARITY];
1414219089Spjd
1415219089Spjd	/*
1416219089Spjd	 * The tgts list must already be sorted.
1417219089Spjd	 */
1418219089Spjd	for (i = 1; i < nt; i++) {
1419219089Spjd		ASSERT(t[i] > t[i - 1]);
1420219089Spjd	}
1421219089Spjd
1422219089Spjd	nbadparity = rm->rm_firstdatacol;
1423219089Spjd	nbaddata = rm->rm_cols - nbadparity;
1424219089Spjd	ntgts = 0;
1425219089Spjd	for (i = 0, c = 0; c < rm->rm_cols; c++) {
1426219089Spjd		if (c < rm->rm_firstdatacol)
1427219089Spjd			parity_valid[c] = B_FALSE;
1428219089Spjd
1429219089Spjd		if (i < nt && c == t[i]) {
1430219089Spjd			tgts[ntgts++] = c;
1431219089Spjd			i++;
1432219089Spjd		} else if (rm->rm_col[c].rc_error != 0) {
1433219089Spjd			tgts[ntgts++] = c;
1434219089Spjd		} else if (c >= rm->rm_firstdatacol) {
1435219089Spjd			nbaddata--;
1436219089Spjd		} else {
1437219089Spjd			parity_valid[c] = B_TRUE;
1438219089Spjd			nbadparity--;
1439219089Spjd		}
1440219089Spjd	}
1441219089Spjd
1442219089Spjd	ASSERT(ntgts >= nt);
1443219089Spjd	ASSERT(nbaddata >= 0);
1444219089Spjd	ASSERT(nbaddata + nbadparity == ntgts);
1445219089Spjd
1446219089Spjd	dt = &tgts[nbadparity];
1447219089Spjd
1448219089Spjd	/*
1449219089Spjd	 * See if we can use any of our optimized reconstruction routines.
1450219089Spjd	 */
1451219089Spjd	if (!vdev_raidz_default_to_general) {
1452219089Spjd		switch (nbaddata) {
1453219089Spjd		case 1:
1454219089Spjd			if (parity_valid[VDEV_RAIDZ_P])
1455219089Spjd				return (vdev_raidz_reconstruct_p(rm, dt, 1));
1456219089Spjd
1457219089Spjd			ASSERT(rm->rm_firstdatacol > 1);
1458219089Spjd
1459219089Spjd			if (parity_valid[VDEV_RAIDZ_Q])
1460219089Spjd				return (vdev_raidz_reconstruct_q(rm, dt, 1));
1461219089Spjd
1462219089Spjd			ASSERT(rm->rm_firstdatacol > 2);
1463219089Spjd			break;
1464219089Spjd
1465219089Spjd		case 2:
1466219089Spjd			ASSERT(rm->rm_firstdatacol > 1);
1467219089Spjd
1468219089Spjd			if (parity_valid[VDEV_RAIDZ_P] &&
1469219089Spjd			    parity_valid[VDEV_RAIDZ_Q])
1470219089Spjd				return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1471219089Spjd
1472219089Spjd			ASSERT(rm->rm_firstdatacol > 2);
1473219089Spjd
1474219089Spjd			break;
1475219089Spjd		}
1476219089Spjd	}
1477219089Spjd
1478219089Spjd	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1479219089Spjd	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1480219089Spjd	ASSERT(code > 0);
1481219089Spjd	return (code);
1482219089Spjd}
1483219089Spjd
1484219089Spjdstatic int
1485236155Smmvdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1486254591Sgibbs    uint64_t *logical_ashift, uint64_t *physical_ashift)
1487168404Spjd{
1488168404Spjd	vdev_t *cvd;
1489168404Spjd	uint64_t nparity = vd->vdev_nparity;
1490219089Spjd	int c;
1491168404Spjd	int lasterror = 0;
1492168404Spjd	int numerrors = 0;
1493168404Spjd
1494168404Spjd	ASSERT(nparity > 0);
1495168404Spjd
1496168404Spjd	if (nparity > VDEV_RAIDZ_MAXPARITY ||
1497168404Spjd	    vd->vdev_children < nparity + 1) {
1498168404Spjd		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1499249195Smm		return (SET_ERROR(EINVAL));
1500168404Spjd	}
1501168404Spjd
1502219089Spjd	vdev_open_children(vd);
1503219089Spjd
1504168404Spjd	for (c = 0; c < vd->vdev_children; c++) {
1505168404Spjd		cvd = vd->vdev_child[c];
1506168404Spjd
1507219089Spjd		if (cvd->vdev_open_error != 0) {
1508219089Spjd			lasterror = cvd->vdev_open_error;
1509168404Spjd			numerrors++;
1510168404Spjd			continue;
1511168404Spjd		}
1512168404Spjd
1513168404Spjd		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1514236155Smm		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1515254591Sgibbs		*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
1516254591Sgibbs		*physical_ashift = MAX(*physical_ashift,
1517254591Sgibbs		    cvd->vdev_physical_ashift);
1518168404Spjd	}
1519168404Spjd
1520168404Spjd	*asize *= vd->vdev_children;
1521236155Smm	*max_asize *= vd->vdev_children;
1522168404Spjd
1523168404Spjd	if (numerrors > nparity) {
1524168404Spjd		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1525168404Spjd		return (lasterror);
1526168404Spjd	}
1527168404Spjd
1528168404Spjd	return (0);
1529168404Spjd}
1530168404Spjd
1531168404Spjdstatic void
1532168404Spjdvdev_raidz_close(vdev_t *vd)
1533168404Spjd{
1534168404Spjd	int c;
1535168404Spjd
1536168404Spjd	for (c = 0; c < vd->vdev_children; c++)
1537168404Spjd		vdev_close(vd->vdev_child[c]);
1538168404Spjd}
1539168404Spjd
1540255750Sdelphij#ifdef illumos
1541255750Sdelphij/*
1542255750Sdelphij * Handle a read or write I/O to a RAID-Z dump device.
1543255750Sdelphij *
1544255750Sdelphij * The dump device is in a unique situation compared to other ZFS datasets:
1545255750Sdelphij * writing to this device should be as simple and fast as possible.  In
1546255750Sdelphij * addition, durability matters much less since the dump will be extracted
1547255750Sdelphij * once the machine reboots.  For that reason, this function eschews parity for
1548255750Sdelphij * performance and simplicity.  The dump device uses the checksum setting
1549255750Sdelphij * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
1550255750Sdelphij * dataset.
1551255750Sdelphij *
1552255750Sdelphij * Blocks of size 128 KB have been preallocated for this volume.  I/Os less than
1553255750Sdelphij * 128 KB will not fill an entire block; in addition, they may not be properly
1554255750Sdelphij * aligned.  In that case, this function uses the preallocated 128 KB block and
1555255750Sdelphij * omits reading or writing any "empty" portions of that block, as opposed to
1556255750Sdelphij * allocating a fresh appropriately-sized block.
1557255750Sdelphij *
1558255750Sdelphij * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
1559255750Sdelphij *
1560255750Sdelphij *     vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
1561255750Sdelphij *
1562255750Sdelphij * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
1563255750Sdelphij * allocated which spans all five child vdevs.  8 KB of data would be written to
1564255750Sdelphij * each of four vdevs, with the fifth containing the parity bits.
1565255750Sdelphij *
1566255750Sdelphij *       parity    data     data     data     data
1567255750Sdelphij *     |   PP   |   XX   |   XX   |   XX   |   XX   |
1568255750Sdelphij *         ^        ^        ^        ^        ^
1569255750Sdelphij *         |        |        |        |        |
1570255750Sdelphij *   8 KB parity    ------8 KB data blocks------
1571255750Sdelphij *
1572255750Sdelphij * However, when writing to the dump device, the behavior is different:
1573255750Sdelphij *
1574255750Sdelphij *     vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
1575255750Sdelphij *
1576255750Sdelphij * Unlike the normal RAID-Z case in which the block is allocated based on the
1577255750Sdelphij * I/O size, reads and writes here always use a 128 KB logical I/O size.  If the
1578255750Sdelphij * I/O size is less than 128 KB, only the actual portions of data are written.
1579255750Sdelphij * In this example the data is written to the third data vdev since that vdev
1580255750Sdelphij * contains the offset [64 KB, 96 KB).
1581255750Sdelphij *
1582255750Sdelphij *       parity    data     data     data     data
1583255750Sdelphij *     |        |        |        |   XX   |        |
1584255750Sdelphij *                                    ^
1585255750Sdelphij *                                    |
1586255750Sdelphij *                             32 KB data block
1587255750Sdelphij *
1588255750Sdelphij * As a result, an individual I/O may not span all child vdevs; moreover, a
1589255750Sdelphij * small I/O may only operate on a single child vdev.
1590255750Sdelphij *
1591255750Sdelphij * Note that since there are no parity bits calculated or written, this format
1592255750Sdelphij * remains the same no matter how many parity bits are used in a normal RAID-Z
1593255750Sdelphij * stripe.  On a RAID-Z3 configuration with seven child vdevs, the example above
1594255750Sdelphij * would look like:
1595255750Sdelphij *
1596255750Sdelphij *       parity   parity   parity    data     data     data     data
1597255750Sdelphij *     |        |        |        |        |        |   XX   |        |
1598255750Sdelphij *                                                      ^
1599255750Sdelphij *                                                      |
1600255750Sdelphij *                                               32 KB data block
1601255750Sdelphij */
1602255750Sdelphijint
1603255750Sdelphijvdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
1604255750Sdelphij    uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
1605255750Sdelphij{
1606255750Sdelphij	vdev_t *tvd = vd->vdev_top;
1607255750Sdelphij	vdev_t *cvd;
1608255750Sdelphij	raidz_map_t *rm;
1609255750Sdelphij	raidz_col_t *rc;
1610255750Sdelphij	int c, err = 0;
1611255750Sdelphij
1612255750Sdelphij	uint64_t start, end, colstart, colend;
1613255750Sdelphij	uint64_t coloffset, colsize, colskip;
1614255750Sdelphij
1615255750Sdelphij	int flags = doread ? BIO_READ : BIO_WRITE;
1616255750Sdelphij
1617255750Sdelphij#ifdef	_KERNEL
1618255750Sdelphij
1619255750Sdelphij	/*
1620255750Sdelphij	 * Don't write past the end of the block
1621255750Sdelphij	 */
1622276081Sdelphij	VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);
1623255750Sdelphij
1624255750Sdelphij	start = offset;
1625255750Sdelphij	end = start + size;
1626255750Sdelphij
1627255750Sdelphij	/*
1628255750Sdelphij	 * Allocate a RAID-Z map for this block.  Note that this block starts
1629255750Sdelphij	 * from the "original" offset, this is, the offset of the extent which
1630255750Sdelphij	 * contains the requisite offset of the data being read or written.
1631255750Sdelphij	 *
1632255750Sdelphij	 * Even if this I/O operation doesn't span the full block size, let's
1633255750Sdelphij	 * treat the on-disk format as if the only blocks are the complete 128
1634255750Sdelphij	 * KB size.
1635255750Sdelphij	 */
1636255750Sdelphij	rm = vdev_raidz_map_alloc(data - (offset - origoffset),
1637276081Sdelphij	    SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift,
1638276081Sdelphij	    vd->vdev_children, vd->vdev_nparity);
1639255750Sdelphij
1640255750Sdelphij	coloffset = origoffset;
1641255750Sdelphij
1642255750Sdelphij	for (c = rm->rm_firstdatacol; c < rm->rm_cols;
1643255750Sdelphij	    c++, coloffset += rc->rc_size) {
1644255750Sdelphij		rc = &rm->rm_col[c];
1645255750Sdelphij		cvd = vd->vdev_child[rc->rc_devidx];
1646255750Sdelphij
1647255750Sdelphij		/*
1648255750Sdelphij		 * Find the start and end of this column in the RAID-Z map,
1649255750Sdelphij		 * keeping in mind that the stated size and offset of the
1650255750Sdelphij		 * operation may not fill the entire column for this vdev.
1651255750Sdelphij		 *
1652255750Sdelphij		 * If any portion of the data spans this column, issue the
1653255750Sdelphij		 * appropriate operation to the vdev.
1654255750Sdelphij		 */
1655255750Sdelphij		if (coloffset + rc->rc_size <= start)
1656255750Sdelphij			continue;
1657255750Sdelphij		if (coloffset >= end)
1658255750Sdelphij			continue;
1659255750Sdelphij
1660255750Sdelphij		colstart = MAX(coloffset, start);
1661255750Sdelphij		colend = MIN(end, coloffset + rc->rc_size);
1662255750Sdelphij		colsize = colend - colstart;
1663255750Sdelphij		colskip = colstart - coloffset;
1664255750Sdelphij
1665255750Sdelphij		VERIFY3U(colsize, <=, rc->rc_size);
1666255750Sdelphij		VERIFY3U(colskip, <=, rc->rc_size);
1667255750Sdelphij
1668255750Sdelphij		/*
1669255750Sdelphij		 * Note that the child vdev will have a vdev label at the start
1670255750Sdelphij		 * of its range of offsets, hence the need for
1671255750Sdelphij		 * VDEV_LABEL_OFFSET().  See zio_vdev_child_io() for another
1672255750Sdelphij		 * example of why this calculation is needed.
1673255750Sdelphij		 */
1674255750Sdelphij		if ((err = vdev_disk_physio(cvd,
1675255750Sdelphij		    ((char *)rc->rc_data) + colskip, colsize,
1676255750Sdelphij		    VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
1677255750Sdelphij		    flags, isdump)) != 0)
1678255750Sdelphij			break;
1679255750Sdelphij	}
1680255750Sdelphij
1681255750Sdelphij	vdev_raidz_map_free(rm);
1682255750Sdelphij#endif	/* KERNEL */
1683255750Sdelphij
1684255750Sdelphij	return (err);
1685255750Sdelphij}
1686255750Sdelphij#endif
1687255750Sdelphij
1688168404Spjdstatic uint64_t
1689168404Spjdvdev_raidz_asize(vdev_t *vd, uint64_t psize)
1690168404Spjd{
1691168404Spjd	uint64_t asize;
1692168404Spjd	uint64_t ashift = vd->vdev_top->vdev_ashift;
1693168404Spjd	uint64_t cols = vd->vdev_children;
1694168404Spjd	uint64_t nparity = vd->vdev_nparity;
1695168404Spjd
1696168404Spjd	asize = ((psize - 1) >> ashift) + 1;
1697168404Spjd	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1698168404Spjd	asize = roundup(asize, nparity + 1) << ashift;
1699168404Spjd
1700168404Spjd	return (asize);
1701168404Spjd}
1702168404Spjd
1703168404Spjdstatic void
1704168404Spjdvdev_raidz_child_done(zio_t *zio)
1705168404Spjd{
1706168404Spjd	raidz_col_t *rc = zio->io_private;
1707168404Spjd
1708168404Spjd	rc->rc_error = zio->io_error;
1709168404Spjd	rc->rc_tried = 1;
1710168404Spjd	rc->rc_skipped = 0;
1711168404Spjd}
1712168404Spjd
1713251629Sdelphij/*
1714251629Sdelphij * Start an IO operation on a RAIDZ VDev
1715251629Sdelphij *
1716251629Sdelphij * Outline:
1717251629Sdelphij * - For write operations:
1718251629Sdelphij *   1. Generate the parity data
1719251629Sdelphij *   2. Create child zio write operations to each column's vdev, for both
1720251629Sdelphij *      data and parity.
1721251629Sdelphij *   3. If the column skips any sectors for padding, create optional dummy
1722251629Sdelphij *      write zio children for those areas to improve aggregation continuity.
1723251629Sdelphij * - For read operations:
1724251629Sdelphij *   1. Create child zio read operations to each data column's vdev to read
1725251629Sdelphij *      the range of data required for zio.
1726251629Sdelphij *   2. If this is a scrub or resilver operation, or if any of the data
1727251629Sdelphij *      vdevs have had errors, then create zio read operations to the parity
1728251629Sdelphij *      columns' VDevs as well.
1729251629Sdelphij */
1730297078Smavstatic void
1731168404Spjdvdev_raidz_io_start(zio_t *zio)
1732168404Spjd{
1733168404Spjd	vdev_t *vd = zio->io_vd;
1734168404Spjd	vdev_t *tvd = vd->vdev_top;
1735168404Spjd	vdev_t *cvd;
1736168404Spjd	raidz_map_t *rm;
1737168404Spjd	raidz_col_t *rc;
1738219089Spjd	int c, i;
1739168404Spjd
1740255750Sdelphij	rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset,
1741255750Sdelphij	    zio->io_type == ZIO_TYPE_FREE,
1742255750Sdelphij	    tvd->vdev_ashift, vd->vdev_children,
1743168404Spjd	    vd->vdev_nparity);
1744168404Spjd
1745255750Sdelphij	zio->io_vsd = rm;
1746255750Sdelphij	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
1747255750Sdelphij
1748168404Spjd	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1749168404Spjd
1750240868Spjd	if (zio->io_type == ZIO_TYPE_FREE) {
1751240868Spjd		for (c = 0; c < rm->rm_cols; c++) {
1752240868Spjd			rc = &rm->rm_col[c];
1753240868Spjd			cvd = vd->vdev_child[rc->rc_devidx];
1754240868Spjd			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1755240868Spjd			    rc->rc_offset, rc->rc_data, rc->rc_size,
1756240868Spjd			    zio->io_type, zio->io_priority, 0,
1757240868Spjd			    vdev_raidz_child_done, rc));
1758240868Spjd		}
1759270312Ssmh
1760297078Smav		zio_execute(zio);
1761297078Smav		return;
1762240868Spjd	}
1763240868Spjd
1764168404Spjd	if (zio->io_type == ZIO_TYPE_WRITE) {
1765219089Spjd		vdev_raidz_generate_parity(rm);
1766168404Spjd
1767168404Spjd		for (c = 0; c < rm->rm_cols; c++) {
1768168404Spjd			rc = &rm->rm_col[c];
1769168404Spjd			cvd = vd->vdev_child[rc->rc_devidx];
1770168404Spjd			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1771168404Spjd			    rc->rc_offset, rc->rc_data, rc->rc_size,
1772185029Spjd			    zio->io_type, zio->io_priority, 0,
1773168404Spjd			    vdev_raidz_child_done, rc));
1774168404Spjd		}
1775185029Spjd
1776219089Spjd		/*
1777219089Spjd		 * Generate optional I/Os for any skipped sectors to improve
1778219089Spjd		 * aggregation contiguity.
1779219089Spjd		 */
1780219089Spjd		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1781219089Spjd			ASSERT(c <= rm->rm_scols);
1782219089Spjd			if (c == rm->rm_scols)
1783219089Spjd				c = 0;
1784219089Spjd			rc = &rm->rm_col[c];
1785219089Spjd			cvd = vd->vdev_child[rc->rc_devidx];
1786219089Spjd			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1787219089Spjd			    rc->rc_offset + rc->rc_size, NULL,
1788219089Spjd			    1 << tvd->vdev_ashift,
1789219089Spjd			    zio->io_type, zio->io_priority,
1790219089Spjd			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1791219089Spjd		}
1792219089Spjd
1793297078Smav		zio_execute(zio);
1794297078Smav		return;
1795168404Spjd	}
1796168404Spjd
1797168404Spjd	ASSERT(zio->io_type == ZIO_TYPE_READ);
1798168404Spjd
1799168404Spjd	/*
1800168404Spjd	 * Iterate over the columns in reverse order so that we hit the parity
1801219089Spjd	 * last -- any errors along the way will force us to read the parity.
1802168404Spjd	 */
1803168404Spjd	for (c = rm->rm_cols - 1; c >= 0; c--) {
1804168404Spjd		rc = &rm->rm_col[c];
1805168404Spjd		cvd = vd->vdev_child[rc->rc_devidx];
1806185029Spjd		if (!vdev_readable(cvd)) {
1807168404Spjd			if (c >= rm->rm_firstdatacol)
1808168404Spjd				rm->rm_missingdata++;
1809168404Spjd			else
1810168404Spjd				rm->rm_missingparity++;
1811249195Smm			rc->rc_error = SET_ERROR(ENXIO);
1812168404Spjd			rc->rc_tried = 1;	/* don't even try */
1813168404Spjd			rc->rc_skipped = 1;
1814168404Spjd			continue;
1815168404Spjd		}
1816219089Spjd		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1817168404Spjd			if (c >= rm->rm_firstdatacol)
1818168404Spjd				rm->rm_missingdata++;
1819168404Spjd			else
1820168404Spjd				rm->rm_missingparity++;
1821249195Smm			rc->rc_error = SET_ERROR(ESTALE);
1822168404Spjd			rc->rc_skipped = 1;
1823168404Spjd			continue;
1824168404Spjd		}
1825168404Spjd		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
1826209095Smm		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1827168404Spjd			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1828168404Spjd			    rc->rc_offset, rc->rc_data, rc->rc_size,
1829185029Spjd			    zio->io_type, zio->io_priority, 0,
1830168404Spjd			    vdev_raidz_child_done, rc));
1831168404Spjd		}
1832168404Spjd	}
1833168404Spjd
1834297078Smav	zio_execute(zio);
1835168404Spjd}
1836168404Spjd
1837219089Spjd
1838168404Spjd/*
1839168404Spjd * Report a checksum error for a child of a RAID-Z device.
1840168404Spjd */
1841168404Spjdstatic void
1842219089Spjdraidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1843168404Spjd{
1844168404Spjd	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1845168404Spjd
1846168404Spjd	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1847219089Spjd		zio_bad_cksum_t zbc;
1848219089Spjd		raidz_map_t *rm = zio->io_vsd;
1849219089Spjd
1850168404Spjd		mutex_enter(&vd->vdev_stat_lock);
1851168404Spjd		vd->vdev_stat.vs_checksum_errors++;
1852168404Spjd		mutex_exit(&vd->vdev_stat_lock);
1853219089Spjd
1854219089Spjd		zbc.zbc_has_cksum = 0;
1855219089Spjd		zbc.zbc_injected = rm->rm_ecksuminjected;
1856219089Spjd
1857219089Spjd		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1858219089Spjd		    rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1859219089Spjd		    &zbc);
1860168404Spjd	}
1861219089Spjd}
1862168404Spjd
1863219089Spjd/*
1864219089Spjd * We keep track of whether or not there were any injected errors, so that
1865219089Spjd * any ereports we generate can note it.
1866219089Spjd */
1867219089Spjdstatic int
1868219089Spjdraidz_checksum_verify(zio_t *zio)
1869219089Spjd{
1870219089Spjd	zio_bad_cksum_t zbc;
1871219089Spjd	raidz_map_t *rm = zio->io_vsd;
1872219089Spjd
1873219089Spjd	int ret = zio_checksum_error(zio, &zbc);
1874219089Spjd	if (ret != 0 && zbc.zbc_injected != 0)
1875219089Spjd		rm->rm_ecksuminjected = 1;
1876219089Spjd
1877219089Spjd	return (ret);
1878168404Spjd}
1879168404Spjd
1880168404Spjd/*
1881168404Spjd * Generate the parity from the data columns. If we tried and were able to
1882168404Spjd * read the parity without error, verify that the generated parity matches the
1883168404Spjd * data we read. If it doesn't, we fire off a checksum error. Return the
1884168404Spjd * number such failures.
1885168404Spjd */
1886168404Spjdstatic int
1887168404Spjdraidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1888168404Spjd{
1889168404Spjd	void *orig[VDEV_RAIDZ_MAXPARITY];
1890168404Spjd	int c, ret = 0;
1891168404Spjd	raidz_col_t *rc;
1892168404Spjd
1893255750Sdelphij	blkptr_t *bp = zio->io_bp;
1894255750Sdelphij	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
1895255750Sdelphij	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
1896255750Sdelphij
1897255750Sdelphij	if (checksum == ZIO_CHECKSUM_NOPARITY)
1898255750Sdelphij		return (ret);
1899255750Sdelphij
1900168404Spjd	for (c = 0; c < rm->rm_firstdatacol; c++) {
1901168404Spjd		rc = &rm->rm_col[c];
1902168404Spjd		if (!rc->rc_tried || rc->rc_error != 0)
1903168404Spjd			continue;
1904168404Spjd		orig[c] = zio_buf_alloc(rc->rc_size);
1905168404Spjd		bcopy(rc->rc_data, orig[c], rc->rc_size);
1906168404Spjd	}
1907168404Spjd
1908219089Spjd	vdev_raidz_generate_parity(rm);
1909168404Spjd
1910168404Spjd	for (c = 0; c < rm->rm_firstdatacol; c++) {
1911168404Spjd		rc = &rm->rm_col[c];
1912168404Spjd		if (!rc->rc_tried || rc->rc_error != 0)
1913168404Spjd			continue;
1914168404Spjd		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1915219089Spjd			raidz_checksum_error(zio, rc, orig[c]);
1916249195Smm			rc->rc_error = SET_ERROR(ECKSUM);
1917168404Spjd			ret++;
1918168404Spjd		}
1919168404Spjd		zio_buf_free(orig[c], rc->rc_size);
1920168404Spjd	}
1921168404Spjd
1922168404Spjd	return (ret);
1923168404Spjd}
1924168404Spjd
1925219089Spjd/*
1926219089Spjd * Keep statistics on all the ways that we used parity to correct data.
1927219089Spjd */
1928219089Spjdstatic uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1929168404Spjd
1930185029Spjdstatic int
1931185029Spjdvdev_raidz_worst_error(raidz_map_t *rm)
1932185029Spjd{
1933185029Spjd	int error = 0;
1934185029Spjd
1935185029Spjd	for (int c = 0; c < rm->rm_cols; c++)
1936185029Spjd		error = zio_worst_error(error, rm->rm_col[c].rc_error);
1937185029Spjd
1938185029Spjd	return (error);
1939185029Spjd}
1940185029Spjd
1941219089Spjd/*
1942219089Spjd * Iterate over all combinations of bad data and attempt a reconstruction.
1943219089Spjd * Note that the algorithm below is non-optimal because it doesn't take into
1944219089Spjd * account how reconstruction is actually performed. For example, with
1945219089Spjd * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1946219089Spjd * is targeted as invalid as if columns 1 and 4 are targeted since in both
1947219089Spjd * cases we'd only use parity information in column 0.
1948219089Spjd */
1949219089Spjdstatic int
1950219089Spjdvdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1951219089Spjd{
1952219089Spjd	raidz_map_t *rm = zio->io_vsd;
1953219089Spjd	raidz_col_t *rc;
1954219089Spjd	void *orig[VDEV_RAIDZ_MAXPARITY];
1955219089Spjd	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1956219089Spjd	int *tgts = &tstore[1];
1957219089Spjd	int current, next, i, c, n;
1958219089Spjd	int code, ret = 0;
1959219089Spjd
1960219089Spjd	ASSERT(total_errors < rm->rm_firstdatacol);
1961219089Spjd
1962219089Spjd	/*
1963219089Spjd	 * This simplifies one edge condition.
1964219089Spjd	 */
1965219089Spjd	tgts[-1] = -1;
1966219089Spjd
1967219089Spjd	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1968219089Spjd		/*
1969219089Spjd		 * Initialize the targets array by finding the first n columns
1970219089Spjd		 * that contain no error.
1971219089Spjd		 *
1972219089Spjd		 * If there were no data errors, we need to ensure that we're
1973219089Spjd		 * always explicitly attempting to reconstruct at least one
1974219089Spjd		 * data column. To do this, we simply push the highest target
1975219089Spjd		 * up into the data columns.
1976219089Spjd		 */
1977219089Spjd		for (c = 0, i = 0; i < n; i++) {
1978219089Spjd			if (i == n - 1 && data_errors == 0 &&
1979219089Spjd			    c < rm->rm_firstdatacol) {
1980219089Spjd				c = rm->rm_firstdatacol;
1981219089Spjd			}
1982219089Spjd
1983219089Spjd			while (rm->rm_col[c].rc_error != 0) {
1984219089Spjd				c++;
1985219089Spjd				ASSERT3S(c, <, rm->rm_cols);
1986219089Spjd			}
1987219089Spjd
1988219089Spjd			tgts[i] = c++;
1989219089Spjd		}
1990219089Spjd
1991219089Spjd		/*
1992219089Spjd		 * Setting tgts[n] simplifies the other edge condition.
1993219089Spjd		 */
1994219089Spjd		tgts[n] = rm->rm_cols;
1995219089Spjd
1996219089Spjd		/*
1997219089Spjd		 * These buffers were allocated in previous iterations.
1998219089Spjd		 */
1999219089Spjd		for (i = 0; i < n - 1; i++) {
2000219089Spjd			ASSERT(orig[i] != NULL);
2001219089Spjd		}
2002219089Spjd
2003219089Spjd		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
2004219089Spjd
2005219089Spjd		current = 0;
2006219089Spjd		next = tgts[current];
2007219089Spjd
2008219089Spjd		while (current != n) {
2009219089Spjd			tgts[current] = next;
2010219089Spjd			current = 0;
2011219089Spjd
2012219089Spjd			/*
2013219089Spjd			 * Save off the original data that we're going to
2014219089Spjd			 * attempt to reconstruct.
2015219089Spjd			 */
2016219089Spjd			for (i = 0; i < n; i++) {
2017219089Spjd				ASSERT(orig[i] != NULL);
2018219089Spjd				c = tgts[i];
2019219089Spjd				ASSERT3S(c, >=, 0);
2020219089Spjd				ASSERT3S(c, <, rm->rm_cols);
2021219089Spjd				rc = &rm->rm_col[c];
2022219089Spjd				bcopy(rc->rc_data, orig[i], rc->rc_size);
2023219089Spjd			}
2024219089Spjd
2025219089Spjd			/*
2026219089Spjd			 * Attempt a reconstruction and exit the outer loop on
2027219089Spjd			 * success.
2028219089Spjd			 */
2029219089Spjd			code = vdev_raidz_reconstruct(rm, tgts, n);
2030219089Spjd			if (raidz_checksum_verify(zio) == 0) {
2031219089Spjd				atomic_inc_64(&raidz_corrected[code]);
2032219089Spjd
2033219089Spjd				for (i = 0; i < n; i++) {
2034219089Spjd					c = tgts[i];
2035219089Spjd					rc = &rm->rm_col[c];
2036219089Spjd					ASSERT(rc->rc_error == 0);
2037219089Spjd					if (rc->rc_tried)
2038219089Spjd						raidz_checksum_error(zio, rc,
2039219089Spjd						    orig[i]);
2040249195Smm					rc->rc_error = SET_ERROR(ECKSUM);
2041219089Spjd				}
2042219089Spjd
2043219089Spjd				ret = code;
2044219089Spjd				goto done;
2045219089Spjd			}
2046219089Spjd
2047219089Spjd			/*
2048219089Spjd			 * Restore the original data.
2049219089Spjd			 */
2050219089Spjd			for (i = 0; i < n; i++) {
2051219089Spjd				c = tgts[i];
2052219089Spjd				rc = &rm->rm_col[c];
2053219089Spjd				bcopy(orig[i], rc->rc_data, rc->rc_size);
2054219089Spjd			}
2055219089Spjd
2056219089Spjd			do {
2057219089Spjd				/*
2058219089Spjd				 * Find the next valid column after the current
2059219089Spjd				 * position..
2060219089Spjd				 */
2061219089Spjd				for (next = tgts[current] + 1;
2062219089Spjd				    next < rm->rm_cols &&
2063219089Spjd				    rm->rm_col[next].rc_error != 0; next++)
2064219089Spjd					continue;
2065219089Spjd
2066219089Spjd				ASSERT(next <= tgts[current + 1]);
2067219089Spjd
2068219089Spjd				/*
2069219089Spjd				 * If that spot is available, we're done here.
2070219089Spjd				 */
2071219089Spjd				if (next != tgts[current + 1])
2072219089Spjd					break;
2073219089Spjd
2074219089Spjd				/*
2075219089Spjd				 * Otherwise, find the next valid column after
2076219089Spjd				 * the previous position.
2077219089Spjd				 */
2078219089Spjd				for (c = tgts[current - 1] + 1;
2079219089Spjd				    rm->rm_col[c].rc_error != 0; c++)
2080219089Spjd					continue;
2081219089Spjd
2082219089Spjd				tgts[current] = c;
2083219089Spjd				current++;
2084219089Spjd
2085219089Spjd			} while (current != n);
2086219089Spjd		}
2087219089Spjd	}
2088219089Spjd	n--;
2089219089Spjddone:
2090219089Spjd	for (i = 0; i < n; i++) {
2091219089Spjd		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
2092219089Spjd	}
2093219089Spjd
2094219089Spjd	return (ret);
2095219089Spjd}
2096219089Spjd
2097251629Sdelphij/*
2098251629Sdelphij * Complete an IO operation on a RAIDZ VDev
2099251629Sdelphij *
2100251629Sdelphij * Outline:
2101251629Sdelphij * - For write operations:
2102251629Sdelphij *   1. Check for errors on the child IOs.
2103251629Sdelphij *   2. Return, setting an error code if too few child VDevs were written
2104251629Sdelphij *      to reconstruct the data later.  Note that partial writes are
2105251629Sdelphij *      considered successful if they can be reconstructed at all.
2106251629Sdelphij * - For read operations:
2107251629Sdelphij *   1. Check for errors on the child IOs.
2108251629Sdelphij *   2. If data errors occurred:
2109251629Sdelphij *      a. Try to reassemble the data from the parity available.
2110251629Sdelphij *      b. If we haven't yet read the parity drives, read them now.
2111251629Sdelphij *      c. If all parity drives have been read but the data still doesn't
2112251629Sdelphij *         reassemble with a correct checksum, then try combinatorial
2113251629Sdelphij *         reconstruction.
2114251629Sdelphij *      d. If that doesn't work, return an error.
2115251629Sdelphij *   3. If there were unexpected errors or this is a resilver operation,
2116251629Sdelphij *      rewrite the vdevs that had errors.
2117251629Sdelphij */
2118168404Spjdstatic void
2119168404Spjdvdev_raidz_io_done(zio_t *zio)
2120168404Spjd{
2121168404Spjd	vdev_t *vd = zio->io_vd;
2122168404Spjd	vdev_t *cvd;
2123168404Spjd	raidz_map_t *rm = zio->io_vsd;
2124219089Spjd	raidz_col_t *rc;
2125168404Spjd	int unexpected_errors = 0;
2126168404Spjd	int parity_errors = 0;
2127168404Spjd	int parity_untried = 0;
2128168404Spjd	int data_errors = 0;
2129185029Spjd	int total_errors = 0;
2130219089Spjd	int n, c;
2131219089Spjd	int tgts[VDEV_RAIDZ_MAXPARITY];
2132219089Spjd	int code;
2133168404Spjd
2134168404Spjd	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
2135168404Spjd
2136168404Spjd	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
2137168404Spjd	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
2138168404Spjd
2139168404Spjd	for (c = 0; c < rm->rm_cols; c++) {
2140168404Spjd		rc = &rm->rm_col[c];
2141168404Spjd
2142168404Spjd		if (rc->rc_error) {
2143185029Spjd			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
2144168404Spjd
2145168404Spjd			if (c < rm->rm_firstdatacol)
2146168404Spjd				parity_errors++;
2147168404Spjd			else
2148168404Spjd				data_errors++;
2149168404Spjd
2150168404Spjd			if (!rc->rc_skipped)
2151168404Spjd				unexpected_errors++;
2152168404Spjd
2153185029Spjd			total_errors++;
2154168404Spjd		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
2155168404Spjd			parity_untried++;
2156168404Spjd		}
2157168404Spjd	}
2158168404Spjd
2159168404Spjd	if (zio->io_type == ZIO_TYPE_WRITE) {
2160168404Spjd		/*
2161185029Spjd		 * XXX -- for now, treat partial writes as a success.
2162185029Spjd		 * (If we couldn't write enough columns to reconstruct
2163185029Spjd		 * the data, the I/O failed.  Otherwise, good enough.)
2164185029Spjd		 *
2165185029Spjd		 * Now that we support write reallocation, it would be better
2166185029Spjd		 * to treat partial failure as real failure unless there are
2167185029Spjd		 * no non-degraded top-level vdevs left, and not update DTLs
2168185029Spjd		 * if we intend to reallocate.
2169168404Spjd		 */
2170168404Spjd		/* XXPOLICY */
2171185029Spjd		if (total_errors > rm->rm_firstdatacol)
2172185029Spjd			zio->io_error = vdev_raidz_worst_error(rm);
2173168404Spjd
2174168404Spjd		return;
2175240868Spjd	} else if (zio->io_type == ZIO_TYPE_FREE) {
2176240868Spjd		return;
2177168404Spjd	}
2178168404Spjd
2179168404Spjd	ASSERT(zio->io_type == ZIO_TYPE_READ);
2180168404Spjd	/*
2181168404Spjd	 * There are three potential phases for a read:
2182168404Spjd	 *	1. produce valid data from the columns read
2183168404Spjd	 *	2. read all disks and try again
2184168404Spjd	 *	3. perform combinatorial reconstruction
2185168404Spjd	 *
2186168404Spjd	 * Each phase is progressively both more expensive and less likely to
2187168404Spjd	 * occur. If we encounter more errors than we can repair or all phases
2188168404Spjd	 * fail, we have no choice but to return an error.
2189168404Spjd	 */
2190168404Spjd
2191168404Spjd	/*
2192168404Spjd	 * If the number of errors we saw was correctable -- less than or equal
2193168404Spjd	 * to the number of parity disks read -- attempt to produce data that
2194168404Spjd	 * has a valid checksum. Naturally, this case applies in the absence of
2195168404Spjd	 * any errors.
2196168404Spjd	 */
2197185029Spjd	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
2198219089Spjd		if (data_errors == 0) {
2199219089Spjd			if (raidz_checksum_verify(zio) == 0) {
2200168738Spjd				/*
2201168738Spjd				 * If we read parity information (unnecessarily
2202168738Spjd				 * as it happens since no reconstruction was
2203168738Spjd				 * needed) regenerate and verify the parity.
2204168738Spjd				 * We also regenerate parity when resilvering
2205168738Spjd				 * so we can write it out to the failed device
2206168738Spjd				 * later.
2207168738Spjd				 */
2208168404Spjd				if (parity_errors + parity_untried <
2209168738Spjd				    rm->rm_firstdatacol ||
2210168738Spjd				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
2211168404Spjd					n = raidz_parity_verify(zio, rm);
2212168404Spjd					unexpected_errors += n;
2213168404Spjd					ASSERT(parity_errors + n <=
2214168404Spjd					    rm->rm_firstdatacol);
2215168404Spjd				}
2216168404Spjd				goto done;
2217168404Spjd			}
2218219089Spjd		} else {
2219168404Spjd			/*
2220168404Spjd			 * We either attempt to read all the parity columns or
2221168404Spjd			 * none of them. If we didn't try to read parity, we
2222168404Spjd			 * wouldn't be here in the correctable case. There must
2223168404Spjd			 * also have been fewer parity errors than parity
2224168404Spjd			 * columns or, again, we wouldn't be in this code path.
2225168404Spjd			 */
2226168404Spjd			ASSERT(parity_untried == 0);
2227168404Spjd			ASSERT(parity_errors < rm->rm_firstdatacol);
2228168404Spjd
2229168404Spjd			/*
2230219089Spjd			 * Identify the data columns that reported an error.
2231168404Spjd			 */
2232219089Spjd			n = 0;
2233168404Spjd			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
2234168404Spjd				rc = &rm->rm_col[c];
2235219089Spjd				if (rc->rc_error != 0) {
2236219089Spjd					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2237219089Spjd					tgts[n++] = c;
2238219089Spjd				}
2239168404Spjd			}
2240168404Spjd
2241219089Spjd			ASSERT(rm->rm_firstdatacol >= n);
2242168404Spjd
2243219089Spjd			code = vdev_raidz_reconstruct(rm, tgts, n);
2244168404Spjd
2245219089Spjd			if (raidz_checksum_verify(zio) == 0) {
2246219089Spjd				atomic_inc_64(&raidz_corrected[code]);
2247219089Spjd
2248168404Spjd				/*
2249219089Spjd				 * If we read more parity disks than were used
2250219089Spjd				 * for reconstruction, confirm that the other
2251219089Spjd				 * parity disks produced correct data. This
2252219089Spjd				 * routine is suboptimal in that it regenerates
2253219089Spjd				 * the parity that we already used in addition
2254219089Spjd				 * to the parity that we're attempting to
2255219089Spjd				 * verify, but this should be a relatively
2256219089Spjd				 * uncommon case, and can be optimized if it
2257219089Spjd				 * becomes a problem. Note that we regenerate
2258219089Spjd				 * parity when resilvering so we can write it
2259219089Spjd				 * out to failed devices later.
2260168404Spjd				 */
2261219089Spjd				if (parity_errors < rm->rm_firstdatacol - n ||
2262168738Spjd				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
2263168404Spjd					n = raidz_parity_verify(zio, rm);
2264168404Spjd					unexpected_errors += n;
2265168404Spjd					ASSERT(parity_errors + n <=
2266168404Spjd					    rm->rm_firstdatacol);
2267168404Spjd				}
2268168404Spjd
2269168404Spjd				goto done;
2270168404Spjd			}
2271168404Spjd		}
2272168404Spjd	}
2273168404Spjd
2274168404Spjd	/*
2275168404Spjd	 * This isn't a typical situation -- either we got a read error or
2276168404Spjd	 * a child silently returned bad data. Read every block so we can
2277168404Spjd	 * try again with as much data and parity as we can track down. If
2278168404Spjd	 * we've already been through once before, all children will be marked
2279168404Spjd	 * as tried so we'll proceed to combinatorial reconstruction.
2280168404Spjd	 */
2281168404Spjd	unexpected_errors = 1;
2282168404Spjd	rm->rm_missingdata = 0;
2283168404Spjd	rm->rm_missingparity = 0;
2284168404Spjd
2285168404Spjd	for (c = 0; c < rm->rm_cols; c++) {
2286168404Spjd		if (rm->rm_col[c].rc_tried)
2287168404Spjd			continue;
2288168404Spjd
2289168404Spjd		zio_vdev_io_redone(zio);
2290168404Spjd		do {
2291168404Spjd			rc = &rm->rm_col[c];
2292168404Spjd			if (rc->rc_tried)
2293168404Spjd				continue;
2294168404Spjd			zio_nowait(zio_vdev_child_io(zio, NULL,
2295168404Spjd			    vd->vdev_child[rc->rc_devidx],
2296168404Spjd			    rc->rc_offset, rc->rc_data, rc->rc_size,
2297185029Spjd			    zio->io_type, zio->io_priority, 0,
2298168404Spjd			    vdev_raidz_child_done, rc));
2299168404Spjd		} while (++c < rm->rm_cols);
2300185029Spjd
2301168404Spjd		return;
2302168404Spjd	}
2303168404Spjd
2304168404Spjd	/*
2305168404Spjd	 * At this point we've attempted to reconstruct the data given the
2306168404Spjd	 * errors we detected, and we've attempted to read all columns. There
2307168404Spjd	 * must, therefore, be one or more additional problems -- silent errors
2308168404Spjd	 * resulting in invalid data rather than explicit I/O errors resulting
2309219089Spjd	 * in absent data. We check if there is enough additional data to
2310219089Spjd	 * possibly reconstruct the data and then perform combinatorial
2311219089Spjd	 * reconstruction over all possible combinations. If that fails,
2312219089Spjd	 * we're cooked.
2313168404Spjd	 */
2314219089Spjd	if (total_errors > rm->rm_firstdatacol) {
2315185029Spjd		zio->io_error = vdev_raidz_worst_error(rm);
2316168404Spjd
2317219089Spjd	} else if (total_errors < rm->rm_firstdatacol &&
2318219089Spjd	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2319168404Spjd		/*
2320219089Spjd		 * If we didn't use all the available parity for the
2321219089Spjd		 * combinatorial reconstruction, verify that the remaining
2322219089Spjd		 * parity is correct.
2323168404Spjd		 */
2324219089Spjd		if (code != (1 << rm->rm_firstdatacol) - 1)
2325219089Spjd			(void) raidz_parity_verify(zio, rm);
2326219089Spjd	} else {
2327168404Spjd		/*
2328219089Spjd		 * We're here because either:
2329219089Spjd		 *
2330219089Spjd		 *	total_errors == rm_first_datacol, or
2331219089Spjd		 *	vdev_raidz_combrec() failed
2332219089Spjd		 *
2333219089Spjd		 * In either case, there is enough bad data to prevent
2334219089Spjd		 * reconstruction.
2335219089Spjd		 *
2336219089Spjd		 * Start checksum ereports for all children which haven't
2337219089Spjd		 * failed, and the IO wasn't speculative.
2338168404Spjd		 */
2339249195Smm		zio->io_error = SET_ERROR(ECKSUM);
2340168404Spjd
2341219089Spjd		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2342219089Spjd			for (c = 0; c < rm->rm_cols; c++) {
2343219089Spjd				rc = &rm->rm_col[c];
2344219089Spjd				if (rc->rc_error == 0) {
2345219089Spjd					zio_bad_cksum_t zbc;
2346219089Spjd					zbc.zbc_has_cksum = 0;
2347219089Spjd					zbc.zbc_injected =
2348219089Spjd					    rm->rm_ecksuminjected;
2349168404Spjd
2350219089Spjd					zfs_ereport_start_checksum(
2351219089Spjd					    zio->io_spa,
2352219089Spjd					    vd->vdev_child[rc->rc_devidx],
2353219089Spjd					    zio, rc->rc_offset, rc->rc_size,
2354219089Spjd					    (void *)(uintptr_t)c, &zbc);
2355168404Spjd				}
2356168404Spjd			}
2357168404Spjd		}
2358168404Spjd	}
2359168404Spjd
2360168404Spjddone:
2361168404Spjd	zio_checksum_verified(zio);
2362168404Spjd
2363209962Smm	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2364168404Spjd	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2365168404Spjd		/*
2366168404Spjd		 * Use the good data we have in hand to repair damaged children.
2367168404Spjd		 */
2368168404Spjd		for (c = 0; c < rm->rm_cols; c++) {
2369168404Spjd			rc = &rm->rm_col[c];
2370168404Spjd			cvd = vd->vdev_child[rc->rc_devidx];
2371168404Spjd
2372168404Spjd			if (rc->rc_error == 0)
2373168404Spjd				continue;
2374168404Spjd
2375185029Spjd			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2376168404Spjd			    rc->rc_offset, rc->rc_data, rc->rc_size,
2377260763Savg			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2378209962Smm			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2379209962Smm			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2380168404Spjd		}
2381168404Spjd	}
2382168404Spjd}
2383168404Spjd
2384168404Spjdstatic void
2385168404Spjdvdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2386168404Spjd{
2387168404Spjd	if (faulted > vd->vdev_nparity)
2388168404Spjd		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2389168404Spjd		    VDEV_AUX_NO_REPLICAS);
2390168404Spjd	else if (degraded + faulted != 0)
2391168404Spjd		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2392168404Spjd	else
2393168404Spjd		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2394168404Spjd}
2395168404Spjd
2396168404Spjdvdev_ops_t vdev_raidz_ops = {
2397168404Spjd	vdev_raidz_open,
2398168404Spjd	vdev_raidz_close,
2399168404Spjd	vdev_raidz_asize,
2400168404Spjd	vdev_raidz_io_start,
2401168404Spjd	vdev_raidz_io_done,
2402168404Spjd	vdev_raidz_state_change,
2403219089Spjd	NULL,
2404219089Spjd	NULL,
2405168404Spjd	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
2406168404Spjd	B_FALSE			/* not a leaf vdev */
2407168404Spjd};
2408