1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (C) 2016 Gvozden Ne��kovi��. All rights reserved.
24 */
25
26#include <sys/vdev_raidz_impl.h>
27
28/*
29 * Provide native CPU scalar routines.
30 * Support 32bit and 64bit CPUs.
31 */
32#if ((~(0x0ULL)) >> 24) == 0xffULL
33#define	ELEM_SIZE	4
34typedef uint32_t iv_t;
35#elif ((~(0x0ULL)) >> 56) == 0xffULL
36#define	ELEM_SIZE	8
37typedef uint64_t iv_t;
38#endif
39
40/*
41 * Vector type used in scalar implementation
42 *
43 * The union is expected to be of native CPU register size. Since addition
44 * uses XOR operation, it can be performed an all byte elements at once.
45 * Multiplication requires per byte access.
46 */
47typedef union {
48	iv_t e;
49	uint8_t b[ELEM_SIZE];
50} v_t;
51
52/*
53 * Precomputed lookup tables for multiplication by a constant
54 *
55 * Reconstruction path requires multiplication by a constant factors. Instead of
56 * performing two step lookup (log & exp tables), a direct lookup can be used
57 * instead. Multiplication of element 'a' by a constant 'c' is obtained as:
58 *
59 * 	r = vdev_raidz_mul_lt[c_log][a];
60 *
61 * where c_log = vdev_raidz_log2[c]. Log of coefficient factors is used because
62 * they are faster to obtain while solving the syndrome equations.
63 *
64 * PERFORMANCE NOTE:
65 * Even though the complete lookup table uses 64kiB, only relatively small
66 * portion of it is used at the same time. Following shows number of accessed
67 * bytes for different cases:
68 * 	- 1 failed disk: 256B (1 mul. coefficient)
69 * 	- 2 failed disks: 512B (2 mul. coefficients)
70 * 	- 3 failed disks: 1536B (6 mul. coefficients)
71 *
72 * Size of actually accessed lookup table regions is only larger for
73 * reconstruction of 3 failed disks, when compared to traditional log/exp
74 * method. But since the result is obtained in one lookup step performance is
75 * doubled.
76 */
77static uint8_t vdev_raidz_mul_lt[256][256] __attribute__((aligned(256)));
78
79static void
80raidz_init_scalar(void)
81{
82	int c, i;
83	for (c = 0; c < 256; c++)
84		for (i = 0; i < 256; i++)
85			vdev_raidz_mul_lt[c][i] = gf_mul(c, i);
86
87}
88
89#define	PREFETCHNTA(ptr, offset)	{}
90#define	PREFETCH(ptr, offset) 		{}
91
92#define	XOR_ACC(src, acc)	acc.e ^= ((v_t *)src)[0].e
93#define	XOR(src, acc)		acc.e ^= src.e
94#define	ZERO(acc)		acc.e = 0
95#define	COPY(src, dst)		dst = src
96#define	LOAD(src, val) 		val = ((v_t *)src)[0]
97#define	STORE(dst, val)		((v_t *)dst)[0] = val
98
99/*
100 * Constants used for optimized multiplication by 2.
101 */
102static const struct {
103	iv_t mod;
104	iv_t mask;
105	iv_t msb;
106} scalar_mul2_consts = {
107#if ELEM_SIZE == 8
108	.mod	= 0x1d1d1d1d1d1d1d1dULL,
109	.mask	= 0xfefefefefefefefeULL,
110	.msb	= 0x8080808080808080ULL,
111#else
112	.mod	= 0x1d1d1d1dULL,
113	.mask	= 0xfefefefeULL,
114	.msb	= 0x80808080ULL,
115#endif
116};
117
118#define	MUL2_SETUP() {}
119
120#define	MUL2(a)								\
121{									\
122	iv_t _mask;							\
123									\
124	_mask = (a).e & scalar_mul2_consts.msb;				\
125	_mask = (_mask << 1) - (_mask >> 7);				\
126	(a).e = ((a).e << 1) & scalar_mul2_consts.mask;			\
127	(a).e = (a).e ^ (_mask & scalar_mul2_consts.mod);		\
128}
129
130#define	MUL4(a) 							\
131{									\
132	MUL2(a);							\
133	MUL2(a);							\
134}
135
136#define	MUL(c, a)							\
137{									\
138	const uint8_t *mul_lt = vdev_raidz_mul_lt[c];			\
139	switch (ELEM_SIZE) {						\
140	case 8:								\
141		a.b[7] = mul_lt[a.b[7]];				\
142		a.b[6] = mul_lt[a.b[6]];				\
143		a.b[5] = mul_lt[a.b[5]];				\
144		a.b[4] = mul_lt[a.b[4]];				\
145		zfs_fallthrough;					\
146	case 4:								\
147		a.b[3] = mul_lt[a.b[3]];				\
148		a.b[2] = mul_lt[a.b[2]];				\
149		a.b[1] = mul_lt[a.b[1]];				\
150		a.b[0] = mul_lt[a.b[0]];				\
151		break;							\
152	}								\
153}
154
155#define	raidz_math_begin()	{}
156#define	raidz_math_end()	{}
157
158#define	SYN_STRIDE		1
159
160#define	ZERO_DEFINE()		v_t d0
161#define	ZERO_STRIDE		1
162#define	ZERO_D			d0
163
164#define	COPY_DEFINE()		v_t d0
165#define	COPY_STRIDE		1
166#define	COPY_D			d0
167
168#define	ADD_DEFINE()		v_t d0
169#define	ADD_STRIDE		1
170#define	ADD_D			d0
171
172#define	MUL_DEFINE()		v_t d0
173#define	MUL_STRIDE		1
174#define	MUL_D			d0
175
176#define	GEN_P_STRIDE		1
177#define	GEN_P_DEFINE()		v_t p0
178#define	GEN_P_P			p0
179
180#define	GEN_PQ_STRIDE		1
181#define	GEN_PQ_DEFINE()		v_t d0, c0
182#define	GEN_PQ_D		d0
183#define	GEN_PQ_C		c0
184
185#define	GEN_PQR_STRIDE		1
186#define	GEN_PQR_DEFINE()	v_t d0, c0
187#define	GEN_PQR_D		d0
188#define	GEN_PQR_C		c0
189
190#define	SYN_Q_DEFINE()		v_t d0, x0
191#define	SYN_Q_D			d0
192#define	SYN_Q_X			x0
193
194
195#define	SYN_R_DEFINE()		v_t d0, x0
196#define	SYN_R_D			d0
197#define	SYN_R_X			x0
198
199
200#define	SYN_PQ_DEFINE()		v_t d0, x0
201#define	SYN_PQ_D		d0
202#define	SYN_PQ_X		x0
203
204
205#define	REC_PQ_STRIDE		1
206#define	REC_PQ_DEFINE()		v_t x0, y0, t0
207#define	REC_PQ_X		x0
208#define	REC_PQ_Y		y0
209#define	REC_PQ_T		t0
210
211
212#define	SYN_PR_DEFINE()		v_t d0, x0
213#define	SYN_PR_D		d0
214#define	SYN_PR_X		x0
215
216#define	REC_PR_STRIDE		1
217#define	REC_PR_DEFINE()		v_t x0, y0, t0
218#define	REC_PR_X		x0
219#define	REC_PR_Y		y0
220#define	REC_PR_T		t0
221
222
223#define	SYN_QR_DEFINE()		v_t d0, x0
224#define	SYN_QR_D		d0
225#define	SYN_QR_X		x0
226
227
228#define	REC_QR_STRIDE		1
229#define	REC_QR_DEFINE()		v_t x0, y0, t0
230#define	REC_QR_X		x0
231#define	REC_QR_Y		y0
232#define	REC_QR_T		t0
233
234
235#define	SYN_PQR_DEFINE()	v_t d0, x0
236#define	SYN_PQR_D		d0
237#define	SYN_PQR_X		x0
238
239#define	REC_PQR_STRIDE		1
240#define	REC_PQR_DEFINE()	v_t x0, y0, z0, xs0, ys0
241#define	REC_PQR_X		x0
242#define	REC_PQR_Y		y0
243#define	REC_PQR_Z		z0
244#define	REC_PQR_XS		xs0
245#define	REC_PQR_YS		ys0
246
247#include "vdev_raidz_math_impl.h"
248
249DEFINE_GEN_METHODS(scalar);
250DEFINE_REC_METHODS(scalar);
251
252boolean_t
253raidz_will_scalar_work(void)
254{
255	return (B_TRUE); /* always */
256}
257
258const raidz_impl_ops_t vdev_raidz_scalar_impl = {
259	.init = raidz_init_scalar,
260	.fini = NULL,
261	.gen = RAIDZ_GEN_METHODS(scalar),
262	.rec = RAIDZ_REC_METHODS(scalar),
263	.is_supported = &raidz_will_scalar_work,
264	.name = "scalar"
265};
266
267/* Powers of 2 in the RAID-Z Galois field. */
268const uint8_t vdev_raidz_pow2[256] __attribute__((aligned(256))) = {
269	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
270	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
271	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
272	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
273	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
274	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
275	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
276	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
277	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
278	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
279	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
280	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
281	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
282	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
283	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
284	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
285	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
286	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
287	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
288	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
289	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
290	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
291	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
292	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
293	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
294	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
295	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
296	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
297	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
298	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
299	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
300	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
301};
302
303/* Logs of 2 in the RAID-Z Galois field. */
304const uint8_t vdev_raidz_log2[256] __attribute__((aligned(256))) = {
305	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
306	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
307	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
308	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
309	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
310	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
311	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
312	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
313	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
314	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
315	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
316	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
317	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
318	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
319	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
320	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
321	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
322	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
323	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
324	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
325	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
326	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
327	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
328	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
329	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
330	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
331	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
332	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
333	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
334	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
335	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
336	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
337};
338