/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (C) 2016 Romain Dolbeau. All rights reserved.
 * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
 */

#include <sys/isa_defs.h>

#if defined(__x86_64) && defined(HAVE_AVX512BW)

#include <sys/param.h>
#include <sys/types.h>
#include <sys/simd.h>


#ifdef __linux__
#define	__asm __asm__ __volatile__
#endif

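/*
 * Variadic register-list helpers: REG_CNT(r...) expands to the number of
 * arguments passed (1-8), and VRn(r...) expands to the name of the n-th
 * zmm register in the list as an assembler string (e.g. "zmm4").
 */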
#define	_REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N
#define	REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1)

#define	VR0_(REG, ...) "zmm"#REG
#define	VR1_(_1, REG, ...) "zmm"#REG
#define	VR2_(_1, _2, REG, ...) "zmm"#REG
#define	VR3_(_1, _2, _3, REG, ...) "zmm"#REG
#define	VR4_(_1, _2, _3, _4, REG, ...) "zmm"#REG
#define	VR5_(_1, _2, _3, _4, _5, REG, ...) "zmm"#REG
#define	VR6_(_1, _2, _3, _4, _5, _6, REG, ...) "zmm"#REG
#define	VR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "zmm"#REG

#define	VR0(r...) VR0_(r)
#define	VR1(r...) VR1_(r)
#define	VR2(r...) VR2_(r, 1)
#define	VR3(r...) VR3_(r, 1, 2)
#define	VR4(r...) VR4_(r, 1, 2)
#define	VR5(r...) VR5_(r, 1, 2, 3)
#define	VR6(r...) VR6_(r, 1, 2, 3, 4)
#define	VR7(r...) VR7_(r, 1, 2, 3, 4, 5)

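/*
 * R_01/R_23 select the first or second pair of registers from a
 * four-register list.
 */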
#define	R_01(REG1, REG2, ...) REG1, REG2
#define	_R_23(_0, _1, REG2, REG3, ...) REG2, REG3
#define	R_23(REG...) _R_23(REG, 1, 2, 3)

#define	ZFS_ASM_BUG()	ASSERT(0)

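/*
 * gf_clmul_mod_lt holds four consecutive 16-byte vpshufb lookup tables per
 * multiplier constant, used by MUL() below. v_t is one 64-byte (512-bit)
 * data element.
 */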
extern const uint8_t gf_clmul_mod_lt[4*256][16];

#define	ELEM_SIZE 64

typedef struct v {
	uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE)));
} v_t;

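/*
 * XOR_ACC(src, r...): xor two or four consecutive 64-byte blocks at src
 * into the listed registers.
 */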
#define	XOR_ACC(src, r...)						\
{									\
	switch (REG_CNT(r)) {						\
	case 4:								\
		__asm(							\
		    "vpxorq 0x00(%[SRC]), %%" VR0(r)", %%" VR0(r) "\n"	\
		    "vpxorq 0x40(%[SRC]), %%" VR1(r)", %%" VR1(r) "\n"	\
		    "vpxorq 0x80(%[SRC]), %%" VR2(r)", %%" VR2(r) "\n"	\
		    "vpxorq 0xc0(%[SRC]), %%" VR3(r)", %%" VR3(r) "\n"	\
		    : : [SRC] "r" (src));				\
		break;							\
	case 2:								\
		__asm(							\
		    "vpxorq 0x00(%[SRC]), %%" VR0(r)", %%" VR0(r) "\n"	\
		    "vpxorq 0x40(%[SRC]), %%" VR1(r)", %%" VR1(r) "\n"	\
		    : : [SRC] "r" (src));				\
		break;							\
	default:							\
		ZFS_ASM_BUG();						\
	}								\
}

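/*
 * XOR(r...): xor the first half of the register list into the second half.
 * ZERO(r...) below clears registers by xoring each one with itself.
 */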
#define	XOR(r...)							\
{									\
	switch (REG_CNT(r)) {						\
	case 8:								\
		__asm(							\
		    "vpxorq %" VR0(r) ", %" VR4(r)", %" VR4(r) "\n"	\
		    "vpxorq %" VR1(r) ", %" VR5(r)", %" VR5(r) "\n"	\
		    "vpxorq %" VR2(r) ", %" VR6(r)", %" VR6(r) "\n"	\
		    "vpxorq %" VR3(r) ", %" VR7(r)", %" VR7(r));	\
		break;							\
	case 4:								\
		__asm(							\
		    "vpxorq %" VR0(r) ", %" VR2(r)", %" VR2(r) "\n"	\
		    "vpxorq %" VR1(r) ", %" VR3(r)", %" VR3(r));	\
		break;							\
	default:							\
		ZFS_ASM_BUG();						\
	}								\
}

#define	ZERO(r...)	XOR(r, r)

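/*
 * COPY(r...): copy the first half of the register list into the second half.
 */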
#define	COPY(r...)							\
{									\
	switch (REG_CNT(r)) {						\
	case 8:								\
		__asm(							\
		    "vmovdqa64 %" VR0(r) ", %" VR4(r) "\n"		\
		    "vmovdqa64 %" VR1(r) ", %" VR5(r) "\n"		\
		    "vmovdqa64 %" VR2(r) ", %" VR6(r) "\n"		\
		    "vmovdqa64 %" VR3(r) ", %" VR7(r));			\
		break;							\
	case 4:								\
		__asm(							\
		    "vmovdqa64 %" VR0(r) ", %" VR2(r) "\n"		\
		    "vmovdqa64 %" VR1(r) ", %" VR3(r));			\
		break;							\
	default:							\
		ZFS_ASM_BUG();						\
	}								\
}

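/*
 * LOAD(src, r...): load two or four consecutive 64-byte blocks from src
 * into the listed registers (src must be 64-byte aligned).
 */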
#define	LOAD(src, r...)							\
{									\
	switch (REG_CNT(r)) {						\
	case 4:								\
		__asm(							\
		    "vmovdqa64 0x00(%[SRC]), %%" VR0(r) "\n"		\
		    "vmovdqa64 0x40(%[SRC]), %%" VR1(r) "\n"		\
		    "vmovdqa64 0x80(%[SRC]), %%" VR2(r) "\n"		\
		    "vmovdqa64 0xc0(%[SRC]), %%" VR3(r) "\n"		\
		    : : [SRC] "r" (src));				\
		break;							\
	case 2:								\
		__asm(							\
		    "vmovdqa64 0x00(%[SRC]), %%" VR0(r) "\n"		\
		    "vmovdqa64 0x40(%[SRC]), %%" VR1(r) "\n"		\
		    : : [SRC] "r" (src));				\
		break;							\
	default:							\
		ZFS_ASM_BUG();						\
	}								\
}

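/*
 * STORE(dst, r...): store the listed registers to two or four consecutive
 * 64-byte blocks at dst (dst must be 64-byte aligned).
 */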
#define	STORE(dst, r...)						\
{									\
	switch (REG_CNT(r)) {						\
	case 4:								\
		__asm(							\
		    "vmovdqa64 %%" VR0(r) ", 0x00(%[DST])\n"		\
		    "vmovdqa64 %%" VR1(r) ", 0x40(%[DST])\n"		\
		    "vmovdqa64 %%" VR2(r) ", 0x80(%[DST])\n"		\
		    "vmovdqa64 %%" VR3(r) ", 0xc0(%[DST])\n"		\
		    : : [DST] "r" (dst));				\
		break;							\
	case 2:								\
		__asm(							\
		    "vmovdqa64 %%" VR0(r) ", 0x00(%[DST])\n"		\
		    "vmovdqa64 %%" VR1(r) ", 0x40(%[DST])\n"		\
		    : : [DST] "r" (dst));				\
		break;							\
	default:							\
		ZFS_ASM_BUG();						\
	}								\
}

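/*
 * GF(2^8) multiplication by 2: MUL2_SETUP() broadcasts the reduction
 * constant 0x1d into zmm22 and zeroes zmm23, which serves as the
 * comparison operand in _MUL2() below.
 */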
#define	MUL2_SETUP() 							\
{   									\
	__asm("vmovq %0,    %%xmm22" :: "r"(0x1d1d1d1d1d1d1d1d));	\
	__asm("vpbroadcastq  %xmm22, %zmm22");				\
	__asm("vpxord        %zmm23, %zmm23 ,%zmm23");			\
}

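/*
 * _MUL2(r...): multiply each byte of two registers by 2 in GF(2^8).
 * Bytes with the high bit set are flagged in a mask register, every byte
 * is doubled with vpaddb, and the flagged bytes are then reduced by
 * xoring in 0x1d via a masked move.
 */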
#define	_MUL2(r...)							\
{									\
	switch	(REG_CNT(r)) {						\
	case 2:								\
		__asm(							\
		    "vpcmpb $1, %zmm23,     %" VR0(r)", %k1\n"		\
		    "vpcmpb $1, %zmm23,     %" VR1(r)", %k2\n"		\
		    "vpaddb     %" VR0(r)", %" VR0(r)", %" VR0(r) "\n"	\
		    "vpaddb     %" VR1(r)", %" VR1(r)", %" VR1(r) "\n"	\
		    "vpxord     %zmm22,     %" VR0(r)", %zmm12\n"	\
		    "vpxord     %zmm22,     %" VR1(r)", %zmm13\n"	\
		    "vmovdqu8   %zmm12,     %" VR0(r) "{%k1}\n"		\
		    "vmovdqu8   %zmm13,     %" VR1(r) "{%k2}");		\
		break;							\
	default:							\
		ZFS_ASM_BUG();						\
	}								\
}

#define	MUL2(r...)							\
{									\
	switch (REG_CNT(r)) {						\
	case 4:								\
	    _MUL2(R_01(r));						\
	    _MUL2(R_23(r));						\
	    break;							\
	case 2:								\
	    _MUL2(r);							\
	    break;							\
	default:							\
		ZFS_ASM_BUG();						\
	}								\
}

#define	MUL4(r...)							\
{									\
	MUL2(r);							\
	MUL2(r);							\
}

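/*
 * MUL(c, r...): multiply each byte of the listed registers by the constant
 * c in GF(2^8). The product is formed per nibble with vpshufb lookups into
 * the gf_clmul_mod_lt tables; the aliases below name the nibble mask and
 * the scratch registers used while combining the high- and low-nibble
 * partial products.
 */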
#define	_0f		"zmm15"
#define	_as		"zmm14"
#define	_bs		"zmm13"
#define	_ltmod		"zmm12"
#define	_ltmul		"zmm11"
#define	_ta		"zmm10"
#define	_tb		"zmm15"

static const uint8_t __attribute__((aligned(64))) _mul_mask = 0x0F;

#define	_MULx2(c, r...)							\
{									\
	switch (REG_CNT(r)) {						\
	case 2:								\
		__asm(							\
		    "vpbroadcastb (%[mask]), %%" _0f "\n"		\
		    /* upper bits */					\
		    "vbroadcasti32x4 0x00(%[lt]), %%" _ltmod "\n"	\
		    "vbroadcasti32x4 0x10(%[lt]), %%" _ltmul "\n"	\
									\
		    "vpsraw $0x4, %%" VR0(r) ", %%"_as "\n"		\
		    "vpsraw $0x4, %%" VR1(r) ", %%"_bs "\n"		\
		    "vpandq %%" _0f ", %%" VR0(r) ", %%" VR0(r) "\n"	\
		    "vpandq %%" _0f ", %%" VR1(r) ", %%" VR1(r) "\n"	\
		    "vpandq %%" _0f ", %%" _as ", %%" _as "\n"		\
		    "vpandq %%" _0f ", %%" _bs ", %%" _bs "\n"		\
									\
		    "vpshufb %%" _as ", %%" _ltmod ", %%" _ta "\n"	\
		    "vpshufb %%" _bs ", %%" _ltmod ", %%" _tb "\n"	\
		    "vpshufb %%" _as ", %%" _ltmul ", %%" _as "\n"	\
		    "vpshufb %%" _bs ", %%" _ltmul ", %%" _bs "\n"	\
		    /* lower bits */					\
		    "vbroadcasti32x4 0x20(%[lt]), %%" _ltmod "\n"	\
		    "vbroadcasti32x4 0x30(%[lt]), %%" _ltmul "\n"	\
									\
		    "vpxorq %%" _ta ", %%" _as ", %%" _as "\n"		\
		    "vpxorq %%" _tb ", %%" _bs ", %%" _bs "\n"		\
									\
		    "vpshufb %%" VR0(r) ", %%" _ltmod ", %%" _ta "\n"	\
		    "vpshufb %%" VR1(r) ", %%" _ltmod ", %%" _tb "\n"	\
		    "vpshufb %%" VR0(r) ", %%" _ltmul ", %%" VR0(r) "\n"\
		    "vpshufb %%" VR1(r) ", %%" _ltmul ", %%" VR1(r) "\n"\
									\
		    "vpxorq %%" _ta ", %%" VR0(r) ", %%" VR0(r) "\n"	\
		    "vpxorq %%" _as ", %%" VR0(r) ", %%" VR0(r) "\n"	\
		    "vpxorq %%" _tb ", %%" VR1(r) ", %%" VR1(r) "\n"	\
		    "vpxorq %%" _bs ", %%" VR1(r) ", %%" VR1(r) "\n"	\
		    : : [mask] "r" (&_mul_mask),			\
		    [lt] "r" (gf_clmul_mod_lt[4*(c)]));			\
		break;							\
	default:							\
		ZFS_ASM_BUG();						\
	}								\
}

#define	MUL(c, r...)							\
{									\
	switch (REG_CNT(r)) {						\
	case 4:								\
		_MULx2(c, R_01(r));					\
		_MULx2(c, R_23(r));					\
		break;							\
	case 2:								\
		_MULx2(c, R_01(r));					\
		break;							\
	default:							\
		ZFS_ASM_BUG();						\
	}								\
}

#define	raidz_math_begin()	kfpu_begin()
#define	raidz_math_end()	kfpu_end()

/*
 * ZERO, COPY, and MUL operations are already 2x unrolled, which means that
 * the stride of these operations for avx512 must not exceed 4. Otherwise, a
 * single step would exceed 512B block size.
 */

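/*
 * The *_STRIDE, *_DEFINE, and register-list macros below parameterize the
 * generic RAID-Z parity generation and reconstruction kernels that are
 * instantiated by including vdev_raidz_math_impl.h at the end of this file.
 */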
#define	SYN_STRIDE		4

#define	ZERO_STRIDE		4
#define	ZERO_DEFINE()		{}
#define	ZERO_D			0, 1, 2, 3

#define	COPY_STRIDE		4
#define	COPY_DEFINE()		{}
#define	COPY_D			0, 1, 2, 3

#define	ADD_STRIDE		4
#define	ADD_DEFINE()		{}
#define	ADD_D			0, 1, 2, 3

#define	MUL_STRIDE		4
#define	MUL_DEFINE()		{}
#define	MUL_D			0, 1, 2, 3

#define	GEN_P_STRIDE		4
#define	GEN_P_DEFINE()		{}
#define	GEN_P_P			0, 1, 2, 3

#define	GEN_PQ_STRIDE		4
#define	GEN_PQ_DEFINE() 	{}
#define	GEN_PQ_D		0, 1, 2, 3
#define	GEN_PQ_C		4, 5, 6, 7

#define	GEN_PQR_STRIDE		4
#define	GEN_PQR_DEFINE() 	{}
#define	GEN_PQR_D		0, 1, 2, 3
#define	GEN_PQR_C		4, 5, 6, 7

#define	SYN_Q_DEFINE()		{}
#define	SYN_Q_D			0, 1, 2, 3
#define	SYN_Q_X			4, 5, 6, 7

#define	SYN_R_DEFINE()		{}
#define	SYN_R_D			0, 1, 2, 3
#define	SYN_R_X			4, 5, 6, 7

#define	SYN_PQ_DEFINE() 	{}
#define	SYN_PQ_D		0, 1, 2, 3
#define	SYN_PQ_X		4, 5, 6, 7

#define	REC_PQ_STRIDE		2
#define	REC_PQ_DEFINE() 	{}
#define	REC_PQ_X		0, 1
#define	REC_PQ_Y		2, 3
#define	REC_PQ_T		4, 5

#define	SYN_PR_DEFINE() 	{}
#define	SYN_PR_D		0, 1, 2, 3
#define	SYN_PR_X		4, 5, 6, 7

#define	REC_PR_STRIDE		2
#define	REC_PR_DEFINE() 	{}
#define	REC_PR_X		0, 1
#define	REC_PR_Y		2, 3
#define	REC_PR_T		4, 5

#define	SYN_QR_DEFINE() 	{}
#define	SYN_QR_D		0, 1, 2, 3
#define	SYN_QR_X		4, 5, 6, 7

#define	REC_QR_STRIDE		2
#define	REC_QR_DEFINE() 	{}
#define	REC_QR_X		0, 1
#define	REC_QR_Y		2, 3
#define	REC_QR_T		4, 5

#define	SYN_PQR_DEFINE() 	{}
#define	SYN_PQR_D		0, 1, 2, 3
#define	SYN_PQR_X		4, 5, 6, 7

#define	REC_PQR_STRIDE		2
#define	REC_PQR_DEFINE() 	{}
#define	REC_PQR_X		0, 1
#define	REC_PQR_Y		2, 3
#define	REC_PQR_Z		4, 5
#define	REC_PQR_XS		6, 7
#define	REC_PQR_YS		8, 9


#include <sys/vdev_raidz_impl.h>
#include "vdev_raidz_math_impl.h"

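/*
 * Instantiate the parity generation and data reconstruction methods for
 * this implementation from the generic template.
 */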
DEFINE_GEN_METHODS(avx512bw);
DEFINE_REC_METHODS(avx512bw);

static boolean_t
raidz_will_avx512bw_work(void)
{
	return (kfpu_allowed() && zfs_avx_available() &&
	    zfs_avx512f_available() && zfs_avx512bw_available());
}

const raidz_impl_ops_t vdev_raidz_avx512bw_impl = {
	.init = NULL,
	.fini = NULL,
	.gen = RAIDZ_GEN_METHODS(avx512bw),
	.rec = RAIDZ_REC_METHODS(avx512bw),
	.is_supported = &raidz_will_avx512bw_work,
	.name = "avx512bw"
};

#endif /* defined(__x86_64) && defined(HAVE_AVX512BW) */