/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (C) 2016 Romain Dolbeau. All rights reserved.
 */

#include <sys/types.h>
#include <sys/simd.h>

#ifdef __linux__
#define	__asm __asm__ __volatile__
#endif

#define	_REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N
#define	REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1)

#define	VR0_(REG, ...) "%[w"#REG"]"
#define	VR1_(_1, REG, ...) "%[w"#REG"]"
#define	VR2_(_1, _2, REG, ...) "%[w"#REG"]"
#define	VR3_(_1, _2, _3, REG, ...) "%[w"#REG"]"
#define	VR4_(_1, _2, _3, _4, REG, ...) "%[w"#REG"]"
#define	VR5_(_1, _2, _3, _4, _5, REG, ...) "%[w"#REG"]"
#define	VR6_(_1, _2, _3, _4, _5, _6, REG, ...) "%[w"#REG"]"
#define	VR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "%[w"#REG"]"

/*
 * Here we need registers that are not used for anything else.  The
 * dummy "register" numbers (36, 35, ...) appended by the VR2()..VR7(),
 * RVR*(), WVR*() and UVR*() wrappers below are only consumed by the
 * switch cases written for more registers than the caller actually
 * passed.  That code is never executed, but GCC still compiles it and
 * checks its constraints, and duplicate constraints are illegal, so
 * every padding slot needs a distinct, otherwise unused register
 * number as its operand name.
 */

#define	VR0(r...) VR0_(r)
#define	VR1(r...) VR1_(r)
#define	VR2(r...) VR2_(r, 36)
#define	VR3(r...) VR3_(r, 36, 35)
#define	VR4(r...) VR4_(r, 36, 35, 34, 33)
#define	VR5(r...) VR5_(r, 36, 35, 34, 33, 32)
#define	VR6(r...) VR6_(r, 36, 35, 34, 33, 32, 31)
#define	VR7(r...) VR7_(r, 36, 35, 34, 33, 32, 31, 30)

#define	VR(X) "%[w"#X"]"
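
/*
 * Expansion example (illustration only): REG_CNT(0, 1, 2, 3) evaluates
 * to 4, and VR2(0, 1, 2, 3) expands via VR2_(0, 1, 2, 3, 36) to
 * "%[w2]".  When fewer registers are passed, the padding fills the
 * missing slot instead: VR2(0, 1) expands via VR2_(0, 1, 36) to
 * "%[w36]", naming one of the dummy variables provided by
 * GEN_X_DEFINE_33_36() further down.
 */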

#define	RVR0_(REG, ...) [w##REG] "w" (w##REG)
#define	RVR1_(_1, REG, ...) [w##REG] "w" (w##REG)
#define	RVR2_(_1, _2, REG, ...) [w##REG] "w" (w##REG)
#define	RVR3_(_1, _2, _3, REG, ...) [w##REG] "w" (w##REG)
#define	RVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "w" (w##REG)
#define	RVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "w" (w##REG)
#define	RVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "w" (w##REG)
#define	RVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "w" (w##REG)

#define	RVR0(r...) RVR0_(r)
#define	RVR1(r...) RVR1_(r)
#define	RVR2(r...) RVR2_(r, 36)
#define	RVR3(r...) RVR3_(r, 36, 35)
#define	RVR4(r...) RVR4_(r, 36, 35, 34, 33)
#define	RVR5(r...) RVR5_(r, 36, 35, 34, 33, 32)
#define	RVR6(r...) RVR6_(r, 36, 35, 34, 33, 32, 31)
#define	RVR7(r...) RVR7_(r, 36, 35, 34, 33, 32, 31, 30)

#define	RVR(X) [w##X] "w" (w##X)

#define	WVR0_(REG, ...) [w##REG] "=w" (w##REG)
#define	WVR1_(_1, REG, ...) [w##REG] "=w" (w##REG)
#define	WVR2_(_1, _2, REG, ...) [w##REG] "=w" (w##REG)
#define	WVR3_(_1, _2, _3, REG, ...) [w##REG] "=w" (w##REG)
#define	WVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "=w" (w##REG)
#define	WVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "=w" (w##REG)
#define	WVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "=w" (w##REG)
#define	WVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "=w" (w##REG)

#define	WVR0(r...) WVR0_(r)
#define	WVR1(r...) WVR1_(r)
#define	WVR2(r...) WVR2_(r, 36)
#define	WVR3(r...) WVR3_(r, 36, 35)
#define	WVR4(r...) WVR4_(r, 36, 35, 34, 33)
#define	WVR5(r...) WVR5_(r, 36, 35, 34, 33, 32)
#define	WVR6(r...) WVR6_(r, 36, 35, 34, 33, 32, 31)
#define	WVR7(r...) WVR7_(r, 36, 35, 34, 33, 32, 31, 30)

#define	WVR(X) [w##X] "=w" (w##X)

#define	UVR0_(REG, ...) [w##REG] "+&w" (w##REG)
#define	UVR1_(_1, REG, ...) [w##REG] "+&w" (w##REG)
#define	UVR2_(_1, _2, REG, ...) [w##REG] "+&w" (w##REG)
#define	UVR3_(_1, _2, _3, REG, ...) [w##REG] "+&w" (w##REG)
#define	UVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "+&w" (w##REG)
#define	UVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "+&w" (w##REG)
#define	UVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "+&w" (w##REG)
#define	UVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "+&w" (w##REG)

#define	UVR0(r...) UVR0_(r)
#define	UVR1(r...) UVR1_(r)
#define	UVR2(r...) UVR2_(r, 36)
#define	UVR3(r...) UVR3_(r, 36, 35)
#define	UVR4(r...) UVR4_(r, 36, 35, 34, 33)
#define	UVR5(r...) UVR5_(r, 36, 35, 34, 33, 32)
#define	UVR6(r...) UVR6_(r, 36, 35, 34, 33, 32, 31)
#define	UVR7(r...) UVR7_(r, 36, 35, 34, 33, 32, 31, 30)

#define	UVR(X) [w##X] "+&w" (w##X)

#define	R_01(REG1, REG2, ...) REG1, REG2
#define	_R_23(_0, _1, REG2, REG3, ...) REG2, REG3
#define	R_23(REG...) _R_23(REG, 1, 2, 3)

#define	ZFS_ASM_BUG()	ASSERT(0)

#define	OFFSET(ptr, val)	(((unsigned char *)(ptr))+val)

extern const uint8_t gf_clmul_mod_lt[4*256][16];

#define	ELEM_SIZE 16

typedef struct v {
	uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE)));
} v_t;

#define	XOR_ACC(src, r...)						\
{									\
	switch (REG_CNT(r)) {						\
	case 8:								\
		__asm(							\
		"ld1 { v21.4s },%[SRC0]\n"				\
		"ld1 { v20.4s },%[SRC1]\n"				\
		"ld1 { v19.4s },%[SRC2]\n"				\
		"ld1 { v18.4s },%[SRC3]\n"				\
		"eor " VR0(r) ".16b," VR0(r) ".16b,v21.16b\n"		\
		"eor " VR1(r) ".16b," VR1(r) ".16b,v20.16b\n"		\
		"eor " VR2(r) ".16b," VR2(r) ".16b,v19.16b\n"		\
		"eor " VR3(r) ".16b," VR3(r) ".16b,v18.16b\n"		\
		"ld1 { v21.4s },%[SRC4]\n"				\
		"ld1 { v20.4s },%[SRC5]\n"				\
		"ld1 { v19.4s },%[SRC6]\n"				\
		"ld1 { v18.4s },%[SRC7]\n"				\
		"eor " VR4(r) ".16b," VR4(r) ".16b,v21.16b\n"		\
		"eor " VR5(r) ".16b," VR5(r) ".16b,v20.16b\n"		\
		"eor " VR6(r) ".16b," VR6(r) ".16b,v19.16b\n"		\
		"eor " VR7(r) ".16b," VR7(r) ".16b,v18.16b\n"		\
		:	UVR0(r), UVR1(r), UVR2(r), UVR3(r),		\
			UVR4(r), UVR5(r), UVR6(r), UVR7(r)		\
		:	[SRC0] "Q" (*(OFFSET(src, 0))),			\
		[SRC1] "Q" (*(OFFSET(src, 16))),			\
		[SRC2] "Q" (*(OFFSET(src, 32))),			\
		[SRC3] "Q" (*(OFFSET(src, 48))),			\
		[SRC4] "Q" (*(OFFSET(src, 64))),			\
		[SRC5] "Q" (*(OFFSET(src, 80))),			\
		[SRC6] "Q" (*(OFFSET(src, 96))),			\
		[SRC7] "Q" (*(OFFSET(src, 112)))			\
		:	"v18", "v19", "v20", "v21");			\
		break;							\
	case 4:								\
		__asm(							\
		"ld1 { v21.4s },%[SRC0]\n"				\
		"ld1 { v20.4s },%[SRC1]\n"				\
		"ld1 { v19.4s },%[SRC2]\n"				\
		"ld1 { v18.4s },%[SRC3]\n"				\
		"eor " VR0(r) ".16b," VR0(r) ".16b,v21.16b\n"		\
		"eor " VR1(r) ".16b," VR1(r) ".16b,v20.16b\n"		\
		"eor " VR2(r) ".16b," VR2(r) ".16b,v19.16b\n"		\
		"eor " VR3(r) ".16b," VR3(r) ".16b,v18.16b\n"		\
		:	UVR0(r), UVR1(r), UVR2(r), UVR3(r)		\
		:	[SRC0] "Q" (*(OFFSET(src, 0))),			\
		[SRC1] "Q" (*(OFFSET(src, 16))),			\
		[SRC2] "Q" (*(OFFSET(src, 32))),			\
		[SRC3] "Q" (*(OFFSET(src, 48)))				\
		:	"v18", "v19", "v20", "v21");			\
		break;							\
	case 2:								\
		__asm(							\
		"ld1 { v21.4s },%[SRC0]\n"				\
		"ld1 { v20.4s },%[SRC1]\n"				\
		"eor " VR0(r) ".16b," VR0(r) ".16b,v21.16b\n"		\
		"eor " VR1(r) ".16b," VR1(r) ".16b,v20.16b\n"		\
		:	UVR0(r), UVR1(r)				\
		:	[SRC0] "Q" (*(OFFSET(src, 0))),			\
		[SRC1] "Q" (*(OFFSET(src, 16)))				\
		:	"v20", "v21");					\
		break;							\
	default:							\
		ZFS_ASM_BUG();						\
	}								\
}
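
/*
 * For reference, a plain C sketch of what XOR_ACC() does for two
 * registers (illustration only, not built): it xors REG_CNT(r)
 * consecutive 16-byte blocks of src into the listed vector variables,
 * first block into the first register, second into the second, etc.
 */
#if 0
static inline void
xor_acc_sketch_2(const void *src, v_t *r0, v_t *r1)
{
	const v_t *s = (const v_t *)src;
	int i;

	for (i = 0; i < ELEM_SIZE; i++) {
		r0->b[i] ^= s[0].b[i];
		r1->b[i] ^= s[1].b[i];
	}
}
#endif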

#define	XOR(r...)							\
{									\
	switch (REG_CNT(r)) {						\
	case 8:								\
		__asm(							\
		"eor " VR4(r) ".16b," VR4(r) ".16b," VR0(r) ".16b\n"	\
		"eor " VR5(r) ".16b," VR5(r) ".16b," VR1(r) ".16b\n"	\
		"eor " VR6(r) ".16b," VR6(r) ".16b," VR2(r) ".16b\n"	\
		"eor " VR7(r) ".16b," VR7(r) ".16b," VR3(r) ".16b\n"	\
		:	UVR4(r), UVR5(r), UVR6(r), UVR7(r)		\
		:	RVR0(r), RVR1(r), RVR2(r), RVR3(r));		\
		break;							\
	case 4:								\
		__asm(							\
		"eor " VR2(r) ".16b," VR2(r) ".16b," VR0(r) ".16b\n"	\
		"eor " VR3(r) ".16b," VR3(r) ".16b," VR1(r) ".16b\n"	\
		:	UVR2(r), UVR3(r)				\
		:	RVR0(r), RVR1(r));				\
		break;							\
	default:							\
		ZFS_ASM_BUG();						\
	}								\
}

#define	ZERO(r...)							\
{									\
	switch (REG_CNT(r)) {						\
	case 8:								\
		__asm(							\
		"eor " VR0(r) ".16b," VR0(r) ".16b," VR0(r) ".16b\n"	\
		"eor " VR1(r) ".16b," VR1(r) ".16b," VR1(r) ".16b\n"	\
		"eor " VR2(r) ".16b," VR2(r) ".16b," VR2(r) ".16b\n"	\
		"eor " VR3(r) ".16b," VR3(r) ".16b," VR3(r) ".16b\n"	\
		"eor " VR4(r) ".16b," VR4(r) ".16b," VR4(r) ".16b\n"	\
		"eor " VR5(r) ".16b," VR5(r) ".16b," VR5(r) ".16b\n"	\
		"eor " VR6(r) ".16b," VR6(r) ".16b," VR6(r) ".16b\n"	\
		"eor " VR7(r) ".16b," VR7(r) ".16b," VR7(r) ".16b\n"	\
		:	WVR0(r), WVR1(r), WVR2(r), WVR3(r),		\
			WVR4(r), WVR5(r), WVR6(r), WVR7(r));		\
		break;							\
	case 4:								\
		__asm(							\
		"eor " VR0(r) ".16b," VR0(r) ".16b," VR0(r) ".16b\n"	\
		"eor " VR1(r) ".16b," VR1(r) ".16b," VR1(r) ".16b\n"	\
		"eor " VR2(r) ".16b," VR2(r) ".16b," VR2(r) ".16b\n"	\
		"eor " VR3(r) ".16b," VR3(r) ".16b," VR3(r) ".16b\n"	\
		:	WVR0(r), WVR1(r), WVR2(r), WVR3(r));		\
		break;							\
	case 2:								\
		__asm(							\
		"eor " VR0(r) ".16b," VR0(r) ".16b," VR0(r) ".16b\n"	\
		"eor " VR1(r) ".16b," VR1(r) ".16b," VR1(r) ".16b\n"	\
		:	WVR0(r), WVR1(r));				\
		break;							\
	default:							\
		ZFS_ASM_BUG();						\
	}								\
}

#define	COPY(r...)							\
{									\
	switch (REG_CNT(r)) {						\
	case 8:								\
		__asm(							\
		"mov " VR4(r) ".16b," VR0(r) ".16b\n"			\
		"mov " VR5(r) ".16b," VR1(r) ".16b\n"			\
		"mov " VR6(r) ".16b," VR2(r) ".16b\n"			\
		"mov " VR7(r) ".16b," VR3(r) ".16b\n"			\
		:	WVR4(r), WVR5(r), WVR6(r), WVR7(r)		\
		:	RVR0(r), RVR1(r), RVR2(r), RVR3(r));		\
		break;							\
	case 4:								\
		__asm(							\
		"mov " VR2(r) ".16b," VR0(r) ".16b\n"			\
		"mov " VR3(r) ".16b," VR1(r) ".16b\n"			\
		:	WVR2(r), WVR3(r)				\
		:	RVR0(r), RVR1(r));				\
		break;							\
	default:							\
		ZFS_ASM_BUG();						\
	}								\
}

#define	LOAD(src, r...)							\
{									\
	switch (REG_CNT(r)) {						\
	case 8:								\
		__asm(							\
		"ld1 { " VR0(r) ".4s },%[SRC0]\n"			\
		"ld1 { " VR1(r) ".4s },%[SRC1]\n"			\
		"ld1 { " VR2(r) ".4s },%[SRC2]\n"			\
		"ld1 { " VR3(r) ".4s },%[SRC3]\n"			\
		"ld1 { " VR4(r) ".4s },%[SRC4]\n"			\
		"ld1 { " VR5(r) ".4s },%[SRC5]\n"			\
		"ld1 { " VR6(r) ".4s },%[SRC6]\n"			\
		"ld1 { " VR7(r) ".4s },%[SRC7]\n"			\
		:	WVR0(r), WVR1(r), WVR2(r), WVR3(r),		\
			WVR4(r), WVR5(r), WVR6(r), WVR7(r)		\
		:	[SRC0] "Q" (*(OFFSET(src, 0))),			\
		[SRC1] "Q" (*(OFFSET(src, 16))),			\
		[SRC2] "Q" (*(OFFSET(src, 32))),			\
		[SRC3] "Q" (*(OFFSET(src, 48))),			\
		[SRC4] "Q" (*(OFFSET(src, 64))),			\
		[SRC5] "Q" (*(OFFSET(src, 80))),			\
		[SRC6] "Q" (*(OFFSET(src, 96))),			\
		[SRC7] "Q" (*(OFFSET(src, 112))));			\
		break;							\
	case 4:								\
		__asm(							\
		"ld1 { " VR0(r) ".4s },%[SRC0]\n"			\
		"ld1 { " VR1(r) ".4s },%[SRC1]\n"			\
		"ld1 { " VR2(r) ".4s },%[SRC2]\n"			\
		"ld1 { " VR3(r) ".4s },%[SRC3]\n"			\
		:	WVR0(r), WVR1(r), WVR2(r), WVR3(r)		\
		:	[SRC0] "Q" (*(OFFSET(src, 0))),			\
		[SRC1] "Q" (*(OFFSET(src, 16))),			\
		[SRC2] "Q" (*(OFFSET(src, 32))),			\
		[SRC3] "Q" (*(OFFSET(src, 48))));			\
		break;							\
	case 2:								\
		__asm(							\
		"ld1 { " VR0(r) ".4s },%[SRC0]\n"			\
		"ld1 { " VR1(r) ".4s },%[SRC1]\n"			\
		:	WVR0(r), WVR1(r)				\
		:	[SRC0] "Q" (*(OFFSET(src, 0))),			\
		[SRC1] "Q" (*(OFFSET(src, 16))));			\
		break;							\
	default:							\
		ZFS_ASM_BUG();						\
	}								\
}

#define	STORE(dst, r...)						\
{									\
	switch (REG_CNT(r)) {						\
	case 8:								\
		__asm(							\
		"st1 { " VR0(r) ".4s },%[DST0]\n"			\
		"st1 { " VR1(r) ".4s },%[DST1]\n"			\
		"st1 { " VR2(r) ".4s },%[DST2]\n"			\
		"st1 { " VR3(r) ".4s },%[DST3]\n"			\
		"st1 { " VR4(r) ".4s },%[DST4]\n"			\
		"st1 { " VR5(r) ".4s },%[DST5]\n"			\
		"st1 { " VR6(r) ".4s },%[DST6]\n"			\
		"st1 { " VR7(r) ".4s },%[DST7]\n"			\
		:	[DST0] "=Q" (*(OFFSET(dst, 0))),		\
		[DST1] "=Q" (*(OFFSET(dst, 16))),			\
		[DST2] "=Q" (*(OFFSET(dst, 32))),			\
		[DST3] "=Q" (*(OFFSET(dst, 48))),			\
		[DST4] "=Q" (*(OFFSET(dst, 64))),			\
		[DST5] "=Q" (*(OFFSET(dst, 80))),			\
		[DST6] "=Q" (*(OFFSET(dst, 96))),			\
		[DST7] "=Q" (*(OFFSET(dst, 112)))			\
		:	RVR0(r), RVR1(r), RVR2(r), RVR3(r),		\
			RVR4(r), RVR5(r), RVR6(r), RVR7(r));		\
		break;							\
	case 4:								\
		__asm(							\
		"st1 { " VR0(r) ".4s },%[DST0]\n"			\
		"st1 { " VR1(r) ".4s },%[DST1]\n"			\
		"st1 { " VR2(r) ".4s },%[DST2]\n"			\
		"st1 { " VR3(r) ".4s },%[DST3]\n"			\
		:	[DST0] "=Q" (*(OFFSET(dst, 0))),		\
		[DST1] "=Q" (*(OFFSET(dst, 16))),			\
		[DST2] "=Q" (*(OFFSET(dst, 32))),			\
		[DST3] "=Q" (*(OFFSET(dst, 48)))			\
		:	RVR0(r), RVR1(r), RVR2(r), RVR3(r));		\
		break;							\
	case 2:								\
		__asm(							\
		"st1 { " VR0(r) ".4s },%[DST0]\n"			\
		"st1 { " VR1(r) ".4s },%[DST1]\n"			\
		:	[DST0] "=Q" (*(OFFSET(dst, 0))),		\
		[DST1] "=Q" (*(OFFSET(dst, 16)))			\
		:	RVR0(r), RVR1(r));				\
		break;							\
	default:							\
		ZFS_ASM_BUG();						\
	}								\
}

/*
 * Unfortunately these defines cannot be used through the macros above,
 * because GCC would end up using the macro name instead of its value
 * when building the operand names.  They are kept as a reference for
 * which numbered variable (hard register) plays which role below.
 */
#define	_00	"v17"
#define	_1d	"v16"
#define	_temp0	"v19"
#define	_temp1	"v18"

#define	MUL2_SETUP()							\
{									\
	__asm(								\
	"eor " VR(17) ".16b," VR(17) ".16b," VR(17) ".16b\n"		\
	"movi " VR(16) ".16b,#0x1d\n"					\
	:	WVR(16), WVR(17));					\
}

#define	MUL2(r...)							\
{									\
	switch (REG_CNT(r)) {						\
	case 4:								\
		__asm(							\
		"cmgt v19.16b," VR(17) ".16b," VR0(r) ".16b\n"		\
		"cmgt v18.16b," VR(17) ".16b," VR1(r) ".16b\n"		\
		"cmgt v21.16b," VR(17) ".16b," VR2(r) ".16b\n"		\
		"cmgt v20.16b," VR(17) ".16b," VR3(r) ".16b\n"		\
		"and v19.16b,v19.16b," VR(16) ".16b\n"			\
		"and v18.16b,v18.16b," VR(16) ".16b\n"			\
		"and v21.16b,v21.16b," VR(16) ".16b\n"			\
		"and v20.16b,v20.16b," VR(16) ".16b\n"			\
		"shl " VR0(r) ".16b," VR0(r) ".16b,#1\n"		\
		"shl " VR1(r) ".16b," VR1(r) ".16b,#1\n"		\
		"shl " VR2(r) ".16b," VR2(r) ".16b,#1\n"		\
		"shl " VR3(r) ".16b," VR3(r) ".16b,#1\n"		\
		"eor " VR0(r) ".16b,v19.16b," VR0(r) ".16b\n"		\
		"eor " VR1(r) ".16b,v18.16b," VR1(r) ".16b\n"		\
		"eor " VR2(r) ".16b,v21.16b," VR2(r) ".16b\n"		\
		"eor " VR3(r) ".16b,v20.16b," VR3(r) ".16b\n"		\
		:	UVR0(r), UVR1(r), UVR2(r), UVR3(r)		\
		:	RVR(17), RVR(16)				\
		:	"v18", "v19", "v20", "v21");			\
		break;							\
	case 2:								\
		__asm(							\
		"cmgt v19.16b," VR(17) ".16b," VR0(r) ".16b\n"		\
		"cmgt v18.16b," VR(17) ".16b," VR1(r) ".16b\n"		\
		"and v19.16b,v19.16b," VR(16) ".16b\n"			\
		"and v18.16b,v18.16b," VR(16) ".16b\n"			\
		"shl " VR0(r) ".16b," VR0(r) ".16b,#1\n"		\
		"shl " VR1(r) ".16b," VR1(r) ".16b,#1\n"		\
		"eor " VR0(r) ".16b,v19.16b," VR0(r) ".16b\n"		\
		"eor " VR1(r) ".16b,v18.16b," VR1(r) ".16b\n"		\
		:	UVR0(r), UVR1(r)				\
		:	RVR(17), RVR(16)				\
		:	"v18", "v19");					\
		break;							\
	default:							\
		ZFS_ASM_BUG();						\
	}								\
}

#define	MUL4(r...)							\
{									\
	MUL2(r);							\
	MUL2(r);							\
}
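
/*
 * For reference (illustration only): MUL2_SETUP() preloads w17 (v17)
 * with zero and w16 (v16) with the constant 0x1d, then MUL2()
 * multiplies every byte of the given registers by 2 in GF(2^8) with
 * the RAID-Z reduction polynomial 0x11d; MUL4() applies MUL2() twice.
 * The per-byte operation is the usual conditional-xor form:
 */
#if 0	/* scalar sketch, not built */
static inline uint8_t
gf_mul2_sketch(uint8_t a)
{
	/* shift left by one; fold the carried-out bit back in via 0x1d */
	return ((uint8_t)(a << 1) ^ ((a & 0x80) ? 0x1d : 0x00));
}
#endif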

/*
 * As above, these defines cannot be used through the macros, because
 * GCC would use the macro name instead of its value.  They are kept
 * as a reference for which register plays which role (here the
 * clobbered temporaries are hard-coded registers).
 */
#define	_0f		"v15"
#define	_a_save		"v14"
#define	_b_save		"v13"
#define	_lt_mod_a	"v12"
#define	_lt_clmul_a	"v11"
#define	_lt_mod_b	"v10"
#define	_lt_clmul_b	"v15"

#define	_MULx2(c, r...)							\
{									\
	switch (REG_CNT(r)) {						\
	case 2:								\
		__asm(							\
		/* lts for upper part */				\
		"movi v15.16b,#0x0f\n"					\
		"ld1 { v10.4s },%[lt0]\n"				\
		"ld1 { v11.4s },%[lt1]\n"				\
		/* upper part */					\
		"and v14.16b," VR0(r) ".16b,v15.16b\n"			\
		"and v13.16b," VR1(r) ".16b,v15.16b\n"			\
		"ushr " VR0(r) ".16b," VR0(r) ".16b,#4\n"		\
		"ushr " VR1(r) ".16b," VR1(r) ".16b,#4\n"		\
									\
		"tbl v12.16b,{v10.16b}," VR0(r) ".16b\n"		\
		"tbl v10.16b,{v10.16b}," VR1(r) ".16b\n"		\
		"tbl v15.16b,{v11.16b}," VR0(r) ".16b\n"		\
		"tbl v11.16b,{v11.16b}," VR1(r) ".16b\n"		\
									\
		"eor " VR0(r) ".16b,v15.16b,v12.16b\n"			\
		"eor " VR1(r) ".16b,v11.16b,v10.16b\n"			\
		/* lts for lower part */				\
		"ld1 { v10.4s },%[lt2]\n"				\
		"ld1 { v15.4s },%[lt3]\n"				\
		/* lower part */					\
		"tbl v12.16b,{v10.16b},v14.16b\n"			\
		"tbl v10.16b,{v10.16b},v13.16b\n"			\
		"tbl v11.16b,{v15.16b},v14.16b\n"			\
		"tbl v15.16b,{v15.16b},v13.16b\n"			\
									\
		"eor " VR0(r) ".16b," VR0(r) ".16b,v12.16b\n"		\
		"eor " VR1(r) ".16b," VR1(r) ".16b,v10.16b\n"		\
		"eor " VR0(r) ".16b," VR0(r) ".16b,v11.16b\n"		\
		"eor " VR1(r) ".16b," VR1(r) ".16b,v15.16b\n"		\
		:	UVR0(r), UVR1(r)				\
		:	[lt0] "Q" ((gf_clmul_mod_lt[4*(c)+0][0])),	\
		[lt1] "Q" ((gf_clmul_mod_lt[4*(c)+1][0])),		\
		[lt2] "Q" ((gf_clmul_mod_lt[4*(c)+2][0])),		\
		[lt3] "Q" ((gf_clmul_mod_lt[4*(c)+3][0]))		\
		:	"v10", "v11", "v12", "v13", "v14", "v15");	\
		break;							\
	default:							\
		ZFS_ASM_BUG();						\
	}								\
}
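
/*
 * Per byte, the nibble table lookups above reduce to the expression
 * below (illustration only; the gf_clmul_mod_lt rows are assumed to
 * pack the carry-less multiply by the constant c and its modular
 * reduction into four 16-entry tables, two indexed by the high nibble
 * and two by the low nibble):
 */
#if 0	/* scalar sketch, not built */
static inline uint8_t
gf_mul_const_sketch(unsigned int c, uint8_t a)
{
	return (gf_clmul_mod_lt[4 * c + 0][a >> 4] ^
	    gf_clmul_mod_lt[4 * c + 1][a >> 4] ^
	    gf_clmul_mod_lt[4 * c + 2][a & 0x0f] ^
	    gf_clmul_mod_lt[4 * c + 3][a & 0x0f]);
}
#endif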

#define	MUL(c, r...)							\
{									\
	switch (REG_CNT(r)) {						\
	case 4:								\
		_MULx2(c, R_23(r));					\
		_MULx2(c, R_01(r));					\
		break;							\
	case 2:								\
		_MULx2(c, R_01(r));					\
		break;							\
	default:							\
		ZFS_ASM_BUG();						\
	}								\
}

#define	raidz_math_begin()	kfpu_begin()
#define	raidz_math_end()	kfpu_end()

/* Overkill... */
#if defined(_KERNEL)
#define	GEN_X_DEFINE_0_3()	\
register unsigned char w0 asm("v0") __attribute__((vector_size(16)));	\
register unsigned char w1 asm("v1") __attribute__((vector_size(16)));	\
register unsigned char w2 asm("v2") __attribute__((vector_size(16)));	\
register unsigned char w3 asm("v3") __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_4_5()	\
register unsigned char w4 asm("v4") __attribute__((vector_size(16)));	\
register unsigned char w5 asm("v5") __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_6_7()	\
register unsigned char w6 asm("v6") __attribute__((vector_size(16)));	\
register unsigned char w7 asm("v7") __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_8_9()	\
register unsigned char w8 asm("v8") __attribute__((vector_size(16)));	\
register unsigned char w9 asm("v9") __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_10_11()	\
register unsigned char w10 asm("v10") __attribute__((vector_size(16)));	\
register unsigned char w11 asm("v11") __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_12_15()	\
register unsigned char w12 asm("v12") __attribute__((vector_size(16)));	\
register unsigned char w13 asm("v13") __attribute__((vector_size(16)));	\
register unsigned char w14 asm("v14") __attribute__((vector_size(16)));	\
register unsigned char w15 asm("v15") __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_16()	\
register unsigned char w16 asm("v16") __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_17()	\
register unsigned char w17 asm("v17") __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_18_21()	\
register unsigned char w18 asm("v18") __attribute__((vector_size(16)));	\
register unsigned char w19 asm("v19") __attribute__((vector_size(16)));	\
register unsigned char w20 asm("v20") __attribute__((vector_size(16)));	\
register unsigned char w21 asm("v21") __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_22_23()	\
register unsigned char w22 asm("v22") __attribute__((vector_size(16)));	\
register unsigned char w23 asm("v23") __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_24_27()	\
register unsigned char w24 asm("v24") __attribute__((vector_size(16)));	\
register unsigned char w25 asm("v25") __attribute__((vector_size(16)));	\
register unsigned char w26 asm("v26") __attribute__((vector_size(16)));	\
register unsigned char w27 asm("v27") __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_28_30()	\
register unsigned char w28 asm("v28") __attribute__((vector_size(16)));	\
register unsigned char w29 asm("v29") __attribute__((vector_size(16)));	\
register unsigned char w30 asm("v30") __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_31()	\
register unsigned char w31 asm("v31") __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_32()	\
register unsigned char w32 asm("v31") __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_33_36()	\
register unsigned char w33 asm("v31") __attribute__((vector_size(16)));	\
register unsigned char w34 asm("v31") __attribute__((vector_size(16)));	\
register unsigned char w35 asm("v31") __attribute__((vector_size(16)));	\
register unsigned char w36 asm("v31") __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_37_38()	\
register unsigned char w37 asm("v31") __attribute__((vector_size(16)));	\
register unsigned char w38 asm("v31") __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_ALL()	\
	GEN_X_DEFINE_0_3()	\
	GEN_X_DEFINE_4_5()	\
	GEN_X_DEFINE_6_7()	\
	GEN_X_DEFINE_8_9()	\
	GEN_X_DEFINE_10_11()	\
	GEN_X_DEFINE_12_15()	\
	GEN_X_DEFINE_16()	\
	GEN_X_DEFINE_17()	\
	GEN_X_DEFINE_18_21()	\
	GEN_X_DEFINE_22_23()	\
	GEN_X_DEFINE_24_27()	\
	GEN_X_DEFINE_28_30()	\
	GEN_X_DEFINE_31()	\
	GEN_X_DEFINE_32()	\
	GEN_X_DEFINE_33_36()	\
	GEN_X_DEFINE_37_38()
#else
#define	GEN_X_DEFINE_0_3()	\
	unsigned char w0 __attribute__((vector_size(16)));	\
	unsigned char w1 __attribute__((vector_size(16)));	\
	unsigned char w2 __attribute__((vector_size(16)));	\
	unsigned char w3 __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_4_5()	\
	unsigned char w4 __attribute__((vector_size(16)));	\
	unsigned char w5 __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_6_7()	\
	unsigned char w6 __attribute__((vector_size(16)));	\
	unsigned char w7 __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_8_9()	\
	unsigned char w8 __attribute__((vector_size(16)));	\
	unsigned char w9 __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_10_11()	\
	unsigned char w10 __attribute__((vector_size(16)));	\
	unsigned char w11 __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_12_15()	\
	unsigned char w12 __attribute__((vector_size(16)));	\
	unsigned char w13 __attribute__((vector_size(16)));	\
	unsigned char w14 __attribute__((vector_size(16)));	\
	unsigned char w15 __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_16()	\
	unsigned char w16 __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_17()	\
	unsigned char w17 __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_18_21()	\
	unsigned char w18 __attribute__((vector_size(16)));	\
	unsigned char w19 __attribute__((vector_size(16)));	\
	unsigned char w20 __attribute__((vector_size(16)));	\
	unsigned char w21 __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_22_23()	\
	unsigned char w22 __attribute__((vector_size(16)));	\
	unsigned char w23 __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_24_27()	\
	unsigned char w24 __attribute__((vector_size(16)));	\
	unsigned char w25 __attribute__((vector_size(16)));	\
	unsigned char w26 __attribute__((vector_size(16)));	\
	unsigned char w27 __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_28_30()	\
	unsigned char w28 __attribute__((vector_size(16)));	\
	unsigned char w29 __attribute__((vector_size(16)));	\
	unsigned char w30 __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_31()	\
	unsigned char w31 __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_32()	\
	unsigned char w32 __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_33_36()	\
	unsigned char w33 __attribute__((vector_size(16)));	\
	unsigned char w34 __attribute__((vector_size(16)));	\
	unsigned char w35 __attribute__((vector_size(16)));	\
	unsigned char w36 __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_37_38()	\
	unsigned char w37 __attribute__((vector_size(16)));	\
	unsigned char w38 __attribute__((vector_size(16)));
#define	GEN_X_DEFINE_ALL()	\
	GEN_X_DEFINE_0_3()	\
	GEN_X_DEFINE_4_5()	\
	GEN_X_DEFINE_6_7()	\
	GEN_X_DEFINE_8_9()	\
	GEN_X_DEFINE_10_11()	\
	GEN_X_DEFINE_12_15()	\
	GEN_X_DEFINE_16()	\
	GEN_X_DEFINE_17()	\
	GEN_X_DEFINE_18_21()	\
	GEN_X_DEFINE_22_23()	\
	GEN_X_DEFINE_24_27()	\
	GEN_X_DEFINE_28_30()	\
	GEN_X_DEFINE_31()	\
	GEN_X_DEFINE_32()	\
	GEN_X_DEFINE_33_36()	\
	GEN_X_DEFINE_37_38()
#endif