// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * RAID6 syndrome calculations in LoongArch SIMD (LSX & LASX)
 *
 * Copyright 2023 WANG Xuerui <git@xen0n.name>
 *
 * Based on the generic RAID-6 code (int.uc):
 *
 * Copyright 2002-2004 H. Peter Anvin
 */

#include <linux/raid/pq.h>
#include "loongarch.h"

/*
 * The vector algorithms are currently priority 0, which means the generic
 * scalar algorithms are not being disabled if vector support is present.
 * This is like the similar LoongArch RAID5 XOR code, with the main reason
 * repeated here: it cannot be ruled out at this point in time that some
 * future (maybe reduced) models could run the vector algorithms slower than
 * the scalar ones, maybe for errata or micro-op reasons. It may be
 * appropriate to revisit this after one or two more uarch generations.
 */

#ifdef CONFIG_CPU_HAS_LSX
#define NSIZE 16

static int raid6_has_lsx(void)
{
	return cpu_has_lsx;
}

static void raid6_lsx_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

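	/*
	 * All vector register usage below is bracketed by kernel_fpu_begin()
	 * / kernel_fpu_end(), which save and restore the FPU/vector context
	 * and keep the section non-preemptible, so $vr0-$vr19 can be
	 * clobbered freely.
	 */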
	kernel_fpu_begin();

	/*
	 * $vr0, $vr1, $vr2, $vr3: wp
	 * $vr4, $vr5, $vr6, $vr7: wq
	 * $vr8, $vr9, $vr10, $vr11: wd
	 * $vr12, $vr13, $vr14, $vr15: w2
	 * $vr16, $vr17, $vr18, $vr19: w1
	 */
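	/*
	 * Each step of the inner loop multiplies the running Q (wq) by
	 * {02} in GF(2^8): vslli.b doubles every byte (SHLBYTE), vslti.b
	 * builds an all-ones mask from bytes whose top bit was set (MASK),
	 * and vandi.b reduces those bytes with 0x1d, the low byte of the
	 * field polynomial x^8 + x^4 + x^3 + x^2 + 1.
	 */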
	for (d = 0; d < bytes; d += NSIZE*4) {
		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
		asm volatile("vld $vr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
		asm volatile("vld $vr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
		asm volatile("vld $vr2, %0" : : "m"(dptr[z0][d+2*NSIZE]));
		asm volatile("vld $vr3, %0" : : "m"(dptr[z0][d+3*NSIZE]));
		asm volatile("vori.b $vr4, $vr0, 0");
		asm volatile("vori.b $vr5, $vr1, 0");
		asm volatile("vori.b $vr6, $vr2, 0");
		asm volatile("vori.b $vr7, $vr3, 0");
		for (z = z0-1; z >= 0; z--) {
			/* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
			asm volatile("vld $vr8, %0" : : "m"(dptr[z][d+0*NSIZE]));
			asm volatile("vld $vr9, %0" : : "m"(dptr[z][d+1*NSIZE]));
			asm volatile("vld $vr10, %0" : : "m"(dptr[z][d+2*NSIZE]));
			asm volatile("vld $vr11, %0" : : "m"(dptr[z][d+3*NSIZE]));
			/* wp$$ ^= wd$$; */
			asm volatile("vxor.v $vr0, $vr0, $vr8");
			asm volatile("vxor.v $vr1, $vr1, $vr9");
			asm volatile("vxor.v $vr2, $vr2, $vr10");
			asm volatile("vxor.v $vr3, $vr3, $vr11");
			/* w2$$ = MASK(wq$$); */
			asm volatile("vslti.b $vr12, $vr4, 0");
			asm volatile("vslti.b $vr13, $vr5, 0");
			asm volatile("vslti.b $vr14, $vr6, 0");
			asm volatile("vslti.b $vr15, $vr7, 0");
			/* w1$$ = SHLBYTE(wq$$); */
			asm volatile("vslli.b $vr16, $vr4, 1");
			asm volatile("vslli.b $vr17, $vr5, 1");
			asm volatile("vslli.b $vr18, $vr6, 1");
			asm volatile("vslli.b $vr19, $vr7, 1");
			/* w2$$ &= NBYTES(0x1d); */
			asm volatile("vandi.b $vr12, $vr12, 0x1d");
			asm volatile("vandi.b $vr13, $vr13, 0x1d");
			asm volatile("vandi.b $vr14, $vr14, 0x1d");
			asm volatile("vandi.b $vr15, $vr15, 0x1d");
			/* w1$$ ^= w2$$; */
			asm volatile("vxor.v $vr16, $vr16, $vr12");
			asm volatile("vxor.v $vr17, $vr17, $vr13");
			asm volatile("vxor.v $vr18, $vr18, $vr14");
			asm volatile("vxor.v $vr19, $vr19, $vr15");
			/* wq$$ = w1$$ ^ wd$$; */
			asm volatile("vxor.v $vr4, $vr16, $vr8");
			asm volatile("vxor.v $vr5, $vr17, $vr9");
			asm volatile("vxor.v $vr6, $vr18, $vr10");
			asm volatile("vxor.v $vr7, $vr19, $vr11");
		}
		/* *(unative_t *)&p[d+NSIZE*$$] = wp$$; */
		asm volatile("vst $vr0, %0" : "=m"(p[d+NSIZE*0]));
		asm volatile("vst $vr1, %0" : "=m"(p[d+NSIZE*1]));
		asm volatile("vst $vr2, %0" : "=m"(p[d+NSIZE*2]));
		asm volatile("vst $vr3, %0" : "=m"(p[d+NSIZE*3]));
		/* *(unative_t *)&q[d+NSIZE*$$] = wq$$; */
		asm volatile("vst $vr4, %0" : "=m"(q[d+NSIZE*0]));
		asm volatile("vst $vr5, %0" : "=m"(q[d+NSIZE*1]));
		asm volatile("vst $vr6, %0" : "=m"(q[d+NSIZE*2]));
		asm volatile("vst $vr7, %0" : "=m"(q[d+NSIZE*3]));
	}

	kernel_fpu_end();
}

static void raid6_lsx_xor_syndrome(int disks, int start, int stop,
				   size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */
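	/*
	 * Unlike the gen_syndrome() path above, this routine folds the
	 * contribution of data disks start..stop into the existing P/Q
	 * pages: disks above stop are never read or modified (right side),
	 * and for disks below start only the partial Q has to be advanced
	 * by one {02} power per skipped disk (left side), again without
	 * touching their data.
	 */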

	kernel_fpu_begin();

	/*
	 * $vr0, $vr1, $vr2, $vr3: wp
	 * $vr4, $vr5, $vr6, $vr7: wq
	 * $vr8, $vr9, $vr10, $vr11: wd
	 * $vr12, $vr13, $vr14, $vr15: w2
	 * $vr16, $vr17, $vr18, $vr19: w1
	 */
	for (d = 0; d < bytes; d += NSIZE*4) {
		/* P/Q data pages */
		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
		asm volatile("vld $vr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
		asm volatile("vld $vr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
		asm volatile("vld $vr2, %0" : : "m"(dptr[z0][d+2*NSIZE]));
		asm volatile("vld $vr3, %0" : : "m"(dptr[z0][d+3*NSIZE]));
		asm volatile("vori.b $vr4, $vr0, 0");
		asm volatile("vori.b $vr5, $vr1, 0");
		asm volatile("vori.b $vr6, $vr2, 0");
		asm volatile("vori.b $vr7, $vr3, 0");
		for (z = z0-1; z >= start; z--) {
			/* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
			asm volatile("vld $vr8, %0" : : "m"(dptr[z][d+0*NSIZE]));
			asm volatile("vld $vr9, %0" : : "m"(dptr[z][d+1*NSIZE]));
			asm volatile("vld $vr10, %0" : : "m"(dptr[z][d+2*NSIZE]));
			asm volatile("vld $vr11, %0" : : "m"(dptr[z][d+3*NSIZE]));
			/* wp$$ ^= wd$$; */
			asm volatile("vxor.v $vr0, $vr0, $vr8");
			asm volatile("vxor.v $vr1, $vr1, $vr9");
			asm volatile("vxor.v $vr2, $vr2, $vr10");
			asm volatile("vxor.v $vr3, $vr3, $vr11");
			/* w2$$ = MASK(wq$$); */
			asm volatile("vslti.b $vr12, $vr4, 0");
			asm volatile("vslti.b $vr13, $vr5, 0");
			asm volatile("vslti.b $vr14, $vr6, 0");
			asm volatile("vslti.b $vr15, $vr7, 0");
			/* w1$$ = SHLBYTE(wq$$); */
			asm volatile("vslli.b $vr16, $vr4, 1");
			asm volatile("vslli.b $vr17, $vr5, 1");
			asm volatile("vslli.b $vr18, $vr6, 1");
			asm volatile("vslli.b $vr19, $vr7, 1");
			/* w2$$ &= NBYTES(0x1d); */
			asm volatile("vandi.b $vr12, $vr12, 0x1d");
			asm volatile("vandi.b $vr13, $vr13, 0x1d");
			asm volatile("vandi.b $vr14, $vr14, 0x1d");
			asm volatile("vandi.b $vr15, $vr15, 0x1d");
			/* w1$$ ^= w2$$; */
			asm volatile("vxor.v $vr16, $vr16, $vr12");
			asm volatile("vxor.v $vr17, $vr17, $vr13");
			asm volatile("vxor.v $vr18, $vr18, $vr14");
			asm volatile("vxor.v $vr19, $vr19, $vr15");
			/* wq$$ = w1$$ ^ wd$$; */
			asm volatile("vxor.v $vr4, $vr16, $vr8");
			asm volatile("vxor.v $vr5, $vr17, $vr9");
			asm volatile("vxor.v $vr6, $vr18, $vr10");
			asm volatile("vxor.v $vr7, $vr19, $vr11");
		}

		/* P/Q left side optimization */
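		/*
		 * No data is read here: wq only picks up one more {02}
		 * factor per skipped disk, and wp stays as computed above.
		 */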
		for (z = start-1; z >= 0; z--) {
			/* w2$$ = MASK(wq$$); */
			asm volatile("vslti.b $vr12, $vr4, 0");
			asm volatile("vslti.b $vr13, $vr5, 0");
			asm volatile("vslti.b $vr14, $vr6, 0");
			asm volatile("vslti.b $vr15, $vr7, 0");
			/* w1$$ = SHLBYTE(wq$$); */
			asm volatile("vslli.b $vr16, $vr4, 1");
			asm volatile("vslli.b $vr17, $vr5, 1");
			asm volatile("vslli.b $vr18, $vr6, 1");
			asm volatile("vslli.b $vr19, $vr7, 1");
			/* w2$$ &= NBYTES(0x1d); */
			asm volatile("vandi.b $vr12, $vr12, 0x1d");
			asm volatile("vandi.b $vr13, $vr13, 0x1d");
			asm volatile("vandi.b $vr14, $vr14, 0x1d");
			asm volatile("vandi.b $vr15, $vr15, 0x1d");
			/* wq$$ = w1$$ ^ w2$$; */
			asm volatile("vxor.v $vr4, $vr16, $vr12");
			asm volatile("vxor.v $vr5, $vr17, $vr13");
			asm volatile("vxor.v $vr6, $vr18, $vr14");
			asm volatile("vxor.v $vr7, $vr19, $vr15");
		}
		/*
		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
		 */
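		/*
		 * P and Q are read-modify-written here, so the loads, XORs
		 * and stores are grouped into one asm statement with "+m"
		 * operands covering both the old and the new contents.
		 */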
		asm volatile(
			"vld $vr20, %0\n\t"
			"vld $vr21, %1\n\t"
			"vld $vr22, %2\n\t"
			"vld $vr23, %3\n\t"
			"vld $vr24, %4\n\t"
			"vld $vr25, %5\n\t"
			"vld $vr26, %6\n\t"
			"vld $vr27, %7\n\t"
			"vxor.v $vr20, $vr20, $vr0\n\t"
			"vxor.v $vr21, $vr21, $vr1\n\t"
			"vxor.v $vr22, $vr22, $vr2\n\t"
			"vxor.v $vr23, $vr23, $vr3\n\t"
			"vxor.v $vr24, $vr24, $vr4\n\t"
			"vxor.v $vr25, $vr25, $vr5\n\t"
			"vxor.v $vr26, $vr26, $vr6\n\t"
			"vxor.v $vr27, $vr27, $vr7\n\t"
			"vst $vr20, %0\n\t"
			"vst $vr21, %1\n\t"
			"vst $vr22, %2\n\t"
			"vst $vr23, %3\n\t"
			"vst $vr24, %4\n\t"
			"vst $vr25, %5\n\t"
			"vst $vr26, %6\n\t"
			"vst $vr27, %7\n\t"
			: "+m"(p[d+NSIZE*0]), "+m"(p[d+NSIZE*1]),
			  "+m"(p[d+NSIZE*2]), "+m"(p[d+NSIZE*3]),
			  "+m"(q[d+NSIZE*0]), "+m"(q[d+NSIZE*1]),
			  "+m"(q[d+NSIZE*2]), "+m"(q[d+NSIZE*3])
		);
	}

	kernel_fpu_end();
}

const struct raid6_calls raid6_lsx = {
	raid6_lsx_gen_syndrome,
	raid6_lsx_xor_syndrome,
	raid6_has_lsx,
	"lsx",
	.priority = 0 /* see the comment near the top of the file for the reason */
};

#undef NSIZE
#endif /* CONFIG_CPU_HAS_LSX */

#ifdef CONFIG_CPU_HAS_LASX
#define NSIZE 32

static int raid6_has_lasx(void)
{
	return cpu_has_lasx;
}

static void raid6_lasx_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;		/* Highest data disk */
	p = dptr[z0+1];		/* XOR parity */
	q = dptr[z0+2];		/* RS syndrome */

	kernel_fpu_begin();

	/*
	 * $xr0, $xr1: wp
	 * $xr2, $xr3: wq
	 * $xr4, $xr5: wd
	 * $xr6, $xr7: w2
	 * $xr8, $xr9: w1
	 */
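	/*
	 * Same algorithm as the LSX variant above, but with 32-byte LASX
	 * vectors only two registers per role are needed to cover the
	 * 64 bytes handled per loop iteration.
	 */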
	for (d = 0; d < bytes; d += NSIZE*2) {
		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
		asm volatile("xvld $xr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
		asm volatile("xvld $xr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
		asm volatile("xvori.b $xr2, $xr0, 0");
		asm volatile("xvori.b $xr3, $xr1, 0");
		for (z = z0-1; z >= 0; z--) {
			/* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
			asm volatile("xvld $xr4, %0" : : "m"(dptr[z][d+0*NSIZE]));
			asm volatile("xvld $xr5, %0" : : "m"(dptr[z][d+1*NSIZE]));
			/* wp$$ ^= wd$$; */
			asm volatile("xvxor.v $xr0, $xr0, $xr4");
			asm volatile("xvxor.v $xr1, $xr1, $xr5");
			/* w2$$ = MASK(wq$$); */
			asm volatile("xvslti.b $xr6, $xr2, 0");
			asm volatile("xvslti.b $xr7, $xr3, 0");
			/* w1$$ = SHLBYTE(wq$$); */
			asm volatile("xvslli.b $xr8, $xr2, 1");
			asm volatile("xvslli.b $xr9, $xr3, 1");
			/* w2$$ &= NBYTES(0x1d); */
			asm volatile("xvandi.b $xr6, $xr6, 0x1d");
			asm volatile("xvandi.b $xr7, $xr7, 0x1d");
			/* w1$$ ^= w2$$; */
			asm volatile("xvxor.v $xr8, $xr8, $xr6");
			asm volatile("xvxor.v $xr9, $xr9, $xr7");
			/* wq$$ = w1$$ ^ wd$$; */
			asm volatile("xvxor.v $xr2, $xr8, $xr4");
			asm volatile("xvxor.v $xr3, $xr9, $xr5");
		}
		/* *(unative_t *)&p[d+NSIZE*$$] = wp$$; */
		asm volatile("xvst $xr0, %0" : "=m"(p[d+NSIZE*0]));
		asm volatile("xvst $xr1, %0" : "=m"(p[d+NSIZE*1]));
		/* *(unative_t *)&q[d+NSIZE*$$] = wq$$; */
		asm volatile("xvst $xr2, %0" : "=m"(q[d+NSIZE*0]));
		asm volatile("xvst $xr3, %0" : "=m"(q[d+NSIZE*1]));
	}

	kernel_fpu_end();
}

static void raid6_lasx_xor_syndrome(int disks, int start, int stop,
				    size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = stop;		/* P/Q right side optimization */
	p = dptr[disks-2];	/* XOR parity */
	q = dptr[disks-1];	/* RS syndrome */

	kernel_fpu_begin();

	/*
	 * $xr0, $xr1: wp
	 * $xr2, $xr3: wq
	 * $xr4, $xr5: wd
	 * $xr6, $xr7: w2
	 * $xr8, $xr9: w1
	 */
	for (d = 0; d < bytes; d += NSIZE*2) {
		/* P/Q data pages */
		/* wq$$ = wp$$ = *(unative_t *)&dptr[z0][d+$$*NSIZE]; */
		asm volatile("xvld $xr0, %0" : : "m"(dptr[z0][d+0*NSIZE]));
		asm volatile("xvld $xr1, %0" : : "m"(dptr[z0][d+1*NSIZE]));
		asm volatile("xvori.b $xr2, $xr0, 0");
		asm volatile("xvori.b $xr3, $xr1, 0");
		for (z = z0-1; z >= start; z--) {
			/* wd$$ = *(unative_t *)&dptr[z][d+$$*NSIZE]; */
			asm volatile("xvld $xr4, %0" : : "m"(dptr[z][d+0*NSIZE]));
			asm volatile("xvld $xr5, %0" : : "m"(dptr[z][d+1*NSIZE]));
			/* wp$$ ^= wd$$; */
			asm volatile("xvxor.v $xr0, $xr0, $xr4");
			asm volatile("xvxor.v $xr1, $xr1, $xr5");
			/* w2$$ = MASK(wq$$); */
			asm volatile("xvslti.b $xr6, $xr2, 0");
			asm volatile("xvslti.b $xr7, $xr3, 0");
			/* w1$$ = SHLBYTE(wq$$); */
			asm volatile("xvslli.b $xr8, $xr2, 1");
			asm volatile("xvslli.b $xr9, $xr3, 1");
			/* w2$$ &= NBYTES(0x1d); */
			asm volatile("xvandi.b $xr6, $xr6, 0x1d");
			asm volatile("xvandi.b $xr7, $xr7, 0x1d");
			/* w1$$ ^= w2$$; */
			asm volatile("xvxor.v $xr8, $xr8, $xr6");
			asm volatile("xvxor.v $xr9, $xr9, $xr7");
			/* wq$$ = w1$$ ^ wd$$; */
			asm volatile("xvxor.v $xr2, $xr8, $xr4");
			asm volatile("xvxor.v $xr3, $xr9, $xr5");
		}

		/* P/Q left side optimization */
		for (z = start-1; z >= 0; z--) {
			/* w2$$ = MASK(wq$$); */
			asm volatile("xvslti.b $xr6, $xr2, 0");
			asm volatile("xvslti.b $xr7, $xr3, 0");
			/* w1$$ = SHLBYTE(wq$$); */
			asm volatile("xvslli.b $xr8, $xr2, 1");
			asm volatile("xvslli.b $xr9, $xr3, 1");
			/* w2$$ &= NBYTES(0x1d); */
			asm volatile("xvandi.b $xr6, $xr6, 0x1d");
			asm volatile("xvandi.b $xr7, $xr7, 0x1d");
			/* wq$$ = w1$$ ^ w2$$; */
			asm volatile("xvxor.v $xr2, $xr8, $xr6");
			asm volatile("xvxor.v $xr3, $xr9, $xr7");
		}
		/*
		 * *(unative_t *)&p[d+NSIZE*$$] ^= wp$$;
		 * *(unative_t *)&q[d+NSIZE*$$] ^= wq$$;
		 */
		asm volatile(
			"xvld $xr10, %0\n\t"
			"xvld $xr11, %1\n\t"
			"xvld $xr12, %2\n\t"
			"xvld $xr13, %3\n\t"
			"xvxor.v $xr10, $xr10, $xr0\n\t"
			"xvxor.v $xr11, $xr11, $xr1\n\t"
			"xvxor.v $xr12, $xr12, $xr2\n\t"
			"xvxor.v $xr13, $xr13, $xr3\n\t"
			"xvst $xr10, %0\n\t"
			"xvst $xr11, %1\n\t"
			"xvst $xr12, %2\n\t"
			"xvst $xr13, %3\n\t"
			: "+m"(p[d+NSIZE*0]), "+m"(p[d+NSIZE*1]),
			  "+m"(q[d+NSIZE*0]), "+m"(q[d+NSIZE*1])
		);
	}

	kernel_fpu_end();
}

const struct raid6_calls raid6_lasx = {
	raid6_lasx_gen_syndrome,
	raid6_lasx_xor_syndrome,
	raid6_has_lasx,
	"lasx",
	.priority = 0 /* see the comment near the top of the file for the reason */
};
#undef NSIZE
#endif /* CONFIG_CPU_HAS_LASX */