/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _ASM_X86_XOR_AVX_H
#define _ASM_X86_XOR_AVX_H

/*
 * Optimized RAID-5 checksumming functions for AVX
 *
 * Copyright (C) 2012 Intel Corporation
 * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
 *
 * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
 */

#include <linux/compiler.h>
#include <asm/fpu/api.h>

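/*
 * BLOCK(offset, reg) is (re)defined inside each xor_avx_*() routine below;
 * it XORs one 32-byte chunk at byte offset 'offset' using register
 * %ymm<reg>.  BLOCK4() unrolls four such chunks (cycling through
 * %ymm0-%ymm3) and BLOCK16() unrolls sixteen, i.e. one 512-byte chunk per
 * expansion.
 */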
#define BLOCK4(i) \
		BLOCK(32 * i, 0) \
		BLOCK(32 * (i + 1), 1) \
		BLOCK(32 * (i + 2), 2) \
		BLOCK(32 * (i + 3), 3)

#define BLOCK16() \
		BLOCK4(0) \
		BLOCK4(4) \
		BLOCK4(8) \
		BLOCK4(12)

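/*
 * Each routine below XORs its source buffers into p0, one 512-byte chunk
 * per loop iteration: 'lines = bytes >> 9' is the chunk count, and any
 * remainder of 'bytes' below 512 is not processed.  The buffers must be
 * 32-byte aligned since vmovdqa is used, and all YMM usage is bracketed by
 * kernel_fpu_begin()/kernel_fpu_end().
 */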
static void xor_avx_2(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
	}

	kernel_fpu_end();
}

static void xor_avx_3(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1,
		      const unsigned long * __restrict p2)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
	}

	kernel_fpu_end();
}

static void xor_avx_4(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1,
		      const unsigned long * __restrict p2,
		      const unsigned long * __restrict p3)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16();

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
	}

	kernel_fpu_end();
}

static void xor_avx_5(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1,
		      const unsigned long * __restrict p2,
		      const unsigned long * __restrict p3,
		      const unsigned long * __restrict p4)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
		p4 = (unsigned long *)((uintptr_t)p4 + 512);
	}

	kernel_fpu_end();
}

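/*
 * Template describing the AVX routines to the generic xor code, which
 * benchmarks the registered candidates via xor_speed() and selects the
 * fastest one.
 */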
static struct xor_block_template xor_block_avx = {
	.name = "avx",
	.do_2 = xor_avx_2,
	.do_3 = xor_avx_3,
	.do_4 = xor_avx_4,
	.do_5 = xor_avx_5,
};

#define AVX_XOR_SPEED \
do { \
	if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE)) \
		xor_speed(&xor_block_avx); \
} while (0)

#define AVX_SELECT(FASTEST) \
	(boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE) ? &xor_block_avx : FASTEST)
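
/*
 * Rough sketch of how these macros are expected to be consumed by the
 * per-arch <asm/xor.h> template selection; the names and the second
 * template are illustrative, not verbatim:
 *
 *	#undef XOR_TRY_TEMPLATES
 *	#define XOR_TRY_TEMPLATES \
 *	do { \
 *		AVX_XOR_SPEED; \
 *		xor_speed(&xor_block_sse); \
 *	} while (0)
 *
 *	#define XOR_SELECT_TEMPLATE(FASTEST) \
 *		AVX_SELECT(FASTEST)
 *
 * i.e. the AVX template is only benchmarked/selected when the CPU reports
 * AVX and the OS has enabled extended (YMM) state handling (OSXSAVE).
 */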

#endif