/*
 * include/asm-x86_64/xor.h
 *
 * Optimized RAID-5 checksumming functions for SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */


/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * This hasn't been optimized for the hammer yet, but there are likely
 * no advantages to be gotten from x86-64 here anyway.
 */

typedef struct { unsigned long a, b; } __attribute__((aligned(16))) xmm_store_t;

/* We don't use gcc to save the XMM registers, because there is no easy
   way to tell it to do a clts before the register saving. */
#define XMMS_SAVE do {				\
	preempt_disable();			\
	asm volatile (				\
		"movq %%cr0,%0		;\n\t"	\
		"clts			;\n\t"	\
		"movups %%xmm0,(%1)	;\n\t"	\
		"movups %%xmm1,0x10(%1)	;\n\t"	\
		"movups %%xmm2,0x20(%1)	;\n\t"	\
		"movups %%xmm3,0x30(%1)	;\n\t"	\
		: "=&r" (cr0)			\
		: "r" (xmm_save)		\
		: "memory");			\
} while (0)

#define XMMS_RESTORE do {			\
	asm volatile (				\
		"sfence			;\n\t"	\
		"movups (%1),%%xmm0	;\n\t"	\
		"movups 0x10(%1),%%xmm1	;\n\t"	\
		"movups 0x20(%1),%%xmm2	;\n\t"	\
		"movups 0x30(%1),%%xmm3	;\n\t"	\
		"movq	%0,%%cr0	;\n\t"	\
		:				\
		: "r" (cr0), "r" (xmm_save)	\
		: "memory");			\
	preempt_enable();			\
} while (0)
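/*
 * Every xor_sse_* routine below brackets its inline asm with this pair,
 * along the lines of:
 *
 *	xmm_store_t xmm_save[4];
 *	unsigned long cr0;
 *
 *	XMMS_SAVE;
 *	asm volatile ( ... SSE xor loop clobbering %xmm0-%xmm3 ... );
 *	XMMS_RESTORE;
 *
 * Both macros expect local variables named xmm_save and cr0 to be in
 * scope.  XMMS_SAVE disables preemption, stashes %cr0, clears the TS bit
 * with clts and spills %xmm0-%xmm3 into xmm_save; XMMS_RESTORE reloads
 * the registers, puts %cr0 back and re-enables preemption.
 */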

#define OFFS(x)		"16*("#x")"
#define PF_OFFS(x)	"256+16*("#x")"
#define PF0(x)		"	prefetchnta "PF_OFFS(x)"(%[p1])		;\n"
#define LD(x,y)		"       movaps   "OFFS(x)"(%[p1]), %%xmm"#y"	;\n"
#define ST(x,y)		"       movaps %%xmm"#y",   "OFFS(x)"(%[p1])	;\n"
#define PF1(x)		"	prefetchnta "PF_OFFS(x)"(%[p2])		;\n"
#define PF2(x)		"	prefetchnta "PF_OFFS(x)"(%[p3])		;\n"
#define PF3(x)		"	prefetchnta "PF_OFFS(x)"(%[p4])		;\n"
#define PF4(x)		"	prefetchnta "PF_OFFS(x)"(%[p5])		;\n"
#define PF5(x)		"	prefetchnta "PF_OFFS(x)"(%[p6])		;\n"
#define XO1(x,y)	"       xorps   "OFFS(x)"(%[p2]), %%xmm"#y"	;\n"
#define XO2(x,y)	"       xorps   "OFFS(x)"(%[p3]), %%xmm"#y"	;\n"
#define XO3(x,y)	"       xorps   "OFFS(x)"(%[p4]), %%xmm"#y"	;\n"
#define XO4(x,y)	"       xorps   "OFFS(x)"(%[p5]), %%xmm"#y"	;\n"
#define XO5(x,y)	"       xorps   "OFFS(x)"(%[p6]), %%xmm"#y"	;\n"

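/*
 * The helpers above build the unrolled loop bodies of the xor_sse_*
 * routines below: OFFS(x) addresses the x'th 16-byte chunk of the current
 * 256-byte line and PF_OFFS(x) the same chunk one line (256 bytes) ahead
 * for prefetchnta.  For example, LD(2,0) effectively emits
 * "movaps 16*(2)(%[p1]), %%xmm0" and PF1(2) emits
 * "prefetchnta 256+16*(2)(%[p2])".
 *
 * Each routine follows the same pattern: every loop iteration handles one
 * 256-byte line as four BLOCK(i) expansions, each of which moves 4 x 16
 * bytes through %xmm0-%xmm3: load from p1, xor in the other sources,
 * store back to p1, while prefetching the next line of every buffer.
 */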
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned int lines = bytes >> 8;
	unsigned long cr0;
	xmm_store_t xmm_save[4];

	XMMS_SAVE;

	asm volatile (
#undef BLOCK
#define BLOCK(i) \
		LD(i,0)					\
			LD(i+1,1)			\
		PF1(i)					\
				PF1(i+2)		\
				LD(i+2,2)		\
					LD(i+3,3)	\
		PF0(i+4)				\
				PF0(i+6)		\
		XO1(i,0)				\
			XO1(i+1,1)			\
				XO1(i+2,2)		\
					XO1(i+3,3)	\
		ST(i,0)					\
			ST(i+1,1)			\
				ST(i+2,2)		\
					ST(i+3,3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       addq %[inc], %[p1]           ;\n"
	"       addq %[inc], %[p2]           ;\n"
	"	decl %[cnt] ; jnz 1b"
	: [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
	: [inc] "r" (256UL)
	: "memory");

	XMMS_RESTORE;
}

static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3)
{
	unsigned int lines = bytes >> 8;
	xmm_store_t xmm_save[4];
	unsigned long cr0;

	XMMS_SAVE;

	asm volatile (
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i+2)		\
		LD(i,0)					\
			LD(i+1,1)			\
				LD(i+2,2)		\
					LD(i+3,3)	\
		PF2(i)					\
				PF2(i+2)		\
		PF0(i+4)				\
				PF0(i+6)		\
		XO1(i,0)				\
			XO1(i+1,1)			\
				XO1(i+2,2)		\
					XO1(i+3,3)	\
		XO2(i,0)				\
			XO2(i+1,1)			\
				XO2(i+2,2)		\
					XO2(i+3,3)	\
		ST(i,0)					\
			ST(i+1,1)			\
				ST(i+2,2)		\
					ST(i+3,3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       addq %[inc], %[p1]           ;\n"
	"       addq %[inc], %[p2]           ;\n"
	"       addq %[inc], %[p3]           ;\n"
	"	decl %[cnt] ; jnz 1b"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] "r" (256UL)
	: "memory");

	XMMS_RESTORE;
}

static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4)
{
	unsigned int lines = bytes >> 8;
	xmm_store_t xmm_save[4];
	unsigned long cr0;

	XMMS_SAVE;

	asm volatile (
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i+2)		\
		LD(i,0)					\
			LD(i+1,1)			\
				LD(i+2,2)		\
					LD(i+3,3)	\
		PF2(i)					\
				PF2(i+2)		\
		XO1(i,0)				\
			XO1(i+1,1)			\
				XO1(i+2,2)		\
					XO1(i+3,3)	\
		PF3(i)					\
				PF3(i+2)		\
		PF0(i+4)				\
				PF0(i+6)		\
		XO2(i,0)				\
			XO2(i+1,1)			\
				XO2(i+2,2)		\
					XO2(i+3,3)	\
		XO3(i,0)				\
			XO3(i+1,1)			\
				XO3(i+2,2)		\
					XO3(i+3,3)	\
		ST(i,0)					\
			ST(i+1,1)			\
				ST(i+2,2)		\
					ST(i+3,3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       addq %[inc], %[p1]           ;\n"
	"       addq %[inc], %[p2]           ;\n"
	"       addq %[inc], %[p3]           ;\n"
	"       addq %[inc], %[p4]           ;\n"
	"	decl %[cnt] ; jnz 1b"
	: [cnt] "+c" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] "r" (256UL)
	: "memory");

	XMMS_RESTORE;
}

static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned int lines = bytes >> 8;
	xmm_store_t xmm_save[4];
	unsigned long cr0;

	XMMS_SAVE;

	asm volatile (
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i+2)		\
		LD(i,0)					\
			LD(i+1,1)			\
				LD(i+2,2)		\
					LD(i+3,3)	\
		PF2(i)					\
				PF2(i+2)		\
		XO1(i,0)				\
			XO1(i+1,1)			\
				XO1(i+2,2)		\
					XO1(i+3,3)	\
		PF3(i)					\
				PF3(i+2)		\
		XO2(i,0)				\
			XO2(i+1,1)			\
				XO2(i+2,2)		\
					XO2(i+3,3)	\
		PF4(i)					\
				PF4(i+2)		\
		PF0(i+4)				\
				PF0(i+6)		\
		XO3(i,0)				\
			XO3(i+1,1)			\
				XO3(i+2,2)		\
					XO3(i+3,3)	\
		XO4(i,0)				\
			XO4(i+1,1)			\
				XO4(i+2,2)		\
					XO4(i+3,3)	\
		ST(i,0)					\
			ST(i+1,1)			\
				ST(i+2,2)		\
					ST(i+3,3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       addq %[inc], %[p1]           ;\n"
	"       addq %[inc], %[p2]           ;\n"
	"       addq %[inc], %[p3]           ;\n"
	"       addq %[inc], %[p4]           ;\n"
	"       addq %[inc], %[p5]           ;\n"
	"	decl %[cnt] ; jnz 1b"
	: [cnt] "+c" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
	  [p5] "+r" (p5)
	: [inc] "r" (256UL)
	: "memory");

	XMMS_RESTORE;
}

static struct xor_block_template xor_block_sse = {
	.name = "generic_sse",
	.do_2 = xor_sse_2,
	.do_3 = xor_sse_3,
	.do_4 = xor_sse_4,
	.do_5 = xor_sse_5,
};
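/*
 * Each do_N hook above computes the byte-wise XOR of its N buffers into
 * the first one, i.e. p1[i] ^= p2[i] ^ ... ^ pN[i].  The unrolled loops
 * only consume whole 256-byte lines (bytes >> 8 iterations), so callers
 * are expected to pass a length that is a multiple of 256.
 */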

#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES			\
	do {					\
		xor_speed(&xor_block_sse);	\
	} while (0)

/* We force the use of the SSE xor block because it can write around L2.
   We may also be able to load into the L1 only depending on how the cpu
   deals with a load to a line that is being prefetched.  */
#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
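/*
 * Rough sketch of how the generic xor calibration code is expected to
 * consume these macros (the real call site lives outside this file;
 * shown here only for illustration):
 *
 *	XOR_TRY_TEMPLATES;                       // benchmark via xor_speed()
 *	fastest = XOR_SELECT_TEMPLATE(fastest);  // here: always &xor_block_sse
 */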