#ifndef _ASM_X86_XOR_64_H
#define _ASM_X86_XOR_64_H

/*
 * Optimized RAID-5 checksumming functions for MMX and SSE.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * You should have received a copy of the GNU General Public License
 * (for example /usr/src/linux/COPYING); if not, write to the Free
 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */


/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * This hasn't been optimized for the hammer yet, but there are likely
 * no advantages to be gotten from x86-64 here anyways.
 */

typedef struct {
	unsigned long a, b;
} __attribute__((aligned(16))) xmm_store_t;
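
/*
 * Note (added for clarity): each xmm_store_t is one 16-byte, 16-byte-aligned
 * slot; the xmm_save[4] arrays declared in the functions below provide the
 * scratch space for the four XMM registers spilled by XMMS_SAVE.
 */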

/* Doesn't use gcc to save the XMM registers, because there is no easy way to
   tell it to do a clts before the register saving. */
#define XMMS_SAVE				\
do {						\
	preempt_disable();			\
	asm volatile(				\
		"movq %%cr0,%0		;\n\t"	\
		"clts			;\n\t"	\
		"movups %%xmm0,(%1)	;\n\t"	\
		"movups %%xmm1,0x10(%1)	;\n\t"	\
		"movups %%xmm2,0x20(%1)	;\n\t"	\
		"movups %%xmm3,0x30(%1)	;\n\t"	\
		: "=&r" (cr0)			\
		: "r" (xmm_save)		\
		: "memory");			\
} while (0)

#define XMMS_RESTORE				\
do {						\
	asm volatile(				\
		"sfence			;\n\t"	\
		"movups (%1),%%xmm0	;\n\t"	\
		"movups 0x10(%1),%%xmm1	;\n\t"	\
		"movups 0x20(%1),%%xmm2	;\n\t"	\
		"movups 0x30(%1),%%xmm3	;\n\t"	\
		"movq	%0,%%cr0	;\n\t"	\
		:				\
		: "r" (cr0), "r" (xmm_save)	\
		: "memory");			\
	preempt_enable();			\
} while (0)
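
/*
 * Note (added for clarity): "clts" clears CR0.TS so the SSE instructions
 * below do not raise a device-not-available fault; XMMS_RESTORE reloads the
 * caller's xmm0-xmm3 from xmm_save and then writes the saved CR0 value back.
 * Preemption stays disabled in between so the saved state cannot be clobbered
 * by a context switch.
 */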

#define OFFS(x)		"16*("#x")"
#define PF_OFFS(x)	"256+16*("#x")"
#define PF0(x)		"	prefetchnta "PF_OFFS(x)"(%[p1])		;\n"
#define LD(x, y)	"       movaps   "OFFS(x)"(%[p1]), %%xmm"#y"	;\n"
#define ST(x, y)	"       movaps %%xmm"#y",   "OFFS(x)"(%[p1])	;\n"
#define PF1(x)		"	prefetchnta "PF_OFFS(x)"(%[p2])		;\n"
#define PF2(x)		"	prefetchnta "PF_OFFS(x)"(%[p3])		;\n"
#define PF3(x)		"	prefetchnta "PF_OFFS(x)"(%[p4])		;\n"
#define PF4(x)		"	prefetchnta "PF_OFFS(x)"(%[p5])		;\n"
#define PF5(x)		"	prefetchnta "PF_OFFS(x)"(%[p6])		;\n"
#define XO1(x, y)	"       xorps   "OFFS(x)"(%[p2]), %%xmm"#y"	;\n"
#define XO2(x, y)	"       xorps   "OFFS(x)"(%[p3]), %%xmm"#y"	;\n"
#define XO3(x, y)	"       xorps   "OFFS(x)"(%[p4]), %%xmm"#y"	;\n"
#define XO4(x, y)	"       xorps   "OFFS(x)"(%[p5]), %%xmm"#y"	;\n"
#define XO5(x, y)	"       xorps   "OFFS(x)"(%[p6]), %%xmm"#y"	;\n"

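/*
 * For illustration only: with the definitions above, LD(0, 0) expands to the
 * string "movaps 16*(0)(%[p1]), %%xmm0", i.e. one 16-byte aligned load from
 * the first source buffer, and PF0(4) expands to
 * "prefetchnta 256+16*(4)(%[p1])", a non-temporal prefetch one 256-byte loop
 * iteration ahead of the data currently being processed.
 */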

static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
	unsigned int lines = bytes >> 8;
	unsigned long cr0;
	xmm_store_t xmm_save[4];

	XMMS_SAVE;

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		LD(i, 0)				\
			LD(i + 1, 1)			\
		PF1(i)					\
				PF1(i + 2)		\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       addq %[inc], %[p1]           ;\n"
	"       addq %[inc], %[p2]           ;\n"
	"	decl %[cnt] ; jnz 1b"
	: [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
	: [inc] "r" (256UL)
	: "memory");

	XMMS_RESTORE;
}
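
/*
 * A minimal reference sketch (not part of the original header, kept under
 * "#if 0" so it is never compiled): each xor_sse_N() above is equivalent to
 * a plain word-wise XOR of the extra source buffers into p1, assuming
 * 'bytes' is a multiple of the 256-byte unrolled chunk the SSE loop consumes
 * per iteration.
 */
#if 0
static void xor_ref_2(unsigned long bytes, unsigned long *p1,
		      unsigned long *p2)
{
	unsigned long i;

	/* Same result as xor_sse_2(), one machine word at a time. */
	for (i = 0; i < bytes / sizeof(unsigned long); i++)
		p1[i] ^= p2[i];
}
#endif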

static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3)
{
	unsigned int lines = bytes >> 8;
	xmm_store_t xmm_save[4];
	unsigned long cr0;

	XMMS_SAVE;

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i + 2)		\
		LD(i, 0)				\
			LD(i + 1, 1)			\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF2(i)					\
				PF2(i + 2)		\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		XO2(i, 0)				\
			XO2(i + 1, 1)			\
				XO2(i + 2, 2)		\
					XO2(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       addq %[inc], %[p1]           ;\n"
	"       addq %[inc], %[p2]           ;\n"
	"       addq %[inc], %[p3]           ;\n"
	"	decl %[cnt] ; jnz 1b"
	: [cnt] "+r" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
	: [inc] "r" (256UL)
	: "memory");
	XMMS_RESTORE;
}

static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4)
{
	unsigned int lines = bytes >> 8;
	xmm_store_t xmm_save[4];
	unsigned long cr0;

	XMMS_SAVE;

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i + 2)		\
		LD(i, 0)				\
			LD(i + 1, 1)			\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF2(i)					\
				PF2(i + 2)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		PF3(i)					\
				PF3(i + 2)		\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO2(i, 0)				\
			XO2(i + 1, 1)			\
				XO2(i + 2, 2)		\
					XO2(i + 3, 3)	\
		XO3(i, 0)				\
			XO3(i + 1, 1)			\
				XO3(i + 2, 2)		\
					XO3(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       addq %[inc], %[p1]           ;\n"
	"       addq %[inc], %[p2]           ;\n"
	"       addq %[inc], %[p3]           ;\n"
	"       addq %[inc], %[p4]           ;\n"
	"	decl %[cnt] ; jnz 1b"
	: [cnt] "+c" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
	: [inc] "r" (256UL)
	: "memory");

	XMMS_RESTORE;
}

static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
	  unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
	unsigned int lines = bytes >> 8;
	xmm_store_t xmm_save[4];
	unsigned long cr0;

	XMMS_SAVE;

	asm volatile(
#undef BLOCK
#define BLOCK(i) \
		PF1(i)					\
				PF1(i + 2)		\
		LD(i, 0)				\
			LD(i + 1, 1)			\
				LD(i + 2, 2)		\
					LD(i + 3, 3)	\
		PF2(i)					\
				PF2(i + 2)		\
		XO1(i, 0)				\
			XO1(i + 1, 1)			\
				XO1(i + 2, 2)		\
					XO1(i + 3, 3)	\
		PF3(i)					\
				PF3(i + 2)		\
		XO2(i, 0)				\
			XO2(i + 1, 1)			\
				XO2(i + 2, 2)		\
					XO2(i + 3, 3)	\
		PF4(i)					\
				PF4(i + 2)		\
		PF0(i + 4)				\
				PF0(i + 6)		\
		XO3(i, 0)				\
			XO3(i + 1, 1)			\
				XO3(i + 2, 2)		\
					XO3(i + 3, 3)	\
		XO4(i, 0)				\
			XO4(i + 1, 1)			\
				XO4(i + 2, 2)		\
					XO4(i + 3, 3)	\
		ST(i, 0)				\
			ST(i + 1, 1)			\
				ST(i + 2, 2)		\
					ST(i + 3, 3)	\


		PF0(0)
				PF0(2)

	" .align 32			;\n"
	" 1:                            ;\n"

		BLOCK(0)
		BLOCK(4)
		BLOCK(8)
		BLOCK(12)

	"       addq %[inc], %[p1]           ;\n"
	"       addq %[inc], %[p2]           ;\n"
	"       addq %[inc], %[p3]           ;\n"
	"       addq %[inc], %[p4]           ;\n"
	"       addq %[inc], %[p5]           ;\n"
	"	decl %[cnt] ; jnz 1b"
	: [cnt] "+c" (lines),
	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
	  [p5] "+r" (p5)
	: [inc] "r" (256UL)
	: "memory");

	XMMS_RESTORE;
}

static struct xor_block_template xor_block_sse = {
	.name = "generic_sse",
	.do_2 = xor_sse_2,
	.do_3 = xor_sse_3,
	.do_4 = xor_sse_4,
	.do_5 = xor_sse_5,
};

#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES			\
do {						\
	xor_speed(&xor_block_sse);		\
} while (0)
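
/*
 * Background note (an assumption about the surrounding kernel, not from this
 * file): XOR_TRY_TEMPLATES is expected to be invoked by the generic xor
 * benchmarking code (calibrate_xor_blocks() in crypto/xor.c in this kernel
 * series), with xor_speed() recording the measured throughput of the
 * template.  XOR_SELECT_TEMPLATE below then overrides the benchmark result
 * and always picks the SSE implementation.
 */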

/* We force the use of the SSE xor block because it can write around L2.
   We may also be able to load into the L1 only depending on how the cpu
   deals with a load to a line that is being prefetched.  */
#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)

#endif /* _ASM_X86_XOR_64_H */