#if defined __x86_64__

	.text
	.align 4,0x90
.globl _crc32_vec
_crc32_vec:

	// input :
	//		 crc : edi
	//		 buf : rsi
	//		 len : rdx

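	/*
		Assumed C prototype (not stated in this file):

			uint32_t crc32_vec(uint32_t crc, const void *buf, size_t len);

		The result is returned in %eax.  The code reads whole 16-byte
		vectors, so the caller presumably guarantees len >= 16 and a
		multiple of 16; any remaining tail bytes would have to be handled
		by the caller.
	*/
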
	// symbolic names for the x86_64 registers used below

	#define	crc		%edi
	#define	buf		%rsi
	#define	len		%rdx
	#define	tab		%rcx

	#define	v0		%xmm0
	#define	v1		%xmm1
	#define	v2		%xmm2
	#define	v3		%xmm3
	#define	v4		%xmm4
	#define	v5		%xmm5

	// push rbp, sp should now be 16-byte aligned
	pushq	%rbp
	movq	%rsp, %rbp

#ifdef	KERNEL
	/*
		allocate 6*16 = 96 bytes of stack space and save %xmm0-%xmm5
	*/
	subq	$96, %rsp
	movaps	v0, -16(%rbp)
	movaps	v1, -32(%rbp)
	movaps	v2, -48(%rbp)
	movaps	v3, -64(%rbp)
	movaps	v4, -80(%rbp)
	movaps	v5, -96(%rbp)
#endif

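	/*
		(The save/restore under KERNEL is presumably needed because kernel
		code cannot assume the FP/SIMD state is free to clobber, so any %xmm
		registers used by this routine must be preserved around it.)
	*/
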
	/*
		set up the table pointer and use the 16-byte constants directly as
		memory operands of pclmulqdq; loading them into %xmm7 via movaps and
		using %xmm7 instead gave about the same performance
	*/
	leaq	L_coefficients(%rip), tab
	#define	K12		(tab)
	#define	K34		16(tab)
	#define	K56		32(tab)
	#define	uPx		48(tab)
	#define	L_shufb	64(tab)

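	/*
		L_coefficients (defined at the bottom of this file) holds, at the
		offsets named above:
			K12 : the constant pair (K2, K1) used by the fold-by-4 loop
			K34 : the constant pair (K4, K3) used by FOLD1
			K56 : the constant pair (K6, K5) used by the 128-bit to 64-bit
			      reduction
			uPx : the Barrett-reduction pair (u, P(x)), P(x) = 0x104C11DB7
		The derivations assumed for these constants are noted at the point
		where each pair is used.
	*/
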
	/* load the initial crc and xor with the 1st 16-byte vector */
	movd	crc, v0
	movdqu	(buf), v1
	pslldq	$12, v0			// shift up to the most significant word in v0
	pshufb	L_shufb, v1
	pxor	v1, v0

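	/*
		Note: the pshufb mask L_shufb (the last two quads of L_coefficients)
		reverses the 16 bytes, so the byte that comes first in memory lands
		in the most significant lane of the vector.  The routine therefore
		works in the MSB-first (non-reflected) convention of the CRC-32
		polynomial 0x104C11DB7, and the initial crc is shifted into the top
		32 bits so it lines up with the most significant word of the first
		block.
	*/
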
	/* if this was the only vector, we already have the final 128-bit vector */
	add		$16, buf
	sub		$16, len
	jle		L_128bits

	/* make sure there are at least 3 more vectors */
	cmp		$48, len
	jl		L_no_more_4_vectors

	/* read the next 3 vectors */
	movdqu	(buf), v1
	movdqu	16(buf), v2
	movdqu	32(buf), v3
	pshufb		L_shufb, v1
	pshufb		L_shufb, v2
	pshufb		L_shufb, v3

	add		$48, buf

	/* subtract the 48 bytes just read, and pre-decrement len by another 64
	   to check whether there are at least 4 more vectors */
	sub		$48+64, len
	jl		L_foldv13

	/*	-------------------------------------------------
		the main loop, folding 4 vectors per iteration
		-------------------------------------------------
	*/
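	/*
		Each 128-bit accumulator v0-v3 is folded onto the data 512 bits
		(4 vectors) further along.  Writing an accumulator as
		A(x) = H(x)*x^64 + L(x) with 64-bit halves H and L:

			A(x)*x^512 mod P(x) = H(x)*{x^576 mod P(x)} xor L(x)*{x^512 mod P(x)}

		so one pclmulqdq per half plus an xor with the incoming data vector
		folds 512 bits per accumulator per iteration.  Presumably
		K1 = x^576 mod P(x) (high halves, imm $0x11) and
		K2 = x^512 mod P(x) (low halves, imm $0x00); the exponents are not
		stated in this file, so treat them as inferred from the standard
		folding derivation.
	*/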
L_FOLD_BY_4:

	// fold v0 and v1 by 512 bits: high halves times K1, low halves times K2
	movdqa		v0, v4
	movdqa		v1, v5
	pclmulqdq	$0x11, K12, v0
	pclmulqdq	$0x11, K12, v1
	pclmulqdq	$0x00, K12, v4
	pclmulqdq	$0x00, K12, v5
	pxor		v4, v0
	pxor		v5, v1
	// xor in the next two data vectors (byte-reversed)
	movdqu		0(buf), v4
	movdqu		16(buf), v5
	pshufb		L_shufb, v4
	pshufb		L_shufb, v5
	pxor		v4, v0
	pxor		v5, v1
	// same for v2 and v3 with the following two data vectors
	movdqa		v2, v4
	movdqa		v3, v5
	pclmulqdq	$0x11, K12, v2
	pclmulqdq	$0x11, K12, v3
	pclmulqdq	$0x00, K12, v4
	pclmulqdq	$0x00, K12, v5
	pxor		v4, v2
	pxor		v5, v3
	movdqu		32(buf), v4
	movdqu		48(buf), v5
	pshufb		L_shufb, v4
	pshufb		L_shufb, v5
	pxor		v4, v2
	pxor		v5, v3

	add			$64, buf
	sub			$64, len
	ja			L_FOLD_BY_4


	/*
		now sequentially fold v1, v2, v3 into v0
	*/
L_foldv13:

	.macro	FOLD1
	movdqa		v0, v4				// a copy of v0 = H(x)*x^64 + L(x)
	pclmulqdq	$$0x11, K34, v0		// v0 = H(x) * {x^[128+64] mod P(x)}
	pclmulqdq	$$0x00, K34, v4		// v4 = L(x) * {x^128 mod P(x)}
	pxor		v4, v0				// combine the two partial products
	pxor		$0, v0				// xor in the new vector ($0 = v1/v2/v3 or the next data block)
	.endm
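
	/*
		Note: in the Darwin assembler's .macro syntax, $0 refers to the
		macro's first argument (the vector passed to FOLD1) and $$ escapes
		a literal $, so $$0x11 assembles as the immediate $0x11.
	*/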

	/* FOLD1 of v1-v3 into v0 */
	FOLD1	v1
	FOLD1	v2
	FOLD1	v3

	/* undo the 64-byte pre-decrement of len */
	add		$64, len

L_no_more_4_vectors:

	/* pre-decrement len by 16 to detect whether there is still a vector to process */
	sub			$16, len
	jl			L_128bits
L_FOLD_BY_1:
	movdqu		(buf), v5
	pshufb		L_shufb, v5
	FOLD1		v5					/* fold the new vector into v0 */
	add			$16, buf
	sub			$16, len
	jae			L_FOLD_BY_1			/* until there are no more vectors */

L_128bits:		/* we've arrived at the final 128-bit vector */

	/* reduction from 128 bits to 64 bits */
	movdqa		v0, v1
	pclmulqdq	$0x11, K56, v0		// v0 = H(x) * K5, a 96-bit product
	pslldq		$8, v1				// v1 = L(x) moved to the upper 64 bits
	psrldq		$4, v1				// v1 = L(x) * x^32
	pxor		v1, v0
	movdqa		v0, v1
	pclmulqdq	$0x01, K56, v1		// v1 = (top 32 bits of v0) * K6
	pxor		v1, v0

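	/*
		v0 now holds a 64-bit value; this is the R(x) referred to in the
		Barrett reduction below.  The two steps above fold the high 64 bits
		and then the high 32 bits down, presumably with K5 = x^96 mod P(x)
		and K6 = x^64 mod P(x); those exponents are not stated in this file.
	*/
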
	/*
		barrett reduction:

			T1 = floor(R(x)/x^32) * [1/P(x)];	R/P
			T2 = floor(T1/x^32) * P(x);			int(R/P)*P;
			CRC = (R+int(R/P)*P) mod x^32;		R-int(R/P)*P

	*/
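	/*
		Here u is taken to be floor(x^64 / P(x)) (the 33-bit constant ux in
		the table below): the first pclmulqdq estimates the quotient
		floor(R/P), the second multiplies that estimate by P(x), and xoring
		the product back into R leaves R mod P(x) in the low 32 bits.  The
		identification of ux with floor(x^64/P(x)) is inferred from the
		[1/P(x)] factor above rather than stated explicitly in this file.
	*/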
	movq		v0, v1
	psrldq		$4, v1				// R/x^32
	pclmulqdq	$0x00, uPx, v1		// T1 = floor(R/x^32)*u  (low quad of uPx = ux)
	psrldq		$4, v1				// T1/x^32
	pclmulqdq	$0x10, uPx, v1		// T2 = floor(T1/x^32)*P (high quad of uPx = Px)
	pxor		v1, v0
	movd		v0, %eax			// CRC = low 32 bits of R xor T2


#ifdef	KERNEL
	// restore %xmm0-%xmm5, and deallocate the 96 bytes of stack space
	movaps	-16(%rbp), v0
	movaps	-32(%rbp), v1
	movaps	-48(%rbp), v2
	movaps	-64(%rbp), v3
	movaps	-80(%rbp), v4
	movaps	-96(%rbp), v5
	addq	$96, %rsp
#endif

	leave
	ret

	.const
	.align	4
L_coefficients:		// used for vectorizing crc32 computation using pclmulqdq

#define	K1	0x8833794C
#define	K2	0xE6228B11
#define	K3	0xC5B9CD4C
#define	K4	0xE8A45605
#define	K5	0xF200AA66
#define	K6	0x490D678D
#define	ux	0x104D101DF
#define	Px	0x104C11DB7

	.quad	K2					// K12, low quad
	.quad	K1					// K12, high quad
	.quad	K4					// K34, low quad
	.quad	K3					// K34, high quad
	.quad	K6					// K56, low quad
	.quad	K5					// K56, high quad
	.quad	ux					// uPx, low quad  : Barrett constant u
	.quad	Px					// uPx, high quad : the CRC-32 polynomial
	.quad	0x08090a0b0c0d0e0f	// L_shufb : byte-reversal mask for pshufb
	.quad	0x0001020304050607


#endif // defined __x86_64__