/* Apple Copyright 2009
   CoreOS - vector & Numerics, cclee 10-22-09

	The following source code implements a vectorized version of the adler32 computation that is defined in zlib.
	The target architectures are x86_64 and i386.

	Given two unsigned 32-bit integers adler and sum2 (both pre-modulo by BASE=65521) and a sequence of input
	bytes x[0],...,x[N-1], the adler-sum2 pair is updated according to

		for (i=0;i<N;i++) {
			adler = (adler+x[i])%BASE;
			sum2 = (sum2+adler)%BASE;
		}

	To reduce the number of modulo operations, it can be shown that, if the initial adler and sum2 are less than
	BASE (=65521), adler and sum2 (in 32-bit representation) will never overflow for the next NMAX=5552 bytes.
	This simplifies the algorithm to

		for (i=0;i<N;i+=NMAX) {
			for (k=0;k<NMAX;k++) {
				adler+=x[i+k];
				sum2+=adler;
			}
			adler%=BASE;
			sum2%=BASE;
		}

	The hand optimization of this function is now reduced to

		for (k=0;k<NMAX;k++) {
			adler+=x[k];
			sum2+=adler;
		}

	This subtask turns out to be very vectorizable. Suppose we perform the adler/sum2 update once per K bytes,

		for (k=0;k<K;k++) {
			adler+=x[k];
			sum2+=adler;
		}

	It can be shown that the sum2-adler pair can be updated according to

		sum2 += adler*K;
		adler += (x[0] + x[1] + ... + x[K-1]);
		sum2 += (x[0]*K + x[1]*(K-1) + ... + x[K-1]*1);

	The last 2 equations show that the adler-sum2 pair update can be sped up using a vector processor.
	The input vector is [ x[0] x[1] ... x[K-1] ], and we need two coefficient vectors:
		[ 1 1 1 ... 1 ] for the adler update;
		[ K K-1 ... 1 ] for the sum2 update.
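
	To see why, note that x[k] enters adler at step k and is then counted in sum2 at steps k..K-1, i.e.
	(K-k) times in total. The following scalar C sketch (illustrative only, not part of this source)
	checks the blocked update against the byte-by-byte loop:

		#include <stdint.h>

		// byte-by-byte reference update for one K-byte block
		static void bytewise(uint32_t *adler, uint32_t *sum2, const uint8_t *x, int K) {
			for (int k = 0; k < K; k++) { *adler += x[k]; *sum2 += *adler; }
		}

		// blocked update from the 3 equations above (note: the first line uses the OLD adler)
		static void blocked(uint32_t *adler, uint32_t *sum2, const uint8_t *x, int K) {
			uint32_t bsum = 0, wsum = 0;
			for (int k = 0; k < K; k++) {
				bsum += x[k];                      // x[0] + x[1] + ... + x[K-1]
				wsum += (uint32_t)(K - k) * x[k];  // x[0]*K + x[1]*(K-1) + ... + x[K-1]*1
			}
			*sum2  += *adler * (uint32_t)K;
			*adler += bsum;
			*sum2  += wsum;
		}

		// both yield identical adler/sum2 for any block short enough to avoid 32-bit overflow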

	The implementation below reads vectors (K=16,32,48,64) into xmm registers and sets up coefficient vectors
	in xmm registers. It then uses SSE instructions to perform the aforementioned vector computation.

	For i386, NMAX/16 = 347. Whenever possible (for each NMAX-byte block), it invokes the macro DO32 (K=32)
	173 times, followed by a single DO16 (K=16) (173*32 + 16 = 5552 = NMAX), before calling a modulo
	operation for adler and sum2.

	For x86_64 (where more xmm registers are available), floor(NMAX/64) = 86. Whenever possible (for each
	NMAX-byte block), it invokes the macro DO64 (K=64) 86 times, followed by a single DO48 (K=48)
	(86*64 + 48 = 5552 = NMAX), before calling a modulo operation for adler and sum2.

*/

/* added cpu_capabilities probing to detect kHasSupplementalSSE3, to branch into code with or without SupplementalSSE3

	Previously, the ssse3 code was intentionally turned off, because Yonah does not support ssse3.
	Code is added here to probe cpu_capabilities for ssse3 support:
		if ssse3 is supported, branch to the ssse3-based code, otherwise use the original code.

	cclee 5-3-10
*/

#define BASE 65521  /* largest prime smaller than 65536 */
#define NMAX 5552 	/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
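/* Checking that bound: for n = 5552, 255n(n+1)/2 + (n+1)(BASE-1)
   = 255*15415128 + 5553*65520 = 3930857640 + 363832560 = 4294690200 <= 4294967295 = 2^32-1,
   while n = 5553 gives 4296171735 > 2^32-1, so NMAX = 5552 is indeed the largest such n. */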

// uLong	adler32_vec(unsigned int adler, unsigned int sum2, const Bytef *buf, int len) {
//    unsigned n;
//    while (len >= NMAX) {
//        len -= NMAX;
//        n = NMAX / 16;          /* NMAX is divisible by 16 */
//        do {
//            DO16(buf);          /* 16 sums unrolled */
//            buf += 16;
//        } while (--n);
//        MOD(adler);
//        MOD(sum2);
//    }
//    if (len) {                  /* avoid modulos if none remaining */
//        while (len >= 16) {
//            len -= 16;
//            DO16(buf);
//            buf += 16;
//        }
//        while (len--) {
//            adler += *buf++;
//            sum2 += adler;
//        }
//        MOD(adler);
//        MOD(sum2);
//    }
//    return adler | (sum2 << 16);
// }
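
// For reference, the DO16 and MOD macros used by the scalar C code above are roughly as in zlib's
// adler32.c (reproduced from memory; see zlib for the authoritative definitions):
//
//    #define DO1(buf,i)  {adler += (buf)[i]; sum2 += adler;}
//    #define DO2(buf,i)  DO1(buf,i); DO1(buf,i+1);
//    #define DO4(buf,i)  DO2(buf,i); DO2(buf,i+2);
//    #define DO8(buf,i)  DO4(buf,i); DO4(buf,i+4);
//    #define DO16(buf)   DO8(buf,0); DO8(buf,8);
//    #define MOD(a)      a %= BASE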

#if (defined __i386__ || defined __x86_64__)

#include <i386/cpu_capabilities.h>

	.text
	.align 4,0x90
.globl _adler32_vec
_adler32_vec:

#if (defined __i386__)

	pushl	%ebp
	movl	%esp, %ebp

	pushl	%ebx
	pushl	%edi
	pushl	%esi

#ifdef	KERNEL 						// if this is for kernel, need to save xmm registers
	subl	$140, %esp				// to save %xmm0-%xmm7 into stack, extra 12 to align %esp to 16-byte boundary
	movaps	%xmm0, 0(%esp)			// save xmm0, offset -12 for ebx/edi/esi
	movaps	%xmm1, 16(%esp)			// save xmm1
	movaps	%xmm2, 32(%esp)			// save xmm2
	movaps	%xmm3, 48(%esp)			// save xmm3
	movaps	%xmm4, 64(%esp)			// save xmm4
	movaps	%xmm5, 80(%esp)			// save xmm5
	movaps	%xmm6, 96(%esp)			// save xmm6
	movaps	%xmm7, 112(%esp)		// save xmm7, if this is for SSSE3 or above
#endif

	#define	adler	%edi				// 8(%ebp)
	#define	sum2	%esi				// 12(%ebp)
	#define	buf		%ecx				// 16(%ebp)
	#define	len		%ebx				// 20(%ebp)
	#define	zero	%xmm0
	#define ones	%xmm5

	movl	8(%ebp), adler
	movl	12(%ebp), sum2
	movl	16(%ebp), buf			// use ecx as buf pointer
	movl	20(%ebp), len

	.macro		modulo_BASE
	movl		$$-2146992015, %eax		// unsigned 2147975281 = ceil(2^47/BASE), i.e. 1/BASE in Q47
	mull		adler					// edx:eax = adler * (2^47/BASE)
	shrl		$$15, %edx				// edx = floor(adler/BASE)
	imull		$$BASE, %edx, %edx		// edx * BASE
	subl		%edx, adler				// adler -= edx*BASE
	movl		$$-2146992015, %eax		// unsigned 2147975281 = ceil(2^47/BASE), i.e. 1/BASE in Q47
	mull		sum2					// edx:eax = sum2 * (2^47/BASE)
	shrl		$$15, %edx				// edx = floor(sum2/BASE)
	imull		$$BASE, %edx, %eax		// eax = edx * BASE
	subl		%eax, sum2				// sum2 -= edx*BASE
	.endmacro
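
	// The macro above avoids a hardware divide via a fixed-point reciprocal: -2146992015, read as the
	// unsigned value 2147975281 = ceil(2^47/65521), satisfies (x * 2147975281) >> 47 == floor(x/65521)
	// for any 32-bit x. A C sketch of the same trick (a hypothetical helper, not part of this file):
	//
	//    static uint32_t mod_base(uint32_t x) {
	//        uint32_t q = (uint32_t)(((uint64_t)x * 2147975281u) >> 47);  // q = floor(x/65521)
	//        return x - q * 65521u;                                       // x mod 65521
	//    }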

	// update adler/sum2 according to a new 16-byte vector
	.macro		DO16
	movaps		(buf), %xmm1			// 16 bytes vector, in xmm1
	movaps		%xmm1, %xmm3			// a copy of the vector, used for unsigned byte in the destination of pmaddubsw
	addl		$$16, buf				// buf -> next vector
	psadbw		zero, %xmm1				// 2 16-bit words to be added for adler in xmm1
	pmaddubsw	%xmm4, %xmm3			// 8 16-bit words to be added for sum2 in xmm3
	imull		$$16, adler, %edx		// edx = 16*adler;
	movhlps		%xmm1, %xmm2			// higher 16-bit word (for adler) in xmm2
	pmaddwd		ones, %xmm3				// 4 32-bit elements to be added for sum2 in xmm3
	paddq		%xmm2, %xmm1			// xmm1 lower 32-bit to be added to adler
	addl		%edx, sum2				// sum2 += adler*16;
	movhlps		%xmm3, %xmm2			// 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
	movd		%xmm1, %edx				// to be added to adler
	paddd		%xmm2, %xmm3			// 2 32-bits elements in xmm3 to be added to sum2
	addl		%edx, adler				// update adler
	movd		%xmm3, %edx				// to be added to sum2
	psrlq		$$32, %xmm3				// another 32-bit to be added to sum2
	addl		%edx, sum2				// sum2 += 1st half of update
	movd		%xmm3, %edx				// to be added to sum2
	addl		%edx, sum2				// sum2 += 2nd half of update
	.endm
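
	// For readers more comfortable with intrinsics, one DO16 step corresponds roughly to the following
	// C sketch (a hypothetical helper, SSSE3 required; same coefficients as the sum2_coefficients table):
	//
	//    #include <stdint.h>
	//    #include <tmmintrin.h>
	//    static void do16(const uint8_t *buf, uint32_t *adler, uint32_t *sum2) {
	//        static const uint8_t k[16] = {16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1};
	//        __m128i v  = _mm_load_si128((const __m128i *)buf);                      // 16 input bytes
	//        __m128i sa = _mm_sad_epu8(v, _mm_setzero_si128());                      // psadbw : 2 byte-sum halves
	//        __m128i s2 = _mm_maddubs_epi16(v, _mm_loadu_si128((const __m128i *)k)); // pmaddubsw : 8 words
	//        s2 = _mm_madd_epi16(s2, _mm_set1_epi16(1));                             // pmaddwd : 4 dwords
	//        uint32_t t[4];
	//        _mm_storeu_si128((__m128i *)t, s2);
	//        *sum2  += 16 * *adler;                                                  // sum2 += adler*16 (old adler)
	//        *adler += (uint32_t)_mm_extract_epi16(sa, 0) + (uint32_t)_mm_extract_epi16(sa, 4);
	//        *sum2  += t[0] + t[1] + t[2] + t[3];
	//    }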

	// update adler/sum2 according to a new 32-byte vector
	.macro		DO32
	imull		$$32, adler, %edx		// edx = 32*adler
	movaps		(buf), %xmm1			// 1st 16 bytes vector
	movaps		16(buf), %xmm7			// 2nd 16 bytes vector
	movaps		%xmm1, %xmm3			// a copy of 1st vector, used for unsigned byte in the destination of pmaddubsw
	movaps		%xmm7, %xmm2			// a copy of 2nd vector, used for unsigned byte in the destination of pmaddubsw
	psadbw		zero, %xmm1				// 2 16-bit words to be added for adler in xmm1
	psadbw		zero, %xmm7				// 2 16-bit words to be added for adler in xmm7
	addl		%edx, sum2				// sum2 += adler*32;
	pmaddubsw	%xmm6, %xmm3			// 8 16-bit words to be added for sum2 in xmm3
	pmaddubsw	%xmm4, %xmm2			// 8 16-bit words to be added for sum2 in xmm2
	paddd		%xmm7, %xmm1			// 2 16-bit words to be added for adler in xmm1
	paddd		%xmm2, %xmm3			// 8 16-bit words to be added for sum2 in xmm3
	addl		$$32, buf				// buf -> vector for next iteration
	movhlps		%xmm1, %xmm2			// higher 16-bit word (for adler) in xmm2
	pmaddwd		ones, %xmm3				// 4 32-bit elements to be added for sum2 in xmm3
	paddq		%xmm2, %xmm1			// xmm1 lower 32-bit to be added to adler
	movhlps		%xmm3, %xmm2			// 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
	movd		%xmm1, %edx				// to be added to adler
	paddd		%xmm2, %xmm3			// 2 32-bits elements in xmm3 to be added to sum2
	addl		%edx, adler				// update adler
	movd		%xmm3, %edx				// to be added to sum2
	psrlq		$$32, %xmm3				// another 32-bit to be added to sum2
	addl		%edx, sum2				// sum2 += 1st half of update
	movd		%xmm3, %edx				// to be added to sum2
	addl		%edx, sum2				// sum2 += 2nd half of update
	.endm

	// DO16_nossse3 : the same 16-byte update for CPUs without SSSE3 (no pmaddubsw available)
    .macro      DO16_nossse3
    movaps      (buf), %xmm1            // 16 bytes vector
    movaps      %xmm1, %xmm3            // a copy of the vector, the lower 8 bytes to be shuffled into 8 words
    movaps      %xmm1, %xmm2            // a copy of the vector, the higher 8 bytes to be shuffled into 8 words
    psrldq      $$8, %xmm2              // shift down 8 bytes, to reuse the shuffle vector
    punpcklbw   zero, %xmm3             // convert lower 8 bytes into 8 words
    punpcklbw   zero, %xmm2             // convert higher 8 bytes into 8 words
    pmullw      %xmm6, %xmm3            // lower 8 words * 16:9
    pmullw      %xmm4, %xmm2            // higher 8 words * 8:1
    addl        $$16, buf               // buf -> next vector
    psadbw      zero, %xmm1             // 2 16-bit words to be added for adler in xmm1
    paddw       %xmm2, %xmm3            // 8 16-bit words to be added for sum2 in xmm3
    imull       $$16, adler, %edx       // edx = 16*adler;
    movhlps     %xmm1, %xmm2            // higher 16-bit word (for adler) in xmm2
    pmaddwd     ones, %xmm3             // 4 32-bit elements to be added for sum2 in xmm3
    paddq       %xmm2, %xmm1            // xmm1 lower 32-bit to be added to adler
    addl        %edx, sum2              // sum2 += adler*16;
    movhlps     %xmm3, %xmm2            // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
    movd        %xmm1, %edx             // to be added to adler
    paddd       %xmm2, %xmm3            // 2 32-bits elements in xmm3 to be added to sum2
    addl        %edx, adler             // update adler
    movd        %xmm3, %edx             // to be added to sum2
    psrlq       $$32, %xmm3             // another 32-bit to be added to sum2
    addl        %edx, sum2              // sum2 += 1st half of update
    movd        %xmm3, %edx             // to be added to sum2
    addl        %edx, sum2              // sum2 += 2nd half of update
    .endm
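
    // Without SSSE3 there is no pmaddubsw, so the macro widens bytes to words (punpcklbw) and multiplies
    // with pmullw instead. In rough C intrinsics form (a hypothetical helper, SSE2 only):
    //
    //    #include <stdint.h>
    //    #include <emmintrin.h>
    //    static void do16_nossse3(const uint8_t *buf, uint32_t *adler, uint32_t *sum2) {
    //        static const uint16_t lo[8] = {16,15,14,13,12,11,10,9}, hi[8] = {8,7,6,5,4,3,2,1};
    //        __m128i z  = _mm_setzero_si128();
    //        __m128i v  = _mm_load_si128((const __m128i *)buf);
    //        __m128i wl = _mm_unpacklo_epi8(v, z);                           // punpcklbw : bytes 0..7 -> words
    //        __m128i wh = _mm_unpacklo_epi8(_mm_srli_si128(v, 8), z);        // psrldq+punpcklbw : bytes 8..15
    //        wl = _mm_mullo_epi16(wl, _mm_loadu_si128((const __m128i *)lo)); // pmullw * 16:9
    //        wh = _mm_mullo_epi16(wh, _mm_loadu_si128((const __m128i *)hi)); // pmullw * 8:1
    //        __m128i s2 = _mm_madd_epi16(_mm_add_epi16(wl, wh), _mm_set1_epi16(1)); // paddw + pmaddwd
    //        __m128i sa = _mm_sad_epu8(v, z);                                // psadbw for adler
    //        uint32_t t[4];
    //        _mm_storeu_si128((__m128i *)t, s2);
    //        *sum2  += 16 * *adler;
    //        *adler += (uint32_t)_mm_extract_epi16(sa, 0) + (uint32_t)_mm_extract_epi16(sa, 4);
    //        *sum2  += t[0] + t[1] + t[2] + t[3];
    //    }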

#ifdef  KERNEL
    leal    __cpu_capabilities, %eax                        // %eax -> __cpu_capabilities
    mov     (%eax), %eax                                    // %eax = __cpu_capabilities
#else
    mov    _COMM_PAGE_CPU_CAPABILITIES, %eax
#endif
    test    $(kHasSupplementalSSE3), %eax 					// __cpu_capabilities & kHasSupplementalSSE3
	je		L_no_ssse3

	// i386 adler32 with ssse3

	// need to fill up xmm4/xmm5/xmm6 only if len>=16
	cmpl	$16, len
	jl		L_skip_loading_tables

	// set up table starting address to %eax
	leal	sum2_coefficients, %eax

	// reading coefficients
	pxor	zero, zero
	movaps	(%eax), %xmm6			// coefficients for computing sum2 : pmaddubsw 32:17
	movaps	16(%eax), %xmm4			// coefficients for computing sum2 : pmaddubsw 16:1
	movaps	32(%eax), ones			// coefficients for computing sum2 : pmaddwd 1,1,...,1

L_skip_loading_tables:

	cmpl	$NMAX, len				// len vs NMAX
	jl		len_lessthan_NMAX		// if (len < NMAX), skip the following NMAX batches processing

len_ge_NMAX_loop:					// while (len>=NMAX) {

	subl	$NMAX, len				// 		len -= NMAX
	movl	$(NMAX/32), %eax		// 		n = NMAX/32

n_loop:								// 		do {
	DO32							// 			update adler/sum2 for a 32-byte input
	decl 	%eax					// 			n--;
	jg		n_loop					//  	} while (n);
	DO16							//  	update adler/sum2 for a 16-byte input
	modulo_BASE						// 		(adler/sum2) modulo BASE;
	cmpl	$NMAX, len				//
	jge		len_ge_NMAX_loop		// }	/* len>=NMAX */

len_lessthan_NMAX:

	subl	$32, len				// pre-decrement len by 32
	jl		len_lessthan_32			// if len < 32, skip the 32-vector code
len32_loop:							// while (len>=32) {
	DO32							//   update adler/sum2 for a 32-byte input
	subl	$32, len				//   len -= 32;
	jge		len32_loop				// }

len_lessthan_32:

	addl	$(32-16), len			// post-increment by 32 + pre-decrement by 16 on len
	jl		L_len_lessthan_16		// if len < 16, skip the 16-vector code
	DO16							// update adler/sum2 for a 16-byte input
	subl	$16, len				// len -= 16;

L_len_lessthan_16:
	addl	$16, len				// post-increment len by 16
	jz		len_is_zero				// if len==0, branch over scalar processing

0:									// while (len) {
	movzbl	(buf), %edx				// 	new input byte
	incl	buf						// 	buf++
	addl	%edx, adler				// 	adler += *buf
	addl	adler, sum2				// 	sum2 += adler
	subl	$1, len					// 	len--
	jg		0b						// }

len_is_zero:

	modulo_BASE						// (adler/sum2) modulo BASE;

	// construct 32-bit (sum2<<16 | adler) to be returned

	sall	$16, sum2				// sum2 <<16
	movl	adler, %eax				// adler
	orl		sum2, %eax				// sum2<<16 | adler


#ifdef	KERNEL 					// if this is for kernel code, need to restore xmm registers
	movaps	(%esp), %xmm0		// restore xmm0, offset -12 for ebx/edi/esi
	movaps	16(%esp), %xmm1		// restore xmm1
	movaps	32(%esp), %xmm2		// restore xmm2
	movaps	48(%esp), %xmm3		// restore xmm3
	movaps	64(%esp), %xmm4		// restore xmm4
	movaps	80(%esp), %xmm5		// restore xmm5
	movaps	96(%esp), %xmm6		// restore xmm6
	movaps	112(%esp), %xmm7	// restore xmm7, if this is for SSSE3 or above
	addl	$140, %esp			// we've already restored %xmm0-%xmm7 from stack
#endif

	popl   %esi
	popl   %edi
	popl   %ebx
	leave						// pop ebp out from stack
	ret


L_no_ssse3:

	// i386 adler32 without ssse3

	// need to fill up xmm4/xmm5/xmm6 only if len>=16
	cmpl	$16, len
	jl		2f

	// set up table starting address to %eax
	leal	sum2_coefficients, %eax

	// reading coefficients
	pxor	zero, zero
	movaps	48(%eax), %xmm6			// coefficients for computing sum2 : pmullw 16:9
	movaps	64(%eax), %xmm4			// coefficients for computing sum2 : pmullw 8:1
	movaps	80(%eax), ones			// coefficients for computing sum2 : pmaddwd 1,1,...,1

2:

	cmpl	$NMAX, len				// len vs NMAX
	jl		3f						// if (len < NMAX), skip the following NMAX batches processing

0:									// while (len>=NMAX) {

	subl	$NMAX, len				// 		len -= NMAX
	movl	$(NMAX/16), %eax		// 		n = NMAX/16

1:									// 		do {
	DO16_nossse3					//			update adler/sum2 for a 16-byte input
	decl 	%eax					// 			n--;
	jg		1b						//  	} while (n);

	modulo_BASE						// 		(adler/sum2) modulo BASE;

	cmpl	$NMAX, len				//
	jge		0b						// }	/* len>=NMAX */

3:

	subl	$16, len				// pre-decrement len by 16
	jl		L_len_lessthan_16		// if len < 16, skip the 16-vector code
	DO16_nossse3					// update adler/sum2 for a 16-byte input
	subl	$16, len				// len -= 16;
	jmp		L_len_lessthan_16


	.const
	.align	4
sum2_coefficients:	// used for vectorizing adler32 computation

	// coefficients for pmaddubsw, to generate 16-bit elements for sum2 (ssse3 path)
	.byte	32
	.byte	31
	.byte	30
	.byte	29
	.byte	28
	.byte	27
	.byte	26
	.byte	25
	.byte	24
	.byte	23
	.byte	22
	.byte	21
	.byte	20
	.byte	19
	.byte	18
	.byte	17
	.byte	16
	.byte	15
	.byte	14
	.byte	13
	.byte	12
	.byte	11
	.byte	10
	.byte	9
	.byte	8
	.byte	7
	.byte	6
	.byte	5
	.byte	4
	.byte	3
	.byte	2
	.byte	1

	// coefficients for pmaddwd, to combine into 4 32-bit elements for sum2
	.word	1
	.word	1
	.word	1
	.word	1
	.word	1
	.word	1
	.word	1
	.word	1


	// data for the no-ssse3 path : pmullw coefficients, as 16-bit words

	.word	16
	.word	15
	.word	14
	.word	13
	.word	12
	.word	11
	.word	10
	.word	9
	.word	8
	.word	7
	.word	6
	.word	5
	.word	4
	.word	3
	.word	2
	.word	1

	// coefficients for pmaddwd, to combine into 4 32-bit elements for sum2
	.word	1
	.word	1
	.word	1
	.word	1
	.word	1
	.word	1
	.word	1
	.word	1
#else	// (defined __x86_64__)

	movq    __cpu_capabilities@GOTPCREL(%rip), %rax         // %rax -> __cpu_capabilities
	mov     (%rax), %eax                                    // %eax = __cpu_capabilities
	test    $(kHasSupplementalSSE3), %eax                   // __cpu_capabilities & kHasSupplementalSSE3
	jne		L_has_ssse3

	// ----------------------------------------------------------------------------------
	// the following is added for x86_64 without SSSE3 support
	// it is essentially a translated copy of the i386 code without SSSE3 code
	// ----------------------------------------------------------------------------------

	// input :
	//		 adler : rdi
	//		 sum2  : rsi
	// 		 buf   : rdx
	//		 len   : rcx

	pushq	%rbp
	movq	%rsp, %rbp
	pushq	%rbx

#ifdef	KERNEL			// if for kernel, save %xmm0-%xmm6 (this path does not use %xmm7-%xmm11)
	subq	$200, %rsp	// allocate stack space; the subtraction also keeps %rsp 16-byte aligned
	movaps	%xmm0, -32(%rbp)
	movaps	%xmm1, -48(%rbp)
	movaps	%xmm2, -64(%rbp)
	movaps	%xmm3, -80(%rbp)
	movaps	%xmm4, -96(%rbp)
	movaps	%xmm5, -112(%rbp)
	movaps	%xmm6, -128(%rbp)
#endif

	#define	adler	%rdi				// 16(%rbp)
	#define	sum2	%rsi				// 24(%rbp)
	#define	buf		%rcx				// 32(%rbp)
	#define	len		%rbx				// 40(%rbp)
	#define	zero	%xmm0
	#define ones	%xmm5

	movq	%rcx, len
	movq	%rdx, buf

	.macro		modulo_BASE
	movl		$$-2146992015, %eax		// unsigned 2147975281 = ceil(2^47/BASE), i.e. 1/BASE in Q47
	mull		%edi					// edx:eax = adler * (2^47/BASE)
	shrl		$$15, %edx				// edx = floor(adler/BASE)
	imull		$$BASE, %edx, %edx		// edx * BASE
	subq		%rdx, adler				// adler -= edx*BASE
	movl		$$-2146992015, %eax		// unsigned 2147975281 = ceil(2^47/BASE), i.e. 1/BASE in Q47
	mull		%esi					// edx:eax = sum2 * (2^47/BASE)
	shrl		$$15, %edx				// edx = floor(sum2/BASE)
	imull		$$BASE, %edx, %eax		// eax = edx * BASE
	subq		%rax, sum2				// sum2 -= edx*BASE
	.endmacro

	// update adler/sum2 according to a new 16-byte vector, no ssse3
	.macro		DO16_nossse3
	movaps      (buf), %xmm1            // 16 bytes vector
	movaps      %xmm1, %xmm3            // a copy of the vector, the lower 8 bytes to be shuffled into 8 words
	movaps      %xmm1, %xmm2            // a copy of the vector, the higher 8 bytes to be shuffled into 8 words
	psrldq      $$8, %xmm2              // shift down 8 bytes, to reuse the shuffle vector
	punpcklbw   zero, %xmm3             // convert lower 8 bytes into 8 words
	punpcklbw   zero, %xmm2             // convert higher 8 bytes into 8 words
	pmullw      %xmm6, %xmm3            // lower 8 words * 16:9
	pmullw      %xmm4, %xmm2            // higher 8 words * 8:1
	add	        $$16, buf               // buf -> next vector
	psadbw      zero, %xmm1             // 2 16-bit words to be added for adler in xmm1
	paddw       %xmm2, %xmm3            // 8 16-bit words to be added for sum2 in xmm3
	imulq       $$16, adler, %rdx       // rdx = 16*adler;
	movhlps     %xmm1, %xmm2            // higher 16-bit word (for adler) in xmm2
	pmaddwd     ones, %xmm3             // 4 32-bit elements to be added for sum2 in xmm3
	paddq       %xmm2, %xmm1            // xmm1 lower 32-bit to be added to adler
	add         %rdx, sum2              // sum2 += adler*16;
	movhlps     %xmm3, %xmm2            // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
	movd        %xmm1, %edx             // to be added to adler
	paddd       %xmm2, %xmm3            // 2 32-bits elements in xmm3 to be added to sum2
	addq        %rdx, adler             // update adler
	movd        %xmm3, %edx             // to be added to sum2
	psrlq       $$32, %xmm3             // another 32-bit to be added to sum2
	addq        %rdx, sum2              // sum2 += 1st half of update
	movd        %xmm3, %edx             // to be added to sum2
	addq        %rdx, sum2              // sum2 += 2nd half of update
	.endm

	// need to fill up xmm4/xmm5/xmm6 only if len>=16
	cmpq	$16, len
	jl		0f

	// set up table starting address to %rax
	leaq	sum2_coefficients_nossse3(%rip), %rax

	// reading coefficients
	pxor	zero, zero
	movaps	(%rax), %xmm6			// coefficients for computing sum2 : pmullw 16:9
	movaps	16(%rax), %xmm4			// coefficients for computing sum2 : pmullw 8:1
	movaps	32(%rax), ones			// coefficients for computing sum2 : pmaddwd 1,1,...,1
0:

	cmp		$NMAX, len				// len vs NMAX
	jl		3f						// if (len < NMAX), skip the following NMAX batches processing

0:									// while (len>=NMAX) {

	sub		$NMAX, len				// 		len -= NMAX
	mov		$(NMAX/16), %eax		// 		n = NMAX/16

1:									// 		do {
	DO16_nossse3					//			update adler/sum2 for a 16-byte input
	decl 	%eax					// 			n--;
	jg		1b						//  	} while (n);

	modulo_BASE						// 		(adler/sum2) modulo BASE;

	cmp		$NMAX, len				//
	jge		0b						// }	/* len>=NMAX */

3:

	sub		$16, len				// pre-decrement len by 16
	jl		2f						// if len < 16, skip the 16-vector code
	DO16_nossse3					// update adler/sum2 for a 16-byte input
	sub		$16, len				// len -= 16;

2:
	add		$16, len				// post-increment len by 16
	jz		1f						// if len==0, branch over scalar processing

0:									// while (len) {
	movzbq	(buf), %rdx				// 	new input byte
	incq	buf						// 	buf++
	addq	%rdx, adler				// 	adler += *buf
	addq	adler, sum2				// 	sum2 += adler
	decq	len						// 	len--
	jg		0b						// }

1:

	modulo_BASE						// (adler/sum2) modulo BASE;

	// construct 32-bit (sum2<<16 | adler) to be returned

	salq	$16, sum2				// sum2 <<16
	movq	adler, %rax				// adler
	orq		sum2, %rax				// sum2<<16 | adler

#ifdef	KERNEL 					// if this is for kernel code, need to restore xmm registers
	movaps	-32(%rbp), %xmm0
	movaps	-48(%rbp), %xmm1
	movaps	-64(%rbp), %xmm2
	movaps	-80(%rbp), %xmm3
	movaps	-96(%rbp), %xmm4
	movaps	-112(%rbp), %xmm5
	movaps	-128(%rbp), %xmm6
	addq	$200, %rsp	// we've already restored %xmm0-%xmm6 from stack
#endif

	popq   %rbx
	leave
	ret



	.const
	.align	4
sum2_coefficients_nossse3:	// used for vectorizing adler32 computation

	// data for the no-ssse3 path : pmullw coefficients, as 16-bit words

	.word	16
	.word	15
	.word	14
	.word	13
	.word	12
	.word	11
	.word	10
	.word	9
	.word	8
	.word	7
	.word	6
	.word	5
	.word	4
	.word	3
	.word	2
	.word	1

	// coefficients for pmaddwd, to combine into 4 32-bit elements for sum2
	.word	1
	.word	1
	.word	1
	.word	1
	.word	1
	.word	1
	.word	1
	.word	1


	.text

	// ----------------------------------------------------------------------------------
	// the following is the original x86_64 adler32_vec code that uses SSSE3 instructions
	// ----------------------------------------------------------------------------------

L_has_ssse3:

	// input :
	//		 adler : rdi
	//		 sum2  : rsi
	// 		 buf   : rdx
	//		 len   : rcx

	pushq	%rbp
	movq	%rsp, %rbp
	pushq	%rbx

#ifdef	KERNEL			// if for kernel, save %xmm0-%xmm11
	subq	$200, %rsp	// allocate for %xmm0-%xmm11 (192 bytes), extra 8 to align %rsp to 16-byte boundary
	movaps	%xmm0, -32(%rbp)
	movaps	%xmm1, -48(%rbp)
	movaps	%xmm2, -64(%rbp)
	movaps	%xmm3, -80(%rbp)
	movaps	%xmm4, -96(%rbp)
	movaps	%xmm5, -112(%rbp)
	movaps	%xmm6, -128(%rbp)
	movaps	%xmm7, -144(%rbp)
	movaps	%xmm8, -160(%rbp)
	movaps	%xmm9, -176(%rbp)
	movaps	%xmm10, -192(%rbp)
	movaps	%xmm11, -208(%rbp)
#endif

	#define	adler	%rdi				// 16(%rbp)
	#define	sum2	%rsi				// 24(%rbp)
	#define	buf		%rcx				// 32(%rbp)
	#define	len		%rbx				// 40(%rbp)
	#define	zero	%xmm0
	#define ones	%xmm5

	movq	%rcx, len
	movq	%rdx, buf
	// update adler/sum2 according to a new 16-byte vector
	.macro		DO16
	movaps		(buf), %xmm1			// 16 bytes vector
	movaps		%xmm1, %xmm3			// a copy of the vector, used for unsigned byte in the destination of pmaddubsw
	addq		$$16, buf				// buf -> next vector
	psadbw		zero, %xmm1				// 2 16-bit words to be added for adler in xmm1
	pmaddubsw	%xmm4, %xmm3			// 8 16-bit words to be added for sum2 in xmm3
	imulq		$$16, adler, %rdx		// rdx = 16*adler;
	movhlps		%xmm1, %xmm2			// higher 16-bit word (for adler) in xmm2
	pmaddwd		ones, %xmm3				// 4 32-bit elements to be added for sum2 in xmm3
	paddq		%xmm2, %xmm1			// xmm1 lower 32-bit to be added to adler
	addq		%rdx, sum2				// sum2 += adler*16;
	movhlps		%xmm3, %xmm2			// 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
	movd		%xmm1, %edx				// to be added to adler
	paddd		%xmm2, %xmm3			// 2 32-bits elements in xmm3 to be added to sum2
	addq		%rdx, adler				// update adler
	movd		%xmm3, %edx				// to be added to sum2
	psrlq		$$32, %xmm3				// another 32-bit to be added to sum2
	addq		%rdx, sum2				// sum2 += 1st half of update
	movd		%xmm3, %edx				// to be added to sum2
	addq		%rdx, sum2				// sum2 += 2nd half of update
	.endm

	// update adler/sum2 according to a new 32-byte vector
	.macro		DO32
	imulq		$$32, adler, %rdx		// rdx = 32*adler
	movaps		(buf), %xmm1			// 1st 16 bytes vector
	movaps		16(buf), %xmm7			// 2nd 16 bytes vector
	movaps		%xmm1, %xmm3			// a copy of 1st vector, used for unsigned byte in the destination of pmaddubsw
	movaps		%xmm7, %xmm2			// a copy of 2nd vector, used for unsigned byte in the destination of pmaddubsw
	psadbw		zero, %xmm1				// 2 16-bit words to be added for adler in xmm1
	psadbw		zero, %xmm7				// 2 16-bit words to be added for adler in xmm7
	addq		%rdx, sum2				// sum2 += adler*32;
	pmaddubsw	%xmm6, %xmm3			// 8 16-bit words to be added for sum2 in xmm3
	pmaddubsw	%xmm4, %xmm2			// 8 16-bit words to be added for sum2 in xmm2
	paddd		%xmm7, %xmm1			// 2 16-bit words to be added for adler in xmm1
	paddw		%xmm2, %xmm3			// 8 16-bit words to be added for sum2 in xmm3
	addq		$$32, buf				// buf -> vector for next iteration
	movhlps		%xmm1, %xmm2			// higher 16-bit word (for adler) in xmm2
	pmaddwd		ones, %xmm3				// 4 32-bit elements to be added for sum2 in xmm3
	paddq		%xmm2, %xmm1			// xmm1 lower 32-bit to be added to adler
	movhlps		%xmm3, %xmm2			// 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
	movd		%xmm1, %edx				// to be added to adler
	paddd		%xmm2, %xmm3			// 2 32-bits elements in xmm3 to be added to sum2
	addq		%rdx, adler				// update adler
	movd		%xmm3, %edx				// to be added to sum2
	psrlq		$$32, %xmm3				// another 32-bit to be added to sum2
	addq		%rdx, sum2				// sum2 += 1st half of update
	movd		%xmm3, %edx				// to be added to sum2
	addq		%rdx, sum2				// sum2 += 2nd half of update
	.endm

	// update adler/sum2 according to a new 48-byte vector

	.macro		DO48
	imulq		$$48, adler, %rdx		// rdx = 48*adler

	movaps		(buf), %xmm7			// 1st 16 bytes vector
	movaps		16(buf), %xmm10			// 2nd 16 bytes vector
	movaps		32(buf), %xmm11			// 3rd 16 bytes vector

	movaps		%xmm7, %xmm1			// 1st vector
	movaps		%xmm10, %xmm2			// 2nd vector
	movaps		%xmm11, %xmm3			// 3rd vector

	psadbw		zero, %xmm7				// 1st vector for adler
	psadbw		zero, %xmm10			// 2nd vector for adler
	psadbw		zero, %xmm11			// 3rd vector for adler

	addq		%rdx, sum2				// sum2 += adler*48;

	pmaddubsw	%xmm9, %xmm1			// 8 16-bit words to be added for sum2 : 1st vector
	pmaddubsw	%xmm6, %xmm2			// 8 16-bit words to be added for sum2 : 2nd vector
	pmaddubsw	%xmm4, %xmm3			// 8 16-bit words to be added for sum2 : 3rd vector

	pmaddwd		ones, %xmm1				// 4 32-bit elements to be added for sum2 in xmm1
	pmaddwd		ones, %xmm2				// 4 32-bit elements to be added for sum2 in xmm2
	pmaddwd		ones, %xmm3				// 4 32-bit elements to be added for sum2 in xmm3

	paddd		%xmm10, %xmm7			// 2 16-bit words to be added for adler
	paddd		%xmm11, %xmm7			// 2 16-bit words to be added for adler

	paddd		%xmm1, %xmm3			// 4 32-bit elements to be added for sum2
	paddd		%xmm2, %xmm3			// 4 32-bit elements to be added for sum2

	addq		$$48, buf				// buf -> vector for next iteration

	movhlps		%xmm7, %xmm2			// higher 16-bit word (for adler) in xmm2
	paddq		%xmm2, %xmm7			// xmm7 lower 32-bit to be added to adler

	movhlps		%xmm3, %xmm2			// 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
	movd		%xmm7, %edx				// to be added to adler
	paddd		%xmm2, %xmm3			// 2 32-bits elements in xmm3 to be added to sum2
	addq		%rdx, adler				// update adler
	movd		%xmm3, %edx				// to be added to sum2
	psrlq		$$32, %xmm3				// another 32-bit to be added to sum2
	addq		%rdx, sum2				// sum2 += 1st half of update
	movd		%xmm3, %edx				// to be added to sum2
	addq		%rdx, sum2				// sum2 += 2nd half of update
	.endm

	// update adler/sum2 according to a new 64-byte vector
	.macro		DO64
	imulq		$$64, adler, %rdx		// rdx = 64*adler

	movaps		(buf), %xmm1			// 1st 16 bytes vector
	movaps		16(buf), %xmm7			// 2nd 16 bytes vector
	movaps		32(buf), %xmm10			// 3rd 16 bytes vector
	movaps		48(buf), %xmm11			// 4th 16 bytes vector

	movaps		%xmm1, %xmm3			// 1st vector
	movaps		%xmm11, %xmm2			// 4th vector
	psadbw		zero, %xmm1				// 1st vector for adler
	psadbw		zero, %xmm11			// 4th vector for adler

	addq		%rdx, sum2				// sum2 += adler*64;

	pmaddubsw	%xmm8, %xmm3			// 8 16-bit words to be added for sum2 : 1st vector
	pmaddubsw	%xmm4, %xmm2			// 8 16-bit words to be added for sum2 : 4th vector
	pmaddwd		ones, %xmm3				// 4 32-bit elements to be added for sum2 in xmm3
	pmaddwd		ones, %xmm2				// 4 32-bit elements to be added for sum2 in xmm2

	paddd		%xmm11, %xmm1			// 2 16-bit words to be added for adler in xmm1
	paddd		%xmm2, %xmm3			// 4 32-bit elements to be added for sum2 in xmm3

	movaps		%xmm7, %xmm2			// 2nd vector
	movaps		%xmm10, %xmm11			// 3rd vector

	psadbw		zero, %xmm7				// 2nd vector for adler
	psadbw		zero, %xmm10			// 3rd vector for adler

	pmaddubsw	%xmm9, %xmm2			// 8 16-bit words to be added for sum2 : 2nd vector
	pmaddubsw	%xmm6, %xmm11			// 8 16-bit words to be added for sum2 : 3rd vector
	pmaddwd		ones, %xmm2				// 4 32-bit elements to be added for sum2 in xmm2
	pmaddwd		ones, %xmm11			// 4 32-bit elements to be added for sum2 in xmm11

	paddd		%xmm7, %xmm1			// 2 16-bit words to be added for adler in xmm1
	paddd		%xmm10, %xmm1			// 2 16-bit words to be added for adler in xmm1

	paddd		%xmm2, %xmm3			// 4 32-bit elements to be added for sum2 in xmm3
	paddd		%xmm11, %xmm3			// 4 32-bit elements to be added for sum2 in xmm3

	addq		$$64, buf				// buf -> vector for next iteration

	movhlps		%xmm1, %xmm2			// higher 16-bit word (for adler) in xmm2
	paddq		%xmm2, %xmm1			// xmm1 lower 32-bit to be added to adler
	movhlps		%xmm3, %xmm2			// 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
	movd		%xmm1, %edx				// to be added to adler
	paddd		%xmm2, %xmm3			// 2 32-bits elements in xmm3 to be added to sum2
	addq		%rdx, adler				// update adler
	movd		%xmm3, %edx				// to be added to sum2
	psrlq		$$32, %xmm3				// another 32-bit to be added to sum2
	addq		%rdx, sum2				// sum2 += 1st half of update
	movd		%xmm3, %edx				// to be added to sum2
	addq		%rdx, sum2				// sum2 += 2nd half of update
	.endm
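
	// In the notation of the header comment, DO64 (and DO48 analogously) computes, per 64-byte block:
	//     sum2  += 64*adler;
	//     adler += x[0] + x[1] + ... + x[63];                 (psadbw partial sums)
	//     sum2  += 64*x[0] + 63*x[1] + ... + 1*x[63];         (pmaddubsw/pmaddwd partial sums)
	// with the weighted sum split across the four coefficient vectors 64:49, 48:33, 32:17, and 16:1
	// held in %xmm8/%xmm9/%xmm6/%xmm4.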

	// need to fill up xmm4/xmm5/xmm6 only if len>=16
	cmpq	$16, len
	jl		skip_loading_tables

	// set up table starting address to %rax
	leaq	sum2_coefficients(%rip), %rax

	// reading coefficients
	pxor	zero, zero
	movaps	(%rax), %xmm8			// coefficients for computing sum2 : pmaddubsw 64:49
	movaps	16(%rax), %xmm9			// coefficients for computing sum2 : pmaddubsw 48:33
	movaps	32(%rax), %xmm6			// coefficients for computing sum2 : pmaddubsw 32:17
	movaps	48(%rax), %xmm4			// coefficients for computing sum2 : pmaddubsw 16:1
	movaps	64(%rax), ones			// coefficients for computing sum2 : pmaddwd 1,1,...,1

skip_loading_tables:

	cmpq	$NMAX, len				// len vs NMAX
	jl		len_lessthan_NMAX		// if (len < NMAX), skip the following NMAX batches processing

len_ge_NMAX_loop:					// while (len>=NMAX) {

	subq	$NMAX, len				// 		len -= NMAX
	movq	$(NMAX/64), %rax		// 		n = NMAX/64

n_loop:								// 		do {
	DO64							// 			update adler/sum2 for a 64-byte input
	decq 	%rax					// 			n--;
	jg		n_loop					//  	} while (n);

	DO48							//		update adler/sum2 for a 48-byte input

	modulo_BASE						// 		(adler/sum2) modulo BASE;

	cmpq	$NMAX, len				//
	jge		len_ge_NMAX_loop		// }	/* len>=NMAX */

len_lessthan_NMAX:

	subq	$64, len				// pre-decrement len by 64
	jl		len_lessthan_64			// if len < 64, skip the 64-vector code
len64_loop:							// while (len>=64) {
	DO64							//   update adler/sum2 for a 64-byte input
	subq	$64, len				//   len -= 64;
	jge		len64_loop				// }

len_lessthan_64:
	addq	$(64-32), len			// post-increment 64 + pre-decrement 32 of len
	jl		len_lessthan_32			// if len < 32, skip the 32-vector code
	DO32							//   update adler/sum2 for a 32-byte input
	subq	$32, len				//   len -= 32;

len_lessthan_32:

	addq	$(32-16), len			// post-increment by 32 + pre-decrement by 16 on len
	jl		len_lessthan_16			// if len < 16, skip the 16-vector code
	DO16							// update adler/sum2 for a 16-byte input
	subq	$16, len				// len -= 16;

len_lessthan_16:
	addq	$16, len				// post-increment len by 16
	jz		len_is_zero				// if len==0, branch over scalar processing

scalar_loop:						// while (len) {
	movzbq	(buf), %rdx				// 	new input byte
	incq	buf						// 	buf++
	addq	%rdx, adler				// 	adler += *buf
	addq	adler, sum2				// 	sum2 += adler
	decq	len						// 	len--
	jg		scalar_loop				// }

len_is_zero:

	modulo_BASE						// (adler/sum2) modulo BASE;

	// construct 32-bit (sum2<<16 | adler) to be returned

	salq	$16, sum2				// sum2 <<16
	movq	adler, %rax				// adler
	orq		sum2, %rax				// sum2<<16 | adler

#ifdef	KERNEL			// if for kernel, restore %xmm0-%xmm11
	movaps	-32(%rbp), %xmm0
	movaps	-48(%rbp), %xmm1
	movaps	-64(%rbp), %xmm2
	movaps	-80(%rbp), %xmm3
	movaps	-96(%rbp), %xmm4
	movaps	-112(%rbp), %xmm5
	movaps	-128(%rbp), %xmm6
	movaps	-144(%rbp), %xmm7
	movaps	-160(%rbp), %xmm8
	movaps	-176(%rbp), %xmm9
	movaps	-192(%rbp), %xmm10
	movaps	-208(%rbp), %xmm11
	addq	$200, %rsp	// we've already restored %xmm0-%xmm11 from stack
#endif

	popq   %rbx
	leave							// pop rbp from stack
	ret


	.const
	.align	4
sum2_coefficients:	// used for vectorizing adler32 computation

	// coefficients for pmaddubsw instruction, used to generate 16-bit elements for sum2

	.byte	64
	.byte	63
	.byte	62
	.byte	61
	.byte	60
	.byte	59
	.byte	58
	.byte	57
	.byte	56
	.byte	55
	.byte	54
	.byte	53
	.byte	52
	.byte	51
	.byte	50
	.byte	49
	.byte	48
	.byte	47
	.byte	46
	.byte	45
	.byte	44
	.byte	43
	.byte	42
	.byte	41
	.byte	40
	.byte	39
	.byte	38
	.byte	37
	.byte	36
	.byte	35
	.byte	34
	.byte	33
	.byte	32
	.byte	31
	.byte	30
	.byte	29
	.byte	28
	.byte	27
	.byte	26
	.byte	25
	.byte	24
	.byte	23
	.byte	22
	.byte	21
	.byte	20
	.byte	19
	.byte	18
	.byte	17
	.byte	16
	.byte	15
	.byte	14
	.byte	13
	.byte	12
	.byte	11
	.byte	10
	.byte	9
	.byte	8
	.byte	7
	.byte	6
	.byte	5
	.byte	4
	.byte	3
	.byte	2
	.byte	1

	// coefficients for pmaddwd, to combine into 4 32-bit elements for sum2
	.word	1
	.word	1
	.word	1
	.word	1
	.word	1
	.word	1
	.word	1
	.word	1

#endif	// (defined __i386__)

#endif	// (defined __i386__ || defined __x86_64__)
1051