zlib/intel/adler32vec.s

7 	Given two unsigned 32-bit values adler and sum2 (both already reduced modulo BASE=65521) and a sequence of input bytes x[0],...,x[N-1].
8 	The adler-sum2 pair is updated according to
12 			sum2 = (sum2+adler)%BASE;
15 	To reduce the number of modulo operations, it can be shown that, if the initial adler and sum2 are less than BASE (=65521),
16 	adler and sum2 (in 32-bit representation), will never overflow for the next NMAX=5552 bytes. This simplifies the
22 				sum2+=adler;
25 			sum2%=BASE;
32                 sum2+=adler;
35 	This subtask turns out to be very vectorizable. Suppose we perform the adler/sum2 update once per K bytes,
39                 sum2+=adler;
42 	It can be shown that the sum2-adler pair can be updated according to
44 		sum2 += adler*K;
46 		sum2 += (x[0]*K + x[1]*(K-1) + ... + x[K-1]*1);
48 	The last two equations show that the adler-sum2 pair update can be sped up using a vector processor.
51 		[ K K-1 ... 1 ] for sum2 update.
57 	followed by a single DO16 (K=16), before calling a modulo operation for adler and sum2.
61 	before calling a modulo operation for adler and sum2.
77 // uLong	adler32_vec(unsigned int adler, unsigned int sum2, const Bytef *buf, int len) {
87 //        MOD(sum2);
97 //            sum2 += adler;
100 //        MOD(sum2);
102 //    return adler | (sum2 << 16);
136 	#define	sum2	%esi				// 12(%ebp)
143 	movl	12(%ebp), sum2
154 	mull		sum2					// edx:eax = sum2 divided by BASE in Q47
155 	shrl		$$15, %edx				// edx is now the floor integer of sum2 and BASE
157 	subl		%eax, sum2				// sum2 -= sdx*BASE
160 	// update adler/sum2 according to a new 16-byte vector
166 	pmaddubsw	%xmm4, %xmm3			// 8 16-bit words to be added for sum2 in xmm3
169 	pmaddwd		ones, %xmm3				// 4 32-bit elements to be added for sum2 in xmm3
171 	addl		%edx, sum2				// sum2 += adler*16;
174 	paddd		%xmm2, %xmm3			// 2 32-bits elements in xmm3 to be added to sum2
176 	movd		%xmm3, %edx				// to be added to sum2
177 	psrlq		$$32, %xmm3				// another 32-bit to be added to sum2
178 	addl		%edx, sum2				// sum2 += 1st half of update
179 	movd		%xmm3, %edx				// to be added to sum2
180 	addl		%edx, sum2				// sum2 += 2nd half of update
183 	// update adler/sum2 according to a new 32-byte vector
192 	addl		%edx, sum2				// sum2 += adler*32;
193 	pmaddubsw	%xmm6, %xmm3			// 8 16-bit words to be added for sum2 in xmm3
194 	pmaddubsw	%xmm4, %xmm2			// 8 16-bit words to be added for sum2 in xmm2
196 	paddd		%xmm2, %xmm3			// 8 16-bit words to be added for sum2 in xmm3
199 	pmaddwd		ones, %xmm3				// 4 32-bit elements to be added for sum2 in xmm3
203 	paddd		%xmm2, %xmm3			// 2 32-bits elements in xmm3 to be added to sum2
205 	movd		%xmm3, %edx				// to be added to sum2
206 	psrlq		$$32, %xmm3				// another 32-bit to be added to sum2
207 	addl		%edx, sum2				// sum2 += 1st half of update
208 	movd		%xmm3, %edx				// to be added to sum2
209 	addl		%edx, sum2				// sum2 += 2nd half of update
224     paddw       %xmm2, %xmm3            // 8 16-bit words to be added for sum2 in xmm3
227     pmaddwd     ones, %xmm3             // 4 32-bit elements to be added for sum2 in xmm3
229     addl        %edx, sum2              // sum2 += adler*16;
232     paddd       %xmm2, %xmm3            // 2 32-bits elements in xmm3 to be added to sum2
234     movd        %xmm3, %edx             // to be added to sum2
235     psrlq       $$32, %xmm3             // another 32-bit to be added to sum2
236     addl        %edx, sum2              // sum2 += 1st half of update
237     movd        %xmm3, %edx             // to be added to sum2
238     addl        %edx, sum2              // sum2 += 2nd half of update
261 	movaps	(%eax), %xmm6			// coefficients for computing sum2 : pmaddubsw 32:17
262 	movaps	16(%eax), %xmm4			// coefficients for computing sum2 : pmaddubsw 16:1
263 	movaps	32(%eax), ones			// coefficients for computing sum2 : pmaddwd 1,1,...,1
276 	DO32							// 			update adler/sum2 for a 32-byte input
279 	DO16							//  	update adler/sum2 for a 16-byte input
280 	modulo_BASE						// 		(adler/sum2) modulo BASE;
289 	DO32							//   update adler/sum2 for a 32-byte input
297 	DO16							// update adler/sum2 for a 16-byte input
308 	addl	adler, sum2				// 	sum2 += adler
314 	modulo_BASE						// (adler/sum2) modulo BASE;
316 	// construct 32-bit (sum2<<16 | adler) to be returned
318 	sall	$16, sum2				// sum2 <<16
320 	orl		sum2, %eax				// sum2<<16 | adler
355 	movaps  48(%eax), %xmm6         // coefficients for computing sum2 : pmaddubsw 16:9
356     movaps  64(%eax), %xmm4         // coefficients for computing sum2 : pmaddubsw 8:1
357     movaps  80(%eax), ones          // coefficients for computing sum2 : pmaddwd 1,1,...,1
370 	DO16_nossse3					//			update adler/sum2 for a 16-byte input
374 	modulo_BASE						// 		(adler/sum2) modulo BASE;
383 	DO16_nossse3					// update adler/sum2 for a 16-byte input
425 	// coefficients for pmaddwd, to combine into 4 32-bit elements for sum2
455 	// coefficients for pmaddwd, to combine into 4 32-bit elements for sum2
479 	//		 sum2  : rsi
499 	#define	sum2	%rsi				// 24(%ebp)
515 	mull		%esi					// edx:eax = sum2 divided by BASE in Q47
516 	shrl		$$15, %edx				// edx is now the floor integer of sum2 and BASE
518 	subq		%rax, sum2				// sum2 -= sdx*BASE
521 	// update adler/sum2 according to a new 16-byte vector, no ssse3
533     paddw       %xmm2, %xmm3            // 8 16-bit words to be added for sum2 in xmm3
536     pmaddwd     ones, %xmm3             // 4 32-bit elements to be added for sum2 in xmm3
538     add         %rdx, sum2              // sum2 += adler*16;
541     paddd       %xmm2, %xmm3            // 2 32-bits elements in xmm3 to be added to sum2
543     movd        %xmm3, %edx             // to be added to sum2
544     psrlq       $$32, %xmm3             // another 32-bit to be added to sum2
545     addq        %rdx, sum2              // sum2 += 1st half of update
546     movd        %xmm3, %edx             // to be added to sum2
547     addq        %rdx, sum2              // sum2 += 2nd half of update
559 	movaps  (%rax), %xmm6           // coefficients for computing sum2 : pmaddubsw 16:9
560     movaps  16(%rax), %xmm4         // coefficients for computing sum2 : pmaddubsw 8:1
561     movaps  32(%rax), ones          // coefficients for computing sum2 : pmaddwd 1,1,...,1
573 	DO16_nossse3					//			update adler/sum2 for a 16-byte input
577 	modulo_BASE						// 		(adler/sum2) modulo BASE;
586 	DO16_nossse3					// update adler/sum2 for a 16-byte input
597 	addq	adler, sum2				// 	sum2 += adler
603 	modulo_BASE						// (adler/sum2) modulo BASE;
605 	// construct 32-bit (sum2<<16 | adler) to be returned
607 	salq	$16, sum2				// sum2 <<16
609 	orq		sum2, %rax				// sum2<<16 | adler
651 	// coefficients for pmaddwd, to combine into 4 32-bit elements for sum2
672 	//		 sum2  : rsi
697 	#define	sum2	%rsi				// 24(%ebp)
706 	// update adler/sum2 according to a new 16-byte vector
712 	pmaddubsw	%xmm4, %xmm3			// 8 16-bit words to be added for sum2 in xmm3
715 	pmaddwd		ones, %xmm3				// 4 32-bit elements to be added for sum2 in xmm3
717 	addq		%rdx, sum2				// sum2 += adler*16;
720 	paddd		%xmm2, %xmm3			// 2 32-bits elements in xmm3 to be added to sum2
722 	movd		%xmm3, %edx				// to be added to sum2
723 	psrlq		$$32, %xmm3				// another 32-bit to be added to sum2
724 	addq		%rdx, sum2				// sum2 += 1st half of update
725 	movd		%xmm3, %edx				// to be added to sum2
726 	addq		%rdx, sum2				// sum2 += 2nd half of update
729 	// update adler/sum2 according to a new 32-byte vector
738 	addq		%rdx, sum2				// sum2 += adler*32;
739 	pmaddubsw	%xmm6, %xmm3			// 8 16-bit words to be added for sum2 in xmm3
740 	pmaddubsw	%xmm4, %xmm2			// 8 16-bit words to be added for sum2 in xmm2
742 	paddw		%xmm2, %xmm3			// 8 16-bit words to be added for sum2 in xmm3
745 	pmaddwd		ones, %xmm3				// 4 32-bit elements to be added for sum2 in xmm3
749 	paddd		%xmm2, %xmm3			// 2 32-bits elements in xmm3 to be added to sum2
751 	movd		%xmm3, %edx				// to be added to sum2
752 	psrlq		$$32, %xmm3				// another 32-bit to be added to sum2
753 	addq		%rdx, sum2				// sum2 += 1st half of update
754 	movd		%xmm3, %edx				// to be added to sum2
755 	addq		%rdx, sum2				// sum2 += 2nd half of update
758 	// update adler/sum2 according to a new 48-byte vector
775 	addq		%rdx, sum2				// sum2 += adler*48;
777 	pmaddubsw	%xmm9, %xmm1			// 8 16-bit words to be added for sum2 : 1st vector
778 	pmaddubsw	%xmm6, %xmm2			// 8 16-bit words to be added for sum2 : 2nd vector
779 	pmaddubsw	%xmm4, %xmm3			// 8 16-bit words to be added for sum2 : 3rd vector
781 	pmaddwd		ones, %xmm1				// 4 32-bit elements to be added for sum2 in xmm1
782 	pmaddwd		ones, %xmm2				// 4 32-bit elements to be added for sum2 in xmm2
783 	pmaddwd		ones, %xmm3				// 4 32-bit elements to be added for sum2 in xmm3
788 	paddd		%xmm1, %xmm3			// 4 32-bit elements to be added for sum2
789 	paddd		%xmm2, %xmm3			// 4 32-bit elements to be added for sum2
798 	paddd		%xmm2, %xmm3			// 2 32-bits elements in xmm3 to be added to sum2
800 	movd		%xmm3, %edx				// to be added to sum2
801 	psrlq		$$32, %xmm3				// another 32-bit to be added to sum2
802 	addq		%rdx, sum2				// sum2 += 1st half of update
803 	movd		%xmm3, %edx				// to be added to sum2
804 	addq		%rdx, sum2				// sum2 += 2nd half of update
807 	// update adler/sum2 according to a new 64-byte vector
821 	addq		%rdx, sum2				// sum2 += adler*64;
823 	pmaddubsw	%xmm8, %xmm3			// 8 16-bit words to be added for sum2 : 1st vector
824 	pmaddubsw	%xmm4, %xmm2			// 8 16-bit words to be added for sum2 : 4th vector
825 	pmaddwd		ones, %xmm3				// 4 32-bit elements to be added for sum2 in xmm3
826 	pmaddwd		ones, %xmm2				// 4 32-bit elements to be added for sum2 in xmm2
829 	paddd		%xmm2, %xmm3			// 4 32-bit elements to be added for sum2 in xmm3
837 	pmaddubsw	%xmm9, %xmm2			// 8 16-bit words to be added for sum2 : 2nd vector
838 	pmaddubsw	%xmm6, %xmm11			// 8 16-bit words to be added for sum2 : 3rd vector
839 	pmaddwd		ones, %xmm2				// 4 32-bit elements to be added for sum2 in xmm2
840 	pmaddwd		ones, %xmm11			// 4 32-bit elements to be added for sum2 in xmm11
845 	paddd		%xmm2, %xmm3			// 4 32-bit elements to be added for sum2 in xmm3
846 	paddd		%xmm11, %xmm3			// 4 32-bit elements to be added for sum2 in xmm3
854 	paddd		%xmm2, %xmm3			// 2 32-bits elements in xmm3 to be added to sum2
856 	movd		%xmm3, %edx				// to be added to sum2
857 	psrlq		$$32, %xmm3				// another 32-bit to be added to sum2
858 	addq		%rdx, sum2				// sum2 += 1st half of update
859 	movd		%xmm3, %edx				// to be added to sum2
860 	addq		%rdx, sum2				// sum2 += 2nd half of update
872 	movaps	(%rax), %xmm8			// coefficients for computing sum2 : pmaddubsw 64:49
873 	movaps	16(%rax), %xmm9			// coefficients for computing sum2 : pmaddubsw 48:33
874 	movaps	32(%rax), %xmm6			// coefficients for computing sum2 : pmaddubsw 32:17
875 	movaps	48(%rax), %xmm4			// coefficients for computing sum2 : pmaddubsw 16:1
876 	movaps	64(%rax), ones			// coefficients for computing sum2 : pmaddwd 1,1,...,1
890 	DO64							// 			update adler/sum2 for a 64-byte input
894 	DO48							//		update adler/sum2 for a 48-byte input
896 	modulo_BASE						// 		(adler/sum2) modulo BASE;
906 	DO64							//   update adler/sum2 for a 64-byte input
913 	DO32							//   update adler/sum2 for a 32-byte input
920 	DO16							// update adler/sum2 for a 16-byte input
931 	addq	adler, sum2				// 	sum2 += adler
937 	modulo_BASE						// (adler/sum2) modulo BASE;
939 	// construct 32-bit (sum2<<16 | adler) to be returned
941 	salq	$16, sum2				// sum2 <<16
943 	orq		sum2, %rax				// sum2<<16 | adler
971 	// coefficients for pmaddubsw instruction, used to generate 16-bit elements for sum2
1038 	// coefficients for pmaddwd, to combine into 4 32-bit elements for sum2