Lines Matching refs:sum2

7 	Given 2 unsigned 32-bit values adler and sum2 (both already reduced modulo BASE=65521) and a sequence of input bytes x[0],...,x[N-1].
8 The adler-sum2 pair is updated according to
12 sum2 = (sum2+adler)%BASE;
15 To reduce the number of modulo operations, it can be shown that, if the initial adler and sum2 are less than BASE(=65521),
16 adler and sum2 (in 32-bit representation), will never overflow for the next NMAX=5552 bytes. This simplifies the
22 sum2+=adler;
25 sum2%=BASE;
32 sum2+=adler;
35 This subtask turns out to be very vectorizable. Suppose we perform the adler/sum2 update once per K bytes,
39 sum2+=adler;
42 It can be shown that the sum2-adler pair can be updated according to
44 sum2 += adler*K;
46 sum2 += (x[0]*K + x[1]*(K-1) + ... + x[K-1]*1);
48 The last 2 equations show that the adler-sum2 pair update can be sped up using a vector processor.
51 [ K K-1 ... 1 ] for sum2 update.
57 followed by a single DO16 (K=16), before calling a modulo operation for adler and sum2.
61 before calling a modulo operation for adler and sum2.
77 // uLong adler32_vec(unsigned int adler, unsigned int sum2, const Bytef *buf, int len) {
87 // MOD(sum2);
97 // sum2 += adler;
100 // MOD(sum2);
102 // return adler | (sum2 << 16);
136 #define sum2 %esi // 12(%ebp)
143 movl 12(%ebp), sum2
154 mull sum2 // edx:eax = sum2 divided by BASE in Q47
155 shrl $$15, %edx // edx is now floor(sum2/BASE)
157 subl %eax, sum2 // sum2 -= edx*BASE
160 // update adler/sum2 according to a new 16-byte vector
166 pmaddubsw %xmm4, %xmm3 // 8 16-bit words to be added for sum2 in xmm3
169 pmaddwd ones, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
171 addl %edx, sum2 // sum2 += adler*16;
174 paddd %xmm2, %xmm3 // 2 32-bits elements in xmm3 to be added to sum2
176 movd %xmm3, %edx // to be added to sum2
177 psrlq $$32, %xmm3 // another 32-bit to be added to sum2
178 addl %edx, sum2 // sum2 += 1st half of update
179 movd %xmm3, %edx // to be added to sum2
180 addl %edx, sum2 // sum2 += 2nd half of update
183 // update adler/sum2 according to a new 32-byte vector
192 addl %edx, sum2 // sum2 += adler*32;
193 pmaddubsw %xmm6, %xmm3 // 8 16-bit words to be added for sum2 in xmm3
194 pmaddubsw %xmm4, %xmm2 // 8 16-bit words to be added for sum2 in xmm2
196 paddd %xmm2, %xmm3 // 8 16-bit words to be added for sum2 in xmm3
199 pmaddwd ones, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
203 paddd %xmm2, %xmm3 // 2 32-bits elements in xmm3 to be added to sum2
205 movd %xmm3, %edx // to be added to sum2
206 psrlq $$32, %xmm3 // another 32-bit to be added to sum2
207 addl %edx, sum2 // sum2 += 1st half of update
208 movd %xmm3, %edx // to be added to sum2
209 addl %edx, sum2 // sum2 += 2nd half of update
224 paddw %xmm2, %xmm3 // 8 16-bit words to be added for sum2 in xmm3
227 pmaddwd ones, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
229 addl %edx, sum2 // sum2 += adler*16;
232 paddd %xmm2, %xmm3 // 2 32-bits elements in xmm3 to be added to sum2
234 movd %xmm3, %edx // to be added to sum2
235 psrlq $$32, %xmm3 // another 32-bit to be added to sum2
236 addl %edx, sum2 // sum2 += 1st half of update
237 movd %xmm3, %edx // to be added to sum2
238 addl %edx, sum2 // sum2 += 2nd half of update
261 movaps (%eax), %xmm6 // coefficients for computing sum2 : pmaddubsw 32:17
262 movaps 16(%eax), %xmm4 // coefficients for computing sum2 : pmaddubsw 16:1
263 movaps 32(%eax), ones // coefficients for computing sum2 : pmaddwd 1,1,...,1
276 DO32 // update adler/sum2 for a 32-byte input
279 DO16 // update adler/sum2 for a 16-byte input
280 modulo_BASE // (adler/sum2) modulo BASE;
289 DO32 // update adler/sum2 for a 32-byte input
297 DO16 // update adler/sum2 for a 16-byte input
308 addl adler, sum2 // sum2 += adler
314 modulo_BASE // (adler/sum2) modulo BASE;
316 // construct 32-bit (sum2<<16 | adler) to be returned
318 sall $16, sum2 // sum2 <<16
320 orl sum2, %eax // sum2<<16 | adler
355 movaps 48(%eax), %xmm6 // coefficients for computing sum2 : pmaddubsw 16:9
356 movaps 64(%eax), %xmm4 // coefficients for computing sum2 : pmaddubsw 8:1
357 movaps 80(%eax), ones // coefficients for computing sum2 : pmaddwd 1,1,...,1
370 DO16_nossse3 // update adler/sum2 for a 16-byte input
374 modulo_BASE // (adler/sum2) modulo BASE;
383 DO16_nossse3 // update adler/sum2 for a 16-byte input
425 // coefficients for pmaddwd, to combine into 4 32-bit elements for sum2
455 // coefficients for pmaddwd, to combine into 4 32-bit elements for sum2
479 // sum2 : rsi
499 #define sum2 %rsi // 24(%ebp)
515 mull %esi // edx:eax = sum2 divided by BASE in Q47
516 shrl $$15, %edx // edx is now floor(sum2/BASE)
518 subq %rax, sum2 // sum2 -= edx*BASE
521 // update adler/sum2 according to a new 16-byte vector, no ssse3
533 paddw %xmm2, %xmm3 // 8 16-bit words to be added for sum2 in xmm3
536 pmaddwd ones, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
538 add %rdx, sum2 // sum2 += adler*16;
541 paddd %xmm2, %xmm3 // 2 32-bits elements in xmm3 to be added to sum2
543 movd %xmm3, %edx // to be added to sum2
544 psrlq $$32, %xmm3 // another 32-bit to be added to sum2
545 addq %rdx, sum2 // sum2 += 1st half of update
546 movd %xmm3, %edx // to be added to sum2
547 addq %rdx, sum2 // sum2 += 2nd half of update
559 movaps (%rax), %xmm6 // coefficients for computing sum2 : pmaddubsw 16:9
560 movaps 16(%rax), %xmm4 // coefficients for computing sum2 : pmaddubsw 8:1
561 movaps 32(%rax), ones // coefficients for computing sum2 : pmaddwd 1,1,...,1
573 DO16_nossse3 // update adler/sum2 for a 16-byte input
577 modulo_BASE // (adler/sum2) modulo BASE;
586 DO16_nossse3 // update adler/sum2 for a 16-byte input
597 addq adler, sum2 // sum2 += adler
603 modulo_BASE // (adler/sum2) modulo BASE;
605 // construct 32-bit (sum2<<16 | adler) to be returned
607 salq $16, sum2 // sum2 <<16
609 orq sum2, %rax // sum2<<16 | adler
651 // coefficients for pmaddwd, to combine into 4 32-bit elements for sum2
672 // sum2 : rsi
697 #define sum2 %rsi // 24(%ebp)
706 // update adler/sum2 according to a new 16-byte vector
712 pmaddubsw %xmm4, %xmm3 // 8 16-bit words to be added for sum2 in xmm3
715 pmaddwd ones, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
717 addq %rdx, sum2 // sum2 += adler*16;
720 paddd %xmm2, %xmm3 // 2 32-bits elements in xmm3 to be added to sum2
722 movd %xmm3, %edx // to be added to sum2
723 psrlq $$32, %xmm3 // another 32-bit to be added to sum2
724 addq %rdx, sum2 // sum2 += 1st half of update
725 movd %xmm3, %edx // to be added to sum2
726 addq %rdx, sum2 // sum2 += 2nd half of update
729 // update adler/sum2 according to a new 32-byte vector
738 addq %rdx, sum2 // sum2 += adler*32;
739 pmaddubsw %xmm6, %xmm3 // 8 16-bit words to be added for sum2 in xmm3
740 pmaddubsw %xmm4, %xmm2 // 8 16-bit words to be added for sum2 in xmm2
742 paddw %xmm2, %xmm3 // 8 16-bit words to be added for sum2 in xmm3
745 pmaddwd ones, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
749 paddd %xmm2, %xmm3 // 2 32-bits elements in xmm3 to be added to sum2
751 movd %xmm3, %edx // to be added to sum2
752 psrlq $$32, %xmm3 // another 32-bit to be added to sum2
753 addq %rdx, sum2 // sum2 += 1st half of update
754 movd %xmm3, %edx // to be added to sum2
755 addq %rdx, sum2 // sum2 += 2nd half of update
758 // update adler/sum2 according to a new 48-byte vector
775 addq %rdx, sum2 // sum2 += adler*48;
777 pmaddubsw %xmm9, %xmm1 // 8 16-bit words to be added for sum2 : 1st vector
778 pmaddubsw %xmm6, %xmm2 // 8 16-bit words to be added for sum2 : 2nd vector
779 pmaddubsw %xmm4, %xmm3 // 8 16-bit words to be added for sum2 : 3rd vector
781 pmaddwd ones, %xmm1 // 4 32-bit elements to be added for sum2 in xmm1
782 pmaddwd ones, %xmm2 // 4 32-bit elements to be added for sum2 in xmm2
783 pmaddwd ones, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
788 paddd %xmm1, %xmm3 // 4 32-bit elements to be added for sum2
789 paddd %xmm2, %xmm3 // 4 32-bit elements to be added for sum2
798 paddd %xmm2, %xmm3 // 2 32-bits elements in xmm3 to be added to sum2
800 movd %xmm3, %edx // to be added to sum2
801 psrlq $$32, %xmm3 // another 32-bit to be added to sum2
802 addq %rdx, sum2 // sum2 += 1st half of update
803 movd %xmm3, %edx // to be added to sum2
804 addq %rdx, sum2 // sum2 += 2nd half of update
807 // update adler/sum2 according to a new 64-byte vector
821 addq %rdx, sum2 // sum2 += adler*64;
823 pmaddubsw %xmm8, %xmm3 // 8 16-bit words to be added for sum2 : 1st vector
824 pmaddubsw %xmm4, %xmm2 // 8 16-bit words to be added for sum2 : 4th vector
825 pmaddwd ones, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
826 pmaddwd ones, %xmm2 // 4 32-bit elements to be added for sum2 in xmm2
829 paddd %xmm2, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
837 pmaddubsw %xmm9, %xmm2 // 8 16-bit words to be added for sum2 : 2nd vector
838 pmaddubsw %xmm6, %xmm11 // 8 16-bit words to be added for sum2 : 3rd vector
839 pmaddwd ones, %xmm2 // 4 32-bit elements to be added for sum2 in xmm2
840 pmaddwd ones, %xmm11 // 4 32-bit elements to be added for sum2 in xmm11
845 paddd %xmm2, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
846 paddd %xmm11, %xmm3 // 4 32-bit elements to be added for sum2 in xmm3
854 paddd %xmm2, %xmm3 // 2 32-bits elements in xmm3 to be added to sum2
856 movd %xmm3, %edx // to be added to sum2
857 psrlq $$32, %xmm3 // another 32-bit to be added to sum2
858 addq %rdx, sum2 // sum2 += 1st half of update
859 movd %xmm3, %edx // to be added to sum2
860 addq %rdx, sum2 // sum2 += 2nd half of update
872 movaps (%rax), %xmm8 // coefficients for computing sum2 : pmaddubsw 64:49
873 movaps 16(%rax), %xmm9 // coefficients for computing sum2 : pmaddubsw 48:33
874 movaps 32(%rax), %xmm6 // coefficients for computing sum2 : pmaddubsw 32:17
875 movaps 48(%rax), %xmm4 // coefficients for computing sum2 : pmaddubsw 16:1
876 movaps 64(%rax), ones // coefficients for computing sum2 : pmaddwd 1,1,...,1
890 DO64 // update adler/sum2 for a 64-byte input
894 DO48 // update adler/sum2 for a 48-byte input
896 modulo_BASE // (adler/sum2) modulo BASE;
906 DO64 // update adler/sum2 for a 64-byte input
913 DO32 // update adler/sum2 for a 32-byte input
920 DO16 // update adler/sum2 for a 16-byte input
931 addq adler, sum2 // sum2 += adler
937 modulo_BASE // (adler/sum2) modulo BASE;
939 // construct 32-bit (sum2<<16 | adler) to be returned
941 salq $16, sum2 // sum2 <<16
943 orq sum2, %rax // sum2<<16 | adler
971 // coefficients for pmaddubsw instruction, used to generate 16-bit elements for sum2
1038 // coefficients for pmaddwd, to combine into 4 32-bit elements for sum2