/* Apple Copyright 2009
   CoreOS - vector & Numerics, cclee 10-22-09

   This following source code implements a vectorized version of adler32 computation that is defined in zlib.
   The target architectures are x86_64 and i386.

   Given 2 unsigned 32-bit adler and sum2 (both pre-modulo by BASE=65521) and a sequence of input bytes x[0],...x[N-1].
   The adler-sum2 pair is updated according to

        for (i=0;i<N;i++) {
            adler = (adler+x[i])%BASE;
            sum2 = (sum2+adler)%BASE;
        }

   To reduce/save the modulo operations, it can be shown that, if initial adler and sum2 are less than BASE(=65521),
   adler and sum2 (in 32-bit representation), will never overflow for the next NMAX=5552 bytes. This simplifies the
   algorithm to

        for (i=0;i<N;i+=NMAX) {
            for (k=0;k<NMAX;k++) {
                adler+=x[i+k];
                sum2+=adler;
            }
            adler%=BASE;
            sum2%=BASE;
        }

   The hand optimization of this function is now reduced to

        for (k=0;k<NMAX;k++) {
            adler+=x[k];
            sum2+=adler;
        }

   This subtask turns out to be very vectorizable. Suppose we perform the adler/sum2 update once per K bytes,

        for (k=0;k<K;k++) {
            adler+=x[k];
            sum2+=adler;
        }

   It can be shown that the sum2-adler pair can be updated according to

        sum2 += adler*K;
        adler += (x[0] + x[1] + ... + x[K-1]);
        sum2 += (x[0]*K + x[1]*(K-1) + ... + x[K-1]*1);

   The last 2 equations obviously show that the adler-sum2 pair update can be sped up using a vector processor.
   The input vector [ x[0] x[1] ... x[K-1] ]. And we need two coefficient vectors
        [ 1 1 1 ... 1 ]   for adler update.
        [ K K-1 ... 1 ]   for sum2 update.

   The implementation below reads vector (K=16,32,48,64) into xmm registers, and sets up coefficient vectors in xmm
   registers. It then uses SSE instructions to perform the aforementioned vector computation.

   For i386, NMAX/16 = 347, whenever possible (NMAX-bytes block), it calls 173 times of macro code DO32 (K=32),
   followed by a single DO16 (K=16), before calling a modulo operation for adler and sum2.

   For x86_64 (where more xmm registers are available), NMAX/64 = 86, whenever possible (NMAX-bytes block),
   it calls 86 times of macro code DO64 (K=64), followed by a single DO48 (K=48),
   before calling a modulo operation for adler and sum2.
*/

/* added cpu_capability to detect kHasSupplementalSSE3 to branch into code w or wo SupplementalSSE3

    Previously, ssse3 code was intentionally turned off, because Yonah does not support ssse3
    add code here to probe cpu_capabilities for ssse3 support
    if ssse3 is supported, branch to ssse3-based code, otherwise use the original code

    cclee 5-3-10
*/

// NOTE on macro syntax: inside Apple-as `.macro` bodies, `$` introduces a macro
// argument, so `$$` is the required escape for a literal `$` immediate prefix.

#define BASE 65521  /* largest prime smaller than 65536 */
#define NMAX 5552   /* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */

// Reference C implementation (zlib) that this file vectorizes:
//
// uLong adler32_vec(unsigned int adler, unsigned int sum2, const Bytef *buf, int len) {
//    unsigned n;
//    while (len >= NMAX) {
//        len -= NMAX;
//        n = NMAX / 16;          /* NMAX is divisible by 16 */
//        do {
//            DO16(buf);          /* 16 sums unrolled */
//            buf += 16;
//        } while (--n);
//        MOD(adler);
//        MOD(sum2);
//    }
//    if (len) {                  /* avoid modulos if none remaining */
//        while (len >= 16) {
//            len -= 16;
//            DO16(buf);
//            buf += 16;
//        }
//        while (len--) {
//            adler += *buf++;
//            sum2 += adler;
//        }
//        MOD(adler);
//        MOD(sum2);
//    }
//    return adler | (sum2 << 16);
// }

#if (defined __i386__ || defined __x86_64__)

#include <i386/cpu_capabilities.h>

	.text
	.align 4,0x90
.globl _adler32_vec
_adler32_vec:

#if (defined __i386__)

	// i386 cdecl entry: args on the stack, establish frame and save
	// callee-saved registers we use (ebx/edi/esi).
	pushl   %ebp
	movl    %esp, %ebp

	pushl   %ebx
	pushl   %edi
	pushl   %esi

#ifdef KERNEL                       // if this is for kernel, need to save xmm registers
	subl    $140, %esp              // 128 bytes for %xmm0-%xmm7 + 12 to realign %esp to a 16-byte boundary
	movaps  %xmm0, 0(%esp)          // save xmm0 (offset already accounts for the 12 bytes of ebx/edi/esi)
	movaps  %xmm1, 16(%esp)         // save xmm1
	movaps  %xmm2, 32(%esp)         // save xmm2
	movaps  %xmm3, 48(%esp)         // save xmm3
	movaps  %xmm4, 64(%esp)         // save xmm4
	movaps  %xmm5, 80(%esp)         // save xmm5
	movaps  %xmm6, 96(%esp)         // save xmm6
	movaps  %xmm7, 112(%esp)        // save xmm7, if this is for SSSE3 or above
#endif

	// Register roles for the i386 body.
	#define adler	%edi            // 8(%ebp)
	#define sum2	%esi            // 12(%ebp)
	#define buf	%ecx            // 16(%ebp)
	#define len	%ebx            // 20(%ebp)
	#define zero	%xmm0           // all-zero vector (psadbw/punpcklbw operand)
	#define ones	%xmm5           // eight 16-bit 1s (pmaddwd horizontal-add operand)

	movl    8(%ebp), adler
	movl    12(%ebp), sum2
	movl    16(%ebp), buf           // use ecx as buf pointer
	movl    20(%ebp), len

	// adler %= BASE; sum2 %= BASE; via reciprocal multiplication
	// (floor(2^47/BASE) = 2147975281 = 0x80078071, stored as the signed
	// 32-bit immediate -2146992015). Clobbers %eax/%edx and flags.
	.macro modulo_BASE
	movl    $$-2146992015, %eax     // 1/BASE in Q47 (unsigned reciprocal)
	mull    adler                   // edx:eax = adler * (1/BASE in Q47)
	shrl    $$15, %edx              // edx = adler / BASE (floor)
	imull   $$BASE, %edx, %edx      // edx = (adler/BASE) * BASE
	subl    %edx, adler             // adler -= (adler/BASE)*BASE
	movl    $$-2146992015, %eax     // 1/BASE in Q47
	mull    sum2                    // edx:eax = sum2 * (1/BASE in Q47)
	shrl    $$15, %edx              // edx = sum2 / BASE (floor)
	imull   $$BASE, %edx, %eax      // eax = (sum2/BASE) * BASE
	subl    %eax, sum2              // sum2 -= (sum2/BASE)*BASE
	.endmacro

	// update adler/sum2 according to a new 16-byte vector (SSSE3 path)
	.macro DO16
	movaps  (buf), %xmm1            // 16 bytes vector, in xmm1
	movaps  %xmm1, %xmm3            // a copy of the vector, used for unsigned byte in the destination of pmaddubsw
	addl    $$16, buf               // buf -> next vector
	psadbw  zero, %xmm1             // 2 16-bit byte-sums (for adler) in the low words of xmm1's two qwords
	pmaddubsw %xmm4, %xmm3          // 8 16-bit words to be added for sum2 in xmm3 (weights 16:1)
	imull   $$16, adler, %edx       // edx = 16*adler;
	movhlps %xmm1, %xmm2            // higher 16-bit word (for adler) in xmm2
	pmaddwd ones, %xmm3             // 4 32-bit elements to be added for sum2 in xmm3
	paddq   %xmm2, %xmm1            // xmm1 lower 32-bit to be added to adler
	addl    %edx, sum2              // sum2 += adler*16;
	movhlps %xmm3, %xmm2            // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
	movd    %xmm1, %edx             // to be added to adler
	paddd   %xmm2, %xmm3            // 2 32-bits elements in xmm3 to be added to sum2
	addl    %edx, adler             // update adler
	movd    %xmm3, %edx             // to be added to sum2
	psrlq   $$32, %xmm3             // another 32-bit to be added to sum2
	addl    %edx, sum2              // sum2 += 1st half of update
	movd    %xmm3, %edx             // to be added to sum2
	addl    %edx, sum2              // sum2 += 2nd half of update
	.endm

	// update adler/sum2 according to a new 32-byte vector (SSSE3 path)
	.macro DO32
	imull   $$32, adler, %edx       // edx = 32*adler
	movaps  (buf), %xmm1            // 1st 16 bytes vector
	movaps  16(buf), %xmm7          // 2nd 16 bytes vector
	movaps  %xmm1, %xmm3            // a copy of 1st vector, used for unsigned byte in the destination of pmaddubsw
	movaps  %xmm7, %xmm2            // a copy of 2nd vector, used for unsigned byte in the destination of pmaddubsw
	psadbw  zero, %xmm1             // 2 16-bit words to be added for adler in xmm1
	psadbw  zero, %xmm7             // 2 16-bit words to be added for adler in xmm7
	addl    %edx, sum2              // sum2 += adler*32;
	pmaddubsw %xmm6, %xmm3          // 8 16-bit words to be added for sum2 in xmm3 (weights 32:17)
	pmaddubsw %xmm4, %xmm2          // 8 16-bit words to be added for sum2 in xmm2 (weights 16:1)
	paddd   %xmm7, %xmm1            // 2 16-bit words to be added for adler in xmm1
	// NOTE(review): paddd on 16-bit word data where the x86_64 DO32 uses paddw;
	// safe here because each word lane sum is < 2^15 (max 255*(32+31) + 255*(16+15)),
	// so no carry can cross a 16-bit lane — but confirm before changing either side.
	paddd   %xmm2, %xmm3            // 8 16-bit words to be added for sum2 in xmm3
	addl    $$32, buf               // buf -> vector for next iteration
	movhlps %xmm1, %xmm2            // higher 16-bit word (for adler) in xmm2
	pmaddwd ones, %xmm3             // 4 32-bit elements to be added for sum2 in xmm3
	paddq   %xmm2, %xmm1            // xmm1 lower 32-bit to be added to adler
	movhlps %xmm3, %xmm2            // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
	movd    %xmm1, %edx             // to be added to adler
	paddd   %xmm2, %xmm3            // 2 32-bits elements in xmm3 to be added to sum2
	addl    %edx, adler             // update adler
	movd    %xmm3, %edx             // to be added to sum2
	psrlq   $$32, %xmm3             // another 32-bit to be added to sum2
	addl    %edx, sum2              // sum2 += 1st half of update
	movd    %xmm3, %edx             // to be added to sum2
	addl    %edx, sum2              // sum2 += 2nd half of update
	.endm

	// this defines the macro DO16 for when SSSE3 is not supported:
	// widen bytes to words with punpcklbw, then weight with pmullw
	// instead of the single SSSE3 pmaddubsw.
	.macro DO16_nossse3
	movaps  (buf), %xmm1            // 16 bytes vector
	movaps  %xmm1, %xmm3            // a copy of the vector, the lower 8 bytes to be shuffled into 8 words
	movaps  %xmm1, %xmm2            // a copy of the vector, the higher 8 bytes to be shuffled into 8 words
	psrldq  $$8, %xmm2              // shift down 8 bytes, to reuse the shuffle vector
	punpcklbw zero, %xmm3           // convert lower 8 bytes into 8 words
	punpcklbw zero, %xmm2           // convert higher 8 bytes into 8 words
	pmullw  %xmm6, %xmm3            // lower 8 words * 16:9
	pmullw  %xmm4, %xmm2            // higher 8 words * 8:1
	addl    $$16, buf               // buf -> next vector
	psadbw  zero, %xmm1             // 2 16-bit words to be added for adler in xmm1
	paddw   %xmm2, %xmm3            // 8 16-bit words to be added for sum2 in xmm3
	imull   $$16, adler, %edx       // edx = 16*adler;
	movhlps %xmm1, %xmm2            // higher 16-bit word (for adler) in xmm2
	pmaddwd ones, %xmm3             // 4 32-bit elements to be added for sum2 in xmm3
	paddq   %xmm2, %xmm1            // xmm1 lower 32-bit to be added to adler
	addl    %edx, sum2              // sum2 += adler*16;
	movhlps %xmm3, %xmm2            // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
	movd    %xmm1, %edx             // to be added to adler
	paddd   %xmm2, %xmm3            // 2 32-bits elements in xmm3 to be added to sum2
	addl    %edx, adler             // update adler
	movd    %xmm3, %edx             // to be added to sum2
	psrlq   $$32, %xmm3             // another 32-bit to be added to sum2
	addl    %edx, sum2              // sum2 += 1st half of update
	movd    %xmm3, %edx             // to be added to sum2
	addl    %edx, sum2              // sum2 += 2nd half of update
	.endm

	// Probe CPU capabilities: kernel reads the kernel-global variable,
	// user space reads the commpage copy.
#ifdef KERNEL
	leal    __cpu_capabilities, %eax    // %eax -> __cpu_capabilities
	mov     (%eax), %eax                // %eax = __cpu_capabilities
#else
	mov     _COMM_PAGE_CPU_CAPABILITIES, %eax
#endif
	test    $(kHasSupplementalSSE3), %eax   // __cpu_capabilities & kHasSupplementalSSE3
	je      L_no_ssse3                  // bit clear -> use non-SSSE3 fallback

	// i386 adler32 with ssse3

	// need to fill up xmm4/xmm5/xmm6 only if len>=16
	cmpl    $16, len
	jl      L_skip_loading_tables

	// set up table starting address to %eax
	leal    sum2_coefficients, %eax

	// reading coefficients
	pxor    zero, zero
	movaps  (%eax), %xmm6           // coefficients for computing sum2 : pmaddubsw 32:17
	movaps  16(%eax), %xmm4         // coefficients for computing sum2 : pmaddubsw 16:1
	movaps  32(%eax), ones          // coefficients for computing sum2 : pmaddwd 1,1,...,1

L_skip_loading_tables:

	cmpl    $NMAX, len              // len vs NMAX
	jl      len_lessthan_NMAX       // if (len < NMAX), skip the following NMAX batches processing

len_ge_NMAX_loop:                   // while (len>=NMAX) {

	subl    $NMAX, len              // len -= NMAX
	movl    $(NMAX/32), %eax        // n = NMAX/32

n_loop:                             // do {
	DO32                            // update adler/sum2 for a 32-byte input
	decl    %eax                    // n--;
	jg      n_loop                  // } while (n);
	DO16                            // update adler/sum2 for a 16-byte input (NMAX = 173*32 + 16)
	modulo_BASE                     // (adler/sum2) modulo BASE;
	cmpl    $NMAX, len              //
	jge     len_ge_NMAX_loop        // } /* len>=NMAX */

len_lessthan_NMAX:

	subl    $32, len                // pre-decrement len by 32
	jl      len_lessthan_32         // if len < 32, skip the 32-vector code
len32_loop:                         // while (len>=32) {
	DO32                            // update adler/sum2 for a 32-byte input
	subl    $32, len                // len -= 32;
	jge     len32_loop              // }

len_lessthan_32:

	addl    $(32-16), len           // post-increment by 32 + pre-decrement by 16 on len
	jl      L_len_lessthan_16       // if len < 16, skip the 16-vector code
	DO16                            // update adler/sum2 for a 16-byte input
	subl    $16, len                // len -= 16;

L_len_lessthan_16:                  // shared tail: also the join point for the non-SSSE3 path below
	addl    $16, len                // post-increment len by 16
	jz      len_is_zero             // if len==0, branch over scalar processing

0:                                  // while (len) {
	movzbl  (buf), %edx             // new input byte
	incl    buf                     // buf++
	addl    %edx, adler             // adler += *buf
	addl    adler, sum2             // sum2 += adler
	subl    $1, len                 // len--
	jg      0b                      // }

len_is_zero:

	modulo_BASE                     // (adler/sum2) modulo BASE;

	// construct 32-bit (sum2<<16 | adler) to be returned

	sall    $16, sum2               // sum2 <<16
	movl    adler, %eax             // adler
	orl     sum2, %eax              // sum2<<16 | adler


#ifdef KERNEL                       // if this is for kernel code, need to restore xmm registers
	movaps  (%esp), %xmm0           // restore xmm0
	movaps  16(%esp), %xmm1         // restore xmm1
	movaps  32(%esp), %xmm2         // restore xmm2
	movaps  48(%esp), %xmm3         // restore xmm3
	movaps  64(%esp), %xmm4         // restore xmm4
	movaps  80(%esp), %xmm5         // restore xmm5
	movaps  96(%esp), %xmm6         // restore xmm6
	movaps  112(%esp), %xmm7        // restore xmm7, if this is for SSSE3 or above
	addl    $140, %esp              // we've already restored %xmm0-%xmm7 from stack
#endif

	popl    %esi
	popl    %edi
	popl    %ebx
	leave                           // pop ebp out from stack
	ret


L_no_ssse3:

	// i386 adler32 without ssse3

	// need to fill up xmm4/xmm5/xmm6 only if len>=16
	cmpl    $16, len
	jl      2f

	// set up table starting address to %eax
	leal    sum2_coefficients, %eax

	// reading coefficients (word tables at offsets 48/64/80 in sum2_coefficients)
	pxor    zero, zero
	movaps  48(%eax), %xmm6         // coefficients for computing sum2 : pmullw 16:9
	movaps  64(%eax), %xmm4         // coefficients for computing sum2 : pmullw 8:1
	movaps  80(%eax), ones          // coefficients for computing sum2 : pmaddwd 1,1,...,1

2:

	cmpl    $NMAX, len              // len vs NMAX
	jl      3f                      // if (len < NMAX), skip the following NMAX batches processing

0:                                  // while (len>=NMAX) {

	subl    $NMAX, len              // len -= NMAX
	movl    $(NMAX/16), %eax        // n = NMAX/16

1:                                  // do {
	DO16_nossse3                    // update adler/sum2 for a 16-byte input
	decl    %eax                    // n--;
	jg      1b                      // } while (n);

	modulo_BASE                     // (adler/sum2) modulo BASE;

	cmpl    $NMAX, len              //
	jge     0b                      // } /* len>=NMAX */

3:

	subl    $16, len                // pre-decrement len by 16
	jl      L_len_lessthan_16       // if len < 16, skip the 16-vector code
	DO16_nossse3                    // update adler/sum2 for a 16-byte input
	subl    $16, len                // len -= 16;
	jmp     L_len_lessthan_16       // rejoin the shared scalar tail/epilogue above


	.const
	.align  4                       // Mach-O: .align takes log2, so this is 16-byte alignment (required by movaps)
sum2_coefficients:                  // used for vectorizing adler32 computation

	// byte coefficients for pmaddubsw: 32,31,...,17 at offset 0; 16,...,1 at offset 16
	.byte   32
	.byte   31
	.byte   30
	.byte   29
	.byte   28
	.byte   27
	.byte   26
	.byte   25
	.byte   24
	.byte   23
	.byte   22
	.byte   21
	.byte   20
	.byte   19
	.byte   18
	.byte   17
	.byte   16
	.byte   15
	.byte   14
	.byte   13
	.byte   12
	.byte   11
	.byte   10
	.byte   9
	.byte   8
	.byte   7
	.byte   6
	.byte   5
	.byte   4
	.byte   3
	.byte   2
	.byte   1

	// coefficients for pmaddwd, to combine into 4 32-bit elements for sum2 (offset 32)
	.word   1
	.word   1
	.word   1
	.word   1
	.word   1
	.word   1
	.word   1
	.word   1


	// data for without ssse3: word coefficients 16..9 (offset 48) and 8..1 (offset 64) for pmullw

	.word   16
	.word   15
	.word   14
	.word   13
	.word   12
	.word   11
	.word   10
	.word   9
	.word   8
	.word   7
	.word   6
	.word   5
	.word   4
	.word   3
	.word   2
	.word   1

	// coefficients for pmaddwd, to combine into 4 32-bit elements for sum2 (offset 80)
	.word   1
	.word   1
	.word   1
	.word   1
	.word   1
	.word   1
	.word   1
	.word   1

#else   // (defined __x86_64__)

	// Dispatch on SSSE3 support before touching the stack; %rax is scratch.
	movq    __cpu_capabilities@GOTPCREL(%rip), %rax     // %rax -> __cpu_capabilities
	mov     (%rax), %eax                                // %eax = __cpu_capabilities
	test    $(kHasSupplementalSSE3), %eax               // __cpu_capabilities & kHasSupplementalSSE3
	jne     L_has_ssse3

	// ----------------------------------------------------------------------------------
	// the following is added for x86_64 without SSSE3 support
	// it is essentially a translated copy of the i386 code without SSSE3 code
	// ----------------------------------------------------------------------------------

	// input (SysV AMD64):
	//      adler : rdi
	//      sum2  : rsi
	//      buf   : rdx
	//      len   : rcx

	pushq   %rbp
	movq    %rsp, %rbp
	pushq   %rbx

#ifdef KERNEL                       // if for kernel, save the xmm registers this path uses (%xmm0-%xmm6)
	subq    $200, %rsp              // frame sized for up to %xmm0-%xmm11 (shared layout with the SSSE3 path)
	movaps  %xmm0, -32(%rbp)
	movaps  %xmm1, -48(%rbp)
	movaps  %xmm2, -64(%rbp)
	movaps  %xmm3, -80(%rbp)
	movaps  %xmm4, -96(%rbp)
	movaps  %xmm5, -112(%rbp)
	movaps  %xmm6, -128(%rbp)
#endif

	// Register roles for the x86_64 body.
	#define adler	%rdi            // 1st argument
	#define sum2	%rsi            // 2nd argument
	#define buf	%rcx            // copied from %rdx (3rd argument)
	#define len	%rbx            // copied from %rcx (4th argument); %rbx is callee-saved
	#define zero	%xmm0
	#define ones	%xmm5

	movq    %rcx, len
	movq    %rdx, buf

	// adler %= BASE; sum2 %= BASE; same Q47 reciprocal trick as the i386
	// version; both values are < 2^32 here so the 32-bit multiply suffices.
	// Clobbers %rax/%rdx and flags.
	.macro modulo_BASE
	movl    $$-2146992015, %eax     // 1/BASE in Q47
	mull    %edi                    // edx:eax = adler * (1/BASE in Q47)
	shrl    $$15, %edx              // edx = adler / BASE (floor)
	imull   $$BASE, %edx, %edx      // edx * BASE (32-bit result zero-extends into %rdx)
	subq    %rdx, adler             // adler -= (adler/BASE)*BASE
	movl    $$-2146992015, %eax     // 1/BASE in Q47
	mull    %esi                    // edx:eax = sum2 * (1/BASE in Q47)
	shrl    $$15, %edx              // edx = sum2 / BASE (floor)
	imull   $$BASE, %edx, %eax      // eax = edx * BASE
	subq    %rax, sum2              // sum2 -= (sum2/BASE)*BASE
	.endmacro

	// update adler/sum2 according to a new 16-byte vector, no ssse3
	.macro DO16_nossse3
	movaps  (buf), %xmm1            // 16 bytes vector
	movaps  %xmm1, %xmm3            // a copy of the vector, the lower 8 bytes to be shuffled into 8 words
	movaps  %xmm1, %xmm2            // a copy of the vector, the higher 8 bytes to be shuffled into 8 words
	psrldq  $$8, %xmm2              // shift down 8 bytes, to reuse the shuffle vector
	punpcklbw zero, %xmm3           // convert lower 8 bytes into 8 words
	punpcklbw zero, %xmm2           // convert higher 8 bytes into 8 words
	pmullw  %xmm6, %xmm3            // lower 8 words * 16:9
	pmullw  %xmm4, %xmm2            // higher 8 words * 8:1
	add     $$16, buf               // buf -> next vector
	psadbw  zero, %xmm1             // 2 16-bit words to be added for adler in xmm1
	paddw   %xmm2, %xmm3            // 8 16-bit words to be added for sum2 in xmm3
	imulq   $$16, adler, %rdx       // rdx = 16*adler;
	movhlps %xmm1, %xmm2            // higher 16-bit word (for adler) in xmm2
	pmaddwd ones, %xmm3             // 4 32-bit elements to be added for sum2 in xmm3
	paddq   %xmm2, %xmm1            // xmm1 lower 32-bit to be added to adler
	add     %rdx, sum2              // sum2 += adler*16;
	movhlps %xmm3, %xmm2            // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
	movd    %xmm1, %edx             // to be added to adler (movd zero-extends into %rdx)
	paddd   %xmm2, %xmm3            // 2 32-bits elements in xmm3 to be added to sum2
	addq    %rdx, adler             // update adler
	movd    %xmm3, %edx             // to be added to sum2
	psrlq   $$32, %xmm3             // another 32-bit to be added to sum2
	addq    %rdx, sum2              // sum2 += 1st half of update
	movd    %xmm3, %edx             // to be added to sum2
	addq    %rdx, sum2              // sum2 += 2nd half of update
	.endm

	// need to fill up xmm4/xmm5/xmm6 only if len>=16
	cmpq    $16, len
	jl      0f

	// set up table starting address to %rax
	leaq    sum2_coefficients_nossse3(%rip), %rax

	// reading coefficients
	pxor    zero, zero
	movaps  (%rax), %xmm6           // coefficients for computing sum2 : pmullw 16:9
	movaps  16(%rax), %xmm4         // coefficients for computing sum2 : pmullw 8:1
	movaps  32(%rax), ones          // coefficients for computing sum2 : pmaddwd 1,1,...,1
0:

	cmp     $NMAX, len              // len vs NMAX
	jl      3f                      // if (len < NMAX), skip the following NMAX batches processing

0:                                  // while (len>=NMAX) {

	sub     $NMAX, len              // len -= NMAX
	mov     $(NMAX/16), %eax        // n = NMAX/16

1:                                  // do {
	DO16_nossse3                    // update adler/sum2 for a 16-byte input
	decl    %eax                    // n--;
	jg      1b                      // } while (n);

	modulo_BASE                     // (adler/sum2) modulo BASE;

	cmp     $NMAX, len              //
	jge     0b                      // } /* len>=NMAX */

3:

	sub     $16, len                // pre-decrement len by 16
	jl      2f                      // if len < 16, skip the 16-vector code
	DO16_nossse3                    // update adler/sum2 for a 16-byte input
	sub     $16, len                // len -= 16;

2:
	add     $16, len                // post-increment len by 16
	jz      1f                      // if len==0, branch over scalar processing

0:                                  // while (len) {
	movzbq  (buf), %rdx             // new input byte
	incq    buf                     // buf++
	addq    %rdx, adler             // adler += *buf
	addq    adler, sum2             // sum2 += adler
	decq    len                     // len--
	jg      0b                      // }

1:

	modulo_BASE                     // (adler/sum2) modulo BASE;

	// construct 32-bit (sum2<<16 | adler) to be returned

	salq    $16, sum2               // sum2 <<16
	movq    adler, %rax             // adler
	orq     sum2, %rax              // sum2<<16 | adler

#ifdef KERNEL                       // if this is for kernel code, need to restore xmm registers
	movaps  -32(%rbp), %xmm0
	movaps  -48(%rbp), %xmm1
	movaps  -64(%rbp), %xmm2
	movaps  -80(%rbp), %xmm3
	movaps  -96(%rbp), %xmm4
	movaps  -112(%rbp), %xmm5
	movaps  -128(%rbp), %xmm6
	addq    $200, %rsp              // we've already restored %xmm0-%xmm6 from stack
#endif

	popq    %rbx
	leave
	ret



	.const
	.align  4                       // 16-byte alignment (Mach-O .align is log2) — required by movaps
sum2_coefficients_nossse3:          // used for vectorizing adler32 computation

	// data for without ssse3: word coefficients for pmullw

	.word   16
	.word   15
	.word   14
	.word   13
	.word   12
	.word   11
	.word   10
	.word   9
	.word   8
	.word   7
	.word   6
	.word   5
	.word   4
	.word   3
	.word   2
	.word   1

	// coefficients for pmaddwd, to combine into 4 32-bit elements for sum2
	.word   1
	.word   1
	.word   1
	.word   1
	.word   1
	.word   1
	.word   1
	.word   1


	.text

	// ----------------------------------------------------------------------------------
	// the following is the original x86_64 adler32_vec code that uses SSSE3 instructions
	// ----------------------------------------------------------------------------------

L_has_ssse3:

	// input (SysV AMD64):
	//      adler : rdi
	//      sum2  : rsi
	//      buf   : rdx
	//      len   : rcx

	pushq   %rbp
	movq    %rsp, %rbp
	pushq   %rbx

#ifdef KERNEL                       // if for kernel, save %xmm0-%xmm11
	subq    $200, %rsp              // allocate for %xmm0-%xmm11 (192 bytes), extra 8 to align %rsp to 16-byte boundary
	movaps  %xmm0, -32(%rbp)
	movaps  %xmm1, -48(%rbp)
	movaps  %xmm2, -64(%rbp)
	movaps  %xmm3, -80(%rbp)
	movaps  %xmm4, -96(%rbp)
	movaps  %xmm5, -112(%rbp)
	movaps  %xmm6, -128(%rbp)
	movaps  %xmm7, -144(%rbp)
	movaps  %xmm8, -160(%rbp)
	movaps  %xmm9, -176(%rbp)
	movaps  %xmm10, -192(%rbp)
	movaps  %xmm11, -208(%rbp)
#endif

	// Same register roles as the non-SSSE3 x86_64 path (identical redefinition).
	#define adler	%rdi
	#define sum2	%rsi
	#define buf	%rcx
	#define len	%rbx
	#define zero	%xmm0
	#define ones	%xmm5

	movq    %rcx, len
	movq    %rdx, buf

	// update adler/sum2 according to a new 16-byte vector
	.macro DO16
	movaps  (buf), %xmm1            // 16 bytes vector
	movaps  %xmm1, %xmm3            // a copy of the vector, used for unsigned byte in the destination of pmaddubsw
	addq    $$16, buf               // buf -> next vector
	psadbw  zero, %xmm1             // 2 16-bit words to be added for adler in xmm1
	pmaddubsw %xmm4, %xmm3          // 8 16-bit words to be added for sum2 in xmm3 (weights 16:1)
	imulq   $$16, adler, %rdx       // rdx = 16*adler;
	movhlps %xmm1, %xmm2            // higher 16-bit word (for adler) in xmm2
	pmaddwd ones, %xmm3             // 4 32-bit elements to be added for sum2 in xmm3
	paddq   %xmm2, %xmm1            // xmm1 lower 32-bit to be added to adler
	addq    %rdx, sum2              // sum2 += adler*16;
	movhlps %xmm3, %xmm2            // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
	movd    %xmm1, %edx             // to be added to adler
	paddd   %xmm2, %xmm3            // 2 32-bits elements in xmm3 to be added to sum2
	addq    %rdx, adler             // update adler
	movd    %xmm3, %edx             // to be added to sum2
	psrlq   $$32, %xmm3             // another 32-bit to be added to sum2
	addq    %rdx, sum2              // sum2 += 1st half of update
	movd    %xmm3, %edx             // to be added to sum2
	addq    %rdx, sum2              // sum2 += 2nd half of update
	.endm

	// update adler/sum2 according to a new 32-byte vector
	.macro DO32
	imulq   $$32, adler, %rdx       // rdx = 32*adler
	movaps  (buf), %xmm1            // 1st 16 bytes vector
	movaps  16(buf), %xmm7          // 2nd 16 bytes vector
	movaps  %xmm1, %xmm3            // a copy of 1st vector, used for unsigned byte in the destination of pmaddubsw
	movaps  %xmm7, %xmm2            // a copy of 2nd vector, used for unsigned byte in the destination of pmaddubsw
	psadbw  zero, %xmm1             // 2 16-bit words to be added for adler in xmm1
	psadbw  zero, %xmm7             // 2 16-bit words to be added for adler in xmm7
	addq    %rdx, sum2              // sum2 += adler*32;
	pmaddubsw %xmm6, %xmm3          // 8 16-bit words to be added for sum2 in xmm3 (weights 32:17)
	pmaddubsw %xmm4, %xmm2          // 8 16-bit words to be added for sum2 in xmm2 (weights 16:1)
	paddd   %xmm7, %xmm1            // 2 16-bit words to be added for adler in xmm1
	paddw   %xmm2, %xmm3            // 8 16-bit words to be added for sum2 in xmm3
	addq    $$32, buf               // buf -> vector for next iteration
	movhlps %xmm1, %xmm2            // higher 16-bit word (for adler) in xmm2
	pmaddwd ones, %xmm3             // 4 32-bit elements to be added for sum2 in xmm3
	paddq   %xmm2, %xmm1            // xmm1 lower 32-bit to be added to adler
	movhlps %xmm3, %xmm2            // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
	movd    %xmm1, %edx             // to be added to adler
	paddd   %xmm2, %xmm3            // 2 32-bits elements in xmm3 to be added to sum2
	addq    %rdx, adler             // update adler
	movd    %xmm3, %edx             // to be added to sum2
	psrlq   $$32, %xmm3             // another 32-bit to be added to sum2
	addq    %rdx, sum2              // sum2 += 1st half of update
	movd    %xmm3, %edx             // to be added to sum2
	addq    %rdx, sum2              // sum2 += 2nd half of update
	.endm

	// update adler/sum2 according to a new 48-byte vector

	.macro DO48
	imulq   $$48, adler, %rdx       // rdx = 48*adler

	movaps  (buf), %xmm7            // 1st 16 bytes vector
	movaps  16(buf), %xmm10         // 2nd 16 bytes vector
	movaps  32(buf), %xmm11         // 3rd 16 bytes vector

	movaps  %xmm7, %xmm1            // 1st vector
	movaps  %xmm10, %xmm2           // 2nd vector
	movaps  %xmm11, %xmm3           // 3rd vector

	psadbw  zero, %xmm7             // 1st vector for adler
	psadbw  zero, %xmm10            // 2nd vector for adler
	psadbw  zero, %xmm11            // 3rd vector for adler

	addq    %rdx, sum2              // sum2 += adler*48;

	pmaddubsw %xmm9, %xmm1          // 8 16-bit words to be added for sum2 : 1st vector (weights 48:33)
	pmaddubsw %xmm6, %xmm2          // 8 16-bit words to be added for sum2 : 2nd vector (weights 32:17)
	pmaddubsw %xmm4, %xmm3          // 8 16-bit words to be added for sum2 : 3rd vector (weights 16:1)

	pmaddwd ones, %xmm1             // 4 32-bit elements to be added for sum2 in xmm1
	pmaddwd ones, %xmm2             // 4 32-bit elements to be added for sum2 in xmm2
	pmaddwd ones, %xmm3             // 4 32-bit elements to be added for sum2 in xmm3

	paddd   %xmm10, %xmm7           // 2 16-bit words to be added for adler
	paddd   %xmm11, %xmm7           // 2 16-bit words to be added for adler

	paddd   %xmm1, %xmm3            // 4 32-bit elements to be added for sum2
	paddd   %xmm2, %xmm3            // 4 32-bit elements to be added for sum2

	addq    $$48, buf               // buf -> vector for next iteration

	movhlps %xmm7, %xmm2            // higher 16-bit word (for adler) in xmm2
	paddq   %xmm2, %xmm7            // xmm7 lower 32-bit to be added to adler

	movhlps %xmm3, %xmm2            // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
	movd    %xmm7, %edx             // to be added to adler
	paddd   %xmm2, %xmm3            // 2 32-bits elements in xmm3 to be added to sum2
	addq    %rdx, adler             // update adler
	movd    %xmm3, %edx             // to be added to sum2
	psrlq   $$32, %xmm3             // another 32-bit to be added to sum2
	addq    %rdx, sum2              // sum2 += 1st half of update
	movd    %xmm3, %edx             // to be added to sum2
	addq    %rdx, sum2              // sum2 += 2nd half of update
	.endm

	// update adler/sum2 according to a new 64-byte vector
	.macro DO64
	imulq   $$64, adler, %rdx       // rdx = 64*adler

	movaps  (buf), %xmm1            // 1st 16 bytes vector
	movaps  16(buf), %xmm7          // 2nd 16 bytes vector
	movaps  32(buf), %xmm10         // 3rd 16 bytes vector
	movaps  48(buf), %xmm11         // 4th 16 bytes vector

	movaps  %xmm1, %xmm3            // 1st vector
	movaps  %xmm11, %xmm2           // 4th vector
	psadbw  zero, %xmm1             // 1st vector for adler
	psadbw  zero, %xmm11            // 4th vector for adler

	addq    %rdx, sum2              // sum2 += adler*64;

	pmaddubsw %xmm8, %xmm3          // 8 16-bit words to be added for sum2 : 1st vector (weights 64:49)
	pmaddubsw %xmm4, %xmm2          // 8 16-bit words to be added for sum2 : 4th vector (weights 16:1)
	pmaddwd ones, %xmm3             // 4 32-bit elements to be added for sum2 in xmm3
	pmaddwd ones, %xmm2             // 4 32-bit elements to be added for sum2 in xmm2

	paddd   %xmm11, %xmm1           // 2 16-bit words to be added for adler in xmm1
	paddd   %xmm2, %xmm3            // 4 32-bit elements to be added for sum2 in xmm3

	movaps  %xmm7, %xmm2            // 2nd vector
	movaps  %xmm10, %xmm11          // 3rd vector

	psadbw  zero, %xmm7             // 2nd vector for adler
	psadbw  zero, %xmm10            // 3rd vector for adler

	pmaddubsw %xmm9, %xmm2          // 8 16-bit words to be added for sum2 : 2nd vector (weights 48:33)
	pmaddubsw %xmm6, %xmm11         // 8 16-bit words to be added for sum2 : 3rd vector (weights 32:17)
	pmaddwd ones, %xmm2             // 4 32-bit elements to be added for sum2 in xmm2
	pmaddwd ones, %xmm11            // 4 32-bit elements to be added for sum2 in xmm11

	paddd   %xmm7, %xmm1            // 2 16-bit words to be added for adler in xmm1
	paddd   %xmm10, %xmm1           // 2 16-bit words to be added for adler in xmm1

	paddd   %xmm2, %xmm3            // 4 32-bit elements to be added for sum2 in xmm3
	paddd   %xmm11, %xmm3           // 4 32-bit elements to be added for sum2 in xmm3

	addq    $$64, buf               // buf -> vector for next iteration

	movhlps %xmm1, %xmm2            // higher 16-bit word (for adler) in xmm2
	paddq   %xmm2, %xmm1            // xmm1 lower 32-bit to be added to adler
	movhlps %xmm3, %xmm2            // 2 higher 32-bit elements of xmm3 to be added to lower 2 32-bit elements
	movd    %xmm1, %edx             // to be added to adler
	paddd   %xmm2, %xmm3            // 2 32-bits elements in xmm3 to be added to sum2
	addq    %rdx, adler             // update adler
	movd    %xmm3, %edx             // to be added to sum2
	psrlq   $$32, %xmm3             // another 32-bit to be added to sum2
	addq    %rdx, sum2              // sum2 += 1st half of update
	movd    %xmm3, %edx             // to be added to sum2
	addq    %rdx, sum2              // sum2 += 2nd half of update
	.endm

	// need to fill up xmm4/xmm5/xmm6 (and xmm8/xmm9) only if len>=16
	cmpq    $16, len
	jl      skip_loading_tables

	// set up table starting address to %rax
	leaq    sum2_coefficients(%rip), %rax

	// reading coefficients
	pxor    zero, zero
	movaps  (%rax), %xmm8           // coefficients for computing sum2 : pmaddubsw 64:49
	movaps  16(%rax), %xmm9         // coefficients for computing sum2 : pmaddubsw 48:33
	movaps  32(%rax), %xmm6         // coefficients for computing sum2 : pmaddubsw 32:17
	movaps  48(%rax), %xmm4         // coefficients for computing sum2 : pmaddubsw 16:1
	movaps  64(%rax), ones          // coefficients for computing sum2 : pmaddwd 1,1,...,1

skip_loading_tables:


	cmpq    $NMAX, len              // len vs NMAX
	jl      len_lessthan_NMAX       // if (len < NMAX), skip the following NMAX batches processing

len_ge_NMAX_loop:                   // while (len>=NMAX) {

	subq    $NMAX, len              // len -= NMAX
	movq    $(NMAX/64), %rax        // n = NMAX/64

n_loop:                             // do {
	DO64                            // update adler/sum2 for a 64-byte input
	decq    %rax                    // n--;
	jg      n_loop                  // } while (n);

	DO48                            // update adler/sum2 for a 48-byte input (NMAX = 86*64 + 48)

	modulo_BASE                     // (adler/sum2) modulo BASE; (macro defined in the non-SSSE3 section above)

	cmpq    $NMAX, len              //
	jge     len_ge_NMAX_loop        // } /* len>=NMAX */

len_lessthan_NMAX:

	subq    $64, len                // pre-decrement len by 64
	jl      len_lessthan_64         // if len < 64, skip the 64-vector code
len64_loop:                         // while (len>=64) {
	DO64                            // update adler/sum2 for a 64-byte input
	subq    $64, len                // len -= 64;
	jge     len64_loop              // }

len_lessthan_64:
	addq    $(64-32), len           // post-increment 64 + pre-decrement 32 of len
	jl      len_lessthan_32         // if len < 32, skip the 32-vector code
	DO32                            // update adler/sum2 for a 32-byte input
	subq    $32, len                // len -= 32;

len_lessthan_32:

	addq    $(32-16), len           // post-increment by 32 + pre-decrement by 16 on len
	jl      len_lessthan_16         // if len < 16, skip the 16-vector code
	DO16                            // update adler/sum2 for a 16-byte input
	subq    $16, len                // len -= 16;

len_lessthan_16:
	addq    $16, len                // post-increment len by 16
	jz      len_is_zero             // if len==0, branch over scalar processing

scalar_loop:                        // while (len) {
	movzbq  (buf), %rdx             // new input byte
	incq    buf                     // buf++
	addq    %rdx, adler             // adler += *buf
	addq    adler, sum2             // sum2 += adler
	decq    len                     // len--
	jg      scalar_loop             // }

len_is_zero:

	modulo_BASE                     // (adler/sum2) modulo BASE;

	// construct 32-bit (sum2<<16 | adler) to be returned

	salq    $16, sum2               // sum2 <<16
	movq    adler, %rax             // adler
	orq     sum2, %rax              // sum2<<16 | adler


#ifdef KERNEL                       // if for kernel, restore %xmm0-%xmm11
	movaps  -32(%rbp), %xmm0
	movaps  -48(%rbp), %xmm1
	movaps  -64(%rbp), %xmm2
	movaps  -80(%rbp), %xmm3
	movaps  -96(%rbp), %xmm4
	movaps  -112(%rbp), %xmm5
	movaps  -128(%rbp), %xmm6
	movaps  -144(%rbp), %xmm7
	movaps  -160(%rbp), %xmm8
	movaps  -176(%rbp), %xmm9
	movaps  -192(%rbp), %xmm10
	movaps  -208(%rbp), %xmm11
	addq    $200, %rsp              // we've already restored %xmm0-%xmm11 from stack
#endif

	popq    %rbx
	leave                           // pop rbp out from stack
	ret


	.const
	.align  4                       // 16-byte alignment (Mach-O .align is log2) — required by movaps
sum2_coefficients:                  // used for vectorizing adler32 computation

	// coefficients for pmaddubsw instruction, used to generate 16-bit elements for sum2:
	// 64..49 at offset 0, 48..33 at 16, 32..17 at 32, 16..1 at 48

	.byte   64
	.byte   63
	.byte   62
	.byte   61
	.byte   60
	.byte   59
	.byte   58
	.byte   57
	.byte   56
	.byte   55
	.byte   54
	.byte   53
	.byte   52
	.byte   51
	.byte   50
	.byte   49
	.byte   48
	.byte   47
	.byte   46
	.byte   45
	.byte   44
	.byte   43
	.byte   42
	.byte   41
	.byte   40
	.byte   39
	.byte   38
	.byte   37
	.byte   36
	.byte   35
	.byte   34
	.byte   33
	.byte   32
	.byte   31
	.byte   30
	.byte   29
	.byte   28
	.byte   27
	.byte   26
	.byte   25
	.byte   24
	.byte   23
	.byte   22
	.byte   21
	.byte   20
	.byte   19
	.byte   18
	.byte   17
	.byte   16
	.byte   15
	.byte   14
	.byte   13
	.byte   12
	.byte   11
	.byte   10
	.byte   9
	.byte   8
	.byte   7
	.byte   6
	.byte   5
	.byte   4
	.byte   3
	.byte   2
	.byte   1

	// coefficients for pmaddwd, to combine into 4 32-bit elements for sum2 (offset 64)
	.word   1
	.word   1
	.word   1
	.word   1
	.word   1
	.word   1
	.word   1
	.word   1

#endif  // (defined __i386__)

#endif  // (defined __i386__ || defined __x86_64__)