/*
 * poly1305-x86.S revision 1.4
 *
 * Poly1305 for x86 (32-bit), GNU as / AT&T syntax, run through the C
 * preprocessor.  The identification string at the end of .text reads
 * "Poly1305 for x86, CRYPTOGAMS by <appro@openssl.org>".
 *
 * All entry points take their arguments on the stack (after the four
 * register pushes in each prologue the first argument is at 20(%esp))
 * and preserve %ebp, %ebx, %esi and %edi.
 *
 * Context layout as used by the code below (byte offsets from ctx):
 *    0..19   accumulator h, five 32-bit words
 *   20       flag: 0 = h is held in base 2^32, non-zero = h is held as
 *            five 26-bit limbs (set by _poly1305_blocks_sse2)
 *   24..39   clamped key r, four 32-bit words
 *   48..191  table written by _poly1305_init_sse2 (nine 16-byte rows)
 *
 * The byte sequence 243,15,30,251 (f3 0f 1e fb) emitted under __CET__ is
 * the encoding of endbr32.
 */
#include <machine/asm.h>
.text
.align	64

/*
 * poly1305_init(ctx, key, func)
 *   20(%esp) ctx   context, first 24 bytes are zeroed
 *   24(%esp) key   16 bytes, may be NULL
 *   28(%esp) func  two pointer slots: func[0] = blocks routine,
 *                  func[1] = emit routine
 * Returns %eax = 0 when key is NULL (func is left untouched), 1 otherwise.
 */
.globl	poly1305_init
.type	poly1305_init,@function
.align	16
poly1305_init:
.L_poly1305_init_begin:
#ifdef __CET__
	.byte	243,15,30,251
#endif
	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%edi
	movl	24(%esp),%esi
	movl	28(%esp),%ebp
	/* h = 0 and "base 2^26" flag = 0; %eax doubles as the NULL-key result */
	xorl	%eax,%eax
	movl	%eax,(%edi)
	movl	%eax,4(%edi)
	movl	%eax,8(%edi)
	movl	%eax,12(%edi)
	movl	%eax,16(%edi)
	movl	%eax,20(%edi)
	cmpl	$0,%esi
	je	.L000nokey
	/* position-independent address of .L001pic_point in %ebx */
	call	.L001pic_point
.L001pic_point:
	popl	%ebx
	leal	poly1305_blocks-.L001pic_point(%ebx),%eax
	leal	poly1305_emit-.L001pic_point(%ebx),%edx
	leal	OPENSSL_ia32cap_P-.L001pic_point(%ebx),%edi
	/*
	 * Use the SSE2 routines only when bits 24 and 26 (mask 0x05000000)
	 * of the first capability word are both set.
	 * NOTE(review): presumably the CPUID FXSR and SSE2 bits - confirm
	 * against the code that fills OPENSSL_ia32cap_P.
	 */
	movl	(%edi),%ecx
	andl	$83886080,%ecx
	cmpl	$83886080,%ecx
	jne	.L002no_sse2
	leal	_poly1305_blocks_sse2-.L001pic_point(%ebx),%eax
	leal	_poly1305_emit_sse2-.L001pic_point(%ebx),%edx
.L002no_sse2:
	movl	20(%esp),%edi
	movl	%eax,(%ebp)
	movl	%edx,4(%ebp)
	/* load the 16 key bytes and clamp: 0x0fffffff, then 0x0ffffffc x3 */
	movl	(%esi),%eax
	movl	4(%esi),%ebx
	movl	8(%esi),%ecx
	movl	12(%esi),%edx
	andl	$268435455,%eax
	andl	$268435452,%ebx
	andl	$268435452,%ecx
	andl	$268435452,%edx
	movl	%eax,24(%edi)
	movl	%ebx,28(%edi)
	movl	%ecx,32(%edi)
	movl	%edx,36(%edi)
	movl	$1,%eax
.L000nokey:
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	poly1305_init,.-.L_poly1305_init_begin

/*
 * poly1305_blocks(ctx, inp, len, padbit)
 *   20(%esp) ctx, 24(%esp) inp, 28(%esp) len, 32(%esp) padbit
 * Scalar path: for every 16-byte block adds the block and padbit (into
 * the fifth word) to h, multiplies by r and partially reduces.
 *
 * Frame after "subl $64,%esp":
 *    0..16(%esp)  h0, h3, h4 spilled for reuse as multiplicands
 *   20..28(%esp)  low three result words
 *   36..48(%esp)  r0..r3
 *   52..60(%esp)  r1+(r1>>2), r2+(r2>>2), r3+(r3>>2)
 *   84(%esp) ctx, 92(%esp) end-of-input pointer (overwrites the len
 *   argument slot), 96(%esp) padbit
 *
 * .Lenter_blocks is also entered from _poly1305_blocks_sse2 with
 * %edi/%esi/%ecx already loaded.
 */
.globl	poly1305_blocks
.type	poly1305_blocks,@function
.align	16
poly1305_blocks:
.L_poly1305_blocks_begin:
#ifdef __CET__
	.byte	243,15,30,251
#endif
	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%edi
	movl	24(%esp),%esi
	movl	28(%esp),%ecx
.Lenter_blocks:
	/*
	 * NOTE(review): the mask is -15 (0xfffffff1), not -16 as in
	 * _poly1305_blocks_sse2, so bit 0 of len survives.  The loop below
	 * exits only on pointer equality after 16-byte steps, so it relies
	 * on len being a multiple of 16 - presumably guaranteed by the
	 * caller; confirm before relying on it.
	 */
	andl	$-15,%ecx
	jz	.L003nodata
	subl	$64,%esp
	movl	24(%edi),%eax
	movl	28(%edi),%ebx
	leal	(%esi,%ecx,1),%ebp
	movl	32(%edi),%ecx
	movl	36(%edi),%edx
	movl	%ebp,92(%esp)
	movl	%esi,%ebp
	movl	%eax,36(%esp)
	movl	%ebx,%eax
	shrl	$2,%eax
	movl	%ebx,40(%esp)
	addl	%ebx,%eax
	movl	%ecx,%ebx
	shrl	$2,%ebx
	movl	%ecx,44(%esp)
	addl	%ecx,%ebx
	movl	%edx,%ecx
	shrl	$2,%ecx
	movl	%edx,48(%esp)
	addl	%edx,%ecx
	movl	%eax,52(%esp)
	movl	%ebx,56(%esp)
	movl	%ecx,60(%esp)
	/* h0..h4 live in %eax,%ebx,%ecx,%esi,%edi across iterations */
	movl	(%edi),%eax
	movl	4(%edi),%ebx
	movl	8(%edi),%ecx
	movl	12(%edi),%esi
	movl	16(%edi),%edi
	jmp	.L004loop
.align	32
.L004loop:
	/* h += block; the padbit argument goes into the fifth word */
	addl	(%ebp),%eax
	adcl	4(%ebp),%ebx
	adcl	8(%ebp),%ecx
	adcl	12(%ebp),%esi
	leal	16(%ebp),%ebp
	adcl	96(%esp),%edi
	movl	%eax,(%esp)
	movl	%esi,12(%esp)
	/* result word 0 */
	mull	36(%esp)
	movl	%edi,16(%esp)
	movl	%eax,%edi
	movl	%ebx,%eax
	movl	%edx,%esi
	mull	60(%esp)
	addl	%eax,%edi
	movl	%ecx,%eax
	adcl	%edx,%esi
	mull	56(%esp)
	addl	%eax,%edi
	movl	12(%esp),%eax
	adcl	%edx,%esi
	mull	52(%esp)
	addl	%eax,%edi
	movl	(%esp),%eax
	adcl	%edx,%esi
	/* result word 1 */
	mull	40(%esp)
	movl	%edi,20(%esp)
	xorl	%edi,%edi
	addl	%eax,%esi
	movl	%ebx,%eax
	adcl	%edx,%edi
	mull	36(%esp)
	addl	%eax,%esi
	movl	%ecx,%eax
	adcl	%edx,%edi
	mull	60(%esp)
	addl	%eax,%esi
	movl	12(%esp),%eax
	adcl	%edx,%edi
	mull	56(%esp)
	addl	%eax,%esi
	movl	16(%esp),%eax
	adcl	%edx,%edi
	imull	52(%esp),%eax
	addl	%eax,%esi
	movl	(%esp),%eax
	adcl	$0,%edi
	/* result word 2 */
	mull	44(%esp)
	movl	%esi,24(%esp)
	xorl	%esi,%esi
	addl	%eax,%edi
	movl	%ebx,%eax
	adcl	%edx,%esi
	mull	40(%esp)
	addl	%eax,%edi
	movl	%ecx,%eax
	adcl	%edx,%esi
	mull	36(%esp)
	addl	%eax,%edi
	movl	12(%esp),%eax
	adcl	%edx,%esi
	mull	60(%esp)
	addl	%eax,%edi
	movl	16(%esp),%eax
	adcl	%edx,%esi
	imull	56(%esp),%eax
	addl	%eax,%edi
	movl	(%esp),%eax
	adcl	$0,%esi
	/* result words 3 and 4 */
	mull	48(%esp)
	movl	%edi,28(%esp)
	xorl	%edi,%edi
	addl	%eax,%esi
	movl	%ebx,%eax
	adcl	%edx,%edi
	mull	44(%esp)
	addl	%eax,%esi
	movl	%ecx,%eax
	adcl	%edx,%edi
	mull	40(%esp)
	addl	%eax,%esi
	movl	12(%esp),%eax
	adcl	%edx,%edi
	mull	36(%esp)
	addl	%eax,%esi
	movl	16(%esp),%ecx
	adcl	%edx,%edi
	movl	%ecx,%edx
	imull	60(%esp),%ecx
	addl	%ecx,%esi
	movl	20(%esp),%eax
	adcl	$0,%edi
	imull	36(%esp),%edx
	addl	%edi,%edx
	movl	24(%esp),%ebx
	movl	28(%esp),%ecx
	/*
	 * Partial reduction: keep the low two bits of the top word in %edi
	 * and fold 5 * (top >> 2) back into the low words.
	 */
	movl	%edx,%edi
	shrl	$2,%edx
	andl	$3,%edi
	leal	(%edx,%edx,4),%edx
	addl	%edx,%eax
	adcl	$0,%ebx
	adcl	$0,%ecx
	adcl	$0,%esi
	adcl	$0,%edi
	cmpl	92(%esp),%ebp
	jne	.L004loop
	/* write h back to ctx */
	movl	84(%esp),%edx
	addl	$64,%esp
	movl	%eax,(%edx)
	movl	%ebx,4(%edx)
	movl	%ecx,8(%edx)
	movl	%esi,12(%edx)
	movl	%edi,16(%edx)
.L003nodata:
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	poly1305_blocks,.-.L_poly1305_blocks_begin

/*
 * poly1305_emit(ctx, mac, nonce)
 *   20(%esp) ctx, 24(%esp) mac (16 bytes written), 28(%esp) nonce
 *   (four 32-bit words read)
 * Computes h + 5, selects it over h when the sum carries into bit 2 of
 * the fifth word, adds the nonce and stores the low 128 bits to mac.
 *
 * .Lenter_emit is also entered from _poly1305_emit_sse2 with %ebp = ctx.
 */
.globl	poly1305_emit
.type	poly1305_emit,@function
.align	16
poly1305_emit:
.L_poly1305_emit_begin:
#ifdef __CET__
	.byte	243,15,30,251
#endif
	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%ebp
.Lenter_emit:
	movl	24(%esp),%edi
	movl	(%ebp),%eax
	movl	4(%ebp),%ebx
	movl	8(%ebp),%ecx
	movl	12(%ebp),%edx
	movl	16(%ebp),%esi
	addl	$5,%eax
	adcl	$0,%ebx
	adcl	$0,%ecx
	adcl	$0,%edx
	adcl	$0,%esi
	/* %esi = -((h4 + carry) >> 2), used as the select mask for h + 5 */
	shrl	$2,%esi
	negl	%esi
	andl	%esi,%eax
	andl	%esi,%ebx
	andl	%esi,%ecx
	andl	%esi,%edx
	/* mac[] is used as scratch for the masked h + 5 */
	movl	%eax,(%edi)
	movl	%ebx,4(%edi)
	movl	%ecx,8(%edi)
	movl	%edx,12(%edi)
	notl	%esi
	movl	(%ebp),%eax
	movl	4(%ebp),%ebx
	movl	8(%ebp),%ecx
	movl	12(%ebp),%edx
	movl	28(%esp),%ebp
	andl	%esi,%eax
	andl	%esi,%ebx
	andl	%esi,%ecx
	andl	%esi,%edx
	orl	(%edi),%eax
	orl	4(%edi),%ebx
	orl	8(%edi),%ecx
	orl	12(%edi),%edx
	/* add the nonce, carry out of the fourth word is discarded */
	addl	(%ebp),%eax
	adcl	4(%ebp),%ebx
	adcl	8(%ebp),%ecx
	adcl	12(%ebp),%edx
	movl	%eax,(%edi)
	movl	%ebx,4(%edi)
	movl	%ecx,8(%edi)
	movl	%edx,12(%edi)
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	poly1305_emit,.-.L_poly1305_emit_begin

/*
 * _poly1305_init_sse2 - file-local helper, register calling convention.
 *   In:  %edi = ctx, %ebx = address of .Lconst_sse2
 *   Out: table at 48(ctx)..191(ctx); %edi restored to ctx
 *   Clobbers %ebp (used to hold the caller's %esp), %ecx, %edx,
 *   %xmm0-%xmm7 and flags.
 *
 * Splits the clamped key at 24(ctx) into five 26-bit limbs (mask
 * 0x3ffffff at 64(%ebx)) and runs the multiply-and-reduce body twice
 * (%ecx = 2).  After the final psllq/por/pshufd $141 each 16-byte table
 * row holds one limb for four values; following the data flow these
 * appear to be r^4, r^3, r^2, r from low to high dword.  Rows 5..8 are
 * five times limbs 1..4 (x + (x << 2)).
 */
.align	32
.type	_poly1305_init_sse2,@function
.align	16
_poly1305_init_sse2:
#ifdef __CET__
	.byte	243,15,30,251
#endif
	movdqu	24(%edi),%xmm4
	leal	48(%edi),%edi
	movl	%esp,%ebp
	subl	$224,%esp
	andl	$-16,%esp
	movq	64(%ebx),%xmm7
	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	movdqa	%xmm4,%xmm2
	pand	%xmm7,%xmm0
	psrlq	$26,%xmm1
	psrldq	$6,%xmm2
	pand	%xmm7,%xmm1
	movdqa	%xmm2,%xmm3
	psrlq	$4,%xmm2
	psrlq	$30,%xmm3
	pand	%xmm7,%xmm2
	pand	%xmm7,%xmm3
	psrldq	$13,%xmm4
	leal	144(%esp),%edx
	movl	$2,%ecx
.L005square:
	/* save the current limbs and their x5 multiples on the stack */
	movdqa	%xmm0,(%esp)
	movdqa	%xmm1,16(%esp)
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	movdqa	%xmm4,64(%esp)
	movdqa	%xmm1,%xmm6
	movdqa	%xmm2,%xmm5
	pslld	$2,%xmm6
	pslld	$2,%xmm5
	paddd	%xmm1,%xmm6
	paddd	%xmm2,%xmm5
	movdqa	%xmm6,80(%esp)
	movdqa	%xmm5,96(%esp)
	movdqa	%xmm3,%xmm6
	movdqa	%xmm4,%xmm5
	pslld	$2,%xmm6
	pslld	$2,%xmm5
	paddd	%xmm3,%xmm6
	paddd	%xmm4,%xmm5
	movdqa	%xmm6,112(%esp)
	movdqa	%xmm5,128(%esp)
	/* pshufd $68 copies the low 64 bits into both halves */
	pshufd	$68,%xmm0,%xmm6
	movdqa	%xmm1,%xmm5
	pshufd	$68,%xmm1,%xmm1
	pshufd	$68,%xmm2,%xmm2
	pshufd	$68,%xmm3,%xmm3
	pshufd	$68,%xmm4,%xmm4
	movdqa	%xmm6,(%edx)
	movdqa	%xmm1,16(%edx)
	movdqa	%xmm2,32(%edx)
	movdqa	%xmm3,48(%edx)
	movdqa	%xmm4,64(%edx)
	pmuludq	%xmm0,%xmm4
	pmuludq	%xmm0,%xmm3
	pmuludq	%xmm0,%xmm2
	pmuludq	%xmm0,%xmm1
	pmuludq	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	pmuludq	48(%edx),%xmm5
	movdqa	%xmm6,%xmm7
	pmuludq	32(%edx),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	80(%esp),%xmm6
	pmuludq	(%edx),%xmm5
	paddq	%xmm7,%xmm2
	pmuludq	64(%edx),%xmm6
	movdqa	32(%esp),%xmm7
	paddq	%xmm5,%xmm1
	movdqa	%xmm7,%xmm5
	pmuludq	32(%edx),%xmm7
	paddq	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	pmuludq	16(%edx),%xmm5
	paddq	%xmm7,%xmm4
	movdqa	96(%esp),%xmm7
	pmuludq	(%edx),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	64(%edx),%xmm7
	paddq	%xmm6,%xmm2
	pmuludq	48(%edx),%xmm5
	movdqa	48(%esp),%xmm6
	paddq	%xmm7,%xmm1
	movdqa	%xmm6,%xmm7
	pmuludq	16(%edx),%xmm6
	paddq	%xmm5,%xmm0
	movdqa	112(%esp),%xmm5
	pmuludq	(%edx),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	64(%edx),%xmm5
	paddq	%xmm7,%xmm3
	movdqa	%xmm6,%xmm7
	pmuludq	48(%edx),%xmm6
	paddq	%xmm5,%xmm2
	pmuludq	32(%edx),%xmm7
	movdqa	64(%esp),%xmm5
	paddq	%xmm6,%xmm1
	movdqa	128(%esp),%xmm6
	pmuludq	(%edx),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	64(%edx),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	32(%edx),%xmm5
	paddq	%xmm7,%xmm0
	pmuludq	48(%edx),%xmm6
	movdqa	64(%ebx),%xmm7
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
	/* carry propagation back to 26-bit limbs (mask in %xmm7) */
	movdqa	%xmm3,%xmm5
	pand	%xmm7,%xmm3
	psrlq	$26,%xmm5
	paddq	%xmm4,%xmm5
	movdqa	%xmm0,%xmm6
	pand	%xmm7,%xmm0
	psrlq	$26,%xmm6
	movdqa	%xmm5,%xmm4
	paddq	%xmm1,%xmm6
	psrlq	$26,%xmm5
	pand	%xmm7,%xmm4
	movdqa	%xmm6,%xmm1
	psrlq	$26,%xmm6
	paddd	%xmm5,%xmm0
	psllq	$2,%xmm5
	paddq	%xmm2,%xmm6
	paddq	%xmm0,%xmm5
	pand	%xmm7,%xmm1
	movdqa	%xmm6,%xmm2
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm2
	paddd	%xmm3,%xmm6
	movdqa	%xmm5,%xmm0
	psrlq	$26,%xmm5
	movdqa	%xmm6,%xmm3
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm0
	paddd	%xmm5,%xmm1
	pand	%xmm7,%xmm3
	paddd	%xmm6,%xmm4
	decl	%ecx
	jz	.L006square_break
	/* second pass: previous inputs go into the high 64 bits */
	punpcklqdq	(%esp),%xmm0
	punpcklqdq	16(%esp),%xmm1
	punpcklqdq	32(%esp),%xmm2
	punpcklqdq	48(%esp),%xmm3
	punpcklqdq	64(%esp),%xmm4
	jmp	.L005square
.L006square_break:
	/* interleave the new results with the saved inputs, then reorder */
	psllq	$32,%xmm0
	psllq	$32,%xmm1
	psllq	$32,%xmm2
	psllq	$32,%xmm3
	psllq	$32,%xmm4
	por	(%esp),%xmm0
	por	16(%esp),%xmm1
	por	32(%esp),%xmm2
	por	48(%esp),%xmm3
	por	64(%esp),%xmm4
	pshufd	$141,%xmm0,%xmm0
	pshufd	$141,%xmm1,%xmm1
	pshufd	$141,%xmm2,%xmm2
	pshufd	$141,%xmm3,%xmm3
	pshufd	$141,%xmm4,%xmm4
	movdqu	%xmm0,(%edi)
	movdqu	%xmm1,16(%edi)
	movdqu	%xmm2,32(%edi)
	movdqu	%xmm3,48(%edi)
	movdqu	%xmm4,64(%edi)
	/* rows 5..8: limbs 1..4 times five */
	movdqa	%xmm1,%xmm6
	movdqa	%xmm2,%xmm5
	pslld	$2,%xmm6
	pslld	$2,%xmm5
	paddd	%xmm1,%xmm6
	paddd	%xmm2,%xmm5
	movdqu	%xmm6,80(%edi)
	movdqu	%xmm5,96(%edi)
	movdqa	%xmm3,%xmm6
	movdqa	%xmm4,%xmm5
	pslld	$2,%xmm6
	pslld	$2,%xmm5
	paddd	%xmm3,%xmm6
	paddd	%xmm4,%xmm5
	movdqu	%xmm6,112(%edi)
	movdqu	%xmm5,128(%edi)
	movl	%ebp,%esp
	leal	-48(%edi),%edi
	ret
.size	_poly1305_init_sse2,.-_poly1305_init_sse2

/*
 * _poly1305_blocks_sse2(ctx, inp, len, padbit) - same stack arguments as
 * poly1305_blocks; installed in func[0] by poly1305_init.
 *
 * - len is rounded down to a multiple of 16; zero returns immediately.
 * - If fewer than 64 bytes are given and the ctx flag at offset 20 is
 *   still 0, control transfers to the scalar .Lenter_blocks.
 * - Otherwise, on first use the table is built with
 *   _poly1305_init_sse2, h is converted from base 2^32 to five 26-bit
 *   limbs and the flag is set to 1.
 * - padbit << 24 is the pad bit's position in the fifth limb; the
 *   vector paths OR in the constant row at (%ebx) (1 << 24 per lane).
 * - When len is an odd multiple of 16 one block is processed alone
 *   first; the main loop consumes 64 bytes per iteration.
 *
 * %ebp holds the caller's %esp while the 16-byte aligned 528-byte frame
 * is live; %ebx = .Lconst_sse2; %edi = ctx + 48.
 */
.align	32
.type	_poly1305_blocks_sse2,@function
.align	16
_poly1305_blocks_sse2:
#ifdef __CET__
	.byte	243,15,30,251
#endif
	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%edi
	movl	24(%esp),%esi
	movl	28(%esp),%ecx
	movl	20(%edi),%eax
	andl	$-16,%ecx
	jz	.L007nodata
	cmpl	$64,%ecx
	jae	.L008enter_sse2
	testl	%eax,%eax
	jz	.Lenter_blocks
.align	16
.L008enter_sse2:
	call	.L009pic_point
.L009pic_point:
	popl	%ebx
	leal	.Lconst_sse2-.L009pic_point(%ebx),%ebx
	testl	%eax,%eax
	jnz	.L010base2_26
	call	_poly1305_init_sse2
	/*
	 * Base 2^32 -> 2^26: unaligned loads at byte offsets 0,3,6,9,13
	 * followed by shifts of 0,2,4,6,0 bits line each limb up at bit 0.
	 */
	movl	(%edi),%eax
	movl	3(%edi),%ecx
	movl	6(%edi),%edx
	movl	9(%edi),%esi
	movl	13(%edi),%ebp
	movl	$1,20(%edi)
	shrl	$2,%ecx
	andl	$67108863,%eax
	shrl	$4,%edx
	andl	$67108863,%ecx
	shrl	$6,%esi
	andl	$67108863,%edx
	movd	%eax,%xmm0
	movd	%ecx,%xmm1
	movd	%edx,%xmm2
	movd	%esi,%xmm3
	movd	%ebp,%xmm4
	/* %esi/%ecx were used as scratch: reload inp and len */
	movl	24(%esp),%esi
	movl	28(%esp),%ecx
	jmp	.L011base2_32
.align	16
.L010base2_26:
	movd	(%edi),%xmm0
	movd	4(%edi),%xmm1
	movd	8(%edi),%xmm2
	movd	12(%edi),%xmm3
	movd	16(%edi),%xmm4
	movdqa	64(%ebx),%xmm7
.L011base2_32:
	movl	32(%esp),%eax
	movl	%esp,%ebp
	subl	$528,%esp
	andl	$-16,%esp
	leal	48(%edi),%edi
	shll	$24,%eax
	testl	$31,%ecx
	jz	.L012even
	/* odd number of blocks: absorb one block and multiply by the
	 * value in the fourth dword of each table row */
	movdqu	(%esi),%xmm6
	leal	16(%esi),%esi
	movdqa	%xmm6,%xmm5
	pand	%xmm7,%xmm6
	paddd	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	psrlq	$26,%xmm5
	psrldq	$6,%xmm6
	pand	%xmm7,%xmm5
	paddd	%xmm5,%xmm1
	movdqa	%xmm6,%xmm5
	psrlq	$4,%xmm6
	pand	%xmm7,%xmm6
	paddd	%xmm6,%xmm2
	movdqa	%xmm5,%xmm6
	psrlq	$30,%xmm5
	pand	%xmm7,%xmm5
	psrldq	$7,%xmm6
	paddd	%xmm5,%xmm3
	movd	%eax,%xmm5
	paddd	%xmm6,%xmm4
	movd	12(%edi),%xmm6
	paddd	%xmm5,%xmm4
	movdqa	%xmm0,(%esp)
	movdqa	%xmm1,16(%esp)
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	movdqa	%xmm4,64(%esp)
	pmuludq	%xmm6,%xmm0
	pmuludq	%xmm6,%xmm1
	pmuludq	%xmm6,%xmm2
	movd	28(%edi),%xmm5
	pmuludq	%xmm6,%xmm3
	pmuludq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	48(%esp),%xmm5
	movdqa	%xmm6,%xmm7
	pmuludq	32(%esp),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%esp),%xmm7
	paddq	%xmm6,%xmm3
	movd	92(%edi),%xmm6
	pmuludq	(%esp),%xmm5
	paddq	%xmm7,%xmm2
	pmuludq	64(%esp),%xmm6
	movd	44(%edi),%xmm7
	paddq	%xmm5,%xmm1
	movdqa	%xmm7,%xmm5
	pmuludq	32(%esp),%xmm7
	paddq	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	pmuludq	16(%esp),%xmm5
	paddq	%xmm7,%xmm4
	movd	108(%edi),%xmm7
	pmuludq	(%esp),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	64(%esp),%xmm7
	paddq	%xmm6,%xmm2
	pmuludq	48(%esp),%xmm5
	movd	60(%edi),%xmm6
	paddq	%xmm7,%xmm1
	movdqa	%xmm6,%xmm7
	pmuludq	16(%esp),%xmm6
	paddq	%xmm5,%xmm0
	movd	124(%edi),%xmm5
	pmuludq	(%esp),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	64(%esp),%xmm5
	paddq	%xmm7,%xmm3
	movdqa	%xmm6,%xmm7
	pmuludq	48(%esp),%xmm6
	paddq	%xmm5,%xmm2
	pmuludq	32(%esp),%xmm7
	movd	76(%edi),%xmm5
	paddq	%xmm6,%xmm1
	movd	140(%edi),%xmm6
	pmuludq	(%esp),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	64(%esp),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%esp),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	32(%esp),%xmm5
	paddq	%xmm7,%xmm0
	pmuludq	48(%esp),%xmm6
	movdqa	64(%ebx),%xmm7
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
	/* carry propagation back to 26-bit limbs */
	movdqa	%xmm3,%xmm5
	pand	%xmm7,%xmm3
	psrlq	$26,%xmm5
	paddq	%xmm4,%xmm5
	movdqa	%xmm0,%xmm6
	pand	%xmm7,%xmm0
	psrlq	$26,%xmm6
	movdqa	%xmm5,%xmm4
	paddq	%xmm1,%xmm6
	psrlq	$26,%xmm5
	pand	%xmm7,%xmm4
	movdqa	%xmm6,%xmm1
	psrlq	$26,%xmm6
	paddd	%xmm5,%xmm0
	psllq	$2,%xmm5
	paddq	%xmm2,%xmm6
	paddq	%xmm0,%xmm5
	pand	%xmm7,%xmm1
	movdqa	%xmm6,%xmm2
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm2
	paddd	%xmm3,%xmm6
	movdqa	%xmm5,%xmm0
	psrlq	$26,%xmm5
	movdqa	%xmm6,%xmm3
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm0
	paddd	%xmm5,%xmm1
	pand	%xmm7,%xmm3
	paddd	%xmm6,%xmm4
	subl	$16,%ecx
	jz	.L013done
.L012even:
	/*
	 * Expand the nine table rows onto the stack: the low 64 bits of
	 * each row, duplicated (pshufd $68), go to 0..128(%edx); the high
	 * 64 bits, duplicated (pshufd $238), go to -144..-16(%edx).
	 * %esi is rewound by 32 when fewer than 64 bytes remain (cmovb
	 * consumes the borrow of "subl $64,%ecx").
	 */
	leal	384(%esp),%edx
	leal	-32(%esi),%eax
	subl	$64,%ecx
	movdqu	(%edi),%xmm5
	pshufd	$68,%xmm5,%xmm6
	cmovbl	%eax,%esi
	pshufd	$238,%xmm5,%xmm5
	movdqa	%xmm6,(%edx)
	leal	160(%esp),%eax
	movdqu	16(%edi),%xmm6
	movdqa	%xmm5,-144(%edx)
	pshufd	$68,%xmm6,%xmm5
	pshufd	$238,%xmm6,%xmm6
	movdqa	%xmm5,16(%edx)
	movdqu	32(%edi),%xmm5
	movdqa	%xmm6,-128(%edx)
	pshufd	$68,%xmm5,%xmm6
	pshufd	$238,%xmm5,%xmm5
	movdqa	%xmm6,32(%edx)
	movdqu	48(%edi),%xmm6
	movdqa	%xmm5,-112(%edx)
	pshufd	$68,%xmm6,%xmm5
	pshufd	$238,%xmm6,%xmm6
	movdqa	%xmm5,48(%edx)
	movdqu	64(%edi),%xmm5
	movdqa	%xmm6,-96(%edx)
	pshufd	$68,%xmm5,%xmm6
	pshufd	$238,%xmm5,%xmm5
	movdqa	%xmm6,64(%edx)
	movdqu	80(%edi),%xmm6
	movdqa	%xmm5,-80(%edx)
	pshufd	$68,%xmm6,%xmm5
	pshufd	$238,%xmm6,%xmm6
	movdqa	%xmm5,80(%edx)
	movdqu	96(%edi),%xmm5
	movdqa	%xmm6,-64(%edx)
	pshufd	$68,%xmm5,%xmm6
	pshufd	$238,%xmm5,%xmm5
	movdqa	%xmm6,96(%edx)
	movdqu	112(%edi),%xmm6
	movdqa	%xmm5,-48(%edx)
	pshufd	$68,%xmm6,%xmm5
	pshufd	$238,%xmm6,%xmm6
	movdqa	%xmm5,112(%edx)
	movdqu	128(%edi),%xmm5
	movdqa	%xmm6,-32(%edx)
	pshufd	$68,%xmm5,%xmm6
	pshufd	$238,%xmm5,%xmm5
	movdqa	%xmm6,128(%edx)
	movdqa	%xmm5,-16(%edx)
	/* load two input blocks and split them into 26-bit limbs, two
	 * blocks per register; the accumulator is parked at 80..144(%esp) */
	movdqu	32(%esi),%xmm5
	movdqu	48(%esi),%xmm6
	leal	32(%esi),%esi
	movdqa	%xmm2,112(%esp)
	movdqa	%xmm3,128(%esp)
	movdqa	%xmm4,144(%esp)
	movdqa	%xmm5,%xmm2
	movdqa	%xmm6,%xmm3
	psrldq	$6,%xmm2
	psrldq	$6,%xmm3
	movdqa	%xmm5,%xmm4
	punpcklqdq	%xmm3,%xmm2
	punpckhqdq	%xmm6,%xmm4
	punpcklqdq	%xmm6,%xmm5
	movdqa	%xmm2,%xmm3
	psrlq	$4,%xmm2
	psrlq	$30,%xmm3
	movdqa	%xmm5,%xmm6
	psrlq	$40,%xmm4
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm5
	pand	%xmm7,%xmm6
	pand	%xmm7,%xmm2
	pand	%xmm7,%xmm3
	por	(%ebx),%xmm4
	movdqa	%xmm0,80(%esp)
	movdqa	%xmm1,96(%esp)
	/* flags are still those of "subl $64,%ecx": none of the
	 * instructions in between modifies EFLAGS */
	jbe	.L014skip_loop
	jmp	.L015loop
.align	32
.L015loop:
	/* first half: multiply the block pair held in registers by the
	 * table copies at -144..-16(%edx) */
	movdqa	-144(%edx),%xmm7
	movdqa	%xmm6,16(%eax)
	movdqa	%xmm2,32(%eax)
	movdqa	%xmm3,48(%eax)
	movdqa	%xmm4,64(%eax)
	movdqa	%xmm5,%xmm1
	pmuludq	%xmm7,%xmm5
	movdqa	%xmm6,%xmm0
	pmuludq	%xmm7,%xmm6
	pmuludq	%xmm7,%xmm2
	pmuludq	%xmm7,%xmm3
	pmuludq	%xmm7,%xmm4
	pmuludq	-16(%edx),%xmm0
	movdqa	%xmm1,%xmm7
	pmuludq	-128(%edx),%xmm1
	paddq	%xmm5,%xmm0
	movdqa	%xmm7,%xmm5
	pmuludq	-112(%edx),%xmm7
	paddq	%xmm6,%xmm1
	movdqa	%xmm5,%xmm6
	pmuludq	-96(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	16(%eax),%xmm7
	pmuludq	-80(%edx),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	-128(%edx),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	-112(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	32(%eax),%xmm7
	pmuludq	-96(%edx),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	-32(%edx),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	-16(%edx),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	-128(%edx),%xmm6
	paddq	%xmm5,%xmm1
	movdqa	48(%eax),%xmm5
	pmuludq	-112(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	-48(%edx),%xmm5
	paddq	%xmm7,%xmm4
	movdqa	%xmm6,%xmm7
	pmuludq	-32(%edx),%xmm6
	paddq	%xmm5,%xmm0
	movdqa	%xmm7,%xmm5
	pmuludq	-16(%edx),%xmm7
	paddq	%xmm6,%xmm1
	movdqa	64(%eax),%xmm6
	pmuludq	-128(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	%xmm6,%xmm7
	pmuludq	-16(%edx),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	-64(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	-48(%edx),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	64(%ebx),%xmm7
	pmuludq	-32(%edx),%xmm6
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
	/* second half: load the other block pair, add the parked
	 * accumulator, multiply by the table copies at 0..128(%edx) */
	movdqu	-32(%esi),%xmm5
	movdqu	-16(%esi),%xmm6
	leal	32(%esi),%esi
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	movdqa	%xmm4,64(%esp)
	movdqa	%xmm5,%xmm2
	movdqa	%xmm6,%xmm3
	psrldq	$6,%xmm2
	psrldq	$6,%xmm3
	movdqa	%xmm5,%xmm4
	punpcklqdq	%xmm3,%xmm2
	punpckhqdq	%xmm6,%xmm4
	punpcklqdq	%xmm6,%xmm5
	movdqa	%xmm2,%xmm3
	psrlq	$4,%xmm2
	psrlq	$30,%xmm3
	movdqa	%xmm5,%xmm6
	psrlq	$40,%xmm4
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm5
	pand	%xmm7,%xmm6
	pand	%xmm7,%xmm2
	pand	%xmm7,%xmm3
	por	(%ebx),%xmm4
	leal	-32(%esi),%eax
	subl	$64,%ecx
	paddd	80(%esp),%xmm5
	paddd	96(%esp),%xmm6
	paddd	112(%esp),%xmm2
	paddd	128(%esp),%xmm3
	paddd	144(%esp),%xmm4
	cmovbl	%eax,%esi
	leal	160(%esp),%eax
	movdqa	(%edx),%xmm7
	movdqa	%xmm1,16(%esp)
	movdqa	%xmm6,16(%eax)
	movdqa	%xmm2,32(%eax)
	movdqa	%xmm3,48(%eax)
	movdqa	%xmm4,64(%eax)
	movdqa	%xmm5,%xmm1
	pmuludq	%xmm7,%xmm5
	paddq	%xmm0,%xmm5
	movdqa	%xmm6,%xmm0
	pmuludq	%xmm7,%xmm6
	pmuludq	%xmm7,%xmm2
	pmuludq	%xmm7,%xmm3
	pmuludq	%xmm7,%xmm4
	paddq	16(%esp),%xmm6
	paddq	32(%esp),%xmm2
	paddq	48(%esp),%xmm3
	paddq	64(%esp),%xmm4
	pmuludq	128(%edx),%xmm0
	movdqa	%xmm1,%xmm7
	pmuludq	16(%edx),%xmm1
	paddq	%xmm5,%xmm0
	movdqa	%xmm7,%xmm5
	pmuludq	32(%edx),%xmm7
	paddq	%xmm6,%xmm1
	movdqa	%xmm5,%xmm6
	pmuludq	48(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	16(%eax),%xmm7
	pmuludq	64(%edx),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	16(%edx),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	32(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	32(%eax),%xmm7
	pmuludq	48(%edx),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	112(%edx),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	128(%edx),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	16(%edx),%xmm6
	paddq	%xmm5,%xmm1
	movdqa	48(%eax),%xmm5
	pmuludq	32(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	96(%edx),%xmm5
	paddq	%xmm7,%xmm4
	movdqa	%xmm6,%xmm7
	pmuludq	112(%edx),%xmm6
	paddq	%xmm5,%xmm0
	movdqa	%xmm7,%xmm5
	pmuludq	128(%edx),%xmm7
	paddq	%xmm6,%xmm1
	movdqa	64(%eax),%xmm6
	pmuludq	16(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	%xmm6,%xmm7
	pmuludq	128(%edx),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	80(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	96(%edx),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	64(%ebx),%xmm7
	pmuludq	112(%edx),%xmm6
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
	/* carry propagation back to 26-bit limbs */
	movdqa	%xmm3,%xmm5
	pand	%xmm7,%xmm3
	psrlq	$26,%xmm5
	paddq	%xmm4,%xmm5
	movdqa	%xmm0,%xmm6
	pand	%xmm7,%xmm0
	psrlq	$26,%xmm6
	movdqa	%xmm5,%xmm4
	paddq	%xmm1,%xmm6
	psrlq	$26,%xmm5
	pand	%xmm7,%xmm4
	movdqa	%xmm6,%xmm1
	psrlq	$26,%xmm6
	paddd	%xmm5,%xmm0
	psllq	$2,%xmm5
	paddq	%xmm2,%xmm6
	paddq	%xmm0,%xmm5
	pand	%xmm7,%xmm1
	movdqa	%xmm6,%xmm2
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm2
	paddd	%xmm3,%xmm6
	movdqa	%xmm5,%xmm0
	psrlq	$26,%xmm5
	movdqa	%xmm6,%xmm3
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm0
	paddd	%xmm5,%xmm1
	pand	%xmm7,%xmm3
	paddd	%xmm6,%xmm4
	/* pre-load and split the next block pair, park the accumulator */
	movdqu	32(%esi),%xmm5
	movdqu	48(%esi),%xmm6
	leal	32(%esi),%esi
	movdqa	%xmm2,112(%esp)
	movdqa	%xmm3,128(%esp)
	movdqa	%xmm4,144(%esp)
	movdqa	%xmm5,%xmm2
	movdqa	%xmm6,%xmm3
	psrldq	$6,%xmm2
	psrldq	$6,%xmm3
	movdqa	%xmm5,%xmm4
	punpcklqdq	%xmm3,%xmm2
	punpckhqdq	%xmm6,%xmm4
	punpcklqdq	%xmm6,%xmm5
	movdqa	%xmm2,%xmm3
	psrlq	$4,%xmm2
	psrlq	$30,%xmm3
	movdqa	%xmm5,%xmm6
	psrlq	$40,%xmm4
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm5
	pand	%xmm7,%xmm6
	pand	%xmm7,%xmm2
	pand	%xmm7,%xmm3
	por	(%ebx),%xmm4
	movdqa	%xmm0,80(%esp)
	movdqa	%xmm1,96(%esp)
	/* flags from the "subl $64,%ecx" in the middle of the loop body */
	ja	.L015loop
.L014skip_loop:
	/* tail: pshufd $16 picks dwords 0 and 1 of a table row and places
	 * them in the low dword of each 64-bit lane */
	pshufd	$16,-144(%edx),%xmm7
	addl	$32,%ecx
	jnz	.L016long_tail
	/* exactly 32 bytes left: fold the parked accumulator in now */
	paddd	%xmm0,%xmm5
	paddd	%xmm1,%xmm6
	paddd	112(%esp),%xmm2
	paddd	128(%esp),%xmm3
	paddd	144(%esp),%xmm4
.L016long_tail:
	movdqa	%xmm5,(%eax)
	movdqa	%xmm6,16(%eax)
	movdqa	%xmm2,32(%eax)
	movdqa	%xmm3,48(%eax)
	movdqa	%xmm4,64(%eax)
	pmuludq	%xmm7,%xmm5
	pmuludq	%xmm7,%xmm6
	pmuludq	%xmm7,%xmm2
	movdqa	%xmm5,%xmm0
	pshufd	$16,-128(%edx),%xmm5
	pmuludq	%xmm7,%xmm3
	movdqa	%xmm6,%xmm1
	pmuludq	%xmm7,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	48(%eax),%xmm5
	movdqa	%xmm6,%xmm7
	pmuludq	32(%eax),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%eax),%xmm7
	paddq	%xmm6,%xmm3
	pshufd	$16,-64(%edx),%xmm6
	pmuludq	(%eax),%xmm5
	paddq	%xmm7,%xmm2
	pmuludq	64(%eax),%xmm6
	pshufd	$16,-112(%edx),%xmm7
	paddq	%xmm5,%xmm1
	movdqa	%xmm7,%xmm5
	pmuludq	32(%eax),%xmm7
	paddq	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	pmuludq	16(%eax),%xmm5
	paddq	%xmm7,%xmm4
	pshufd	$16,-48(%edx),%xmm7
	pmuludq	(%eax),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	64(%eax),%xmm7
	paddq	%xmm6,%xmm2
	pmuludq	48(%eax),%xmm5
	pshufd	$16,-96(%edx),%xmm6
	paddq	%xmm7,%xmm1
	movdqa	%xmm6,%xmm7
	pmuludq	16(%eax),%xmm6
	paddq	%xmm5,%xmm0
	pshufd	$16,-32(%edx),%xmm5
	pmuludq	(%eax),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	64(%eax),%xmm5
	paddq	%xmm7,%xmm3
	movdqa	%xmm6,%xmm7
	pmuludq	48(%eax),%xmm6
	paddq	%xmm5,%xmm2
	pmuludq	32(%eax),%xmm7
	pshufd	$16,-80(%edx),%xmm5
	paddq	%xmm6,%xmm1
	pshufd	$16,-16(%edx),%xmm6
	pmuludq	(%eax),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	64(%eax),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%eax),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	32(%eax),%xmm5
	paddq	%xmm7,%xmm0
	pmuludq	48(%eax),%xmm6
	movdqa	64(%ebx),%xmm7
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
	/* ZF is still the result of "addl $32,%ecx" above: the SSE and
	 * mov instructions in between do not modify EFLAGS */
	jz	.L017short_tail
	/* 64-byte tail: absorb the remaining block pair together with the
	 * parked accumulator, using the table copies at 0..128(%edx) */
	movdqu	-32(%esi),%xmm5
	movdqu	-16(%esi),%xmm6
	leal	32(%esi),%esi
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	movdqa	%xmm4,64(%esp)
	movdqa	%xmm5,%xmm2
	movdqa	%xmm6,%xmm3
	psrldq	$6,%xmm2
	psrldq	$6,%xmm3
	movdqa	%xmm5,%xmm4
	punpcklqdq	%xmm3,%xmm2
	punpckhqdq	%xmm6,%xmm4
	punpcklqdq	%xmm6,%xmm5
	movdqa	%xmm2,%xmm3
	psrlq	$4,%xmm2
	psrlq	$30,%xmm3
	movdqa	%xmm5,%xmm6
	psrlq	$40,%xmm4
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm5
	pand	%xmm7,%xmm6
	pand	%xmm7,%xmm2
	pand	%xmm7,%xmm3
	por	(%ebx),%xmm4
	pshufd	$16,(%edx),%xmm7
	paddd	80(%esp),%xmm5
	paddd	96(%esp),%xmm6
	paddd	112(%esp),%xmm2
	paddd	128(%esp),%xmm3
	paddd	144(%esp),%xmm4
	movdqa	%xmm5,(%esp)
	pmuludq	%xmm7,%xmm5
	movdqa	%xmm6,16(%esp)
	pmuludq	%xmm7,%xmm6
	paddq	%xmm5,%xmm0
	movdqa	%xmm2,%xmm5
	pmuludq	%xmm7,%xmm2
	paddq	%xmm6,%xmm1
	movdqa	%xmm3,%xmm6
	pmuludq	%xmm7,%xmm3
	paddq	32(%esp),%xmm2
	movdqa	%xmm5,32(%esp)
	pshufd	$16,16(%edx),%xmm5
	paddq	48(%esp),%xmm3
	movdqa	%xmm6,48(%esp)
	movdqa	%xmm4,%xmm6
	pmuludq	%xmm7,%xmm4
	paddq	64(%esp),%xmm4
	movdqa	%xmm6,64(%esp)
	movdqa	%xmm5,%xmm6
	pmuludq	48(%esp),%xmm5
	movdqa	%xmm6,%xmm7
	pmuludq	32(%esp),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%esp),%xmm7
	paddq	%xmm6,%xmm3
	pshufd	$16,80(%edx),%xmm6
	pmuludq	(%esp),%xmm5
	paddq	%xmm7,%xmm2
	pmuludq	64(%esp),%xmm6
	pshufd	$16,32(%edx),%xmm7
	paddq	%xmm5,%xmm1
	movdqa	%xmm7,%xmm5
	pmuludq	32(%esp),%xmm7
	paddq	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	pmuludq	16(%esp),%xmm5
	paddq	%xmm7,%xmm4
	pshufd	$16,96(%edx),%xmm7
	pmuludq	(%esp),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	64(%esp),%xmm7
	paddq	%xmm6,%xmm2
	pmuludq	48(%esp),%xmm5
	pshufd	$16,48(%edx),%xmm6
	paddq	%xmm7,%xmm1
	movdqa	%xmm6,%xmm7
	pmuludq	16(%esp),%xmm6
	paddq	%xmm5,%xmm0
	pshufd	$16,112(%edx),%xmm5
	pmuludq	(%esp),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	64(%esp),%xmm5
	paddq	%xmm7,%xmm3
	movdqa	%xmm6,%xmm7
	pmuludq	48(%esp),%xmm6
	paddq	%xmm5,%xmm2
	pmuludq	32(%esp),%xmm7
	pshufd	$16,64(%edx),%xmm5
	paddq	%xmm6,%xmm1
	pshufd	$16,128(%edx),%xmm6
	pmuludq	(%esp),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	64(%esp),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%esp),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	32(%esp),%xmm5
	paddq	%xmm7,%xmm0
	pmuludq	48(%esp),%xmm6
	movdqa	64(%ebx),%xmm7
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
.L017short_tail:
	/* horizontal add: pshufd $78 swaps the two 64-bit halves */
	pshufd	$78,%xmm4,%xmm6
	pshufd	$78,%xmm3,%xmm5
	paddq	%xmm6,%xmm4
	paddq	%xmm5,%xmm3
	pshufd	$78,%xmm0,%xmm6
	pshufd	$78,%xmm1,%xmm5
	paddq	%xmm6,%xmm0
	paddq	%xmm5,%xmm1
	pshufd	$78,%xmm2,%xmm6
	/* final carry propagation back to 26-bit limbs */
	movdqa	%xmm3,%xmm5
	pand	%xmm7,%xmm3
	psrlq	$26,%xmm5
	paddq	%xmm6,%xmm2
	paddq	%xmm4,%xmm5
	movdqa	%xmm0,%xmm6
	pand	%xmm7,%xmm0
	psrlq	$26,%xmm6
	movdqa	%xmm5,%xmm4
	paddq	%xmm1,%xmm6
	psrlq	$26,%xmm5
	pand	%xmm7,%xmm4
	movdqa	%xmm6,%xmm1
	psrlq	$26,%xmm6
	paddd	%xmm5,%xmm0
	psllq	$2,%xmm5
	paddq	%xmm2,%xmm6
	paddq	%xmm0,%xmm5
	pand	%xmm7,%xmm1
	movdqa	%xmm6,%xmm2
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm2
	paddd	%xmm3,%xmm6
	movdqa	%xmm5,%xmm0
	psrlq	$26,%xmm5
	movdqa	%xmm6,%xmm3
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm0
	paddd	%xmm5,%xmm1
	pand	%xmm7,%xmm3
	paddd	%xmm6,%xmm4
.L013done:
	/* %edi is ctx + 48 here: store the five limbs at ctx + 0..16 */
	movd	%xmm0,-48(%edi)
	movd	%xmm1,-44(%edi)
	movd	%xmm2,-40(%edi)
	movd	%xmm3,-36(%edi)
	movd	%xmm4,-32(%edi)
	movl	%ebp,%esp
.L007nodata:
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	_poly1305_blocks_sse2,.-_poly1305_blocks_sse2

/*
 * _poly1305_emit_sse2(ctx, mac, nonce) - same stack arguments as
 * poly1305_emit; installed in func[1] by poly1305_init.
 *
 * If the ctx flag at offset 20 is 0 (h still in base 2^32) control
 * transfers to the scalar .Lenter_emit.  Otherwise the five 26-bit limbs
 * are recombined into 32-bit words, the bits above 2^130 are folded back
 * times five, and the same "h + 5, select, add nonce" sequence as in
 * poly1305_emit is applied.
 */
.align	32
.type	_poly1305_emit_sse2,@function
.align	16
_poly1305_emit_sse2:
#ifdef __CET__
	.byte	243,15,30,251
#endif
	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%ebp
	cmpl	$0,20(%ebp)
	je	.Lenter_emit
	movl	(%ebp),%eax
	movl	4(%ebp),%edi
	movl	8(%ebp),%ecx
	movl	12(%ebp),%edx
	movl	16(%ebp),%esi
	/* base 2^26 -> base 2^32: limb i contributes at bit 26*i */
	movl	%edi,%ebx
	shll	$26,%edi
	shrl	$6,%ebx
	addl	%edi,%eax
	movl	%ecx,%edi
	adcl	$0,%ebx
	shll	$20,%edi
	shrl	$12,%ecx
	addl	%edi,%ebx
	movl	%edx,%edi
	adcl	$0,%ecx
	shll	$14,%edi
	shrl	$18,%edx
	addl	%edi,%ecx
	movl	%esi,%edi
	adcl	$0,%edx
	shll	$8,%edi
	shrl	$24,%esi
	addl	%edi,%edx
	adcl	$0,%esi
	/* fold 5 * (h4 >> 2) into the low words, keep h4 & 3 */
	movl	%esi,%edi
	andl	$3,%esi
	shrl	$2,%edi
	leal	(%edi,%edi,4),%ebp
	movl	24(%esp),%edi
	addl	%ebp,%eax
	movl	28(%esp),%ebp
	adcl	$0,%ebx
	adcl	$0,%ecx
	adcl	$0,%edx
	adcl	$0,%esi
	/* keep h in %xmm0-%xmm3 while h + 5 is computed in the GPRs */
	movd	%eax,%xmm0
	addl	$5,%eax
	movd	%ebx,%xmm1
	adcl	$0,%ebx
	movd	%ecx,%xmm2
	adcl	$0,%ecx
	movd	%edx,%xmm3
	adcl	$0,%edx
	adcl	$0,%esi
	shrl	$2,%esi
	negl	%esi
	andl	%esi,%eax
	andl	%esi,%ebx
	andl	%esi,%ecx
	andl	%esi,%edx
	movl	%eax,(%edi)
	movd	%xmm0,%eax
	movl	%ebx,4(%edi)
	movd	%xmm1,%ebx
	movl	%ecx,8(%edi)
	movd	%xmm2,%ecx
	movl	%edx,12(%edi)
	movd	%xmm3,%edx
	notl	%esi
	andl	%esi,%eax
	andl	%esi,%ebx
	orl	(%edi),%eax
	andl	%esi,%ecx
	orl	4(%edi),%ebx
	andl	%esi,%edx
	orl	8(%edi),%ecx
	orl	12(%edi),%edx
	/* add the nonce and store the 16-byte tag */
	addl	(%ebp),%eax
	adcl	4(%ebp),%ebx
	movl	%eax,(%edi)
	adcl	8(%ebp),%ecx
	movl	%ebx,4(%edi)
	adcl	12(%ebp),%edx
	movl	%ecx,8(%edi)
	movl	%edx,12(%edi)
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	_poly1305_emit_sse2,.-_poly1305_emit_sse2

/*
 * Constants addressed relative to %ebx in the SSE2 routines:
 *    0(%ebx)  1 << 24 in the low dword of each 64-bit lane
 *             (pad bit position within the fifth 26-bit limb)
 *   32(%ebx)  zeros (not referenced by the code in this file)
 *   64(%ebx)  0x3ffffff limb mask in each 64-bit lane
 *   96(%ebx)  0x0fffffff, 0x0ffffffc x3 (not referenced by the code in
 *             this file; same values as the clamp in poly1305_init)
 * followed by the NUL-terminated identification string.
 */
.align	64
.Lconst_sse2:
.long	16777216,0,16777216,0,16777216,0,16777216,0
.long	0,0,0,0,0,0,0,0
.long	67108863,0,67108863,0,67108863,0,67108863,0
.long	268435455,268435452,268435452,268435452
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54
.byte	44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
.byte	60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
.byte	114,103,62,0
.align	4
.comm	OPENSSL_ia32cap_P,16,4

	/*
	 * GNU property note: type 5, owner "GNU", one property of type
	 * 0xc0000002 with the 4-byte value 3.
	 * NOTE(review): presumably GNU_PROPERTY_X86_FEATURE_1_AND with
	 * IBT|SHSTK - confirm against the ELF gABI extension document.
	 */
	.section ".note.gnu.property", "a"
	.p2align 2
	.long 1f - 0f
	.long 4f - 1f
	.long 5
0:
	.asciz "GNU"
1:
	.p2align 2
	.long 0xc0000002
	.long 3f - 2f
2:
	.long 3
3:
	.p2align 2
4: