/* poly1305-x86_64.S, revision 1.3 */
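/*
 * Editorial note (added; not part of the generated source): this file
 * appears to be the assembler output of OpenSSL's poly1305-x86_64.pl,
 * providing scalar, AVX, AVX2, AVX-512 and VPMADD52/IFMA code paths for
 * the Poly1305 one-time authenticator.  poly1305_init stores the chosen
 * blocks/emit entry points through %rdx based on OPENSSL_ia32cap_P
 * feature bits.
 */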
#include <machine/asm.h>
.text



.globl poly1305_init
.hidden poly1305_init
.globl poly1305_blocks
.hidden poly1305_blocks
.globl poly1305_emit
.hidden poly1305_emit

.type poly1305_init,@function
.align 32
poly1305_init:
.cfi_startproc
	xorq %rax,%rax
	movq %rax,0(%rdi)
	movq %rax,8(%rdi)
	movq %rax,16(%rdi)

	cmpq $0,%rsi
	je .Lno_key

	leaq poly1305_blocks(%rip),%r10
	leaq poly1305_emit(%rip),%r11
	movq OPENSSL_ia32cap_P+4(%rip),%r9
	leaq poly1305_blocks_avx(%rip),%rax
	leaq poly1305_emit_avx(%rip),%rcx
	btq $28,%r9
	cmovcq %rax,%r10
	cmovcq %rcx,%r11
	leaq poly1305_blocks_avx2(%rip),%rax
	btq $37,%r9
	cmovcq %rax,%r10
	movq $2149646336,%rax
	shrq $32,%r9
	andq %rax,%r9
	cmpq %rax,%r9
	je .Linit_base2_44
	movq $0x0ffffffc0fffffff,%rax
	movq $0x0ffffffc0ffffffc,%rcx
	andq 0(%rsi),%rax
	andq 8(%rsi),%rcx
	movq %rax,24(%rdi)
	movq %rcx,32(%rdi)
	movq %r10,0(%rdx)
	movq %r11,8(%rdx)
	movl $1,%eax
.Lno_key:
	.byte 0xf3,0xc3
.cfi_endproc
.size poly1305_init,.-poly1305_init

.type poly1305_blocks,@function
.align 32
poly1305_blocks:
.cfi_startproc
.Lblocks:
	shrq $4,%rdx
	jz .Lno_data

	pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-16
	pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-24
	pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
	pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
	pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
	pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
.Lblocks_body:

	movq %rdx,%r15

	movq 24(%rdi),%r11
	movq 32(%rdi),%r13

	movq 0(%rdi),%r14
	movq 8(%rdi),%rbx
	movq 16(%rdi),%rbp

	movq %r13,%r12
	shrq $2,%r13
	movq %r12,%rax
	addq %r12,%r13
	jmp .Loop

.align 32
.Loop:
	addq 0(%rsi),%r14
	adcq 8(%rsi),%rbx
	leaq 16(%rsi),%rsi
	adcq %rcx,%rbp
	mulq %r14
	movq %rax,%r9
	movq %r11,%rax
	movq %rdx,%r10

	mulq %r14
	movq %rax,%r14
	movq %r11,%rax
	movq %rdx,%r8

	mulq %rbx
	addq %rax,%r9
	movq %r13,%rax
	adcq %rdx,%r10

	mulq %rbx
	movq %rbp,%rbx
	addq %rax,%r14
	adcq %rdx,%r8

	imulq %r13,%rbx
	addq %rbx,%r9
	movq %r8,%rbx
	adcq $0,%r10

	imulq %r11,%rbp
	addq %r9,%rbx
	movq $-4,%rax
	adcq %rbp,%r10

	andq %r10,%rax
	movq %r10,%rbp
	shrq $2,%r10
	andq $3,%rbp
	addq %r10,%rax
	addq %rax,%r14
	adcq $0,%rbx
	adcq $0,%rbp
	movq %r12,%rax
	decq %r15
	jnz .Loop

	movq %r14,0(%rdi)
	movq %rbx,8(%rdi)
	movq %rbp,16(%rdi)

	movq 0(%rsp),%r15
.cfi_restore %r15
	movq 8(%rsp),%r14
.cfi_restore %r14
	movq 16(%rsp),%r13
.cfi_restore %r13
	movq 24(%rsp),%r12
.cfi_restore %r12
	movq 32(%rsp),%rbp
.cfi_restore %rbp
	movq 40(%rsp),%rbx
.cfi_restore %rbx
	leaq 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
.Lno_data:
.Lblocks_epilogue:
	.byte 0xf3,0xc3
.cfi_endproc
.size poly1305_blocks,.-poly1305_blocks

.type poly1305_emit,@function
.align 32
poly1305_emit:
.cfi_startproc
.Lemit:
	movq 0(%rdi),%r8
	movq 8(%rdi),%r9
	movq 16(%rdi),%r10

	movq %r8,%rax
	addq $5,%r8
	movq %r9,%rcx
	adcq $0,%r9
	adcq $0,%r10
	shrq $2,%r10
	cmovnzq %r8,%rax
	cmovnzq %r9,%rcx

	addq 0(%rdx),%rax
	adcq 8(%rdx),%rcx
	movq %rax,0(%rsi)
	movq %rcx,8(%rsi)

	.byte 0xf3,0xc3
.cfi_endproc
.size poly1305_emit,.-poly1305_emit
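/*
 * Editorial note (added): the scalar loop above and __poly1305_block
 * below compute one step h = (h + m) * r mod 2^130-5.  The accumulator
 * lives in %r14:%rbx:%rbp (h0, h1, small h2), %r11 holds r0, %r13 holds
 * s1 = r1 + (r1>>2), and %rax carries r1 on entry.  After the multiply,
 * the product bits above 2^130 are multiplied by 5 (the andq/shrq/addq
 * tail forms 4c + c) and folded back into h0, since 2^130 = 5 (mod p).
 */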
.type __poly1305_block,@function
.align 32
__poly1305_block:
.cfi_startproc
	mulq %r14
	movq %rax,%r9
	movq %r11,%rax
	movq %rdx,%r10

	mulq %r14
	movq %rax,%r14
	movq %r11,%rax
	movq %rdx,%r8

	mulq %rbx
	addq %rax,%r9
	movq %r13,%rax
	adcq %rdx,%r10

	mulq %rbx
	movq %rbp,%rbx
	addq %rax,%r14
	adcq %rdx,%r8

	imulq %r13,%rbx
	addq %rbx,%r9
	movq %r8,%rbx
	adcq $0,%r10

	imulq %r11,%rbp
	addq %r9,%rbx
	movq $-4,%rax
	adcq %rbp,%r10

	andq %r10,%rax
	movq %r10,%rbp
	shrq $2,%r10
	andq $3,%rbp
	addq %r10,%rax
	addq %rax,%r14
	adcq $0,%rbx
	adcq $0,%rbp
	.byte 0xf3,0xc3
.cfi_endproc
.size __poly1305_block,.-__poly1305_block

.type __poly1305_init_avx,@function
.align 32
__poly1305_init_avx:
.cfi_startproc
	movq %r11,%r14
	movq %r12,%rbx
	xorq %rbp,%rbp

	leaq 48+64(%rdi),%rdi

	movq %r12,%rax
	call __poly1305_block

	movl $0x3ffffff,%eax
	movl $0x3ffffff,%edx
	movq %r14,%r8
	andl %r14d,%eax
	movq %r11,%r9
	andl %r11d,%edx
	movl %eax,-64(%rdi)
	shrq $26,%r8
	movl %edx,-60(%rdi)
	shrq $26,%r9

	movl $0x3ffffff,%eax
	movl $0x3ffffff,%edx
	andl %r8d,%eax
	andl %r9d,%edx
	movl %eax,-48(%rdi)
	leal (%rax,%rax,4),%eax
	movl %edx,-44(%rdi)
	leal (%rdx,%rdx,4),%edx
	movl %eax,-32(%rdi)
	shrq $26,%r8
	movl %edx,-28(%rdi)
	shrq $26,%r9

	movq %rbx,%rax
	movq %r12,%rdx
	shlq $12,%rax
	shlq $12,%rdx
	orq %r8,%rax
	orq %r9,%rdx
	andl $0x3ffffff,%eax
	andl $0x3ffffff,%edx
	movl %eax,-16(%rdi)
	leal (%rax,%rax,4),%eax
	movl %edx,-12(%rdi)
	leal (%rdx,%rdx,4),%edx
	movl %eax,0(%rdi)
	movq %rbx,%r8
	movl %edx,4(%rdi)
	movq %r12,%r9

	movl $0x3ffffff,%eax
	movl $0x3ffffff,%edx
	shrq $14,%r8
	shrq $14,%r9
	andl %r8d,%eax
	andl %r9d,%edx
	movl %eax,16(%rdi)
	leal (%rax,%rax,4),%eax
	movl %edx,20(%rdi)
	leal (%rdx,%rdx,4),%edx
	movl %eax,32(%rdi)
	shrq $26,%r8
	movl %edx,36(%rdi)
	shrq $26,%r9

	movq %rbp,%rax
	shlq $24,%rax
	orq %rax,%r8
	movl %r8d,48(%rdi)
	leaq (%r8,%r8,4),%r8
	movl %r9d,52(%rdi)
	leaq (%r9,%r9,4),%r9
	movl %r8d,64(%rdi)
	movl %r9d,68(%rdi)

	movq %r12,%rax
	call __poly1305_block

	movl $0x3ffffff,%eax
	movq %r14,%r8
	andl %r14d,%eax
	shrq $26,%r8
	movl %eax,-52(%rdi)

	movl $0x3ffffff,%edx
	andl %r8d,%edx
	movl %edx,-36(%rdi)
	leal (%rdx,%rdx,4),%edx
	shrq $26,%r8
	movl %edx,-20(%rdi)

	movq %rbx,%rax
	shlq $12,%rax
	orq %r8,%rax
	andl $0x3ffffff,%eax
	movl %eax,-4(%rdi)
	leal (%rax,%rax,4),%eax
	movq %rbx,%r8
	movl %eax,12(%rdi)

	movl $0x3ffffff,%edx
	shrq $14,%r8
	andl %r8d,%edx
	movl %edx,28(%rdi)
	leal (%rdx,%rdx,4),%edx
	shrq $26,%r8
	movl %edx,44(%rdi)

	movq %rbp,%rax
	shlq $24,%rax
	orq %rax,%r8
	movl %r8d,60(%rdi)
	leaq (%r8,%r8,4),%r8
	movl %r8d,76(%rdi)

	movq %r12,%rax
	call __poly1305_block

	movl $0x3ffffff,%eax
	movq %r14,%r8
	andl %r14d,%eax
	shrq $26,%r8
	movl %eax,-56(%rdi)

	movl $0x3ffffff,%edx
	andl %r8d,%edx
	movl %edx,-40(%rdi)
	leal (%rdx,%rdx,4),%edx
	shrq $26,%r8
	movl %edx,-24(%rdi)

	movq %rbx,%rax
	shlq $12,%rax
	orq %r8,%rax
	andl $0x3ffffff,%eax
	movl %eax,-8(%rdi)
	leal (%rax,%rax,4),%eax
	movq %rbx,%r8
	movl %eax,8(%rdi)

	movl $0x3ffffff,%edx
	shrq $14,%r8
	andl %r8d,%edx
	movl %edx,24(%rdi)
	leal (%rdx,%rdx,4),%edx
	shrq $26,%r8
	movl %edx,40(%rdi)

	movq %rbp,%rax
	shlq $24,%rax
	orq %rax,%r8
	movl %r8d,56(%rdi)
	leaq (%r8,%r8,4),%r8
	movl %r8d,72(%rdi)

	leaq -48-64(%rdi),%rdi
	.byte 0xf3,0xc3
.cfi_endproc
.size __poly1305_init_avx,.-__poly1305_init_avx

.type poly1305_blocks_avx,@function
.align 32
poly1305_blocks_avx:
.cfi_startproc
	movl 20(%rdi),%r8d
	cmpq $128,%rdx
	jae .Lblocks_avx
	testl %r8d,%r8d
	jz .Lblocks

.Lblocks_avx:
	andq $-16,%rdx
	jz .Lno_data_avx

	vzeroupper

	testl %r8d,%r8d
	jz .Lbase2_64_avx

	testq $31,%rdx
	jz .Leven_avx

	pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-16
	pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-24
	pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
	pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
	pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
	pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
.Lblocks_avx_body:

	movq %rdx,%r15

	movq 0(%rdi),%r8
	movq 8(%rdi),%r9
	movl 16(%rdi),%ebp

	movq 24(%rdi),%r11
	movq 32(%rdi),%r13


	movl %r8d,%r14d
	andq $-2147483648,%r8
	movq %r9,%r12
	movl %r9d,%ebx
	andq $-2147483648,%r9

	shrq $6,%r8
	shlq $52,%r12
	addq %r8,%r14
	shrq $12,%rbx
	shrq $18,%r9
	addq %r12,%r14
	adcq %r9,%rbx

	movq %rbp,%r8
	shlq $40,%r8
	shrq $24,%rbp
	addq %r8,%rbx
	adcq $0,%rbp

	movq $-4,%r9
	movq %rbp,%r8
	andq %rbp,%r9
	shrq $2,%r8
	andq $3,%rbp
	addq %r9,%r8
	addq %r8,%r14
	adcq $0,%rbx
	adcq $0,%rbp

	movq %r13,%r12
	movq %r13,%rax
	shrq $2,%r13
	addq %r12,%r13

	addq 0(%rsi),%r14
	adcq 8(%rsi),%rbx
	leaq 16(%rsi),%rsi
	adcq %rcx,%rbp

	call __poly1305_block

	testq %rcx,%rcx
	jz .Lstore_base2_64_avx


	movq %r14,%rax
	movq %r14,%rdx
	shrq $52,%r14
	movq %rbx,%r11
	movq %rbx,%r12
	shrq $26,%rdx
	andq $0x3ffffff,%rax
	shlq $12,%r11
	andq $0x3ffffff,%rdx
	shrq $14,%rbx
	orq %r11,%r14
	shlq $24,%rbp
	andq $0x3ffffff,%r14
	shrq $40,%r12
	andq $0x3ffffff,%rbx
	orq %r12,%rbp

	subq $16,%r15
	jz .Lstore_base2_26_avx

	vmovd %eax,%xmm0
	vmovd %edx,%xmm1
	vmovd %r14d,%xmm2
	vmovd %ebx,%xmm3
	vmovd %ebp,%xmm4
	jmp .Lproceed_avx

.align 32
.Lstore_base2_64_avx:
	movq %r14,0(%rdi)
	movq %rbx,8(%rdi)
	movq %rbp,16(%rdi)
	jmp .Ldone_avx

.align 16
.Lstore_base2_26_avx:
	movl %eax,0(%rdi)
	movl %edx,4(%rdi)
	movl %r14d,8(%rdi)
	movl %ebx,12(%rdi)
	movl %ebp,16(%rdi)
.align 16
.Ldone_avx:
	movq 0(%rsp),%r15
.cfi_restore %r15
	movq 8(%rsp),%r14
.cfi_restore %r14
	movq 16(%rsp),%r13
.cfi_restore %r13
	movq 24(%rsp),%r12
.cfi_restore %r12
	movq 32(%rsp),%rbp
.cfi_restore %rbp
	movq 40(%rsp),%rbx
.cfi_restore %rbx
	leaq 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
.Lno_data_avx:
.Lblocks_avx_epilogue:
	.byte 0xf3,0xc3
.cfi_endproc

.align 32
.Lbase2_64_avx:
.cfi_startproc
	pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-16
	pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-24
	pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
	pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
	pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
	pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
.Lbase2_64_avx_body:

	movq %rdx,%r15

	movq 24(%rdi),%r11
	movq 32(%rdi),%r13

	movq 0(%rdi),%r14
	movq 8(%rdi),%rbx
	movl 16(%rdi),%ebp

	movq %r13,%r12
	movq %r13,%rax
	shrq $2,%r13
	addq %r12,%r13

	testq $31,%rdx
	jz .Linit_avx

	addq 0(%rsi),%r14
	adcq 8(%rsi),%rbx
	leaq 16(%rsi),%rsi
	adcq %rcx,%rbp
	subq $16,%r15

	call __poly1305_block

.Linit_avx:

	movq %r14,%rax
	movq %r14,%rdx
	shrq $52,%r14
	movq %rbx,%r8
	movq %rbx,%r9
	shrq $26,%rdx
	andq $0x3ffffff,%rax
	shlq $12,%r8
	andq $0x3ffffff,%rdx
	shrq $14,%rbx
	orq %r8,%r14
	shlq $24,%rbp
	andq $0x3ffffff,%r14
	shrq $40,%r9
	andq $0x3ffffff,%rbx
	orq %r9,%rbp

	vmovd %eax,%xmm0
	vmovd %edx,%xmm1
	vmovd %r14d,%xmm2
	vmovd %ebx,%xmm3
	vmovd %ebp,%xmm4
	movl $1,20(%rdi)

	call __poly1305_init_avx

.Lproceed_avx:
	movq %r15,%rdx

	movq 0(%rsp),%r15
.cfi_restore %r15
	movq 8(%rsp),%r14
.cfi_restore %r14
	movq 16(%rsp),%r13
.cfi_restore %r13
	movq 24(%rsp),%r12
.cfi_restore %r12
	movq 32(%rsp),%rbp
.cfi_restore %rbp
	movq 40(%rsp),%rbx
.cfi_restore %rbx
	leaq 48(%rsp),%rax
	leaq 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
.Lbase2_64_avx_epilogue:
	jmp .Ldo_avx
.cfi_endproc

.align 32
.Leven_avx:
.cfi_startproc
	vmovd 0(%rdi),%xmm0
	vmovd 4(%rdi),%xmm1
	vmovd 8(%rdi),%xmm2
	vmovd 12(%rdi),%xmm3
	vmovd 16(%rdi),%xmm4

.Ldo_avx:
	leaq -88(%rsp),%r11
.cfi_def_cfa %r11,0x60
	subq $0x178,%rsp
	subq $64,%rdx
	leaq -32(%rsi),%rax
	cmovcq %rax,%rsi

	vmovdqu 48(%rdi),%xmm14
	leaq 112(%rdi),%rdi
	leaq .Lconst(%rip),%rcx



	vmovdqu 32(%rsi),%xmm5
	vmovdqu 48(%rsi),%xmm6
	vmovdqa 64(%rcx),%xmm15

	vpsrldq $6,%xmm5,%xmm7
	vpsrldq $6,%xmm6,%xmm8
	vpunpckhqdq %xmm6,%xmm5,%xmm9
	vpunpcklqdq %xmm6,%xmm5,%xmm5
	vpunpcklqdq %xmm8,%xmm7,%xmm8

	vpsrlq $40,%xmm9,%xmm9
	vpsrlq $26,%xmm5,%xmm6
	vpand %xmm15,%xmm5,%xmm5
	vpsrlq $4,%xmm8,%xmm7
	vpand %xmm15,%xmm6,%xmm6
	vpsrlq $30,%xmm8,%xmm8
	vpand %xmm15,%xmm7,%xmm7
	vpand %xmm15,%xmm8,%xmm8
	vpor 32(%rcx),%xmm9,%xmm9

	jbe .Lskip_loop_avx


	vmovdqu -48(%rdi),%xmm11
	vmovdqu -32(%rdi),%xmm12
	vpshufd $0xEE,%xmm14,%xmm13
	vpshufd $0x44,%xmm14,%xmm10
	vmovdqa %xmm13,-144(%r11)
	vmovdqa %xmm10,0(%rsp)
	vpshufd $0xEE,%xmm11,%xmm14
	vmovdqu -16(%rdi),%xmm10
	vpshufd $0x44,%xmm11,%xmm11
	vmovdqa %xmm14,-128(%r11)
	vmovdqa %xmm11,16(%rsp)
	vpshufd $0xEE,%xmm12,%xmm13
	vmovdqu 0(%rdi),%xmm11
	vpshufd $0x44,%xmm12,%xmm12
	vmovdqa %xmm13,-112(%r11)
	vmovdqa %xmm12,32(%rsp)
	vpshufd $0xEE,%xmm10,%xmm14
	vmovdqu 16(%rdi),%xmm12
	vpshufd $0x44,%xmm10,%xmm10
	vmovdqa %xmm14,-96(%r11)
	vmovdqa %xmm10,48(%rsp)
	vpshufd $0xEE,%xmm11,%xmm13
	vmovdqu 32(%rdi),%xmm10
	vpshufd $0x44,%xmm11,%xmm11
	vmovdqa %xmm13,-80(%r11)
	vmovdqa %xmm11,64(%rsp)
	vpshufd $0xEE,%xmm12,%xmm14
	vmovdqu 48(%rdi),%xmm11
	vpshufd $0x44,%xmm12,%xmm12
	vmovdqa %xmm14,-64(%r11)
	vmovdqa %xmm12,80(%rsp)
	vpshufd $0xEE,%xmm10,%xmm13
	vmovdqu 64(%rdi),%xmm12
	vpshufd $0x44,%xmm10,%xmm10
	vmovdqa %xmm13,-48(%r11)
	vmovdqa %xmm10,96(%rsp)
	vpshufd $0xEE,%xmm11,%xmm14
	vpshufd $0x44,%xmm11,%xmm11
	vmovdqa %xmm14,-32(%r11)
	vmovdqa %xmm11,112(%rsp)
	vpshufd $0xEE,%xmm12,%xmm13
	vmovdqa 0(%rsp),%xmm14
	vpshufd $0x44,%xmm12,%xmm12
	vmovdqa %xmm13,-16(%r11)
	vmovdqa %xmm12,128(%rsp)

	jmp .Loop_avx

.align 32
.Loop_avx:




















	vpmuludq %xmm5,%xmm14,%xmm10
	vpmuludq %xmm6,%xmm14,%xmm11
	vmovdqa %xmm2,32(%r11)
	vpmuludq %xmm7,%xmm14,%xmm12
	vmovdqa 16(%rsp),%xmm2
	vpmuludq %xmm8,%xmm14,%xmm13
	vpmuludq %xmm9,%xmm14,%xmm14

	vmovdqa %xmm0,0(%r11)
	vpmuludq 32(%rsp),%xmm9,%xmm0
	vmovdqa %xmm1,16(%r11)
	vpmuludq %xmm8,%xmm2,%xmm1
	vpaddq %xmm0,%xmm10,%xmm10
	vpaddq %xmm1,%xmm14,%xmm14
	vmovdqa %xmm3,48(%r11)
	vpmuludq %xmm7,%xmm2,%xmm0
	vpmuludq %xmm6,%xmm2,%xmm1
	vpaddq %xmm0,%xmm13,%xmm13
	vmovdqa 48(%rsp),%xmm3
	vpaddq %xmm1,%xmm12,%xmm12
	vmovdqa %xmm4,64(%r11)
	vpmuludq %xmm5,%xmm2,%xmm2
	vpmuludq %xmm7,%xmm3,%xmm0
	vpaddq %xmm2,%xmm11,%xmm11

	vmovdqa 64(%rsp),%xmm4
	vpaddq %xmm0,%xmm14,%xmm14
	vpmuludq %xmm6,%xmm3,%xmm1
	vpmuludq %xmm5,%xmm3,%xmm3
	vpaddq %xmm1,%xmm13,%xmm13
	vmovdqa 80(%rsp),%xmm2
	vpaddq %xmm3,%xmm12,%xmm12
	vpmuludq %xmm9,%xmm4,%xmm0
	vpmuludq %xmm8,%xmm4,%xmm4
	vpaddq %xmm0,%xmm11,%xmm11
	vmovdqa 96(%rsp),%xmm3
	vpaddq %xmm4,%xmm10,%xmm10

	vmovdqa 128(%rsp),%xmm4
	vpmuludq %xmm6,%xmm2,%xmm1
	vpmuludq %xmm5,%xmm2,%xmm2
	vpaddq %xmm1,%xmm14,%xmm14
	vpaddq %xmm2,%xmm13,%xmm13
	vpmuludq %xmm9,%xmm3,%xmm0
	vpmuludq %xmm8,%xmm3,%xmm1
	vpaddq %xmm0,%xmm12,%xmm12
	vmovdqu 0(%rsi),%xmm0
	vpaddq %xmm1,%xmm11,%xmm11
	vpmuludq %xmm7,%xmm3,%xmm3
	vpmuludq %xmm7,%xmm4,%xmm7
	vpaddq %xmm3,%xmm10,%xmm10

	vmovdqu 16(%rsi),%xmm1
	vpaddq %xmm7,%xmm11,%xmm11
	vpmuludq %xmm8,%xmm4,%xmm8
	vpmuludq %xmm9,%xmm4,%xmm9
	vpsrldq $6,%xmm0,%xmm2
	vpaddq %xmm8,%xmm12,%xmm12
	vpaddq %xmm9,%xmm13,%xmm13
	vpsrldq $6,%xmm1,%xmm3
	vpmuludq 112(%rsp),%xmm5,%xmm9
	vpmuludq %xmm6,%xmm4,%xmm5
	vpunpckhqdq %xmm1,%xmm0,%xmm4
	vpaddq %xmm9,%xmm14,%xmm14
	vmovdqa -144(%r11),%xmm9
	vpaddq %xmm5,%xmm10,%xmm10

	vpunpcklqdq %xmm1,%xmm0,%xmm0
	vpunpcklqdq %xmm3,%xmm2,%xmm3


	vpsrldq $5,%xmm4,%xmm4
	vpsrlq $26,%xmm0,%xmm1
	vpand %xmm15,%xmm0,%xmm0
	vpsrlq $4,%xmm3,%xmm2
	vpand %xmm15,%xmm1,%xmm1
	vpand 0(%rcx),%xmm4,%xmm4
	vpsrlq $30,%xmm3,%xmm3
	vpand %xmm15,%xmm2,%xmm2
	vpand %xmm15,%xmm3,%xmm3
	vpor 32(%rcx),%xmm4,%xmm4

	vpaddq 0(%r11),%xmm0,%xmm0
	vpaddq 16(%r11),%xmm1,%xmm1
	vpaddq 32(%r11),%xmm2,%xmm2
	vpaddq 48(%r11),%xmm3,%xmm3
	vpaddq 64(%r11),%xmm4,%xmm4

	leaq 32(%rsi),%rax
	leaq 64(%rsi),%rsi
	subq $64,%rdx
	cmovcq %rax,%rsi










	vpmuludq %xmm0,%xmm9,%xmm5
	vpmuludq %xmm1,%xmm9,%xmm6
	vpaddq %xmm5,%xmm10,%xmm10
	vpaddq %xmm6,%xmm11,%xmm11
	vmovdqa -128(%r11),%xmm7
	vpmuludq %xmm2,%xmm9,%xmm5
	vpmuludq %xmm3,%xmm9,%xmm6
	vpaddq %xmm5,%xmm12,%xmm12
	vpaddq %xmm6,%xmm13,%xmm13
	vpmuludq %xmm4,%xmm9,%xmm9
	vpmuludq -112(%r11),%xmm4,%xmm5
	vpaddq %xmm9,%xmm14,%xmm14

	vpaddq %xmm5,%xmm10,%xmm10
	vpmuludq %xmm2,%xmm7,%xmm6
	vpmuludq %xmm3,%xmm7,%xmm5
	vpaddq %xmm6,%xmm13,%xmm13
	vmovdqa -96(%r11),%xmm8
	vpaddq %xmm5,%xmm14,%xmm14
	vpmuludq %xmm1,%xmm7,%xmm6
	vpmuludq %xmm0,%xmm7,%xmm7
	vpaddq %xmm6,%xmm12,%xmm12
	vpaddq %xmm7,%xmm11,%xmm11

	vmovdqa -80(%r11),%xmm9
	vpmuludq %xmm2,%xmm8,%xmm5
	vpmuludq %xmm1,%xmm8,%xmm6
	vpaddq %xmm5,%xmm14,%xmm14
	vpaddq %xmm6,%xmm13,%xmm13
	vmovdqa -64(%r11),%xmm7
	vpmuludq %xmm0,%xmm8,%xmm8
	vpmuludq %xmm4,%xmm9,%xmm5
	vpaddq %xmm8,%xmm12,%xmm12
	vpaddq %xmm5,%xmm11,%xmm11
	vmovdqa -48(%r11),%xmm8
	vpmuludq %xmm3,%xmm9,%xmm9
	vpmuludq %xmm1,%xmm7,%xmm6
	vpaddq %xmm9,%xmm10,%xmm10

	vmovdqa -16(%r11),%xmm9
	vpaddq %xmm6,%xmm14,%xmm14
	vpmuludq %xmm0,%xmm7,%xmm7
	vpmuludq %xmm4,%xmm8,%xmm5
	vpaddq %xmm7,%xmm13,%xmm13
	vpaddq %xmm5,%xmm12,%xmm12
	vmovdqu 32(%rsi),%xmm5
	vpmuludq %xmm3,%xmm8,%xmm7
	vpmuludq %xmm2,%xmm8,%xmm8
	vpaddq %xmm7,%xmm11,%xmm11
	vmovdqu 48(%rsi),%xmm6
	vpaddq %xmm8,%xmm10,%xmm10

	vpmuludq %xmm2,%xmm9,%xmm2
	vpmuludq %xmm3,%xmm9,%xmm3
	vpsrldq $6,%xmm5,%xmm7
	vpaddq %xmm2,%xmm11,%xmm11
	vpmuludq %xmm4,%xmm9,%xmm4
	vpsrldq $6,%xmm6,%xmm8
	vpaddq %xmm3,%xmm12,%xmm2
	vpaddq %xmm4,%xmm13,%xmm3
	vpmuludq -32(%r11),%xmm0,%xmm4
	vpmuludq %xmm1,%xmm9,%xmm0
	vpunpckhqdq %xmm6,%xmm5,%xmm9
	vpaddq %xmm4,%xmm14,%xmm4
	vpaddq %xmm0,%xmm10,%xmm0

	vpunpcklqdq %xmm6,%xmm5,%xmm5
	vpunpcklqdq %xmm8,%xmm7,%xmm8


	vpsrldq $5,%xmm9,%xmm9
	vpsrlq $26,%xmm5,%xmm6
	vmovdqa 0(%rsp),%xmm14
	vpand %xmm15,%xmm5,%xmm5
	vpsrlq $4,%xmm8,%xmm7
	vpand %xmm15,%xmm6,%xmm6
	vpand 0(%rcx),%xmm9,%xmm9
	vpsrlq $30,%xmm8,%xmm8
	vpand %xmm15,%xmm7,%xmm7
	vpand %xmm15,%xmm8,%xmm8
	vpor 32(%rcx),%xmm9,%xmm9





	vpsrlq $26,%xmm3,%xmm13
	vpand %xmm15,%xmm3,%xmm3
	vpaddq %xmm13,%xmm4,%xmm4

	vpsrlq $26,%xmm0,%xmm10
	vpand %xmm15,%xmm0,%xmm0
	vpaddq %xmm10,%xmm11,%xmm1

	vpsrlq $26,%xmm4,%xmm10
	vpand %xmm15,%xmm4,%xmm4

	vpsrlq $26,%xmm1,%xmm11
	vpand %xmm15,%xmm1,%xmm1
	vpaddq %xmm11,%xmm2,%xmm2

	vpaddq %xmm10,%xmm0,%xmm0
	vpsllq $2,%xmm10,%xmm10
	vpaddq %xmm10,%xmm0,%xmm0

	vpsrlq $26,%xmm2,%xmm12
	vpand %xmm15,%xmm2,%xmm2
	vpaddq %xmm12,%xmm3,%xmm3

	vpsrlq $26,%xmm0,%xmm10
	vpand %xmm15,%xmm0,%xmm0
	vpaddq %xmm10,%xmm1,%xmm1

	vpsrlq $26,%xmm3,%xmm13
	vpand %xmm15,%xmm3,%xmm3
	vpaddq %xmm13,%xmm4,%xmm4

	ja .Loop_avx

.Lskip_loop_avx:



	vpshufd $0x10,%xmm14,%xmm14
	addq $32,%rdx
	jnz .Long_tail_avx

	vpaddq %xmm2,%xmm7,%xmm7
	vpaddq %xmm0,%xmm5,%xmm5
	vpaddq %xmm1,%xmm6,%xmm6
	vpaddq %xmm3,%xmm8,%xmm8
	vpaddq %xmm4,%xmm9,%xmm9

.Long_tail_avx:
	vmovdqa %xmm2,32(%r11)
	vmovdqa %xmm0,0(%r11)
	vmovdqa %xmm1,16(%r11)
	vmovdqa %xmm3,48(%r11)
	vmovdqa %xmm4,64(%r11)







	vpmuludq %xmm7,%xmm14,%xmm12
	vpmuludq %xmm5,%xmm14,%xmm10
	vpshufd $0x10,-48(%rdi),%xmm2
	vpmuludq %xmm6,%xmm14,%xmm11
	vpmuludq %xmm8,%xmm14,%xmm13
	vpmuludq %xmm9,%xmm14,%xmm14

	vpmuludq %xmm8,%xmm2,%xmm0
	vpaddq %xmm0,%xmm14,%xmm14
	vpshufd $0x10,-32(%rdi),%xmm3
	vpmuludq %xmm7,%xmm2,%xmm1
	vpaddq %xmm1,%xmm13,%xmm13
	vpshufd $0x10,-16(%rdi),%xmm4
	vpmuludq %xmm6,%xmm2,%xmm0
	vpaddq %xmm0,%xmm12,%xmm12
	vpmuludq %xmm5,%xmm2,%xmm2
	vpaddq %xmm2,%xmm11,%xmm11
	vpmuludq %xmm9,%xmm3,%xmm3
	vpaddq %xmm3,%xmm10,%xmm10

	vpshufd $0x10,0(%rdi),%xmm2
	vpmuludq %xmm7,%xmm4,%xmm1
	vpaddq %xmm1,%xmm14,%xmm14
	vpmuludq %xmm6,%xmm4,%xmm0
	vpaddq %xmm0,%xmm13,%xmm13
	vpshufd $0x10,16(%rdi),%xmm3
	vpmuludq %xmm5,%xmm4,%xmm4
	vpaddq %xmm4,%xmm12,%xmm12
	vpmuludq %xmm9,%xmm2,%xmm1
	vpaddq %xmm1,%xmm11,%xmm11
	vpshufd $0x10,32(%rdi),%xmm4
	vpmuludq %xmm8,%xmm2,%xmm2
	vpaddq %xmm2,%xmm10,%xmm10

	vpmuludq %xmm6,%xmm3,%xmm0
	vpaddq %xmm0,%xmm14,%xmm14
	vpmuludq %xmm5,%xmm3,%xmm3
	vpaddq %xmm3,%xmm13,%xmm13
	vpshufd $0x10,48(%rdi),%xmm2
	vpmuludq %xmm9,%xmm4,%xmm1
	vpaddq %xmm1,%xmm12,%xmm12
	vpshufd $0x10,64(%rdi),%xmm3
	vpmuludq %xmm8,%xmm4,%xmm0
	vpaddq %xmm0,%xmm11,%xmm11
	vpmuludq %xmm7,%xmm4,%xmm4
	vpaddq %xmm4,%xmm10,%xmm10

	vpmuludq %xmm5,%xmm2,%xmm2
	vpaddq %xmm2,%xmm14,%xmm14
	vpmuludq %xmm9,%xmm3,%xmm1
	vpaddq %xmm1,%xmm13,%xmm13
	vpmuludq %xmm8,%xmm3,%xmm0
	vpaddq %xmm0,%xmm12,%xmm12
	vpmuludq %xmm7,%xmm3,%xmm1
	vpaddq %xmm1,%xmm11,%xmm11
	vpmuludq %xmm6,%xmm3,%xmm3
	vpaddq %xmm3,%xmm10,%xmm10

	jz .Lshort_tail_avx

	vmovdqu 0(%rsi),%xmm0
	vmovdqu 16(%rsi),%xmm1

	vpsrldq $6,%xmm0,%xmm2
	vpsrldq $6,%xmm1,%xmm3
	vpunpckhqdq %xmm1,%xmm0,%xmm4
	vpunpcklqdq %xmm1,%xmm0,%xmm0
	vpunpcklqdq %xmm3,%xmm2,%xmm3

	vpsrlq $40,%xmm4,%xmm4
	vpsrlq $26,%xmm0,%xmm1
	vpand %xmm15,%xmm0,%xmm0
	vpsrlq $4,%xmm3,%xmm2
	vpand %xmm15,%xmm1,%xmm1
	vpsrlq $30,%xmm3,%xmm3
	vpand %xmm15,%xmm2,%xmm2
	vpand %xmm15,%xmm3,%xmm3
	vpor 32(%rcx),%xmm4,%xmm4

	vpshufd $0x32,-64(%rdi),%xmm9
	vpaddq 0(%r11),%xmm0,%xmm0
	vpaddq 16(%r11),%xmm1,%xmm1
	vpaddq 32(%r11),%xmm2,%xmm2
	vpaddq 48(%r11),%xmm3,%xmm3
	vpaddq 64(%r11),%xmm4,%xmm4




	vpmuludq %xmm0,%xmm9,%xmm5
	vpaddq %xmm5,%xmm10,%xmm10
	vpmuludq %xmm1,%xmm9,%xmm6
	vpaddq %xmm6,%xmm11,%xmm11
	vpmuludq %xmm2,%xmm9,%xmm5
	vpaddq %xmm5,%xmm12,%xmm12
	vpshufd $0x32,-48(%rdi),%xmm7
	vpmuludq %xmm3,%xmm9,%xmm6
	vpaddq %xmm6,%xmm13,%xmm13
	vpmuludq %xmm4,%xmm9,%xmm9
	vpaddq %xmm9,%xmm14,%xmm14

	vpmuludq %xmm3,%xmm7,%xmm5
	vpaddq %xmm5,%xmm14,%xmm14
	vpshufd $0x32,-32(%rdi),%xmm8
	vpmuludq %xmm2,%xmm7,%xmm6
	vpaddq %xmm6,%xmm13,%xmm13
	vpshufd $0x32,-16(%rdi),%xmm9
	vpmuludq %xmm1,%xmm7,%xmm5
	vpaddq %xmm5,%xmm12,%xmm12
	vpmuludq %xmm0,%xmm7,%xmm7
	vpaddq %xmm7,%xmm11,%xmm11
	vpmuludq %xmm4,%xmm8,%xmm8
	vpaddq %xmm8,%xmm10,%xmm10

	vpshufd $0x32,0(%rdi),%xmm7
	vpmuludq %xmm2,%xmm9,%xmm6
	vpaddq %xmm6,%xmm14,%xmm14
	vpmuludq %xmm1,%xmm9,%xmm5
	vpaddq %xmm5,%xmm13,%xmm13
	vpshufd $0x32,16(%rdi),%xmm8
	vpmuludq %xmm0,%xmm9,%xmm9
	vpaddq %xmm9,%xmm12,%xmm12
	vpmuludq %xmm4,%xmm7,%xmm6
	vpaddq %xmm6,%xmm11,%xmm11
	vpshufd $0x32,32(%rdi),%xmm9
	vpmuludq %xmm3,%xmm7,%xmm7
	vpaddq %xmm7,%xmm10,%xmm10

	vpmuludq %xmm1,%xmm8,%xmm5
	vpaddq %xmm5,%xmm14,%xmm14
	vpmuludq %xmm0,%xmm8,%xmm8
	vpaddq %xmm8,%xmm13,%xmm13
	vpshufd $0x32,48(%rdi),%xmm7
	vpmuludq %xmm4,%xmm9,%xmm6
	vpaddq %xmm6,%xmm12,%xmm12
	vpshufd $0x32,64(%rdi),%xmm8
	vpmuludq %xmm3,%xmm9,%xmm5
	vpaddq %xmm5,%xmm11,%xmm11
	vpmuludq %xmm2,%xmm9,%xmm9
	vpaddq %xmm9,%xmm10,%xmm10

	vpmuludq %xmm0,%xmm7,%xmm7
	vpaddq %xmm7,%xmm14,%xmm14
	vpmuludq %xmm4,%xmm8,%xmm6
	vpaddq %xmm6,%xmm13,%xmm13
	vpmuludq %xmm3,%xmm8,%xmm5
	vpaddq %xmm5,%xmm12,%xmm12
	vpmuludq %xmm2,%xmm8,%xmm6
	vpaddq %xmm6,%xmm11,%xmm11
	vpmuludq %xmm1,%xmm8,%xmm8
	vpaddq %xmm8,%xmm10,%xmm10

.Lshort_tail_avx:



	vpsrldq $8,%xmm14,%xmm9
	vpsrldq $8,%xmm13,%xmm8
	vpsrldq $8,%xmm11,%xmm6
	vpsrldq $8,%xmm10,%xmm5
	vpsrldq $8,%xmm12,%xmm7
	vpaddq %xmm8,%xmm13,%xmm13
	vpaddq %xmm9,%xmm14,%xmm14
	vpaddq %xmm5,%xmm10,%xmm10
	vpaddq %xmm6,%xmm11,%xmm11
	vpaddq %xmm7,%xmm12,%xmm12




	vpsrlq $26,%xmm13,%xmm3
	vpand %xmm15,%xmm13,%xmm13
	vpaddq %xmm3,%xmm14,%xmm14

	vpsrlq $26,%xmm10,%xmm0
	vpand %xmm15,%xmm10,%xmm10
	vpaddq %xmm0,%xmm11,%xmm11

	vpsrlq $26,%xmm14,%xmm4
	vpand %xmm15,%xmm14,%xmm14

	vpsrlq $26,%xmm11,%xmm1
	vpand %xmm15,%xmm11,%xmm11
	vpaddq %xmm1,%xmm12,%xmm12

	vpaddq %xmm4,%xmm10,%xmm10
	vpsllq $2,%xmm4,%xmm4
	vpaddq %xmm4,%xmm10,%xmm10

	vpsrlq $26,%xmm12,%xmm2
	vpand %xmm15,%xmm12,%xmm12
	vpaddq %xmm2,%xmm13,%xmm13

	vpsrlq $26,%xmm10,%xmm0
	vpand %xmm15,%xmm10,%xmm10
	vpaddq %xmm0,%xmm11,%xmm11

	vpsrlq $26,%xmm13,%xmm3
	vpand %xmm15,%xmm13,%xmm13
	vpaddq %xmm3,%xmm14,%xmm14

	vmovd %xmm10,-112(%rdi)
	vmovd %xmm11,-108(%rdi)
	vmovd %xmm12,-104(%rdi)
	vmovd %xmm13,-100(%rdi)
	vmovd %xmm14,-96(%rdi)
	leaq 88(%r11),%rsp
.cfi_def_cfa %rsp,8
	vzeroupper
	.byte 0xf3,0xc3
.cfi_endproc
.size poly1305_blocks_avx,.-poly1305_blocks_avx

.type poly1305_emit_avx,@function
.align 32
poly1305_emit_avx:
.cfi_startproc
	cmpl $0,20(%rdi)
	je .Lemit

	movl 0(%rdi),%eax
	movl 4(%rdi),%ecx
	movl 8(%rdi),%r8d
	movl 12(%rdi),%r11d
	movl 16(%rdi),%r10d

	shlq $26,%rcx
	movq %r8,%r9
	shlq $52,%r8
	addq %rcx,%rax
	shrq $12,%r9
	addq %rax,%r8
	adcq $0,%r9

	shlq $14,%r11
	movq %r10,%rax
	shrq $24,%r10
	addq %r11,%r9
	shlq $40,%rax
	addq %rax,%r9
	adcq $0,%r10

	movq %r10,%rax
	movq %r10,%rcx
	andq $3,%r10
	shrq $2,%rax
	andq $-4,%rcx
	addq %rcx,%rax
	addq %rax,%r8
	adcq $0,%r9
	adcq $0,%r10

	movq %r8,%rax
	addq $5,%r8
	movq %r9,%rcx
	adcq $0,%r9
	adcq $0,%r10
	shrq $2,%r10
	cmovnzq %r8,%rax
	cmovnzq %r9,%rcx

	addq 0(%rdx),%rax
	adcq 8(%rdx),%rcx
	movq %rax,0(%rsi)
	movq %rcx,8(%rsi)

	.byte 0xf3,0xc3
.cfi_endproc
.size poly1305_emit_avx,.-poly1305_emit_avx
.type poly1305_blocks_avx2,@function
.align 32
poly1305_blocks_avx2:
.cfi_startproc
	movl 20(%rdi),%r8d
	cmpq $128,%rdx
	jae .Lblocks_avx2
	testl %r8d,%r8d
	jz .Lblocks

.Lblocks_avx2:
	andq $-16,%rdx
	jz .Lno_data_avx2

	vzeroupper

	testl %r8d,%r8d
	jz .Lbase2_64_avx2

	testq $63,%rdx
	jz .Leven_avx2

	pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-16
	pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-24
	pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
	pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
	pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
	pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
.Lblocks_avx2_body:

	movq %rdx,%r15

	movq 0(%rdi),%r8
	movq 8(%rdi),%r9
	movl 16(%rdi),%ebp

	movq 24(%rdi),%r11
	movq 32(%rdi),%r13


	movl %r8d,%r14d
	andq $-2147483648,%r8
	movq %r9,%r12
	movl %r9d,%ebx
	andq $-2147483648,%r9

	shrq $6,%r8
	shlq $52,%r12
	addq %r8,%r14
	shrq $12,%rbx
	shrq $18,%r9
	addq %r12,%r14
	adcq %r9,%rbx

	movq %rbp,%r8
	shlq $40,%r8
	shrq $24,%rbp
	addq %r8,%rbx
	adcq $0,%rbp

	movq $-4,%r9
	movq %rbp,%r8
	andq %rbp,%r9
	shrq $2,%r8
	andq $3,%rbp
	addq %r9,%r8
	addq %r8,%r14
	adcq $0,%rbx
	adcq $0,%rbp

	movq %r13,%r12
	movq %r13,%rax
	shrq $2,%r13
	addq %r12,%r13

.Lbase2_26_pre_avx2:
	addq 0(%rsi),%r14
	adcq 8(%rsi),%rbx
	leaq 16(%rsi),%rsi
	adcq %rcx,%rbp
	subq $16,%r15

	call __poly1305_block
	movq %r12,%rax

	testq $63,%r15
	jnz .Lbase2_26_pre_avx2

	testq %rcx,%rcx
	jz .Lstore_base2_64_avx2


	movq %r14,%rax
	movq %r14,%rdx
	shrq $52,%r14
	movq %rbx,%r11
	movq %rbx,%r12
	shrq $26,%rdx
	andq $0x3ffffff,%rax
	shlq $12,%r11
	andq $0x3ffffff,%rdx
	shrq $14,%rbx
	orq %r11,%r14
	shlq $24,%rbp
	andq $0x3ffffff,%r14
	shrq $40,%r12
	andq $0x3ffffff,%rbx
	orq %r12,%rbp

	testq %r15,%r15
	jz .Lstore_base2_26_avx2

	vmovd %eax,%xmm0
	vmovd %edx,%xmm1
	vmovd %r14d,%xmm2
	vmovd %ebx,%xmm3
	vmovd %ebp,%xmm4
	jmp .Lproceed_avx2

.align 32
.Lstore_base2_64_avx2:
	movq %r14,0(%rdi)
	movq %rbx,8(%rdi)
	movq %rbp,16(%rdi)
	jmp .Ldone_avx2

.align 16
.Lstore_base2_26_avx2:
	movl %eax,0(%rdi)
	movl %edx,4(%rdi)
	movl %r14d,8(%rdi)
	movl %ebx,12(%rdi)
	movl %ebp,16(%rdi)
.align 16
.Ldone_avx2:
	movq 0(%rsp),%r15
.cfi_restore %r15
	movq 8(%rsp),%r14
.cfi_restore %r14
	movq 16(%rsp),%r13
.cfi_restore %r13
	movq 24(%rsp),%r12
.cfi_restore %r12
	movq 32(%rsp),%rbp
.cfi_restore %rbp
	movq 40(%rsp),%rbx
.cfi_restore %rbx
	leaq 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
.Lno_data_avx2:
.Lblocks_avx2_epilogue:
	.byte 0xf3,0xc3
.cfi_endproc

.align 32
.Lbase2_64_avx2:
.cfi_startproc
	pushq %rbx
.cfi_adjust_cfa_offset 8
.cfi_offset %rbx,-16
	pushq %rbp
.cfi_adjust_cfa_offset 8
.cfi_offset %rbp,-24
	pushq %r12
.cfi_adjust_cfa_offset 8
.cfi_offset %r12,-32
	pushq %r13
.cfi_adjust_cfa_offset 8
.cfi_offset %r13,-40
	pushq %r14
.cfi_adjust_cfa_offset 8
.cfi_offset %r14,-48
	pushq %r15
.cfi_adjust_cfa_offset 8
.cfi_offset %r15,-56
.Lbase2_64_avx2_body:

	movq %rdx,%r15

	movq 24(%rdi),%r11
	movq 32(%rdi),%r13

	movq 0(%rdi),%r14
	movq 8(%rdi),%rbx
	movl 16(%rdi),%ebp

	movq %r13,%r12
	movq %r13,%rax
	shrq $2,%r13
	addq %r12,%r13

	testq $63,%rdx
	jz .Linit_avx2

.Lbase2_64_pre_avx2:
	addq 0(%rsi),%r14
	adcq 8(%rsi),%rbx
	leaq 16(%rsi),%rsi
	adcq %rcx,%rbp
	subq $16,%r15

	call __poly1305_block
	movq %r12,%rax

	testq $63,%r15
	jnz .Lbase2_64_pre_avx2

.Linit_avx2:

	movq %r14,%rax
	movq %r14,%rdx
	shrq $52,%r14
	movq %rbx,%r8
	movq %rbx,%r9
	shrq $26,%rdx
	andq $0x3ffffff,%rax
	shlq $12,%r8
	andq $0x3ffffff,%rdx
	shrq $14,%rbx
	orq %r8,%r14
	shlq $24,%rbp
	andq $0x3ffffff,%r14
	shrq $40,%r9
	andq $0x3ffffff,%rbx
	orq %r9,%rbp

	vmovd %eax,%xmm0
	vmovd %edx,%xmm1
	vmovd %r14d,%xmm2
	vmovd %ebx,%xmm3
	vmovd %ebp,%xmm4
	movl $1,20(%rdi)

	call __poly1305_init_avx

.Lproceed_avx2:
	movq %r15,%rdx
	movl OPENSSL_ia32cap_P+8(%rip),%r10d
	movl $3221291008,%r11d

	movq 0(%rsp),%r15
.cfi_restore %r15
	movq 8(%rsp),%r14
.cfi_restore %r14
	movq 16(%rsp),%r13
.cfi_restore %r13
	movq 24(%rsp),%r12
.cfi_restore %r12
	movq 32(%rsp),%rbp
.cfi_restore %rbp
	movq 40(%rsp),%rbx
.cfi_restore %rbx
	leaq 48(%rsp),%rax
	leaq 48(%rsp),%rsp
.cfi_adjust_cfa_offset -48
.Lbase2_64_avx2_epilogue:
	jmp .Ldo_avx2
.cfi_endproc

.align 32
.Leven_avx2:
.cfi_startproc
	movl OPENSSL_ia32cap_P+8(%rip),%r10d
	vmovd 0(%rdi),%xmm0
	vmovd 4(%rdi),%xmm1
	vmovd 8(%rdi),%xmm2
	vmovd 12(%rdi),%xmm3
	vmovd 16(%rdi),%xmm4

.Ldo_avx2:
	cmpq $512,%rdx
	jb .Lskip_avx512
	andl %r11d,%r10d
	testl $65536,%r10d
	jnz .Lblocks_avx512
.Lskip_avx512:
	leaq -8(%rsp),%r11
.cfi_def_cfa %r11,16
	subq $0x128,%rsp
	leaq .Lconst(%rip),%rcx
	leaq 48+64(%rdi),%rdi
	vmovdqa 96(%rcx),%ymm7


	vmovdqu -64(%rdi),%xmm9
	andq $-512,%rsp
	vmovdqu -48(%rdi),%xmm10
	vmovdqu -32(%rdi),%xmm6
	vmovdqu -16(%rdi),%xmm11
	vmovdqu 0(%rdi),%xmm12
	vmovdqu 16(%rdi),%xmm13
	leaq 144(%rsp),%rax
	vmovdqu 32(%rdi),%xmm14
	vpermd %ymm9,%ymm7,%ymm9
	vmovdqu 48(%rdi),%xmm15
	vpermd %ymm10,%ymm7,%ymm10
	vmovdqu 64(%rdi),%xmm5
	vpermd %ymm6,%ymm7,%ymm6
	vmovdqa %ymm9,0(%rsp)
	vpermd %ymm11,%ymm7,%ymm11
	vmovdqa %ymm10,32-144(%rax)
	vpermd %ymm12,%ymm7,%ymm12
	vmovdqa %ymm6,64-144(%rax)
	vpermd %ymm13,%ymm7,%ymm13
	vmovdqa %ymm11,96-144(%rax)
	vpermd %ymm14,%ymm7,%ymm14
	vmovdqa %ymm12,128-144(%rax)
	vpermd %ymm15,%ymm7,%ymm15
	vmovdqa %ymm13,160-144(%rax)
	vpermd %ymm5,%ymm7,%ymm5
	vmovdqa %ymm14,192-144(%rax)
	vmovdqa %ymm15,224-144(%rax)
	vmovdqa %ymm5,256-144(%rax)
	vmovdqa 64(%rcx),%ymm5



	vmovdqu 0(%rsi),%xmm7
	vmovdqu 16(%rsi),%xmm8
	vinserti128 $1,32(%rsi),%ymm7,%ymm7
	vinserti128 $1,48(%rsi),%ymm8,%ymm8
	leaq 64(%rsi),%rsi

	vpsrldq $6,%ymm7,%ymm9
	vpsrldq $6,%ymm8,%ymm10
	vpunpckhqdq %ymm8,%ymm7,%ymm6
	vpunpcklqdq %ymm10,%ymm9,%ymm9
	vpunpcklqdq %ymm8,%ymm7,%ymm7

	vpsrlq $30,%ymm9,%ymm10
	vpsrlq $4,%ymm9,%ymm9
	vpsrlq $26,%ymm7,%ymm8
	vpsrlq $40,%ymm6,%ymm6
	vpand %ymm5,%ymm9,%ymm9
	vpand %ymm5,%ymm7,%ymm7
	vpand %ymm5,%ymm8,%ymm8
	vpand %ymm5,%ymm10,%ymm10
	vpor 32(%rcx),%ymm6,%ymm6

	vpaddq %ymm2,%ymm9,%ymm2
	subq $64,%rdx
	jz .Ltail_avx2
	jmp .Loop_avx2

.align 32
.Loop_avx2:








	vpaddq %ymm0,%ymm7,%ymm0
	vmovdqa 0(%rsp),%ymm7
	vpaddq %ymm1,%ymm8,%ymm1
	vmovdqa 32(%rsp),%ymm8
	vpaddq %ymm3,%ymm10,%ymm3
	vmovdqa 96(%rsp),%ymm9
	vpaddq %ymm4,%ymm6,%ymm4
	vmovdqa 48(%rax),%ymm10
	vmovdqa 112(%rax),%ymm5
















	vpmuludq %ymm2,%ymm7,%ymm13
	vpmuludq %ymm2,%ymm8,%ymm14
	vpmuludq %ymm2,%ymm9,%ymm15
	vpmuludq %ymm2,%ymm10,%ymm11
	vpmuludq %ymm2,%ymm5,%ymm12

	vpmuludq %ymm0,%ymm8,%ymm6
	vpmuludq %ymm1,%ymm8,%ymm2
	vpaddq %ymm6,%ymm12,%ymm12
	vpaddq %ymm2,%ymm13,%ymm13
	vpmuludq %ymm3,%ymm8,%ymm6
	vpmuludq 64(%rsp),%ymm4,%ymm2
	vpaddq %ymm6,%ymm15,%ymm15
	vpaddq %ymm2,%ymm11,%ymm11
	vmovdqa -16(%rax),%ymm8

	vpmuludq %ymm0,%ymm7,%ymm6
	vpmuludq %ymm1,%ymm7,%ymm2
	vpaddq %ymm6,%ymm11,%ymm11
	vpaddq %ymm2,%ymm12,%ymm12
	vpmuludq %ymm3,%ymm7,%ymm6
	vpmuludq %ymm4,%ymm7,%ymm2
	vmovdqu 0(%rsi),%xmm7
	vpaddq %ymm6,%ymm14,%ymm14
	vpaddq %ymm2,%ymm15,%ymm15
	vinserti128 $1,32(%rsi),%ymm7,%ymm7

	vpmuludq %ymm3,%ymm8,%ymm6
	vpmuludq %ymm4,%ymm8,%ymm2
	vmovdqu 16(%rsi),%xmm8
	vpaddq %ymm6,%ymm11,%ymm11
	vpaddq %ymm2,%ymm12,%ymm12
	vmovdqa 16(%rax),%ymm2
	vpmuludq %ymm1,%ymm9,%ymm6
	vpmuludq %ymm0,%ymm9,%ymm9
	vpaddq %ymm6,%ymm14,%ymm14
	vpaddq %ymm9,%ymm13,%ymm13
	vinserti128 $1,48(%rsi),%ymm8,%ymm8
	leaq 64(%rsi),%rsi

	vpmuludq %ymm1,%ymm2,%ymm6
	vpmuludq %ymm0,%ymm2,%ymm2
	vpsrldq $6,%ymm7,%ymm9
	vpaddq %ymm6,%ymm15,%ymm15
	vpaddq %ymm2,%ymm14,%ymm14
	vpmuludq %ymm3,%ymm10,%ymm6
	vpmuludq %ymm4,%ymm10,%ymm2
	vpsrldq $6,%ymm8,%ymm10
	vpaddq %ymm6,%ymm12,%ymm12
	vpaddq %ymm2,%ymm13,%ymm13
	vpunpckhqdq %ymm8,%ymm7,%ymm6

	vpmuludq %ymm3,%ymm5,%ymm3
	vpmuludq %ymm4,%ymm5,%ymm4
	vpunpcklqdq %ymm8,%ymm7,%ymm7
	vpaddq %ymm3,%ymm13,%ymm2
	vpaddq %ymm4,%ymm14,%ymm3
	vpunpcklqdq %ymm10,%ymm9,%ymm10
	vpmuludq 80(%rax),%ymm0,%ymm4
	vpmuludq %ymm1,%ymm5,%ymm0
	vmovdqa 64(%rcx),%ymm5
	vpaddq %ymm4,%ymm15,%ymm4
	vpaddq %ymm0,%ymm11,%ymm0




	vpsrlq $26,%ymm3,%ymm14
	vpand %ymm5,%ymm3,%ymm3
	vpaddq %ymm14,%ymm4,%ymm4

	vpsrlq $26,%ymm0,%ymm11
	vpand %ymm5,%ymm0,%ymm0
	vpaddq %ymm11,%ymm12,%ymm1

	vpsrlq $26,%ymm4,%ymm15
	vpand %ymm5,%ymm4,%ymm4

	vpsrlq $4,%ymm10,%ymm9

	vpsrlq $26,%ymm1,%ymm12
	vpand %ymm5,%ymm1,%ymm1
	vpaddq %ymm12,%ymm2,%ymm2

	vpaddq %ymm15,%ymm0,%ymm0
	vpsllq $2,%ymm15,%ymm15
	vpaddq %ymm15,%ymm0,%ymm0

	vpand %ymm5,%ymm9,%ymm9
	vpsrlq $26,%ymm7,%ymm8

	vpsrlq $26,%ymm2,%ymm13
	vpand %ymm5,%ymm2,%ymm2
	vpaddq %ymm13,%ymm3,%ymm3

	vpaddq %ymm9,%ymm2,%ymm2
	vpsrlq $30,%ymm10,%ymm10

	vpsrlq $26,%ymm0,%ymm11
	vpand %ymm5,%ymm0,%ymm0
	vpaddq %ymm11,%ymm1,%ymm1

	vpsrlq $40,%ymm6,%ymm6

	vpsrlq $26,%ymm3,%ymm14
	vpand %ymm5,%ymm3,%ymm3
	vpaddq %ymm14,%ymm4,%ymm4

	vpand %ymm5,%ymm7,%ymm7
	vpand %ymm5,%ymm8,%ymm8
	vpand %ymm5,%ymm10,%ymm10
	vpor 32(%rcx),%ymm6,%ymm6

	subq $64,%rdx
	jnz .Loop_avx2

.byte 0x66,0x90
.Ltail_avx2:







	vpaddq %ymm0,%ymm7,%ymm0
	vmovdqu 4(%rsp),%ymm7
	vpaddq %ymm1,%ymm8,%ymm1
	vmovdqu 36(%rsp),%ymm8
	vpaddq %ymm3,%ymm10,%ymm3
	vmovdqu 100(%rsp),%ymm9
	vpaddq %ymm4,%ymm6,%ymm4
	vmovdqu 52(%rax),%ymm10
	vmovdqu 116(%rax),%ymm5

	vpmuludq %ymm2,%ymm7,%ymm13
	vpmuludq %ymm2,%ymm8,%ymm14
	vpmuludq %ymm2,%ymm9,%ymm15
	vpmuludq %ymm2,%ymm10,%ymm11
	vpmuludq %ymm2,%ymm5,%ymm12

	vpmuludq %ymm0,%ymm8,%ymm6
	vpmuludq %ymm1,%ymm8,%ymm2
	vpaddq %ymm6,%ymm12,%ymm12
	vpaddq %ymm2,%ymm13,%ymm13
	vpmuludq %ymm3,%ymm8,%ymm6
	vpmuludq 68(%rsp),%ymm4,%ymm2
	vpaddq %ymm6,%ymm15,%ymm15
	vpaddq %ymm2,%ymm11,%ymm11

	vpmuludq %ymm0,%ymm7,%ymm6
	vpmuludq %ymm1,%ymm7,%ymm2
	vpaddq %ymm6,%ymm11,%ymm11
	vmovdqu -12(%rax),%ymm8
	vpaddq %ymm2,%ymm12,%ymm12
	vpmuludq %ymm3,%ymm7,%ymm6
	vpmuludq %ymm4,%ymm7,%ymm2
	vpaddq %ymm6,%ymm14,%ymm14
	vpaddq %ymm2,%ymm15,%ymm15

	vpmuludq %ymm3,%ymm8,%ymm6
	vpmuludq %ymm4,%ymm8,%ymm2
	vpaddq %ymm6,%ymm11,%ymm11
	vpaddq %ymm2,%ymm12,%ymm12
	vmovdqu 20(%rax),%ymm2
	vpmuludq %ymm1,%ymm9,%ymm6
	vpmuludq %ymm0,%ymm9,%ymm9
	vpaddq %ymm6,%ymm14,%ymm14
	vpaddq %ymm9,%ymm13,%ymm13

	vpmuludq %ymm1,%ymm2,%ymm6
	vpmuludq %ymm0,%ymm2,%ymm2
	vpaddq %ymm6,%ymm15,%ymm15
	vpaddq %ymm2,%ymm14,%ymm14
	vpmuludq %ymm3,%ymm10,%ymm6
	vpmuludq %ymm4,%ymm10,%ymm2
	vpaddq %ymm6,%ymm12,%ymm12
	vpaddq %ymm2,%ymm13,%ymm13

	vpmuludq %ymm3,%ymm5,%ymm3
	vpmuludq %ymm4,%ymm5,%ymm4
	vpaddq %ymm3,%ymm13,%ymm2
	vpaddq %ymm4,%ymm14,%ymm3
	vpmuludq 84(%rax),%ymm0,%ymm4
	vpmuludq %ymm1,%ymm5,%ymm0
	vmovdqa 64(%rcx),%ymm5
	vpaddq %ymm4,%ymm15,%ymm4
	vpaddq %ymm0,%ymm11,%ymm0




	vpsrldq $8,%ymm12,%ymm8
	vpsrldq $8,%ymm2,%ymm9
	vpsrldq $8,%ymm3,%ymm10
	vpsrldq $8,%ymm4,%ymm6
	vpsrldq $8,%ymm0,%ymm7
	vpaddq %ymm8,%ymm12,%ymm12
	vpaddq %ymm9,%ymm2,%ymm2
	vpaddq %ymm10,%ymm3,%ymm3
	vpaddq %ymm6,%ymm4,%ymm4
	vpaddq %ymm7,%ymm0,%ymm0

	vpermq $0x2,%ymm3,%ymm10
	vpermq $0x2,%ymm4,%ymm6
	vpermq $0x2,%ymm0,%ymm7
	vpermq $0x2,%ymm12,%ymm8
	vpermq $0x2,%ymm2,%ymm9
	vpaddq %ymm10,%ymm3,%ymm3
	vpaddq %ymm6,%ymm4,%ymm4
	vpaddq %ymm7,%ymm0,%ymm0
	vpaddq %ymm8,%ymm12,%ymm12
	vpaddq %ymm9,%ymm2,%ymm2




	vpsrlq $26,%ymm3,%ymm14
	vpand %ymm5,%ymm3,%ymm3
	vpaddq %ymm14,%ymm4,%ymm4

	vpsrlq $26,%ymm0,%ymm11
	vpand %ymm5,%ymm0,%ymm0
	vpaddq %ymm11,%ymm12,%ymm1

	vpsrlq $26,%ymm4,%ymm15
	vpand %ymm5,%ymm4,%ymm4

	vpsrlq $26,%ymm1,%ymm12
	vpand %ymm5,%ymm1,%ymm1
	vpaddq %ymm12,%ymm2,%ymm2

	vpaddq %ymm15,%ymm0,%ymm0
	vpsllq $2,%ymm15,%ymm15
	vpaddq %ymm15,%ymm0,%ymm0

	vpsrlq $26,%ymm2,%ymm13
	vpand %ymm5,%ymm2,%ymm2
	vpaddq %ymm13,%ymm3,%ymm3

	vpsrlq $26,%ymm0,%ymm11
	vpand %ymm5,%ymm0,%ymm0
	vpaddq %ymm11,%ymm1,%ymm1

	vpsrlq $26,%ymm3,%ymm14
	vpand %ymm5,%ymm3,%ymm3
	vpaddq %ymm14,%ymm4,%ymm4

	vmovd %xmm0,-112(%rdi)
	vmovd %xmm1,-108(%rdi)
	vmovd %xmm2,-104(%rdi)
	vmovd %xmm3,-100(%rdi)
	vmovd %xmm4,-96(%rdi)
	leaq 8(%r11),%rsp
.cfi_def_cfa %rsp,8
	vzeroupper
	.byte 0xf3,0xc3
.cfi_endproc
.size poly1305_blocks_avx2,.-poly1305_blocks_avx2
.type poly1305_blocks_avx512,@function
.align 32
poly1305_blocks_avx512:
.cfi_startproc
.Lblocks_avx512:
	movl $15,%eax
	kmovw %eax,%k2
	leaq -8(%rsp),%r11
.cfi_def_cfa %r11,16
	subq $0x128,%rsp
	leaq .Lconst(%rip),%rcx
	leaq 48+64(%rdi),%rdi
	vmovdqa 96(%rcx),%ymm9


	vmovdqu -64(%rdi),%xmm11
	andq $-512,%rsp
	vmovdqu -48(%rdi),%xmm12
	movq $0x20,%rax
	vmovdqu -32(%rdi),%xmm7
	vmovdqu -16(%rdi),%xmm13
	vmovdqu 0(%rdi),%xmm8
	vmovdqu 16(%rdi),%xmm14
	vmovdqu 32(%rdi),%xmm10
	vmovdqu 48(%rdi),%xmm15
	vmovdqu 64(%rdi),%xmm6
	vpermd %zmm11,%zmm9,%zmm16
	vpbroadcastq 64(%rcx),%zmm5
	vpermd %zmm12,%zmm9,%zmm17
	vpermd %zmm7,%zmm9,%zmm21
	vpermd %zmm13,%zmm9,%zmm18
	vmovdqa64 %zmm16,0(%rsp){%k2}
	vpsrlq $32,%zmm16,%zmm7
	vpermd %zmm8,%zmm9,%zmm22
	vmovdqu64 %zmm17,0(%rsp,%rax,1){%k2}
	vpsrlq $32,%zmm17,%zmm8
	vpermd %zmm14,%zmm9,%zmm19
	vmovdqa64 %zmm21,64(%rsp){%k2}
	vpermd %zmm10,%zmm9,%zmm23
	vpermd %zmm15,%zmm9,%zmm20
	vmovdqu64 %zmm18,64(%rsp,%rax,1){%k2}
	vpermd %zmm6,%zmm9,%zmm24
	vmovdqa64 %zmm22,128(%rsp){%k2}
	vmovdqu64 %zmm19,128(%rsp,%rax,1){%k2}
	vmovdqa64 %zmm23,192(%rsp){%k2}
	vmovdqu64 %zmm20,192(%rsp,%rax,1){%k2}
	vmovdqa64 %zmm24,256(%rsp){%k2}










	vpmuludq %zmm7,%zmm16,%zmm11
	vpmuludq %zmm7,%zmm17,%zmm12
	vpmuludq %zmm7,%zmm18,%zmm13
	vpmuludq %zmm7,%zmm19,%zmm14
	vpmuludq %zmm7,%zmm20,%zmm15
	vpsrlq $32,%zmm18,%zmm9

	vpmuludq %zmm8,%zmm24,%zmm25
	vpmuludq %zmm8,%zmm16,%zmm26
	vpmuludq %zmm8,%zmm17,%zmm27
	vpmuludq %zmm8,%zmm18,%zmm28
	vpmuludq %zmm8,%zmm19,%zmm29
	vpsrlq $32,%zmm19,%zmm10
	vpaddq %zmm25,%zmm11,%zmm11
	vpaddq %zmm26,%zmm12,%zmm12
	vpaddq %zmm27,%zmm13,%zmm13
	vpaddq %zmm28,%zmm14,%zmm14
	vpaddq %zmm29,%zmm15,%zmm15

	vpmuludq %zmm9,%zmm23,%zmm25
	vpmuludq %zmm9,%zmm24,%zmm26
	vpmuludq %zmm9,%zmm17,%zmm28
	vpmuludq %zmm9,%zmm18,%zmm29
	vpmuludq %zmm9,%zmm16,%zmm27
	vpsrlq $32,%zmm20,%zmm6
	vpaddq %zmm25,%zmm11,%zmm11
	vpaddq %zmm26,%zmm12,%zmm12
	vpaddq %zmm28,%zmm14,%zmm14
	vpaddq %zmm29,%zmm15,%zmm15
	vpaddq %zmm27,%zmm13,%zmm13

	vpmuludq %zmm10,%zmm22,%zmm25
	vpmuludq %zmm10,%zmm16,%zmm28
	vpmuludq %zmm10,%zmm17,%zmm29
	vpmuludq %zmm10,%zmm23,%zmm26
	vpmuludq %zmm10,%zmm24,%zmm27
	vpaddq %zmm25,%zmm11,%zmm11
	vpaddq %zmm28,%zmm14,%zmm14
	vpaddq %zmm29,%zmm15,%zmm15
	vpaddq %zmm26,%zmm12,%zmm12
	vpaddq %zmm27,%zmm13,%zmm13

	vpmuludq %zmm6,%zmm24,%zmm28
	vpmuludq %zmm6,%zmm16,%zmm29
	vpmuludq %zmm6,%zmm21,%zmm25
	vpmuludq %zmm6,%zmm22,%zmm26
	vpmuludq %zmm6,%zmm23,%zmm27
	vpaddq %zmm28,%zmm14,%zmm14
	vpaddq %zmm29,%zmm15,%zmm15
	vpaddq %zmm25,%zmm11,%zmm11
	vpaddq %zmm26,%zmm12,%zmm12
	vpaddq %zmm27,%zmm13,%zmm13



	vmovdqu64 0(%rsi),%zmm10
	vmovdqu64 64(%rsi),%zmm6
	leaq 128(%rsi),%rsi




	vpsrlq $26,%zmm14,%zmm28
	vpandq %zmm5,%zmm14,%zmm14
	vpaddq %zmm28,%zmm15,%zmm15

	vpsrlq $26,%zmm11,%zmm25
	vpandq %zmm5,%zmm11,%zmm11
	vpaddq %zmm25,%zmm12,%zmm12

	vpsrlq $26,%zmm15,%zmm29
	vpandq %zmm5,%zmm15,%zmm15

	vpsrlq $26,%zmm12,%zmm26
	vpandq %zmm5,%zmm12,%zmm12
	vpaddq %zmm26,%zmm13,%zmm13

	vpaddq %zmm29,%zmm11,%zmm11
	vpsllq $2,%zmm29,%zmm29
	vpaddq %zmm29,%zmm11,%zmm11

	vpsrlq $26,%zmm13,%zmm27
	vpandq %zmm5,%zmm13,%zmm13
	vpaddq %zmm27,%zmm14,%zmm14

	vpsrlq $26,%zmm11,%zmm25
	vpandq %zmm5,%zmm11,%zmm11
	vpaddq %zmm25,%zmm12,%zmm12

	vpsrlq $26,%zmm14,%zmm28
	vpandq %zmm5,%zmm14,%zmm14
	vpaddq %zmm28,%zmm15,%zmm15





	vpunpcklqdq %zmm6,%zmm10,%zmm7
	vpunpckhqdq %zmm6,%zmm10,%zmm6






	vmovdqa32 128(%rcx),%zmm25
	movl $0x7777,%eax
	kmovw %eax,%k1

	vpermd %zmm16,%zmm25,%zmm16
	vpermd %zmm17,%zmm25,%zmm17
	vpermd %zmm18,%zmm25,%zmm18
	vpermd %zmm19,%zmm25,%zmm19
	vpermd %zmm20,%zmm25,%zmm20

	vpermd %zmm11,%zmm25,%zmm16{%k1}
	vpermd %zmm12,%zmm25,%zmm17{%k1}
	vpermd %zmm13,%zmm25,%zmm18{%k1}
	vpermd %zmm14,%zmm25,%zmm19{%k1}
	vpermd %zmm15,%zmm25,%zmm20{%k1}

	vpslld $2,%zmm17,%zmm21
	vpslld $2,%zmm18,%zmm22
	vpslld $2,%zmm19,%zmm23
	vpslld $2,%zmm20,%zmm24
	vpaddd %zmm17,%zmm21,%zmm21
	vpaddd %zmm18,%zmm22,%zmm22
	vpaddd %zmm19,%zmm23,%zmm23
	vpaddd %zmm20,%zmm24,%zmm24

	vpbroadcastq 32(%rcx),%zmm30

	vpsrlq $52,%zmm7,%zmm9
	vpsllq $12,%zmm6,%zmm10
	vporq %zmm10,%zmm9,%zmm9
	vpsrlq $26,%zmm7,%zmm8
	vpsrlq $14,%zmm6,%zmm10
	vpsrlq $40,%zmm6,%zmm6
	vpandq %zmm5,%zmm9,%zmm9
	vpandq %zmm5,%zmm7,%zmm7




	vpaddq %zmm2,%zmm9,%zmm2
	subq $192,%rdx
	jbe .Ltail_avx512
	jmp .Loop_avx512

.align 32
.Loop_avx512:




























	vpmuludq %zmm2,%zmm17,%zmm14
	vpaddq %zmm0,%zmm7,%zmm0
	vpmuludq %zmm2,%zmm18,%zmm15
	vpandq %zmm5,%zmm8,%zmm8
	vpmuludq %zmm2,%zmm23,%zmm11
	vpandq %zmm5,%zmm10,%zmm10
	vpmuludq %zmm2,%zmm24,%zmm12
	vporq %zmm30,%zmm6,%zmm6
	vpmuludq %zmm2,%zmm16,%zmm13
	vpaddq %zmm1,%zmm8,%zmm1
	vpaddq %zmm3,%zmm10,%zmm3
	vpaddq %zmm4,%zmm6,%zmm4

	vmovdqu64 0(%rsi),%zmm10
	vmovdqu64 64(%rsi),%zmm6
	leaq 128(%rsi),%rsi
	vpmuludq %zmm0,%zmm19,%zmm28
	vpmuludq %zmm0,%zmm20,%zmm29
	vpmuludq %zmm0,%zmm16,%zmm25
	vpmuludq %zmm0,%zmm17,%zmm26
	vpaddq %zmm28,%zmm14,%zmm14
	vpaddq %zmm29,%zmm15,%zmm15
	vpaddq %zmm25,%zmm11,%zmm11
	vpaddq %zmm26,%zmm12,%zmm12

	vpmuludq %zmm1,%zmm18,%zmm28
	vpmuludq %zmm1,%zmm19,%zmm29
	vpmuludq %zmm1,%zmm24,%zmm25
	vpmuludq %zmm0,%zmm18,%zmm27
	vpaddq %zmm28,%zmm14,%zmm14
	vpaddq %zmm29,%zmm15,%zmm15
	vpaddq %zmm25,%zmm11,%zmm11
	vpaddq %zmm27,%zmm13,%zmm13

	vpunpcklqdq %zmm6,%zmm10,%zmm7
	vpunpckhqdq %zmm6,%zmm10,%zmm6

	vpmuludq %zmm3,%zmm16,%zmm28
	vpmuludq %zmm3,%zmm17,%zmm29
	vpmuludq %zmm1,%zmm16,%zmm26
	vpmuludq %zmm1,%zmm17,%zmm27
	vpaddq %zmm28,%zmm14,%zmm14
	vpaddq %zmm29,%zmm15,%zmm15
	vpaddq %zmm26,%zmm12,%zmm12
	vpaddq %zmm27,%zmm13,%zmm13

	vpmuludq %zmm4,%zmm24,%zmm28
	vpmuludq %zmm4,%zmm16,%zmm29
	vpmuludq %zmm3,%zmm22,%zmm25
	vpmuludq %zmm3,%zmm23,%zmm26
	vpaddq %zmm28,%zmm14,%zmm14
	vpmuludq %zmm3,%zmm24,%zmm27
	vpaddq %zmm29,%zmm15,%zmm15
	vpaddq %zmm25,%zmm11,%zmm11
	vpaddq %zmm26,%zmm12,%zmm12
	vpaddq %zmm27,%zmm13,%zmm13

	vpmuludq %zmm4,%zmm21,%zmm25
	vpmuludq %zmm4,%zmm22,%zmm26
	vpmuludq %zmm4,%zmm23,%zmm27
	vpaddq %zmm25,%zmm11,%zmm0
	vpaddq %zmm26,%zmm12,%zmm1
	vpaddq %zmm27,%zmm13,%zmm2




	vpsrlq $52,%zmm7,%zmm9
	vpsllq $12,%zmm6,%zmm10

	vpsrlq $26,%zmm14,%zmm3
	vpandq %zmm5,%zmm14,%zmm14
	vpaddq %zmm3,%zmm15,%zmm4

	vporq %zmm10,%zmm9,%zmm9

	vpsrlq $26,%zmm0,%zmm11
	vpandq %zmm5,%zmm0,%zmm0
	vpaddq %zmm11,%zmm1,%zmm1

	vpandq %zmm5,%zmm9,%zmm9

	vpsrlq $26,%zmm4,%zmm15
	vpandq %zmm5,%zmm4,%zmm4

	vpsrlq $26,%zmm1,%zmm12
	vpandq %zmm5,%zmm1,%zmm1
	vpaddq %zmm12,%zmm2,%zmm2

	vpaddq %zmm15,%zmm0,%zmm0
	vpsllq $2,%zmm15,%zmm15
	vpaddq %zmm15,%zmm0,%zmm0

	vpaddq %zmm9,%zmm2,%zmm2
	vpsrlq $26,%zmm7,%zmm8

	vpsrlq $26,%zmm2,%zmm13
	vpandq %zmm5,%zmm2,%zmm2
	vpaddq %zmm13,%zmm14,%zmm3

	vpsrlq $14,%zmm6,%zmm10

	vpsrlq $26,%zmm0,%zmm11
	vpandq %zmm5,%zmm0,%zmm0
	vpaddq %zmm11,%zmm1,%zmm1

	vpsrlq $40,%zmm6,%zmm6

	vpsrlq $26,%zmm3,%zmm14
	vpandq %zmm5,%zmm3,%zmm3
	vpaddq %zmm14,%zmm4,%zmm4

	vpandq %zmm5,%zmm7,%zmm7




	subq $128,%rdx
	ja .Loop_avx512

.Ltail_avx512:





	vpsrlq $32,%zmm16,%zmm16
	vpsrlq $32,%zmm17,%zmm17
	vpsrlq $32,%zmm18,%zmm18
	vpsrlq $32,%zmm23,%zmm23
	vpsrlq $32,%zmm24,%zmm24
	vpsrlq $32,%zmm19,%zmm19
	vpsrlq $32,%zmm20,%zmm20
	vpsrlq $32,%zmm21,%zmm21
	vpsrlq $32,%zmm22,%zmm22



	leaq (%rsi,%rdx,1),%rsi


	vpaddq %zmm0,%zmm7,%zmm0

	vpmuludq %zmm2,%zmm17,%zmm14
	vpmuludq %zmm2,%zmm18,%zmm15
	vpmuludq %zmm2,%zmm23,%zmm11
	vpandq %zmm5,%zmm8,%zmm8
	vpmuludq %zmm2,%zmm24,%zmm12
	vpandq %zmm5,%zmm10,%zmm10
	vpmuludq %zmm2,%zmm16,%zmm13
	vporq %zmm30,%zmm6,%zmm6
	vpaddq %zmm1,%zmm8,%zmm1
	vpaddq %zmm3,%zmm10,%zmm3
	vpaddq %zmm4,%zmm6,%zmm4

	vmovdqu 0(%rsi),%xmm7
	vpmuludq %zmm0,%zmm19,%zmm28
	vpmuludq %zmm0,%zmm20,%zmm29
	vpmuludq %zmm0,%zmm16,%zmm25
	vpmuludq %zmm0,%zmm17,%zmm26
	vpaddq %zmm28,%zmm14,%zmm14
	vpaddq %zmm29,%zmm15,%zmm15
	vpaddq %zmm25,%zmm11,%zmm11
	vpaddq %zmm26,%zmm12,%zmm12

	vmovdqu 16(%rsi),%xmm8
	vpmuludq %zmm1,%zmm18,%zmm28
	vpmuludq %zmm1,%zmm19,%zmm29
	vpmuludq %zmm1,%zmm24,%zmm25
	vpmuludq %zmm0,%zmm18,%zmm27
	vpaddq %zmm28,%zmm14,%zmm14
	vpaddq %zmm29,%zmm15,%zmm15
	vpaddq %zmm25,%zmm11,%zmm11
	vpaddq %zmm27,%zmm13,%zmm13

	vinserti128 $1,32(%rsi),%ymm7,%ymm7
	vpmuludq %zmm3,%zmm16,%zmm28
	vpmuludq %zmm3,%zmm17,%zmm29
	vpmuludq %zmm1,%zmm16,%zmm26
	vpmuludq %zmm1,%zmm17,%zmm27
	vpaddq %zmm28,%zmm14,%zmm14
	vpaddq %zmm29,%zmm15,%zmm15
	vpaddq %zmm26,%zmm12,%zmm12
	vpaddq %zmm27,%zmm13,%zmm13

	vinserti128 $1,48(%rsi),%ymm8,%ymm8
	vpmuludq %zmm4,%zmm24,%zmm28
	vpmuludq %zmm4,%zmm16,%zmm29
	vpmuludq %zmm3,%zmm22,%zmm25
	vpmuludq %zmm3,%zmm23,%zmm26
	vpmuludq %zmm3,%zmm24,%zmm27
	vpaddq %zmm28,%zmm14,%zmm3
	vpaddq %zmm29,%zmm15,%zmm15
	vpaddq %zmm25,%zmm11,%zmm11
	vpaddq %zmm26,%zmm12,%zmm12
	vpaddq %zmm27,%zmm13,%zmm13

	vpmuludq %zmm4,%zmm21,%zmm25
	vpmuludq %zmm4,%zmm22,%zmm26
	vpmuludq %zmm4,%zmm23,%zmm27
	vpaddq %zmm25,%zmm11,%zmm0
	vpaddq %zmm26,%zmm12,%zmm1
	vpaddq %zmm27,%zmm13,%zmm2




	movl $1,%eax
	vpermq $0xb1,%zmm3,%zmm14
	vpermq $0xb1,%zmm15,%zmm4
	vpermq $0xb1,%zmm0,%zmm11
	vpermq $0xb1,%zmm1,%zmm12
	vpermq $0xb1,%zmm2,%zmm13
	vpaddq %zmm14,%zmm3,%zmm3
	vpaddq %zmm15,%zmm4,%zmm4
	vpaddq %zmm11,%zmm0,%zmm0
	vpaddq %zmm12,%zmm1,%zmm1
	vpaddq %zmm13,%zmm2,%zmm2

	kmovw %eax,%k3
	vpermq $0x2,%zmm3,%zmm14
	vpermq $0x2,%zmm4,%zmm15
	vpermq $0x2,%zmm0,%zmm11
	vpermq $0x2,%zmm1,%zmm12
	vpermq $0x2,%zmm2,%zmm13
	vpaddq %zmm14,%zmm3,%zmm3
	vpaddq %zmm15,%zmm4,%zmm4
	vpaddq %zmm11,%zmm0,%zmm0
	vpaddq %zmm12,%zmm1,%zmm1
	vpaddq %zmm13,%zmm2,%zmm2

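/*
 * Editorial note (added): the vpermq/vpaddq pairs above and the
 * vextracti64x4/vpaddq sequence below horizontally fold the eight 64-bit
 * lanes of each accumulator into lane 0; the {%k3}{z} masking (with
 * %k3 = 1) keeps only that lane and zeroes the rest, after which the
 * tail is finished with 256-bit AVX2 code.
 */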
	vextracti64x4 $0x1,%zmm3,%ymm14
	vextracti64x4 $0x1,%zmm4,%ymm15
	vextracti64x4 $0x1,%zmm0,%ymm11
	vextracti64x4 $0x1,%zmm1,%ymm12
	vextracti64x4 $0x1,%zmm2,%ymm13
	vpaddq %zmm14,%zmm3,%zmm3{%k3}{z}
	vpaddq %zmm15,%zmm4,%zmm4{%k3}{z}
	vpaddq %zmm11,%zmm0,%zmm0{%k3}{z}
	vpaddq %zmm12,%zmm1,%zmm1{%k3}{z}
	vpaddq %zmm13,%zmm2,%zmm2{%k3}{z}



	vpsrlq $26,%ymm3,%ymm14
	vpand %ymm5,%ymm3,%ymm3
	vpsrldq $6,%ymm7,%ymm9
	vpsrldq $6,%ymm8,%ymm10
	vpunpckhqdq %ymm8,%ymm7,%ymm6
	vpaddq %ymm14,%ymm4,%ymm4

	vpsrlq $26,%ymm0,%ymm11
	vpand %ymm5,%ymm0,%ymm0
	vpunpcklqdq %ymm10,%ymm9,%ymm9
	vpunpcklqdq %ymm8,%ymm7,%ymm7
	vpaddq %ymm11,%ymm1,%ymm1

	vpsrlq $26,%ymm4,%ymm15
	vpand %ymm5,%ymm4,%ymm4

	vpsrlq $26,%ymm1,%ymm12
	vpand %ymm5,%ymm1,%ymm1
	vpsrlq $30,%ymm9,%ymm10
	vpsrlq $4,%ymm9,%ymm9
	vpaddq %ymm12,%ymm2,%ymm2

	vpaddq %ymm15,%ymm0,%ymm0
	vpsllq $2,%ymm15,%ymm15
	vpsrlq $26,%ymm7,%ymm8
	vpsrlq $40,%ymm6,%ymm6
	vpaddq %ymm15,%ymm0,%ymm0

	vpsrlq $26,%ymm2,%ymm13
	vpand %ymm5,%ymm2,%ymm2
	vpand %ymm5,%ymm9,%ymm9
	vpand %ymm5,%ymm7,%ymm7
	vpaddq %ymm13,%ymm3,%ymm3

	vpsrlq $26,%ymm0,%ymm11
	vpand %ymm5,%ymm0,%ymm0
	vpaddq %ymm2,%ymm9,%ymm2
	vpand %ymm5,%ymm8,%ymm8
	vpaddq %ymm11,%ymm1,%ymm1

	vpsrlq $26,%ymm3,%ymm14
	vpand %ymm5,%ymm3,%ymm3
	vpand %ymm5,%ymm10,%ymm10
	vpor 32(%rcx),%ymm6,%ymm6
	vpaddq %ymm14,%ymm4,%ymm4

	leaq 144(%rsp),%rax
	addq $64,%rdx
	jnz .Ltail_avx2

	vpsubq %ymm9,%ymm2,%ymm2
	vmovd %xmm0,-112(%rdi)
	vmovd %xmm1,-108(%rdi)
	vmovd %xmm2,-104(%rdi)
	vmovd %xmm3,-100(%rdi)
	vmovd %xmm4,-96(%rdi)
	vzeroall
	leaq 8(%r11),%rsp
.cfi_def_cfa %rsp,8
	.byte 0xf3,0xc3
.cfi_endproc
.size poly1305_blocks_avx512,.-poly1305_blocks_avx512
.type poly1305_init_base2_44,@function
.align 32
poly1305_init_base2_44:
.cfi_startproc
	xorq %rax,%rax
	movq %rax,0(%rdi)
	movq %rax,8(%rdi)
	movq %rax,16(%rdi)

.Linit_base2_44:
	leaq poly1305_blocks_vpmadd52(%rip),%r10
	leaq poly1305_emit_base2_44(%rip),%r11

	movq $0x0ffffffc0fffffff,%rax
	movq $0x0ffffffc0ffffffc,%rcx
	andq 0(%rsi),%rax
	movq $0x00000fffffffffff,%r8
	andq 8(%rsi),%rcx
	movq $0x00000fffffffffff,%r9
	andq %rax,%r8
	shrdq $44,%rcx,%rax
	movq %r8,40(%rdi)
	andq %r9,%rax
	shrq $24,%rcx
	movq %rax,48(%rdi)
	leaq (%rax,%rax,4),%rax
	movq %rcx,56(%rdi)
	shlq $2,%rax
	leaq (%rcx,%rcx,4),%rcx
	shlq $2,%rcx
	movq %rax,24(%rdi)
	movq %rcx,32(%rdi)
	movq $-1,64(%rdi)
	movq %r10,0(%rdx)
	movq %r11,8(%rdx)
	movl $1,%eax
	.byte 0xf3,0xc3
.cfi_endproc
.size poly1305_init_base2_44,.-poly1305_init_base2_44
.type poly1305_blocks_vpmadd52,@function
.align 32
poly1305_blocks_vpmadd52:
.cfi_startproc
	shrq $4,%rdx
	jz .Lno_data_vpmadd52

	shlq $40,%rcx
	movq 64(%rdi),%r8






	movq $3,%rax
	movq $1,%r10
	cmpq $4,%rdx
	cmovaeq %r10,%rax
	testq %r8,%r8
	cmovnsq %r10,%rax

	andq %rdx,%rax
	jz .Lblocks_vpmadd52_4x

	subq %rax,%rdx
	movl $7,%r10d
	movl $1,%r11d
	kmovw %r10d,%k7
	leaq .L2_44_inp_permd(%rip),%r10
	kmovw %r11d,%k1

	vmovq %rcx,%xmm21
	vmovdqa64 0(%r10),%ymm19
	vmovdqa64 32(%r10),%ymm20
	vpermq $0xcf,%ymm21,%ymm21
	vmovdqa64 64(%r10),%ymm22

	vmovdqu64 0(%rdi),%ymm16{%k7}{z}
	vmovdqu64 40(%rdi),%ymm3{%k7}{z}
	vmovdqu64 32(%rdi),%ymm4{%k7}{z}
	vmovdqu64 24(%rdi),%ymm5{%k7}{z}

	vmovdqa64 96(%r10),%ymm23
	vmovdqa64 128(%r10),%ymm24

	jmp .Loop_vpmadd52

.align 32
.Loop_vpmadd52:
	vmovdqu32 0(%rsi),%xmm18
	leaq 16(%rsi),%rsi

	vpermd %ymm18,%ymm19,%ymm18
	vpsrlvq %ymm20,%ymm18,%ymm18
	vpandq %ymm22,%ymm18,%ymm18
	vporq %ymm21,%ymm18,%ymm18

	vpaddq %ymm18,%ymm16,%ymm16

	vpermq $0,%ymm16,%ymm0{%k7}{z}
	vpermq $85,%ymm16,%ymm1{%k7}{z}
	vpermq $170,%ymm16,%ymm2{%k7}{z}

	vpxord %ymm16,%ymm16,%ymm16
	vpxord %ymm17,%ymm17,%ymm17

	vpmadd52luq %ymm3,%ymm0,%ymm16
	vpmadd52huq %ymm3,%ymm0,%ymm17

	vpmadd52luq %ymm4,%ymm1,%ymm16
	vpmadd52huq %ymm4,%ymm1,%ymm17

	vpmadd52luq %ymm5,%ymm2,%ymm16
	vpmadd52huq %ymm5,%ymm2,%ymm17

	vpsrlvq %ymm23,%ymm16,%ymm18
	vpsllvq %ymm24,%ymm17,%ymm17
	vpandq %ymm22,%ymm16,%ymm16

	vpaddq %ymm18,%ymm17,%ymm17

	vpermq $147,%ymm17,%ymm17

	vpaddq %ymm17,%ymm16,%ymm16

	vpsrlvq %ymm23,%ymm16,%ymm18
	vpandq %ymm22,%ymm16,%ymm16

	vpermq $147,%ymm18,%ymm18

	vpaddq %ymm18,%ymm16,%ymm16

	vpermq $147,%ymm16,%ymm18{%k1}{z}

	vpaddq %ymm18,%ymm16,%ymm16
	vpsllq $2,%ymm18,%ymm18

	vpaddq %ymm18,%ymm16,%ymm16

	decq %rax
	jnz .Loop_vpmadd52

	vmovdqu64 %ymm16,0(%rdi){%k7}

	testq %rdx,%rdx
	jnz .Lblocks_vpmadd52_4x

.Lno_data_vpmadd52:
	.byte 0xf3,0xc3
.cfi_endproc
.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
.type poly1305_blocks_vpmadd52_4x,@function
.align 32
poly1305_blocks_vpmadd52_4x:
.cfi_startproc
	shrq $4,%rdx
	jz .Lno_data_vpmadd52_4x

	shlq $40,%rcx
	movq 64(%rdi),%r8

.Lblocks_vpmadd52_4x:
	vpbroadcastq %rcx,%ymm31

	vmovdqa64 .Lx_mask44(%rip),%ymm28
	movl $5,%eax
	vmovdqa64 .Lx_mask42(%rip),%ymm29
	kmovw %eax,%k1

	testq %r8,%r8
	js .Linit_vpmadd52

	vmovq 0(%rdi),%xmm0
	vmovq 8(%rdi),%xmm1
	vmovq 16(%rdi),%xmm2

	testq $3,%rdx
	jnz .Lblocks_vpmadd52_2x_do

.Lblocks_vpmadd52_4x_do:
	vpbroadcastq 64(%rdi),%ymm3
	vpbroadcastq 96(%rdi),%ymm4
	vpbroadcastq 128(%rdi),%ymm5
	vpbroadcastq 160(%rdi),%ymm16

.Lblocks_vpmadd52_4x_key_loaded:
	vpsllq $2,%ymm5,%ymm17
	vpaddq %ymm5,%ymm17,%ymm17
	vpsllq $2,%ymm17,%ymm17

	testq $7,%rdx
	jz .Lblocks_vpmadd52_8x

	vmovdqu64 0(%rsi),%ymm26
	vmovdqu64 32(%rsi),%ymm27
	leaq 64(%rsi),%rsi

	vpunpcklqdq %ymm27,%ymm26,%ymm25
	vpunpckhqdq %ymm27,%ymm26,%ymm27



	vpsrlq $24,%ymm27,%ymm26
	vporq %ymm31,%ymm26,%ymm26
	vpaddq %ymm26,%ymm2,%ymm2
	vpandq %ymm28,%ymm25,%ymm24
	vpsrlq $44,%ymm25,%ymm25
	vpsllq $20,%ymm27,%ymm27
	vporq %ymm27,%ymm25,%ymm25
	vpandq %ymm28,%ymm25,%ymm25

	subq $4,%rdx
	jz .Ltail_vpmadd52_4x
	jmp .Loop_vpmadd52_4x
	ud2

.align 32
.Linit_vpmadd52:
	vmovq 24(%rdi),%xmm16
	vmovq 56(%rdi),%xmm2
	vmovq 32(%rdi),%xmm17
	vmovq 40(%rdi),%xmm3
	vmovq 48(%rdi),%xmm4

	vmovdqa %ymm3,%ymm0
	vmovdqa %ymm4,%ymm1
	vmovdqa %ymm2,%ymm5

	movl $2,%eax

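/*
 * Editorial note (added): the loop below appears to square the key in
 * base 2^44 (running limbs in %ymm0..%ymm2 against the key limbs in
 * %ymm3..%ymm5, with pre-scaled 20*r limbs in %ymm16/%ymm17) using the
 * IFMA vpmadd52 instructions; two passes build the r^2/r^4 powers that
 * the 2x/4x block loops consume.
 */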
.Lmul_init_vpmadd52:
	vpxorq	%ymm18,%ymm18,%ymm18
	vpmadd52luq	%ymm2,%ymm16,%ymm18
	vpxorq	%ymm19,%ymm19,%ymm19
	vpmadd52huq	%ymm2,%ymm16,%ymm19
	vpxorq	%ymm20,%ymm20,%ymm20
	vpmadd52luq	%ymm2,%ymm17,%ymm20
	vpxorq	%ymm21,%ymm21,%ymm21
	vpmadd52huq	%ymm2,%ymm17,%ymm21
	vpxorq	%ymm22,%ymm22,%ymm22
	vpmadd52luq	%ymm2,%ymm3,%ymm22
	vpxorq	%ymm23,%ymm23,%ymm23
	vpmadd52huq	%ymm2,%ymm3,%ymm23

	vpmadd52luq	%ymm0,%ymm3,%ymm18
	vpmadd52huq	%ymm0,%ymm3,%ymm19
	vpmadd52luq	%ymm0,%ymm4,%ymm20
	vpmadd52huq	%ymm0,%ymm4,%ymm21
	vpmadd52luq	%ymm0,%ymm5,%ymm22
	vpmadd52huq	%ymm0,%ymm5,%ymm23

	vpmadd52luq	%ymm1,%ymm17,%ymm18
	vpmadd52huq	%ymm1,%ymm17,%ymm19
	vpmadd52luq	%ymm1,%ymm3,%ymm20
	vpmadd52huq	%ymm1,%ymm3,%ymm21
	vpmadd52luq	%ymm1,%ymm4,%ymm22
	vpmadd52huq	%ymm1,%ymm4,%ymm23

	vpsrlq	$44,%ymm18,%ymm30
	vpsllq	$8,%ymm19,%ymm19
	vpandq	%ymm28,%ymm18,%ymm0
	vpaddq	%ymm30,%ymm19,%ymm19

	vpaddq	%ymm19,%ymm20,%ymm20

	vpsrlq	$44,%ymm20,%ymm30
	vpsllq	$8,%ymm21,%ymm21
	vpandq	%ymm28,%ymm20,%ymm1
	vpaddq	%ymm30,%ymm21,%ymm21

	vpaddq	%ymm21,%ymm22,%ymm22

	vpsrlq	$42,%ymm22,%ymm30
	vpsllq	$10,%ymm23,%ymm23
	vpandq	%ymm29,%ymm22,%ymm2
	vpaddq	%ymm30,%ymm23,%ymm23

	vpaddq	%ymm23,%ymm0,%ymm0
	vpsllq	$2,%ymm23,%ymm23

	vpaddq	%ymm23,%ymm0,%ymm0

	vpsrlq	$44,%ymm0,%ymm30
	vpandq	%ymm28,%ymm0,%ymm0

	vpaddq	%ymm30,%ymm1,%ymm1

	decl	%eax
	jz	.Ldone_init_vpmadd52

	vpunpcklqdq	%ymm4,%ymm1,%ymm4
	vpbroadcastq	%xmm1,%xmm1
	vpunpcklqdq	%ymm5,%ymm2,%ymm5
	vpbroadcastq	%xmm2,%xmm2
	vpunpcklqdq	%ymm3,%ymm0,%ymm3
	vpbroadcastq	%xmm0,%xmm0

	vpsllq	$2,%ymm4,%ymm16
	vpsllq	$2,%ymm5,%ymm17
	vpaddq	%ymm4,%ymm16,%ymm16
	vpaddq	%ymm5,%ymm17,%ymm17
	vpsllq	$2,%ymm16,%ymm16
	vpsllq	$2,%ymm17,%ymm17

	jmp	.Lmul_init_vpmadd52
	ud2

.align	32
.Ldone_init_vpmadd52:
	vinserti128	$1,%xmm4,%ymm1,%ymm4
	vinserti128	$1,%xmm5,%ymm2,%ymm5
	vinserti128	$1,%xmm3,%ymm0,%ymm3

	vpermq	$216,%ymm4,%ymm4
	vpermq	$216,%ymm5,%ymm5
	vpermq	$216,%ymm3,%ymm3

	vpsllq	$2,%ymm4,%ymm16
	vpaddq	%ymm4,%ymm16,%ymm16
	vpsllq	$2,%ymm16,%ymm16

	vmovq	0(%rdi),%xmm0
	vmovq	8(%rdi),%xmm1
	vmovq	16(%rdi),%xmm2

	testq	$3,%rdx
	jnz	.Ldone_init_vpmadd52_2x

	vmovdqu64	%ymm3,64(%rdi)
	vpbroadcastq	%xmm3,%ymm3
	vmovdqu64	%ymm4,96(%rdi)
	vpbroadcastq	%xmm4,%ymm4
	vmovdqu64	%ymm5,128(%rdi)
	vpbroadcastq	%xmm5,%ymm5
	vmovdqu64	%ymm16,160(%rdi)
	vpbroadcastq	%xmm16,%ymm16

	jmp	.Lblocks_vpmadd52_4x_key_loaded
	ud2

.align	32
.Ldone_init_vpmadd52_2x:
	vmovdqu64	%ymm3,64(%rdi)
	vpsrldq	$8,%ymm3,%ymm3
	vmovdqu64	%ymm4,96(%rdi)
	vpsrldq	$8,%ymm4,%ymm4
	vmovdqu64	%ymm5,128(%rdi)
	vpsrldq	$8,%ymm5,%ymm5
	vmovdqu64	%ymm16,160(%rdi)
	vpsrldq	$8,%ymm16,%ymm16
	jmp	.Lblocks_vpmadd52_2x_key_loaded
	ud2

.align	32
.Lblocks_vpmadd52_2x_do:
	vmovdqu64	128+8(%rdi),%ymm5{%k1}{z}
	vmovdqu64	160+8(%rdi),%ymm16{%k1}{z}
	vmovdqu64	64+8(%rdi),%ymm3{%k1}{z}
	vmovdqu64	96+8(%rdi),%ymm4{%k1}{z}

.Lblocks_vpmadd52_2x_key_loaded:
	vmovdqu64	0(%rsi),%ymm26
	vpxorq	%ymm27,%ymm27,%ymm27
	leaq	32(%rsi),%rsi

	vpunpcklqdq	%ymm27,%ymm26,%ymm25
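/*
 * Two-block entry point: 32 bytes (two blocks) were loaded into %ymm26
 * with %ymm27 zeroed, so the unpack pair here leaves the low qword of
 * each block in the even qword positions of %ymm25 and the high qword
 * in the even qword positions of %ymm27 (odd positions zero); the
 * 44/44/42-bit split that follows is then identical to the four-block
 * path (cf. the sketch further up).
 */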
	vpunpckhqdq	%ymm27,%ymm26,%ymm27

	vpsrlq	$24,%ymm27,%ymm26
	vporq	%ymm31,%ymm26,%ymm26
	vpaddq	%ymm26,%ymm2,%ymm2
	vpandq	%ymm28,%ymm25,%ymm24
	vpsrlq	$44,%ymm25,%ymm25
	vpsllq	$20,%ymm27,%ymm27
	vporq	%ymm27,%ymm25,%ymm25
	vpandq	%ymm28,%ymm25,%ymm25

	jmp	.Ltail_vpmadd52_2x
	ud2

.align	32
.Loop_vpmadd52_4x:

	vpaddq	%ymm24,%ymm0,%ymm0
	vpaddq	%ymm25,%ymm1,%ymm1

	vpxorq	%ymm18,%ymm18,%ymm18
	vpmadd52luq	%ymm2,%ymm16,%ymm18
	vpxorq	%ymm19,%ymm19,%ymm19
	vpmadd52huq	%ymm2,%ymm16,%ymm19
	vpxorq	%ymm20,%ymm20,%ymm20
	vpmadd52luq	%ymm2,%ymm17,%ymm20
	vpxorq	%ymm21,%ymm21,%ymm21
	vpmadd52huq	%ymm2,%ymm17,%ymm21
	vpxorq	%ymm22,%ymm22,%ymm22
	vpmadd52luq	%ymm2,%ymm3,%ymm22
	vpxorq	%ymm23,%ymm23,%ymm23
	vpmadd52huq	%ymm2,%ymm3,%ymm23

	vmovdqu64	0(%rsi),%ymm26
	vmovdqu64	32(%rsi),%ymm27
	leaq	64(%rsi),%rsi
	vpmadd52luq	%ymm0,%ymm3,%ymm18
	vpmadd52huq	%ymm0,%ymm3,%ymm19
	vpmadd52luq	%ymm0,%ymm4,%ymm20
	vpmadd52huq	%ymm0,%ymm4,%ymm21
	vpmadd52luq	%ymm0,%ymm5,%ymm22
	vpmadd52huq	%ymm0,%ymm5,%ymm23

	vpunpcklqdq	%ymm27,%ymm26,%ymm25
	vpunpckhqdq	%ymm27,%ymm26,%ymm27
	vpmadd52luq	%ymm1,%ymm17,%ymm18
	vpmadd52huq	%ymm1,%ymm17,%ymm19
	vpmadd52luq	%ymm1,%ymm3,%ymm20
	vpmadd52huq	%ymm1,%ymm3,%ymm21
	vpmadd52luq	%ymm1,%ymm4,%ymm22
	vpmadd52huq	%ymm1,%ymm4,%ymm23

	vpsrlq	$44,%ymm18,%ymm30
	vpsllq	$8,%ymm19,%ymm19
	vpandq	%ymm28,%ymm18,%ymm0
	vpaddq	%ymm30,%ymm19,%ymm19

	vpsrlq	$24,%ymm27,%ymm26
	vporq	%ymm31,%ymm26,%ymm26
	vpaddq	%ymm19,%ymm20,%ymm20

	vpsrlq	$44,%ymm20,%ymm30
	vpsllq	$8,%ymm21,%ymm21
	vpandq	%ymm28,%ymm20,%ymm1
	vpaddq	%ymm30,%ymm21,%ymm21

	vpandq	%ymm28,%ymm25,%ymm24
	vpsrlq	$44,%ymm25,%ymm25
	vpsllq	$20,%ymm27,%ymm27
	vpaddq	%ymm21,%ymm22,%ymm22

	vpsrlq	$42,%ymm22,%ymm30
	vpsllq	$10,%ymm23,%ymm23
	vpandq	%ymm29,%ymm22,%ymm2
	vpaddq	%ymm30,%ymm23,%ymm23

	vpaddq	%ymm26,%ymm2,%ymm2
	vpaddq	%ymm23,%ymm0,%ymm0
	vpsllq	$2,%ymm23,%ymm23

	vpaddq	%ymm23,%ymm0,%ymm0
	vporq	%ymm27,%ymm25,%ymm25
	vpandq	%ymm28,%ymm25,%ymm25

	vpsrlq	$44,%ymm0,%ymm30
	vpandq	%ymm28,%ymm0,%ymm0

	vpaddq	%ymm30,%ymm1,%ymm1

	subq	$4,%rdx
	jnz	.Loop_vpmadd52_4x

.Ltail_vpmadd52_4x:
	vmovdqu64	128(%rdi),%ymm5
	vmovdqu64	160(%rdi),%ymm16
	vmovdqu64	64(%rdi),%ymm3
	vmovdqu64	96(%rdi),%ymm4

.Ltail_vpmadd52_2x:
	vpsllq	$2,%ymm5,%ymm17
	vpaddq	%ymm5,%ymm17,%ymm17
	vpsllq	$2,%ymm17,%ymm17

	vpaddq	%ymm24,%ymm0,%ymm0
	vpaddq	%ymm25,%ymm1,%ymm1

	vpxorq	%ymm18,%ymm18,%ymm18
	vpmadd52luq	%ymm2,%ymm16,%ymm18
	vpxorq	%ymm19,%ymm19,%ymm19
	vpmadd52huq	%ymm2,%ymm16,%ymm19
	vpxorq	%ymm20,%ymm20,%ymm20
	vpmadd52luq	%ymm2,%ymm17,%ymm20
	vpxorq	%ymm21,%ymm21,%ymm21
	vpmadd52huq	%ymm2,%ymm17,%ymm21
	vpxorq	%ymm22,%ymm22,%ymm22
	vpmadd52luq	%ymm2,%ymm3,%ymm22
	vpxorq	%ymm23,%ymm23,%ymm23
	vpmadd52huq	%ymm2,%ymm3,%ymm23

	vpmadd52luq	%ymm0,%ymm3,%ymm18
	vpmadd52huq	%ymm0,%ymm3,%ymm19
	vpmadd52luq	%ymm0,%ymm4,%ymm20
	vpmadd52huq	%ymm0,%ymm4,%ymm21
	vpmadd52luq	%ymm0,%ymm5,%ymm22
	vpmadd52huq	%ymm0,%ymm5,%ymm23

	vpmadd52luq	%ymm1,%ymm17,%ymm18
	vpmadd52huq	%ymm1,%ymm17,%ymm19
	vpmadd52luq	%ymm1,%ymm3,%ymm20
	vpmadd52huq	%ymm1,%ymm3,%ymm21
	vpmadd52luq	%ymm1,%ymm4,%ymm22
	vpmadd52huq	%ymm1,%ymm4,%ymm23

	movl	$1,%eax
	kmovw	%eax,%k1
	vpsrldq	$8,%ymm18,%ymm24
	vpsrldq	$8,%ymm19,%ymm0
	vpsrldq	$8,%ymm20,%ymm25
	vpsrldq	$8,%ymm21,%ymm1
	vpaddq	%ymm24,%ymm18,%ymm18
	vpaddq	%ymm0,%ymm19,%ymm19
	vpsrldq	$8,%ymm22,%ymm26
	vpsrldq	$8,%ymm23,%ymm2
	vpaddq	%ymm25,%ymm20,%ymm20
	vpaddq	%ymm1,%ymm21,%ymm21
	vpermq	$0x2,%ymm18,%ymm24
	vpermq	$0x2,%ymm19,%ymm0
	vpaddq	%ymm26,%ymm22,%ymm22
	vpaddq	%ymm2,%ymm23,%ymm23

	vpermq	$0x2,%ymm20,%ymm25
	vpermq	$0x2,%ymm21,%ymm1
	vpaddq	%ymm24,%ymm18,%ymm18{%k1}{z}
	vpaddq	%ymm0,%ymm19,%ymm19{%k1}{z}
	vpermq	$0x2,%ymm22,%ymm26
	vpermq	$0x2,%ymm23,%ymm2
	vpaddq	%ymm25,%ymm20,%ymm20{%k1}{z}
	vpaddq	%ymm1,%ymm21,%ymm21{%k1}{z}
	vpaddq	%ymm26,%ymm22,%ymm22{%k1}{z}
	vpaddq	%ymm2,%ymm23,%ymm23{%k1}{z}

	vpsrlq	$44,%ymm18,%ymm30
	vpsllq	$8,%ymm19,%ymm19
	vpandq	%ymm28,%ymm18,%ymm0
	vpaddq	%ymm30,%ymm19,%ymm19

	vpaddq	%ymm19,%ymm20,%ymm20

	vpsrlq	$44,%ymm20,%ymm30
	vpsllq	$8,%ymm21,%ymm21
	vpandq	%ymm28,%ymm20,%ymm1
	vpaddq	%ymm30,%ymm21,%ymm21

	vpaddq	%ymm21,%ymm22,%ymm22

	vpsrlq	$42,%ymm22,%ymm30
	vpsllq	$10,%ymm23,%ymm23
	vpandq	%ymm29,%ymm22,%ymm2
	vpaddq	%ymm30,%ymm23,%ymm23

	vpaddq	%ymm23,%ymm0,%ymm0
	vpsllq	$2,%ymm23,%ymm23

	vpaddq	%ymm23,%ymm0,%ymm0

	vpsrlq	$44,%ymm0,%ymm30
	vpandq	%ymm28,%ymm0,%ymm0

	vpaddq	%ymm30,%ymm1,%ymm1

	subq	$2,%rdx
	ja	.Lblocks_vpmadd52_4x_do

	vmovq	%xmm0,0(%rdi)
	vmovq	%xmm1,8(%rdi)
	vmovq	%xmm2,16(%rdi)
	vzeroall

.Lno_data_vpmadd52_4x:
	.byte	0xf3,0xc3
.cfi_endproc
.size	poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
.type	poly1305_blocks_vpmadd52_8x,@function
.align	32
poly1305_blocks_vpmadd52_8x:
.cfi_startproc
	shrq	$4,%rdx
	jz	.Lno_data_vpmadd52_8x

	shlq	$40,%rcx
	movq	64(%rdi),%r8

	vmovdqa64	.Lx_mask44(%rip),%ymm28
	vmovdqa64	.Lx_mask42(%rip),%ymm29

	testq	%r8,%r8
	js	.Linit_vpmadd52

	vmovq	0(%rdi),%xmm0
	vmovq	8(%rdi),%xmm1
	vmovq	16(%rdi),%xmm2

.Lblocks_vpmadd52_8x:

	vmovdqu64	128(%rdi),%ymm5
	vmovdqu64	160(%rdi),%ymm16
	vmovdqu64	64(%rdi),%ymm3
	vmovdqu64	96(%rdi),%ymm4

	vpsllq	$2,%ymm5,%ymm17
	vpaddq	%ymm5,%ymm17,%ymm17
	vpsllq	$2,%ymm17,%ymm17

	vpbroadcastq	%xmm5,%ymm8
	vpbroadcastq	%xmm3,%ymm6
	vpbroadcastq	%xmm4,%ymm7

	vpxorq	%ymm18,%ymm18,%ymm18
	vpmadd52luq	%ymm8,%ymm16,%ymm18
	vpxorq	%ymm19,%ymm19,%ymm19
	vpmadd52huq	%ymm8,%ymm16,%ymm19
	vpxorq	%ymm20,%ymm20,%ymm20
	vpmadd52luq	%ymm8,%ymm17,%ymm20
	vpxorq	%ymm21,%ymm21,%ymm21
	vpmadd52huq	%ymm8,%ymm17,%ymm21
	vpxorq	%ymm22,%ymm22,%ymm22
	vpmadd52luq	%ymm8,%ymm3,%ymm22
	vpxorq	%ymm23,%ymm23,%ymm23
	vpmadd52huq	%ymm8,%ymm3,%ymm23

	vpmadd52luq	%ymm6,%ymm3,%ymm18
	vpmadd52huq	%ymm6,%ymm3,%ymm19
	vpmadd52luq	%ymm6,%ymm4,%ymm20
	vpmadd52huq	%ymm6,%ymm4,%ymm21
	vpmadd52luq	%ymm6,%ymm5,%ymm22
	vpmadd52huq	%ymm6,%ymm5,%ymm23

	vpmadd52luq	%ymm7,%ymm17,%ymm18
	vpmadd52huq	%ymm7,%ymm17,%ymm19
	vpmadd52luq	%ymm7,%ymm3,%ymm20
	vpmadd52huq	%ymm7,%ymm3,%ymm21
	vpmadd52luq	%ymm7,%ymm4,%ymm22
	vpmadd52huq	%ymm7,%ymm4,%ymm23

	vpsrlq	$44,%ymm18,%ymm30
	vpsllq	$8,%ymm19,%ymm19
	vpandq	%ymm28,%ymm18,%ymm6
	vpaddq	%ymm30,%ymm19,%ymm19

	vpaddq	%ymm19,%ymm20,%ymm20

	vpsrlq	$44,%ymm20,%ymm30
	vpsllq	$8,%ymm21,%ymm21
	vpandq	%ymm28,%ymm20,%ymm7
	vpaddq	%ymm30,%ymm21,%ymm21

	vpaddq	%ymm21,%ymm22,%ymm22

	vpsrlq	$42,%ymm22,%ymm30
	vpsllq	$10,%ymm23,%ymm23
	vpandq	%ymm29,%ymm22,%ymm8
	vpaddq	%ymm30,%ymm23,%ymm23

	vpaddq	%ymm23,%ymm6,%ymm6
	vpsllq	$2,%ymm23,%ymm23

	vpaddq	%ymm23,%ymm6,%ymm6

	vpsrlq	$44,%ymm6,%ymm30
	vpandq	%ymm28,%ymm6,%ymm6

	vpaddq	%ymm30,%ymm7,%ymm7

	vpunpcklqdq	%ymm5,%ymm8,%ymm26
	vpunpckhqdq	%ymm5,%ymm8,%ymm5
	vpunpcklqdq	%ymm3,%ymm6,%ymm24
	vpunpckhqdq	%ymm3,%ymm6,%ymm3
	vpunpcklqdq	%ymm4,%ymm7,%ymm25
	vpunpckhqdq	%ymm4,%ymm7,%ymm4
	vshufi64x2	$0x44,%zmm5,%zmm26,%zmm8
	vshufi64x2	$0x44,%zmm3,%zmm24,%zmm6
	vshufi64x2	$0x44,%zmm4,%zmm25,%zmm7

	vmovdqu64	0(%rsi),%zmm26
	vmovdqu64	64(%rsi),%zmm27
	leaq	128(%rsi),%rsi

	vpsllq	$2,%zmm8,%zmm10
	vpsllq	$2,%zmm7,%zmm9
	vpaddq	%zmm8,%zmm10,%zmm10
	vpaddq	%zmm7,%zmm9,%zmm9
	vpsllq	$2,%zmm10,%zmm10
	vpsllq	$2,%zmm9,%zmm9

	vpbroadcastq	%rcx,%zmm31
	vpbroadcastq	%xmm28,%zmm28
	vpbroadcastq	%xmm29,%zmm29

	vpbroadcastq	%xmm9,%zmm16
	vpbroadcastq	%xmm10,%zmm17
	vpbroadcastq	%xmm6,%zmm3
	vpbroadcastq	%xmm7,%zmm4
	vpbroadcastq	%xmm8,%zmm5

	vpunpcklqdq	%zmm27,%zmm26,%zmm25
	vpunpckhqdq	%zmm27,%zmm26,%zmm27

	vpsrlq	$24,%zmm27,%zmm26
	vporq	%zmm31,%zmm26,%zmm26
	vpaddq	%zmm26,%zmm2,%zmm2
	vpandq	%zmm28,%zmm25,%zmm24
	vpsrlq	$44,%zmm25,%zmm25
	vpsllq	$20,%zmm27,%zmm27
	vporq	%zmm27,%zmm25,%zmm25
	vpandq	%zmm28,%zmm25,%zmm25

	subq	$8,%rdx
	jz	.Ltail_vpmadd52_8x
	jmp	.Loop_vpmadd52_8x

.align	32
.Loop_vpmadd52_8x:

	vpaddq	%zmm24,%zmm0,%zmm0
	vpaddq	%zmm25,%zmm1,%zmm1

	vpxorq	%zmm18,%zmm18,%zmm18
	vpmadd52luq	%zmm2,%zmm16,%zmm18
	vpxorq	%zmm19,%zmm19,%zmm19
	vpmadd52huq	%zmm2,%zmm16,%zmm19
	vpxorq	%zmm20,%zmm20,%zmm20
	vpmadd52luq	%zmm2,%zmm17,%zmm20
	vpxorq	%zmm21,%zmm21,%zmm21
	vpmadd52huq	%zmm2,%zmm17,%zmm21
	vpxorq	%zmm22,%zmm22,%zmm22
	vpmadd52luq	%zmm2,%zmm3,%zmm22
	vpxorq	%zmm23,%zmm23,%zmm23
	vpmadd52huq	%zmm2,%zmm3,%zmm23

	vmovdqu64	0(%rsi),%zmm26
	vmovdqu64	64(%rsi),%zmm27
	leaq	128(%rsi),%rsi
	vpmadd52luq	%zmm0,%zmm3,%zmm18
	vpmadd52huq	%zmm0,%zmm3,%zmm19
	vpmadd52luq	%zmm0,%zmm4,%zmm20
	vpmadd52huq	%zmm0,%zmm4,%zmm21
	vpmadd52luq	%zmm0,%zmm5,%zmm22
	vpmadd52huq	%zmm0,%zmm5,%zmm23

	vpunpcklqdq	%zmm27,%zmm26,%zmm25
	vpunpckhqdq	%zmm27,%zmm26,%zmm27
	vpmadd52luq	%zmm1,%zmm17,%zmm18
	vpmadd52huq	%zmm1,%zmm17,%zmm19
	vpmadd52luq	%zmm1,%zmm3,%zmm20
	vpmadd52huq	%zmm1,%zmm3,%zmm21
	vpmadd52luq	%zmm1,%zmm4,%zmm22
	vpmadd52huq	%zmm1,%zmm4,%zmm23

	vpsrlq	$44,%zmm18,%zmm30
	vpsllq	$8,%zmm19,%zmm19
	vpandq	%zmm28,%zmm18,%zmm0
	vpaddq	%zmm30,%zmm19,%zmm19

	vpsrlq	$24,%zmm27,%zmm26
	vporq	%zmm31,%zmm26,%zmm26
	vpaddq	%zmm19,%zmm20,%zmm20

	vpsrlq	$44,%zmm20,%zmm30
	vpsllq	$8,%zmm21,%zmm21
	vpandq	%zmm28,%zmm20,%zmm1
	vpaddq	%zmm30,%zmm21,%zmm21

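/*
 * Partial ("lazy") reduction, interleaved with splitting the next batch
 * of blocks: each 52-bit low/high accumulator pair is recombined into a
 * 44- (or 42-) bit limb plus carry, and the carry out of the top limb
 * re-enters at the bottom multiplied by 5 (computed as c + 4*c), since
 * 2^130 == 5 (mod 2^130 - 5).  A scalar C model of one pass over the
 * six accumulators (illustrative only; names are ad hoc):
 *
 *	#include <stdint.h>
 *
 *	static void reduce_base2_44(uint64_t h[3],
 *	    uint64_t d0lo, uint64_t d0hi,
 *	    uint64_t d1lo, uint64_t d1hi,
 *	    uint64_t d2lo, uint64_t d2hi)
 *	{
 *		uint64_t c;
 *
 *		c = (d0lo >> 44) + (d0hi << 8);		// 52-44 = 8
 *		h[0] = d0lo & 0xfffffffffff;
 *		d1lo += c;
 *		c = (d1lo >> 44) + (d1hi << 8);
 *		h[1] = d1lo & 0xfffffffffff;
 *		d2lo += c;
 *		c = (d2lo >> 42) + (d2hi << 10);	// 52-42 = 10
 *		h[2] = d2lo & 0x3ffffffffff;
 *		h[0] += c;				// c * 5 ==
 *		h[0] += c << 2;				//   c + 4*c
 *		h[1] += h[0] >> 44;
 *		h[0] &= 0xfffffffffff;
 *	}
 */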
	vpandq	%zmm28,%zmm25,%zmm24
	vpsrlq	$44,%zmm25,%zmm25
	vpsllq	$20,%zmm27,%zmm27
	vpaddq	%zmm21,%zmm22,%zmm22

	vpsrlq	$42,%zmm22,%zmm30
	vpsllq	$10,%zmm23,%zmm23
	vpandq	%zmm29,%zmm22,%zmm2
	vpaddq	%zmm30,%zmm23,%zmm23

	vpaddq	%zmm26,%zmm2,%zmm2
	vpaddq	%zmm23,%zmm0,%zmm0
	vpsllq	$2,%zmm23,%zmm23

	vpaddq	%zmm23,%zmm0,%zmm0
	vporq	%zmm27,%zmm25,%zmm25
	vpandq	%zmm28,%zmm25,%zmm25

	vpsrlq	$44,%zmm0,%zmm30
	vpandq	%zmm28,%zmm0,%zmm0

	vpaddq	%zmm30,%zmm1,%zmm1

	subq	$8,%rdx
	jnz	.Loop_vpmadd52_8x

.Ltail_vpmadd52_8x:

	vpaddq	%zmm24,%zmm0,%zmm0
	vpaddq	%zmm25,%zmm1,%zmm1

	vpxorq	%zmm18,%zmm18,%zmm18
	vpmadd52luq	%zmm2,%zmm9,%zmm18
	vpxorq	%zmm19,%zmm19,%zmm19
	vpmadd52huq	%zmm2,%zmm9,%zmm19
	vpxorq	%zmm20,%zmm20,%zmm20
	vpmadd52luq	%zmm2,%zmm10,%zmm20
	vpxorq	%zmm21,%zmm21,%zmm21
	vpmadd52huq	%zmm2,%zmm10,%zmm21
	vpxorq	%zmm22,%zmm22,%zmm22
	vpmadd52luq	%zmm2,%zmm6,%zmm22
	vpxorq	%zmm23,%zmm23,%zmm23
	vpmadd52huq	%zmm2,%zmm6,%zmm23

	vpmadd52luq	%zmm0,%zmm6,%zmm18
	vpmadd52huq	%zmm0,%zmm6,%zmm19
	vpmadd52luq	%zmm0,%zmm7,%zmm20
	vpmadd52huq	%zmm0,%zmm7,%zmm21
	vpmadd52luq	%zmm0,%zmm8,%zmm22
	vpmadd52huq	%zmm0,%zmm8,%zmm23

	vpmadd52luq	%zmm1,%zmm10,%zmm18
	vpmadd52huq	%zmm1,%zmm10,%zmm19
	vpmadd52luq	%zmm1,%zmm6,%zmm20
	vpmadd52huq	%zmm1,%zmm6,%zmm21
	vpmadd52luq	%zmm1,%zmm7,%zmm22
	vpmadd52huq	%zmm1,%zmm7,%zmm23

	movl	$1,%eax
	kmovw	%eax,%k1
	vpsrldq	$8,%zmm18,%zmm24
	vpsrldq	$8,%zmm19,%zmm0
	vpsrldq	$8,%zmm20,%zmm25
	vpsrldq	$8,%zmm21,%zmm1
	vpaddq	%zmm24,%zmm18,%zmm18
	vpaddq	%zmm0,%zmm19,%zmm19
	vpsrldq	$8,%zmm22,%zmm26
	vpsrldq	$8,%zmm23,%zmm2
	vpaddq	%zmm25,%zmm20,%zmm20
	vpaddq	%zmm1,%zmm21,%zmm21
	vpermq	$0x2,%zmm18,%zmm24
	vpermq	$0x2,%zmm19,%zmm0
	vpaddq	%zmm26,%zmm22,%zmm22
	vpaddq	%zmm2,%zmm23,%zmm23

	vpermq	$0x2,%zmm20,%zmm25
	vpermq	$0x2,%zmm21,%zmm1
	vpaddq	%zmm24,%zmm18,%zmm18
	vpaddq	%zmm0,%zmm19,%zmm19
	vpermq	$0x2,%zmm22,%zmm26
	vpermq	$0x2,%zmm23,%zmm2
	vpaddq	%zmm25,%zmm20,%zmm20
	vpaddq	%zmm1,%zmm21,%zmm21
	vextracti64x4	$1,%zmm18,%ymm24
	vextracti64x4	$1,%zmm19,%ymm0
	vpaddq	%zmm26,%zmm22,%zmm22
	vpaddq	%zmm2,%zmm23,%zmm23

	vextracti64x4	$1,%zmm20,%ymm25
	vextracti64x4	$1,%zmm21,%ymm1
	vextracti64x4	$1,%zmm22,%ymm26
	vextracti64x4	$1,%zmm23,%ymm2
	vpaddq	%ymm24,%ymm18,%ymm18{%k1}{z}
	vpaddq	%ymm0,%ymm19,%ymm19{%k1}{z}
	vpaddq	%ymm25,%ymm20,%ymm20{%k1}{z}
	vpaddq	%ymm1,%ymm21,%ymm21{%k1}{z}
	vpaddq	%ymm26,%ymm22,%ymm22{%k1}{z}
	vpaddq	%ymm2,%ymm23,%ymm23{%k1}{z}

	vpsrlq	$44,%ymm18,%ymm30
	vpsllq	$8,%ymm19,%ymm19
	vpandq	%ymm28,%ymm18,%ymm0
	vpaddq	%ymm30,%ymm19,%ymm19

	vpaddq	%ymm19,%ymm20,%ymm20

	vpsrlq	$44,%ymm20,%ymm30
	vpsllq	$8,%ymm21,%ymm21
	vpandq	%ymm28,%ymm20,%ymm1
	vpaddq	%ymm30,%ymm21,%ymm21

	vpaddq	%ymm21,%ymm22,%ymm22

	vpsrlq	$42,%ymm22,%ymm30
	vpsllq	$10,%ymm23,%ymm23
	vpandq	%ymm29,%ymm22,%ymm2
	vpaddq	%ymm30,%ymm23,%ymm23

	vpaddq	%ymm23,%ymm0,%ymm0
	vpsllq	$2,%ymm23,%ymm23

	vpaddq	%ymm23,%ymm0,%ymm0

	vpsrlq	$44,%ymm0,%ymm30
	vpandq	%ymm28,%ymm0,%ymm0

	vpaddq	%ymm30,%ymm1,%ymm1

	vmovq	%xmm0,0(%rdi)
	vmovq	%xmm1,8(%rdi)
	vmovq	%xmm2,16(%rdi)
	vzeroall

.Lno_data_vpmadd52_8x:
	.byte	0xf3,0xc3
.cfi_endproc
.size	poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
.type	poly1305_emit_base2_44,@function
.align	32
poly1305_emit_base2_44:
.cfi_startproc
	movq	0(%rdi),%r8
	movq	8(%rdi),%r9
	movq	16(%rdi),%r10

	movq	%r9,%rax
	shrq	$20,%r9
	shlq	$44,%rax
	movq	%r10,%rcx
	shrq	$40,%r10
	shlq	$24,%rcx

	addq	%rax,%r8
	adcq	%rcx,%r9
	adcq	$0,%r10

	movq	%r8,%rax
	addq	$5,%r8
	movq	%r9,%rcx
	adcq	$0,%r9
	adcq	$0,%r10
	shrq	$2,%r10
	cmovnzq	%r8,%rax
	cmovnzq	%r9,%rcx

	addq	0(%rdx),%rax
	adcq	8(%rdx),%rcx
	movq	%rax,0(%rsi)
	movq	%rcx,8(%rsi)

	.byte	0xf3,0xc3
.cfi_endproc
.size	poly1305_emit_base2_44,.-poly1305_emit_base2_44
.align	64
.Lconst:
.Lmask24:
.long	0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
.L129:
.long	16777216,0,16777216,0,16777216,0,16777216,0
.Lmask26:
.long	0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
.Lpermd_avx2:
.long	2,2,2,3,2,0,2,1
.Lpermd_avx512:
.long	0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7

.L2_44_inp_permd:
.long	0,1,1,2,2,3,7,7
.L2_44_inp_shift:
.quad	0,12,24,64
.L2_44_mask:
.quad	0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
.L2_44_shift_rgt:
.quad	44,44,42,64
.L2_44_shift_lft:
.quad	8,8,10,64

.align	64
.Lx_mask44:
.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.Lx_mask42:
.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
/* decodes to "Poly1305 for x86_64, CRYPTOGAMS by <appro@openssl.org>" */
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	16
.globl	xor128_encrypt_n_pad
.type	xor128_encrypt_n_pad,@function
.align	16
xor128_encrypt_n_pad:
.cfi_startproc
	subq	%rdx,%rsi
	subq	%rdx,%rdi
	movq	%rcx,%r10
	shrq	$4,%rcx
	jz	.Ltail_enc
	nop
.Loop_enc_xmm:
	movdqu	(%rsi,%rdx,1),%xmm0
	pxor	(%rdx),%xmm0
	movdqu	%xmm0,(%rdi,%rdx,1)
	movdqa	%xmm0,(%rdx)
	leaq	16(%rdx),%rdx
	decq	%rcx
	jnz	.Loop_enc_xmm

	andq	$15,%r10
	jz	.Ldone_enc

.Ltail_enc:
	movq	$16,%rcx
	subq	%r10,%rcx
	xorl	%eax,%eax
.Loop_enc_byte:
	movb	(%rsi,%rdx,1),%al
	xorb	(%rdx),%al
	movb	%al,(%rdi,%rdx,1)
	movb	%al,(%rdx)
	leaq	1(%rdx),%rdx
	decq	%r10
	jnz	.Loop_enc_byte

	xorl	%eax,%eax
.Loop_enc_pad:
	movb	%al,(%rdx)
	leaq	1(%rdx),%rdx
	decq	%rcx
	jnz	.Loop_enc_pad

.Ldone_enc:
	movq	%rdx,%rax
	.byte	0xf3,0xc3
.cfi_endproc
.size	xor128_encrypt_n_pad,.-xor128_encrypt_n_pad

.globl	xor128_decrypt_n_pad
.type	xor128_decrypt_n_pad,@function
.align	16
xor128_decrypt_n_pad:
.cfi_startproc
	subq	%rdx,%rsi
	subq	%rdx,%rdi
	movq	%rcx,%r10
	shrq	$4,%rcx
	jz	.Ltail_dec
	nop
.Loop_dec_xmm:
	movdqu	(%rsi,%rdx,1),%xmm0
	movdqa	(%rdx),%xmm1
	pxor	%xmm0,%xmm1
	movdqu	%xmm1,(%rdi,%rdx,1)
	movdqa	%xmm0,(%rdx)
	leaq	16(%rdx),%rdx
	decq	%rcx
	jnz	.Loop_dec_xmm

	pxor	%xmm1,%xmm1
	andq	$15,%r10
	jz	.Ldone_dec

.Ltail_dec:
	movq	$16,%rcx
	subq	%r10,%rcx
	xorl	%eax,%eax
	xorq	%r11,%r11
.Loop_dec_byte:
	movb	(%rsi,%rdx,1),%r11b
	movb	(%rdx),%al
	xorb	%r11b,%al
	movb	%al,(%rdi,%rdx,1)
	movb	%r11b,(%rdx)
	leaq	1(%rdx),%rdx
	decq	%r10
	jnz	.Loop_dec_byte

	xorl	%eax,%eax
.Loop_dec_pad:
	movb	%al,(%rdx)
	leaq	1(%rdx),%rdx
	decq	%rcx
	jnz	.Loop_dec_pad

.Ldone_dec:
	movq	%rdx,%rax
	.byte	0xf3,0xc3
.cfi_endproc
.size	xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
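/*
 * xor128_encrypt_n_pad and xor128_decrypt_n_pad are ChaCha20-Poly1305
 * helpers: they XOR the input with the one-time pad (keystream) buffer,
 * leave the ciphertext in that buffer zero-padded to a 16-byte boundary
 * (ready to be fed to poly1305_blocks), and return a pointer just past
 * the padding.  A C model of the encrypt direction (illustrative only;
 * arguments follow the register usage above: out in %rdi, inp in %rsi,
 * pad buffer in %rdx, length in %rcx, with a non-zero length assumed):
 *
 *	#include <stddef.h>
 *
 *	static void *xor128_encrypt_n_pad_c(void *out_, const void *inp_,
 *	    void *otp_, size_t len)
 *	{
 *		unsigned char *out = out_, *otp = otp_;
 *		const unsigned char *inp = inp_;
 *		size_t i;
 *
 *		for (i = 0; i < len; i++) {
 *			unsigned char c = inp[i] ^ otp[i];  // ct = pt ^ pad
 *			out[i] = c;
 *			otp[i] = c;	// ciphertext becomes Poly1305 input
 *		}
 *		for (; i % 16 != 0; i++)
 *			otp[i] = 0;	// zero-pad the trailing block
 *		return otp + i;
 *	}
 *
 * The decrypt variant differs only in that the ciphertext (here the
 * input) is what gets copied into the pad buffer.
 */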