/* rsaz-x86_64.S revision 356290 */
1/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/rsaz-x86_64.S 356290 2020-01-02 21:35:28Z jkim $ */ 2/* Do not modify. This file is auto-generated from rsaz-x86_64.pl. */ 3.text 4 5 6 7.globl rsaz_512_sqr 8.type rsaz_512_sqr,@function 9.align 32 10rsaz_512_sqr: 11 pushq %rbx 12 pushq %rbp 13 pushq %r12 14 pushq %r13 15 pushq %r14 16 pushq %r15 17 18 subq $128+24,%rsp 19.Lsqr_body: 20.byte 102,72,15,110,202 21 movq (%rsi),%rdx 22 movq 8(%rsi),%rax 23 movq %rcx,128(%rsp) 24 movl $0x80100,%r11d 25 andl OPENSSL_ia32cap_P+8(%rip),%r11d 26 cmpl $0x80100,%r11d 27 je .Loop_sqrx 28 jmp .Loop_sqr 29 30.align 32 31.Loop_sqr: 32 movl %r8d,128+8(%rsp) 33 34 movq %rdx,%rbx 35 movq %rax,%rbp 36 mulq %rdx 37 movq %rax,%r8 38 movq 16(%rsi),%rax 39 movq %rdx,%r9 40 41 mulq %rbx 42 addq %rax,%r9 43 movq 24(%rsi),%rax 44 movq %rdx,%r10 45 adcq $0,%r10 46 47 mulq %rbx 48 addq %rax,%r10 49 movq 32(%rsi),%rax 50 movq %rdx,%r11 51 adcq $0,%r11 52 53 mulq %rbx 54 addq %rax,%r11 55 movq 40(%rsi),%rax 56 movq %rdx,%r12 57 adcq $0,%r12 58 59 mulq %rbx 60 addq %rax,%r12 61 movq 48(%rsi),%rax 62 movq %rdx,%r13 63 adcq $0,%r13 64 65 mulq %rbx 66 addq %rax,%r13 67 movq 56(%rsi),%rax 68 movq %rdx,%r14 69 adcq $0,%r14 70 71 mulq %rbx 72 addq %rax,%r14 73 movq %rbx,%rax 74 adcq $0,%rdx 75 76 xorq %rcx,%rcx 77 addq %r8,%r8 78 movq %rdx,%r15 79 adcq $0,%rcx 80 81 mulq %rax 82 addq %r8,%rdx 83 adcq $0,%rcx 84 85 movq %rax,(%rsp) 86 movq %rdx,8(%rsp) 87 88 89 movq 16(%rsi),%rax 90 mulq %rbp 91 addq %rax,%r10 92 movq 24(%rsi),%rax 93 movq %rdx,%rbx 94 adcq $0,%rbx 95 96 mulq %rbp 97 addq %rax,%r11 98 movq 32(%rsi),%rax 99 adcq $0,%rdx 100 addq %rbx,%r11 101 movq %rdx,%rbx 102 adcq $0,%rbx 103 104 mulq %rbp 105 addq %rax,%r12 106 movq 40(%rsi),%rax 107 adcq $0,%rdx 108 addq %rbx,%r12 109 movq %rdx,%rbx 110 adcq $0,%rbx 111 112 mulq %rbp 113 addq %rax,%r13 114 movq 48(%rsi),%rax 115 adcq $0,%rdx 116 addq %rbx,%r13 117 movq %rdx,%rbx 118 adcq $0,%rbx 119 120 mulq %rbp 121 addq %rax,%r14 122 movq 
56(%rsi),%rax 123 adcq $0,%rdx 124 addq %rbx,%r14 125 movq %rdx,%rbx 126 adcq $0,%rbx 127 128 mulq %rbp 129 addq %rax,%r15 130 movq %rbp,%rax 131 adcq $0,%rdx 132 addq %rbx,%r15 133 adcq $0,%rdx 134 135 xorq %rbx,%rbx 136 addq %r9,%r9 137 movq %rdx,%r8 138 adcq %r10,%r10 139 adcq $0,%rbx 140 141 mulq %rax 142 143 addq %rcx,%rax 144 movq 16(%rsi),%rbp 145 addq %rax,%r9 146 movq 24(%rsi),%rax 147 adcq %rdx,%r10 148 adcq $0,%rbx 149 150 movq %r9,16(%rsp) 151 movq %r10,24(%rsp) 152 153 154 mulq %rbp 155 addq %rax,%r12 156 movq 32(%rsi),%rax 157 movq %rdx,%rcx 158 adcq $0,%rcx 159 160 mulq %rbp 161 addq %rax,%r13 162 movq 40(%rsi),%rax 163 adcq $0,%rdx 164 addq %rcx,%r13 165 movq %rdx,%rcx 166 adcq $0,%rcx 167 168 mulq %rbp 169 addq %rax,%r14 170 movq 48(%rsi),%rax 171 adcq $0,%rdx 172 addq %rcx,%r14 173 movq %rdx,%rcx 174 adcq $0,%rcx 175 176 mulq %rbp 177 addq %rax,%r15 178 movq 56(%rsi),%rax 179 adcq $0,%rdx 180 addq %rcx,%r15 181 movq %rdx,%rcx 182 adcq $0,%rcx 183 184 mulq %rbp 185 addq %rax,%r8 186 movq %rbp,%rax 187 adcq $0,%rdx 188 addq %rcx,%r8 189 adcq $0,%rdx 190 191 xorq %rcx,%rcx 192 addq %r11,%r11 193 movq %rdx,%r9 194 adcq %r12,%r12 195 adcq $0,%rcx 196 197 mulq %rax 198 199 addq %rbx,%rax 200 movq 24(%rsi),%r10 201 addq %rax,%r11 202 movq 32(%rsi),%rax 203 adcq %rdx,%r12 204 adcq $0,%rcx 205 206 movq %r11,32(%rsp) 207 movq %r12,40(%rsp) 208 209 210 movq %rax,%r11 211 mulq %r10 212 addq %rax,%r14 213 movq 40(%rsi),%rax 214 movq %rdx,%rbx 215 adcq $0,%rbx 216 217 movq %rax,%r12 218 mulq %r10 219 addq %rax,%r15 220 movq 48(%rsi),%rax 221 adcq $0,%rdx 222 addq %rbx,%r15 223 movq %rdx,%rbx 224 adcq $0,%rbx 225 226 movq %rax,%rbp 227 mulq %r10 228 addq %rax,%r8 229 movq 56(%rsi),%rax 230 adcq $0,%rdx 231 addq %rbx,%r8 232 movq %rdx,%rbx 233 adcq $0,%rbx 234 235 mulq %r10 236 addq %rax,%r9 237 movq %r10,%rax 238 adcq $0,%rdx 239 addq %rbx,%r9 240 adcq $0,%rdx 241 242 xorq %rbx,%rbx 243 addq %r13,%r13 244 movq %rdx,%r10 245 adcq %r14,%r14 246 adcq $0,%rbx 247 
248 mulq %rax 249 250 addq %rcx,%rax 251 addq %rax,%r13 252 movq %r12,%rax 253 adcq %rdx,%r14 254 adcq $0,%rbx 255 256 movq %r13,48(%rsp) 257 movq %r14,56(%rsp) 258 259 260 mulq %r11 261 addq %rax,%r8 262 movq %rbp,%rax 263 movq %rdx,%rcx 264 adcq $0,%rcx 265 266 mulq %r11 267 addq %rax,%r9 268 movq 56(%rsi),%rax 269 adcq $0,%rdx 270 addq %rcx,%r9 271 movq %rdx,%rcx 272 adcq $0,%rcx 273 274 movq %rax,%r14 275 mulq %r11 276 addq %rax,%r10 277 movq %r11,%rax 278 adcq $0,%rdx 279 addq %rcx,%r10 280 adcq $0,%rdx 281 282 xorq %rcx,%rcx 283 addq %r15,%r15 284 movq %rdx,%r11 285 adcq %r8,%r8 286 adcq $0,%rcx 287 288 mulq %rax 289 290 addq %rbx,%rax 291 addq %rax,%r15 292 movq %rbp,%rax 293 adcq %rdx,%r8 294 adcq $0,%rcx 295 296 movq %r15,64(%rsp) 297 movq %r8,72(%rsp) 298 299 300 mulq %r12 301 addq %rax,%r10 302 movq %r14,%rax 303 movq %rdx,%rbx 304 adcq $0,%rbx 305 306 mulq %r12 307 addq %rax,%r11 308 movq %r12,%rax 309 adcq $0,%rdx 310 addq %rbx,%r11 311 adcq $0,%rdx 312 313 xorq %rbx,%rbx 314 addq %r9,%r9 315 movq %rdx,%r12 316 adcq %r10,%r10 317 adcq $0,%rbx 318 319 mulq %rax 320 321 addq %rcx,%rax 322 addq %rax,%r9 323 movq %r14,%rax 324 adcq %rdx,%r10 325 adcq $0,%rbx 326 327 movq %r9,80(%rsp) 328 movq %r10,88(%rsp) 329 330 331 mulq %rbp 332 addq %rax,%r12 333 movq %rbp,%rax 334 adcq $0,%rdx 335 336 xorq %rcx,%rcx 337 addq %r11,%r11 338 movq %rdx,%r13 339 adcq %r12,%r12 340 adcq $0,%rcx 341 342 mulq %rax 343 344 addq %rbx,%rax 345 addq %rax,%r11 346 movq %r14,%rax 347 adcq %rdx,%r12 348 adcq $0,%rcx 349 350 movq %r11,96(%rsp) 351 movq %r12,104(%rsp) 352 353 354 xorq %rbx,%rbx 355 addq %r13,%r13 356 adcq $0,%rbx 357 358 mulq %rax 359 360 addq %rcx,%rax 361 addq %r13,%rax 362 adcq %rbx,%rdx 363 364 movq (%rsp),%r8 365 movq 8(%rsp),%r9 366 movq 16(%rsp),%r10 367 movq 24(%rsp),%r11 368 movq 32(%rsp),%r12 369 movq 40(%rsp),%r13 370 movq 48(%rsp),%r14 371 movq 56(%rsp),%r15 372.byte 102,72,15,126,205 373 374 movq %rax,112(%rsp) 375 movq %rdx,120(%rsp) 376 377 call 
__rsaz_512_reduce 378 379 addq 64(%rsp),%r8 380 adcq 72(%rsp),%r9 381 adcq 80(%rsp),%r10 382 adcq 88(%rsp),%r11 383 adcq 96(%rsp),%r12 384 adcq 104(%rsp),%r13 385 adcq 112(%rsp),%r14 386 adcq 120(%rsp),%r15 387 sbbq %rcx,%rcx 388 389 call __rsaz_512_subtract 390 391 movq %r8,%rdx 392 movq %r9,%rax 393 movl 128+8(%rsp),%r8d 394 movq %rdi,%rsi 395 396 decl %r8d 397 jnz .Loop_sqr 398 jmp .Lsqr_tail 399 400.align 32 401.Loop_sqrx: 402 movl %r8d,128+8(%rsp) 403.byte 102,72,15,110,199 404 405 mulxq %rax,%r8,%r9 406 movq %rax,%rbx 407 408 mulxq 16(%rsi),%rcx,%r10 409 xorq %rbp,%rbp 410 411 mulxq 24(%rsi),%rax,%r11 412 adcxq %rcx,%r9 413 414.byte 0xc4,0x62,0xf3,0xf6,0xa6,0x20,0x00,0x00,0x00 415 adcxq %rax,%r10 416 417.byte 0xc4,0x62,0xfb,0xf6,0xae,0x28,0x00,0x00,0x00 418 adcxq %rcx,%r11 419 420 mulxq 48(%rsi),%rcx,%r14 421 adcxq %rax,%r12 422 adcxq %rcx,%r13 423 424 mulxq 56(%rsi),%rax,%r15 425 adcxq %rax,%r14 426 adcxq %rbp,%r15 427 428 mulxq %rdx,%rax,%rdi 429 movq %rbx,%rdx 430 xorq %rcx,%rcx 431 adoxq %r8,%r8 432 adcxq %rdi,%r8 433 adoxq %rbp,%rcx 434 adcxq %rbp,%rcx 435 436 movq %rax,(%rsp) 437 movq %r8,8(%rsp) 438 439 440.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x10,0x00,0x00,0x00 441 adoxq %rax,%r10 442 adcxq %rbx,%r11 443 444 mulxq 24(%rsi),%rdi,%r8 445 adoxq %rdi,%r11 446.byte 0x66 447 adcxq %r8,%r12 448 449 mulxq 32(%rsi),%rax,%rbx 450 adoxq %rax,%r12 451 adcxq %rbx,%r13 452 453 mulxq 40(%rsi),%rdi,%r8 454 adoxq %rdi,%r13 455 adcxq %r8,%r14 456 457.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 458 adoxq %rax,%r14 459 adcxq %rbx,%r15 460 461.byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 462 adoxq %rdi,%r15 463 adcxq %rbp,%r8 464 mulxq %rdx,%rax,%rdi 465 adoxq %rbp,%r8 466.byte 0x48,0x8b,0x96,0x10,0x00,0x00,0x00 467 468 xorq %rbx,%rbx 469 adoxq %r9,%r9 470 471 adcxq %rcx,%rax 472 adoxq %r10,%r10 473 adcxq %rax,%r9 474 adoxq %rbp,%rbx 475 adcxq %rdi,%r10 476 adcxq %rbp,%rbx 477 478 movq %r9,16(%rsp) 479.byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 480 481 482 mulxq 
24(%rsi),%rdi,%r9 483 adoxq %rdi,%r12 484 adcxq %r9,%r13 485 486 mulxq 32(%rsi),%rax,%rcx 487 adoxq %rax,%r13 488 adcxq %rcx,%r14 489 490.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x28,0x00,0x00,0x00 491 adoxq %rdi,%r14 492 adcxq %r9,%r15 493 494.byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 495 adoxq %rax,%r15 496 adcxq %rcx,%r8 497 498 mulxq 56(%rsi),%rdi,%r9 499 adoxq %rdi,%r8 500 adcxq %rbp,%r9 501 mulxq %rdx,%rax,%rdi 502 adoxq %rbp,%r9 503 movq 24(%rsi),%rdx 504 505 xorq %rcx,%rcx 506 adoxq %r11,%r11 507 508 adcxq %rbx,%rax 509 adoxq %r12,%r12 510 adcxq %rax,%r11 511 adoxq %rbp,%rcx 512 adcxq %rdi,%r12 513 adcxq %rbp,%rcx 514 515 movq %r11,32(%rsp) 516 movq %r12,40(%rsp) 517 518 519 mulxq 32(%rsi),%rax,%rbx 520 adoxq %rax,%r14 521 adcxq %rbx,%r15 522 523 mulxq 40(%rsi),%rdi,%r10 524 adoxq %rdi,%r15 525 adcxq %r10,%r8 526 527 mulxq 48(%rsi),%rax,%rbx 528 adoxq %rax,%r8 529 adcxq %rbx,%r9 530 531 mulxq 56(%rsi),%rdi,%r10 532 adoxq %rdi,%r9 533 adcxq %rbp,%r10 534 mulxq %rdx,%rax,%rdi 535 adoxq %rbp,%r10 536 movq 32(%rsi),%rdx 537 538 xorq %rbx,%rbx 539 adoxq %r13,%r13 540 541 adcxq %rcx,%rax 542 adoxq %r14,%r14 543 adcxq %rax,%r13 544 adoxq %rbp,%rbx 545 adcxq %rdi,%r14 546 adcxq %rbp,%rbx 547 548 movq %r13,48(%rsp) 549 movq %r14,56(%rsp) 550 551 552 mulxq 40(%rsi),%rdi,%r11 553 adoxq %rdi,%r8 554 adcxq %r11,%r9 555 556 mulxq 48(%rsi),%rax,%rcx 557 adoxq %rax,%r9 558 adcxq %rcx,%r10 559 560 mulxq 56(%rsi),%rdi,%r11 561 adoxq %rdi,%r10 562 adcxq %rbp,%r11 563 mulxq %rdx,%rax,%rdi 564 movq 40(%rsi),%rdx 565 adoxq %rbp,%r11 566 567 xorq %rcx,%rcx 568 adoxq %r15,%r15 569 570 adcxq %rbx,%rax 571 adoxq %r8,%r8 572 adcxq %rax,%r15 573 adoxq %rbp,%rcx 574 adcxq %rdi,%r8 575 adcxq %rbp,%rcx 576 577 movq %r15,64(%rsp) 578 movq %r8,72(%rsp) 579 580 581.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 582 adoxq %rax,%r10 583 adcxq %rbx,%r11 584 585.byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 586 adoxq %rdi,%r11 587 adcxq %rbp,%r12 588 mulxq %rdx,%rax,%rdi 589 adoxq 
%rbp,%r12 590 movq 48(%rsi),%rdx 591 592 xorq %rbx,%rbx 593 adoxq %r9,%r9 594 595 adcxq %rcx,%rax 596 adoxq %r10,%r10 597 adcxq %rax,%r9 598 adcxq %rdi,%r10 599 adoxq %rbp,%rbx 600 adcxq %rbp,%rbx 601 602 movq %r9,80(%rsp) 603 movq %r10,88(%rsp) 604 605 606.byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 607 adoxq %rax,%r12 608 adoxq %rbp,%r13 609 610 mulxq %rdx,%rax,%rdi 611 xorq %rcx,%rcx 612 movq 56(%rsi),%rdx 613 adoxq %r11,%r11 614 615 adcxq %rbx,%rax 616 adoxq %r12,%r12 617 adcxq %rax,%r11 618 adoxq %rbp,%rcx 619 adcxq %rdi,%r12 620 adcxq %rbp,%rcx 621 622.byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 623.byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 624 625 626 mulxq %rdx,%rax,%rdx 627 xorq %rbx,%rbx 628 adoxq %r13,%r13 629 630 adcxq %rcx,%rax 631 adoxq %rbp,%rbx 632 adcxq %r13,%rax 633 adcxq %rdx,%rbx 634 635.byte 102,72,15,126,199 636.byte 102,72,15,126,205 637 638 movq 128(%rsp),%rdx 639 movq (%rsp),%r8 640 movq 8(%rsp),%r9 641 movq 16(%rsp),%r10 642 movq 24(%rsp),%r11 643 movq 32(%rsp),%r12 644 movq 40(%rsp),%r13 645 movq 48(%rsp),%r14 646 movq 56(%rsp),%r15 647 648 movq %rax,112(%rsp) 649 movq %rbx,120(%rsp) 650 651 call __rsaz_512_reducex 652 653 addq 64(%rsp),%r8 654 adcq 72(%rsp),%r9 655 adcq 80(%rsp),%r10 656 adcq 88(%rsp),%r11 657 adcq 96(%rsp),%r12 658 adcq 104(%rsp),%r13 659 adcq 112(%rsp),%r14 660 adcq 120(%rsp),%r15 661 sbbq %rcx,%rcx 662 663 call __rsaz_512_subtract 664 665 movq %r8,%rdx 666 movq %r9,%rax 667 movl 128+8(%rsp),%r8d 668 movq %rdi,%rsi 669 670 decl %r8d 671 jnz .Loop_sqrx 672 673.Lsqr_tail: 674 675 leaq 128+24+48(%rsp),%rax 676 movq -48(%rax),%r15 677 movq -40(%rax),%r14 678 movq -32(%rax),%r13 679 movq -24(%rax),%r12 680 movq -16(%rax),%rbp 681 movq -8(%rax),%rbx 682 leaq (%rax),%rsp 683.Lsqr_epilogue: 684 .byte 0xf3,0xc3 685.size rsaz_512_sqr,.-rsaz_512_sqr 686.globl rsaz_512_mul 687.type rsaz_512_mul,@function 688.align 32 689rsaz_512_mul: 690 pushq %rbx 691 pushq %rbp 692 pushq %r12 693 pushq %r13 694 pushq %r14 695 pushq 
%r15 696 697 subq $128+24,%rsp 698.Lmul_body: 699.byte 102,72,15,110,199 700.byte 102,72,15,110,201 701 movq %r8,128(%rsp) 702 movl $0x80100,%r11d 703 andl OPENSSL_ia32cap_P+8(%rip),%r11d 704 cmpl $0x80100,%r11d 705 je .Lmulx 706 movq (%rdx),%rbx 707 movq %rdx,%rbp 708 call __rsaz_512_mul 709 710.byte 102,72,15,126,199 711.byte 102,72,15,126,205 712 713 movq (%rsp),%r8 714 movq 8(%rsp),%r9 715 movq 16(%rsp),%r10 716 movq 24(%rsp),%r11 717 movq 32(%rsp),%r12 718 movq 40(%rsp),%r13 719 movq 48(%rsp),%r14 720 movq 56(%rsp),%r15 721 722 call __rsaz_512_reduce 723 jmp .Lmul_tail 724 725.align 32 726.Lmulx: 727 movq %rdx,%rbp 728 movq (%rdx),%rdx 729 call __rsaz_512_mulx 730 731.byte 102,72,15,126,199 732.byte 102,72,15,126,205 733 734 movq 128(%rsp),%rdx 735 movq (%rsp),%r8 736 movq 8(%rsp),%r9 737 movq 16(%rsp),%r10 738 movq 24(%rsp),%r11 739 movq 32(%rsp),%r12 740 movq 40(%rsp),%r13 741 movq 48(%rsp),%r14 742 movq 56(%rsp),%r15 743 744 call __rsaz_512_reducex 745.Lmul_tail: 746 addq 64(%rsp),%r8 747 adcq 72(%rsp),%r9 748 adcq 80(%rsp),%r10 749 adcq 88(%rsp),%r11 750 adcq 96(%rsp),%r12 751 adcq 104(%rsp),%r13 752 adcq 112(%rsp),%r14 753 adcq 120(%rsp),%r15 754 sbbq %rcx,%rcx 755 756 call __rsaz_512_subtract 757 758 leaq 128+24+48(%rsp),%rax 759 movq -48(%rax),%r15 760 movq -40(%rax),%r14 761 movq -32(%rax),%r13 762 movq -24(%rax),%r12 763 movq -16(%rax),%rbp 764 movq -8(%rax),%rbx 765 leaq (%rax),%rsp 766.Lmul_epilogue: 767 .byte 0xf3,0xc3 768.size rsaz_512_mul,.-rsaz_512_mul 769.globl rsaz_512_mul_gather4 770.type rsaz_512_mul_gather4,@function 771.align 32 772rsaz_512_mul_gather4: 773 pushq %rbx 774 pushq %rbp 775 pushq %r12 776 pushq %r13 777 pushq %r14 778 pushq %r15 779 780 subq $152,%rsp 781.Lmul_gather4_body: 782 movd %r9d,%xmm8 783 movdqa .Linc+16(%rip),%xmm1 784 movdqa .Linc(%rip),%xmm0 785 786 pshufd $0,%xmm8,%xmm8 787 movdqa %xmm1,%xmm7 788 movdqa %xmm1,%xmm2 789 paddd %xmm0,%xmm1 790 pcmpeqd %xmm8,%xmm0 791 movdqa %xmm7,%xmm3 792 paddd %xmm1,%xmm2 793 
pcmpeqd %xmm8,%xmm1 794 movdqa %xmm7,%xmm4 795 paddd %xmm2,%xmm3 796 pcmpeqd %xmm8,%xmm2 797 movdqa %xmm7,%xmm5 798 paddd %xmm3,%xmm4 799 pcmpeqd %xmm8,%xmm3 800 movdqa %xmm7,%xmm6 801 paddd %xmm4,%xmm5 802 pcmpeqd %xmm8,%xmm4 803 paddd %xmm5,%xmm6 804 pcmpeqd %xmm8,%xmm5 805 paddd %xmm6,%xmm7 806 pcmpeqd %xmm8,%xmm6 807 pcmpeqd %xmm8,%xmm7 808 809 movdqa 0(%rdx),%xmm8 810 movdqa 16(%rdx),%xmm9 811 movdqa 32(%rdx),%xmm10 812 movdqa 48(%rdx),%xmm11 813 pand %xmm0,%xmm8 814 movdqa 64(%rdx),%xmm12 815 pand %xmm1,%xmm9 816 movdqa 80(%rdx),%xmm13 817 pand %xmm2,%xmm10 818 movdqa 96(%rdx),%xmm14 819 pand %xmm3,%xmm11 820 movdqa 112(%rdx),%xmm15 821 leaq 128(%rdx),%rbp 822 pand %xmm4,%xmm12 823 pand %xmm5,%xmm13 824 pand %xmm6,%xmm14 825 pand %xmm7,%xmm15 826 por %xmm10,%xmm8 827 por %xmm11,%xmm9 828 por %xmm12,%xmm8 829 por %xmm13,%xmm9 830 por %xmm14,%xmm8 831 por %xmm15,%xmm9 832 833 por %xmm9,%xmm8 834 pshufd $0x4e,%xmm8,%xmm9 835 por %xmm9,%xmm8 836 movl $0x80100,%r11d 837 andl OPENSSL_ia32cap_P+8(%rip),%r11d 838 cmpl $0x80100,%r11d 839 je .Lmulx_gather 840.byte 102,76,15,126,195 841 842 movq %r8,128(%rsp) 843 movq %rdi,128+8(%rsp) 844 movq %rcx,128+16(%rsp) 845 846 movq (%rsi),%rax 847 movq 8(%rsi),%rcx 848 mulq %rbx 849 movq %rax,(%rsp) 850 movq %rcx,%rax 851 movq %rdx,%r8 852 853 mulq %rbx 854 addq %rax,%r8 855 movq 16(%rsi),%rax 856 movq %rdx,%r9 857 adcq $0,%r9 858 859 mulq %rbx 860 addq %rax,%r9 861 movq 24(%rsi),%rax 862 movq %rdx,%r10 863 adcq $0,%r10 864 865 mulq %rbx 866 addq %rax,%r10 867 movq 32(%rsi),%rax 868 movq %rdx,%r11 869 adcq $0,%r11 870 871 mulq %rbx 872 addq %rax,%r11 873 movq 40(%rsi),%rax 874 movq %rdx,%r12 875 adcq $0,%r12 876 877 mulq %rbx 878 addq %rax,%r12 879 movq 48(%rsi),%rax 880 movq %rdx,%r13 881 adcq $0,%r13 882 883 mulq %rbx 884 addq %rax,%r13 885 movq 56(%rsi),%rax 886 movq %rdx,%r14 887 adcq $0,%r14 888 889 mulq %rbx 890 addq %rax,%r14 891 movq (%rsi),%rax 892 movq %rdx,%r15 893 adcq $0,%r15 894 895 leaq 8(%rsp),%rdi 896 movl 
$7,%ecx 897 jmp .Loop_mul_gather 898 899.align 32 900.Loop_mul_gather: 901 movdqa 0(%rbp),%xmm8 902 movdqa 16(%rbp),%xmm9 903 movdqa 32(%rbp),%xmm10 904 movdqa 48(%rbp),%xmm11 905 pand %xmm0,%xmm8 906 movdqa 64(%rbp),%xmm12 907 pand %xmm1,%xmm9 908 movdqa 80(%rbp),%xmm13 909 pand %xmm2,%xmm10 910 movdqa 96(%rbp),%xmm14 911 pand %xmm3,%xmm11 912 movdqa 112(%rbp),%xmm15 913 leaq 128(%rbp),%rbp 914 pand %xmm4,%xmm12 915 pand %xmm5,%xmm13 916 pand %xmm6,%xmm14 917 pand %xmm7,%xmm15 918 por %xmm10,%xmm8 919 por %xmm11,%xmm9 920 por %xmm12,%xmm8 921 por %xmm13,%xmm9 922 por %xmm14,%xmm8 923 por %xmm15,%xmm9 924 925 por %xmm9,%xmm8 926 pshufd $0x4e,%xmm8,%xmm9 927 por %xmm9,%xmm8 928.byte 102,76,15,126,195 929 930 mulq %rbx 931 addq %rax,%r8 932 movq 8(%rsi),%rax 933 movq %r8,(%rdi) 934 movq %rdx,%r8 935 adcq $0,%r8 936 937 mulq %rbx 938 addq %rax,%r9 939 movq 16(%rsi),%rax 940 adcq $0,%rdx 941 addq %r9,%r8 942 movq %rdx,%r9 943 adcq $0,%r9 944 945 mulq %rbx 946 addq %rax,%r10 947 movq 24(%rsi),%rax 948 adcq $0,%rdx 949 addq %r10,%r9 950 movq %rdx,%r10 951 adcq $0,%r10 952 953 mulq %rbx 954 addq %rax,%r11 955 movq 32(%rsi),%rax 956 adcq $0,%rdx 957 addq %r11,%r10 958 movq %rdx,%r11 959 adcq $0,%r11 960 961 mulq %rbx 962 addq %rax,%r12 963 movq 40(%rsi),%rax 964 adcq $0,%rdx 965 addq %r12,%r11 966 movq %rdx,%r12 967 adcq $0,%r12 968 969 mulq %rbx 970 addq %rax,%r13 971 movq 48(%rsi),%rax 972 adcq $0,%rdx 973 addq %r13,%r12 974 movq %rdx,%r13 975 adcq $0,%r13 976 977 mulq %rbx 978 addq %rax,%r14 979 movq 56(%rsi),%rax 980 adcq $0,%rdx 981 addq %r14,%r13 982 movq %rdx,%r14 983 adcq $0,%r14 984 985 mulq %rbx 986 addq %rax,%r15 987 movq (%rsi),%rax 988 adcq $0,%rdx 989 addq %r15,%r14 990 movq %rdx,%r15 991 adcq $0,%r15 992 993 leaq 8(%rdi),%rdi 994 995 decl %ecx 996 jnz .Loop_mul_gather 997 998 movq %r8,(%rdi) 999 movq %r9,8(%rdi) 1000 movq %r10,16(%rdi) 1001 movq %r11,24(%rdi) 1002 movq %r12,32(%rdi) 1003 movq %r13,40(%rdi) 1004 movq %r14,48(%rdi) 1005 movq %r15,56(%rdi) 1006 
1007 movq 128+8(%rsp),%rdi 1008 movq 128+16(%rsp),%rbp 1009 1010 movq (%rsp),%r8 1011 movq 8(%rsp),%r9 1012 movq 16(%rsp),%r10 1013 movq 24(%rsp),%r11 1014 movq 32(%rsp),%r12 1015 movq 40(%rsp),%r13 1016 movq 48(%rsp),%r14 1017 movq 56(%rsp),%r15 1018 1019 call __rsaz_512_reduce 1020 jmp .Lmul_gather_tail 1021 1022.align 32 1023.Lmulx_gather: 1024.byte 102,76,15,126,194 1025 1026 movq %r8,128(%rsp) 1027 movq %rdi,128+8(%rsp) 1028 movq %rcx,128+16(%rsp) 1029 1030 mulxq (%rsi),%rbx,%r8 1031 movq %rbx,(%rsp) 1032 xorl %edi,%edi 1033 1034 mulxq 8(%rsi),%rax,%r9 1035 1036 mulxq 16(%rsi),%rbx,%r10 1037 adcxq %rax,%r8 1038 1039 mulxq 24(%rsi),%rax,%r11 1040 adcxq %rbx,%r9 1041 1042 mulxq 32(%rsi),%rbx,%r12 1043 adcxq %rax,%r10 1044 1045 mulxq 40(%rsi),%rax,%r13 1046 adcxq %rbx,%r11 1047 1048 mulxq 48(%rsi),%rbx,%r14 1049 adcxq %rax,%r12 1050 1051 mulxq 56(%rsi),%rax,%r15 1052 adcxq %rbx,%r13 1053 adcxq %rax,%r14 1054.byte 0x67 1055 movq %r8,%rbx 1056 adcxq %rdi,%r15 1057 1058 movq $-7,%rcx 1059 jmp .Loop_mulx_gather 1060 1061.align 32 1062.Loop_mulx_gather: 1063 movdqa 0(%rbp),%xmm8 1064 movdqa 16(%rbp),%xmm9 1065 movdqa 32(%rbp),%xmm10 1066 movdqa 48(%rbp),%xmm11 1067 pand %xmm0,%xmm8 1068 movdqa 64(%rbp),%xmm12 1069 pand %xmm1,%xmm9 1070 movdqa 80(%rbp),%xmm13 1071 pand %xmm2,%xmm10 1072 movdqa 96(%rbp),%xmm14 1073 pand %xmm3,%xmm11 1074 movdqa 112(%rbp),%xmm15 1075 leaq 128(%rbp),%rbp 1076 pand %xmm4,%xmm12 1077 pand %xmm5,%xmm13 1078 pand %xmm6,%xmm14 1079 pand %xmm7,%xmm15 1080 por %xmm10,%xmm8 1081 por %xmm11,%xmm9 1082 por %xmm12,%xmm8 1083 por %xmm13,%xmm9 1084 por %xmm14,%xmm8 1085 por %xmm15,%xmm9 1086 1087 por %xmm9,%xmm8 1088 pshufd $0x4e,%xmm8,%xmm9 1089 por %xmm9,%xmm8 1090.byte 102,76,15,126,194 1091 1092.byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00 1093 adcxq %rax,%rbx 1094 adoxq %r9,%r8 1095 1096 mulxq 8(%rsi),%rax,%r9 1097 adcxq %rax,%r8 1098 adoxq %r10,%r9 1099 1100 mulxq 16(%rsi),%rax,%r10 1101 adcxq %rax,%r9 1102 adoxq %r11,%r10 1103 1104.byte 
0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 1105 adcxq %rax,%r10 1106 adoxq %r12,%r11 1107 1108 mulxq 32(%rsi),%rax,%r12 1109 adcxq %rax,%r11 1110 adoxq %r13,%r12 1111 1112 mulxq 40(%rsi),%rax,%r13 1113 adcxq %rax,%r12 1114 adoxq %r14,%r13 1115 1116.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 1117 adcxq %rax,%r13 1118.byte 0x67 1119 adoxq %r15,%r14 1120 1121 mulxq 56(%rsi),%rax,%r15 1122 movq %rbx,64(%rsp,%rcx,8) 1123 adcxq %rax,%r14 1124 adoxq %rdi,%r15 1125 movq %r8,%rbx 1126 adcxq %rdi,%r15 1127 1128 incq %rcx 1129 jnz .Loop_mulx_gather 1130 1131 movq %r8,64(%rsp) 1132 movq %r9,64+8(%rsp) 1133 movq %r10,64+16(%rsp) 1134 movq %r11,64+24(%rsp) 1135 movq %r12,64+32(%rsp) 1136 movq %r13,64+40(%rsp) 1137 movq %r14,64+48(%rsp) 1138 movq %r15,64+56(%rsp) 1139 1140 movq 128(%rsp),%rdx 1141 movq 128+8(%rsp),%rdi 1142 movq 128+16(%rsp),%rbp 1143 1144 movq (%rsp),%r8 1145 movq 8(%rsp),%r9 1146 movq 16(%rsp),%r10 1147 movq 24(%rsp),%r11 1148 movq 32(%rsp),%r12 1149 movq 40(%rsp),%r13 1150 movq 48(%rsp),%r14 1151 movq 56(%rsp),%r15 1152 1153 call __rsaz_512_reducex 1154 1155.Lmul_gather_tail: 1156 addq 64(%rsp),%r8 1157 adcq 72(%rsp),%r9 1158 adcq 80(%rsp),%r10 1159 adcq 88(%rsp),%r11 1160 adcq 96(%rsp),%r12 1161 adcq 104(%rsp),%r13 1162 adcq 112(%rsp),%r14 1163 adcq 120(%rsp),%r15 1164 sbbq %rcx,%rcx 1165 1166 call __rsaz_512_subtract 1167 1168 leaq 128+24+48(%rsp),%rax 1169 movq -48(%rax),%r15 1170 movq -40(%rax),%r14 1171 movq -32(%rax),%r13 1172 movq -24(%rax),%r12 1173 movq -16(%rax),%rbp 1174 movq -8(%rax),%rbx 1175 leaq (%rax),%rsp 1176.Lmul_gather4_epilogue: 1177 .byte 0xf3,0xc3 1178.size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4 1179.globl rsaz_512_mul_scatter4 1180.type rsaz_512_mul_scatter4,@function 1181.align 32 1182rsaz_512_mul_scatter4: 1183 pushq %rbx 1184 pushq %rbp 1185 pushq %r12 1186 pushq %r13 1187 pushq %r14 1188 pushq %r15 1189 1190 movl %r9d,%r9d 1191 subq $128+24,%rsp 1192.Lmul_scatter4_body: 1193 leaq (%r8,%r9,8),%r8 1194.byte 
102,72,15,110,199 1195.byte 102,72,15,110,202 1196.byte 102,73,15,110,208 1197 movq %rcx,128(%rsp) 1198 1199 movq %rdi,%rbp 1200 movl $0x80100,%r11d 1201 andl OPENSSL_ia32cap_P+8(%rip),%r11d 1202 cmpl $0x80100,%r11d 1203 je .Lmulx_scatter 1204 movq (%rdi),%rbx 1205 call __rsaz_512_mul 1206 1207.byte 102,72,15,126,199 1208.byte 102,72,15,126,205 1209 1210 movq (%rsp),%r8 1211 movq 8(%rsp),%r9 1212 movq 16(%rsp),%r10 1213 movq 24(%rsp),%r11 1214 movq 32(%rsp),%r12 1215 movq 40(%rsp),%r13 1216 movq 48(%rsp),%r14 1217 movq 56(%rsp),%r15 1218 1219 call __rsaz_512_reduce 1220 jmp .Lmul_scatter_tail 1221 1222.align 32 1223.Lmulx_scatter: 1224 movq (%rdi),%rdx 1225 call __rsaz_512_mulx 1226 1227.byte 102,72,15,126,199 1228.byte 102,72,15,126,205 1229 1230 movq 128(%rsp),%rdx 1231 movq (%rsp),%r8 1232 movq 8(%rsp),%r9 1233 movq 16(%rsp),%r10 1234 movq 24(%rsp),%r11 1235 movq 32(%rsp),%r12 1236 movq 40(%rsp),%r13 1237 movq 48(%rsp),%r14 1238 movq 56(%rsp),%r15 1239 1240 call __rsaz_512_reducex 1241 1242.Lmul_scatter_tail: 1243 addq 64(%rsp),%r8 1244 adcq 72(%rsp),%r9 1245 adcq 80(%rsp),%r10 1246 adcq 88(%rsp),%r11 1247 adcq 96(%rsp),%r12 1248 adcq 104(%rsp),%r13 1249 adcq 112(%rsp),%r14 1250 adcq 120(%rsp),%r15 1251.byte 102,72,15,126,214 1252 sbbq %rcx,%rcx 1253 1254 call __rsaz_512_subtract 1255 1256 movq %r8,0(%rsi) 1257 movq %r9,128(%rsi) 1258 movq %r10,256(%rsi) 1259 movq %r11,384(%rsi) 1260 movq %r12,512(%rsi) 1261 movq %r13,640(%rsi) 1262 movq %r14,768(%rsi) 1263 movq %r15,896(%rsi) 1264 1265 leaq 128+24+48(%rsp),%rax 1266 movq -48(%rax),%r15 1267 movq -40(%rax),%r14 1268 movq -32(%rax),%r13 1269 movq -24(%rax),%r12 1270 movq -16(%rax),%rbp 1271 movq -8(%rax),%rbx 1272 leaq (%rax),%rsp 1273.Lmul_scatter4_epilogue: 1274 .byte 0xf3,0xc3 1275.size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4 1276.globl rsaz_512_mul_by_one 1277.type rsaz_512_mul_by_one,@function 1278.align 32 1279rsaz_512_mul_by_one: 1280 pushq %rbx 1281 pushq %rbp 1282 pushq %r12 1283 pushq %r13 1284 
pushq %r14 1285 pushq %r15 1286 1287 subq $128+24,%rsp 1288.Lmul_by_one_body: 1289 movl OPENSSL_ia32cap_P+8(%rip),%eax 1290 movq %rdx,%rbp 1291 movq %rcx,128(%rsp) 1292 1293 movq (%rsi),%r8 1294 pxor %xmm0,%xmm0 1295 movq 8(%rsi),%r9 1296 movq 16(%rsi),%r10 1297 movq 24(%rsi),%r11 1298 movq 32(%rsi),%r12 1299 movq 40(%rsi),%r13 1300 movq 48(%rsi),%r14 1301 movq 56(%rsi),%r15 1302 1303 movdqa %xmm0,(%rsp) 1304 movdqa %xmm0,16(%rsp) 1305 movdqa %xmm0,32(%rsp) 1306 movdqa %xmm0,48(%rsp) 1307 movdqa %xmm0,64(%rsp) 1308 movdqa %xmm0,80(%rsp) 1309 movdqa %xmm0,96(%rsp) 1310 andl $0x80100,%eax 1311 cmpl $0x80100,%eax 1312 je .Lby_one_callx 1313 call __rsaz_512_reduce 1314 jmp .Lby_one_tail 1315.align 32 1316.Lby_one_callx: 1317 movq 128(%rsp),%rdx 1318 call __rsaz_512_reducex 1319.Lby_one_tail: 1320 movq %r8,(%rdi) 1321 movq %r9,8(%rdi) 1322 movq %r10,16(%rdi) 1323 movq %r11,24(%rdi) 1324 movq %r12,32(%rdi) 1325 movq %r13,40(%rdi) 1326 movq %r14,48(%rdi) 1327 movq %r15,56(%rdi) 1328 1329 leaq 128+24+48(%rsp),%rax 1330 movq -48(%rax),%r15 1331 movq -40(%rax),%r14 1332 movq -32(%rax),%r13 1333 movq -24(%rax),%r12 1334 movq -16(%rax),%rbp 1335 movq -8(%rax),%rbx 1336 leaq (%rax),%rsp 1337.Lmul_by_one_epilogue: 1338 .byte 0xf3,0xc3 1339.size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one 1340.type __rsaz_512_reduce,@function 1341.align 32 1342__rsaz_512_reduce: 1343 movq %r8,%rbx 1344 imulq 128+8(%rsp),%rbx 1345 movq 0(%rbp),%rax 1346 movl $8,%ecx 1347 jmp .Lreduction_loop 1348 1349.align 32 1350.Lreduction_loop: 1351 mulq %rbx 1352 movq 8(%rbp),%rax 1353 negq %r8 1354 movq %rdx,%r8 1355 adcq $0,%r8 1356 1357 mulq %rbx 1358 addq %rax,%r9 1359 movq 16(%rbp),%rax 1360 adcq $0,%rdx 1361 addq %r9,%r8 1362 movq %rdx,%r9 1363 adcq $0,%r9 1364 1365 mulq %rbx 1366 addq %rax,%r10 1367 movq 24(%rbp),%rax 1368 adcq $0,%rdx 1369 addq %r10,%r9 1370 movq %rdx,%r10 1371 adcq $0,%r10 1372 1373 mulq %rbx 1374 addq %rax,%r11 1375 movq 32(%rbp),%rax 1376 adcq $0,%rdx 1377 addq %r11,%r10 1378 movq 
128+8(%rsp),%rsi 1379 1380 1381 adcq $0,%rdx 1382 movq %rdx,%r11 1383 1384 mulq %rbx 1385 addq %rax,%r12 1386 movq 40(%rbp),%rax 1387 adcq $0,%rdx 1388 imulq %r8,%rsi 1389 addq %r12,%r11 1390 movq %rdx,%r12 1391 adcq $0,%r12 1392 1393 mulq %rbx 1394 addq %rax,%r13 1395 movq 48(%rbp),%rax 1396 adcq $0,%rdx 1397 addq %r13,%r12 1398 movq %rdx,%r13 1399 adcq $0,%r13 1400 1401 mulq %rbx 1402 addq %rax,%r14 1403 movq 56(%rbp),%rax 1404 adcq $0,%rdx 1405 addq %r14,%r13 1406 movq %rdx,%r14 1407 adcq $0,%r14 1408 1409 mulq %rbx 1410 movq %rsi,%rbx 1411 addq %rax,%r15 1412 movq 0(%rbp),%rax 1413 adcq $0,%rdx 1414 addq %r15,%r14 1415 movq %rdx,%r15 1416 adcq $0,%r15 1417 1418 decl %ecx 1419 jne .Lreduction_loop 1420 1421 .byte 0xf3,0xc3 1422.size __rsaz_512_reduce,.-__rsaz_512_reduce 1423.type __rsaz_512_reducex,@function 1424.align 32 1425__rsaz_512_reducex: 1426 1427 imulq %r8,%rdx 1428 xorq %rsi,%rsi 1429 movl $8,%ecx 1430 jmp .Lreduction_loopx 1431 1432.align 32 1433.Lreduction_loopx: 1434 movq %r8,%rbx 1435 mulxq 0(%rbp),%rax,%r8 1436 adcxq %rbx,%rax 1437 adoxq %r9,%r8 1438 1439 mulxq 8(%rbp),%rax,%r9 1440 adcxq %rax,%r8 1441 adoxq %r10,%r9 1442 1443 mulxq 16(%rbp),%rbx,%r10 1444 adcxq %rbx,%r9 1445 adoxq %r11,%r10 1446 1447 mulxq 24(%rbp),%rbx,%r11 1448 adcxq %rbx,%r10 1449 adoxq %r12,%r11 1450 1451.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 1452 movq %rdx,%rax 1453 movq %r8,%rdx 1454 adcxq %rbx,%r11 1455 adoxq %r13,%r12 1456 1457 mulxq 128+8(%rsp),%rbx,%rdx 1458 movq %rax,%rdx 1459 1460 mulxq 40(%rbp),%rax,%r13 1461 adcxq %rax,%r12 1462 adoxq %r14,%r13 1463 1464.byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 1465 adcxq %rax,%r13 1466 adoxq %r15,%r14 1467 1468 mulxq 56(%rbp),%rax,%r15 1469 movq %rbx,%rdx 1470 adcxq %rax,%r14 1471 adoxq %rsi,%r15 1472 adcxq %rsi,%r15 1473 1474 decl %ecx 1475 jne .Lreduction_loopx 1476 1477 .byte 0xf3,0xc3 1478.size __rsaz_512_reducex,.-__rsaz_512_reducex 1479.type __rsaz_512_subtract,@function 1480.align 32 
1481__rsaz_512_subtract: 1482 movq %r8,(%rdi) 1483 movq %r9,8(%rdi) 1484 movq %r10,16(%rdi) 1485 movq %r11,24(%rdi) 1486 movq %r12,32(%rdi) 1487 movq %r13,40(%rdi) 1488 movq %r14,48(%rdi) 1489 movq %r15,56(%rdi) 1490 1491 movq 0(%rbp),%r8 1492 movq 8(%rbp),%r9 1493 negq %r8 1494 notq %r9 1495 andq %rcx,%r8 1496 movq 16(%rbp),%r10 1497 andq %rcx,%r9 1498 notq %r10 1499 movq 24(%rbp),%r11 1500 andq %rcx,%r10 1501 notq %r11 1502 movq 32(%rbp),%r12 1503 andq %rcx,%r11 1504 notq %r12 1505 movq 40(%rbp),%r13 1506 andq %rcx,%r12 1507 notq %r13 1508 movq 48(%rbp),%r14 1509 andq %rcx,%r13 1510 notq %r14 1511 movq 56(%rbp),%r15 1512 andq %rcx,%r14 1513 notq %r15 1514 andq %rcx,%r15 1515 1516 addq (%rdi),%r8 1517 adcq 8(%rdi),%r9 1518 adcq 16(%rdi),%r10 1519 adcq 24(%rdi),%r11 1520 adcq 32(%rdi),%r12 1521 adcq 40(%rdi),%r13 1522 adcq 48(%rdi),%r14 1523 adcq 56(%rdi),%r15 1524 1525 movq %r8,(%rdi) 1526 movq %r9,8(%rdi) 1527 movq %r10,16(%rdi) 1528 movq %r11,24(%rdi) 1529 movq %r12,32(%rdi) 1530 movq %r13,40(%rdi) 1531 movq %r14,48(%rdi) 1532 movq %r15,56(%rdi) 1533 1534 .byte 0xf3,0xc3 1535.size __rsaz_512_subtract,.-__rsaz_512_subtract 1536.type __rsaz_512_mul,@function 1537.align 32 1538__rsaz_512_mul: 1539 leaq 8(%rsp),%rdi 1540 1541 movq (%rsi),%rax 1542 mulq %rbx 1543 movq %rax,(%rdi) 1544 movq 8(%rsi),%rax 1545 movq %rdx,%r8 1546 1547 mulq %rbx 1548 addq %rax,%r8 1549 movq 16(%rsi),%rax 1550 movq %rdx,%r9 1551 adcq $0,%r9 1552 1553 mulq %rbx 1554 addq %rax,%r9 1555 movq 24(%rsi),%rax 1556 movq %rdx,%r10 1557 adcq $0,%r10 1558 1559 mulq %rbx 1560 addq %rax,%r10 1561 movq 32(%rsi),%rax 1562 movq %rdx,%r11 1563 adcq $0,%r11 1564 1565 mulq %rbx 1566 addq %rax,%r11 1567 movq 40(%rsi),%rax 1568 movq %rdx,%r12 1569 adcq $0,%r12 1570 1571 mulq %rbx 1572 addq %rax,%r12 1573 movq 48(%rsi),%rax 1574 movq %rdx,%r13 1575 adcq $0,%r13 1576 1577 mulq %rbx 1578 addq %rax,%r13 1579 movq 56(%rsi),%rax 1580 movq %rdx,%r14 1581 adcq $0,%r14 1582 1583 mulq %rbx 1584 addq %rax,%r14 1585 movq 
(%rsi),%rax 1586 movq %rdx,%r15 1587 adcq $0,%r15 1588 1589 leaq 8(%rbp),%rbp 1590 leaq 8(%rdi),%rdi 1591 1592 movl $7,%ecx 1593 jmp .Loop_mul 1594 1595.align 32 1596.Loop_mul: 1597 movq (%rbp),%rbx 1598 mulq %rbx 1599 addq %rax,%r8 1600 movq 8(%rsi),%rax 1601 movq %r8,(%rdi) 1602 movq %rdx,%r8 1603 adcq $0,%r8 1604 1605 mulq %rbx 1606 addq %rax,%r9 1607 movq 16(%rsi),%rax 1608 adcq $0,%rdx 1609 addq %r9,%r8 1610 movq %rdx,%r9 1611 adcq $0,%r9 1612 1613 mulq %rbx 1614 addq %rax,%r10 1615 movq 24(%rsi),%rax 1616 adcq $0,%rdx 1617 addq %r10,%r9 1618 movq %rdx,%r10 1619 adcq $0,%r10 1620 1621 mulq %rbx 1622 addq %rax,%r11 1623 movq 32(%rsi),%rax 1624 adcq $0,%rdx 1625 addq %r11,%r10 1626 movq %rdx,%r11 1627 adcq $0,%r11 1628 1629 mulq %rbx 1630 addq %rax,%r12 1631 movq 40(%rsi),%rax 1632 adcq $0,%rdx 1633 addq %r12,%r11 1634 movq %rdx,%r12 1635 adcq $0,%r12 1636 1637 mulq %rbx 1638 addq %rax,%r13 1639 movq 48(%rsi),%rax 1640 adcq $0,%rdx 1641 addq %r13,%r12 1642 movq %rdx,%r13 1643 adcq $0,%r13 1644 1645 mulq %rbx 1646 addq %rax,%r14 1647 movq 56(%rsi),%rax 1648 adcq $0,%rdx 1649 addq %r14,%r13 1650 movq %rdx,%r14 1651 leaq 8(%rbp),%rbp 1652 adcq $0,%r14 1653 1654 mulq %rbx 1655 addq %rax,%r15 1656 movq (%rsi),%rax 1657 adcq $0,%rdx 1658 addq %r15,%r14 1659 movq %rdx,%r15 1660 adcq $0,%r15 1661 1662 leaq 8(%rdi),%rdi 1663 1664 decl %ecx 1665 jnz .Loop_mul 1666 1667 movq %r8,(%rdi) 1668 movq %r9,8(%rdi) 1669 movq %r10,16(%rdi) 1670 movq %r11,24(%rdi) 1671 movq %r12,32(%rdi) 1672 movq %r13,40(%rdi) 1673 movq %r14,48(%rdi) 1674 movq %r15,56(%rdi) 1675 1676 .byte 0xf3,0xc3 1677.size __rsaz_512_mul,.-__rsaz_512_mul 1678.type __rsaz_512_mulx,@function 1679.align 32 1680__rsaz_512_mulx: 1681 mulxq (%rsi),%rbx,%r8 1682 movq $-6,%rcx 1683 1684 mulxq 8(%rsi),%rax,%r9 1685 movq %rbx,8(%rsp) 1686 1687 mulxq 16(%rsi),%rbx,%r10 1688 adcq %rax,%r8 1689 1690 mulxq 24(%rsi),%rax,%r11 1691 adcq %rbx,%r9 1692 1693 mulxq 32(%rsi),%rbx,%r12 1694 adcq %rax,%r10 1695 1696 mulxq 
40(%rsi),%rax,%r13 1697 adcq %rbx,%r11 1698 1699 mulxq 48(%rsi),%rbx,%r14 1700 adcq %rax,%r12 1701 1702 mulxq 56(%rsi),%rax,%r15 1703 movq 8(%rbp),%rdx 1704 adcq %rbx,%r13 1705 adcq %rax,%r14 1706 adcq $0,%r15 1707 1708 xorq %rdi,%rdi 1709 jmp .Loop_mulx 1710 1711.align 32 1712.Loop_mulx: 1713 movq %r8,%rbx 1714 mulxq (%rsi),%rax,%r8 1715 adcxq %rax,%rbx 1716 adoxq %r9,%r8 1717 1718 mulxq 8(%rsi),%rax,%r9 1719 adcxq %rax,%r8 1720 adoxq %r10,%r9 1721 1722 mulxq 16(%rsi),%rax,%r10 1723 adcxq %rax,%r9 1724 adoxq %r11,%r10 1725 1726 mulxq 24(%rsi),%rax,%r11 1727 adcxq %rax,%r10 1728 adoxq %r12,%r11 1729 1730.byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 1731 adcxq %rax,%r11 1732 adoxq %r13,%r12 1733 1734 mulxq 40(%rsi),%rax,%r13 1735 adcxq %rax,%r12 1736 adoxq %r14,%r13 1737 1738 mulxq 48(%rsi),%rax,%r14 1739 adcxq %rax,%r13 1740 adoxq %r15,%r14 1741 1742 mulxq 56(%rsi),%rax,%r15 1743 movq 64(%rbp,%rcx,8),%rdx 1744 movq %rbx,8+64-8(%rsp,%rcx,8) 1745 adcxq %rax,%r14 1746 adoxq %rdi,%r15 1747 adcxq %rdi,%r15 1748 1749 incq %rcx 1750 jnz .Loop_mulx 1751 1752 movq %r8,%rbx 1753 mulxq (%rsi),%rax,%r8 1754 adcxq %rax,%rbx 1755 adoxq %r9,%r8 1756 1757.byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 1758 adcxq %rax,%r8 1759 adoxq %r10,%r9 1760 1761.byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 1762 adcxq %rax,%r9 1763 adoxq %r11,%r10 1764 1765 mulxq 24(%rsi),%rax,%r11 1766 adcxq %rax,%r10 1767 adoxq %r12,%r11 1768 1769 mulxq 32(%rsi),%rax,%r12 1770 adcxq %rax,%r11 1771 adoxq %r13,%r12 1772 1773 mulxq 40(%rsi),%rax,%r13 1774 adcxq %rax,%r12 1775 adoxq %r14,%r13 1776 1777.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 1778 adcxq %rax,%r13 1779 adoxq %r15,%r14 1780 1781.byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 1782 adcxq %rax,%r14 1783 adoxq %rdi,%r15 1784 adcxq %rdi,%r15 1785 1786 movq %rbx,8+64-8(%rsp) 1787 movq %r8,8+64(%rsp) 1788 movq %r9,8+64+8(%rsp) 1789 movq %r10,8+64+16(%rsp) 1790 movq %r11,8+64+24(%rsp) 1791 movq %r12,8+64+32(%rsp) 1792 movq 
/* NOTE(review): interleaved decimal tokens are pasted line numbers; code below
 * is byte-identical, comments only added.
 *
 * Three things on this chunk line:
 *
 * 1) Tail of __rsaz_512_mulx: the last result limbs r13..r15 are stored to
 *    8+64+40..56(%rsp), then ".byte 0xf3,0xc3" ("rep ret") returns.
 *
 * 2) rsaz_512_scatter4 (complete): copies 8 qwords from %rsi into a table.
 *    The destination starts at %rdi + %rdx*8 (index %rdx selects the slot)
 *    and advances by 128 bytes per limb, so one value's limbs are interleaved
 *    across 128-byte-strided table lines — the layout the gather below undoes.
 *
 * 3) Start of rsaz_512_gather4: builds eight XMM equality masks for a
 *    constant-time table lookup.  The index in %edx is broadcast to all lanes
 *    of %xmm8 (movd + pshufd $0), and counter vectors derived from the .Linc
 *    constants (on the next chunk line) are compared against it with pcmpeqd,
 *    leaving all-ones in exactly one of %xmm0..%xmm7.  .Loop_gather then reads
 *    ALL eight 16-byte lanes of each 128-byte table line unconditionally and
 *    masks them with pand/por, so the memory access pattern is independent of
 *    the secret index (cache-timing defense).  The pshufd $0x4e / por pair
 *    folds the high qword onto the low one before the store (store itself is
 *    on the next chunk line). */
%r13,8+64+40(%rsp) 1793 movq %r14,8+64+48(%rsp) 1794 movq %r15,8+64+56(%rsp) 1795 1796 .byte 0xf3,0xc3 1797.size __rsaz_512_mulx,.-__rsaz_512_mulx 1798.globl rsaz_512_scatter4 1799.type rsaz_512_scatter4,@function 1800.align 16 1801rsaz_512_scatter4: 1802 leaq (%rdi,%rdx,8),%rdi 1803 movl $8,%r9d 1804 jmp .Loop_scatter 1805.align 16 1806.Loop_scatter: 1807 movq (%rsi),%rax 1808 leaq 8(%rsi),%rsi 1809 movq %rax,(%rdi) 1810 leaq 128(%rdi),%rdi 1811 decl %r9d 1812 jnz .Loop_scatter 1813 .byte 0xf3,0xc3 1814.size rsaz_512_scatter4,.-rsaz_512_scatter4 1815 1816.globl rsaz_512_gather4 1817.type rsaz_512_gather4,@function 1818.align 16 1819rsaz_512_gather4: 1820 movd %edx,%xmm8 1821 movdqa .Linc+16(%rip),%xmm1 1822 movdqa .Linc(%rip),%xmm0 1823 1824 pshufd $0,%xmm8,%xmm8 1825 movdqa %xmm1,%xmm7 1826 movdqa %xmm1,%xmm2 1827 paddd %xmm0,%xmm1 1828 pcmpeqd %xmm8,%xmm0 1829 movdqa %xmm7,%xmm3 1830 paddd %xmm1,%xmm2 1831 pcmpeqd %xmm8,%xmm1 1832 movdqa %xmm7,%xmm4 1833 paddd %xmm2,%xmm3 1834 pcmpeqd %xmm8,%xmm2 1835 movdqa %xmm7,%xmm5 1836 paddd %xmm3,%xmm4 1837 pcmpeqd %xmm8,%xmm3 1838 movdqa %xmm7,%xmm6 1839 paddd %xmm4,%xmm5 1840 pcmpeqd %xmm8,%xmm4 1841 paddd %xmm5,%xmm6 1842 pcmpeqd %xmm8,%xmm5 1843 paddd %xmm6,%xmm7 1844 pcmpeqd %xmm8,%xmm6 1845 pcmpeqd %xmm8,%xmm7 1846 movl $8,%r9d 1847 jmp .Loop_gather 1848.align 16 1849.Loop_gather: 1850 movdqa 0(%rsi),%xmm8 1851 movdqa 16(%rsi),%xmm9 1852 movdqa 32(%rsi),%xmm10 1853 movdqa 48(%rsi),%xmm11 1854 pand %xmm0,%xmm8 1855 movdqa 64(%rsi),%xmm12 1856 pand %xmm1,%xmm9 1857 movdqa 80(%rsi),%xmm13 1858 pand %xmm2,%xmm10 1859 movdqa 96(%rsi),%xmm14 1860 pand %xmm3,%xmm11 1861 movdqa 112(%rsi),%xmm15 1862 leaq 128(%rsi),%rsi 1863 pand %xmm4,%xmm12 1864 pand %xmm5,%xmm13 1865 pand %xmm6,%xmm14 1866 pand %xmm7,%xmm15 1867 por %xmm10,%xmm8 1868 por %xmm11,%xmm9 1869 por %xmm12,%xmm8 1870 por %xmm13,%xmm9 1871 por %xmm14,%xmm8 1872 por %xmm15,%xmm9 1873 1874 por %xmm9,%xmm8 1875 pshufd $0x4e,%xmm8,%xmm9 1876 por %xmm9,%xmm8 1877 movq 
/* NOTE(review): interleaved decimal tokens are pasted line numbers; code below
 * is byte-identical, comments only added.
 *
 * Tail of rsaz_512_gather4: the selected qword (low half of %xmm8) is stored
 * to (%rdi), the output pointer advances by 8, and the loop repeats for the
 * 8 limbs counted in %r9d; ".byte 0xf3,0xc3" is "rep ret".  The
 * .LSEH_end_rsaz_512_gather4 label is presumably a leftover Win64 SEH marker
 * from the shared perlasm generator — harmless on ELF.
 *
 * .Linc holds the pcmpeqd counter seeds used at the top of rsaz_512_gather4:
 * {0,0,1,1} as the initial index pair and {2,2,2,2} as the per-step increment
 * (the repeated 2s are intentional, not a typo — each paddd advances both
 * packed indices by 2). */
%xmm8,(%rdi) 1878 leaq 8(%rdi),%rdi 1879 decl %r9d 1880 jnz .Loop_gather 1881 .byte 0xf3,0xc3 1882.LSEH_end_rsaz_512_gather4: 1883.size rsaz_512_gather4,.-rsaz_512_gather4 1884 1885.align 64 1886.Linc: 1887.long 0,0, 1,1 1888.long 2,2, 2,2 1889