/* rsaz-x86_64.S revision 1.2 */
1#include <machine/asm.h> 2.text 3 4 5 6.globl rsaz_512_sqr 7.type rsaz_512_sqr,@function 8.align 32 9rsaz_512_sqr: 10 pushq %rbx 11 pushq %rbp 12 pushq %r12 13 pushq %r13 14 pushq %r14 15 pushq %r15 16 17 subq $128+24,%rsp 18.Lsqr_body: 19 movq %rdx,%rbp 20 movq (%rsi),%rdx 21 movq 8(%rsi),%rax 22 movq %rcx,128(%rsp) 23 movl $0x80100,%r11d 24 andl OPENSSL_ia32cap_P+8(%rip),%r11d 25 cmpl $0x80100,%r11d 26 je .Loop_sqrx 27 jmp .Loop_sqr 28 29.align 32 30.Loop_sqr: 31 movl %r8d,128+8(%rsp) 32 33 movq %rdx,%rbx 34 mulq %rdx 35 movq %rax,%r8 36 movq 16(%rsi),%rax 37 movq %rdx,%r9 38 39 mulq %rbx 40 addq %rax,%r9 41 movq 24(%rsi),%rax 42 movq %rdx,%r10 43 adcq $0,%r10 44 45 mulq %rbx 46 addq %rax,%r10 47 movq 32(%rsi),%rax 48 movq %rdx,%r11 49 adcq $0,%r11 50 51 mulq %rbx 52 addq %rax,%r11 53 movq 40(%rsi),%rax 54 movq %rdx,%r12 55 adcq $0,%r12 56 57 mulq %rbx 58 addq %rax,%r12 59 movq 48(%rsi),%rax 60 movq %rdx,%r13 61 adcq $0,%r13 62 63 mulq %rbx 64 addq %rax,%r13 65 movq 56(%rsi),%rax 66 movq %rdx,%r14 67 adcq $0,%r14 68 69 mulq %rbx 70 addq %rax,%r14 71 movq %rbx,%rax 72 movq %rdx,%r15 73 adcq $0,%r15 74 75 addq %r8,%r8 76 movq %r9,%rcx 77 adcq %r9,%r9 78 79 mulq %rax 80 movq %rax,(%rsp) 81 addq %rdx,%r8 82 adcq $0,%r9 83 84 movq %r8,8(%rsp) 85 shrq $63,%rcx 86 87 88 movq 8(%rsi),%r8 89 movq 16(%rsi),%rax 90 mulq %r8 91 addq %rax,%r10 92 movq 24(%rsi),%rax 93 movq %rdx,%rbx 94 adcq $0,%rbx 95 96 mulq %r8 97 addq %rax,%r11 98 movq 32(%rsi),%rax 99 adcq $0,%rdx 100 addq %rbx,%r11 101 movq %rdx,%rbx 102 adcq $0,%rbx 103 104 mulq %r8 105 addq %rax,%r12 106 movq 40(%rsi),%rax 107 adcq $0,%rdx 108 addq %rbx,%r12 109 movq %rdx,%rbx 110 adcq $0,%rbx 111 112 mulq %r8 113 addq %rax,%r13 114 movq 48(%rsi),%rax 115 adcq $0,%rdx 116 addq %rbx,%r13 117 movq %rdx,%rbx 118 adcq $0,%rbx 119 120 mulq %r8 121 addq %rax,%r14 122 movq 56(%rsi),%rax 123 adcq $0,%rdx 124 addq %rbx,%r14 125 movq %rdx,%rbx 126 adcq $0,%rbx 127 128 mulq %r8 129 addq %rax,%r15 130 movq %r8,%rax 131 adcq 
$0,%rdx 132 addq %rbx,%r15 133 movq %rdx,%r8 134 movq %r10,%rdx 135 adcq $0,%r8 136 137 addq %rdx,%rdx 138 leaq (%rcx,%r10,2),%r10 139 movq %r11,%rbx 140 adcq %r11,%r11 141 142 mulq %rax 143 addq %rax,%r9 144 adcq %rdx,%r10 145 adcq $0,%r11 146 147 movq %r9,16(%rsp) 148 movq %r10,24(%rsp) 149 shrq $63,%rbx 150 151 152 movq 16(%rsi),%r9 153 movq 24(%rsi),%rax 154 mulq %r9 155 addq %rax,%r12 156 movq 32(%rsi),%rax 157 movq %rdx,%rcx 158 adcq $0,%rcx 159 160 mulq %r9 161 addq %rax,%r13 162 movq 40(%rsi),%rax 163 adcq $0,%rdx 164 addq %rcx,%r13 165 movq %rdx,%rcx 166 adcq $0,%rcx 167 168 mulq %r9 169 addq %rax,%r14 170 movq 48(%rsi),%rax 171 adcq $0,%rdx 172 addq %rcx,%r14 173 movq %rdx,%rcx 174 adcq $0,%rcx 175 176 mulq %r9 177 movq %r12,%r10 178 leaq (%rbx,%r12,2),%r12 179 addq %rax,%r15 180 movq 56(%rsi),%rax 181 adcq $0,%rdx 182 addq %rcx,%r15 183 movq %rdx,%rcx 184 adcq $0,%rcx 185 186 mulq %r9 187 shrq $63,%r10 188 addq %rax,%r8 189 movq %r9,%rax 190 adcq $0,%rdx 191 addq %rcx,%r8 192 movq %rdx,%r9 193 adcq $0,%r9 194 195 movq %r13,%rcx 196 leaq (%r10,%r13,2),%r13 197 198 mulq %rax 199 addq %rax,%r11 200 adcq %rdx,%r12 201 adcq $0,%r13 202 203 movq %r11,32(%rsp) 204 movq %r12,40(%rsp) 205 shrq $63,%rcx 206 207 208 movq 24(%rsi),%r10 209 movq 32(%rsi),%rax 210 mulq %r10 211 addq %rax,%r14 212 movq 40(%rsi),%rax 213 movq %rdx,%rbx 214 adcq $0,%rbx 215 216 mulq %r10 217 addq %rax,%r15 218 movq 48(%rsi),%rax 219 adcq $0,%rdx 220 addq %rbx,%r15 221 movq %rdx,%rbx 222 adcq $0,%rbx 223 224 mulq %r10 225 movq %r14,%r12 226 leaq (%rcx,%r14,2),%r14 227 addq %rax,%r8 228 movq 56(%rsi),%rax 229 adcq $0,%rdx 230 addq %rbx,%r8 231 movq %rdx,%rbx 232 adcq $0,%rbx 233 234 mulq %r10 235 shrq $63,%r12 236 addq %rax,%r9 237 movq %r10,%rax 238 adcq $0,%rdx 239 addq %rbx,%r9 240 movq %rdx,%r10 241 adcq $0,%r10 242 243 movq %r15,%rbx 244 leaq (%r12,%r15,2),%r15 245 246 mulq %rax 247 addq %rax,%r13 248 adcq %rdx,%r14 249 adcq $0,%r15 250 251 movq %r13,48(%rsp) 252 movq %r14,56(%rsp) 
253 shrq $63,%rbx 254 255 256 movq 32(%rsi),%r11 257 movq 40(%rsi),%rax 258 mulq %r11 259 addq %rax,%r8 260 movq 48(%rsi),%rax 261 movq %rdx,%rcx 262 adcq $0,%rcx 263 264 mulq %r11 265 addq %rax,%r9 266 movq 56(%rsi),%rax 267 adcq $0,%rdx 268 movq %r8,%r12 269 leaq (%rbx,%r8,2),%r8 270 addq %rcx,%r9 271 movq %rdx,%rcx 272 adcq $0,%rcx 273 274 mulq %r11 275 shrq $63,%r12 276 addq %rax,%r10 277 movq %r11,%rax 278 adcq $0,%rdx 279 addq %rcx,%r10 280 movq %rdx,%r11 281 adcq $0,%r11 282 283 movq %r9,%rcx 284 leaq (%r12,%r9,2),%r9 285 286 mulq %rax 287 addq %rax,%r15 288 adcq %rdx,%r8 289 adcq $0,%r9 290 291 movq %r15,64(%rsp) 292 movq %r8,72(%rsp) 293 shrq $63,%rcx 294 295 296 movq 40(%rsi),%r12 297 movq 48(%rsi),%rax 298 mulq %r12 299 addq %rax,%r10 300 movq 56(%rsi),%rax 301 movq %rdx,%rbx 302 adcq $0,%rbx 303 304 mulq %r12 305 addq %rax,%r11 306 movq %r12,%rax 307 movq %r10,%r15 308 leaq (%rcx,%r10,2),%r10 309 adcq $0,%rdx 310 shrq $63,%r15 311 addq %rbx,%r11 312 movq %rdx,%r12 313 adcq $0,%r12 314 315 movq %r11,%rbx 316 leaq (%r15,%r11,2),%r11 317 318 mulq %rax 319 addq %rax,%r9 320 adcq %rdx,%r10 321 adcq $0,%r11 322 323 movq %r9,80(%rsp) 324 movq %r10,88(%rsp) 325 326 327 movq 48(%rsi),%r13 328 movq 56(%rsi),%rax 329 mulq %r13 330 addq %rax,%r12 331 movq %r13,%rax 332 movq %rdx,%r13 333 adcq $0,%r13 334 335 xorq %r14,%r14 336 shlq $1,%rbx 337 adcq %r12,%r12 338 adcq %r13,%r13 339 adcq %r14,%r14 340 341 mulq %rax 342 addq %rax,%r11 343 adcq %rdx,%r12 344 adcq $0,%r13 345 346 movq %r11,96(%rsp) 347 movq %r12,104(%rsp) 348 349 350 movq 56(%rsi),%rax 351 mulq %rax 352 addq %rax,%r13 353 adcq $0,%rdx 354 355 addq %rdx,%r14 356 357 movq %r13,112(%rsp) 358 movq %r14,120(%rsp) 359 360 movq (%rsp),%r8 361 movq 8(%rsp),%r9 362 movq 16(%rsp),%r10 363 movq 24(%rsp),%r11 364 movq 32(%rsp),%r12 365 movq 40(%rsp),%r13 366 movq 48(%rsp),%r14 367 movq 56(%rsp),%r15 368 369 call __rsaz_512_reduce 370 371 addq 64(%rsp),%r8 372 adcq 72(%rsp),%r9 373 adcq 80(%rsp),%r10 374 adcq 
88(%rsp),%r11 375 adcq 96(%rsp),%r12 376 adcq 104(%rsp),%r13 377 adcq 112(%rsp),%r14 378 adcq 120(%rsp),%r15 379 sbbq %rcx,%rcx 380 381 call __rsaz_512_subtract 382 383 movq %r8,%rdx 384 movq %r9,%rax 385 movl 128+8(%rsp),%r8d 386 movq %rdi,%rsi 387 388 decl %r8d 389 jnz .Loop_sqr 390 jmp .Lsqr_tail 391 392.align 32 393.Loop_sqrx: 394 movl %r8d,128+8(%rsp) 395.byte 102,72,15,110,199 396.byte 102,72,15,110,205 397 398 mulxq %rax,%r8,%r9 399 400 mulxq 16(%rsi),%rcx,%r10 401 xorq %rbp,%rbp 402 403 mulxq 24(%rsi),%rax,%r11 404 adcxq %rcx,%r9 405 406 mulxq 32(%rsi),%rcx,%r12 407 adcxq %rax,%r10 408 409 mulxq 40(%rsi),%rax,%r13 410 adcxq %rcx,%r11 411 412.byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 413 adcxq %rax,%r12 414 adcxq %rcx,%r13 415 416.byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 417 adcxq %rax,%r14 418 adcxq %rbp,%r15 419 420 movq %r9,%rcx 421 shldq $1,%r8,%r9 422 shlq $1,%r8 423 424 xorl %ebp,%ebp 425 mulxq %rdx,%rax,%rdx 426 adcxq %rdx,%r8 427 movq 8(%rsi),%rdx 428 adcxq %rbp,%r9 429 430 movq %rax,(%rsp) 431 movq %r8,8(%rsp) 432 433 434 mulxq 16(%rsi),%rax,%rbx 435 adoxq %rax,%r10 436 adcxq %rbx,%r11 437 438.byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 439 adoxq %rdi,%r11 440 adcxq %r8,%r12 441 442 mulxq 32(%rsi),%rax,%rbx 443 adoxq %rax,%r12 444 adcxq %rbx,%r13 445 446 mulxq 40(%rsi),%rdi,%r8 447 adoxq %rdi,%r13 448 adcxq %r8,%r14 449 450.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 451 adoxq %rax,%r14 452 adcxq %rbx,%r15 453 454.byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 455 adoxq %rdi,%r15 456 adcxq %rbp,%r8 457 adoxq %rbp,%r8 458 459 movq %r11,%rbx 460 shldq $1,%r10,%r11 461 shldq $1,%rcx,%r10 462 463 xorl %ebp,%ebp 464 mulxq %rdx,%rax,%rcx 465 movq 16(%rsi),%rdx 466 adcxq %rax,%r9 467 adcxq %rcx,%r10 468 adcxq %rbp,%r11 469 470 movq %r9,16(%rsp) 471.byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 472 473 474.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 475 adoxq %rdi,%r12 476 adcxq %r9,%r13 477 478 mulxq 32(%rsi),%rax,%rcx 
479 adoxq %rax,%r13 480 adcxq %rcx,%r14 481 482 mulxq 40(%rsi),%rdi,%r9 483 adoxq %rdi,%r14 484 adcxq %r9,%r15 485 486.byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 487 adoxq %rax,%r15 488 adcxq %rcx,%r8 489 490.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 491 adoxq %rdi,%r8 492 adcxq %rbp,%r9 493 adoxq %rbp,%r9 494 495 movq %r13,%rcx 496 shldq $1,%r12,%r13 497 shldq $1,%rbx,%r12 498 499 xorl %ebp,%ebp 500 mulxq %rdx,%rax,%rdx 501 adcxq %rax,%r11 502 adcxq %rdx,%r12 503 movq 24(%rsi),%rdx 504 adcxq %rbp,%r13 505 506 movq %r11,32(%rsp) 507.byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 508 509 510.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 511 adoxq %rax,%r14 512 adcxq %rbx,%r15 513 514 mulxq 40(%rsi),%rdi,%r10 515 adoxq %rdi,%r15 516 adcxq %r10,%r8 517 518 mulxq 48(%rsi),%rax,%rbx 519 adoxq %rax,%r8 520 adcxq %rbx,%r9 521 522 mulxq 56(%rsi),%rdi,%r10 523 adoxq %rdi,%r9 524 adcxq %rbp,%r10 525 adoxq %rbp,%r10 526 527.byte 0x66 528 movq %r15,%rbx 529 shldq $1,%r14,%r15 530 shldq $1,%rcx,%r14 531 532 xorl %ebp,%ebp 533 mulxq %rdx,%rax,%rdx 534 adcxq %rax,%r13 535 adcxq %rdx,%r14 536 movq 32(%rsi),%rdx 537 adcxq %rbp,%r15 538 539 movq %r13,48(%rsp) 540 movq %r14,56(%rsp) 541 542 543.byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 544 adoxq %rdi,%r8 545 adcxq %r11,%r9 546 547 mulxq 48(%rsi),%rax,%rcx 548 adoxq %rax,%r9 549 adcxq %rcx,%r10 550 551 mulxq 56(%rsi),%rdi,%r11 552 adoxq %rdi,%r10 553 adcxq %rbp,%r11 554 adoxq %rbp,%r11 555 556 movq %r9,%rcx 557 shldq $1,%r8,%r9 558 shldq $1,%rbx,%r8 559 560 xorl %ebp,%ebp 561 mulxq %rdx,%rax,%rdx 562 adcxq %rax,%r15 563 adcxq %rdx,%r8 564 movq 40(%rsi),%rdx 565 adcxq %rbp,%r9 566 567 movq %r15,64(%rsp) 568 movq %r8,72(%rsp) 569 570 571.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 572 adoxq %rax,%r10 573 adcxq %rbx,%r11 574 575.byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 576 adoxq %rdi,%r11 577 adcxq %rbp,%r12 578 adoxq %rbp,%r12 579 580 movq %r11,%rbx 581 shldq $1,%r10,%r11 582 shldq $1,%rcx,%r10 
583 584 xorl %ebp,%ebp 585 mulxq %rdx,%rax,%rdx 586 adcxq %rax,%r9 587 adcxq %rdx,%r10 588 movq 48(%rsi),%rdx 589 adcxq %rbp,%r11 590 591 movq %r9,80(%rsp) 592 movq %r10,88(%rsp) 593 594 595.byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 596 adoxq %rax,%r12 597 adoxq %rbp,%r13 598 599 xorq %r14,%r14 600 shldq $1,%r13,%r14 601 shldq $1,%r12,%r13 602 shldq $1,%rbx,%r12 603 604 xorl %ebp,%ebp 605 mulxq %rdx,%rax,%rdx 606 adcxq %rax,%r11 607 adcxq %rdx,%r12 608 movq 56(%rsi),%rdx 609 adcxq %rbp,%r13 610 611.byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 612.byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 613 614 615 mulxq %rdx,%rax,%rdx 616 adoxq %rax,%r13 617 adoxq %rbp,%rdx 618 619.byte 0x66 620 addq %rdx,%r14 621 622 movq %r13,112(%rsp) 623 movq %r14,120(%rsp) 624.byte 102,72,15,126,199 625.byte 102,72,15,126,205 626 627 movq 128(%rsp),%rdx 628 movq (%rsp),%r8 629 movq 8(%rsp),%r9 630 movq 16(%rsp),%r10 631 movq 24(%rsp),%r11 632 movq 32(%rsp),%r12 633 movq 40(%rsp),%r13 634 movq 48(%rsp),%r14 635 movq 56(%rsp),%r15 636 637 call __rsaz_512_reducex 638 639 addq 64(%rsp),%r8 640 adcq 72(%rsp),%r9 641 adcq 80(%rsp),%r10 642 adcq 88(%rsp),%r11 643 adcq 96(%rsp),%r12 644 adcq 104(%rsp),%r13 645 adcq 112(%rsp),%r14 646 adcq 120(%rsp),%r15 647 sbbq %rcx,%rcx 648 649 call __rsaz_512_subtract 650 651 movq %r8,%rdx 652 movq %r9,%rax 653 movl 128+8(%rsp),%r8d 654 movq %rdi,%rsi 655 656 decl %r8d 657 jnz .Loop_sqrx 658 659.Lsqr_tail: 660 661 leaq 128+24+48(%rsp),%rax 662 movq -48(%rax),%r15 663 movq -40(%rax),%r14 664 movq -32(%rax),%r13 665 movq -24(%rax),%r12 666 movq -16(%rax),%rbp 667 movq -8(%rax),%rbx 668 leaq (%rax),%rsp 669.Lsqr_epilogue: 670 .byte 0xf3,0xc3 671.size rsaz_512_sqr,.-rsaz_512_sqr 672.globl rsaz_512_mul 673.type rsaz_512_mul,@function 674.align 32 675rsaz_512_mul: 676 pushq %rbx 677 pushq %rbp 678 pushq %r12 679 pushq %r13 680 pushq %r14 681 pushq %r15 682 683 subq $128+24,%rsp 684.Lmul_body: 685.byte 102,72,15,110,199 686.byte 102,72,15,110,201 687 movq 
%r8,128(%rsp) 688 movl $0x80100,%r11d 689 andl OPENSSL_ia32cap_P+8(%rip),%r11d 690 cmpl $0x80100,%r11d 691 je .Lmulx 692 movq (%rdx),%rbx 693 movq %rdx,%rbp 694 call __rsaz_512_mul 695 696.byte 102,72,15,126,199 697.byte 102,72,15,126,205 698 699 movq (%rsp),%r8 700 movq 8(%rsp),%r9 701 movq 16(%rsp),%r10 702 movq 24(%rsp),%r11 703 movq 32(%rsp),%r12 704 movq 40(%rsp),%r13 705 movq 48(%rsp),%r14 706 movq 56(%rsp),%r15 707 708 call __rsaz_512_reduce 709 jmp .Lmul_tail 710 711.align 32 712.Lmulx: 713 movq %rdx,%rbp 714 movq (%rdx),%rdx 715 call __rsaz_512_mulx 716 717.byte 102,72,15,126,199 718.byte 102,72,15,126,205 719 720 movq 128(%rsp),%rdx 721 movq (%rsp),%r8 722 movq 8(%rsp),%r9 723 movq 16(%rsp),%r10 724 movq 24(%rsp),%r11 725 movq 32(%rsp),%r12 726 movq 40(%rsp),%r13 727 movq 48(%rsp),%r14 728 movq 56(%rsp),%r15 729 730 call __rsaz_512_reducex 731.Lmul_tail: 732 addq 64(%rsp),%r8 733 adcq 72(%rsp),%r9 734 adcq 80(%rsp),%r10 735 adcq 88(%rsp),%r11 736 adcq 96(%rsp),%r12 737 adcq 104(%rsp),%r13 738 adcq 112(%rsp),%r14 739 adcq 120(%rsp),%r15 740 sbbq %rcx,%rcx 741 742 call __rsaz_512_subtract 743 744 leaq 128+24+48(%rsp),%rax 745 movq -48(%rax),%r15 746 movq -40(%rax),%r14 747 movq -32(%rax),%r13 748 movq -24(%rax),%r12 749 movq -16(%rax),%rbp 750 movq -8(%rax),%rbx 751 leaq (%rax),%rsp 752.Lmul_epilogue: 753 .byte 0xf3,0xc3 754.size rsaz_512_mul,.-rsaz_512_mul 755.globl rsaz_512_mul_gather4 756.type rsaz_512_mul_gather4,@function 757.align 32 758rsaz_512_mul_gather4: 759 pushq %rbx 760 pushq %rbp 761 pushq %r12 762 pushq %r13 763 pushq %r14 764 pushq %r15 765 766 subq $152,%rsp 767.Lmul_gather4_body: 768 movd %r9d,%xmm8 769 movdqa .Linc+16(%rip),%xmm1 770 movdqa .Linc(%rip),%xmm0 771 772 pshufd $0,%xmm8,%xmm8 773 movdqa %xmm1,%xmm7 774 movdqa %xmm1,%xmm2 775 paddd %xmm0,%xmm1 776 pcmpeqd %xmm8,%xmm0 777 movdqa %xmm7,%xmm3 778 paddd %xmm1,%xmm2 779 pcmpeqd %xmm8,%xmm1 780 movdqa %xmm7,%xmm4 781 paddd %xmm2,%xmm3 782 pcmpeqd %xmm8,%xmm2 783 movdqa %xmm7,%xmm5 
784 paddd %xmm3,%xmm4 785 pcmpeqd %xmm8,%xmm3 786 movdqa %xmm7,%xmm6 787 paddd %xmm4,%xmm5 788 pcmpeqd %xmm8,%xmm4 789 paddd %xmm5,%xmm6 790 pcmpeqd %xmm8,%xmm5 791 paddd %xmm6,%xmm7 792 pcmpeqd %xmm8,%xmm6 793 pcmpeqd %xmm8,%xmm7 794 795 movdqa 0(%rdx),%xmm8 796 movdqa 16(%rdx),%xmm9 797 movdqa 32(%rdx),%xmm10 798 movdqa 48(%rdx),%xmm11 799 pand %xmm0,%xmm8 800 movdqa 64(%rdx),%xmm12 801 pand %xmm1,%xmm9 802 movdqa 80(%rdx),%xmm13 803 pand %xmm2,%xmm10 804 movdqa 96(%rdx),%xmm14 805 pand %xmm3,%xmm11 806 movdqa 112(%rdx),%xmm15 807 leaq 128(%rdx),%rbp 808 pand %xmm4,%xmm12 809 pand %xmm5,%xmm13 810 pand %xmm6,%xmm14 811 pand %xmm7,%xmm15 812 por %xmm10,%xmm8 813 por %xmm11,%xmm9 814 por %xmm12,%xmm8 815 por %xmm13,%xmm9 816 por %xmm14,%xmm8 817 por %xmm15,%xmm9 818 819 por %xmm9,%xmm8 820 pshufd $0x4e,%xmm8,%xmm9 821 por %xmm9,%xmm8 822 movl $0x80100,%r11d 823 andl OPENSSL_ia32cap_P+8(%rip),%r11d 824 cmpl $0x80100,%r11d 825 je .Lmulx_gather 826.byte 102,76,15,126,195 827 828 movq %r8,128(%rsp) 829 movq %rdi,128+8(%rsp) 830 movq %rcx,128+16(%rsp) 831 832 movq (%rsi),%rax 833 movq 8(%rsi),%rcx 834 mulq %rbx 835 movq %rax,(%rsp) 836 movq %rcx,%rax 837 movq %rdx,%r8 838 839 mulq %rbx 840 addq %rax,%r8 841 movq 16(%rsi),%rax 842 movq %rdx,%r9 843 adcq $0,%r9 844 845 mulq %rbx 846 addq %rax,%r9 847 movq 24(%rsi),%rax 848 movq %rdx,%r10 849 adcq $0,%r10 850 851 mulq %rbx 852 addq %rax,%r10 853 movq 32(%rsi),%rax 854 movq %rdx,%r11 855 adcq $0,%r11 856 857 mulq %rbx 858 addq %rax,%r11 859 movq 40(%rsi),%rax 860 movq %rdx,%r12 861 adcq $0,%r12 862 863 mulq %rbx 864 addq %rax,%r12 865 movq 48(%rsi),%rax 866 movq %rdx,%r13 867 adcq $0,%r13 868 869 mulq %rbx 870 addq %rax,%r13 871 movq 56(%rsi),%rax 872 movq %rdx,%r14 873 adcq $0,%r14 874 875 mulq %rbx 876 addq %rax,%r14 877 movq (%rsi),%rax 878 movq %rdx,%r15 879 adcq $0,%r15 880 881 leaq 8(%rsp),%rdi 882 movl $7,%ecx 883 jmp .Loop_mul_gather 884 885.align 32 886.Loop_mul_gather: 887 movdqa 0(%rbp),%xmm8 888 movdqa 
16(%rbp),%xmm9 889 movdqa 32(%rbp),%xmm10 890 movdqa 48(%rbp),%xmm11 891 pand %xmm0,%xmm8 892 movdqa 64(%rbp),%xmm12 893 pand %xmm1,%xmm9 894 movdqa 80(%rbp),%xmm13 895 pand %xmm2,%xmm10 896 movdqa 96(%rbp),%xmm14 897 pand %xmm3,%xmm11 898 movdqa 112(%rbp),%xmm15 899 leaq 128(%rbp),%rbp 900 pand %xmm4,%xmm12 901 pand %xmm5,%xmm13 902 pand %xmm6,%xmm14 903 pand %xmm7,%xmm15 904 por %xmm10,%xmm8 905 por %xmm11,%xmm9 906 por %xmm12,%xmm8 907 por %xmm13,%xmm9 908 por %xmm14,%xmm8 909 por %xmm15,%xmm9 910 911 por %xmm9,%xmm8 912 pshufd $0x4e,%xmm8,%xmm9 913 por %xmm9,%xmm8 914.byte 102,76,15,126,195 915 916 mulq %rbx 917 addq %rax,%r8 918 movq 8(%rsi),%rax 919 movq %r8,(%rdi) 920 movq %rdx,%r8 921 adcq $0,%r8 922 923 mulq %rbx 924 addq %rax,%r9 925 movq 16(%rsi),%rax 926 adcq $0,%rdx 927 addq %r9,%r8 928 movq %rdx,%r9 929 adcq $0,%r9 930 931 mulq %rbx 932 addq %rax,%r10 933 movq 24(%rsi),%rax 934 adcq $0,%rdx 935 addq %r10,%r9 936 movq %rdx,%r10 937 adcq $0,%r10 938 939 mulq %rbx 940 addq %rax,%r11 941 movq 32(%rsi),%rax 942 adcq $0,%rdx 943 addq %r11,%r10 944 movq %rdx,%r11 945 adcq $0,%r11 946 947 mulq %rbx 948 addq %rax,%r12 949 movq 40(%rsi),%rax 950 adcq $0,%rdx 951 addq %r12,%r11 952 movq %rdx,%r12 953 adcq $0,%r12 954 955 mulq %rbx 956 addq %rax,%r13 957 movq 48(%rsi),%rax 958 adcq $0,%rdx 959 addq %r13,%r12 960 movq %rdx,%r13 961 adcq $0,%r13 962 963 mulq %rbx 964 addq %rax,%r14 965 movq 56(%rsi),%rax 966 adcq $0,%rdx 967 addq %r14,%r13 968 movq %rdx,%r14 969 adcq $0,%r14 970 971 mulq %rbx 972 addq %rax,%r15 973 movq (%rsi),%rax 974 adcq $0,%rdx 975 addq %r15,%r14 976 movq %rdx,%r15 977 adcq $0,%r15 978 979 leaq 8(%rdi),%rdi 980 981 decl %ecx 982 jnz .Loop_mul_gather 983 984 movq %r8,(%rdi) 985 movq %r9,8(%rdi) 986 movq %r10,16(%rdi) 987 movq %r11,24(%rdi) 988 movq %r12,32(%rdi) 989 movq %r13,40(%rdi) 990 movq %r14,48(%rdi) 991 movq %r15,56(%rdi) 992 993 movq 128+8(%rsp),%rdi 994 movq 128+16(%rsp),%rbp 995 996 movq (%rsp),%r8 997 movq 8(%rsp),%r9 998 movq 
16(%rsp),%r10 999 movq 24(%rsp),%r11 1000 movq 32(%rsp),%r12 1001 movq 40(%rsp),%r13 1002 movq 48(%rsp),%r14 1003 movq 56(%rsp),%r15 1004 1005 call __rsaz_512_reduce 1006 jmp .Lmul_gather_tail 1007 1008.align 32 1009.Lmulx_gather: 1010.byte 102,76,15,126,194 1011 1012 movq %r8,128(%rsp) 1013 movq %rdi,128+8(%rsp) 1014 movq %rcx,128+16(%rsp) 1015 1016 mulxq (%rsi),%rbx,%r8 1017 movq %rbx,(%rsp) 1018 xorl %edi,%edi 1019 1020 mulxq 8(%rsi),%rax,%r9 1021 1022 mulxq 16(%rsi),%rbx,%r10 1023 adcxq %rax,%r8 1024 1025 mulxq 24(%rsi),%rax,%r11 1026 adcxq %rbx,%r9 1027 1028 mulxq 32(%rsi),%rbx,%r12 1029 adcxq %rax,%r10 1030 1031 mulxq 40(%rsi),%rax,%r13 1032 adcxq %rbx,%r11 1033 1034 mulxq 48(%rsi),%rbx,%r14 1035 adcxq %rax,%r12 1036 1037 mulxq 56(%rsi),%rax,%r15 1038 adcxq %rbx,%r13 1039 adcxq %rax,%r14 1040.byte 0x67 1041 movq %r8,%rbx 1042 adcxq %rdi,%r15 1043 1044 movq $-7,%rcx 1045 jmp .Loop_mulx_gather 1046 1047.align 32 1048.Loop_mulx_gather: 1049 movdqa 0(%rbp),%xmm8 1050 movdqa 16(%rbp),%xmm9 1051 movdqa 32(%rbp),%xmm10 1052 movdqa 48(%rbp),%xmm11 1053 pand %xmm0,%xmm8 1054 movdqa 64(%rbp),%xmm12 1055 pand %xmm1,%xmm9 1056 movdqa 80(%rbp),%xmm13 1057 pand %xmm2,%xmm10 1058 movdqa 96(%rbp),%xmm14 1059 pand %xmm3,%xmm11 1060 movdqa 112(%rbp),%xmm15 1061 leaq 128(%rbp),%rbp 1062 pand %xmm4,%xmm12 1063 pand %xmm5,%xmm13 1064 pand %xmm6,%xmm14 1065 pand %xmm7,%xmm15 1066 por %xmm10,%xmm8 1067 por %xmm11,%xmm9 1068 por %xmm12,%xmm8 1069 por %xmm13,%xmm9 1070 por %xmm14,%xmm8 1071 por %xmm15,%xmm9 1072 1073 por %xmm9,%xmm8 1074 pshufd $0x4e,%xmm8,%xmm9 1075 por %xmm9,%xmm8 1076.byte 102,76,15,126,194 1077 1078.byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00 1079 adcxq %rax,%rbx 1080 adoxq %r9,%r8 1081 1082 mulxq 8(%rsi),%rax,%r9 1083 adcxq %rax,%r8 1084 adoxq %r10,%r9 1085 1086 mulxq 16(%rsi),%rax,%r10 1087 adcxq %rax,%r9 1088 adoxq %r11,%r10 1089 1090.byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 1091 adcxq %rax,%r10 1092 adoxq %r12,%r11 1093 1094 mulxq 
32(%rsi),%rax,%r12 1095 adcxq %rax,%r11 1096 adoxq %r13,%r12 1097 1098 mulxq 40(%rsi),%rax,%r13 1099 adcxq %rax,%r12 1100 adoxq %r14,%r13 1101 1102.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 1103 adcxq %rax,%r13 1104.byte 0x67 1105 adoxq %r15,%r14 1106 1107 mulxq 56(%rsi),%rax,%r15 1108 movq %rbx,64(%rsp,%rcx,8) 1109 adcxq %rax,%r14 1110 adoxq %rdi,%r15 1111 movq %r8,%rbx 1112 adcxq %rdi,%r15 1113 1114 incq %rcx 1115 jnz .Loop_mulx_gather 1116 1117 movq %r8,64(%rsp) 1118 movq %r9,64+8(%rsp) 1119 movq %r10,64+16(%rsp) 1120 movq %r11,64+24(%rsp) 1121 movq %r12,64+32(%rsp) 1122 movq %r13,64+40(%rsp) 1123 movq %r14,64+48(%rsp) 1124 movq %r15,64+56(%rsp) 1125 1126 movq 128(%rsp),%rdx 1127 movq 128+8(%rsp),%rdi 1128 movq 128+16(%rsp),%rbp 1129 1130 movq (%rsp),%r8 1131 movq 8(%rsp),%r9 1132 movq 16(%rsp),%r10 1133 movq 24(%rsp),%r11 1134 movq 32(%rsp),%r12 1135 movq 40(%rsp),%r13 1136 movq 48(%rsp),%r14 1137 movq 56(%rsp),%r15 1138 1139 call __rsaz_512_reducex 1140 1141.Lmul_gather_tail: 1142 addq 64(%rsp),%r8 1143 adcq 72(%rsp),%r9 1144 adcq 80(%rsp),%r10 1145 adcq 88(%rsp),%r11 1146 adcq 96(%rsp),%r12 1147 adcq 104(%rsp),%r13 1148 adcq 112(%rsp),%r14 1149 adcq 120(%rsp),%r15 1150 sbbq %rcx,%rcx 1151 1152 call __rsaz_512_subtract 1153 1154 leaq 128+24+48(%rsp),%rax 1155 movq -48(%rax),%r15 1156 movq -40(%rax),%r14 1157 movq -32(%rax),%r13 1158 movq -24(%rax),%r12 1159 movq -16(%rax),%rbp 1160 movq -8(%rax),%rbx 1161 leaq (%rax),%rsp 1162.Lmul_gather4_epilogue: 1163 .byte 0xf3,0xc3 1164.size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4 1165.globl rsaz_512_mul_scatter4 1166.type rsaz_512_mul_scatter4,@function 1167.align 32 1168rsaz_512_mul_scatter4: 1169 pushq %rbx 1170 pushq %rbp 1171 pushq %r12 1172 pushq %r13 1173 pushq %r14 1174 pushq %r15 1175 1176 movl %r9d,%r9d 1177 subq $128+24,%rsp 1178.Lmul_scatter4_body: 1179 leaq (%r8,%r9,8),%r8 1180.byte 102,72,15,110,199 1181.byte 102,72,15,110,202 1182.byte 102,73,15,110,208 1183 movq %rcx,128(%rsp) 1184 1185 movq 
%rdi,%rbp 1186 movl $0x80100,%r11d 1187 andl OPENSSL_ia32cap_P+8(%rip),%r11d 1188 cmpl $0x80100,%r11d 1189 je .Lmulx_scatter 1190 movq (%rdi),%rbx 1191 call __rsaz_512_mul 1192 1193.byte 102,72,15,126,199 1194.byte 102,72,15,126,205 1195 1196 movq (%rsp),%r8 1197 movq 8(%rsp),%r9 1198 movq 16(%rsp),%r10 1199 movq 24(%rsp),%r11 1200 movq 32(%rsp),%r12 1201 movq 40(%rsp),%r13 1202 movq 48(%rsp),%r14 1203 movq 56(%rsp),%r15 1204 1205 call __rsaz_512_reduce 1206 jmp .Lmul_scatter_tail 1207 1208.align 32 1209.Lmulx_scatter: 1210 movq (%rdi),%rdx 1211 call __rsaz_512_mulx 1212 1213.byte 102,72,15,126,199 1214.byte 102,72,15,126,205 1215 1216 movq 128(%rsp),%rdx 1217 movq (%rsp),%r8 1218 movq 8(%rsp),%r9 1219 movq 16(%rsp),%r10 1220 movq 24(%rsp),%r11 1221 movq 32(%rsp),%r12 1222 movq 40(%rsp),%r13 1223 movq 48(%rsp),%r14 1224 movq 56(%rsp),%r15 1225 1226 call __rsaz_512_reducex 1227 1228.Lmul_scatter_tail: 1229 addq 64(%rsp),%r8 1230 adcq 72(%rsp),%r9 1231 adcq 80(%rsp),%r10 1232 adcq 88(%rsp),%r11 1233 adcq 96(%rsp),%r12 1234 adcq 104(%rsp),%r13 1235 adcq 112(%rsp),%r14 1236 adcq 120(%rsp),%r15 1237.byte 102,72,15,126,214 1238 sbbq %rcx,%rcx 1239 1240 call __rsaz_512_subtract 1241 1242 movq %r8,0(%rsi) 1243 movq %r9,128(%rsi) 1244 movq %r10,256(%rsi) 1245 movq %r11,384(%rsi) 1246 movq %r12,512(%rsi) 1247 movq %r13,640(%rsi) 1248 movq %r14,768(%rsi) 1249 movq %r15,896(%rsi) 1250 1251 leaq 128+24+48(%rsp),%rax 1252 movq -48(%rax),%r15 1253 movq -40(%rax),%r14 1254 movq -32(%rax),%r13 1255 movq -24(%rax),%r12 1256 movq -16(%rax),%rbp 1257 movq -8(%rax),%rbx 1258 leaq (%rax),%rsp 1259.Lmul_scatter4_epilogue: 1260 .byte 0xf3,0xc3 1261.size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4 1262.globl rsaz_512_mul_by_one 1263.type rsaz_512_mul_by_one,@function 1264.align 32 1265rsaz_512_mul_by_one: 1266 pushq %rbx 1267 pushq %rbp 1268 pushq %r12 1269 pushq %r13 1270 pushq %r14 1271 pushq %r15 1272 1273 subq $128+24,%rsp 1274.Lmul_by_one_body: 1275 movl 
OPENSSL_ia32cap_P+8(%rip),%eax 1276 movq %rdx,%rbp 1277 movq %rcx,128(%rsp) 1278 1279 movq (%rsi),%r8 1280 pxor %xmm0,%xmm0 1281 movq 8(%rsi),%r9 1282 movq 16(%rsi),%r10 1283 movq 24(%rsi),%r11 1284 movq 32(%rsi),%r12 1285 movq 40(%rsi),%r13 1286 movq 48(%rsi),%r14 1287 movq 56(%rsi),%r15 1288 1289 movdqa %xmm0,(%rsp) 1290 movdqa %xmm0,16(%rsp) 1291 movdqa %xmm0,32(%rsp) 1292 movdqa %xmm0,48(%rsp) 1293 movdqa %xmm0,64(%rsp) 1294 movdqa %xmm0,80(%rsp) 1295 movdqa %xmm0,96(%rsp) 1296 andl $0x80100,%eax 1297 cmpl $0x80100,%eax 1298 je .Lby_one_callx 1299 call __rsaz_512_reduce 1300 jmp .Lby_one_tail 1301.align 32 1302.Lby_one_callx: 1303 movq 128(%rsp),%rdx 1304 call __rsaz_512_reducex 1305.Lby_one_tail: 1306 movq %r8,(%rdi) 1307 movq %r9,8(%rdi) 1308 movq %r10,16(%rdi) 1309 movq %r11,24(%rdi) 1310 movq %r12,32(%rdi) 1311 movq %r13,40(%rdi) 1312 movq %r14,48(%rdi) 1313 movq %r15,56(%rdi) 1314 1315 leaq 128+24+48(%rsp),%rax 1316 movq -48(%rax),%r15 1317 movq -40(%rax),%r14 1318 movq -32(%rax),%r13 1319 movq -24(%rax),%r12 1320 movq -16(%rax),%rbp 1321 movq -8(%rax),%rbx 1322 leaq (%rax),%rsp 1323.Lmul_by_one_epilogue: 1324 .byte 0xf3,0xc3 1325.size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one 1326.type __rsaz_512_reduce,@function 1327.align 32 1328__rsaz_512_reduce: 1329 movq %r8,%rbx 1330 imulq 128+8(%rsp),%rbx 1331 movq 0(%rbp),%rax 1332 movl $8,%ecx 1333 jmp .Lreduction_loop 1334 1335.align 32 1336.Lreduction_loop: 1337 mulq %rbx 1338 movq 8(%rbp),%rax 1339 negq %r8 1340 movq %rdx,%r8 1341 adcq $0,%r8 1342 1343 mulq %rbx 1344 addq %rax,%r9 1345 movq 16(%rbp),%rax 1346 adcq $0,%rdx 1347 addq %r9,%r8 1348 movq %rdx,%r9 1349 adcq $0,%r9 1350 1351 mulq %rbx 1352 addq %rax,%r10 1353 movq 24(%rbp),%rax 1354 adcq $0,%rdx 1355 addq %r10,%r9 1356 movq %rdx,%r10 1357 adcq $0,%r10 1358 1359 mulq %rbx 1360 addq %rax,%r11 1361 movq 32(%rbp),%rax 1362 adcq $0,%rdx 1363 addq %r11,%r10 1364 movq 128+8(%rsp),%rsi 1365 1366 1367 adcq $0,%rdx 1368 movq %rdx,%r11 1369 1370 mulq %rbx 1371 
addq %rax,%r12 1372 movq 40(%rbp),%rax 1373 adcq $0,%rdx 1374 imulq %r8,%rsi 1375 addq %r12,%r11 1376 movq %rdx,%r12 1377 adcq $0,%r12 1378 1379 mulq %rbx 1380 addq %rax,%r13 1381 movq 48(%rbp),%rax 1382 adcq $0,%rdx 1383 addq %r13,%r12 1384 movq %rdx,%r13 1385 adcq $0,%r13 1386 1387 mulq %rbx 1388 addq %rax,%r14 1389 movq 56(%rbp),%rax 1390 adcq $0,%rdx 1391 addq %r14,%r13 1392 movq %rdx,%r14 1393 adcq $0,%r14 1394 1395 mulq %rbx 1396 movq %rsi,%rbx 1397 addq %rax,%r15 1398 movq 0(%rbp),%rax 1399 adcq $0,%rdx 1400 addq %r15,%r14 1401 movq %rdx,%r15 1402 adcq $0,%r15 1403 1404 decl %ecx 1405 jne .Lreduction_loop 1406 1407 .byte 0xf3,0xc3 1408.size __rsaz_512_reduce,.-__rsaz_512_reduce 1409.type __rsaz_512_reducex,@function 1410.align 32 1411__rsaz_512_reducex: 1412 1413 imulq %r8,%rdx 1414 xorq %rsi,%rsi 1415 movl $8,%ecx 1416 jmp .Lreduction_loopx 1417 1418.align 32 1419.Lreduction_loopx: 1420 movq %r8,%rbx 1421 mulxq 0(%rbp),%rax,%r8 1422 adcxq %rbx,%rax 1423 adoxq %r9,%r8 1424 1425 mulxq 8(%rbp),%rax,%r9 1426 adcxq %rax,%r8 1427 adoxq %r10,%r9 1428 1429 mulxq 16(%rbp),%rbx,%r10 1430 adcxq %rbx,%r9 1431 adoxq %r11,%r10 1432 1433 mulxq 24(%rbp),%rbx,%r11 1434 adcxq %rbx,%r10 1435 adoxq %r12,%r11 1436 1437.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 1438 movq %rdx,%rax 1439 movq %r8,%rdx 1440 adcxq %rbx,%r11 1441 adoxq %r13,%r12 1442 1443 mulxq 128+8(%rsp),%rbx,%rdx 1444 movq %rax,%rdx 1445 1446 mulxq 40(%rbp),%rax,%r13 1447 adcxq %rax,%r12 1448 adoxq %r14,%r13 1449 1450.byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 1451 adcxq %rax,%r13 1452 adoxq %r15,%r14 1453 1454 mulxq 56(%rbp),%rax,%r15 1455 movq %rbx,%rdx 1456 adcxq %rax,%r14 1457 adoxq %rsi,%r15 1458 adcxq %rsi,%r15 1459 1460 decl %ecx 1461 jne .Lreduction_loopx 1462 1463 .byte 0xf3,0xc3 1464.size __rsaz_512_reducex,.-__rsaz_512_reducex 1465.type __rsaz_512_subtract,@function 1466.align 32 1467__rsaz_512_subtract: 1468 movq %r8,(%rdi) 1469 movq %r9,8(%rdi) 1470 movq %r10,16(%rdi) 1471 movq 
%r11,24(%rdi) 1472 movq %r12,32(%rdi) 1473 movq %r13,40(%rdi) 1474 movq %r14,48(%rdi) 1475 movq %r15,56(%rdi) 1476 1477 movq 0(%rbp),%r8 1478 movq 8(%rbp),%r9 1479 negq %r8 1480 notq %r9 1481 andq %rcx,%r8 1482 movq 16(%rbp),%r10 1483 andq %rcx,%r9 1484 notq %r10 1485 movq 24(%rbp),%r11 1486 andq %rcx,%r10 1487 notq %r11 1488 movq 32(%rbp),%r12 1489 andq %rcx,%r11 1490 notq %r12 1491 movq 40(%rbp),%r13 1492 andq %rcx,%r12 1493 notq %r13 1494 movq 48(%rbp),%r14 1495 andq %rcx,%r13 1496 notq %r14 1497 movq 56(%rbp),%r15 1498 andq %rcx,%r14 1499 notq %r15 1500 andq %rcx,%r15 1501 1502 addq (%rdi),%r8 1503 adcq 8(%rdi),%r9 1504 adcq 16(%rdi),%r10 1505 adcq 24(%rdi),%r11 1506 adcq 32(%rdi),%r12 1507 adcq 40(%rdi),%r13 1508 adcq 48(%rdi),%r14 1509 adcq 56(%rdi),%r15 1510 1511 movq %r8,(%rdi) 1512 movq %r9,8(%rdi) 1513 movq %r10,16(%rdi) 1514 movq %r11,24(%rdi) 1515 movq %r12,32(%rdi) 1516 movq %r13,40(%rdi) 1517 movq %r14,48(%rdi) 1518 movq %r15,56(%rdi) 1519 1520 .byte 0xf3,0xc3 1521.size __rsaz_512_subtract,.-__rsaz_512_subtract 1522.type __rsaz_512_mul,@function 1523.align 32 1524__rsaz_512_mul: 1525 leaq 8(%rsp),%rdi 1526 1527 movq (%rsi),%rax 1528 mulq %rbx 1529 movq %rax,(%rdi) 1530 movq 8(%rsi),%rax 1531 movq %rdx,%r8 1532 1533 mulq %rbx 1534 addq %rax,%r8 1535 movq 16(%rsi),%rax 1536 movq %rdx,%r9 1537 adcq $0,%r9 1538 1539 mulq %rbx 1540 addq %rax,%r9 1541 movq 24(%rsi),%rax 1542 movq %rdx,%r10 1543 adcq $0,%r10 1544 1545 mulq %rbx 1546 addq %rax,%r10 1547 movq 32(%rsi),%rax 1548 movq %rdx,%r11 1549 adcq $0,%r11 1550 1551 mulq %rbx 1552 addq %rax,%r11 1553 movq 40(%rsi),%rax 1554 movq %rdx,%r12 1555 adcq $0,%r12 1556 1557 mulq %rbx 1558 addq %rax,%r12 1559 movq 48(%rsi),%rax 1560 movq %rdx,%r13 1561 adcq $0,%r13 1562 1563 mulq %rbx 1564 addq %rax,%r13 1565 movq 56(%rsi),%rax 1566 movq %rdx,%r14 1567 adcq $0,%r14 1568 1569 mulq %rbx 1570 addq %rax,%r14 1571 movq (%rsi),%rax 1572 movq %rdx,%r15 1573 adcq $0,%r15 1574 1575 leaq 8(%rbp),%rbp 1576 leaq 8(%rdi),%rdi 
1577 1578 movl $7,%ecx 1579 jmp .Loop_mul 1580 1581.align 32 1582.Loop_mul: 1583 movq (%rbp),%rbx 1584 mulq %rbx 1585 addq %rax,%r8 1586 movq 8(%rsi),%rax 1587 movq %r8,(%rdi) 1588 movq %rdx,%r8 1589 adcq $0,%r8 1590 1591 mulq %rbx 1592 addq %rax,%r9 1593 movq 16(%rsi),%rax 1594 adcq $0,%rdx 1595 addq %r9,%r8 1596 movq %rdx,%r9 1597 adcq $0,%r9 1598 1599 mulq %rbx 1600 addq %rax,%r10 1601 movq 24(%rsi),%rax 1602 adcq $0,%rdx 1603 addq %r10,%r9 1604 movq %rdx,%r10 1605 adcq $0,%r10 1606 1607 mulq %rbx 1608 addq %rax,%r11 1609 movq 32(%rsi),%rax 1610 adcq $0,%rdx 1611 addq %r11,%r10 1612 movq %rdx,%r11 1613 adcq $0,%r11 1614 1615 mulq %rbx 1616 addq %rax,%r12 1617 movq 40(%rsi),%rax 1618 adcq $0,%rdx 1619 addq %r12,%r11 1620 movq %rdx,%r12 1621 adcq $0,%r12 1622 1623 mulq %rbx 1624 addq %rax,%r13 1625 movq 48(%rsi),%rax 1626 adcq $0,%rdx 1627 addq %r13,%r12 1628 movq %rdx,%r13 1629 adcq $0,%r13 1630 1631 mulq %rbx 1632 addq %rax,%r14 1633 movq 56(%rsi),%rax 1634 adcq $0,%rdx 1635 addq %r14,%r13 1636 movq %rdx,%r14 1637 leaq 8(%rbp),%rbp 1638 adcq $0,%r14 1639 1640 mulq %rbx 1641 addq %rax,%r15 1642 movq (%rsi),%rax 1643 adcq $0,%rdx 1644 addq %r15,%r14 1645 movq %rdx,%r15 1646 adcq $0,%r15 1647 1648 leaq 8(%rdi),%rdi 1649 1650 decl %ecx 1651 jnz .Loop_mul 1652 1653 movq %r8,(%rdi) 1654 movq %r9,8(%rdi) 1655 movq %r10,16(%rdi) 1656 movq %r11,24(%rdi) 1657 movq %r12,32(%rdi) 1658 movq %r13,40(%rdi) 1659 movq %r14,48(%rdi) 1660 movq %r15,56(%rdi) 1661 1662 .byte 0xf3,0xc3 1663.size __rsaz_512_mul,.-__rsaz_512_mul 1664.type __rsaz_512_mulx,@function 1665.align 32 1666__rsaz_512_mulx: 1667 mulxq (%rsi),%rbx,%r8 1668 movq $-6,%rcx 1669 1670 mulxq 8(%rsi),%rax,%r9 1671 movq %rbx,8(%rsp) 1672 1673 mulxq 16(%rsi),%rbx,%r10 1674 adcq %rax,%r8 1675 1676 mulxq 24(%rsi),%rax,%r11 1677 adcq %rbx,%r9 1678 1679 mulxq 32(%rsi),%rbx,%r12 1680 adcq %rax,%r10 1681 1682 mulxq 40(%rsi),%rax,%r13 1683 adcq %rbx,%r11 1684 1685 mulxq 48(%rsi),%rbx,%r14 1686 adcq %rax,%r12 1687 1688 mulxq 
56(%rsi),%rax,%r15 1689 movq 8(%rbp),%rdx 1690 adcq %rbx,%r13 1691 adcq %rax,%r14 1692 adcq $0,%r15 1693 1694 xorq %rdi,%rdi 1695 jmp .Loop_mulx 1696 1697.align 32 1698.Loop_mulx: 1699 movq %r8,%rbx 1700 mulxq (%rsi),%rax,%r8 1701 adcxq %rax,%rbx 1702 adoxq %r9,%r8 1703 1704 mulxq 8(%rsi),%rax,%r9 1705 adcxq %rax,%r8 1706 adoxq %r10,%r9 1707 1708 mulxq 16(%rsi),%rax,%r10 1709 adcxq %rax,%r9 1710 adoxq %r11,%r10 1711 1712 mulxq 24(%rsi),%rax,%r11 1713 adcxq %rax,%r10 1714 adoxq %r12,%r11 1715 1716.byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 1717 adcxq %rax,%r11 1718 adoxq %r13,%r12 1719 1720 mulxq 40(%rsi),%rax,%r13 1721 adcxq %rax,%r12 1722 adoxq %r14,%r13 1723 1724 mulxq 48(%rsi),%rax,%r14 1725 adcxq %rax,%r13 1726 adoxq %r15,%r14 1727 1728 mulxq 56(%rsi),%rax,%r15 1729 movq 64(%rbp,%rcx,8),%rdx 1730 movq %rbx,8+64-8(%rsp,%rcx,8) 1731 adcxq %rax,%r14 1732 adoxq %rdi,%r15 1733 adcxq %rdi,%r15 1734 1735 incq %rcx 1736 jnz .Loop_mulx 1737 1738 movq %r8,%rbx 1739 mulxq (%rsi),%rax,%r8 1740 adcxq %rax,%rbx 1741 adoxq %r9,%r8 1742 1743.byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 1744 adcxq %rax,%r8 1745 adoxq %r10,%r9 1746 1747.byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 1748 adcxq %rax,%r9 1749 adoxq %r11,%r10 1750 1751 mulxq 24(%rsi),%rax,%r11 1752 adcxq %rax,%r10 1753 adoxq %r12,%r11 1754 1755 mulxq 32(%rsi),%rax,%r12 1756 adcxq %rax,%r11 1757 adoxq %r13,%r12 1758 1759 mulxq 40(%rsi),%rax,%r13 1760 adcxq %rax,%r12 1761 adoxq %r14,%r13 1762 1763.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 1764 adcxq %rax,%r13 1765 adoxq %r15,%r14 1766 1767.byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 1768 adcxq %rax,%r14 1769 adoxq %rdi,%r15 1770 adcxq %rdi,%r15 1771 1772 movq %rbx,8+64-8(%rsp) 1773 movq %r8,8+64(%rsp) 1774 movq %r9,8+64+8(%rsp) 1775 movq %r10,8+64+16(%rsp) 1776 movq %r11,8+64+24(%rsp) 1777 movq %r12,8+64+32(%rsp) 1778 movq %r13,8+64+40(%rsp) 1779 movq %r14,8+64+48(%rsp) 1780 movq %r15,8+64+56(%rsp) 1781 1782 .byte 0xf3,0xc3 1783.size 
__rsaz_512_mulx,.-__rsaz_512_mulx
/* The line above holds the operands of the .size directive opened on the preceding line. */

/*
 * rsaz_512_scatter4
 *
 * Registers read on entry:
 *   %rdi  table base
 *   %rsi  source of eight consecutive 64-bit words
 *   %rdx  slot index; the full 64-bit register is scaled by 8 and added
 *         to %rdi
 *
 * Stores the eight source words into the table, one word every 128 bytes,
 * beginning at %rdi + 8*%rdx.
 *
 * Writes %rax, %rsi, %rdi, %r9 and the flags; does not touch the stack.
 */
.globl	rsaz_512_scatter4
.type	rsaz_512_scatter4,@function
.align	16
rsaz_512_scatter4:
	leaq	(%rdi,%rdx,8),%rdi		/* first destination word */
	movl	$8,%r9d				/* words left to store */
	jmp	.Lscatter4_word
.align	16
.Lscatter4_word:
	movq	(%rsi),%rax
	leaq	8(%rsi),%rsi			/* next source word */
	movq	%rax,(%rdi)
	leaq	128(%rdi),%rdi			/* same slot, next 128-byte row */
	decl	%r9d
	jnz	.Lscatter4_word
	.byte	0xf3,0xc3			/* repz ret */
.size	rsaz_512_scatter4,.-rsaz_512_scatter4

/*
 * rsaz_512_gather4
 *
 * Registers read on entry:
 *   %rdi  destination for eight 64-bit words
 *   %rsi  table base; accessed with movdqa, which requires 16-byte
 *         alignment
 *   %edx  slot index (only the low 32 bits are used)
 *
 * Each of the eight iterations loads one complete 128-byte row (sixteen
 * 64-bit slots), ANDs every slot with its equality mask and ORs the row
 * down to a single word, so the set of addresses read does not depend on
 * the index.  An index that equals none of 0..15 matches no mask and
 * produces zero words.
 *
 * Writes %rsi, %rdi, %r9, %xmm0-%xmm15 and the flags; does not touch the
 * stack.
 */
.globl	rsaz_512_gather4
.type	rsaz_512_gather4,@function
.align	16
rsaz_512_gather4:
	movd	%edx,%xmm8
	movdqa	.Linc+16(%rip),%xmm1		/* dwords 2,2,2,2 */
	movdqa	.Linc(%rip),%xmm0		/* dwords 0,0,1,1 */

	/*
	 * Build eight masks in %xmm0-%xmm7.  Each register covers two
	 * adjacent slots: its low qword is all-ones when the index equals
	 * the even slot number, its high qword when it equals the odd one.
	 */
	pshufd	$0,%xmm8,%xmm8			/* index in all four dwords */
	movdqa	%xmm1,%xmm7			/* %xmm7 keeps the step of 2 */
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1			/* 2,2,3,3 */
	pcmpeqd	%xmm8,%xmm0			/* mask for slots 0,1 */
	movdqa	%xmm7,%xmm3
	paddd	%xmm1,%xmm2			/* 4,4,5,5 */
	pcmpeqd	%xmm8,%xmm1			/* mask for slots 2,3 */
	movdqa	%xmm7,%xmm4
	paddd	%xmm2,%xmm3			/* 6,6,7,7 */
	pcmpeqd	%xmm8,%xmm2			/* mask for slots 4,5 */
	movdqa	%xmm7,%xmm5
	paddd	%xmm3,%xmm4			/* 8,8,9,9 */
	pcmpeqd	%xmm8,%xmm3			/* mask for slots 6,7 */
	movdqa	%xmm7,%xmm6
	paddd	%xmm4,%xmm5			/* 10,10,11,11 */
	pcmpeqd	%xmm8,%xmm4			/* mask for slots 8,9 */
	paddd	%xmm5,%xmm6			/* 12,12,13,13 */
	pcmpeqd	%xmm8,%xmm5			/* mask for slots 10,11 */
	paddd	%xmm6,%xmm7			/* 14,14,15,15 */
	pcmpeqd	%xmm8,%xmm6			/* mask for slots 12,13 */
	pcmpeqd	%xmm8,%xmm7			/* mask for slots 14,15 */
	movl	$8,%r9d				/* words left to produce */
	jmp	.Lgather4_word
.align	16
.Lgather4_word:
	/* Load the whole row and mask every slot. */
	movdqa	0(%rsi),%xmm8
	movdqa	16(%rsi),%xmm9
	movdqa	32(%rsi),%xmm10
	movdqa	48(%rsi),%xmm11
	pand	%xmm0,%xmm8
	movdqa	64(%rsi),%xmm12
	pand	%xmm1,%xmm9
	movdqa	80(%rsi),%xmm13
	pand	%xmm2,%xmm10
	movdqa	96(%rsi),%xmm14
	pand	%xmm3,%xmm11
	movdqa	112(%rsi),%xmm15
	leaq	128(%rsi),%rsi			/* advance to the next row */
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15

	/* OR the eight masked registers down to %xmm8. */
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	$0x4e,%xmm8,%xmm9		/* swap the two qwords */
	por	%xmm9,%xmm8			/* low qword = selected word */
	movq	%xmm8,(%rdi)
	leaq	8(%rdi),%rdi
	decl	%r9d
	jnz	.Lgather4_word
	.byte	0xf3,0xc3			/* repz ret */
.LSEH_end_rsaz_512_gather4:
.size	rsaz_512_gather4,.-rsaz_512_gather4

/*
 * Constants for the mask construction in rsaz_512_gather4: the first
 * 16 bytes are the starting slot numbers, the second 16 bytes the
 * per-register increment.  Loaded with movdqa, hence the alignment.
 */
.align	64
.Linc:
.long	0,0, 1,1
.long	2,2, 2,2