rsaz-x86_64.S revision 305153
1/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/rsaz-x86_64.S 305153 2016-08-31 20:33:59Z jkim $ */ 2/* Do not modify. This file is auto-generated from rsaz-x86_64.pl. */ 3.text 4 5 6 7.globl rsaz_512_sqr 8.type rsaz_512_sqr,@function 9.align 32 10rsaz_512_sqr: 11 pushq %rbx 12 pushq %rbp 13 pushq %r12 14 pushq %r13 15 pushq %r14 16 pushq %r15 17 18 subq $128+24,%rsp 19.Lsqr_body: 20 movq %rdx,%rbp 21 movq (%rsi),%rdx 22 movq 8(%rsi),%rax 23 movq %rcx,128(%rsp) 24 movl $0x80100,%r11d 25 andl OPENSSL_ia32cap_P+8(%rip),%r11d 26 cmpl $0x80100,%r11d 27 je .Loop_sqrx 28 jmp .Loop_sqr 29 30.align 32 31.Loop_sqr: 32 movl %r8d,128+8(%rsp) 33 34 movq %rdx,%rbx 35 mulq %rdx 36 movq %rax,%r8 37 movq 16(%rsi),%rax 38 movq %rdx,%r9 39 40 mulq %rbx 41 addq %rax,%r9 42 movq 24(%rsi),%rax 43 movq %rdx,%r10 44 adcq $0,%r10 45 46 mulq %rbx 47 addq %rax,%r10 48 movq 32(%rsi),%rax 49 movq %rdx,%r11 50 adcq $0,%r11 51 52 mulq %rbx 53 addq %rax,%r11 54 movq 40(%rsi),%rax 55 movq %rdx,%r12 56 adcq $0,%r12 57 58 mulq %rbx 59 addq %rax,%r12 60 movq 48(%rsi),%rax 61 movq %rdx,%r13 62 adcq $0,%r13 63 64 mulq %rbx 65 addq %rax,%r13 66 movq 56(%rsi),%rax 67 movq %rdx,%r14 68 adcq $0,%r14 69 70 mulq %rbx 71 addq %rax,%r14 72 movq %rbx,%rax 73 movq %rdx,%r15 74 adcq $0,%r15 75 76 addq %r8,%r8 77 movq %r9,%rcx 78 adcq %r9,%r9 79 80 mulq %rax 81 movq %rax,(%rsp) 82 addq %rdx,%r8 83 adcq $0,%r9 84 85 movq %r8,8(%rsp) 86 shrq $63,%rcx 87 88 89 movq 8(%rsi),%r8 90 movq 16(%rsi),%rax 91 mulq %r8 92 addq %rax,%r10 93 movq 24(%rsi),%rax 94 movq %rdx,%rbx 95 adcq $0,%rbx 96 97 mulq %r8 98 addq %rax,%r11 99 movq 32(%rsi),%rax 100 adcq $0,%rdx 101 addq %rbx,%r11 102 movq %rdx,%rbx 103 adcq $0,%rbx 104 105 mulq %r8 106 addq %rax,%r12 107 movq 40(%rsi),%rax 108 adcq $0,%rdx 109 addq %rbx,%r12 110 movq %rdx,%rbx 111 adcq $0,%rbx 112 113 mulq %r8 114 addq %rax,%r13 115 movq 48(%rsi),%rax 116 adcq $0,%rdx 117 addq %rbx,%r13 118 movq %rdx,%rbx 119 adcq $0,%rbx 120 121 mulq %r8 122 addq %rax,%r14 123 movq 
56(%rsi),%rax 124 adcq $0,%rdx 125 addq %rbx,%r14 126 movq %rdx,%rbx 127 adcq $0,%rbx 128 129 mulq %r8 130 addq %rax,%r15 131 movq %r8,%rax 132 adcq $0,%rdx 133 addq %rbx,%r15 134 movq %rdx,%r8 135 movq %r10,%rdx 136 adcq $0,%r8 137 138 addq %rdx,%rdx 139 leaq (%rcx,%r10,2),%r10 140 movq %r11,%rbx 141 adcq %r11,%r11 142 143 mulq %rax 144 addq %rax,%r9 145 adcq %rdx,%r10 146 adcq $0,%r11 147 148 movq %r9,16(%rsp) 149 movq %r10,24(%rsp) 150 shrq $63,%rbx 151 152 153 movq 16(%rsi),%r9 154 movq 24(%rsi),%rax 155 mulq %r9 156 addq %rax,%r12 157 movq 32(%rsi),%rax 158 movq %rdx,%rcx 159 adcq $0,%rcx 160 161 mulq %r9 162 addq %rax,%r13 163 movq 40(%rsi),%rax 164 adcq $0,%rdx 165 addq %rcx,%r13 166 movq %rdx,%rcx 167 adcq $0,%rcx 168 169 mulq %r9 170 addq %rax,%r14 171 movq 48(%rsi),%rax 172 adcq $0,%rdx 173 addq %rcx,%r14 174 movq %rdx,%rcx 175 adcq $0,%rcx 176 177 mulq %r9 178 movq %r12,%r10 179 leaq (%rbx,%r12,2),%r12 180 addq %rax,%r15 181 movq 56(%rsi),%rax 182 adcq $0,%rdx 183 addq %rcx,%r15 184 movq %rdx,%rcx 185 adcq $0,%rcx 186 187 mulq %r9 188 shrq $63,%r10 189 addq %rax,%r8 190 movq %r9,%rax 191 adcq $0,%rdx 192 addq %rcx,%r8 193 movq %rdx,%r9 194 adcq $0,%r9 195 196 movq %r13,%rcx 197 leaq (%r10,%r13,2),%r13 198 199 mulq %rax 200 addq %rax,%r11 201 adcq %rdx,%r12 202 adcq $0,%r13 203 204 movq %r11,32(%rsp) 205 movq %r12,40(%rsp) 206 shrq $63,%rcx 207 208 209 movq 24(%rsi),%r10 210 movq 32(%rsi),%rax 211 mulq %r10 212 addq %rax,%r14 213 movq 40(%rsi),%rax 214 movq %rdx,%rbx 215 adcq $0,%rbx 216 217 mulq %r10 218 addq %rax,%r15 219 movq 48(%rsi),%rax 220 adcq $0,%rdx 221 addq %rbx,%r15 222 movq %rdx,%rbx 223 adcq $0,%rbx 224 225 mulq %r10 226 movq %r14,%r12 227 leaq (%rcx,%r14,2),%r14 228 addq %rax,%r8 229 movq 56(%rsi),%rax 230 adcq $0,%rdx 231 addq %rbx,%r8 232 movq %rdx,%rbx 233 adcq $0,%rbx 234 235 mulq %r10 236 shrq $63,%r12 237 addq %rax,%r9 238 movq %r10,%rax 239 adcq $0,%rdx 240 addq %rbx,%r9 241 movq %rdx,%r10 242 adcq $0,%r10 243 244 movq %r15,%rbx 245 
leaq (%r12,%r15,2),%r15 246 247 mulq %rax 248 addq %rax,%r13 249 adcq %rdx,%r14 250 adcq $0,%r15 251 252 movq %r13,48(%rsp) 253 movq %r14,56(%rsp) 254 shrq $63,%rbx 255 256 257 movq 32(%rsi),%r11 258 movq 40(%rsi),%rax 259 mulq %r11 260 addq %rax,%r8 261 movq 48(%rsi),%rax 262 movq %rdx,%rcx 263 adcq $0,%rcx 264 265 mulq %r11 266 addq %rax,%r9 267 movq 56(%rsi),%rax 268 adcq $0,%rdx 269 movq %r8,%r12 270 leaq (%rbx,%r8,2),%r8 271 addq %rcx,%r9 272 movq %rdx,%rcx 273 adcq $0,%rcx 274 275 mulq %r11 276 shrq $63,%r12 277 addq %rax,%r10 278 movq %r11,%rax 279 adcq $0,%rdx 280 addq %rcx,%r10 281 movq %rdx,%r11 282 adcq $0,%r11 283 284 movq %r9,%rcx 285 leaq (%r12,%r9,2),%r9 286 287 mulq %rax 288 addq %rax,%r15 289 adcq %rdx,%r8 290 adcq $0,%r9 291 292 movq %r15,64(%rsp) 293 movq %r8,72(%rsp) 294 shrq $63,%rcx 295 296 297 movq 40(%rsi),%r12 298 movq 48(%rsi),%rax 299 mulq %r12 300 addq %rax,%r10 301 movq 56(%rsi),%rax 302 movq %rdx,%rbx 303 adcq $0,%rbx 304 305 mulq %r12 306 addq %rax,%r11 307 movq %r12,%rax 308 movq %r10,%r15 309 leaq (%rcx,%r10,2),%r10 310 adcq $0,%rdx 311 shrq $63,%r15 312 addq %rbx,%r11 313 movq %rdx,%r12 314 adcq $0,%r12 315 316 movq %r11,%rbx 317 leaq (%r15,%r11,2),%r11 318 319 mulq %rax 320 addq %rax,%r9 321 adcq %rdx,%r10 322 adcq $0,%r11 323 324 movq %r9,80(%rsp) 325 movq %r10,88(%rsp) 326 327 328 movq 48(%rsi),%r13 329 movq 56(%rsi),%rax 330 mulq %r13 331 addq %rax,%r12 332 movq %r13,%rax 333 movq %rdx,%r13 334 adcq $0,%r13 335 336 xorq %r14,%r14 337 shlq $1,%rbx 338 adcq %r12,%r12 339 adcq %r13,%r13 340 adcq %r14,%r14 341 342 mulq %rax 343 addq %rax,%r11 344 adcq %rdx,%r12 345 adcq $0,%r13 346 347 movq %r11,96(%rsp) 348 movq %r12,104(%rsp) 349 350 351 movq 56(%rsi),%rax 352 mulq %rax 353 addq %rax,%r13 354 adcq $0,%rdx 355 356 addq %rdx,%r14 357 358 movq %r13,112(%rsp) 359 movq %r14,120(%rsp) 360 361 movq (%rsp),%r8 362 movq 8(%rsp),%r9 363 movq 16(%rsp),%r10 364 movq 24(%rsp),%r11 365 movq 32(%rsp),%r12 366 movq 40(%rsp),%r13 367 movq 
48(%rsp),%r14 368 movq 56(%rsp),%r15 369 370 call __rsaz_512_reduce 371 372 addq 64(%rsp),%r8 373 adcq 72(%rsp),%r9 374 adcq 80(%rsp),%r10 375 adcq 88(%rsp),%r11 376 adcq 96(%rsp),%r12 377 adcq 104(%rsp),%r13 378 adcq 112(%rsp),%r14 379 adcq 120(%rsp),%r15 380 sbbq %rcx,%rcx 381 382 call __rsaz_512_subtract 383 384 movq %r8,%rdx 385 movq %r9,%rax 386 movl 128+8(%rsp),%r8d 387 movq %rdi,%rsi 388 389 decl %r8d 390 jnz .Loop_sqr 391 jmp .Lsqr_tail 392 393.align 32 394.Loop_sqrx: 395 movl %r8d,128+8(%rsp) 396.byte 102,72,15,110,199 397.byte 102,72,15,110,205 398 399 mulxq %rax,%r8,%r9 400 401 mulxq 16(%rsi),%rcx,%r10 402 xorq %rbp,%rbp 403 404 mulxq 24(%rsi),%rax,%r11 405 adcxq %rcx,%r9 406 407 mulxq 32(%rsi),%rcx,%r12 408 adcxq %rax,%r10 409 410 mulxq 40(%rsi),%rax,%r13 411 adcxq %rcx,%r11 412 413.byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 414 adcxq %rax,%r12 415 adcxq %rcx,%r13 416 417.byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 418 adcxq %rax,%r14 419 adcxq %rbp,%r15 420 421 movq %r9,%rcx 422 shldq $1,%r8,%r9 423 shlq $1,%r8 424 425 xorl %ebp,%ebp 426 mulxq %rdx,%rax,%rdx 427 adcxq %rdx,%r8 428 movq 8(%rsi),%rdx 429 adcxq %rbp,%r9 430 431 movq %rax,(%rsp) 432 movq %r8,8(%rsp) 433 434 435 mulxq 16(%rsi),%rax,%rbx 436 adoxq %rax,%r10 437 adcxq %rbx,%r11 438 439.byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 440 adoxq %rdi,%r11 441 adcxq %r8,%r12 442 443 mulxq 32(%rsi),%rax,%rbx 444 adoxq %rax,%r12 445 adcxq %rbx,%r13 446 447 mulxq 40(%rsi),%rdi,%r8 448 adoxq %rdi,%r13 449 adcxq %r8,%r14 450 451.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 452 adoxq %rax,%r14 453 adcxq %rbx,%r15 454 455.byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 456 adoxq %rdi,%r15 457 adcxq %rbp,%r8 458 adoxq %rbp,%r8 459 460 movq %r11,%rbx 461 shldq $1,%r10,%r11 462 shldq $1,%rcx,%r10 463 464 xorl %ebp,%ebp 465 mulxq %rdx,%rax,%rcx 466 movq 16(%rsi),%rdx 467 adcxq %rax,%r9 468 adcxq %rcx,%r10 469 adcxq %rbp,%r11 470 471 movq %r9,16(%rsp) 472.byte 
0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 473 474 475.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 476 adoxq %rdi,%r12 477 adcxq %r9,%r13 478 479 mulxq 32(%rsi),%rax,%rcx 480 adoxq %rax,%r13 481 adcxq %rcx,%r14 482 483 mulxq 40(%rsi),%rdi,%r9 484 adoxq %rdi,%r14 485 adcxq %r9,%r15 486 487.byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 488 adoxq %rax,%r15 489 adcxq %rcx,%r8 490 491.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 492 adoxq %rdi,%r8 493 adcxq %rbp,%r9 494 adoxq %rbp,%r9 495 496 movq %r13,%rcx 497 shldq $1,%r12,%r13 498 shldq $1,%rbx,%r12 499 500 xorl %ebp,%ebp 501 mulxq %rdx,%rax,%rdx 502 adcxq %rax,%r11 503 adcxq %rdx,%r12 504 movq 24(%rsi),%rdx 505 adcxq %rbp,%r13 506 507 movq %r11,32(%rsp) 508.byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 509 510 511.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 512 adoxq %rax,%r14 513 adcxq %rbx,%r15 514 515 mulxq 40(%rsi),%rdi,%r10 516 adoxq %rdi,%r15 517 adcxq %r10,%r8 518 519 mulxq 48(%rsi),%rax,%rbx 520 adoxq %rax,%r8 521 adcxq %rbx,%r9 522 523 mulxq 56(%rsi),%rdi,%r10 524 adoxq %rdi,%r9 525 adcxq %rbp,%r10 526 adoxq %rbp,%r10 527 528.byte 0x66 529 movq %r15,%rbx 530 shldq $1,%r14,%r15 531 shldq $1,%rcx,%r14 532 533 xorl %ebp,%ebp 534 mulxq %rdx,%rax,%rdx 535 adcxq %rax,%r13 536 adcxq %rdx,%r14 537 movq 32(%rsi),%rdx 538 adcxq %rbp,%r15 539 540 movq %r13,48(%rsp) 541 movq %r14,56(%rsp) 542 543 544.byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 545 adoxq %rdi,%r8 546 adcxq %r11,%r9 547 548 mulxq 48(%rsi),%rax,%rcx 549 adoxq %rax,%r9 550 adcxq %rcx,%r10 551 552 mulxq 56(%rsi),%rdi,%r11 553 adoxq %rdi,%r10 554 adcxq %rbp,%r11 555 adoxq %rbp,%r11 556 557 movq %r9,%rcx 558 shldq $1,%r8,%r9 559 shldq $1,%rbx,%r8 560 561 xorl %ebp,%ebp 562 mulxq %rdx,%rax,%rdx 563 adcxq %rax,%r15 564 adcxq %rdx,%r8 565 movq 40(%rsi),%rdx 566 adcxq %rbp,%r9 567 568 movq %r15,64(%rsp) 569 movq %r8,72(%rsp) 570 571 572.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 573 adoxq %rax,%r10 574 adcxq %rbx,%r11 575 576.byte 
0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 577 adoxq %rdi,%r11 578 adcxq %rbp,%r12 579 adoxq %rbp,%r12 580 581 movq %r11,%rbx 582 shldq $1,%r10,%r11 583 shldq $1,%rcx,%r10 584 585 xorl %ebp,%ebp 586 mulxq %rdx,%rax,%rdx 587 adcxq %rax,%r9 588 adcxq %rdx,%r10 589 movq 48(%rsi),%rdx 590 adcxq %rbp,%r11 591 592 movq %r9,80(%rsp) 593 movq %r10,88(%rsp) 594 595 596.byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 597 adoxq %rax,%r12 598 adoxq %rbp,%r13 599 600 xorq %r14,%r14 601 shldq $1,%r13,%r14 602 shldq $1,%r12,%r13 603 shldq $1,%rbx,%r12 604 605 xorl %ebp,%ebp 606 mulxq %rdx,%rax,%rdx 607 adcxq %rax,%r11 608 adcxq %rdx,%r12 609 movq 56(%rsi),%rdx 610 adcxq %rbp,%r13 611 612.byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 613.byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 614 615 616 mulxq %rdx,%rax,%rdx 617 adoxq %rax,%r13 618 adoxq %rbp,%rdx 619 620.byte 0x66 621 addq %rdx,%r14 622 623 movq %r13,112(%rsp) 624 movq %r14,120(%rsp) 625.byte 102,72,15,126,199 626.byte 102,72,15,126,205 627 628 movq 128(%rsp),%rdx 629 movq (%rsp),%r8 630 movq 8(%rsp),%r9 631 movq 16(%rsp),%r10 632 movq 24(%rsp),%r11 633 movq 32(%rsp),%r12 634 movq 40(%rsp),%r13 635 movq 48(%rsp),%r14 636 movq 56(%rsp),%r15 637 638 call __rsaz_512_reducex 639 640 addq 64(%rsp),%r8 641 adcq 72(%rsp),%r9 642 adcq 80(%rsp),%r10 643 adcq 88(%rsp),%r11 644 adcq 96(%rsp),%r12 645 adcq 104(%rsp),%r13 646 adcq 112(%rsp),%r14 647 adcq 120(%rsp),%r15 648 sbbq %rcx,%rcx 649 650 call __rsaz_512_subtract 651 652 movq %r8,%rdx 653 movq %r9,%rax 654 movl 128+8(%rsp),%r8d 655 movq %rdi,%rsi 656 657 decl %r8d 658 jnz .Loop_sqrx 659 660.Lsqr_tail: 661 662 leaq 128+24+48(%rsp),%rax 663 movq -48(%rax),%r15 664 movq -40(%rax),%r14 665 movq -32(%rax),%r13 666 movq -24(%rax),%r12 667 movq -16(%rax),%rbp 668 movq -8(%rax),%rbx 669 leaq (%rax),%rsp 670.Lsqr_epilogue: 671 .byte 0xf3,0xc3 672.size rsaz_512_sqr,.-rsaz_512_sqr 673.globl rsaz_512_mul 674.type rsaz_512_mul,@function 675.align 32 676rsaz_512_mul: 677 pushq %rbx 678 
pushq %rbp 679 pushq %r12 680 pushq %r13 681 pushq %r14 682 pushq %r15 683 684 subq $128+24,%rsp 685.Lmul_body: 686.byte 102,72,15,110,199 687.byte 102,72,15,110,201 688 movq %r8,128(%rsp) 689 movl $0x80100,%r11d 690 andl OPENSSL_ia32cap_P+8(%rip),%r11d 691 cmpl $0x80100,%r11d 692 je .Lmulx 693 movq (%rdx),%rbx 694 movq %rdx,%rbp 695 call __rsaz_512_mul 696 697.byte 102,72,15,126,199 698.byte 102,72,15,126,205 699 700 movq (%rsp),%r8 701 movq 8(%rsp),%r9 702 movq 16(%rsp),%r10 703 movq 24(%rsp),%r11 704 movq 32(%rsp),%r12 705 movq 40(%rsp),%r13 706 movq 48(%rsp),%r14 707 movq 56(%rsp),%r15 708 709 call __rsaz_512_reduce 710 jmp .Lmul_tail 711 712.align 32 713.Lmulx: 714 movq %rdx,%rbp 715 movq (%rdx),%rdx 716 call __rsaz_512_mulx 717 718.byte 102,72,15,126,199 719.byte 102,72,15,126,205 720 721 movq 128(%rsp),%rdx 722 movq (%rsp),%r8 723 movq 8(%rsp),%r9 724 movq 16(%rsp),%r10 725 movq 24(%rsp),%r11 726 movq 32(%rsp),%r12 727 movq 40(%rsp),%r13 728 movq 48(%rsp),%r14 729 movq 56(%rsp),%r15 730 731 call __rsaz_512_reducex 732.Lmul_tail: 733 addq 64(%rsp),%r8 734 adcq 72(%rsp),%r9 735 adcq 80(%rsp),%r10 736 adcq 88(%rsp),%r11 737 adcq 96(%rsp),%r12 738 adcq 104(%rsp),%r13 739 adcq 112(%rsp),%r14 740 adcq 120(%rsp),%r15 741 sbbq %rcx,%rcx 742 743 call __rsaz_512_subtract 744 745 leaq 128+24+48(%rsp),%rax 746 movq -48(%rax),%r15 747 movq -40(%rax),%r14 748 movq -32(%rax),%r13 749 movq -24(%rax),%r12 750 movq -16(%rax),%rbp 751 movq -8(%rax),%rbx 752 leaq (%rax),%rsp 753.Lmul_epilogue: 754 .byte 0xf3,0xc3 755.size rsaz_512_mul,.-rsaz_512_mul 756.globl rsaz_512_mul_gather4 757.type rsaz_512_mul_gather4,@function 758.align 32 759rsaz_512_mul_gather4: 760 pushq %rbx 761 pushq %rbp 762 pushq %r12 763 pushq %r13 764 pushq %r14 765 pushq %r15 766 767 subq $152,%rsp 768.Lmul_gather4_body: 769 movd %r9d,%xmm8 770 movdqa .Linc+16(%rip),%xmm1 771 movdqa .Linc(%rip),%xmm0 772 773 pshufd $0,%xmm8,%xmm8 774 movdqa %xmm1,%xmm7 775 movdqa %xmm1,%xmm2 776 paddd %xmm0,%xmm1 777 pcmpeqd 
%xmm8,%xmm0 778 movdqa %xmm7,%xmm3 779 paddd %xmm1,%xmm2 780 pcmpeqd %xmm8,%xmm1 781 movdqa %xmm7,%xmm4 782 paddd %xmm2,%xmm3 783 pcmpeqd %xmm8,%xmm2 784 movdqa %xmm7,%xmm5 785 paddd %xmm3,%xmm4 786 pcmpeqd %xmm8,%xmm3 787 movdqa %xmm7,%xmm6 788 paddd %xmm4,%xmm5 789 pcmpeqd %xmm8,%xmm4 790 paddd %xmm5,%xmm6 791 pcmpeqd %xmm8,%xmm5 792 paddd %xmm6,%xmm7 793 pcmpeqd %xmm8,%xmm6 794 pcmpeqd %xmm8,%xmm7 795 796 movdqa 0(%rdx),%xmm8 797 movdqa 16(%rdx),%xmm9 798 movdqa 32(%rdx),%xmm10 799 movdqa 48(%rdx),%xmm11 800 pand %xmm0,%xmm8 801 movdqa 64(%rdx),%xmm12 802 pand %xmm1,%xmm9 803 movdqa 80(%rdx),%xmm13 804 pand %xmm2,%xmm10 805 movdqa 96(%rdx),%xmm14 806 pand %xmm3,%xmm11 807 movdqa 112(%rdx),%xmm15 808 leaq 128(%rdx),%rbp 809 pand %xmm4,%xmm12 810 pand %xmm5,%xmm13 811 pand %xmm6,%xmm14 812 pand %xmm7,%xmm15 813 por %xmm10,%xmm8 814 por %xmm11,%xmm9 815 por %xmm12,%xmm8 816 por %xmm13,%xmm9 817 por %xmm14,%xmm8 818 por %xmm15,%xmm9 819 820 por %xmm9,%xmm8 821 pshufd $0x4e,%xmm8,%xmm9 822 por %xmm9,%xmm8 823 movl $0x80100,%r11d 824 andl OPENSSL_ia32cap_P+8(%rip),%r11d 825 cmpl $0x80100,%r11d 826 je .Lmulx_gather 827.byte 102,76,15,126,195 828 829 movq %r8,128(%rsp) 830 movq %rdi,128+8(%rsp) 831 movq %rcx,128+16(%rsp) 832 833 movq (%rsi),%rax 834 movq 8(%rsi),%rcx 835 mulq %rbx 836 movq %rax,(%rsp) 837 movq %rcx,%rax 838 movq %rdx,%r8 839 840 mulq %rbx 841 addq %rax,%r8 842 movq 16(%rsi),%rax 843 movq %rdx,%r9 844 adcq $0,%r9 845 846 mulq %rbx 847 addq %rax,%r9 848 movq 24(%rsi),%rax 849 movq %rdx,%r10 850 adcq $0,%r10 851 852 mulq %rbx 853 addq %rax,%r10 854 movq 32(%rsi),%rax 855 movq %rdx,%r11 856 adcq $0,%r11 857 858 mulq %rbx 859 addq %rax,%r11 860 movq 40(%rsi),%rax 861 movq %rdx,%r12 862 adcq $0,%r12 863 864 mulq %rbx 865 addq %rax,%r12 866 movq 48(%rsi),%rax 867 movq %rdx,%r13 868 adcq $0,%r13 869 870 mulq %rbx 871 addq %rax,%r13 872 movq 56(%rsi),%rax 873 movq %rdx,%r14 874 adcq $0,%r14 875 876 mulq %rbx 877 addq %rax,%r14 878 movq (%rsi),%rax 879 movq 
%rdx,%r15 880 adcq $0,%r15 881 882 leaq 8(%rsp),%rdi 883 movl $7,%ecx 884 jmp .Loop_mul_gather 885 886.align 32 887.Loop_mul_gather: 888 movdqa 0(%rbp),%xmm8 889 movdqa 16(%rbp),%xmm9 890 movdqa 32(%rbp),%xmm10 891 movdqa 48(%rbp),%xmm11 892 pand %xmm0,%xmm8 893 movdqa 64(%rbp),%xmm12 894 pand %xmm1,%xmm9 895 movdqa 80(%rbp),%xmm13 896 pand %xmm2,%xmm10 897 movdqa 96(%rbp),%xmm14 898 pand %xmm3,%xmm11 899 movdqa 112(%rbp),%xmm15 900 leaq 128(%rbp),%rbp 901 pand %xmm4,%xmm12 902 pand %xmm5,%xmm13 903 pand %xmm6,%xmm14 904 pand %xmm7,%xmm15 905 por %xmm10,%xmm8 906 por %xmm11,%xmm9 907 por %xmm12,%xmm8 908 por %xmm13,%xmm9 909 por %xmm14,%xmm8 910 por %xmm15,%xmm9 911 912 por %xmm9,%xmm8 913 pshufd $0x4e,%xmm8,%xmm9 914 por %xmm9,%xmm8 915.byte 102,76,15,126,195 916 917 mulq %rbx 918 addq %rax,%r8 919 movq 8(%rsi),%rax 920 movq %r8,(%rdi) 921 movq %rdx,%r8 922 adcq $0,%r8 923 924 mulq %rbx 925 addq %rax,%r9 926 movq 16(%rsi),%rax 927 adcq $0,%rdx 928 addq %r9,%r8 929 movq %rdx,%r9 930 adcq $0,%r9 931 932 mulq %rbx 933 addq %rax,%r10 934 movq 24(%rsi),%rax 935 adcq $0,%rdx 936 addq %r10,%r9 937 movq %rdx,%r10 938 adcq $0,%r10 939 940 mulq %rbx 941 addq %rax,%r11 942 movq 32(%rsi),%rax 943 adcq $0,%rdx 944 addq %r11,%r10 945 movq %rdx,%r11 946 adcq $0,%r11 947 948 mulq %rbx 949 addq %rax,%r12 950 movq 40(%rsi),%rax 951 adcq $0,%rdx 952 addq %r12,%r11 953 movq %rdx,%r12 954 adcq $0,%r12 955 956 mulq %rbx 957 addq %rax,%r13 958 movq 48(%rsi),%rax 959 adcq $0,%rdx 960 addq %r13,%r12 961 movq %rdx,%r13 962 adcq $0,%r13 963 964 mulq %rbx 965 addq %rax,%r14 966 movq 56(%rsi),%rax 967 adcq $0,%rdx 968 addq %r14,%r13 969 movq %rdx,%r14 970 adcq $0,%r14 971 972 mulq %rbx 973 addq %rax,%r15 974 movq (%rsi),%rax 975 adcq $0,%rdx 976 addq %r15,%r14 977 movq %rdx,%r15 978 adcq $0,%r15 979 980 leaq 8(%rdi),%rdi 981 982 decl %ecx 983 jnz .Loop_mul_gather 984 985 movq %r8,(%rdi) 986 movq %r9,8(%rdi) 987 movq %r10,16(%rdi) 988 movq %r11,24(%rdi) 989 movq %r12,32(%rdi) 990 movq 
%r13,40(%rdi) 991 movq %r14,48(%rdi) 992 movq %r15,56(%rdi) 993 994 movq 128+8(%rsp),%rdi 995 movq 128+16(%rsp),%rbp 996 997 movq (%rsp),%r8 998 movq 8(%rsp),%r9 999 movq 16(%rsp),%r10 1000 movq 24(%rsp),%r11 1001 movq 32(%rsp),%r12 1002 movq 40(%rsp),%r13 1003 movq 48(%rsp),%r14 1004 movq 56(%rsp),%r15 1005 1006 call __rsaz_512_reduce 1007 jmp .Lmul_gather_tail 1008 1009.align 32 1010.Lmulx_gather: 1011.byte 102,76,15,126,194 1012 1013 movq %r8,128(%rsp) 1014 movq %rdi,128+8(%rsp) 1015 movq %rcx,128+16(%rsp) 1016 1017 mulxq (%rsi),%rbx,%r8 1018 movq %rbx,(%rsp) 1019 xorl %edi,%edi 1020 1021 mulxq 8(%rsi),%rax,%r9 1022 1023 mulxq 16(%rsi),%rbx,%r10 1024 adcxq %rax,%r8 1025 1026 mulxq 24(%rsi),%rax,%r11 1027 adcxq %rbx,%r9 1028 1029 mulxq 32(%rsi),%rbx,%r12 1030 adcxq %rax,%r10 1031 1032 mulxq 40(%rsi),%rax,%r13 1033 adcxq %rbx,%r11 1034 1035 mulxq 48(%rsi),%rbx,%r14 1036 adcxq %rax,%r12 1037 1038 mulxq 56(%rsi),%rax,%r15 1039 adcxq %rbx,%r13 1040 adcxq %rax,%r14 1041.byte 0x67 1042 movq %r8,%rbx 1043 adcxq %rdi,%r15 1044 1045 movq $-7,%rcx 1046 jmp .Loop_mulx_gather 1047 1048.align 32 1049.Loop_mulx_gather: 1050 movdqa 0(%rbp),%xmm8 1051 movdqa 16(%rbp),%xmm9 1052 movdqa 32(%rbp),%xmm10 1053 movdqa 48(%rbp),%xmm11 1054 pand %xmm0,%xmm8 1055 movdqa 64(%rbp),%xmm12 1056 pand %xmm1,%xmm9 1057 movdqa 80(%rbp),%xmm13 1058 pand %xmm2,%xmm10 1059 movdqa 96(%rbp),%xmm14 1060 pand %xmm3,%xmm11 1061 movdqa 112(%rbp),%xmm15 1062 leaq 128(%rbp),%rbp 1063 pand %xmm4,%xmm12 1064 pand %xmm5,%xmm13 1065 pand %xmm6,%xmm14 1066 pand %xmm7,%xmm15 1067 por %xmm10,%xmm8 1068 por %xmm11,%xmm9 1069 por %xmm12,%xmm8 1070 por %xmm13,%xmm9 1071 por %xmm14,%xmm8 1072 por %xmm15,%xmm9 1073 1074 por %xmm9,%xmm8 1075 pshufd $0x4e,%xmm8,%xmm9 1076 por %xmm9,%xmm8 1077.byte 102,76,15,126,194 1078 1079.byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00 1080 adcxq %rax,%rbx 1081 adoxq %r9,%r8 1082 1083 mulxq 8(%rsi),%rax,%r9 1084 adcxq %rax,%r8 1085 adoxq %r10,%r9 1086 1087 mulxq 16(%rsi),%rax,%r10 
1088 adcxq %rax,%r9 1089 adoxq %r11,%r10 1090 1091.byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 1092 adcxq %rax,%r10 1093 adoxq %r12,%r11 1094 1095 mulxq 32(%rsi),%rax,%r12 1096 adcxq %rax,%r11 1097 adoxq %r13,%r12 1098 1099 mulxq 40(%rsi),%rax,%r13 1100 adcxq %rax,%r12 1101 adoxq %r14,%r13 1102 1103.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 1104 adcxq %rax,%r13 1105.byte 0x67 1106 adoxq %r15,%r14 1107 1108 mulxq 56(%rsi),%rax,%r15 1109 movq %rbx,64(%rsp,%rcx,8) 1110 adcxq %rax,%r14 1111 adoxq %rdi,%r15 1112 movq %r8,%rbx 1113 adcxq %rdi,%r15 1114 1115 incq %rcx 1116 jnz .Loop_mulx_gather 1117 1118 movq %r8,64(%rsp) 1119 movq %r9,64+8(%rsp) 1120 movq %r10,64+16(%rsp) 1121 movq %r11,64+24(%rsp) 1122 movq %r12,64+32(%rsp) 1123 movq %r13,64+40(%rsp) 1124 movq %r14,64+48(%rsp) 1125 movq %r15,64+56(%rsp) 1126 1127 movq 128(%rsp),%rdx 1128 movq 128+8(%rsp),%rdi 1129 movq 128+16(%rsp),%rbp 1130 1131 movq (%rsp),%r8 1132 movq 8(%rsp),%r9 1133 movq 16(%rsp),%r10 1134 movq 24(%rsp),%r11 1135 movq 32(%rsp),%r12 1136 movq 40(%rsp),%r13 1137 movq 48(%rsp),%r14 1138 movq 56(%rsp),%r15 1139 1140 call __rsaz_512_reducex 1141 1142.Lmul_gather_tail: 1143 addq 64(%rsp),%r8 1144 adcq 72(%rsp),%r9 1145 adcq 80(%rsp),%r10 1146 adcq 88(%rsp),%r11 1147 adcq 96(%rsp),%r12 1148 adcq 104(%rsp),%r13 1149 adcq 112(%rsp),%r14 1150 adcq 120(%rsp),%r15 1151 sbbq %rcx,%rcx 1152 1153 call __rsaz_512_subtract 1154 1155 leaq 128+24+48(%rsp),%rax 1156 movq -48(%rax),%r15 1157 movq -40(%rax),%r14 1158 movq -32(%rax),%r13 1159 movq -24(%rax),%r12 1160 movq -16(%rax),%rbp 1161 movq -8(%rax),%rbx 1162 leaq (%rax),%rsp 1163.Lmul_gather4_epilogue: 1164 .byte 0xf3,0xc3 1165.size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4 1166.globl rsaz_512_mul_scatter4 1167.type rsaz_512_mul_scatter4,@function 1168.align 32 1169rsaz_512_mul_scatter4: 1170 pushq %rbx 1171 pushq %rbp 1172 pushq %r12 1173 pushq %r13 1174 pushq %r14 1175 pushq %r15 1176 1177 movl %r9d,%r9d 1178 subq $128+24,%rsp 
1179.Lmul_scatter4_body: 1180 leaq (%r8,%r9,8),%r8 1181.byte 102,72,15,110,199 1182.byte 102,72,15,110,202 1183.byte 102,73,15,110,208 1184 movq %rcx,128(%rsp) 1185 1186 movq %rdi,%rbp 1187 movl $0x80100,%r11d 1188 andl OPENSSL_ia32cap_P+8(%rip),%r11d 1189 cmpl $0x80100,%r11d 1190 je .Lmulx_scatter 1191 movq (%rdi),%rbx 1192 call __rsaz_512_mul 1193 1194.byte 102,72,15,126,199 1195.byte 102,72,15,126,205 1196 1197 movq (%rsp),%r8 1198 movq 8(%rsp),%r9 1199 movq 16(%rsp),%r10 1200 movq 24(%rsp),%r11 1201 movq 32(%rsp),%r12 1202 movq 40(%rsp),%r13 1203 movq 48(%rsp),%r14 1204 movq 56(%rsp),%r15 1205 1206 call __rsaz_512_reduce 1207 jmp .Lmul_scatter_tail 1208 1209.align 32 1210.Lmulx_scatter: 1211 movq (%rdi),%rdx 1212 call __rsaz_512_mulx 1213 1214.byte 102,72,15,126,199 1215.byte 102,72,15,126,205 1216 1217 movq 128(%rsp),%rdx 1218 movq (%rsp),%r8 1219 movq 8(%rsp),%r9 1220 movq 16(%rsp),%r10 1221 movq 24(%rsp),%r11 1222 movq 32(%rsp),%r12 1223 movq 40(%rsp),%r13 1224 movq 48(%rsp),%r14 1225 movq 56(%rsp),%r15 1226 1227 call __rsaz_512_reducex 1228 1229.Lmul_scatter_tail: 1230 addq 64(%rsp),%r8 1231 adcq 72(%rsp),%r9 1232 adcq 80(%rsp),%r10 1233 adcq 88(%rsp),%r11 1234 adcq 96(%rsp),%r12 1235 adcq 104(%rsp),%r13 1236 adcq 112(%rsp),%r14 1237 adcq 120(%rsp),%r15 1238.byte 102,72,15,126,214 1239 sbbq %rcx,%rcx 1240 1241 call __rsaz_512_subtract 1242 1243 movq %r8,0(%rsi) 1244 movq %r9,128(%rsi) 1245 movq %r10,256(%rsi) 1246 movq %r11,384(%rsi) 1247 movq %r12,512(%rsi) 1248 movq %r13,640(%rsi) 1249 movq %r14,768(%rsi) 1250 movq %r15,896(%rsi) 1251 1252 leaq 128+24+48(%rsp),%rax 1253 movq -48(%rax),%r15 1254 movq -40(%rax),%r14 1255 movq -32(%rax),%r13 1256 movq -24(%rax),%r12 1257 movq -16(%rax),%rbp 1258 movq -8(%rax),%rbx 1259 leaq (%rax),%rsp 1260.Lmul_scatter4_epilogue: 1261 .byte 0xf3,0xc3 1262.size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4 1263.globl rsaz_512_mul_by_one 1264.type rsaz_512_mul_by_one,@function 1265.align 32 1266rsaz_512_mul_by_one: 1267 pushq 
%rbx 1268 pushq %rbp 1269 pushq %r12 1270 pushq %r13 1271 pushq %r14 1272 pushq %r15 1273 1274 subq $128+24,%rsp 1275.Lmul_by_one_body: 1276 movl OPENSSL_ia32cap_P+8(%rip),%eax 1277 movq %rdx,%rbp 1278 movq %rcx,128(%rsp) 1279 1280 movq (%rsi),%r8 1281 pxor %xmm0,%xmm0 1282 movq 8(%rsi),%r9 1283 movq 16(%rsi),%r10 1284 movq 24(%rsi),%r11 1285 movq 32(%rsi),%r12 1286 movq 40(%rsi),%r13 1287 movq 48(%rsi),%r14 1288 movq 56(%rsi),%r15 1289 1290 movdqa %xmm0,(%rsp) 1291 movdqa %xmm0,16(%rsp) 1292 movdqa %xmm0,32(%rsp) 1293 movdqa %xmm0,48(%rsp) 1294 movdqa %xmm0,64(%rsp) 1295 movdqa %xmm0,80(%rsp) 1296 movdqa %xmm0,96(%rsp) 1297 andl $0x80100,%eax 1298 cmpl $0x80100,%eax 1299 je .Lby_one_callx 1300 call __rsaz_512_reduce 1301 jmp .Lby_one_tail 1302.align 32 1303.Lby_one_callx: 1304 movq 128(%rsp),%rdx 1305 call __rsaz_512_reducex 1306.Lby_one_tail: 1307 movq %r8,(%rdi) 1308 movq %r9,8(%rdi) 1309 movq %r10,16(%rdi) 1310 movq %r11,24(%rdi) 1311 movq %r12,32(%rdi) 1312 movq %r13,40(%rdi) 1313 movq %r14,48(%rdi) 1314 movq %r15,56(%rdi) 1315 1316 leaq 128+24+48(%rsp),%rax 1317 movq -48(%rax),%r15 1318 movq -40(%rax),%r14 1319 movq -32(%rax),%r13 1320 movq -24(%rax),%r12 1321 movq -16(%rax),%rbp 1322 movq -8(%rax),%rbx 1323 leaq (%rax),%rsp 1324.Lmul_by_one_epilogue: 1325 .byte 0xf3,0xc3 1326.size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one 1327.type __rsaz_512_reduce,@function 1328.align 32 1329__rsaz_512_reduce: 1330 movq %r8,%rbx 1331 imulq 128+8(%rsp),%rbx 1332 movq 0(%rbp),%rax 1333 movl $8,%ecx 1334 jmp .Lreduction_loop 1335 1336.align 32 1337.Lreduction_loop: 1338 mulq %rbx 1339 movq 8(%rbp),%rax 1340 negq %r8 1341 movq %rdx,%r8 1342 adcq $0,%r8 1343 1344 mulq %rbx 1345 addq %rax,%r9 1346 movq 16(%rbp),%rax 1347 adcq $0,%rdx 1348 addq %r9,%r8 1349 movq %rdx,%r9 1350 adcq $0,%r9 1351 1352 mulq %rbx 1353 addq %rax,%r10 1354 movq 24(%rbp),%rax 1355 adcq $0,%rdx 1356 addq %r10,%r9 1357 movq %rdx,%r10 1358 adcq $0,%r10 1359 1360 mulq %rbx 1361 addq %rax,%r11 1362 movq 
32(%rbp),%rax 1363 adcq $0,%rdx 1364 addq %r11,%r10 1365 movq 128+8(%rsp),%rsi 1366 1367 1368 adcq $0,%rdx 1369 movq %rdx,%r11 1370 1371 mulq %rbx 1372 addq %rax,%r12 1373 movq 40(%rbp),%rax 1374 adcq $0,%rdx 1375 imulq %r8,%rsi 1376 addq %r12,%r11 1377 movq %rdx,%r12 1378 adcq $0,%r12 1379 1380 mulq %rbx 1381 addq %rax,%r13 1382 movq 48(%rbp),%rax 1383 adcq $0,%rdx 1384 addq %r13,%r12 1385 movq %rdx,%r13 1386 adcq $0,%r13 1387 1388 mulq %rbx 1389 addq %rax,%r14 1390 movq 56(%rbp),%rax 1391 adcq $0,%rdx 1392 addq %r14,%r13 1393 movq %rdx,%r14 1394 adcq $0,%r14 1395 1396 mulq %rbx 1397 movq %rsi,%rbx 1398 addq %rax,%r15 1399 movq 0(%rbp),%rax 1400 adcq $0,%rdx 1401 addq %r15,%r14 1402 movq %rdx,%r15 1403 adcq $0,%r15 1404 1405 decl %ecx 1406 jne .Lreduction_loop 1407 1408 .byte 0xf3,0xc3 1409.size __rsaz_512_reduce,.-__rsaz_512_reduce 1410.type __rsaz_512_reducex,@function 1411.align 32 1412__rsaz_512_reducex: 1413 1414 imulq %r8,%rdx 1415 xorq %rsi,%rsi 1416 movl $8,%ecx 1417 jmp .Lreduction_loopx 1418 1419.align 32 1420.Lreduction_loopx: 1421 movq %r8,%rbx 1422 mulxq 0(%rbp),%rax,%r8 1423 adcxq %rbx,%rax 1424 adoxq %r9,%r8 1425 1426 mulxq 8(%rbp),%rax,%r9 1427 adcxq %rax,%r8 1428 adoxq %r10,%r9 1429 1430 mulxq 16(%rbp),%rbx,%r10 1431 adcxq %rbx,%r9 1432 adoxq %r11,%r10 1433 1434 mulxq 24(%rbp),%rbx,%r11 1435 adcxq %rbx,%r10 1436 adoxq %r12,%r11 1437 1438.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 1439 movq %rdx,%rax 1440 movq %r8,%rdx 1441 adcxq %rbx,%r11 1442 adoxq %r13,%r12 1443 1444 mulxq 128+8(%rsp),%rbx,%rdx 1445 movq %rax,%rdx 1446 1447 mulxq 40(%rbp),%rax,%r13 1448 adcxq %rax,%r12 1449 adoxq %r14,%r13 1450 1451.byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 1452 adcxq %rax,%r13 1453 adoxq %r15,%r14 1454 1455 mulxq 56(%rbp),%rax,%r15 1456 movq %rbx,%rdx 1457 adcxq %rax,%r14 1458 adoxq %rsi,%r15 1459 adcxq %rsi,%r15 1460 1461 decl %ecx 1462 jne .Lreduction_loopx 1463 1464 .byte 0xf3,0xc3 1465.size __rsaz_512_reducex,.-__rsaz_512_reducex 1466.type 
__rsaz_512_subtract,@function 1467.align 32 1468__rsaz_512_subtract: 1469 movq %r8,(%rdi) 1470 movq %r9,8(%rdi) 1471 movq %r10,16(%rdi) 1472 movq %r11,24(%rdi) 1473 movq %r12,32(%rdi) 1474 movq %r13,40(%rdi) 1475 movq %r14,48(%rdi) 1476 movq %r15,56(%rdi) 1477 1478 movq 0(%rbp),%r8 1479 movq 8(%rbp),%r9 1480 negq %r8 1481 notq %r9 1482 andq %rcx,%r8 1483 movq 16(%rbp),%r10 1484 andq %rcx,%r9 1485 notq %r10 1486 movq 24(%rbp),%r11 1487 andq %rcx,%r10 1488 notq %r11 1489 movq 32(%rbp),%r12 1490 andq %rcx,%r11 1491 notq %r12 1492 movq 40(%rbp),%r13 1493 andq %rcx,%r12 1494 notq %r13 1495 movq 48(%rbp),%r14 1496 andq %rcx,%r13 1497 notq %r14 1498 movq 56(%rbp),%r15 1499 andq %rcx,%r14 1500 notq %r15 1501 andq %rcx,%r15 1502 1503 addq (%rdi),%r8 1504 adcq 8(%rdi),%r9 1505 adcq 16(%rdi),%r10 1506 adcq 24(%rdi),%r11 1507 adcq 32(%rdi),%r12 1508 adcq 40(%rdi),%r13 1509 adcq 48(%rdi),%r14 1510 adcq 56(%rdi),%r15 1511 1512 movq %r8,(%rdi) 1513 movq %r9,8(%rdi) 1514 movq %r10,16(%rdi) 1515 movq %r11,24(%rdi) 1516 movq %r12,32(%rdi) 1517 movq %r13,40(%rdi) 1518 movq %r14,48(%rdi) 1519 movq %r15,56(%rdi) 1520 1521 .byte 0xf3,0xc3 1522.size __rsaz_512_subtract,.-__rsaz_512_subtract 1523.type __rsaz_512_mul,@function 1524.align 32 1525__rsaz_512_mul: 1526 leaq 8(%rsp),%rdi 1527 1528 movq (%rsi),%rax 1529 mulq %rbx 1530 movq %rax,(%rdi) 1531 movq 8(%rsi),%rax 1532 movq %rdx,%r8 1533 1534 mulq %rbx 1535 addq %rax,%r8 1536 movq 16(%rsi),%rax 1537 movq %rdx,%r9 1538 adcq $0,%r9 1539 1540 mulq %rbx 1541 addq %rax,%r9 1542 movq 24(%rsi),%rax 1543 movq %rdx,%r10 1544 adcq $0,%r10 1545 1546 mulq %rbx 1547 addq %rax,%r10 1548 movq 32(%rsi),%rax 1549 movq %rdx,%r11 1550 adcq $0,%r11 1551 1552 mulq %rbx 1553 addq %rax,%r11 1554 movq 40(%rsi),%rax 1555 movq %rdx,%r12 1556 adcq $0,%r12 1557 1558 mulq %rbx 1559 addq %rax,%r12 1560 movq 48(%rsi),%rax 1561 movq %rdx,%r13 1562 adcq $0,%r13 1563 1564 mulq %rbx 1565 addq %rax,%r13 1566 movq 56(%rsi),%rax 1567 movq %rdx,%r14 1568 adcq $0,%r14 1569 
1570 mulq %rbx 1571 addq %rax,%r14 1572 movq (%rsi),%rax 1573 movq %rdx,%r15 1574 adcq $0,%r15 1575 1576 leaq 8(%rbp),%rbp 1577 leaq 8(%rdi),%rdi 1578 1579 movl $7,%ecx 1580 jmp .Loop_mul 1581 1582.align 32 1583.Loop_mul: 1584 movq (%rbp),%rbx 1585 mulq %rbx 1586 addq %rax,%r8 1587 movq 8(%rsi),%rax 1588 movq %r8,(%rdi) 1589 movq %rdx,%r8 1590 adcq $0,%r8 1591 1592 mulq %rbx 1593 addq %rax,%r9 1594 movq 16(%rsi),%rax 1595 adcq $0,%rdx 1596 addq %r9,%r8 1597 movq %rdx,%r9 1598 adcq $0,%r9 1599 1600 mulq %rbx 1601 addq %rax,%r10 1602 movq 24(%rsi),%rax 1603 adcq $0,%rdx 1604 addq %r10,%r9 1605 movq %rdx,%r10 1606 adcq $0,%r10 1607 1608 mulq %rbx 1609 addq %rax,%r11 1610 movq 32(%rsi),%rax 1611 adcq $0,%rdx 1612 addq %r11,%r10 1613 movq %rdx,%r11 1614 adcq $0,%r11 1615 1616 mulq %rbx 1617 addq %rax,%r12 1618 movq 40(%rsi),%rax 1619 adcq $0,%rdx 1620 addq %r12,%r11 1621 movq %rdx,%r12 1622 adcq $0,%r12 1623 1624 mulq %rbx 1625 addq %rax,%r13 1626 movq 48(%rsi),%rax 1627 adcq $0,%rdx 1628 addq %r13,%r12 1629 movq %rdx,%r13 1630 adcq $0,%r13 1631 1632 mulq %rbx 1633 addq %rax,%r14 1634 movq 56(%rsi),%rax 1635 adcq $0,%rdx 1636 addq %r14,%r13 1637 movq %rdx,%r14 1638 leaq 8(%rbp),%rbp 1639 adcq $0,%r14 1640 1641 mulq %rbx 1642 addq %rax,%r15 1643 movq (%rsi),%rax 1644 adcq $0,%rdx 1645 addq %r15,%r14 1646 movq %rdx,%r15 1647 adcq $0,%r15 1648 1649 leaq 8(%rdi),%rdi 1650 1651 decl %ecx 1652 jnz .Loop_mul 1653 1654 movq %r8,(%rdi) 1655 movq %r9,8(%rdi) 1656 movq %r10,16(%rdi) 1657 movq %r11,24(%rdi) 1658 movq %r12,32(%rdi) 1659 movq %r13,40(%rdi) 1660 movq %r14,48(%rdi) 1661 movq %r15,56(%rdi) 1662 1663 .byte 0xf3,0xc3 1664.size __rsaz_512_mul,.-__rsaz_512_mul 1665.type __rsaz_512_mulx,@function 1666.align 32 1667__rsaz_512_mulx: 1668 mulxq (%rsi),%rbx,%r8 1669 movq $-6,%rcx 1670 1671 mulxq 8(%rsi),%rax,%r9 1672 movq %rbx,8(%rsp) 1673 1674 mulxq 16(%rsi),%rbx,%r10 1675 adcq %rax,%r8 1676 1677 mulxq 24(%rsi),%rax,%r11 1678 adcq %rbx,%r9 1679 1680 mulxq 32(%rsi),%rbx,%r12 
1681 adcq %rax,%r10 1682 1683 mulxq 40(%rsi),%rax,%r13 1684 adcq %rbx,%r11 1685 1686 mulxq 48(%rsi),%rbx,%r14 1687 adcq %rax,%r12 1688 1689 mulxq 56(%rsi),%rax,%r15 1690 movq 8(%rbp),%rdx 1691 adcq %rbx,%r13 1692 adcq %rax,%r14 1693 adcq $0,%r15 1694 1695 xorq %rdi,%rdi 1696 jmp .Loop_mulx 1697 1698.align 32 1699.Loop_mulx: 1700 movq %r8,%rbx 1701 mulxq (%rsi),%rax,%r8 1702 adcxq %rax,%rbx 1703 adoxq %r9,%r8 1704 1705 mulxq 8(%rsi),%rax,%r9 1706 adcxq %rax,%r8 1707 adoxq %r10,%r9 1708 1709 mulxq 16(%rsi),%rax,%r10 1710 adcxq %rax,%r9 1711 adoxq %r11,%r10 1712 1713 mulxq 24(%rsi),%rax,%r11 1714 adcxq %rax,%r10 1715 adoxq %r12,%r11 1716 1717.byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 1718 adcxq %rax,%r11 1719 adoxq %r13,%r12 1720 1721 mulxq 40(%rsi),%rax,%r13 1722 adcxq %rax,%r12 1723 adoxq %r14,%r13 1724 1725 mulxq 48(%rsi),%rax,%r14 1726 adcxq %rax,%r13 1727 adoxq %r15,%r14 1728 1729 mulxq 56(%rsi),%rax,%r15 1730 movq 64(%rbp,%rcx,8),%rdx 1731 movq %rbx,8+64-8(%rsp,%rcx,8) 1732 adcxq %rax,%r14 1733 adoxq %rdi,%r15 1734 adcxq %rdi,%r15 1735 1736 incq %rcx 1737 jnz .Loop_mulx 1738 1739 movq %r8,%rbx 1740 mulxq (%rsi),%rax,%r8 1741 adcxq %rax,%rbx 1742 adoxq %r9,%r8 1743 1744.byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 1745 adcxq %rax,%r8 1746 adoxq %r10,%r9 1747 1748.byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 1749 adcxq %rax,%r9 1750 adoxq %r11,%r10 1751 1752 mulxq 24(%rsi),%rax,%r11 1753 adcxq %rax,%r10 1754 adoxq %r12,%r11 1755 1756 mulxq 32(%rsi),%rax,%r12 1757 adcxq %rax,%r11 1758 adoxq %r13,%r12 1759 1760 mulxq 40(%rsi),%rax,%r13 1761 adcxq %rax,%r12 1762 adoxq %r14,%r13 1763 1764.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 1765 adcxq %rax,%r13 1766 adoxq %r15,%r14 1767 1768.byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 1769 adcxq %rax,%r14 1770 adoxq %rdi,%r15 1771 adcxq %rdi,%r15 1772 1773 movq %rbx,8+64-8(%rsp) 1774 movq %r8,8+64(%rsp) 1775 movq %r9,8+64+8(%rsp) 1776 movq %r10,8+64+16(%rsp) 1777 movq %r11,8+64+24(%rsp) 1778 
/*
 * Tail of __rsaz_512_mulx (the routine starts above this span): write
 * %r12..%r15 to the stack slots 8+64+32 .. 8+64+56 and return.
 */
	movq	%r12,8+64+32(%rsp)
	movq	%r13,8+64+40(%rsp)
	movq	%r14,8+64+48(%rsp)
	movq	%r15,8+64+56(%rsp)

	.byte	0xf3,0xc3			/* repz ret */
.size	__rsaz_512_mulx,.-__rsaz_512_mulx

/*
 * rsaz_512_scatter4
 *
 * Copies eight consecutive 64-bit words from (%rsi) into a strided
 * table: word j goes to %rdi + 8*%rdx + 128*j, for j = 0..7.
 *
 * In:    %rdi = table base, %rsi = source words, %rdx = slot index
 *        (roles as read by the code below; NOTE(review): the C
 *        prototype is not visible in this span - confirm it).
 * Out:   no result is produced on purpose; %rax is left holding the
 *        last word copied.
 * Clobb: %rax, %rsi, %rdi, %r9, flags
 *
 * NOTE(review): the slot index is taken from all 64 bits of %rdx,
 * whereas rsaz_512_gather4 reads only %edx - confirm that callers
 * pass a zero-extended value.
 */
.globl	rsaz_512_scatter4
.type	rsaz_512_scatter4,@function
.align	16
rsaz_512_scatter4:
	leaq	(%rdi,%rdx,8),%rdi		/* rdi = &table[slot] in row 0 */
	movl	$8,%r9d				/* eight words to copy */
	jmp	.Lscatter4_next_word		/* hop over the alignment padding */
.align	16
.Lscatter4_next_word:
	movq	(%rsi),%rax
	leaq	8(%rsi),%rsi			/* next source word */
	movq	%rax,(%rdi)
	leaq	128(%rdi),%rdi			/* same slot, next 128-byte row */
	decl	%r9d
	jnz	.Lscatter4_next_word
	.byte	0xf3,0xc3			/* repz ret */
.size	rsaz_512_scatter4,.-rsaz_512_scatter4

/*
 * rsaz_512_gather4
 *
 * Reads back the layout produced by rsaz_512_scatter4: for each of
 * eight 128-byte rows starting at %rsi, selects the 64-bit entry whose
 * position (0..15) equals %edx and stores it to the next word at %rdi.
 *
 * Selection uses compare masks rather than an indexed load: all 16
 * entries of every row are loaded, ANDed with a per-entry mask and
 * ORed together, so the addresses touched do not depend on %edx
 * (presumably a cache-timing countermeasure - not stated in this span).
 * An index outside 0..15 matches no mask and produces zero words.
 *
 * In:    %rdi = destination (eight 64-bit words)
 *        %rsi = table; read with movdqa, so it must be 16-byte aligned
 *        %edx = entry index
 * Clobb: %rsi, %rdi, %r9, %xmm0-%xmm15, flags
 */
.globl	rsaz_512_gather4
.type	rsaz_512_gather4,@function
.align	16
rsaz_512_gather4:
	movd	%edx,%xmm8
	movdqa	.Linc+16(%rip),%xmm1		/* step = {2,2,2,2} */
	movdqa	.Linc(%rip),%xmm0		/* ids  = {0,0,1,1} */

	/*
	 * Build eight masks in %xmm0..%xmm7.  Register k ends up all-ones
	 * in its low/high 64-bit half when the index equals 2k / 2k+1.
	 * The id vectors are produced by repeatedly adding the step.
	 */
	pshufd	$0,%xmm8,%xmm8			/* index in all four dwords */
	movdqa	%xmm1,%xmm7
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1			/* {2,2,3,3} */
	pcmpeqd	%xmm8,%xmm0			/* mask: entries 0,1 */
	movdqa	%xmm7,%xmm3
	paddd	%xmm1,%xmm2			/* {4,4,5,5} */
	pcmpeqd	%xmm8,%xmm1			/* mask: entries 2,3 */
	movdqa	%xmm7,%xmm4
	paddd	%xmm2,%xmm3			/* {6,6,7,7} */
	pcmpeqd	%xmm8,%xmm2			/* mask: entries 4,5 */
	movdqa	%xmm7,%xmm5
	paddd	%xmm3,%xmm4			/* {8,8,9,9} */
	pcmpeqd	%xmm8,%xmm3			/* mask: entries 6,7 */
	movdqa	%xmm7,%xmm6
	paddd	%xmm4,%xmm5			/* {10,10,11,11} */
	pcmpeqd	%xmm8,%xmm4			/* mask: entries 8,9 */
	paddd	%xmm5,%xmm6			/* {12,12,13,13} */
	pcmpeqd	%xmm8,%xmm5			/* mask: entries 10,11 */
	paddd	%xmm6,%xmm7			/* {14,14,15,15} */
	pcmpeqd	%xmm8,%xmm6			/* mask: entries 12,13 */
	pcmpeqd	%xmm8,%xmm7			/* mask: entries 14,15 */
	movl	$8,%r9d				/* eight rows to process */
	jmp	.Lgather4_next_word		/* hop over the alignment padding */
.align	16
.Lgather4_next_word:
	/* Load the whole 128-byte row and mask each pair of entries. */
	movdqa	(%rsi),%xmm8			/* index copy is no longer needed */
	movdqa	16(%rsi),%xmm9
	movdqa	32(%rsi),%xmm10
	movdqa	48(%rsi),%xmm11
	pand	%xmm0,%xmm8
	movdqa	64(%rsi),%xmm12
	pand	%xmm1,%xmm9
	movdqa	80(%rsi),%xmm13
	pand	%xmm2,%xmm10
	movdqa	96(%rsi),%xmm14
	pand	%xmm3,%xmm11
	movdqa	112(%rsi),%xmm15
	leaq	128(%rsi),%rsi			/* advance to the next row */
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	/* Merge the eight masked registers; at most one half is non-zero. */
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	$0x4e,%xmm8,%xmm9		/* swap the two 64-bit halves */
	por	%xmm9,%xmm8			/* selected entry now in the low half */
	movq	%xmm8,(%rdi)
	leaq	8(%rdi),%rdi			/* next destination word */
	decl	%r9d
	jnz	.Lgather4_next_word
	.byte	0xf3,0xc3			/* repz ret */
.LSEH_end_rsaz_512_gather4:
.size	rsaz_512_gather4,.-rsaz_512_gather4

/*
 * Constants for rsaz_512_gather4, 64-byte aligned so the movdqa loads
 * are legal.  First row: entry ids held by the two halves of mask
 * register 0.  Second row: amount added to move to the next register.
 */
.align	64
.Linc:
.long	0,0, 1,1
.long	2,2, 2,2