/* rsaz-x86_64.S revision 1.1 */
/*
 * 512-bit modular multiply/square helpers for x86-64.
 *
 * Syntax: AT&T (GAS), run through the C preprocessor, so comments in this
 * file are C-style only.
 *
 * Register use follows the System V AMD64 convention: arguments arrive in
 * rdi, rsi, rdx, rcx, r8, r9 and rbx, rbp, r12-r15 are saved/restored by
 * every public entry point that touches them.
 *
 * This text was restored from a numbered source listing: the listing line
 * numbers that had been fused into the statements were removed and the
 * one-statement-per-line layout rebuilt.  The instruction stream itself is
 * unchanged.
 *
 * Common stack frame of the multiply/square entry points (152 bytes):
 *   0..127(%rsp)   16 x 64-bit limbs of the double-width product
 *   128(%rsp)      4th/5th argument saved for __rsaz_512_reduce (multiplied
 *                  with the low limb on every reduction step)
 *   136(%rsp)      per-function spill slot
 *   144(%rsp)      per-function spill slot
 *
 * ".byte 0xf3,0xc3" is the two-byte "rep ret" return.
 *
 * NOTE(review): argument names used in the comments (out, a, b, mod, n0,
 * table, index, count) are inferred from how each register is used here;
 * confirm against the C prototypes.
 */
#include <machine/asm.h>
.text

/*
 * rsaz_512_sqr
 * In:  rdi = out (8 limbs), rsi = a (8 limbs), rdx = mod (8 limbs),
 *      rcx = n0, r8d = count
 * Squares the 8-limb input, reduces it, and repeats "count" times; after
 * each pass the input pointer is replaced by the output pointer.
 * count must be at least 1: it is decremented and tested only after a pass.
 *
 * NOTE(review): the lea/shr $63 doubling used below matches the upstream
 * OpenSSL squaring code that was later rewritten for CVE-2019-1551 (carry
 * overflow in rsaz_512_sqr) -- confirm whether this copy needs that fix.
 */
.globl	rsaz_512_sqr
.type	rsaz_512_sqr,@function
.align	32
rsaz_512_sqr:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	subq	$128+24,%rsp
.Lsqr_body:
	movq	%rdx,%rbp		/* rbp = mod, used by reduce/subtract */
	movq	(%rsi),%rdx		/* rdx = a[0] */
	movq	8(%rsi),%rax		/* rax = a[1] */
	movq	%rcx,128(%rsp)		/* n0 for __rsaz_512_reduce */
	jmp	.Loop_sqr

.align	32
.Loop_sqr:
	movl	%r8d,128+8(%rsp)	/* spill the remaining pass count */

	/* Row 0: cross products a[0]*a[1..7] into r8..r15. */
	movq	%rdx,%rbx
	mulq	%rdx
	movq	%rax,%r8
	movq	16(%rsi),%rax
	movq	%rdx,%r9

	mulq	%rbx
	addq	%rax,%r9
	movq	24(%rsi),%rax
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r10
	movq	32(%rsi),%rax
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r11
	movq	40(%rsi),%rax
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r12
	movq	48(%rsi),%rax
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r13
	movq	56(%rsi),%rax
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	addq	%rax,%r14
	movq	%rbx,%rax
	movq	%rdx,%r15
	adcq	$0,%r15

	/* Double r8:r9 (old r9 kept in rcx for its top bit), add a[0]^2. */
	addq	%r8,%r8
	movq	%r9,%rcx
	adcq	%r9,%r9

	mulq	%rax
	movq	%rax,(%rsp)
	addq	%rdx,%r8
	adcq	$0,%r9

	movq	%r8,8(%rsp)
	shrq	$63,%rcx		/* rcx = bit shifted out of the old r9 */

	/* Row 1: cross products a[1]*a[2..7], then a[1]^2. */
	movq	8(%rsi),%r8
	movq	16(%rsi),%rax
	mulq	%r8
	addq	%rax,%r10
	movq	24(%rsi),%rax
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%r8
	addq	%rax,%r11
	movq	32(%rsi),%rax
	adcq	$0,%rdx
	addq	%rbx,%r11
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%r8
	addq	%rax,%r12
	movq	40(%rsi),%rax
	adcq	$0,%rdx
	addq	%rbx,%r12
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%r8
	addq	%rax,%r13
	movq	48(%rsi),%rax
	adcq	$0,%rdx
	addq	%rbx,%r13
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%r8
	addq	%rax,%r14
	movq	56(%rsi),%rax
	adcq	$0,%rdx
	addq	%rbx,%r14
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%r8
	addq	%rax,%r15
	movq	%r8,%rax
	adcq	$0,%rdx
	addq	%rbx,%r15
	movq	%rdx,%r8
	movq	%r10,%rdx
	adcq	$0,%r8

	/* add on the copy only produces CF = top bit of r10 for the adc. */
	addq	%rdx,%rdx
	leaq	(%rcx,%r10,2),%r10	/* r10 = 2*r10 + carried-in bit */
	movq	%r11,%rbx
	adcq	%r11,%r11

	mulq	%rax
	addq	%rax,%r9
	adcq	%rdx,%r10
	adcq	$0,%r11

	movq	%r9,16(%rsp)
	movq	%r10,24(%rsp)
	shrq	$63,%rbx

	/* Row 2: cross products a[2]*a[3..7], then a[2]^2. */
	movq	16(%rsi),%r9
	movq	24(%rsi),%rax
	mulq	%r9
	addq	%rax,%r12
	movq	32(%rsi),%rax
	movq	%rdx,%rcx
	adcq	$0,%rcx

	mulq	%r9
	addq	%rax,%r13
	movq	40(%rsi),%rax
	adcq	$0,%rdx
	addq	%rcx,%r13
	movq	%rdx,%rcx
	adcq	$0,%rcx

	mulq	%r9
	addq	%rax,%r14
	movq	48(%rsi),%rax
	adcq	$0,%rdx
	addq	%rcx,%r14
	movq	%rdx,%rcx
	adcq	$0,%rcx

	mulq	%r9
	movq	%r12,%r10
	leaq	(%rbx,%r12,2),%r12
	addq	%rax,%r15
	movq	56(%rsi),%rax
	adcq	$0,%rdx
	addq	%rcx,%r15
	movq	%rdx,%rcx
	adcq	$0,%rcx

	mulq	%r9
	shrq	$63,%r10
	addq	%rax,%r8
	movq	%r9,%rax
	adcq	$0,%rdx
	addq	%rcx,%r8
	movq	%rdx,%r9
	adcq	$0,%r9

	movq	%r13,%rcx
	leaq	(%r10,%r13,2),%r13

	mulq	%rax
	addq	%rax,%r11
	adcq	%rdx,%r12
	adcq	$0,%r13

	movq	%r11,32(%rsp)
	movq	%r12,40(%rsp)
	shrq	$63,%rcx

	/* Row 3: cross products a[3]*a[4..7], then a[3]^2. */
	movq	24(%rsi),%r10
	movq	32(%rsi),%rax
	mulq	%r10
	addq	%rax,%r14
	movq	40(%rsi),%rax
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%r10
	addq	%rax,%r15
	movq	48(%rsi),%rax
	adcq	$0,%rdx
	addq	%rbx,%r15
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%r10
	movq	%r14,%r12
	leaq	(%rcx,%r14,2),%r14
	addq	%rax,%r8
	movq	56(%rsi),%rax
	adcq	$0,%rdx
	addq	%rbx,%r8
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%r10
	shrq	$63,%r12
	addq	%rax,%r9
	movq	%r10,%rax
	adcq	$0,%rdx
	addq	%rbx,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	movq	%r15,%rbx
	leaq	(%r12,%r15,2),%r15

	mulq	%rax
	addq	%rax,%r13
	adcq	%rdx,%r14
	adcq	$0,%r15

	movq	%r13,48(%rsp)
	movq	%r14,56(%rsp)
	shrq	$63,%rbx

	/* Row 4: cross products a[4]*a[5..7], then a[4]^2. */
	movq	32(%rsi),%r11
	movq	40(%rsi),%rax
	mulq	%r11
	addq	%rax,%r8
	movq	48(%rsi),%rax
	movq	%rdx,%rcx
	adcq	$0,%rcx

	mulq	%r11
	addq	%rax,%r9
	movq	56(%rsi),%rax
	adcq	$0,%rdx
	movq	%r8,%r12
	leaq	(%rbx,%r8,2),%r8
	addq	%rcx,%r9
	movq	%rdx,%rcx
	adcq	$0,%rcx

	mulq	%r11
	shrq	$63,%r12
	addq	%rax,%r10
	movq	%r11,%rax
	adcq	$0,%rdx
	addq	%rcx,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	movq	%r9,%rcx
	leaq	(%r12,%r9,2),%r9

	mulq	%rax
	addq	%rax,%r15
	adcq	%rdx,%r8
	adcq	$0,%r9

	movq	%r15,64(%rsp)
	movq	%r8,72(%rsp)
	shrq	$63,%rcx

	/* Row 5: cross products a[5]*a[6..7], then a[5]^2. */
	movq	40(%rsi),%r12
	movq	48(%rsi),%rax
	mulq	%r12
	addq	%rax,%r10
	movq	56(%rsi),%rax
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%r12
	addq	%rax,%r11
	movq	%r12,%rax
	movq	%r10,%r15
	leaq	(%rcx,%r10,2),%r10
	adcq	$0,%rdx
	shrq	$63,%r15
	addq	%rbx,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	movq	%r11,%rbx
	leaq	(%r15,%r11,2),%r11

	mulq	%rax
	addq	%rax,%r9
	adcq	%rdx,%r10
	adcq	$0,%r11

	movq	%r9,80(%rsp)
	movq	%r10,88(%rsp)

	/* Row 6: cross product a[6]*a[7], then a[6]^2. */
	movq	48(%rsi),%r13
	movq	56(%rsi),%rax
	mulq	%r13
	addq	%rax,%r12
	movq	%r13,%rax
	movq	%rdx,%r13
	adcq	$0,%r13

	xorq	%r14,%r14
	shlq	$1,%rbx			/* CF = top bit of the saved r11 copy */
	adcq	%r12,%r12
	adcq	%r13,%r13
	adcq	%r14,%r14

	mulq	%rax
	addq	%rax,%r11
	adcq	%rdx,%r12
	adcq	$0,%r13

	movq	%r11,96(%rsp)
	movq	%r12,104(%rsp)

	/* Row 7: a[7]^2 finishes the top two limbs. */
	movq	56(%rsi),%rax
	mulq	%rax
	addq	%rax,%r13
	adcq	$0,%rdx

	addq	%rdx,%r14

	movq	%r13,112(%rsp)
	movq	%r14,120(%rsp)

	/* Reduce the low half held in r8..r15, then add the high half. */
	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	call	__rsaz_512_reduce

	addq	64(%rsp),%r8
	adcq	72(%rsp),%r9
	adcq	80(%rsp),%r10
	adcq	88(%rsp),%r11
	adcq	96(%rsp),%r12
	adcq	104(%rsp),%r13
	adcq	112(%rsp),%r14
	adcq	120(%rsp),%r15
	sbbq	%rcx,%rcx		/* rcx = all-ones if the add carried out */

	call	__rsaz_512_subtract

	/* Next pass squares the result: limbs 0 and 1 stay in rdx/rax. */
	movq	%r8,%rdx
	movq	%r9,%rax
	movl	128+8(%rsp),%r8d
	movq	%rdi,%rsi

	decl	%r8d
	jnz	.Loop_sqr

	leaq	128+24+48(%rsp),%rax
	movq	-48(%rax),%r15
	movq	-40(%rax),%r14
	movq	-32(%rax),%r13
	movq	-24(%rax),%r12
	movq	-16(%rax),%rbp
	movq	-8(%rax),%rbx
	leaq	(%rax),%rsp
.Lsqr_epilogue:
	.byte	0xf3,0xc3
.size	rsaz_512_sqr,.-rsaz_512_sqr

/*
 * rsaz_512_mul
 * In:  rdi = out (8 limbs), rsi = a (8 limbs), rdx = b (8 limbs),
 *      rcx = mod (8 limbs), r8 = n0
 * Multiplies a by b into the stack frame, reduces and stores 8 limbs at out.
 * rdi and rcx are parked in xmm0/xmm1 because __rsaz_512_mul overwrites
 * rdi and ecx.
 */
.globl	rsaz_512_mul
.type	rsaz_512_mul,@function
.align	32
rsaz_512_mul:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	subq	$128+24,%rsp
.Lmul_body:
.byte	102,72,15,110,199		/* movq %rdi,%xmm0 */
.byte	102,72,15,110,201		/* movq %rcx,%xmm1 */
	movq	%r8,128(%rsp)		/* n0 for __rsaz_512_reduce */
	movq	(%rdx),%rbx		/* rbx = b[0] */
	movq	%rdx,%rbp		/* rbp = b */
	call	__rsaz_512_mul

.byte	102,72,15,126,199		/* movq %xmm0,%rdi */
.byte	102,72,15,126,205		/* movq %xmm1,%rbp (rbp = mod) */

	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	call	__rsaz_512_reduce
	addq	64(%rsp),%r8
	adcq	72(%rsp),%r9
	adcq	80(%rsp),%r10
	adcq	88(%rsp),%r11
	adcq	96(%rsp),%r12
	adcq	104(%rsp),%r13
	adcq	112(%rsp),%r14
	adcq	120(%rsp),%r15
	sbbq	%rcx,%rcx		/* rcx = all-ones if the add carried out */

	call	__rsaz_512_subtract

	leaq	128+24+48(%rsp),%rax
	movq	-48(%rax),%r15
	movq	-40(%rax),%r14
	movq	-32(%rax),%r13
	movq	-24(%rax),%r12
	movq	-16(%rax),%rbp
	movq	-8(%rax),%rbx
	leaq	(%rax),%rsp
.Lmul_epilogue:
	.byte	0xf3,0xc3
.size	rsaz_512_mul,.-rsaz_512_mul

/*
 * rsaz_512_mul_gather4
 * In:  rdi = out (8 limbs), rsi = a (8 limbs), rdx = table,
 *      rcx = mod (8 limbs), r8 = n0, r9d = index
 * Multiplies a by the table entry selected by index, reduces and stores
 * 8 limbs at out.  Each limb of the multiplier is fetched by reading a full
 * 128-byte table row and masking, so the addresses touched do not depend
 * on index.  The table is read with movdqa and therefore has to be 16-byte
 * aligned.
 * Frame slots: 128 = n0, 136 = out, 144 = mod.
 */
.globl	rsaz_512_mul_gather4
.type	rsaz_512_mul_gather4,@function
.align	32
rsaz_512_mul_gather4:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	subq	$152,%rsp
.Lmul_gather4_body:
	movd	%r9d,%xmm8
	movdqa	.Linc+16(%rip),%xmm1	/* {2,2,2,2} */
	movdqa	.Linc(%rip),%xmm0	/* {0,0,1,1} */

	/*
	 * Build eight masks xmm0..xmm7: counters {0,0,1,1} .. {14,14,15,15}
	 * are each compared with the broadcast index, leaving all-ones in
	 * the single 64-bit lane whose number equals index.
	 */
	pshufd	$0,%xmm8,%xmm8
	movdqa	%xmm1,%xmm7
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm8,%xmm0
	movdqa	%xmm7,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm8,%xmm1
	movdqa	%xmm7,%xmm4
	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm8,%xmm2
	movdqa	%xmm7,%xmm5
	paddd	%xmm3,%xmm4
	pcmpeqd	%xmm8,%xmm3
	movdqa	%xmm7,%xmm6
	paddd	%xmm4,%xmm5
	pcmpeqd	%xmm8,%xmm4
	paddd	%xmm5,%xmm6
	pcmpeqd	%xmm8,%xmm5
	paddd	%xmm6,%xmm7
	pcmpeqd	%xmm8,%xmm6
	pcmpeqd	%xmm8,%xmm7

	/* Gather multiplier limb 0 from the first 128-byte row. */
	movdqa	0(%rdx),%xmm8
	movdqa	16(%rdx),%xmm9
	movdqa	32(%rdx),%xmm10
	movdqa	48(%rdx),%xmm11
	pand	%xmm0,%xmm8
	movdqa	64(%rdx),%xmm12
	pand	%xmm1,%xmm9
	movdqa	80(%rdx),%xmm13
	pand	%xmm2,%xmm10
	movdqa	96(%rdx),%xmm14
	pand	%xmm3,%xmm11
	movdqa	112(%rdx),%xmm15
	leaq	128(%rdx),%rbp		/* rbp = next table row */
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	$0x4e,%xmm8,%xmm9	/* swap halves to fold the two lanes */
	por	%xmm9,%xmm8
.byte	102,76,15,126,195		/* movq %xmm8,%rbx */

	movq	%r8,128(%rsp)		/* n0 for __rsaz_512_reduce */
	movq	%rdi,128+8(%rsp)	/* out */
	movq	%rcx,128+16(%rsp)	/* mod */

	/* First row: a[0..7] * limb 0. */
	movq	(%rsi),%rax
	movq	8(%rsi),%rcx
	mulq	%rbx
	movq	%rax,(%rsp)
	movq	%rcx,%rax
	movq	%rdx,%r8

	mulq	%rbx
	addq	%rax,%r8
	movq	16(%rsi),%rax
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r9
	movq	24(%rsi),%rax
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r10
	movq	32(%rsi),%rax
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r11
	movq	40(%rsi),%rax
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r12
	movq	48(%rsi),%rax
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r13
	movq	56(%rsi),%rax
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	addq	%rax,%r14
	movq	(%rsi),%rax
	movq	%rdx,%r15
	adcq	$0,%r15

	leaq	8(%rsp),%rdi		/* rdi = next product limb to store */
	movl	$7,%ecx			/* seven multiplier limbs remain */
	jmp	.Loop_mul_gather

.align	32
.Loop_mul_gather:
	/* Gather the next multiplier limb from the row at rbp. */
	movdqa	0(%rbp),%xmm8
	movdqa	16(%rbp),%xmm9
	movdqa	32(%rbp),%xmm10
	movdqa	48(%rbp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	64(%rbp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	80(%rbp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	96(%rbp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	112(%rbp),%xmm15
	leaq	128(%rbp),%rbp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
.byte	102,76,15,126,195		/* movq %xmm8,%rbx */

	/* Accumulate a[0..7] * limb; the finished low limb goes to (rdi). */
	mulq	%rbx
	addq	%rax,%r8
	movq	8(%rsi),%rax
	movq	%r8,(%rdi)
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rsi),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rsi),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rsi),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	addq	%rax,%r15
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	leaq	8(%rdi),%rdi

	decl	%ecx
	jnz	.Loop_mul_gather

	/* Flush the eight high limbs still held in registers. */
	movq	%r8,(%rdi)
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	movq	128+8(%rsp),%rdi	/* rdi = out */
	movq	128+16(%rsp),%rbp	/* rbp = mod */

	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	call	__rsaz_512_reduce
	addq	64(%rsp),%r8
	adcq	72(%rsp),%r9
	adcq	80(%rsp),%r10
	adcq	88(%rsp),%r11
	adcq	96(%rsp),%r12
	adcq	104(%rsp),%r13
	adcq	112(%rsp),%r14
	adcq	120(%rsp),%r15
	sbbq	%rcx,%rcx		/* rcx = all-ones if the add carried out */

	call	__rsaz_512_subtract

	leaq	128+24+48(%rsp),%rax
	movq	-48(%rax),%r15
	movq	-40(%rax),%r14
	movq	-32(%rax),%r13
	movq	-24(%rax),%r12
	movq	-16(%rax),%rbp
	movq	-8(%rax),%rbx
	leaq	(%rax),%rsp
.Lmul_gather4_epilogue:
	.byte	0xf3,0xc3
.size	rsaz_512_mul_gather4,.-rsaz_512_mul_gather4

/*
 * rsaz_512_mul_scatter4
 * In:  rdi = out (8 limbs, also the second multiplicand), rsi = a (8 limbs),
 *      rdx = mod (8 limbs), rcx = n0, r8 = table, r9d = index
 * Multiplies a by the current contents of out, reduces, stores the 8 limbs
 * at out and also writes them into the table at table + index*8 with a
 * stride of 128 bytes.
 */
.globl	rsaz_512_mul_scatter4
.type	rsaz_512_mul_scatter4,@function
.align	32
rsaz_512_mul_scatter4:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	movl	%r9d,%r9d		/* zero-extend the 32-bit index */
	subq	$128+24,%rsp
.Lmul_scatter4_body:
	leaq	(%r8,%r9,8),%r8		/* r8 = table + index*8 */
.byte	102,72,15,110,199		/* movq %rdi,%xmm0 */
.byte	102,72,15,110,202		/* movq %rdx,%xmm1 */
.byte	102,73,15,110,208		/* movq %r8,%xmm2 */
	movq	%rcx,128(%rsp)		/* n0 for __rsaz_512_reduce */

	movq	%rdi,%rbp		/* second multiplicand = out */
	movq	(%rdi),%rbx
	call	__rsaz_512_mul

.byte	102,72,15,126,199		/* movq %xmm0,%rdi */
.byte	102,72,15,126,205		/* movq %xmm1,%rbp (rbp = mod) */

	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	call	__rsaz_512_reduce
	addq	64(%rsp),%r8
	adcq	72(%rsp),%r9
	adcq	80(%rsp),%r10
	adcq	88(%rsp),%r11
	adcq	96(%rsp),%r12
	adcq	104(%rsp),%r13
	adcq	112(%rsp),%r14
	adcq	120(%rsp),%r15
.byte	102,72,15,126,214		/* movq %xmm2,%rsi (flags untouched) */
	sbbq	%rcx,%rcx		/* rcx = all-ones if the add carried out */

	call	__rsaz_512_subtract

	/* Scatter the result: one limb per 128-byte table row. */
	movq	%r8,0(%rsi)
	movq	%r9,128(%rsi)
	movq	%r10,256(%rsi)
	movq	%r11,384(%rsi)
	movq	%r12,512(%rsi)
	movq	%r13,640(%rsi)
	movq	%r14,768(%rsi)
	movq	%r15,896(%rsi)

	leaq	128+24+48(%rsp),%rax
	movq	-48(%rax),%r15
	movq	-40(%rax),%r14
	movq	-32(%rax),%r13
	movq	-24(%rax),%r12
	movq	-16(%rax),%rbp
	movq	-8(%rax),%rbx
	leaq	(%rax),%rsp
.Lmul_scatter4_epilogue:
	.byte	0xf3,0xc3
.size	rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4

/*
 * rsaz_512_mul_by_one
 * In:  rdi = out (8 limbs), rsi = inp (8 limbs), rdx = mod (8 limbs),
 *      rcx = n0
 * Runs the eight-step reduction directly on the input limbs and stores the
 * resulting 8 limbs at out; no final conditional subtraction is done here.
 */
.globl	rsaz_512_mul_by_one
.type	rsaz_512_mul_by_one,@function
.align	32
rsaz_512_mul_by_one:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	subq	$128+24,%rsp
.Lmul_by_one_body:
	movq	%rdx,%rbp		/* rbp = mod */
	movq	%rcx,128(%rsp)		/* n0 for __rsaz_512_reduce */

	movq	(%rsi),%r8
	pxor	%xmm0,%xmm0
	movq	8(%rsi),%r9
	movq	16(%rsi),%r10
	movq	24(%rsi),%r11
	movq	32(%rsi),%r12
	movq	40(%rsi),%r13
	movq	48(%rsi),%r14
	movq	56(%rsi),%r15

	/* Clear the first 112 bytes of the scratch frame. */
	movdqa	%xmm0,(%rsp)
	movdqa	%xmm0,16(%rsp)
	movdqa	%xmm0,32(%rsp)
	movdqa	%xmm0,48(%rsp)
	movdqa	%xmm0,64(%rsp)
	movdqa	%xmm0,80(%rsp)
	movdqa	%xmm0,96(%rsp)
	call	__rsaz_512_reduce
	movq	%r8,(%rdi)
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	leaq	128+24+48(%rsp),%rax
	movq	-48(%rax),%r15
	movq	-40(%rax),%r14
	movq	-32(%rax),%r13
	movq	-24(%rax),%r12
	movq	-16(%rax),%rbp
	movq	-8(%rax),%rbx
	leaq	(%rax),%rsp
.Lmul_by_one_epilogue:
	.byte	0xf3,0xc3
.size	rsaz_512_mul_by_one,.-rsaz_512_mul_by_one

/*
 * __rsaz_512_reduce (internal, non-standard register interface)
 * In:  r8..r15 = 8 limbs to reduce, rbp = mod (8 limbs),
 *      128+8(%rsp) = n0 (the caller's 128(%rsp); the call pushed 8 bytes)
 * Out: r8..r15 = reduced limbs
 * Clobbers: rax, rbx, rcx, rdx, rsi, flags
 * Eight rounds; each round computes m = r8 * n0, adds m * mod to the
 * accumulator and drops the lowest limb.
 */
.type	__rsaz_512_reduce,@function
.align	32
__rsaz_512_reduce:
	movq	%r8,%rbx
	imulq	128+8(%rsp),%rbx	/* rbx = m for the first round */
	movq	0(%rbp),%rax
	movl	$8,%ecx
	jmp	.Lreduction_loop

.align	32
.Lreduction_loop:
	mulq	%rbx
	movq	8(%rbp),%rax
	/*
	 * The low product limb is not added; neg is used only for its
	 * carry (CF = 1 when r8 is non-zero), which feeds the adc below.
	 */
	negq	%r8
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	128+8(%rsp),%rsi	/* start computing the next round's m */

	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rbp),%rax
	adcq	$0,%rdx
	imulq	%r8,%rsi		/* rsi = new r8 * n0 */
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	%rsi,%rbx		/* m for the next round */
	addq	%rax,%r15
	movq	0(%rbp),%rax
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jne	.Lreduction_loop

	.byte	0xf3,0xc3
.size	__rsaz_512_reduce,.-__rsaz_512_reduce

/*
 * __rsaz_512_subtract (internal, non-standard register interface)
 * In:  r8..r15 = value, rdi = out (8 limbs), rbp = mod (8 limbs),
 *      rcx = mask (0 or all-ones, produced by sbb in the callers)
 * Out: out[0..7] and r8..r15 = value + ((neg(mod[0]), not(mod[1..7])) & mask)
 * The masked operand is the limb-wise negation of mod, so an all-ones mask
 * subtracts mod and a zero mask leaves the value unchanged; no branch
 * depends on the mask.
 */
.type	__rsaz_512_subtract,@function
.align	32
__rsaz_512_subtract:
	/* Park the value in out; it is added back from there below. */
	movq	%r8,(%rdi)
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	movq	0(%rbp),%r8
	movq	8(%rbp),%r9
	negq	%r8
	notq	%r9
	andq	%rcx,%r8
	movq	16(%rbp),%r10
	andq	%rcx,%r9
	notq	%r10
	movq	24(%rbp),%r11
	andq	%rcx,%r10
	notq	%r11
	movq	32(%rbp),%r12
	andq	%rcx,%r11
	notq	%r12
	movq	40(%rbp),%r13
	andq	%rcx,%r12
	notq	%r13
	movq	48(%rbp),%r14
	andq	%rcx,%r13
	notq	%r14
	movq	56(%rbp),%r15
	andq	%rcx,%r14
	notq	%r15
	andq	%rcx,%r15

	addq	(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15

	movq	%r8,(%rdi)
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	.byte	0xf3,0xc3
.size	__rsaz_512_subtract,.-__rsaz_512_subtract

/*
 * __rsaz_512_mul (internal, non-standard register interface)
 * In:  rsi = a (8 limbs), rbp = b (8 limbs), rbx = b[0]
 * Out: 16-limb product a*b at 8(%rsp)..135(%rsp) as seen here, i.e. at the
 *      caller's 0..127(%rsp) (the call pushed 8 bytes)
 * Clobbers: rax, rbx, rcx, rdx, rdi, rbp, r8..r15, flags
 */
.type	__rsaz_512_mul,@function
.align	32
__rsaz_512_mul:
	leaq	8(%rsp),%rdi		/* rdi = product cursor */

	/* First row: a[0..7] * b[0]. */
	movq	(%rsi),%rax
	mulq	%rbx
	movq	%rax,(%rdi)
	movq	8(%rsi),%rax
	movq	%rdx,%r8

	mulq	%rbx
	addq	%rax,%r8
	movq	16(%rsi),%rax
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r9
	movq	24(%rsi),%rax
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r10
	movq	32(%rsi),%rax
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r11
	movq	40(%rsi),%rax
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r12
	movq	48(%rsi),%rax
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r13
	movq	56(%rsi),%rax
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	addq	%rax,%r14
	movq	(%rsi),%rax
	movq	%rdx,%r15
	adcq	$0,%r15

	leaq	8(%rbp),%rbp
	leaq	8(%rdi),%rdi

	movl	$7,%ecx			/* seven limbs of b remain */
	jmp	.Loop_mul

.align	32
.Loop_mul:
	/* Accumulate a[0..7] * b[i]; the finished low limb goes to (rdi). */
	movq	(%rbp),%rbx
	mulq	%rbx
	addq	%rax,%r8
	movq	8(%rsi),%rax
	movq	%r8,(%rdi)
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rsi),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rsi),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rsi),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	leaq	8(%rbp),%rbp		/* lea keeps CF intact for the adc */
	adcq	$0,%r14

	mulq	%rbx
	addq	%rax,%r15
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	leaq	8(%rdi),%rdi

	decl	%ecx
	jnz	.Loop_mul

	/* Flush the eight high limbs still held in registers. */
	movq	%r8,(%rdi)
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	.byte	0xf3,0xc3
.size	__rsaz_512_mul,.-__rsaz_512_mul

/*
 * rsaz_512_scatter4
 * In:  rdi = table, rsi = inp (8 limbs), rdx = index
 * Copies the 8 input limbs to table + index*8, one limb per 128-byte row.
 * Clobbers: rax, rsi, rdi, r9, flags
 */
.globl	rsaz_512_scatter4
.type	rsaz_512_scatter4,@function
.align	16
rsaz_512_scatter4:
	leaq	(%rdi,%rdx,8),%rdi
	movl	$8,%r9d
	jmp	.Loop_scatter
.align	16
.Loop_scatter:
	movq	(%rsi),%rax
	leaq	8(%rsi),%rsi
	movq	%rax,(%rdi)
	leaq	128(%rdi),%rdi
	decl	%r9d
	jnz	.Loop_scatter
	.byte	0xf3,0xc3
.size	rsaz_512_scatter4,.-rsaz_512_scatter4

/*
 * rsaz_512_gather4
 * In:  rdi = out (8 limbs), rsi = table, edx = index
 * Reads 8 limbs of the table entry selected by index.  Every 128-byte row is
 * read in full and masked, so the addresses touched do not depend on index.
 * The table is read with movdqa and therefore has to be 16-byte aligned.
 * Clobbers: rsi, rdi, r9, xmm0..xmm15, flags
 */
.globl	rsaz_512_gather4
.type	rsaz_512_gather4,@function
.align	16
rsaz_512_gather4:
	movd	%edx,%xmm8
	movdqa	.Linc+16(%rip),%xmm1	/* {2,2,2,2} */
	movdqa	.Linc(%rip),%xmm0	/* {0,0,1,1} */

	/* Masks xmm0..xmm7: all-ones in the 64-bit lane numbered index. */
	pshufd	$0,%xmm8,%xmm8
	movdqa	%xmm1,%xmm7
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm8,%xmm0
	movdqa	%xmm7,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm8,%xmm1
	movdqa	%xmm7,%xmm4
	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm8,%xmm2
	movdqa	%xmm7,%xmm5
	paddd	%xmm3,%xmm4
	pcmpeqd	%xmm8,%xmm3
	movdqa	%xmm7,%xmm6
	paddd	%xmm4,%xmm5
	pcmpeqd	%xmm8,%xmm4
	paddd	%xmm5,%xmm6
	pcmpeqd	%xmm8,%xmm5
	paddd	%xmm6,%xmm7
	pcmpeqd	%xmm8,%xmm6
	pcmpeqd	%xmm8,%xmm7
	movl	$8,%r9d			/* eight limbs to gather */
	jmp	.Loop_gather
.align	16
.Loop_gather:
	movdqa	0(%rsi),%xmm8
	movdqa	16(%rsi),%xmm9
	movdqa	32(%rsi),%xmm10
	movdqa	48(%rsi),%xmm11
	pand	%xmm0,%xmm8
	movdqa	64(%rsi),%xmm12
	pand	%xmm1,%xmm9
	movdqa	80(%rsi),%xmm13
	pand	%xmm2,%xmm10
	movdqa	96(%rsi),%xmm14
	pand	%xmm3,%xmm11
	movdqa	112(%rsi),%xmm15
	leaq	128(%rsi),%rsi
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	$0x4e,%xmm8,%xmm9	/* swap halves to fold the two lanes */
	por	%xmm9,%xmm8
	movq	%xmm8,(%rdi)
	leaq	8(%rdi),%rdi
	decl	%r9d
	jnz	.Loop_gather
	.byte	0xf3,0xc3
.LSEH_end_rsaz_512_gather4:
.size	rsaz_512_gather4,.-rsaz_512_gather4

/* Seed counters for the gather masks: {0,0,1,1} and the step {2,2,2,2}. */
.align	64
.Linc:
.long	0,0, 1,1
.long	2,2, 2,2