/* x86_64-mont.S revision 337982 */
/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/x86_64-mont.S 337982 2018-08-17 18:32:53Z jkim $ */
/* Do not modify. This file is auto-generated from x86_64-mont.pl. */

/*
 * Montgomery multiplication for x86_64 (see the identification string at
 * the end of the file).  GNU as, AT&T syntax, ELF.
 *
 * Exported entry point: bn_mul_mont.  The other three routines carry no
 * .globl and are reached through the .Lmul4x_enter / .Lsqr8x_enter /
 * .Lmulx4x_enter labels.
 *
 * Register use at entry, as read from the code below:
 *   %rdi  result vector (written by the subtract / copy loops)
 *   %rsi  first operand vector
 *   %rdx  second operand vector
 *   %rcx  vector that is multiplied by the per-row factor and finally
 *         subtracted from the temporary (the modulus)
 *   %r8   pointer to one 64-bit word used to derive the per-row factor
 *   %r9d  number of 64-bit words in each vector
 * Returns 1 in %rax.  %rbx, %rbp, %r12-%r15 are pushed on entry and
 * reloaded on exit; %rsp is restored from a value saved in the frame.
 * NOTE(review): presumably the C prototype is
 *   int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
 *                   const BN_ULONG *np, const BN_ULONG *n0, int num);
 * confirm against the C declaration.  The names rp/ap/bp/np/n0/num/tp are
 * used in the comments below as shorthand for the registers listed above
 * and for the on-stack temporary vector.
 */
.text



.globl  bn_mul_mont
.type   bn_mul_mont,@function
.align  16
bn_mul_mont:
        movl    %r9d,%r9d               /* zero-extend num to 64 bits */
        movq    %rsp,%rax               /* rax = entry stack pointer, saved in the frame later */
        testl   $3,%r9d
        jnz     .Lmul_enter             /* num not a multiple of 4: generic path */
        cmpl    $8,%r9d
        jb      .Lmul_enter             /* num < 8: generic path */
        movl    OPENSSL_ia32cap_P+8(%rip),%r11d /* capability word, tested at .Lmul4x_enter */
        cmpq    %rsi,%rdx
        jne     .Lmul4x_enter           /* ap != bp: 4-way unrolled multiply */
        testl   $7,%r9d
        jz      .Lsqr8x_enter           /* ap == bp and num multiple of 8: squaring path */
        jmp     .Lmul4x_enter

.align  16
.Lmul_enter:
        pushq   %rbx
        pushq   %rbp
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15

        /* r10 = (rsp - 16 - 8*num) rounded down to 1024: bottom of tp[] */
        negq    %r9
        movq    %rsp,%r11
        leaq    -16(%rsp,%r9,8),%r10
        negq    %r9
        andq    $-1024,%r10

        /*
         * Move %rsp down to %r10, reading one word at every 4096-byte step
         * on the way (presumably so that stack guard pages are touched in
         * order -- TODO confirm).
         */
        subq    %r10,%r11
        andq    $-4096,%r11
        leaq    (%r10,%r11,1),%rsp
        movq    (%rsp),%r11
        cmpq    %r10,%rsp
        ja      .Lmul_page_walk
        jmp     .Lmul_page_walk_done

.align  16
.Lmul_page_walk:
        leaq    -4096(%rsp),%rsp
        movq    (%rsp),%r11
        cmpq    %r10,%rsp
        ja      .Lmul_page_walk
.Lmul_page_walk_done:

        movq    %rax,8(%rsp,%r9,8)      /* tp[num+1] = entry stack pointer */
.Lmul_body:
        movq    %rdx,%r12               /* r12 = bp (rdx is clobbered by mulq) */
        movq    (%r8),%r8               /* r8 = n0 value */
        movq    (%r12),%rbx             /* rbx = bp[0] */
        movq    (%rsi),%rax             /* rax = ap[0] */

        xorq    %r14,%r14               /* r14 = i (outer index) */
        xorq    %r15,%r15               /* r15 = j (inner index) */

        movq    %r8,%rbp
        mulq    %rbx                    /* rdx:rax = ap[0]*bp[0] */
        movq    %rax,%r10
        movq    (%rcx),%rax

        imulq   %r10,%rbp               /* rbp = low word * n0: per-row factor m */
        movq    %rdx,%r11

        mulq    %rbp                    /* np[0]*m */
        addq    %rax,%r10               /* low word is discarded, only the carry is kept */
        movq    8(%rsi),%rax
        adcq    $0,%rdx
        movq    %rdx,%r13

        leaq    1(%r15),%r15
        jmp     .L1st_enter

        /* First row: tp[] = (ap[]*bp[0] + np[]*m) shifted down one word. */
.align  16
.L1st:
        addq    %rax,%r13
        movq    (%rsi,%r15,8),%rax
        adcq    $0,%rdx
        addq    %r11,%r13
        movq    %r10,%r11
        adcq    $0,%rdx
        movq    %r13,-16(%rsp,%r15,8)
        movq    %rdx,%r13

.L1st_enter:
        mulq    %rbx
        addq    %rax,%r11
        movq    (%rcx,%r15,8),%rax
        adcq    $0,%rdx
        leaq    1(%r15),%r15
        movq    %rdx,%r10

        mulq    %rbp
        cmpq    %r9,%r15
        jne     .L1st

        addq    %rax,%r13
        movq    (%rsi),%rax             /* reload ap[0] for the next row */
        adcq    $0,%rdx
        addq    %r11,%r13
        adcq    $0,%rdx
        movq    %r13,-16(%rsp,%r15,8)
        movq    %rdx,%r13
        movq    %r10,%r11

        xorq    %rdx,%rdx
        addq    %r11,%r13
        adcq    $0,%rdx
        movq    %r13,-8(%rsp,%r9,8)     /* tp[num-1] */
        movq    %rdx,(%rsp,%r9,8)       /* tp[num] = carry out of the row */

        leaq    1(%r14),%r14
        jmp     .Louter
        /* Remaining rows: tp[] = (tp[] + ap[]*bp[i] + np[]*m) shifted down one word. */
.align  16
.Louter:
        movq    (%r12,%r14,8),%rbx      /* rbx = bp[i] */
        xorq    %r15,%r15
        movq    %r8,%rbp
        movq    (%rsp),%r10             /* tp[0] */
        mulq    %rbx
        addq    %rax,%r10
        movq    (%rcx),%rax
        adcq    $0,%rdx

        imulq   %r10,%rbp               /* m for this row */
        movq    %rdx,%r11

        mulq    %rbp
        addq    %rax,%r10
        movq    8(%rsi),%rax
        adcq    $0,%rdx
        movq    8(%rsp),%r10            /* tp[1] */
        movq    %rdx,%r13

        leaq    1(%r15),%r15
        jmp     .Linner_enter

.align  16
.Linner:
        addq    %rax,%r13
        movq    (%rsi,%r15,8),%rax
        adcq    $0,%rdx
        addq    %r10,%r13
        movq    (%rsp,%r15,8),%r10
        adcq    $0,%rdx
        movq    %r13,-16(%rsp,%r15,8)
        movq    %rdx,%r13

.Linner_enter:
        mulq    %rbx
        addq    %rax,%r11
        movq    (%rcx,%r15,8),%rax
        adcq    $0,%rdx
        addq    %r11,%r10
        movq    %rdx,%r11
        adcq    $0,%r11
        leaq    1(%r15),%r15

        mulq    %rbp
        cmpq    %r9,%r15
        jne     .Linner

        addq    %rax,%r13
        movq    (%rsi),%rax             /* reload ap[0] for the next row */
        adcq    $0,%rdx
        addq    %r10,%r13
        movq    (%rsp,%r15,8),%r10      /* previous tp[num] */
        adcq    $0,%rdx
        movq    %r13,-16(%rsp,%r15,8)
        movq    %rdx,%r13

        xorq    %rdx,%rdx
        addq    %r11,%r13
        adcq    $0,%rdx
        addq    %r10,%r13
        adcq    $0,%rdx
        movq    %r13,-8(%rsp,%r9,8)
        movq    %rdx,(%rsp,%r9,8)

        leaq    1(%r14),%r14
        cmpq    %r9,%r14
        jb      .Louter

        xorq    %r14,%r14               /* i = 0; also clears CF for the first sbbq */
        movq    (%rsp),%rax             /* tp[0] */
        movq    %r9,%r15                /* loop counter = num */

        /* rp[] = tp[] - np[], borrow kept in CF across iterations. */
.align  16
.Lsub:  sbbq    (%rcx,%r14,8),%rax
        movq    %rax,(%rdi,%r14,8)
        movq    8(%rsp,%r14,8),%rax
        leaq    1(%r14),%r14            /* lea/dec leave CF untouched */
        decq    %r15
        jnz     .Lsub

        sbbq    $0,%rax                 /* rax = tp[num] - borrow, used as a select mask */
        movq    $-1,%rbx
        xorq    %rax,%rbx               /* rbx = ~rax */
        xorq    %r14,%r14
        movq    %r9,%r15

        /*
         * rp[i] = (rp[i] & rbx) | (tp[i] & rax): branch-free choice between
         * the subtracted and the unsubtracted value.  Each tp[i] is
         * overwritten with num afterwards.
         */
.Lcopy:
        movq    (%rdi,%r14,8),%rcx
        movq    (%rsp,%r14,8),%rdx
        andq    %rbx,%rcx
        andq    %rax,%rdx
        movq    %r9,(%rsp,%r14,8)
        orq     %rcx,%rdx
        movq    %rdx,(%rdi,%r14,8)
        leaq    1(%r14),%r14
        subq    $1,%r15
        jnz     .Lcopy

        movq    8(%rsp,%r9,8),%rsi      /* rsi = entry stack pointer saved above */
        movq    $1,%rax                 /* return value */
        movq    -48(%rsi),%r15
        movq    -40(%rsi),%r14
        movq    -32(%rsi),%r13
        movq    -24(%rsi),%r12
        movq    -16(%rsi),%rbp
        movq    -8(%rsi),%rbx
        leaq    (%rsi),%rsp
.Lmul_epilogue:
        .byte   0xf3,0xc3               /* rep ret */
.size   bn_mul_mont,.-bn_mul_mont

/*
 * bn_mul4x_mont: same register interface as bn_mul_mont, inner loops
 * unrolled four words per iteration.  Entered at .Lmul4x_enter with
 * %r11d = word loaded from OPENSSL_ia32cap_P+8 and %rax = entry %rsp.
 * %rdi is used as a scratch accumulator, so rp is parked at tp[num+2].
 */
.type   bn_mul4x_mont,@function
.align  16
bn_mul4x_mont:
        movl    %r9d,%r9d
        movq    %rsp,%rax
.Lmul4x_enter:
        /* Both bits of mask 0x80100 set: take the mulx/adcx/adox path
           (presumably the BMI2 and ADX feature bits -- TODO confirm). */
        andl    $0x80100,%r11d
        cmpl    $0x80100,%r11d
        je      .Lmulx4x_enter
        pushq   %rbx
        pushq   %rbp
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15

        /* Frame: (rsp - 32 - 8*num) rounded down to 1024, then page walk. */
        negq    %r9
        movq    %rsp,%r11
        leaq    -32(%rsp,%r9,8),%r10
        negq    %r9
        andq    $-1024,%r10

        subq    %r10,%r11
        andq    $-4096,%r11
        leaq    (%r10,%r11,1),%rsp
        movq    (%rsp),%r11
        cmpq    %r10,%rsp
        ja      .Lmul4x_page_walk
        jmp     .Lmul4x_page_walk_done

.Lmul4x_page_walk:
        leaq    -4096(%rsp),%rsp
        movq    (%rsp),%r11
        cmpq    %r10,%rsp
        ja      .Lmul4x_page_walk
.Lmul4x_page_walk_done:

        movq    %rax,8(%rsp,%r9,8)      /* tp[num+1] = entry stack pointer */
.Lmul4x_body:
        movq    %rdi,16(%rsp,%r9,8)     /* tp[num+2] = rp */
        movq    %rdx,%r12               /* r12 = bp */
        movq    (%r8),%r8               /* r8 = n0 value */
        movq    (%r12),%rbx             /* rbx = bp[0] */
        movq    (%rsi),%rax             /* rax = ap[0] */

        xorq    %r14,%r14               /* i */
        xorq    %r15,%r15               /* j */

        movq    %r8,%rbp
        mulq    %rbx
        movq    %rax,%r10
        movq    (%rcx),%rax

        imulq   %r10,%rbp               /* rbp = m for the first row */
        movq    %rdx,%r11

        mulq    %rbp
        addq    %rax,%r10
        movq    8(%rsi),%rax
        adcq    $0,%rdx
        movq    %rdx,%rdi

        mulq    %rbx
        addq    %rax,%r11
        movq    8(%rcx),%rax
        adcq    $0,%rdx
        movq    %rdx,%r10

        mulq    %rbp
        addq    %rax,%rdi
        movq    16(%rsi),%rax
        adcq    $0,%rdx
        addq    %r11,%rdi
        leaq    4(%r15),%r15
        adcq    $0,%rdx
        movq    %rdi,(%rsp)
        movq    %rdx,%r13
        jmp     .L1st4x
        /* First row, four words per iteration. */
.align  16
.L1st4x:
        mulq    %rbx
        addq    %rax,%r10
        movq    -16(%rcx,%r15,8),%rax
        adcq    $0,%rdx
        movq    %rdx,%r11

        mulq    %rbp
        addq    %rax,%r13
        movq    -8(%rsi,%r15,8),%rax
        adcq    $0,%rdx
        addq    %r10,%r13
        adcq    $0,%rdx
        movq    %r13,-24(%rsp,%r15,8)
        movq    %rdx,%rdi

        mulq    %rbx
        addq    %rax,%r11
        movq    -8(%rcx,%r15,8),%rax
        adcq    $0,%rdx
        movq    %rdx,%r10

        mulq    %rbp
        addq    %rax,%rdi
        movq    (%rsi,%r15,8),%rax
        adcq    $0,%rdx
        addq    %r11,%rdi
        adcq    $0,%rdx
        movq    %rdi,-16(%rsp,%r15,8)
        movq    %rdx,%r13

        mulq    %rbx
        addq    %rax,%r10
        movq    (%rcx,%r15,8),%rax
        adcq    $0,%rdx
        movq    %rdx,%r11

        mulq    %rbp
        addq    %rax,%r13
        movq    8(%rsi,%r15,8),%rax
        adcq    $0,%rdx
        addq    %r10,%r13
        adcq    $0,%rdx
        movq    %r13,-8(%rsp,%r15,8)
        movq    %rdx,%rdi

        mulq    %rbx
        addq    %rax,%r11
        movq    8(%rcx,%r15,8),%rax
        adcq    $0,%rdx
        leaq    4(%r15),%r15
        movq    %rdx,%r10

        mulq    %rbp
        addq    %rax,%rdi
        movq    -16(%rsi,%r15,8),%rax
        adcq    $0,%rdx
        addq    %r11,%rdi
        adcq    $0,%rdx
        movq    %rdi,-32(%rsp,%r15,8)
        movq    %rdx,%r13
        cmpq    %r9,%r15
        jb      .L1st4x

        /* Tail of the first row (last two words and the carry word). */
        mulq    %rbx
        addq    %rax,%r10
        movq    -16(%rcx,%r15,8),%rax
        adcq    $0,%rdx
        movq    %rdx,%r11

        mulq    %rbp
        addq    %rax,%r13
        movq    -8(%rsi,%r15,8),%rax
        adcq    $0,%rdx
        addq    %r10,%r13
        adcq    $0,%rdx
        movq    %r13,-24(%rsp,%r15,8)
        movq    %rdx,%rdi

        mulq    %rbx
        addq    %rax,%r11
        movq    -8(%rcx,%r15,8),%rax
        adcq    $0,%rdx
        movq    %rdx,%r10

        mulq    %rbp
        addq    %rax,%rdi
        movq    (%rsi),%rax             /* reload ap[0] for the next row */
        adcq    $0,%rdx
        addq    %r11,%rdi
        adcq    $0,%rdx
        movq    %rdi,-16(%rsp,%r15,8)
        movq    %rdx,%r13

        xorq    %rdi,%rdi
        addq    %r10,%r13
        adcq    $0,%rdi
        movq    %r13,-8(%rsp,%r15,8)
        movq    %rdi,(%rsp,%r15,8)      /* tp[num] = carry out of the row */

        leaq    1(%r14),%r14
        /* Remaining rows: same as above plus accumulation of the previous tp[]. */
.align  4
.Louter4x:
        movq    (%r12,%r14,8),%rbx      /* rbx = bp[i] */
        xorq    %r15,%r15
        movq    (%rsp),%r10
        movq    %r8,%rbp
        mulq    %rbx
        addq    %rax,%r10
        movq    (%rcx),%rax
        adcq    $0,%rdx

        imulq   %r10,%rbp               /* m for this row */
        movq    %rdx,%r11

        mulq    %rbp
        addq    %rax,%r10
        movq    8(%rsi),%rax
        adcq    $0,%rdx
        movq    %rdx,%rdi

        mulq    %rbx
        addq    %rax,%r11
        movq    8(%rcx),%rax
        adcq    $0,%rdx
        addq    8(%rsp),%r11
        adcq    $0,%rdx
        movq    %rdx,%r10

        mulq    %rbp
        addq    %rax,%rdi
        movq    16(%rsi),%rax
        adcq    $0,%rdx
        addq    %r11,%rdi
        leaq    4(%r15),%r15
        adcq    $0,%rdx
        movq    %rdi,(%rsp)
        movq    %rdx,%r13
        jmp     .Linner4x
.align  16
.Linner4x:
        mulq    %rbx
        addq    %rax,%r10
        movq    -16(%rcx,%r15,8),%rax
        adcq    $0,%rdx
        addq    -16(%rsp,%r15,8),%r10
        adcq    $0,%rdx
        movq    %rdx,%r11

        mulq    %rbp
        addq    %rax,%r13
        movq    -8(%rsi,%r15,8),%rax
        adcq    $0,%rdx
        addq    %r10,%r13
        adcq    $0,%rdx
        movq    %r13,-24(%rsp,%r15,8)
        movq    %rdx,%rdi

        mulq    %rbx
        addq    %rax,%r11
        movq    -8(%rcx,%r15,8),%rax
        adcq    $0,%rdx
        addq    -8(%rsp,%r15,8),%r11
        adcq    $0,%rdx
        movq    %rdx,%r10

        mulq    %rbp
        addq    %rax,%rdi
        movq    (%rsi,%r15,8),%rax
        adcq    $0,%rdx
        addq    %r11,%rdi
        adcq    $0,%rdx
        movq    %rdi,-16(%rsp,%r15,8)
        movq    %rdx,%r13

        mulq    %rbx
        addq    %rax,%r10
        movq    (%rcx,%r15,8),%rax
        adcq    $0,%rdx
        addq    (%rsp,%r15,8),%r10
        adcq    $0,%rdx
        movq    %rdx,%r11

        mulq    %rbp
        addq    %rax,%r13
        movq    8(%rsi,%r15,8),%rax
        adcq    $0,%rdx
        addq    %r10,%r13
        adcq    $0,%rdx
        movq    %r13,-8(%rsp,%r15,8)
        movq    %rdx,%rdi

        mulq    %rbx
        addq    %rax,%r11
        movq    8(%rcx,%r15,8),%rax
        adcq    $0,%rdx
        addq    8(%rsp,%r15,8),%r11
        adcq    $0,%rdx
        leaq    4(%r15),%r15
        movq    %rdx,%r10

        mulq    %rbp
        addq    %rax,%rdi
        movq    -16(%rsi,%r15,8),%rax
        adcq    $0,%rdx
        addq    %r11,%rdi
        adcq    $0,%rdx
        movq    %rdi,-32(%rsp,%r15,8)
        movq    %rdx,%r13
        cmpq    %r9,%r15
        jb      .Linner4x

        /* Tail of the row. */
        mulq    %rbx
        addq    %rax,%r10
        movq    -16(%rcx,%r15,8),%rax
        adcq    $0,%rdx
        addq    -16(%rsp,%r15,8),%r10
        adcq    $0,%rdx
        movq    %rdx,%r11

        mulq    %rbp
        addq    %rax,%r13
        movq    -8(%rsi,%r15,8),%rax
        adcq    $0,%rdx
        addq    %r10,%r13
        adcq    $0,%rdx
        movq    %r13,-24(%rsp,%r15,8)
        movq    %rdx,%rdi

        mulq    %rbx
        addq    %rax,%r11
        movq    -8(%rcx,%r15,8),%rax
        adcq    $0,%rdx
        addq    -8(%rsp,%r15,8),%r11
        adcq    $0,%rdx
        leaq    1(%r14),%r14            /* i++ */
        movq    %rdx,%r10

        mulq    %rbp
        addq    %rax,%rdi
        movq    (%rsi),%rax             /* reload ap[0] for the next row */
        adcq    $0,%rdx
        addq    %r11,%rdi
        adcq    $0,%rdx
        movq    %rdi,-16(%rsp,%r15,8)
        movq    %rdx,%r13

        xorq    %rdi,%rdi
        addq    %r10,%r13
        adcq    $0,%rdi
        addq    (%rsp,%r9,8),%r13       /* add previous tp[num] */
        adcq    $0,%rdi
        movq    %r13,-8(%rsp,%r15,8)
        movq    %rdi,(%rsp,%r15,8)

        cmpq    %r9,%r14
        jb      .Louter4x
        movq    16(%rsp,%r9,8),%rdi     /* rdi = rp saved at .Lmul4x_body */
        leaq    -4(%r9),%r15
        movq    0(%rsp),%rax
        movq    8(%rsp),%rdx
        shrq    $2,%r15                 /* r15 = (num-4)/4 loop iterations */
        leaq    (%rsp),%rsi             /* rsi = tp */
        xorq    %r14,%r14

        subq    0(%rcx),%rax            /* starts the borrow chain */
        movq    16(%rsi),%rbx
        movq    24(%rsi),%rbp
        sbbq    8(%rcx),%rdx

        /* rp[] = tp[] - np[], four words per iteration, borrow kept in CF. */
.Lsub4x:
        movq    %rax,0(%rdi,%r14,8)
        movq    %rdx,8(%rdi,%r14,8)
        sbbq    16(%rcx,%r14,8),%rbx
        movq    32(%rsi,%r14,8),%rax
        movq    40(%rsi,%r14,8),%rdx
        sbbq    24(%rcx,%r14,8),%rbp
        movq    %rbx,16(%rdi,%r14,8)
        movq    %rbp,24(%rdi,%r14,8)
        sbbq    32(%rcx,%r14,8),%rax
        movq    48(%rsi,%r14,8),%rbx
        movq    56(%rsi,%r14,8),%rbp
        sbbq    40(%rcx,%r14,8),%rdx
        leaq    4(%r14),%r14
        decq    %r15
        jnz     .Lsub4x

        movq    %rax,0(%rdi,%r14,8)
        movq    32(%rsi,%r14,8),%rax    /* tp[num] */
        sbbq    16(%rcx,%r14,8),%rbx
        movq    %rdx,8(%rdi,%r14,8)
        sbbq    24(%rcx,%r14,8),%rbp
        movq    %rbx,16(%rdi,%r14,8)

        sbbq    $0,%rax                 /* rax = tp[num] - borrow: select mask */
        movq    %rbp,24(%rdi,%r14,8)
        pxor    %xmm0,%xmm0
.byte   102,72,15,110,224               /* movq %rax,%xmm4 */
        pcmpeqd %xmm5,%xmm5
        pshufd  $0,%xmm4,%xmm4          /* xmm4 = mask broadcast to all dwords */
        movq    %r9,%r15
        pxor    %xmm4,%xmm5             /* xmm5 = ~xmm4 */
        shrq    $2,%r15                 /* num/4 iterations of 32 bytes */
        xorl    %eax,%eax               /* byte offset */

        jmp     .Lcopy4x
        /* rp[] = (tp[] & xmm4) | (rp[] & xmm5); tp[] is zeroed as it is read. */
.align  16
.Lcopy4x:
        movdqa  (%rsp,%rax,1),%xmm1
        movdqu  (%rdi,%rax,1),%xmm2
        pand    %xmm4,%xmm1
        pand    %xmm5,%xmm2
        movdqa  16(%rsp,%rax,1),%xmm3
        movdqa  %xmm0,(%rsp,%rax,1)
        por     %xmm2,%xmm1
        movdqu  16(%rdi,%rax,1),%xmm2
        movdqu  %xmm1,(%rdi,%rax,1)
        pand    %xmm4,%xmm3
        pand    %xmm5,%xmm2
        movdqa  %xmm0,16(%rsp,%rax,1)
        por     %xmm2,%xmm3
        movdqu  %xmm3,16(%rdi,%rax,1)
        leaq    32(%rax),%rax
        decq    %r15
        jnz     .Lcopy4x
        movq    8(%rsp,%r9,8),%rsi      /* rsi = entry stack pointer */
        movq    $1,%rax                 /* return value */
        movq    -48(%rsi),%r15
        movq    -40(%rsi),%r14
        movq    -32(%rsi),%r13
        movq    -24(%rsi),%r12
        movq    -16(%rsi),%rbp
        movq    -8(%rsi),%rbx
        leaq    (%rsi),%rsp
.Lmul4x_epilogue:
        .byte   0xf3,0xc3               /* rep ret */
.size   bn_mul4x_mont,.-bn_mul4x_mont



/*
 * bn_sqr8x_mont: squaring path, taken from bn_mul_mont when ap == bp and
 * num is a multiple of 8.  The arithmetic itself is done by
 * bn_sqr8x_internal / bn_sqrx8x_internal, which are not defined in this
 * file; this routine builds the frame, calls one of them, then performs
 * the final subtraction and masked copy into rp.
 * Frame slots written here: 32(%rsp) = n0 value, 40(%rsp) = entry %rsp.
 */
.type   bn_sqr8x_mont,@function
.align  32
bn_sqr8x_mont:
        movq    %rsp,%rax
.Lsqr8x_enter:
        pushq   %rbx
        pushq   %rbp
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15
.Lsqr8x_prologue:

        movl    %r9d,%r10d
        shll    $3,%r9d                 /* r9 = num*8 (bytes) */
        shlq    $3+2,%r10               /* r10 = num*32 */
        negq    %r9                     /* r9 = -num*8 */






        /*
         * Choose the frame address from the distance (mod 4096) between
         * the candidate frame and ap.  NOTE(review): presumably this avoids
         * 4KB aliasing between the frame and ap[] -- confirm against the
         * generator script.
         */
        leaq    -64(%rsp,%r9,2),%r11
        movq    %rsp,%rbp
        movq    (%r8),%r8               /* r8 = n0 value */
        subq    %rsi,%r11
        andq    $4095,%r11
        cmpq    %r11,%r10
        jb      .Lsqr8x_sp_alt
        subq    %r11,%rbp
        leaq    -64(%rbp,%r9,2),%rbp
        jmp     .Lsqr8x_sp_done

.align  32
.Lsqr8x_sp_alt:
        leaq    4096-64(,%r9,2),%r10
        leaq    -64(%rbp,%r9,2),%rbp
        subq    %r10,%r11
        movq    $0,%r10
        cmovcq  %r10,%r11               /* clamp to 0 if the subtraction borrowed */
        subq    %r11,%rbp
.Lsqr8x_sp_done:
        andq    $-64,%rbp               /* 64-byte align the frame */
        movq    %rsp,%r11
        subq    %rbp,%r11
        andq    $-4096,%r11
        leaq    (%r11,%rbp,1),%rsp
        movq    (%rsp),%r10
        cmpq    %rbp,%rsp
        ja      .Lsqr8x_page_walk
        jmp     .Lsqr8x_page_walk_done

.align  16
.Lsqr8x_page_walk:
        leaq    -4096(%rsp),%rsp
        movq    (%rsp),%r10
        cmpq    %rbp,%rsp
        ja      .Lsqr8x_page_walk
.Lsqr8x_page_walk_done:

        movq    %r9,%r10                /* r10 = -num*8 */
        negq    %r9                     /* r9  =  num*8 */

        movq    %r8,32(%rsp)            /* n0 value */
        movq    %rax,40(%rsp)           /* entry stack pointer */
.Lsqr8x_body:

.byte   102,72,15,110,209               /* movq %rcx,%xmm2  (np) */
        pxor    %xmm0,%xmm0
.byte   102,72,15,110,207               /* movq %rdi,%xmm1  (rp) */
.byte   102,73,15,110,218               /* movq %r10,%xmm3  (-num*8) */
        movl    OPENSSL_ia32cap_P+8(%rip),%eax
        andl    $0x80100,%eax
        cmpl    $0x80100,%eax
        jne     .Lsqr8x_nox

        call    bn_sqrx8x_internal      /* mulx/adx variant, defined elsewhere */



        /* NOTE(review): %r8, %rcx, %rbp and %rax are used below as left by
           the callee -- confirm its exit contract. */
        leaq    (%r8,%rcx,1),%rbx
        movq    %rcx,%r9
        movq    %rcx,%rdx
.byte   102,72,15,126,207               /* movq %xmm1,%rdi  (rp) */
        sarq    $3+2,%rcx               /* loop count in units of 32 bytes */
        jmp     .Lsqr8x_sub

.align  32
.Lsqr8x_nox:
        call    bn_sqr8x_internal       /* generic variant, defined elsewhere */



        /* NOTE(review): %rdi, %r9, %rbp and %rax are used below as left by
           the callee -- confirm its exit contract. */
        leaq    (%rdi,%r9,1),%rbx
        movq    %r9,%rcx
        movq    %r9,%rdx
.byte   102,72,15,126,207               /* movq %xmm1,%rdi  (rp) */
        sarq    $3+2,%rcx
        jmp     .Lsqr8x_sub

        /*
         * rp[] = (%rbx)[] - (%rbp)[], 32 bytes per iteration; %rcx counts up
         * to zero.  The first sbbq consumes CF as left by the sarq above.
         */
.align  32
.Lsqr8x_sub:
        movq    0(%rbx),%r12
        movq    8(%rbx),%r13
        movq    16(%rbx),%r14
        movq    24(%rbx),%r15
        leaq    32(%rbx),%rbx
        sbbq    0(%rbp),%r12
        sbbq    8(%rbp),%r13
        sbbq    16(%rbp),%r14
        sbbq    24(%rbp),%r15
        leaq    32(%rbp),%rbp
        movq    %r12,0(%rdi)
        movq    %r13,8(%rdi)
        movq    %r14,16(%rdi)
        movq    %r15,24(%rdi)
        leaq    32(%rdi),%rdi
        incq    %rcx                    /* inc leaves CF untouched */
        jnz     .Lsqr8x_sub

        sbbq    $0,%rax                 /* fold the final borrow into the mask source */
        leaq    (%rbx,%r9,1),%rbx       /* rewind source and rp by adding %r9 */
        leaq    (%rdi,%r9,1),%rdi

.byte   102,72,15,110,200               /* movq %rax,%xmm1 */
        pxor    %xmm0,%xmm0
        pshufd  $0,%xmm1,%xmm1          /* broadcast mask */
        movq    40(%rsp),%rsi           /* rsi = entry stack pointer */
        jmp     .Lsqr8x_cond_copy

        /*
         * rp[] = (src[] & mask) | (rp[] & ~mask), 32 bytes per iteration.
         * The source block and the block %rdx bytes above it are zeroed.
         */
.align  32
.Lsqr8x_cond_copy:
        movdqa  0(%rbx),%xmm2
        movdqa  16(%rbx),%xmm3
        leaq    32(%rbx),%rbx
        movdqu  0(%rdi),%xmm4
        movdqu  16(%rdi),%xmm5
        leaq    32(%rdi),%rdi
        movdqa  %xmm0,-32(%rbx)
        movdqa  %xmm0,-16(%rbx)
        movdqa  %xmm0,-32(%rbx,%rdx,1)
        movdqa  %xmm0,-16(%rbx,%rdx,1)
        pcmpeqd %xmm1,%xmm0             /* xmm0 = ~mask (all-ones where mask is zero) */
        pand    %xmm1,%xmm2
        pand    %xmm1,%xmm3
        pand    %xmm0,%xmm4
        pand    %xmm0,%xmm5
        pxor    %xmm0,%xmm0
        por     %xmm2,%xmm4
        por     %xmm3,%xmm5
        movdqu  %xmm4,-32(%rdi)
        movdqu  %xmm5,-16(%rdi)
        addq    $32,%r9
        jnz     .Lsqr8x_cond_copy

        movq    $1,%rax                 /* return value */
        movq    -48(%rsi),%r15
        movq    -40(%rsi),%r14
        movq    -32(%rsi),%r13
        movq    -24(%rsi),%r12
        movq    -16(%rsi),%rbp
        movq    -8(%rsi),%rbx
        leaq    (%rsi),%rsp
.Lsqr8x_epilogue:
        .byte   0xf3,0xc3               /* rep ret */
.size   bn_sqr8x_mont,.-bn_sqr8x_mont

/*
 * bn_mulx4x_mont: multiply path using mulx with two parallel carry chains
 * (adcx on CF, adox on OF).  Reached from .Lmul4x_enter when both bits of
 * mask 0x80100 are set in the capability word.
 *
 * Frame layout set up below:
 *    0(%rsp)  num*8
 *    8(%rsp)  pointer to the next bp word
 *   16(%rsp)  bp + num*8 (end of bp)
 *   24(%rsp)  n0 value
 *   32(%rsp)  rp
 *   40(%rsp)  entry stack pointer
 *   48(%rsp)  inner loop count, num/4 - 1
 *   64(%rsp)  tp[]
 */
.type   bn_mulx4x_mont,@function
.align  32
bn_mulx4x_mont:
        movq    %rsp,%rax
.Lmulx4x_enter:
        pushq   %rbx
        pushq   %rbp
        pushq   %r12
        pushq   %r13
        pushq   %r14
        pushq   %r15
.Lmulx4x_prologue:

        shll    $3,%r9d                 /* r9 = num*8 */
        xorq    %r10,%r10
        subq    %r9,%r10                /* r10 = -num*8 */
        movq    (%r8),%r8               /* r8 = n0 value */
        leaq    -72(%rsp,%r10,1),%rbp
        andq    $-128,%rbp              /* 128-byte align the frame */
        movq    %rsp,%r11
        subq    %rbp,%r11
        andq    $-4096,%r11
        leaq    (%r11,%rbp,1),%rsp
        movq    (%rsp),%r10
        cmpq    %rbp,%rsp
        ja      .Lmulx4x_page_walk
        jmp     .Lmulx4x_page_walk_done

.align  16
.Lmulx4x_page_walk:
        leaq    -4096(%rsp),%rsp
        movq    (%rsp),%r10
        cmpq    %rbp,%rsp
        ja      .Lmulx4x_page_walk
.Lmulx4x_page_walk_done:

        leaq    (%rdx,%r9,1),%r10       /* r10 = end of bp */












        movq    %r9,0(%rsp)
        shrq    $5,%r9                  /* num/4 */
        movq    %r10,16(%rsp)
        subq    $1,%r9
        movq    %r8,24(%rsp)
        movq    %rdi,32(%rsp)
        movq    %rax,40(%rsp)
        movq    %r9,48(%rsp)
        jmp     .Lmulx4x_body

.align  32
.Lmulx4x_body:
        leaq    8(%rdx),%rdi            /* rdi = &bp[1] */
        movq    (%rdx),%rdx             /* rdx = bp[0], implicit mulx source */
        leaq    64+32(%rsp),%rbx        /* rbx = tp + 32 */
        movq    %rdx,%r9                /* keep bp[0]; rdx alternates with m */

        mulxq   0(%rsi),%r8,%rax
        mulxq   8(%rsi),%r11,%r14
        addq    %rax,%r11
        movq    %rdi,8(%rsp)
        mulxq   16(%rsi),%r12,%r13
        adcq    %r14,%r12
        adcq    $0,%r13

        movq    %r8,%rdi
        imulq   24(%rsp),%r8            /* r8 = m = low word * n0 */
        xorq    %rbp,%rbp               /* rbp = 0, clears CF and OF */

        mulxq   24(%rsi),%rax,%r14
        movq    %r8,%rdx
        leaq    32(%rsi),%rsi
        adcxq   %rax,%r13
        adcxq   %rbp,%r14

        mulxq   0(%rcx),%rax,%r10
        adcxq   %rax,%rdi
        adoxq   %r11,%r10
        mulxq   8(%rcx),%rax,%r11
        adcxq   %rax,%r10
        adoxq   %r12,%r11
.byte   0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00    /* mulxq 16(%rcx),%rax,%r12 with a 32-bit displacement */
        movq    48(%rsp),%rdi           /* inner loop count */
        movq    %r10,-32(%rbx)
        adcxq   %rax,%r11
        adoxq   %r13,%r12
        mulxq   24(%rcx),%rax,%r15
        movq    %r9,%rdx
        movq    %r11,-24(%rbx)
        adcxq   %rax,%r12
        adoxq   %rbp,%r15
        leaq    32(%rcx),%rcx
        movq    %r12,-16(%rbx)

        jmp     .Lmulx4x_1st

        /* First row, four words per iteration. */
.align  32
.Lmulx4x_1st:
        adcxq   %rbp,%r15
        mulxq   0(%rsi),%r10,%rax
        adcxq   %r14,%r10
        mulxq   8(%rsi),%r11,%r14
        adcxq   %rax,%r11
        mulxq   16(%rsi),%r12,%rax
        adcxq   %r14,%r12
        mulxq   24(%rsi),%r13,%r14
.byte   0x67,0x67                       /* two 0x67 prefixes on the next instruction (presumably padding) */
        movq    %r8,%rdx
        adcxq   %rax,%r13
        adcxq   %rbp,%r14
        leaq    32(%rsi),%rsi
        leaq    32(%rbx),%rbx

        adoxq   %r15,%r10
        mulxq   0(%rcx),%rax,%r15
        adcxq   %rax,%r10
        adoxq   %r15,%r11
        mulxq   8(%rcx),%rax,%r15
        adcxq   %rax,%r11
        adoxq   %r15,%r12
        mulxq   16(%rcx),%rax,%r15
        movq    %r10,-40(%rbx)
        adcxq   %rax,%r12
        movq    %r11,-32(%rbx)
        adoxq   %r15,%r13
        mulxq   24(%rcx),%rax,%r15
        movq    %r9,%rdx
        movq    %r12,-24(%rbx)
        adcxq   %rax,%r13
        adoxq   %rbp,%r15
        leaq    32(%rcx),%rcx
        movq    %r13,-16(%rbx)

        decq    %rdi
        jnz     .Lmulx4x_1st

        movq    0(%rsp),%rax            /* rax = num*8 */
        movq    8(%rsp),%rdi            /* rdi = pointer to next bp word */
        adcq    %rbp,%r15
        addq    %r15,%r14
        sbbq    %r15,%r15               /* r15 = -carry */
        movq    %r14,-8(%rbx)
        jmp     .Lmulx4x_outer

.align  32
.Lmulx4x_outer:
        movq    (%rdi),%rdx             /* rdx = bp[i] */
        leaq    8(%rdi),%rdi
        subq    %rax,%rsi               /* rewind ap by num*8 */
        movq    %r15,(%rbx)             /* store the carry word above the row */
        leaq    64+32(%rsp),%rbx
        subq    %rax,%rcx               /* rewind np by num*8 */

        mulxq   0(%rsi),%r8,%r11
        xorl    %ebp,%ebp               /* rbp = 0, clears CF and OF */
        movq    %rdx,%r9
        mulxq   8(%rsi),%r14,%r12
        adoxq   -32(%rbx),%r8
        adcxq   %r14,%r11
        mulxq   16(%rsi),%r15,%r13
        adoxq   -24(%rbx),%r11
        adcxq   %r15,%r12
        adoxq   -16(%rbx),%r12
        adcxq   %rbp,%r13
        adoxq   %rbp,%r13

        movq    %rdi,8(%rsp)
        movq    %r8,%r15
        imulq   24(%rsp),%r8            /* m for this row */
        xorl    %ebp,%ebp

        mulxq   24(%rsi),%rax,%r14
        movq    %r8,%rdx
        adcxq   %rax,%r13
        adoxq   -8(%rbx),%r13
        adcxq   %rbp,%r14
        leaq    32(%rsi),%rsi
        adoxq   %rbp,%r14

        mulxq   0(%rcx),%rax,%r10
        adcxq   %rax,%r15
        adoxq   %r11,%r10
        mulxq   8(%rcx),%rax,%r11
        adcxq   %rax,%r10
        adoxq   %r12,%r11
        mulxq   16(%rcx),%rax,%r12
        movq    %r10,-32(%rbx)
        adcxq   %rax,%r11
        adoxq   %r13,%r12
        mulxq   24(%rcx),%rax,%r15
        movq    %r9,%rdx
        movq    %r11,-24(%rbx)
        leaq    32(%rcx),%rcx
        adcxq   %rax,%r12
        adoxq   %rbp,%r15
        movq    48(%rsp),%rdi           /* inner loop count */
        movq    %r12,-16(%rbx)

        jmp     .Lmulx4x_inner

.align  32
.Lmulx4x_inner:
        mulxq   0(%rsi),%r10,%rax
        adcxq   %rbp,%r15
        adoxq   %r14,%r10
        mulxq   8(%rsi),%r11,%r14
        adcxq   0(%rbx),%r10
        adoxq   %rax,%r11
        mulxq   16(%rsi),%r12,%rax
        adcxq   8(%rbx),%r11
        adoxq   %r14,%r12
        mulxq   24(%rsi),%r13,%r14
        movq    %r8,%rdx
        adcxq   16(%rbx),%r12
        adoxq   %rax,%r13
        adcxq   24(%rbx),%r13
        adoxq   %rbp,%r14
        leaq    32(%rsi),%rsi
        leaq    32(%rbx),%rbx
        adcxq   %rbp,%r14

        adoxq   %r15,%r10
        mulxq   0(%rcx),%rax,%r15
        adcxq   %rax,%r10
        adoxq   %r15,%r11
        mulxq   8(%rcx),%rax,%r15
        adcxq   %rax,%r11
        adoxq   %r15,%r12
        mulxq   16(%rcx),%rax,%r15
        movq    %r10,-40(%rbx)
        adcxq   %rax,%r12
        adoxq   %r15,%r13
        mulxq   24(%rcx),%rax,%r15
        movq    %r9,%rdx
        movq    %r11,-32(%rbx)
        movq    %r12,-24(%rbx)
        adcxq   %rax,%r13
        adoxq   %rbp,%r15
        leaq    32(%rcx),%rcx
        movq    %r13,-16(%rbx)

        decq    %rdi
        jnz     .Lmulx4x_inner

        movq    0(%rsp),%rax            /* rax = num*8 */
        movq    8(%rsp),%rdi            /* rdi = pointer to next bp word */
        adcq    %rbp,%r15
        subq    0(%rbx),%rbp            /* 0 - stored carry word: sets CF when it is non-zero */
        adcq    %r15,%r14
        sbbq    %r15,%r15               /* r15 = -carry */
        movq    %r14,-8(%rbx)

        cmpq    16(%rsp),%rdi
        jne     .Lmulx4x_outer          /* more bp words to process */

        leaq    64(%rsp),%rbx           /* rbx = tp */
        subq    %rax,%rcx               /* rewind np */
        negq    %r15                    /* r15 = carry (0 or 1) */
        movq    %rax,%rdx               /* rdx = num*8, byte count for the copy loop */
        shrq    $3+2,%rax               /* rax = num/4 iterations; CF is as left by this shift */
        movq    32(%rsp),%rdi           /* rdi = rp */
        jmp     .Lmulx4x_sub

        /* rp[] = tp[] - np[], 32 bytes per iteration, borrow kept in CF. */
.align  32
.Lmulx4x_sub:
        movq    0(%rbx),%r11
        movq    8(%rbx),%r12
        movq    16(%rbx),%r13
        movq    24(%rbx),%r14
        leaq    32(%rbx),%rbx
        sbbq    0(%rcx),%r11
        sbbq    8(%rcx),%r12
        sbbq    16(%rcx),%r13
        sbbq    24(%rcx),%r14
        leaq    32(%rcx),%rcx
        movq    %r11,0(%rdi)
        movq    %r12,8(%rdi)
        movq    %r13,16(%rdi)
        movq    %r14,24(%rdi)
        leaq    32(%rdi),%rdi
        decq    %rax                    /* dec leaves CF untouched */
        jnz     .Lmulx4x_sub

        sbbq    $0,%r15                 /* r15 = carry - borrow: select mask */
        leaq    64(%rsp),%rbx           /* rbx = tp */
        subq    %rdx,%rdi               /* rewind rp */

.byte   102,73,15,110,207               /* movq %r15,%xmm1 */
        pxor    %xmm0,%xmm0
        pshufd  $0,%xmm1,%xmm1          /* broadcast mask */
        movq    40(%rsp),%rsi           /* rsi = entry stack pointer */
        jmp     .Lmulx4x_cond_copy

        /* rp[] = (tp[] & mask) | (rp[] & ~mask); tp[] is zeroed as it is read. */
.align  32
.Lmulx4x_cond_copy:
        movdqa  0(%rbx),%xmm2
        movdqa  16(%rbx),%xmm3
        leaq    32(%rbx),%rbx
        movdqu  0(%rdi),%xmm4
        movdqu  16(%rdi),%xmm5
        leaq    32(%rdi),%rdi
        movdqa  %xmm0,-32(%rbx)
        movdqa  %xmm0,-16(%rbx)
        pcmpeqd %xmm1,%xmm0             /* xmm0 = ~mask (all-ones where mask is zero) */
        pand    %xmm1,%xmm2
        pand    %xmm1,%xmm3
        pand    %xmm0,%xmm4
        pand    %xmm0,%xmm5
        pxor    %xmm0,%xmm0
        por     %xmm2,%xmm4
        por     %xmm3,%xmm5
        movdqu  %xmm4,-32(%rdi)
        movdqu  %xmm5,-16(%rdi)
        subq    $32,%rdx
        jnz     .Lmulx4x_cond_copy

        movq    %rdx,(%rbx)             /* rdx is 0 here: clear the word above tp[] */

        movq    $1,%rax                 /* return value */
        movq    -48(%rsi),%r15
        movq    -40(%rsi),%r14
        movq    -32(%rsi),%r13
        movq    -24(%rsi),%r12
        movq    -16(%rsi),%rbp
        movq    -8(%rsi),%rbx
        leaq    (%rsi),%rsp
.Lmulx4x_epilogue:
        .byte   0xf3,0xc3               /* rep ret */
.size   bn_mulx4x_mont,.-bn_mulx4x_mont
/* ASCII, NUL-terminated: "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro@openssl.org>" */
.byte   77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align  16