/* x86_64-mont.S revision 312826 */
/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/x86_64-mont.S 312826 2017-01-26 19:14:14Z jkim $ */
/* Do not modify. This file is auto-generated from x86_64-mont.pl. */

/*
 * Syntax:  GNU as, AT&T operand order (source first, destination last).
 * Target:  x86-64 ELF (uses .type ...,@function / .size).
 * The trailing .byte string at the end of the file is the ASCII text
 * "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro@openssl.org>".
 *
 * Symbols referenced but not defined in this file:
 *   OPENSSL_ia32cap_P, bn_sqr8x_internal, bn_sqrx8x_internal.
 *
 * Register usage on entry, as read by the code below (the registers are
 * the first six integer argument registers of the System V AMD64 ABI):
 *   %rdi  result vector; final words are stored here
 *   %rsi  first operand vector
 *   %rdx  second operand vector (one word consumed per outer pass)
 *   %rcx  vector multiplied by the per-pass factor and subtracted at the
 *         end -- presumably the modulus; confirm against the C prototype
 *   %r8   pointer to a single word, loaded once -- presumably the
 *         Montgomery constant n0; confirm against the C prototype
 *   %r9d  number of 64-bit words in each vector
 * Every path returns with %rax = 1 and with %rbx, %rbp, %r12-%r15 and
 * %rsp restored from the values saved at entry.
 */
.text



/*
 * bn_mul_mont -- the only exported symbol; dispatches on the word count:
 *   count not a multiple of 4, or count < 8   -> generic loop (.Lmul_enter)
 *   otherwise, %rdx != %rsi                   -> .Lmul4x_enter
 *   otherwise, count a multiple of 8          -> .Lsqr8x_enter
 *   otherwise                                 -> .Lmul4x_enter
 * %rax carries the caller's %rsp into whichever path is taken, and
 * %r11d carries the third dword of OPENSSL_ia32cap_P into .Lmul4x_enter.
 */
.globl	bn_mul_mont
.type	bn_mul_mont,@function
.align	16
bn_mul_mont:
	movl	%r9d,%r9d		/* zero-extend the word count */
	movq	%rsp,%rax		/* remember the entry %rsp */
	testl	$3,%r9d
	jnz	.Lmul_enter
	cmpl	$8,%r9d
	jb	.Lmul_enter
	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpq	%rsi,%rdx
	jne	.Lmul4x_enter
	testl	$7,%r9d
	jz	.Lsqr8x_enter
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	/*
	 * %r10 = new frame base: room for count+2 words below the pushes,
	 * rounded down to a 1024-byte boundary.
	 */
	negq	%r9
	movq	%rsp,%r11
	leaq	-16(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10







	/*
	 * Move %rsp down to %r10 in 4096-byte steps, loading one word from
	 * each step (presumably so that stack guard pages are touched in
	 * order -- the loaded value itself is not used).
	 */
	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

.align	16
.Lmul_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	movq	%rax,8(%rsp,%r9,8)	/* frame word count+1 = entry %rsp */
.Lmul_body:
	/*
	 * First pass (outer index 0).  The frame at (%rsp) is the
	 * temporary vector.  %r14 = outer index, %r15 = inner index,
	 * %rbx = current word of the second operand, %rbp = per-pass
	 * factor = (low word of the running sum) * (word loaded from %r8).
	 */
	movq	%rdx,%r12
	movq	(%r8),%r8
	movq	(%r12),%rbx
	movq	(%rsi),%rax

	xorq	%r14,%r14
	xorq	%r15,%r15

	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.L1st_enter

.align	16
.L1st:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.L1st_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	1(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.L1st

	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13
	movq	%r10,%r11

	/* Top two words of the first pass: sum and carry-out. */
	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	jmp	.Louter
.align	16
.Louter:
	/*
	 * Passes 1 .. count-1: same as the first pass, but each column
	 * also adds the word already held in the temporary vector.
	 */
	movq	(%r12,%r14,8),%rbx
	xorq	%r15,%r15
	movq	%r8,%rbp
	movq	(%rsp),%r10
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.Linner_enter

.align	16
.Linner:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.Linner_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.Linner

	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	cmpq	%r9,%r14
	jb	.Louter

	/*
	 * Result = temporary vector minus the %rcx vector, written to
	 * (%rdi).  The xorq clears CF for the first sbbq; leaq and decq
	 * leave CF untouched, so the borrow chains through the loop.
	 */
	xorq	%r14,%r14
	movq	(%rsp),%rax
	leaq	(%rsp),%rsi
	movq	%r9,%r15
	jmp	.Lsub
.align	16
.Lsub:	sbbq	(%rcx,%r14,8),%rax
	movq	%rax,(%rdi,%r14,8)
	movq	8(%rsi,%r14,8),%rax
	leaq	1(%r14),%r14
	decq	%r15
	jnz	.Lsub

	/*
	 * %rax = (top carry word) - borrow, used as a bit mask:
	 * %rsi = (temporary & mask) | (result & ~mask), i.e. the source
	 * pointer for the copy below is picked without a branch.
	 */
	sbbq	$0,%rax
	xorq	%r14,%r14
	andq	%rax,%rsi
	notq	%rax
	movq	%rdi,%rcx
	andq	%rax,%rcx
	movq	%r9,%r15
	orq	%rcx,%rsi
.align	16
.Lcopy:
	/* Copy the chosen vector to (%rdi); overwrite the temporary
	 * vector word with its own index as it is consumed. */
	movq	(%rsi,%r14,8),%rax
	movq	%r14,(%rsp,%r14,8)
	movq	%rax,(%rdi,%r14,8)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	.Lcopy

	/* Epilogue: reload the entry %rsp, restore the six pushed
	 * registers from just below it, return 1. */
	movq	8(%rsp,%r9,8),%rsi
	movq	$1,%rax
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lmul_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
.size	bn_mul_mont,.-bn_mul_mont

/*
 * bn_mul4x_mont -- file-local (no .globl).  Same inputs as bn_mul_mont,
 * processed four words per inner iteration.  .Lmul4x_enter is reached
 * from bn_mul_mont with %rax = entry %rsp and %r11d = capability word;
 * when both bits of mask 0x80100 are set control transfers to
 * .Lmulx4x_enter (the path that uses mulx/adcx/adox).
 * Frame: words 0..count-1 temporary vector, word count = carry,
 *        word count+1 = entry %rsp, word count+2 = saved %rdi.
 */
.type	bn_mul4x_mont,@function
.align	16
bn_mul4x_mont:
	movl	%r9d,%r9d
	movq	%rsp,%rax
.Lmul4x_enter:
	andl	$0x80100,%r11d
	cmpl	$0x80100,%r11d
	je	.Lmulx4x_enter
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	/* Frame base: room for count+4 words, 1024-byte aligned. */
	negq	%r9
	movq	%rsp,%r11
	leaq	-32(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10

	/* Step %rsp down to %r10 one 4096-byte page at a time. */
	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul4x_page_walk
	jmp	.Lmul4x_page_walk_done

.Lmul4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul4x_page_walk
.Lmul4x_page_walk_done:

	movq	%rax,8(%rsp,%r9,8)	/* save entry %rsp */
.Lmul4x_body:
	movq	%rdi,16(%rsp,%r9,8)	/* save result pointer; %rdi is reused */
	movq	%rdx,%r12
	movq	(%r8),%r8
	movq	(%r12),%rbx
	movq	(%rsi),%rax

	xorq	%r14,%r14
	xorq	%r15,%r15

	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdi,(%rsp)
	movq	%rdx,%r13
	jmp	.L1st4x
.align	16
.L1st4x:
	/* First pass, four columns per iteration. */
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	4(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	.L1st4x

	/* Tail of the first pass: last two columns and the carry word. */
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)

	leaq	1(%r14),%r14
.align	4
.Louter4x:
	/* Passes 1 .. count-1: as above, plus the words already held in
	 * the temporary vector. */
	movq	(%r12,%r14,8),%rbx
	xorq	%r15,%r15
	movq	(%rsp),%r10
	movq	%r8,%rbp
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%rsp),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdi,(%rsp)
	movq	%rdx,%r13
	jmp	.Linner4x
.align	16
.Linner4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	4(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	.Linner4x

	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	1(%r14),%r14
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%rsp,%r9,8),%r13
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)

	cmpq	%r9,%r14
	jb	.Louter4x

	/*
	 * Subtract the %rcx vector from the temporary vector into the
	 * saved result pointer, four words per iteration.  %r9 is
	 * temporarily count/4; %xmm0 is zeroed for the wipe below.
	 */
	movq	16(%rsp,%r9,8),%rdi
	movq	0(%rsp),%rax
	pxor	%xmm0,%xmm0
	movq	8(%rsp),%rdx
	shrq	$2,%r9
	leaq	(%rsp),%rsi
	xorq	%r14,%r14

	subq	0(%rcx),%rax
	movq	16(%rsi),%rbx
	movq	24(%rsi),%rbp
	sbbq	8(%rcx),%rdx
	leaq	-1(%r9),%r15
	jmp	.Lsub4x
.align	16
.Lsub4x:
	movq	%rax,0(%rdi,%r14,8)
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	16(%rcx,%r14,8),%rbx
	movq	32(%rsi,%r14,8),%rax
	movq	40(%rsi,%r14,8),%rdx
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)
	movq	%rbp,24(%rdi,%r14,8)
	sbbq	32(%rcx,%r14,8),%rax
	movq	48(%rsi,%r14,8),%rbx
	movq	56(%rsi,%r14,8),%rbp
	sbbq	40(%rcx,%r14,8),%rdx
	leaq	4(%r14),%r14
	decq	%r15
	jnz	.Lsub4x

	movq	%rax,0(%rdi,%r14,8)
	movq	32(%rsi,%r14,8),%rax
	sbbq	16(%rcx,%r14,8),%rbx
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)

	/* Mask-select the copy source exactly as in bn_mul_mont. */
	sbbq	$0,%rax
	movq	%rbp,24(%rdi,%r14,8)
	xorq	%r14,%r14
	andq	%rax,%rsi
	notq	%rax
	movq	%rdi,%rcx
	andq	%rax,%rcx
	leaq	-1(%r9),%r15
	orq	%rcx,%rsi

	/* Copy 16 bytes at a time to (%rdi) and zero the temporary
	 * vector with %xmm0 as it is consumed. */
	movdqu	(%rsi),%xmm1
	movdqa	%xmm0,(%rsp)
	movdqu	%xmm1,(%rdi)
	jmp	.Lcopy4x
.align	16
.Lcopy4x:
	movdqu	16(%rsi,%r14,1),%xmm2
	movdqu	32(%rsi,%r14,1),%xmm1
	movdqa	%xmm0,16(%rsp,%r14,1)
	movdqu	%xmm2,16(%rdi,%r14,1)
	movdqa	%xmm0,32(%rsp,%r14,1)
	movdqu	%xmm1,32(%rdi,%r14,1)
	leaq	32(%r14),%r14
	decq	%r15
	jnz	.Lcopy4x

	shlq	$2,%r9			/* back to the full word count */
	movdqu	16(%rsi,%r14,1),%xmm2
	movdqa	%xmm0,16(%rsp,%r14,1)
	movdqu	%xmm2,16(%rdi,%r14,1)
	movq	8(%rsp,%r9,8),%rsi
	movq	$1,%rax
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lmul4x_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
.size	bn_mul4x_mont,.-bn_mul4x_mont



/*
 * bn_sqr8x_mont -- file-local.  Reached via .Lsqr8x_enter when both
 * operand pointers are equal and the count is a multiple of 8.  The
 * squaring/reduction itself is done by bn_sqrx8x_internal or
 * bn_sqr8x_internal, which are not defined in this file; what those
 * callees expect and leave in registers cannot be verified here.
 * Frame: 32(%rsp) = word loaded from (%r8), 40(%rsp) = entry %rsp.
 */
.type	bn_sqr8x_mont,@function
.align	32
bn_sqr8x_mont:
	movq	%rsp,%rax
.Lsqr8x_enter:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
.Lsqr8x_prologue:

	movl	%r9d,%r10d
	shll	$3,%r9d			/* %r9 = count*8 (bytes) */
	shlq	$3+2,%r10		/* %r10 = count*32 */
	negq	%r9






	/*
	 * Choose the frame base in %rbp from the page offset of the
	 * prospective frame relative to %rsi (presumably to keep the
	 * frame and the input from sharing offsets within a 4096-byte
	 * page -- TODO confirm against the generator's comments).
	 */
	leaq	-64(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	movq	(%r8),%r8
	subq	%rsi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lsqr8x_sp_alt
	subq	%r11,%rbp
	leaq	-64(%rbp,%r9,2),%rbp
	jmp	.Lsqr8x_sp_done

.align	32
.Lsqr8x_sp_alt:
	leaq	4096-64(,%r9,2),%r10
	leaq	-64(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lsqr8x_sp_done:
	andq	$-64,%rbp		/* 64-byte align the frame base */
	/* Step %rsp down to %rbp one 4096-byte page at a time. */
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lsqr8x_page_walk
	jmp	.Lsqr8x_page_walk_done

.align	16
.Lsqr8x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lsqr8x_page_walk
.Lsqr8x_page_walk_done:

	movq	%r9,%r10
	negq	%r9

	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)
.Lsqr8x_body:

.byte	102,72,15,110,209		/* movq %rcx,%xmm2 */
	pxor	%xmm0,%xmm0
.byte	102,72,15,110,207		/* movq %rdi,%xmm1 */
.byte	102,73,15,110,218		/* movq %r10,%xmm3 */
	/* Same capability test as .Lmul4x_enter: both bits of 0x80100
	 * set selects the bn_sqrx8x_internal callee. */
	movl	OPENSSL_ia32cap_P+8(%rip),%eax
	andl	$0x80100,%eax
	cmpl	$0x80100,%eax
	jne	.Lsqr8x_nox

	call	bn_sqrx8x_internal




	leaq	(%r8,%rcx,1),%rbx
	movq	%rcx,%r9
	movq	%rcx,%rdx
.byte	102,72,15,126,207		/* movq %xmm1,%rdi */
	sarq	$3+2,%rcx
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_nox:
	call	bn_sqr8x_internal




	leaq	(%rdi,%r9,1),%rbx
	movq	%r9,%rcx
	movq	%r9,%rdx
.byte	102,72,15,126,207		/* movq %xmm1,%rdi */
	sarq	$3+2,%rcx
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_sub:
	/*
	 * (%rdi) = (%rbx) - (%rbp), four words per iteration.  %rcx
	 * counts up to zero; incq and leaq leave CF alone, so the borrow
	 * chains across iterations.  (%rbx, %rbp, %rcx and the incoming
	 * CF depend on what the callee returned.)
	 */
	movq	0(%rbx),%r12
	movq	8(%rbx),%r13
	movq	16(%rbx),%r14
	movq	24(%rbx),%r15
	leaq	32(%rbx),%rbx
	sbbq	0(%rbp),%r12
	sbbq	8(%rbp),%r13
	sbbq	16(%rbp),%r14
	sbbq	24(%rbp),%r15
	leaq	32(%rbp),%rbp
	movq	%r12,0(%rdi)
	movq	%r13,8(%rdi)
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi
	incq	%rcx
	jnz	.Lsqr8x_sub

	sbbq	$0,%rax			/* fold the final borrow into %rax */
	leaq	(%rbx,%r9,1),%rbx
	leaq	(%rdi,%r9,1),%rdi

.byte	102,72,15,110,200		/* movq %rax,%xmm1 */
	pxor	%xmm0,%xmm0
	pshufd	$0,%xmm1,%xmm1		/* broadcast low dword as the select mask */
	movq	40(%rsp),%rsi
	jmp	.Lsqr8x_cond_copy

.align	32
.Lsqr8x_cond_copy:
	/*
	 * Per 32 bytes: (%rdi) = ((%rbx) & mask) | ((%rdi) & ~mask),
	 * where ~mask is built by comparing the mask with zero.  The
	 * source words at (%rbx) and at (%rbx,%rdx) are zeroed on the way.
	 */
	movdqa	0(%rbx),%xmm2
	movdqa	16(%rbx),%xmm3
	leaq	32(%rbx),%rbx
	movdqu	0(%rdi),%xmm4
	movdqu	16(%rdi),%xmm5
	leaq	32(%rdi),%rdi
	movdqa	%xmm0,-32(%rbx)
	movdqa	%xmm0,-16(%rbx)
	movdqa	%xmm0,-32(%rbx,%rdx,1)
	movdqa	%xmm0,-16(%rbx,%rdx,1)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-32(%rdi)
	movdqu	%xmm5,-16(%rdi)
	addq	$32,%r9
	jnz	.Lsqr8x_cond_copy

	movq	$1,%rax
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lsqr8x_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
.size	bn_sqr8x_mont,.-bn_sqr8x_mont

/*
 * bn_mulx4x_mont -- file-local.  Reached via .Lmulx4x_enter when the
 * capability test in .Lmul4x_enter passes; uses mulx, adcx and adox.
 * Frame layout (all stored just before .Lmulx4x_body):
 *    0(%rsp)  count*8 (vector length in bytes)
 *    8(%rsp)  pointer to the next word of the second operand
 *   16(%rsp)  end of the second operand (%rdx + count*8)
 *   24(%rsp)  word loaded from (%r8)
 *   32(%rsp)  result pointer (%rdi at entry)
 *   40(%rsp)  entry %rsp
 *   48(%rsp)  count/4 - 1 (inner-loop iteration count)
 *   64(%rsp)  start of the temporary vector
 */
.type	bn_mulx4x_mont,@function
.align	32
bn_mulx4x_mont:
	movq	%rsp,%rax
.Lmulx4x_enter:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
.Lmulx4x_prologue:

	shll	$3,%r9d			/* %r9 = count*8 */
	xorq	%r10,%r10
	subq	%r9,%r10
	movq	(%r8),%r8
	leaq	-72(%rsp,%r10,1),%rbp
	andq	$-128,%rbp		/* 128-byte align the frame base */
	/* Step %rsp down to %rbp one 4096-byte page at a time. */
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
	jmp	.Lmulx4x_page_walk_done

.align	16
.Lmulx4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
.Lmulx4x_page_walk_done:

	leaq	(%rdx,%r9,1),%r10












	movq	%r9,0(%rsp)
	shrq	$5,%r9
	movq	%r10,16(%rsp)
	subq	$1,%r9
	movq	%r8,24(%rsp)
	movq	%rdi,32(%rsp)
	movq	%rax,40(%rsp)
	movq	%r9,48(%rsp)
	jmp	.Lmulx4x_body

.align	32
.Lmulx4x_body:
	/*
	 * First pass.  %rdx alternates between the current word of the
	 * second operand (kept in %r9) and the per-pass factor (kept in
	 * %r8), because mulx takes its implicit multiplicand from %rdx.
	 * %rbp is held at zero; %rbx walks the temporary vector.
	 */
	leaq	8(%rdx),%rdi
	movq	(%rdx),%rdx
	leaq	64+32(%rsp),%rbx
	movq	%rdx,%r9

	mulxq	0(%rsi),%r8,%rax
	mulxq	8(%rsi),%r11,%r14
	addq	%rax,%r11
	movq	%rdi,8(%rsp)
	mulxq	16(%rsi),%r12,%r13
	adcq	%r14,%r12
	adcq	$0,%r13

	movq	%r8,%rdi
	imulq	24(%rsp),%r8
	xorq	%rbp,%rbp

	mulxq	24(%rsi),%rax,%r14
	movq	%r8,%rdx
	leaq	32(%rsi),%rsi
	adcxq	%rax,%r13
	adcxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%rdi
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00	/* mulxq 16(%rcx),%rax,%r12 with a 32-bit displacement */
	movq	48(%rsp),%rdi
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r11,-24(%rbx)
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r12,-16(%rbx)

	jmp	.Lmulx4x_1st

.align	32
.Lmulx4x_1st:
	adcxq	%rbp,%r15
	mulxq	0(%rsi),%r10,%rax
	adcxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
.byte	0x67,0x67			/* two address-size prefixes on the next movq (presumably padding) */
	movq	%r8,%rdx
	adcxq	%rax,%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-32(%rbx)
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_1st

	movq	0(%rsp),%rax
	movq	8(%rsp),%rdi
	adcq	%rbp,%r15
	addq	%r15,%r14
	sbbq	%r15,%r15		/* %r15 = 0 or -1: carry out of the pass */
	movq	%r14,-8(%rbx)
	jmp	.Lmulx4x_outer

.align	32
.Lmulx4x_outer:
	/*
	 * Remaining passes: rewind %rsi and %rcx by the vector length
	 * (%rax), fetch the next second-operand word, and accumulate on
	 * top of the temporary vector.
	 */
	movq	(%rdi),%rdx
	leaq	8(%rdi),%rdi
	subq	%rax,%rsi
	movq	%r15,(%rbx)
	leaq	64+32(%rsp),%rbx
	subq	%rax,%rcx

	mulxq	0(%rsi),%r8,%r11
	xorl	%ebp,%ebp
	movq	%rdx,%r9
	mulxq	8(%rsi),%r14,%r12
	adoxq	-32(%rbx),%r8
	adcxq	%r14,%r11
	mulxq	16(%rsi),%r15,%r13
	adoxq	-24(%rbx),%r11
	adcxq	%r15,%r12
	adoxq	-16(%rbx),%r12
	adcxq	%rbp,%r13
	adoxq	%rbp,%r13

	movq	%rdi,8(%rsp)
	movq	%r8,%r15
	imulq	24(%rsp),%r8
	xorl	%ebp,%ebp

	mulxq	24(%rsi),%rax,%r14
	movq	%r8,%rdx
	adcxq	%rax,%r13
	adoxq	-8(%rbx),%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	adoxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r11,-24(%rbx)
	leaq	32(%rcx),%rcx
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	movq	48(%rsp),%rdi
	movq	%r12,-16(%rbx)

	jmp	.Lmulx4x_inner

.align	32
.Lmulx4x_inner:
	mulxq	0(%rsi),%r10,%rax
	adcxq	%rbp,%r15
	adoxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	0(%rbx),%r10
	adoxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	8(%rbx),%r11
	adoxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
	movq	%r8,%rdx
	adcxq	16(%rbx),%r12
	adoxq	%rax,%r13
	adcxq	24(%rbx),%r13
	adoxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx
	adcxq	%rbp,%r14

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r11,-32(%rbx)
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_inner

	movq	0(%rsp),%rax
	movq	8(%rsp),%rdi
	adcq	%rbp,%r15
	subq	0(%rbx),%rbp		/* regenerate CF from the previous pass's carry word */
	adcq	%r15,%r14
	sbbq	%r15,%r15
	movq	%r14,-8(%rbx)

	cmpq	16(%rsp),%rdi		/* consumed the whole second operand? */
	jne	.Lmulx4x_outer

	/*
	 * (result) = temporary vector - %rcx vector, four words per
	 * iteration; %rax = count/4 iterations, %rdx = count*8 for the
	 * copy loop, %r15 = top carry (0 or 1 after negq).
	 */
	leaq	64(%rsp),%rbx
	subq	%rax,%rcx
	negq	%r15
	movq	%rax,%rdx
	shrq	$3+2,%rax
	movq	32(%rsp),%rdi
	jmp	.Lmulx4x_sub

.align	32
.Lmulx4x_sub:
	movq	0(%rbx),%r11
	movq	8(%rbx),%r12
	movq	16(%rbx),%r13
	movq	24(%rbx),%r14
	leaq	32(%rbx),%rbx
	sbbq	0(%rcx),%r11
	sbbq	8(%rcx),%r12
	sbbq	16(%rcx),%r13
	sbbq	24(%rcx),%r14
	leaq	32(%rcx),%rcx
	movq	%r11,0(%rdi)
	movq	%r12,8(%rdi)
	movq	%r13,16(%rdi)
	movq	%r14,24(%rdi)
	leaq	32(%rdi),%rdi
	decq	%rax
	jnz	.Lmulx4x_sub

	sbbq	$0,%r15			/* %r15 = top carry - borrow: the select mask */
	leaq	64(%rsp),%rbx
	subq	%rdx,%rdi		/* rewind the result pointer */

.byte	102,73,15,110,207		/* movq %r15,%xmm1 */
	pxor	%xmm0,%xmm0
	pshufd	$0,%xmm1,%xmm1		/* broadcast low dword of the mask */
	movq	40(%rsp),%rsi
	jmp	.Lmulx4x_cond_copy

.align	32
.Lmulx4x_cond_copy:
	/* Per 32 bytes: (%rdi) = ((%rbx) & mask) | ((%rdi) & ~mask);
	 * the temporary vector at (%rbx) is zeroed as it is read. */
	movdqa	0(%rbx),%xmm2
	movdqa	16(%rbx),%xmm3
	leaq	32(%rbx),%rbx
	movdqu	0(%rdi),%xmm4
	movdqu	16(%rdi),%xmm5
	leaq	32(%rdi),%rdi
	movdqa	%xmm0,-32(%rbx)
	movdqa	%xmm0,-16(%rbx)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-32(%rdi)
	movdqu	%xmm5,-16(%rdi)
	subq	$32,%rdx
	jnz	.Lmulx4x_cond_copy

	movq	%rdx,(%rbx)		/* %rdx is zero here: clear the carry word too */

	movq	$1,%rax
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lmulx4x_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
.size	bn_mulx4x_mont,.-bn_mulx4x_mont
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	16