/* x86_64-mont.S revision 305153 */
/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/x86_64-mont.S 305153 2016-08-31 20:33:59Z jkim $ */
/* Do not modify. This file is auto-generated from x86_64-mont.pl. */

/*
 * Syntax: GNU as, AT&T operand order (source, destination).
 *
 * The trailing .byte string at the end of this file decodes to
 * "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro@openssl.org>".
 *
 * One exported symbol, bn_mul_mont, plus three file-local entry points
 * (bn_mul4x_mont, bn_sqr8x_mont, bn_mulx4x_mont) that are only reached by
 * jumps from bn_mul_mont / bn_mul4x_mont.
 *
 * Register use on entry, as read from the code below:
 *   %rdi  destination vector (written by the final subtract/copy loops)
 *   %rsi  first source vector
 *   %rdx  second source vector (walked one word per outer iteration)
 *   %rcx  vector that is multiplied by the per-iteration factor and later
 *         subtracted from the result (presumably the modulus; confirm
 *         against the C prototype)
 *   %r8   pointer to a single word that is loaded once and multiplied into
 *         the low result word of every outer iteration
 *   %r9d  word count
 * Every path returns 1 in %rax and restores %rbx, %rbp, %r12-%r15 and %rsp.
 *
 * External symbols referenced but not defined here:
 *   OPENSSL_ia32cap_P, bn_sqr8x_internal, bn_sqrx8x_internal.
 */
.text

/*
 * bn_mul_mont: dispatcher followed by the generic one-word-at-a-time path.
 *
 * Dispatch:
 *   - count not a multiple of 4, or count < 8      -> .Lmul_enter (below)
 *   - otherwise, if %rdx != %rsi                    -> .Lmul4x_enter
 *   - otherwise, if count is a multiple of 8        -> .Lsqr8x_enter
 *   - otherwise                                     -> .Lmul4x_enter
 * %r11d carries the dword at OPENSSL_ia32cap_P+8 into .Lmul4x_enter.
 */
.globl	bn_mul_mont
.type	bn_mul_mont,@function
.align	16
bn_mul_mont:
	testl	$3,%r9d
	jnz	.Lmul_enter
	cmpl	$8,%r9d
	jb	.Lmul_enter
	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpq	%rsi,%rdx
	jne	.Lmul4x_enter
	testl	$7,%r9d
	jz	.Lsqr8x_enter
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	/* Zero-extend the count, then carve (count+2) words off the stack,
	   rounded down to a 1024-byte boundary. */
	movl	%r9d,%r9d
	leaq	2(%r9),%r10
	movq	%rsp,%r11
	negq	%r10
	leaq	(%rsp,%r10,8),%rsp
	andq	$-1024,%rsp

	/* Slot count+1 of the frame keeps the pre-allocation %rsp. */
	movq	%r11,8(%rsp,%r9,8)
.Lmul_body:

	/* Touch the new frame one 4096-byte page at a time, starting from the
	   page nearest the old %rsp and moving down to the new %rsp. */
	subq	%rsp,%r11
	andq	$-4096,%r11
.Lmul_page_walk:
	movq	(%rsp,%r11,1),%r10
	subq	$4096,%r11
/* Two prefix bytes (0x66, 0x2e) emitted in front of the jnc. */
.byte	0x66,0x2e
	jnc	.Lmul_page_walk

	/* %r12 = second source, %r8 = word loaded from the %r8 pointer,
	   %rbx = first word of the second source, %r14 = outer index,
	   %r15 = inner index. */
	movq	%rdx,%r12
	movq	(%r8),%r8
	movq	(%r12),%rbx
	movq	(%rsi),%rax

	xorq	%r14,%r14
	xorq	%r15,%r15

	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	/* %rbp = low result word * %r8: the factor applied to the %rcx vector
	   for this pass. */
	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.L1st_enter

/* First pass: the stack temporary is written, not accumulated into. */
.align	16
.L1st:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.L1st_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	1(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.L1st

	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13
	movq	%r10,%r11

	/* Top word and carry word of the temporary. */
	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	jmp	.Louter
/* Remaining passes: same shape as the first, but each step also adds the
   word already stored in the stack temporary. */
.align	16
.Louter:
	movq	(%r12,%r14,8),%rbx
	xorq	%r15,%r15
	movq	%r8,%rbp
	movq	(%rsp),%r10
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.Linner_enter

.align	16
.Linner:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.Linner_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.Linner

	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	cmpq	%r9,%r14
	jb	.Louter

	/* Final step: destination = temporary - (%rcx vector).  The xorq also
	   clears CF for the first sbbq. */
	xorq	%r14,%r14
	movq	(%rsp),%rax
	leaq	(%rsp),%rsi
	movq	%r9,%r15
	jmp	.Lsub
.align	16
.Lsub:	sbbq	(%rcx,%r14,8),%rax
	movq	%rax,(%rdi,%r14,8)
	movq	8(%rsi,%r14,8),%rax
	leaq	1(%r14),%r14
	decq	%r15
	jnz	.Lsub

	/* Fold the final borrow into the carry word, then pick the copy source
	   without branching: %rsi = (%rsi & %rax) | (%rdi & ~%rax).
	   NOTE(review): %rax is expected to be 0 or all-ones here; confirm. */
	sbbq	$0,%rax
	xorq	%r14,%r14
	andq	%rax,%rsi
	notq	%rax
	movq	%rdi,%rcx
	andq	%rax,%rcx
	movq	%r9,%r15
	orq	%rcx,%rsi
/* Copy the selected source to the destination while overwriting each word
   of the stack temporary with the loop index. */
.align	16
.Lcopy:
	movq	(%rsi,%r14,8),%rax
	movq	%r14,(%rsp,%r14,8)
	movq	%rax,(%rdi,%r14,8)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	.Lcopy

	/* Reload the saved %rsp, return 1, restore callee-saved registers. */
	movq	8(%rsp,%r9,8),%rsi
	movq	$1,%rax
	movq	(%rsi),%r15
	movq	8(%rsi),%r14
	movq	16(%rsi),%r13
	movq	24(%rsi),%r12
	movq	32(%rsi),%rbp
	movq	40(%rsi),%rbx
	leaq	48(%rsi),%rsp
.Lmul_epilogue:
	/* F3 C3: rep ret */
	.byte	0xf3,0xc3
.size	bn_mul_mont,.-bn_mul_mont

/*
 * bn_mul4x_mont: four-words-per-iteration variant, entered from bn_mul_mont
 * with %r11d = dword at OPENSSL_ia32cap_P+8.  When both bits of mask 0x80100
 * are set in %r11d, control moves on to .Lmulx4x_enter instead (that path
 * uses mulx/adcx/adox; the mask presumably selects BMI2 and ADX - confirm).
 *
 * Frame layout (count = %r9):
 *   0 .. count*8        temporary result, plus one carry word
 *   8(%rsp,%r9,8)       pre-allocation %rsp
 *   16(%rsp,%r9,8)      saved destination pointer (%rdi is reused as scratch)
 */
.type	bn_mul4x_mont,@function
.align	16
bn_mul4x_mont:
.Lmul4x_enter:
	andl	$0x80100,%r11d
	cmpl	$0x80100,%r11d
	je	.Lmulx4x_enter
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	movl	%r9d,%r9d
	leaq	4(%r9),%r10
	movq	%rsp,%r11
	negq	%r10
	leaq	(%rsp,%r10,8),%rsp
	andq	$-1024,%rsp

	movq	%r11,8(%rsp,%r9,8)
.Lmul4x_body:
	/* Page-by-page touch of the new frame, as in .Lmul_page_walk. */
	subq	%rsp,%r11
	andq	$-4096,%r11
.Lmul4x_page_walk:
	movq	(%rsp,%r11,1),%r10
	subq	$4096,%r11
/* One prefix byte (0x2e) emitted in front of the jnc. */
.byte	0x2e
	jnc	.Lmul4x_page_walk

	movq	%rdi,16(%rsp,%r9,8)
	movq	%rdx,%r12
	movq	(%r8),%r8
	movq	(%r12),%rbx
	movq	(%rsi),%rax

	xorq	%r14,%r14
	xorq	%r15,%r15

	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdi,(%rsp)
	movq	%rdx,%r13
	jmp	.L1st4x
/* First pass, unrolled four words per iteration. */
.align	16
.L1st4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	4(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	.L1st4x

	/* Tail of the first pass: last two words plus the carry word. */
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)

	leaq	1(%r14),%r14
/* Remaining passes: as above, but accumulating into the stack temporary. */
.align	4
.Louter4x:
	movq	(%r12,%r14,8),%rbx
	xorq	%r15,%r15
	movq	(%rsp),%r10
	movq	%r8,%rbp
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%rsp),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdi,(%rsp)
	movq	%rdx,%r13
	jmp	.Linner4x
.align	16
.Linner4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	4(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	.Linner4x

	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	1(%r14),%r14
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	/* Fold in the previous pass's carry word. */
	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%rsp,%r9,8),%r13
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)

	cmpq	%r9,%r14
	jb	.Louter4x

	/* Final subtraction, four words per iteration.  %rdi is reloaded from
	   its frame slot; %xmm0 = 0 is used further down to wipe the stack
	   temporary.  %r9 is temporarily divided by 4 to count groups. */
	movq	16(%rsp,%r9,8),%rdi
	movq	0(%rsp),%rax
	pxor	%xmm0,%xmm0
	movq	8(%rsp),%rdx
	shrq	$2,%r9
	leaq	(%rsp),%rsi
	xorq	%r14,%r14

	subq	0(%rcx),%rax
	movq	16(%rsi),%rbx
	movq	24(%rsi),%rbp
	sbbq	8(%rcx),%rdx
	leaq	-1(%r9),%r15
	jmp	.Lsub4x
.align	16
.Lsub4x:
	movq	%rax,0(%rdi,%r14,8)
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	16(%rcx,%r14,8),%rbx
	movq	32(%rsi,%r14,8),%rax
	movq	40(%rsi,%r14,8),%rdx
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)
	movq	%rbp,24(%rdi,%r14,8)
	sbbq	32(%rcx,%r14,8),%rax
	movq	48(%rsi,%r14,8),%rbx
	movq	56(%rsi,%r14,8),%rbp
	sbbq	40(%rcx,%r14,8),%rdx
	leaq	4(%r14),%r14
	decq	%r15
	jnz	.Lsub4x

	movq	%rax,0(%rdi,%r14,8)
	movq	32(%rsi,%r14,8),%rax
	sbbq	16(%rcx,%r14,8),%rbx
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)

	/* Branch-free source select, same form as in bn_mul_mont:
	   %rsi = (%rsi & %rax) | (%rdi & ~%rax). */
	sbbq	$0,%rax
	movq	%rbp,24(%rdi,%r14,8)
	xorq	%r14,%r14
	andq	%rax,%rsi
	notq	%rax
	movq	%rdi,%rcx
	andq	%rax,%rcx
	leaq	-1(%r9),%r15
	orq	%rcx,%rsi

	/* Copy 16 bytes at a time to the destination and zero the stack
	   temporary as it is consumed. */
	movdqu	(%rsi),%xmm1
	movdqa	%xmm0,(%rsp)
	movdqu	%xmm1,(%rdi)
	jmp	.Lcopy4x
.align	16
.Lcopy4x:
	movdqu	16(%rsi,%r14,1),%xmm2
	movdqu	32(%rsi,%r14,1),%xmm1
	movdqa	%xmm0,16(%rsp,%r14,1)
	movdqu	%xmm2,16(%rdi,%r14,1)
	movdqa	%xmm0,32(%rsp,%r14,1)
	movdqu	%xmm1,32(%rdi,%r14,1)
	leaq	32(%r14),%r14
	decq	%r15
	jnz	.Lcopy4x

	/* Restore the word count, finish the last 16 bytes, then unwind. */
	shlq	$2,%r9
	movdqu	16(%rsi,%r14,1),%xmm2
	movdqa	%xmm0,16(%rsp,%r14,1)
	movdqu	%xmm2,16(%rdi,%r14,1)
	movq	8(%rsp,%r9,8),%rsi
	movq	$1,%rax
	movq	(%rsi),%r15
	movq	8(%rsi),%r14
	movq	16(%rsi),%r13
	movq	24(%rsi),%r12
	movq	32(%rsi),%rbp
	movq	40(%rsi),%rbx
	leaq	48(%rsi),%rsp
.Lmul4x_epilogue:
	/* F3 C3: rep ret */
	.byte	0xf3,0xc3
.size	bn_mul4x_mont,.-bn_mul4x_mont

/*
 * bn_sqr8x_mont: reached from bn_mul_mont when %rdx == %rsi and the count
 * is a multiple of 8.  The heavy lifting is done by bn_sqr8x_internal or
 * bn_sqrx8x_internal (both defined outside this file); this wrapper only
 * builds the frame, picks the callee, and performs the final conditional
 * subtraction and copy.
 *
 * Frame slots written here: 32(%rsp) = word loaded from the %r8 pointer,
 * 40(%rsp) = %rsp as it was on entry (before the six pushes).
 *
 * NOTE(review): the registers consumed after each call (%r8/%rcx/%rbp on
 * the sqrx path, %rdi/%r9/%rbp on the sqr path, and %rax in both) are set
 * by the callee; their meaning cannot be confirmed from this file alone.
 */
.type	bn_sqr8x_mont,@function
.align	32
bn_sqr8x_mont:
.Lsqr8x_enter:
	movq	%rsp,%rax
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	/* %r10 = count*32, %r9 = -(count*8). */
	movl	%r9d,%r10d
	shll	$3,%r9d
	shlq	$3+2,%r10
	negq	%r9

	/* Choose the frame position from the distance, modulo 4096, between
	   the prospective frame and the source vector at %rsi. */
	leaq	-64(%rsp,%r9,2),%r11
	movq	(%r8),%r8
	subq	%rsi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lsqr8x_sp_alt
	subq	%r11,%rsp
	leaq	-64(%rsp,%r9,2),%rsp
	jmp	.Lsqr8x_sp_done

.align	32
.Lsqr8x_sp_alt:
	leaq	4096-64(,%r9,2),%r10
	leaq	-64(%rsp,%r9,2),%rsp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rsp
.Lsqr8x_sp_done:
	andq	$-64,%rsp
	/* Page-by-page touch from the entry %rsp down to the new %rsp. */
	movq	%rax,%r11
	subq	%rsp,%r11
	andq	$-4096,%r11
.Lsqr8x_page_walk:
	movq	(%rsp,%r11,1),%r10
	subq	$4096,%r11
/* One prefix byte (0x2e) emitted in front of the jnc. */
.byte	0x2e
	jnc	.Lsqr8x_page_walk

	/* %r10 = -(count*8), %r9 = count*8. */
	movq	%r9,%r10
	negq	%r9

	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)
.Lsqr8x_body:

/* 66 48 0F 6E D1: movq %rcx,%xmm2 */
.byte	102,72,15,110,209
	pxor	%xmm0,%xmm0
/* 66 48 0F 6E CF: movq %rdi,%xmm1 */
.byte	102,72,15,110,207
/* 66 49 0F 6E DA: movq %r10,%xmm3 */
.byte	102,73,15,110,218
	/* Same 0x80100 capability test as in .Lmul4x_enter. */
	movl	OPENSSL_ia32cap_P+8(%rip),%eax
	andl	$0x80100,%eax
	cmpl	$0x80100,%eax
	jne	.Lsqr8x_nox

	call	bn_sqrx8x_internal

	leaq	(%r8,%rcx,1),%rbx
	movq	%rcx,%r9
	movq	%rcx,%rdx
/* 66 48 0F 7E CF: movq %xmm1,%rdi (destination pointer saved above) */
.byte	102,72,15,126,207
	sarq	$3+2,%rcx
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_nox:
	call	bn_sqr8x_internal

	leaq	(%rdi,%r9,1),%rbx
	movq	%r9,%rcx
	movq	%r9,%rdx
/* 66 48 0F 7E CF: movq %xmm1,%rdi (destination pointer saved above) */
.byte	102,72,15,126,207
	sarq	$3+2,%rcx
	jmp	.Lsqr8x_sub

/* destination = (%rbx vector) - (%rbp vector), four words per iteration;
   %rcx counts up to zero. */
.align	32
.Lsqr8x_sub:
	movq	0(%rbx),%r12
	movq	8(%rbx),%r13
	movq	16(%rbx),%r14
	movq	24(%rbx),%r15
	leaq	32(%rbx),%rbx
	sbbq	0(%rbp),%r12
	sbbq	8(%rbp),%r13
	sbbq	16(%rbp),%r14
	sbbq	24(%rbp),%r15
	leaq	32(%rbp),%rbp
	movq	%r12,0(%rdi)
	movq	%r13,8(%rdi)
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi
	incq	%rcx
	jnz	.Lsqr8x_sub

	/* Fold the final borrow into %rax, rewind %rbx and %rdi by adding
	   %r9, and broadcast the low dword of %rax across %xmm1 as the
	   selection mask. */
	sbbq	$0,%rax
	leaq	(%rbx,%r9,1),%rbx
	leaq	(%rdi,%r9,1),%rdi

/* 66 48 0F 6E C8: movq %rax,%xmm1 */
.byte	102,72,15,110,200
	pxor	%xmm0,%xmm0
	pshufd	$0,%xmm1,%xmm1
	movq	40(%rsp),%rsi
	jmp	.Lsqr8x_cond_copy

/* Masked merge, 32 bytes per iteration:
     destination = (stack words & %xmm1) | (destination & (%xmm1 == 0))
   The stack words just read, and the words %rdx bytes beyond them, are
   overwritten with zero. */
.align	32
.Lsqr8x_cond_copy:
	movdqa	0(%rbx),%xmm2
	movdqa	16(%rbx),%xmm3
	leaq	32(%rbx),%rbx
	movdqu	0(%rdi),%xmm4
	movdqu	16(%rdi),%xmm5
	leaq	32(%rdi),%rdi
	movdqa	%xmm0,-32(%rbx)
	movdqa	%xmm0,-16(%rbx)
	movdqa	%xmm0,-32(%rbx,%rdx,1)
	movdqa	%xmm0,-16(%rbx,%rdx,1)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-32(%rdi)
	movdqu	%xmm5,-16(%rdi)
	addq	$32,%r9
	jnz	.Lsqr8x_cond_copy

	/* %rsi = entry %rsp; the six saved registers sit just below it. */
	movq	$1,%rax
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lsqr8x_epilogue:
	/* F3 C3: rep ret */
	.byte	0xf3,0xc3
.size	bn_sqr8x_mont,.-bn_sqr8x_mont

/*
 * bn_mulx4x_mont: mulx/adcx/adox variant, reached from .Lmul4x_enter when
 * both bits of mask 0x80100 are set in the capability word.
 *
 * Frame slots (all relative to the 128-byte-aligned %rsp):
 *    0(%rsp)  count*8
 *    8(%rsp)  pointer to the next word of the second source
 *   16(%rsp)  second source + count*8 (outer-loop end marker)
 *   24(%rsp)  word loaded from the %r8 pointer
 *   32(%rsp)  destination pointer
 *   40(%rsp)  %rsp as it was on entry (before the six pushes)
 *   48(%rsp)  (count*8 >> 5) - 1, the inner-loop iteration count
 *   64(%rsp)  start of the temporary result
 */
.type	bn_mulx4x_mont,@function
.align	32
bn_mulx4x_mont:
.Lmulx4x_enter:
	movq	%rsp,%rax
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	shll	$3,%r9d
/* One prefix byte (0x67) emitted in front of the xorq. */
.byte	0x67
	xorq	%r10,%r10
	subq	%r9,%r10
	movq	(%r8),%r8
	leaq	-72(%rsp,%r10,1),%rsp
	andq	$-128,%rsp
	/* Page-by-page touch from the entry %rsp down to the new %rsp. */
	movq	%rax,%r11
	subq	%rsp,%r11
	andq	$-4096,%r11
.Lmulx4x_page_walk:
	movq	(%rsp,%r11,1),%r10
	subq	$4096,%r11
/* Two prefix bytes (0x66, 0x2e) emitted in front of the jnc. */
.byte	0x66,0x2e
	jnc	.Lmulx4x_page_walk

	leaq	(%rdx,%r9,1),%r10

	movq	%r9,0(%rsp)
	shrq	$5,%r9
	movq	%r10,16(%rsp)
	subq	$1,%r9
	movq	%r8,24(%rsp)
	movq	%rdi,32(%rsp)
	movq	%rax,40(%rsp)
	movq	%r9,48(%rsp)
	jmp	.Lmulx4x_body

.align	32
.Lmulx4x_body:
	/* %rdx becomes the implicit mulx multiplier (first word of the second
	   source); %r9 keeps a copy so it can be swapped back in after the
	   per-pass factor in %r8 has been used. */
	leaq	8(%rdx),%rdi
	movq	(%rdx),%rdx
	leaq	64+32(%rsp),%rbx
	movq	%rdx,%r9

	mulxq	0(%rsi),%r8,%rax
	mulxq	8(%rsi),%r11,%r14
	addq	%rax,%r11
	movq	%rdi,8(%rsp)
	mulxq	16(%rsi),%r12,%r13
	adcq	%r14,%r12
	adcq	$0,%r13

	/* %r8 = low result word * 24(%rsp); xorq zeroes %rbp and clears both
	   CF and OF for the adcx/adox chains. */
	movq	%r8,%rdi
	imulq	24(%rsp),%r8
	xorq	%rbp,%rbp

	mulxq	24(%rsi),%rax,%r14
	movq	%r8,%rdx
	leaq	32(%rsi),%rsi
	adcxq	%rax,%r13
	adcxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%rdi
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
/* C4 62 FB F6 A1 10 00 00 00: mulxq 16(%rcx),%rax,%r12 with a 32-bit
   displacement. */
.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00
	movq	48(%rsp),%rdi
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r11,-24(%rbx)
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r12,-16(%rbx)

	jmp	.Lmulx4x_1st

/* First pass, four words per iteration; %rdi counts iterations. */
.align	32
.Lmulx4x_1st:
	adcxq	%rbp,%r15
	mulxq	0(%rsi),%r10,%rax
	adcxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
/* Two prefix bytes (0x67, 0x67) emitted in front of the movq. */
.byte	0x67,0x67
	movq	%r8,%rdx
	adcxq	%rax,%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-32(%rbx)
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_1st

	/* %rax = count*8, %rdi = next word of the second source; %r15 becomes
	   0 or -1 from the top carry (sbbq of a register with itself). */
	movq	0(%rsp),%rax
	movq	8(%rsp),%rdi
	adcq	%rbp,%r15
	addq	%r15,%r14
	sbbq	%r15,%r15
	movq	%r14,-8(%rbx)
	jmp	.Lmulx4x_outer

/* Remaining passes: rewind %rsi and %rcx by count*8, store the carry word,
   and accumulate into the temporary. */
.align	32
.Lmulx4x_outer:
	movq	(%rdi),%rdx
	leaq	8(%rdi),%rdi
	subq	%rax,%rsi
	movq	%r15,(%rbx)
	leaq	64+32(%rsp),%rbx
	subq	%rax,%rcx

	mulxq	0(%rsi),%r8,%r11
	xorl	%ebp,%ebp
	movq	%rdx,%r9
	mulxq	8(%rsi),%r14,%r12
	adoxq	-32(%rbx),%r8
	adcxq	%r14,%r11
	mulxq	16(%rsi),%r15,%r13
	adoxq	-24(%rbx),%r11
	adcxq	%r15,%r12
	adoxq	%rbp,%r12
	adcxq	%rbp,%r13

	movq	%rdi,8(%rsp)
/* One prefix byte (0x67) emitted in front of the movq. */
.byte	0x67
	movq	%r8,%r15
	imulq	24(%rsp),%r8
	xorl	%ebp,%ebp

	mulxq	24(%rsi),%rax,%r14
	movq	%r8,%rdx
	adoxq	-16(%rbx),%r12
	adcxq	%rax,%r13
	adoxq	-8(%rbx),%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	adoxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r11,-24(%rbx)
	leaq	32(%rcx),%rcx
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	movq	48(%rsp),%rdi
	movq	%r12,-16(%rbx)

	jmp	.Lmulx4x_inner

.align	32
.Lmulx4x_inner:
	mulxq	0(%rsi),%r10,%rax
	adcxq	%rbp,%r15
	adoxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	0(%rbx),%r10
	adoxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	8(%rbx),%r11
	adoxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
	movq	%r8,%rdx
	adcxq	16(%rbx),%r12
	adoxq	%rax,%r13
	adcxq	24(%rbx),%r13
	adoxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx
	adcxq	%rbp,%r14

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r11,-32(%rbx)
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_inner

	/* %rbp is zero here, so the subq sets CF exactly when the stored
	   carry word at 0(%rbx) is non-zero; that carry is then folded into
	   the top word. */
	movq	0(%rsp),%rax
	movq	8(%rsp),%rdi
	adcq	%rbp,%r15
	subq	0(%rbx),%rbp
	adcq	%r15,%r14
	sbbq	%r15,%r15
	movq	%r14,-8(%rbx)

	cmpq	16(%rsp),%rdi
	jne	.Lmulx4x_outer

	/* Final subtraction setup: %rbx = temporary, %rcx rewound by count*8,
	   %r15 = 0 or 1 (negated carry mask), %rdx = count*8,
	   %rax = count*8 / 32 iterations, %rdi = destination. */
	leaq	64(%rsp),%rbx
	subq	%rax,%rcx
	negq	%r15
	movq	%rax,%rdx
	shrq	$3+2,%rax
	movq	32(%rsp),%rdi
	jmp	.Lmulx4x_sub

.align	32
.Lmulx4x_sub:
	movq	0(%rbx),%r11
	movq	8(%rbx),%r12
	movq	16(%rbx),%r13
	movq	24(%rbx),%r14
	leaq	32(%rbx),%rbx
	sbbq	0(%rcx),%r11
	sbbq	8(%rcx),%r12
	sbbq	16(%rcx),%r13
	sbbq	24(%rcx),%r14
	leaq	32(%rcx),%rcx
	movq	%r11,0(%rdi)
	movq	%r12,8(%rdi)
	movq	%r13,16(%rdi)
	movq	%r14,24(%rdi)
	leaq	32(%rdi),%rdi
	decq	%rax
	jnz	.Lmulx4x_sub

	/* Fold the final borrow into %r15, rewind %rbx and %rdi, and broadcast
	   the low dword of %r15 across %xmm1 as the selection mask. */
	sbbq	$0,%r15
	leaq	64(%rsp),%rbx
	subq	%rdx,%rdi

/* 66 49 0F 6E CF: movq %r15,%xmm1 */
.byte	102,73,15,110,207
	pxor	%xmm0,%xmm0
	pshufd	$0,%xmm1,%xmm1
	movq	40(%rsp),%rsi
	jmp	.Lmulx4x_cond_copy

/* Masked merge, 32 bytes per iteration:
     destination = (stack words & %xmm1) | (destination & (%xmm1 == 0))
   The stack words just read are overwritten with zero. */
.align	32
.Lmulx4x_cond_copy:
	movdqa	0(%rbx),%xmm2
	movdqa	16(%rbx),%xmm3
	leaq	32(%rbx),%rbx
	movdqu	0(%rdi),%xmm4
	movdqu	16(%rdi),%xmm5
	leaq	32(%rdi),%rdi
	movdqa	%xmm0,-32(%rbx)
	movdqa	%xmm0,-16(%rbx)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-32(%rdi)
	movdqu	%xmm5,-16(%rdi)
	subq	$32,%rdx
	jnz	.Lmulx4x_cond_copy

	/* %rdx is zero here: clear the carry word that follows the temporary. */
	movq	%rdx,(%rbx)

	/* %rsi = entry %rsp; the six saved registers sit just below it. */
	movq	$1,%rax
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lmulx4x_epilogue:
	/* F3 C3: rep ret */
	.byte	0xf3,0xc3
.size	bn_mulx4x_mont,.-bn_mulx4x_mont
/* NUL-terminated ASCII identification string (decoded in the file header). */
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	16