/* x86_64-mont.S revision 306195 */
/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/x86_64-mont.S 306195 2016-09-22 14:57:48Z jkim $ */
/* Do not modify. This file is auto-generated from x86_64-mont.pl. */
#
# Montgomery multiplication for OpenSSL BN, x86-64, GAS/AT&T syntax,
# System V AMD64 ABI.  Arguments (per the bn_mul_mont() C prototype the
# perlasm source generates for):
#       %rdi = rp   (result,  num words)
#       %rsi = ap   (input a, num words)
#       %rdx = bp   (input b, num words)
#       %rcx = np   (modulus, num words)
#       %r8  = &n0  (pointer to the Montgomery constant -n^-1 mod 2^64)
#       %r9d = num  (word count)
# Returns 1 in %rax.  All four entry points below share this contract.
.text



.globl bn_mul_mont
.type bn_mul_mont,@function
.align 16
bn_mul_mont:
	movl	%r9d,%r9d		# zero-extend num into full %r9
	movq	%rsp,%rax		# keep caller %rsp for the saved-frame slot
	# Dispatch: num%4 != 0 or num < 8 -> generic word-serial path below;
	# ap == bp and num%8 == 0        -> 8x squaring path;
	# otherwise                      -> 4x unrolled multiply.
	testl	$3,%r9d
	jnz	.Lmul_enter
	cmpl	$8,%r9d
	jb	.Lmul_enter
	movl	OPENSSL_ia32cap_P+8(%rip),%r11d	# capability word, tested for
						# $0x80100 (BMI2+ADX) in mul4x
	cmpq	%rsi,%rdx
	jne	.Lmul4x_enter
	testl	$7,%r9d
	jz	.Lsqr8x_enter
	jmp	.Lmul4x_enter

.align 16
.Lmul_enter:
	pushq	%rbx			# save all SysV callee-saved GPRs
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	# Carve a (num+2)-word scratch area below %rsp, 1024-byte aligned.
	negq	%r9
	movq	%rsp,%r11
	leaq	-16(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10




	# Walk the stack down one page at a time so every page between the
	# old and new %rsp is touched (don't skip the guard page).
	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11		# probe
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

.align 16
.Lmul_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11		# probe each page
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	movq	%rax,8(%rsp,%r9,8)	# stash original %rsp above tp[num]
.Lmul_body:
	# Register roles from here on:
	#   %r12=bp, %rbx=bp[i], %rbp=m (Montgomery reduction multiplier),
	#   %r14=outer index i, %r15=inner index j, tp[] lives at (%rsp).
	movq	%rdx,%r12
	movq	(%r8),%r8		# load n0 value
	movq	(%r12),%rbx		# bp[0]
	movq	(%rsi),%rax		# ap[0]

	xorq	%r14,%r14
	xorq	%r15,%r15

	# ---- i == 0 iteration: tp[] = ap[]*bp[0] + m*np[] ----
	movq	%r8,%rbp
	mulq	%rbx			# ap[0]*bp[0]
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp		# m = tp[0]*n0 (mod 2^64)
	movq	%rdx,%r11

	mulq	%rbp			# np[0]*m
	addq	%rax,%r10		# low word discarded (becomes zero)
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.L1st_enter

.align 16
.L1st:
	addq	%rax,%r13		# accumulate np[j]*m carry chain
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)	# tp[j-1]
	movq	%rdx,%r13

.L1st_enter:
	mulq	%rbx			# ap[j]*bp[0]
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	1(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp			# np[j]*m
	cmpq	%r9,%r15
	jne	.L1st

	# Tail of the first pass: fold the last partial products.
	addq	%rax,%r13
	movq	(%rsi),%rax		# ap[0] for the next outer iteration
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)	# tp[j-1]
	movq	%rdx,%r13
	movq	%r10,%r11

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)	# tp[num-1]
	movq	%rdx,(%rsp,%r9,8)	# overall carry -> tp[num]

	leaq	1(%r14),%r14
	jmp	.Louter
.align 16
.Louter:
	# ---- i > 0 iterations: tp[] = (tp[] + ap[]*bp[i] + m*np[])/2^64 ----
	movq	(%r12,%r14,8),%rbx	# bp[i]
	xorq	%r15,%r15
	movq	%r8,%rbp
	movq	(%rsp),%r10		# tp[0]
	mulq	%rbx			# ap[0]*bp[i]
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp		# m = tp[0]*n0
	movq	%rdx,%r11

	mulq	%rbp			# np[0]*m
	addq	%rax,%r10		# annihilates tp[0]
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10		# tp[1]
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.Linner_enter

.align 16
.Linner:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13		# + tp[j]
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)	# tp[j-1]
	movq	%rdx,%r13

.Linner_enter:
	mulq	%rbx			# ap[j]*bp[i]
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r10		# ap[j]*bp[i] + tp[j]
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15

	mulq	%rbp			# np[j]*m
	cmpq	%r9,%r15
	jne	.Linner

	addq	%rax,%r13
	movq	(%rsi),%rax		# ap[0] for next round
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)	# tp[j-1]
	movq	%rdx,%r13

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13		# pull upper carry from tp[num]
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)	# tp[num-1]
	movq	%rdx,(%rsp,%r9,8)	# new top carry

	leaq	1(%r14),%r14
	cmpq	%r9,%r14
	jb	.Louter

	# Final reduction: rp[] = tp[] - np[] with borrow tracking ...
	xorq	%r14,%r14		# clears CF for the first sbbq
	movq	(%rsp),%rax		# tp[0]
	leaq	(%rsp),%rsi		# rsi = tp
	movq	%r9,%r15		# counter
	jmp	.Lsub
.align 16
.Lsub:	sbbq	(%rcx,%r14,8),%rax
	movq	%rax,(%rdi,%r14,8)	# rp[i] = tp[i] - np[i] (with borrow)
	movq	8(%rsi,%r14,8),%rax
	leaq	1(%r14),%r14
	decq	%r15
	jnz	.Lsub

	# ... then select tp or rp branch-free via an all-ones/all-zeros
	# borrow mask, so the choice does not leak through branches.
	sbbq	$0,%rax			# %rax = 0 or -1 from final borrow
	xorq	%r14,%r14
	andq	%rax,%rsi		# keep tp if subtraction borrowed
	notq	%rax
	movq	%rdi,%rcx
	andq	%rax,%rcx		# keep rp otherwise
	movq	%r9,%r15
	orq	%rcx,%rsi		# %rsi = chosen source
.align 16
.Lcopy:
	movq	(%rsi,%r14,8),%rax
	movq	%r14,(%rsp,%r14,8)	# zap temporary vector as we go
	movq	%rax,(%rdi,%r14,8)	# rp[i] = selected[i]
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	.Lcopy

	movq	8(%rsp,%r9,8),%rsi	# restore saved %rsp
	movq	$1,%rax			# return 1
	movq	-48(%rsi),%r15		# restore callee-saved registers
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lmul_epilogue:
	.byte	0xf3,0xc3		# repz ret (AMD branch-predictor idiom)
.size	bn_mul_mont,.-bn_mul_mont
#
# bn_mul4x_mont: same contract as bn_mul_mont, inner loops unrolled 4x.
# Entered from the dispatcher at .Lmul4x_enter with %r11d holding the
# OPENSSL_ia32cap_P+8 capability word; if BMI2+ADX ($0x80100) are both
# present it tail-jumps to the mulx implementation instead.
# NOTE: %rdi is reused as a scratch accumulator inside the loops; the
# real rp is parked at 16(%rsp,num,8).
.type	bn_mul4x_mont,@function
.align	16
bn_mul4x_mont:
	movl	%r9d,%r9d		# zero-extend num
	movq	%rsp,%rax
.Lmul4x_enter:
	andl	$0x80100,%r11d		# BMI2|ADX bits
	cmpl	$0x80100,%r11d
	je	.Lmulx4x_enter
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	# Scratch: num+4 words, 1024-aligned; same page-walk as above.
	negq	%r9
	movq	%rsp,%r11
	leaq	-32(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10

	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11		# probe
	cmpq	%r10,%rsp
	ja	.Lmul4x_page_walk
	jmp	.Lmul4x_page_walk_done

.Lmul4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11		# touch every page
	cmpq	%r10,%rsp
	ja	.Lmul4x_page_walk
.Lmul4x_page_walk_done:

	movq	%rax,8(%rsp,%r9,8)	# save original %rsp
.Lmul4x_body:
	movq	%rdi,16(%rsp,%r9,8)	# park rp; %rdi becomes scratch below
	movq	%rdx,%r12		# %r12 = bp
	movq	(%r8),%r8		# n0 value
	movq	(%r12),%rbx		# bp[0]
	movq	(%rsi),%rax		# ap[0]

	xorq	%r14,%r14		# i
	xorq	%r15,%r15		# j

	# ---- first outer iteration (bp[0]) ----
	movq	%r8,%rbp
	mulq	%rbx			# ap[0]*bp[0]
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp		# m = tp[0]*n0
	movq	%rdx,%r11

	mulq	%rbp			# np[0]*m
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx			# ap[1]*bp[0]
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp			# np[1]*m
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdi,(%rsp)		# tp[1]
	movq	%rdx,%r13
	jmp	.L1st4x
.align 16
.L1st4x:
	# Four interleaved ap[j]*bp[0] / np[j]*m product pairs per pass.
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)	# tp[j-1]
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)	# tp[j]
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)	# tp[j+1]
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	4(%r15),%r15		# j += 4
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)	# tp[j+2]
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	.L1st4x

	# Epilogue of the first pass (last two product pairs + carry out).
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)	# tp[j-1]
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax		# ap[0] for next outer round
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)	# tp[j]
	movq	%rdx,%r13

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)	# tp[num-1]
	movq	%rdi,(%rsp,%r15,8)	# carry -> tp[num]

	leaq	1(%r14),%r14
.align 4
.Louter4x:
	# ---- outer iterations i = 1..num-1 ----
	movq	(%r12,%r14,8),%rbx	# bp[i]
	xorq	%r15,%r15
	movq	(%rsp),%r10		# tp[0]
	movq	%r8,%rbp
	mulq	%rbx			# ap[0]*bp[i]
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp		# m = tp[0]*n0
	movq	%rdx,%r11

	mulq	%rbp			# np[0]*m
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx			# ap[1]*bp[i]
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%rsp),%r11		# + tp[1]
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp			# np[1]*m
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdi,(%rsp)		# tp[0] of shifted result
	movq	%rdx,%r13
	jmp	.Linner4x
.align 16
.Linner4x:
	# Same 4x pattern as .L1st4x but accumulating the previous tp[].
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10	# + tp[j]
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)	# tp[j-1]
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)	# tp[j]
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	(%rsp,%r15,8),%r10	# + tp[j+2]
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)	# tp[j+1]
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	4(%r15),%r15		# j += 4
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)	# tp[j+2]
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	.Linner4x

	# Outer-loop epilogue: last two pairs, fold top carry word.
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10	# + tp[j]
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)	# tp[j-1]
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	1(%r14),%r14		# i++
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax		# ap[0] for next round
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)	# tp[j]
	movq	%rdx,%r13

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%rsp,%r9,8),%r13	# pull upper carry from tp[num]
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)	# tp[num-1]
	movq	%rdi,(%rsp,%r15,8)	# new top carry

	cmpq	%r9,%r14
	jb	.Louter4x
	# ---- final subtraction rp[] = tp[] - np[], 4 words per pass ----
	movq	16(%rsp,%r9,8),%rdi	# recover rp
	movq	0(%rsp),%rax		# tp[0]
	pxor	%xmm0,%xmm0		# zero for zapping tp[]
	movq	8(%rsp),%rdx		# tp[1]
	shrq	$2,%r9			# num/4
	leaq	(%rsp),%rsi		# rsi = tp
	xorq	%r14,%r14		# i = 0 and clear CF

	subq	0(%rcx),%rax
	movq	16(%rsi),%rbx
	movq	24(%rsi),%rbp
	sbbq	8(%rcx),%rdx
	leaq	-1(%r9),%r15
	jmp	.Lsub4x
.align 16
.Lsub4x:
	movq	%rax,0(%rdi,%r14,8)	# rp[i] = tp[i] - np[i]
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	16(%rcx,%r14,8),%rbx
	movq	32(%rsi,%r14,8),%rax
	movq	40(%rsi,%r14,8),%rdx
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)
	movq	%rbp,24(%rdi,%r14,8)
	sbbq	32(%rcx,%r14,8),%rax
	movq	48(%rsi,%r14,8),%rbx
	movq	56(%rsi,%r14,8),%rbp
	sbbq	40(%rcx,%r14,8),%rdx
	leaq	4(%r14),%r14		# i += 4
	decq	%r15
	jnz	.Lsub4x

	movq	%rax,0(%rdi,%r14,8)	# rp[num-4..num-1]
	movq	32(%rsi,%r14,8),%rax	# load overflow bit
	sbbq	16(%rcx,%r14,8),%rbx
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)

	# Branch-free select between tp and rp via the borrow mask,
	# then copy with SSE2 while zapping tp[] behind us.
	sbbq	$0,%rax			# mask: 0 or -1
	movq	%rbp,24(%rdi,%r14,8)
	xorq	%r14,%r14
	andq	%rax,%rsi
	notq	%rax
	movq	%rdi,%rcx
	andq	%rax,%rcx
	leaq	-1(%r9),%r15
	orq	%rcx,%rsi		# selected source pointer

	movdqu	(%rsi),%xmm1
	movdqa	%xmm0,(%rsp)		# zap tp[0..1]
	movdqu	%xmm1,(%rdi)
	jmp	.Lcopy4x
.align 16
.Lcopy4x:
	movdqu	16(%rsi,%r14,1),%xmm2
	movdqu	32(%rsi,%r14,1),%xmm1
	movdqa	%xmm0,16(%rsp,%r14,1)	# zap tp as we copy
	movdqu	%xmm2,16(%rdi,%r14,1)
	movdqa	%xmm0,32(%rsp,%r14,1)
	movdqu	%xmm1,32(%rdi,%r14,1)
	leaq	32(%r14),%r14
	decq	%r15
	jnz	.Lcopy4x

	shlq	$2,%r9			# restore num
	movdqu	16(%rsi,%r14,1),%xmm2	# final 16 bytes
	movdqa	%xmm0,16(%rsp,%r14,1)
	movdqu	%xmm2,16(%rdi,%r14,1)
	movq	8(%rsp,%r9,8),%rsi	# restore saved %rsp
	movq	$1,%rax			# return 1
	movq	-48(%rsi),%r15		# restore callee-saved registers
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lmul4x_epilogue:
	.byte	0xf3,0xc3		# repz ret
.size	bn_mul4x_mont,.-bn_mul4x_mont



#
# bn_sqr8x_mont: squaring path (ap == bp, num%8 == 0).  Sets up the
# frame, calls bn_sqr8x_internal or (with BMI2+ADX) bn_sqrx8x_internal
# -- both defined elsewhere -- then does the final conditional
# subtraction with a branch-free SSE2 masked copy.
.type	bn_sqr8x_mont,@function
.align	32
bn_sqr8x_mont:
	movq	%rsp,%rax		# remember caller %rsp
.Lsqr8x_enter:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
.Lsqr8x_prologue:

	movl	%r9d,%r10d
	shll	$3,%r9d			# num in bytes
	shlq	$3+2,%r10		# 4*num in bytes
	negq	%r9




	# Choose a 64-byte-aligned frame of 2*num words, adjusting so the
	# frame's distance from ap modulo 4096 avoids cache-bank aliasing
	# with the input (the %r11/%r10 comparison below).
	leaq	-64(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	movq	(%r8),%r8		# n0 value
	subq	%rsi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lsqr8x_sp_alt
	subq	%r11,%rbp
	leaq	-64(%rbp,%r9,2),%rbp
	jmp	.Lsqr8x_sp_done

.align 32
.Lsqr8x_sp_alt:
	leaq	4096-64(,%r9,2),%r10
	leaq	-64(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lsqr8x_sp_done:
	andq	$-64,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10		# probe
	cmpq	%rbp,%rsp
	ja	.Lsqr8x_page_walk
	jmp	.Lsqr8x_page_walk_done

.align 16
.Lsqr8x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10		# touch every page on the way down
	cmpq	%rbp,%rsp
	ja	.Lsqr8x_page_walk
.Lsqr8x_page_walk_done:

	movq	%r9,%r10
	negq	%r9

	movq	%r8,32(%rsp)		# save n0
	movq	%rax,40(%rsp)		# save original %rsp
.Lsqr8x_body:

.byte	102,72,15,110,209		# movq %rcx,%xmm2 (perlasm raw bytes)
	pxor	%xmm0,%xmm0
.byte	102,72,15,110,207		# movq %rdi,%xmm1
.byte	102,73,15,110,218		# movq %r10,%xmm3
	movl	OPENSSL_ia32cap_P+8(%rip),%eax
	andl	$0x80100,%eax		# BMI2|ADX?
	cmpl	$0x80100,%eax
	jne	.Lsqr8x_nox

	call	bn_sqrx8x_internal	# mulx/adcx/adox implementation




	leaq	(%r8,%rcx,1),%rbx
	movq	%rcx,%r9
	movq	%rcx,%rdx
.byte	102,72,15,126,207		# movq %xmm1,%rdi
	sarq	$3+2,%rcx		# -num/4, loop trip count below
	jmp	.Lsqr8x_sub

.align 32
.Lsqr8x_nox:
	call	bn_sqr8x_internal	# classic mul/adc implementation




	leaq	(%rdi,%r9,1),%rbx
	movq	%r9,%rcx
	movq	%r9,%rdx
.byte	102,72,15,126,207		# movq %xmm1,%rdi
	sarq	$3+2,%rcx
	jmp	.Lsqr8x_sub

.align 32
.Lsqr8x_sub:
	# rp[] = tp[] - np[], 4 words per pass, borrow carried in CF.
	movq	0(%rbx),%r12
	movq	8(%rbx),%r13
	movq	16(%rbx),%r14
	movq	24(%rbx),%r15
	leaq	32(%rbx),%rbx
	sbbq	0(%rbp),%r12
	sbbq	8(%rbp),%r13
	sbbq	16(%rbp),%r14
	sbbq	24(%rbp),%r15
	leaq	32(%rbp),%rbp
	movq	%r12,0(%rdi)
	movq	%r13,8(%rdi)
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi
	incq	%rcx
	jnz	.Lsqr8x_sub

	sbbq	$0,%rax			# borrow -> all-ones/zero mask seed
	leaq	(%rbx,%r9,1),%rbx	# rewind pointers
	leaq	(%rdi,%r9,1),%rdi

.byte	102,72,15,110,200		# movq %rax,%xmm1 (select mask)
	pxor	%xmm0,%xmm0
	pshufd	$0,%xmm1,%xmm1		# broadcast mask to all lanes
	movq	40(%rsp),%rsi		# saved %rsp
	jmp	.Lsqr8x_cond_copy

.align 32
.Lsqr8x_cond_copy:
	# Branch-free merge: rp[i] = mask ? tp[i] : rp[i]; zap tp[].
	movdqa	0(%rbx),%xmm2
	movdqa	16(%rbx),%xmm3
	leaq	32(%rbx),%rbx
	movdqu	0(%rdi),%xmm4
	movdqu	16(%rdi),%xmm5
	leaq	32(%rdi),%rdi
	movdqa	%xmm0,-32(%rbx)		# zap tp
	movdqa	%xmm0,-16(%rbx)
	movdqa	%xmm0,-32(%rbx,%rdx,1)
	movdqa	%xmm0,-16(%rbx,%rdx,1)
	pcmpeqd	%xmm1,%xmm0		# inverse mask
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-32(%rdi)
	movdqu	%xmm5,-16(%rdi)
	addq	$32,%r9
	jnz	.Lsqr8x_cond_copy

	movq	$1,%rax			# return 1
	movq	-48(%rsi),%r15		# restore callee-saved registers
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lsqr8x_epilogue:
	.byte	0xf3,0xc3		# repz ret
.size	bn_sqr8x_mont,.-bn_sqr8x_mont
#
# bn_mulx4x_mont: BMI2/ADX path.  mulxq gives 64x64->128 products without
# touching flags, while adcxq/adoxq run two independent carry chains
# (CF and OF) in parallel.  Same contract as bn_mul_mont; reached only
# when OPENSSL_ia32cap_P advertises both BMI2 and ADX ($0x80100).
# Frame layout at (%rsp): 0:num(bytes) 8:&bp[i] 16:bp_end 24:n0
#                         32:rp 40:saved %rsp 48:inner trip count.
.type	bn_mulx4x_mont,@function
.align	32
bn_mulx4x_mont:
	movq	%rsp,%rax		# remember caller %rsp
.Lmulx4x_enter:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
.Lmulx4x_prologue:

	shll	$3,%r9d			# num in bytes
	xorq	%r10,%r10
	subq	%r9,%r10		# -num
	movq	(%r8),%r8		# n0 value
	leaq	-72(%rsp,%r10,1),%rbp	# frame: num words + 72, 128-aligned
	andq	$-128,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10		# probe
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
	jmp	.Lmulx4x_page_walk_done

.align 16
.Lmulx4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10		# touch every page on the way down
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
.Lmulx4x_page_walk_done:

	leaq	(%rdx,%r9,1),%r10	# &bp[num] (end sentinel)











	movq	%r9,0(%rsp)		# num in bytes
	shrq	$5,%r9
	movq	%r10,16(%rsp)		# bp end
	subq	$1,%r9
	movq	%r8,24(%rsp)		# n0
	movq	%rdi,32(%rsp)		# rp
	movq	%rax,40(%rsp)		# saved %rsp
	movq	%r9,48(%rsp)		# inner-loop trip count num/32-1
	jmp	.Lmulx4x_body

.align 32
.Lmulx4x_body:
	# ---- first outer iteration (bp[0]) ----
	leaq	8(%rdx),%rdi		# &bp[1]
	movq	(%rdx),%rdx		# bp[0] -> implicit mulx operand
	leaq	64+32(%rsp),%rbx	# tp
	movq	%rdx,%r9		# stash bp[0]

	mulxq	0(%rsi),%r8,%rax	# ap[0..3]*bp[0]
	mulxq	8(%rsi),%r11,%r14
	addq	%rax,%r11
	movq	%rdi,8(%rsp)		# save &bp[1]
	mulxq	16(%rsi),%r12,%r13
	adcq	%r14,%r12
	adcq	$0,%r13

	movq	%r8,%rdi		# %rdi reused as accumulator
	imulq	24(%rsp),%r8		# m = tp[0]*n0
	xorq	%rbp,%rbp		# constant zero, clears CF+OF

	mulxq	24(%rsi),%rax,%r14
	movq	%r8,%rdx		# switch implicit operand to m
	leaq	32(%rsi),%rsi
	adcxq	%rax,%r13
	adcxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10	# np[0..3]*m, dual carry chains
	adcxq	%rax,%rdi		# annihilates tp[0]
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00	# hand-encoded mulxq 16(%rcx) (perlasm raw bytes)
	movq	48(%rsp),%rdi		# inner trip count
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx		# back to bp[0]
	movq	%r11,-24(%rbx)
	adcxq	%rax,%r12
	adoxq	%rbp,%r15		# flush OF chain
	leaq	32(%rcx),%rcx
	movq	%r12,-16(%rbx)

	jmp	.Lmulx4x_1st

.align 32
.Lmulx4x_1st:
	# 4 words of ap[]*bp[0] (CF chain), then 4 words of np[]*m
	# interleaved over both chains; stores trail by one group.
	adcxq	%rbp,%r15
	mulxq	0(%rsi),%r10,%rax
	adcxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
.byte	0x67,0x67			# two addr-size prefixes (perlasm padding/tuning)
	movq	%r8,%rdx		# m
	adcxq	%rax,%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-32(%rbx)
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx		# bp[0]
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_1st

	movq	0(%rsp),%rax		# num
	movq	8(%rsp),%rdi		# &bp[1]
	adcq	%rbp,%r15
	addq	%r15,%r14
	sbbq	%r15,%r15		# top-most carry as mask
	movq	%r14,-8(%rbx)
	jmp	.Lmulx4x_outer

.align 32
.Lmulx4x_outer:
	# ---- outer iterations i = 1..num-1 ----
	movq	(%rdi),%rdx		# bp[i]
	leaq	8(%rdi),%rdi
	subq	%rax,%rsi		# rewind ap
	movq	%r15,(%rbx)		# store top carry
	leaq	64+32(%rsp),%rbx	# rewind tp
	subq	%rax,%rcx		# rewind np

	mulxq	0(%rsi),%r8,%r11
	xorl	%ebp,%ebp		# constant zero, clears CF+OF
	movq	%rdx,%r9
	mulxq	8(%rsi),%r14,%r12
	adoxq	-32(%rbx),%r8		# + tp[0] on the OF chain
	adcxq	%r14,%r11
	mulxq	16(%rsi),%r15,%r13
	adoxq	-24(%rbx),%r11
	adcxq	%r15,%r12
	adoxq	%rbp,%r12
	adcxq	%rbp,%r13

	movq	%rdi,8(%rsp)		# save &bp[i+1]
.byte	0x67				# addr-size prefix (perlasm padding/tuning)
	movq	%r8,%r15
	imulq	24(%rsp),%r8		# m = tp[0]*n0
	xorl	%ebp,%ebp		# reset both carry chains

	mulxq	24(%rsi),%rax,%r14
	movq	%r8,%rdx		# m
	adoxq	-16(%rbx),%r12
	adcxq	%rax,%r13
	adoxq	-8(%rbx),%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	adoxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15		# annihilates tp[0]
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx		# bp[i]
	movq	%r11,-24(%rbx)
	leaq	32(%rcx),%rcx
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	movq	48(%rsp),%rdi		# inner trip count
	movq	%r12,-16(%rbx)

	jmp	.Lmulx4x_inner

.align 32
.Lmulx4x_inner:
	# Like .Lmulx4x_1st but also accumulating the previous tp[].
	mulxq	0(%rsi),%r10,%rax
	adcxq	%rbp,%r15
	adoxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	0(%rbx),%r10
	adoxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	8(%rbx),%r11
	adoxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
	movq	%r8,%rdx		# m
	adcxq	16(%rbx),%r12
	adoxq	%rax,%r13
	adcxq	24(%rbx),%r13
	adoxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx
	adcxq	%rbp,%r14

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx		# bp[i]
	movq	%r11,-32(%rbx)
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_inner

	movq	0(%rsp),%rax		# num
	movq	8(%rsp),%rdi		# &bp[i+1]
	adcq	%rbp,%r15
	subq	0(%rbx),%rbp		# pull top carry, CF = borrow
	adcq	%r15,%r14
	sbbq	%r15,%r15		# top-most carry as mask
	movq	%r14,-8(%rbx)

	cmpq	16(%rsp),%rdi		# reached bp end sentinel?
	jne	.Lmulx4x_outer

	# ---- final subtraction rp[] = tp[] - np[] ----
	leaq	64(%rsp),%rbx		# tp
	subq	%rax,%rcx		# rewind np
	negq	%r15			# top carry into CF seed
	movq	%rax,%rdx		# num in bytes
	shrq	$3+2,%rax		# num/4 loop count
	movq	32(%rsp),%rdi		# rp
	jmp	.Lmulx4x_sub

.align 32
.Lmulx4x_sub:
	movq	0(%rbx),%r11
	movq	8(%rbx),%r12
	movq	16(%rbx),%r13
	movq	24(%rbx),%r14
	leaq	32(%rbx),%rbx
	sbbq	0(%rcx),%r11		# borrow threads through the loop
	sbbq	8(%rcx),%r12
	sbbq	16(%rcx),%r13
	sbbq	24(%rcx),%r14
	leaq	32(%rcx),%rcx
	movq	%r11,0(%rdi)
	movq	%r12,8(%rdi)
	movq	%r13,16(%rdi)
	movq	%r14,24(%rdi)
	leaq	32(%rdi),%rdi
	decq	%rax
	jnz	.Lmulx4x_sub

	sbbq	$0,%r15			# fold final borrow into mask
	leaq	64(%rsp),%rbx
	subq	%rdx,%rdi		# rewind rp

.byte	102,73,15,110,207		# movq %r15,%xmm1 (select mask)
	pxor	%xmm0,%xmm0
	pshufd	$0,%xmm1,%xmm1		# broadcast mask
	movq	40(%rsp),%rsi		# saved %rsp
	jmp	.Lmulx4x_cond_copy

.align 32
.Lmulx4x_cond_copy:
	# Branch-free merge of tp/rp by mask; zap tp[] behind us.
	movdqa	0(%rbx),%xmm2
	movdqa	16(%rbx),%xmm3
	leaq	32(%rbx),%rbx
	movdqu	0(%rdi),%xmm4
	movdqu	16(%rdi),%xmm5
	leaq	32(%rdi),%rdi
	movdqa	%xmm0,-32(%rbx)		# zap tp
	movdqa	%xmm0,-16(%rbx)
	pcmpeqd	%xmm1,%xmm0		# inverse mask
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-32(%rdi)
	movdqu	%xmm5,-16(%rdi)
	subq	$32,%rdx
	jnz	.Lmulx4x_cond_copy

	movq	%rdx,(%rbx)		# %rdx is 0 here: zap last tp word

	movq	$1,%rax			# return 1
	movq	-48(%rsi),%r15		# restore callee-saved registers
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lmulx4x_epilogue:
	.byte	0xf3,0xc3		# repz ret
.size	bn_mulx4x_mont,.-bn_mulx4x_mont
# "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro@openssl.org>"
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	16