/* x86_64-mont5.S revision 1.8 */
1#include <machine/asm.h> 2.text 3 4 5 6.globl bn_mul_mont_gather5 7.type bn_mul_mont_gather5,@function 8.align 64 9bn_mul_mont_gather5: 10.cfi_startproc 11 movl %r9d,%r9d 12 movq %rsp,%rax 13.cfi_def_cfa_register %rax 14 testl $7,%r9d 15 jnz .Lmul_enter 16 movl OPENSSL_ia32cap_P+8(%rip),%r11d 17 jmp .Lmul4x_enter 18 19.align 16 20.Lmul_enter: 21 movd 8(%rsp),%xmm5 22 pushq %rbx 23.cfi_offset %rbx,-16 24 pushq %rbp 25.cfi_offset %rbp,-24 26 pushq %r12 27.cfi_offset %r12,-32 28 pushq %r13 29.cfi_offset %r13,-40 30 pushq %r14 31.cfi_offset %r14,-48 32 pushq %r15 33.cfi_offset %r15,-56 34 35 negq %r9 36 movq %rsp,%r11 37 leaq -280(%rsp,%r9,8),%r10 38 negq %r9 39 andq $-1024,%r10 40 41 42 43 44 45 46 47 48 49 subq %r10,%r11 50 andq $-4096,%r11 51 leaq (%r10,%r11,1),%rsp 52 movq (%rsp),%r11 53 cmpq %r10,%rsp 54 ja .Lmul_page_walk 55 jmp .Lmul_page_walk_done 56 57.Lmul_page_walk: 58 leaq -4096(%rsp),%rsp 59 movq (%rsp),%r11 60 cmpq %r10,%rsp 61 ja .Lmul_page_walk 62.Lmul_page_walk_done: 63 64 leaq .Linc(%rip),%r10 65 movq %rax,8(%rsp,%r9,8) 66.cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08 67.Lmul_body: 68 69 leaq 128(%rdx),%r12 70 movdqa 0(%r10),%xmm0 71 movdqa 16(%r10),%xmm1 72 leaq 24-112(%rsp,%r9,8),%r10 73 andq $-16,%r10 74 75 pshufd $0,%xmm5,%xmm5 76 movdqa %xmm1,%xmm4 77 movdqa %xmm1,%xmm2 78 paddd %xmm0,%xmm1 79 pcmpeqd %xmm5,%xmm0 80.byte 0x67 81 movdqa %xmm4,%xmm3 82 paddd %xmm1,%xmm2 83 pcmpeqd %xmm5,%xmm1 84 movdqa %xmm0,112(%r10) 85 movdqa %xmm4,%xmm0 86 87 paddd %xmm2,%xmm3 88 pcmpeqd %xmm5,%xmm2 89 movdqa %xmm1,128(%r10) 90 movdqa %xmm4,%xmm1 91 92 paddd %xmm3,%xmm0 93 pcmpeqd %xmm5,%xmm3 94 movdqa %xmm2,144(%r10) 95 movdqa %xmm4,%xmm2 96 97 paddd %xmm0,%xmm1 98 pcmpeqd %xmm5,%xmm0 99 movdqa %xmm3,160(%r10) 100 movdqa %xmm4,%xmm3 101 paddd %xmm1,%xmm2 102 pcmpeqd %xmm5,%xmm1 103 movdqa %xmm0,176(%r10) 104 movdqa %xmm4,%xmm0 105 106 paddd %xmm2,%xmm3 107 pcmpeqd %xmm5,%xmm2 108 movdqa %xmm1,192(%r10) 109 movdqa %xmm4,%xmm1 110 111 
paddd %xmm3,%xmm0 112 pcmpeqd %xmm5,%xmm3 113 movdqa %xmm2,208(%r10) 114 movdqa %xmm4,%xmm2 115 116 paddd %xmm0,%xmm1 117 pcmpeqd %xmm5,%xmm0 118 movdqa %xmm3,224(%r10) 119 movdqa %xmm4,%xmm3 120 paddd %xmm1,%xmm2 121 pcmpeqd %xmm5,%xmm1 122 movdqa %xmm0,240(%r10) 123 movdqa %xmm4,%xmm0 124 125 paddd %xmm2,%xmm3 126 pcmpeqd %xmm5,%xmm2 127 movdqa %xmm1,256(%r10) 128 movdqa %xmm4,%xmm1 129 130 paddd %xmm3,%xmm0 131 pcmpeqd %xmm5,%xmm3 132 movdqa %xmm2,272(%r10) 133 movdqa %xmm4,%xmm2 134 135 paddd %xmm0,%xmm1 136 pcmpeqd %xmm5,%xmm0 137 movdqa %xmm3,288(%r10) 138 movdqa %xmm4,%xmm3 139 paddd %xmm1,%xmm2 140 pcmpeqd %xmm5,%xmm1 141 movdqa %xmm0,304(%r10) 142 143 paddd %xmm2,%xmm3 144.byte 0x67 145 pcmpeqd %xmm5,%xmm2 146 movdqa %xmm1,320(%r10) 147 148 pcmpeqd %xmm5,%xmm3 149 movdqa %xmm2,336(%r10) 150 pand 64(%r12),%xmm0 151 152 pand 80(%r12),%xmm1 153 pand 96(%r12),%xmm2 154 movdqa %xmm3,352(%r10) 155 pand 112(%r12),%xmm3 156 por %xmm2,%xmm0 157 por %xmm3,%xmm1 158 movdqa -128(%r12),%xmm4 159 movdqa -112(%r12),%xmm5 160 movdqa -96(%r12),%xmm2 161 pand 112(%r10),%xmm4 162 movdqa -80(%r12),%xmm3 163 pand 128(%r10),%xmm5 164 por %xmm4,%xmm0 165 pand 144(%r10),%xmm2 166 por %xmm5,%xmm1 167 pand 160(%r10),%xmm3 168 por %xmm2,%xmm0 169 por %xmm3,%xmm1 170 movdqa -64(%r12),%xmm4 171 movdqa -48(%r12),%xmm5 172 movdqa -32(%r12),%xmm2 173 pand 176(%r10),%xmm4 174 movdqa -16(%r12),%xmm3 175 pand 192(%r10),%xmm5 176 por %xmm4,%xmm0 177 pand 208(%r10),%xmm2 178 por %xmm5,%xmm1 179 pand 224(%r10),%xmm3 180 por %xmm2,%xmm0 181 por %xmm3,%xmm1 182 movdqa 0(%r12),%xmm4 183 movdqa 16(%r12),%xmm5 184 movdqa 32(%r12),%xmm2 185 pand 240(%r10),%xmm4 186 movdqa 48(%r12),%xmm3 187 pand 256(%r10),%xmm5 188 por %xmm4,%xmm0 189 pand 272(%r10),%xmm2 190 por %xmm5,%xmm1 191 pand 288(%r10),%xmm3 192 por %xmm2,%xmm0 193 por %xmm3,%xmm1 194 por %xmm1,%xmm0 195 pshufd $0x4e,%xmm0,%xmm1 196 por %xmm1,%xmm0 197 leaq 256(%r12),%r12 198.byte 102,72,15,126,195 199 200 movq (%r8),%r8 201 movq (%rsi),%rax 
202 203 xorq %r14,%r14 204 xorq %r15,%r15 205 206 movq %r8,%rbp 207 mulq %rbx 208 movq %rax,%r10 209 movq (%rcx),%rax 210 211 imulq %r10,%rbp 212 movq %rdx,%r11 213 214 mulq %rbp 215 addq %rax,%r10 216 movq 8(%rsi),%rax 217 adcq $0,%rdx 218 movq %rdx,%r13 219 220 leaq 1(%r15),%r15 221 jmp .L1st_enter 222 223.align 16 224.L1st: 225 addq %rax,%r13 226 movq (%rsi,%r15,8),%rax 227 adcq $0,%rdx 228 addq %r11,%r13 229 movq %r10,%r11 230 adcq $0,%rdx 231 movq %r13,-16(%rsp,%r15,8) 232 movq %rdx,%r13 233 234.L1st_enter: 235 mulq %rbx 236 addq %rax,%r11 237 movq (%rcx,%r15,8),%rax 238 adcq $0,%rdx 239 leaq 1(%r15),%r15 240 movq %rdx,%r10 241 242 mulq %rbp 243 cmpq %r9,%r15 244 jne .L1st 245 246 247 addq %rax,%r13 248 adcq $0,%rdx 249 addq %r11,%r13 250 adcq $0,%rdx 251 movq %r13,-16(%rsp,%r9,8) 252 movq %rdx,%r13 253 movq %r10,%r11 254 255 xorq %rdx,%rdx 256 addq %r11,%r13 257 adcq $0,%rdx 258 movq %r13,-8(%rsp,%r9,8) 259 movq %rdx,(%rsp,%r9,8) 260 261 leaq 1(%r14),%r14 262 jmp .Louter 263.align 16 264.Louter: 265 leaq 24+128(%rsp,%r9,8),%rdx 266 andq $-16,%rdx 267 pxor %xmm4,%xmm4 268 pxor %xmm5,%xmm5 269 movdqa -128(%r12),%xmm0 270 movdqa -112(%r12),%xmm1 271 movdqa -96(%r12),%xmm2 272 movdqa -80(%r12),%xmm3 273 pand -128(%rdx),%xmm0 274 pand -112(%rdx),%xmm1 275 por %xmm0,%xmm4 276 pand -96(%rdx),%xmm2 277 por %xmm1,%xmm5 278 pand -80(%rdx),%xmm3 279 por %xmm2,%xmm4 280 por %xmm3,%xmm5 281 movdqa -64(%r12),%xmm0 282 movdqa -48(%r12),%xmm1 283 movdqa -32(%r12),%xmm2 284 movdqa -16(%r12),%xmm3 285 pand -64(%rdx),%xmm0 286 pand -48(%rdx),%xmm1 287 por %xmm0,%xmm4 288 pand -32(%rdx),%xmm2 289 por %xmm1,%xmm5 290 pand -16(%rdx),%xmm3 291 por %xmm2,%xmm4 292 por %xmm3,%xmm5 293 movdqa 0(%r12),%xmm0 294 movdqa 16(%r12),%xmm1 295 movdqa 32(%r12),%xmm2 296 movdqa 48(%r12),%xmm3 297 pand 0(%rdx),%xmm0 298 pand 16(%rdx),%xmm1 299 por %xmm0,%xmm4 300 pand 32(%rdx),%xmm2 301 por %xmm1,%xmm5 302 pand 48(%rdx),%xmm3 303 por %xmm2,%xmm4 304 por %xmm3,%xmm5 305 movdqa 64(%r12),%xmm0 306 
movdqa 80(%r12),%xmm1 307 movdqa 96(%r12),%xmm2 308 movdqa 112(%r12),%xmm3 309 pand 64(%rdx),%xmm0 310 pand 80(%rdx),%xmm1 311 por %xmm0,%xmm4 312 pand 96(%rdx),%xmm2 313 por %xmm1,%xmm5 314 pand 112(%rdx),%xmm3 315 por %xmm2,%xmm4 316 por %xmm3,%xmm5 317 por %xmm5,%xmm4 318 pshufd $0x4e,%xmm4,%xmm0 319 por %xmm4,%xmm0 320 leaq 256(%r12),%r12 321 322 movq (%rsi),%rax 323.byte 102,72,15,126,195 324 325 xorq %r15,%r15 326 movq %r8,%rbp 327 movq (%rsp),%r10 328 329 mulq %rbx 330 addq %rax,%r10 331 movq (%rcx),%rax 332 adcq $0,%rdx 333 334 imulq %r10,%rbp 335 movq %rdx,%r11 336 337 mulq %rbp 338 addq %rax,%r10 339 movq 8(%rsi),%rax 340 adcq $0,%rdx 341 movq 8(%rsp),%r10 342 movq %rdx,%r13 343 344 leaq 1(%r15),%r15 345 jmp .Linner_enter 346 347.align 16 348.Linner: 349 addq %rax,%r13 350 movq (%rsi,%r15,8),%rax 351 adcq $0,%rdx 352 addq %r10,%r13 353 movq (%rsp,%r15,8),%r10 354 adcq $0,%rdx 355 movq %r13,-16(%rsp,%r15,8) 356 movq %rdx,%r13 357 358.Linner_enter: 359 mulq %rbx 360 addq %rax,%r11 361 movq (%rcx,%r15,8),%rax 362 adcq $0,%rdx 363 addq %r11,%r10 364 movq %rdx,%r11 365 adcq $0,%r11 366 leaq 1(%r15),%r15 367 368 mulq %rbp 369 cmpq %r9,%r15 370 jne .Linner 371 372 addq %rax,%r13 373 adcq $0,%rdx 374 addq %r10,%r13 375 movq (%rsp,%r9,8),%r10 376 adcq $0,%rdx 377 movq %r13,-16(%rsp,%r9,8) 378 movq %rdx,%r13 379 380 xorq %rdx,%rdx 381 addq %r11,%r13 382 adcq $0,%rdx 383 addq %r10,%r13 384 adcq $0,%rdx 385 movq %r13,-8(%rsp,%r9,8) 386 movq %rdx,(%rsp,%r9,8) 387 388 leaq 1(%r14),%r14 389 cmpq %r9,%r14 390 jb .Louter 391 392 xorq %r14,%r14 393 movq (%rsp),%rax 394 leaq (%rsp),%rsi 395 movq %r9,%r15 396 jmp .Lsub 397.align 16 398.Lsub: sbbq (%rcx,%r14,8),%rax 399 movq %rax,(%rdi,%r14,8) 400 movq 8(%rsi,%r14,8),%rax 401 leaq 1(%r14),%r14 402 decq %r15 403 jnz .Lsub 404 405 sbbq $0,%rax 406 movq $-1,%rbx 407 xorq %rax,%rbx 408 xorq %r14,%r14 409 movq %r9,%r15 410 411.Lcopy: 412 movq (%rdi,%r14,8),%rcx 413 movq (%rsp,%r14,8),%rdx 414 andq %rbx,%rcx 415 andq %rax,%rdx 416 
movq %r14,(%rsp,%r14,8) 417 orq %rcx,%rdx 418 movq %rdx,(%rdi,%r14,8) 419 leaq 1(%r14),%r14 420 subq $1,%r15 421 jnz .Lcopy 422 423 movq 8(%rsp,%r9,8),%rsi 424.cfi_def_cfa %rsi,8 425 movq $1,%rax 426 427 movq -48(%rsi),%r15 428.cfi_restore %r15 429 movq -40(%rsi),%r14 430.cfi_restore %r14 431 movq -32(%rsi),%r13 432.cfi_restore %r13 433 movq -24(%rsi),%r12 434.cfi_restore %r12 435 movq -16(%rsi),%rbp 436.cfi_restore %rbp 437 movq -8(%rsi),%rbx 438.cfi_restore %rbx 439 leaq (%rsi),%rsp 440.cfi_def_cfa_register %rsp 441.Lmul_epilogue: 442 .byte 0xf3,0xc3 443.cfi_endproc 444.size bn_mul_mont_gather5,.-bn_mul_mont_gather5 445.type bn_mul4x_mont_gather5,@function 446.align 32 447bn_mul4x_mont_gather5: 448.cfi_startproc 449.byte 0x67 450 movq %rsp,%rax 451.cfi_def_cfa_register %rax 452.Lmul4x_enter: 453 andl $0x80108,%r11d 454 cmpl $0x80108,%r11d 455 je .Lmulx4x_enter 456 pushq %rbx 457.cfi_offset %rbx,-16 458 pushq %rbp 459.cfi_offset %rbp,-24 460 pushq %r12 461.cfi_offset %r12,-32 462 pushq %r13 463.cfi_offset %r13,-40 464 pushq %r14 465.cfi_offset %r14,-48 466 pushq %r15 467.cfi_offset %r15,-56 468.Lmul4x_prologue: 469 470.byte 0x67 471 shll $3,%r9d 472 leaq (%r9,%r9,2),%r10 473 negq %r9 474 475 476 477 478 479 480 481 482 483 484 leaq -320(%rsp,%r9,2),%r11 485 movq %rsp,%rbp 486 subq %rdi,%r11 487 andq $4095,%r11 488 cmpq %r11,%r10 489 jb .Lmul4xsp_alt 490 subq %r11,%rbp 491 leaq -320(%rbp,%r9,2),%rbp 492 jmp .Lmul4xsp_done 493 494.align 32 495.Lmul4xsp_alt: 496 leaq 4096-320(,%r9,2),%r10 497 leaq -320(%rbp,%r9,2),%rbp 498 subq %r10,%r11 499 movq $0,%r10 500 cmovcq %r10,%r11 501 subq %r11,%rbp 502.Lmul4xsp_done: 503 andq $-64,%rbp 504 movq %rsp,%r11 505 subq %rbp,%r11 506 andq $-4096,%r11 507 leaq (%r11,%rbp,1),%rsp 508 movq (%rsp),%r10 509 cmpq %rbp,%rsp 510 ja .Lmul4x_page_walk 511 jmp .Lmul4x_page_walk_done 512 513.Lmul4x_page_walk: 514 leaq -4096(%rsp),%rsp 515 movq (%rsp),%r10 516 cmpq %rbp,%rsp 517 ja .Lmul4x_page_walk 518.Lmul4x_page_walk_done: 519 520 negq 
/*
 * (continuation -- "%r9" below is the operand of the "negq" split at the
 *  end of the previous physical line; this capture's line-fusion breaks
 *  the instruction across lines.)
 *
 * bn_mul4x_mont_gather5 tail: saves the entry rsp at 40(%rsp), calls
 * mul4x_internal, restores callee-saved regs, returns rax=1.
 *
 * mul4x_internal -- 4-way unrolled Montgomery multiplication core, shared
 * with bn_power5.  It rebuilds the same pcmpeqd mask table at
 * 112..352(%r10) and performs the constant-time masked gather from
 * %r12 = bp+128 (".byte 102,72,15,126,195" = movq %xmm0,%rbx).
 */
%r9 521 522 movq %rax,40(%rsp) 523.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 524.Lmul4x_body: 525 526 call mul4x_internal 527 528 movq 40(%rsp),%rsi 529.cfi_def_cfa %rsi,8 530 movq $1,%rax 531 532 movq -48(%rsi),%r15 533.cfi_restore %r15 534 movq -40(%rsi),%r14 535.cfi_restore %r14 536 movq -32(%rsi),%r13 537.cfi_restore %r13 538 movq -24(%rsi),%r12 539.cfi_restore %r12 540 movq -16(%rsi),%rbp 541.cfi_restore %rbp 542 movq -8(%rsi),%rbx 543.cfi_restore %rbx 544 leaq (%rsi),%rsp 545.cfi_def_cfa_register %rsp 546.Lmul4x_epilogue: 547 .byte 0xf3,0xc3 548.cfi_endproc 549.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 550 551.type mul4x_internal,@function 552.align 32 553mul4x_internal: 554 shlq $5,%r9 555 movd 8(%rax),%xmm5 556 leaq .Linc(%rip),%rax 557 leaq 128(%rdx,%r9,1),%r13 558 shrq $5,%r9 559 movdqa 0(%rax),%xmm0 560 movdqa 16(%rax),%xmm1 561 leaq 88-112(%rsp,%r9,1),%r10 562 leaq 128(%rdx),%r12 563 564 pshufd $0,%xmm5,%xmm5 565 movdqa %xmm1,%xmm4 566.byte 0x67,0x67 567 movdqa %xmm1,%xmm2 568 paddd %xmm0,%xmm1 569 pcmpeqd %xmm5,%xmm0 570.byte 0x67 571 movdqa %xmm4,%xmm3 572 paddd %xmm1,%xmm2 573 pcmpeqd %xmm5,%xmm1 574 movdqa %xmm0,112(%r10) 575 movdqa %xmm4,%xmm0 576 577 paddd %xmm2,%xmm3 578 pcmpeqd %xmm5,%xmm2 579 movdqa %xmm1,128(%r10) 580 movdqa %xmm4,%xmm1 581 582 paddd %xmm3,%xmm0 583 pcmpeqd %xmm5,%xmm3 584 movdqa %xmm2,144(%r10) 585 movdqa %xmm4,%xmm2 586 587 paddd %xmm0,%xmm1 588 pcmpeqd %xmm5,%xmm0 589 movdqa %xmm3,160(%r10) 590 movdqa %xmm4,%xmm3 591 paddd %xmm1,%xmm2 592 pcmpeqd %xmm5,%xmm1 593 movdqa %xmm0,176(%r10) 594 movdqa %xmm4,%xmm0 595 596 paddd %xmm2,%xmm3 597 pcmpeqd %xmm5,%xmm2 598 movdqa %xmm1,192(%r10) 599 movdqa %xmm4,%xmm1 600 601 paddd %xmm3,%xmm0 602 pcmpeqd %xmm5,%xmm3 603 movdqa %xmm2,208(%r10) 604 movdqa %xmm4,%xmm2 605 606 paddd %xmm0,%xmm1 607 pcmpeqd %xmm5,%xmm0 608 movdqa %xmm3,224(%r10) 609 movdqa %xmm4,%xmm3 610 paddd %xmm1,%xmm2 611 pcmpeqd %xmm5,%xmm1 612 movdqa %xmm0,240(%r10) 613 movdqa %xmm4,%xmm0 614 615 paddd 
/*
 * Remaining mask stores and the masked gather of b[0]; then the first
 * multiplication pass begins: rbp = n0 * t0 (imulq %r10,%rbp), rdi is
 * repurposed as a carry limb, and the tp pointer r14 = 64+8(%rsp).
 * r13 (bp end) and the result pointer rdi are parked at 16+8(%rsp) /
 * 56+8(%rsp) for the epilogue.
 */
%xmm2,%xmm3 616 pcmpeqd %xmm5,%xmm2 617 movdqa %xmm1,256(%r10) 618 movdqa %xmm4,%xmm1 619 620 paddd %xmm3,%xmm0 621 pcmpeqd %xmm5,%xmm3 622 movdqa %xmm2,272(%r10) 623 movdqa %xmm4,%xmm2 624 625 paddd %xmm0,%xmm1 626 pcmpeqd %xmm5,%xmm0 627 movdqa %xmm3,288(%r10) 628 movdqa %xmm4,%xmm3 629 paddd %xmm1,%xmm2 630 pcmpeqd %xmm5,%xmm1 631 movdqa %xmm0,304(%r10) 632 633 paddd %xmm2,%xmm3 634.byte 0x67 635 pcmpeqd %xmm5,%xmm2 636 movdqa %xmm1,320(%r10) 637 638 pcmpeqd %xmm5,%xmm3 639 movdqa %xmm2,336(%r10) 640 pand 64(%r12),%xmm0 641 642 pand 80(%r12),%xmm1 643 pand 96(%r12),%xmm2 644 movdqa %xmm3,352(%r10) 645 pand 112(%r12),%xmm3 646 por %xmm2,%xmm0 647 por %xmm3,%xmm1 648 movdqa -128(%r12),%xmm4 649 movdqa -112(%r12),%xmm5 650 movdqa -96(%r12),%xmm2 651 pand 112(%r10),%xmm4 652 movdqa -80(%r12),%xmm3 653 pand 128(%r10),%xmm5 654 por %xmm4,%xmm0 655 pand 144(%r10),%xmm2 656 por %xmm5,%xmm1 657 pand 160(%r10),%xmm3 658 por %xmm2,%xmm0 659 por %xmm3,%xmm1 660 movdqa -64(%r12),%xmm4 661 movdqa -48(%r12),%xmm5 662 movdqa -32(%r12),%xmm2 663 pand 176(%r10),%xmm4 664 movdqa -16(%r12),%xmm3 665 pand 192(%r10),%xmm5 666 por %xmm4,%xmm0 667 pand 208(%r10),%xmm2 668 por %xmm5,%xmm1 669 pand 224(%r10),%xmm3 670 por %xmm2,%xmm0 671 por %xmm3,%xmm1 672 movdqa 0(%r12),%xmm4 673 movdqa 16(%r12),%xmm5 674 movdqa 32(%r12),%xmm2 675 pand 240(%r10),%xmm4 676 movdqa 48(%r12),%xmm3 677 pand 256(%r10),%xmm5 678 por %xmm4,%xmm0 679 pand 272(%r10),%xmm2 680 por %xmm5,%xmm1 681 pand 288(%r10),%xmm3 682 por %xmm2,%xmm0 683 por %xmm3,%xmm1 684 por %xmm1,%xmm0 685 pshufd $0x4e,%xmm0,%xmm1 686 por %xmm1,%xmm0 687 leaq 256(%r12),%r12 688.byte 102,72,15,126,195 689 690 movq %r13,16+8(%rsp) 691 movq %rdi,56+8(%rsp) 692 693 movq (%r8),%r8 694 movq (%rsi),%rax 695 leaq (%rsi,%r9,1),%rsi 696 negq %r9 697 698 movq %r8,%rbp 699 mulq %rbx 700 movq %rax,%r10 701 movq (%rcx),%rax 702 703 imulq %r10,%rbp 704 leaq 64+8(%rsp),%r14 705 movq %rdx,%r11 706 707 mulq %rbp 708 addq %rax,%r10 709 movq 
/*
 * .L1st4x -- first pass, four limbs per iteration: interleaved
 * a[i]*b0 (mulq %rbx) and m*n[i] (mulq %rbp) with carries in
 * r10/r11/r13/rdi, results streamed to (%r14).  r15 counts limbs in
 * bytes (starts at 32+%r9, steps by 32 to zero).
 */
8(%rsi,%r9,1),%rax 710 adcq $0,%rdx 711 movq %rdx,%rdi 712 713 mulq %rbx 714 addq %rax,%r11 715 movq 8(%rcx),%rax 716 adcq $0,%rdx 717 movq %rdx,%r10 718 719 mulq %rbp 720 addq %rax,%rdi 721 movq 16(%rsi,%r9,1),%rax 722 adcq $0,%rdx 723 addq %r11,%rdi 724 leaq 32(%r9),%r15 725 leaq 32(%rcx),%rcx 726 adcq $0,%rdx 727 movq %rdi,(%r14) 728 movq %rdx,%r13 729 jmp .L1st4x 730 731.align 32 732.L1st4x: 733 mulq %rbx 734 addq %rax,%r10 735 movq -16(%rcx),%rax 736 leaq 32(%r14),%r14 737 adcq $0,%rdx 738 movq %rdx,%r11 739 740 mulq %rbp 741 addq %rax,%r13 742 movq -8(%rsi,%r15,1),%rax 743 adcq $0,%rdx 744 addq %r10,%r13 745 adcq $0,%rdx 746 movq %r13,-24(%r14) 747 movq %rdx,%rdi 748 749 mulq %rbx 750 addq %rax,%r11 751 movq -8(%rcx),%rax 752 adcq $0,%rdx 753 movq %rdx,%r10 754 755 mulq %rbp 756 addq %rax,%rdi 757 movq (%rsi,%r15,1),%rax 758 adcq $0,%rdx 759 addq %r11,%rdi 760 adcq $0,%rdx 761 movq %rdi,-16(%r14) 762 movq %rdx,%r13 763 764 mulq %rbx 765 addq %rax,%r10 766 movq 0(%rcx),%rax 767 adcq $0,%rdx 768 movq %rdx,%r11 769 770 mulq %rbp 771 addq %rax,%r13 772 movq 8(%rsi,%r15,1),%rax 773 adcq $0,%rdx 774 addq %r10,%r13 775 adcq $0,%rdx 776 movq %r13,-8(%r14) 777 movq %rdx,%rdi 778 779 mulq %rbx 780 addq %rax,%r11 781 movq 8(%rcx),%rax 782 adcq $0,%rdx 783 movq %rdx,%r10 784 785 mulq %rbp 786 addq %rax,%rdi 787 movq 16(%rsi,%r15,1),%rax 788 adcq $0,%rdx 789 addq %r11,%rdi 790 leaq 32(%rcx),%rcx 791 adcq $0,%rdx 792 movq %rdi,(%r14) 793 movq %rdx,%r13 794 795 addq $32,%r15 796 jnz .L1st4x 797 798 mulq %rbx 799 addq %rax,%r10 800 movq -16(%rcx),%rax 801 leaq 32(%r14),%r14 802 adcq $0,%rdx 803 movq %rdx,%r11 804 805 mulq %rbp 806 addq %rax,%r13 807 movq -8(%rsi),%rax 808 adcq $0,%rdx 809 addq %r10,%r13 810 adcq $0,%rdx 811 movq %r13,-24(%r14) 812 movq %rdx,%rdi 813 814 mulq %rbx 815 addq %rax,%r11 816 movq -8(%rcx),%rax 817 adcq $0,%rdx 818 movq %rdx,%r10 819 820 mulq %rbp 821 addq %rax,%rdi 822 movq (%rsi,%r9,1),%rax 823 adcq $0,%rdx 824 addq %r11,%rdi 825 adcq $0,%rdx 826 
/*
 * .Louter4x -- per-b[j] outer loop: re-gather the next multiplier
 * constant-time (pand/por against 16+128(%r14) masks), then the 4-way
 * inner loop .Linner4x accumulates onto the previous pass stored at
 * (%r14).
 */
movq %rdi,-16(%r14) 827 movq %rdx,%r13 828 829 leaq (%rcx,%r9,1),%rcx 830 831 xorq %rdi,%rdi 832 addq %r10,%r13 833 adcq $0,%rdi 834 movq %r13,-8(%r14) 835 836 jmp .Louter4x 837 838.align 32 839.Louter4x: 840 leaq 16+128(%r14),%rdx 841 pxor %xmm4,%xmm4 842 pxor %xmm5,%xmm5 843 movdqa -128(%r12),%xmm0 844 movdqa -112(%r12),%xmm1 845 movdqa -96(%r12),%xmm2 846 movdqa -80(%r12),%xmm3 847 pand -128(%rdx),%xmm0 848 pand -112(%rdx),%xmm1 849 por %xmm0,%xmm4 850 pand -96(%rdx),%xmm2 851 por %xmm1,%xmm5 852 pand -80(%rdx),%xmm3 853 por %xmm2,%xmm4 854 por %xmm3,%xmm5 855 movdqa -64(%r12),%xmm0 856 movdqa -48(%r12),%xmm1 857 movdqa -32(%r12),%xmm2 858 movdqa -16(%r12),%xmm3 859 pand -64(%rdx),%xmm0 860 pand -48(%rdx),%xmm1 861 por %xmm0,%xmm4 862 pand -32(%rdx),%xmm2 863 por %xmm1,%xmm5 864 pand -16(%rdx),%xmm3 865 por %xmm2,%xmm4 866 por %xmm3,%xmm5 867 movdqa 0(%r12),%xmm0 868 movdqa 16(%r12),%xmm1 869 movdqa 32(%r12),%xmm2 870 movdqa 48(%r12),%xmm3 871 pand 0(%rdx),%xmm0 872 pand 16(%rdx),%xmm1 873 por %xmm0,%xmm4 874 pand 32(%rdx),%xmm2 875 por %xmm1,%xmm5 876 pand 48(%rdx),%xmm3 877 por %xmm2,%xmm4 878 por %xmm3,%xmm5 879 movdqa 64(%r12),%xmm0 880 movdqa 80(%r12),%xmm1 881 movdqa 96(%r12),%xmm2 882 movdqa 112(%r12),%xmm3 883 pand 64(%rdx),%xmm0 884 pand 80(%rdx),%xmm1 885 por %xmm0,%xmm4 886 pand 96(%rdx),%xmm2 887 por %xmm1,%xmm5 888 pand 112(%rdx),%xmm3 889 por %xmm2,%xmm4 890 por %xmm3,%xmm5 891 por %xmm5,%xmm4 892 pshufd $0x4e,%xmm4,%xmm0 893 por %xmm4,%xmm0 894 leaq 256(%r12),%r12 895.byte 102,72,15,126,195 896 897 movq (%r14,%r9,1),%r10 898 movq %r8,%rbp 899 mulq %rbx 900 addq %rax,%r10 901 movq (%rcx),%rax 902 adcq $0,%rdx 903 904 imulq %r10,%rbp 905 movq %rdx,%r11 906 movq %rdi,(%r14) 907 908 leaq (%r14,%r9,1),%r14 909 910 mulq %rbp 911 addq %rax,%r10 912 movq 8(%rsi,%r9,1),%rax 913 adcq $0,%rdx 914 movq %rdx,%rdi 915 916 mulq %rbx 917 addq %rax,%r11 918 movq 8(%rcx),%rax 919 adcq $0,%rdx 920 addq 8(%r14),%r11 921 adcq $0,%rdx 922 movq %rdx,%r10 923 924 mulq 
/*
 * .Linner4x: four-limb accumulation step of a[i]*b[j] + m*n[i] plus the
 * running total from (%r14); identical carry discipline to .L1st4x.
 */
%rbp 925 addq %rax,%rdi 926 movq 16(%rsi,%r9,1),%rax 927 adcq $0,%rdx 928 addq %r11,%rdi 929 leaq 32(%r9),%r15 930 leaq 32(%rcx),%rcx 931 adcq $0,%rdx 932 movq %rdx,%r13 933 jmp .Linner4x 934 935.align 32 936.Linner4x: 937 mulq %rbx 938 addq %rax,%r10 939 movq -16(%rcx),%rax 940 adcq $0,%rdx 941 addq 16(%r14),%r10 942 leaq 32(%r14),%r14 943 adcq $0,%rdx 944 movq %rdx,%r11 945 946 mulq %rbp 947 addq %rax,%r13 948 movq -8(%rsi,%r15,1),%rax 949 adcq $0,%rdx 950 addq %r10,%r13 951 adcq $0,%rdx 952 movq %rdi,-32(%r14) 953 movq %rdx,%rdi 954 955 mulq %rbx 956 addq %rax,%r11 957 movq -8(%rcx),%rax 958 adcq $0,%rdx 959 addq -8(%r14),%r11 960 adcq $0,%rdx 961 movq %rdx,%r10 962 963 mulq %rbp 964 addq %rax,%rdi 965 movq (%rsi,%r15,1),%rax 966 adcq $0,%rdx 967 addq %r11,%rdi 968 adcq $0,%rdx 969 movq %r13,-24(%r14) 970 movq %rdx,%r13 971 972 mulq %rbx 973 addq %rax,%r10 974 movq 0(%rcx),%rax 975 adcq $0,%rdx 976 addq (%r14),%r10 977 adcq $0,%rdx 978 movq %rdx,%r11 979 980 mulq %rbp 981 addq %rax,%r13 982 movq 8(%rsi,%r15,1),%rax 983 adcq $0,%rdx 984 addq %r10,%r13 985 adcq $0,%rdx 986 movq %rdi,-16(%r14) 987 movq %rdx,%rdi 988 989 mulq %rbx 990 addq %rax,%r11 991 movq 8(%rcx),%rax 992 adcq $0,%rdx 993 addq 8(%r14),%r11 994 adcq $0,%rdx 995 movq %rdx,%r10 996 997 mulq %rbp 998 addq %rax,%rdi 999 movq 16(%rsi,%r15,1),%rax 1000 adcq $0,%rdx 1001 addq %r11,%rdi 1002 leaq 32(%rcx),%rcx 1003 adcq $0,%rdx 1004 movq %r13,-8(%r14) 1005 movq %rdx,%r13 1006 1007 addq $32,%r15 1008 jnz .Linner4x 1009 1010 mulq %rbx 1011 addq %rax,%r10 1012 movq -16(%rcx),%rax 1013 adcq $0,%rdx 1014 addq 16(%r14),%r10 1015 leaq 32(%r14),%r14 1016 adcq $0,%rdx 1017 movq %rdx,%r11 1018 1019 mulq %rbp 1020 addq %rax,%r13 1021 movq -8(%rsi),%rax 1022 adcq $0,%rdx 1023 addq %r10,%r13 1024 adcq $0,%rdx 1025 movq %rdi,-32(%r14) 1026 movq %rdx,%rdi 1027 1028 mulq %rbx 1029 addq %rax,%r11 1030 movq %rbp,%rax 1031 movq -8(%rcx),%rbp 1032 adcq $0,%rdx 1033 addq -8(%r14),%r11 1034 adcq $0,%rdx 1035 movq %rdx,%r10 
/*
 * mul4x_internal tail: loop until r12 reaches the saved bp end
 * (16+8(%rsp)), then fall into the shared .Lsqr4x_sub_entry subtraction
 * path (defined past this capture).  The physical line's tail begins
 * bn_power5 (modular exponentiation helper): same 0x80108 ia32cap
 * dispatch to .Lpowerx5_enter and the same page-walked stack carve-out.
 */
1036 1037 mulq %rbp 1038 addq %rax,%rdi 1039 movq (%rsi,%r9,1),%rax 1040 adcq $0,%rdx 1041 addq %r11,%rdi 1042 adcq $0,%rdx 1043 movq %r13,-24(%r14) 1044 movq %rdx,%r13 1045 1046 movq %rdi,-16(%r14) 1047 leaq (%rcx,%r9,1),%rcx 1048 1049 xorq %rdi,%rdi 1050 addq %r10,%r13 1051 adcq $0,%rdi 1052 addq (%r14),%r13 1053 adcq $0,%rdi 1054 movq %r13,-8(%r14) 1055 1056 cmpq 16+8(%rsp),%r12 1057 jb .Louter4x 1058 xorq %rax,%rax 1059 subq %r13,%rbp 1060 adcq %r15,%r15 1061 orq %r15,%rdi 1062 subq %rdi,%rax 1063 leaq (%r14,%r9,1),%rbx 1064 movq (%rcx),%r12 1065 leaq (%rcx),%rbp 1066 movq %r9,%rcx 1067 sarq $3+2,%rcx 1068 movq 56+8(%rsp),%rdi 1069 decq %r12 1070 xorq %r10,%r10 1071 movq 8(%rbp),%r13 1072 movq 16(%rbp),%r14 1073 movq 24(%rbp),%r15 1074 jmp .Lsqr4x_sub_entry 1075.size mul4x_internal,.-mul4x_internal 1076.globl bn_power5 1077.type bn_power5,@function 1078.align 32 1079bn_power5: 1080.cfi_startproc 1081 movq %rsp,%rax 1082.cfi_def_cfa_register %rax 1083 movl OPENSSL_ia32cap_P+8(%rip),%r11d 1084 andl $0x80108,%r11d 1085 cmpl $0x80108,%r11d 1086 je .Lpowerx5_enter 1087 pushq %rbx 1088.cfi_offset %rbx,-16 1089 pushq %rbp 1090.cfi_offset %rbp,-24 1091 pushq %r12 1092.cfi_offset %r12,-32 1093 pushq %r13 1094.cfi_offset %r13,-40 1095 pushq %r14 1096.cfi_offset %r14,-48 1097 pushq %r15 1098.cfi_offset %r15,-56 1099.Lpower5_prologue: 1100 1101 shll $3,%r9d 1102 leal (%r9,%r9,2),%r10d 1103 negq %r9 1104 movq (%r8),%r8 1105 1106 1107 1108 1109 1110 1111 1112 1113 leaq -320(%rsp,%r9,2),%r11 1114 movq %rsp,%rbp 1115 subq %rdi,%r11 1116 andq $4095,%r11 1117 cmpq %r11,%r10 1118 jb .Lpwr_sp_alt 1119 subq %r11,%rbp 1120 leaq -320(%rbp,%r9,2),%rbp 1121 jmp .Lpwr_sp_done 1122 1123.align 32 1124.Lpwr_sp_alt: 1125 leaq 4096-320(,%r9,2),%r10 1126 leaq -320(%rbp,%r9,2),%rbp 1127 subq %r10,%r11 1128 movq $0,%r10 1129 cmovcq %r10,%r11 1130 subq %r11,%rbp 1131.Lpwr_sp_done: 1132 andq $-64,%rbp 1133 movq %rsp,%r11 1134 subq %rbp,%r11 1135 andq $-4096,%r11 1136 leaq (%r11,%rbp,1),%rsp 1137 
/*
 * bn_power5 body (continuing the stack page walk): parks n0/args in xmm
 * registers (the ".byte 102,72,15,110,..." groups are movq reg,xmm
 * encodings -- verify against the SDM), then computes a^(2^5) via five
 * __bn_sqr8x_internal + __bn_post4x_internal rounds followed by one
 * mul4x_internal, i.e. a fixed call pattern per 5-bit exponent window.
 */
movq (%rsp),%r10 1138 cmpq %rbp,%rsp 1139 ja .Lpwr_page_walk 1140 jmp .Lpwr_page_walk_done 1141 1142.Lpwr_page_walk: 1143 leaq -4096(%rsp),%rsp 1144 movq (%rsp),%r10 1145 cmpq %rbp,%rsp 1146 ja .Lpwr_page_walk 1147.Lpwr_page_walk_done: 1148 1149 movq %r9,%r10 1150 negq %r9 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 movq %r8,32(%rsp) 1162 movq %rax,40(%rsp) 1163.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 1164.Lpower5_body: 1165.byte 102,72,15,110,207 1166.byte 102,72,15,110,209 1167.byte 102,73,15,110,218 1168.byte 102,72,15,110,226 1169 1170 call __bn_sqr8x_internal 1171 call __bn_post4x_internal 1172 call __bn_sqr8x_internal 1173 call __bn_post4x_internal 1174 call __bn_sqr8x_internal 1175 call __bn_post4x_internal 1176 call __bn_sqr8x_internal 1177 call __bn_post4x_internal 1178 call __bn_sqr8x_internal 1179 call __bn_post4x_internal 1180 1181.byte 102,72,15,126,209 1182.byte 102,72,15,126,226 1183 movq %rsi,%rdi 1184 movq 40(%rsp),%rax 1185 leaq 32(%rsp),%r8 1186 1187 call mul4x_internal 1188 1189 movq 40(%rsp),%rsi 1190.cfi_def_cfa %rsi,8 1191 movq $1,%rax 1192 movq -48(%rsi),%r15 1193.cfi_restore %r15 1194 movq -40(%rsi),%r14 1195.cfi_restore %r14 1196 movq -32(%rsi),%r13 1197.cfi_restore %r13 1198 movq -24(%rsi),%r12 1199.cfi_restore %r12 1200 movq -16(%rsi),%rbp 1201.cfi_restore %rbp 1202 movq -8(%rsi),%rbx 1203.cfi_restore %rbx 1204 leaq (%rsi),%rsp 1205.cfi_def_cfa_register %rsp 1206.Lpower5_epilogue: 1207 .byte 0xf3,0xc3 1208.cfi_endproc 1209.size bn_power5,.-bn_power5 1210 1211.globl bn_sqr8x_internal 1212.hidden bn_sqr8x_internal 1213.type bn_sqr8x_internal,@function 1214.align 32 1215bn_sqr8x_internal: 1216__bn_sqr8x_internal: 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 
/*
 * bn_sqr8x_internal squaring phase: computes the off-diagonal products
 * a[i]*a[j] (i<j) into the 2n-limb scratch at 48+8(%rsp,%r9,2).
 * .Lsqr4x_1st handles the first column pair; r14/r15 hold the two
 * multiplicand limbs, rbx streams a[], carries in r10..r13.
 */
1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 leaq 32(%r10),%rbp 1291 leaq (%rsi,%r9,1),%rsi 1292 1293 movq %r9,%rcx 1294 1295 1296 movq -32(%rsi,%rbp,1),%r14 1297 leaq 48+8(%rsp,%r9,2),%rdi 1298 movq -24(%rsi,%rbp,1),%rax 1299 leaq -32(%rdi,%rbp,1),%rdi 1300 movq -16(%rsi,%rbp,1),%rbx 1301 movq %rax,%r15 1302 1303 mulq %r14 1304 movq %rax,%r10 1305 movq %rbx,%rax 1306 movq %rdx,%r11 1307 movq %r10,-24(%rdi,%rbp,1) 1308 1309 mulq %r14 1310 addq %rax,%r11 1311 movq %rbx,%rax 1312 adcq $0,%rdx 1313 movq %r11,-16(%rdi,%rbp,1) 1314 movq %rdx,%r10 1315 1316 1317 movq -8(%rsi,%rbp,1),%rbx 1318 mulq %r15 1319 movq %rax,%r12 1320 movq %rbx,%rax 1321 movq %rdx,%r13 1322 1323 leaq (%rbp),%rcx 1324 mulq %r14 1325 addq %rax,%r10 1326 movq %rbx,%rax 1327 movq %rdx,%r11 1328 adcq $0,%r11 1329 addq %r12,%r10 1330 adcq $0,%r11 1331 movq %r10,-8(%rdi,%rcx,1) 1332 jmp .Lsqr4x_1st 1333 1334.align 32 1335.Lsqr4x_1st: 1336 movq (%rsi,%rcx,1),%rbx 1337 mulq %r15 1338 addq %rax,%r13 1339 movq %rbx,%rax 1340 movq %rdx,%r12 1341 adcq $0,%r12 1342 1343 mulq %r14 1344 addq %rax,%r11 1345 movq %rbx,%rax 1346 movq 8(%rsi,%rcx,1),%rbx 1347 movq %rdx,%r10 1348 adcq $0,%r10 1349 addq %r13,%r11 1350 adcq $0,%r10 1351 1352 1353 mulq %r15 1354 addq %rax,%r12 1355 movq %rbx,%rax 1356 movq %r11,(%rdi,%rcx,1) 1357 movq %rdx,%r13 1358 adcq $0,%r13 1359 1360 mulq %r14 1361 addq %rax,%r10 1362 movq %rbx,%rax 1363 movq 16(%rsi,%rcx,1),%rbx 1364 movq %rdx,%r11 1365 adcq $0,%r11 1366 addq %r12,%r10 1367 adcq $0,%r11 1368 1369 mulq %r15 1370 addq %rax,%r13 1371 movq %rbx,%rax 1372 movq %r10,8(%rdi,%rcx,1) 1373 movq %rdx,%r12 1374 adcq $0,%r12 1375 1376 mulq %r14 1377 addq %rax,%r11 1378 movq %rbx,%rax 1379 movq 24(%rsi,%rcx,1),%rbx 1380 movq %rdx,%r10 1381 adcq $0,%r10 1382 addq %r13,%r11 1383 adcq $0,%r10 1384 1385 1386 mulq %r15 1387 addq %rax,%r12 1388 movq %rbx,%rax 1389 movq %r11,16(%rdi,%rcx,1) 1390 movq %rdx,%r13 1391 adcq $0,%r13 1392 leaq 32(%rcx),%rcx 1393 1394 mulq %r14 1395 addq %rax,%r10 
/*
 * .Lsqr4x_outer / .Lsqr4x_inner: subsequent rows accumulate onto the
 * partial products already in the scratch at (%rdi,...); rbp steps by 16
 * toward zero to terminate the outer loop.
 */
1396 movq %rbx,%rax 1397 movq %rdx,%r11 1398 adcq $0,%r11 1399 addq %r12,%r10 1400 adcq $0,%r11 1401 movq %r10,-8(%rdi,%rcx,1) 1402 1403 cmpq $0,%rcx 1404 jne .Lsqr4x_1st 1405 1406 mulq %r15 1407 addq %rax,%r13 1408 leaq 16(%rbp),%rbp 1409 adcq $0,%rdx 1410 addq %r11,%r13 1411 adcq $0,%rdx 1412 1413 movq %r13,(%rdi) 1414 movq %rdx,%r12 1415 movq %rdx,8(%rdi) 1416 jmp .Lsqr4x_outer 1417 1418.align 32 1419.Lsqr4x_outer: 1420 movq -32(%rsi,%rbp,1),%r14 1421 leaq 48+8(%rsp,%r9,2),%rdi 1422 movq -24(%rsi,%rbp,1),%rax 1423 leaq -32(%rdi,%rbp,1),%rdi 1424 movq -16(%rsi,%rbp,1),%rbx 1425 movq %rax,%r15 1426 1427 mulq %r14 1428 movq -24(%rdi,%rbp,1),%r10 1429 addq %rax,%r10 1430 movq %rbx,%rax 1431 adcq $0,%rdx 1432 movq %r10,-24(%rdi,%rbp,1) 1433 movq %rdx,%r11 1434 1435 mulq %r14 1436 addq %rax,%r11 1437 movq %rbx,%rax 1438 adcq $0,%rdx 1439 addq -16(%rdi,%rbp,1),%r11 1440 movq %rdx,%r10 1441 adcq $0,%r10 1442 movq %r11,-16(%rdi,%rbp,1) 1443 1444 xorq %r12,%r12 1445 1446 movq -8(%rsi,%rbp,1),%rbx 1447 mulq %r15 1448 addq %rax,%r12 1449 movq %rbx,%rax 1450 adcq $0,%rdx 1451 addq -8(%rdi,%rbp,1),%r12 1452 movq %rdx,%r13 1453 adcq $0,%r13 1454 1455 mulq %r14 1456 addq %rax,%r10 1457 movq %rbx,%rax 1458 adcq $0,%rdx 1459 addq %r12,%r10 1460 movq %rdx,%r11 1461 adcq $0,%r11 1462 movq %r10,-8(%rdi,%rbp,1) 1463 1464 leaq (%rbp),%rcx 1465 jmp .Lsqr4x_inner 1466 1467.align 32 1468.Lsqr4x_inner: 1469 movq (%rsi,%rcx,1),%rbx 1470 mulq %r15 1471 addq %rax,%r13 1472 movq %rbx,%rax 1473 movq %rdx,%r12 1474 adcq $0,%r12 1475 addq (%rdi,%rcx,1),%r13 1476 adcq $0,%r12 1477 1478.byte 0x67 1479 mulq %r14 1480 addq %rax,%r11 1481 movq %rbx,%rax 1482 movq 8(%rsi,%rcx,1),%rbx 1483 movq %rdx,%r10 1484 adcq $0,%r10 1485 addq %r13,%r11 1486 adcq $0,%r10 1487 1488 mulq %r15 1489 addq %rax,%r12 1490 movq %r11,(%rdi,%rcx,1) 1491 movq %rbx,%rax 1492 movq %rdx,%r13 1493 adcq $0,%r13 1494 addq 8(%rdi,%rcx,1),%r12 1495 leaq 16(%rcx),%rcx 1496 adcq $0,%r13 1497 1498 mulq %r14 1499 addq %rax,%r10 1500 
/*
 * Last two-limb corner of the off-diagonal phase, then
 * .Lsqr4x_shift_n_add: double the cross products (leaq (reg,reg,2) +
 * shrq $63 carry propagation) while adding the diagonal squares
 * (mulq %rax), i.e. result = 2*cross + diag, four limbs per iteration.
 */
movq %rbx,%rax 1501 adcq $0,%rdx 1502 addq %r12,%r10 1503 movq %rdx,%r11 1504 adcq $0,%r11 1505 movq %r10,-8(%rdi,%rcx,1) 1506 1507 cmpq $0,%rcx 1508 jne .Lsqr4x_inner 1509 1510.byte 0x67 1511 mulq %r15 1512 addq %rax,%r13 1513 adcq $0,%rdx 1514 addq %r11,%r13 1515 adcq $0,%rdx 1516 1517 movq %r13,(%rdi) 1518 movq %rdx,%r12 1519 movq %rdx,8(%rdi) 1520 1521 addq $16,%rbp 1522 jnz .Lsqr4x_outer 1523 1524 1525 movq -32(%rsi),%r14 1526 leaq 48+8(%rsp,%r9,2),%rdi 1527 movq -24(%rsi),%rax 1528 leaq -32(%rdi,%rbp,1),%rdi 1529 movq -16(%rsi),%rbx 1530 movq %rax,%r15 1531 1532 mulq %r14 1533 addq %rax,%r10 1534 movq %rbx,%rax 1535 movq %rdx,%r11 1536 adcq $0,%r11 1537 1538 mulq %r14 1539 addq %rax,%r11 1540 movq %rbx,%rax 1541 movq %r10,-24(%rdi) 1542 movq %rdx,%r10 1543 adcq $0,%r10 1544 addq %r13,%r11 1545 movq -8(%rsi),%rbx 1546 adcq $0,%r10 1547 1548 mulq %r15 1549 addq %rax,%r12 1550 movq %rbx,%rax 1551 movq %r11,-16(%rdi) 1552 movq %rdx,%r13 1553 adcq $0,%r13 1554 1555 mulq %r14 1556 addq %rax,%r10 1557 movq %rbx,%rax 1558 movq %rdx,%r11 1559 adcq $0,%r11 1560 addq %r12,%r10 1561 adcq $0,%r11 1562 movq %r10,-8(%rdi) 1563 1564 mulq %r15 1565 addq %rax,%r13 1566 movq -16(%rsi),%rax 1567 adcq $0,%rdx 1568 addq %r11,%r13 1569 adcq $0,%rdx 1570 1571 movq %r13,(%rdi) 1572 movq %rdx,%r12 1573 movq %rdx,8(%rdi) 1574 1575 mulq %rbx 1576 addq $16,%rbp 1577 xorq %r14,%r14 1578 subq %r9,%rbp 1579 xorq %r15,%r15 1580 1581 addq %r12,%rax 1582 adcq $0,%rdx 1583 movq %rax,8(%rdi) 1584 movq %rdx,16(%rdi) 1585 movq %r15,24(%rdi) 1586 1587 movq -16(%rsi,%rbp,1),%rax 1588 leaq 48+8(%rsp),%rdi 1589 xorq %r10,%r10 1590 movq 8(%rdi),%r11 1591 1592 leaq (%r14,%r10,2),%r12 1593 shrq $63,%r10 1594 leaq (%rcx,%r11,2),%r13 1595 shrq $63,%r11 1596 orq %r10,%r13 1597 movq 16(%rdi),%r10 1598 movq %r11,%r14 1599 mulq %rax 1600 negq %r15 1601 movq 24(%rdi),%r11 1602 adcq %rax,%r12 1603 movq -8(%rsi,%rbp,1),%rax 1604 movq %r12,(%rdi) 1605 adcq %rdx,%r13 1606 1607 leaq (%r14,%r10,2),%rbx 1608 movq 
/*
 * .Lsqr4x_shift_n_add main loop: 64 bytes of output per pass, rbp steps
 * by 32 toward zero; sbbq %r15,%r15 captures the shift-in carry between
 * passes.
 */
%r13,8(%rdi) 1609 sbbq %r15,%r15 1610 shrq $63,%r10 1611 leaq (%rcx,%r11,2),%r8 1612 shrq $63,%r11 1613 orq %r10,%r8 1614 movq 32(%rdi),%r10 1615 movq %r11,%r14 1616 mulq %rax 1617 negq %r15 1618 movq 40(%rdi),%r11 1619 adcq %rax,%rbx 1620 movq 0(%rsi,%rbp,1),%rax 1621 movq %rbx,16(%rdi) 1622 adcq %rdx,%r8 1623 leaq 16(%rbp),%rbp 1624 movq %r8,24(%rdi) 1625 sbbq %r15,%r15 1626 leaq 64(%rdi),%rdi 1627 jmp .Lsqr4x_shift_n_add 1628 1629.align 32 1630.Lsqr4x_shift_n_add: 1631 leaq (%r14,%r10,2),%r12 1632 shrq $63,%r10 1633 leaq (%rcx,%r11,2),%r13 1634 shrq $63,%r11 1635 orq %r10,%r13 1636 movq -16(%rdi),%r10 1637 movq %r11,%r14 1638 mulq %rax 1639 negq %r15 1640 movq -8(%rdi),%r11 1641 adcq %rax,%r12 1642 movq -8(%rsi,%rbp,1),%rax 1643 movq %r12,-32(%rdi) 1644 adcq %rdx,%r13 1645 1646 leaq (%r14,%r10,2),%rbx 1647 movq %r13,-24(%rdi) 1648 sbbq %r15,%r15 1649 shrq $63,%r10 1650 leaq (%rcx,%r11,2),%r8 1651 shrq $63,%r11 1652 orq %r10,%r8 1653 movq 0(%rdi),%r10 1654 movq %r11,%r14 1655 mulq %rax 1656 negq %r15 1657 movq 8(%rdi),%r11 1658 adcq %rax,%rbx 1659 movq 0(%rsi,%rbp,1),%rax 1660 movq %rbx,-16(%rdi) 1661 adcq %rdx,%r8 1662 1663 leaq (%r14,%r10,2),%r12 1664 movq %r8,-8(%rdi) 1665 sbbq %r15,%r15 1666 shrq $63,%r10 1667 leaq (%rcx,%r11,2),%r13 1668 shrq $63,%r11 1669 orq %r10,%r13 1670 movq 16(%rdi),%r10 1671 movq %r11,%r14 1672 mulq %rax 1673 negq %r15 1674 movq 24(%rdi),%r11 1675 adcq %rax,%r12 1676 movq 8(%rsi,%rbp,1),%rax 1677 movq %r12,0(%rdi) 1678 adcq %rdx,%r13 1679 1680 leaq (%r14,%r10,2),%rbx 1681 movq %r13,8(%rdi) 1682 sbbq %r15,%r15 1683 shrq $63,%r10 1684 leaq (%rcx,%r11,2),%r8 1685 shrq $63,%r11 1686 orq %r10,%r8 1687 movq 32(%rdi),%r10 1688 movq %r11,%r14 1689 mulq %rax 1690 negq %r15 1691 movq 40(%rdi),%r11 1692 adcq %rax,%rbx 1693 movq 16(%rsi,%rbp,1),%rax 1694 movq %rbx,16(%rdi) 1695 adcq %rdx,%r8 1696 movq %r8,24(%rdi) 1697 sbbq %r15,%r15 1698 leaq 64(%rdi),%rdi 1699 addq $32,%rbp 1700 jnz .Lsqr4x_shift_n_add 1701 1702 leaq (%r14,%r10,2),%r12 
/*
 * Final shift-n-add corner, then __bn_sqr8x_reduction -- Montgomery
 * reduction of the 2n-limb square, 8 limbs per .L8x_reduction_loop pass.
 * .L8x_reduce: rbx = t[0]*n0 (imulq 32+8(%rsp),%rbx), the eight mulq
 * %rbx steps fold n[] into r8..r15 with adcq $0 carry discipline; the
 * eight m values are cached at 48-8+8(%rsp,%rcx,8) for the .L8x_tail
 * pass.  ".byte 102,72,15,126,213" = movq %xmm2,%rbp (restore np).
 */
1703.byte 0x67 1704 shrq $63,%r10 1705 leaq (%rcx,%r11,2),%r13 1706 shrq $63,%r11 1707 orq %r10,%r13 1708 movq -16(%rdi),%r10 1709 movq %r11,%r14 1710 mulq %rax 1711 negq %r15 1712 movq -8(%rdi),%r11 1713 adcq %rax,%r12 1714 movq -8(%rsi),%rax 1715 movq %r12,-32(%rdi) 1716 adcq %rdx,%r13 1717 1718 leaq (%r14,%r10,2),%rbx 1719 movq %r13,-24(%rdi) 1720 sbbq %r15,%r15 1721 shrq $63,%r10 1722 leaq (%rcx,%r11,2),%r8 1723 shrq $63,%r11 1724 orq %r10,%r8 1725 mulq %rax 1726 negq %r15 1727 adcq %rax,%rbx 1728 adcq %rdx,%r8 1729 movq %rbx,-16(%rdi) 1730 movq %r8,-8(%rdi) 1731.byte 102,72,15,126,213 1732__bn_sqr8x_reduction: 1733 xorq %rax,%rax 1734 leaq (%r9,%rbp,1),%rcx 1735 leaq 48+8(%rsp,%r9,2),%rdx 1736 movq %rcx,0+8(%rsp) 1737 leaq 48+8(%rsp,%r9,1),%rdi 1738 movq %rdx,8+8(%rsp) 1739 negq %r9 1740 jmp .L8x_reduction_loop 1741 1742.align 32 1743.L8x_reduction_loop: 1744 leaq (%rdi,%r9,1),%rdi 1745.byte 0x66 1746 movq 0(%rdi),%rbx 1747 movq 8(%rdi),%r9 1748 movq 16(%rdi),%r10 1749 movq 24(%rdi),%r11 1750 movq 32(%rdi),%r12 1751 movq 40(%rdi),%r13 1752 movq 48(%rdi),%r14 1753 movq 56(%rdi),%r15 1754 movq %rax,(%rdx) 1755 leaq 64(%rdi),%rdi 1756 1757.byte 0x67 1758 movq %rbx,%r8 1759 imulq 32+8(%rsp),%rbx 1760 movq 0(%rbp),%rax 1761 movl $8,%ecx 1762 jmp .L8x_reduce 1763 1764.align 32 1765.L8x_reduce: 1766 mulq %rbx 1767 movq 8(%rbp),%rax 1768 negq %r8 1769 movq %rdx,%r8 1770 adcq $0,%r8 1771 1772 mulq %rbx 1773 addq %rax,%r9 1774 movq 16(%rbp),%rax 1775 adcq $0,%rdx 1776 addq %r9,%r8 1777 movq %rbx,48-8+8(%rsp,%rcx,8) 1778 movq %rdx,%r9 1779 adcq $0,%r9 1780 1781 mulq %rbx 1782 addq %rax,%r10 1783 movq 24(%rbp),%rax 1784 adcq $0,%rdx 1785 addq %r10,%r9 1786 movq 32+8(%rsp),%rsi 1787 movq %rdx,%r10 1788 adcq $0,%r10 1789 1790 mulq %rbx 1791 addq %rax,%r11 1792 movq 32(%rbp),%rax 1793 adcq $0,%rdx 1794 imulq %r8,%rsi 1795 addq %r11,%r10 1796 movq %rdx,%r11 1797 adcq $0,%r11 1798 1799 mulq %rbx 1800 addq %rax,%r12 1801 movq 40(%rbp),%rax 1802 adcq $0,%rdx 1803 addq %r12,%r11 
/*
 * .L8x_tail: folds the remaining n[] limbs against the cached m values
 * (rbx reloaded from 48-16+8(%rsp,%rcx,8)); rsi carries the inter-chunk
 * borrow via sbbq %rsi,%rsi / negq %rsi.
 */
1804 movq %rdx,%r12 1805 adcq $0,%r12 1806 1807 mulq %rbx 1808 addq %rax,%r13 1809 movq 48(%rbp),%rax 1810 adcq $0,%rdx 1811 addq %r13,%r12 1812 movq %rdx,%r13 1813 adcq $0,%r13 1814 1815 mulq %rbx 1816 addq %rax,%r14 1817 movq 56(%rbp),%rax 1818 adcq $0,%rdx 1819 addq %r14,%r13 1820 movq %rdx,%r14 1821 adcq $0,%r14 1822 1823 mulq %rbx 1824 movq %rsi,%rbx 1825 addq %rax,%r15 1826 movq 0(%rbp),%rax 1827 adcq $0,%rdx 1828 addq %r15,%r14 1829 movq %rdx,%r15 1830 adcq $0,%r15 1831 1832 decl %ecx 1833 jnz .L8x_reduce 1834 1835 leaq 64(%rbp),%rbp 1836 xorq %rax,%rax 1837 movq 8+8(%rsp),%rdx 1838 cmpq 0+8(%rsp),%rbp 1839 jae .L8x_no_tail 1840 1841.byte 0x66 1842 addq 0(%rdi),%r8 1843 adcq 8(%rdi),%r9 1844 adcq 16(%rdi),%r10 1845 adcq 24(%rdi),%r11 1846 adcq 32(%rdi),%r12 1847 adcq 40(%rdi),%r13 1848 adcq 48(%rdi),%r14 1849 adcq 56(%rdi),%r15 1850 sbbq %rsi,%rsi 1851 1852 movq 48+56+8(%rsp),%rbx 1853 movl $8,%ecx 1854 movq 0(%rbp),%rax 1855 jmp .L8x_tail 1856 1857.align 32 1858.L8x_tail: 1859 mulq %rbx 1860 addq %rax,%r8 1861 movq 8(%rbp),%rax 1862 movq %r8,(%rdi) 1863 movq %rdx,%r8 1864 adcq $0,%r8 1865 1866 mulq %rbx 1867 addq %rax,%r9 1868 movq 16(%rbp),%rax 1869 adcq $0,%rdx 1870 addq %r9,%r8 1871 leaq 8(%rdi),%rdi 1872 movq %rdx,%r9 1873 adcq $0,%r9 1874 1875 mulq %rbx 1876 addq %rax,%r10 1877 movq 24(%rbp),%rax 1878 adcq $0,%rdx 1879 addq %r10,%r9 1880 movq %rdx,%r10 1881 adcq $0,%r10 1882 1883 mulq %rbx 1884 addq %rax,%r11 1885 movq 32(%rbp),%rax 1886 adcq $0,%rdx 1887 addq %r11,%r10 1888 movq %rdx,%r11 1889 adcq $0,%r11 1890 1891 mulq %rbx 1892 addq %rax,%r12 1893 movq 40(%rbp),%rax 1894 adcq $0,%rdx 1895 addq %r12,%r11 1896 movq %rdx,%r12 1897 adcq $0,%r12 1898 1899 mulq %rbx 1900 addq %rax,%r13 1901 movq 48(%rbp),%rax 1902 adcq $0,%rdx 1903 addq %r13,%r12 1904 movq %rdx,%r13 1905 adcq $0,%r13 1906 1907 mulq %rbx 1908 addq %rax,%r14 1909 movq 56(%rbp),%rax 1910 adcq $0,%rdx 1911 addq %r14,%r13 1912 movq %rdx,%r14 1913 adcq $0,%r14 1914 1915 mulq %rbx 1916 movq 
/*
 * Reduction wind-down (.L8x_tail_done / .L8x_no_tail) writes the reduced
 * chunk back and loops until rdi reaches the end marker in rdx; final
 * carry left in rax.  __bn_post4x_internal then performs the conditional
 * subtraction of the modulus via the shared .Lsqr4x_sub path.
 *
 * NOTE(review): this capture ENDS MID-INSTRUCTION inside .Lsqr4x_sub
 * ("... 2013 movq") -- the remainder of __bn_post4x_internal and the
 * rest of the file (including .Linc and the mulx paths referenced above)
 * are not visible here.  Bytes preserved verbatim; nothing reconstructed.
 */
48-16+8(%rsp,%rcx,8),%rbx 1917 addq %rax,%r15 1918 adcq $0,%rdx 1919 addq %r15,%r14 1920 movq 0(%rbp),%rax 1921 movq %rdx,%r15 1922 adcq $0,%r15 1923 1924 decl %ecx 1925 jnz .L8x_tail 1926 1927 leaq 64(%rbp),%rbp 1928 movq 8+8(%rsp),%rdx 1929 cmpq 0+8(%rsp),%rbp 1930 jae .L8x_tail_done 1931 1932 movq 48+56+8(%rsp),%rbx 1933 negq %rsi 1934 movq 0(%rbp),%rax 1935 adcq 0(%rdi),%r8 1936 adcq 8(%rdi),%r9 1937 adcq 16(%rdi),%r10 1938 adcq 24(%rdi),%r11 1939 adcq 32(%rdi),%r12 1940 adcq 40(%rdi),%r13 1941 adcq 48(%rdi),%r14 1942 adcq 56(%rdi),%r15 1943 sbbq %rsi,%rsi 1944 1945 movl $8,%ecx 1946 jmp .L8x_tail 1947 1948.align 32 1949.L8x_tail_done: 1950 xorq %rax,%rax 1951 addq (%rdx),%r8 1952 adcq $0,%r9 1953 adcq $0,%r10 1954 adcq $0,%r11 1955 adcq $0,%r12 1956 adcq $0,%r13 1957 adcq $0,%r14 1958 adcq $0,%r15 1959 adcq $0,%rax 1960 1961 negq %rsi 1962.L8x_no_tail: 1963 adcq 0(%rdi),%r8 1964 adcq 8(%rdi),%r9 1965 adcq 16(%rdi),%r10 1966 adcq 24(%rdi),%r11 1967 adcq 32(%rdi),%r12 1968 adcq 40(%rdi),%r13 1969 adcq 48(%rdi),%r14 1970 adcq 56(%rdi),%r15 1971 adcq $0,%rax 1972 movq -8(%rbp),%rcx 1973 xorq %rsi,%rsi 1974 1975.byte 102,72,15,126,213 1976 1977 movq %r8,0(%rdi) 1978 movq %r9,8(%rdi) 1979.byte 102,73,15,126,217 1980 movq %r10,16(%rdi) 1981 movq %r11,24(%rdi) 1982 movq %r12,32(%rdi) 1983 movq %r13,40(%rdi) 1984 movq %r14,48(%rdi) 1985 movq %r15,56(%rdi) 1986 leaq 64(%rdi),%rdi 1987 1988 cmpq %rdx,%rdi 1989 jb .L8x_reduction_loop 1990 .byte 0xf3,0xc3 1991.size bn_sqr8x_internal,.-bn_sqr8x_internal 1992.type __bn_post4x_internal,@function 1993.align 32 1994__bn_post4x_internal: 1995 movq 0(%rbp),%r12 1996 leaq (%rdi,%r9,1),%rbx 1997 movq %r9,%rcx 1998.byte 102,72,15,126,207 1999 negq %rax 2000.byte 102,72,15,126,206 2001 sarq $3+2,%rcx 2002 decq %r12 2003 xorq %r10,%r10 2004 movq 8(%rbp),%r13 2005 movq 16(%rbp),%r14 2006 movq 24(%rbp),%r15 2007 jmp .Lsqr4x_sub_entry 2008 2009.align 16 2010.Lsqr4x_sub: 2011 movq 0(%rbp),%r12 2012 movq 8(%rbp),%r13 2013 movq 
16(%rbp),%r14 2014 movq 24(%rbp),%r15 2015.Lsqr4x_sub_entry: 2016 leaq 32(%rbp),%rbp 2017 notq %r12 2018 notq %r13 2019 notq %r14 2020 notq %r15 2021 andq %rax,%r12 2022 andq %rax,%r13 2023 andq %rax,%r14 2024 andq %rax,%r15 2025 2026 negq %r10 2027 adcq 0(%rbx),%r12 2028 adcq 8(%rbx),%r13 2029 adcq 16(%rbx),%r14 2030 adcq 24(%rbx),%r15 2031 movq %r12,0(%rdi) 2032 leaq 32(%rbx),%rbx 2033 movq %r13,8(%rdi) 2034 sbbq %r10,%r10 2035 movq %r14,16(%rdi) 2036 movq %r15,24(%rdi) 2037 leaq 32(%rdi),%rdi 2038 2039 incq %rcx 2040 jnz .Lsqr4x_sub 2041 2042 movq %r9,%r10 2043 negq %r9 2044 .byte 0xf3,0xc3 2045.size __bn_post4x_internal,.-__bn_post4x_internal 2046.globl bn_from_montgomery 2047.type bn_from_montgomery,@function 2048.align 32 2049bn_from_montgomery: 2050 testl $7,%r9d 2051 jz bn_from_mont8x 2052 xorl %eax,%eax 2053 .byte 0xf3,0xc3 2054.size bn_from_montgomery,.-bn_from_montgomery 2055 2056.type bn_from_mont8x,@function 2057.align 32 2058bn_from_mont8x: 2059.cfi_startproc 2060.byte 0x67 2061 movq %rsp,%rax 2062.cfi_def_cfa_register %rax 2063 pushq %rbx 2064.cfi_offset %rbx,-16 2065 pushq %rbp 2066.cfi_offset %rbp,-24 2067 pushq %r12 2068.cfi_offset %r12,-32 2069 pushq %r13 2070.cfi_offset %r13,-40 2071 pushq %r14 2072.cfi_offset %r14,-48 2073 pushq %r15 2074.cfi_offset %r15,-56 2075.Lfrom_prologue: 2076 2077 shll $3,%r9d 2078 leaq (%r9,%r9,2),%r10 2079 negq %r9 2080 movq (%r8),%r8 2081 2082 2083 2084 2085 2086 2087 2088 2089 leaq -320(%rsp,%r9,2),%r11 2090 movq %rsp,%rbp 2091 subq %rdi,%r11 2092 andq $4095,%r11 2093 cmpq %r11,%r10 2094 jb .Lfrom_sp_alt 2095 subq %r11,%rbp 2096 leaq -320(%rbp,%r9,2),%rbp 2097 jmp .Lfrom_sp_done 2098 2099.align 32 2100.Lfrom_sp_alt: 2101 leaq 4096-320(,%r9,2),%r10 2102 leaq -320(%rbp,%r9,2),%rbp 2103 subq %r10,%r11 2104 movq $0,%r10 2105 cmovcq %r10,%r11 2106 subq %r11,%rbp 2107.Lfrom_sp_done: 2108 andq $-64,%rbp 2109 movq %rsp,%r11 2110 subq %rbp,%r11 2111 andq $-4096,%r11 2112 leaq (%r11,%rbp,1),%rsp 2113 movq (%rsp),%r10 2114 
cmpq %rbp,%rsp 2115 ja .Lfrom_page_walk 2116 jmp .Lfrom_page_walk_done 2117 2118.Lfrom_page_walk: 2119 leaq -4096(%rsp),%rsp 2120 movq (%rsp),%r10 2121 cmpq %rbp,%rsp 2122 ja .Lfrom_page_walk 2123.Lfrom_page_walk_done: 2124 2125 movq %r9,%r10 2126 negq %r9 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 movq %r8,32(%rsp) 2138 movq %rax,40(%rsp) 2139.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 2140.Lfrom_body: 2141 movq %r9,%r11 2142 leaq 48(%rsp),%rax 2143 pxor %xmm0,%xmm0 2144 jmp .Lmul_by_1 2145 2146.align 32 2147.Lmul_by_1: 2148 movdqu (%rsi),%xmm1 2149 movdqu 16(%rsi),%xmm2 2150 movdqu 32(%rsi),%xmm3 2151 movdqa %xmm0,(%rax,%r9,1) 2152 movdqu 48(%rsi),%xmm4 2153 movdqa %xmm0,16(%rax,%r9,1) 2154.byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00 2155 movdqa %xmm1,(%rax) 2156 movdqa %xmm0,32(%rax,%r9,1) 2157 movdqa %xmm2,16(%rax) 2158 movdqa %xmm0,48(%rax,%r9,1) 2159 movdqa %xmm3,32(%rax) 2160 movdqa %xmm4,48(%rax) 2161 leaq 64(%rax),%rax 2162 subq $64,%r11 2163 jnz .Lmul_by_1 2164 2165.byte 102,72,15,110,207 2166.byte 102,72,15,110,209 2167.byte 0x67 2168 movq %rcx,%rbp 2169.byte 102,73,15,110,218 2170 movl OPENSSL_ia32cap_P+8(%rip),%r11d 2171 andl $0x80108,%r11d 2172 cmpl $0x80108,%r11d 2173 jne .Lfrom_mont_nox 2174 2175 leaq (%rax,%r9,1),%rdi 2176 call __bn_sqrx8x_reduction 2177 call __bn_postx4x_internal 2178 2179 pxor %xmm0,%xmm0 2180 leaq 48(%rsp),%rax 2181 jmp .Lfrom_mont_zero 2182 2183.align 32 2184.Lfrom_mont_nox: 2185 call __bn_sqr8x_reduction 2186 call __bn_post4x_internal 2187 2188 pxor %xmm0,%xmm0 2189 leaq 48(%rsp),%rax 2190 jmp .Lfrom_mont_zero 2191 2192.align 32 2193.Lfrom_mont_zero: 2194 movq 40(%rsp),%rsi 2195.cfi_def_cfa %rsi,8 2196 movdqa %xmm0,0(%rax) 2197 movdqa %xmm0,16(%rax) 2198 movdqa %xmm0,32(%rax) 2199 movdqa %xmm0,48(%rax) 2200 leaq 64(%rax),%rax 2201 subq $32,%r9 2202 jnz .Lfrom_mont_zero 2203 2204 movq $1,%rax 2205 movq -48(%rsi),%r15 2206.cfi_restore %r15 2207 movq -40(%rsi),%r14 2208.cfi_restore %r14 2209 movq -32(%rsi),%r13 
2210.cfi_restore %r13 2211 movq -24(%rsi),%r12 2212.cfi_restore %r12 2213 movq -16(%rsi),%rbp 2214.cfi_restore %rbp 2215 movq -8(%rsi),%rbx 2216.cfi_restore %rbx 2217 leaq (%rsi),%rsp 2218.cfi_def_cfa_register %rsp 2219.Lfrom_epilogue: 2220 .byte 0xf3,0xc3 2221.cfi_endproc 2222.size bn_from_mont8x,.-bn_from_mont8x 2223.type bn_mulx4x_mont_gather5,@function 2224.align 32 2225bn_mulx4x_mont_gather5: 2226.cfi_startproc 2227 movq %rsp,%rax 2228.cfi_def_cfa_register %rax 2229.Lmulx4x_enter: 2230 pushq %rbx 2231.cfi_offset %rbx,-16 2232 pushq %rbp 2233.cfi_offset %rbp,-24 2234 pushq %r12 2235.cfi_offset %r12,-32 2236 pushq %r13 2237.cfi_offset %r13,-40 2238 pushq %r14 2239.cfi_offset %r14,-48 2240 pushq %r15 2241.cfi_offset %r15,-56 2242.Lmulx4x_prologue: 2243 2244 shll $3,%r9d 2245 leaq (%r9,%r9,2),%r10 2246 negq %r9 2247 movq (%r8),%r8 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 leaq -320(%rsp,%r9,2),%r11 2259 movq %rsp,%rbp 2260 subq %rdi,%r11 2261 andq $4095,%r11 2262 cmpq %r11,%r10 2263 jb .Lmulx4xsp_alt 2264 subq %r11,%rbp 2265 leaq -320(%rbp,%r9,2),%rbp 2266 jmp .Lmulx4xsp_done 2267 2268.Lmulx4xsp_alt: 2269 leaq 4096-320(,%r9,2),%r10 2270 leaq -320(%rbp,%r9,2),%rbp 2271 subq %r10,%r11 2272 movq $0,%r10 2273 cmovcq %r10,%r11 2274 subq %r11,%rbp 2275.Lmulx4xsp_done: 2276 andq $-64,%rbp 2277 movq %rsp,%r11 2278 subq %rbp,%r11 2279 andq $-4096,%r11 2280 leaq (%r11,%rbp,1),%rsp 2281 movq (%rsp),%r10 2282 cmpq %rbp,%rsp 2283 ja .Lmulx4x_page_walk 2284 jmp .Lmulx4x_page_walk_done 2285 2286.Lmulx4x_page_walk: 2287 leaq -4096(%rsp),%rsp 2288 movq (%rsp),%r10 2289 cmpq %rbp,%rsp 2290 ja .Lmulx4x_page_walk 2291.Lmulx4x_page_walk_done: 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 movq %r8,32(%rsp) 2306 movq %rax,40(%rsp) 2307.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 2308.Lmulx4x_body: 2309 call mulx4x_internal 2310 2311 movq 40(%rsp),%rsi 2312.cfi_def_cfa %rsi,8 2313 movq $1,%rax 2314 2315 movq -48(%rsi),%r15 2316.cfi_restore %r15 2317 
movq -40(%rsi),%r14 2318.cfi_restore %r14 2319 movq -32(%rsi),%r13 2320.cfi_restore %r13 2321 movq -24(%rsi),%r12 2322.cfi_restore %r12 2323 movq -16(%rsi),%rbp 2324.cfi_restore %rbp 2325 movq -8(%rsi),%rbx 2326.cfi_restore %rbx 2327 leaq (%rsi),%rsp 2328.cfi_def_cfa_register %rsp 2329.Lmulx4x_epilogue: 2330 .byte 0xf3,0xc3 2331.cfi_endproc 2332.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5 2333 2334.type mulx4x_internal,@function 2335.align 32 2336mulx4x_internal: 2337 movq %r9,8(%rsp) 2338 movq %r9,%r10 2339 negq %r9 2340 shlq $5,%r9 2341 negq %r10 2342 leaq 128(%rdx,%r9,1),%r13 2343 shrq $5+5,%r9 2344 movd 8(%rax),%xmm5 2345 subq $1,%r9 2346 leaq .Linc(%rip),%rax 2347 movq %r13,16+8(%rsp) 2348 movq %r9,24+8(%rsp) 2349 movq %rdi,56+8(%rsp) 2350 movdqa 0(%rax),%xmm0 2351 movdqa 16(%rax),%xmm1 2352 leaq 88-112(%rsp,%r10,1),%r10 2353 leaq 128(%rdx),%rdi 2354 2355 pshufd $0,%xmm5,%xmm5 2356 movdqa %xmm1,%xmm4 2357.byte 0x67 2358 movdqa %xmm1,%xmm2 2359.byte 0x67 2360 paddd %xmm0,%xmm1 2361 pcmpeqd %xmm5,%xmm0 2362 movdqa %xmm4,%xmm3 2363 paddd %xmm1,%xmm2 2364 pcmpeqd %xmm5,%xmm1 2365 movdqa %xmm0,112(%r10) 2366 movdqa %xmm4,%xmm0 2367 2368 paddd %xmm2,%xmm3 2369 pcmpeqd %xmm5,%xmm2 2370 movdqa %xmm1,128(%r10) 2371 movdqa %xmm4,%xmm1 2372 2373 paddd %xmm3,%xmm0 2374 pcmpeqd %xmm5,%xmm3 2375 movdqa %xmm2,144(%r10) 2376 movdqa %xmm4,%xmm2 2377 2378 paddd %xmm0,%xmm1 2379 pcmpeqd %xmm5,%xmm0 2380 movdqa %xmm3,160(%r10) 2381 movdqa %xmm4,%xmm3 2382 paddd %xmm1,%xmm2 2383 pcmpeqd %xmm5,%xmm1 2384 movdqa %xmm0,176(%r10) 2385 movdqa %xmm4,%xmm0 2386 2387 paddd %xmm2,%xmm3 2388 pcmpeqd %xmm5,%xmm2 2389 movdqa %xmm1,192(%r10) 2390 movdqa %xmm4,%xmm1 2391 2392 paddd %xmm3,%xmm0 2393 pcmpeqd %xmm5,%xmm3 2394 movdqa %xmm2,208(%r10) 2395 movdqa %xmm4,%xmm2 2396 2397 paddd %xmm0,%xmm1 2398 pcmpeqd %xmm5,%xmm0 2399 movdqa %xmm3,224(%r10) 2400 movdqa %xmm4,%xmm3 2401 paddd %xmm1,%xmm2 2402 pcmpeqd %xmm5,%xmm1 2403 movdqa %xmm0,240(%r10) 2404 movdqa %xmm4,%xmm0 2405 2406 paddd 
%xmm2,%xmm3 2407 pcmpeqd %xmm5,%xmm2 2408 movdqa %xmm1,256(%r10) 2409 movdqa %xmm4,%xmm1 2410 2411 paddd %xmm3,%xmm0 2412 pcmpeqd %xmm5,%xmm3 2413 movdqa %xmm2,272(%r10) 2414 movdqa %xmm4,%xmm2 2415 2416 paddd %xmm0,%xmm1 2417 pcmpeqd %xmm5,%xmm0 2418 movdqa %xmm3,288(%r10) 2419 movdqa %xmm4,%xmm3 2420.byte 0x67 2421 paddd %xmm1,%xmm2 2422 pcmpeqd %xmm5,%xmm1 2423 movdqa %xmm0,304(%r10) 2424 2425 paddd %xmm2,%xmm3 2426 pcmpeqd %xmm5,%xmm2 2427 movdqa %xmm1,320(%r10) 2428 2429 pcmpeqd %xmm5,%xmm3 2430 movdqa %xmm2,336(%r10) 2431 2432 pand 64(%rdi),%xmm0 2433 pand 80(%rdi),%xmm1 2434 pand 96(%rdi),%xmm2 2435 movdqa %xmm3,352(%r10) 2436 pand 112(%rdi),%xmm3 2437 por %xmm2,%xmm0 2438 por %xmm3,%xmm1 2439 movdqa -128(%rdi),%xmm4 2440 movdqa -112(%rdi),%xmm5 2441 movdqa -96(%rdi),%xmm2 2442 pand 112(%r10),%xmm4 2443 movdqa -80(%rdi),%xmm3 2444 pand 128(%r10),%xmm5 2445 por %xmm4,%xmm0 2446 pand 144(%r10),%xmm2 2447 por %xmm5,%xmm1 2448 pand 160(%r10),%xmm3 2449 por %xmm2,%xmm0 2450 por %xmm3,%xmm1 2451 movdqa -64(%rdi),%xmm4 2452 movdqa -48(%rdi),%xmm5 2453 movdqa -32(%rdi),%xmm2 2454 pand 176(%r10),%xmm4 2455 movdqa -16(%rdi),%xmm3 2456 pand 192(%r10),%xmm5 2457 por %xmm4,%xmm0 2458 pand 208(%r10),%xmm2 2459 por %xmm5,%xmm1 2460 pand 224(%r10),%xmm3 2461 por %xmm2,%xmm0 2462 por %xmm3,%xmm1 2463 movdqa 0(%rdi),%xmm4 2464 movdqa 16(%rdi),%xmm5 2465 movdqa 32(%rdi),%xmm2 2466 pand 240(%r10),%xmm4 2467 movdqa 48(%rdi),%xmm3 2468 pand 256(%r10),%xmm5 2469 por %xmm4,%xmm0 2470 pand 272(%r10),%xmm2 2471 por %xmm5,%xmm1 2472 pand 288(%r10),%xmm3 2473 por %xmm2,%xmm0 2474 por %xmm3,%xmm1 2475 pxor %xmm1,%xmm0 2476 pshufd $0x4e,%xmm0,%xmm1 2477 por %xmm1,%xmm0 2478 leaq 256(%rdi),%rdi 2479.byte 102,72,15,126,194 2480 leaq 64+32+8(%rsp),%rbx 2481 2482 movq %rdx,%r9 2483 mulxq 0(%rsi),%r8,%rax 2484 mulxq 8(%rsi),%r11,%r12 2485 addq %rax,%r11 2486 mulxq 16(%rsi),%rax,%r13 2487 adcq %rax,%r12 2488 adcq $0,%r13 2489 mulxq 24(%rsi),%rax,%r14 2490 2491 movq %r8,%r15 2492 imulq 
32+8(%rsp),%r8 2493 xorq %rbp,%rbp 2494 movq %r8,%rdx 2495 2496 movq %rdi,8+8(%rsp) 2497 2498 leaq 32(%rsi),%rsi 2499 adcxq %rax,%r13 2500 adcxq %rbp,%r14 2501 2502 mulxq 0(%rcx),%rax,%r10 2503 adcxq %rax,%r15 2504 adoxq %r11,%r10 2505 mulxq 8(%rcx),%rax,%r11 2506 adcxq %rax,%r10 2507 adoxq %r12,%r11 2508 mulxq 16(%rcx),%rax,%r12 2509 movq 24+8(%rsp),%rdi 2510 movq %r10,-32(%rbx) 2511 adcxq %rax,%r11 2512 adoxq %r13,%r12 2513 mulxq 24(%rcx),%rax,%r15 2514 movq %r9,%rdx 2515 movq %r11,-24(%rbx) 2516 adcxq %rax,%r12 2517 adoxq %rbp,%r15 2518 leaq 32(%rcx),%rcx 2519 movq %r12,-16(%rbx) 2520 jmp .Lmulx4x_1st 2521 2522.align 32 2523.Lmulx4x_1st: 2524 adcxq %rbp,%r15 2525 mulxq 0(%rsi),%r10,%rax 2526 adcxq %r14,%r10 2527 mulxq 8(%rsi),%r11,%r14 2528 adcxq %rax,%r11 2529 mulxq 16(%rsi),%r12,%rax 2530 adcxq %r14,%r12 2531 mulxq 24(%rsi),%r13,%r14 2532.byte 0x67,0x67 2533 movq %r8,%rdx 2534 adcxq %rax,%r13 2535 adcxq %rbp,%r14 2536 leaq 32(%rsi),%rsi 2537 leaq 32(%rbx),%rbx 2538 2539 adoxq %r15,%r10 2540 mulxq 0(%rcx),%rax,%r15 2541 adcxq %rax,%r10 2542 adoxq %r15,%r11 2543 mulxq 8(%rcx),%rax,%r15 2544 adcxq %rax,%r11 2545 adoxq %r15,%r12 2546 mulxq 16(%rcx),%rax,%r15 2547 movq %r10,-40(%rbx) 2548 adcxq %rax,%r12 2549 movq %r11,-32(%rbx) 2550 adoxq %r15,%r13 2551 mulxq 24(%rcx),%rax,%r15 2552 movq %r9,%rdx 2553 movq %r12,-24(%rbx) 2554 adcxq %rax,%r13 2555 adoxq %rbp,%r15 2556 leaq 32(%rcx),%rcx 2557 movq %r13,-16(%rbx) 2558 2559 decq %rdi 2560 jnz .Lmulx4x_1st 2561 2562 movq 8(%rsp),%rax 2563 adcq %rbp,%r15 2564 leaq (%rsi,%rax,1),%rsi 2565 addq %r15,%r14 2566 movq 8+8(%rsp),%rdi 2567 adcq %rbp,%rbp 2568 movq %r14,-8(%rbx) 2569 jmp .Lmulx4x_outer 2570 2571.align 32 2572.Lmulx4x_outer: 2573 leaq 16-256(%rbx),%r10 2574 pxor %xmm4,%xmm4 2575.byte 0x67,0x67 2576 pxor %xmm5,%xmm5 2577 movdqa -128(%rdi),%xmm0 2578 movdqa -112(%rdi),%xmm1 2579 movdqa -96(%rdi),%xmm2 2580 pand 256(%r10),%xmm0 2581 movdqa -80(%rdi),%xmm3 2582 pand 272(%r10),%xmm1 2583 por %xmm0,%xmm4 2584 pand 
288(%r10),%xmm2 2585 por %xmm1,%xmm5 2586 pand 304(%r10),%xmm3 2587 por %xmm2,%xmm4 2588 por %xmm3,%xmm5 2589 movdqa -64(%rdi),%xmm0 2590 movdqa -48(%rdi),%xmm1 2591 movdqa -32(%rdi),%xmm2 2592 pand 320(%r10),%xmm0 2593 movdqa -16(%rdi),%xmm3 2594 pand 336(%r10),%xmm1 2595 por %xmm0,%xmm4 2596 pand 352(%r10),%xmm2 2597 por %xmm1,%xmm5 2598 pand 368(%r10),%xmm3 2599 por %xmm2,%xmm4 2600 por %xmm3,%xmm5 2601 movdqa 0(%rdi),%xmm0 2602 movdqa 16(%rdi),%xmm1 2603 movdqa 32(%rdi),%xmm2 2604 pand 384(%r10),%xmm0 2605 movdqa 48(%rdi),%xmm3 2606 pand 400(%r10),%xmm1 2607 por %xmm0,%xmm4 2608 pand 416(%r10),%xmm2 2609 por %xmm1,%xmm5 2610 pand 432(%r10),%xmm3 2611 por %xmm2,%xmm4 2612 por %xmm3,%xmm5 2613 movdqa 64(%rdi),%xmm0 2614 movdqa 80(%rdi),%xmm1 2615 movdqa 96(%rdi),%xmm2 2616 pand 448(%r10),%xmm0 2617 movdqa 112(%rdi),%xmm3 2618 pand 464(%r10),%xmm1 2619 por %xmm0,%xmm4 2620 pand 480(%r10),%xmm2 2621 por %xmm1,%xmm5 2622 pand 496(%r10),%xmm3 2623 por %xmm2,%xmm4 2624 por %xmm3,%xmm5 2625 por %xmm5,%xmm4 2626 pshufd $0x4e,%xmm4,%xmm0 2627 por %xmm4,%xmm0 2628 leaq 256(%rdi),%rdi 2629.byte 102,72,15,126,194 2630 2631 movq %rbp,(%rbx) 2632 leaq 32(%rbx,%rax,1),%rbx 2633 mulxq 0(%rsi),%r8,%r11 2634 xorq %rbp,%rbp 2635 movq %rdx,%r9 2636 mulxq 8(%rsi),%r14,%r12 2637 adoxq -32(%rbx),%r8 2638 adcxq %r14,%r11 2639 mulxq 16(%rsi),%r15,%r13 2640 adoxq -24(%rbx),%r11 2641 adcxq %r15,%r12 2642 mulxq 24(%rsi),%rdx,%r14 2643 adoxq -16(%rbx),%r12 2644 adcxq %rdx,%r13 2645 leaq (%rcx,%rax,1),%rcx 2646 leaq 32(%rsi),%rsi 2647 adoxq -8(%rbx),%r13 2648 adcxq %rbp,%r14 2649 adoxq %rbp,%r14 2650 2651 movq %r8,%r15 2652 imulq 32+8(%rsp),%r8 2653 2654 movq %r8,%rdx 2655 xorq %rbp,%rbp 2656 movq %rdi,8+8(%rsp) 2657 2658 mulxq 0(%rcx),%rax,%r10 2659 adcxq %rax,%r15 2660 adoxq %r11,%r10 2661 mulxq 8(%rcx),%rax,%r11 2662 adcxq %rax,%r10 2663 adoxq %r12,%r11 2664 mulxq 16(%rcx),%rax,%r12 2665 adcxq %rax,%r11 2666 adoxq %r13,%r12 2667 mulxq 24(%rcx),%rax,%r15 2668 movq %r9,%rdx 2669 movq 
24+8(%rsp),%rdi 2670 movq %r10,-32(%rbx) 2671 adcxq %rax,%r12 2672 movq %r11,-24(%rbx) 2673 adoxq %rbp,%r15 2674 movq %r12,-16(%rbx) 2675 leaq 32(%rcx),%rcx 2676 jmp .Lmulx4x_inner 2677 2678.align 32 2679.Lmulx4x_inner: 2680 mulxq 0(%rsi),%r10,%rax 2681 adcxq %rbp,%r15 2682 adoxq %r14,%r10 2683 mulxq 8(%rsi),%r11,%r14 2684 adcxq 0(%rbx),%r10 2685 adoxq %rax,%r11 2686 mulxq 16(%rsi),%r12,%rax 2687 adcxq 8(%rbx),%r11 2688 adoxq %r14,%r12 2689 mulxq 24(%rsi),%r13,%r14 2690 movq %r8,%rdx 2691 adcxq 16(%rbx),%r12 2692 adoxq %rax,%r13 2693 adcxq 24(%rbx),%r13 2694 adoxq %rbp,%r14 2695 leaq 32(%rsi),%rsi 2696 leaq 32(%rbx),%rbx 2697 adcxq %rbp,%r14 2698 2699 adoxq %r15,%r10 2700 mulxq 0(%rcx),%rax,%r15 2701 adcxq %rax,%r10 2702 adoxq %r15,%r11 2703 mulxq 8(%rcx),%rax,%r15 2704 adcxq %rax,%r11 2705 adoxq %r15,%r12 2706 mulxq 16(%rcx),%rax,%r15 2707 movq %r10,-40(%rbx) 2708 adcxq %rax,%r12 2709 adoxq %r15,%r13 2710 movq %r11,-32(%rbx) 2711 mulxq 24(%rcx),%rax,%r15 2712 movq %r9,%rdx 2713 leaq 32(%rcx),%rcx 2714 movq %r12,-24(%rbx) 2715 adcxq %rax,%r13 2716 adoxq %rbp,%r15 2717 movq %r13,-16(%rbx) 2718 2719 decq %rdi 2720 jnz .Lmulx4x_inner 2721 2722 movq 0+8(%rsp),%rax 2723 adcq %rbp,%r15 2724 subq 0(%rbx),%rdi 2725 movq 8+8(%rsp),%rdi 2726 movq 16+8(%rsp),%r10 2727 adcq %r15,%r14 2728 leaq (%rsi,%rax,1),%rsi 2729 adcq %rbp,%rbp 2730 movq %r14,-8(%rbx) 2731 2732 cmpq %r10,%rdi 2733 jb .Lmulx4x_outer 2734 2735 movq -8(%rcx),%r10 2736 movq %rbp,%r8 2737 movq (%rcx,%rax,1),%r12 2738 leaq (%rcx,%rax,1),%rbp 2739 movq %rax,%rcx 2740 leaq (%rbx,%rax,1),%rdi 2741 xorl %eax,%eax 2742 xorq %r15,%r15 2743 subq %r14,%r10 2744 adcq %r15,%r15 2745 orq %r15,%r8 2746 sarq $3+2,%rcx 2747 subq %r8,%rax 2748 movq 56+8(%rsp),%rdx 2749 decq %r12 2750 movq 8(%rbp),%r13 2751 xorq %r8,%r8 2752 movq 16(%rbp),%r14 2753 movq 24(%rbp),%r15 2754 jmp .Lsqrx4x_sub_entry 2755.size mulx4x_internal,.-mulx4x_internal 2756.type bn_powerx5,@function 2757.align 32 2758bn_powerx5: 2759.cfi_startproc 2760 movq 
%rsp,%rax 2761.cfi_def_cfa_register %rax 2762.Lpowerx5_enter: 2763 pushq %rbx 2764.cfi_offset %rbx,-16 2765 pushq %rbp 2766.cfi_offset %rbp,-24 2767 pushq %r12 2768.cfi_offset %r12,-32 2769 pushq %r13 2770.cfi_offset %r13,-40 2771 pushq %r14 2772.cfi_offset %r14,-48 2773 pushq %r15 2774.cfi_offset %r15,-56 2775.Lpowerx5_prologue: 2776 2777 shll $3,%r9d 2778 leaq (%r9,%r9,2),%r10 2779 negq %r9 2780 movq (%r8),%r8 2781 2782 2783 2784 2785 2786 2787 2788 2789 leaq -320(%rsp,%r9,2),%r11 2790 movq %rsp,%rbp 2791 subq %rdi,%r11 2792 andq $4095,%r11 2793 cmpq %r11,%r10 2794 jb .Lpwrx_sp_alt 2795 subq %r11,%rbp 2796 leaq -320(%rbp,%r9,2),%rbp 2797 jmp .Lpwrx_sp_done 2798 2799.align 32 2800.Lpwrx_sp_alt: 2801 leaq 4096-320(,%r9,2),%r10 2802 leaq -320(%rbp,%r9,2),%rbp 2803 subq %r10,%r11 2804 movq $0,%r10 2805 cmovcq %r10,%r11 2806 subq %r11,%rbp 2807.Lpwrx_sp_done: 2808 andq $-64,%rbp 2809 movq %rsp,%r11 2810 subq %rbp,%r11 2811 andq $-4096,%r11 2812 leaq (%r11,%rbp,1),%rsp 2813 movq (%rsp),%r10 2814 cmpq %rbp,%rsp 2815 ja .Lpwrx_page_walk 2816 jmp .Lpwrx_page_walk_done 2817 2818.Lpwrx_page_walk: 2819 leaq -4096(%rsp),%rsp 2820 movq (%rsp),%r10 2821 cmpq %rbp,%rsp 2822 ja .Lpwrx_page_walk 2823.Lpwrx_page_walk_done: 2824 2825 movq %r9,%r10 2826 negq %r9 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 pxor %xmm0,%xmm0 2840.byte 102,72,15,110,207 2841.byte 102,72,15,110,209 2842.byte 102,73,15,110,218 2843.byte 102,72,15,110,226 2844 movq %r8,32(%rsp) 2845 movq %rax,40(%rsp) 2846.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 2847.Lpowerx5_body: 2848 2849 call __bn_sqrx8x_internal 2850 call __bn_postx4x_internal 2851 call __bn_sqrx8x_internal 2852 call __bn_postx4x_internal 2853 call __bn_sqrx8x_internal 2854 call __bn_postx4x_internal 2855 call __bn_sqrx8x_internal 2856 call __bn_postx4x_internal 2857 call __bn_sqrx8x_internal 2858 call __bn_postx4x_internal 2859 2860 movq %r10,%r9 2861 movq %rsi,%rdi 2862.byte 102,72,15,126,209 2863.byte 102,72,15,126,226 
2864 movq 40(%rsp),%rax 2865 2866 call mulx4x_internal 2867 2868 movq 40(%rsp),%rsi 2869.cfi_def_cfa %rsi,8 2870 movq $1,%rax 2871 2872 movq -48(%rsi),%r15 2873.cfi_restore %r15 2874 movq -40(%rsi),%r14 2875.cfi_restore %r14 2876 movq -32(%rsi),%r13 2877.cfi_restore %r13 2878 movq -24(%rsi),%r12 2879.cfi_restore %r12 2880 movq -16(%rsi),%rbp 2881.cfi_restore %rbp 2882 movq -8(%rsi),%rbx 2883.cfi_restore %rbx 2884 leaq (%rsi),%rsp 2885.cfi_def_cfa_register %rsp 2886.Lpowerx5_epilogue: 2887 .byte 0xf3,0xc3 2888.cfi_endproc 2889.size bn_powerx5,.-bn_powerx5 2890 2891.globl bn_sqrx8x_internal 2892.hidden bn_sqrx8x_internal 2893.type bn_sqrx8x_internal,@function 2894.align 32 2895bn_sqrx8x_internal: 2896__bn_sqrx8x_internal: 2897.cfi_startproc 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 leaq 48+8(%rsp),%rdi 2939 leaq (%rsi,%r9,1),%rbp 2940 movq %r9,0+8(%rsp) 2941 movq %rbp,8+8(%rsp) 2942 jmp .Lsqr8x_zero_start 2943 2944.align 32 2945.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 2946.Lsqrx8x_zero: 2947.byte 0x3e 2948 movdqa %xmm0,0(%rdi) 2949 movdqa %xmm0,16(%rdi) 2950 movdqa %xmm0,32(%rdi) 2951 movdqa %xmm0,48(%rdi) 2952.Lsqr8x_zero_start: 2953 movdqa %xmm0,64(%rdi) 2954 movdqa %xmm0,80(%rdi) 2955 movdqa %xmm0,96(%rdi) 2956 movdqa %xmm0,112(%rdi) 2957 leaq 128(%rdi),%rdi 2958 subq $64,%r9 2959 jnz .Lsqrx8x_zero 2960 2961 movq 0(%rsi),%rdx 2962 2963 xorq %r10,%r10 2964 xorq %r11,%r11 2965 xorq %r12,%r12 2966 xorq %r13,%r13 2967 xorq %r14,%r14 2968 xorq %r15,%r15 2969 leaq 48+8(%rsp),%rdi 2970 xorq %rbp,%rbp 2971 jmp .Lsqrx8x_outer_loop 2972 2973.align 32 2974.Lsqrx8x_outer_loop: 2975 mulxq 8(%rsi),%r8,%rax 2976 adcxq %r9,%r8 2977 adoxq %rax,%r10 2978 mulxq 16(%rsi),%r9,%rax 2979 adcxq %r10,%r9 2980 adoxq %rax,%r11 2981.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 2982 adcxq %r11,%r10 2983 
adoxq %rax,%r12 2984.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 2985 adcxq %r12,%r11 2986 adoxq %rax,%r13 2987 mulxq 40(%rsi),%r12,%rax 2988 adcxq %r13,%r12 2989 adoxq %rax,%r14 2990 mulxq 48(%rsi),%r13,%rax 2991 adcxq %r14,%r13 2992 adoxq %r15,%rax 2993 mulxq 56(%rsi),%r14,%r15 2994 movq 8(%rsi),%rdx 2995 adcxq %rax,%r14 2996 adoxq %rbp,%r15 2997 adcq 64(%rdi),%r15 2998 movq %r8,8(%rdi) 2999 movq %r9,16(%rdi) 3000 sbbq %rcx,%rcx 3001 xorq %rbp,%rbp 3002 3003 3004 mulxq 16(%rsi),%r8,%rbx 3005 mulxq 24(%rsi),%r9,%rax 3006 adcxq %r10,%r8 3007 adoxq %rbx,%r9 3008 mulxq 32(%rsi),%r10,%rbx 3009 adcxq %r11,%r9 3010 adoxq %rax,%r10 3011.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 3012 adcxq %r12,%r10 3013 adoxq %rbx,%r11 3014.byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 3015 adcxq %r13,%r11 3016 adoxq %r14,%r12 3017.byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 3018 movq 16(%rsi),%rdx 3019 adcxq %rax,%r12 3020 adoxq %rbx,%r13 3021 adcxq %r15,%r13 3022 adoxq %rbp,%r14 3023 adcxq %rbp,%r14 3024 3025 movq %r8,24(%rdi) 3026 movq %r9,32(%rdi) 3027 3028 mulxq 24(%rsi),%r8,%rbx 3029 mulxq 32(%rsi),%r9,%rax 3030 adcxq %r10,%r8 3031 adoxq %rbx,%r9 3032 mulxq 40(%rsi),%r10,%rbx 3033 adcxq %r11,%r9 3034 adoxq %rax,%r10 3035.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 3036 adcxq %r12,%r10 3037 adoxq %r13,%r11 3038.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 3039.byte 0x3e 3040 movq 24(%rsi),%rdx 3041 adcxq %rbx,%r11 3042 adoxq %rax,%r12 3043 adcxq %r14,%r12 3044 movq %r8,40(%rdi) 3045 movq %r9,48(%rdi) 3046 mulxq 32(%rsi),%r8,%rax 3047 adoxq %rbp,%r13 3048 adcxq %rbp,%r13 3049 3050 mulxq 40(%rsi),%r9,%rbx 3051 adcxq %r10,%r8 3052 adoxq %rax,%r9 3053 mulxq 48(%rsi),%r10,%rax 3054 adcxq %r11,%r9 3055 adoxq %r12,%r10 3056 mulxq 56(%rsi),%r11,%r12 3057 movq 32(%rsi),%rdx 3058 movq 40(%rsi),%r14 3059 adcxq %rbx,%r10 3060 adoxq %rax,%r11 3061 movq 48(%rsi),%r15 3062 adcxq %r13,%r11 3063 adoxq %rbp,%r12 3064 adcxq %rbp,%r12 3065 3066 movq %r8,56(%rdi) 3067 
movq %r9,64(%rdi) 3068 3069 mulxq %r14,%r9,%rax 3070 movq 56(%rsi),%r8 3071 adcxq %r10,%r9 3072 mulxq %r15,%r10,%rbx 3073 adoxq %rax,%r10 3074 adcxq %r11,%r10 3075 mulxq %r8,%r11,%rax 3076 movq %r14,%rdx 3077 adoxq %rbx,%r11 3078 adcxq %r12,%r11 3079 3080 adcxq %rbp,%rax 3081 3082 mulxq %r15,%r14,%rbx 3083 mulxq %r8,%r12,%r13 3084 movq %r15,%rdx 3085 leaq 64(%rsi),%rsi 3086 adcxq %r14,%r11 3087 adoxq %rbx,%r12 3088 adcxq %rax,%r12 3089 adoxq %rbp,%r13 3090 3091.byte 0x67,0x67 3092 mulxq %r8,%r8,%r14 3093 adcxq %r8,%r13 3094 adcxq %rbp,%r14 3095 3096 cmpq 8+8(%rsp),%rsi 3097 je .Lsqrx8x_outer_break 3098 3099 negq %rcx 3100 movq $-8,%rcx 3101 movq %rbp,%r15 3102 movq 64(%rdi),%r8 3103 adcxq 72(%rdi),%r9 3104 adcxq 80(%rdi),%r10 3105 adcxq 88(%rdi),%r11 3106 adcq 96(%rdi),%r12 3107 adcq 104(%rdi),%r13 3108 adcq 112(%rdi),%r14 3109 adcq 120(%rdi),%r15 3110 leaq (%rsi),%rbp 3111 leaq 128(%rdi),%rdi 3112 sbbq %rax,%rax 3113 3114 movq -64(%rsi),%rdx 3115 movq %rax,16+8(%rsp) 3116 movq %rdi,24+8(%rsp) 3117 3118 3119 xorl %eax,%eax 3120 jmp .Lsqrx8x_loop 3121 3122.align 32 3123.Lsqrx8x_loop: 3124 movq %r8,%rbx 3125 mulxq 0(%rbp),%rax,%r8 3126 adcxq %rax,%rbx 3127 adoxq %r9,%r8 3128 3129 mulxq 8(%rbp),%rax,%r9 3130 adcxq %rax,%r8 3131 adoxq %r10,%r9 3132 3133 mulxq 16(%rbp),%rax,%r10 3134 adcxq %rax,%r9 3135 adoxq %r11,%r10 3136 3137 mulxq 24(%rbp),%rax,%r11 3138 adcxq %rax,%r10 3139 adoxq %r12,%r11 3140 3141.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 3142 adcxq %rax,%r11 3143 adoxq %r13,%r12 3144 3145 mulxq 40(%rbp),%rax,%r13 3146 adcxq %rax,%r12 3147 adoxq %r14,%r13 3148 3149 mulxq 48(%rbp),%rax,%r14 3150 movq %rbx,(%rdi,%rcx,8) 3151 movl $0,%ebx 3152 adcxq %rax,%r13 3153 adoxq %r15,%r14 3154 3155.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 3156 movq 8(%rsi,%rcx,8),%rdx 3157 adcxq %rax,%r14 3158 adoxq %rbx,%r15 3159 adcxq %rbx,%r15 3160 3161.byte 0x67 3162 incq %rcx 3163 jnz .Lsqrx8x_loop 3164 3165 leaq 64(%rbp),%rbp 3166 movq $-8,%rcx 3167 cmpq 8+8(%rsp),%rbp 
3168 je .Lsqrx8x_break 3169 3170 subq 16+8(%rsp),%rbx 3171.byte 0x66 3172 movq -64(%rsi),%rdx 3173 adcxq 0(%rdi),%r8 3174 adcxq 8(%rdi),%r9 3175 adcq 16(%rdi),%r10 3176 adcq 24(%rdi),%r11 3177 adcq 32(%rdi),%r12 3178 adcq 40(%rdi),%r13 3179 adcq 48(%rdi),%r14 3180 adcq 56(%rdi),%r15 3181 leaq 64(%rdi),%rdi 3182.byte 0x67 3183 sbbq %rax,%rax 3184 xorl %ebx,%ebx 3185 movq %rax,16+8(%rsp) 3186 jmp .Lsqrx8x_loop 3187 3188.align 32 3189.Lsqrx8x_break: 3190 xorq %rbp,%rbp 3191 subq 16+8(%rsp),%rbx 3192 adcxq %rbp,%r8 3193 movq 24+8(%rsp),%rcx 3194 adcxq %rbp,%r9 3195 movq 0(%rsi),%rdx 3196 adcq $0,%r10 3197 movq %r8,0(%rdi) 3198 adcq $0,%r11 3199 adcq $0,%r12 3200 adcq $0,%r13 3201 adcq $0,%r14 3202 adcq $0,%r15 3203 cmpq %rcx,%rdi 3204 je .Lsqrx8x_outer_loop 3205 3206 movq %r9,8(%rdi) 3207 movq 8(%rcx),%r9 3208 movq %r10,16(%rdi) 3209 movq 16(%rcx),%r10 3210 movq %r11,24(%rdi) 3211 movq 24(%rcx),%r11 3212 movq %r12,32(%rdi) 3213 movq 32(%rcx),%r12 3214 movq %r13,40(%rdi) 3215 movq 40(%rcx),%r13 3216 movq %r14,48(%rdi) 3217 movq 48(%rcx),%r14 3218 movq %r15,56(%rdi) 3219 movq 56(%rcx),%r15 3220 movq %rcx,%rdi 3221 jmp .Lsqrx8x_outer_loop 3222 3223.align 32 3224.Lsqrx8x_outer_break: 3225 movq %r9,72(%rdi) 3226.byte 102,72,15,126,217 3227 movq %r10,80(%rdi) 3228 movq %r11,88(%rdi) 3229 movq %r12,96(%rdi) 3230 movq %r13,104(%rdi) 3231 movq %r14,112(%rdi) 3232 leaq 48+8(%rsp),%rdi 3233 movq (%rsi,%rcx,1),%rdx 3234 3235 movq 8(%rdi),%r11 3236 xorq %r10,%r10 3237 movq 0+8(%rsp),%r9 3238 adoxq %r11,%r11 3239 movq 16(%rdi),%r12 3240 movq 24(%rdi),%r13 3241 3242 3243.align 32 3244.Lsqrx4x_shift_n_add: 3245 mulxq %rdx,%rax,%rbx 3246 adoxq %r12,%r12 3247 adcxq %r10,%rax 3248.byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 3249.byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 3250 adoxq %r13,%r13 3251 adcxq %r11,%rbx 3252 movq 40(%rdi),%r11 3253 movq %rax,0(%rdi) 3254 movq %rbx,8(%rdi) 3255 3256 mulxq %rdx,%rax,%rbx 3257 adoxq %r10,%r10 3258 adcxq %r12,%rax 3259 movq 16(%rsi,%rcx,1),%rdx 3260 
# ------------------------------------------------------------------------
# Tail of bn_sqrx8x_internal (the function's entry and CFI prologue are
# above this chunk).  MULX/ADX (BMI2+ADX) path of the Montgomery squaring.
#
# NOTE(review): two carry chains are live through most of this code --
# CF driven by adcxq and OF driven by adoxq.  Instruction order is part
# of the algorithm; do not reorder or "clean up" these sequences.
#
# This first stretch is the interior of the shift-and-add pass: the
# off-diagonal products accumulated earlier are doubled in place
# (adoxq %r,%r == 2*r + OF-carry) while the diagonal squares
# (mulxq %rdx,...) are folded in on the CF chain.
# ------------------------------------------------------------------------
	movq	48(%rdi),%r12
	adoxq	%r11,%r11
	adcxq	%r13,%rbx
	movq	56(%rdi),%r13
	movq	%rax,16(%rdi)
	movq	%rbx,24(%rdi)

	mulxq	%rdx,%rax,%rbx		# rbx:rax = rdx^2, next diagonal square
	adoxq	%r12,%r12
	adcxq	%r10,%rax
	movq	24(%rsi,%rcx,1),%rdx	# load next input limb (rcx is a negative byte index)
	leaq	32(%rcx),%rcx		# advance index by 4 limbs
	movq	64(%rdi),%r10
	adoxq	%r13,%r13
	adcxq	%r11,%rbx
	movq	72(%rdi),%r11
	movq	%rax,32(%rdi)
	movq	%rbx,40(%rdi)

	mulxq	%rdx,%rax,%rbx
	adoxq	%r10,%r10
	adcxq	%r12,%rax
	jrcxz	.Lsqrx4x_shift_n_add_break	# index reached 0 => last group
.byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00	# movq 0(%rsi,%rcx,1),%rdx (disp32 form; presumably long encoding chosen for code alignment)
	adoxq	%r11,%r11
	adcxq	%r13,%rbx
	movq	80(%rdi),%r12
	movq	88(%rdi),%r13
	movq	%rax,48(%rdi)
	movq	%rbx,56(%rdi)
	leaq	64(%rdi),%rdi
	nop
	jmp	.Lsqrx4x_shift_n_add

.align	32
.Lsqrx4x_shift_n_add_break:
	# Flush the final pair of limbs and fall through into the reduction.
	adcxq	%r13,%rbx
	movq	%rax,48(%rdi)
	movq	%rbx,56(%rdi)
	leaq	64(%rdi),%rdi
.byte	102,72,15,126,213	# movq %xmm2,%rbp -- restore pointer stashed in xmm2 by the prologue (presumably the modulus; confirm against the function entry above this chunk)

# ------------------------------------------------------------------------
# __bn_sqrx8x_reduction: Montgomery reduction of the 2n-limb square,
# 8 limbs of the modulus per inner pass, again on dual adcx/adox chains.
# On entry (from stack frame set up by the caller):
#   32+8(%rsp)  n0  = Montgomery key (presumably -N^-1 mod 2^64 -- TODO confirm)
#   48+8(%rsp)  t[0]
#   %rbp        modulus pointer, %r9 related to size, %rdi result area
# ------------------------------------------------------------------------
__bn_sqrx8x_reduction:
	xorl	%eax,%eax		# top-most carry accumulator = 0
	movq	32+8(%rsp),%rbx		# rbx = n0
	movq	48+8(%rsp),%rdx		# rdx = t[0], first limb to kill
	leaq	-64(%rbp,%r9,1),%rcx	# rcx = end-of-modulus sentinel

	movq	%rcx,0+8(%rsp)
	movq	%rdi,8+8(%rsp)

	leaq	48+8(%rsp),%rdi		# rdi = tmp vector
	jmp	.Lsqrx8x_reduction_loop

.align	32
.Lsqrx8x_reduction_loop:
	# Load the next 8 limbs of the intermediate result into r8..r15
	# (r8 comes in live in %rdx/%r8 path below).
	movq	8(%rdi),%r9
	movq	16(%rdi),%r10
	movq	24(%rdi),%r11
	movq	32(%rdi),%r12
	movq	%rdx,%r8
	imulq	%rbx,%rdx		# rdx = t[0]*n0 mod 2^64, this round's multiplier
	movq	40(%rdi),%r13
	movq	48(%rdi),%r14
	movq	56(%rdi),%r15
	movq	%rax,24+8(%rsp)		# stash running top carry

	leaq	64(%rdi),%rdi
	xorq	%rsi,%rsi		# rsi = 0 constant; also clears CF/OF
	movq	$-8,%rcx		# 8 iterations, counting up to 0
	jmp	.Lsqrx8x_reduce

.align	32
.Lsqrx8x_reduce:
	movq	%r8,%rbx
	mulxq	0(%rbp),%rax,%r8	# multiplier * n[0]
	adcxq	%rbx,%rax		# low limb annihilates (result discarded)
	adoxq	%r9,%r8

	mulxq	8(%rbp),%rbx,%r9
	adcxq	%rbx,%r8
	adoxq	%r10,%r9

	mulxq	16(%rbp),%rbx,%r10
	adcxq	%rbx,%r9
	adoxq	%r11,%r10

	mulxq	24(%rbp),%rbx,%r11
	adcxq	%rbx,%r10
	adoxq	%r12,%r11

.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulxq 32(%rbp),%rbx,%r12 (hand-encoded VEX form)
	movq	%rdx,%rax		# save current multiplier
	movq	%r8,%rdx
	adcxq	%rbx,%r11
	adoxq	%r13,%r12

	mulxq	32+8(%rsp),%rbx,%rdx	# rbx = r8*n0: next round's multiplier
	movq	%rax,%rdx		# restore current multiplier
	movq	%rax,64+48+8(%rsp,%rcx,8)	# record multiplier for the tail pass

	mulxq	40(%rbp),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

	mulxq	48(%rbp),%rax,%r14
	adcxq	%rax,%r13
	adoxq	%r15,%r14

	mulxq	56(%rbp),%rax,%r15
	movq	%rbx,%rdx		# switch to the precomputed next multiplier
	adcxq	%rax,%r14
	adoxq	%rsi,%r15		# fold OF chain into r15 ...
	adcxq	%rsi,%r15		# ... then CF chain (rsi == 0)

.byte	0x67,0x67,0x67		# three address-size prefixes: padding attached to the next insn (alignment, no semantic effect)
	incq	%rcx
	jnz	.Lsqrx8x_reduce

	movq	%rsi,%rax		# rax = 0
	cmpq	0+8(%rsp),%rbp		# reached end of modulus?
	jae	.Lsqrx8x_no_tail

	# More modulus limbs remain: absorb the next 8 result limbs and
	# remember the borrow/carry in 16+8(%rsp) as a 0/-1 mask.
	movq	48+8(%rsp),%rdx
	addq	0(%rdi),%r8
	leaq	64(%rbp),%rbp
	movq	$-8,%rcx
	adcxq	8(%rdi),%r9
	adcxq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	leaq	64(%rdi),%rdi
	sbbq	%rax,%rax		# rax = -CF

	xorq	%rsi,%rsi		# clear CF/OF for the tail chains
	movq	%rax,16+8(%rsp)
	jmp	.Lsqrx8x_tail

.align	32
.Lsqrx8x_tail:
	# Same 8-limb multiply-accumulate as .Lsqrx8x_reduce, but the
	# multipliers are replayed from the saved 64+48+8(%rsp) slots.
	movq	%r8,%rbx
	mulxq	0(%rbp),%rax,%r8
	adcxq	%rax,%rbx
	adoxq	%r9,%r8

	mulxq	8(%rbp),%rax,%r9
	adcxq	%rax,%r8
	adoxq	%r10,%r9

	mulxq	16(%rbp),%rax,%r10
	adcxq	%rax,%r9
	adoxq	%r11,%r10

	mulxq	24(%rbp),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11

.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulxq 32(%rbp),%rax,%r12 (hand-encoded VEX form)
	adcxq	%rax,%r11
	adoxq	%r13,%r12

	mulxq	40(%rbp),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

	mulxq	48(%rbp),%rax,%r14
	adcxq	%rax,%r13
	adoxq	%r15,%r14

	mulxq	56(%rbp),%rax,%r15
	movq	72+48+8(%rsp,%rcx,8),%rdx	# next saved multiplier
	adcxq	%rax,%r14
	adoxq	%rsi,%r15
	movq	%rbx,(%rdi,%rcx,8)	# store completed limb
	movq	%r8,%rbx
	adcxq	%rsi,%r15

	incq	%rcx
	jnz	.Lsqrx8x_tail

	cmpq	0+8(%rsp),%rbp		# end of modulus?
	jae	.Lsqrx8x_tail_done

	# Absorb next 8 result limbs, re-seeding CF from the saved mask.
	subq	16+8(%rsp),%rsi		# rsi = 0 - mask => CF = saved carry
	movq	48+8(%rsp),%rdx
	leaq	64(%rbp),%rbp
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	leaq	64(%rdi),%rdi
	sbbq	%rax,%rax
	subq	$8,%rcx

	xorq	%rsi,%rsi
	movq	%rax,16+8(%rsp)
	jmp	.Lsqrx8x_tail

.align	32
.Lsqrx8x_tail_done:
	# Fold the stashed top carry back into the 8-limb window.
	xorq	%rax,%rax
	addq	24+8(%rsp),%r8
	adcq	$0,%r9
	adcq	$0,%r10
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15
	adcq	$0,%rax

	subq	16+8(%rsp),%rsi		# CF = saved carry mask
.Lsqrx8x_no_tail:
	adcq	0(%rdi),%r8
.byte	102,72,15,126,217	# movq %xmm3,%rcx -- restore value stashed in xmm3 by the prologue (above this chunk)
	adcq	8(%rdi),%r9
	movq	56(%rbp),%rsi		# top modulus limb (used by caller's final compare, presumably)
.byte	102,72,15,126,213	# movq %xmm2,%rbp -- restore stashed pointer
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	adcq	$0,%rax			# rax = final top carry

	movq	32+8(%rsp),%rbx		# reload n0
	movq	64(%rdi,%rcx,1),%rdx	# next t[0]

	# Write back the reduced 8-limb window.
	movq	%r8,0(%rdi)
	leaq	64(%rdi),%r8
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	leaq	64(%rdi,%rcx,1),%rdi
	cmpq	8+8(%rsp),%r8
	jb	.Lsqrx8x_reduction_loop
	.byte	0xf3,0xc3		# rep ret (branch-predictor-friendly return)
.cfi_endproc
.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal

# ------------------------------------------------------------------------
# __bn_postx4x_internal: constant-time final conditional subtraction for
# the MULX path.  %rax holds the top carry; negq turns it into a 0/-1
# mask.  Note the -N trick: r12 is pre-decremented (n[0]-1) so that
# andnq (dest = ~src1 & mask) yields (-n[0]) & mask via two's complement;
# the adcq chain then adds either 0 or -N, i.e. conditionally subtracts
# the modulus without a data-dependent branch.
# ------------------------------------------------------------------------
.align	32
__bn_postx4x_internal:
	movq	0(%rbp),%r12
	movq	%rcx,%r10		# save count
	movq	%rcx,%r9		# (restored via negq %r9 at the end)
	negq	%rax			# rax = 0 or all-ones mask; also seeds nothing: CF re-cleared below
	sarq	$3+2,%rcx		# byte count -> 4-limb group count (negative)

.byte	102,72,15,126,202	# movq %xmm1,%rdx -- destination pointer stashed in xmm1 (prologue above this chunk)
.byte	102,72,15,126,206	# movq %xmm1,%rsi -- same pointer, second copy
	decq	%r12			# r12 = n[0]-1, see -N trick above
	movq	8(%rbp),%r13
	xorq	%r8,%r8			# r8 = 0 => first negq %r8 clears CF
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqrx4x_sub_entry

.align	16
.Lsqrx4x_sub:
	movq	0(%rbp),%r12
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
.Lsqrx4x_sub_entry:
	andnq	%rax,%r12,%r12		# r12 = ~r12 & mask  (== -n[i] & mask on first limb)
	leaq	32(%rbp),%rbp
	andnq	%rax,%r13,%r13
	andnq	%rax,%r14,%r14
	andnq	%rax,%r15,%r15

	negq	%r8			# CF = borrow from previous group
	adcq	0(%rdi),%r12
	adcq	8(%rdi),%r13
	adcq	16(%rdi),%r14
	adcq	24(%rdi),%r15
	movq	%r12,0(%rdx)
	leaq	32(%rdi),%rdi
	movq	%r13,8(%rdx)
	sbbq	%r8,%r8			# r8 = -CF, carried to next group
	movq	%r14,16(%rdx)
	movq	%r15,24(%rdx)
	leaq	32(%rdx),%rdx

	incq	%rcx
	jnz	.Lsqrx4x_sub

	negq	%r9			# restore sign of the limb count

	.byte	0xf3,0xc3		# rep ret
.size	__bn_postx4x_internal,.-__bn_postx4x_internal

# ------------------------------------------------------------------------
# int bn_get_bits5(const void *ap /*rdi*/, int off /*esi*/)
# Returns the 5 bits of the bit stream at *ap starting at bit index off.
# Loads 16 bits at a time; for bit positions 12..15 inside a word the
# window would straddle the load, so it re-reads from byte address ap+1
# with the shift reduced by 8 (cmova pair), keeping the access in-bounds.
# ------------------------------------------------------------------------
.globl	bn_get_bits5
.type	bn_get_bits5,@function
.align	16
bn_get_bits5:
	leaq	0(%rdi),%r10		# candidate base: ap
	leaq	1(%rdi),%r11		# candidate base: ap+1
	movl	%esi,%ecx
	shrl	$4,%esi			# esi = 16-bit word index
	andl	$15,%ecx		# ecx = bit offset within word
	leal	-8(%rcx),%eax		# alternative shift if we rebase
	cmpl	$11,%ecx
	cmovaq	%r11,%r10		# offset >11: read from ap+1 ...
	cmoval	%eax,%ecx		# ... and shift by offset-8
	movzwl	(%r10,%rsi,2),%eax
	shrl	%cl,%eax
	andl	$31,%eax		# keep 5 bits
	.byte	0xf3,0xc3		# rep ret
.size	bn_get_bits5,.-bn_get_bits5

# ------------------------------------------------------------------------
# void bn_scatter5(const u64 *inp /*rdi*/, size_t num /*esi*/,
#                  void *tbl /*rdx*/, size_t idx /*rcx*/)
# Stores num limbs of inp into the power table at slot idx, one limb per
# 256-byte row (the interleaving that bn_gather5 reads back).
# ------------------------------------------------------------------------
.globl	bn_scatter5
.type	bn_scatter5,@function
.align	16
bn_scatter5:
	cmpl	$0,%esi
	jz	.Lscatter_epilogue	# num == 0: nothing to do
	leaq	(%rdx,%rcx,8),%rdx	# column for this idx
.Lscatter:
	movq	(%rdi),%rax
	leaq	8(%rdi),%rdi
	movq	%rax,(%rdx)
	leaq	256(%rdx),%rdx		# next row (stride = 32 entries * 8 bytes)
	subl	$1,%esi
	jnz	.Lscatter
.Lscatter_epilogue:
	.byte	0xf3,0xc3		# rep ret
.size	bn_scatter5,.-bn_scatter5

# ------------------------------------------------------------------------
# void bn_gather5(u64 *out /*rdi*/, size_t num /*esi*/,
#                 void *tbl /*rdx*/, size_t idx /*ecx*/)
# Cache-timing-safe gather: builds a one-hot SSE2 mask for idx (0..31) on
# the stack, then for every limb ANDs the mask against ALL 32 table
# entries of the 256-byte row and ORs the lanes together -- every entry
# is touched regardless of idx, so the memory access pattern is
# independent of the secret index.
# ------------------------------------------------------------------------
.globl	bn_gather5
.type	bn_gather5,@function
.align	32
bn_gather5:
.LSEH_begin_bn_gather5:
	# Hand-encoded prologue (kept as .byte so the Win64 SEH prologue
	# byte offsets stay fixed):
.byte	0x4c,0x8d,0x14,0x24		# leaq (%rsp),%r10 -- save original rsp
.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00	# subq $0x108,%rsp -- 264-byte mask buffer
	leaq	.Linc(%rip),%rax
	andq	$-16,%rsp		# 16-byte align for movdqa

	# Build 16 xmm mask slots: slot i holds pcmpeqd(idx, {2i, 2i+1})
	# per dword pair, i.e. a one-hot selector across 32 positions.
	movd	%ecx,%xmm5
	movdqa	0(%rax),%xmm0		# {0,0,1,1}
	movdqa	16(%rax),%xmm1		# {2,2,2,2} increment
	leaq	128(%rdx),%r11		# biased table pointer
	leaq	128(%rsp),%rax		# biased mask pointer

	pshufd	$0,%xmm5,%xmm5		# broadcast idx to all dwords
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,-128(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,-112(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,-96(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,-80(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,-64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,-48(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,-32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,-16(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,16(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,48(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,80(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,96(%rax)
	movdqa	%xmm4,%xmm2
	movdqa	%xmm3,112(%rax)		# last of the 16 mask slots
	jmp	.Lgather

.align	32
.Lgather:
	# One limb per iteration: AND each 16-byte table chunk with its
	# mask slot, OR everything into xmm4/xmm5 (split to shorten the
	# dependency chains), then fold and store 8 bytes.
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	movdqa	-128(%r11),%xmm0
	movdqa	-112(%r11),%xmm1
	movdqa	-96(%r11),%xmm2
	pand	-128(%rax),%xmm0
	movdqa	-80(%r11),%xmm3
	pand	-112(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r11),%xmm0
	movdqa	-48(%r11),%xmm1
	movdqa	-32(%r11),%xmm2
	pand	-64(%rax),%xmm0
	movdqa	-16(%r11),%xmm3
	pand	-48(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r11),%xmm0
	movdqa	16(%r11),%xmm1
	movdqa	32(%r11),%xmm2
	pand	0(%rax),%xmm0
	movdqa	48(%r11),%xmm3
	pand	16(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r11),%xmm0
	movdqa	80(%r11),%xmm1
	movdqa	96(%r11),%xmm2
	pand	64(%rax),%xmm0
	movdqa	112(%r11),%xmm3
	pand	80(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	leaq	256(%r11),%r11		# next table row
	pshufd	$0x4e,%xmm4,%xmm0	# swap qwords ...
	por	%xmm4,%xmm0		# ... and fold: low qword = selected limb
	movq	%xmm0,(%rdi)
	leaq	8(%rdi),%rdi
	subl	$1,%esi
	jnz	.Lgather

	leaq	(%r10),%rsp		# restore caller's rsp
	.byte	0xf3,0xc3		# rep ret
.LSEH_end_bn_gather5:
.size	bn_gather5,.-bn_gather5
.align	64
.Linc:
	# Dword-pair iota seeds for the mask builder above:
	# {0,0,1,1} start values, {2,2,2,2} per-step increment.
.long	0,0, 1,1
.long	2,2, 2,2
	# ASCII: "Montgomery Multiplication with scatter/gather for x86_64,
	# CRYPTOGAMS by <appro@openssl.org>"
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0