/* x86_64-mont5.S revision 1.4 */
1#include <machine/asm.h> 2.text 3 4 5 6.globl bn_mul_mont_gather5 7.type bn_mul_mont_gather5,@function 8.align 64 9bn_mul_mont_gather5: 10 movl %r9d,%r9d 11 movq %rsp,%rax 12 testl $7,%r9d 13 jnz .Lmul_enter 14 movl OPENSSL_ia32cap_P+8(%rip),%r11d 15 jmp .Lmul4x_enter 16 17.align 16 18.Lmul_enter: 19 movd 8(%rsp),%xmm5 20 pushq %rbx 21 pushq %rbp 22 pushq %r12 23 pushq %r13 24 pushq %r14 25 pushq %r15 26 27 negq %r9 28 movq %rsp,%r11 29 leaq -280(%rsp,%r9,8),%r10 30 negq %r9 31 andq $-1024,%r10 32 33 34 35 36 37 38 39 subq %r10,%r11 40 andq $-4096,%r11 41 leaq (%r10,%r11,1),%rsp 42 movq (%rsp),%r11 43 cmpq %r10,%rsp 44 ja .Lmul_page_walk 45 jmp .Lmul_page_walk_done 46 47.Lmul_page_walk: 48 leaq -4096(%rsp),%rsp 49 movq (%rsp),%r11 50 cmpq %r10,%rsp 51 ja .Lmul_page_walk 52.Lmul_page_walk_done: 53 54 leaq .Linc(%rip),%r10 55 movq %rax,8(%rsp,%r9,8) 56.Lmul_body: 57 58 leaq 128(%rdx),%r12 59 movdqa 0(%r10),%xmm0 60 movdqa 16(%r10),%xmm1 61 leaq 24-112(%rsp,%r9,8),%r10 62 andq $-16,%r10 63 64 pshufd $0,%xmm5,%xmm5 65 movdqa %xmm1,%xmm4 66 movdqa %xmm1,%xmm2 67 paddd %xmm0,%xmm1 68 pcmpeqd %xmm5,%xmm0 69.byte 0x67 70 movdqa %xmm4,%xmm3 71 paddd %xmm1,%xmm2 72 pcmpeqd %xmm5,%xmm1 73 movdqa %xmm0,112(%r10) 74 movdqa %xmm4,%xmm0 75 76 paddd %xmm2,%xmm3 77 pcmpeqd %xmm5,%xmm2 78 movdqa %xmm1,128(%r10) 79 movdqa %xmm4,%xmm1 80 81 paddd %xmm3,%xmm0 82 pcmpeqd %xmm5,%xmm3 83 movdqa %xmm2,144(%r10) 84 movdqa %xmm4,%xmm2 85 86 paddd %xmm0,%xmm1 87 pcmpeqd %xmm5,%xmm0 88 movdqa %xmm3,160(%r10) 89 movdqa %xmm4,%xmm3 90 paddd %xmm1,%xmm2 91 pcmpeqd %xmm5,%xmm1 92 movdqa %xmm0,176(%r10) 93 movdqa %xmm4,%xmm0 94 95 paddd %xmm2,%xmm3 96 pcmpeqd %xmm5,%xmm2 97 movdqa %xmm1,192(%r10) 98 movdqa %xmm4,%xmm1 99 100 paddd %xmm3,%xmm0 101 pcmpeqd %xmm5,%xmm3 102 movdqa %xmm2,208(%r10) 103 movdqa %xmm4,%xmm2 104 105 paddd %xmm0,%xmm1 106 pcmpeqd %xmm5,%xmm0 107 movdqa %xmm3,224(%r10) 108 movdqa %xmm4,%xmm3 109 paddd %xmm1,%xmm2 110 pcmpeqd %xmm5,%xmm1 111 movdqa %xmm0,240(%r10) 112 movdqa 
%xmm4,%xmm0 113 114 paddd %xmm2,%xmm3 115 pcmpeqd %xmm5,%xmm2 116 movdqa %xmm1,256(%r10) 117 movdqa %xmm4,%xmm1 118 119 paddd %xmm3,%xmm0 120 pcmpeqd %xmm5,%xmm3 121 movdqa %xmm2,272(%r10) 122 movdqa %xmm4,%xmm2 123 124 paddd %xmm0,%xmm1 125 pcmpeqd %xmm5,%xmm0 126 movdqa %xmm3,288(%r10) 127 movdqa %xmm4,%xmm3 128 paddd %xmm1,%xmm2 129 pcmpeqd %xmm5,%xmm1 130 movdqa %xmm0,304(%r10) 131 132 paddd %xmm2,%xmm3 133.byte 0x67 134 pcmpeqd %xmm5,%xmm2 135 movdqa %xmm1,320(%r10) 136 137 pcmpeqd %xmm5,%xmm3 138 movdqa %xmm2,336(%r10) 139 pand 64(%r12),%xmm0 140 141 pand 80(%r12),%xmm1 142 pand 96(%r12),%xmm2 143 movdqa %xmm3,352(%r10) 144 pand 112(%r12),%xmm3 145 por %xmm2,%xmm0 146 por %xmm3,%xmm1 147 movdqa -128(%r12),%xmm4 148 movdqa -112(%r12),%xmm5 149 movdqa -96(%r12),%xmm2 150 pand 112(%r10),%xmm4 151 movdqa -80(%r12),%xmm3 152 pand 128(%r10),%xmm5 153 por %xmm4,%xmm0 154 pand 144(%r10),%xmm2 155 por %xmm5,%xmm1 156 pand 160(%r10),%xmm3 157 por %xmm2,%xmm0 158 por %xmm3,%xmm1 159 movdqa -64(%r12),%xmm4 160 movdqa -48(%r12),%xmm5 161 movdqa -32(%r12),%xmm2 162 pand 176(%r10),%xmm4 163 movdqa -16(%r12),%xmm3 164 pand 192(%r10),%xmm5 165 por %xmm4,%xmm0 166 pand 208(%r10),%xmm2 167 por %xmm5,%xmm1 168 pand 224(%r10),%xmm3 169 por %xmm2,%xmm0 170 por %xmm3,%xmm1 171 movdqa 0(%r12),%xmm4 172 movdqa 16(%r12),%xmm5 173 movdqa 32(%r12),%xmm2 174 pand 240(%r10),%xmm4 175 movdqa 48(%r12),%xmm3 176 pand 256(%r10),%xmm5 177 por %xmm4,%xmm0 178 pand 272(%r10),%xmm2 179 por %xmm5,%xmm1 180 pand 288(%r10),%xmm3 181 por %xmm2,%xmm0 182 por %xmm3,%xmm1 183 por %xmm1,%xmm0 184 pshufd $0x4e,%xmm0,%xmm1 185 por %xmm1,%xmm0 186 leaq 256(%r12),%r12 187.byte 102,72,15,126,195 188 189 movq (%r8),%r8 190 movq (%rsi),%rax 191 192 xorq %r14,%r14 193 xorq %r15,%r15 194 195 movq %r8,%rbp 196 mulq %rbx 197 movq %rax,%r10 198 movq (%rcx),%rax 199 200 imulq %r10,%rbp 201 movq %rdx,%r11 202 203 mulq %rbp 204 addq %rax,%r10 205 movq 8(%rsi),%rax 206 adcq $0,%rdx 207 movq %rdx,%r13 208 209 leaq 
1(%r15),%r15 210 jmp .L1st_enter 211 212.align 16 213.L1st: 214 addq %rax,%r13 215 movq (%rsi,%r15,8),%rax 216 adcq $0,%rdx 217 addq %r11,%r13 218 movq %r10,%r11 219 adcq $0,%rdx 220 movq %r13,-16(%rsp,%r15,8) 221 movq %rdx,%r13 222 223.L1st_enter: 224 mulq %rbx 225 addq %rax,%r11 226 movq (%rcx,%r15,8),%rax 227 adcq $0,%rdx 228 leaq 1(%r15),%r15 229 movq %rdx,%r10 230 231 mulq %rbp 232 cmpq %r9,%r15 233 jne .L1st 234 235 236 addq %rax,%r13 237 adcq $0,%rdx 238 addq %r11,%r13 239 adcq $0,%rdx 240 movq %r13,-16(%rsp,%r9,8) 241 movq %rdx,%r13 242 movq %r10,%r11 243 244 xorq %rdx,%rdx 245 addq %r11,%r13 246 adcq $0,%rdx 247 movq %r13,-8(%rsp,%r9,8) 248 movq %rdx,(%rsp,%r9,8) 249 250 leaq 1(%r14),%r14 251 jmp .Louter 252.align 16 253.Louter: 254 leaq 24+128(%rsp,%r9,8),%rdx 255 andq $-16,%rdx 256 pxor %xmm4,%xmm4 257 pxor %xmm5,%xmm5 258 movdqa -128(%r12),%xmm0 259 movdqa -112(%r12),%xmm1 260 movdqa -96(%r12),%xmm2 261 movdqa -80(%r12),%xmm3 262 pand -128(%rdx),%xmm0 263 pand -112(%rdx),%xmm1 264 por %xmm0,%xmm4 265 pand -96(%rdx),%xmm2 266 por %xmm1,%xmm5 267 pand -80(%rdx),%xmm3 268 por %xmm2,%xmm4 269 por %xmm3,%xmm5 270 movdqa -64(%r12),%xmm0 271 movdqa -48(%r12),%xmm1 272 movdqa -32(%r12),%xmm2 273 movdqa -16(%r12),%xmm3 274 pand -64(%rdx),%xmm0 275 pand -48(%rdx),%xmm1 276 por %xmm0,%xmm4 277 pand -32(%rdx),%xmm2 278 por %xmm1,%xmm5 279 pand -16(%rdx),%xmm3 280 por %xmm2,%xmm4 281 por %xmm3,%xmm5 282 movdqa 0(%r12),%xmm0 283 movdqa 16(%r12),%xmm1 284 movdqa 32(%r12),%xmm2 285 movdqa 48(%r12),%xmm3 286 pand 0(%rdx),%xmm0 287 pand 16(%rdx),%xmm1 288 por %xmm0,%xmm4 289 pand 32(%rdx),%xmm2 290 por %xmm1,%xmm5 291 pand 48(%rdx),%xmm3 292 por %xmm2,%xmm4 293 por %xmm3,%xmm5 294 movdqa 64(%r12),%xmm0 295 movdqa 80(%r12),%xmm1 296 movdqa 96(%r12),%xmm2 297 movdqa 112(%r12),%xmm3 298 pand 64(%rdx),%xmm0 299 pand 80(%rdx),%xmm1 300 por %xmm0,%xmm4 301 pand 96(%rdx),%xmm2 302 por %xmm1,%xmm5 303 pand 112(%rdx),%xmm3 304 por %xmm2,%xmm4 305 por %xmm3,%xmm5 306 por 
%xmm5,%xmm4 307 pshufd $0x4e,%xmm4,%xmm0 308 por %xmm4,%xmm0 309 leaq 256(%r12),%r12 310 311 movq (%rsi),%rax 312.byte 102,72,15,126,195 313 314 xorq %r15,%r15 315 movq %r8,%rbp 316 movq (%rsp),%r10 317 318 mulq %rbx 319 addq %rax,%r10 320 movq (%rcx),%rax 321 adcq $0,%rdx 322 323 imulq %r10,%rbp 324 movq %rdx,%r11 325 326 mulq %rbp 327 addq %rax,%r10 328 movq 8(%rsi),%rax 329 adcq $0,%rdx 330 movq 8(%rsp),%r10 331 movq %rdx,%r13 332 333 leaq 1(%r15),%r15 334 jmp .Linner_enter 335 336.align 16 337.Linner: 338 addq %rax,%r13 339 movq (%rsi,%r15,8),%rax 340 adcq $0,%rdx 341 addq %r10,%r13 342 movq (%rsp,%r15,8),%r10 343 adcq $0,%rdx 344 movq %r13,-16(%rsp,%r15,8) 345 movq %rdx,%r13 346 347.Linner_enter: 348 mulq %rbx 349 addq %rax,%r11 350 movq (%rcx,%r15,8),%rax 351 adcq $0,%rdx 352 addq %r11,%r10 353 movq %rdx,%r11 354 adcq $0,%r11 355 leaq 1(%r15),%r15 356 357 mulq %rbp 358 cmpq %r9,%r15 359 jne .Linner 360 361 addq %rax,%r13 362 adcq $0,%rdx 363 addq %r10,%r13 364 movq (%rsp,%r9,8),%r10 365 adcq $0,%rdx 366 movq %r13,-16(%rsp,%r9,8) 367 movq %rdx,%r13 368 369 xorq %rdx,%rdx 370 addq %r11,%r13 371 adcq $0,%rdx 372 addq %r10,%r13 373 adcq $0,%rdx 374 movq %r13,-8(%rsp,%r9,8) 375 movq %rdx,(%rsp,%r9,8) 376 377 leaq 1(%r14),%r14 378 cmpq %r9,%r14 379 jb .Louter 380 381 xorq %r14,%r14 382 movq (%rsp),%rax 383 leaq (%rsp),%rsi 384 movq %r9,%r15 385 jmp .Lsub 386.align 16 387.Lsub: sbbq (%rcx,%r14,8),%rax 388 movq %rax,(%rdi,%r14,8) 389 movq 8(%rsi,%r14,8),%rax 390 leaq 1(%r14),%r14 391 decq %r15 392 jnz .Lsub 393 394 sbbq $0,%rax 395 xorq %r14,%r14 396 andq %rax,%rsi 397 notq %rax 398 movq %rdi,%rcx 399 andq %rax,%rcx 400 movq %r9,%r15 401 orq %rcx,%rsi 402.align 16 403.Lcopy: 404 movq (%rsi,%r14,8),%rax 405 movq %r14,(%rsp,%r14,8) 406 movq %rax,(%rdi,%r14,8) 407 leaq 1(%r14),%r14 408 subq $1,%r15 409 jnz .Lcopy 410 411 movq 8(%rsp,%r9,8),%rsi 412 movq $1,%rax 413 414 movq -48(%rsi),%r15 415 movq -40(%rsi),%r14 416 movq -32(%rsi),%r13 417 movq -24(%rsi),%r12 418 movq 
-16(%rsi),%rbp 419 movq -8(%rsi),%rbx 420 leaq (%rsi),%rsp 421.Lmul_epilogue: 422 .byte 0xf3,0xc3 423.size bn_mul_mont_gather5,.-bn_mul_mont_gather5 424.type bn_mul4x_mont_gather5,@function 425.align 32 426bn_mul4x_mont_gather5: 427.byte 0x67 428 movq %rsp,%rax 429.Lmul4x_enter: 430 andl $0x80108,%r11d 431 cmpl $0x80108,%r11d 432 je .Lmulx4x_enter 433 pushq %rbx 434 pushq %rbp 435 pushq %r12 436 pushq %r13 437 pushq %r14 438 pushq %r15 439.Lmul4x_prologue: 440 441.byte 0x67 442 shll $3,%r9d 443 leaq (%r9,%r9,2),%r10 444 negq %r9 445 446 447 448 449 450 451 452 453 454 455 leaq -320(%rsp,%r9,2),%r11 456 movq %rsp,%rbp 457 subq %rdi,%r11 458 andq $4095,%r11 459 cmpq %r11,%r10 460 jb .Lmul4xsp_alt 461 subq %r11,%rbp 462 leaq -320(%rbp,%r9,2),%rbp 463 jmp .Lmul4xsp_done 464 465.align 32 466.Lmul4xsp_alt: 467 leaq 4096-320(,%r9,2),%r10 468 leaq -320(%rbp,%r9,2),%rbp 469 subq %r10,%r11 470 movq $0,%r10 471 cmovcq %r10,%r11 472 subq %r11,%rbp 473.Lmul4xsp_done: 474 andq $-64,%rbp 475 movq %rsp,%r11 476 subq %rbp,%r11 477 andq $-4096,%r11 478 leaq (%r11,%rbp,1),%rsp 479 movq (%rsp),%r10 480 cmpq %rbp,%rsp 481 ja .Lmul4x_page_walk 482 jmp .Lmul4x_page_walk_done 483 484.Lmul4x_page_walk: 485 leaq -4096(%rsp),%rsp 486 movq (%rsp),%r10 487 cmpq %rbp,%rsp 488 ja .Lmul4x_page_walk 489.Lmul4x_page_walk_done: 490 491 negq %r9 492 493 movq %rax,40(%rsp) 494.Lmul4x_body: 495 496 call mul4x_internal 497 498 movq 40(%rsp),%rsi 499 movq $1,%rax 500 501 movq -48(%rsi),%r15 502 movq -40(%rsi),%r14 503 movq -32(%rsi),%r13 504 movq -24(%rsi),%r12 505 movq -16(%rsi),%rbp 506 movq -8(%rsi),%rbx 507 leaq (%rsi),%rsp 508.Lmul4x_epilogue: 509 .byte 0xf3,0xc3 510.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 511 512.type mul4x_internal,@function 513.align 32 514mul4x_internal: 515 shlq $5,%r9 516 movd 8(%rax),%xmm5 517 leaq .Linc(%rip),%rax 518 leaq 128(%rdx,%r9,1),%r13 519 shrq $5,%r9 520 movdqa 0(%rax),%xmm0 521 movdqa 16(%rax),%xmm1 522 leaq 88-112(%rsp,%r9,1),%r10 523 leaq 
128(%rdx),%r12 524 525 pshufd $0,%xmm5,%xmm5 526 movdqa %xmm1,%xmm4 527.byte 0x67,0x67 528 movdqa %xmm1,%xmm2 529 paddd %xmm0,%xmm1 530 pcmpeqd %xmm5,%xmm0 531.byte 0x67 532 movdqa %xmm4,%xmm3 533 paddd %xmm1,%xmm2 534 pcmpeqd %xmm5,%xmm1 535 movdqa %xmm0,112(%r10) 536 movdqa %xmm4,%xmm0 537 538 paddd %xmm2,%xmm3 539 pcmpeqd %xmm5,%xmm2 540 movdqa %xmm1,128(%r10) 541 movdqa %xmm4,%xmm1 542 543 paddd %xmm3,%xmm0 544 pcmpeqd %xmm5,%xmm3 545 movdqa %xmm2,144(%r10) 546 movdqa %xmm4,%xmm2 547 548 paddd %xmm0,%xmm1 549 pcmpeqd %xmm5,%xmm0 550 movdqa %xmm3,160(%r10) 551 movdqa %xmm4,%xmm3 552 paddd %xmm1,%xmm2 553 pcmpeqd %xmm5,%xmm1 554 movdqa %xmm0,176(%r10) 555 movdqa %xmm4,%xmm0 556 557 paddd %xmm2,%xmm3 558 pcmpeqd %xmm5,%xmm2 559 movdqa %xmm1,192(%r10) 560 movdqa %xmm4,%xmm1 561 562 paddd %xmm3,%xmm0 563 pcmpeqd %xmm5,%xmm3 564 movdqa %xmm2,208(%r10) 565 movdqa %xmm4,%xmm2 566 567 paddd %xmm0,%xmm1 568 pcmpeqd %xmm5,%xmm0 569 movdqa %xmm3,224(%r10) 570 movdqa %xmm4,%xmm3 571 paddd %xmm1,%xmm2 572 pcmpeqd %xmm5,%xmm1 573 movdqa %xmm0,240(%r10) 574 movdqa %xmm4,%xmm0 575 576 paddd %xmm2,%xmm3 577 pcmpeqd %xmm5,%xmm2 578 movdqa %xmm1,256(%r10) 579 movdqa %xmm4,%xmm1 580 581 paddd %xmm3,%xmm0 582 pcmpeqd %xmm5,%xmm3 583 movdqa %xmm2,272(%r10) 584 movdqa %xmm4,%xmm2 585 586 paddd %xmm0,%xmm1 587 pcmpeqd %xmm5,%xmm0 588 movdqa %xmm3,288(%r10) 589 movdqa %xmm4,%xmm3 590 paddd %xmm1,%xmm2 591 pcmpeqd %xmm5,%xmm1 592 movdqa %xmm0,304(%r10) 593 594 paddd %xmm2,%xmm3 595.byte 0x67 596 pcmpeqd %xmm5,%xmm2 597 movdqa %xmm1,320(%r10) 598 599 pcmpeqd %xmm5,%xmm3 600 movdqa %xmm2,336(%r10) 601 pand 64(%r12),%xmm0 602 603 pand 80(%r12),%xmm1 604 pand 96(%r12),%xmm2 605 movdqa %xmm3,352(%r10) 606 pand 112(%r12),%xmm3 607 por %xmm2,%xmm0 608 por %xmm3,%xmm1 609 movdqa -128(%r12),%xmm4 610 movdqa -112(%r12),%xmm5 611 movdqa -96(%r12),%xmm2 612 pand 112(%r10),%xmm4 613 movdqa -80(%r12),%xmm3 614 pand 128(%r10),%xmm5 615 por %xmm4,%xmm0 616 pand 144(%r10),%xmm2 617 por %xmm5,%xmm1 618 
pand 160(%r10),%xmm3 619 por %xmm2,%xmm0 620 por %xmm3,%xmm1 621 movdqa -64(%r12),%xmm4 622 movdqa -48(%r12),%xmm5 623 movdqa -32(%r12),%xmm2 624 pand 176(%r10),%xmm4 625 movdqa -16(%r12),%xmm3 626 pand 192(%r10),%xmm5 627 por %xmm4,%xmm0 628 pand 208(%r10),%xmm2 629 por %xmm5,%xmm1 630 pand 224(%r10),%xmm3 631 por %xmm2,%xmm0 632 por %xmm3,%xmm1 633 movdqa 0(%r12),%xmm4 634 movdqa 16(%r12),%xmm5 635 movdqa 32(%r12),%xmm2 636 pand 240(%r10),%xmm4 637 movdqa 48(%r12),%xmm3 638 pand 256(%r10),%xmm5 639 por %xmm4,%xmm0 640 pand 272(%r10),%xmm2 641 por %xmm5,%xmm1 642 pand 288(%r10),%xmm3 643 por %xmm2,%xmm0 644 por %xmm3,%xmm1 645 por %xmm1,%xmm0 646 pshufd $0x4e,%xmm0,%xmm1 647 por %xmm1,%xmm0 648 leaq 256(%r12),%r12 649.byte 102,72,15,126,195 650 651 movq %r13,16+8(%rsp) 652 movq %rdi,56+8(%rsp) 653 654 movq (%r8),%r8 655 movq (%rsi),%rax 656 leaq (%rsi,%r9,1),%rsi 657 negq %r9 658 659 movq %r8,%rbp 660 mulq %rbx 661 movq %rax,%r10 662 movq (%rcx),%rax 663 664 imulq %r10,%rbp 665 leaq 64+8(%rsp),%r14 666 movq %rdx,%r11 667 668 mulq %rbp 669 addq %rax,%r10 670 movq 8(%rsi,%r9,1),%rax 671 adcq $0,%rdx 672 movq %rdx,%rdi 673 674 mulq %rbx 675 addq %rax,%r11 676 movq 8(%rcx),%rax 677 adcq $0,%rdx 678 movq %rdx,%r10 679 680 mulq %rbp 681 addq %rax,%rdi 682 movq 16(%rsi,%r9,1),%rax 683 adcq $0,%rdx 684 addq %r11,%rdi 685 leaq 32(%r9),%r15 686 leaq 32(%rcx),%rcx 687 adcq $0,%rdx 688 movq %rdi,(%r14) 689 movq %rdx,%r13 690 jmp .L1st4x 691 692.align 32 693.L1st4x: 694 mulq %rbx 695 addq %rax,%r10 696 movq -16(%rcx),%rax 697 leaq 32(%r14),%r14 698 adcq $0,%rdx 699 movq %rdx,%r11 700 701 mulq %rbp 702 addq %rax,%r13 703 movq -8(%rsi,%r15,1),%rax 704 adcq $0,%rdx 705 addq %r10,%r13 706 adcq $0,%rdx 707 movq %r13,-24(%r14) 708 movq %rdx,%rdi 709 710 mulq %rbx 711 addq %rax,%r11 712 movq -8(%rcx),%rax 713 adcq $0,%rdx 714 movq %rdx,%r10 715 716 mulq %rbp 717 addq %rax,%rdi 718 movq (%rsi,%r15,1),%rax 719 adcq $0,%rdx 720 addq %r11,%rdi 721 adcq $0,%rdx 722 movq %rdi,-16(%r14) 723 
movq %rdx,%r13 724 725 mulq %rbx 726 addq %rax,%r10 727 movq 0(%rcx),%rax 728 adcq $0,%rdx 729 movq %rdx,%r11 730 731 mulq %rbp 732 addq %rax,%r13 733 movq 8(%rsi,%r15,1),%rax 734 adcq $0,%rdx 735 addq %r10,%r13 736 adcq $0,%rdx 737 movq %r13,-8(%r14) 738 movq %rdx,%rdi 739 740 mulq %rbx 741 addq %rax,%r11 742 movq 8(%rcx),%rax 743 adcq $0,%rdx 744 movq %rdx,%r10 745 746 mulq %rbp 747 addq %rax,%rdi 748 movq 16(%rsi,%r15,1),%rax 749 adcq $0,%rdx 750 addq %r11,%rdi 751 leaq 32(%rcx),%rcx 752 adcq $0,%rdx 753 movq %rdi,(%r14) 754 movq %rdx,%r13 755 756 addq $32,%r15 757 jnz .L1st4x 758 759 mulq %rbx 760 addq %rax,%r10 761 movq -16(%rcx),%rax 762 leaq 32(%r14),%r14 763 adcq $0,%rdx 764 movq %rdx,%r11 765 766 mulq %rbp 767 addq %rax,%r13 768 movq -8(%rsi),%rax 769 adcq $0,%rdx 770 addq %r10,%r13 771 adcq $0,%rdx 772 movq %r13,-24(%r14) 773 movq %rdx,%rdi 774 775 mulq %rbx 776 addq %rax,%r11 777 movq -8(%rcx),%rax 778 adcq $0,%rdx 779 movq %rdx,%r10 780 781 mulq %rbp 782 addq %rax,%rdi 783 movq (%rsi,%r9,1),%rax 784 adcq $0,%rdx 785 addq %r11,%rdi 786 adcq $0,%rdx 787 movq %rdi,-16(%r14) 788 movq %rdx,%r13 789 790 leaq (%rcx,%r9,1),%rcx 791 792 xorq %rdi,%rdi 793 addq %r10,%r13 794 adcq $0,%rdi 795 movq %r13,-8(%r14) 796 797 jmp .Louter4x 798 799.align 32 800.Louter4x: 801 leaq 16+128(%r14),%rdx 802 pxor %xmm4,%xmm4 803 pxor %xmm5,%xmm5 804 movdqa -128(%r12),%xmm0 805 movdqa -112(%r12),%xmm1 806 movdqa -96(%r12),%xmm2 807 movdqa -80(%r12),%xmm3 808 pand -128(%rdx),%xmm0 809 pand -112(%rdx),%xmm1 810 por %xmm0,%xmm4 811 pand -96(%rdx),%xmm2 812 por %xmm1,%xmm5 813 pand -80(%rdx),%xmm3 814 por %xmm2,%xmm4 815 por %xmm3,%xmm5 816 movdqa -64(%r12),%xmm0 817 movdqa -48(%r12),%xmm1 818 movdqa -32(%r12),%xmm2 819 movdqa -16(%r12),%xmm3 820 pand -64(%rdx),%xmm0 821 pand -48(%rdx),%xmm1 822 por %xmm0,%xmm4 823 pand -32(%rdx),%xmm2 824 por %xmm1,%xmm5 825 pand -16(%rdx),%xmm3 826 por %xmm2,%xmm4 827 por %xmm3,%xmm5 828 movdqa 0(%r12),%xmm0 829 movdqa 16(%r12),%xmm1 830 movdqa 
32(%r12),%xmm2 831 movdqa 48(%r12),%xmm3 832 pand 0(%rdx),%xmm0 833 pand 16(%rdx),%xmm1 834 por %xmm0,%xmm4 835 pand 32(%rdx),%xmm2 836 por %xmm1,%xmm5 837 pand 48(%rdx),%xmm3 838 por %xmm2,%xmm4 839 por %xmm3,%xmm5 840 movdqa 64(%r12),%xmm0 841 movdqa 80(%r12),%xmm1 842 movdqa 96(%r12),%xmm2 843 movdqa 112(%r12),%xmm3 844 pand 64(%rdx),%xmm0 845 pand 80(%rdx),%xmm1 846 por %xmm0,%xmm4 847 pand 96(%rdx),%xmm2 848 por %xmm1,%xmm5 849 pand 112(%rdx),%xmm3 850 por %xmm2,%xmm4 851 por %xmm3,%xmm5 852 por %xmm5,%xmm4 853 pshufd $0x4e,%xmm4,%xmm0 854 por %xmm4,%xmm0 855 leaq 256(%r12),%r12 856.byte 102,72,15,126,195 857 858 movq (%r14,%r9,1),%r10 859 movq %r8,%rbp 860 mulq %rbx 861 addq %rax,%r10 862 movq (%rcx),%rax 863 adcq $0,%rdx 864 865 imulq %r10,%rbp 866 movq %rdx,%r11 867 movq %rdi,(%r14) 868 869 leaq (%r14,%r9,1),%r14 870 871 mulq %rbp 872 addq %rax,%r10 873 movq 8(%rsi,%r9,1),%rax 874 adcq $0,%rdx 875 movq %rdx,%rdi 876 877 mulq %rbx 878 addq %rax,%r11 879 movq 8(%rcx),%rax 880 adcq $0,%rdx 881 addq 8(%r14),%r11 882 adcq $0,%rdx 883 movq %rdx,%r10 884 885 mulq %rbp 886 addq %rax,%rdi 887 movq 16(%rsi,%r9,1),%rax 888 adcq $0,%rdx 889 addq %r11,%rdi 890 leaq 32(%r9),%r15 891 leaq 32(%rcx),%rcx 892 adcq $0,%rdx 893 movq %rdx,%r13 894 jmp .Linner4x 895 896.align 32 897.Linner4x: 898 mulq %rbx 899 addq %rax,%r10 900 movq -16(%rcx),%rax 901 adcq $0,%rdx 902 addq 16(%r14),%r10 903 leaq 32(%r14),%r14 904 adcq $0,%rdx 905 movq %rdx,%r11 906 907 mulq %rbp 908 addq %rax,%r13 909 movq -8(%rsi,%r15,1),%rax 910 adcq $0,%rdx 911 addq %r10,%r13 912 adcq $0,%rdx 913 movq %rdi,-32(%r14) 914 movq %rdx,%rdi 915 916 mulq %rbx 917 addq %rax,%r11 918 movq -8(%rcx),%rax 919 adcq $0,%rdx 920 addq -8(%r14),%r11 921 adcq $0,%rdx 922 movq %rdx,%r10 923 924 mulq %rbp 925 addq %rax,%rdi 926 movq (%rsi,%r15,1),%rax 927 adcq $0,%rdx 928 addq %r11,%rdi 929 adcq $0,%rdx 930 movq %r13,-24(%r14) 931 movq %rdx,%r13 932 933 mulq %rbx 934 addq %rax,%r10 935 movq 0(%rcx),%rax 936 adcq $0,%rdx 937 
addq (%r14),%r10 938 adcq $0,%rdx 939 movq %rdx,%r11 940 941 mulq %rbp 942 addq %rax,%r13 943 movq 8(%rsi,%r15,1),%rax 944 adcq $0,%rdx 945 addq %r10,%r13 946 adcq $0,%rdx 947 movq %rdi,-16(%r14) 948 movq %rdx,%rdi 949 950 mulq %rbx 951 addq %rax,%r11 952 movq 8(%rcx),%rax 953 adcq $0,%rdx 954 addq 8(%r14),%r11 955 adcq $0,%rdx 956 movq %rdx,%r10 957 958 mulq %rbp 959 addq %rax,%rdi 960 movq 16(%rsi,%r15,1),%rax 961 adcq $0,%rdx 962 addq %r11,%rdi 963 leaq 32(%rcx),%rcx 964 adcq $0,%rdx 965 movq %r13,-8(%r14) 966 movq %rdx,%r13 967 968 addq $32,%r15 969 jnz .Linner4x 970 971 mulq %rbx 972 addq %rax,%r10 973 movq -16(%rcx),%rax 974 adcq $0,%rdx 975 addq 16(%r14),%r10 976 leaq 32(%r14),%r14 977 adcq $0,%rdx 978 movq %rdx,%r11 979 980 mulq %rbp 981 addq %rax,%r13 982 movq -8(%rsi),%rax 983 adcq $0,%rdx 984 addq %r10,%r13 985 adcq $0,%rdx 986 movq %rdi,-32(%r14) 987 movq %rdx,%rdi 988 989 mulq %rbx 990 addq %rax,%r11 991 movq %rbp,%rax 992 movq -8(%rcx),%rbp 993 adcq $0,%rdx 994 addq -8(%r14),%r11 995 adcq $0,%rdx 996 movq %rdx,%r10 997 998 mulq %rbp 999 addq %rax,%rdi 1000 movq (%rsi,%r9,1),%rax 1001 adcq $0,%rdx 1002 addq %r11,%rdi 1003 adcq $0,%rdx 1004 movq %r13,-24(%r14) 1005 movq %rdx,%r13 1006 1007 movq %rdi,-16(%r14) 1008 leaq (%rcx,%r9,1),%rcx 1009 1010 xorq %rdi,%rdi 1011 addq %r10,%r13 1012 adcq $0,%rdi 1013 addq (%r14),%r13 1014 adcq $0,%rdi 1015 movq %r13,-8(%r14) 1016 1017 cmpq 16+8(%rsp),%r12 1018 jb .Louter4x 1019 xorq %rax,%rax 1020 subq %r13,%rbp 1021 adcq %r15,%r15 1022 orq %r15,%rdi 1023 subq %rdi,%rax 1024 leaq (%r14,%r9,1),%rbx 1025 movq (%rcx),%r12 1026 leaq (%rcx),%rbp 1027 movq %r9,%rcx 1028 sarq $3+2,%rcx 1029 movq 56+8(%rsp),%rdi 1030 decq %r12 1031 xorq %r10,%r10 1032 movq 8(%rbp),%r13 1033 movq 16(%rbp),%r14 1034 movq 24(%rbp),%r15 1035 jmp .Lsqr4x_sub_entry 1036.size mul4x_internal,.-mul4x_internal 1037.globl bn_power5 1038.type bn_power5,@function 1039.align 32 1040bn_power5: 1041 movq %rsp,%rax 1042 movl OPENSSL_ia32cap_P+8(%rip),%r11d 
1043 andl $0x80108,%r11d 1044 cmpl $0x80108,%r11d 1045 je .Lpowerx5_enter 1046 pushq %rbx 1047 pushq %rbp 1048 pushq %r12 1049 pushq %r13 1050 pushq %r14 1051 pushq %r15 1052.Lpower5_prologue: 1053 1054 shll $3,%r9d 1055 leal (%r9,%r9,2),%r10d 1056 negq %r9 1057 movq (%r8),%r8 1058 1059 1060 1061 1062 1063 1064 1065 1066 leaq -320(%rsp,%r9,2),%r11 1067 movq %rsp,%rbp 1068 subq %rdi,%r11 1069 andq $4095,%r11 1070 cmpq %r11,%r10 1071 jb .Lpwr_sp_alt 1072 subq %r11,%rbp 1073 leaq -320(%rbp,%r9,2),%rbp 1074 jmp .Lpwr_sp_done 1075 1076.align 32 1077.Lpwr_sp_alt: 1078 leaq 4096-320(,%r9,2),%r10 1079 leaq -320(%rbp,%r9,2),%rbp 1080 subq %r10,%r11 1081 movq $0,%r10 1082 cmovcq %r10,%r11 1083 subq %r11,%rbp 1084.Lpwr_sp_done: 1085 andq $-64,%rbp 1086 movq %rsp,%r11 1087 subq %rbp,%r11 1088 andq $-4096,%r11 1089 leaq (%r11,%rbp,1),%rsp 1090 movq (%rsp),%r10 1091 cmpq %rbp,%rsp 1092 ja .Lpwr_page_walk 1093 jmp .Lpwr_page_walk_done 1094 1095.Lpwr_page_walk: 1096 leaq -4096(%rsp),%rsp 1097 movq (%rsp),%r10 1098 cmpq %rbp,%rsp 1099 ja .Lpwr_page_walk 1100.Lpwr_page_walk_done: 1101 1102 movq %r9,%r10 1103 negq %r9 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 movq %r8,32(%rsp) 1115 movq %rax,40(%rsp) 1116.Lpower5_body: 1117.byte 102,72,15,110,207 1118.byte 102,72,15,110,209 1119.byte 102,73,15,110,218 1120.byte 102,72,15,110,226 1121 1122 call __bn_sqr8x_internal 1123 call __bn_post4x_internal 1124 call __bn_sqr8x_internal 1125 call __bn_post4x_internal 1126 call __bn_sqr8x_internal 1127 call __bn_post4x_internal 1128 call __bn_sqr8x_internal 1129 call __bn_post4x_internal 1130 call __bn_sqr8x_internal 1131 call __bn_post4x_internal 1132 1133.byte 102,72,15,126,209 1134.byte 102,72,15,126,226 1135 movq %rsi,%rdi 1136 movq 40(%rsp),%rax 1137 leaq 32(%rsp),%r8 1138 1139 call mul4x_internal 1140 1141 movq 40(%rsp),%rsi 1142 movq $1,%rax 1143 movq -48(%rsi),%r15 1144 movq -40(%rsi),%r14 1145 movq -32(%rsi),%r13 1146 movq -24(%rsi),%r12 1147 movq -16(%rsi),%rbp 1148 movq 
-8(%rsi),%rbx 1149 leaq (%rsi),%rsp 1150.Lpower5_epilogue: 1151 .byte 0xf3,0xc3 1152.size bn_power5,.-bn_power5 1153 1154.globl bn_sqr8x_internal 1155.hidden bn_sqr8x_internal 1156.type bn_sqr8x_internal,@function 1157.align 32 1158bn_sqr8x_internal: 1159__bn_sqr8x_internal: 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 leaq 32(%r10),%rbp 1234 leaq (%rsi,%r9,1),%rsi 1235 1236 movq %r9,%rcx 1237 1238 1239 movq -32(%rsi,%rbp,1),%r14 1240 leaq 48+8(%rsp,%r9,2),%rdi 1241 movq -24(%rsi,%rbp,1),%rax 1242 leaq -32(%rdi,%rbp,1),%rdi 1243 movq -16(%rsi,%rbp,1),%rbx 1244 movq %rax,%r15 1245 1246 mulq %r14 1247 movq %rax,%r10 1248 movq %rbx,%rax 1249 movq %rdx,%r11 1250 movq %r10,-24(%rdi,%rbp,1) 1251 1252 mulq %r14 1253 addq %rax,%r11 1254 movq %rbx,%rax 1255 adcq $0,%rdx 1256 movq %r11,-16(%rdi,%rbp,1) 1257 movq %rdx,%r10 1258 1259 1260 movq -8(%rsi,%rbp,1),%rbx 1261 mulq %r15 1262 movq %rax,%r12 1263 movq %rbx,%rax 1264 movq %rdx,%r13 1265 1266 leaq (%rbp),%rcx 1267 mulq %r14 1268 addq %rax,%r10 1269 movq %rbx,%rax 1270 movq %rdx,%r11 1271 adcq $0,%r11 1272 addq %r12,%r10 1273 adcq $0,%r11 1274 movq %r10,-8(%rdi,%rcx,1) 1275 jmp .Lsqr4x_1st 1276 1277.align 32 1278.Lsqr4x_1st: 1279 movq (%rsi,%rcx,1),%rbx 1280 mulq %r15 1281 addq %rax,%r13 1282 movq %rbx,%rax 1283 movq %rdx,%r12 1284 adcq $0,%r12 1285 1286 mulq %r14 1287 addq %rax,%r11 1288 movq %rbx,%rax 1289 movq 8(%rsi,%rcx,1),%rbx 1290 movq %rdx,%r10 1291 adcq $0,%r10 1292 addq %r13,%r11 1293 adcq $0,%r10 1294 1295 1296 mulq %r15 1297 addq %rax,%r12 1298 movq %rbx,%rax 1299 movq %r11,(%rdi,%rcx,1) 1300 movq %rdx,%r13 1301 adcq $0,%r13 1302 1303 mulq %r14 1304 addq %rax,%r10 1305 movq %rbx,%rax 1306 
movq 16(%rsi,%rcx,1),%rbx 1307 movq %rdx,%r11 1308 adcq $0,%r11 1309 addq %r12,%r10 1310 adcq $0,%r11 1311 1312 mulq %r15 1313 addq %rax,%r13 1314 movq %rbx,%rax 1315 movq %r10,8(%rdi,%rcx,1) 1316 movq %rdx,%r12 1317 adcq $0,%r12 1318 1319 mulq %r14 1320 addq %rax,%r11 1321 movq %rbx,%rax 1322 movq 24(%rsi,%rcx,1),%rbx 1323 movq %rdx,%r10 1324 adcq $0,%r10 1325 addq %r13,%r11 1326 adcq $0,%r10 1327 1328 1329 mulq %r15 1330 addq %rax,%r12 1331 movq %rbx,%rax 1332 movq %r11,16(%rdi,%rcx,1) 1333 movq %rdx,%r13 1334 adcq $0,%r13 1335 leaq 32(%rcx),%rcx 1336 1337 mulq %r14 1338 addq %rax,%r10 1339 movq %rbx,%rax 1340 movq %rdx,%r11 1341 adcq $0,%r11 1342 addq %r12,%r10 1343 adcq $0,%r11 1344 movq %r10,-8(%rdi,%rcx,1) 1345 1346 cmpq $0,%rcx 1347 jne .Lsqr4x_1st 1348 1349 mulq %r15 1350 addq %rax,%r13 1351 leaq 16(%rbp),%rbp 1352 adcq $0,%rdx 1353 addq %r11,%r13 1354 adcq $0,%rdx 1355 1356 movq %r13,(%rdi) 1357 movq %rdx,%r12 1358 movq %rdx,8(%rdi) 1359 jmp .Lsqr4x_outer 1360 1361.align 32 1362.Lsqr4x_outer: 1363 movq -32(%rsi,%rbp,1),%r14 1364 leaq 48+8(%rsp,%r9,2),%rdi 1365 movq -24(%rsi,%rbp,1),%rax 1366 leaq -32(%rdi,%rbp,1),%rdi 1367 movq -16(%rsi,%rbp,1),%rbx 1368 movq %rax,%r15 1369 1370 mulq %r14 1371 movq -24(%rdi,%rbp,1),%r10 1372 addq %rax,%r10 1373 movq %rbx,%rax 1374 adcq $0,%rdx 1375 movq %r10,-24(%rdi,%rbp,1) 1376 movq %rdx,%r11 1377 1378 mulq %r14 1379 addq %rax,%r11 1380 movq %rbx,%rax 1381 adcq $0,%rdx 1382 addq -16(%rdi,%rbp,1),%r11 1383 movq %rdx,%r10 1384 adcq $0,%r10 1385 movq %r11,-16(%rdi,%rbp,1) 1386 1387 xorq %r12,%r12 1388 1389 movq -8(%rsi,%rbp,1),%rbx 1390 mulq %r15 1391 addq %rax,%r12 1392 movq %rbx,%rax 1393 adcq $0,%rdx 1394 addq -8(%rdi,%rbp,1),%r12 1395 movq %rdx,%r13 1396 adcq $0,%r13 1397 1398 mulq %r14 1399 addq %rax,%r10 1400 movq %rbx,%rax 1401 adcq $0,%rdx 1402 addq %r12,%r10 1403 movq %rdx,%r11 1404 adcq $0,%r11 1405 movq %r10,-8(%rdi,%rbp,1) 1406 1407 leaq (%rbp),%rcx 1408 jmp .Lsqr4x_inner 1409 1410.align 32 1411.Lsqr4x_inner: 
1412 movq (%rsi,%rcx,1),%rbx 1413 mulq %r15 1414 addq %rax,%r13 1415 movq %rbx,%rax 1416 movq %rdx,%r12 1417 adcq $0,%r12 1418 addq (%rdi,%rcx,1),%r13 1419 adcq $0,%r12 1420 1421.byte 0x67 1422 mulq %r14 1423 addq %rax,%r11 1424 movq %rbx,%rax 1425 movq 8(%rsi,%rcx,1),%rbx 1426 movq %rdx,%r10 1427 adcq $0,%r10 1428 addq %r13,%r11 1429 adcq $0,%r10 1430 1431 mulq %r15 1432 addq %rax,%r12 1433 movq %r11,(%rdi,%rcx,1) 1434 movq %rbx,%rax 1435 movq %rdx,%r13 1436 adcq $0,%r13 1437 addq 8(%rdi,%rcx,1),%r12 1438 leaq 16(%rcx),%rcx 1439 adcq $0,%r13 1440 1441 mulq %r14 1442 addq %rax,%r10 1443 movq %rbx,%rax 1444 adcq $0,%rdx 1445 addq %r12,%r10 1446 movq %rdx,%r11 1447 adcq $0,%r11 1448 movq %r10,-8(%rdi,%rcx,1) 1449 1450 cmpq $0,%rcx 1451 jne .Lsqr4x_inner 1452 1453.byte 0x67 1454 mulq %r15 1455 addq %rax,%r13 1456 adcq $0,%rdx 1457 addq %r11,%r13 1458 adcq $0,%rdx 1459 1460 movq %r13,(%rdi) 1461 movq %rdx,%r12 1462 movq %rdx,8(%rdi) 1463 1464 addq $16,%rbp 1465 jnz .Lsqr4x_outer 1466 1467 1468 movq -32(%rsi),%r14 1469 leaq 48+8(%rsp,%r9,2),%rdi 1470 movq -24(%rsi),%rax 1471 leaq -32(%rdi,%rbp,1),%rdi 1472 movq -16(%rsi),%rbx 1473 movq %rax,%r15 1474 1475 mulq %r14 1476 addq %rax,%r10 1477 movq %rbx,%rax 1478 movq %rdx,%r11 1479 adcq $0,%r11 1480 1481 mulq %r14 1482 addq %rax,%r11 1483 movq %rbx,%rax 1484 movq %r10,-24(%rdi) 1485 movq %rdx,%r10 1486 adcq $0,%r10 1487 addq %r13,%r11 1488 movq -8(%rsi),%rbx 1489 adcq $0,%r10 1490 1491 mulq %r15 1492 addq %rax,%r12 1493 movq %rbx,%rax 1494 movq %r11,-16(%rdi) 1495 movq %rdx,%r13 1496 adcq $0,%r13 1497 1498 mulq %r14 1499 addq %rax,%r10 1500 movq %rbx,%rax 1501 movq %rdx,%r11 1502 adcq $0,%r11 1503 addq %r12,%r10 1504 adcq $0,%r11 1505 movq %r10,-8(%rdi) 1506 1507 mulq %r15 1508 addq %rax,%r13 1509 movq -16(%rsi),%rax 1510 adcq $0,%rdx 1511 addq %r11,%r13 1512 adcq $0,%rdx 1513 1514 movq %r13,(%rdi) 1515 movq %rdx,%r12 1516 movq %rdx,8(%rdi) 1517 1518 mulq %rbx 1519 addq $16,%rbp 1520 xorq %r14,%r14 1521 subq %r9,%rbp 1522 
xorq %r15,%r15 1523 1524 addq %r12,%rax 1525 adcq $0,%rdx 1526 movq %rax,8(%rdi) 1527 movq %rdx,16(%rdi) 1528 movq %r15,24(%rdi) 1529 1530 movq -16(%rsi,%rbp,1),%rax 1531 leaq 48+8(%rsp),%rdi 1532 xorq %r10,%r10 1533 movq 8(%rdi),%r11 1534 1535 leaq (%r14,%r10,2),%r12 1536 shrq $63,%r10 1537 leaq (%rcx,%r11,2),%r13 1538 shrq $63,%r11 1539 orq %r10,%r13 1540 movq 16(%rdi),%r10 1541 movq %r11,%r14 1542 mulq %rax 1543 negq %r15 1544 movq 24(%rdi),%r11 1545 adcq %rax,%r12 1546 movq -8(%rsi,%rbp,1),%rax 1547 movq %r12,(%rdi) 1548 adcq %rdx,%r13 1549 1550 leaq (%r14,%r10,2),%rbx 1551 movq %r13,8(%rdi) 1552 sbbq %r15,%r15 1553 shrq $63,%r10 1554 leaq (%rcx,%r11,2),%r8 1555 shrq $63,%r11 1556 orq %r10,%r8 1557 movq 32(%rdi),%r10 1558 movq %r11,%r14 1559 mulq %rax 1560 negq %r15 1561 movq 40(%rdi),%r11 1562 adcq %rax,%rbx 1563 movq 0(%rsi,%rbp,1),%rax 1564 movq %rbx,16(%rdi) 1565 adcq %rdx,%r8 1566 leaq 16(%rbp),%rbp 1567 movq %r8,24(%rdi) 1568 sbbq %r15,%r15 1569 leaq 64(%rdi),%rdi 1570 jmp .Lsqr4x_shift_n_add 1571 1572.align 32 1573.Lsqr4x_shift_n_add: 1574 leaq (%r14,%r10,2),%r12 1575 shrq $63,%r10 1576 leaq (%rcx,%r11,2),%r13 1577 shrq $63,%r11 1578 orq %r10,%r13 1579 movq -16(%rdi),%r10 1580 movq %r11,%r14 1581 mulq %rax 1582 negq %r15 1583 movq -8(%rdi),%r11 1584 adcq %rax,%r12 1585 movq -8(%rsi,%rbp,1),%rax 1586 movq %r12,-32(%rdi) 1587 adcq %rdx,%r13 1588 1589 leaq (%r14,%r10,2),%rbx 1590 movq %r13,-24(%rdi) 1591 sbbq %r15,%r15 1592 shrq $63,%r10 1593 leaq (%rcx,%r11,2),%r8 1594 shrq $63,%r11 1595 orq %r10,%r8 1596 movq 0(%rdi),%r10 1597 movq %r11,%r14 1598 mulq %rax 1599 negq %r15 1600 movq 8(%rdi),%r11 1601 adcq %rax,%rbx 1602 movq 0(%rsi,%rbp,1),%rax 1603 movq %rbx,-16(%rdi) 1604 adcq %rdx,%r8 1605 1606 leaq (%r14,%r10,2),%r12 1607 movq %r8,-8(%rdi) 1608 sbbq %r15,%r15 1609 shrq $63,%r10 1610 leaq (%rcx,%r11,2),%r13 1611 shrq $63,%r11 1612 orq %r10,%r13 1613 movq 16(%rdi),%r10 1614 movq %r11,%r14 1615 mulq %rax 1616 negq %r15 1617 movq 24(%rdi),%r11 1618 adcq 
%rax,%r12 1619 movq 8(%rsi,%rbp,1),%rax 1620 movq %r12,0(%rdi) 1621 adcq %rdx,%r13 1622 1623 leaq (%r14,%r10,2),%rbx 1624 movq %r13,8(%rdi) 1625 sbbq %r15,%r15 1626 shrq $63,%r10 1627 leaq (%rcx,%r11,2),%r8 1628 shrq $63,%r11 1629 orq %r10,%r8 1630 movq 32(%rdi),%r10 1631 movq %r11,%r14 1632 mulq %rax 1633 negq %r15 1634 movq 40(%rdi),%r11 1635 adcq %rax,%rbx 1636 movq 16(%rsi,%rbp,1),%rax 1637 movq %rbx,16(%rdi) 1638 adcq %rdx,%r8 1639 movq %r8,24(%rdi) 1640 sbbq %r15,%r15 1641 leaq 64(%rdi),%rdi 1642 addq $32,%rbp 1643 jnz .Lsqr4x_shift_n_add 1644 1645 leaq (%r14,%r10,2),%r12 1646.byte 0x67 1647 shrq $63,%r10 1648 leaq (%rcx,%r11,2),%r13 1649 shrq $63,%r11 1650 orq %r10,%r13 1651 movq -16(%rdi),%r10 1652 movq %r11,%r14 1653 mulq %rax 1654 negq %r15 1655 movq -8(%rdi),%r11 1656 adcq %rax,%r12 1657 movq -8(%rsi),%rax 1658 movq %r12,-32(%rdi) 1659 adcq %rdx,%r13 1660 1661 leaq (%r14,%r10,2),%rbx 1662 movq %r13,-24(%rdi) 1663 sbbq %r15,%r15 1664 shrq $63,%r10 1665 leaq (%rcx,%r11,2),%r8 1666 shrq $63,%r11 1667 orq %r10,%r8 1668 mulq %rax 1669 negq %r15 1670 adcq %rax,%rbx 1671 adcq %rdx,%r8 1672 movq %rbx,-16(%rdi) 1673 movq %r8,-8(%rdi) 1674.byte 102,72,15,126,213 1675__bn_sqr8x_reduction: 1676 xorq %rax,%rax 1677 leaq (%r9,%rbp,1),%rcx 1678 leaq 48+8(%rsp,%r9,2),%rdx 1679 movq %rcx,0+8(%rsp) 1680 leaq 48+8(%rsp,%r9,1),%rdi 1681 movq %rdx,8+8(%rsp) 1682 negq %r9 1683 jmp .L8x_reduction_loop 1684 1685.align 32 1686.L8x_reduction_loop: 1687 leaq (%rdi,%r9,1),%rdi 1688.byte 0x66 1689 movq 0(%rdi),%rbx 1690 movq 8(%rdi),%r9 1691 movq 16(%rdi),%r10 1692 movq 24(%rdi),%r11 1693 movq 32(%rdi),%r12 1694 movq 40(%rdi),%r13 1695 movq 48(%rdi),%r14 1696 movq 56(%rdi),%r15 1697 movq %rax,(%rdx) 1698 leaq 64(%rdi),%rdi 1699 1700.byte 0x67 1701 movq %rbx,%r8 1702 imulq 32+8(%rsp),%rbx 1703 movq 0(%rbp),%rax 1704 movl $8,%ecx 1705 jmp .L8x_reduce 1706 1707.align 32 1708.L8x_reduce: 1709 mulq %rbx 1710 movq 8(%rbp),%rax 1711 negq %r8 1712 movq %rdx,%r8 1713 adcq $0,%r8 1714 1715 
mulq %rbx 1716 addq %rax,%r9 1717 movq 16(%rbp),%rax 1718 adcq $0,%rdx 1719 addq %r9,%r8 1720 movq %rbx,48-8+8(%rsp,%rcx,8) 1721 movq %rdx,%r9 1722 adcq $0,%r9 1723 1724 mulq %rbx 1725 addq %rax,%r10 1726 movq 24(%rbp),%rax 1727 adcq $0,%rdx 1728 addq %r10,%r9 1729 movq 32+8(%rsp),%rsi 1730 movq %rdx,%r10 1731 adcq $0,%r10 1732 1733 mulq %rbx 1734 addq %rax,%r11 1735 movq 32(%rbp),%rax 1736 adcq $0,%rdx 1737 imulq %r8,%rsi 1738 addq %r11,%r10 1739 movq %rdx,%r11 1740 adcq $0,%r11 1741 1742 mulq %rbx 1743 addq %rax,%r12 1744 movq 40(%rbp),%rax 1745 adcq $0,%rdx 1746 addq %r12,%r11 1747 movq %rdx,%r12 1748 adcq $0,%r12 1749 1750 mulq %rbx 1751 addq %rax,%r13 1752 movq 48(%rbp),%rax 1753 adcq $0,%rdx 1754 addq %r13,%r12 1755 movq %rdx,%r13 1756 adcq $0,%r13 1757 1758 mulq %rbx 1759 addq %rax,%r14 1760 movq 56(%rbp),%rax 1761 adcq $0,%rdx 1762 addq %r14,%r13 1763 movq %rdx,%r14 1764 adcq $0,%r14 1765 1766 mulq %rbx 1767 movq %rsi,%rbx 1768 addq %rax,%r15 1769 movq 0(%rbp),%rax 1770 adcq $0,%rdx 1771 addq %r15,%r14 1772 movq %rdx,%r15 1773 adcq $0,%r15 1774 1775 decl %ecx 1776 jnz .L8x_reduce 1777 1778 leaq 64(%rbp),%rbp 1779 xorq %rax,%rax 1780 movq 8+8(%rsp),%rdx 1781 cmpq 0+8(%rsp),%rbp 1782 jae .L8x_no_tail 1783 1784.byte 0x66 1785 addq 0(%rdi),%r8 1786 adcq 8(%rdi),%r9 1787 adcq 16(%rdi),%r10 1788 adcq 24(%rdi),%r11 1789 adcq 32(%rdi),%r12 1790 adcq 40(%rdi),%r13 1791 adcq 48(%rdi),%r14 1792 adcq 56(%rdi),%r15 1793 sbbq %rsi,%rsi 1794 1795 movq 48+56+8(%rsp),%rbx 1796 movl $8,%ecx 1797 movq 0(%rbp),%rax 1798 jmp .L8x_tail 1799 1800.align 32 1801.L8x_tail: 1802 mulq %rbx 1803 addq %rax,%r8 1804 movq 8(%rbp),%rax 1805 movq %r8,(%rdi) 1806 movq %rdx,%r8 1807 adcq $0,%r8 1808 1809 mulq %rbx 1810 addq %rax,%r9 1811 movq 16(%rbp),%rax 1812 adcq $0,%rdx 1813 addq %r9,%r8 1814 leaq 8(%rdi),%rdi 1815 movq %rdx,%r9 1816 adcq $0,%r9 1817 1818 mulq %rbx 1819 addq %rax,%r10 1820 movq 24(%rbp),%rax 1821 adcq $0,%rdx 1822 addq %r10,%r9 1823 movq %rdx,%r10 1824 adcq $0,%r10 1825 
1826 mulq %rbx 1827 addq %rax,%r11 1828 movq 32(%rbp),%rax 1829 adcq $0,%rdx 1830 addq %r11,%r10 1831 movq %rdx,%r11 1832 adcq $0,%r11 1833 1834 mulq %rbx 1835 addq %rax,%r12 1836 movq 40(%rbp),%rax 1837 adcq $0,%rdx 1838 addq %r12,%r11 1839 movq %rdx,%r12 1840 adcq $0,%r12 1841 1842 mulq %rbx 1843 addq %rax,%r13 1844 movq 48(%rbp),%rax 1845 adcq $0,%rdx 1846 addq %r13,%r12 1847 movq %rdx,%r13 1848 adcq $0,%r13 1849 1850 mulq %rbx 1851 addq %rax,%r14 1852 movq 56(%rbp),%rax 1853 adcq $0,%rdx 1854 addq %r14,%r13 1855 movq %rdx,%r14 1856 adcq $0,%r14 1857 1858 mulq %rbx 1859 movq 48-16+8(%rsp,%rcx,8),%rbx 1860 addq %rax,%r15 1861 adcq $0,%rdx 1862 addq %r15,%r14 1863 movq 0(%rbp),%rax 1864 movq %rdx,%r15 1865 adcq $0,%r15 1866 1867 decl %ecx 1868 jnz .L8x_tail 1869 1870 leaq 64(%rbp),%rbp 1871 movq 8+8(%rsp),%rdx 1872 cmpq 0+8(%rsp),%rbp 1873 jae .L8x_tail_done 1874 1875 movq 48+56+8(%rsp),%rbx 1876 negq %rsi 1877 movq 0(%rbp),%rax 1878 adcq 0(%rdi),%r8 1879 adcq 8(%rdi),%r9 1880 adcq 16(%rdi),%r10 1881 adcq 24(%rdi),%r11 1882 adcq 32(%rdi),%r12 1883 adcq 40(%rdi),%r13 1884 adcq 48(%rdi),%r14 1885 adcq 56(%rdi),%r15 1886 sbbq %rsi,%rsi 1887 1888 movl $8,%ecx 1889 jmp .L8x_tail 1890 1891.align 32 1892.L8x_tail_done: 1893 addq (%rdx),%r8 1894 adcq $0,%r9 1895 adcq $0,%r10 1896 adcq $0,%r11 1897 adcq $0,%r12 1898 adcq $0,%r13 1899 adcq $0,%r14 1900 adcq $0,%r15 1901 1902 1903 xorq %rax,%rax 1904 1905 negq %rsi 1906.L8x_no_tail: 1907 adcq 0(%rdi),%r8 1908 adcq 8(%rdi),%r9 1909 adcq 16(%rdi),%r10 1910 adcq 24(%rdi),%r11 1911 adcq 32(%rdi),%r12 1912 adcq 40(%rdi),%r13 1913 adcq 48(%rdi),%r14 1914 adcq 56(%rdi),%r15 1915 adcq $0,%rax 1916 movq -8(%rbp),%rcx 1917 xorq %rsi,%rsi 1918 1919.byte 102,72,15,126,213 1920 1921 movq %r8,0(%rdi) 1922 movq %r9,8(%rdi) 1923.byte 102,73,15,126,217 1924 movq %r10,16(%rdi) 1925 movq %r11,24(%rdi) 1926 movq %r12,32(%rdi) 1927 movq %r13,40(%rdi) 1928 movq %r14,48(%rdi) 1929 movq %r15,56(%rdi) 1930 leaq 64(%rdi),%rdi 1931 1932 cmpq %rdx,%rdi 
	jb .L8x_reduction_loop          # tail of bn_sqr8x_internal (body is in an earlier chunk)
	.byte 0xf3,0xc3                 # repz ret
.size bn_sqr8x_internal,.-bn_sqr8x_internal

# __bn_post4x_internal - constant-time final reduction step for the sqr8x path.
# Walks the modulus (%rbp) and the intermediate result (%rbx) 4 limbs at a
# time and stores result-or-(result-modulus) to %rdi.  %rax acts as an
# all-ones / all-zeros mask (applied via not+and before the adc chain), so the
# subtraction is performed unconditionally and selected by the mask — no
# secret-dependent branches.  NOTE(review): register roles inferred from the
# code below; entry state is set up by the callers in this file.
.type __bn_post4x_internal,@function
.align 32
__bn_post4x_internal:
	movq 0(%rbp),%r12
	leaq (%rdi,%r9,1),%rbx
	movq %r9,%rcx
.byte 102,72,15,126,207             # movq %xmm1,%rdi
	negq %rax
.byte 102,72,15,126,206             # movq %xmm1,%rsi
	sarq $3+2,%rcx                  # %rcx = -(limbs/4): loop counter
	decq %r12                       # seed borrow from low limb of modulus
	xorq %r10,%r10
	movq 8(%rbp),%r13
	movq 16(%rbp),%r14
	movq 24(%rbp),%r15
	jmp .Lsqr4x_sub_entry

.align 16
.Lsqr4x_sub:                        # process 4 limbs of the modulus per pass
	movq 0(%rbp),%r12
	movq 8(%rbp),%r13
	movq 16(%rbp),%r14
	movq 24(%rbp),%r15
.Lsqr4x_sub_entry:
	leaq 32(%rbp),%rbp
	notq %r12                       # ~n[i] & mask, then adc == conditional n subtraction
	notq %r13
	notq %r14
	notq %r15
	andq %rax,%r12
	andq %rax,%r13
	andq %rax,%r14
	andq %rax,%r15

	negq %r10                       # restore carry saved in %r10 by sbb below
	adcq 0(%rbx),%r12
	adcq 8(%rbx),%r13
	adcq 16(%rbx),%r14
	adcq 24(%rbx),%r15
	movq %r12,0(%rdi)
	leaq 32(%rbx),%rbx
	movq %r13,8(%rdi)
	sbbq %r10,%r10                  # save carry across the loop-control arithmetic
	movq %r14,16(%rdi)
	movq %r15,24(%rdi)
	leaq 32(%rdi),%rdi

	incq %rcx
	jnz .Lsqr4x_sub

	movq %r9,%r10
	negq %r9                        # restore sign of limb count
	.byte 0xf3,0xc3                 # repz ret
.size __bn_post4x_internal,.-__bn_post4x_internal

# bn_from_montgomery - convert from Montgomery representation.  Only handles
# multiples of 8 limbs itself (via bn_from_mont8x); returns 0 in %eax for
# other sizes so the C caller can fall back.
.globl bn_from_montgomery
.type bn_from_montgomery,@function
.align 32
bn_from_montgomery:
	testl $7,%r9d
	jz bn_from_mont8x               # tail-call the 8x implementation
	xorl %eax,%eax                  # unsupported size: return 0
	.byte 0xf3,0xc3                 # repz ret
.size bn_from_montgomery,.-bn_from_montgomery

# bn_from_mont8x - from-Montgomery conversion for num%8 == 0.
# Standard SysV args (rdi=rp, rsi=ap, rdx=np?, rcx, r8=n0, r9=num —
# NOTE(review): exact argument meaning per the bn_mul_mont contract; confirm
# against the perlasm source).  Allocates a frame below %rsp sized from num,
# page-walks into it, then squares/reduces via the sqr8x/sqrx8x helpers.
.type bn_from_mont8x,@function
.align 32
bn_from_mont8x:
.byte 0x67                          # addr-size prefix used as padding
	movq %rsp,%rax                  # remember original %rsp for the epilogue
	pushq %rbx
	pushq %rbp
	pushq %r12
	pushq %r13
	pushq %r14
	pushq %r15
.Lfrom_prologue:

	shll $3,%r9d                    # num *= 8: byte count
	leaq (%r9,%r9,2),%r10           # %r10 = 3*num bytes
	negq %r9
	movq (%r8),%r8                  # n0 value

	# Pick a stack frame location that does not alias the output buffer
	# modulo 4096 (cache-bank/page considerations).
	leaq -320(%rsp,%r9,2),%r11
	movq %rsp,%rbp
	subq %rdi,%r11
	andq $4095,%r11
	cmpq %r11,%r10
	jb .Lfrom_sp_alt
	subq %r11,%rbp
	leaq -320(%rbp,%r9,2),%rbp
	jmp .Lfrom_sp_done              # continuation of bn_from_mont8x frame setup

.align 32
.Lfrom_sp_alt:                      # alternate frame placement when the
	leaq 4096-320(,%r9,2),%r10      # preferred offset would collide
	leaq -320(%rbp,%r9,2),%rbp
	subq %r10,%r11
	movq $0,%r10
	cmovcq %r10,%r11
	subq %r11,%rbp
.Lfrom_sp_done:
	andq $-64,%rbp                  # 64-byte align the new stack pointer
	movq %rsp,%r11
	subq %rbp,%r11
	andq $-4096,%r11
	leaq (%r11,%rbp,1),%rsp
	movq (%rsp),%r10                # probe the page
	cmpq %rbp,%rsp
	ja .Lfrom_page_walk
	jmp .Lfrom_page_walk_done

.Lfrom_page_walk:                   # touch each page of the new frame in order
	leaq -4096(%rsp),%rsp           # so the OS guard page is never skipped
	movq (%rsp),%r10
	cmpq %rbp,%rsp
	ja .Lfrom_page_walk
.Lfrom_page_walk_done:

	movq %r9,%r10
	negq %r9

	movq %r8,32(%rsp)               # stash n0
	movq %rax,40(%rsp)              # stash original %rsp
.Lfrom_body:
	movq %r9,%r11
	leaq 48(%rsp),%rax
	pxor %xmm0,%xmm0
	jmp .Lmul_by_1

.align 32
.Lmul_by_1:                         # copy input into tmp[0..num) and zero tmp[num..2*num)
	movdqu (%rsi),%xmm1
	movdqu 16(%rsi),%xmm2
	movdqu 32(%rsi),%xmm3
	movdqa %xmm0,(%rax,%r9,1)       # zero upper half
	movdqu 48(%rsi),%xmm4
	movdqa %xmm0,16(%rax,%r9,1)
.byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00   # lea 0x40(%rsi),%rsi (fixed-length encoding)
	movdqa %xmm1,(%rax)
	movdqa %xmm0,32(%rax,%r9,1)
	movdqa %xmm2,16(%rax)
	movdqa %xmm0,48(%rax,%r9,1)
	movdqa %xmm3,32(%rax)
	movdqa %xmm4,48(%rax)
	leaq 64(%rax),%rax
	subq $64,%r11
	jnz .Lmul_by_1

.byte 102,72,15,110,207             # movq %rdi,%xmm1: pass args to helpers via xmm
.byte 102,72,15,110,209             # movq %rcx,%xmm2
.byte 0x67                          # padding prefix
	movq %rcx,%rbp
.byte 102,73,15,110,218             # movq %r10,%xmm3
	movl OPENSSL_ia32cap_P+8(%rip),%r11d
	andl $0x80108,%r11d             # BMI1/BMI2/ADX capability bits — take the
	cmpl $0x80108,%r11d             # mulx/adcx/adox path only if all are present
	jne .Lfrom_mont_nox

	leaq (%rax,%r9,1),%rdi
	call __bn_sqrx8x_reduction      # MULX/ADX reduction path
	call __bn_postx4x_internal

	pxor %xmm0,%xmm0
	leaq 48(%rsp),%rax
	movq 40(%rsp),%rsi
	jmp .Lfrom_mont_zero

.align 32
.Lfrom_mont_nox:
	call __bn_sqr8x_reduction       # plain mul/adc fallback path
	call __bn_post4x_internal

	pxor %xmm0,%xmm0
	leaq 48(%rsp),%rax
	movq 40(%rsp),%rsi
	jmp .Lfrom_mont_zero

.align 32
.Lfrom_mont_zero:                   # wipe the temporary frame before returning
	movdqa %xmm0,0(%rax)
	movdqa %xmm0,16(%rax)
	movdqa %xmm0,32(%rax)
	movdqa %xmm0,48(%rax)
	leaq 64(%rax),%rax
	subq $32,%r9
	jnz .Lfrom_mont_zero

	movq $1,%rax                    # return 1 (success)
	movq -48(%rsi),%r15             # restore callee-saved registers from the
	movq -40(%rsi),%r14             # saved original stack pointer
	movq -32(%rsi),%r13
	movq -24(%rsi),%r12
	movq -16(%rsi),%rbp
	movq -8(%rsi),%rbx
	leaq (%rsi),%rsp
.Lfrom_epilogue:
	.byte 0xf3,0xc3                 # repz ret
.size bn_from_mont8x,.-bn_from_mont8x

# bn_mulx4x_mont_gather5 - Montgomery multiplication using MULX/ADX with a
# cache-timing-safe gather of the power table.  Same frame-placement and
# page-walk strategy as bn_from_mont8x above; the real work is in
# mulx4x_internal.
.type bn_mulx4x_mont_gather5,@function
.align 32
bn_mulx4x_mont_gather5:
	movq %rsp,%rax
.Lmulx4x_enter:
	pushq %rbx
	pushq %rbp
	pushq %r12
	pushq %r13
	pushq %r14
	pushq %r15
.Lmulx4x_prologue:

	shll $3,%r9d                    # num *= 8
	leaq (%r9,%r9,2),%r10
	negq %r9
	movq (%r8),%r8                  # n0

	leaq -320(%rsp,%r9,2),%r11      # avoid aliasing rp mod 4096
	movq %rsp,%rbp
	subq %rdi,%r11
	andq $4095,%r11
	cmpq %r11,%r10
	jb .Lmulx4xsp_alt
	subq %r11,%rbp
	leaq -320(%rbp,%r9,2),%rbp
	jmp .Lmulx4xsp_done

.Lmulx4xsp_alt:
	leaq 4096-320(,%r9,2),%r10
	leaq -320(%rbp,%r9,2),%rbp
	subq %r10,%r11
	movq $0,%r10
	cmovcq %r10,%r11
	subq %r11,%rbp
.Lmulx4xsp_done:
	andq $-64,%rbp
	movq %rsp,%r11
	subq %rbp,%r11
	andq $-4096,%r11
	leaq (%r11,%rbp,1),%rsp
	movq (%rsp),%r10
	cmpq %rbp,%rsp
	ja .Lmulx4x_page_walk
	jmp .Lmulx4x_page_walk_done

.Lmulx4x_page_walk:                 # touch new stack pages in order (guard page)
	leaq -4096(%rsp),%rsp
	movq (%rsp),%r10
	cmpq %rbp,%rsp
	ja .Lmulx4x_page_walk
.Lmulx4x_page_walk_done:

	movq %r8,32(%rsp)               # stash n0
	movq %rax,40(%rsp)              # stash original %rsp
.Lmulx4x_body:
	call mulx4x_internal

	movq 40(%rsp),%rsi
	movq $1,%rax                    # return 1 (success)

	movq -48(%rsi),%r15
	movq -40(%rsi),%r14
	movq -32(%rsi),%r13
	movq -24(%rsi),%r12
	movq -16(%rsi),%rbp
	movq -8(%rsi),%rbx
	leaq (%rsi),%rsp
.Lmulx4x_epilogue:
	.byte 0xf3,0xc3                 # repz ret (end of bn_mulx4x_mont_gather5)
.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5

# mulx4x_internal - core MULX/ADX Montgomery multiply, 4 limbs per iteration.
# The multiplier power is fetched from a 32-entry table with a constant-time
# gather: sixteen 128-bit compare masks are built with pcmpeqd against the
# requested index (%xmm5) and every table entry is pand/por-combined, so the
# memory access pattern is independent of the secret index.
.type mulx4x_internal,@function
.align 32
mulx4x_internal:
	movq %r9,8(%rsp)
	movq %r9,%r10
	negq %r9
	shlq $5,%r9
	negq %r10
	leaq 128(%rdx,%r9,1),%r13
	shrq $5+5,%r9
	movd 8(%rax),%xmm5              # %xmm5 = gather index
	subq $1,%r9
	leaq .Linc(%rip),%rax           # {0,0,1,1}/{2,2,2,2} increment constants
	movq %r13,16+8(%rsp)
	movq %r9,24+8(%rsp)
	movq %rdi,56+8(%rsp)
	movdqa 0(%rax),%xmm0
	movdqa 16(%rax),%xmm1
	leaq 88-112(%rsp,%r10,1),%r10   # mask scratch area on the stack
	leaq 128(%rdx),%rdi

	# Build 16 compare masks: exactly one of them is all-ones (where the
	# running counter equals the secret index), the rest are zero.
	pshufd $0,%xmm5,%xmm5
	movdqa %xmm1,%xmm4
.byte 0x67                          # padding prefix
	movdqa %xmm1,%xmm2
.byte 0x67
	paddd %xmm0,%xmm1
	pcmpeqd %xmm5,%xmm0
	movdqa %xmm4,%xmm3
	paddd %xmm1,%xmm2
	pcmpeqd %xmm5,%xmm1
	movdqa %xmm0,112(%r10)
	movdqa %xmm4,%xmm0

	paddd %xmm2,%xmm3
	pcmpeqd %xmm5,%xmm2
	movdqa %xmm1,128(%r10)
	movdqa %xmm4,%xmm1

	paddd %xmm3,%xmm0
	pcmpeqd %xmm5,%xmm3
	movdqa %xmm2,144(%r10)
	movdqa %xmm4,%xmm2

	paddd %xmm0,%xmm1
	pcmpeqd %xmm5,%xmm0
	movdqa %xmm3,160(%r10)
	movdqa %xmm4,%xmm3
	paddd %xmm1,%xmm2
	pcmpeqd %xmm5,%xmm1
	movdqa %xmm0,176(%r10)
	movdqa %xmm4,%xmm0

	paddd %xmm2,%xmm3
	pcmpeqd %xmm5,%xmm2
	movdqa %xmm1,192(%r10)
	movdqa %xmm4,%xmm1

	paddd %xmm3,%xmm0
	pcmpeqd %xmm5,%xmm3
	movdqa %xmm2,208(%r10)
	movdqa %xmm4,%xmm2

	paddd %xmm0,%xmm1
	pcmpeqd %xmm5,%xmm0
	movdqa %xmm3,224(%r10)
	movdqa %xmm4,%xmm3
	paddd %xmm1,%xmm2
	pcmpeqd %xmm5,%xmm1
	movdqa %xmm0,240(%r10)
	movdqa %xmm4,%xmm0

	paddd %xmm2,%xmm3
	pcmpeqd %xmm5,%xmm2
	movdqa %xmm1,256(%r10)
	movdqa %xmm4,%xmm1

	paddd %xmm3,%xmm0
	pcmpeqd %xmm5,%xmm3
	movdqa %xmm2,272(%r10)
	movdqa %xmm4,%xmm2

	paddd %xmm0,%xmm1
	pcmpeqd %xmm5,%xmm0
	movdqa %xmm3,288(%r10)
	movdqa %xmm4,%xmm3
.byte 0x67
	paddd %xmm1,%xmm2
	pcmpeqd %xmm5,%xmm1
	movdqa %xmm0,304(%r10)

	paddd %xmm2,%xmm3
	pcmpeqd %xmm5,%xmm2
	movdqa %xmm1,320(%r10)

	pcmpeqd %xmm5,%xmm3
	movdqa %xmm2,336(%r10)

	# First gather: touch every table entry, mask, and OR together.
	pand 64(%rdi),%xmm0
	pand 80(%rdi),%xmm1
	pand 96(%rdi),%xmm2
	movdqa %xmm3,352(%r10)
	pand 112(%rdi),%xmm3
	por %xmm2,%xmm0
	por %xmm3,%xmm1
	movdqa -128(%rdi),%xmm4
	movdqa -112(%rdi),%xmm5
	movdqa -96(%rdi),%xmm2
	pand 112(%r10),%xmm4
	movdqa -80(%rdi),%xmm3
	pand 128(%r10),%xmm5
	por %xmm4,%xmm0
	pand 144(%r10),%xmm2
	por %xmm5,%xmm1
	pand 160(%r10),%xmm3
	por %xmm2,%xmm0
	por %xmm3,%xmm1
	movdqa -64(%rdi),%xmm4
	movdqa -48(%rdi),%xmm5
	movdqa -32(%rdi),%xmm2
	pand 176(%r10),%xmm4
	movdqa -16(%rdi),%xmm3
	pand 192(%r10),%xmm5
	por %xmm4,%xmm0
	pand 208(%r10),%xmm2
	por %xmm5,%xmm1
	pand 224(%r10),%xmm3
	por %xmm2,%xmm0
	por %xmm3,%xmm1
	movdqa 0(%rdi),%xmm4
	movdqa 16(%rdi),%xmm5
	movdqa 32(%rdi),%xmm2
	pand 240(%r10),%xmm4
	movdqa 48(%rdi),%xmm3
	pand 256(%r10),%xmm5
	por %xmm4,%xmm0
	pand 272(%r10),%xmm2
	por %xmm5,%xmm1
	pand 288(%r10),%xmm3
	por %xmm2,%xmm0
	por %xmm3,%xmm1
	pxor %xmm1,%xmm0
	pshufd $0x4e,%xmm0,%xmm1        # fold high qword into low
	por %xmm1,%xmm0
	leaq 256(%rdi),%rdi
.byte 102,72,15,126,194             # movq %xmm0,%rdx: gathered limb -> mulx operand
	leaq 64+32+8(%rsp),%rbx

	# First 4-limb multiply-and-reduce column.
	movq %rdx,%r9
	mulxq 0(%rsi),%r8,%rax
	mulxq 8(%rsi),%r11,%r12
	addq %rax,%r11
	mulxq 16(%rsi),%rax,%r13
	adcq %rax,%r12
	adcq $0,%r13
	mulxq 24(%rsi),%rax,%r14

	movq %r8,%r15
	imulq 32+8(%rsp),%r8            # m = lo * n0 (Montgomery factor)
	xorq %rbp,%rbp                  # clears both CF and OF for adcx/adox chains
	movq %r8,%rdx

	movq %rdi,8+8(%rsp)

	leaq 32(%rsi),%rsi
	adcxq %rax,%r13
	adcxq %rbp,%r14

	mulxq 0(%rcx),%rax,%r10         # interleaved reduction by the modulus
	adcxq %rax,%r15
	adoxq %r11,%r10
	mulxq 8(%rcx),%rax,%r11
	adcxq %rax,%r10
	adoxq %r12,%r11
	mulxq 16(%rcx),%rax,%r12
	movq 24+8(%rsp),%rdi            # continuation of mulx4x_internal first column
	movq %r10,-32(%rbx)
	adcxq %rax,%r11
	adoxq %r13,%r12
	mulxq 24(%rcx),%rax,%r15
	movq %r9,%rdx
	movq %r11,-24(%rbx)
	adcxq %rax,%r12
	adoxq %rbp,%r15
	leaq 32(%rcx),%rcx
	movq %r12,-16(%rbx)
	jmp .Lmulx4x_1st

.align 32
.Lmulx4x_1st:                       # first pass over a[]: dual carry chains,
	adcxq %rbp,%r15                 # CF via adcx, OF via adox, 4 limbs/iteration
	mulxq 0(%rsi),%r10,%rax
	adcxq %r14,%r10
	mulxq 8(%rsi),%r11,%r14
	adcxq %rax,%r11
	mulxq 16(%rsi),%r12,%rax
	adcxq %r14,%r12
	mulxq 24(%rsi),%r13,%r14
.byte 0x67,0x67                     # padding prefixes (decoder alignment)
	movq %r8,%rdx                   # switch multiplier to Montgomery factor m
	adcxq %rax,%r13
	adcxq %rbp,%r14
	leaq 32(%rsi),%rsi
	leaq 32(%rbx),%rbx

	adoxq %r15,%r10                 # fold in n[] * m
	mulxq 0(%rcx),%rax,%r15
	adcxq %rax,%r10
	adoxq %r15,%r11
	mulxq 8(%rcx),%rax,%r15
	adcxq %rax,%r11
	adoxq %r15,%r12
	mulxq 16(%rcx),%rax,%r15
	movq %r10,-40(%rbx)
	adcxq %rax,%r12
	movq %r11,-32(%rbx)
	adoxq %r15,%r13
	mulxq 24(%rcx),%rax,%r15
	movq %r9,%rdx                   # back to the b-limb multiplier
	movq %r12,-24(%rbx)
	adcxq %rax,%r13
	adoxq %rbp,%r15
	leaq 32(%rcx),%rcx
	movq %r13,-16(%rbx)

	decq %rdi
	jnz .Lmulx4x_1st

	movq 8(%rsp),%rax
	adcq %rbp,%r15
	leaq (%rsi,%rax,1),%rsi         # rewind a[] pointer
	addq %r15,%r14
	movq 8+8(%rsp),%rdi
	adcq %rbp,%rbp                  # save top carry in %rbp (0 or 1)
	movq %r14,-8(%rbx)
	jmp .Lmulx4x_outer

.align 32
.Lmulx4x_outer:                     # gather the next table limb (constant-time,
	leaq 16-256(%rbx),%r10          # same mask technique as the first gather)
	pxor %xmm4,%xmm4
.byte 0x67,0x67
	pxor %xmm5,%xmm5
	movdqa -128(%rdi),%xmm0
	movdqa -112(%rdi),%xmm1
	movdqa -96(%rdi),%xmm2
	pand 256(%r10),%xmm0
	movdqa -80(%rdi),%xmm3
	pand 272(%r10),%xmm1
	por %xmm0,%xmm4
	pand 288(%r10),%xmm2
	por %xmm1,%xmm5
	pand 304(%r10),%xmm3
	por %xmm2,%xmm4
	por %xmm3,%xmm5
	movdqa -64(%rdi),%xmm0
	movdqa -48(%rdi),%xmm1
	movdqa -32(%rdi),%xmm2
	pand 320(%r10),%xmm0
	movdqa -16(%rdi),%xmm3
	pand 336(%r10),%xmm1
	por %xmm0,%xmm4
	pand 352(%r10),%xmm2
	por %xmm1,%xmm5
	pand 368(%r10),%xmm3            # continuation of the .Lmulx4x_outer gather
	por %xmm2,%xmm4
	por %xmm3,%xmm5
	movdqa 0(%rdi),%xmm0
	movdqa 16(%rdi),%xmm1
	movdqa 32(%rdi),%xmm2
	pand 384(%r10),%xmm0
	movdqa 48(%rdi),%xmm3
	pand 400(%r10),%xmm1
	por %xmm0,%xmm4
	pand 416(%r10),%xmm2
	por %xmm1,%xmm5
	pand 432(%r10),%xmm3
	por %xmm2,%xmm4
	por %xmm3,%xmm5
	movdqa 64(%rdi),%xmm0
	movdqa 80(%rdi),%xmm1
	movdqa 96(%rdi),%xmm2
	pand 448(%r10),%xmm0
	movdqa 112(%rdi),%xmm3
	pand 464(%r10),%xmm1
	por %xmm0,%xmm4
	pand 480(%r10),%xmm2
	por %xmm1,%xmm5
	pand 496(%r10),%xmm3
	por %xmm2,%xmm4
	por %xmm3,%xmm5
	por %xmm5,%xmm4
	pshufd $0x4e,%xmm4,%xmm0        # fold high qword into low
	por %xmm4,%xmm0
	leaq 256(%rdi),%rdi
.byte 102,72,15,126,194             # movq %xmm0,%rdx: next b limb

	movq %rbp,(%rbx)                # store saved top carry
	leaq 32(%rbx,%rax,1),%rbx
	mulxq 0(%rsi),%r8,%r11          # multiply-accumulate onto previous column
	xorq %rbp,%rbp                  # clear CF and OF
	movq %rdx,%r9
	mulxq 8(%rsi),%r14,%r12
	adoxq -32(%rbx),%r8
	adcxq %r14,%r11
	mulxq 16(%rsi),%r15,%r13
	adoxq -24(%rbx),%r11
	adcxq %r15,%r12
	mulxq 24(%rsi),%rdx,%r14
	adoxq -16(%rbx),%r12
	adcxq %rdx,%r13
	leaq (%rcx,%rax,1),%rcx         # rewind n[] pointer
	leaq 32(%rsi),%rsi
	adoxq -8(%rbx),%r13
	adcxq %rbp,%r14
	adoxq %rbp,%r14

	movq %r8,%r15
	imulq 32+8(%rsp),%r8            # m = lo * n0

	movq %r8,%rdx
	xorq %rbp,%rbp                  # clear CF and OF again for reduction
	movq %rdi,8+8(%rsp)

	mulxq 0(%rcx),%rax,%r10
	adcxq %rax,%r15
	adoxq %r11,%r10
	mulxq 8(%rcx),%rax,%r11
	adcxq %rax,%r10
	adoxq %r12,%r11
	mulxq 16(%rcx),%rax,%r12
	adcxq %rax,%r11
	adoxq %r13,%r12
	mulxq 24(%rcx),%rax,%r15
	movq %r9,%rdx
	movq 24+8(%rsp),%rdi
	movq %r10,-32(%rbx)
	adcxq %rax,%r12
	movq %r11,-24(%rbx)
	adoxq %rbp,%r15
	movq %r12,-16(%rbx)
	leaq 32(%rcx),%rcx
	jmp .Lmulx4x_inner

.align 32
.Lmulx4x_inner:                     # steady-state loop: accumulate b[i]*a[] and
	mulxq 0(%rsi),%r10,%rax         # m*n[] onto the stored column, 4 limbs/pass
	adcxq %rbp,%r15
	adoxq %r14,%r10
	mulxq 8(%rsi),%r11,%r14
	adcxq 0(%rbx),%r10
	adoxq %rax,%r11                 # continuation of .Lmulx4x_inner
	mulxq 16(%rsi),%r12,%rax
	adcxq 8(%rbx),%r11
	adoxq %r14,%r12
	mulxq 24(%rsi),%r13,%r14
	movq %r8,%rdx                   # switch to Montgomery factor m
	adcxq 16(%rbx),%r12
	adoxq %rax,%r13
	adcxq 24(%rbx),%r13
	adoxq %rbp,%r14
	leaq 32(%rsi),%rsi
	leaq 32(%rbx),%rbx
	adcxq %rbp,%r14

	adoxq %r15,%r10                 # reduction by n[]
	mulxq 0(%rcx),%rax,%r15
	adcxq %rax,%r10
	adoxq %r15,%r11
	mulxq 8(%rcx),%rax,%r15
	adcxq %rax,%r11
	adoxq %r15,%r12
	mulxq 16(%rcx),%rax,%r15
	movq %r10,-40(%rbx)
	adcxq %rax,%r12
	adoxq %r15,%r13
	movq %r11,-32(%rbx)
	mulxq 24(%rcx),%rax,%r15
	movq %r9,%rdx
	leaq 32(%rcx),%rcx
	movq %r12,-24(%rbx)
	adcxq %rax,%r13
	adoxq %rbp,%r15
	movq %r13,-16(%rbx)

	decq %rdi
	jnz .Lmulx4x_inner

	movq 0+8(%rsp),%rax
	adcq %rbp,%r15
	subq 0(%rbx),%rdi               # %rdi is 0 here; sub sets CF from stored top word
	movq 8+8(%rsp),%rdi
	movq 16+8(%rsp),%r10
	adcq %r15,%r14
	leaq (%rsi,%rax,1),%rsi         # rewind a[]
	adcq %rbp,%rbp                  # save top carry
	movq %r14,-8(%rbx)

	cmpq %r10,%rdi
	jb .Lmulx4x_outer

	# All outer iterations done: set up the final conditional subtraction
	# (falls through into .Lsqrx4x_sub_entry in __bn_postx4x_internal).
	movq -8(%rcx),%r10
	movq %rbp,%r8
	movq (%rcx,%rax,1),%r12
	leaq (%rcx,%rax,1),%rbp         # %rbp = modulus base
	movq %rax,%rcx
	leaq (%rbx,%rax,1),%rdi
	xorl %eax,%eax
	xorq %r15,%r15
	subq %r14,%r10                  # trial-compare top limb against modulus
	adcq %r15,%r15
	orq %r15,%r8
	sarq $3+2,%rcx                  # %rcx = -(limbs/4)
	subq %r8,%rax                   # %rax = 0 or all-ones selection mask
	movq 56+8(%rsp),%rdx            # destination rp
	decq %r12                       # seed borrow
	movq 8(%rbp),%r13
	xorq %r8,%r8
	movq 16(%rbp),%r14
	movq 24(%rbp),%r15
	jmp .Lsqrx4x_sub_entry
.size mulx4x_internal,.-mulx4x_internal

# bn_powerx5 - MULX/ADX variant of rp = ap^5 mod n (five squarings via
# __bn_sqrx8x_internal, then one multiply via mulx4x_internal).  Frame setup
# and page walk match the other entry points in this file.
.type bn_powerx5,@function
.align 32
bn_powerx5:
	movq %rsp,%rax
.Lpowerx5_enter:
	pushq %rbx
	pushq %rbp
	pushq %r12
	pushq %r13
	pushq %r14
	pushq %r15
.Lpowerx5_prologue:

	shll $3,%r9d                    # num *= 8
	leaq (%r9,%r9,2),%r10
	negq %r9
	movq (%r8),%r8                  # n0

	leaq -320(%rsp,%r9,2),%r11      # avoid aliasing rp mod 4096
	movq %rsp,%rbp
	subq %rdi,%r11
	andq $4095,%r11
	cmpq %r11,%r10
	jb .Lpwrx_sp_alt
	subq %r11,%rbp
	leaq -320(%rbp,%r9,2),%rbp
	jmp .Lpwrx_sp_done

.align 32
.Lpwrx_sp_alt:
	leaq 4096-320(,%r9,2),%r10
	leaq -320(%rbp,%r9,2),%rbp
	subq %r10,%r11
	movq $0,%r10
	cmovcq %r10,%r11
	subq %r11,%rbp
.Lpwrx_sp_done:
	andq $-64,%rbp
	movq %rsp,%r11
	subq %rbp,%r11
	andq $-4096,%r11
	leaq (%r11,%rbp,1),%rsp
	movq (%rsp),%r10
	cmpq %rbp,%rsp
	ja .Lpwrx_page_walk
	jmp .Lpwrx_page_walk_done

.Lpwrx_page_walk:                   # touch new stack pages in order (guard page)
	leaq -4096(%rsp),%rsp
	movq (%rsp),%r10
	cmpq %rbp,%rsp
	ja .Lpwrx_page_walk
.Lpwrx_page_walk_done:

	movq %r9,%r10
	negq %r9

	pxor %xmm0,%xmm0
.byte 102,72,15,110,207             # movq %rdi,%xmm1: preserve args across helper
.byte 102,72,15,110,209             # movq %rcx,%xmm2  calls in xmm registers
.byte 102,73,15,110,218             # movq %r10,%xmm3
.byte 102,72,15,110,226             # movq %rdx,%xmm4
	movq %r8,32(%rsp)               # stash n0
	movq %rax,40(%rsp)              # stash original %rsp
.Lpowerx5_body:

	call __bn_sqrx8x_internal       # five modular squarings: a^2, a^4, ... a^32?
	call __bn_postx4x_internal      # NOTE(review): net effect is the 5th-power
	call __bn_sqrx8x_internal       # ladder used by the mont5 exponentiation;
	call __bn_postx4x_internal      # confirm against the perlasm source
	call __bn_sqrx8x_internal
	call __bn_postx4x_internal
	call __bn_sqrx8x_internal
	call __bn_postx4x_internal
	call __bn_sqrx8x_internal
	call __bn_postx4x_internal

	movq %r10,%r9
	movq %rsi,%rdi
.byte 102,72,15,126,209             # movq %xmm2,%rcx: restore saved args
.byte 102,72,15,126,226             # movq %xmm4,%rdx
	movq 40(%rsp),%rax

	call mulx4x_internal            # final multiply by the original base

	movq 40(%rsp),%rsi
	movq $1,%rax                    # return 1 (success)

	movq -48(%rsi),%r15
	movq -40(%rsi),%r14
	movq -32(%rsi),%r13
	movq -24(%rsi),%r12
	movq -16(%rsi),%rbp
	movq -8(%rsi),%rbx
	leaq (%rsi),%rsp
.Lpowerx5_epilogue:
	.byte 0xf3,0xc3                 # repz ret
.size bn_powerx5,.-bn_powerx5

# bn_sqrx8x_internal / __bn_sqrx8x_internal - MULX/ADX modular squaring core,
# 8 limbs per outer iteration.  Squaring computes the off-diagonal products
# once, doubles them (.Lsqrx4x_shift_n_add) and adds the diagonal squares,
# then reduces (__bn_sqrx8x_reduction).
.globl bn_sqrx8x_internal
.hidden bn_sqrx8x_internal
.type bn_sqrx8x_internal,@function
.align 32
bn_sqrx8x_internal:
__bn_sqrx8x_internal:
	# Zero the 2*num-limb temporary result area, then start the off-diagonal
	# multiplication pass.
	leaq 48+8(%rsp),%rdi
	leaq (%rsi,%r9,1),%rbp
	movq %r9,0+8(%rsp)
	movq %rbp,8+8(%rsp)
	jmp .Lsqr8x_zero_start

.align 32
.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00   # long nop (alignment)
.Lsqrx8x_zero:
.byte 0x3e                          # DS prefix as padding
	movdqa %xmm0,0(%rdi)
	movdqa %xmm0,16(%rdi)
	movdqa %xmm0,32(%rdi)
	movdqa %xmm0,48(%rdi)
.Lsqr8x_zero_start:
	movdqa %xmm0,64(%rdi)
	movdqa %xmm0,80(%rdi)
	movdqa %xmm0,96(%rdi)
	movdqa %xmm0,112(%rdi)
	leaq 128(%rdi),%rdi
	subq $64,%r9
	jnz .Lsqrx8x_zero

	movq 0(%rsi),%rdx               # a[0] as the first mulx multiplier

	xorq %r10,%r10
	xorq %r11,%r11
	xorq %r12,%r12
	xorq %r13,%r13
	xorq %r14,%r14
	xorq %r15,%r15
	leaq 48+8(%rsp),%rdi
	xorq %rbp,%rbp                  # constant 0 for the carry chains
	jmp .Lsqrx8x_outer_loop

.align 32
.Lsqrx8x_outer_loop:                # a[0]*a[1..7] ... a[7]: triangular block of
	mulxq 8(%rsi),%r8,%rax          # off-diagonal products for one 8-limb window
	adcxq %r9,%r8
	adoxq %rax,%r10
	mulxq 16(%rsi),%r9,%rax
	adcxq %r10,%r9
	adoxq %rax,%r11
.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00   # mulx 0x18(%rsi) (hand-encoded)
	adcxq %r11,%r10
	adoxq %rax,%r12
.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00   # mulx 0x20(%rsi)
	adcxq %r12,%r11
	adoxq %rax,%r13
	mulxq 40(%rsi),%r12,%rax
	adcxq %r13,%r12
	adoxq %rax,%r14
	mulxq 48(%rsi),%r13,%rax
	adcxq %r14,%r13
	adoxq %r15,%rax
	mulxq 56(%rsi),%r14,%r15
	movq 8(%rsi),%rdx               # next multiplier: a[1]
	adcxq %rax,%r14
	adoxq %rbp,%r15
	adcq 64(%rdi),%r15
	movq %r8,8(%rdi)
	movq %r9,16(%rdi)
	sbbq %rcx,%rcx                  # save carry-out
	xorq %rbp,%rbp                  # clear CF/OF for the next chain

	mulxq 16(%rsi),%r8,%rbx         # a[1]*a[2..7]
	mulxq 24(%rsi),%r9,%rax
	adcxq %r10,%r8
	adoxq %rbx,%r9
	mulxq 32(%rsi),%r10,%rbx
	adcxq %r11,%r9
	adoxq %rax,%r10
.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00   # mulx 0x28(%rsi)
	adcxq %r12,%r10
	adoxq %rbx,%r11
.byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00   # mulx 0x30(%rsi)
	adcxq %r13,%r11
	adoxq %r14,%r12                 # continuation of the a[1] row
.byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00   # mulx 0x38(%rsi) (hand-encoded)
	movq 16(%rsi),%rdx              # next multiplier: a[2]
	adcxq %rax,%r12
	adoxq %rbx,%r13
	adcxq %r15,%r13
	adoxq %rbp,%r14
	adcxq %rbp,%r14

	movq %r8,24(%rdi)
	movq %r9,32(%rdi)

	mulxq 24(%rsi),%r8,%rbx         # a[2]*a[3..7]
	mulxq 32(%rsi),%r9,%rax
	adcxq %r10,%r8
	adoxq %rbx,%r9
	mulxq 40(%rsi),%r10,%rbx
	adcxq %r11,%r9
	adoxq %rax,%r10
.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00   # mulx 0x30(%rsi)
	adcxq %r12,%r10
	adoxq %r13,%r11
.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00   # mulx 0x38(%rsi)
.byte 0x3e                          # DS prefix padding
	movq 24(%rsi),%rdx              # next multiplier: a[3]
	adcxq %rbx,%r11
	adoxq %rax,%r12
	adcxq %r14,%r12
	movq %r8,40(%rdi)
	movq %r9,48(%rdi)
	mulxq 32(%rsi),%r8,%rax         # a[3]*a[4..7]
	adoxq %rbp,%r13
	adcxq %rbp,%r13

	mulxq 40(%rsi),%r9,%rbx
	adcxq %r10,%r8
	adoxq %rax,%r9
	mulxq 48(%rsi),%r10,%rax
	adcxq %r11,%r9
	adoxq %r12,%r10
	mulxq 56(%rsi),%r11,%r12
	movq 32(%rsi),%rdx              # next multiplier: a[4]
	movq 40(%rsi),%r14
	adcxq %rbx,%r10
	adoxq %rax,%r11
	movq 48(%rsi),%r15
	adcxq %r13,%r11
	adoxq %rbp,%r12
	adcxq %rbp,%r12

	movq %r8,56(%rdi)
	movq %r9,64(%rdi)

	mulxq %r14,%r9,%rax             # remaining corner: a[4..7] pairwise products
	movq 56(%rsi),%r8
	adcxq %r10,%r9
	mulxq %r15,%r10,%rbx
	adoxq %rax,%r10
	adcxq %r11,%r10
	mulxq %r8,%r11,%rax
	movq %r14,%rdx                  # multiplier: a[5]
	adoxq %rbx,%r11
	adcxq %r12,%r11

	adcxq %rbp,%rax

	mulxq %r15,%r14,%rbx
	mulxq %r8,%r12,%r13
	movq %r15,%rdx                  # multiplier: a[6]
	leaq 64(%rsi),%rsi
	adcxq %r14,%r11
	adoxq %rbx,%r12
	adcxq %rax,%r12
	adoxq %rbp,%r13

.byte 0x67,0x67                     # padding prefixes
	mulxq %r8,%r8,%r14              # a[6]*a[7]
	adcxq %r8,%r13
	adcxq %rbp,%r14

	cmpq 8+8(%rsp),%rsi             # finished the whole input?
	je .Lsqrx8x_outer_break

	negq %rcx                       # restore saved carry
	movq $-8,%rcx
	movq %rbp,%r15
	movq 64(%rdi),%r8
	adcxq 72(%rdi),%r9              # fold previously stored partials back in
	adcxq 80(%rdi),%r10
	adcxq 88(%rdi),%r11
	adcq 96(%rdi),%r12
	adcq 104(%rdi),%r13
	adcq 112(%rdi),%r14             # continuation: fold in stored partials
	adcq 120(%rdi),%r15
	leaq (%rsi),%rbp
	leaq 128(%rdi),%rdi
	sbbq %rax,%rax                  # save carry-out as 0/-1

	movq -64(%rsi),%rdx
	movq %rax,16+8(%rsp)
	movq %rdi,24+8(%rsp)

	xorl %eax,%eax                  # clear CF/OF for the adcx/adox chains
	jmp .Lsqrx8x_loop

.align 32
.Lsqrx8x_loop:                      # multiply the current 8-limb window by the
	movq %r8,%rbx                   # remaining limbs, 8 products per iteration
	mulxq 0(%rbp),%rax,%r8
	adcxq %rax,%rbx
	adoxq %r9,%r8

	mulxq 8(%rbp),%rax,%r9
	adcxq %rax,%r8
	adoxq %r10,%r9

	mulxq 16(%rbp),%rax,%r10
	adcxq %rax,%r9
	adoxq %r11,%r10

	mulxq 24(%rbp),%rax,%r11
	adcxq %rax,%r10
	adoxq %r12,%r11

.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00   # mulx 0x20(%rbp) (hand-encoded)
	adcxq %rax,%r11
	adoxq %r13,%r12

	mulxq 40(%rbp),%rax,%r13
	adcxq %rax,%r12
	adoxq %r14,%r13

	mulxq 48(%rbp),%rax,%r14
	movq %rbx,(%rdi,%rcx,8)
	movl $0,%ebx
	adcxq %rax,%r13
	adoxq %r15,%r14

.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00   # mulx 0x38(%rbp)
	movq 8(%rsi,%rcx,8),%rdx        # next multiplier limb
	adcxq %rax,%r14
	adoxq %rbx,%r15
	adcxq %rbx,%r15

.byte 0x67                          # padding prefix
	incq %rcx
	jnz .Lsqrx8x_loop

	leaq 64(%rbp),%rbp
	movq $-8,%rcx
	cmpq 8+8(%rsp),%rbp
	je .Lsqrx8x_break

	subq 16+8(%rsp),%rbx            # restore saved carry (0/-1) into CF
.byte 0x66                          # padding prefix
	movq -64(%rsi),%rdx
	adcxq 0(%rdi),%r8
	adcxq 8(%rdi),%r9
	adcq 16(%rdi),%r10
	adcq 24(%rdi),%r11
	adcq 32(%rdi),%r12
	adcq 40(%rdi),%r13
	adcq 48(%rdi),%r14
	adcq 56(%rdi),%r15
	leaq 64(%rdi),%rdi
.byte 0x67
	sbbq %rax,%rax                  # save carry-out again
	xorl %ebx,%ebx                  # clear CF/OF
	movq %rax,16+8(%rsp)
	jmp .Lsqrx8x_loop

.align 32
.Lsqrx8x_break:                     # window finished: spill the accumulators and
	subq 16+8(%rsp),%r8             # (if not done) reload from the saved column
	movq 24+8(%rsp),%rcx
	movq 0(%rsi),%rdx
	xorl %ebp,%ebp
	movq %r8,0(%rdi)
	cmpq %rcx,%rdi
	je .Lsqrx8x_outer_loop

	movq %r9,8(%rdi)
	movq 8(%rcx),%r9
	movq %r10,16(%rdi)
	movq 16(%rcx),%r10
	movq %r11,24(%rdi)
	movq 24(%rcx),%r11
	movq %r12,32(%rdi)
	movq 32(%rcx),%r12
	movq %r13,40(%rdi)              # continuation of .Lsqrx8x_break spill/reload
	movq 40(%rcx),%r13
	movq %r14,48(%rdi)
	movq 48(%rcx),%r14
	movq %r15,56(%rdi)
	movq 56(%rcx),%r15
	movq %rcx,%rdi
	jmp .Lsqrx8x_outer_loop

.align 32
.Lsqrx8x_outer_break:               # off-diagonal pass done: store final window
	movq %r9,72(%rdi)
.byte 102,72,15,126,217             # movq %xmm3,%rcx
	movq %r10,80(%rdi)
	movq %r11,88(%rdi)
	movq %r12,96(%rdi)
	movq %r13,104(%rdi)
	movq %r14,112(%rdi)
	leaq 48+8(%rsp),%rdi
	movq (%rsi,%rcx,1),%rdx

	movq 8(%rdi),%r11
	xorq %r10,%r10
	movq 0+8(%rsp),%r9
	adoxq %r11,%r11                 # start doubling chain in OF
	movq 16(%rdi),%r12
	movq 24(%rdi),%r13

.align 32
.Lsqrx4x_shift_n_add:               # double the off-diagonal sum (adox x,x) and
	mulxq %rdx,%rax,%rbx            # add the diagonal squares a[i]^2 (mulx d,d)
	adoxq %r12,%r12
	adcxq %r10,%rax
.byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00   # mov 0x8(%rsi,%rcx,1),%rdx (fixed-length)
.byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00        # mov 0x20(%rdi),%r10
	adoxq %r13,%r13
	adcxq %r11,%rbx
	movq 40(%rdi),%r11
	movq %rax,0(%rdi)
	movq %rbx,8(%rdi)

	mulxq %rdx,%rax,%rbx
	adoxq %r10,%r10
	adcxq %r12,%rax
	movq 16(%rsi,%rcx,1),%rdx
	movq 48(%rdi),%r12
	adoxq %r11,%r11
	adcxq %r13,%rbx
	movq 56(%rdi),%r13
	movq %rax,16(%rdi)
	movq %rbx,24(%rdi)

	mulxq %rdx,%rax,%rbx
	adoxq %r12,%r12
	adcxq %r10,%rax
	movq 24(%rsi,%rcx,1),%rdx
	leaq 32(%rcx),%rcx
	movq 64(%rdi),%r10
	adoxq %r13,%r13
	adcxq %r11,%rbx
	movq 72(%rdi),%r11
	movq %rax,32(%rdi)
	movq %rbx,40(%rdi)

	mulxq %rdx,%rax,%rbx
	adoxq %r10,%r10
	adcxq %r12,%rax
	jrcxz .Lsqrx4x_shift_n_add_break # exit without touching flags (carry chains live)
.byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00   # mov 0x0(%rsi,%rcx,1),%rdx
	adoxq %r11,%r11
	adcxq %r13,%rbx
	movq 80(%rdi),%r12
	movq 88(%rdi),%r13
	movq %rax,48(%rdi)
	movq %rbx,56(%rdi)
	leaq 64(%rdi),%rdi
	nop
	jmp .Lsqrx4x_shift_n_add

.align 32
.Lsqrx4x_shift_n_add_break:
	adcxq %r13,%rbx
	movq %rax,48(%rdi)
	movq %rbx,56(%rdi)
	leaq 64(%rdi),%rdi
.byte 102,72,15,126,213             # movq %xmm2,%rbp: falls into the reduction below
# __bn_sqrx8x_reduction - MULX/ADX Montgomery reduction of the 2*num-limb
# square.  Processes 8 limbs per .Lsqrx8x_reduction_loop iteration: computes
# the Montgomery factors, multiplies them by the modulus (%rbp) and folds the
# products into the running result with dual adcx/adox carry chains.
__bn_sqrx8x_reduction:
	xorl %eax,%eax
	movq 32+8(%rsp),%rbx            # n0
	movq 48+8(%rsp),%rdx
	leaq -64(%rbp,%r9,1),%rcx       # end-of-modulus sentinel
	movq %rcx,0+8(%rsp)
	movq %rdi,8+8(%rsp)

	leaq 48+8(%rsp),%rdi
	jmp .Lsqrx8x_reduction_loop

.align 32
.Lsqrx8x_reduction_loop:
	movq 8(%rdi),%r9
	movq 16(%rdi),%r10
	movq 24(%rdi),%r11
	movq 32(%rdi),%r12
	movq %rdx,%r8
	imulq %rbx,%rdx                 # Montgomery factor m = t[0] * n0
	movq 40(%rdi),%r13
	movq 48(%rdi),%r14
	movq 56(%rdi),%r15
	movq %rax,24+8(%rsp)

	leaq 64(%rdi),%rdi
	xorq %rsi,%rsi                  # constant 0; also clears CF/OF
	movq $-8,%rcx
	jmp .Lsqrx8x_reduce

.align 32
.Lsqrx8x_reduce:                    # one limb of reduction: t += m * n[]
	movq %r8,%rbx
	mulxq 0(%rbp),%rax,%r8
	adcxq %rbx,%rax
	adoxq %r9,%r8

	mulxq 8(%rbp),%rbx,%r9
	adcxq %rbx,%r8
	adoxq %r10,%r9

	mulxq 16(%rbp),%rbx,%r10
	adcxq %rbx,%r9
	adoxq %r11,%r10

	mulxq 24(%rbp),%rbx,%r11
	adcxq %rbx,%r10
	adoxq %r12,%r11

.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00   # mulx 0x20(%rbp) (hand-encoded)
	movq %rdx,%rax
	movq %r8,%rdx
	adcxq %rbx,%r11
	adoxq %r13,%r12

	mulxq 32+8(%rsp),%rbx,%rdx      # next Montgomery factor from the new t[0]
	movq %rax,%rdx
	movq %rax,64+48+8(%rsp,%rcx,8)  # save factor for the tail pass

	mulxq 40(%rbp),%rax,%r13
	adcxq %rax,%r12
	adoxq %r14,%r13

	mulxq 48(%rbp),%rax,%r14
	adcxq %rax,%r13
	adoxq %r15,%r14

	mulxq 56(%rbp),%rax,%r15
	movq %rbx,%rdx
	adcxq %rax,%r14
	adoxq %rsi,%r15
	adcxq %rsi,%r15

.byte 0x67,0x67,0x67                # padding prefixes
	incq %rcx
	jnz .Lsqrx8x_reduce

	movq %rsi,%rax
	cmpq 0+8(%rsp),%rbp             # more modulus windows left?
	jae .Lsqrx8x_no_tail

	movq 48+8(%rsp),%rdx
	addq 0(%rdi),%r8
	leaq 64(%rbp),%rbp
	movq $-8,%rcx
	adcxq 8(%rdi),%r9
	adcxq 16(%rdi),%r10
	adcq 24(%rdi),%r11
	adcq 32(%rdi),%r12
	adcq 40(%rdi),%r13
	adcq 48(%rdi),%r14
	adcq 56(%rdi),%r15
	leaq 64(%rdi),%rdi
	sbbq %rax,%rax                  # save carry

	xorq %rsi,%rsi                  # clear CF/OF
	movq %rax,16+8(%rsp)
	jmp .Lsqrx8x_tail

.align 32
.Lsqrx8x_tail:                      # propagate saved factors over the remaining
	movq %r8,%rbx                   # modulus windows
	mulxq 0(%rbp),%rax,%r8
	adcxq %rax,%rbx
	adoxq %r9,%r8

	mulxq 8(%rbp),%rax,%r9
	adcxq %rax,%r8
	adoxq %r10,%r9

	mulxq 16(%rbp),%rax,%r10
	adcxq %rax,%r9
	adoxq %r11,%r10

	mulxq 24(%rbp),%rax,%r11
	adcxq %rax,%r10
	adoxq %r12,%r11

.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00   # mulx 0x20(%rbp)
	adcxq %rax,%r11
	adoxq %r13,%r12

	mulxq 40(%rbp),%rax,%r13
	adcxq %rax,%r12
	adoxq %r14,%r13

	mulxq 48(%rbp),%rax,%r14
	adcxq %rax,%r13
	adoxq %r15,%r14

	mulxq 56(%rbp),%rax,%r15
	movq 72+48+8(%rsp,%rcx,8),%rdx  # reload saved Montgomery factor
	adcxq %rax,%r14
	adoxq %rsi,%r15
	movq %rbx,(%rdi,%rcx,8)
	movq %r8,%rbx
	adcxq %rsi,%r15

	incq %rcx
	jnz .Lsqrx8x_tail

	cmpq 0+8(%rsp),%rbp
	jae .Lsqrx8x_tail_done

	subq 16+8(%rsp),%rsi            # restore saved carry into CF
	movq 48+8(%rsp),%rdx
	leaq 64(%rbp),%rbp
	adcq 0(%rdi),%r8
	adcq 8(%rdi),%r9
	adcq 16(%rdi),%r10
	adcq 24(%rdi),%r11
	adcq 32(%rdi),%r12
	adcq 40(%rdi),%r13
	adcq 48(%rdi),%r14
	adcq 56(%rdi),%r15
	leaq 64(%rdi),%rdi
	sbbq %rax,%rax
	subq $8,%rcx

	xorq %rsi,%rsi                  # clear CF/OF
	movq %rax,16+8(%rsp)
	jmp .Lsqrx8x_tail

.align 32
.Lsqrx8x_tail_done:
	addq 24+8(%rsp),%r8             # fold in top-word carry from previous round
	adcq $0,%r9
	adcq $0,%r10
	adcq $0,%r11
	adcq $0,%r12
	adcq $0,%r13
	adcq $0,%r14
	adcq $0,%r15

	movq %rsi,%rax

	subq 16+8(%rsp),%rsi            # restore carry
.Lsqrx8x_no_tail:
	adcq 0(%rdi),%r8
.byte 102,72,15,126,217             # movq %xmm3,%rcx
	adcq 8(%rdi),%r9
	movq 56(%rbp),%rsi              # top modulus limb (prefetch for caller)
.byte 102,72,15,126,213             # movq %xmm2,%rbp
	adcq 16(%rdi),%r10
	adcq 24(%rdi),%r11
	adcq 32(%rdi),%r12
	adcq 40(%rdi),%r13
	adcq 48(%rdi),%r14
	adcq 56(%rdi),%r15
	adcq %rax,%rax                  # accumulate final top carry

	movq 32+8(%rsp),%rbx            # n0
	movq 64(%rdi,%rcx,1),%rdx

	movq %r8,0(%rdi)                # store the reduced window
	leaq 64(%rdi),%r8
	movq %r9,8(%rdi)
	movq %r10,16(%rdi)
	movq %r11,24(%rdi)
	movq %r12,32(%rdi)
	movq %r13,40(%rdi)
	movq %r14,48(%rdi)
	movq %r15,56(%rdi)

	leaq 64(%rdi,%rcx,1),%rdi
	cmpq 8+8(%rsp),%r8
	jb .Lsqrx8x_reduction_loop
	.byte 0xf3,0xc3                 # repz ret
.size bn_sqrx8x_internal,.-bn_sqrx8x_internal

# __bn_postx4x_internal - constant-time final subtraction for the sqrx8x path.
# Same mask-select idea as __bn_post4x_internal, but the masking is done with
# BMI2's andn (mask in %rax) instead of not+and.
.align 32
__bn_postx4x_internal:
	movq 0(%rbp),%r12
	movq %rcx,%r10
	movq %rcx,%r9
	negq %rax                       # %rax = 0 or all-ones selection mask
	sarq $3+2,%rcx                  # %rcx = -(limbs/4)

.byte 102,72,15,126,202             # movq %xmm1,%rdx
.byte 102,72,15,126,206             # movq %xmm1,%rsi
	decq %r12                       # seed borrow
	movq 8(%rbp),%r13
	xorq %r8,%r8
	movq 16(%rbp),%r14
	movq 24(%rbp),%r15
	jmp .Lsqrx4x_sub_entry

.align 16
.Lsqrx4x_sub:                       # 4 limbs of masked subtraction per pass
	movq 0(%rbp),%r12
	movq 8(%rbp),%r13
	movq 16(%rbp),%r14
	movq 24(%rbp),%r15
.Lsqrx4x_sub_entry:
	andnq %rax,%r12,%r12            # (~n[i]) & mask
	leaq 32(%rbp),%rbp
	andnq %rax,%r13,%r13
	andnq %rax,%r14,%r14
	andnq %rax,%r15,%r15

	negq %r8                        # restore carry saved by sbb below
	adcq 0(%rdi),%r12
	adcq 8(%rdi),%r13
	adcq 16(%rdi),%r14
	adcq 24(%rdi),%r15
	movq %r12,0(%rdx)
	leaq 32(%rdi),%rdi
	movq %r13,8(%rdx)
	sbbq %r8,%r8                    # save carry across loop control
	movq %r14,16(%rdx)
	movq %r15,24(%rdx)
	leaq 32(%rdx),%rdx

	incq %rcx
	jnz .Lsqrx4x_sub

	negq %r9

	.byte 0xf3,0xc3                 # repz ret
.size __bn_postx4x_internal,.-__bn_postx4x_internal

# bn_get_bits5 - extract a 5-bit window starting at bit %esi from the array at
# %rdi, branch-free: reads a 16-bit word (shifted by one byte when the window
# crosses the upper nibble boundary, selected with cmov) and masks 5 bits.
.globl bn_get_bits5
.type bn_get_bits5,@function
.align 16
bn_get_bits5:
	leaq 0(%rdi),%r10
	leaq 1(%rdi),%r11               # byte-shifted alternative base
	movl %esi,%ecx
	shrl $4,%esi                    # word index
	andl $15,%ecx                   # bit offset within the word
	leal -8(%rcx),%eax
	cmpl $11,%ecx
	cmovaq %r11,%r10                # offset > 11: use the shifted base
	cmoval %eax,%ecx
	movzwl (%r10,%rsi,2),%eax
	shrl %cl,%eax
	andl $31,%eax                   # keep 5 bits
	.byte 0xf3,0xc3                 # repz ret
.size bn_get_bits5,.-bn_get_bits5

# bn_scatter5 - store %esi limbs from %rdi into the power table at %rdx with a
# stride of 256 bytes, starting at column %rcx (the table is interleaved so
# that bn_gather5 can read it with a fixed access pattern).
.globl bn_scatter5
.type bn_scatter5,@function
.align 16
bn_scatter5:
	cmpl $0,%esi
	jz .Lscatter_epilogue           # zero-length: nothing to do
	leaq (%rdx,%rcx,8),%rdx
.Lscatter:
	movq (%rdi),%rax
	leaq 8(%rdi),%rdi
	movq %rax,(%rdx)
	leaq 256(%rdx),%rdx             # next interleaved row
	subl $1,%esi
	jnz .Lscatter
.Lscatter_epilogue:
	.byte 0xf3,0xc3                 # repz ret
.size bn_scatter5,.-bn_scatter5

# bn_gather5 - cache-timing-safe gather of one of 32 table entries.  Builds 32
# pcmpeqd masks for index %ecx on a 16-byte aligned stack area, then reads
# EVERY table entry and pand/por-combines, so memory access order never
# depends on the index.
.globl bn_gather5
.type bn_gather5,@function
.align 32
bn_gather5:
.LSEH_begin_bn_gather5:

.byte 0x4c,0x8d,0x14,0x24           # lea (%rsp),%r10 (fixed-length for SEH)
.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00   # sub $0x108,%rsp
	leaq .Linc(%rip),%rax
	andq $-16,%rsp                  # align for movdqa

	movd %ecx,%xmm5                 # %xmm5 = requested index
	movdqa 0(%rax),%xmm0
	movdqa 16(%rax),%xmm1
	leaq 128(%rdx),%r11
	leaq 128(%rsp),%rax

	# Build the 32 selection masks (one all-ones, rest zero).
	pshufd $0,%xmm5,%xmm5
	movdqa %xmm1,%xmm4
	movdqa %xmm1,%xmm2
	paddd %xmm0,%xmm1
	pcmpeqd %xmm5,%xmm0
	movdqa %xmm4,%xmm3

	paddd %xmm1,%xmm2
	pcmpeqd %xmm5,%xmm1
	movdqa %xmm0,-128(%rax)
	movdqa %xmm4,%xmm0

	paddd %xmm2,%xmm3
	pcmpeqd %xmm5,%xmm2
	movdqa %xmm1,-112(%rax)
	movdqa %xmm4,%xmm1

	paddd %xmm3,%xmm0
	pcmpeqd %xmm5,%xmm3
	movdqa %xmm2,-96(%rax)
	movdqa %xmm4,%xmm2
	paddd %xmm0,%xmm1
	pcmpeqd %xmm5,%xmm0
	movdqa %xmm3,-80(%rax)
	movdqa %xmm4,%xmm3

	paddd %xmm1,%xmm2
	pcmpeqd %xmm5,%xmm1
	movdqa %xmm0,-64(%rax)
	movdqa %xmm4,%xmm0

	paddd %xmm2,%xmm3
	pcmpeqd %xmm5,%xmm2
	movdqa %xmm1,-48(%rax)
	movdqa %xmm4,%xmm1

	paddd %xmm3,%xmm0
	pcmpeqd %xmm5,%xmm3
	movdqa %xmm2,-32(%rax)
	movdqa %xmm4,%xmm2
	paddd %xmm0,%xmm1
	pcmpeqd %xmm5,%xmm0
	movdqa %xmm3,-16(%rax)
	movdqa %xmm4,%xmm3

	paddd %xmm1,%xmm2
	pcmpeqd %xmm5,%xmm1
	movdqa %xmm0,0(%rax)
	movdqa %xmm4,%xmm0

	paddd %xmm2,%xmm3
	pcmpeqd %xmm5,%xmm2
	movdqa %xmm1,16(%rax)
	movdqa %xmm4,%xmm1

	paddd %xmm3,%xmm0
	pcmpeqd %xmm5,%xmm3
	movdqa %xmm2,32(%rax)
	movdqa %xmm4,%xmm2
	paddd %xmm0,%xmm1
	pcmpeqd %xmm5,%xmm0
	movdqa %xmm3,48(%rax)
	movdqa %xmm4,%xmm3

	paddd %xmm1,%xmm2
	pcmpeqd %xmm5,%xmm1
	movdqa %xmm0,64(%rax)
	movdqa %xmm4,%xmm0

	paddd %xmm2,%xmm3
	pcmpeqd %xmm5,%xmm2
	movdqa %xmm1,80(%rax)
	movdqa %xmm4,%xmm1

	paddd %xmm3,%xmm0
	pcmpeqd %xmm5,%xmm3
	movdqa %xmm2,96(%rax)
	movdqa %xmm4,%xmm2
	movdqa %xmm3,112(%rax)
	jmp .Lgather

.align 32
.Lgather:                           # per output limb: touch all 32 entries,
	pxor %xmm4,%xmm4                # mask and OR them together
	pxor %xmm5,%xmm5
	movdqa -128(%r11),%xmm0
	movdqa -112(%r11),%xmm1
	movdqa -96(%r11),%xmm2
	pand -128(%rax),%xmm0
	movdqa -80(%r11),%xmm3
	pand -112(%rax),%xmm1
	por %xmm0,%xmm4
	pand -96(%rax),%xmm2
	por %xmm1,%xmm5
	pand -80(%rax),%xmm3
	por %xmm2,%xmm4
	por %xmm3,%xmm5
	movdqa -64(%r11),%xmm0
	movdqa -48(%r11),%xmm1
	movdqa -32(%r11),%xmm2
	pand -64(%rax),%xmm0
	movdqa -16(%r11),%xmm3
	pand -48(%rax),%xmm1
	por %xmm0,%xmm4
	pand -32(%rax),%xmm2
	por %xmm1,%xmm5
	pand -16(%rax),%xmm3
	por %xmm2,%xmm4
	por %xmm3,%xmm5
	movdqa 0(%r11),%xmm0
	movdqa 16(%r11),%xmm1
	movdqa 32(%r11),%xmm2
	pand 0(%rax),%xmm0
	movdqa 48(%r11),%xmm3
	pand 16(%rax),%xmm1
	por %xmm0,%xmm4
	pand 32(%rax),%xmm2
	por %xmm1,%xmm5
	pand 48(%rax),%xmm3
	por %xmm2,%xmm4
	por %xmm3,%xmm5
	movdqa 64(%r11),%xmm0
	movdqa 80(%r11),%xmm1
	movdqa 96(%r11),%xmm2
	pand 64(%rax),%xmm0
	movdqa 112(%r11),%xmm3
	pand 80(%rax),%xmm1
	por %xmm0,%xmm4
	pand 96(%rax),%xmm2
	por %xmm1,%xmm5
	pand 112(%rax),%xmm3
	por %xmm2,%xmm4
	por %xmm3,%xmm5
	por %xmm5,%xmm4
	leaq 256(%r11),%r11             # next interleaved row
	pshufd $0x4e,%xmm4,%xmm0        # fold high qword into low
	por %xmm4,%xmm0
	movq %xmm0,(%rdi)
	leaq 8(%rdi),%rdi
	subl $1,%esi
	jnz .Lgather

	leaq (%r10),%rsp                # restore original %rsp
	.byte 0xf3,0xc3                 # repz ret
.LSEH_end_bn_gather5:
.size bn_gather5,.-bn_gather5

# .Linc - {0,0,1,1} then {2,2,2,2}: pcmpeqd counter increments for the mask
# builders above.  The trailing .byte directive is truncated in this chunk;
# its operands continue past the visible region.
.align 64
.Linc:
.long 0,0, 1,1
.long 2,2, 2,2
.byte
77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 3648