/* x86_64-mont5.S revision 1.5 */
1#include <machine/asm.h> 2.text 3 4 5 6.globl bn_mul_mont_gather5 7.type bn_mul_mont_gather5,@function 8.align 64 9bn_mul_mont_gather5: 10 movl %r9d,%r9d 11 movq %rsp,%rax 12 testl $7,%r9d 13 jnz .Lmul_enter 14 movl OPENSSL_ia32cap_P+8(%rip),%r11d 15 jmp .Lmul4x_enter 16 17.align 16 18.Lmul_enter: 19 movd 8(%rsp),%xmm5 20 pushq %rbx 21 pushq %rbp 22 pushq %r12 23 pushq %r13 24 pushq %r14 25 pushq %r15 26 27 negq %r9 28 movq %rsp,%r11 29 leaq -280(%rsp,%r9,8),%r10 30 negq %r9 31 andq $-1024,%r10 32 33 34 35 36 37 38 39 subq %r10,%r11 40 andq $-4096,%r11 41 leaq (%r10,%r11,1),%rsp 42 movq (%rsp),%r11 43 cmpq %r10,%rsp 44 ja .Lmul_page_walk 45 jmp .Lmul_page_walk_done 46 47.Lmul_page_walk: 48 leaq -4096(%rsp),%rsp 49 movq (%rsp),%r11 50 cmpq %r10,%rsp 51 ja .Lmul_page_walk 52.Lmul_page_walk_done: 53 54 leaq .Linc(%rip),%r10 55 movq %rax,8(%rsp,%r9,8) 56.Lmul_body: 57 58 leaq 128(%rdx),%r12 59 movdqa 0(%r10),%xmm0 60 movdqa 16(%r10),%xmm1 61 leaq 24-112(%rsp,%r9,8),%r10 62 andq $-16,%r10 63 64 pshufd $0,%xmm5,%xmm5 65 movdqa %xmm1,%xmm4 66 movdqa %xmm1,%xmm2 67 paddd %xmm0,%xmm1 68 pcmpeqd %xmm5,%xmm0 69.byte 0x67 70 movdqa %xmm4,%xmm3 71 paddd %xmm1,%xmm2 72 pcmpeqd %xmm5,%xmm1 73 movdqa %xmm0,112(%r10) 74 movdqa %xmm4,%xmm0 75 76 paddd %xmm2,%xmm3 77 pcmpeqd %xmm5,%xmm2 78 movdqa %xmm1,128(%r10) 79 movdqa %xmm4,%xmm1 80 81 paddd %xmm3,%xmm0 82 pcmpeqd %xmm5,%xmm3 83 movdqa %xmm2,144(%r10) 84 movdqa %xmm4,%xmm2 85 86 paddd %xmm0,%xmm1 87 pcmpeqd %xmm5,%xmm0 88 movdqa %xmm3,160(%r10) 89 movdqa %xmm4,%xmm3 90 paddd %xmm1,%xmm2 91 pcmpeqd %xmm5,%xmm1 92 movdqa %xmm0,176(%r10) 93 movdqa %xmm4,%xmm0 94 95 paddd %xmm2,%xmm3 96 pcmpeqd %xmm5,%xmm2 97 movdqa %xmm1,192(%r10) 98 movdqa %xmm4,%xmm1 99 100 paddd %xmm3,%xmm0 101 pcmpeqd %xmm5,%xmm3 102 movdqa %xmm2,208(%r10) 103 movdqa %xmm4,%xmm2 104 105 paddd %xmm0,%xmm1 106 pcmpeqd %xmm5,%xmm0 107 movdqa %xmm3,224(%r10) 108 movdqa %xmm4,%xmm3 109 paddd %xmm1,%xmm2 110 pcmpeqd %xmm5,%xmm1 111 movdqa %xmm0,240(%r10) 112 movdqa 
%xmm4,%xmm0 113 114 paddd %xmm2,%xmm3 115 pcmpeqd %xmm5,%xmm2 116 movdqa %xmm1,256(%r10) 117 movdqa %xmm4,%xmm1 118 119 paddd %xmm3,%xmm0 120 pcmpeqd %xmm5,%xmm3 121 movdqa %xmm2,272(%r10) 122 movdqa %xmm4,%xmm2 123 124 paddd %xmm0,%xmm1 125 pcmpeqd %xmm5,%xmm0 126 movdqa %xmm3,288(%r10) 127 movdqa %xmm4,%xmm3 128 paddd %xmm1,%xmm2 129 pcmpeqd %xmm5,%xmm1 130 movdqa %xmm0,304(%r10) 131 132 paddd %xmm2,%xmm3 133.byte 0x67 134 pcmpeqd %xmm5,%xmm2 135 movdqa %xmm1,320(%r10) 136 137 pcmpeqd %xmm5,%xmm3 138 movdqa %xmm2,336(%r10) 139 pand 64(%r12),%xmm0 140 141 pand 80(%r12),%xmm1 142 pand 96(%r12),%xmm2 143 movdqa %xmm3,352(%r10) 144 pand 112(%r12),%xmm3 145 por %xmm2,%xmm0 146 por %xmm3,%xmm1 147 movdqa -128(%r12),%xmm4 148 movdqa -112(%r12),%xmm5 149 movdqa -96(%r12),%xmm2 150 pand 112(%r10),%xmm4 151 movdqa -80(%r12),%xmm3 152 pand 128(%r10),%xmm5 153 por %xmm4,%xmm0 154 pand 144(%r10),%xmm2 155 por %xmm5,%xmm1 156 pand 160(%r10),%xmm3 157 por %xmm2,%xmm0 158 por %xmm3,%xmm1 159 movdqa -64(%r12),%xmm4 160 movdqa -48(%r12),%xmm5 161 movdqa -32(%r12),%xmm2 162 pand 176(%r10),%xmm4 163 movdqa -16(%r12),%xmm3 164 pand 192(%r10),%xmm5 165 por %xmm4,%xmm0 166 pand 208(%r10),%xmm2 167 por %xmm5,%xmm1 168 pand 224(%r10),%xmm3 169 por %xmm2,%xmm0 170 por %xmm3,%xmm1 171 movdqa 0(%r12),%xmm4 172 movdqa 16(%r12),%xmm5 173 movdqa 32(%r12),%xmm2 174 pand 240(%r10),%xmm4 175 movdqa 48(%r12),%xmm3 176 pand 256(%r10),%xmm5 177 por %xmm4,%xmm0 178 pand 272(%r10),%xmm2 179 por %xmm5,%xmm1 180 pand 288(%r10),%xmm3 181 por %xmm2,%xmm0 182 por %xmm3,%xmm1 183 por %xmm1,%xmm0 184 pshufd $0x4e,%xmm0,%xmm1 185 por %xmm1,%xmm0 186 leaq 256(%r12),%r12 187.byte 102,72,15,126,195 188 189 movq (%r8),%r8 190 movq (%rsi),%rax 191 192 xorq %r14,%r14 193 xorq %r15,%r15 194 195 movq %r8,%rbp 196 mulq %rbx 197 movq %rax,%r10 198 movq (%rcx),%rax 199 200 imulq %r10,%rbp 201 movq %rdx,%r11 202 203 mulq %rbp 204 addq %rax,%r10 205 movq 8(%rsi),%rax 206 adcq $0,%rdx 207 movq %rdx,%r13 208 209 leaq 
1(%r15),%r15 210 jmp .L1st_enter 211 212.align 16 213.L1st: 214 addq %rax,%r13 215 movq (%rsi,%r15,8),%rax 216 adcq $0,%rdx 217 addq %r11,%r13 218 movq %r10,%r11 219 adcq $0,%rdx 220 movq %r13,-16(%rsp,%r15,8) 221 movq %rdx,%r13 222 223.L1st_enter: 224 mulq %rbx 225 addq %rax,%r11 226 movq (%rcx,%r15,8),%rax 227 adcq $0,%rdx 228 leaq 1(%r15),%r15 229 movq %rdx,%r10 230 231 mulq %rbp 232 cmpq %r9,%r15 233 jne .L1st 234 235 236 addq %rax,%r13 237 adcq $0,%rdx 238 addq %r11,%r13 239 adcq $0,%rdx 240 movq %r13,-16(%rsp,%r9,8) 241 movq %rdx,%r13 242 movq %r10,%r11 243 244 xorq %rdx,%rdx 245 addq %r11,%r13 246 adcq $0,%rdx 247 movq %r13,-8(%rsp,%r9,8) 248 movq %rdx,(%rsp,%r9,8) 249 250 leaq 1(%r14),%r14 251 jmp .Louter 252.align 16 253.Louter: 254 leaq 24+128(%rsp,%r9,8),%rdx 255 andq $-16,%rdx 256 pxor %xmm4,%xmm4 257 pxor %xmm5,%xmm5 258 movdqa -128(%r12),%xmm0 259 movdqa -112(%r12),%xmm1 260 movdqa -96(%r12),%xmm2 261 movdqa -80(%r12),%xmm3 262 pand -128(%rdx),%xmm0 263 pand -112(%rdx),%xmm1 264 por %xmm0,%xmm4 265 pand -96(%rdx),%xmm2 266 por %xmm1,%xmm5 267 pand -80(%rdx),%xmm3 268 por %xmm2,%xmm4 269 por %xmm3,%xmm5 270 movdqa -64(%r12),%xmm0 271 movdqa -48(%r12),%xmm1 272 movdqa -32(%r12),%xmm2 273 movdqa -16(%r12),%xmm3 274 pand -64(%rdx),%xmm0 275 pand -48(%rdx),%xmm1 276 por %xmm0,%xmm4 277 pand -32(%rdx),%xmm2 278 por %xmm1,%xmm5 279 pand -16(%rdx),%xmm3 280 por %xmm2,%xmm4 281 por %xmm3,%xmm5 282 movdqa 0(%r12),%xmm0 283 movdqa 16(%r12),%xmm1 284 movdqa 32(%r12),%xmm2 285 movdqa 48(%r12),%xmm3 286 pand 0(%rdx),%xmm0 287 pand 16(%rdx),%xmm1 288 por %xmm0,%xmm4 289 pand 32(%rdx),%xmm2 290 por %xmm1,%xmm5 291 pand 48(%rdx),%xmm3 292 por %xmm2,%xmm4 293 por %xmm3,%xmm5 294 movdqa 64(%r12),%xmm0 295 movdqa 80(%r12),%xmm1 296 movdqa 96(%r12),%xmm2 297 movdqa 112(%r12),%xmm3 298 pand 64(%rdx),%xmm0 299 pand 80(%rdx),%xmm1 300 por %xmm0,%xmm4 301 pand 96(%rdx),%xmm2 302 por %xmm1,%xmm5 303 pand 112(%rdx),%xmm3 304 por %xmm2,%xmm4 305 por %xmm3,%xmm5 306 por 
%xmm5,%xmm4 307 pshufd $0x4e,%xmm4,%xmm0 308 por %xmm4,%xmm0 309 leaq 256(%r12),%r12 310 311 movq (%rsi),%rax 312.byte 102,72,15,126,195 313 314 xorq %r15,%r15 315 movq %r8,%rbp 316 movq (%rsp),%r10 317 318 mulq %rbx 319 addq %rax,%r10 320 movq (%rcx),%rax 321 adcq $0,%rdx 322 323 imulq %r10,%rbp 324 movq %rdx,%r11 325 326 mulq %rbp 327 addq %rax,%r10 328 movq 8(%rsi),%rax 329 adcq $0,%rdx 330 movq 8(%rsp),%r10 331 movq %rdx,%r13 332 333 leaq 1(%r15),%r15 334 jmp .Linner_enter 335 336.align 16 337.Linner: 338 addq %rax,%r13 339 movq (%rsi,%r15,8),%rax 340 adcq $0,%rdx 341 addq %r10,%r13 342 movq (%rsp,%r15,8),%r10 343 adcq $0,%rdx 344 movq %r13,-16(%rsp,%r15,8) 345 movq %rdx,%r13 346 347.Linner_enter: 348 mulq %rbx 349 addq %rax,%r11 350 movq (%rcx,%r15,8),%rax 351 adcq $0,%rdx 352 addq %r11,%r10 353 movq %rdx,%r11 354 adcq $0,%r11 355 leaq 1(%r15),%r15 356 357 mulq %rbp 358 cmpq %r9,%r15 359 jne .Linner 360 361 addq %rax,%r13 362 adcq $0,%rdx 363 addq %r10,%r13 364 movq (%rsp,%r9,8),%r10 365 adcq $0,%rdx 366 movq %r13,-16(%rsp,%r9,8) 367 movq %rdx,%r13 368 369 xorq %rdx,%rdx 370 addq %r11,%r13 371 adcq $0,%rdx 372 addq %r10,%r13 373 adcq $0,%rdx 374 movq %r13,-8(%rsp,%r9,8) 375 movq %rdx,(%rsp,%r9,8) 376 377 leaq 1(%r14),%r14 378 cmpq %r9,%r14 379 jb .Louter 380 381 xorq %r14,%r14 382 movq (%rsp),%rax 383 leaq (%rsp),%rsi 384 movq %r9,%r15 385 jmp .Lsub 386.align 16 387.Lsub: sbbq (%rcx,%r14,8),%rax 388 movq %rax,(%rdi,%r14,8) 389 movq 8(%rsi,%r14,8),%rax 390 leaq 1(%r14),%r14 391 decq %r15 392 jnz .Lsub 393 394 sbbq $0,%rax 395 xorq %r14,%r14 396 andq %rax,%rsi 397 notq %rax 398 movq %rdi,%rcx 399 andq %rax,%rcx 400 movq %r9,%r15 401 orq %rcx,%rsi 402.align 16 403.Lcopy: 404 movq (%rsi,%r14,8),%rax 405 movq %r14,(%rsp,%r14,8) 406 movq %rax,(%rdi,%r14,8) 407 leaq 1(%r14),%r14 408 subq $1,%r15 409 jnz .Lcopy 410 411 movq 8(%rsp,%r9,8),%rsi 412 movq $1,%rax 413 414 movq -48(%rsi),%r15 415 movq -40(%rsi),%r14 416 movq -32(%rsi),%r13 417 movq -24(%rsi),%r12 418 movq 
-16(%rsi),%rbp 419 movq -8(%rsi),%rbx 420 leaq (%rsi),%rsp 421.Lmul_epilogue: 422 .byte 0xf3,0xc3 423.size bn_mul_mont_gather5,.-bn_mul_mont_gather5 424.type bn_mul4x_mont_gather5,@function 425.align 32 426bn_mul4x_mont_gather5: 427.byte 0x67 428 movq %rsp,%rax 429.Lmul4x_enter: 430 andl $0x80108,%r11d 431 cmpl $0x80108,%r11d 432 je .Lmulx4x_enter 433 pushq %rbx 434 pushq %rbp 435 pushq %r12 436 pushq %r13 437 pushq %r14 438 pushq %r15 439.Lmul4x_prologue: 440 441.byte 0x67 442 shll $3,%r9d 443 leaq (%r9,%r9,2),%r10 444 negq %r9 445 446 447 448 449 450 451 452 453 454 455 leaq -320(%rsp,%r9,2),%r11 456 movq %rsp,%rbp 457 subq %rdi,%r11 458 andq $4095,%r11 459 cmpq %r11,%r10 460 jb .Lmul4xsp_alt 461 subq %r11,%rbp 462 leaq -320(%rbp,%r9,2),%rbp 463 jmp .Lmul4xsp_done 464 465.align 32 466.Lmul4xsp_alt: 467 leaq 4096-320(,%r9,2),%r10 468 leaq -320(%rbp,%r9,2),%rbp 469 subq %r10,%r11 470 movq $0,%r10 471 cmovcq %r10,%r11 472 subq %r11,%rbp 473.Lmul4xsp_done: 474 andq $-64,%rbp 475 movq %rsp,%r11 476 subq %rbp,%r11 477 andq $-4096,%r11 478 leaq (%r11,%rbp,1),%rsp 479 movq (%rsp),%r10 480 cmpq %rbp,%rsp 481 ja .Lmul4x_page_walk 482 jmp .Lmul4x_page_walk_done 483 484.Lmul4x_page_walk: 485 leaq -4096(%rsp),%rsp 486 movq (%rsp),%r10 487 cmpq %rbp,%rsp 488 ja .Lmul4x_page_walk 489.Lmul4x_page_walk_done: 490 491 negq %r9 492 493 movq %rax,40(%rsp) 494.Lmul4x_body: 495 496 call mul4x_internal 497 498 movq 40(%rsp),%rsi 499 movq $1,%rax 500 501 movq -48(%rsi),%r15 502 movq -40(%rsi),%r14 503 movq -32(%rsi),%r13 504 movq -24(%rsi),%r12 505 movq -16(%rsi),%rbp 506 movq -8(%rsi),%rbx 507 leaq (%rsi),%rsp 508.Lmul4x_epilogue: 509 .byte 0xf3,0xc3 510.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 511 512.type mul4x_internal,@function 513.align 32 514mul4x_internal: 515 shlq $5,%r9 516 movd 8(%rax),%xmm5 517 leaq .Linc(%rip),%rax 518 leaq 128(%rdx,%r9,1),%r13 519 shrq $5,%r9 520 movdqa 0(%rax),%xmm0 521 movdqa 16(%rax),%xmm1 522 leaq 88-112(%rsp,%r9,1),%r10 523 leaq 
128(%rdx),%r12 524 525 pshufd $0,%xmm5,%xmm5 526 movdqa %xmm1,%xmm4 527.byte 0x67,0x67 528 movdqa %xmm1,%xmm2 529 paddd %xmm0,%xmm1 530 pcmpeqd %xmm5,%xmm0 531.byte 0x67 532 movdqa %xmm4,%xmm3 533 paddd %xmm1,%xmm2 534 pcmpeqd %xmm5,%xmm1 535 movdqa %xmm0,112(%r10) 536 movdqa %xmm4,%xmm0 537 538 paddd %xmm2,%xmm3 539 pcmpeqd %xmm5,%xmm2 540 movdqa %xmm1,128(%r10) 541 movdqa %xmm4,%xmm1 542 543 paddd %xmm3,%xmm0 544 pcmpeqd %xmm5,%xmm3 545 movdqa %xmm2,144(%r10) 546 movdqa %xmm4,%xmm2 547 548 paddd %xmm0,%xmm1 549 pcmpeqd %xmm5,%xmm0 550 movdqa %xmm3,160(%r10) 551 movdqa %xmm4,%xmm3 552 paddd %xmm1,%xmm2 553 pcmpeqd %xmm5,%xmm1 554 movdqa %xmm0,176(%r10) 555 movdqa %xmm4,%xmm0 556 557 paddd %xmm2,%xmm3 558 pcmpeqd %xmm5,%xmm2 559 movdqa %xmm1,192(%r10) 560 movdqa %xmm4,%xmm1 561 562 paddd %xmm3,%xmm0 563 pcmpeqd %xmm5,%xmm3 564 movdqa %xmm2,208(%r10) 565 movdqa %xmm4,%xmm2 566 567 paddd %xmm0,%xmm1 568 pcmpeqd %xmm5,%xmm0 569 movdqa %xmm3,224(%r10) 570 movdqa %xmm4,%xmm3 571 paddd %xmm1,%xmm2 572 pcmpeqd %xmm5,%xmm1 573 movdqa %xmm0,240(%r10) 574 movdqa %xmm4,%xmm0 575 576 paddd %xmm2,%xmm3 577 pcmpeqd %xmm5,%xmm2 578 movdqa %xmm1,256(%r10) 579 movdqa %xmm4,%xmm1 580 581 paddd %xmm3,%xmm0 582 pcmpeqd %xmm5,%xmm3 583 movdqa %xmm2,272(%r10) 584 movdqa %xmm4,%xmm2 585 586 paddd %xmm0,%xmm1 587 pcmpeqd %xmm5,%xmm0 588 movdqa %xmm3,288(%r10) 589 movdqa %xmm4,%xmm3 590 paddd %xmm1,%xmm2 591 pcmpeqd %xmm5,%xmm1 592 movdqa %xmm0,304(%r10) 593 594 paddd %xmm2,%xmm3 595.byte 0x67 596 pcmpeqd %xmm5,%xmm2 597 movdqa %xmm1,320(%r10) 598 599 pcmpeqd %xmm5,%xmm3 600 movdqa %xmm2,336(%r10) 601 pand 64(%r12),%xmm0 602 603 pand 80(%r12),%xmm1 604 pand 96(%r12),%xmm2 605 movdqa %xmm3,352(%r10) 606 pand 112(%r12),%xmm3 607 por %xmm2,%xmm0 608 por %xmm3,%xmm1 609 movdqa -128(%r12),%xmm4 610 movdqa -112(%r12),%xmm5 611 movdqa -96(%r12),%xmm2 612 pand 112(%r10),%xmm4 613 movdqa -80(%r12),%xmm3 614 pand 128(%r10),%xmm5 615 por %xmm4,%xmm0 616 pand 144(%r10),%xmm2 617 por %xmm5,%xmm1 618 
pand 160(%r10),%xmm3 619 por %xmm2,%xmm0 620 por %xmm3,%xmm1 621 movdqa -64(%r12),%xmm4 622 movdqa -48(%r12),%xmm5 623 movdqa -32(%r12),%xmm2 624 pand 176(%r10),%xmm4 625 movdqa -16(%r12),%xmm3 626 pand 192(%r10),%xmm5 627 por %xmm4,%xmm0 628 pand 208(%r10),%xmm2 629 por %xmm5,%xmm1 630 pand 224(%r10),%xmm3 631 por %xmm2,%xmm0 632 por %xmm3,%xmm1 633 movdqa 0(%r12),%xmm4 634 movdqa 16(%r12),%xmm5 635 movdqa 32(%r12),%xmm2 636 pand 240(%r10),%xmm4 637 movdqa 48(%r12),%xmm3 638 pand 256(%r10),%xmm5 639 por %xmm4,%xmm0 640 pand 272(%r10),%xmm2 641 por %xmm5,%xmm1 642 pand 288(%r10),%xmm3 643 por %xmm2,%xmm0 644 por %xmm3,%xmm1 645 por %xmm1,%xmm0 646 pshufd $0x4e,%xmm0,%xmm1 647 por %xmm1,%xmm0 648 leaq 256(%r12),%r12 649.byte 102,72,15,126,195 650 651 movq %r13,16+8(%rsp) 652 movq %rdi,56+8(%rsp) 653 654 movq (%r8),%r8 655 movq (%rsi),%rax 656 leaq (%rsi,%r9,1),%rsi 657 negq %r9 658 659 movq %r8,%rbp 660 mulq %rbx 661 movq %rax,%r10 662 movq (%rcx),%rax 663 664 imulq %r10,%rbp 665 leaq 64+8(%rsp),%r14 666 movq %rdx,%r11 667 668 mulq %rbp 669 addq %rax,%r10 670 movq 8(%rsi,%r9,1),%rax 671 adcq $0,%rdx 672 movq %rdx,%rdi 673 674 mulq %rbx 675 addq %rax,%r11 676 movq 8(%rcx),%rax 677 adcq $0,%rdx 678 movq %rdx,%r10 679 680 mulq %rbp 681 addq %rax,%rdi 682 movq 16(%rsi,%r9,1),%rax 683 adcq $0,%rdx 684 addq %r11,%rdi 685 leaq 32(%r9),%r15 686 leaq 32(%rcx),%rcx 687 adcq $0,%rdx 688 movq %rdi,(%r14) 689 movq %rdx,%r13 690 jmp .L1st4x 691 692.align 32 693.L1st4x: 694 mulq %rbx 695 addq %rax,%r10 696 movq -16(%rcx),%rax 697 leaq 32(%r14),%r14 698 adcq $0,%rdx 699 movq %rdx,%r11 700 701 mulq %rbp 702 addq %rax,%r13 703 movq -8(%rsi,%r15,1),%rax 704 adcq $0,%rdx 705 addq %r10,%r13 706 adcq $0,%rdx 707 movq %r13,-24(%r14) 708 movq %rdx,%rdi 709 710 mulq %rbx 711 addq %rax,%r11 712 movq -8(%rcx),%rax 713 adcq $0,%rdx 714 movq %rdx,%r10 715 716 mulq %rbp 717 addq %rax,%rdi 718 movq (%rsi,%r15,1),%rax 719 adcq $0,%rdx 720 addq %r11,%rdi 721 adcq $0,%rdx 722 movq %rdi,-16(%r14) 723 
movq %rdx,%r13 724 725 mulq %rbx 726 addq %rax,%r10 727 movq 0(%rcx),%rax 728 adcq $0,%rdx 729 movq %rdx,%r11 730 731 mulq %rbp 732 addq %rax,%r13 733 movq 8(%rsi,%r15,1),%rax 734 adcq $0,%rdx 735 addq %r10,%r13 736 adcq $0,%rdx 737 movq %r13,-8(%r14) 738 movq %rdx,%rdi 739 740 mulq %rbx 741 addq %rax,%r11 742 movq 8(%rcx),%rax 743 adcq $0,%rdx 744 movq %rdx,%r10 745 746 mulq %rbp 747 addq %rax,%rdi 748 movq 16(%rsi,%r15,1),%rax 749 adcq $0,%rdx 750 addq %r11,%rdi 751 leaq 32(%rcx),%rcx 752 adcq $0,%rdx 753 movq %rdi,(%r14) 754 movq %rdx,%r13 755 756 addq $32,%r15 757 jnz .L1st4x 758 759 mulq %rbx 760 addq %rax,%r10 761 movq -16(%rcx),%rax 762 leaq 32(%r14),%r14 763 adcq $0,%rdx 764 movq %rdx,%r11 765 766 mulq %rbp 767 addq %rax,%r13 768 movq -8(%rsi),%rax 769 adcq $0,%rdx 770 addq %r10,%r13 771 adcq $0,%rdx 772 movq %r13,-24(%r14) 773 movq %rdx,%rdi 774 775 mulq %rbx 776 addq %rax,%r11 777 movq -8(%rcx),%rax 778 adcq $0,%rdx 779 movq %rdx,%r10 780 781 mulq %rbp 782 addq %rax,%rdi 783 movq (%rsi,%r9,1),%rax 784 adcq $0,%rdx 785 addq %r11,%rdi 786 adcq $0,%rdx 787 movq %rdi,-16(%r14) 788 movq %rdx,%r13 789 790 leaq (%rcx,%r9,1),%rcx 791 792 xorq %rdi,%rdi 793 addq %r10,%r13 794 adcq $0,%rdi 795 movq %r13,-8(%r14) 796 797 jmp .Louter4x 798 799.align 32 800.Louter4x: 801 leaq 16+128(%r14),%rdx 802 pxor %xmm4,%xmm4 803 pxor %xmm5,%xmm5 804 movdqa -128(%r12),%xmm0 805 movdqa -112(%r12),%xmm1 806 movdqa -96(%r12),%xmm2 807 movdqa -80(%r12),%xmm3 808 pand -128(%rdx),%xmm0 809 pand -112(%rdx),%xmm1 810 por %xmm0,%xmm4 811 pand -96(%rdx),%xmm2 812 por %xmm1,%xmm5 813 pand -80(%rdx),%xmm3 814 por %xmm2,%xmm4 815 por %xmm3,%xmm5 816 movdqa -64(%r12),%xmm0 817 movdqa -48(%r12),%xmm1 818 movdqa -32(%r12),%xmm2 819 movdqa -16(%r12),%xmm3 820 pand -64(%rdx),%xmm0 821 pand -48(%rdx),%xmm1 822 por %xmm0,%xmm4 823 pand -32(%rdx),%xmm2 824 por %xmm1,%xmm5 825 pand -16(%rdx),%xmm3 826 por %xmm2,%xmm4 827 por %xmm3,%xmm5 828 movdqa 0(%r12),%xmm0 829 movdqa 16(%r12),%xmm1 830 movdqa 
32(%r12),%xmm2 831 movdqa 48(%r12),%xmm3 832 pand 0(%rdx),%xmm0 833 pand 16(%rdx),%xmm1 834 por %xmm0,%xmm4 835 pand 32(%rdx),%xmm2 836 por %xmm1,%xmm5 837 pand 48(%rdx),%xmm3 838 por %xmm2,%xmm4 839 por %xmm3,%xmm5 840 movdqa 64(%r12),%xmm0 841 movdqa 80(%r12),%xmm1 842 movdqa 96(%r12),%xmm2 843 movdqa 112(%r12),%xmm3 844 pand 64(%rdx),%xmm0 845 pand 80(%rdx),%xmm1 846 por %xmm0,%xmm4 847 pand 96(%rdx),%xmm2 848 por %xmm1,%xmm5 849 pand 112(%rdx),%xmm3 850 por %xmm2,%xmm4 851 por %xmm3,%xmm5 852 por %xmm5,%xmm4 853 pshufd $0x4e,%xmm4,%xmm0 854 por %xmm4,%xmm0 855 leaq 256(%r12),%r12 856.byte 102,72,15,126,195 857 858 movq (%r14,%r9,1),%r10 859 movq %r8,%rbp 860 mulq %rbx 861 addq %rax,%r10 862 movq (%rcx),%rax 863 adcq $0,%rdx 864 865 imulq %r10,%rbp 866 movq %rdx,%r11 867 movq %rdi,(%r14) 868 869 leaq (%r14,%r9,1),%r14 870 871 mulq %rbp 872 addq %rax,%r10 873 movq 8(%rsi,%r9,1),%rax 874 adcq $0,%rdx 875 movq %rdx,%rdi 876 877 mulq %rbx 878 addq %rax,%r11 879 movq 8(%rcx),%rax 880 adcq $0,%rdx 881 addq 8(%r14),%r11 882 adcq $0,%rdx 883 movq %rdx,%r10 884 885 mulq %rbp 886 addq %rax,%rdi 887 movq 16(%rsi,%r9,1),%rax 888 adcq $0,%rdx 889 addq %r11,%rdi 890 leaq 32(%r9),%r15 891 leaq 32(%rcx),%rcx 892 adcq $0,%rdx 893 movq %rdx,%r13 894 jmp .Linner4x 895 896.align 32 897.Linner4x: 898 mulq %rbx 899 addq %rax,%r10 900 movq -16(%rcx),%rax 901 adcq $0,%rdx 902 addq 16(%r14),%r10 903 leaq 32(%r14),%r14 904 adcq $0,%rdx 905 movq %rdx,%r11 906 907 mulq %rbp 908 addq %rax,%r13 909 movq -8(%rsi,%r15,1),%rax 910 adcq $0,%rdx 911 addq %r10,%r13 912 adcq $0,%rdx 913 movq %rdi,-32(%r14) 914 movq %rdx,%rdi 915 916 mulq %rbx 917 addq %rax,%r11 918 movq -8(%rcx),%rax 919 adcq $0,%rdx 920 addq -8(%r14),%r11 921 adcq $0,%rdx 922 movq %rdx,%r10 923 924 mulq %rbp 925 addq %rax,%rdi 926 movq (%rsi,%r15,1),%rax 927 adcq $0,%rdx 928 addq %r11,%rdi 929 adcq $0,%rdx 930 movq %r13,-24(%r14) 931 movq %rdx,%r13 932 933 mulq %rbx 934 addq %rax,%r10 935 movq 0(%rcx),%rax 936 adcq $0,%rdx 937 
addq (%r14),%r10 938 adcq $0,%rdx 939 movq %rdx,%r11 940 941 mulq %rbp 942 addq %rax,%r13 943 movq 8(%rsi,%r15,1),%rax 944 adcq $0,%rdx 945 addq %r10,%r13 946 adcq $0,%rdx 947 movq %rdi,-16(%r14) 948 movq %rdx,%rdi 949 950 mulq %rbx 951 addq %rax,%r11 952 movq 8(%rcx),%rax 953 adcq $0,%rdx 954 addq 8(%r14),%r11 955 adcq $0,%rdx 956 movq %rdx,%r10 957 958 mulq %rbp 959 addq %rax,%rdi 960 movq 16(%rsi,%r15,1),%rax 961 adcq $0,%rdx 962 addq %r11,%rdi 963 leaq 32(%rcx),%rcx 964 adcq $0,%rdx 965 movq %r13,-8(%r14) 966 movq %rdx,%r13 967 968 addq $32,%r15 969 jnz .Linner4x 970 971 mulq %rbx 972 addq %rax,%r10 973 movq -16(%rcx),%rax 974 adcq $0,%rdx 975 addq 16(%r14),%r10 976 leaq 32(%r14),%r14 977 adcq $0,%rdx 978 movq %rdx,%r11 979 980 mulq %rbp 981 addq %rax,%r13 982 movq -8(%rsi),%rax 983 adcq $0,%rdx 984 addq %r10,%r13 985 adcq $0,%rdx 986 movq %rdi,-32(%r14) 987 movq %rdx,%rdi 988 989 mulq %rbx 990 addq %rax,%r11 991 movq %rbp,%rax 992 movq -8(%rcx),%rbp 993 adcq $0,%rdx 994 addq -8(%r14),%r11 995 adcq $0,%rdx 996 movq %rdx,%r10 997 998 mulq %rbp 999 addq %rax,%rdi 1000 movq (%rsi,%r9,1),%rax 1001 adcq $0,%rdx 1002 addq %r11,%rdi 1003 adcq $0,%rdx 1004 movq %r13,-24(%r14) 1005 movq %rdx,%r13 1006 1007 movq %rdi,-16(%r14) 1008 leaq (%rcx,%r9,1),%rcx 1009 1010 xorq %rdi,%rdi 1011 addq %r10,%r13 1012 adcq $0,%rdi 1013 addq (%r14),%r13 1014 adcq $0,%rdi 1015 movq %r13,-8(%r14) 1016 1017 cmpq 16+8(%rsp),%r12 1018 jb .Louter4x 1019 xorq %rax,%rax 1020 subq %r13,%rbp 1021 adcq %r15,%r15 1022 orq %r15,%rdi 1023 subq %rdi,%rax 1024 leaq (%r14,%r9,1),%rbx 1025 movq (%rcx),%r12 1026 leaq (%rcx),%rbp 1027 movq %r9,%rcx 1028 sarq $3+2,%rcx 1029 movq 56+8(%rsp),%rdi 1030 decq %r12 1031 xorq %r10,%r10 1032 movq 8(%rbp),%r13 1033 movq 16(%rbp),%r14 1034 movq 24(%rbp),%r15 1035 jmp .Lsqr4x_sub_entry 1036.size mul4x_internal,.-mul4x_internal 1037.globl bn_power5 1038.type bn_power5,@function 1039.align 32 1040bn_power5: 1041 movq %rsp,%rax 1042 movl OPENSSL_ia32cap_P+8(%rip),%r11d 
1043 andl $0x80108,%r11d 1044 cmpl $0x80108,%r11d 1045 je .Lpowerx5_enter 1046 pushq %rbx 1047 pushq %rbp 1048 pushq %r12 1049 pushq %r13 1050 pushq %r14 1051 pushq %r15 1052.Lpower5_prologue: 1053 1054 shll $3,%r9d 1055 leal (%r9,%r9,2),%r10d 1056 negq %r9 1057 movq (%r8),%r8 1058 1059 1060 1061 1062 1063 1064 1065 1066 leaq -320(%rsp,%r9,2),%r11 1067 movq %rsp,%rbp 1068 subq %rdi,%r11 1069 andq $4095,%r11 1070 cmpq %r11,%r10 1071 jb .Lpwr_sp_alt 1072 subq %r11,%rbp 1073 leaq -320(%rbp,%r9,2),%rbp 1074 jmp .Lpwr_sp_done 1075 1076.align 32 1077.Lpwr_sp_alt: 1078 leaq 4096-320(,%r9,2),%r10 1079 leaq -320(%rbp,%r9,2),%rbp 1080 subq %r10,%r11 1081 movq $0,%r10 1082 cmovcq %r10,%r11 1083 subq %r11,%rbp 1084.Lpwr_sp_done: 1085 andq $-64,%rbp 1086 movq %rsp,%r11 1087 subq %rbp,%r11 1088 andq $-4096,%r11 1089 leaq (%r11,%rbp,1),%rsp 1090 movq (%rsp),%r10 1091 cmpq %rbp,%rsp 1092 ja .Lpwr_page_walk 1093 jmp .Lpwr_page_walk_done 1094 1095.Lpwr_page_walk: 1096 leaq -4096(%rsp),%rsp 1097 movq (%rsp),%r10 1098 cmpq %rbp,%rsp 1099 ja .Lpwr_page_walk 1100.Lpwr_page_walk_done: 1101 1102 movq %r9,%r10 1103 negq %r9 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 movq %r8,32(%rsp) 1115 movq %rax,40(%rsp) 1116.Lpower5_body: 1117.byte 102,72,15,110,207 1118.byte 102,72,15,110,209 1119.byte 102,73,15,110,218 1120.byte 102,72,15,110,226 1121 1122 call __bn_sqr8x_internal 1123 call __bn_post4x_internal 1124 call __bn_sqr8x_internal 1125 call __bn_post4x_internal 1126 call __bn_sqr8x_internal 1127 call __bn_post4x_internal 1128 call __bn_sqr8x_internal 1129 call __bn_post4x_internal 1130 call __bn_sqr8x_internal 1131 call __bn_post4x_internal 1132 1133.byte 102,72,15,126,209 1134.byte 102,72,15,126,226 1135 movq %rsi,%rdi 1136 movq 40(%rsp),%rax 1137 leaq 32(%rsp),%r8 1138 1139 call mul4x_internal 1140 1141 movq 40(%rsp),%rsi 1142 movq $1,%rax 1143 movq -48(%rsi),%r15 1144 movq -40(%rsi),%r14 1145 movq -32(%rsi),%r13 1146 movq -24(%rsi),%r12 1147 movq -16(%rsi),%rbp 1148 movq 
-8(%rsi),%rbx 1149 leaq (%rsi),%rsp 1150.Lpower5_epilogue: 1151 .byte 0xf3,0xc3 1152.size bn_power5,.-bn_power5 1153 1154.globl bn_sqr8x_internal 1155.hidden bn_sqr8x_internal 1156.type bn_sqr8x_internal,@function 1157.align 32 1158bn_sqr8x_internal: 1159__bn_sqr8x_internal: 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 leaq 32(%r10),%rbp 1234 leaq (%rsi,%r9,1),%rsi 1235 1236 movq %r9,%rcx 1237 1238 1239 movq -32(%rsi,%rbp,1),%r14 1240 leaq 48+8(%rsp,%r9,2),%rdi 1241 movq -24(%rsi,%rbp,1),%rax 1242 leaq -32(%rdi,%rbp,1),%rdi 1243 movq -16(%rsi,%rbp,1),%rbx 1244 movq %rax,%r15 1245 1246 mulq %r14 1247 movq %rax,%r10 1248 movq %rbx,%rax 1249 movq %rdx,%r11 1250 movq %r10,-24(%rdi,%rbp,1) 1251 1252 mulq %r14 1253 addq %rax,%r11 1254 movq %rbx,%rax 1255 adcq $0,%rdx 1256 movq %r11,-16(%rdi,%rbp,1) 1257 movq %rdx,%r10 1258 1259 1260 movq -8(%rsi,%rbp,1),%rbx 1261 mulq %r15 1262 movq %rax,%r12 1263 movq %rbx,%rax 1264 movq %rdx,%r13 1265 1266 leaq (%rbp),%rcx 1267 mulq %r14 1268 addq %rax,%r10 1269 movq %rbx,%rax 1270 movq %rdx,%r11 1271 adcq $0,%r11 1272 addq %r12,%r10 1273 adcq $0,%r11 1274 movq %r10,-8(%rdi,%rcx,1) 1275 jmp .Lsqr4x_1st 1276 1277.align 32 1278.Lsqr4x_1st: 1279 movq (%rsi,%rcx,1),%rbx 1280 mulq %r15 1281 addq %rax,%r13 1282 movq %rbx,%rax 1283 movq %rdx,%r12 1284 adcq $0,%r12 1285 1286 mulq %r14 1287 addq %rax,%r11 1288 movq %rbx,%rax 1289 movq 8(%rsi,%rcx,1),%rbx 1290 movq %rdx,%r10 1291 adcq $0,%r10 1292 addq %r13,%r11 1293 adcq $0,%r10 1294 1295 1296 mulq %r15 1297 addq %rax,%r12 1298 movq %rbx,%rax 1299 movq %r11,(%rdi,%rcx,1) 1300 movq %rdx,%r13 1301 adcq $0,%r13 1302 1303 mulq %r14 1304 addq %rax,%r10 1305 movq %rbx,%rax 1306 
movq 16(%rsi,%rcx,1),%rbx 1307 movq %rdx,%r11 1308 adcq $0,%r11 1309 addq %r12,%r10 1310 adcq $0,%r11 1311 1312 mulq %r15 1313 addq %rax,%r13 1314 movq %rbx,%rax 1315 movq %r10,8(%rdi,%rcx,1) 1316 movq %rdx,%r12 1317 adcq $0,%r12 1318 1319 mulq %r14 1320 addq %rax,%r11 1321 movq %rbx,%rax 1322 movq 24(%rsi,%rcx,1),%rbx 1323 movq %rdx,%r10 1324 adcq $0,%r10 1325 addq %r13,%r11 1326 adcq $0,%r10 1327 1328 1329 mulq %r15 1330 addq %rax,%r12 1331 movq %rbx,%rax 1332 movq %r11,16(%rdi,%rcx,1) 1333 movq %rdx,%r13 1334 adcq $0,%r13 1335 leaq 32(%rcx),%rcx 1336 1337 mulq %r14 1338 addq %rax,%r10 1339 movq %rbx,%rax 1340 movq %rdx,%r11 1341 adcq $0,%r11 1342 addq %r12,%r10 1343 adcq $0,%r11 1344 movq %r10,-8(%rdi,%rcx,1) 1345 1346 cmpq $0,%rcx 1347 jne .Lsqr4x_1st 1348 1349 mulq %r15 1350 addq %rax,%r13 1351 leaq 16(%rbp),%rbp 1352 adcq $0,%rdx 1353 addq %r11,%r13 1354 adcq $0,%rdx 1355 1356 movq %r13,(%rdi) 1357 movq %rdx,%r12 1358 movq %rdx,8(%rdi) 1359 jmp .Lsqr4x_outer 1360 1361.align 32 1362.Lsqr4x_outer: 1363 movq -32(%rsi,%rbp,1),%r14 1364 leaq 48+8(%rsp,%r9,2),%rdi 1365 movq -24(%rsi,%rbp,1),%rax 1366 leaq -32(%rdi,%rbp,1),%rdi 1367 movq -16(%rsi,%rbp,1),%rbx 1368 movq %rax,%r15 1369 1370 mulq %r14 1371 movq -24(%rdi,%rbp,1),%r10 1372 addq %rax,%r10 1373 movq %rbx,%rax 1374 adcq $0,%rdx 1375 movq %r10,-24(%rdi,%rbp,1) 1376 movq %rdx,%r11 1377 1378 mulq %r14 1379 addq %rax,%r11 1380 movq %rbx,%rax 1381 adcq $0,%rdx 1382 addq -16(%rdi,%rbp,1),%r11 1383 movq %rdx,%r10 1384 adcq $0,%r10 1385 movq %r11,-16(%rdi,%rbp,1) 1386 1387 xorq %r12,%r12 1388 1389 movq -8(%rsi,%rbp,1),%rbx 1390 mulq %r15 1391 addq %rax,%r12 1392 movq %rbx,%rax 1393 adcq $0,%rdx 1394 addq -8(%rdi,%rbp,1),%r12 1395 movq %rdx,%r13 1396 adcq $0,%r13 1397 1398 mulq %r14 1399 addq %rax,%r10 1400 movq %rbx,%rax 1401 adcq $0,%rdx 1402 addq %r12,%r10 1403 movq %rdx,%r11 1404 adcq $0,%r11 1405 movq %r10,-8(%rdi,%rbp,1) 1406 1407 leaq (%rbp),%rcx 1408 jmp .Lsqr4x_inner 1409 1410.align 32 1411.Lsqr4x_inner: 
1412 movq (%rsi,%rcx,1),%rbx 1413 mulq %r15 1414 addq %rax,%r13 1415 movq %rbx,%rax 1416 movq %rdx,%r12 1417 adcq $0,%r12 1418 addq (%rdi,%rcx,1),%r13 1419 adcq $0,%r12 1420 1421.byte 0x67 1422 mulq %r14 1423 addq %rax,%r11 1424 movq %rbx,%rax 1425 movq 8(%rsi,%rcx,1),%rbx 1426 movq %rdx,%r10 1427 adcq $0,%r10 1428 addq %r13,%r11 1429 adcq $0,%r10 1430 1431 mulq %r15 1432 addq %rax,%r12 1433 movq %r11,(%rdi,%rcx,1) 1434 movq %rbx,%rax 1435 movq %rdx,%r13 1436 adcq $0,%r13 1437 addq 8(%rdi,%rcx,1),%r12 1438 leaq 16(%rcx),%rcx 1439 adcq $0,%r13 1440 1441 mulq %r14 1442 addq %rax,%r10 1443 movq %rbx,%rax 1444 adcq $0,%rdx 1445 addq %r12,%r10 1446 movq %rdx,%r11 1447 adcq $0,%r11 1448 movq %r10,-8(%rdi,%rcx,1) 1449 1450 cmpq $0,%rcx 1451 jne .Lsqr4x_inner 1452 1453.byte 0x67 1454 mulq %r15 1455 addq %rax,%r13 1456 adcq $0,%rdx 1457 addq %r11,%r13 1458 adcq $0,%rdx 1459 1460 movq %r13,(%rdi) 1461 movq %rdx,%r12 1462 movq %rdx,8(%rdi) 1463 1464 addq $16,%rbp 1465 jnz .Lsqr4x_outer 1466 1467 1468 movq -32(%rsi),%r14 1469 leaq 48+8(%rsp,%r9,2),%rdi 1470 movq -24(%rsi),%rax 1471 leaq -32(%rdi,%rbp,1),%rdi 1472 movq -16(%rsi),%rbx 1473 movq %rax,%r15 1474 1475 mulq %r14 1476 addq %rax,%r10 1477 movq %rbx,%rax 1478 movq %rdx,%r11 1479 adcq $0,%r11 1480 1481 mulq %r14 1482 addq %rax,%r11 1483 movq %rbx,%rax 1484 movq %r10,-24(%rdi) 1485 movq %rdx,%r10 1486 adcq $0,%r10 1487 addq %r13,%r11 1488 movq -8(%rsi),%rbx 1489 adcq $0,%r10 1490 1491 mulq %r15 1492 addq %rax,%r12 1493 movq %rbx,%rax 1494 movq %r11,-16(%rdi) 1495 movq %rdx,%r13 1496 adcq $0,%r13 1497 1498 mulq %r14 1499 addq %rax,%r10 1500 movq %rbx,%rax 1501 movq %rdx,%r11 1502 adcq $0,%r11 1503 addq %r12,%r10 1504 adcq $0,%r11 1505 movq %r10,-8(%rdi) 1506 1507 mulq %r15 1508 addq %rax,%r13 1509 movq -16(%rsi),%rax 1510 adcq $0,%rdx 1511 addq %r11,%r13 1512 adcq $0,%rdx 1513 1514 movq %r13,(%rdi) 1515 movq %rdx,%r12 1516 movq %rdx,8(%rdi) 1517 1518 mulq %rbx 1519 addq $16,%rbp 1520 xorq %r14,%r14 1521 subq %r9,%rbp 1522 
xorq %r15,%r15 1523 1524 addq %r12,%rax 1525 adcq $0,%rdx 1526 movq %rax,8(%rdi) 1527 movq %rdx,16(%rdi) 1528 movq %r15,24(%rdi) 1529 1530 movq -16(%rsi,%rbp,1),%rax 1531 leaq 48+8(%rsp),%rdi 1532 xorq %r10,%r10 1533 movq 8(%rdi),%r11 1534 1535 leaq (%r14,%r10,2),%r12 1536 shrq $63,%r10 1537 leaq (%rcx,%r11,2),%r13 1538 shrq $63,%r11 1539 orq %r10,%r13 1540 movq 16(%rdi),%r10 1541 movq %r11,%r14 1542 mulq %rax 1543 negq %r15 1544 movq 24(%rdi),%r11 1545 adcq %rax,%r12 1546 movq -8(%rsi,%rbp,1),%rax 1547 movq %r12,(%rdi) 1548 adcq %rdx,%r13 1549 1550 leaq (%r14,%r10,2),%rbx 1551 movq %r13,8(%rdi) 1552 sbbq %r15,%r15 1553 shrq $63,%r10 1554 leaq (%rcx,%r11,2),%r8 1555 shrq $63,%r11 1556 orq %r10,%r8 1557 movq 32(%rdi),%r10 1558 movq %r11,%r14 1559 mulq %rax 1560 negq %r15 1561 movq 40(%rdi),%r11 1562 adcq %rax,%rbx 1563 movq 0(%rsi,%rbp,1),%rax 1564 movq %rbx,16(%rdi) 1565 adcq %rdx,%r8 1566 leaq 16(%rbp),%rbp 1567 movq %r8,24(%rdi) 1568 sbbq %r15,%r15 1569 leaq 64(%rdi),%rdi 1570 jmp .Lsqr4x_shift_n_add 1571 1572.align 32 1573.Lsqr4x_shift_n_add: 1574 leaq (%r14,%r10,2),%r12 1575 shrq $63,%r10 1576 leaq (%rcx,%r11,2),%r13 1577 shrq $63,%r11 1578 orq %r10,%r13 1579 movq -16(%rdi),%r10 1580 movq %r11,%r14 1581 mulq %rax 1582 negq %r15 1583 movq -8(%rdi),%r11 1584 adcq %rax,%r12 1585 movq -8(%rsi,%rbp,1),%rax 1586 movq %r12,-32(%rdi) 1587 adcq %rdx,%r13 1588 1589 leaq (%r14,%r10,2),%rbx 1590 movq %r13,-24(%rdi) 1591 sbbq %r15,%r15 1592 shrq $63,%r10 1593 leaq (%rcx,%r11,2),%r8 1594 shrq $63,%r11 1595 orq %r10,%r8 1596 movq 0(%rdi),%r10 1597 movq %r11,%r14 1598 mulq %rax 1599 negq %r15 1600 movq 8(%rdi),%r11 1601 adcq %rax,%rbx 1602 movq 0(%rsi,%rbp,1),%rax 1603 movq %rbx,-16(%rdi) 1604 adcq %rdx,%r8 1605 1606 leaq (%r14,%r10,2),%r12 1607 movq %r8,-8(%rdi) 1608 sbbq %r15,%r15 1609 shrq $63,%r10 1610 leaq (%rcx,%r11,2),%r13 1611 shrq $63,%r11 1612 orq %r10,%r13 1613 movq 16(%rdi),%r10 1614 movq %r11,%r14 1615 mulq %rax 1616 negq %r15 1617 movq 24(%rdi),%r11 1618 adcq 
%rax,%r12 1619 movq 8(%rsi,%rbp,1),%rax 1620 movq %r12,0(%rdi) 1621 adcq %rdx,%r13 1622 1623 leaq (%r14,%r10,2),%rbx 1624 movq %r13,8(%rdi) 1625 sbbq %r15,%r15 1626 shrq $63,%r10 1627 leaq (%rcx,%r11,2),%r8 1628 shrq $63,%r11 1629 orq %r10,%r8 1630 movq 32(%rdi),%r10 1631 movq %r11,%r14 1632 mulq %rax 1633 negq %r15 1634 movq 40(%rdi),%r11 1635 adcq %rax,%rbx 1636 movq 16(%rsi,%rbp,1),%rax 1637 movq %rbx,16(%rdi) 1638 adcq %rdx,%r8 1639 movq %r8,24(%rdi) 1640 sbbq %r15,%r15 1641 leaq 64(%rdi),%rdi 1642 addq $32,%rbp 1643 jnz .Lsqr4x_shift_n_add 1644 1645 leaq (%r14,%r10,2),%r12 1646.byte 0x67 1647 shrq $63,%r10 1648 leaq (%rcx,%r11,2),%r13 1649 shrq $63,%r11 1650 orq %r10,%r13 1651 movq -16(%rdi),%r10 1652 movq %r11,%r14 1653 mulq %rax 1654 negq %r15 1655 movq -8(%rdi),%r11 1656 adcq %rax,%r12 1657 movq -8(%rsi),%rax 1658 movq %r12,-32(%rdi) 1659 adcq %rdx,%r13 1660 1661 leaq (%r14,%r10,2),%rbx 1662 movq %r13,-24(%rdi) 1663 sbbq %r15,%r15 1664 shrq $63,%r10 1665 leaq (%rcx,%r11,2),%r8 1666 shrq $63,%r11 1667 orq %r10,%r8 1668 mulq %rax 1669 negq %r15 1670 adcq %rax,%rbx 1671 adcq %rdx,%r8 1672 movq %rbx,-16(%rdi) 1673 movq %r8,-8(%rdi) 1674.byte 102,72,15,126,213 1675__bn_sqr8x_reduction: 1676 xorq %rax,%rax 1677 leaq (%r9,%rbp,1),%rcx 1678 leaq 48+8(%rsp,%r9,2),%rdx 1679 movq %rcx,0+8(%rsp) 1680 leaq 48+8(%rsp,%r9,1),%rdi 1681 movq %rdx,8+8(%rsp) 1682 negq %r9 1683 jmp .L8x_reduction_loop 1684 1685.align 32 1686.L8x_reduction_loop: 1687 leaq (%rdi,%r9,1),%rdi 1688.byte 0x66 1689 movq 0(%rdi),%rbx 1690 movq 8(%rdi),%r9 1691 movq 16(%rdi),%r10 1692 movq 24(%rdi),%r11 1693 movq 32(%rdi),%r12 1694 movq 40(%rdi),%r13 1695 movq 48(%rdi),%r14 1696 movq 56(%rdi),%r15 1697 movq %rax,(%rdx) 1698 leaq 64(%rdi),%rdi 1699 1700.byte 0x67 1701 movq %rbx,%r8 1702 imulq 32+8(%rsp),%rbx 1703 movq 0(%rbp),%rax 1704 movl $8,%ecx 1705 jmp .L8x_reduce 1706 1707.align 32 1708.L8x_reduce: 1709 mulq %rbx 1710 movq 8(%rbp),%rax 1711 negq %r8 1712 movq %rdx,%r8 1713 adcq $0,%r8 1714 1715 
mulq %rbx 1716 addq %rax,%r9 1717 movq 16(%rbp),%rax 1718 adcq $0,%rdx 1719 addq %r9,%r8 1720 movq %rbx,48-8+8(%rsp,%rcx,8) 1721 movq %rdx,%r9 1722 adcq $0,%r9 1723 1724 mulq %rbx 1725 addq %rax,%r10 1726 movq 24(%rbp),%rax 1727 adcq $0,%rdx 1728 addq %r10,%r9 1729 movq 32+8(%rsp),%rsi 1730 movq %rdx,%r10 1731 adcq $0,%r10 1732 1733 mulq %rbx 1734 addq %rax,%r11 1735 movq 32(%rbp),%rax 1736 adcq $0,%rdx 1737 imulq %r8,%rsi 1738 addq %r11,%r10 1739 movq %rdx,%r11 1740 adcq $0,%r11 1741 1742 mulq %rbx 1743 addq %rax,%r12 1744 movq 40(%rbp),%rax 1745 adcq $0,%rdx 1746 addq %r12,%r11 1747 movq %rdx,%r12 1748 adcq $0,%r12 1749 1750 mulq %rbx 1751 addq %rax,%r13 1752 movq 48(%rbp),%rax 1753 adcq $0,%rdx 1754 addq %r13,%r12 1755 movq %rdx,%r13 1756 adcq $0,%r13 1757 1758 mulq %rbx 1759 addq %rax,%r14 1760 movq 56(%rbp),%rax 1761 adcq $0,%rdx 1762 addq %r14,%r13 1763 movq %rdx,%r14 1764 adcq $0,%r14 1765 1766 mulq %rbx 1767 movq %rsi,%rbx 1768 addq %rax,%r15 1769 movq 0(%rbp),%rax 1770 adcq $0,%rdx 1771 addq %r15,%r14 1772 movq %rdx,%r15 1773 adcq $0,%r15 1774 1775 decl %ecx 1776 jnz .L8x_reduce 1777 1778 leaq 64(%rbp),%rbp 1779 xorq %rax,%rax 1780 movq 8+8(%rsp),%rdx 1781 cmpq 0+8(%rsp),%rbp 1782 jae .L8x_no_tail 1783 1784.byte 0x66 1785 addq 0(%rdi),%r8 1786 adcq 8(%rdi),%r9 1787 adcq 16(%rdi),%r10 1788 adcq 24(%rdi),%r11 1789 adcq 32(%rdi),%r12 1790 adcq 40(%rdi),%r13 1791 adcq 48(%rdi),%r14 1792 adcq 56(%rdi),%r15 1793 sbbq %rsi,%rsi 1794 1795 movq 48+56+8(%rsp),%rbx 1796 movl $8,%ecx 1797 movq 0(%rbp),%rax 1798 jmp .L8x_tail 1799 1800.align 32 1801.L8x_tail: 1802 mulq %rbx 1803 addq %rax,%r8 1804 movq 8(%rbp),%rax 1805 movq %r8,(%rdi) 1806 movq %rdx,%r8 1807 adcq $0,%r8 1808 1809 mulq %rbx 1810 addq %rax,%r9 1811 movq 16(%rbp),%rax 1812 adcq $0,%rdx 1813 addq %r9,%r8 1814 leaq 8(%rdi),%rdi 1815 movq %rdx,%r9 1816 adcq $0,%r9 1817 1818 mulq %rbx 1819 addq %rax,%r10 1820 movq 24(%rbp),%rax 1821 adcq $0,%rdx 1822 addq %r10,%r9 1823 movq %rdx,%r10 1824 adcq $0,%r10 1825 
1826 mulq %rbx 1827 addq %rax,%r11 1828 movq 32(%rbp),%rax 1829 adcq $0,%rdx 1830 addq %r11,%r10 1831 movq %rdx,%r11 1832 adcq $0,%r11 1833 1834 mulq %rbx 1835 addq %rax,%r12 1836 movq 40(%rbp),%rax 1837 adcq $0,%rdx 1838 addq %r12,%r11 1839 movq %rdx,%r12 1840 adcq $0,%r12 1841 1842 mulq %rbx 1843 addq %rax,%r13 1844 movq 48(%rbp),%rax 1845 adcq $0,%rdx 1846 addq %r13,%r12 1847 movq %rdx,%r13 1848 adcq $0,%r13 1849 1850 mulq %rbx 1851 addq %rax,%r14 1852 movq 56(%rbp),%rax 1853 adcq $0,%rdx 1854 addq %r14,%r13 1855 movq %rdx,%r14 1856 adcq $0,%r14 1857 1858 mulq %rbx 1859 movq 48-16+8(%rsp,%rcx,8),%rbx 1860 addq %rax,%r15 1861 adcq $0,%rdx 1862 addq %r15,%r14 1863 movq 0(%rbp),%rax 1864 movq %rdx,%r15 1865 adcq $0,%r15 1866 1867 decl %ecx 1868 jnz .L8x_tail 1869 1870 leaq 64(%rbp),%rbp 1871 movq 8+8(%rsp),%rdx 1872 cmpq 0+8(%rsp),%rbp 1873 jae .L8x_tail_done 1874 1875 movq 48+56+8(%rsp),%rbx 1876 negq %rsi 1877 movq 0(%rbp),%rax 1878 adcq 0(%rdi),%r8 1879 adcq 8(%rdi),%r9 1880 adcq 16(%rdi),%r10 1881 adcq 24(%rdi),%r11 1882 adcq 32(%rdi),%r12 1883 adcq 40(%rdi),%r13 1884 adcq 48(%rdi),%r14 1885 adcq 56(%rdi),%r15 1886 sbbq %rsi,%rsi 1887 1888 movl $8,%ecx 1889 jmp .L8x_tail 1890 1891.align 32 1892.L8x_tail_done: 1893 xorq %rax,%rax 1894 addq (%rdx),%r8 1895 adcq $0,%r9 1896 adcq $0,%r10 1897 adcq $0,%r11 1898 adcq $0,%r12 1899 adcq $0,%r13 1900 adcq $0,%r14 1901 adcq $0,%r15 1902 adcq $0,%rax 1903 1904 negq %rsi 1905.L8x_no_tail: 1906 adcq 0(%rdi),%r8 1907 adcq 8(%rdi),%r9 1908 adcq 16(%rdi),%r10 1909 adcq 24(%rdi),%r11 1910 adcq 32(%rdi),%r12 1911 adcq 40(%rdi),%r13 1912 adcq 48(%rdi),%r14 1913 adcq 56(%rdi),%r15 1914 adcq $0,%rax 1915 movq -8(%rbp),%rcx 1916 xorq %rsi,%rsi 1917 1918.byte 102,72,15,126,213 1919 1920 movq %r8,0(%rdi) 1921 movq %r9,8(%rdi) 1922.byte 102,73,15,126,217 1923 movq %r10,16(%rdi) 1924 movq %r11,24(%rdi) 1925 movq %r12,32(%rdi) 1926 movq %r13,40(%rdi) 1927 movq %r14,48(%rdi) 1928 movq %r15,56(%rdi) 1929 leaq 64(%rdi),%rdi 1930 1931 cmpq 
%rdx,%rdi
	jb	.L8x_reduction_loop
	.byte	0xf3,0xc3		# rep ret (branch-predictor-friendly return)
.size	bn_sqr8x_internal,.-bn_sqr8x_internal

# __bn_post4x_internal
# Constant-time final reduction for the sqr8x path: conditionally subtracts
# the modulus (at %rbp) from the result read at %rbx = %rdi + %r9 and stores
# the outcome to %rdi.  %rax holds the top carry word; it is turned into an
# all-zero / all-one mask below so the subtraction happens with no
# data-dependent branch or memory access.
.type	__bn_post4x_internal,@function
.align	32
__bn_post4x_internal:
	movq	0(%rbp),%r12		# n[0]
	leaq	(%rdi,%r9,1),%rbx	# source pointer; %r9 = length offset (sign per caller)
	movq	%r9,%rcx
.byte	102,72,15,126,207		# raw-encoded SSE2 movq %xmm->GPR(%rdi); bytes kept for old-assembler compatibility -- TODO confirm exact xmm operand
	negq	%rax			# top carry -> 0 / -1 selection mask
.byte	102,72,15,126,206		# raw-encoded SSE2 movq %xmm->GPR(%rsi) -- TODO confirm exact xmm operand
	sarq	$3+2,%rcx		# byte count -> number of 4-limb (32-byte) groups
	decq	%r12			# pre-decrement n[0]: ~(n[0]-1) == -n[0], so no carry seed is needed
	xorq	%r10,%r10		# %r10 = running borrow (0 / -1), starts clear
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqr4x_sub_entry

.align	16
.Lsqr4x_sub:
	movq	0(%rbp),%r12		# load next 4 modulus limbs
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
.Lsqr4x_sub_entry:
	leaq	32(%rbp),%rbp
	notq	%r12			# complement then mask: mask==-1 -> add ~n, i.e. subtract n
	notq	%r13
	notq	%r14
	notq	%r15
	andq	%rax,%r12		# mask==0 -> add 0 (result copied through unchanged)
	andq	%rax,%r13
	andq	%rax,%r14
	andq	%rax,%r15

	negq	%r10			# restore borrow from previous group into CF
	adcq	0(%rbx),%r12		# t[i] + (~n[i] & mask) + carry
	adcq	8(%rbx),%r13
	adcq	16(%rbx),%r14
	adcq	24(%rbx),%r15
	movq	%r12,0(%rdi)
	leaq	32(%rbx),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r10,%r10		# capture borrow for the next group
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi

	incq	%rcx			# negative group count climbs toward zero
	jnz	.Lsqr4x_sub

	movq	%r9,%r10
	negq	%r9			# restore %r9 sign for the caller
	.byte	0xf3,0xc3		# rep ret
.size	__bn_post4x_internal,.-__bn_post4x_internal

# bn_from_montgomery: front end that only supports limb counts divisible
# by 8; anything else returns 0 so the caller can fall back.
.globl	bn_from_montgomery
.type	bn_from_montgomery,@function
.align	32
bn_from_montgomery:
	testl	$7,%r9d
	jz	bn_from_mont8x		# tail-jump to the 8x implementation
	xorl	%eax,%eax		# unsupported length: return 0
	.byte	0xf3,0xc3		# rep ret
.size	bn_from_montgomery,.-bn_from_montgomery

# bn_from_mont8x: convert a value out of Montgomery form (multiply by 1,
# then Montgomery-reduce) for lengths that are a multiple of 8 limbs.
.type	bn_from_mont8x,@function
.align	32
bn_from_mont8x:
.byte	0x67				# padding prefix (decode/alignment tweak from the generator)
	movq	%rsp,%rax		# remember original %rsp for the epilogue
	pushq	%rbx			# save all callee-saved GPRs (SysV)
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
.Lfrom_prologue:

	shll	$3,%r9d			# %r9 = length in bytes
	leaq	(%r9,%r9,2),%r10	# %r10 = 3*len
	negq	%r9
	movq	(%r8),%r8		# n0, the Montgomery reduction constant

	# Frame selection: pick a stack placement whose distance from the
	# output buffer modulo 4096 avoids aliasing penalties; otherwise
	# fall through to the alternate sizing at .Lfrom_sp_alt.
	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lfrom_sp_alt
	subq	%r11,%rbp
	leaq
-320(%rbp,%r9,2),%rbp 2032 jmp .Lfrom_sp_done 2033 2034.align 32 2035.Lfrom_sp_alt: 2036 leaq 4096-320(,%r9,2),%r10 2037 leaq -320(%rbp,%r9,2),%rbp 2038 subq %r10,%r11 2039 movq $0,%r10 2040 cmovcq %r10,%r11 2041 subq %r11,%rbp 2042.Lfrom_sp_done: 2043 andq $-64,%rbp 2044 movq %rsp,%r11 2045 subq %rbp,%r11 2046 andq $-4096,%r11 2047 leaq (%r11,%rbp,1),%rsp 2048 movq (%rsp),%r10 2049 cmpq %rbp,%rsp 2050 ja .Lfrom_page_walk 2051 jmp .Lfrom_page_walk_done 2052 2053.Lfrom_page_walk: 2054 leaq -4096(%rsp),%rsp 2055 movq (%rsp),%r10 2056 cmpq %rbp,%rsp 2057 ja .Lfrom_page_walk 2058.Lfrom_page_walk_done: 2059 2060 movq %r9,%r10 2061 negq %r9 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 movq %r8,32(%rsp) 2073 movq %rax,40(%rsp) 2074.Lfrom_body: 2075 movq %r9,%r11 2076 leaq 48(%rsp),%rax 2077 pxor %xmm0,%xmm0 2078 jmp .Lmul_by_1 2079 2080.align 32 2081.Lmul_by_1: 2082 movdqu (%rsi),%xmm1 2083 movdqu 16(%rsi),%xmm2 2084 movdqu 32(%rsi),%xmm3 2085 movdqa %xmm0,(%rax,%r9,1) 2086 movdqu 48(%rsi),%xmm4 2087 movdqa %xmm0,16(%rax,%r9,1) 2088.byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00 2089 movdqa %xmm1,(%rax) 2090 movdqa %xmm0,32(%rax,%r9,1) 2091 movdqa %xmm2,16(%rax) 2092 movdqa %xmm0,48(%rax,%r9,1) 2093 movdqa %xmm3,32(%rax) 2094 movdqa %xmm4,48(%rax) 2095 leaq 64(%rax),%rax 2096 subq $64,%r11 2097 jnz .Lmul_by_1 2098 2099.byte 102,72,15,110,207 2100.byte 102,72,15,110,209 2101.byte 0x67 2102 movq %rcx,%rbp 2103.byte 102,73,15,110,218 2104 movl OPENSSL_ia32cap_P+8(%rip),%r11d 2105 andl $0x80108,%r11d 2106 cmpl $0x80108,%r11d 2107 jne .Lfrom_mont_nox 2108 2109 leaq (%rax,%r9,1),%rdi 2110 call __bn_sqrx8x_reduction 2111 call __bn_postx4x_internal 2112 2113 pxor %xmm0,%xmm0 2114 leaq 48(%rsp),%rax 2115 movq 40(%rsp),%rsi 2116 jmp .Lfrom_mont_zero 2117 2118.align 32 2119.Lfrom_mont_nox: 2120 call __bn_sqr8x_reduction 2121 call __bn_post4x_internal 2122 2123 pxor %xmm0,%xmm0 2124 leaq 48(%rsp),%rax 2125 movq 40(%rsp),%rsi 2126 jmp .Lfrom_mont_zero 2127 2128.align 32 
2129.Lfrom_mont_zero: 2130 movdqa %xmm0,0(%rax) 2131 movdqa %xmm0,16(%rax) 2132 movdqa %xmm0,32(%rax) 2133 movdqa %xmm0,48(%rax) 2134 leaq 64(%rax),%rax 2135 subq $32,%r9 2136 jnz .Lfrom_mont_zero 2137 2138 movq $1,%rax 2139 movq -48(%rsi),%r15 2140 movq -40(%rsi),%r14 2141 movq -32(%rsi),%r13 2142 movq -24(%rsi),%r12 2143 movq -16(%rsi),%rbp 2144 movq -8(%rsi),%rbx 2145 leaq (%rsi),%rsp 2146.Lfrom_epilogue: 2147 .byte 0xf3,0xc3 2148.size bn_from_mont8x,.-bn_from_mont8x 2149.type bn_mulx4x_mont_gather5,@function 2150.align 32 2151bn_mulx4x_mont_gather5: 2152 movq %rsp,%rax 2153.Lmulx4x_enter: 2154 pushq %rbx 2155 pushq %rbp 2156 pushq %r12 2157 pushq %r13 2158 pushq %r14 2159 pushq %r15 2160.Lmulx4x_prologue: 2161 2162 shll $3,%r9d 2163 leaq (%r9,%r9,2),%r10 2164 negq %r9 2165 movq (%r8),%r8 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 leaq -320(%rsp,%r9,2),%r11 2177 movq %rsp,%rbp 2178 subq %rdi,%r11 2179 andq $4095,%r11 2180 cmpq %r11,%r10 2181 jb .Lmulx4xsp_alt 2182 subq %r11,%rbp 2183 leaq -320(%rbp,%r9,2),%rbp 2184 jmp .Lmulx4xsp_done 2185 2186.Lmulx4xsp_alt: 2187 leaq 4096-320(,%r9,2),%r10 2188 leaq -320(%rbp,%r9,2),%rbp 2189 subq %r10,%r11 2190 movq $0,%r10 2191 cmovcq %r10,%r11 2192 subq %r11,%rbp 2193.Lmulx4xsp_done: 2194 andq $-64,%rbp 2195 movq %rsp,%r11 2196 subq %rbp,%r11 2197 andq $-4096,%r11 2198 leaq (%r11,%rbp,1),%rsp 2199 movq (%rsp),%r10 2200 cmpq %rbp,%rsp 2201 ja .Lmulx4x_page_walk 2202 jmp .Lmulx4x_page_walk_done 2203 2204.Lmulx4x_page_walk: 2205 leaq -4096(%rsp),%rsp 2206 movq (%rsp),%r10 2207 cmpq %rbp,%rsp 2208 ja .Lmulx4x_page_walk 2209.Lmulx4x_page_walk_done: 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 movq %r8,32(%rsp) 2224 movq %rax,40(%rsp) 2225.Lmulx4x_body: 2226 call mulx4x_internal 2227 2228 movq 40(%rsp),%rsi 2229 movq $1,%rax 2230 2231 movq -48(%rsi),%r15 2232 movq -40(%rsi),%r14 2233 movq -32(%rsi),%r13 2234 movq -24(%rsi),%r12 2235 movq -16(%rsi),%rbp 2236 movq -8(%rsi),%rbx 2237 leaq 
(%rsi),%rsp 2238.Lmulx4x_epilogue: 2239 .byte 0xf3,0xc3 2240.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5 2241 2242.type mulx4x_internal,@function 2243.align 32 2244mulx4x_internal: 2245 movq %r9,8(%rsp) 2246 movq %r9,%r10 2247 negq %r9 2248 shlq $5,%r9 2249 negq %r10 2250 leaq 128(%rdx,%r9,1),%r13 2251 shrq $5+5,%r9 2252 movd 8(%rax),%xmm5 2253 subq $1,%r9 2254 leaq .Linc(%rip),%rax 2255 movq %r13,16+8(%rsp) 2256 movq %r9,24+8(%rsp) 2257 movq %rdi,56+8(%rsp) 2258 movdqa 0(%rax),%xmm0 2259 movdqa 16(%rax),%xmm1 2260 leaq 88-112(%rsp,%r10,1),%r10 2261 leaq 128(%rdx),%rdi 2262 2263 pshufd $0,%xmm5,%xmm5 2264 movdqa %xmm1,%xmm4 2265.byte 0x67 2266 movdqa %xmm1,%xmm2 2267.byte 0x67 2268 paddd %xmm0,%xmm1 2269 pcmpeqd %xmm5,%xmm0 2270 movdqa %xmm4,%xmm3 2271 paddd %xmm1,%xmm2 2272 pcmpeqd %xmm5,%xmm1 2273 movdqa %xmm0,112(%r10) 2274 movdqa %xmm4,%xmm0 2275 2276 paddd %xmm2,%xmm3 2277 pcmpeqd %xmm5,%xmm2 2278 movdqa %xmm1,128(%r10) 2279 movdqa %xmm4,%xmm1 2280 2281 paddd %xmm3,%xmm0 2282 pcmpeqd %xmm5,%xmm3 2283 movdqa %xmm2,144(%r10) 2284 movdqa %xmm4,%xmm2 2285 2286 paddd %xmm0,%xmm1 2287 pcmpeqd %xmm5,%xmm0 2288 movdqa %xmm3,160(%r10) 2289 movdqa %xmm4,%xmm3 2290 paddd %xmm1,%xmm2 2291 pcmpeqd %xmm5,%xmm1 2292 movdqa %xmm0,176(%r10) 2293 movdqa %xmm4,%xmm0 2294 2295 paddd %xmm2,%xmm3 2296 pcmpeqd %xmm5,%xmm2 2297 movdqa %xmm1,192(%r10) 2298 movdqa %xmm4,%xmm1 2299 2300 paddd %xmm3,%xmm0 2301 pcmpeqd %xmm5,%xmm3 2302 movdqa %xmm2,208(%r10) 2303 movdqa %xmm4,%xmm2 2304 2305 paddd %xmm0,%xmm1 2306 pcmpeqd %xmm5,%xmm0 2307 movdqa %xmm3,224(%r10) 2308 movdqa %xmm4,%xmm3 2309 paddd %xmm1,%xmm2 2310 pcmpeqd %xmm5,%xmm1 2311 movdqa %xmm0,240(%r10) 2312 movdqa %xmm4,%xmm0 2313 2314 paddd %xmm2,%xmm3 2315 pcmpeqd %xmm5,%xmm2 2316 movdqa %xmm1,256(%r10) 2317 movdqa %xmm4,%xmm1 2318 2319 paddd %xmm3,%xmm0 2320 pcmpeqd %xmm5,%xmm3 2321 movdqa %xmm2,272(%r10) 2322 movdqa %xmm4,%xmm2 2323 2324 paddd %xmm0,%xmm1 2325 pcmpeqd %xmm5,%xmm0 2326 movdqa %xmm3,288(%r10) 2327 movdqa 
%xmm4,%xmm3 2328.byte 0x67 2329 paddd %xmm1,%xmm2 2330 pcmpeqd %xmm5,%xmm1 2331 movdqa %xmm0,304(%r10) 2332 2333 paddd %xmm2,%xmm3 2334 pcmpeqd %xmm5,%xmm2 2335 movdqa %xmm1,320(%r10) 2336 2337 pcmpeqd %xmm5,%xmm3 2338 movdqa %xmm2,336(%r10) 2339 2340 pand 64(%rdi),%xmm0 2341 pand 80(%rdi),%xmm1 2342 pand 96(%rdi),%xmm2 2343 movdqa %xmm3,352(%r10) 2344 pand 112(%rdi),%xmm3 2345 por %xmm2,%xmm0 2346 por %xmm3,%xmm1 2347 movdqa -128(%rdi),%xmm4 2348 movdqa -112(%rdi),%xmm5 2349 movdqa -96(%rdi),%xmm2 2350 pand 112(%r10),%xmm4 2351 movdqa -80(%rdi),%xmm3 2352 pand 128(%r10),%xmm5 2353 por %xmm4,%xmm0 2354 pand 144(%r10),%xmm2 2355 por %xmm5,%xmm1 2356 pand 160(%r10),%xmm3 2357 por %xmm2,%xmm0 2358 por %xmm3,%xmm1 2359 movdqa -64(%rdi),%xmm4 2360 movdqa -48(%rdi),%xmm5 2361 movdqa -32(%rdi),%xmm2 2362 pand 176(%r10),%xmm4 2363 movdqa -16(%rdi),%xmm3 2364 pand 192(%r10),%xmm5 2365 por %xmm4,%xmm0 2366 pand 208(%r10),%xmm2 2367 por %xmm5,%xmm1 2368 pand 224(%r10),%xmm3 2369 por %xmm2,%xmm0 2370 por %xmm3,%xmm1 2371 movdqa 0(%rdi),%xmm4 2372 movdqa 16(%rdi),%xmm5 2373 movdqa 32(%rdi),%xmm2 2374 pand 240(%r10),%xmm4 2375 movdqa 48(%rdi),%xmm3 2376 pand 256(%r10),%xmm5 2377 por %xmm4,%xmm0 2378 pand 272(%r10),%xmm2 2379 por %xmm5,%xmm1 2380 pand 288(%r10),%xmm3 2381 por %xmm2,%xmm0 2382 por %xmm3,%xmm1 2383 pxor %xmm1,%xmm0 2384 pshufd $0x4e,%xmm0,%xmm1 2385 por %xmm1,%xmm0 2386 leaq 256(%rdi),%rdi 2387.byte 102,72,15,126,194 2388 leaq 64+32+8(%rsp),%rbx 2389 2390 movq %rdx,%r9 2391 mulxq 0(%rsi),%r8,%rax 2392 mulxq 8(%rsi),%r11,%r12 2393 addq %rax,%r11 2394 mulxq 16(%rsi),%rax,%r13 2395 adcq %rax,%r12 2396 adcq $0,%r13 2397 mulxq 24(%rsi),%rax,%r14 2398 2399 movq %r8,%r15 2400 imulq 32+8(%rsp),%r8 2401 xorq %rbp,%rbp 2402 movq %r8,%rdx 2403 2404 movq %rdi,8+8(%rsp) 2405 2406 leaq 32(%rsi),%rsi 2407 adcxq %rax,%r13 2408 adcxq %rbp,%r14 2409 2410 mulxq 0(%rcx),%rax,%r10 2411 adcxq %rax,%r15 2412 adoxq %r11,%r10 2413 mulxq 8(%rcx),%rax,%r11 2414 adcxq %rax,%r10 2415 adoxq 
%r12,%r11 2416 mulxq 16(%rcx),%rax,%r12 2417 movq 24+8(%rsp),%rdi 2418 movq %r10,-32(%rbx) 2419 adcxq %rax,%r11 2420 adoxq %r13,%r12 2421 mulxq 24(%rcx),%rax,%r15 2422 movq %r9,%rdx 2423 movq %r11,-24(%rbx) 2424 adcxq %rax,%r12 2425 adoxq %rbp,%r15 2426 leaq 32(%rcx),%rcx 2427 movq %r12,-16(%rbx) 2428 jmp .Lmulx4x_1st 2429 2430.align 32 2431.Lmulx4x_1st: 2432 adcxq %rbp,%r15 2433 mulxq 0(%rsi),%r10,%rax 2434 adcxq %r14,%r10 2435 mulxq 8(%rsi),%r11,%r14 2436 adcxq %rax,%r11 2437 mulxq 16(%rsi),%r12,%rax 2438 adcxq %r14,%r12 2439 mulxq 24(%rsi),%r13,%r14 2440.byte 0x67,0x67 2441 movq %r8,%rdx 2442 adcxq %rax,%r13 2443 adcxq %rbp,%r14 2444 leaq 32(%rsi),%rsi 2445 leaq 32(%rbx),%rbx 2446 2447 adoxq %r15,%r10 2448 mulxq 0(%rcx),%rax,%r15 2449 adcxq %rax,%r10 2450 adoxq %r15,%r11 2451 mulxq 8(%rcx),%rax,%r15 2452 adcxq %rax,%r11 2453 adoxq %r15,%r12 2454 mulxq 16(%rcx),%rax,%r15 2455 movq %r10,-40(%rbx) 2456 adcxq %rax,%r12 2457 movq %r11,-32(%rbx) 2458 adoxq %r15,%r13 2459 mulxq 24(%rcx),%rax,%r15 2460 movq %r9,%rdx 2461 movq %r12,-24(%rbx) 2462 adcxq %rax,%r13 2463 adoxq %rbp,%r15 2464 leaq 32(%rcx),%rcx 2465 movq %r13,-16(%rbx) 2466 2467 decq %rdi 2468 jnz .Lmulx4x_1st 2469 2470 movq 8(%rsp),%rax 2471 adcq %rbp,%r15 2472 leaq (%rsi,%rax,1),%rsi 2473 addq %r15,%r14 2474 movq 8+8(%rsp),%rdi 2475 adcq %rbp,%rbp 2476 movq %r14,-8(%rbx) 2477 jmp .Lmulx4x_outer 2478 2479.align 32 2480.Lmulx4x_outer: 2481 leaq 16-256(%rbx),%r10 2482 pxor %xmm4,%xmm4 2483.byte 0x67,0x67 2484 pxor %xmm5,%xmm5 2485 movdqa -128(%rdi),%xmm0 2486 movdqa -112(%rdi),%xmm1 2487 movdqa -96(%rdi),%xmm2 2488 pand 256(%r10),%xmm0 2489 movdqa -80(%rdi),%xmm3 2490 pand 272(%r10),%xmm1 2491 por %xmm0,%xmm4 2492 pand 288(%r10),%xmm2 2493 por %xmm1,%xmm5 2494 pand 304(%r10),%xmm3 2495 por %xmm2,%xmm4 2496 por %xmm3,%xmm5 2497 movdqa -64(%rdi),%xmm0 2498 movdqa -48(%rdi),%xmm1 2499 movdqa -32(%rdi),%xmm2 2500 pand 320(%r10),%xmm0 2501 movdqa -16(%rdi),%xmm3 2502 pand 336(%r10),%xmm1 2503 por %xmm0,%xmm4 2504 
pand 352(%r10),%xmm2 2505 por %xmm1,%xmm5 2506 pand 368(%r10),%xmm3 2507 por %xmm2,%xmm4 2508 por %xmm3,%xmm5 2509 movdqa 0(%rdi),%xmm0 2510 movdqa 16(%rdi),%xmm1 2511 movdqa 32(%rdi),%xmm2 2512 pand 384(%r10),%xmm0 2513 movdqa 48(%rdi),%xmm3 2514 pand 400(%r10),%xmm1 2515 por %xmm0,%xmm4 2516 pand 416(%r10),%xmm2 2517 por %xmm1,%xmm5 2518 pand 432(%r10),%xmm3 2519 por %xmm2,%xmm4 2520 por %xmm3,%xmm5 2521 movdqa 64(%rdi),%xmm0 2522 movdqa 80(%rdi),%xmm1 2523 movdqa 96(%rdi),%xmm2 2524 pand 448(%r10),%xmm0 2525 movdqa 112(%rdi),%xmm3 2526 pand 464(%r10),%xmm1 2527 por %xmm0,%xmm4 2528 pand 480(%r10),%xmm2 2529 por %xmm1,%xmm5 2530 pand 496(%r10),%xmm3 2531 por %xmm2,%xmm4 2532 por %xmm3,%xmm5 2533 por %xmm5,%xmm4 2534 pshufd $0x4e,%xmm4,%xmm0 2535 por %xmm4,%xmm0 2536 leaq 256(%rdi),%rdi 2537.byte 102,72,15,126,194 2538 2539 movq %rbp,(%rbx) 2540 leaq 32(%rbx,%rax,1),%rbx 2541 mulxq 0(%rsi),%r8,%r11 2542 xorq %rbp,%rbp 2543 movq %rdx,%r9 2544 mulxq 8(%rsi),%r14,%r12 2545 adoxq -32(%rbx),%r8 2546 adcxq %r14,%r11 2547 mulxq 16(%rsi),%r15,%r13 2548 adoxq -24(%rbx),%r11 2549 adcxq %r15,%r12 2550 mulxq 24(%rsi),%rdx,%r14 2551 adoxq -16(%rbx),%r12 2552 adcxq %rdx,%r13 2553 leaq (%rcx,%rax,1),%rcx 2554 leaq 32(%rsi),%rsi 2555 adoxq -8(%rbx),%r13 2556 adcxq %rbp,%r14 2557 adoxq %rbp,%r14 2558 2559 movq %r8,%r15 2560 imulq 32+8(%rsp),%r8 2561 2562 movq %r8,%rdx 2563 xorq %rbp,%rbp 2564 movq %rdi,8+8(%rsp) 2565 2566 mulxq 0(%rcx),%rax,%r10 2567 adcxq %rax,%r15 2568 adoxq %r11,%r10 2569 mulxq 8(%rcx),%rax,%r11 2570 adcxq %rax,%r10 2571 adoxq %r12,%r11 2572 mulxq 16(%rcx),%rax,%r12 2573 adcxq %rax,%r11 2574 adoxq %r13,%r12 2575 mulxq 24(%rcx),%rax,%r15 2576 movq %r9,%rdx 2577 movq 24+8(%rsp),%rdi 2578 movq %r10,-32(%rbx) 2579 adcxq %rax,%r12 2580 movq %r11,-24(%rbx) 2581 adoxq %rbp,%r15 2582 movq %r12,-16(%rbx) 2583 leaq 32(%rcx),%rcx 2584 jmp .Lmulx4x_inner 2585 2586.align 32 2587.Lmulx4x_inner: 2588 mulxq 0(%rsi),%r10,%rax 2589 adcxq %rbp,%r15 2590 adoxq %r14,%r10 2591 mulxq 
8(%rsi),%r11,%r14 2592 adcxq 0(%rbx),%r10 2593 adoxq %rax,%r11 2594 mulxq 16(%rsi),%r12,%rax 2595 adcxq 8(%rbx),%r11 2596 adoxq %r14,%r12 2597 mulxq 24(%rsi),%r13,%r14 2598 movq %r8,%rdx 2599 adcxq 16(%rbx),%r12 2600 adoxq %rax,%r13 2601 adcxq 24(%rbx),%r13 2602 adoxq %rbp,%r14 2603 leaq 32(%rsi),%rsi 2604 leaq 32(%rbx),%rbx 2605 adcxq %rbp,%r14 2606 2607 adoxq %r15,%r10 2608 mulxq 0(%rcx),%rax,%r15 2609 adcxq %rax,%r10 2610 adoxq %r15,%r11 2611 mulxq 8(%rcx),%rax,%r15 2612 adcxq %rax,%r11 2613 adoxq %r15,%r12 2614 mulxq 16(%rcx),%rax,%r15 2615 movq %r10,-40(%rbx) 2616 adcxq %rax,%r12 2617 adoxq %r15,%r13 2618 movq %r11,-32(%rbx) 2619 mulxq 24(%rcx),%rax,%r15 2620 movq %r9,%rdx 2621 leaq 32(%rcx),%rcx 2622 movq %r12,-24(%rbx) 2623 adcxq %rax,%r13 2624 adoxq %rbp,%r15 2625 movq %r13,-16(%rbx) 2626 2627 decq %rdi 2628 jnz .Lmulx4x_inner 2629 2630 movq 0+8(%rsp),%rax 2631 adcq %rbp,%r15 2632 subq 0(%rbx),%rdi 2633 movq 8+8(%rsp),%rdi 2634 movq 16+8(%rsp),%r10 2635 adcq %r15,%r14 2636 leaq (%rsi,%rax,1),%rsi 2637 adcq %rbp,%rbp 2638 movq %r14,-8(%rbx) 2639 2640 cmpq %r10,%rdi 2641 jb .Lmulx4x_outer 2642 2643 movq -8(%rcx),%r10 2644 movq %rbp,%r8 2645 movq (%rcx,%rax,1),%r12 2646 leaq (%rcx,%rax,1),%rbp 2647 movq %rax,%rcx 2648 leaq (%rbx,%rax,1),%rdi 2649 xorl %eax,%eax 2650 xorq %r15,%r15 2651 subq %r14,%r10 2652 adcq %r15,%r15 2653 orq %r15,%r8 2654 sarq $3+2,%rcx 2655 subq %r8,%rax 2656 movq 56+8(%rsp),%rdx 2657 decq %r12 2658 movq 8(%rbp),%r13 2659 xorq %r8,%r8 2660 movq 16(%rbp),%r14 2661 movq 24(%rbp),%r15 2662 jmp .Lsqrx4x_sub_entry 2663.size mulx4x_internal,.-mulx4x_internal 2664.type bn_powerx5,@function 2665.align 32 2666bn_powerx5: 2667 movq %rsp,%rax 2668.Lpowerx5_enter: 2669 pushq %rbx 2670 pushq %rbp 2671 pushq %r12 2672 pushq %r13 2673 pushq %r14 2674 pushq %r15 2675.Lpowerx5_prologue: 2676 2677 shll $3,%r9d 2678 leaq (%r9,%r9,2),%r10 2679 negq %r9 2680 movq (%r8),%r8 2681 2682 2683 2684 2685 2686 2687 2688 2689 leaq -320(%rsp,%r9,2),%r11 2690 movq 
%rsp,%rbp 2691 subq %rdi,%r11 2692 andq $4095,%r11 2693 cmpq %r11,%r10 2694 jb .Lpwrx_sp_alt 2695 subq %r11,%rbp 2696 leaq -320(%rbp,%r9,2),%rbp 2697 jmp .Lpwrx_sp_done 2698 2699.align 32 2700.Lpwrx_sp_alt: 2701 leaq 4096-320(,%r9,2),%r10 2702 leaq -320(%rbp,%r9,2),%rbp 2703 subq %r10,%r11 2704 movq $0,%r10 2705 cmovcq %r10,%r11 2706 subq %r11,%rbp 2707.Lpwrx_sp_done: 2708 andq $-64,%rbp 2709 movq %rsp,%r11 2710 subq %rbp,%r11 2711 andq $-4096,%r11 2712 leaq (%r11,%rbp,1),%rsp 2713 movq (%rsp),%r10 2714 cmpq %rbp,%rsp 2715 ja .Lpwrx_page_walk 2716 jmp .Lpwrx_page_walk_done 2717 2718.Lpwrx_page_walk: 2719 leaq -4096(%rsp),%rsp 2720 movq (%rsp),%r10 2721 cmpq %rbp,%rsp 2722 ja .Lpwrx_page_walk 2723.Lpwrx_page_walk_done: 2724 2725 movq %r9,%r10 2726 negq %r9 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 pxor %xmm0,%xmm0 2740.byte 102,72,15,110,207 2741.byte 102,72,15,110,209 2742.byte 102,73,15,110,218 2743.byte 102,72,15,110,226 2744 movq %r8,32(%rsp) 2745 movq %rax,40(%rsp) 2746.Lpowerx5_body: 2747 2748 call __bn_sqrx8x_internal 2749 call __bn_postx4x_internal 2750 call __bn_sqrx8x_internal 2751 call __bn_postx4x_internal 2752 call __bn_sqrx8x_internal 2753 call __bn_postx4x_internal 2754 call __bn_sqrx8x_internal 2755 call __bn_postx4x_internal 2756 call __bn_sqrx8x_internal 2757 call __bn_postx4x_internal 2758 2759 movq %r10,%r9 2760 movq %rsi,%rdi 2761.byte 102,72,15,126,209 2762.byte 102,72,15,126,226 2763 movq 40(%rsp),%rax 2764 2765 call mulx4x_internal 2766 2767 movq 40(%rsp),%rsi 2768 movq $1,%rax 2769 2770 movq -48(%rsi),%r15 2771 movq -40(%rsi),%r14 2772 movq -32(%rsi),%r13 2773 movq -24(%rsi),%r12 2774 movq -16(%rsi),%rbp 2775 movq -8(%rsi),%rbx 2776 leaq (%rsi),%rsp 2777.Lpowerx5_epilogue: 2778 .byte 0xf3,0xc3 2779.size bn_powerx5,.-bn_powerx5 2780 2781.globl bn_sqrx8x_internal 2782.hidden bn_sqrx8x_internal 2783.type bn_sqrx8x_internal,@function 2784.align 32 2785bn_sqrx8x_internal: 2786__bn_sqrx8x_internal: 2787 2788 2789 2790 2791 
2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 leaq 48+8(%rsp),%rdi 2828 leaq (%rsi,%r9,1),%rbp 2829 movq %r9,0+8(%rsp) 2830 movq %rbp,8+8(%rsp) 2831 jmp .Lsqr8x_zero_start 2832 2833.align 32 2834.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 2835.Lsqrx8x_zero: 2836.byte 0x3e 2837 movdqa %xmm0,0(%rdi) 2838 movdqa %xmm0,16(%rdi) 2839 movdqa %xmm0,32(%rdi) 2840 movdqa %xmm0,48(%rdi) 2841.Lsqr8x_zero_start: 2842 movdqa %xmm0,64(%rdi) 2843 movdqa %xmm0,80(%rdi) 2844 movdqa %xmm0,96(%rdi) 2845 movdqa %xmm0,112(%rdi) 2846 leaq 128(%rdi),%rdi 2847 subq $64,%r9 2848 jnz .Lsqrx8x_zero 2849 2850 movq 0(%rsi),%rdx 2851 2852 xorq %r10,%r10 2853 xorq %r11,%r11 2854 xorq %r12,%r12 2855 xorq %r13,%r13 2856 xorq %r14,%r14 2857 xorq %r15,%r15 2858 leaq 48+8(%rsp),%rdi 2859 xorq %rbp,%rbp 2860 jmp .Lsqrx8x_outer_loop 2861 2862.align 32 2863.Lsqrx8x_outer_loop: 2864 mulxq 8(%rsi),%r8,%rax 2865 adcxq %r9,%r8 2866 adoxq %rax,%r10 2867 mulxq 16(%rsi),%r9,%rax 2868 adcxq %r10,%r9 2869 adoxq %rax,%r11 2870.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 2871 adcxq %r11,%r10 2872 adoxq %rax,%r12 2873.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 2874 adcxq %r12,%r11 2875 adoxq %rax,%r13 2876 mulxq 40(%rsi),%r12,%rax 2877 adcxq %r13,%r12 2878 adoxq %rax,%r14 2879 mulxq 48(%rsi),%r13,%rax 2880 adcxq %r14,%r13 2881 adoxq %r15,%rax 2882 mulxq 56(%rsi),%r14,%r15 2883 movq 8(%rsi),%rdx 2884 adcxq %rax,%r14 2885 adoxq %rbp,%r15 2886 adcq 64(%rdi),%r15 2887 movq %r8,8(%rdi) 2888 movq %r9,16(%rdi) 2889 sbbq %rcx,%rcx 2890 xorq %rbp,%rbp 2891 2892 2893 mulxq 16(%rsi),%r8,%rbx 2894 mulxq 24(%rsi),%r9,%rax 2895 adcxq %r10,%r8 2896 adoxq %rbx,%r9 2897 mulxq 32(%rsi),%r10,%rbx 2898 adcxq %r11,%r9 2899 adoxq %rax,%r10 2900.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 2901 adcxq %r12,%r10 2902 adoxq %rbx,%r11 2903.byte 
0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 2904 adcxq %r13,%r11 2905 adoxq %r14,%r12 2906.byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 2907 movq 16(%rsi),%rdx 2908 adcxq %rax,%r12 2909 adoxq %rbx,%r13 2910 adcxq %r15,%r13 2911 adoxq %rbp,%r14 2912 adcxq %rbp,%r14 2913 2914 movq %r8,24(%rdi) 2915 movq %r9,32(%rdi) 2916 2917 mulxq 24(%rsi),%r8,%rbx 2918 mulxq 32(%rsi),%r9,%rax 2919 adcxq %r10,%r8 2920 adoxq %rbx,%r9 2921 mulxq 40(%rsi),%r10,%rbx 2922 adcxq %r11,%r9 2923 adoxq %rax,%r10 2924.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 2925 adcxq %r12,%r10 2926 adoxq %r13,%r11 2927.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 2928.byte 0x3e 2929 movq 24(%rsi),%rdx 2930 adcxq %rbx,%r11 2931 adoxq %rax,%r12 2932 adcxq %r14,%r12 2933 movq %r8,40(%rdi) 2934 movq %r9,48(%rdi) 2935 mulxq 32(%rsi),%r8,%rax 2936 adoxq %rbp,%r13 2937 adcxq %rbp,%r13 2938 2939 mulxq 40(%rsi),%r9,%rbx 2940 adcxq %r10,%r8 2941 adoxq %rax,%r9 2942 mulxq 48(%rsi),%r10,%rax 2943 adcxq %r11,%r9 2944 adoxq %r12,%r10 2945 mulxq 56(%rsi),%r11,%r12 2946 movq 32(%rsi),%rdx 2947 movq 40(%rsi),%r14 2948 adcxq %rbx,%r10 2949 adoxq %rax,%r11 2950 movq 48(%rsi),%r15 2951 adcxq %r13,%r11 2952 adoxq %rbp,%r12 2953 adcxq %rbp,%r12 2954 2955 movq %r8,56(%rdi) 2956 movq %r9,64(%rdi) 2957 2958 mulxq %r14,%r9,%rax 2959 movq 56(%rsi),%r8 2960 adcxq %r10,%r9 2961 mulxq %r15,%r10,%rbx 2962 adoxq %rax,%r10 2963 adcxq %r11,%r10 2964 mulxq %r8,%r11,%rax 2965 movq %r14,%rdx 2966 adoxq %rbx,%r11 2967 adcxq %r12,%r11 2968 2969 adcxq %rbp,%rax 2970 2971 mulxq %r15,%r14,%rbx 2972 mulxq %r8,%r12,%r13 2973 movq %r15,%rdx 2974 leaq 64(%rsi),%rsi 2975 adcxq %r14,%r11 2976 adoxq %rbx,%r12 2977 adcxq %rax,%r12 2978 adoxq %rbp,%r13 2979 2980.byte 0x67,0x67 2981 mulxq %r8,%r8,%r14 2982 adcxq %r8,%r13 2983 adcxq %rbp,%r14 2984 2985 cmpq 8+8(%rsp),%rsi 2986 je .Lsqrx8x_outer_break 2987 2988 negq %rcx 2989 movq $-8,%rcx 2990 movq %rbp,%r15 2991 movq 64(%rdi),%r8 2992 adcxq 72(%rdi),%r9 2993 adcxq 80(%rdi),%r10 2994 adcxq 
88(%rdi),%r11 2995 adcq 96(%rdi),%r12 2996 adcq 104(%rdi),%r13 2997 adcq 112(%rdi),%r14 2998 adcq 120(%rdi),%r15 2999 leaq (%rsi),%rbp 3000 leaq 128(%rdi),%rdi 3001 sbbq %rax,%rax 3002 3003 movq -64(%rsi),%rdx 3004 movq %rax,16+8(%rsp) 3005 movq %rdi,24+8(%rsp) 3006 3007 3008 xorl %eax,%eax 3009 jmp .Lsqrx8x_loop 3010 3011.align 32 3012.Lsqrx8x_loop: 3013 movq %r8,%rbx 3014 mulxq 0(%rbp),%rax,%r8 3015 adcxq %rax,%rbx 3016 adoxq %r9,%r8 3017 3018 mulxq 8(%rbp),%rax,%r9 3019 adcxq %rax,%r8 3020 adoxq %r10,%r9 3021 3022 mulxq 16(%rbp),%rax,%r10 3023 adcxq %rax,%r9 3024 adoxq %r11,%r10 3025 3026 mulxq 24(%rbp),%rax,%r11 3027 adcxq %rax,%r10 3028 adoxq %r12,%r11 3029 3030.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 3031 adcxq %rax,%r11 3032 adoxq %r13,%r12 3033 3034 mulxq 40(%rbp),%rax,%r13 3035 adcxq %rax,%r12 3036 adoxq %r14,%r13 3037 3038 mulxq 48(%rbp),%rax,%r14 3039 movq %rbx,(%rdi,%rcx,8) 3040 movl $0,%ebx 3041 adcxq %rax,%r13 3042 adoxq %r15,%r14 3043 3044.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 3045 movq 8(%rsi,%rcx,8),%rdx 3046 adcxq %rax,%r14 3047 adoxq %rbx,%r15 3048 adcxq %rbx,%r15 3049 3050.byte 0x67 3051 incq %rcx 3052 jnz .Lsqrx8x_loop 3053 3054 leaq 64(%rbp),%rbp 3055 movq $-8,%rcx 3056 cmpq 8+8(%rsp),%rbp 3057 je .Lsqrx8x_break 3058 3059 subq 16+8(%rsp),%rbx 3060.byte 0x66 3061 movq -64(%rsi),%rdx 3062 adcxq 0(%rdi),%r8 3063 adcxq 8(%rdi),%r9 3064 adcq 16(%rdi),%r10 3065 adcq 24(%rdi),%r11 3066 adcq 32(%rdi),%r12 3067 adcq 40(%rdi),%r13 3068 adcq 48(%rdi),%r14 3069 adcq 56(%rdi),%r15 3070 leaq 64(%rdi),%rdi 3071.byte 0x67 3072 sbbq %rax,%rax 3073 xorl %ebx,%ebx 3074 movq %rax,16+8(%rsp) 3075 jmp .Lsqrx8x_loop 3076 3077.align 32 3078.Lsqrx8x_break: 3079 subq 16+8(%rsp),%r8 3080 movq 24+8(%rsp),%rcx 3081 movq 0(%rsi),%rdx 3082 xorl %ebp,%ebp 3083 movq %r8,0(%rdi) 3084 cmpq %rcx,%rdi 3085 je .Lsqrx8x_outer_loop 3086 3087 movq %r9,8(%rdi) 3088 movq 8(%rcx),%r9 3089 movq %r10,16(%rdi) 3090 movq 16(%rcx),%r10 3091 movq %r11,24(%rdi) 3092 movq 
24(%rcx),%r11 3093 movq %r12,32(%rdi) 3094 movq 32(%rcx),%r12 3095 movq %r13,40(%rdi) 3096 movq 40(%rcx),%r13 3097 movq %r14,48(%rdi) 3098 movq 48(%rcx),%r14 3099 movq %r15,56(%rdi) 3100 movq 56(%rcx),%r15 3101 movq %rcx,%rdi 3102 jmp .Lsqrx8x_outer_loop 3103 3104.align 32 3105.Lsqrx8x_outer_break: 3106 movq %r9,72(%rdi) 3107.byte 102,72,15,126,217 3108 movq %r10,80(%rdi) 3109 movq %r11,88(%rdi) 3110 movq %r12,96(%rdi) 3111 movq %r13,104(%rdi) 3112 movq %r14,112(%rdi) 3113 leaq 48+8(%rsp),%rdi 3114 movq (%rsi,%rcx,1),%rdx 3115 3116 movq 8(%rdi),%r11 3117 xorq %r10,%r10 3118 movq 0+8(%rsp),%r9 3119 adoxq %r11,%r11 3120 movq 16(%rdi),%r12 3121 movq 24(%rdi),%r13 3122 3123 3124.align 32 3125.Lsqrx4x_shift_n_add: 3126 mulxq %rdx,%rax,%rbx 3127 adoxq %r12,%r12 3128 adcxq %r10,%rax 3129.byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 3130.byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 3131 adoxq %r13,%r13 3132 adcxq %r11,%rbx 3133 movq 40(%rdi),%r11 3134 movq %rax,0(%rdi) 3135 movq %rbx,8(%rdi) 3136 3137 mulxq %rdx,%rax,%rbx 3138 adoxq %r10,%r10 3139 adcxq %r12,%rax 3140 movq 16(%rsi,%rcx,1),%rdx 3141 movq 48(%rdi),%r12 3142 adoxq %r11,%r11 3143 adcxq %r13,%rbx 3144 movq 56(%rdi),%r13 3145 movq %rax,16(%rdi) 3146 movq %rbx,24(%rdi) 3147 3148 mulxq %rdx,%rax,%rbx 3149 adoxq %r12,%r12 3150 adcxq %r10,%rax 3151 movq 24(%rsi,%rcx,1),%rdx 3152 leaq 32(%rcx),%rcx 3153 movq 64(%rdi),%r10 3154 adoxq %r13,%r13 3155 adcxq %r11,%rbx 3156 movq 72(%rdi),%r11 3157 movq %rax,32(%rdi) 3158 movq %rbx,40(%rdi) 3159 3160 mulxq %rdx,%rax,%rbx 3161 adoxq %r10,%r10 3162 adcxq %r12,%rax 3163 jrcxz .Lsqrx4x_shift_n_add_break 3164.byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 3165 adoxq %r11,%r11 3166 adcxq %r13,%rbx 3167 movq 80(%rdi),%r12 3168 movq 88(%rdi),%r13 3169 movq %rax,48(%rdi) 3170 movq %rbx,56(%rdi) 3171 leaq 64(%rdi),%rdi 3172 nop 3173 jmp .Lsqrx4x_shift_n_add 3174 3175.align 32 3176.Lsqrx4x_shift_n_add_break: 3177 adcxq %r13,%rbx 3178 movq %rax,48(%rdi) 3179 movq %rbx,56(%rdi) 3180 leaq 
64(%rdi),%rdi 3181.byte 102,72,15,126,213 3182__bn_sqrx8x_reduction: 3183 xorl %eax,%eax 3184 movq 32+8(%rsp),%rbx 3185 movq 48+8(%rsp),%rdx 3186 leaq -64(%rbp,%r9,1),%rcx 3187 3188 movq %rcx,0+8(%rsp) 3189 movq %rdi,8+8(%rsp) 3190 3191 leaq 48+8(%rsp),%rdi 3192 jmp .Lsqrx8x_reduction_loop 3193 3194.align 32 3195.Lsqrx8x_reduction_loop: 3196 movq 8(%rdi),%r9 3197 movq 16(%rdi),%r10 3198 movq 24(%rdi),%r11 3199 movq 32(%rdi),%r12 3200 movq %rdx,%r8 3201 imulq %rbx,%rdx 3202 movq 40(%rdi),%r13 3203 movq 48(%rdi),%r14 3204 movq 56(%rdi),%r15 3205 movq %rax,24+8(%rsp) 3206 3207 leaq 64(%rdi),%rdi 3208 xorq %rsi,%rsi 3209 movq $-8,%rcx 3210 jmp .Lsqrx8x_reduce 3211 3212.align 32 3213.Lsqrx8x_reduce: 3214 movq %r8,%rbx 3215 mulxq 0(%rbp),%rax,%r8 3216 adcxq %rbx,%rax 3217 adoxq %r9,%r8 3218 3219 mulxq 8(%rbp),%rbx,%r9 3220 adcxq %rbx,%r8 3221 adoxq %r10,%r9 3222 3223 mulxq 16(%rbp),%rbx,%r10 3224 adcxq %rbx,%r9 3225 adoxq %r11,%r10 3226 3227 mulxq 24(%rbp),%rbx,%r11 3228 adcxq %rbx,%r10 3229 adoxq %r12,%r11 3230 3231.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 3232 movq %rdx,%rax 3233 movq %r8,%rdx 3234 adcxq %rbx,%r11 3235 adoxq %r13,%r12 3236 3237 mulxq 32+8(%rsp),%rbx,%rdx 3238 movq %rax,%rdx 3239 movq %rax,64+48+8(%rsp,%rcx,8) 3240 3241 mulxq 40(%rbp),%rax,%r13 3242 adcxq %rax,%r12 3243 adoxq %r14,%r13 3244 3245 mulxq 48(%rbp),%rax,%r14 3246 adcxq %rax,%r13 3247 adoxq %r15,%r14 3248 3249 mulxq 56(%rbp),%rax,%r15 3250 movq %rbx,%rdx 3251 adcxq %rax,%r14 3252 adoxq %rsi,%r15 3253 adcxq %rsi,%r15 3254 3255.byte 0x67,0x67,0x67 3256 incq %rcx 3257 jnz .Lsqrx8x_reduce 3258 3259 movq %rsi,%rax 3260 cmpq 0+8(%rsp),%rbp 3261 jae .Lsqrx8x_no_tail 3262 3263 movq 48+8(%rsp),%rdx 3264 addq 0(%rdi),%r8 3265 leaq 64(%rbp),%rbp 3266 movq $-8,%rcx 3267 adcxq 8(%rdi),%r9 3268 adcxq 16(%rdi),%r10 3269 adcq 24(%rdi),%r11 3270 adcq 32(%rdi),%r12 3271 adcq 40(%rdi),%r13 3272 adcq 48(%rdi),%r14 3273 adcq 56(%rdi),%r15 3274 leaq 64(%rdi),%rdi 3275 sbbq %rax,%rax 3276 3277 xorq 
%rsi,%rsi 3278 movq %rax,16+8(%rsp) 3279 jmp .Lsqrx8x_tail 3280 3281.align 32 3282.Lsqrx8x_tail: 3283 movq %r8,%rbx 3284 mulxq 0(%rbp),%rax,%r8 3285 adcxq %rax,%rbx 3286 adoxq %r9,%r8 3287 3288 mulxq 8(%rbp),%rax,%r9 3289 adcxq %rax,%r8 3290 adoxq %r10,%r9 3291 3292 mulxq 16(%rbp),%rax,%r10 3293 adcxq %rax,%r9 3294 adoxq %r11,%r10 3295 3296 mulxq 24(%rbp),%rax,%r11 3297 adcxq %rax,%r10 3298 adoxq %r12,%r11 3299 3300.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 3301 adcxq %rax,%r11 3302 adoxq %r13,%r12 3303 3304 mulxq 40(%rbp),%rax,%r13 3305 adcxq %rax,%r12 3306 adoxq %r14,%r13 3307 3308 mulxq 48(%rbp),%rax,%r14 3309 adcxq %rax,%r13 3310 adoxq %r15,%r14 3311 3312 mulxq 56(%rbp),%rax,%r15 3313 movq 72+48+8(%rsp,%rcx,8),%rdx 3314 adcxq %rax,%r14 3315 adoxq %rsi,%r15 3316 movq %rbx,(%rdi,%rcx,8) 3317 movq %r8,%rbx 3318 adcxq %rsi,%r15 3319 3320 incq %rcx 3321 jnz .Lsqrx8x_tail 3322 3323 cmpq 0+8(%rsp),%rbp 3324 jae .Lsqrx8x_tail_done 3325 3326 subq 16+8(%rsp),%rsi 3327 movq 48+8(%rsp),%rdx 3328 leaq 64(%rbp),%rbp 3329 adcq 0(%rdi),%r8 3330 adcq 8(%rdi),%r9 3331 adcq 16(%rdi),%r10 3332 adcq 24(%rdi),%r11 3333 adcq 32(%rdi),%r12 3334 adcq 40(%rdi),%r13 3335 adcq 48(%rdi),%r14 3336 adcq 56(%rdi),%r15 3337 leaq 64(%rdi),%rdi 3338 sbbq %rax,%rax 3339 subq $8,%rcx 3340 3341 xorq %rsi,%rsi 3342 movq %rax,16+8(%rsp) 3343 jmp .Lsqrx8x_tail 3344 3345.align 32 3346.Lsqrx8x_tail_done: 3347 xorq %rax,%rax 3348 addq 24+8(%rsp),%r8 3349 adcq $0,%r9 3350 adcq $0,%r10 3351 adcq $0,%r11 3352 adcq $0,%r12 3353 adcq $0,%r13 3354 adcq $0,%r14 3355 adcq $0,%r15 3356 adcq $0,%rax 3357 3358 subq 16+8(%rsp),%rsi 3359.Lsqrx8x_no_tail: 3360 adcq 0(%rdi),%r8 3361.byte 102,72,15,126,217 3362 adcq 8(%rdi),%r9 3363 movq 56(%rbp),%rsi 3364.byte 102,72,15,126,213 3365 adcq 16(%rdi),%r10 3366 adcq 24(%rdi),%r11 3367 adcq 32(%rdi),%r12 3368 adcq 40(%rdi),%r13 3369 adcq 48(%rdi),%r14 3370 adcq 56(%rdi),%r15 3371 adcq $0,%rax 3372 3373 movq 32+8(%rsp),%rbx 3374 movq 64(%rdi,%rcx,1),%rdx 3375 3376 
	movq	%r8,0(%rdi)		# store the 8 reduced limbs of this group
	leaq	64(%rdi),%r8
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	leaq	64(%rdi,%rcx,1),%rdi
	cmpq	8+8(%rsp),%r8
	jb	.Lsqrx8x_reduction_loop
	.byte	0xf3,0xc3		# rep ret
.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal

# __bn_postx4x_internal
# BMI2 counterpart of __bn_post4x_internal: constant-time conditional
# subtraction of the modulus (%rbp) from the result at %rdi, written to
# %rdx.  Uses ANDN to fuse the NOT+AND masking into one instruction.
# %rax is turned into the 0 / -1 mask; %rcx holds the length in bytes.
.align	32
__bn_postx4x_internal:
	movq	0(%rbp),%r12		# n[0]
	movq	%rcx,%r10
	movq	%rcx,%r9
	negq	%rax			# top carry -> 0 / -1 selection mask
	sarq	$3+2,%rcx		# bytes -> number of 4-limb groups

.byte	102,72,15,126,202		# raw-encoded SSE2 movq %xmm->GPR(%rdx) -- TODO confirm exact xmm operand
.byte	102,72,15,126,206		# raw-encoded SSE2 movq %xmm->GPR(%rsi) -- TODO confirm exact xmm operand
	decq	%r12			# pre-decrement n[0]: ~(n[0]-1) == -n[0], no carry seed needed
	movq	8(%rbp),%r13
	xorq	%r8,%r8			# running borrow, starts clear
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqrx4x_sub_entry

.align	16
.Lsqrx4x_sub:
	movq	0(%rbp),%r12		# load next 4 modulus limbs
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
.Lsqrx4x_sub_entry:
	andnq	%rax,%r12,%r12		# r12 = ~n[i] & mask  (BMI2 ANDN)
	leaq	32(%rbp),%rbp
	andnq	%rax,%r13,%r13
	andnq	%rax,%r14,%r14
	andnq	%rax,%r15,%r15

	negq	%r8			# restore borrow into CF
	adcq	0(%rdi),%r12		# t[i] + (~n[i] & mask) + carry
	adcq	8(%rdi),%r13
	adcq	16(%rdi),%r14
	adcq	24(%rdi),%r15
	movq	%r12,0(%rdx)		# destination is %rdx in this variant
	leaq	32(%rdi),%rdi
	movq	%r13,8(%rdx)
	sbbq	%r8,%r8			# capture borrow for next group
	movq	%r14,16(%rdx)
	movq	%r15,24(%rdx)
	leaq	32(%rdx),%rdx

	incq	%rcx
	jnz	.Lsqrx4x_sub

	negq	%r9			# restore length sign for the caller

	.byte	0xf3,0xc3		# rep ret
.size	__bn_postx4x_internal,.-__bn_postx4x_internal

# bn_get_bits5(const BN_ULONG *a /* %rdi */, int off /* %esi */)
# Returns (in %eax) the 5-bit window starting at bit 'off'.  Loads 16 bits
# at a time; when the bit offset within the 16-bit word exceeds 11 the
# window would straddle the word, so it reads from one byte further along
# and shifts by 8 less -- a single movzwl covers both cases, branch-free.
.globl	bn_get_bits5
.type	bn_get_bits5,@function
.align	16
bn_get_bits5:
	leaq	0(%rdi),%r10		# base for the in-word case
	leaq	1(%rdi),%r11		# base for the straddle case (+1 byte)
	movl	%esi,%ecx
	shrl	$4,%esi			# %esi = 16-bit word index
	andl	$15,%ecx		# %ecx = bit offset within the word
	leal	-8(%rcx),%eax		# shift count for the straddle case
	cmpl	$11,%ecx
	cmovaq	%r11,%r10		# offset > 11: window crosses the word
	cmoval	%eax,%ecx
	movzwl	(%r10,%rsi,2),%eax	# 16 bits containing the whole window
	shrl	%cl,%eax
	andl	$31,%eax		# keep the low 5 bits
	.byte	0xf3,0xc3		# rep ret
.size	bn_get_bits5,.-bn_get_bits5

# bn_scatter5: store %esi limbs from %rdi into column %rcx of the power
# table at %rdx; consecutive limbs of one entry are 256 bytes (32 slots)
# apart, matching the layout bn_gather5 reads back.
.globl	bn_scatter5
.type	bn_scatter5,@function
.align	16
bn_scatter5:
	cmpl	$0,%esi
	jz	.Lscatter_epilogue	# nothing to do for a zero count
	leaq	(%rdx,%rcx,8),%rdx	# &tbl[idx]
.Lscatter:
	movq
(%rdi),%rax 3469 leaq 8(%rdi),%rdi 3470 movq %rax,(%rdx) 3471 leaq 256(%rdx),%rdx 3472 subl $1,%esi 3473 jnz .Lscatter 3474.Lscatter_epilogue: 3475 .byte 0xf3,0xc3 3476.size bn_scatter5,.-bn_scatter5 3477 3478.globl bn_gather5 3479.type bn_gather5,@function 3480.align 32 3481bn_gather5: 3482.LSEH_begin_bn_gather5: 3483 3484.byte 0x4c,0x8d,0x14,0x24 3485.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 3486 leaq .Linc(%rip),%rax 3487 andq $-16,%rsp 3488 3489 movd %ecx,%xmm5 3490 movdqa 0(%rax),%xmm0 3491 movdqa 16(%rax),%xmm1 3492 leaq 128(%rdx),%r11 3493 leaq 128(%rsp),%rax 3494 3495 pshufd $0,%xmm5,%xmm5 3496 movdqa %xmm1,%xmm4 3497 movdqa %xmm1,%xmm2 3498 paddd %xmm0,%xmm1 3499 pcmpeqd %xmm5,%xmm0 3500 movdqa %xmm4,%xmm3 3501 3502 paddd %xmm1,%xmm2 3503 pcmpeqd %xmm5,%xmm1 3504 movdqa %xmm0,-128(%rax) 3505 movdqa %xmm4,%xmm0 3506 3507 paddd %xmm2,%xmm3 3508 pcmpeqd %xmm5,%xmm2 3509 movdqa %xmm1,-112(%rax) 3510 movdqa %xmm4,%xmm1 3511 3512 paddd %xmm3,%xmm0 3513 pcmpeqd %xmm5,%xmm3 3514 movdqa %xmm2,-96(%rax) 3515 movdqa %xmm4,%xmm2 3516 paddd %xmm0,%xmm1 3517 pcmpeqd %xmm5,%xmm0 3518 movdqa %xmm3,-80(%rax) 3519 movdqa %xmm4,%xmm3 3520 3521 paddd %xmm1,%xmm2 3522 pcmpeqd %xmm5,%xmm1 3523 movdqa %xmm0,-64(%rax) 3524 movdqa %xmm4,%xmm0 3525 3526 paddd %xmm2,%xmm3 3527 pcmpeqd %xmm5,%xmm2 3528 movdqa %xmm1,-48(%rax) 3529 movdqa %xmm4,%xmm1 3530 3531 paddd %xmm3,%xmm0 3532 pcmpeqd %xmm5,%xmm3 3533 movdqa %xmm2,-32(%rax) 3534 movdqa %xmm4,%xmm2 3535 paddd %xmm0,%xmm1 3536 pcmpeqd %xmm5,%xmm0 3537 movdqa %xmm3,-16(%rax) 3538 movdqa %xmm4,%xmm3 3539 3540 paddd %xmm1,%xmm2 3541 pcmpeqd %xmm5,%xmm1 3542 movdqa %xmm0,0(%rax) 3543 movdqa %xmm4,%xmm0 3544 3545 paddd %xmm2,%xmm3 3546 pcmpeqd %xmm5,%xmm2 3547 movdqa %xmm1,16(%rax) 3548 movdqa %xmm4,%xmm1 3549 3550 paddd %xmm3,%xmm0 3551 pcmpeqd %xmm5,%xmm3 3552 movdqa %xmm2,32(%rax) 3553 movdqa %xmm4,%xmm2 3554 paddd %xmm0,%xmm1 3555 pcmpeqd %xmm5,%xmm0 3556 movdqa %xmm3,48(%rax) 3557 movdqa %xmm4,%xmm3 3558 3559 paddd %xmm1,%xmm2 3560 
pcmpeqd %xmm5,%xmm1 3561 movdqa %xmm0,64(%rax) 3562 movdqa %xmm4,%xmm0 3563 3564 paddd %xmm2,%xmm3 3565 pcmpeqd %xmm5,%xmm2 3566 movdqa %xmm1,80(%rax) 3567 movdqa %xmm4,%xmm1 3568 3569 paddd %xmm3,%xmm0 3570 pcmpeqd %xmm5,%xmm3 3571 movdqa %xmm2,96(%rax) 3572 movdqa %xmm4,%xmm2 3573 movdqa %xmm3,112(%rax) 3574 jmp .Lgather 3575 3576.align 32 3577.Lgather: 3578 pxor %xmm4,%xmm4 3579 pxor %xmm5,%xmm5 3580 movdqa -128(%r11),%xmm0 3581 movdqa -112(%r11),%xmm1 3582 movdqa -96(%r11),%xmm2 3583 pand -128(%rax),%xmm0 3584 movdqa -80(%r11),%xmm3 3585 pand -112(%rax),%xmm1 3586 por %xmm0,%xmm4 3587 pand -96(%rax),%xmm2 3588 por %xmm1,%xmm5 3589 pand -80(%rax),%xmm3 3590 por %xmm2,%xmm4 3591 por %xmm3,%xmm5 3592 movdqa -64(%r11),%xmm0 3593 movdqa -48(%r11),%xmm1 3594 movdqa -32(%r11),%xmm2 3595 pand -64(%rax),%xmm0 3596 movdqa -16(%r11),%xmm3 3597 pand -48(%rax),%xmm1 3598 por %xmm0,%xmm4 3599 pand -32(%rax),%xmm2 3600 por %xmm1,%xmm5 3601 pand -16(%rax),%xmm3 3602 por %xmm2,%xmm4 3603 por %xmm3,%xmm5 3604 movdqa 0(%r11),%xmm0 3605 movdqa 16(%r11),%xmm1 3606 movdqa 32(%r11),%xmm2 3607 pand 0(%rax),%xmm0 3608 movdqa 48(%r11),%xmm3 3609 pand 16(%rax),%xmm1 3610 por %xmm0,%xmm4 3611 pand 32(%rax),%xmm2 3612 por %xmm1,%xmm5 3613 pand 48(%rax),%xmm3 3614 por %xmm2,%xmm4 3615 por %xmm3,%xmm5 3616 movdqa 64(%r11),%xmm0 3617 movdqa 80(%r11),%xmm1 3618 movdqa 96(%r11),%xmm2 3619 pand 64(%rax),%xmm0 3620 movdqa 112(%r11),%xmm3 3621 pand 80(%rax),%xmm1 3622 por %xmm0,%xmm4 3623 pand 96(%rax),%xmm2 3624 por %xmm1,%xmm5 3625 pand 112(%rax),%xmm3 3626 por %xmm2,%xmm4 3627 por %xmm3,%xmm5 3628 por %xmm5,%xmm4 3629 leaq 256(%r11),%r11 3630 pshufd $0x4e,%xmm4,%xmm0 3631 por %xmm4,%xmm0 3632 movq %xmm0,(%rdi) 3633 leaq 8(%rdi),%rdi 3634 subl $1,%esi 3635 jnz .Lgather 3636 3637 leaq (%r10),%rsp 3638 .byte 0xf3,0xc3 3639.LSEH_end_bn_gather5: 3640.size bn_gather5,.-bn_gather5 3641.align 64 3642.Linc: 3643.long 0,0, 1,1 3644.long 2,2, 2,2 3645.byte 
77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 3646