/* x86_64-mont5.S revision 1.7 -- VCS header; wrapped in a comment so the assembler/preprocessor ignores it */
1#include <machine/asm.h> 2.text 3 4 5 6.globl bn_mul_mont_gather5 7.type bn_mul_mont_gather5,@function 8.align 64 9bn_mul_mont_gather5: 10.cfi_startproc 11 movl %r9d,%r9d 12 movq %rsp,%rax 13.cfi_def_cfa_register %rax 14 testl $7,%r9d 15 jnz .Lmul_enter 16 movl OPENSSL_ia32cap_P+8(%rip),%r11d 17 jmp .Lmul4x_enter 18 19.align 16 20.Lmul_enter: 21 movd 8(%rsp),%xmm5 22 pushq %rbx 23.cfi_offset %rbx,-16 24 pushq %rbp 25.cfi_offset %rbp,-24 26 pushq %r12 27.cfi_offset %r12,-32 28 pushq %r13 29.cfi_offset %r13,-40 30 pushq %r14 31.cfi_offset %r14,-48 32 pushq %r15 33.cfi_offset %r15,-56 34 35 negq %r9 36 movq %rsp,%r11 37 leaq -280(%rsp,%r9,8),%r10 38 negq %r9 39 andq $-1024,%r10 40 41 42 43 44 45 46 47 48 49 subq %r10,%r11 50 andq $-4096,%r11 51 leaq (%r10,%r11,1),%rsp 52 movq (%rsp),%r11 53 cmpq %r10,%rsp 54 ja .Lmul_page_walk 55 jmp .Lmul_page_walk_done 56 57.Lmul_page_walk: 58 leaq -4096(%rsp),%rsp 59 movq (%rsp),%r11 60 cmpq %r10,%rsp 61 ja .Lmul_page_walk 62.Lmul_page_walk_done: 63 64 leaq .Linc(%rip),%r10 65 movq %rax,8(%rsp,%r9,8) 66.cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08 67.Lmul_body: 68 69 leaq 128(%rdx),%r12 70 movdqa 0(%r10),%xmm0 71 movdqa 16(%r10),%xmm1 72 leaq 24-112(%rsp,%r9,8),%r10 73 andq $-16,%r10 74 75 pshufd $0,%xmm5,%xmm5 76 movdqa %xmm1,%xmm4 77 movdqa %xmm1,%xmm2 78 paddd %xmm0,%xmm1 79 pcmpeqd %xmm5,%xmm0 80.byte 0x67 81 movdqa %xmm4,%xmm3 82 paddd %xmm1,%xmm2 83 pcmpeqd %xmm5,%xmm1 84 movdqa %xmm0,112(%r10) 85 movdqa %xmm4,%xmm0 86 87 paddd %xmm2,%xmm3 88 pcmpeqd %xmm5,%xmm2 89 movdqa %xmm1,128(%r10) 90 movdqa %xmm4,%xmm1 91 92 paddd %xmm3,%xmm0 93 pcmpeqd %xmm5,%xmm3 94 movdqa %xmm2,144(%r10) 95 movdqa %xmm4,%xmm2 96 97 paddd %xmm0,%xmm1 98 pcmpeqd %xmm5,%xmm0 99 movdqa %xmm3,160(%r10) 100 movdqa %xmm4,%xmm3 101 paddd %xmm1,%xmm2 102 pcmpeqd %xmm5,%xmm1 103 movdqa %xmm0,176(%r10) 104 movdqa %xmm4,%xmm0 105 106 paddd %xmm2,%xmm3 107 pcmpeqd %xmm5,%xmm2 108 movdqa %xmm1,192(%r10) 109 movdqa %xmm4,%xmm1 110 111 
paddd %xmm3,%xmm0 112 pcmpeqd %xmm5,%xmm3 113 movdqa %xmm2,208(%r10) 114 movdqa %xmm4,%xmm2 115 116 paddd %xmm0,%xmm1 117 pcmpeqd %xmm5,%xmm0 118 movdqa %xmm3,224(%r10) 119 movdqa %xmm4,%xmm3 120 paddd %xmm1,%xmm2 121 pcmpeqd %xmm5,%xmm1 122 movdqa %xmm0,240(%r10) 123 movdqa %xmm4,%xmm0 124 125 paddd %xmm2,%xmm3 126 pcmpeqd %xmm5,%xmm2 127 movdqa %xmm1,256(%r10) 128 movdqa %xmm4,%xmm1 129 130 paddd %xmm3,%xmm0 131 pcmpeqd %xmm5,%xmm3 132 movdqa %xmm2,272(%r10) 133 movdqa %xmm4,%xmm2 134 135 paddd %xmm0,%xmm1 136 pcmpeqd %xmm5,%xmm0 137 movdqa %xmm3,288(%r10) 138 movdqa %xmm4,%xmm3 139 paddd %xmm1,%xmm2 140 pcmpeqd %xmm5,%xmm1 141 movdqa %xmm0,304(%r10) 142 143 paddd %xmm2,%xmm3 144.byte 0x67 145 pcmpeqd %xmm5,%xmm2 146 movdqa %xmm1,320(%r10) 147 148 pcmpeqd %xmm5,%xmm3 149 movdqa %xmm2,336(%r10) 150 pand 64(%r12),%xmm0 151 152 pand 80(%r12),%xmm1 153 pand 96(%r12),%xmm2 154 movdqa %xmm3,352(%r10) 155 pand 112(%r12),%xmm3 156 por %xmm2,%xmm0 157 por %xmm3,%xmm1 158 movdqa -128(%r12),%xmm4 159 movdqa -112(%r12),%xmm5 160 movdqa -96(%r12),%xmm2 161 pand 112(%r10),%xmm4 162 movdqa -80(%r12),%xmm3 163 pand 128(%r10),%xmm5 164 por %xmm4,%xmm0 165 pand 144(%r10),%xmm2 166 por %xmm5,%xmm1 167 pand 160(%r10),%xmm3 168 por %xmm2,%xmm0 169 por %xmm3,%xmm1 170 movdqa -64(%r12),%xmm4 171 movdqa -48(%r12),%xmm5 172 movdqa -32(%r12),%xmm2 173 pand 176(%r10),%xmm4 174 movdqa -16(%r12),%xmm3 175 pand 192(%r10),%xmm5 176 por %xmm4,%xmm0 177 pand 208(%r10),%xmm2 178 por %xmm5,%xmm1 179 pand 224(%r10),%xmm3 180 por %xmm2,%xmm0 181 por %xmm3,%xmm1 182 movdqa 0(%r12),%xmm4 183 movdqa 16(%r12),%xmm5 184 movdqa 32(%r12),%xmm2 185 pand 240(%r10),%xmm4 186 movdqa 48(%r12),%xmm3 187 pand 256(%r10),%xmm5 188 por %xmm4,%xmm0 189 pand 272(%r10),%xmm2 190 por %xmm5,%xmm1 191 pand 288(%r10),%xmm3 192 por %xmm2,%xmm0 193 por %xmm3,%xmm1 194 por %xmm1,%xmm0 195 pshufd $0x4e,%xmm0,%xmm1 196 por %xmm1,%xmm0 197 leaq 256(%r12),%r12 198.byte 102,72,15,126,195 199 200 movq (%r8),%r8 201 movq (%rsi),%rax 
202 203 xorq %r14,%r14 204 xorq %r15,%r15 205 206 movq %r8,%rbp 207 mulq %rbx 208 movq %rax,%r10 209 movq (%rcx),%rax 210 211 imulq %r10,%rbp 212 movq %rdx,%r11 213 214 mulq %rbp 215 addq %rax,%r10 216 movq 8(%rsi),%rax 217 adcq $0,%rdx 218 movq %rdx,%r13 219 220 leaq 1(%r15),%r15 221 jmp .L1st_enter 222 223.align 16 224.L1st: 225 addq %rax,%r13 226 movq (%rsi,%r15,8),%rax 227 adcq $0,%rdx 228 addq %r11,%r13 229 movq %r10,%r11 230 adcq $0,%rdx 231 movq %r13,-16(%rsp,%r15,8) 232 movq %rdx,%r13 233 234.L1st_enter: 235 mulq %rbx 236 addq %rax,%r11 237 movq (%rcx,%r15,8),%rax 238 adcq $0,%rdx 239 leaq 1(%r15),%r15 240 movq %rdx,%r10 241 242 mulq %rbp 243 cmpq %r9,%r15 244 jne .L1st 245 246 247 addq %rax,%r13 248 adcq $0,%rdx 249 addq %r11,%r13 250 adcq $0,%rdx 251 movq %r13,-16(%rsp,%r9,8) 252 movq %rdx,%r13 253 movq %r10,%r11 254 255 xorq %rdx,%rdx 256 addq %r11,%r13 257 adcq $0,%rdx 258 movq %r13,-8(%rsp,%r9,8) 259 movq %rdx,(%rsp,%r9,8) 260 261 leaq 1(%r14),%r14 262 jmp .Louter 263.align 16 264.Louter: 265 leaq 24+128(%rsp,%r9,8),%rdx 266 andq $-16,%rdx 267 pxor %xmm4,%xmm4 268 pxor %xmm5,%xmm5 269 movdqa -128(%r12),%xmm0 270 movdqa -112(%r12),%xmm1 271 movdqa -96(%r12),%xmm2 272 movdqa -80(%r12),%xmm3 273 pand -128(%rdx),%xmm0 274 pand -112(%rdx),%xmm1 275 por %xmm0,%xmm4 276 pand -96(%rdx),%xmm2 277 por %xmm1,%xmm5 278 pand -80(%rdx),%xmm3 279 por %xmm2,%xmm4 280 por %xmm3,%xmm5 281 movdqa -64(%r12),%xmm0 282 movdqa -48(%r12),%xmm1 283 movdqa -32(%r12),%xmm2 284 movdqa -16(%r12),%xmm3 285 pand -64(%rdx),%xmm0 286 pand -48(%rdx),%xmm1 287 por %xmm0,%xmm4 288 pand -32(%rdx),%xmm2 289 por %xmm1,%xmm5 290 pand -16(%rdx),%xmm3 291 por %xmm2,%xmm4 292 por %xmm3,%xmm5 293 movdqa 0(%r12),%xmm0 294 movdqa 16(%r12),%xmm1 295 movdqa 32(%r12),%xmm2 296 movdqa 48(%r12),%xmm3 297 pand 0(%rdx),%xmm0 298 pand 16(%rdx),%xmm1 299 por %xmm0,%xmm4 300 pand 32(%rdx),%xmm2 301 por %xmm1,%xmm5 302 pand 48(%rdx),%xmm3 303 por %xmm2,%xmm4 304 por %xmm3,%xmm5 305 movdqa 64(%r12),%xmm0 306 
movdqa 80(%r12),%xmm1 307 movdqa 96(%r12),%xmm2 308 movdqa 112(%r12),%xmm3 309 pand 64(%rdx),%xmm0 310 pand 80(%rdx),%xmm1 311 por %xmm0,%xmm4 312 pand 96(%rdx),%xmm2 313 por %xmm1,%xmm5 314 pand 112(%rdx),%xmm3 315 por %xmm2,%xmm4 316 por %xmm3,%xmm5 317 por %xmm5,%xmm4 318 pshufd $0x4e,%xmm4,%xmm0 319 por %xmm4,%xmm0 320 leaq 256(%r12),%r12 321 322 movq (%rsi),%rax 323.byte 102,72,15,126,195 324 325 xorq %r15,%r15 326 movq %r8,%rbp 327 movq (%rsp),%r10 328 329 mulq %rbx 330 addq %rax,%r10 331 movq (%rcx),%rax 332 adcq $0,%rdx 333 334 imulq %r10,%rbp 335 movq %rdx,%r11 336 337 mulq %rbp 338 addq %rax,%r10 339 movq 8(%rsi),%rax 340 adcq $0,%rdx 341 movq 8(%rsp),%r10 342 movq %rdx,%r13 343 344 leaq 1(%r15),%r15 345 jmp .Linner_enter 346 347.align 16 348.Linner: 349 addq %rax,%r13 350 movq (%rsi,%r15,8),%rax 351 adcq $0,%rdx 352 addq %r10,%r13 353 movq (%rsp,%r15,8),%r10 354 adcq $0,%rdx 355 movq %r13,-16(%rsp,%r15,8) 356 movq %rdx,%r13 357 358.Linner_enter: 359 mulq %rbx 360 addq %rax,%r11 361 movq (%rcx,%r15,8),%rax 362 adcq $0,%rdx 363 addq %r11,%r10 364 movq %rdx,%r11 365 adcq $0,%r11 366 leaq 1(%r15),%r15 367 368 mulq %rbp 369 cmpq %r9,%r15 370 jne .Linner 371 372 addq %rax,%r13 373 adcq $0,%rdx 374 addq %r10,%r13 375 movq (%rsp,%r9,8),%r10 376 adcq $0,%rdx 377 movq %r13,-16(%rsp,%r9,8) 378 movq %rdx,%r13 379 380 xorq %rdx,%rdx 381 addq %r11,%r13 382 adcq $0,%rdx 383 addq %r10,%r13 384 adcq $0,%rdx 385 movq %r13,-8(%rsp,%r9,8) 386 movq %rdx,(%rsp,%r9,8) 387 388 leaq 1(%r14),%r14 389 cmpq %r9,%r14 390 jb .Louter 391 392 xorq %r14,%r14 393 movq (%rsp),%rax 394 leaq (%rsp),%rsi 395 movq %r9,%r15 396 jmp .Lsub 397.align 16 398.Lsub: sbbq (%rcx,%r14,8),%rax 399 movq %rax,(%rdi,%r14,8) 400 movq 8(%rsi,%r14,8),%rax 401 leaq 1(%r14),%r14 402 decq %r15 403 jnz .Lsub 404 405 sbbq $0,%rax 406 movq $-1,%rbx 407 xorq %rax,%rbx 408 xorq %r14,%r14 409 movq %r9,%r15 410 411.Lcopy: 412 movq (%rdi,%r14,8),%rcx 413 movq (%rsp,%r14,8),%rdx 414 andq %rbx,%rcx 415 andq %rax,%rdx 416 
movq %r14,(%rsp,%r14,8) 417 orq %rcx,%rdx 418 movq %rdx,(%rdi,%r14,8) 419 leaq 1(%r14),%r14 420 subq $1,%r15 421 jnz .Lcopy 422 423 movq 8(%rsp,%r9,8),%rsi 424.cfi_def_cfa %rsi,8 425 movq $1,%rax 426 427 movq -48(%rsi),%r15 428.cfi_restore %r15 429 movq -40(%rsi),%r14 430.cfi_restore %r14 431 movq -32(%rsi),%r13 432.cfi_restore %r13 433 movq -24(%rsi),%r12 434.cfi_restore %r12 435 movq -16(%rsi),%rbp 436.cfi_restore %rbp 437 movq -8(%rsi),%rbx 438.cfi_restore %rbx 439 leaq (%rsi),%rsp 440.cfi_def_cfa_register %rsp 441.Lmul_epilogue: 442 .byte 0xf3,0xc3 443.cfi_endproc 444.size bn_mul_mont_gather5,.-bn_mul_mont_gather5 445.type bn_mul4x_mont_gather5,@function 446.align 32 447bn_mul4x_mont_gather5: 448.cfi_startproc 449.byte 0x67 450 movq %rsp,%rax 451.cfi_def_cfa_register %rax 452.Lmul4x_enter: 453 andl $0x80108,%r11d 454 cmpl $0x80108,%r11d 455 je .Lmulx4x_enter 456 pushq %rbx 457.cfi_offset %rbx,-16 458 pushq %rbp 459.cfi_offset %rbp,-24 460 pushq %r12 461.cfi_offset %r12,-32 462 pushq %r13 463.cfi_offset %r13,-40 464 pushq %r14 465.cfi_offset %r14,-48 466 pushq %r15 467.cfi_offset %r15,-56 468.Lmul4x_prologue: 469 470.byte 0x67 471 shll $3,%r9d 472 leaq (%r9,%r9,2),%r10 473 negq %r9 474 475 476 477 478 479 480 481 482 483 484 leaq -320(%rsp,%r9,2),%r11 485 movq %rsp,%rbp 486 subq %rdi,%r11 487 andq $4095,%r11 488 cmpq %r11,%r10 489 jb .Lmul4xsp_alt 490 subq %r11,%rbp 491 leaq -320(%rbp,%r9,2),%rbp 492 jmp .Lmul4xsp_done 493 494.align 32 495.Lmul4xsp_alt: 496 leaq 4096-320(,%r9,2),%r10 497 leaq -320(%rbp,%r9,2),%rbp 498 subq %r10,%r11 499 movq $0,%r10 500 cmovcq %r10,%r11 501 subq %r11,%rbp 502.Lmul4xsp_done: 503 andq $-64,%rbp 504 movq %rsp,%r11 505 subq %rbp,%r11 506 andq $-4096,%r11 507 leaq (%r11,%rbp,1),%rsp 508 movq (%rsp),%r10 509 cmpq %rbp,%rsp 510 ja .Lmul4x_page_walk 511 jmp .Lmul4x_page_walk_done 512 513.Lmul4x_page_walk: 514 leaq -4096(%rsp),%rsp 515 movq (%rsp),%r10 516 cmpq %rbp,%rsp 517 ja .Lmul4x_page_walk 518.Lmul4x_page_walk_done: 519 520 negq 
%r9 521 522 movq %rax,40(%rsp) 523.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 524.Lmul4x_body: 525 526 call mul4x_internal 527 528 movq 40(%rsp),%rsi 529.cfi_def_cfa %rsi,8 530 movq $1,%rax 531 532 movq -48(%rsi),%r15 533.cfi_restore %r15 534 movq -40(%rsi),%r14 535.cfi_restore %r14 536 movq -32(%rsi),%r13 537.cfi_restore %r13 538 movq -24(%rsi),%r12 539.cfi_restore %r12 540 movq -16(%rsi),%rbp 541.cfi_restore %rbp 542 movq -8(%rsi),%rbx 543.cfi_restore %rbx 544 leaq (%rsi),%rsp 545.cfi_def_cfa_register %rsp 546.Lmul4x_epilogue: 547 .byte 0xf3,0xc3 548.cfi_endproc 549.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 550 551.type mul4x_internal,@function 552.align 32 553mul4x_internal: 554 shlq $5,%r9 555 movd 8(%rax),%xmm5 556 leaq .Linc(%rip),%rax 557 leaq 128(%rdx,%r9,1),%r13 558 shrq $5,%r9 559 movdqa 0(%rax),%xmm0 560 movdqa 16(%rax),%xmm1 561 leaq 88-112(%rsp,%r9,1),%r10 562 leaq 128(%rdx),%r12 563 564 pshufd $0,%xmm5,%xmm5 565 movdqa %xmm1,%xmm4 566.byte 0x67,0x67 567 movdqa %xmm1,%xmm2 568 paddd %xmm0,%xmm1 569 pcmpeqd %xmm5,%xmm0 570.byte 0x67 571 movdqa %xmm4,%xmm3 572 paddd %xmm1,%xmm2 573 pcmpeqd %xmm5,%xmm1 574 movdqa %xmm0,112(%r10) 575 movdqa %xmm4,%xmm0 576 577 paddd %xmm2,%xmm3 578 pcmpeqd %xmm5,%xmm2 579 movdqa %xmm1,128(%r10) 580 movdqa %xmm4,%xmm1 581 582 paddd %xmm3,%xmm0 583 pcmpeqd %xmm5,%xmm3 584 movdqa %xmm2,144(%r10) 585 movdqa %xmm4,%xmm2 586 587 paddd %xmm0,%xmm1 588 pcmpeqd %xmm5,%xmm0 589 movdqa %xmm3,160(%r10) 590 movdqa %xmm4,%xmm3 591 paddd %xmm1,%xmm2 592 pcmpeqd %xmm5,%xmm1 593 movdqa %xmm0,176(%r10) 594 movdqa %xmm4,%xmm0 595 596 paddd %xmm2,%xmm3 597 pcmpeqd %xmm5,%xmm2 598 movdqa %xmm1,192(%r10) 599 movdqa %xmm4,%xmm1 600 601 paddd %xmm3,%xmm0 602 pcmpeqd %xmm5,%xmm3 603 movdqa %xmm2,208(%r10) 604 movdqa %xmm4,%xmm2 605 606 paddd %xmm0,%xmm1 607 pcmpeqd %xmm5,%xmm0 608 movdqa %xmm3,224(%r10) 609 movdqa %xmm4,%xmm3 610 paddd %xmm1,%xmm2 611 pcmpeqd %xmm5,%xmm1 612 movdqa %xmm0,240(%r10) 613 movdqa %xmm4,%xmm0 614 615 paddd 
%xmm2,%xmm3 616 pcmpeqd %xmm5,%xmm2 617 movdqa %xmm1,256(%r10) 618 movdqa %xmm4,%xmm1 619 620 paddd %xmm3,%xmm0 621 pcmpeqd %xmm5,%xmm3 622 movdqa %xmm2,272(%r10) 623 movdqa %xmm4,%xmm2 624 625 paddd %xmm0,%xmm1 626 pcmpeqd %xmm5,%xmm0 627 movdqa %xmm3,288(%r10) 628 movdqa %xmm4,%xmm3 629 paddd %xmm1,%xmm2 630 pcmpeqd %xmm5,%xmm1 631 movdqa %xmm0,304(%r10) 632 633 paddd %xmm2,%xmm3 634.byte 0x67 635 pcmpeqd %xmm5,%xmm2 636 movdqa %xmm1,320(%r10) 637 638 pcmpeqd %xmm5,%xmm3 639 movdqa %xmm2,336(%r10) 640 pand 64(%r12),%xmm0 641 642 pand 80(%r12),%xmm1 643 pand 96(%r12),%xmm2 644 movdqa %xmm3,352(%r10) 645 pand 112(%r12),%xmm3 646 por %xmm2,%xmm0 647 por %xmm3,%xmm1 648 movdqa -128(%r12),%xmm4 649 movdqa -112(%r12),%xmm5 650 movdqa -96(%r12),%xmm2 651 pand 112(%r10),%xmm4 652 movdqa -80(%r12),%xmm3 653 pand 128(%r10),%xmm5 654 por %xmm4,%xmm0 655 pand 144(%r10),%xmm2 656 por %xmm5,%xmm1 657 pand 160(%r10),%xmm3 658 por %xmm2,%xmm0 659 por %xmm3,%xmm1 660 movdqa -64(%r12),%xmm4 661 movdqa -48(%r12),%xmm5 662 movdqa -32(%r12),%xmm2 663 pand 176(%r10),%xmm4 664 movdqa -16(%r12),%xmm3 665 pand 192(%r10),%xmm5 666 por %xmm4,%xmm0 667 pand 208(%r10),%xmm2 668 por %xmm5,%xmm1 669 pand 224(%r10),%xmm3 670 por %xmm2,%xmm0 671 por %xmm3,%xmm1 672 movdqa 0(%r12),%xmm4 673 movdqa 16(%r12),%xmm5 674 movdqa 32(%r12),%xmm2 675 pand 240(%r10),%xmm4 676 movdqa 48(%r12),%xmm3 677 pand 256(%r10),%xmm5 678 por %xmm4,%xmm0 679 pand 272(%r10),%xmm2 680 por %xmm5,%xmm1 681 pand 288(%r10),%xmm3 682 por %xmm2,%xmm0 683 por %xmm3,%xmm1 684 por %xmm1,%xmm0 685 pshufd $0x4e,%xmm0,%xmm1 686 por %xmm1,%xmm0 687 leaq 256(%r12),%r12 688.byte 102,72,15,126,195 689 690 movq %r13,16+8(%rsp) 691 movq %rdi,56+8(%rsp) 692 693 movq (%r8),%r8 694 movq (%rsi),%rax 695 leaq (%rsi,%r9,1),%rsi 696 negq %r9 697 698 movq %r8,%rbp 699 mulq %rbx 700 movq %rax,%r10 701 movq (%rcx),%rax 702 703 imulq %r10,%rbp 704 leaq 64+8(%rsp),%r14 705 movq %rdx,%r11 706 707 mulq %rbp 708 addq %rax,%r10 709 movq 
8(%rsi,%r9,1),%rax 710 adcq $0,%rdx 711 movq %rdx,%rdi 712 713 mulq %rbx 714 addq %rax,%r11 715 movq 8(%rcx),%rax 716 adcq $0,%rdx 717 movq %rdx,%r10 718 719 mulq %rbp 720 addq %rax,%rdi 721 movq 16(%rsi,%r9,1),%rax 722 adcq $0,%rdx 723 addq %r11,%rdi 724 leaq 32(%r9),%r15 725 leaq 32(%rcx),%rcx 726 adcq $0,%rdx 727 movq %rdi,(%r14) 728 movq %rdx,%r13 729 jmp .L1st4x 730 731.align 32 732.L1st4x: 733 mulq %rbx 734 addq %rax,%r10 735 movq -16(%rcx),%rax 736 leaq 32(%r14),%r14 737 adcq $0,%rdx 738 movq %rdx,%r11 739 740 mulq %rbp 741 addq %rax,%r13 742 movq -8(%rsi,%r15,1),%rax 743 adcq $0,%rdx 744 addq %r10,%r13 745 adcq $0,%rdx 746 movq %r13,-24(%r14) 747 movq %rdx,%rdi 748 749 mulq %rbx 750 addq %rax,%r11 751 movq -8(%rcx),%rax 752 adcq $0,%rdx 753 movq %rdx,%r10 754 755 mulq %rbp 756 addq %rax,%rdi 757 movq (%rsi,%r15,1),%rax 758 adcq $0,%rdx 759 addq %r11,%rdi 760 adcq $0,%rdx 761 movq %rdi,-16(%r14) 762 movq %rdx,%r13 763 764 mulq %rbx 765 addq %rax,%r10 766 movq 0(%rcx),%rax 767 adcq $0,%rdx 768 movq %rdx,%r11 769 770 mulq %rbp 771 addq %rax,%r13 772 movq 8(%rsi,%r15,1),%rax 773 adcq $0,%rdx 774 addq %r10,%r13 775 adcq $0,%rdx 776 movq %r13,-8(%r14) 777 movq %rdx,%rdi 778 779 mulq %rbx 780 addq %rax,%r11 781 movq 8(%rcx),%rax 782 adcq $0,%rdx 783 movq %rdx,%r10 784 785 mulq %rbp 786 addq %rax,%rdi 787 movq 16(%rsi,%r15,1),%rax 788 adcq $0,%rdx 789 addq %r11,%rdi 790 leaq 32(%rcx),%rcx 791 adcq $0,%rdx 792 movq %rdi,(%r14) 793 movq %rdx,%r13 794 795 addq $32,%r15 796 jnz .L1st4x 797 798 mulq %rbx 799 addq %rax,%r10 800 movq -16(%rcx),%rax 801 leaq 32(%r14),%r14 802 adcq $0,%rdx 803 movq %rdx,%r11 804 805 mulq %rbp 806 addq %rax,%r13 807 movq -8(%rsi),%rax 808 adcq $0,%rdx 809 addq %r10,%r13 810 adcq $0,%rdx 811 movq %r13,-24(%r14) 812 movq %rdx,%rdi 813 814 mulq %rbx 815 addq %rax,%r11 816 movq -8(%rcx),%rax 817 adcq $0,%rdx 818 movq %rdx,%r10 819 820 mulq %rbp 821 addq %rax,%rdi 822 movq (%rsi,%r9,1),%rax 823 adcq $0,%rdx 824 addq %r11,%rdi 825 adcq $0,%rdx 826 
movq %rdi,-16(%r14) 827 movq %rdx,%r13 828 829 leaq (%rcx,%r9,1),%rcx 830 831 xorq %rdi,%rdi 832 addq %r10,%r13 833 adcq $0,%rdi 834 movq %r13,-8(%r14) 835 836 jmp .Louter4x 837 838.align 32 839.Louter4x: 840 leaq 16+128(%r14),%rdx 841 pxor %xmm4,%xmm4 842 pxor %xmm5,%xmm5 843 movdqa -128(%r12),%xmm0 844 movdqa -112(%r12),%xmm1 845 movdqa -96(%r12),%xmm2 846 movdqa -80(%r12),%xmm3 847 pand -128(%rdx),%xmm0 848 pand -112(%rdx),%xmm1 849 por %xmm0,%xmm4 850 pand -96(%rdx),%xmm2 851 por %xmm1,%xmm5 852 pand -80(%rdx),%xmm3 853 por %xmm2,%xmm4 854 por %xmm3,%xmm5 855 movdqa -64(%r12),%xmm0 856 movdqa -48(%r12),%xmm1 857 movdqa -32(%r12),%xmm2 858 movdqa -16(%r12),%xmm3 859 pand -64(%rdx),%xmm0 860 pand -48(%rdx),%xmm1 861 por %xmm0,%xmm4 862 pand -32(%rdx),%xmm2 863 por %xmm1,%xmm5 864 pand -16(%rdx),%xmm3 865 por %xmm2,%xmm4 866 por %xmm3,%xmm5 867 movdqa 0(%r12),%xmm0 868 movdqa 16(%r12),%xmm1 869 movdqa 32(%r12),%xmm2 870 movdqa 48(%r12),%xmm3 871 pand 0(%rdx),%xmm0 872 pand 16(%rdx),%xmm1 873 por %xmm0,%xmm4 874 pand 32(%rdx),%xmm2 875 por %xmm1,%xmm5 876 pand 48(%rdx),%xmm3 877 por %xmm2,%xmm4 878 por %xmm3,%xmm5 879 movdqa 64(%r12),%xmm0 880 movdqa 80(%r12),%xmm1 881 movdqa 96(%r12),%xmm2 882 movdqa 112(%r12),%xmm3 883 pand 64(%rdx),%xmm0 884 pand 80(%rdx),%xmm1 885 por %xmm0,%xmm4 886 pand 96(%rdx),%xmm2 887 por %xmm1,%xmm5 888 pand 112(%rdx),%xmm3 889 por %xmm2,%xmm4 890 por %xmm3,%xmm5 891 por %xmm5,%xmm4 892 pshufd $0x4e,%xmm4,%xmm0 893 por %xmm4,%xmm0 894 leaq 256(%r12),%r12 895.byte 102,72,15,126,195 896 897 movq (%r14,%r9,1),%r10 898 movq %r8,%rbp 899 mulq %rbx 900 addq %rax,%r10 901 movq (%rcx),%rax 902 adcq $0,%rdx 903 904 imulq %r10,%rbp 905 movq %rdx,%r11 906 movq %rdi,(%r14) 907 908 leaq (%r14,%r9,1),%r14 909 910 mulq %rbp 911 addq %rax,%r10 912 movq 8(%rsi,%r9,1),%rax 913 adcq $0,%rdx 914 movq %rdx,%rdi 915 916 mulq %rbx 917 addq %rax,%r11 918 movq 8(%rcx),%rax 919 adcq $0,%rdx 920 addq 8(%r14),%r11 921 adcq $0,%rdx 922 movq %rdx,%r10 923 924 mulq 
%rbp 925 addq %rax,%rdi 926 movq 16(%rsi,%r9,1),%rax 927 adcq $0,%rdx 928 addq %r11,%rdi 929 leaq 32(%r9),%r15 930 leaq 32(%rcx),%rcx 931 adcq $0,%rdx 932 movq %rdx,%r13 933 jmp .Linner4x 934 935.align 32 936.Linner4x: 937 mulq %rbx 938 addq %rax,%r10 939 movq -16(%rcx),%rax 940 adcq $0,%rdx 941 addq 16(%r14),%r10 942 leaq 32(%r14),%r14 943 adcq $0,%rdx 944 movq %rdx,%r11 945 946 mulq %rbp 947 addq %rax,%r13 948 movq -8(%rsi,%r15,1),%rax 949 adcq $0,%rdx 950 addq %r10,%r13 951 adcq $0,%rdx 952 movq %rdi,-32(%r14) 953 movq %rdx,%rdi 954 955 mulq %rbx 956 addq %rax,%r11 957 movq -8(%rcx),%rax 958 adcq $0,%rdx 959 addq -8(%r14),%r11 960 adcq $0,%rdx 961 movq %rdx,%r10 962 963 mulq %rbp 964 addq %rax,%rdi 965 movq (%rsi,%r15,1),%rax 966 adcq $0,%rdx 967 addq %r11,%rdi 968 adcq $0,%rdx 969 movq %r13,-24(%r14) 970 movq %rdx,%r13 971 972 mulq %rbx 973 addq %rax,%r10 974 movq 0(%rcx),%rax 975 adcq $0,%rdx 976 addq (%r14),%r10 977 adcq $0,%rdx 978 movq %rdx,%r11 979 980 mulq %rbp 981 addq %rax,%r13 982 movq 8(%rsi,%r15,1),%rax 983 adcq $0,%rdx 984 addq %r10,%r13 985 adcq $0,%rdx 986 movq %rdi,-16(%r14) 987 movq %rdx,%rdi 988 989 mulq %rbx 990 addq %rax,%r11 991 movq 8(%rcx),%rax 992 adcq $0,%rdx 993 addq 8(%r14),%r11 994 adcq $0,%rdx 995 movq %rdx,%r10 996 997 mulq %rbp 998 addq %rax,%rdi 999 movq 16(%rsi,%r15,1),%rax 1000 adcq $0,%rdx 1001 addq %r11,%rdi 1002 leaq 32(%rcx),%rcx 1003 adcq $0,%rdx 1004 movq %r13,-8(%r14) 1005 movq %rdx,%r13 1006 1007 addq $32,%r15 1008 jnz .Linner4x 1009 1010 mulq %rbx 1011 addq %rax,%r10 1012 movq -16(%rcx),%rax 1013 adcq $0,%rdx 1014 addq 16(%r14),%r10 1015 leaq 32(%r14),%r14 1016 adcq $0,%rdx 1017 movq %rdx,%r11 1018 1019 mulq %rbp 1020 addq %rax,%r13 1021 movq -8(%rsi),%rax 1022 adcq $0,%rdx 1023 addq %r10,%r13 1024 adcq $0,%rdx 1025 movq %rdi,-32(%r14) 1026 movq %rdx,%rdi 1027 1028 mulq %rbx 1029 addq %rax,%r11 1030 movq %rbp,%rax 1031 movq -8(%rcx),%rbp 1032 adcq $0,%rdx 1033 addq -8(%r14),%r11 1034 adcq $0,%rdx 1035 movq %rdx,%r10 
1036 1037 mulq %rbp 1038 addq %rax,%rdi 1039 movq (%rsi,%r9,1),%rax 1040 adcq $0,%rdx 1041 addq %r11,%rdi 1042 adcq $0,%rdx 1043 movq %r13,-24(%r14) 1044 movq %rdx,%r13 1045 1046 movq %rdi,-16(%r14) 1047 leaq (%rcx,%r9,1),%rcx 1048 1049 xorq %rdi,%rdi 1050 addq %r10,%r13 1051 adcq $0,%rdi 1052 addq (%r14),%r13 1053 adcq $0,%rdi 1054 movq %r13,-8(%r14) 1055 1056 cmpq 16+8(%rsp),%r12 1057 jb .Louter4x 1058 xorq %rax,%rax 1059 subq %r13,%rbp 1060 adcq %r15,%r15 1061 orq %r15,%rdi 1062 subq %rdi,%rax 1063 leaq (%r14,%r9,1),%rbx 1064 movq (%rcx),%r12 1065 leaq (%rcx),%rbp 1066 movq %r9,%rcx 1067 sarq $3+2,%rcx 1068 movq 56+8(%rsp),%rdi 1069 decq %r12 1070 xorq %r10,%r10 1071 movq 8(%rbp),%r13 1072 movq 16(%rbp),%r14 1073 movq 24(%rbp),%r15 1074 jmp .Lsqr4x_sub_entry 1075.size mul4x_internal,.-mul4x_internal 1076.globl bn_power5 1077.type bn_power5,@function 1078.align 32 1079bn_power5: 1080.cfi_startproc 1081 movq %rsp,%rax 1082.cfi_def_cfa_register %rax 1083 movl OPENSSL_ia32cap_P+8(%rip),%r11d 1084 andl $0x80108,%r11d 1085 cmpl $0x80108,%r11d 1086 je .Lpowerx5_enter 1087 pushq %rbx 1088.cfi_offset %rbx,-16 1089 pushq %rbp 1090.cfi_offset %rbp,-24 1091 pushq %r12 1092.cfi_offset %r12,-32 1093 pushq %r13 1094.cfi_offset %r13,-40 1095 pushq %r14 1096.cfi_offset %r14,-48 1097 pushq %r15 1098.cfi_offset %r15,-56 1099.Lpower5_prologue: 1100 1101 shll $3,%r9d 1102 leal (%r9,%r9,2),%r10d 1103 negq %r9 1104 movq (%r8),%r8 1105 1106 1107 1108 1109 1110 1111 1112 1113 leaq -320(%rsp,%r9,2),%r11 1114 movq %rsp,%rbp 1115 subq %rdi,%r11 1116 andq $4095,%r11 1117 cmpq %r11,%r10 1118 jb .Lpwr_sp_alt 1119 subq %r11,%rbp 1120 leaq -320(%rbp,%r9,2),%rbp 1121 jmp .Lpwr_sp_done 1122 1123.align 32 1124.Lpwr_sp_alt: 1125 leaq 4096-320(,%r9,2),%r10 1126 leaq -320(%rbp,%r9,2),%rbp 1127 subq %r10,%r11 1128 movq $0,%r10 1129 cmovcq %r10,%r11 1130 subq %r11,%rbp 1131.Lpwr_sp_done: 1132 andq $-64,%rbp 1133 movq %rsp,%r11 1134 subq %rbp,%r11 1135 andq $-4096,%r11 1136 leaq (%r11,%rbp,1),%rsp 1137 
movq (%rsp),%r10 1138 cmpq %rbp,%rsp 1139 ja .Lpwr_page_walk 1140 jmp .Lpwr_page_walk_done 1141 1142.Lpwr_page_walk: 1143 leaq -4096(%rsp),%rsp 1144 movq (%rsp),%r10 1145 cmpq %rbp,%rsp 1146 ja .Lpwr_page_walk 1147.Lpwr_page_walk_done: 1148 1149 movq %r9,%r10 1150 negq %r9 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 movq %r8,32(%rsp) 1162 movq %rax,40(%rsp) 1163.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 1164.Lpower5_body: 1165.byte 102,72,15,110,207 1166.byte 102,72,15,110,209 1167.byte 102,73,15,110,218 1168.byte 102,72,15,110,226 1169 1170 call __bn_sqr8x_internal 1171 call __bn_post4x_internal 1172 call __bn_sqr8x_internal 1173 call __bn_post4x_internal 1174 call __bn_sqr8x_internal 1175 call __bn_post4x_internal 1176 call __bn_sqr8x_internal 1177 call __bn_post4x_internal 1178 call __bn_sqr8x_internal 1179 call __bn_post4x_internal 1180 1181.byte 102,72,15,126,209 1182.byte 102,72,15,126,226 1183 movq %rsi,%rdi 1184 movq 40(%rsp),%rax 1185 leaq 32(%rsp),%r8 1186 1187 call mul4x_internal 1188 1189 movq 40(%rsp),%rsi 1190.cfi_def_cfa %rsi,8 1191 movq $1,%rax 1192 movq -48(%rsi),%r15 1193.cfi_restore %r15 1194 movq -40(%rsi),%r14 1195.cfi_restore %r14 1196 movq -32(%rsi),%r13 1197.cfi_restore %r13 1198 movq -24(%rsi),%r12 1199.cfi_restore %r12 1200 movq -16(%rsi),%rbp 1201.cfi_restore %rbp 1202 movq -8(%rsi),%rbx 1203.cfi_restore %rbx 1204 leaq (%rsi),%rsp 1205.cfi_def_cfa_register %rsp 1206.Lpower5_epilogue: 1207 .byte 0xf3,0xc3 1208.cfi_endproc 1209.size bn_power5,.-bn_power5 1210 1211.globl bn_sqr8x_internal 1212.hidden bn_sqr8x_internal 1213.type bn_sqr8x_internal,@function 1214.align 32 1215bn_sqr8x_internal: 1216__bn_sqr8x_internal: 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 
1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 leaq 32(%r10),%rbp 1291 leaq (%rsi,%r9,1),%rsi 1292 1293 movq %r9,%rcx 1294 1295 1296 movq -32(%rsi,%rbp,1),%r14 1297 leaq 48+8(%rsp,%r9,2),%rdi 1298 movq -24(%rsi,%rbp,1),%rax 1299 leaq -32(%rdi,%rbp,1),%rdi 1300 movq -16(%rsi,%rbp,1),%rbx 1301 movq %rax,%r15 1302 1303 mulq %r14 1304 movq %rax,%r10 1305 movq %rbx,%rax 1306 movq %rdx,%r11 1307 movq %r10,-24(%rdi,%rbp,1) 1308 1309 mulq %r14 1310 addq %rax,%r11 1311 movq %rbx,%rax 1312 adcq $0,%rdx 1313 movq %r11,-16(%rdi,%rbp,1) 1314 movq %rdx,%r10 1315 1316 1317 movq -8(%rsi,%rbp,1),%rbx 1318 mulq %r15 1319 movq %rax,%r12 1320 movq %rbx,%rax 1321 movq %rdx,%r13 1322 1323 leaq (%rbp),%rcx 1324 mulq %r14 1325 addq %rax,%r10 1326 movq %rbx,%rax 1327 movq %rdx,%r11 1328 adcq $0,%r11 1329 addq %r12,%r10 1330 adcq $0,%r11 1331 movq %r10,-8(%rdi,%rcx,1) 1332 jmp .Lsqr4x_1st 1333 1334.align 32 1335.Lsqr4x_1st: 1336 movq (%rsi,%rcx,1),%rbx 1337 mulq %r15 1338 addq %rax,%r13 1339 movq %rbx,%rax 1340 movq %rdx,%r12 1341 adcq $0,%r12 1342 1343 mulq %r14 1344 addq %rax,%r11 1345 movq %rbx,%rax 1346 movq 8(%rsi,%rcx,1),%rbx 1347 movq %rdx,%r10 1348 adcq $0,%r10 1349 addq %r13,%r11 1350 adcq $0,%r10 1351 1352 1353 mulq %r15 1354 addq %rax,%r12 1355 movq %rbx,%rax 1356 movq %r11,(%rdi,%rcx,1) 1357 movq %rdx,%r13 1358 adcq $0,%r13 1359 1360 mulq %r14 1361 addq %rax,%r10 1362 movq %rbx,%rax 1363 movq 16(%rsi,%rcx,1),%rbx 1364 movq %rdx,%r11 1365 adcq $0,%r11 1366 addq %r12,%r10 1367 adcq $0,%r11 1368 1369 mulq %r15 1370 addq %rax,%r13 1371 movq %rbx,%rax 1372 movq %r10,8(%rdi,%rcx,1) 1373 movq %rdx,%r12 1374 adcq $0,%r12 1375 1376 mulq %r14 1377 addq %rax,%r11 1378 movq %rbx,%rax 1379 movq 24(%rsi,%rcx,1),%rbx 1380 movq %rdx,%r10 1381 adcq $0,%r10 1382 addq %r13,%r11 1383 adcq $0,%r10 1384 1385 1386 mulq %r15 1387 addq %rax,%r12 1388 movq %rbx,%rax 1389 movq %r11,16(%rdi,%rcx,1) 1390 movq %rdx,%r13 1391 adcq $0,%r13 1392 leaq 32(%rcx),%rcx 1393 1394 mulq %r14 1395 addq %rax,%r10 
1396 movq %rbx,%rax 1397 movq %rdx,%r11 1398 adcq $0,%r11 1399 addq %r12,%r10 1400 adcq $0,%r11 1401 movq %r10,-8(%rdi,%rcx,1) 1402 1403 cmpq $0,%rcx 1404 jne .Lsqr4x_1st 1405 1406 mulq %r15 1407 addq %rax,%r13 1408 leaq 16(%rbp),%rbp 1409 adcq $0,%rdx 1410 addq %r11,%r13 1411 adcq $0,%rdx 1412 1413 movq %r13,(%rdi) 1414 movq %rdx,%r12 1415 movq %rdx,8(%rdi) 1416 jmp .Lsqr4x_outer 1417 1418.align 32 1419.Lsqr4x_outer: 1420 movq -32(%rsi,%rbp,1),%r14 1421 leaq 48+8(%rsp,%r9,2),%rdi 1422 movq -24(%rsi,%rbp,1),%rax 1423 leaq -32(%rdi,%rbp,1),%rdi 1424 movq -16(%rsi,%rbp,1),%rbx 1425 movq %rax,%r15 1426 1427 mulq %r14 1428 movq -24(%rdi,%rbp,1),%r10 1429 addq %rax,%r10 1430 movq %rbx,%rax 1431 adcq $0,%rdx 1432 movq %r10,-24(%rdi,%rbp,1) 1433 movq %rdx,%r11 1434 1435 mulq %r14 1436 addq %rax,%r11 1437 movq %rbx,%rax 1438 adcq $0,%rdx 1439 addq -16(%rdi,%rbp,1),%r11 1440 movq %rdx,%r10 1441 adcq $0,%r10 1442 movq %r11,-16(%rdi,%rbp,1) 1443 1444 xorq %r12,%r12 1445 1446 movq -8(%rsi,%rbp,1),%rbx 1447 mulq %r15 1448 addq %rax,%r12 1449 movq %rbx,%rax 1450 adcq $0,%rdx 1451 addq -8(%rdi,%rbp,1),%r12 1452 movq %rdx,%r13 1453 adcq $0,%r13 1454 1455 mulq %r14 1456 addq %rax,%r10 1457 movq %rbx,%rax 1458 adcq $0,%rdx 1459 addq %r12,%r10 1460 movq %rdx,%r11 1461 adcq $0,%r11 1462 movq %r10,-8(%rdi,%rbp,1) 1463 1464 leaq (%rbp),%rcx 1465 jmp .Lsqr4x_inner 1466 1467.align 32 1468.Lsqr4x_inner: 1469 movq (%rsi,%rcx,1),%rbx 1470 mulq %r15 1471 addq %rax,%r13 1472 movq %rbx,%rax 1473 movq %rdx,%r12 1474 adcq $0,%r12 1475 addq (%rdi,%rcx,1),%r13 1476 adcq $0,%r12 1477 1478.byte 0x67 1479 mulq %r14 1480 addq %rax,%r11 1481 movq %rbx,%rax 1482 movq 8(%rsi,%rcx,1),%rbx 1483 movq %rdx,%r10 1484 adcq $0,%r10 1485 addq %r13,%r11 1486 adcq $0,%r10 1487 1488 mulq %r15 1489 addq %rax,%r12 1490 movq %r11,(%rdi,%rcx,1) 1491 movq %rbx,%rax 1492 movq %rdx,%r13 1493 adcq $0,%r13 1494 addq 8(%rdi,%rcx,1),%r12 1495 leaq 16(%rcx),%rcx 1496 adcq $0,%r13 1497 1498 mulq %r14 1499 addq %rax,%r10 1500 
movq %rbx,%rax 1501 adcq $0,%rdx 1502 addq %r12,%r10 1503 movq %rdx,%r11 1504 adcq $0,%r11 1505 movq %r10,-8(%rdi,%rcx,1) 1506 1507 cmpq $0,%rcx 1508 jne .Lsqr4x_inner 1509 1510.byte 0x67 1511 mulq %r15 1512 addq %rax,%r13 1513 adcq $0,%rdx 1514 addq %r11,%r13 1515 adcq $0,%rdx 1516 1517 movq %r13,(%rdi) 1518 movq %rdx,%r12 1519 movq %rdx,8(%rdi) 1520 1521 addq $16,%rbp 1522 jnz .Lsqr4x_outer 1523 1524 1525 movq -32(%rsi),%r14 1526 leaq 48+8(%rsp,%r9,2),%rdi 1527 movq -24(%rsi),%rax 1528 leaq -32(%rdi,%rbp,1),%rdi 1529 movq -16(%rsi),%rbx 1530 movq %rax,%r15 1531 1532 mulq %r14 1533 addq %rax,%r10 1534 movq %rbx,%rax 1535 movq %rdx,%r11 1536 adcq $0,%r11 1537 1538 mulq %r14 1539 addq %rax,%r11 1540 movq %rbx,%rax 1541 movq %r10,-24(%rdi) 1542 movq %rdx,%r10 1543 adcq $0,%r10 1544 addq %r13,%r11 1545 movq -8(%rsi),%rbx 1546 adcq $0,%r10 1547 1548 mulq %r15 1549 addq %rax,%r12 1550 movq %rbx,%rax 1551 movq %r11,-16(%rdi) 1552 movq %rdx,%r13 1553 adcq $0,%r13 1554 1555 mulq %r14 1556 addq %rax,%r10 1557 movq %rbx,%rax 1558 movq %rdx,%r11 1559 adcq $0,%r11 1560 addq %r12,%r10 1561 adcq $0,%r11 1562 movq %r10,-8(%rdi) 1563 1564 mulq %r15 1565 addq %rax,%r13 1566 movq -16(%rsi),%rax 1567 adcq $0,%rdx 1568 addq %r11,%r13 1569 adcq $0,%rdx 1570 1571 movq %r13,(%rdi) 1572 movq %rdx,%r12 1573 movq %rdx,8(%rdi) 1574 1575 mulq %rbx 1576 addq $16,%rbp 1577 xorq %r14,%r14 1578 subq %r9,%rbp 1579 xorq %r15,%r15 1580 1581 addq %r12,%rax 1582 adcq $0,%rdx 1583 movq %rax,8(%rdi) 1584 movq %rdx,16(%rdi) 1585 movq %r15,24(%rdi) 1586 1587 movq -16(%rsi,%rbp,1),%rax 1588 leaq 48+8(%rsp),%rdi 1589 xorq %r10,%r10 1590 movq 8(%rdi),%r11 1591 1592 leaq (%r14,%r10,2),%r12 1593 shrq $63,%r10 1594 leaq (%rcx,%r11,2),%r13 1595 shrq $63,%r11 1596 orq %r10,%r13 1597 movq 16(%rdi),%r10 1598 movq %r11,%r14 1599 mulq %rax 1600 negq %r15 1601 movq 24(%rdi),%r11 1602 adcq %rax,%r12 1603 movq -8(%rsi,%rbp,1),%rax 1604 movq %r12,(%rdi) 1605 adcq %rdx,%r13 1606 1607 leaq (%r14,%r10,2),%rbx 1608 movq 
%r13,8(%rdi) 1609 sbbq %r15,%r15 1610 shrq $63,%r10 1611 leaq (%rcx,%r11,2),%r8 1612 shrq $63,%r11 1613 orq %r10,%r8 1614 movq 32(%rdi),%r10 1615 movq %r11,%r14 1616 mulq %rax 1617 negq %r15 1618 movq 40(%rdi),%r11 1619 adcq %rax,%rbx 1620 movq 0(%rsi,%rbp,1),%rax 1621 movq %rbx,16(%rdi) 1622 adcq %rdx,%r8 1623 leaq 16(%rbp),%rbp 1624 movq %r8,24(%rdi) 1625 sbbq %r15,%r15 1626 leaq 64(%rdi),%rdi 1627 jmp .Lsqr4x_shift_n_add 1628 1629.align 32 1630.Lsqr4x_shift_n_add: 1631 leaq (%r14,%r10,2),%r12 1632 shrq $63,%r10 1633 leaq (%rcx,%r11,2),%r13 1634 shrq $63,%r11 1635 orq %r10,%r13 1636 movq -16(%rdi),%r10 1637 movq %r11,%r14 1638 mulq %rax 1639 negq %r15 1640 movq -8(%rdi),%r11 1641 adcq %rax,%r12 1642 movq -8(%rsi,%rbp,1),%rax 1643 movq %r12,-32(%rdi) 1644 adcq %rdx,%r13 1645 1646 leaq (%r14,%r10,2),%rbx 1647 movq %r13,-24(%rdi) 1648 sbbq %r15,%r15 1649 shrq $63,%r10 1650 leaq (%rcx,%r11,2),%r8 1651 shrq $63,%r11 1652 orq %r10,%r8 1653 movq 0(%rdi),%r10 1654 movq %r11,%r14 1655 mulq %rax 1656 negq %r15 1657 movq 8(%rdi),%r11 1658 adcq %rax,%rbx 1659 movq 0(%rsi,%rbp,1),%rax 1660 movq %rbx,-16(%rdi) 1661 adcq %rdx,%r8 1662 1663 leaq (%r14,%r10,2),%r12 1664 movq %r8,-8(%rdi) 1665 sbbq %r15,%r15 1666 shrq $63,%r10 1667 leaq (%rcx,%r11,2),%r13 1668 shrq $63,%r11 1669 orq %r10,%r13 1670 movq 16(%rdi),%r10 1671 movq %r11,%r14 1672 mulq %rax 1673 negq %r15 1674 movq 24(%rdi),%r11 1675 adcq %rax,%r12 1676 movq 8(%rsi,%rbp,1),%rax 1677 movq %r12,0(%rdi) 1678 adcq %rdx,%r13 1679 1680 leaq (%r14,%r10,2),%rbx 1681 movq %r13,8(%rdi) 1682 sbbq %r15,%r15 1683 shrq $63,%r10 1684 leaq (%rcx,%r11,2),%r8 1685 shrq $63,%r11 1686 orq %r10,%r8 1687 movq 32(%rdi),%r10 1688 movq %r11,%r14 1689 mulq %rax 1690 negq %r15 1691 movq 40(%rdi),%r11 1692 adcq %rax,%rbx 1693 movq 16(%rsi,%rbp,1),%rax 1694 movq %rbx,16(%rdi) 1695 adcq %rdx,%r8 1696 movq %r8,24(%rdi) 1697 sbbq %r15,%r15 1698 leaq 64(%rdi),%rdi 1699 addq $32,%rbp 1700 jnz .Lsqr4x_shift_n_add 1701 1702 leaq (%r14,%r10,2),%r12 
/*
 * Tail of bn_sqr8x_internal (the function begins earlier in the file).
 * This region finishes the squaring "shift-and-add-diagonal" pass and
 * falls through into the 8-limb-at-a-time Montgomery reduction.
 * Generated code (perlasm); do not hand-edit — regenerate instead.
 */
.byte	0x67				/* address-size prefix used as padding by the generator */
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	-16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax			/* square current limb: rdx:rax = rax^2 */
	negq	%r15			/* recreate carry saved in r15 */
	movq	-8(%rdi),%r11
	adcq	%rax,%r12
	movq	-8(%rsi),%rax
	movq	%r12,-32(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,-24(%rdi)
	sbbq	%r15,%r15		/* save carry as 0/-1 mask */
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	mulq	%rax
	negq	%r15
	adcq	%rax,%rbx
	adcq	%rdx,%r8
	movq	%rbx,-16(%rdi)
	movq	%r8,-8(%rdi)
.byte	102,72,15,126,213		/* hand-encoded movq xmm->GPR (raw bytes for old assemblers) */
/*
 * __bn_sqr8x_reduction: Montgomery-reduce the double-width square in the
 * temporary area, 8 limbs per outer iteration, using mul/adc only (non-ADX
 * path).  rbp = modulus, 32+8(%rsp) = n0.  Entered by fall-through.
 */
__bn_sqr8x_reduction:
	xorq	%rax,%rax
	leaq	(%r9,%rbp,1),%rcx	/* end of modulus */
	leaq	48+8(%rsp,%r9,2),%rdx	/* end of t[] */
	movq	%rcx,0+8(%rsp)
	leaq	48+8(%rsp,%r9,1),%rdi
	movq	%rdx,8+8(%rsp)
	negq	%r9
	jmp	.L8x_reduction_loop

.align	32
.L8x_reduction_loop:
	leaq	(%rdi,%r9,1),%rdi
.byte	0x66
	movq	0(%rdi),%rbx		/* load 8 limbs of the running value */
	movq	8(%rdi),%r9
	movq	16(%rdi),%r10
	movq	24(%rdi),%r11
	movq	32(%rdi),%r12
	movq	40(%rdi),%r13
	movq	48(%rdi),%r14
	movq	56(%rdi),%r15
	movq	%rax,(%rdx)		/* store top-most carry bit */
	leaq	64(%rdi),%rdi

.byte	0x67
	movq	%rbx,%r8
	imulq	32+8(%rsp),%rbx		/* rbx = lowest limb * n0 (Montgomery multiplier) */
	movq	0(%rbp),%rax
	movl	$8,%ecx
	jmp	.L8x_reduce

.align	32
.L8x_reduce:				/* one limb of n[] folded in per pass, 8 passes */
	mulq	%rbx
	movq	8(%rbp),%rax
	negq	%r8
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	movq	%rbx,48-8+8(%rsp,%rcx,8) /* stash multiplier for the tail pass */
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	32+8(%rsp),%rsi		/* rsi = n0 */
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	imulq	%r8,%rsi		/* next multiplier = low limb * n0 */
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rbp),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	%rsi,%rbx		/* rotate in the pre-computed next multiplier */
	addq	%rax,%r15
	movq	0(%rbp),%rax
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	.L8x_reduce

	leaq	64(%rbp),%rbp
	xorq	%rax,%rax
	movq	8+8(%rsp),%rdx
	cmpq	0+8(%rsp),%rbp		/* past the end of the modulus? */
	jae	.L8x_no_tail

.byte	0x66
	addq	0(%rdi),%r8		/* absorb next 8 limbs of t[] */
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi		/* carry -> 0/-1 mask */

	movq	48+56+8(%rsp),%rbx	/* reload first stashed multiplier */
	movl	$8,%ecx
	movq	0(%rbp),%rax
	jmp	.L8x_tail

.align	32
.L8x_tail:				/* propagate the 8 stashed multipliers across remaining n[] */
	mulq	%rbx
	addq	%rax,%r8
	movq	8(%rbp),%rax
	movq	%r8,(%rdi)
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	leaq	8(%rdi),%rdi
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rbp),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	48-16+8(%rsp,%rcx,8),%rbx /* next stashed multiplier */
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	0(%rbp),%rax
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	.L8x_tail

	leaq	64(%rbp),%rbp
	movq	8+8(%rsp),%rdx
	cmpq	0+8(%rsp),%rbp
	jae	.L8x_tail_done

	movq	48+56+8(%rsp),%rbx
	negq	%rsi			/* restore saved carry from mask */
	movq	0(%rbp),%rax
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi

	movl	$8,%ecx
	jmp	.L8x_tail

.align	32
.L8x_tail_done:
	xorq	%rax,%rax
	addq	(%rdx),%r8		/* fold in top-most carry saved earlier */
	adcq	$0,%r9
	adcq	$0,%r10
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15
	adcq	$0,%rax

	negq	%rsi
.L8x_no_tail:
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	adcq	$0,%rax			/* rax = top-most carry of the reduction */
	movq	-8(%rbp),%rcx
	xorq	%rsi,%rsi

.byte	102,72,15,126,213		/* hand-encoded movq xmm->GPR */

	movq	%r8,0(%rdi)		/* store reduced 8 limbs */
	movq	%r9,8(%rdi)
.byte	102,73,15,126,217		/* hand-encoded movq xmm->GPR */
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)
	leaq	64(%rdi),%rdi

	cmpq	%rdx,%rdi
	jb	.L8x_reduction_loop
	.byte	0xf3,0xc3		/* rep ret (AMD branch-predictor friendly return) */
.size	bn_sqr8x_internal,.-bn_sqr8x_internal

/*
 * __bn_post4x_internal: conditional final subtraction after squaring.
 * Computes t[] - n[] (4 limbs per pass) under a borrow-derived all-ones/zero
 * mask in %rax, branch-free with respect to the secret condition.
 * In: rbp = modulus, rdi/rbx = value, r9 = -num*8.  Clobbers r10,r12-r15,rcx.
 */
.type	__bn_post4x_internal,@function
.align	32
__bn_post4x_internal:
	movq	0(%rbp),%r12
	leaq	(%rdi,%r9,1),%rbx
	movq	%r9,%rcx
.byte	102,72,15,126,207		/* hand-encoded movq xmm->GPR */
	negq	%rax
.byte	102,72,15,126,206		/* hand-encoded movq xmm->GPR */
	sarq	$3+2,%rcx		/* limb count / 4 (negative loop counter) */
	decq	%r12			/* so that &0xffff.. adds n-1 and borrow below */
	xorq	%r10,%r10
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqr4x_sub_entry

.align	16
.Lsqr4x_sub:
	movq	0(%rbp),%r12
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
.Lsqr4x_sub_entry:
	leaq	32(%rbp),%rbp
	notq	%r12			/* ~n & mask == (-n-1) & mask; with the adc chain */
	notq	%r13			/* this performs t - (n & mask) branch-free */
	notq	%r14
	notq	%r15
	andq	%rax,%r12		/* rax = 0 or all-ones selection mask */
	andq	%rax,%r13
	andq	%rax,%r14
	andq	%rax,%r15

	negq	%r10			/* restore borrow */
	adcq	0(%rbx),%r12
	adcq	8(%rbx),%r13
	adcq	16(%rbx),%r14
	adcq	24(%rbx),%r15
	movq	%r12,0(%rdi)
	leaq	32(%rbx),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r10,%r10		/* save borrow */
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi

	incq	%rcx
	jnz	.Lsqr4x_sub

	movq	%r9,%r10
	negq	%r9			/* restore num*8 */
	.byte	0xf3,0xc3
.size	__bn_post4x_internal,.-__bn_post4x_internal

/*
 * bn_from_montgomery: convert from Montgomery representation.
 * Only handles num divisible by 8 — tail-calls bn_from_mont8x in that case,
 * otherwise returns 0 so the caller falls back to another path.
 */
.globl	bn_from_montgomery
.type	bn_from_montgomery,@function
.align	32
bn_from_montgomery:
	testl	$7,%r9d
	jz	bn_from_mont8x		/* tail call; same argument registers */
	xorl	%eax,%eax		/* return 0: unsupported size here */
	.byte	0xf3,0xc3
.size	bn_from_montgomery,.-bn_from_montgomery

/*
 * bn_from_mont8x: out-of-place from-Montgomery conversion for num % 8 == 0.
 * Standard SysV prologue; allocates a frame sized/placed so the scattered
 * table and tp[] do not alias %rdi cache-wise (see the sp_alt logic below).
 */
.type	bn_from_mont8x,@function
.align	32
bn_from_mont8x:
.cfi_startproc
.byte	0x67
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lfrom_prologue:

	shll	$3,%r9d			/* num *= 8 (bytes) */
	leaq	(%r9,%r9,2),%r10	/* r10 = 3*num*8 */
	negq	%r9
	movq	(%r8),%r8		/* *n0 */

	leaq	-320(%rsp,%r9,2),%r11	/* pick frame location relative to rdi to */
	movq	%rsp,%rbp		/* avoid 4K aliasing with the output */
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lfrom_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lfrom_sp_done

.align	32
.Lfrom_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lfrom_sp_done:
	andq	$-64,%rbp		/* 64-byte align the new stack */
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10		/* probe page (guard-page walk below) */
	cmpq	%rbp,%rsp
	ja	.Lfrom_page_walk
	jmp	.Lfrom_page_walk_done

/* Touch each page while moving rsp down so the OS guard page is extended. */
.Lfrom_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lfrom_page_walk
.Lfrom_page_walk_done:

	movq	%r9,%r10
	negq	%r9

	movq	%r8,32(%rsp)		/* save n0 */
	movq	%rax,40(%rsp)		/* save original rsp for the epilogue */
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lfrom_body:
	movq	%r9,%r11
	leaq	48(%rsp),%rax		/* tp[] */
	pxor	%xmm0,%xmm0
	jmp	.Lmul_by_1

/*
 * Copy a[] into the low half of tp[] and zero the high half, 64 bytes per
 * iteration; this sets up tp = a * 1 prior to a single Montgomery reduction.
 */
.align	32
.Lmul_by_1:
	movdqu	(%rsi),%xmm1
	movdqu	16(%rsi),%xmm2
	movdqu	32(%rsi),%xmm3
	movdqa	%xmm0,(%rax,%r9,1)	/* zero upper half */
	movdqu	48(%rsi),%xmm4
	movdqa	%xmm0,16(%rax,%r9,1)
.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00	/* hand-encoded leaq 64(%rsi),%rsi */
	movdqa	%xmm1,(%rax)
	movdqa	%xmm0,32(%rax,%r9,1)
	movdqa	%xmm2,16(%rax)
	movdqa	%xmm0,48(%rax,%r9,1)
	movdqa	%xmm3,32(%rax)
	movdqa	%xmm4,48(%rax)
	leaq	64(%rax),%rax
	subq	$64,%r11
	jnz	.Lmul_by_1

.byte	102,72,15,110,207		/* hand-encoded movq GPR->xmm (park args in xmm regs) */
.byte	102,72,15,110,209
.byte	0x67
	movq	%rcx,%rbp		/* rbp = modulus */
.byte	102,73,15,110,218
	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
	andl	$0x80108,%r11d		/* BMI2+ADX+BMI1 feature bits */
	cmpl	$0x80108,%r11d
	jne	.Lfrom_mont_nox		/* fall back to non-MULX path */

	leaq	(%rax,%r9,1),%rdi
	call	__bn_sqrx8x_reduction	/* MULX/ADX reduction */
	call	__bn_postx4x_internal

	pxor	%xmm0,%xmm0
	leaq	48(%rsp),%rax
	jmp	.Lfrom_mont_zero

.align	32
.Lfrom_mont_nox:
	call	__bn_sqr8x_reduction	/* classic mul/adc reduction */
	call	__bn_post4x_internal

	pxor	%xmm0,%xmm0
	leaq	48(%rsp),%rax
	jmp	.Lfrom_mont_zero

/* Wipe the temporary frame (secret-dependent data) before returning. */
.align	32
.Lfrom_mont_zero:
	movq	40(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm0,16(%rax)
	movdqa	%xmm0,32(%rax)
	movdqa	%xmm0,48(%rax)
	leaq	64(%rax),%rax
	subq	$32,%r9
	jnz	.Lfrom_mont_zero

	movq	$1,%rax			/* return 1: handled */
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp		/* restore caller's stack */
.cfi_def_cfa_register	%rsp
.Lfrom_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	bn_from_mont8x,.-bn_from_mont8x

/*
 * bn_mulx4x_mont_gather5: MULX/ADX Montgomery multiplication with a
 * cache-timing-safe gather of the 5-bit-indexed power table.
 * Thin wrapper: sets up the frame then calls mulx4x_internal.
 */
.type	bn_mulx4x_mont_gather5,@function
.align	32
bn_mulx4x_mont_gather5:
.cfi_startproc
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lmulx4x_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lmulx4x_prologue:

	shll	$3,%r9d			/* num *= 8 */
	leaq	(%r9,%r9,2),%r10
	negq	%r9
	movq	(%r8),%r8		/* *n0 */

	leaq	-320(%rsp,%r9,2),%r11	/* same 4K-aliasing avoidance as above */
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lmulx4xsp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lmulx4xsp_done

.Lmulx4xsp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lmulx4xsp_done:
	andq	$-64,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
	jmp	.Lmulx4x_page_walk_done

.Lmulx4x_page_walk:			/* guard-page walk, as in bn_from_mont8x */
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
.Lmulx4x_page_walk_done:

	movq	%r8,32(%rsp)		/* save n0 */
	movq	%rax,40(%rsp)		/* save original rsp */
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lmulx4x_body:
	call	mulx4x_internal

	movq	40(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movq	$1,%rax			/* return 1 */

	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmulx4x_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5

/*
 * mulx4x_internal: core of the MULX/ADX Montgomery multiply-with-gather.
 * On entry (set up by the wrappers): rsi=a[], rdx=scattered power table,
 * rcx=n[], r9=num*8, 32(%rsp)=n0, rax points at the saved stack word whose
 * 8(%rax) holds the 5-bit power index.  Result is written via 56+8(%rsp).
 * The SSE2 pcmpeqd/pand/por ladder below builds a one-hot mask and reads
 * ALL table entries so the selection is cache-access-uniform.
 */
.type	mulx4x_internal,@function
.align	32
mulx4x_internal:
	movq	%r9,8(%rsp)		/* save num*8 */
	movq	%r9,%r10
	negq	%r9
	shlq	$5,%r9
	negq	%r10
	leaq	128(%rdx,%r9,1),%r13
	shrq	$5+5,%r9
	movd	8(%rax),%xmm5		/* xmm5 = power index */
	subq	$1,%r9
	leaq	.Linc(%rip),%rax
	movq	%r13,16+8(%rsp)		/* end-of-table sentinel */
	movq	%r9,24+8(%rsp)		/* inner-loop counter */
	movq	%rdi,56+8(%rsp)		/* save rp */
	movdqa	0(%rax),%xmm0		/* {0,1} */
	movdqa	16(%rax),%xmm1		/* {2,2} increment */
	leaq	88-112(%rsp,%r10,1),%r10
	leaq	128(%rdx),%rdi

	/* Build 16 one-hot comparison masks (index == 0..31, two per xmm). */
	pshufd	$0,%xmm5,%xmm5
	movdqa	%xmm1,%xmm4
.byte	0x67
	movdqa	%xmm1,%xmm2
.byte	0x67
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,112(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,128(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,144(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,160(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,176(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,192(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,208(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,224(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,240(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,256(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,272(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,288(%r10)
	movdqa	%xmm4,%xmm3
.byte	0x67
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,304(%r10)

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,320(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,336(%r10)

	/* Constant-time gather of b[0]: AND every table slot with its mask, OR all. */
	pand	64(%rdi),%xmm0
	pand	80(%rdi),%xmm1
	pand	96(%rdi),%xmm2
	movdqa	%xmm3,352(%r10)
	pand	112(%rdi),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-128(%rdi),%xmm4
	movdqa	-112(%rdi),%xmm5
	movdqa	-96(%rdi),%xmm2
	pand	112(%r10),%xmm4
	movdqa	-80(%rdi),%xmm3
	pand	128(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	144(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	160(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-64(%rdi),%xmm4
	movdqa	-48(%rdi),%xmm5
	movdqa	-32(%rdi),%xmm2
	pand	176(%r10),%xmm4
	movdqa	-16(%rdi),%xmm3
	pand	192(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	208(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	224(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	0(%rdi),%xmm4
	movdqa	16(%rdi),%xmm5
	movdqa	32(%rdi),%xmm2
	pand	240(%r10),%xmm4
	movdqa	48(%rdi),%xmm3
	pand	256(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	272(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	288(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	pxor	%xmm1,%xmm0
	pshufd	$0x4e,%xmm0,%xmm1	/* fold high qword into low */
	por	%xmm1,%xmm0
	leaq	256(%rdi),%rdi
.byte	102,72,15,126,194		/* hand-encoded movq xmm0->rdx (gathered b limb) */
	leaq	64+32+8(%rsp),%rbx

	/* First 4-limb column: a[0..3] * b[0], then one Montgomery fold. */
	movq	%rdx,%r9
	mulxq	0(%rsi),%r8,%rax
	mulxq	8(%rsi),%r11,%r12
	addq	%rax,%r11
	mulxq	16(%rsi),%rax,%r13
	adcq	%rax,%r12
	adcq	$0,%r13
	mulxq	24(%rsi),%rax,%r14

	movq	%r8,%r15
	imulq	32+8(%rsp),%r8		/* m = t[0] * n0 */
	xorq	%rbp,%rbp		/* rbp = 0: constant & clears CF/OF for adcx/adox */
	movq	%r8,%rdx

	movq	%rdi,8+8(%rsp)

	leaq	32(%rsi),%rsi
	adcxq	%rax,%r13
	adcxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10	/* fold in m*n[0..3] via dual carry chains */
	adcxq	%rax,%r15
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	movq	24+8(%rsp),%rdi
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r11,-24(%rbx)
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r12,-16(%rbx)
	jmp	.Lmulx4x_1st

/* First outer iteration: multiply remaining a[] by b[0] and reduce. */
.align	32
.Lmulx4x_1st:
	adcxq	%rbp,%r15
	mulxq	0(%rsi),%r10,%rax
	adcxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
.byte	0x67,0x67
	movq	%r8,%rdx
	adcxq	%rax,%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-32(%rbx)
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_1st

	movq	8(%rsp),%rax		/* num*8 */
	adcq	%rbp,%r15
	leaq	(%rsi,%rax,1),%rsi	/* rewind a[] */
	addq	%r15,%r14
	movq	8+8(%rsp),%rdi
	adcq	%rbp,%rbp		/* capture top carry */
	movq	%r14,-8(%rbx)
	jmp	.Lmulx4x_outer

/*
 * Outer loop: gather the next b limb (same constant-time AND/OR scan over
 * the whole table) and accumulate a[]*b[i] into t[], reducing as we go.
 */
.align	32
.Lmulx4x_outer:
	leaq	16-256(%rbx),%r10
	pxor	%xmm4,%xmm4
.byte	0x67,0x67
	pxor	%xmm5,%xmm5
	movdqa	-128(%rdi),%xmm0
	movdqa	-112(%rdi),%xmm1
	movdqa	-96(%rdi),%xmm2
	pand	256(%r10),%xmm0
	movdqa	-80(%rdi),%xmm3
	pand	272(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	288(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	304(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%rdi),%xmm0
	movdqa	-48(%rdi),%xmm1
	movdqa	-32(%rdi),%xmm2
	pand	320(%r10),%xmm0
	movdqa	-16(%rdi),%xmm3
	pand	336(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	352(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	368(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%rdi),%xmm0
	movdqa	16(%rdi),%xmm1
	movdqa	32(%rdi),%xmm2
	pand	384(%r10),%xmm0
	movdqa	48(%rdi),%xmm3
	pand	400(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	416(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	432(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%rdi),%xmm0
	movdqa	80(%rdi),%xmm1
	movdqa	96(%rdi),%xmm2
	pand	448(%r10),%xmm0
	movdqa	112(%rdi),%xmm3
	pand	464(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	480(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	496(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	pshufd	$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	leaq	256(%rdi),%rdi
.byte	102,72,15,126,194		/* hand-encoded movq xmm0->rdx (next b limb) */

	movq	%rbp,(%rbx)		/* store saved top carry */
	leaq	32(%rbx,%rax,1),%rbx
	mulxq	0(%rsi),%r8,%r11
	xorq	%rbp,%rbp		/* zero + clear both carry flags */
	movq	%rdx,%r9
	mulxq	8(%rsi),%r14,%r12
	adoxq	-32(%rbx),%r8		/* add previous t[] via OF chain */
	adcxq	%r14,%r11
	mulxq	16(%rsi),%r15,%r13
	adoxq	-24(%rbx),%r11
	adcxq	%r15,%r12
	mulxq	24(%rsi),%rdx,%r14
	adoxq	-16(%rbx),%r12
	adcxq	%rdx,%r13
	leaq	(%rcx,%rax,1),%rcx	/* rewind n[] */
	leaq	32(%rsi),%rsi
	adoxq	-8(%rbx),%r13
	adcxq	%rbp,%r14
	adoxq	%rbp,%r14

	movq	%r8,%r15
	imulq	32+8(%rsp),%r8		/* m = t[0] * n0 */

	movq	%r8,%rdx
	xorq	%rbp,%rbp
	movq	%rdi,8+8(%rsp)

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	24+8(%rsp),%rdi
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-24(%rbx)
	adoxq	%rbp,%r15
	movq	%r12,-16(%rbx)
	leaq	32(%rcx),%rcx
	jmp	.Lmulx4x_inner

.align	32
.Lmulx4x_inner:				/* steady-state 4-limb multiply-accumulate-reduce */
	mulxq	0(%rsi),%r10,%rax
	adcxq	%rbp,%r15
	adoxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	0(%rbx),%r10
	adoxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	8(%rbx),%r11
	adoxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
	movq	%r8,%rdx
	adcxq	16(%rbx),%r12
	adoxq	%rax,%r13
	adcxq	24(%rbx),%r13
	adoxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx
	adcxq	%rbp,%r14

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	adoxq	%r15,%r13
	movq	%r11,-32(%rbx)
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	leaq	32(%rcx),%rcx
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_inner

	movq	0+8(%rsp),%rax		/* num*8 */
	adcq	%rbp,%r15
	subq	0(%rbx),%rdi		/* rdi = 0 + set CF from pending carry word */
	movq	8+8(%rsp),%rdi
	movq	16+8(%rsp),%r10		/* end-of-table sentinel */
	adcq	%r15,%r14
	leaq	(%rsi,%rax,1),%rsi	/* rewind a[] */
	adcq	%rbp,%rbp		/* capture top carry */
	movq	%r14,-8(%rbx)

	cmpq	%r10,%rdi
	jb	.Lmulx4x_outer

	/* Done: prepare mask/borrow and tail-jump into the shared subtraction. */
	movq	-8(%rcx),%r10
	movq	%rbp,%r8
	movq	(%rcx,%rax,1),%r12
	leaq	(%rcx,%rax,1),%rbp	/* rbp = modulus */
	movq	%rax,%rcx
	leaq	(%rbx,%rax,1),%rdi
	xorl	%eax,%eax
	xorq	%r15,%r15
	subq	%r14,%r10		/* does top limb need subtraction? */
	adcq	%r15,%r15
	orq	%r15,%r8
	sarq	$3+2,%rcx
	subq	%r8,%rax		/* rax = 0 or all-ones selection mask */
	movq	56+8(%rsp),%rdx		/* rdx = rp */
	decq	%r12
	movq	8(%rbp),%r13
	xorq	%r8,%r8
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqrx4x_sub_entry	/* shared with __bn_postx4x_internal */
.size	mulx4x_internal,.-mulx4x_internal

/*
 * bn_powerx5: MULX/ADX variant of r = a^(2^5) * b mod n (five squarings
 * followed by one gathered multiplication).  Frame setup mirrors the other
 * entry points above.
 */
.type	bn_powerx5,@function
.align	32
bn_powerx5:
.cfi_startproc
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lpowerx5_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lpowerx5_prologue:

	shll	$3,%r9d			/* num *= 8 */
	leaq	(%r9,%r9,2),%r10
	negq	%r9
	movq	(%r8),%r8		/* *n0 */

	leaq	-320(%rsp,%r9,2),%r11	/* 4K-aliasing-aware frame placement */
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lpwrx_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lpwrx_sp_done

.align	32
.Lpwrx_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lpwrx_sp_done:
	andq	$-64,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lpwrx_page_walk
	jmp	.Lpwrx_page_walk_done

.Lpwrx_page_walk:			/* guard-page walk */
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lpwrx_page_walk
.Lpwrx_page_walk_done:

	movq	%r9,%r10
	negq	%r9

	pxor	%xmm0,%xmm0
.byte	102,72,15,110,207		/* hand-encoded movq GPR->xmm: park args for the callees */
.byte	102,72,15,110,209
.byte	102,73,15,110,218
.byte	102,72,15,110,226
	movq	%r8,32(%rsp)		/* save n0 */
	movq	%rax,40(%rsp)		/* save original rsp */
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lpowerx5_body:

	call	__bn_sqrx8x_internal	/* five square+reduce rounds */
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal

	movq	%r10,%r9
	movq	%rsi,%rdi
.byte	102,72,15,126,209		/* hand-encoded movq xmm->GPR: recover parked args */
.byte	102,72,15,126,226
	movq	40(%rsp),%rax

	call	mulx4x_internal		/* final gathered multiplication */

	movq	40(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movq	$1,%rax			/* return 1 */

	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lpowerx5_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	bn_powerx5,.-bn_powerx5

/*
 * bn_sqrx8x_internal / __bn_sqrx8x_internal:
 * MULX/ADX squaring, 8 limbs per outer iteration, into the temporary frame,
 * followed (by fall-through further below) by the Montgomery reduction.
 * In: rsi = a[], r9 = num*8, temporary area at 48+8(%rsp).
 * Internal helper — custom register contract, not a C-callable function.
 */
.globl	bn_sqrx8x_internal
.hidden	bn_sqrx8x_internal
.type	bn_sqrx8x_internal,@function
.align	32
bn_sqrx8x_internal:
__bn_sqrx8x_internal:

	leaq	48+8(%rsp),%rdi
	leaq	(%rsi,%r9,1),%rbp	/* rbp = end of a[] */
	movq	%r9,0+8(%rsp)
	movq	%rbp,8+8(%rsp)
	jmp	.Lsqr8x_zero_start

.align	32
.byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00	/* long nop (alignment filler) */
.Lsqrx8x_zero:				/* zero the 2*num temporary area, 128 B/iter */
.byte	0x3e
	movdqa	%xmm0,0(%rdi)
	movdqa	%xmm0,16(%rdi)
	movdqa	%xmm0,32(%rdi)
	movdqa	%xmm0,48(%rdi)
.Lsqr8x_zero_start:
	movdqa	%xmm0,64(%rdi)
	movdqa	%xmm0,80(%rdi)
	movdqa	%xmm0,96(%rdi)
	movdqa	%xmm0,112(%rdi)
	leaq	128(%rdi),%rdi
	subq	$64,%r9
	jnz	.Lsqrx8x_zero

	movq	0(%rsi),%rdx		/* a[0] is the implicit mulx multiplicand */

	xorq	%r10,%r10
	xorq	%r11,%r11
	xorq	%r12,%r12
	xorq	%r13,%r13
	xorq	%r14,%r14
	xorq	%r15,%r15
	leaq	48+8(%rsp),%rdi
	xorq	%rbp,%rbp		/* zero constant; also clears CF/OF */
	jmp	.Lsqrx8x_outer_loop

/*
 * Outer loop: compute the off-diagonal products of the current 8-limb
 * window (a[i]*a[j], i<j) using adcx/adox dual carry chains.
 * Raw 0xc4,0xe2,... byte runs are VEX-encoded mulx forms emitted by the
 * generator for assemblers without mulx support.
 */
.align	32
.Lsqrx8x_outer_loop:
	mulxq	8(%rsi),%r8,%rax
	adcxq	%r9,%r8
	adoxq	%rax,%r10
	mulxq	16(%rsi),%r9,%rax
	adcxq	%r10,%r9
	adoxq	%rax,%r11
.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00	/* hand-encoded mulx 24(%rsi) */
	adcxq	%r11,%r10
	adoxq	%rax,%r12
.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00	/* hand-encoded mulx 32(%rsi) */
	adcxq	%r12,%r11
	adoxq	%rax,%r13
	mulxq	40(%rsi),%r12,%rax
	adcxq	%r13,%r12
	adoxq	%rax,%r14
	mulxq	48(%rsi),%r13,%rax
	adcxq	%r14,%r13
	adoxq	%r15,%rax
	mulxq	56(%rsi),%r14,%r15
	movq	8(%rsi),%rdx
	adcxq	%rax,%r14
	adoxq	%rbp,%r15
	adcq	64(%rdi),%r15
	movq	%r8,8(%rdi)
	movq	%r9,16(%rdi)
	sbbq	%rcx,%rcx		/* save top carry as mask */
	xorq	%rbp,%rbp		/* reset carry chains */

	mulxq	16(%rsi),%r8,%rbx	/* row a[1]*a[2..7] */
	mulxq	24(%rsi),%r9,%rax
	adcxq	%r10,%r8
	adoxq	%rbx,%r9
	mulxq	32(%rsi),%r10,%rbx
	adcxq	%r11,%r9
	adoxq	%rax,%r10
.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00	/* hand-encoded mulx 40(%rsi) */
	adcxq	%r12,%r10
	adoxq	%rbx,%r11
.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00	/* hand-encoded mulx 48(%rsi) */
	adcxq	%r13,%r11
	adoxq	%r14,%r12
.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00	/* hand-encoded mulx 56(%rsi) */
	movq	16(%rsi),%rdx
	adcxq	%rax,%r12
	adoxq	%rbx,%r13
	adcxq	%r15,%r13
	adoxq	%rbp,%r14
	adcxq	%rbp,%r14

	movq	%r8,24(%rdi)
	movq	%r9,32(%rdi)

	mulxq	24(%rsi),%r8,%rbx	/* row a[2]*a[3..7] */
	mulxq	32(%rsi),%r9,%rax
	adcxq	%r10,%r8
	adoxq	%rbx,%r9
	mulxq	40(%rsi),%r10,%rbx
	adcxq	%r11,%r9
	adoxq	%rax,%r10
.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00	/* hand-encoded mulx 48(%rsi) */
	adcxq	%r12,%r10
	adoxq	%r13,%r11
.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00	/* hand-encoded mulx 56(%rsi) */
.byte	0x3e
	movq	24(%rsi),%rdx
	adcxq	%rbx,%r11
	adoxq	%rax,%r12
	adcxq	%r14,%r12
	movq	%r8,40(%rdi)
	movq	%r9,48(%rdi)
	mulxq	32(%rsi),%r8,%rax	/* row a[3]*a[4..7] */
	adoxq	%rbp,%r13
	adcxq	%rbp,%r13

	mulxq	40(%rsi),%r9,%rbx
	adcxq	%r10,%r8
	adoxq	%rax,%r9
	mulxq	48(%rsi),%r10,%rax
	adcxq	%r11,%r9
	adoxq	%r12,%r10
	mulxq	56(%rsi),%r11,%r12
	movq	32(%rsi),%rdx
	movq	40(%rsi),%r14
	adcxq	%rbx,%r10
	adoxq	%rax,%r11
	movq	48(%rsi),%r15
	adcxq	%r13,%r11
	adoxq	%rbp,%r12
	adcxq	%rbp,%r12

	movq	%r8,56(%rdi)
	movq	%r9,64(%rdi)

	mulxq	%r14,%r9,%rax		/* remaining top-corner products */
	movq	56(%rsi),%r8
	adcxq	%r10,%r9
	mulxq	%r15,%r10,%rbx
	adoxq	%rax,%r10
	adcxq	%r11,%r10
	mulxq	%r8,%r11,%rax
	movq	%r14,%rdx
	adoxq	%rbx,%r11
	adcxq	%r12,%r11

	adcxq	%rbp,%rax

	mulxq	%r15,%r14,%rbx
	mulxq	%r8,%r12,%r13
	movq	%r15,%rdx
	leaq	64(%rsi),%rsi		/* advance to the next 8-limb window */
	adcxq	%r14,%r11
	adoxq	%rbx,%r12
	adcxq	%rax,%r12
	adoxq	%rbp,%r13

.byte	0x67,0x67
	mulxq	%r8,%r8,%r14
	adcxq	%r8,%r13
	adcxq	%rbp,%r14

	cmpq	8+8(%rsp),%rsi		/* reached end of a[]? */
	je	.Lsqrx8x_outer_break

	negq	%rcx			/* restore saved carry */
	movq	$-8,%rcx
	movq	%rbp,%r15
	movq	64(%rdi),%r8
	adcxq	72(%rdi),%r9		/* absorb previously stored partials */
	adcxq	80(%rdi),%r10
	adcxq	88(%rdi),%r11
	adcq	96(%rdi),%r12
	adcq	104(%rdi),%r13
	adcq	112(%rdi),%r14
	adcq	120(%rdi),%r15
	leaq	(%rsi),%rbp
	leaq	128(%rdi),%rdi
	sbbq	%rax,%rax		/* save carry as mask */

	movq	-64(%rsi),%rdx
	movq	%rax,16+8(%rsp)
	movq	%rdi,24+8(%rsp)

	xorl	%eax,%eax		/* clear CF/OF for the inner loop */
	jmp	.Lsqrx8x_loop

/* Inner loop: cross products of the current window against earlier limbs. */
.align	32
.Lsqrx8x_loop:
	movq	%r8,%rbx
	mulxq	0(%rbp),%rax,%r8
	adcxq	%rax,%rbx
	adoxq	%r9,%r8

	mulxq	8(%rbp),%rax,%r9
	adcxq	%rax,%r8
	adoxq	%r10,%r9

	mulxq	16(%rbp),%rax,%r10
	adcxq	%rax,%r9
	adoxq	%r11,%r10

	mulxq	24(%rbp),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11

.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	/* hand-encoded mulx 32(%rbp) */
	adcxq	%rax,%r11
	adoxq	%r13,%r12

	mulxq	40(%rbp),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

	mulxq	48(%rbp),%rax,%r14
	movq	%rbx,(%rdi,%rcx,8)
	movl	$0,%ebx
	adcxq	%rax,%r13
	adoxq	%r15,%r14

.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00	/* hand-encoded mulx 56(%rbp) */
	movq	8(%rsi,%rcx,8),%rdx
	adcxq	%rax,%r14
	adoxq	%rbx,%r15
	adcxq	%rbx,%r15

.byte	0x67
	incq	%rcx			/* counts up from -8 to 0 */
	jnz	.Lsqrx8x_loop

	leaq	64(%rbp),%rbp
	movq	$-8,%rcx
	cmpq	8+8(%rsp),%rbp
	je	.Lsqrx8x_break

	subq	16+8(%rsp),%rbx		/* restore saved carry from mask */
.byte	0x66
	movq	-64(%rsi),%rdx
	adcxq	0(%rdi),%r8		/* absorb stored partials */
	adcxq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	leaq	64(%rdi),%rdi
.byte	0x67
	sbbq	%rax,%rax		/* save carry as mask */
	xorl	%ebx,%ebx		/* clear CF/OF */
	movq	%rax,16+8(%rsp)
	jmp	.Lsqrx8x_loop

.align	32
.Lsqrx8x_break:
	xorq	%rbp,%rbp
	subq	16+8(%rsp),%rbx		/* restore carry */
	adcxq	%rbp,%r8
	movq	24+8(%rsp),%rcx
	adcxq	%rbp,%r9
	movq	0(%rsi),%rdx
	adcq	$0,%r10
	movq	%r8,0(%rdi)
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15
	cmpq	%rcx,%rdi		/* already at the saved position? */
	je	.Lsqrx8x_outer_loop

	movq	%r9,8(%rdi)		/* spill window, reload from saved position */
	movq	8(%rcx),%r9
	movq	%r10,16(%rdi)
	movq	16(%rcx),%r10
	movq	%r11,24(%rdi)
	movq	24(%rcx),%r11
	movq	%r12,32(%rdi)
	movq	32(%rcx),%r12
	movq	%r13,40(%rdi)
	movq	40(%rcx),%r13
	movq	%r14,48(%rdi)
	movq	48(%rcx),%r14
	movq	%r15,56(%rdi)
	movq	56(%rcx),%r15
	movq	%rcx,%rdi
	jmp	.Lsqrx8x_outer_loop

/*
 * All off-diagonal products done: store the top window, then double
 * everything and add the squared diagonal (a[i]^2) in the shift-n-add pass.
 */
.align	32
.Lsqrx8x_outer_break:
	movq	%r9,72(%rdi)
.byte	102,72,15,126,217		/* hand-encoded movq xmm->GPR */
	movq	%r10,80(%rdi)
	movq	%r11,88(%rdi)
	movq	%r12,96(%rdi)
	movq	%r13,104(%rdi)
	movq	%r14,112(%rdi)
	leaq	48+8(%rsp),%rdi
	movq	(%rsi,%rcx,1),%rdx	/* first limb to square */

	movq	8(%rdi),%r11
	xorq	%r10,%r10
	movq	0+8(%rsp),%r9		/* num*8 */
	adoxq	%r11,%r11		/* start doubling via the OF chain */
	movq	16(%rdi),%r12
	movq	24(%rdi),%r13

.align	32
.Lsqrx4x_shift_n_add:			/* t = 2*t + a[i]^2, 4 result pairs per pass */
	mulxq	%rdx,%rax,%rbx		/* rbx:rax = a[i]^2 */
	adoxq	%r12,%r12
	adcxq	%r10,%rax
.byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00	/* hand-encoded movq 8(%rsi,%rcx),%rdx */
.byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00	/* hand-encoded movq 32(%rdi),%r10 */
	adoxq	%r13,%r13
	adcxq	%r11,%rbx
	movq	40(%rdi),%r11
	movq	%rax,0(%rdi)
	movq	%rbx,8(%rdi)

	mulxq	%rdx,%rax,%rbx
	adoxq	%r10,%r10
	adcxq	%r12,%rax
	movq	16(%rsi,%rcx,1),%rdx
	movq	48(%rdi),%r12
	adoxq	%r11,%r11
	adcxq	%r13,%rbx
	movq	56(%rdi),%r13
	movq	%rax,16(%rdi)
	movq	%rbx,24(%rdi)

	mulxq	%rdx,%rax,%rbx
	adoxq	%r12,%r12
	adcxq	%r10,%rax
	movq	24(%rsi,%rcx,1),%rdx
	leaq	32(%rcx),%rcx
	movq	64(%rdi),%r10
	adoxq	%r13,%r13
	adcxq	%r11,%rbx
	movq	72(%rdi),%r11
	movq	%rax,32(%rdi)
	movq	%rbx,40(%rdi)

	mulxq	%rdx,%rax,%rbx
	adoxq	%r10,%r10
	adcxq	%r12,%rax
	jrcxz	.Lsqrx4x_shift_n_add_break	/* rcx==0: last quartet */
.byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00	/* hand-encoded movq 0(%rsi,%rcx),%rdx */
	adoxq	%r11,%r11
	adcxq	%r13,%rbx
	movq	80(%rdi),%r12
	movq	88(%rdi),%r13
	movq	%rax,48(%rdi)
	movq	%rbx,56(%rdi)
	leaq	64(%rdi),%rdi
	nop
	jmp	.Lsqrx4x_shift_n_add

.align	32
.Lsqrx4x_shift_n_add_break:
	adcxq	%r13,%rbx
	movq	%rax,48(%rdi)
	movq	%rbx,56(%rdi)
	leaq	64(%rdi),%rdi
.byte	102,72,15,126,213		/* hand-encoded movq xmm->GPR */
/*
 * __bn_sqrx8x_reduction: MULX/ADX Montgomery reduction of the double-width
 * square.  rbp = modulus, 32+8(%rsp) = n0.  Entered by fall-through.
 */
__bn_sqrx8x_reduction:
	xorl	%eax,%eax
	movq	32+8(%rsp),%rbx		/* rbx = n0 */
	movq	48+8(%rsp),%rdx
	leaq	-64(%rbp,%r9,1),%rcx	/* end of modulus */

	movq	%rcx,0+8(%rsp)
	movq	%rdi,8+8(%rsp)

	leaq	48+8(%rsp),%rdi
	jmp	.Lsqrx8x_reduction_loop

.align	32
.Lsqrx8x_reduction_loop:
	movq	8(%rdi),%r9		/* load 8 limbs of the running value */
	movq	16(%rdi),%r10
	movq	24(%rdi),%r11
	movq	32(%rdi),%r12
	movq	%rdx,%r8
	imulq	%rbx,%rdx		/* Montgomery multiplier m = t[0]*n0 */
	movq	40(%rdi),%r13
	movq	48(%rdi),%r14
	movq	56(%rdi),%r15
	movq	%rax,24+8(%rsp)		/* store top carry */

	leaq	64(%rdi),%rdi
	xorq	%rsi,%rsi		/* zero constant; clears CF/OF */
	movq	$-8,%rcx
	jmp	.Lsqrx8x_reduce

.align	32
.Lsqrx8x_reduce:			/* fold m*n[0..7] into the window */
	movq	%r8,%rbx
	mulxq	0(%rbp),%rax,%r8
	adcxq	%rbx,%rax
	adoxq	%r9,%r8

	mulxq	8(%rbp),%rbx,%r9
	adcxq	%rbx,%r8
	adoxq	%r10,%r9

	mulxq	16(%rbp),%rbx,%r10
	adcxq	%rbx,%r9
	adoxq	%r11,%r10

	mulxq	24(%rbp),%rbx,%r11
	adcxq	%rbx,%r10
	adoxq	%r12,%r11

.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	/* hand-encoded mulx 32(%rbp) */
	movq	%rdx,%rax
	movq	%r8,%rdx
	adcxq	%rbx,%r11
	adoxq	%r13,%r12

	mulxq	32+8(%rsp),%rbx,%rdx	/* next multiplier = t[0]*n0 */
	movq	%rax,%rdx
	movq	%rax,64+48+8(%rsp,%rcx,8)	/* stash current multiplier for the tail */

	mulxq	40(%rbp),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

	mulxq	48(%rbp),%rax,%r14
	adcxq	%rax,%r13
	adoxq	%r15,%r14

	mulxq	56(%rbp),%rax,%r15
	movq	%rbx,%rdx		/* rotate in the pre-computed multiplier */
	adcxq	%rax,%r14
	adoxq	%rsi,%r15
	adcxq	%rsi,%r15

.byte	0x67,0x67,0x67
	incq	%rcx
	jnz	.Lsqrx8x_reduce

	movq	%rsi,%rax
	cmpq	0+8(%rsp),%rbp		/* past end of modulus? */
	jae	.Lsqrx8x_no_tail

	movq	48+8(%rsp),%rdx
	addq	0(%rdi),%r8		/* absorb next 8 limbs of t[] */
	leaq	64(%rbp),%rbp
	movq	$-8,%rcx
	adcxq	8(%rdi),%r9
	adcxq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	leaq	64(%rdi),%rdi
	sbbq	%rax,%rax		/* save carry as mask */

	xorq	%rsi,%rsi
	movq	%rax,16+8(%rsp)
	jmp	.Lsqrx8x_tail

.align	32
.Lsqrx8x_tail:				/* propagate stashed multipliers over remaining n[] */
	movq	%r8,%rbx
	mulxq	0(%rbp),%rax,%r8
	adcxq	%rax,%rbx
	adoxq	%r9,%r8

	mulxq	8(%rbp),%rax,%r9
	adcxq	%rax,%r8
	adoxq	%r10,%r9

	mulxq	16(%rbp),%rax,%r10
	adcxq	%rax,%r9
	adoxq	%r11,%r10

	mulxq	24(%rbp),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11

.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	/* hand-encoded mulx 32(%rbp) */
	adcxq	%rax,%r11
	adoxq	%r13,%r12

	mulxq	40(%rbp),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

	mulxq	48(%rbp),%rax,%r14
	adcxq	%rax,%r13
	adoxq	%r15,%r14

	mulxq	56(%rbp),%rax,%r15
	movq	72+48+8(%rsp,%rcx,8),%rdx	/* next stashed multiplier */
	adcxq	%rax,%r14
	adoxq	%rsi,%r15
	movq	%rbx,(%rdi,%rcx,8)
	movq	%r8,%rbx
	adcxq	%rsi,%r15

	incq	%rcx
	jnz	.Lsqrx8x_tail

	cmpq	0+8(%rsp),%rbp
	jae	.Lsqrx8x_tail_done

	subq	16+8(%rsp),%rsi		/* restore saved carry */
	movq	48+8(%rsp),%rdx
	leaq	64(%rbp),%rbp
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	leaq	64(%rdi),%rdi
	sbbq	%rax,%rax
	subq	$8,%rcx

	xorq	%rsi,%rsi
	movq	%rax,16+8(%rsp)
	jmp	.Lsqrx8x_tail

.align	32
.Lsqrx8x_tail_done:
	xorq	%rax,%rax
	addq	24+8(%rsp),%r8		/* fold in stored top carry */
	adcq	$0,%r9
	adcq	$0,%r10
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15
	adcq	$0,%rax

	subq	16+8(%rsp),%rsi		/* restore carry */
.Lsqrx8x_no_tail:
	adcq	0(%rdi),%r8
.byte	102,72,15,126,217		/* hand-encoded movq xmm->GPR */
	adcq	8(%rdi),%r9
	movq	56(%rbp),%rsi		/* top modulus limb (used by caller's post pass) */
.byte	102,72,15,126,213		/* hand-encoded movq xmm->GPR */
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	adcq	$0,%rax			/* rax = top-most carry */

	movq	32+8(%rsp),%rbx		/* reload n0 */
	movq	64(%rdi,%rcx,1),%rdx

	movq	%r8,0(%rdi)		/* store reduced window */
	leaq	64(%rdi),%r8
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	leaq	64(%rdi,%rcx,1),%rdi
	cmpq	8+8(%rsp),%r8
	jb	.Lsqrx8x_reduction_loop
	.byte	0xf3,0xc3		/* rep ret */
.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal

/*
 * __bn_postx4x_internal: BMI2 (andn) variant of the conditional final
 * subtraction — branch-free with respect to the secret borrow, writing the
 * result through %rdx.  Shares .Lsqrx4x_sub_entry with mulx4x_internal.
 */
.align	32
__bn_postx4x_internal:
	movq	0(%rbp),%r12
	movq	%rcx,%r10
	movq	%rcx,%r9
	negq	%rax			/* rax: selection condition -> mask below */
	sarq	$3+2,%rcx		/* limb count / 4 (negative loop counter) */

.byte	102,72,15,126,202		/* hand-encoded movq xmm->GPR */
.byte	102,72,15,126,206		/* hand-encoded movq xmm->GPR */
	decq	%r12			/* pair with the adc chain: t - (n & mask) */
	movq	8(%rbp),%r13
	xorq	%r8,%r8
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqrx4x_sub_entry

.align	16
.Lsqrx4x_sub:
	movq	0(%rbp),%r12
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
.Lsqrx4x_sub_entry:
	andnq	%rax,%r12,%r12		/* r12 = ~n & mask (BMI1 andn) */
	leaq	32(%rbp),%rbp
	andnq	%rax,%r13,%r13
	andnq	%rax,%r14,%r14
	andnq	%rax,%r15,%r15

	negq	%r8			/* restore borrow */
	adcq	0(%rdi),%r12
	adcq	8(%rdi),%r13
	adcq	16(%rdi),%r14
	adcq	24(%rdi),%r15
	movq	%r12,0(%rdx)		/* write result to rp */
	leaq	32(%rdi),%rdi
	movq	%r13,8(%rdx)
	sbbq	%r8,%r8			/* save borrow */
	movq	%r14,16(%rdx)
	movq	%r15,24(%rdx)
	leaq	32(%rdx),%rdx

	incq	%rcx
	jnz	.Lsqrx4x_sub

	negq	%r9			/* restore num*8 */

	.byte	0xf3,0xc3
.size	__bn_postx4x_internal,.-__bn_postx4x_internal

/*
 * bn_get_bits5: extract a 5-bit window starting at bit %esi from the byte
 * array %rdi.  Reads a 16-bit word that may straddle a byte boundary and
 * shifts/masks the wanted 5 bits into %eax.
 */
.globl	bn_get_bits5
.type	bn_get_bits5,@function
.align	16
bn_get_bits5:
	leaq	0(%rdi),%r10
	leaq	1(%rdi),%r11
	movl	%esi,%ecx
	shrl	$4,%esi			/* 16-bit word index */
	andl	$15,%ecx		/* bit offset within the word */
	leal	-8(%rcx),%eax
	cmpl	$11,%ecx		/* window would cross the word end? */
	cmovaq	%r11,%r10		/* then read one byte later ... */
	cmoval	%eax,%ecx		/* ... at an 8-smaller shift */
	movzwl	(%r10,%rsi,2),%eax
	shrl	%cl,%eax
	andl	$31,%eax		/* keep 5 bits */
	.byte	0xf3,0xc3
.size	bn_get_bits5,.-bn_get_bits5

/*
 * bn_scatter5: store %esi limbs from %rdi into the power table at %rdx,
 * slot index %rcx, with a 256-byte stride between limbs (the interleaved
 * layout the constant-time gather above expects).
 */
.globl	bn_scatter5
.type	bn_scatter5,@function
.align	16
bn_scatter5:
	cmpl	$0,%esi
	jz	.Lscatter_epilogue	/* nothing to do for zero length */
	leaq	(%rdx,%rcx,8),%rdx
.Lscatter:
	movq	(%rdi),%rax
	leaq	8(%rdi),%rdi
	movq	%rax,(%rdx)
	leaq	256(%rdx),%rdx		/* stride = 32 slots * 8 bytes */
	subl	$1,%esi
	jnz	.Lscatter
.Lscatter_epilogue:
	.byte	0xf3,0xc3
.size	bn_scatter5,.-bn_scatter5

/*
 * bn_gather5: cache-timing-safe gather of one table entry (index %ecx)
 * using the same pcmpeqd mask ladder as mulx4x_internal.
 * NOTE: this function continues beyond the end of this chunk; the text
 * below is the visible head only.
 */
.globl	bn_gather5
.type	bn_gather5,@function
.align	32
bn_gather5:
.LSEH_begin_bn_gather5:

.byte	0x4c,0x8d,0x14,0x24		/* hand-encoded leaq (%rsp),%r10 */
.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00	/* hand-encoded subq $0x108,%rsp */
	leaq	.Linc(%rip),%rax
	andq	$-16,%rsp		/* 16-byte align for movdqa spills */

	movd	%ecx,%xmm5		/* xmm5 = index */
	movdqa	0(%rax),%xmm0
	movdqa	16(%rax),%xmm1
	leaq	128(%rdx),%r11
	leaq	128(%rsp),%rax

	pshufd	$0,%xmm5,%xmm5
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,-128(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,-112(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,-96(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa
%xmm3,-80(%rax) 3637 movdqa %xmm4,%xmm3 3638 3639 paddd %xmm1,%xmm2 3640 pcmpeqd %xmm5,%xmm1 3641 movdqa %xmm0,-64(%rax) 3642 movdqa %xmm4,%xmm0 3643 3644 paddd %xmm2,%xmm3 3645 pcmpeqd %xmm5,%xmm2 3646 movdqa %xmm1,-48(%rax) 3647 movdqa %xmm4,%xmm1 3648 3649 paddd %xmm3,%xmm0 3650 pcmpeqd %xmm5,%xmm3 3651 movdqa %xmm2,-32(%rax) 3652 movdqa %xmm4,%xmm2 3653 paddd %xmm0,%xmm1 3654 pcmpeqd %xmm5,%xmm0 3655 movdqa %xmm3,-16(%rax) 3656 movdqa %xmm4,%xmm3 3657 3658 paddd %xmm1,%xmm2 3659 pcmpeqd %xmm5,%xmm1 3660 movdqa %xmm0,0(%rax) 3661 movdqa %xmm4,%xmm0 3662 3663 paddd %xmm2,%xmm3 3664 pcmpeqd %xmm5,%xmm2 3665 movdqa %xmm1,16(%rax) 3666 movdqa %xmm4,%xmm1 3667 3668 paddd %xmm3,%xmm0 3669 pcmpeqd %xmm5,%xmm3 3670 movdqa %xmm2,32(%rax) 3671 movdqa %xmm4,%xmm2 3672 paddd %xmm0,%xmm1 3673 pcmpeqd %xmm5,%xmm0 3674 movdqa %xmm3,48(%rax) 3675 movdqa %xmm4,%xmm3 3676 3677 paddd %xmm1,%xmm2 3678 pcmpeqd %xmm5,%xmm1 3679 movdqa %xmm0,64(%rax) 3680 movdqa %xmm4,%xmm0 3681 3682 paddd %xmm2,%xmm3 3683 pcmpeqd %xmm5,%xmm2 3684 movdqa %xmm1,80(%rax) 3685 movdqa %xmm4,%xmm1 3686 3687 paddd %xmm3,%xmm0 3688 pcmpeqd %xmm5,%xmm3 3689 movdqa %xmm2,96(%rax) 3690 movdqa %xmm4,%xmm2 3691 movdqa %xmm3,112(%rax) 3692 jmp .Lgather 3693 3694.align 32 3695.Lgather: 3696 pxor %xmm4,%xmm4 3697 pxor %xmm5,%xmm5 3698 movdqa -128(%r11),%xmm0 3699 movdqa -112(%r11),%xmm1 3700 movdqa -96(%r11),%xmm2 3701 pand -128(%rax),%xmm0 3702 movdqa -80(%r11),%xmm3 3703 pand -112(%rax),%xmm1 3704 por %xmm0,%xmm4 3705 pand -96(%rax),%xmm2 3706 por %xmm1,%xmm5 3707 pand -80(%rax),%xmm3 3708 por %xmm2,%xmm4 3709 por %xmm3,%xmm5 3710 movdqa -64(%r11),%xmm0 3711 movdqa -48(%r11),%xmm1 3712 movdqa -32(%r11),%xmm2 3713 pand -64(%rax),%xmm0 3714 movdqa -16(%r11),%xmm3 3715 pand -48(%rax),%xmm1 3716 por %xmm0,%xmm4 3717 pand -32(%rax),%xmm2 3718 por %xmm1,%xmm5 3719 pand -16(%rax),%xmm3 3720 por %xmm2,%xmm4 3721 por %xmm3,%xmm5 3722 movdqa 0(%r11),%xmm0 3723 movdqa 16(%r11),%xmm1 3724 movdqa 32(%r11),%xmm2 3725 pand 
0(%rax),%xmm0 3726 movdqa 48(%r11),%xmm3 3727 pand 16(%rax),%xmm1 3728 por %xmm0,%xmm4 3729 pand 32(%rax),%xmm2 3730 por %xmm1,%xmm5 3731 pand 48(%rax),%xmm3 3732 por %xmm2,%xmm4 3733 por %xmm3,%xmm5 3734 movdqa 64(%r11),%xmm0 3735 movdqa 80(%r11),%xmm1 3736 movdqa 96(%r11),%xmm2 3737 pand 64(%rax),%xmm0 3738 movdqa 112(%r11),%xmm3 3739 pand 80(%rax),%xmm1 3740 por %xmm0,%xmm4 3741 pand 96(%rax),%xmm2 3742 por %xmm1,%xmm5 3743 pand 112(%rax),%xmm3 3744 por %xmm2,%xmm4 3745 por %xmm3,%xmm5 3746 por %xmm5,%xmm4 3747 leaq 256(%r11),%r11 3748 pshufd $0x4e,%xmm4,%xmm0 3749 por %xmm4,%xmm0 3750 movq %xmm0,(%rdi) 3751 leaq 8(%rdi),%rdi 3752 subl $1,%esi 3753 jnz .Lgather 3754 3755 leaq (%r10),%rsp 3756 .byte 0xf3,0xc3 3757.LSEH_end_bn_gather5: 3758.size bn_gather5,.-bn_gather5 3759.align 64 3760.Linc: 3761.long 0,0, 1,1 3762.long 2,2, 2,2 3763.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 3764