x86_64-mont5.S revision 1.9
1#include <machine/asm.h> 2.text 3 4 5 6.globl bn_mul_mont_gather5 7.type bn_mul_mont_gather5,@function 8.align 64 9bn_mul_mont_gather5: 10.cfi_startproc 11 movl %r9d,%r9d 12 movq %rsp,%rax 13.cfi_def_cfa_register %rax 14 testl $7,%r9d 15 jnz .Lmul_enter 16 movl OPENSSL_ia32cap_P+8(%rip),%r11d 17 jmp .Lmul4x_enter 18 19.align 16 20.Lmul_enter: 21 movd 8(%rsp),%xmm5 22 pushq %rbx 23.cfi_offset %rbx,-16 24 pushq %rbp 25.cfi_offset %rbp,-24 26 pushq %r12 27.cfi_offset %r12,-32 28 pushq %r13 29.cfi_offset %r13,-40 30 pushq %r14 31.cfi_offset %r14,-48 32 pushq %r15 33.cfi_offset %r15,-56 34 35 negq %r9 36 movq %rsp,%r11 37 leaq -280(%rsp,%r9,8),%r10 38 negq %r9 39 andq $-1024,%r10 40 41 42 43 44 45 46 47 48 49 subq %r10,%r11 50 andq $-4096,%r11 51 leaq (%r10,%r11,1),%rsp 52 movq (%rsp),%r11 53 cmpq %r10,%rsp 54 ja .Lmul_page_walk 55 jmp .Lmul_page_walk_done 56 57.Lmul_page_walk: 58 leaq -4096(%rsp),%rsp 59 movq (%rsp),%r11 60 cmpq %r10,%rsp 61 ja .Lmul_page_walk 62.Lmul_page_walk_done: 63 64 leaq .Linc(%rip),%r10 65 movq %rax,8(%rsp,%r9,8) 66.cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08 67.Lmul_body: 68 69 leaq 128(%rdx),%r12 70 movdqa 0(%r10),%xmm0 71 movdqa 16(%r10),%xmm1 72 leaq 24-112(%rsp,%r9,8),%r10 73 andq $-16,%r10 74 75 pshufd $0,%xmm5,%xmm5 76 movdqa %xmm1,%xmm4 77 movdqa %xmm1,%xmm2 78 paddd %xmm0,%xmm1 79 pcmpeqd %xmm5,%xmm0 80.byte 0x67 81 movdqa %xmm4,%xmm3 82 paddd %xmm1,%xmm2 83 pcmpeqd %xmm5,%xmm1 84 movdqa %xmm0,112(%r10) 85 movdqa %xmm4,%xmm0 86 87 paddd %xmm2,%xmm3 88 pcmpeqd %xmm5,%xmm2 89 movdqa %xmm1,128(%r10) 90 movdqa %xmm4,%xmm1 91 92 paddd %xmm3,%xmm0 93 pcmpeqd %xmm5,%xmm3 94 movdqa %xmm2,144(%r10) 95 movdqa %xmm4,%xmm2 96 97 paddd %xmm0,%xmm1 98 pcmpeqd %xmm5,%xmm0 99 movdqa %xmm3,160(%r10) 100 movdqa %xmm4,%xmm3 101 paddd %xmm1,%xmm2 102 pcmpeqd %xmm5,%xmm1 103 movdqa %xmm0,176(%r10) 104 movdqa %xmm4,%xmm0 105 106 paddd %xmm2,%xmm3 107 pcmpeqd %xmm5,%xmm2 108 movdqa %xmm1,192(%r10) 109 movdqa %xmm4,%xmm1 110 111 paddd %xmm3,%xmm0 112 pcmpeqd %xmm5,%xmm3 113 movdqa %xmm2,208(%r10) 114 movdqa %xmm4,%xmm2 115 116 paddd %xmm0,%xmm1 117 pcmpeqd %xmm5,%xmm0 118 movdqa %xmm3,224(%r10) 119 movdqa %xmm4,%xmm3 120 paddd %xmm1,%xmm2 121 pcmpeqd %xmm5,%xmm1 122 movdqa %xmm0,240(%r10) 123 movdqa %xmm4,%xmm0 124 125 paddd %xmm2,%xmm3 126 pcmpeqd %xmm5,%xmm2 127 movdqa %xmm1,256(%r10) 128 movdqa %xmm4,%xmm1 129 130 paddd %xmm3,%xmm0 131 pcmpeqd %xmm5,%xmm3 132 movdqa %xmm2,272(%r10) 133 movdqa %xmm4,%xmm2 134 135 paddd %xmm0,%xmm1 136 pcmpeqd %xmm5,%xmm0 137 movdqa %xmm3,288(%r10) 138 movdqa %xmm4,%xmm3 139 paddd %xmm1,%xmm2 140 pcmpeqd %xmm5,%xmm1 141 movdqa %xmm0,304(%r10) 142 143 paddd %xmm2,%xmm3 144.byte 0x67 145 pcmpeqd %xmm5,%xmm2 146 movdqa %xmm1,320(%r10) 147 148 pcmpeqd %xmm5,%xmm3 149 movdqa %xmm2,336(%r10) 150 pand 64(%r12),%xmm0 151 152 pand 80(%r12),%xmm1 153 pand 96(%r12),%xmm2 154 movdqa %xmm3,352(%r10) 155 pand 112(%r12),%xmm3 156 por %xmm2,%xmm0 157 por %xmm3,%xmm1 158 movdqa -128(%r12),%xmm4 159 movdqa -112(%r12),%xmm5 160 movdqa -96(%r12),%xmm2 161 pand 112(%r10),%xmm4 162 movdqa -80(%r12),%xmm3 163 pand 128(%r10),%xmm5 164 por %xmm4,%xmm0 165 pand 144(%r10),%xmm2 166 por %xmm5,%xmm1 167 pand 160(%r10),%xmm3 168 por %xmm2,%xmm0 169 por %xmm3,%xmm1 170 movdqa -64(%r12),%xmm4 171 movdqa -48(%r12),%xmm5 172 movdqa -32(%r12),%xmm2 173 pand 176(%r10),%xmm4 174 movdqa -16(%r12),%xmm3 175 pand 192(%r10),%xmm5 176 por %xmm4,%xmm0 177 pand 208(%r10),%xmm2 178 por %xmm5,%xmm1 179 pand 224(%r10),%xmm3 180 por %xmm2,%xmm0 181 por %xmm3,%xmm1 182 movdqa 0(%r12),%xmm4 183 movdqa 16(%r12),%xmm5 184 movdqa 32(%r12),%xmm2 185 pand 240(%r10),%xmm4 186 movdqa 48(%r12),%xmm3 187 pand 256(%r10),%xmm5 188 por %xmm4,%xmm0 189 pand 272(%r10),%xmm2 190 por %xmm5,%xmm1 191 pand 288(%r10),%xmm3 192 por %xmm2,%xmm0 193 por %xmm3,%xmm1 194 por %xmm1,%xmm0 195 pshufd $0x4e,%xmm0,%xmm1 196 por %xmm1,%xmm0 197 leaq 256(%r12),%r12 198.byte 102,72,15,126,195 199 200 movq (%r8),%r8 201 movq (%rsi),%rax 202 203 xorq %r14,%r14 204 xorq %r15,%r15 205 206 movq %r8,%rbp 207 mulq %rbx 208 movq %rax,%r10 209 movq (%rcx),%rax 210 211 imulq %r10,%rbp 212 movq %rdx,%r11 213 214 mulq %rbp 215 addq %rax,%r10 216 movq 8(%rsi),%rax 217 adcq $0,%rdx 218 movq %rdx,%r13 219 220 leaq 1(%r15),%r15 221 jmp .L1st_enter 222 223.align 16 224.L1st: 225 addq %rax,%r13 226 movq (%rsi,%r15,8),%rax 227 adcq $0,%rdx 228 addq %r11,%r13 229 movq %r10,%r11 230 adcq $0,%rdx 231 movq %r13,-16(%rsp,%r15,8) 232 movq %rdx,%r13 233 234.L1st_enter: 235 mulq %rbx 236 addq %rax,%r11 237 movq (%rcx,%r15,8),%rax 238 adcq $0,%rdx 239 leaq 1(%r15),%r15 240 movq %rdx,%r10 241 242 mulq %rbp 243 cmpq %r9,%r15 244 jne .L1st 245 246 247 addq %rax,%r13 248 adcq $0,%rdx 249 addq %r11,%r13 250 adcq $0,%rdx 251 movq %r13,-16(%rsp,%r9,8) 252 movq %rdx,%r13 253 movq %r10,%r11 254 255 xorq %rdx,%rdx 256 addq %r11,%r13 257 adcq $0,%rdx 258 movq %r13,-8(%rsp,%r9,8) 259 movq %rdx,(%rsp,%r9,8) 260 261 leaq 1(%r14),%r14 262 jmp .Louter 263.align 16 264.Louter: 265 leaq 24+128(%rsp,%r9,8),%rdx 266 andq $-16,%rdx 267 pxor %xmm4,%xmm4 268 pxor %xmm5,%xmm5 269 movdqa -128(%r12),%xmm0 270 movdqa -112(%r12),%xmm1 271 movdqa -96(%r12),%xmm2 272 movdqa -80(%r12),%xmm3 273 pand -128(%rdx),%xmm0 274 pand -112(%rdx),%xmm1 275 por %xmm0,%xmm4 276 pand -96(%rdx),%xmm2 277 por %xmm1,%xmm5 278 pand -80(%rdx),%xmm3 279 por %xmm2,%xmm4 280 por %xmm3,%xmm5 281 movdqa -64(%r12),%xmm0 282 movdqa -48(%r12),%xmm1 283 movdqa -32(%r12),%xmm2 284 movdqa -16(%r12),%xmm3 285 pand -64(%rdx),%xmm0 286 pand -48(%rdx),%xmm1 287 por %xmm0,%xmm4 288 pand -32(%rdx),%xmm2 289 por %xmm1,%xmm5 290 pand -16(%rdx),%xmm3 291 por %xmm2,%xmm4 292 por %xmm3,%xmm5 293 movdqa 0(%r12),%xmm0 294 movdqa 16(%r12),%xmm1 295 movdqa 32(%r12),%xmm2 296 movdqa 48(%r12),%xmm3 297 pand 0(%rdx),%xmm0 298 pand 16(%rdx),%xmm1 299 por %xmm0,%xmm4 300 pand 32(%rdx),%xmm2 301 por %xmm1,%xmm5 302 pand 48(%rdx),%xmm3 303 por %xmm2,%xmm4 304 por %xmm3,%xmm5 305 movdqa 64(%r12),%xmm0 306 movdqa 80(%r12),%xmm1 307 movdqa 96(%r12),%xmm2 308 movdqa 112(%r12),%xmm3 309 pand 64(%rdx),%xmm0 310 pand 80(%rdx),%xmm1 311 por %xmm0,%xmm4 312 pand 96(%rdx),%xmm2 313 por %xmm1,%xmm5 314 pand 112(%rdx),%xmm3 315 por %xmm2,%xmm4 316 por %xmm3,%xmm5 317 por %xmm5,%xmm4 318 pshufd $0x4e,%xmm4,%xmm0 319 por %xmm4,%xmm0 320 leaq 256(%r12),%r12 321 322 movq (%rsi),%rax 323.byte 102,72,15,126,195 324 325 xorq %r15,%r15 326 movq %r8,%rbp 327 movq (%rsp),%r10 328 329 mulq %rbx 330 addq %rax,%r10 331 movq (%rcx),%rax 332 adcq $0,%rdx 333 334 imulq %r10,%rbp 335 movq %rdx,%r11 336 337 mulq %rbp 338 addq %rax,%r10 339 movq 8(%rsi),%rax 340 adcq $0,%rdx 341 movq 8(%rsp),%r10 342 movq %rdx,%r13 343 344 leaq 1(%r15),%r15 345 jmp .Linner_enter 346 347.align 16 348.Linner: 349 addq %rax,%r13 350 movq (%rsi,%r15,8),%rax 351 adcq $0,%rdx 352 addq %r10,%r13 353 movq (%rsp,%r15,8),%r10 354 adcq $0,%rdx 355 movq %r13,-16(%rsp,%r15,8) 356 movq %rdx,%r13 357 358.Linner_enter: 359 mulq %rbx 360 addq %rax,%r11 361 movq (%rcx,%r15,8),%rax 362 adcq $0,%rdx 363 addq %r11,%r10 364 movq %rdx,%r11 365 adcq $0,%r11 366 leaq 1(%r15),%r15 367 368 mulq %rbp 369 cmpq %r9,%r15 370 jne .Linner 371 372 addq %rax,%r13 373 adcq $0,%rdx 374 addq %r10,%r13 375 movq (%rsp,%r9,8),%r10 376 adcq $0,%rdx 377 movq %r13,-16(%rsp,%r9,8) 378 movq %rdx,%r13 379 380 xorq %rdx,%rdx 381 addq %r11,%r13 382 adcq $0,%rdx 383 addq %r10,%r13 384 adcq $0,%rdx 385 movq %r13,-8(%rsp,%r9,8) 386 movq %rdx,(%rsp,%r9,8) 387 388 leaq 1(%r14),%r14 389 cmpq %r9,%r14 390 jb .Louter 391 392 xorq %r14,%r14 393 movq (%rsp),%rax 394 leaq (%rsp),%rsi 395 movq %r9,%r15 396 jmp .Lsub 397.align 16 398.Lsub: sbbq (%rcx,%r14,8),%rax 399 movq %rax,(%rdi,%r14,8) 400 movq 8(%rsi,%r14,8),%rax 401 leaq 1(%r14),%r14 402 decq %r15 403 jnz .Lsub 404 405 sbbq $0,%rax 406 movq $-1,%rbx 407 xorq %rax,%rbx 408 xorq %r14,%r14 409 movq %r9,%r15 410 411.Lcopy: 412 movq (%rdi,%r14,8),%rcx 413 movq (%rsp,%r14,8),%rdx 414 andq %rbx,%rcx 415 andq %rax,%rdx 416 movq %r14,(%rsp,%r14,8) 417 orq %rcx,%rdx 418 movq %rdx,(%rdi,%r14,8) 419 leaq 1(%r14),%r14 420 subq $1,%r15 421 jnz .Lcopy 422 423 movq 8(%rsp,%r9,8),%rsi 424.cfi_def_cfa %rsi,8 425 movq $1,%rax 426 427 movq -48(%rsi),%r15 428.cfi_restore %r15 429 movq -40(%rsi),%r14 430.cfi_restore %r14 431 movq -32(%rsi),%r13 432.cfi_restore %r13 433 movq -24(%rsi),%r12 434.cfi_restore %r12 435 movq -16(%rsi),%rbp 436.cfi_restore %rbp 437 movq -8(%rsi),%rbx 438.cfi_restore %rbx 439 leaq (%rsi),%rsp 440.cfi_def_cfa_register %rsp 441.Lmul_epilogue: 442 .byte 0xf3,0xc3 443.cfi_endproc 444.size bn_mul_mont_gather5,.-bn_mul_mont_gather5 445.type bn_mul4x_mont_gather5,@function 446.align 32 447bn_mul4x_mont_gather5: 448.cfi_startproc 449.byte 0x67 450 movq %rsp,%rax 451.cfi_def_cfa_register %rax 452.Lmul4x_enter: 453 andl $0x80108,%r11d 454 cmpl $0x80108,%r11d 455 je .Lmulx4x_enter 456 pushq %rbx 457.cfi_offset %rbx,-16 458 pushq %rbp 459.cfi_offset %rbp,-24 460 pushq %r12 461.cfi_offset %r12,-32 462 pushq %r13 463.cfi_offset %r13,-40 464 pushq %r14 465.cfi_offset %r14,-48 466 pushq %r15 467.cfi_offset %r15,-56 468.Lmul4x_prologue: 469 470.byte 0x67 471 shll $3,%r9d 472 leaq (%r9,%r9,2),%r10 473 negq %r9 474 475 476 477 478 479 480 481 482 483 484 leaq -320(%rsp,%r9,2),%r11 485 movq %rsp,%rbp 486 subq %rdi,%r11 487 andq $4095,%r11 488 cmpq %r11,%r10 489 jb .Lmul4xsp_alt 490 subq %r11,%rbp 491 leaq -320(%rbp,%r9,2),%rbp 492 jmp .Lmul4xsp_done 493 494.align 32 495.Lmul4xsp_alt: 496 leaq 4096-320(,%r9,2),%r10 497 leaq -320(%rbp,%r9,2),%rbp 498 subq %r10,%r11 499 movq $0,%r10 500 cmovcq %r10,%r11 501 subq %r11,%rbp 502.Lmul4xsp_done: 503 andq $-64,%rbp 504 movq %rsp,%r11 505 subq %rbp,%r11 506 andq $-4096,%r11 507 leaq (%r11,%rbp,1),%rsp 508 movq (%rsp),%r10 509 cmpq %rbp,%rsp 510 ja .Lmul4x_page_walk 511 jmp .Lmul4x_page_walk_done 512 513.Lmul4x_page_walk: 514 leaq -4096(%rsp),%rsp 515 movq (%rsp),%r10 516 cmpq %rbp,%rsp 517 ja .Lmul4x_page_walk 518.Lmul4x_page_walk_done: 519 520 negq %r9 521 522 movq %rax,40(%rsp) 523.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 524.Lmul4x_body: 525 526 call mul4x_internal 527 528 movq 40(%rsp),%rsi 529.cfi_def_cfa %rsi,8 530 movq $1,%rax 531 532 movq -48(%rsi),%r15 533.cfi_restore %r15 534 movq -40(%rsi),%r14 535.cfi_restore %r14 536 movq -32(%rsi),%r13 537.cfi_restore %r13 538 movq -24(%rsi),%r12 539.cfi_restore %r12 540 movq -16(%rsi),%rbp 541.cfi_restore %rbp 542 movq -8(%rsi),%rbx 543.cfi_restore %rbx 544 leaq (%rsi),%rsp 545.cfi_def_cfa_register %rsp 546.Lmul4x_epilogue: 547 .byte 0xf3,0xc3 548.cfi_endproc 549.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 550 551.type mul4x_internal,@function 552.align 32 553mul4x_internal: 554.cfi_startproc 555 shlq $5,%r9 556 movd 8(%rax),%xmm5 557 leaq .Linc(%rip),%rax 558 leaq 128(%rdx,%r9,1),%r13 559 shrq $5,%r9 560 movdqa 0(%rax),%xmm0 561 movdqa 16(%rax),%xmm1 562 leaq 88-112(%rsp,%r9,1),%r10 563 leaq 128(%rdx),%r12 564 565 pshufd $0,%xmm5,%xmm5 566 movdqa %xmm1,%xmm4 567.byte 0x67,0x67 568 movdqa %xmm1,%xmm2 569 paddd %xmm0,%xmm1 570 pcmpeqd %xmm5,%xmm0 571.byte 0x67 572 movdqa %xmm4,%xmm3 573 paddd %xmm1,%xmm2 574 pcmpeqd %xmm5,%xmm1 575 movdqa %xmm0,112(%r10) 576 movdqa %xmm4,%xmm0 577 578 paddd %xmm2,%xmm3 579 pcmpeqd %xmm5,%xmm2 580 movdqa %xmm1,128(%r10) 581 movdqa %xmm4,%xmm1 582 583 paddd %xmm3,%xmm0 584 pcmpeqd %xmm5,%xmm3 585 movdqa %xmm2,144(%r10) 586 movdqa %xmm4,%xmm2 587 588 paddd %xmm0,%xmm1 589 pcmpeqd %xmm5,%xmm0 590 movdqa %xmm3,160(%r10) 591 movdqa %xmm4,%xmm3 592 paddd %xmm1,%xmm2 593 pcmpeqd %xmm5,%xmm1 594 movdqa %xmm0,176(%r10) 595 movdqa %xmm4,%xmm0 596 597 paddd %xmm2,%xmm3 598 pcmpeqd %xmm5,%xmm2 599 movdqa %xmm1,192(%r10) 600 movdqa %xmm4,%xmm1 601 602 paddd %xmm3,%xmm0 603 pcmpeqd %xmm5,%xmm3 604 movdqa %xmm2,208(%r10) 605 movdqa %xmm4,%xmm2 606 607 paddd %xmm0,%xmm1 608 pcmpeqd %xmm5,%xmm0 609 movdqa %xmm3,224(%r10) 610 movdqa %xmm4,%xmm3 611 paddd %xmm1,%xmm2 612 pcmpeqd %xmm5,%xmm1 613 movdqa %xmm0,240(%r10) 614 movdqa %xmm4,%xmm0 615 616 paddd %xmm2,%xmm3 617 pcmpeqd %xmm5,%xmm2 618 movdqa %xmm1,256(%r10) 619 movdqa %xmm4,%xmm1 620 621 paddd %xmm3,%xmm0 622 pcmpeqd %xmm5,%xmm3 623 movdqa %xmm2,272(%r10) 624 movdqa %xmm4,%xmm2 625 626 paddd %xmm0,%xmm1 627 pcmpeqd %xmm5,%xmm0 628 movdqa %xmm3,288(%r10) 629 movdqa %xmm4,%xmm3 630 paddd %xmm1,%xmm2 631 pcmpeqd %xmm5,%xmm1 632 movdqa %xmm0,304(%r10) 633 634 paddd %xmm2,%xmm3 635.byte 0x67 636 pcmpeqd %xmm5,%xmm2 637 movdqa %xmm1,320(%r10) 638 639 pcmpeqd %xmm5,%xmm3 640 movdqa %xmm2,336(%r10) 641 pand 64(%r12),%xmm0 642 643 pand 80(%r12),%xmm1 644 pand 96(%r12),%xmm2 645 movdqa %xmm3,352(%r10) 646 pand 112(%r12),%xmm3 647 por %xmm2,%xmm0 648 por %xmm3,%xmm1 649 movdqa -128(%r12),%xmm4 650 movdqa -112(%r12),%xmm5 651 movdqa -96(%r12),%xmm2 652 pand 112(%r10),%xmm4 653 movdqa -80(%r12),%xmm3 654 pand 128(%r10),%xmm5 655 por %xmm4,%xmm0 656 pand 144(%r10),%xmm2 657 por %xmm5,%xmm1 658 pand 160(%r10),%xmm3 659 por %xmm2,%xmm0 660 por %xmm3,%xmm1 661 movdqa -64(%r12),%xmm4 662 movdqa -48(%r12),%xmm5 663 movdqa -32(%r12),%xmm2 664 pand 176(%r10),%xmm4 665 movdqa -16(%r12),%xmm3 666 pand 192(%r10),%xmm5 667 por %xmm4,%xmm0 668 pand 208(%r10),%xmm2 669 por %xmm5,%xmm1 670 pand 224(%r10),%xmm3 671 por %xmm2,%xmm0 672 por %xmm3,%xmm1 673 movdqa 0(%r12),%xmm4 674 movdqa 16(%r12),%xmm5 675 movdqa 32(%r12),%xmm2 676 pand 240(%r10),%xmm4 677 movdqa 48(%r12),%xmm3 678 pand 256(%r10),%xmm5 679 por %xmm4,%xmm0 680 pand 272(%r10),%xmm2 681 por %xmm5,%xmm1 682 pand 288(%r10),%xmm3 683 por %xmm2,%xmm0 684 por %xmm3,%xmm1 685 por %xmm1,%xmm0 686 pshufd $0x4e,%xmm0,%xmm1 687 por %xmm1,%xmm0 688 leaq 256(%r12),%r12 689.byte 102,72,15,126,195 690 691 movq %r13,16+8(%rsp) 692 movq %rdi,56+8(%rsp) 693 694 movq (%r8),%r8 695 movq (%rsi),%rax 696 leaq (%rsi,%r9,1),%rsi 697 negq %r9 698 699 movq %r8,%rbp 700 mulq %rbx 701 movq %rax,%r10 702 movq (%rcx),%rax 703 704 imulq %r10,%rbp 705 leaq 64+8(%rsp),%r14 706 movq %rdx,%r11 707 708 mulq %rbp 709 addq %rax,%r10 710 movq 8(%rsi,%r9,1),%rax 711 adcq $0,%rdx 712 movq %rdx,%rdi 713 714 mulq %rbx 715 addq %rax,%r11 716 movq 8(%rcx),%rax 717 adcq $0,%rdx 718 movq %rdx,%r10 719 720 mulq %rbp 721 addq %rax,%rdi 722 movq 16(%rsi,%r9,1),%rax 723 adcq $0,%rdx 724 addq %r11,%rdi 725 leaq 32(%r9),%r15 726 leaq 32(%rcx),%rcx 727 adcq $0,%rdx 728 movq %rdi,(%r14) 729 movq %rdx,%r13 730 jmp .L1st4x 731 732.align 32 733.L1st4x: 734 mulq %rbx 735 addq %rax,%r10 736 movq -16(%rcx),%rax 737 leaq 32(%r14),%r14 738 adcq $0,%rdx 739 movq %rdx,%r11 740 741 mulq %rbp 742 addq %rax,%r13 743 movq -8(%rsi,%r15,1),%rax 744 adcq $0,%rdx 745 addq %r10,%r13 746 adcq $0,%rdx 747 movq %r13,-24(%r14) 748 movq %rdx,%rdi 749 750 mulq %rbx 751 addq %rax,%r11 752 movq -8(%rcx),%rax 753 adcq $0,%rdx 754 movq %rdx,%r10 755 756 mulq %rbp 757 addq %rax,%rdi 758 movq (%rsi,%r15,1),%rax 759 adcq $0,%rdx 760 addq %r11,%rdi 761 adcq $0,%rdx 762 movq %rdi,-16(%r14) 763 movq %rdx,%r13 764 765 mulq %rbx 766 addq %rax,%r10 767 movq 0(%rcx),%rax 768 adcq $0,%rdx 769 movq %rdx,%r11 770 771 mulq %rbp 772 addq %rax,%r13 773 movq 8(%rsi,%r15,1),%rax 774 adcq $0,%rdx 775 addq %r10,%r13 776 adcq $0,%rdx 777 movq %r13,-8(%r14) 778 movq %rdx,%rdi 779 780 mulq %rbx 781 addq %rax,%r11 782 movq 8(%rcx),%rax 783 adcq $0,%rdx 784 movq %rdx,%r10 785 786 mulq %rbp 787 addq %rax,%rdi 788 movq 16(%rsi,%r15,1),%rax 789 adcq $0,%rdx 790 addq %r11,%rdi 791 leaq 32(%rcx),%rcx 792 adcq $0,%rdx 793 movq %rdi,(%r14) 794 movq %rdx,%r13 795 796 addq $32,%r15 797 jnz .L1st4x 798 799 mulq %rbx 800 addq %rax,%r10 801 movq -16(%rcx),%rax 802 leaq 32(%r14),%r14 803 adcq $0,%rdx 804 movq %rdx,%r11 805 806 mulq %rbp 807 addq %rax,%r13 808 movq -8(%rsi),%rax 809 adcq $0,%rdx 810 addq %r10,%r13 811 adcq $0,%rdx 812 movq %r13,-24(%r14) 813 movq %rdx,%rdi 814 815 mulq %rbx 816 addq %rax,%r11 817 movq -8(%rcx),%rax 818 adcq $0,%rdx 819 movq %rdx,%r10 820 821 mulq %rbp 822 addq %rax,%rdi 823 movq (%rsi,%r9,1),%rax 824 adcq $0,%rdx 825 addq %r11,%rdi 826 adcq $0,%rdx 827 movq %rdi,-16(%r14) 828 movq %rdx,%r13 829 830 leaq (%rcx,%r9,1),%rcx 831 832 xorq %rdi,%rdi 833 addq %r10,%r13 834 adcq $0,%rdi 835 movq %r13,-8(%r14) 836 837 jmp .Louter4x 838 839.align 32 840.Louter4x: 841 leaq 16+128(%r14),%rdx 842 pxor %xmm4,%xmm4 843 pxor %xmm5,%xmm5 844 movdqa -128(%r12),%xmm0 845 movdqa -112(%r12),%xmm1 846 movdqa -96(%r12),%xmm2 847 movdqa -80(%r12),%xmm3 848 pand -128(%rdx),%xmm0 849 pand -112(%rdx),%xmm1 850 por %xmm0,%xmm4 851 pand -96(%rdx),%xmm2 852 por %xmm1,%xmm5 853 pand -80(%rdx),%xmm3 854 por %xmm2,%xmm4 855 por %xmm3,%xmm5 856 movdqa -64(%r12),%xmm0 857 movdqa -48(%r12),%xmm1 858 movdqa -32(%r12),%xmm2 859 movdqa -16(%r12),%xmm3 860 pand -64(%rdx),%xmm0 861 pand -48(%rdx),%xmm1 862 por %xmm0,%xmm4 863 pand -32(%rdx),%xmm2 864 por %xmm1,%xmm5 865 pand -16(%rdx),%xmm3 866 por %xmm2,%xmm4 867 por %xmm3,%xmm5 868 movdqa 0(%r12),%xmm0 869 movdqa 16(%r12),%xmm1 870 movdqa 32(%r12),%xmm2 871 movdqa 48(%r12),%xmm3 872 pand 0(%rdx),%xmm0 873 pand 16(%rdx),%xmm1 874 por %xmm0,%xmm4 875 pand 32(%rdx),%xmm2 876 por %xmm1,%xmm5 877 pand 48(%rdx),%xmm3 878 por %xmm2,%xmm4 879 por %xmm3,%xmm5 880 movdqa 64(%r12),%xmm0 881 movdqa 80(%r12),%xmm1 882 movdqa 96(%r12),%xmm2 883 movdqa 112(%r12),%xmm3 884 pand 64(%rdx),%xmm0 885 pand 80(%rdx),%xmm1 886 por %xmm0,%xmm4 887 pand 96(%rdx),%xmm2 888 por %xmm1,%xmm5 889 pand 112(%rdx),%xmm3 890 por %xmm2,%xmm4 891 por %xmm3,%xmm5 892 por %xmm5,%xmm4 893 pshufd $0x4e,%xmm4,%xmm0 894 por %xmm4,%xmm0 895 leaq 256(%r12),%r12 896.byte 102,72,15,126,195 897 898 movq (%r14,%r9,1),%r10 899 movq %r8,%rbp 900 mulq %rbx 901 addq %rax,%r10 902 movq (%rcx),%rax 903 adcq $0,%rdx 904 905 imulq %r10,%rbp 906 movq %rdx,%r11 907 movq %rdi,(%r14) 908 909 leaq (%r14,%r9,1),%r14 910 911 mulq %rbp 912 addq %rax,%r10 913 movq 8(%rsi,%r9,1),%rax 914 adcq $0,%rdx 915 movq %rdx,%rdi 916 917 mulq %rbx 918 addq %rax,%r11 919 movq 8(%rcx),%rax 920 adcq $0,%rdx 921 addq 8(%r14),%r11 922 adcq $0,%rdx 923 movq %rdx,%r10 924 925 mulq %rbp 926 addq %rax,%rdi 927 movq 16(%rsi,%r9,1),%rax 928 adcq $0,%rdx 929 addq %r11,%rdi 930 leaq 32(%r9),%r15 931 leaq 32(%rcx),%rcx 932 adcq $0,%rdx 933 movq %rdx,%r13 934 jmp .Linner4x 935 936.align 32 937.Linner4x: 938 mulq %rbx 939 addq %rax,%r10 940 movq -16(%rcx),%rax 941 adcq $0,%rdx 942 addq 16(%r14),%r10 943 leaq 32(%r14),%r14 944 adcq $0,%rdx 945 movq %rdx,%r11 946 947 mulq %rbp 948 addq %rax,%r13 949 movq -8(%rsi,%r15,1),%rax 950 adcq $0,%rdx 951 addq %r10,%r13 952 adcq $0,%rdx 953 movq %rdi,-32(%r14) 954 movq %rdx,%rdi 955 956 mulq %rbx 957 addq %rax,%r11 958 movq -8(%rcx),%rax 959 adcq $0,%rdx 960 addq -8(%r14),%r11 961 adcq $0,%rdx 962 movq %rdx,%r10 963 964 mulq %rbp 965 addq %rax,%rdi 966 movq (%rsi,%r15,1),%rax 967 adcq $0,%rdx 968 addq %r11,%rdi 969 adcq $0,%rdx 970 movq %r13,-24(%r14) 971 movq %rdx,%r13 972 973 mulq %rbx 974 addq %rax,%r10 975 movq 0(%rcx),%rax 976 adcq $0,%rdx 977 addq (%r14),%r10 978 adcq $0,%rdx 979 movq %rdx,%r11 980 981 mulq %rbp 982 addq %rax,%r13 983 movq 8(%rsi,%r15,1),%rax 984 adcq $0,%rdx 985 addq %r10,%r13 986 adcq $0,%rdx 987 movq %rdi,-16(%r14) 988 movq %rdx,%rdi 989 990 mulq %rbx 991 addq %rax,%r11 992 movq 8(%rcx),%rax 993 adcq $0,%rdx 994 addq 8(%r14),%r11 995 adcq $0,%rdx 996 movq %rdx,%r10 997 998 mulq %rbp 999 addq %rax,%rdi 1000 movq 16(%rsi,%r15,1),%rax 1001 adcq $0,%rdx 1002 addq %r11,%rdi 1003 leaq 32(%rcx),%rcx 1004 adcq $0,%rdx 1005 movq %r13,-8(%r14) 1006 movq %rdx,%r13 1007 1008 addq $32,%r15 1009 jnz .Linner4x 1010 1011 mulq %rbx 1012 addq %rax,%r10 1013 movq -16(%rcx),%rax 1014 adcq $0,%rdx 1015 addq 16(%r14),%r10 1016 leaq 32(%r14),%r14 1017 adcq $0,%rdx 1018 movq %rdx,%r11 1019 1020 mulq %rbp 1021 addq %rax,%r13 1022 movq -8(%rsi),%rax 1023 adcq $0,%rdx 1024 addq %r10,%r13 1025 adcq $0,%rdx 1026 movq %rdi,-32(%r14) 1027 movq %rdx,%rdi 1028 1029 mulq %rbx 1030 addq %rax,%r11 1031 movq %rbp,%rax 1032 movq -8(%rcx),%rbp 1033 adcq $0,%rdx 1034 addq -8(%r14),%r11 1035 adcq $0,%rdx 1036 movq %rdx,%r10 1037 1038 mulq %rbp 1039 addq %rax,%rdi 1040 movq (%rsi,%r9,1),%rax 1041 adcq $0,%rdx 1042 addq %r11,%rdi 1043 adcq $0,%rdx 1044 movq %r13,-24(%r14) 1045 movq %rdx,%r13 1046 1047 movq %rdi,-16(%r14) 1048 leaq (%rcx,%r9,1),%rcx 1049 1050 xorq %rdi,%rdi 1051 addq %r10,%r13 1052 adcq $0,%rdi 1053 addq (%r14),%r13 1054 adcq $0,%rdi 1055 movq %r13,-8(%r14) 1056 1057 cmpq 16+8(%rsp),%r12 1058 jb .Louter4x 1059 xorq %rax,%rax 1060 subq %r13,%rbp 1061 adcq %r15,%r15 1062 orq %r15,%rdi 1063 subq %rdi,%rax 1064 leaq (%r14,%r9,1),%rbx 1065 movq (%rcx),%r12 1066 leaq (%rcx),%rbp 1067 movq %r9,%rcx 1068 sarq $3+2,%rcx 1069 movq 56+8(%rsp),%rdi 1070 decq %r12 1071 xorq %r10,%r10 1072 movq 8(%rbp),%r13 1073 movq 16(%rbp),%r14 1074 movq 24(%rbp),%r15 1075 jmp .Lsqr4x_sub_entry 1076.cfi_endproc 1077.size mul4x_internal,.-mul4x_internal 1078.globl bn_power5 1079.type bn_power5,@function 1080.align 32 1081bn_power5: 1082.cfi_startproc 1083 movq %rsp,%rax 1084.cfi_def_cfa_register %rax 1085 movl OPENSSL_ia32cap_P+8(%rip),%r11d 1086 andl $0x80108,%r11d 1087 cmpl $0x80108,%r11d 1088 je .Lpowerx5_enter 1089 pushq %rbx 1090.cfi_offset %rbx,-16 1091 pushq %rbp 1092.cfi_offset %rbp,-24 1093 pushq %r12 1094.cfi_offset %r12,-32 1095 pushq %r13 1096.cfi_offset %r13,-40 1097 pushq %r14 1098.cfi_offset %r14,-48 1099 pushq %r15 1100.cfi_offset %r15,-56 1101.Lpower5_prologue: 1102 1103 shll $3,%r9d 1104 leal (%r9,%r9,2),%r10d 1105 negq %r9 1106 movq (%r8),%r8 1107 1108 1109 1110 1111 1112 1113 1114 1115 leaq -320(%rsp,%r9,2),%r11 1116 movq %rsp,%rbp 1117 subq %rdi,%r11 1118 andq $4095,%r11 1119 cmpq %r11,%r10 1120 jb .Lpwr_sp_alt 1121 subq %r11,%rbp 1122 leaq -320(%rbp,%r9,2),%rbp 1123 jmp .Lpwr_sp_done 1124 1125.align 32 1126.Lpwr_sp_alt: 1127 leaq 4096-320(,%r9,2),%r10 1128 leaq -320(%rbp,%r9,2),%rbp 1129 subq %r10,%r11 1130 movq $0,%r10 1131 cmovcq %r10,%r11 1132 subq %r11,%rbp 1133.Lpwr_sp_done: 1134 andq $-64,%rbp 1135 movq %rsp,%r11 1136 subq %rbp,%r11 1137 andq $-4096,%r11 1138 leaq (%r11,%rbp,1),%rsp 1139 movq (%rsp),%r10 1140 cmpq %rbp,%rsp 1141 ja .Lpwr_page_walk 1142 jmp .Lpwr_page_walk_done 1143 1144.Lpwr_page_walk: 1145 leaq -4096(%rsp),%rsp 1146 movq (%rsp),%r10 1147 cmpq %rbp,%rsp 1148 ja .Lpwr_page_walk 1149.Lpwr_page_walk_done: 1150 1151 movq %r9,%r10 1152 negq %r9 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 movq %r8,32(%rsp) 1164 movq %rax,40(%rsp) 1165.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 1166.Lpower5_body: 1167.byte 102,72,15,110,207 1168.byte 102,72,15,110,209 1169.byte 102,73,15,110,218 1170.byte 102,72,15,110,226 1171 1172 call __bn_sqr8x_internal 1173 call __bn_post4x_internal 1174 call __bn_sqr8x_internal 1175 call __bn_post4x_internal 1176 call __bn_sqr8x_internal 1177 call __bn_post4x_internal 1178 call __bn_sqr8x_internal 1179 call __bn_post4x_internal 1180 call __bn_sqr8x_internal 1181 call __bn_post4x_internal 1182 1183.byte 102,72,15,126,209 1184.byte 102,72,15,126,226 1185 movq %rsi,%rdi 1186 movq 40(%rsp),%rax 1187 leaq 32(%rsp),%r8 1188 1189 call mul4x_internal 1190 1191 movq 40(%rsp),%rsi 1192.cfi_def_cfa %rsi,8 1193 movq $1,%rax 1194 movq -48(%rsi),%r15 1195.cfi_restore %r15 1196 movq -40(%rsi),%r14 1197.cfi_restore %r14 1198 movq -32(%rsi),%r13 1199.cfi_restore %r13 1200 movq -24(%rsi),%r12 1201.cfi_restore %r12 1202 movq -16(%rsi),%rbp 1203.cfi_restore %rbp 1204 movq -8(%rsi),%rbx 1205.cfi_restore %rbx 1206 leaq (%rsi),%rsp 1207.cfi_def_cfa_register %rsp 1208.Lpower5_epilogue: 1209 .byte 0xf3,0xc3 1210.cfi_endproc 1211.size bn_power5,.-bn_power5 1212 1213.globl bn_sqr8x_internal 1214.hidden bn_sqr8x_internal 1215.type bn_sqr8x_internal,@function 1216.align 32 1217bn_sqr8x_internal: 1218__bn_sqr8x_internal: 1219.cfi_startproc 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 leaq 32(%r10),%rbp 1294 leaq (%rsi,%r9,1),%rsi 1295 1296 movq %r9,%rcx 1297 1298 1299 movq -32(%rsi,%rbp,1),%r14 1300 leaq 48+8(%rsp,%r9,2),%rdi 1301 movq -24(%rsi,%rbp,1),%rax 1302 leaq -32(%rdi,%rbp,1),%rdi 1303 movq -16(%rsi,%rbp,1),%rbx 1304 movq %rax,%r15 1305 1306 mulq %r14 1307 movq %rax,%r10 1308 movq %rbx,%rax 1309 movq %rdx,%r11 1310 movq %r10,-24(%rdi,%rbp,1) 1311 1312 mulq %r14 1313 addq %rax,%r11 1314 movq %rbx,%rax 1315 adcq $0,%rdx 1316 movq %r11,-16(%rdi,%rbp,1) 1317 movq %rdx,%r10 1318 1319 1320 movq -8(%rsi,%rbp,1),%rbx 1321 mulq %r15 1322 movq %rax,%r12 1323 movq %rbx,%rax 1324 movq %rdx,%r13 1325 1326 leaq (%rbp),%rcx 1327 mulq %r14 1328 addq %rax,%r10 1329 movq %rbx,%rax 1330 movq %rdx,%r11 1331 adcq $0,%r11 1332 addq %r12,%r10 1333 adcq $0,%r11 1334 movq %r10,-8(%rdi,%rcx,1) 1335 jmp .Lsqr4x_1st 1336 1337.align 32 1338.Lsqr4x_1st: 1339 movq (%rsi,%rcx,1),%rbx 1340 mulq %r15 1341 addq %rax,%r13 1342 movq %rbx,%rax 1343 movq %rdx,%r12 1344 adcq $0,%r12 1345 1346 mulq %r14 1347 addq %rax,%r11 1348 movq %rbx,%rax 1349 movq 8(%rsi,%rcx,1),%rbx 1350 movq %rdx,%r10 1351 adcq $0,%r10 1352 addq %r13,%r11 1353 adcq $0,%r10 1354 1355 1356 mulq %r15 1357 addq %rax,%r12 1358 movq %rbx,%rax 1359 movq %r11,(%rdi,%rcx,1) 1360 movq %rdx,%r13 1361 adcq $0,%r13 1362 1363 mulq %r14 1364 addq %rax,%r10 1365 movq %rbx,%rax 1366 movq 16(%rsi,%rcx,1),%rbx 1367 movq %rdx,%r11 1368 adcq $0,%r11 1369 addq %r12,%r10 1370 adcq $0,%r11 1371 1372 mulq %r15 1373 addq %rax,%r13 1374 movq %rbx,%rax 1375 movq %r10,8(%rdi,%rcx,1) 1376 movq %rdx,%r12 1377 adcq $0,%r12 1378 1379 mulq %r14 1380 addq %rax,%r11 1381 movq %rbx,%rax 1382 movq 24(%rsi,%rcx,1),%rbx 1383 movq %rdx,%r10 1384 adcq $0,%r10 1385 addq %r13,%r11 1386 adcq $0,%r10 1387 1388 1389 mulq %r15 1390 addq %rax,%r12 1391 movq %rbx,%rax 1392 movq %r11,16(%rdi,%rcx,1) 1393 movq %rdx,%r13 1394 adcq $0,%r13 1395 leaq 32(%rcx),%rcx 1396 1397 mulq %r14 1398 addq %rax,%r10 1399 movq %rbx,%rax 1400 movq %rdx,%r11 1401 adcq $0,%r11 1402 addq %r12,%r10 1403 adcq $0,%r11 1404 movq %r10,-8(%rdi,%rcx,1) 1405 1406 cmpq $0,%rcx 1407 jne .Lsqr4x_1st 1408 1409 mulq %r15 1410 addq %rax,%r13 1411 leaq 16(%rbp),%rbp 1412 adcq $0,%rdx 1413 addq %r11,%r13 1414 adcq $0,%rdx 1415 1416 movq %r13,(%rdi) 1417 movq %rdx,%r12 1418 movq %rdx,8(%rdi) 1419 jmp .Lsqr4x_outer 1420 1421.align 32 1422.Lsqr4x_outer: 1423 movq -32(%rsi,%rbp,1),%r14 1424 leaq 48+8(%rsp,%r9,2),%rdi 1425 movq -24(%rsi,%rbp,1),%rax 1426 leaq -32(%rdi,%rbp,1),%rdi 1427 movq -16(%rsi,%rbp,1),%rbx 1428 movq %rax,%r15 1429 1430 mulq %r14 1431 movq -24(%rdi,%rbp,1),%r10 1432 addq %rax,%r10 1433 movq %rbx,%rax 1434 adcq $0,%rdx 1435 movq %r10,-24(%rdi,%rbp,1) 1436 movq %rdx,%r11 1437 1438 mulq %r14 1439 addq %rax,%r11 1440 movq %rbx,%rax 1441 adcq $0,%rdx 1442 addq -16(%rdi,%rbp,1),%r11 1443 movq %rdx,%r10 1444 adcq $0,%r10 1445 movq %r11,-16(%rdi,%rbp,1) 1446 1447 xorq %r12,%r12 1448 1449 movq -8(%rsi,%rbp,1),%rbx 1450 mulq %r15 1451 addq %rax,%r12 1452 movq %rbx,%rax 1453 adcq $0,%rdx 1454 addq -8(%rdi,%rbp,1),%r12 1455 movq %rdx,%r13 1456 adcq $0,%r13 1457 1458 mulq %r14 1459 addq %rax,%r10 1460 movq %rbx,%rax 1461 adcq $0,%rdx 1462 addq %r12,%r10 1463 movq %rdx,%r11 1464 adcq $0,%r11 1465 movq %r10,-8(%rdi,%rbp,1) 1466 1467 leaq (%rbp),%rcx 1468 jmp .Lsqr4x_inner 1469 1470.align 32 1471.Lsqr4x_inner: 1472 movq (%rsi,%rcx,1),%rbx 1473 mulq %r15 1474 addq %rax,%r13 1475 movq %rbx,%rax 1476 movq %rdx,%r12 1477 adcq $0,%r12 1478 addq (%rdi,%rcx,1),%r13 1479 adcq $0,%r12 1480 1481.byte 0x67 1482 mulq %r14 1483 addq %rax,%r11 1484 movq %rbx,%rax 1485 movq 8(%rsi,%rcx,1),%rbx 1486 movq %rdx,%r10 1487 adcq $0,%r10 1488 addq %r13,%r11 1489 adcq $0,%r10 1490 1491 mulq %r15 1492 addq %rax,%r12 1493 movq %r11,(%rdi,%rcx,1) 1494 movq %rbx,%rax 1495 movq %rdx,%r13 1496 adcq $0,%r13 1497 addq 8(%rdi,%rcx,1),%r12 1498 leaq 16(%rcx),%rcx 1499 adcq $0,%r13 1500 1501 mulq %r14 1502 addq %rax,%r10 1503 movq %rbx,%rax 1504 adcq $0,%rdx 1505 addq %r12,%r10 1506 movq %rdx,%r11 1507 adcq $0,%r11 1508 movq %r10,-8(%rdi,%rcx,1) 1509 1510 cmpq $0,%rcx 1511 jne .Lsqr4x_inner 1512 1513.byte 0x67 1514 mulq %r15 1515 addq %rax,%r13 1516 adcq $0,%rdx 1517 addq %r11,%r13 1518 adcq $0,%rdx 1519 1520 movq %r13,(%rdi) 1521 movq %rdx,%r12 1522 movq %rdx,8(%rdi) 1523 1524 addq $16,%rbp 1525 jnz .Lsqr4x_outer 1526 1527 1528 movq -32(%rsi),%r14 1529 leaq 48+8(%rsp,%r9,2),%rdi 1530 movq -24(%rsi),%rax 1531 leaq -32(%rdi,%rbp,1),%rdi 1532 movq -16(%rsi),%rbx 1533 movq %rax,%r15 1534 1535 mulq %r14 1536 addq %rax,%r10 1537 movq %rbx,%rax 1538 movq %rdx,%r11 1539 adcq $0,%r11 1540 1541 mulq %r14 1542 addq %rax,%r11 1543 movq %rbx,%rax 1544 movq %r10,-24(%rdi) 1545 movq %rdx,%r10 1546 adcq $0,%r10 1547 addq %r13,%r11 1548 movq -8(%rsi),%rbx 1549 adcq $0,%r10 1550 1551 mulq %r15 1552 addq %rax,%r12 1553 movq %rbx,%rax 1554 movq %r11,-16(%rdi) 1555 movq %rdx,%r13 1556 adcq $0,%r13 1557 1558 mulq %r14 1559 addq %rax,%r10 1560 movq %rbx,%rax 1561 movq %rdx,%r11 1562 adcq $0,%r11 1563 addq %r12,%r10 1564 adcq $0,%r11 1565 movq %r10,-8(%rdi) 1566 1567 mulq %r15 1568 addq %rax,%r13 1569 movq -16(%rsi),%rax 1570 adcq $0,%rdx 1571 addq %r11,%r13 1572 adcq $0,%rdx 1573 1574 movq %r13,(%rdi) 1575 movq %rdx,%r12 1576 movq %rdx,8(%rdi) 1577 1578 mulq %rbx 1579 addq $16,%rbp 1580 xorq %r14,%r14 1581 subq %r9,%rbp 1582 xorq %r15,%r15 1583 1584 addq %r12,%rax 1585 adcq $0,%rdx 1586 movq %rax,8(%rdi) 1587 movq %rdx,16(%rdi) 1588 movq %r15,24(%rdi) 1589 1590 movq -16(%rsi,%rbp,1),%rax 1591 leaq 48+8(%rsp),%rdi 1592 xorq %r10,%r10 1593 movq 8(%rdi),%r11 1594 1595 leaq (%r14,%r10,2),%r12 1596 shrq $63,%r10 1597 leaq (%rcx,%r11,2),%r13 1598 shrq $63,%r11 1599 orq %r10,%r13 1600 movq 16(%rdi),%r10 1601 movq %r11,%r14 1602 mulq %rax 1603 negq %r15 1604 movq 24(%rdi),%r11 1605 adcq %rax,%r12 1606 movq -8(%rsi,%rbp,1),%rax 1607 movq %r12,(%rdi) 1608 adcq %rdx,%r13 1609 1610 leaq (%r14,%r10,2),%rbx 1611 movq %r13,8(%rdi) 1612 sbbq %r15,%r15 1613 shrq $63,%r10 1614 leaq (%rcx,%r11,2),%r8 1615 shrq $63,%r11 1616 orq %r10,%r8 1617 movq 32(%rdi),%r10 1618 movq %r11,%r14 1619 mulq %rax 1620 negq %r15 1621 movq 40(%rdi),%r11 1622 adcq %rax,%rbx 1623 movq 0(%rsi,%rbp,1),%rax 1624 movq %rbx,16(%rdi) 1625 adcq %rdx,%r8 1626 leaq 16(%rbp),%rbp 1627 movq %r8,24(%rdi) 1628 sbbq %r15,%r15 1629 leaq 64(%rdi),%rdi 1630 jmp .Lsqr4x_shift_n_add 1631 1632.align 32 1633.Lsqr4x_shift_n_add: 1634 leaq (%r14,%r10,2),%r12 1635 shrq $63,%r10 1636 leaq (%rcx,%r11,2),%r13 1637 shrq $63,%r11 1638 orq %r10,%r13 1639 movq -16(%rdi),%r10 1640 movq %r11,%r14 1641 mulq %rax 1642 negq %r15 1643 movq -8(%rdi),%r11 1644 adcq %rax,%r12 1645 movq -8(%rsi,%rbp,1),%rax 1646 movq %r12,-32(%rdi) 1647 adcq %rdx,%r13 1648 1649 leaq (%r14,%r10,2),%rbx 1650 movq %r13,-24(%rdi) 1651 sbbq %r15,%r15 1652 shrq $63,%r10 1653 leaq (%rcx,%r11,2),%r8 1654 shrq $63,%r11 1655 orq %r10,%r8 1656 movq 0(%rdi),%r10 1657 movq %r11,%r14 1658 mulq %rax 1659 negq %r15 1660 movq 8(%rdi),%r11 1661 adcq %rax,%rbx 1662 movq 0(%rsi,%rbp,1),%rax 1663 movq %rbx,-16(%rdi) 1664 adcq %rdx,%r8 1665 1666 leaq (%r14,%r10,2),%r12 1667 movq %r8,-8(%rdi) 1668 sbbq %r15,%r15 1669 shrq $63,%r10 1670 leaq (%rcx,%r11,2),%r13 1671 shrq $63,%r11 1672 orq %r10,%r13 1673 movq 16(%rdi),%r10 1674 movq %r11,%r14 1675 mulq %rax 1676 negq %r15 1677 movq 24(%rdi),%r11 1678 adcq %rax,%r12 1679 movq 8(%rsi,%rbp,1),%rax 1680 movq %r12,0(%rdi) 1681 adcq %rdx,%r13 1682 1683 leaq (%r14,%r10,2),%rbx 1684 movq %r13,8(%rdi) 1685 sbbq %r15,%r15 1686 shrq $63,%r10 1687 leaq (%rcx,%r11,2),%r8 1688 shrq $63,%r11 1689 orq %r10,%r8 1690 movq 32(%rdi),%r10 1691 movq %r11,%r14 1692 mulq %rax 1693 negq %r15 1694 movq 40(%rdi),%r11 1695 adcq %rax,%rbx 1696 movq 16(%rsi,%rbp,1),%rax 1697 movq %rbx,16(%rdi) 1698 adcq %rdx,%r8 1699 movq %r8,24(%rdi) 1700 sbbq %r15,%r15 1701 leaq 64(%rdi),%rdi 1702 addq $32,%rbp 1703 jnz .Lsqr4x_shift_n_add 1704 1705 leaq (%r14,%r10,2),%r12 1706.byte 0x67 1707 shrq $63,%r10 1708 leaq (%rcx,%r11,2),%r13 1709 shrq $63,%r11 1710 orq %r10,%r13 1711 movq -16(%rdi),%r10 1712 movq %r11,%r14 1713 mulq %rax 1714 negq %r15 1715 movq -8(%rdi),%r11 1716 adcq %rax,%r12 1717 movq -8(%rsi),%rax 1718 movq %r12,-32(%rdi) 1719 adcq %rdx,%r13 1720 1721 leaq (%r14,%r10,2),%rbx 1722 movq %r13,-24(%rdi) 1723 sbbq %r15,%r15 1724 shrq $63,%r10 1725 leaq (%rcx,%r11,2),%r8 1726 shrq $63,%r11 1727 orq %r10,%r8 1728 mulq %rax 1729 negq %r15 1730 adcq %rax,%rbx 1731 adcq %rdx,%r8 1732 movq %rbx,-16(%rdi) 1733 movq %r8,-8(%rdi) 1734.byte 102,72,15,126,213 1735__bn_sqr8x_reduction: 1736 xorq %rax,%rax 1737 leaq (%r9,%rbp,1),%rcx 1738 leaq 48+8(%rsp,%r9,2),%rdx 1739 movq %rcx,0+8(%rsp) 1740 leaq 48+8(%rsp,%r9,1),%rdi 1741 movq %rdx,8+8(%rsp) 1742 negq %r9 1743 jmp .L8x_reduction_loop 1744 1745.align 32 1746.L8x_reduction_loop: 1747 leaq (%rdi,%r9,1),%rdi 1748.byte 0x66 1749 movq 0(%rdi),%rbx 1750 movq 8(%rdi),%r9 1751 movq 16(%rdi),%r10 1752 movq 24(%rdi),%r11 1753 movq 32(%rdi),%r12 1754 movq 40(%rdi),%r13 1755 movq 48(%rdi),%r14 1756 movq 56(%rdi),%r15 1757 movq %rax,(%rdx) 1758 leaq 64(%rdi),%rdi 1759 1760.byte 0x67 1761 movq %rbx,%r8 1762 imulq 32+8(%rsp),%rbx 1763 movq 0(%rbp),%rax 1764 movl $8,%ecx 1765 jmp .L8x_reduce 1766 1767.align 32 1768.L8x_reduce: 1769 mulq %rbx 1770 movq 8(%rbp),%rax 1771 negq %r8 1772 movq %rdx,%r8 1773 adcq $0,%r8 1774 1775 mulq %rbx 1776 addq %rax,%r9 1777 movq 16(%rbp),%rax 1778 adcq $0,%rdx 1779 addq %r9,%r8 1780 movq %rbx,48-8+8(%rsp,%rcx,8) 1781 movq %rdx,%r9 1782 adcq $0,%r9 1783 1784 mulq %rbx 1785 addq %rax,%r10 1786 movq 24(%rbp),%rax 1787 adcq $0,%rdx 1788 addq %r10,%r9 1789 movq 32+8(%rsp),%rsi 1790 movq %rdx,%r10 1791 adcq $0,%r10 1792 1793 mulq %rbx 1794 addq %rax,%r11 1795 movq 32(%rbp),%rax 1796 adcq $0,%rdx 1797 imulq %r8,%rsi 1798 addq %r11,%r10 1799 movq %rdx,%r11 1800 adcq $0,%r11 1801 1802 mulq %rbx 1803 addq %rax,%r12 1804 movq 40(%rbp),%rax 1805 adcq $0,%rdx 1806 addq %r12,%r11 1807 movq %rdx,%r12 1808 adcq $0,%r12 1809 1810 mulq %rbx 1811 addq %rax,%r13 1812 movq 48(%rbp),%rax 1813 adcq $0,%rdx 1814 addq %r13,%r12 1815 movq %rdx,%r13 1816 adcq $0,%r13 1817 1818 mulq %rbx 1819 addq %rax,%r14 1820 movq 56(%rbp),%rax 1821 adcq $0,%rdx 1822 addq %r14,%r13 1823 movq %rdx,%r14 1824 adcq $0,%r14 1825 1826 mulq %rbx 1827 movq %rsi,%rbx 1828 addq %rax,%r15 1829 movq 0(%rbp),%rax 1830 adcq $0,%rdx 1831 addq %r15,%r14 1832 movq %rdx,%r15 1833 adcq $0,%r15 1834 1835 decl %ecx 1836 jnz .L8x_reduce 1837 1838 leaq 64(%rbp),%rbp 1839 xorq %rax,%rax 1840 movq 8+8(%rsp),%rdx 1841 cmpq 0+8(%rsp),%rbp 1842 jae .L8x_no_tail 1843 1844.byte 0x66 1845 addq 0(%rdi),%r8 1846 adcq 8(%rdi),%r9 1847 adcq 16(%rdi),%r10 1848 adcq 24(%rdi),%r11 1849 adcq 32(%rdi),%r12 1850 adcq 40(%rdi),%r13 1851 adcq 48(%rdi),%r14 1852 adcq 56(%rdi),%r15 1853 sbbq %rsi,%rsi 1854 1855 movq 48+56+8(%rsp),%rbx 1856 movl $8,%ecx 1857 movq 0(%rbp),%rax 1858 jmp .L8x_tail 1859 1860.align 32 1861.L8x_tail: 1862 mulq %rbx 1863 addq %rax,%r8 1864 movq 8(%rbp),%rax 1865 movq %r8,(%rdi) 1866 movq %rdx,%r8 1867 adcq $0,%r8 1868 1869 mulq %rbx 1870 addq %rax,%r9 1871 movq 16(%rbp),%rax 1872 adcq $0,%rdx 1873 addq %r9,%r8 1874 leaq 8(%rdi),%rdi 1875 movq %rdx,%r9 1876 adcq $0,%r9 1877 1878 mulq %rbx 1879 addq %rax,%r10 1880 movq 24(%rbp),%rax 1881 adcq $0,%rdx 1882 addq %r10,%r9 1883 movq %rdx,%r10 1884 adcq $0,%r10 1885 1886 mulq %rbx 1887 addq %rax,%r11 1888 movq 32(%rbp),%rax 1889 adcq $0,%rdx 1890 addq %r11,%r10 1891 movq %rdx,%r11 1892 adcq $0,%r11 1893 1894 mulq %rbx 1895 addq %rax,%r12 1896 movq 40(%rbp),%rax 1897 adcq $0,%rdx 1898 addq %r12,%r11 1899 movq %rdx,%r12 1900 adcq $0,%r12 1901 1902 mulq %rbx 1903 addq %rax,%r13 1904 movq 48(%rbp),%rax 1905 adcq $0,%rdx 1906 addq %r13,%r12 1907 movq %rdx,%r13 1908 adcq $0,%r13 1909 1910 mulq %rbx 1911 addq %rax,%r14 1912 movq 56(%rbp),%rax 1913 adcq $0,%rdx 1914 addq %r14,%r13 1915 movq %rdx,%r14 1916 adcq $0,%r14 1917 1918 mulq %rbx 1919 movq 48-16+8(%rsp,%rcx,8),%rbx 1920 addq %rax,%r15 1921 adcq $0,%rdx 1922 addq %r15,%r14 1923 movq 0(%rbp),%rax 1924 movq %rdx,%r15 1925 adcq $0,%r15 1926 1927 decl %ecx 1928 jnz .L8x_tail 1929 1930 leaq 64(%rbp),%rbp 1931 movq 8+8(%rsp),%rdx 1932 cmpq 0+8(%rsp),%rbp 1933 jae .L8x_tail_done 1934 1935 movq 48+56+8(%rsp),%rbx 1936 negq %rsi 1937 movq 0(%rbp),%rax 1938 adcq 0(%rdi),%r8 1939 adcq 8(%rdi),%r9 1940 adcq 16(%rdi),%r10 1941 adcq 24(%rdi),%r11 1942 adcq 32(%rdi),%r12 1943 adcq 40(%rdi),%r13 1944 adcq 48(%rdi),%r14 1945 adcq 56(%rdi),%r15 1946 sbbq %rsi,%rsi 1947 1948 movl $8,%ecx 1949 jmp .L8x_tail 1950 1951.align 32 1952.L8x_tail_done: 1953 xorq %rax,%rax 1954 addq (%rdx),%r8 1955 adcq $0,%r9 1956 adcq $0,%r10 1957 adcq $0,%r11 1958 adcq $0,%r12 1959 adcq $0,%r13 1960 adcq $0,%r14 1961 adcq $0,%r15 1962 adcq $0,%rax 1963 1964 negq %rsi 1965.L8x_no_tail: 1966 adcq 0(%rdi),%r8 1967 adcq 8(%rdi),%r9 1968 adcq 16(%rdi),%r10 1969 adcq 24(%rdi),%r11 1970 adcq 32(%rdi),%r12 1971 adcq 40(%rdi),%r13 1972 adcq 48(%rdi),%r14 1973 adcq 56(%rdi),%r15 1974 adcq $0,%rax 1975 movq -8(%rbp),%rcx 1976 xorq %rsi,%rsi 1977 1978.byte 102,72,15,126,213 1979 1980 movq %r8,0(%rdi) 1981 movq %r9,8(%rdi) 1982.byte 102,73,15,126,217 1983 movq %r10,16(%rdi) 1984 movq %r11,24(%rdi) 1985 movq %r12,32(%rdi) 1986 movq %r13,40(%rdi) 1987 movq %r14,48(%rdi) 1988 movq %r15,56(%rdi) 1989 leaq 64(%rdi),%rdi 1990 1991 cmpq %rdx,%rdi 1992 jb .L8x_reduction_loop 1993 .byte 0xf3,0xc3 1994.cfi_endproc 1995.size bn_sqr8x_internal,.-bn_sqr8x_internal 1996.type __bn_post4x_internal,@function 1997.align 32 1998__bn_post4x_internal: 1999.cfi_startproc 2000 movq 0(%rbp),%r12 2001 leaq (%rdi,%r9,1),%rbx 2002 movq %r9,%rcx 2003.byte 102,72,15,126,207 2004 negq %rax 2005.byte 102,72,15,126,206 2006 sarq $3+2,%rcx 2007 decq %r12 2008 xorq %r10,%r10 2009 movq 8(%rbp),%r13 2010 movq 16(%rbp),%r14 2011 movq 24(%rbp),%r15 2012 jmp .Lsqr4x_sub_entry 2013 2014.align 16 2015.Lsqr4x_sub: 2016 movq 0(%rbp),%r12 2017 movq 8(%rbp),%r13 2018 movq 16(%rbp),%r14 2019 movq 24(%rbp),%r15 2020.Lsqr4x_sub_entry: 2021 leaq 32(%rbp),%rbp 2022 notq %r12 2023 notq %r13 2024 notq %r14 2025 notq %r15 2026 andq %rax,%r12 2027 andq %rax,%r13 2028 andq %rax,%r14 2029 andq %rax,%r15 2030 2031 negq %r10 2032 adcq 0(%rbx),%r12 2033 adcq 8(%rbx),%r13 2034 adcq 16(%rbx),%r14 2035 adcq 24(%rbx),%r15 2036 movq %r12,0(%rdi) 2037 leaq 32(%rbx),%rbx 2038 movq %r13,8(%rdi) 2039 sbbq %r10,%r10 2040 movq %r14,16(%rdi) 2041 movq %r15,24(%rdi) 2042 leaq 32(%rdi),%rdi 2043 2044 incq %rcx 2045 jnz .Lsqr4x_sub 2046 2047 movq %r9,%r10 2048 negq %r9 2049 .byte 0xf3,0xc3 2050.cfi_endproc 2051.size __bn_post4x_internal,.-__bn_post4x_internal 2052.globl bn_from_montgomery 2053.type bn_from_montgomery,@function 2054.align 32 2055bn_from_montgomery: 2056.cfi_startproc 2057 testl $7,%r9d 2058 jz bn_from_mont8x 2059 xorl %eax,%eax 2060 .byte 0xf3,0xc3 2061.cfi_endproc 2062.size bn_from_montgomery,.-bn_from_montgomery 2063 2064.type bn_from_mont8x,@function 2065.align 32 2066bn_from_mont8x: 2067.cfi_startproc 2068.byte 0x67 2069 movq %rsp,%rax 2070.cfi_def_cfa_register %rax 2071 pushq %rbx 2072.cfi_offset %rbx,-16 2073 pushq %rbp 2074.cfi_offset %rbp,-24 2075 pushq %r12 2076.cfi_offset %r12,-32 2077 pushq %r13 2078.cfi_offset %r13,-40 2079 pushq %r14 2080.cfi_offset %r14,-48 2081 pushq %r15 2082.cfi_offset %r15,-56 2083.Lfrom_prologue: 2084 2085 shll $3,%r9d 2086 leaq (%r9,%r9,2),%r10 2087 negq %r9 2088 movq (%r8),%r8 2089 2090 2091 2092 2093 2094 2095 2096 2097 leaq -320(%rsp,%r9,2),%r11 2098 movq %rsp,%rbp 2099 subq %rdi,%r11 2100 andq $4095,%r11 2101 cmpq %r11,%r10 2102 jb .Lfrom_sp_alt 2103 subq %r11,%rbp 2104 leaq -320(%rbp,%r9,2),%rbp 2105 jmp .Lfrom_sp_done 2106 2107.align 32 2108.Lfrom_sp_alt: 2109 leaq 4096-320(,%r9,2),%r10 2110 leaq -320(%rbp,%r9,2),%rbp 2111 subq %r10,%r11 2112 movq $0,%r10 2113 cmovcq %r10,%r11 2114 subq %r11,%rbp 2115.Lfrom_sp_done: 2116 andq $-64,%rbp 2117 movq %rsp,%r11 2118 subq %rbp,%r11 2119 andq $-4096,%r11 2120 leaq (%r11,%rbp,1),%rsp 2121 movq (%rsp),%r10 2122 cmpq %rbp,%rsp 2123 ja .Lfrom_page_walk 2124 jmp .Lfrom_page_walk_done 2125 2126.Lfrom_page_walk: 2127 leaq -4096(%rsp),%rsp 2128 movq (%rsp),%r10 2129 cmpq %rbp,%rsp 2130 ja .Lfrom_page_walk 2131.Lfrom_page_walk_done: 2132 2133 movq %r9,%r10 2134 negq %r9 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 movq %r8,32(%rsp) 2146 movq %rax,40(%rsp) 2147.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 2148.Lfrom_body: 2149 movq %r9,%r11 2150 leaq 48(%rsp),%rax 2151 pxor %xmm0,%xmm0 2152 jmp .Lmul_by_1 2153 2154.align 32 2155.Lmul_by_1: 2156 movdqu (%rsi),%xmm1 2157 movdqu 16(%rsi),%xmm2 2158 movdqu 32(%rsi),%xmm3 2159 movdqa %xmm0,(%rax,%r9,1) 2160 movdqu 48(%rsi),%xmm4 2161 movdqa %xmm0,16(%rax,%r9,1) 2162.byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00 2163 movdqa %xmm1,(%rax) 2164 movdqa %xmm0,32(%rax,%r9,1) 2165 movdqa %xmm2,16(%rax) 2166 movdqa %xmm0,48(%rax,%r9,1) 2167 movdqa %xmm3,32(%rax) 2168 movdqa %xmm4,48(%rax) 2169 leaq 64(%rax),%rax 2170 subq $64,%r11 2171 jnz .Lmul_by_1 2172 2173.byte 102,72,15,110,207 2174.byte 102,72,15,110,209 2175.byte 0x67 2176 movq %rcx,%rbp 2177.byte 102,73,15,110,218 2178 movl OPENSSL_ia32cap_P+8(%rip),%r11d 2179 andl $0x80108,%r11d 2180 cmpl $0x80108,%r11d 2181 jne .Lfrom_mont_nox 2182 2183 leaq (%rax,%r9,1),%rdi 2184 call __bn_sqrx8x_reduction 2185 call __bn_postx4x_internal 2186 2187 pxor %xmm0,%xmm0 2188 leaq 48(%rsp),%rax 2189 jmp .Lfrom_mont_zero 2190 2191.align 32 2192.Lfrom_mont_nox: 2193 call __bn_sqr8x_reduction 2194 call __bn_post4x_internal 2195 2196 pxor %xmm0,%xmm0 2197 leaq 48(%rsp),%rax 2198 jmp .Lfrom_mont_zero 2199 2200.align 32 2201.Lfrom_mont_zero: 2202 movq 40(%rsp),%rsi 2203.cfi_def_cfa %rsi,8 2204 movdqa %xmm0,0(%rax) 2205 movdqa %xmm0,16(%rax) 2206 movdqa %xmm0,32(%rax) 2207 movdqa %xmm0,48(%rax) 2208 leaq 64(%rax),%rax 2209 subq $32,%r9 2210 jnz .Lfrom_mont_zero 2211 2212 movq $1,%rax 2213 movq -48(%rsi),%r15 2214.cfi_restore %r15 2215 movq -40(%rsi),%r14 2216.cfi_restore %r14 2217 movq -32(%rsi),%r13 2218.cfi_restore %r13 2219 movq -24(%rsi),%r12 2220.cfi_restore %r12 2221 movq -16(%rsi),%rbp 2222.cfi_restore %rbp 2223 movq -8(%rsi),%rbx 2224.cfi_restore %rbx 2225 leaq (%rsi),%rsp 2226.cfi_def_cfa_register %rsp 2227.Lfrom_epilogue: 2228 .byte 0xf3,0xc3 2229.cfi_endproc 2230.size bn_from_mont8x,.-bn_from_mont8x 2231.type bn_mulx4x_mont_gather5,@function 2232.align 32 2233bn_mulx4x_mont_gather5: 2234.cfi_startproc 2235 movq %rsp,%rax 2236.cfi_def_cfa_register %rax 2237.Lmulx4x_enter: 2238 pushq %rbx 2239.cfi_offset %rbx,-16 2240 pushq %rbp 2241.cfi_offset %rbp,-24 2242 pushq %r12 2243.cfi_offset %r12,-32 2244 pushq %r13 2245.cfi_offset %r13,-40 2246 pushq %r14 2247.cfi_offset %r14,-48 2248 pushq %r15 2249.cfi_offset %r15,-56 2250.Lmulx4x_prologue: 2251 2252 shll $3,%r9d 2253 leaq (%r9,%r9,2),%r10 2254 negq %r9 2255 movq (%r8),%r8 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 leaq -320(%rsp,%r9,2),%r11 2267 movq %rsp,%rbp 2268 subq %rdi,%r11 2269 andq $4095,%r11 2270 cmpq %r11,%r10 2271 jb .Lmulx4xsp_alt 2272 subq %r11,%rbp 2273 leaq -320(%rbp,%r9,2),%rbp 2274 jmp .Lmulx4xsp_done 2275 2276.Lmulx4xsp_alt: 2277 leaq 4096-320(,%r9,2),%r10 2278 leaq -320(%rbp,%r9,2),%rbp 2279 subq %r10,%r11 2280 movq $0,%r10 2281 cmovcq %r10,%r11 2282 subq %r11,%rbp 2283.Lmulx4xsp_done: 2284 andq $-64,%rbp 2285 movq %rsp,%r11 2286 subq %rbp,%r11 2287 andq $-4096,%r11 2288 leaq (%r11,%rbp,1),%rsp 2289 movq (%rsp),%r10 2290 cmpq %rbp,%rsp 2291 ja .Lmulx4x_page_walk 2292 jmp .Lmulx4x_page_walk_done 2293 2294.Lmulx4x_page_walk: 2295 leaq -4096(%rsp),%rsp 2296 movq (%rsp),%r10 2297 cmpq %rbp,%rsp 2298 ja .Lmulx4x_page_walk 2299.Lmulx4x_page_walk_done: 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 movq %r8,32(%rsp) 2314 movq %rax,40(%rsp) 2315.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 2316.Lmulx4x_body: 2317 call mulx4x_internal 2318 2319 movq 40(%rsp),%rsi 2320.cfi_def_cfa %rsi,8 2321 movq $1,%rax 2322 2323 movq -48(%rsi),%r15 2324.cfi_restore %r15 2325 movq -40(%rsi),%r14 2326.cfi_restore %r14 2327 movq -32(%rsi),%r13 2328.cfi_restore %r13 2329 movq -24(%rsi),%r12 2330.cfi_restore %r12 2331 movq -16(%rsi),%rbp 2332.cfi_restore %rbp 2333 movq -8(%rsi),%rbx 2334.cfi_restore %rbx 2335 leaq (%rsi),%rsp 2336.cfi_def_cfa_register %rsp 2337.Lmulx4x_epilogue: 2338 .byte 0xf3,0xc3 2339.cfi_endproc 2340.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5 2341 2342.type mulx4x_internal,@function 2343.align 32 2344mulx4x_internal: 2345.cfi_startproc 2346 movq %r9,8(%rsp) 2347 movq %r9,%r10 2348 negq %r9 2349 shlq $5,%r9 2350 negq %r10 2351 leaq 128(%rdx,%r9,1),%r13 2352 shrq $5+5,%r9 2353 movd 8(%rax),%xmm5 2354 subq $1,%r9 2355 leaq .Linc(%rip),%rax 2356 movq %r13,16+8(%rsp) 2357 movq %r9,24+8(%rsp) 2358 movq %rdi,56+8(%rsp) 2359 movdqa 0(%rax),%xmm0 2360 movdqa 16(%rax),%xmm1 2361 leaq 88-112(%rsp,%r10,1),%r10 2362 leaq 128(%rdx),%rdi 2363 2364 pshufd $0,%xmm5,%xmm5 2365 movdqa %xmm1,%xmm4 2366.byte 0x67 2367 movdqa %xmm1,%xmm2 2368.byte 0x67 2369 paddd %xmm0,%xmm1 2370 pcmpeqd %xmm5,%xmm0 2371 movdqa %xmm4,%xmm3 2372 paddd %xmm1,%xmm2 2373 pcmpeqd %xmm5,%xmm1 2374 movdqa %xmm0,112(%r10) 2375 movdqa %xmm4,%xmm0 2376 2377 paddd %xmm2,%xmm3 2378 pcmpeqd %xmm5,%xmm2 2379 movdqa %xmm1,128(%r10) 2380 movdqa %xmm4,%xmm1 2381 2382 paddd %xmm3,%xmm0 2383 pcmpeqd %xmm5,%xmm3 2384 movdqa %xmm2,144(%r10) 2385 movdqa %xmm4,%xmm2 2386 2387 paddd %xmm0,%xmm1 2388 pcmpeqd %xmm5,%xmm0 2389 movdqa %xmm3,160(%r10) 2390 movdqa %xmm4,%xmm3 2391 paddd %xmm1,%xmm2 2392 pcmpeqd %xmm5,%xmm1 2393 movdqa %xmm0,176(%r10) 2394 movdqa %xmm4,%xmm0 2395 2396 paddd %xmm2,%xmm3 2397 pcmpeqd %xmm5,%xmm2 2398 movdqa %xmm1,192(%r10) 2399 movdqa %xmm4,%xmm1 2400 2401 paddd %xmm3,%xmm0 2402 pcmpeqd %xmm5,%xmm3 2403 movdqa %xmm2,208(%r10) 2404 movdqa %xmm4,%xmm2 2405 2406 paddd %xmm0,%xmm1 2407 pcmpeqd %xmm5,%xmm0 2408 movdqa %xmm3,224(%r10) 2409 movdqa %xmm4,%xmm3 2410 paddd %xmm1,%xmm2 2411 pcmpeqd %xmm5,%xmm1 2412 movdqa %xmm0,240(%r10) 2413 movdqa %xmm4,%xmm0 2414 2415 paddd %xmm2,%xmm3 2416 pcmpeqd %xmm5,%xmm2 2417 movdqa %xmm1,256(%r10) 2418 movdqa %xmm4,%xmm1 2419 2420 paddd %xmm3,%xmm0 2421 pcmpeqd %xmm5,%xmm3 2422 movdqa %xmm2,272(%r10) 2423 movdqa %xmm4,%xmm2 2424 2425 paddd %xmm0,%xmm1 2426 pcmpeqd %xmm5,%xmm0 2427 movdqa %xmm3,288(%r10) 2428 movdqa %xmm4,%xmm3 2429.byte 0x67 2430 paddd %xmm1,%xmm2 2431 pcmpeqd %xmm5,%xmm1 2432 movdqa %xmm0,304(%r10) 2433 2434 paddd %xmm2,%xmm3 2435 pcmpeqd %xmm5,%xmm2 2436 movdqa %xmm1,320(%r10) 2437 2438 pcmpeqd %xmm5,%xmm3 2439 movdqa %xmm2,336(%r10) 2440 2441 pand 64(%rdi),%xmm0 2442 pand 80(%rdi),%xmm1 2443 pand 96(%rdi),%xmm2 2444 movdqa %xmm3,352(%r10) 2445 pand 112(%rdi),%xmm3 2446 por %xmm2,%xmm0 2447 por %xmm3,%xmm1 2448 movdqa -128(%rdi),%xmm4 2449 movdqa -112(%rdi),%xmm5 2450 movdqa -96(%rdi),%xmm2 2451 pand 112(%r10),%xmm4 2452 movdqa -80(%rdi),%xmm3 2453 pand 128(%r10),%xmm5 2454 por %xmm4,%xmm0 2455 pand 144(%r10),%xmm2 2456 por %xmm5,%xmm1 2457 pand 160(%r10),%xmm3 2458 por %xmm2,%xmm0 2459 por %xmm3,%xmm1 2460 movdqa -64(%rdi),%xmm4 2461 movdqa -48(%rdi),%xmm5 2462 movdqa -32(%rdi),%xmm2 2463 pand 176(%r10),%xmm4 2464 movdqa -16(%rdi),%xmm3 2465 pand 192(%r10),%xmm5 2466 por %xmm4,%xmm0 2467 pand 208(%r10),%xmm2 2468 por %xmm5,%xmm1 2469 pand 224(%r10),%xmm3 2470 por %xmm2,%xmm0 2471 por %xmm3,%xmm1 2472 movdqa 0(%rdi),%xmm4 2473 movdqa 16(%rdi),%xmm5 2474 movdqa 32(%rdi),%xmm2 2475 pand 240(%r10),%xmm4 2476 movdqa 48(%rdi),%xmm3 2477 pand 256(%r10),%xmm5 2478 por %xmm4,%xmm0 2479 pand 272(%r10),%xmm2 2480 por %xmm5,%xmm1 2481 pand 288(%r10),%xmm3 2482 por %xmm2,%xmm0 2483 por %xmm3,%xmm1 2484 pxor %xmm1,%xmm0 2485 pshufd $0x4e,%xmm0,%xmm1 2486 por %xmm1,%xmm0 2487 leaq 256(%rdi),%rdi 2488.byte 102,72,15,126,194 2489 leaq 64+32+8(%rsp),%rbx 2490 2491 movq %rdx,%r9 2492 mulxq 0(%rsi),%r8,%rax 2493 mulxq 8(%rsi),%r11,%r12 2494 addq %rax,%r11 2495 mulxq 16(%rsi),%rax,%r13 2496 adcq %rax,%r12 2497 adcq $0,%r13 2498 mulxq 24(%rsi),%rax,%r14 2499 2500 movq %r8,%r15 2501 imulq 32+8(%rsp),%r8 2502 xorq %rbp,%rbp 2503 movq %r8,%rdx 2504 2505 movq %rdi,8+8(%rsp) 2506 2507 leaq 32(%rsi),%rsi 2508 adcxq %rax,%r13 2509 adcxq %rbp,%r14 2510 2511 mulxq 0(%rcx),%rax,%r10 2512 adcxq %rax,%r15 2513 adoxq %r11,%r10 2514 mulxq 8(%rcx),%rax,%r11 2515 adcxq %rax,%r10 2516 adoxq %r12,%r11 2517 mulxq 16(%rcx),%rax,%r12 2518 movq 24+8(%rsp),%rdi 2519 movq %r10,-32(%rbx) 2520 adcxq %rax,%r11 2521 adoxq %r13,%r12 2522 mulxq 24(%rcx),%rax,%r15 2523 movq %r9,%rdx 2524 movq %r11,-24(%rbx) 2525 adcxq %rax,%r12 2526 adoxq %rbp,%r15 2527 leaq 32(%rcx),%rcx 2528 movq %r12,-16(%rbx) 2529 jmp .Lmulx4x_1st 2530 2531.align 32 2532.Lmulx4x_1st: 2533 adcxq %rbp,%r15 2534 mulxq 0(%rsi),%r10,%rax 2535 adcxq %r14,%r10 2536 mulxq 8(%rsi),%r11,%r14 2537 adcxq %rax,%r11 2538 mulxq 16(%rsi),%r12,%rax 2539 adcxq %r14,%r12 2540 mulxq 24(%rsi),%r13,%r14 2541.byte 0x67,0x67 2542 movq %r8,%rdx 2543 adcxq %rax,%r13 2544 adcxq %rbp,%r14 2545 leaq 32(%rsi),%rsi 2546 leaq 32(%rbx),%rbx 2547 2548 adoxq %r15,%r10 2549 mulxq 0(%rcx),%rax,%r15 2550 adcxq %rax,%r10 2551 adoxq %r15,%r11 2552 mulxq 8(%rcx),%rax,%r15 2553 adcxq %rax,%r11 2554 adoxq %r15,%r12 2555 mulxq 16(%rcx),%rax,%r15 2556 movq %r10,-40(%rbx) 2557 adcxq %rax,%r12 2558 movq %r11,-32(%rbx) 2559 adoxq %r15,%r13 2560 mulxq 24(%rcx),%rax,%r15 2561 movq %r9,%rdx 2562 movq %r12,-24(%rbx) 2563 adcxq %rax,%r13 2564 adoxq %rbp,%r15 2565 leaq 32(%rcx),%rcx 2566 movq %r13,-16(%rbx) 2567 2568 decq %rdi 2569 jnz .Lmulx4x_1st 2570 2571 movq 8(%rsp),%rax 2572 adcq %rbp,%r15 2573 leaq (%rsi,%rax,1),%rsi 2574 addq %r15,%r14 2575 movq 8+8(%rsp),%rdi 2576 adcq %rbp,%rbp 2577 movq %r14,-8(%rbx) 2578 jmp .Lmulx4x_outer 2579 2580.align 32 2581.Lmulx4x_outer: 2582 leaq 16-256(%rbx),%r10 2583 pxor %xmm4,%xmm4 2584.byte 0x67,0x67 2585 pxor %xmm5,%xmm5 2586 movdqa -128(%rdi),%xmm0 2587 movdqa -112(%rdi),%xmm1 2588 movdqa -96(%rdi),%xmm2 2589 pand 256(%r10),%xmm0 2590 movdqa -80(%rdi),%xmm3 2591 pand 272(%r10),%xmm1 2592 por %xmm0,%xmm4 2593 pand 288(%r10),%xmm2 2594 por %xmm1,%xmm5 2595 pand 304(%r10),%xmm3 2596 por %xmm2,%xmm4 2597 por %xmm3,%xmm5 2598 movdqa -64(%rdi),%xmm0 2599 movdqa -48(%rdi),%xmm1 2600 movdqa -32(%rdi),%xmm2 2601 pand 320(%r10),%xmm0 2602 movdqa -16(%rdi),%xmm3 2603 pand 336(%r10),%xmm1 2604 por %xmm0,%xmm4 2605 pand 352(%r10),%xmm2 2606 por %xmm1,%xmm5 2607 pand 368(%r10),%xmm3 2608 por %xmm2,%xmm4 2609 por %xmm3,%xmm5 2610 movdqa 0(%rdi),%xmm0 2611 movdqa 16(%rdi),%xmm1 2612 movdqa 32(%rdi),%xmm2 2613 pand 384(%r10),%xmm0 2614 movdqa 48(%rdi),%xmm3 2615 pand 400(%r10),%xmm1 2616 por %xmm0,%xmm4 2617 pand 416(%r10),%xmm2 2618 por %xmm1,%xmm5 2619 pand 432(%r10),%xmm3 2620 por %xmm2,%xmm4 2621 por %xmm3,%xmm5 2622 movdqa 64(%rdi),%xmm0 2623 movdqa 80(%rdi),%xmm1 2624 movdqa 96(%rdi),%xmm2 2625 pand 448(%r10),%xmm0 2626 movdqa 112(%rdi),%xmm3 2627 pand 464(%r10),%xmm1 2628 por %xmm0,%xmm4 2629 pand 480(%r10),%xmm2 2630 por %xmm1,%xmm5 2631 pand 496(%r10),%xmm3 2632 por %xmm2,%xmm4 2633 por %xmm3,%xmm5 2634 por %xmm5,%xmm4 2635 pshufd $0x4e,%xmm4,%xmm0 2636 por %xmm4,%xmm0 2637 leaq 256(%rdi),%rdi 2638.byte 102,72,15,126,194 2639 2640 movq %rbp,(%rbx) 2641 leaq 32(%rbx,%rax,1),%rbx 2642 mulxq 0(%rsi),%r8,%r11 2643 xorq %rbp,%rbp 2644 movq %rdx,%r9 2645 mulxq 8(%rsi),%r14,%r12 2646 adoxq -32(%rbx),%r8 2647 adcxq %r14,%r11 2648 mulxq 16(%rsi),%r15,%r13 2649 adoxq -24(%rbx),%r11 2650 adcxq %r15,%r12 2651 mulxq 24(%rsi),%rdx,%r14 2652 adoxq -16(%rbx),%r12 2653 adcxq %rdx,%r13 2654 leaq (%rcx,%rax,1),%rcx 2655 leaq 32(%rsi),%rsi 2656 adoxq -8(%rbx),%r13 2657 adcxq %rbp,%r14 2658 adoxq %rbp,%r14 2659 2660 movq %r8,%r15 2661 imulq 32+8(%rsp),%r8 2662 2663 movq %r8,%rdx 2664 xorq %rbp,%rbp 2665 movq %rdi,8+8(%rsp) 2666 2667 mulxq 0(%rcx),%rax,%r10 2668 adcxq %rax,%r15 2669 adoxq %r11,%r10 2670 mulxq 8(%rcx),%rax,%r11 2671 adcxq %rax,%r10 2672 adoxq %r12,%r11 2673 mulxq 16(%rcx),%rax,%r12 2674 adcxq %rax,%r11 2675 adoxq %r13,%r12 2676 mulxq 24(%rcx),%rax,%r15 2677 movq %r9,%rdx 2678 movq 24+8(%rsp),%rdi 2679 movq %r10,-32(%rbx) 2680 adcxq %rax,%r12 2681 movq %r11,-24(%rbx) 2682 adoxq %rbp,%r15 2683 movq %r12,-16(%rbx) 2684 leaq 32(%rcx),%rcx 2685 jmp .Lmulx4x_inner 2686 2687.align 32 2688.Lmulx4x_inner: 2689 mulxq 0(%rsi),%r10,%rax 2690 adcxq %rbp,%r15 2691 adoxq %r14,%r10 2692 mulxq 8(%rsi),%r11,%r14 2693 adcxq 0(%rbx),%r10 2694 adoxq %rax,%r11 2695 mulxq 16(%rsi),%r12,%rax 2696 adcxq 8(%rbx),%r11 2697 adoxq %r14,%r12 2698 mulxq 24(%rsi),%r13,%r14 2699 movq %r8,%rdx 2700 adcxq 16(%rbx),%r12 2701 adoxq %rax,%r13 2702 adcxq 24(%rbx),%r13 2703 adoxq %rbp,%r14 2704 leaq 32(%rsi),%rsi 2705 leaq 32(%rbx),%rbx 2706 adcxq %rbp,%r14 2707 2708 adoxq %r15,%r10 2709 mulxq 0(%rcx),%rax,%r15 2710 adcxq %rax,%r10 2711 adoxq %r15,%r11 2712 mulxq 8(%rcx),%rax,%r15 2713 adcxq %rax,%r11 2714 adoxq %r15,%r12 2715 mulxq 16(%rcx),%rax,%r15 2716 movq %r10,-40(%rbx) 2717 adcxq %rax,%r12 2718 adoxq %r15,%r13 2719 movq %r11,-32(%rbx) 2720 mulxq 24(%rcx),%rax,%r15 2721 movq %r9,%rdx 2722 leaq 32(%rcx),%rcx 2723 movq %r12,-24(%rbx) 2724 adcxq %rax,%r13 2725 adoxq %rbp,%r15 2726 movq %r13,-16(%rbx) 2727 2728 decq %rdi 2729 jnz .Lmulx4x_inner 2730 2731 movq 0+8(%rsp),%rax 2732 adcq %rbp,%r15 2733 subq 0(%rbx),%rdi 2734 movq 8+8(%rsp),%rdi 2735 movq 16+8(%rsp),%r10 2736 adcq %r15,%r14 2737 leaq (%rsi,%rax,1),%rsi 2738 adcq %rbp,%rbp 2739 movq %r14,-8(%rbx) 2740 2741 cmpq %r10,%rdi 2742 jb .Lmulx4x_outer 2743 2744 movq -8(%rcx),%r10 2745 movq %rbp,%r8 2746 movq (%rcx,%rax,1),%r12 2747 leaq (%rcx,%rax,1),%rbp 2748 movq %rax,%rcx 2749 leaq (%rbx,%rax,1),%rdi 2750 xorl %eax,%eax 2751 xorq %r15,%r15 2752 subq %r14,%r10 2753 adcq %r15,%r15 2754 orq %r15,%r8 2755 sarq $3+2,%rcx 2756 subq %r8,%rax 2757 movq 56+8(%rsp),%rdx 2758 decq %r12 2759 movq 8(%rbp),%r13 2760 xorq %r8,%r8 2761 movq 16(%rbp),%r14 2762 movq 24(%rbp),%r15 2763 jmp .Lsqrx4x_sub_entry 2764.cfi_endproc 2765.size mulx4x_internal,.-mulx4x_internal 2766.type bn_powerx5,@function 2767.align 32 2768bn_powerx5: 2769.cfi_startproc 2770 movq %rsp,%rax 2771.cfi_def_cfa_register %rax 2772.Lpowerx5_enter: 2773 pushq %rbx 2774.cfi_offset %rbx,-16 2775 pushq %rbp 2776.cfi_offset %rbp,-24 2777 pushq %r12 2778.cfi_offset %r12,-32 2779 pushq %r13 2780.cfi_offset %r13,-40 2781 pushq %r14 2782.cfi_offset %r14,-48 2783 pushq %r15 2784.cfi_offset %r15,-56 2785.Lpowerx5_prologue: 2786 2787 shll $3,%r9d 2788 leaq (%r9,%r9,2),%r10 2789 negq %r9 2790 movq (%r8),%r8 2791 2792 2793 2794 2795 2796 2797 2798 2799 leaq -320(%rsp,%r9,2),%r11 2800 movq %rsp,%rbp 2801 subq %rdi,%r11 2802 andq $4095,%r11 2803 cmpq %r11,%r10 2804 jb .Lpwrx_sp_alt 2805 subq %r11,%rbp 2806 leaq -320(%rbp,%r9,2),%rbp 2807 jmp .Lpwrx_sp_done 2808 2809.align 32 2810.Lpwrx_sp_alt: 2811 leaq 4096-320(,%r9,2),%r10 2812 leaq -320(%rbp,%r9,2),%rbp 2813 subq %r10,%r11 2814 movq $0,%r10 2815 cmovcq %r10,%r11 2816 subq %r11,%rbp 2817.Lpwrx_sp_done: 2818 andq $-64,%rbp 2819 movq %rsp,%r11 2820 subq %rbp,%r11 2821 andq $-4096,%r11 2822 leaq (%r11,%rbp,1),%rsp 2823 movq (%rsp),%r10 2824 cmpq %rbp,%rsp 2825 ja .Lpwrx_page_walk 2826 jmp .Lpwrx_page_walk_done 2827 2828.Lpwrx_page_walk: 2829 leaq -4096(%rsp),%rsp 2830 movq (%rsp),%r10 2831 cmpq %rbp,%rsp 2832 ja .Lpwrx_page_walk 2833.Lpwrx_page_walk_done: 2834 2835 movq %r9,%r10 2836 negq %r9 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 pxor %xmm0,%xmm0 2850.byte 102,72,15,110,207 2851.byte 102,72,15,110,209 2852.byte 102,73,15,110,218 2853.byte 102,72,15,110,226 2854 movq %r8,32(%rsp) 2855 movq %rax,40(%rsp) 2856.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 2857.Lpowerx5_body: 2858 2859 call __bn_sqrx8x_internal 2860 call __bn_postx4x_internal 2861 call __bn_sqrx8x_internal 2862 call __bn_postx4x_internal 2863 call __bn_sqrx8x_internal 2864 call __bn_postx4x_internal 2865 call __bn_sqrx8x_internal 2866 call __bn_postx4x_internal 2867 call __bn_sqrx8x_internal 2868 call __bn_postx4x_internal 2869 2870 movq %r10,%r9 2871 movq %rsi,%rdi 2872.byte 102,72,15,126,209 2873.byte 102,72,15,126,226 2874 movq 40(%rsp),%rax 2875 2876 call mulx4x_internal 2877 2878 movq 40(%rsp),%rsi 2879.cfi_def_cfa %rsi,8 2880 movq $1,%rax 2881 2882 movq -48(%rsi),%r15 2883.cfi_restore %r15 2884 movq -40(%rsi),%r14 2885.cfi_restore %r14 2886 movq -32(%rsi),%r13 2887.cfi_restore %r13 2888 movq -24(%rsi),%r12 2889.cfi_restore %r12 2890 movq -16(%rsi),%rbp 2891.cfi_restore %rbp 2892 movq -8(%rsi),%rbx 2893.cfi_restore %rbx 2894 leaq (%rsi),%rsp 2895.cfi_def_cfa_register %rsp 2896.Lpowerx5_epilogue: 2897 .byte 0xf3,0xc3 2898.cfi_endproc 2899.size bn_powerx5,.-bn_powerx5 2900 2901.globl bn_sqrx8x_internal 2902.hidden bn_sqrx8x_internal 2903.type bn_sqrx8x_internal,@function 2904.align 32 2905bn_sqrx8x_internal: 2906__bn_sqrx8x_internal: 2907.cfi_startproc 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 leaq 48+8(%rsp),%rdi 2949 leaq (%rsi,%r9,1),%rbp 2950 movq %r9,0+8(%rsp) 2951 movq %rbp,8+8(%rsp) 2952 jmp .Lsqr8x_zero_start 2953 2954.align 32 2955.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 2956.Lsqrx8x_zero: 2957.byte 0x3e 2958 movdqa %xmm0,0(%rdi) 2959 movdqa %xmm0,16(%rdi) 2960 movdqa %xmm0,32(%rdi) 2961 movdqa %xmm0,48(%rdi) 2962.Lsqr8x_zero_start: 2963 movdqa %xmm0,64(%rdi) 2964 movdqa %xmm0,80(%rdi) 2965 movdqa %xmm0,96(%rdi) 2966 movdqa %xmm0,112(%rdi) 2967 leaq 128(%rdi),%rdi 2968 subq $64,%r9 2969 jnz .Lsqrx8x_zero 2970 2971 movq 0(%rsi),%rdx 2972 2973 xorq %r10,%r10 2974 xorq %r11,%r11 2975 xorq %r12,%r12 2976 xorq %r13,%r13 2977 xorq %r14,%r14 2978 xorq %r15,%r15 2979 leaq 48+8(%rsp),%rdi 2980 xorq %rbp,%rbp 2981 jmp .Lsqrx8x_outer_loop 2982 2983.align 32 2984.Lsqrx8x_outer_loop: 2985 mulxq 8(%rsi),%r8,%rax 2986 adcxq %r9,%r8 2987 adoxq %rax,%r10 2988 mulxq 16(%rsi),%r9,%rax 2989 adcxq %r10,%r9 2990 adoxq %rax,%r11 2991.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 2992 adcxq %r11,%r10 2993 adoxq %rax,%r12 2994.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 2995 adcxq %r12,%r11 2996 adoxq %rax,%r13 2997 mulxq 40(%rsi),%r12,%rax 2998 adcxq %r13,%r12 2999 adoxq %rax,%r14 3000 mulxq 48(%rsi),%r13,%rax 3001 adcxq %r14,%r13 3002 adoxq %r15,%rax 3003 mulxq 56(%rsi),%r14,%r15 3004 movq 8(%rsi),%rdx 3005 adcxq %rax,%r14 3006 adoxq %rbp,%r15 3007 adcq 64(%rdi),%r15 3008 movq %r8,8(%rdi) 3009 movq %r9,16(%rdi) 3010 sbbq %rcx,%rcx 3011 xorq %rbp,%rbp 3012 3013 3014 mulxq 16(%rsi),%r8,%rbx 3015 mulxq 24(%rsi),%r9,%rax 3016 adcxq %r10,%r8 3017 adoxq %rbx,%r9 3018 mulxq 32(%rsi),%r10,%rbx 3019 adcxq %r11,%r9 3020 adoxq %rax,%r10 3021.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 3022 adcxq %r12,%r10 3023 adoxq %rbx,%r11 3024.byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 3025 adcxq %r13,%r11 3026 adoxq %r14,%r12 3027.byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 3028 movq 16(%rsi),%rdx 3029 adcxq %rax,%r12 3030 adoxq %rbx,%r13 3031 adcxq %r15,%r13 3032 adoxq %rbp,%r14 3033 adcxq %rbp,%r14 3034 3035 movq %r8,24(%rdi) 3036 movq %r9,32(%rdi) 3037 3038 mulxq 24(%rsi),%r8,%rbx 3039 mulxq 32(%rsi),%r9,%rax 3040 adcxq %r10,%r8 3041 adoxq %rbx,%r9 3042 mulxq 40(%rsi),%r10,%rbx 3043 adcxq %r11,%r9 3044 adoxq %rax,%r10 3045.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 3046 adcxq %r12,%r10 3047 adoxq %r13,%r11 3048.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 3049.byte 0x3e 3050 movq 24(%rsi),%rdx 3051 adcxq %rbx,%r11 3052 adoxq %rax,%r12 3053 adcxq %r14,%r12 3054 movq %r8,40(%rdi) 3055 movq %r9,48(%rdi) 3056 mulxq 32(%rsi),%r8,%rax 3057 adoxq %rbp,%r13 3058 adcxq %rbp,%r13 3059 3060 mulxq 40(%rsi),%r9,%rbx 3061 adcxq %r10,%r8 3062 adoxq %rax,%r9 3063 mulxq 48(%rsi),%r10,%rax 3064 adcxq %r11,%r9 3065 adoxq %r12,%r10 3066 mulxq 56(%rsi),%r11,%r12 3067 movq 32(%rsi),%rdx 3068 movq 40(%rsi),%r14 3069 adcxq %rbx,%r10 3070 adoxq %rax,%r11 3071 movq 48(%rsi),%r15 3072 adcxq %r13,%r11 3073 adoxq %rbp,%r12 3074 adcxq %rbp,%r12 3075 3076 movq %r8,56(%rdi) 3077 movq %r9,64(%rdi) 3078 3079 mulxq %r14,%r9,%rax 3080 movq 56(%rsi),%r8 3081 adcxq %r10,%r9 3082 mulxq %r15,%r10,%rbx 3083 adoxq %rax,%r10 3084 adcxq %r11,%r10 3085 mulxq %r8,%r11,%rax 3086 movq %r14,%rdx 3087 adoxq %rbx,%r11 3088 adcxq %r12,%r11 3089 3090 adcxq %rbp,%rax 3091 3092 mulxq %r15,%r14,%rbx 3093 mulxq %r8,%r12,%r13 3094 movq %r15,%rdx 3095 leaq 64(%rsi),%rsi 3096 adcxq %r14,%r11 3097 adoxq %rbx,%r12 3098 adcxq %rax,%r12 3099 adoxq %rbp,%r13 3100 3101.byte 0x67,0x67 3102 mulxq %r8,%r8,%r14 3103 adcxq %r8,%r13 3104 adcxq %rbp,%r14 3105 3106 cmpq 8+8(%rsp),%rsi 3107 je .Lsqrx8x_outer_break 3108 3109 negq %rcx 3110 movq $-8,%rcx 3111 movq %rbp,%r15 3112 movq 64(%rdi),%r8 3113 adcxq 72(%rdi),%r9 3114 adcxq 80(%rdi),%r10 3115 adcxq 88(%rdi),%r11 3116 adcq 96(%rdi),%r12 3117 adcq 104(%rdi),%r13 3118 adcq 112(%rdi),%r14 3119 adcq 120(%rdi),%r15 3120 leaq (%rsi),%rbp 3121 leaq 128(%rdi),%rdi 3122 sbbq %rax,%rax 3123 3124 movq -64(%rsi),%rdx 3125 movq %rax,16+8(%rsp) 3126 movq %rdi,24+8(%rsp) 3127 3128 3129 xorl %eax,%eax 3130 jmp .Lsqrx8x_loop 3131 3132.align 32 3133.Lsqrx8x_loop: 3134 movq %r8,%rbx 3135 mulxq 0(%rbp),%rax,%r8 3136 adcxq %rax,%rbx 3137 adoxq %r9,%r8 3138 3139 mulxq 8(%rbp),%rax,%r9 3140 adcxq %rax,%r8 3141 adoxq %r10,%r9 3142 3143 mulxq 16(%rbp),%rax,%r10 3144 adcxq %rax,%r9 3145 adoxq %r11,%r10 3146 3147 mulxq 24(%rbp),%rax,%r11 3148 adcxq %rax,%r10 3149 adoxq %r12,%r11 3150 3151.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 3152 adcxq %rax,%r11 3153 adoxq %r13,%r12 3154 3155 mulxq 40(%rbp),%rax,%r13 3156 adcxq %rax,%r12 3157 adoxq %r14,%r13 3158 3159 mulxq 48(%rbp),%rax,%r14 3160 movq %rbx,(%rdi,%rcx,8) 3161 movl $0,%ebx 3162 adcxq %rax,%r13 3163 adoxq %r15,%r14 3164 3165.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 3166 movq 8(%rsi,%rcx,8),%rdx 3167 adcxq %rax,%r14 3168 adoxq %rbx,%r15 3169 adcxq %rbx,%r15 3170 3171.byte 0x67 3172 incq %rcx 3173 jnz .Lsqrx8x_loop 3174 3175 leaq 64(%rbp),%rbp 3176 movq $-8,%rcx 3177 cmpq 8+8(%rsp),%rbp 3178 je .Lsqrx8x_break 3179 3180 subq 16+8(%rsp),%rbx 3181.byte 0x66 3182 movq -64(%rsi),%rdx 3183 adcxq 0(%rdi),%r8 3184 adcxq 8(%rdi),%r9 3185 adcq 16(%rdi),%r10 3186 adcq 24(%rdi),%r11 3187 adcq 32(%rdi),%r12 3188 adcq 40(%rdi),%r13 3189 adcq 48(%rdi),%r14 3190 adcq 56(%rdi),%r15 3191 leaq 64(%rdi),%rdi 3192.byte 0x67 3193 sbbq %rax,%rax 3194 xorl %ebx,%ebx 3195 movq %rax,16+8(%rsp) 3196 jmp .Lsqrx8x_loop 3197 3198.align 32 3199.Lsqrx8x_break: 3200 xorq %rbp,%rbp 3201 subq 16+8(%rsp),%rbx 3202 adcxq %rbp,%r8 3203 movq 24+8(%rsp),%rcx 3204 adcxq %rbp,%r9 3205 movq 0(%rsi),%rdx 3206 adcq $0,%r10 3207 movq %r8,0(%rdi) 3208 adcq $0,%r11 3209 adcq $0,%r12 3210 adcq $0,%r13 3211 adcq $0,%r14 3212 adcq $0,%r15 3213 cmpq %rcx,%rdi 3214 je .Lsqrx8x_outer_loop 3215 3216 movq %r9,8(%rdi) 3217 movq 8(%rcx),%r9 3218 movq %r10,16(%rdi) 3219 movq 16(%rcx),%r10 3220 movq %r11,24(%rdi) 3221 movq 24(%rcx),%r11 3222 movq %r12,32(%rdi) 3223 movq 32(%rcx),%r12 3224 movq %r13,40(%rdi) 3225 movq 40(%rcx),%r13 3226 movq %r14,48(%rdi) 3227 movq 48(%rcx),%r14 3228 movq %r15,56(%rdi) 3229 movq 56(%rcx),%r15 3230 movq %rcx,%rdi 3231 jmp .Lsqrx8x_outer_loop 3232 3233.align 32 3234.Lsqrx8x_outer_break: 3235 movq %r9,72(%rdi) 3236.byte 102,72,15,126,217 3237 movq %r10,80(%rdi) 3238 movq %r11,88(%rdi) 3239 movq %r12,96(%rdi) 3240 movq %r13,104(%rdi) 3241 movq %r14,112(%rdi) 3242 leaq 48+8(%rsp),%rdi 3243 movq (%rsi,%rcx,1),%rdx 3244 3245 movq 8(%rdi),%r11 3246 xorq %r10,%r10 3247 movq 0+8(%rsp),%r9 3248 adoxq %r11,%r11 3249 movq 16(%rdi),%r12 3250 movq 24(%rdi),%r13 3251 3252 3253.align 32 3254.Lsqrx4x_shift_n_add: 3255 mulxq %rdx,%rax,%rbx 3256 adoxq %r12,%r12 3257 adcxq %r10,%rax 3258.byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 3259.byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 3260 adoxq %r13,%r13 3261 adcxq %r11,%rbx 3262 movq 40(%rdi),%r11 3263 movq %rax,0(%rdi) 3264 movq %rbx,8(%rdi) 3265 3266 mulxq %rdx,%rax,%rbx 3267 adoxq %r10,%r10 3268 adcxq %r12,%rax 3269 movq 16(%rsi,%rcx,1),%rdx 3270 movq 48(%rdi),%r12 3271 adoxq %r11,%r11 3272 adcxq %r13,%rbx 3273 movq 56(%rdi),%r13 3274 movq %rax,16(%rdi) 3275 movq %rbx,24(%rdi) 3276 3277 mulxq %rdx,%rax,%rbx 3278 adoxq %r12,%r12 3279 adcxq %r10,%rax 3280 movq 24(%rsi,%rcx,1),%rdx 3281 leaq 32(%rcx),%rcx 3282 movq 64(%rdi),%r10 3283 adoxq %r13,%r13 3284 adcxq %r11,%rbx 3285 movq 72(%rdi),%r11 3286 movq %rax,32(%rdi) 3287 movq %rbx,40(%rdi) 3288 3289 mulxq %rdx,%rax,%rbx 3290 adoxq %r10,%r10 3291 adcxq %r12,%rax 3292 jrcxz .Lsqrx4x_shift_n_add_break 3293.byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 3294 adoxq %r11,%r11 3295 adcxq %r13,%rbx 3296 movq 80(%rdi),%r12 3297 movq 88(%rdi),%r13 3298 movq %rax,48(%rdi) 3299 movq %rbx,56(%rdi) 3300 leaq 64(%rdi),%rdi 3301 nop 3302 jmp .Lsqrx4x_shift_n_add 3303 3304.align 32 3305.Lsqrx4x_shift_n_add_break: 3306 adcxq %r13,%rbx 3307 movq %rax,48(%rdi) 3308 movq %rbx,56(%rdi) 3309 leaq 64(%rdi),%rdi 3310.byte 102,72,15,126,213 3311__bn_sqrx8x_reduction: 3312 xorl %eax,%eax 3313 movq 32+8(%rsp),%rbx 3314 movq 48+8(%rsp),%rdx 3315 leaq -64(%rbp,%r9,1),%rcx 3316 3317 movq %rcx,0+8(%rsp) 3318 movq %rdi,8+8(%rsp) 3319 3320 leaq 48+8(%rsp),%rdi 3321 jmp .Lsqrx8x_reduction_loop 3322 3323.align 32 3324.Lsqrx8x_reduction_loop: 3325 movq 8(%rdi),%r9 3326 movq 16(%rdi),%r10 3327 movq 24(%rdi),%r11 3328 movq 32(%rdi),%r12 3329 movq %rdx,%r8 3330 imulq %rbx,%rdx 3331 movq 40(%rdi),%r13 3332 movq 48(%rdi),%r14 3333 movq 56(%rdi),%r15 3334 movq %rax,24+8(%rsp) 3335 3336 leaq 64(%rdi),%rdi 3337 xorq %rsi,%rsi 3338 movq $-8,%rcx 3339 jmp .Lsqrx8x_reduce 3340 3341.align 32 3342.Lsqrx8x_reduce: 3343 movq %r8,%rbx 3344 mulxq 0(%rbp),%rax,%r8 3345 adcxq %rbx,%rax 3346 adoxq %r9,%r8 3347 3348 mulxq 8(%rbp),%rbx,%r9 3349 adcxq %rbx,%r8 3350 adoxq %r10,%r9 3351 3352 mulxq 16(%rbp),%rbx,%r10 3353 adcxq %rbx,%r9 3354 adoxq %r11,%r10 3355 3356 mulxq 24(%rbp),%rbx,%r11 3357 adcxq %rbx,%r10 3358 adoxq %r12,%r11 3359 3360.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 3361 movq %rdx,%rax 3362 movq %r8,%rdx 3363 adcxq %rbx,%r11 3364 adoxq %r13,%r12 3365 3366 mulxq 32+8(%rsp),%rbx,%rdx 3367 movq %rax,%rdx 3368 movq %rax,64+48+8(%rsp,%rcx,8) 3369 3370 mulxq 40(%rbp),%rax,%r13 3371 adcxq %rax,%r12 3372 adoxq %r14,%r13 3373 3374 mulxq 48(%rbp),%rax,%r14 3375 adcxq %rax,%r13 3376 adoxq %r15,%r14 3377 3378 mulxq 56(%rbp),%rax,%r15 3379 movq %rbx,%rdx 3380 adcxq %rax,%r14 3381 adoxq %rsi,%r15 3382 adcxq %rsi,%r15 3383 3384.byte 0x67,0x67,0x67 3385 incq %rcx 3386 jnz .Lsqrx8x_reduce 3387 3388 movq %rsi,%rax 3389 cmpq 0+8(%rsp),%rbp 3390 jae .Lsqrx8x_no_tail 3391 3392 movq 48+8(%rsp),%rdx 3393 addq 0(%rdi),%r8 3394 leaq 64(%rbp),%rbp 3395 movq $-8,%rcx 3396 adcxq 8(%rdi),%r9 3397 adcxq 16(%rdi),%r10 3398 adcq 24(%rdi),%r11 3399 adcq 32(%rdi),%r12 3400 adcq 40(%rdi),%r13 3401 adcq 48(%rdi),%r14 3402 adcq 56(%rdi),%r15 3403 leaq 64(%rdi),%rdi 3404 sbbq %rax,%rax 3405 3406 xorq %rsi,%rsi 3407 movq %rax,16+8(%rsp) 3408 jmp .Lsqrx8x_tail 3409 3410.align 32 3411.Lsqrx8x_tail: 3412 movq %r8,%rbx 3413 mulxq 0(%rbp),%rax,%r8 3414 adcxq %rax,%rbx 3415 adoxq %r9,%r8 3416 3417 mulxq 8(%rbp),%rax,%r9 3418 adcxq %rax,%r8 3419 adoxq %r10,%r9 3420 3421 mulxq 16(%rbp),%rax,%r10 3422 adcxq %rax,%r9 3423 adoxq %r11,%r10 3424 3425 mulxq 24(%rbp),%rax,%r11 3426 adcxq %rax,%r10 3427 adoxq %r12,%r11 3428 3429.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 3430 adcxq %rax,%r11 3431 adoxq %r13,%r12 3432 3433 mulxq 40(%rbp),%rax,%r13 3434 adcxq %rax,%r12 3435 adoxq %r14,%r13 3436 3437 mulxq 48(%rbp),%rax,%r14 3438 adcxq %rax,%r13 3439 adoxq %r15,%r14 3440 3441 mulxq 56(%rbp),%rax,%r15 3442 movq 72+48+8(%rsp,%rcx,8),%rdx 3443 adcxq %rax,%r14 3444 adoxq %rsi,%r15 3445 movq %rbx,(%rdi,%rcx,8) 3446 movq %r8,%rbx 3447 adcxq %rsi,%r15 3448 3449 incq %rcx 3450 jnz .Lsqrx8x_tail 3451 3452 cmpq 0+8(%rsp),%rbp 3453 jae .Lsqrx8x_tail_done 3454 3455 subq 16+8(%rsp),%rsi 3456 movq 48+8(%rsp),%rdx 3457 leaq 64(%rbp),%rbp 3458 adcq 0(%rdi),%r8 3459 adcq 8(%rdi),%r9 3460 adcq 16(%rdi),%r10 3461 adcq 24(%rdi),%r11 3462 adcq 32(%rdi),%r12 3463 adcq 40(%rdi),%r13 3464 adcq 48(%rdi),%r14 3465 adcq 56(%rdi),%r15 3466 leaq 64(%rdi),%rdi 3467 sbbq %rax,%rax 3468 subq $8,%rcx 3469 3470 xorq %rsi,%rsi 3471 movq %rax,16+8(%rsp) 3472 jmp .Lsqrx8x_tail 3473 3474.align 32 3475.Lsqrx8x_tail_done: 3476 xorq %rax,%rax 3477 addq 24+8(%rsp),%r8 3478 adcq $0,%r9 3479 adcq $0,%r10 3480 adcq $0,%r11 3481 adcq $0,%r12 3482 adcq $0,%r13 3483 adcq $0,%r14 3484 adcq $0,%r15 3485 adcq $0,%rax 3486 3487 subq 16+8(%rsp),%rsi 3488.Lsqrx8x_no_tail: 3489 adcq 0(%rdi),%r8 3490.byte 102,72,15,126,217 3491 adcq 8(%rdi),%r9 3492 movq 56(%rbp),%rsi 3493.byte 102,72,15,126,213 3494 adcq 16(%rdi),%r10 3495 adcq 24(%rdi),%r11 3496 adcq 32(%rdi),%r12 3497 adcq 40(%rdi),%r13 3498 adcq 48(%rdi),%r14 3499 adcq 56(%rdi),%r15 3500 adcq $0,%rax 3501 3502 movq 32+8(%rsp),%rbx 3503 movq 64(%rdi,%rcx,1),%rdx 3504 3505 movq %r8,0(%rdi) 3506 leaq 64(%rdi),%r8 3507 movq %r9,8(%rdi) 3508 movq %r10,16(%rdi) 3509 movq %r11,24(%rdi) 3510 movq %r12,32(%rdi) 3511 movq %r13,40(%rdi) 3512 movq %r14,48(%rdi) 3513 movq %r15,56(%rdi) 3514 3515 leaq 64(%rdi,%rcx,1),%rdi 3516 cmpq 8+8(%rsp),%r8 3517 jb .Lsqrx8x_reduction_loop 3518 .byte 0xf3,0xc3 3519.cfi_endproc 3520.size bn_sqrx8x_internal,.-bn_sqrx8x_internal 3521.align 32 3522__bn_postx4x_internal: 3523.cfi_startproc 3524 movq 0(%rbp),%r12 3525 movq %rcx,%r10 3526 movq %rcx,%r9 3527 negq %rax 3528 sarq $3+2,%rcx 3529 3530.byte 102,72,15,126,202 3531.byte 102,72,15,126,206 3532 decq %r12 3533 movq 8(%rbp),%r13 3534 xorq %r8,%r8 3535 movq 16(%rbp),%r14 3536 movq 24(%rbp),%r15 3537 jmp .Lsqrx4x_sub_entry 3538 3539.align 16 3540.Lsqrx4x_sub: 3541 movq 0(%rbp),%r12 3542 movq 8(%rbp),%r13 3543 movq 16(%rbp),%r14 3544 movq 24(%rbp),%r15 3545.Lsqrx4x_sub_entry: 3546 andnq %rax,%r12,%r12 3547 leaq 32(%rbp),%rbp 3548 andnq %rax,%r13,%r13 3549 andnq %rax,%r14,%r14 3550 andnq %rax,%r15,%r15 3551 3552 negq %r8 3553 adcq 0(%rdi),%r12 3554 adcq 8(%rdi),%r13 3555 adcq 16(%rdi),%r14 3556 adcq 24(%rdi),%r15 3557 movq %r12,0(%rdx) 3558 leaq 32(%rdi),%rdi 3559 movq %r13,8(%rdx) 3560 sbbq %r8,%r8 3561 movq %r14,16(%rdx) 3562 movq %r15,24(%rdx) 3563 leaq 32(%rdx),%rdx 3564 3565 incq %rcx 3566 jnz .Lsqrx4x_sub 3567 3568 negq %r9 3569 3570 .byte 0xf3,0xc3 3571.cfi_endproc 3572.size __bn_postx4x_internal,.-__bn_postx4x_internal 3573.globl bn_get_bits5 3574.type bn_get_bits5,@function 3575.align 16 3576bn_get_bits5: 3577.cfi_startproc 3578 leaq 0(%rdi),%r10 3579 leaq 1(%rdi),%r11 3580 movl %esi,%ecx 3581 shrl $4,%esi 3582 andl $15,%ecx 3583 leal -8(%rcx),%eax 3584 cmpl $11,%ecx 3585 cmovaq %r11,%r10 3586 cmoval %eax,%ecx 3587 movzwl (%r10,%rsi,2),%eax 3588 shrl %cl,%eax 3589 andl $31,%eax 3590 .byte 0xf3,0xc3 3591.cfi_endproc 3592.size bn_get_bits5,.-bn_get_bits5 3593 3594.globl bn_scatter5 3595.type bn_scatter5,@function 3596.align 16 3597bn_scatter5: 3598.cfi_startproc 3599 cmpl $0,%esi 3600 jz .Lscatter_epilogue 3601 leaq (%rdx,%rcx,8),%rdx 3602.Lscatter: 3603 movq (%rdi),%rax 3604 leaq 8(%rdi),%rdi 3605 movq %rax,(%rdx) 3606 leaq 256(%rdx),%rdx 3607 subl $1,%esi 3608 jnz .Lscatter 3609.Lscatter_epilogue: 3610 .byte 0xf3,0xc3 3611.cfi_endproc 3612.size bn_scatter5,.-bn_scatter5 3613 3614.globl bn_gather5 3615.type bn_gather5,@function 3616.align 32 3617bn_gather5: 3618.LSEH_begin_bn_gather5: 3619.cfi_startproc 3620 3621.byte 0x4c,0x8d,0x14,0x24 3622.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 3623 leaq .Linc(%rip),%rax 3624 andq $-16,%rsp 3625 3626 movd %ecx,%xmm5 3627 movdqa 0(%rax),%xmm0 3628 movdqa 16(%rax),%xmm1 3629 leaq 128(%rdx),%r11 3630 leaq 128(%rsp),%rax 3631 3632 pshufd $0,%xmm5,%xmm5 3633 movdqa %xmm1,%xmm4 3634 movdqa %xmm1,%xmm2 3635 paddd %xmm0,%xmm1 3636 pcmpeqd %xmm5,%xmm0 3637 movdqa %xmm4,%xmm3 3638 3639 paddd %xmm1,%xmm2 3640 pcmpeqd %xmm5,%xmm1 3641 movdqa %xmm0,-128(%rax) 3642 movdqa %xmm4,%xmm0 3643 3644 paddd %xmm2,%xmm3 3645 pcmpeqd %xmm5,%xmm2 3646 movdqa %xmm1,-112(%rax) 3647 movdqa %xmm4,%xmm1 3648 3649 paddd %xmm3,%xmm0 3650 pcmpeqd %xmm5,%xmm3 3651 movdqa %xmm2,-96(%rax) 3652 movdqa %xmm4,%xmm2 3653 paddd %xmm0,%xmm1 3654 pcmpeqd %xmm5,%xmm0 3655 movdqa %xmm3,-80(%rax) 3656 movdqa %xmm4,%xmm3 3657 3658 paddd %xmm1,%xmm2 3659 pcmpeqd %xmm5,%xmm1 3660 movdqa %xmm0,-64(%rax) 3661 movdqa %xmm4,%xmm0 3662 3663 paddd %xmm2,%xmm3 3664 pcmpeqd %xmm5,%xmm2 3665 movdqa %xmm1,-48(%rax) 3666 movdqa %xmm4,%xmm1 3667 3668 paddd %xmm3,%xmm0 3669 pcmpeqd %xmm5,%xmm3 3670 movdqa %xmm2,-32(%rax) 3671 movdqa %xmm4,%xmm2 3672 paddd %xmm0,%xmm1 3673 pcmpeqd %xmm5,%xmm0 3674 movdqa %xmm3,-16(%rax) 3675 movdqa %xmm4,%xmm3 3676 3677 paddd %xmm1,%xmm2 3678 pcmpeqd %xmm5,%xmm1 3679 movdqa %xmm0,0(%rax) 3680 movdqa %xmm4,%xmm0 3681 3682 paddd %xmm2,%xmm3 3683 pcmpeqd %xmm5,%xmm2 3684 movdqa %xmm1,16(%rax) 3685 movdqa %xmm4,%xmm1 3686 3687 paddd %xmm3,%xmm0 3688 pcmpeqd %xmm5,%xmm3 3689 movdqa %xmm2,32(%rax) 3690 movdqa %xmm4,%xmm2 3691 paddd %xmm0,%xmm1 3692 pcmpeqd %xmm5,%xmm0 3693 movdqa %xmm3,48(%rax) 3694 movdqa %xmm4,%xmm3 3695 3696 paddd %xmm1,%xmm2 3697 pcmpeqd %xmm5,%xmm1 3698 movdqa %xmm0,64(%rax) 3699 movdqa %xmm4,%xmm0 3700 3701 paddd %xmm2,%xmm3 3702 pcmpeqd %xmm5,%xmm2 3703 movdqa %xmm1,80(%rax) 3704 movdqa %xmm4,%xmm1 3705 3706 paddd %xmm3,%xmm0 3707 pcmpeqd %xmm5,%xmm3 3708 movdqa %xmm2,96(%rax) 3709 movdqa %xmm4,%xmm2 3710 movdqa %xmm3,112(%rax) 3711 jmp .Lgather 3712 3713.align 32 3714.Lgather: 3715 pxor %xmm4,%xmm4 3716 pxor %xmm5,%xmm5 3717 movdqa -128(%r11),%xmm0 3718 movdqa -112(%r11),%xmm1 3719 movdqa -96(%r11),%xmm2 3720 pand -128(%rax),%xmm0 3721 movdqa -80(%r11),%xmm3 3722 pand -112(%rax),%xmm1 3723 por %xmm0,%xmm4 3724 pand -96(%rax),%xmm2 3725 por %xmm1,%xmm5 3726 pand -80(%rax),%xmm3 3727 por %xmm2,%xmm4 3728 por %xmm3,%xmm5 3729 movdqa -64(%r11),%xmm0 3730 movdqa -48(%r11),%xmm1 3731 movdqa -32(%r11),%xmm2 3732 pand -64(%rax),%xmm0 3733 movdqa -16(%r11),%xmm3 3734 pand -48(%rax),%xmm1 3735 por %xmm0,%xmm4 3736 pand -32(%rax),%xmm2 3737 por %xmm1,%xmm5 3738 pand -16(%rax),%xmm3 3739 por %xmm2,%xmm4 3740 por %xmm3,%xmm5 3741 movdqa 0(%r11),%xmm0 3742 movdqa 16(%r11),%xmm1 3743 movdqa 32(%r11),%xmm2 3744 pand 0(%rax),%xmm0 3745 movdqa 48(%r11),%xmm3 3746 pand 16(%rax),%xmm1 3747 por %xmm0,%xmm4 3748 pand 32(%rax),%xmm2 3749 por %xmm1,%xmm5 3750 pand 48(%rax),%xmm3 3751 por %xmm2,%xmm4 3752 por %xmm3,%xmm5 3753 movdqa 64(%r11),%xmm0 3754 movdqa 80(%r11),%xmm1 3755 movdqa 96(%r11),%xmm2 3756 pand 64(%rax),%xmm0 3757 movdqa 112(%r11),%xmm3 3758 pand 80(%rax),%xmm1 3759 por %xmm0,%xmm4 3760 pand 96(%rax),%xmm2 3761 por %xmm1,%xmm5 3762 pand 112(%rax),%xmm3 3763 por %xmm2,%xmm4 3764 por %xmm3,%xmm5 3765 por %xmm5,%xmm4 3766 leaq 256(%r11),%r11 3767 pshufd $0x4e,%xmm4,%xmm0 3768 por %xmm4,%xmm0 3769 movq %xmm0,(%rdi) 3770 leaq 8(%rdi),%rdi 3771 subl $1,%esi 3772 jnz .Lgather 3773 3774 leaq (%r10),%rsp 3775 .byte 0xf3,0xc3 3776.LSEH_end_bn_gather5: 3777.cfi_endproc 3778.size bn_gather5,.-bn_gather5 3779.align 64 3780.Linc: 3781.long 0,0, 1,1 3782.long 2,2, 2,2 3783.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 3784