/* x86_64-mont5.S, revision 1.3 — machine-generated by the OpenSSL perlasm script x86_64-mont5.pl; do not edit by hand. */
1#include <machine/asm.h> 2.text 3 4 5 6.globl bn_mul_mont_gather5 7.type bn_mul_mont_gather5,@function 8.align 64 9bn_mul_mont_gather5: 10 movl %r9d,%r9d 11 movq %rsp,%rax 12 testl $7,%r9d 13 jnz .Lmul_enter 14 jmp .Lmul4x_enter 15 16.align 16 17.Lmul_enter: 18 movd 8(%rsp),%xmm5 19 pushq %rbx 20 pushq %rbp 21 pushq %r12 22 pushq %r13 23 pushq %r14 24 pushq %r15 25 26 negq %r9 27 movq %rsp,%r11 28 leaq -280(%rsp,%r9,8),%r10 29 negq %r9 30 andq $-1024,%r10 31 32 33 34 35 36 37 38 subq %r10,%r11 39 andq $-4096,%r11 40 leaq (%r10,%r11,1),%rsp 41 movq (%rsp),%r11 42 cmpq %r10,%rsp 43 ja .Lmul_page_walk 44 jmp .Lmul_page_walk_done 45 46.Lmul_page_walk: 47 leaq -4096(%rsp),%rsp 48 movq (%rsp),%r11 49 cmpq %r10,%rsp 50 ja .Lmul_page_walk 51.Lmul_page_walk_done: 52 53 leaq .Linc(%rip),%r10 54 movq %rax,8(%rsp,%r9,8) 55.Lmul_body: 56 57 leaq 128(%rdx),%r12 58 movdqa 0(%r10),%xmm0 59 movdqa 16(%r10),%xmm1 60 leaq 24-112(%rsp,%r9,8),%r10 61 andq $-16,%r10 62 63 pshufd $0,%xmm5,%xmm5 64 movdqa %xmm1,%xmm4 65 movdqa %xmm1,%xmm2 66 paddd %xmm0,%xmm1 67 pcmpeqd %xmm5,%xmm0 68.byte 0x67 69 movdqa %xmm4,%xmm3 70 paddd %xmm1,%xmm2 71 pcmpeqd %xmm5,%xmm1 72 movdqa %xmm0,112(%r10) 73 movdqa %xmm4,%xmm0 74 75 paddd %xmm2,%xmm3 76 pcmpeqd %xmm5,%xmm2 77 movdqa %xmm1,128(%r10) 78 movdqa %xmm4,%xmm1 79 80 paddd %xmm3,%xmm0 81 pcmpeqd %xmm5,%xmm3 82 movdqa %xmm2,144(%r10) 83 movdqa %xmm4,%xmm2 84 85 paddd %xmm0,%xmm1 86 pcmpeqd %xmm5,%xmm0 87 movdqa %xmm3,160(%r10) 88 movdqa %xmm4,%xmm3 89 paddd %xmm1,%xmm2 90 pcmpeqd %xmm5,%xmm1 91 movdqa %xmm0,176(%r10) 92 movdqa %xmm4,%xmm0 93 94 paddd %xmm2,%xmm3 95 pcmpeqd %xmm5,%xmm2 96 movdqa %xmm1,192(%r10) 97 movdqa %xmm4,%xmm1 98 99 paddd %xmm3,%xmm0 100 pcmpeqd %xmm5,%xmm3 101 movdqa %xmm2,208(%r10) 102 movdqa %xmm4,%xmm2 103 104 paddd %xmm0,%xmm1 105 pcmpeqd %xmm5,%xmm0 106 movdqa %xmm3,224(%r10) 107 movdqa %xmm4,%xmm3 108 paddd %xmm1,%xmm2 109 pcmpeqd %xmm5,%xmm1 110 movdqa %xmm0,240(%r10) 111 movdqa %xmm4,%xmm0 112 113 paddd %xmm2,%xmm3 114 
pcmpeqd %xmm5,%xmm2 115 movdqa %xmm1,256(%r10) 116 movdqa %xmm4,%xmm1 117 118 paddd %xmm3,%xmm0 119 pcmpeqd %xmm5,%xmm3 120 movdqa %xmm2,272(%r10) 121 movdqa %xmm4,%xmm2 122 123 paddd %xmm0,%xmm1 124 pcmpeqd %xmm5,%xmm0 125 movdqa %xmm3,288(%r10) 126 movdqa %xmm4,%xmm3 127 paddd %xmm1,%xmm2 128 pcmpeqd %xmm5,%xmm1 129 movdqa %xmm0,304(%r10) 130 131 paddd %xmm2,%xmm3 132.byte 0x67 133 pcmpeqd %xmm5,%xmm2 134 movdqa %xmm1,320(%r10) 135 136 pcmpeqd %xmm5,%xmm3 137 movdqa %xmm2,336(%r10) 138 pand 64(%r12),%xmm0 139 140 pand 80(%r12),%xmm1 141 pand 96(%r12),%xmm2 142 movdqa %xmm3,352(%r10) 143 pand 112(%r12),%xmm3 144 por %xmm2,%xmm0 145 por %xmm3,%xmm1 146 movdqa -128(%r12),%xmm4 147 movdqa -112(%r12),%xmm5 148 movdqa -96(%r12),%xmm2 149 pand 112(%r10),%xmm4 150 movdqa -80(%r12),%xmm3 151 pand 128(%r10),%xmm5 152 por %xmm4,%xmm0 153 pand 144(%r10),%xmm2 154 por %xmm5,%xmm1 155 pand 160(%r10),%xmm3 156 por %xmm2,%xmm0 157 por %xmm3,%xmm1 158 movdqa -64(%r12),%xmm4 159 movdqa -48(%r12),%xmm5 160 movdqa -32(%r12),%xmm2 161 pand 176(%r10),%xmm4 162 movdqa -16(%r12),%xmm3 163 pand 192(%r10),%xmm5 164 por %xmm4,%xmm0 165 pand 208(%r10),%xmm2 166 por %xmm5,%xmm1 167 pand 224(%r10),%xmm3 168 por %xmm2,%xmm0 169 por %xmm3,%xmm1 170 movdqa 0(%r12),%xmm4 171 movdqa 16(%r12),%xmm5 172 movdqa 32(%r12),%xmm2 173 pand 240(%r10),%xmm4 174 movdqa 48(%r12),%xmm3 175 pand 256(%r10),%xmm5 176 por %xmm4,%xmm0 177 pand 272(%r10),%xmm2 178 por %xmm5,%xmm1 179 pand 288(%r10),%xmm3 180 por %xmm2,%xmm0 181 por %xmm3,%xmm1 182 por %xmm1,%xmm0 183 pshufd $0x4e,%xmm0,%xmm1 184 por %xmm1,%xmm0 185 leaq 256(%r12),%r12 186.byte 102,72,15,126,195 187 188 movq (%r8),%r8 189 movq (%rsi),%rax 190 191 xorq %r14,%r14 192 xorq %r15,%r15 193 194 movq %r8,%rbp 195 mulq %rbx 196 movq %rax,%r10 197 movq (%rcx),%rax 198 199 imulq %r10,%rbp 200 movq %rdx,%r11 201 202 mulq %rbp 203 addq %rax,%r10 204 movq 8(%rsi),%rax 205 adcq $0,%rdx 206 movq %rdx,%r13 207 208 leaq 1(%r15),%r15 209 jmp .L1st_enter 210 211.align 
16 212.L1st: 213 addq %rax,%r13 214 movq (%rsi,%r15,8),%rax 215 adcq $0,%rdx 216 addq %r11,%r13 217 movq %r10,%r11 218 adcq $0,%rdx 219 movq %r13,-16(%rsp,%r15,8) 220 movq %rdx,%r13 221 222.L1st_enter: 223 mulq %rbx 224 addq %rax,%r11 225 movq (%rcx,%r15,8),%rax 226 adcq $0,%rdx 227 leaq 1(%r15),%r15 228 movq %rdx,%r10 229 230 mulq %rbp 231 cmpq %r9,%r15 232 jne .L1st 233 234 235 addq %rax,%r13 236 adcq $0,%rdx 237 addq %r11,%r13 238 adcq $0,%rdx 239 movq %r13,-16(%rsp,%r9,8) 240 movq %rdx,%r13 241 movq %r10,%r11 242 243 xorq %rdx,%rdx 244 addq %r11,%r13 245 adcq $0,%rdx 246 movq %r13,-8(%rsp,%r9,8) 247 movq %rdx,(%rsp,%r9,8) 248 249 leaq 1(%r14),%r14 250 jmp .Louter 251.align 16 252.Louter: 253 leaq 24+128(%rsp,%r9,8),%rdx 254 andq $-16,%rdx 255 pxor %xmm4,%xmm4 256 pxor %xmm5,%xmm5 257 movdqa -128(%r12),%xmm0 258 movdqa -112(%r12),%xmm1 259 movdqa -96(%r12),%xmm2 260 movdqa -80(%r12),%xmm3 261 pand -128(%rdx),%xmm0 262 pand -112(%rdx),%xmm1 263 por %xmm0,%xmm4 264 pand -96(%rdx),%xmm2 265 por %xmm1,%xmm5 266 pand -80(%rdx),%xmm3 267 por %xmm2,%xmm4 268 por %xmm3,%xmm5 269 movdqa -64(%r12),%xmm0 270 movdqa -48(%r12),%xmm1 271 movdqa -32(%r12),%xmm2 272 movdqa -16(%r12),%xmm3 273 pand -64(%rdx),%xmm0 274 pand -48(%rdx),%xmm1 275 por %xmm0,%xmm4 276 pand -32(%rdx),%xmm2 277 por %xmm1,%xmm5 278 pand -16(%rdx),%xmm3 279 por %xmm2,%xmm4 280 por %xmm3,%xmm5 281 movdqa 0(%r12),%xmm0 282 movdqa 16(%r12),%xmm1 283 movdqa 32(%r12),%xmm2 284 movdqa 48(%r12),%xmm3 285 pand 0(%rdx),%xmm0 286 pand 16(%rdx),%xmm1 287 por %xmm0,%xmm4 288 pand 32(%rdx),%xmm2 289 por %xmm1,%xmm5 290 pand 48(%rdx),%xmm3 291 por %xmm2,%xmm4 292 por %xmm3,%xmm5 293 movdqa 64(%r12),%xmm0 294 movdqa 80(%r12),%xmm1 295 movdqa 96(%r12),%xmm2 296 movdqa 112(%r12),%xmm3 297 pand 64(%rdx),%xmm0 298 pand 80(%rdx),%xmm1 299 por %xmm0,%xmm4 300 pand 96(%rdx),%xmm2 301 por %xmm1,%xmm5 302 pand 112(%rdx),%xmm3 303 por %xmm2,%xmm4 304 por %xmm3,%xmm5 305 por %xmm5,%xmm4 306 pshufd $0x4e,%xmm4,%xmm0 307 por 
%xmm4,%xmm0 308 leaq 256(%r12),%r12 309 310 movq (%rsi),%rax 311.byte 102,72,15,126,195 312 313 xorq %r15,%r15 314 movq %r8,%rbp 315 movq (%rsp),%r10 316 317 mulq %rbx 318 addq %rax,%r10 319 movq (%rcx),%rax 320 adcq $0,%rdx 321 322 imulq %r10,%rbp 323 movq %rdx,%r11 324 325 mulq %rbp 326 addq %rax,%r10 327 movq 8(%rsi),%rax 328 adcq $0,%rdx 329 movq 8(%rsp),%r10 330 movq %rdx,%r13 331 332 leaq 1(%r15),%r15 333 jmp .Linner_enter 334 335.align 16 336.Linner: 337 addq %rax,%r13 338 movq (%rsi,%r15,8),%rax 339 adcq $0,%rdx 340 addq %r10,%r13 341 movq (%rsp,%r15,8),%r10 342 adcq $0,%rdx 343 movq %r13,-16(%rsp,%r15,8) 344 movq %rdx,%r13 345 346.Linner_enter: 347 mulq %rbx 348 addq %rax,%r11 349 movq (%rcx,%r15,8),%rax 350 adcq $0,%rdx 351 addq %r11,%r10 352 movq %rdx,%r11 353 adcq $0,%r11 354 leaq 1(%r15),%r15 355 356 mulq %rbp 357 cmpq %r9,%r15 358 jne .Linner 359 360 addq %rax,%r13 361 adcq $0,%rdx 362 addq %r10,%r13 363 movq (%rsp,%r9,8),%r10 364 adcq $0,%rdx 365 movq %r13,-16(%rsp,%r9,8) 366 movq %rdx,%r13 367 368 xorq %rdx,%rdx 369 addq %r11,%r13 370 adcq $0,%rdx 371 addq %r10,%r13 372 adcq $0,%rdx 373 movq %r13,-8(%rsp,%r9,8) 374 movq %rdx,(%rsp,%r9,8) 375 376 leaq 1(%r14),%r14 377 cmpq %r9,%r14 378 jb .Louter 379 380 xorq %r14,%r14 381 movq (%rsp),%rax 382 leaq (%rsp),%rsi 383 movq %r9,%r15 384 jmp .Lsub 385.align 16 386.Lsub: sbbq (%rcx,%r14,8),%rax 387 movq %rax,(%rdi,%r14,8) 388 movq 8(%rsi,%r14,8),%rax 389 leaq 1(%r14),%r14 390 decq %r15 391 jnz .Lsub 392 393 sbbq $0,%rax 394 xorq %r14,%r14 395 andq %rax,%rsi 396 notq %rax 397 movq %rdi,%rcx 398 andq %rax,%rcx 399 movq %r9,%r15 400 orq %rcx,%rsi 401.align 16 402.Lcopy: 403 movq (%rsi,%r14,8),%rax 404 movq %r14,(%rsp,%r14,8) 405 movq %rax,(%rdi,%r14,8) 406 leaq 1(%r14),%r14 407 subq $1,%r15 408 jnz .Lcopy 409 410 movq 8(%rsp,%r9,8),%rsi 411 movq $1,%rax 412 413 movq -48(%rsi),%r15 414 movq -40(%rsi),%r14 415 movq -32(%rsi),%r13 416 movq -24(%rsi),%r12 417 movq -16(%rsi),%rbp 418 movq -8(%rsi),%rbx 419 leaq 
(%rsi),%rsp 420.Lmul_epilogue: 421 .byte 0xf3,0xc3 422.size bn_mul_mont_gather5,.-bn_mul_mont_gather5 423.type bn_mul4x_mont_gather5,@function 424.align 32 425bn_mul4x_mont_gather5: 426.byte 0x67 427 movq %rsp,%rax 428.Lmul4x_enter: 429 pushq %rbx 430 pushq %rbp 431 pushq %r12 432 pushq %r13 433 pushq %r14 434 pushq %r15 435.Lmul4x_prologue: 436 437.byte 0x67 438 shll $3,%r9d 439 leaq (%r9,%r9,2),%r10 440 negq %r9 441 442 443 444 445 446 447 448 449 450 451 leaq -320(%rsp,%r9,2),%r11 452 movq %rsp,%rbp 453 subq %rdi,%r11 454 andq $4095,%r11 455 cmpq %r11,%r10 456 jb .Lmul4xsp_alt 457 subq %r11,%rbp 458 leaq -320(%rbp,%r9,2),%rbp 459 jmp .Lmul4xsp_done 460 461.align 32 462.Lmul4xsp_alt: 463 leaq 4096-320(,%r9,2),%r10 464 leaq -320(%rbp,%r9,2),%rbp 465 subq %r10,%r11 466 movq $0,%r10 467 cmovcq %r10,%r11 468 subq %r11,%rbp 469.Lmul4xsp_done: 470 andq $-64,%rbp 471 movq %rsp,%r11 472 subq %rbp,%r11 473 andq $-4096,%r11 474 leaq (%r11,%rbp,1),%rsp 475 movq (%rsp),%r10 476 cmpq %rbp,%rsp 477 ja .Lmul4x_page_walk 478 jmp .Lmul4x_page_walk_done 479 480.Lmul4x_page_walk: 481 leaq -4096(%rsp),%rsp 482 movq (%rsp),%r10 483 cmpq %rbp,%rsp 484 ja .Lmul4x_page_walk 485.Lmul4x_page_walk_done: 486 487 negq %r9 488 489 movq %rax,40(%rsp) 490.Lmul4x_body: 491 492 call mul4x_internal 493 494 movq 40(%rsp),%rsi 495 movq $1,%rax 496 497 movq -48(%rsi),%r15 498 movq -40(%rsi),%r14 499 movq -32(%rsi),%r13 500 movq -24(%rsi),%r12 501 movq -16(%rsi),%rbp 502 movq -8(%rsi),%rbx 503 leaq (%rsi),%rsp 504.Lmul4x_epilogue: 505 .byte 0xf3,0xc3 506.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 507 508.type mul4x_internal,@function 509.align 32 510mul4x_internal: 511 shlq $5,%r9 512 movd 8(%rax),%xmm5 513 leaq .Linc(%rip),%rax 514 leaq 128(%rdx,%r9,1),%r13 515 shrq $5,%r9 516 movdqa 0(%rax),%xmm0 517 movdqa 16(%rax),%xmm1 518 leaq 88-112(%rsp,%r9,1),%r10 519 leaq 128(%rdx),%r12 520 521 pshufd $0,%xmm5,%xmm5 522 movdqa %xmm1,%xmm4 523.byte 0x67,0x67 524 movdqa %xmm1,%xmm2 525 paddd 
%xmm0,%xmm1 526 pcmpeqd %xmm5,%xmm0 527.byte 0x67 528 movdqa %xmm4,%xmm3 529 paddd %xmm1,%xmm2 530 pcmpeqd %xmm5,%xmm1 531 movdqa %xmm0,112(%r10) 532 movdqa %xmm4,%xmm0 533 534 paddd %xmm2,%xmm3 535 pcmpeqd %xmm5,%xmm2 536 movdqa %xmm1,128(%r10) 537 movdqa %xmm4,%xmm1 538 539 paddd %xmm3,%xmm0 540 pcmpeqd %xmm5,%xmm3 541 movdqa %xmm2,144(%r10) 542 movdqa %xmm4,%xmm2 543 544 paddd %xmm0,%xmm1 545 pcmpeqd %xmm5,%xmm0 546 movdqa %xmm3,160(%r10) 547 movdqa %xmm4,%xmm3 548 paddd %xmm1,%xmm2 549 pcmpeqd %xmm5,%xmm1 550 movdqa %xmm0,176(%r10) 551 movdqa %xmm4,%xmm0 552 553 paddd %xmm2,%xmm3 554 pcmpeqd %xmm5,%xmm2 555 movdqa %xmm1,192(%r10) 556 movdqa %xmm4,%xmm1 557 558 paddd %xmm3,%xmm0 559 pcmpeqd %xmm5,%xmm3 560 movdqa %xmm2,208(%r10) 561 movdqa %xmm4,%xmm2 562 563 paddd %xmm0,%xmm1 564 pcmpeqd %xmm5,%xmm0 565 movdqa %xmm3,224(%r10) 566 movdqa %xmm4,%xmm3 567 paddd %xmm1,%xmm2 568 pcmpeqd %xmm5,%xmm1 569 movdqa %xmm0,240(%r10) 570 movdqa %xmm4,%xmm0 571 572 paddd %xmm2,%xmm3 573 pcmpeqd %xmm5,%xmm2 574 movdqa %xmm1,256(%r10) 575 movdqa %xmm4,%xmm1 576 577 paddd %xmm3,%xmm0 578 pcmpeqd %xmm5,%xmm3 579 movdqa %xmm2,272(%r10) 580 movdqa %xmm4,%xmm2 581 582 paddd %xmm0,%xmm1 583 pcmpeqd %xmm5,%xmm0 584 movdqa %xmm3,288(%r10) 585 movdqa %xmm4,%xmm3 586 paddd %xmm1,%xmm2 587 pcmpeqd %xmm5,%xmm1 588 movdqa %xmm0,304(%r10) 589 590 paddd %xmm2,%xmm3 591.byte 0x67 592 pcmpeqd %xmm5,%xmm2 593 movdqa %xmm1,320(%r10) 594 595 pcmpeqd %xmm5,%xmm3 596 movdqa %xmm2,336(%r10) 597 pand 64(%r12),%xmm0 598 599 pand 80(%r12),%xmm1 600 pand 96(%r12),%xmm2 601 movdqa %xmm3,352(%r10) 602 pand 112(%r12),%xmm3 603 por %xmm2,%xmm0 604 por %xmm3,%xmm1 605 movdqa -128(%r12),%xmm4 606 movdqa -112(%r12),%xmm5 607 movdqa -96(%r12),%xmm2 608 pand 112(%r10),%xmm4 609 movdqa -80(%r12),%xmm3 610 pand 128(%r10),%xmm5 611 por %xmm4,%xmm0 612 pand 144(%r10),%xmm2 613 por %xmm5,%xmm1 614 pand 160(%r10),%xmm3 615 por %xmm2,%xmm0 616 por %xmm3,%xmm1 617 movdqa -64(%r12),%xmm4 618 movdqa -48(%r12),%xmm5 619 
movdqa -32(%r12),%xmm2 620 pand 176(%r10),%xmm4 621 movdqa -16(%r12),%xmm3 622 pand 192(%r10),%xmm5 623 por %xmm4,%xmm0 624 pand 208(%r10),%xmm2 625 por %xmm5,%xmm1 626 pand 224(%r10),%xmm3 627 por %xmm2,%xmm0 628 por %xmm3,%xmm1 629 movdqa 0(%r12),%xmm4 630 movdqa 16(%r12),%xmm5 631 movdqa 32(%r12),%xmm2 632 pand 240(%r10),%xmm4 633 movdqa 48(%r12),%xmm3 634 pand 256(%r10),%xmm5 635 por %xmm4,%xmm0 636 pand 272(%r10),%xmm2 637 por %xmm5,%xmm1 638 pand 288(%r10),%xmm3 639 por %xmm2,%xmm0 640 por %xmm3,%xmm1 641 por %xmm1,%xmm0 642 pshufd $0x4e,%xmm0,%xmm1 643 por %xmm1,%xmm0 644 leaq 256(%r12),%r12 645.byte 102,72,15,126,195 646 647 movq %r13,16+8(%rsp) 648 movq %rdi,56+8(%rsp) 649 650 movq (%r8),%r8 651 movq (%rsi),%rax 652 leaq (%rsi,%r9,1),%rsi 653 negq %r9 654 655 movq %r8,%rbp 656 mulq %rbx 657 movq %rax,%r10 658 movq (%rcx),%rax 659 660 imulq %r10,%rbp 661 leaq 64+8(%rsp),%r14 662 movq %rdx,%r11 663 664 mulq %rbp 665 addq %rax,%r10 666 movq 8(%rsi,%r9,1),%rax 667 adcq $0,%rdx 668 movq %rdx,%rdi 669 670 mulq %rbx 671 addq %rax,%r11 672 movq 8(%rcx),%rax 673 adcq $0,%rdx 674 movq %rdx,%r10 675 676 mulq %rbp 677 addq %rax,%rdi 678 movq 16(%rsi,%r9,1),%rax 679 adcq $0,%rdx 680 addq %r11,%rdi 681 leaq 32(%r9),%r15 682 leaq 32(%rcx),%rcx 683 adcq $0,%rdx 684 movq %rdi,(%r14) 685 movq %rdx,%r13 686 jmp .L1st4x 687 688.align 32 689.L1st4x: 690 mulq %rbx 691 addq %rax,%r10 692 movq -16(%rcx),%rax 693 leaq 32(%r14),%r14 694 adcq $0,%rdx 695 movq %rdx,%r11 696 697 mulq %rbp 698 addq %rax,%r13 699 movq -8(%rsi,%r15,1),%rax 700 adcq $0,%rdx 701 addq %r10,%r13 702 adcq $0,%rdx 703 movq %r13,-24(%r14) 704 movq %rdx,%rdi 705 706 mulq %rbx 707 addq %rax,%r11 708 movq -8(%rcx),%rax 709 adcq $0,%rdx 710 movq %rdx,%r10 711 712 mulq %rbp 713 addq %rax,%rdi 714 movq (%rsi,%r15,1),%rax 715 adcq $0,%rdx 716 addq %r11,%rdi 717 adcq $0,%rdx 718 movq %rdi,-16(%r14) 719 movq %rdx,%r13 720 721 mulq %rbx 722 addq %rax,%r10 723 movq 0(%rcx),%rax 724 adcq $0,%rdx 725 movq %rdx,%r11 726 727 
mulq %rbp 728 addq %rax,%r13 729 movq 8(%rsi,%r15,1),%rax 730 adcq $0,%rdx 731 addq %r10,%r13 732 adcq $0,%rdx 733 movq %r13,-8(%r14) 734 movq %rdx,%rdi 735 736 mulq %rbx 737 addq %rax,%r11 738 movq 8(%rcx),%rax 739 adcq $0,%rdx 740 movq %rdx,%r10 741 742 mulq %rbp 743 addq %rax,%rdi 744 movq 16(%rsi,%r15,1),%rax 745 adcq $0,%rdx 746 addq %r11,%rdi 747 leaq 32(%rcx),%rcx 748 adcq $0,%rdx 749 movq %rdi,(%r14) 750 movq %rdx,%r13 751 752 addq $32,%r15 753 jnz .L1st4x 754 755 mulq %rbx 756 addq %rax,%r10 757 movq -16(%rcx),%rax 758 leaq 32(%r14),%r14 759 adcq $0,%rdx 760 movq %rdx,%r11 761 762 mulq %rbp 763 addq %rax,%r13 764 movq -8(%rsi),%rax 765 adcq $0,%rdx 766 addq %r10,%r13 767 adcq $0,%rdx 768 movq %r13,-24(%r14) 769 movq %rdx,%rdi 770 771 mulq %rbx 772 addq %rax,%r11 773 movq -8(%rcx),%rax 774 adcq $0,%rdx 775 movq %rdx,%r10 776 777 mulq %rbp 778 addq %rax,%rdi 779 movq (%rsi,%r9,1),%rax 780 adcq $0,%rdx 781 addq %r11,%rdi 782 adcq $0,%rdx 783 movq %rdi,-16(%r14) 784 movq %rdx,%r13 785 786 leaq (%rcx,%r9,1),%rcx 787 788 xorq %rdi,%rdi 789 addq %r10,%r13 790 adcq $0,%rdi 791 movq %r13,-8(%r14) 792 793 jmp .Louter4x 794 795.align 32 796.Louter4x: 797 leaq 16+128(%r14),%rdx 798 pxor %xmm4,%xmm4 799 pxor %xmm5,%xmm5 800 movdqa -128(%r12),%xmm0 801 movdqa -112(%r12),%xmm1 802 movdqa -96(%r12),%xmm2 803 movdqa -80(%r12),%xmm3 804 pand -128(%rdx),%xmm0 805 pand -112(%rdx),%xmm1 806 por %xmm0,%xmm4 807 pand -96(%rdx),%xmm2 808 por %xmm1,%xmm5 809 pand -80(%rdx),%xmm3 810 por %xmm2,%xmm4 811 por %xmm3,%xmm5 812 movdqa -64(%r12),%xmm0 813 movdqa -48(%r12),%xmm1 814 movdqa -32(%r12),%xmm2 815 movdqa -16(%r12),%xmm3 816 pand -64(%rdx),%xmm0 817 pand -48(%rdx),%xmm1 818 por %xmm0,%xmm4 819 pand -32(%rdx),%xmm2 820 por %xmm1,%xmm5 821 pand -16(%rdx),%xmm3 822 por %xmm2,%xmm4 823 por %xmm3,%xmm5 824 movdqa 0(%r12),%xmm0 825 movdqa 16(%r12),%xmm1 826 movdqa 32(%r12),%xmm2 827 movdqa 48(%r12),%xmm3 828 pand 0(%rdx),%xmm0 829 pand 16(%rdx),%xmm1 830 por %xmm0,%xmm4 831 pand 
32(%rdx),%xmm2 832 por %xmm1,%xmm5 833 pand 48(%rdx),%xmm3 834 por %xmm2,%xmm4 835 por %xmm3,%xmm5 836 movdqa 64(%r12),%xmm0 837 movdqa 80(%r12),%xmm1 838 movdqa 96(%r12),%xmm2 839 movdqa 112(%r12),%xmm3 840 pand 64(%rdx),%xmm0 841 pand 80(%rdx),%xmm1 842 por %xmm0,%xmm4 843 pand 96(%rdx),%xmm2 844 por %xmm1,%xmm5 845 pand 112(%rdx),%xmm3 846 por %xmm2,%xmm4 847 por %xmm3,%xmm5 848 por %xmm5,%xmm4 849 pshufd $0x4e,%xmm4,%xmm0 850 por %xmm4,%xmm0 851 leaq 256(%r12),%r12 852.byte 102,72,15,126,195 853 854 movq (%r14,%r9,1),%r10 855 movq %r8,%rbp 856 mulq %rbx 857 addq %rax,%r10 858 movq (%rcx),%rax 859 adcq $0,%rdx 860 861 imulq %r10,%rbp 862 movq %rdx,%r11 863 movq %rdi,(%r14) 864 865 leaq (%r14,%r9,1),%r14 866 867 mulq %rbp 868 addq %rax,%r10 869 movq 8(%rsi,%r9,1),%rax 870 adcq $0,%rdx 871 movq %rdx,%rdi 872 873 mulq %rbx 874 addq %rax,%r11 875 movq 8(%rcx),%rax 876 adcq $0,%rdx 877 addq 8(%r14),%r11 878 adcq $0,%rdx 879 movq %rdx,%r10 880 881 mulq %rbp 882 addq %rax,%rdi 883 movq 16(%rsi,%r9,1),%rax 884 adcq $0,%rdx 885 addq %r11,%rdi 886 leaq 32(%r9),%r15 887 leaq 32(%rcx),%rcx 888 adcq $0,%rdx 889 movq %rdx,%r13 890 jmp .Linner4x 891 892.align 32 893.Linner4x: 894 mulq %rbx 895 addq %rax,%r10 896 movq -16(%rcx),%rax 897 adcq $0,%rdx 898 addq 16(%r14),%r10 899 leaq 32(%r14),%r14 900 adcq $0,%rdx 901 movq %rdx,%r11 902 903 mulq %rbp 904 addq %rax,%r13 905 movq -8(%rsi,%r15,1),%rax 906 adcq $0,%rdx 907 addq %r10,%r13 908 adcq $0,%rdx 909 movq %rdi,-32(%r14) 910 movq %rdx,%rdi 911 912 mulq %rbx 913 addq %rax,%r11 914 movq -8(%rcx),%rax 915 adcq $0,%rdx 916 addq -8(%r14),%r11 917 adcq $0,%rdx 918 movq %rdx,%r10 919 920 mulq %rbp 921 addq %rax,%rdi 922 movq (%rsi,%r15,1),%rax 923 adcq $0,%rdx 924 addq %r11,%rdi 925 adcq $0,%rdx 926 movq %r13,-24(%r14) 927 movq %rdx,%r13 928 929 mulq %rbx 930 addq %rax,%r10 931 movq 0(%rcx),%rax 932 adcq $0,%rdx 933 addq (%r14),%r10 934 adcq $0,%rdx 935 movq %rdx,%r11 936 937 mulq %rbp 938 addq %rax,%r13 939 movq 8(%rsi,%r15,1),%rax 
940 adcq $0,%rdx 941 addq %r10,%r13 942 adcq $0,%rdx 943 movq %rdi,-16(%r14) 944 movq %rdx,%rdi 945 946 mulq %rbx 947 addq %rax,%r11 948 movq 8(%rcx),%rax 949 adcq $0,%rdx 950 addq 8(%r14),%r11 951 adcq $0,%rdx 952 movq %rdx,%r10 953 954 mulq %rbp 955 addq %rax,%rdi 956 movq 16(%rsi,%r15,1),%rax 957 adcq $0,%rdx 958 addq %r11,%rdi 959 leaq 32(%rcx),%rcx 960 adcq $0,%rdx 961 movq %r13,-8(%r14) 962 movq %rdx,%r13 963 964 addq $32,%r15 965 jnz .Linner4x 966 967 mulq %rbx 968 addq %rax,%r10 969 movq -16(%rcx),%rax 970 adcq $0,%rdx 971 addq 16(%r14),%r10 972 leaq 32(%r14),%r14 973 adcq $0,%rdx 974 movq %rdx,%r11 975 976 mulq %rbp 977 addq %rax,%r13 978 movq -8(%rsi),%rax 979 adcq $0,%rdx 980 addq %r10,%r13 981 adcq $0,%rdx 982 movq %rdi,-32(%r14) 983 movq %rdx,%rdi 984 985 mulq %rbx 986 addq %rax,%r11 987 movq %rbp,%rax 988 movq -8(%rcx),%rbp 989 adcq $0,%rdx 990 addq -8(%r14),%r11 991 adcq $0,%rdx 992 movq %rdx,%r10 993 994 mulq %rbp 995 addq %rax,%rdi 996 movq (%rsi,%r9,1),%rax 997 adcq $0,%rdx 998 addq %r11,%rdi 999 adcq $0,%rdx 1000 movq %r13,-24(%r14) 1001 movq %rdx,%r13 1002 1003 movq %rdi,-16(%r14) 1004 leaq (%rcx,%r9,1),%rcx 1005 1006 xorq %rdi,%rdi 1007 addq %r10,%r13 1008 adcq $0,%rdi 1009 addq (%r14),%r13 1010 adcq $0,%rdi 1011 movq %r13,-8(%r14) 1012 1013 cmpq 16+8(%rsp),%r12 1014 jb .Louter4x 1015 xorq %rax,%rax 1016 subq %r13,%rbp 1017 adcq %r15,%r15 1018 orq %r15,%rdi 1019 subq %rdi,%rax 1020 leaq (%r14,%r9,1),%rbx 1021 movq (%rcx),%r12 1022 leaq (%rcx),%rbp 1023 movq %r9,%rcx 1024 sarq $3+2,%rcx 1025 movq 56+8(%rsp),%rdi 1026 decq %r12 1027 xorq %r10,%r10 1028 movq 8(%rbp),%r13 1029 movq 16(%rbp),%r14 1030 movq 24(%rbp),%r15 1031 jmp .Lsqr4x_sub_entry 1032.size mul4x_internal,.-mul4x_internal 1033.globl bn_power5 1034.type bn_power5,@function 1035.align 32 1036bn_power5: 1037 movq %rsp,%rax 1038 pushq %rbx 1039 pushq %rbp 1040 pushq %r12 1041 pushq %r13 1042 pushq %r14 1043 pushq %r15 1044.Lpower5_prologue: 1045 1046 shll $3,%r9d 1047 leal 
(%r9,%r9,2),%r10d 1048 negq %r9 1049 movq (%r8),%r8 1050 1051 1052 1053 1054 1055 1056 1057 1058 leaq -320(%rsp,%r9,2),%r11 1059 movq %rsp,%rbp 1060 subq %rdi,%r11 1061 andq $4095,%r11 1062 cmpq %r11,%r10 1063 jb .Lpwr_sp_alt 1064 subq %r11,%rbp 1065 leaq -320(%rbp,%r9,2),%rbp 1066 jmp .Lpwr_sp_done 1067 1068.align 32 1069.Lpwr_sp_alt: 1070 leaq 4096-320(,%r9,2),%r10 1071 leaq -320(%rbp,%r9,2),%rbp 1072 subq %r10,%r11 1073 movq $0,%r10 1074 cmovcq %r10,%r11 1075 subq %r11,%rbp 1076.Lpwr_sp_done: 1077 andq $-64,%rbp 1078 movq %rsp,%r11 1079 subq %rbp,%r11 1080 andq $-4096,%r11 1081 leaq (%r11,%rbp,1),%rsp 1082 movq (%rsp),%r10 1083 cmpq %rbp,%rsp 1084 ja .Lpwr_page_walk 1085 jmp .Lpwr_page_walk_done 1086 1087.Lpwr_page_walk: 1088 leaq -4096(%rsp),%rsp 1089 movq (%rsp),%r10 1090 cmpq %rbp,%rsp 1091 ja .Lpwr_page_walk 1092.Lpwr_page_walk_done: 1093 1094 movq %r9,%r10 1095 negq %r9 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 movq %r8,32(%rsp) 1107 movq %rax,40(%rsp) 1108.Lpower5_body: 1109.byte 102,72,15,110,207 1110.byte 102,72,15,110,209 1111.byte 102,73,15,110,218 1112.byte 102,72,15,110,226 1113 1114 call __bn_sqr8x_internal 1115 call __bn_post4x_internal 1116 call __bn_sqr8x_internal 1117 call __bn_post4x_internal 1118 call __bn_sqr8x_internal 1119 call __bn_post4x_internal 1120 call __bn_sqr8x_internal 1121 call __bn_post4x_internal 1122 call __bn_sqr8x_internal 1123 call __bn_post4x_internal 1124 1125.byte 102,72,15,126,209 1126.byte 102,72,15,126,226 1127 movq %rsi,%rdi 1128 movq 40(%rsp),%rax 1129 leaq 32(%rsp),%r8 1130 1131 call mul4x_internal 1132 1133 movq 40(%rsp),%rsi 1134 movq $1,%rax 1135 movq -48(%rsi),%r15 1136 movq -40(%rsi),%r14 1137 movq -32(%rsi),%r13 1138 movq -24(%rsi),%r12 1139 movq -16(%rsi),%rbp 1140 movq -8(%rsi),%rbx 1141 leaq (%rsi),%rsp 1142.Lpower5_epilogue: 1143 .byte 0xf3,0xc3 1144.size bn_power5,.-bn_power5 1145 1146.globl bn_sqr8x_internal 1147.hidden bn_sqr8x_internal 1148.type bn_sqr8x_internal,@function 1149.align 32 
1150bn_sqr8x_internal: 1151__bn_sqr8x_internal: 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 leaq 32(%r10),%rbp 1226 leaq (%rsi,%r9,1),%rsi 1227 1228 movq %r9,%rcx 1229 1230 1231 movq -32(%rsi,%rbp,1),%r14 1232 leaq 48+8(%rsp,%r9,2),%rdi 1233 movq -24(%rsi,%rbp,1),%rax 1234 leaq -32(%rdi,%rbp,1),%rdi 1235 movq -16(%rsi,%rbp,1),%rbx 1236 movq %rax,%r15 1237 1238 mulq %r14 1239 movq %rax,%r10 1240 movq %rbx,%rax 1241 movq %rdx,%r11 1242 movq %r10,-24(%rdi,%rbp,1) 1243 1244 mulq %r14 1245 addq %rax,%r11 1246 movq %rbx,%rax 1247 adcq $0,%rdx 1248 movq %r11,-16(%rdi,%rbp,1) 1249 movq %rdx,%r10 1250 1251 1252 movq -8(%rsi,%rbp,1),%rbx 1253 mulq %r15 1254 movq %rax,%r12 1255 movq %rbx,%rax 1256 movq %rdx,%r13 1257 1258 leaq (%rbp),%rcx 1259 mulq %r14 1260 addq %rax,%r10 1261 movq %rbx,%rax 1262 movq %rdx,%r11 1263 adcq $0,%r11 1264 addq %r12,%r10 1265 adcq $0,%r11 1266 movq %r10,-8(%rdi,%rcx,1) 1267 jmp .Lsqr4x_1st 1268 1269.align 32 1270.Lsqr4x_1st: 1271 movq (%rsi,%rcx,1),%rbx 1272 mulq %r15 1273 addq %rax,%r13 1274 movq %rbx,%rax 1275 movq %rdx,%r12 1276 adcq $0,%r12 1277 1278 mulq %r14 1279 addq %rax,%r11 1280 movq %rbx,%rax 1281 movq 8(%rsi,%rcx,1),%rbx 1282 movq %rdx,%r10 1283 adcq $0,%r10 1284 addq %r13,%r11 1285 adcq $0,%r10 1286 1287 1288 mulq %r15 1289 addq %rax,%r12 1290 movq %rbx,%rax 1291 movq %r11,(%rdi,%rcx,1) 1292 movq %rdx,%r13 1293 adcq $0,%r13 1294 1295 mulq %r14 1296 addq %rax,%r10 1297 movq %rbx,%rax 1298 movq 16(%rsi,%rcx,1),%rbx 1299 movq %rdx,%r11 1300 adcq $0,%r11 1301 addq %r12,%r10 1302 adcq $0,%r11 1303 1304 mulq %r15 1305 addq %rax,%r13 1306 movq %rbx,%rax 1307 movq %r10,8(%rdi,%rcx,1) 1308 movq %rdx,%r12 1309 adcq $0,%r12 
1310 1311 mulq %r14 1312 addq %rax,%r11 1313 movq %rbx,%rax 1314 movq 24(%rsi,%rcx,1),%rbx 1315 movq %rdx,%r10 1316 adcq $0,%r10 1317 addq %r13,%r11 1318 adcq $0,%r10 1319 1320 1321 mulq %r15 1322 addq %rax,%r12 1323 movq %rbx,%rax 1324 movq %r11,16(%rdi,%rcx,1) 1325 movq %rdx,%r13 1326 adcq $0,%r13 1327 leaq 32(%rcx),%rcx 1328 1329 mulq %r14 1330 addq %rax,%r10 1331 movq %rbx,%rax 1332 movq %rdx,%r11 1333 adcq $0,%r11 1334 addq %r12,%r10 1335 adcq $0,%r11 1336 movq %r10,-8(%rdi,%rcx,1) 1337 1338 cmpq $0,%rcx 1339 jne .Lsqr4x_1st 1340 1341 mulq %r15 1342 addq %rax,%r13 1343 leaq 16(%rbp),%rbp 1344 adcq $0,%rdx 1345 addq %r11,%r13 1346 adcq $0,%rdx 1347 1348 movq %r13,(%rdi) 1349 movq %rdx,%r12 1350 movq %rdx,8(%rdi) 1351 jmp .Lsqr4x_outer 1352 1353.align 32 1354.Lsqr4x_outer: 1355 movq -32(%rsi,%rbp,1),%r14 1356 leaq 48+8(%rsp,%r9,2),%rdi 1357 movq -24(%rsi,%rbp,1),%rax 1358 leaq -32(%rdi,%rbp,1),%rdi 1359 movq -16(%rsi,%rbp,1),%rbx 1360 movq %rax,%r15 1361 1362 mulq %r14 1363 movq -24(%rdi,%rbp,1),%r10 1364 addq %rax,%r10 1365 movq %rbx,%rax 1366 adcq $0,%rdx 1367 movq %r10,-24(%rdi,%rbp,1) 1368 movq %rdx,%r11 1369 1370 mulq %r14 1371 addq %rax,%r11 1372 movq %rbx,%rax 1373 adcq $0,%rdx 1374 addq -16(%rdi,%rbp,1),%r11 1375 movq %rdx,%r10 1376 adcq $0,%r10 1377 movq %r11,-16(%rdi,%rbp,1) 1378 1379 xorq %r12,%r12 1380 1381 movq -8(%rsi,%rbp,1),%rbx 1382 mulq %r15 1383 addq %rax,%r12 1384 movq %rbx,%rax 1385 adcq $0,%rdx 1386 addq -8(%rdi,%rbp,1),%r12 1387 movq %rdx,%r13 1388 adcq $0,%r13 1389 1390 mulq %r14 1391 addq %rax,%r10 1392 movq %rbx,%rax 1393 adcq $0,%rdx 1394 addq %r12,%r10 1395 movq %rdx,%r11 1396 adcq $0,%r11 1397 movq %r10,-8(%rdi,%rbp,1) 1398 1399 leaq (%rbp),%rcx 1400 jmp .Lsqr4x_inner 1401 1402.align 32 1403.Lsqr4x_inner: 1404 movq (%rsi,%rcx,1),%rbx 1405 mulq %r15 1406 addq %rax,%r13 1407 movq %rbx,%rax 1408 movq %rdx,%r12 1409 adcq $0,%r12 1410 addq (%rdi,%rcx,1),%r13 1411 adcq $0,%r12 1412 1413.byte 0x67 1414 mulq %r14 1415 addq %rax,%r11 1416 
movq %rbx,%rax 1417 movq 8(%rsi,%rcx,1),%rbx 1418 movq %rdx,%r10 1419 adcq $0,%r10 1420 addq %r13,%r11 1421 adcq $0,%r10 1422 1423 mulq %r15 1424 addq %rax,%r12 1425 movq %r11,(%rdi,%rcx,1) 1426 movq %rbx,%rax 1427 movq %rdx,%r13 1428 adcq $0,%r13 1429 addq 8(%rdi,%rcx,1),%r12 1430 leaq 16(%rcx),%rcx 1431 adcq $0,%r13 1432 1433 mulq %r14 1434 addq %rax,%r10 1435 movq %rbx,%rax 1436 adcq $0,%rdx 1437 addq %r12,%r10 1438 movq %rdx,%r11 1439 adcq $0,%r11 1440 movq %r10,-8(%rdi,%rcx,1) 1441 1442 cmpq $0,%rcx 1443 jne .Lsqr4x_inner 1444 1445.byte 0x67 1446 mulq %r15 1447 addq %rax,%r13 1448 adcq $0,%rdx 1449 addq %r11,%r13 1450 adcq $0,%rdx 1451 1452 movq %r13,(%rdi) 1453 movq %rdx,%r12 1454 movq %rdx,8(%rdi) 1455 1456 addq $16,%rbp 1457 jnz .Lsqr4x_outer 1458 1459 1460 movq -32(%rsi),%r14 1461 leaq 48+8(%rsp,%r9,2),%rdi 1462 movq -24(%rsi),%rax 1463 leaq -32(%rdi,%rbp,1),%rdi 1464 movq -16(%rsi),%rbx 1465 movq %rax,%r15 1466 1467 mulq %r14 1468 addq %rax,%r10 1469 movq %rbx,%rax 1470 movq %rdx,%r11 1471 adcq $0,%r11 1472 1473 mulq %r14 1474 addq %rax,%r11 1475 movq %rbx,%rax 1476 movq %r10,-24(%rdi) 1477 movq %rdx,%r10 1478 adcq $0,%r10 1479 addq %r13,%r11 1480 movq -8(%rsi),%rbx 1481 adcq $0,%r10 1482 1483 mulq %r15 1484 addq %rax,%r12 1485 movq %rbx,%rax 1486 movq %r11,-16(%rdi) 1487 movq %rdx,%r13 1488 adcq $0,%r13 1489 1490 mulq %r14 1491 addq %rax,%r10 1492 movq %rbx,%rax 1493 movq %rdx,%r11 1494 adcq $0,%r11 1495 addq %r12,%r10 1496 adcq $0,%r11 1497 movq %r10,-8(%rdi) 1498 1499 mulq %r15 1500 addq %rax,%r13 1501 movq -16(%rsi),%rax 1502 adcq $0,%rdx 1503 addq %r11,%r13 1504 adcq $0,%rdx 1505 1506 movq %r13,(%rdi) 1507 movq %rdx,%r12 1508 movq %rdx,8(%rdi) 1509 1510 mulq %rbx 1511 addq $16,%rbp 1512 xorq %r14,%r14 1513 subq %r9,%rbp 1514 xorq %r15,%r15 1515 1516 addq %r12,%rax 1517 adcq $0,%rdx 1518 movq %rax,8(%rdi) 1519 movq %rdx,16(%rdi) 1520 movq %r15,24(%rdi) 1521 1522 movq -16(%rsi,%rbp,1),%rax 1523 leaq 48+8(%rsp),%rdi 1524 xorq %r10,%r10 1525 movq 
8(%rdi),%r11 1526 1527 leaq (%r14,%r10,2),%r12 1528 shrq $63,%r10 1529 leaq (%rcx,%r11,2),%r13 1530 shrq $63,%r11 1531 orq %r10,%r13 1532 movq 16(%rdi),%r10 1533 movq %r11,%r14 1534 mulq %rax 1535 negq %r15 1536 movq 24(%rdi),%r11 1537 adcq %rax,%r12 1538 movq -8(%rsi,%rbp,1),%rax 1539 movq %r12,(%rdi) 1540 adcq %rdx,%r13 1541 1542 leaq (%r14,%r10,2),%rbx 1543 movq %r13,8(%rdi) 1544 sbbq %r15,%r15 1545 shrq $63,%r10 1546 leaq (%rcx,%r11,2),%r8 1547 shrq $63,%r11 1548 orq %r10,%r8 1549 movq 32(%rdi),%r10 1550 movq %r11,%r14 1551 mulq %rax 1552 negq %r15 1553 movq 40(%rdi),%r11 1554 adcq %rax,%rbx 1555 movq 0(%rsi,%rbp,1),%rax 1556 movq %rbx,16(%rdi) 1557 adcq %rdx,%r8 1558 leaq 16(%rbp),%rbp 1559 movq %r8,24(%rdi) 1560 sbbq %r15,%r15 1561 leaq 64(%rdi),%rdi 1562 jmp .Lsqr4x_shift_n_add 1563 1564.align 32 1565.Lsqr4x_shift_n_add: 1566 leaq (%r14,%r10,2),%r12 1567 shrq $63,%r10 1568 leaq (%rcx,%r11,2),%r13 1569 shrq $63,%r11 1570 orq %r10,%r13 1571 movq -16(%rdi),%r10 1572 movq %r11,%r14 1573 mulq %rax 1574 negq %r15 1575 movq -8(%rdi),%r11 1576 adcq %rax,%r12 1577 movq -8(%rsi,%rbp,1),%rax 1578 movq %r12,-32(%rdi) 1579 adcq %rdx,%r13 1580 1581 leaq (%r14,%r10,2),%rbx 1582 movq %r13,-24(%rdi) 1583 sbbq %r15,%r15 1584 shrq $63,%r10 1585 leaq (%rcx,%r11,2),%r8 1586 shrq $63,%r11 1587 orq %r10,%r8 1588 movq 0(%rdi),%r10 1589 movq %r11,%r14 1590 mulq %rax 1591 negq %r15 1592 movq 8(%rdi),%r11 1593 adcq %rax,%rbx 1594 movq 0(%rsi,%rbp,1),%rax 1595 movq %rbx,-16(%rdi) 1596 adcq %rdx,%r8 1597 1598 leaq (%r14,%r10,2),%r12 1599 movq %r8,-8(%rdi) 1600 sbbq %r15,%r15 1601 shrq $63,%r10 1602 leaq (%rcx,%r11,2),%r13 1603 shrq $63,%r11 1604 orq %r10,%r13 1605 movq 16(%rdi),%r10 1606 movq %r11,%r14 1607 mulq %rax 1608 negq %r15 1609 movq 24(%rdi),%r11 1610 adcq %rax,%r12 1611 movq 8(%rsi,%rbp,1),%rax 1612 movq %r12,0(%rdi) 1613 adcq %rdx,%r13 1614 1615 leaq (%r14,%r10,2),%rbx 1616 movq %r13,8(%rdi) 1617 sbbq %r15,%r15 1618 shrq $63,%r10 1619 leaq (%rcx,%r11,2),%r8 1620 shrq $63,%r11 
1621 orq %r10,%r8 1622 movq 32(%rdi),%r10 1623 movq %r11,%r14 1624 mulq %rax 1625 negq %r15 1626 movq 40(%rdi),%r11 1627 adcq %rax,%rbx 1628 movq 16(%rsi,%rbp,1),%rax 1629 movq %rbx,16(%rdi) 1630 adcq %rdx,%r8 1631 movq %r8,24(%rdi) 1632 sbbq %r15,%r15 1633 leaq 64(%rdi),%rdi 1634 addq $32,%rbp 1635 jnz .Lsqr4x_shift_n_add 1636 1637 leaq (%r14,%r10,2),%r12 1638.byte 0x67 1639 shrq $63,%r10 1640 leaq (%rcx,%r11,2),%r13 1641 shrq $63,%r11 1642 orq %r10,%r13 1643 movq -16(%rdi),%r10 1644 movq %r11,%r14 1645 mulq %rax 1646 negq %r15 1647 movq -8(%rdi),%r11 1648 adcq %rax,%r12 1649 movq -8(%rsi),%rax 1650 movq %r12,-32(%rdi) 1651 adcq %rdx,%r13 1652 1653 leaq (%r14,%r10,2),%rbx 1654 movq %r13,-24(%rdi) 1655 sbbq %r15,%r15 1656 shrq $63,%r10 1657 leaq (%rcx,%r11,2),%r8 1658 shrq $63,%r11 1659 orq %r10,%r8 1660 mulq %rax 1661 negq %r15 1662 adcq %rax,%rbx 1663 adcq %rdx,%r8 1664 movq %rbx,-16(%rdi) 1665 movq %r8,-8(%rdi) 1666.byte 102,72,15,126,213 1667__bn_sqr8x_reduction: 1668 xorq %rax,%rax 1669 leaq (%r9,%rbp,1),%rcx 1670 leaq 48+8(%rsp,%r9,2),%rdx 1671 movq %rcx,0+8(%rsp) 1672 leaq 48+8(%rsp,%r9,1),%rdi 1673 movq %rdx,8+8(%rsp) 1674 negq %r9 1675 jmp .L8x_reduction_loop 1676 1677.align 32 1678.L8x_reduction_loop: 1679 leaq (%rdi,%r9,1),%rdi 1680.byte 0x66 1681 movq 0(%rdi),%rbx 1682 movq 8(%rdi),%r9 1683 movq 16(%rdi),%r10 1684 movq 24(%rdi),%r11 1685 movq 32(%rdi),%r12 1686 movq 40(%rdi),%r13 1687 movq 48(%rdi),%r14 1688 movq 56(%rdi),%r15 1689 movq %rax,(%rdx) 1690 leaq 64(%rdi),%rdi 1691 1692.byte 0x67 1693 movq %rbx,%r8 1694 imulq 32+8(%rsp),%rbx 1695 movq 0(%rbp),%rax 1696 movl $8,%ecx 1697 jmp .L8x_reduce 1698 1699.align 32 1700.L8x_reduce: 1701 mulq %rbx 1702 movq 8(%rbp),%rax 1703 negq %r8 1704 movq %rdx,%r8 1705 adcq $0,%r8 1706 1707 mulq %rbx 1708 addq %rax,%r9 1709 movq 16(%rbp),%rax 1710 adcq $0,%rdx 1711 addq %r9,%r8 1712 movq %rbx,48-8+8(%rsp,%rcx,8) 1713 movq %rdx,%r9 1714 adcq $0,%r9 1715 1716 mulq %rbx 1717 addq %rax,%r10 1718 movq 24(%rbp),%rax 1719 
/* Continuation of the .L8x_reduce round begun just above. */
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	32+8(%rsp),%rsi		/* n0 */
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	imulq	%r8,%rsi		/* next round's m = t[0]*n0, overlapped */
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rbp),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	%rsi,%rbx		/* m for the next round */
	addq	%rax,%r15
	movq	0(%rbp),%rax
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	.L8x_reduce

	leaq	64(%rbp),%rbp		/* advance modulus window */
	xorq	%rax,%rax
	movq	8+8(%rsp),%rdx		/* end-of-t[] sentinel */
	cmpq	0+8(%rsp),%rbp		/* past end of modulus? */
	jae	.L8x_no_tail

.byte	0x66
	addq	0(%rdi),%r8		/* absorb next 8 limbs of t[] */
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi		/* save carry as 0/-1 mask */

	movq	48+56+8(%rsp),%rbx	/* first saved m */
	movl	$8,%ecx
	movq	0(%rbp),%rax
	jmp	.L8x_tail

.align	32
.L8x_tail:
	/* Upper part: t[] += m*n[], reusing the eight m's saved above. */
	mulq	%rbx
	addq	%rax,%r8
	movq	8(%rbp),%rax
	movq	%r8,(%rdi)		/* store reduced limb */
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	leaq	8(%rdi),%rdi
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rbp),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	48-16+8(%rsp,%rcx,8),%rbx	/* next saved m */
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	0(%rbp),%rax
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	.L8x_tail

	leaq	64(%rbp),%rbp
	movq	8+8(%rsp),%rdx
	cmpq	0+8(%rsp),%rbp
	jae	.L8x_tail_done

	movq	48+56+8(%rsp),%rbx
	negq	%rsi			/* restore carry from mask */
	movq	0(%rbp),%rax
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi

	movl	$8,%ecx
	jmp	.L8x_tail

.align	32
.L8x_tail_done:
	addq	(%rdx),%r8		/* add back stashed top-word carry */
	adcq	$0,%r9
	adcq	$0,%r10
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15


	xorq	%rax,%rax

	negq	%rsi
.L8x_no_tail:
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	adcq	$0,%rax			/* top-most carry bit */
	movq	-8(%rbp),%rcx		/* last modulus limb */
	xorq	%rsi,%rsi

.byte	102,72,15,126,213		/* movq %xmm2,%rbp: restore modulus ptr */

	movq	%r8,0(%rdi)		/* write back reduced window */
	movq	%r9,8(%rdi)
.byte	102,73,15,126,217		/* movq %xmm3,%r9 */
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)
	leaq	64(%rdi),%rdi

	cmpq	%rdx,%rdi
	jb	.L8x_reduction_loop
	.byte	0xf3,0xc3		/* repz ret */
.size	bn_sqr8x_internal,.-bn_sqr8x_internal
/*
 * __bn_post4x_internal: conditional final subtraction after reduction.
 * Subtracts the modulus from the result only if needed, in constant time:
 * %rax is turned into an all-ones/all-zeros mask and ANDed with ~n[i],
 * so the same instructions execute either way.  Processes 4 limbs per
 * iteration; %rbp = modulus, %r9 = num*8, %rbx walks t[], %rdi = rp
 * (recovered from %xmm1 below).
 */
.type	__bn_post4x_internal,@function
.align	32
__bn_post4x_internal:
	movq	0(%rbp),%r12
	leaq	(%rdi,%r9,1),%rbx	/* t[] window */
	movq	%r9,%rcx
.byte	102,72,15,126,207		/* movq %xmm1,%rdi: rp */
	negq	%rax			/* carry -> CF; mask stays in %rax */
	sarq	$3+2,%rcx		/* loop counter: -(num/4) iterations */
.byte	102,72,15,126,206		/* movq %xmm1,%rsi */
	decq	%r12			/* so that first iteration skips reload */
	xorq	%r10,%r10
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqr4x_sub_entry

.align	16
.Lsqr4x_sub:
	movq	0(%rbp),%r12
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
.Lsqr4x_sub_entry:
	leaq	32(%rbp),%rbp
	notq	%r12			/* ~n[i] ... */
	notq	%r13
	notq	%r14
	notq	%r15
	andq	%rax,%r12		/* ... masked: subtract n[] or nothing */
	andq	%rax,%r13
	andq	%rax,%r14
	andq	%rax,%r15

	negq	%r10			/* restore borrow */
	adcq	0(%rbx),%r12		/* t[i] + (~n[i] & mask) + carry */
	adcq	8(%rbx),%r13
	adcq	16(%rbx),%r14
	adcq	24(%rbx),%r15
	movq	%r12,0(%rdi)
	leaq	32(%rbx),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r10,%r10		/* save borrow as 0/-1 mask */
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi

	incq	%rcx
	jnz	.Lsqr4x_sub

	movq	%r9,%r10
	negq	%r9			/* restore %r9 = num*8 */
	.byte	0xf3,0xc3
.size	__bn_post4x_internal,.-__bn_post4x_internal
/*
 * bn_from_montgomery: convert out of Montgomery form.
 * Only supports num divisible by 8 (then tail-jumps to bn_from_mont8x);
 * otherwise returns 0 so the caller falls back to another path.
 */
.globl	bn_from_montgomery
.type	bn_from_montgomery,@function
.align	32
bn_from_montgomery:
	testl	$7,%r9d
	jz	bn_from_mont8x
	xorl	%eax,%eax		/* unsupported num: return 0 */
	.byte	0xf3,0xc3
.size	bn_from_montgomery,.-bn_from_montgomery

/*
 * bn_from_mont8x: out-of-Montgomery conversion for num %% 8 == 0.
 * Saves all callee-saved GPRs; %rax keeps the original %rsp so the
 * epilogue can restore it from 40(%rsp).
 */
.type	bn_from_mont8x,@function
.align	32
bn_from_mont8x:
.byte	0x67
	movq	%rsp,%rax
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
.Lfrom_prologue:

	shll	$3,%r9d			/* num in bytes */
	leaq	(%r9,%r9,2),%r10	/* %r10 = num*3 (bytes) */
	negq	%r9
	movq	(%r8),%r8		/* load n0 value */

	/*
	 * Choose a 2*num+320 byte frame whose distance to rp modulo 4096
	 * avoids aliasing (cache-line conflict avoidance).
	 */
	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lfrom_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lfrom_sp_done

.align	32
.Lfrom_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
/* Continuation of bn_from_mont8x: commit the chosen stack pointer. */
.Lfrom_sp_done:
	andq	$-64,%rbp		/* 64-byte align the new stack */
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10		/* probe the first page */
	cmpq	%rbp,%rsp
	ja	.Lfrom_page_walk
	jmp	.Lfrom_page_walk_done

.Lfrom_page_walk:
	/* Touch every page between old and new %rsp (guard-page safety). */
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lfrom_page_walk
.Lfrom_page_walk_done:

	movq	%r9,%r10
	negq	%r9			/* %r9 = +num*8 from here on */

	movq	%r8,32(%rsp)		/* save n0 */
	movq	%rax,40(%rsp)		/* save original %rsp for epilogue */
.Lfrom_body:
	movq	%r9,%r11
	leaq	48(%rsp),%rax		/* t[] */
	pxor	%xmm0,%xmm0
	jmp	.Lmul_by_1

.align	32
.Lmul_by_1:
	/* Copy a[] into the low half of t[] and zero the upper num bytes. */
	movdqu	(%rsi),%xmm1
	movdqu	16(%rsi),%xmm2
	movdqu	32(%rsi),%xmm3
	movdqa	%xmm0,(%rax,%r9,1)
	movdqu	48(%rsi),%xmm4
	movdqa	%xmm0,16(%rax,%r9,1)
.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00	/* leaq 64(%rsi),%rsi */
	movdqa	%xmm1,(%rax)
	movdqa	%xmm0,32(%rax,%r9,1)
	movdqa	%xmm2,16(%rax)
	movdqa	%xmm0,48(%rax,%r9,1)
	movdqa	%xmm3,32(%rax)
	movdqa	%xmm4,48(%rax)
	leaq	64(%rax),%rax
	subq	$64,%r11
	jnz	.Lmul_by_1

.byte	102,72,15,110,207		/* movq %rdi,%xmm1: save rp */
.byte	102,72,15,110,209		/* movq %rcx,%xmm2: save np */
.byte	0x67
	movq	%rcx,%rbp
.byte	102,73,15,110,218		/* movq %r10,%xmm3: save -num*8 */
	call	__bn_sqr8x_reduction	/* t[]/R mod n */
	call	__bn_post4x_internal	/* constant-time final subtraction */

	pxor	%xmm0,%xmm0
	leaq	48(%rsp),%rax
	movq	40(%rsp),%rsi		/* original %rsp */
	jmp	.Lfrom_mont_zero

.align	32
.Lfrom_mont_zero:
	/* Wipe the temporary area (held secret data) before returning. */
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm0,16(%rax)
	movdqa	%xmm0,32(%rax)
	movdqa	%xmm0,48(%rax)
	leaq	64(%rax),%rax
	subq	$32,%r9
	jnz	.Lfrom_mont_zero

	movq	$1,%rax			/* return 1: success */
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lfrom_epilogue:
	.byte	0xf3,0xc3
.size	bn_from_mont8x,.-bn_from_mont8x
/*
 * bn_get_bits5: extract the 5-bit window starting at bit %esi of the
 * number at %rdi, returning it in %eax.  Windows straddling a 16-bit
 * boundary are handled by reading from a byte-shifted base instead.
 */
.globl	bn_get_bits5
.type	bn_get_bits5,@function
.align	16
bn_get_bits5:
	leaq	0(%rdi),%r10
/* Tail of bn_get_bits5 (entry immediately above). */
	leaq	1(%rdi),%r11
	movl	%esi,%ecx
	shrl	$4,%esi			/* 16-bit word index */
	andl	$15,%ecx		/* bit offset within that word */
	leal	-8(%rcx),%eax
	cmpl	$11,%ecx
	cmovaq	%r11,%r10		/* offset > 11: use base+1 byte ... */
	cmoval	%eax,%ecx		/* ... and shift by 8 less */
	movzwl	(%r10,%rsi,2),%eax	/* 16-bit load covering the window */
	shrl	%cl,%eax
	andl	$31,%eax		/* return the 5 window bits */
	.byte	0xf3,0xc3
.size	bn_get_bits5,.-bn_get_bits5

/*
 * bn_scatter5: store %esi limbs from %rdi into the power table at %rdx,
 * column %rcx.  Rows are 256 bytes apart (32 columns x 8 bytes), so one
 * column holds one multi-precision number.
 */
.globl	bn_scatter5
.type	bn_scatter5,@function
.align	16
bn_scatter5:
	cmpl	$0,%esi
	jz	.Lscatter_epilogue	/* nothing to do for zero limbs */
	leaq	(%rdx,%rcx,8),%rdx	/* column base = tbl + idx*8 */
.Lscatter:
	movq	(%rdi),%rax
	leaq	8(%rdi),%rdi
	movq	%rax,(%rdx)
	leaq	256(%rdx),%rdx		/* next row */
	subl	$1,%esi
	jnz	.Lscatter
.Lscatter_epilogue:
	.byte	0xf3,0xc3
.size	bn_scatter5,.-bn_scatter5

/*
 * bn_gather5: constant-time gather of column %ecx (%esi limbs) from the
 * table at %rdx into %rdi.  It first builds 16 mask vectors on the stack
 * (each 16-byte vector = two 64-bit lanes; a lane is all-ones iff its
 * column index equals idx), then every row is read in FULL - all 32
 * columns - masked and OR-combined, so the memory access pattern does not
 * depend on the secret index.
 */
.globl	bn_gather5
.type	bn_gather5,@function
.align	32
bn_gather5:
.LSEH_begin_bn_gather5:

.byte	0x4c,0x8d,0x14,0x24		/* leaq (%rsp),%r10: save stack ptr */
.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00	/* subq $0x108,%rsp */
	leaq	.Linc(%rip),%rax
	andq	$-16,%rsp		/* align for movdqa */

	movd	%ecx,%xmm5		/* idx */
	movdqa	0(%rax),%xmm0		/* {0,0,1,1}: initial lane indices */
	movdqa	16(%rax),%xmm1		/* {2,2,2,2}: per-vector increment */
	leaq	128(%rdx),%r11		/* biased table ptr */
	leaq	128(%rsp),%rax		/* biased mask area ptr */

	/* Generate the 16 selection masks via running pcmpeqd vs idx. */
	pshufd	$0,%xmm5,%xmm5		/* broadcast idx to all lanes */
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,-128(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,-112(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,-96(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,-80(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,-64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,-48(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,-32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,-16(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,16(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,48(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,80(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,96(%rax)
	movdqa	%xmm4,%xmm2
	movdqa	%xmm3,112(%rax)
	jmp	.Lgather

.align	32
.Lgather:
	/* One output limb: read ALL 32 columns of the row, mask, OR. */
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	movdqa	-128(%r11),%xmm0
	movdqa	-112(%r11),%xmm1
	movdqa	-96(%r11),%xmm2
	pand	-128(%rax),%xmm0
	movdqa	-80(%r11),%xmm3
	pand	-112(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r11),%xmm0
	movdqa	-48(%r11),%xmm1
	movdqa	-32(%r11),%xmm2
	pand	-64(%rax),%xmm0
	movdqa	-16(%r11),%xmm3
	pand	-48(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r11),%xmm0
	movdqa	16(%r11),%xmm1
	movdqa	32(%r11),%xmm2
	pand	0(%rax),%xmm0
	movdqa	48(%r11),%xmm3
	pand	16(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r11),%xmm0
	movdqa	80(%r11),%xmm1
	movdqa	96(%r11),%xmm2
	pand	64(%rax),%xmm0
	movdqa	112(%r11),%xmm3
	pand	80(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	leaq	256(%r11),%r11		/* next row */
	pshufd	$0x4e,%xmm4,%xmm0	/* fold high qword onto low */
	por	%xmm4,%xmm0
	movq	%xmm0,(%rdi)		/* emit selected 64-bit limb */
	leaq	8(%rdi),%rdi
	subl	$1,%esi
	jnz	.Lgather

	leaq	(%r10),%rsp		/* restore original stack pointer */
	.byte	0xf3,0xc3
.LSEH_end_bn_gather5:
.size	bn_gather5,.-bn_gather5
.align	64
.Linc:
.long	0,0, 1,1			/* initial lane indices (64-bit pairs) */
.long	2,2, 2,2			/* per-step increment */
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0