/* x86_64-mont5.S revision 1.1 */
/*
 * Montgomery multiplication with scatter/gather for x86_64.
 *
 * Syntax: GNU as, AT&T operand order.  Arguments arrive in %rdi, %rsi,
 * %rdx, %rcx, %r8, %r9 and then on the stack (System V AMD64 style).
 *
 * Exported symbols:
 *   bn_mul_mont_gather5  - multiply by a gathered table entry and reduce
 *   bn_scatter5          - store one operand into the interleaved table
 *   bn_gather5           - read one operand back out of the table
 *
 * Table layout used by every routine here: word i of entry "power" lives
 * at byte offset 256*i + 8*power, i.e. 32 entries interleaved word by word.
 *
 * Two raw encodings appear throughout:
 *   .byte 102,72,15,126,195   is  movq %xmm0,%rbx  (66 48 0F 7E C3)
 *   .byte 0xf3,0xc3           is  rep ret
 */
#include <machine/asm.h>
.text

/*
 * bn_mul_mont_gather5
 *
 * NOTE(review): presumably matches the C prototype
 *   int bn_mul_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap,
 *                           const void *table, const BN_ULONG *np,
 *                           const BN_ULONG *n0, int num, int power);
 * confirm against the C caller.
 *
 * In:   %rdi      = rp, result (num words written)
 *       %rsi      = ap, multiplicand words
 *       %rdx      = table, base of the interleaved table
 *       %rcx      = np, modulus words
 *       %r8       = n0, pointer to one word (dereferenced once); presumably
 *                   the Montgomery constant for np - TODO confirm
 *       %r9d      = num, word count (zero-extended on entry)
 *       8(%rsp)   = power, 32-bit table index (bits 0-4 are used)
 * Out:  %rax = 1
 * Saves/restores %rbx, %rbp, %r12-%r15.  Uses %xmm0-%xmm7 as scratch.
 *
 * Dispatch: when num is a multiple of 4 and num >= 8 the 4-way unrolled
 * body at .Lmul4x_enter is used, otherwise the word-at-a-time body below.
 */
.globl	bn_mul_mont_gather5
.type	bn_mul_mont_gather5,@function
.align	64
bn_mul_mont_gather5:
	testl	$3,%r9d
	jnz	.Lmul_enter
	cmpl	$8,%r9d
	jb	.Lmul_enter
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	movl	%r9d,%r9d
	movl	8(%rsp),%r10d
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	/*
	 * Carve out num+2 qwords of scratch (tp[]) below the saved registers
	 * and round %rsp down to a 1024-byte boundary.  The pre-allocation
	 * %rsp is parked at tp[num+1] for the epilogue.
	 */
	movq	%rsp,%rax
	leaq	2(%r9),%r11
	negq	%r11
	leaq	(%rsp,%r11,8),%rsp
	andq	$-1024,%rsp

	movq	%rax,8(%rsp,%r9,8)
.Lmul_body:
	/*
	 * Gather setup.  %r11 = power & 7 selects the column, and
	 * %r10 = ~(power >> 3) & 3 picks the window into .Lmagic_masks so
	 * that exactly one of %xmm4-%xmm7 is all-ones.  Each gather then
	 * loads four candidates 64 bytes apart and keeps the masked one.
	 * NOTE(review): the load addresses depend on the low three bits of
	 * the index; if the index is secret, check this against upstream
	 * side-channel advisories.
	 */
	movq	%rdx,%r12
	movq	%r10,%r11
	shrq	$3,%r10
	andq	$7,%r11
	notq	%r10
	leaq	.Lmagic_masks(%rip),%rax
	andq	$3,%r10
	leaq	96(%r12,%r11,8),%r12
	movq	0(%rax,%r10,8),%xmm4
	movq	8(%rax,%r10,8),%xmm5
	movq	16(%rax,%r10,8),%xmm6
	movq	24(%rax,%r10,8),%xmm7

	/* Gather word 0 of the table entry into %xmm0. */
	movq	-96(%r12),%xmm0
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1
	movq	96(%r12),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	leaq	256(%r12),%r12
	por	%xmm3,%xmm0

	/* movq %xmm0,%rbx : %rbx = current multiplier word */
.byte	102,72,15,126,195

	movq	(%r8),%r8
	movq	(%rsi),%rax

	/* %r14 = outer index, %r15 = inner index */
	xorq	%r14,%r14
	xorq	%r15,%r15

	/* Start gathering the next multiplier word while multiplying. */
	movq	-96(%r12),%xmm0
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1

	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	movq	96(%r12),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	/* %rbp = (low product word) * (word loaded from n0) */
	imulq	%r10,%rbp
	movq	%rdx,%r11

	por	%xmm2,%xmm0
	leaq	256(%r12),%r12
	por	%xmm3,%xmm0

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.L1st_enter

	/* First pass: tp[] is written without reading previous contents. */
.align	16
.L1st:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.L1st_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	1(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.L1st

	/* movq %xmm0,%rbx : load the multiplier word for the next pass */
.byte	102,72,15,126,195

	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13
	movq	%r10,%r11

	/* Top word goes to tp[num-1], carry-out to tp[num]. */
	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	jmp	.Louter
	/* Remaining passes: same as the first but accumulating into tp[]. */
.align	16
.Louter:
	xorq	%r15,%r15
	movq	%r8,%rbp
	movq	(%rsp),%r10

	movq	-96(%r12),%xmm0
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	movq	96(%r12),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	%r10,%rbp
	movq	%rdx,%r11

	por	%xmm2,%xmm0
	leaq	256(%r12),%r12
	por	%xmm3,%xmm0

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.Linner_enter

.align	16
.Linner:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.Linner_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.Linner

	/* movq %xmm0,%rbx */
.byte	102,72,15,126,195

	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	cmpq	%r9,%r14
	jl	.Louter

	/*
	 * Final step: rp = tp - np with borrow propagation.  The xorq below
	 * also clears CF for the first sbbq; leaq/movq/decq/jmp keep CF.
	 */
	xorq	%r14,%r14
	movq	(%rsp),%rax
	leaq	(%rsp),%rsi
	movq	%r9,%r15
	jmp	.Lsub
.align	16
.Lsub:
	sbbq	(%rcx,%r14,8),%rax
	movq	%rax,(%rdi,%r14,8)
	movq	8(%rsi,%r14,8),%rax
	leaq	1(%r14),%r14
	decq	%r15
	jnz	.Lsub

	/*
	 * %rax becomes all-ones if the subtraction borrowed (after folding
	 * in tp[num]), else zero.  Select the copy source without a branch:
	 * %rsi = borrow ? tp : rp.
	 */
	sbbq	$0,%rax
	xorq	%r14,%r14
	andq	%rax,%rsi
	notq	%rax
	movq	%rdi,%rcx
	andq	%rax,%rcx
	movq	%r9,%r15
	orq	%rcx,%rsi
.align	16
.Lcopy:
	/* Copy the selected words to rp and overwrite tp[i] with i. */
	movq	(%rsi,%r14,8),%rax
	movq	%r14,(%rsp,%r14,8)
	movq	%rax,(%rdi,%r14,8)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	.Lcopy

	/* Reload the parked %rsp, restore saved registers, return 1. */
	movq	8(%rsp,%r9,8),%rsi
	movq	$1,%rax
	movq	(%rsi),%r15
	movq	8(%rsi),%r14
	movq	16(%rsi),%r13
	movq	24(%rsi),%r12
	movq	32(%rsi),%rbp
	movq	40(%rsi),%rbx
	leaq	48(%rsi),%rsp
.Lmul_epilogue:
	.byte	0xf3,0xc3
.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5

/*
 * bn_mul4x_mont_gather5
 *
 * Not exported (.globl is absent); reached only through .Lmul4x_enter
 * from bn_mul_mont_gather5, with the same register/stack arguments.
 * Processes four words per loop iteration.
 *
 * Scratch frame: num+4 qwords, 1024-byte aligned.  tp[num+1] holds the
 * pre-allocation %rsp and tp[num+2] holds rp, because %rdi is reused as
 * an accumulator inside the loops.
 * Out: %rax = 1.
 */
.type	bn_mul4x_mont_gather5,@function
.align	16
bn_mul4x_mont_gather5:
.Lmul4x_enter:
	movl	%r9d,%r9d
	movl	8(%rsp),%r10d
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
	movq	%rsp,%rax
	leaq	4(%r9),%r11
	negq	%r11
	leaq	(%rsp,%r11,8),%rsp
	andq	$-1024,%rsp

	movq	%rax,8(%rsp,%r9,8)
.Lmul4x_body:
	movq	%rdi,16(%rsp,%r9,8)
	/* Gather setup, identical to .Lmul_body. */
	movq	%rdx,%r12
	movq	%r10,%r11
	shrq	$3,%r10
	andq	$7,%r11
	notq	%r10
	leaq	.Lmagic_masks(%rip),%rax
	andq	$3,%r10
	leaq	96(%r12,%r11,8),%r12
	movq	0(%rax,%r10,8),%xmm4
	movq	8(%rax,%r10,8),%xmm5
	movq	16(%rax,%r10,8),%xmm6
	movq	24(%rax,%r10,8),%xmm7

	movq	-96(%r12),%xmm0
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1
	movq	96(%r12),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	leaq	256(%r12),%r12
	por	%xmm3,%xmm0

	/* movq %xmm0,%rbx */
.byte	102,72,15,126,195
	movq	(%r8),%r8
	movq	(%rsi),%rax

	xorq	%r14,%r14
	xorq	%r15,%r15

	movq	-96(%r12),%xmm0
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1

	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	movq	96(%r12),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	%r10,%rbp
	movq	%rdx,%r11

	por	%xmm2,%xmm0
	leaq	256(%r12),%r12
	por	%xmm3,%xmm0

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdi,(%rsp)
	movq	%rdx,%r13
	jmp	.L1st4x
	/* First pass, four words per iteration; tp[] is write-only here. */
.align	16
.L1st4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	4(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jl	.L1st4x

	/* Tail of the first pass: the last two words. */
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	/* movq %xmm0,%rbx : multiplier word for the next pass */
.byte	102,72,15,126,195

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)

	leaq	1(%r14),%r14
	/* Remaining passes: accumulate into tp[], four words per iteration. */
.align	4
.Louter4x:
	xorq	%r15,%r15
	movq	-96(%r12),%xmm0
	movq	-32(%r12),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%r12),%xmm2
	pand	%xmm5,%xmm1

	movq	(%rsp),%r10
	movq	%r8,%rbp
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	movq	96(%r12),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3

	imulq	%r10,%rbp
	movq	%rdx,%r11

	por	%xmm2,%xmm0
	leaq	256(%r12),%r12
	por	%xmm3,%xmm0

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%rsp),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdx,%r13
	jmp	.Linner4x
.align	16
.Linner4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	4(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%r13,-40(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jl	.Linner4x

	/* Tail of an accumulating pass: the last two words. */
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	1(%r14),%r14
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%r13

	/* movq %xmm0,%rbx */
.byte	102,72,15,126,195
	movq	%rdi,-16(%rsp,%r15,8)

	/* Fold in the previous carry word tp[num]; new carry to tp[num]. */
	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%rsp,%r9,8),%r13
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)

	cmpq	%r9,%r14
	jl	.Louter4x
	/*
	 * Final step: reload rp from tp[num+2], then rp = tp - np four words
	 * at a time.  %r9 is temporarily num/4.  %xmm0 is zeroed for wiping
	 * tp[] in the copy loop.  Only movq/leaq/decq/jmp sit between the
	 * subq/sbbq instructions, so CF is carried through the chain.
	 */
	movq	16(%rsp,%r9,8),%rdi
	movq	0(%rsp),%rax
	pxor	%xmm0,%xmm0
	movq	8(%rsp),%rdx
	shrq	$2,%r9
	leaq	(%rsp),%rsi
	xorq	%r14,%r14

	subq	0(%rcx),%rax
	movq	16(%rsi),%rbx
	movq	24(%rsi),%rbp
	sbbq	8(%rcx),%rdx
	leaq	-1(%r9),%r15
	jmp	.Lsub4x
.align	16
.Lsub4x:
	movq	%rax,0(%rdi,%r14,8)
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	16(%rcx,%r14,8),%rbx
	movq	32(%rsi,%r14,8),%rax
	movq	40(%rsi,%r14,8),%rdx
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)
	movq	%rbp,24(%rdi,%r14,8)
	sbbq	32(%rcx,%r14,8),%rax
	movq	48(%rsi,%r14,8),%rbx
	movq	56(%rsi,%r14,8),%rbp
	sbbq	40(%rcx,%r14,8),%rdx
	leaq	4(%r14),%r14
	decq	%r15
	jnz	.Lsub4x

	movq	%rax,0(%rdi,%r14,8)
	movq	32(%rsi,%r14,8),%rax
	sbbq	16(%rcx,%r14,8),%rbx
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)

	/* Branch-free select: %rsi = borrow ? tp : rp. */
	sbbq	$0,%rax
	movq	%rbp,24(%rdi,%r14,8)
	xorq	%r14,%r14
	andq	%rax,%rsi
	notq	%rax
	movq	%rdi,%rcx
	andq	%rax,%rcx
	leaq	-1(%r9),%r15
	orq	%rcx,%rsi

	/* Copy 16 bytes at a time to rp and zero the matching tp[] slots. */
	movdqu	(%rsi),%xmm1
	movdqa	%xmm0,(%rsp)
	movdqu	%xmm1,(%rdi)
	jmp	.Lcopy4x
.align	16
.Lcopy4x:
	movdqu	16(%rsi,%r14,1),%xmm2
	movdqu	32(%rsi,%r14,1),%xmm1
	movdqa	%xmm0,16(%rsp,%r14,1)
	movdqu	%xmm2,16(%rdi,%r14,1)
	movdqa	%xmm0,32(%rsp,%r14,1)
	movdqu	%xmm1,32(%rdi,%r14,1)
	leaq	32(%r14),%r14
	decq	%r15
	jnz	.Lcopy4x

	/* Restore %r9 = num so the parked %rsp at tp[num+1] can be found. */
	shlq	$2,%r9
	movdqu	16(%rsi,%r14,1),%xmm2
	movdqa	%xmm0,16(%rsp,%r14,1)
	movdqu	%xmm2,16(%rdi,%r14,1)
	movq	8(%rsp,%r9,8),%rsi
	movq	$1,%rax
	movq	(%rsi),%r15
	movq	8(%rsi),%r14
	movq	16(%rsi),%r13
	movq	24(%rsi),%r12
	movq	32(%rsi),%rbp
	movq	40(%rsi),%rbx
	leaq	48(%rsi),%rsp
.Lmul4x_epilogue:
	.byte	0xf3,0xc3
.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5

/*
 * bn_scatter5
 *
 * In:   %rdi = inp, source words
 *       %rsi = num, word count (0 is allowed and is a no-op)
 *       %rdx = table, base of the interleaved table
 *       %rcx = power, entry index
 * Stores inp[i] at table + 8*power + 256*i for i in [0, num).
 * Clobbers %rax, %rdx, %rdi, %rsi and flags.
 */
.globl	bn_scatter5
.type	bn_scatter5,@function
.align	16
bn_scatter5:
	testq	%rsi,%rsi
	jz	.Lscatter_epilogue
	leaq	(%rdx,%rcx,8),%rdx
.Lscatter:
	movq	(%rdi),%rax
	leaq	8(%rdi),%rdi
	movq	%rax,(%rdx)
	leaq	256(%rdx),%rdx
	subq	$1,%rsi
	jnz	.Lscatter
.Lscatter_epilogue:
	.byte	0xf3,0xc3
.size	bn_scatter5,.-bn_scatter5

/*
 * bn_gather5
 *
 * In:   %rdi = out, destination words
 *       %rsi = num, word count (0 is a no-op)
 *       %rdx = table, base of the interleaved table
 *       %rcx = power, entry index (bits 0-4 are used)
 * Reads back the words stored by bn_scatter5 for the same power.  For
 * every word, four candidates 64 bytes apart are loaded and one is kept
 * by masking with %xmm4-%xmm7.
 * Clobbers %rax, %rcx, %rdx, %rdi, %rsi, %r11, %xmm0-%xmm7 and flags.
 */
.globl	bn_gather5
.type	bn_gather5,@function
.align	16
bn_gather5:
	/* Without this guard num == 0 would wrap the counter in .Lgather. */
	testq	%rsi,%rsi
	jz	.Lgather_epilogue
	movq	%rcx,%r11
	shrq	$3,%rcx
	andq	$7,%r11
	notq	%rcx
	leaq	.Lmagic_masks(%rip),%rax
	andq	$3,%rcx
	leaq	96(%rdx,%r11,8),%rdx
	movq	0(%rax,%rcx,8),%xmm4
	movq	8(%rax,%rcx,8),%xmm5
	movq	16(%rax,%rcx,8),%xmm6
	movq	24(%rax,%rcx,8),%xmm7
	jmp	.Lgather
.align	16
.Lgather:
	movq	-96(%rdx),%xmm0
	movq	-32(%rdx),%xmm1
	pand	%xmm4,%xmm0
	movq	32(%rdx),%xmm2
	pand	%xmm5,%xmm1
	movq	96(%rdx),%xmm3
	pand	%xmm6,%xmm2
	por	%xmm1,%xmm0
	pand	%xmm7,%xmm3
	por	%xmm2,%xmm0
	leaq	256(%rdx),%rdx
	por	%xmm3,%xmm0

	movq	%xmm0,(%rdi)
	leaq	8(%rdi),%rdi
	subq	$1,%rsi
	jnz	.Lgather
.Lgather_epilogue:
	.byte	0xf3,0xc3
.LSEH_end_bn_gather5:
.size	bn_gather5,.-bn_gather5

/*
 * Eight qwords: 0, 0, 0, all-ones, 0, 0, 0, 0.  A window of four
 * consecutive qwords starting at index 0..3 always contains the single
 * all-ones qword exactly once, which is how the gather code selects one
 * of its four candidate loads.
 */
.align	64
.Lmagic_masks:
.long	0,0, 0,0, 0,0, -1,-1
.long	0,0, 0,0, 0,0, 0,0
/* "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro@openssl.org>" NUL-terminated */
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0