x86_64-mont5.S revision 305153
1/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/x86_64-mont5.S 305153 2016-08-31 20:33:59Z jkim $ */ 2/* Do not modify. This file is auto-generated from x86_64-mont5.pl. */ 3.text 4 5 6 7.globl bn_mul_mont_gather5 8.type bn_mul_mont_gather5,@function 9.align 64 10bn_mul_mont_gather5: 11 testl $7,%r9d 12 jnz .Lmul_enter 13 movl OPENSSL_ia32cap_P+8(%rip),%r11d 14 jmp .Lmul4x_enter 15 16.align 16 17.Lmul_enter: 18 movl %r9d,%r9d 19 movq %rsp,%rax 20 movd 8(%rsp),%xmm5 21 leaq .Linc(%rip),%r10 22 pushq %rbx 23 pushq %rbp 24 pushq %r12 25 pushq %r13 26 pushq %r14 27 pushq %r15 28 29 leaq 2(%r9),%r11 30 negq %r11 31 leaq -264(%rsp,%r11,8),%rsp 32 andq $-1024,%rsp 33 34 movq %rax,8(%rsp,%r9,8) 35.Lmul_body: 36 37 38 39 40 41 42 subq %rsp,%rax 43 andq $-4096,%rax 44.Lmul_page_walk: 45 movq (%rsp,%rax,1),%r11 46 subq $4096,%rax 47.byte 0x2e 48 jnc .Lmul_page_walk 49 50 leaq 128(%rdx),%r12 51 movdqa 0(%r10),%xmm0 52 movdqa 16(%r10),%xmm1 53 leaq 24-112(%rsp,%r9,8),%r10 54 andq $-16,%r10 55 56 pshufd $0,%xmm5,%xmm5 57 movdqa %xmm1,%xmm4 58 movdqa %xmm1,%xmm2 59 paddd %xmm0,%xmm1 60 pcmpeqd %xmm5,%xmm0 61.byte 0x67 62 movdqa %xmm4,%xmm3 63 paddd %xmm1,%xmm2 64 pcmpeqd %xmm5,%xmm1 65 movdqa %xmm0,112(%r10) 66 movdqa %xmm4,%xmm0 67 68 paddd %xmm2,%xmm3 69 pcmpeqd %xmm5,%xmm2 70 movdqa %xmm1,128(%r10) 71 movdqa %xmm4,%xmm1 72 73 paddd %xmm3,%xmm0 74 pcmpeqd %xmm5,%xmm3 75 movdqa %xmm2,144(%r10) 76 movdqa %xmm4,%xmm2 77 78 paddd %xmm0,%xmm1 79 pcmpeqd %xmm5,%xmm0 80 movdqa %xmm3,160(%r10) 81 movdqa %xmm4,%xmm3 82 paddd %xmm1,%xmm2 83 pcmpeqd %xmm5,%xmm1 84 movdqa %xmm0,176(%r10) 85 movdqa %xmm4,%xmm0 86 87 paddd %xmm2,%xmm3 88 pcmpeqd %xmm5,%xmm2 89 movdqa %xmm1,192(%r10) 90 movdqa %xmm4,%xmm1 91 92 paddd %xmm3,%xmm0 93 pcmpeqd %xmm5,%xmm3 94 movdqa %xmm2,208(%r10) 95 movdqa %xmm4,%xmm2 96 97 paddd %xmm0,%xmm1 98 pcmpeqd %xmm5,%xmm0 99 movdqa %xmm3,224(%r10) 100 movdqa %xmm4,%xmm3 101 paddd %xmm1,%xmm2 102 pcmpeqd %xmm5,%xmm1 103 movdqa %xmm0,240(%r10) 104 movdqa %xmm4,%xmm0 
105 106 paddd %xmm2,%xmm3 107 pcmpeqd %xmm5,%xmm2 108 movdqa %xmm1,256(%r10) 109 movdqa %xmm4,%xmm1 110 111 paddd %xmm3,%xmm0 112 pcmpeqd %xmm5,%xmm3 113 movdqa %xmm2,272(%r10) 114 movdqa %xmm4,%xmm2 115 116 paddd %xmm0,%xmm1 117 pcmpeqd %xmm5,%xmm0 118 movdqa %xmm3,288(%r10) 119 movdqa %xmm4,%xmm3 120 paddd %xmm1,%xmm2 121 pcmpeqd %xmm5,%xmm1 122 movdqa %xmm0,304(%r10) 123 124 paddd %xmm2,%xmm3 125.byte 0x67 126 pcmpeqd %xmm5,%xmm2 127 movdqa %xmm1,320(%r10) 128 129 pcmpeqd %xmm5,%xmm3 130 movdqa %xmm2,336(%r10) 131 pand 64(%r12),%xmm0 132 133 pand 80(%r12),%xmm1 134 pand 96(%r12),%xmm2 135 movdqa %xmm3,352(%r10) 136 pand 112(%r12),%xmm3 137 por %xmm2,%xmm0 138 por %xmm3,%xmm1 139 movdqa -128(%r12),%xmm4 140 movdqa -112(%r12),%xmm5 141 movdqa -96(%r12),%xmm2 142 pand 112(%r10),%xmm4 143 movdqa -80(%r12),%xmm3 144 pand 128(%r10),%xmm5 145 por %xmm4,%xmm0 146 pand 144(%r10),%xmm2 147 por %xmm5,%xmm1 148 pand 160(%r10),%xmm3 149 por %xmm2,%xmm0 150 por %xmm3,%xmm1 151 movdqa -64(%r12),%xmm4 152 movdqa -48(%r12),%xmm5 153 movdqa -32(%r12),%xmm2 154 pand 176(%r10),%xmm4 155 movdqa -16(%r12),%xmm3 156 pand 192(%r10),%xmm5 157 por %xmm4,%xmm0 158 pand 208(%r10),%xmm2 159 por %xmm5,%xmm1 160 pand 224(%r10),%xmm3 161 por %xmm2,%xmm0 162 por %xmm3,%xmm1 163 movdqa 0(%r12),%xmm4 164 movdqa 16(%r12),%xmm5 165 movdqa 32(%r12),%xmm2 166 pand 240(%r10),%xmm4 167 movdqa 48(%r12),%xmm3 168 pand 256(%r10),%xmm5 169 por %xmm4,%xmm0 170 pand 272(%r10),%xmm2 171 por %xmm5,%xmm1 172 pand 288(%r10),%xmm3 173 por %xmm2,%xmm0 174 por %xmm3,%xmm1 175 por %xmm1,%xmm0 176 pshufd $0x4e,%xmm0,%xmm1 177 por %xmm1,%xmm0 178 leaq 256(%r12),%r12 179.byte 102,72,15,126,195 180 181 movq (%r8),%r8 182 movq (%rsi),%rax 183 184 xorq %r14,%r14 185 xorq %r15,%r15 186 187 movq %r8,%rbp 188 mulq %rbx 189 movq %rax,%r10 190 movq (%rcx),%rax 191 192 imulq %r10,%rbp 193 movq %rdx,%r11 194 195 mulq %rbp 196 addq %rax,%r10 197 movq 8(%rsi),%rax 198 adcq $0,%rdx 199 movq %rdx,%r13 200 201 leaq 1(%r15),%r15 202 
jmp .L1st_enter 203 204.align 16 205.L1st: 206 addq %rax,%r13 207 movq (%rsi,%r15,8),%rax 208 adcq $0,%rdx 209 addq %r11,%r13 210 movq %r10,%r11 211 adcq $0,%rdx 212 movq %r13,-16(%rsp,%r15,8) 213 movq %rdx,%r13 214 215.L1st_enter: 216 mulq %rbx 217 addq %rax,%r11 218 movq (%rcx,%r15,8),%rax 219 adcq $0,%rdx 220 leaq 1(%r15),%r15 221 movq %rdx,%r10 222 223 mulq %rbp 224 cmpq %r9,%r15 225 jne .L1st 226 227 228 addq %rax,%r13 229 adcq $0,%rdx 230 addq %r11,%r13 231 adcq $0,%rdx 232 movq %r13,-16(%rsp,%r9,8) 233 movq %rdx,%r13 234 movq %r10,%r11 235 236 xorq %rdx,%rdx 237 addq %r11,%r13 238 adcq $0,%rdx 239 movq %r13,-8(%rsp,%r9,8) 240 movq %rdx,(%rsp,%r9,8) 241 242 leaq 1(%r14),%r14 243 jmp .Louter 244.align 16 245.Louter: 246 leaq 24+128(%rsp,%r9,8),%rdx 247 andq $-16,%rdx 248 pxor %xmm4,%xmm4 249 pxor %xmm5,%xmm5 250 movdqa -128(%r12),%xmm0 251 movdqa -112(%r12),%xmm1 252 movdqa -96(%r12),%xmm2 253 movdqa -80(%r12),%xmm3 254 pand -128(%rdx),%xmm0 255 pand -112(%rdx),%xmm1 256 por %xmm0,%xmm4 257 pand -96(%rdx),%xmm2 258 por %xmm1,%xmm5 259 pand -80(%rdx),%xmm3 260 por %xmm2,%xmm4 261 por %xmm3,%xmm5 262 movdqa -64(%r12),%xmm0 263 movdqa -48(%r12),%xmm1 264 movdqa -32(%r12),%xmm2 265 movdqa -16(%r12),%xmm3 266 pand -64(%rdx),%xmm0 267 pand -48(%rdx),%xmm1 268 por %xmm0,%xmm4 269 pand -32(%rdx),%xmm2 270 por %xmm1,%xmm5 271 pand -16(%rdx),%xmm3 272 por %xmm2,%xmm4 273 por %xmm3,%xmm5 274 movdqa 0(%r12),%xmm0 275 movdqa 16(%r12),%xmm1 276 movdqa 32(%r12),%xmm2 277 movdqa 48(%r12),%xmm3 278 pand 0(%rdx),%xmm0 279 pand 16(%rdx),%xmm1 280 por %xmm0,%xmm4 281 pand 32(%rdx),%xmm2 282 por %xmm1,%xmm5 283 pand 48(%rdx),%xmm3 284 por %xmm2,%xmm4 285 por %xmm3,%xmm5 286 movdqa 64(%r12),%xmm0 287 movdqa 80(%r12),%xmm1 288 movdqa 96(%r12),%xmm2 289 movdqa 112(%r12),%xmm3 290 pand 64(%rdx),%xmm0 291 pand 80(%rdx),%xmm1 292 por %xmm0,%xmm4 293 pand 96(%rdx),%xmm2 294 por %xmm1,%xmm5 295 pand 112(%rdx),%xmm3 296 por %xmm2,%xmm4 297 por %xmm3,%xmm5 298 por %xmm5,%xmm4 299 pshufd 
$0x4e,%xmm4,%xmm0 300 por %xmm4,%xmm0 301 leaq 256(%r12),%r12 302 303 movq (%rsi),%rax 304.byte 102,72,15,126,195 305 306 xorq %r15,%r15 307 movq %r8,%rbp 308 movq (%rsp),%r10 309 310 mulq %rbx 311 addq %rax,%r10 312 movq (%rcx),%rax 313 adcq $0,%rdx 314 315 imulq %r10,%rbp 316 movq %rdx,%r11 317 318 mulq %rbp 319 addq %rax,%r10 320 movq 8(%rsi),%rax 321 adcq $0,%rdx 322 movq 8(%rsp),%r10 323 movq %rdx,%r13 324 325 leaq 1(%r15),%r15 326 jmp .Linner_enter 327 328.align 16 329.Linner: 330 addq %rax,%r13 331 movq (%rsi,%r15,8),%rax 332 adcq $0,%rdx 333 addq %r10,%r13 334 movq (%rsp,%r15,8),%r10 335 adcq $0,%rdx 336 movq %r13,-16(%rsp,%r15,8) 337 movq %rdx,%r13 338 339.Linner_enter: 340 mulq %rbx 341 addq %rax,%r11 342 movq (%rcx,%r15,8),%rax 343 adcq $0,%rdx 344 addq %r11,%r10 345 movq %rdx,%r11 346 adcq $0,%r11 347 leaq 1(%r15),%r15 348 349 mulq %rbp 350 cmpq %r9,%r15 351 jne .Linner 352 353 addq %rax,%r13 354 adcq $0,%rdx 355 addq %r10,%r13 356 movq (%rsp,%r9,8),%r10 357 adcq $0,%rdx 358 movq %r13,-16(%rsp,%r9,8) 359 movq %rdx,%r13 360 361 xorq %rdx,%rdx 362 addq %r11,%r13 363 adcq $0,%rdx 364 addq %r10,%r13 365 adcq $0,%rdx 366 movq %r13,-8(%rsp,%r9,8) 367 movq %rdx,(%rsp,%r9,8) 368 369 leaq 1(%r14),%r14 370 cmpq %r9,%r14 371 jb .Louter 372 373 xorq %r14,%r14 374 movq (%rsp),%rax 375 leaq (%rsp),%rsi 376 movq %r9,%r15 377 jmp .Lsub 378.align 16 379.Lsub: sbbq (%rcx,%r14,8),%rax 380 movq %rax,(%rdi,%r14,8) 381 movq 8(%rsi,%r14,8),%rax 382 leaq 1(%r14),%r14 383 decq %r15 384 jnz .Lsub 385 386 sbbq $0,%rax 387 xorq %r14,%r14 388 andq %rax,%rsi 389 notq %rax 390 movq %rdi,%rcx 391 andq %rax,%rcx 392 movq %r9,%r15 393 orq %rcx,%rsi 394.align 16 395.Lcopy: 396 movq (%rsi,%r14,8),%rax 397 movq %r14,(%rsp,%r14,8) 398 movq %rax,(%rdi,%r14,8) 399 leaq 1(%r14),%r14 400 subq $1,%r15 401 jnz .Lcopy 402 403 movq 8(%rsp,%r9,8),%rsi 404 movq $1,%rax 405 406 movq -48(%rsi),%r15 407 movq -40(%rsi),%r14 408 movq -32(%rsi),%r13 409 movq -24(%rsi),%r12 410 movq -16(%rsi),%rbp 411 movq 
-8(%rsi),%rbx 412 leaq (%rsi),%rsp /* end of bn_mul_mont_gather5: rsp is reloaded from %rsi before returning */ 413.Lmul_epilogue: 414 .byte 0xf3,0xc3 /* bytes F3 C3 encode "rep ret" */ 415.size bn_mul_mont_gather5,.-bn_mul_mont_gather5 /* bn_mul4x_mont_gather5: entered at .Lmul4x_enter with %r11d already loaded by the caller (bn_mul_mont_gather5 loads it from OPENSSL_ia32cap_P+8 before jumping here). Builds a 64-byte-aligned stack frame, calls mul4x_internal, restores the saved registers and returns 1 in %rax. */ 416.type bn_mul4x_mont_gather5,@function 417.align 32 418bn_mul4x_mont_gather5: 419.Lmul4x_enter: /* if every bit of mask 0x80108 is set in %r11d, use the .Lmulx4x_enter path instead */ 420 andl $0x80108,%r11d 421 cmpl $0x80108,%r11d 422 je .Lmulx4x_enter 423.byte 0x67 /* rax = stack pointer on entry; kept so the epilogue can find the saved registers */ 424 movq %rsp,%rax /* save rbx, rbp, r12-r15 (the SysV callee-saved set) */ 425 pushq %rbx 426 pushq %rbp 427 pushq %r12 428 pushq %r13 429 pushq %r14 430 pushq %r15 431 432.byte 0x67 /* r9 = r9d << 3 (32-bit shift, upper half zeroed); r10 = 3*r9; then r9 is negated */ 433 shll $3,%r9d 434 leaq (%r9,%r9,2),%r10 435 negq %r9 436 437 438 439 440 441 442 443 444 445 /* frame placement: r11 = (rsp - 320 + 2*r9 - rdi) & 4095, with r9 negative here. NOTE(review): presumably chosen to control how the frame and the buffer at %rdi alias modulo 4096 - confirm against the generator script */ 446 leaq -320(%rsp,%r9,2),%r11 447 subq %rdi,%r11 448 andq $4095,%r11 /* unsigned compare: take the alternative path when r10 < r11 */ 449 cmpq %r11,%r10 450 jb .Lmul4xsp_alt /* default path: rsp = rsp - r11 - 320 + 2*r9 */ 451 subq %r11,%rsp 452 leaq -320(%rsp,%r9,2),%rsp 453 jmp .Lmul4xsp_done 454 455.align 32 456.Lmul4xsp_alt: /* alternative path: r10 = 4096 - 320 + 2*r9; rsp = rsp - 320 + 2*r9; r11 = r11 - r10, replaced by 0 if that subtraction borrowed; rsp -= r11 */ 457 leaq 4096-320(,%r9,2),%r10 458 leaq -320(%rsp,%r9,2),%rsp 459 subq %r10,%r11 460 movq $0,%r10 461 cmovcq %r10,%r11 462 subq %r11,%rsp 463.Lmul4xsp_done: /* align the new stack pointer down to 64 bytes */ 464 andq $-64,%rsp /* r11 = (entry rsp - new rsp) rounded down to a multiple of 4096 */ 465 movq %rax,%r11 466 subq %rsp,%r11 467 andq $-4096,%r11 468.Lmul4x_page_walk: /* read one quadword at rsp+r11 and step r11 down by 4096 until the subtraction borrows; the loaded value is not used (%r10 is overwritten in mul4x_internal before being read) */ 469 movq (%rsp,%r11,1),%r10 470 subq $4096,%r11 471.byte 0x2e /* single prefix byte emitted ahead of the jnc */ 472 jnc .Lmul4x_page_walk 473 /* undo the earlier negation of r9 */ 474 negq %r9 475 /* remember the entry stack pointer in the frame at 40(%rsp) */ 476 movq %rax,40(%rsp) 477.Lmul4x_body: 478 479 call mul4x_internal 480 /* epilogue: rsi = entry stack pointer saved above; return value 1 */ 481 movq 40(%rsp),%rsi 482 movq $1,%rax 483 /* reload the six pushed registers from just below the entry stack pointer */ 484 movq -48(%rsi),%r15 485 movq -40(%rsi),%r14 486 movq -32(%rsi),%r13 487 movq -24(%rsi),%r12 488 movq -16(%rsi),%rbp 489 movq -8(%rsi),%rbx 490 leaq (%rsi),%rsp 491.Lmul4x_epilogue: 492 .byte 0xf3,0xc3 /* rep ret */ 493.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 494 /* mul4x_internal begins here and continues on the following lines of the listing; it reads a 32-bit value from 8(%rax), where %rax is the caller's entry stack pointer, and broadcasts it to all four lanes of %xmm5 */ 495.type mul4x_internal,@function 496.align 32 497mul4x_internal: 498 shlq $5,%r9 499 movd 8(%rax),%xmm5 500 leaq .Linc(%rip),%rax 501 leaq 128(%rdx,%r9,1),%r13 502 shrq $5,%r9 503 movdqa 0(%rax),%xmm0 504 movdqa 16(%rax),%xmm1 505 leaq 88-112(%rsp,%r9,1),%r10 506 leaq 128(%rdx),%r12 507 508 pshufd $0,%xmm5,%xmm5 509 movdqa %xmm1,%xmm4 510.byte 0x67,0x67 511 movdqa %xmm1,%xmm2 512 paddd %xmm0,%xmm1 513 pcmpeqd %xmm5,%xmm0 514.byte 0x67 515 movdqa %xmm4,%xmm3 516 paddd %xmm1,%xmm2 517 pcmpeqd 
%xmm5,%xmm1 518 movdqa %xmm0,112(%r10) 519 movdqa %xmm4,%xmm0 520 521 paddd %xmm2,%xmm3 522 pcmpeqd %xmm5,%xmm2 523 movdqa %xmm1,128(%r10) 524 movdqa %xmm4,%xmm1 525 526 paddd %xmm3,%xmm0 527 pcmpeqd %xmm5,%xmm3 528 movdqa %xmm2,144(%r10) 529 movdqa %xmm4,%xmm2 530 531 paddd %xmm0,%xmm1 532 pcmpeqd %xmm5,%xmm0 533 movdqa %xmm3,160(%r10) 534 movdqa %xmm4,%xmm3 535 paddd %xmm1,%xmm2 536 pcmpeqd %xmm5,%xmm1 537 movdqa %xmm0,176(%r10) 538 movdqa %xmm4,%xmm0 539 540 paddd %xmm2,%xmm3 541 pcmpeqd %xmm5,%xmm2 542 movdqa %xmm1,192(%r10) 543 movdqa %xmm4,%xmm1 544 545 paddd %xmm3,%xmm0 546 pcmpeqd %xmm5,%xmm3 547 movdqa %xmm2,208(%r10) 548 movdqa %xmm4,%xmm2 549 550 paddd %xmm0,%xmm1 551 pcmpeqd %xmm5,%xmm0 552 movdqa %xmm3,224(%r10) 553 movdqa %xmm4,%xmm3 554 paddd %xmm1,%xmm2 555 pcmpeqd %xmm5,%xmm1 556 movdqa %xmm0,240(%r10) 557 movdqa %xmm4,%xmm0 558 559 paddd %xmm2,%xmm3 560 pcmpeqd %xmm5,%xmm2 561 movdqa %xmm1,256(%r10) 562 movdqa %xmm4,%xmm1 563 564 paddd %xmm3,%xmm0 565 pcmpeqd %xmm5,%xmm3 566 movdqa %xmm2,272(%r10) 567 movdqa %xmm4,%xmm2 568 569 paddd %xmm0,%xmm1 570 pcmpeqd %xmm5,%xmm0 571 movdqa %xmm3,288(%r10) 572 movdqa %xmm4,%xmm3 573 paddd %xmm1,%xmm2 574 pcmpeqd %xmm5,%xmm1 575 movdqa %xmm0,304(%r10) 576 577 paddd %xmm2,%xmm3 578.byte 0x67 579 pcmpeqd %xmm5,%xmm2 580 movdqa %xmm1,320(%r10) 581 582 pcmpeqd %xmm5,%xmm3 583 movdqa %xmm2,336(%r10) 584 pand 64(%r12),%xmm0 585 586 pand 80(%r12),%xmm1 587 pand 96(%r12),%xmm2 588 movdqa %xmm3,352(%r10) 589 pand 112(%r12),%xmm3 590 por %xmm2,%xmm0 591 por %xmm3,%xmm1 592 movdqa -128(%r12),%xmm4 593 movdqa -112(%r12),%xmm5 594 movdqa -96(%r12),%xmm2 595 pand 112(%r10),%xmm4 596 movdqa -80(%r12),%xmm3 597 pand 128(%r10),%xmm5 598 por %xmm4,%xmm0 599 pand 144(%r10),%xmm2 600 por %xmm5,%xmm1 601 pand 160(%r10),%xmm3 602 por %xmm2,%xmm0 603 por %xmm3,%xmm1 604 movdqa -64(%r12),%xmm4 605 movdqa -48(%r12),%xmm5 606 movdqa -32(%r12),%xmm2 607 pand 176(%r10),%xmm4 608 movdqa -16(%r12),%xmm3 609 pand 192(%r10),%xmm5 610 por 
%xmm4,%xmm0 611 pand 208(%r10),%xmm2 612 por %xmm5,%xmm1 613 pand 224(%r10),%xmm3 614 por %xmm2,%xmm0 615 por %xmm3,%xmm1 616 movdqa 0(%r12),%xmm4 617 movdqa 16(%r12),%xmm5 618 movdqa 32(%r12),%xmm2 619 pand 240(%r10),%xmm4 620 movdqa 48(%r12),%xmm3 621 pand 256(%r10),%xmm5 622 por %xmm4,%xmm0 623 pand 272(%r10),%xmm2 624 por %xmm5,%xmm1 625 pand 288(%r10),%xmm3 626 por %xmm2,%xmm0 627 por %xmm3,%xmm1 628 por %xmm1,%xmm0 629 pshufd $0x4e,%xmm0,%xmm1 630 por %xmm1,%xmm0 631 leaq 256(%r12),%r12 632.byte 102,72,15,126,195 633 634 movq %r13,16+8(%rsp) 635 movq %rdi,56+8(%rsp) 636 637 movq (%r8),%r8 638 movq (%rsi),%rax 639 leaq (%rsi,%r9,1),%rsi 640 negq %r9 641 642 movq %r8,%rbp 643 mulq %rbx 644 movq %rax,%r10 645 movq (%rcx),%rax 646 647 imulq %r10,%rbp 648 leaq 64+8(%rsp),%r14 649 movq %rdx,%r11 650 651 mulq %rbp 652 addq %rax,%r10 653 movq 8(%rsi,%r9,1),%rax 654 adcq $0,%rdx 655 movq %rdx,%rdi 656 657 mulq %rbx 658 addq %rax,%r11 659 movq 8(%rcx),%rax 660 adcq $0,%rdx 661 movq %rdx,%r10 662 663 mulq %rbp 664 addq %rax,%rdi 665 movq 16(%rsi,%r9,1),%rax 666 adcq $0,%rdx 667 addq %r11,%rdi 668 leaq 32(%r9),%r15 669 leaq 32(%rcx),%rcx 670 adcq $0,%rdx 671 movq %rdi,(%r14) 672 movq %rdx,%r13 673 jmp .L1st4x 674 675.align 32 676.L1st4x: 677 mulq %rbx 678 addq %rax,%r10 679 movq -16(%rcx),%rax 680 leaq 32(%r14),%r14 681 adcq $0,%rdx 682 movq %rdx,%r11 683 684 mulq %rbp 685 addq %rax,%r13 686 movq -8(%rsi,%r15,1),%rax 687 adcq $0,%rdx 688 addq %r10,%r13 689 adcq $0,%rdx 690 movq %r13,-24(%r14) 691 movq %rdx,%rdi 692 693 mulq %rbx 694 addq %rax,%r11 695 movq -8(%rcx),%rax 696 adcq $0,%rdx 697 movq %rdx,%r10 698 699 mulq %rbp 700 addq %rax,%rdi 701 movq (%rsi,%r15,1),%rax 702 adcq $0,%rdx 703 addq %r11,%rdi 704 adcq $0,%rdx 705 movq %rdi,-16(%r14) 706 movq %rdx,%r13 707 708 mulq %rbx 709 addq %rax,%r10 710 movq 0(%rcx),%rax 711 adcq $0,%rdx 712 movq %rdx,%r11 713 714 mulq %rbp 715 addq %rax,%r13 716 movq 8(%rsi,%r15,1),%rax 717 adcq $0,%rdx 718 addq %r10,%r13 719 adcq 
$0,%rdx 720 movq %r13,-8(%r14) 721 movq %rdx,%rdi 722 723 mulq %rbx 724 addq %rax,%r11 725 movq 8(%rcx),%rax 726 adcq $0,%rdx 727 movq %rdx,%r10 728 729 mulq %rbp 730 addq %rax,%rdi 731 movq 16(%rsi,%r15,1),%rax 732 adcq $0,%rdx 733 addq %r11,%rdi 734 leaq 32(%rcx),%rcx 735 adcq $0,%rdx 736 movq %rdi,(%r14) 737 movq %rdx,%r13 738 739 addq $32,%r15 740 jnz .L1st4x 741 742 mulq %rbx 743 addq %rax,%r10 744 movq -16(%rcx),%rax 745 leaq 32(%r14),%r14 746 adcq $0,%rdx 747 movq %rdx,%r11 748 749 mulq %rbp 750 addq %rax,%r13 751 movq -8(%rsi),%rax 752 adcq $0,%rdx 753 addq %r10,%r13 754 adcq $0,%rdx 755 movq %r13,-24(%r14) 756 movq %rdx,%rdi 757 758 mulq %rbx 759 addq %rax,%r11 760 movq -8(%rcx),%rax 761 adcq $0,%rdx 762 movq %rdx,%r10 763 764 mulq %rbp 765 addq %rax,%rdi 766 movq (%rsi,%r9,1),%rax 767 adcq $0,%rdx 768 addq %r11,%rdi 769 adcq $0,%rdx 770 movq %rdi,-16(%r14) 771 movq %rdx,%r13 772 773 leaq (%rcx,%r9,1),%rcx 774 775 xorq %rdi,%rdi 776 addq %r10,%r13 777 adcq $0,%rdi 778 movq %r13,-8(%r14) 779 780 jmp .Louter4x 781 782.align 32 783.Louter4x: 784 leaq 16+128(%r14),%rdx 785 pxor %xmm4,%xmm4 786 pxor %xmm5,%xmm5 787 movdqa -128(%r12),%xmm0 788 movdqa -112(%r12),%xmm1 789 movdqa -96(%r12),%xmm2 790 movdqa -80(%r12),%xmm3 791 pand -128(%rdx),%xmm0 792 pand -112(%rdx),%xmm1 793 por %xmm0,%xmm4 794 pand -96(%rdx),%xmm2 795 por %xmm1,%xmm5 796 pand -80(%rdx),%xmm3 797 por %xmm2,%xmm4 798 por %xmm3,%xmm5 799 movdqa -64(%r12),%xmm0 800 movdqa -48(%r12),%xmm1 801 movdqa -32(%r12),%xmm2 802 movdqa -16(%r12),%xmm3 803 pand -64(%rdx),%xmm0 804 pand -48(%rdx),%xmm1 805 por %xmm0,%xmm4 806 pand -32(%rdx),%xmm2 807 por %xmm1,%xmm5 808 pand -16(%rdx),%xmm3 809 por %xmm2,%xmm4 810 por %xmm3,%xmm5 811 movdqa 0(%r12),%xmm0 812 movdqa 16(%r12),%xmm1 813 movdqa 32(%r12),%xmm2 814 movdqa 48(%r12),%xmm3 815 pand 0(%rdx),%xmm0 816 pand 16(%rdx),%xmm1 817 por %xmm0,%xmm4 818 pand 32(%rdx),%xmm2 819 por %xmm1,%xmm5 820 pand 48(%rdx),%xmm3 821 por %xmm2,%xmm4 822 por %xmm3,%xmm5 823 
movdqa 64(%r12),%xmm0 824 movdqa 80(%r12),%xmm1 825 movdqa 96(%r12),%xmm2 826 movdqa 112(%r12),%xmm3 827 pand 64(%rdx),%xmm0 828 pand 80(%rdx),%xmm1 829 por %xmm0,%xmm4 830 pand 96(%rdx),%xmm2 831 por %xmm1,%xmm5 832 pand 112(%rdx),%xmm3 833 por %xmm2,%xmm4 834 por %xmm3,%xmm5 835 por %xmm5,%xmm4 836 pshufd $0x4e,%xmm4,%xmm0 837 por %xmm4,%xmm0 838 leaq 256(%r12),%r12 839.byte 102,72,15,126,195 840 841 movq (%r14,%r9,1),%r10 842 movq %r8,%rbp 843 mulq %rbx 844 addq %rax,%r10 845 movq (%rcx),%rax 846 adcq $0,%rdx 847 848 imulq %r10,%rbp 849 movq %rdx,%r11 850 movq %rdi,(%r14) 851 852 leaq (%r14,%r9,1),%r14 853 854 mulq %rbp 855 addq %rax,%r10 856 movq 8(%rsi,%r9,1),%rax 857 adcq $0,%rdx 858 movq %rdx,%rdi 859 860 mulq %rbx 861 addq %rax,%r11 862 movq 8(%rcx),%rax 863 adcq $0,%rdx 864 addq 8(%r14),%r11 865 adcq $0,%rdx 866 movq %rdx,%r10 867 868 mulq %rbp 869 addq %rax,%rdi 870 movq 16(%rsi,%r9,1),%rax 871 adcq $0,%rdx 872 addq %r11,%rdi 873 leaq 32(%r9),%r15 874 leaq 32(%rcx),%rcx 875 adcq $0,%rdx 876 movq %rdx,%r13 877 jmp .Linner4x 878 879.align 32 880.Linner4x: 881 mulq %rbx 882 addq %rax,%r10 883 movq -16(%rcx),%rax 884 adcq $0,%rdx 885 addq 16(%r14),%r10 886 leaq 32(%r14),%r14 887 adcq $0,%rdx 888 movq %rdx,%r11 889 890 mulq %rbp 891 addq %rax,%r13 892 movq -8(%rsi,%r15,1),%rax 893 adcq $0,%rdx 894 addq %r10,%r13 895 adcq $0,%rdx 896 movq %rdi,-32(%r14) 897 movq %rdx,%rdi 898 899 mulq %rbx 900 addq %rax,%r11 901 movq -8(%rcx),%rax 902 adcq $0,%rdx 903 addq -8(%r14),%r11 904 adcq $0,%rdx 905 movq %rdx,%r10 906 907 mulq %rbp 908 addq %rax,%rdi 909 movq (%rsi,%r15,1),%rax 910 adcq $0,%rdx 911 addq %r11,%rdi 912 adcq $0,%rdx 913 movq %r13,-24(%r14) 914 movq %rdx,%r13 915 916 mulq %rbx 917 addq %rax,%r10 918 movq 0(%rcx),%rax 919 adcq $0,%rdx 920 addq (%r14),%r10 921 adcq $0,%rdx 922 movq %rdx,%r11 923 924 mulq %rbp 925 addq %rax,%r13 926 movq 8(%rsi,%r15,1),%rax 927 adcq $0,%rdx 928 addq %r10,%r13 929 adcq $0,%rdx 930 movq %rdi,-16(%r14) 931 movq %rdx,%rdi 932 933 
mulq %rbx 934 addq %rax,%r11 935 movq 8(%rcx),%rax 936 adcq $0,%rdx 937 addq 8(%r14),%r11 938 adcq $0,%rdx 939 movq %rdx,%r10 940 941 mulq %rbp 942 addq %rax,%rdi 943 movq 16(%rsi,%r15,1),%rax 944 adcq $0,%rdx 945 addq %r11,%rdi 946 leaq 32(%rcx),%rcx 947 adcq $0,%rdx 948 movq %r13,-8(%r14) 949 movq %rdx,%r13 950 /* .Linner4x advances %r15 by 32 per pass and repeats until it reaches zero */ 951 addq $32,%r15 952 jnz .Linner4x 953 954 mulq %rbx 955 addq %rax,%r10 956 movq -16(%rcx),%rax 957 adcq $0,%rdx 958 addq 16(%r14),%r10 959 leaq 32(%r14),%r14 960 adcq $0,%rdx 961 movq %rdx,%r11 962 963 mulq %rbp 964 addq %rax,%r13 965 movq -8(%rsi),%rax 966 adcq $0,%rdx 967 addq %r10,%r13 968 adcq $0,%rdx 969 movq %rdi,-32(%r14) 970 movq %rdx,%rdi 971 972 mulq %rbx 973 addq %rax,%r11 974 movq %rbp,%rax 975 movq -8(%rcx),%rbp 976 adcq $0,%rdx 977 addq -8(%r14),%r11 978 adcq $0,%rdx 979 movq %rdx,%r10 980 981 mulq %rbp 982 addq %rax,%rdi 983 movq (%rsi,%r9,1),%rax 984 adcq $0,%rdx 985 addq %r11,%rdi 986 adcq $0,%rdx 987 movq %r13,-24(%r14) 988 movq %rdx,%r13 989 990 movq %rdi,-16(%r14) 991 leaq (%rcx,%r9,1),%rcx 992 993 xorq %rdi,%rdi 994 addq %r10,%r13 995 adcq $0,%rdi 996 addq (%r14),%r13 997 adcq $0,%rdi 998 movq %r13,-8(%r14) 999 /* repeat .Louter4x while %r12 is below (unsigned) the bound stored at 16+8(%rsp) */ 1000 cmpq 16+8(%rsp),%r12 1001 jb .Louter4x 1002 xorq %rax,%rax 1003 subq %r13,%rbp 1004 adcq %r15,%r15 1005 orq %r15,%rdi 1006 subq %rdi,%rax 1007 leaq (%r14,%r9,1),%rbx 1008 movq (%rcx),%r12 1009 leaq (%rcx),%rbp 1010 movq %r9,%rcx 1011 sarq $3+2,%rcx 1012 movq 56+8(%rsp),%rdi 1013 decq %r12 1014 xorq %r10,%r10 1015 movq 8(%rbp),%r13 1016 movq 16(%rbp),%r14 1017 movq 24(%rbp),%r15 /* mul4x_internal has no ret of its own: it jumps into .Lsqr4x_sub_entry and returns from there */ 1018 jmp .Lsqr4x_sub_entry 1019.size mul4x_internal,.-mul4x_internal /* bn_power5: global entry point. Builds a 64-byte-aligned stack frame, runs five rounds of __bn_sqr8x_internal followed by __bn_post4x_internal, then one mul4x_internal call, and returns 1 in %rax. */ 1020.globl bn_power5 1021.type bn_power5,@function 1022.align 32 1023bn_power5: /* if every bit of mask 0x80108 is set in the word at OPENSSL_ia32cap_P+8, use the .Lpowerx5_enter path instead */ 1024 movl OPENSSL_ia32cap_P+8(%rip),%r11d 1025 andl $0x80108,%r11d 1026 cmpl $0x80108,%r11d 1027 je .Lpowerx5_enter /* rax = stack pointer on entry; then save rbx, rbp, r12-r15 (the SysV callee-saved set) */ 1028 movq %rsp,%rax 1029 pushq %rbx 1030 pushq %rbp 1031 pushq %r12 1032 pushq %r13 1033 pushq %r14 1034 pushq %r15 1035 /* r9 = r9d << 3; r10d = 3*r9 (32-bit leal here, where the sibling prologues use leaq); then r9 is negated */ 1036 shll $3,%r9d 1037 leal (%r9,%r9,2),%r10d 1038 negq %r9 1039 
movq (%r8),%r8 /* previous instruction: r8 is replaced by the quadword it points to */ 1040 1041 1042 1043 1044 1045 1046 1047 /* frame placement: r11 = (rsp - 320 + 2*r9 - rdi) & 4095, with r9 negative here. NOTE(review): presumably chosen to control how the frame and the buffer at %rdi alias modulo 4096 - confirm against the generator script */ 1048 leaq -320(%rsp,%r9,2),%r11 1049 subq %rdi,%r11 1050 andq $4095,%r11 /* unsigned compare: take the alternative path when r10 < r11 */ 1051 cmpq %r11,%r10 1052 jb .Lpwr_sp_alt 1053 subq %r11,%rsp 1054 leaq -320(%rsp,%r9,2),%rsp 1055 jmp .Lpwr_sp_done 1056 1057.align 32 1058.Lpwr_sp_alt: /* alternative path: r11 = r11 - (4096 - 320 + 2*r9), replaced by 0 if that subtraction borrowed */ 1059 leaq 4096-320(,%r9,2),%r10 1060 leaq -320(%rsp,%r9,2),%rsp 1061 subq %r10,%r11 1062 movq $0,%r10 1063 cmovcq %r10,%r11 1064 subq %r11,%rsp 1065.Lpwr_sp_done: /* align the new stack pointer down to 64 bytes */ 1066 andq $-64,%rsp /* r11 = (entry rsp - new rsp) rounded down to a multiple of 4096 */ 1067 movq %rax,%r11 1068 subq %rsp,%r11 1069 andq $-4096,%r11 1070.Lpwr_page_walk: /* read one quadword at rsp+r11 and step r11 down by 4096 until the subtraction borrows; the loaded value is discarded (%r10 is overwritten right after the loop) */ 1071 movq (%rsp,%r11,1),%r10 1072 subq $4096,%r11 1073.byte 0x2e /* single prefix byte emitted ahead of the jnc */ 1074 jnc .Lpwr_page_walk 1075 /* r10 keeps the negated r9; r9 itself is made positive again */ 1076 movq %r9,%r10 1077 negq %r9 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 /* frame slots: 32(%rsp) = value loaded from (%r8), 40(%rsp) = entry stack pointer */ 1088 movq %r8,32(%rsp) 1089 movq %rax,40(%rsp) 1090.Lpower5_body: /* the four .byte rows are hand-encoded 66 REX.W 0F 6E moves: movq %rdi,%xmm1 ; movq %rcx,%xmm2 ; movq %r10,%xmm3 ; movq %rdx,%xmm4 */ 1091.byte 102,72,15,110,207 1092.byte 102,72,15,110,209 1093.byte 102,73,15,110,218 1094.byte 102,72,15,110,226 1095 /* five identical square-then-post-process rounds */ 1096 call __bn_sqr8x_internal 1097 call __bn_post4x_internal 1098 call __bn_sqr8x_internal 1099 call __bn_post4x_internal 1100 call __bn_sqr8x_internal 1101 call __bn_post4x_internal 1102 call __bn_sqr8x_internal 1103 call __bn_post4x_internal 1104 call __bn_sqr8x_internal 1105 call __bn_post4x_internal 1106 /* hand-encoded 66 REX.W 0F 7E moves: movq %xmm2,%rcx ; movq %xmm4,%rdx (recover the values stashed above) */ 1107.byte 102,72,15,126,209 1108.byte 102,72,15,126,226 /* arguments for the final multiply: rdi = rsi, rax = saved entry stack pointer, r8 = address of the slot at 32(%rsp) */ 1109 movq %rsi,%rdi 1110 movq 40(%rsp),%rax 1111 leaq 32(%rsp),%r8 1112 1113 call mul4x_internal 1114 /* epilogue: rsi = entry stack pointer; return value 1; reload the six pushed registers */ 1115 movq 40(%rsp),%rsi 1116 movq $1,%rax 1117 movq -48(%rsi),%r15 1118 movq -40(%rsi),%r14 1119 movq -32(%rsi),%r13 1120 movq -24(%rsi),%r12 1121 movq -16(%rsi),%rbp 1122 movq -8(%rsi),%rbx 1123 leaq (%rsi),%rsp 1124.Lpower5_epilogue: 1125 .byte 0xf3,0xc3 /* rep ret */ 1126.size bn_power5,.-bn_power5 1127 /* bn_sqr8x_internal and __bn_sqr8x_internal label the same address; the first is exported with hidden visibility, the second is the name bn_power5 calls */ 1128.globl bn_sqr8x_internal 1129.hidden bn_sqr8x_internal 1130.type bn_sqr8x_internal,@function 1131.align 32 1132bn_sqr8x_internal: 1133__bn_sqr8x_internal: 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 
1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 leaq 32(%r10),%rbp 1208 leaq (%rsi,%r9,1),%rsi 1209 1210 movq %r9,%rcx 1211 1212 1213 movq -32(%rsi,%rbp,1),%r14 1214 leaq 48+8(%rsp,%r9,2),%rdi 1215 movq -24(%rsi,%rbp,1),%rax 1216 leaq -32(%rdi,%rbp,1),%rdi 1217 movq -16(%rsi,%rbp,1),%rbx 1218 movq %rax,%r15 1219 1220 mulq %r14 1221 movq %rax,%r10 1222 movq %rbx,%rax 1223 movq %rdx,%r11 1224 movq %r10,-24(%rdi,%rbp,1) 1225 1226 mulq %r14 1227 addq %rax,%r11 1228 movq %rbx,%rax 1229 adcq $0,%rdx 1230 movq %r11,-16(%rdi,%rbp,1) 1231 movq %rdx,%r10 1232 1233 1234 movq -8(%rsi,%rbp,1),%rbx 1235 mulq %r15 1236 movq %rax,%r12 1237 movq %rbx,%rax 1238 movq %rdx,%r13 1239 1240 leaq (%rbp),%rcx 1241 mulq %r14 1242 addq %rax,%r10 1243 movq %rbx,%rax 1244 movq %rdx,%r11 1245 adcq $0,%r11 1246 addq %r12,%r10 1247 adcq $0,%r11 1248 movq %r10,-8(%rdi,%rcx,1) 1249 jmp .Lsqr4x_1st 1250 1251.align 32 1252.Lsqr4x_1st: 1253 movq (%rsi,%rcx,1),%rbx 1254 mulq %r15 1255 addq %rax,%r13 1256 movq %rbx,%rax 1257 movq %rdx,%r12 1258 adcq $0,%r12 1259 1260 mulq %r14 1261 addq %rax,%r11 1262 movq %rbx,%rax 1263 movq 8(%rsi,%rcx,1),%rbx 1264 movq %rdx,%r10 1265 adcq $0,%r10 1266 addq %r13,%r11 1267 adcq $0,%r10 1268 1269 1270 mulq %r15 1271 addq %rax,%r12 1272 movq %rbx,%rax 1273 movq %r11,(%rdi,%rcx,1) 1274 movq %rdx,%r13 1275 adcq $0,%r13 1276 1277 mulq %r14 1278 addq %rax,%r10 1279 movq %rbx,%rax 1280 movq 16(%rsi,%rcx,1),%rbx 1281 movq %rdx,%r11 1282 adcq $0,%r11 1283 addq %r12,%r10 1284 adcq $0,%r11 1285 1286 mulq %r15 1287 addq %rax,%r13 1288 movq %rbx,%rax 1289 movq %r10,8(%rdi,%rcx,1) 1290 movq %rdx,%r12 1291 adcq $0,%r12 1292 1293 mulq %r14 1294 addq %rax,%r11 1295 movq %rbx,%rax 1296 movq 24(%rsi,%rcx,1),%rbx 1297 movq %rdx,%r10 1298 adcq $0,%r10 1299 addq %r13,%r11 1300 adcq $0,%r10 1301 1302 1303 mulq %r15 1304 addq %rax,%r12 1305 
movq %rbx,%rax 1306 movq %r11,16(%rdi,%rcx,1) 1307 movq %rdx,%r13 1308 adcq $0,%r13 1309 leaq 32(%rcx),%rcx 1310 1311 mulq %r14 1312 addq %rax,%r10 1313 movq %rbx,%rax 1314 movq %rdx,%r11 1315 adcq $0,%r11 1316 addq %r12,%r10 1317 adcq $0,%r11 1318 movq %r10,-8(%rdi,%rcx,1) 1319 1320 cmpq $0,%rcx 1321 jne .Lsqr4x_1st 1322 1323 mulq %r15 1324 addq %rax,%r13 1325 leaq 16(%rbp),%rbp 1326 adcq $0,%rdx 1327 addq %r11,%r13 1328 adcq $0,%rdx 1329 1330 movq %r13,(%rdi) 1331 movq %rdx,%r12 1332 movq %rdx,8(%rdi) 1333 jmp .Lsqr4x_outer 1334 1335.align 32 1336.Lsqr4x_outer: 1337 movq -32(%rsi,%rbp,1),%r14 1338 leaq 48+8(%rsp,%r9,2),%rdi 1339 movq -24(%rsi,%rbp,1),%rax 1340 leaq -32(%rdi,%rbp,1),%rdi 1341 movq -16(%rsi,%rbp,1),%rbx 1342 movq %rax,%r15 1343 1344 mulq %r14 1345 movq -24(%rdi,%rbp,1),%r10 1346 addq %rax,%r10 1347 movq %rbx,%rax 1348 adcq $0,%rdx 1349 movq %r10,-24(%rdi,%rbp,1) 1350 movq %rdx,%r11 1351 1352 mulq %r14 1353 addq %rax,%r11 1354 movq %rbx,%rax 1355 adcq $0,%rdx 1356 addq -16(%rdi,%rbp,1),%r11 1357 movq %rdx,%r10 1358 adcq $0,%r10 1359 movq %r11,-16(%rdi,%rbp,1) 1360 1361 xorq %r12,%r12 1362 1363 movq -8(%rsi,%rbp,1),%rbx 1364 mulq %r15 1365 addq %rax,%r12 1366 movq %rbx,%rax 1367 adcq $0,%rdx 1368 addq -8(%rdi,%rbp,1),%r12 1369 movq %rdx,%r13 1370 adcq $0,%r13 1371 1372 mulq %r14 1373 addq %rax,%r10 1374 movq %rbx,%rax 1375 adcq $0,%rdx 1376 addq %r12,%r10 1377 movq %rdx,%r11 1378 adcq $0,%r11 1379 movq %r10,-8(%rdi,%rbp,1) 1380 1381 leaq (%rbp),%rcx 1382 jmp .Lsqr4x_inner 1383 1384.align 32 1385.Lsqr4x_inner: 1386 movq (%rsi,%rcx,1),%rbx 1387 mulq %r15 1388 addq %rax,%r13 1389 movq %rbx,%rax 1390 movq %rdx,%r12 1391 adcq $0,%r12 1392 addq (%rdi,%rcx,1),%r13 1393 adcq $0,%r12 1394 1395.byte 0x67 1396 mulq %r14 1397 addq %rax,%r11 1398 movq %rbx,%rax 1399 movq 8(%rsi,%rcx,1),%rbx 1400 movq %rdx,%r10 1401 adcq $0,%r10 1402 addq %r13,%r11 1403 adcq $0,%r10 1404 1405 mulq %r15 1406 addq %rax,%r12 1407 movq %r11,(%rdi,%rcx,1) 1408 movq %rbx,%rax 1409 movq 
%rdx,%r13 1410 adcq $0,%r13 1411 addq 8(%rdi,%rcx,1),%r12 1412 leaq 16(%rcx),%rcx 1413 adcq $0,%r13 1414 1415 mulq %r14 1416 addq %rax,%r10 1417 movq %rbx,%rax 1418 adcq $0,%rdx 1419 addq %r12,%r10 1420 movq %rdx,%r11 1421 adcq $0,%r11 1422 movq %r10,-8(%rdi,%rcx,1) 1423 1424 cmpq $0,%rcx 1425 jne .Lsqr4x_inner 1426 1427.byte 0x67 1428 mulq %r15 1429 addq %rax,%r13 1430 adcq $0,%rdx 1431 addq %r11,%r13 1432 adcq $0,%rdx 1433 1434 movq %r13,(%rdi) 1435 movq %rdx,%r12 1436 movq %rdx,8(%rdi) 1437 1438 addq $16,%rbp 1439 jnz .Lsqr4x_outer 1440 1441 1442 movq -32(%rsi),%r14 1443 leaq 48+8(%rsp,%r9,2),%rdi 1444 movq -24(%rsi),%rax 1445 leaq -32(%rdi,%rbp,1),%rdi 1446 movq -16(%rsi),%rbx 1447 movq %rax,%r15 1448 1449 mulq %r14 1450 addq %rax,%r10 1451 movq %rbx,%rax 1452 movq %rdx,%r11 1453 adcq $0,%r11 1454 1455 mulq %r14 1456 addq %rax,%r11 1457 movq %rbx,%rax 1458 movq %r10,-24(%rdi) 1459 movq %rdx,%r10 1460 adcq $0,%r10 1461 addq %r13,%r11 1462 movq -8(%rsi),%rbx 1463 adcq $0,%r10 1464 1465 mulq %r15 1466 addq %rax,%r12 1467 movq %rbx,%rax 1468 movq %r11,-16(%rdi) 1469 movq %rdx,%r13 1470 adcq $0,%r13 1471 1472 mulq %r14 1473 addq %rax,%r10 1474 movq %rbx,%rax 1475 movq %rdx,%r11 1476 adcq $0,%r11 1477 addq %r12,%r10 1478 adcq $0,%r11 1479 movq %r10,-8(%rdi) 1480 1481 mulq %r15 1482 addq %rax,%r13 1483 movq -16(%rsi),%rax 1484 adcq $0,%rdx 1485 addq %r11,%r13 1486 adcq $0,%rdx 1487 1488 movq %r13,(%rdi) 1489 movq %rdx,%r12 1490 movq %rdx,8(%rdi) 1491 1492 mulq %rbx 1493 addq $16,%rbp 1494 xorq %r14,%r14 1495 subq %r9,%rbp 1496 xorq %r15,%r15 1497 1498 addq %r12,%rax 1499 adcq $0,%rdx 1500 movq %rax,8(%rdi) 1501 movq %rdx,16(%rdi) 1502 movq %r15,24(%rdi) 1503 1504 movq -16(%rsi,%rbp,1),%rax 1505 leaq 48+8(%rsp),%rdi 1506 xorq %r10,%r10 1507 movq 8(%rdi),%r11 1508 1509 leaq (%r14,%r10,2),%r12 1510 shrq $63,%r10 1511 leaq (%rcx,%r11,2),%r13 1512 shrq $63,%r11 1513 orq %r10,%r13 1514 movq 16(%rdi),%r10 1515 movq %r11,%r14 1516 mulq %rax 1517 negq %r15 1518 movq 
24(%rdi),%r11 1519 adcq %rax,%r12 1520 movq -8(%rsi,%rbp,1),%rax 1521 movq %r12,(%rdi) 1522 adcq %rdx,%r13 1523 1524 leaq (%r14,%r10,2),%rbx 1525 movq %r13,8(%rdi) 1526 sbbq %r15,%r15 1527 shrq $63,%r10 1528 leaq (%rcx,%r11,2),%r8 1529 shrq $63,%r11 1530 orq %r10,%r8 1531 movq 32(%rdi),%r10 1532 movq %r11,%r14 1533 mulq %rax 1534 negq %r15 1535 movq 40(%rdi),%r11 1536 adcq %rax,%rbx 1537 movq 0(%rsi,%rbp,1),%rax 1538 movq %rbx,16(%rdi) 1539 adcq %rdx,%r8 1540 leaq 16(%rbp),%rbp 1541 movq %r8,24(%rdi) 1542 sbbq %r15,%r15 1543 leaq 64(%rdi),%rdi 1544 jmp .Lsqr4x_shift_n_add 1545 1546.align 32 1547.Lsqr4x_shift_n_add: 1548 leaq (%r14,%r10,2),%r12 1549 shrq $63,%r10 1550 leaq (%rcx,%r11,2),%r13 1551 shrq $63,%r11 1552 orq %r10,%r13 1553 movq -16(%rdi),%r10 1554 movq %r11,%r14 1555 mulq %rax 1556 negq %r15 1557 movq -8(%rdi),%r11 1558 adcq %rax,%r12 1559 movq -8(%rsi,%rbp,1),%rax 1560 movq %r12,-32(%rdi) 1561 adcq %rdx,%r13 1562 1563 leaq (%r14,%r10,2),%rbx 1564 movq %r13,-24(%rdi) 1565 sbbq %r15,%r15 1566 shrq $63,%r10 1567 leaq (%rcx,%r11,2),%r8 1568 shrq $63,%r11 1569 orq %r10,%r8 1570 movq 0(%rdi),%r10 1571 movq %r11,%r14 1572 mulq %rax 1573 negq %r15 1574 movq 8(%rdi),%r11 1575 adcq %rax,%rbx 1576 movq 0(%rsi,%rbp,1),%rax 1577 movq %rbx,-16(%rdi) 1578 adcq %rdx,%r8 1579 1580 leaq (%r14,%r10,2),%r12 1581 movq %r8,-8(%rdi) 1582 sbbq %r15,%r15 1583 shrq $63,%r10 1584 leaq (%rcx,%r11,2),%r13 1585 shrq $63,%r11 1586 orq %r10,%r13 1587 movq 16(%rdi),%r10 1588 movq %r11,%r14 1589 mulq %rax 1590 negq %r15 1591 movq 24(%rdi),%r11 1592 adcq %rax,%r12 1593 movq 8(%rsi,%rbp,1),%rax 1594 movq %r12,0(%rdi) 1595 adcq %rdx,%r13 1596 1597 leaq (%r14,%r10,2),%rbx 1598 movq %r13,8(%rdi) 1599 sbbq %r15,%r15 1600 shrq $63,%r10 1601 leaq (%rcx,%r11,2),%r8 1602 shrq $63,%r11 1603 orq %r10,%r8 1604 movq 32(%rdi),%r10 1605 movq %r11,%r14 1606 mulq %rax 1607 negq %r15 1608 movq 40(%rdi),%r11 1609 adcq %rax,%rbx 1610 movq 16(%rsi,%rbp,1),%rax 1611 movq %rbx,16(%rdi) 1612 adcq %rdx,%r8 1613 
movq %r8,24(%rdi) 1614 sbbq %r15,%r15 1615 leaq 64(%rdi),%rdi 1616 addq $32,%rbp 1617 jnz .Lsqr4x_shift_n_add 1618 1619 leaq (%r14,%r10,2),%r12 1620.byte 0x67 1621 shrq $63,%r10 1622 leaq (%rcx,%r11,2),%r13 1623 shrq $63,%r11 1624 orq %r10,%r13 1625 movq -16(%rdi),%r10 1626 movq %r11,%r14 1627 mulq %rax 1628 negq %r15 1629 movq -8(%rdi),%r11 1630 adcq %rax,%r12 1631 movq -8(%rsi),%rax 1632 movq %r12,-32(%rdi) 1633 adcq %rdx,%r13 1634 1635 leaq (%r14,%r10,2),%rbx 1636 movq %r13,-24(%rdi) 1637 sbbq %r15,%r15 1638 shrq $63,%r10 1639 leaq (%rcx,%r11,2),%r8 1640 shrq $63,%r11 1641 orq %r10,%r8 1642 mulq %rax 1643 negq %r15 1644 adcq %rax,%rbx 1645 adcq %rdx,%r8 1646 movq %rbx,-16(%rdi) 1647 movq %r8,-8(%rdi) 1648.byte 102,72,15,126,213 1649__bn_sqr8x_reduction: 1650 xorq %rax,%rax 1651 leaq (%r9,%rbp,1),%rcx 1652 leaq 48+8(%rsp,%r9,2),%rdx 1653 movq %rcx,0+8(%rsp) 1654 leaq 48+8(%rsp,%r9,1),%rdi 1655 movq %rdx,8+8(%rsp) 1656 negq %r9 1657 jmp .L8x_reduction_loop 1658 1659.align 32 1660.L8x_reduction_loop: 1661 leaq (%rdi,%r9,1),%rdi 1662.byte 0x66 1663 movq 0(%rdi),%rbx 1664 movq 8(%rdi),%r9 1665 movq 16(%rdi),%r10 1666 movq 24(%rdi),%r11 1667 movq 32(%rdi),%r12 1668 movq 40(%rdi),%r13 1669 movq 48(%rdi),%r14 1670 movq 56(%rdi),%r15 1671 movq %rax,(%rdx) 1672 leaq 64(%rdi),%rdi 1673 1674.byte 0x67 1675 movq %rbx,%r8 1676 imulq 32+8(%rsp),%rbx 1677 movq 0(%rbp),%rax 1678 movl $8,%ecx 1679 jmp .L8x_reduce 1680 1681.align 32 1682.L8x_reduce: 1683 mulq %rbx 1684 movq 8(%rbp),%rax 1685 negq %r8 1686 movq %rdx,%r8 1687 adcq $0,%r8 1688 1689 mulq %rbx 1690 addq %rax,%r9 1691 movq 16(%rbp),%rax 1692 adcq $0,%rdx 1693 addq %r9,%r8 1694 movq %rbx,48-8+8(%rsp,%rcx,8) 1695 movq %rdx,%r9 1696 adcq $0,%r9 1697 1698 mulq %rbx 1699 addq %rax,%r10 1700 movq 24(%rbp),%rax 1701 adcq $0,%rdx 1702 addq %r10,%r9 1703 movq 32+8(%rsp),%rsi 1704 movq %rdx,%r10 1705 adcq $0,%r10 1706 1707 mulq %rbx 1708 addq %rax,%r11 1709 movq 32(%rbp),%rax 1710 adcq $0,%rdx 1711 imulq %r8,%rsi 1712 addq 
%r11,%r10 1713 movq %rdx,%r11 1714 adcq $0,%r11 1715 1716 mulq %rbx 1717 addq %rax,%r12 1718 movq 40(%rbp),%rax 1719 adcq $0,%rdx 1720 addq %r12,%r11 1721 movq %rdx,%r12 1722 adcq $0,%r12 1723 1724 mulq %rbx 1725 addq %rax,%r13 1726 movq 48(%rbp),%rax 1727 adcq $0,%rdx 1728 addq %r13,%r12 1729 movq %rdx,%r13 1730 adcq $0,%r13 1731 1732 mulq %rbx 1733 addq %rax,%r14 1734 movq 56(%rbp),%rax 1735 adcq $0,%rdx 1736 addq %r14,%r13 1737 movq %rdx,%r14 1738 adcq $0,%r14 1739 1740 mulq %rbx 1741 movq %rsi,%rbx 1742 addq %rax,%r15 1743 movq 0(%rbp),%rax 1744 adcq $0,%rdx 1745 addq %r15,%r14 1746 movq %rdx,%r15 1747 adcq $0,%r15 1748 1749 decl %ecx 1750 jnz .L8x_reduce 1751 1752 leaq 64(%rbp),%rbp 1753 xorq %rax,%rax 1754 movq 8+8(%rsp),%rdx 1755 cmpq 0+8(%rsp),%rbp 1756 jae .L8x_no_tail 1757 1758.byte 0x66 1759 addq 0(%rdi),%r8 1760 adcq 8(%rdi),%r9 1761 adcq 16(%rdi),%r10 1762 adcq 24(%rdi),%r11 1763 adcq 32(%rdi),%r12 1764 adcq 40(%rdi),%r13 1765 adcq 48(%rdi),%r14 1766 adcq 56(%rdi),%r15 1767 sbbq %rsi,%rsi 1768 1769 movq 48+56+8(%rsp),%rbx 1770 movl $8,%ecx 1771 movq 0(%rbp),%rax 1772 jmp .L8x_tail 1773 1774.align 32 1775.L8x_tail: 1776 mulq %rbx 1777 addq %rax,%r8 1778 movq 8(%rbp),%rax 1779 movq %r8,(%rdi) 1780 movq %rdx,%r8 1781 adcq $0,%r8 1782 1783 mulq %rbx 1784 addq %rax,%r9 1785 movq 16(%rbp),%rax 1786 adcq $0,%rdx 1787 addq %r9,%r8 1788 leaq 8(%rdi),%rdi 1789 movq %rdx,%r9 1790 adcq $0,%r9 1791 1792 mulq %rbx 1793 addq %rax,%r10 1794 movq 24(%rbp),%rax 1795 adcq $0,%rdx 1796 addq %r10,%r9 1797 movq %rdx,%r10 1798 adcq $0,%r10 1799 1800 mulq %rbx 1801 addq %rax,%r11 1802 movq 32(%rbp),%rax 1803 adcq $0,%rdx 1804 addq %r11,%r10 1805 movq %rdx,%r11 1806 adcq $0,%r11 1807 1808 mulq %rbx 1809 addq %rax,%r12 1810 movq 40(%rbp),%rax 1811 adcq $0,%rdx 1812 addq %r12,%r11 1813 movq %rdx,%r12 1814 adcq $0,%r12 1815 1816 mulq %rbx 1817 addq %rax,%r13 1818 movq 48(%rbp),%rax 1819 adcq $0,%rdx 1820 addq %r13,%r12 1821 movq %rdx,%r13 1822 adcq $0,%r13 1823 1824 mulq %rbx 
1825 addq %rax,%r14 1826 movq 56(%rbp),%rax 1827 adcq $0,%rdx 1828 addq %r14,%r13 1829 movq %rdx,%r14 1830 adcq $0,%r14 1831 1832 mulq %rbx 1833 movq 48-16+8(%rsp,%rcx,8),%rbx 1834 addq %rax,%r15 1835 adcq $0,%rdx 1836 addq %r15,%r14 1837 movq 0(%rbp),%rax 1838 movq %rdx,%r15 1839 adcq $0,%r15 1840 1841 decl %ecx 1842 jnz .L8x_tail 1843 1844 leaq 64(%rbp),%rbp 1845 movq 8+8(%rsp),%rdx 1846 cmpq 0+8(%rsp),%rbp 1847 jae .L8x_tail_done 1848 1849 movq 48+56+8(%rsp),%rbx 1850 negq %rsi 1851 movq 0(%rbp),%rax 1852 adcq 0(%rdi),%r8 1853 adcq 8(%rdi),%r9 1854 adcq 16(%rdi),%r10 1855 adcq 24(%rdi),%r11 1856 adcq 32(%rdi),%r12 1857 adcq 40(%rdi),%r13 1858 adcq 48(%rdi),%r14 1859 adcq 56(%rdi),%r15 1860 sbbq %rsi,%rsi 1861 1862 movl $8,%ecx 1863 jmp .L8x_tail 1864 1865.align 32 1866.L8x_tail_done: 1867 addq (%rdx),%r8 1868 adcq $0,%r9 1869 adcq $0,%r10 1870 adcq $0,%r11 1871 adcq $0,%r12 1872 adcq $0,%r13 1873 adcq $0,%r14 1874 adcq $0,%r15 1875 1876 1877 xorq %rax,%rax 1878 1879 negq %rsi 1880.L8x_no_tail: 1881 adcq 0(%rdi),%r8 1882 adcq 8(%rdi),%r9 1883 adcq 16(%rdi),%r10 1884 adcq 24(%rdi),%r11 1885 adcq 32(%rdi),%r12 1886 adcq 40(%rdi),%r13 1887 adcq 48(%rdi),%r14 1888 adcq 56(%rdi),%r15 1889 adcq $0,%rax 1890 movq -8(%rbp),%rcx 1891 xorq %rsi,%rsi 1892 1893.byte 102,72,15,126,213 1894 1895 movq %r8,0(%rdi) 1896 movq %r9,8(%rdi) 1897.byte 102,73,15,126,217 1898 movq %r10,16(%rdi) 1899 movq %r11,24(%rdi) 1900 movq %r12,32(%rdi) 1901 movq %r13,40(%rdi) 1902 movq %r14,48(%rdi) 1903 movq %r15,56(%rdi) 1904 leaq 64(%rdi),%rdi 1905 1906 cmpq %rdx,%rdi 1907 jb .L8x_reduction_loop 1908 .byte 0xf3,0xc3 1909.size bn_sqr8x_internal,.-bn_sqr8x_internal 1910.type __bn_post4x_internal,@function 1911.align 32 1912__bn_post4x_internal: 1913 movq 0(%rbp),%r12 1914 leaq (%rdi,%r9,1),%rbx 1915 movq %r9,%rcx 1916.byte 102,72,15,126,207 1917 negq %rax 1918.byte 102,72,15,126,206 1919 sarq $3+2,%rcx 1920 decq %r12 1921 xorq %r10,%r10 1922 movq 8(%rbp),%r13 1923 movq 16(%rbp),%r14 1924 movq 
24(%rbp),%r15 1925 jmp .Lsqr4x_sub_entry 1926 1927.align 16 1928.Lsqr4x_sub: 1929 movq 0(%rbp),%r12 1930 movq 8(%rbp),%r13 1931 movq 16(%rbp),%r14 1932 movq 24(%rbp),%r15 1933.Lsqr4x_sub_entry: 1934 leaq 32(%rbp),%rbp 1935 notq %r12 1936 notq %r13 1937 notq %r14 1938 notq %r15 1939 andq %rax,%r12 1940 andq %rax,%r13 1941 andq %rax,%r14 1942 andq %rax,%r15 1943 1944 negq %r10 1945 adcq 0(%rbx),%r12 1946 adcq 8(%rbx),%r13 1947 adcq 16(%rbx),%r14 1948 adcq 24(%rbx),%r15 1949 movq %r12,0(%rdi) 1950 leaq 32(%rbx),%rbx 1951 movq %r13,8(%rdi) 1952 sbbq %r10,%r10 1953 movq %r14,16(%rdi) 1954 movq %r15,24(%rdi) 1955 leaq 32(%rdi),%rdi 1956 1957 incq %rcx 1958 jnz .Lsqr4x_sub 1959 1960 movq %r9,%r10 1961 negq %r9 1962 .byte 0xf3,0xc3 1963.size __bn_post4x_internal,.-__bn_post4x_internal 1964.globl bn_from_montgomery 1965.type bn_from_montgomery,@function 1966.align 32 1967bn_from_montgomery: 1968 testl $7,%r9d 1969 jz bn_from_mont8x 1970 xorl %eax,%eax 1971 .byte 0xf3,0xc3 1972.size bn_from_montgomery,.-bn_from_montgomery 1973 1974.type bn_from_mont8x,@function 1975.align 32 1976bn_from_mont8x: 1977.byte 0x67 1978 movq %rsp,%rax 1979 pushq %rbx 1980 pushq %rbp 1981 pushq %r12 1982 pushq %r13 1983 pushq %r14 1984 pushq %r15 1985 1986 shll $3,%r9d 1987 leaq (%r9,%r9,2),%r10 1988 negq %r9 1989 movq (%r8),%r8 1990 1991 1992 1993 1994 1995 1996 1997 1998 leaq -320(%rsp,%r9,2),%r11 1999 subq %rdi,%r11 2000 andq $4095,%r11 2001 cmpq %r11,%r10 2002 jb .Lfrom_sp_alt 2003 subq %r11,%rsp 2004 leaq -320(%rsp,%r9,2),%rsp 2005 jmp .Lfrom_sp_done 2006 2007.align 32 2008.Lfrom_sp_alt: 2009 leaq 4096-320(,%r9,2),%r10 2010 leaq -320(%rsp,%r9,2),%rsp 2011 subq %r10,%r11 2012 movq $0,%r10 2013 cmovcq %r10,%r11 2014 subq %r11,%rsp 2015.Lfrom_sp_done: 2016 andq $-64,%rsp 2017 movq %rax,%r11 2018 subq %rsp,%r11 2019 andq $-4096,%r11 2020.Lfrom_page_walk: 2021 movq (%rsp,%r11,1),%r10 2022 subq $4096,%r11 2023.byte 0x2e 2024 jnc .Lfrom_page_walk 2025 2026 movq %r9,%r10 2027 negq %r9 2028 2029 2030 
2031 2032 2033 2034 2035 2036 2037 2038 movq %r8,32(%rsp) 2039 movq %rax,40(%rsp) 2040.Lfrom_body: 2041 movq %r9,%r11 2042 leaq 48(%rsp),%rax 2043 pxor %xmm0,%xmm0 2044 jmp .Lmul_by_1 2045 2046.align 32 2047.Lmul_by_1: 2048 movdqu (%rsi),%xmm1 2049 movdqu 16(%rsi),%xmm2 2050 movdqu 32(%rsi),%xmm3 2051 movdqa %xmm0,(%rax,%r9,1) 2052 movdqu 48(%rsi),%xmm4 2053 movdqa %xmm0,16(%rax,%r9,1) 2054.byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00 2055 movdqa %xmm1,(%rax) 2056 movdqa %xmm0,32(%rax,%r9,1) 2057 movdqa %xmm2,16(%rax) 2058 movdqa %xmm0,48(%rax,%r9,1) 2059 movdqa %xmm3,32(%rax) 2060 movdqa %xmm4,48(%rax) 2061 leaq 64(%rax),%rax 2062 subq $64,%r11 2063 jnz .Lmul_by_1 2064 2065.byte 102,72,15,110,207 2066.byte 102,72,15,110,209 2067.byte 0x67 2068 movq %rcx,%rbp 2069.byte 102,73,15,110,218 2070 movl OPENSSL_ia32cap_P+8(%rip),%r11d 2071 andl $0x80108,%r11d 2072 cmpl $0x80108,%r11d 2073 jne .Lfrom_mont_nox 2074 2075 leaq (%rax,%r9,1),%rdi 2076 call __bn_sqrx8x_reduction 2077 call __bn_postx4x_internal 2078 2079 pxor %xmm0,%xmm0 2080 leaq 48(%rsp),%rax 2081 movq 40(%rsp),%rsi 2082 jmp .Lfrom_mont_zero 2083 2084.align 32 2085.Lfrom_mont_nox: 2086 call __bn_sqr8x_reduction 2087 call __bn_post4x_internal 2088 2089 pxor %xmm0,%xmm0 2090 leaq 48(%rsp),%rax 2091 movq 40(%rsp),%rsi 2092 jmp .Lfrom_mont_zero 2093 2094.align 32 2095.Lfrom_mont_zero: 2096 movdqa %xmm0,0(%rax) 2097 movdqa %xmm0,16(%rax) 2098 movdqa %xmm0,32(%rax) 2099 movdqa %xmm0,48(%rax) 2100 leaq 64(%rax),%rax 2101 subq $32,%r9 2102 jnz .Lfrom_mont_zero 2103 2104 movq $1,%rax 2105 movq -48(%rsi),%r15 2106 movq -40(%rsi),%r14 2107 movq -32(%rsi),%r13 2108 movq -24(%rsi),%r12 2109 movq -16(%rsi),%rbp 2110 movq -8(%rsi),%rbx 2111 leaq (%rsi),%rsp 2112.Lfrom_epilogue: 2113 .byte 0xf3,0xc3 2114.size bn_from_mont8x,.-bn_from_mont8x 2115.type bn_mulx4x_mont_gather5,@function 2116.align 32 2117bn_mulx4x_mont_gather5: 2118.Lmulx4x_enter: 2119 movq %rsp,%rax 2120 pushq %rbx 2121 pushq %rbp 2122 pushq %r12 2123 pushq %r13 2124 
pushq %r14 2125 pushq %r15 2126 2127 shll $3,%r9d 2128 leaq (%r9,%r9,2),%r10 2129 negq %r9 2130 movq (%r8),%r8 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 leaq -320(%rsp,%r9,2),%r11 2142 subq %rdi,%r11 2143 andq $4095,%r11 2144 cmpq %r11,%r10 2145 jb .Lmulx4xsp_alt 2146 subq %r11,%rsp 2147 leaq -320(%rsp,%r9,2),%rsp 2148 jmp .Lmulx4xsp_done 2149 2150.Lmulx4xsp_alt: 2151 leaq 4096-320(,%r9,2),%r10 2152 leaq -320(%rsp,%r9,2),%rsp 2153 subq %r10,%r11 2154 movq $0,%r10 2155 cmovcq %r10,%r11 2156 subq %r11,%rsp 2157.Lmulx4xsp_done: 2158 andq $-64,%rsp 2159 movq %rax,%r11 2160 subq %rsp,%r11 2161 andq $-4096,%r11 2162.Lmulx4x_page_walk: 2163 movq (%rsp,%r11,1),%r10 2164 subq $4096,%r11 2165.byte 0x2e 2166 jnc .Lmulx4x_page_walk 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 movq %r8,32(%rsp) 2181 movq %rax,40(%rsp) 2182.Lmulx4x_body: 2183 call mulx4x_internal 2184 2185 movq 40(%rsp),%rsi 2186 movq $1,%rax 2187 2188 movq -48(%rsi),%r15 2189 movq -40(%rsi),%r14 2190 movq -32(%rsi),%r13 2191 movq -24(%rsi),%r12 2192 movq -16(%rsi),%rbp 2193 movq -8(%rsi),%rbx 2194 leaq (%rsi),%rsp 2195.Lmulx4x_epilogue: 2196 .byte 0xf3,0xc3 2197.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5 2198 2199.type mulx4x_internal,@function 2200.align 32 2201mulx4x_internal: 2202 movq %r9,8(%rsp) 2203 movq %r9,%r10 2204 negq %r9 2205 shlq $5,%r9 2206 negq %r10 2207 leaq 128(%rdx,%r9,1),%r13 2208 shrq $5+5,%r9 2209 movd 8(%rax),%xmm5 2210 subq $1,%r9 2211 leaq .Linc(%rip),%rax 2212 movq %r13,16+8(%rsp) 2213 movq %r9,24+8(%rsp) 2214 movq %rdi,56+8(%rsp) 2215 movdqa 0(%rax),%xmm0 2216 movdqa 16(%rax),%xmm1 2217 leaq 88-112(%rsp,%r10,1),%r10 2218 leaq 128(%rdx),%rdi 2219 2220 pshufd $0,%xmm5,%xmm5 2221 movdqa %xmm1,%xmm4 2222.byte 0x67 2223 movdqa %xmm1,%xmm2 2224.byte 0x67 2225 paddd %xmm0,%xmm1 2226 pcmpeqd %xmm5,%xmm0 2227 movdqa %xmm4,%xmm3 2228 paddd %xmm1,%xmm2 2229 pcmpeqd %xmm5,%xmm1 2230 movdqa %xmm0,112(%r10) 2231 movdqa %xmm4,%xmm0 2232 2233 paddd 
%xmm2,%xmm3 2234 pcmpeqd %xmm5,%xmm2 2235 movdqa %xmm1,128(%r10) 2236 movdqa %xmm4,%xmm1 2237 2238 paddd %xmm3,%xmm0 2239 pcmpeqd %xmm5,%xmm3 2240 movdqa %xmm2,144(%r10) 2241 movdqa %xmm4,%xmm2 2242 2243 paddd %xmm0,%xmm1 2244 pcmpeqd %xmm5,%xmm0 2245 movdqa %xmm3,160(%r10) 2246 movdqa %xmm4,%xmm3 2247 paddd %xmm1,%xmm2 2248 pcmpeqd %xmm5,%xmm1 2249 movdqa %xmm0,176(%r10) 2250 movdqa %xmm4,%xmm0 2251 2252 paddd %xmm2,%xmm3 2253 pcmpeqd %xmm5,%xmm2 2254 movdqa %xmm1,192(%r10) 2255 movdqa %xmm4,%xmm1 2256 2257 paddd %xmm3,%xmm0 2258 pcmpeqd %xmm5,%xmm3 2259 movdqa %xmm2,208(%r10) 2260 movdqa %xmm4,%xmm2 2261 2262 paddd %xmm0,%xmm1 2263 pcmpeqd %xmm5,%xmm0 2264 movdqa %xmm3,224(%r10) 2265 movdqa %xmm4,%xmm3 2266 paddd %xmm1,%xmm2 2267 pcmpeqd %xmm5,%xmm1 2268 movdqa %xmm0,240(%r10) 2269 movdqa %xmm4,%xmm0 2270 2271 paddd %xmm2,%xmm3 2272 pcmpeqd %xmm5,%xmm2 2273 movdqa %xmm1,256(%r10) 2274 movdqa %xmm4,%xmm1 2275 2276 paddd %xmm3,%xmm0 2277 pcmpeqd %xmm5,%xmm3 2278 movdqa %xmm2,272(%r10) 2279 movdqa %xmm4,%xmm2 2280 2281 paddd %xmm0,%xmm1 2282 pcmpeqd %xmm5,%xmm0 2283 movdqa %xmm3,288(%r10) 2284 movdqa %xmm4,%xmm3 2285.byte 0x67 2286 paddd %xmm1,%xmm2 2287 pcmpeqd %xmm5,%xmm1 2288 movdqa %xmm0,304(%r10) 2289 2290 paddd %xmm2,%xmm3 2291 pcmpeqd %xmm5,%xmm2 2292 movdqa %xmm1,320(%r10) 2293 2294 pcmpeqd %xmm5,%xmm3 2295 movdqa %xmm2,336(%r10) 2296 2297 pand 64(%rdi),%xmm0 2298 pand 80(%rdi),%xmm1 2299 pand 96(%rdi),%xmm2 2300 movdqa %xmm3,352(%r10) 2301 pand 112(%rdi),%xmm3 2302 por %xmm2,%xmm0 2303 por %xmm3,%xmm1 2304 movdqa -128(%rdi),%xmm4 2305 movdqa -112(%rdi),%xmm5 2306 movdqa -96(%rdi),%xmm2 2307 pand 112(%r10),%xmm4 2308 movdqa -80(%rdi),%xmm3 2309 pand 128(%r10),%xmm5 2310 por %xmm4,%xmm0 2311 pand 144(%r10),%xmm2 2312 por %xmm5,%xmm1 2313 pand 160(%r10),%xmm3 2314 por %xmm2,%xmm0 2315 por %xmm3,%xmm1 2316 movdqa -64(%rdi),%xmm4 2317 movdqa -48(%rdi),%xmm5 2318 movdqa -32(%rdi),%xmm2 2319 pand 176(%r10),%xmm4 2320 movdqa -16(%rdi),%xmm3 2321 pand 
192(%r10),%xmm5 2322 por %xmm4,%xmm0 2323 pand 208(%r10),%xmm2 2324 por %xmm5,%xmm1 2325 pand 224(%r10),%xmm3 2326 por %xmm2,%xmm0 2327 por %xmm3,%xmm1 2328 movdqa 0(%rdi),%xmm4 2329 movdqa 16(%rdi),%xmm5 2330 movdqa 32(%rdi),%xmm2 2331 pand 240(%r10),%xmm4 2332 movdqa 48(%rdi),%xmm3 2333 pand 256(%r10),%xmm5 2334 por %xmm4,%xmm0 2335 pand 272(%r10),%xmm2 2336 por %xmm5,%xmm1 2337 pand 288(%r10),%xmm3 2338 por %xmm2,%xmm0 2339 por %xmm3,%xmm1 2340 pxor %xmm1,%xmm0 2341 pshufd $0x4e,%xmm0,%xmm1 2342 por %xmm1,%xmm0 2343 leaq 256(%rdi),%rdi 2344.byte 102,72,15,126,194 2345 leaq 64+32+8(%rsp),%rbx 2346 2347 movq %rdx,%r9 2348 mulxq 0(%rsi),%r8,%rax 2349 mulxq 8(%rsi),%r11,%r12 2350 addq %rax,%r11 2351 mulxq 16(%rsi),%rax,%r13 2352 adcq %rax,%r12 2353 adcq $0,%r13 2354 mulxq 24(%rsi),%rax,%r14 2355 2356 movq %r8,%r15 2357 imulq 32+8(%rsp),%r8 2358 xorq %rbp,%rbp 2359 movq %r8,%rdx 2360 2361 movq %rdi,8+8(%rsp) 2362 2363 leaq 32(%rsi),%rsi 2364 adcxq %rax,%r13 2365 adcxq %rbp,%r14 2366 2367 mulxq 0(%rcx),%rax,%r10 2368 adcxq %rax,%r15 2369 adoxq %r11,%r10 2370 mulxq 8(%rcx),%rax,%r11 2371 adcxq %rax,%r10 2372 adoxq %r12,%r11 2373 mulxq 16(%rcx),%rax,%r12 2374 movq 24+8(%rsp),%rdi 2375 movq %r10,-32(%rbx) 2376 adcxq %rax,%r11 2377 adoxq %r13,%r12 2378 mulxq 24(%rcx),%rax,%r15 2379 movq %r9,%rdx 2380 movq %r11,-24(%rbx) 2381 adcxq %rax,%r12 2382 adoxq %rbp,%r15 2383 leaq 32(%rcx),%rcx 2384 movq %r12,-16(%rbx) 2385 jmp .Lmulx4x_1st 2386 2387.align 32 2388.Lmulx4x_1st: 2389 adcxq %rbp,%r15 2390 mulxq 0(%rsi),%r10,%rax 2391 adcxq %r14,%r10 2392 mulxq 8(%rsi),%r11,%r14 2393 adcxq %rax,%r11 2394 mulxq 16(%rsi),%r12,%rax 2395 adcxq %r14,%r12 2396 mulxq 24(%rsi),%r13,%r14 2397.byte 0x67,0x67 2398 movq %r8,%rdx 2399 adcxq %rax,%r13 2400 adcxq %rbp,%r14 2401 leaq 32(%rsi),%rsi 2402 leaq 32(%rbx),%rbx 2403 2404 adoxq %r15,%r10 2405 mulxq 0(%rcx),%rax,%r15 2406 adcxq %rax,%r10 2407 adoxq %r15,%r11 2408 mulxq 8(%rcx),%rax,%r15 2409 adcxq %rax,%r11 2410 adoxq %r15,%r12 2411 mulxq 
16(%rcx),%rax,%r15 2412 movq %r10,-40(%rbx) 2413 adcxq %rax,%r12 2414 movq %r11,-32(%rbx) 2415 adoxq %r15,%r13 2416 mulxq 24(%rcx),%rax,%r15 2417 movq %r9,%rdx 2418 movq %r12,-24(%rbx) 2419 adcxq %rax,%r13 2420 adoxq %rbp,%r15 2421 leaq 32(%rcx),%rcx 2422 movq %r13,-16(%rbx) 2423 2424 decq %rdi 2425 jnz .Lmulx4x_1st 2426 2427 movq 8(%rsp),%rax 2428 adcq %rbp,%r15 2429 leaq (%rsi,%rax,1),%rsi 2430 addq %r15,%r14 2431 movq 8+8(%rsp),%rdi 2432 adcq %rbp,%rbp 2433 movq %r14,-8(%rbx) 2434 jmp .Lmulx4x_outer 2435 2436.align 32 2437.Lmulx4x_outer: 2438 leaq 16-256(%rbx),%r10 2439 pxor %xmm4,%xmm4 2440.byte 0x67,0x67 2441 pxor %xmm5,%xmm5 2442 movdqa -128(%rdi),%xmm0 2443 movdqa -112(%rdi),%xmm1 2444 movdqa -96(%rdi),%xmm2 2445 pand 256(%r10),%xmm0 2446 movdqa -80(%rdi),%xmm3 2447 pand 272(%r10),%xmm1 2448 por %xmm0,%xmm4 2449 pand 288(%r10),%xmm2 2450 por %xmm1,%xmm5 2451 pand 304(%r10),%xmm3 2452 por %xmm2,%xmm4 2453 por %xmm3,%xmm5 2454 movdqa -64(%rdi),%xmm0 2455 movdqa -48(%rdi),%xmm1 2456 movdqa -32(%rdi),%xmm2 2457 pand 320(%r10),%xmm0 2458 movdqa -16(%rdi),%xmm3 2459 pand 336(%r10),%xmm1 2460 por %xmm0,%xmm4 2461 pand 352(%r10),%xmm2 2462 por %xmm1,%xmm5 2463 pand 368(%r10),%xmm3 2464 por %xmm2,%xmm4 2465 por %xmm3,%xmm5 2466 movdqa 0(%rdi),%xmm0 2467 movdqa 16(%rdi),%xmm1 2468 movdqa 32(%rdi),%xmm2 2469 pand 384(%r10),%xmm0 2470 movdqa 48(%rdi),%xmm3 2471 pand 400(%r10),%xmm1 2472 por %xmm0,%xmm4 2473 pand 416(%r10),%xmm2 2474 por %xmm1,%xmm5 2475 pand 432(%r10),%xmm3 2476 por %xmm2,%xmm4 2477 por %xmm3,%xmm5 2478 movdqa 64(%rdi),%xmm0 2479 movdqa 80(%rdi),%xmm1 2480 movdqa 96(%rdi),%xmm2 2481 pand 448(%r10),%xmm0 2482 movdqa 112(%rdi),%xmm3 2483 pand 464(%r10),%xmm1 2484 por %xmm0,%xmm4 2485 pand 480(%r10),%xmm2 2486 por %xmm1,%xmm5 2487 pand 496(%r10),%xmm3 2488 por %xmm2,%xmm4 2489 por %xmm3,%xmm5 2490 por %xmm5,%xmm4 2491 pshufd $0x4e,%xmm4,%xmm0 2492 por %xmm4,%xmm0 2493 leaq 256(%rdi),%rdi 2494.byte 102,72,15,126,194 2495 2496 movq %rbp,(%rbx) 2497 leaq 
32(%rbx,%rax,1),%rbx 2498 mulxq 0(%rsi),%r8,%r11 2499 xorq %rbp,%rbp 2500 movq %rdx,%r9 2501 mulxq 8(%rsi),%r14,%r12 2502 adoxq -32(%rbx),%r8 2503 adcxq %r14,%r11 2504 mulxq 16(%rsi),%r15,%r13 2505 adoxq -24(%rbx),%r11 2506 adcxq %r15,%r12 2507 mulxq 24(%rsi),%rdx,%r14 2508 adoxq -16(%rbx),%r12 2509 adcxq %rdx,%r13 2510 leaq (%rcx,%rax,1),%rcx 2511 leaq 32(%rsi),%rsi 2512 adoxq -8(%rbx),%r13 2513 adcxq %rbp,%r14 2514 adoxq %rbp,%r14 2515 2516 movq %r8,%r15 2517 imulq 32+8(%rsp),%r8 2518 2519 movq %r8,%rdx 2520 xorq %rbp,%rbp 2521 movq %rdi,8+8(%rsp) 2522 2523 mulxq 0(%rcx),%rax,%r10 2524 adcxq %rax,%r15 2525 adoxq %r11,%r10 2526 mulxq 8(%rcx),%rax,%r11 2527 adcxq %rax,%r10 2528 adoxq %r12,%r11 2529 mulxq 16(%rcx),%rax,%r12 2530 adcxq %rax,%r11 2531 adoxq %r13,%r12 2532 mulxq 24(%rcx),%rax,%r15 2533 movq %r9,%rdx 2534 movq 24+8(%rsp),%rdi 2535 movq %r10,-32(%rbx) 2536 adcxq %rax,%r12 2537 movq %r11,-24(%rbx) 2538 adoxq %rbp,%r15 2539 movq %r12,-16(%rbx) 2540 leaq 32(%rcx),%rcx 2541 jmp .Lmulx4x_inner 2542 2543.align 32 2544.Lmulx4x_inner: 2545 mulxq 0(%rsi),%r10,%rax 2546 adcxq %rbp,%r15 2547 adoxq %r14,%r10 2548 mulxq 8(%rsi),%r11,%r14 2549 adcxq 0(%rbx),%r10 2550 adoxq %rax,%r11 2551 mulxq 16(%rsi),%r12,%rax 2552 adcxq 8(%rbx),%r11 2553 adoxq %r14,%r12 2554 mulxq 24(%rsi),%r13,%r14 2555 movq %r8,%rdx 2556 adcxq 16(%rbx),%r12 2557 adoxq %rax,%r13 2558 adcxq 24(%rbx),%r13 2559 adoxq %rbp,%r14 2560 leaq 32(%rsi),%rsi 2561 leaq 32(%rbx),%rbx 2562 adcxq %rbp,%r14 2563 2564 adoxq %r15,%r10 2565 mulxq 0(%rcx),%rax,%r15 2566 adcxq %rax,%r10 2567 adoxq %r15,%r11 2568 mulxq 8(%rcx),%rax,%r15 2569 adcxq %rax,%r11 2570 adoxq %r15,%r12 2571 mulxq 16(%rcx),%rax,%r15 2572 movq %r10,-40(%rbx) 2573 adcxq %rax,%r12 2574 adoxq %r15,%r13 2575 movq %r11,-32(%rbx) 2576 mulxq 24(%rcx),%rax,%r15 2577 movq %r9,%rdx 2578 leaq 32(%rcx),%rcx 2579 movq %r12,-24(%rbx) 2580 adcxq %rax,%r13 2581 adoxq %rbp,%r15 2582 movq %r13,-16(%rbx) 2583 2584 decq %rdi 2585 jnz .Lmulx4x_inner 2586 2587 movq 
0+8(%rsp),%rax 2588 adcq %rbp,%r15 2589 subq 0(%rbx),%rdi 2590 movq 8+8(%rsp),%rdi 2591 movq 16+8(%rsp),%r10 2592 adcq %r15,%r14 2593 leaq (%rsi,%rax,1),%rsi 2594 adcq %rbp,%rbp 2595 movq %r14,-8(%rbx) 2596 2597 cmpq %r10,%rdi 2598 jb .Lmulx4x_outer 2599 2600 movq -8(%rcx),%r10 2601 movq %rbp,%r8 2602 movq (%rcx,%rax,1),%r12 2603 leaq (%rcx,%rax,1),%rbp 2604 movq %rax,%rcx 2605 leaq (%rbx,%rax,1),%rdi 2606 xorl %eax,%eax 2607 xorq %r15,%r15 2608 subq %r14,%r10 2609 adcq %r15,%r15 2610 orq %r15,%r8 2611 sarq $3+2,%rcx 2612 subq %r8,%rax 2613 movq 56+8(%rsp),%rdx 2614 decq %r12 2615 movq 8(%rbp),%r13 2616 xorq %r8,%r8 2617 movq 16(%rbp),%r14 2618 movq 24(%rbp),%r15 2619 jmp .Lsqrx4x_sub_entry 2620.size mulx4x_internal,.-mulx4x_internal 2621.type bn_powerx5,@function 2622.align 32 2623bn_powerx5: 2624.Lpowerx5_enter: 2625 movq %rsp,%rax 2626 pushq %rbx 2627 pushq %rbp 2628 pushq %r12 2629 pushq %r13 2630 pushq %r14 2631 pushq %r15 2632 2633 shll $3,%r9d 2634 leaq (%r9,%r9,2),%r10 2635 negq %r9 2636 movq (%r8),%r8 2637 2638 2639 2640 2641 2642 2643 2644 2645 leaq -320(%rsp,%r9,2),%r11 2646 subq %rdi,%r11 2647 andq $4095,%r11 2648 cmpq %r11,%r10 2649 jb .Lpwrx_sp_alt 2650 subq %r11,%rsp 2651 leaq -320(%rsp,%r9,2),%rsp 2652 jmp .Lpwrx_sp_done 2653 2654.align 32 2655.Lpwrx_sp_alt: 2656 leaq 4096-320(,%r9,2),%r10 2657 leaq -320(%rsp,%r9,2),%rsp 2658 subq %r10,%r11 2659 movq $0,%r10 2660 cmovcq %r10,%r11 2661 subq %r11,%rsp 2662.Lpwrx_sp_done: 2663 andq $-64,%rsp 2664 movq %rax,%r11 2665 subq %rsp,%r11 2666 andq $-4096,%r11 2667.Lpwrx_page_walk: 2668 movq (%rsp,%r11,1),%r10 2669 subq $4096,%r11 2670.byte 0x2e 2671 jnc .Lpwrx_page_walk 2672 2673 movq %r9,%r10 2674 negq %r9 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 pxor %xmm0,%xmm0 2688.byte 102,72,15,110,207 2689.byte 102,72,15,110,209 2690.byte 102,73,15,110,218 2691.byte 102,72,15,110,226 2692 movq %r8,32(%rsp) 2693 movq %rax,40(%rsp) 2694.Lpowerx5_body: 2695 2696 call __bn_sqrx8x_internal 2697 call 
__bn_postx4x_internal 2698 call __bn_sqrx8x_internal 2699 call __bn_postx4x_internal 2700 call __bn_sqrx8x_internal 2701 call __bn_postx4x_internal 2702 call __bn_sqrx8x_internal 2703 call __bn_postx4x_internal 2704 call __bn_sqrx8x_internal 2705 call __bn_postx4x_internal 2706 2707 movq %r10,%r9 2708 movq %rsi,%rdi 2709.byte 102,72,15,126,209 2710.byte 102,72,15,126,226 2711 movq 40(%rsp),%rax 2712 2713 call mulx4x_internal 2714 2715 movq 40(%rsp),%rsi 2716 movq $1,%rax 2717 2718 movq -48(%rsi),%r15 2719 movq -40(%rsi),%r14 2720 movq -32(%rsi),%r13 2721 movq -24(%rsi),%r12 2722 movq -16(%rsi),%rbp 2723 movq -8(%rsi),%rbx 2724 leaq (%rsi),%rsp 2725.Lpowerx5_epilogue: 2726 .byte 0xf3,0xc3 2727.size bn_powerx5,.-bn_powerx5 2728 2729.globl bn_sqrx8x_internal 2730.hidden bn_sqrx8x_internal 2731.type bn_sqrx8x_internal,@function 2732.align 32 2733bn_sqrx8x_internal: 2734__bn_sqrx8x_internal: 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 leaq 48+8(%rsp),%rdi 2776 leaq (%rsi,%r9,1),%rbp 2777 movq %r9,0+8(%rsp) 2778 movq %rbp,8+8(%rsp) 2779 jmp .Lsqr8x_zero_start 2780 2781.align 32 2782.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 2783.Lsqrx8x_zero: 2784.byte 0x3e 2785 movdqa %xmm0,0(%rdi) 2786 movdqa %xmm0,16(%rdi) 2787 movdqa %xmm0,32(%rdi) 2788 movdqa %xmm0,48(%rdi) 2789.Lsqr8x_zero_start: 2790 movdqa %xmm0,64(%rdi) 2791 movdqa %xmm0,80(%rdi) 2792 movdqa %xmm0,96(%rdi) 2793 movdqa %xmm0,112(%rdi) 2794 leaq 128(%rdi),%rdi 2795 subq $64,%r9 2796 jnz .Lsqrx8x_zero 2797 2798 movq 0(%rsi),%rdx 2799 2800 xorq %r10,%r10 2801 xorq %r11,%r11 2802 xorq %r12,%r12 2803 xorq %r13,%r13 2804 xorq %r14,%r14 2805 xorq %r15,%r15 2806 leaq 48+8(%rsp),%rdi 2807 xorq %rbp,%rbp 2808 jmp .Lsqrx8x_outer_loop 2809 2810.align 32 2811.Lsqrx8x_outer_loop: 2812 mulxq 8(%rsi),%r8,%rax 2813 adcxq %r9,%r8 2814 adoxq %rax,%r10 
2815 mulxq 16(%rsi),%r9,%rax 2816 adcxq %r10,%r9 2817 adoxq %rax,%r11 2818.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 2819 adcxq %r11,%r10 2820 adoxq %rax,%r12 2821.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 2822 adcxq %r12,%r11 2823 adoxq %rax,%r13 2824 mulxq 40(%rsi),%r12,%rax 2825 adcxq %r13,%r12 2826 adoxq %rax,%r14 2827 mulxq 48(%rsi),%r13,%rax 2828 adcxq %r14,%r13 2829 adoxq %r15,%rax 2830 mulxq 56(%rsi),%r14,%r15 2831 movq 8(%rsi),%rdx 2832 adcxq %rax,%r14 2833 adoxq %rbp,%r15 2834 adcq 64(%rdi),%r15 2835 movq %r8,8(%rdi) 2836 movq %r9,16(%rdi) 2837 sbbq %rcx,%rcx 2838 xorq %rbp,%rbp 2839 2840 2841 mulxq 16(%rsi),%r8,%rbx 2842 mulxq 24(%rsi),%r9,%rax 2843 adcxq %r10,%r8 2844 adoxq %rbx,%r9 2845 mulxq 32(%rsi),%r10,%rbx 2846 adcxq %r11,%r9 2847 adoxq %rax,%r10 2848.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 2849 adcxq %r12,%r10 2850 adoxq %rbx,%r11 2851.byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 2852 adcxq %r13,%r11 2853 adoxq %r14,%r12 2854.byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 2855 movq 16(%rsi),%rdx 2856 adcxq %rax,%r12 2857 adoxq %rbx,%r13 2858 adcxq %r15,%r13 2859 adoxq %rbp,%r14 2860 adcxq %rbp,%r14 2861 2862 movq %r8,24(%rdi) 2863 movq %r9,32(%rdi) 2864 2865 mulxq 24(%rsi),%r8,%rbx 2866 mulxq 32(%rsi),%r9,%rax 2867 adcxq %r10,%r8 2868 adoxq %rbx,%r9 2869 mulxq 40(%rsi),%r10,%rbx 2870 adcxq %r11,%r9 2871 adoxq %rax,%r10 2872.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 2873 adcxq %r12,%r10 2874 adoxq %r13,%r11 2875.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 2876.byte 0x3e 2877 movq 24(%rsi),%rdx 2878 adcxq %rbx,%r11 2879 adoxq %rax,%r12 2880 adcxq %r14,%r12 2881 movq %r8,40(%rdi) 2882 movq %r9,48(%rdi) 2883 mulxq 32(%rsi),%r8,%rax 2884 adoxq %rbp,%r13 2885 adcxq %rbp,%r13 2886 2887 mulxq 40(%rsi),%r9,%rbx 2888 adcxq %r10,%r8 2889 adoxq %rax,%r9 2890 mulxq 48(%rsi),%r10,%rax 2891 adcxq %r11,%r9 2892 adoxq %r12,%r10 2893 mulxq 56(%rsi),%r11,%r12 2894 movq 32(%rsi),%rdx 2895 movq 40(%rsi),%r14 2896 adcxq 
%rbx,%r10 2897 adoxq %rax,%r11 2898 movq 48(%rsi),%r15 2899 adcxq %r13,%r11 2900 adoxq %rbp,%r12 2901 adcxq %rbp,%r12 2902 2903 movq %r8,56(%rdi) 2904 movq %r9,64(%rdi) 2905 2906 mulxq %r14,%r9,%rax 2907 movq 56(%rsi),%r8 2908 adcxq %r10,%r9 2909 mulxq %r15,%r10,%rbx 2910 adoxq %rax,%r10 2911 adcxq %r11,%r10 2912 mulxq %r8,%r11,%rax 2913 movq %r14,%rdx 2914 adoxq %rbx,%r11 2915 adcxq %r12,%r11 2916 2917 adcxq %rbp,%rax 2918 2919 mulxq %r15,%r14,%rbx 2920 mulxq %r8,%r12,%r13 2921 movq %r15,%rdx 2922 leaq 64(%rsi),%rsi 2923 adcxq %r14,%r11 2924 adoxq %rbx,%r12 2925 adcxq %rax,%r12 2926 adoxq %rbp,%r13 2927 2928.byte 0x67,0x67 2929 mulxq %r8,%r8,%r14 2930 adcxq %r8,%r13 2931 adcxq %rbp,%r14 2932 2933 cmpq 8+8(%rsp),%rsi 2934 je .Lsqrx8x_outer_break 2935 2936 negq %rcx 2937 movq $-8,%rcx 2938 movq %rbp,%r15 2939 movq 64(%rdi),%r8 2940 adcxq 72(%rdi),%r9 2941 adcxq 80(%rdi),%r10 2942 adcxq 88(%rdi),%r11 2943 adcq 96(%rdi),%r12 2944 adcq 104(%rdi),%r13 2945 adcq 112(%rdi),%r14 2946 adcq 120(%rdi),%r15 2947 leaq (%rsi),%rbp 2948 leaq 128(%rdi),%rdi 2949 sbbq %rax,%rax 2950 2951 movq -64(%rsi),%rdx 2952 movq %rax,16+8(%rsp) 2953 movq %rdi,24+8(%rsp) 2954 2955 2956 xorl %eax,%eax 2957 jmp .Lsqrx8x_loop 2958 2959.align 32 2960.Lsqrx8x_loop: 2961 movq %r8,%rbx 2962 mulxq 0(%rbp),%rax,%r8 2963 adcxq %rax,%rbx 2964 adoxq %r9,%r8 2965 2966 mulxq 8(%rbp),%rax,%r9 2967 adcxq %rax,%r8 2968 adoxq %r10,%r9 2969 2970 mulxq 16(%rbp),%rax,%r10 2971 adcxq %rax,%r9 2972 adoxq %r11,%r10 2973 2974 mulxq 24(%rbp),%rax,%r11 2975 adcxq %rax,%r10 2976 adoxq %r12,%r11 2977 2978.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 2979 adcxq %rax,%r11 2980 adoxq %r13,%r12 2981 2982 mulxq 40(%rbp),%rax,%r13 2983 adcxq %rax,%r12 2984 adoxq %r14,%r13 2985 2986 mulxq 48(%rbp),%rax,%r14 2987 movq %rbx,(%rdi,%rcx,8) 2988 movl $0,%ebx 2989 adcxq %rax,%r13 2990 adoxq %r15,%r14 2991 2992.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 2993 movq 8(%rsi,%rcx,8),%rdx 2994 adcxq %rax,%r14 2995 adoxq %rbx,%r15 
2996 adcxq %rbx,%r15 2997 2998.byte 0x67 2999 incq %rcx 3000 jnz .Lsqrx8x_loop 3001 3002 leaq 64(%rbp),%rbp 3003 movq $-8,%rcx 3004 cmpq 8+8(%rsp),%rbp 3005 je .Lsqrx8x_break 3006 3007 subq 16+8(%rsp),%rbx 3008.byte 0x66 3009 movq -64(%rsi),%rdx 3010 adcxq 0(%rdi),%r8 3011 adcxq 8(%rdi),%r9 3012 adcq 16(%rdi),%r10 3013 adcq 24(%rdi),%r11 3014 adcq 32(%rdi),%r12 3015 adcq 40(%rdi),%r13 3016 adcq 48(%rdi),%r14 3017 adcq 56(%rdi),%r15 3018 leaq 64(%rdi),%rdi 3019.byte 0x67 3020 sbbq %rax,%rax 3021 xorl %ebx,%ebx 3022 movq %rax,16+8(%rsp) 3023 jmp .Lsqrx8x_loop 3024 3025.align 32 3026.Lsqrx8x_break: 3027 subq 16+8(%rsp),%r8 3028 movq 24+8(%rsp),%rcx 3029 movq 0(%rsi),%rdx 3030 xorl %ebp,%ebp 3031 movq %r8,0(%rdi) 3032 cmpq %rcx,%rdi 3033 je .Lsqrx8x_outer_loop 3034 3035 movq %r9,8(%rdi) 3036 movq 8(%rcx),%r9 3037 movq %r10,16(%rdi) 3038 movq 16(%rcx),%r10 3039 movq %r11,24(%rdi) 3040 movq 24(%rcx),%r11 3041 movq %r12,32(%rdi) 3042 movq 32(%rcx),%r12 3043 movq %r13,40(%rdi) 3044 movq 40(%rcx),%r13 3045 movq %r14,48(%rdi) 3046 movq 48(%rcx),%r14 3047 movq %r15,56(%rdi) 3048 movq 56(%rcx),%r15 3049 movq %rcx,%rdi 3050 jmp .Lsqrx8x_outer_loop 3051 3052.align 32 3053.Lsqrx8x_outer_break: 3054 movq %r9,72(%rdi) 3055.byte 102,72,15,126,217 3056 movq %r10,80(%rdi) 3057 movq %r11,88(%rdi) 3058 movq %r12,96(%rdi) 3059 movq %r13,104(%rdi) 3060 movq %r14,112(%rdi) 3061 leaq 48+8(%rsp),%rdi 3062 movq (%rsi,%rcx,1),%rdx 3063 3064 movq 8(%rdi),%r11 3065 xorq %r10,%r10 3066 movq 0+8(%rsp),%r9 3067 adoxq %r11,%r11 3068 movq 16(%rdi),%r12 3069 movq 24(%rdi),%r13 3070 3071 3072.align 32 3073.Lsqrx4x_shift_n_add: 3074 mulxq %rdx,%rax,%rbx 3075 adoxq %r12,%r12 3076 adcxq %r10,%rax 3077.byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 3078.byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 3079 adoxq %r13,%r13 3080 adcxq %r11,%rbx 3081 movq 40(%rdi),%r11 3082 movq %rax,0(%rdi) 3083 movq %rbx,8(%rdi) 3084 3085 mulxq %rdx,%rax,%rbx 3086 adoxq %r10,%r10 3087 adcxq %r12,%rax 3088 movq 16(%rsi,%rcx,1),%rdx 
3089 movq 48(%rdi),%r12 3090 adoxq %r11,%r11 3091 adcxq %r13,%rbx 3092 movq 56(%rdi),%r13 3093 movq %rax,16(%rdi) 3094 movq %rbx,24(%rdi) 3095 3096 mulxq %rdx,%rax,%rbx 3097 adoxq %r12,%r12 3098 adcxq %r10,%rax 3099 movq 24(%rsi,%rcx,1),%rdx 3100 leaq 32(%rcx),%rcx 3101 movq 64(%rdi),%r10 3102 adoxq %r13,%r13 3103 adcxq %r11,%rbx 3104 movq 72(%rdi),%r11 3105 movq %rax,32(%rdi) 3106 movq %rbx,40(%rdi) 3107 3108 mulxq %rdx,%rax,%rbx 3109 adoxq %r10,%r10 3110 adcxq %r12,%rax 3111 jrcxz .Lsqrx4x_shift_n_add_break 3112.byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 3113 adoxq %r11,%r11 3114 adcxq %r13,%rbx 3115 movq 80(%rdi),%r12 3116 movq 88(%rdi),%r13 3117 movq %rax,48(%rdi) 3118 movq %rbx,56(%rdi) 3119 leaq 64(%rdi),%rdi 3120 nop 3121 jmp .Lsqrx4x_shift_n_add 3122 3123.align 32 3124.Lsqrx4x_shift_n_add_break: 3125 adcxq %r13,%rbx 3126 movq %rax,48(%rdi) 3127 movq %rbx,56(%rdi) 3128 leaq 64(%rdi),%rdi 3129.byte 102,72,15,126,213 3130__bn_sqrx8x_reduction: 3131 xorl %eax,%eax 3132 movq 32+8(%rsp),%rbx 3133 movq 48+8(%rsp),%rdx 3134 leaq -64(%rbp,%r9,1),%rcx 3135 3136 movq %rcx,0+8(%rsp) 3137 movq %rdi,8+8(%rsp) 3138 3139 leaq 48+8(%rsp),%rdi 3140 jmp .Lsqrx8x_reduction_loop 3141 3142.align 32 3143.Lsqrx8x_reduction_loop: 3144 movq 8(%rdi),%r9 3145 movq 16(%rdi),%r10 3146 movq 24(%rdi),%r11 3147 movq 32(%rdi),%r12 3148 movq %rdx,%r8 3149 imulq %rbx,%rdx 3150 movq 40(%rdi),%r13 3151 movq 48(%rdi),%r14 3152 movq 56(%rdi),%r15 3153 movq %rax,24+8(%rsp) 3154 3155 leaq 64(%rdi),%rdi 3156 xorq %rsi,%rsi 3157 movq $-8,%rcx 3158 jmp .Lsqrx8x_reduce 3159 3160.align 32 3161.Lsqrx8x_reduce: 3162 movq %r8,%rbx 3163 mulxq 0(%rbp),%rax,%r8 3164 adcxq %rbx,%rax 3165 adoxq %r9,%r8 3166 3167 mulxq 8(%rbp),%rbx,%r9 3168 adcxq %rbx,%r8 3169 adoxq %r10,%r9 3170 3171 mulxq 16(%rbp),%rbx,%r10 3172 adcxq %rbx,%r9 3173 adoxq %r11,%r10 3174 3175 mulxq 24(%rbp),%rbx,%r11 3176 adcxq %rbx,%r10 3177 adoxq %r12,%r11 3178 3179.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 3180 movq %rdx,%rax 3181 
movq %r8,%rdx 3182 adcxq %rbx,%r11 3183 adoxq %r13,%r12 3184 3185 mulxq 32+8(%rsp),%rbx,%rdx 3186 movq %rax,%rdx 3187 movq %rax,64+48+8(%rsp,%rcx,8) 3188 3189 mulxq 40(%rbp),%rax,%r13 3190 adcxq %rax,%r12 3191 adoxq %r14,%r13 3192 3193 mulxq 48(%rbp),%rax,%r14 3194 adcxq %rax,%r13 3195 adoxq %r15,%r14 3196 3197 mulxq 56(%rbp),%rax,%r15 3198 movq %rbx,%rdx 3199 adcxq %rax,%r14 3200 adoxq %rsi,%r15 3201 adcxq %rsi,%r15 3202 3203.byte 0x67,0x67,0x67 3204 incq %rcx 3205 jnz .Lsqrx8x_reduce 3206 3207 movq %rsi,%rax 3208 cmpq 0+8(%rsp),%rbp 3209 jae .Lsqrx8x_no_tail 3210 3211 movq 48+8(%rsp),%rdx 3212 addq 0(%rdi),%r8 3213 leaq 64(%rbp),%rbp 3214 movq $-8,%rcx 3215 adcxq 8(%rdi),%r9 3216 adcxq 16(%rdi),%r10 3217 adcq 24(%rdi),%r11 3218 adcq 32(%rdi),%r12 3219 adcq 40(%rdi),%r13 3220 adcq 48(%rdi),%r14 3221 adcq 56(%rdi),%r15 3222 leaq 64(%rdi),%rdi 3223 sbbq %rax,%rax 3224 3225 xorq %rsi,%rsi 3226 movq %rax,16+8(%rsp) 3227 jmp .Lsqrx8x_tail 3228 3229.align 32 3230.Lsqrx8x_tail: 3231 movq %r8,%rbx 3232 mulxq 0(%rbp),%rax,%r8 3233 adcxq %rax,%rbx 3234 adoxq %r9,%r8 3235 3236 mulxq 8(%rbp),%rax,%r9 3237 adcxq %rax,%r8 3238 adoxq %r10,%r9 3239 3240 mulxq 16(%rbp),%rax,%r10 3241 adcxq %rax,%r9 3242 adoxq %r11,%r10 3243 3244 mulxq 24(%rbp),%rax,%r11 3245 adcxq %rax,%r10 3246 adoxq %r12,%r11 3247 3248.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 3249 adcxq %rax,%r11 3250 adoxq %r13,%r12 3251 3252 mulxq 40(%rbp),%rax,%r13 3253 adcxq %rax,%r12 3254 adoxq %r14,%r13 3255 3256 mulxq 48(%rbp),%rax,%r14 3257 adcxq %rax,%r13 3258 adoxq %r15,%r14 3259 3260 mulxq 56(%rbp),%rax,%r15 3261 movq 72+48+8(%rsp,%rcx,8),%rdx 3262 adcxq %rax,%r14 3263 adoxq %rsi,%r15 3264 movq %rbx,(%rdi,%rcx,8) 3265 movq %r8,%rbx 3266 adcxq %rsi,%r15 3267 3268 incq %rcx 3269 jnz .Lsqrx8x_tail 3270 3271 cmpq 0+8(%rsp),%rbp 3272 jae .Lsqrx8x_tail_done 3273 3274 subq 16+8(%rsp),%rsi 3275 movq 48+8(%rsp),%rdx 3276 leaq 64(%rbp),%rbp 3277 adcq 0(%rdi),%r8 3278 adcq 8(%rdi),%r9 3279 adcq 16(%rdi),%r10 3280 adcq 
24(%rdi),%r11 3281 adcq 32(%rdi),%r12 3282 adcq 40(%rdi),%r13 3283 adcq 48(%rdi),%r14 3284 adcq 56(%rdi),%r15 3285 leaq 64(%rdi),%rdi 3286 sbbq %rax,%rax 3287 subq $8,%rcx 3288 3289 xorq %rsi,%rsi 3290 movq %rax,16+8(%rsp) 3291 jmp .Lsqrx8x_tail 3292 3293.align 32 3294.Lsqrx8x_tail_done: 3295 addq 24+8(%rsp),%r8 3296 adcq $0,%r9 3297 adcq $0,%r10 3298 adcq $0,%r11 3299 adcq $0,%r12 3300 adcq $0,%r13 3301 adcq $0,%r14 3302 adcq $0,%r15 3303 3304 3305 movq %rsi,%rax 3306 3307 subq 16+8(%rsp),%rsi 3308.Lsqrx8x_no_tail: 3309 adcq 0(%rdi),%r8 3310.byte 102,72,15,126,217 3311 adcq 8(%rdi),%r9 3312 movq 56(%rbp),%rsi 3313.byte 102,72,15,126,213 3314 adcq 16(%rdi),%r10 3315 adcq 24(%rdi),%r11 3316 adcq 32(%rdi),%r12 3317 adcq 40(%rdi),%r13 3318 adcq 48(%rdi),%r14 3319 adcq 56(%rdi),%r15 3320 adcq %rax,%rax 3321 3322 movq 32+8(%rsp),%rbx 3323 movq 64(%rdi,%rcx,1),%rdx 3324 3325 movq %r8,0(%rdi) 3326 leaq 64(%rdi),%r8 3327 movq %r9,8(%rdi) 3328 movq %r10,16(%rdi) 3329 movq %r11,24(%rdi) 3330 movq %r12,32(%rdi) 3331 movq %r13,40(%rdi) 3332 movq %r14,48(%rdi) 3333 movq %r15,56(%rdi) 3334 3335 leaq 64(%rdi,%rcx,1),%rdi 3336 cmpq 8+8(%rsp),%r8 3337 jb .Lsqrx8x_reduction_loop 3338 .byte 0xf3,0xc3 3339.size bn_sqrx8x_internal,.-bn_sqrx8x_internal 3340.align 32 3341__bn_postx4x_internal: 3342 movq 0(%rbp),%r12 3343 movq %rcx,%r10 3344 movq %rcx,%r9 3345 negq %rax 3346 sarq $3+2,%rcx 3347 3348.byte 102,72,15,126,202 3349.byte 102,72,15,126,206 3350 decq %r12 3351 movq 8(%rbp),%r13 3352 xorq %r8,%r8 3353 movq 16(%rbp),%r14 3354 movq 24(%rbp),%r15 3355 jmp .Lsqrx4x_sub_entry 3356 3357.align 16 3358.Lsqrx4x_sub: 3359 movq 0(%rbp),%r12 3360 movq 8(%rbp),%r13 3361 movq 16(%rbp),%r14 3362 movq 24(%rbp),%r15 3363.Lsqrx4x_sub_entry: 3364 andnq %rax,%r12,%r12 3365 leaq 32(%rbp),%rbp 3366 andnq %rax,%r13,%r13 3367 andnq %rax,%r14,%r14 3368 andnq %rax,%r15,%r15 3369 3370 negq %r8 3371 adcq 0(%rdi),%r12 3372 adcq 8(%rdi),%r13 3373 adcq 16(%rdi),%r14 3374 adcq 24(%rdi),%r15 3375 movq 
%r12,0(%rdx) 3376 leaq 32(%rdi),%rdi 3377 movq %r13,8(%rdx) 3378 sbbq %r8,%r8 3379 movq %r14,16(%rdx) 3380 movq %r15,24(%rdx) 3381 leaq 32(%rdx),%rdx 3382 3383 incq %rcx 3384 jnz .Lsqrx4x_sub 3385 3386 negq %r9 3387 3388 .byte 0xf3,0xc3 3389.size __bn_postx4x_internal,.-__bn_postx4x_internal 3390.globl bn_get_bits5 3391.type bn_get_bits5,@function 3392.align 16 3393bn_get_bits5: 3394 leaq 0(%rdi),%r10 3395 leaq 1(%rdi),%r11 3396 movl %esi,%ecx 3397 shrl $4,%esi 3398 andl $15,%ecx 3399 leal -8(%rcx),%eax 3400 cmpl $11,%ecx 3401 cmovaq %r11,%r10 3402 cmoval %eax,%ecx 3403 movzwl (%r10,%rsi,2),%eax 3404 shrl %cl,%eax 3405 andl $31,%eax 3406 .byte 0xf3,0xc3 3407.size bn_get_bits5,.-bn_get_bits5 3408 3409.globl bn_scatter5 3410.type bn_scatter5,@function 3411.align 16 3412bn_scatter5: 3413 cmpl $0,%esi 3414 jz .Lscatter_epilogue 3415 leaq (%rdx,%rcx,8),%rdx 3416.Lscatter: 3417 movq (%rdi),%rax 3418 leaq 8(%rdi),%rdi 3419 movq %rax,(%rdx) 3420 leaq 256(%rdx),%rdx 3421 subl $1,%esi 3422 jnz .Lscatter 3423.Lscatter_epilogue: 3424 .byte 0xf3,0xc3 3425.size bn_scatter5,.-bn_scatter5 3426 3427.globl bn_gather5 3428.type bn_gather5,@function 3429.align 32 3430bn_gather5: 3431.LSEH_begin_bn_gather5: 3432 3433.byte 0x4c,0x8d,0x14,0x24 3434.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 3435 leaq .Linc(%rip),%rax 3436 andq $-16,%rsp 3437 3438 movd %ecx,%xmm5 3439 movdqa 0(%rax),%xmm0 3440 movdqa 16(%rax),%xmm1 3441 leaq 128(%rdx),%r11 3442 leaq 128(%rsp),%rax 3443 3444 pshufd $0,%xmm5,%xmm5 3445 movdqa %xmm1,%xmm4 3446 movdqa %xmm1,%xmm2 3447 paddd %xmm0,%xmm1 3448 pcmpeqd %xmm5,%xmm0 3449 movdqa %xmm4,%xmm3 3450 3451 paddd %xmm1,%xmm2 3452 pcmpeqd %xmm5,%xmm1 3453 movdqa %xmm0,-128(%rax) 3454 movdqa %xmm4,%xmm0 3455 3456 paddd %xmm2,%xmm3 3457 pcmpeqd %xmm5,%xmm2 3458 movdqa %xmm1,-112(%rax) 3459 movdqa %xmm4,%xmm1 3460 3461 paddd %xmm3,%xmm0 3462 pcmpeqd %xmm5,%xmm3 3463 movdqa %xmm2,-96(%rax) 3464 movdqa %xmm4,%xmm2 3465 paddd %xmm0,%xmm1 3466 pcmpeqd %xmm5,%xmm0 3467 movdqa 
%xmm3,-80(%rax)/* completes the movdqa begun on the previous line: this part of bn_gather5 continues storing 16-byte compare masks at rax-128 .. rax+112 */ 3468 movdqa %xmm4,%xmm3/* xmm4 is the constant reloaded into each counter register before the next paddd */ 3469 3470 paddd %xmm1,%xmm2 3471 pcmpeqd %xmm5,%xmm1/* lanes equal to the broadcast value in xmm5 become all-ones */ 3472 movdqa %xmm0,-64(%rax) 3473 movdqa %xmm4,%xmm0 3474 3475 paddd %xmm2,%xmm3 3476 pcmpeqd %xmm5,%xmm2 3477 movdqa %xmm1,-48(%rax) 3478 movdqa %xmm4,%xmm1 3479 3480 paddd %xmm3,%xmm0 3481 pcmpeqd %xmm5,%xmm3 3482 movdqa %xmm2,-32(%rax) 3483 movdqa %xmm4,%xmm2 3484 paddd %xmm0,%xmm1 3485 pcmpeqd %xmm5,%xmm0 3486 movdqa %xmm3,-16(%rax) 3487 movdqa %xmm4,%xmm3 3488 3489 paddd %xmm1,%xmm2 3490 pcmpeqd %xmm5,%xmm1 3491 movdqa %xmm0,0(%rax) 3492 movdqa %xmm4,%xmm0 3493 3494 paddd %xmm2,%xmm3 3495 pcmpeqd %xmm5,%xmm2 3496 movdqa %xmm1,16(%rax) 3497 movdqa %xmm4,%xmm1 3498 3499 paddd %xmm3,%xmm0 3500 pcmpeqd %xmm5,%xmm3 3501 movdqa %xmm2,32(%rax) 3502 movdqa %xmm4,%xmm2 3503 paddd %xmm0,%xmm1 3504 pcmpeqd %xmm5,%xmm0 3505 movdqa %xmm3,48(%rax) 3506 movdqa %xmm4,%xmm3 3507 3508 paddd %xmm1,%xmm2 3509 pcmpeqd %xmm5,%xmm1 3510 movdqa %xmm0,64(%rax) 3511 movdqa %xmm4,%xmm0 3512 3513 paddd %xmm2,%xmm3 3514 pcmpeqd %xmm5,%xmm2 3515 movdqa %xmm1,80(%rax) 3516 movdqa %xmm4,%xmm1 3517 3518 paddd %xmm3,%xmm0 3519 pcmpeqd %xmm5,%xmm3 3520 movdqa %xmm2,96(%rax) 3521 movdqa %xmm4,%xmm2 3522 movdqa %xmm3,112(%rax)/* last mask stored at rax+112 */ 3523 jmp .Lgather/* jump over the alignment padding to the loop */ 3524 3525.align 32 3526.Lgather:/* one pass per output qword */ 3527 pxor %xmm4,%xmm4 3528 pxor %xmm5,%xmm5/* clear both accumulators */ 3529 movdqa -128(%r11),%xmm0 3530 movdqa -112(%r11),%xmm1 3531 movdqa -96(%r11),%xmm2 3532 pand -128(%rax),%xmm0/* keep a 16-byte chunk only where its mask is set */ 3533 movdqa -80(%r11),%xmm3 3534 pand -112(%rax),%xmm1 3535 por %xmm0,%xmm4 3536 pand -96(%rax),%xmm2 3537 por %xmm1,%xmm5 3538 pand -80(%rax),%xmm3 3539 por %xmm2,%xmm4 3540 por %xmm3,%xmm5/* chunks alternate between the xmm4 and xmm5 accumulators */ 3541 movdqa -64(%r11),%xmm0 3542 movdqa -48(%r11),%xmm1 3543 movdqa -32(%r11),%xmm2 3544 pand -64(%rax),%xmm0 3545 movdqa -16(%r11),%xmm3 3546 pand -48(%rax),%xmm1 3547 por %xmm0,%xmm4 3548 pand -32(%rax),%xmm2 3549 por %xmm1,%xmm5 3550 pand -16(%rax),%xmm3 3551 por %xmm2,%xmm4 3552 por %xmm3,%xmm5 3553 movdqa 0(%r11),%xmm0 3554 movdqa 16(%r11),%xmm1 3555 movdqa 32(%r11),%xmm2 3556 pand 
0(%rax),%xmm0/* completes the pand begun on the previous line */ 3557 movdqa 48(%r11),%xmm3 3558 pand 16(%rax),%xmm1 3559 por %xmm0,%xmm4 3560 pand 32(%rax),%xmm2 3561 por %xmm1,%xmm5 3562 pand 48(%rax),%xmm3 3563 por %xmm2,%xmm4 3564 por %xmm3,%xmm5 3565 movdqa 64(%r11),%xmm0 3566 movdqa 80(%r11),%xmm1 3567 movdqa 96(%r11),%xmm2 3568 pand 64(%rax),%xmm0 3569 movdqa 112(%r11),%xmm3 3570 pand 80(%rax),%xmm1 3571 por %xmm0,%xmm4 3572 pand 96(%rax),%xmm2 3573 por %xmm1,%xmm5 3574 pand 112(%rax),%xmm3 3575 por %xmm2,%xmm4 3576 por %xmm3,%xmm5 3577 por %xmm5,%xmm4/* merge the two accumulators into xmm4 */ 3578 leaq 256(%r11),%r11/* advance the table pointer by 256 bytes for the next pass */ 3579 pshufd $0x4e,%xmm4,%xmm0/* xmm0 = xmm4 with its two 64-bit halves swapped */ 3580 por %xmm4,%xmm0/* low 64 bits = OR of both halves of the accumulator */ 3581 movq %xmm0,(%rdi)/* store the low 64 bits as one output qword */ 3582 leaq 8(%rdi),%rdi 3583 subl $1,%esi 3584 jnz .Lgather/* NOTE(review): esi is tested only after a pass has run, so a zero count is not handled here */ 3585 3586 leaq (%r10),%rsp/* rsp = r10. NOTE(review): r10 is presumably the stack pointer saved at function entry, it is not written in this loop */ 3587 .byte 0xf3,0xc3/* encodes rep ret */ 3588.LSEH_end_bn_gather5: 3589.size bn_gather5,.-bn_gather5 3590.align 64 3591.Linc:/* 32 bytes of constants addressed as .Linc(%rip): dwords 0,0,1,1 followed by dwords 2,2,2,2 */ 3592.long 0,0, 1,1 3593.long 2,2, 2,2 3594.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0/* NUL-terminated ASCII banner: Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by (author e-mail address in angle brackets) */ 3595