rsaz-x86_64.pl revision 356290
#!/usr/bin/env perl

##############################################################################
#                                                                            #
#  Copyright (c) 2012, Intel Corporation                                     #
#                                                                            #
#  All rights reserved.                                                      #
#                                                                            #
#  Redistribution and use in source and binary forms, with or without       #
#  modification, are permitted provided that the following conditions are   #
#  met:                                                                      #
#                                                                            #
#  *  Redistributions of source code must retain the above copyright        #
#     notice, this list of conditions and the following disclaimer.         #
#                                                                            #
#  *  Redistributions in binary form must reproduce the above copyright     #
#     notice, this list of conditions and the following disclaimer in the   #
#     documentation and/or other materials provided with the                #
#     distribution.                                                          #
#                                                                            #
#  *  Neither the name of the Intel Corporation nor the names of its        #
#     contributors may be used to endorse or promote products derived from  #
#     this software without specific prior written permission.              #
#                                                                            #
#                                                                            #
#  THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY         #
#  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE        #
#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR       #
#  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR           #
#  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,    #
#  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,      #
#  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR       #
#  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF   #
#  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING     #
#  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS       #
#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.             #
#                                                                            #
##############################################################################
# Developers and authors:                                                    #
# Shay Gueron (1, 2), and Vlad Krasnov (1)                                   #
# (1) Intel Architecture Group, Microprocessor and Chipset Development,      #
#     Israel Development Center, Haifa, Israel                               #
# (2) University of Haifa                                                    #
##############################################################################
# Reference:                                                                 #
# [1] S. Gueron, "Efficient Software Implementations of Modular              #
#     Exponentiation", http://eprint.iacr.org/2011/239                       #
# [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring".             #
#     IEEE Proceedings of 9th International Conference on Information        #
#     Technology: New Generations (ITNG 2012), 821-823 (2012).               #
# [3] S. Gueron, "Efficient Software Implementations of Modular              #
#     Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012).  #
# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis    #
#     resistant 512-bit and 1024-bit modular exponentiation for optimizing   #
#     RSA1024 and RSA2048 on x86_64 platforms",                              #
#     http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest#
##############################################################################

# While original submission covers 512- and 1024-bit exponentiation,
# this module is limited to the 512-bit version only (and as such
# accelerates RSA1024 sign). This is because the improvement for longer
# keys is not high enough to justify the effort; the highest measured
# was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, upcoming
# at the time of this writing!] Nor does this module implement a
# "monolithic" complete-exponentiation jumbo-subroutine; it adheres
# to a more modular mixture of C and assembly. And it's optimized
# even for processors other than the Intel Core family (see the table
# below for improvement coefficients).
#
#						<appro@openssl.org>
#
# RSA1024 sign/sec  this/original |this/rsax(*)  this/fips(*)
# ----------------+---------------------------
# Opteron           +13%          |+5%           +20%
# Bulldozer         -0%           |-1%           +10%
# P4                +11%          |+7%           +8%
# Westmere          +5%           |+14%          +17%
# Sandy Bridge      +2%           |+12%          +29%
# Ivy Bridge        +1%           |+11%          +35%
# Haswell(**)       -0%           |+12%          +39%
# Atom              +13%          |+11%          +4%
# VIA Nano          +70%          |+9%           +25%
#
# (*)  rsax engine and fips numbers are presented for reference
#      purposes;
# (**) MULX was attempted, but found to give only marginal improvement;
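# The C side of that mixture drives the primitives exported below roughly
# as sketched here. This is a minimal illustration only, not the actual
# glue code (in OpenSSL that lives in rsaz_exp.c and differs in detail);
# the driver name `exp512_sketch` and the exact windowing are assumptions,
# while the prototypes mirror the .globl declarations in this file:
#
#	#include <stdint.h>
#
#	void rsaz_512_sqr(uint64_t *out, const uint64_t *inp,
#			  const uint64_t *mod, uint64_t n0, int times);
#	void rsaz_512_mul_gather4(uint64_t *out, const uint64_t *ap,
#			  const uint64_t *table, const uint64_t *mod,
#			  uint64_t n0, int power);
#	void rsaz_512_mul_by_one(uint64_t *out, const uint64_t *inp,
#			  const uint64_t *mod, uint64_t n0);
#
#	/* out = base^e mod m, scanning e in 4-bit windows from the top.
#	   acc starts as 1 in Montgomery form, table[i] = base^i in
#	   Montgomery form (built with rsaz_512_mul/rsaz_512_scatter4,
#	   omitted here), n0 = -m^{-1} mod 2^64. */
#	static void exp512_sketch(uint64_t out[8], uint64_t acc[8],
#				  const uint64_t e[8], const uint64_t *table,
#				  const uint64_t m[8], uint64_t n0)
#	{
#		for (int bit = 508; bit >= 0; bit -= 4) {
#			int w = (int)((e[bit / 64] >> (bit % 64)) & 0xf);
#			rsaz_512_sqr(acc, acc, m, n0, 4);  /* acc = acc^16 */
#			rsaz_512_mul_gather4(acc, acc, table, m, n0, w);
#		}
#		rsaz_512_mul_by_one(out, acc, m, n0); /* leave Montgomery form */
#	}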
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.23);
}

if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.10);
}

if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$addx = ($1>=12);
}

if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$addx = ($ver>=3.03);
}
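# What the $addx probes above are after: MULX (BMI2) computes a 64x64->128
# multiplication without touching the flags, while ADCX/ADOX (ADX) add with
# carry through CF and OF respectively, so two independent carry chains can
# be interleaved without spilling flags. A rough C-intrinsics rendition of
# one multiply-accumulate row in that style (illustration only;
# `mac_row_sketch` is a hypothetical name, and whether the compiler really
# emits adcx/adox from _addcarryx_u64 is up to it, e.g. with -mbmi2 -madx):
#
#	#include <immintrin.h>
#
#	static void mac_row_sketch(unsigned long long acc[9],
#				   const unsigned long long a[8],
#				   unsigned long long b)
#	{
#		unsigned char cf = 0, of = 0;
#		unsigned long long lo, hi;
#		for (int j = 0; j < 8; j++) {
#			lo = _mulx_u64(a[j], b, &hi); /* CF and OF untouched */
#			cf = _addcarryx_u64(cf, acc[j], lo, &acc[j]);     /* CF chain */
#			of = _addcarryx_u64(of, acc[j+1], hi, &acc[j+1]); /* OF chain */
#		}
#		(void)_addcarry_u64(cf, acc[8], 0, &acc[8]); /* fold last CF;
#				assumes the row cannot overflow acc[8] */
#	}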
($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp");	# common internal API
{
my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");

$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	rsaz_512_sqr
.type	rsaz_512_sqr,\@function,5
.align	32
rsaz_512_sqr:				# 25-29% faster than rsaz_512_mul
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	subq	\$128+24, %rsp
.Lsqr_body:
	movq	$mod, %xmm1		# common off-load
	movq	($inp), %rdx
	movq	8($inp), %rax
	movq	$n0, 128(%rsp)
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
	je	.Loop_sqrx
___
$code.=<<___;
	jmp	.Loop_sqr

.align	32
.Loop_sqr:
	movl	$times,128+8(%rsp)
#first iteration
	movq	%rdx, %rbx		# 0($inp)
	mov	%rax, %rbp		# 8($inp)
	mulq	%rdx
	movq	%rax, %r8
	movq	16($inp), %rax
	movq	%rdx, %r9

	mulq	%rbx
	addq	%rax, %r9
	movq	24($inp), %rax
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r10
	movq	32($inp), %rax
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r11
	movq	40($inp), %rax
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r12
	movq	48($inp), %rax
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r13
	movq	56($inp), %rax
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r14
	movq	%rbx, %rax
	adcq	\$0, %rdx

	xorq	%rcx,%rcx		# rcx:r8 = r8 << 1
	addq	%r8, %r8
	movq	%rdx, %r15
	adcq	\$0, %rcx

	mulq	%rax
	addq	%r8, %rdx
	adcq	\$0, %rcx

	movq	%rax, (%rsp)
	movq	%rdx, 8(%rsp)

#second iteration
	movq	16($inp), %rax
	mulq	%rbp
	addq	%rax, %r10
	movq	24($inp), %rax
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%rbp
	addq	%rax, %r11
	movq	32($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r11
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%rbp
	addq	%rax, %r12
	movq	40($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r12
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%rbp
	addq	%rax, %r13
	movq	48($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r13
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%rbp
	addq	%rax, %r14
	movq	56($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r14
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%rbp
	addq	%rax, %r15
	movq	%rbp, %rax
	adcq	\$0, %rdx
	addq	%rbx, %r15
	adcq	\$0, %rdx

	xorq	%rbx, %rbx		# rbx:r10:r9 = r10:r9 << 1
	addq	%r9, %r9
	movq	%rdx, %r8
	adcq	%r10, %r10
	adcq	\$0, %rbx

	mulq	%rax
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rcx, %rax
	movq	16($inp), %rbp
	addq	%rax, %r9
	movq	24($inp), %rax
	adcq	%rdx, %r10
	adcq	\$0, %rbx

	movq	%r9, 16(%rsp)
	movq	%r10, 24(%rsp)

#third iteration
	mulq	%rbp
	addq	%rax, %r12
	movq	32($inp), %rax
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%rbp
	addq	%rax, %r13
	movq	40($inp), %rax
	adcq	\$0, %rdx
	addq	%rcx, %r13
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%rbp
	addq	%rax, %r14
	movq	48($inp), %rax
	adcq	\$0, %rdx
	addq	%rcx, %r14
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%rbp
	addq	%rax, %r15
	movq	56($inp), %rax
	adcq	\$0, %rdx
	addq	%rcx, %r15
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%rbp
	addq	%rax, %r8
	movq	%rbp, %rax
	adcq	\$0, %rdx
	addq	%rcx, %r8
	adcq	\$0, %rdx

	xorq	%rcx, %rcx		# rcx:r12:r11 = r12:r11 << 1
	addq	%r11, %r11
	movq	%rdx, %r9
	adcq	%r12, %r12
	adcq	\$0, %rcx

	mulq	%rax
	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rbx, %rax
	movq	24($inp), %r10
	addq	%rax, %r11
	movq	32($inp), %rax
	adcq	%rdx, %r12
	adcq	\$0, %rcx

	movq	%r11, 32(%rsp)
	movq	%r12, 40(%rsp)

#fourth iteration
	mov	%rax, %r11		# 32($inp)
	mulq	%r10
	addq	%rax, %r14
	movq	40($inp), %rax
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mov	%rax, %r12		# 40($inp)
	mulq	%r10
	addq	%rax, %r15
	movq	48($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r15
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mov	%rax, %rbp		# 48($inp)
	mulq	%r10
	addq	%rax, %r8
	movq	56($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r8
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r10
	addq	%rax, %r9
	movq	%r10, %rax
	adcq	\$0, %rdx
	addq	%rbx, %r9
	adcq	\$0, %rdx

	xorq	%rbx, %rbx		# rbx:r13:r14 = r13:r14 << 1
	addq	%r13, %r13
	movq	%rdx, %r10
	adcq	%r14, %r14
	adcq	\$0, %rbx

	mulq	%rax
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rcx, %rax
	addq	%rax, %r13
	movq	%r12, %rax		# 40($inp)
	adcq	%rdx, %r14
	adcq	\$0, %rbx

	movq	%r13, 48(%rsp)
	movq	%r14, 56(%rsp)
#fifth iteration
	mulq	%r11
	addq	%rax, %r8
	movq	%rbp, %rax		# 48($inp)
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%r11
	addq	%rax, %r9
	movq	56($inp), %rax
	adcq	\$0, %rdx
	addq	%rcx, %r9
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mov	%rax, %r14		# 56($inp)
	mulq	%r11
	addq	%rax, %r10
	movq	%r11, %rax
	adcq	\$0, %rdx
	addq	%rcx, %r10
	adcq	\$0, %rdx

	xorq	%rcx, %rcx		# rcx:r8:r15 = r8:r15 << 1
	addq	%r15, %r15
	movq	%rdx, %r11
	adcq	%r8, %r8
	adcq	\$0, %rcx

	mulq	%rax
	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rbx, %rax
	addq	%rax, %r15
	movq	%rbp, %rax		# 48($inp)
	adcq	%rdx, %r8
	adcq	\$0, %rcx

	movq	%r15, 64(%rsp)
	movq	%r8, 72(%rsp)

#sixth iteration
	mulq	%r12
	addq	%rax, %r10
	movq	%r14, %rax		# 56($inp)
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r12
	addq	%rax, %r11
	movq	%r12, %rax
	adcq	\$0, %rdx
	addq	%rbx, %r11
	adcq	\$0, %rdx

	xorq	%rbx, %rbx		# rbx:r10:r9 = r10:r9 << 1
	addq	%r9, %r9
	movq	%rdx, %r12
	adcq	%r10, %r10
	adcq	\$0, %rbx

	mulq	%rax
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rcx, %rax
	addq	%rax, %r9
	movq	%r14, %rax		# 56($inp)
	adcq	%rdx, %r10
	adcq	\$0, %rbx

	movq	%r9, 80(%rsp)
	movq	%r10, 88(%rsp)

#seventh iteration
	mulq	%rbp
	addq	%rax, %r12
	movq	%rbp, %rax
	adcq	\$0, %rdx

	xorq	%rcx, %rcx		# rcx:r12:r11 = r12:r11 << 1
	addq	%r11, %r11
	movq	%rdx, %r13
	adcq	%r12, %r12
	adcq	\$0, %rcx

	mulq	%rax
	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rbx, %rax
	addq	%rax, %r11
	movq	%r14, %rax		# 56($inp)
	adcq	%rdx, %r12
	adcq	\$0, %rcx

	movq	%r11, 96(%rsp)
	movq	%r12, 104(%rsp)

#eighth iteration
	xorq	%rbx, %rbx		# rbx:r13 = r13 << 1
	addq	%r13, %r13
	adcq	\$0, %rbx

	mulq	%rax
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rcx, %rax
	addq	%r13, %rax
	adcq	%rbx, %rdx

	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15
	movq	%xmm1, %rbp

	movq	%rax, 112(%rsp)
	movq	%rdx, 120(%rsp)

	call	__rsaz_512_reduce

	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	movq	%r8, %rdx
	movq	%r9, %rax
	movl	128+8(%rsp), $times
	movq	$out, $inp

	decl	$times
	jnz	.Loop_sqr
___
if ($addx) {
$code.=<<___;
	jmp	.Lsqr_tail

.align	32
.Loop_sqrx:
	movl	$times,128+8(%rsp)
	movq	$out, %xmm0		# off-load
#first iteration
	mulx	%rax, %r8, %r9
	mov	%rax, %rbx

	mulx	16($inp), %rcx, %r10
	xor	%rbp, %rbp		# cf=0, of=0

	mulx	24($inp), %rax, %r11
	adcx	%rcx, %r9

	.byte	0xc4,0x62,0xf3,0xf6,0xa6,0x20,0x00,0x00,0x00	# mulx	32($inp), %rcx, %r12
	adcx	%rax, %r10

	.byte	0xc4,0x62,0xfb,0xf6,0xae,0x28,0x00,0x00,0x00	# mulx	40($inp), %rax, %r13
	adcx	%rcx, %r11

	mulx	48($inp), %rcx, %r14
	adcx	%rax, %r12
	adcx	%rcx, %r13

	mulx	56($inp), %rax, %r15
	adcx	%rax, %r14
	adcx	%rbp, %r15		# %rbp is 0

	mulx	%rdx, %rax, $out
	mov	%rbx, %rdx		# 8($inp)
	xor	%rcx, %rcx
	adox	%r8, %r8
	adcx	$out, %r8
	adox	%rbp, %rcx
	adcx	%rbp, %rcx

	mov	%rax, (%rsp)
	mov	%r8, 8(%rsp)

#second iteration
	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x10,0x00,0x00,0x00	# mulx	16($inp), %rax, %rbx
	adox	%rax, %r10
	adcx	%rbx, %r11

	mulx	24($inp), $out, %r8
	adox	$out, %r11
	.byte	0x66
	adcx	%r8, %r12

	mulx	32($inp), %rax, %rbx
	adox	%rax, %r12
	adcx	%rbx, %r13

	mulx	40($inp), $out, %r8
	adox	$out, %r13
	adcx	%r8, %r14

	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rbx
	adox	%rax, %r14
	adcx	%rbx, %r15

	.byte	0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00	# mulx	56($inp), $out, %r8
	adox	$out, %r15
	adcx	%rbp, %r8
	mulx	%rdx, %rax, $out
	adox	%rbp, %r8
	.byte	0x48,0x8b,0x96,0x10,0x00,0x00,0x00		# mov	16($inp), %rdx

	xor	%rbx, %rbx
	adox	%r9, %r9
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rcx, %rax
	adox	%r10, %r10
	adcx	%rax, %r9
	adox	%rbp, %rbx
	adcx	$out, %r10
	adcx	%rbp, %rbx

	mov	%r9, 16(%rsp)
	.byte	0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00		# mov	%r10, 24(%rsp)

#third iteration
	mulx	24($inp), $out, %r9
	adox	$out, %r12
	adcx	%r9, %r13

	mulx	32($inp), %rax, %rcx
	adox	%rax, %r13
	adcx	%rcx, %r14

	.byte	0xc4,0x62,0xc3,0xf6,0x8e,0x28,0x00,0x00,0x00	# mulx	40($inp), $out, %r9
	adox	$out, %r14
	adcx	%r9, %r15

	.byte	0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rcx
	adox	%rax, %r15
	adcx	%rcx, %r8

	mulx	56($inp), $out, %r9
	adox	$out, %r8
	adcx	%rbp, %r9
	mulx	%rdx, %rax, $out
	adox	%rbp, %r9
	mov	24($inp), %rdx

	xor	%rcx, %rcx
	adox	%r11, %r11
	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rbx, %rax
	adox	%r12, %r12
	adcx	%rax, %r11
	adox	%rbp, %rcx
	adcx	$out, %r12
	adcx	%rbp, %rcx

	mov	%r11, 32(%rsp)
	mov	%r12, 40(%rsp)

#fourth iteration
	mulx	32($inp), %rax, %rbx
	adox	%rax, %r14
	adcx	%rbx, %r15

	mulx	40($inp), $out, %r10
	adox	$out, %r15
	adcx	%r10, %r8

	mulx	48($inp), %rax, %rbx
	adox	%rax, %r8
	adcx	%rbx, %r9

	mulx	56($inp), $out, %r10
	adox	$out, %r9
	adcx	%rbp, %r10
	mulx	%rdx, %rax, $out
	adox	%rbp, %r10
	mov	32($inp), %rdx

	xor	%rbx, %rbx
	adox	%r13, %r13
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rcx, %rax
	adox	%r14, %r14
	adcx	%rax, %r13
	adox	%rbp, %rbx
	adcx	$out, %r14
	adcx	%rbp, %rbx

	mov	%r13, 48(%rsp)
	mov	%r14, 56(%rsp)

#fifth iteration
	mulx	40($inp), $out, %r11
	adox	$out, %r8
	adcx	%r11, %r9

	mulx	48($inp), %rax, %rcx
	adox	%rax, %r9
	adcx	%rcx, %r10

	mulx	56($inp), $out, %r11
	adox	$out, %r10
	adcx	%rbp, %r11
	mulx	%rdx, %rax, $out
	mov	40($inp), %rdx
	adox	%rbp, %r11

	xor	%rcx, %rcx
	adox	%r15, %r15
	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rbx, %rax
	adox	%r8, %r8
	adcx	%rax, %r15
	adox	%rbp, %rcx
	adcx	$out, %r8
	adcx	%rbp, %rcx

	mov	%r15, 64(%rsp)
	mov	%r8, 72(%rsp)

#sixth iteration
	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rbx
	adox	%rax, %r10
	adcx	%rbx, %r11

	.byte	0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00	# mulx	56($inp), $out, %r12
	adox	$out, %r11
	adcx	%rbp, %r12
	mulx	%rdx, %rax, $out
	adox	%rbp, %r12
	mov	48($inp), %rdx

	xor	%rbx, %rbx
	adox	%r9, %r9
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rcx, %rax
	adox	%r10, %r10
	adcx	%rax, %r9
	adcx	$out, %r10
	adox	%rbp, %rbx
	adcx	%rbp, %rbx

	mov	%r9, 80(%rsp)
	mov	%r10, 88(%rsp)

#seventh iteration
	.byte	0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00	# mulx	56($inp), %rax, %r13
	adox	%rax, %r12
	adox	%rbp, %r13

	mulx	%rdx, %rax, $out
	xor	%rcx, %rcx
	mov	56($inp), %rdx
	adox	%r11, %r11
	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rbx, %rax
	adox	%r12, %r12
	adcx	%rax, %r11
	adox	%rbp, %rcx
	adcx	$out, %r12
	adcx	%rbp, %rcx

	.byte	0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00		# mov	%r11, 96(%rsp)
	.byte	0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00		# mov	%r12, 104(%rsp)

#eighth iteration
	mulx	%rdx, %rax, %rdx
	xor	%rbx, %rbx
	adox	%r13, %r13
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rcx, %rax
	adox	%rbp, %rbx
	adcx	%r13, %rax
	adcx	%rdx, %rbx

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	128(%rsp), %rdx		# pull $n0
	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	movq	%rax, 112(%rsp)
	movq	%rbx, 120(%rsp)

	call	__rsaz_512_reducex

	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	movq	%r8, %rdx
	movq	%r9, %rax
	movl	128+8(%rsp), $times
	movq	$out, $inp

	decl	$times
	jnz	.Loop_sqrx

.Lsqr_tail:
___
}
$code.=<<___;

	leaq	128+24+48(%rsp), %rax
	movq	-48(%rax), %r15
	movq	-40(%rax), %r14
	movq	-32(%rax), %r13
	movq	-24(%rax), %r12
	movq	-16(%rax), %rbp
	movq	-8(%rax), %rbx
	leaq	(%rax), %rsp
.Lsqr_epilogue:
	ret
.size	rsaz_512_sqr,.-rsaz_512_sqr
___
}
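# rsaz_512_sqr above follows the squaring method of reference [2]: each
# off-diagonal product a[i]*a[j], i < j, is computed only once and doubled
# (the "<< 1" shifts commented in the code), after which the diagonal
# squares a[i]^2 are folded in. A plain-C rendition of the same structure
# (illustration only; assumes a compiler with unsigned __int128):
#
#	#include <stdint.h>
#	#include <string.h>
#
#	static void sqr512_sketch(uint64_t r[16], const uint64_t a[8])
#	{
#		unsigned __int128 acc;
#		uint64_t carry;
#		memset(r, 0, 16 * sizeof(uint64_t));
#		for (int i = 0; i < 8; i++) {	/* cross products, once each */
#			carry = 0;
#			for (int j = i + 1; j < 8; j++) {
#				acc = (unsigned __int128)a[i]*a[j] + r[i+j] + carry;
#				r[i+j] = (uint64_t)acc;
#				carry  = (uint64_t)(acc >> 64);
#			}
#			r[i+8] = carry;
#		}
#		carry = 0;			/* double the cross products */
#		for (int k = 0; k < 16; k++) {
#			uint64_t top = r[k] >> 63;
#			r[k] = (r[k] << 1) | carry;
#			carry = top;
#		}
#		carry = 0;			/* add the diagonal squares */
#		for (int i = 0; i < 8; i++) {
#			acc = (unsigned __int128)a[i]*a[i] + r[2*i] + carry;
#			r[2*i] = (uint64_t)acc;
#			acc = (unsigned __int128)r[2*i+1] + (uint64_t)(acc >> 64);
#			r[2*i+1] = (uint64_t)acc;
#			carry = (uint64_t)(acc >> 64);
#		}
#	}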
{
my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
$code.=<<___;
.globl	rsaz_512_mul
.type	rsaz_512_mul,\@function,5
.align	32
rsaz_512_mul:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	subq	\$128+24, %rsp
.Lmul_body:
	movq	$out, %xmm0		# off-load arguments
	movq	$mod, %xmm1
	movq	$n0, 128(%rsp)
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
	je	.Lmulx
___
$code.=<<___;
	movq	($bp), %rbx		# pass b[0]
	movq	$bp, %rbp		# pass argument
	call	__rsaz_512_mul

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lmul_tail

.align	32
.Lmulx:
	movq	$bp, %rbp		# pass argument
	movq	($bp), %rdx		# pass b[0]
	call	__rsaz_512_mulx

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	128(%rsp), %rdx		# pull $n0
	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reducex
.Lmul_tail:
___
$code.=<<___;
	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	leaq	128+24+48(%rsp), %rax
	movq	-48(%rax), %r15
	movq	-40(%rax), %r14
	movq	-32(%rax), %r13
	movq	-24(%rax), %r12
	movq	-16(%rax), %rbp
	movq	-8(%rax), %rbx
	leaq	(%rax), %rsp
.Lmul_epilogue:
	ret
.size	rsaz_512_mul,.-rsaz_512_mul
___
}
{
my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
$code.=<<___;
.globl	rsaz_512_mul_gather4
.type	rsaz_512_mul_gather4,\@function,6
.align	32
rsaz_512_mul_gather4:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	subq	\$`128+24+($win64?0xb0:0)`, %rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,0xa0(%rsp)
	movaps	%xmm7,0xb0(%rsp)
	movaps	%xmm8,0xc0(%rsp)
	movaps	%xmm9,0xd0(%rsp)
	movaps	%xmm10,0xe0(%rsp)
	movaps	%xmm11,0xf0(%rsp)
	movaps	%xmm12,0x100(%rsp)
	movaps	%xmm13,0x110(%rsp)
	movaps	%xmm14,0x120(%rsp)
	movaps	%xmm15,0x130(%rsp)
___
$code.=<<___;
.Lmul_gather4_body:
	movd	$pwr,%xmm8
	movdqa	.Linc+16(%rip),%xmm1	# 00000002000000020000000200000002
	movdqa	.Linc(%rip),%xmm0	# 00000001000000010000000000000000

	pshufd	\$0,%xmm8,%xmm8		# broadcast $power
	movdqa	%xmm1,%xmm7
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..15 to $power
#
for($i=0;$i<4;$i++) {
$code.=<<___;
	paddd	%xmm`$i`,%xmm`$i+1`
	pcmpeqd	%xmm8,%xmm`$i`
	movdqa	%xmm7,%xmm`$i+3`
___
}
for(;$i<7;$i++) {
$code.=<<___;
	paddd	%xmm`$i`,%xmm`$i+1`
	pcmpeqd	%xmm8,%xmm`$i`
___
}
$code.=<<___;
	pcmpeqd	%xmm8,%xmm7

	movdqa	16*0($bp),%xmm8
	movdqa	16*1($bp),%xmm9
	movdqa	16*2($bp),%xmm10
	movdqa	16*3($bp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	16*4($bp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	16*5($bp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	16*6($bp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	16*7($bp),%xmm15
	leaq	128($bp), %rbp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	\$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
	je	.Lmulx_gather
___
$code.=<<___;
	movq	%xmm8,%rbx

	movq	$n0, 128(%rsp)		# off-load arguments
	movq	$out, 128+8(%rsp)
	movq	$mod, 128+16(%rsp)

	movq	($ap), %rax
	movq	8($ap), %rcx
	mulq	%rbx			# 0 iteration
	movq	%rax, (%rsp)
	movq	%rcx, %rax
	movq	%rdx, %r8

	mulq	%rbx
	addq	%rax, %r8
	movq	16($ap), %rax
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r9
	movq	24($ap), %rax
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r10
	movq	32($ap), %rax
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r11
	movq	40($ap), %rax
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r12
	movq	48($ap), %rax
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r13
	movq	56($ap), %rax
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r14
	movq	($ap), %rax
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8(%rsp), %rdi
	movl	\$7, %ecx
	jmp	.Loop_mul_gather

.align	32
.Loop_mul_gather:
	movdqa	16*0(%rbp),%xmm8
	movdqa	16*1(%rbp),%xmm9
	movdqa	16*2(%rbp),%xmm10
	movdqa	16*3(%rbp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	16*4(%rbp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	16*5(%rbp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	16*6(%rbp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	16*7(%rbp),%xmm15
	leaq	128(%rbp), %rbp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	\$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
	movq	%xmm8,%rbx

	mulq	%rbx
	addq	%rax, %r8
	movq	8($ap), %rax
	movq	%r8, (%rdi)
	movq	%rdx, %r8
	adcq	\$0, %r8

	mulq	%rbx
	addq	%rax, %r9
	movq	16($ap), %rax
	adcq	\$0, %rdx
	addq	%r9, %r8
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r10
	movq	24($ap), %rax
	adcq	\$0, %rdx
	addq	%r10, %r9
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r11
	movq	32($ap), %rax
	adcq	\$0, %rdx
	addq	%r11, %r10
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r12
	movq	40($ap), %rax
	adcq	\$0, %rdx
	addq	%r12, %r11
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r13
	movq	48($ap), %rax
	adcq	\$0, %rdx
	addq	%r13, %r12
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r14
	movq	56($ap), %rax
	adcq	\$0, %rdx
	addq	%r14, %r13
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r15
	movq	($ap), %rax
	adcq	\$0, %rdx
	addq	%r15, %r14
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8(%rdi), %rdi

	decl	%ecx
	jnz	.Loop_mul_gather

	movq	%r8, (%rdi)
	movq	%r9, 8(%rdi)
	movq	%r10, 16(%rdi)
	movq	%r11, 24(%rdi)
	movq	%r12, 32(%rdi)
	movq	%r13, 40(%rdi)
	movq	%r14, 48(%rdi)
	movq	%r15, 56(%rdi)

	movq	128+8(%rsp), $out
	movq	128+16(%rsp), %rbp

	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lmul_gather_tail

.align	32
.Lmulx_gather:
	movq	%xmm8,%rdx

	mov	$n0, 128(%rsp)		# off-load arguments
	mov	$out, 128+8(%rsp)
	mov	$mod, 128+16(%rsp)

	mulx	($ap), %rbx, %r8	# 0 iteration
	mov	%rbx, (%rsp)
	xor	%edi, %edi		# cf=0, of=0

	mulx	8($ap), %rax, %r9

	mulx	16($ap), %rbx, %r10
	adcx	%rax, %r8

	mulx	24($ap), %rax, %r11
	adcx	%rbx, %r9

	mulx	32($ap), %rbx, %r12
	adcx	%rax, %r10

	mulx	40($ap), %rax, %r13
	adcx	%rbx, %r11

	mulx	48($ap), %rbx, %r14
	adcx	%rax, %r12

	mulx	56($ap), %rax, %r15
	adcx	%rbx, %r13
	adcx	%rax, %r14
	.byte	0x67
	mov	%r8, %rbx
	adcx	%rdi, %r15		# %rdi is 0

	mov	\$-7, %rcx
	jmp	.Loop_mulx_gather

.align	32
.Loop_mulx_gather:
	movdqa	16*0(%rbp),%xmm8
	movdqa	16*1(%rbp),%xmm9
	movdqa	16*2(%rbp),%xmm10
	movdqa	16*3(%rbp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	16*4(%rbp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	16*5(%rbp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	16*6(%rbp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	16*7(%rbp),%xmm15
	leaq	128(%rbp), %rbp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	\$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
	movq	%xmm8,%rdx

	.byte	0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00	# mulx	($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	mulx	8($ap), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	mulx	16($ap), %rax, %r10
	adcx	%rax, %r9
	adox	%r11, %r10

	.byte	0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00	# mulx	24($ap), %rax, %r11
	adcx	%rax, %r10
	adox	%r12, %r11

	mulx	32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($ap), %rax, %r14
	adcx	%rax, %r13
	.byte	0x67
	adox	%r15, %r14

	mulx	56($ap), %rax, %r15
	mov	%rbx, 64(%rsp,%rcx,8)
	adcx	%rax, %r14
	adox	%rdi, %r15
	mov	%r8, %rbx
	adcx	%rdi, %r15		# cf=0

	inc	%rcx			# of=0
	jnz	.Loop_mulx_gather

	mov	%r8, 64(%rsp)
	mov	%r9, 64+8(%rsp)
	mov	%r10, 64+16(%rsp)
	mov	%r11, 64+24(%rsp)
	mov	%r12, 64+32(%rsp)
	mov	%r13, 64+40(%rsp)
	mov	%r14, 64+48(%rsp)
	mov	%r15, 64+56(%rsp)

	mov	128(%rsp), %rdx		# pull arguments
	mov	128+8(%rsp), $out
	mov	128+16(%rsp), %rbp

	mov	(%rsp), %r8
	mov	8(%rsp), %r9
	mov	16(%rsp), %r10
	mov	24(%rsp), %r11
	mov	32(%rsp), %r12
	mov	40(%rsp), %r13
	mov	48(%rsp), %r14
	mov	56(%rsp), %r15

	call	__rsaz_512_reducex

.Lmul_gather_tail:
___
$code.=<<___;
	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	leaq	128+24+48(%rsp), %rax
___
$code.=<<___ if ($win64);
	movaps	0xa0-0xc8(%rax),%xmm6
	movaps	0xb0-0xc8(%rax),%xmm7
	movaps	0xc0-0xc8(%rax),%xmm8
	movaps	0xd0-0xc8(%rax),%xmm9
	movaps	0xe0-0xc8(%rax),%xmm10
	movaps	0xf0-0xc8(%rax),%xmm11
	movaps	0x100-0xc8(%rax),%xmm12
	movaps	0x110-0xc8(%rax),%xmm13
	movaps	0x120-0xc8(%rax),%xmm14
	movaps	0x130-0xc8(%rax),%xmm15
	lea	0xb0(%rax),%rax
___
$code.=<<___;
	movq	-48(%rax), %r15
	movq	-40(%rax), %r14
	movq	-32(%rax), %r13
	movq	-24(%rax), %r12
	movq	-16(%rax), %rbp
	movq	-8(%rax), %rbx
	leaq	(%rax), %rsp
.Lmul_gather4_epilogue:
	ret
.size	rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
___
}
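# Note that rsaz_512_mul_gather4 never forms an address that depends on
# $pwr: every pass reads all 16 table entries (eight 16-byte vectors, one
# full 128-byte stride) and keeps exactly one of them via the
# pcmpeqd-derived AND/OR masks, so the memory access pattern is
# power-independent. The scalar equivalent of selecting one word this way
# (illustration only; `gather_word_sketch` is a hypothetical name):
#
#	#include <stdint.h>
#
#	static uint64_t gather_word_sketch(const uint64_t row[16],
#					   unsigned power)
#	{
#		uint64_t r = 0;
#		for (unsigned i = 0; i < 16; i++) {
#			/* all-ones when i == power, else zero */
#			uint64_t mask = 0 - (uint64_t)(i == power);
#			r |= row[i] & mask;	/* touch every entry */
#		}
#		return r;
#	}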
{
my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
$code.=<<___;
.globl	rsaz_512_mul_scatter4
.type	rsaz_512_mul_scatter4,\@function,6
.align	32
rsaz_512_mul_scatter4:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	mov	$pwr, $pwr
	subq	\$128+24, %rsp
.Lmul_scatter4_body:
	leaq	($tbl,$pwr,8), $tbl
	movq	$out, %xmm0		# off-load arguments
	movq	$mod, %xmm1
	movq	$tbl, %xmm2
	movq	$n0, 128(%rsp)

	movq	$out, %rbp
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
	je	.Lmulx_scatter
___
$code.=<<___;
	movq	($out),%rbx		# pass b[0]
	call	__rsaz_512_mul

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lmul_scatter_tail

.align	32
.Lmulx_scatter:
	movq	($out), %rdx		# pass b[0]
	call	__rsaz_512_mulx

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	128(%rsp), %rdx		# pull $n0
	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reducex

.Lmul_scatter_tail:
___
$code.=<<___;
	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	movq	%xmm2, $inp
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	movq	%r8, 128*0($inp)	# scatter
	movq	%r9, 128*1($inp)
	movq	%r10, 128*2($inp)
	movq	%r11, 128*3($inp)
	movq	%r12, 128*4($inp)
	movq	%r13, 128*5($inp)
	movq	%r14, 128*6($inp)
	movq	%r15, 128*7($inp)

	leaq	128+24+48(%rsp), %rax
	movq	-48(%rax), %r15
	movq	-40(%rax), %r14
	movq	-32(%rax), %r13
	movq	-24(%rax), %r12
	movq	-16(%rax), %rbp
	movq	-8(%rax), %rbx
	leaq	(%rax), %rsp
.Lmul_scatter4_epilogue:
	ret
.size	rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
___
}
{
my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
$code.=<<___;
.globl	rsaz_512_mul_by_one
.type	rsaz_512_mul_by_one,\@function,4
.align	32
rsaz_512_mul_by_one:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	subq	\$128+24, %rsp
.Lmul_by_one_body:
___
$code.=<<___ if ($addx);
	movl	OPENSSL_ia32cap_P+8(%rip),%eax
___
$code.=<<___;
	movq	$mod, %rbp		# reassign argument
	movq	$n0, 128(%rsp)

	movq	($inp), %r8
	pxor	%xmm0, %xmm0
	movq	8($inp), %r9
	movq	16($inp), %r10
	movq	24($inp), %r11
	movq	32($inp), %r12
	movq	40($inp), %r13
	movq	48($inp), %r14
	movq	56($inp), %r15

	movdqa	%xmm0, (%rsp)
	movdqa	%xmm0, 16(%rsp)
	movdqa	%xmm0, 32(%rsp)
	movdqa	%xmm0, 48(%rsp)
	movdqa	%xmm0, 64(%rsp)
	movdqa	%xmm0, 80(%rsp)
	movdqa	%xmm0, 96(%rsp)
___
$code.=<<___ if ($addx);
	andl	\$0x80100,%eax
	cmpl	\$0x80100,%eax		# check for MULX and ADO/CX
	je	.Lby_one_callx
___
$code.=<<___;
	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lby_one_tail
.align	32
.Lby_one_callx:
	movq	128(%rsp), %rdx		# pull $n0
	call	__rsaz_512_reducex
.Lby_one_tail:
___
$code.=<<___;
	movq	%r8, ($out)
	movq	%r9, 8($out)
	movq	%r10, 16($out)
	movq	%r11, 24($out)
	movq	%r12, 32($out)
	movq	%r13, 40($out)
	movq	%r14, 48($out)
	movq	%r15, 56($out)

	leaq	128+24+48(%rsp), %rax
	movq	-48(%rax), %r15
	movq	-40(%rax), %r14
	movq	-32(%rax), %r13
	movq	-24(%rax), %r12
	movq	-16(%rax), %rbp
	movq	-8(%rax), %rbx
	leaq	(%rax), %rsp
.Lmul_by_one_epilogue:
	ret
.size	rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
___
}
{	# __rsaz_512_reduce
	#
	# input:	%r8-%r15, %rbp - mod, 128(%rsp) - n0
	# output:	%r8-%r15
	# clobbers:	everything except %rbp and %rdi
$code.=<<___;
.type	__rsaz_512_reduce,\@abi-omnipotent
.align	32
__rsaz_512_reduce:
	movq	%r8, %rbx
	imulq	128+8(%rsp), %rbx
	movq	0(%rbp), %rax
	movl	\$8, %ecx
	jmp	.Lreduction_loop

.align	32
.Lreduction_loop:
	mulq	%rbx
	movq	8(%rbp), %rax
	negq	%r8
	movq	%rdx, %r8
	adcq	\$0, %r8

	mulq	%rbx
	addq	%rax, %r9
	movq	16(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r9, %r8
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r10
	movq	24(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r10, %r9
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r11
	movq	32(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r11, %r10
	movq	128+8(%rsp), %rsi
	#movq	%rdx, %r11
	#adcq	\$0, %r11
	adcq	\$0, %rdx
	movq	%rdx, %r11

	mulq	%rbx
	addq	%rax, %r12
	movq	40(%rbp), %rax
	adcq	\$0, %rdx
	imulq	%r8, %rsi
	addq	%r12, %r11
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r13
	movq	48(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r13, %r12
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r14
	movq	56(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r14, %r13
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	movq	%rsi, %rbx
	addq	%rax, %r15
	movq	0(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r15, %r14
	movq	%rdx, %r15
	adcq	\$0, %r15

	decl	%ecx
	jne	.Lreduction_loop

	ret
.size	__rsaz_512_reduce,.-__rsaz_512_reduce
___
}
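# __rsaz_512_reduce performs eight rounds of word-by-word Montgomery
# reduction: each round derives the factor %rbx = t[i]*n0 mod 2^64 (which
# makes the low word of the running sum vanish; the negq above only
# recovers the carry out of that vanished word), accumulates %rbx times
# the modulus, and retires one word. The textbook equivalent in plain C
# (illustration only; the product's upper half and final carry are folded
# in by the callers' adcq chains and the conditional subtraction):
#
#	#include <stdint.h>
#
#	/* t[] holds a 16-word (1024-bit) product; n0 = -m^{-1} mod 2^64.
#	   On return, t[8..15] plus the returned carry holds
#	   t/2^512 mod m, up to one extra subtraction of m. */
#	static uint64_t reduce512_sketch(uint64_t t[16], const uint64_t m[8],
#					 uint64_t n0)
#	{
#		uint64_t carry = 0;
#		for (int i = 0; i < 8; i++) {
#			uint64_t k = t[i] * n0; /* t[i] + k*m[0] == 0 mod 2^64 */
#			unsigned __int128 acc = 0;
#			for (int j = 0; j < 8; j++) {
#				acc += (unsigned __int128)k * m[j] + t[i+j];
#				t[i+j] = (uint64_t)acc;
#				acc >>= 64;
#			}
#			acc += (unsigned __int128)t[i+8] + carry;
#			t[i+8] = (uint64_t)acc;
#			carry = (uint64_t)(acc >> 64);
#		}
#		return carry;
#	}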
if ($addx) {
	# __rsaz_512_reducex
	#
	# input:	%r8-%r15, %rbp - mod, 128(%rsp) - n0
	# output:	%r8-%r15
	# clobbers:	everything except %rbp and %rdi
$code.=<<___;
.type	__rsaz_512_reducex,\@abi-omnipotent
.align	32
__rsaz_512_reducex:
	#movq	128+8(%rsp), %rdx	# pull $n0
	imulq	%r8, %rdx
	xorq	%rsi, %rsi		# cf=0,of=0
	movl	\$8, %ecx
	jmp	.Lreduction_loopx

.align	32
.Lreduction_loopx:
	mov	%r8, %rbx
	mulx	0(%rbp), %rax, %r8
	adcx	%rbx, %rax
	adox	%r9, %r8

	mulx	8(%rbp), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	mulx	16(%rbp), %rbx, %r10
	adcx	%rbx, %r9
	adox	%r11, %r10

	mulx	24(%rbp), %rbx, %r11
	adcx	%rbx, %r10
	adox	%r12, %r11

	.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	32(%rbp), %rbx, %r12
	mov	%rdx, %rax
	mov	%r8, %rdx
	adcx	%rbx, %r11
	adox	%r13, %r12

	mulx	128+8(%rsp), %rbx, %rdx
	mov	%rax, %rdx

	mulx	40(%rbp), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	.byte	0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00	# mulx	48(%rbp), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	mulx	56(%rbp), %rax, %r15
	mov	%rbx, %rdx
	adcx	%rax, %r14
	adox	%rsi, %r15		# %rsi is 0
	adcx	%rsi, %r15		# cf=0

	decl	%ecx			# of=0
	jne	.Lreduction_loopx

	ret
.size	__rsaz_512_reducex,.-__rsaz_512_reducex
___
}
{	# __rsaz_512_subtract
	# input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
	# output:
	# clobbers: everything but %rdi, %rsi and %rbp
$code.=<<___;
.type	__rsaz_512_subtract,\@abi-omnipotent
.align	32
__rsaz_512_subtract:
	movq	%r8, ($out)
	movq	%r9, 8($out)
	movq	%r10, 16($out)
	movq	%r11, 24($out)
	movq	%r12, 32($out)
	movq	%r13, 40($out)
	movq	%r14, 48($out)
	movq	%r15, 56($out)

	movq	0($mod), %r8
	movq	8($mod), %r9
	negq	%r8
	notq	%r9
	andq	%rcx, %r8
	movq	16($mod), %r10
	andq	%rcx, %r9
	notq	%r10
	movq	24($mod), %r11
	andq	%rcx, %r10
	notq	%r11
	movq	32($mod), %r12
	andq	%rcx, %r11
	notq	%r12
	movq	40($mod), %r13
	andq	%rcx, %r12
	notq	%r13
	movq	48($mod), %r14
	andq	%rcx, %r13
	notq	%r14
	movq	56($mod), %r15
	andq	%rcx, %r14
	notq	%r15
	andq	%rcx, %r15

	addq	($out), %r8
	adcq	8($out), %r9
	adcq	16($out), %r10
	adcq	24($out), %r11
	adcq	32($out), %r12
	adcq	40($out), %r13
	adcq	48($out), %r14
	adcq	56($out), %r15

	movq	%r8, ($out)
	movq	%r9, 8($out)
	movq	%r10, 16($out)
	movq	%r11, 24($out)
	movq	%r12, 32($out)
	movq	%r13, 40($out)
	movq	%r14, 48($out)
	movq	%r15, 56($out)

	ret
.size	__rsaz_512_subtract,.-__rsaz_512_subtract
___
}
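# __rsaz_512_subtract adds (-$mod & mask) to the result, where the mask in
# %rcx is 0 or all-ones (the borrow produced by the caller's sbbq). The
# two's complement is formed as negq on the least significant word plus
# notq on the rest, which is exact because the modulus is odd, so the "+1"
# never carries past word 0. The same branch-free conditional subtraction
# in C (illustration only):
#
#	#include <stdint.h>
#
#	/* r -= m when mask == ~0; r unchanged when mask == 0 */
#	static void cond_sub_sketch(uint64_t r[8], const uint64_t m[8],
#				    uint64_t mask)
#	{
#		uint64_t neg[8];
#		neg[0] = (0 - m[0]) & mask;	/* == (~m[0] + 1) & mask */
#		for (int i = 1; i < 8; i++)
#			neg[i] = ~m[i] & mask;
#		unsigned __int128 acc = 0;
#		for (int i = 0; i < 8; i++) {
#			acc += (unsigned __int128)r[i] + neg[i];
#			r[i] = (uint64_t)acc;
#			acc >>= 64;
#		}
#	}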
{	# __rsaz_512_mul
	#
	# input: %rsi - ap, %rbp - bp
	# output:
	# clobbers: everything
my ($ap,$bp) = ("%rsi","%rbp");
$code.=<<___;
.type	__rsaz_512_mul,\@abi-omnipotent
.align	32
__rsaz_512_mul:
	leaq	8(%rsp), %rdi

	movq	($ap), %rax
	mulq	%rbx
	movq	%rax, (%rdi)
	movq	8($ap), %rax
	movq	%rdx, %r8

	mulq	%rbx
	addq	%rax, %r8
	movq	16($ap), %rax
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r9
	movq	24($ap), %rax
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r10
	movq	32($ap), %rax
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r11
	movq	40($ap), %rax
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r12
	movq	48($ap), %rax
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r13
	movq	56($ap), %rax
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r14
	movq	($ap), %rax
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8($bp), $bp
	leaq	8(%rdi), %rdi

	movl	\$7, %ecx
	jmp	.Loop_mul

.align	32
.Loop_mul:
	movq	($bp), %rbx
	mulq	%rbx
	addq	%rax, %r8
	movq	8($ap), %rax
	movq	%r8, (%rdi)
	movq	%rdx, %r8
	adcq	\$0, %r8

	mulq	%rbx
	addq	%rax, %r9
	movq	16($ap), %rax
	adcq	\$0, %rdx
	addq	%r9, %r8
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r10
	movq	24($ap), %rax
	adcq	\$0, %rdx
	addq	%r10, %r9
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r11
	movq	32($ap), %rax
	adcq	\$0, %rdx
	addq	%r11, %r10
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r12
	movq	40($ap), %rax
	adcq	\$0, %rdx
	addq	%r12, %r11
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r13
	movq	48($ap), %rax
	adcq	\$0, %rdx
	addq	%r13, %r12
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r14
	movq	56($ap), %rax
	adcq	\$0, %rdx
	addq	%r14, %r13
	movq	%rdx, %r14
	leaq	8($bp), $bp
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r15
	movq	($ap), %rax
	adcq	\$0, %rdx
	addq	%r15, %r14
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8(%rdi), %rdi

	decl	%ecx
	jnz	.Loop_mul

	movq	%r8, (%rdi)
	movq	%r9, 8(%rdi)
	movq	%r10, 16(%rdi)
	movq	%r11, 24(%rdi)
	movq	%r12, 32(%rdi)
	movq	%r13, 40(%rdi)
	movq	%r14, 48(%rdi)
	movq	%r15, 56(%rdi)

	ret
.size	__rsaz_512_mul,.-__rsaz_512_mul
___
}
if ($addx) {
	# __rsaz_512_mulx
	#
	# input: %rsi - ap, %rbp - bp
	# output:
	# clobbers: everything
my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
$code.=<<___;
.type	__rsaz_512_mulx,\@abi-omnipotent
.align	32
__rsaz_512_mulx:
	mulx	($ap), %rbx, %r8	# initial %rdx preloaded by caller
	mov	\$-6, %rcx

	mulx	8($ap), %rax, %r9
	movq	%rbx, 8(%rsp)

	mulx	16($ap), %rbx, %r10
	adc	%rax, %r8

	mulx	24($ap), %rax, %r11
	adc	%rbx, %r9

	mulx	32($ap), %rbx, %r12
	adc	%rax, %r10

	mulx	40($ap), %rax, %r13
	adc	%rbx, %r11

	mulx	48($ap), %rbx, %r14
	adc	%rax, %r12

	mulx	56($ap), %rax, %r15
	mov	8($bp), %rdx
	adc	%rbx, %r13
	adc	%rax, %r14
	adc	\$0, %r15

	xor	$zero, $zero		# cf=0,of=0
	jmp	.Loop_mulx

.align	32
.Loop_mulx:
	movq	%r8, %rbx
	mulx	($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	mulx	8($ap), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	mulx	16($ap), %rax, %r10
	adcx	%rax, %r9
	adox	%r11, %r10

	mulx	24($ap), %rax, %r11
	adcx	%rax, %r10
	adox	%r12, %r11

	.byte	0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00	# mulx	32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	mulx	48($ap), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	mulx	56($ap), %rax, %r15
	movq	64($bp,%rcx,8), %rdx
	movq	%rbx, 8+64-8(%rsp,%rcx,8)
	adcx	%rax, %r14
	adox	$zero, %r15
	adcx	$zero, %r15		# cf=0

	inc	%rcx			# of=0
	jnz	.Loop_mulx

	movq	%r8, %rbx
	mulx	($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	.byte	0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00	# mulx	8($ap), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	.byte	0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00	# mulx	16($ap), %rax, %r10
	adcx	%rax, %r9
	adox	%r11, %r10

	mulx	24($ap), %rax, %r11
	adcx	%rax, %r10
	adox	%r12, %r11

	mulx	32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($ap), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	.byte	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00	# mulx	56($ap), %rax, %r15
	adcx	%rax, %r14
	adox	$zero, %r15
	adcx	$zero, %r15

	mov	%rbx, 8+64-8(%rsp)
	mov	%r8, 8+64(%rsp)
	mov	%r9, 8+64+8(%rsp)
	mov	%r10, 8+64+16(%rsp)
	mov	%r11, 8+64+24(%rsp)
	mov	%r12, 8+64+32(%rsp)
	mov	%r13, 8+64+40(%rsp)
	mov	%r14, 8+64+48(%rsp)
	mov	%r15, 8+64+56(%rsp)

	ret
.size	__rsaz_512_mulx,.-__rsaz_512_mulx
___
}
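# Both multiply subroutines above implement the same 8x8-word schoolbook
# product: the b[0] row is computed first, then .Loop_mul/.Loop_mulx each
# accumulate one further row of a[]*b[i], retiring the lowest word to the
# stack per row. In plain C the structure is (illustration only):
#
#	#include <stdint.h>
#	#include <string.h>
#
#	static void mul512_sketch(uint64_t r[16], const uint64_t a[8],
#				  const uint64_t b[8])
#	{
#		memset(r, 0, 16 * sizeof(uint64_t));
#		for (int i = 0; i < 8; i++) {	/* one row per word of b */
#			uint64_t carry = 0;
#			for (int j = 0; j < 8; j++) {
#				unsigned __int128 acc =
#					(unsigned __int128)a[j] * b[i]
#					+ r[i+j] + carry;
#				r[i+j] = (uint64_t)acc;
#				carry  = (uint64_t)(acc >> 64);
#			}
#			r[i+8] = carry;
#		}
#	}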
{
my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
$code.=<<___;
.globl	rsaz_512_scatter4
.type	rsaz_512_scatter4,\@abi-omnipotent
.align	16
rsaz_512_scatter4:
	leaq	($out,$power,8), $out
	movl	\$8, %r9d
	jmp	.Loop_scatter
.align	16
.Loop_scatter:
	movq	($inp), %rax
	leaq	8($inp), $inp
	movq	%rax, ($out)
	leaq	128($out), $out
	decl	%r9d
	jnz	.Loop_scatter
	ret
.size	rsaz_512_scatter4,.-rsaz_512_scatter4

.globl	rsaz_512_gather4
.type	rsaz_512_gather4,\@abi-omnipotent
.align	16
rsaz_512_gather4:
___
$code.=<<___ if ($win64);
.LSEH_begin_rsaz_512_gather4:
	.byte	0x48,0x81,0xec,0xa8,0x00,0x00,0x00	# sub	$0xa8,%rsp
	.byte	0x0f,0x29,0x34,0x24			# movaps %xmm6,(%rsp)
	.byte	0x0f,0x29,0x7c,0x24,0x10		# movaps %xmm7,0x10(%rsp)
	.byte	0x44,0x0f,0x29,0x44,0x24,0x20		# movaps %xmm8,0x20(%rsp)
	.byte	0x44,0x0f,0x29,0x4c,0x24,0x30		# movaps %xmm9,0x30(%rsp)
	.byte	0x44,0x0f,0x29,0x54,0x24,0x40		# movaps %xmm10,0x40(%rsp)
	.byte	0x44,0x0f,0x29,0x5c,0x24,0x50		# movaps %xmm11,0x50(%rsp)
	.byte	0x44,0x0f,0x29,0x64,0x24,0x60		# movaps %xmm12,0x60(%rsp)
	.byte	0x44,0x0f,0x29,0x6c,0x24,0x70		# movaps %xmm13,0x70(%rsp)
	.byte	0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0	# movaps %xmm14,0x80(%rsp)
	.byte	0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0	# movaps %xmm15,0x90(%rsp)
___
$code.=<<___;
	movd	$power,%xmm8
	movdqa	.Linc+16(%rip),%xmm1	# 00000002000000020000000200000002
	movdqa	.Linc(%rip),%xmm0	# 00000001000000010000000000000000

	pshufd	\$0,%xmm8,%xmm8		# broadcast $power
	movdqa	%xmm1,%xmm7
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..15 to $power
#
for($i=0;$i<4;$i++) {
$code.=<<___;
	paddd	%xmm`$i`,%xmm`$i+1`
	pcmpeqd	%xmm8,%xmm`$i`
	movdqa	%xmm7,%xmm`$i+3`
___
}
for(;$i<7;$i++) {
$code.=<<___;
	paddd	%xmm`$i`,%xmm`$i+1`
	pcmpeqd	%xmm8,%xmm`$i`
___
}
$code.=<<___;
	pcmpeqd	%xmm8,%xmm7
	movl	\$8, %r9d
	jmp	.Loop_gather
.align	16
.Loop_gather:
	movdqa	16*0($inp),%xmm8
	movdqa	16*1($inp),%xmm9
	movdqa	16*2($inp),%xmm10
	movdqa	16*3($inp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	16*4($inp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	16*5($inp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	16*6($inp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	16*7($inp),%xmm15
	leaq	128($inp), $inp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	\$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
	movq	%xmm8,($out)
	leaq	8($out), $out
	decl	%r9d
	jnz	.Loop_gather
___
$code.=<<___ if ($win64);
	movaps	0x00(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	movaps	0x50(%rsp),%xmm11
	movaps	0x60(%rsp),%xmm12
	movaps	0x70(%rsp),%xmm13
	movaps	0x80(%rsp),%xmm14
	movaps	0x90(%rsp),%xmm15
	add	\$0xa8,%rsp
___
$code.=<<___;
	ret
.LSEH_end_rsaz_512_gather4:
.size	rsaz_512_gather4,.-rsaz_512_gather4

.align	64
.Linc:
	.long	0,0, 1,1
	.long	2,2, 2,2
___
}
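# Layout behind rsaz_512_scatter4/rsaz_512_gather4: word j of table entry i
# is stored at byte offset j*128 + i*8, i.e. the 16 candidates for any
# given word share one 128-byte block, all of which the gather loop reads
# on every pass regardless of $power. Addressing sketch in C (illustration
# only; `entry_word_sketch` is a hypothetical helper):
#
#	#include <stdint.h>
#
#	static uint64_t *entry_word_sketch(void *tbl, unsigned i, unsigned j)
#	{
#		return (uint64_t *)((unsigned char *)tbl + j*128 + i*8);
#	}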
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	128+24+48(%rax),%rax

	lea	.Lmul_gather4_epilogue(%rip),%rbx
	cmp	%r10,%rbx
	jne	.Lse_not_in_mul_gather4

	lea	0xb0(%rax),%rax

	lea	-48-0xa8(%rax),%rsi
	lea	512($context),%rdi
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lse_not_in_mul_gather4:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_rsaz_512_sqr
	.rva	.LSEH_end_rsaz_512_sqr
	.rva	.LSEH_info_rsaz_512_sqr

	.rva	.LSEH_begin_rsaz_512_mul
	.rva	.LSEH_end_rsaz_512_mul
	.rva	.LSEH_info_rsaz_512_mul

	.rva	.LSEH_begin_rsaz_512_mul_gather4
	.rva	.LSEH_end_rsaz_512_mul_gather4
	.rva	.LSEH_info_rsaz_512_mul_gather4

	.rva	.LSEH_begin_rsaz_512_mul_scatter4
	.rva	.LSEH_end_rsaz_512_mul_scatter4
	.rva	.LSEH_info_rsaz_512_mul_scatter4

	.rva	.LSEH_begin_rsaz_512_mul_by_one
	.rva	.LSEH_end_rsaz_512_mul_by_one
	.rva	.LSEH_info_rsaz_512_mul_by_one

	.rva	.LSEH_begin_rsaz_512_gather4
	.rva	.LSEH_end_rsaz_512_gather4
	.rva	.LSEH_info_rsaz_512_gather4

.section	.xdata
.align	8
.LSEH_info_rsaz_512_sqr:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lsqr_body,.Lsqr_epilogue			# HandlerData[]
.LSEH_info_rsaz_512_mul:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_body,.Lmul_epilogue			# HandlerData[]
.LSEH_info_rsaz_512_mul_gather4:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_gather4_body,.Lmul_gather4_epilogue	# HandlerData[]
.LSEH_info_rsaz_512_mul_scatter4:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_scatter4_body,.Lmul_scatter4_epilogue	# HandlerData[]
.LSEH_info_rsaz_512_mul_by_one:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_by_one_body,.Lmul_by_one_epilogue		# HandlerData[]
.LSEH_info_rsaz_512_gather4:
	.byte	0x01,0x46,0x16,0x00
	.byte	0x46,0xf8,0x09,0x00	# vmovaps 0x90(rsp),xmm15
	.byte	0x3d,0xe8,0x08,0x00	# vmovaps 0x80(rsp),xmm14
	.byte	0x34,0xd8,0x07,0x00	# vmovaps 0x70(rsp),xmm13
	.byte	0x2e,0xc8,0x06,0x00	# vmovaps 0x60(rsp),xmm12
	.byte	0x28,0xb8,0x05,0x00	# vmovaps 0x50(rsp),xmm11
	.byte	0x22,0xa8,0x04,0x00	# vmovaps 0x40(rsp),xmm10
	.byte	0x1c,0x98,0x03,0x00	# vmovaps 0x30(rsp),xmm9
	.byte	0x16,0x88,0x02,0x00	# vmovaps 0x20(rsp),xmm8
	.byte	0x10,0x78,0x01,0x00	# vmovaps 0x10(rsp),xmm7
	.byte	0x0b,0x68,0x00,0x00	# vmovaps 0x00(rsp),xmm6
	.byte	0x07,0x01,0x15,0x00	# sub rsp,0xa8
___
}
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;