1#!/usr/bin/env perl 2 3############################################################################## 4# # 5# Copyright (c) 2012, Intel Corporation # 6# # 7# All rights reserved. # 8# # 9# Redistribution and use in source and binary forms, with or without # 10# modification, are permitted provided that the following conditions are # 11# met: # 12# # 13# * Redistributions of source code must retain the above copyright # 14# notice, this list of conditions and the following disclaimer. # 15# # 16# * Redistributions in binary form must reproduce the above copyright # 17# notice, this list of conditions and the following disclaimer in the # 18# documentation and/or other materials provided with the # 19# distribution. # 20# # 21# * Neither the name of the Intel Corporation nor the names of its # 22# contributors may be used to endorse or promote products derived from # 23# this software without specific prior written permission. # 24# # 25# # 26# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY # 27# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # 28# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # 29# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR # 30# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # 31# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # 32# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # 33# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # 34# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # 35# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # 36# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
# 37# # 38############################################################################## 39# Developers and authors: # 40# Shay Gueron (1, 2), and Vlad Krasnov (1) # 41# (1) Intel Architecture Group, Microprocessor and Chipset Development, # 42# Israel Development Center, Haifa, Israel # 43# (2) University of Haifa # 44############################################################################## 45# Reference: # 46# [1] S. Gueron, "Efficient Software Implementations of Modular # 47# Exponentiation", http://eprint.iacr.org/2011/239 # 48# [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring". # 49# IEEE Proceedings of 9th International Conference on Information # 50# Technology: New Generations (ITNG 2012), 821-823 (2012). # 51# [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation# 52# Journal of Cryptographic Engineering 2:31-43 (2012). # 53# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis # 54# resistant 512-bit and 1024-bit modular exponentiation for optimizing # 55# RSA1024 and RSA2048 on x86_64 platforms", # 56# http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest# 57############################################################################## 58 59# While the original submission covers 512- and 1024-bit exponentiation, 60# this module is limited to the 512-bit version only (and as such 61# accelerates RSA1024 sign). This is because the improvement for longer 62# keys is not high enough to justify the effort; the highest measured gain 63# was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, which was still 64# upcoming at the time of this writing.] Nor does this module implement a 65# "monolithic" complete exponentiation jumbo-subroutine; instead it adheres 66# to a more modular mixture of C and assembly. It is also optimized 67# for processors other than the Intel Core family (see the table below for 68# improvement coefficients). 
69# <appro@openssl.org> 70# 71# RSA1024 sign/sec this/original |this/rsax(*) this/fips(*) 72# ----------------+--------------------------- 73# Opteron +13% |+5% +20% 74# Bulldozer -0% |-1% +10% 75# P4 +11% |+7% +8% 76# Westmere +5% |+14% +17% 77# Sandy Bridge +2% |+12% +29% 78# Ivy Bridge +1% |+11% +35% 79# Haswell(**) -0% |+12% +39% 80# Atom +13% |+11% +4% 81# VIA Nano +70% |+9% +25% 82# 83# (*) rsax engine and fips numbers are presented for reference 84# purposes; 85# (**) MULX was attempted, but found to give only marginal improvement; 86 87$flavour = shift; 88$output = shift; 89if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 90 91$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 92 93$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 94( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 95( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 96die "can't locate x86_64-xlate.pl"; 97 98open OUT,"| \"$^X\" $xlate $flavour $output"; 99*STDOUT=*OUT; 100 101if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` 102 =~ /GNU assembler version ([2-9]\.[0-9]+)/) { 103 $addx = ($1>=2.23); 104} 105 106if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && 107 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { 108 $addx = ($1>=2.10); 109} 110 111if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && 112 `ml64 2>&1` =~ /Version ([0-9]+)\./) { 113 $addx = ($1>=12); 114} 115 116if (!$addx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) { 117 my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 118 $addx = ($ver>=3.03); 119} 120 121($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API 122{ 123my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d"); 124 125$code.=<<___; 126.text 127 128.extern OPENSSL_ia32cap_P 129 130.globl rsaz_512_sqr 131.type rsaz_512_sqr,\@function,5 132.align 32 133rsaz_512_sqr: # 25-29% faster than rsaz_512_mul 134 
push %rbx 135 push %rbp 136 push %r12 137 push %r13 138 push %r14 139 push %r15 140 141 subq \$128+24, %rsp 142.Lsqr_body: 143 movq $mod, %rbp # common argument 144 movq ($inp), %rdx 145 movq 8($inp), %rax 146 movq $n0, 128(%rsp) 147___ 148$code.=<<___ if ($addx); 149 movl \$0x80100,%r11d 150 andl OPENSSL_ia32cap_P+8(%rip),%r11d 151 cmpl \$0x80100,%r11d # check for MULX and ADO/CX 152 je .Loop_sqrx 153___ 154$code.=<<___; 155 jmp .Loop_sqr 156 157.align 32 158.Loop_sqr: 159 movl $times,128+8(%rsp) 160#first iteration 161 movq %rdx, %rbx 162 mulq %rdx 163 movq %rax, %r8 164 movq 16($inp), %rax 165 movq %rdx, %r9 166 167 mulq %rbx 168 addq %rax, %r9 169 movq 24($inp), %rax 170 movq %rdx, %r10 171 adcq \$0, %r10 172 173 mulq %rbx 174 addq %rax, %r10 175 movq 32($inp), %rax 176 movq %rdx, %r11 177 adcq \$0, %r11 178 179 mulq %rbx 180 addq %rax, %r11 181 movq 40($inp), %rax 182 movq %rdx, %r12 183 adcq \$0, %r12 184 185 mulq %rbx 186 addq %rax, %r12 187 movq 48($inp), %rax 188 movq %rdx, %r13 189 adcq \$0, %r13 190 191 mulq %rbx 192 addq %rax, %r13 193 movq 56($inp), %rax 194 movq %rdx, %r14 195 adcq \$0, %r14 196 197 mulq %rbx 198 addq %rax, %r14 199 movq %rbx, %rax 200 movq %rdx, %r15 201 adcq \$0, %r15 202 203 addq %r8, %r8 #shlq \$1, %r8 204 movq %r9, %rcx 205 adcq %r9, %r9 #shld \$1, %r8, %r9 206 207 mulq %rax 208 movq %rax, (%rsp) 209 addq %rdx, %r8 210 adcq \$0, %r9 211 212 movq %r8, 8(%rsp) 213 shrq \$63, %rcx 214 215#second iteration 216 movq 8($inp), %r8 217 movq 16($inp), %rax 218 mulq %r8 219 addq %rax, %r10 220 movq 24($inp), %rax 221 movq %rdx, %rbx 222 adcq \$0, %rbx 223 224 mulq %r8 225 addq %rax, %r11 226 movq 32($inp), %rax 227 adcq \$0, %rdx 228 addq %rbx, %r11 229 movq %rdx, %rbx 230 adcq \$0, %rbx 231 232 mulq %r8 233 addq %rax, %r12 234 movq 40($inp), %rax 235 adcq \$0, %rdx 236 addq %rbx, %r12 237 movq %rdx, %rbx 238 adcq \$0, %rbx 239 240 mulq %r8 241 addq %rax, %r13 242 movq 48($inp), %rax 243 adcq \$0, %rdx 244 addq %rbx, %r13 245 movq %rdx, 
%rbx 246 adcq \$0, %rbx 247 248 mulq %r8 249 addq %rax, %r14 250 movq 56($inp), %rax 251 adcq \$0, %rdx 252 addq %rbx, %r14 253 movq %rdx, %rbx 254 adcq \$0, %rbx 255 256 mulq %r8 257 addq %rax, %r15 258 movq %r8, %rax 259 adcq \$0, %rdx 260 addq %rbx, %r15 261 movq %rdx, %r8 262 movq %r10, %rdx 263 adcq \$0, %r8 264 265 add %rdx, %rdx 266 lea (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10 267 movq %r11, %rbx 268 adcq %r11, %r11 #shld \$1, %r10, %r11 269 270 mulq %rax 271 addq %rax, %r9 272 adcq %rdx, %r10 273 adcq \$0, %r11 274 275 movq %r9, 16(%rsp) 276 movq %r10, 24(%rsp) 277 shrq \$63, %rbx 278 279#third iteration 280 movq 16($inp), %r9 281 movq 24($inp), %rax 282 mulq %r9 283 addq %rax, %r12 284 movq 32($inp), %rax 285 movq %rdx, %rcx 286 adcq \$0, %rcx 287 288 mulq %r9 289 addq %rax, %r13 290 movq 40($inp), %rax 291 adcq \$0, %rdx 292 addq %rcx, %r13 293 movq %rdx, %rcx 294 adcq \$0, %rcx 295 296 mulq %r9 297 addq %rax, %r14 298 movq 48($inp), %rax 299 adcq \$0, %rdx 300 addq %rcx, %r14 301 movq %rdx, %rcx 302 adcq \$0, %rcx 303 304 mulq %r9 305 movq %r12, %r10 306 lea (%rbx,%r12,2), %r12 #shld \$1, %rbx, %r12 307 addq %rax, %r15 308 movq 56($inp), %rax 309 adcq \$0, %rdx 310 addq %rcx, %r15 311 movq %rdx, %rcx 312 adcq \$0, %rcx 313 314 mulq %r9 315 shrq \$63, %r10 316 addq %rax, %r8 317 movq %r9, %rax 318 adcq \$0, %rdx 319 addq %rcx, %r8 320 movq %rdx, %r9 321 adcq \$0, %r9 322 323 movq %r13, %rcx 324 leaq (%r10,%r13,2), %r13 #shld \$1, %r12, %r13 325 326 mulq %rax 327 addq %rax, %r11 328 adcq %rdx, %r12 329 adcq \$0, %r13 330 331 movq %r11, 32(%rsp) 332 movq %r12, 40(%rsp) 333 shrq \$63, %rcx 334 335#fourth iteration 336 movq 24($inp), %r10 337 movq 32($inp), %rax 338 mulq %r10 339 addq %rax, %r14 340 movq 40($inp), %rax 341 movq %rdx, %rbx 342 adcq \$0, %rbx 343 344 mulq %r10 345 addq %rax, %r15 346 movq 48($inp), %rax 347 adcq \$0, %rdx 348 addq %rbx, %r15 349 movq %rdx, %rbx 350 adcq \$0, %rbx 351 352 mulq %r10 353 movq %r14, %r12 354 leaq (%rcx,%r14,2), 
%r14 #shld \$1, %rcx, %r14 355 addq %rax, %r8 356 movq 56($inp), %rax 357 adcq \$0, %rdx 358 addq %rbx, %r8 359 movq %rdx, %rbx 360 adcq \$0, %rbx 361 362 mulq %r10 363 shrq \$63, %r12 364 addq %rax, %r9 365 movq %r10, %rax 366 adcq \$0, %rdx 367 addq %rbx, %r9 368 movq %rdx, %r10 369 adcq \$0, %r10 370 371 movq %r15, %rbx 372 leaq (%r12,%r15,2),%r15 #shld \$1, %r14, %r15 373 374 mulq %rax 375 addq %rax, %r13 376 adcq %rdx, %r14 377 adcq \$0, %r15 378 379 movq %r13, 48(%rsp) 380 movq %r14, 56(%rsp) 381 shrq \$63, %rbx 382 383#fifth iteration 384 movq 32($inp), %r11 385 movq 40($inp), %rax 386 mulq %r11 387 addq %rax, %r8 388 movq 48($inp), %rax 389 movq %rdx, %rcx 390 adcq \$0, %rcx 391 392 mulq %r11 393 addq %rax, %r9 394 movq 56($inp), %rax 395 adcq \$0, %rdx 396 movq %r8, %r12 397 leaq (%rbx,%r8,2), %r8 #shld \$1, %rbx, %r8 398 addq %rcx, %r9 399 movq %rdx, %rcx 400 adcq \$0, %rcx 401 402 mulq %r11 403 shrq \$63, %r12 404 addq %rax, %r10 405 movq %r11, %rax 406 adcq \$0, %rdx 407 addq %rcx, %r10 408 movq %rdx, %r11 409 adcq \$0, %r11 410 411 movq %r9, %rcx 412 leaq (%r12,%r9,2), %r9 #shld \$1, %r8, %r9 413 414 mulq %rax 415 addq %rax, %r15 416 adcq %rdx, %r8 417 adcq \$0, %r9 418 419 movq %r15, 64(%rsp) 420 movq %r8, 72(%rsp) 421 shrq \$63, %rcx 422 423#sixth iteration 424 movq 40($inp), %r12 425 movq 48($inp), %rax 426 mulq %r12 427 addq %rax, %r10 428 movq 56($inp), %rax 429 movq %rdx, %rbx 430 adcq \$0, %rbx 431 432 mulq %r12 433 addq %rax, %r11 434 movq %r12, %rax 435 movq %r10, %r15 436 leaq (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10 437 adcq \$0, %rdx 438 shrq \$63, %r15 439 addq %rbx, %r11 440 movq %rdx, %r12 441 adcq \$0, %r12 442 443 movq %r11, %rbx 444 leaq (%r15,%r11,2), %r11 #shld \$1, %r10, %r11 445 446 mulq %rax 447 addq %rax, %r9 448 adcq %rdx, %r10 449 adcq \$0, %r11 450 451 movq %r9, 80(%rsp) 452 movq %r10, 88(%rsp) 453 454#seventh iteration 455 movq 48($inp), %r13 456 movq 56($inp), %rax 457 mulq %r13 458 addq %rax, %r12 459 movq %r13, %rax 460 
movq %rdx, %r13 461 adcq \$0, %r13 462 463 xorq %r14, %r14 464 shlq \$1, %rbx 465 adcq %r12, %r12 #shld \$1, %rbx, %r12 466 adcq %r13, %r13 #shld \$1, %r12, %r13 467 adcq %r14, %r14 #shld \$1, %r13, %r14 468 469 mulq %rax 470 addq %rax, %r11 471 adcq %rdx, %r12 472 adcq \$0, %r13 473 474 movq %r11, 96(%rsp) 475 movq %r12, 104(%rsp) 476 477#eighth iteration 478 movq 56($inp), %rax 479 mulq %rax 480 addq %rax, %r13 481 adcq \$0, %rdx 482 483 addq %rdx, %r14 484 485 movq %r13, 112(%rsp) 486 movq %r14, 120(%rsp) 487 488 movq (%rsp), %r8 489 movq 8(%rsp), %r9 490 movq 16(%rsp), %r10 491 movq 24(%rsp), %r11 492 movq 32(%rsp), %r12 493 movq 40(%rsp), %r13 494 movq 48(%rsp), %r14 495 movq 56(%rsp), %r15 496 497 call __rsaz_512_reduce 498 499 addq 64(%rsp), %r8 500 adcq 72(%rsp), %r9 501 adcq 80(%rsp), %r10 502 adcq 88(%rsp), %r11 503 adcq 96(%rsp), %r12 504 adcq 104(%rsp), %r13 505 adcq 112(%rsp), %r14 506 adcq 120(%rsp), %r15 507 sbbq %rcx, %rcx 508 509 call __rsaz_512_subtract 510 511 movq %r8, %rdx 512 movq %r9, %rax 513 movl 128+8(%rsp), $times 514 movq $out, $inp 515 516 decl $times 517 jnz .Loop_sqr 518___ 519if ($addx) { 520$code.=<<___; 521 jmp .Lsqr_tail 522 523.align 32 524.Loop_sqrx: 525 movl $times,128+8(%rsp) 526 movq $out, %xmm0 # off-load 527 movq %rbp, %xmm1 # off-load 528#first iteration 529 mulx %rax, %r8, %r9 530 531 mulx 16($inp), %rcx, %r10 532 xor %rbp, %rbp # cf=0, of=0 533 534 mulx 24($inp), %rax, %r11 535 adcx %rcx, %r9 536 537 mulx 32($inp), %rcx, %r12 538 adcx %rax, %r10 539 540 mulx 40($inp), %rax, %r13 541 adcx %rcx, %r11 542 543 .byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($inp), %rcx, %r14 544 adcx %rax, %r12 545 adcx %rcx, %r13 546 547 .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r15 548 adcx %rax, %r14 549 adcx %rbp, %r15 # %rbp is 0 550 551 mov %r9, %rcx 552 shld \$1, %r8, %r9 553 shl \$1, %r8 554 555 xor %ebp, %ebp 556 mulx %rdx, %rax, %rdx 557 adcx %rdx, %r8 558 mov 8($inp), %rdx 559 adcx 
%rbp, %r9 560 561 mov %rax, (%rsp) 562 mov %r8, 8(%rsp) 563 564#second iteration 565 mulx 16($inp), %rax, %rbx 566 adox %rax, %r10 567 adcx %rbx, %r11 568 569 .byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r8 570 adox $out, %r11 571 adcx %r8, %r12 572 573 mulx 32($inp), %rax, %rbx 574 adox %rax, %r12 575 adcx %rbx, %r13 576 577 mulx 40($inp), $out, %r8 578 adox $out, %r13 579 adcx %r8, %r14 580 581 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx 582 adox %rax, %r14 583 adcx %rbx, %r15 584 585 .byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8 586 adox $out, %r15 587 adcx %rbp, %r8 588 adox %rbp, %r8 589 590 mov %r11, %rbx 591 shld \$1, %r10, %r11 592 shld \$1, %rcx, %r10 593 594 xor %ebp,%ebp 595 mulx %rdx, %rax, %rcx 596 mov 16($inp), %rdx 597 adcx %rax, %r9 598 adcx %rcx, %r10 599 adcx %rbp, %r11 600 601 mov %r9, 16(%rsp) 602 .byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp) 603 604#third iteration 605 .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r9 606 adox $out, %r12 607 adcx %r9, %r13 608 609 mulx 32($inp), %rax, %rcx 610 adox %rax, %r13 611 adcx %rcx, %r14 612 613 mulx 40($inp), $out, %r9 614 adox $out, %r14 615 adcx %r9, %r15 616 617 .byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx 618 adox %rax, %r15 619 adcx %rcx, %r8 620 621 .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r9 622 adox $out, %r8 623 adcx %rbp, %r9 624 adox %rbp, %r9 625 626 mov %r13, %rcx 627 shld \$1, %r12, %r13 628 shld \$1, %rbx, %r12 629 630 xor %ebp, %ebp 631 mulx %rdx, %rax, %rdx 632 adcx %rax, %r11 633 adcx %rdx, %r12 634 mov 24($inp), %rdx 635 adcx %rbp, %r13 636 637 mov %r11, 32(%rsp) 638 .byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 # mov %r12, 40(%rsp) 639 640#fourth iteration 641 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 32($inp), %rax, %rbx 642 adox %rax, %r14 643 adcx %rbx, 
%r15 644 645 mulx 40($inp), $out, %r10 646 adox $out, %r15 647 adcx %r10, %r8 648 649 mulx 48($inp), %rax, %rbx 650 adox %rax, %r8 651 adcx %rbx, %r9 652 653 mulx 56($inp), $out, %r10 654 adox $out, %r9 655 adcx %rbp, %r10 656 adox %rbp, %r10 657 658 .byte 0x66 659 mov %r15, %rbx 660 shld \$1, %r14, %r15 661 shld \$1, %rcx, %r14 662 663 xor %ebp, %ebp 664 mulx %rdx, %rax, %rdx 665 adcx %rax, %r13 666 adcx %rdx, %r14 667 mov 32($inp), %rdx 668 adcx %rbp, %r15 669 670 mov %r13, 48(%rsp) 671 mov %r14, 56(%rsp) 672 673#fifth iteration 674 .byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r11 675 adox $out, %r8 676 adcx %r11, %r9 677 678 mulx 48($inp), %rax, %rcx 679 adox %rax, %r9 680 adcx %rcx, %r10 681 682 mulx 56($inp), $out, %r11 683 adox $out, %r10 684 adcx %rbp, %r11 685 adox %rbp, %r11 686 687 mov %r9, %rcx 688 shld \$1, %r8, %r9 689 shld \$1, %rbx, %r8 690 691 xor %ebp, %ebp 692 mulx %rdx, %rax, %rdx 693 adcx %rax, %r15 694 adcx %rdx, %r8 695 mov 40($inp), %rdx 696 adcx %rbp, %r9 697 698 mov %r15, 64(%rsp) 699 mov %r8, 72(%rsp) 700 701#sixth iteration 702 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx 703 adox %rax, %r10 704 adcx %rbx, %r11 705 706 .byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12 707 adox $out, %r11 708 adcx %rbp, %r12 709 adox %rbp, %r12 710 711 mov %r11, %rbx 712 shld \$1, %r10, %r11 713 shld \$1, %rcx, %r10 714 715 xor %ebp, %ebp 716 mulx %rdx, %rax, %rdx 717 adcx %rax, %r9 718 adcx %rdx, %r10 719 mov 48($inp), %rdx 720 adcx %rbp, %r11 721 722 mov %r9, 80(%rsp) 723 mov %r10, 88(%rsp) 724 725#seventh iteration 726 .byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13 727 adox %rax, %r12 728 adox %rbp, %r13 729 730 xor %r14, %r14 731 shld \$1, %r13, %r14 732 shld \$1, %r12, %r13 733 shld \$1, %rbx, %r12 734 735 xor %ebp, %ebp 736 mulx %rdx, %rax, %rdx 737 adcx %rax, %r11 738 adcx %rdx, %r12 739 mov 56($inp), %rdx 740 adcx %rbp, %r13 
741 742 .byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp) 743 .byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp) 744 745#eighth iteration 746 mulx %rdx, %rax, %rdx 747 adox %rax, %r13 748 adox %rbp, %rdx 749 750 .byte 0x66 751 add %rdx, %r14 752 753 movq %r13, 112(%rsp) 754 movq %r14, 120(%rsp) 755 movq %xmm0, $out 756 movq %xmm1, %rbp 757 758 movq 128(%rsp), %rdx # pull $n0 759 movq (%rsp), %r8 760 movq 8(%rsp), %r9 761 movq 16(%rsp), %r10 762 movq 24(%rsp), %r11 763 movq 32(%rsp), %r12 764 movq 40(%rsp), %r13 765 movq 48(%rsp), %r14 766 movq 56(%rsp), %r15 767 768 call __rsaz_512_reducex 769 770 addq 64(%rsp), %r8 771 adcq 72(%rsp), %r9 772 adcq 80(%rsp), %r10 773 adcq 88(%rsp), %r11 774 adcq 96(%rsp), %r12 775 adcq 104(%rsp), %r13 776 adcq 112(%rsp), %r14 777 adcq 120(%rsp), %r15 778 sbbq %rcx, %rcx 779 780 call __rsaz_512_subtract 781 782 movq %r8, %rdx 783 movq %r9, %rax 784 movl 128+8(%rsp), $times 785 movq $out, $inp 786 787 decl $times 788 jnz .Loop_sqrx 789 790.Lsqr_tail: 791___ 792} 793$code.=<<___; 794 795 leaq 128+24+48(%rsp), %rax 796 movq -48(%rax), %r15 797 movq -40(%rax), %r14 798 movq -32(%rax), %r13 799 movq -24(%rax), %r12 800 movq -16(%rax), %rbp 801 movq -8(%rax), %rbx 802 leaq (%rax), %rsp 803.Lsqr_epilogue: 804 ret 805.size rsaz_512_sqr,.-rsaz_512_sqr 806___ 807} 808{ 809my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); 810$code.=<<___; 811.globl rsaz_512_mul 812.type rsaz_512_mul,\@function,5 813.align 32 814rsaz_512_mul: 815 push %rbx 816 push %rbp 817 push %r12 818 push %r13 819 push %r14 820 push %r15 821 822 subq \$128+24, %rsp 823.Lmul_body: 824 movq $out, %xmm0 # off-load arguments 825 movq $mod, %xmm1 826 movq $n0, 128(%rsp) 827___ 828$code.=<<___ if ($addx); 829 movl \$0x80100,%r11d 830 andl OPENSSL_ia32cap_P+8(%rip),%r11d 831 cmpl \$0x80100,%r11d # check for MULX and ADO/CX 832 je .Lmulx 833___ 834$code.=<<___; 835 movq ($bp), %rbx # pass b[0] 836 movq $bp, %rbp # pass argument 837 
call __rsaz_512_mul 838 839 movq %xmm0, $out 840 movq %xmm1, %rbp 841 842 movq (%rsp), %r8 843 movq 8(%rsp), %r9 844 movq 16(%rsp), %r10 845 movq 24(%rsp), %r11 846 movq 32(%rsp), %r12 847 movq 40(%rsp), %r13 848 movq 48(%rsp), %r14 849 movq 56(%rsp), %r15 850 851 call __rsaz_512_reduce 852___ 853$code.=<<___ if ($addx); 854 jmp .Lmul_tail 855 856.align 32 857.Lmulx: 858 movq $bp, %rbp # pass argument 859 movq ($bp), %rdx # pass b[0] 860 call __rsaz_512_mulx 861 862 movq %xmm0, $out 863 movq %xmm1, %rbp 864 865 movq 128(%rsp), %rdx # pull $n0 866 movq (%rsp), %r8 867 movq 8(%rsp), %r9 868 movq 16(%rsp), %r10 869 movq 24(%rsp), %r11 870 movq 32(%rsp), %r12 871 movq 40(%rsp), %r13 872 movq 48(%rsp), %r14 873 movq 56(%rsp), %r15 874 875 call __rsaz_512_reducex 876.Lmul_tail: 877___ 878$code.=<<___; 879 addq 64(%rsp), %r8 880 adcq 72(%rsp), %r9 881 adcq 80(%rsp), %r10 882 adcq 88(%rsp), %r11 883 adcq 96(%rsp), %r12 884 adcq 104(%rsp), %r13 885 adcq 112(%rsp), %r14 886 adcq 120(%rsp), %r15 887 sbbq %rcx, %rcx 888 889 call __rsaz_512_subtract 890 891 leaq 128+24+48(%rsp), %rax 892 movq -48(%rax), %r15 893 movq -40(%rax), %r14 894 movq -32(%rax), %r13 895 movq -24(%rax), %r12 896 movq -16(%rax), %rbp 897 movq -8(%rax), %rbx 898 leaq (%rax), %rsp 899.Lmul_epilogue: 900 ret 901.size rsaz_512_mul,.-rsaz_512_mul 902___ 903} 904{ 905my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); 906$code.=<<___; 907.globl rsaz_512_mul_gather4 908.type rsaz_512_mul_gather4,\@function,6 909.align 32 910rsaz_512_mul_gather4: 911 push %rbx 912 push %rbp 913 push %r12 914 push %r13 915 push %r14 916 push %r15 917 918 mov $pwr, $pwr 919 subq \$128+24, %rsp 920.Lmul_gather4_body: 921___ 922$code.=<<___ if ($addx); 923 movl \$0x80100,%r11d 924 andl OPENSSL_ia32cap_P+8(%rip),%r11d 925 cmpl \$0x80100,%r11d # check for MULX and ADO/CX 926 je .Lmulx_gather 927___ 928$code.=<<___; 929 movl 64($bp,$pwr,4), %eax 930 movq $out, %xmm0 # off-load arguments 931 movl ($bp,$pwr,4), 
%ebx 932 movq $mod, %xmm1 933 movq $n0, 128(%rsp) 934 935 shlq \$32, %rax 936 or %rax, %rbx 937 movq ($ap), %rax 938 movq 8($ap), %rcx 939 leaq 128($bp,$pwr,4), %rbp 940 mulq %rbx # 0 iteration 941 movq %rax, (%rsp) 942 movq %rcx, %rax 943 movq %rdx, %r8 944 945 mulq %rbx 946 movd (%rbp), %xmm4 947 addq %rax, %r8 948 movq 16($ap), %rax 949 movq %rdx, %r9 950 adcq \$0, %r9 951 952 mulq %rbx 953 movd 64(%rbp), %xmm5 954 addq %rax, %r9 955 movq 24($ap), %rax 956 movq %rdx, %r10 957 adcq \$0, %r10 958 959 mulq %rbx 960 pslldq \$4, %xmm5 961 addq %rax, %r10 962 movq 32($ap), %rax 963 movq %rdx, %r11 964 adcq \$0, %r11 965 966 mulq %rbx 967 por %xmm5, %xmm4 968 addq %rax, %r11 969 movq 40($ap), %rax 970 movq %rdx, %r12 971 adcq \$0, %r12 972 973 mulq %rbx 974 addq %rax, %r12 975 movq 48($ap), %rax 976 movq %rdx, %r13 977 adcq \$0, %r13 978 979 mulq %rbx 980 leaq 128(%rbp), %rbp 981 addq %rax, %r13 982 movq 56($ap), %rax 983 movq %rdx, %r14 984 adcq \$0, %r14 985 986 mulq %rbx 987 movq %xmm4, %rbx 988 addq %rax, %r14 989 movq ($ap), %rax 990 movq %rdx, %r15 991 adcq \$0, %r15 992 993 leaq 8(%rsp), %rdi 994 movl \$7, %ecx 995 jmp .Loop_mul_gather 996 997.align 32 998.Loop_mul_gather: 999 mulq %rbx 1000 addq %rax, %r8 1001 movq 8($ap), %rax 1002 movq %r8, (%rdi) 1003 movq %rdx, %r8 1004 adcq \$0, %r8 1005 1006 mulq %rbx 1007 movd (%rbp), %xmm4 1008 addq %rax, %r9 1009 movq 16($ap), %rax 1010 adcq \$0, %rdx 1011 addq %r9, %r8 1012 movq %rdx, %r9 1013 adcq \$0, %r9 1014 1015 mulq %rbx 1016 movd 64(%rbp), %xmm5 1017 addq %rax, %r10 1018 movq 24($ap), %rax 1019 adcq \$0, %rdx 1020 addq %r10, %r9 1021 movq %rdx, %r10 1022 adcq \$0, %r10 1023 1024 mulq %rbx 1025 pslldq \$4, %xmm5 1026 addq %rax, %r11 1027 movq 32($ap), %rax 1028 adcq \$0, %rdx 1029 addq %r11, %r10 1030 movq %rdx, %r11 1031 adcq \$0, %r11 1032 1033 mulq %rbx 1034 por %xmm5, %xmm4 1035 addq %rax, %r12 1036 movq 40($ap), %rax 1037 adcq \$0, %rdx 1038 addq %r12, %r11 1039 movq %rdx, %r12 1040 adcq \$0, %r12 1041 1042 
mulq %rbx 1043 addq %rax, %r13 1044 movq 48($ap), %rax 1045 adcq \$0, %rdx 1046 addq %r13, %r12 1047 movq %rdx, %r13 1048 adcq \$0, %r13 1049 1050 mulq %rbx 1051 addq %rax, %r14 1052 movq 56($ap), %rax 1053 adcq \$0, %rdx 1054 addq %r14, %r13 1055 movq %rdx, %r14 1056 adcq \$0, %r14 1057 1058 mulq %rbx 1059 movq %xmm4, %rbx 1060 addq %rax, %r15 1061 movq ($ap), %rax 1062 adcq \$0, %rdx 1063 addq %r15, %r14 1064 movq %rdx, %r15 1065 adcq \$0, %r15 1066 1067 leaq 128(%rbp), %rbp 1068 leaq 8(%rdi), %rdi 1069 1070 decl %ecx 1071 jnz .Loop_mul_gather 1072 1073 movq %r8, (%rdi) 1074 movq %r9, 8(%rdi) 1075 movq %r10, 16(%rdi) 1076 movq %r11, 24(%rdi) 1077 movq %r12, 32(%rdi) 1078 movq %r13, 40(%rdi) 1079 movq %r14, 48(%rdi) 1080 movq %r15, 56(%rdi) 1081 1082 movq %xmm0, $out 1083 movq %xmm1, %rbp 1084 1085 movq (%rsp), %r8 1086 movq 8(%rsp), %r9 1087 movq 16(%rsp), %r10 1088 movq 24(%rsp), %r11 1089 movq 32(%rsp), %r12 1090 movq 40(%rsp), %r13 1091 movq 48(%rsp), %r14 1092 movq 56(%rsp), %r15 1093 1094 call __rsaz_512_reduce 1095___ 1096$code.=<<___ if ($addx); 1097 jmp .Lmul_gather_tail 1098 1099.align 32 1100.Lmulx_gather: 1101 mov 64($bp,$pwr,4), %eax 1102 movq $out, %xmm0 # off-load arguments 1103 lea 128($bp,$pwr,4), %rbp 1104 mov ($bp,$pwr,4), %edx 1105 movq $mod, %xmm1 1106 mov $n0, 128(%rsp) 1107 1108 shl \$32, %rax 1109 or %rax, %rdx 1110 mulx ($ap), %rbx, %r8 # 0 iteration 1111 mov %rbx, (%rsp) 1112 xor %edi, %edi # cf=0, of=0 1113 1114 mulx 8($ap), %rax, %r9 1115 movd (%rbp), %xmm4 1116 1117 mulx 16($ap), %rbx, %r10 1118 movd 64(%rbp), %xmm5 1119 adcx %rax, %r8 1120 1121 mulx 24($ap), %rax, %r11 1122 pslldq \$4, %xmm5 1123 adcx %rbx, %r9 1124 1125 mulx 32($ap), %rbx, %r12 1126 por %xmm5, %xmm4 1127 adcx %rax, %r10 1128 1129 mulx 40($ap), %rax, %r13 1130 adcx %rbx, %r11 1131 1132 mulx 48($ap), %rbx, %r14 1133 lea 128(%rbp), %rbp 1134 adcx %rax, %r12 1135 1136 mulx 56($ap), %rax, %r15 1137 movq %xmm4, %rdx 1138 adcx %rbx, %r13 1139 adcx %rax, %r14 1140 mov %r8, 
%rbx 1141 adcx %rdi, %r15 # %rdi is 0 1142 1143 mov \$-7, %rcx 1144 jmp .Loop_mulx_gather 1145 1146.align 32 1147.Loop_mulx_gather: 1148 mulx ($ap), %rax, %r8 1149 adcx %rax, %rbx 1150 adox %r9, %r8 1151 1152 mulx 8($ap), %rax, %r9 1153 .byte 0x66,0x0f,0x6e,0xa5,0x00,0x00,0x00,0x00 # movd (%rbp), %xmm4 1154 adcx %rax, %r8 1155 adox %r10, %r9 1156 1157 mulx 16($ap), %rax, %r10 1158 movd 64(%rbp), %xmm5 1159 lea 128(%rbp), %rbp 1160 adcx %rax, %r9 1161 adox %r11, %r10 1162 1163 .byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11 1164 pslldq \$4, %xmm5 1165 por %xmm5, %xmm4 1166 adcx %rax, %r10 1167 adox %r12, %r11 1168 1169 mulx 32($ap), %rax, %r12 1170 adcx %rax, %r11 1171 adox %r13, %r12 1172 1173 mulx 40($ap), %rax, %r13 1174 adcx %rax, %r12 1175 adox %r14, %r13 1176 1177 .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14 1178 adcx %rax, %r13 1179 adox %r15, %r14 1180 1181 mulx 56($ap), %rax, %r15 1182 movq %xmm4, %rdx 1183 mov %rbx, 64(%rsp,%rcx,8) 1184 adcx %rax, %r14 1185 adox %rdi, %r15 1186 mov %r8, %rbx 1187 adcx %rdi, %r15 # cf=0 1188 1189 inc %rcx # of=0 1190 jnz .Loop_mulx_gather 1191 1192 mov %r8, 64(%rsp) 1193 mov %r9, 64+8(%rsp) 1194 mov %r10, 64+16(%rsp) 1195 mov %r11, 64+24(%rsp) 1196 mov %r12, 64+32(%rsp) 1197 mov %r13, 64+40(%rsp) 1198 mov %r14, 64+48(%rsp) 1199 mov %r15, 64+56(%rsp) 1200 1201 movq %xmm0, $out 1202 movq %xmm1, %rbp 1203 1204 mov 128(%rsp), %rdx # pull $n0 1205 mov (%rsp), %r8 1206 mov 8(%rsp), %r9 1207 mov 16(%rsp), %r10 1208 mov 24(%rsp), %r11 1209 mov 32(%rsp), %r12 1210 mov 40(%rsp), %r13 1211 mov 48(%rsp), %r14 1212 mov 56(%rsp), %r15 1213 1214 call __rsaz_512_reducex 1215 1216.Lmul_gather_tail: 1217___ 1218$code.=<<___; 1219 addq 64(%rsp), %r8 1220 adcq 72(%rsp), %r9 1221 adcq 80(%rsp), %r10 1222 adcq 88(%rsp), %r11 1223 adcq 96(%rsp), %r12 1224 adcq 104(%rsp), %r13 1225 adcq 112(%rsp), %r14 1226 adcq 120(%rsp), %r15 1227 sbbq %rcx, %rcx 1228 1229 call __rsaz_512_subtract 
1230 1231 leaq 128+24+48(%rsp), %rax 1232 movq -48(%rax), %r15 1233 movq -40(%rax), %r14 1234 movq -32(%rax), %r13 1235 movq -24(%rax), %r12 1236 movq -16(%rax), %rbp 1237 movq -8(%rax), %rbx 1238 leaq (%rax), %rsp 1239.Lmul_gather4_epilogue: 1240 ret 1241.size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4 1242___ 1243} 1244{ 1245my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); 1246$code.=<<___; 1247.globl rsaz_512_mul_scatter4 1248.type rsaz_512_mul_scatter4,\@function,6 1249.align 32 1250rsaz_512_mul_scatter4: 1251 push %rbx 1252 push %rbp 1253 push %r12 1254 push %r13 1255 push %r14 1256 push %r15 1257 1258 mov $pwr, $pwr 1259 subq \$128+24, %rsp 1260.Lmul_scatter4_body: 1261 leaq ($tbl,$pwr,4), $tbl 1262 movq $out, %xmm0 # off-load arguments 1263 movq $mod, %xmm1 1264 movq $tbl, %xmm2 1265 movq $n0, 128(%rsp) 1266 1267 movq $out, %rbp 1268___ 1269$code.=<<___ if ($addx); 1270 movl \$0x80100,%r11d 1271 andl OPENSSL_ia32cap_P+8(%rip),%r11d 1272 cmpl \$0x80100,%r11d # check for MULX and ADO/CX 1273 je .Lmulx_scatter 1274___ 1275$code.=<<___; 1276 movq ($out),%rbx # pass b[0] 1277 call __rsaz_512_mul 1278 1279 movq %xmm0, $out 1280 movq %xmm1, %rbp 1281 1282 movq (%rsp), %r8 1283 movq 8(%rsp), %r9 1284 movq 16(%rsp), %r10 1285 movq 24(%rsp), %r11 1286 movq 32(%rsp), %r12 1287 movq 40(%rsp), %r13 1288 movq 48(%rsp), %r14 1289 movq 56(%rsp), %r15 1290 1291 call __rsaz_512_reduce 1292___ 1293$code.=<<___ if ($addx); 1294 jmp .Lmul_scatter_tail 1295 1296.align 32 1297.Lmulx_scatter: 1298 movq ($out), %rdx # pass b[0] 1299 call __rsaz_512_mulx 1300 1301 movq %xmm0, $out 1302 movq %xmm1, %rbp 1303 1304 movq 128(%rsp), %rdx # pull $n0 1305 movq (%rsp), %r8 1306 movq 8(%rsp), %r9 1307 movq 16(%rsp), %r10 1308 movq 24(%rsp), %r11 1309 movq 32(%rsp), %r12 1310 movq 40(%rsp), %r13 1311 movq 48(%rsp), %r14 1312 movq 56(%rsp), %r15 1313 1314 call __rsaz_512_reducex 1315 1316.Lmul_scatter_tail: 1317___ 1318$code.=<<___; 1319 addq 64(%rsp), %r8 1320 
adcq 72(%rsp), %r9 1321 adcq 80(%rsp), %r10 1322 adcq 88(%rsp), %r11 1323 adcq 96(%rsp), %r12 1324 adcq 104(%rsp), %r13 1325 adcq 112(%rsp), %r14 1326 adcq 120(%rsp), %r15 1327 movq %xmm2, $inp 1328 sbbq %rcx, %rcx 1329 1330 call __rsaz_512_subtract 1331 1332 movl %r8d, 64*0($inp) # scatter 1333 shrq \$32, %r8 1334 movl %r9d, 64*2($inp) 1335 shrq \$32, %r9 1336 movl %r10d, 64*4($inp) 1337 shrq \$32, %r10 1338 movl %r11d, 64*6($inp) 1339 shrq \$32, %r11 1340 movl %r12d, 64*8($inp) 1341 shrq \$32, %r12 1342 movl %r13d, 64*10($inp) 1343 shrq \$32, %r13 1344 movl %r14d, 64*12($inp) 1345 shrq \$32, %r14 1346 movl %r15d, 64*14($inp) 1347 shrq \$32, %r15 1348 movl %r8d, 64*1($inp) 1349 movl %r9d, 64*3($inp) 1350 movl %r10d, 64*5($inp) 1351 movl %r11d, 64*7($inp) 1352 movl %r12d, 64*9($inp) 1353 movl %r13d, 64*11($inp) 1354 movl %r14d, 64*13($inp) 1355 movl %r15d, 64*15($inp) 1356 1357 leaq 128+24+48(%rsp), %rax 1358 movq -48(%rax), %r15 1359 movq -40(%rax), %r14 1360 movq -32(%rax), %r13 1361 movq -24(%rax), %r12 1362 movq -16(%rax), %rbp 1363 movq -8(%rax), %rbx 1364 leaq (%rax), %rsp 1365.Lmul_scatter4_epilogue: 1366 ret 1367.size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4 1368___ 1369} 1370{ 1371my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx"); 1372$code.=<<___; 1373.globl rsaz_512_mul_by_one 1374.type rsaz_512_mul_by_one,\@function,4 1375.align 32 1376rsaz_512_mul_by_one: 1377 push %rbx 1378 push %rbp 1379 push %r12 1380 push %r13 1381 push %r14 1382 push %r15 1383 1384 subq \$128+24, %rsp 1385.Lmul_by_one_body: 1386___ 1387$code.=<<___ if ($addx); 1388 movl OPENSSL_ia32cap_P+8(%rip),%eax 1389___ 1390$code.=<<___; 1391 movq $mod, %rbp # reassign argument 1392 movq $n0, 128(%rsp) 1393 1394 movq ($inp), %r8 1395 pxor %xmm0, %xmm0 1396 movq 8($inp), %r9 1397 movq 16($inp), %r10 1398 movq 24($inp), %r11 1399 movq 32($inp), %r12 1400 movq 40($inp), %r13 1401 movq 48($inp), %r14 1402 movq 56($inp), %r15 1403 1404 movdqa %xmm0, (%rsp) 1405 movdqa %xmm0, 16(%rsp) 
movdqa	%xmm0, 32(%rsp)
	movdqa	%xmm0, 48(%rsp)
	movdqa	%xmm0, 64(%rsp)
	movdqa	%xmm0, 80(%rsp)
	movdqa	%xmm0, 96(%rsp)
___
# Take the MULX/ADCX/ADOX reduction only when both capability bits tested
# below (mask 0x80100) are set in %eax; otherwise fall through to the
# plain mulq-based reduction.
$code.=<<___ if ($addx);
	andl	\$0x80100,%eax
	cmpl	\$0x80100,%eax		# check for MULX and ADO/CX
	je	.Lby_one_callx
___
$code.=<<___;
	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lby_one_tail
.align	32
.Lby_one_callx:
	movq	128(%rsp), %rdx		# pull $n0
	call	__rsaz_512_reducex
.Lby_one_tail:
___
# Store the reduced value from %r8-%r15 to the output buffer, then restore
# %r15-%r12, %rbp and %rbx from the top of the frame and release the frame.
$code.=<<___;
	movq	%r8, ($out)
	movq	%r9, 8($out)
	movq	%r10, 16($out)
	movq	%r11, 24($out)
	movq	%r12, 32($out)
	movq	%r13, 40($out)
	movq	%r14, 48($out)
	movq	%r15, 56($out)

	leaq	128+24+48(%rsp), %rax
	movq	-48(%rax), %r15
	movq	-40(%rax), %r14
	movq	-32(%rax), %r13
	movq	-24(%rax), %r12
	movq	-16(%rax), %rbp
	movq	-8(%rax), %rbx
	leaq	(%rax), %rsp
.Lmul_by_one_epilogue:
	ret
.size	rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
___
}
{	# __rsaz_512_reduce
	#
	# input:	%r8-%r15, %rbp - mod, 128(%rsp) - n0
	# output:	%r8-%r15
	# clobbers:	everything except %rbp and %rdi
	#
	# Runs eight passes (%ecx counts down from 8). Each pass multiplies
	# the eight limbs at (%rbp) by the factor in %rbx and folds the
	# products into %r8-%r15, moving the window down by one limb.
	# The factor is the current low limb times n0. Inside the routine
	# n0 is read at 128+8(%rsp): the extra 8 bytes are the return
	# address pushed by the call on top of the caller's 128(%rsp).
	# The factor for the following pass is prepared in %rsi while the
	# current pass is still running and moved to %rbx at its end.
	#
	# NOTE(review): the first step of each pass keeps only the carry
	# (negq sets CF when %r8 is non-zero) and drops the low product
	# limb. This presumably relies on n0 being -1/mod modulo 2^64 so
	# that the low limb cancels %r8 exactly - confirm against callers.
$code.=<<___;
.type	__rsaz_512_reduce,\@abi-omnipotent
.align	32
__rsaz_512_reduce:
	movq	%r8, %rbx
	imulq	128+8(%rsp), %rbx
	movq	0(%rbp), %rax
	movl	\$8, %ecx
	jmp	.Lreduction_loop

.align	32
.Lreduction_loop:
	mulq	%rbx
	movq	8(%rbp), %rax
	negq	%r8
	movq	%rdx, %r8
	adcq	\$0, %r8

	mulq	%rbx
	addq	%rax, %r9
	movq	16(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r9, %r8
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r10
	movq	24(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r10, %r9
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r11
	movq	32(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r11, %r10
	movq	128+8(%rsp), %rsi
	#movq	%rdx, %r11
	#adcq	\$0, %r11
	adcq	\$0, %rdx
	movq	%rdx, %r11

	mulq	%rbx
	addq	%rax, %r12
	movq	40(%rbp), %rax
	adcq	\$0, %rdx
	imulq	%r8, %rsi
	addq	%r12, %r11
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r13
	movq	48(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r13, %r12
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r14
	movq	56(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r14, %r13
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	movq	%rsi, %rbx
	addq	%rax, %r15
	movq	0(%rbp), %rax
	adcq	\$0, %rdx
	addq	%r15, %r14
	movq	%rdx, %r15
	adcq	\$0, %r15

	decl	%ecx
	jne	.Lreduction_loop

	ret
.size	__rsaz_512_reduce,.-__rsaz_512_reduce
___
}
if ($addx) {
	# __rsaz_512_reducex
	#
	# input:	%r8-%r15, %rbp - mod, 128(%rsp) - n0
	# output:	%r8-%r15
	# clobbers:	everything except %rbp and %rdi
	#
	# MULX/ADCX/ADOX flavour of __rsaz_512_reduce, emitted only when
	# $addx is set. Differences visible in the code:
	#  - the caller must preload %rdx with n0 (the load inside the
	#    routine is commented out); the routine multiplies it by %r8
	#    to obtain the first factor;
	#  - two carry chains run side by side, adcx on CF and adox on OF,
	#    and %rsi is kept at zero to terminate both at the end of a
	#    pass;
	#  - the factor for the next pass is computed in the middle of the
	#    current one (mulx by n0 at 128+8(%rsp)) and parked in %rbx;
	#  - the .byte lines are hand-encoded mulx instructions, spelled
	#    out in their trailing comments.
$code.=<<___;
.type	__rsaz_512_reducex,\@abi-omnipotent
.align	32
__rsaz_512_reducex:
	#movq	128+8(%rsp), %rdx		# pull $n0
	imulq	%r8, %rdx
	xorq	%rsi, %rsi		# cf=0,of=0
	movl	\$8, %ecx
	jmp	.Lreduction_loopx

.align	32
.Lreduction_loopx:
	mov	%r8, %rbx
	mulx	0(%rbp), %rax, %r8
	adcx	%rbx, %rax
	adox	%r9, %r8

	mulx	8(%rbp), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	mulx	16(%rbp), %rbx, %r10
	adcx	%rbx, %r9
	adox	%r11, %r10

	mulx	24(%rbp), %rbx, %r11
	adcx	%rbx, %r10
	adox	%r12, %r11

	.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	32(%rbp), %rbx, %r12
	mov	%rdx, %rax
	mov	%r8, %rdx
	adcx	%rbx, %r11
	adox	%r13, %r12

	mulx	128+8(%rsp), %rbx, %rdx
	mov	%rax, %rdx

	mulx	40(%rbp), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	.byte	0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00	# mulx	48(%rbp), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	mulx	56(%rbp), %rax, %r15
	mov	%rbx, %rdx
	adcx	%rax, %r14
	adox	%rsi, %r15		# %rsi is 0
	adcx	%rsi, %r15		# cf=0

	decl	%ecx			# of=0
	jne	.Lreduction_loopx

	ret
.size	__rsaz_512_reducex,.-__rsaz_512_reducex
___
}
{	# __rsaz_512_subtract
	# input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
	# output:
	# clobbers: everything but %rdi, %rsi and %rbp
	#
	# Stores %r8-%r15 to the output buffer, then adds to it the eight
	# limbs of the modulus transformed as follows: limb 0 is negated,
	# limbs 1-7 are complemented, and every limb is ANDed with the
	# mask in %rcx. With an all-ones mask this adds the two's
	# complement of the modulus (i.e. subtracts it) as long as limb 0
	# of the modulus is non-zero; with a zero mask the value is left
	# unchanged. The same instructions run in both cases.
$code.=<<___;
.type	__rsaz_512_subtract,\@abi-omnipotent
.align	32
__rsaz_512_subtract:
	movq	%r8, ($out)
	movq	%r9, 8($out)
	movq	%r10, 16($out)
	movq	%r11, 24($out)
	movq	%r12, 32($out)
	movq	%r13, 40($out)
	movq	%r14, 48($out)
	movq	%r15, 56($out)

	movq	0($mod), %r8
	movq	8($mod), %r9
	negq	%r8
	notq	%r9
	andq	%rcx, %r8
	movq	16($mod), %r10
	andq	%rcx, %r9
	notq	%r10
	movq	24($mod), %r11
	andq	%rcx, %r10
	notq	%r11
	movq	32($mod), %r12
	andq	%rcx, %r11
	notq	%r12
	movq	40($mod), %r13
	andq	%rcx, %r12
	notq	%r13
	movq	48($mod), %r14
	andq	%rcx, %r13
	notq	%r14
	movq	56($mod), %r15
	andq	%rcx, %r14
	notq	%r15
	andq	%rcx, %r15

	addq	($out), %r8
	adcq	8($out), %r9
	adcq	16($out), %r10
	adcq	24($out), %r11
	adcq	32($out), %r12
	adcq	40($out), %r13
	adcq	48($out), %r14
	adcq	56($out), %r15

	movq	%r8, ($out)
	movq	%r9, 8($out)
	movq	%r10, 16($out)
	movq	%r11, 24($out)
	movq	%r12, 32($out)
	movq	%r13, 40($out)
	movq	%r14, 48($out)
	movq	%r15, 56($out)

	ret
.size	__rsaz_512_subtract,.-__rsaz_512_subtract
___
}
{	# __rsaz_512_mul
	#
	# input: %rsi - ap, %rbp - bp
	# output:
	# clobbers: everything
	#
	# Schoolbook 8x8-limb multiplication. %rbx must already hold the
	# first limb of b on entry: the first pass multiplies by %rbx
	# without loading it, and only the seven looped passes fetch their
	# multiplier from ($bp) after advancing the pointer.
	# The 16 result limbs are written starting at 8(%rsp) as seen from
	# inside the routine, one limb per pass through %rdi and the final
	# eight from %r8-%r15 after the loop.
my ($ap,$bp) = ("%rsi","%rbp");
$code.=<<___;
.type	__rsaz_512_mul,\@abi-omnipotent
.align	32
__rsaz_512_mul:
	leaq	8(%rsp), %rdi

	movq	($ap), %rax
	mulq	%rbx
	movq	%rax, (%rdi)
	movq	8($ap), %rax
	movq	%rdx, %r8

	mulq	%rbx
	addq	%rax, %r8
	movq	16($ap), %rax
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r9
	movq	24($ap), %rax
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r10
	movq	32($ap), %rax
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r11
	movq	40($ap), %rax
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r12
	movq	48($ap), %rax
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r13
	movq	56($ap), %rax
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r14
	movq	($ap), %rax
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8($bp), $bp
	leaq	8(%rdi), %rdi

	movl	\$7, %ecx
	jmp	.Loop_mul

.align	32
.Loop_mul:
	movq	($bp), %rbx
	mulq	%rbx
	addq	%rax, %r8
	movq	8($ap), %rax
	movq	%r8, (%rdi)
	movq	%rdx, %r8
	adcq	\$0, %r8

	mulq	%rbx
	addq	%rax, %r9
	movq	16($ap), %rax
	adcq	\$0, %rdx
	addq	%r9, %r8
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r10
	movq	24($ap), %rax
	adcq	\$0, %rdx
	addq	%r10, %r9
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r11
	movq	32($ap), %rax
	adcq	\$0, %rdx
	addq	%r11, %r10
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r12
	movq	40($ap), %rax
	adcq	\$0, %rdx
	addq	%r12, %r11
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r13
	movq	48($ap), %rax
	adcq	\$0, %rdx
	addq	%r13, %r12
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r14
	movq	56($ap), %rax
	adcq	\$0, %rdx
	addq	%r14, %r13
	movq	%rdx, %r14
	leaq	8($bp), $bp
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r15
	movq	($ap), %rax
	adcq	\$0, %rdx
	addq	%r15, %r14
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8(%rdi), %rdi

	decl	%ecx
	jnz	.Loop_mul

	movq	%r8, (%rdi)
	movq	%r9, 8(%rdi)
	movq	%r10, 16(%rdi)
	movq	%r11, 24(%rdi)
	movq	%r12, 32(%rdi)
	movq	%r13, 40(%rdi)
	movq	%r14, 48(%rdi)
	movq	%r15, 56(%rdi)

	ret
.size	__rsaz_512_mul,.-__rsaz_512_mul
___
}
if ($addx) {
	# __rsaz_512_mulx
	#
	# input: %rsi - ap, %rbp - bp
	# output:
	# clobbers: everything
	#
	# MULX/ADCX/ADOX flavour of __rsaz_512_mul, emitted only when
	# $addx is set. The caller preloads %rdx with the first limb of b;
	# later limbs are fetched from 8($bp) and then 64($bp,%rcx,8)
	# while %rcx counts up from -6 to 0. The last pass is peeled out of
	# the loop. $zero (%rdi) is cleared once and used to terminate the
	# adcx/adox carry chains. The 16 result limbs are written starting
	# at 8(%rsp) as seen from inside the routine. The .byte lines are
	# hand-encoded mulx instructions, spelled out in their trailing
	# comments.
my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
$code.=<<___;
.type	__rsaz_512_mulx,\@abi-omnipotent
.align	32
__rsaz_512_mulx:
	mulx	($ap), %rbx, %r8	# initial %rdx preloaded by caller
	mov	\$-6, %rcx

	mulx	8($ap), %rax, %r9
	movq	%rbx, 8(%rsp)

	mulx	16($ap), %rbx, %r10
	adc	%rax, %r8

	mulx	24($ap), %rax, %r11
	adc	%rbx, %r9

	mulx	32($ap), %rbx, %r12
	adc	%rax, %r10

	mulx	40($ap), %rax, %r13
	adc	%rbx, %r11

	mulx	48($ap), %rbx, %r14
	adc	%rax, %r12

	mulx	56($ap), %rax, %r15
	mov	8($bp), %rdx
	adc	%rbx, %r13
	adc	%rax, %r14
	adc	\$0, %r15

	xor	$zero, $zero		# cf=0,of=0
	jmp	.Loop_mulx

.align	32
.Loop_mulx:
	movq	%r8, %rbx
	mulx	($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	mulx	8($ap), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	mulx	16($ap), %rax, %r10
	adcx	%rax, %r9
	adox	%r11, %r10

	mulx	24($ap), %rax, %r11
	adcx	%rax, %r10
	adox	%r12, %r11

	.byte	0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00	# mulx	32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	mulx	48($ap), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	mulx	56($ap), %rax, %r15
	movq	64($bp,%rcx,8), %rdx
	movq	%rbx, 8+64-8(%rsp,%rcx,8)
	adcx	%rax, %r14
	adox	$zero, %r15
	adcx	$zero, %r15		# cf=0

	inc	%rcx			# of=0
	jnz	.Loop_mulx

	movq	%r8, %rbx
	mulx	($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	.byte	0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00	# mulx	8($ap), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	.byte	0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00	# mulx	16($ap), %rax, %r10
	adcx	%rax, %r9
	adox	%r11, %r10

	mulx	24($ap), %rax, %r11
	adcx	%rax, %r10
	adox	%r12, %r11

	mulx	32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($ap), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	.byte	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00	# mulx	56($ap), %rax, %r15
	adcx	%rax, %r14
	adox	$zero, %r15
	adcx	$zero, %r15

	mov	%rbx, 8+64-8(%rsp)
	mov	%r8, 8+64(%rsp)
	mov	%r9, 8+64+8(%rsp)
	mov	%r10, 8+64+16(%rsp)
	mov	%r11, 8+64+24(%rsp)
	mov	%r12, 8+64+32(%rsp)
	mov	%r13, 8+64+40(%rsp)
	mov	%r14, 8+64+48(%rsp)
	mov	%r15, 8+64+56(%rsp)

	ret
.size	__rsaz_512_mulx,.-__rsaz_512_mulx
___
}
{
# rsaz_512_scatter4 / rsaz_512_gather4
#
# Argument registers follow the platform: %rcx,%rdx,%r8d when $win64 is
# set, %rdi,%rsi,%edx otherwise.
#
# scatter4 reads eight 64-bit limbs from inp and writes them as sixteen
# 32-bit halves, 64 bytes apart, starting at out + 4*power.
# gather4 is the inverse: it reads sixteen 32-bit halves, 64 bytes apart,
# starting at inp + 4*power and reassembles eight 64-bit limbs at out.
#
# NOTE(review): the addresses touched depend directly on the power
# argument. If callers pass a secret index (e.g. exponent bits), this
# access pattern is observable through cache timing - confirm against
# callers.
my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
$code.=<<___;
.globl	rsaz_512_scatter4
.type	rsaz_512_scatter4,\@abi-omnipotent
.align	16
rsaz_512_scatter4:
	leaq	($out,$power,4), $out
	movl	\$8, %r9d
	jmp	.Loop_scatter
.align	16
.Loop_scatter:
	movq	($inp), %rax
	leaq	8($inp), $inp
	movl	%eax, ($out)
	shrq	\$32, %rax
	movl	%eax, 64($out)
	leaq	128($out), $out
	decl	%r9d
	jnz	.Loop_scatter
	ret
.size	rsaz_512_scatter4,.-rsaz_512_scatter4

.globl	rsaz_512_gather4
.type	rsaz_512_gather4,\@abi-omnipotent
.align	16
rsaz_512_gather4:
	leaq	($inp,$power,4), $inp
	movl	\$8, %r9d
	jmp	.Loop_gather
.align	16
.Loop_gather:
	movl	($inp), %eax
	movl	64($inp), %r8d
	leaq	128($inp), $inp
	shlq	\$32, %r8
	or	%r8, %rax
	movq	%rax, ($out)
	leaq	8($out), $out
	decl	%r9d
	jnz	.Loop_gather
	ret
.size	rsaz_512_gather4,.-rsaz_512_gather4
___
}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Win64 structured-exception handler shared by the public entry points
# listed in the .pdata/.xdata tables below. HandlerData[] carries two
# image-relative labels per function: end of prologue and epilogue.
#  - Rip before the prologue label: nothing has been saved yet, the
#    context is passed on using context->Rax as the frame pointer.
#  - Rip at or after the epilogue label: registers are already restored,
#    context->Rsp is used as is.
#  - otherwise: %rbx, %rbp and %r12-%r15 are recovered from the frame at
#    context->Rsp + 128+24+48, the same offset the epilogues use.
# The patched context is copied to disp->ContextRecord, RtlVirtualUnwind
# is called, and ExceptionContinueSearch is returned.
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	128+24+48(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_rsaz_512_sqr
	.rva	.LSEH_end_rsaz_512_sqr
	.rva	.LSEH_info_rsaz_512_sqr

	.rva	.LSEH_begin_rsaz_512_mul
	.rva	.LSEH_end_rsaz_512_mul
	.rva	.LSEH_info_rsaz_512_mul

	.rva	.LSEH_begin_rsaz_512_mul_gather4
	.rva	.LSEH_end_rsaz_512_mul_gather4
	.rva	.LSEH_info_rsaz_512_mul_gather4

	.rva	.LSEH_begin_rsaz_512_mul_scatter4
	.rva	.LSEH_end_rsaz_512_mul_scatter4
	.rva	.LSEH_info_rsaz_512_mul_scatter4

	.rva	.LSEH_begin_rsaz_512_mul_by_one
	.rva	.LSEH_end_rsaz_512_mul_by_one
	.rva	.LSEH_info_rsaz_512_mul_by_one

.section	.xdata
.align	8
.LSEH_info_rsaz_512_sqr:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lsqr_body,.Lsqr_epilogue			# HandlerData[]
.LSEH_info_rsaz_512_mul:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_body,.Lmul_epilogue			# HandlerData[]
.LSEH_info_rsaz_512_mul_gather4:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_gather4_body,.Lmul_gather4_epilogue	# HandlerData[]
.LSEH_info_rsaz_512_mul_scatter4:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_scatter4_body,.Lmul_scatter4_epilogue	# HandlerData[]
.LSEH_info_rsaz_512_mul_by_one:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_by_one_body,.Lmul_by_one_epilogue		# HandlerData[]
___
}

# Expand any backtick-quoted Perl expressions embedded in the generated
# text, then emit it. The close is checked so that a failed flush (full
# disk, closed pipe) aborts the build instead of leaving a truncated
# assembly file behind.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";