# rsaz-x86_64.pl -- FreeBSD contrib annotation header: SVN revision 296279
#!/usr/bin/env perl

##############################################################################
#                                                                            #
#  Copyright (c) 2012, Intel Corporation                                     #
#                                                                            #
#  All rights reserved.                                                      #
#                                                                            #
#  Redistribution and use in source and binary forms, with or without       #
#  modification, are permitted provided that the following conditions are   #
#  met:                                                                      #
#                                                                            #
#  *  Redistributions of source code must retain the above copyright        #
#     notice, this list of conditions and the following disclaimer.         #
#                                                                            #
#  *  Redistributions in binary form must reproduce the above copyright     #
#     notice, this list of conditions and the following disclaimer in the   #
#     documentation and/or other materials provided with the                #
#     distribution.                                                          #
#                                                                            #
#  *  Neither the name of the Intel Corporation nor the names of its        #
#     contributors may be used to endorse or promote products derived from  #
#     this software without specific prior written permission.              #
#                                                                            #
#                                                                            #
#  THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY         #
#  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE        #
#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR       #
#  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR           #
#  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,    #
#  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,      #
#  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR       #
#  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF   #
#  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING     #
#  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS       #
#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.             #
#                                                                            #
##############################################################################
# Developers and authors:                                                    #
# Shay Gueron (1, 2), and Vlad Krasnov (1)                                   #
# (1) Intel Architecture Group, Microprocessor and Chipset Development,      #
#     Israel Development Center, Haifa, Israel                               #
# (2) University of Haifa                                                    #
##############################################################################
# Reference:                                                                 #
# [1] S. Gueron, "Efficient Software Implementations of Modular              #
#     Exponentiation", http://eprint.iacr.org/2011/239                       #
# [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring".             #
#     IEEE Proceedings of 9th International Conference on Information        #
#     Technology: New Generations (ITNG 2012), 821-823 (2012).               #
# [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation#
#     Journal of Cryptographic Engineering 2:31-43 (2012).                   #
# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis    #
#     resistant 512-bit and 1024-bit modular exponentiation for optimizing   #
#     RSA1024 and RSA2048 on x86_64 platforms",                              #
#     http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest#
##############################################################################

# While original submission covers 512- and 1024-bit exponentiation,
# this module is limited to 512-bit version only (and as such
# accelerates RSA1024 sign). This is because improvement for longer
# keys is not high enough to justify the effort, highest measured
# was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, upcoming
# for the moment of this writing!] Nor does this module implement
# "monolithic" complete exponentiation jumbo-subroutine, but adheres
# to more modular mixture of C and assembly. And it's optimized even
# for processors other than Intel Core family (see table below for
# improvement coefficients).
69289848Sjkim# <appro@openssl.org> 70289848Sjkim# 71289848Sjkim# RSA1024 sign/sec this/original |this/rsax(*) this/fips(*) 72289848Sjkim# ----------------+--------------------------- 73289848Sjkim# Opteron +13% |+5% +20% 74289848Sjkim# Bulldozer -0% |-1% +10% 75289848Sjkim# P4 +11% |+7% +8% 76289848Sjkim# Westmere +5% |+14% +17% 77289848Sjkim# Sandy Bridge +2% |+12% +29% 78289848Sjkim# Ivy Bridge +1% |+11% +35% 79289848Sjkim# Haswell(**) -0% |+12% +39% 80289848Sjkim# Atom +13% |+11% +4% 81289848Sjkim# VIA Nano +70% |+9% +25% 82289848Sjkim# 83289848Sjkim# (*) rsax engine and fips numbers are presented for reference 84289848Sjkim# purposes; 85289848Sjkim# (**) MULX was attempted, but found to give only marginal improvement; 86289848Sjkim 87289848Sjkim$flavour = shift; 88289848Sjkim$output = shift; 89289848Sjkimif ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 90289848Sjkim 91289848Sjkim$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 92289848Sjkim 93289848Sjkim$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 94289848Sjkim( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 95289848Sjkim( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 96289848Sjkimdie "can't locate x86_64-xlate.pl"; 97289848Sjkim 98289848Sjkimopen OUT,"| \"$^X\" $xlate $flavour $output"; 99289848Sjkim*STDOUT=*OUT; 100289848Sjkim 101289848Sjkimif (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` 102289848Sjkim =~ /GNU assembler version ([2-9]\.[0-9]+)/) { 103289848Sjkim $addx = ($1>=2.23); 104289848Sjkim} 105289848Sjkim 106289848Sjkimif (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && 107289848Sjkim `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { 108289848Sjkim $addx = ($1>=2.10); 109289848Sjkim} 110289848Sjkim 111289848Sjkimif (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && 112289848Sjkim `ml64 2>&1` =~ /Version ([0-9]+)\./) { 113289848Sjkim $addx = ($1>=12); 114289848Sjkim} 115289848Sjkim 
116295009Sjkimif (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) { 117289848Sjkim my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 118289848Sjkim $addx = ($ver>=3.03); 119289848Sjkim} 120289848Sjkim 121289848Sjkim($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API 122289848Sjkim{ 123289848Sjkimmy ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d"); 124289848Sjkim 125289848Sjkim$code.=<<___; 126289848Sjkim.text 127289848Sjkim 128289848Sjkim.extern OPENSSL_ia32cap_P 129289848Sjkim 130289848Sjkim.globl rsaz_512_sqr 131289848Sjkim.type rsaz_512_sqr,\@function,5 132289848Sjkim.align 32 133289848Sjkimrsaz_512_sqr: # 25-29% faster than rsaz_512_mul 134289848Sjkim push %rbx 135289848Sjkim push %rbp 136289848Sjkim push %r12 137289848Sjkim push %r13 138289848Sjkim push %r14 139289848Sjkim push %r15 140289848Sjkim 141289848Sjkim subq \$128+24, %rsp 142289848Sjkim.Lsqr_body: 143289848Sjkim movq $mod, %rbp # common argument 144289848Sjkim movq ($inp), %rdx 145289848Sjkim movq 8($inp), %rax 146289848Sjkim movq $n0, 128(%rsp) 147289848Sjkim___ 148289848Sjkim$code.=<<___ if ($addx); 149289848Sjkim movl \$0x80100,%r11d 150289848Sjkim andl OPENSSL_ia32cap_P+8(%rip),%r11d 151289848Sjkim cmpl \$0x80100,%r11d # check for MULX and ADO/CX 152289848Sjkim je .Loop_sqrx 153289848Sjkim___ 154289848Sjkim$code.=<<___; 155289848Sjkim jmp .Loop_sqr 156289848Sjkim 157289848Sjkim.align 32 158289848Sjkim.Loop_sqr: 159289848Sjkim movl $times,128+8(%rsp) 160289848Sjkim#first iteration 161289848Sjkim movq %rdx, %rbx 162289848Sjkim mulq %rdx 163289848Sjkim movq %rax, %r8 164289848Sjkim movq 16($inp), %rax 165289848Sjkim movq %rdx, %r9 166289848Sjkim 167289848Sjkim mulq %rbx 168289848Sjkim addq %rax, %r9 169289848Sjkim movq 24($inp), %rax 170289848Sjkim movq %rdx, %r10 171289848Sjkim adcq \$0, %r10 172289848Sjkim 173289848Sjkim mulq %rbx 174289848Sjkim addq %rax, %r10 175289848Sjkim movq 32($inp), %rax 176289848Sjkim movq 
%rdx, %r11 177289848Sjkim adcq \$0, %r11 178289848Sjkim 179289848Sjkim mulq %rbx 180289848Sjkim addq %rax, %r11 181289848Sjkim movq 40($inp), %rax 182289848Sjkim movq %rdx, %r12 183289848Sjkim adcq \$0, %r12 184289848Sjkim 185289848Sjkim mulq %rbx 186289848Sjkim addq %rax, %r12 187289848Sjkim movq 48($inp), %rax 188289848Sjkim movq %rdx, %r13 189289848Sjkim adcq \$0, %r13 190289848Sjkim 191289848Sjkim mulq %rbx 192289848Sjkim addq %rax, %r13 193289848Sjkim movq 56($inp), %rax 194289848Sjkim movq %rdx, %r14 195289848Sjkim adcq \$0, %r14 196289848Sjkim 197289848Sjkim mulq %rbx 198289848Sjkim addq %rax, %r14 199289848Sjkim movq %rbx, %rax 200289848Sjkim movq %rdx, %r15 201289848Sjkim adcq \$0, %r15 202289848Sjkim 203289848Sjkim addq %r8, %r8 #shlq \$1, %r8 204289848Sjkim movq %r9, %rcx 205289848Sjkim adcq %r9, %r9 #shld \$1, %r8, %r9 206289848Sjkim 207289848Sjkim mulq %rax 208289848Sjkim movq %rax, (%rsp) 209289848Sjkim addq %rdx, %r8 210289848Sjkim adcq \$0, %r9 211289848Sjkim 212289848Sjkim movq %r8, 8(%rsp) 213289848Sjkim shrq \$63, %rcx 214289848Sjkim 215289848Sjkim#second iteration 216289848Sjkim movq 8($inp), %r8 217289848Sjkim movq 16($inp), %rax 218289848Sjkim mulq %r8 219289848Sjkim addq %rax, %r10 220289848Sjkim movq 24($inp), %rax 221289848Sjkim movq %rdx, %rbx 222289848Sjkim adcq \$0, %rbx 223289848Sjkim 224289848Sjkim mulq %r8 225289848Sjkim addq %rax, %r11 226289848Sjkim movq 32($inp), %rax 227289848Sjkim adcq \$0, %rdx 228289848Sjkim addq %rbx, %r11 229289848Sjkim movq %rdx, %rbx 230289848Sjkim adcq \$0, %rbx 231289848Sjkim 232289848Sjkim mulq %r8 233289848Sjkim addq %rax, %r12 234289848Sjkim movq 40($inp), %rax 235289848Sjkim adcq \$0, %rdx 236289848Sjkim addq %rbx, %r12 237289848Sjkim movq %rdx, %rbx 238289848Sjkim adcq \$0, %rbx 239289848Sjkim 240289848Sjkim mulq %r8 241289848Sjkim addq %rax, %r13 242289848Sjkim movq 48($inp), %rax 243289848Sjkim adcq \$0, %rdx 244289848Sjkim addq %rbx, %r13 245289848Sjkim movq %rdx, %rbx 246289848Sjkim adcq \$0, 
%rbx 247289848Sjkim 248289848Sjkim mulq %r8 249289848Sjkim addq %rax, %r14 250289848Sjkim movq 56($inp), %rax 251289848Sjkim adcq \$0, %rdx 252289848Sjkim addq %rbx, %r14 253289848Sjkim movq %rdx, %rbx 254289848Sjkim adcq \$0, %rbx 255289848Sjkim 256289848Sjkim mulq %r8 257289848Sjkim addq %rax, %r15 258289848Sjkim movq %r8, %rax 259289848Sjkim adcq \$0, %rdx 260289848Sjkim addq %rbx, %r15 261289848Sjkim movq %rdx, %r8 262289848Sjkim movq %r10, %rdx 263289848Sjkim adcq \$0, %r8 264289848Sjkim 265289848Sjkim add %rdx, %rdx 266289848Sjkim lea (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10 267289848Sjkim movq %r11, %rbx 268289848Sjkim adcq %r11, %r11 #shld \$1, %r10, %r11 269289848Sjkim 270289848Sjkim mulq %rax 271289848Sjkim addq %rax, %r9 272289848Sjkim adcq %rdx, %r10 273289848Sjkim adcq \$0, %r11 274289848Sjkim 275289848Sjkim movq %r9, 16(%rsp) 276289848Sjkim movq %r10, 24(%rsp) 277289848Sjkim shrq \$63, %rbx 278289848Sjkim 279289848Sjkim#third iteration 280289848Sjkim movq 16($inp), %r9 281289848Sjkim movq 24($inp), %rax 282289848Sjkim mulq %r9 283289848Sjkim addq %rax, %r12 284289848Sjkim movq 32($inp), %rax 285289848Sjkim movq %rdx, %rcx 286289848Sjkim adcq \$0, %rcx 287289848Sjkim 288289848Sjkim mulq %r9 289289848Sjkim addq %rax, %r13 290289848Sjkim movq 40($inp), %rax 291289848Sjkim adcq \$0, %rdx 292289848Sjkim addq %rcx, %r13 293289848Sjkim movq %rdx, %rcx 294289848Sjkim adcq \$0, %rcx 295289848Sjkim 296289848Sjkim mulq %r9 297289848Sjkim addq %rax, %r14 298289848Sjkim movq 48($inp), %rax 299289848Sjkim adcq \$0, %rdx 300289848Sjkim addq %rcx, %r14 301289848Sjkim movq %rdx, %rcx 302289848Sjkim adcq \$0, %rcx 303289848Sjkim 304289848Sjkim mulq %r9 305289848Sjkim movq %r12, %r10 306289848Sjkim lea (%rbx,%r12,2), %r12 #shld \$1, %rbx, %r12 307289848Sjkim addq %rax, %r15 308289848Sjkim movq 56($inp), %rax 309289848Sjkim adcq \$0, %rdx 310289848Sjkim addq %rcx, %r15 311289848Sjkim movq %rdx, %rcx 312289848Sjkim adcq \$0, %rcx 313289848Sjkim 314289848Sjkim mulq %r9 
315289848Sjkim shrq \$63, %r10 316289848Sjkim addq %rax, %r8 317289848Sjkim movq %r9, %rax 318289848Sjkim adcq \$0, %rdx 319289848Sjkim addq %rcx, %r8 320289848Sjkim movq %rdx, %r9 321289848Sjkim adcq \$0, %r9 322289848Sjkim 323289848Sjkim movq %r13, %rcx 324289848Sjkim leaq (%r10,%r13,2), %r13 #shld \$1, %r12, %r13 325289848Sjkim 326289848Sjkim mulq %rax 327289848Sjkim addq %rax, %r11 328289848Sjkim adcq %rdx, %r12 329289848Sjkim adcq \$0, %r13 330289848Sjkim 331289848Sjkim movq %r11, 32(%rsp) 332289848Sjkim movq %r12, 40(%rsp) 333289848Sjkim shrq \$63, %rcx 334289848Sjkim 335289848Sjkim#fourth iteration 336289848Sjkim movq 24($inp), %r10 337289848Sjkim movq 32($inp), %rax 338289848Sjkim mulq %r10 339289848Sjkim addq %rax, %r14 340289848Sjkim movq 40($inp), %rax 341289848Sjkim movq %rdx, %rbx 342289848Sjkim adcq \$0, %rbx 343289848Sjkim 344289848Sjkim mulq %r10 345289848Sjkim addq %rax, %r15 346289848Sjkim movq 48($inp), %rax 347289848Sjkim adcq \$0, %rdx 348289848Sjkim addq %rbx, %r15 349289848Sjkim movq %rdx, %rbx 350289848Sjkim adcq \$0, %rbx 351289848Sjkim 352289848Sjkim mulq %r10 353289848Sjkim movq %r14, %r12 354289848Sjkim leaq (%rcx,%r14,2), %r14 #shld \$1, %rcx, %r14 355289848Sjkim addq %rax, %r8 356289848Sjkim movq 56($inp), %rax 357289848Sjkim adcq \$0, %rdx 358289848Sjkim addq %rbx, %r8 359289848Sjkim movq %rdx, %rbx 360289848Sjkim adcq \$0, %rbx 361289848Sjkim 362289848Sjkim mulq %r10 363289848Sjkim shrq \$63, %r12 364289848Sjkim addq %rax, %r9 365289848Sjkim movq %r10, %rax 366289848Sjkim adcq \$0, %rdx 367289848Sjkim addq %rbx, %r9 368289848Sjkim movq %rdx, %r10 369289848Sjkim adcq \$0, %r10 370289848Sjkim 371289848Sjkim movq %r15, %rbx 372289848Sjkim leaq (%r12,%r15,2),%r15 #shld \$1, %r14, %r15 373289848Sjkim 374289848Sjkim mulq %rax 375289848Sjkim addq %rax, %r13 376289848Sjkim adcq %rdx, %r14 377289848Sjkim adcq \$0, %r15 378289848Sjkim 379289848Sjkim movq %r13, 48(%rsp) 380289848Sjkim movq %r14, 56(%rsp) 381289848Sjkim shrq \$63, %rbx 
382289848Sjkim 383289848Sjkim#fifth iteration 384289848Sjkim movq 32($inp), %r11 385289848Sjkim movq 40($inp), %rax 386289848Sjkim mulq %r11 387289848Sjkim addq %rax, %r8 388289848Sjkim movq 48($inp), %rax 389289848Sjkim movq %rdx, %rcx 390289848Sjkim adcq \$0, %rcx 391289848Sjkim 392289848Sjkim mulq %r11 393289848Sjkim addq %rax, %r9 394289848Sjkim movq 56($inp), %rax 395289848Sjkim adcq \$0, %rdx 396289848Sjkim movq %r8, %r12 397289848Sjkim leaq (%rbx,%r8,2), %r8 #shld \$1, %rbx, %r8 398289848Sjkim addq %rcx, %r9 399289848Sjkim movq %rdx, %rcx 400289848Sjkim adcq \$0, %rcx 401289848Sjkim 402289848Sjkim mulq %r11 403289848Sjkim shrq \$63, %r12 404289848Sjkim addq %rax, %r10 405289848Sjkim movq %r11, %rax 406289848Sjkim adcq \$0, %rdx 407289848Sjkim addq %rcx, %r10 408289848Sjkim movq %rdx, %r11 409289848Sjkim adcq \$0, %r11 410289848Sjkim 411289848Sjkim movq %r9, %rcx 412289848Sjkim leaq (%r12,%r9,2), %r9 #shld \$1, %r8, %r9 413289848Sjkim 414289848Sjkim mulq %rax 415289848Sjkim addq %rax, %r15 416289848Sjkim adcq %rdx, %r8 417289848Sjkim adcq \$0, %r9 418289848Sjkim 419289848Sjkim movq %r15, 64(%rsp) 420289848Sjkim movq %r8, 72(%rsp) 421289848Sjkim shrq \$63, %rcx 422289848Sjkim 423289848Sjkim#sixth iteration 424289848Sjkim movq 40($inp), %r12 425289848Sjkim movq 48($inp), %rax 426289848Sjkim mulq %r12 427289848Sjkim addq %rax, %r10 428289848Sjkim movq 56($inp), %rax 429289848Sjkim movq %rdx, %rbx 430289848Sjkim adcq \$0, %rbx 431289848Sjkim 432289848Sjkim mulq %r12 433289848Sjkim addq %rax, %r11 434289848Sjkim movq %r12, %rax 435289848Sjkim movq %r10, %r15 436289848Sjkim leaq (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10 437289848Sjkim adcq \$0, %rdx 438289848Sjkim shrq \$63, %r15 439289848Sjkim addq %rbx, %r11 440289848Sjkim movq %rdx, %r12 441289848Sjkim adcq \$0, %r12 442289848Sjkim 443289848Sjkim movq %r11, %rbx 444289848Sjkim leaq (%r15,%r11,2), %r11 #shld \$1, %r10, %r11 445289848Sjkim 446289848Sjkim mulq %rax 447289848Sjkim addq %rax, %r9 448289848Sjkim adcq 
%rdx, %r10 449289848Sjkim adcq \$0, %r11 450289848Sjkim 451289848Sjkim movq %r9, 80(%rsp) 452289848Sjkim movq %r10, 88(%rsp) 453289848Sjkim 454289848Sjkim#seventh iteration 455289848Sjkim movq 48($inp), %r13 456289848Sjkim movq 56($inp), %rax 457289848Sjkim mulq %r13 458289848Sjkim addq %rax, %r12 459289848Sjkim movq %r13, %rax 460289848Sjkim movq %rdx, %r13 461289848Sjkim adcq \$0, %r13 462289848Sjkim 463289848Sjkim xorq %r14, %r14 464289848Sjkim shlq \$1, %rbx 465289848Sjkim adcq %r12, %r12 #shld \$1, %rbx, %r12 466289848Sjkim adcq %r13, %r13 #shld \$1, %r12, %r13 467289848Sjkim adcq %r14, %r14 #shld \$1, %r13, %r14 468289848Sjkim 469289848Sjkim mulq %rax 470289848Sjkim addq %rax, %r11 471289848Sjkim adcq %rdx, %r12 472289848Sjkim adcq \$0, %r13 473289848Sjkim 474289848Sjkim movq %r11, 96(%rsp) 475289848Sjkim movq %r12, 104(%rsp) 476289848Sjkim 477289848Sjkim#eighth iteration 478289848Sjkim movq 56($inp), %rax 479289848Sjkim mulq %rax 480289848Sjkim addq %rax, %r13 481289848Sjkim adcq \$0, %rdx 482289848Sjkim 483289848Sjkim addq %rdx, %r14 484289848Sjkim 485289848Sjkim movq %r13, 112(%rsp) 486289848Sjkim movq %r14, 120(%rsp) 487289848Sjkim 488289848Sjkim movq (%rsp), %r8 489289848Sjkim movq 8(%rsp), %r9 490289848Sjkim movq 16(%rsp), %r10 491289848Sjkim movq 24(%rsp), %r11 492289848Sjkim movq 32(%rsp), %r12 493289848Sjkim movq 40(%rsp), %r13 494289848Sjkim movq 48(%rsp), %r14 495289848Sjkim movq 56(%rsp), %r15 496289848Sjkim 497289848Sjkim call __rsaz_512_reduce 498289848Sjkim 499289848Sjkim addq 64(%rsp), %r8 500289848Sjkim adcq 72(%rsp), %r9 501289848Sjkim adcq 80(%rsp), %r10 502289848Sjkim adcq 88(%rsp), %r11 503289848Sjkim adcq 96(%rsp), %r12 504289848Sjkim adcq 104(%rsp), %r13 505289848Sjkim adcq 112(%rsp), %r14 506289848Sjkim adcq 120(%rsp), %r15 507289848Sjkim sbbq %rcx, %rcx 508289848Sjkim 509289848Sjkim call __rsaz_512_subtract 510289848Sjkim 511289848Sjkim movq %r8, %rdx 512289848Sjkim movq %r9, %rax 513289848Sjkim movl 128+8(%rsp), $times 514289848Sjkim 
movq $out, $inp 515289848Sjkim 516289848Sjkim decl $times 517289848Sjkim jnz .Loop_sqr 518289848Sjkim___ 519289848Sjkimif ($addx) { 520289848Sjkim$code.=<<___; 521289848Sjkim jmp .Lsqr_tail 522289848Sjkim 523289848Sjkim.align 32 524289848Sjkim.Loop_sqrx: 525289848Sjkim movl $times,128+8(%rsp) 526289848Sjkim movq $out, %xmm0 # off-load 527289848Sjkim movq %rbp, %xmm1 # off-load 528289848Sjkim#first iteration 529289848Sjkim mulx %rax, %r8, %r9 530289848Sjkim 531289848Sjkim mulx 16($inp), %rcx, %r10 532289848Sjkim xor %rbp, %rbp # cf=0, of=0 533289848Sjkim 534289848Sjkim mulx 24($inp), %rax, %r11 535289848Sjkim adcx %rcx, %r9 536289848Sjkim 537289848Sjkim mulx 32($inp), %rcx, %r12 538289848Sjkim adcx %rax, %r10 539289848Sjkim 540289848Sjkim mulx 40($inp), %rax, %r13 541289848Sjkim adcx %rcx, %r11 542289848Sjkim 543289848Sjkim .byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($inp), %rcx, %r14 544289848Sjkim adcx %rax, %r12 545289848Sjkim adcx %rcx, %r13 546289848Sjkim 547289848Sjkim .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r15 548289848Sjkim adcx %rax, %r14 549289848Sjkim adcx %rbp, %r15 # %rbp is 0 550289848Sjkim 551289848Sjkim mov %r9, %rcx 552289848Sjkim shld \$1, %r8, %r9 553289848Sjkim shl \$1, %r8 554289848Sjkim 555289848Sjkim xor %ebp, %ebp 556289848Sjkim mulx %rdx, %rax, %rdx 557289848Sjkim adcx %rdx, %r8 558289848Sjkim mov 8($inp), %rdx 559289848Sjkim adcx %rbp, %r9 560289848Sjkim 561289848Sjkim mov %rax, (%rsp) 562289848Sjkim mov %r8, 8(%rsp) 563289848Sjkim 564289848Sjkim#second iteration 565289848Sjkim mulx 16($inp), %rax, %rbx 566289848Sjkim adox %rax, %r10 567289848Sjkim adcx %rbx, %r11 568289848Sjkim 569289848Sjkim .byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r8 570289848Sjkim adox $out, %r11 571289848Sjkim adcx %r8, %r12 572289848Sjkim 573289848Sjkim mulx 32($inp), %rax, %rbx 574289848Sjkim adox %rax, %r12 575289848Sjkim adcx %rbx, %r13 576289848Sjkim 577289848Sjkim mulx 
40($inp), $out, %r8 578289848Sjkim adox $out, %r13 579289848Sjkim adcx %r8, %r14 580289848Sjkim 581289848Sjkim .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx 582289848Sjkim adox %rax, %r14 583289848Sjkim adcx %rbx, %r15 584289848Sjkim 585289848Sjkim .byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8 586289848Sjkim adox $out, %r15 587289848Sjkim adcx %rbp, %r8 588289848Sjkim adox %rbp, %r8 589289848Sjkim 590289848Sjkim mov %r11, %rbx 591289848Sjkim shld \$1, %r10, %r11 592289848Sjkim shld \$1, %rcx, %r10 593289848Sjkim 594289848Sjkim xor %ebp,%ebp 595289848Sjkim mulx %rdx, %rax, %rcx 596289848Sjkim mov 16($inp), %rdx 597289848Sjkim adcx %rax, %r9 598289848Sjkim adcx %rcx, %r10 599289848Sjkim adcx %rbp, %r11 600289848Sjkim 601289848Sjkim mov %r9, 16(%rsp) 602289848Sjkim .byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp) 603289848Sjkim 604289848Sjkim#third iteration 605289848Sjkim .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r9 606289848Sjkim adox $out, %r12 607289848Sjkim adcx %r9, %r13 608289848Sjkim 609289848Sjkim mulx 32($inp), %rax, %rcx 610289848Sjkim adox %rax, %r13 611289848Sjkim adcx %rcx, %r14 612289848Sjkim 613289848Sjkim mulx 40($inp), $out, %r9 614289848Sjkim adox $out, %r14 615289848Sjkim adcx %r9, %r15 616289848Sjkim 617289848Sjkim .byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx 618289848Sjkim adox %rax, %r15 619289848Sjkim adcx %rcx, %r8 620289848Sjkim 621289848Sjkim .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r9 622289848Sjkim adox $out, %r8 623289848Sjkim adcx %rbp, %r9 624289848Sjkim adox %rbp, %r9 625289848Sjkim 626289848Sjkim mov %r13, %rcx 627289848Sjkim shld \$1, %r12, %r13 628289848Sjkim shld \$1, %rbx, %r12 629289848Sjkim 630289848Sjkim xor %ebp, %ebp 631289848Sjkim mulx %rdx, %rax, %rdx 632289848Sjkim adcx %rax, %r11 633289848Sjkim adcx %rdx, %r12 634289848Sjkim mov 24($inp), 
%rdx 635289848Sjkim adcx %rbp, %r13 636289848Sjkim 637289848Sjkim mov %r11, 32(%rsp) 638289848Sjkim .byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 # mov %r12, 40(%rsp) 639289848Sjkim 640289848Sjkim#fourth iteration 641289848Sjkim .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 32($inp), %rax, %rbx 642289848Sjkim adox %rax, %r14 643289848Sjkim adcx %rbx, %r15 644289848Sjkim 645289848Sjkim mulx 40($inp), $out, %r10 646289848Sjkim adox $out, %r15 647289848Sjkim adcx %r10, %r8 648289848Sjkim 649289848Sjkim mulx 48($inp), %rax, %rbx 650289848Sjkim adox %rax, %r8 651289848Sjkim adcx %rbx, %r9 652289848Sjkim 653289848Sjkim mulx 56($inp), $out, %r10 654289848Sjkim adox $out, %r9 655289848Sjkim adcx %rbp, %r10 656289848Sjkim adox %rbp, %r10 657289848Sjkim 658289848Sjkim .byte 0x66 659289848Sjkim mov %r15, %rbx 660289848Sjkim shld \$1, %r14, %r15 661289848Sjkim shld \$1, %rcx, %r14 662289848Sjkim 663289848Sjkim xor %ebp, %ebp 664289848Sjkim mulx %rdx, %rax, %rdx 665289848Sjkim adcx %rax, %r13 666289848Sjkim adcx %rdx, %r14 667289848Sjkim mov 32($inp), %rdx 668289848Sjkim adcx %rbp, %r15 669289848Sjkim 670289848Sjkim mov %r13, 48(%rsp) 671289848Sjkim mov %r14, 56(%rsp) 672289848Sjkim 673289848Sjkim#fifth iteration 674289848Sjkim .byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r11 675289848Sjkim adox $out, %r8 676289848Sjkim adcx %r11, %r9 677289848Sjkim 678289848Sjkim mulx 48($inp), %rax, %rcx 679289848Sjkim adox %rax, %r9 680289848Sjkim adcx %rcx, %r10 681289848Sjkim 682289848Sjkim mulx 56($inp), $out, %r11 683289848Sjkim adox $out, %r10 684289848Sjkim adcx %rbp, %r11 685289848Sjkim adox %rbp, %r11 686289848Sjkim 687289848Sjkim mov %r9, %rcx 688289848Sjkim shld \$1, %r8, %r9 689289848Sjkim shld \$1, %rbx, %r8 690289848Sjkim 691289848Sjkim xor %ebp, %ebp 692289848Sjkim mulx %rdx, %rax, %rdx 693289848Sjkim adcx %rax, %r15 694289848Sjkim adcx %rdx, %r8 695289848Sjkim mov 40($inp), %rdx 696289848Sjkim adcx %rbp, %r9 697289848Sjkim 
698289848Sjkim mov %r15, 64(%rsp) 699289848Sjkim mov %r8, 72(%rsp) 700289848Sjkim 701289848Sjkim#sixth iteration 702289848Sjkim .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx 703289848Sjkim adox %rax, %r10 704289848Sjkim adcx %rbx, %r11 705289848Sjkim 706289848Sjkim .byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12 707289848Sjkim adox $out, %r11 708289848Sjkim adcx %rbp, %r12 709289848Sjkim adox %rbp, %r12 710289848Sjkim 711289848Sjkim mov %r11, %rbx 712289848Sjkim shld \$1, %r10, %r11 713289848Sjkim shld \$1, %rcx, %r10 714289848Sjkim 715289848Sjkim xor %ebp, %ebp 716289848Sjkim mulx %rdx, %rax, %rdx 717289848Sjkim adcx %rax, %r9 718289848Sjkim adcx %rdx, %r10 719289848Sjkim mov 48($inp), %rdx 720289848Sjkim adcx %rbp, %r11 721289848Sjkim 722289848Sjkim mov %r9, 80(%rsp) 723289848Sjkim mov %r10, 88(%rsp) 724289848Sjkim 725289848Sjkim#seventh iteration 726289848Sjkim .byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13 727289848Sjkim adox %rax, %r12 728289848Sjkim adox %rbp, %r13 729289848Sjkim 730289848Sjkim xor %r14, %r14 731289848Sjkim shld \$1, %r13, %r14 732289848Sjkim shld \$1, %r12, %r13 733289848Sjkim shld \$1, %rbx, %r12 734289848Sjkim 735289848Sjkim xor %ebp, %ebp 736289848Sjkim mulx %rdx, %rax, %rdx 737289848Sjkim adcx %rax, %r11 738289848Sjkim adcx %rdx, %r12 739289848Sjkim mov 56($inp), %rdx 740289848Sjkim adcx %rbp, %r13 741289848Sjkim 742289848Sjkim .byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp) 743289848Sjkim .byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp) 744289848Sjkim 745289848Sjkim#eighth iteration 746289848Sjkim mulx %rdx, %rax, %rdx 747289848Sjkim adox %rax, %r13 748289848Sjkim adox %rbp, %rdx 749289848Sjkim 750289848Sjkim .byte 0x66 751289848Sjkim add %rdx, %r14 752289848Sjkim 753289848Sjkim movq %r13, 112(%rsp) 754289848Sjkim movq %r14, 120(%rsp) 755289848Sjkim movq %xmm0, $out 756289848Sjkim movq %xmm1, %rbp 
757289848Sjkim 758289848Sjkim movq 128(%rsp), %rdx # pull $n0 759289848Sjkim movq (%rsp), %r8 760289848Sjkim movq 8(%rsp), %r9 761289848Sjkim movq 16(%rsp), %r10 762289848Sjkim movq 24(%rsp), %r11 763289848Sjkim movq 32(%rsp), %r12 764289848Sjkim movq 40(%rsp), %r13 765289848Sjkim movq 48(%rsp), %r14 766289848Sjkim movq 56(%rsp), %r15 767289848Sjkim 768289848Sjkim call __rsaz_512_reducex 769289848Sjkim 770289848Sjkim addq 64(%rsp), %r8 771289848Sjkim adcq 72(%rsp), %r9 772289848Sjkim adcq 80(%rsp), %r10 773289848Sjkim adcq 88(%rsp), %r11 774289848Sjkim adcq 96(%rsp), %r12 775289848Sjkim adcq 104(%rsp), %r13 776289848Sjkim adcq 112(%rsp), %r14 777289848Sjkim adcq 120(%rsp), %r15 778289848Sjkim sbbq %rcx, %rcx 779289848Sjkim 780289848Sjkim call __rsaz_512_subtract 781289848Sjkim 782289848Sjkim movq %r8, %rdx 783289848Sjkim movq %r9, %rax 784289848Sjkim movl 128+8(%rsp), $times 785289848Sjkim movq $out, $inp 786289848Sjkim 787289848Sjkim decl $times 788289848Sjkim jnz .Loop_sqrx 789289848Sjkim 790289848Sjkim.Lsqr_tail: 791289848Sjkim___ 792289848Sjkim} 793289848Sjkim$code.=<<___; 794289848Sjkim 795289848Sjkim leaq 128+24+48(%rsp), %rax 796289848Sjkim movq -48(%rax), %r15 797289848Sjkim movq -40(%rax), %r14 798289848Sjkim movq -32(%rax), %r13 799289848Sjkim movq -24(%rax), %r12 800289848Sjkim movq -16(%rax), %rbp 801289848Sjkim movq -8(%rax), %rbx 802289848Sjkim leaq (%rax), %rsp 803289848Sjkim.Lsqr_epilogue: 804289848Sjkim ret 805289848Sjkim.size rsaz_512_sqr,.-rsaz_512_sqr 806289848Sjkim___ 807289848Sjkim} 808289848Sjkim{ 809289848Sjkimmy ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); 810289848Sjkim$code.=<<___; 811289848Sjkim.globl rsaz_512_mul 812289848Sjkim.type rsaz_512_mul,\@function,5 813289848Sjkim.align 32 814289848Sjkimrsaz_512_mul: 815289848Sjkim push %rbx 816289848Sjkim push %rbp 817289848Sjkim push %r12 818289848Sjkim push %r13 819289848Sjkim push %r14 820289848Sjkim push %r15 821289848Sjkim 822289848Sjkim subq \$128+24, %rsp 
823289848Sjkim.Lmul_body: 824289848Sjkim movq $out, %xmm0 # off-load arguments 825289848Sjkim movq $mod, %xmm1 826289848Sjkim movq $n0, 128(%rsp) 827289848Sjkim___ 828289848Sjkim$code.=<<___ if ($addx); 829289848Sjkim movl \$0x80100,%r11d 830289848Sjkim andl OPENSSL_ia32cap_P+8(%rip),%r11d 831289848Sjkim cmpl \$0x80100,%r11d # check for MULX and ADO/CX 832289848Sjkim je .Lmulx 833289848Sjkim___ 834289848Sjkim$code.=<<___; 835289848Sjkim movq ($bp), %rbx # pass b[0] 836289848Sjkim movq $bp, %rbp # pass argument 837289848Sjkim call __rsaz_512_mul 838289848Sjkim 839289848Sjkim movq %xmm0, $out 840289848Sjkim movq %xmm1, %rbp 841289848Sjkim 842289848Sjkim movq (%rsp), %r8 843289848Sjkim movq 8(%rsp), %r9 844289848Sjkim movq 16(%rsp), %r10 845289848Sjkim movq 24(%rsp), %r11 846289848Sjkim movq 32(%rsp), %r12 847289848Sjkim movq 40(%rsp), %r13 848289848Sjkim movq 48(%rsp), %r14 849289848Sjkim movq 56(%rsp), %r15 850289848Sjkim 851289848Sjkim call __rsaz_512_reduce 852289848Sjkim___ 853289848Sjkim$code.=<<___ if ($addx); 854289848Sjkim jmp .Lmul_tail 855289848Sjkim 856289848Sjkim.align 32 857289848Sjkim.Lmulx: 858289848Sjkim movq $bp, %rbp # pass argument 859289848Sjkim movq ($bp), %rdx # pass b[0] 860289848Sjkim call __rsaz_512_mulx 861289848Sjkim 862289848Sjkim movq %xmm0, $out 863289848Sjkim movq %xmm1, %rbp 864289848Sjkim 865289848Sjkim movq 128(%rsp), %rdx # pull $n0 866289848Sjkim movq (%rsp), %r8 867289848Sjkim movq 8(%rsp), %r9 868289848Sjkim movq 16(%rsp), %r10 869289848Sjkim movq 24(%rsp), %r11 870289848Sjkim movq 32(%rsp), %r12 871289848Sjkim movq 40(%rsp), %r13 872289848Sjkim movq 48(%rsp), %r14 873289848Sjkim movq 56(%rsp), %r15 874289848Sjkim 875289848Sjkim call __rsaz_512_reducex 876289848Sjkim.Lmul_tail: 877289848Sjkim___ 878289848Sjkim$code.=<<___; 879289848Sjkim addq 64(%rsp), %r8 880289848Sjkim adcq 72(%rsp), %r9 881289848Sjkim adcq 80(%rsp), %r10 882289848Sjkim adcq 88(%rsp), %r11 883289848Sjkim adcq 96(%rsp), %r12 884289848Sjkim adcq 104(%rsp), %r13 
885289848Sjkim adcq 112(%rsp), %r14 886289848Sjkim adcq 120(%rsp), %r15 887289848Sjkim sbbq %rcx, %rcx 888289848Sjkim 889289848Sjkim call __rsaz_512_subtract 890289848Sjkim 891289848Sjkim leaq 128+24+48(%rsp), %rax 892289848Sjkim movq -48(%rax), %r15 893289848Sjkim movq -40(%rax), %r14 894289848Sjkim movq -32(%rax), %r13 895289848Sjkim movq -24(%rax), %r12 896289848Sjkim movq -16(%rax), %rbp 897289848Sjkim movq -8(%rax), %rbx 898289848Sjkim leaq (%rax), %rsp 899289848Sjkim.Lmul_epilogue: 900289848Sjkim ret 901289848Sjkim.size rsaz_512_mul,.-rsaz_512_mul 902289848Sjkim___ 903289848Sjkim} 904289848Sjkim{ 905289848Sjkimmy ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); 906289848Sjkim$code.=<<___; 907289848Sjkim.globl rsaz_512_mul_gather4 908289848Sjkim.type rsaz_512_mul_gather4,\@function,6 909289848Sjkim.align 32 910289848Sjkimrsaz_512_mul_gather4: 911289848Sjkim push %rbx 912289848Sjkim push %rbp 913289848Sjkim push %r12 914289848Sjkim push %r13 915289848Sjkim push %r14 916289848Sjkim push %r15 917289848Sjkim 918296279Sjkim subq \$`128+24+($win64?0xb0:0)`, %rsp 919296279Sjkim___ 920296279Sjkim$code.=<<___ if ($win64); 921296279Sjkim movaps %xmm6,0xa0(%rsp) 922296279Sjkim movaps %xmm7,0xb0(%rsp) 923296279Sjkim movaps %xmm8,0xc0(%rsp) 924296279Sjkim movaps %xmm9,0xd0(%rsp) 925296279Sjkim movaps %xmm10,0xe0(%rsp) 926296279Sjkim movaps %xmm11,0xf0(%rsp) 927296279Sjkim movaps %xmm12,0x100(%rsp) 928296279Sjkim movaps %xmm13,0x110(%rsp) 929296279Sjkim movaps %xmm14,0x120(%rsp) 930296279Sjkim movaps %xmm15,0x130(%rsp) 931296279Sjkim___ 932296279Sjkim$code.=<<___; 933289848Sjkim.Lmul_gather4_body: 934296279Sjkim movd $pwr,%xmm8 935296279Sjkim movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002 936296279Sjkim movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000 937296279Sjkim 938296279Sjkim pshufd \$0,%xmm8,%xmm8 # broadcast $power 939296279Sjkim movdqa %xmm1,%xmm7 940296279Sjkim movdqa %xmm1,%xmm2 941289848Sjkim___ 
942296279Sjkim######################################################################## 943296279Sjkim# calculate mask by comparing 0..15 to $power 944296279Sjkim# 945296279Sjkimfor($i=0;$i<4;$i++) { 946296279Sjkim$code.=<<___; 947296279Sjkim paddd %xmm`$i`,%xmm`$i+1` 948296279Sjkim pcmpeqd %xmm8,%xmm`$i` 949296279Sjkim movdqa %xmm7,%xmm`$i+3` 950296279Sjkim___ 951296279Sjkim} 952296279Sjkimfor(;$i<7;$i++) { 953296279Sjkim$code.=<<___; 954296279Sjkim paddd %xmm`$i`,%xmm`$i+1` 955296279Sjkim pcmpeqd %xmm8,%xmm`$i` 956296279Sjkim___ 957296279Sjkim} 958296279Sjkim$code.=<<___; 959296279Sjkim pcmpeqd %xmm8,%xmm7 960296279Sjkim 961296279Sjkim movdqa 16*0($bp),%xmm8 962296279Sjkim movdqa 16*1($bp),%xmm9 963296279Sjkim movdqa 16*2($bp),%xmm10 964296279Sjkim movdqa 16*3($bp),%xmm11 965296279Sjkim pand %xmm0,%xmm8 966296279Sjkim movdqa 16*4($bp),%xmm12 967296279Sjkim pand %xmm1,%xmm9 968296279Sjkim movdqa 16*5($bp),%xmm13 969296279Sjkim pand %xmm2,%xmm10 970296279Sjkim movdqa 16*6($bp),%xmm14 971296279Sjkim pand %xmm3,%xmm11 972296279Sjkim movdqa 16*7($bp),%xmm15 973296279Sjkim leaq 128($bp), %rbp 974296279Sjkim pand %xmm4,%xmm12 975296279Sjkim pand %xmm5,%xmm13 976296279Sjkim pand %xmm6,%xmm14 977296279Sjkim pand %xmm7,%xmm15 978296279Sjkim por %xmm10,%xmm8 979296279Sjkim por %xmm11,%xmm9 980296279Sjkim por %xmm12,%xmm8 981296279Sjkim por %xmm13,%xmm9 982296279Sjkim por %xmm14,%xmm8 983296279Sjkim por %xmm15,%xmm9 984296279Sjkim 985296279Sjkim por %xmm9,%xmm8 986296279Sjkim pshufd \$0x4e,%xmm8,%xmm9 987296279Sjkim por %xmm9,%xmm8 988296279Sjkim___ 989289848Sjkim$code.=<<___ if ($addx); 990289848Sjkim movl \$0x80100,%r11d 991289848Sjkim andl OPENSSL_ia32cap_P+8(%rip),%r11d 992289848Sjkim cmpl \$0x80100,%r11d # check for MULX and ADO/CX 993289848Sjkim je .Lmulx_gather 994289848Sjkim___ 995289848Sjkim$code.=<<___; 996296279Sjkim movq %xmm8,%rbx 997289848Sjkim 998296279Sjkim movq $n0, 128(%rsp) # off-load arguments 999296279Sjkim movq $out, 128+8(%rsp) 1000296279Sjkim movq 
$mod, 128+16(%rsp) 1001296279Sjkim 1002289848Sjkim movq ($ap), %rax 1003289848Sjkim movq 8($ap), %rcx 1004289848Sjkim mulq %rbx # 0 iteration 1005289848Sjkim movq %rax, (%rsp) 1006289848Sjkim movq %rcx, %rax 1007289848Sjkim movq %rdx, %r8 1008289848Sjkim 1009289848Sjkim mulq %rbx 1010289848Sjkim addq %rax, %r8 1011289848Sjkim movq 16($ap), %rax 1012289848Sjkim movq %rdx, %r9 1013289848Sjkim adcq \$0, %r9 1014289848Sjkim 1015289848Sjkim mulq %rbx 1016289848Sjkim addq %rax, %r9 1017289848Sjkim movq 24($ap), %rax 1018289848Sjkim movq %rdx, %r10 1019289848Sjkim adcq \$0, %r10 1020289848Sjkim 1021289848Sjkim mulq %rbx 1022289848Sjkim addq %rax, %r10 1023289848Sjkim movq 32($ap), %rax 1024289848Sjkim movq %rdx, %r11 1025289848Sjkim adcq \$0, %r11 1026289848Sjkim 1027289848Sjkim mulq %rbx 1028289848Sjkim addq %rax, %r11 1029289848Sjkim movq 40($ap), %rax 1030289848Sjkim movq %rdx, %r12 1031289848Sjkim adcq \$0, %r12 1032289848Sjkim 1033289848Sjkim mulq %rbx 1034289848Sjkim addq %rax, %r12 1035289848Sjkim movq 48($ap), %rax 1036289848Sjkim movq %rdx, %r13 1037289848Sjkim adcq \$0, %r13 1038289848Sjkim 1039289848Sjkim mulq %rbx 1040289848Sjkim addq %rax, %r13 1041289848Sjkim movq 56($ap), %rax 1042289848Sjkim movq %rdx, %r14 1043289848Sjkim adcq \$0, %r14 1044289848Sjkim 1045289848Sjkim mulq %rbx 1046289848Sjkim addq %rax, %r14 1047289848Sjkim movq ($ap), %rax 1048289848Sjkim movq %rdx, %r15 1049289848Sjkim adcq \$0, %r15 1050289848Sjkim 1051289848Sjkim leaq 8(%rsp), %rdi 1052289848Sjkim movl \$7, %ecx 1053289848Sjkim jmp .Loop_mul_gather 1054289848Sjkim 1055289848Sjkim.align 32 1056289848Sjkim.Loop_mul_gather: 1057296279Sjkim movdqa 16*0(%rbp),%xmm8 1058296279Sjkim movdqa 16*1(%rbp),%xmm9 1059296279Sjkim movdqa 16*2(%rbp),%xmm10 1060296279Sjkim movdqa 16*3(%rbp),%xmm11 1061296279Sjkim pand %xmm0,%xmm8 1062296279Sjkim movdqa 16*4(%rbp),%xmm12 1063296279Sjkim pand %xmm1,%xmm9 1064296279Sjkim movdqa 16*5(%rbp),%xmm13 1065296279Sjkim pand %xmm2,%xmm10 1066296279Sjkim movdqa 
16*6(%rbp),%xmm14 1067296279Sjkim pand %xmm3,%xmm11 1068296279Sjkim movdqa 16*7(%rbp),%xmm15 1069296279Sjkim leaq 128(%rbp), %rbp 1070296279Sjkim pand %xmm4,%xmm12 1071296279Sjkim pand %xmm5,%xmm13 1072296279Sjkim pand %xmm6,%xmm14 1073296279Sjkim pand %xmm7,%xmm15 1074296279Sjkim por %xmm10,%xmm8 1075296279Sjkim por %xmm11,%xmm9 1076296279Sjkim por %xmm12,%xmm8 1077296279Sjkim por %xmm13,%xmm9 1078296279Sjkim por %xmm14,%xmm8 1079296279Sjkim por %xmm15,%xmm9 1080296279Sjkim 1081296279Sjkim por %xmm9,%xmm8 1082296279Sjkim pshufd \$0x4e,%xmm8,%xmm9 1083296279Sjkim por %xmm9,%xmm8 1084296279Sjkim movq %xmm8,%rbx 1085296279Sjkim 1086289848Sjkim mulq %rbx 1087289848Sjkim addq %rax, %r8 1088289848Sjkim movq 8($ap), %rax 1089289848Sjkim movq %r8, (%rdi) 1090289848Sjkim movq %rdx, %r8 1091289848Sjkim adcq \$0, %r8 1092289848Sjkim 1093289848Sjkim mulq %rbx 1094289848Sjkim addq %rax, %r9 1095289848Sjkim movq 16($ap), %rax 1096289848Sjkim adcq \$0, %rdx 1097289848Sjkim addq %r9, %r8 1098289848Sjkim movq %rdx, %r9 1099289848Sjkim adcq \$0, %r9 1100289848Sjkim 1101289848Sjkim mulq %rbx 1102289848Sjkim addq %rax, %r10 1103289848Sjkim movq 24($ap), %rax 1104289848Sjkim adcq \$0, %rdx 1105289848Sjkim addq %r10, %r9 1106289848Sjkim movq %rdx, %r10 1107289848Sjkim adcq \$0, %r10 1108289848Sjkim 1109289848Sjkim mulq %rbx 1110289848Sjkim addq %rax, %r11 1111289848Sjkim movq 32($ap), %rax 1112289848Sjkim adcq \$0, %rdx 1113289848Sjkim addq %r11, %r10 1114289848Sjkim movq %rdx, %r11 1115289848Sjkim adcq \$0, %r11 1116289848Sjkim 1117289848Sjkim mulq %rbx 1118289848Sjkim addq %rax, %r12 1119289848Sjkim movq 40($ap), %rax 1120289848Sjkim adcq \$0, %rdx 1121289848Sjkim addq %r12, %r11 1122289848Sjkim movq %rdx, %r12 1123289848Sjkim adcq \$0, %r12 1124289848Sjkim 1125289848Sjkim mulq %rbx 1126289848Sjkim addq %rax, %r13 1127289848Sjkim movq 48($ap), %rax 1128289848Sjkim adcq \$0, %rdx 1129289848Sjkim addq %r13, %r12 1130289848Sjkim movq %rdx, %r13 1131289848Sjkim adcq \$0, %r13 
1132289848Sjkim 1133289848Sjkim mulq %rbx 1134289848Sjkim addq %rax, %r14 1135289848Sjkim movq 56($ap), %rax 1136289848Sjkim adcq \$0, %rdx 1137289848Sjkim addq %r14, %r13 1138289848Sjkim movq %rdx, %r14 1139289848Sjkim adcq \$0, %r14 1140289848Sjkim 1141289848Sjkim mulq %rbx 1142289848Sjkim addq %rax, %r15 1143289848Sjkim movq ($ap), %rax 1144289848Sjkim adcq \$0, %rdx 1145289848Sjkim addq %r15, %r14 1146289848Sjkim movq %rdx, %r15 1147289848Sjkim adcq \$0, %r15 1148289848Sjkim 1149289848Sjkim leaq 8(%rdi), %rdi 1150289848Sjkim 1151289848Sjkim decl %ecx 1152289848Sjkim jnz .Loop_mul_gather 1153289848Sjkim 1154289848Sjkim movq %r8, (%rdi) 1155289848Sjkim movq %r9, 8(%rdi) 1156289848Sjkim movq %r10, 16(%rdi) 1157289848Sjkim movq %r11, 24(%rdi) 1158289848Sjkim movq %r12, 32(%rdi) 1159289848Sjkim movq %r13, 40(%rdi) 1160289848Sjkim movq %r14, 48(%rdi) 1161289848Sjkim movq %r15, 56(%rdi) 1162289848Sjkim 1163296279Sjkim movq 128+8(%rsp), $out 1164296279Sjkim movq 128+16(%rsp), %rbp 1165289848Sjkim 1166289848Sjkim movq (%rsp), %r8 1167289848Sjkim movq 8(%rsp), %r9 1168289848Sjkim movq 16(%rsp), %r10 1169289848Sjkim movq 24(%rsp), %r11 1170289848Sjkim movq 32(%rsp), %r12 1171289848Sjkim movq 40(%rsp), %r13 1172289848Sjkim movq 48(%rsp), %r14 1173289848Sjkim movq 56(%rsp), %r15 1174289848Sjkim 1175289848Sjkim call __rsaz_512_reduce 1176289848Sjkim___ 1177289848Sjkim$code.=<<___ if ($addx); 1178289848Sjkim jmp .Lmul_gather_tail 1179289848Sjkim 1180289848Sjkim.align 32 1181289848Sjkim.Lmulx_gather: 1182296279Sjkim movq %xmm8,%rdx 1183289848Sjkim 1184296279Sjkim mov $n0, 128(%rsp) # off-load arguments 1185296279Sjkim mov $out, 128+8(%rsp) 1186296279Sjkim mov $mod, 128+16(%rsp) 1187296279Sjkim 1188289848Sjkim mulx ($ap), %rbx, %r8 # 0 iteration 1189289848Sjkim mov %rbx, (%rsp) 1190289848Sjkim xor %edi, %edi # cf=0, of=0 1191289848Sjkim 1192289848Sjkim mulx 8($ap), %rax, %r9 1193289848Sjkim 1194289848Sjkim mulx 16($ap), %rbx, %r10 1195289848Sjkim adcx %rax, %r8 1196289848Sjkim 
1197289848Sjkim mulx 24($ap), %rax, %r11 1198289848Sjkim adcx %rbx, %r9 1199289848Sjkim 1200289848Sjkim mulx 32($ap), %rbx, %r12 1201289848Sjkim adcx %rax, %r10 1202289848Sjkim 1203289848Sjkim mulx 40($ap), %rax, %r13 1204289848Sjkim adcx %rbx, %r11 1205289848Sjkim 1206289848Sjkim mulx 48($ap), %rbx, %r14 1207289848Sjkim adcx %rax, %r12 1208289848Sjkim 1209289848Sjkim mulx 56($ap), %rax, %r15 1210289848Sjkim adcx %rbx, %r13 1211289848Sjkim adcx %rax, %r14 1212296279Sjkim .byte 0x67 1213289848Sjkim mov %r8, %rbx 1214289848Sjkim adcx %rdi, %r15 # %rdi is 0 1215289848Sjkim 1216289848Sjkim mov \$-7, %rcx 1217289848Sjkim jmp .Loop_mulx_gather 1218289848Sjkim 1219289848Sjkim.align 32 1220289848Sjkim.Loop_mulx_gather: 1221296279Sjkim movdqa 16*0(%rbp),%xmm8 1222296279Sjkim movdqa 16*1(%rbp),%xmm9 1223296279Sjkim movdqa 16*2(%rbp),%xmm10 1224296279Sjkim movdqa 16*3(%rbp),%xmm11 1225296279Sjkim pand %xmm0,%xmm8 1226296279Sjkim movdqa 16*4(%rbp),%xmm12 1227296279Sjkim pand %xmm1,%xmm9 1228296279Sjkim movdqa 16*5(%rbp),%xmm13 1229296279Sjkim pand %xmm2,%xmm10 1230296279Sjkim movdqa 16*6(%rbp),%xmm14 1231296279Sjkim pand %xmm3,%xmm11 1232296279Sjkim movdqa 16*7(%rbp),%xmm15 1233296279Sjkim leaq 128(%rbp), %rbp 1234296279Sjkim pand %xmm4,%xmm12 1235296279Sjkim pand %xmm5,%xmm13 1236296279Sjkim pand %xmm6,%xmm14 1237296279Sjkim pand %xmm7,%xmm15 1238296279Sjkim por %xmm10,%xmm8 1239296279Sjkim por %xmm11,%xmm9 1240296279Sjkim por %xmm12,%xmm8 1241296279Sjkim por %xmm13,%xmm9 1242296279Sjkim por %xmm14,%xmm8 1243296279Sjkim por %xmm15,%xmm9 1244296279Sjkim 1245296279Sjkim por %xmm9,%xmm8 1246296279Sjkim pshufd \$0x4e,%xmm8,%xmm9 1247296279Sjkim por %xmm9,%xmm8 1248296279Sjkim movq %xmm8,%rdx 1249296279Sjkim 1250296279Sjkim .byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00 # mulx ($ap), %rax, %r8 1251289848Sjkim adcx %rax, %rbx 1252289848Sjkim adox %r9, %r8 1253289848Sjkim 1254289848Sjkim mulx 8($ap), %rax, %r9 1255289848Sjkim adcx %rax, %r8 1256289848Sjkim adox %r10, %r9 
1257289848Sjkim 1258289848Sjkim mulx 16($ap), %rax, %r10 1259289848Sjkim adcx %rax, %r9 1260289848Sjkim adox %r11, %r10 1261289848Sjkim 1262289848Sjkim .byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11 1263289848Sjkim adcx %rax, %r10 1264289848Sjkim adox %r12, %r11 1265289848Sjkim 1266289848Sjkim mulx 32($ap), %rax, %r12 1267289848Sjkim adcx %rax, %r11 1268289848Sjkim adox %r13, %r12 1269289848Sjkim 1270289848Sjkim mulx 40($ap), %rax, %r13 1271289848Sjkim adcx %rax, %r12 1272289848Sjkim adox %r14, %r13 1273289848Sjkim 1274289848Sjkim .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14 1275289848Sjkim adcx %rax, %r13 1276296279Sjkim .byte 0x67 1277289848Sjkim adox %r15, %r14 1278289848Sjkim 1279289848Sjkim mulx 56($ap), %rax, %r15 1280289848Sjkim mov %rbx, 64(%rsp,%rcx,8) 1281289848Sjkim adcx %rax, %r14 1282289848Sjkim adox %rdi, %r15 1283289848Sjkim mov %r8, %rbx 1284289848Sjkim adcx %rdi, %r15 # cf=0 1285289848Sjkim 1286289848Sjkim inc %rcx # of=0 1287289848Sjkim jnz .Loop_mulx_gather 1288289848Sjkim 1289289848Sjkim mov %r8, 64(%rsp) 1290289848Sjkim mov %r9, 64+8(%rsp) 1291289848Sjkim mov %r10, 64+16(%rsp) 1292289848Sjkim mov %r11, 64+24(%rsp) 1293289848Sjkim mov %r12, 64+32(%rsp) 1294289848Sjkim mov %r13, 64+40(%rsp) 1295289848Sjkim mov %r14, 64+48(%rsp) 1296289848Sjkim mov %r15, 64+56(%rsp) 1297289848Sjkim 1298296279Sjkim mov 128(%rsp), %rdx # pull arguments 1299296279Sjkim mov 128+8(%rsp), $out 1300296279Sjkim mov 128+16(%rsp), %rbp 1301289848Sjkim 1302289848Sjkim mov (%rsp), %r8 1303289848Sjkim mov 8(%rsp), %r9 1304289848Sjkim mov 16(%rsp), %r10 1305289848Sjkim mov 24(%rsp), %r11 1306289848Sjkim mov 32(%rsp), %r12 1307289848Sjkim mov 40(%rsp), %r13 1308289848Sjkim mov 48(%rsp), %r14 1309289848Sjkim mov 56(%rsp), %r15 1310289848Sjkim 1311289848Sjkim call __rsaz_512_reducex 1312289848Sjkim 1313289848Sjkim.Lmul_gather_tail: 1314289848Sjkim___ 1315289848Sjkim$code.=<<___; 1316289848Sjkim addq 64(%rsp), %r8 
1317289848Sjkim adcq 72(%rsp), %r9 1318289848Sjkim adcq 80(%rsp), %r10 1319289848Sjkim adcq 88(%rsp), %r11 1320289848Sjkim adcq 96(%rsp), %r12 1321289848Sjkim adcq 104(%rsp), %r13 1322289848Sjkim adcq 112(%rsp), %r14 1323289848Sjkim adcq 120(%rsp), %r15 1324289848Sjkim sbbq %rcx, %rcx 1325289848Sjkim 1326289848Sjkim call __rsaz_512_subtract 1327289848Sjkim 1328289848Sjkim leaq 128+24+48(%rsp), %rax 1329296279Sjkim___ 1330296279Sjkim$code.=<<___ if ($win64); 1331296279Sjkim movaps 0xa0-0xc8(%rax),%xmm6 1332296279Sjkim movaps 0xb0-0xc8(%rax),%xmm7 1333296279Sjkim movaps 0xc0-0xc8(%rax),%xmm8 1334296279Sjkim movaps 0xd0-0xc8(%rax),%xmm9 1335296279Sjkim movaps 0xe0-0xc8(%rax),%xmm10 1336296279Sjkim movaps 0xf0-0xc8(%rax),%xmm11 1337296279Sjkim movaps 0x100-0xc8(%rax),%xmm12 1338296279Sjkim movaps 0x110-0xc8(%rax),%xmm13 1339296279Sjkim movaps 0x120-0xc8(%rax),%xmm14 1340296279Sjkim movaps 0x130-0xc8(%rax),%xmm15 1341296279Sjkim lea 0xb0(%rax),%rax 1342296279Sjkim___ 1343296279Sjkim$code.=<<___; 1344289848Sjkim movq -48(%rax), %r15 1345289848Sjkim movq -40(%rax), %r14 1346289848Sjkim movq -32(%rax), %r13 1347289848Sjkim movq -24(%rax), %r12 1348289848Sjkim movq -16(%rax), %rbp 1349289848Sjkim movq -8(%rax), %rbx 1350289848Sjkim leaq (%rax), %rsp 1351289848Sjkim.Lmul_gather4_epilogue: 1352289848Sjkim ret 1353289848Sjkim.size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4 1354289848Sjkim___ 1355289848Sjkim} 1356289848Sjkim{ 1357289848Sjkimmy ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); 1358289848Sjkim$code.=<<___; 1359289848Sjkim.globl rsaz_512_mul_scatter4 1360289848Sjkim.type rsaz_512_mul_scatter4,\@function,6 1361289848Sjkim.align 32 1362289848Sjkimrsaz_512_mul_scatter4: 1363289848Sjkim push %rbx 1364289848Sjkim push %rbp 1365289848Sjkim push %r12 1366289848Sjkim push %r13 1367289848Sjkim push %r14 1368289848Sjkim push %r15 1369289848Sjkim 1370289848Sjkim mov $pwr, $pwr 1371289848Sjkim subq \$128+24, %rsp 1372289848Sjkim.Lmul_scatter4_body: 
1373296279Sjkim leaq ($tbl,$pwr,8), $tbl 1374289848Sjkim movq $out, %xmm0 # off-load arguments 1375289848Sjkim movq $mod, %xmm1 1376289848Sjkim movq $tbl, %xmm2 1377289848Sjkim movq $n0, 128(%rsp) 1378289848Sjkim 1379289848Sjkim movq $out, %rbp 1380289848Sjkim___ 1381289848Sjkim$code.=<<___ if ($addx); 1382289848Sjkim movl \$0x80100,%r11d 1383289848Sjkim andl OPENSSL_ia32cap_P+8(%rip),%r11d 1384289848Sjkim cmpl \$0x80100,%r11d # check for MULX and ADO/CX 1385289848Sjkim je .Lmulx_scatter 1386289848Sjkim___ 1387289848Sjkim$code.=<<___; 1388289848Sjkim movq ($out),%rbx # pass b[0] 1389289848Sjkim call __rsaz_512_mul 1390289848Sjkim 1391289848Sjkim movq %xmm0, $out 1392289848Sjkim movq %xmm1, %rbp 1393289848Sjkim 1394289848Sjkim movq (%rsp), %r8 1395289848Sjkim movq 8(%rsp), %r9 1396289848Sjkim movq 16(%rsp), %r10 1397289848Sjkim movq 24(%rsp), %r11 1398289848Sjkim movq 32(%rsp), %r12 1399289848Sjkim movq 40(%rsp), %r13 1400289848Sjkim movq 48(%rsp), %r14 1401289848Sjkim movq 56(%rsp), %r15 1402289848Sjkim 1403289848Sjkim call __rsaz_512_reduce 1404289848Sjkim___ 1405289848Sjkim$code.=<<___ if ($addx); 1406289848Sjkim jmp .Lmul_scatter_tail 1407289848Sjkim 1408289848Sjkim.align 32 1409289848Sjkim.Lmulx_scatter: 1410289848Sjkim movq ($out), %rdx # pass b[0] 1411289848Sjkim call __rsaz_512_mulx 1412289848Sjkim 1413289848Sjkim movq %xmm0, $out 1414289848Sjkim movq %xmm1, %rbp 1415289848Sjkim 1416289848Sjkim movq 128(%rsp), %rdx # pull $n0 1417289848Sjkim movq (%rsp), %r8 1418289848Sjkim movq 8(%rsp), %r9 1419289848Sjkim movq 16(%rsp), %r10 1420289848Sjkim movq 24(%rsp), %r11 1421289848Sjkim movq 32(%rsp), %r12 1422289848Sjkim movq 40(%rsp), %r13 1423289848Sjkim movq 48(%rsp), %r14 1424289848Sjkim movq 56(%rsp), %r15 1425289848Sjkim 1426289848Sjkim call __rsaz_512_reducex 1427289848Sjkim 1428289848Sjkim.Lmul_scatter_tail: 1429289848Sjkim___ 1430289848Sjkim$code.=<<___; 1431289848Sjkim addq 64(%rsp), %r8 1432289848Sjkim adcq 72(%rsp), %r9 1433289848Sjkim adcq 80(%rsp), 
%r10 1434289848Sjkim adcq 88(%rsp), %r11 1435289848Sjkim adcq 96(%rsp), %r12 1436289848Sjkim adcq 104(%rsp), %r13 1437289848Sjkim adcq 112(%rsp), %r14 1438289848Sjkim adcq 120(%rsp), %r15 1439289848Sjkim movq %xmm2, $inp 1440289848Sjkim sbbq %rcx, %rcx 1441289848Sjkim 1442289848Sjkim call __rsaz_512_subtract 1443289848Sjkim 1444296279Sjkim movq %r8, 128*0($inp) # scatter 1445296279Sjkim movq %r9, 128*1($inp) 1446296279Sjkim movq %r10, 128*2($inp) 1447296279Sjkim movq %r11, 128*3($inp) 1448296279Sjkim movq %r12, 128*4($inp) 1449296279Sjkim movq %r13, 128*5($inp) 1450296279Sjkim movq %r14, 128*6($inp) 1451296279Sjkim movq %r15, 128*7($inp) 1452289848Sjkim 1453289848Sjkim leaq 128+24+48(%rsp), %rax 1454289848Sjkim movq -48(%rax), %r15 1455289848Sjkim movq -40(%rax), %r14 1456289848Sjkim movq -32(%rax), %r13 1457289848Sjkim movq -24(%rax), %r12 1458289848Sjkim movq -16(%rax), %rbp 1459289848Sjkim movq -8(%rax), %rbx 1460289848Sjkim leaq (%rax), %rsp 1461289848Sjkim.Lmul_scatter4_epilogue: 1462289848Sjkim ret 1463289848Sjkim.size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4 1464289848Sjkim___ 1465289848Sjkim} 1466289848Sjkim{ 1467289848Sjkimmy ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx"); 1468289848Sjkim$code.=<<___; 1469289848Sjkim.globl rsaz_512_mul_by_one 1470289848Sjkim.type rsaz_512_mul_by_one,\@function,4 1471289848Sjkim.align 32 1472289848Sjkimrsaz_512_mul_by_one: 1473289848Sjkim push %rbx 1474289848Sjkim push %rbp 1475289848Sjkim push %r12 1476289848Sjkim push %r13 1477289848Sjkim push %r14 1478289848Sjkim push %r15 1479289848Sjkim 1480289848Sjkim subq \$128+24, %rsp 1481289848Sjkim.Lmul_by_one_body: 1482289848Sjkim___ 1483289848Sjkim$code.=<<___ if ($addx); 1484289848Sjkim movl OPENSSL_ia32cap_P+8(%rip),%eax 1485289848Sjkim___ 1486289848Sjkim$code.=<<___; 1487289848Sjkim movq $mod, %rbp # reassign argument 1488289848Sjkim movq $n0, 128(%rsp) 1489289848Sjkim 1490289848Sjkim movq ($inp), %r8 1491289848Sjkim pxor %xmm0, %xmm0 1492289848Sjkim movq 8($inp), 
%r9 1493289848Sjkim movq 16($inp), %r10 1494289848Sjkim movq 24($inp), %r11 1495289848Sjkim movq 32($inp), %r12 1496289848Sjkim movq 40($inp), %r13 1497289848Sjkim movq 48($inp), %r14 1498289848Sjkim movq 56($inp), %r15 1499289848Sjkim 1500289848Sjkim movdqa %xmm0, (%rsp) 1501289848Sjkim movdqa %xmm0, 16(%rsp) 1502289848Sjkim movdqa %xmm0, 32(%rsp) 1503289848Sjkim movdqa %xmm0, 48(%rsp) 1504289848Sjkim movdqa %xmm0, 64(%rsp) 1505289848Sjkim movdqa %xmm0, 80(%rsp) 1506289848Sjkim movdqa %xmm0, 96(%rsp) 1507289848Sjkim___ 1508289848Sjkim$code.=<<___ if ($addx); 1509289848Sjkim andl \$0x80100,%eax 1510289848Sjkim cmpl \$0x80100,%eax # check for MULX and ADO/CX 1511289848Sjkim je .Lby_one_callx 1512289848Sjkim___ 1513289848Sjkim$code.=<<___; 1514289848Sjkim call __rsaz_512_reduce 1515289848Sjkim___ 1516289848Sjkim$code.=<<___ if ($addx); 1517289848Sjkim jmp .Lby_one_tail 1518289848Sjkim.align 32 1519289848Sjkim.Lby_one_callx: 1520289848Sjkim movq 128(%rsp), %rdx # pull $n0 1521289848Sjkim call __rsaz_512_reducex 1522289848Sjkim.Lby_one_tail: 1523289848Sjkim___ 1524289848Sjkim$code.=<<___; 1525289848Sjkim movq %r8, ($out) 1526289848Sjkim movq %r9, 8($out) 1527289848Sjkim movq %r10, 16($out) 1528289848Sjkim movq %r11, 24($out) 1529289848Sjkim movq %r12, 32($out) 1530289848Sjkim movq %r13, 40($out) 1531289848Sjkim movq %r14, 48($out) 1532289848Sjkim movq %r15, 56($out) 1533289848Sjkim 1534289848Sjkim leaq 128+24+48(%rsp), %rax 1535289848Sjkim movq -48(%rax), %r15 1536289848Sjkim movq -40(%rax), %r14 1537289848Sjkim movq -32(%rax), %r13 1538289848Sjkim movq -24(%rax), %r12 1539289848Sjkim movq -16(%rax), %rbp 1540289848Sjkim movq -8(%rax), %rbx 1541289848Sjkim leaq (%rax), %rsp 1542289848Sjkim.Lmul_by_one_epilogue: 1543289848Sjkim ret 1544289848Sjkim.size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one 1545289848Sjkim___ 1546289848Sjkim} 1547289848Sjkim{ # __rsaz_512_reduce 1548289848Sjkim # 1549289848Sjkim # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0 1550289848Sjkim # output: 
%r8-%r15 1551289848Sjkim # clobbers: everything except %rbp and %rdi 1552289848Sjkim$code.=<<___; 1553289848Sjkim.type __rsaz_512_reduce,\@abi-omnipotent 1554289848Sjkim.align 32 1555289848Sjkim__rsaz_512_reduce: 1556289848Sjkim movq %r8, %rbx 1557289848Sjkim imulq 128+8(%rsp), %rbx 1558289848Sjkim movq 0(%rbp), %rax 1559289848Sjkim movl \$8, %ecx 1560289848Sjkim jmp .Lreduction_loop 1561289848Sjkim 1562289848Sjkim.align 32 1563289848Sjkim.Lreduction_loop: 1564289848Sjkim mulq %rbx 1565289848Sjkim movq 8(%rbp), %rax 1566289848Sjkim negq %r8 1567289848Sjkim movq %rdx, %r8 1568289848Sjkim adcq \$0, %r8 1569289848Sjkim 1570289848Sjkim mulq %rbx 1571289848Sjkim addq %rax, %r9 1572289848Sjkim movq 16(%rbp), %rax 1573289848Sjkim adcq \$0, %rdx 1574289848Sjkim addq %r9, %r8 1575289848Sjkim movq %rdx, %r9 1576289848Sjkim adcq \$0, %r9 1577289848Sjkim 1578289848Sjkim mulq %rbx 1579289848Sjkim addq %rax, %r10 1580289848Sjkim movq 24(%rbp), %rax 1581289848Sjkim adcq \$0, %rdx 1582289848Sjkim addq %r10, %r9 1583289848Sjkim movq %rdx, %r10 1584289848Sjkim adcq \$0, %r10 1585289848Sjkim 1586289848Sjkim mulq %rbx 1587289848Sjkim addq %rax, %r11 1588289848Sjkim movq 32(%rbp), %rax 1589289848Sjkim adcq \$0, %rdx 1590289848Sjkim addq %r11, %r10 1591289848Sjkim movq 128+8(%rsp), %rsi 1592289848Sjkim #movq %rdx, %r11 1593289848Sjkim #adcq \$0, %r11 1594289848Sjkim adcq \$0, %rdx 1595289848Sjkim movq %rdx, %r11 1596289848Sjkim 1597289848Sjkim mulq %rbx 1598289848Sjkim addq %rax, %r12 1599289848Sjkim movq 40(%rbp), %rax 1600289848Sjkim adcq \$0, %rdx 1601289848Sjkim imulq %r8, %rsi 1602289848Sjkim addq %r12, %r11 1603289848Sjkim movq %rdx, %r12 1604289848Sjkim adcq \$0, %r12 1605289848Sjkim 1606289848Sjkim mulq %rbx 1607289848Sjkim addq %rax, %r13 1608289848Sjkim movq 48(%rbp), %rax 1609289848Sjkim adcq \$0, %rdx 1610289848Sjkim addq %r13, %r12 1611289848Sjkim movq %rdx, %r13 1612289848Sjkim adcq \$0, %r13 1613289848Sjkim 1614289848Sjkim mulq %rbx 1615289848Sjkim addq %rax, %r14 
1616289848Sjkim movq 56(%rbp), %rax 1617289848Sjkim adcq \$0, %rdx 1618289848Sjkim addq %r14, %r13 1619289848Sjkim movq %rdx, %r14 1620289848Sjkim adcq \$0, %r14 1621289848Sjkim 1622289848Sjkim mulq %rbx 1623289848Sjkim movq %rsi, %rbx 1624289848Sjkim addq %rax, %r15 1625289848Sjkim movq 0(%rbp), %rax 1626289848Sjkim adcq \$0, %rdx 1627289848Sjkim addq %r15, %r14 1628289848Sjkim movq %rdx, %r15 1629289848Sjkim adcq \$0, %r15 1630289848Sjkim 1631289848Sjkim decl %ecx 1632289848Sjkim jne .Lreduction_loop 1633289848Sjkim 1634289848Sjkim ret 1635289848Sjkim.size __rsaz_512_reduce,.-__rsaz_512_reduce 1636289848Sjkim___ 1637289848Sjkim} 1638289848Sjkimif ($addx) { 1639289848Sjkim # __rsaz_512_reducex 1640289848Sjkim # 1641289848Sjkim # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0 1642289848Sjkim # output: %r8-%r15 1643289848Sjkim # clobbers: everything except %rbp and %rdi 1644289848Sjkim$code.=<<___; 1645289848Sjkim.type __rsaz_512_reducex,\@abi-omnipotent 1646289848Sjkim.align 32 1647289848Sjkim__rsaz_512_reducex: 1648289848Sjkim #movq 128+8(%rsp), %rdx # pull $n0 1649289848Sjkim imulq %r8, %rdx 1650289848Sjkim xorq %rsi, %rsi # cf=0,of=0 1651289848Sjkim movl \$8, %ecx 1652289848Sjkim jmp .Lreduction_loopx 1653289848Sjkim 1654289848Sjkim.align 32 1655289848Sjkim.Lreduction_loopx: 1656289848Sjkim mov %r8, %rbx 1657289848Sjkim mulx 0(%rbp), %rax, %r8 1658289848Sjkim adcx %rbx, %rax 1659289848Sjkim adox %r9, %r8 1660289848Sjkim 1661289848Sjkim mulx 8(%rbp), %rax, %r9 1662289848Sjkim adcx %rax, %r8 1663289848Sjkim adox %r10, %r9 1664289848Sjkim 1665289848Sjkim mulx 16(%rbp), %rbx, %r10 1666289848Sjkim adcx %rbx, %r9 1667289848Sjkim adox %r11, %r10 1668289848Sjkim 1669289848Sjkim mulx 24(%rbp), %rbx, %r11 1670289848Sjkim adcx %rbx, %r10 1671289848Sjkim adox %r12, %r11 1672289848Sjkim 1673289848Sjkim .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 32(%rbp), %rbx, %r12 1674289848Sjkim mov %rdx, %rax 1675289848Sjkim mov %r8, %rdx 1676289848Sjkim adcx %rbx, %r11 
1677289848Sjkim adox %r13, %r12 1678289848Sjkim 1679289848Sjkim mulx 128+8(%rsp), %rbx, %rdx 1680289848Sjkim mov %rax, %rdx 1681289848Sjkim 1682289848Sjkim mulx 40(%rbp), %rax, %r13 1683289848Sjkim adcx %rax, %r12 1684289848Sjkim adox %r14, %r13 1685289848Sjkim 1686289848Sjkim .byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 # mulx 48(%rbp), %rax, %r14 1687289848Sjkim adcx %rax, %r13 1688289848Sjkim adox %r15, %r14 1689289848Sjkim 1690289848Sjkim mulx 56(%rbp), %rax, %r15 1691289848Sjkim mov %rbx, %rdx 1692289848Sjkim adcx %rax, %r14 1693289848Sjkim adox %rsi, %r15 # %rsi is 0 1694289848Sjkim adcx %rsi, %r15 # cf=0 1695289848Sjkim 1696289848Sjkim decl %ecx # of=0 1697289848Sjkim jne .Lreduction_loopx 1698289848Sjkim 1699289848Sjkim ret 1700289848Sjkim.size __rsaz_512_reducex,.-__rsaz_512_reducex 1701289848Sjkim___ 1702289848Sjkim} 1703289848Sjkim{ # __rsaz_512_subtract 1704289848Sjkim # input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask 1705289848Sjkim # output: 1706289848Sjkim # clobbers: everything but %rdi, %rsi and %rbp 1707289848Sjkim$code.=<<___; 1708289848Sjkim.type __rsaz_512_subtract,\@abi-omnipotent 1709289848Sjkim.align 32 1710289848Sjkim__rsaz_512_subtract: 1711289848Sjkim movq %r8, ($out) 1712289848Sjkim movq %r9, 8($out) 1713289848Sjkim movq %r10, 16($out) 1714289848Sjkim movq %r11, 24($out) 1715289848Sjkim movq %r12, 32($out) 1716289848Sjkim movq %r13, 40($out) 1717289848Sjkim movq %r14, 48($out) 1718289848Sjkim movq %r15, 56($out) 1719289848Sjkim 1720289848Sjkim movq 0($mod), %r8 1721289848Sjkim movq 8($mod), %r9 1722289848Sjkim negq %r8 1723289848Sjkim notq %r9 1724289848Sjkim andq %rcx, %r8 1725289848Sjkim movq 16($mod), %r10 1726289848Sjkim andq %rcx, %r9 1727289848Sjkim notq %r10 1728289848Sjkim movq 24($mod), %r11 1729289848Sjkim andq %rcx, %r10 1730289848Sjkim notq %r11 1731289848Sjkim movq 32($mod), %r12 1732289848Sjkim andq %rcx, %r11 1733289848Sjkim notq %r12 1734289848Sjkim movq 40($mod), %r13 1735289848Sjkim andq %rcx, %r12 
1736289848Sjkim notq %r13 1737289848Sjkim movq 48($mod), %r14 1738289848Sjkim andq %rcx, %r13 1739289848Sjkim notq %r14 1740289848Sjkim movq 56($mod), %r15 1741289848Sjkim andq %rcx, %r14 1742289848Sjkim notq %r15 1743289848Sjkim andq %rcx, %r15 1744289848Sjkim 1745289848Sjkim addq ($out), %r8 1746289848Sjkim adcq 8($out), %r9 1747289848Sjkim adcq 16($out), %r10 1748289848Sjkim adcq 24($out), %r11 1749289848Sjkim adcq 32($out), %r12 1750289848Sjkim adcq 40($out), %r13 1751289848Sjkim adcq 48($out), %r14 1752289848Sjkim adcq 56($out), %r15 1753289848Sjkim 1754289848Sjkim movq %r8, ($out) 1755289848Sjkim movq %r9, 8($out) 1756289848Sjkim movq %r10, 16($out) 1757289848Sjkim movq %r11, 24($out) 1758289848Sjkim movq %r12, 32($out) 1759289848Sjkim movq %r13, 40($out) 1760289848Sjkim movq %r14, 48($out) 1761289848Sjkim movq %r15, 56($out) 1762289848Sjkim 1763289848Sjkim ret 1764289848Sjkim.size __rsaz_512_subtract,.-__rsaz_512_subtract 1765289848Sjkim___ 1766289848Sjkim} 1767289848Sjkim{ # __rsaz_512_mul 1768289848Sjkim # 1769289848Sjkim # input: %rsi - ap, %rbp - bp 1770289848Sjkim # ouput: 1771289848Sjkim # clobbers: everything 1772289848Sjkimmy ($ap,$bp) = ("%rsi","%rbp"); 1773289848Sjkim$code.=<<___; 1774289848Sjkim.type __rsaz_512_mul,\@abi-omnipotent 1775289848Sjkim.align 32 1776289848Sjkim__rsaz_512_mul: 1777289848Sjkim leaq 8(%rsp), %rdi 1778289848Sjkim 1779289848Sjkim movq ($ap), %rax 1780289848Sjkim mulq %rbx 1781289848Sjkim movq %rax, (%rdi) 1782289848Sjkim movq 8($ap), %rax 1783289848Sjkim movq %rdx, %r8 1784289848Sjkim 1785289848Sjkim mulq %rbx 1786289848Sjkim addq %rax, %r8 1787289848Sjkim movq 16($ap), %rax 1788289848Sjkim movq %rdx, %r9 1789289848Sjkim adcq \$0, %r9 1790289848Sjkim 1791289848Sjkim mulq %rbx 1792289848Sjkim addq %rax, %r9 1793289848Sjkim movq 24($ap), %rax 1794289848Sjkim movq %rdx, %r10 1795289848Sjkim adcq \$0, %r10 1796289848Sjkim 1797289848Sjkim mulq %rbx 1798289848Sjkim addq %rax, %r10 1799289848Sjkim movq 32($ap), %rax 1800289848Sjkim 
movq %rdx, %r11 1801289848Sjkim adcq \$0, %r11 1802289848Sjkim 1803289848Sjkim mulq %rbx 1804289848Sjkim addq %rax, %r11 1805289848Sjkim movq 40($ap), %rax 1806289848Sjkim movq %rdx, %r12 1807289848Sjkim adcq \$0, %r12 1808289848Sjkim 1809289848Sjkim mulq %rbx 1810289848Sjkim addq %rax, %r12 1811289848Sjkim movq 48($ap), %rax 1812289848Sjkim movq %rdx, %r13 1813289848Sjkim adcq \$0, %r13 1814289848Sjkim 1815289848Sjkim mulq %rbx 1816289848Sjkim addq %rax, %r13 1817289848Sjkim movq 56($ap), %rax 1818289848Sjkim movq %rdx, %r14 1819289848Sjkim adcq \$0, %r14 1820289848Sjkim 1821289848Sjkim mulq %rbx 1822289848Sjkim addq %rax, %r14 1823289848Sjkim movq ($ap), %rax 1824289848Sjkim movq %rdx, %r15 1825289848Sjkim adcq \$0, %r15 1826289848Sjkim 1827289848Sjkim leaq 8($bp), $bp 1828289848Sjkim leaq 8(%rdi), %rdi 1829289848Sjkim 1830289848Sjkim movl \$7, %ecx 1831289848Sjkim jmp .Loop_mul 1832289848Sjkim 1833289848Sjkim.align 32 1834289848Sjkim.Loop_mul: 1835289848Sjkim movq ($bp), %rbx 1836289848Sjkim mulq %rbx 1837289848Sjkim addq %rax, %r8 1838289848Sjkim movq 8($ap), %rax 1839289848Sjkim movq %r8, (%rdi) 1840289848Sjkim movq %rdx, %r8 1841289848Sjkim adcq \$0, %r8 1842289848Sjkim 1843289848Sjkim mulq %rbx 1844289848Sjkim addq %rax, %r9 1845289848Sjkim movq 16($ap), %rax 1846289848Sjkim adcq \$0, %rdx 1847289848Sjkim addq %r9, %r8 1848289848Sjkim movq %rdx, %r9 1849289848Sjkim adcq \$0, %r9 1850289848Sjkim 1851289848Sjkim mulq %rbx 1852289848Sjkim addq %rax, %r10 1853289848Sjkim movq 24($ap), %rax 1854289848Sjkim adcq \$0, %rdx 1855289848Sjkim addq %r10, %r9 1856289848Sjkim movq %rdx, %r10 1857289848Sjkim adcq \$0, %r10 1858289848Sjkim 1859289848Sjkim mulq %rbx 1860289848Sjkim addq %rax, %r11 1861289848Sjkim movq 32($ap), %rax 1862289848Sjkim adcq \$0, %rdx 1863289848Sjkim addq %r11, %r10 1864289848Sjkim movq %rdx, %r11 1865289848Sjkim adcq \$0, %r11 1866289848Sjkim 1867289848Sjkim mulq %rbx 1868289848Sjkim addq %rax, %r12 1869289848Sjkim movq 40($ap), %rax 
	# Tail of the schoolbook multiply: finish accumulating the current
	# b-word into limbs r11..r15, advance the b pointer, and loop.
	adcq	\$0, %rdx
	addq	%r12, %r11
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r13
	movq	48($ap), %rax
	adcq	\$0, %rdx
	addq	%r13, %r12
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r14
	movq	56($ap), %rax
	adcq	\$0, %rdx
	addq	%r14, %r13
	movq	%rdx, %r14
	leaq	8($bp), $bp		# advance to next b-word
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r15
	movq	($ap), %rax		# reload a[0] for the next iteration
	adcq	\$0, %rdx
	addq	%r15, %r14
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8(%rdi), %rdi		# emit one low limb per iteration

	decl	%ecx
	jnz	.Loop_mul

	# Store the remaining high half of the 1024-bit product.
	movq	%r8, (%rdi)
	movq	%r9, 8(%rdi)
	movq	%r10, 16(%rdi)
	movq	%r11, 24(%rdi)
	movq	%r12, 32(%rdi)
	movq	%r13, 40(%rdi)
	movq	%r14, 48(%rdi)
	movq	%r15, 56(%rdi)

	ret
.size	__rsaz_512_mul,.-__rsaz_512_mul
___
}
if ($addx) {
# __rsaz_512_mulx
#
# 512x512-bit multiply using MULX/ADCX/ADOX (BMI2+ADX); maintains two
# independent carry chains (CF via adcx, OF via adox) per iteration.
#
# input: %rsi - ap, %rbp - bp; first b-word preloaded in %rdx by caller
# output: 1024-bit product stored at 8+64-8(%rsp) onward
# clobbers: everything
my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
$code.=<<___;
.type	__rsaz_512_mulx,\@abi-omnipotent
.align	32
__rsaz_512_mulx:
	mulx	($ap), %rbx, %r8	# initial %rdx preloaded by caller
	mov	\$-6, %rcx		# loop counter: 6 middle iterations

	mulx	8($ap), %rax, %r9
	movq	%rbx, 8(%rsp)		# stash lowest product limb

	mulx	16($ap), %rbx, %r10
	adc	%rax, %r8

	mulx	24($ap), %rax, %r11
	adc	%rbx, %r9

	mulx	32($ap), %rbx, %r12
	adc	%rax, %r10

	mulx	40($ap), %rax, %r13
	adc	%rbx, %r11

	mulx	48($ap), %rbx, %r14
	adc	%rax, %r12

	mulx	56($ap), %rax, %r15
	mov	8($bp), %rdx		# load next b-word for the loop
	adc	%rbx, %r13
	adc	%rax, %r14
	adc	\$0, %r15

	xor	$zero, $zero		# cf=0,of=0; also zero for adox/adcx caps
	jmp	.Loop_mulx

.align	32
.Loop_mulx:
	movq	%r8, %rbx		# completed limb, written out below
	mulx	($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	mulx	8($ap), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	mulx	16($ap), %rax, %r10
	adcx	%rax, %r9
	adox	%r11, %r10

	mulx	24($ap), %rax, %r11
	adcx	%rax, %r10
	adox	%r12, %r11

	.byte	0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00	# mulx	32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	mulx	48($ap), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	mulx	56($ap), %rax, %r15
	movq	64($bp,%rcx,8), %rdx	# next b-word (rcx runs -6..-1)
	movq	%rbx, 8+64-8(%rsp,%rcx,8)	# store completed limb
	adcx	%rax, %r14
	adox	$zero, %r15		# fold OF chain into top limb
	adcx	$zero, %r15		# cf=0

	inc	%rcx			# of=0
	jnz	.Loop_mulx

	# Final iteration (last b-word already in %rdx), unrolled so the
	# result limbs can be stored without the loop's pointer arithmetic.
	movq	%r8, %rbx
	mulx	($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	.byte	0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00	# mulx	8($ap), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	.byte	0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00	# mulx	16($ap), %rax, %r10
	adcx	%rax, %r9
	adox	%r11, %r10

	mulx	24($ap), %rax, %r11
	adcx	%rax, %r10
	adox	%r12, %r11

	mulx	32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($ap), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	.byte	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00	# mulx	56($ap), %rax, %r15
	adcx	%rax, %r14
	adox	$zero, %r15
	adcx	$zero, %r15

	# Store the high half of the product.
	mov	%rbx, 8+64-8(%rsp)
	mov	%r8, 8+64(%rsp)
	mov	%r9, 8+64+8(%rsp)
	mov	%r10, 8+64+16(%rsp)
	mov	%r11, 8+64+24(%rsp)
	mov	%r12, 8+64+32(%rsp)
	mov	%r13, 8+64+40(%rsp)
	mov	%r14, 8+64+48(%rsp)
	mov	%r15, 8+64+56(%rsp)

	ret
.size	__rsaz_512_mulx,.-__rsaz_512_mulx
___
}
{
# rsaz_512_scatter4 / rsaz_512_gather4: move one 512-bit value into/out
# of a power table with a 128-byte stride between corresponding qwords.
my ($out,$inp,$power)= $win64 ?
			    ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
$code.=<<___;
.globl	rsaz_512_scatter4
.type	rsaz_512_scatter4,\@abi-omnipotent
.align	16
rsaz_512_scatter4:
	leaq	($out,$power,8), $out	# column selected by power index
	movl	\$8, %r9d		# 8 qwords per 512-bit entry
	jmp	.Loop_scatter
.align	16
.Loop_scatter:
	movq	($inp), %rax
	leaq	8($inp), $inp
	movq	%rax, ($out)
	leaq	128($out), $out		# next row: 16 entries x 8 bytes
	decl	%r9d
	jnz	.Loop_scatter
	ret
.size	rsaz_512_scatter4,.-rsaz_512_scatter4

.globl	rsaz_512_gather4
.type	rsaz_512_gather4,\@abi-omnipotent
.align	16
rsaz_512_gather4:
___
$code.=<<___ if ($win64);
.LSEH_begin_rsaz_512_gather4:
	.byte	0x48,0x81,0xec,0xa8,0x00,0x00,0x00	# sub	$0xa8,%rsp
	.byte	0x0f,0x29,0x34,0x24			# movaps %xmm6,(%rsp)
	.byte	0x0f,0x29,0x7c,0x24,0x10		# movaps %xmm7,0x10(%rsp)
	.byte	0x44,0x0f,0x29,0x44,0x24,0x20		# movaps %xmm8,0x20(%rsp)
	.byte	0x44,0x0f,0x29,0x4c,0x24,0x30		# movaps %xmm9,0x30(%rsp)
	.byte	0x44,0x0f,0x29,0x54,0x24,0x40		# movaps %xmm10,0x40(%rsp)
	.byte	0x44,0x0f,0x29,0x5c,0x24,0x50		# movaps %xmm11,0x50(%rsp)
	.byte	0x44,0x0f,0x29,0x64,0x24,0x60		# movaps %xmm12,0x60(%rsp)
	.byte	0x44,0x0f,0x29,0x6c,0x24,0x70		# movaps %xmm13,0x70(%rsp)
	.byte	0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0	# movaps %xmm14,0x80(%rsp)
	.byte	0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0	# movaps %xmm15,0x90(%rsp)
___
$code.=<<___;
	movd	$power,%xmm8
	movdqa	.Linc+16(%rip),%xmm1	# 00000002000000020000000200000002
	movdqa	.Linc(%rip),%xmm0	# 00000001000000010000000000000000

	pshufd	\$0,%xmm8,%xmm8		# broadcast $power
	movdqa	%xmm1,%xmm7
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..15 to $power: builds xmm0..xmm7 so that
# exactly one dword lane pair is all-ones (the selected table column).
# The gather below then reads ALL 16 entries and selects with pand/por,
# so the load addresses never depend on the secret power index.
#
for($i=0;$i<4;$i++) {
$code.=<<___;
	paddd	%xmm`$i`,%xmm`$i+1`
	pcmpeqd	%xmm8,%xmm`$i`
	movdqa	%xmm7,%xmm`$i+3`
___
}
for(;$i<7;$i++) {
$code.=<<___;
	paddd	%xmm`$i`,%xmm`$i+1`
	pcmpeqd	%xmm8,%xmm`$i`
___
}
$code.=<<___;
	pcmpeqd	%xmm8,%xmm7
	movl	\$8, %r9d		# 8 qword rows per 512-bit entry
	jmp	.Loop_gather
.align	16
.Loop_gather:
	movdqa	16*0($inp),%xmm8	# scan the whole 128-byte row
	movdqa	16*1($inp),%xmm9
	movdqa	16*2($inp),%xmm10
	movdqa	16*3($inp),%xmm11
	pand	%xmm0,%xmm8		# keep only the selected column
	movdqa	16*4($inp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	16*5($inp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	16*6($inp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	16*7($inp),%xmm15
	leaq	128($inp), $inp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8		# fold the masked lanes together
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	\$0x4e,%xmm8,%xmm9	# swap 64-bit halves
	por	%xmm9,%xmm8		# surviving qword in low half
	movq	%xmm8,($out)
	leaq	8($out), $out
	decl	%r9d
	jnz	.Loop_gather
___
$code.=<<___ if ($win64);
	movaps	0x00(%rsp),%xmm6	# restore non-volatile xmm regs
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	movaps	0x50(%rsp),%xmm11
	movaps	0x60(%rsp),%xmm12
	movaps	0x70(%rsp),%xmm13
	movaps	0x80(%rsp),%xmm14
	movaps	0x90(%rsp),%xmm15
	add	\$0xa8,%rsp
___
$code.=<<___;
	ret
.LSEH_end_rsaz_512_gather4:
.size	rsaz_512_gather4,.-rsaz_512_gather4

.align	64
.Linc:
	.long	0,0, 1,1		# increments for the 0..15 lane counter
	.long	2,2, 2,2
___
}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
# Win64 structured-exception handler shared by the rsaz_512_* routines:
# locates the saved non-volatile registers on the stack (using the
# prologue/epilogue labels recorded in HandlerData) and restores them
# into the CONTEXT before resuming the unwind via RtlVirtualUnwind.
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	128+24+48(%rax),%rax	# skip fixed frame to saved GPRs

	lea	.Lmul_gather4_epilogue(%rip),%rbx
	cmp	%r10,%rbx
	jne	.Lse_not_in_mul_gather4

	# mul_gather4 frame also holds xmm6-15 spills: skip 0xa8+8 bytes
	# and copy the saved xmm registers back into the CONTEXT.
	lea	0xb0(%rax),%rax

	lea	-48-0xa8(%rax),%rsi
	lea	512($context),%rdi	# &context->Xmm6
	mov	\$20,%ecx		# 20 xmmwords = xmm6..xmm15
	.long	0xa548f3fc		# cld; rep movsq

.Lse_not_in_mul_gather4:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_rsaz_512_sqr
	.rva	.LSEH_end_rsaz_512_sqr
	.rva	.LSEH_info_rsaz_512_sqr

	.rva	.LSEH_begin_rsaz_512_mul
	.rva	.LSEH_end_rsaz_512_mul
	.rva	.LSEH_info_rsaz_512_mul

	.rva	.LSEH_begin_rsaz_512_mul_gather4
	.rva	.LSEH_end_rsaz_512_mul_gather4
	.rva	.LSEH_info_rsaz_512_mul_gather4

	.rva	.LSEH_begin_rsaz_512_mul_scatter4
	.rva	.LSEH_end_rsaz_512_mul_scatter4
	.rva	.LSEH_info_rsaz_512_mul_scatter4

	.rva	.LSEH_begin_rsaz_512_mul_by_one
	.rva	.LSEH_end_rsaz_512_mul_by_one
	.rva	.LSEH_info_rsaz_512_mul_by_one

	.rva	.LSEH_begin_rsaz_512_gather4
	.rva	.LSEH_end_rsaz_512_gather4
	.rva	.LSEH_info_rsaz_512_gather4

.section	.xdata
.align	8
.LSEH_info_rsaz_512_sqr:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lsqr_body,.Lsqr_epilogue			# HandlerData[]
.LSEH_info_rsaz_512_mul:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_body,.Lmul_epilogue			# HandlerData[]
.LSEH_info_rsaz_512_mul_gather4:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_gather4_body,.Lmul_gather4_epilogue	# HandlerData[]
.LSEH_info_rsaz_512_mul_scatter4:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_scatter4_body,.Lmul_scatter4_epilogue	# HandlerData[]
.LSEH_info_rsaz_512_mul_by_one:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_by_one_body,.Lmul_by_one_epilogue		# HandlerData[]
.LSEH_info_rsaz_512_gather4:
	# UNWIND_INFO encoded by hand (leaf-style frame: xmm spills + sub rsp)
	.byte	0x01,0x46,0x16,0x00
	.byte	0x46,0xf8,0x09,0x00	# vmovaps 0x90(rsp),xmm15
	.byte	0x3d,0xe8,0x08,0x00	# vmovaps 0x80(rsp),xmm14
	.byte	0x34,0xd8,0x07,0x00	# vmovaps 0x70(rsp),xmm13
	.byte	0x2e,0xc8,0x06,0x00	# vmovaps 0x60(rsp),xmm12
	.byte	0x28,0xb8,0x05,0x00	# vmovaps 0x50(rsp),xmm11
	.byte	0x22,0xa8,0x04,0x00	# vmovaps 0x40(rsp),xmm10
	.byte	0x1c,0x98,0x03,0x00	# vmovaps 0x30(rsp),xmm9
	.byte	0x16,0x88,0x02,0x00	# vmovaps 0x20(rsp),xmm8
	.byte	0x10,0x78,0x01,0x00	# vmovaps 0x10(rsp),xmm7
	.byte	0x0b,0x68,0x00,0x00	# vmovaps 0x00(rsp),xmm6
	.byte	0x07,0x01,0x15,0x00	# sub     rsp,0xa8
___
}

# Expand perlasm backtick expressions (e.g. register numbers computed in
# the mask-generation loops above), then emit the assembly to stdout.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;