#!/usr/bin/env perl

##############################################################################
#                                                                            #
#  Copyright (c) 2012, Intel Corporation                                     #
#                                                                            #
#  All rights reserved.                                                      #
#                                                                            #
#  Redistribution and use in source and binary forms, with or without        #
#  modification, are permitted provided that the following conditions are    #
#  met:                                                                      #
#                                                                            #
#  *  Redistributions of source code must retain the above copyright         #
#     notice, this list of conditions and the following disclaimer.          #
#                                                                            #
#  *  Redistributions in binary form must reproduce the above copyright      #
#     notice, this list of conditions and the following disclaimer in the    #
#     documentation and/or other materials provided with the                 #
#     distribution.                                                          #
#                                                                            #
#  *  Neither the name of the Intel Corporation nor the names of its         #
#     contributors may be used to endorse or promote products derived from   #
#     this software without specific prior written permission.               #
#                                                                            #
#                                                                            #
#  THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY          #
#  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE         #
#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR        #
#  PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL INTEL CORPORATION OR           #
#  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,     #
#  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,       #
#  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR        #
#  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF    #
#  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING      #
#  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS        #
#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.              #
#                                                                            #
##############################################################################
# Developers and authors:                                                    #
# Shay Gueron (1, 2), and Vlad Krasnov (1)                                   #
# (1) Intel Architecture Group, Microprocessor and Chipset Development,      #
#     Israel Development Center, Haifa, Israel                               #
# (2) University of Haifa                                                    #
##############################################################################
# Reference:                                                                 #
# [1] S. Gueron, "Efficient Software Implementations of Modular              #
#     Exponentiation", http://eprint.iacr.org/2011/239                       #
# [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring".             #
#     IEEE Proceedings of 9th International Conference on Information        #
#     Technology: New Generations (ITNG 2012), 821-823 (2012).               #
# [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation#
#     Journal of Cryptographic Engineering 2:31-43 (2012).                   #
# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis    #
#     resistant 512-bit and 1024-bit modular exponentiation for optimizing   #
#     RSA1024 and RSA2048 on x86_64 platforms",                              #
#     http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest#
##############################################################################

# While original submission covers 512- and 1024-bit exponentiation,
# this module is limited to 512-bit version only (and as such
# accelerates RSA1024 sign). This is because improvement for longer
# keys is not high enough to justify the effort, highest measured
# was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, upcoming
# for the moment of this writing!] Nor does this module implement
# "monolithic" complete exponentiation jumbo-subroutine, but adheres
# to more modular mixture of C and assembly. And it's optimized even
# for processors other than Intel Core family (see table below for
# improvement coefficients).
# <appro@openssl.org>
#
# RSA1024 sign/sec	this/original	|this/rsax(*)	this/fips(*)
#	----------------+---------------------------
# Opteron	+13%	|+5%	+20%
# Bulldozer	-0%	|-1%	+10%
# P4		+11%	|+7%	+8%
# Westmere	+5%	|+14%	+17%
# Sandy Bridge	+2%	|+12%	+29%
# Ivy Bridge	+1%	|+11%	+35%
# Haswell(**)	-0%	|+12%	+39%
# Atom		+13%	|+11%	+4%
# VIA Nano	+70%	|+9%	+25%
#
# (*)	rsax engine and fips numbers are presented for reference
#	purposes;
# (**)	MULX was attempted, but found to give only marginal improvement;

# Command-line interface: first argument is the perlasm flavour, second
# the output file.  A single argument that looks like a filename (contains
# a dot) selects the output file and leaves the flavour undefined.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Windows-style assembler output is requested either by flavour or by a
# ".asm" output file name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator next to this script or in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# All generated code is piped through the translator into $output.
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

# Probe the available assembler versions to decide whether the
# ADCX/ADOX+MULX code paths may be emitted ($addx).
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
	=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.23);
}

if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.10);
}

if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	`ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$addx = ($1>=12);
}

# Clang pretends to be GNU as but supports ADCX/ADOX from 3.03 on.
if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$addx = ($ver>=3.03);
}

($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp");	# common internal API
{
# rsaz_512_sqr: $times successive 512-bit squarings of inp, with the
# result reduced via __rsaz_512_reduce/__rsaz_512_subtract (defined
# elsewhere in this file).  Register map per \@function,5 below.
# NOTE(review): presumably Montgomery reduction given $n0 — confirm
# against the C callers.
my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");

$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	rsaz_512_sqr
.type	rsaz_512_sqr,\@function,5
.align	32
rsaz_512_sqr:				# 25-29% faster than rsaz_512_mul
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	subq	\$128+24, %rsp
.Lsqr_body:
	movq	$mod, %xmm1		# common off-load
	movq	($inp), %rdx
	movq	8($inp), %rax
	movq	$n0, 128(%rsp)
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
	je	.Loop_sqrx
___
$code.=<<___;
	jmp	.Loop_sqr

.align	32
.Loop_sqr:
	movl	$times,128+8(%rsp)
#first iteration
	movq	%rdx, %rbx		# 0($inp)
	mov	%rax, %rbp		# 8($inp)
	mulq	%rdx
	movq	%rax, %r8
	movq	16($inp), %rax
	movq	%rdx, %r9

	mulq	%rbx
	addq	%rax, %r9
	movq	24($inp), %rax
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r10
	movq	32($inp), %rax
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r11
	movq	40($inp), %rax
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r12
	movq	48($inp), %rax
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r13
	movq	56($inp), %rax
	movq	%rdx, %r14
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r14
	movq	%rbx, %rax
	adcq	\$0, %rdx

	xorq	%rcx,%rcx		# rcx:r8 = r8 << 1
	addq	%r8, %r8
	movq	%rdx, %r15
	adcq	\$0, %rcx

	mulq	%rax
	addq	%r8, %rdx
	adcq	\$0, %rcx

	movq	%rax, (%rsp)
	movq	%rdx, 8(%rsp)

#second iteration
	movq	16($inp), %rax
	mulq	%rbp
	addq	%rax, %r10
	movq	24($inp), %rax
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%rbp
	addq	%rax, %r11
	movq	32($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r11
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%rbp
	addq	%rax, %r12
	movq	40($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r12
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%rbp
	addq	%rax, %r13
	movq	48($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r13
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%rbp
	addq	%rax, %r14
	movq	56($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r14
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%rbp
	addq	%rax, %r15
	movq	%rbp, %rax
	adcq	\$0, %rdx
	addq	%rbx, %r15
	adcq	\$0, %rdx

	xorq	%rbx, %rbx		# rbx:r10:r9 = r10:r9 << 1
	addq	%r9, %r9
	movq	%rdx, %r8
	adcq	%r10, %r10
	adcq	\$0, %rbx

	mulq	%rax
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rcx, %rax
	movq	16($inp), %rbp
	addq	%rax, %r9
	movq	24($inp), %rax
	adcq	%rdx, %r10
	adcq	\$0, %rbx

	movq	%r9, 16(%rsp)
	movq	%r10, 24(%rsp)

#third iteration
	mulq	%rbp
	addq	%rax, %r12
	movq	32($inp), %rax
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%rbp
	addq	%rax, %r13
	movq	40($inp), %rax
	adcq	\$0, %rdx
	addq	%rcx, %r13
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%rbp
	addq	%rax, %r14
	movq	48($inp), %rax
	adcq	\$0, %rdx
	addq	%rcx, %r14
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%rbp
	addq	%rax, %r15
	movq	56($inp), %rax
	adcq	\$0, %rdx
	addq	%rcx, %r15
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%rbp
	addq	%rax, %r8
	movq	%rbp, %rax
	adcq	\$0, %rdx
	addq	%rcx, %r8
	adcq	\$0, %rdx

	xorq	%rcx, %rcx		# rcx:r12:r11 = r12:r11 << 1
	addq	%r11, %r11
	movq	%rdx, %r9
	adcq	%r12, %r12
	adcq	\$0, %rcx

	mulq	%rax
	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rbx, %rax
	movq	24($inp), %r10
	addq	%rax, %r11
	movq	32($inp), %rax
	adcq	%rdx, %r12
	adcq	\$0, %rcx

	movq	%r11, 32(%rsp)
	movq	%r12, 40(%rsp)

#fourth iteration
	mov	%rax, %r11		# 32($inp)
	mulq	%r10
	addq	%rax, %r14
	movq	40($inp), %rax
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mov	%rax, %r12		# 40($inp)
	mulq	%r10
	addq	%rax, %r15
	movq	48($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r15
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mov	%rax, %rbp		# 48($inp)
	mulq	%r10
	addq	%rax, %r8
	movq	56($inp), %rax
	adcq	\$0, %rdx
	addq	%rbx, %r8
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r10
	addq	%rax, %r9
	movq	%r10, %rax
	adcq	\$0, %rdx
	addq	%rbx, %r9
	adcq	\$0, %rdx

	xorq	%rbx, %rbx		# rbx:r13:r14 = r13:r14 << 1
	addq	%r13, %r13
	movq	%rdx, %r10
	adcq	%r14, %r14
	adcq	\$0, %rbx

	mulq	%rax
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rcx, %rax
	addq	%rax, %r13
	movq	%r12, %rax		# 40($inp)
	adcq	%rdx, %r14
	adcq	\$0, %rbx

	movq	%r13, 48(%rsp)
	movq	%r14, 56(%rsp)

#fifth iteration
	mulq	%r11
	addq	%rax, %r8
	movq	%rbp, %rax		# 48($inp)
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mulq	%r11
	addq	%rax, %r9
	movq	56($inp), %rax
	adcq	\$0, %rdx
	addq	%rcx, %r9
	movq	%rdx, %rcx
	adcq	\$0, %rcx

	mov	%rax, %r14		# 56($inp)
	mulq	%r11
	addq	%rax, %r10
	movq	%r11, %rax
	adcq	\$0, %rdx
	addq	%rcx, %r10
	adcq	\$0, %rdx

	xorq	%rcx, %rcx		# rcx:r8:r15 = r8:r15 << 1
	addq	%r15, %r15
	movq	%rdx, %r11
	adcq	%r8, %r8
	adcq	\$0, %rcx

	mulq	%rax
	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rbx, %rax
	addq	%rax, %r15
	movq	%rbp, %rax		# 48($inp)
	adcq	%rdx, %r8
	adcq	\$0, %rcx

	movq	%r15, 64(%rsp)
	movq	%r8, 72(%rsp)

#sixth iteration
	mulq	%r12
	addq	%rax, %r10
	movq	%r14, %rax		# 56($inp)
	movq	%rdx, %rbx
	adcq	\$0, %rbx

	mulq	%r12
	addq	%rax, %r11
	movq	%r12, %rax
	adcq	\$0, %rdx
	addq	%rbx, %r11
	adcq	\$0, %rdx

	xorq	%rbx, %rbx		# rbx:r10:r9 = r10:r9 << 1
	addq	%r9, %r9
	movq	%rdx, %r12
	adcq	%r10, %r10
	adcq	\$0, %rbx

	mulq	%rax
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rcx, %rax
	addq	%rax, %r9
	movq	%r14, %rax		# 56($inp)
	adcq	%rdx, %r10
	adcq	\$0, %rbx

	movq	%r9, 80(%rsp)
	movq	%r10, 88(%rsp)

#seventh iteration
	mulq	%rbp
	addq	%rax, %r12
	movq	%rbp, %rax
	adcq	\$0, %rdx

	xorq	%rcx, %rcx		# rcx:r12:r11 = r12:r11 << 1
	addq	%r11, %r11
	movq	%rdx, %r13
	adcq	%r12, %r12
	adcq	\$0, %rcx

	mulq	%rax
	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rbx, %rax
	addq	%rax, %r11
	movq	%r14, %rax		# 56($inp)
	adcq	%rdx, %r12
	adcq	\$0, %rcx

	movq	%r11, 96(%rsp)
	movq	%r12, 104(%rsp)

#eighth iteration
	xorq	%rbx, %rbx		# rbx:r13 = r13 << 1
	addq	%r13, %r13
	adcq	\$0, %rbx

	mulq	%rax
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	addq	%rcx, %rax
	addq	%r13, %rax
	adcq	%rbx, %rdx

	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15
	movq	%xmm1, %rbp

	movq	%rax, 112(%rsp)
	movq	%rdx, 120(%rsp)

	call	__rsaz_512_reduce

	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	movq	%r8, %rdx
	movq	%r9, %rax
	movl	128+8(%rsp), $times
	movq	$out, $inp

	decl	$times
	jnz	.Loop_sqr
___
if ($addx) {
$code.=<<___;
	jmp	.Lsqr_tail

.align	32
.Loop_sqrx:
	movl	$times,128+8(%rsp)
	movq	$out, %xmm0		# off-load
#first iteration
	mulx	%rax, %r8, %r9
	mov	%rax, %rbx

	mulx	16($inp), %rcx, %r10
	xor	%rbp, %rbp		# cf=0, of=0

	mulx	24($inp), %rax, %r11
	adcx	%rcx, %r9

	.byte	0xc4,0x62,0xf3,0xf6,0xa6,0x20,0x00,0x00,0x00	# mulx	32($inp), %rcx, %r12
	adcx	%rax, %r10

	.byte	0xc4,0x62,0xfb,0xf6,0xae,0x28,0x00,0x00,0x00	# mulx	40($inp), %rax, %r13
	adcx	%rcx, %r11

	mulx	48($inp), %rcx, %r14
	adcx	%rax, %r12
	adcx	%rcx, %r13

	mulx	56($inp), %rax, %r15
	adcx	%rax, %r14
	adcx	%rbp, %r15		# %rbp is 0

	mulx	%rdx, %rax, $out
	mov	%rbx, %rdx		# 8($inp)
	xor	%rcx, %rcx
	adox	%r8, %r8
	adcx	$out, %r8
	adox	%rbp, %rcx
	adcx	%rbp, %rcx

	mov	%rax, (%rsp)
	mov	%r8, 8(%rsp)

#second iteration
	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x10,0x00,0x00,0x00	# mulx	16($inp), %rax, %rbx
	adox	%rax, %r10
	adcx	%rbx, %r11

	mulx	24($inp), $out, %r8
	adox	$out, %r11
	.byte	0x66
	adcx	%r8, %r12

	mulx	32($inp), %rax, %rbx
	adox	%rax, %r12
	adcx	%rbx, %r13

	mulx	40($inp), $out, %r8
	adox	$out, %r13
	adcx	%r8, %r14

	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rbx
	adox	%rax, %r14
	adcx	%rbx, %r15

	.byte	0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00	# mulx	56($inp), $out, %r8
	adox	$out, %r15
	adcx	%rbp, %r8
	mulx	%rdx, %rax, $out
	adox	%rbp, %r8
	.byte	0x48,0x8b,0x96,0x10,0x00,0x00,0x00		# mov	16($inp), %rdx

	xor	%rbx, %rbx
	adox	%r9, %r9
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rcx, %rax
	adox	%r10, %r10
	adcx	%rax, %r9
	adox	%rbp, %rbx
	adcx	$out, %r10
	adcx	%rbp, %rbx

	mov	%r9, 16(%rsp)
	.byte	0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00		# mov	%r10, 24(%rsp)

#third iteration
	mulx	24($inp), $out, %r9
	adox	$out, %r12
	adcx	%r9, %r13

	mulx	32($inp), %rax, %rcx
	adox	%rax, %r13
	adcx	%rcx, %r14

	.byte	0xc4,0x62,0xc3,0xf6,0x8e,0x28,0x00,0x00,0x00	# mulx	40($inp), $out, %r9
	adox	$out, %r14
	adcx	%r9, %r15

	.byte	0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rcx
	adox	%rax, %r15
	adcx	%rcx, %r8

	mulx	56($inp), $out, %r9
	adox	$out, %r8
	adcx	%rbp, %r9
	mulx	%rdx, %rax, $out
	adox	%rbp, %r9
	mov	24($inp), %rdx

	xor	%rcx, %rcx
	adox	%r11, %r11
	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rbx, %rax
	adox	%r12, %r12
	adcx	%rax, %r11
	adox	%rbp, %rcx
	adcx	$out, %r12
	adcx	%rbp, %rcx

	mov	%r11, 32(%rsp)
	mov	%r12, 40(%rsp)

#fourth iteration
	mulx	32($inp), %rax, %rbx
	adox	%rax, %r14
	adcx	%rbx, %r15

	mulx	40($inp), $out, %r10
	adox	$out, %r15
	adcx	%r10, %r8

	mulx	48($inp), %rax, %rbx
	adox	%rax, %r8
	adcx	%rbx, %r9

	mulx	56($inp), $out, %r10
	adox	$out, %r9
	adcx	%rbp, %r10
	mulx	%rdx, %rax, $out
	adox	%rbp, %r10
	mov	32($inp), %rdx

	xor	%rbx, %rbx
	adox	%r13, %r13
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rcx, %rax
	adox	%r14, %r14
	adcx	%rax, %r13
	adox	%rbp, %rbx
	adcx	$out, %r14
	adcx	%rbp, %rbx

	mov	%r13, 48(%rsp)
	mov	%r14, 56(%rsp)

#fifth iteration
	mulx	40($inp), $out, %r11
	adox	$out, %r8
	adcx	%r11, %r9

	mulx	48($inp), %rax, %rcx
	adox	%rax, %r9
	adcx	%rcx, %r10

	mulx	56($inp), $out, %r11
	adox	$out, %r10
	adcx	%rbp, %r11
	mulx	%rdx, %rax, $out
	mov	40($inp), %rdx
	adox	%rbp, %r11

	xor	%rcx, %rcx
	adox	%r15, %r15
	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rbx, %rax
	adox	%r8, %r8
	adcx	%rax, %r15
	adox	%rbp, %rcx
	adcx	$out, %r8
	adcx	%rbp, %rcx

	mov	%r15, 64(%rsp)
	mov	%r8, 72(%rsp)

#sixth iteration
	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rbx
	adox	%rax, %r10
	adcx	%rbx, %r11

	.byte	0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00	# mulx	56($inp), $out, %r12
	adox	$out, %r11
	adcx	%rbp, %r12
	mulx	%rdx, %rax, $out
	adox	%rbp, %r12
	mov	48($inp), %rdx

	xor	%rbx, %rbx
	adox	%r9, %r9
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rcx, %rax
	adox	%r10, %r10
	adcx	%rax, %r9
	adcx	$out, %r10
	adox	%rbp, %rbx
	adcx	%rbp, %rbx

	mov	%r9, 80(%rsp)
	mov	%r10, 88(%rsp)

#seventh iteration
	.byte	0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00	# mulx	56($inp), %rax, %r13
	adox	%rax, %r12
	adox	%rbp, %r13

	mulx	%rdx, %rax, $out
	xor	%rcx, %rcx
	mov	56($inp), %rdx
	adox	%r11, %r11
	# rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rbx, %rax
	adox	%r12, %r12
	adcx	%rax, %r11
	adox	%rbp, %rcx
	adcx	$out, %r12
	adcx	%rbp, %rcx

	.byte	0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00		# mov	%r11, 96(%rsp)
	.byte	0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00		# mov	%r12, 104(%rsp)

#eighth iteration
	mulx	%rdx, %rax, %rdx
	xor	%rbx, %rbx
	adox	%r13, %r13
	# rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
	adcx	%rcx, %rax
	adox	%rbp, %rbx
	adcx	%r13, %rax
	adcx	%rdx, %rbx

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	128(%rsp), %rdx		# pull $n0
	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	movq	%rax, 112(%rsp)
	movq	%rbx, 120(%rsp)

	call	__rsaz_512_reducex

	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	movq	%r8, %rdx
	movq	%r9, %rax
	movl	128+8(%rsp), $times
	movq	$out, $inp

	decl	$times
	jnz	.Loop_sqrx

.Lsqr_tail:
___
}
$code.=<<___;

	leaq	128+24+48(%rsp), %rax
	movq	-48(%rax), %r15
	movq	-40(%rax), %r14
	movq	-32(%rax), %r13
	movq	-24(%rax), %r12
	movq	-16(%rax), %rbp
	movq	-8(%rax), %rbx
	leaq	(%rax), %rsp
.Lsqr_epilogue:
	ret
.size	rsaz_512_sqr,.-rsaz_512_sqr
___
}
{
# rsaz_512_mul: one 512x512->512 modular multiplication; delegates the
# multiply to __rsaz_512_mul (or __rsaz_512_mulx on ADCX/ADOX-capable
# CPUs) and reduction to __rsaz_512_reduce(x)/__rsaz_512_subtract.
my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
$code.=<<___;
.globl	rsaz_512_mul
.type	rsaz_512_mul,\@function,5
.align	32
rsaz_512_mul:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	subq	\$128+24, %rsp
.Lmul_body:
	movq	$out, %xmm0		# off-load arguments
	movq	$mod, %xmm1
	movq	$n0, 128(%rsp)
___
$code.=<<___ if ($addx);
	movl	\$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	\$0x80100,%r11d		# check for MULX and ADO/CX
	je	.Lmulx
___
$code.=<<___;
	movq	($bp), %rbx		# pass b[0]
	movq	$bp, %rbp		# pass argument
	call	__rsaz_512_mul

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reduce
___
$code.=<<___ if ($addx);
	jmp	.Lmul_tail

.align	32
.Lmulx:
	movq	$bp, %rbp		# pass argument
	movq	($bp), %rdx		# pass b[0]
	call	__rsaz_512_mulx

	movq	%xmm0, $out
	movq	%xmm1, %rbp

	movq	128(%rsp), %rdx		# pull $n0
	movq	(%rsp), %r8
	movq	8(%rsp), %r9
	movq	16(%rsp), %r10
	movq	24(%rsp), %r11
	movq	32(%rsp), %r12
	movq	40(%rsp), %r13
	movq	48(%rsp), %r14
	movq	56(%rsp), %r15

	call	__rsaz_512_reducex
.Lmul_tail:
___
$code.=<<___;
	addq	64(%rsp), %r8
	adcq	72(%rsp), %r9
	adcq	80(%rsp), %r10
	adcq	88(%rsp), %r11
	adcq	96(%rsp), %r12
	adcq	104(%rsp), %r13
	adcq	112(%rsp), %r14
	adcq	120(%rsp), %r15
	sbbq	%rcx, %rcx

	call	__rsaz_512_subtract

	leaq	128+24+48(%rsp), %rax
	movq	-48(%rax), %r15
	movq	-40(%rax), %r14
	movq	-32(%rax), %r13
	movq	-24(%rax), %r12
	movq	-16(%rax), %rbp
	movq	-8(%rax), %rbx
	leaq	(%rax), %rsp
.Lmul_epilogue:
	ret
.size	rsaz_512_mul,.-rsaz_512_mul
___
}
{
# rsaz_512_mul_gather4: multiply by a table entry selected by $pwr,
# gathered with a constant-time SSE mask (continues past this chunk).
my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
$code.=<<___;
.globl	rsaz_512_mul_gather4
.type	rsaz_512_mul_gather4,\@function,6
.align	32
rsaz_512_mul_gather4:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	subq	\$`128+24+($win64?0xb0:0)`, %rsp
___
$code.=<<___ if
($win64); 934296279Sjkim movaps %xmm6,0xa0(%rsp) 935296279Sjkim movaps %xmm7,0xb0(%rsp) 936296279Sjkim movaps %xmm8,0xc0(%rsp) 937296279Sjkim movaps %xmm9,0xd0(%rsp) 938296279Sjkim movaps %xmm10,0xe0(%rsp) 939296279Sjkim movaps %xmm11,0xf0(%rsp) 940296279Sjkim movaps %xmm12,0x100(%rsp) 941296279Sjkim movaps %xmm13,0x110(%rsp) 942296279Sjkim movaps %xmm14,0x120(%rsp) 943296279Sjkim movaps %xmm15,0x130(%rsp) 944296279Sjkim___ 945296279Sjkim$code.=<<___; 946289848Sjkim.Lmul_gather4_body: 947296279Sjkim movd $pwr,%xmm8 948296279Sjkim movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002 949296279Sjkim movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000 950296279Sjkim 951296279Sjkim pshufd \$0,%xmm8,%xmm8 # broadcast $power 952296279Sjkim movdqa %xmm1,%xmm7 953296279Sjkim movdqa %xmm1,%xmm2 954289848Sjkim___ 955296279Sjkim######################################################################## 956296279Sjkim# calculate mask by comparing 0..15 to $power 957296279Sjkim# 958296279Sjkimfor($i=0;$i<4;$i++) { 959296279Sjkim$code.=<<___; 960296279Sjkim paddd %xmm`$i`,%xmm`$i+1` 961296279Sjkim pcmpeqd %xmm8,%xmm`$i` 962296279Sjkim movdqa %xmm7,%xmm`$i+3` 963296279Sjkim___ 964296279Sjkim} 965296279Sjkimfor(;$i<7;$i++) { 966296279Sjkim$code.=<<___; 967296279Sjkim paddd %xmm`$i`,%xmm`$i+1` 968296279Sjkim pcmpeqd %xmm8,%xmm`$i` 969296279Sjkim___ 970296279Sjkim} 971296279Sjkim$code.=<<___; 972296279Sjkim pcmpeqd %xmm8,%xmm7 973296279Sjkim 974296279Sjkim movdqa 16*0($bp),%xmm8 975296279Sjkim movdqa 16*1($bp),%xmm9 976296279Sjkim movdqa 16*2($bp),%xmm10 977296279Sjkim movdqa 16*3($bp),%xmm11 978296279Sjkim pand %xmm0,%xmm8 979296279Sjkim movdqa 16*4($bp),%xmm12 980296279Sjkim pand %xmm1,%xmm9 981296279Sjkim movdqa 16*5($bp),%xmm13 982296279Sjkim pand %xmm2,%xmm10 983296279Sjkim movdqa 16*6($bp),%xmm14 984296279Sjkim pand %xmm3,%xmm11 985296279Sjkim movdqa 16*7($bp),%xmm15 986296279Sjkim leaq 128($bp), %rbp 987296279Sjkim pand %xmm4,%xmm12 988296279Sjkim pand 
%xmm5,%xmm13 989296279Sjkim pand %xmm6,%xmm14 990296279Sjkim pand %xmm7,%xmm15 991296279Sjkim por %xmm10,%xmm8 992296279Sjkim por %xmm11,%xmm9 993296279Sjkim por %xmm12,%xmm8 994296279Sjkim por %xmm13,%xmm9 995296279Sjkim por %xmm14,%xmm8 996296279Sjkim por %xmm15,%xmm9 997296279Sjkim 998296279Sjkim por %xmm9,%xmm8 999296279Sjkim pshufd \$0x4e,%xmm8,%xmm9 1000296279Sjkim por %xmm9,%xmm8 1001296279Sjkim___ 1002289848Sjkim$code.=<<___ if ($addx); 1003289848Sjkim movl \$0x80100,%r11d 1004289848Sjkim andl OPENSSL_ia32cap_P+8(%rip),%r11d 1005289848Sjkim cmpl \$0x80100,%r11d # check for MULX and ADO/CX 1006289848Sjkim je .Lmulx_gather 1007289848Sjkim___ 1008289848Sjkim$code.=<<___; 1009296279Sjkim movq %xmm8,%rbx 1010289848Sjkim 1011296279Sjkim movq $n0, 128(%rsp) # off-load arguments 1012296279Sjkim movq $out, 128+8(%rsp) 1013296279Sjkim movq $mod, 128+16(%rsp) 1014296279Sjkim 1015289848Sjkim movq ($ap), %rax 1016289848Sjkim movq 8($ap), %rcx 1017289848Sjkim mulq %rbx # 0 iteration 1018289848Sjkim movq %rax, (%rsp) 1019289848Sjkim movq %rcx, %rax 1020289848Sjkim movq %rdx, %r8 1021289848Sjkim 1022289848Sjkim mulq %rbx 1023289848Sjkim addq %rax, %r8 1024289848Sjkim movq 16($ap), %rax 1025289848Sjkim movq %rdx, %r9 1026289848Sjkim adcq \$0, %r9 1027289848Sjkim 1028289848Sjkim mulq %rbx 1029289848Sjkim addq %rax, %r9 1030289848Sjkim movq 24($ap), %rax 1031289848Sjkim movq %rdx, %r10 1032289848Sjkim adcq \$0, %r10 1033289848Sjkim 1034289848Sjkim mulq %rbx 1035289848Sjkim addq %rax, %r10 1036289848Sjkim movq 32($ap), %rax 1037289848Sjkim movq %rdx, %r11 1038289848Sjkim adcq \$0, %r11 1039289848Sjkim 1040289848Sjkim mulq %rbx 1041289848Sjkim addq %rax, %r11 1042289848Sjkim movq 40($ap), %rax 1043289848Sjkim movq %rdx, %r12 1044289848Sjkim adcq \$0, %r12 1045289848Sjkim 1046289848Sjkim mulq %rbx 1047289848Sjkim addq %rax, %r12 1048289848Sjkim movq 48($ap), %rax 1049289848Sjkim movq %rdx, %r13 1050289848Sjkim adcq \$0, %r13 1051289848Sjkim 1052289848Sjkim mulq %rbx 
1053289848Sjkim addq %rax, %r13 1054289848Sjkim movq 56($ap), %rax 1055289848Sjkim movq %rdx, %r14 1056289848Sjkim adcq \$0, %r14 1057289848Sjkim 1058289848Sjkim mulq %rbx 1059289848Sjkim addq %rax, %r14 1060289848Sjkim movq ($ap), %rax 1061289848Sjkim movq %rdx, %r15 1062289848Sjkim adcq \$0, %r15 1063289848Sjkim 1064289848Sjkim leaq 8(%rsp), %rdi 1065289848Sjkim movl \$7, %ecx 1066289848Sjkim jmp .Loop_mul_gather 1067289848Sjkim 1068289848Sjkim.align 32 1069289848Sjkim.Loop_mul_gather: 1070296279Sjkim movdqa 16*0(%rbp),%xmm8 1071296279Sjkim movdqa 16*1(%rbp),%xmm9 1072296279Sjkim movdqa 16*2(%rbp),%xmm10 1073296279Sjkim movdqa 16*3(%rbp),%xmm11 1074296279Sjkim pand %xmm0,%xmm8 1075296279Sjkim movdqa 16*4(%rbp),%xmm12 1076296279Sjkim pand %xmm1,%xmm9 1077296279Sjkim movdqa 16*5(%rbp),%xmm13 1078296279Sjkim pand %xmm2,%xmm10 1079296279Sjkim movdqa 16*6(%rbp),%xmm14 1080296279Sjkim pand %xmm3,%xmm11 1081296279Sjkim movdqa 16*7(%rbp),%xmm15 1082296279Sjkim leaq 128(%rbp), %rbp 1083296279Sjkim pand %xmm4,%xmm12 1084296279Sjkim pand %xmm5,%xmm13 1085296279Sjkim pand %xmm6,%xmm14 1086296279Sjkim pand %xmm7,%xmm15 1087296279Sjkim por %xmm10,%xmm8 1088296279Sjkim por %xmm11,%xmm9 1089296279Sjkim por %xmm12,%xmm8 1090296279Sjkim por %xmm13,%xmm9 1091296279Sjkim por %xmm14,%xmm8 1092296279Sjkim por %xmm15,%xmm9 1093296279Sjkim 1094296279Sjkim por %xmm9,%xmm8 1095296279Sjkim pshufd \$0x4e,%xmm8,%xmm9 1096296279Sjkim por %xmm9,%xmm8 1097296279Sjkim movq %xmm8,%rbx 1098296279Sjkim 1099289848Sjkim mulq %rbx 1100289848Sjkim addq %rax, %r8 1101289848Sjkim movq 8($ap), %rax 1102289848Sjkim movq %r8, (%rdi) 1103289848Sjkim movq %rdx, %r8 1104289848Sjkim adcq \$0, %r8 1105289848Sjkim 1106289848Sjkim mulq %rbx 1107289848Sjkim addq %rax, %r9 1108289848Sjkim movq 16($ap), %rax 1109289848Sjkim adcq \$0, %rdx 1110289848Sjkim addq %r9, %r8 1111289848Sjkim movq %rdx, %r9 1112289848Sjkim adcq \$0, %r9 1113289848Sjkim 1114289848Sjkim mulq %rbx 1115289848Sjkim addq %rax, %r10 1116289848Sjkim 
movq 24($ap), %rax 1117289848Sjkim adcq \$0, %rdx 1118289848Sjkim addq %r10, %r9 1119289848Sjkim movq %rdx, %r10 1120289848Sjkim adcq \$0, %r10 1121289848Sjkim 1122289848Sjkim mulq %rbx 1123289848Sjkim addq %rax, %r11 1124289848Sjkim movq 32($ap), %rax 1125289848Sjkim adcq \$0, %rdx 1126289848Sjkim addq %r11, %r10 1127289848Sjkim movq %rdx, %r11 1128289848Sjkim adcq \$0, %r11 1129289848Sjkim 1130289848Sjkim mulq %rbx 1131289848Sjkim addq %rax, %r12 1132289848Sjkim movq 40($ap), %rax 1133289848Sjkim adcq \$0, %rdx 1134289848Sjkim addq %r12, %r11 1135289848Sjkim movq %rdx, %r12 1136289848Sjkim adcq \$0, %r12 1137289848Sjkim 1138289848Sjkim mulq %rbx 1139289848Sjkim addq %rax, %r13 1140289848Sjkim movq 48($ap), %rax 1141289848Sjkim adcq \$0, %rdx 1142289848Sjkim addq %r13, %r12 1143289848Sjkim movq %rdx, %r13 1144289848Sjkim adcq \$0, %r13 1145289848Sjkim 1146289848Sjkim mulq %rbx 1147289848Sjkim addq %rax, %r14 1148289848Sjkim movq 56($ap), %rax 1149289848Sjkim adcq \$0, %rdx 1150289848Sjkim addq %r14, %r13 1151289848Sjkim movq %rdx, %r14 1152289848Sjkim adcq \$0, %r14 1153289848Sjkim 1154289848Sjkim mulq %rbx 1155289848Sjkim addq %rax, %r15 1156289848Sjkim movq ($ap), %rax 1157289848Sjkim adcq \$0, %rdx 1158289848Sjkim addq %r15, %r14 1159289848Sjkim movq %rdx, %r15 1160289848Sjkim adcq \$0, %r15 1161289848Sjkim 1162289848Sjkim leaq 8(%rdi), %rdi 1163289848Sjkim 1164289848Sjkim decl %ecx 1165289848Sjkim jnz .Loop_mul_gather 1166289848Sjkim 1167289848Sjkim movq %r8, (%rdi) 1168289848Sjkim movq %r9, 8(%rdi) 1169289848Sjkim movq %r10, 16(%rdi) 1170289848Sjkim movq %r11, 24(%rdi) 1171289848Sjkim movq %r12, 32(%rdi) 1172289848Sjkim movq %r13, 40(%rdi) 1173289848Sjkim movq %r14, 48(%rdi) 1174289848Sjkim movq %r15, 56(%rdi) 1175289848Sjkim 1176296279Sjkim movq 128+8(%rsp), $out 1177296279Sjkim movq 128+16(%rsp), %rbp 1178289848Sjkim 1179289848Sjkim movq (%rsp), %r8 1180289848Sjkim movq 8(%rsp), %r9 1181289848Sjkim movq 16(%rsp), %r10 1182289848Sjkim movq 24(%rsp), %r11 
1183289848Sjkim movq 32(%rsp), %r12 1184289848Sjkim movq 40(%rsp), %r13 1185289848Sjkim movq 48(%rsp), %r14 1186289848Sjkim movq 56(%rsp), %r15 1187289848Sjkim 1188289848Sjkim call __rsaz_512_reduce 1189289848Sjkim___ 1190289848Sjkim$code.=<<___ if ($addx); 1191289848Sjkim jmp .Lmul_gather_tail 1192289848Sjkim 1193289848Sjkim.align 32 1194289848Sjkim.Lmulx_gather: 1195296279Sjkim movq %xmm8,%rdx 1196289848Sjkim 1197296279Sjkim mov $n0, 128(%rsp) # off-load arguments 1198296279Sjkim mov $out, 128+8(%rsp) 1199296279Sjkim mov $mod, 128+16(%rsp) 1200296279Sjkim 1201289848Sjkim mulx ($ap), %rbx, %r8 # 0 iteration 1202289848Sjkim mov %rbx, (%rsp) 1203289848Sjkim xor %edi, %edi # cf=0, of=0 1204289848Sjkim 1205289848Sjkim mulx 8($ap), %rax, %r9 1206289848Sjkim 1207289848Sjkim mulx 16($ap), %rbx, %r10 1208289848Sjkim adcx %rax, %r8 1209289848Sjkim 1210289848Sjkim mulx 24($ap), %rax, %r11 1211289848Sjkim adcx %rbx, %r9 1212289848Sjkim 1213289848Sjkim mulx 32($ap), %rbx, %r12 1214289848Sjkim adcx %rax, %r10 1215289848Sjkim 1216289848Sjkim mulx 40($ap), %rax, %r13 1217289848Sjkim adcx %rbx, %r11 1218289848Sjkim 1219289848Sjkim mulx 48($ap), %rbx, %r14 1220289848Sjkim adcx %rax, %r12 1221289848Sjkim 1222289848Sjkim mulx 56($ap), %rax, %r15 1223289848Sjkim adcx %rbx, %r13 1224289848Sjkim adcx %rax, %r14 1225296279Sjkim .byte 0x67 1226289848Sjkim mov %r8, %rbx 1227289848Sjkim adcx %rdi, %r15 # %rdi is 0 1228289848Sjkim 1229289848Sjkim mov \$-7, %rcx 1230289848Sjkim jmp .Loop_mulx_gather 1231289848Sjkim 1232289848Sjkim.align 32 1233289848Sjkim.Loop_mulx_gather: 1234296279Sjkim movdqa 16*0(%rbp),%xmm8 1235296279Sjkim movdqa 16*1(%rbp),%xmm9 1236296279Sjkim movdqa 16*2(%rbp),%xmm10 1237296279Sjkim movdqa 16*3(%rbp),%xmm11 1238296279Sjkim pand %xmm0,%xmm8 1239296279Sjkim movdqa 16*4(%rbp),%xmm12 1240296279Sjkim pand %xmm1,%xmm9 1241296279Sjkim movdqa 16*5(%rbp),%xmm13 1242296279Sjkim pand %xmm2,%xmm10 1243296279Sjkim movdqa 16*6(%rbp),%xmm14 1244296279Sjkim pand %xmm3,%xmm11 
1245296279Sjkim movdqa 16*7(%rbp),%xmm15 1246296279Sjkim leaq 128(%rbp), %rbp 1247296279Sjkim pand %xmm4,%xmm12 1248296279Sjkim pand %xmm5,%xmm13 1249296279Sjkim pand %xmm6,%xmm14 1250296279Sjkim pand %xmm7,%xmm15 1251296279Sjkim por %xmm10,%xmm8 1252296279Sjkim por %xmm11,%xmm9 1253296279Sjkim por %xmm12,%xmm8 1254296279Sjkim por %xmm13,%xmm9 1255296279Sjkim por %xmm14,%xmm8 1256296279Sjkim por %xmm15,%xmm9 1257296279Sjkim 1258296279Sjkim por %xmm9,%xmm8 1259296279Sjkim pshufd \$0x4e,%xmm8,%xmm9 1260296279Sjkim por %xmm9,%xmm8 1261296279Sjkim movq %xmm8,%rdx 1262296279Sjkim 1263296279Sjkim .byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00 # mulx ($ap), %rax, %r8 1264289848Sjkim adcx %rax, %rbx 1265289848Sjkim adox %r9, %r8 1266289848Sjkim 1267289848Sjkim mulx 8($ap), %rax, %r9 1268289848Sjkim adcx %rax, %r8 1269289848Sjkim adox %r10, %r9 1270289848Sjkim 1271289848Sjkim mulx 16($ap), %rax, %r10 1272289848Sjkim adcx %rax, %r9 1273289848Sjkim adox %r11, %r10 1274289848Sjkim 1275289848Sjkim .byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11 1276289848Sjkim adcx %rax, %r10 1277289848Sjkim adox %r12, %r11 1278289848Sjkim 1279289848Sjkim mulx 32($ap), %rax, %r12 1280289848Sjkim adcx %rax, %r11 1281289848Sjkim adox %r13, %r12 1282289848Sjkim 1283289848Sjkim mulx 40($ap), %rax, %r13 1284289848Sjkim adcx %rax, %r12 1285289848Sjkim adox %r14, %r13 1286289848Sjkim 1287289848Sjkim .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14 1288289848Sjkim adcx %rax, %r13 1289296279Sjkim .byte 0x67 1290289848Sjkim adox %r15, %r14 1291289848Sjkim 1292289848Sjkim mulx 56($ap), %rax, %r15 1293289848Sjkim mov %rbx, 64(%rsp,%rcx,8) 1294289848Sjkim adcx %rax, %r14 1295289848Sjkim adox %rdi, %r15 1296289848Sjkim mov %r8, %rbx 1297289848Sjkim adcx %rdi, %r15 # cf=0 1298289848Sjkim 1299289848Sjkim inc %rcx # of=0 1300289848Sjkim jnz .Loop_mulx_gather 1301289848Sjkim 1302289848Sjkim mov %r8, 64(%rsp) 1303289848Sjkim mov %r9, 64+8(%rsp) 
1304289848Sjkim mov %r10, 64+16(%rsp) 1305289848Sjkim mov %r11, 64+24(%rsp) 1306289848Sjkim mov %r12, 64+32(%rsp) 1307289848Sjkim mov %r13, 64+40(%rsp) 1308289848Sjkim mov %r14, 64+48(%rsp) 1309289848Sjkim mov %r15, 64+56(%rsp) 1310289848Sjkim 1311296279Sjkim mov 128(%rsp), %rdx # pull arguments 1312296279Sjkim mov 128+8(%rsp), $out 1313296279Sjkim mov 128+16(%rsp), %rbp 1314289848Sjkim 1315289848Sjkim mov (%rsp), %r8 1316289848Sjkim mov 8(%rsp), %r9 1317289848Sjkim mov 16(%rsp), %r10 1318289848Sjkim mov 24(%rsp), %r11 1319289848Sjkim mov 32(%rsp), %r12 1320289848Sjkim mov 40(%rsp), %r13 1321289848Sjkim mov 48(%rsp), %r14 1322289848Sjkim mov 56(%rsp), %r15 1323289848Sjkim 1324289848Sjkim call __rsaz_512_reducex 1325289848Sjkim 1326289848Sjkim.Lmul_gather_tail: 1327289848Sjkim___ 1328289848Sjkim$code.=<<___; 1329289848Sjkim addq 64(%rsp), %r8 1330289848Sjkim adcq 72(%rsp), %r9 1331289848Sjkim adcq 80(%rsp), %r10 1332289848Sjkim adcq 88(%rsp), %r11 1333289848Sjkim adcq 96(%rsp), %r12 1334289848Sjkim adcq 104(%rsp), %r13 1335289848Sjkim adcq 112(%rsp), %r14 1336289848Sjkim adcq 120(%rsp), %r15 1337289848Sjkim sbbq %rcx, %rcx 1338289848Sjkim 1339289848Sjkim call __rsaz_512_subtract 1340289848Sjkim 1341289848Sjkim leaq 128+24+48(%rsp), %rax 1342296279Sjkim___ 1343296279Sjkim$code.=<<___ if ($win64); 1344296279Sjkim movaps 0xa0-0xc8(%rax),%xmm6 1345296279Sjkim movaps 0xb0-0xc8(%rax),%xmm7 1346296279Sjkim movaps 0xc0-0xc8(%rax),%xmm8 1347296279Sjkim movaps 0xd0-0xc8(%rax),%xmm9 1348296279Sjkim movaps 0xe0-0xc8(%rax),%xmm10 1349296279Sjkim movaps 0xf0-0xc8(%rax),%xmm11 1350296279Sjkim movaps 0x100-0xc8(%rax),%xmm12 1351296279Sjkim movaps 0x110-0xc8(%rax),%xmm13 1352296279Sjkim movaps 0x120-0xc8(%rax),%xmm14 1353296279Sjkim movaps 0x130-0xc8(%rax),%xmm15 1354296279Sjkim lea 0xb0(%rax),%rax 1355296279Sjkim___ 1356296279Sjkim$code.=<<___; 1357289848Sjkim movq -48(%rax), %r15 1358289848Sjkim movq -40(%rax), %r14 1359289848Sjkim movq -32(%rax), %r13 1360289848Sjkim movq 
-24(%rax), %r12 1361289848Sjkim movq -16(%rax), %rbp 1362289848Sjkim movq -8(%rax), %rbx 1363289848Sjkim leaq (%rax), %rsp 1364289848Sjkim.Lmul_gather4_epilogue: 1365289848Sjkim ret 1366289848Sjkim.size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4 1367289848Sjkim___ 1368289848Sjkim} 1369289848Sjkim{ 1370289848Sjkimmy ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); 1371289848Sjkim$code.=<<___; 1372289848Sjkim.globl rsaz_512_mul_scatter4 1373289848Sjkim.type rsaz_512_mul_scatter4,\@function,6 1374289848Sjkim.align 32 1375289848Sjkimrsaz_512_mul_scatter4: 1376289848Sjkim push %rbx 1377289848Sjkim push %rbp 1378289848Sjkim push %r12 1379289848Sjkim push %r13 1380289848Sjkim push %r14 1381289848Sjkim push %r15 1382289848Sjkim 1383289848Sjkim mov $pwr, $pwr 1384289848Sjkim subq \$128+24, %rsp 1385289848Sjkim.Lmul_scatter4_body: 1386296279Sjkim leaq ($tbl,$pwr,8), $tbl 1387289848Sjkim movq $out, %xmm0 # off-load arguments 1388289848Sjkim movq $mod, %xmm1 1389289848Sjkim movq $tbl, %xmm2 1390289848Sjkim movq $n0, 128(%rsp) 1391289848Sjkim 1392289848Sjkim movq $out, %rbp 1393289848Sjkim___ 1394289848Sjkim$code.=<<___ if ($addx); 1395289848Sjkim movl \$0x80100,%r11d 1396289848Sjkim andl OPENSSL_ia32cap_P+8(%rip),%r11d 1397289848Sjkim cmpl \$0x80100,%r11d # check for MULX and ADO/CX 1398289848Sjkim je .Lmulx_scatter 1399289848Sjkim___ 1400289848Sjkim$code.=<<___; 1401289848Sjkim movq ($out),%rbx # pass b[0] 1402289848Sjkim call __rsaz_512_mul 1403289848Sjkim 1404289848Sjkim movq %xmm0, $out 1405289848Sjkim movq %xmm1, %rbp 1406289848Sjkim 1407289848Sjkim movq (%rsp), %r8 1408289848Sjkim movq 8(%rsp), %r9 1409289848Sjkim movq 16(%rsp), %r10 1410289848Sjkim movq 24(%rsp), %r11 1411289848Sjkim movq 32(%rsp), %r12 1412289848Sjkim movq 40(%rsp), %r13 1413289848Sjkim movq 48(%rsp), %r14 1414289848Sjkim movq 56(%rsp), %r15 1415289848Sjkim 1416289848Sjkim call __rsaz_512_reduce 1417289848Sjkim___ 1418289848Sjkim$code.=<<___ if ($addx); 1419289848Sjkim jmp 
.Lmul_scatter_tail 1420289848Sjkim 1421289848Sjkim.align 32 1422289848Sjkim.Lmulx_scatter: 1423289848Sjkim movq ($out), %rdx # pass b[0] 1424289848Sjkim call __rsaz_512_mulx 1425289848Sjkim 1426289848Sjkim movq %xmm0, $out 1427289848Sjkim movq %xmm1, %rbp 1428289848Sjkim 1429289848Sjkim movq 128(%rsp), %rdx # pull $n0 1430289848Sjkim movq (%rsp), %r8 1431289848Sjkim movq 8(%rsp), %r9 1432289848Sjkim movq 16(%rsp), %r10 1433289848Sjkim movq 24(%rsp), %r11 1434289848Sjkim movq 32(%rsp), %r12 1435289848Sjkim movq 40(%rsp), %r13 1436289848Sjkim movq 48(%rsp), %r14 1437289848Sjkim movq 56(%rsp), %r15 1438289848Sjkim 1439289848Sjkim call __rsaz_512_reducex 1440289848Sjkim 1441289848Sjkim.Lmul_scatter_tail: 1442289848Sjkim___ 1443289848Sjkim$code.=<<___; 1444289848Sjkim addq 64(%rsp), %r8 1445289848Sjkim adcq 72(%rsp), %r9 1446289848Sjkim adcq 80(%rsp), %r10 1447289848Sjkim adcq 88(%rsp), %r11 1448289848Sjkim adcq 96(%rsp), %r12 1449289848Sjkim adcq 104(%rsp), %r13 1450289848Sjkim adcq 112(%rsp), %r14 1451289848Sjkim adcq 120(%rsp), %r15 1452289848Sjkim movq %xmm2, $inp 1453289848Sjkim sbbq %rcx, %rcx 1454289848Sjkim 1455289848Sjkim call __rsaz_512_subtract 1456289848Sjkim 1457296279Sjkim movq %r8, 128*0($inp) # scatter 1458296279Sjkim movq %r9, 128*1($inp) 1459296279Sjkim movq %r10, 128*2($inp) 1460296279Sjkim movq %r11, 128*3($inp) 1461296279Sjkim movq %r12, 128*4($inp) 1462296279Sjkim movq %r13, 128*5($inp) 1463296279Sjkim movq %r14, 128*6($inp) 1464296279Sjkim movq %r15, 128*7($inp) 1465289848Sjkim 1466289848Sjkim leaq 128+24+48(%rsp), %rax 1467289848Sjkim movq -48(%rax), %r15 1468289848Sjkim movq -40(%rax), %r14 1469289848Sjkim movq -32(%rax), %r13 1470289848Sjkim movq -24(%rax), %r12 1471289848Sjkim movq -16(%rax), %rbp 1472289848Sjkim movq -8(%rax), %rbx 1473289848Sjkim leaq (%rax), %rsp 1474289848Sjkim.Lmul_scatter4_epilogue: 1475289848Sjkim ret 1476289848Sjkim.size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4 1477289848Sjkim___ 1478289848Sjkim} 1479289848Sjkim{ 
1480289848Sjkimmy ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx"); 1481289848Sjkim$code.=<<___; 1482289848Sjkim.globl rsaz_512_mul_by_one 1483289848Sjkim.type rsaz_512_mul_by_one,\@function,4 1484289848Sjkim.align 32 1485289848Sjkimrsaz_512_mul_by_one: 1486289848Sjkim push %rbx 1487289848Sjkim push %rbp 1488289848Sjkim push %r12 1489289848Sjkim push %r13 1490289848Sjkim push %r14 1491289848Sjkim push %r15 1492289848Sjkim 1493289848Sjkim subq \$128+24, %rsp 1494289848Sjkim.Lmul_by_one_body: 1495289848Sjkim___ 1496289848Sjkim$code.=<<___ if ($addx); 1497289848Sjkim movl OPENSSL_ia32cap_P+8(%rip),%eax 1498289848Sjkim___ 1499289848Sjkim$code.=<<___; 1500289848Sjkim movq $mod, %rbp # reassign argument 1501289848Sjkim movq $n0, 128(%rsp) 1502289848Sjkim 1503289848Sjkim movq ($inp), %r8 1504289848Sjkim pxor %xmm0, %xmm0 1505289848Sjkim movq 8($inp), %r9 1506289848Sjkim movq 16($inp), %r10 1507289848Sjkim movq 24($inp), %r11 1508289848Sjkim movq 32($inp), %r12 1509289848Sjkim movq 40($inp), %r13 1510289848Sjkim movq 48($inp), %r14 1511289848Sjkim movq 56($inp), %r15 1512289848Sjkim 1513289848Sjkim movdqa %xmm0, (%rsp) 1514289848Sjkim movdqa %xmm0, 16(%rsp) 1515289848Sjkim movdqa %xmm0, 32(%rsp) 1516289848Sjkim movdqa %xmm0, 48(%rsp) 1517289848Sjkim movdqa %xmm0, 64(%rsp) 1518289848Sjkim movdqa %xmm0, 80(%rsp) 1519289848Sjkim movdqa %xmm0, 96(%rsp) 1520289848Sjkim___ 1521289848Sjkim$code.=<<___ if ($addx); 1522289848Sjkim andl \$0x80100,%eax 1523289848Sjkim cmpl \$0x80100,%eax # check for MULX and ADO/CX 1524289848Sjkim je .Lby_one_callx 1525289848Sjkim___ 1526289848Sjkim$code.=<<___; 1527289848Sjkim call __rsaz_512_reduce 1528289848Sjkim___ 1529289848Sjkim$code.=<<___ if ($addx); 1530289848Sjkim jmp .Lby_one_tail 1531289848Sjkim.align 32 1532289848Sjkim.Lby_one_callx: 1533289848Sjkim movq 128(%rsp), %rdx # pull $n0 1534289848Sjkim call __rsaz_512_reducex 1535289848Sjkim.Lby_one_tail: 1536289848Sjkim___ 1537289848Sjkim$code.=<<___; 1538289848Sjkim movq %r8, ($out) 
1539289848Sjkim movq %r9, 8($out) 1540289848Sjkim movq %r10, 16($out) 1541289848Sjkim movq %r11, 24($out) 1542289848Sjkim movq %r12, 32($out) 1543289848Sjkim movq %r13, 40($out) 1544289848Sjkim movq %r14, 48($out) 1545289848Sjkim movq %r15, 56($out) 1546289848Sjkim 1547289848Sjkim leaq 128+24+48(%rsp), %rax 1548289848Sjkim movq -48(%rax), %r15 1549289848Sjkim movq -40(%rax), %r14 1550289848Sjkim movq -32(%rax), %r13 1551289848Sjkim movq -24(%rax), %r12 1552289848Sjkim movq -16(%rax), %rbp 1553289848Sjkim movq -8(%rax), %rbx 1554289848Sjkim leaq (%rax), %rsp 1555289848Sjkim.Lmul_by_one_epilogue: 1556289848Sjkim ret 1557289848Sjkim.size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one 1558289848Sjkim___ 1559289848Sjkim} 1560289848Sjkim{ # __rsaz_512_reduce 1561289848Sjkim # 1562289848Sjkim # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0 1563289848Sjkim # output: %r8-%r15 1564289848Sjkim # clobbers: everything except %rbp and %rdi 1565289848Sjkim$code.=<<___; 1566289848Sjkim.type __rsaz_512_reduce,\@abi-omnipotent 1567289848Sjkim.align 32 1568289848Sjkim__rsaz_512_reduce: 1569289848Sjkim movq %r8, %rbx 1570289848Sjkim imulq 128+8(%rsp), %rbx 1571289848Sjkim movq 0(%rbp), %rax 1572289848Sjkim movl \$8, %ecx 1573289848Sjkim jmp .Lreduction_loop 1574289848Sjkim 1575289848Sjkim.align 32 1576289848Sjkim.Lreduction_loop: 1577289848Sjkim mulq %rbx 1578289848Sjkim movq 8(%rbp), %rax 1579289848Sjkim negq %r8 1580289848Sjkim movq %rdx, %r8 1581289848Sjkim adcq \$0, %r8 1582289848Sjkim 1583289848Sjkim mulq %rbx 1584289848Sjkim addq %rax, %r9 1585289848Sjkim movq 16(%rbp), %rax 1586289848Sjkim adcq \$0, %rdx 1587289848Sjkim addq %r9, %r8 1588289848Sjkim movq %rdx, %r9 1589289848Sjkim adcq \$0, %r9 1590289848Sjkim 1591289848Sjkim mulq %rbx 1592289848Sjkim addq %rax, %r10 1593289848Sjkim movq 24(%rbp), %rax 1594289848Sjkim adcq \$0, %rdx 1595289848Sjkim addq %r10, %r9 1596289848Sjkim movq %rdx, %r10 1597289848Sjkim adcq \$0, %r10 1598289848Sjkim 1599289848Sjkim mulq %rbx 1600289848Sjkim 
addq %rax, %r11 1601289848Sjkim movq 32(%rbp), %rax 1602289848Sjkim adcq \$0, %rdx 1603289848Sjkim addq %r11, %r10 1604289848Sjkim movq 128+8(%rsp), %rsi 1605289848Sjkim #movq %rdx, %r11 1606289848Sjkim #adcq \$0, %r11 1607289848Sjkim adcq \$0, %rdx 1608289848Sjkim movq %rdx, %r11 1609289848Sjkim 1610289848Sjkim mulq %rbx 1611289848Sjkim addq %rax, %r12 1612289848Sjkim movq 40(%rbp), %rax 1613289848Sjkim adcq \$0, %rdx 1614289848Sjkim imulq %r8, %rsi 1615289848Sjkim addq %r12, %r11 1616289848Sjkim movq %rdx, %r12 1617289848Sjkim adcq \$0, %r12 1618289848Sjkim 1619289848Sjkim mulq %rbx 1620289848Sjkim addq %rax, %r13 1621289848Sjkim movq 48(%rbp), %rax 1622289848Sjkim adcq \$0, %rdx 1623289848Sjkim addq %r13, %r12 1624289848Sjkim movq %rdx, %r13 1625289848Sjkim adcq \$0, %r13 1626289848Sjkim 1627289848Sjkim mulq %rbx 1628289848Sjkim addq %rax, %r14 1629289848Sjkim movq 56(%rbp), %rax 1630289848Sjkim adcq \$0, %rdx 1631289848Sjkim addq %r14, %r13 1632289848Sjkim movq %rdx, %r14 1633289848Sjkim adcq \$0, %r14 1634289848Sjkim 1635289848Sjkim mulq %rbx 1636289848Sjkim movq %rsi, %rbx 1637289848Sjkim addq %rax, %r15 1638289848Sjkim movq 0(%rbp), %rax 1639289848Sjkim adcq \$0, %rdx 1640289848Sjkim addq %r15, %r14 1641289848Sjkim movq %rdx, %r15 1642289848Sjkim adcq \$0, %r15 1643289848Sjkim 1644289848Sjkim decl %ecx 1645289848Sjkim jne .Lreduction_loop 1646289848Sjkim 1647289848Sjkim ret 1648289848Sjkim.size __rsaz_512_reduce,.-__rsaz_512_reduce 1649289848Sjkim___ 1650289848Sjkim} 1651289848Sjkimif ($addx) { 1652289848Sjkim # __rsaz_512_reducex 1653289848Sjkim # 1654289848Sjkim # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0 1655289848Sjkim # output: %r8-%r15 1656289848Sjkim # clobbers: everything except %rbp and %rdi 1657289848Sjkim$code.=<<___; 1658289848Sjkim.type __rsaz_512_reducex,\@abi-omnipotent 1659289848Sjkim.align 32 1660289848Sjkim__rsaz_512_reducex: 1661289848Sjkim #movq 128+8(%rsp), %rdx # pull $n0 1662289848Sjkim imulq %r8, %rdx 1663289848Sjkim xorq %rsi, %rsi 
# cf=0,of=0 1664289848Sjkim movl \$8, %ecx 1665289848Sjkim jmp .Lreduction_loopx 1666289848Sjkim 1667289848Sjkim.align 32 1668289848Sjkim.Lreduction_loopx: 1669289848Sjkim mov %r8, %rbx 1670289848Sjkim mulx 0(%rbp), %rax, %r8 1671289848Sjkim adcx %rbx, %rax 1672289848Sjkim adox %r9, %r8 1673289848Sjkim 1674289848Sjkim mulx 8(%rbp), %rax, %r9 1675289848Sjkim adcx %rax, %r8 1676289848Sjkim adox %r10, %r9 1677289848Sjkim 1678289848Sjkim mulx 16(%rbp), %rbx, %r10 1679289848Sjkim adcx %rbx, %r9 1680289848Sjkim adox %r11, %r10 1681289848Sjkim 1682289848Sjkim mulx 24(%rbp), %rbx, %r11 1683289848Sjkim adcx %rbx, %r10 1684289848Sjkim adox %r12, %r11 1685289848Sjkim 1686289848Sjkim .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 32(%rbp), %rbx, %r12 1687289848Sjkim mov %rdx, %rax 1688289848Sjkim mov %r8, %rdx 1689289848Sjkim adcx %rbx, %r11 1690289848Sjkim adox %r13, %r12 1691289848Sjkim 1692289848Sjkim mulx 128+8(%rsp), %rbx, %rdx 1693289848Sjkim mov %rax, %rdx 1694289848Sjkim 1695289848Sjkim mulx 40(%rbp), %rax, %r13 1696289848Sjkim adcx %rax, %r12 1697289848Sjkim adox %r14, %r13 1698289848Sjkim 1699289848Sjkim .byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 # mulx 48(%rbp), %rax, %r14 1700289848Sjkim adcx %rax, %r13 1701289848Sjkim adox %r15, %r14 1702289848Sjkim 1703289848Sjkim mulx 56(%rbp), %rax, %r15 1704289848Sjkim mov %rbx, %rdx 1705289848Sjkim adcx %rax, %r14 1706289848Sjkim adox %rsi, %r15 # %rsi is 0 1707289848Sjkim adcx %rsi, %r15 # cf=0 1708289848Sjkim 1709289848Sjkim decl %ecx # of=0 1710289848Sjkim jne .Lreduction_loopx 1711289848Sjkim 1712289848Sjkim ret 1713289848Sjkim.size __rsaz_512_reducex,.-__rsaz_512_reducex 1714289848Sjkim___ 1715289848Sjkim} 1716289848Sjkim{ # __rsaz_512_subtract 1717289848Sjkim # input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask 1718289848Sjkim # output: 1719289848Sjkim # clobbers: everything but %rdi, %rsi and %rbp 1720289848Sjkim$code.=<<___; 1721289848Sjkim.type __rsaz_512_subtract,\@abi-omnipotent 
1722289848Sjkim.align 32 1723289848Sjkim__rsaz_512_subtract: 1724289848Sjkim movq %r8, ($out) 1725289848Sjkim movq %r9, 8($out) 1726289848Sjkim movq %r10, 16($out) 1727289848Sjkim movq %r11, 24($out) 1728289848Sjkim movq %r12, 32($out) 1729289848Sjkim movq %r13, 40($out) 1730289848Sjkim movq %r14, 48($out) 1731289848Sjkim movq %r15, 56($out) 1732289848Sjkim 1733289848Sjkim movq 0($mod), %r8 1734289848Sjkim movq 8($mod), %r9 1735289848Sjkim negq %r8 1736289848Sjkim notq %r9 1737289848Sjkim andq %rcx, %r8 1738289848Sjkim movq 16($mod), %r10 1739289848Sjkim andq %rcx, %r9 1740289848Sjkim notq %r10 1741289848Sjkim movq 24($mod), %r11 1742289848Sjkim andq %rcx, %r10 1743289848Sjkim notq %r11 1744289848Sjkim movq 32($mod), %r12 1745289848Sjkim andq %rcx, %r11 1746289848Sjkim notq %r12 1747289848Sjkim movq 40($mod), %r13 1748289848Sjkim andq %rcx, %r12 1749289848Sjkim notq %r13 1750289848Sjkim movq 48($mod), %r14 1751289848Sjkim andq %rcx, %r13 1752289848Sjkim notq %r14 1753289848Sjkim movq 56($mod), %r15 1754289848Sjkim andq %rcx, %r14 1755289848Sjkim notq %r15 1756289848Sjkim andq %rcx, %r15 1757289848Sjkim 1758289848Sjkim addq ($out), %r8 1759289848Sjkim adcq 8($out), %r9 1760289848Sjkim adcq 16($out), %r10 1761289848Sjkim adcq 24($out), %r11 1762289848Sjkim adcq 32($out), %r12 1763289848Sjkim adcq 40($out), %r13 1764289848Sjkim adcq 48($out), %r14 1765289848Sjkim adcq 56($out), %r15 1766289848Sjkim 1767289848Sjkim movq %r8, ($out) 1768289848Sjkim movq %r9, 8($out) 1769289848Sjkim movq %r10, 16($out) 1770289848Sjkim movq %r11, 24($out) 1771289848Sjkim movq %r12, 32($out) 1772289848Sjkim movq %r13, 40($out) 1773289848Sjkim movq %r14, 48($out) 1774289848Sjkim movq %r15, 56($out) 1775289848Sjkim 1776289848Sjkim ret 1777289848Sjkim.size __rsaz_512_subtract,.-__rsaz_512_subtract 1778289848Sjkim___ 1779289848Sjkim} 1780289848Sjkim{ # __rsaz_512_mul 1781289848Sjkim # 1782289848Sjkim # input: %rsi - ap, %rbp - bp 1783289848Sjkim # ouput: 1784289848Sjkim # clobbers: everything 
1785289848Sjkimmy ($ap,$bp) = ("%rsi","%rbp"); 1786289848Sjkim$code.=<<___; 1787289848Sjkim.type __rsaz_512_mul,\@abi-omnipotent 1788289848Sjkim.align 32 1789289848Sjkim__rsaz_512_mul: 1790289848Sjkim leaq 8(%rsp), %rdi 1791289848Sjkim 1792289848Sjkim movq ($ap), %rax 1793289848Sjkim mulq %rbx 1794289848Sjkim movq %rax, (%rdi) 1795289848Sjkim movq 8($ap), %rax 1796289848Sjkim movq %rdx, %r8 1797289848Sjkim 1798289848Sjkim mulq %rbx 1799289848Sjkim addq %rax, %r8 1800289848Sjkim movq 16($ap), %rax 1801289848Sjkim movq %rdx, %r9 1802289848Sjkim adcq \$0, %r9 1803289848Sjkim 1804289848Sjkim mulq %rbx 1805289848Sjkim addq %rax, %r9 1806289848Sjkim movq 24($ap), %rax 1807289848Sjkim movq %rdx, %r10 1808289848Sjkim adcq \$0, %r10 1809289848Sjkim 1810289848Sjkim mulq %rbx 1811289848Sjkim addq %rax, %r10 1812289848Sjkim movq 32($ap), %rax 1813289848Sjkim movq %rdx, %r11 1814289848Sjkim adcq \$0, %r11 1815289848Sjkim 1816289848Sjkim mulq %rbx 1817289848Sjkim addq %rax, %r11 1818289848Sjkim movq 40($ap), %rax 1819289848Sjkim movq %rdx, %r12 1820289848Sjkim adcq \$0, %r12 1821289848Sjkim 1822289848Sjkim mulq %rbx 1823289848Sjkim addq %rax, %r12 1824289848Sjkim movq 48($ap), %rax 1825289848Sjkim movq %rdx, %r13 1826289848Sjkim adcq \$0, %r13 1827289848Sjkim 1828289848Sjkim mulq %rbx 1829289848Sjkim addq %rax, %r13 1830289848Sjkim movq 56($ap), %rax 1831289848Sjkim movq %rdx, %r14 1832289848Sjkim adcq \$0, %r14 1833289848Sjkim 1834289848Sjkim mulq %rbx 1835289848Sjkim addq %rax, %r14 1836289848Sjkim movq ($ap), %rax 1837289848Sjkim movq %rdx, %r15 1838289848Sjkim adcq \$0, %r15 1839289848Sjkim 1840289848Sjkim leaq 8($bp), $bp 1841289848Sjkim leaq 8(%rdi), %rdi 1842289848Sjkim 1843289848Sjkim movl \$7, %ecx 1844289848Sjkim jmp .Loop_mul 1845289848Sjkim 1846289848Sjkim.align 32 1847289848Sjkim.Loop_mul: 1848289848Sjkim movq ($bp), %rbx 1849289848Sjkim mulq %rbx 1850289848Sjkim addq %rax, %r8 1851289848Sjkim movq 8($ap), %rax 1852289848Sjkim movq %r8, (%rdi) 1853289848Sjkim movq 
%rdx, %r8
	adcq	\$0, %r8

	mulq	%rbx
	addq	%rax, %r9
	movq	16($ap), %rax
	adcq	\$0, %rdx
	addq	%r9, %r8
	movq	%rdx, %r9
	adcq	\$0, %r9

	mulq	%rbx
	addq	%rax, %r10
	movq	24($ap), %rax
	adcq	\$0, %rdx
	addq	%r10, %r9
	movq	%rdx, %r10
	adcq	\$0, %r10

	mulq	%rbx
	addq	%rax, %r11
	movq	32($ap), %rax
	adcq	\$0, %rdx
	addq	%r11, %r10
	movq	%rdx, %r11
	adcq	\$0, %r11

	mulq	%rbx
	addq	%rax, %r12
	movq	40($ap), %rax
	adcq	\$0, %rdx
	addq	%r12, %r11
	movq	%rdx, %r12
	adcq	\$0, %r12

	mulq	%rbx
	addq	%rax, %r13
	movq	48($ap), %rax
	adcq	\$0, %rdx
	addq	%r13, %r12
	movq	%rdx, %r13
	adcq	\$0, %r13

	mulq	%rbx
	addq	%rax, %r14
	movq	56($ap), %rax
	adcq	\$0, %rdx
	addq	%r14, %r13
	movq	%rdx, %r14
	leaq	8($bp), $bp		# advance to next word of b[]
	adcq	\$0, %r14

	mulq	%rbx
	addq	%rax, %r15
	movq	($ap), %rax
	adcq	\$0, %rdx
	addq	%r15, %r14
	movq	%rdx, %r15
	adcq	\$0, %r15

	leaq	8(%rdi), %rdi

	decl	%ecx
	jnz	.Loop_mul

	# loop done: flush the remaining high limbs %r8..%r15 of the
	# 1024-bit product to the output
	movq	%r8, (%rdi)
	movq	%r9, 8(%rdi)
	movq	%r10, 16(%rdi)
	movq	%r11, 24(%rdi)
	movq	%r12, 32(%rdi)
	movq	%r13, 40(%rdi)
	movq	%r14, 48(%rdi)
	movq	%r15, 56(%rdi)

	ret
.size	__rsaz_512_mul,.-__rsaz_512_mul
___
}
if ($addx) {
# __rsaz_512_mulx
#
# 512x512-bit multiply using BMI2 mulx + ADX adcx/adox dual carry chains.
#
# input: %rsi - ap, %rbp - bp  (first word of b[] preloaded in %rdx by caller)
# output:
# clobbers: everything
my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
$code.=<<___;
.type	__rsaz_512_mulx,\@abi-omnipotent
.align	32
__rsaz_512_mulx:
	mulx	($ap), %rbx, %r8	# initial %rdx preloaded by caller
	mov	\$-6, %rcx		# loop counter: -6..-1, first/last rounds are peeled

	mulx	8($ap), %rax, %r9
	movq	%rbx, 8(%rsp)

	mulx	16($ap), %rbx, %r10
	adc	%rax, %r8

	mulx	24($ap), %rax, %r11
	adc	%rbx, %r9

	mulx	32($ap), %rbx, %r12
	adc	%rax, %r10

	mulx	40($ap), %rax, %r13
	adc	%rbx, %r11

	mulx	48($ap), %rbx, %r14
	adc	%rax, %r12

	mulx	56($ap), %rax, %r15
	mov	8($bp), %rdx		# preload b[1] for the first loop round
	adc	%rbx, %r13
	adc	%rax, %r14
	adc	\$0, %r15

	xor	$zero, $zero		# cf=0,of=0

	jmp	.Loop_mulx

.align	32
.Loop_mulx:
	movq	%r8, %rbx
	mulx	($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	mulx	8($ap), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	mulx	16($ap), %rax, %r10
	adcx	%rax, %r9
	adox	%r11, %r10

	mulx	24($ap), %rax, %r11
	adcx	%rax, %r10
	adox	%r12, %r11

	# NOTE(review): hand-assembled mulx encoding (extra prefix byte),
	# presumably to control code size/alignment of the loop -- confirm
	.byte	0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00	# mulx	32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	mulx	48($ap), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	mulx	56($ap), %rax, %r15
	movq	64($bp,%rcx,8), %rdx	# load next word of b[] (b[2]..b[7] as %rcx runs -6..-1)
	movq	%rbx, 8+64-8(%rsp,%rcx,8)
	adcx	%rax, %r14
	adox	$zero, %r15
	adcx	$zero, %r15		# cf=0

	inc	%rcx			# of=0
	jnz	.Loop_mulx

	# peeled final round for the last word of b[]
	movq	%r8, %rbx
	mulx	($ap), %rax, %r8
	adcx	%rax, %rbx
	adox	%r9, %r8

	.byte	0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00	# mulx	8($ap), %rax, %r9
	adcx	%rax, %r8
	adox	%r10, %r9

	.byte	0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00	# mulx	16($ap), %rax, %r10
	adcx	%rax, %r9
	adox	%r11, %r10

	mulx	24($ap), %rax, %r11
	adcx	%rax, %r10
	adox	%r12, %r11

	mulx	32($ap), %rax, %r12
	adcx	%rax, %r11
	adox	%r13, %r12

	mulx	40($ap), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13

	.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($ap), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14

	.byte	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00	# mulx	56($ap), %rax, %r15
	adcx	%rax, %r14
	adox	$zero, %r15
	adcx	$zero, %r15

	# store the upper half of the product on the stack at 8+64(%rsp)
	mov	%rbx, 8+64-8(%rsp)
	mov	%r8, 8+64(%rsp)
	mov	%r9, 8+64+8(%rsp)
	mov	%r10, 8+64+16(%rsp)
	mov	%r11, 8+64+24(%rsp)
	mov	%r12, 8+64+32(%rsp)
	mov	%r13, 8+64+40(%rsp)
	mov	%r14, 8+64+48(%rsp)
	mov	%r15, 8+64+56(%rsp)

	ret
.size	__rsaz_512_mulx,.-__rsaz_512_mulx
___
}
{
my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
$code.=<<___;
.globl	rsaz_512_scatter4
.type	rsaz_512_scatter4,\@abi-omnipotent
.align	16
rsaz_512_scatter4:
	# scatter the 8-quadword input into table entry \$power:
	# one word per 128-byte table line, starting at 8*\$power
	leaq	($out,$power,8), $out
	movl	\$8, %r9d
	jmp	.Loop_scatter
.align	16
.Loop_scatter:
	movq	($inp), %rax
	leaq	8($inp), $inp
	movq	%rax, ($out)
	leaq	128($out), $out
	decl	%r9d
	jnz	.Loop_scatter
	ret
.size	rsaz_512_scatter4,.-rsaz_512_scatter4

.globl	rsaz_512_gather4
.type	rsaz_512_gather4,\@abi-omnipotent
.align	16
rsaz_512_gather4:
___
$code.=<<___ if ($win64);
.LSEH_begin_rsaz_512_gather4:
	# hand-encoded prologue: allocate save area and spill xmm6-xmm15
	# (non-volatile under the Win64 ABI)
	.byte	0x48,0x81,0xec,0xa8,0x00,0x00,0x00	# sub    $0xa8,%rsp
	.byte	0x0f,0x29,0x34,0x24			# movaps %xmm6,(%rsp)
	.byte	0x0f,0x29,0x7c,0x24,0x10		# movaps %xmm7,0x10(%rsp)
	.byte	0x44,0x0f,0x29,0x44,0x24,0x20		# movaps %xmm8,0x20(%rsp)
	.byte	0x44,0x0f,0x29,0x4c,0x24,0x30		# movaps %xmm9,0x30(%rsp)
	.byte	0x44,0x0f,0x29,0x54,0x24,0x40		# movaps %xmm10,0x40(%rsp)
	.byte	0x44,0x0f,0x29,0x5c,0x24,0x50		# movaps %xmm11,0x50(%rsp)
	.byte	0x44,0x0f,0x29,0x64,0x24,0x60		# movaps %xmm12,0x60(%rsp)
	.byte	0x44,0x0f,0x29,0x6c,0x24,0x70		# movaps %xmm13,0x70(%rsp)
	.byte	0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0	# movaps %xmm14,0x80(%rsp)
	.byte	0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0	# movaps %xmm15,0x90(%rsp)
___
$code.=<<___;
	movd	$power,%xmm8
	movdqa	.Linc+16(%rip),%xmm1	# 00000002000000020000000200000002
	movdqa	.Linc(%rip),%xmm0	# 00000001000000010000000000000000

	pshufd	\$0,%xmm8,%xmm8		# broadcast $power
	movdqa	%xmm1,%xmm7
	movdqa	%xmm1,%xmm2
___
########################################################################
# calculate mask by comparing 0..15 to $power
# (xmm0..xmm7 end up holding all-ones in the lane(s) matching $power,
# zeroes elsewhere -- used below to select one table entry branchlessly)
#
for($i=0;$i<4;$i++) {
$code.=<<___;
	paddd	%xmm`$i`,%xmm`$i+1`
	pcmpeqd	%xmm8,%xmm`$i`
	movdqa	%xmm7,%xmm`$i+3`
___
}
for(;$i<7;$i++) {
$code.=<<___;
	paddd	%xmm`$i`,%xmm`$i+1`
	pcmpeqd	%xmm8,%xmm`$i`
___
}
$code.=<<___;
	pcmpeqd	%xmm8,%xmm7
	movl	\$8, %r9d
	jmp	.Loop_gather
.align	16
.Loop_gather:
	# gather 8 quadwords; every one of the 16 table slots per line is
	# read and AND-masked, so the memory access pattern does not depend
	# on the secret index \$power (cache-timing defense)
	movdqa	16*0($inp),%xmm8
	movdqa	16*1($inp),%xmm9
	movdqa	16*2($inp),%xmm10
	movdqa	16*3($inp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	16*4($inp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	16*5($inp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	16*6($inp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	16*7($inp),%xmm15
	leaq	128($inp), $inp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	# fold the OR-tree down to the single selected quadword
	por	%xmm9,%xmm8
	pshufd	\$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
	movq	%xmm8,($out)
	leaq	8($out), $out
	decl	%r9d
	jnz	.Loop_gather
___
$code.=<<___ if ($win64);
	movaps	0x00(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	movaps	0x50(%rsp),%xmm11
	movaps	0x60(%rsp),%xmm12
	movaps	0x70(%rsp),%xmm13
	movaps	0x80(%rsp),%xmm14
	movaps	0x90(%rsp),%xmm15
	add	\$0xa8,%rsp
___
$code.=<<___;
	ret
.LSEH_end_rsaz_512_gather4:
.size	rsaz_512_gather4,.-rsaz_512_gather4

.align	64
.Linc:
	.long	0,0, 1,1
	.long	2,2, 2,2
___
}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
# se_handler: Win64 unwind callback shared by the rsaz_512_* entry points.
# HandlerData[] (see .xdata below) carries the prologue-end and epilogue
# label offsets that %rip is compared against to decide whether the frame
# has been fully established.
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	128+24+48(%rax),%rax

	lea	.Lmul_gather4_epilogue(%rip),%rbx
	cmp	%r10,%rbx
	jne	.Lse_not_in_mul_gather4

	# faulting inside rsaz_512_mul_gather4: its frame is 0xb0 bytes
	# larger and holds saved xmm registers that must be copied back
	# into the CONTEXT record
	lea	0xb0(%rax),%rax

	lea	-48-0xa8(%rax),%rsi
	lea	512($context),%rdi
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lse_not_in_mul_gather4:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

# .pdata: RUNTIME_FUNCTION entries mapping each exported routine's
# begin/end range to its unwind info below
.section	.pdata
.align	4
	.rva	.LSEH_begin_rsaz_512_sqr
	.rva	.LSEH_end_rsaz_512_sqr
	.rva	.LSEH_info_rsaz_512_sqr

	.rva	.LSEH_begin_rsaz_512_mul
	.rva	.LSEH_end_rsaz_512_mul
	.rva	.LSEH_info_rsaz_512_mul

	.rva	.LSEH_begin_rsaz_512_mul_gather4
	.rva	.LSEH_end_rsaz_512_mul_gather4
	.rva	.LSEH_info_rsaz_512_mul_gather4

	.rva	.LSEH_begin_rsaz_512_mul_scatter4
	.rva	.LSEH_end_rsaz_512_mul_scatter4
	.rva	.LSEH_info_rsaz_512_mul_scatter4

	.rva	.LSEH_begin_rsaz_512_mul_by_one
	.rva	.LSEH_end_rsaz_512_mul_by_one
	.rva	.LSEH_info_rsaz_512_mul_by_one

	.rva	.LSEH_begin_rsaz_512_gather4
	.rva	.LSEH_end_rsaz_512_gather4
	.rva	.LSEH_info_rsaz_512_gather4

.section	.xdata
.align	8
.LSEH_info_rsaz_512_sqr:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lsqr_body,.Lsqr_epilogue			# HandlerData[]
.LSEH_info_rsaz_512_mul:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_body,.Lmul_epilogue			# HandlerData[]
.LSEH_info_rsaz_512_mul_gather4:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_gather4_body,.Lmul_gather4_epilogue	# HandlerData[]
.LSEH_info_rsaz_512_mul_scatter4:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_scatter4_body,.Lmul_scatter4_epilogue	# HandlerData[]
.LSEH_info_rsaz_512_mul_by_one:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lmul_by_one_body,.Lmul_by_one_epilogue		# HandlerData[]
# NOTE(review): hand-encoded UNWIND_INFO for rsaz_512_gather4 describing
# the xmm6-xmm15 save area set up in its prologue above -- offsets appear
# to mirror the movaps spills; confirm against the Win64 unwind-op spec
.LSEH_info_rsaz_512_gather4:
	.byte	0x01,0x46,0x16,0x00
	.byte	0x46,0xf8,0x09,0x00	# vmovaps 0x90(rsp),xmm15
	.byte	0x3d,0xe8,0x08,0x00	# vmovaps 0x80(rsp),xmm14
	.byte	0x34,0xd8,0x07,0x00	# vmovaps 0x70(rsp),xmm13
	.byte	0x2e,0xc8,0x06,0x00	# vmovaps 0x60(rsp),xmm12
	.byte	0x28,0xb8,0x05,0x00	# vmovaps 0x50(rsp),xmm11
	.byte	0x22,0xa8,0x04,0x00	# vmovaps 0x40(rsp),xmm10
	.byte	0x1c,0x98,0x03,0x00	# vmovaps 0x30(rsp),xmm9
	.byte	0x16,0x88,0x02,0x00	# vmovaps 0x20(rsp),xmm8
	.byte	0x10,0x78,0x01,0x00	# vmovaps 0x10(rsp),xmm7
	.byte	0x0b,0x68,0x00,0x00	# vmovaps 0x00(rsp),xmm6
	.byte	0x07,0x01,0x15,0x00	# sub rsp,0xa8
___
}

# expand the `...` expressions (register numbering in the mask loops)
# and emit the accumulated assembly
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;