# rsaz-avx2.pl revision 326663
#!/usr/bin/env perl

##############################################################################
#                                                                            #
#  Copyright (c) 2012, Intel Corporation                                     #
#                                                                            #
#  All rights reserved.                                                      #
#                                                                            #
#  Redistribution and use in source and binary forms, with or without        #
#  modification, are permitted provided that the following conditions are    #
#  met:                                                                      #
#                                                                            #
#  *  Redistributions of source code must retain the above copyright         #
#     notice, this list of conditions and the following disclaimer.          #
#                                                                            #
#  *  Redistributions in binary form must reproduce the above copyright      #
#     notice, this list of conditions and the following disclaimer in the    #
#     documentation and/or other materials provided with the                 #
#     distribution.                                                          #
#                                                                            #
#  *  Neither the name of the Intel Corporation nor the names of its         #
#     contributors may be used to endorse or promote products derived from   #
#     this software without specific prior written permission.               #
#                                                                            #
#                                                                            #
#  THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY            #
#  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE         #
#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR        #
#  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR            #
#  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,     #
#  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,       #
#  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR        #
#  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF    #
#  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING      #
#  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS        #
#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#                                                                            #
##############################################################################
# Developers and authors:                                                    #
# Shay Gueron (1, 2), and Vlad Krasnov (1)                                   #
# (1) Intel Corporation, Israel Development Center, Haifa, Israel            #
# (2) University of Haifa, Israel                                            #
##############################################################################
# Reference:                                                                 #
# [1] S. Gueron, V. Krasnov: "Software Implementation of Modular             #
#     Exponentiation, Using Advanced Vector Instructions Architectures",     #
#     F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369,   #
#     pp. 119-135, 2012. Springer-Verlag Berlin Heidelberg 2012              #
# [2] S. Gueron: "Efficient Software Implementations of Modular              #
#     Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012).  #
# [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring", IEEE        #
#     Proceedings of 9th International Conference on Information Technology: #
#     New Generations (ITNG 2012), pp. 821-823 (2012)                        #
# [4] S. Gueron, V.
# Krasnov: "[PATCH] Efficient and side channel analysis
#     resistant 1024-bit modular exponentiation, for optimizing RSA2048
#     on AVX2 capable x86_64 platforms",
#     http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest
##############################################################################
#
# +13% improvement over original submission by <appro@openssl.org>
#
# rsa2048    sign/sec	OpenSSL 1.0.1	scalar(*)	this
# 2.3GHz Haswell	621		765/+23%	1113/+79%
# 2.3GHz Broadwell(**)	688		1200(***)/+74%	1120/+63%
#
# (*)	if system doesn't support AVX2, for reference purposes;
# (**)	scaled to 2.3GHz to simplify comparison;
# (***)	scalar AD*X code is faster than AVX2 and is preferred code
#	path for Broadwell;

# Standard perlasm calling convention: flavour (elf/macosx/nasm/...) and
# output file name arrive on the command line; a lone dotted argument is
# the output file itself.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64 perlasm translator next to this script or in the
# sibling perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Probe the assembler for AVX2/ADX capability: $avx>=2 enables the AVX2
# code path below, $addx flags ADCX/ADOX support.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
	$addx = ($1>=2.23);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
	$addx = ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
	$addx = ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9])\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$avx = ($ver>=3.0) + ($ver>=3.01);
	$addx = ($ver>=3.03);
}

# Pipe all generated code through the perlasm translator.  Check the open:
# if the pipe cannot be spawned, carrying on would silently emit nothing.
open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
*STDOUT = *OUT;

if
($avx>1) {{{ 110{ # void AMS_WW( 111my $rp="%rdi"; # BN_ULONG *rp, 112my $ap="%rsi"; # const BN_ULONG *ap, 113my $np="%rdx"; # const BN_ULONG *np, 114my $n0="%ecx"; # const BN_ULONG n0, 115my $rep="%r8d"; # int repeat); 116 117# The registers that hold the accumulated redundant result 118# The AMM works on 1024 bit operands, and redundant word size is 29 119# Therefore: ceil(1024/29)/4 = 9 120my $ACC0="%ymm0"; 121my $ACC1="%ymm1"; 122my $ACC2="%ymm2"; 123my $ACC3="%ymm3"; 124my $ACC4="%ymm4"; 125my $ACC5="%ymm5"; 126my $ACC6="%ymm6"; 127my $ACC7="%ymm7"; 128my $ACC8="%ymm8"; 129my $ACC9="%ymm9"; 130# Registers that hold the broadcasted words of bp, currently used 131my $B1="%ymm10"; 132my $B2="%ymm11"; 133# Registers that hold the broadcasted words of Y, currently used 134my $Y1="%ymm12"; 135my $Y2="%ymm13"; 136# Helper registers 137my $TEMP1="%ymm14"; 138my $AND_MASK="%ymm15"; 139# alu registers that hold the first words of the ACC 140my $r0="%r9"; 141my $r1="%r10"; 142my $r2="%r11"; 143my $r3="%r12"; 144 145my $i="%r14d"; # loop counter 146my $tmp = "%r15"; 147 148my $FrameSize=32*18+32*8; # place for A^2 and 2*A 149 150my $aap=$r0; 151my $tp0="%rbx"; 152my $tp1=$r3; 153my $tpa=$tmp; 154 155$np="%r13"; # reassigned argument 156 157$code.=<<___; 158.text 159 160.globl rsaz_1024_sqr_avx2 161.type rsaz_1024_sqr_avx2,\@function,5 162.align 64 163rsaz_1024_sqr_avx2: # 702 cycles, 14% faster than rsaz_1024_mul_avx2 164 lea (%rsp), %rax 165 push %rbx 166 push %rbp 167 push %r12 168 push %r13 169 push %r14 170 push %r15 171 vzeroupper 172___ 173$code.=<<___ if ($win64); 174 lea -0xa8(%rsp),%rsp 175 vmovaps %xmm6,-0xd8(%rax) 176 vmovaps %xmm7,-0xc8(%rax) 177 vmovaps %xmm8,-0xb8(%rax) 178 vmovaps %xmm9,-0xa8(%rax) 179 vmovaps %xmm10,-0x98(%rax) 180 vmovaps %xmm11,-0x88(%rax) 181 vmovaps %xmm12,-0x78(%rax) 182 vmovaps %xmm13,-0x68(%rax) 183 vmovaps %xmm14,-0x58(%rax) 184 vmovaps %xmm15,-0x48(%rax) 185.Lsqr_1024_body: 186___ 187$code.=<<___; 188 mov %rax,%rbp 189 mov %rdx, 
$np # reassigned argument 190 sub \$$FrameSize, %rsp 191 mov $np, $tmp 192 sub \$-128, $rp # size optimization 193 sub \$-128, $ap 194 sub \$-128, $np 195 196 and \$4095, $tmp # see if $np crosses page 197 add \$32*10, $tmp 198 shr \$12, $tmp 199 vpxor $ACC9,$ACC9,$ACC9 200 jz .Lsqr_1024_no_n_copy 201 202 # unaligned 256-bit load that crosses page boundary can 203 # cause >2x performance degradation here, so if $np does 204 # cross page boundary, copy it to stack and make sure stack 205 # frame doesn't... 206 sub \$32*10,%rsp 207 vmovdqu 32*0-128($np), $ACC0 208 and \$-2048, %rsp 209 vmovdqu 32*1-128($np), $ACC1 210 vmovdqu 32*2-128($np), $ACC2 211 vmovdqu 32*3-128($np), $ACC3 212 vmovdqu 32*4-128($np), $ACC4 213 vmovdqu 32*5-128($np), $ACC5 214 vmovdqu 32*6-128($np), $ACC6 215 vmovdqu 32*7-128($np), $ACC7 216 vmovdqu 32*8-128($np), $ACC8 217 lea $FrameSize+128(%rsp),$np 218 vmovdqu $ACC0, 32*0-128($np) 219 vmovdqu $ACC1, 32*1-128($np) 220 vmovdqu $ACC2, 32*2-128($np) 221 vmovdqu $ACC3, 32*3-128($np) 222 vmovdqu $ACC4, 32*4-128($np) 223 vmovdqu $ACC5, 32*5-128($np) 224 vmovdqu $ACC6, 32*6-128($np) 225 vmovdqu $ACC7, 32*7-128($np) 226 vmovdqu $ACC8, 32*8-128($np) 227 vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero 228 229.Lsqr_1024_no_n_copy: 230 and \$-1024, %rsp 231 232 vmovdqu 32*1-128($ap), $ACC1 233 vmovdqu 32*2-128($ap), $ACC2 234 vmovdqu 32*3-128($ap), $ACC3 235 vmovdqu 32*4-128($ap), $ACC4 236 vmovdqu 32*5-128($ap), $ACC5 237 vmovdqu 32*6-128($ap), $ACC6 238 vmovdqu 32*7-128($ap), $ACC7 239 vmovdqu 32*8-128($ap), $ACC8 240 241 lea 192(%rsp), $tp0 # 64+128=192 242 vmovdqu .Land_mask(%rip), $AND_MASK 243 jmp .LOOP_GRANDE_SQR_1024 244 245.align 32 246.LOOP_GRANDE_SQR_1024: 247 lea 32*18+128(%rsp), $aap # size optimization 248 lea 448(%rsp), $tp1 # 64+128+256=448 249 250 # the squaring is performed as described in Variant B of 251 # "Speeding up Big-Number Squaring", so start by calculating 252 # the A*2=A+A vector 253 vpaddq $ACC1, $ACC1, $ACC1 254 vpbroadcastq 
32*0-128($ap), $B1 255 vpaddq $ACC2, $ACC2, $ACC2 256 vmovdqa $ACC1, 32*0-128($aap) 257 vpaddq $ACC3, $ACC3, $ACC3 258 vmovdqa $ACC2, 32*1-128($aap) 259 vpaddq $ACC4, $ACC4, $ACC4 260 vmovdqa $ACC3, 32*2-128($aap) 261 vpaddq $ACC5, $ACC5, $ACC5 262 vmovdqa $ACC4, 32*3-128($aap) 263 vpaddq $ACC6, $ACC6, $ACC6 264 vmovdqa $ACC5, 32*4-128($aap) 265 vpaddq $ACC7, $ACC7, $ACC7 266 vmovdqa $ACC6, 32*5-128($aap) 267 vpaddq $ACC8, $ACC8, $ACC8 268 vmovdqa $ACC7, 32*6-128($aap) 269 vpxor $ACC9, $ACC9, $ACC9 270 vmovdqa $ACC8, 32*7-128($aap) 271 272 vpmuludq 32*0-128($ap), $B1, $ACC0 273 vpbroadcastq 32*1-128($ap), $B2 274 vmovdqu $ACC9, 32*9-192($tp0) # zero upper half 275 vpmuludq $B1, $ACC1, $ACC1 276 vmovdqu $ACC9, 32*10-448($tp1) 277 vpmuludq $B1, $ACC2, $ACC2 278 vmovdqu $ACC9, 32*11-448($tp1) 279 vpmuludq $B1, $ACC3, $ACC3 280 vmovdqu $ACC9, 32*12-448($tp1) 281 vpmuludq $B1, $ACC4, $ACC4 282 vmovdqu $ACC9, 32*13-448($tp1) 283 vpmuludq $B1, $ACC5, $ACC5 284 vmovdqu $ACC9, 32*14-448($tp1) 285 vpmuludq $B1, $ACC6, $ACC6 286 vmovdqu $ACC9, 32*15-448($tp1) 287 vpmuludq $B1, $ACC7, $ACC7 288 vmovdqu $ACC9, 32*16-448($tp1) 289 vpmuludq $B1, $ACC8, $ACC8 290 vpbroadcastq 32*2-128($ap), $B1 291 vmovdqu $ACC9, 32*17-448($tp1) 292 293 mov $ap, $tpa 294 mov \$4, $i 295 jmp .Lsqr_entry_1024 296___ 297$TEMP0=$Y1; 298$TEMP2=$Y2; 299$code.=<<___; 300.align 32 301.LOOP_SQR_1024: 302 vpbroadcastq 32*1-128($tpa), $B2 303 vpmuludq 32*0-128($ap), $B1, $ACC0 304 vpaddq 32*0-192($tp0), $ACC0, $ACC0 305 vpmuludq 32*0-128($aap), $B1, $ACC1 306 vpaddq 32*1-192($tp0), $ACC1, $ACC1 307 vpmuludq 32*1-128($aap), $B1, $ACC2 308 vpaddq 32*2-192($tp0), $ACC2, $ACC2 309 vpmuludq 32*2-128($aap), $B1, $ACC3 310 vpaddq 32*3-192($tp0), $ACC3, $ACC3 311 vpmuludq 32*3-128($aap), $B1, $ACC4 312 vpaddq 32*4-192($tp0), $ACC4, $ACC4 313 vpmuludq 32*4-128($aap), $B1, $ACC5 314 vpaddq 32*5-192($tp0), $ACC5, $ACC5 315 vpmuludq 32*5-128($aap), $B1, $ACC6 316 vpaddq 32*6-192($tp0), $ACC6, $ACC6 317 vpmuludq 
32*6-128($aap), $B1, $ACC7 318 vpaddq 32*7-192($tp0), $ACC7, $ACC7 319 vpmuludq 32*7-128($aap), $B1, $ACC8 320 vpbroadcastq 32*2-128($tpa), $B1 321 vpaddq 32*8-192($tp0), $ACC8, $ACC8 322.Lsqr_entry_1024: 323 vmovdqu $ACC0, 32*0-192($tp0) 324 vmovdqu $ACC1, 32*1-192($tp0) 325 326 vpmuludq 32*1-128($ap), $B2, $TEMP0 327 vpaddq $TEMP0, $ACC2, $ACC2 328 vpmuludq 32*1-128($aap), $B2, $TEMP1 329 vpaddq $TEMP1, $ACC3, $ACC3 330 vpmuludq 32*2-128($aap), $B2, $TEMP2 331 vpaddq $TEMP2, $ACC4, $ACC4 332 vpmuludq 32*3-128($aap), $B2, $TEMP0 333 vpaddq $TEMP0, $ACC5, $ACC5 334 vpmuludq 32*4-128($aap), $B2, $TEMP1 335 vpaddq $TEMP1, $ACC6, $ACC6 336 vpmuludq 32*5-128($aap), $B2, $TEMP2 337 vpaddq $TEMP2, $ACC7, $ACC7 338 vpmuludq 32*6-128($aap), $B2, $TEMP0 339 vpaddq $TEMP0, $ACC8, $ACC8 340 vpmuludq 32*7-128($aap), $B2, $ACC0 341 vpbroadcastq 32*3-128($tpa), $B2 342 vpaddq 32*9-192($tp0), $ACC0, $ACC0 343 344 vmovdqu $ACC2, 32*2-192($tp0) 345 vmovdqu $ACC3, 32*3-192($tp0) 346 347 vpmuludq 32*2-128($ap), $B1, $TEMP2 348 vpaddq $TEMP2, $ACC4, $ACC4 349 vpmuludq 32*2-128($aap), $B1, $TEMP0 350 vpaddq $TEMP0, $ACC5, $ACC5 351 vpmuludq 32*3-128($aap), $B1, $TEMP1 352 vpaddq $TEMP1, $ACC6, $ACC6 353 vpmuludq 32*4-128($aap), $B1, $TEMP2 354 vpaddq $TEMP2, $ACC7, $ACC7 355 vpmuludq 32*5-128($aap), $B1, $TEMP0 356 vpaddq $TEMP0, $ACC8, $ACC8 357 vpmuludq 32*6-128($aap), $B1, $TEMP1 358 vpaddq $TEMP1, $ACC0, $ACC0 359 vpmuludq 32*7-128($aap), $B1, $ACC1 360 vpbroadcastq 32*4-128($tpa), $B1 361 vpaddq 32*10-448($tp1), $ACC1, $ACC1 362 363 vmovdqu $ACC4, 32*4-192($tp0) 364 vmovdqu $ACC5, 32*5-192($tp0) 365 366 vpmuludq 32*3-128($ap), $B2, $TEMP0 367 vpaddq $TEMP0, $ACC6, $ACC6 368 vpmuludq 32*3-128($aap), $B2, $TEMP1 369 vpaddq $TEMP1, $ACC7, $ACC7 370 vpmuludq 32*4-128($aap), $B2, $TEMP2 371 vpaddq $TEMP2, $ACC8, $ACC8 372 vpmuludq 32*5-128($aap), $B2, $TEMP0 373 vpaddq $TEMP0, $ACC0, $ACC0 374 vpmuludq 32*6-128($aap), $B2, $TEMP1 375 vpaddq $TEMP1, $ACC1, $ACC1 376 vpmuludq 
32*7-128($aap), $B2, $ACC2 377 vpbroadcastq 32*5-128($tpa), $B2 378 vpaddq 32*11-448($tp1), $ACC2, $ACC2 379 380 vmovdqu $ACC6, 32*6-192($tp0) 381 vmovdqu $ACC7, 32*7-192($tp0) 382 383 vpmuludq 32*4-128($ap), $B1, $TEMP0 384 vpaddq $TEMP0, $ACC8, $ACC8 385 vpmuludq 32*4-128($aap), $B1, $TEMP1 386 vpaddq $TEMP1, $ACC0, $ACC0 387 vpmuludq 32*5-128($aap), $B1, $TEMP2 388 vpaddq $TEMP2, $ACC1, $ACC1 389 vpmuludq 32*6-128($aap), $B1, $TEMP0 390 vpaddq $TEMP0, $ACC2, $ACC2 391 vpmuludq 32*7-128($aap), $B1, $ACC3 392 vpbroadcastq 32*6-128($tpa), $B1 393 vpaddq 32*12-448($tp1), $ACC3, $ACC3 394 395 vmovdqu $ACC8, 32*8-192($tp0) 396 vmovdqu $ACC0, 32*9-192($tp0) 397 lea 8($tp0), $tp0 398 399 vpmuludq 32*5-128($ap), $B2, $TEMP2 400 vpaddq $TEMP2, $ACC1, $ACC1 401 vpmuludq 32*5-128($aap), $B2, $TEMP0 402 vpaddq $TEMP0, $ACC2, $ACC2 403 vpmuludq 32*6-128($aap), $B2, $TEMP1 404 vpaddq $TEMP1, $ACC3, $ACC3 405 vpmuludq 32*7-128($aap), $B2, $ACC4 406 vpbroadcastq 32*7-128($tpa), $B2 407 vpaddq 32*13-448($tp1), $ACC4, $ACC4 408 409 vmovdqu $ACC1, 32*10-448($tp1) 410 vmovdqu $ACC2, 32*11-448($tp1) 411 412 vpmuludq 32*6-128($ap), $B1, $TEMP0 413 vpaddq $TEMP0, $ACC3, $ACC3 414 vpmuludq 32*6-128($aap), $B1, $TEMP1 415 vpbroadcastq 32*8-128($tpa), $ACC0 # borrow $ACC0 for $B1 416 vpaddq $TEMP1, $ACC4, $ACC4 417 vpmuludq 32*7-128($aap), $B1, $ACC5 418 vpbroadcastq 32*0+8-128($tpa), $B1 # for next iteration 419 vpaddq 32*14-448($tp1), $ACC5, $ACC5 420 421 vmovdqu $ACC3, 32*12-448($tp1) 422 vmovdqu $ACC4, 32*13-448($tp1) 423 lea 8($tpa), $tpa 424 425 vpmuludq 32*7-128($ap), $B2, $TEMP0 426 vpaddq $TEMP0, $ACC5, $ACC5 427 vpmuludq 32*7-128($aap), $B2, $ACC6 428 vpaddq 32*15-448($tp1), $ACC6, $ACC6 429 430 vpmuludq 32*8-128($ap), $ACC0, $ACC7 431 vmovdqu $ACC5, 32*14-448($tp1) 432 vpaddq 32*16-448($tp1), $ACC7, $ACC7 433 vmovdqu $ACC6, 32*15-448($tp1) 434 vmovdqu $ACC7, 32*16-448($tp1) 435 lea 8($tp1), $tp1 436 437 dec $i 438 jnz .LOOP_SQR_1024 439___ 440$ZERO = $ACC9; 441$TEMP0 = $B1; 
442$TEMP2 = $B2; 443$TEMP3 = $Y1; 444$TEMP4 = $Y2; 445$code.=<<___; 446 # we need to fix indices 32-39 to avoid overflow 447 vmovdqu 32*8(%rsp), $ACC8 # 32*8-192($tp0), 448 vmovdqu 32*9(%rsp), $ACC1 # 32*9-192($tp0) 449 vmovdqu 32*10(%rsp), $ACC2 # 32*10-192($tp0) 450 lea 192(%rsp), $tp0 # 64+128=192 451 452 vpsrlq \$29, $ACC8, $TEMP1 453 vpand $AND_MASK, $ACC8, $ACC8 454 vpsrlq \$29, $ACC1, $TEMP2 455 vpand $AND_MASK, $ACC1, $ACC1 456 457 vpermq \$0x93, $TEMP1, $TEMP1 458 vpxor $ZERO, $ZERO, $ZERO 459 vpermq \$0x93, $TEMP2, $TEMP2 460 461 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 462 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 463 vpaddq $TEMP0, $ACC8, $ACC8 464 vpblendd \$3, $TEMP2, $ZERO, $TEMP2 465 vpaddq $TEMP1, $ACC1, $ACC1 466 vpaddq $TEMP2, $ACC2, $ACC2 467 vmovdqu $ACC1, 32*9-192($tp0) 468 vmovdqu $ACC2, 32*10-192($tp0) 469 470 mov (%rsp), %rax 471 mov 8(%rsp), $r1 472 mov 16(%rsp), $r2 473 mov 24(%rsp), $r3 474 vmovdqu 32*1(%rsp), $ACC1 475 vmovdqu 32*2-192($tp0), $ACC2 476 vmovdqu 32*3-192($tp0), $ACC3 477 vmovdqu 32*4-192($tp0), $ACC4 478 vmovdqu 32*5-192($tp0), $ACC5 479 vmovdqu 32*6-192($tp0), $ACC6 480 vmovdqu 32*7-192($tp0), $ACC7 481 482 mov %rax, $r0 483 imull $n0, %eax 484 and \$0x1fffffff, %eax 485 vmovd %eax, $Y1 486 487 mov %rax, %rdx 488 imulq -128($np), %rax 489 vpbroadcastq $Y1, $Y1 490 add %rax, $r0 491 mov %rdx, %rax 492 imulq 8-128($np), %rax 493 shr \$29, $r0 494 add %rax, $r1 495 mov %rdx, %rax 496 imulq 16-128($np), %rax 497 add $r0, $r1 498 add %rax, $r2 499 imulq 24-128($np), %rdx 500 add %rdx, $r3 501 502 mov $r1, %rax 503 imull $n0, %eax 504 and \$0x1fffffff, %eax 505 506 mov \$9, $i 507 jmp .LOOP_REDUCE_1024 508 509.align 32 510.LOOP_REDUCE_1024: 511 vmovd %eax, $Y2 512 vpbroadcastq $Y2, $Y2 513 514 vpmuludq 32*1-128($np), $Y1, $TEMP0 515 mov %rax, %rdx 516 imulq -128($np), %rax 517 vpaddq $TEMP0, $ACC1, $ACC1 518 add %rax, $r1 519 vpmuludq 32*2-128($np), $Y1, $TEMP1 520 mov %rdx, %rax 521 imulq 8-128($np), %rax 522 vpaddq $TEMP1, $ACC2, 
$ACC2 523 vpmuludq 32*3-128($np), $Y1, $TEMP2 524 .byte 0x67 525 add %rax, $r2 526 .byte 0x67 527 mov %rdx, %rax 528 imulq 16-128($np), %rax 529 shr \$29, $r1 530 vpaddq $TEMP2, $ACC3, $ACC3 531 vpmuludq 32*4-128($np), $Y1, $TEMP0 532 add %rax, $r3 533 add $r1, $r2 534 vpaddq $TEMP0, $ACC4, $ACC4 535 vpmuludq 32*5-128($np), $Y1, $TEMP1 536 mov $r2, %rax 537 imull $n0, %eax 538 vpaddq $TEMP1, $ACC5, $ACC5 539 vpmuludq 32*6-128($np), $Y1, $TEMP2 540 and \$0x1fffffff, %eax 541 vpaddq $TEMP2, $ACC6, $ACC6 542 vpmuludq 32*7-128($np), $Y1, $TEMP0 543 vpaddq $TEMP0, $ACC7, $ACC7 544 vpmuludq 32*8-128($np), $Y1, $TEMP1 545 vmovd %eax, $Y1 546 #vmovdqu 32*1-8-128($np), $TEMP2 # moved below 547 vpaddq $TEMP1, $ACC8, $ACC8 548 #vmovdqu 32*2-8-128($np), $TEMP0 # moved below 549 vpbroadcastq $Y1, $Y1 550 551 vpmuludq 32*1-8-128($np), $Y2, $TEMP2 # see above 552 vmovdqu 32*3-8-128($np), $TEMP1 553 mov %rax, %rdx 554 imulq -128($np), %rax 555 vpaddq $TEMP2, $ACC1, $ACC1 556 vpmuludq 32*2-8-128($np), $Y2, $TEMP0 # see above 557 vmovdqu 32*4-8-128($np), $TEMP2 558 add %rax, $r2 559 mov %rdx, %rax 560 imulq 8-128($np), %rax 561 vpaddq $TEMP0, $ACC2, $ACC2 562 add $r3, %rax 563 shr \$29, $r2 564 vpmuludq $Y2, $TEMP1, $TEMP1 565 vmovdqu 32*5-8-128($np), $TEMP0 566 add $r2, %rax 567 vpaddq $TEMP1, $ACC3, $ACC3 568 vpmuludq $Y2, $TEMP2, $TEMP2 569 vmovdqu 32*6-8-128($np), $TEMP1 570 .byte 0x67 571 mov %rax, $r3 572 imull $n0, %eax 573 vpaddq $TEMP2, $ACC4, $ACC4 574 vpmuludq $Y2, $TEMP0, $TEMP0 575 .byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 # vmovdqu 32*7-8-128($np), $TEMP2 576 and \$0x1fffffff, %eax 577 vpaddq $TEMP0, $ACC5, $ACC5 578 vpmuludq $Y2, $TEMP1, $TEMP1 579 vmovdqu 32*8-8-128($np), $TEMP0 580 vpaddq $TEMP1, $ACC6, $ACC6 581 vpmuludq $Y2, $TEMP2, $TEMP2 582 vmovdqu 32*9-8-128($np), $ACC9 583 vmovd %eax, $ACC0 # borrow ACC0 for Y2 584 imulq -128($np), %rax 585 vpaddq $TEMP2, $ACC7, $ACC7 586 vpmuludq $Y2, $TEMP0, $TEMP0 587 vmovdqu 32*1-16-128($np), $TEMP1 588 
vpbroadcastq $ACC0, $ACC0 589 vpaddq $TEMP0, $ACC8, $ACC8 590 vpmuludq $Y2, $ACC9, $ACC9 591 vmovdqu 32*2-16-128($np), $TEMP2 592 add %rax, $r3 593 594___ 595($ACC0,$Y2)=($Y2,$ACC0); 596$code.=<<___; 597 vmovdqu 32*1-24-128($np), $ACC0 598 vpmuludq $Y1, $TEMP1, $TEMP1 599 vmovdqu 32*3-16-128($np), $TEMP0 600 vpaddq $TEMP1, $ACC1, $ACC1 601 vpmuludq $Y2, $ACC0, $ACC0 602 vpmuludq $Y1, $TEMP2, $TEMP2 603 .byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff # vmovdqu 32*4-16-128($np), $TEMP1 604 vpaddq $ACC1, $ACC0, $ACC0 605 vpaddq $TEMP2, $ACC2, $ACC2 606 vpmuludq $Y1, $TEMP0, $TEMP0 607 vmovdqu 32*5-16-128($np), $TEMP2 608 .byte 0x67 609 vmovq $ACC0, %rax 610 vmovdqu $ACC0, (%rsp) # transfer $r0-$r3 611 vpaddq $TEMP0, $ACC3, $ACC3 612 vpmuludq $Y1, $TEMP1, $TEMP1 613 vmovdqu 32*6-16-128($np), $TEMP0 614 vpaddq $TEMP1, $ACC4, $ACC4 615 vpmuludq $Y1, $TEMP2, $TEMP2 616 vmovdqu 32*7-16-128($np), $TEMP1 617 vpaddq $TEMP2, $ACC5, $ACC5 618 vpmuludq $Y1, $TEMP0, $TEMP0 619 vmovdqu 32*8-16-128($np), $TEMP2 620 vpaddq $TEMP0, $ACC6, $ACC6 621 vpmuludq $Y1, $TEMP1, $TEMP1 622 shr \$29, $r3 623 vmovdqu 32*9-16-128($np), $TEMP0 624 add $r3, %rax 625 vpaddq $TEMP1, $ACC7, $ACC7 626 vpmuludq $Y1, $TEMP2, $TEMP2 627 #vmovdqu 32*2-24-128($np), $TEMP1 # moved below 628 mov %rax, $r0 629 imull $n0, %eax 630 vpaddq $TEMP2, $ACC8, $ACC8 631 vpmuludq $Y1, $TEMP0, $TEMP0 632 and \$0x1fffffff, %eax 633 vmovd %eax, $Y1 634 vmovdqu 32*3-24-128($np), $TEMP2 635 .byte 0x67 636 vpaddq $TEMP0, $ACC9, $ACC9 637 vpbroadcastq $Y1, $Y1 638 639 vpmuludq 32*2-24-128($np), $Y2, $TEMP1 # see above 640 vmovdqu 32*4-24-128($np), $TEMP0 641 mov %rax, %rdx 642 imulq -128($np), %rax 643 mov 8(%rsp), $r1 644 vpaddq $TEMP1, $ACC2, $ACC1 645 vpmuludq $Y2, $TEMP2, $TEMP2 646 vmovdqu 32*5-24-128($np), $TEMP1 647 add %rax, $r0 648 mov %rdx, %rax 649 imulq 8-128($np), %rax 650 .byte 0x67 651 shr \$29, $r0 652 mov 16(%rsp), $r2 653 vpaddq $TEMP2, $ACC3, $ACC2 654 vpmuludq $Y2, $TEMP0, $TEMP0 655 vmovdqu 
32*6-24-128($np), $TEMP2 656 add %rax, $r1 657 mov %rdx, %rax 658 imulq 16-128($np), %rax 659 vpaddq $TEMP0, $ACC4, $ACC3 660 vpmuludq $Y2, $TEMP1, $TEMP1 661 vmovdqu 32*7-24-128($np), $TEMP0 662 imulq 24-128($np), %rdx # future $r3 663 add %rax, $r2 664 lea ($r0,$r1), %rax 665 vpaddq $TEMP1, $ACC5, $ACC4 666 vpmuludq $Y2, $TEMP2, $TEMP2 667 vmovdqu 32*8-24-128($np), $TEMP1 668 mov %rax, $r1 669 imull $n0, %eax 670 vpmuludq $Y2, $TEMP0, $TEMP0 671 vpaddq $TEMP2, $ACC6, $ACC5 672 vmovdqu 32*9-24-128($np), $TEMP2 673 and \$0x1fffffff, %eax 674 vpaddq $TEMP0, $ACC7, $ACC6 675 vpmuludq $Y2, $TEMP1, $TEMP1 676 add 24(%rsp), %rdx 677 vpaddq $TEMP1, $ACC8, $ACC7 678 vpmuludq $Y2, $TEMP2, $TEMP2 679 vpaddq $TEMP2, $ACC9, $ACC8 680 vmovq $r3, $ACC9 681 mov %rdx, $r3 682 683 dec $i 684 jnz .LOOP_REDUCE_1024 685___ 686($ACC0,$Y2)=($Y2,$ACC0); 687$code.=<<___; 688 lea 448(%rsp), $tp1 # size optimization 689 vpaddq $ACC9, $Y2, $ACC0 690 vpxor $ZERO, $ZERO, $ZERO 691 692 vpaddq 32*9-192($tp0), $ACC0, $ACC0 693 vpaddq 32*10-448($tp1), $ACC1, $ACC1 694 vpaddq 32*11-448($tp1), $ACC2, $ACC2 695 vpaddq 32*12-448($tp1), $ACC3, $ACC3 696 vpaddq 32*13-448($tp1), $ACC4, $ACC4 697 vpaddq 32*14-448($tp1), $ACC5, $ACC5 698 vpaddq 32*15-448($tp1), $ACC6, $ACC6 699 vpaddq 32*16-448($tp1), $ACC7, $ACC7 700 vpaddq 32*17-448($tp1), $ACC8, $ACC8 701 702 vpsrlq \$29, $ACC0, $TEMP1 703 vpand $AND_MASK, $ACC0, $ACC0 704 vpsrlq \$29, $ACC1, $TEMP2 705 vpand $AND_MASK, $ACC1, $ACC1 706 vpsrlq \$29, $ACC2, $TEMP3 707 vpermq \$0x93, $TEMP1, $TEMP1 708 vpand $AND_MASK, $ACC2, $ACC2 709 vpsrlq \$29, $ACC3, $TEMP4 710 vpermq \$0x93, $TEMP2, $TEMP2 711 vpand $AND_MASK, $ACC3, $ACC3 712 vpermq \$0x93, $TEMP3, $TEMP3 713 714 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 715 vpermq \$0x93, $TEMP4, $TEMP4 716 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 717 vpaddq $TEMP0, $ACC0, $ACC0 718 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 719 vpaddq $TEMP1, $ACC1, $ACC1 720 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 721 vpaddq $TEMP2, $ACC2, $ACC2 
722 vpblendd \$3, $TEMP4, $ZERO, $TEMP4 723 vpaddq $TEMP3, $ACC3, $ACC3 724 vpaddq $TEMP4, $ACC4, $ACC4 725 726 vpsrlq \$29, $ACC0, $TEMP1 727 vpand $AND_MASK, $ACC0, $ACC0 728 vpsrlq \$29, $ACC1, $TEMP2 729 vpand $AND_MASK, $ACC1, $ACC1 730 vpsrlq \$29, $ACC2, $TEMP3 731 vpermq \$0x93, $TEMP1, $TEMP1 732 vpand $AND_MASK, $ACC2, $ACC2 733 vpsrlq \$29, $ACC3, $TEMP4 734 vpermq \$0x93, $TEMP2, $TEMP2 735 vpand $AND_MASK, $ACC3, $ACC3 736 vpermq \$0x93, $TEMP3, $TEMP3 737 738 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 739 vpermq \$0x93, $TEMP4, $TEMP4 740 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 741 vpaddq $TEMP0, $ACC0, $ACC0 742 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 743 vpaddq $TEMP1, $ACC1, $ACC1 744 vmovdqu $ACC0, 32*0-128($rp) 745 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 746 vpaddq $TEMP2, $ACC2, $ACC2 747 vmovdqu $ACC1, 32*1-128($rp) 748 vpblendd \$3, $TEMP4, $ZERO, $TEMP4 749 vpaddq $TEMP3, $ACC3, $ACC3 750 vmovdqu $ACC2, 32*2-128($rp) 751 vpaddq $TEMP4, $ACC4, $ACC4 752 vmovdqu $ACC3, 32*3-128($rp) 753___ 754$TEMP5=$ACC0; 755$code.=<<___; 756 vpsrlq \$29, $ACC4, $TEMP1 757 vpand $AND_MASK, $ACC4, $ACC4 758 vpsrlq \$29, $ACC5, $TEMP2 759 vpand $AND_MASK, $ACC5, $ACC5 760 vpsrlq \$29, $ACC6, $TEMP3 761 vpermq \$0x93, $TEMP1, $TEMP1 762 vpand $AND_MASK, $ACC6, $ACC6 763 vpsrlq \$29, $ACC7, $TEMP4 764 vpermq \$0x93, $TEMP2, $TEMP2 765 vpand $AND_MASK, $ACC7, $ACC7 766 vpsrlq \$29, $ACC8, $TEMP5 767 vpermq \$0x93, $TEMP3, $TEMP3 768 vpand $AND_MASK, $ACC8, $ACC8 769 vpermq \$0x93, $TEMP4, $TEMP4 770 771 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 772 vpermq \$0x93, $TEMP5, $TEMP5 773 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 774 vpaddq $TEMP0, $ACC4, $ACC4 775 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 776 vpaddq $TEMP1, $ACC5, $ACC5 777 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 778 vpaddq $TEMP2, $ACC6, $ACC6 779 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 780 vpaddq $TEMP3, $ACC7, $ACC7 781 vpaddq $TEMP4, $ACC8, $ACC8 782 783 vpsrlq \$29, $ACC4, $TEMP1 784 vpand $AND_MASK, $ACC4, $ACC4 785 vpsrlq \$29, $ACC5, 
$TEMP2 786 vpand $AND_MASK, $ACC5, $ACC5 787 vpsrlq \$29, $ACC6, $TEMP3 788 vpermq \$0x93, $TEMP1, $TEMP1 789 vpand $AND_MASK, $ACC6, $ACC6 790 vpsrlq \$29, $ACC7, $TEMP4 791 vpermq \$0x93, $TEMP2, $TEMP2 792 vpand $AND_MASK, $ACC7, $ACC7 793 vpsrlq \$29, $ACC8, $TEMP5 794 vpermq \$0x93, $TEMP3, $TEMP3 795 vpand $AND_MASK, $ACC8, $ACC8 796 vpermq \$0x93, $TEMP4, $TEMP4 797 798 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 799 vpermq \$0x93, $TEMP5, $TEMP5 800 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 801 vpaddq $TEMP0, $ACC4, $ACC4 802 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 803 vpaddq $TEMP1, $ACC5, $ACC5 804 vmovdqu $ACC4, 32*4-128($rp) 805 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 806 vpaddq $TEMP2, $ACC6, $ACC6 807 vmovdqu $ACC5, 32*5-128($rp) 808 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 809 vpaddq $TEMP3, $ACC7, $ACC7 810 vmovdqu $ACC6, 32*6-128($rp) 811 vpaddq $TEMP4, $ACC8, $ACC8 812 vmovdqu $ACC7, 32*7-128($rp) 813 vmovdqu $ACC8, 32*8-128($rp) 814 815 mov $rp, $ap 816 dec $rep 817 jne .LOOP_GRANDE_SQR_1024 818 819 vzeroall 820 mov %rbp, %rax 821___ 822$code.=<<___ if ($win64); 823 movaps -0xd8(%rax),%xmm6 824 movaps -0xc8(%rax),%xmm7 825 movaps -0xb8(%rax),%xmm8 826 movaps -0xa8(%rax),%xmm9 827 movaps -0x98(%rax),%xmm10 828 movaps -0x88(%rax),%xmm11 829 movaps -0x78(%rax),%xmm12 830 movaps -0x68(%rax),%xmm13 831 movaps -0x58(%rax),%xmm14 832 movaps -0x48(%rax),%xmm15 833___ 834$code.=<<___; 835 mov -48(%rax),%r15 836 mov -40(%rax),%r14 837 mov -32(%rax),%r13 838 mov -24(%rax),%r12 839 mov -16(%rax),%rbp 840 mov -8(%rax),%rbx 841 lea (%rax),%rsp # restore %rsp 842.Lsqr_1024_epilogue: 843 ret 844.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2 845___ 846} 847 848{ # void AMM_WW( 849my $rp="%rdi"; # BN_ULONG *rp, 850my $ap="%rsi"; # const BN_ULONG *ap, 851my $bp="%rdx"; # const BN_ULONG *bp, 852my $np="%rcx"; # const BN_ULONG *np, 853my $n0="%r8d"; # unsigned int n0); 854 855# The registers that hold the accumulated redundant result 856# The AMM works on 1024 bit operands, and redundant 
word size is 29 857# Therefore: ceil(1024/29)/4 = 9 858my $ACC0="%ymm0"; 859my $ACC1="%ymm1"; 860my $ACC2="%ymm2"; 861my $ACC3="%ymm3"; 862my $ACC4="%ymm4"; 863my $ACC5="%ymm5"; 864my $ACC6="%ymm6"; 865my $ACC7="%ymm7"; 866my $ACC8="%ymm8"; 867my $ACC9="%ymm9"; 868 869# Registers that hold the broadcasted words of multiplier, currently used 870my $Bi="%ymm10"; 871my $Yi="%ymm11"; 872 873# Helper registers 874my $TEMP0=$ACC0; 875my $TEMP1="%ymm12"; 876my $TEMP2="%ymm13"; 877my $ZERO="%ymm14"; 878my $AND_MASK="%ymm15"; 879 880# alu registers that hold the first words of the ACC 881my $r0="%r9"; 882my $r1="%r10"; 883my $r2="%r11"; 884my $r3="%r12"; 885 886my $i="%r14d"; 887my $tmp="%r15"; 888 889$bp="%r13"; # reassigned argument 890 891$code.=<<___; 892.globl rsaz_1024_mul_avx2 893.type rsaz_1024_mul_avx2,\@function,5 894.align 64 895rsaz_1024_mul_avx2: 896 lea (%rsp), %rax 897 push %rbx 898 push %rbp 899 push %r12 900 push %r13 901 push %r14 902 push %r15 903___ 904$code.=<<___ if ($win64); 905 vzeroupper 906 lea -0xa8(%rsp),%rsp 907 vmovaps %xmm6,-0xd8(%rax) 908 vmovaps %xmm7,-0xc8(%rax) 909 vmovaps %xmm8,-0xb8(%rax) 910 vmovaps %xmm9,-0xa8(%rax) 911 vmovaps %xmm10,-0x98(%rax) 912 vmovaps %xmm11,-0x88(%rax) 913 vmovaps %xmm12,-0x78(%rax) 914 vmovaps %xmm13,-0x68(%rax) 915 vmovaps %xmm14,-0x58(%rax) 916 vmovaps %xmm15,-0x48(%rax) 917.Lmul_1024_body: 918___ 919$code.=<<___; 920 mov %rax,%rbp 921 vzeroall 922 mov %rdx, $bp # reassigned argument 923 sub \$64,%rsp 924 925 # unaligned 256-bit load that crosses page boundary can 926 # cause severe performance degradation here, so if $ap does 927 # cross page boundary, swap it with $bp [meaning that caller 928 # is advised to lay down $ap and $bp next to each other, so 929 # that only one can cross page boundary]. 
930 .byte 0x67,0x67 931 mov $ap, $tmp 932 and \$4095, $tmp 933 add \$32*10, $tmp 934 shr \$12, $tmp 935 mov $ap, $tmp 936 cmovnz $bp, $ap 937 cmovnz $tmp, $bp 938 939 mov $np, $tmp 940 sub \$-128,$ap # size optimization 941 sub \$-128,$np 942 sub \$-128,$rp 943 944 and \$4095, $tmp # see if $np crosses page 945 add \$32*10, $tmp 946 .byte 0x67,0x67 947 shr \$12, $tmp 948 jz .Lmul_1024_no_n_copy 949 950 # unaligned 256-bit load that crosses page boundary can 951 # cause severe performance degradation here, so if $np does 952 # cross page boundary, copy it to stack and make sure stack 953 # frame doesn't... 954 sub \$32*10,%rsp 955 vmovdqu 32*0-128($np), $ACC0 956 and \$-512, %rsp 957 vmovdqu 32*1-128($np), $ACC1 958 vmovdqu 32*2-128($np), $ACC2 959 vmovdqu 32*3-128($np), $ACC3 960 vmovdqu 32*4-128($np), $ACC4 961 vmovdqu 32*5-128($np), $ACC5 962 vmovdqu 32*6-128($np), $ACC6 963 vmovdqu 32*7-128($np), $ACC7 964 vmovdqu 32*8-128($np), $ACC8 965 lea 64+128(%rsp),$np 966 vmovdqu $ACC0, 32*0-128($np) 967 vpxor $ACC0, $ACC0, $ACC0 968 vmovdqu $ACC1, 32*1-128($np) 969 vpxor $ACC1, $ACC1, $ACC1 970 vmovdqu $ACC2, 32*2-128($np) 971 vpxor $ACC2, $ACC2, $ACC2 972 vmovdqu $ACC3, 32*3-128($np) 973 vpxor $ACC3, $ACC3, $ACC3 974 vmovdqu $ACC4, 32*4-128($np) 975 vpxor $ACC4, $ACC4, $ACC4 976 vmovdqu $ACC5, 32*5-128($np) 977 vpxor $ACC5, $ACC5, $ACC5 978 vmovdqu $ACC6, 32*6-128($np) 979 vpxor $ACC6, $ACC6, $ACC6 980 vmovdqu $ACC7, 32*7-128($np) 981 vpxor $ACC7, $ACC7, $ACC7 982 vmovdqu $ACC8, 32*8-128($np) 983 vmovdqa $ACC0, $ACC8 984 vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero after vzeroall 985.Lmul_1024_no_n_copy: 986 and \$-64,%rsp 987 988 mov ($bp), %rbx 989 vpbroadcastq ($bp), $Bi 990 vmovdqu $ACC0, (%rsp) # clear top of stack 991 xor $r0, $r0 992 .byte 0x67 993 xor $r1, $r1 994 xor $r2, $r2 995 xor $r3, $r3 996 997 vmovdqu .Land_mask(%rip), $AND_MASK 998 mov \$9, $i 999 vmovdqu $ACC9, 32*9-128($rp) # $ACC9 is zero after vzeroall 1000 jmp .Loop_mul_1024 1001 1002.align 32 
1003.Loop_mul_1024: 1004 vpsrlq \$29, $ACC3, $ACC9 # correct $ACC3(*) 1005 mov %rbx, %rax 1006 imulq -128($ap), %rax 1007 add $r0, %rax 1008 mov %rbx, $r1 1009 imulq 8-128($ap), $r1 1010 add 8(%rsp), $r1 1011 1012 mov %rax, $r0 1013 imull $n0, %eax 1014 and \$0x1fffffff, %eax 1015 1016 mov %rbx, $r2 1017 imulq 16-128($ap), $r2 1018 add 16(%rsp), $r2 1019 1020 mov %rbx, $r3 1021 imulq 24-128($ap), $r3 1022 add 24(%rsp), $r3 1023 vpmuludq 32*1-128($ap),$Bi,$TEMP0 1024 vmovd %eax, $Yi 1025 vpaddq $TEMP0,$ACC1,$ACC1 1026 vpmuludq 32*2-128($ap),$Bi,$TEMP1 1027 vpbroadcastq $Yi, $Yi 1028 vpaddq $TEMP1,$ACC2,$ACC2 1029 vpmuludq 32*3-128($ap),$Bi,$TEMP2 1030 vpand $AND_MASK, $ACC3, $ACC3 # correct $ACC3 1031 vpaddq $TEMP2,$ACC3,$ACC3 1032 vpmuludq 32*4-128($ap),$Bi,$TEMP0 1033 vpaddq $TEMP0,$ACC4,$ACC4 1034 vpmuludq 32*5-128($ap),$Bi,$TEMP1 1035 vpaddq $TEMP1,$ACC5,$ACC5 1036 vpmuludq 32*6-128($ap),$Bi,$TEMP2 1037 vpaddq $TEMP2,$ACC6,$ACC6 1038 vpmuludq 32*7-128($ap),$Bi,$TEMP0 1039 vpermq \$0x93, $ACC9, $ACC9 # correct $ACC3 1040 vpaddq $TEMP0,$ACC7,$ACC7 1041 vpmuludq 32*8-128($ap),$Bi,$TEMP1 1042 vpbroadcastq 8($bp), $Bi 1043 vpaddq $TEMP1,$ACC8,$ACC8 1044 1045 mov %rax,%rdx 1046 imulq -128($np),%rax 1047 add %rax,$r0 1048 mov %rdx,%rax 1049 imulq 8-128($np),%rax 1050 add %rax,$r1 1051 mov %rdx,%rax 1052 imulq 16-128($np),%rax 1053 add %rax,$r2 1054 shr \$29, $r0 1055 imulq 24-128($np),%rdx 1056 add %rdx,$r3 1057 add $r0, $r1 1058 1059 vpmuludq 32*1-128($np),$Yi,$TEMP2 1060 vmovq $Bi, %rbx 1061 vpaddq $TEMP2,$ACC1,$ACC1 1062 vpmuludq 32*2-128($np),$Yi,$TEMP0 1063 vpaddq $TEMP0,$ACC2,$ACC2 1064 vpmuludq 32*3-128($np),$Yi,$TEMP1 1065 vpaddq $TEMP1,$ACC3,$ACC3 1066 vpmuludq 32*4-128($np),$Yi,$TEMP2 1067 vpaddq $TEMP2,$ACC4,$ACC4 1068 vpmuludq 32*5-128($np),$Yi,$TEMP0 1069 vpaddq $TEMP0,$ACC5,$ACC5 1070 vpmuludq 32*6-128($np),$Yi,$TEMP1 1071 vpaddq $TEMP1,$ACC6,$ACC6 1072 vpmuludq 32*7-128($np),$Yi,$TEMP2 1073 vpblendd \$3, $ZERO, $ACC9, $TEMP1 # correct $ACC3 1074 vpaddq 
$TEMP2,$ACC7,$ACC7 1075 vpmuludq 32*8-128($np),$Yi,$TEMP0 1076 vpaddq $TEMP1, $ACC3, $ACC3 # correct $ACC3 1077 vpaddq $TEMP0,$ACC8,$ACC8 1078 1079 mov %rbx, %rax 1080 imulq -128($ap),%rax 1081 add %rax,$r1 1082 vmovdqu -8+32*1-128($ap),$TEMP1 1083 mov %rbx, %rax 1084 imulq 8-128($ap),%rax 1085 add %rax,$r2 1086 vmovdqu -8+32*2-128($ap),$TEMP2 1087 1088 mov $r1, %rax 1089 vpblendd \$0xfc, $ZERO, $ACC9, $ACC9 # correct $ACC3 1090 imull $n0, %eax 1091 vpaddq $ACC9,$ACC4,$ACC4 # correct $ACC3 1092 and \$0x1fffffff, %eax 1093 1094 imulq 16-128($ap),%rbx 1095 add %rbx,$r3 1096 vpmuludq $Bi,$TEMP1,$TEMP1 1097 vmovd %eax, $Yi 1098 vmovdqu -8+32*3-128($ap),$TEMP0 1099 vpaddq $TEMP1,$ACC1,$ACC1 1100 vpmuludq $Bi,$TEMP2,$TEMP2 1101 vpbroadcastq $Yi, $Yi 1102 vmovdqu -8+32*4-128($ap),$TEMP1 1103 vpaddq $TEMP2,$ACC2,$ACC2 1104 vpmuludq $Bi,$TEMP0,$TEMP0 1105 vmovdqu -8+32*5-128($ap),$TEMP2 1106 vpaddq $TEMP0,$ACC3,$ACC3 1107 vpmuludq $Bi,$TEMP1,$TEMP1 1108 vmovdqu -8+32*6-128($ap),$TEMP0 1109 vpaddq $TEMP1,$ACC4,$ACC4 1110 vpmuludq $Bi,$TEMP2,$TEMP2 1111 vmovdqu -8+32*7-128($ap),$TEMP1 1112 vpaddq $TEMP2,$ACC5,$ACC5 1113 vpmuludq $Bi,$TEMP0,$TEMP0 1114 vmovdqu -8+32*8-128($ap),$TEMP2 1115 vpaddq $TEMP0,$ACC6,$ACC6 1116 vpmuludq $Bi,$TEMP1,$TEMP1 1117 vmovdqu -8+32*9-128($ap),$ACC9 1118 vpaddq $TEMP1,$ACC7,$ACC7 1119 vpmuludq $Bi,$TEMP2,$TEMP2 1120 vpaddq $TEMP2,$ACC8,$ACC8 1121 vpmuludq $Bi,$ACC9,$ACC9 1122 vpbroadcastq 16($bp), $Bi 1123 1124 mov %rax,%rdx 1125 imulq -128($np),%rax 1126 add %rax,$r1 1127 vmovdqu -8+32*1-128($np),$TEMP0 1128 mov %rdx,%rax 1129 imulq 8-128($np),%rax 1130 add %rax,$r2 1131 vmovdqu -8+32*2-128($np),$TEMP1 1132 shr \$29, $r1 1133 imulq 16-128($np),%rdx 1134 add %rdx,$r3 1135 add $r1, $r2 1136 1137 vpmuludq $Yi,$TEMP0,$TEMP0 1138 vmovq $Bi, %rbx 1139 vmovdqu -8+32*3-128($np),$TEMP2 1140 vpaddq $TEMP0,$ACC1,$ACC1 1141 vpmuludq $Yi,$TEMP1,$TEMP1 1142 vmovdqu -8+32*4-128($np),$TEMP0 1143 vpaddq $TEMP1,$ACC2,$ACC2 1144 vpmuludq $Yi,$TEMP2,$TEMP2 1145 
vmovdqu -8+32*5-128($np),$TEMP1 1146 vpaddq $TEMP2,$ACC3,$ACC3 1147 vpmuludq $Yi,$TEMP0,$TEMP0 1148 vmovdqu -8+32*6-128($np),$TEMP2 1149 vpaddq $TEMP0,$ACC4,$ACC4 1150 vpmuludq $Yi,$TEMP1,$TEMP1 1151 vmovdqu -8+32*7-128($np),$TEMP0 1152 vpaddq $TEMP1,$ACC5,$ACC5 1153 vpmuludq $Yi,$TEMP2,$TEMP2 1154 vmovdqu -8+32*8-128($np),$TEMP1 1155 vpaddq $TEMP2,$ACC6,$ACC6 1156 vpmuludq $Yi,$TEMP0,$TEMP0 1157 vmovdqu -8+32*9-128($np),$TEMP2 1158 vpaddq $TEMP0,$ACC7,$ACC7 1159 vpmuludq $Yi,$TEMP1,$TEMP1 1160 vpaddq $TEMP1,$ACC8,$ACC8 1161 vpmuludq $Yi,$TEMP2,$TEMP2 1162 vpaddq $TEMP2,$ACC9,$ACC9 1163 1164 vmovdqu -16+32*1-128($ap),$TEMP0 1165 mov %rbx,%rax 1166 imulq -128($ap),%rax 1167 add $r2,%rax 1168 1169 vmovdqu -16+32*2-128($ap),$TEMP1 1170 mov %rax,$r2 1171 imull $n0, %eax 1172 and \$0x1fffffff, %eax 1173 1174 imulq 8-128($ap),%rbx 1175 add %rbx,$r3 1176 vpmuludq $Bi,$TEMP0,$TEMP0 1177 vmovd %eax, $Yi 1178 vmovdqu -16+32*3-128($ap),$TEMP2 1179 vpaddq $TEMP0,$ACC1,$ACC1 1180 vpmuludq $Bi,$TEMP1,$TEMP1 1181 vpbroadcastq $Yi, $Yi 1182 vmovdqu -16+32*4-128($ap),$TEMP0 1183 vpaddq $TEMP1,$ACC2,$ACC2 1184 vpmuludq $Bi,$TEMP2,$TEMP2 1185 vmovdqu -16+32*5-128($ap),$TEMP1 1186 vpaddq $TEMP2,$ACC3,$ACC3 1187 vpmuludq $Bi,$TEMP0,$TEMP0 1188 vmovdqu -16+32*6-128($ap),$TEMP2 1189 vpaddq $TEMP0,$ACC4,$ACC4 1190 vpmuludq $Bi,$TEMP1,$TEMP1 1191 vmovdqu -16+32*7-128($ap),$TEMP0 1192 vpaddq $TEMP1,$ACC5,$ACC5 1193 vpmuludq $Bi,$TEMP2,$TEMP2 1194 vmovdqu -16+32*8-128($ap),$TEMP1 1195 vpaddq $TEMP2,$ACC6,$ACC6 1196 vpmuludq $Bi,$TEMP0,$TEMP0 1197 vmovdqu -16+32*9-128($ap),$TEMP2 1198 vpaddq $TEMP0,$ACC7,$ACC7 1199 vpmuludq $Bi,$TEMP1,$TEMP1 1200 vpaddq $TEMP1,$ACC8,$ACC8 1201 vpmuludq $Bi,$TEMP2,$TEMP2 1202 vpbroadcastq 24($bp), $Bi 1203 vpaddq $TEMP2,$ACC9,$ACC9 1204 1205 vmovdqu -16+32*1-128($np),$TEMP0 1206 mov %rax,%rdx 1207 imulq -128($np),%rax 1208 add %rax,$r2 1209 vmovdqu -16+32*2-128($np),$TEMP1 1210 imulq 8-128($np),%rdx 1211 add %rdx,$r3 1212 shr \$29, $r2 1213 1214 vpmuludq 
$Yi,$TEMP0,$TEMP0 1215 vmovq $Bi, %rbx 1216 vmovdqu -16+32*3-128($np),$TEMP2 1217 vpaddq $TEMP0,$ACC1,$ACC1 1218 vpmuludq $Yi,$TEMP1,$TEMP1 1219 vmovdqu -16+32*4-128($np),$TEMP0 1220 vpaddq $TEMP1,$ACC2,$ACC2 1221 vpmuludq $Yi,$TEMP2,$TEMP2 1222 vmovdqu -16+32*5-128($np),$TEMP1 1223 vpaddq $TEMP2,$ACC3,$ACC3 1224 vpmuludq $Yi,$TEMP0,$TEMP0 1225 vmovdqu -16+32*6-128($np),$TEMP2 1226 vpaddq $TEMP0,$ACC4,$ACC4 1227 vpmuludq $Yi,$TEMP1,$TEMP1 1228 vmovdqu -16+32*7-128($np),$TEMP0 1229 vpaddq $TEMP1,$ACC5,$ACC5 1230 vpmuludq $Yi,$TEMP2,$TEMP2 1231 vmovdqu -16+32*8-128($np),$TEMP1 1232 vpaddq $TEMP2,$ACC6,$ACC6 1233 vpmuludq $Yi,$TEMP0,$TEMP0 1234 vmovdqu -16+32*9-128($np),$TEMP2 1235 vpaddq $TEMP0,$ACC7,$ACC7 1236 vpmuludq $Yi,$TEMP1,$TEMP1 1237 vmovdqu -24+32*1-128($ap),$TEMP0 1238 vpaddq $TEMP1,$ACC8,$ACC8 1239 vpmuludq $Yi,$TEMP2,$TEMP2 1240 vmovdqu -24+32*2-128($ap),$TEMP1 1241 vpaddq $TEMP2,$ACC9,$ACC9 1242 1243 add $r2, $r3 1244 imulq -128($ap),%rbx 1245 add %rbx,$r3 1246 1247 mov $r3, %rax 1248 imull $n0, %eax 1249 and \$0x1fffffff, %eax 1250 1251 vpmuludq $Bi,$TEMP0,$TEMP0 1252 vmovd %eax, $Yi 1253 vmovdqu -24+32*3-128($ap),$TEMP2 1254 vpaddq $TEMP0,$ACC1,$ACC1 1255 vpmuludq $Bi,$TEMP1,$TEMP1 1256 vpbroadcastq $Yi, $Yi 1257 vmovdqu -24+32*4-128($ap),$TEMP0 1258 vpaddq $TEMP1,$ACC2,$ACC2 1259 vpmuludq $Bi,$TEMP2,$TEMP2 1260 vmovdqu -24+32*5-128($ap),$TEMP1 1261 vpaddq $TEMP2,$ACC3,$ACC3 1262 vpmuludq $Bi,$TEMP0,$TEMP0 1263 vmovdqu -24+32*6-128($ap),$TEMP2 1264 vpaddq $TEMP0,$ACC4,$ACC4 1265 vpmuludq $Bi,$TEMP1,$TEMP1 1266 vmovdqu -24+32*7-128($ap),$TEMP0 1267 vpaddq $TEMP1,$ACC5,$ACC5 1268 vpmuludq $Bi,$TEMP2,$TEMP2 1269 vmovdqu -24+32*8-128($ap),$TEMP1 1270 vpaddq $TEMP2,$ACC6,$ACC6 1271 vpmuludq $Bi,$TEMP0,$TEMP0 1272 vmovdqu -24+32*9-128($ap),$TEMP2 1273 vpaddq $TEMP0,$ACC7,$ACC7 1274 vpmuludq $Bi,$TEMP1,$TEMP1 1275 vpaddq $TEMP1,$ACC8,$ACC8 1276 vpmuludq $Bi,$TEMP2,$TEMP2 1277 vpbroadcastq 32($bp), $Bi 1278 vpaddq $TEMP2,$ACC9,$ACC9 1279 add \$32, $bp # $bp++ 
1280 1281 vmovdqu -24+32*1-128($np),$TEMP0 1282 imulq -128($np),%rax 1283 add %rax,$r3 1284 shr \$29, $r3 1285 1286 vmovdqu -24+32*2-128($np),$TEMP1 1287 vpmuludq $Yi,$TEMP0,$TEMP0 1288 vmovq $Bi, %rbx 1289 vmovdqu -24+32*3-128($np),$TEMP2 1290 vpaddq $TEMP0,$ACC1,$ACC0 # $ACC0==$TEMP0 1291 vpmuludq $Yi,$TEMP1,$TEMP1 1292 vmovdqu $ACC0, (%rsp) # transfer $r0-$r3 1293 vpaddq $TEMP1,$ACC2,$ACC1 1294 vmovdqu -24+32*4-128($np),$TEMP0 1295 vpmuludq $Yi,$TEMP2,$TEMP2 1296 vmovdqu -24+32*5-128($np),$TEMP1 1297 vpaddq $TEMP2,$ACC3,$ACC2 1298 vpmuludq $Yi,$TEMP0,$TEMP0 1299 vmovdqu -24+32*6-128($np),$TEMP2 1300 vpaddq $TEMP0,$ACC4,$ACC3 1301 vpmuludq $Yi,$TEMP1,$TEMP1 1302 vmovdqu -24+32*7-128($np),$TEMP0 1303 vpaddq $TEMP1,$ACC5,$ACC4 1304 vpmuludq $Yi,$TEMP2,$TEMP2 1305 vmovdqu -24+32*8-128($np),$TEMP1 1306 vpaddq $TEMP2,$ACC6,$ACC5 1307 vpmuludq $Yi,$TEMP0,$TEMP0 1308 vmovdqu -24+32*9-128($np),$TEMP2 1309 mov $r3, $r0 1310 vpaddq $TEMP0,$ACC7,$ACC6 1311 vpmuludq $Yi,$TEMP1,$TEMP1 1312 add (%rsp), $r0 1313 vpaddq $TEMP1,$ACC8,$ACC7 1314 vpmuludq $Yi,$TEMP2,$TEMP2 1315 vmovq $r3, $TEMP1 1316 vpaddq $TEMP2,$ACC9,$ACC8 1317 1318 dec $i 1319 jnz .Loop_mul_1024 1320___ 1321 1322# (*) Original implementation was correcting ACC1-ACC3 for overflow 1323# after 7 loop runs, or after 28 iterations, or 56 additions. 1324# But as we underutilize resources, it's possible to correct in 1325# each iteration with marginal performance loss. But then, as 1326# we do it in each iteration, we can correct less digits, and 1327# avoid performance penalties completely. 
1328 1329$TEMP0 = $ACC9; 1330$TEMP3 = $Bi; 1331$TEMP4 = $Yi; 1332$code.=<<___; 1333 vpaddq (%rsp), $TEMP1, $ACC0 1334 1335 vpsrlq \$29, $ACC0, $TEMP1 1336 vpand $AND_MASK, $ACC0, $ACC0 1337 vpsrlq \$29, $ACC1, $TEMP2 1338 vpand $AND_MASK, $ACC1, $ACC1 1339 vpsrlq \$29, $ACC2, $TEMP3 1340 vpermq \$0x93, $TEMP1, $TEMP1 1341 vpand $AND_MASK, $ACC2, $ACC2 1342 vpsrlq \$29, $ACC3, $TEMP4 1343 vpermq \$0x93, $TEMP2, $TEMP2 1344 vpand $AND_MASK, $ACC3, $ACC3 1345 1346 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 1347 vpermq \$0x93, $TEMP3, $TEMP3 1348 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 1349 vpermq \$0x93, $TEMP4, $TEMP4 1350 vpaddq $TEMP0, $ACC0, $ACC0 1351 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 1352 vpaddq $TEMP1, $ACC1, $ACC1 1353 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 1354 vpaddq $TEMP2, $ACC2, $ACC2 1355 vpblendd \$3, $TEMP4, $ZERO, $TEMP4 1356 vpaddq $TEMP3, $ACC3, $ACC3 1357 vpaddq $TEMP4, $ACC4, $ACC4 1358 1359 vpsrlq \$29, $ACC0, $TEMP1 1360 vpand $AND_MASK, $ACC0, $ACC0 1361 vpsrlq \$29, $ACC1, $TEMP2 1362 vpand $AND_MASK, $ACC1, $ACC1 1363 vpsrlq \$29, $ACC2, $TEMP3 1364 vpermq \$0x93, $TEMP1, $TEMP1 1365 vpand $AND_MASK, $ACC2, $ACC2 1366 vpsrlq \$29, $ACC3, $TEMP4 1367 vpermq \$0x93, $TEMP2, $TEMP2 1368 vpand $AND_MASK, $ACC3, $ACC3 1369 vpermq \$0x93, $TEMP3, $TEMP3 1370 1371 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 1372 vpermq \$0x93, $TEMP4, $TEMP4 1373 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 1374 vpaddq $TEMP0, $ACC0, $ACC0 1375 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 1376 vpaddq $TEMP1, $ACC1, $ACC1 1377 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 1378 vpaddq $TEMP2, $ACC2, $ACC2 1379 vpblendd \$3, $TEMP4, $ZERO, $TEMP4 1380 vpaddq $TEMP3, $ACC3, $ACC3 1381 vpaddq $TEMP4, $ACC4, $ACC4 1382 1383 vmovdqu $ACC0, 0-128($rp) 1384 vmovdqu $ACC1, 32-128($rp) 1385 vmovdqu $ACC2, 64-128($rp) 1386 vmovdqu $ACC3, 96-128($rp) 1387___ 1388 1389$TEMP5=$ACC0; 1390$code.=<<___; 1391 vpsrlq \$29, $ACC4, $TEMP1 1392 vpand $AND_MASK, $ACC4, $ACC4 1393 vpsrlq \$29, $ACC5, $TEMP2 1394 vpand $AND_MASK, 
$ACC5, $ACC5 1395 vpsrlq \$29, $ACC6, $TEMP3 1396 vpermq \$0x93, $TEMP1, $TEMP1 1397 vpand $AND_MASK, $ACC6, $ACC6 1398 vpsrlq \$29, $ACC7, $TEMP4 1399 vpermq \$0x93, $TEMP2, $TEMP2 1400 vpand $AND_MASK, $ACC7, $ACC7 1401 vpsrlq \$29, $ACC8, $TEMP5 1402 vpermq \$0x93, $TEMP3, $TEMP3 1403 vpand $AND_MASK, $ACC8, $ACC8 1404 vpermq \$0x93, $TEMP4, $TEMP4 1405 1406 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 1407 vpermq \$0x93, $TEMP5, $TEMP5 1408 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 1409 vpaddq $TEMP0, $ACC4, $ACC4 1410 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 1411 vpaddq $TEMP1, $ACC5, $ACC5 1412 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 1413 vpaddq $TEMP2, $ACC6, $ACC6 1414 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 1415 vpaddq $TEMP3, $ACC7, $ACC7 1416 vpaddq $TEMP4, $ACC8, $ACC8 1417 1418 vpsrlq \$29, $ACC4, $TEMP1 1419 vpand $AND_MASK, $ACC4, $ACC4 1420 vpsrlq \$29, $ACC5, $TEMP2 1421 vpand $AND_MASK, $ACC5, $ACC5 1422 vpsrlq \$29, $ACC6, $TEMP3 1423 vpermq \$0x93, $TEMP1, $TEMP1 1424 vpand $AND_MASK, $ACC6, $ACC6 1425 vpsrlq \$29, $ACC7, $TEMP4 1426 vpermq \$0x93, $TEMP2, $TEMP2 1427 vpand $AND_MASK, $ACC7, $ACC7 1428 vpsrlq \$29, $ACC8, $TEMP5 1429 vpermq \$0x93, $TEMP3, $TEMP3 1430 vpand $AND_MASK, $ACC8, $ACC8 1431 vpermq \$0x93, $TEMP4, $TEMP4 1432 1433 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 1434 vpermq \$0x93, $TEMP5, $TEMP5 1435 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 1436 vpaddq $TEMP0, $ACC4, $ACC4 1437 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 1438 vpaddq $TEMP1, $ACC5, $ACC5 1439 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 1440 vpaddq $TEMP2, $ACC6, $ACC6 1441 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 1442 vpaddq $TEMP3, $ACC7, $ACC7 1443 vpaddq $TEMP4, $ACC8, $ACC8 1444 1445 vmovdqu $ACC4, 128-128($rp) 1446 vmovdqu $ACC5, 160-128($rp) 1447 vmovdqu $ACC6, 192-128($rp) 1448 vmovdqu $ACC7, 224-128($rp) 1449 vmovdqu $ACC8, 256-128($rp) 1450 vzeroupper 1451 1452 mov %rbp, %rax 1453___ 1454$code.=<<___ if ($win64); 1455 movaps -0xd8(%rax),%xmm6 1456 movaps -0xc8(%rax),%xmm7 1457 movaps 
-0xb8(%rax),%xmm8 1458 movaps -0xa8(%rax),%xmm9 1459 movaps -0x98(%rax),%xmm10 1460 movaps -0x88(%rax),%xmm11 1461 movaps -0x78(%rax),%xmm12 1462 movaps -0x68(%rax),%xmm13 1463 movaps -0x58(%rax),%xmm14 1464 movaps -0x48(%rax),%xmm15 1465___ 1466$code.=<<___; 1467 mov -48(%rax),%r15 1468 mov -40(%rax),%r14 1469 mov -32(%rax),%r13 1470 mov -24(%rax),%r12 1471 mov -16(%rax),%rbp 1472 mov -8(%rax),%rbx 1473 lea (%rax),%rsp # restore %rsp 1474.Lmul_1024_epilogue: 1475 ret 1476.size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2 1477___ 1478} 1479{ 1480my ($out,$inp) = $win64 ? ("%rcx","%rdx") : ("%rdi","%rsi"); 1481my @T = map("%r$_",(8..11)); 1482 1483$code.=<<___; 1484.globl rsaz_1024_red2norm_avx2 1485.type rsaz_1024_red2norm_avx2,\@abi-omnipotent 1486.align 32 1487rsaz_1024_red2norm_avx2: 1488 sub \$-128,$inp # size optimization 1489 xor %rax,%rax 1490___ 1491 1492for ($j=0,$i=0; $i<16; $i++) { 1493 my $k=0; 1494 while (29*$j<64*($i+1)) { # load data till boundary 1495 $code.=" mov `8*$j-128`($inp), @T[0]\n"; 1496 $j++; $k++; push(@T,shift(@T)); 1497 } 1498 $l=$k; 1499 while ($k>1) { # shift loaded data but last value 1500 $code.=" shl \$`29*($j-$k)`,@T[-$k]\n"; 1501 $k--; 1502 } 1503 $code.=<<___; # shift last value 1504 mov @T[-1], @T[0] 1505 shl \$`29*($j-1)`, @T[-1] 1506 shr \$`-29*($j-1)`, @T[0] 1507___ 1508 while ($l) { # accumulate all values 1509 $code.=" add @T[-$l], %rax\n"; 1510 $l--; 1511 } 1512 $code.=<<___; 1513 adc \$0, @T[0] # consume eventual carry 1514 mov %rax, 8*$i($out) 1515 mov @T[0], %rax 1516___ 1517 push(@T,shift(@T)); 1518} 1519$code.=<<___; 1520 ret 1521.size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2 1522 1523.globl rsaz_1024_norm2red_avx2 1524.type rsaz_1024_norm2red_avx2,\@abi-omnipotent 1525.align 32 1526rsaz_1024_norm2red_avx2: 1527 sub \$-128,$out # size optimization 1528 mov ($inp),@T[0] 1529 mov \$0x1fffffff,%eax 1530___ 1531for ($j=0,$i=0; $i<16; $i++) { 1532 $code.=" mov `8*($i+1)`($inp),@T[1]\n" if ($i<15); 1533 $code.=" xor 
@T[1],@T[1]\n" if ($i==15); 1534 my $k=1; 1535 while (29*($j+1)<64*($i+1)) { 1536 $code.=<<___; 1537 mov @T[0],@T[-$k] 1538 shr \$`29*$j`,@T[-$k] 1539 and %rax,@T[-$k] # &0x1fffffff 1540 mov @T[-$k],`8*$j-128`($out) 1541___ 1542 $j++; $k++; 1543 } 1544 $code.=<<___; 1545 shrd \$`29*$j`,@T[1],@T[0] 1546 and %rax,@T[0] 1547 mov @T[0],`8*$j-128`($out) 1548___ 1549 $j++; 1550 push(@T,shift(@T)); 1551} 1552$code.=<<___; 1553 mov @T[0],`8*$j-128`($out) # zero 1554 mov @T[0],`8*($j+1)-128`($out) 1555 mov @T[0],`8*($j+2)-128`($out) 1556 mov @T[0],`8*($j+3)-128`($out) 1557 ret 1558.size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2 1559___ 1560} 1561{ 1562my ($out,$inp,$power) = $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx"); 1563 1564$code.=<<___; 1565.globl rsaz_1024_scatter5_avx2 1566.type rsaz_1024_scatter5_avx2,\@abi-omnipotent 1567.align 32 1568rsaz_1024_scatter5_avx2: 1569 vzeroupper 1570 vmovdqu .Lscatter_permd(%rip),%ymm5 1571 shl \$4,$power 1572 lea ($out,$power),$out 1573 mov \$9,%eax 1574 jmp .Loop_scatter_1024 1575 1576.align 32 1577.Loop_scatter_1024: 1578 vmovdqu ($inp),%ymm0 1579 lea 32($inp),$inp 1580 vpermd %ymm0,%ymm5,%ymm0 1581 vmovdqu %xmm0,($out) 1582 lea 16*32($out),$out 1583 dec %eax 1584 jnz .Loop_scatter_1024 1585 1586 vzeroupper 1587 ret 1588.size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2 1589 1590.globl rsaz_1024_gather5_avx2 1591.type rsaz_1024_gather5_avx2,\@abi-omnipotent 1592.align 32 1593rsaz_1024_gather5_avx2: 1594 vzeroupper 1595 mov %rsp,%r11 1596___ 1597$code.=<<___ if ($win64); 1598 lea -0x88(%rsp),%rax 1599.LSEH_begin_rsaz_1024_gather5: 1600 # I can't trust assembler to use specific encoding:-( 1601 .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax),%rsp 1602 .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6,-0x20(%rax) 1603 .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7,-0x10(%rax) 1604 .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8,0(%rax) 1605 .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9,0x10(%rax) 1606 .byte 
0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10,0x20(%rax) 1607 .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11,0x30(%rax) 1608 .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12,0x40(%rax) 1609 .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13,0x50(%rax) 1610 .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14,0x60(%rax) 1611 .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15,0x70(%rax) 1612___ 1613$code.=<<___; 1614 lea -0x100(%rsp),%rsp 1615 and \$-32, %rsp 1616 lea .Linc(%rip), %r10 1617 lea -128(%rsp),%rax # control u-op density 1618 1619 vmovd $power, %xmm4 1620 vmovdqa (%r10),%ymm0 1621 vmovdqa 32(%r10),%ymm1 1622 vmovdqa 64(%r10),%ymm5 1623 vpbroadcastd %xmm4,%ymm4 1624 1625 vpaddd %ymm5, %ymm0, %ymm2 1626 vpcmpeqd %ymm4, %ymm0, %ymm0 1627 vpaddd %ymm5, %ymm1, %ymm3 1628 vpcmpeqd %ymm4, %ymm1, %ymm1 1629 vmovdqa %ymm0, 32*0+128(%rax) 1630 vpaddd %ymm5, %ymm2, %ymm0 1631 vpcmpeqd %ymm4, %ymm2, %ymm2 1632 vmovdqa %ymm1, 32*1+128(%rax) 1633 vpaddd %ymm5, %ymm3, %ymm1 1634 vpcmpeqd %ymm4, %ymm3, %ymm3 1635 vmovdqa %ymm2, 32*2+128(%rax) 1636 vpaddd %ymm5, %ymm0, %ymm2 1637 vpcmpeqd %ymm4, %ymm0, %ymm0 1638 vmovdqa %ymm3, 32*3+128(%rax) 1639 vpaddd %ymm5, %ymm1, %ymm3 1640 vpcmpeqd %ymm4, %ymm1, %ymm1 1641 vmovdqa %ymm0, 32*4+128(%rax) 1642 vpaddd %ymm5, %ymm2, %ymm8 1643 vpcmpeqd %ymm4, %ymm2, %ymm2 1644 vmovdqa %ymm1, 32*5+128(%rax) 1645 vpaddd %ymm5, %ymm3, %ymm9 1646 vpcmpeqd %ymm4, %ymm3, %ymm3 1647 vmovdqa %ymm2, 32*6+128(%rax) 1648 vpaddd %ymm5, %ymm8, %ymm10 1649 vpcmpeqd %ymm4, %ymm8, %ymm8 1650 vmovdqa %ymm3, 32*7+128(%rax) 1651 vpaddd %ymm5, %ymm9, %ymm11 1652 vpcmpeqd %ymm4, %ymm9, %ymm9 1653 vpaddd %ymm5, %ymm10, %ymm12 1654 vpcmpeqd %ymm4, %ymm10, %ymm10 1655 vpaddd %ymm5, %ymm11, %ymm13 1656 vpcmpeqd %ymm4, %ymm11, %ymm11 1657 vpaddd %ymm5, %ymm12, %ymm14 1658 vpcmpeqd %ymm4, %ymm12, %ymm12 1659 vpaddd %ymm5, %ymm13, %ymm15 1660 vpcmpeqd %ymm4, %ymm13, %ymm13 1661 vpcmpeqd %ymm4, %ymm14, %ymm14 1662 vpcmpeqd %ymm4, %ymm15, %ymm15 1663 1664 vmovdqa -32(%r10),%ymm7 # 
.Lgather_permd 1665 lea 128($inp), $inp 1666 mov \$9,$power 1667 1668.Loop_gather_1024: 1669 vmovdqa 32*0-128($inp), %ymm0 1670 vmovdqa 32*1-128($inp), %ymm1 1671 vmovdqa 32*2-128($inp), %ymm2 1672 vmovdqa 32*3-128($inp), %ymm3 1673 vpand 32*0+128(%rax), %ymm0, %ymm0 1674 vpand 32*1+128(%rax), %ymm1, %ymm1 1675 vpand 32*2+128(%rax), %ymm2, %ymm2 1676 vpor %ymm0, %ymm1, %ymm4 1677 vpand 32*3+128(%rax), %ymm3, %ymm3 1678 vmovdqa 32*4-128($inp), %ymm0 1679 vmovdqa 32*5-128($inp), %ymm1 1680 vpor %ymm2, %ymm3, %ymm5 1681 vmovdqa 32*6-128($inp), %ymm2 1682 vmovdqa 32*7-128($inp), %ymm3 1683 vpand 32*4+128(%rax), %ymm0, %ymm0 1684 vpand 32*5+128(%rax), %ymm1, %ymm1 1685 vpand 32*6+128(%rax), %ymm2, %ymm2 1686 vpor %ymm0, %ymm4, %ymm4 1687 vpand 32*7+128(%rax), %ymm3, %ymm3 1688 vpand 32*8-128($inp), %ymm8, %ymm0 1689 vpor %ymm1, %ymm5, %ymm5 1690 vpand 32*9-128($inp), %ymm9, %ymm1 1691 vpor %ymm2, %ymm4, %ymm4 1692 vpand 32*10-128($inp),%ymm10, %ymm2 1693 vpor %ymm3, %ymm5, %ymm5 1694 vpand 32*11-128($inp),%ymm11, %ymm3 1695 vpor %ymm0, %ymm4, %ymm4 1696 vpand 32*12-128($inp),%ymm12, %ymm0 1697 vpor %ymm1, %ymm5, %ymm5 1698 vpand 32*13-128($inp),%ymm13, %ymm1 1699 vpor %ymm2, %ymm4, %ymm4 1700 vpand 32*14-128($inp),%ymm14, %ymm2 1701 vpor %ymm3, %ymm5, %ymm5 1702 vpand 32*15-128($inp),%ymm15, %ymm3 1703 lea 32*16($inp), $inp 1704 vpor %ymm0, %ymm4, %ymm4 1705 vpor %ymm1, %ymm5, %ymm5 1706 vpor %ymm2, %ymm4, %ymm4 1707 vpor %ymm3, %ymm5, %ymm5 1708 1709 vpor %ymm5, %ymm4, %ymm4 1710 vextracti128 \$1, %ymm4, %xmm5 # upper half is cleared 1711 vpor %xmm4, %xmm5, %xmm5 1712 vpermd %ymm5,%ymm7,%ymm5 1713 vmovdqu %ymm5,($out) 1714 lea 32($out),$out 1715 dec $power 1716 jnz .Loop_gather_1024 1717 1718 vpxor %ymm0,%ymm0,%ymm0 1719 vmovdqu %ymm0,($out) 1720 vzeroupper 1721___ 1722$code.=<<___ if ($win64); 1723 movaps -0xa8(%r11),%xmm6 1724 movaps -0x98(%r11),%xmm7 1725 movaps -0x88(%r11),%xmm8 1726 movaps -0x78(%r11),%xmm9 1727 movaps -0x68(%r11),%xmm10 1728 movaps 
-0x58(%r11),%xmm11 1729 movaps -0x48(%r11),%xmm12 1730 movaps -0x38(%r11),%xmm13 1731 movaps -0x28(%r11),%xmm14 1732 movaps -0x18(%r11),%xmm15 1733.LSEH_end_rsaz_1024_gather5: 1734___ 1735$code.=<<___; 1736 lea (%r11),%rsp 1737 ret 1738.size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2 1739___ 1740} 1741 1742$code.=<<___; 1743.extern OPENSSL_ia32cap_P 1744.globl rsaz_avx2_eligible 1745.type rsaz_avx2_eligible,\@abi-omnipotent 1746.align 32 1747rsaz_avx2_eligible: 1748 mov OPENSSL_ia32cap_P+8(%rip),%eax 1749___ 1750$code.=<<___ if ($addx); 1751 mov \$`1<<8|1<<19`,%ecx 1752 mov \$0,%edx 1753 and %eax,%ecx 1754 cmp \$`1<<8|1<<19`,%ecx # check for BMI2+AD*X 1755 cmove %edx,%eax 1756___ 1757$code.=<<___; 1758 and \$`1<<5`,%eax 1759 shr \$5,%eax 1760 ret 1761.size rsaz_avx2_eligible,.-rsaz_avx2_eligible 1762 1763.align 64 1764.Land_mask: 1765 .quad 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff 1766.Lscatter_permd: 1767 .long 0,2,4,6,7,7,7,7 1768.Lgather_permd: 1769 .long 0,7,1,7,2,7,3,7 1770.Linc: 1771 .long 0,0,0,0, 1,1,1,1 1772 .long 2,2,2,2, 3,3,3,3 1773 .long 4,4,4,4, 4,4,4,4 1774.align 64 1775___ 1776 1777if ($win64) { 1778$rec="%rcx"; 1779$frame="%rdx"; 1780$context="%r8"; 1781$disp="%r9"; 1782 1783$code.=<<___ 1784.extern __imp_RtlVirtualUnwind 1785.type rsaz_se_handler,\@abi-omnipotent 1786.align 16 1787rsaz_se_handler: 1788 push %rsi 1789 push %rdi 1790 push %rbx 1791 push %rbp 1792 push %r12 1793 push %r13 1794 push %r14 1795 push %r15 1796 pushfq 1797 sub \$64,%rsp 1798 1799 mov 120($context),%rax # pull context->Rax 1800 mov 248($context),%rbx # pull context->Rip 1801 1802 mov 8($disp),%rsi # disp->ImageBase 1803 mov 56($disp),%r11 # disp->HandlerData 1804 1805 mov 0(%r11),%r10d # HandlerData[0] 1806 lea (%rsi,%r10),%r10 # prologue label 1807 cmp %r10,%rbx # context->Rip<prologue label 1808 jb .Lcommon_seh_tail 1809 1810 mov 152($context),%rax # pull context->Rsp 1811 1812 mov 4(%r11),%r10d # HandlerData[1] 1813 lea (%rsi,%r10),%r10 # epilogue label 1814 cmp 
%r10,%rbx # context->Rip>=epilogue label 1815 jae .Lcommon_seh_tail 1816 1817 mov 160($context),%rax # pull context->Rbp 1818 1819 mov -48(%rax),%r15 1820 mov -40(%rax),%r14 1821 mov -32(%rax),%r13 1822 mov -24(%rax),%r12 1823 mov -16(%rax),%rbp 1824 mov -8(%rax),%rbx 1825 mov %r15,240($context) 1826 mov %r14,232($context) 1827 mov %r13,224($context) 1828 mov %r12,216($context) 1829 mov %rbp,160($context) 1830 mov %rbx,144($context) 1831 1832 lea -0xd8(%rax),%rsi # %xmm save area 1833 lea 512($context),%rdi # & context.Xmm6 1834 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) 1835 .long 0xa548f3fc # cld; rep movsq 1836 1837.Lcommon_seh_tail: 1838 mov 8(%rax),%rdi 1839 mov 16(%rax),%rsi 1840 mov %rax,152($context) # restore context->Rsp 1841 mov %rsi,168($context) # restore context->Rsi 1842 mov %rdi,176($context) # restore context->Rdi 1843 1844 mov 40($disp),%rdi # disp->ContextRecord 1845 mov $context,%rsi # context 1846 mov \$154,%ecx # sizeof(CONTEXT) 1847 .long 0xa548f3fc # cld; rep movsq 1848 1849 mov $disp,%rsi 1850 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 1851 mov 8(%rsi),%rdx # arg2, disp->ImageBase 1852 mov 0(%rsi),%r8 # arg3, disp->ControlPc 1853 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 1854 mov 40(%rsi),%r10 # disp->ContextRecord 1855 lea 56(%rsi),%r11 # &disp->HandlerData 1856 lea 24(%rsi),%r12 # &disp->EstablisherFrame 1857 mov %r10,32(%rsp) # arg5 1858 mov %r11,40(%rsp) # arg6 1859 mov %r12,48(%rsp) # arg7 1860 mov %rcx,56(%rsp) # arg8, (NULL) 1861 call *__imp_RtlVirtualUnwind(%rip) 1862 1863 mov \$1,%eax # ExceptionContinueSearch 1864 add \$64,%rsp 1865 popfq 1866 pop %r15 1867 pop %r14 1868 pop %r13 1869 pop %r12 1870 pop %rbp 1871 pop %rbx 1872 pop %rdi 1873 pop %rsi 1874 ret 1875.size rsaz_se_handler,.-rsaz_se_handler 1876 1877.section .pdata 1878.align 4 1879 .rva .LSEH_begin_rsaz_1024_sqr_avx2 1880 .rva .LSEH_end_rsaz_1024_sqr_avx2 1881 .rva .LSEH_info_rsaz_1024_sqr_avx2 1882 1883 .rva .LSEH_begin_rsaz_1024_mul_avx2 1884 .rva 
.LSEH_end_rsaz_1024_mul_avx2 1885 .rva .LSEH_info_rsaz_1024_mul_avx2 1886 1887 .rva .LSEH_begin_rsaz_1024_gather5 1888 .rva .LSEH_end_rsaz_1024_gather5 1889 .rva .LSEH_info_rsaz_1024_gather5 1890.section .xdata 1891.align 8 1892.LSEH_info_rsaz_1024_sqr_avx2: 1893 .byte 9,0,0,0 1894 .rva rsaz_se_handler 1895 .rva .Lsqr_1024_body,.Lsqr_1024_epilogue 1896.LSEH_info_rsaz_1024_mul_avx2: 1897 .byte 9,0,0,0 1898 .rva rsaz_se_handler 1899 .rva .Lmul_1024_body,.Lmul_1024_epilogue 1900.LSEH_info_rsaz_1024_gather5: 1901 .byte 0x01,0x36,0x17,0x0b 1902 .byte 0x36,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15 1903 .byte 0x31,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14 1904 .byte 0x2c,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13 1905 .byte 0x27,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12 1906 .byte 0x22,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11 1907 .byte 0x1d,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10 1908 .byte 0x18,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9 1909 .byte 0x13,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8 1910 .byte 0x0e,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7 1911 .byte 0x09,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6 1912 .byte 0x04,0x01,0x15,0x00 # sub rsp,0xa8 1913 .byte 0x00,0xb3,0x00,0x00 # set_frame r11 1914___ 1915} 1916 1917foreach (split("\n",$code)) { 1918 s/\`([^\`]*)\`/eval($1)/ge; 1919 1920 s/\b(sh[rl]d?\s+\$)(-?[0-9]+)/$1.$2%64/ge or 1921 1922 s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or 1923 s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or 1924 s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or 1925 s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or 1926 s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go; 1927 print $_,"\n"; 1928} 1929 1930}}} else {{{ 1931print <<___; # assembler is too old 1932.text 1933 1934.globl rsaz_avx2_eligible 1935.type rsaz_avx2_eligible,\@abi-omnipotent 1936rsaz_avx2_eligible: 1937 xor %eax,%eax 1938 ret 1939.size rsaz_avx2_eligible,.-rsaz_avx2_eligible 1940 1941.globl rsaz_1024_sqr_avx2 1942.globl rsaz_1024_mul_avx2 1943.globl 
rsaz_1024_norm2red_avx2 1944.globl rsaz_1024_red2norm_avx2 1945.globl rsaz_1024_scatter5_avx2 1946.globl rsaz_1024_gather5_avx2 1947.type rsaz_1024_sqr_avx2,\@abi-omnipotent 1948rsaz_1024_sqr_avx2: 1949rsaz_1024_mul_avx2: 1950rsaz_1024_norm2red_avx2: 1951rsaz_1024_red2norm_avx2: 1952rsaz_1024_scatter5_avx2: 1953rsaz_1024_gather5_avx2: 1954 .byte 0x0f,0x0b # ud2 1955 ret 1956.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2 1957___ 1958}}} 1959 1960close STDOUT; 1961