aesni-sha1-x86_64.pl revision 325335
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# June 2011
#
# This is an AESNI-CBC+SHA1 "stitch" implementation. The idea, as
# spelled out in http://download.intel.com/design/intarch/papers/323686.pdf,
# is that since AESNI-CBC encrypt exhibits *very* low instruction-level
# parallelism, interleaving it with another algorithm allows better
# utilization of processor resources and higher performance. SHA1
# instruction sequences(*) are taken from sha1-x86_64.pl and AESNI
# code is woven into them. Below are performance numbers in cycles
# per processed byte, less is better, for standalone AESNI-CBC
# encrypt, for the sum of the latter and standalone SHA1, and for the
# "stitched" subroutine:
#
#		AES-128-CBC	+SHA1		stitch      gain
# Westmere	3.77[+5.3]	9.07		6.55	    +38%
# Sandy Bridge	5.05[+5.0(6.1)]	10.06(11.15)	5.98(7.05)  +68%(+58%)
# Ivy Bridge	5.05[+4.6]	9.65		5.54	    +74%
# Haswell	4.43[+3.6(4.2)]	8.00(8.58)	4.55(5.21)  +75%(+65%)
# Bulldozer	5.77[+6.0]	11.72		6.37	    +84%
#
#		AES-192-CBC
# Westmere	4.51		9.81		6.80	    +44%
# Sandy Bridge	6.05		11.06(12.15)	6.11(7.19)  +81%(+69%)
# Ivy Bridge	6.05		10.65		6.07	    +75%
# Haswell	5.29		8.86(9.44)	5.32(5.32)  +67%(+77%)
# Bulldozer	6.89		12.84		6.96	    +84%
#
#		AES-256-CBC
# Westmere	5.25		10.55		7.21	    +46%
# Sandy Bridge	7.05		12.06(13.15)	7.12(7.72)  +69%(+70%)
# Ivy Bridge	7.05		11.65		7.12	    +64%
# Haswell	6.19		9.76(10.34)	6.21(6.25)  +57%(+65%)
# Bulldozer	8.00		13.95		8.25	    +69%
#
# (*)	There are two code paths: SSSE3 and AVX. See sha1-x86_64.pl for
#	background information. Above numbers in parentheses are SSSE3
#	results collected on an AVX-capable CPU, i.e. they apply on OSes
#	that don't support AVX.
#
# Needless to say, it makes no sense to implement a "stitched"
# *decrypt* subroutine: both AESNI-CBC decrypt and SHA1 already fully
# utilize instruction-level parallelism on their own, so stitching
# would not give any gain anyway. Well, there might be some, e.g.
# because of better cache locality... For reference, here are
# performance results for standalone AESNI-CBC decrypt:
#
#		AES-128-CBC	AES-192-CBC	AES-256-CBC
# Westmere	1.25		1.50		1.75
# Sandy Bridge	0.74		0.91		1.09
# Ivy Bridge	0.74		0.90		1.11
# Haswell	0.63		0.76		0.88
# Bulldozer	0.70		0.85		0.99
#
# And indeed:
#
#		AES-256-CBC	+SHA1		stitch	    gain
# Westmere	1.75		7.20		6.68	    +7.8%
# Sandy Bridge	1.09		6.09(7.22)	5.82(6.95)  +4.6%(+3.9%)
# Ivy Bridge	1.11		5.70		5.45	    +4.6%
# Haswell	0.88		4.45(5.00)	4.39(4.69)  +1.4%(*)(+6.6%)
# Bulldozer	0.99		6.95		5.95	    +17%(**)
#
# (*)	Tiny improvement coefficient on Haswell is because we compare
#	AVX1 stitch to sum with AVX2 SHA1.
# (**)	Execution is fully dominated by the integer code sequence and
#	SIMD still hardly shows [in single-process benchmark;-]
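# The "gain" column above is simply (standalone AES + standalone SHA1)
# divided by (stitch), minus one. A quick sanity check of the Westmere
# AES-128-CBC row (an illustration added for this write-up, not part of
# the module):
#
#	my ($aes,$sha,$stitch) = (3.77,5.30,6.55);
#	printf "+%.0f%%\n", 100*(($aes+$sha)/$stitch - 1);	# +38%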
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
	   $1>=2.19);
$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
	   $1>=2.09);
$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
	   $1>=10);
$avx=1 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/ && $2>=3.0);

$shaext=1;	### set to zero if compiling for 1.0.1

$stitched_decrypt=0;

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

# void aesni_cbc_sha1_enc(const void *inp,
#			void *out,
#			size_t length,
#			const AES_KEY *key,
#			unsigned char *iv,
#			SHA_CTX *ctx,
#			const void *in0);

$code.=<<___;
.text
.extern	OPENSSL_ia32cap_P

.globl	aesni_cbc_sha1_enc
.type	aesni_cbc_sha1_enc,\@abi-omnipotent
.align	32
aesni_cbc_sha1_enc:
	# caller should check for SSSE3 and AES-NI bits
	mov	OPENSSL_ia32cap_P+0(%rip),%r10d
	mov	OPENSSL_ia32cap_P+4(%rip),%r11
___
$code.=<<___ if ($shaext);
	bt	\$61,%r11		# check SHA bit
	jc	aesni_cbc_sha1_enc_shaext
___
$code.=<<___ if ($avx);
	and	\$`1<<28`,%r11d		# mask AVX bit
	and	\$`1<<30`,%r10d		# mask "Intel CPU" bit
	or	%r11d,%r10d
	cmp	\$`1<<28|1<<30`,%r10d
	je	aesni_cbc_sha1_enc_avx
___
$code.=<<___;
	jmp	aesni_cbc_sha1_enc_ssse3
	ret
.size	aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
___

my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");
my $j=0; my $jj=0; my $r=0; my $sn=0; my $rx=0;
my $K_XX_XX="%r11";
my ($rndkey0,$iv,$in)=map("%xmm$_",(11..13));			# for enc
my @rndkey=("%xmm14","%xmm15");					# for enc
my ($inout0,$inout1,$inout2,$inout3)=map("%xmm$_",(12..15));	# for dec

if (1) {	# reassign for Atom Silvermont
    # The goal is to minimize amount of instructions with more than
    # 3 prefix bytes. Or in more practical terms to keep AES-NI *and*
    # SSSE3 instructions to upper half of the register bank.
    @X=map("%xmm$_",(8..11,4..7));
    @Tx=map("%xmm$_",(12,13,3));
    ($iv,$in,$rndkey0)=map("%xmm$_",(2,14,15));
    @rndkey=("%xmm0","%xmm1");
}

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}

my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };
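# A worked illustration of the AUTOLOAD thunk above (added here for the
# reader's benefit): a call such as
#
#	&ror("%ebx",7);
#
# is caught by AUTOLOAD, which pops the numeric last argument, prefixes
# it with '$', and emits the remaining operands in reverse order,
# appending
#
#	ror	$7,%ebx
#
# to $code.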
$code.=<<___;
.type	aesni_cbc_sha1_enc_ssse3,\@function,6
.align	32
aesni_cbc_sha1_enc_ssse3:
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
	#shr	\$6,$len			# debugging artefact
	#jz	.Lepilogue_ssse3		# debugging artefact
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	`-104-($win64?10*16:0)`(%rsp),%rsp
	#mov	$in0,$inp			# debugging artefact
	#lea	64(%rsp),$ctx			# debugging artefact
___
$code.=<<___ if ($win64);
	movaps	%xmm6,96+0(%rsp)
	movaps	%xmm7,96+16(%rsp)
	movaps	%xmm8,96+32(%rsp)
	movaps	%xmm9,96+48(%rsp)
	movaps	%xmm10,96+64(%rsp)
	movaps	%xmm11,96+80(%rsp)
	movaps	%xmm12,96+96(%rsp)
	movaps	%xmm13,96+112(%rsp)
	movaps	%xmm14,96+128(%rsp)
	movaps	%xmm15,96+144(%rsp)
.Lprologue_ssse3:
___
$code.=<<___;
	mov	$in0,%r12			# reassign arguments
	mov	$out,%r13
	mov	$len,%r14
	lea	112($key),%r15			# size optimization
	movdqu	($ivp),$iv			# load IV
	mov	$ivp,88(%rsp)			# save $ivp
___
($in0,$out,$len,$key)=map("%r$_",(12..15));	# reassign arguments
my $rounds="${ivp}d";
$code.=<<___;
	shl	\$6,$len
	sub	$in0,$out
	mov	240-112($key),$rounds
	add	$inp,$len		# end of input

	lea	K_XX_XX(%rip),$K_XX_XX
	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	movdqa	64($K_XX_XX),@Tx[2]	# pbswap mask
	movdqa	0($K_XX_XX),@Tx[1]	# K_00_19
	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	movdqu	16($inp),@X[-3&7]
	movdqu	32($inp),@X[-2&7]
	movdqu	48($inp),@X[-1&7]
	pshufb	@Tx[2],@X[-4&7]		# byte swap
	pshufb	@Tx[2],@X[-3&7]
	pshufb	@Tx[2],@X[-2&7]
	add	\$64,$inp
	paddd	@Tx[1],@X[-4&7]		# add K_00_19
	pshufb	@Tx[2],@X[-1&7]
	paddd	@Tx[1],@X[-3&7]
	paddd	@Tx[1],@X[-2&7]
	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
	psubd	@Tx[1],@X[-4&7]		# restore X[]
	movdqa	@X[-3&7],16(%rsp)
	psubd	@Tx[1],@X[-3&7]
	movdqa	@X[-2&7],32(%rsp)
	psubd	@Tx[1],@X[-2&7]
	movups	-112($key),$rndkey0	# $key[0]
	movups	16-112($key),$rndkey[0]	# forward reference
	jmp	.Loop_ssse3
___

my $aesenc=sub {
  use integer;
  my ($n,$k)=($r/10,$r%10);
    if ($k==0) {
      $code.=<<___;
	movups		`16*$n`($in0),$in		# load input
	xorps		$rndkey0,$in
___
      $code.=<<___ if ($n);
	movups		$iv,`16*($n-1)`($out,$in0)	# write output
___
      $code.=<<___;
	xorps		$in,$iv
	movups		`32+16*$k-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
___
    } elsif ($k==9) {
      $sn++;
      $code.=<<___;
	cmp		\$11,$rounds
	jb		.Laesenclast$sn
	movups		`32+16*($k+0)-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*($k+1)-112`($key),$rndkey[0]
	aesenc		$rndkey[1],$iv
	je		.Laesenclast$sn
	movups		`32+16*($k+2)-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*($k+3)-112`($key),$rndkey[0]
	aesenc		$rndkey[1],$iv
.Laesenclast$sn:
	aesenclast	$rndkey[0],$iv
	movups		16-112($key),$rndkey[1]		# forward reference
___
    } else {
      $code.=<<___;
	movups		`32+16*$k-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
___
    }
  $r++;	unshift(@rndkey,pop(@rndkey));
};
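# Each invocation of the closure above emits exactly one AES round and
# advances $r; the block number and the round within the block are
# recovered as ($n,$k)=($r/10,$r%10). Call 0 of a block loads the next
# input block and folds it into the IV, calls 1..8 are plain rounds,
# and call 9 emits the key-length-dependent tail (two extra rounds each
# for 192- and 256-bit keys) plus aesenclast. A hedged trace: at $r==13,
# i.e. ($n,$k)==(1,3), the closure emits just
#
#	movups	-32($key),$rndkey[1]	# from 32+16*3-112 == -32
#	aesenc	$rndkey[0],$iv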
sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));		# ror
	&pshufd	(@X[0],@X[-4&7],0xee);	# was &movdqa	(@X[0],@X[-3&7]);
	 eval(shift(@insns));
	&movdqa	(@Tx[0],@X[-1&7]);
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&punpcklqdq(@X[0],@X[-3&7]);	# compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&movdqa	(@Tx[2],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));

	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
	&paddd	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&movdqa	(@Tx[1],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[2],30);
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslld	(@Tx[1],2);
	&pxor	(@X[0],@Tx[2]);
	 eval(shift(@insns));
	  &movdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
	&pshufd	(@Tx[1],@X[-1&7],0xee)	if ($Xi==7);	# was &movdqa	(@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79

	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}
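# For reference, the vector code above evaluates the standard SHA-1
# message schedule four W[] dwords at a time (a scalar sketch; the
# names here are illustrative, not module variables):
#
#	# for ($i=16;$i<80;$i++) {
#	#     $W[$i] = rol($W[$i-3]^$W[$i-8]^$W[$i-14]^$W[$i-16], 1);
#	# }
#
# W[i-3] is not yet available when four lanes are computed at once, so
# the code proceeds with that lane zeroed and patches it up afterwards,
# which is where the "X[0]"<<96 extraction and the final
# "X[0]"^=("X[0]">>96)<<<2 correction come from.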
sub Xupdate_ssse3_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns))		if ($Xi==8);
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns))		if ($Xi==8);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns))		if (@insns[1] =~ /_ror/);
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);
	&punpcklqdq(@Tx[0],@X[-1&7]);	# compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	if ($Xi%5) {
	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else {			# ... or load next one
	  &movdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
	}
	 eval(shift(@insns));		# ror
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);

	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39

	&pslld	(@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psrld	(@Tx[0],30);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror

	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns))		if (@insns[1] =~ /_rol/);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);
	  &pshufd(@Tx[1],@X[-1&7],0xee)	if ($Xi<19);	# was &movdqa	(@Tx[1],@X[0])
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

sub Xuplast_ssse3_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$len);
	&je	(shift);

	unshift(@Tx,pop(@Tx));

	&movdqa	(@Tx[2],"64($K_XX_XX)");	# pbswap mask
	&movdqa	(@Tx[1],"0($K_XX_XX)");		# K_00_19
	&movdqu	(@X[-4&7],"0($inp)");		# load input
	&movdqu	(@X[-3&7],"16($inp)");
	&movdqu	(@X[-2&7],"32($inp)");
	&movdqu	(@X[-1&7],"48($inp)");
	&pshufb	(@X[-4&7],@Tx[2]);		# byte swap
	&add	($inp,64);

  $Xi=0;
}

sub Xloop_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&pshufb	(@X[($Xi-3)&7],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&paddd	(@X[($Xi-4)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psubd	(@X[($Xi-4)&7],@Tx[1]);

	foreach (@insns) { eval; }
  $Xi++;
}

sub Xtail_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}
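# The scalar round bodies below rely on the usual branch-free rewrites
# of the SHA-1 round functions (standard identities, spelled out here
# for reference):
#
#	# rounds  0..19:  Ch(b,c,d)  = (b&c)|((~b)&d)    = ((c^d)&b)^d
#	# rounds 40..59:  Maj(b,c,d) = (b&c)|(b&d)|(c&d) = ((b^c)&(c^d))^c
#
# which is why @T[0] is seeded with (c^d)&b ahead of the loop and the
# partial term for the next round is carried in @T[1].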
my @body_00_19 = (
	'($a,$b,$c,$d,$e)=@V;'.
	'&$_ror	($b,$j?7:2);',	# $b>>>2
	'&xor	(@T[0],$d);',
	'&mov	(@T[1],$a);',	# $b for next round

	'&add	($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
	'&xor	($b,$c);',	# $c^$d for next round

	'&$_rol	($a,5);',
	'&add	($e,@T[0]);',
	'&and	(@T[1],$b);',	# ($b&($c^$d)) for next round

	'&xor	($b,$c);',	# restore $b
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);

sub body_00_19 () {	# ((c^d)&b)^d
    # on start @T[0]=(c^d)&b
    return &body_20_39() if ($rx==19); $rx++;

    use integer;
    my ($k,$n);
    my @r=@body_00_19;

	$n = scalar(@r);
	$k = (($jj+1)*12/20)*20*$n/12;	# 12 aesencs per these 20 rounds
	@r[$k%$n].='&$aesenc();'	if ($jj==$k/$n);
	$jj++;

    return @r;
}
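# The $k/$jj arithmetic above spreads the AES rounds evenly across the
# SHA-1 rounds: 12 aesenc's over rounds 0..19 here, 8 over each of the
# 20..39-style groups below. A standalone sketch of the selection under
# "use integer" semantics, as in the sub above (illustrative only, not
# part of the module):
#
#	use integer;
#	my $n = 10;				# fragments per round body
#	for my $jj (0..19) {
#	    my $k = (($jj+1)*12/20)*20*$n/12;
#	    printf "aesenc in round %d, slot %d\n", $jj, $k%$n
#		if ($jj == $k/$n);		# true for 12 of 20 rounds
#	}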
my @body_20_39 = (
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
	'&xor	(@T[0],$d)	if($j==19);'.
	'&xor	(@T[0],$c)	if($j> 19);',	# ($b^$d^$c)
	'&mov	(@T[1],$a);',	# $b for next round

	'&$_rol	($a,5);',
	'&add	($e,@T[0]);',
	'&xor	(@T[1],$c)	if ($j< 79);',	# $b^$d for next round

	'&$_ror	($b,7);',	# $b>>>2
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);

sub body_20_39 () {	# b^d^c
    # on entry @T[0]=b^d
    return &body_40_59() if ($rx==39); $rx++;

    use integer;
    my ($k,$n);
    my @r=@body_20_39;

	$n = scalar(@r);
	$k = (($jj+1)*8/20)*20*$n/8;	# 8 aesencs per these 20 rounds
	@r[$k%$n].='&$aesenc();'	if ($jj==$k/$n && $rx!=20);
	$jj++;

    return @r;
}

my @body_40_59 = (
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
	'&and	(@T[0],$c)	if ($j>=40);',	# (b^c)&(c^d)
	'&xor	($c,$d)		if ($j>=40);',	# restore $c

	'&$_ror	($b,7);',	# $b>>>2
	'&mov	(@T[1],$a);',	# $b for next round
	'&xor	(@T[0],$c);',

	'&$_rol	($a,5);',
	'&add	($e,@T[0]);',
	'&xor	(@T[1],$c)	if ($j==59);'.
	'&xor	(@T[1],$b)	if ($j< 59);',	# b^c for next round

	'&xor	($b,$c)		if ($j< 59);',	# c^d for next round
	'&add	($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);

sub body_40_59 () {	# ((b^c)&(c^d))^c
    # on entry @T[0]=(b^c), (c^=d)
    $rx++;

    use integer;
    my ($k,$n);
    my @r=@body_40_59;

	$n = scalar(@r);
	$k=(($jj+1)*12/20)*20*$n/12;	# 12 aesencs per these 20 rounds
	@r[$k%$n].='&$aesenc();'	if ($jj==$k/$n && $rx!=40);
	$jj++;

    return @r;
}
$code.=<<___;
.align	32
.Loop_ssse3:
___
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xuplast_ssse3_80(\&body_20_39,".Ldone_ssse3");	# can jump to "done"

	$saved_j=$j;	@saved_V=@V;
	$saved_r=$r;	@saved_rndkey=@rndkey;

	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);

$code.=<<___;
	movups	$iv,48($out,$in0)		# write output
	lea	64($in0),$in0

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_ssse3

.Ldone_ssse3:
___
	$jj=$j=$saved_j; @V=@saved_V;
	$r=$saved_r;     @rndkey=@saved_rndkey;

	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);

$code.=<<___;
	movups	$iv,48($out,$in0)		# write output
	mov	88(%rsp),$ivp			# restore $ivp

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	movups	$iv,($ivp)			# write IV
___
$code.=<<___ if ($win64);
	movaps	96+0(%rsp),%xmm6
	movaps	96+16(%rsp),%xmm7
	movaps	96+32(%rsp),%xmm8
	movaps	96+48(%rsp),%xmm9
	movaps	96+64(%rsp),%xmm10
	movaps	96+80(%rsp),%xmm11
	movaps	96+96(%rsp),%xmm12
	movaps	96+112(%rsp),%xmm13
	movaps	96+128(%rsp),%xmm14
	movaps	96+144(%rsp),%xmm15
___
$code.=<<___;
	lea	`104+($win64?10*16:0)`(%rsp),%rsi
	mov	0(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lepilogue_ssse3:
	ret
.size	aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
___

				if ($stitched_decrypt) {{{
# reset
($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
$j=$jj=$r=$rx=0;
$Xi=4;

# reassign for Atom Silvermont (see above)
($inout0,$inout1,$inout2,$inout3,$rndkey0)=map("%xmm$_",(0..4));
@X=map("%xmm$_",(8..13,6,7));
@Tx=map("%xmm$_",(14,15,5));

my @aes256_dec = (
	'&movdqu($inout0,"0x00($in0)");',
	'&movdqu($inout1,"0x10($in0)");	&pxor	($inout0,$rndkey0);',
	'&movdqu($inout2,"0x20($in0)");	&pxor	($inout1,$rndkey0);',
	'&movdqu($inout3,"0x30($in0)");	&pxor	($inout2,$rndkey0);',

	'&pxor	($inout3,$rndkey0);	&movups	($rndkey0,"16-112($key)");',
	'&movaps("64(%rsp)",@X[2]);',	# save IV, originally @X[3]
	undef,undef
	);
for ($i=0;$i<13;$i++) {
    push (@aes256_dec,(
	'&aesdec	($inout0,$rndkey0);',
	'&aesdec	($inout1,$rndkey0);',
	'&aesdec	($inout2,$rndkey0);',
	'&aesdec	($inout3,$rndkey0);	&movups($rndkey0,"'.(16*($i+2)-112).'($key)");'
	));
    push (@aes256_dec,(undef,undef))	if (($i>=3 && $i<=5) || $i>=11);
    push (@aes256_dec,(undef,undef))	if ($i==5);
}
push(@aes256_dec,(
	'&aesdeclast	($inout0,$rndkey0);	&movups	(@X[0],"0x00($in0)");',
	'&aesdeclast	($inout1,$rndkey0);	&movups	(@X[1],"0x10($in0)");',
	'&aesdeclast	($inout2,$rndkey0);	&movups	(@X[2],"0x20($in0)");',
	'&aesdeclast	($inout3,$rndkey0);	&movups	(@X[3],"0x30($in0)");',

	'&xorps		($inout0,"64(%rsp)");	&movdqu	($rndkey0,"-112($key)");',
	'&xorps		($inout1,@X[0]);	&movups	("0x00($out,$in0)",$inout0);',
	'&xorps		($inout2,@X[1]);	&movups	("0x10($out,$in0)",$inout1);',
	'&xorps		($inout3,@X[2]);	&movups	("0x20($out,$in0)",$inout2);',

	'&movups	("0x30($out,$in0)",$inout3);'
	));
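# A reminder of why four-way interleaving works for CBC *decryption*
# while encryption is strictly serial: each plaintext block depends only
# on already-known ciphertext, P[i] = Decrypt(C[i]) ^ C[i-1], so the
# four $inout registers above can march through all 14 AES-256 rounds in
# lockstep and keep the AES-NI unit busy on their own.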
sub body_00_19_dec () {	# ((c^d)&b)^d
    # on start @T[0]=(c^d)&b
    return &body_20_39_dec() if ($rx==19);

    my @r=@body_00_19;

	unshift (@r,@aes256_dec[$rx])	if (@aes256_dec[$rx]);
	$rx++;

    return @r;
}

sub body_20_39_dec () {	# b^d^c
    # on entry @T[0]=b^d
    return &body_40_59_dec() if ($rx==39);

    my @r=@body_20_39;

	unshift (@r,@aes256_dec[$rx])	if (@aes256_dec[$rx]);
	$rx++;

    return @r;
}

sub body_40_59_dec () {	# ((b^c)&(c^d))^c
    # on entry @T[0]=(b^c), (c^=d)

    my @r=@body_40_59;

	unshift (@r,@aes256_dec[$rx])	if (@aes256_dec[$rx]);
	$rx++;

    return @r;
}

$code.=<<___;
.globl	aesni256_cbc_sha1_dec
.type	aesni256_cbc_sha1_dec,\@abi-omnipotent
.align	32
aesni256_cbc_sha1_dec:
	# caller should check for SSSE3 and AES-NI bits
	mov	OPENSSL_ia32cap_P+0(%rip),%r10d
	mov	OPENSSL_ia32cap_P+4(%rip),%r11d
___
$code.=<<___ if ($avx);
	and	\$`1<<28`,%r11d		# mask AVX bit
	and	\$`1<<30`,%r10d		# mask "Intel CPU" bit
	or	%r11d,%r10d
	cmp	\$`1<<28|1<<30`,%r10d
	je	aesni256_cbc_sha1_dec_avx
___
$code.=<<___;
	jmp	aesni256_cbc_sha1_dec_ssse3
	ret
.size	aesni256_cbc_sha1_dec,.-aesni256_cbc_sha1_dec

.type	aesni256_cbc_sha1_dec_ssse3,\@function,6
.align	32
aesni256_cbc_sha1_dec_ssse3:
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	`-104-($win64?10*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,96+0(%rsp)
	movaps	%xmm7,96+16(%rsp)
	movaps	%xmm8,96+32(%rsp)
	movaps	%xmm9,96+48(%rsp)
	movaps	%xmm10,96+64(%rsp)
	movaps	%xmm11,96+80(%rsp)
	movaps	%xmm12,96+96(%rsp)
	movaps	%xmm13,96+112(%rsp)
	movaps	%xmm14,96+128(%rsp)
	movaps	%xmm15,96+144(%rsp)
.Lprologue_dec_ssse3:
___
$code.=<<___;
	mov	$in0,%r12			# reassign arguments
	mov	$out,%r13
	mov	$len,%r14
	lea	112($key),%r15			# size optimization
	movdqu	($ivp),@X[3]			# load IV
	#mov	$ivp,88(%rsp)			# save $ivp
___
($in0,$out,$len,$key)=map("%r$_",(12..15));	# reassign arguments
$code.=<<___;
	shl	\$6,$len
	sub	$in0,$out
	add	$inp,$len		# end of input

	lea	K_XX_XX(%rip),$K_XX_XX
	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	movdqa	64($K_XX_XX),@Tx[2]	# pbswap mask
	movdqa	0($K_XX_XX),@Tx[1]	# K_00_19
	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	movdqu	16($inp),@X[-3&7]
	movdqu	32($inp),@X[-2&7]
	movdqu	48($inp),@X[-1&7]
	pshufb	@Tx[2],@X[-4&7]		# byte swap
	add	\$64,$inp
	pshufb	@Tx[2],@X[-3&7]
	pshufb	@Tx[2],@X[-2&7]
	pshufb	@Tx[2],@X[-1&7]
	paddd	@Tx[1],@X[-4&7]		# add K_00_19
	paddd	@Tx[1],@X[-3&7]
	paddd	@Tx[1],@X[-2&7]
	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
	psubd	@Tx[1],@X[-4&7]		# restore X[]
	movdqa	@X[-3&7],16(%rsp)
	psubd	@Tx[1],@X[-3&7]
	movdqa	@X[-2&7],32(%rsp)
	psubd	@Tx[1],@X[-2&7]
	movdqu	-112($key),$rndkey0	# $key[0]
	jmp	.Loop_dec_ssse3

.align	32
.Loop_dec_ssse3:
___
	&Xupdate_ssse3_16_31(\&body_00_19_dec);
	&Xupdate_ssse3_16_31(\&body_00_19_dec);
	&Xupdate_ssse3_16_31(\&body_00_19_dec);
	&Xupdate_ssse3_16_31(\&body_00_19_dec);
	&Xupdate_ssse3_32_79(\&body_00_19_dec);
	&Xupdate_ssse3_32_79(\&body_20_39_dec);
	&Xupdate_ssse3_32_79(\&body_20_39_dec);
	&Xupdate_ssse3_32_79(\&body_20_39_dec);
	&Xupdate_ssse3_32_79(\&body_20_39_dec);
	&Xupdate_ssse3_32_79(\&body_20_39_dec);
	&Xupdate_ssse3_32_79(\&body_40_59_dec);
	&Xupdate_ssse3_32_79(\&body_40_59_dec);
	&Xupdate_ssse3_32_79(\&body_40_59_dec);
	&Xupdate_ssse3_32_79(\&body_40_59_dec);
	&Xupdate_ssse3_32_79(\&body_40_59_dec);
	&Xupdate_ssse3_32_79(\&body_20_39_dec);
	&Xuplast_ssse3_80(\&body_20_39_dec,".Ldone_dec_ssse3");	# can jump to "done"

	$saved_j=$j;	@saved_V=@V;
	$saved_rx=$rx;

	&Xloop_ssse3(\&body_20_39_dec);
	&Xloop_ssse3(\&body_20_39_dec);
	&Xloop_ssse3(\&body_20_39_dec);

	eval(@aes256_dec[-1]);			# last store
$code.=<<___;
	lea	64($in0),$in0

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_dec_ssse3

.Ldone_dec_ssse3:
___
	$jj=$j=$saved_j; @V=@saved_V;
	$rx=$saved_rx;

	&Xtail_ssse3(\&body_20_39_dec);
	&Xtail_ssse3(\&body_20_39_dec);
	&Xtail_ssse3(\&body_20_39_dec);

	eval(@aes256_dec[-1]);			# last store
$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	movups	@X[3],($ivp)			# write IV
___
$code.=<<___ if ($win64);
	movaps	96+0(%rsp),%xmm6
	movaps	96+16(%rsp),%xmm7
	movaps	96+32(%rsp),%xmm8
	movaps	96+48(%rsp),%xmm9
	movaps	96+64(%rsp),%xmm10
	movaps	96+80(%rsp),%xmm11
	movaps	96+96(%rsp),%xmm12
	movaps	96+112(%rsp),%xmm13
	movaps	96+128(%rsp),%xmm14
	movaps	96+144(%rsp),%xmm15
___
$code.=<<___;
	lea	`104+($win64?10*16:0)`(%rsp),%rsi
	mov	0(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lepilogue_dec_ssse3:
	ret
.size	aesni256_cbc_sha1_dec_ssse3,.-aesni256_cbc_sha1_dec_ssse3
___
						}}}
$j=$jj=$r=$rx=0;

if ($avx) {
my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");
my ($rndkey0,$iv,$in)=map("%xmm$_",(11..13));
my @rndkey=("%xmm14","%xmm15");
my ($inout0,$inout1,$inout2,$inout3)=map("%xmm$_",(12..15));	# for dec
my $Kx=@Tx[2];

my $_rol=sub { &shld(@_[0],@_) };
my $_ror=sub { &shrd(@_[0],@_) };

$code.=<<___;
.type	aesni_cbc_sha1_enc_avx,\@function,6
.align	32
aesni_cbc_sha1_enc_avx:
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
	#shr	\$6,$len			# debugging artefact
	#jz	.Lepilogue_avx			# debugging artefact
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	`-104-($win64?10*16:0)`(%rsp),%rsp
	#mov	$in0,$inp			# debugging artefact
	#lea	64(%rsp),$ctx			# debugging artefact
___
$code.=<<___ if ($win64);
	movaps	%xmm6,96+0(%rsp)
	movaps	%xmm7,96+16(%rsp)
	movaps	%xmm8,96+32(%rsp)
	movaps	%xmm9,96+48(%rsp)
	movaps	%xmm10,96+64(%rsp)
	movaps	%xmm11,96+80(%rsp)
	movaps	%xmm12,96+96(%rsp)
	movaps	%xmm13,96+112(%rsp)
	movaps	%xmm14,96+128(%rsp)
	movaps	%xmm15,96+144(%rsp)
.Lprologue_avx:
___
$code.=<<___;
	vzeroall
	mov	$in0,%r12			# reassign arguments
	mov	$out,%r13
	mov	$len,%r14
	lea	112($key),%r15			# size optimization
	vmovdqu	($ivp),$iv			# load IV
	mov	$ivp,88(%rsp)			# save $ivp
___
($in0,$out,$len,$key)=map("%r$_",(12..15));	# reassign arguments
my $rounds="${ivp}d";
$code.=<<___;
	shl	\$6,$len
	sub	$in0,$out
	mov	240-112($key),$rounds
	add	$inp,$len		# end of input

	lea	K_XX_XX(%rip),$K_XX_XX
	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
	vmovdqa	0($K_XX_XX),$Kx		# K_00_19
	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	vmovdqu	16($inp),@X[-3&7]
	vmovdqu	32($inp),@X[-2&7]
	vmovdqu	48($inp),@X[-1&7]
	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
	add	\$64,$inp
	vpshufb	@X[2],@X[-3&7],@X[-3&7]
	vpshufb	@X[2],@X[-2&7],@X[-2&7]
	vpshufb	@X[2],@X[-1&7],@X[-1&7]
	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
	vpaddd	$Kx,@X[-3&7],@X[1]
	vpaddd	$Kx,@X[-2&7],@X[2]
	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vmovdqa	@X[1],16(%rsp)
	vmovdqa	@X[2],32(%rsp)
	vmovups	-112($key),$rndkey[1]	# $key[0]
	vmovups	16-112($key),$rndkey[0]	# forward reference
	jmp	.Loop_avx
___

my $aesenc=sub {
  use integer;
  my ($n,$k)=($r/10,$r%10);
    if ($k==0) {
      $code.=<<___;
	vmovdqu		`16*$n`($in0),$in	# load input
	vpxor		$rndkey[1],$in,$in
___
      $code.=<<___ if ($n);
	vmovups		$iv,`16*($n-1)`($out,$in0)	# write output
___
      $code.=<<___;
	vpxor		$in,$iv,$iv
	vaesenc		$rndkey[0],$iv,$iv
	vmovups		`32+16*$k-112`($key),$rndkey[1]
___
    } elsif ($k==9) {
      $sn++;
      $code.=<<___;
	cmp		\$11,$rounds
	jb		.Lvaesenclast$sn
	vaesenc		$rndkey[0],$iv,$iv
	vmovups		`32+16*($k+0)-112`($key),$rndkey[1]
	vaesenc		$rndkey[1],$iv,$iv
	vmovups		`32+16*($k+1)-112`($key),$rndkey[0]
	je		.Lvaesenclast$sn
	vaesenc		$rndkey[0],$iv,$iv
	vmovups		`32+16*($k+2)-112`($key),$rndkey[1]
	vaesenc		$rndkey[1],$iv,$iv
	vmovups		`32+16*($k+3)-112`($key),$rndkey[0]
.Lvaesenclast$sn:
	vaesenclast	$rndkey[0],$iv,$iv
	vmovups		-112($key),$rndkey[0]
	vmovups		16-112($key),$rndkey[1]	# forward reference
___
    } else {
      $code.=<<___;
	vaesenc		$rndkey[0],$iv,$iv
	vmovups		`32+16*$k-112`($key),$rndkey[1]
___
    }
  $r++;	unshift(@rndkey,pop(@rndkey));
};
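# Note the three-operand VEX forms used by this closure and the Xupdate
# routines below: where the SSSE3 path must copy before it can operate,
# e.g.
#
#	movdqa	%xmm4,%xmm8
#	pshufb	%xmm3,%xmm8
#
# AVX writes the result to a third register in one instruction (the
# registers here are arbitrary, for illustration only):
#
#	vpshufb	%xmm3,%xmm4,%xmm8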
sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@Tx[1],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	&vpsrld	(@Tx[0],@Tx[1],30);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@Tx[1],@Tx[1],2);
	&vpxor	(@X[0],@X[0],@Tx[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	($Kx,eval(16*(($Xi)/5))."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));


	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

sub Xupdate_avx_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	  &vmovdqa	($Kx,eval(16*($Xi/5))."($K_XX_XX)")	if ($Xi%5==0);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
"X[0]"^="X[-6]" 1237 eval(shift(@insns)); # body_20_39 1238 eval(shift(@insns)); 1239 eval(shift(@insns)); 1240 eval(shift(@insns)); # rol 1241 1242 &vpsrld (@Tx[0],@X[0],30); 1243 &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU 1244 eval(shift(@insns)); 1245 eval(shift(@insns)); 1246 eval(shift(@insns)); # ror 1247 eval(shift(@insns)); 1248 1249 &vpslld (@X[0],@X[0],2); 1250 eval(shift(@insns)); # body_20_39 1251 eval(shift(@insns)); 1252 eval(shift(@insns)); 1253 eval(shift(@insns)); # rol 1254 eval(shift(@insns)); 1255 eval(shift(@insns)); 1256 eval(shift(@insns)); # ror 1257 eval(shift(@insns)); 1258 1259 &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2 1260 eval(shift(@insns)); # body_20_39 1261 eval(shift(@insns)); 1262 eval(shift(@insns)); 1263 eval(shift(@insns)); # rol 1264 eval(shift(@insns)); 1265 eval(shift(@insns)); 1266 eval(shift(@insns)); # rol 1267 eval(shift(@insns)); 1268 1269 foreach (@insns) { eval; } # remaining instructions 1270 1271 $Xi++; push(@X,shift(@X)); # "rotate" X[] 1272} 1273 1274sub Xuplast_avx_80() 1275{ use integer; 1276 my $body = shift; 1277 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions 1278 my ($a,$b,$c,$d,$e); 1279 1280 eval(shift(@insns)); 1281 &vpaddd (@Tx[1],$Kx,@X[-1&7]); 1282 eval(shift(@insns)); 1283 eval(shift(@insns)); 1284 eval(shift(@insns)); 1285 eval(shift(@insns)); 1286 1287 &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU 1288 1289 foreach (@insns) { eval; } # remaining instructions 1290 1291 &cmp ($inp,$len); 1292 &je (shift); 1293 1294 &vmovdqa(@Tx[1],"64($K_XX_XX)"); # pbswap mask 1295 &vmovdqa($Kx,"0($K_XX_XX)"); # K_00_19 1296 &vmovdqu(@X[-4&7],"0($inp)"); # load input 1297 &vmovdqu(@X[-3&7],"16($inp)"); 1298 &vmovdqu(@X[-2&7],"32($inp)"); 1299 &vmovdqu(@X[-1&7],"48($inp)"); 1300 &vpshufb(@X[-4&7],@X[-4&7],@Tx[1]); # byte swap 1301 &add ($inp,64); 1302 1303 $Xi=0; 1304} 1305 1306sub Xloop_avx() 1307{ use integer; 1308 my $body = shift; 1309 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions 1310 my ($a,$b,$c,$d,$e); 1311 1312 eval(shift(@insns)); 1313 eval(shift(@insns)); 1314 &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@Tx[1]); 1315 eval(shift(@insns)); 1316 eval(shift(@insns)); 1317 &vpaddd (@Tx[0],@X[($Xi-4)&7],$Kx); 1318 eval(shift(@insns)); 1319 eval(shift(@insns)); 1320 eval(shift(@insns)); 1321 eval(shift(@insns)); 1322 &vmovdqa(eval(16*$Xi)."(%rsp)",@Tx[0]); # X[]+K xfer to IALU 1323 eval(shift(@insns)); 1324 eval(shift(@insns)); 1325 1326 foreach (@insns) { eval; } 1327 $Xi++; 1328} 1329 1330sub Xtail_avx() 1331{ use integer; 1332 my $body = shift; 1333 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions 1334 my ($a,$b,$c,$d,$e); 1335 1336 foreach (@insns) { eval; } 1337} 1338 1339$code.=<<___; 1340.align 32 1341.Loop_avx: 1342___ 1343 &Xupdate_avx_16_31(\&body_00_19); 1344 &Xupdate_avx_16_31(\&body_00_19); 1345 &Xupdate_avx_16_31(\&body_00_19); 1346 &Xupdate_avx_16_31(\&body_00_19); 1347 &Xupdate_avx_32_79(\&body_00_19); 1348 &Xupdate_avx_32_79(\&body_20_39); 1349 &Xupdate_avx_32_79(\&body_20_39); 1350 &Xupdate_avx_32_79(\&body_20_39); 1351 &Xupdate_avx_32_79(\&body_20_39); 1352 &Xupdate_avx_32_79(\&body_20_39); 1353 &Xupdate_avx_32_79(\&body_40_59); 1354 &Xupdate_avx_32_79(\&body_40_59); 1355 &Xupdate_avx_32_79(\&body_40_59); 1356 &Xupdate_avx_32_79(\&body_40_59); 1357 &Xupdate_avx_32_79(\&body_40_59); 1358 &Xupdate_avx_32_79(\&body_20_39); 1359 &Xuplast_avx_80(\&body_20_39,".Ldone_avx"); # can jump to "done" 1360 1361 $saved_j=$j; @saved_V=@V; 1362 $saved_r=$r; 
	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);

$code.=<<___;
	vmovups	$iv,48($out,$in0)		# write output
	lea	64($in0),$in0

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_avx

.Ldone_avx:
___
	$jj=$j=$saved_j; @V=@saved_V;
	$r=$saved_r;     @rndkey=@saved_rndkey;

	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);

$code.=<<___;
	vmovups	$iv,48($out,$in0)		# write output
	mov	88(%rsp),$ivp			# restore $ivp

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	vmovups	$iv,($ivp)			# write IV
	vzeroall
___
$code.=<<___ if ($win64);
	movaps	96+0(%rsp),%xmm6
	movaps	96+16(%rsp),%xmm7
	movaps	96+32(%rsp),%xmm8
	movaps	96+48(%rsp),%xmm9
	movaps	96+64(%rsp),%xmm10
	movaps	96+80(%rsp),%xmm11
	movaps	96+96(%rsp),%xmm12
	movaps	96+112(%rsp),%xmm13
	movaps	96+128(%rsp),%xmm14
	movaps	96+144(%rsp),%xmm15
___
$code.=<<___;
	lea	`104+($win64?10*16:0)`(%rsp),%rsi
	mov	0(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lepilogue_avx:
	ret
.size	aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
___

				if ($stitched_decrypt) {{{
# reset
($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

$j=$jj=$r=$rx=0;
$Xi=4;

@aes256_dec = (
	'&vpxor	($inout0,$rndkey0,"0x00($in0)");',
	'&vpxor	($inout1,$rndkey0,"0x10($in0)");',
	'&vpxor	($inout2,$rndkey0,"0x20($in0)");',
	'&vpxor	($inout3,$rndkey0,"0x30($in0)");',

	'&vmovups($rndkey0,"16-112($key)");',
	'&vmovups("64(%rsp)",@X[2]);',		# save IV, originally @X[3]
	undef,undef
	);
for ($i=0;$i<13;$i++) {
    push (@aes256_dec,(
	'&vaesdec	($inout0,$inout0,$rndkey0);',
	'&vaesdec	($inout1,$inout1,$rndkey0);',
	'&vaesdec	($inout2,$inout2,$rndkey0);',
	'&vaesdec	($inout3,$inout3,$rndkey0);	&vmovups($rndkey0,"'.(16*($i+2)-112).'($key)");'
	));
    push (@aes256_dec,(undef,undef))	if (($i>=3 && $i<=5) || $i>=11);
    push (@aes256_dec,(undef,undef))	if ($i==5);
}
push(@aes256_dec,(
	'&vaesdeclast	($inout0,$inout0,$rndkey0);	&vmovups(@X[0],"0x00($in0)");',
	'&vaesdeclast	($inout1,$inout1,$rndkey0);	&vmovups(@X[1],"0x10($in0)");',
	'&vaesdeclast	($inout2,$inout2,$rndkey0);	&vmovups(@X[2],"0x20($in0)");',
	'&vaesdeclast	($inout3,$inout3,$rndkey0);	&vmovups(@X[3],"0x30($in0)");',

	'&vxorps	($inout0,$inout0,"64(%rsp)");	&vmovdqu($rndkey0,"-112($key)");',
	'&vxorps	($inout1,$inout1,@X[0]);	&vmovups("0x00($out,$in0)",$inout0);',
	'&vxorps	($inout2,$inout2,@X[1]);	&vmovups("0x10($out,$in0)",$inout1);',
	'&vxorps	($inout3,$inout3,@X[2]);	&vmovups("0x20($out,$in0)",$inout2);',

	'&vmovups	("0x30($out,$in0)",$inout3);'
	));

$code.=<<___;
.type	aesni256_cbc_sha1_dec_avx,\@function,6
.align	32
aesni256_cbc_sha1_dec_avx:
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	`-104-($win64?10*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,96+0(%rsp)
	movaps	%xmm7,96+16(%rsp)
	movaps	%xmm8,96+32(%rsp)
	movaps	%xmm9,96+48(%rsp)
	movaps	%xmm10,96+64(%rsp)
	movaps	%xmm11,96+80(%rsp)
	movaps	%xmm12,96+96(%rsp)
	movaps	%xmm13,96+112(%rsp)
	movaps	%xmm14,96+128(%rsp)
	movaps	%xmm15,96+144(%rsp)
.Lprologue_dec_avx:
___
$code.=<<___;
	vzeroall
	mov	$in0,%r12			# reassign arguments
	mov	$out,%r13
	mov	$len,%r14
	lea	112($key),%r15			# size optimization
	vmovdqu	($ivp),@X[3]			# load IV
___
($in0,$out,$len,$key)=map("%r$_",(12..15));	# reassign arguments
$code.=<<___;
	shl	\$6,$len
	sub	$in0,$out
	add	$inp,$len		# end of input

	lea	K_XX_XX(%rip),$K_XX_XX
	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
	vmovdqa	0($K_XX_XX),$Kx		# K_00_19
	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	vmovdqu	16($inp),@X[-3&7]
	vmovdqu	32($inp),@X[-2&7]
	vmovdqu	48($inp),@X[-1&7]
	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
	add	\$64,$inp
	vpshufb	@X[2],@X[-3&7],@X[-3&7]
	vpshufb	@X[2],@X[-2&7],@X[-2&7]
	vpshufb	@X[2],@X[-1&7],@X[-1&7]
	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
	vpaddd	$Kx,@X[-3&7],@X[1]
	vpaddd	$Kx,@X[-2&7],@X[2]
	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vmovdqa	@X[1],16(%rsp)
	vmovdqa	@X[2],32(%rsp)
	vmovups	-112($key),$rndkey0	# $key[0]
	jmp	.Loop_dec_avx

.align	32
.Loop_dec_avx:
___
	&Xupdate_avx_16_31(\&body_00_19_dec);
	&Xupdate_avx_16_31(\&body_00_19_dec);
	&Xupdate_avx_16_31(\&body_00_19_dec);
	&Xupdate_avx_16_31(\&body_00_19_dec);
	&Xupdate_avx_32_79(\&body_00_19_dec);
	&Xupdate_avx_32_79(\&body_20_39_dec);
	&Xupdate_avx_32_79(\&body_20_39_dec);
	&Xupdate_avx_32_79(\&body_20_39_dec);
	&Xupdate_avx_32_79(\&body_20_39_dec);
	&Xupdate_avx_32_79(\&body_20_39_dec);
	&Xupdate_avx_32_79(\&body_40_59_dec);
	&Xupdate_avx_32_79(\&body_40_59_dec);
	&Xupdate_avx_32_79(\&body_40_59_dec);
	&Xupdate_avx_32_79(\&body_40_59_dec);
	&Xupdate_avx_32_79(\&body_40_59_dec);
	&Xupdate_avx_32_79(\&body_20_39_dec);
	&Xuplast_avx_80(\&body_20_39_dec,".Ldone_dec_avx");	# can jump to "done"

	$saved_j=$j;	@saved_V=@V;
	$saved_rx=$rx;

	&Xloop_avx(\&body_20_39_dec);
	&Xloop_avx(\&body_20_39_dec);
	&Xloop_avx(\&body_20_39_dec);

	eval(@aes256_dec[-1]);			# last store
$code.=<<___;
	lea	64($in0),$in0

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_dec_avx

.Ldone_dec_avx:
___
	$jj=$j=$saved_j; @V=@saved_V;
	$rx=$saved_rx;

	&Xtail_avx(\&body_20_39_dec);
	&Xtail_avx(\&body_20_39_dec);
	&Xtail_avx(\&body_20_39_dec);

	eval(@aes256_dec[-1]);			# last store
$code.=<<___;

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	vmovups	@X[3],($ivp)			# write IV
	vzeroall
___
$code.=<<___ if ($win64);
	movaps	96+0(%rsp),%xmm6
	movaps	96+16(%rsp),%xmm7
	movaps	96+32(%rsp),%xmm8
	movaps	96+48(%rsp),%xmm9
	movaps	96+64(%rsp),%xmm10
	movaps	96+80(%rsp),%xmm11
	movaps	96+96(%rsp),%xmm12
	movaps	96+112(%rsp),%xmm13
	movaps	96+128(%rsp),%xmm14
	movaps	96+144(%rsp),%xmm15
___
$code.=<<___;
	lea	`104+($win64?10*16:0)`(%rsp),%rsi
	mov	0(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lepilogue_dec_avx:
	ret
.size	aesni256_cbc_sha1_dec_avx,.-aesni256_cbc_sha1_dec_avx
___
				}}}
}
$code.=<<___;
.align	64
K_XX_XX:
.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0

.asciz	"AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___
						if ($shaext) {{{
($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

$rounds="%r11d";

($iv,$in,$rndkey0)=map("%xmm$_",(2,14,15));
@rndkey=("%xmm0","%xmm1");
$r=0;

my ($BSWAP,$ABCD,$E,$E_,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(7..12));
my @MSG=map("%xmm$_",(3..6));
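# An informal cheat-sheet for the SHA extension instructions used below
# (paraphrased from Intel's documentation; consult the SDM for the
# authoritative definitions):
#
#	sha1rnds4 $i,src,dst	# four SHA-1 rounds on the state in dst,
#				# function/constant pair selected by i
#	sha1nexte src,dst	# dst = src with rol(dst[127:96],30) added
#				# into the top dword, i.e. the E operand
#				# for the following sha1rnds4
#	sha1msg1 src,dst	# first half of the W[] schedule update
#	sha1msg2 src,dst	# second half, including the rotate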
$code.=<<___;
.type	aesni_cbc_sha1_enc_shaext,\@function,6
.align	32
aesni_cbc_sha1_enc_shaext:
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
___
$code.=<<___ if ($win64);
	lea	`-8-10*16`(%rsp),%rsp
	movaps	%xmm6,-8-10*16(%rax)
	movaps	%xmm7,-8-9*16(%rax)
	movaps	%xmm8,-8-8*16(%rax)
	movaps	%xmm9,-8-7*16(%rax)
	movaps	%xmm10,-8-6*16(%rax)
	movaps	%xmm11,-8-5*16(%rax)
	movaps	%xmm12,-8-4*16(%rax)
	movaps	%xmm13,-8-3*16(%rax)
	movaps	%xmm14,-8-2*16(%rax)
	movaps	%xmm15,-8-1*16(%rax)
.Lprologue_shaext:
___
$code.=<<___;
	movdqu	($ctx),$ABCD
	movd	16($ctx),$E
	movdqa	K_XX_XX+0x50(%rip),$BSWAP	# byte-n-word swap

	mov	240($key),$rounds
	sub	$in0,$out
	movups	($key),$rndkey0			# $key[0]
	movups	($ivp),$iv			# load IV
	movups	16($key),$rndkey[0]		# forward reference
	lea	112($key),$key			# size optimization

	pshufd	\$0b00011011,$ABCD,$ABCD	# flip word order
	pshufd	\$0b00011011,$E,$E		# flip word order
	jmp	.Loop_shaext

.align	16
.Loop_shaext:
___
	&$aesenc();
$code.=<<___;
	movdqu		($inp),@MSG[0]
	movdqa		$E,$E_SAVE		# offload $E
	pshufb		$BSWAP,@MSG[0]
	movdqu		0x10($inp),@MSG[1]
	movdqa		$ABCD,$ABCD_SAVE	# offload $ABCD
___
	&$aesenc();
$code.=<<___;
	pshufb		$BSWAP,@MSG[1]

	paddd		@MSG[0],$E
	movdqu		0x20($inp),@MSG[2]
	lea		0x40($inp),$inp
	pxor		$E_SAVE,@MSG[0]		# black magic
___
	&$aesenc();
$code.=<<___;
	pxor		$E_SAVE,@MSG[0]		# black magic
	movdqa		$ABCD,$E_
	pshufb		$BSWAP,@MSG[2]
	sha1rnds4	\$0,$E,$ABCD		# 0-3
	sha1nexte	@MSG[1],$E_
___
	&$aesenc();
$code.=<<___;
	sha1msg1	@MSG[1],@MSG[0]
	movdqu		-0x10($inp),@MSG[3]
	movdqa		$ABCD,$E
	pshufb		$BSWAP,@MSG[3]
___
	&$aesenc();
$code.=<<___;
	sha1rnds4	\$0,$E_,$ABCD		# 4-7
	sha1nexte	@MSG[2],$E
	pxor		@MSG[2],@MSG[0]
	sha1msg1	@MSG[2],@MSG[1]
___
	&$aesenc();

for($i=2;$i<20-4;$i++) {
$code.=<<___;
	movdqa		$ABCD,$E_
	sha1rnds4	\$`int($i/5)`,$E,$ABCD	# 8-11
	sha1nexte	@MSG[3],$E_
___
	&$aesenc();
$code.=<<___;
	sha1msg2	@MSG[3],@MSG[0]
	pxor		@MSG[3],@MSG[1]
	sha1msg1	@MSG[3],@MSG[2]
___
	($E,$E_)=($E_,$E);
	push(@MSG,shift(@MSG));

	&$aesenc();
}
$code.=<<___;
	movdqa		$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD		# 64-67
	sha1nexte	@MSG[3],$E_
	sha1msg2	@MSG[3],@MSG[0]
	pxor		@MSG[3],@MSG[1]
___
	&$aesenc();
$code.=<<___;
	movdqa		$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 68-71
	sha1nexte	@MSG[0],$E
	sha1msg2	@MSG[0],@MSG[1]
___
	&$aesenc();
$code.=<<___;
	movdqa		$E_SAVE,@MSG[0]
	movdqa		$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD		# 72-75
	sha1nexte	@MSG[1],$E_
___
	&$aesenc();
$code.=<<___;
	movdqa		$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 76-79
	sha1nexte	$MSG[0],$E
___
	while($r<40)	{ &$aesenc(); }		# remaining aesenc's
$code.=<<___;
	dec		$len

	paddd		$ABCD_SAVE,$ABCD
	movups		$iv,48($out,$in0)	# write output
	lea		64($in0),$in0
	jnz		.Loop_shaext

	pshufd		\$0b00011011,$ABCD,$ABCD
	pshufd		\$0b00011011,$E,$E
	movups		$iv,($ivp)		# write IV
	movdqu		$ABCD,($ctx)
	movd		$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-8-10*16(%rax),%xmm6
	movaps	-8-9*16(%rax),%xmm7
	movaps	-8-8*16(%rax),%xmm8
	movaps	-8-7*16(%rax),%xmm9
	movaps	-8-6*16(%rax),%xmm10
	movaps	-8-5*16(%rax),%xmm11
	movaps	-8-4*16(%rax),%xmm12
	movaps	-8-3*16(%rax),%xmm13
	movaps	-8-2*16(%rax),%xmm14
	movaps	-8-1*16(%rax),%xmm15
	mov	%rax,%rsp
.Lepilogue_shaext:
___
$code.=<<___;
	ret
.size	aesni_cbc_sha1_enc_shaext,.-aesni_cbc_sha1_enc_shaext
___
						}}}
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	ssse3_handler,\@abi-omnipotent
.align	16
ssse3_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail
___
$code.=<<___ if ($shaext);
	lea	aesni_cbc_sha1_enc_shaext(%rip),%r10
	cmp	%r10,%rbx
	jb	.Lseh_no_shaext

	lea	(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq
	lea	168(%rax),%rax		# adjust stack pointer
	jmp	.Lcommon_seh_tail
.Lseh_no_shaext:
___
$code.=<<___;
	lea	96(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq
	lea	`104+10*16`(%rax),%rax	# adjust stack pointer

	mov	0(%rax),%r15
	mov	8(%rax),%r14
	mov	16(%rax),%r13
	mov	24(%rax),%r12
	mov	32(%rax),%rbp
	mov	40(%rax),%rbx
	lea	48(%rax),%rax
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	ssse3_handler,.-ssse3_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_aesni_cbc_sha1_enc_ssse3
	.rva	.LSEH_end_aesni_cbc_sha1_enc_ssse3
	.rva	.LSEH_info_aesni_cbc_sha1_enc_ssse3
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_aesni_cbc_sha1_enc_avx
	.rva	.LSEH_end_aesni_cbc_sha1_enc_avx
	.rva	.LSEH_info_aesni_cbc_sha1_enc_avx
___
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_aesni_cbc_sha1_enc_shaext
	.rva	.LSEH_end_aesni_cbc_sha1_enc_shaext
	.rva	.LSEH_info_aesni_cbc_sha1_enc_shaext
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_aesni_cbc_sha1_enc_ssse3:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_aesni_cbc_sha1_enc_avx:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($shaext);
.LSEH_info_aesni_cbc_sha1_enc_shaext:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_shaext,.Lepilogue_shaext	# HandlerData[]
___
}

####################################################################
sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if($dst>=8);
    $rex|=0x01			if($src>=8);
    unshift @opcode,$rex|0x40	if($rex);
}
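# The subs below hand-assemble SHA and AES-NI instructions into .byte
# sequences so that the module still builds with assemblers that
# predate these mnemonics. A worked example, derived by tracing
# sha1rnds4() below (shown for illustration):
#
#	sha1rnds4	$3,%xmm4,%xmm0
#
# becomes
#
#	.byte	0x0f,0x3a,0xcc,0xc4,3	# 0xc4 = ModR/M: reg=xmm0, rm=xmm4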
sub sha1rnds4 {
    if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x3a,0xcc);
	rex(\@opcode,$3,$2);
	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
	my $c=$1;
	push @opcode,$c=~/^0/?oct($c):$c;
	return ".byte\t".join(',',@opcode);
    } else {
	return "sha1rnds4\t".@_[0];
    }
}

sub sha1op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha1nexte" => 0xc8,
		"sha1msg1"  => 0xc9,
		"sha1msg2"  => 0xca	);

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
	rex(\@opcode,$2,$1);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
}

sub aesni {
  my $line=shift;
  my @opcode=(0x0f,0x38);

    if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my %opcodelet = (
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	return undef if (!defined($opcodelet{$1}));
	rex(\@opcode,$3,$2);
	push @opcode,$opcodelet{$1},0xc0|($2&7)|(($3&7)<<3);	# ModR/M
	unshift @opcode,0x66;
	return ".byte\t".join(',',@opcode);
    }
    return $line;
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo	or
	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo	or
	s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/geo;

	print $_,"\n";
}
close STDOUT;