1#!/usr/bin/env perl 2# 3# ==================================================================== 4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 5# project. Rights for redistribution and usage in source and binary 6# forms are granted according to the OpenSSL license. 7# ==================================================================== 8# 9# sha256/512_block procedure for x86_64. 10# 11# 40% improvement over compiler-generated code on Opteron. On EM64T 12# sha256 was observed to run >80% faster and sha512 - >40%. No magical 13# tricks, just straight implementation... I really wonder why gcc 14# [being armed with inline assembler] fails to generate as fast code. 15# The only thing which is cool about this module is that it's very 16# same instruction sequence used for both SHA-256 and SHA-512. In 17# former case the instructions operate on 32-bit operands, while in 18# latter - on 64-bit ones. All I had to do is to get one flavor right, 19# the other one passed the test right away:-) 20# 21# sha256_block runs in ~1005 cycles on Opteron, which gives you 22# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock 23# frequency in GHz. sha512_block runs in ~1275 cycles, which results 24# in 128*1000/1275=100MBps per GHz. Is there room for improvement? 25# Well, if you compare it to IA-64 implementation, which maintains 26# X[16] in register bank[!], tends to 4 instructions per CPU clock 27# cycle and runs in 1003 cycles, 1275 is very good result for 3-way 28# issue Opteron pipeline and X[16] maintained in memory. So that *if* 29# there is a way to improve it, *then* the only way would be to try to 30# offload X[16] updates to SSE unit, but that would require "deeper" 31# loop unroll, which in turn would naturally cause size blow-up, not 32# to mention increased complexity! And once again, only *if* it's 33# actually possible to noticeably improve overall ILP, instruction 34# level parallelism, on a given CPU implementation in this case. 35# 36# Special note on Intel EM64T. While Opteron CPU exhibits perfect 37# perfromance ratio of 1.5 between 64- and 32-bit flavors [see above], 38# [currently available] EM64T CPUs apparently are far from it. On the 39# contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit 40# sha256_block:-( This is presumably because 64-bit shifts/rotates 41# apparently are not atomic instructions, but implemented in microcode. 42# 43# May 2012. 44# 45# Optimization including one of Pavel Semjanov's ideas, alternative 46# Maj, resulted in >=5% improvement on most CPUs, +20% SHA256 and 47# unfortunately -2% SHA512 on P4 [which nobody should care about 48# that much]. 49# 50# June 2012. 51# 52# Add SIMD code paths, see below for improvement coefficients. SSSE3 53# code path was not attempted for SHA512, because improvement is not 54# estimated to be high enough, noticeably less than 9%, to justify 55# the effort, not on pre-AVX processors. [Obviously with exclusion 56# for VIA Nano, but it has SHA512 instruction that is faster and 57# should be used instead.] For reference, corresponding estimated 58# upper limit for improvement for SSSE3 SHA256 is 28%. The fact that 59# higher coefficients are observed on VIA Nano and Bulldozer has more 60# to do with specifics of their architecture [which is topic for 61# separate discussion]. 62# 63# November 2012. 64# 65# Add AVX2 code path. Two consecutive input blocks are loaded to 66# 256-bit %ymm registers, with data from first block to least 67# significant 128-bit halves and data from second to most significant. 68# The data is then processed with same SIMD instruction sequence as 69# for AVX, but with %ymm as operands. Side effect is increased stack 70# frame, 448 additional bytes in SHA256 and 1152 in SHA512, and 1.2KB 71# code size increase. 72# 73# March 2014. 74# 75# Add support for Intel SHA Extensions. 76 77###################################################################### 78# Current performance in cycles per processed byte (less is better): 79# 80# SHA256 SSSE3 AVX/XOP(*) SHA512 AVX/XOP(*) 81# 82# AMD K8 14.9 - - 9.57 - 83# P4 17.3 - - 30.8 - 84# Core 2 15.6 13.8(+13%) - 9.97 - 85# Westmere 14.8 12.3(+19%) - 9.58 - 86# Sandy Bridge 17.4 14.2(+23%) 11.6(+50%(**)) 11.2 8.10(+38%(**)) 87# Ivy Bridge 12.6 10.5(+20%) 10.3(+22%) 8.17 7.22(+13%) 88# Haswell 12.2 9.28(+31%) 7.80(+56%) 7.66 5.40(+42%) 89# Bulldozer 21.1 13.6(+54%) 13.6(+54%(***)) 13.5 8.58(+57%) 90# VIA Nano 23.0 16.5(+39%) - 14.7 - 91# Atom 23.0 18.9(+22%) - 14.7 - 92# Silvermont 27.4 20.6(+33%) - 17.5 - 93# 94# (*) whichever best applicable; 95# (**) switch from ror to shrd stands for fair share of improvement; 96# (***) execution time is fully determined by remaining integer-only 97# part, body_00_15; reducing the amount of SIMD instructions 98# below certain limit makes no difference/sense; to conserve 99# space SHA256 XOP code path is therefore omitted; 100 101$flavour = shift; 102$output = shift; 103if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } 104 105$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); 106 107$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 108( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or 109( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or 110die "can't locate x86_64-xlate.pl"; 111 112if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` 113 =~ /GNU assembler version ([2-9]\.[0-9]+)/) { 114 $avx = ($1>=2.19) + ($1>=2.22); 115} 116 117if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && 118 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { 119 $avx = ($1>=2.09) + ($1>=2.10); 120} 121 122if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && 123 `ml64 2>&1` =~ /Version ([0-9]+)\./) { 124 $avx = ($1>=10) + ($1>=11); 125} 126 127if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) { 128 $avx = ($2>=3.0) + ($2>3.0); 129} 130 131$shaext=1; ### set to zero if compiling for 1.0.1 132$avx=1 if (!$shaext && $avx); 133 134open OUT,"| \"$^X\" $xlate $flavour $output"; 135*STDOUT=*OUT; 136 137if ($output =~ /512/) { 138 $func="sha512_block_data_order"; 139 $TABLE="K512"; 140 $SZ=8; 141 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx", 142 "%r8", "%r9", "%r10","%r11"); 143 ($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi"); 144 @Sigma0=(28,34,39); 145 @Sigma1=(14,18,41); 146 @sigma0=(1, 8, 7); 147 @sigma1=(19,61, 6); 148 $rounds=80; 149} else { 150 $func="sha256_block_data_order"; 151 $TABLE="K256"; 152 $SZ=4; 153 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx", 154 "%r8d","%r9d","%r10d","%r11d"); 155 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi"); 156 @Sigma0=( 2,13,22); 157 @Sigma1=( 6,11,25); 158 @sigma0=( 7,18, 3); 159 @sigma1=(17,19,10); 160 $rounds=64; 161} 162 163$ctx="%rdi"; # 1st arg, zapped by $a3 164$inp="%rsi"; # 2nd arg 165$Tbl="%rbp"; 166 167$_ctx="16*$SZ+0*8(%rsp)"; 168$_inp="16*$SZ+1*8(%rsp)"; 169$_end="16*$SZ+2*8(%rsp)"; 170$_rsp="16*$SZ+3*8(%rsp)"; 171$framesz="16*$SZ+4*8"; 172 173 174sub ROUND_00_15() 175{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; 176 my $STRIDE=$SZ; 177 $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1)); 178 179$code.=<<___; 180 ror \$`$Sigma1[2]-$Sigma1[1]`,$a0 181 mov $f,$a2 182 183 xor $e,$a0 184 ror \$`$Sigma0[2]-$Sigma0[1]`,$a1 185 xor $g,$a2 # f^g 186 187 mov $T1,`$SZ*($i&0xf)`(%rsp) 188 xor $a,$a1 189 and $e,$a2 # (f^g)&e 190 191 ror \$`$Sigma1[1]-$Sigma1[0]`,$a0 192 add $h,$T1 # T1+=h 193 xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g 194 195 ror \$`$Sigma0[1]-$Sigma0[0]`,$a1 196 xor $e,$a0 197 add $a2,$T1 # T1+=Ch(e,f,g) 198 199 mov $a,$a2 200 add ($Tbl),$T1 # T1+=K[round] 201 xor $a,$a1 202 203 xor $b,$a2 # a^b, b^c in next round 204 ror \$$Sigma1[0],$a0 # Sigma1(e) 205 mov $b,$h 206 207 and $a2,$a3 208 ror \$$Sigma0[0],$a1 # Sigma0(a) 209 add $a0,$T1 # T1+=Sigma1(e) 210 211 xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b) 212 add $T1,$d # d+=T1 213 add $T1,$h # h+=T1 214 215 lea $STRIDE($Tbl),$Tbl # round++ 216___ 217$code.=<<___ if ($i<15); 218 add $a1,$h # h+=Sigma0(a) 219___ 220 ($a2,$a3) = ($a3,$a2); 221} 222 223sub ROUND_16_XX() 224{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; 225 226$code.=<<___; 227 mov `$SZ*(($i+1)&0xf)`(%rsp),$a0 228 mov `$SZ*(($i+14)&0xf)`(%rsp),$a2 229 230 mov $a0,$T1 231 ror \$`$sigma0[1]-$sigma0[0]`,$a0 232 add $a1,$a # modulo-scheduled h+=Sigma0(a) 233 mov $a2,$a1 234 ror \$`$sigma1[1]-$sigma1[0]`,$a2 235 236 xor $T1,$a0 237 shr \$$sigma0[2],$T1 238 ror \$$sigma0[0],$a0 239 xor $a1,$a2 240 shr \$$sigma1[2],$a1 241 242 ror \$$sigma1[0],$a2 243 xor $a0,$T1 # sigma0(X[(i+1)&0xf]) 244 xor $a1,$a2 # sigma1(X[(i+14)&0xf]) 245 add `$SZ*(($i+9)&0xf)`(%rsp),$T1 246 247 add `$SZ*($i&0xf)`(%rsp),$T1 248 mov $e,$a0 249 add $a2,$T1 250 mov $a,$a1 251___ 252 &ROUND_00_15(@_); 253} 254 255$code=<<___; 256.text 257 258.extern OPENSSL_ia32cap_P 259.globl $func 260.type $func,\@function,3 261.align 16 262$func: 263___ 264$code.=<<___ if ($SZ==4 || $avx); 265 lea OPENSSL_ia32cap_P(%rip),%r11 266 mov 0(%r11),%r9d 267 mov 4(%r11),%r10d 268 mov 8(%r11),%r11d 269___ 270$code.=<<___ if ($SZ==4 && $shaext); 271 test \$`1<<29`,%r11d # check for SHA 272 jnz _shaext_shortcut 273___ 274$code.=<<___ if ($avx && $SZ==8); 275 test \$`1<<11`,%r10d # check for XOP 276 jnz .Lxop_shortcut 277___ 278$code.=<<___ if ($avx>1); 279 and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1 280 cmp \$`1<<8|1<<5|1<<3`,%r11d 281 je .Lavx2_shortcut 282___ 283$code.=<<___ if ($avx); 284 and \$`1<<30`,%r9d # mask "Intel CPU" bit 285 and \$`1<<28|1<<9`,%r10d # mask AVX and SSSE3 bits 286 or %r9d,%r10d 287 cmp \$`1<<28|1<<9|1<<30`,%r10d 288 je .Lavx_shortcut 289___ 290$code.=<<___ if ($SZ==4); 291 test \$`1<<9`,%r10d 292 jnz .Lssse3_shortcut 293___ 294$code.=<<___; 295 push %rbx 296 push %rbp 297 push %r12 298 push %r13 299 push %r14 300 push %r15 301 mov %rsp,%r11 # copy %rsp 302 shl \$4,%rdx # num*16 303 sub \$$framesz,%rsp 304 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 305 and \$-64,%rsp # align stack frame 306 mov $ctx,$_ctx # save ctx, 1st arg 307 mov $inp,$_inp # save inp, 2nd arh 308 mov %rdx,$_end # save end pointer, "3rd" arg 309 mov %r11,$_rsp # save copy of %rsp 310.Lprologue: 311 312 mov $SZ*0($ctx),$A 313 mov $SZ*1($ctx),$B 314 mov $SZ*2($ctx),$C 315 mov $SZ*3($ctx),$D 316 mov $SZ*4($ctx),$E 317 mov $SZ*5($ctx),$F 318 mov $SZ*6($ctx),$G 319 mov $SZ*7($ctx),$H 320 jmp .Lloop 321 322.align 16 323.Lloop: 324 mov $B,$a3 325 lea $TABLE(%rip),$Tbl 326 xor $C,$a3 # magic 327___ 328 for($i=0;$i<16;$i++) { 329 $code.=" mov $SZ*$i($inp),$T1\n"; 330 $code.=" mov @ROT[4],$a0\n"; 331 $code.=" mov @ROT[0],$a1\n"; 332 $code.=" bswap $T1\n"; 333 &ROUND_00_15($i,@ROT); 334 unshift(@ROT,pop(@ROT)); 335 } 336$code.=<<___; 337 jmp .Lrounds_16_xx 338.align 16 339.Lrounds_16_xx: 340___ 341 for(;$i<32;$i++) { 342 &ROUND_16_XX($i,@ROT); 343 unshift(@ROT,pop(@ROT)); 344 } 345 346$code.=<<___; 347 cmpb \$0,`$SZ-1`($Tbl) 348 jnz .Lrounds_16_xx 349 350 mov $_ctx,$ctx 351 add $a1,$A # modulo-scheduled h+=Sigma0(a) 352 lea 16*$SZ($inp),$inp 353 354 add $SZ*0($ctx),$A 355 add $SZ*1($ctx),$B 356 add $SZ*2($ctx),$C 357 add $SZ*3($ctx),$D 358 add $SZ*4($ctx),$E 359 add $SZ*5($ctx),$F 360 add $SZ*6($ctx),$G 361 add $SZ*7($ctx),$H 362 363 cmp $_end,$inp 364 365 mov $A,$SZ*0($ctx) 366 mov $B,$SZ*1($ctx) 367 mov $C,$SZ*2($ctx) 368 mov $D,$SZ*3($ctx) 369 mov $E,$SZ*4($ctx) 370 mov $F,$SZ*5($ctx) 371 mov $G,$SZ*6($ctx) 372 mov $H,$SZ*7($ctx) 373 jb .Lloop 374 375 mov $_rsp,%rsi 376 mov (%rsi),%r15 377 mov 8(%rsi),%r14 378 mov 16(%rsi),%r13 379 mov 24(%rsi),%r12 380 mov 32(%rsi),%rbp 381 mov 40(%rsi),%rbx 382 lea 48(%rsi),%rsp 383.Lepilogue: 384 ret 385.size $func,.-$func 386___ 387 388if ($SZ==4) { 389$code.=<<___; 390.align 64 391.type $TABLE,\@object 392$TABLE: 393 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 394 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 395 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 396 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 397 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 398 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 399 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 400 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 401 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc 402 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc 403 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da 404 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da 405 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 406 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 407 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 408 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 409 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 410 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 411 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 412 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 413 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 414 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 415 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 416 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 417 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 418 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 419 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 420 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 421 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 422 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 423 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 424 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 425 426 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f 427 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f 428 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff 429 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff 430 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 431 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 432 .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 433___ 434} else { 435$code.=<<___; 436.align 64 437.type $TABLE,\@object 438$TABLE: 439 .quad 0x428a2f98d728ae22,0x7137449123ef65cd 440 .quad 0x428a2f98d728ae22,0x7137449123ef65cd 441 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc 442 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc 443 .quad 0x3956c25bf348b538,0x59f111f1b605d019 444 .quad 0x3956c25bf348b538,0x59f111f1b605d019 445 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 446 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 447 .quad 0xd807aa98a3030242,0x12835b0145706fbe 448 .quad 0xd807aa98a3030242,0x12835b0145706fbe 449 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 450 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 451 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 452 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 453 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 454 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 455 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 456 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 457 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 458 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 459 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 460 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 461 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 462 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 463 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 464 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 465 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 466 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 467 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 468 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 469 .quad 0x06ca6351e003826f,0x142929670a0e6e70 470 .quad 0x06ca6351e003826f,0x142929670a0e6e70 471 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 472 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 473 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df 474 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df 475 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 476 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 477 .quad 0x81c2c92e47edaee6,0x92722c851482353b 478 .quad 0x81c2c92e47edaee6,0x92722c851482353b 479 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 480 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 481 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 482 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 483 .quad 0xd192e819d6ef5218,0xd69906245565a910 484 .quad 0xd192e819d6ef5218,0xd69906245565a910 485 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 486 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 487 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 488 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 489 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 490 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 491 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb 492 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb 493 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 494 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 495 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 496 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 497 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec 498 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec 499 .quad 0x90befffa23631e28,0xa4506cebde82bde9 500 .quad 0x90befffa23631e28,0xa4506cebde82bde9 501 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b 502 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b 503 .quad 0xca273eceea26619c,0xd186b8c721c0c207 504 .quad 0xca273eceea26619c,0xd186b8c721c0c207 505 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 506 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 507 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 508 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 509 .quad 0x113f9804bef90dae,0x1b710b35131c471b 510 .quad 0x113f9804bef90dae,0x1b710b35131c471b 511 .quad 0x28db77f523047d84,0x32caab7b40c72493 512 .quad 0x28db77f523047d84,0x32caab7b40c72493 513 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c 514 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c 515 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a 516 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a 517 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 518 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 519 520 .quad 0x0001020304050607,0x08090a0b0c0d0e0f 521 .quad 0x0001020304050607,0x08090a0b0c0d0e0f 522 .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 523___ 524} 525 526###################################################################### 527# SIMD code paths 528# 529if ($SZ==4 && $shaext) {{{ 530###################################################################### 531# Intel SHA Extensions implementation of SHA256 update function. 532# 533my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx"); 534 535my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10)); 536my @MSG=map("%xmm$_",(3..6)); 537 538$code.=<<___; 539.type sha256_block_data_order_shaext,\@function,3 540.align 64 541sha256_block_data_order_shaext: 542_shaext_shortcut: 543___ 544$code.=<<___ if ($win64); 545 lea `-8-5*16`(%rsp),%rsp 546 movaps %xmm6,-8-5*16(%rax) 547 movaps %xmm7,-8-4*16(%rax) 548 movaps %xmm8,-8-3*16(%rax) 549 movaps %xmm9,-8-2*16(%rax) 550 movaps %xmm10,-8-1*16(%rax) 551.Lprologue_shaext: 552___ 553$code.=<<___; 554 lea K256+0x80(%rip),$Tbl 555 movdqu ($ctx),$ABEF # DCBA 556 movdqu 16($ctx),$CDGH # HGFE 557 movdqa 0x200-0x80($Tbl),$TMP # byte swap mask 558 559 pshufd \$0x1b,$ABEF,$Wi # ABCD 560 pshufd \$0xb1,$ABEF,$ABEF # CDAB 561 pshufd \$0x1b,$CDGH,$CDGH # EFGH 562 movdqa $TMP,$BSWAP # offload 563 palignr \$8,$CDGH,$ABEF # ABEF 564 punpcklqdq $Wi,$CDGH # CDGH 565 jmp .Loop_shaext 566 567.align 16 568.Loop_shaext: 569 movdqu ($inp),@MSG[0] 570 movdqu 0x10($inp),@MSG[1] 571 movdqu 0x20($inp),@MSG[2] 572 pshufb $TMP,@MSG[0] 573 movdqu 0x30($inp),@MSG[3] 574 575 movdqa 0*32-0x80($Tbl),$Wi 576 paddd @MSG[0],$Wi 577 pshufb $TMP,@MSG[1] 578 movdqa $CDGH,$CDGH_SAVE # offload 579 sha256rnds2 $ABEF,$CDGH # 0-3 580 pshufd \$0x0e,$Wi,$Wi 581 nop 582 movdqa $ABEF,$ABEF_SAVE # offload 583 sha256rnds2 $CDGH,$ABEF 584 585 movdqa 1*32-0x80($Tbl),$Wi 586 paddd @MSG[1],$Wi 587 pshufb $TMP,@MSG[2] 588 sha256rnds2 $ABEF,$CDGH # 4-7 589 pshufd \$0x0e,$Wi,$Wi 590 lea 0x40($inp),$inp 591 sha256msg1 @MSG[1],@MSG[0] 592 sha256rnds2 $CDGH,$ABEF 593 594 movdqa 2*32-0x80($Tbl),$Wi 595 paddd @MSG[2],$Wi 596 pshufb $TMP,@MSG[3] 597 sha256rnds2 $ABEF,$CDGH # 8-11 598 pshufd \$0x0e,$Wi,$Wi 599 movdqa @MSG[3],$TMP 600 palignr \$4,@MSG[2],$TMP 601 nop 602 paddd $TMP,@MSG[0] 603 sha256msg1 @MSG[2],@MSG[1] 604 sha256rnds2 $CDGH,$ABEF 605 606 movdqa 3*32-0x80($Tbl),$Wi 607 paddd @MSG[3],$Wi 608 sha256msg2 @MSG[3],@MSG[0] 609 sha256rnds2 $ABEF,$CDGH # 12-15 610 pshufd \$0x0e,$Wi,$Wi 611 movdqa @MSG[0],$TMP 612 palignr \$4,@MSG[3],$TMP 613 nop 614 paddd $TMP,@MSG[1] 615 sha256msg1 @MSG[3],@MSG[2] 616 sha256rnds2 $CDGH,$ABEF 617___ 618for($i=4;$i<16-3;$i++) { 619$code.=<<___; 620 movdqa $i*32-0x80($Tbl),$Wi 621 paddd @MSG[0],$Wi 622 sha256msg2 @MSG[0],@MSG[1] 623 sha256rnds2 $ABEF,$CDGH # 16-19... 624 pshufd \$0x0e,$Wi,$Wi 625 movdqa @MSG[1],$TMP 626 palignr \$4,@MSG[0],$TMP 627 nop 628 paddd $TMP,@MSG[2] 629 sha256msg1 @MSG[0],@MSG[3] 630 sha256rnds2 $CDGH,$ABEF 631___ 632 push(@MSG,shift(@MSG)); 633} 634$code.=<<___; 635 movdqa 13*32-0x80($Tbl),$Wi 636 paddd @MSG[0],$Wi 637 sha256msg2 @MSG[0],@MSG[1] 638 sha256rnds2 $ABEF,$CDGH # 52-55 639 pshufd \$0x0e,$Wi,$Wi 640 movdqa @MSG[1],$TMP 641 palignr \$4,@MSG[0],$TMP 642 sha256rnds2 $CDGH,$ABEF 643 paddd $TMP,@MSG[2] 644 645 movdqa 14*32-0x80($Tbl),$Wi 646 paddd @MSG[1],$Wi 647 sha256rnds2 $ABEF,$CDGH # 56-59 648 pshufd \$0x0e,$Wi,$Wi 649 sha256msg2 @MSG[1],@MSG[2] 650 movdqa $BSWAP,$TMP 651 sha256rnds2 $CDGH,$ABEF 652 653 movdqa 15*32-0x80($Tbl),$Wi 654 paddd @MSG[2],$Wi 655 nop 656 sha256rnds2 $ABEF,$CDGH # 60-63 657 pshufd \$0x0e,$Wi,$Wi 658 dec $num 659 nop 660 sha256rnds2 $CDGH,$ABEF 661 662 paddd $CDGH_SAVE,$CDGH 663 paddd $ABEF_SAVE,$ABEF 664 jnz .Loop_shaext 665 666 pshufd \$0xb1,$CDGH,$CDGH # DCHG 667 pshufd \$0x1b,$ABEF,$TMP # FEBA 668 pshufd \$0xb1,$ABEF,$ABEF # BAFE 669 punpckhqdq $CDGH,$ABEF # DCBA 670 palignr \$8,$TMP,$CDGH # HGFE 671 672 movdqu $ABEF,($ctx) 673 movdqu $CDGH,16($ctx) 674___ 675$code.=<<___ if ($win64); 676 movaps -8-5*16(%rax),%xmm6 677 movaps -8-4*16(%rax),%xmm7 678 movaps -8-3*16(%rax),%xmm8 679 movaps -8-2*16(%rax),%xmm9 680 movaps -8-1*16(%rax),%xmm10 681 mov %rax,%rsp 682.Lepilogue_shaext: 683___ 684$code.=<<___; 685 ret 686.size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext 687___ 688}}} 689{{{ 690 691my $a4=$T1; 692my ($a,$b,$c,$d,$e,$f,$g,$h); 693 694sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm 695{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; 696 my $arg = pop; 697 $arg = "\$$arg" if ($arg*1 eq $arg); 698 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; 699} 700 701sub body_00_15 () { 702 ( 703 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. 704 705 '&ror ($a0,$Sigma1[2]-$Sigma1[1])', 706 '&mov ($a,$a1)', 707 '&mov ($a4,$f)', 708 709 '&ror ($a1,$Sigma0[2]-$Sigma0[1])', 710 '&xor ($a0,$e)', 711 '&xor ($a4,$g)', # f^g 712 713 '&ror ($a0,$Sigma1[1]-$Sigma1[0])', 714 '&xor ($a1,$a)', 715 '&and ($a4,$e)', # (f^g)&e 716 717 '&xor ($a0,$e)', 718 '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i] 719 '&mov ($a2,$a)', 720 721 '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g 722 '&ror ($a1,$Sigma0[1]-$Sigma0[0])', 723 '&xor ($a2,$b)', # a^b, b^c in next round 724 725 '&add ($h,$a4)', # h+=Ch(e,f,g) 726 '&ror ($a0,$Sigma1[0])', # Sigma1(e) 727 '&and ($a3,$a2)', # (b^c)&(a^b) 728 729 '&xor ($a1,$a)', 730 '&add ($h,$a0)', # h+=Sigma1(e) 731 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) 732 733 '&ror ($a1,$Sigma0[0])', # Sigma0(a) 734 '&add ($d,$h)', # d+=h 735 '&add ($h,$a3)', # h+=Maj(a,b,c) 736 737 '&mov ($a0,$d)', 738 '&add ($a1,$h);'. # h+=Sigma0(a) 739 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' 740 ); 741} 742 743###################################################################### 744# SSSE3 code path 745# 746if ($SZ==4) { # SHA256 only 747my @X = map("%xmm$_",(0..3)); 748my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); 749 750$code.=<<___; 751.type ${func}_ssse3,\@function,3 752.align 64 753${func}_ssse3: 754.Lssse3_shortcut: 755 push %rbx 756 push %rbp 757 push %r12 758 push %r13 759 push %r14 760 push %r15 761 mov %rsp,%r11 # copy %rsp 762 shl \$4,%rdx # num*16 763 sub \$`$framesz+$win64*16*4`,%rsp 764 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 765 and \$-64,%rsp # align stack frame 766 mov $ctx,$_ctx # save ctx, 1st arg 767 mov $inp,$_inp # save inp, 2nd arh 768 mov %rdx,$_end # save end pointer, "3rd" arg 769 mov %r11,$_rsp # save copy of %rsp 770___ 771$code.=<<___ if ($win64); 772 movaps %xmm6,16*$SZ+32(%rsp) 773 movaps %xmm7,16*$SZ+48(%rsp) 774 movaps %xmm8,16*$SZ+64(%rsp) 775 movaps %xmm9,16*$SZ+80(%rsp) 776___ 777$code.=<<___; 778.Lprologue_ssse3: 779 780 mov $SZ*0($ctx),$A 781 mov $SZ*1($ctx),$B 782 mov $SZ*2($ctx),$C 783 mov $SZ*3($ctx),$D 784 mov $SZ*4($ctx),$E 785 mov $SZ*5($ctx),$F 786 mov $SZ*6($ctx),$G 787 mov $SZ*7($ctx),$H 788___ 789 790$code.=<<___; 791 #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 792 #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 793 jmp .Lloop_ssse3 794.align 16 795.Lloop_ssse3: 796 movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 797 movdqu 0x00($inp),@X[0] 798 movdqu 0x10($inp),@X[1] 799 movdqu 0x20($inp),@X[2] 800 pshufb $t3,@X[0] 801 movdqu 0x30($inp),@X[3] 802 lea $TABLE(%rip),$Tbl 803 pshufb $t3,@X[1] 804 movdqa 0x00($Tbl),$t0 805 movdqa 0x20($Tbl),$t1 806 pshufb $t3,@X[2] 807 paddd @X[0],$t0 808 movdqa 0x40($Tbl),$t2 809 pshufb $t3,@X[3] 810 movdqa 0x60($Tbl),$t3 811 paddd @X[1],$t1 812 paddd @X[2],$t2 813 paddd @X[3],$t3 814 movdqa $t0,0x00(%rsp) 815 mov $A,$a1 816 movdqa $t1,0x10(%rsp) 817 mov $B,$a3 818 movdqa $t2,0x20(%rsp) 819 xor $C,$a3 # magic 820 movdqa $t3,0x30(%rsp) 821 mov $E,$a0 822 jmp .Lssse3_00_47 823 824.align 16 825.Lssse3_00_47: 826 sub \$`-16*2*$SZ`,$Tbl # size optimization 827___ 828sub Xupdate_256_SSSE3 () { 829 ( 830 '&movdqa ($t0,@X[1]);', 831 '&movdqa ($t3,@X[3])', 832 '&palignr ($t0,@X[0],$SZ)', # X[1..4] 833 '&palignr ($t3,@X[2],$SZ);', # X[9..12] 834 '&movdqa ($t1,$t0)', 835 '&movdqa ($t2,$t0);', 836 '&psrld ($t0,$sigma0[2])', 837 '&paddd (@X[0],$t3);', # X[0..3] += X[9..12] 838 '&psrld ($t2,$sigma0[0])', 839 '&pshufd ($t3,@X[3],0b11111010)',# X[14..15] 840 '&pslld ($t1,8*$SZ-$sigma0[1]);'. 841 '&pxor ($t0,$t2)', 842 '&psrld ($t2,$sigma0[1]-$sigma0[0]);'. 843 '&pxor ($t0,$t1)', 844 '&pslld ($t1,$sigma0[1]-$sigma0[0]);'. 845 '&pxor ($t0,$t2);', 846 '&movdqa ($t2,$t3)', 847 '&pxor ($t0,$t1);', # sigma0(X[1..4]) 848 '&psrld ($t3,$sigma1[2])', 849 '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4]) 850 '&psrlq ($t2,$sigma1[0])', 851 '&pxor ($t3,$t2);', 852 '&psrlq ($t2,$sigma1[1]-$sigma1[0])', 853 '&pxor ($t3,$t2)', 854 '&pshufb ($t3,$t4)', # sigma1(X[14..15]) 855 '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) 856 '&pshufd ($t3,@X[0],0b01010000)',# X[16..17] 857 '&movdqa ($t2,$t3);', 858 '&psrld ($t3,$sigma1[2])', 859 '&psrlq ($t2,$sigma1[0])', 860 '&pxor ($t3,$t2);', 861 '&psrlq ($t2,$sigma1[1]-$sigma1[0])', 862 '&pxor ($t3,$t2);', 863 '&movdqa ($t2,16*2*$j."($Tbl)")', 864 '&pshufb ($t3,$t5)', 865 '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17]) 866 ); 867} 868 869sub SSSE3_256_00_47 () { 870my $j = shift; 871my $body = shift; 872my @X = @_; 873my @insns = (&$body,&$body,&$body,&$body); # 104 instructions 874 875 if (0) { 876 foreach (Xupdate_256_SSSE3()) { # 36 instructions 877 eval; 878 eval(shift(@insns)); 879 eval(shift(@insns)); 880 eval(shift(@insns)); 881 } 882 } else { # squeeze extra 4% on Westmere and 19% on Atom 883 eval(shift(@insns)); #@ 884 &movdqa ($t0,@X[1]); 885 eval(shift(@insns)); 886 eval(shift(@insns)); 887 &movdqa ($t3,@X[3]); 888 eval(shift(@insns)); #@ 889 eval(shift(@insns)); 890 eval(shift(@insns)); 891 eval(shift(@insns)); #@ 892 eval(shift(@insns)); 893 &palignr ($t0,@X[0],$SZ); # X[1..4] 894 eval(shift(@insns)); 895 eval(shift(@insns)); 896 &palignr ($t3,@X[2],$SZ); # X[9..12] 897 eval(shift(@insns)); 898 eval(shift(@insns)); 899 eval(shift(@insns)); 900 eval(shift(@insns)); #@ 901 &movdqa ($t1,$t0); 902 eval(shift(@insns)); 903 eval(shift(@insns)); 904 &movdqa ($t2,$t0); 905 eval(shift(@insns)); #@ 906 eval(shift(@insns)); 907 &psrld ($t0,$sigma0[2]); 908 eval(shift(@insns)); 909 eval(shift(@insns)); 910 eval(shift(@insns)); 911 &paddd (@X[0],$t3); # X[0..3] += X[9..12] 912 eval(shift(@insns)); #@ 913 eval(shift(@insns)); 914 &psrld ($t2,$sigma0[0]); 915 eval(shift(@insns)); 916 eval(shift(@insns)); 917 &pshufd ($t3,@X[3],0b11111010); # X[4..15] 918 eval(shift(@insns)); 919 eval(shift(@insns)); #@ 920 &pslld ($t1,8*$SZ-$sigma0[1]); 921 eval(shift(@insns)); 922 eval(shift(@insns)); 923 &pxor ($t0,$t2); 924 eval(shift(@insns)); #@ 925 eval(shift(@insns)); 926 eval(shift(@insns)); 927 eval(shift(@insns)); #@ 928 &psrld ($t2,$sigma0[1]-$sigma0[0]); 929 eval(shift(@insns)); 930 &pxor ($t0,$t1); 931 eval(shift(@insns)); 932 eval(shift(@insns)); 933 &pslld ($t1,$sigma0[1]-$sigma0[0]); 934 eval(shift(@insns)); 935 eval(shift(@insns)); 936 &pxor ($t0,$t2); 937 eval(shift(@insns)); 938 eval(shift(@insns)); #@ 939 &movdqa ($t2,$t3); 940 eval(shift(@insns)); 941 eval(shift(@insns)); 942 &pxor ($t0,$t1); # sigma0(X[1..4]) 943 eval(shift(@insns)); #@ 944 eval(shift(@insns)); 945 eval(shift(@insns)); 946 &psrld ($t3,$sigma1[2]); 947 eval(shift(@insns)); 948 eval(shift(@insns)); 949 &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4]) 950 eval(shift(@insns)); #@ 951 eval(shift(@insns)); 952 &psrlq ($t2,$sigma1[0]); 953 eval(shift(@insns)); 954 eval(shift(@insns)); 955 eval(shift(@insns)); 956 &pxor ($t3,$t2); 957 eval(shift(@insns)); #@ 958 eval(shift(@insns)); 959 eval(shift(@insns)); 960 eval(shift(@insns)); #@ 961 &psrlq ($t2,$sigma1[1]-$sigma1[0]); 962 eval(shift(@insns)); 963 eval(shift(@insns)); 964 &pxor ($t3,$t2); 965 eval(shift(@insns)); #@ 966 eval(shift(@insns)); 967 eval(shift(@insns)); 968 #&pshufb ($t3,$t4); # sigma1(X[14..15]) 969 &pshufd ($t3,$t3,0b10000000); 970 eval(shift(@insns)); 971 eval(shift(@insns)); 972 eval(shift(@insns)); 973 &psrldq ($t3,8); 974 eval(shift(@insns)); 975 eval(shift(@insns)); #@ 976 eval(shift(@insns)); 977 eval(shift(@insns)); 978 eval(shift(@insns)); #@ 979 &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15]) 980 eval(shift(@insns)); 981 eval(shift(@insns)); 982 eval(shift(@insns)); 983 &pshufd ($t3,@X[0],0b01010000); # X[16..17] 984 eval(shift(@insns)); 985 eval(shift(@insns)); #@ 986 eval(shift(@insns)); 987 &movdqa ($t2,$t3); 988 eval(shift(@insns)); 989 eval(shift(@insns)); 990 &psrld ($t3,$sigma1[2]); 991 eval(shift(@insns)); 992 eval(shift(@insns)); #@ 993 &psrlq ($t2,$sigma1[0]); 994 eval(shift(@insns)); 995 eval(shift(@insns)); 996 &pxor ($t3,$t2); 997 eval(shift(@insns)); #@ 998 eval(shift(@insns)); 999 eval(shift(@insns)); 1000 eval(shift(@insns)); #@ 1001 eval(shift(@insns)); 1002 &psrlq ($t2,$sigma1[1]-$sigma1[0]); 1003 eval(shift(@insns)); 1004 eval(shift(@insns)); 1005 eval(shift(@insns)); 1006 &pxor ($t3,$t2); 1007 eval(shift(@insns)); 1008 eval(shift(@insns)); 1009 eval(shift(@insns)); #@ 1010 #&pshufb ($t3,$t5); 1011 &pshufd ($t3,$t3,0b00001000); 1012 eval(shift(@insns)); 1013 eval(shift(@insns)); 1014 &movdqa ($t2,16*2*$j."($Tbl)"); 1015 eval(shift(@insns)); #@ 1016 eval(shift(@insns)); 1017 &pslldq ($t3,8); 1018 eval(shift(@insns)); 1019 eval(shift(@insns)); 1020 eval(shift(@insns)); 1021 &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) 1022 eval(shift(@insns)); #@ 1023 eval(shift(@insns)); 1024 eval(shift(@insns)); 1025 } 1026 &paddd ($t2,@X[0]); 1027 foreach (@insns) { eval; } # remaining instructions 1028 &movdqa (16*$j."(%rsp)",$t2); 1029} 1030 1031 for ($i=0,$j=0; $j<4; $j++) { 1032 &SSSE3_256_00_47($j,\&body_00_15,@X); 1033 push(@X,shift(@X)); # rotate(@X) 1034 } 1035 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 1036 &jne (".Lssse3_00_47"); 1037 1038 for ($i=0; $i<16; ) { 1039 foreach(body_00_15()) { eval; } 1040 } 1041$code.=<<___; 1042 mov $_ctx,$ctx 1043 mov $a1,$A 1044 1045 add $SZ*0($ctx),$A 1046 lea 16*$SZ($inp),$inp 1047 add $SZ*1($ctx),$B 1048 add $SZ*2($ctx),$C 1049 add $SZ*3($ctx),$D 1050 add $SZ*4($ctx),$E 1051 add $SZ*5($ctx),$F 1052 add $SZ*6($ctx),$G 1053 add $SZ*7($ctx),$H 1054 1055 cmp $_end,$inp 1056 1057 mov $A,$SZ*0($ctx) 1058 mov $B,$SZ*1($ctx) 1059 mov $C,$SZ*2($ctx) 1060 mov $D,$SZ*3($ctx) 1061 mov $E,$SZ*4($ctx) 1062 mov $F,$SZ*5($ctx) 1063 mov $G,$SZ*6($ctx) 1064 mov $H,$SZ*7($ctx) 1065 jb .Lloop_ssse3 1066 1067 mov $_rsp,%rsi 1068___ 1069$code.=<<___ if ($win64); 1070 movaps 16*$SZ+32(%rsp),%xmm6 1071 movaps 16*$SZ+48(%rsp),%xmm7 1072 movaps 16*$SZ+64(%rsp),%xmm8 1073 movaps 16*$SZ+80(%rsp),%xmm9 1074___ 1075$code.=<<___; 1076 mov (%rsi),%r15 1077 mov 8(%rsi),%r14 1078 mov 16(%rsi),%r13 1079 mov 24(%rsi),%r12 1080 mov 32(%rsi),%rbp 1081 mov 40(%rsi),%rbx 1082 lea 48(%rsi),%rsp 1083.Lepilogue_ssse3: 1084 ret 1085.size ${func}_ssse3,.-${func}_ssse3 1086___ 1087} 1088 1089if ($avx) {{ 1090###################################################################### 1091# XOP code path 1092# 1093if ($SZ==8) { # SHA512 only 1094$code.=<<___; 1095.type ${func}_xop,\@function,3 1096.align 64 1097${func}_xop: 1098.Lxop_shortcut: 1099 push %rbx 1100 push %rbp 1101 push %r12 1102 push %r13 1103 push %r14 1104 push %r15 1105 mov %rsp,%r11 # copy %rsp 1106 shl \$4,%rdx # num*16 1107 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp 1108 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 1109 and \$-64,%rsp # align stack frame 1110 mov $ctx,$_ctx # save ctx, 1st arg 1111 mov $inp,$_inp # save inp, 2nd arh 1112 mov %rdx,$_end # save end pointer, "3rd" arg 1113 mov %r11,$_rsp # save copy of %rsp 1114___ 1115$code.=<<___ if ($win64); 1116 movaps %xmm6,16*$SZ+32(%rsp) 1117 movaps %xmm7,16*$SZ+48(%rsp) 1118 movaps %xmm8,16*$SZ+64(%rsp) 1119 movaps %xmm9,16*$SZ+80(%rsp) 1120___ 1121$code.=<<___ if ($win64 && $SZ>4); 1122 movaps %xmm10,16*$SZ+96(%rsp) 1123 movaps %xmm11,16*$SZ+112(%rsp) 1124___ 1125$code.=<<___; 1126.Lprologue_xop: 1127 1128 vzeroupper 1129 mov $SZ*0($ctx),$A 1130 mov $SZ*1($ctx),$B 1131 mov $SZ*2($ctx),$C 1132 mov $SZ*3($ctx),$D 1133 mov $SZ*4($ctx),$E 1134 mov $SZ*5($ctx),$F 1135 mov $SZ*6($ctx),$G 1136 mov $SZ*7($ctx),$H 1137 jmp .Lloop_xop 1138___ 1139 if ($SZ==4) { # SHA256 1140 my @X = map("%xmm$_",(0..3)); 1141 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); 1142 1143$code.=<<___; 1144.align 16 1145.Lloop_xop: 1146 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1147 vmovdqu 0x00($inp),@X[0] 1148 vmovdqu 0x10($inp),@X[1] 1149 vmovdqu 0x20($inp),@X[2] 1150 vmovdqu 0x30($inp),@X[3] 1151 vpshufb $t3,@X[0],@X[0] 1152 lea $TABLE(%rip),$Tbl 1153 vpshufb $t3,@X[1],@X[1] 1154 vpshufb $t3,@X[2],@X[2] 1155 vpaddd 0x00($Tbl),@X[0],$t0 1156 vpshufb $t3,@X[3],@X[3] 1157 vpaddd 0x20($Tbl),@X[1],$t1 1158 vpaddd 0x40($Tbl),@X[2],$t2 1159 vpaddd 0x60($Tbl),@X[3],$t3 1160 vmovdqa $t0,0x00(%rsp) 1161 mov $A,$a1 1162 vmovdqa $t1,0x10(%rsp) 1163 mov $B,$a3 1164 vmovdqa $t2,0x20(%rsp) 1165 xor $C,$a3 # magic 1166 vmovdqa $t3,0x30(%rsp) 1167 mov $E,$a0 1168 jmp .Lxop_00_47 1169 1170.align 16 1171.Lxop_00_47: 1172 sub \$`-16*2*$SZ`,$Tbl # size optimization 1173___ 1174sub XOP_256_00_47 () { 1175my $j = shift; 1176my $body = shift; 1177my @X = @_; 1178my @insns = (&$body,&$body,&$body,&$body); # 104 instructions 1179 1180 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4] 1181 eval(shift(@insns)); 1182 eval(shift(@insns)); 1183 &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12] 1184 eval(shift(@insns)); 1185 eval(shift(@insns)); 1186 &vprotd ($t1,$t0,8*$SZ-$sigma0[1]); 1187 eval(shift(@insns)); 1188 eval(shift(@insns)); 1189 &vpsrld ($t0,$t0,$sigma0[2]); 1190 eval(shift(@insns)); 1191 eval(shift(@insns)); 1192 &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12] 1193 eval(shift(@insns)); 1194 eval(shift(@insns)); 1195 eval(shift(@insns)); 1196 eval(shift(@insns)); 1197 &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]); 1198 eval(shift(@insns)); 1199 eval(shift(@insns)); 1200 &vpxor ($t0,$t0,$t1); 1201 eval(shift(@insns)); 1202 eval(shift(@insns)); 1203 eval(shift(@insns)); 1204 eval(shift(@insns)); 1205 &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]); 1206 eval(shift(@insns)); 1207 eval(shift(@insns)); 1208 &vpxor ($t0,$t0,$t2); # sigma0(X[1..4]) 1209 eval(shift(@insns)); 1210 eval(shift(@insns)); 1211 &vpsrld ($t2,@X[3],$sigma1[2]); 1212 eval(shift(@insns)); 1213 eval(shift(@insns)); 1214 &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4]) 1215 eval(shift(@insns)); 1216 eval(shift(@insns)); 1217 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); 1218 eval(shift(@insns)); 1219 eval(shift(@insns)); 1220 &vpxor ($t3,$t3,$t2); 1221 eval(shift(@insns)); 1222 eval(shift(@insns)); 1223 eval(shift(@insns)); 1224 eval(shift(@insns)); 1225 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) 1226 eval(shift(@insns)); 1227 eval(shift(@insns)); 1228 eval(shift(@insns)); 1229 eval(shift(@insns)); 1230 &vpsrldq ($t3,$t3,8); 1231 eval(shift(@insns)); 1232 eval(shift(@insns)); 1233 eval(shift(@insns)); 1234 eval(shift(@insns)); 1235 &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) 1236 eval(shift(@insns)); 1237 eval(shift(@insns)); 1238 eval(shift(@insns)); 1239 eval(shift(@insns)); 1240 &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]); 1241 eval(shift(@insns)); 1242 eval(shift(@insns)); 1243 &vpsrld ($t2,@X[0],$sigma1[2]); 1244 eval(shift(@insns)); 1245 eval(shift(@insns)); 1246 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); 1247 eval(shift(@insns)); 1248 eval(shift(@insns)); 1249 &vpxor ($t3,$t3,$t2); 1250 eval(shift(@insns)); 1251 eval(shift(@insns)); 1252 eval(shift(@insns)); 1253 eval(shift(@insns)); 1254 &vpxor ($t3,$t3,$t1); # sigma1(X[16..17]) 1255 eval(shift(@insns)); 1256 eval(shift(@insns)); 1257 eval(shift(@insns)); 1258 eval(shift(@insns)); 1259 &vpslldq ($t3,$t3,8); # 22 instructions 1260 eval(shift(@insns)); 1261 eval(shift(@insns)); 1262 eval(shift(@insns)); 1263 eval(shift(@insns)); 1264 &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17]) 1265 eval(shift(@insns)); 1266 eval(shift(@insns)); 1267 eval(shift(@insns)); 1268 eval(shift(@insns)); 1269 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 1270 foreach (@insns) { eval; } # remaining instructions 1271 &vmovdqa (16*$j."(%rsp)",$t2); 1272} 1273 1274 for ($i=0,$j=0; $j<4; $j++) { 1275 &XOP_256_00_47($j,\&body_00_15,@X); 1276 push(@X,shift(@X)); # rotate(@X) 1277 } 1278 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 1279 &jne (".Lxop_00_47"); 1280 1281 for ($i=0; $i<16; ) { 1282 foreach(body_00_15()) { eval; } 1283 } 1284 1285 } else { # SHA512 1286 my @X = map("%xmm$_",(0..7)); 1287 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11)); 1288 1289$code.=<<___; 1290.align 16 1291.Lloop_xop: 1292 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1293 vmovdqu 0x00($inp),@X[0] 1294 lea $TABLE+0x80(%rip),$Tbl # size optimization 1295 vmovdqu 0x10($inp),@X[1] 1296 vmovdqu 0x20($inp),@X[2] 1297 vpshufb $t3,@X[0],@X[0] 1298 vmovdqu 0x30($inp),@X[3] 1299 vpshufb $t3,@X[1],@X[1] 1300 vmovdqu 0x40($inp),@X[4] 1301 vpshufb $t3,@X[2],@X[2] 1302 vmovdqu 0x50($inp),@X[5] 1303 vpshufb $t3,@X[3],@X[3] 1304 vmovdqu 0x60($inp),@X[6] 1305 vpshufb $t3,@X[4],@X[4] 1306 vmovdqu 0x70($inp),@X[7] 1307 vpshufb $t3,@X[5],@X[5] 1308 vpaddq -0x80($Tbl),@X[0],$t0 1309 vpshufb $t3,@X[6],@X[6] 1310 vpaddq -0x60($Tbl),@X[1],$t1 1311 vpshufb $t3,@X[7],@X[7] 1312 vpaddq -0x40($Tbl),@X[2],$t2 1313 vpaddq -0x20($Tbl),@X[3],$t3 1314 vmovdqa $t0,0x00(%rsp) 1315 vpaddq 0x00($Tbl),@X[4],$t0 1316 vmovdqa $t1,0x10(%rsp) 1317 vpaddq 0x20($Tbl),@X[5],$t1 1318 vmovdqa $t2,0x20(%rsp) 1319 vpaddq 0x40($Tbl),@X[6],$t2 1320 vmovdqa $t3,0x30(%rsp) 1321 vpaddq 0x60($Tbl),@X[7],$t3 1322 vmovdqa $t0,0x40(%rsp) 1323 mov $A,$a1 1324 vmovdqa $t1,0x50(%rsp) 1325 mov $B,$a3 1326 vmovdqa $t2,0x60(%rsp) 1327 xor $C,$a3 # magic 1328 vmovdqa $t3,0x70(%rsp) 1329 mov $E,$a0 1330 jmp .Lxop_00_47 1331 1332.align 16 1333.Lxop_00_47: 1334 add \$`16*2*$SZ`,$Tbl 1335___ 1336sub XOP_512_00_47 () { 1337my $j = shift; 1338my $body = shift; 1339my @X = @_; 1340my @insns = (&$body,&$body); # 52 instructions 1341 1342 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..2] 1343 eval(shift(@insns)); 1344 eval(shift(@insns)); 1345 &vpalignr ($t3,@X[5],@X[4],$SZ); # X[9..10] 1346 eval(shift(@insns)); 1347 eval(shift(@insns)); 1348 &vprotq ($t1,$t0,8*$SZ-$sigma0[1]); 1349 eval(shift(@insns)); 1350 eval(shift(@insns)); 1351 &vpsrlq ($t0,$t0,$sigma0[2]); 1352 eval(shift(@insns)); 1353 eval(shift(@insns)); 1354 &vpaddq (@X[0],@X[0],$t3); # X[0..1] += X[9..10] 1355 eval(shift(@insns)); 1356 eval(shift(@insns)); 1357 eval(shift(@insns)); 1358 eval(shift(@insns)); 1359 &vprotq ($t2,$t1,$sigma0[1]-$sigma0[0]); 1360 eval(shift(@insns)); 1361 eval(shift(@insns)); 1362 &vpxor ($t0,$t0,$t1); 1363 eval(shift(@insns)); 1364 eval(shift(@insns)); 1365 eval(shift(@insns)); 1366 eval(shift(@insns)); 1367 &vprotq ($t3,@X[7],8*$SZ-$sigma1[1]); 1368 eval(shift(@insns)); 1369 eval(shift(@insns)); 1370 &vpxor ($t0,$t0,$t2); # sigma0(X[1..2]) 1371 eval(shift(@insns)); 1372 eval(shift(@insns)); 1373 &vpsrlq ($t2,@X[7],$sigma1[2]); 1374 eval(shift(@insns)); 1375 eval(shift(@insns)); 1376 &vpaddq (@X[0],@X[0],$t0); # X[0..1] += sigma0(X[1..2]) 1377 eval(shift(@insns)); 1378 eval(shift(@insns)); 1379 &vprotq ($t1,$t3,$sigma1[1]-$sigma1[0]); 1380 eval(shift(@insns)); 1381 eval(shift(@insns)); 1382 &vpxor ($t3,$t3,$t2); 1383 eval(shift(@insns)); 1384 eval(shift(@insns)); 1385 eval(shift(@insns)); 1386 eval(shift(@insns)); 1387 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) 1388 eval(shift(@insns)); 1389 eval(shift(@insns)); 1390 eval(shift(@insns)); 1391 eval(shift(@insns)); 1392 &vpaddq (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) 1393 eval(shift(@insns)); 1394 eval(shift(@insns)); 1395 eval(shift(@insns)); 1396 eval(shift(@insns)); 1397 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); 1398 foreach (@insns) { eval; } # remaining instructions 1399 &vmovdqa (16*$j."(%rsp)",$t2); 1400} 1401 1402 for ($i=0,$j=0; $j<8; $j++) { 1403 &XOP_512_00_47($j,\&body_00_15,@X); 1404 push(@X,shift(@X)); # rotate(@X) 1405 } 1406 &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); 1407 &jne (".Lxop_00_47"); 1408 1409 for ($i=0; $i<16; ) { 1410 foreach(body_00_15()) { eval; } 1411 } 1412} 1413$code.=<<___; 1414 mov $_ctx,$ctx 1415 mov $a1,$A 1416 1417 add $SZ*0($ctx),$A 1418 lea 16*$SZ($inp),$inp 1419 add $SZ*1($ctx),$B 1420 add $SZ*2($ctx),$C 1421 add $SZ*3($ctx),$D 1422 add $SZ*4($ctx),$E 1423 add $SZ*5($ctx),$F 1424 add $SZ*6($ctx),$G 1425 add $SZ*7($ctx),$H 1426 1427 cmp $_end,$inp 1428 1429 mov $A,$SZ*0($ctx) 1430 mov $B,$SZ*1($ctx) 1431 mov $C,$SZ*2($ctx) 1432 mov $D,$SZ*3($ctx) 1433 mov $E,$SZ*4($ctx) 1434 mov $F,$SZ*5($ctx) 1435 mov $G,$SZ*6($ctx) 1436 mov $H,$SZ*7($ctx) 1437 jb .Lloop_xop 1438 1439 mov $_rsp,%rsi 1440 vzeroupper 1441___ 1442$code.=<<___ if ($win64); 1443 movaps 16*$SZ+32(%rsp),%xmm6 1444 movaps 16*$SZ+48(%rsp),%xmm7 1445 movaps 16*$SZ+64(%rsp),%xmm8 1446 movaps 16*$SZ+80(%rsp),%xmm9 1447___ 1448$code.=<<___ if ($win64 && $SZ>4); 1449 movaps 16*$SZ+96(%rsp),%xmm10 1450 movaps 16*$SZ+112(%rsp),%xmm11 1451___ 1452$code.=<<___; 1453 mov (%rsi),%r15 1454 mov 8(%rsi),%r14 1455 mov 16(%rsi),%r13 1456 mov 24(%rsi),%r12 1457 mov 32(%rsi),%rbp 1458 mov 40(%rsi),%rbx 1459 lea 48(%rsi),%rsp 1460.Lepilogue_xop: 1461 ret 1462.size ${func}_xop,.-${func}_xop 1463___ 1464} 1465###################################################################### 1466# AVX+shrd code path 1467# 1468local *ror = sub { &shrd(@_[0],@_) }; 1469 1470$code.=<<___; 1471.type ${func}_avx,\@function,3 1472.align 64 1473${func}_avx: 1474.Lavx_shortcut: 1475 push %rbx 1476 push %rbp 1477 push %r12 1478 push %r13 1479 push %r14 1480 push %r15 1481 mov %rsp,%r11 # copy %rsp 1482 shl \$4,%rdx # num*16 1483 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp 1484 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 1485 and \$-64,%rsp # align stack frame 1486 mov $ctx,$_ctx # save ctx, 1st arg 1487 mov $inp,$_inp # save inp, 2nd arh 1488 mov %rdx,$_end # save end pointer, "3rd" arg 1489 mov %r11,$_rsp # save copy of %rsp 1490___ 1491$code.=<<___ if ($win64); 1492 movaps %xmm6,16*$SZ+32(%rsp) 1493 movaps %xmm7,16*$SZ+48(%rsp) 1494 movaps %xmm8,16*$SZ+64(%rsp) 1495 movaps %xmm9,16*$SZ+80(%rsp) 1496___ 1497$code.=<<___ if ($win64 && $SZ>4); 1498 movaps %xmm10,16*$SZ+96(%rsp) 1499 movaps %xmm11,16*$SZ+112(%rsp) 1500___ 1501$code.=<<___; 1502.Lprologue_avx: 1503 1504 vzeroupper 1505 mov $SZ*0($ctx),$A 1506 mov $SZ*1($ctx),$B 1507 mov $SZ*2($ctx),$C 1508 mov $SZ*3($ctx),$D 1509 mov $SZ*4($ctx),$E 1510 mov $SZ*5($ctx),$F 1511 mov $SZ*6($ctx),$G 1512 mov $SZ*7($ctx),$H 1513___ 1514 if ($SZ==4) { # SHA256 1515 my @X = map("%xmm$_",(0..3)); 1516 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); 1517 1518$code.=<<___; 1519 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 1520 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 1521 jmp .Lloop_avx 1522.align 16 1523.Lloop_avx: 1524 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1525 vmovdqu 0x00($inp),@X[0] 1526 vmovdqu 0x10($inp),@X[1] 1527 vmovdqu 0x20($inp),@X[2] 1528 vmovdqu 0x30($inp),@X[3] 1529 vpshufb $t3,@X[0],@X[0] 1530 lea $TABLE(%rip),$Tbl 1531 vpshufb $t3,@X[1],@X[1] 1532 vpshufb $t3,@X[2],@X[2] 1533 vpaddd 0x00($Tbl),@X[0],$t0 1534 vpshufb $t3,@X[3],@X[3] 1535 vpaddd 0x20($Tbl),@X[1],$t1 1536 vpaddd 0x40($Tbl),@X[2],$t2 1537 vpaddd 0x60($Tbl),@X[3],$t3 1538 vmovdqa $t0,0x00(%rsp) 1539 mov $A,$a1 1540 vmovdqa $t1,0x10(%rsp) 1541 mov $B,$a3 1542 vmovdqa $t2,0x20(%rsp) 1543 xor $C,$a3 # magic 1544 vmovdqa $t3,0x30(%rsp) 1545 mov $E,$a0 1546 jmp .Lavx_00_47 1547 1548.align 16 1549.Lavx_00_47: 1550 sub \$`-16*2*$SZ`,$Tbl # size optimization 1551___ 1552sub Xupdate_256_AVX () { 1553 ( 1554 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4] 1555 '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12] 1556 '&vpsrld ($t2,$t0,$sigma0[0]);', 1557 '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12] 1558 '&vpsrld ($t3,$t0,$sigma0[2])', 1559 '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);', 1560 '&vpxor ($t0,$t3,$t2)', 1561 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15] 1562 '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);', 1563 '&vpxor ($t0,$t0,$t1)', 1564 '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);', 1565 '&vpxor ($t0,$t0,$t2)', 1566 '&vpsrld ($t2,$t3,$sigma1[2]);', 1567 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4]) 1568 '&vpsrlq ($t3,$t3,$sigma1[0]);', 1569 '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4]) 1570 '&vpxor ($t2,$t2,$t3);', 1571 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', 1572 '&vpxor ($t2,$t2,$t3)', 1573 '&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15]) 1574 '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15]) 1575 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17] 1576 '&vpsrld ($t2,$t3,$sigma1[2])', 1577 '&vpsrlq ($t3,$t3,$sigma1[0])', 1578 '&vpxor ($t2,$t2,$t3);', 1579 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', 1580 '&vpxor ($t2,$t2,$t3)', 1581 '&vpshufb ($t2,$t2,$t5)', 1582 '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17]) 1583 ); 1584} 1585 1586sub AVX_256_00_47 () { 1587my $j = shift; 1588my $body = shift; 1589my @X = @_; 1590my @insns = (&$body,&$body,&$body,&$body); # 104 instructions 1591 1592 foreach (Xupdate_256_AVX()) { # 29 instructions 1593 eval; 1594 eval(shift(@insns)); 1595 eval(shift(@insns)); 1596 eval(shift(@insns)); 1597 } 1598 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 1599 foreach (@insns) { eval; } # remaining instructions 1600 &vmovdqa (16*$j."(%rsp)",$t2); 1601} 1602 1603 for ($i=0,$j=0; $j<4; $j++) { 1604 &AVX_256_00_47($j,\&body_00_15,@X); 1605 push(@X,shift(@X)); # rotate(@X) 1606 } 1607 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 1608 &jne (".Lavx_00_47"); 1609 1610 for ($i=0; $i<16; ) { 1611 foreach(body_00_15()) { eval; } 1612 } 1613 1614 } else { # SHA512 1615 my @X = map("%xmm$_",(0..7)); 1616 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11)); 1617 1618$code.=<<___; 1619 jmp .Lloop_avx 1620.align 16 1621.Lloop_avx: 1622 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1623 vmovdqu 0x00($inp),@X[0] 1624 lea $TABLE+0x80(%rip),$Tbl # size optimization 1625 vmovdqu 0x10($inp),@X[1] 1626 vmovdqu 0x20($inp),@X[2] 1627 vpshufb $t3,@X[0],@X[0] 1628 vmovdqu 0x30($inp),@X[3] 1629 vpshufb $t3,@X[1],@X[1] 1630 vmovdqu 0x40($inp),@X[4] 1631 vpshufb $t3,@X[2],@X[2] 1632 vmovdqu 0x50($inp),@X[5] 1633 vpshufb $t3,@X[3],@X[3] 1634 vmovdqu 0x60($inp),@X[6] 1635 vpshufb $t3,@X[4],@X[4] 1636 vmovdqu 0x70($inp),@X[7] 1637 vpshufb $t3,@X[5],@X[5] 1638 vpaddq -0x80($Tbl),@X[0],$t0 1639 vpshufb $t3,@X[6],@X[6] 1640 vpaddq -0x60($Tbl),@X[1],$t1 1641 vpshufb $t3,@X[7],@X[7] 1642 vpaddq -0x40($Tbl),@X[2],$t2 1643 vpaddq -0x20($Tbl),@X[3],$t3 1644 vmovdqa $t0,0x00(%rsp) 1645 vpaddq 0x00($Tbl),@X[4],$t0 1646 vmovdqa $t1,0x10(%rsp) 1647 vpaddq 0x20($Tbl),@X[5],$t1 1648 vmovdqa $t2,0x20(%rsp) 1649 vpaddq 0x40($Tbl),@X[6],$t2 1650 vmovdqa $t3,0x30(%rsp) 1651 vpaddq 0x60($Tbl),@X[7],$t3 1652 vmovdqa $t0,0x40(%rsp) 1653 mov $A,$a1 1654 vmovdqa $t1,0x50(%rsp) 1655 mov $B,$a3 1656 vmovdqa $t2,0x60(%rsp) 1657 xor $C,$a3 # magic 1658 vmovdqa $t3,0x70(%rsp) 1659 mov $E,$a0 1660 jmp .Lavx_00_47 1661 1662.align 16 1663.Lavx_00_47: 1664 add \$`16*2*$SZ`,$Tbl 1665___ 1666sub Xupdate_512_AVX () { 1667 ( 1668 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2] 1669 '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10] 1670 '&vpsrlq ($t2,$t0,$sigma0[0])', 1671 '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10] 1672 '&vpsrlq ($t3,$t0,$sigma0[2])', 1673 '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);', 1674 '&vpxor ($t0,$t3,$t2)', 1675 '&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);', 1676 '&vpxor ($t0,$t0,$t1)', 1677 '&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);', 1678 '&vpxor ($t0,$t0,$t2)', 1679 '&vpsrlq ($t3,@X[7],$sigma1[2]);', 1680 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2]) 1681 '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);', 1682 '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2]) 1683 '&vpsrlq ($t1,@X[7],$sigma1[0]);', 1684 '&vpxor ($t3,$t3,$t2)', 1685 '&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);', 1686 '&vpxor ($t3,$t3,$t1)', 1687 '&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);', 1688 '&vpxor ($t3,$t3,$t2)', 1689 '&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15]) 1690 '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) 1691 ); 1692} 1693 1694sub AVX_512_00_47 () { 1695my $j = shift; 1696my $body = shift; 1697my @X = @_; 1698my @insns = (&$body,&$body); # 52 instructions 1699 1700 foreach (Xupdate_512_AVX()) { # 23 instructions 1701 eval; 1702 eval(shift(@insns)); 1703 eval(shift(@insns)); 1704 } 1705 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); 1706 foreach (@insns) { eval; } # remaining instructions 1707 &vmovdqa (16*$j."(%rsp)",$t2); 1708} 1709 1710 for ($i=0,$j=0; $j<8; $j++) { 1711 &AVX_512_00_47($j,\&body_00_15,@X); 1712 push(@X,shift(@X)); # rotate(@X) 1713 } 1714 &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); 1715 &jne (".Lavx_00_47"); 1716 1717 for ($i=0; $i<16; ) { 1718 foreach(body_00_15()) { eval; } 1719 } 1720} 1721$code.=<<___; 1722 mov $_ctx,$ctx 1723 mov $a1,$A 1724 1725 add $SZ*0($ctx),$A 1726 lea 16*$SZ($inp),$inp 1727 add $SZ*1($ctx),$B 1728 add $SZ*2($ctx),$C 1729 add $SZ*3($ctx),$D 1730 add $SZ*4($ctx),$E 1731 add $SZ*5($ctx),$F 1732 add $SZ*6($ctx),$G 1733 add $SZ*7($ctx),$H 1734 1735 cmp $_end,$inp 1736 1737 mov $A,$SZ*0($ctx) 1738 mov $B,$SZ*1($ctx) 1739 mov $C,$SZ*2($ctx) 1740 mov $D,$SZ*3($ctx) 1741 mov $E,$SZ*4($ctx) 1742 mov $F,$SZ*5($ctx) 1743 mov $G,$SZ*6($ctx) 1744 mov $H,$SZ*7($ctx) 1745 jb .Lloop_avx 1746 1747 mov $_rsp,%rsi 1748 vzeroupper 1749___ 1750$code.=<<___ if ($win64); 1751 movaps 16*$SZ+32(%rsp),%xmm6 1752 movaps 16*$SZ+48(%rsp),%xmm7 1753 movaps 16*$SZ+64(%rsp),%xmm8 1754 movaps 16*$SZ+80(%rsp),%xmm9 1755___ 1756$code.=<<___ if ($win64 && $SZ>4); 1757 movaps 16*$SZ+96(%rsp),%xmm10 1758 movaps 16*$SZ+112(%rsp),%xmm11 1759___ 1760$code.=<<___; 1761 mov (%rsi),%r15 1762 mov 8(%rsi),%r14 1763 mov 16(%rsi),%r13 1764 mov 24(%rsi),%r12 1765 mov 32(%rsi),%rbp 1766 mov 40(%rsi),%rbx 1767 lea 48(%rsi),%rsp 1768.Lepilogue_avx: 1769 ret 1770.size ${func}_avx,.-${func}_avx 1771___ 1772 1773if ($avx>1) {{ 1774###################################################################### 1775# AVX2+BMI code path 1776# 1777my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp 1778my $PUSH8=8*2*$SZ; 1779use integer; 1780 1781sub bodyx_00_15 () { 1782 # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f 1783 ( 1784 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. 1785 1786 '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i] 1787 '&and ($a4,$e)', # f&e 1788 '&rorx ($a0,$e,$Sigma1[2])', 1789 '&rorx ($a2,$e,$Sigma1[1])', 1790 1791 '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past 1792 '&lea ($h,"($h,$a4)")', 1793 '&andn ($a4,$e,$g)', # ~e&g 1794 '&xor ($a0,$a2)', 1795 1796 '&rorx ($a1,$e,$Sigma1[0])', 1797 '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g) 1798 '&xor ($a0,$a1)', # Sigma1(e) 1799 '&mov ($a2,$a)', 1800 1801 '&rorx ($a4,$a,$Sigma0[2])', 1802 '&lea ($h,"($h,$a0)")', # h+=Sigma1(e) 1803 '&xor ($a2,$b)', # a^b, b^c in next round 1804 '&rorx ($a1,$a,$Sigma0[1])', 1805 1806 '&rorx ($a0,$a,$Sigma0[0])', 1807 '&lea ($d,"($d,$h)")', # d+=h 1808 '&and ($a3,$a2)', # (b^c)&(a^b) 1809 '&xor ($a1,$a4)', 1810 1811 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) 1812 '&xor ($a1,$a0)', # Sigma0(a) 1813 '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c) 1814 '&mov ($a4,$e)', # copy of f in future 1815 1816 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' 1817 ); 1818 # and at the finish one has to $a+=$a1 1819} 1820 1821$code.=<<___; 1822.type ${func}_avx2,\@function,3 1823.align 64 1824${func}_avx2: 1825.Lavx2_shortcut: 1826 push %rbx 1827 push %rbp 1828 push %r12 1829 push %r13 1830 push %r14 1831 push %r15 1832 mov %rsp,%r11 # copy %rsp 1833 sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp 1834 shl \$4,%rdx # num*16 1835 and \$-256*$SZ,%rsp # align stack frame 1836 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 1837 add \$`2*$SZ*($rounds-8)`,%rsp 1838 mov $ctx,$_ctx # save ctx, 1st arg 1839 mov $inp,$_inp # save inp, 2nd arh 1840 mov %rdx,$_end # save end pointer, "3rd" arg 1841 mov %r11,$_rsp # save copy of %rsp 1842___ 1843$code.=<<___ if ($win64); 1844 movaps %xmm6,16*$SZ+32(%rsp) 1845 movaps %xmm7,16*$SZ+48(%rsp) 1846 movaps %xmm8,16*$SZ+64(%rsp) 1847 movaps %xmm9,16*$SZ+80(%rsp) 1848___ 1849$code.=<<___ if ($win64 && $SZ>4); 1850 movaps %xmm10,16*$SZ+96(%rsp) 1851 movaps %xmm11,16*$SZ+112(%rsp) 1852___ 1853$code.=<<___; 1854.Lprologue_avx2: 1855 1856 vzeroupper 1857 sub \$-16*$SZ,$inp # inp++, size optimization 1858 mov $SZ*0($ctx),$A 1859 mov $inp,%r12 # borrow $T1 1860 mov $SZ*1($ctx),$B 1861 cmp %rdx,$inp # $_end 1862 mov $SZ*2($ctx),$C 1863 cmove %rsp,%r12 # next block or random data 1864 mov $SZ*3($ctx),$D 1865 mov $SZ*4($ctx),$E 1866 mov $SZ*5($ctx),$F 1867 mov $SZ*6($ctx),$G 1868 mov $SZ*7($ctx),$H 1869___ 1870 if ($SZ==4) { # SHA256 1871 my @X = map("%ymm$_",(0..3)); 1872 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9)); 1873 1874$code.=<<___; 1875 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 1876 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 1877 jmp .Loop_avx2 1878.align 16 1879.Loop_avx2: 1880 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1881 vmovdqu -16*$SZ+0($inp),%xmm0 1882 vmovdqu -16*$SZ+16($inp),%xmm1 1883 vmovdqu -16*$SZ+32($inp),%xmm2 1884 vmovdqu -16*$SZ+48($inp),%xmm3 1885 #mov $inp,$_inp # offload $inp 1886 vinserti128 \$1,(%r12),@X[0],@X[0] 1887 vinserti128 \$1,16(%r12),@X[1],@X[1] 1888 vpshufb $t3,@X[0],@X[0] 1889 vinserti128 \$1,32(%r12),@X[2],@X[2] 1890 vpshufb $t3,@X[1],@X[1] 1891 vinserti128 \$1,48(%r12),@X[3],@X[3] 1892 1893 lea $TABLE(%rip),$Tbl 1894 vpshufb $t3,@X[2],@X[2] 1895 vpaddd 0x00($Tbl),@X[0],$t0 1896 vpshufb $t3,@X[3],@X[3] 1897 vpaddd 0x20($Tbl),@X[1],$t1 1898 vpaddd 0x40($Tbl),@X[2],$t2 1899 vpaddd 0x60($Tbl),@X[3],$t3 1900 vmovdqa $t0,0x00(%rsp) 1901 xor $a1,$a1 1902 vmovdqa $t1,0x20(%rsp) 1903 lea -$PUSH8(%rsp),%rsp 1904 mov $B,$a3 1905 vmovdqa $t2,0x00(%rsp) 1906 xor $C,$a3 # magic 1907 vmovdqa $t3,0x20(%rsp) 1908 mov $F,$a4 1909 sub \$-16*2*$SZ,$Tbl # size optimization 1910 jmp .Lavx2_00_47 1911 1912.align 16 1913.Lavx2_00_47: 1914___ 1915 1916sub AVX2_256_00_47 () { 1917my $j = shift; 1918my $body = shift; 1919my @X = @_; 1920my @insns = (&$body,&$body,&$body,&$body); # 96 instructions 1921my $base = "+2*$PUSH8(%rsp)"; 1922 1923 &lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0); 1924 foreach (Xupdate_256_AVX()) { # 29 instructions 1925 eval; 1926 eval(shift(@insns)); 1927 eval(shift(@insns)); 1928 eval(shift(@insns)); 1929 } 1930 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 1931 foreach (@insns) { eval; } # remaining instructions 1932 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); 1933} 1934 1935 for ($i=0,$j=0; $j<4; $j++) { 1936 &AVX2_256_00_47($j,\&bodyx_00_15,@X); 1937 push(@X,shift(@X)); # rotate(@X) 1938 } 1939 &lea ($Tbl,16*2*$SZ."($Tbl)"); 1940 &cmpb (($SZ-1)."($Tbl)",0); 1941 &jne (".Lavx2_00_47"); 1942 1943 for ($i=0; $i<16; ) { 1944 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; 1945 foreach(bodyx_00_15()) { eval; } 1946 } 1947 } else { # SHA512 1948 my @X = map("%ymm$_",(0..7)); 1949 my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11)); 1950 1951$code.=<<___; 1952 jmp .Loop_avx2 1953.align 16 1954.Loop_avx2: 1955 vmovdqu -16*$SZ($inp),%xmm0 1956 vmovdqu -16*$SZ+16($inp),%xmm1 1957 vmovdqu -16*$SZ+32($inp),%xmm2 1958 lea $TABLE+0x80(%rip),$Tbl # size optimization 1959 vmovdqu -16*$SZ+48($inp),%xmm3 1960 vmovdqu -16*$SZ+64($inp),%xmm4 1961 vmovdqu -16*$SZ+80($inp),%xmm5 1962 vmovdqu -16*$SZ+96($inp),%xmm6 1963 vmovdqu -16*$SZ+112($inp),%xmm7 1964 #mov $inp,$_inp # offload $inp 1965 vmovdqa `$SZ*2*$rounds-0x80`($Tbl),$t2 1966 vinserti128 \$1,(%r12),@X[0],@X[0] 1967 vinserti128 \$1,16(%r12),@X[1],@X[1] 1968 vpshufb $t2,@X[0],@X[0] 1969 vinserti128 \$1,32(%r12),@X[2],@X[2] 1970 vpshufb $t2,@X[1],@X[1] 1971 vinserti128 \$1,48(%r12),@X[3],@X[3] 1972 vpshufb $t2,@X[2],@X[2] 1973 vinserti128 \$1,64(%r12),@X[4],@X[4] 1974 vpshufb $t2,@X[3],@X[3] 1975 vinserti128 \$1,80(%r12),@X[5],@X[5] 1976 vpshufb $t2,@X[4],@X[4] 1977 vinserti128 \$1,96(%r12),@X[6],@X[6] 1978 vpshufb $t2,@X[5],@X[5] 1979 vinserti128 \$1,112(%r12),@X[7],@X[7] 1980 1981 vpaddq -0x80($Tbl),@X[0],$t0 1982 vpshufb $t2,@X[6],@X[6] 1983 vpaddq -0x60($Tbl),@X[1],$t1 1984 vpshufb $t2,@X[7],@X[7] 1985 vpaddq -0x40($Tbl),@X[2],$t2 1986 vpaddq -0x20($Tbl),@X[3],$t3 1987 vmovdqa $t0,0x00(%rsp) 1988 vpaddq 0x00($Tbl),@X[4],$t0 1989 vmovdqa $t1,0x20(%rsp) 1990 vpaddq 0x20($Tbl),@X[5],$t1 1991 vmovdqa $t2,0x40(%rsp) 1992 vpaddq 0x40($Tbl),@X[6],$t2 1993 vmovdqa $t3,0x60(%rsp) 1994 lea -$PUSH8(%rsp),%rsp 1995 vpaddq 0x60($Tbl),@X[7],$t3 1996 vmovdqa $t0,0x00(%rsp) 1997 xor $a1,$a1 1998 vmovdqa $t1,0x20(%rsp) 1999 mov $B,$a3 2000 vmovdqa $t2,0x40(%rsp) 2001 xor $C,$a3 # magic 2002 vmovdqa $t3,0x60(%rsp) 2003 mov $F,$a4 2004 add \$16*2*$SZ,$Tbl 2005 jmp .Lavx2_00_47 2006 2007.align 16 2008.Lavx2_00_47: 2009___ 2010 2011sub AVX2_512_00_47 () { 2012my $j = shift; 2013my $body = shift; 2014my @X = @_; 2015my @insns = (&$body,&$body); # 48 instructions 2016my $base = "+2*$PUSH8(%rsp)"; 2017 2018 &lea ("%rsp","-$PUSH8(%rsp)") if (($j%4)==0); 2019 foreach (Xupdate_512_AVX()) { # 23 instructions 2020 eval; 2021 if ($_ !~ /\;$/) { 2022 eval(shift(@insns)); 2023 eval(shift(@insns)); 2024 eval(shift(@insns)); 2025 } 2026 } 2027 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); 2028 foreach (@insns) { eval; } # remaining instructions 2029 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); 2030} 2031 2032 for ($i=0,$j=0; $j<8; $j++) { 2033 &AVX2_512_00_47($j,\&bodyx_00_15,@X); 2034 push(@X,shift(@X)); # rotate(@X) 2035 } 2036 &lea ($Tbl,16*2*$SZ."($Tbl)"); 2037 &cmpb (($SZ-1-0x80)."($Tbl)",0); 2038 &jne (".Lavx2_00_47"); 2039 2040 for ($i=0; $i<16; ) { 2041 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; 2042 foreach(bodyx_00_15()) { eval; } 2043 } 2044} 2045$code.=<<___; 2046 mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx 2047 add $a1,$A 2048 #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp 2049 lea `2*$SZ*($rounds-8)`(%rsp),$Tbl 2050 2051 add $SZ*0($ctx),$A 2052 add $SZ*1($ctx),$B 2053 add $SZ*2($ctx),$C 2054 add $SZ*3($ctx),$D 2055 add $SZ*4($ctx),$E 2056 add $SZ*5($ctx),$F 2057 add $SZ*6($ctx),$G 2058 add $SZ*7($ctx),$H 2059 2060 mov $A,$SZ*0($ctx) 2061 mov $B,$SZ*1($ctx) 2062 mov $C,$SZ*2($ctx) 2063 mov $D,$SZ*3($ctx) 2064 mov $E,$SZ*4($ctx) 2065 mov $F,$SZ*5($ctx) 2066 mov $G,$SZ*6($ctx) 2067 mov $H,$SZ*7($ctx) 2068 2069 cmp `$PUSH8+2*8`($Tbl),$inp # $_end 2070 je .Ldone_avx2 2071 2072 xor $a1,$a1 2073 mov $B,$a3 2074 xor $C,$a3 # magic 2075 mov $F,$a4 2076 jmp .Lower_avx2 2077.align 16 2078.Lower_avx2: 2079___ 2080 for ($i=0; $i<8; ) { 2081 my $base="+16($Tbl)"; 2082 foreach(bodyx_00_15()) { eval; } 2083 } 2084$code.=<<___; 2085 lea -$PUSH8($Tbl),$Tbl 2086 cmp %rsp,$Tbl 2087 jae .Lower_avx2 2088 2089 mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx 2090 add $a1,$A 2091 #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp 2092 lea `2*$SZ*($rounds-8)`(%rsp),%rsp 2093 2094 add $SZ*0($ctx),$A 2095 add $SZ*1($ctx),$B 2096 add $SZ*2($ctx),$C 2097 add $SZ*3($ctx),$D 2098 add $SZ*4($ctx),$E 2099 add $SZ*5($ctx),$F 2100 lea `2*16*$SZ`($inp),$inp # inp+=2 2101 add $SZ*6($ctx),$G 2102 mov $inp,%r12 2103 add $SZ*7($ctx),$H 2104 cmp $_end,$inp 2105 2106 mov $A,$SZ*0($ctx) 2107 cmove %rsp,%r12 # next block or stale data 2108 mov $B,$SZ*1($ctx) 2109 mov $C,$SZ*2($ctx) 2110 mov $D,$SZ*3($ctx) 2111 mov $E,$SZ*4($ctx) 2112 mov $F,$SZ*5($ctx) 2113 mov $G,$SZ*6($ctx) 2114 mov $H,$SZ*7($ctx) 2115 2116 jbe .Loop_avx2 2117 lea (%rsp),$Tbl 2118 2119.Ldone_avx2: 2120 lea ($Tbl),%rsp 2121 mov $_rsp,%rsi 2122 vzeroupper 2123___ 2124$code.=<<___ if ($win64); 2125 movaps 16*$SZ+32(%rsp),%xmm6 2126 movaps 16*$SZ+48(%rsp),%xmm7 2127 movaps 16*$SZ+64(%rsp),%xmm8 2128 movaps 16*$SZ+80(%rsp),%xmm9 2129___ 2130$code.=<<___ if ($win64 && $SZ>4); 2131 movaps 16*$SZ+96(%rsp),%xmm10 2132 movaps 16*$SZ+112(%rsp),%xmm11 2133___ 2134$code.=<<___; 2135 mov (%rsi),%r15 2136 mov 8(%rsi),%r14 2137 mov 16(%rsi),%r13 2138 mov 24(%rsi),%r12 2139 mov 32(%rsi),%rbp 2140 mov 40(%rsi),%rbx 2141 lea 48(%rsi),%rsp 2142.Lepilogue_avx2: 2143 ret 2144.size ${func}_avx2,.-${func}_avx2 2145___ 2146}} 2147}}}}} 2148 2149# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, 2150# CONTEXT *context,DISPATCHER_CONTEXT *disp) 2151if ($win64) { 2152$rec="%rcx"; 2153$frame="%rdx"; 2154$context="%r8"; 2155$disp="%r9"; 2156 2157$code.=<<___; 2158.extern __imp_RtlVirtualUnwind 2159.type se_handler,\@abi-omnipotent 2160.align 16 2161se_handler: 2162 push %rsi 2163 push %rdi 2164 push %rbx 2165 push %rbp 2166 push %r12 2167 push %r13 2168 push %r14 2169 push %r15 2170 pushfq 2171 sub \$64,%rsp 2172 2173 mov 120($context),%rax # pull context->Rax 2174 mov 248($context),%rbx # pull context->Rip 2175 2176 mov 8($disp),%rsi # disp->ImageBase 2177 mov 56($disp),%r11 # disp->HanderlData 2178 2179 mov 0(%r11),%r10d # HandlerData[0] 2180 lea (%rsi,%r10),%r10 # prologue label 2181 cmp %r10,%rbx # context->Rip<prologue label 2182 jb .Lin_prologue 2183 2184 mov 152($context),%rax # pull context->Rsp 2185 2186 mov 4(%r11),%r10d # HandlerData[1] 2187 lea (%rsi,%r10),%r10 # epilogue label 2188 cmp %r10,%rbx # context->Rip>=epilogue label 2189 jae .Lin_prologue 2190___ 2191$code.=<<___ if ($avx>1); 2192 lea .Lavx2_shortcut(%rip),%r10 2193 cmp %r10,%rbx # context->Rip<avx2_shortcut 2194 jb .Lnot_in_avx2 2195 2196 and \$-256*$SZ,%rax 2197 add \$`2*$SZ*($rounds-8)`,%rax 2198.Lnot_in_avx2: 2199___ 2200$code.=<<___; 2201 mov %rax,%rsi # put aside Rsp 2202 mov 16*$SZ+3*8(%rax),%rax # pull $_rsp 2203 lea 48(%rax),%rax 2204 2205 mov -8(%rax),%rbx 2206 mov -16(%rax),%rbp 2207 mov -24(%rax),%r12 2208 mov -32(%rax),%r13 2209 mov -40(%rax),%r14 2210 mov -48(%rax),%r15 2211 mov %rbx,144($context) # restore context->Rbx 2212 mov %rbp,160($context) # restore context->Rbp 2213 mov %r12,216($context) # restore context->R12 2214 mov %r13,224($context) # restore context->R13 2215 mov %r14,232($context) # restore context->R14 2216 mov %r15,240($context) # restore context->R15 2217 2218 lea .Lepilogue(%rip),%r10 2219 cmp %r10,%rbx 2220 jb .Lin_prologue # non-AVX code 2221 2222 lea 16*$SZ+4*8(%rsi),%rsi # Xmm6- save area 2223 lea 512($context),%rdi # &context.Xmm6 2224 mov \$`$SZ==4?8:12`,%ecx 2225 .long 0xa548f3fc # cld; rep movsq 2226 2227.Lin_prologue: 2228 mov 8(%rax),%rdi 2229 mov 16(%rax),%rsi 2230 mov %rax,152($context) # restore context->Rsp 2231 mov %rsi,168($context) # restore context->Rsi 2232 mov %rdi,176($context) # restore context->Rdi 2233 2234 mov 40($disp),%rdi # disp->ContextRecord 2235 mov $context,%rsi # context 2236 mov \$154,%ecx # sizeof(CONTEXT) 2237 .long 0xa548f3fc # cld; rep movsq 2238 2239 mov $disp,%rsi 2240 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER 2241 mov 8(%rsi),%rdx # arg2, disp->ImageBase 2242 mov 0(%rsi),%r8 # arg3, disp->ControlPc 2243 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry 2244 mov 40(%rsi),%r10 # disp->ContextRecord 2245 lea 56(%rsi),%r11 # &disp->HandlerData 2246 lea 24(%rsi),%r12 # &disp->EstablisherFrame 2247 mov %r10,32(%rsp) # arg5 2248 mov %r11,40(%rsp) # arg6 2249 mov %r12,48(%rsp) # arg7 2250 mov %rcx,56(%rsp) # arg8, (NULL) 2251 call *__imp_RtlVirtualUnwind(%rip) 2252 2253 mov \$1,%eax # ExceptionContinueSearch 2254 add \$64,%rsp 2255 popfq 2256 pop %r15 2257 pop %r14 2258 pop %r13 2259 pop %r12 2260 pop %rbp 2261 pop %rbx 2262 pop %rdi 2263 pop %rsi 2264 ret 2265.size se_handler,.-se_handler 2266___ 2267 2268$code.=<<___ if ($SZ==4 && $shaext); 2269.type shaext_handler,\@abi-omnipotent 2270.align 16 2271shaext_handler: 2272 push %rsi 2273 push %rdi 2274 push %rbx 2275 push %rbp 2276 push %r12 2277 push %r13 2278 push %r14 2279 push %r15 2280 pushfq 2281 sub \$64,%rsp 2282 2283 mov 120($context),%rax # pull context->Rax 2284 mov 248($context),%rbx # pull context->Rip 2285 2286 lea .Lprologue_shaext(%rip),%r10 2287 cmp %r10,%rbx # context->Rip<.Lprologue 2288 jb .Lin_prologue 2289 2290 lea .Lepilogue_shaext(%rip),%r10 2291 cmp %r10,%rbx # context->Rip>=.Lepilogue 2292 jae .Lin_prologue 2293 2294 lea -8-5*16(%rax),%rsi 2295 lea 512($context),%rdi # &context.Xmm6 2296 mov \$10,%ecx 2297 .long 0xa548f3fc # cld; rep movsq 2298 2299 jmp .Lin_prologue 2300.size shaext_handler,.-shaext_handler 2301___ 2302 2303$code.=<<___; 2304.section .pdata 2305.align 4 2306 .rva .LSEH_begin_$func 2307 .rva .LSEH_end_$func 2308 .rva .LSEH_info_$func 2309___ 2310$code.=<<___ if ($SZ==4 && $shaext); 2311 .rva .LSEH_begin_${func}_shaext 2312 .rva .LSEH_end_${func}_shaext 2313 .rva .LSEH_info_${func}_shaext 2314___ 2315$code.=<<___ if ($SZ==4); 2316 .rva .LSEH_begin_${func}_ssse3 2317 .rva .LSEH_end_${func}_ssse3 2318 .rva .LSEH_info_${func}_ssse3 2319___ 2320$code.=<<___ if ($avx && $SZ==8); 2321 .rva .LSEH_begin_${func}_xop 2322 .rva .LSEH_end_${func}_xop 2323 .rva .LSEH_info_${func}_xop 2324___ 2325$code.=<<___ if ($avx); 2326 .rva .LSEH_begin_${func}_avx 2327 .rva .LSEH_end_${func}_avx 2328 .rva .LSEH_info_${func}_avx 2329___ 2330$code.=<<___ if ($avx>1); 2331 .rva .LSEH_begin_${func}_avx2 2332 .rva .LSEH_end_${func}_avx2 2333 .rva .LSEH_info_${func}_avx2 2334___ 2335$code.=<<___; 2336.section .xdata 2337.align 8 2338.LSEH_info_$func: 2339 .byte 9,0,0,0 2340 .rva se_handler 2341 .rva .Lprologue,.Lepilogue # HandlerData[] 2342___ 2343$code.=<<___ if ($SZ==4 && $shaext); 2344.LSEH_info_${func}_shaext: 2345 .byte 9,0,0,0 2346 .rva shaext_handler 2347___ 2348$code.=<<___ if ($SZ==4); 2349.LSEH_info_${func}_ssse3: 2350 .byte 9,0,0,0 2351 .rva se_handler 2352 .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[] 2353___ 2354$code.=<<___ if ($avx && $SZ==8); 2355.LSEH_info_${func}_xop: 2356 .byte 9,0,0,0 2357 .rva se_handler 2358 .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[] 2359___ 2360$code.=<<___ if ($avx); 2361.LSEH_info_${func}_avx: 2362 .byte 9,0,0,0 2363 .rva se_handler 2364 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] 2365___ 2366$code.=<<___ if ($avx>1); 2367.LSEH_info_${func}_avx2: 2368 .byte 9,0,0,0 2369 .rva se_handler 2370 .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[] 2371___ 2372} 2373 2374sub sha256op38 { 2375 my $instr = shift; 2376 my %opcodelet = ( 2377 "sha256rnds2" => 0xcb, 2378 "sha256msg1" => 0xcc, 2379 "sha256msg2" => 0xcd ); 2380 2381 if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) { 2382 my @opcode=(0x0f,0x38); 2383 push @opcode,$opcodelet{$instr}; 2384 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M 2385 return ".byte\t".join(',',@opcode); 2386 } else { 2387 return $instr."\t".@_[0]; 2388 } 2389} 2390 2391foreach (split("\n",$code)) { 2392 s/\`([^\`]*)\`/eval $1/geo; 2393 2394 s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo; 2395 2396 print $_,"\n"; 2397} 2398close STDOUT; 2399