#! /usr/bin/env perl
# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# ====================================================================
#
# sha256/512_block procedure for x86_64.
#
# 40% improvement over compiler-generated code on Opteron. On EM64T
# sha256 was observed to run >80% faster and sha512 - >40%. No magical
# tricks, just straight implementation... I really wonder why gcc
# [being armed with inline assembler] fails to generate as fast code.
# The only thing which is cool about this module is that it's the very
# same instruction sequence that is used for both SHA-256 and SHA-512.
# In the former case the instructions operate on 32-bit operands, while
# in the latter on 64-bit ones. All I had to do was get one flavor
# right; the other one passed the test right away:-)
#
# sha256_block runs in ~1005 cycles on Opteron, which gives you
# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
# frequency in GHz. sha512_block runs in ~1275 cycles, which results
# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
# Well, if you compare it to the IA-64 implementation, which maintains
# X[16] in the register bank[!], tends to 4 instructions per CPU clock
# cycle and runs in 1003 cycles, 1275 is a very good result for the
# 3-way issue Opteron pipeline with X[16] maintained in memory. So *if*
# there is a way to improve it, *then* the only way would be to try to
# offload X[16] updates to the SSE unit, but that would require "deeper"
# loop unroll, which in turn would naturally cause size blow-up, not
# to mention increased complexity! And once again, only *if* it's
# actually possible to noticeably improve overall ILP, instruction
# level parallelism, on a given CPU implementation in this case.
#
# Special note on Intel EM64T. While the Opteron CPU exhibits a perfect
# performance ratio of 1.5 between the 64- and 32-bit flavors [see
# above], [currently available] EM64T CPUs apparently are far from it.
# On the contrary, the 64-bit version, sha512_block, is ~30% *slower*
# than the 32-bit sha256_block:-( This is presumably because 64-bit
# shifts/rotates apparently are not atomic instructions, but are
# implemented in microcode.
#
# May 2012.
#
# Optimization including one of Pavel Semjanov's ideas, alternative
# Maj, resulted in >=5% improvement on most CPUs, +20% SHA256 and
# unfortunately -2% SHA512 on P4 [which nobody should care about
# that much].
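#
# [Editorial note on the "alternative Maj": the round code below never
# forms Maj(a,b,c) = (a AND b) XOR (a AND c) XOR (b AND c) directly.
# It relies on the identity Maj(a,b,c) = Ch(a^b,c,b): when a == b the
# majority is a itself, and when a != b it is decided by c, which is
# exactly the choice Ch makes.  Each round therefore carries b^c over
# from the previous round in a spare register, ANDs it with the freshly
# computed a^b and XORs the result into b, i.e.
# Maj(a,b,c) = ((a^b) & (b^c)) ^ b; see the "Maj(a,b,c)=Ch(a^b,c,b)"
# comments in the round bodies.]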
#
# June 2012.
#
# Add SIMD code paths, see below for improvement coefficients. The SSSE3
# code path was not attempted for SHA512, because the improvement was not
# estimated to be high enough, noticeably less than 9%, to justify the
# effort, at least not on pre-AVX processors. [Obviously with the
# exclusion of VIA Nano, but it has a SHA512 instruction that is faster
# and should be used instead.] For reference, the corresponding estimated
# upper limit for improvement for SSSE3 SHA256 is 28%. The fact that
# higher coefficients are observed on VIA Nano and Bulldozer has more
# to do with specifics of their architecture [which is a topic for
# separate discussion].
#
# November 2012.
#
# Add AVX2 code path. Two consecutive input blocks are loaded into
# 256-bit %ymm registers, with data from the first block in the least
# significant 128-bit halves and data from the second in the most
# significant ones. The data is then processed with the same SIMD
# instruction sequence as for AVX, but with %ymm as operands. The side
# effect is an increased stack frame, 448 additional bytes in SHA256 and
# 1152 in SHA512, and a 1.2KB code size increase.
#
# March 2014.
#
# Add support for Intel SHA Extensions.

######################################################################
# Current performance in cycles per processed byte (less is better):
#
#                 SHA256  SSSE3       AVX/XOP(*)       SHA512  AVX/XOP(*)
#
# AMD K8          14.9    -           -                 9.57   -
# P4              17.3    -           -                30.8    -
# Core 2          15.6    13.8(+13%)  -                 9.97   -
# Westmere        14.8    12.3(+19%)  -                 9.58   -
# Sandy Bridge    17.4    14.2(+23%)  11.6(+50%(**))   11.2    8.10(+38%(**))
# Ivy Bridge      12.6    10.5(+20%)  10.3(+22%)        8.17   7.22(+13%)
# Haswell         12.2    9.28(+31%)  7.80(+56%)        7.66   5.40(+42%)
# Skylake         11.4    9.03(+26%)  7.70(+48%)        7.25   5.20(+40%)
# Bulldozer       21.1    13.6(+54%)  13.6(+54%(***))  13.5    8.58(+57%)
# Ryzen           11.0    9.02(+22%)  2.05(+440%)       7.05   5.67(+20%)
# VIA Nano        23.0    16.5(+39%)  -                14.7    -
# Atom            23.0    18.9(+22%)  -                14.7    -
# Silvermont      27.4    20.6(+33%)  -                17.5    -
# Knights Landing 27.4    21.0(+30%)  19.6(+40%)       17.5    12.8(+37%)
# Goldmont        18.9    14.3(+32%)  4.16(+350%)      12.0    -
#
# (*)	whichever best applicable, including SHAEXT;
# (**)	the switch from ror to shrd accounts for a fair share of the
#	improvement;
# (***)	execution time is fully determined by the remaining integer-only
#	part, body_00_15; reducing the number of SIMD instructions below
#	a certain limit makes no difference/sense; to conserve space the
#	SHA256 XOP code path is therefore omitted;

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
#
# TODO(davidben): Enable AVX2 code after testing by setting $avx to 2. Is it
# necessary to disable AVX2 code when SHA Extensions code is disabled? Upstream
# did not tie them together until after $shaext was added.
$avx = 1;
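# [Editorial note: like the other perlasm modules, this script is normally
# driven by the build system rather than run by hand.  A typical manual
# invocation (the file names here are only illustrative) would be
#
#	perl sha512-x86_64.pl elf sha512-x86_64.S
#	perl sha512-x86_64.pl elf sha256-x86_64.S
#
# where the first argument is the perlasm "flavour" (elf, macosx,
# mingw64, nasm, ...) and the second is the output file.  Whether
# SHA-256 or SHA-512 code is emitted is decided solely by matching
# /512/ against the output file name, see the $output check below.]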

# TODO(davidben): Consider enabling the Intel SHA Extensions code once it's
# been tested.
$shaext=0;	### set to zero if compiling for 1.0.1
$avx=1		if (!$shaext && $avx);

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

if ($output =~ /512/) {
	$func="sha512_block_data_order";
	$TABLE="K512";
	$SZ=8;
	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
					"%r8", "%r9", "%r10","%r11");
	($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=(1, 8, 7);
	@sigma1=(19,61, 6);
	$rounds=80;
} else {
	$func="sha256_block_data_order";
	$TABLE="K256";
	$SZ=4;
	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
					"%r8d","%r9d","%r10d","%r11d");
	($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
	@Sigma0=( 2,13,22);
	@Sigma1=( 6,11,25);
	@sigma0=( 7,18, 3);
	@sigma1=(17,19,10);
	$rounds=64;
}

$ctx="%rdi";	# 1st arg, zapped by $a3
$inp="%rsi";	# 2nd arg
$Tbl="%rbp";

$_ctx="16*$SZ+0*8(%rsp)";
$_inp="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_rsp="16*$SZ+3*8(%rsp)";
$framesz="16*$SZ+4*8";


sub ROUND_00_15()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
  my $STRIDE=$SZ;
     $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));

$code.=<<___;
	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a0
	mov	$f,$a2

	xor	$e,$a0
	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a1
	xor	$g,$a2			# f^g

	mov	$T1,`$SZ*($i&0xf)`(%rsp)
	xor	$a,$a1
	and	$e,$a2			# (f^g)&e

	ror	\$`$Sigma1[1]-$Sigma1[0]`,$a0
	add	$h,$T1			# T1+=h
	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g

	ror	\$`$Sigma0[1]-$Sigma0[0]`,$a1
	xor	$e,$a0
	add	$a2,$T1			# T1+=Ch(e,f,g)

	mov	$a,$a2
	add	($Tbl),$T1		# T1+=K[round]
	xor	$a,$a1

	xor	$b,$a2			# a^b, b^c in next round
	ror	\$$Sigma1[0],$a0	# Sigma1(e)
	mov	$b,$h

	and	$a2,$a3
	ror	\$$Sigma0[0],$a1	# Sigma0(a)
	add	$a0,$T1			# T1+=Sigma1(e)

	xor	$a3,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
	add	$T1,$d			# d+=T1
	add	$T1,$h			# h+=T1

	lea	$STRIDE($Tbl),$Tbl	# round++
___
$code.=<<___ if ($i<15);
	add	$a1,$h			# h+=Sigma0(a)
___
	($a2,$a3) = ($a3,$a2);
}

sub ROUND_16_XX()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
	mov	`$SZ*(($i+14)&0xf)`(%rsp),$a2

	mov	$a0,$T1
	ror	\$`$sigma0[1]-$sigma0[0]`,$a0
	add	$a1,$a			# modulo-scheduled h+=Sigma0(a)
	mov	$a2,$a1
	ror	\$`$sigma1[1]-$sigma1[0]`,$a2

	xor	$T1,$a0
	shr	\$$sigma0[2],$T1
	ror	\$$sigma0[0],$a0
	xor	$a1,$a2
	shr	\$$sigma1[2],$a1

	ror	\$$sigma1[0],$a2
	xor	$a0,$T1			# sigma0(X[(i+1)&0xf])
	xor	$a1,$a2			# sigma1(X[(i+14)&0xf])
	add	`$SZ*(($i+9)&0xf)`(%rsp),$T1

	add	`$SZ*($i&0xf)`(%rsp),$T1
	mov	$e,$a0
	add	$a2,$T1
	mov	$a,$a1
___
	&ROUND_00_15(@_);
}
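
# [Editorial note: ROUND_00_15 above is a direct transcription of the
# FIPS 180-4 compression round,
#
#	T1  = h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i]
#	d  += T1
#	h   = T1 + Sigma0(a) + Maj(a,b,c)
#
# with each Sigma evaluated as a chain of partial rotations and XORs so
# that it interleaves with the boolean logic.  ROUND_16_XX prepends the
# message-schedule update, keeping the sixteen most recent W[] words in
# a ring buffer on the stack:
#
#	W[i&15] += sigma0(W[(i+1)&15]) + W[(i+9)&15] + sigma1(W[(i+14)&15])
#
# which is the usual W[i] = W[i-16] + sigma0(W[i-15]) + W[i-7] +
# sigma1(W[i-2]) recurrence taken modulo 16.  The "h+=Sigma0(a)" of one
# round is deferred into the next round (modulo-scheduled), which is why
# the main loop below finishes each block with an explicit "add $a1,$A".]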

$code=<<___;
.text

.extern	OPENSSL_ia32cap_P
.globl	$func
.type	$func,\@function,3
.align	16
$func:
___
$code.=<<___ if ($SZ==4 || $avx);
	leaq	OPENSSL_ia32cap_P(%rip),%r11
	mov	0(%r11),%r9d
	mov	4(%r11),%r10d
	mov	8(%r11),%r11d
___
$code.=<<___ if ($SZ==4 && $shaext);
	test	\$`1<<29`,%r11d		# check for SHA
	jnz	_shaext_shortcut
___
$code.=<<___ if ($avx && $SZ==8);
	test	\$`1<<11`,%r10d		# check for XOP
	jnz	.Lxop_shortcut
___
$code.=<<___ if ($avx>1);
	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
	cmp	\$`1<<8|1<<5|1<<3`,%r11d
	je	.Lavx2_shortcut
___
$code.=<<___ if ($avx);
	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
	and	\$`1<<28|1<<9`,%r10d	# mask AVX and SSSE3 bits
	or	%r9d,%r10d
	cmp	\$`1<<28|1<<9|1<<30`,%r10d
	je	.Lavx_shortcut
___
$code.=<<___ if ($SZ==4);
	test	\$`1<<9`,%r10d
	jnz	.Lssse3_shortcut
___
$code.=<<___;
	mov	%rsp,%rax		# copy %rsp
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	shl	\$4,%rdx		# num*16
	sub	\$$framesz,%rsp
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	and	\$-64,%rsp		# align stack frame
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%rax,$_rsp		# save copy of %rsp
.Lprologue:

	mov	$SZ*0($ctx),$A
	mov	$SZ*1($ctx),$B
	mov	$SZ*2($ctx),$C
	mov	$SZ*3($ctx),$D
	mov	$SZ*4($ctx),$E
	mov	$SZ*5($ctx),$F
	mov	$SZ*6($ctx),$G
	mov	$SZ*7($ctx),$H
	jmp	.Lloop

.align	16
.Lloop:
	mov	$B,$a3
	lea	$TABLE(%rip),$Tbl
	xor	$C,$a3			# magic
___
	for($i=0;$i<16;$i++) {
		$code.="	mov	$SZ*$i($inp),$T1\n";
		$code.="	mov	@ROT[4],$a0\n";
		$code.="	mov	@ROT[0],$a1\n";
		$code.="	bswap	$T1\n";
		&ROUND_00_15($i,@ROT);
		unshift(@ROT,pop(@ROT));
	}
$code.=<<___;
	jmp	.Lrounds_16_xx
.align	16
.Lrounds_16_xx:
___
	for(;$i<32;$i++) {
		&ROUND_16_XX($i,@ROT);
		unshift(@ROT,pop(@ROT));
	}

$code.=<<___;
	cmpb	\$0,`$SZ-1`($Tbl)
	jnz	.Lrounds_16_xx

	mov	$_ctx,$ctx
	add	$a1,$A			# modulo-scheduled h+=Sigma0(a)
	lea	16*$SZ($inp),$inp

	add	$SZ*0($ctx),$A
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	add	$SZ*6($ctx),$G
	add	$SZ*7($ctx),$H

	cmp	$_end,$inp

	mov	$A,$SZ*0($ctx)
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)
	jb	.Lloop

	mov	$_rsp,%rsi
	mov	-48(%rsi),%r15
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lepilogue:
	ret
.size	$func,.-$func
___

if ($SZ==4) {
$code.=<<___;
.align	64
.type	$TABLE,\@object
$TABLE:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
419 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 420 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 421 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 422 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 423 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 424 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 425 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 426 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 427 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 428 429 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f 430 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f 431 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff 432 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff 433 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 434 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 435 .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 436___ 437} else { 438$code.=<<___; 439.align 64 440.type $TABLE,\@object 441$TABLE: 442 .quad 0x428a2f98d728ae22,0x7137449123ef65cd 443 .quad 0x428a2f98d728ae22,0x7137449123ef65cd 444 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc 445 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc 446 .quad 0x3956c25bf348b538,0x59f111f1b605d019 447 .quad 0x3956c25bf348b538,0x59f111f1b605d019 448 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 449 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 450 .quad 0xd807aa98a3030242,0x12835b0145706fbe 451 .quad 0xd807aa98a3030242,0x12835b0145706fbe 452 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 453 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 454 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 455 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 456 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 457 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 458 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 459 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 460 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 461 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 462 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 463 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 464 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 465 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 466 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 467 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 468 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 469 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 470 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 471 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 472 .quad 0x06ca6351e003826f,0x142929670a0e6e70 473 .quad 0x06ca6351e003826f,0x142929670a0e6e70 474 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 475 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 476 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df 477 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df 478 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 479 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 480 .quad 0x81c2c92e47edaee6,0x92722c851482353b 481 .quad 0x81c2c92e47edaee6,0x92722c851482353b 482 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 483 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 484 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 485 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 486 .quad 0xd192e819d6ef5218,0xd69906245565a910 487 .quad 0xd192e819d6ef5218,0xd69906245565a910 488 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 489 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 490 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 491 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 492 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 493 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 494 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb 495 .quad 
0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb 496 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 497 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 498 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 499 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 500 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec 501 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec 502 .quad 0x90befffa23631e28,0xa4506cebde82bde9 503 .quad 0x90befffa23631e28,0xa4506cebde82bde9 504 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b 505 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b 506 .quad 0xca273eceea26619c,0xd186b8c721c0c207 507 .quad 0xca273eceea26619c,0xd186b8c721c0c207 508 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 509 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 510 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 511 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 512 .quad 0x113f9804bef90dae,0x1b710b35131c471b 513 .quad 0x113f9804bef90dae,0x1b710b35131c471b 514 .quad 0x28db77f523047d84,0x32caab7b40c72493 515 .quad 0x28db77f523047d84,0x32caab7b40c72493 516 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c 517 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c 518 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a 519 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a 520 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 521 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 522 523 .quad 0x0001020304050607,0x08090a0b0c0d0e0f 524 .quad 0x0001020304050607,0x08090a0b0c0d0e0f 525 .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" 526___ 527} 528 529###################################################################### 530# SIMD code paths 531# 532if ($SZ==4 && $shaext) {{{ 533###################################################################### 534# Intel SHA Extensions implementation of SHA256 update function. 535# 536my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx"); 537 538my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10)); 539my @MSG=map("%xmm$_",(3..6)); 540 541$code.=<<___; 542.type sha256_block_data_order_shaext,\@function,3 543.align 64 544sha256_block_data_order_shaext: 545_shaext_shortcut: 546___ 547$code.=<<___ if ($win64); 548 lea `-8-5*16`(%rsp),%rsp 549 movaps %xmm6,-8-5*16(%rax) 550 movaps %xmm7,-8-4*16(%rax) 551 movaps %xmm8,-8-3*16(%rax) 552 movaps %xmm9,-8-2*16(%rax) 553 movaps %xmm10,-8-1*16(%rax) 554.Lprologue_shaext: 555___ 556$code.=<<___; 557 lea K256+0x80(%rip),$Tbl 558 movdqu ($ctx),$ABEF # DCBA 559 movdqu 16($ctx),$CDGH # HGFE 560 movdqa 0x200-0x80($Tbl),$TMP # byte swap mask 561 562 pshufd \$0x1b,$ABEF,$Wi # ABCD 563 pshufd \$0xb1,$ABEF,$ABEF # CDAB 564 pshufd \$0x1b,$CDGH,$CDGH # EFGH 565 movdqa $TMP,$BSWAP # offload 566 palignr \$8,$CDGH,$ABEF # ABEF 567 punpcklqdq $Wi,$CDGH # CDGH 568 jmp .Loop_shaext 569 570.align 16 571.Loop_shaext: 572 movdqu ($inp),@MSG[0] 573 movdqu 0x10($inp),@MSG[1] 574 movdqu 0x20($inp),@MSG[2] 575 pshufb $TMP,@MSG[0] 576 movdqu 0x30($inp),@MSG[3] 577 578 movdqa 0*32-0x80($Tbl),$Wi 579 paddd @MSG[0],$Wi 580 pshufb $TMP,@MSG[1] 581 movdqa $CDGH,$CDGH_SAVE # offload 582 sha256rnds2 $ABEF,$CDGH # 0-3 583 pshufd \$0x0e,$Wi,$Wi 584 nop 585 movdqa $ABEF,$ABEF_SAVE # offload 586 sha256rnds2 $CDGH,$ABEF 587 588 movdqa 1*32-0x80($Tbl),$Wi 589 paddd @MSG[1],$Wi 590 pshufb $TMP,@MSG[2] 591 sha256rnds2 $ABEF,$CDGH # 4-7 592 pshufd \$0x0e,$Wi,$Wi 593 lea 0x40($inp),$inp 594 sha256msg1 @MSG[1],@MSG[0] 595 sha256rnds2 $CDGH,$ABEF 596 597 movdqa 2*32-0x80($Tbl),$Wi 598 paddd @MSG[2],$Wi 599 pshufb $TMP,@MSG[3] 600 sha256rnds2 $ABEF,$CDGH # 8-11 601 pshufd \$0x0e,$Wi,$Wi 602 movdqa @MSG[3],$TMP 603 
palignr \$4,@MSG[2],$TMP 604 nop 605 paddd $TMP,@MSG[0] 606 sha256msg1 @MSG[2],@MSG[1] 607 sha256rnds2 $CDGH,$ABEF 608 609 movdqa 3*32-0x80($Tbl),$Wi 610 paddd @MSG[3],$Wi 611 sha256msg2 @MSG[3],@MSG[0] 612 sha256rnds2 $ABEF,$CDGH # 12-15 613 pshufd \$0x0e,$Wi,$Wi 614 movdqa @MSG[0],$TMP 615 palignr \$4,@MSG[3],$TMP 616 nop 617 paddd $TMP,@MSG[1] 618 sha256msg1 @MSG[3],@MSG[2] 619 sha256rnds2 $CDGH,$ABEF 620___ 621for($i=4;$i<16-3;$i++) { 622$code.=<<___; 623 movdqa $i*32-0x80($Tbl),$Wi 624 paddd @MSG[0],$Wi 625 sha256msg2 @MSG[0],@MSG[1] 626 sha256rnds2 $ABEF,$CDGH # 16-19... 627 pshufd \$0x0e,$Wi,$Wi 628 movdqa @MSG[1],$TMP 629 palignr \$4,@MSG[0],$TMP 630 nop 631 paddd $TMP,@MSG[2] 632 sha256msg1 @MSG[0],@MSG[3] 633 sha256rnds2 $CDGH,$ABEF 634___ 635 push(@MSG,shift(@MSG)); 636} 637$code.=<<___; 638 movdqa 13*32-0x80($Tbl),$Wi 639 paddd @MSG[0],$Wi 640 sha256msg2 @MSG[0],@MSG[1] 641 sha256rnds2 $ABEF,$CDGH # 52-55 642 pshufd \$0x0e,$Wi,$Wi 643 movdqa @MSG[1],$TMP 644 palignr \$4,@MSG[0],$TMP 645 sha256rnds2 $CDGH,$ABEF 646 paddd $TMP,@MSG[2] 647 648 movdqa 14*32-0x80($Tbl),$Wi 649 paddd @MSG[1],$Wi 650 sha256rnds2 $ABEF,$CDGH # 56-59 651 pshufd \$0x0e,$Wi,$Wi 652 sha256msg2 @MSG[1],@MSG[2] 653 movdqa $BSWAP,$TMP 654 sha256rnds2 $CDGH,$ABEF 655 656 movdqa 15*32-0x80($Tbl),$Wi 657 paddd @MSG[2],$Wi 658 nop 659 sha256rnds2 $ABEF,$CDGH # 60-63 660 pshufd \$0x0e,$Wi,$Wi 661 dec $num 662 nop 663 sha256rnds2 $CDGH,$ABEF 664 665 paddd $CDGH_SAVE,$CDGH 666 paddd $ABEF_SAVE,$ABEF 667 jnz .Loop_shaext 668 669 pshufd \$0xb1,$CDGH,$CDGH # DCHG 670 pshufd \$0x1b,$ABEF,$TMP # FEBA 671 pshufd \$0xb1,$ABEF,$ABEF # BAFE 672 punpckhqdq $CDGH,$ABEF # DCBA 673 palignr \$8,$TMP,$CDGH # HGFE 674 675 movdqu $ABEF,($ctx) 676 movdqu $CDGH,16($ctx) 677___ 678$code.=<<___ if ($win64); 679 movaps -8-5*16(%rax),%xmm6 680 movaps -8-4*16(%rax),%xmm7 681 movaps -8-3*16(%rax),%xmm8 682 movaps -8-2*16(%rax),%xmm9 683 movaps -8-1*16(%rax),%xmm10 684 mov %rax,%rsp 685.Lepilogue_shaext: 686___ 687$code.=<<___; 688 ret 689.size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext 690___ 691}}} 692{{{ 693 694my $a4=$T1; 695my ($a,$b,$c,$d,$e,$f,$g,$h); 696 697sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm 698{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; 699 my $arg = pop; 700 $arg = "\$$arg" if ($arg*1 eq $arg); 701 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; 702} 703 704sub body_00_15 () { 705 ( 706 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. 707 708 '&ror ($a0,$Sigma1[2]-$Sigma1[1])', 709 '&mov ($a,$a1)', 710 '&mov ($a4,$f)', 711 712 '&ror ($a1,$Sigma0[2]-$Sigma0[1])', 713 '&xor ($a0,$e)', 714 '&xor ($a4,$g)', # f^g 715 716 '&ror ($a0,$Sigma1[1]-$Sigma1[0])', 717 '&xor ($a1,$a)', 718 '&and ($a4,$e)', # (f^g)&e 719 720 '&xor ($a0,$e)', 721 '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i] 722 '&mov ($a2,$a)', 723 724 '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g 725 '&ror ($a1,$Sigma0[1]-$Sigma0[0])', 726 '&xor ($a2,$b)', # a^b, b^c in next round 727 728 '&add ($h,$a4)', # h+=Ch(e,f,g) 729 '&ror ($a0,$Sigma1[0])', # Sigma1(e) 730 '&and ($a3,$a2)', # (b^c)&(a^b) 731 732 '&xor ($a1,$a)', 733 '&add ($h,$a0)', # h+=Sigma1(e) 734 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) 735 736 '&ror ($a1,$Sigma0[0])', # Sigma0(a) 737 '&add ($d,$h)', # d+=h 738 '&add ($h,$a3)', # h+=Maj(a,b,c) 739 740 '&mov ($a0,$d)', 741 '&add ($a1,$h);'. 
# h+=Sigma0(a) 742 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' 743 ); 744} 745 746###################################################################### 747# SSSE3 code path 748# 749if ($SZ==4) { # SHA256 only 750my @X = map("%xmm$_",(0..3)); 751my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); 752 753$code.=<<___; 754.type ${func}_ssse3,\@function,3 755.align 64 756${func}_ssse3: 757.Lssse3_shortcut: 758 mov %rsp,%rax # copy %rsp 759 push %rbx 760 push %rbp 761 push %r12 762 push %r13 763 push %r14 764 push %r15 765 shl \$4,%rdx # num*16 766 sub \$`$framesz+$win64*16*4`,%rsp 767 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 768 and \$-64,%rsp # align stack frame 769 mov $ctx,$_ctx # save ctx, 1st arg 770 mov $inp,$_inp # save inp, 2nd arh 771 mov %rdx,$_end # save end pointer, "3rd" arg 772 mov %rax,$_rsp # save copy of %rsp 773___ 774$code.=<<___ if ($win64); 775 movaps %xmm6,16*$SZ+32(%rsp) 776 movaps %xmm7,16*$SZ+48(%rsp) 777 movaps %xmm8,16*$SZ+64(%rsp) 778 movaps %xmm9,16*$SZ+80(%rsp) 779___ 780$code.=<<___; 781.Lprologue_ssse3: 782 783 mov $SZ*0($ctx),$A 784 mov $SZ*1($ctx),$B 785 mov $SZ*2($ctx),$C 786 mov $SZ*3($ctx),$D 787 mov $SZ*4($ctx),$E 788 mov $SZ*5($ctx),$F 789 mov $SZ*6($ctx),$G 790 mov $SZ*7($ctx),$H 791___ 792 793$code.=<<___; 794 #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 795 #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 796 jmp .Lloop_ssse3 797.align 16 798.Lloop_ssse3: 799 movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 800 movdqu 0x00($inp),@X[0] 801 movdqu 0x10($inp),@X[1] 802 movdqu 0x20($inp),@X[2] 803 pshufb $t3,@X[0] 804 movdqu 0x30($inp),@X[3] 805 lea $TABLE(%rip),$Tbl 806 pshufb $t3,@X[1] 807 movdqa 0x00($Tbl),$t0 808 movdqa 0x20($Tbl),$t1 809 pshufb $t3,@X[2] 810 paddd @X[0],$t0 811 movdqa 0x40($Tbl),$t2 812 pshufb $t3,@X[3] 813 movdqa 0x60($Tbl),$t3 814 paddd @X[1],$t1 815 paddd @X[2],$t2 816 paddd @X[3],$t3 817 movdqa $t0,0x00(%rsp) 818 mov $A,$a1 819 movdqa $t1,0x10(%rsp) 820 mov $B,$a3 821 movdqa $t2,0x20(%rsp) 822 xor $C,$a3 # magic 823 movdqa $t3,0x30(%rsp) 824 mov $E,$a0 825 jmp .Lssse3_00_47 826 827.align 16 828.Lssse3_00_47: 829 sub \$`-16*2*$SZ`,$Tbl # size optimization 830___ 831sub Xupdate_256_SSSE3 () { 832 ( 833 '&movdqa ($t0,@X[1]);', 834 '&movdqa ($t3,@X[3])', 835 '&palignr ($t0,@X[0],$SZ)', # X[1..4] 836 '&palignr ($t3,@X[2],$SZ);', # X[9..12] 837 '&movdqa ($t1,$t0)', 838 '&movdqa ($t2,$t0);', 839 '&psrld ($t0,$sigma0[2])', 840 '&paddd (@X[0],$t3);', # X[0..3] += X[9..12] 841 '&psrld ($t2,$sigma0[0])', 842 '&pshufd ($t3,@X[3],0b11111010)',# X[14..15] 843 '&pslld ($t1,8*$SZ-$sigma0[1]);'. 844 '&pxor ($t0,$t2)', 845 '&psrld ($t2,$sigma0[1]-$sigma0[0]);'. 846 '&pxor ($t0,$t1)', 847 '&pslld ($t1,$sigma0[1]-$sigma0[0]);'. 
848 '&pxor ($t0,$t2);', 849 '&movdqa ($t2,$t3)', 850 '&pxor ($t0,$t1);', # sigma0(X[1..4]) 851 '&psrld ($t3,$sigma1[2])', 852 '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4]) 853 '&psrlq ($t2,$sigma1[0])', 854 '&pxor ($t3,$t2);', 855 '&psrlq ($t2,$sigma1[1]-$sigma1[0])', 856 '&pxor ($t3,$t2)', 857 '&pshufb ($t3,$t4)', # sigma1(X[14..15]) 858 '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) 859 '&pshufd ($t3,@X[0],0b01010000)',# X[16..17] 860 '&movdqa ($t2,$t3);', 861 '&psrld ($t3,$sigma1[2])', 862 '&psrlq ($t2,$sigma1[0])', 863 '&pxor ($t3,$t2);', 864 '&psrlq ($t2,$sigma1[1]-$sigma1[0])', 865 '&pxor ($t3,$t2);', 866 '&movdqa ($t2,16*2*$j."($Tbl)")', 867 '&pshufb ($t3,$t5)', 868 '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17]) 869 ); 870} 871 872sub SSSE3_256_00_47 () { 873my $j = shift; 874my $body = shift; 875my @X = @_; 876my @insns = (&$body,&$body,&$body,&$body); # 104 instructions 877 878 if (0) { 879 foreach (Xupdate_256_SSSE3()) { # 36 instructions 880 eval; 881 eval(shift(@insns)); 882 eval(shift(@insns)); 883 eval(shift(@insns)); 884 } 885 } else { # squeeze extra 4% on Westmere and 19% on Atom 886 eval(shift(@insns)); #@ 887 &movdqa ($t0,@X[1]); 888 eval(shift(@insns)); 889 eval(shift(@insns)); 890 &movdqa ($t3,@X[3]); 891 eval(shift(@insns)); #@ 892 eval(shift(@insns)); 893 eval(shift(@insns)); 894 eval(shift(@insns)); #@ 895 eval(shift(@insns)); 896 &palignr ($t0,@X[0],$SZ); # X[1..4] 897 eval(shift(@insns)); 898 eval(shift(@insns)); 899 &palignr ($t3,@X[2],$SZ); # X[9..12] 900 eval(shift(@insns)); 901 eval(shift(@insns)); 902 eval(shift(@insns)); 903 eval(shift(@insns)); #@ 904 &movdqa ($t1,$t0); 905 eval(shift(@insns)); 906 eval(shift(@insns)); 907 &movdqa ($t2,$t0); 908 eval(shift(@insns)); #@ 909 eval(shift(@insns)); 910 &psrld ($t0,$sigma0[2]); 911 eval(shift(@insns)); 912 eval(shift(@insns)); 913 eval(shift(@insns)); 914 &paddd (@X[0],$t3); # X[0..3] += X[9..12] 915 eval(shift(@insns)); #@ 916 eval(shift(@insns)); 917 &psrld ($t2,$sigma0[0]); 918 eval(shift(@insns)); 919 eval(shift(@insns)); 920 &pshufd ($t3,@X[3],0b11111010); # X[4..15] 921 eval(shift(@insns)); 922 eval(shift(@insns)); #@ 923 &pslld ($t1,8*$SZ-$sigma0[1]); 924 eval(shift(@insns)); 925 eval(shift(@insns)); 926 &pxor ($t0,$t2); 927 eval(shift(@insns)); #@ 928 eval(shift(@insns)); 929 eval(shift(@insns)); 930 eval(shift(@insns)); #@ 931 &psrld ($t2,$sigma0[1]-$sigma0[0]); 932 eval(shift(@insns)); 933 &pxor ($t0,$t1); 934 eval(shift(@insns)); 935 eval(shift(@insns)); 936 &pslld ($t1,$sigma0[1]-$sigma0[0]); 937 eval(shift(@insns)); 938 eval(shift(@insns)); 939 &pxor ($t0,$t2); 940 eval(shift(@insns)); 941 eval(shift(@insns)); #@ 942 &movdqa ($t2,$t3); 943 eval(shift(@insns)); 944 eval(shift(@insns)); 945 &pxor ($t0,$t1); # sigma0(X[1..4]) 946 eval(shift(@insns)); #@ 947 eval(shift(@insns)); 948 eval(shift(@insns)); 949 &psrld ($t3,$sigma1[2]); 950 eval(shift(@insns)); 951 eval(shift(@insns)); 952 &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4]) 953 eval(shift(@insns)); #@ 954 eval(shift(@insns)); 955 &psrlq ($t2,$sigma1[0]); 956 eval(shift(@insns)); 957 eval(shift(@insns)); 958 eval(shift(@insns)); 959 &pxor ($t3,$t2); 960 eval(shift(@insns)); #@ 961 eval(shift(@insns)); 962 eval(shift(@insns)); 963 eval(shift(@insns)); #@ 964 &psrlq ($t2,$sigma1[1]-$sigma1[0]); 965 eval(shift(@insns)); 966 eval(shift(@insns)); 967 &pxor ($t3,$t2); 968 eval(shift(@insns)); #@ 969 eval(shift(@insns)); 970 eval(shift(@insns)); 971 #&pshufb ($t3,$t4); # sigma1(X[14..15]) 972 &pshufd ($t3,$t3,0b10000000); 973 
eval(shift(@insns)); 974 eval(shift(@insns)); 975 eval(shift(@insns)); 976 &psrldq ($t3,8); 977 eval(shift(@insns)); 978 eval(shift(@insns)); #@ 979 eval(shift(@insns)); 980 eval(shift(@insns)); 981 eval(shift(@insns)); #@ 982 &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15]) 983 eval(shift(@insns)); 984 eval(shift(@insns)); 985 eval(shift(@insns)); 986 &pshufd ($t3,@X[0],0b01010000); # X[16..17] 987 eval(shift(@insns)); 988 eval(shift(@insns)); #@ 989 eval(shift(@insns)); 990 &movdqa ($t2,$t3); 991 eval(shift(@insns)); 992 eval(shift(@insns)); 993 &psrld ($t3,$sigma1[2]); 994 eval(shift(@insns)); 995 eval(shift(@insns)); #@ 996 &psrlq ($t2,$sigma1[0]); 997 eval(shift(@insns)); 998 eval(shift(@insns)); 999 &pxor ($t3,$t2); 1000 eval(shift(@insns)); #@ 1001 eval(shift(@insns)); 1002 eval(shift(@insns)); 1003 eval(shift(@insns)); #@ 1004 eval(shift(@insns)); 1005 &psrlq ($t2,$sigma1[1]-$sigma1[0]); 1006 eval(shift(@insns)); 1007 eval(shift(@insns)); 1008 eval(shift(@insns)); 1009 &pxor ($t3,$t2); 1010 eval(shift(@insns)); 1011 eval(shift(@insns)); 1012 eval(shift(@insns)); #@ 1013 #&pshufb ($t3,$t5); 1014 &pshufd ($t3,$t3,0b00001000); 1015 eval(shift(@insns)); 1016 eval(shift(@insns)); 1017 &movdqa ($t2,16*2*$j."($Tbl)"); 1018 eval(shift(@insns)); #@ 1019 eval(shift(@insns)); 1020 &pslldq ($t3,8); 1021 eval(shift(@insns)); 1022 eval(shift(@insns)); 1023 eval(shift(@insns)); 1024 &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) 1025 eval(shift(@insns)); #@ 1026 eval(shift(@insns)); 1027 eval(shift(@insns)); 1028 } 1029 &paddd ($t2,@X[0]); 1030 foreach (@insns) { eval; } # remaining instructions 1031 &movdqa (16*$j."(%rsp)",$t2); 1032} 1033 1034 for ($i=0,$j=0; $j<4; $j++) { 1035 &SSSE3_256_00_47($j,\&body_00_15,@X); 1036 push(@X,shift(@X)); # rotate(@X) 1037 } 1038 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 1039 &jne (".Lssse3_00_47"); 1040 1041 for ($i=0; $i<16; ) { 1042 foreach(body_00_15()) { eval; } 1043 } 1044$code.=<<___; 1045 mov $_ctx,$ctx 1046 mov $a1,$A 1047 1048 add $SZ*0($ctx),$A 1049 lea 16*$SZ($inp),$inp 1050 add $SZ*1($ctx),$B 1051 add $SZ*2($ctx),$C 1052 add $SZ*3($ctx),$D 1053 add $SZ*4($ctx),$E 1054 add $SZ*5($ctx),$F 1055 add $SZ*6($ctx),$G 1056 add $SZ*7($ctx),$H 1057 1058 cmp $_end,$inp 1059 1060 mov $A,$SZ*0($ctx) 1061 mov $B,$SZ*1($ctx) 1062 mov $C,$SZ*2($ctx) 1063 mov $D,$SZ*3($ctx) 1064 mov $E,$SZ*4($ctx) 1065 mov $F,$SZ*5($ctx) 1066 mov $G,$SZ*6($ctx) 1067 mov $H,$SZ*7($ctx) 1068 jb .Lloop_ssse3 1069 1070 mov $_rsp,%rsi 1071___ 1072$code.=<<___ if ($win64); 1073 movaps 16*$SZ+32(%rsp),%xmm6 1074 movaps 16*$SZ+48(%rsp),%xmm7 1075 movaps 16*$SZ+64(%rsp),%xmm8 1076 movaps 16*$SZ+80(%rsp),%xmm9 1077___ 1078$code.=<<___; 1079 mov -48(%rsi),%r15 1080 mov -40(%rsi),%r14 1081 mov -32(%rsi),%r13 1082 mov -24(%rsi),%r12 1083 mov -16(%rsi),%rbp 1084 mov -8(%rsi),%rbx 1085 lea (%rsi),%rsp 1086.Lepilogue_ssse3: 1087 ret 1088.size ${func}_ssse3,.-${func}_ssse3 1089___ 1090} 1091 1092if ($avx) {{ 1093###################################################################### 1094# XOP code path 1095# 1096if ($SZ==8) { # SHA512 only 1097$code.=<<___; 1098.type ${func}_xop,\@function,3 1099.align 64 1100${func}_xop: 1101.Lxop_shortcut: 1102 mov %rsp,%rax # copy %rsp 1103 push %rbx 1104 push %rbp 1105 push %r12 1106 push %r13 1107 push %r14 1108 push %r15 1109 shl \$4,%rdx # num*16 1110 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp 1111 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 1112 and \$-64,%rsp # align stack frame 1113 mov $ctx,$_ctx # save ctx, 1st arg 1114 mov $inp,$_inp # save inp, 2nd arh 
1115 mov %rdx,$_end # save end pointer, "3rd" arg 1116 mov %rax,$_rsp # save copy of %rsp 1117___ 1118$code.=<<___ if ($win64); 1119 movaps %xmm6,16*$SZ+32(%rsp) 1120 movaps %xmm7,16*$SZ+48(%rsp) 1121 movaps %xmm8,16*$SZ+64(%rsp) 1122 movaps %xmm9,16*$SZ+80(%rsp) 1123___ 1124$code.=<<___ if ($win64 && $SZ>4); 1125 movaps %xmm10,16*$SZ+96(%rsp) 1126 movaps %xmm11,16*$SZ+112(%rsp) 1127___ 1128$code.=<<___; 1129.Lprologue_xop: 1130 1131 vzeroupper 1132 mov $SZ*0($ctx),$A 1133 mov $SZ*1($ctx),$B 1134 mov $SZ*2($ctx),$C 1135 mov $SZ*3($ctx),$D 1136 mov $SZ*4($ctx),$E 1137 mov $SZ*5($ctx),$F 1138 mov $SZ*6($ctx),$G 1139 mov $SZ*7($ctx),$H 1140 jmp .Lloop_xop 1141___ 1142 if ($SZ==4) { # SHA256 1143 my @X = map("%xmm$_",(0..3)); 1144 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); 1145 1146$code.=<<___; 1147.align 16 1148.Lloop_xop: 1149 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1150 vmovdqu 0x00($inp),@X[0] 1151 vmovdqu 0x10($inp),@X[1] 1152 vmovdqu 0x20($inp),@X[2] 1153 vmovdqu 0x30($inp),@X[3] 1154 vpshufb $t3,@X[0],@X[0] 1155 lea $TABLE(%rip),$Tbl 1156 vpshufb $t3,@X[1],@X[1] 1157 vpshufb $t3,@X[2],@X[2] 1158 vpaddd 0x00($Tbl),@X[0],$t0 1159 vpshufb $t3,@X[3],@X[3] 1160 vpaddd 0x20($Tbl),@X[1],$t1 1161 vpaddd 0x40($Tbl),@X[2],$t2 1162 vpaddd 0x60($Tbl),@X[3],$t3 1163 vmovdqa $t0,0x00(%rsp) 1164 mov $A,$a1 1165 vmovdqa $t1,0x10(%rsp) 1166 mov $B,$a3 1167 vmovdqa $t2,0x20(%rsp) 1168 xor $C,$a3 # magic 1169 vmovdqa $t3,0x30(%rsp) 1170 mov $E,$a0 1171 jmp .Lxop_00_47 1172 1173.align 16 1174.Lxop_00_47: 1175 sub \$`-16*2*$SZ`,$Tbl # size optimization 1176___ 1177sub XOP_256_00_47 () { 1178my $j = shift; 1179my $body = shift; 1180my @X = @_; 1181my @insns = (&$body,&$body,&$body,&$body); # 104 instructions 1182 1183 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4] 1184 eval(shift(@insns)); 1185 eval(shift(@insns)); 1186 &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12] 1187 eval(shift(@insns)); 1188 eval(shift(@insns)); 1189 &vprotd ($t1,$t0,8*$SZ-$sigma0[1]); 1190 eval(shift(@insns)); 1191 eval(shift(@insns)); 1192 &vpsrld ($t0,$t0,$sigma0[2]); 1193 eval(shift(@insns)); 1194 eval(shift(@insns)); 1195 &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12] 1196 eval(shift(@insns)); 1197 eval(shift(@insns)); 1198 eval(shift(@insns)); 1199 eval(shift(@insns)); 1200 &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]); 1201 eval(shift(@insns)); 1202 eval(shift(@insns)); 1203 &vpxor ($t0,$t0,$t1); 1204 eval(shift(@insns)); 1205 eval(shift(@insns)); 1206 eval(shift(@insns)); 1207 eval(shift(@insns)); 1208 &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]); 1209 eval(shift(@insns)); 1210 eval(shift(@insns)); 1211 &vpxor ($t0,$t0,$t2); # sigma0(X[1..4]) 1212 eval(shift(@insns)); 1213 eval(shift(@insns)); 1214 &vpsrld ($t2,@X[3],$sigma1[2]); 1215 eval(shift(@insns)); 1216 eval(shift(@insns)); 1217 &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4]) 1218 eval(shift(@insns)); 1219 eval(shift(@insns)); 1220 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); 1221 eval(shift(@insns)); 1222 eval(shift(@insns)); 1223 &vpxor ($t3,$t3,$t2); 1224 eval(shift(@insns)); 1225 eval(shift(@insns)); 1226 eval(shift(@insns)); 1227 eval(shift(@insns)); 1228 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) 1229 eval(shift(@insns)); 1230 eval(shift(@insns)); 1231 eval(shift(@insns)); 1232 eval(shift(@insns)); 1233 &vpsrldq ($t3,$t3,8); 1234 eval(shift(@insns)); 1235 eval(shift(@insns)); 1236 eval(shift(@insns)); 1237 eval(shift(@insns)); 1238 &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) 1239 eval(shift(@insns)); 1240 eval(shift(@insns)); 1241 eval(shift(@insns)); 1242 
eval(shift(@insns)); 1243 &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]); 1244 eval(shift(@insns)); 1245 eval(shift(@insns)); 1246 &vpsrld ($t2,@X[0],$sigma1[2]); 1247 eval(shift(@insns)); 1248 eval(shift(@insns)); 1249 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); 1250 eval(shift(@insns)); 1251 eval(shift(@insns)); 1252 &vpxor ($t3,$t3,$t2); 1253 eval(shift(@insns)); 1254 eval(shift(@insns)); 1255 eval(shift(@insns)); 1256 eval(shift(@insns)); 1257 &vpxor ($t3,$t3,$t1); # sigma1(X[16..17]) 1258 eval(shift(@insns)); 1259 eval(shift(@insns)); 1260 eval(shift(@insns)); 1261 eval(shift(@insns)); 1262 &vpslldq ($t3,$t3,8); # 22 instructions 1263 eval(shift(@insns)); 1264 eval(shift(@insns)); 1265 eval(shift(@insns)); 1266 eval(shift(@insns)); 1267 &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17]) 1268 eval(shift(@insns)); 1269 eval(shift(@insns)); 1270 eval(shift(@insns)); 1271 eval(shift(@insns)); 1272 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 1273 foreach (@insns) { eval; } # remaining instructions 1274 &vmovdqa (16*$j."(%rsp)",$t2); 1275} 1276 1277 for ($i=0,$j=0; $j<4; $j++) { 1278 &XOP_256_00_47($j,\&body_00_15,@X); 1279 push(@X,shift(@X)); # rotate(@X) 1280 } 1281 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 1282 &jne (".Lxop_00_47"); 1283 1284 for ($i=0; $i<16; ) { 1285 foreach(body_00_15()) { eval; } 1286 } 1287 1288 } else { # SHA512 1289 my @X = map("%xmm$_",(0..7)); 1290 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11)); 1291 1292$code.=<<___; 1293.align 16 1294.Lloop_xop: 1295 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1296 vmovdqu 0x00($inp),@X[0] 1297 lea $TABLE+0x80(%rip),$Tbl # size optimization 1298 vmovdqu 0x10($inp),@X[1] 1299 vmovdqu 0x20($inp),@X[2] 1300 vpshufb $t3,@X[0],@X[0] 1301 vmovdqu 0x30($inp),@X[3] 1302 vpshufb $t3,@X[1],@X[1] 1303 vmovdqu 0x40($inp),@X[4] 1304 vpshufb $t3,@X[2],@X[2] 1305 vmovdqu 0x50($inp),@X[5] 1306 vpshufb $t3,@X[3],@X[3] 1307 vmovdqu 0x60($inp),@X[6] 1308 vpshufb $t3,@X[4],@X[4] 1309 vmovdqu 0x70($inp),@X[7] 1310 vpshufb $t3,@X[5],@X[5] 1311 vpaddq -0x80($Tbl),@X[0],$t0 1312 vpshufb $t3,@X[6],@X[6] 1313 vpaddq -0x60($Tbl),@X[1],$t1 1314 vpshufb $t3,@X[7],@X[7] 1315 vpaddq -0x40($Tbl),@X[2],$t2 1316 vpaddq -0x20($Tbl),@X[3],$t3 1317 vmovdqa $t0,0x00(%rsp) 1318 vpaddq 0x00($Tbl),@X[4],$t0 1319 vmovdqa $t1,0x10(%rsp) 1320 vpaddq 0x20($Tbl),@X[5],$t1 1321 vmovdqa $t2,0x20(%rsp) 1322 vpaddq 0x40($Tbl),@X[6],$t2 1323 vmovdqa $t3,0x30(%rsp) 1324 vpaddq 0x60($Tbl),@X[7],$t3 1325 vmovdqa $t0,0x40(%rsp) 1326 mov $A,$a1 1327 vmovdqa $t1,0x50(%rsp) 1328 mov $B,$a3 1329 vmovdqa $t2,0x60(%rsp) 1330 xor $C,$a3 # magic 1331 vmovdqa $t3,0x70(%rsp) 1332 mov $E,$a0 1333 jmp .Lxop_00_47 1334 1335.align 16 1336.Lxop_00_47: 1337 add \$`16*2*$SZ`,$Tbl 1338___ 1339sub XOP_512_00_47 () { 1340my $j = shift; 1341my $body = shift; 1342my @X = @_; 1343my @insns = (&$body,&$body); # 52 instructions 1344 1345 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..2] 1346 eval(shift(@insns)); 1347 eval(shift(@insns)); 1348 &vpalignr ($t3,@X[5],@X[4],$SZ); # X[9..10] 1349 eval(shift(@insns)); 1350 eval(shift(@insns)); 1351 &vprotq ($t1,$t0,8*$SZ-$sigma0[1]); 1352 eval(shift(@insns)); 1353 eval(shift(@insns)); 1354 &vpsrlq ($t0,$t0,$sigma0[2]); 1355 eval(shift(@insns)); 1356 eval(shift(@insns)); 1357 &vpaddq (@X[0],@X[0],$t3); # X[0..1] += X[9..10] 1358 eval(shift(@insns)); 1359 eval(shift(@insns)); 1360 eval(shift(@insns)); 1361 eval(shift(@insns)); 1362 &vprotq ($t2,$t1,$sigma0[1]-$sigma0[0]); 1363 eval(shift(@insns)); 1364 eval(shift(@insns)); 1365 &vpxor ($t0,$t0,$t1); 1366 eval(shift(@insns)); 1367 
eval(shift(@insns)); 1368 eval(shift(@insns)); 1369 eval(shift(@insns)); 1370 &vprotq ($t3,@X[7],8*$SZ-$sigma1[1]); 1371 eval(shift(@insns)); 1372 eval(shift(@insns)); 1373 &vpxor ($t0,$t0,$t2); # sigma0(X[1..2]) 1374 eval(shift(@insns)); 1375 eval(shift(@insns)); 1376 &vpsrlq ($t2,@X[7],$sigma1[2]); 1377 eval(shift(@insns)); 1378 eval(shift(@insns)); 1379 &vpaddq (@X[0],@X[0],$t0); # X[0..1] += sigma0(X[1..2]) 1380 eval(shift(@insns)); 1381 eval(shift(@insns)); 1382 &vprotq ($t1,$t3,$sigma1[1]-$sigma1[0]); 1383 eval(shift(@insns)); 1384 eval(shift(@insns)); 1385 &vpxor ($t3,$t3,$t2); 1386 eval(shift(@insns)); 1387 eval(shift(@insns)); 1388 eval(shift(@insns)); 1389 eval(shift(@insns)); 1390 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) 1391 eval(shift(@insns)); 1392 eval(shift(@insns)); 1393 eval(shift(@insns)); 1394 eval(shift(@insns)); 1395 &vpaddq (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) 1396 eval(shift(@insns)); 1397 eval(shift(@insns)); 1398 eval(shift(@insns)); 1399 eval(shift(@insns)); 1400 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); 1401 foreach (@insns) { eval; } # remaining instructions 1402 &vmovdqa (16*$j."(%rsp)",$t2); 1403} 1404 1405 for ($i=0,$j=0; $j<8; $j++) { 1406 &XOP_512_00_47($j,\&body_00_15,@X); 1407 push(@X,shift(@X)); # rotate(@X) 1408 } 1409 &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); 1410 &jne (".Lxop_00_47"); 1411 1412 for ($i=0; $i<16; ) { 1413 foreach(body_00_15()) { eval; } 1414 } 1415} 1416$code.=<<___; 1417 mov $_ctx,$ctx 1418 mov $a1,$A 1419 1420 add $SZ*0($ctx),$A 1421 lea 16*$SZ($inp),$inp 1422 add $SZ*1($ctx),$B 1423 add $SZ*2($ctx),$C 1424 add $SZ*3($ctx),$D 1425 add $SZ*4($ctx),$E 1426 add $SZ*5($ctx),$F 1427 add $SZ*6($ctx),$G 1428 add $SZ*7($ctx),$H 1429 1430 cmp $_end,$inp 1431 1432 mov $A,$SZ*0($ctx) 1433 mov $B,$SZ*1($ctx) 1434 mov $C,$SZ*2($ctx) 1435 mov $D,$SZ*3($ctx) 1436 mov $E,$SZ*4($ctx) 1437 mov $F,$SZ*5($ctx) 1438 mov $G,$SZ*6($ctx) 1439 mov $H,$SZ*7($ctx) 1440 jb .Lloop_xop 1441 1442 mov $_rsp,%rsi 1443 vzeroupper 1444___ 1445$code.=<<___ if ($win64); 1446 movaps 16*$SZ+32(%rsp),%xmm6 1447 movaps 16*$SZ+48(%rsp),%xmm7 1448 movaps 16*$SZ+64(%rsp),%xmm8 1449 movaps 16*$SZ+80(%rsp),%xmm9 1450___ 1451$code.=<<___ if ($win64 && $SZ>4); 1452 movaps 16*$SZ+96(%rsp),%xmm10 1453 movaps 16*$SZ+112(%rsp),%xmm11 1454___ 1455$code.=<<___; 1456 mov -48(%rsi),%r15 1457 mov -40(%rsi),%r14 1458 mov -32(%rsi),%r13 1459 mov -24(%rsi),%r12 1460 mov -16(%rsi),%rbp 1461 mov -8(%rsi),%rbx 1462 lea (%rsi),%rsp 1463.Lepilogue_xop: 1464 ret 1465.size ${func}_xop,.-${func}_xop 1466___ 1467} 1468###################################################################### 1469# AVX+shrd code path 1470# 1471local *ror = sub { &shrd(@_[0],@_) }; 1472 1473$code.=<<___; 1474.type ${func}_avx,\@function,3 1475.align 64 1476${func}_avx: 1477.Lavx_shortcut: 1478 mov %rsp,%rax # copy %rsp 1479 push %rbx 1480 push %rbp 1481 push %r12 1482 push %r13 1483 push %r14 1484 push %r15 1485 shl \$4,%rdx # num*16 1486 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp 1487 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 1488 and \$-64,%rsp # align stack frame 1489 mov $ctx,$_ctx # save ctx, 1st arg 1490 mov $inp,$_inp # save inp, 2nd arh 1491 mov %rdx,$_end # save end pointer, "3rd" arg 1492 mov %rax,$_rsp # save copy of %rsp 1493___ 1494$code.=<<___ if ($win64); 1495 movaps %xmm6,16*$SZ+32(%rsp) 1496 movaps %xmm7,16*$SZ+48(%rsp) 1497 movaps %xmm8,16*$SZ+64(%rsp) 1498 movaps %xmm9,16*$SZ+80(%rsp) 1499___ 1500$code.=<<___ if ($win64 && $SZ>4); 1501 movaps %xmm10,16*$SZ+96(%rsp) 1502 movaps 
%xmm11,16*$SZ+112(%rsp) 1503___ 1504$code.=<<___; 1505.Lprologue_avx: 1506 1507 vzeroupper 1508 mov $SZ*0($ctx),$A 1509 mov $SZ*1($ctx),$B 1510 mov $SZ*2($ctx),$C 1511 mov $SZ*3($ctx),$D 1512 mov $SZ*4($ctx),$E 1513 mov $SZ*5($ctx),$F 1514 mov $SZ*6($ctx),$G 1515 mov $SZ*7($ctx),$H 1516___ 1517 if ($SZ==4) { # SHA256 1518 my @X = map("%xmm$_",(0..3)); 1519 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); 1520 1521$code.=<<___; 1522 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 1523 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 1524 jmp .Lloop_avx 1525.align 16 1526.Lloop_avx: 1527 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1528 vmovdqu 0x00($inp),@X[0] 1529 vmovdqu 0x10($inp),@X[1] 1530 vmovdqu 0x20($inp),@X[2] 1531 vmovdqu 0x30($inp),@X[3] 1532 vpshufb $t3,@X[0],@X[0] 1533 lea $TABLE(%rip),$Tbl 1534 vpshufb $t3,@X[1],@X[1] 1535 vpshufb $t3,@X[2],@X[2] 1536 vpaddd 0x00($Tbl),@X[0],$t0 1537 vpshufb $t3,@X[3],@X[3] 1538 vpaddd 0x20($Tbl),@X[1],$t1 1539 vpaddd 0x40($Tbl),@X[2],$t2 1540 vpaddd 0x60($Tbl),@X[3],$t3 1541 vmovdqa $t0,0x00(%rsp) 1542 mov $A,$a1 1543 vmovdqa $t1,0x10(%rsp) 1544 mov $B,$a3 1545 vmovdqa $t2,0x20(%rsp) 1546 xor $C,$a3 # magic 1547 vmovdqa $t3,0x30(%rsp) 1548 mov $E,$a0 1549 jmp .Lavx_00_47 1550 1551.align 16 1552.Lavx_00_47: 1553 sub \$`-16*2*$SZ`,$Tbl # size optimization 1554___ 1555sub Xupdate_256_AVX () { 1556 ( 1557 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4] 1558 '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12] 1559 '&vpsrld ($t2,$t0,$sigma0[0]);', 1560 '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12] 1561 '&vpsrld ($t3,$t0,$sigma0[2])', 1562 '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);', 1563 '&vpxor ($t0,$t3,$t2)', 1564 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15] 1565 '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);', 1566 '&vpxor ($t0,$t0,$t1)', 1567 '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);', 1568 '&vpxor ($t0,$t0,$t2)', 1569 '&vpsrld ($t2,$t3,$sigma1[2]);', 1570 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4]) 1571 '&vpsrlq ($t3,$t3,$sigma1[0]);', 1572 '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4]) 1573 '&vpxor ($t2,$t2,$t3);', 1574 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', 1575 '&vpxor ($t2,$t2,$t3)', 1576 '&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15]) 1577 '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15]) 1578 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17] 1579 '&vpsrld ($t2,$t3,$sigma1[2])', 1580 '&vpsrlq ($t3,$t3,$sigma1[0])', 1581 '&vpxor ($t2,$t2,$t3);', 1582 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', 1583 '&vpxor ($t2,$t2,$t3)', 1584 '&vpshufb ($t2,$t2,$t5)', 1585 '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17]) 1586 ); 1587} 1588 1589sub AVX_256_00_47 () { 1590my $j = shift; 1591my $body = shift; 1592my @X = @_; 1593my @insns = (&$body,&$body,&$body,&$body); # 104 instructions 1594 1595 foreach (Xupdate_256_AVX()) { # 29 instructions 1596 eval; 1597 eval(shift(@insns)); 1598 eval(shift(@insns)); 1599 eval(shift(@insns)); 1600 } 1601 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 1602 foreach (@insns) { eval; } # remaining instructions 1603 &vmovdqa (16*$j."(%rsp)",$t2); 1604} 1605 1606 for ($i=0,$j=0; $j<4; $j++) { 1607 &AVX_256_00_47($j,\&body_00_15,@X); 1608 push(@X,shift(@X)); # rotate(@X) 1609 } 1610 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); 1611 &jne (".Lavx_00_47"); 1612 1613 for ($i=0; $i<16; ) { 1614 foreach(body_00_15()) { eval; } 1615 } 1616 1617 } else { # SHA512 1618 my @X = map("%xmm$_",(0..7)); 1619 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11)); 1620 1621$code.=<<___; 1622 jmp .Lloop_avx 1623.align 16 1624.Lloop_avx: 1625 vmovdqa 
$TABLE+`$SZ*2*$rounds`(%rip),$t3 1626 vmovdqu 0x00($inp),@X[0] 1627 lea $TABLE+0x80(%rip),$Tbl # size optimization 1628 vmovdqu 0x10($inp),@X[1] 1629 vmovdqu 0x20($inp),@X[2] 1630 vpshufb $t3,@X[0],@X[0] 1631 vmovdqu 0x30($inp),@X[3] 1632 vpshufb $t3,@X[1],@X[1] 1633 vmovdqu 0x40($inp),@X[4] 1634 vpshufb $t3,@X[2],@X[2] 1635 vmovdqu 0x50($inp),@X[5] 1636 vpshufb $t3,@X[3],@X[3] 1637 vmovdqu 0x60($inp),@X[6] 1638 vpshufb $t3,@X[4],@X[4] 1639 vmovdqu 0x70($inp),@X[7] 1640 vpshufb $t3,@X[5],@X[5] 1641 vpaddq -0x80($Tbl),@X[0],$t0 1642 vpshufb $t3,@X[6],@X[6] 1643 vpaddq -0x60($Tbl),@X[1],$t1 1644 vpshufb $t3,@X[7],@X[7] 1645 vpaddq -0x40($Tbl),@X[2],$t2 1646 vpaddq -0x20($Tbl),@X[3],$t3 1647 vmovdqa $t0,0x00(%rsp) 1648 vpaddq 0x00($Tbl),@X[4],$t0 1649 vmovdqa $t1,0x10(%rsp) 1650 vpaddq 0x20($Tbl),@X[5],$t1 1651 vmovdqa $t2,0x20(%rsp) 1652 vpaddq 0x40($Tbl),@X[6],$t2 1653 vmovdqa $t3,0x30(%rsp) 1654 vpaddq 0x60($Tbl),@X[7],$t3 1655 vmovdqa $t0,0x40(%rsp) 1656 mov $A,$a1 1657 vmovdqa $t1,0x50(%rsp) 1658 mov $B,$a3 1659 vmovdqa $t2,0x60(%rsp) 1660 xor $C,$a3 # magic 1661 vmovdqa $t3,0x70(%rsp) 1662 mov $E,$a0 1663 jmp .Lavx_00_47 1664 1665.align 16 1666.Lavx_00_47: 1667 add \$`16*2*$SZ`,$Tbl 1668___ 1669sub Xupdate_512_AVX () { 1670 ( 1671 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2] 1672 '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10] 1673 '&vpsrlq ($t2,$t0,$sigma0[0])', 1674 '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10] 1675 '&vpsrlq ($t3,$t0,$sigma0[2])', 1676 '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);', 1677 '&vpxor ($t0,$t3,$t2)', 1678 '&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);', 1679 '&vpxor ($t0,$t0,$t1)', 1680 '&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);', 1681 '&vpxor ($t0,$t0,$t2)', 1682 '&vpsrlq ($t3,@X[7],$sigma1[2]);', 1683 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2]) 1684 '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);', 1685 '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2]) 1686 '&vpsrlq ($t1,@X[7],$sigma1[0]);', 1687 '&vpxor ($t3,$t3,$t2)', 1688 '&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);', 1689 '&vpxor ($t3,$t3,$t1)', 1690 '&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);', 1691 '&vpxor ($t3,$t3,$t2)', 1692 '&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15]) 1693 '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) 1694 ); 1695} 1696 1697sub AVX_512_00_47 () { 1698my $j = shift; 1699my $body = shift; 1700my @X = @_; 1701my @insns = (&$body,&$body); # 52 instructions 1702 1703 foreach (Xupdate_512_AVX()) { # 23 instructions 1704 eval; 1705 eval(shift(@insns)); 1706 eval(shift(@insns)); 1707 } 1708 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); 1709 foreach (@insns) { eval; } # remaining instructions 1710 &vmovdqa (16*$j."(%rsp)",$t2); 1711} 1712 1713 for ($i=0,$j=0; $j<8; $j++) { 1714 &AVX_512_00_47($j,\&body_00_15,@X); 1715 push(@X,shift(@X)); # rotate(@X) 1716 } 1717 &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); 1718 &jne (".Lavx_00_47"); 1719 1720 for ($i=0; $i<16; ) { 1721 foreach(body_00_15()) { eval; } 1722 } 1723} 1724$code.=<<___; 1725 mov $_ctx,$ctx 1726 mov $a1,$A 1727 1728 add $SZ*0($ctx),$A 1729 lea 16*$SZ($inp),$inp 1730 add $SZ*1($ctx),$B 1731 add $SZ*2($ctx),$C 1732 add $SZ*3($ctx),$D 1733 add $SZ*4($ctx),$E 1734 add $SZ*5($ctx),$F 1735 add $SZ*6($ctx),$G 1736 add $SZ*7($ctx),$H 1737 1738 cmp $_end,$inp 1739 1740 mov $A,$SZ*0($ctx) 1741 mov $B,$SZ*1($ctx) 1742 mov $C,$SZ*2($ctx) 1743 mov $D,$SZ*3($ctx) 1744 mov $E,$SZ*4($ctx) 1745 mov $F,$SZ*5($ctx) 1746 mov $G,$SZ*6($ctx) 1747 mov $H,$SZ*7($ctx) 1748 jb .Lloop_avx 1749 1750 mov $_rsp,%rsi 1751 vzeroupper 1752___ 1753$code.=<<___ if ($win64); 
1754 movaps 16*$SZ+32(%rsp),%xmm6 1755 movaps 16*$SZ+48(%rsp),%xmm7 1756 movaps 16*$SZ+64(%rsp),%xmm8 1757 movaps 16*$SZ+80(%rsp),%xmm9 1758___ 1759$code.=<<___ if ($win64 && $SZ>4); 1760 movaps 16*$SZ+96(%rsp),%xmm10 1761 movaps 16*$SZ+112(%rsp),%xmm11 1762___ 1763$code.=<<___; 1764 mov -48(%rsi),%r15 1765 mov -40(%rsi),%r14 1766 mov -32(%rsi),%r13 1767 mov -24(%rsi),%r12 1768 mov -16(%rsi),%rbp 1769 mov -8(%rsi),%rbx 1770 lea (%rsi),%rsp 1771.Lepilogue_avx: 1772 ret 1773.size ${func}_avx,.-${func}_avx 1774___ 1775 1776if ($avx>1) {{ 1777###################################################################### 1778# AVX2+BMI code path 1779# 1780my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp 1781my $PUSH8=8*2*$SZ; 1782use integer; 1783 1784sub bodyx_00_15 () { 1785 # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f 1786 ( 1787 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. 1788 1789 '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i] 1790 '&and ($a4,$e)', # f&e 1791 '&rorx ($a0,$e,$Sigma1[2])', 1792 '&rorx ($a2,$e,$Sigma1[1])', 1793 1794 '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past 1795 '&lea ($h,"($h,$a4)")', 1796 '&andn ($a4,$e,$g)', # ~e&g 1797 '&xor ($a0,$a2)', 1798 1799 '&rorx ($a1,$e,$Sigma1[0])', 1800 '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g) 1801 '&xor ($a0,$a1)', # Sigma1(e) 1802 '&mov ($a2,$a)', 1803 1804 '&rorx ($a4,$a,$Sigma0[2])', 1805 '&lea ($h,"($h,$a0)")', # h+=Sigma1(e) 1806 '&xor ($a2,$b)', # a^b, b^c in next round 1807 '&rorx ($a1,$a,$Sigma0[1])', 1808 1809 '&rorx ($a0,$a,$Sigma0[0])', 1810 '&lea ($d,"($d,$h)")', # d+=h 1811 '&and ($a3,$a2)', # (b^c)&(a^b) 1812 '&xor ($a1,$a4)', 1813 1814 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) 1815 '&xor ($a1,$a0)', # Sigma0(a) 1816 '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c) 1817 '&mov ($a4,$e)', # copy of f in future 1818 1819 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' 1820 ); 1821 # and at the finish one has to $a+=$a1 1822} 1823 1824$code.=<<___; 1825.type ${func}_avx2,\@function,3 1826.align 64 1827${func}_avx2: 1828.Lavx2_shortcut: 1829 mov %rsp,%rax # copy %rsp 1830 push %rbx 1831 push %rbp 1832 push %r12 1833 push %r13 1834 push %r14 1835 push %r15 1836 sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp 1837 shl \$4,%rdx # num*16 1838 and \$-256*$SZ,%rsp # align stack frame 1839 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ 1840 add \$`2*$SZ*($rounds-8)`,%rsp 1841 mov $ctx,$_ctx # save ctx, 1st arg 1842 mov $inp,$_inp # save inp, 2nd arh 1843 mov %rdx,$_end # save end pointer, "3rd" arg 1844 mov %rax,$_rsp # save copy of %rsp 1845___ 1846$code.=<<___ if ($win64); 1847 movaps %xmm6,16*$SZ+32(%rsp) 1848 movaps %xmm7,16*$SZ+48(%rsp) 1849 movaps %xmm8,16*$SZ+64(%rsp) 1850 movaps %xmm9,16*$SZ+80(%rsp) 1851___ 1852$code.=<<___ if ($win64 && $SZ>4); 1853 movaps %xmm10,16*$SZ+96(%rsp) 1854 movaps %xmm11,16*$SZ+112(%rsp) 1855___ 1856$code.=<<___; 1857.Lprologue_avx2: 1858 1859 vzeroupper 1860 sub \$-16*$SZ,$inp # inp++, size optimization 1861 mov $SZ*0($ctx),$A 1862 mov $inp,%r12 # borrow $T1 1863 mov $SZ*1($ctx),$B 1864 cmp %rdx,$inp # $_end 1865 mov $SZ*2($ctx),$C 1866 cmove %rsp,%r12 # next block or random data 1867 mov $SZ*3($ctx),$D 1868 mov $SZ*4($ctx),$E 1869 mov $SZ*5($ctx),$F 1870 mov $SZ*6($ctx),$G 1871 mov $SZ*7($ctx),$H 1872___ 1873 if ($SZ==4) { # SHA256 1874 my @X = map("%ymm$_",(0..3)); 1875 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9)); 1876 1877$code.=<<___; 1878 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 1879 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 1880 jmp 
.Loop_avx2 1881.align 16 1882.Loop_avx2: 1883 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 1884 vmovdqu -16*$SZ+0($inp),%xmm0 1885 vmovdqu -16*$SZ+16($inp),%xmm1 1886 vmovdqu -16*$SZ+32($inp),%xmm2 1887 vmovdqu -16*$SZ+48($inp),%xmm3 1888 #mov $inp,$_inp # offload $inp 1889 vinserti128 \$1,(%r12),@X[0],@X[0] 1890 vinserti128 \$1,16(%r12),@X[1],@X[1] 1891 vpshufb $t3,@X[0],@X[0] 1892 vinserti128 \$1,32(%r12),@X[2],@X[2] 1893 vpshufb $t3,@X[1],@X[1] 1894 vinserti128 \$1,48(%r12),@X[3],@X[3] 1895 1896 lea $TABLE(%rip),$Tbl 1897 vpshufb $t3,@X[2],@X[2] 1898 vpaddd 0x00($Tbl),@X[0],$t0 1899 vpshufb $t3,@X[3],@X[3] 1900 vpaddd 0x20($Tbl),@X[1],$t1 1901 vpaddd 0x40($Tbl),@X[2],$t2 1902 vpaddd 0x60($Tbl),@X[3],$t3 1903 vmovdqa $t0,0x00(%rsp) 1904 xor $a1,$a1 1905 vmovdqa $t1,0x20(%rsp) 1906 lea -$PUSH8(%rsp),%rsp 1907 mov $B,$a3 1908 vmovdqa $t2,0x00(%rsp) 1909 xor $C,$a3 # magic 1910 vmovdqa $t3,0x20(%rsp) 1911 mov $F,$a4 1912 sub \$-16*2*$SZ,$Tbl # size optimization 1913 jmp .Lavx2_00_47 1914 1915.align 16 1916.Lavx2_00_47: 1917___ 1918 1919sub AVX2_256_00_47 () { 1920my $j = shift; 1921my $body = shift; 1922my @X = @_; 1923my @insns = (&$body,&$body,&$body,&$body); # 96 instructions 1924my $base = "+2*$PUSH8(%rsp)"; 1925 1926 &lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0); 1927 foreach (Xupdate_256_AVX()) { # 29 instructions 1928 eval; 1929 eval(shift(@insns)); 1930 eval(shift(@insns)); 1931 eval(shift(@insns)); 1932 } 1933 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); 1934 foreach (@insns) { eval; } # remaining instructions 1935 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); 1936} 1937 1938 for ($i=0,$j=0; $j<4; $j++) { 1939 &AVX2_256_00_47($j,\&bodyx_00_15,@X); 1940 push(@X,shift(@X)); # rotate(@X) 1941 } 1942 &lea ($Tbl,16*2*$SZ."($Tbl)"); 1943 &cmpb (($SZ-1)."($Tbl)",0); 1944 &jne (".Lavx2_00_47"); 1945 1946 for ($i=0; $i<16; ) { 1947 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; 1948 foreach(bodyx_00_15()) { eval; } 1949 } 1950 } else { # SHA512 1951 my @X = map("%ymm$_",(0..7)); 1952 my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11)); 1953 1954$code.=<<___; 1955 jmp .Loop_avx2 1956.align 16 1957.Loop_avx2: 1958 vmovdqu -16*$SZ($inp),%xmm0 1959 vmovdqu -16*$SZ+16($inp),%xmm1 1960 vmovdqu -16*$SZ+32($inp),%xmm2 1961 lea $TABLE+0x80(%rip),$Tbl # size optimization 1962 vmovdqu -16*$SZ+48($inp),%xmm3 1963 vmovdqu -16*$SZ+64($inp),%xmm4 1964 vmovdqu -16*$SZ+80($inp),%xmm5 1965 vmovdqu -16*$SZ+96($inp),%xmm6 1966 vmovdqu -16*$SZ+112($inp),%xmm7 1967 #mov $inp,$_inp # offload $inp 1968 vmovdqa `$SZ*2*$rounds-0x80`($Tbl),$t2 1969 vinserti128 \$1,(%r12),@X[0],@X[0] 1970 vinserti128 \$1,16(%r12),@X[1],@X[1] 1971 vpshufb $t2,@X[0],@X[0] 1972 vinserti128 \$1,32(%r12),@X[2],@X[2] 1973 vpshufb $t2,@X[1],@X[1] 1974 vinserti128 \$1,48(%r12),@X[3],@X[3] 1975 vpshufb $t2,@X[2],@X[2] 1976 vinserti128 \$1,64(%r12),@X[4],@X[4] 1977 vpshufb $t2,@X[3],@X[3] 1978 vinserti128 \$1,80(%r12),@X[5],@X[5] 1979 vpshufb $t2,@X[4],@X[4] 1980 vinserti128 \$1,96(%r12),@X[6],@X[6] 1981 vpshufb $t2,@X[5],@X[5] 1982 vinserti128 \$1,112(%r12),@X[7],@X[7] 1983 1984 vpaddq -0x80($Tbl),@X[0],$t0 1985 vpshufb $t2,@X[6],@X[6] 1986 vpaddq -0x60($Tbl),@X[1],$t1 1987 vpshufb $t2,@X[7],@X[7] 1988 vpaddq -0x40($Tbl),@X[2],$t2 1989 vpaddq -0x20($Tbl),@X[3],$t3 1990 vmovdqa $t0,0x00(%rsp) 1991 vpaddq 0x00($Tbl),@X[4],$t0 1992 vmovdqa $t1,0x20(%rsp) 1993 vpaddq 0x20($Tbl),@X[5],$t1 1994 vmovdqa $t2,0x40(%rsp) 1995 vpaddq 0x40($Tbl),@X[6],$t2 1996 vmovdqa $t3,0x60(%rsp) 1997 lea -$PUSH8(%rsp),%rsp 1998 vpaddq 0x60($Tbl),@X[7],$t3 1999 vmovdqa $t0,0x00(%rsp) 2000 xor $a1,$a1 
sub AVX2_512_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body);			# 48 instructions
my $base = "+2*$PUSH8(%rsp)";

	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%4)==0);
	foreach (Xupdate_512_AVX()) {		# 23 instructions
	    eval;
	    if ($_ !~ /\;$/) {
		eval(shift(@insns));
		eval(shift(@insns));
		eval(shift(@insns));
	    }
	}
	&vpaddq	($t2,@X[0],16*2*$j-0x80."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
}

    for ($i=0,$j=0; $j<8; $j++) {
	&AVX2_512_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1-0x80)."($Tbl)",0);
	&jne	(".Lavx2_00_47");

    for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }
    }
}
$code.=<<___;
	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
	add	$a1,$A
	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl

	add	$SZ*0($ctx),$A
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	add	$SZ*6($ctx),$G
	add	$SZ*7($ctx),$H

	mov	$A,$SZ*0($ctx)
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)

	cmp	`$PUSH8+2*8`($Tbl),$inp	# $_end
	je	.Ldone_avx2

	xor	$a1,$a1
	mov	$B,$a3
	xor	$C,$a3			# magic
	mov	$F,$a4
	jmp	.Lower_avx2
.align	16
.Lower_avx2:
___
	for ($i=0; $i<8; ) {
	    my $base="+16($Tbl)";
	    foreach(bodyx_00_15()) { eval; }
	}
$code.=<<___;
	lea	-$PUSH8($Tbl),$Tbl
	cmp	%rsp,$Tbl
	jae	.Lower_avx2

	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
	add	$a1,$A
	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp

	add	$SZ*0($ctx),$A
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	lea	`2*16*$SZ`($inp),$inp	# inp+=2
	add	$SZ*6($ctx),$G
	mov	$inp,%r12
	add	$SZ*7($ctx),$H
	cmp	$_end,$inp

	mov	$A,$SZ*0($ctx)
	cmove	%rsp,%r12		# next block or stale data
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)

	jbe	.Loop_avx2
	lea	(%rsp),$Tbl

.Ldone_avx2:
	lea	($Tbl),%rsp
	mov	$_rsp,%rsi
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	16*$SZ+32(%rsp),%xmm6
	movaps	16*$SZ+48(%rsp),%xmm7
	movaps	16*$SZ+64(%rsp),%xmm8
	movaps	16*$SZ+80(%rsp),%xmm9
___
$code.=<<___ if ($win64 && $SZ>4);
	movaps	16*$SZ+96(%rsp),%xmm10
	movaps	16*$SZ+112(%rsp),%xmm11
___
$code.=<<___;
	mov	-48(%rsi),%r15
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lepilogue_avx2:
	ret
.size	${func}_avx2,.-${func}_avx2
___
}}
}}}}}

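# The next block is emitted for Win64 builds only: a language-specific SEH
# handler (se_handler) plus .pdata/.xdata tables binding it to each code
# path.  As a minimal illustration (taking sha512_block_data_order as an
# example value of $func), the unwind info emitted further down has the
# shape:
#
#	.LSEH_info_sha512_block_data_order:
#		.byte	9,0,0,0			# version 1, UNW_FLAG_EHANDLER
#		.rva	se_handler		# language-specific handler
#		.rva	.Lprologue,.Lepilogue	# HandlerData[0..1]
#
# se_handler reads HandlerData[0] and HandlerData[1] as image-relative
# prologue/epilogue labels; between them the frame is considered fully set
# up, so the handler recovers the caller's stack pointer from the saved
# $_rsp slot and copies the non-volatile GPRs (and, on the SIMD paths,
# %xmm6 and up) back into the CONTEXT record before handing control to
# RtlVirtualUnwind.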
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue
___
$code.=<<___ if ($avx>1);
	lea	.Lavx2_shortcut(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
	jb	.Lnot_in_avx2

	and	\$-256*$SZ,%rax
	add	\$`2*$SZ*($rounds-8)`,%rax
.Lnot_in_avx2:
___
$code.=<<___;
	mov	%rax,%rsi		# put aside Rsp
	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx
	jb	.Lin_prologue		# non-AVX code

	lea	16*$SZ+4*8(%rsi),%rsi	# Xmm6 save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$`$SZ==4?8:12`,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT) in quadwords
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler
___

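# The SHA Extensions path, when enabled via $shaext, leaves the non-volatile
# GPRs alone, so it can use a much simpler handler: shaext_handler below only
# has to copy the five saved registers %xmm6-%xmm10 (ten quadwords) back into
# the CONTEXT record when the fault lands between .Lprologue_shaext and
# .Lepilogue_shaext.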
$code.=<<___ if ($SZ==4 && $shaext);
.type	shaext_handler,\@abi-omnipotent
.align	16
shaext_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue_shaext
	jb	.Lin_prologue

	lea	.Lepilogue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue_shaext
	jae	.Lin_prologue

	lea	-8-5*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$10,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lin_prologue
.size	shaext_handler,.-shaext_handler
___

$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_$func
	.rva	.LSEH_end_$func
	.rva	.LSEH_info_$func
___
$code.=<<___ if ($SZ==4 && $shaext);
	.rva	.LSEH_begin_${func}_shaext
	.rva	.LSEH_end_${func}_shaext
	.rva	.LSEH_info_${func}_shaext
___
$code.=<<___ if ($SZ==4);
	.rva	.LSEH_begin_${func}_ssse3
	.rva	.LSEH_end_${func}_ssse3
	.rva	.LSEH_info_${func}_ssse3
___
$code.=<<___ if ($avx && $SZ==8);
	.rva	.LSEH_begin_${func}_xop
	.rva	.LSEH_end_${func}_xop
	.rva	.LSEH_info_${func}_xop
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_${func}_avx
	.rva	.LSEH_end_${func}_avx
	.rva	.LSEH_info_${func}_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_${func}_avx2
	.rva	.LSEH_end_${func}_avx2
	.rva	.LSEH_info_${func}_avx2
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_$func:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue,.Lepilogue			# HandlerData[]
___
$code.=<<___ if ($SZ==4 && $shaext);
.LSEH_info_${func}_shaext:
	.byte	9,0,0,0
	.rva	shaext_handler
___
$code.=<<___ if ($SZ==4);
.LSEH_info_${func}_ssse3:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx && $SZ==8);
.LSEH_info_${func}_xop:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_${func}_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
___
}

sub sha256op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

    if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
	my @opcode=(0x0f,0x38);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".$_[0];
    }
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;

	print $_,"\n";
}
close STDOUT;
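# For reference, a worked example of what sha256op38() produces: SHA-NI
# mnemonics are rewritten as raw byte sequences so that even assemblers
# without SHA extension support can consume the generated file.  For
# instance,
#
#	sha256rnds2	%xmm0,%xmm1
#
# becomes
#
#	.byte	0x0f,0x38,0xcb,0xc8
#
# i.e. the 0x0f,0x38 escape, the per-mnemonic opcode from %opcodelet and a
# register-to-register ModR/M byte 0xc0|src|(dst<<3).  Operands that do not
# match a plain %xmm0-%xmm7 pair are passed through unchanged.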