#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# sha1_block procedure for x86_64.
#
# It was brought to my attention that on EM64T compiler-generated code
# was far behind 32-bit assembler implementation. This is unlike on
# Opteron where compiler-generated code was only 15% behind 32-bit
# assembler, which originally made it hard to motivate the effort.
# There was suggestion to mechanically translate 32-bit code, but I
# dismissed it, reasoning that x86_64 offers enough register bank
# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
# implementation:-) However! While 64-bit code does perform better
# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
# x86_64 does offer larger *addressable* bank, but out-of-order core
# reaches for even more registers through dynamic aliasing, and EM64T
# core must have managed to run-time optimize even 32-bit code just as
# good as 64-bit one. Performance improvement is summarized in the
# following table:
#
#		gcc 3.4		32-bit asm	cycles/byte
# Opteron	+45%		+20%		6.8
# Xeon P4	+65%		+0%		9.9
# Core2		+60%		+10%		7.0

# August 2009.
#
# The code was revised to minimize code size and to maximize
# "distance" between instructions producing input to 'lea'
# instruction and the 'lea' instruction itself, which is essential
# for Intel Atom core.

# October 2010.
#
# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
# is to offload message schedule denoted by Wt in NIST specification,
# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
# for background and implementation details. The only difference from
# 32-bit code is that 64-bit code doesn't have to spill @X[] elements
# to free temporary registers.

# April 2011.
#
# Add AVX code path. See sha1-586.pl for further information.

######################################################################
# Current performance is summarized in following table. Numbers are
# CPU clock cycles spent to process single byte (less is better).
#
#		x86_64		SSSE3		AVX
# P4		9.8		-
# Opteron	6.6		-
# Core2		6.7		6.1/+10%	-
# Atom		11.0		9.7/+13%	-
# Westmere	7.1		5.6/+27%	-
# Sandy Bridge	7.9		6.3/+25%	5.2/+51%

# Command line: [flavour] [output]. A first argument containing a dot
# is taken to be the output file name, in which case no flavour is set.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 output is selected by a nasm/masm/mingw64 flavour or a .asm
# output name; it gates the %xmm6-%xmm10 save/restore emitted below.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator next to this script or under
# ../../perlasm/ relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Enable the AVX code path only if the assembler in use reports a
# recent enough version: GNU as >= 2.19, nasm >= 2.09, ml64 >= 10.
# NOTE(review): the GNU as and nasm versions are compared as plain
# numbers, so e.g. "2.9" would compare greater than 2.19 - confirm this
# is acceptable for the toolchains in use.
$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
	   $1>=2.19);
$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
	   $1>=2.09);
$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
	   $1>=10);

# Everything printed to STDOUT from here on is piped through the
# translator. Fail loudly if the pipe cannot be set up, and quote the
# translator path so directories containing spaces keep working.
open OUT,"| \"$^X\" \"$xlate\" $flavour $output"
	or die "can't call $xlate: $!";
*STDOUT=*OUT;

$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%rdx";	# 3rd arg

# reassign arguments in order to produce more compact code
$ctx="%r8";
$inp="%r9";
$num="%r10";

# Register allocation for the integer-only code path.
$t0="%eax";
$t1="%ebx";
$t2="%ecx";
@xi=("%edx","%ebp");	# current / next message word, swapped every round
$A="%esi";
$B="%edi";
$C="%r11d";
$D="%r12d";
$E="%r13d";

@V=($A,$B,$C,$D,$E);

# Emit one integer-only round for rounds 0..19.
# Arguments: round index $i followed by the five working registers in
# their current rotation. For $i<15 the next message word is loaded
# from the input, byte-swapped and saved in the 16-word buffer on the
# stack; for $i>=15 the next word is derived from four entries of that
# buffer (xor, then rotate left by 1). @xi is swapped on exit so that
# the word prepared here is $xi[0] for the following round.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i==0);
	mov	`4*$i`($inp),$xi[0]
	bswap	$xi[0]
	mov	$xi[0],`4*$i`(%rsp)
___
$code.=<<___ if ($i<15);
	mov	$c,$t0
	mov	`4*$j`($inp),$xi[1]
	mov	$a,$t2
	xor	$d,$t0
	bswap	$xi[1]
	rol	\$5,$t2
	lea	0x5a827999($xi[0],$e),$e
	and	$b,$t0
	mov	$xi[1],`4*$j`(%rsp)
	add	$t2,$e
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$e
___
$code.=<<___ if ($i>=15);
	mov	`4*($j%16)`(%rsp),$xi[1]
	mov	$c,$t0
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	rol	\$1,$xi[1]
	add	$t2,$e
	rol	\$30,$b
	mov	$xi[1],`4*($j%16)`(%rsp)
	add	$t0,$e
___
unshift(@xi,pop(@xi));
}

# Emit one integer-only round for rounds 20..39 and 60..79; the round
# constant is picked by $i<40. The very last round ($i==79) emits no
# message-schedule code, and the freshly computed word is written back
# to the stack buffer only while $i<76.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
$code.=<<___ if ($i<79);
	mov	`4*($j%16)`(%rsp),$xi[1]
	mov	$c,$t0
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$b,$t0
	rol	\$5,$t2
	lea	$K($xi[0],$e),$e
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	add	$t2,$e
	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
	rol	\$30,$b
	add	$t0,$e
	rol	\$1,$xi[1]
___
$code.=<<___ if ($i<76);
	mov	$xi[1],`4*($j%16)`(%rsp)
___
$code.=<<___ if ($i==79);
	mov	$c,$t0
	mov	$a,$t2
	xor	$b,$t0
	lea	$K($xi[0],$e),$e
	rol	\$5,$t2
	xor	$d,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
___
unshift(@xi,pop(@xi));
}

# Emit one integer-only round for rounds 40..59, including the
# message-schedule update for the following round.
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___;
	mov	`4*($j%16)`(%rsp),$xi[1]
	mov	$c,$t0
	mov	$c,$t1
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	and	$d,$t0
	mov	$a,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	xor	$d,$t1
	lea	0x8f1bbcdc($xi[0],$e),$e
	rol	\$5,$t2
	xor	`4*(($j+13)%16)`(%rsp),$xi[1]
	add	$t0,$e
	and	$b,$t1
	rol	\$1,$xi[1]
	add	$t1,$e
	rol	\$30,$b
	mov	$xi[1],`4*($j%16)`(%rsp)
	add	$t2,$e
___
unshift(@xi,pop(@xi));
}

# Public entry point: dispatch on OPENSSL_ia32cap_P to the AVX or SSSE3
# code path, falling through to the integer-only code at .Lialu when
# the SSSE3 bit is clear. The AVX test is emitted only if the assembler
# probe set $avx.
$code.=<<___;
.text
.extern	OPENSSL_ia32cap_P

.globl	sha1_block_data_order
.type	sha1_block_data_order,\@function,3
.align	16
sha1_block_data_order:
	mov	OPENSSL_ia32cap_P+0(%rip),%r9d
	mov	OPENSSL_ia32cap_P+4(%rip),%r8d
	test	\$`1<<9`,%r8d		# check SSSE3 bit
	jz	.Lialu
___
$code.=<<___ if ($avx);
	and	\$`1<<28`,%r8d		# mask AVX bit
	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
	or	%r9d,%r8d
	cmp	\$`1<<28|1<<30`,%r8d
	je	_avx_shortcut
___
$code.=<<___;
	jmp	_ssse3_shortcut

.align	16
.Lialu:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	mov	%rsp,%r11
	mov	%rdi,$ctx	# reassigned argument
	sub	\$`8+16*4`,%rsp
	mov	%rsi,$inp	# reassigned argument
	and	\$-64,%rsp
	mov	%rdx,$num	# reassigned argument
	mov	%r11,`16*4`(%rsp)
.Lprologue:

	mov	0($ctx),$A
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
	jmp	.Lloop

.align	16
.Lloop:
___
# 80 fully unrolled rounds; @V is rotated after every round so each
# generator call sees the working registers in their current roles.
for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	add	0($ctx),$A
	add	4($ctx),$B
	add	8($ctx),$C
	add	12($ctx),$D
	add	16($ctx),$E
	mov	$A,0($ctx)
	mov	$B,4($ctx)
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)

	sub	\$1,$num
	lea	`16*4`($inp),$inp
	jnz	.Lloop

	mov	`16*4`(%rsp),%rsi
	mov	(%rsi),%r13
	mov	8(%rsi),%r12
	mov	16(%rsi),%rbp
	mov	24(%rsi),%rbx
	lea	32(%rsi),%rsp
.Lepilogue:
	ret
.size	sha1_block_data_order,.-sha1_block_data_order
___
{{{
# SSSE3 code path. State private to this scope:
#   $Xi      index of the next 4-word message group to produce
#   @X       rotating window of eight %xmm registers holding message words
#   @Tx      three rotating %xmm temporaries (one carries the current K)
#   @V, @T   integer working registers and two scratch registers
#   $j       round counter used to address the X[]+K slots on the stack
my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");
my $j=0;
my $K_XX_XX="%r11";

# Rotates are issued through code refs so that the round bodies below
# can be shared with a code path that encodes rotates differently.
my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };

$code.=<<___;
.type	sha1_block_data_order_ssse3,\@function,3
.align	16
sha1_block_data_order_ssse3:
_ssse3_shortcut:
	push	%rbx
	push	%rbp
	push	%r12
	lea	`-64-($win64?5*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,64+0(%rsp)
	movaps	%xmm7,64+16(%rsp)
	movaps	%xmm8,64+32(%rsp)
	movaps	%xmm9,64+48(%rsp)
	movaps	%xmm10,64+64(%rsp)
.Lprologue_ssse3:
___
$code.=<<___;
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E

	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
	movdqa	0($K_XX_XX),@Tx[1]	# K_00_19
	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	movdqu	16($inp),@X[-3&7]
	movdqu	32($inp),@X[-2&7]
	movdqu	48($inp),@X[-1&7]
	pshufb	@X[2],@X[-4&7]		# byte swap
	add	\$64,$inp
	pshufb	@X[2],@X[-3&7]
	pshufb	@X[2],@X[-2&7]
	pshufb	@X[2],@X[-1&7]
	paddd	@Tx[1],@X[-4&7]		# add K_00_19
	paddd	@Tx[1],@X[-3&7]
	paddd	@Tx[1],@X[-2&7]
	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
	psubd	@Tx[1],@X[-4&7]		# restore X[]
	movdqa	@X[-3&7],16(%rsp)
	psubd	@Tx[1],@X[-3&7]
	movdqa	@X[-2&7],32(%rsp)
	psubd	@Tx[1],@X[-2&7]
	jmp	.Loop_ssse3
___

# Catch-all for otherwise undefined &opcode(...) calls: appends one
# instruction line to $code. Callers pass operands destination-first;
# the last argument is emitted first and the remaining ones in reverse
# order, and a purely numeric last argument is prefixed with '$' to
# mark it as an immediate.
sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}

# Emit the SIMD message-schedule update for one 4-word group in the
# 16..31 range, interleaved with four integer rounds. $body is a code
# ref returning a list of Perl snippets (one per emitted instruction);
# they are eval'ed one at a time between the SIMD instructions, so the
# exact position of every eval below determines instruction order.
sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	&movdqa	(@X[0],@X[-3&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(@Tx[0],@X[-1&7]);
	&palignr(@X[0],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&movdqa	(@Tx[2],@X[0]);
	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
	&paddd	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(@Tx[1],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[2],30);
	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslld	(@Tx[1],2);
	&pxor	(@X[0],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2

	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

# Emit the SIMD message-schedule update for one 4-word group from 32
# onwards, interleaved with four integer rounds supplied by $body. The
# K constant is either carried over in a register or reloaded from the
# table whenever $Xi is a multiple of 5.
sub Xupdate_ssse3_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
  my ($a,$b,$c,$d,$e);

	&movdqa	(@Tx[0],@X[-1&7])	if ($Xi==8);
	 eval(shift(@insns));		# body_20_39
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
	&palignr(@Tx[0],@X[-2&7],8);	# compose "X[-6]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	# NOTE(review): the round bodies spell rotates as '&$_rol'/'&$_ror',
	# which /&ro[rl]/ does not match, so this guard looks always-true -
	# confirm before relying on it. Left as is to keep the emitted
	# instruction order unchanged.
	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	if ($Xi%5) {
	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else {			# ... or load next one
	  &movdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
	}
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&movdqa	(@Tx[0],@X[0]);
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&pslld	(@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	&psrld	(@Tx[0],30);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	  &movdqa	(@Tx[1],@X[0])	if ($Xi<19);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

# Emit four integer rounds together with the last X[]+K transfer, then
# the end-of-input test (jump to .Ldone_ssse3 when $inp equals $num)
# and the loads for the next 64-byte block. Resets $Xi to 0.
sub Xuplast_ssse3_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	(".Ldone_ssse3");

	unshift(@Tx,pop(@Tx));

	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
	&movdqa	(@Tx[1],"0($K_XX_XX)");		# K_00_19
	&movdqu	(@X[-4&7],"0($inp)");		# load input
	&movdqu	(@X[-3&7],"16($inp)");
	&movdqu	(@X[-2&7],"32($inp)");
	&movdqu	(@X[-1&7],"48($inp)");
	&pshufb	(@X[-4&7],@X[2]);		# byte swap
	&add	($inp,64);

  $Xi=0;
}

# Emit four integer rounds of the current block interleaved with the
# byte swap and X[]+K stack transfer for one 16-byte group of the next
# block (group selected by $Xi, which is advanced on exit).
sub Xloop_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&pshufb	(@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&paddd	(@X[($Xi-4)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psubd	(@X[($Xi-4)&7],@Tx[1]);

	foreach (@insns) { eval; }
  $Xi++;
}

# Emit four integer rounds with no SIMD work interleaved (used after
# the last input block, when there is no next block to prepare).
sub Xtail_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

# The three round bodies below each return a list of Perl snippets, one
# per emitted instruction, for a single round. They are eval'ed by the
# X* generators above, which provide $a..$e as lexicals; the final
# snippet of each round also rotates @V and @T (and body_00_19 bumps $j
# there, while the other two bump it in the X[]+K add).
sub body_00_19 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)");',	# X[]+K xfer
	'&xor	($c,$d);',
	'&mov	(@T[1],$a);',	# $b in next round
	'&$_rol	($a,5);',
	'&and	(@T[0],$c);',	# ($b&($c^$d))
	'&xor	($c,$d);',	# restore $c
	'&xor	(@T[0],$d);',
	'&add	($e,$a);',
	'&$_ror	($b,$j?7:2);',	# $b>>>2
	'&add	($e,@T[0]);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_20_39 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
	'&xor	(@T[0],$d);',	# ($b^$d)
	'&mov	(@T[1],$a);',	# $b in next round
	'&$_rol	($a,5);',
	'&xor	(@T[0],$c);',	# ($b^$d^$c)
	'&add	($e,$a);',
	'&$_ror	($b,7);',	# $b>>>2
	'&add	($e,@T[0]);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_40_59 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&mov	(@T[1],$c);',
	'&xor	($c,$d);',
	'&add	($e,eval(4*($j++&15))."(%rsp)");',	# X[]+K xfer
	'&and	(@T[1],$d);',
	'&and	(@T[0],$c);',	# ($b&($c^$d))
	'&$_ror	($b,7);',	# $b>>>2
	'&add	($e,@T[1]);',
	'&mov	(@T[1],$a);',	# $b in next round
	'&$_rol	($a,5);',
	'&add	($e,@T[0]);',
	'&xor	($c,$d);',	# restore $c
	'&add	($e,$a);'	.'unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}
$code.=<<___;
.align	16
.Loop_ssse3:
___
	# Each call below emits four rounds plus one 4-word schedule update.
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
&Xupdate_ssse3_32_79(\&body_20_39); 653238405Sjkim &Xuplast_ssse3_80(\&body_20_39); # can jump to "done" 654238405Sjkim 655238405Sjkim $saved_j=$j; @saved_V=@V; 656238405Sjkim 657238405Sjkim &Xloop_ssse3(\&body_20_39); 658238405Sjkim &Xloop_ssse3(\&body_20_39); 659238405Sjkim &Xloop_ssse3(\&body_20_39); 660238405Sjkim 661238405Sjkim$code.=<<___; 662238405Sjkim add 0($ctx),$A # update context 663238405Sjkim add 4($ctx),@T[0] 664238405Sjkim add 8($ctx),$C 665238405Sjkim add 12($ctx),$D 666238405Sjkim mov $A,0($ctx) 667238405Sjkim add 16($ctx),$E 668238405Sjkim mov @T[0],4($ctx) 669238405Sjkim mov @T[0],$B # magic seed 670238405Sjkim mov $C,8($ctx) 671238405Sjkim mov $D,12($ctx) 672238405Sjkim mov $E,16($ctx) 673238405Sjkim jmp .Loop_ssse3 674238405Sjkim 675238405Sjkim.align 16 676238405Sjkim.Ldone_ssse3: 677238405Sjkim___ 678238405Sjkim $j=$saved_j; @V=@saved_V; 679238405Sjkim 680238405Sjkim &Xtail_ssse3(\&body_20_39); 681238405Sjkim &Xtail_ssse3(\&body_20_39); 682238405Sjkim &Xtail_ssse3(\&body_20_39); 683238405Sjkim 684238405Sjkim$code.=<<___; 685238405Sjkim add 0($ctx),$A # update context 686238405Sjkim add 4($ctx),@T[0] 687238405Sjkim add 8($ctx),$C 688238405Sjkim mov $A,0($ctx) 689238405Sjkim add 12($ctx),$D 690238405Sjkim mov @T[0],4($ctx) 691238405Sjkim add 16($ctx),$E 692238405Sjkim mov $C,8($ctx) 693238405Sjkim mov $D,12($ctx) 694238405Sjkim mov $E,16($ctx) 695238405Sjkim___ 696238405Sjkim$code.=<<___ if ($win64); 697238405Sjkim movaps 64+0(%rsp),%xmm6 698238405Sjkim movaps 64+16(%rsp),%xmm7 699238405Sjkim movaps 64+32(%rsp),%xmm8 700238405Sjkim movaps 64+48(%rsp),%xmm9 701238405Sjkim movaps 64+64(%rsp),%xmm10 702238405Sjkim___ 703238405Sjkim$code.=<<___; 704238405Sjkim lea `64+($win64?5*16:0)`(%rsp),%rsi 705238405Sjkim mov 0(%rsi),%r12 706238405Sjkim mov 8(%rsi),%rbp 707238405Sjkim mov 16(%rsi),%rbx 708238405Sjkim lea 24(%rsi),%rsp 709238405Sjkim.Lepilogue_ssse3: 710238405Sjkim ret 711238405Sjkim.size 
sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3 712238405Sjkim___ 713238405Sjkim 714238405Sjkimif ($avx) { 715238405Sjkimmy $Xi=4; 716238405Sjkimmy @X=map("%xmm$_",(4..7,0..3)); 717238405Sjkimmy @Tx=map("%xmm$_",(8..10)); 718238405Sjkimmy @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization 719238405Sjkimmy @T=("%esi","%edi"); 720238405Sjkimmy $j=0; 721238405Sjkimmy $K_XX_XX="%r11"; 722238405Sjkim 723238405Sjkimmy $_rol=sub { &shld(@_[0],@_) }; 724238405Sjkimmy $_ror=sub { &shrd(@_[0],@_) }; 725238405Sjkim 726238405Sjkim$code.=<<___; 727238405Sjkim.type sha1_block_data_order_avx,\@function,3 728238405Sjkim.align 16 729238405Sjkimsha1_block_data_order_avx: 730238405Sjkim_avx_shortcut: 731238405Sjkim push %rbx 732238405Sjkim push %rbp 733238405Sjkim push %r12 734238405Sjkim lea `-64-($win64?5*16:0)`(%rsp),%rsp 735238405Sjkim___ 736238405Sjkim$code.=<<___ if ($win64); 737238405Sjkim movaps %xmm6,64+0(%rsp) 738238405Sjkim movaps %xmm7,64+16(%rsp) 739238405Sjkim movaps %xmm8,64+32(%rsp) 740238405Sjkim movaps %xmm9,64+48(%rsp) 741238405Sjkim movaps %xmm10,64+64(%rsp) 742238405Sjkim.Lprologue_avx: 743238405Sjkim___ 744238405Sjkim$code.=<<___; 745238405Sjkim mov %rdi,$ctx # reassigned argument 746238405Sjkim mov %rsi,$inp # reassigned argument 747238405Sjkim mov %rdx,$num # reassigned argument 748279264Sdelphij vzeroupper 749238405Sjkim 750238405Sjkim shl \$6,$num 751238405Sjkim add $inp,$num 752238405Sjkim lea K_XX_XX(%rip),$K_XX_XX 753238405Sjkim 754238405Sjkim mov 0($ctx),$A # load context 755238405Sjkim mov 4($ctx),$B 756238405Sjkim mov 8($ctx),$C 757238405Sjkim mov 12($ctx),$D 758238405Sjkim mov $B,@T[0] # magic seed 759238405Sjkim mov 16($ctx),$E 760238405Sjkim 761238405Sjkim vmovdqa 64($K_XX_XX),@X[2] # pbswap mask 762238405Sjkim vmovdqa 0($K_XX_XX),@Tx[1] # K_00_19 763238405Sjkim vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] 764238405Sjkim vmovdqu 16($inp),@X[-3&7] 765238405Sjkim vmovdqu 32($inp),@X[-2&7] 766238405Sjkim 
vmovdqu	48($inp),@X[-1&7]
	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
	add	\$64,$inp
	vpshufb	@X[2],@X[-3&7],@X[-3&7]
	vpshufb	@X[2],@X[-2&7],@X[-2&7]
	vpshufb	@X[2],@X[-1&7],@X[-1&7]
	vpaddd	@Tx[1],@X[-4&7],@X[0]	# add K_00_19
	vpaddd	@Tx[1],@X[-3&7],@X[1]
	vpaddd	@Tx[1],@X[-2&7],@X[2]
	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vmovdqa	@X[1],16(%rsp)
	vmovdqa	@X[2],32(%rsp)
	jmp	.Loop_avx
___

# Xupdate_avx_16_31(\&round_body)
#
# Emits one step of the AVX message schedule (the variant that rotates
# the new X[] vector left by 1) interleaved with integer round code.
#
# $body is a code reference; it is invoked four times and every string
# it returns is eval'ed in this scope, two or four at a time, between
# the SIMD instructions below so that both instruction streams overlap.
# The body_* generators live outside this chunk; ($a..$e) are declared
# here presumably because the eval'ed snippets refer to them - verify
# against the body_* definitions.
#
# Side effects on file-level state: increments $Xi and rotates both @X
# and @Tx by one position.
sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	  # the stack slot cycles through 0,16,32,48 as $Xi advances
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vpsrld	(@Tx[0],@X[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	  &vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vpsrld	(@Tx[1],@Tx[2],30);
	  &vpor		(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vpslld	(@Tx[2],@Tx[2],2);
	  &vpxor	(@X[0],@X[0],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	  # one 16-byte constant row per five schedule steps
	  &vmovdqa	(@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));


	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

# Xupdate_avx_32_79(\&round_body)
#
# Same contract as Xupdate_avx_16_31, but emits the schedule variant
# that combines "X[-32]", "X[-16]", "X[-28]" and "X[-6]" and rotates
# the result left by 2.  Every fifth step ($Xi%5 == 0) loads the next
# row of the K_XX_XX table; otherwise the current constant is copied
# forward.
#
# Side effects on file-level state: increments $Xi and rotates both @X
# and @Tx by one position.
sub Xupdate_avx_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 48 instructions
  my ($a,$b,$c,$d,$e);

	  &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	  &vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	  &vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 # consume one more snippet here unless the next one is a rotate
	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	if ($Xi%5) {
	  &vmovdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else {			# ... or load next one
	  &vmovdqa	(@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
	}
	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	  &vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	  &vpsrld	(@Tx[0],@X[0],30);
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	  &vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	  &vpor		(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	  &vmovdqa	(@Tx[1],@X[0])	if ($Xi<19);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

# Xuplast_avx_80(\&round_body)
#
# Emits the last X[]+K transfer of a block, then the end-of-input test
# (cmp $inp,$num / je .Ldone_avx).  On the fall-through path it starts
# fetching the next 64-byte block: reloads the byte-swap mask and the
# first K constant, loads four input vectors, byte-swaps the first one
# and advances $inp.
#
# Side effects on file-level state: undoes one @Tx rotation and resets
# $Xi to 0.
sub Xuplast_avx_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	  &vpaddd	(@Tx[1],@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  # VEX-encoded store, like every other store in the AVX path; a
	  # legacy-SSE movdqa here would mix encodings inside AVX code.
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	(".Ldone_avx");

	unshift(@Tx,pop(@Tx));

	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
	&vmovdqa(@Tx[1],"0($K_XX_XX)");		# K_00_19
	&vmovdqu(@X[-4&7],"0($inp)");		# load input
	&vmovdqu(@X[-3&7],"16($inp)");
	&vmovdqu(@X[-2&7],"32($inp)");
	&vmovdqu(@X[-1&7],"48($inp)");
	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&add	($inp,64);

  $Xi=0;
}

# Xloop_avx(\&round_body)
#
# Emits integer round code interleaved with preparation of the next
# block: byte-swaps input vector X[($Xi-3)&7], adds the constant held
# in @Tx[1] to X[($Xi-4)&7] and stores the sum at 16*$Xi(%rsp).
#
# Side effect on file-level state: increments $Xi (no @X/@Tx rotation).
sub Xloop_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	foreach (@insns) { eval; }
  $Xi++;
}

# Xtail_avx(\&round_body)
#
# Emits integer round code only (no SIMD work); used on the path where
# no further input block has to be prepared.
sub Xtail_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

# Main AVX loop.  Every helper below invokes its round generator four
# times, so the 20 helper calls on the looping path (4 + 12 + 1 + 3)
# presumably cover the 80 SHA-1 rounds of one block.
$code.=<<___;
.align	16
.Loop_avx:
___
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_32_79(\&body_00_19);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"

				# Snapshot generator state: the last three round
				# groups are emitted twice (looping path here,
				# .Ldone_avx path below) from the same state.
				$saved_j=$j; @saved_V=@V;

	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	jmp	.Loop_avx

.align	16
.Ldone_avx:
___
				# rewind to the state saved before Xloop_avx
				$j=$saved_j; @V=@saved_V;

	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);

$code.=<<___;
	vzeroupper

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
___
# Win64 only: reload %xmm6-%xmm10 from the frame before returning.
$code.=<<___ if ($win64);
	movaps	64+0(%rsp),%xmm6
	movaps	64+16(%rsp),%xmm7
	movaps	64+32(%rsp),%xmm8
	movaps	64+48(%rsp),%xmm9
	movaps	64+64(%rsp),%xmm10
___
$code.=<<___;
	lea	`64+($win64?5*16:0)`(%rsp),%rsi
	mov	0(%rsi),%r12
	mov	8(%rsi),%rbp
	mov	16(%rsi),%rbx
	lea	24(%rsi),%rsp
.Lepilogue_avx:
	ret
.size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
___
}
# Constant table: four broadcast round constants followed by the
# byte-swap shuffle mask (read above at offsets 0..48 and 64).
$code.=<<___;
.align	64
K_XX_XX:
.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
___
}}}
$code.=<<___;
.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___

# Win64 structured-exception-handling support, emitted only when
# $win64 is set: two handlers sharing .Lcommon_seh_tail, plus the
# .pdata/.xdata records that bind them to the generated functions.
#
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail

	mov	`16*4`(%rax),%rax	# pull saved stack pointer
	lea	32(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13

	jmp	.Lcommon_seh_tail
.size	se_handler,.-se_handler

.type	ssse3_handler,\@abi-omnipotent
.align	16
ssse3_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	64(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$10,%ecx
	.long	0xa548f3fc		# cld; rep movsq
	lea	`24+64+5*16`(%rax),%rax	# adjust stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore cotnext->R12

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	ssse3_handler,.-ssse3_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_sha1_block_data_order
	.rva	.LSEH_end_sha1_block_data_order
	.rva	.LSEH_info_sha1_block_data_order
	.rva	.LSEH_begin_sha1_block_data_order_ssse3
	.rva	.LSEH_end_sha1_block_data_order_ssse3
	.rva	.LSEH_info_sha1_block_data_order_ssse3
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha1_block_data_order_avx
	.rva	.LSEH_end_sha1_block_data_order_avx
	.rva	.LSEH_info_sha1_block_data_order_avx
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha1_block_data_order:
	.byte	9,0,0,0
	.rva	se_handler
.LSEH_info_sha1_block_data_order_ssse3:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha1_block_data_order_avx:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
}

####################################################################

# Evaluate every `...` expression embedded in the generated text, then
# write the result out.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# Buffered write errors (and a failing downstream filter, if STDOUT is
# a pipe) only surface at close, so do not ignore its result.
close STDOUT or die "error closing STDOUT: $!";