#!/usr/bin/env perl
# sha1-586.pl revision 225736

# ====================================================================
# [Re]written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# "[Re]written" was achieved in two major overhauls. In 2004 BODY_*
# functions were re-implemented to address P4 performance issue [see
# commentary below], and in 2006 the rest was rewritten in order to
# gain freedom to liberate licensing terms.

# It was noted that Intel IA-32 C compiler generates code which
# performs ~30% *faster* on P4 CPU than original *hand-coded*
# SHA1 assembler implementation. To address this problem (and
# prove that humans are still better than machines:-), the
# original code was overhauled, which resulted in following
# performance changes:
#
#               compared with original  compared with Intel cc
#               assembler impl.         generated code
# Pentium       -16%                    +48%
# PIII/AMD      +8%                     +16%
# P4            +85%(!)                 +45%
#
# As you can see Pentium came out as loser:-( Yet I reckoned that
# improvement on P4 outweighs the loss and incorporated this
# re-tuned code to 0.9.7 and later.
# ----------------------------------------------------------------
#                                       <appro@fy.chalmers.se>

# Locate the directory this script lives in so that x86asm.pl can be
# found either next to it or in ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# First command-line argument and a "last argument is 386" flag are handed
# to asm_init (defined in x86asm.pl); NOTE(review): exact meaning of both
# is defined by x86asm.pl, not visible here.
&asm_init($ARGV[0],"sha1-586.pl",$ARGV[$#ARGV] eq "386");

# Register assignment for the SHA-1 working variables.
$A="eax";
$B="ebx";
$C="ecx";
$D="edx";
$E="edi";
$T="esi";
$tmp1="ebp";

# Rotating register file: after every round the list is rotated right by
# one so that the roles a,b,c,d,e,f move to the next register.
@V=($A,$B,$C,$D,$E,$T);

# Emit round $n (0..15).  Arguments: round number followed by the six
# registers currently playing the roles a,b,c,d,e,f.  Uses $tmp1 as the
# accumulator and reads xi from the X[] scratch slot $n%16.
# NOTE: the '&' call sigil is required throughout this file because some
# helpers (and, or, xor, cmp) share names with Perl operators.
sub BODY_00_15 {
    local($n,$a,$b,$c,$d,$e,$f)=@_;

    &comment("00_15 $n");

    &mov($f,$c);                        # f to hold F_00_19(b,c,d)
    if ($n==0)  { &mov($tmp1,$a); }
    else        { &mov($a,$tmp1); }
    &rotl($tmp1,5);                     # tmp1=ROTATE(a,5)
    &xor($f,$d);
    &add($tmp1,$e);                     # tmp1+=e;
    &and($f,$b);
    &mov($e,&swtmp($n%16));             # e becomes volatile and is loaded
                                        # with xi, also note that e becomes
                                        # f in next round...
    &xor($f,$d);                        # f holds F_00_19(b,c,d)
    &rotr($b,2);                        # b=ROTATE(b,30)
    &lea($tmp1,&DWP(0x5a827999,$tmp1,$e));      # tmp1+=K_00_19+xi

    if ($n==15) { &add($f,$tmp1); }     # f+=tmp1
    else        { &add($tmp1,$f); }     # f becomes a in next round
}

# Emit round $n (16..19).  Same argument convention as BODY_00_15, but the
# message word is first recomputed (Xupdate) from four X[] slots and
# written back to slot $n%16.
sub BODY_16_19 {
    local($n,$a,$b,$c,$d,$e,$f)=@_;

    &comment("16_19 $n");

    &mov($f,&swtmp($n%16));             # f to hold Xupdate(xi,xa,xb,xc,xd)
    &mov($tmp1,$c);                     # tmp1 to hold F_00_19(b,c,d)
    &xor($f,&swtmp(($n+2)%16));
    &xor($tmp1,$d);
    &xor($f,&swtmp(($n+8)%16));
    &and($tmp1,$b);                     # tmp1 holds F_00_19(b,c,d)
    &rotr($b,2);                        # b=ROTATE(b,30)
    &xor($f,&swtmp(($n+13)%16));        # f holds xa^xb^xc^xd
    &rotl($f,1);                        # f=ROTATE(f,1)
    &xor($tmp1,$d);                     # tmp1=F_00_19(b,c,d)
    &mov(&swtmp($n%16),$f);             # xi=f
    &lea($f,&DWP(0x5a827999,$f,$e));    # f+=K_00_19+e
    &mov($e,$a);                        # e becomes volatile
    &rotl($e,5);                        # e=ROTATE(a,5)
    &add($f,$tmp1);                     # f+=F_00_19(b,c,d)
    &add($f,$e);                        # f+=ROTATE(a,5)
}

# Emit round $n for the b^c^d rounds.  Called for rounds 20..39 and again
# for rounds 60..79; the round constant is selected from $n
# (0x6ed9eba1 when $n<40, 0xca62c1d6 otherwise).
sub BODY_20_39 {
    local($n,$a,$b,$c,$d,$e,$f)=@_;
    local $K=($n<40)?0x6ed9eba1:0xca62c1d6;

    &comment("20_39 $n");

    &mov($tmp1,$b);                     # tmp1 to hold F_20_39(b,c,d)
    &mov($f,&swtmp($n%16));             # f to hold Xupdate(xi,xa,xb,xc,xd)
    &rotr($b,2);                        # b=ROTATE(b,30)
    &xor($f,&swtmp(($n+2)%16));
    &xor($tmp1,$c);
    &xor($f,&swtmp(($n+8)%16));
    &xor($tmp1,$d);                     # tmp1 holds F_20_39(b,c,d)
    &xor($f,&swtmp(($n+13)%16));        # f holds xa^xb^xc^xd
    &rotl($f,1);                        # f=ROTATE(f,1)
    &add($tmp1,$e);
    &mov(&swtmp($n%16),$f);             # xi=f
    &mov($e,$a);                        # e becomes volatile
    &rotl($e,5);                        # e=ROTATE(a,5)
    &lea($f,&DWP($K,$f,$tmp1));         # f+=K_20_39+e
    &add($f,$e);                        # f+=ROTATE(a,5)
}

# Emit round $n (40..59).  F_40_59 is built as ((b|c)&d)|(b&c), using $e
# as a second scratch register once its value has been folded into f.
sub BODY_40_59 {
    local($n,$a,$b,$c,$d,$e,$f)=@_;

    &comment("40_59 $n");

    &mov($f,&swtmp($n%16));             # f to hold Xupdate(xi,xa,xb,xc,xd)
    &mov($tmp1,&swtmp(($n+2)%16));
    &xor($f,$tmp1);
    &mov($tmp1,&swtmp(($n+8)%16));
    &xor($f,$tmp1);
    &mov($tmp1,&swtmp(($n+13)%16));
    &xor($f,$tmp1);                     # f holds xa^xb^xc^xd
    &mov($tmp1,$b);                     # tmp1 to hold F_40_59(b,c,d)
    &rotl($f,1);                        # f=ROTATE(f,1)
    &or($tmp1,$c);
    &mov(&swtmp($n%16),$f);             # xi=f
    &and($tmp1,$d);
    &lea($f,&DWP(0x8f1bbcdc,$f,$e));    # f+=K_40_59+e
    &mov($e,$b);                        # e becomes volatile and is used
                                        # to calculate F_40_59(b,c,d)
    &rotr($b,2);                        # b=ROTATE(b,30)
    &and($e,$c);
    &or($tmp1,$e);                      # tmp1 holds F_40_59(b,c,d)
    &mov($e,$a);
    &rotl($e,5);                        # e=ROTATE(a,5)
    &add($f,$tmp1);                     # f+=tmp1;
    &add($f,$e);                        # f+=ROTATE(a,5)
}

# Emit sha1_block_data_order(SHA_CTX *c, const void *input, size_t num).
# The generated code converts num into an end-of-input pointer (num<<6
# added to input), then loops once per 64-byte chunk: byte-swap the chunk
# into the 16-word stack scratch area, run the 80 rounds, and add the
# result back into the five context words at offsets 0..16 of *c.
&function_begin("sha1_block_data_order");
    &mov($tmp1,&wparam(0));     # SHA_CTX *c
    &mov($T,&wparam(1));        # const void *input
    &mov($A,&wparam(2));        # size_t num
    &stack_push(16);            # allocate X[16]
    &shl($A,6);
    &add($A,$T);
    &mov(&wparam(2),$A);        # pointer beyond the end of input
    &mov($E,&DWP(16,$tmp1));    # pre-load E

    &set_label("loop",16);

    # copy input chunk to X, but reversing byte order!
    for ($i=0; $i<16; $i+=4)
    {
        &mov($A,&DWP(4*($i+0),$T));
        &mov($B,&DWP(4*($i+1),$T));
        &mov($C,&DWP(4*($i+2),$T));
        &mov($D,&DWP(4*($i+3),$T));
        &bswap($A);
        &bswap($B);
        &bswap($C);
        &bswap($D);
        &mov(&swtmp($i+0),$A);
        &mov(&swtmp($i+1),$B);
        &mov(&swtmp($i+2),$C);
        &mov(&swtmp($i+3),$D);
    }
    &mov(&wparam(1),$T);        # redundant in 1st spin

    &mov($A,&DWP(0,$tmp1));     # load SHA_CTX
    &mov($B,&DWP(4,$tmp1));
    &mov($C,&DWP(8,$tmp1));
    &mov($D,&DWP(12,$tmp1));
    # E is pre-loaded

    # $i deliberately carries over from one loop to the next; @V is rotated
    # right by one after every round.
    for($i=0;$i<16;$i++)    { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
    for(;$i<20;$i++)        { &BODY_16_19($i,@V); unshift(@V,pop(@V)); }
    for(;$i<40;$i++)        { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
    for(;$i<60;$i++)        { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
    for(;$i<80;$i++)        { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }

    # 80 rotations of a 6-entry list leave it rotated by 2, i.e.
    # (E,T,A,B,C,D); the register names used below rely on that.
    (($V[5] eq $D) and ($V[0] eq $E)) or die;   # double-check

    &mov($tmp1,&wparam(0));     # re-load SHA_CTX*
    &mov($D,&wparam(1));        # D is last "T" and is discarded

    &add($E,&DWP(0,$tmp1));     # E is last "A"...
    &add($T,&DWP(4,$tmp1));
    &add($A,&DWP(8,$tmp1));
    &add($B,&DWP(12,$tmp1));
    &add($C,&DWP(16,$tmp1));

    &mov(&DWP(0,$tmp1),$E);     # update SHA_CTX
    &add($D,64);                # advance input pointer
    &mov(&DWP(4,$tmp1),$T);
    &cmp($D,&wparam(2));        # have we reached the end yet?
    &mov(&DWP(8,$tmp1),$A);
    &mov($E,$C);                # C is last "E" which needs to be "pre-loaded"
    &mov(&DWP(12,$tmp1),$B);
    &mov($T,$D);                # input pointer
    &mov(&DWP(16,$tmp1),$C);
    &jb(&label("loop"));

    &stack_pop(16);
&function_end("sha1_block_data_order");

&asm_finish();