#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# sha1_block procedure for x86_64.
#
# It was brought to my attention that on EM64T compiler-generated code
# was far behind 32-bit assembler implementation. This is unlike on
# Opteron where compiler-generated code was only 15% behind 32-bit
# assembler, which originally made it hard to motivate the effort.
# There was a suggestion to mechanically translate 32-bit code, but I
# dismissed it, reasoning that x86_64 offers enough register bank
# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
# implementation:-) However! While 64-bit code does perform better
# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
# x86_64 does offer larger *addressable* bank, but out-of-order core
# reaches for even more registers through dynamic aliasing, and EM64T
# core must have managed to run-time optimize even 32-bit code just as
# good as 64-bit one.
# Performance improvement is summarized in the
# following table:
#
#		gcc 3.4		32-bit asm	cycles/byte
# Opteron	+45%		+20%		6.8
# Xeon P4	+65%		+0%		9.9
# Core2		+60%		+10%		7.0

# Optional first command-line argument; it is handed on unchanged to the
# translator script below.
$output=shift;

# Locate the perlasm translator: first next to this script, then in the
# perlasm directory two levels up.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator.
# The interpreter and script paths are quoted so that directories containing
# spaces survive the shell, and a failure to start the pipe is fatal instead
# of silently discarding the generated code.
{
	my $cmd="\"$^X\" \"$xlate\"";
	$cmd.=" \"$output\"" if (defined($output) && length($output));
	open STDOUT,"| $cmd" or die "can't call $xlate: $!";
}

# Registers in which the three arguments arrive.
$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%rdx";	# 3rd arg

# reassign arguments in order to produce more compact code
$ctx="%r8";
$inp="%r9";
$num="%r10";

# Scratch registers: $xi carries the current message word, $t0/$t1 are
# temporaries used by the round bodies.
$xi="%eax";
$t0="%ebx";
$t1="%ecx";
# Five working variables plus one spare; the round loops rotate this list
# so that every round writes its result into the register that is free.
$A="%edx";
$B="%esi";
$C="%edi";
$D="%ebp";
$E="%r11d";
$T="%r12d";

@V=($A,$B,$C,$D,$E,$T);

# PROLOGUE(func): append the entry sequence of function func to $code.
#
# The emitted code saves %rbx, %rbp and %r12, copies the three arguments into
# $ctx/$inp/$num, reserves 8+16*4 bytes of stack aligned down to 64 bytes,
# stores the pre-allocation %rsp at offset 16*4 of that area (EPILOGUE
# reloads it from there) and loads the five 32-bit state words found at
# offsets 0..16 of $ctx into $A..$E.
# NOTE(review): the trailing ",3" of the .type directive is presumably the
# argument count consumed by x86_64-xlate.pl -- confirm against the
# translator.
sub PROLOGUE {
my $func=shift;
$code.=<<___;
.globl	$func
.type	$func,\@function,3
.align	16
$func:
	push	%rbx
	push	%rbp
	push	%r12
	mov	%rsp,%rax
	mov	%rdi,$ctx	# reassigned argument
	sub	\$`8+16*4`,%rsp
	mov	%rsi,$inp	# reassigned argument
	and	\$-64,%rsp
	mov	%rdx,$num	# reassigned argument
	mov	%rax,`16*4`(%rsp)

	mov	0($ctx),$A
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
___
}
# EPILOGUE(func): append the exit sequence of function func to $code.
#
# The emitted code reloads the stack pointer that the entry sequence stored
# at offset 16*4 of the scratch area, pops %r12, %rbp and %rbx, returns and
# closes the symbol with a .size directive.
sub EPILOGUE {
my $func=shift;
$code.=<<___;
	mov	`16*4`(%rsp),%rsp
	pop	%r12
	pop	%rbp
	pop	%rbx
	ret
.size	$func,.-$func
___
}

# BODY_00_19(i, a, b, c, d, e, f [, host]): emit round i (0..19).
#
# a..e name the registers holding the five working variables, f the register
# that receives the new value:
#	f = 0x5a827999 + xi + e + rol(a,5) + (((c ^ d) & b) ^ d)
# b is rotated left by 30 in place and e is reused as the temporary for
# rol(a,5) once it has been consumed by the lea.
# For i < 15 the next message word is loaded from $inp, byte-swapped and
# stored into the 16-word stack buffer; from i == 15 on the next word is
# rol(w[j] ^ w[j+2] ^ w[j+8] ^ w[j+13], 1) with indices taken modulo 16 and
# j = i+1, written back over slot j%16.
# Passing a defined host suppresses the bswap instructions.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e,$f,$host)=@_;
my $j=$i+1;
# Decide about the byte swap here, in Perl.  Inside the heredocs $host would
# be interpolated away before the deferred backtick evaluation at the end of
# the script ever runs, so the flag cannot be tested there.
my $bswap=defined($host)?"":"bswap\t$xi";
$code.=<<___ if ($i==0);
	mov	`4*$i`($inp),$xi
	$bswap
	mov	$xi,`4*$i`(%rsp)
___
$code.=<<___ if ($i<15);
	lea	0x5a827999($xi,$e),$f
	mov	$c,$t0
	mov	`4*$j`($inp),$xi
	mov	$a,$e
	xor	$d,$t0
	$bswap
	rol	\$5,$e
	and	$b,$t0
	mov	$xi,`4*$j`(%rsp)
	add	$e,$f
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$f
___
$code.=<<___ if ($i>=15);
	lea	0x5a827999($xi,$e),$f
	mov	`4*($j%16)`(%rsp),$xi
	mov	$c,$t0
	mov	$a,$e
	xor	`4*(($j+2)%16)`(%rsp),$xi
	xor	$d,$t0
	rol	\$5,$e
	xor	`4*(($j+8)%16)`(%rsp),$xi
	and	$b,$t0
	add	$e,$f
	xor	`4*(($j+13)%16)`(%rsp),$xi
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$f
	rol	\$1,$xi
	mov	$xi,`4*($j%16)`(%rsp)
___
}

# BODY_20_39(i, a, b, c, d, e, f): emit round i for 20..39 and for 60..79.
#
#	f = K + xi + e + rol(a,5) + (b ^ c ^ d)
# with K = 0x6ed9eba1 for i < 40 and 0xca62c1d6 otherwise.  The next message
# word is derived as in the later rounds of BODY_00_19, but it is written
# back to the stack buffer only while i < 76 (the remaining rounds consume it
# straight from $xi), and round 79 computes no further word at all.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
$code.=<<___ if ($i<79);
	lea	$K($xi,$e),$f
	mov	`4*($j%16)`(%rsp),$xi
	mov	$c,$t0
	mov	$a,$e
	xor	`4*(($j+2)%16)`(%rsp),$xi
	xor	$b,$t0
	rol	\$5,$e
	xor	`4*(($j+8)%16)`(%rsp),$xi
	xor	$d,$t0
	add	$e,$f
	xor	`4*(($j+13)%16)`(%rsp),$xi
	rol	\$30,$b
	add	$t0,$f
	rol	\$1,$xi
___
$code.=<<___ if ($i<76);
	mov	$xi,`4*($j%16)`(%rsp)
___
$code.=<<___ if ($i==79);
	lea	$K($xi,$e),$f
	mov	$c,$t0
	mov	$a,$e
	xor	$b,$t0
	rol	\$5,$e
	xor	$d,$t0
	add	$e,$f
	rol	\$30,$b
	add	$t0,$f
___
}

# BODY_40_59(i, a, b, c, d, e, f): emit round i (40..59).
#
#	f = 0x8f1bbcdc + xi + e + rol(a,5) + ((b & c) | ((b | c) & d))
# The next message word is computed and stored exactly as in the later
# rounds of BODY_00_19.
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
$code.=<<___;
	lea	0x8f1bbcdc($xi,$e),$f
	mov	`4*($j%16)`(%rsp),$xi
	mov	$b,$t0
	mov	$b,$t1
	xor	`4*(($j+2)%16)`(%rsp),$xi
	mov	$a,$e
	and	$c,$t0
	xor	`4*(($j+8)%16)`(%rsp),$xi
	or	$c,$t1
	rol	\$5,$e
	xor	`4*(($j+13)%16)`(%rsp),$xi
	and	$d,$t1
	add	$e,$f
	rol	\$1,$xi
	or	$t1,$t0
	rol	\$30,$b
	mov	$xi,`4*($j%16)`(%rsp)
	add	$t0,$f
___
}

$code=".text\n";

PROLOGUE("sha1_block_data_order");
$code.=".align	4\n.Lloop:\n";
# Eighty fully unrolled rounds.  After every round the register list is
# rotated by one position, so the register just written becomes "a" and the
# register that held "e" becomes the free target of the next round.
for($i=0;$i<20;$i++)	{ BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++)	{ BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++)	{ BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++)	{ BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# 80 rotations of a six-element list leave it shifted by two, so the five
# results sit in $E,$T,$A,$B,$C.  They are added into the context, stored
# back and then shuffled into $A..$E for the next block; the loop advances
# $inp by 64 bytes and runs until $num reaches zero.
$code.=<<___;
	add	0($ctx),$E
	add	4($ctx),$T
	add	8($ctx),$A
	add	12($ctx),$B
	add	16($ctx),$C
	mov	$E,0($ctx)
	mov	$T,4($ctx)
	mov	$A,8($ctx)
	mov	$B,12($ctx)
	mov	$C,16($ctx)

	xchg	$E,$A	# mov	$E,$A
	xchg	$T,$B	# mov	$T,$B
	xchg	$E,$C	# mov	$A,$C
	xchg	$T,$D	# mov	$B,$D
			# mov	$C,$E
	lea	`16*4`($inp),$inp
	sub	\$1,$num
	jnz	.Lloop
___
EPILOGUE("sha1_block_data_order");
$code.=<<___;
.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

####################################################################

# Replace every `expr` left in the generated text by the value of expr.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# STDOUT is a pipe into the translator: write errors and a failing
# translator only become visible when the handle is closed, so the result of
# close must not be ignored.  After a non-zero child exit $! is 0 and the
# status is found in $? instead.
close STDOUT or die "error closing STDOUT: ".($! ? $! : "translator exit status $?");