#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# sha1_block procedure for x86_64.
#
# It was brought to my attention that on EM64T compiler-generated code
# was far behind 32-bit assembler implementation. This is unlike on
# Opteron where compiler-generated code was only 15% behind 32-bit
# assembler, which originally made it hard to motivate the effort.
# There was suggestion to mechanically translate 32-bit code, but I
# dismissed it, reasoning that x86_64 offers enough register bank
# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
# implementation:-) However! While 64-bit code does perform better
# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
# x86_64 does offer larger *addressable* bank, but out-of-order core
# reaches for even more registers through dynamic aliasing, and EM64T
# core must have managed to run-time optimize even 32-bit code just as
# good as 64-bit one. Performance improvement is summarized in the
# following table:
#
#		gcc 3.4		32-bit asm	cycles/byte
# Opteron	+45%		+20%		6.8
# Xeon P4	+65%		+0%		9.9
# Core2		+60%		+10%		7.0

# Command line: [flavour] output. If the first argument contains a
# dot it is taken to be the output file name and flavour is left
# undefined.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 SEH support is emitted for nasm/masm/mingw64 flavours or when
# the output file name ends in .asm.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the translator script relative to the directory of this script.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT is piped through the translator. The
# translator path is quoted (like the interpreter path) so that a
# directory containing spaces does not break the command; $flavour and
# $output are left as-is so that an undefined flavour does not turn
# into an empty argument. Fail loudly if the pipe cannot be set up.
open OUT,"| \"$^X\" \"$xlate\" $flavour $output"
	or die "can't call $xlate: $!";
*STDOUT=*OUT;

$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%rdx";	# 3rd arg

# reassign arguments in order to produce more compact code
$ctx="%r8";
$inp="%r9";
$num="%r10";

$xi="%eax";	# current message schedule word
$t0="%ebx";	# scratch for the round function
$t1="%ecx";	# second scratch, used by BODY_40_59 only
$A="%edx";
$B="%esi";
$C="%edi";
$D="%ebp";
$E="%r11d";
$T="%r12d";

# Six-register window: five working variables plus the register that
# receives the new value in each round. The window is rotated after
# every round instead of moving values between registers.
@V=($A,$B,$C,$D,$E,$T);

# PROLOGUE($func): emit the function header for $func.
# The generated code pushes %rbx, %rbp and %r12, keeps the resulting
# stack pointer in %r11, reserves 8+16*4 bytes, aligns %rsp down to 64
# bytes, stores the kept stack pointer at 16*4(%rsp), copies the three
# arguments to $ctx/$inp/$num and loads the five state words from $ctx
# into $A..$E.
sub PROLOGUE {
my $func=shift;
$code.=<<___;
.globl	$func
.type	$func,\@function,3
.align	16
$func:
	push	%rbx
	push	%rbp
	push	%r12
	mov	%rsp,%r11
	mov	%rdi,$ctx	# reassigned argument
	sub	\$`8+16*4`,%rsp
	mov	%rsi,$inp	# reassigned argument
	and	\$-64,%rsp
	mov	%rdx,$num	# reassigned argument
	mov	%r11,`16*4`(%rsp)
.Lprologue:

	mov	0($ctx),$A
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
___
}

# EPILOGUE($func): emit the function tail for $func.
# The generated code reloads the stack pointer saved by the prologue
# from 16*4(%rsp), restores %r12, %rbp and %rbx from there, pops the
# three saved slots off the stack and returns.
sub EPILOGUE {
my $func=shift;
$code.=<<___;
	mov	`16*4`(%rsp),%rsi
	mov	(%rsi),%r12
	mov	8(%rsi),%rbp
	mov	16(%rsi),%rbx
	lea	24(%rsi),%rsp
.Lepilogue:
	ret
.size	$func,.-$func
___
}

# BODY_00_19($i,$a,$b,$c,$d,$e,$f[,$host]): emit round $i (0..19).
# $a..$e are the registers holding the working variables, $f receives
# the new value: K + xi + e + rol(a,5) + (((c^d)&b)^d), with
# K=0x5a827999, and $b is rotated left by 30.
# For $i<15 the next input word is loaded from $inp, byte-swapped
# unless $host is defined, and stored into the 16-word buffer at
# (%rsp); round 0 additionally loads word 0. For $i>=15 the next word
# is computed from the buffer as
# rol(X[j]^X[j+2]^X[j+8]^X[j+13],1), indices modulo 16, j=$i+1.
# The driver below never passes $host, so bswap is always emitted.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e,$f,$host)=@_;
my $j=$i+1;
$code.=<<___ if ($i==0);
	mov	`4*$i`($inp),$xi
	`"bswap	$xi"	if(!defined($host))`
	mov	$xi,`4*$i`(%rsp)
___
$code.=<<___ if ($i<15);
	lea	0x5a827999($xi,$e),$f
	mov	$c,$t0
	mov	`4*$j`($inp),$xi
	mov	$a,$e
	xor	$d,$t0
	`"bswap	$xi"	if(!defined($host))`
	rol	\$5,$e
	and	$b,$t0
	mov	$xi,`4*$j`(%rsp)
	add	$e,$f
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$f
___
$code.=<<___ if ($i>=15);
	lea	0x5a827999($xi,$e),$f
	mov	`4*($j%16)`(%rsp),$xi
	mov	$c,$t0
	mov	$a,$e
	xor	`4*(($j+2)%16)`(%rsp),$xi
	xor	$d,$t0
	rol	\$5,$e
	xor	`4*(($j+8)%16)`(%rsp),$xi
	and	$b,$t0
	add	$e,$f
	xor	`4*(($j+13)%16)`(%rsp),$xi
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$f
	rol	\$1,$xi
	mov	$xi,`4*($j%16)`(%rsp)
___
}

# BODY_20_39($i,$a,$b,$c,$d,$e,$f): emit round $i for 20..39 and,
# reused by the driver, for 60..79. The round function is b^c^d and K
# is 0x6ed9eba1 for $i<40, 0xca62c1d6 otherwise.
# The next schedule word is computed for $i<79 but written back to the
# buffer only for $i<76; round 79 emits no schedule code at all.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
$code.=<<___ if ($i<79);
	lea	$K($xi,$e),$f
	mov	`4*($j%16)`(%rsp),$xi
	mov	$c,$t0
	mov	$a,$e
	xor	`4*(($j+2)%16)`(%rsp),$xi
	xor	$b,$t0
	rol	\$5,$e
	xor	`4*(($j+8)%16)`(%rsp),$xi
	xor	$d,$t0
	add	$e,$f
	xor	`4*(($j+13)%16)`(%rsp),$xi
	rol	\$30,$b
	add	$t0,$f
	rol	\$1,$xi
___
$code.=<<___ if ($i<76);
	mov	$xi,`4*($j%16)`(%rsp)
___
$code.=<<___ if ($i==79);
	lea	$K($xi,$e),$f
	mov	$c,$t0
	mov	$a,$e
	xor	$b,$t0
	rol	\$5,$e
	xor	$d,$t0
	add	$e,$f
	rol	\$30,$b
	add	$t0,$f
___
}

# BODY_40_59($i,$a,$b,$c,$d,$e,$f): emit round $i (40..59).
# The round function is (b&c)|((b|c)&d), built in $t0/$t1, with
# K=0x8f1bbcdc. The next schedule word is computed and stored exactly
# as in the $i>=15 case of BODY_00_19.
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e,$f)=@_;
my $j=$i+1;
$code.=<<___;
	lea	0x8f1bbcdc($xi,$e),$f
	mov	`4*($j%16)`(%rsp),$xi
	mov	$b,$t0
	mov	$b,$t1
	xor	`4*(($j+2)%16)`(%rsp),$xi
	mov	$a,$e
	and	$c,$t0
	xor	`4*(($j+8)%16)`(%rsp),$xi
	or	$c,$t1
	rol	\$5,$e
	xor	`4*(($j+13)%16)`(%rsp),$xi
	and	$d,$t1
	add	$e,$f
	rol	\$1,$xi
	or	$t1,$t0
	rol	\$30,$b
	mov	$xi,`4*($j%16)`(%rsp)
	add	$t0,$f
___
}

$code=".text\n";

&PROLOGUE("sha1_block_data_order");
$code.=".align 4\n.Lloop:\n";
# Emit the 80 rounds. After each round the register window is rotated
# by one position so that the register just written ($f) becomes $a
# of the next round.
for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# 80 rotations of the 6-entry window leave the five working variables
# in $E,$T,$A,$B,$C (80 mod 6 == 2). They are added to the state in
# $ctx, and the xchg sequence then moves the sums back into $A..$E
# for the next block.
$code.=<<___;
	add	0($ctx),$E
	add	4($ctx),$T
	add	8($ctx),$A
	add	12($ctx),$B
	add	16($ctx),$C
	mov	$E,0($ctx)
	mov	$T,4($ctx)
	mov	$A,8($ctx)
	mov	$B,12($ctx)
	mov	$C,16($ctx)

	xchg	$E,$A	# mov	$E,$A
	xchg	$T,$B	# mov	$T,$B
	xchg	$E,$C	# mov	$A,$C
	xchg	$T,$D	# mov	$B,$D
			# mov	$C,$E
	lea	`16*4`($inp),$inp
	sub	\$1,$num
	jnz	.Lloop
___
&EPILOGUE("sha1_block_data_order");
$code.=<<___;
.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# When the faulting Rip lies between .Lprologue and .Lepilogue the
# handler fetches the stack pointer saved by the prologue and restores
# Rbx, Rbp and R12 in the context from the three pushed slots; in
# either case it then fixes up Rsp/Rsi/Rdi, copies the context to
# disp->ContextRecord and calls RtlVirtualUnwind.
# NOTE(review): the .LSEH_begin_/.LSEH_end_ labels referenced in .pdata
# are not defined in this file; presumably the translator emits them
# for \@function entries - confirm against x86_64-xlate.pl.
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	mov	`16*4`(%rax),%rax	# pull saved stack pointer
	lea	24(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_sha1_block_data_order
	.rva	.LSEH_end_sha1_block_data_order
	.rva	.LSEH_info_sha1_block_data_order

.section	.xdata
.align	8
.LSEH_info_sha1_block_data_order:
	.byte	9,0,0,0
	.rva	se_handler
___
}

####################################################################

# Evaluate every `...` fragment in the accumulated text as a Perl
# expression (offset arithmetic, conditional bswap) before printing.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# Closing a piped handle waits for the translator; a false return
# means a write failed or the translator exited non-zero (in which
# case $! is 0 and $? holds the wait status). Do not let that pass
# silently, or a truncated output file would look like success.
close STDOUT
	or die "error closing pipe to $xlate: ".($! ? $! : "wait status $?");