#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# sha1_block procedure for ARMv4.
#
# January 2007.

# Size/performance trade-off
# ====================================================================
# impl          size in bytes   comp cycles[*]  measured performance
# ====================================================================
# thumb         304             3212            4420
# armv4-small   392/+29%        1958/+64%       2250/+96%
# armv4-compact 740/+89%        1552/+26%       1840/+22%
# armv4-large   1420/+92%       1307/+19%       1370/+34%[***]
# full unroll   ~5100/+260%     ~1260/+4%       ~1300/+5%
# ====================================================================
# thumb         = same as 'small' but in Thumb instructions[**] and
#                 with recurring code in two private functions;
# small         = detached Xload/update, loops are folded;
# compact       = detached Xload/update, 5x unroll;
# large         = interleaved Xload/update, 5x unroll;
# full unroll   = interleaved Xload/update, full unroll, estimated[!];
#
# [*]   Manually counted instructions in "grand" loop body. Measured
#       performance is affected by prologue and epilogue overhead,
#       i-cache availability, branch penalties, etc.
# [**]  While each Thumb instruction is half the size, they are not as
#       diverse as ARM ones: e.g., there are only two arithmetic
#       instructions with 3 arguments, no [fixed] rotate, addressing
#       modes are limited. As a result it takes more instructions to do
#       the same job in Thumb, therefore the code is never twice as
#       small and always slower.
# [***] which is also ~35% better than compiler generated code. Dual-
#       issue Cortex A8 core was measured to process input block in
#       ~990 cycles.

# August 2010.
#
# Rescheduling for dual-issue pipeline resulted in 13% improvement on
# Cortex A8 core and in absolute terms ~870 cycles per input block
# [or 13.6 cycles per byte].

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 10%
# improvement on Cortex A8 core and 12.2 cycles per byte.

# September 2013.
#
# Add NEON implementation (see sha1-586.pl for background info). On
# Cortex A8 it was measured to process one byte in 6.7 cycles or >80%
# faster than integer-only code. Because [fully unrolled] NEON code
# is ~2.5x larger and there are some redundant instructions executed
# when processing last block, improvement is not as big for smallest
# blocks, only ~30%. Snapdragon S4 is a tad faster, 6.4 cycles per
# byte, which is also >80% faster than integer-only code.

# May 2014.
#
# Add ARMv8 code path performing at 2.35 cpb on Apple A7.
# Select the output file: consume command-line arguments until one looks
# like "name.ext" (word characters/dashes, a dot, an extension).  Anything
# in front of it is discarded.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT to the selected file.  When no file name was supplied the
# generated assembly simply goes to the inherited STDOUT.  A failure to
# create the file is fatal instead of being silently ignored.
if (defined($output) && $output ne "") {
	open STDOUT,'>',$output or die "can't open $output: $!";
}

# Register allocation for the integer-only code path.
$ctx="r0";
$inp="r1";
$len="r2";
$a="r3";
$b="r4";
$c="r5";
$d="r6";
$e="r7";
$K="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
$Xi="r14";
@V=($a,$b,$c,$d,$e);

# Xupdate(a,b,c,d,e[,opt1[,opt2]])
#
# Append the part shared by rounds 16..79 to $code: load four earlier
# schedule words (15, 13, 7 and 2 words above $Xi), XOR them, rotate right
# by 31 (i.e. left by 1), push the result below $Xi with pre-decrement and
# add it to E.  K and ROR(A,27) are added to E along the way.
#
#   a..e       - register names holding the working variables this round
#   opt1,opt2  - optional instruction strings spliced in verbatim; callers
#                use them for the round-specific part of F.  An omitted
#                one leaves a comment-only line in the output.
sub Xupdate {
my ($a,$b,$c,$d,$e,$opt1,$opt2)=@_;
$code.=<<___;
	ldr	$t0,[$Xi,#15*4]
	ldr	$t1,[$Xi,#13*4]
	ldr	$t2,[$Xi,#7*4]
	add	$e,$K,$e,ror#2			@ E+=K_xx_xx
	ldr	$t3,[$Xi,#2*4]
	eor	$t0,$t0,$t1
	eor	$t2,$t2,$t3			@ 1 cycle stall
	eor	$t1,$c,$d			@ F_xx_xx
	mov	$t0,$t0,ror#31
	add	$e,$e,$a,ror#27			@ E+=ROR(A,27)
	eor	$t0,$t0,$t2,ror#31
	str	$t0,[$Xi,#-4]!
	$opt1				@ F_xx_xx
	$opt2				@ F_xx_xx
	add	$e,$e,$t0			@ E+=X[i]
___
}

# BODY_00_15(a,b,c,d,e)
#
# Append one of rounds 0..15 to $code.  The input word is fetched straight
# from $inp (byte by byte before ARMv7, otherwise a word load followed by
# a byte swap on little-endian), stored below $Xi and mixed into E together
# with K, ROR(A,27) and F_00_19(B,C,D).
sub BODY_00_15 {
my ($a,$b,$c,$d,$e)=@_;
$code.=<<___;
#if __ARM_ARCH__<7
	ldrb	$t1,[$inp,#2]
	ldrb	$t0,[$inp,#3]
	ldrb	$t2,[$inp,#1]
	add	$e,$K,$e,ror#2			@ E+=K_00_19
	ldrb	$t3,[$inp],#4
	orr	$t0,$t0,$t1,lsl#8
	eor	$t1,$c,$d			@ F_xx_xx
	orr	$t0,$t0,$t2,lsl#16
	add	$e,$e,$a,ror#27			@ E+=ROR(A,27)
	orr	$t0,$t0,$t3,lsl#24
#else
	ldr	$t0,[$inp],#4			@ handles unaligned
	add	$e,$K,$e,ror#2			@ E+=K_00_19
	eor	$t1,$c,$d			@ F_xx_xx
	add	$e,$e,$a,ror#27			@ E+=ROR(A,27)
#ifdef __ARMEL__
	rev	$t0,$t0				@ byte swap
#endif
#endif
	and	$t1,$b,$t1,ror#2
	add	$e,$e,$t0			@ E+=X[i]
	eor	$t1,$t1,$d,ror#2		@ F_00_19(B,C,D)
	str	$t0,[$Xi,#-4]!
	add	$e,$e,$t1			@ E+=F_00_19(B,C,D)
___
}

# BODY_16_19(a,b,c,d,e)
#
# Append one of rounds 16..19: schedule update via Xupdate, then the same
# F_00_19 as in BODY_00_15.
sub BODY_16_19 {
my ($a,$b,$c,$d,$e)=@_;
	&Xupdate(@_,"and $t1,$b,$t1,ror#2");
$code.=<<___;
	eor	$t1,$t1,$d,ror#2		@ F_00_19(B,C,D)
	add	$e,$e,$t1			@ E+=F_00_19(B,C,D)
___
}

# BODY_20_39(a,b,c,d,e)
#
# Append one round that uses the XOR-only F.  The emitted loop is shared
# by rounds 20..39 and 60..79 (see label .L_20_39_or_60_79).
sub BODY_20_39 {
my ($a,$b,$c,$d,$e)=@_;
	&Xupdate(@_,"eor $t1,$b,$t1,ror#2");
$code.=<<___;
	add	$e,$e,$t1			@ E+=F_20_39(B,C,D)
___
}

# BODY_40_59(a,b,c,d,e)
#
# Append one of rounds 40..59.  F is accumulated into E as two separate
# terms: B&(C^D), computed in $t1, and C&D, computed in $t2.
sub BODY_40_59 {
my ($a,$b,$c,$d,$e)=@_;
	&Xupdate(@_,"and $t1,$b,$t1,ror#2","and $t2,$c,$d");
$code.=<<___;
	add	$e,$e,$t1			@ E+=F_40_59(B,C,D)
	add	$e,$e,$t2,ror#2
___
}

# Prologue of sha1_block_data_order.  When built for ARMv7 or later the
# entry point first consults OPENSSL_armcap_P and branches to the ARMv8 or
# NEON implementation if the corresponding capability bit is set.
$code=<<___;
#include "arm_arch.h"

.text
.code	32

.global	sha1_block_data_order
.type	sha1_block_data_order,%function

.align	5
sha1_block_data_order:
#if __ARM_MAX_ARCH__>=7
	sub	r3,pc,#8		@ sha1_block_data_order
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#ARMV8_SHA1
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	stmdb	sp!,{r4-r12,lr}
	add	$len,$inp,$len,lsl#6	@ $len to point at the end of $inp
	ldmia	$ctx,{$a,$b,$c,$d,$e}
.Lloop:
	ldr	$K,.LK_00_19
	mov	$Xi,sp
	sub	sp,sp,#15*4
	mov	$c,$c,ror#30
	mov	$d,$d,ror#30
	mov	$e,$e,ror#30		@ [6]
.L_00_15:
___
# Rounds 0..14: five round bodies are emitted once; the generated code
# loops over them until $Xi reaches sp.
for($i=0;$i<5;$i++) {
	&BODY_00_15(@V);	unshift(@V,pop(@V));
}
$code.=<<___;
	teq	$Xi,sp
	bne	.L_00_15		@ [((11+4)*5+2)*3]
	sub	sp,sp,#25*4
___
# Round 15 and rounds 16..19 are emitted straight-line.
	&BODY_00_15(@V);	unshift(@V,pop(@V));
	&BODY_16_19(@V);	unshift(@V,pop(@V));
	&BODY_16_19(@V);	unshift(@V,pop(@V));
	&BODY_16_19(@V);	unshift(@V,pop(@V));
	&BODY_16_19(@V);	unshift(@V,pop(@V));
$code.=<<___;

	ldr	$K,.LK_20_39		@ [+15+16*4]
	cmn	sp,#0			@ [+3], clear carry to denote 20_39
.L_20_39_or_60_79:
___
# One loop body serves rounds 20..39 and 60..79; the carry flag tells the
# generated code which of the two passes it is in.
for($i=0;$i<5;$i++) {
	&BODY_20_39(@V);	unshift(@V,pop(@V));
}
$code.=<<___;
	teq	$Xi,sp			@ preserve carry
	bne	.L_20_39_or_60_79	@ [+((12+3)*5+2)*4]
	bcs	.L_done			@ [+((12+3)*5+2)*4], spare 300 bytes

	ldr	$K,.LK_40_59
	sub	sp,sp,#20*4		@ [+2]
.L_40_59:
___
for($i=0;$i<5;$i++) {
	&BODY_40_59(@V);	unshift(@V,pop(@V));
}
# Tail of the integer-only routine: second pass through the shared loop,
# context update, return sequence and the constant table used by all
# three code paths.
$code.=<<___;
	teq	$Xi,sp
	bne	.L_40_59		@ [+((12+5)*5+2)*4]

	ldr	$K,.LK_60_79
	sub	sp,sp,#20*4
	cmp	sp,#0			@ set carry to denote 60_79
	b	.L_20_39_or_60_79	@ [+4], spare 300 bytes
.L_done:
	add	sp,sp,#80*4		@ "deallocate" stack frame
	ldmia	$ctx,{$K,$t0,$t1,$t2,$t3}
	add	$a,$K,$a
	add	$b,$t0,$b
	add	$c,$t1,$c,ror#2
	add	$d,$t2,$d,ror#2
	add	$e,$t3,$e,ror#2
	stmia	$ctx,{$a,$b,$c,$d,$e}
	teq	$inp,$len
	bne	.Lloop			@ [+18], total 1307

#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha1_block_data_order,.-sha1_block_data_order

.align	5
.LK_00_19:	.word	0x5a827999
.LK_20_39:	.word	0x6ed9eba1
.LK_40_59:	.word	0x8f1bbcdc
.LK_60_79:	.word	0xca62c1d6
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha1_block_data_order
#endif
.asciz	"SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	5
___
#####################################################################
# NEON stuff
#
{{{
# Lexical register map for the NEON path; it shadows the integer-path
# names of the same spelling for the extent of this {{{ }}} scope.
my @V=($a,$b,$c,$d,$e);
my ($K_XX_XX,$Ki,$t0,$t1,$Xfer,$saved_sp)=map("r$_",(8..12,14));
my $Xi=4;
my @X=map("q$_",(8..11,0..3));
my @Tx=("q12","q13");
my ($K,$zero)=("q14","q15");
my $j=0;

# Calls to otherwise undefined subs (e.g. &vext_8, &add) land here.  The
# sub name becomes the mnemonic, with its first '_' turned into '.'; the
# last argument gets a '#' prefix when it is numeric; the resulting
# instruction line is appended to $code.
sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

# body_00_19/body_20_39/body_40_59 return one scalar round as a list of
# Perl snippets.  Each snippet emits an instruction when eval'ed; the
# first one also binds $a..$e from @V and the last one advances $j and
# rotates @V.  The Xupdate_*/Xloop routines below eval them one at a time
# in between NEON instructions.
sub body_00_19 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.		# '$code.="@ $j\n";'.
	'&bic	($t0,$d,$b)',
	'&add	($e,$e,$Ki)',		# e+=X[i]+K
	'&and	($t1,$c,$b)',
	'&ldr	($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))',
	'&add	($e,$e,$a,"ror#27")',	# e+=ROR(A,27)
	'&eor	($t1,$t1,$t0)',		# F_00_19
	'&mov	($b,$b,"ror#2")',	# b=ROR(b,2)
	'&add	($e,$e,$t1);'.		# e+=F_00_19
	'$j++;	unshift(@V,pop(@V));'
	)
}
sub body_20_39 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.		# '$code.="@ $j\n";'.
	'&eor	($t0,$b,$d)',
	'&add	($e,$e,$Ki)',		# e+=X[i]+K
	'&ldr	($Ki,sprintf "[sp,#%d]",4*(($j+1)&15)) if ($j<79)',
	'&eor	($t1,$t0,$c)',		# F_20_39
	'&add	($e,$e,$a,"ror#27")',	# e+=ROR(A,27)
	'&mov	($b,$b,"ror#2")',	# b=ROR(b,2)
	'&add	($e,$e,$t1);'.		# e+=F_20_39
	'$j++;	unshift(@V,pop(@V));'
	)
}
sub body_40_59 () {
	(
	'($a,$b,$c,$d,$e)=@V;'.		# '$code.="@ $j\n";'.
	'&add	($e,$e,$Ki)',		# e+=X[i]+K
	'&and	($t0,$c,$d)',
	'&ldr	($Ki,sprintf "[sp,#%d]",4*(($j+1)&15))',
	'&add	($e,$e,$a,"ror#27")',	# e+=ROR(A,27)
	'&eor	($t1,$c,$d)',
	'&add	($e,$e,$t0)',
	'&and	($t1,$t1,$b)',
	'&mov	($b,$b,"ror#2")',	# b=ROR(b,2)
	'&add	($e,$e,$t1);'.		# e+=F_40_59
	'$j++;	unshift(@V,pop(@V));'
	)
}

# Xupdate_16_31(\&body)
#
# Emit the NEON schedule update used for the first four vectors after the
# initial load, interleaved with four scalar rounds produced by $body.
# The local $a..$e receive the register names assigned by the eval'ed
# snippets.
sub Xupdate_16_31 ()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e);

	&vext_8		(@X[0],@X[-4&7],@X[-3&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@Tx[1],@X[-1&7],$K);
	 eval(shift(@insns));
	  &vld1_32	("{$K\[]}","[$K_XX_XX,:32]!")	if ($Xi%5==0);
	 eval(shift(@insns));
	&vext_8		(@Tx[0],@X[-1&7],$zero,4);	# "X[-3]", 3 words
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		(@Tx[0],@Tx[0],@X[0]);		# "X[0]"^="X[-3]"^"X[-8]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vst1_32	("{@Tx[1]}","[$Xfer,:128]!");	# X[]+K xfer
	  &sub		($Xfer,$Xfer,64)		if ($Xi%4==0);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		(@Tx[1],$zero,@Tx[0],4);	# "X[0]"<<96, extract one dword
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@Tx[0],@Tx[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsri_32	(@X[0],@Tx[0],31);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	(@Tx[0],@Tx[1],30);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshl_u32	(@Tx[1],@Tx[1],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		(@X[0],@X[0],@Tx[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		(@X[0],@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2

	foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

# Xupdate_32_79(\&body)
#
# Emit the shorter NEON schedule update used from the fifth vector on,
# interleaved with four scalar rounds produced by $body.
sub Xupdate_32_79 ()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e);

	&vext_8		(@Tx[0],@X[-2&7],@X[-1&7],8);	# compose "X[-6]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@Tx[1],@X[-1&7],$K);
	 eval(shift(@insns));
	  &vld1_32	("{$K\[]}","[$K_XX_XX,:32]!")	if ($Xi%5==0);
	 eval(shift(@insns));
	&veor		(@Tx[0],@Tx[0],@X[0]);		# "X[-6]"^="X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	(@X[0],@Tx[0],30);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vst1_32	("{@Tx[1]}","[$Xfer,:128]!");	# X[]+K xfer
	  &sub		($Xfer,$Xfer,64)		if ($Xi%4==0);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	(@X[0],@Tx[0],2);		# "X[0]"="X[-6]"<<<2

	foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

# Xuplast_80(\&body)
#
# Emit the transfer of the last X[]+K vector and start loading the next
# input block, interleaved with four scalar rounds.  The emitted teq/subeq
# pair steps $inp back by 64 when it already equals $len, so the load
# re-reads the last block instead of running past the input.  Resets $Xi
# for the Xloop calls that follow.
sub Xuplast_80 ()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e);

	&vadd_i32	(@Tx[1],@X[-1&7],$K);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vst1_32	("{@Tx[1]}","[$Xfer,:128]!");
	&sub		($Xfer,$Xfer,64);

	&teq		($inp,$len);
	&sub		($K_XX_XX,$K_XX_XX,16);	# rewind $K_XX_XX
	&subeq		($inp,$inp,64);		# reload last block to avoid SEGV
	&vld1_8		("{@X[-4&7]-@X[-3&7]}","[$inp]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_8		("{@X[-2&7]-@X[-1&7]}","[$inp]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$K\[]}","[$K_XX_XX,:32]!");	# load K_00_19
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[-4&7],@X[-4&7]);

	foreach (@insns) { eval; }		# remaining instructions

  $Xi=0;
}

# Xloop(\&body)
#
# Emit byte swap, +K and transfer of one vector of the freshly loaded
# block, interleaved with four scalar rounds of the current block.
sub Xloop()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e);

	&vrev32_8	(@X[($Xi-3)&7],@X[($Xi-3)&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[$Xi&7],@X[($Xi-4)&7],$K);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vst1_32	("{@X[$Xi&7]}","[$Xfer,:128]!");# X[]+K xfer to IALU

	foreach (@insns) { eval; }

  $Xi++;
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.type	sha1_block_data_order_neon,%function
.align	4
sha1_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}
	add	$len,$inp,$len,lsl#6	@ $len to point at the end of $inp
	@ dmb				@ errata #451034 on early Cortex A8
	@ vstmdb	sp!,{d8-d15}	@ ABI specification says so
	mov	$saved_sp,sp
	sub	sp,sp,#64		@ alloca
	adr	$K_XX_XX,.LK_00_19
	bic	sp,sp,#15		@ align for 128-bit stores

	ldmia	$ctx,{$a,$b,$c,$d,$e}	@ load context
	mov	$Xfer,sp

	vld1.8		{@X[-4&7]-@X[-3&7]},[$inp]!	@ handles unaligned
	veor		$zero,$zero,$zero
	vld1.8		{@X[-2&7]-@X[-1&7]},[$inp]!
	vld1.32		{${K}\[]},[$K_XX_XX,:32]!	@ load K_00_19
	vrev32.8	@X[-4&7],@X[-4&7]		@ yes, even on
	vrev32.8	@X[-3&7],@X[-3&7]		@ big-endian...
	vrev32.8	@X[-2&7],@X[-2&7]
	vadd.i32	@X[0],@X[-4&7],$K
	vrev32.8	@X[-1&7],@X[-1&7]
	vadd.i32	@X[1],@X[-3&7],$K
	vst1.32		{@X[0]},[$Xfer,:128]!
	vadd.i32	@X[2],@X[-2&7],$K
	vst1.32		{@X[1]},[$Xfer,:128]!
	vst1.32		{@X[2]},[$Xfer,:128]!
	ldr		$Ki,[sp]			@ big RAW stall

.Loop_neon:
___
# Every call below consumes four scalar rounds; the twenty calls together
# cover all 80 rounds of one block.
	&Xupdate_16_31(\&body_00_19);
	&Xupdate_16_31(\&body_00_19);
	&Xupdate_16_31(\&body_00_19);
	&Xupdate_16_31(\&body_00_19);
	&Xupdate_32_79(\&body_00_19);
	&Xupdate_32_79(\&body_20_39);
	&Xupdate_32_79(\&body_20_39);
	&Xupdate_32_79(\&body_20_39);
	&Xupdate_32_79(\&body_20_39);
	&Xupdate_32_79(\&body_20_39);
	&Xupdate_32_79(\&body_40_59);
	&Xupdate_32_79(\&body_40_59);
	&Xupdate_32_79(\&body_40_59);
	&Xupdate_32_79(\&body_40_59);
	&Xupdate_32_79(\&body_40_59);
	&Xupdate_32_79(\&body_20_39);
	&Xuplast_80(\&body_20_39);
	&Xloop(\&body_20_39);
	&Xloop(\&body_20_39);
	&Xloop(\&body_20_39);
$code.=<<___;
	ldmia	$ctx,{$Ki,$t0,$t1,$Xfer}	@ accumulate context
	add	$a,$a,$Ki
	ldr	$Ki,[$ctx,#16]
	add	$b,$b,$t0
	add	$c,$c,$t1
	add	$d,$d,$Xfer
	moveq	sp,$saved_sp
	add	$e,$e,$Ki
	ldrne	$Ki,[sp]
	stmia	$ctx,{$a,$b,$c,$d,$e}
	addne	$Xfer,sp,#3*16
	bne	.Loop_neon

	@ vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r12,pc}
.size	sha1_block_data_order_neon,.-sha1_block_data_order_neon
#endif
___
}}}
#####################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$E,$E0,$E1)=map("q$_",(0..3));
my @MSG=map("q$_",(4..7));
my @Kxx=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE)=map("q$_",(12..14));

# NOTE: $j in the template below is the package variable (the NEON
# section's "my $j" went out of scope with its closing braces).  Nothing
# visible in this file has assigned it at this point, so @Kxx[$j]
# interpolates as @Kxx[0].
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.type	sha1_block_data_order_armv8,%function
.align	5
sha1_block_data_order_armv8:
.LARMv8:
	vstmdb	sp!,{d8-d15}		@ ABI specification says so

	veor	$E,$E,$E
	adr	r3,.LK_00_19
	vld1.32	{$ABCD},[$ctx]!
	vld1.32	{$E\[0]},[$ctx]
	sub	$ctx,$ctx,#16
	vld1.32	{@Kxx[0]\[]},[r3,:32]!
	vld1.32	{@Kxx[1]\[]},[r3,:32]!
	vld1.32	{@Kxx[2]\[]},[r3,:32]!
	vld1.32	{@Kxx[3]\[]},[r3,:32]

.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]

	vadd.i32	$W0,@Kxx[0],@MSG[0]
	 vrev32.8	@MSG[2],@MSG[2]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	 subs		$len,$len,#1

	vadd.i32	$W1,@Kxx[0],@MSG[1]
	 vrev32.8	@MSG[3],@MSG[3]
	sha1h		$E1,$ABCD		@ 0
	sha1c		$ABCD,$E,$W0
	 vadd.i32	$W0,@Kxx[$j],@MSG[2]
	 sha1su0	@MSG[0],@MSG[1],@MSG[2]
___
# Quad-round steps 1..16.  $f selects sha1c/sha1p/sha1m/sha1p by $i/5; the
# E and W register pairs swap roles and @MSG rotates after every step; $j
# moves on to the next constant vector whenever ($i+3)%5 is 0.
for ($j=0,$i=1;$i<20-3;$i++) {
my $f=("c","p","m","p")[$i/5];
$code.=<<___;
	sha1h		$E0,$ABCD		@ $i
	sha1$f		$ABCD,$E1,$W1
	 vadd.i32	$W1,@Kxx[$j],@MSG[3]
	 sha1su1	@MSG[0],@MSG[3]
___
$code.=<<___ if ($i<20-4);
	 sha1su0	@MSG[1],@MSG[2],@MSG[3]
___
	($E0,$E1)=($E1,$E0);	($W0,$W1)=($W1,$W0);
	push(@MSG,shift(@MSG));	$j++ if ((($i+3)%5)==0);
}
$code.=<<___;
	sha1h		$E0,$ABCD		@ $i
	sha1p		$ABCD,$E1,$W1
	 vadd.i32	$W1,@Kxx[$j],@MSG[3]

	sha1h		$E1,$ABCD		@ 18
	sha1p		$ABCD,$E0,$W0

	sha1h		$E0,$ABCD		@ 19
	sha1p		$ABCD,$E1,$W1

	vadd.i32	$E,$E,$E0
	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	bne		.Loop_v8

	vst1.32		{$ABCD},[$ctx]!
	vst1.32		{$E\[0]},[$ctx]

	vldmia	sp!,{d8-d15}
	ret					@ bx lr
.size	sha1_block_data_order_armv8,.-sha1_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
#endif
___

{   my  %opcode = (
	"sha1c"		=> 0xf2000c40,	"sha1p"		=> 0xf2100c40,
	"sha1m"		=> 0xf2200c40,	"sha1su0"	=> 0xf2300c40,
	"sha1h"		=> 0xf3b902c0,	"sha1su1"	=> 0xf3ba0380	);

    # unsha1(mnemonic, operands)
    #
    # Turn a sha1* instruction with two or three q-register operands into
    # a .byte directive holding its encoding, with the original text kept
    # as a trailing comment.  Returns nothing useful when the operand
    # string does not match.
    sub unsha1 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }
}

# Post-process the accumulated text line by line: expand q-register lane
# lists into d-register pairs, replace sha1* mnemonics with raw bytes, and
# rewrite "ret" as "bx lr" (or an existing "bx lr" as its opcode word).
foreach (split($/,$code)) {
	s/{q([0-9]+)\[\]}/sprintf "{d%d[],d%d[]}",2*$1,2*$1+1/eo	or
	s/{q([0-9]+)\[0\]}/sprintf "{d%d[0]}",2*$1/eo;

	s/\b(sha1\w+)\s+(q.*)/unsha1($1,$2)/geo;

	s/\bret\b/bx lr/o		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/o;	# make it possible to compile with -march=armv4

	print $_,$/;
}

# Closing flushes the buffered output; a failure here (e.g. disk full)
# would otherwise leave a truncated file behind without any diagnostic.
close STDOUT or die "error closing STDOUT: $!";