#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that latter performs sub-optimally, nothing was done
# about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

# Register map for the integer-only code path.  Note that the scratch
# registers $t0..$t4 deliberately alias the argument registers: the
# arguments are consumed/spilled before the aliases are used.
$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

# SHA-256 rotation/shift constants (FIPS 180-4 Sigma/sigma functions).
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);

# Emit one SHA-256 round for the integer path.  For $i<16 it also emits
# the (endian-aware) message-word load.  Swaps $t2/$t3 at the end
# because Maj(a,b,c) is accumulated one round late.
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	rev	$t1,$t1
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	($t2,$t3)=($t3,$t2);
}

# Emit message-schedule expansion for rounds 16..63 (computes
# X[i] = X[i-16] + sigma0(X[i-15]) + X[i-7] + sigma1(X[i-2])),
# then falls through to the common round body via BODY_00_15.
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);
}

$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

.text
#if __ARM_ARCH__<7
.code	32
#else
.syntax unified
# ifdef __thumb2__
.thumb
# else
.code	32
# endif
#endif

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha256_block_data_order
#endif
.align	5

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
#if __ARM_ARCH__<7
	sub	r3,pc,#8		@ sha256_block_data_order
#else
	adr	r3,.
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#ARMV8_SHA256
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
# Unroll rounds 0..15 fully, then emit the 16-round loop body that is
# iterated four times at run time (K256 terminator word stops it).
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#if __ARM_ARCH__>=7
	ite	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4	@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

# Map a NEON q-register name to its low/high d-register half.
sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }

sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

# Emit NEON message-schedule update for four X[] words, interleaved
# with 16 scalar round-body instruction fragments supplied by $body.
sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}

# Emit byte-swap + K256 pre-addition for the final 16 rounds (no
# schedule expansion needed), interleaved with scalar round fragments.
sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}

# Return the scalar round body as a list of code fragments (strings to
# be eval'ed); Xupdate/Xpreload interleave these with NEON instructions.
sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha256_block_data_order_neon
.type	sha256_block_data_order_neon,%function
.align	4
sha256_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}

	sub	$H,sp,#16*4+16
	adr	$Ktbl,K256
	bic	$H,$H,#15		@ align for 128-bit stores
	mov	$t2,sp
	mov	sp,$H			@ alloca
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str		$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str		$inp,[sp,#68]
	mov		$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str		$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str		$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia		$ctx,{$A-$H}
	sub		$Xfer,$Xfer,#64
	ldr		$t1,[sp,#0]
	eor		$t2,$t2,$t2
	eor		$t3,$B,$C
	b		.L_00_48

.align	4
.L_00_48:
___
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
$code.=<<___;
	teq	$t1,#0				@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr		$inp,[sp,#68]
	ldr		$t0,[sp,#72]
	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	teq		$inp,$t0
	it		eq
	subeq		$inp,$inp,#64		@ avoid SEGV
	vld1.8		{@X[0]},[$inp]!		@ load next input block
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	it		ne
	strne		$inp,[sp,#68]
	mov		$Xfer,sp
___
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0			@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	ittte	ne
	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]			@ restore original sp
	itt	ne
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";

$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# ifdef __thumb2__
#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
# else
#  define INST(a,b,c,d)	.byte	a,b,c,d
# endif

.type	sha256_block_data_order_armv8,%function
.align	5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
# ifdef __thumb2__
	adr	$Ktbl,.LARMv8
	sub	$Ktbl,$Ktbl,#.LARMv8-K256
# else
	sub	$Ktbl,$Ktbl,#256+32
# endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
	teq		$inp,$len
___
# Twelve rounds-quads with message-schedule update...
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
# ...followed by four final quads that need no further expansion.
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub		$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	it		ne
	bne		.Loop_v8

	vst1.32		{$ABCD,$EFGH},[$ctx]

	ret		@ bx lr
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz	"SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm	OPENSSL_armcap_P,4,4
#endif
___

# Replay this script's own license header into the output as assembler
# comments ('#' becomes '@'); stops at the first non-comment line.
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

{   my  %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

    # Encode a sha256* mnemonic by hand for assemblers that lack the
    # SHA extension; emitted through the INST() byte macro.
    sub unsha256 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }
}

# Post-process: expand `...` arithmetic, substitute hand-encoded SHA
# instructions, and translate ret/bx for pre-ARMv4T compatibility.
foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx	lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT;			# enforce flush