#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~17 cycles per processed byte.

# Usage: the first command-line argument that looks like "name.ext" is
# taken as the output file; arguments before it are skipped.  When no
# such argument is present the generated assembly goes to the inherited
# standard output.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
if (defined($output) && length($output)) {
    open(STDOUT, '>', $output) or die "can't open $output: $!";
}

# Register allocation.  The argument registers r0-r2 double as scratch
# registers ($t0, $t3, $t1), which is why the generated prologue pushes
# ctx/inp/len onto the stack and round 15 writes $inp back to the frame
# before $t3 (same register) starts being used.
$ctx="r0";	$t0="r0";
$inp="r1";	$t3="r1";
$len="r2";	$t1="r2";
$T1="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

# Rotate/shift amounts used by the Sigma0/Sigma1/sigma0/sigma1 sequences
# emitted below (the third sigma0/sigma1 entry is used with lsr, the rest
# with ror).
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);

# BODY_00_15($i, $a..$h)
#
# Appends the assembly for one round to $code.  $i is the round index
# and $a..$h are the register names currently playing the roles of the
# eight working variables.
#
# For $i<16 the round first fetches the next input word: a single ldr
# (followed by rev on little-endian ARMv7+) or four ldrb instructions
# assembled most-significant byte first.  For $i>=16 the word comes from
# BODY_16_XX, which leaves the final addition of $t3 to this routine.
# The word is stored at slot $i%16 of the on-stack X[] array.  The
# "#if $i..." lines are expanded by Perl into constant conditions that
# the C preprocessor resolves when the output is assembled.
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	ldr	$T1,[$inp],#4
#else
	ldrb	$T1,[$inp,#3]			@ $i
	ldrb	$t2,[$inp,#2]
	ldrb	$t1,[$inp,#1]
	ldrb	$t0,[$inp],#4
	orr	$T1,$T1,$t2,lsl#8
	orr	$T1,$T1,$t1,lsl#16
	orr	$T1,$T1,$t0,lsl#24
#endif
___
$code.=<<___;
	mov	$t0,$e,ror#$Sigma1[0]
	ldr	$t2,[$Ktbl],#4			@ *K256++
	eor	$t0,$t0,$e,ror#$Sigma1[1]
	eor	$t1,$f,$g
#if $i>=16
	add	$T1,$T1,$t3			@ from BODY_16_xx
#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	$T1,$T1
#endif
#if $i==15
	str	$inp,[sp,#17*4]			@ leave room for $t3
#endif
	eor	$t0,$t0,$e,ror#$Sigma1[2]	@ Sigma1(e)
	and	$t1,$t1,$e
	str	$T1,[sp,#`$i%16`*4]
	add	$T1,$T1,$t0
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	add	$T1,$T1,$h
	mov	$h,$a,ror#$Sigma0[0]
	add	$T1,$T1,$t1
	eor	$h,$h,$a,ror#$Sigma0[1]
	add	$T1,$T1,$t2
	eor	$h,$h,$a,ror#$Sigma0[2]	@ Sigma0(a)
#if $i>=15
	ldr	$t3,[sp,#`($i+2)%16`*4]		@ from BODY_16_xx
#endif
	orr	$t0,$a,$b
	and	$t1,$a,$b
	and	$t0,$t0,$c
	add	$h,$h,$T1
	orr	$t0,$t0,$t1			@ Maj(a,b,c)
	add	$d,$d,$T1
	add	$h,$h,$t0
___
}

# BODY_16_XX($i, $a..$h)
#
# Appends the message-schedule update for round $i (built from the
# on-stack X[] slots ($i+0), ($i+9) and ($i+14) modulo 16, plus
# X[($i+1)%16] which the previous round already loaded into $t3) and
# then the common round body via BODY_00_15.  The load of $t3 and the
# final "add $T1,$T1,$t3" are emitted as assembler comments here because
# BODY_00_15 performs them.
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t3,[sp,#`($i+1)%16`*4]		@ $i
	ldr	$t2,[sp,#`($i+14)%16`*4]
	mov	$t0,$t3,ror#$sigma0[0]
	ldr	$T1,[sp,#`($i+0)%16`*4]
	eor	$t0,$t0,$t3,ror#$sigma0[1]
	ldr	$t1,[sp,#`($i+9)%16`*4]
	eor	$t0,$t0,$t3,lsr#$sigma0[2]	@ sigma0(X[i+1])
	mov	$t3,$t2,ror#$sigma1[0]
	add	$T1,$T1,$t0
	eor	$t3,$t3,$t2,ror#$sigma1[1]
	add	$T1,$T1,$t1
	eor	$t3,$t3,$t2,lsr#$sigma1[2]	@ sigma1(X[i+14])
	@ add	$T1,$T1,$t3
___
	BODY_00_15(@_);
}

# Constant table and function prologue.  K256 (64 words = 256 bytes) is
# emitted directly in front of the function so that the code can locate
# it relative to its own address ("sub r3,pc,#8" followed by
# "sub $Ktbl,r3,#256").  The stack frame built here is X[16] followed by
# the saved ctx, inp and inp+len words.
$code=<<___;
#include "arm_arch.h"

.text
.code	32

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
	sub	r3,pc,#8		@ sha256_block_data_order
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256		@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
___
# Rounds 0..15 are fully unrolled.  After each round the register list
# is rotated by one position, so the register that was "h" becomes "a"
# for the next round and no register-to-register moves are needed.
for($i=0;$i<16;$i++)	{ BODY_00_15($i,@V); unshift(@V,pop(@V)); }
# Rounds 16 and up: one group of 16 unrolled rounds that the generated
# code repeats until the low byte of the K256 word loaded last ($t2)
# equals 0xf2, i.e. the final table entry 0xc67178f2 has been consumed.
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
# Loop control, accumulation of the working variables into the context,
# and the function epilogue.
$code.=<<___;
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2
	bne	.Lrounds_16_xx

	ldr	$T1,[sp,#16*4]		@ pull ctx
	ldr	$t0,[$T1,#0]
	ldr	$t1,[$T1,#4]
	ldr	$t2,[$T1,#8]
	add	$A,$A,$t0
	ldr	$t0,[$T1,#12]
	add	$B,$B,$t1
	ldr	$t1,[$T1,#16]
	add	$C,$C,$t2
	ldr	$t2,[$T1,#20]
	add	$D,$D,$t0
	ldr	$t0,[$T1,#24]
	add	$E,$E,$t1
	ldr	$t1,[$T1,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$T1,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4	@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size   sha256_block_data_order,.-sha256_block_data_order
.asciz  "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

# Evaluate the `...` arithmetic placeholders left in the template.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
print $code;
# Closing flushes the buffer; a failure here means the output is
# incomplete (e.g. disk full), so it must not pass silently.
close(STDOUT) or die "error closing STDOUT: $!";