#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA256/512 for ARMv8.
#
# Performance in cycles per processed byte and improvement coefficient
# over code generated with "default" compiler:
#
#               SHA256-hw       SHA256(*)       SHA512
# Apple A7      1.97            10.5 (+33%)     6.73 (-1%(**))
# Cortex-A53    2.38            15.5 (+115%)    10.0 (+150%(***))
# Cortex-A57    2.31            11.6 (+86%)     7.51 (+260%(***))
# Denver        2.01            10.5 (+26%)     6.70 (+8%)
# X-Gene                        20.0 (+100%)    12.8 (+300%(***))
#
# (*)   Software SHA256 results are of lesser relevance, presented
#       mostly for informational purposes.
# (**)  The result is a trade-off: it's possible to improve it by
#       10% (or by 1 cycle per round), but at the cost of 20% loss
#       on Cortex-A53 (or by 4 cycles per round).
# (***) Super-impressive coefficients over gcc-generated code are
#       indication of some compiler "pathology", most notably code
#       generated with -mgeneral-regs-only is significantly faster
#       and the gap is only 40-90%.

# Command line: <flavour> <output-file>.
# $flavour is accepted for interface compatibility but is not consulted
# anywhere in this script; $output names the assembly file to write and,
# via the /512/ test below, also selects SHA-512 versus SHA-256.
$flavour=shift;
$output=shift;

# Refuse to run without an output name: a three-argument open with an
# undefined file name would silently open an anonymous temporary file.
die "usage: $0 flavour output-file\n"
	unless (defined($output) && length($output));

# Three-argument, checked open.  A failed open on STDOUT would otherwise
# leave the handle closed and every later print would vanish silently.
open STDOUT,">",$output or die "can't open $output: $!";

# Per-algorithm parameters: word size in bytes ($SZ), rotate/shift
# amounts for the big and small sigma functions, round count and the
# register-name prefix ("x" for 64-bit, "w" for 32-bit registers).
if ($output =~ /512/) {
	$BITS=512;
	$SZ=8;
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=(1,  8, 7);
	@sigma1=(19,61, 6);
	$rounds=80;
	$reg_t="x";
} else {
	$BITS=256;
	$SZ=4;
	@Sigma0=( 2,13,22);
	@Sigma1=( 6,11,25);
	@sigma0=( 7,18, 3);
	@sigma1=(17,19,10);
	$rounds=64;
	$reg_t="w";
}

$func="sha${BITS}_block_data_order";

# Pointer-sized arguments and the K-table pointer always use x registers.
($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));

# @X: the 16 message-schedule registers; @V: the 8 working variables
# (rotated by the caller after every round); $t0..$t3: temporaries.
@X=map("$reg_t$_",(3..15,0..2));
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27));
($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28));

# BODY_00_xx($i,$a,$b,$c,$d,$e,$f,$g,$h)
#
# Append the assembly for one round to $code.
#   $i       round index as passed by the generator loops below (0..31)
#   $a..$h   register names currently holding the working variables
#
# Rounds with $i<16 byte-swap the input word (little-endian builds only)
# and interleave loads of further input; rounds with $i>=15 additionally
# emit the message-schedule update for @X[($i+1)&15].  The global
# temporaries $t2 and $t3 swap roles on every call (see the last line).
sub BODY_00_xx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my $j=($i+1)&15;
my ($T0,$T1,$T2)=(@X[($i-8)&15],@X[($i-9)&15],@X[($i-10)&15]);
   $T0=$X[$i+3] if ($i<11);

$code.=<<___	if ($i<16);
#ifndef	__ARMEB__
	rev	@X[$i],@X[$i]			// $i
#endif
___
$code.=<<___	if ($i<13 && ($i&1));
	ldp	@X[$i+1],@X[$i+2],[$inp],#2*$SZ
___
$code.=<<___	if ($i==13);
	ldp	@X[14],@X[15],[$inp]
___
$code.=<<___	if ($i>=14);
	ldr	@X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
___
$code.=<<___	if ($i>0 && $i<16);
	add	$a,$a,$t1			// h+=Sigma0(a)
___
$code.=<<___	if ($i>=11);
	str	@X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
___
# While ARMv8 specifies merged rotate-n-logical operation such as
# 'eor x,y,z,ror#n', it was found to negatively affect performance
# on Apple A7. The reason seems to be that it requires even 'y' to
# be available earlier. This means that such merged instruction is
# not necessarily best choice on critical path... On the other hand
# Cortex-A5x handles merged instructions much better than disjoint
# rotate and logical... See (**) footnote above.
$code.=<<___	if ($i<15);
	ror	$t0,$e,#$Sigma1[0]
	add	$h,$h,$t2			// h+=K[i]
	eor	$T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
	and	$t1,$f,$e
	bic	$t2,$g,$e
	add	$h,$h,@X[$i&15]			// h+=X[i]
	orr	$t1,$t1,$t2			// Ch(e,f,g)
	eor	$t2,$a,$b			// a^b, b^c in next round
	eor	$t0,$t0,$T0,ror#$Sigma1[1]	// Sigma1(e)
	ror	$T0,$a,#$Sigma0[0]
	add	$h,$h,$t1			// h+=Ch(e,f,g)
	eor	$t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
	add	$h,$h,$t0			// h+=Sigma1(e)
	and	$t3,$t3,$t2			// (b^c)&=(a^b)
	add	$d,$d,$h			// d+=h
	eor	$t3,$t3,$b			// Maj(a,b,c)
	eor	$t1,$T0,$t1,ror#$Sigma0[1]	// Sigma0(a)
	add	$h,$h,$t3			// h+=Maj(a,b,c)
	ldr	$t3,[$Ktbl],#$SZ		// *K++, $t2 in next round
	//add	$h,$h,$t1			// h+=Sigma0(a)
___
$code.=<<___	if ($i>=15);
	ror	$t0,$e,#$Sigma1[0]
	add	$h,$h,$t2			// h+=K[i]
	ror	$T1,@X[($j+1)&15],#$sigma0[0]
	and	$t1,$f,$e
	ror	$T2,@X[($j+14)&15],#$sigma1[0]
	bic	$t2,$g,$e
	ror	$T0,$a,#$Sigma0[0]
	add	$h,$h,@X[$i&15]			// h+=X[i]
	eor	$t0,$t0,$e,ror#$Sigma1[1]
	eor	$T1,$T1,@X[($j+1)&15],ror#$sigma0[1]
	orr	$t1,$t1,$t2			// Ch(e,f,g)
	eor	$t2,$a,$b			// a^b, b^c in next round
	eor	$t0,$t0,$e,ror#$Sigma1[2]	// Sigma1(e)
	eor	$T0,$T0,$a,ror#$Sigma0[1]
	add	$h,$h,$t1			// h+=Ch(e,f,g)
	and	$t3,$t3,$t2			// (b^c)&=(a^b)
	eor	$T2,$T2,@X[($j+14)&15],ror#$sigma1[1]
	eor	$T1,$T1,@X[($j+1)&15],lsr#$sigma0[2]	// sigma0(X[i+1])
	add	$h,$h,$t0			// h+=Sigma1(e)
	eor	$t3,$t3,$b			// Maj(a,b,c)
	eor	$t1,$T0,$a,ror#$Sigma0[2]	// Sigma0(a)
	eor	$T2,$T2,@X[($j+14)&15],lsr#$sigma1[2]	// sigma1(X[i+14])
	add	@X[$j],@X[$j],@X[($j+9)&15]
	add	$d,$d,$h			// d+=h
	add	$h,$h,$t3			// h+=Maj(a,b,c)
	ldr	$t3,[$Ktbl],#$SZ		// *K++, $t2 in next round
	add	@X[$j],@X[$j],$T1
	add	$h,$h,$t1			// h+=Sigma0(a)
	add	@X[$j],@X[$j],$T2
___
	# The register just loaded with the next K word becomes "$t2" for
	# the following round, and the old $t2 (a^b) becomes its "$t3".
	($t2,$t3)=($t3,$t2);
}

$code.=<<___;
#include "arm_arch.h"

.text

.globl	$func
.type	$func,%function
.align	6
$func:
___
# SHA-256 only: branch to the crypto-extension implementation when the
# ARMV8_SHA256 bit is set in OPENSSL_armcap_P.
$code.=<<___	if ($SZ==4);
	ldr	x16,.LOPENSSL_armcap_P
	adr	x17,.LOPENSSL_armcap_P
	add	x16,x16,x17
	ldr	w16,[x16]
	tst	w16,#ARMV8_SHA256
	b.ne	.Lv8_entry
___
$code.=<<___;
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0

	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#4*$SZ

	ldp	$A,$B,[$ctx]				// load context
	ldp	$C,$D,[$ctx,#2*$SZ]
	ldp	$E,$F,[$ctx,#4*$SZ]
	add	$num,$inp,$num,lsl#`log(16*$SZ)/log(2)`	// end of input
	ldp	$G,$H,[$ctx,#6*$SZ]
	adr	$Ktbl,K$BITS
	stp	$ctx,$num,[x29,#96]

.Loop:
	ldp	@X[0],@X[1],[$inp],#2*$SZ
	ldr	$t2,[$Ktbl],#$SZ			// *K++
	eor	$t3,$B,$C				// magic seed
	str	$inp,[x29,#112]
___
# Rounds 0..15 are emitted once; rounds 16..31 form the body of
# .Loop_16_xx, which the generated code repeats until the zero
# terminator of the K table is loaded.  $i deliberately carries over
# from the first loop into the second.
for ($i=0;$i<16;$i++)	{ BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
$code.=".Loop_16_xx:\n";
for (;$i<32;$i++)	{ BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	cbnz	$t2,.Loop_16_xx

	ldp	$ctx,$num,[x29,#96]
	ldr	$inp,[x29,#112]
	sub	$Ktbl,$Ktbl,#`$SZ*($rounds+1)`	// rewind

	ldp	@X[0],@X[1],[$ctx]
	ldp	@X[2],@X[3],[$ctx,#2*$SZ]
	add	$inp,$inp,#14*$SZ			// advance input pointer
	ldp	@X[4],@X[5],[$ctx,#4*$SZ]
	add	$A,$A,@X[0]
	ldp	@X[6],@X[7],[$ctx,#6*$SZ]
	add	$B,$B,@X[1]
	add	$C,$C,@X[2]
	add	$D,$D,@X[3]
	stp	$A,$B,[$ctx]
	add	$E,$E,@X[4]
	add	$F,$F,@X[5]
	stp	$C,$D,[$ctx,#2*$SZ]
	add	$G,$G,@X[6]
	add	$H,$H,@X[7]
	cmp	$inp,$num
	stp	$E,$F,[$ctx,#4*$SZ]
	stp	$G,$H,[$ctx,#6*$SZ]
	b.ne	.Loop

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#4*$SZ
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#128
	ret
.size	$func,.-$func

.align	6
.type	K$BITS,%object
K$BITS:
___
# Round constants; the trailing zero word is the loop terminator tested
# by the "cbnz" after .Loop_16_xx.
$code.=<<___ if ($SZ==8);
	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
	.quad	0x3956c25bf348b538,0x59f111f1b605d019
	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
	.quad	0xd807aa98a3030242,0x12835b0145706fbe
	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
	.quad	0x06ca6351e003826f,0x142929670a0e6e70
	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
	.quad	0x81c2c92e47edaee6,0x92722c851482353b
	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
	.quad	0xd192e819d6ef5218,0xd69906245565a910
	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
	.quad	0x90befffa23631e28,0xa4506cebde82bde9
	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
	.quad	0xca273eceea26619c,0xd186b8c721c0c207
	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
	.quad	0x113f9804bef90dae,0x1b710b35131c471b
	.quad	0x28db77f523047d84,0x32caab7b40c72493
	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
	.quad	0	// terminator
___
$code.=<<___ if ($SZ==4);
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0	//terminator
___
$code.=<<___;
.size	K$BITS,.-K$BITS
.align	3
.LOPENSSL_armcap_P:
	.quad	OPENSSL_armcap_P-.
.asciz	"SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

# SHA-256 only: implementation built on the sha256h/sha256h2/sha256su0/
# sha256su1 instructions, entered through .Lv8_entry.
if ($SZ==4) {
my $Ktbl="x3";

my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
my @MSG=map("v$_.16b",(4..7));
my ($W0,$W1)=("v16.4s","v17.4s");
my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");

$code.=<<___;
.type	sha256_block_armv8,%function
.align	6
sha256_block_armv8:
.Lv8_entry:
	stp		x29,x30,[sp,#-16]!
	add		x29,sp,#0

	ld1.32		{$ABCD,$EFGH},[$ctx]
	adr		$Ktbl,K256

.Loop_hw:
	ld1		{@MSG[0]-@MSG[3]},[$inp],#64
	sub		$num,$num,#1
	ld1.32		{$W0},[$Ktbl],#16
	rev32		@MSG[0],@MSG[0]
	rev32		@MSG[1],@MSG[1]
	rev32		@MSG[2],@MSG[2]
	rev32		@MSG[3],@MSG[3]
	orr		$ABCD_SAVE,$ABCD,$ABCD		// offload
	orr		$EFGH_SAVE,$EFGH,$EFGH
___
# Twelve four-round groups that also advance the message schedule; the
# K-word registers and the message registers rotate after each group.
for($i=0;$i<12;$i++) {
$code.=<<___;
	ld1.32		{$W1},[$Ktbl],#16
	add.i32		$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	orr		$abcd,$ABCD,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
# Final four groups: no further schedule updates are emitted.
$code.=<<___;
	ld1.32		{$W1},[$Ktbl],#16
	add.i32		$W0,$W0,@MSG[0]
	orr		$abcd,$ABCD,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	ld1.32		{$W0},[$Ktbl],#16
	add.i32		$W1,$W1,@MSG[1]
	orr		$abcd,$ABCD,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	ld1.32		{$W1},[$Ktbl]
	add.i32		$W0,$W0,@MSG[2]
	sub		$Ktbl,$Ktbl,#$rounds*$SZ-16	// rewind
	orr		$abcd,$ABCD,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	add.i32		$W1,$W1,@MSG[3]
	orr		$abcd,$ABCD,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	add.i32		$ABCD,$ABCD,$ABCD_SAVE
	add.i32		$EFGH,$EFGH,$EFGH_SAVE

	cbnz		$num,.Loop_hw

	st1.32		{$ABCD,$EFGH},[$ctx]

	ldr		x29,[sp],#16
	ret
.size	sha256_block_armv8,.-sha256_block_armv8
___
}

$code.=<<___;
.comm	OPENSSL_armcap_P,4,4
___

{   my  %opcode = (
	"sha256h"	=> 0x5e004000,	"sha256h2"	=> 0x5e005000,
	"sha256su0"	=> 0x5e282800,	"sha256su1"	=> 0x5e006000	);

    # unsha256($mnemonic,$arg)
    #
    # Turn a sha256* instruction into an equivalent ".inst" directive,
    # keeping the original text as a trailing comment.
    #   $mnemonic  key into %opcode
    #   $arg       operand string: two or three q/v registers
    # The register numbers are OR-ed into the base opcode at bits 0, 5
    # and 16.  The third register is optional in the pattern (the
    # two-operand sha256su0 form); when absent it contributes 0.
    # Returns the directive text, or a false value when $arg does not
    # match the operand pattern.
    sub unsha256 {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5)|($3<<16),
			$mnemonic,$arg;
    }
}

# Post-process the accumulated text line by line and write it out.
foreach(split("\n",$code)) {

	# Evaluate `...` arithmetic embedded by the heredocs above.
	s/\`([^\`]*)\`/eval($1)/geo;

	# Replace sha256* mnemonics with raw .inst encodings.
	s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/geo;

	# Drop the ".32"/".i32" size suffix; lines that carried one have
	# their ".16b" arrangements rewritten to ".4s".
	s/\.\w?32\b//o		and s/\.16b/\.4s/go;
	# Single-lane ld1/st1 forms ("[0]" index) take ".s" instead of ".4s".
	m/(ld|st)1[^\[]+\[0\]/o	and s/\.4s/\.s/go;

	print $_,"\n";
}

# Buffered write errors only surface at close, so check it.
close STDOUT or die "error closing STDOUT: $!";