--- sha256-armv4.pl (305152)
+++ sha256-armv4.pl (326663)

The only change is at line 208, marked below with - (deleted) and + (added).
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8#
9# Permission to use under GPL terms is granted.
10# ====================================================================
11
12# SHA256 block procedure for ARMv4. May 2007.
13
14# Performance is ~2x better than gcc 3.4 generated code and in "abso-
15# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
16# byte [on single-issue Xscale PXA250 core].
17
18# July 2010.
19#
20# Rescheduling for dual-issue pipeline resulted in 22% improvement on
21# Cortex A8 core and ~20 cycles per processed byte.
22
23# February 2011.
24#
25# Profiler-assisted and platform-specific optimization resulted in 16%
26# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
27
28# September 2013.
29#
30# Add NEON implementation. On Cortex A8 it was measured to process one
31# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
32# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
33# code (meaning that the latter performs sub-optimally; nothing was done
34# about it).
35
36# May 2014.
37#
38# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
39
40while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
41open STDOUT,">$output";
42
43$ctx="r0"; $t0="r0";
44$inp="r1"; $t4="r1";
45$len="r2"; $t1="r2";
46$T1="r3"; $t3="r3";
47$A="r4";
48$B="r5";
49$C="r6";
50$D="r7";
51$E="r8";
52$F="r9";
53$G="r10";
54$H="r11";
55@V=($A,$B,$C,$D,$E,$F,$G,$H);
56$t2="r12";
57$Ktbl="r14";
58
59@Sigma0=( 2,13,22);
60@Sigma1=( 6,11,25);
61@sigma0=( 7,18, 3);
62@sigma1=(17,19,10);
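# Rotation/shift amounts of the FIPS 180-4 SHA-256 functions:
#	Sigma0(x) = ROTR2(x)  ^ ROTR13(x) ^ ROTR22(x)
#	Sigma1(x) = ROTR6(x)  ^ ROTR11(x) ^ ROTR25(x)
#	sigma0(x) = ROTR7(x)  ^ ROTR18(x) ^ SHR3(x)
#	sigma1(x) = ROTR17(x) ^ ROTR19(x) ^ SHR10(x)
# The last element of each lower-case sigma is a logical shift, not a
# rotate, which is why the round code below pairs "ror" with "lsr".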
63
64sub BODY_00_15 {
65my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
66
67$code.=<<___ if ($i<16);
68#if __ARM_ARCH__>=7
69 @ ldr $t1,[$inp],#4 @ $i
70# if $i==15
71 str $inp,[sp,#17*4] @ make room for $t4
72# endif
73 eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
74 add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
75 eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
76 rev $t1,$t1
77#else
78 @ ldrb $t1,[$inp,#3] @ $i
79 add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
80 ldrb $t2,[$inp,#2]
81 ldrb $t0,[$inp,#1]
82 orr $t1,$t1,$t2,lsl#8
83 ldrb $t2,[$inp],#4
84 orr $t1,$t1,$t0,lsl#16
85# if $i==15
86 str $inp,[sp,#17*4] @ make room for $t4
87# endif
88 eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
89 orr $t1,$t1,$t2,lsl#24
90 eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
91#endif
92___
93$code.=<<___;
94 ldr $t2,[$Ktbl],#4 @ *K256++
95 add $h,$h,$t1 @ h+=X[i]
96 str $t1,[sp,#`$i%16`*4]
97 eor $t1,$f,$g
98 add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
99 and $t1,$t1,$e
100 add $h,$h,$t2 @ h+=K256[i]
101 eor $t1,$t1,$g @ Ch(e,f,g)
102 eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
103 add $h,$h,$t1 @ h+=Ch(e,f,g)
104#if $i==31
105 and $t2,$t2,#0xff
106 cmp $t2,#0xf2 @ done?
107#endif
108#if $i<15
109# if __ARM_ARCH__>=7
110 ldr $t1,[$inp],#4 @ prefetch
111# else
112 ldrb $t1,[$inp,#3]
113# endif
114 eor $t2,$a,$b @ a^b, b^c in next round
115#else
116 ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
117 eor $t2,$a,$b @ a^b, b^c in next round
118 ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
119#endif
120 eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
121 and $t3,$t3,$t2 @ (b^c)&=(a^b)
122 add $d,$d,$h @ d+=h
123 eor $t3,$t3,$b @ Maj(a,b,c)
124 add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
125 @ add $h,$h,$t3 @ h+=Maj(a,b,c)
126___
127 ($t2,$t3)=($t3,$t2);
128}
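# Each round above computes, in scheduling-friendly order,
#	h += Sigma1(e) + Ch(e,f,g) + K256[i] + X[i]
#	d += h;  h += Sigma0(a) + Maj(a,b,c)
# except that the Maj addition is deferred: its value is carried in
# $t2/$t3 (swapped every round) and folded into the following round
# as the "h+=Maj(a,b,c) from the past" add.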
129
130sub BODY_16_XX {
131my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
132
133$code.=<<___;
134 @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
135 @ ldr $t4,[sp,#`($i+14)%16`*4]
136 mov $t0,$t1,ror#$sigma0[0]
137 add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
138 mov $t2,$t4,ror#$sigma1[0]
139 eor $t0,$t0,$t1,ror#$sigma0[1]
140 eor $t2,$t2,$t4,ror#$sigma1[1]
141 eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
142 ldr $t1,[sp,#`($i+0)%16`*4]
143 eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
144 ldr $t4,[sp,#`($i+9)%16`*4]
145
146 add $t2,$t2,$t0
147 eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
148 add $t1,$t1,$t2
149 eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
150 add $t1,$t1,$t4 @ X[i]
151___
152 &BODY_00_15(@_);
153}
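# Rounds 16..63 first extend the message schedule in place,
#	X[i] = X[i-16] + sigma0(X[i-15]) + X[i-7] + sigma1(X[i-2])
# with X[] addressed modulo 16 on the stack, then fall through to the
# shared round body in BODY_00_15.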
154
155$code=<<___;
156#ifndef __KERNEL__
157# include "arm_arch.h"
158#else
159# define __ARM_ARCH__ __LINUX_ARM_ARCH__
160# define __ARM_MAX_ARCH__ 7
161#endif
162
163.text
164#if __ARM_ARCH__<7
165.code 32
166#else
167.syntax unified
168# ifdef __thumb2__
169.thumb
170# else
171.code 32
172# endif
173#endif
174
175.type K256,%object
176.align 5
177K256:
178.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
179.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
180.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
181.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
182.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
183.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
184.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
185.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
186.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
187.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
188.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
189.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
190.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
191.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
192.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
193.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
194.size K256,.-K256
195.word 0 @ terminator
196#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
197.LOPENSSL_armcap:
198.word OPENSSL_armcap_P-sha256_block_data_order
199#endif
200.align 5
201
202.global sha256_block_data_order
203.type sha256_block_data_order,%function
204sha256_block_data_order:
205#if __ARM_ARCH__<7
206 sub r3,pc,#8 @ sha256_block_data_order
207#else
-208	adr	r3,sha256_block_data_order
+208	adr	r3,.
209#endif
210#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
211 ldr r12,.LOPENSSL_armcap
212 ldr r12,[r3,r12] @ OPENSSL_armcap_P
213 tst r12,#ARMV8_SHA256
214 bne .LARMv8
215 tst r12,#ARMV7_NEON
216 bne .LNEON
217#endif
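@ whichever path was taken above, r3 holds the runtime address of
@ sha256_block_data_order; .LOPENSSL_armcap stores the link-time
@ offset OPENSSL_armcap_P-sha256_block_data_order, so the load via
@ [r3,r12] fetches the capability word itself.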
218 add $len,$inp,$len,lsl#6 @ len to point at the end of inp
219 stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
220 ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
221 sub $Ktbl,r3,#256+32 @ K256
222 sub sp,sp,#16*4 @ alloca(X[16])
223.Loop:
224# if __ARM_ARCH__>=7
225 ldr $t1,[$inp],#4
226# else
227 ldrb $t1,[$inp,#3]
228# endif
229 eor $t3,$B,$C @ magic
230 eor $t2,$t2,$t2
231___
232for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
233$code.=".Lrounds_16_xx:\n";
234for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
235$code.=<<___;
236#if __ARM_ARCH__>=7
237 ite eq @ Thumb2 thing, sanity check in ARM
238#endif
239 ldreq $t3,[sp,#16*4] @ pull ctx
240 bne .Lrounds_16_xx
241
242 add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
243 ldr $t0,[$t3,#0]
244 ldr $t1,[$t3,#4]
245 ldr $t2,[$t3,#8]
246 add $A,$A,$t0
247 ldr $t0,[$t3,#12]
248 add $B,$B,$t1
249 ldr $t1,[$t3,#16]
250 add $C,$C,$t2
251 ldr $t2,[$t3,#20]
252 add $D,$D,$t0
253 ldr $t0,[$t3,#24]
254 add $E,$E,$t1
255 ldr $t1,[$t3,#28]
256 add $F,$F,$t2
257 ldr $inp,[sp,#17*4] @ pull inp
258 ldr $t2,[sp,#18*4] @ pull inp+len
259 add $G,$G,$t0
260 add $H,$H,$t1
261 stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
262 cmp $inp,$t2
263 sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
264 bne .Loop
265
266 add sp,sp,#`16+3`*4 @ destroy frame
267#if __ARM_ARCH__>=5
268 ldmia sp!,{r4-r11,pc}
269#else
270 ldmia sp!,{r4-r11,lr}
271 tst lr,#1
272 moveq pc,lr @ be binary compatible with V4, yet
273 bx lr @ interoperable with Thumb ISA:-)
274#endif
275.size sha256_block_data_order,.-sha256_block_data_order
276___
277######################################################################
278# NEON stuff
279#
280{{{
281my @X=map("q$_",(0..3));
282my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
283my $Xfer=$t4;
284my $j=0;
285
286sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
287sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
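# Dlo/Dhi name the low/high 64-bit half of a NEON q register,
# e.g. Dlo("q8") gives "d16" and Dhi("q8") gives "d17".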
288
289sub AUTOLOAD() # thunk [simplified] x86-style perlasm
290{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
291 my $arg = pop;
292 $arg = "#$arg" if ($arg*1 eq $arg);
293 $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
294}
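# AUTOLOAD catches the &vext_8/&vshr_u32-style calls below, mapping
# the underscore back to a dot and prefixing a numeric last argument
# with "#"; e.g. &vshr_u32($T2,$T0,$sigma0[0]) appends
# "vshr.u32 q10,q8,#7" to $code.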
295
296sub Xupdate()
297{ use integer;
298 my $body = shift;
299 my @insns = (&$body,&$body,&$body,&$body);
300 my ($a,$b,$c,$d,$e,$f,$g,$h);
301
302 &vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
303 eval(shift(@insns));
304 eval(shift(@insns));
305 eval(shift(@insns));
306 &vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
307 eval(shift(@insns));
308 eval(shift(@insns));
309 eval(shift(@insns));
310 &vshr_u32 ($T2,$T0,$sigma0[0]);
311 eval(shift(@insns));
312 eval(shift(@insns));
313 &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
314 eval(shift(@insns));
315 eval(shift(@insns));
316 &vshr_u32 ($T1,$T0,$sigma0[2]);
317 eval(shift(@insns));
318 eval(shift(@insns));
319 &vsli_32 ($T2,$T0,32-$sigma0[0]);
320 eval(shift(@insns));
321 eval(shift(@insns));
322 &vshr_u32 ($T3,$T0,$sigma0[1]);
323 eval(shift(@insns));
324 eval(shift(@insns));
325 &veor ($T1,$T1,$T2);
326 eval(shift(@insns));
327 eval(shift(@insns));
328 &vsli_32 ($T3,$T0,32-$sigma0[1]);
329 eval(shift(@insns));
330 eval(shift(@insns));
331 &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
332 eval(shift(@insns));
333 eval(shift(@insns));
334 &veor ($T1,$T1,$T3); # sigma0(X[1..4])
335 eval(shift(@insns));
336 eval(shift(@insns));
337 &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
338 eval(shift(@insns));
339 eval(shift(@insns));
340 &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
341 eval(shift(@insns));
342 eval(shift(@insns));
343 &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
344 eval(shift(@insns));
345 eval(shift(@insns));
346 &veor ($T5,$T5,$T4);
347 eval(shift(@insns));
348 eval(shift(@insns));
349 &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
350 eval(shift(@insns));
351 eval(shift(@insns));
352 &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
353 eval(shift(@insns));
354 eval(shift(@insns));
355 &veor ($T5,$T5,$T4); # sigma1(X[14..15])
356 eval(shift(@insns));
357 eval(shift(@insns));
358 &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
359 eval(shift(@insns));
360 eval(shift(@insns));
361 &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
362 eval(shift(@insns));
363 eval(shift(@insns));
364 &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
365 eval(shift(@insns));
366 eval(shift(@insns));
367 &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
368 eval(shift(@insns));
369 eval(shift(@insns));
370 &veor ($T5,$T5,$T4);
371 eval(shift(@insns));
372 eval(shift(@insns));
373 &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
374 eval(shift(@insns));
375 eval(shift(@insns));
376 &vld1_32 ("{$T0}","[$Ktbl,:128]!");
377 eval(shift(@insns));
378 eval(shift(@insns));
379 &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
380 eval(shift(@insns));
381 eval(shift(@insns));
382 &veor ($T5,$T5,$T4); # sigma1(X[16..17])
383 eval(shift(@insns));
384 eval(shift(@insns));
385 &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
386 eval(shift(@insns));
387 eval(shift(@insns));
388 &vadd_i32 ($T0,$T0,@X[0]);
389 while($#insns>=2) { eval(shift(@insns)); }
390 &vst1_32 ("{$T0}","[$Xfer,:128]!");
391 eval(shift(@insns));
392 eval(shift(@insns));
393
394 push(@X,shift(@X)); # "rotate" X[]
395}
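# One Xupdate call advances the schedule by a whole q register, i.e.
# four X[] words: sigma0 of X[1..4] is done on a full q register,
# sigma1 of the top two words in two d-register passes, and the
# vector work is interleaved with the scalar instructions of four
# rounds pulled from @insns.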
396
397sub Xpreload()
398{ use integer;
399 my $body = shift;
400 my @insns = (&$body,&$body,&$body,&$body);
401 my ($a,$b,$c,$d,$e,$f,$g,$h);
402
403 eval(shift(@insns));
404 eval(shift(@insns));
405 eval(shift(@insns));
406 eval(shift(@insns));
407 &vld1_32 ("{$T0}","[$Ktbl,:128]!");
408 eval(shift(@insns));
409 eval(shift(@insns));
410 eval(shift(@insns));
411 eval(shift(@insns));
412 &vrev32_8 (@X[0],@X[0]);
413 eval(shift(@insns));
414 eval(shift(@insns));
415 eval(shift(@insns));
416 eval(shift(@insns));
417 &vadd_i32 ($T0,$T0,@X[0]);
418 foreach (@insns) { eval; } # remaining instructions
419 &vst1_32 ("{$T0}","[$Xfer,:128]!");
420
421 push(@X,shift(@X)); # "rotate" X[]
422}
423
424sub body_00_15 () {
425 (
426 '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
427 '&add ($h,$h,$t1)', # h+=X[i]+K[i]
428 '&eor ($t1,$f,$g)',
429 '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
430 '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
431 '&and ($t1,$t1,$e)',
432 '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
433 '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
434 '&eor ($t1,$t1,$g)', # Ch(e,f,g)
435 '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
436 '&eor ($t2,$a,$b)', # a^b, b^c in next round
437 '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
438 '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
439 '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
440 '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
441 '&ldr ($t1,"[sp,#64]") if ($j==31)',
442 '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
443 '&add ($d,$d,$h)', # d+=h
444 '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
445 '&eor ($t3,$t3,$b)', # Maj(a,b,c)
446 '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
447 )
448}
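# body_00_15 returns one scalar round as a list of strings, one
# instruction per element, so that Xupdate/Xpreload can eval() them
# a few at a time in between NEON instructions and keep both issue
# pipes busy.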
449
450$code.=<<___;
451#if __ARM_MAX_ARCH__>=7
452.arch armv7-a
453.fpu neon
454
455.global sha256_block_data_order_neon
456.type sha256_block_data_order_neon,%function
457.align 4
458sha256_block_data_order_neon:
459.LNEON:
460 stmdb sp!,{r4-r12,lr}
461
462 sub $H,sp,#16*4+16
463 adr $Ktbl,K256
464 bic $H,$H,#15 @ align for 128-bit stores
465 mov $t2,sp
466 mov sp,$H @ alloca
467 add $len,$inp,$len,lsl#6 @ len to point at the end of inp
468
469 vld1.8 {@X[0]},[$inp]!
470 vld1.8 {@X[1]},[$inp]!
471 vld1.8 {@X[2]},[$inp]!
472 vld1.8 {@X[3]},[$inp]!
473 vld1.32 {$T0},[$Ktbl,:128]!
474 vld1.32 {$T1},[$Ktbl,:128]!
475 vld1.32 {$T2},[$Ktbl,:128]!
476 vld1.32 {$T3},[$Ktbl,:128]!
477 vrev32.8 @X[0],@X[0] @ yes, even on
478 str $ctx,[sp,#64]
479 vrev32.8 @X[1],@X[1] @ big-endian
480 str $inp,[sp,#68]
481 mov $Xfer,sp
482 vrev32.8 @X[2],@X[2]
483 str $len,[sp,#72]
484 vrev32.8 @X[3],@X[3]
485 str $t2,[sp,#76] @ save original sp
486 vadd.i32 $T0,$T0,@X[0]
487 vadd.i32 $T1,$T1,@X[1]
488 vst1.32 {$T0},[$Xfer,:128]!
489 vadd.i32 $T2,$T2,@X[2]
490 vst1.32 {$T1},[$Xfer,:128]!
491 vadd.i32 $T3,$T3,@X[3]
492 vst1.32 {$T2},[$Xfer,:128]!
493 vst1.32 {$T3},[$Xfer,:128]!
494
495 ldmia $ctx,{$A-$H}
496 sub $Xfer,$Xfer,#64
497 ldr $t1,[sp,#0]
498 eor $t2,$t2,$t2
499 eor $t3,$B,$C
500 b .L_00_48
501
502.align 4
503.L_00_48:
504___
505 &Xupdate(\&body_00_15);
506 &Xupdate(\&body_00_15);
507 &Xupdate(\&body_00_15);
508 &Xupdate(\&body_00_15);
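	# Three passes through .L_00_48 perform rounds 0..47 while
	# extending the schedule; the zero word after K256 ends the loop.
	# Rounds 48..63 are handled below by Xpreload, which meanwhile
	# loads and pre-processes the next input block.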
509$code.=<<___;
510 teq $t1,#0 @ check for K256 terminator
511 ldr $t1,[sp,#0]
512 sub $Xfer,$Xfer,#64
513 bne .L_00_48
514
515 ldr $inp,[sp,#68]
516 ldr $t0,[sp,#72]
517 sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
518 teq $inp,$t0
519 it eq
520 subeq $inp,$inp,#64 @ avoid SEGV
521 vld1.8 {@X[0]},[$inp]! @ load next input block
522 vld1.8 {@X[1]},[$inp]!
523 vld1.8 {@X[2]},[$inp]!
524 vld1.8 {@X[3]},[$inp]!
525 it ne
526 strne $inp,[sp,#68]
527 mov $Xfer,sp
528___
529 &Xpreload(\&body_00_15);
530 &Xpreload(\&body_00_15);
531 &Xpreload(\&body_00_15);
532 &Xpreload(\&body_00_15);
533$code.=<<___;
534 ldr $t0,[$t1,#0]
535 add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
536 ldr $t2,[$t1,#4]
537 ldr $t3,[$t1,#8]
538 ldr $t4,[$t1,#12]
539 add $A,$A,$t0 @ accumulate
540 ldr $t0,[$t1,#16]
541 add $B,$B,$t2
542 ldr $t2,[$t1,#20]
543 add $C,$C,$t3
544 ldr $t3,[$t1,#24]
545 add $D,$D,$t4
546 ldr $t4,[$t1,#28]
547 add $E,$E,$t0
548 str $A,[$t1],#4
549 add $F,$F,$t2
550 str $B,[$t1],#4
551 add $G,$G,$t3
552 str $C,[$t1],#4
553 add $H,$H,$t4
554 str $D,[$t1],#4
555 stmia $t1,{$E-$H}
556
557 ittte ne
558 movne $Xfer,sp
559 ldrne $t1,[sp,#0]
560 eorne $t2,$t2,$t2
561 ldreq sp,[sp,#76] @ restore original sp
562 itt ne
563 eorne $t3,$B,$C
564 bne .L_00_48
565
566 ldmia sp!,{r4-r12,pc}
567.size sha256_block_data_order_neon,.-sha256_block_data_order_neon
568#endif
569___
570}}}
571######################################################################
572# ARMv8 stuff
573#
574{{{
575my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
576my @MSG=map("q$_",(8..11));
577my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
578my $Ktbl="r3";
579
580$code.=<<___;
581#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
582
583# ifdef __thumb2__
584# define INST(a,b,c,d) .byte c,d|0xc,a,b
585# else
586# define INST(a,b,c,d) .byte a,b,c,d
587# endif
588
589.type sha256_block_data_order_armv8,%function
590.align 5
591sha256_block_data_order_armv8:
592.LARMv8:
593 vld1.32 {$ABCD,$EFGH},[$ctx]
594# ifdef __thumb2__
595 adr $Ktbl,.LARMv8
596 sub $Ktbl,$Ktbl,#.LARMv8-K256
597# else
598 sub $Ktbl,$Ktbl,#256+32
599# endif
600 add $len,$inp,$len,lsl#6 @ len to point at the end of inp
601
602.Loop_v8:
603 vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
604 vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
605 vld1.32 {$W0},[$Ktbl]!
606 vrev32.8 @MSG[0],@MSG[0]
607 vrev32.8 @MSG[1],@MSG[1]
608 vrev32.8 @MSG[2],@MSG[2]
609 vrev32.8 @MSG[3],@MSG[3]
610 vmov $ABCD_SAVE,$ABCD @ offload
611 vmov $EFGH_SAVE,$EFGH
612 teq $inp,$len
613___
614for($i=0;$i<12;$i++) {
615$code.=<<___;
616 vld1.32 {$W1},[$Ktbl]!
617 vadd.i32 $W0,$W0,@MSG[0]
618 sha256su0 @MSG[0],@MSG[1]
619 vmov $abcd,$ABCD
620 sha256h $ABCD,$EFGH,$W0
621 sha256h2 $EFGH,$abcd,$W0
622 sha256su1 @MSG[0],@MSG[2],@MSG[3]
623___
624 ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
625}
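# The twelve groups above use sha256su0/sha256su1 to extend the
# message schedule; the remaining four groups below need no further
# extension and run only the sha256h/sha256h2 pair.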
626$code.=<<___;
627 vld1.32 {$W1},[$Ktbl]!
628 vadd.i32 $W0,$W0,@MSG[0]
629 vmov $abcd,$ABCD
630 sha256h $ABCD,$EFGH,$W0
631 sha256h2 $EFGH,$abcd,$W0
632
633 vld1.32 {$W0},[$Ktbl]!
634 vadd.i32 $W1,$W1,@MSG[1]
635 vmov $abcd,$ABCD
636 sha256h $ABCD,$EFGH,$W1
637 sha256h2 $EFGH,$abcd,$W1
638
639 vld1.32 {$W1},[$Ktbl]
640 vadd.i32 $W0,$W0,@MSG[2]
641 sub $Ktbl,$Ktbl,#256-16 @ rewind
642 vmov $abcd,$ABCD
643 sha256h $ABCD,$EFGH,$W0
644 sha256h2 $EFGH,$abcd,$W0
645
646 vadd.i32 $W1,$W1,@MSG[3]
647 vmov $abcd,$ABCD
648 sha256h $ABCD,$EFGH,$W1
649 sha256h2 $EFGH,$abcd,$W1
650
651 vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
652 vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
653 it ne
654 bne .Loop_v8
655
656 vst1.32 {$ABCD,$EFGH},[$ctx]
657
658 ret @ bx lr
659.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
660#endif
661___
662}}}
663$code.=<<___;
664.asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
665.align 2
666#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
667.comm OPENSSL_armcap_P,4,4
668#endif
669___
670
671open SELF,$0;
672while(<SELF>) {
673 next if (/^#!/);
674 last if (!s/^#/@/ and !/^$/);
675 print;
676}
677close SELF;
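# The self-reading loop above copies this script's leading comment
# banner (license and performance notes) into the generated file,
# re-commented with the assembler's "@" in place of Perl's "#".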
678
679{ my %opcode = (
680 "sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40,
681 "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 );
682
683 sub unsha256 {
684 my ($mnemonic,$arg)=@_;
685
686 if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
687 my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
688 |(($2&7)<<17)|(($2&8)<<4)
689 |(($3&7)<<1) |(($3&8)<<2);
690 # ARMv7 instructions are always encoded little-endian, hence the
691 # byte-by-byte INST() emission; the correct solution is the .inst
692 # directive, but older assemblers don't implement it:-(
693 sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
694 $word&0xff,($word>>8)&0xff,
695 ($word>>16)&0xff,($word>>24)&0xff,
696 $mnemonic,$arg;
697 }
698 }
699}
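# unsha256() hand-assembles the SHA-256 mnemonics for assemblers that
# predate them, merging the d, n and m register numbers into the base
# opcode and emitting the word as four little-endian bytes via INST().
# For example, "sha256h q0,q1,q12" gives
#	0xf3000c40|(1<<17)|(4<<1)|(8<<2) = 0xf3020c68
# i.e. INST(0x68,0x0c,0x02,0xf3).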
700
701foreach (split($/,$code)) {
702
703 s/\`([^\`]*)\`/eval $1/geo;
704
705 s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
706
707 s/\bret\b/bx lr/go or
708 s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
709
710 print $_,"\n";
711}
712
713close STDOUT; # enforce flush