1238384Sjkim#!/usr/bin/env perl
2238384Sjkim
3238384Sjkim# ====================================================================
4290207Sjkim# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5238384Sjkim# project. The module is, however, dual licensed under OpenSSL and
6238384Sjkim# CRYPTOGAMS licenses depending on where you obtain it. For further
7238384Sjkim# details see http://www.openssl.org/~appro/cryptogams/.
8290207Sjkim#
9290207Sjkim# Permission to use under GPL terms is granted.
10238384Sjkim# ====================================================================
11238384Sjkim
12238384Sjkim# SHA256 block procedure for ARMv4. May 2007.
13238384Sjkim
14238384Sjkim# Performance is ~2x better than gcc 3.4 generated code and in "abso-
15238384Sjkim# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
16238384Sjkim# byte [on single-issue Xscale PXA250 core].
17238384Sjkim
18238384Sjkim# July 2010.
19238384Sjkim#
20238384Sjkim# Rescheduling for dual-issue pipeline resulted in 22% improvement on
21238384Sjkim# Cortex A8 core and ~20 cycles per processed byte.
22238384Sjkim
23238384Sjkim# February 2011.
24238384Sjkim#
25238384Sjkim# Profiler-assisted and platform-specific optimization resulted in 16%
26290207Sjkim# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
27238384Sjkim
28290207Sjkim# September 2013.
29290207Sjkim#
30290207Sjkim# Add NEON implementation. On Cortex A8 it was measured to process one
31290207Sjkim# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
32290207Sjkim# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
33290207Sjkim# code (meaning that latter performs sub-optimally, nothing was done
34290207Sjkim# about it).
35290207Sjkim
36290207Sjkim# May 2014.
37290207Sjkim#
38290207Sjkim# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
39290207Sjkim
# Locate the output file among the command-line arguments: arguments are
# consumed until one looks like a plain file name with an extension
# (e.g. "sha256-armv4.S"); anything in front of it is ignored.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
# Redirect STDOUT only when a file name was actually found, so the generated
# assembly still reaches the caller's STDOUT otherwise.  A failed
# open(STDOUT,...) leaves STDOUT closed, which would silently discard all
# output, so fail loudly instead.  The three-argument form keeps characters
# in the file name from being interpreted as an open mode.
if ($output) {
	open(STDOUT,'>',$output) or die "can't open $output: $!";
}
42238384Sjkim
# Register allocation for the integer-only code path.  The three function
# arguments live in r0-r2; the temporaries $t0, $t4, $t1 and $t3 reuse
# r0-r3, so an argument's register is recycled once the generated code has
# saved or consumed it.
($ctx,$inp,$len,$T1) = ("r0","r1","r2","r3");
($t0,$t4,$t1,$t3)    = ("r0","r1","r2","r3");

# The eight SHA-256 working variables a..h occupy r4..r11; @V is the list
# that the round generators rotate between rounds.
($A,$B,$C,$D,$E,$F,$G,$H) = map { "r$_" } (4..11);
@V = ($A,$B,$C,$D,$E,$F,$G,$H);

$t2   = "r12";
$Ktbl = "r14";		# pointer into the K256 constant table

# Rotate/shift amounts of the SHA-256 Sigma0/Sigma1 (round) and
# sigma0/sigma1 (message schedule) functions; the last entry of each
# lower-case list is used as a logical shift by the templates.
@Sigma0 = ( 2,13,22);
@Sigma1 = ( 6,11,25);
@sigma0 = ( 7,18, 3);
@sigma1 = (17,19,10);
63238384Sjkim
# Append the assembly for one SHA-256 round of the integer-only code path
# to $code.
#   $i      round index (0..31 as generated by the callers in this file)
#   $a..$h  names of the registers holding the working variables a..h for
#           this round; the caller rotates @V between rounds
# Maj(a,b,c) is not added to h inside the round that computes it: it is left
# in $t3 and added at the start of the next round ("from the past"), which
# is why $t2 and $t3 trade names at the end of every call.
64238384Sjkimsub BODY_00_15 {
65238384Sjkimmy ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
66238384Sjkim
# Rounds 0..15 only: obtain the big-endian input word in $t1 (a byte-swapped
# word load on ARMv7+, four byte loads otherwise) and start Sigma1(e).  In
# round 15 the advanced input pointer is saved on the stack, freeing its
# register for use as $t4.
67238384Sjkim$code.=<<___ if ($i<16);
68238384Sjkim#if __ARM_ARCH__>=7
69290207Sjkim	@ ldr	$t1,[$inp],#4			@ $i
70290207Sjkim# if $i==15
71290207Sjkim	str	$inp,[sp,#17*4]			@ make room for $t4
72290207Sjkim# endif
73290207Sjkim	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
74290207Sjkim	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
75290207Sjkim	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
76290207Sjkim	rev	$t1,$t1
77238384Sjkim#else
78290207Sjkim	@ ldrb	$t1,[$inp,#3]			@ $i
79290207Sjkim	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
80238384Sjkim	ldrb	$t2,[$inp,#2]
81290207Sjkim	ldrb	$t0,[$inp,#1]
82290207Sjkim	orr	$t1,$t1,$t2,lsl#8
83290207Sjkim	ldrb	$t2,[$inp],#4
84290207Sjkim	orr	$t1,$t1,$t0,lsl#16
85290207Sjkim# if $i==15
86290207Sjkim	str	$inp,[sp,#17*4]			@ make room for $t4
87290207Sjkim# endif
88290207Sjkim	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
89290207Sjkim	orr	$t1,$t1,$t2,lsl#24
90290207Sjkim	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
91238384Sjkim#endif
92238384Sjkim___
# Part shared by all rounds: X[i] is expected in $t1 and is stored into the
# 16-word buffer on the stack.  The "#if $i==31" test compares the low byte
# of the K256 word just loaded with 0xf2 (the low byte of the table's last
# entry) to detect the final round.  For $i<15 the next input word is
# prefetched; otherwise the two X[] words needed by the following
# BODY_16_XX call are loaded.
93238384Sjkim$code.=<<___;
94238384Sjkim	ldr	$t2,[$Ktbl],#4			@ *K256++
95290207Sjkim	add	$h,$h,$t1			@ h+=X[i]
96290207Sjkim	str	$t1,[sp,#`$i%16`*4]
97238384Sjkim	eor	$t1,$f,$g
98290207Sjkim	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
99238384Sjkim	and	$t1,$t1,$e
100290207Sjkim	add	$h,$h,$t2			@ h+=K256[i]
101238384Sjkim	eor	$t1,$t1,$g			@ Ch(e,f,g)
102290207Sjkim	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
103290207Sjkim	add	$h,$h,$t1			@ h+=Ch(e,f,g)
104290207Sjkim#if $i==31
105290207Sjkim	and	$t2,$t2,#0xff
106290207Sjkim	cmp	$t2,#0xf2			@ done?
107238384Sjkim#endif
108290207Sjkim#if $i<15
109290207Sjkim# if __ARM_ARCH__>=7
110290207Sjkim	ldr	$t1,[$inp],#4			@ prefetch
111290207Sjkim# else
112290207Sjkim	ldrb	$t1,[$inp,#3]
113290207Sjkim# endif
114290207Sjkim	eor	$t2,$a,$b			@ a^b, b^c in next round
115290207Sjkim#else
116290207Sjkim	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
117290207Sjkim	eor	$t2,$a,$b			@ a^b, b^c in next round
118290207Sjkim	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
119290207Sjkim#endif
120290207Sjkim	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
121290207Sjkim	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
122290207Sjkim	add	$d,$d,$h			@ d+=h
123290207Sjkim	eor	$t3,$t3,$b			@ Maj(a,b,c)
124290207Sjkim	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
125290207Sjkim	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
126238384Sjkim___
# Swap the register names (generation-time only, no instruction is emitted):
# the next round finds this round's Maj(a,b,c) under the name $t2 and this
# round's a^b (its own b^c) under the name $t3.
127290207Sjkim	($t2,$t3)=($t3,$t2);
128238384Sjkim}
129238384Sjkim
# Append the assembly for one round with $i>=16: first the message-schedule
# step, then the common round body via BODY_00_15 (called with the same
# arguments).  The schedule step computes
#   X[i] + sigma0(X[i+1]) + sigma1(X[i+14]) + X[i+9]   (indices modulo 16)
# over the 16-word buffer on the stack and leaves the result in $t1.
# X[i+1] and X[i+14] are expected to be in $t1 and $t4 already, loaded by
# the previous round (see the commented-out loads at the top of the
# template).
130238384Sjkimsub BODY_16_XX {
131238384Sjkimmy ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
132238384Sjkim
133238384Sjkim$code.=<<___;
134290207Sjkim	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
135290207Sjkim	@ ldr	$t4,[sp,#`($i+14)%16`*4]
136290207Sjkim	mov	$t0,$t1,ror#$sigma0[0]
137290207Sjkim	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
138290207Sjkim	mov	$t2,$t4,ror#$sigma1[0]
139290207Sjkim	eor	$t0,$t0,$t1,ror#$sigma0[1]
140290207Sjkim	eor	$t2,$t2,$t4,ror#$sigma1[1]
141290207Sjkim	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
142290207Sjkim	ldr	$t1,[sp,#`($i+0)%16`*4]
143290207Sjkim	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
144290207Sjkim	ldr	$t4,[sp,#`($i+9)%16`*4]
145290207Sjkim
146290207Sjkim	add	$t2,$t2,$t0
147290207Sjkim	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
148290207Sjkim	add	$t1,$t1,$t2
149290207Sjkim	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
150290207Sjkim	add	$t1,$t1,$t4			@ X[i]
151238384Sjkim___
# $i is >=16 here, so BODY_00_15 skips its input-loading template and emits
# only the part shared by all rounds.
152238384Sjkim	&BODY_00_15(@_);
153238384Sjkim}
154238384Sjkim
155238384Sjkim$code=<<___;
156290207Sjkim#ifndef __KERNEL__
157290207Sjkim# include "arm_arch.h"
158290207Sjkim#else
159290207Sjkim# define __ARM_ARCH__ __LINUX_ARM_ARCH__
160290207Sjkim# define __ARM_MAX_ARCH__ 7
161290207Sjkim#endif
162238384Sjkim
163238384Sjkim.text
164290207Sjkim#if __ARM_ARCH__<7
165238384Sjkim.code	32
166290207Sjkim#else
167290207Sjkim.syntax unified
168290207Sjkim# ifdef __thumb2__
169290207Sjkim.thumb
170290207Sjkim# else
171290207Sjkim.code   32
172290207Sjkim# endif
173290207Sjkim#endif
174238384Sjkim
175238384Sjkim.type	K256,%object
176238384Sjkim.align	5
177238384SjkimK256:
178238384Sjkim.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
179238384Sjkim.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
180238384Sjkim.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
181238384Sjkim.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
182238384Sjkim.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
183238384Sjkim.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
184238384Sjkim.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
185238384Sjkim.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
186238384Sjkim.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
187238384Sjkim.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
188238384Sjkim.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
189238384Sjkim.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
190238384Sjkim.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
191238384Sjkim.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
192238384Sjkim.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
193238384Sjkim.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
194238384Sjkim.size	K256,.-K256
195290207Sjkim.word	0				@ terminator
196290207Sjkim#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
197290207Sjkim.LOPENSSL_armcap:
198290207Sjkim.word	OPENSSL_armcap_P-sha256_block_data_order
199290207Sjkim#endif
200290207Sjkim.align	5
201238384Sjkim
202238384Sjkim.global	sha256_block_data_order
203238384Sjkim.type	sha256_block_data_order,%function
204238384Sjkimsha256_block_data_order:
205290207Sjkim#if __ARM_ARCH__<7
206238384Sjkim	sub	r3,pc,#8		@ sha256_block_data_order
207290207Sjkim#else
208326663Sjkim	adr	r3,.
209290207Sjkim#endif
210290207Sjkim#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
211290207Sjkim	ldr	r12,.LOPENSSL_armcap
212290207Sjkim	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
213290207Sjkim	tst	r12,#ARMV8_SHA256
214290207Sjkim	bne	.LARMv8
215290207Sjkim	tst	r12,#ARMV7_NEON
216290207Sjkim	bne	.LNEON
217290207Sjkim#endif
218238384Sjkim	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
219238384Sjkim	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
220238384Sjkim	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
221290207Sjkim	sub	$Ktbl,r3,#256+32	@ K256
222238384Sjkim	sub	sp,sp,#16*4		@ alloca(X[16])
223238384Sjkim.Loop:
224290207Sjkim# if __ARM_ARCH__>=7
225290207Sjkim	ldr	$t1,[$inp],#4
226290207Sjkim# else
227290207Sjkim	ldrb	$t1,[$inp,#3]
228290207Sjkim# endif
229290207Sjkim	eor	$t3,$B,$C		@ magic
230290207Sjkim	eor	$t2,$t2,$t2
231238384Sjkim___
# Unroll rounds 0..15, rotating the register-name list after every round so
# that each round sees the working variables under their new names.
$i=0;
while ($i<16) {
	&BODY_00_15($i,@V);
	unshift(@V,pop(@V));
	$i++;
}
# Rounds 16..31 are unrolled once behind a label; the code emitted after
# this block branches back to the label until the round body's "done?"
# comparison succeeds.
$code.=".Lrounds_16_xx:\n";
while ($i<32) {
	&BODY_16_XX($i,@V);
	unshift(@V,pop(@V));
	$i++;
}
235238384Sjkim$code.=<<___;
236290207Sjkim#if __ARM_ARCH__>=7
237290207Sjkim	ite	eq			@ Thumb2 thing, sanity check in ARM
238290207Sjkim#endif
239290207Sjkim	ldreq	$t3,[sp,#16*4]		@ pull ctx
240238384Sjkim	bne	.Lrounds_16_xx
241238384Sjkim
242290207Sjkim	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
243290207Sjkim	ldr	$t0,[$t3,#0]
244290207Sjkim	ldr	$t1,[$t3,#4]
245290207Sjkim	ldr	$t2,[$t3,#8]
246238384Sjkim	add	$A,$A,$t0
247290207Sjkim	ldr	$t0,[$t3,#12]
248238384Sjkim	add	$B,$B,$t1
249290207Sjkim	ldr	$t1,[$t3,#16]
250238384Sjkim	add	$C,$C,$t2
251290207Sjkim	ldr	$t2,[$t3,#20]
252238384Sjkim	add	$D,$D,$t0
253290207Sjkim	ldr	$t0,[$t3,#24]
254238384Sjkim	add	$E,$E,$t1
255290207Sjkim	ldr	$t1,[$t3,#28]
256238384Sjkim	add	$F,$F,$t2
257238384Sjkim	ldr	$inp,[sp,#17*4]		@ pull inp
258238384Sjkim	ldr	$t2,[sp,#18*4]		@ pull inp+len
259238384Sjkim	add	$G,$G,$t0
260238384Sjkim	add	$H,$H,$t1
261290207Sjkim	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
262238384Sjkim	cmp	$inp,$t2
263238384Sjkim	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
264238384Sjkim	bne	.Loop
265238384Sjkim
266238384Sjkim	add	sp,sp,#`16+3`*4	@ destroy frame
267238384Sjkim#if __ARM_ARCH__>=5
268238384Sjkim	ldmia	sp!,{r4-r11,pc}
269238384Sjkim#else
270238384Sjkim	ldmia	sp!,{r4-r11,lr}
271238384Sjkim	tst	lr,#1
272238384Sjkim	moveq	pc,lr			@ be binary compatible with V4, yet
273238384Sjkim	bx	lr			@ interoperable with Thumb ISA:-)
274238384Sjkim#endif
275290207Sjkim.size	sha256_block_data_order,.-sha256_block_data_order
276290207Sjkim___
277290207Sjkim######################################################################
278290207Sjkim# NEON stuff
279290207Sjkim#
280290207Sjkim{{{
# Lexical names for the NEON code path.  Note that $T1 declared here hides
# the global $T1 for the rest of the enclosing block.
my @X    = qw(q0 q1 q2 q3);			# message words, four per register
my ($T0,$T1,$T2,$T3) = map { "q$_" } (8..11);	# quad-register temporaries
my ($T4,$T5) = ("d24","d25");			# double-register temporaries
my $Xfer = $t4;					# pointer used for the X[]+K[] stores
my $j    = 0;					# round counter advanced by body_00_15
285290207Sjkim
# Map a NEON quad register name "qN" to the name of its low double-word
# half "d(2N)".  Returns "" when the argument contains no q-register name.
sub Dlo()
{
    my $reg = shift;
    return "" unless ($reg =~ m|q([1]?[0-9])|);
    return "d" . ($1 * 2);
}
# Map a NEON quad register name "qN" to the name of its high double-word
# half "d(2N+1)".  Returns "" when the argument contains no q-register name.
sub Dhi()
{
    my $reg = shift;
    return "" unless ($reg =~ m|q([1]?[0-9])|);
    return "d" . ($1 * 2 + 1);
}
288290207Sjkim
# Thunk [simplified] for x86-style perlasm: any call to an undefined
# function, e.g. &vadd_i32("q0","q0","q9"), appends one instruction line to
# $code.  The function name is the mnemonic with its first "_" turned into
# "." (vadd_i32 -> vadd.i32), and the arguments are the comma-joined
# operands.  A last operand that is a plain number gets an immediate
# prefix "#".
sub AUTOLOAD()
{
    (my $mnemonic = $AUTOLOAD) =~ s/.*:://;	# strip the package qualifier
    $mnemonic =~ s/_/\./;

    my $last = pop;
    if ($last*1 eq $last) {			# numeric => immediate operand
	$last = "#$last";
    }

    my @operands = (@_, $last);
    $code .= "\t" . $mnemonic . "\t" . join(',', @operands) . "\n";
}
295290207Sjkim
# Emit the NEON message-schedule update for four message words, interleaved
# with the scalar instructions of four rounds.
#   $body  code reference returning the list of Perl snippets (strings)
#          that make up one scalar round; it is called four times and the
#          snippets are eval'ed a few at a time between the NEON
#          instructions, which are emitted through the AUTOLOAD thunk.
# Per the inline comments, the NEON side computes
#   X[0..3] += X[9..12] + sigma0(X[1..4]) + sigma1(X[14..17])
# with sigma1 done in two halves, because the second half takes the freshly
# updated low half of X[0] as its input.  The next four K256 words are then
# loaded, added to the new X[0..3] and stored through $Xfer.  Finally @X is
# rotated so that the next call works on the following four words.
296290207Sjkimsub Xupdate()
297290207Sjkim{ use integer;
298290207Sjkim  my $body = shift;
299290207Sjkim  my @insns = (&$body,&$body,&$body,&$body);
# lexicals that the eval'ed snippets assign and use
300290207Sjkim  my ($a,$b,$c,$d,$e,$f,$g,$h);
301290207Sjkim
302290207Sjkim	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
303290207Sjkim	 eval(shift(@insns));
304290207Sjkim	 eval(shift(@insns));
305290207Sjkim	 eval(shift(@insns));
306290207Sjkim	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
307290207Sjkim	 eval(shift(@insns));
308290207Sjkim	 eval(shift(@insns));
309290207Sjkim	 eval(shift(@insns));
310290207Sjkim	&vshr_u32	($T2,$T0,$sigma0[0]);
311290207Sjkim	 eval(shift(@insns));
312290207Sjkim	 eval(shift(@insns));
313290207Sjkim	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
314290207Sjkim	 eval(shift(@insns));
315290207Sjkim	 eval(shift(@insns));
316290207Sjkim	&vshr_u32	($T1,$T0,$sigma0[2]);
317290207Sjkim	 eval(shift(@insns));
318290207Sjkim	 eval(shift(@insns));
319290207Sjkim	&vsli_32	($T2,$T0,32-$sigma0[0]);
320290207Sjkim	 eval(shift(@insns));
321290207Sjkim	 eval(shift(@insns));
322290207Sjkim	&vshr_u32	($T3,$T0,$sigma0[1]);
323290207Sjkim	 eval(shift(@insns));
324290207Sjkim	 eval(shift(@insns));
325290207Sjkim	&veor		($T1,$T1,$T2);
326290207Sjkim	 eval(shift(@insns));
327290207Sjkim	 eval(shift(@insns));
328290207Sjkim	&vsli_32	($T3,$T0,32-$sigma0[1]);
329290207Sjkim	 eval(shift(@insns));
330290207Sjkim	 eval(shift(@insns));
331290207Sjkim	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
332290207Sjkim	 eval(shift(@insns));
333290207Sjkim	 eval(shift(@insns));
334290207Sjkim	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
335290207Sjkim	 eval(shift(@insns));
336290207Sjkim	 eval(shift(@insns));
337290207Sjkim	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
338290207Sjkim	 eval(shift(@insns));
339290207Sjkim	 eval(shift(@insns));
340290207Sjkim	  &vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
341290207Sjkim	 eval(shift(@insns));
342290207Sjkim	 eval(shift(@insns));
343290207Sjkim	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
344290207Sjkim	 eval(shift(@insns));
345290207Sjkim	 eval(shift(@insns));
346290207Sjkim	  &veor		($T5,$T5,$T4);
347290207Sjkim	 eval(shift(@insns));
348290207Sjkim	 eval(shift(@insns));
349290207Sjkim	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
350290207Sjkim	 eval(shift(@insns));
351290207Sjkim	 eval(shift(@insns));
352290207Sjkim	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
353290207Sjkim	 eval(shift(@insns));
354290207Sjkim	 eval(shift(@insns));
355290207Sjkim	  &veor		($T5,$T5,$T4);		# sigma1(X[14..15])
356290207Sjkim	 eval(shift(@insns));
357290207Sjkim	 eval(shift(@insns));
358290207Sjkim	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
359290207Sjkim	 eval(shift(@insns));
360290207Sjkim	 eval(shift(@insns));
361290207Sjkim	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
362290207Sjkim	 eval(shift(@insns));
363290207Sjkim	 eval(shift(@insns));
364290207Sjkim	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
365290207Sjkim	 eval(shift(@insns));
366290207Sjkim	 eval(shift(@insns));
367290207Sjkim	  &vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
368290207Sjkim	 eval(shift(@insns));
369290207Sjkim	 eval(shift(@insns));
370290207Sjkim	  &veor		($T5,$T5,$T4);
371290207Sjkim	 eval(shift(@insns));
372290207Sjkim	 eval(shift(@insns));
373290207Sjkim	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
374290207Sjkim	 eval(shift(@insns));
375290207Sjkim	 eval(shift(@insns));
# next four K256 words, post-incrementing the table pointer
376290207Sjkim	&vld1_32	("{$T0}","[$Ktbl,:128]!");
377290207Sjkim	 eval(shift(@insns));
378290207Sjkim	 eval(shift(@insns));
379290207Sjkim	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
380290207Sjkim	 eval(shift(@insns));
381290207Sjkim	 eval(shift(@insns));
382290207Sjkim	  &veor		($T5,$T5,$T4);		# sigma1(X[16..17])
383290207Sjkim	 eval(shift(@insns));
384290207Sjkim	 eval(shift(@insns));
385290207Sjkim	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
386290207Sjkim	 eval(shift(@insns));
387290207Sjkim	 eval(shift(@insns));
# K256 words + updated X[0..3]
388290207Sjkim	&vadd_i32	($T0,$T0,@X[0]);
# flush all but the last two scalar snippets before the store
389290207Sjkim	 while($#insns>=2) { eval(shift(@insns)); }
390290207Sjkim	&vst1_32	("{$T0}","[$Xfer,:128]!");
391290207Sjkim	 eval(shift(@insns));
392290207Sjkim	 eval(shift(@insns));
393290207Sjkim
394290207Sjkim	push(@X,shift(@X));		# "rotate" X[]
395290207Sjkim}
396290207Sjkim
# Counterpart of Xupdate without a message-schedule step: emits the scalar
# instructions of four rounds (snippets obtained by calling $body four
# times) interleaved with NEON code that loads four K256 words,
# byte-reverses each 32-bit word of @X[0], adds the two and stores the sum
# through $Xfer.  @X is rotated afterwards so the next call handles the
# following four words.
#   $body  code reference returning the Perl snippets for one scalar round
397290207Sjkimsub Xpreload()
398290207Sjkim{ use integer;
399290207Sjkim  my $body = shift;
400290207Sjkim  my @insns = (&$body,&$body,&$body,&$body);
# lexicals that the eval'ed snippets assign and use
401290207Sjkim  my ($a,$b,$c,$d,$e,$f,$g,$h);
402290207Sjkim
403290207Sjkim	 eval(shift(@insns));
404290207Sjkim	 eval(shift(@insns));
405290207Sjkim	 eval(shift(@insns));
406290207Sjkim	 eval(shift(@insns));
407290207Sjkim	&vld1_32	("{$T0}","[$Ktbl,:128]!");
408290207Sjkim	 eval(shift(@insns));
409290207Sjkim	 eval(shift(@insns));
410290207Sjkim	 eval(shift(@insns));
411290207Sjkim	 eval(shift(@insns));
412290207Sjkim	&vrev32_8	(@X[0],@X[0]);
413290207Sjkim	 eval(shift(@insns));
414290207Sjkim	 eval(shift(@insns));
415290207Sjkim	 eval(shift(@insns));
416290207Sjkim	 eval(shift(@insns));
417290207Sjkim	&vadd_i32	($T0,$T0,@X[0]);
418290207Sjkim	 foreach (@insns) { eval; }	# remaining instructions
419290207Sjkim	&vst1_32	("{$T0}","[$Xfer,:128]!");
420290207Sjkim
421290207Sjkim	push(@X,shift(@X));		# "rotate" X[]
422290207Sjkim}
423290207Sjkim
# Return one scalar SHA-256 round for the NEON code path as a list of Perl
# snippets (strings).  Xupdate/Xpreload eval the snippets one list element
# at a time; strings joined with "." form a single element and are
# therefore always emitted together.  The instruction calls inside the
# snippets go through the AUTOLOAD thunk.
# The first snippet binds $a..$h to the current order of @V; the last one
# advances the round counter $j, rotates @V and swaps the names $t2/$t3
# (Maj is added one round late, see "from the past").
# The load into $t1 depends on $j: normally the X[]+K[] value of the next
# round from the stack; after round 15 the word at $Ktbl; after round 31
# the word at [sp,#64].
424290207Sjkimsub body_00_15 () {
425290207Sjkim	(
426290207Sjkim	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
427290207Sjkim	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
428290207Sjkim	'&eor	($t1,$f,$g)',
429290207Sjkim	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
430290207Sjkim	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
431290207Sjkim	'&and	($t1,$t1,$e)',
432290207Sjkim	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
433290207Sjkim	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
434290207Sjkim	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
435290207Sjkim	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
436290207Sjkim	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
437290207Sjkim	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
438290207Sjkim	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
439290207Sjkim	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
440290207Sjkim	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
441290207Sjkim	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
442290207Sjkim	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
443290207Sjkim	'&add	($d,$d,$h)',			# d+=h
444290207Sjkim	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
445290207Sjkim	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
446290207Sjkim	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
447290207Sjkim	)
448290207Sjkim}
449290207Sjkim
450290207Sjkim$code.=<<___;
451290207Sjkim#if __ARM_MAX_ARCH__>=7
452290207Sjkim.arch	armv7-a
453290207Sjkim.fpu	neon
454290207Sjkim
455290207Sjkim.global	sha256_block_data_order_neon
456290207Sjkim.type	sha256_block_data_order_neon,%function
457290207Sjkim.align	4
458290207Sjkimsha256_block_data_order_neon:
459290207Sjkim.LNEON:
460290207Sjkim	stmdb	sp!,{r4-r12,lr}
461290207Sjkim
462290207Sjkim	sub	$H,sp,#16*4+16
463290207Sjkim	adr	$Ktbl,K256
464290207Sjkim	bic	$H,$H,#15		@ align for 128-bit stores
465290207Sjkim	mov	$t2,sp
466290207Sjkim	mov	sp,$H			@ alloca
467290207Sjkim	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
468290207Sjkim
469290207Sjkim	vld1.8		{@X[0]},[$inp]!
470290207Sjkim	vld1.8		{@X[1]},[$inp]!
471290207Sjkim	vld1.8		{@X[2]},[$inp]!
472290207Sjkim	vld1.8		{@X[3]},[$inp]!
473290207Sjkim	vld1.32		{$T0},[$Ktbl,:128]!
474290207Sjkim	vld1.32		{$T1},[$Ktbl,:128]!
475290207Sjkim	vld1.32		{$T2},[$Ktbl,:128]!
476290207Sjkim	vld1.32		{$T3},[$Ktbl,:128]!
477290207Sjkim	vrev32.8	@X[0],@X[0]		@ yes, even on
478290207Sjkim	str		$ctx,[sp,#64]
479290207Sjkim	vrev32.8	@X[1],@X[1]		@ big-endian
480290207Sjkim	str		$inp,[sp,#68]
481290207Sjkim	mov		$Xfer,sp
482290207Sjkim	vrev32.8	@X[2],@X[2]
483290207Sjkim	str		$len,[sp,#72]
484290207Sjkim	vrev32.8	@X[3],@X[3]
485290207Sjkim	str		$t2,[sp,#76]		@ save original sp
486290207Sjkim	vadd.i32	$T0,$T0,@X[0]
487290207Sjkim	vadd.i32	$T1,$T1,@X[1]
488290207Sjkim	vst1.32		{$T0},[$Xfer,:128]!
489290207Sjkim	vadd.i32	$T2,$T2,@X[2]
490290207Sjkim	vst1.32		{$T1},[$Xfer,:128]!
491290207Sjkim	vadd.i32	$T3,$T3,@X[3]
492290207Sjkim	vst1.32		{$T2},[$Xfer,:128]!
493290207Sjkim	vst1.32		{$T3},[$Xfer,:128]!
494290207Sjkim
495290207Sjkim	ldmia		$ctx,{$A-$H}
496290207Sjkim	sub		$Xfer,$Xfer,#64
497290207Sjkim	ldr		$t1,[sp,#0]
498290207Sjkim	eor		$t2,$t2,$t2
499290207Sjkim	eor		$t3,$B,$C
500290207Sjkim	b		.L_00_48
501290207Sjkim
502290207Sjkim.align	4
503290207Sjkim.L_00_48:
504290207Sjkim___
# Body of the .L_00_48 loop: four Xupdate passes, each emitting four scalar
# rounds plus the schedule update for four message words (16 rounds in
# total per loop iteration).
foreach my $pass (1..4) {
	&Xupdate(\&body_00_15);
}
509290207Sjkim$code.=<<___;
510290207Sjkim	teq	$t1,#0				@ check for K256 terminator
511290207Sjkim	ldr	$t1,[sp,#0]
512290207Sjkim	sub	$Xfer,$Xfer,#64
513290207Sjkim	bne	.L_00_48
514290207Sjkim
515290207Sjkim	ldr		$inp,[sp,#68]
516290207Sjkim	ldr		$t0,[sp,#72]
517290207Sjkim	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
518290207Sjkim	teq		$inp,$t0
519290207Sjkim	it		eq
520290207Sjkim	subeq		$inp,$inp,#64		@ avoid SEGV
521290207Sjkim	vld1.8		{@X[0]},[$inp]!		@ load next input block
522290207Sjkim	vld1.8		{@X[1]},[$inp]!
523290207Sjkim	vld1.8		{@X[2]},[$inp]!
524290207Sjkim	vld1.8		{@X[3]},[$inp]!
525290207Sjkim	it		ne
526290207Sjkim	strne		$inp,[sp,#68]
527290207Sjkim	mov		$Xfer,sp
528290207Sjkim___
# Sixteen rounds without a schedule update: four Xpreload passes, each
# emitting four scalar rounds while preparing four words of the block that
# was just loaded into @X.
foreach my $pass (1..4) {
	&Xpreload(\&body_00_15);
}
533290207Sjkim$code.=<<___;
534290207Sjkim	ldr	$t0,[$t1,#0]
535290207Sjkim	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
536290207Sjkim	ldr	$t2,[$t1,#4]
537290207Sjkim	ldr	$t3,[$t1,#8]
538290207Sjkim	ldr	$t4,[$t1,#12]
539290207Sjkim	add	$A,$A,$t0			@ accumulate
540290207Sjkim	ldr	$t0,[$t1,#16]
541290207Sjkim	add	$B,$B,$t2
542290207Sjkim	ldr	$t2,[$t1,#20]
543290207Sjkim	add	$C,$C,$t3
544290207Sjkim	ldr	$t3,[$t1,#24]
545290207Sjkim	add	$D,$D,$t4
546290207Sjkim	ldr	$t4,[$t1,#28]
547290207Sjkim	add	$E,$E,$t0
548290207Sjkim	str	$A,[$t1],#4
549290207Sjkim	add	$F,$F,$t2
550290207Sjkim	str	$B,[$t1],#4
551290207Sjkim	add	$G,$G,$t3
552290207Sjkim	str	$C,[$t1],#4
553290207Sjkim	add	$H,$H,$t4
554290207Sjkim	str	$D,[$t1],#4
555290207Sjkim	stmia	$t1,{$E-$H}
556290207Sjkim
557290207Sjkim	ittte	ne
558290207Sjkim	movne	$Xfer,sp
559290207Sjkim	ldrne	$t1,[sp,#0]
560290207Sjkim	eorne	$t2,$t2,$t2
561290207Sjkim	ldreq	sp,[sp,#76]			@ restore original sp
562290207Sjkim	itt	ne
563290207Sjkim	eorne	$t3,$B,$C
564290207Sjkim	bne	.L_00_48
565290207Sjkim
566290207Sjkim	ldmia	sp!,{r4-r12,pc}
567290207Sjkim.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
568290207Sjkim#endif
569290207Sjkim___
570290207Sjkim}}}
571290207Sjkim######################################################################
572290207Sjkim# ARMv8 stuff
573290207Sjkim#
574290207Sjkim{{{
# Lexical register names for the ARMv8 (SHA-256 instruction) code path.
my ($ABCD,$EFGH,$abcd) = ("q0","q1","q2");	# hash state halves + scratch copy
my @MSG = ("q8","q9","q10","q11");		# the 16 message words
my ($W0,$W1) = ("q12","q13");			# alternating K256(+message) buffers
my ($ABCD_SAVE,$EFGH_SAVE) = ("q14","q15");	# state at the start of the block
my $Ktbl = "r3";				# hides the global $Ktbl in this block
579290207Sjkim
580290207Sjkim$code.=<<___;
581290207Sjkim#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
582290207Sjkim
583290207Sjkim# ifdef __thumb2__
584290207Sjkim#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
585290207Sjkim# else
586290207Sjkim#  define INST(a,b,c,d)	.byte	a,b,c,d
587290207Sjkim# endif
588290207Sjkim
589290207Sjkim.type	sha256_block_data_order_armv8,%function
590290207Sjkim.align	5
591290207Sjkimsha256_block_data_order_armv8:
592290207Sjkim.LARMv8:
593290207Sjkim	vld1.32	{$ABCD,$EFGH},[$ctx]
594290207Sjkim# ifdef __thumb2__
595290207Sjkim	adr	$Ktbl,.LARMv8
596290207Sjkim	sub	$Ktbl,$Ktbl,#.LARMv8-K256
597290207Sjkim# else
598305152Sjkim	sub	$Ktbl,$Ktbl,#256+32
599290207Sjkim# endif
600290207Sjkim	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
601290207Sjkim
602290207Sjkim.Loop_v8:
603290207Sjkim	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
604290207Sjkim	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
605290207Sjkim	vld1.32		{$W0},[$Ktbl]!
606290207Sjkim	vrev32.8	@MSG[0],@MSG[0]
607290207Sjkim	vrev32.8	@MSG[1],@MSG[1]
608290207Sjkim	vrev32.8	@MSG[2],@MSG[2]
609290207Sjkim	vrev32.8	@MSG[3],@MSG[3]
610290207Sjkim	vmov		$ABCD_SAVE,$ABCD	@ offload
611290207Sjkim	vmov		$EFGH_SAVE,$EFGH
612290207Sjkim	teq		$inp,$len
613290207Sjkim___
# Emit twelve copies of the combined "four rounds + message schedule" step.
# Each copy loads the next four K256 words into $W1 while consuming $W0, so
# the two names are swapped after every copy; @MSG is rotated so that
# $MSG[0] always names the register holding the words for the next step.
614290207Sjkimfor($i=0;$i<12;$i++) {
615290207Sjkim$code.=<<___;
616290207Sjkim	vld1.32		{$W1},[$Ktbl]!
617290207Sjkim	vadd.i32	$W0,$W0,@MSG[0]
618290207Sjkim	sha256su0	@MSG[0],@MSG[1]
619290207Sjkim	vmov		$abcd,$ABCD
620290207Sjkim	sha256h		$ABCD,$EFGH,$W0
621290207Sjkim	sha256h2	$EFGH,$abcd,$W0
622290207Sjkim	sha256su1	@MSG[0],@MSG[2],@MSG[3]
623290207Sjkim___
# generation-time renaming only; no instruction is emitted for it
624290207Sjkim	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
625290207Sjkim}
626290207Sjkim$code.=<<___;
627290207Sjkim	vld1.32		{$W1},[$Ktbl]!
628290207Sjkim	vadd.i32	$W0,$W0,@MSG[0]
629290207Sjkim	vmov		$abcd,$ABCD
630290207Sjkim	sha256h		$ABCD,$EFGH,$W0
631290207Sjkim	sha256h2	$EFGH,$abcd,$W0
632290207Sjkim
633290207Sjkim	vld1.32		{$W0},[$Ktbl]!
634290207Sjkim	vadd.i32	$W1,$W1,@MSG[1]
635290207Sjkim	vmov		$abcd,$ABCD
636290207Sjkim	sha256h		$ABCD,$EFGH,$W1
637290207Sjkim	sha256h2	$EFGH,$abcd,$W1
638290207Sjkim
639290207Sjkim	vld1.32		{$W1},[$Ktbl]
640290207Sjkim	vadd.i32	$W0,$W0,@MSG[2]
641290207Sjkim	sub		$Ktbl,$Ktbl,#256-16	@ rewind
642290207Sjkim	vmov		$abcd,$ABCD
643290207Sjkim	sha256h		$ABCD,$EFGH,$W0
644290207Sjkim	sha256h2	$EFGH,$abcd,$W0
645290207Sjkim
646290207Sjkim	vadd.i32	$W1,$W1,@MSG[3]
647290207Sjkim	vmov		$abcd,$ABCD
648290207Sjkim	sha256h		$ABCD,$EFGH,$W1
649290207Sjkim	sha256h2	$EFGH,$abcd,$W1
650290207Sjkim
651290207Sjkim	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
652290207Sjkim	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
653290207Sjkim	it		ne
654290207Sjkim	bne		.Loop_v8
655290207Sjkim
656290207Sjkim	vst1.32		{$ABCD,$EFGH},[$ctx]
657290207Sjkim
658290207Sjkim	ret		@ bx lr
659290207Sjkim.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
660290207Sjkim#endif
661290207Sjkim___
662290207Sjkim}}}
663290207Sjkim$code.=<<___;
664290207Sjkim.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
665238384Sjkim.align	2
666290207Sjkim#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
667290207Sjkim.comm   OPENSSL_armcap_P,4,4
668290207Sjkim#endif
669238384Sjkim___
670238384Sjkim
# Copy this script's leading comment block into the output as assembler
# comments: the "#!" line is skipped, a leading "#" becomes "@", empty lines
# are passed through, and copying stops at the first line that is neither.
# The script is opened with a lexical handle and three-argument open so that
# characters in its path are never taken as an open mode.  The header is
# purely informational, so a failure to read it is reported but does not
# abort code generation.
if (open(my $self,'<',$0)) {
	while(<$self>) {
		next if (/^#!/);
		last if (!s/^#/@/ and !/^$/);
		print;
	}
	close $self;
} else {
	warn "can't open $0 to copy the header comment: $!\n";
}
678290207Sjkim
{   # Base encodings of the ARMv8 SHA-256 instructions (register fields zero).
    my  %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

    # Hand-assemble one SHA-256 instruction.
    #   $mnemonic  key of %opcode
    #   $arg       operand text: two or three q registers, "qD[,qN],qM"
    # Returns an "INST(b0,b1,b2,b3)" macro invocation carrying the four
    # encoded bytes, least significant first, followed by the original
    # instruction as an assembler comment.
    sub unsha256 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	    my ($vd,$vn,$vm) = ($1,$2,$3);	# $vn is undefined for two-operand forms
	    $vn = 0 unless (defined($vn));

	    # each register number is split into a 3-bit field and a 1-bit field
	    my $word = $opcode{$mnemonic};
	    $word |= (($vd&7)<<13) | (($vd&8)<<19);
	    $word |= (($vn&7)<<17) | (($vn&8)<<4);
	    $word |= (($vm&7)<<1)  | (($vm&8)<<2);

	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    my @bytes = map { ($word>>(8*$_))&0xff } (0..3);
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			@bytes,$mnemonic,$arg;
	}
    }
}
700290207Sjkim
# Post-process the accumulated assembly text one line at a time and print it.
for my $line (split($/,$code)) {

	# evaluate the `...` expressions left in the templates
	$line =~ s/\`([^\`]*)\`/eval $1/geo;

	# replace ARMv8 SHA-256 instructions by their hand-made encoding
	$line =~ s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	# "ret" is spelled "bx lr"; a "bx lr" already present in the text is
	# emitted as a raw instruction word instead
	unless ($line =~ s/\bret\b/bx	lr/go) {
		$line =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4
	}

	print $line,"\n";
}
712290207Sjkim
# Enforce flush, and turn a failed flush/close (e.g. disk full) into a
# non-zero exit so the build does not continue with a truncated file.
close STDOUT or die "error closing STDOUT: $!";
714