#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA256/512 for ARMv8.
#
# Performance in cycles per processed byte and improvement coefficient
# over code generated with "default" compiler:
#
#		SHA256-hw	SHA256(*)	SHA512
# Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
# Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
# Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
# Denver	2.01		10.5 (+26%)	6.70 (+8%)
# X-Gene			20.0 (+100%)	12.8 (+300%(***))
#
# (*)	Software SHA256 results are of lesser relevance, presented
#	mostly for informational purposes.
# (**)	The result is a trade-off: it's possible to improve it by
#	10% (or by 1 cycle per round), but at the cost of 20% loss
#	on Cortex-A53 (or by 4 cycles per round).
# (***)	Super-impressive coefficients over gcc-generated code are
#	indication of some compiler "pathology", most notably code
#	generated with -mgeneral-regs-only is significantly faster
#	and the gap is only 40-90%.

# Command line: <flavour> <output-file>.  The flavour argument is consumed
# but not otherwise used in this script; the output file name both receives
# the generated assembly and selects the SHA variant (see below).
$flavour=shift;
$output=shift;

# A three-argument open with an undefined name would quietly open an
# anonymous temporary file, so insist on a real output path first.
die "usage: $0 <flavour> <output-file>\n" unless defined $output;

# Redirect STDOUT to the output file.  A failed open is fatal: carrying on
# would silently throw the generated code away.
open(STDOUT,">",$output) or die "can't open $output: $!";

# An output name containing "512" selects SHA-512, anything else SHA-256.
# @Sigma0/@Sigma1/@sigma0/@sigma1 hold the rotate/shift amounts that the
# round generator splices into ror#/lsr# operands; $reg_t picks 64-bit
# ("x") or 32-bit ("w") register names for the working set.
if ($output =~ /512/) {
	$BITS=512;
	$SZ=8;
	@Sigma0=(28,34,39);
	@Sigma1=(14,18,41);
	@sigma0=(1,  8, 7);
	@sigma1=(19,61, 6);
	$rounds=80;
	$reg_t="x";
} else {
	$BITS=256;
	$SZ=4;
	@Sigma0=( 2,13,22);
	@Sigma1=( 6,11,25);
	@sigma0=( 7,18, 3);
	@sigma1=(17,19,10);
	$rounds=64;
	$reg_t="w";
}

# Name of the exported entry point.
$func="sha${BITS}_block_data_order";

# Pointer-sized registers: context, input pointer, block count (turned into
# an end-of-input pointer by the prologue) and the K-table cursor.
($ctx,$inp,$num,$Ktbl)=map("x$_",(0..2,30));

# @X: sixteen message-schedule registers; @V: the eight working variables
# a..h; $t0..$t3: scratch registers.  All use the width chosen by $reg_t.
@X=map("$reg_t$_",(3..15,0..2));
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("$reg_t$_",(20..27));
($t0,$t1,$t2,$t3)=map("$reg_t$_",(16,17,19,28));
# Append the assembly for one round of the compression function to $code.
#
# Arguments:
#   $i          round index; this file calls it with 0..15 for the first
#               sixteen rounds and 16..31 for the body of .Loop_16_xx
#   $a .. $h    register names of the eight working variables in their
#               current rotation ($c is accepted but not referenced)
#
# Reads the file-level register map (@X, $t0..$t3, $inp, $Ktbl) and the
# per-variant constants ($SZ, @Sigma0, @Sigma1, @sigma0, @sigma1).
#
# Side effects: appends to the file-level $code and swaps the file-level
# $t2/$t3 before returning, because the register that received the next K
# word (and the a^b value) plays the opposite role in the following round.
sub BODY_00_xx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my $j=($i+1)&15;
# Temporaries are borrowed from schedule registers; plain element access is
# used here because a one-element slice only works by accident as a scalar.
my ($T0,$T1,$T2)=($X[($i-8)&15],$X[($i-9)&15],$X[($i-10)&15]);
   $T0=$X[$i+3] if ($i<11);

# Rounds 0..15: byte-swap the input word unless building for big-endian.
$code.=<<___	if ($i<16);
#ifndef	__ARMEB__
	rev	@X[$i],@X[$i]			// $i
#endif
___
# Odd rounds below 13 fetch the next pair of input words (post-increment).
$code.=<<___	if ($i<13 && ($i&1));
	ldp	@X[$i+1],@X[$i+2],[$inp],#2*$SZ
___
# Round 13 fetches the last pair without advancing the input pointer.
$code.=<<___	if ($i==13);
	ldp	@X[14],@X[15],[$inp]
___
# From round 14 on, reload a schedule word that was spilled to the stack.
$code.=<<___	if ($i>=14);
	ldr	@X[($i-11)&15],[sp,#`$SZ*(($i-11)%4)`]
___
# The i<15 round body leaves its final Sigma0 add commented out; it is
# issued here, at the top of the next round, where last round's h is now a.
$code.=<<___	if ($i>0 && $i<16);
	add	$a,$a,$t1			// h+=Sigma0(a)
___
# From round 11 on, spill the schedule word whose register doubles as $T0.
$code.=<<___	if ($i>=11);
	str	@X[($i-8)&15],[sp,#`$SZ*(($i-8)%4)`]
___
# While ARMv8 specifies merged rotate-n-logical operation such as
# 'eor x,y,z,ror#n', it was found to negatively affect performance
# on Apple A7. The reason seems to be that it requires even 'y' to
# be available earlier. This means that such merged instruction is
# not necessarily best choice on critical path... On the other hand
# Cortex-A5x handles merged instructions much better than disjoint
# rotate and logical... See (**) footnote above.
$code.=<<___	if ($i<15);
	ror	$t0,$e,#$Sigma1[0]
	add	$h,$h,$t2			// h+=K[i]
	eor	$T0,$e,$e,ror#`$Sigma1[2]-$Sigma1[1]`
	and	$t1,$f,$e
	bic	$t2,$g,$e
	add	$h,$h,@X[$i&15]			// h+=X[i]
	orr	$t1,$t1,$t2			// Ch(e,f,g)
	eor	$t2,$a,$b			// a^b, b^c in next round
	eor	$t0,$t0,$T0,ror#$Sigma1[1]	// Sigma1(e)
	ror	$T0,$a,#$Sigma0[0]
	add	$h,$h,$t1			// h+=Ch(e,f,g)
	eor	$t1,$a,$a,ror#`$Sigma0[2]-$Sigma0[1]`
	add	$h,$h,$t0			// h+=Sigma1(e)
	and	$t3,$t3,$t2			// (b^c)&=(a^b)
	add	$d,$d,$h			// d+=h
	eor	$t3,$t3,$b			// Maj(a,b,c)
	eor	$t1,$T0,$t1,ror#$Sigma0[1]	// Sigma0(a)
	add	$h,$h,$t3			// h+=Maj(a,b,c)
	ldr	$t3,[$Ktbl],#$SZ		// *K++, $t2 in next round
	//add	$h,$h,$t1			// h+=Sigma0(a)
___
# Rounds 15 and up interleave the round with the message-schedule update
# for X[j], and complete the Sigma0 add within the round itself.
$code.=<<___	if ($i>=15);
	ror	$t0,$e,#$Sigma1[0]
	add	$h,$h,$t2			// h+=K[i]
	ror	$T1,@X[($j+1)&15],#$sigma0[0]
	and	$t1,$f,$e
	ror	$T2,@X[($j+14)&15],#$sigma1[0]
	bic	$t2,$g,$e
	ror	$T0,$a,#$Sigma0[0]
	add	$h,$h,@X[$i&15]			// h+=X[i]
	eor	$t0,$t0,$e,ror#$Sigma1[1]
	eor	$T1,$T1,@X[($j+1)&15],ror#$sigma0[1]
	orr	$t1,$t1,$t2			// Ch(e,f,g)
	eor	$t2,$a,$b			// a^b, b^c in next round
	eor	$t0,$t0,$e,ror#$Sigma1[2]	// Sigma1(e)
	eor	$T0,$T0,$a,ror#$Sigma0[1]
	add	$h,$h,$t1			// h+=Ch(e,f,g)
	and	$t3,$t3,$t2			// (b^c)&=(a^b)
	eor	$T2,$T2,@X[($j+14)&15],ror#$sigma1[1]
	eor	$T1,$T1,@X[($j+1)&15],lsr#$sigma0[2]	// sigma0(X[i+1])
	add	$h,$h,$t0			// h+=Sigma1(e)
	eor	$t3,$t3,$b			// Maj(a,b,c)
	eor	$t1,$T0,$a,ror#$Sigma0[2]	// Sigma0(a)
	eor	$T2,$T2,@X[($j+14)&15],lsr#$sigma1[2]	// sigma1(X[i+14])
	add	@X[$j],@X[$j],@X[($j+9)&15]
	add	$d,$d,$h			// d+=h
	add	$h,$h,$t3			// h+=Maj(a,b,c)
	ldr	$t3,[$Ktbl],#$SZ		// *K++, $t2 in next round
	add	@X[$j],@X[$j],$T1
	add	$h,$h,$t1			// h+=Sigma0(a)
	add	@X[$j],@X[$j],$T2
___
	# Exchange the roles of the two scratch registers for the next round.
	($t2,$t3)=($t3,$t2);
}

# File header and entry label of the exported function.
$code.=<<___;
#include "arm_arch.h"

.text

.globl	$func
.type	$func,%function
.align	6
$func:
___
# SHA-256 only: test the ARMV8_SHA256 bit of OPENSSL_armcap_P (located via
# the PC-relative offset stored at .LOPENSSL_armcap_P) and branch to the
# hardware-assisted implementation when it is set.
$code.=<<___	if ($SZ==4);
	ldr	x16,.LOPENSSL_armcap_P
	adr	x17,.LOPENSSL_armcap_P
	add	x16,x16,x17
	ldr	w16,[x16]
	tst	w16,#ARMV8_SHA256
	b.ne	.Lv8_entry
___
# Integer-only path: build a 128-byte frame, save x19-x28, reserve 4*$SZ
# bytes of stack for spilled schedule words, load the eight state words,
# turn the block count into an end-of-input pointer and start the per-block
# loop.  The context/end pointers and the input cursor are parked in the
# frame at x29+96 and x29+112.
$code.=<<___;
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0

	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#4*$SZ

	ldp	$A,$B,[$ctx]				// load context
	ldp	$C,$D,[$ctx,#2*$SZ]
	ldp	$E,$F,[$ctx,#4*$SZ]
	add	$num,$inp,$num,lsl#`log(16*$SZ)/log(2)`	// end of input
	ldp	$G,$H,[$ctx,#6*$SZ]
	adr	$Ktbl,K$BITS
	stp	$ctx,$num,[x29,#96]

.Loop:
	ldp	@X[0],@X[1],[$inp],#2*$SZ
	ldr	$t2,[$Ktbl],#$SZ			// *K++
	eor	$t3,$B,$C				// magic seed
	str	$inp,[x29,#112]
___
# Rounds 0..15 are emitted once; rounds 16..31 form the body of
# .Loop_16_xx.  @V is rotated by one position after every round so that
# each call sees the working variables under their new names.  The
# file-level counter $i deliberately carries over from the first loop into
# the second.
for ($i=0;$i<16;$i++)	{ BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
$code.=".Loop_16_xx:\n";
for (;$i<32;$i++)	{ BODY_00_xx($i,@V); unshift(@V,pop(@V)); }
# Tail of the integer-only path.  .Loop_16_xx repeats until the K word just
# loaded is the zero terminator of the table; then the K cursor is rewound
# past all $rounds entries plus the terminator, the working variables are
# added into the saved state, and the per-block loop repeats until the
# input pointer reaches the end pointer.  Finally the callee-saved
# registers and the frame are restored, and the K table label is opened.
$code.=<<___;
	cbnz	$t2,.Loop_16_xx

	ldp	$ctx,$num,[x29,#96]
	ldr	$inp,[x29,#112]
	sub	$Ktbl,$Ktbl,#`$SZ*($rounds+1)`		// rewind

	ldp	@X[0],@X[1],[$ctx]
	ldp	@X[2],@X[3],[$ctx,#2*$SZ]
	add	$inp,$inp,#14*$SZ			// advance input pointer
	ldp	@X[4],@X[5],[$ctx,#4*$SZ]
	add	$A,$A,@X[0]
	ldp	@X[6],@X[7],[$ctx,#6*$SZ]
	add	$B,$B,@X[1]
	add	$C,$C,@X[2]
	add	$D,$D,@X[3]
	stp	$A,$B,[$ctx]
	add	$E,$E,@X[4]
	add	$F,$F,@X[5]
	stp	$C,$D,[$ctx,#2*$SZ]
	add	$G,$G,@X[6]
	add	$H,$H,@X[7]
	cmp	$inp,$num
	stp	$E,$F,[$ctx,#4*$SZ]
	stp	$G,$H,[$ctx,#6*$SZ]
	b.ne	.Loop

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#4*$SZ
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#128
	ret
.size	$func,.-$func

.align	6
.type	K$BITS,%object
K$BITS:
___
# Round constants for the 512-bit variant: 80 quadwords followed by a zero
# terminator, which the integer-only round loop uses as its stop marker.
$code.=<<___ if ($SZ==8);
	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
	.quad	0x3956c25bf348b538,0x59f111f1b605d019
	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
	.quad	0xd807aa98a3030242,0x12835b0145706fbe
	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
	.quad	0x06ca6351e003826f,0x142929670a0e6e70
	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
	.quad	0x81c2c92e47edaee6,0x92722c851482353b
	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
	.quad	0xd192e819d6ef5218,0xd69906245565a910
	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
	.quad	0x90befffa23631e28,0xa4506cebde82bde9
	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
	.quad	0xca273eceea26619c,0xd186b8c721c0c207
	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
	.quad	0x113f9804bef90dae,0x1b710b35131c471b
	.quad	0x28db77f523047d84,0x32caab7b40c72493
	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
	.quad	0	// terminator
___
# Round constants for the 256-bit variant: 64 words plus a zero terminator.
$code.=<<___ if ($SZ==4);
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0	//terminator
___
# Close the table, then emit the PC-relative offset to OPENSSL_armcap_P
# that the SHA-256 entry code reads, followed by the identification string.
$code.=<<___;
.size	K$BITS,.-K$BITS
.align	3
.LOPENSSL_armcap_P:
	.quad	OPENSSL_armcap_P-.
.asciz	"SHA$BITS block transform for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

# Hardware-assisted SHA-256 path, generated for the 256-bit variant only.
# It is entered through .Lv8_entry from the capability test at the top of
# the exported function.  The ".32"/".i32" suffixes and ".16b" arrangements
# written here are normalised by the post-processing loop at the end of
# this script, and the sha256* mnemonics are turned into .inst words there.
if ($SZ==4) {
my $Ktbl="x3";		# K-table cursor for this path only

my ($ABCD,$EFGH,$abcd)=map("v$_.16b",(0..2));
my @MSG=map("v$_.16b",(4..7));
my ($W0,$W1)=("v16.4s","v17.4s");
my ($ABCD_SAVE,$EFGH_SAVE)=("v18.16b","v19.16b");

# Per block: load 64 bytes of input, byte-reverse each 32-bit word, and
# keep a copy of the incoming state for the final addition.
$code.=<<___;
.type	sha256_block_armv8,%function
.align	6
sha256_block_armv8:
.Lv8_entry:
	stp		x29,x30,[sp,#-16]!
	add		x29,sp,#0

	ld1.32		{$ABCD,$EFGH},[$ctx]
	adr		$Ktbl,K256

.Loop_hw:
	ld1		{@MSG[0]-@MSG[3]},[$inp],#64
	sub		$num,$num,#1
	ld1.32		{$W0},[$Ktbl],#16
	rev32		@MSG[0],@MSG[0]
	rev32		@MSG[1],@MSG[1]
	rev32		@MSG[2],@MSG[2]
	rev32		@MSG[3],@MSG[3]
	orr		$ABCD_SAVE,$ABCD,$ABCD		// offload
	orr		$EFGH_SAVE,$EFGH,$EFGH
___
# Twelve steps that each consume four K words and also update the message
# schedule; the K registers and the @MSG window rotate after every step.
for($i=0;$i<12;$i++) {
$code.=<<___;
	ld1.32		{$W1},[$Ktbl],#16
	add.i32		$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	orr		$abcd,$ABCD,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
# Four closing steps without schedule updates.  The K cursor has advanced
# by 15*16 bytes at the rewind, which is what $rounds*$SZ-16 undoes.
$code.=<<___;
	ld1.32		{$W1},[$Ktbl],#16
	add.i32		$W0,$W0,@MSG[0]
	orr		$abcd,$ABCD,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	ld1.32		{$W0},[$Ktbl],#16
	add.i32		$W1,$W1,@MSG[1]
	orr		$abcd,$ABCD,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	ld1.32		{$W1},[$Ktbl]
	add.i32		$W0,$W0,@MSG[2]
	sub		$Ktbl,$Ktbl,#$rounds*$SZ-16	// rewind
	orr		$abcd,$ABCD,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	add.i32		$W1,$W1,@MSG[3]
	orr		$abcd,$ABCD,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	add.i32		$ABCD,$ABCD,$ABCD_SAVE
	add.i32		$EFGH,$EFGH,$EFGH_SAVE

	cbnz		$num,.Loop_hw

	st1.32		{$ABCD,$EFGH},[$ctx]

	ldr		x29,[sp],#16
	ret
.size	sha256_block_armv8,.-sha256_block_armv8
___
}

# Common-symbol declaration (size 4, alignment 4) for the capability word
# that the SHA-256 entry code tests.
$code.=<<___;
.comm	OPENSSL_armcap_P,4,4
___

# Base opcodes of the four SHA-256 instructions; register numbers are OR-ed
# into them by unsha256() below.
{   my  %opcode = (
	"sha256h"	=> 0x5e004000,	"sha256h2"	=> 0x5e005000,
	"sha256su0"	=> 0x5e282800,	"sha256su1"	=> 0x5e006000	);

    # Encode one SHA-256 instruction as a raw ".inst" word so the output
    # assembles even when the assembler does not know the mnemonic.
    #
    #   $mnemonic  instruction name, one of the keys of %opcode
    #   $arg       operand text: two or three q/v registers, comma separated
    #
    # Returns ".inst\t0x<word>\t//<mnemonic> <operands>".  The first register
    # number lands in bits 0+, the second is shifted left by 5 and the
    # optional third by 16.  If the mnemonic is unknown or the operands do
    # not parse, the instruction text is returned unchanged rather than
    # being replaced by an empty string or a bogus encoding.
    sub unsha256 {
	my ($mnemonic,$arg)=@_;

	if (exists $opcode{$mnemonic} &&
	    $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/) {
	    # Two-operand forms have no third register; treat it as 0.
	    my ($rd,$rn,$rm)=($1+0,$2+0,defined($3)?$3+0:0);

	    return sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$rd|($rn<<5)|($rm<<16),
			$mnemonic,$arg;
	}

	return "$mnemonic\t$arg";
    }
}

# Post-process the accumulated assembly line by line and print it to STDOUT
# (which was redirected to the output file at the top of the script).
foreach my $line (split("\n",$code)) {

	# Evaluate `...` arithmetic left in the templates.  The expressions
	# come only from this script's own heredocs, never from outside input.
	$line =~ s/\`([^\`]*)\`/eval($1)/ge;

	# Replace SHA-256 mnemonics with their .inst encodings.
	$line =~ s/\b(sha256\w+)\s+([qv].*)/unsha256($1,$2)/ge;

	# Drop a ".32"/".i32"-style suffix; on lines that had one, rewrite
	# ".16b" arrangements to ".4s".
	$line =~ s/\.\w?32\b//		and $line =~ s/\.16b/\.4s/g;
	# Lane-indexed ld1/st1 forms use ".s" instead of ".4s".
	$line =~ m/(ld|st)1[^\[]+\[0\]/	and $line =~ s/\.4s/\.s/g;

	print $line,"\n";
}

# Buffered write errors (for example a full disk) only surface at close,
# so a failure here must abort instead of leaving a truncated output file.
close STDOUT or die "error closing STDOUT: $!";