aesv8-armx.pl revision 325333
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in the sense that it supports both big-
# and little-endian cases, and it supports both 32- and 64-bit modes
# of operation. The latter is achieved by limiting the number of
# utilized registers to 16, which implies additional NEON load and
# integer instructions. This has no effect on the mighty Apple A7,
# where results literally match the theoretical estimates based on
# AES instruction latencies and issue rates. On the Cortex-A53, an
# in-order execution core, this costs up to 10-15%, which is
# partially compensated for by a dedicated code path for the 128-bit
# CBC encrypt case. On the Cortex-A57, performance in parallelizable
# modes seems to be limited by the sheer amount of NEON
# instructions...
#
# Performance in cycles per byte processed with 128-bit key:
#
#		CBC enc		CBC dec		CTR
# Apple A7	2.39		1.20		1.20
# Cortex-A53	1.32		1.29		1.46
# Cortex-A57(*)	1.95		0.85		0.93
# Denver	1.96		0.86		0.80
#
# (*)	original 3.64/1.34/1.32 results were for the r0p0 revision
#	and are still the same even for the updated module;

$flavour = shift;
open STDOUT,">".shift;
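
# Typical perlasm invocation (a sketch; "linux64"/"linux32" are assumed
# flavour names borrowed from sibling OpenSSL scripts -- this module
# itself only tests whether the flavour string contains "64"):
#
#	perl aesv8-armx.pl linux64 aesv8-armx.S		# AArch64
#	perl aesv8-armx.pl linux32 aesv8-armx.S		# AArch32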

$prefix="aes_v8";

$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
.text
___
# $code.=".arch	armv8-a+crypto\n"			if ($flavour =~ /64/);
$code.=".arch	armv7-a\n.fpu	neon\n.code	32\n"	if ($flavour !~ /64/);
		#^^^^^^ this is done to simplify adoption by not depending
		#	on the latest binutils.

# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax:
# NEON is mostly 32-bit mnemonics, integer mostly 64-bit. The goal is
# to maintain both 32- and 64-bit code within a single module and
# transliterate the common code to either flavour with regex voodoo,
# as sketched below.
#
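# For example, a line of common code such as
#
#	vld1.32	{q8},[x7],#16
#
# passes (for the 64-bit flavour) through roughly these substitutions
# from the post-processing loop at the bottom of this file (an
# illustrative sketch, not executed here):
#
#	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/ge;	# vld1.32 {v16.16b},[x7],#16
#	s/^(\s+)v/$1/;					# ld1.32 {v16.16b},[x7],#16
#	s/\.[ui]?32// and s/\.16b/\.4s/g;		# ld1 {v16.4s},[x7],#16
#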
{{{
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


$code.=<<___;
.align	5
rcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___;
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

	adr	$ptr,rcon
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

.align	4
.Loop128:
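	// One round key per iteration: vtbl splats the last word of the
	// previous round key rotated left by one byte (RotWord); aese
	// against an all-zero register reduces to plain SubBytes, since
	// ShiftRows is a no-op on a splatted vector; the vext/veor
	// chain then folds in the running XOR of the previous words.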
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#8
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	 veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
$code.=<<___	if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key],#16
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	aes$mc	$inout,$inout
	vld1.32	{$rndkey1},[$key],#16
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key]
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}            @ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
___
$code.=<<___;
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec

	cmp	$rounds,#2
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
	b.eq	.Lcbc_enc128

	vld1.32	{$in0-$in1},[$key_]
	add	$key_,$key,#16
	add	$key4,$key,#16*4
	add	$key5,$key,#16*5
	aese	$dat,q8
	aesmc	$dat,$dat
	add	$key6,$key,#16*6
	add	$key7,$key,#16*7
	b	.Lenter_cbc_enc

.align	4
.Loop_cbc_enc:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc:
	aese	$dat,q9
	aesmc	$dat,$dat
	aese	$dat,$in0
	aesmc	$dat,$dat
	vld1.32	{q8},[$key4]
	cmp	$rounds,#4
	aese	$dat,$in1
	aesmc	$dat,$dat
	vld1.32	{q9},[$key5]
	b.eq	.Lcbc_enc192

	aese	$dat,q8
	aesmc	$dat,$dat
	vld1.32	{q8},[$key6]
	aese	$dat,q9
	aesmc	$dat,$dat
	vld1.32	{q9},[$key7]
	nop

.Lcbc_enc192:
	aese	$dat,q8
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	 vld1.32 {q9},[$key_]		// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done
___
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
$code.=<<___;
.align	5
.Lcbc_dec:
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in2,$dat2,$dat2
	b.lo	.Lcbc_dec_tail

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	vorr	$in2,$dat2,$dat2

.Loop3x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	 veor	$tmp0,$ivec,$rndlast
	 subs	$len,$len,#0x30
	 veor	$tmp1,$in0,$rndlast
	 mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	 veor	$tmp2,$in1,$rndlast
	 add	$inp,$inp,x6		// $inp is adjusted in such a way
					// that at exit from the loop
					// $dat1-$dat2 are loaded with last "words"
	 vorr	$ivec,$in2,$in2
	 mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	 vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	 vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	 vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	 vld1.32 {q8},[$key_],#16	// re-pre-load rndkey[0]
	 add	$cnt,$rounds,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	 vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	 vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	 vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_cbc_dec

	cmn	$len,#0x30
	b.eq	.Lcbc_done
	nop

.Lcbc_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lcbc_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	 cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	 veor	$tmp1,$ivec,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	 veor	$tmp2,$in1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lcbc_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	veor	$tmp1,$tmp1,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16

.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
}
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";		# aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___	if ($flavour =~ /64/);
	stp		x29,x30,[sp,#-16]!
	add		x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov		ip,sp
	stmdb		sp!,{r4-r10,lr}
	vstmdb		sp!,{d8-d15}            @ ABI specification says so
	ldr		r4, [ip]		@ load remaining arg
___
$code.=<<___;
	ldr		$rounds,[$key,#240]

	ldr		$ctr, [$ivp, #12]
	vld1.32		{$dat0},[$ivp]

	vld1.32		{q8-q9},[$key]		// load key schedule...
	sub		$rounds,$rounds,#4
	mov		$step,#16
	cmp		$len,#2
	add		$key_,$key,x5,lsl#4	// pointer to last 5 round keys
	sub		$rounds,$rounds,#2
	vld1.32		{q12-q13},[$key_],#32
	vld1.32		{q14-q15},[$key_],#32
	vld1.32		{$rndlast},[$key_]
	add		$key_,$key,#32
	mov		$cnt,$rounds
	cclr		$step,lo
#ifndef __ARMEB__
	rev		$ctr, $ctr
#endif
	vorr		$dat1,$dat0,$dat0
	add		$tctr1, $ctr, #1
	vorr		$dat2,$dat0,$dat0
	add		$ctr, $ctr, #2
	vorr		$ivec,$dat0,$dat0
	rev		$tctr1, $tctr1
	vmov.32		${dat1}[3],$tctr1
	b.ls		.Lctr32_tail
	rev		$tctr2, $ctr
	sub		$len,$len,#3		// bias
	vmov.32		${dat2}[3],$tctr2
	b		.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	aese		$dat2,q9
	aesmc		$dat2,$dat2
	vld1.32		{q9},[$key_],#16
	b.gt		.Loop3x_ctr32

	aese		$dat0,q8
	aesmc		$tmp0,$dat0
	aese		$dat1,q8
	aesmc		$tmp1,$dat1
	 vld1.8		{$in0},[$inp],#16
	 vorr		$dat0,$ivec,$ivec
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	 vld1.8		{$in1},[$inp],#16
	 vorr		$dat1,$ivec,$ivec
	aese		$tmp0,q9
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q9
	aesmc		$tmp1,$tmp1
	 vld1.8		{$in2},[$inp],#16
	 mov		$key_,$key
	aese		$dat2,q9
	aesmc		$tmp2,$dat2
	 vorr		$dat2,$ivec,$ivec
	 add		$tctr0,$ctr,#1
	aese		$tmp0,q12
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q12
	aesmc		$tmp1,$tmp1
	 veor		$in0,$in0,$rndlast
	 add		$tctr1,$ctr,#2
	aese		$tmp2,q12
	aesmc		$tmp2,$tmp2
	 veor		$in1,$in1,$rndlast
	 add		$ctr,$ctr,#3
	aese		$tmp0,q13
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q13
	aesmc		$tmp1,$tmp1
	 veor		$in2,$in2,$rndlast
	 rev		$tctr0,$tctr0
	aese		$tmp2,q13
	aesmc		$tmp2,$tmp2
	 vmov.32	${dat0}[3], $tctr0
	 rev		$tctr1,$tctr1
	aese		$tmp0,q14
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q14
	aesmc		$tmp1,$tmp1
	 vmov.32	${dat1}[3], $tctr1
	 rev		$tctr2,$ctr
	aese		$tmp2,q14
	aesmc		$tmp2,$tmp2
	 vmov.32	${dat2}[3], $tctr2
	 subs		$len,$len,#3
	aese		$tmp0,q15
	aese		$tmp1,q15
	aese		$tmp2,q15

	veor		$in0,$in0,$tmp0
	 vld1.32	 {q8},[$key_],#16	// re-pre-load rndkey[0]
	vst1.8		{$in0},[$out],#16
	veor		$in1,$in1,$tmp1
	 mov		$cnt,$rounds
	vst1.8		{$in1},[$out],#16
	veor		$in2,$in2,$tmp2
	 vld1.32	 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8		{$in2},[$out],#16
	b.hs		.Loop3x_ctr32

	adds		$len,$len,#3
	b.eq		.Lctr32_done
	cmp		$len,#1
	mov		$step,#16
	cclr		$step,eq

.Lctr32_tail:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	vld1.32		{q9},[$key_],#16
	b.gt		.Lctr32_tail

	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	 vld1.8		{$in0},[$inp],$step
	aese		$dat0,q12
	aesmc		$dat0,$dat0
	aese		$dat1,q12
	aesmc		$dat1,$dat1
	 vld1.8		{$in1},[$inp]
	aese		$dat0,q13
	aesmc		$dat0,$dat0
	aese		$dat1,q13
	aesmc		$dat1,$dat1
	 veor		$in0,$in0,$rndlast
	aese		$dat0,q14
	aesmc		$dat0,$dat0
	aese		$dat1,q14
	aesmc		$dat1,$dat1
	 veor		$in1,$in1,$rndlast
	aese		$dat0,q15
	aese		$dat1,q15

	cmp		$len,#1
	veor		$in0,$in0,$dat0
	veor		$in1,$in1,$dat1
	vst1.8		{$in0},[$out],#16
	b.eq		.Lctr32_done
	vst1.8		{$in1},[$out]

.Lctr32_done:
___
$code.=<<___	if ($flavour !~ /64/);
	vldmia		sp!,{d8-d15}
	ldmia		sp!,{r4-r10,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr		x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
$code.=<<___;
#endif
___
########################################
if ($flavour =~ /64/) {			######## 64-bit code
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };
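
    # For instance (illustrative only): "aese v0.16b,v1.16b" captures
    # $1=0, $2=1, so the word is 0x4e284800|0|(1<<5) and the emitted
    # line would be
    #
    #	.inst	0x4e284820	//aese v0.16b,v1.16b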

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {				######## 32-bit code
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # ARMv7 instructions are always encoded little-endian, so
	    # emit the raw bytes; the correct solution is the .inst
	    # directive, but older assemblers don't implement it:-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };
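
    # For instance (illustrative only): "aese q0,q1" gives
    # $word = 0xf3b00300|(1<<1) = 0xf3b00302, emitted little-endian as
    #
    #	.byte	0x02,0x03,0xb0,0xf3	@ aese q0,q1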

    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }
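
    # E.g. "vtbl.8	q0,{q1},q2" splits into two d-register lookups:
    #	vtbl.8	d0,{q1},d4
    #	vtbl.8	d1,{q1},d5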

    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }
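
    # E.g. "vdup.32	q0,q1[3]" becomes "vdup.32	q0,d3[1]":
    # lane 3 of q1 is lane 1 of its high half, d3.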

    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }
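
    # E.g. "vmov.32	q1[3],r5" becomes "vmov.32	d3[1],r5",
    # by the same q-to-d lane mapping.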

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)mov\./$1mov/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
}

close STDOUT;