#! /usr/bin/env perl
# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in the sense that it supports both big-
# and little-endian cases, and it supports both 32- and 64-bit modes
# of operation. The latter is achieved by limiting the number of
# utilized registers to 16, which implies additional NEON load and
# integer instructions. This has no effect on the mighty Apple A7,
# where results are literally equal to the theoretical estimates based
# on AES instruction latencies and issue rates. On Cortex-A53, an
# in-order execution core, this costs up to 10-15%, which is partially
# compensated by implementing a dedicated code path for the 128-bit
# CBC encrypt case. On Cortex-A57, parallelizable-mode performance
# seems to be limited by the sheer amount of NEON instructions...
#
# Performance in cycles per byte processed with 128-bit key:
#
#		CBC enc		CBC dec		CTR
# Apple A7	2.39		1.20		1.20
# Cortex-A53	1.32		1.29		1.46
# Cortex-A57(*)	1.95		0.85		0.93
# Denver	1.96		0.86		0.80
# Mongoose	1.33		1.20		1.20
#
# (*)	original 3.64/1.34/1.32 results were for the r0p0 revision
#	and remain the same even for the updated module;

$flavour = shift;
$output  = shift;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
*STDOUT=*OUT;

$prefix="aes_hw";

$code=<<___;
#include <openssl/arm_arch.h>

#if __ARM_MAX_ARCH__>=7
.text
___
$code.=".arch	armv8-a+crypto\n"			if ($flavour =~ /64/);
$code.=<<___						if ($flavour !~ /64/);
.arch	armv7-a	// don't confuse not-so-latest binutils with armv8 :-)
.fpu	neon
.code	32
#undef	__thumb2__
___

# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax:
# NEON uses mostly 32-bit mnemonics, integer instructions mostly
# 64-bit ones. The goal is to maintain both 32- and 64-bit code within
# a single module and transliterate the common code to either flavour
# with regex voodoo.
#
{{{
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
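# Note: the 32-bit flavour maps the temporaries onto q0-q3 and q8-q10,
# skipping q4-q7 (d8-d15), which AAPCS requires callees to preserve;
# that keeps set_encrypt_key free of a vstmdb/vldmia pair.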


$code.=<<___;
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b
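// .Lrcon packs three constants: the initial round constant (0x01 in
// every lane, doubled in GF(2^8) as the schedule advances), vtbl
// indices that rotate a word left by one byte while splatting the
// last word into every lane, and the 0x1b constant reloaded when
// doubling 0x80 would wrap past eight bits.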

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___;
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

	adr	$ptr,.Lrcon
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256
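
// Each pass through .Loop128 derives one round key: vtbl applies the
// rotate-n-splat mask to the previous key, aese against the all-zero
// $zero register degenerates to plain SubBytes (ShiftRows permutes
// nothing on a splatted vector), and the vext/veor chain folds the
// running XOR through the remaining three words before the round
// constant is mixed in.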
.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask
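	// $in1 carries only the two trailing key words, so sliding the
	// vtbl indices down by 8 rotate-n-splats its second word instead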

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#8
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	 veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule
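	// Build the "equivalent inverse cipher" schedule: round keys are
	// swapped end-for-end, with aesimc (InvMixColumns) applied to
	// every key except the outermost pair; the middle key is
	// transformed in place after the loop.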

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
$code.=<<___	if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
sub gen_block {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
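# Emits a single-block ${prefix}_encrypt or ${prefix}_decrypt: the loop
# peels two rounds per iteration, aese/aesmc are swapped for
# aesd/aesimc in the decrypt flavour via $e/$mc, and the final round's
# AddRoundKey is done with a plain veor.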

$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key],#16
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	aes$mc	$inout,$inout
	vld1.32	{$rndkey1},[$key],#16
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key]
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}            @ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
___
$code.=<<___;
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq
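	// The cclr "mnemonic" is virtual: the post-pass turns it into csel
	// with wzr/xzr on 64-bit and into a conditional mov on 32-bit. It
	// zeroes $step on the final block so no load runs past the input.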

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec

	cmp	$rounds,#2
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
	b.eq	.Lcbc_enc128

	vld1.32	{$in0-$in1},[$key_]
	add	$key_,$key,#16
	add	$key4,$key,#16*4
	add	$key5,$key,#16*5
	aese	$dat,q8
	aesmc	$dat,$dat
	add	$key6,$key,#16*6
	add	$key7,$key,#16*7
	b	.Lenter_cbc_enc

.align	4
.Loop_cbc_enc:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc:
	aese	$dat,q9
	aesmc	$dat,$dat
	aese	$dat,$in0
	aesmc	$dat,$dat
	vld1.32	{q8},[$key4]
	cmp	$rounds,#4
	aese	$dat,$in1
	aesmc	$dat,$dat
	vld1.32	{q9},[$key5]
	b.eq	.Lcbc_enc192

	aese	$dat,q8
	aesmc	$dat,$dat
	vld1.32	{q8},[$key6]
	aese	$dat,q9
	aesmc	$dat,$dat
	vld1.32	{q9},[$key7]
	nop

.Lcbc_enc192:
	aese	$dat,q8
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	 vld1.32 {q9},[$key_]		// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done
___
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
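# CBC decryption has no inter-block dependency, so the main loop keeps
# three blocks in flight to hide aesd/aesimc latency; inputs shorter
# than three blocks drop straight into .Lcbc_dec_tail.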
$code.=<<___;
.align	5
.Lcbc_dec:
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in2,$dat2,$dat2
	b.lo	.Lcbc_dec_tail

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	vorr	$in2,$dat2,$dat2

.Loop3x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	 veor	$tmp0,$ivec,$rndlast
	 subs	$len,$len,#0x30
	 veor	$tmp1,$in0,$rndlast
	 mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	 veor	$tmp2,$in1,$rndlast
	 add	$inp,$inp,x6		// $inp is adjusted so that at
					// exit from the loop $dat1-$dat2
					// are loaded with the last "words"
	 vorr	$ivec,$in2,$in2
	 mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	 vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	 vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	 vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	 vld1.32 {q8},[$key_],#16	// re-pre-load rndkey[0]
	 add	$cnt,$rounds,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	 vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	 vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	 vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_cbc_dec

	cmn	$len,#0x30
	b.eq	.Lcbc_done
	nop

.Lcbc_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lcbc_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	 cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	 veor	$tmp1,$ivec,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	 veor	$tmp2,$in1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lcbc_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	veor	$tmp1,$tmp1,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16

.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
}
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";		# aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15	preloaded key schedule
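# Keystream blocks are independent in counter mode, so the layout
# mirrors CBC decrypt: three blocks in flight, with the 32-bit
# big-endian counter patched into lane 3 of each block vector.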

$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___	if ($flavour =~ /64/);
	stp		x29,x30,[sp,#-16]!
	add		x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov		ip,sp
	stmdb		sp!,{r4-r10,lr}
	vstmdb		sp!,{d8-d15}            @ ABI specification says so
	ldr		r4, [ip]		@ load remaining arg
___
$code.=<<___;
	ldr		$rounds,[$key,#240]

	ldr		$ctr, [$ivp, #12]
	vld1.32		{$dat0},[$ivp]

	vld1.32		{q8-q9},[$key]		// load key schedule...
	sub		$rounds,$rounds,#4
	mov		$step,#16
	cmp		$len,#2
	add		$key_,$key,x5,lsl#4	// pointer to last 5 round keys
	sub		$rounds,$rounds,#2
	vld1.32		{q12-q13},[$key_],#32
	vld1.32		{q14-q15},[$key_],#32
	vld1.32		{$rndlast},[$key_]
	add		$key_,$key,#32
	mov		$cnt,$rounds
	cclr		$step,lo
#ifndef __ARMEB__
	rev		$ctr, $ctr
#endif
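	// the counter travels big-endian in the vector lanes; rev yields
	// a native-order copy for the increments and is applied again
	// before each lane insert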
	vorr		$dat1,$dat0,$dat0
	add		$tctr1, $ctr, #1
	vorr		$dat2,$dat0,$dat0
	add		$ctr, $ctr, #2
	vorr		$ivec,$dat0,$dat0
	rev		$tctr1, $tctr1
	vmov.32		${dat1}[3],$tctr1
	b.ls		.Lctr32_tail
	rev		$tctr2, $ctr
	sub		$len,$len,#3		// bias
	vmov.32		${dat2}[3],$tctr2
	b		.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	aese		$dat2,q9
	aesmc		$dat2,$dat2
	vld1.32		{q9},[$key_],#16
	b.gt		.Loop3x_ctr32

	aese		$dat0,q8
	aesmc		$tmp0,$dat0
	aese		$dat1,q8
	aesmc		$tmp1,$dat1
	 vld1.8		{$in0},[$inp],#16
	 vorr		$dat0,$ivec,$ivec
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	 vld1.8		{$in1},[$inp],#16
	 vorr		$dat1,$ivec,$ivec
	aese		$tmp0,q9
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q9
	aesmc		$tmp1,$tmp1
	 vld1.8		{$in2},[$inp],#16
	 mov		$key_,$key
	aese		$dat2,q9
	aesmc		$tmp2,$dat2
	 vorr		$dat2,$ivec,$ivec
	 add		$tctr0,$ctr,#1
	aese		$tmp0,q12
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q12
	aesmc		$tmp1,$tmp1
	 veor		$in0,$in0,$rndlast
	 add		$tctr1,$ctr,#2
	aese		$tmp2,q12
	aesmc		$tmp2,$tmp2
	 veor		$in1,$in1,$rndlast
	 add		$ctr,$ctr,#3
	aese		$tmp0,q13
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q13
	aesmc		$tmp1,$tmp1
	 veor		$in2,$in2,$rndlast
	 rev		$tctr0,$tctr0
	aese		$tmp2,q13
	aesmc		$tmp2,$tmp2
	 vmov.32	${dat0}[3], $tctr0
	 rev		$tctr1,$tctr1
	aese		$tmp0,q14
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q14
	aesmc		$tmp1,$tmp1
	 vmov.32	${dat1}[3], $tctr1
	 rev		$tctr2,$ctr
	aese		$tmp2,q14
	aesmc		$tmp2,$tmp2
	 vmov.32	${dat2}[3], $tctr2
	 subs		$len,$len,#3
	aese		$tmp0,q15
	aese		$tmp1,q15
	aese		$tmp2,q15

	veor		$in0,$in0,$tmp0
	 vld1.32	 {q8},[$key_],#16	// re-pre-load rndkey[0]
	vst1.8		{$in0},[$out],#16
	veor		$in1,$in1,$tmp1
	 mov		$cnt,$rounds
	vst1.8		{$in1},[$out],#16
	veor		$in2,$in2,$tmp2
	 vld1.32	 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8		{$in2},[$out],#16
	b.hs		.Loop3x_ctr32

	adds		$len,$len,#3
	b.eq		.Lctr32_done
	cmp		$len,#1
	mov		$step,#16
	cclr		$step,eq

.Lctr32_tail:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	vld1.32		{q9},[$key_],#16
	b.gt		.Lctr32_tail

	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	 vld1.8		{$in0},[$inp],$step
	aese		$dat0,q12
	aesmc		$dat0,$dat0
	aese		$dat1,q12
	aesmc		$dat1,$dat1
	 vld1.8		{$in1},[$inp]
	aese		$dat0,q13
	aesmc		$dat0,$dat0
	aese		$dat1,q13
	aesmc		$dat1,$dat1
	 veor		$in0,$in0,$rndlast
	aese		$dat0,q14
	aesmc		$dat0,$dat0
	aese		$dat1,q14
	aesmc		$dat1,$dat1
	 veor		$in1,$in1,$rndlast
	aese		$dat0,q15
	aese		$dat1,q15

	cmp		$len,#1
	veor		$in0,$in0,$dat0
	veor		$in1,$in1,$dat1
	vst1.8		{$in0},[$out],#16
	b.eq		.Lctr32_done
	vst1.8		{$in1},[$out]

.Lctr32_done:
___
$code.=<<___	if ($flavour !~ /64/);
	vldmia		sp!,{d8-d15}
	ldmia		sp!,{r4-r10,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr		x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
$code.=<<___;
#endif
___
########################################
if ($flavour =~ /64/) {			######## 64-bit code
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);
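    # Fallback encoder for assemblers without crypto-extension support:
    # it packs the two register operands into a raw .inst word. The
    # substitution that would engage it is commented out below, since
    # .arch armv8-a+crypto is requested instead.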

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
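	# q8-q15 land on v16-v23 rather than v8-v15, whose low halves are
	# callee-saved under AAPCS64; skipping them spares the prologues
	# a save/restore sequence.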
	s/@\s/\/\//o;			# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {				######## 32-bit code
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # ARMv7 instructions are always encoded little-endian, so the
	    # raw bytes can be emitted directly; the correct solution is
	    # the .inst directive, but older assemblers don't implement it:-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };
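    # The shared code uses 64-bit-style q-register lane syntax that has
    # no direct 32-bit NEON equivalent; the helpers below re-express
    # vtbl, vdup.32 and vmov.32 in terms of d-register halves.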

    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }

    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }

    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)mov\./$1mov/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
}

close STDOUT or die "error closing STDOUT: $!";