aesv8-armx.S revision 305153
1/* $FreeBSD: stable/11/secure/lib/libcrypto/arm/aesv8-armx.S 305153 2016-08-31 20:33:59Z jkim $ */
2/* Do not modify. This file is auto-generated from aesv8-armx.pl. */
3#include "arm_arch.h"
4
5#if __ARM_MAX_ARCH__>=7
6.text
7.arch	armv7-a
8.fpu	neon
9.code	32
10.align	5
@ rcon: constant table used by the key schedule in aes_v8_set_encrypt_key,
@ which takes its address with adr and reads it as three 16-byte rows:
@   row 0 (0x01 per word)   - loaded into q1 as the starting round constant
@   row 1 (byte-index mask) - loaded into q2 and used as the vtbl index
@                             vector (bytes 13,14,15,12 in every word)
@   row 2 (0x1b per word)   - reloaded into q1 after the 8-pass AES-128 loop
11rcon:
12.long	0x01,0x01,0x01,0x01
13.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	@ rotate-n-splat
14.long	0x1b,0x1b,0x1b,0x1b
15
@ aes_v8_set_encrypt_key: expand a user key into an encryption key schedule.
@ In:   r0 = user key (rejected if zero)
@       r1 = key length in bits; must be 128, 192 or 256
@       r2 = output key schedule (rejected if zero)
@ Out:  r0 = 0 on success,
@            -1 if r0 or r2 is zero,
@            -2 if r1 is below 128, above 256 or not a multiple of 64
@ On success the round keys are stored from r2 upward and the round count
@ (10, 12 or 14) is stored as a word at byte offset 240 of the schedule.
@ The local entry .Lenc_key is also reached by bl from
@ aes_v8_set_decrypt_key, which relies on the state left at return:
@ r2 = schedule + 240 and r12 = round count.
@ Scratch: r1, r3, r12, q0-q3, q8-q10; r0 is advanced past the key bytes read.
@ Register roles: q0 = all-zero, q1 = round constant, q2 = vtbl mask,
@ q3 (and q8 for longer keys) = most recent round-key material.
16.globl	aes_v8_set_encrypt_key
17.type	aes_v8_set_encrypt_key,%function
18.align	5
19aes_v8_set_encrypt_key:
20.Lenc_key:
	@ r3 carries the pending return code through the argument checks
21	mov	r3,#-1
22	cmp	r0,#0
23	beq	.Lenc_key_abort
24	cmp	r2,#0
25	beq	.Lenc_key_abort
26	mov	r3,#-2
27	cmp	r1,#128
28	blt	.Lenc_key_abort
29	cmp	r1,#256
30	bgt	.Lenc_key_abort
31	tst	r1,#0x3f
32	bne	.Lenc_key_abort
33
34	adr	r3,rcon
	@ the flags from this compare are consumed by the blt/beq after the
	@ loads; the mov and NEON instructions in between do not set flags
35	cmp	r1,#192
36
37	veor	q0,q0,q0
38	vld1.8	{q3},[r0]!
39	mov	r1,#8		@ reuse r1
40	vld1.32	{q1,q2},[r3]!
41
42	blt	.Loop128
43	beq	.L192
44	b	.L256
45
	@ AES-128: 8 loop passes, each storing the current round key (q3) and
	@ deriving the next one. vtbl with the q2 mask rotates the last word
	@ of q3 and copies it to every word of q10; the three vext/veor pairs
	@ fold each word of q3 into the words above it.
46.align	4
47.Loop128:
48	vtbl.8	d20,{q3},d4
49	vtbl.8	d21,{q3},d5
50	vext.8	q9,q0,q3,#12
51	vst1.32	{q3},[r2]!
52	.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
53	subs	r1,r1,#1
54
55	veor	q3,q3,q9
56	vext.8	q9,q0,q9,#12
57	veor	q3,q3,q9
58	vext.8	q9,q0,q9,#12
59	 veor	q10,q10,q1
60	veor	q3,q3,q9
61	vshl.u8	q1,q1,#1
62	veor	q3,q3,q10
63	bne	.Loop128
64
	@ q1 has been shifted left 8 times; reload it from the third rcon
	@ row (0x1b) for the two remaining, unrolled key-derivation steps
65	vld1.32	{q1},[r3]
66
67	vtbl.8	d20,{q3},d4
68	vtbl.8	d21,{q3},d5
69	vext.8	q9,q0,q3,#12
70	vst1.32	{q3},[r2]!
71	.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
72
73	veor	q3,q3,q9
74	vext.8	q9,q0,q9,#12
75	veor	q3,q3,q9
76	vext.8	q9,q0,q9,#12
77	 veor	q10,q10,q1
78	veor	q3,q3,q9
79	vshl.u8	q1,q1,#1
80	veor	q3,q3,q10
81
82	vtbl.8	d20,{q3},d4
83	vtbl.8	d21,{q3},d5
84	vext.8	q9,q0,q3,#12
85	vst1.32	{q3},[r2]!
86	.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
87
88	veor	q3,q3,q9
89	vext.8	q9,q0,q9,#12
90	veor	q3,q3,q9
91	vext.8	q9,q0,q9,#12
92	 veor	q10,q10,q1
93	veor	q3,q3,q9
94	veor	q3,q3,q10
	@ last round key is stored without writeback (r2 = schedule + 160);
	@ adding 0x50 moves r2 to schedule + 240 for the round-count store
95	vst1.32	{q3},[r2]
96	add	r2,r2,#0x50
97
98	mov	r12,#10
99	b	.Ldone
100
	@ AES-192: read 8 more key bytes into d16 and lower every mask byte
	@ by 8 so the vtbl lookups index the last word of d16 instead.
	@ 8 loop passes, each storing 24 bytes of schedule.
101.align	4
102.L192:
103	vld1.8	{d16},[r0]!
104	vmov.i8	q10,#8			@ borrow q10
105	vst1.32	{q3},[r2]!
106	vsub.i8	q2,q2,q10	@ adjust the mask
107
108.Loop192:
109	vtbl.8	d20,{q8},d4
110	vtbl.8	d21,{q8},d5
111	vext.8	q9,q0,q3,#12
112	vst1.32	{d16},[r2]!
113	.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
114	subs	r1,r1,#1
115
116	veor	q3,q3,q9
117	vext.8	q9,q0,q9,#12
118	veor	q3,q3,q9
119	vext.8	q9,q0,q9,#12
120	veor	q3,q3,q9
121
122	vdup.32	q9,d7[1]
123	veor	q9,q9,q8
124	 veor	q10,q10,q1
125	vext.8	q8,q0,q8,#12
126	vshl.u8	q1,q1,#1
127	veor	q8,q8,q9
128	veor	q3,q3,q10
129	veor	q8,q8,q10
130	vst1.32	{q3},[r2]!
131	bne	.Loop192
132
	@ r2 is at schedule + 208 here; advance it to schedule + 240
133	mov	r12,#12
134	add	r2,r2,#0x20
135	b	.Ldone
136
	@ AES-256: second key half goes to q8; 7 loop passes. Each pass stores
	@ q8, derives and stores the next q3, and (except on the last pass,
	@ which leaves through beq .Ldone with r2 at schedule + 240) derives
	@ the next q8 from the splatted last word of q3 without a rotate or
	@ round constant.
137.align	4
138.L256:
139	vld1.8	{q8},[r0]
140	mov	r1,#7
141	mov	r12,#14
142	vst1.32	{q3},[r2]!
143
144.Loop256:
145	vtbl.8	d20,{q8},d4
146	vtbl.8	d21,{q8},d5
147	vext.8	q9,q0,q3,#12
148	vst1.32	{q8},[r2]!
149	.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
150	subs	r1,r1,#1
151
152	veor	q3,q3,q9
153	vext.8	q9,q0,q9,#12
154	veor	q3,q3,q9
155	vext.8	q9,q0,q9,#12
156	 veor	q10,q10,q1
157	veor	q3,q3,q9
158	vshl.u8	q1,q1,#1
159	veor	q3,q3,q10
160	vst1.32	{q3},[r2]!
161	beq	.Ldone
162
163	vdup.32	q10,d7[1]
164	vext.8	q9,q0,q8,#12
165	.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
166
167	veor	q8,q8,q9
168	vext.8	q9,q0,q9,#12
169	veor	q8,q8,q9
170	vext.8	q9,q0,q9,#12
171	veor	q8,q8,q9
172
173	veor	q8,q8,q10
174	b	.Loop256
175
	@ common exit: every path arrives with r2 = schedule + 240 and
	@ r12 = round count
176.Ldone:
177	str	r12,[r2]
178	mov	r3,#0
179
180.Lenc_key_abort:
181	mov	r0,r3			@ return value
182
183	bx	lr
184.size	aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key
185
@ aes_v8_set_decrypt_key: build a decryption key schedule.
@ In/Out: same registers and return codes as aes_v8_set_encrypt_key.
@ Runs the encryption key expansion through .Lenc_key, then reverses the
@ order of the round keys in place and applies aesimc to every round key
@ except the first and the last.
@ Depends on .Lenc_key returning r2 = schedule + 240 and r12 = round count.
@ Saves r4 and lr on the stack; r4 holds the -16 stride for the downward
@ pointer.
186.globl	aes_v8_set_decrypt_key
187.type	aes_v8_set_decrypt_key,%function
188.align	5
189aes_v8_set_decrypt_key:
190	stmdb	sp!,{r4,lr}
191	bl	.Lenc_key
192
	@ a non-zero result from the key expansion is returned unchanged
193	cmp	r0,#0
194	bne	.Ldec_key_abort
195
196	sub	r2,r2,#240		@ restore original r2
197	mov	r4,#-16
198	add	r0,r2,r12,lsl#4	@ end of key schedule
199
	@ swap the first and last round keys as they are (no aesimc);
	@ r2 then walks upward and r0 downward
200	vld1.32	{q0},[r2]
201	vld1.32	{q1},[r0]
202	vst1.32	{q0},[r0],r4
203	vst1.32	{q1},[r2]!
204
	@ swap the remaining pairs, applying aesimc to both members, until
	@ the two pointers meet
205.Loop_imc:
206	vld1.32	{q0},[r2]
207	vld1.32	{q1},[r0]
208	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
209	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
210	vst1.32	{q0},[r0],r4
211	vst1.32	{q1},[r2]!
212	cmp	r0,r2
213	bhi	.Loop_imc
214
	@ the middle round key is transformed in place
215	vld1.32	{q0},[r2]
216	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
217	vst1.32	{q0},[r0]
218
219	eor	r0,r0,r0		@ return value
220.Ldec_key_abort:
221	ldmia	sp!,{r4,pc}
222.size	aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key
@ aes_v8_encrypt: encrypt one 16-byte block.
@ In:   r0 = input block, r1 = output block, r2 = key schedule
@ The round count is read from byte offset 240 of the schedule.
@ Scratch: r2 (advanced through the schedule), r3 (round counter), q0-q2.
@ q0/q1 alternate as the current round key; q2 is the state.
223.globl	aes_v8_encrypt
224.type	aes_v8_encrypt,%function
225.align	5
226aes_v8_encrypt:
227	ldr	r3,[r2,#240]
228	vld1.32	{q0},[r2]!
229	vld1.8	{q2},[r0]
230	sub	r3,r3,#2
231	vld1.32	{q1},[r2]!
232
	@ two rounds per pass; the next round key is loaded while the
	@ current one is in use
233.Loop_enc:
234	.byte	0x00,0x43,0xb0,0xf3	@ aese q2,q0
235	.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
236	vld1.32	{q0},[r2]!
237	subs	r3,r3,#2
238	.byte	0x02,0x43,0xb0,0xf3	@ aese q2,q1
239	.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
240	vld1.32	{q1},[r2]!
241	bgt	.Loop_enc
242
	@ last two rounds: the final aese is not followed by aesmc, and the
	@ last round key is applied with a plain veor
243	.byte	0x00,0x43,0xb0,0xf3	@ aese q2,q0
244	.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
245	vld1.32	{q0},[r2]
246	.byte	0x02,0x43,0xb0,0xf3	@ aese q2,q1
247	veor	q2,q2,q0
248
249	vst1.8	{q2},[r1]
250	bx	lr
251.size	aes_v8_encrypt,.-aes_v8_encrypt
@ aes_v8_decrypt: decrypt one 16-byte block.
@ In:   r0 = input block, r1 = output block, r2 = key schedule
@ The round count is read from byte offset 240 of the schedule.
@ Same structure as aes_v8_encrypt with aesd/aesimc in place of aese/aesmc.
@ Scratch: r2 (advanced through the schedule), r3 (round counter), q0-q2.
252.globl	aes_v8_decrypt
253.type	aes_v8_decrypt,%function
254.align	5
255aes_v8_decrypt:
256	ldr	r3,[r2,#240]
257	vld1.32	{q0},[r2]!
258	vld1.8	{q2},[r0]
259	sub	r3,r3,#2
260	vld1.32	{q1},[r2]!
261
	@ two rounds per pass; q0/q1 alternate as the current round key
262.Loop_dec:
263	.byte	0x40,0x43,0xb0,0xf3	@ aesd q2,q0
264	.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
265	vld1.32	{q0},[r2]!
266	subs	r3,r3,#2
267	.byte	0x42,0x43,0xb0,0xf3	@ aesd q2,q1
268	.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
269	vld1.32	{q1},[r2]!
270	bgt	.Loop_dec
271
	@ last two rounds: the final aesd is not followed by aesimc, and the
	@ last round key is applied with a plain veor
272	.byte	0x40,0x43,0xb0,0xf3	@ aesd q2,q0
273	.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
274	vld1.32	{q0},[r2]
275	.byte	0x42,0x43,0xb0,0xf3	@ aesd q2,q1
276	veor	q2,q2,q0
277
278	vst1.8	{q2},[r1]
279	bx	lr
280.size	aes_v8_decrypt,.-aes_v8_decrypt
@ aes_v8_cbc_encrypt: CBC-mode encryption or decryption of 16-byte blocks.
@ In:   r0 = input, r1 = output, r2 = length in bytes, r3 = key schedule
@       first stack word  = pointer to the chaining value (IV), kept in r4
@       second stack word = mode selector, read into r5: zero takes the
@                           decrypt path, non-zero the encrypt path
@ A length below 16 returns immediately without writing anything. After
@ the first 16 bytes are subtracted the remaining count is masked to a
@ multiple of 16. On completion the new chaining value (q6) is stored
@ back through r4.
@ Saves and restores r4-r8, lr and d8-d15.
@ Register roles after setup:
@   q6      = chaining value
@   q8,q9   = rndkey[0], rndkey[1] (reused as a sliding window in places)
@   q10-q15 = the six round keys before the last one (the decrypt path
@             reuses q10/q11 for data)
@   q7      = last round key
@   r5      = rounds - 8, r6 = round-loop counter, r7 = schedule cursor
@   r8      = input stride, forced to 0 when no further block follows so
@             that the look-ahead load does not advance r0
281.globl	aes_v8_cbc_encrypt
282.type	aes_v8_cbc_encrypt,%function
283.align	5
284aes_v8_cbc_encrypt:
285	mov	ip,sp
286	stmdb	sp!,{r4-r8,lr}
287	vstmdb	sp!,{d8-d15}            @ ABI specification says so
288	ldmia	ip,{r4-r5}		@ load remaining args
289	subs	r2,r2,#16
290	mov	r8,#16
291	blo	.Lcbc_abort
292	moveq	r8,#0
293
	@ the flags from this compare are consumed by beq .Lcbc_dec below;
	@ none of the instructions in between set flags
294	cmp	r5,#0			@ en- or decrypting?
295	ldr	r5,[r3,#240]
296	and	r2,r2,#-16
297	vld1.8	{q6},[r4]
298	vld1.8	{q0},[r0],r8
299
300	vld1.32	{q8-q9},[r3]		@ load key schedule...
301	sub	r5,r5,#6
302	add	r7,r3,r5,lsl#4	@ pointer to last 7 round keys
303	sub	r5,r5,#2
304	vld1.32	{q10-q11},[r7]!
305	vld1.32	{q12-q13},[r7]!
306	vld1.32	{q14-q15},[r7]!
307	vld1.32	{q7},[r7]
308
309	add	r7,r3,#32
310	mov	r6,r5
311	beq	.Lcbc_dec
312
	@ ---- encrypt ----
	@ r5 = rounds - 8, so r5 == 2 selects the 10-round (AES-128) path.
	@ q5 = rndkey[0] ^ last round key: each following plaintext block is
	@ XORed with q5, so that the aese at the top of the loop, applied to
	@ the previous state before its final key XOR, performs both the CBC
	@ chaining XOR and the first AddRoundKey of the next block.
313	cmp	r5,#2
314	veor	q0,q0,q6
315	veor	q5,q8,q7
316	beq	.Lcbc_enc128
317
	@ 12/14-round path: q2,q3 = rndkey[2],rndkey[3]; r7, r6, r12, r14 and
	@ r3 are set to schedule + 16, + 64, + 80, + 96 and + 112 so the loop
	@ can reload the round keys that do not fit in registers
318	vld1.32	{q2-q3},[r7]
319	add	r7,r3,#16
320	add	r6,r3,#16*4
321	add	r12,r3,#16*5
322	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
323	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
324	add	r14,r3,#16*6
325	add	r3,r3,#16*7
326	b	.Lenter_cbc_enc
327
328.align	4
329.Loop_cbc_enc:
330	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
331	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
332	 vst1.8	{q6},[r1]!
333.Lenter_cbc_enc:
334	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
335	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
336	.byte	0x04,0x03,0xb0,0xf3	@ aese q0,q2
337	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
338	vld1.32	{q8},[r6]
	@ r5 == 4 means 12 rounds: skip the two extra rounds of the 14-round case
339	cmp	r5,#4
340	.byte	0x06,0x03,0xb0,0xf3	@ aese q0,q3
341	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
342	vld1.32	{q9},[r12]
343	beq	.Lcbc_enc192
344
345	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
346	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
347	vld1.32	{q8},[r14]
348	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
349	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
350	vld1.32	{q9},[r3]
351	nop
352
353.Lcbc_enc192:
354	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
355	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	@ flags from this subs drive both the moveq below and the bhs that
	@ closes the loop
356	 subs	r2,r2,#16
357	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
358	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
359	 moveq	r8,#0
360	.byte	0x24,0x03,0xb0,0xf3	@ aese q0,q10
361	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
362	.byte	0x26,0x03,0xb0,0xf3	@ aese q0,q11
363	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
	@ look-ahead load of the next plaintext block into q8
364	 vld1.8	{q8},[r0],r8
365	.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
366	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
367	 veor	q8,q8,q5
368	.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
369	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
370	 vld1.32 {q9},[r7]		@ re-pre-load rndkey[1]
371	.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
372	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
373	.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
	@ q6 = finished ciphertext block; q0 stays without the last key XOR
374	veor	q6,q0,q7
375	bhs	.Loop_cbc_enc
376
377	vst1.8	{q6},[r1]!
378	b	.Lcbc_done
379
	@ 10-round path: all round keys stay in registers
	@ (q8, q9, q2, q3, q10-q15, q7)
380.align	5
381.Lcbc_enc128:
382	vld1.32	{q2-q3},[r7]
383	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
384	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
385	b	.Lenter_cbc_enc128
386.Loop_cbc_enc128:
387	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
388	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
389	 vst1.8	{q6},[r1]!
390.Lenter_cbc_enc128:
391	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
392	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
393	 subs	r2,r2,#16
394	.byte	0x04,0x03,0xb0,0xf3	@ aese q0,q2
395	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
396	 moveq	r8,#0
397	.byte	0x06,0x03,0xb0,0xf3	@ aese q0,q3
398	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
399	.byte	0x24,0x03,0xb0,0xf3	@ aese q0,q10
400	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
401	.byte	0x26,0x03,0xb0,0xf3	@ aese q0,q11
402	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
403	 vld1.8	{q8},[r0],r8
404	.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
405	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
406	.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
407	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
408	.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
409	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
410	 veor	q8,q8,q5
411	.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
412	veor	q6,q0,q7
413	bhs	.Loop_cbc_enc128
414
415	vst1.8	{q6},[r1]!
416	b	.Lcbc_done
	@ ---- decrypt ----
	@ Three blocks (q0, q1, q10) are processed per pass. q2, q3 and q11
	@ hold copies of the ciphertext blocks, which are needed as chaining
	@ values after the state registers have been overwritten.
	@ r2 is biased by a further -32, so a negative result here means
	@ fewer than three blocks are available.
417.align	5
418.Lcbc_dec:
419	vld1.8	{q10},[r0]!
420	subs	r2,r2,#32		@ bias
421	add	r6,r5,#2
422	vorr	q3,q0,q0
423	vorr	q1,q0,q0
424	vorr	q11,q10,q10
425	blo	.Lcbc_dec_tail
426
427	vorr	q1,q10,q10
428	vld1.8	{q10},[r0]!
429	vorr	q2,q0,q0
430	vorr	q3,q1,q1
431	vorr	q11,q10,q10
432
	@ inner round loop: two rounds per pass over all three states, with
	@ q8/q9 reloaded from the schedule through r7
433.Loop3x_cbc_dec:
434	.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8
435	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
436	.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
437	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
438	.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
439	.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
440	vld1.32	{q8},[r7]!
441	subs	r6,r6,#2
442	.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9
443	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
444	.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
445	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
446	.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
447	.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
448	vld1.32	{q9},[r7]!
449	bgt	.Loop3x_cbc_dec
450
451	.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8
452	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
453	.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
454	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
455	.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
456	.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
	@ q4, q5, q9 = chaining value for each block XOR last round key, so
	@ a single veor at the end applies both. The flags from the subs are
	@ consumed by movlo here and by bhs at the bottom of the pass.
457	 veor	q4,q6,q7
458	 subs	r2,r2,#0x30
459	 veor	q5,q2,q7
460	 movlo	r6,r2			@ r6 is zero at this point
461	.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9
462	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
463	.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
464	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
465	.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
466	.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
467	 veor	q9,q3,q7
468	 add	r0,r0,r6		@ r0 is adjusted in such way that
469					@ at exit from the loop q1-q10
470					@ are loaded with last "words"
	@ q6 = last ciphertext block of this group = next chaining value
471	 vorr	q6,q11,q11
472	 mov	r7,r3
473	.byte	0x68,0x03,0xb0,0xf3	@ aesd q0,q12
474	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
475	.byte	0x68,0x23,0xb0,0xf3	@ aesd q1,q12
476	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
477	.byte	0x68,0x43,0xf0,0xf3	@ aesd q10,q12
478	.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
479	 vld1.8	{q2},[r0]!
480	.byte	0x6a,0x03,0xb0,0xf3	@ aesd q0,q13
481	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
482	.byte	0x6a,0x23,0xb0,0xf3	@ aesd q1,q13
483	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
484	.byte	0x6a,0x43,0xf0,0xf3	@ aesd q10,q13
485	.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
486	 vld1.8	{q3},[r0]!
487	.byte	0x6c,0x03,0xb0,0xf3	@ aesd q0,q14
488	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
489	.byte	0x6c,0x23,0xb0,0xf3	@ aesd q1,q14
490	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
491	.byte	0x6c,0x43,0xf0,0xf3	@ aesd q10,q14
492	.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
493	 vld1.8	{q11},[r0]!
494	.byte	0x6e,0x03,0xb0,0xf3	@ aesd q0,q15
495	.byte	0x6e,0x23,0xb0,0xf3	@ aesd q1,q15
496	.byte	0x6e,0x43,0xf0,0xf3	@ aesd q10,q15
497	 vld1.32 {q8},[r7]!	@ re-pre-load rndkey[0]
498	 add	r6,r5,#2
499	veor	q4,q4,q0
500	veor	q5,q5,q1
501	veor	q10,q10,q9
502	 vld1.32 {q9},[r7]!	@ re-pre-load rndkey[1]
503	vst1.8	{q4},[r1]!
504	 vorr	q0,q2,q2
505	vst1.8	{q5},[r1]!
506	 vorr	q1,q3,q3
507	vst1.8	{q10},[r1]!
508	 vorr	q10,q11,q11
509	bhs	.Loop3x_cbc_dec
510
	@ r2 == -0x30 means no blocks are left
511	cmn	r2,#0x30
512	beq	.Lcbc_done
513	nop
514
	@ tail: one or two blocks remain. Both q1 and q10 are always run
	@ through the rounds; only q10 is used when a single block remains.
515.Lcbc_dec_tail:
516	.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
517	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
518	.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
519	.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
520	vld1.32	{q8},[r7]!
521	subs	r6,r6,#2
522	.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
523	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
524	.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
525	.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
526	vld1.32	{q9},[r7]!
527	bgt	.Lcbc_dec_tail
528
529	.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
530	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
531	.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
532	.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
533	.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
534	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
535	.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
536	.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
537	.byte	0x68,0x23,0xb0,0xf3	@ aesd q1,q12
538	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
539	.byte	0x68,0x43,0xf0,0xf3	@ aesd q10,q12
540	.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
	@ r2 == -0x20 means exactly one block remains; the flags are consumed
	@ by beq .Lcbc_dec_one after the last round
541	 cmn	r2,#0x20
542	.byte	0x6a,0x23,0xb0,0xf3	@ aesd q1,q13
543	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
544	.byte	0x6a,0x43,0xf0,0xf3	@ aesd q10,q13
545	.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
546	 veor	q5,q6,q7
547	.byte	0x6c,0x23,0xb0,0xf3	@ aesd q1,q14
548	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
549	.byte	0x6c,0x43,0xf0,0xf3	@ aesd q10,q14
550	.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
551	 veor	q9,q3,q7
552	.byte	0x6e,0x23,0xb0,0xf3	@ aesd q1,q15
553	.byte	0x6e,0x43,0xf0,0xf3	@ aesd q10,q15
554	beq	.Lcbc_dec_one
	@ two blocks: q1 chains from q6, q10 chains from the ciphertext in q3
555	veor	q5,q5,q1
556	veor	q9,q9,q10
557	 vorr	q6,q11,q11
558	vst1.8	{q5},[r1]!
559	vst1.8	{q9},[r1]!
560	b	.Lcbc_done
561
562.Lcbc_dec_one:
563	veor	q5,q5,q10
564	 vorr	q6,q11,q11
565	vst1.8	{q5},[r1]!
566
	@ write the final chaining value back through the IV pointer
567.Lcbc_done:
568	vst1.8	{q6},[r4]
569.Lcbc_abort:
570	vldmia	sp!,{d8-d15}
571	ldmia	sp!,{r4-r8,pc}
572.size	aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt
@ aes_v8_ctr32_encrypt_blocks: CTR-mode processing with a 32-bit counter.
@ In:   r0 = input, r1 = output, r2 = number of 16-byte blocks,
@       r3 = key schedule,
@       first stack word = pointer to the 16-byte counter block, kept in r4
@ The counter is the word at byte offset 12 of the counter block. It is
@ byte-reversed on little-endian builds, incremented once per block in r8,
@ and written byte-reversed into the top lane of each keystream input.
@ Nothing is stored back through r4, so the counter block in memory is
@ left unchanged.
@ Saves and restores r4-r10, lr and d8-d15.
@ Register roles after setup:
@   q6      = template copy of the counter block
@   q0,q1,q10 = counter blocks in flight (three per main-loop pass)
@   q8,q9   = sliding window over the leading round keys (via r7)
@   q12-q15 = the four round keys before the last one, q7 = last round key
@   r5      = rounds - 6, r6 = round-loop counter
@ NOTE(review): a block count of 0 appears to fall into .Lctr32_tail and
@ store two blocks; callers presumably never pass 0 - confirm.
573.globl	aes_v8_ctr32_encrypt_blocks
574.type	aes_v8_ctr32_encrypt_blocks,%function
575.align	5
576aes_v8_ctr32_encrypt_blocks:
577	mov		ip,sp
578	stmdb		sp!,{r4-r10,lr}
579	vstmdb		sp!,{d8-d15}            @ ABI specification says so
580	ldr		r4, [ip]		@ load remaining arg
581	ldr		r5,[r3,#240]
582
583	ldr		r8, [r4, #12]
584	vld1.32		{q0},[r4]
585
586	vld1.32		{q8-q9},[r3]		@ load key schedule...
587	sub		r5,r5,#4
588	mov		r12,#16
	@ the flags from this compare are consumed by movlo and by
	@ bls .Lctr32_tail below; nothing in between sets flags
589	cmp		r2,#2
590	add		r7,r3,r5,lsl#4	@ pointer to last 5 round keys
591	sub		r5,r5,#2
592	vld1.32		{q12-q13},[r7]!
593	vld1.32		{q14-q15},[r7]!
594	vld1.32		{q7},[r7]
595	add		r7,r3,#32
596	mov		r6,r5
	@ r12 is the input stride used by the tail; 0 when fewer than two
	@ blocks are requested so the second tail load rereads the same block
597	movlo	r12,#0
598#ifndef __ARMEB__
599	rev		r8, r8
600#endif
	@ q1 gets counter + 1; r8 is left at counter + 2
601	vorr		q1,q0,q0
602	add		r10, r8, #1
603	vorr		q10,q0,q0
604	add		r8, r8, #2
605	vorr		q6,q0,q0
606	rev		r10, r10
607	vmov.32	d3[1],r10
608	bls		.Lctr32_tail
	@ more than two blocks: q10 gets counter + 2 and r2 is biased by -3
609	rev		r12, r8
610	sub		r2,r2,#3		@ bias
611	vmov.32	d21[1],r12
612	b		.Loop3x_ctr32
613
	@ main loop: three counter blocks are encrypted per pass, two rounds
	@ per inner iteration
614.align	4
615.Loop3x_ctr32:
616	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
617	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
618	.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
619	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
620	.byte	0x20,0x43,0xf0,0xf3	@ aese q10,q8
621	.byte	0xa4,0x43,0xf0,0xf3	@ aesmc q10,q10
622	vld1.32		{q8},[r7]!
623	subs		r6,r6,#2
624	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
625	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
626	.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
627	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
628	.byte	0x22,0x43,0xf0,0xf3	@ aese q10,q9
629	.byte	0xa4,0x43,0xf0,0xf3	@ aesmc q10,q10
630	vld1.32		{q9},[r7]!
631	bgt		.Loop3x_ctr32
632
	@ The states move to q4, q5 and q9 for the remaining rounds while
	@ q0, q1 and q10 are rebuilt from the q6 template with the next three
	@ counter values. The input blocks (q2, q3, q11) are XORed with the
	@ last round key q7 in advance, so one veor per block finishes both
	@ the cipher and the CTR XOR.
633	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
634	.byte	0x80,0x83,0xb0,0xf3	@ aesmc q4,q0
635	.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
636	.byte	0x82,0xa3,0xb0,0xf3	@ aesmc q5,q1
637	 vld1.8		{q2},[r0]!
638	 vorr		q0,q6,q6
639	.byte	0x20,0x43,0xf0,0xf3	@ aese q10,q8
640	.byte	0xa4,0x43,0xf0,0xf3	@ aesmc q10,q10
641	 vld1.8		{q3},[r0]!
642	 vorr		q1,q6,q6
643	.byte	0x22,0x83,0xb0,0xf3	@ aese q4,q9
644	.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
645	.byte	0x22,0xa3,0xb0,0xf3	@ aese q5,q9
646	.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
647	 vld1.8		{q11},[r0]!
648	 mov		r7,r3
649	.byte	0x22,0x43,0xf0,0xf3	@ aese q10,q9
650	.byte	0xa4,0x23,0xf0,0xf3	@ aesmc q9,q10
651	 vorr		q10,q6,q6
652	 add		r9,r8,#1
653	.byte	0x28,0x83,0xb0,0xf3	@ aese q4,q12
654	.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
655	.byte	0x28,0xa3,0xb0,0xf3	@ aese q5,q12
656	.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
657	 veor		q2,q2,q7
658	 add		r10,r8,#2
659	.byte	0x28,0x23,0xf0,0xf3	@ aese q9,q12
660	.byte	0xa2,0x23,0xf0,0xf3	@ aesmc q9,q9
661	 veor		q3,q3,q7
662	 add		r8,r8,#3
663	.byte	0x2a,0x83,0xb0,0xf3	@ aese q4,q13
664	.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
665	.byte	0x2a,0xa3,0xb0,0xf3	@ aese q5,q13
666	.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
667	 veor		q11,q11,q7
668	 rev		r9,r9
669	.byte	0x2a,0x23,0xf0,0xf3	@ aese q9,q13
670	.byte	0xa2,0x23,0xf0,0xf3	@ aesmc q9,q9
671	 vmov.32	d1[1], r9
672	 rev		r10,r10
673	.byte	0x2c,0x83,0xb0,0xf3	@ aese q4,q14
674	.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
675	.byte	0x2c,0xa3,0xb0,0xf3	@ aese q5,q14
676	.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
677	 vmov.32	d3[1], r10
678	 rev		r12,r8
679	.byte	0x2c,0x23,0xf0,0xf3	@ aese q9,q14
680	.byte	0xa2,0x23,0xf0,0xf3	@ aesmc q9,q9
681	 vmov.32	d21[1], r12
	@ flags from this subs are consumed by bhs at the bottom of the pass
682	 subs		r2,r2,#3
683	.byte	0x2e,0x83,0xb0,0xf3	@ aese q4,q15
684	.byte	0x2e,0xa3,0xb0,0xf3	@ aese q5,q15
685	.byte	0x2e,0x23,0xf0,0xf3	@ aese q9,q15
686
687	veor		q2,q2,q4
688	 vld1.32	 {q8},[r7]!	@ re-pre-load rndkey[0]
689	vst1.8		{q2},[r1]!
690	veor		q3,q3,q5
691	 mov		r6,r5
692	vst1.8		{q3},[r1]!
693	veor		q11,q11,q9
694	 vld1.32	 {q9},[r7]!	@ re-pre-load rndkey[1]
695	vst1.8		{q11},[r1]!
696	bhs		.Loop3x_ctr32
697
	@ undo the bias: r2 = blocks still to do (0, 1 or 2). With one block
	@ left the tail input stride r12 is 0.
698	adds		r2,r2,#3
699	beq		.Lctr32_done
700	cmp		r2,#1
701	mov		r12,#16
702	moveq	r12,#0
703
	@ tail: q0 and q1 are both encrypted; the second result is stored
	@ only when two blocks remain
704.Lctr32_tail:
705	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
706	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
707	.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
708	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
709	vld1.32		{q8},[r7]!
710	subs		r6,r6,#2
711	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
712	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
713	.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
714	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
715	vld1.32		{q9},[r7]!
716	bgt		.Lctr32_tail
717
718	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
719	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
720	.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
721	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
722	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
723	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
724	.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
725	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
726	 vld1.8		{q2},[r0],r12
727	.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
728	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
729	.byte	0x28,0x23,0xb0,0xf3	@ aese q1,q12
730	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
731	 vld1.8		{q3},[r0]
732	.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
733	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
734	.byte	0x2a,0x23,0xb0,0xf3	@ aese q1,q13
735	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
736	 veor		q2,q2,q7
737	.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
738	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
739	.byte	0x2c,0x23,0xb0,0xf3	@ aese q1,q14
740	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
741	 veor		q3,q3,q7
742	.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
743	.byte	0x2e,0x23,0xb0,0xf3	@ aese q1,q15
744
745	cmp		r2,#1
746	veor		q2,q2,q0
747	veor		q3,q3,q1
748	vst1.8		{q2},[r1]!
749	beq		.Lctr32_done
750	vst1.8		{q3},[r1]
751
752.Lctr32_done:
753	vldmia		sp!,{d8-d15}
754	ldmia		sp!,{r4-r10,pc}
755.size	aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
756#endif
757