1#include "arm_arch.h"
2
3#if __ARM_MAX_ARCH__>=7
4.text
5.arch	armv7-a
6.fpu	neon
7.code	32
8.align	5
@ Constant table for the key schedule.  aes_v8_set_encrypt_key takes its
@ address with "adr r3,rcon", loads the first two rows into q1 and q2, and
@ (128-bit path only) loads the third row into q1 after its main loop.
9rcon:
@ Row 0: initial value of q1; q1 is XORed into each derived key and then
@ doubled per byte lane with vshl.u8.
10.long	0x01,0x01,0x01,0x01
@ Row 1: byte-index mask held in q2 and used as the vtbl selector.
11.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	@ rotate-n-splat
@ Row 2: replacement value for q1 used by the last two 128-bit rounds.
12.long	0x1b,0x1b,0x1b,0x1b
13
@ aes_v8_set_encrypt_key: expand a user key into a round-key schedule.
@
@ In:   r0 = pointer to the key bytes (16, 24 or 32 bytes are read)
@       r1 = key length in bits; only 128, 192 and 256 pass the checks below
@       r2 = pointer to the output schedule
@ Out:  r0 =  0 on success
@            -1 if r0 or r2 is zero
@            -2 if r1 is below 128, above 256, or not a multiple of 64
@ On success the round keys are written from [r2] upward and the word in r12
@ (10, 12 or 14) is written at offset 240 from the original r2; the single
@ block encrypt/decrypt routines in this file read that word as their round
@ counter.
@ Scratch: r1, r3, r12, q0-q3, q8-q10; r0 and r2 are advanced.
@ aes_v8_set_decrypt_key enters at .Lenc_key and relies on r2 (original + 240)
@ and r12 exactly as they are left at .Ldone.
14.globl	aes_v8_set_encrypt_key
15.type	aes_v8_set_encrypt_key,%function
16.align	5
17aes_v8_set_encrypt_key:
18.Lenc_key:
@ Argument checks; r3 carries the return code to .Lenc_key_abort.
19	mov	r3,#-1
20	cmp	r0,#0
21	beq	.Lenc_key_abort
22	cmp	r2,#0
23	beq	.Lenc_key_abort
24	mov	r3,#-2
25	cmp	r1,#128
26	blt	.Lenc_key_abort
27	cmp	r1,#256
28	bgt	.Lenc_key_abort
29	tst	r1,#0x3f
30	bne	.Lenc_key_abort
31
@ The flags from this cmp stay live until the blt/beq below: none of the
@ instructions in between update the condition flags.
32	adr	r3,rcon
33	cmp	r1,#192
34
@ q0 = 0 (zero operand for aese and zero fill for vext), q3 = first 16 key
@ bytes, q1 = rcon row 0, q2 = vtbl mask; r3 is left pointing at rcon row 2.
35	veor	q0,q0,q0
36	vld1.8	{q3},[r0]!
37	mov	r1,#8		@ reuse r1
38	vld1.32	{q1,q2},[r3]!
39
40	blt	.Loop128
41	beq	.L192
42	b	.L256
43
44.align	4
@ 128-bit key: eight iterations, each storing the current round key (q3) and
@ deriving the next one.
45.Loop128:
@ q10 = last word of q3, byte-rotated by the q2 mask and copied to all lanes.
46	vtbl.8	d20,{q3},d4
47	vtbl.8	d21,{q3},d5
@ q9 = q3 moved up by one 32-bit word, with zero bytes from q0 shifted in.
48	vext.8	q9,q0,q3,#12
49	vst1.32	{q3},[r2]!
@ aese with the all-zero q0 leaves only the byte substitution visible here:
@ the row shift has no effect when all four words are identical.
50	.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
51	subs	r1,r1,#1
52
@ Three shift/XOR steps turn q3 into the running XOR of its words; the NEON
@ instructions do not touch the flags set by subs above.
53	veor	q3,q3,q9
54	vext.8	q9,q0,q9,#12
55	veor	q3,q3,q9
56	vext.8	q9,q0,q9,#12
57	 veor	q10,q10,q1
58	veor	q3,q3,q9
@ Double the round constant for the next iteration.
59	vshl.u8	q1,q1,#1
60	veor	q3,q3,q10
61	bne	.Loop128
62
@ Switch the round constant to rcon row 2 (0x1b) for the remaining rounds.
63	vld1.32	{q1},[r3]
64
@ Ninth derivation, same sequence as the loop body.
65	vtbl.8	d20,{q3},d4
66	vtbl.8	d21,{q3},d5
67	vext.8	q9,q0,q3,#12
68	vst1.32	{q3},[r2]!
69	.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
70
71	veor	q3,q3,q9
72	vext.8	q9,q0,q9,#12
73	veor	q3,q3,q9
74	vext.8	q9,q0,q9,#12
75	 veor	q10,q10,q1
76	veor	q3,q3,q9
77	vshl.u8	q1,q1,#1
78	veor	q3,q3,q10
79
@ Tenth derivation; the round constant is not advanced afterwards.
80	vtbl.8	d20,{q3},d4
81	vtbl.8	d21,{q3},d5
82	vext.8	q9,q0,q3,#12
83	vst1.32	{q3},[r2]!
84	.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
85
86	veor	q3,q3,q9
87	vext.8	q9,q0,q9,#12
88	veor	q3,q3,q9
89	vext.8	q9,q0,q9,#12
90	 veor	q10,q10,q1
91	veor	q3,q3,q9
92	veor	q3,q3,q10
@ Final key is stored without write-back; r2 has advanced by 10*16 = 160
@ bytes so far, so adding 0x50 leaves r2 = original r2 + 240.
93	vst1.32	{q3},[r2]
94	add	r2,r2,#0x50
95
96	mov	r12,#10
97	b	.Ldone
98
99.align	4
@ 192-bit key: d16 receives key bytes 16..23.  Subtracting 8 from every mask
@ byte turns the selector 0x0c0f0e0d into 0x04070605, i.e. it now picks the
@ upper word of d16 instead of the last word of a 16-byte register.
100.L192:
101	vld1.8	{d16},[r0]!
102	vmov.i8	q10,#8			@ borrow q10
103	vst1.32	{q3},[r2]!
104	vsub.i8	q2,q2,q10	@ adjust the mask
105
@ Eight iterations; each stores 8 bytes (d16) plus 16 bytes (q3).
106.Loop192:
107	vtbl.8	d20,{q8},d4
108	vtbl.8	d21,{q8},d5
109	vext.8	q9,q0,q3,#12
110	vst1.32	{d16},[r2]!
111	.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
112	subs	r1,r1,#1
113
114	veor	q3,q3,q9
115	vext.8	q9,q0,q9,#12
116	veor	q3,q3,q9
117	vext.8	q9,q0,q9,#12
118	veor	q3,q3,q9
119
@ Derive the next q8 from the last word of q3 (before q10 is folded in),
@ the old q8 and q10, then finish q3.
120	vdup.32	q9,d7[1]
121	veor	q9,q9,q8
122	 veor	q10,q10,q1
123	vext.8	q8,q0,q8,#12
124	vshl.u8	q1,q1,#1
125	veor	q8,q8,q9
126	veor	q3,q3,q10
127	veor	q8,q8,q10
128	vst1.32	{q3},[r2]!
129	bne	.Loop192
130
@ 16 + 8*24 = 208 bytes were written; adding 0x20 leaves r2 = original + 240.
131	mov	r12,#12
132	add	r2,r2,#0x20
133	b	.Ldone
134
135.align	4
@ 256-bit key: q8 receives key bytes 16..31; r1 = 7 loop iterations.
136.L256:
137	vld1.8	{q8},[r0]
138	mov	r1,#7
139	mov	r12,#14
140	vst1.32	{q3},[r2]!
141
@ Each iteration stores q8 and the newly derived q3 (32 bytes).  The loop
@ exits from its middle once r1 reaches zero, with r2 = original + 240
@ (16 + 7*32 bytes written).
142.Loop256:
143	vtbl.8	d20,{q8},d4
144	vtbl.8	d21,{q8},d5
145	vext.8	q9,q0,q3,#12
146	vst1.32	{q8},[r2]!
147	.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
148	subs	r1,r1,#1
149
150	veor	q3,q3,q9
151	vext.8	q9,q0,q9,#12
152	veor	q3,q3,q9
153	vext.8	q9,q0,q9,#12
154	 veor	q10,q10,q1
155	veor	q3,q3,q9
156	vshl.u8	q1,q1,#1
157	veor	q3,q3,q10
158	vst1.32	{q3},[r2]!
159	beq	.Ldone
160
@ Second half-step: the next q8 is derived from the last word of q3 copied
@ to all lanes and passed through aese; no vtbl rotation and no round
@ constant are applied here.
161	vdup.32	q10,d7[1]
162	vext.8	q9,q0,q8,#12
163	.byte	0x00,0x43,0xf0,0xf3	@ aese q10,q0
164
165	veor	q8,q8,q9
166	vext.8	q9,q0,q9,#12
167	veor	q8,q8,q9
168	vext.8	q9,q0,q9,#12
169	veor	q8,q8,q9
170
171	veor	q8,q8,q10
172	b	.Loop256
173
@ Common success exit: r2 = original r2 + 240 on every path; store the
@ round count there and report success.
174.Ldone:
175	str	r12,[r2]
176	mov	r3,#0
177
178.Lenc_key_abort:
179	mov	r0,r3			@ return value
180
181	bx	lr
182.size	aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key
183
@ aes_v8_set_decrypt_key: build a decryption key schedule.
@
@ Takes the same r0/r1/r2 arguments as aes_v8_set_encrypt_key (they are
@ passed straight through to .Lenc_key).  After the encryption schedule has
@ been built, the round keys are reversed in place and every key except the
@ first and the last is passed through aesimc.
@ Out:  r0 = 0 on success, otherwise the non-zero code returned by .Lenc_key.
@ r4 and lr are saved on the stack; q0 and q1 are used as scratch.
184.globl	aes_v8_set_decrypt_key
185.type	aes_v8_set_decrypt_key,%function
186.align	5
187aes_v8_set_decrypt_key:
188	stmdb	sp!,{r4,lr}
189	bl	.Lenc_key
190
191	cmp	r0,#0
192	bne	.Ldec_key_abort
193
@ .Lenc_key leaves r2 = schedule + 240 and r12 = round count.
194	sub	r2,r2,#240		@ restore original r2
@ r4 = -16 is the post-index stride that walks r0 downward.
195	mov	r4,#-16
196	add	r0,r2,r12,lsl#4	@ end of key schedule
197
@ Swap the first and last round keys unchanged (no aesimc).
198	vld1.32	{q0},[r2]
199	vld1.32	{q1},[r0]
200	vst1.32	{q0},[r0],r4
201	vst1.32	{q1},[r2]!
202
@ Swap the remaining pairs from both ends, applying aesimc to each key,
@ while the upper pointer is still above the lower one (unsigned compare).
203.Loop_imc:
204	vld1.32	{q0},[r2]
205	vld1.32	{q1},[r0]
206	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
207	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
208	vst1.32	{q0},[r0],r4
209	vst1.32	{q1},[r2]!
210	cmp	r0,r2
211	bhi	.Loop_imc
212
@ The pointers meet on the middle round key (r0 == r2 for the round counts
@ 10, 12 and 14 stored by .Lenc_key); transform it in place.
213	vld1.32	{q0},[r2]
214	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
215	vst1.32	{q0},[r0]
216
217	eor	r0,r0,r0		@ return value
218.Ldec_key_abort:
219	ldmia	sp!,{r4,pc}
220.size	aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key
@ aes_v8_encrypt: encrypt a single 16-byte block.
@
@ In:   r0 = pointer to the input block (16 bytes are read)
@       r1 = pointer to the output block (16 bytes are written)
@       r2 = pointer to the key schedule; the round count is read from
@            offset 240
@ No value is placed in r0 for the caller.
@ Scratch: r2 (advanced through the schedule), r3, q0-q2.
221.globl	aes_v8_encrypt
222.type	aes_v8_encrypt,%function
223.align	5
224aes_v8_encrypt:
225	ldr	r3,[r2,#240]
226	vld1.32	{q0},[r2]!
227	vld1.8	{q2},[r0]
@ r3 = rounds - 2: the loop consumes two round keys per iteration and the
@ last two rounds are handled after it.
228	sub	r3,r3,#2
229	vld1.32	{q1},[r2]!
230
@ q0/q1 alternate as the current round key while the next one is loaded.
231.Loop_enc:
232	.byte	0x00,0x43,0xb0,0xf3	@ aese q2,q0
233	.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
234	vld1.32	{q0},[r2]!
235	subs	r3,r3,#2
236	.byte	0x02,0x43,0xb0,0xf3	@ aese q2,q1
237	.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
238	vld1.32	{q1},[r2]!
239	bgt	.Loop_enc
240
@ Last two rounds: the final aese is not followed by aesmc, and the last
@ round key (loaded into q0) is applied with a plain XOR.
241	.byte	0x00,0x43,0xb0,0xf3	@ aese q2,q0
242	.byte	0x84,0x43,0xb0,0xf3	@ aesmc q2,q2
243	vld1.32	{q0},[r2]
244	.byte	0x02,0x43,0xb0,0xf3	@ aese q2,q1
245	veor	q2,q2,q0
246
247	vst1.8	{q2},[r1]
248	bx	lr
249.size	aes_v8_encrypt,.-aes_v8_encrypt
@ aes_v8_decrypt: decrypt a single 16-byte block.
@
@ In:   r0 = pointer to the input block (16 bytes are read)
@       r1 = pointer to the output block (16 bytes are written)
@       r2 = pointer to the key schedule; the round count is read from
@            offset 240 (presumably a schedule produced by
@            aes_v8_set_decrypt_key -- confirm with callers)
@ No value is placed in r0 for the caller.
@ Scratch: r2 (advanced through the schedule), r3, q0-q2.
250.globl	aes_v8_decrypt
251.type	aes_v8_decrypt,%function
252.align	5
253aes_v8_decrypt:
254	ldr	r3,[r2,#240]
255	vld1.32	{q0},[r2]!
256	vld1.8	{q2},[r0]
@ r3 = rounds - 2: the loop consumes two round keys per iteration and the
@ last two rounds are handled after it.
257	sub	r3,r3,#2
258	vld1.32	{q1},[r2]!
259
@ q0/q1 alternate as the current round key while the next one is loaded.
260.Loop_dec:
261	.byte	0x40,0x43,0xb0,0xf3	@ aesd q2,q0
262	.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
263	vld1.32	{q0},[r2]!
264	subs	r3,r3,#2
265	.byte	0x42,0x43,0xb0,0xf3	@ aesd q2,q1
266	.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
267	vld1.32	{q1},[r2]!
268	bgt	.Loop_dec
269
@ Last two rounds: the final aesd is not followed by aesimc, and the last
@ round key (loaded into q0) is applied with a plain XOR.
270	.byte	0x40,0x43,0xb0,0xf3	@ aesd q2,q0
271	.byte	0xc4,0x43,0xb0,0xf3	@ aesimc q2,q2
272	vld1.32	{q0},[r2]
273	.byte	0x42,0x43,0xb0,0xf3	@ aesd q2,q1
274	veor	q2,q2,q0
275
276	vst1.8	{q2},[r1]
277	bx	lr
278.size	aes_v8_decrypt,.-aes_v8_decrypt
@ aes_v8_cbc_encrypt: CBC-mode encryption or decryption of whole 16-byte
@ blocks.
@
@ In:   r0 = input pointer, r1 = output pointer
@       r2 = length in bytes; it is rounded down to a multiple of 16, and
@            nothing is done (not even the IV write-back) when it is below 16
@       r3 = key schedule pointer, round count at offset 240 (presumably a
@            schedule from aes_v8_set_decrypt_key when decrypting -- confirm
@            with callers)
@       [sp]   = pointer to the 16-byte IV (loaded into r4); the chaining
@                value is written back there at .Lcbc_done
@       [sp+4] = direction flag (loaded into r5): zero selects the decrypt
@                path, non-zero the encrypt path
@ r4-r8, lr and d8-d15 are saved on entry and restored on exit.
279.globl	aes_v8_cbc_encrypt
280.type	aes_v8_cbc_encrypt,%function
281.align	5
282aes_v8_cbc_encrypt:
@ ip keeps the entry sp so the stack arguments can be read after the pushes.
283	mov	ip,sp
284	stmdb	sp!,{r4-r8,lr}
285	vstmdb	sp!,{d8-d15}            @ ABI specification says so
286	ldmia	ip,{r4-r5}		@ load remaining args
@ r2 = length - 16.  r8 is the input post-index stride: 16 normally, 0 when
@ exactly one block is present so the pointer does not move past it.
287	subs	r2,r2,#16
288	mov	r8,#16
289	blo	.Lcbc_abort
290	moveq	r8,#0
291
@ The flags from this cmp are consumed by "beq .Lcbc_dec" below; nothing in
@ between updates them.  r5 is then reused for the round count.
292	cmp	r5,#0			@ en- or decrypting?
293	ldr	r5,[r3,#240]
294	and	r2,r2,#-16
@ q6 = IV / chaining value, q0 = first input block.
295	vld1.8	{q6},[r4]
296	vld1.8	{q0},[r0],r8
297
298	vld1.32	{q8-q9},[r3]		@ load key schedule...
299	sub	r5,r5,#6
300	add	r7,r3,r5,lsl#4	@ pointer to last 7 round keys
@ r5 = rounds - 8 from here on (2, 4 or 6 for round counts 10, 12 or 14).
301	sub	r5,r5,#2
@ q10-q15 and q7 = the last seven round keys, q7 being the final one.
302	vld1.32	{q10-q11},[r7]!
303	vld1.32	{q12-q13},[r7]!
304	vld1.32	{q14-q15},[r7]!
305	vld1.32	{q7},[r7]
306
@ r7 = address of round key 2.
307	add	r7,r3,#32
308	mov	r6,r5
309	beq	.Lcbc_dec
310
@ ---- Encrypt path ----
@ q0 = first block ^ IV.  q5 = rk[0] ^ rk[last]: XORing the next input block
@ with q5 and feeding the result to aese as the "key" applies the pending
@ last-round key, the CBC chaining XOR and round key 0 in one step.
311	cmp	r5,#2
312	veor	q0,q0,q6
313	veor	q5,q8,q7
314	beq	.Lcbc_enc128
315
@ Round counts above 10: q2,q3 = rk[2],rk[3]; r7, r6, r12, r14 and r3 are
@ turned into pointers to rk[1], rk[4], rk[5], rk[6] and rk[7].  lr (r14)
@ is free to use because it was saved in the prologue.
316	vld1.32	{q2-q3},[r7]
317	add	r7,r3,#16
318	add	r6,r3,#16*4
319	add	r12,r3,#16*5
320	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
321	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
322	add	r14,r3,#16*6
323	add	r3,r3,#16*7
324	b	.Lenter_cbc_enc
325
326.align	4
@ Loop entry for every block after the first: q8 = next input ^ q5 and q0 is
@ the previous block's state before its final key XOR; q6 (the previous
@ output block) is stored here.
327.Loop_cbc_enc:
328	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
329	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
330	 vst1.8	{q6},[r1]!
331.Lenter_cbc_enc:
332	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
333	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
334	.byte	0x04,0x03,0xb0,0xf3	@ aese q0,q2
335	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
@ q8/q9 are recycled for rk[4] and rk[5].
336	vld1.32	{q8},[r6]
@ r5 == 4 (12 rounds) skips the two extra rounds needed only for 14 rounds.
337	cmp	r5,#4
338	.byte	0x06,0x03,0xb0,0xf3	@ aese q0,q3
339	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
340	vld1.32	{q9},[r12]
341	beq	.Lcbc_enc192
342
343	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
344	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
345	vld1.32	{q8},[r14]
346	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
347	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
348	vld1.32	{q9},[r3]
349	nop
350
351.Lcbc_enc192:
352	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
353	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
@ The flags from this subs decide "bhs .Loop_cbc_enc" at the bottom; only
@ non-flag-setting instructions follow.
354	 subs	r2,r2,#16
355	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
356	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
@ Once the final block is the one being looked ahead to, stop advancing r0
@ so later look-ahead loads stay inside the input.
357	 moveq	r8,#0
358	.byte	0x24,0x03,0xb0,0xf3	@ aese q0,q10
359	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
360	.byte	0x26,0x03,0xb0,0xf3	@ aese q0,q11
361	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
@ Look-ahead load of the next input block, pre-mixed with q5.
362	 vld1.8	{q8},[r0],r8
363	.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
364	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
365	 veor	q8,q8,q5
366	.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
367	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
368	 vld1.32 {q9},[r7]		@ re-pre-load rndkey[1]
369	.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
370	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
371	.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
@ q6 = finished output block (state ^ last round key); q0 itself is left
@ without that XOR for the next iteration.
372	veor	q6,q0,q7
373	bhs	.Loop_cbc_enc
374
375	vst1.8	{q6},[r1]!
376	b	.Lcbc_done
377
378.align	5
@ 10-round variant: all round keys stay resident (q8 = rk[0] for the first
@ block only, q9 = rk[1], q2,q3 = rk[2],rk[3], q10-q15 = rk[4..9],
@ q7 = rk[10]), so no key reloads are needed inside the loop.
379.Lcbc_enc128:
380	vld1.32	{q2-q3},[r7]
381	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
382	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
383	b	.Lenter_cbc_enc128
384.Loop_cbc_enc128:
385	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
386	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
387	 vst1.8	{q6},[r1]!
388.Lenter_cbc_enc128:
389	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
390	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
@ Flags from this subs are consumed by "bhs .Loop_cbc_enc128" below.
391	 subs	r2,r2,#16
392	.byte	0x04,0x03,0xb0,0xf3	@ aese q0,q2
393	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
394	 moveq	r8,#0
395	.byte	0x06,0x03,0xb0,0xf3	@ aese q0,q3
396	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
397	.byte	0x24,0x03,0xb0,0xf3	@ aese q0,q10
398	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
399	.byte	0x26,0x03,0xb0,0xf3	@ aese q0,q11
400	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
@ Look-ahead load of the next input block; mixed with q5 a few rounds later.
401	 vld1.8	{q8},[r0],r8
402	.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
403	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
404	.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
405	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
406	.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
407	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
408	 veor	q8,q8,q5
409	.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
410	veor	q6,q0,q7
411	bhs	.Loop_cbc_enc128
412
413	vst1.8	{q6},[r1]!
414	b	.Lcbc_done
415.align	5
@ ---- Decrypt path ----
@ q10 and q11 are reused as data registers here, so only q12-q15 and q7 of
@ the preloaded keys remain in use; earlier round keys are streamed through
@ q8/q9 from r7.  When only one block is present r8 was 0, so this second
@ load re-reads the first block instead of running past the input.
416.Lcbc_dec:
417	vld1.8	{q10},[r0]!
@ r2 = length - 48; negative means fewer than three blocks (tail only).
418	subs	r2,r2,#32		@ bias
@ r6 = rounds - 6, the counter for the two-rounds-per-iteration key loops.
419	add	r6,r5,#2
420	vorr	q3,q0,q0
421	vorr	q1,q0,q0
422	vorr	q11,q10,q10
423	blo	.Lcbc_dec_tail
424
@ Three-block setup: q0, q1, q10 = blocks 0, 1, 2 to be decrypted and
@ q2, q3, q11 = untouched copies of them for the chaining XORs.
425	vorr	q1,q10,q10
426	vld1.8	{q10},[r0]!
427	vorr	q2,q0,q0
428	vorr	q3,q1,q1
429	vorr	q11,q10,q10
430
@ Inner key loop: two rounds per iteration on three interleaved blocks.
431.Loop3x_cbc_dec:
432	.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8
433	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
434	.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
435	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
436	.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
437	.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
438	vld1.32	{q8},[r7]!
439	subs	r6,r6,#2
440	.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9
441	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
442	.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
443	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
444	.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
445	.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
446	vld1.32	{q9},[r7]!
447	bgt	.Loop3x_cbc_dec
448
449	.byte	0x60,0x03,0xb0,0xf3	@ aesd q0,q8
450	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
451	.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
452	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
453	.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
454	.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
@ q4, q5, q9 = (previous ciphertext ^ last round key) for blocks 0, 1, 2,
@ so one XOR per block finishes both the last round and the chaining.
@ The flags from this subs are consumed by "bhs .Loop3x_cbc_dec" below.
455	 veor	q4,q6,q7
456	 subs	r2,r2,#0x30
457	 veor	q5,q2,q7
458	 movlo	r6,r2			@ r6 is zero at this point
459	.byte	0x62,0x03,0xb0,0xf3	@ aesd q0,q9
460	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
461	.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
462	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
463	.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
464	.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
465	 veor	q9,q3,q7
466	 add	r0,r0,r6		@ r0 is adjusted in such way that
467					@ at exit from the loop q1-q10
468					@ are loaded with last "words"
@ q6 = last ciphertext block of this group, i.e. the next chaining value.
469	 vorr	q6,q11,q11
470	 mov	r7,r3
471	.byte	0x68,0x03,0xb0,0xf3	@ aesd q0,q12
472	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
473	.byte	0x68,0x23,0xb0,0xf3	@ aesd q1,q12
474	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
475	.byte	0x68,0x43,0xf0,0xf3	@ aesd q10,q12
476	.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
@ Load the next three input blocks into q2, q3, q11 while the current
@ group finishes.
477	 vld1.8	{q2},[r0]!
478	.byte	0x6a,0x03,0xb0,0xf3	@ aesd q0,q13
479	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
480	.byte	0x6a,0x23,0xb0,0xf3	@ aesd q1,q13
481	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
482	.byte	0x6a,0x43,0xf0,0xf3	@ aesd q10,q13
483	.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
484	 vld1.8	{q3},[r0]!
485	.byte	0x6c,0x03,0xb0,0xf3	@ aesd q0,q14
486	.byte	0xc0,0x03,0xb0,0xf3	@ aesimc q0,q0
487	.byte	0x6c,0x23,0xb0,0xf3	@ aesd q1,q14
488	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
489	.byte	0x6c,0x43,0xf0,0xf3	@ aesd q10,q14
490	.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
491	 vld1.8	{q11},[r0]!
492	.byte	0x6e,0x03,0xb0,0xf3	@ aesd q0,q15
493	.byte	0x6e,0x23,0xb0,0xf3	@ aesd q1,q15
494	.byte	0x6e,0x43,0xf0,0xf3	@ aesd q10,q15
495	 vld1.32 {q8},[r7]!	@ re-pre-load rndkey[0]
@ Re-arm the key-loop counter (rounds - 6).
496	 add	r6,r5,#2
497	veor	q4,q4,q0
498	veor	q5,q5,q1
499	veor	q10,q10,q9
500	 vld1.32 {q9},[r7]!	@ re-pre-load rndkey[1]
501	vst1.8	{q4},[r1]!
502	 vorr	q0,q2,q2
503	vst1.8	{q5},[r1]!
504	 vorr	q1,q3,q3
505	vst1.8	{q10},[r1]!
506	 vorr	q10,q11,q11
507	bhs	.Loop3x_cbc_dec
508
@ r2 == -0x30 here means no blocks are left over.
509	cmn	r2,#0x30
510	beq	.Lcbc_done
511	nop
512
@ Tail: one or two remaining blocks, held in q1 and q10.  With one block
@ left only q10's result is used (see .Lcbc_dec_one).
513.Lcbc_dec_tail:
514	.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
515	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
516	.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
517	.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
518	vld1.32	{q8},[r7]!
519	subs	r6,r6,#2
520	.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
521	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
522	.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
523	.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
524	vld1.32	{q9},[r7]!
525	bgt	.Lcbc_dec_tail
526
527	.byte	0x60,0x23,0xb0,0xf3	@ aesd q1,q8
528	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
529	.byte	0x60,0x43,0xf0,0xf3	@ aesd q10,q8
530	.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
531	.byte	0x62,0x23,0xb0,0xf3	@ aesd q1,q9
532	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
533	.byte	0x62,0x43,0xf0,0xf3	@ aesd q10,q9
534	.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
535	.byte	0x68,0x23,0xb0,0xf3	@ aesd q1,q12
536	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
537	.byte	0x68,0x43,0xf0,0xf3	@ aesd q10,q12
538	.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
@ r2 == -0x20 means exactly one block remains; the flags are consumed by
@ "beq .Lcbc_dec_one" below.
539	 cmn	r2,#0x20
540	.byte	0x6a,0x23,0xb0,0xf3	@ aesd q1,q13
541	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
542	.byte	0x6a,0x43,0xf0,0xf3	@ aesd q10,q13
543	.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
544	 veor	q5,q6,q7
545	.byte	0x6c,0x23,0xb0,0xf3	@ aesd q1,q14
546	.byte	0xc2,0x23,0xb0,0xf3	@ aesimc q1,q1
547	.byte	0x6c,0x43,0xf0,0xf3	@ aesd q10,q14
548	.byte	0xe4,0x43,0xf0,0xf3	@ aesimc q10,q10
549	 veor	q9,q3,q7
550	.byte	0x6e,0x23,0xb0,0xf3	@ aesd q1,q15
551	.byte	0x6e,0x43,0xf0,0xf3	@ aesd q10,q15
552	beq	.Lcbc_dec_one
@ Two blocks: q1 chains from q6, q10 chains from q3; q11 (the last input
@ block) becomes the new chaining value.
553	veor	q5,q5,q1
554	veor	q9,q9,q10
555	 vorr	q6,q11,q11
556	vst1.8	{q5},[r1]!
557	vst1.8	{q9},[r1]!
558	b	.Lcbc_done
559
@ One block: only q10's result is written, chained from q6.
560.Lcbc_dec_one:
561	veor	q5,q5,q10
562	 vorr	q6,q11,q11
563	vst1.8	{q5},[r1]!
564
@ Write the final chaining value back through the IV pointer.
565.Lcbc_done:
566	vst1.8	{q6},[r4]
567.Lcbc_abort:
568	vldmia	sp!,{d8-d15}
569	ldmia	sp!,{r4-r8,pc}
570.size	aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt
@ aes_v8_ctr32_encrypt_blocks: counter-mode processing with a 32-bit counter
@ held in the last word of the counter block.
@
@ In:   r0 = input pointer, r1 = output pointer
@       r2 = number of 16-byte blocks
@       r3 = key schedule pointer, round count at offset 240
@       [sp] = pointer to the 16-byte counter block (loaded into r4); it is
@              only read here, never written back
@ r4-r10, lr and d8-d15 are saved on entry and restored on exit.
@ NOTE(review): r2 == 0 is not special-cased; it takes the .Lctr32_tail path
@ and is handled like a two-block request.  Presumably callers never pass 0
@ -- confirm.
571.globl	aes_v8_ctr32_encrypt_blocks
572.type	aes_v8_ctr32_encrypt_blocks,%function
573.align	5
574aes_v8_ctr32_encrypt_blocks:
@ ip keeps the entry sp so the stack argument can be read after the pushes.
575	mov		ip,sp
576	stmdb		sp!,{r4-r10,lr}
577	vstmdb		sp!,{d8-d15}            @ ABI specification says so
578	ldr		r4, [ip]		@ load remaining arg
579	ldr		r5,[r3,#240]
580
@ r8 = last 32-bit word of the counter block, q0 = the whole block.
581	ldr		r8, [r4, #12]
582	vld1.32		{q0},[r4]
583
584	vld1.32		{q8-q9},[r3]		@ load key schedule...
585	sub		r5,r5,#4
@ r12 = input stride for the tail path (16, or 0 when only one block).
586	mov		r12,#16
@ The flags from this cmp are consumed by movlo and by "bls .Lctr32_tail"
@ below; nothing in between updates them.
587	cmp		r2,#2
588	add		r7,r3,r5,lsl#4	@ pointer to last 5 round keys
@ r5 = rounds - 6, the counter for the two-rounds-per-iteration key loops.
589	sub		r5,r5,#2
@ q12-q15 and q7 = the last five round keys, q7 being the final one.
590	vld1.32		{q12-q13},[r7]!
591	vld1.32		{q14-q15},[r7]!
592	vld1.32		{q7},[r7]
@ r7 = address of round key 2.
593	add		r7,r3,#32
594	mov		r6,r5
595	movlo	r12,#0
@ The counter word is byte-swapped only when __ARMEB__ is not defined; the
@ rev instructions that re-encode incremented counters are unconditional.
596#ifndef __ARMEB__
597	rev		r8, r8
598#endif
@ q1 = counter block with counter+1, q10 = with counter+2 (three-block path
@ only), q6 = untouched copy of the initial block used as a template.
599	vorr		q1,q0,q0
600	add		r10, r8, #1
601	vorr		q10,q0,q0
602	add		r8, r8, #2
603	vorr		q6,q0,q0
604	rev		r10, r10
605	vmov.32	d3[1],r10
606	bls		.Lctr32_tail
607	rev		r12, r8
608	sub		r2,r2,#3		@ bias
609	vmov.32	d21[1],r12
610	b		.Loop3x_ctr32
611
612.align	4
@ Inner key loop: two rounds per iteration on three counter blocks.
613.Loop3x_ctr32:
614	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
615	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
616	.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
617	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
618	.byte	0x20,0x43,0xf0,0xf3	@ aese q10,q8
619	.byte	0xa4,0x43,0xf0,0xf3	@ aesmc q10,q10
620	vld1.32		{q8},[r7]!
621	subs		r6,r6,#2
622	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
623	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
624	.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
625	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
626	.byte	0x22,0x43,0xf0,0xf3	@ aese q10,q9
627	.byte	0xa4,0x43,0xf0,0xf3	@ aesmc q10,q10
628	vld1.32		{q9},[r7]!
629	bgt		.Loop3x_ctr32
630
@ From here the three states migrate to q4, q5 and q9 (via the aesmc
@ destinations), which frees q0, q1 and q10 to be rebuilt from the q6
@ template as the next three counter blocks.
631	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
632	.byte	0x80,0x83,0xb0,0xf3	@ aesmc q4,q0
633	.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
634	.byte	0x82,0xa3,0xb0,0xf3	@ aesmc q5,q1
@ q2, q3, q11 = the three input blocks for this group.
635	 vld1.8		{q2},[r0]!
636	 vorr		q0,q6,q6
637	.byte	0x20,0x43,0xf0,0xf3	@ aese q10,q8
638	.byte	0xa4,0x43,0xf0,0xf3	@ aesmc q10,q10
639	 vld1.8		{q3},[r0]!
640	 vorr		q1,q6,q6
641	.byte	0x22,0x83,0xb0,0xf3	@ aese q4,q9
642	.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
643	.byte	0x22,0xa3,0xb0,0xf3	@ aese q5,q9
644	.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
645	 vld1.8		{q11},[r0]!
646	 mov		r7,r3
647	.byte	0x22,0x43,0xf0,0xf3	@ aese q10,q9
648	.byte	0xa4,0x23,0xf0,0xf3	@ aesmc q9,q10
649	 vorr		q10,q6,q6
@ r9, r10 and the updated r8 are the next three counter values.
650	 add		r9,r8,#1
651	.byte	0x28,0x83,0xb0,0xf3	@ aese q4,q12
652	.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
653	.byte	0x28,0xa3,0xb0,0xf3	@ aese q5,q12
654	.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
@ Inputs are pre-XORed with the last round key (q7), so the final XOR with
@ the cipher state completes both the last round and the CTR XOR.
655	 veor		q2,q2,q7
656	 add		r10,r8,#2
657	.byte	0x28,0x23,0xf0,0xf3	@ aese q9,q12
658	.byte	0xa2,0x23,0xf0,0xf3	@ aesmc q9,q9
659	 veor		q3,q3,q7
660	 add		r8,r8,#3
661	.byte	0x2a,0x83,0xb0,0xf3	@ aese q4,q13
662	.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
663	.byte	0x2a,0xa3,0xb0,0xf3	@ aese q5,q13
664	.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
665	 veor		q11,q11,q7
666	 rev		r9,r9
667	.byte	0x2a,0x23,0xf0,0xf3	@ aese q9,q13
668	.byte	0xa2,0x23,0xf0,0xf3	@ aesmc q9,q9
@ Insert the new counter words into the last lane of q0, q1 and q10.
669	 vmov.32	d1[1], r9
670	 rev		r10,r10
671	.byte	0x2c,0x83,0xb0,0xf3	@ aese q4,q14
672	.byte	0x88,0x83,0xb0,0xf3	@ aesmc q4,q4
673	.byte	0x2c,0xa3,0xb0,0xf3	@ aese q5,q14
674	.byte	0x8a,0xa3,0xb0,0xf3	@ aesmc q5,q5
675	 vmov.32	d3[1], r10
676	 rev		r12,r8
677	.byte	0x2c,0x23,0xf0,0xf3	@ aese q9,q14
678	.byte	0xa2,0x23,0xf0,0xf3	@ aesmc q9,q9
679	 vmov.32	d21[1], r12
@ The flags from this subs are consumed by "bhs .Loop3x_ctr32" below.
680	 subs		r2,r2,#3
681	.byte	0x2e,0x83,0xb0,0xf3	@ aese q4,q15
682	.byte	0x2e,0xa3,0xb0,0xf3	@ aese q5,q15
683	.byte	0x2e,0x23,0xf0,0xf3	@ aese q9,q15
684
685	veor		q2,q2,q4
686	 vld1.32	 {q8},[r7]!	@ re-pre-load rndkey[0]
687	vst1.8		{q2},[r1]!
688	veor		q3,q3,q5
@ Re-arm the key-loop counter.
689	 mov		r6,r5
690	vst1.8		{q3},[r1]!
691	veor		q11,q11,q9
692	 vld1.32	 {q9},[r7]!	@ re-pre-load rndkey[1]
693	vst1.8		{q11},[r1]!
694	bhs		.Loop3x_ctr32
695
@ Undo the bias: r2 = number of blocks still to do (0, 1 or 2).
696	adds		r2,r2,#3
697	beq		.Lctr32_done
@ r12 = tail input stride: 0 when a single block remains, otherwise 16.
698	cmp		r2,#1
699	mov		r12,#16
700	moveq	r12,#0
701
@ Tail: one or two blocks, using the counter blocks already in q0 and q1.
702.Lctr32_tail:
703	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
704	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
705	.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
706	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
707	vld1.32		{q8},[r7]!
708	subs		r6,r6,#2
709	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
710	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
711	.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
712	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
713	vld1.32		{q9},[r7]!
714	bgt		.Lctr32_tail
715
716	.byte	0x20,0x03,0xb0,0xf3	@ aese q0,q8
717	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
718	.byte	0x20,0x23,0xb0,0xf3	@ aese q1,q8
719	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
720	.byte	0x22,0x03,0xb0,0xf3	@ aese q0,q9
721	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
722	.byte	0x22,0x23,0xb0,0xf3	@ aese q1,q9
723	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
@ With r12 == 0 the second load below re-reads the same block instead of
@ reading past the end of the input.
724	 vld1.8		{q2},[r0],r12
725	.byte	0x28,0x03,0xb0,0xf3	@ aese q0,q12
726	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
727	.byte	0x28,0x23,0xb0,0xf3	@ aese q1,q12
728	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
729	 vld1.8		{q3},[r0]
730	.byte	0x2a,0x03,0xb0,0xf3	@ aese q0,q13
731	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
732	.byte	0x2a,0x23,0xb0,0xf3	@ aese q1,q13
733	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
734	 veor		q2,q2,q7
735	.byte	0x2c,0x03,0xb0,0xf3	@ aese q0,q14
736	.byte	0x80,0x03,0xb0,0xf3	@ aesmc q0,q0
737	.byte	0x2c,0x23,0xb0,0xf3	@ aese q1,q14
738	.byte	0x82,0x23,0xb0,0xf3	@ aesmc q1,q1
739	 veor		q3,q3,q7
740	.byte	0x2e,0x03,0xb0,0xf3	@ aese q0,q15
741	.byte	0x2e,0x23,0xb0,0xf3	@ aese q1,q15
742
@ Always store the first result; store the second unless r2 == 1.
743	cmp		r2,#1
744	veor		q2,q2,q0
745	veor		q3,q3,q1
746	vst1.8		{q2},[r1]!
747	beq		.Lctr32_done
748	vst1.8		{q3},[r1]
749
750.Lctr32_done:
751	vldmia		sp!,{d8-d15}
752	ldmia		sp!,{r4-r10,pc}
753.size	aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
754#endif
755