1#if defined(__aarch64__)
2#include <openssl/arm_arch.h>
3
4#if __ARM_MAX_ARCH__>=7
5.text
6.arch	armv8-a+crypto
7.align	5
// Constant table for the key schedule. aes_hw_set_encrypt_key reaches it
// with "adr x3,.Lrcon" and loads rows 0 and 1 together into v1/v2.
//   row 0: 0x01 in every 32-bit lane - the initial round constant, which
//          the expansion loops double with shl after each use.
//   row 1: tbl byte-index mask. As stored on a little-endian build it
//          selects source bytes 13,14,15,12 for every lane, i.e. the last
//          word rotated by one byte and replicated across the vector.
//   row 2: 0x1b in every lane - loaded only by the 128-bit schedule once
//          its eight loop iterations have used up the doublings of row 0.
8.Lrcon:
9.long	0x01,0x01,0x01,0x01
10.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
11.long	0x1b,0x1b,0x1b,0x1b
12
// aes_hw_set_encrypt_key - expand a user key into an AES encryption key
// schedule using the ARMv8 Crypto Extensions.
//
// In:   x0 = pointer to the user key bytes
//       w1 = key length in bits; only 128, 192 and 256 pass the checks
//       x2 = pointer to the output key schedule
// Out:  x0 = 0 on success, -1 if x0 or x2 is zero, -2 if w1 is rejected
//       Round keys are written consecutively starting at x2 and the round
//       count (10, 12 or 14) is stored as a 32-bit word at byte offset 240.
// Exit state relied on by aes_hw_set_decrypt_key (which enters through
// .Lenc_key): on success x2 points at byte offset 240 of the schedule and
// w12 holds the round count.
// Vector roles: v0 = zero, v1 = round constant, v2 = tbl mask,
//       v3 = most recent four key words, v4 = extra key words (192/256),
//       v5/v6 = temporaries.
// NOTE(review): presumably declared in C as
//       int aes_hw_set_encrypt_key(const uint8_t *key, int bits, AES_KEY *out)
//       - confirm against the declaring header.
13.globl	aes_hw_set_encrypt_key
14.hidden	aes_hw_set_encrypt_key
15.type	aes_hw_set_encrypt_key,%function
16.align	5
17aes_hw_set_encrypt_key:
18.Lenc_key:
19	stp	x29,x30,[sp,#-16]!
20	add	x29,sp,#0
	// Argument validation. x3 carries the pending return code.
21	mov	x3,#-1
22	cmp	x0,#0
23	b.eq	.Lenc_key_abort
24	cmp	x2,#0
25	b.eq	.Lenc_key_abort
26	mov	x3,#-2
27	cmp	w1,#128
28	b.lt	.Lenc_key_abort
29	cmp	w1,#256
30	b.gt	.Lenc_key_abort
31	tst	w1,#0x3f		// must be a multiple of 64
32	b.ne	.Lenc_key_abort
33
	// The flags set by the compare against 192 stay live until the
	// three-way dispatch below; none of the instructions in between
	// writes the condition flags.
34	adr	x3,.Lrcon
35	cmp	w1,#192
36
37	eor	v0.16b,v0.16b,v0.16b	// v0 = 0 for the whole routine
38	ld1	{v3.16b},[x0],#16	// first 16 key bytes
39	mov	w1,#8		// reuse w1
40	ld1	{v1.4s,v2.4s},[x3],#32	// v1 = rcon, v2 = mask; x3 -> 0x1b row
41
42	b.lt	.Loop128
43	b.eq	.L192
44	b	.L256
45
	// 128-bit key: eight looped rounds, then two unrolled rounds using the
	// 0x1b constant row. Eleven round keys are stored in total.
46.align	4
47.Loop128:
48	tbl	v6.16b,{v3.16b},v2.16b	// rotate last word, replicate
49	ext	v5.16b,v0.16b,v3.16b,#12	// v3 moved up one word, zero fill
50	st1	{v3.4s},[x2],#16
	// The round key operand is zero and all four columns are equal, so
	// the net effect of aese here is the S-box applied to each byte.
51	aese	v6.16b,v0.16b
52	subs	w1,w1,#1
53
	// Three shifted XORs accumulate the running XOR of the previous words.
54	eor	v3.16b,v3.16b,v5.16b
55	ext	v5.16b,v0.16b,v5.16b,#12
56	eor	v3.16b,v3.16b,v5.16b
57	ext	v5.16b,v0.16b,v5.16b,#12
58	eor	v6.16b,v6.16b,v1.16b	// add the round constant
59	eor	v3.16b,v3.16b,v5.16b
60	shl	v1.16b,v1.16b,#1	// double the round constant
61	eor	v3.16b,v3.16b,v6.16b
62	b.ne	.Loop128
63
64	ld1	{v1.4s},[x3]		// round constant becomes 0x1b
65
66	tbl	v6.16b,{v3.16b},v2.16b
67	ext	v5.16b,v0.16b,v3.16b,#12
68	st1	{v3.4s},[x2],#16
69	aese	v6.16b,v0.16b
70
71	eor	v3.16b,v3.16b,v5.16b
72	ext	v5.16b,v0.16b,v5.16b,#12
73	eor	v3.16b,v3.16b,v5.16b
74	ext	v5.16b,v0.16b,v5.16b,#12
75	eor	v6.16b,v6.16b,v1.16b
76	eor	v3.16b,v3.16b,v5.16b
77	shl	v1.16b,v1.16b,#1
78	eor	v3.16b,v3.16b,v6.16b
79
80	tbl	v6.16b,{v3.16b},v2.16b
81	ext	v5.16b,v0.16b,v3.16b,#12
82	st1	{v3.4s},[x2],#16
83	aese	v6.16b,v0.16b
84
85	eor	v3.16b,v3.16b,v5.16b
86	ext	v5.16b,v0.16b,v5.16b,#12
87	eor	v3.16b,v3.16b,v5.16b
88	ext	v5.16b,v0.16b,v5.16b,#12
89	eor	v6.16b,v6.16b,v1.16b
90	eor	v3.16b,v3.16b,v5.16b
91	eor	v3.16b,v3.16b,v6.16b
	// Last round key is stored without post-increment: x2 is at offset
	// 160 here, so adding 0x50 moves it to offset 240.
92	st1	{v3.4s},[x2]
93	add	x2,x2,#0x50
94
95	mov	w12,#10
96	b	.Ldone
97
	// 192-bit key: v3 holds key words 0-3 and the low half of v4 holds
	// words 4-5. Each of the eight iterations emits 24 bytes.
98.align	4
99.L192:
100	ld1	{v4.8b},[x0],#8
101	movi	v6.16b,#8			// borrow v6.16b
102	st1	{v3.4s},[x2],#16
	// Subtracting 8 from every index makes the mask pick bytes 5,6,7,4,
	// i.e. the rotated second word of v4 instead of the fourth.
103	sub	v2.16b,v2.16b,v6.16b	// adjust the mask
104
105.Loop192:
106	tbl	v6.16b,{v4.16b},v2.16b
107	ext	v5.16b,v0.16b,v3.16b,#12
108	st1	{v4.8b},[x2],#8
109	aese	v6.16b,v0.16b
110	subs	w1,w1,#1
111
112	eor	v3.16b,v3.16b,v5.16b
113	ext	v5.16b,v0.16b,v5.16b,#12
114	eor	v3.16b,v3.16b,v5.16b
115	ext	v5.16b,v0.16b,v5.16b,#12
116	eor	v3.16b,v3.16b,v5.16b
117
	// Derive the next two words in v4 from the last word of v3.
118	dup	v5.4s,v3.s[3]
119	eor	v5.16b,v5.16b,v4.16b
120	eor	v6.16b,v6.16b,v1.16b
121	ext	v4.16b,v0.16b,v4.16b,#12
122	shl	v1.16b,v1.16b,#1
123	eor	v4.16b,v4.16b,v5.16b
124	eor	v3.16b,v3.16b,v6.16b
125	eor	v4.16b,v4.16b,v6.16b
126	st1	{v3.4s},[x2],#16
127	b.ne	.Loop192
128
	// 16 + 8*24 = 208 bytes written; 0x20 more reaches offset 240.
129	mov	w12,#12
130	add	x2,x2,#0x20
131	b	.Ldone
132
	// 256-bit key: v3/v4 hold the two 16-byte halves. Seven iterations;
	// the loop leaves through the b.eq in its middle once the 15th round
	// key is stored, which puts x2 at offset 240 with no further adjustment.
133.align	4
134.L256:
135	ld1	{v4.16b},[x0]
136	mov	w1,#7
137	mov	w12,#14
138	st1	{v3.4s},[x2],#16
139
140.Loop256:
141	tbl	v6.16b,{v4.16b},v2.16b
142	ext	v5.16b,v0.16b,v3.16b,#12
143	st1	{v4.4s},[x2],#16
144	aese	v6.16b,v0.16b
145	subs	w1,w1,#1
146
147	eor	v3.16b,v3.16b,v5.16b
148	ext	v5.16b,v0.16b,v5.16b,#12
149	eor	v3.16b,v3.16b,v5.16b
150	ext	v5.16b,v0.16b,v5.16b,#12
151	eor	v6.16b,v6.16b,v1.16b
152	eor	v3.16b,v3.16b,v5.16b
153	shl	v1.16b,v1.16b,#1
154	eor	v3.16b,v3.16b,v6.16b
155	st1	{v3.4s},[x2],#16
156	b.eq	.Ldone
157
	// Second half of the iteration: S-box only, no rotation and no round
	// constant.
158	dup	v6.4s,v3.s[3]		// just splat
159	ext	v5.16b,v0.16b,v4.16b,#12
160	aese	v6.16b,v0.16b
161
162	eor	v4.16b,v4.16b,v5.16b
163	ext	v5.16b,v0.16b,v5.16b,#12
164	eor	v4.16b,v4.16b,v5.16b
165	ext	v5.16b,v0.16b,v5.16b,#12
166	eor	v4.16b,v4.16b,v5.16b
167
168	eor	v4.16b,v4.16b,v6.16b
169	b	.Loop256
170
	// All three paths arrive here with x2 at byte offset 240.
171.Ldone:
172	str	w12,[x2]		// round count
173	mov	x3,#0
174
	// Only x29 is reloaded: nothing in this routine writes x30, so the
	// return address is still intact (also when entered via bl .Lenc_key).
175.Lenc_key_abort:
176	mov	x0,x3			// return value
177	ldr	x29,[sp],#16
178	ret
179.size	aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key
180
// aes_hw_set_decrypt_key - build an AES decryption key schedule.
//
// In:   x0, w1, x2 - passed unchanged to the encryption key expansion
//       (user key pointer, key length in bits, output schedule pointer).
// Out:  x0 = 0 on success, otherwise the non-zero code returned by the
//       encryption key expansion.
// Method: runs the encryption expansion through .Lenc_key, then reverses
// the order of the round keys in place. The first and last round keys are
// swapped as they are; every other round key is passed through aesimc.
// Depends on the exit state of .Lenc_key: x2 = schedule + 240 and
// w12 = round count.
// NOTE(review): presumably declared in C as
//       int aes_hw_set_decrypt_key(const uint8_t *key, int bits, AES_KEY *out)
//       - confirm against the declaring header.
181.globl	aes_hw_set_decrypt_key
182.hidden	aes_hw_set_decrypt_key
183.type	aes_hw_set_decrypt_key,%function
184.align	5
185aes_hw_set_decrypt_key:
186	stp	x29,x30,[sp,#-16]!
187	add	x29,sp,#0
188	bl	.Lenc_key		// overwrites x30, hence the ldp on exit
189
190	cmp	x0,#0
191	b.ne	.Ldec_key_abort		// propagate the error code in x0
192
193	sub	x2,x2,#240		// restore original x2
194	mov	x4,#-16			// post-index step for the descending pointer
195	add	x0,x2,x12,lsl#4	// end of key schedule
196
	// Swap the outermost pair of round keys without transformation.
197	ld1	{v0.4s},[x2]
198	ld1	{v1.4s},[x0]
199	st1	{v0.4s},[x0],x4
200	st1	{v1.4s},[x2],#16
201
	// Walk inwards from both ends, swapping pairs and applying aesimc,
	// until the two pointers meet.
202.Loop_imc:
203	ld1	{v0.4s},[x2]
204	ld1	{v1.4s},[x0]
205	aesimc	v0.16b,v0.16b
206	aesimc	v1.16b,v1.16b
207	st1	{v0.4s},[x0],x4
208	st1	{v1.4s},[x2],#16
209	cmp	x0,x2
210	b.hi	.Loop_imc
211
	// x0 and x2 now address the same middle round key; transform it in
	// place.
212	ld1	{v0.4s},[x2]
213	aesimc	v0.16b,v0.16b
214	st1	{v0.4s},[x0]
215
216	eor	x0,x0,x0		// return value
217.Ldec_key_abort:
218	ldp	x29,x30,[sp],#16
219	ret
220.size	aes_hw_set_decrypt_key,.-aes_hw_set_decrypt_key
// aes_hw_encrypt - encrypt one 16-byte block.
//
// In:   x0 = pointer to the 16-byte input block
//       x1 = pointer to the 16-byte output block
//       x2 = key schedule; the round count is read from byte offset 240
// Out:  16 bytes stored at x1; no return value is set.
// Leaf routine: no stack frame; uses w3, x2 (advanced), v0-v2 and flags.
// v0/v1 alternate as the current round key, v2 is the state.
221.globl	aes_hw_encrypt
222.hidden	aes_hw_encrypt
223.type	aes_hw_encrypt,%function
224.align	5
225aes_hw_encrypt:
226	ldr	w3,[x2,#240]		// round count
227	ld1	{v0.4s},[x2],#16
228	ld1	{v2.16b},[x0]
229	sub	w3,w3,#2		// last two rounds are done after the loop
230	ld1	{v1.4s},[x2],#16
231
	// Two rounds per iteration; the next round key is loaded while the
	// current one is in use.
232.Loop_enc:
233	aese	v2.16b,v0.16b
234	aesmc	v2.16b,v2.16b
235	ld1	{v0.4s},[x2],#16
236	subs	w3,w3,#2
237	aese	v2.16b,v1.16b
238	aesmc	v2.16b,v2.16b
239	ld1	{v1.4s},[x2],#16
240	b.gt	.Loop_enc
241
	// Final two rounds: the last aese is not followed by aesmc, and the
	// last round key is applied with a plain XOR.
242	aese	v2.16b,v0.16b
243	aesmc	v2.16b,v2.16b
244	ld1	{v0.4s},[x2]
245	aese	v2.16b,v1.16b
246	eor	v2.16b,v2.16b,v0.16b
247
248	st1	{v2.16b},[x1]
249	ret
250.size	aes_hw_encrypt,.-aes_hw_encrypt
// aes_hw_decrypt - decrypt one 16-byte block.
//
// In:   x0 = pointer to the 16-byte input block
//       x1 = pointer to the 16-byte output block
//       x2 = key schedule; the round count is read from byte offset 240
// Out:  16 bytes stored at x1; no return value is set.
// Leaf routine: no stack frame; uses w3, x2 (advanced), v0-v2 and flags.
// Same structure as the single-block encrypt routine with aesd/aesimc in
// place of aese/aesmc.
// NOTE(review): presumably expects a schedule prepared by
// aes_hw_set_decrypt_key - confirm against the callers.
251.globl	aes_hw_decrypt
252.hidden	aes_hw_decrypt
253.type	aes_hw_decrypt,%function
254.align	5
255aes_hw_decrypt:
256	ldr	w3,[x2,#240]		// round count
257	ld1	{v0.4s},[x2],#16
258	ld1	{v2.16b},[x0]
259	sub	w3,w3,#2		// last two rounds are done after the loop
260	ld1	{v1.4s},[x2],#16
261
	// Two rounds per iteration with the next round key loaded early.
262.Loop_dec:
263	aesd	v2.16b,v0.16b
264	aesimc	v2.16b,v2.16b
265	ld1	{v0.4s},[x2],#16
266	subs	w3,w3,#2
267	aesd	v2.16b,v1.16b
268	aesimc	v2.16b,v2.16b
269	ld1	{v1.4s},[x2],#16
270	b.gt	.Loop_dec
271
	// Final two rounds: the last aesd is not followed by aesimc, and the
	// last round key is applied with a plain XOR.
272	aesd	v2.16b,v0.16b
273	aesimc	v2.16b,v2.16b
274	ld1	{v0.4s},[x2]
275	aesd	v2.16b,v1.16b
276	eor	v2.16b,v2.16b,v0.16b
277
278	st1	{v2.16b},[x1]
279	ret
280.size	aes_hw_decrypt,.-aes_hw_decrypt
// aes_hw_cbc_encrypt - AES-CBC encryption or decryption of whole blocks.
//
// In:   x0 = input pointer
//       x1 = output pointer
//       x2 = length in bytes; rounded down to a multiple of 16. When it is
//            below 16 the routine returns without reading or writing data.
//       x3 = key schedule; the round count is read from byte offset 240
//       x4 = 16-byte IV, read at entry and overwritten at .Lcbc_done with
//            the last ciphertext block
//       w5 = direction: zero selects decryption, non-zero encryption
// Out:  no return value is set.
// Register roles: v16/v17 = rotating pair of early round keys;
//       v18-v23 and v7 = the last seven round keys (on the decrypt path
//       v18/v19 are reused for data); v6 = current chaining value;
//       x8 = input post-index step (16, or 0 when no block follows).
// Only x29 is reloaded on exit: the body makes no calls and never writes
// x30.
// NOTE(review): the decrypt path presumably expects a schedule prepared by
// aes_hw_set_decrypt_key - confirm against the callers.
281.globl	aes_hw_cbc_encrypt
282.hidden	aes_hw_cbc_encrypt
283.type	aes_hw_cbc_encrypt,%function
284.align	5
285aes_hw_cbc_encrypt:
286	stp	x29,x30,[sp,#-16]!
287	add	x29,sp,#0
288	subs	x2,x2,#16
289	mov	x8,#16
290	b.lo	.Lcbc_abort		// fewer than 16 bytes: nothing to do
291	csel	x8,xzr,x8,eq		// exactly one block: do not advance x0
292
	// The flags from this compare are consumed by "b.eq .Lcbc_dec" further
	// down; no instruction in between writes the condition flags.
293	cmp	w5,#0			// en- or decrypting?
294	ldr	w5,[x3,#240]
295	and	x2,x2,#-16
296	ld1	{v6.16b},[x4]		// IV
297	ld1	{v0.16b},[x0],x8	// first input block
298
299	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
300	sub	w5,w5,#6
301	add	x7,x3,x5,lsl#4	// pointer to last 7 round keys
302	sub	w5,w5,#2		// w5 = rounds-8: 2, 4 or 6
303	ld1	{v18.4s,v19.4s},[x7],#32
304	ld1	{v20.4s,v21.4s},[x7],#32
305	ld1	{v22.4s,v23.4s},[x7],#32
306	ld1	{v7.4s},[x7]		// last round key
307
308	add	x7,x3,#32		// -> rndkey[2]
309	mov	w6,w5
310	b.eq	.Lcbc_dec
311
	// ---- Encryption ----
	// v5 = rndkey[0] ^ last round key. The state in v0 is kept without
	// the last round key applied, so XORing the next plaintext with v5 and
	// using that as the aese operand applies the last round key, the CBC
	// chaining XOR and rndkey[0] of the next block in one instruction.
312	cmp	w5,#2
313	eor	v0.16b,v0.16b,v6.16b	// plaintext ^ IV
314	eor	v5.16b,v16.16b,v7.16b
315	b.eq	.Lcbc_enc128
316
	// 192/256-bit keys: rndkey[2..3] stay in v2/v3; rndkey[4..7] are
	// reloaded every block through the fixed pointers x6, x12, x14, x3.
	// x7 is repointed at rndkey[1].
317	ld1	{v2.4s,v3.4s},[x7]
318	add	x7,x3,#16
319	add	x6,x3,#16*4
320	add	x12,x3,#16*5
321	aese	v0.16b,v16.16b
322	aesmc	v0.16b,v0.16b
323	add	x14,x3,#16*6
324	add	x3,x3,#16*7
325	b	.Lenter_cbc_enc
326
327.align	4
328.Loop_cbc_enc:
	// v16 holds (next plaintext ^ v5) here, see above.
329	aese	v0.16b,v16.16b
330	aesmc	v0.16b,v0.16b
331	st1	{v6.16b},[x1],#16	// previous ciphertext block
332.Lenter_cbc_enc:
333	aese	v0.16b,v17.16b
334	aesmc	v0.16b,v0.16b
335	aese	v0.16b,v2.16b
336	aesmc	v0.16b,v0.16b
337	ld1	{v16.4s},[x6]
338	cmp	w5,#4			// 192-bit key?
339	aese	v0.16b,v3.16b
340	aesmc	v0.16b,v0.16b
341	ld1	{v17.4s},[x12]
342	b.eq	.Lcbc_enc192
343
	// Two extra rounds for 256-bit keys.
344	aese	v0.16b,v16.16b
345	aesmc	v0.16b,v0.16b
346	ld1	{v16.4s},[x14]
347	aese	v0.16b,v17.16b
348	aesmc	v0.16b,v0.16b
349	ld1	{v17.4s},[x3]
350	nop
351
352.Lcbc_enc192:
353	aese	v0.16b,v16.16b
354	aesmc	v0.16b,v0.16b
	// The flags from this subs drive both the csel below and the b.hs at
	// the bottom of the loop.
355	subs	x2,x2,#16
356	aese	v0.16b,v17.16b
357	aesmc	v0.16b,v0.16b
358	csel	x8,xzr,x8,eq		// last block ahead: stop advancing x0
359	aese	v0.16b,v18.16b
360	aesmc	v0.16b,v0.16b
361	aese	v0.16b,v19.16b
362	aesmc	v0.16b,v0.16b
363	ld1	{v16.16b},[x0],x8	// next plaintext block
364	aese	v0.16b,v20.16b
365	aesmc	v0.16b,v0.16b
366	eor	v16.16b,v16.16b,v5.16b
367	aese	v0.16b,v21.16b
368	aesmc	v0.16b,v0.16b
369	ld1	{v17.4s},[x7]		// re-pre-load rndkey[1]
370	aese	v0.16b,v22.16b
371	aesmc	v0.16b,v0.16b
372	aese	v0.16b,v23.16b
373	eor	v6.16b,v0.16b,v7.16b	// ciphertext = state ^ last round key
374	b.hs	.Loop_cbc_enc
375
376	st1	{v6.16b},[x1],#16
377	b	.Lcbc_done
378
	// 128-bit keys: every round key fits in registers, so the loop only
	// loads input data.
379.align	5
380.Lcbc_enc128:
381	ld1	{v2.4s,v3.4s},[x7]
382	aese	v0.16b,v16.16b
383	aesmc	v0.16b,v0.16b
384	b	.Lenter_cbc_enc128
385.Loop_cbc_enc128:
386	aese	v0.16b,v16.16b
387	aesmc	v0.16b,v0.16b
388	st1	{v6.16b},[x1],#16
389.Lenter_cbc_enc128:
390	aese	v0.16b,v17.16b
391	aesmc	v0.16b,v0.16b
392	subs	x2,x2,#16
393	aese	v0.16b,v2.16b
394	aesmc	v0.16b,v0.16b
395	csel	x8,xzr,x8,eq
396	aese	v0.16b,v3.16b
397	aesmc	v0.16b,v0.16b
398	aese	v0.16b,v18.16b
399	aesmc	v0.16b,v0.16b
400	aese	v0.16b,v19.16b
401	aesmc	v0.16b,v0.16b
402	ld1	{v16.16b},[x0],x8
403	aese	v0.16b,v20.16b
404	aesmc	v0.16b,v0.16b
405	aese	v0.16b,v21.16b
406	aesmc	v0.16b,v0.16b
407	aese	v0.16b,v22.16b
408	aesmc	v0.16b,v0.16b
409	eor	v16.16b,v16.16b,v5.16b
410	aese	v0.16b,v23.16b
411	eor	v6.16b,v0.16b,v7.16b
412	b.hs	.Loop_cbc_enc128
413
414	st1	{v6.16b},[x1],#16
415	b	.Lcbc_done
	// ---- Decryption ----
	// Up to three blocks are decrypted in parallel in v0, v1 and v18.
	// Untouched copies of the ciphertext are kept (v2, v3, v19) because
	// each one is the chaining value for the block after it.
416.align	5
417.Lcbc_dec:
418	ld1	{v18.16b},[x0],#16
419	subs	x2,x2,#32		// bias
420	add	w6,w5,#2		// round-loop counter
421	orr	v3.16b,v0.16b,v0.16b
422	orr	v1.16b,v0.16b,v0.16b
423	orr	v19.16b,v18.16b,v18.16b
424	b.lo	.Lcbc_dec_tail		// fewer than three blocks in total
425
426	orr	v1.16b,v18.16b,v18.16b
427	ld1	{v18.16b},[x0],#16
428	orr	v2.16b,v0.16b,v0.16b
429	orr	v3.16b,v1.16b,v1.16b
430	orr	v19.16b,v18.16b,v18.16b
431
432.Loop3x_cbc_dec:
433	aesd	v0.16b,v16.16b
434	aesimc	v0.16b,v0.16b
435	aesd	v1.16b,v16.16b
436	aesimc	v1.16b,v1.16b
437	aesd	v18.16b,v16.16b
438	aesimc	v18.16b,v18.16b
439	ld1	{v16.4s},[x7],#16
440	subs	w6,w6,#2
441	aesd	v0.16b,v17.16b
442	aesimc	v0.16b,v0.16b
443	aesd	v1.16b,v17.16b
444	aesimc	v1.16b,v1.16b
445	aesd	v18.16b,v17.16b
446	aesimc	v18.16b,v18.16b
447	ld1	{v17.4s},[x7],#16
448	b.gt	.Loop3x_cbc_dec
449
450	aesd	v0.16b,v16.16b
451	aesimc	v0.16b,v0.16b
452	aesd	v1.16b,v16.16b
453	aesimc	v1.16b,v1.16b
454	aesd	v18.16b,v16.16b
455	aesimc	v18.16b,v18.16b
	// v4, v5 and v17 receive (chaining value ^ last round key) for the
	// three blocks, so one XOR per block finishes the decryption later.
456	eor	v4.16b,v6.16b,v7.16b
	// The flags from this subs are consumed by the csel below and by the
	// b.hs that closes the loop.
457	subs	x2,x2,#0x30
458	eor	v5.16b,v2.16b,v7.16b
459	csel	x6,x2,x6,lo			// x6, w6, is zero at this point
460	aesd	v0.16b,v17.16b
461	aesimc	v0.16b,v0.16b
462	aesd	v1.16b,v17.16b
463	aesimc	v1.16b,v1.16b
464	aesd	v18.16b,v17.16b
465	aesimc	v18.16b,v18.16b
466	eor	v17.16b,v3.16b,v7.16b
467	add	x0,x0,x6		// x0 is adjusted in such way that
468					// at exit from the loop v1.16b-v18.16b
469					// are loaded with last "words"
470	orr	v6.16b,v19.16b,v19.16b	// next chaining value
471	mov	x7,x3
472	aesd	v0.16b,v20.16b
473	aesimc	v0.16b,v0.16b
474	aesd	v1.16b,v20.16b
475	aesimc	v1.16b,v1.16b
476	aesd	v18.16b,v20.16b
477	aesimc	v18.16b,v18.16b
478	ld1	{v2.16b},[x0],#16
479	aesd	v0.16b,v21.16b
480	aesimc	v0.16b,v0.16b
481	aesd	v1.16b,v21.16b
482	aesimc	v1.16b,v1.16b
483	aesd	v18.16b,v21.16b
484	aesimc	v18.16b,v18.16b
485	ld1	{v3.16b},[x0],#16
486	aesd	v0.16b,v22.16b
487	aesimc	v0.16b,v0.16b
488	aesd	v1.16b,v22.16b
489	aesimc	v1.16b,v1.16b
490	aesd	v18.16b,v22.16b
491	aesimc	v18.16b,v18.16b
492	ld1	{v19.16b},[x0],#16
493	aesd	v0.16b,v23.16b
494	aesd	v1.16b,v23.16b
495	aesd	v18.16b,v23.16b
496	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
497	add	w6,w5,#2
498	eor	v4.16b,v4.16b,v0.16b
499	eor	v5.16b,v5.16b,v1.16b
500	eor	v18.16b,v18.16b,v17.16b
501	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
502	st1	{v4.16b},[x1],#16
503	orr	v0.16b,v2.16b,v2.16b
504	st1	{v5.16b},[x1],#16
505	orr	v1.16b,v3.16b,v3.16b
506	st1	{v18.16b},[x1],#16
507	orr	v18.16b,v19.16b,v19.16b
508	b.hs	.Loop3x_cbc_dec
509
	// x2 is biased by -0x30 here: -0x30 means nothing is left, -0x20 one
	// block, -0x10 two blocks.
510	cmn	x2,#0x30
511	b.eq	.Lcbc_done
512	nop
513
	// Tail: one or two remaining blocks, processed in v1 and v18. With a
	// single block only the v18 result is used.
514.Lcbc_dec_tail:
515	aesd	v1.16b,v16.16b
516	aesimc	v1.16b,v1.16b
517	aesd	v18.16b,v16.16b
518	aesimc	v18.16b,v18.16b
519	ld1	{v16.4s},[x7],#16
520	subs	w6,w6,#2
521	aesd	v1.16b,v17.16b
522	aesimc	v1.16b,v1.16b
523	aesd	v18.16b,v17.16b
524	aesimc	v18.16b,v18.16b
525	ld1	{v17.4s},[x7],#16
526	b.gt	.Lcbc_dec_tail
527
528	aesd	v1.16b,v16.16b
529	aesimc	v1.16b,v1.16b
530	aesd	v18.16b,v16.16b
531	aesimc	v18.16b,v18.16b
532	aesd	v1.16b,v17.16b
533	aesimc	v1.16b,v1.16b
534	aesd	v18.16b,v17.16b
535	aesimc	v18.16b,v18.16b
536	aesd	v1.16b,v20.16b
537	aesimc	v1.16b,v1.16b
538	aesd	v18.16b,v20.16b
539	aesimc	v18.16b,v18.16b
	// Sets Z when exactly one block remains; consumed by the
	// "b.eq .Lcbc_dec_one" below.
540	cmn	x2,#0x20
541	aesd	v1.16b,v21.16b
542	aesimc	v1.16b,v1.16b
543	aesd	v18.16b,v21.16b
544	aesimc	v18.16b,v18.16b
545	eor	v5.16b,v6.16b,v7.16b
546	aesd	v1.16b,v22.16b
547	aesimc	v1.16b,v1.16b
548	aesd	v18.16b,v22.16b
549	aesimc	v18.16b,v18.16b
550	eor	v17.16b,v3.16b,v7.16b
551	aesd	v1.16b,v23.16b
552	aesd	v18.16b,v23.16b
553	b.eq	.Lcbc_dec_one
554	eor	v5.16b,v5.16b,v1.16b
555	eor	v17.16b,v17.16b,v18.16b
556	orr	v6.16b,v19.16b,v19.16b	// last ciphertext becomes the IV
557	st1	{v5.16b},[x1],#16
558	st1	{v17.16b},[x1],#16
559	b	.Lcbc_done
560
561.Lcbc_dec_one:
562	eor	v5.16b,v5.16b,v18.16b
563	orr	v6.16b,v19.16b,v19.16b	// last ciphertext becomes the IV
564	st1	{v5.16b},[x1],#16
565
566.Lcbc_done:
567	st1	{v6.16b},[x4]		// write the chaining value back
568.Lcbc_abort:
569	ldr	x29,[sp],#16
570	ret
571.size	aes_hw_cbc_encrypt,.-aes_hw_cbc_encrypt
// aes_hw_ctr32_encrypt_blocks - AES-CTR over whole blocks with a 32-bit
// counter held in the last word of the counter block.
//
// In:   x0 = input pointer
//       x1 = output pointer
//       x2 = number of 16-byte blocks
//       x3 = key schedule; the round count is read from byte offset 240
//       x4 = 16-byte counter block; the word at byte offset 12 is the
//            counter and is byte-reversed on non-__ARMEB__ builds before
//            it is incremented
// Out:  no return value is set. The counter block at x4 is only read; the
//       updated counter is not written back.
// Register roles: v16/v17 = rotating pair of early round keys;
//       v20-v23 and v7 = the last five round keys; v6 = template copy of
//       the counter block; w8 = next counter value in host byte order;
//       v0, v1, v18 = counter blocks being encrypted.
// Only x29 is reloaded on exit: the body makes no calls and never writes
// x30.
// NOTE(review): a block count of 0 is not rejected. On that path the tail
// still loads 16 bytes from x0 and stores 32 bytes to x1. Presumably the
// callers guarantee a non-zero count - confirm.
572.globl	aes_hw_ctr32_encrypt_blocks
573.hidden	aes_hw_ctr32_encrypt_blocks
574.type	aes_hw_ctr32_encrypt_blocks,%function
575.align	5
576aes_hw_ctr32_encrypt_blocks:
577	stp	x29,x30,[sp,#-16]!
578	add	x29,sp,#0
579	ldr	w5,[x3,#240]
580
581	ldr	w8, [x4, #12]
582	ld1	{v0.4s},[x4]
583
584	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
585	sub	w5,w5,#4
586	mov	x12,#16
	// The flags from this compare feed the csel below and the
	// "b.ls .Lctr32_tail" further down; nothing in between writes flags.
587	cmp	x2,#2
588	add	x7,x3,x5,lsl#4	// pointer to last 5 round keys
589	sub	w5,w5,#2
590	ld1	{v20.4s,v21.4s},[x7],#32
591	ld1	{v22.4s,v23.4s},[x7],#32
592	ld1	{v7.4s},[x7]
593	add	x7,x3,#32
594	mov	w6,w5
595	csel	x12,xzr,x12,lo		// fewer than two blocks: zero input step
596#ifndef __ARMEB__
597	rev	w8, w8
598#endif
599	orr	v1.16b,v0.16b,v0.16b
600	add	w10, w8, #1
601	orr	v18.16b,v0.16b,v0.16b
602	add	w8, w8, #2
603	orr	v6.16b,v0.16b,v0.16b
604	rev	w10, w10
605	mov	v1.s[3],w10		// v1 = counter + 1
606	b.ls	.Lctr32_tail		// one or two blocks only
607	rev	w12, w8			// w12 reused as scratch from here on
608	sub	x2,x2,#3		// bias
609	mov	v18.s[3],w12		// v18 = counter + 2
610	b	.Loop3x_ctr32
611
	// Main loop: three counter blocks per pass.
612.align	4
613.Loop3x_ctr32:
614	aese	v0.16b,v16.16b
615	aesmc	v0.16b,v0.16b
616	aese	v1.16b,v16.16b
617	aesmc	v1.16b,v1.16b
618	aese	v18.16b,v16.16b
619	aesmc	v18.16b,v18.16b
620	ld1	{v16.4s},[x7],#16
621	subs	w6,w6,#2
622	aese	v0.16b,v17.16b
623	aesmc	v0.16b,v0.16b
624	aese	v1.16b,v17.16b
625	aesmc	v1.16b,v1.16b
626	aese	v18.16b,v17.16b
627	aesmc	v18.16b,v18.16b
628	ld1	{v17.4s},[x7],#16
629	b.gt	.Loop3x_ctr32
630
	// From here the three states continue in v4, v5 and v17, which frees
	// v0, v1 and v18 to be rebuilt from the template in v6 with the next
	// three counter values while the remaining rounds run.
631	aese	v0.16b,v16.16b
632	aesmc	v4.16b,v0.16b
633	aese	v1.16b,v16.16b
634	aesmc	v5.16b,v1.16b
635	ld1	{v2.16b},[x0],#16
636	orr	v0.16b,v6.16b,v6.16b
637	aese	v18.16b,v16.16b
638	aesmc	v18.16b,v18.16b
639	ld1	{v3.16b},[x0],#16
640	orr	v1.16b,v6.16b,v6.16b
641	aese	v4.16b,v17.16b
642	aesmc	v4.16b,v4.16b
643	aese	v5.16b,v17.16b
644	aesmc	v5.16b,v5.16b
645	ld1	{v19.16b},[x0],#16
646	mov	x7,x3
647	aese	v18.16b,v17.16b
648	aesmc	v17.16b,v18.16b
649	orr	v18.16b,v6.16b,v6.16b
650	add	w9,w8,#1
651	aese	v4.16b,v20.16b
652	aesmc	v4.16b,v4.16b
653	aese	v5.16b,v20.16b
654	aesmc	v5.16b,v5.16b
	// The input is XORed with the last round key ahead of time, so a
	// single XOR with the final aese result yields the output block.
655	eor	v2.16b,v2.16b,v7.16b
656	add	w10,w8,#2
657	aese	v17.16b,v20.16b
658	aesmc	v17.16b,v17.16b
659	eor	v3.16b,v3.16b,v7.16b
660	add	w8,w8,#3
661	aese	v4.16b,v21.16b
662	aesmc	v4.16b,v4.16b
663	aese	v5.16b,v21.16b
664	aesmc	v5.16b,v5.16b
665	eor	v19.16b,v19.16b,v7.16b
666	rev	w9,w9
667	aese	v17.16b,v21.16b
668	aesmc	v17.16b,v17.16b
669	mov	v0.s[3], w9
670	rev	w10,w10
671	aese	v4.16b,v22.16b
672	aesmc	v4.16b,v4.16b
673	aese	v5.16b,v22.16b
674	aesmc	v5.16b,v5.16b
675	mov	v1.s[3], w10
676	rev	w12,w8
677	aese	v17.16b,v22.16b
678	aesmc	v17.16b,v17.16b
679	mov	v18.s[3], w12
	// The flags from this subs are consumed by the b.hs that closes the
	// loop; nothing in between writes flags.
680	subs	x2,x2,#3
681	aese	v4.16b,v23.16b
682	aese	v5.16b,v23.16b
683	aese	v17.16b,v23.16b
684
685	eor	v2.16b,v2.16b,v4.16b
686	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
687	st1	{v2.16b},[x1],#16
688	eor	v3.16b,v3.16b,v5.16b
689	mov	w6,w5
690	st1	{v3.16b},[x1],#16
691	eor	v19.16b,v19.16b,v17.16b
692	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
693	st1	{v19.16b},[x1],#16
694	b.hs	.Loop3x_ctr32
695
	// Undo the bias; x2 is now 0, 1 or 2 blocks remaining. x12 is rebuilt
	// as the input step because w12 was used as scratch above.
696	adds	x2,x2,#3
697	b.eq	.Lctr32_done
698	cmp	x2,#1
699	mov	x12,#16
700	csel	x12,xzr,x12,eq
701
	// Tail: the keystream for two blocks is always computed in v0/v1. With
	// one block left the zero step in x12 makes the second input load
	// re-read the same block, and the second store is skipped.
702.Lctr32_tail:
703	aese	v0.16b,v16.16b
704	aesmc	v0.16b,v0.16b
705	aese	v1.16b,v16.16b
706	aesmc	v1.16b,v1.16b
707	ld1	{v16.4s},[x7],#16
708	subs	w6,w6,#2
709	aese	v0.16b,v17.16b
710	aesmc	v0.16b,v0.16b
711	aese	v1.16b,v17.16b
712	aesmc	v1.16b,v1.16b
713	ld1	{v17.4s},[x7],#16
714	b.gt	.Lctr32_tail
715
716	aese	v0.16b,v16.16b
717	aesmc	v0.16b,v0.16b
718	aese	v1.16b,v16.16b
719	aesmc	v1.16b,v1.16b
720	aese	v0.16b,v17.16b
721	aesmc	v0.16b,v0.16b
722	aese	v1.16b,v17.16b
723	aesmc	v1.16b,v1.16b
724	ld1	{v2.16b},[x0],x12
725	aese	v0.16b,v20.16b
726	aesmc	v0.16b,v0.16b
727	aese	v1.16b,v20.16b
728	aesmc	v1.16b,v1.16b
729	ld1	{v3.16b},[x0]
730	aese	v0.16b,v21.16b
731	aesmc	v0.16b,v0.16b
732	aese	v1.16b,v21.16b
733	aesmc	v1.16b,v1.16b
734	eor	v2.16b,v2.16b,v7.16b
735	aese	v0.16b,v22.16b
736	aesmc	v0.16b,v0.16b
737	aese	v1.16b,v22.16b
738	aesmc	v1.16b,v1.16b
739	eor	v3.16b,v3.16b,v7.16b
740	aese	v0.16b,v23.16b
741	aese	v1.16b,v23.16b
742
743	cmp	x2,#1
744	eor	v2.16b,v2.16b,v0.16b
745	eor	v3.16b,v3.16b,v1.16b
746	st1	{v2.16b},[x1],#16
747	b.eq	.Lctr32_done		// single block: skip the second store
748	st1	{v3.16b},[x1]
749
750.Lctr32_done:
751	ldr	x29,[sp],#16
752	ret
753.size	aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks
754#endif
755#endif
756