1#include "arm_asm.h"
2#include "arm_arch.h"
3
4#if __ARM_MAX_ARCH__>=8
@ Select NEON as the FPU so the assembler accepts NEON/SIMD mnemonics.
5.fpu	neon
@ INST(a,b,c,d) expands to a comma-separated list of four byte values.
@ The order of the bytes depends on the instruction set being assembled,
@ and argument d is dropped in both variants: a fixed constant byte is
@ emitted in its place.
@ NOTE(review): presumably meant to be used with .byte to hand-encode
@ instructions; no use of INST appears in the visible part of this file,
@ so confirm against the rest of the file.
6#ifdef __thumb2__
@ Thumb-2 build: switch to unified syntax and Thumb encoding.
7.syntax	unified
8.thumb
@ Thumb-2 byte order: c, 0xef, a, b.
9# define INST(a,b,c,d)   c,0xef,a,b
10#else
@ Non-Thumb-2 build: assemble 32-bit ARM code.
11.code	32
@ ARM byte order: a, b, c, 0xf2.
12# define INST(a,b,c,d)   a,b,c,0xf2
13#endif
14
15.text
16.globl	aes_gcm_enc_128_kernel
17.type	aes_gcm_enc_128_kernel,%function
18.align	4
19aes_gcm_enc_128_kernel:
20	cbz	r1, .L128_enc_ret
21	stp	r19, r20, [sp, #-112]!
22	mov	r16, r4
23	mov	r8, r5
24	stp	r21, r22, [sp, #16]
25	stp	r23, r24, [sp, #32]
26	stp	d8, d9, [sp, #48]
27	stp	d10, d11, [sp, #64]
28	stp	d12, d13, [sp, #80]
29	stp	d14, d15, [sp, #96]
30
31	ldp	r10, r11, [r16]              @ ctr96_b64, ctr96_t32
32#ifdef __ARMEB__
33	rev	r10, r10
34	rev	r11, r11
35#endif
36	ldp	r13, r14, [r8, #160]                     @ load rk10
37#ifdef __ARMEB__
38	ror	r13, r13, #32
39	ror	r14, r14, #32
40#endif
41	ld1	{v11.16b}, [r3]
42	ext	v11.16b, v11.16b, v11.16b, #8
43	rev64	v11.16b, v11.16b
44	lsr	r5, r1, #3              @ byte_len
45	mov	r15, r5
46
47	ld1	{v18.4s}, [r8], #16								  @ load rk0
48	add	r4, r0, r1, lsr #3   @ end_input_ptr
49	sub	r5, r5, #1      @ byte_len - 1
50
51	lsr	r12, r11, #32
52	ldr	q15, [r3, #112]                        @ load h4l | h4h
53#ifndef __ARMEB__
54	ext	v15.16b, v15.16b, v15.16b, #8
55#endif
56	fmov	d1, r10                               @ CTR block 1
57	rev	r12, r12                                @ rev_ctr32
58
59	add	r12, r12, #1                            @ increment rev_ctr32
60	orr	r11, r11, r11
61	ld1	{v19.4s}, [r8], #16								  @ load rk1
62
63	rev	r9, r12                                 @ CTR block 1
64	add	r12, r12, #1                            @ CTR block 1
65	fmov	d3, r10                               @ CTR block 3
66
67	orr	r9, r11, r9, lsl #32            @ CTR block 1
68	ld1	{ q0}, [r16]                             @ special case vector load initial counter so we can start first AES block as quickly as possible
69
70	fmov	v1.d[1], r9                               @ CTR block 1
71	rev	r9, r12                                 @ CTR block 2
72
73	fmov	d2, r10                               @ CTR block 2
74	orr	r9, r11, r9, lsl #32            @ CTR block 2
75	add	r12, r12, #1                            @ CTR block 2
76
77	fmov	v2.d[1], r9                               @ CTR block 2
78	rev	r9, r12                                 @ CTR block 3
79
80	orr	r9, r11, r9, lsl #32            @ CTR block 3
81	ld1	{v20.4s}, [r8], #16								  @ load rk2
82
83	add	r12, r12, #1                            @ CTR block 3
84	fmov	v3.d[1], r9                               @ CTR block 3
85
86	ldr	q14, [r3, #80]                         @ load h3l | h3h
87#ifndef __ARMEB__
88	ext	v14.16b, v14.16b, v14.16b, #8
89#endif
90	aese	q1, v18.16b
91	aesmc	q1, q1          @ AES block 1 - round 0
92	ld1	{v21.4s}, [r8], #16								  @ load rk3
93
94	aese	q2, v18.16b
95	aesmc	q2, q2          @ AES block 2 - round 0
96	ldr	q12, [r3, #32]                         @ load h1l | h1h
97#ifndef __ARMEB__
98	ext	v12.16b, v12.16b, v12.16b, #8
99#endif
100
101	aese	q0, v18.16b
102	aesmc	q0, q0          @ AES block 0 - round 0
103	ld1	{v22.4s}, [r8], #16								  @ load rk4
104
105	aese	q3, v18.16b
106	aesmc	q3, q3          @ AES block 3 - round 0
107	ld1	{v23.4s}, [r8], #16								  @ load rk5
108
109	aese	q2, v19.16b
110	aesmc	q2, q2          @ AES block 2 - round 1
111	trn2	v17.2d,  v14.2d,    v15.2d                      @ h4l | h3l
112
113	aese	q0, v19.16b
114	aesmc	q0, q0          @ AES block 0 - round 1
115	ld1	{v24.4s}, [r8], #16								  @ load rk6
116
117	aese	q1, v19.16b
118	aesmc	q1, q1          @ AES block 1 - round 1
119	ld1	{v25.4s}, [r8], #16								  @ load rk7
120
121	aese	q3, v19.16b
122	aesmc	q3, q3          @ AES block 3 - round 1
123	trn1	q9, v14.2d,    v15.2d                      @ h4h | h3h
124
125	aese	q0, v20.16b
126	aesmc	q0, q0          @ AES block 0 - round 2
127	ld1	{v26.4s}, [r8], #16								  @ load rk8
128
129	aese	q1, v20.16b
130	aesmc	q1, q1          @ AES block 1 - round 2
131	ldr	q13, [r3, #64]                         @ load h2l | h2h
132#ifndef __ARMEB__
133	ext	v13.16b, v13.16b, v13.16b, #8
134#endif
135
136	aese	q3, v20.16b
137	aesmc	q3, q3          @ AES block 3 - round 2
138
139	aese	q2, v20.16b
140	aesmc	q2, q2          @ AES block 2 - round 2
141	eor	v17.16b, v17.16b, q9                  @ h4k | h3k
142
143	aese	q0, v21.16b
144	aesmc	q0, q0          @ AES block 0 - round 3
145
146	aese	q1, v21.16b
147	aesmc	q1, q1          @ AES block 1 - round 3
148
149	aese	q2, v21.16b
150	aesmc	q2, q2          @ AES block 2 - round 3
151	ld1	{v27.4s}, [r8], #16								  @ load rk9
152
153	aese	q3, v21.16b
154	aesmc	q3, q3          @ AES block 3 - round 3
155
156	and	r5, r5, #0xffffffffffffffc0    @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
157	trn2	v16.2d,  v12.2d,    v13.2d                      @ h2l | h1l
158
159	aese	q3, v22.16b
160	aesmc	q3, q3          @ AES block 3 - round 4
161	add	r5, r5, r0
162
163	aese	q2, v22.16b
164	aesmc	q2, q2          @ AES block 2 - round 4
165	cmp	r0, r5                   @ check if we have <= 4 blocks
166
167	aese	q0, v22.16b
168	aesmc	q0, q0          @ AES block 0 - round 4
169
170	aese	q3, v23.16b
171	aesmc	q3, q3          @ AES block 3 - round 5
172
173	aese	q2, v23.16b
174	aesmc	q2, q2          @ AES block 2 - round 5
175
176	aese	q0, v23.16b
177	aesmc	q0, q0          @ AES block 0 - round 5
178
179	aese	q3, v24.16b
180	aesmc	q3, q3          @ AES block 3 - round 6
181
182	aese	q1, v22.16b
183	aesmc	q1, q1          @ AES block 1 - round 4
184
185	aese	q2, v24.16b
186	aesmc	q2, q2          @ AES block 2 - round 6
187	trn1	q8,    v12.2d,    v13.2d                      @ h2h | h1h
188
189	aese	q0, v24.16b
190	aesmc	q0, q0          @ AES block 0 - round 6
191
192	aese	q1, v23.16b
193	aesmc	q1, q1          @ AES block 1 - round 5
194
195	aese	q3, v25.16b
196	aesmc	q3, q3          @ AES block 3 - round 7
197
198	aese	q0, v25.16b
199	aesmc	q0, q0          @ AES block 0 - round 7
200
201	aese	q1, v24.16b
202	aesmc	q1, q1          @ AES block 1 - round 6
203
204	aese	q2, v25.16b
205	aesmc	q2, q2          @ AES block 2 - round 7
206
207	aese	q0, v26.16b
208	aesmc	q0, q0          @ AES block 0 - round 8
209
210	aese	q1, v25.16b
211	aesmc	q1, q1          @ AES block 1 - round 7
212
213	aese	q2, v26.16b
214	aesmc	q2, q2          @ AES block 2 - round 8
215
216	aese	q3, v26.16b
217	aesmc	q3, q3          @ AES block 3 - round 8
218
219	aese	q1, v26.16b
220	aesmc	q1, q1          @ AES block 1 - round 8
221
222	aese	q2, v27.16b                                      @ AES block 2 - round 9
223
224	aese	q0, v27.16b                                      @ AES block 0 - round 9
225
226	eor	v16.16b, v16.16b, q8                     @ h2k | h1k
227
228	aese	q1, v27.16b                                      @ AES block 1 - round 9
229
230	aese	q3, v27.16b                                      @ AES block 3 - round 9
231	bge	.L128_enc_tail                                    @ handle tail
232
233	ldp	r6, r7, [r0, #0]            @ AES block 0 - load plaintext
234#ifdef __ARMEB__
235	rev	r6, r6
236	rev	r7, r7
237#endif
238	ldp	r21, r22, [r0, #32]           @ AES block 2 - load plaintext
239#ifdef __ARMEB__
240	rev	r21, r21
241	rev	r22, r22
242#endif
243	ldp	r19, r20, [r0, #16]           @ AES block 1 - load plaintext
244#ifdef __ARMEB__
245	rev	r19, r19
246	rev	r20, r20
247#endif
248	ldp	r23, r24, [r0, #48]           @ AES block 3 - load plaintext
249#ifdef __ARMEB__
250	rev	r23, r23
251	rev	r24, r24
252#endif
253	eor	r6, r6, r13                     @ AES block 0 - round 10 low
254	eor	r7, r7, r14                     @ AES block 0 - round 10 high
255
256	eor	r21, r21, r13                     @ AES block 2 - round 10 low
257	fmov	d4, r6                               @ AES block 0 - mov low
258
259	eor	r19, r19, r13                     @ AES block 1 - round 10 low
260	eor	r22, r22, r14                     @ AES block 2 - round 10 high
261	fmov	v4.d[1], r7                           @ AES block 0 - mov high
262
263	fmov	d5, r19                               @ AES block 1 - mov low
264	eor	r20, r20, r14                     @ AES block 1 - round 10 high
265
266	eor	r23, r23, r13                     @ AES block 3 - round 10 low
267	fmov	v5.d[1], r20                           @ AES block 1 - mov high
268
269	fmov	d6, r21                               @ AES block 2 - mov low
270	eor	r24, r24, r14                     @ AES block 3 - round 10 high
271	rev	r9, r12                                 @ CTR block 4
272
273	fmov	v6.d[1], r22                           @ AES block 2 - mov high
274	orr	r9, r11, r9, lsl #32            @ CTR block 4
275
276	eor	q4, q4, q0                          @ AES block 0 - result
277	fmov	d0, r10                               @ CTR block 4
278	add	r12, r12, #1                            @ CTR block 4
279
280	fmov	v0.d[1], r9                               @ CTR block 4
281	rev	r9, r12                                 @ CTR block 5
282
283	eor	q5, q5, q1                          @ AES block 1 - result
284	fmov	d1, r10                               @ CTR block 5
285	orr	r9, r11, r9, lsl #32            @ CTR block 5
286
287	add	r12, r12, #1                            @ CTR block 5
288	add	r0, r0, #64                       @ AES input_ptr update
289	fmov	v1.d[1], r9                               @ CTR block 5
290
291	fmov	d7, r23                               @ AES block 3 - mov low
292	rev	r9, r12                                 @ CTR block 6
293	st1	{ q4}, [r2], #16                     @ AES block 0 - store result
294
295	fmov	v7.d[1], r24                           @ AES block 3 - mov high
296	orr	r9, r11, r9, lsl #32            @ CTR block 6
297
298	add	r12, r12, #1                            @ CTR block 6
299	eor	q6, q6, q2                          @ AES block 2 - result
300	st1	{ q5}, [r2], #16                     @ AES block 1 - store result
301
302	fmov	d2, r10                               @ CTR block 6
303	cmp	r0, r5                   @ check if we have <= 8 blocks
304
305	fmov	v2.d[1], r9                               @ CTR block 6
306	rev	r9, r12                                 @ CTR block 7
307	st1	{ q6}, [r2], #16                     @ AES block 2 - store result
308
309	orr	r9, r11, r9, lsl #32            @ CTR block 7
310
311	eor	q7, q7, q3                          @ AES block 3 - result
312	st1	{ q7}, [r2], #16                     @ AES block 3 - store result
313	bge	.L128_enc_prepretail                              @ do prepretail
314
315.L128_enc_main_loop:@ main loop start
316	ldp	r23, r24, [r0, #48]           @ AES block 4k+3 - load plaintext
317#ifdef __ARMEB__
318	rev	r23, r23
319	rev	r24, r24
320#endif
321	rev64	q4, q4                                    @ GHASH block 4k (only t0 is free)
322	rev64	q6, q6                                    @ GHASH block 4k+2 (t0, t1, and t2 free)
323
324	aese	q2, v18.16b
325	aesmc	q2, q2          @ AES block 4k+6 - round 0
326	fmov	d3, r10                               @ CTR block 4k+3
327
328	ext	v11.16b, v11.16b, v11.16b, #8                     @ PRE 0
329	rev64	q5, q5                                    @ GHASH block 4k+1 (t0 and t1 free)
330
331	aese	q1, v18.16b
332	aesmc	q1, q1          @ AES block 4k+5 - round 0
333	add	r12, r12, #1                            @ CTR block 4k+3
334	fmov	v3.d[1], r9                               @ CTR block 4k+3
335
336	aese	q0, v18.16b
337	aesmc	q0, q0          @ AES block 4k+4 - round 0
338	mov	d31, v6.d[1]                                  @ GHASH block 4k+2 - mid
339
340	aese	q2, v19.16b
341	aesmc	q2, q2          @ AES block 4k+6 - round 1
342	mov	d30, v5.d[1]                                  @ GHASH block 4k+1 - mid
343
344	aese	q1, v19.16b
345	aesmc	q1, q1          @ AES block 4k+5 - round 1
346	eor	q4, q4, v11.16b                           @ PRE 1
347
348	aese	q3, v18.16b
349	aesmc	q3, q3          @ AES block 4k+7 - round 0
350	eor	r24, r24, r14                     @ AES block 4k+3 - round 10 high
351
352	pmull2	v28.1q, q5, v14.2d                          @ GHASH block 4k+1 - high
353	eor	v31.8b, v31.8b, q6                          @ GHASH block 4k+2 - mid
354	ldp	r6, r7, [r0, #0]            @ AES block 4k+4 - load plaintext
355#ifdef __ARMEB__
356	rev	r6, r6
357	rev	r7, r7
358#endif
359	aese	q0, v19.16b
360	aesmc	q0, q0          @ AES block 4k+4 - round 1
361	rev	r9, r12                                 @ CTR block 4k+8
362
363	eor	v30.8b, v30.8b, q5                          @ GHASH block 4k+1 - mid
364	mov	d8, v4.d[1]                                  @ GHASH block 4k - mid
365	orr	r9, r11, r9, lsl #32            @ CTR block 4k+8
366
367	pmull2	v9.1q, q4, v15.2d                       @ GHASH block 4k - high
368	add	r12, r12, #1                            @ CTR block 4k+8
369	mov	d10, v17.d[1]                               @ GHASH block 4k - mid
370
371	aese	q0, v20.16b
372	aesmc	q0, q0          @ AES block 4k+4 - round 2
373
374	pmull	v11.1q, q4, v15.1d                       @ GHASH block 4k - low
375	eor	q8, q8, q4                          @ GHASH block 4k - mid
376
377	aese	q1, v20.16b
378	aesmc	q1, q1          @ AES block 4k+5 - round 2
379
380	aese	q0, v21.16b
381	aesmc	q0, q0          @ AES block 4k+4 - round 3
382	eor	q9, q9, v28.16b                         @ GHASH block 4k+1 - high
383
384	pmull	v28.1q, q6, v13.1d                          @ GHASH block 4k+2 - low
385
386	pmull	v10.1q, q8, v10.1d                      @ GHASH block 4k - mid
387	rev64	q7, q7                                    @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
388
389	pmull	v30.1q, v30.1d, v17.1d                          @ GHASH block 4k+1 - mid
390
391	pmull	v29.1q, q5, v14.1d                          @ GHASH block 4k+1 - low
392	ins	v31.d[1], v31.d[0]                                @ GHASH block 4k+2 - mid
393
394	pmull2	v8.1q, q6, v13.2d                          @ GHASH block 4k+2 - high
395	eor	r7, r7, r14                     @ AES block 4k+4 - round 10 high
396
397	eor	v10.16b, v10.16b, v30.16b                         @ GHASH block 4k+1 - mid
398	mov	d30, v7.d[1]                                  @ GHASH block 4k+3 - mid
399
400	aese	q3, v19.16b
401	aesmc	q3, q3          @ AES block 4k+7 - round 1
402	eor	v11.16b, v11.16b, v29.16b                         @ GHASH block 4k+1 - low
403
404	aese	q2, v20.16b
405	aesmc	q2, q2          @ AES block 4k+6 - round 2
406	eor	r6, r6, r13                     @ AES block 4k+4 - round 10 low
407
408	aese	q1, v21.16b
409	aesmc	q1, q1          @ AES block 4k+5 - round 3
410	eor	v30.8b, v30.8b, q7                          @ GHASH block 4k+3 - mid
411
412	pmull2	v4.1q, q7, v12.2d                          @ GHASH block 4k+3 - high
413
414	aese	q2, v21.16b
415	aesmc	q2, q2          @ AES block 4k+6 - round 3
416	eor	q9, q9, q8                         @ GHASH block 4k+2 - high
417
418	pmull2	v31.1q, v31.2d, v16.2d                          @ GHASH block 4k+2 - mid
419
420	pmull	v29.1q, q7, v12.1d                          @ GHASH block 4k+3 - low
421	movi	q8, #0xc2
422
423	pmull	v30.1q, v30.1d, v16.1d                          @ GHASH block 4k+3 - mid
424	eor	v11.16b, v11.16b, v28.16b                         @ GHASH block 4k+2 - low
425
426	aese	q1, v22.16b
427	aesmc	q1, q1          @ AES block 4k+5 - round 4
428
429	aese	q3, v20.16b
430	aesmc	q3, q3          @ AES block 4k+7 - round 2
431	shl	d8, d8, #56               @ mod_constant
432
433	aese	q0, v22.16b
434	aesmc	q0, q0          @ AES block 4k+4 - round 4
435	eor	q9, q9, q4                         @ GHASH block 4k+3 - high
436
437	aese	q1, v23.16b
438	aesmc	q1, q1          @ AES block 4k+5 - round 5
439	ldp	r19, r20, [r0, #16]           @ AES block 4k+5 - load plaintext
440#ifdef __ARMEB__
441	rev	r19, r19
442	rev	r20, r20
443#endif
444	aese	q3, v21.16b
445	aesmc	q3, q3          @ AES block 4k+7 - round 3
446	eor	v10.16b, v10.16b, v31.16b                         @ GHASH block 4k+2 - mid
447
448	aese	q0, v23.16b
449	aesmc	q0, q0          @ AES block 4k+4 - round 5
450	ldp	r21, r22, [r0, #32]           @ AES block 4k+6 - load plaintext
451#ifdef __ARMEB__
452	rev	r21, r21
453	rev	r22, r22
454#endif
455	pmull	v31.1q, q9, q8            @ MODULO - top 64b align with mid
456	eor	v11.16b, v11.16b, v29.16b                         @ GHASH block 4k+3 - low
457
458	aese	q2, v22.16b
459	aesmc	q2, q2          @ AES block 4k+6 - round 4
460	eor	r19, r19, r13                     @ AES block 4k+5 - round 10 low
461
462	aese	q3, v22.16b
463	aesmc	q3, q3          @ AES block 4k+7 - round 4
464	eor	v10.16b, v10.16b, v30.16b                         @ GHASH block 4k+3 - mid
465
466	aese	q1, v24.16b
467	aesmc	q1, q1          @ AES block 4k+5 - round 6
468	eor	r23, r23, r13                     @ AES block 4k+3 - round 10 low
469
470	aese	q2, v23.16b
471	aesmc	q2, q2          @ AES block 4k+6 - round 5
472	eor	v30.16b, v11.16b, q9                         @ MODULO - karatsuba tidy up
473
474	fmov	d4, r6                               @ AES block 4k+4 - mov low
475	aese	q0, v24.16b
476	aesmc	q0, q0          @ AES block 4k+4 - round 6
477	fmov	v4.d[1], r7                           @ AES block 4k+4 - mov high
478
479	add	r0, r0, #64                       @ AES input_ptr update
480	fmov	d7, r23                               @ AES block 4k+3 - mov low
481	ext	q9, q9, q9, #8                     @ MODULO - other top alignment
482
483	aese	q3, v23.16b
484	aesmc	q3, q3          @ AES block 4k+7 - round 5
485	fmov	d5, r19                               @ AES block 4k+5 - mov low
486
487	aese	q0, v25.16b
488	aesmc	q0, q0          @ AES block 4k+4 - round 7
489	eor	v10.16b, v10.16b, v30.16b                         @ MODULO - karatsuba tidy up
490
491	aese	q2, v24.16b
492	aesmc	q2, q2          @ AES block 4k+6 - round 6
493	eor	r20, r20, r14                     @ AES block 4k+5 - round 10 high
494
495	aese	q1, v25.16b
496	aesmc	q1, q1          @ AES block 4k+5 - round 7
497	fmov	v5.d[1], r20                           @ AES block 4k+5 - mov high
498
499	aese	q0, v26.16b
500	aesmc	q0, q0          @ AES block 4k+4 - round 8
501	fmov	v7.d[1], r24                           @ AES block 4k+3 - mov high
502
503	aese	q3, v24.16b
504	aesmc	q3, q3          @ AES block 4k+7 - round 6
505	cmp	r0, r5                   @ .LOOP CONTROL
506
507	aese	q1, v26.16b
508	aesmc	q1, q1          @ AES block 4k+5 - round 8
509	eor	v10.16b, v10.16b, v31.16b                      @ MODULO - fold into mid
510
511	aese	q0, v27.16b                                      @ AES block 4k+4 - round 9
512	eor	r21, r21, r13                     @ AES block 4k+6 - round 10 low
513	eor	r22, r22, r14                     @ AES block 4k+6 - round 10 high
514
515	aese	q3, v25.16b
516	aesmc	q3, q3          @ AES block 4k+7 - round 7
517	fmov	d6, r21                               @ AES block 4k+6 - mov low
518
519	aese	q1, v27.16b                                      @ AES block 4k+5 - round 9
520	fmov	v6.d[1], r22                           @ AES block 4k+6 - mov high
521
522	aese	q2, v25.16b
523	aesmc	q2, q2          @ AES block 4k+6 - round 7
524	eor	q4, q4, q0                          @ AES block 4k+4 - result
525
526	fmov	d0, r10                               @ CTR block 4k+8
527	aese	q3, v26.16b
528	aesmc	q3, q3          @ AES block 4k+7 - round 8
529
530	fmov	v0.d[1], r9                               @ CTR block 4k+8
531	rev	r9, r12                                 @ CTR block 4k+9
532	eor	v10.16b, v10.16b, q9                         @ MODULO - fold into mid
533
534	aese	q2, v26.16b
535	aesmc	q2, q2          @ AES block 4k+6 - round 8
536	eor	q5, q5, q1                          @ AES block 4k+5 - result
537
538	add	r12, r12, #1                            @ CTR block 4k+9
539	orr	r9, r11, r9, lsl #32            @ CTR block 4k+9
540	fmov	d1, r10                               @ CTR block 4k+9
541
542	pmull	v9.1q, v10.1d, q8            @ MODULO - mid 64b align with low
543	fmov	v1.d[1], r9                               @ CTR block 4k+9
544	rev	r9, r12                                 @ CTR block 4k+10
545
546	aese	q2, v27.16b                                      @ AES block 4k+6 - round 9
547	st1	{ q4}, [r2], #16                     @ AES block 4k+4 - store result
548	eor	q6, q6, q2                          @ AES block 4k+6 - result
549	orr	r9, r11, r9, lsl #32            @ CTR block 4k+10
550
551	aese	q3, v27.16b                                      @ AES block 4k+7 - round 9
552	add	r12, r12, #1                            @ CTR block 4k+10
553	ext	v10.16b, v10.16b, v10.16b, #8                     @ MODULO - other mid alignment
554	fmov	d2, r10                               @ CTR block 4k+10
555
556	eor	v11.16b, v11.16b, q9                         @ MODULO - fold into low
557	st1	{ q5}, [r2], #16                     @ AES block 4k+5 - store result
558
559	fmov	v2.d[1], r9                               @ CTR block 4k+10
560	st1	{ q6}, [r2], #16                     @ AES block 4k+6 - store result
561	rev	r9, r12                                 @ CTR block 4k+11
562
563	orr	r9, r11, r9, lsl #32            @ CTR block 4k+11
564	eor	q7, q7, q3                          @ AES block 4k+3 - result
565
566	eor	v11.16b, v11.16b, v10.16b                         @ MODULO - fold into low
567	st1	{ q7}, [r2], #16                     @ AES block 4k+3 - store result
568	blt	.L128_enc_main_loop
569
570.L128_enc_prepretail:@ PREPRETAIL
571	rev64	q4, q4                                    @ GHASH block 4k (only t0 is free)
572	fmov	d3, r10                               @ CTR block 4k+3
573	rev64	q5, q5                                    @ GHASH block 4k+1 (t0 and t1 free)
574
575	ext	v11.16b, v11.16b, v11.16b, #8                     @ PRE 0
576	add	r12, r12, #1                            @ CTR block 4k+3
577	fmov	v3.d[1], r9                               @ CTR block 4k+3
578
579	aese	q1, v18.16b
580	aesmc	q1, q1          @ AES block 4k+5 - round 0
581	rev64	q6, q6                                    @ GHASH block 4k+2 (t0, t1, and t2 free)
582
583	pmull	v29.1q, q5, v14.1d                          @ GHASH block 4k+1 - low
584
585	rev64	q7, q7                                    @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
586	eor	q4, q4, v11.16b                           @ PRE 1
587
588	pmull2	v28.1q, q5, v14.2d                          @ GHASH block 4k+1 - high
589
590	aese	q3, v18.16b
591	aesmc	q3, q3          @ AES block 4k+7 - round 0
592	mov	d30, v5.d[1]                                  @ GHASH block 4k+1 - mid
593
594	pmull	v11.1q, q4, v15.1d                       @ GHASH block 4k - low
595	mov	d8, v4.d[1]                                  @ GHASH block 4k - mid
596
597	mov	d31, v6.d[1]                                  @ GHASH block 4k+2 - mid
598	mov	d10, v17.d[1]                               @ GHASH block 4k - mid
599
600	aese	q1, v19.16b
601	aesmc	q1, q1          @ AES block 4k+5 - round 1
602	eor	v30.8b, v30.8b, q5                          @ GHASH block 4k+1 - mid
603
604	eor	q8, q8, q4                          @ GHASH block 4k - mid
605
606	pmull2	v9.1q, q4, v15.2d                       @ GHASH block 4k - high
607	eor	v31.8b, v31.8b, q6                          @ GHASH block 4k+2 - mid
608
609	aese	q3, v19.16b
610	aesmc	q3, q3          @ AES block 4k+7 - round 1
611
612	pmull	v30.1q, v30.1d, v17.1d                          @ GHASH block 4k+1 - mid
613	eor	v11.16b, v11.16b, v29.16b                         @ GHASH block 4k+1 - low
614
615	pmull	v10.1q, q8, v10.1d                      @ GHASH block 4k - mid
616
617	aese	q0, v18.16b
618	aesmc	q0, q0          @ AES block 4k+4 - round 0
619	ins	v31.d[1], v31.d[0]                                @ GHASH block 4k+2 - mid
620
621	aese	q2, v18.16b
622	aesmc	q2, q2          @ AES block 4k+6 - round 0
623
624	eor	v10.16b, v10.16b, v30.16b                         @ GHASH block 4k+1 - mid
625	mov	d30, v7.d[1]                                  @ GHASH block 4k+3 - mid
626
627	aese	q0, v19.16b
628	aesmc	q0, q0          @ AES block 4k+4 - round 1
629	eor	q9, q9, v28.16b                         @ GHASH block 4k+1 - high
630
631	pmull2	v31.1q, v31.2d, v16.2d                          @ GHASH block 4k+2 - mid
632
633	pmull2	v8.1q, q6, v13.2d                          @ GHASH block 4k+2 - high
634	eor	v30.8b, v30.8b, q7                          @ GHASH block 4k+3 - mid
635
636	pmull2	v4.1q, q7, v12.2d                          @ GHASH block 4k+3 - high
637
638	pmull	v28.1q, q6, v13.1d                          @ GHASH block 4k+2 - low
639
640	aese	q2, v19.16b
641	aesmc	q2, q2          @ AES block 4k+6 - round 1
642	eor	q9, q9, q8                         @ GHASH block 4k+2 - high
643
644	aese	q0, v20.16b
645	aesmc	q0, q0          @ AES block 4k+4 - round 2
646
647	pmull	v29.1q, q7, v12.1d                          @ GHASH block 4k+3 - low
648	movi	q8, #0xc2
649
650	aese	q2, v20.16b
651	aesmc	q2, q2          @ AES block 4k+6 - round 2
652	eor	v11.16b, v11.16b, v28.16b                         @ GHASH block 4k+2 - low
653
654	aese	q3, v20.16b
655	aesmc	q3, q3          @ AES block 4k+7 - round 2
656
657	pmull	v30.1q, v30.1d, v16.1d                          @ GHASH block 4k+3 - mid
658	eor	v10.16b, v10.16b, v31.16b                         @ GHASH block 4k+2 - mid
659
660	aese	q2, v21.16b
661	aesmc	q2, q2          @ AES block 4k+6 - round 3
662
663	aese	q1, v20.16b
664	aesmc	q1, q1          @ AES block 4k+5 - round 2
665	eor	q9, q9, q4                         @ GHASH block 4k+3 - high
666
667	aese	q0, v21.16b
668	aesmc	q0, q0          @ AES block 4k+4 - round 3
669
670	eor	v10.16b, v10.16b, v30.16b                         @ GHASH block 4k+3 - mid
671	shl	d8, d8, #56               @ mod_constant
672
673	aese	q1, v21.16b
674	aesmc	q1, q1          @ AES block 4k+5 - round 3
675	eor	v11.16b, v11.16b, v29.16b                         @ GHASH block 4k+3 - low
676
677	aese	q0, v22.16b
678	aesmc	q0, q0          @ AES block 4k+4 - round 4
679
680	pmull	v28.1q, q9, q8
681	eor	v10.16b, v10.16b, q9                         @ karatsuba tidy up
682
683	aese	q1, v22.16b
684	aesmc	q1, q1          @ AES block 4k+5 - round 4
685
686	aese	q0, v23.16b
687	aesmc	q0, q0          @ AES block 4k+4 - round 5
688	ext	q9, q9, q9, #8
689
690	aese	q3, v21.16b
691	aesmc	q3, q3          @ AES block 4k+7 - round 3
692
693	aese	q2, v22.16b
694	aesmc	q2, q2          @ AES block 4k+6 - round 4
695	eor	v10.16b, v10.16b, v11.16b
696
697	aese	q0, v24.16b
698	aesmc	q0, q0          @ AES block 4k+4 - round 6
699
700	aese	q3, v22.16b
701	aesmc	q3, q3          @ AES block 4k+7 - round 4
702
703	aese	q1, v23.16b
704	aesmc	q1, q1          @ AES block 4k+5 - round 5
705
706	aese	q2, v23.16b
707	aesmc	q2, q2          @ AES block 4k+6 - round 5
708	eor	v10.16b, v10.16b, v28.16b
709
710	aese	q3, v23.16b
711	aesmc	q3, q3          @ AES block 4k+7 - round 5
712
713	aese	q1, v24.16b
714	aesmc	q1, q1          @ AES block 4k+5 - round 6
715
716	aese	q2, v24.16b
717	aesmc	q2, q2          @ AES block 4k+6 - round 6
718
719	aese	q3, v24.16b
720	aesmc	q3, q3          @ AES block 4k+7 - round 6
721	eor	v10.16b, v10.16b, q9
722
723	aese	q0, v25.16b
724	aesmc	q0, q0          @ AES block 4k+4 - round 7
725
726	aese	q2, v25.16b
727	aesmc	q2, q2          @ AES block 4k+6 - round 7
728
729	aese	q3, v25.16b
730	aesmc	q3, q3          @ AES block 4k+7 - round 7
731
732	pmull	v28.1q, v10.1d, q8
733
734	aese	q1, v25.16b
735	aesmc	q1, q1          @ AES block 4k+5 - round 7
736	ext	v10.16b, v10.16b, v10.16b, #8
737
738	aese	q3, v26.16b
739	aesmc	q3, q3          @ AES block 4k+7 - round 8
740
741	aese	q0, v26.16b
742	aesmc	q0, q0          @ AES block 4k+4 - round 8
743	eor	v11.16b, v11.16b, v28.16b
744
745	aese	q1, v26.16b
746	aesmc	q1, q1          @ AES block 4k+5 - round 8
747
748	aese	q3, v27.16b                                      @ AES block 4k+7 - round 9
749
750	aese	q2, v26.16b
751	aesmc	q2, q2          @ AES block 4k+6 - round 8
752
753	aese	q0, v27.16b                                      @ AES block 4k+4 - round 9
754
755	aese	q1, v27.16b                                      @ AES block 4k+5 - round 9
756	eor	v11.16b, v11.16b, v10.16b
757
758	aese	q2, v27.16b                                      @ AES block 4k+6 - round 9
759.L128_enc_tail:@ TAIL
760
761	sub	r5, r4, r0   @ main_end_input_ptr is number of bytes left to process
762	ldp	r6, r7, [r0], #16           @ AES block 4k+4 - load plaintext
763#ifdef __ARMEB__
764	rev	r6, r6
765	rev	r7, r7
766#endif
767	cmp	r5, #48
768
769	ext	q8, v11.16b, v11.16b, #8                     @ prepare final partial tag
770	eor	r6, r6, r13                     @ AES block 4k+4 - round 10 low
771	eor	r7, r7, r14                     @ AES block 4k+4 - round 10 high
772
773	fmov	d4, r6                               @ AES block 4k+4 - mov low
774
775	fmov	v4.d[1], r7                           @ AES block 4k+4 - mov high
776
777	eor	q5, q4, q0                          @ AES block 4k+4 - result
778
779	bgt	.L128_enc_blocks_more_than_3
780
781	sub	r12, r12, #1
782	movi	v11.8b, #0
783	mov	q3, q2
784
785	cmp	r5, #32
786	mov	q2, q1
787	movi	q9, #0
788
789	movi	v10.8b, #0
790	bgt	.L128_enc_blocks_more_than_2
791
792	mov	q3, q1
793	cmp	r5, #16
794
795	sub	r12, r12, #1
796	bgt	.L128_enc_blocks_more_than_1
797
798	sub	r12, r12, #1
799	b	.L128_enc_blocks_less_than_1
800.L128_enc_blocks_more_than_3:@ blocks left >  3
801	st1	{ q5}, [r2], #16                     @ AES final-3 block  - store result
802
803	ldp	r6, r7, [r0], #16           @ AES final-2 block - load input low & high
804#ifdef __ARMEB__
805	rev	r6, r6
806	rev	r7, r7
807#endif
808	rev64	q4, q5                                    @ GHASH final-3 block
809
810	eor	q4, q4, q8                           @ feed in partial tag
811	eor	r7, r7, r14                     @ AES final-2 block - round 10 high
812	eor	r6, r6, r13                     @ AES final-2 block - round 10 low
813
814	fmov	d5, r6                                 @ AES final-2 block - mov low
815
816	movi	q8, #0                                        @ suppress further partial tag feed in
817	fmov	v5.d[1], r7                             @ AES final-2 block - mov high
818
819	pmull	v11.1q, q4, v15.1d                       @ GHASH final-3 block - low
820	mov	d22, v4.d[1]                                 @ GHASH final-3 block - mid
821
822	pmull2	v9.1q, q4, v15.2d                       @ GHASH final-3 block - high
823
824	mov	d10, v17.d[1]                               @ GHASH final-3 block - mid
825
826	eor	q5, q5, q1                            @ AES final-2 block - result
827	eor	v22.8b, v22.8b, q4                      @ GHASH final-3 block - mid
828
829	pmull	v10.1q, v22.1d, v10.1d                    @ GHASH final-3 block - mid
830.L128_enc_blocks_more_than_2:@ blocks left >  2
831
832	st1	{ q5}, [r2], #16                     @ AES final-2 block - store result
833
834	rev64	q4, q5                                    @ GHASH final-2 block
835	ldp	r6, r7, [r0], #16           @ AES final-1 block - load input low & high
836#ifdef __ARMEB__
837	rev	r6, r6
838	rev	r7, r7
839#endif
840	eor	q4, q4, q8                           @ feed in partial tag
841
842	eor	r6, r6, r13                     @ AES final-1 block - round 10 low
843
844	fmov	d5, r6                                 @ AES final-1 block - mov low
845	eor	r7, r7, r14                     @ AES final-1 block - round 10 high
846
847	pmull2	v20.1q, q4, v14.2d                          @ GHASH final-2 block - high
848	fmov	v5.d[1], r7                             @ AES final-1 block - mov high
849
850	mov	d22, v4.d[1]                                 @ GHASH final-2 block - mid
851
852	pmull	v21.1q, q4, v14.1d                          @ GHASH final-2 block - low
853
854	eor	q9, q9, v20.16b                            @ GHASH final-2 block - high
855
856	eor	v22.8b, v22.8b, q4                      @ GHASH final-2 block - mid
857
858	eor	q5, q5, q2                            @ AES final-1 block - result
859
860	eor	v11.16b, v11.16b, v21.16b                            @ GHASH final-2 block - low
861
862	pmull	v22.1q, v22.1d, v17.1d                      @ GHASH final-2 block - mid
863
864	movi	q8, #0                                        @ suppress further partial tag feed in
865
866	eor	v10.16b, v10.16b, v22.16b                       @ GHASH final-2 block - mid
867.L128_enc_blocks_more_than_1:@ blocks left >  1
868
869	st1	{ q5}, [r2], #16                     @ AES final-1 block - store result
870
871	rev64	q4, q5                                    @ GHASH final-1 block
872	ldp	r6, r7, [r0], #16           @ AES final block - load input low & high
873#ifdef __ARMEB__
874	rev	r6, r6
875	rev	r7, r7
876#endif
877	eor	q4, q4, q8                           @ feed in partial tag
878
879	eor	r7, r7, r14                     @ AES final block - round 10 high
880	eor	r6, r6, r13                     @ AES final block - round 10 low
881
882	fmov	d5, r6                                 @ AES final block - mov low
883
884	pmull2	v20.1q, q4, v13.2d                          @ GHASH final-1 block - high
885	fmov	v5.d[1], r7                             @ AES final block - mov high
886
887	mov	d22, v4.d[1]                                 @ GHASH final-1 block - mid
888
889	pmull	v21.1q, q4, v13.1d                          @ GHASH final-1 block - low
890
891	eor	v22.8b, v22.8b, q4                      @ GHASH final-1 block - mid
892
893	eor	q5, q5, q3                            @ AES final block - result
894
895	ins	v22.d[1], v22.d[0]                            @ GHASH final-1 block - mid
896
897	pmull2	v22.1q, v22.2d, v16.2d                      @ GHASH final-1 block - mid
898
899	eor	v11.16b, v11.16b, v21.16b                            @ GHASH final-1 block - low
900
901	eor	q9, q9, v20.16b                            @ GHASH final-1 block - high
902
903	eor	v10.16b, v10.16b, v22.16b                       @ GHASH final-1 block - mid
904	movi	q8, #0                                        @ suppress further partial tag feed in
905.L128_enc_blocks_less_than_1:@ blocks left <= 1
906
907	and	r1, r1, #127                    @ bit_length %= 128
908	mvn	r13, xzr                                      @ rk10_l = 0xffffffffffffffff
909
910	mvn	r14, xzr                                      @ rk10_h = 0xffffffffffffffff
911	sub	r1, r1, #128                    @ bit_length -= 128
912
913	neg	r1, r1                          @ bit_length = 128 - #bits in input (in range [1,128])
914
915	and	r1, r1, #127                    @ bit_length %= 128
916
917	lsr	r14, r14, r1                     @ rk10_h is mask for top 64b of last block
918	cmp	r1, #64
919
920	csel	r6, r13, r14, lt
921	csel	r7, r14, xzr, lt
922
923	fmov	d0, r6                                 @ ctr0b is mask for last block
924
925	fmov	v0.d[1], r7
926
927	and	q5, q5, q0                            @ possibly partial last block has zeroes in highest bits
928
929	rev64	q4, q5                                    @ GHASH final block
930
931	eor	q4, q4, q8                           @ feed in partial tag
932
933	mov	d8, v4.d[1]                                  @ GHASH final block - mid
934
935	pmull	v21.1q, q4, v12.1d                          @ GHASH final block - low
936	ld1	{ v18.16b}, [r2]                            @ load existing bytes where the possibly partial last block is to be stored
937
938	eor	q8, q8, q4                          @ GHASH final block - mid
939#ifndef __ARMEB__
940	rev	r9, r12
941#else
942	mov	r9, r12
943#endif
944	pmull2	v20.1q, q4, v12.2d                          @ GHASH final block - high
945
946	pmull	v8.1q, q8, v16.1d                          @ GHASH final block - mid
947
948	eor	v11.16b, v11.16b, v21.16b                            @ GHASH final block - low
949
950	eor	q9, q9, v20.16b                            @ GHASH final block - high
951
952	eor	v10.16b, v10.16b, q8                         @ GHASH final block - mid
953	movi	q8, #0xc2
954
955	eor	v30.16b, v11.16b, q9                         @ MODULO - karatsuba tidy up
956
957	shl	d8, d8, #56               @ mod_constant
958
959	eor	v10.16b, v10.16b, v30.16b                         @ MODULO - karatsuba tidy up
960
961	pmull	v31.1q, q9, q8            @ MODULO - top 64b align with mid
962
963	ext	q9, q9, q9, #8                     @ MODULO - other top alignment
964
965	eor	v10.16b, v10.16b, v31.16b                      @ MODULO - fold into mid
966
967	eor	v10.16b, v10.16b, q9                         @ MODULO - fold into mid
968
969	pmull	v9.1q, v10.1d, q8            @ MODULO - mid 64b align with low
970
971	ext	v10.16b, v10.16b, v10.16b, #8                     @ MODULO - other mid alignment
972
973	bif	q5, v18.16b, q0                              @ insert existing bytes in top end of result before storing
974
975	eor	v11.16b, v11.16b, q9                         @ MODULO - fold into low
976	st1	{ q5}, [r2]                          @ store all 16B
977
978	str	r9, [r16, #12]                          @ store the updated counter
979
980	eor	v11.16b, v11.16b, v10.16b                         @ MODULO - fold into low
981	ext	v11.16b, v11.16b, v11.16b, #8
982	rev64	v11.16b, v11.16b
983	mov	r0, r15
984	st1	{ v11.16b }, [r3]
985	ldp	r21, r22, [sp, #16]
986	ldp	r23, r24, [sp, #32]
987	ldp	d8, d9, [sp, #48]
988	ldp	d10, d11, [sp, #64]
989	ldp	d12, d13, [sp, #80]
990	ldp	d14, d15, [sp, #96]
991	ldp	r19, r20, [sp], #112
992	RET
993
994.L128_enc_ret:
995	mov	r0, #0x0
996	RET
997.size	aes_gcm_enc_128_kernel,.-aes_gcm_enc_128_kernel
998.globl	aes_gcm_dec_128_kernel
999.type	aes_gcm_dec_128_kernel,%function
1000.align	4
1001aes_gcm_dec_128_kernel:
1002	cbz	r1, .L128_dec_ret
1003	stp	r19, r20, [sp, #-112]!
1004	mov	r16, r4
1005	mov	r8, r5
1006	stp	r21, r22, [sp, #16]
1007	stp	r23, r24, [sp, #32]
1008	stp	d8, d9, [sp, #48]
1009	stp	d10, d11, [sp, #64]
1010	stp	d12, d13, [sp, #80]
1011	stp	d14, d15, [sp, #96]
1012
1013	lsr	r5, r1, #3              @ byte_len
1014	mov	r15, r5
1015	ldp	r10, r11, [r16]              @ ctr96_b64, ctr96_t32
1016#ifdef __ARMEB__
1017	rev	r10, r10
1018	rev	r11, r11
1019#endif
1020	ldp	r13, r14, [r8, #160]                     @ load rk10
1021#ifdef __ARMEB__
1022	ror	r14, r14, 32
1023	ror	r13, r13, 32
1024#endif
1025	sub	r5, r5, #1      @ byte_len - 1
1026	ld1	{v18.4s}, [r8], #16                                @ load rk0
1027
1028	and	r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
1029	ld1	{ q0}, [r16]                             @ special case vector load initial counter so we can start first AES block as quickly as possible
1030
1031	ldr	q13, [r3, #64]                         @ load h2l | h2h
1032#ifndef __ARMEB__
1033	ext	v13.16b, v13.16b, v13.16b, #8
1034#endif
1035	lsr	r12, r11, #32
1036	fmov	d2, r10                               @ CTR block 2
1037
1038	ld1	{v19.4s}, [r8], #16                                @ load rk1
1039	orr	r11, r11, r11
1040	rev	r12, r12                                @ rev_ctr32
1041
1042	fmov	d1, r10                               @ CTR block 1
1043	add	r12, r12, #1                            @ increment rev_ctr32
1044
1045	aese	q0, v18.16b
1046	aesmc	q0, q0          @ AES block 0 - round 0
1047	rev	r9, r12                                 @ CTR block 1
1048
1049	orr	r9, r11, r9, lsl #32            @ CTR block 1
1050	ld1	{v20.4s}, [r8], #16                                @ load rk2
1051	add	r12, r12, #1                            @ CTR block 1
1052
1053	fmov	v1.d[1], r9                               @ CTR block 1
1054	rev	r9, r12                                 @ CTR block 2
1055	add	r12, r12, #1                            @ CTR block 2
1056
1057	aese	q0, v19.16b
1058	aesmc	q0, q0          @ AES block 0 - round 1
1059	orr	r9, r11, r9, lsl #32            @ CTR block 2
1060
1061	fmov	v2.d[1], r9                               @ CTR block 2
1062	rev	r9, r12                                 @ CTR block 3
1063
1064	fmov	d3, r10                               @ CTR block 3
1065	orr	r9, r11, r9, lsl #32            @ CTR block 3
1066	add	r12, r12, #1                            @ CTR block 3
1067
1068	fmov	v3.d[1], r9                               @ CTR block 3
1069	add	r4, r0, r1, lsr #3   @ end_input_ptr
1070
1071	aese	q1, v18.16b
1072	aesmc	q1, q1          @ AES block 1 - round 0
1073	ld1	{v21.4s}, [r8], #16                                @ load rk3
1074
1075	aese	q0, v20.16b
1076	aesmc	q0, q0          @ AES block 0 - round 2
1077	ld1	{v22.4s}, [r8], #16                                @ load rk4
1078
1079	aese	q2, v18.16b
1080	aesmc	q2, q2          @ AES block 2 - round 0
1081	ld1	{v23.4s}, [r8], #16                                @ load rk5
1082
1083	aese	q1, v19.16b
1084	aesmc	q1, q1          @ AES block 1 - round 1
1085	ld1	{v24.4s}, [r8], #16                                @ load rk6
1086
1087	aese	q3, v18.16b
1088	aesmc	q3, q3          @ AES block 3 - round 0
1089
1090	aese	q2, v19.16b
1091	aesmc	q2, q2          @ AES block 2 - round 1
1092
1093	aese	q1, v20.16b
1094	aesmc	q1, q1          @ AES block 1 - round 2
1095
1096	aese	q3, v19.16b
1097	aesmc	q3, q3          @ AES block 3 - round 1
1098	ld1	{ v11.16b}, [r3]
1099	ext	v11.16b, v11.16b, v11.16b, #8
1100	rev64	v11.16b, v11.16b
1101
1102	aese	q0, v21.16b
1103	aesmc	q0, q0          @ AES block 0 - round 3
1104	ld1	{v25.4s}, [r8], #16                                @ load rk7
1105
1106	aese	q1, v21.16b
1107	aesmc	q1, q1          @ AES block 1 - round 3
1108
1109	aese	q3, v20.16b
1110	aesmc	q3, q3          @ AES block 3 - round 2
1111
1112	aese	q2, v20.16b
1113	aesmc	q2, q2          @ AES block 2 - round 2
1114	ld1	{v26.4s}, [r8], #16                                @ load rk8
1115
1116	aese	q1, v22.16b
1117	aesmc	q1, q1          @ AES block 1 - round 4
1118
1119	aese	q3, v21.16b
1120	aesmc	q3, q3          @ AES block 3 - round 3
1121
1122	aese	q2, v21.16b
1123	aesmc	q2, q2          @ AES block 2 - round 3
1124	ldr	q14, [r3, #80]                         @ load h3l | h3h
1125#ifndef __ARMEB__
1126	ext	v14.16b, v14.16b, v14.16b, #8
1127#endif
1128	aese	q0, v22.16b
1129	aesmc	q0, q0          @ AES block 0 - round 4
1130	ld1	{v27.4s}, [r8], #16                                @ load rk9
1131
1132	aese	q1, v23.16b
1133	aesmc	q1, q1          @ AES block 1 - round 5
1134
1135	aese	q2, v22.16b
1136	aesmc	q2, q2          @ AES block 2 - round 4
1137
1138	aese	q3, v22.16b
1139	aesmc	q3, q3          @ AES block 3 - round 4
1140
1141	aese	q0, v23.16b
1142	aesmc	q0, q0          @ AES block 0 - round 5
1143
1144	aese	q2, v23.16b
1145	aesmc	q2, q2          @ AES block 2 - round 5
1146	ldr	q12, [r3, #32]                         @ load h1l | h1h
1147#ifndef __ARMEB__
1148	ext	v12.16b, v12.16b, v12.16b, #8
1149#endif
1150	aese	q3, v23.16b
1151	aesmc	q3, q3          @ AES block 3 - round 5
1152
1153	aese	q0, v24.16b
1154	aesmc	q0, q0          @ AES block 0 - round 6
1155
1156	aese	q1, v24.16b
1157	aesmc	q1, q1          @ AES block 1 - round 6
1158
1159	aese	q3, v24.16b
1160	aesmc	q3, q3          @ AES block 3 - round 6
1161
1162	aese	q2, v24.16b
1163	aesmc	q2, q2          @ AES block 2 - round 6
1164	trn1	q8,    v12.2d,    v13.2d                      @ h2h | h1h
1165
1166	ldr	q15, [r3, #112]                        @ load h4l | h4h
1167#ifndef __ARMEB__
1168	ext	v15.16b, v15.16b, v15.16b, #8
1169#endif
1170	trn2	v16.2d,  v12.2d,    v13.2d                      @ h2l | h1l
1171	add	r5, r5, r0
1172
1173	aese	q1, v25.16b
1174	aesmc	q1, q1          @ AES block 1 - round 7
1175
1176	aese	q2, v25.16b
1177	aesmc	q2, q2          @ AES block 2 - round 7
1178
1179	aese	q0, v25.16b
1180	aesmc	q0, q0          @ AES block 0 - round 7
1181	eor	v16.16b, v16.16b, q8                     @ h2k | h1k
1182
1183	aese	q3, v25.16b
1184	aesmc	q3, q3          @ AES block 3 - round 7
1185
1186	aese	q1, v26.16b
1187	aesmc	q1, q1          @ AES block 1 - round 8
1188	trn2	v17.2d,  v14.2d,    v15.2d                      @ h4l | h3l
1189
1190	aese	q2, v26.16b
1191	aesmc	q2, q2          @ AES block 2 - round 8
1192
1193	aese	q3, v26.16b
1194	aesmc	q3, q3          @ AES block 3 - round 8
1195
1196	aese	q0, v26.16b
1197	aesmc	q0, q0          @ AES block 0 - round 8
1198	trn1	q9, v14.2d,    v15.2d                      @ h4h | h3h
1199
1200	aese	q2, v27.16b                                      @ AES block 2 - round 9
1201
1202	aese	q3, v27.16b                                      @ AES block 3 - round 9
1203
1204	aese	q0, v27.16b                                      @ AES block 0 - round 9
1205	cmp	r0, r5                   @ check if we have <= 4 blocks
1206
1207	aese	q1, v27.16b                                      @ AES block 1 - round 9
1208	eor	v17.16b, v17.16b, q9                  @ h4k | h3k
1209	bge	.L128_dec_tail                                    @ handle tail
1210
1211	ld1	{q4, q5}, [r0], #32               @ AES block 0 - load ciphertext; AES block 1 - load ciphertext
1212
1213	eor	q1, q5, q1                            @ AES block 1 - result
1214	ld1	{q6}, [r0], #16                       @ AES block 2 - load ciphertext
1215
1216	eor	q0, q4, q0                            @ AES block 0 - result
1217	rev64	q4, q4                                    @ GHASH block 0
1218	rev	r9, r12                                 @ CTR block 4
1219
1220	orr	r9, r11, r9, lsl #32            @ CTR block 4
1221	add	r12, r12, #1                            @ CTR block 4
1222	ld1	{q7}, [r0], #16                       @ AES block 3 - load ciphertext
1223
1224	rev64	q5, q5                                    @ GHASH block 1
1225	mov	r19, v1.d[0]                            @ AES block 1 - mov low
1226
1227	mov	r20, v1.d[1]                            @ AES block 1 - mov high
1228
1229	mov	r6, v0.d[0]                            @ AES block 0 - mov low
1230	cmp	r0, r5                   @ check if we have <= 8 blocks
1231
1232	mov	r7, v0.d[1]                            @ AES block 0 - mov high
1233
1234	fmov	d0, r10                               @ CTR block 4
1235
1236	fmov	v0.d[1], r9                               @ CTR block 4
1237	rev	r9, r12                                 @ CTR block 5
1238	eor	r19, r19, r13                   @ AES block 1 - round 10 low
1239#ifdef __ARMEB__
1240	rev	r19, r19
1241#endif
1242	fmov	d1, r10                               @ CTR block 5
1243	add	r12, r12, #1                            @ CTR block 5
1244	orr	r9, r11, r9, lsl #32            @ CTR block 5
1245
1246	fmov	v1.d[1], r9                               @ CTR block 5
1247	rev	r9, r12                                 @ CTR block 6
1248	add	r12, r12, #1                            @ CTR block 6
1249
1250	orr	r9, r11, r9, lsl #32            @ CTR block 6
1251
1252	eor	r20, r20, r14                   @ AES block 1 - round 10 high
1253#ifdef __ARMEB__
1254	rev	r20, r20
1255#endif
1256	eor	r6, r6, r13                   @ AES block 0 - round 10 low
1257#ifdef __ARMEB__
1258	rev	r6, r6
1259#endif
1260	eor	q2, q6, q2                            @ AES block 2 - result
1261
1262	eor	r7, r7, r14                   @ AES block 0 - round 10 high
1263#ifdef __ARMEB__
1264	rev	r7, r7
1265#endif
1266	stp	r6, r7, [r2], #16        @ AES block 0 - store result
1267
1268	stp	r19, r20, [r2], #16        @ AES block 1 - store result
1269	bge	.L128_dec_prepretail                              @ do prepretail
1270
1271.L128_dec_main_loop:@ main loop start
1272	eor	q3, q7, q3                            @ AES block 4k+3 - result
1273	ext	v11.16b, v11.16b, v11.16b, #8                     @ PRE 0
1274	mov	r21, v2.d[0]                            @ AES block 4k+2 - mov low
1275
1276	pmull2	v28.1q, q5, v14.2d                          @ GHASH block 4k+1 - high
1277	mov	r22, v2.d[1]                            @ AES block 4k+2 - mov high
1278
1279	aese	q1, v18.16b
1280	aesmc	q1, q1          @ AES block 4k+5 - round 0
1281	fmov	d2, r10                               @ CTR block 4k+6
1282
1283	rev64	q6, q6                                    @ GHASH block 4k+2
1284	fmov	v2.d[1], r9                               @ CTR block 4k+6
1285	rev	r9, r12                                 @ CTR block 4k+7
1286
1287	mov	r23, v3.d[0]                            @ AES block 4k+3 - mov low
1288	eor	q4, q4, v11.16b                           @ PRE 1
1289	mov	d30, v5.d[1]                                  @ GHASH block 4k+1 - mid
1290
1291	aese	q1, v19.16b
1292	aesmc	q1, q1          @ AES block 4k+5 - round 1
1293	rev64	q7, q7                                    @ GHASH block 4k+3
1294
1295	pmull	v29.1q, q5, v14.1d                          @ GHASH block 4k+1 - low
1296	mov	r24, v3.d[1]                            @ AES block 4k+3 - mov high
1297	orr	r9, r11, r9, lsl #32            @ CTR block 4k+7
1298
1299	pmull	v11.1q, q4, v15.1d                       @ GHASH block 4k - low
1300	fmov	d3, r10                               @ CTR block 4k+7
1301	eor	v30.8b, v30.8b, q5                          @ GHASH block 4k+1 - mid
1302
1303	aese	q1, v20.16b
1304	aesmc	q1, q1          @ AES block 4k+5 - round 2
1305	fmov	v3.d[1], r9                               @ CTR block 4k+7
1306
1307	aese	q2, v18.16b
1308	aesmc	q2, q2          @ AES block 4k+6 - round 0
1309	mov	d10, v17.d[1]                               @ GHASH block 4k - mid
1310
1311	pmull2	v9.1q, q4, v15.2d                       @ GHASH block 4k - high
1312	eor	v11.16b, v11.16b, v29.16b                         @ GHASH block 4k+1 - low
1313
1314	pmull	v29.1q, q7, v12.1d                          @ GHASH block 4k+3 - low
1315
1316	aese	q1, v21.16b
1317	aesmc	q1, q1          @ AES block 4k+5 - round 3
1318	mov	d8, v4.d[1]                                  @ GHASH block 4k - mid
1319
1320	aese	q3, v18.16b
1321	aesmc	q3, q3          @ AES block 4k+7 - round 0
1322	eor	q9, q9, v28.16b                         @ GHASH block 4k+1 - high
1323
1324	aese	q0, v18.16b
1325	aesmc	q0, q0          @ AES block 4k+4 - round 0
1326
1327	pmull	v28.1q, q6, v13.1d                          @ GHASH block 4k+2 - low
1328	eor	q8, q8, q4                          @ GHASH block 4k - mid
1329
1330	aese	q3, v19.16b
1331	aesmc	q3, q3          @ AES block 4k+7 - round 1
1332	eor	r23, r23, r13                   @ AES block 4k+3 - round 10 low
1333#ifdef __ARMEB__
1334	rev	r23, r23
1335#endif
1336	pmull	v30.1q, v30.1d, v17.1d                          @ GHASH block 4k+1 - mid
1337	eor	r22, r22, r14                   @ AES block 4k+2 - round 10 high
1338#ifdef __ARMEB__
1339	rev	r22, r22
1340#endif
1341	mov	d31, v6.d[1]                                  @ GHASH block 4k+2 - mid
1342
1343	aese	q0, v19.16b
1344	aesmc	q0, q0          @ AES block 4k+4 - round 1
1345	eor	v11.16b, v11.16b, v28.16b                         @ GHASH block 4k+2 - low
1346
1347	pmull	v10.1q, q8, v10.1d                      @ GHASH block 4k - mid
1348
1349	aese	q3, v20.16b
1350	aesmc	q3, q3          @ AES block 4k+7 - round 2
1351	eor	v31.8b, v31.8b, q6                          @ GHASH block 4k+2 - mid
1352
1353	aese	q0, v20.16b
1354	aesmc	q0, q0          @ AES block 4k+4 - round 2
1355
1356	aese	q1, v22.16b
1357	aesmc	q1, q1          @ AES block 4k+5 - round 4
1358	eor	v10.16b, v10.16b, v30.16b                         @ GHASH block 4k+1 - mid
1359
1360	pmull2	v8.1q, q6, v13.2d                          @ GHASH block 4k+2 - high
1361
1362	aese	q0, v21.16b
1363	aesmc	q0, q0          @ AES block 4k+4 - round 3
1364	ins	v31.d[1], v31.d[0]                                @ GHASH block 4k+2 - mid
1365
1366	pmull2	v4.1q, q7, v12.2d                          @ GHASH block 4k+3 - high
1367
1368	aese	q2, v19.16b
1369	aesmc	q2, q2          @ AES block 4k+6 - round 1
1370	mov	d30, v7.d[1]                                  @ GHASH block 4k+3 - mid
1371
1372	aese	q0, v22.16b
1373	aesmc	q0, q0          @ AES block 4k+4 - round 4
1374	eor	q9, q9, q8                         @ GHASH block 4k+2 - high
1375
1376	pmull2	v31.1q, v31.2d, v16.2d                          @ GHASH block 4k+2 - mid
1377	eor	r24, r24, r14                   @ AES block 4k+3 - round 10 high
1378#ifdef __ARMEB__
1379	rev	r24, r24
1380#endif
1381	aese	q2, v20.16b
1382	aesmc	q2, q2          @ AES block 4k+6 - round 2
1383	eor	v30.8b, v30.8b, q7                          @ GHASH block 4k+3 - mid
1384
1385	aese	q1, v23.16b
1386	aesmc	q1, q1          @ AES block 4k+5 - round 5
1387	eor	r21, r21, r13                   @ AES block 4k+2 - round 10 low
1388#ifdef __ARMEB__
1389	rev	r21, r21
1390#endif
1391	aese	q0, v23.16b
1392	aesmc	q0, q0          @ AES block 4k+4 - round 5
1393	movi	q8, #0xc2
1394
1395	aese	q2, v21.16b
1396	aesmc	q2, q2          @ AES block 4k+6 - round 3
1397	eor	v11.16b, v11.16b, v29.16b                         @ GHASH block 4k+3 - low
1398
1399	aese	q1, v24.16b
1400	aesmc	q1, q1          @ AES block 4k+5 - round 6
1401
1402	aese	q0, v24.16b
1403	aesmc	q0, q0          @ AES block 4k+4 - round 6
1404	eor	v10.16b, v10.16b, v31.16b                         @ GHASH block 4k+2 - mid
1405
1406	aese	q2, v22.16b
1407	aesmc	q2, q2          @ AES block 4k+6 - round 4
1408	stp	r21, r22, [r2], #16        @ AES block 4k+2 - store result
1409
1410	pmull	v30.1q, v30.1d, v16.1d                          @ GHASH block 4k+3 - mid
1411	eor	q9, q9, q4                         @ GHASH block 4k+3 - high
1412	ld1	{q4}, [r0], #16                       @ AES block 4k+3 - load ciphertext
1413
1414	aese	q1, v25.16b
1415	aesmc	q1, q1          @ AES block 4k+5 - round 7
1416	add	r12, r12, #1                            @ CTR block 4k+7
1417
1418	aese	q0, v25.16b
1419	aesmc	q0, q0          @ AES block 4k+4 - round 7
1420	shl	d8, d8, #56               @ mod_constant
1421
1422	aese	q2, v23.16b
1423	aesmc	q2, q2          @ AES block 4k+6 - round 5
1424	eor	v10.16b, v10.16b, v30.16b                         @ GHASH block 4k+3 - mid
1425
1426	aese	q1, v26.16b
1427	aesmc	q1, q1          @ AES block 4k+5 - round 8
1428	stp	r23, r24, [r2], #16        @ AES block 4k+3 - store result
1429
1430	aese	q0, v26.16b
1431	aesmc	q0, q0          @ AES block 4k+4 - round 8
1432	eor	v30.16b, v11.16b, q9                         @ MODULO - karatsuba tidy up
1433
1434	aese	q3, v21.16b
1435	aesmc	q3, q3          @ AES block 4k+7 - round 3
1436	rev	r9, r12                                 @ CTR block 4k+8
1437
1438	pmull	v31.1q, q9, q8            @ MODULO - top 64b align with mid
1439	ld1	{q5}, [r0], #16                       @ AES block 4k+4 - load ciphertext
1440	ext	q9, q9, q9, #8                     @ MODULO - other top alignment
1441
1442	aese	q0, v27.16b                                      @ AES block 4k+4 - round 9
1443	orr	r9, r11, r9, lsl #32            @ CTR block 4k+8
1444
1445	aese	q3, v22.16b
1446	aesmc	q3, q3          @ AES block 4k+7 - round 4
1447	eor	v10.16b, v10.16b, v30.16b                         @ MODULO - karatsuba tidy up
1448
1449	aese	q1, v27.16b                                      @ AES block 4k+5 - round 9
1450
1451	aese	q2, v24.16b
1452	aesmc	q2, q2          @ AES block 4k+6 - round 6
1453	eor	q0, q4, q0                            @ AES block 4k+4 - result
1454
1455	aese	q3, v23.16b
1456	aesmc	q3, q3          @ AES block 4k+7 - round 5
1457	ld1	{q6}, [r0], #16                       @ AES block 4k+5 - load ciphertext
1458
1459	add	r12, r12, #1                            @ CTR block 4k+8
1460	eor	v10.16b, v10.16b, v31.16b                      @ MODULO - fold into mid
1461	eor	q1, q5, q1                            @ AES block 4k+5 - result
1462
1463	aese	q2, v25.16b
1464	aesmc	q2, q2          @ AES block 4k+6 - round 7
1465	ld1	{q7}, [r0], #16                       @ AES block 4k+6 - load ciphertext
1466
1467	aese	q3, v24.16b
1468	aesmc	q3, q3          @ AES block 4k+7 - round 6
1469
1470	rev64	q5, q5                                    @ GHASH block 4k+5
1471	eor	v10.16b, v10.16b, q9                         @ MODULO - fold into mid
1472	mov	r7, v0.d[1]                            @ AES block 4k+4 - mov high
1473
1474	aese	q2, v26.16b
1475	aesmc	q2, q2          @ AES block 4k+6 - round 8
1476	mov	r6, v0.d[0]                            @ AES block 4k+4 - mov low
1477
1478	aese	q3, v25.16b
1479	aesmc	q3, q3          @ AES block 4k+7 - round 7
1480	fmov	d0, r10                               @ CTR block 4k+8
1481
1482	pmull	v8.1q, v10.1d, q8     @ MODULO - mid 64b align with low
1483	fmov	v0.d[1], r9                               @ CTR block 4k+8
1484	rev	r9, r12                                 @ CTR block 4k+9
1485
1486	aese	q2, v27.16b                                      @ AES block 4k+6 - round 9
1487	orr	r9, r11, r9, lsl #32            @ CTR block 4k+9
1488	ext	v10.16b, v10.16b, v10.16b, #8                     @ MODULO - other mid alignment
1489
1490	aese	q3, v26.16b
1491	aesmc	q3, q3          @ AES block 4k+7 - round 8
1492	eor	r7, r7, r14                   @ AES block 4k+4 - round 10 high
1493#ifdef __ARMEB__
1494	rev	r7, r7
1495#endif
1496	eor	v11.16b, v11.16b, q8               @ MODULO - fold into low
1497	mov	r20, v1.d[1]                            @ AES block 4k+5 - mov high
1498	eor	r6, r6, r13                   @ AES block 4k+4 - round 10 low
1499#ifdef __ARMEB__
1500	rev	r6, r6
1501#endif
1502	eor	q2, q6, q2                            @ AES block 4k+6 - result
1503	mov	r19, v1.d[0]                            @ AES block 4k+5 - mov low
1504	add	r12, r12, #1                            @ CTR block 4k+9
1505
1506	aese	q3, v27.16b                                      @ AES block 4k+7 - round 9
1507	fmov	d1, r10                               @ CTR block 4k+9
1508	cmp	r0, r5                   @ .LOOP CONTROL
1509
1510	rev64	q4, q4                                    @ GHASH block 4k+4
1511	eor	v11.16b, v11.16b, v10.16b                         @ MODULO - fold into low
1512	fmov	v1.d[1], r9                               @ CTR block 4k+9
1513
1514	rev	r9, r12                                 @ CTR block 4k+10
1515	add	r12, r12, #1                            @ CTR block 4k+10
1516
1517	eor	r20, r20, r14                   @ AES block 4k+5 - round 10 high
1518#ifdef __ARMEB__
1519	rev	r20, r20
1520#endif
1521	stp	r6, r7, [r2], #16        @ AES block 4k+4 - store result
1522
1523	eor	r19, r19, r13                   @ AES block 4k+5 - round 10 low
1524#ifdef __ARMEB__
1525	rev	r19, r19
1526#endif
1527	stp	r19, r20, [r2], #16        @ AES block 4k+5 - store result
1528
1529	orr	r9, r11, r9, lsl #32            @ CTR block 4k+10
1530	blt	.L128_dec_main_loop
1531
1532.L128_dec_prepretail:@ PREPRETAIL
1533	ext	v11.16b, v11.16b, v11.16b, #8                     @ PRE 0
1534	mov	r21, v2.d[0]                            @ AES block 4k+2 - mov low
1535	mov	d30, v5.d[1]                                  @ GHASH block 4k+1 - mid
1536
1537	aese	q0, v18.16b
1538	aesmc	q0, q0          @ AES block 4k+4 - round 0
1539	eor	q3, q7, q3                            @ AES block 4k+3 - result
1540
1541	aese	q1, v18.16b
1542	aesmc	q1, q1          @ AES block 4k+5 - round 0
1543	mov	r22, v2.d[1]                            @ AES block 4k+2 - mov high
1544
1545	eor	q4, q4, v11.16b                           @ PRE 1
1546	fmov	d2, r10                               @ CTR block 4k+6
1547	rev64	q6, q6                                    @ GHASH block 4k+2
1548
1549	aese	q0, v19.16b
1550	aesmc	q0, q0          @ AES block 4k+4 - round 1
1551	fmov	v2.d[1], r9                               @ CTR block 4k+6
1552
1553	rev	r9, r12                                 @ CTR block 4k+7
1554	mov	r23, v3.d[0]                            @ AES block 4k+3 - mov low
1555	eor	v30.8b, v30.8b, q5                          @ GHASH block 4k+1 - mid
1556
1557	pmull	v11.1q, q4, v15.1d                       @ GHASH block 4k - low
1558	mov	d10, v17.d[1]                               @ GHASH block 4k - mid
1559	mov	r24, v3.d[1]                            @ AES block 4k+3 - mov high
1560
1561	aese	q1, v19.16b
1562	aesmc	q1, q1          @ AES block 4k+5 - round 1
1563	mov	d31, v6.d[1]                                  @ GHASH block 4k+2 - mid
1564
1565	aese	q0, v20.16b
1566	aesmc	q0, q0          @ AES block 4k+4 - round 2
1567	orr	r9, r11, r9, lsl #32            @ CTR block 4k+7
1568
1569	pmull	v29.1q, q5, v14.1d                          @ GHASH block 4k+1 - low
1570	mov	d8, v4.d[1]                                  @ GHASH block 4k - mid
1571	fmov	d3, r10                               @ CTR block 4k+7
1572
1573	aese	q2, v18.16b
1574	aesmc	q2, q2          @ AES block 4k+6 - round 0
1575	fmov	v3.d[1], r9                               @ CTR block 4k+7
1576
1577	pmull	v30.1q, v30.1d, v17.1d                          @ GHASH block 4k+1 - mid
1578	eor	v31.8b, v31.8b, q6                          @ GHASH block 4k+2 - mid
1579
1580	rev64	q7, q7                                    @ GHASH block 4k+3
1581
1582	aese	q2, v19.16b
1583	aesmc	q2, q2          @ AES block 4k+6 - round 1
1584	eor	q8, q8, q4                          @ GHASH block 4k - mid
1585
1586	pmull2	v9.1q, q4, v15.2d                       @ GHASH block 4k - high
1587
1588	aese	q3, v18.16b
1589	aesmc	q3, q3          @ AES block 4k+7 - round 0
1590	ins	v31.d[1], v31.d[0]                                @ GHASH block 4k+2 - mid
1591
1592	pmull2	v28.1q, q5, v14.2d                          @ GHASH block 4k+1 - high
1593
1594	pmull	v10.1q, q8, v10.1d                      @ GHASH block 4k - mid
1595	eor	v11.16b, v11.16b, v29.16b                         @ GHASH block 4k+1 - low
1596
1597	pmull	v29.1q, q7, v12.1d                          @ GHASH block 4k+3 - low
1598
1599	pmull2	v31.1q, v31.2d, v16.2d                          @ GHASH block 4k+2 - mid
1600	eor	q9, q9, v28.16b                         @ GHASH block 4k+1 - high
1601
1602	eor	v10.16b, v10.16b, v30.16b                         @ GHASH block 4k+1 - mid
1603
1604	pmull2	v4.1q, q7, v12.2d                          @ GHASH block 4k+3 - high
1605
1606	pmull2	v8.1q, q6, v13.2d                          @ GHASH block 4k+2 - high
1607	mov	d30, v7.d[1]                                  @ GHASH block 4k+3 - mid
1608
1609	aese	q1, v20.16b
1610	aesmc	q1, q1          @ AES block 4k+5 - round 2
1611	eor	v10.16b, v10.16b, v31.16b                         @ GHASH block 4k+2 - mid
1612
1613	pmull	v28.1q, q6, v13.1d                          @ GHASH block 4k+2 - low
1614
1615	eor	q9, q9, q8                         @ GHASH block 4k+2 - high
1616	movi	q8, #0xc2
1617
1618	aese	q3, v19.16b
1619	aesmc	q3, q3          @ AES block 4k+7 - round 1
1620	eor	v30.8b, v30.8b, q7                          @ GHASH block 4k+3 - mid
1621
1622	eor	v11.16b, v11.16b, v28.16b                         @ GHASH block 4k+2 - low
1623
1624	aese	q2, v20.16b
1625	aesmc	q2, q2          @ AES block 4k+6 - round 2
1626	eor	q9, q9, q4                         @ GHASH block 4k+3 - high
1627
1628	aese	q3, v20.16b
1629	aesmc	q3, q3          @ AES block 4k+7 - round 2
1630	eor	r23, r23, r13                   @ AES block 4k+3 - round 10 low
1631#ifdef __ARMEB__
1632	rev	r23, r23
1633#endif
1634	pmull	v30.1q, v30.1d, v16.1d                          @ GHASH block 4k+3 - mid
1635	eor	r21, r21, r13                   @ AES block 4k+2 - round 10 low
1636#ifdef __ARMEB__
1637	rev	r21, r21
1638#endif
1639	eor	v11.16b, v11.16b, v29.16b                         @ GHASH block 4k+3 - low
1640
1641	aese	q2, v21.16b
1642	aesmc	q2, q2          @ AES block 4k+6 - round 3
1643
1644	aese	q1, v21.16b
1645	aesmc	q1, q1          @ AES block 4k+5 - round 3
1646	shl	d8, d8, #56               @ mod_constant
1647
1648	aese	q0, v21.16b
1649	aesmc	q0, q0          @ AES block 4k+4 - round 3
1650
1651	aese	q2, v22.16b
1652	aesmc	q2, q2          @ AES block 4k+6 - round 4
1653	eor	v10.16b, v10.16b, v30.16b                         @ GHASH block 4k+3 - mid
1654
1655	aese	q1, v22.16b
1656	aesmc	q1, q1          @ AES block 4k+5 - round 4
1657
1658	aese	q3, v21.16b
1659	aesmc	q3, q3          @ AES block 4k+7 - round 3
1660	eor	v30.16b, v11.16b, q9                         @ MODULO - karatsuba tidy up
1661
1662	aese	q2, v23.16b
1663	aesmc	q2, q2          @ AES block 4k+6 - round 5
1664
1665	aese	q1, v23.16b
1666	aesmc	q1, q1          @ AES block 4k+5 - round 5
1667
1668	aese	q3, v22.16b
1669	aesmc	q3, q3          @ AES block 4k+7 - round 4
1670
1671	aese	q0, v22.16b
1672	aesmc	q0, q0          @ AES block 4k+4 - round 4
1673	eor	v10.16b, v10.16b, v30.16b                         @ MODULO - karatsuba tidy up
1674
1675	pmull	v31.1q, q9, q8            @ MODULO - top 64b align with mid
1676
1677	aese	q1, v24.16b
1678	aesmc	q1, q1          @ AES block 4k+5 - round 6
1679	ext	q9, q9, q9, #8                     @ MODULO - other top alignment
1680
1681	aese	q3, v23.16b
1682	aesmc	q3, q3          @ AES block 4k+7 - round 5
1683
1684	aese	q0, v23.16b
1685	aesmc	q0, q0          @ AES block 4k+4 - round 5
1686	eor	v10.16b, v10.16b, v31.16b                      @ MODULO - fold into mid
1687
1688	aese	q1, v25.16b
1689	aesmc	q1, q1          @ AES block 4k+5 - round 7
1690
1691	aese	q2, v24.16b
1692	aesmc	q2, q2          @ AES block 4k+6 - round 6
1693
1694	aese	q0, v24.16b
1695	aesmc	q0, q0          @ AES block 4k+4 - round 6
1696
1697	aese	q1, v26.16b
1698	aesmc	q1, q1          @ AES block 4k+5 - round 8
1699	eor	v10.16b, v10.16b, q9                         @ MODULO - fold into mid
1700
1701	aese	q3, v24.16b
1702	aesmc	q3, q3          @ AES block 4k+7 - round 6
1703
1704	aese	q0, v25.16b
1705	aesmc	q0, q0          @ AES block 4k+4 - round 7
1706
1707	aese	q1, v27.16b                                      @ AES block 4k+5 - round 9
1708
1709	pmull	v8.1q, v10.1d, q8     @ MODULO - mid 64b align with low
1710	eor	r24, r24, r14                   @ AES block 4k+3 - round 10 high
1711#ifdef __ARMEB__
1712	rev	r24, r24
1713#endif
1714	aese	q2, v25.16b
1715	aesmc	q2, q2          @ AES block 4k+6 - round 7
1716	ext	v10.16b, v10.16b, v10.16b, #8                     @ MODULO - other mid alignment
1717
1718	aese	q3, v25.16b
1719	aesmc	q3, q3          @ AES block 4k+7 - round 7
1720
1721	aese	q0, v26.16b
1722	aesmc	q0, q0          @ AES block 4k+4 - round 8
1723	eor	v11.16b, v11.16b, q8               @ MODULO - fold into low
1724
1725	aese	q2, v26.16b
1726	aesmc	q2, q2          @ AES block 4k+6 - round 8
1727
1728	aese	q3, v26.16b
1729	aesmc	q3, q3          @ AES block 4k+7 - round 8
1730	eor	r22, r22, r14                   @ AES block 4k+2 - round 10 high
1731#ifdef __ARMEB__
1732	rev	r22, r22
1733#endif
1734	aese	q0, v27.16b                                      @ AES block 4k+4 - round 9
1735	stp	r21, r22, [r2], #16        @ AES block 4k+2 - store result
1736
1737	aese	q2, v27.16b                                      @ AES block 4k+6 - round 9
1738	add	r12, r12, #1                            @ CTR block 4k+7
1739	stp	r23, r24, [r2], #16        @ AES block 4k+3 - store result
1740
1741	aese	q3, v27.16b                                      @ AES block 4k+7 - round 9
1742	eor	v11.16b, v11.16b, v10.16b                         @ MODULO - fold into low
1743.L128_dec_tail:@ TAIL
1744
1745	sub	r5, r4, r0   @ main_end_input_ptr is number of bytes left to process
1746	ld1	{ q5}, [r0], #16                      @ AES block 4k+4 - load ciphertext
1747
1748	eor	q0, q5, q0                            @ AES block 4k+4 - result
1749
1750	mov	r7, v0.d[1]                            @ AES block 4k+4 - mov high
1751
1752	mov	r6, v0.d[0]                            @ AES block 4k+4 - mov low
1753
1754	cmp	r5, #48
1755
1756	eor	r7, r7, r14                   @ AES block 4k+4 - round 10 high
1757#ifdef __ARMEB__
1758	rev	r7, r7
1759#endif
1760	ext	q8, v11.16b, v11.16b, #8                     @ prepare final partial tag
1761	eor	r6, r6, r13                   @ AES block 4k+4 - round 10 low
1762#ifdef __ARMEB__
1763	rev	r6, r6
1764#endif
1765	bgt	.L128_dec_blocks_more_than_3
1766
1767	mov	q3, q2
1768	sub	r12, r12, #1
1769	movi	v11.8b, #0
1770
1771	movi	q9, #0
1772	mov	q2, q1
1773
1774	movi	v10.8b, #0
1775	cmp	r5, #32
1776	bgt	.L128_dec_blocks_more_than_2
1777
1778	cmp	r5, #16
1779
1780	mov	q3, q1
1781	sub	r12, r12, #1
1782	bgt	.L128_dec_blocks_more_than_1
1783
1784	sub	r12, r12, #1
1785	b	.L128_dec_blocks_less_than_1
1786.L128_dec_blocks_more_than_3:@ blocks left >  3
1787	rev64	q4, q5                                    @ GHASH final-3 block
1788	ld1	{ q5}, [r0], #16                      @ AES final-2 block - load ciphertext
1789
1790	eor	q4, q4, q8                           @ feed in partial tag
1791
1792	mov	d10, v17.d[1]                               @ GHASH final-3 block - mid
1793	stp	r6, r7, [r2], #16        @ AES final-3 block  - store result
1794	eor	q0, q5, q1                            @ AES final-2 block - result
1795
1796	mov	d22, v4.d[1]                                 @ GHASH final-3 block - mid
1797	mov	r7, v0.d[1]                            @ AES final-2 block - mov high
1798
1799	pmull	v11.1q, q4, v15.1d                       @ GHASH final-3 block - low
1800	mov	r6, v0.d[0]                            @ AES final-2 block - mov low
1801
1802	pmull2	v9.1q, q4, v15.2d                       @ GHASH final-3 block - high
1803
1804	eor	v22.8b, v22.8b, q4                      @ GHASH final-3 block - mid
1805
1806	movi	q8, #0                                        @ suppress further partial tag feed in
1807	eor	r7, r7, r14                   @ AES final-2 block - round 10 high
1808#ifdef __ARMEB__
1809	rev	r7, r7
1810#endif
1811	pmull	v10.1q, v22.1d, v10.1d                    @ GHASH final-3 block - mid
1812	eor	r6, r6, r13                   @ AES final-2 block - round 10 low
1813#ifdef __ARMEB__
1814	rev	r6, r6
1815#endif
1816.L128_dec_blocks_more_than_2:@ blocks left >  2
1817
1818	rev64	q4, q5                                    @ GHASH final-2 block
1819	ld1	{ q5}, [r0], #16                      @ AES final-1 block - load ciphertext
1820
1821	eor	q4, q4, q8                           @ feed in partial tag
1822
1823	eor	q0, q5, q2                            @ AES final-1 block - result
1824	stp	r6, r7, [r2], #16        @ AES final-2 block  - store result
1825
1826	mov	d22, v4.d[1]                                 @ GHASH final-2 block - mid
1827
1828	pmull	v21.1q, q4, v14.1d                          @ GHASH final-2 block - low
1829
1830	pmull2	v20.1q, q4, v14.2d                          @ GHASH final-2 block - high
1831	mov	r6, v0.d[0]                            @ AES final-1 block - mov low
1832
1833	mov	r7, v0.d[1]                            @ AES final-1 block - mov high
1834	eor	v22.8b, v22.8b, q4                      @ GHASH final-2 block - mid
1835
1836	movi	q8, #0                                        @ suppress further partial tag feed in
1837
1838	pmull	v22.1q, v22.1d, v17.1d                      @ GHASH final-2 block - mid
1839
1840	eor	r6, r6, r13                   @ AES final-1 block - round 10 low
1841#ifdef __ARMEB__
1842	rev	r6, r6
1843#endif
1844	eor	v11.16b, v11.16b, v21.16b                            @ GHASH final-2 block - low
1845
1846	eor	q9, q9, v20.16b                            @ GHASH final-2 block - high
1847
1848	eor	v10.16b, v10.16b, v22.16b                       @ GHASH final-2 block - mid
1849	eor	r7, r7, r14                   @ AES final-1 block - round 10 high
1850#ifdef __ARMEB__
1851	rev	r7, r7
1852#endif
1853.L128_dec_blocks_more_than_1:@ blocks left >  1
1854
1855	rev64	q4, q5                                    @ GHASH final-1 block
1856
1857	ld1	{ q5}, [r0], #16                      @ AES final block - load ciphertext
1858	eor	q4, q4, q8                           @ feed in partial tag
1859
1860	mov	d22, v4.d[1]                                 @ GHASH final-1 block - mid
1861
1862	eor	q0, q5, q3                            @ AES final block - result
1863
1864	eor	v22.8b, v22.8b, q4                      @ GHASH final-1 block - mid
1865
1866	stp	r6, r7, [r2], #16        @ AES final-1 block  - store result
1867	mov	r6, v0.d[0]                            @ AES final block - mov low
1868
1869	mov	r7, v0.d[1]                            @ AES final block - mov high
1870	ins	v22.d[1], v22.d[0]                            @ GHASH final-1 block - mid
1871
1872	pmull	v21.1q, q4, v13.1d                          @ GHASH final-1 block - low
1873
1874	pmull2	v20.1q, q4, v13.2d                          @ GHASH final-1 block - high
1875
1876	pmull2	v22.1q, v22.2d, v16.2d                      @ GHASH final-1 block - mid
1877	movi	q8, #0                                        @ suppress further partial tag feed in
1878
1879	eor	v11.16b, v11.16b, v21.16b                            @ GHASH final-1 block - low
1880
1881	eor	q9, q9, v20.16b                            @ GHASH final-1 block - high
1882	eor	r7, r7, r14                   @ AES final block - round 10 high
1883#ifdef __ARMEB__
1884	rev	r7, r7
1885#endif
1886	eor	r6, r6, r13                   @ AES final block - round 10 low
1887#ifdef __ARMEB__
1888	rev	r6, r6
1889#endif
1890	eor	v10.16b, v10.16b, v22.16b                       @ GHASH final-1 block - mid
1891.L128_dec_blocks_less_than_1:@ blocks left <= 1
1892
1893	mvn	r14, xzr                                      @ rk10_h = 0xffffffffffffffff
1894	and	r1, r1, #127                    @ bit_length %= 128
1895
1896	mvn	r13, xzr                                      @ rk10_l = 0xffffffffffffffff
1897	sub	r1, r1, #128                    @ bit_length -= 128
1898
1899	neg	r1, r1                          @ bit_length = 128 - #bits in input (in range [1,128])
1900
1901	and	r1, r1, #127                    @ bit_length %= 128
1902
1903	lsr	r14, r14, r1                     @ rk10_h is mask for top 64b of last block
1904	cmp	r1, #64
1905
1906	csel	r10, r14, xzr, lt
1907	csel	r9, r13, r14, lt
1908
1909	fmov	d0, r9                                   @ ctr0b is mask for last block
1910
1911	mov	v0.d[1], r10
1912
1913	and	q5, q5, q0                            @ possibly partial last block has zeroes in highest bits
1914
1915	rev64	q4, q5                                    @ GHASH final block
1916
1917	eor	q4, q4, q8                           @ feed in partial tag
1918
1919	ldp	r4, r5, [r2] @ load existing bytes we need to not overwrite
1920
1921	and	r7, r7, r10
1922
1923	pmull2	v20.1q, q4, v12.2d                          @ GHASH final block - high
1924	mov	d8, v4.d[1]                                  @ GHASH final block - mid
1925
1926	eor	q8, q8, q4                          @ GHASH final block - mid
1927	eor	q9, q9, v20.16b                            @ GHASH final block - high
1928
1929	pmull	v8.1q, q8, v16.1d                          @ GHASH final block - mid
1930
1931	pmull	v21.1q, q4, v12.1d                          @ GHASH final block - low
1932	bic	r4, r4, r9           @ mask out low existing bytes
1933	and	r6, r6, r9
1934
1935#ifndef __ARMEB__
1936	rev	r9, r12
1937#else
1938	mov	r9, r12
1939#endif
1940
1941	eor	v10.16b, v10.16b, q8                         @ GHASH final block - mid
1942	movi	q8, #0xc2
1943
1944	eor	v11.16b, v11.16b, v21.16b                            @ GHASH final block - low
1945
1946	bic	r5, r5, r10   @ mask out high existing bytes
1947	shl	d8, d8, #56               @ mod_constant
1948
1949	eor	v30.16b, v11.16b, q9                         @ MODULO - karatsuba tidy up
1950
1951	pmull	v31.1q, q9, q8            @ MODULO - top 64b align with mid
1952
1953	eor	v10.16b, v10.16b, v30.16b                         @ MODULO - karatsuba tidy up
1954
1955	orr	r6, r6, r4
1956	str	r9, [r16, #12]                          @ store the updated counter
1957
1958	orr	r7, r7, r5
1959	stp	r6, r7, [r2]
1960	ext	q9, q9, q9, #8                     @ MODULO - other top alignment
1961
1962	eor	v10.16b, v10.16b, v31.16b                      @ MODULO - fold into mid
1963
1964	eor	v10.16b, v10.16b, q9                         @ MODULO - fold into mid
1965
1966	pmull	v8.1q, v10.1d, q8     @ MODULO - mid 64b align with low
1967	ext	v10.16b, v10.16b, v10.16b, #8                     @ MODULO - other mid alignment
1968
1969	eor	v11.16b, v11.16b, q8               @ MODULO - fold into low
1970
1971	eor	v11.16b, v11.16b, v10.16b                         @ MODULO - fold into low
1972	ext	v11.16b, v11.16b, v11.16b, #8
1973	rev64	v11.16b, v11.16b
1974	mov	r0, r15
1975	st1	{ v11.16b }, [r3]
1976
1977	ldp	r21, r22, [sp, #16]
1978	ldp	r23, r24, [sp, #32]
1979	ldp	d8, d9, [sp, #48]
1980	ldp	d10, d11, [sp, #64]
1981	ldp	d12, d13, [sp, #80]
1982	ldp	d14, d15, [sp, #96]
1983	ldp	r19, r20, [sp], #112
1984	RET
1985
1986.L128_dec_ret:
1987	mov	r0, #0x0
1988	RET
1989.size	aes_gcm_dec_128_kernel,.-aes_gcm_dec_128_kernel
1990.globl	aes_gcm_enc_192_kernel
1991.type	aes_gcm_enc_192_kernel,%function
1992.align	4
1993aes_gcm_enc_192_kernel:
1994	cbz	r1, .L192_enc_ret
1995	stp	r19, r20, [sp, #-112]!
1996	mov	r16, r4
1997	mov	r8, r5
1998	stp	r21, r22, [sp, #16]
1999	stp	r23, r24, [sp, #32]
2000	stp	d8, d9, [sp, #48]
2001	stp	d10, d11, [sp, #64]
2002	stp	d12, d13, [sp, #80]
2003	stp	d14, d15, [sp, #96]
2004
2005	ldp	r10, r11, [r16]             @ ctr96_b64, ctr96_t32
2006#ifdef __ARMEB__
2007	rev	r10, r10
2008	rev	r11, r11
2009#endif
2010	ldp	r13, r14, [r8, #192]                     @ load rk12
2011#ifdef __ARMEB__
2012	ror	r13, r13, #32
2013	ror	r14, r14, #32
2014#endif
2015	ld1	{v18.4s}, [r8], #16	                             @ load rk0
2016
2017	ld1	{v19.4s}, [r8], #16	                             @ load rk1
2018
2019	ld1	{v20.4s}, [r8], #16	                             @ load rk2
2020
2021	lsr	r12, r11, #32
2022	ld1	{v21.4s}, [r8], #16	                             @ load rk3
2023	orr	r11, r11, r11
2024
2025	ld1	{v22.4s}, [r8], #16	                             @ load rk4
2026	rev	r12, r12                               @ rev_ctr32
2027
2028	add	r12, r12, #1                           @ increment rev_ctr32
2029	fmov	d3, r10                              @ CTR block 3
2030
2031	rev	r9, r12                                @ CTR block 1
2032	add	r12, r12, #1                           @ CTR block 1
2033	fmov	d1, r10                              @ CTR block 1
2034
2035	orr	r9, r11, r9, lsl #32           @ CTR block 1
2036	ld1	{ q0}, [r16]                            @ special case vector load initial counter so we can start first AES block as quickly as possible
2037
2038	fmov	v1.d[1], r9                              @ CTR block 1
2039	rev	r9, r12                                @ CTR block 2
2040	add	r12, r12, #1                           @ CTR block 2
2041
2042	fmov	d2, r10                              @ CTR block 2
2043	orr	r9, r11, r9, lsl #32           @ CTR block 2
2044
2045	fmov	v2.d[1], r9                              @ CTR block 2
2046	rev	r9, r12                                @ CTR block 3
2047
2048	orr	r9, r11, r9, lsl #32           @ CTR block 3
2049	ld1	{v23.4s}, [r8], #16	                             @ load rk5
2050
2051	fmov	v3.d[1], r9                              @ CTR block 3
2052
2053	ld1	{v24.4s}, [r8], #16	                             @ load rk6
2054
2055	ld1	{v25.4s}, [r8], #16	                             @ load rk7
2056
2057	aese	q0, v18.16b
2058	aesmc	q0, q0         @ AES block 0 - round 0
2059	ld1	{ v11.16b}, [r3]
2060	ext	v11.16b, v11.16b, v11.16b, #8
2061	rev64	v11.16b, v11.16b
2062
2063	aese	q3, v18.16b
2064	aesmc	q3, q3         @ AES block 3 - round 0
2065	ld1	{v26.4s}, [r8], #16	                             @ load rk8
2066
2067	aese	q1, v18.16b
2068	aesmc	q1, q1         @ AES block 1 - round 0
2069	ldr	q15, [r3, #112]                       @ load h4l | h4h
2070#ifndef __ARMEB__
2071	ext	v15.16b, v15.16b, v15.16b, #8
2072#endif
2073	aese	q2, v18.16b
2074	aesmc	q2, q2         @ AES block 2 - round 0
2075	ld1	{v27.4s}, [r8], #16	                             @ load rk9
2076
2077	aese	q0, v19.16b
2078	aesmc	q0, q0         @ AES block 0 - round 1
2079	ld1	{v28.4s}, [r8], #16	                         @ load rk10
2080
2081	aese	q1, v19.16b
2082	aesmc	q1, q1         @ AES block 1 - round 1
2083	ldr	q12, [r3, #32]                        @ load h1l | h1h
2084#ifndef __ARMEB__
2085	ext	v12.16b, v12.16b, v12.16b, #8
2086#endif
2087	aese	q2, v19.16b
2088	aesmc	q2, q2         @ AES block 2 - round 1
2089	ld1	{v29.4s}, [r8], #16	                         @ load rk11
2090
2091	aese	q3, v19.16b
2092	aesmc	q3, q3         @ AES block 3 - round 1
2093	ldr	q14, [r3, #80]                        @ load h3l | h3h
2094#ifndef __ARMEB__
2095	ext	v14.16b, v14.16b, v14.16b, #8
2096#endif
2097	aese	q0, v20.16b
2098	aesmc	q0, q0         @ AES block 0 - round 2
2099
2100	aese	q2, v20.16b
2101	aesmc	q2, q2         @ AES block 2 - round 2
2102
2103	aese	q3, v20.16b
2104	aesmc	q3, q3         @ AES block 3 - round 2
2105
2106	aese	q0, v21.16b
2107	aesmc	q0, q0         @ AES block 0 - round 3
2108	trn1	q9, v14.2d,    v15.2d                     @ h4h | h3h
2109
2110	aese	q2, v21.16b
2111	aesmc	q2, q2         @ AES block 2 - round 3
2112
2113	aese	q1, v20.16b
2114	aesmc	q1, q1         @ AES block 1 - round 2
2115	trn2	v17.2d,  v14.2d,    v15.2d                     @ h4l | h3l
2116
2117	aese	q0, v22.16b
2118	aesmc	q0, q0         @ AES block 0 - round 4
2119
2120	aese	q3, v21.16b
2121	aesmc	q3, q3         @ AES block 3 - round 3
2122
2123	aese	q1, v21.16b
2124	aesmc	q1, q1         @ AES block 1 - round 3
2125
2126	aese	q0, v23.16b
2127	aesmc	q0, q0         @ AES block 0 - round 5
2128
2129	aese	q2, v22.16b
2130	aesmc	q2, q2         @ AES block 2 - round 4
2131
2132	aese	q1, v22.16b
2133	aesmc	q1, q1         @ AES block 1 - round 4
2134
2135	aese	q0, v24.16b
2136	aesmc	q0, q0         @ AES block 0 - round 6
2137
2138	aese	q3, v22.16b
2139	aesmc	q3, q3         @ AES block 3 - round 4
2140
2141	aese	q2, v23.16b
2142	aesmc	q2, q2         @ AES block 2 - round 5
2143
2144	aese	q1, v23.16b
2145	aesmc	q1, q1         @ AES block 1 - round 5
2146
2147	aese	q3, v23.16b
2148	aesmc	q3, q3         @ AES block 3 - round 5
2149
2150	aese	q2, v24.16b
2151	aesmc	q2, q2         @ AES block 2 - round 6
2152	ldr	q13, [r3, #64]                        @ load h2l | h2h
2153#ifndef __ARMEB__
2154	ext	v13.16b, v13.16b, v13.16b, #8
2155#endif
2156	aese	q1, v24.16b
2157	aesmc	q1, q1         @ AES block 1 - round 6
2158
2159	aese	q3, v24.16b
2160	aesmc	q3, q3         @ AES block 3 - round 6
2161
2162	aese	q0, v25.16b
2163	aesmc	q0, q0         @ AES block 0 - round 7
2164
2165	aese	q1, v25.16b
2166	aesmc	q1, q1         @ AES block 1 - round 7
2167	trn2	v16.2d,  v12.2d,    v13.2d                     @ h2l | h1l
2168
2169	aese	q3, v25.16b
2170	aesmc	q3, q3         @ AES block 3 - round 7
2171
2172	aese	q0, v26.16b
2173	aesmc	q0, q0         @ AES block 0 - round 8
2174
2175	aese	q2, v25.16b
2176	aesmc	q2, q2         @ AES block 2 - round 7
2177	trn1	q8,    v12.2d,    v13.2d                     @ h2h | h1h
2178
2179	aese	q1, v26.16b
2180	aesmc	q1, q1         @ AES block 1 - round 8
2181
2182	aese	q3, v26.16b
2183	aesmc	q3, q3         @ AES block 3 - round 8
2184
2185	aese	q2, v26.16b
2186	aesmc	q2, q2         @ AES block 2 - round 8
2187
2188	aese	q0, v27.16b
2189	aesmc	q0, q0         @ AES block 0 - round 9
2190
2191	aese	q3, v27.16b
2192	aesmc	q3, q3         @ AES block 3 - round 9
2193
2194	aese	q2, v27.16b
2195	aesmc	q2, q2         @ AES block 2 - round 9
2196
2197	aese	q1, v27.16b
2198	aesmc	q1, q1         @ AES block 1 - round 9
2199
2200	aese	q0, v28.16b
2201	aesmc	q0, q0         @ AES block 0 - round 10
2202
2203	aese	q2, v28.16b
2204	aesmc	q2, q2         @ AES block 2 - round 10
2205
2206	aese	q1, v28.16b
2207	aesmc	q1, q1         @ AES block 1 - round 10
2208	lsr	r5, r1, #3             @ byte_len
2209	mov	r15, r5
2210
2211	aese	q3, v28.16b
2212	aesmc	q3, q3         @ AES block 3 - round 10
2213	sub	r5, r5, #1     @ byte_len - 1
2214
2215	eor	v16.16b, v16.16b, q8                    @ h2k | h1k
2216	and	r5, r5, #0xffffffffffffffc0   @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
2217
2218	eor	v17.16b, v17.16b, q9                 @ h4k | h3k
2219
2220	aese	q2, v29.16b                                    @ AES block 2 - round 11
2221	add	r4, r0, r1, lsr #3  @ end_input_ptr
2222	add	r5, r5, r0
2223
2224	aese	q1, v29.16b                                    @ AES block 1 - round 11
2225	cmp	r0, r5                  @ check if we have <= 4 blocks
2226
2227	aese	q0, v29.16b                                    @ AES block 0 - round 11
2228	add	r12, r12, #1                           @ CTR block 3
2229
2230	aese	q3, v29.16b                                    @ AES block 3 - round 11
2231	bge	.L192_enc_tail                                   @ handle tail
2232
2233	rev	r9, r12                                @ CTR block 4
2234	ldp	r6, r7, [r0, #0]           @ AES block 0 - load plaintext
2235#ifdef __ARMEB__
2236	rev	r6, r6
2237	rev	r7, r7
2238#endif
2239	orr	r9, r11, r9, lsl #32           @ CTR block 4
2240	ldp	r21, r22, [r0, #32]          @ AES block 2 - load plaintext
2241#ifdef __ARMEB__
2242	rev	r21, r21
2243	rev	r22, r22
2244#endif
2245	ldp	r23, r24, [r0, #48]          @ AES block 3 - load plaintext
2246#ifdef __ARMEB__
2247	rev	r23, r23
2248	rev	r24, r24
2249#endif
2250	ldp	r19, r20, [r0, #16]          @ AES block 1 - load plaintext
2251#ifdef __ARMEB__
2252	rev	r19, r19
2253	rev	r20, r20
2254#endif
2255	add	r0, r0, #64                      @ AES input_ptr update
2256	cmp	r0, r5                  @ check if we have <= 8 blocks
2257
2258	eor	r6, r6, r13                    @ AES block 0 - round 12 low
2259
2260	eor	r7, r7, r14                    @ AES block 0 - round 12 high
2261	eor	r22, r22, r14                    @ AES block 2 - round 12 high
2262	fmov	d4, r6                              @ AES block 0 - mov low
2263
2264	eor	r24, r24, r14                    @ AES block 3 - round 12 high
2265	fmov	v4.d[1], r7                          @ AES block 0 - mov high
2266
2267	eor	r21, r21, r13                    @ AES block 2 - round 12 low
2268	eor	r19, r19, r13                    @ AES block 1 - round 12 low
2269
2270	fmov	d5, r19                              @ AES block 1 - mov low
2271	eor	r20, r20, r14                    @ AES block 1 - round 12 high
2272
2273	fmov	v5.d[1], r20                          @ AES block 1 - mov high
2274
2275	eor	r23, r23, r13                    @ AES block 3 - round 12 low
2276	fmov	d6, r21                              @ AES block 2 - mov low
2277
2278	add	r12, r12, #1                           @ CTR block 4
2279	eor	q4, q4, q0                         @ AES block 0 - result
2280	fmov	d0, r10                              @ CTR block 4
2281
2282	fmov	v0.d[1], r9                              @ CTR block 4
2283	rev	r9, r12                                @ CTR block 5
2284
2285	orr	r9, r11, r9, lsl #32           @ CTR block 5
2286	add	r12, r12, #1                           @ CTR block 5
2287
2288	fmov	d7, r23                              @ AES block 3 - mov low
2289	st1	{ q4}, [r2], #16                    @ AES block 0 - store result
2290
2291	fmov	v6.d[1], r22                          @ AES block 2 - mov high
2292
2293	eor	q5, q5, q1                         @ AES block 1 - result
2294	fmov	d1, r10                              @ CTR block 5
2295	st1	{ q5}, [r2], #16                    @ AES block 1 - store result
2296
2297	fmov	v7.d[1], r24                          @ AES block 3 - mov high
2298
2299	fmov	v1.d[1], r9                              @ CTR block 5
2300	rev	r9, r12                                @ CTR block 6
2301
2302	orr	r9, r11, r9, lsl #32           @ CTR block 6
2303
2304	add	r12, r12, #1                           @ CTR block 6
2305	eor	q6, q6, q2                         @ AES block 2 - result
2306	fmov	d2, r10                              @ CTR block 6
2307
2308	fmov	v2.d[1], r9                              @ CTR block 6
2309	rev	r9, r12                                @ CTR block 7
2310
2311	orr	r9, r11, r9, lsl #32           @ CTR block 7
2312	st1	{ q6}, [r2], #16                    @ AES block 2 - store result
2313
2314	eor	q7, q7, q3                         @ AES block 3 - result
2315	st1	{ q7}, [r2], #16                    @ AES block 3 - store result
2316	bge	.L192_enc_prepretail                             @ do prepretail
2317
2318.L192_enc_main_loop:@ main loop start
2319	aese	q2, v18.16b
2320	aesmc	q2, q2         @ AES block 4k+6 - round 0
2321	rev64	q5, q5                                   @ GHASH block 4k+1 (t0 and t1 free)
2322
2323	aese	q1, v18.16b
2324	aesmc	q1, q1         @ AES block 4k+5 - round 0
2325	ldp	r19, r20, [r0, #16]          @ AES block 4k+5 - load plaintext
2326#ifdef __ARMEB__
2327	rev	r19, r19
2328	rev	r20, r20
2329#endif
2330	ext	v11.16b, v11.16b, v11.16b, #8                    @ PRE 0
2331	fmov	d3, r10                              @ CTR block 4k+3
2332	rev64	q4, q4                                   @ GHASH block 4k (only t0 is free)
2333
2334	aese	q2, v19.16b
2335	aesmc	q2, q2         @ AES block 4k+6 - round 1
2336	fmov	v3.d[1], r9                              @ CTR block 4k+3
2337
2338	pmull2	v30.1q, q5, v14.2d                         @ GHASH block 4k+1 - high
2339	rev64	q7, q7                                   @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
2340	ldp	r21, r22, [r0, #32]          @ AES block 4k+6 - load plaintext
2341#ifdef __ARMEB__
2342	rev	r21, r21
2343	rev	r22, r22
2344#endif
2345	aese	q0, v18.16b
2346	aesmc	q0, q0         @ AES block 4k+4 - round 0
2347	ldp	r23, r24, [r0, #48]          @ AES block 4k+3 - load plaintext
2348#ifdef __ARMEB__
2349	rev	r23, r23
2350	rev	r24, r24
2351#endif
2352	pmull	v31.1q, q5, v14.1d                         @ GHASH block 4k+1 - low
2353	eor	q4, q4, v11.16b                          @ PRE 1
2354
2355	aese	q1, v19.16b
2356	aesmc	q1, q1         @ AES block 4k+5 - round 1
2357
2358	aese	q0, v19.16b
2359	aesmc	q0, q0         @ AES block 4k+4 - round 1
2360	rev64	q6, q6                                   @ GHASH block 4k+2 (t0, t1, and t2 free)
2361
2362	aese	q3, v18.16b
2363	aesmc	q3, q3         @ AES block 4k+7 - round 0
2364	eor	r24, r24, r14                    @ AES block 4k+3 - round 12 high
2365
2366	pmull	v11.1q, q4, v15.1d                      @ GHASH block 4k - low
2367	mov	d8, v4.d[1]                                 @ GHASH block 4k - mid
2368
2369	aese	q0, v20.16b
2370	aesmc	q0, q0         @ AES block 4k+4 - round 2
2371
2372	aese	q3, v19.16b
2373	aesmc	q3, q3         @ AES block 4k+7 - round 1
2374	eor	r21, r21, r13                    @ AES block 4k+6 - round 12 low
2375
2376	eor	q8, q8, q4                         @ GHASH block 4k - mid
2377	eor	v11.16b, v11.16b, v31.16b                        @ GHASH block 4k+1 - low
2378
2379	aese	q0, v21.16b
2380	aesmc	q0, q0         @ AES block 4k+4 - round 3
2381	eor	r19, r19, r13                    @ AES block 4k+5 - round 12 low
2382
2383	aese	q1, v20.16b
2384	aesmc	q1, q1         @ AES block 4k+5 - round 2
2385	mov	d31, v6.d[1]                                 @ GHASH block 4k+2 - mid
2386
2387	pmull2	v9.1q, q4, v15.2d                      @ GHASH block 4k - high
2388	mov	d4, v5.d[1]                                 @ GHASH block 4k+1 - mid
2389
2390	aese	q2, v20.16b
2391	aesmc	q2, q2         @ AES block 4k+6 - round 2
2392
2393	aese	q1, v21.16b
2394	aesmc	q1, q1         @ AES block 4k+5 - round 3
2395
2396	mov	d10, v17.d[1]                              @ GHASH block 4k - mid
2397	eor	q9, q9, v30.16b                        @ GHASH block 4k+1 - high
2398
2399	aese	q3, v20.16b
2400	aesmc	q3, q3         @ AES block 4k+7 - round 2
2401	eor	v31.8b, v31.8b, q6                         @ GHASH block 4k+2 - mid
2402
2403	pmull2	v30.1q, q6, v13.2d                         @ GHASH block 4k+2 - high
2404
2405	aese	q0, v22.16b
2406	aesmc	q0, q0         @ AES block 4k+4 - round 4
2407	eor	q4, q4, q5                         @ GHASH block 4k+1 - mid
2408
2409	aese	q3, v21.16b
2410	aesmc	q3, q3         @ AES block 4k+7 - round 3
2411
2412	pmull2	v5.1q, q7, v12.2d                         @ GHASH block 4k+3 - high
2413	eor	r20, r20, r14                    @ AES block 4k+5 - round 12 high
2414	ins	v31.d[1], v31.d[0]                               @ GHASH block 4k+2 - mid
2415
2416	aese	q0, v23.16b
2417	aesmc	q0, q0         @ AES block 4k+4 - round 5
2418	add	r12, r12, #1                           @ CTR block 4k+3
2419
2420	aese	q3, v22.16b
2421	aesmc	q3, q3         @ AES block 4k+7 - round 4
2422	eor	q9, q9, v30.16b                        @ GHASH block 4k+2 - high
2423
2424	pmull	v4.1q, q4, v17.1d                         @ GHASH block 4k+1 - mid
2425	eor	r22, r22, r14                    @ AES block 4k+6 - round 12 high
2426
2427	pmull2	v31.1q, v31.2d, v16.2d                         @ GHASH block 4k+2 - mid
2428	eor	r23, r23, r13                    @ AES block 4k+3 - round 12 low
2429	mov	d30, v7.d[1]                                 @ GHASH block 4k+3 - mid
2430
2431	pmull	v10.1q, q8, v10.1d                     @ GHASH block 4k - mid
2432	rev	r9, r12                                @ CTR block 4k+8
2433
2434	pmull	v8.1q, q6, v13.1d                         @ GHASH block 4k+2 - low
2435	orr	r9, r11, r9, lsl #32           @ CTR block 4k+8
2436
2437	aese	q2, v21.16b
2438	aesmc	q2, q2         @ AES block 4k+6 - round 3
2439	eor	v30.8b, v30.8b, q7                         @ GHASH block 4k+3 - mid
2440
2441	aese	q1, v22.16b
2442	aesmc	q1, q1         @ AES block 4k+5 - round 4
2443	ldp	r6, r7, [r0, #0]           @ AES block 4k+4 - load plaintext
2444#ifdef __ARMEB__
2445	rev	r6, r6
2446	rev	r7, r7
2447#endif
2448	aese	q0, v24.16b
2449	aesmc	q0, q0         @ AES block 4k+4 - round 6
2450	eor	v11.16b, v11.16b, q8                        @ GHASH block 4k+2 - low
2451
2452	aese	q2, v22.16b
2453	aesmc	q2, q2         @ AES block 4k+6 - round 4
2454	add	r0, r0, #64                      @ AES input_ptr update
2455
2456	aese	q1, v23.16b
2457	aesmc	q1, q1         @ AES block 4k+5 - round 5
2458	movi	q8, #0xc2
2459
2460	pmull	v6.1q, q7, v12.1d                         @ GHASH block 4k+3 - low
2461	eor	r7, r7, r14                    @ AES block 4k+4 - round 12 high
2462	eor	v10.16b, v10.16b, q4                        @ GHASH block 4k+1 - mid
2463
2464	aese	q2, v23.16b
2465	aesmc	q2, q2         @ AES block 4k+6 - round 5
2466	eor	r6, r6, r13                    @ AES block 4k+4 - round 12 low
2467
2468	aese	q1, v24.16b
2469	aesmc	q1, q1         @ AES block 4k+5 - round 6
2470	shl	d8, d8, #56              @ mod_constant
2471
2472	aese	q3, v23.16b
2473	aesmc	q3, q3         @ AES block 4k+7 - round 5
2474	eor	q9, q9, q5                        @ GHASH block 4k+3 - high
2475
2476	aese	q0, v25.16b
2477	aesmc	q0, q0         @ AES block 4k+4 - round 7
2478	fmov	d5, r19                              @ AES block 4k+5 - mov low
2479
2480	aese	q1, v25.16b
2481	aesmc	q1, q1         @ AES block 4k+5 - round 7
2482	eor	v10.16b, v10.16b, v31.16b                        @ GHASH block 4k+2 - mid
2483
2484	aese	q3, v24.16b
2485	aesmc	q3, q3         @ AES block 4k+7 - round 6
2486	fmov	v5.d[1], r20                          @ AES block 4k+5 - mov high
2487
2488	aese	q0, v26.16b
2489	aesmc	q0, q0         @ AES block 4k+4 - round 8
2490	eor	v11.16b, v11.16b, q6                        @ GHASH block 4k+3 - low
2491
2492	pmull	v30.1q, v30.1d, v16.1d                         @ GHASH block 4k+3 - mid
2493	cmp	r0, r5                  @ .LOOP CONTROL
2494	fmov	d4, r6                              @ AES block 4k+4 - mov low
2495
2496	aese	q2, v24.16b
2497	aesmc	q2, q2         @ AES block 4k+6 - round 6
2498	fmov	v4.d[1], r7                          @ AES block 4k+4 - mov high
2499
2500	aese	q1, v26.16b
2501	aesmc	q1, q1         @ AES block 4k+5 - round 8
2502	fmov	d7, r23                              @ AES block 4k+3 - mov low
2503
2504	eor	v10.16b, v10.16b, v30.16b                        @ GHASH block 4k+3 - mid
2505	eor	v30.16b, v11.16b, q9                        @ MODULO - karatsuba tidy up
2506	add	r12, r12, #1                           @ CTR block 4k+8
2507
2508	aese	q2, v25.16b
2509	aesmc	q2, q2         @ AES block 4k+6 - round 7
2510	fmov	v7.d[1], r24                          @ AES block 4k+3 - mov high
2511
2512	pmull	v31.1q, q9, q8           @ MODULO - top 64b align with mid
2513	ext	q9, q9, q9, #8                    @ MODULO - other top alignment
2514	fmov	d6, r21                              @ AES block 4k+6 - mov low
2515
2516	aese	q3, v25.16b
2517	aesmc	q3, q3         @ AES block 4k+7 - round 7
2518
2519	aese	q0, v27.16b
2520	aesmc	q0, q0         @ AES block 4k+4 - round 9
2521	eor	v10.16b, v10.16b, v30.16b                        @ MODULO - karatsuba tidy up
2522
2523	aese	q2, v26.16b
2524	aesmc	q2, q2         @ AES block 4k+6 - round 8
2525
2526	aese	q3, v26.16b
2527	aesmc	q3, q3         @ AES block 4k+7 - round 8
2528
2529	aese	q1, v27.16b
2530	aesmc	q1, q1         @ AES block 4k+5 - round 9
2531
2532	aese	q0, v28.16b
2533	aesmc	q0, q0         @ AES block 4k+4 - round 10
2534	eor	v10.16b, v10.16b, v31.16b                     @ MODULO - fold into mid
2535
2536	aese	q3, v27.16b
2537	aesmc	q3, q3         @ AES block 4k+7 - round 9
2538
2539	aese	q2, v27.16b
2540	aesmc	q2, q2         @ AES block 4k+6 - round 9
2541
2542	aese	q0, v29.16b                                    @ AES block 4k+4 - round 11
2543
2544	aese	q1, v28.16b
2545	aesmc	q1, q1         @ AES block 4k+5 - round 10
2546	eor	v10.16b, v10.16b, q9                        @ MODULO - fold into mid
2547
2548	aese	q2, v28.16b
2549	aesmc	q2, q2         @ AES block 4k+6 - round 10
2550
2551	eor	q4, q4, q0                         @ AES block 4k+4 - result
2552	fmov	d0, r10                              @ CTR block 4k+8
2553
2554	aese	q1, v29.16b                                    @ AES block 4k+5 - round 11
2555	fmov	v0.d[1], r9                              @ CTR block 4k+8
2556	rev	r9, r12                                @ CTR block 4k+9
2557
2558	pmull	v9.1q, v10.1d, q8           @ MODULO - mid 64b align with low
2559	fmov	v6.d[1], r22                          @ AES block 4k+6 - mov high
2560	st1	{ q4}, [r2], #16                    @ AES block 4k+4 - store result
2561
2562	aese	q3, v28.16b
2563	aesmc	q3, q3         @ AES block 4k+7 - round 10
2564	orr	r9, r11, r9, lsl #32           @ CTR block 4k+9
2565
2566	eor	q5, q5, q1                         @ AES block 4k+5 - result
2567	add	r12, r12, #1                           @ CTR block 4k+9
2568	fmov	d1, r10                              @ CTR block 4k+9
2569
2570	aese	q2, v29.16b                                    @ AES block 4k+6 - round 11
2571	fmov	v1.d[1], r9                              @ CTR block 4k+9
2572	rev	r9, r12                                @ CTR block 4k+10
2573
2574	add	r12, r12, #1                           @ CTR block 4k+10
2575	ext	v10.16b, v10.16b, v10.16b, #8                    @ MODULO - other mid alignment
2576	orr	r9, r11, r9, lsl #32           @ CTR block 4k+10
2577
2578	st1	{ q5}, [r2], #16                    @ AES block 4k+5 - store result
2579	eor	v11.16b, v11.16b, q9                        @ MODULO - fold into low
2580
2581	aese	q3, v29.16b                                    @ AES block 4k+7 - round 11
2582	eor	q6, q6, q2                         @ AES block 4k+6 - result
2583	fmov	d2, r10                              @ CTR block 4k+10
2584
2585	st1	{ q6}, [r2], #16                    @ AES block 4k+6 - store result
2586	fmov	v2.d[1], r9                              @ CTR block 4k+10
2587	rev	r9, r12                                @ CTR block 4k+11
2588
2589	eor	v11.16b, v11.16b, v10.16b                        @ MODULO - fold into low
2590	orr	r9, r11, r9, lsl #32           @ CTR block 4k+11
2591
2592	eor	q7, q7, q3                         @ AES block 4k+3 - result
2593	st1	{ q7}, [r2], #16                    @ AES block 4k+3 - store result
2594	blt	.L192_enc_main_loop
2595
2596.L192_enc_prepretail:@ PREPRETAIL
2597	aese	q0, v18.16b
2598	aesmc	q0, q0         @ AES block 4k+4 - round 0
2599	rev64	q4, q4                                   @ GHASH block 4k (only t0 is free)
2600
2601	fmov	d3, r10                              @ CTR block 4k+3
2602	ext	v11.16b, v11.16b, v11.16b, #8                    @ PRE 0
2603	add	r12, r12, #1                           @ CTR block 4k+3
2604
2605	aese	q1, v18.16b
2606	aesmc	q1, q1         @ AES block 4k+5 - round 0
2607	rev64	q5, q5                                   @ GHASH block 4k+1 (t0 and t1 free)
2608
2609	aese	q2, v18.16b
2610	aesmc	q2, q2         @ AES block 4k+6 - round 0
2611
2612	fmov	v3.d[1], r9                              @ CTR block 4k+3
2613	eor	q4, q4, v11.16b                          @ PRE 1
2614	mov	d10, v17.d[1]                              @ GHASH block 4k - mid
2615
2616	aese	q1, v19.16b
2617	aesmc	q1, q1         @ AES block 4k+5 - round 1
2618	rev64	q6, q6                                   @ GHASH block 4k+2 (t0, t1, and t2 free)
2619
2620	pmull2	v30.1q, q5, v14.2d                         @ GHASH block 4k+1 - high
2621
2622	pmull	v11.1q, q4, v15.1d                      @ GHASH block 4k - low
2623	mov	d8, v4.d[1]                                 @ GHASH block 4k - mid
2624
2625	pmull	v31.1q, q5, v14.1d                         @ GHASH block 4k+1 - low
2626	rev64	q7, q7                                   @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
2627
2628	pmull2	v9.1q, q4, v15.2d                      @ GHASH block 4k - high
2629
2630	eor	q8, q8, q4                         @ GHASH block 4k - mid
2631	mov	d4, v5.d[1]                                 @ GHASH block 4k+1 - mid
2632
2633	eor	v11.16b, v11.16b, v31.16b                        @ GHASH block 4k+1 - low
2634	mov	d31, v6.d[1]                                 @ GHASH block 4k+2 - mid
2635
2636	aese	q3, v18.16b
2637	aesmc	q3, q3         @ AES block 4k+7 - round 0
2638	eor	q9, q9, v30.16b                        @ GHASH block 4k+1 - high
2639
2640	pmull2	v30.1q, q6, v13.2d                         @ GHASH block 4k+2 - high
2641
2642	eor	q4, q4, q5                         @ GHASH block 4k+1 - mid
2643	eor	v31.8b, v31.8b, q6                         @ GHASH block 4k+2 - mid
2644
2645	aese	q3, v19.16b
2646	aesmc	q3, q3         @ AES block 4k+7 - round 1
2647
2648	aese	q2, v19.16b
2649	aesmc	q2, q2         @ AES block 4k+6 - round 1
2650	eor	q9, q9, v30.16b                        @ GHASH block 4k+2 - high
2651
2652	aese	q0, v19.16b
2653	aesmc	q0, q0         @ AES block 4k+4 - round 1
2654
2655	aese	q1, v20.16b
2656	aesmc	q1, q1         @ AES block 4k+5 - round 2
2657	mov	d30, v7.d[1]                                 @ GHASH block 4k+3 - mid
2658
2659	pmull2	v5.1q, q7, v12.2d                         @ GHASH block 4k+3 - high
2660	ins	v31.d[1], v31.d[0]                               @ GHASH block 4k+2 - mid
2661
2662	aese	q0, v20.16b
2663	aesmc	q0, q0         @ AES block 4k+4 - round 2
2664
2665	pmull	v10.1q, q8, v10.1d                     @ GHASH block 4k - mid
2666	eor	v30.8b, v30.8b, q7                         @ GHASH block 4k+3 - mid
2667
2668	aese	q1, v21.16b
2669	aesmc	q1, q1         @ AES block 4k+5 - round 3
2670
2671	pmull2	v31.1q, v31.2d, v16.2d                         @ GHASH block 4k+2 - mid
2672
2673	pmull	v4.1q, q4, v17.1d                         @ GHASH block 4k+1 - mid
2674
2675	pmull	v30.1q, v30.1d, v16.1d                         @ GHASH block 4k+3 - mid
2676	eor	q9, q9, q5                        @ GHASH block 4k+3 - high
2677
2678	pmull	v8.1q, q6, v13.1d                         @ GHASH block 4k+2 - low
2679
2680	aese	q0, v21.16b
2681	aesmc	q0, q0         @ AES block 4k+4 - round 3
2682	eor	v10.16b, v10.16b, q4                        @ GHASH block 4k+1 - mid
2683
2684	aese	q3, v20.16b
2685	aesmc	q3, q3         @ AES block 4k+7 - round 2
2686
2687	aese	q2, v20.16b
2688	aesmc	q2, q2         @ AES block 4k+6 - round 2
2689	eor	v11.16b, v11.16b, q8                        @ GHASH block 4k+2 - low
2690
2691	aese	q0, v22.16b
2692	aesmc	q0, q0         @ AES block 4k+4 - round 4
2693
2694	aese	q3, v21.16b
2695	aesmc	q3, q3         @ AES block 4k+7 - round 3
2696	eor	v10.16b, v10.16b, v31.16b                        @ GHASH block 4k+2 - mid
2697
2698	aese	q2, v21.16b
2699	aesmc	q2, q2         @ AES block 4k+6 - round 3
2700
2701	pmull	v6.1q, q7, v12.1d                         @ GHASH block 4k+3 - low
2702	movi	q8, #0xc2
2703
2704	aese	q3, v22.16b
2705	aesmc	q3, q3         @ AES block 4k+7 - round 4
2706
2707	aese	q2, v22.16b
2708	aesmc	q2, q2         @ AES block 4k+6 - round 4
2709
2710	aese	q1, v22.16b
2711	aesmc	q1, q1         @ AES block 4k+5 - round 4
2712	eor	v10.16b, v10.16b, v30.16b                        @ GHASH block 4k+3 - mid
2713
2714	aese	q3, v23.16b
2715	aesmc	q3, q3         @ AES block 4k+7 - round 5
2716
2717	aese	q2, v23.16b
2718	aesmc	q2, q2         @ AES block 4k+6 - round 5
2719
2720	aese	q1, v23.16b
2721	aesmc	q1, q1         @ AES block 4k+5 - round 5
2722	eor	v11.16b, v11.16b, q6                        @ GHASH block 4k+3 - low
2723
2724	aese	q0, v23.16b
2725	aesmc	q0, q0         @ AES block 4k+4 - round 5
2726
2727	aese	q3, v24.16b
2728	aesmc	q3, q3         @ AES block 4k+7 - round 6
2729	eor	v10.16b, v10.16b, q9                        @ karatsuba tidy up
2730
2731	aese	q1, v24.16b
2732	aesmc	q1, q1         @ AES block 4k+5 - round 6
2733
2734	aese	q0, v24.16b
2735	aesmc	q0, q0         @ AES block 4k+4 - round 6
2736	shl	d8, d8, #56              @ mod_constant
2737
2738	aese	q3, v25.16b
2739	aesmc	q3, q3         @ AES block 4k+7 - round 7
2740
2741	aese	q1, v25.16b
2742	aesmc	q1, q1         @ AES block 4k+5 - round 7
2743	eor	v10.16b, v10.16b, v11.16b
2744
2745	aese	q0, v25.16b
2746	aesmc	q0, q0         @ AES block 4k+4 - round 7
2747
2748	pmull	v30.1q, q9, q8
2749
2750	aese	q2, v24.16b
2751	aesmc	q2, q2         @ AES block 4k+6 - round 6
2752	ext	q9, q9, q9, #8
2753
2754	aese	q0, v26.16b
2755	aesmc	q0, q0         @ AES block 4k+4 - round 8
2756
2757	aese	q1, v26.16b
2758	aesmc	q1, q1         @ AES block 4k+5 - round 8
2759	eor	v10.16b, v10.16b, v30.16b
2760
2761	aese	q2, v25.16b
2762	aesmc	q2, q2         @ AES block 4k+6 - round 7
2763
2764	aese	q3, v26.16b
2765	aesmc	q3, q3         @ AES block 4k+7 - round 8
2766
2767	aese	q0, v27.16b
2768	aesmc	q0, q0         @ AES block 4k+4 - round 9
2769
2770	aese	q2, v26.16b
2771	aesmc	q2, q2         @ AES block 4k+6 - round 8
2772	eor	v10.16b, v10.16b, q9
2773
2774	aese	q3, v27.16b
2775	aesmc	q3, q3         @ AES block 4k+7 - round 9
2776
2777	aese	q1, v27.16b
2778	aesmc	q1, q1         @ AES block 4k+5 - round 9
2779
2780	aese	q2, v27.16b
2781	aesmc	q2, q2         @ AES block 4k+6 - round 9
2782
2783	pmull	v30.1q, v10.1d, q8
2784
2785	ext	v10.16b, v10.16b, v10.16b, #8
2786
2787	aese	q3, v28.16b
2788	aesmc	q3, q3         @ AES block 4k+7 - round 10
2789
2790	aese	q0, v28.16b
2791	aesmc	q0, q0         @ AES block 4k+4 - round 10
2792
2793	aese	q2, v28.16b
2794	aesmc	q2, q2         @ AES block 4k+6 - round 10
2795
2796	aese	q1, v28.16b
2797	aesmc	q1, q1         @ AES block 4k+5 - round 10
2798	eor	v11.16b, v11.16b, v30.16b
2799
2800	aese	q0, v29.16b                                    @ AES block 4k+4 - round 11
2801
2802	aese	q3, v29.16b                                    @ AES block 4k+7 - round 11
2803
2804	aese	q2, v29.16b                                    @ AES block 4k+6 - round 11
2805
2806	aese	q1, v29.16b                                    @ AES block 4k+5 - round 11
2807	eor	v11.16b, v11.16b, v10.16b
2808.L192_enc_tail:@ TAIL
2809
2810	sub	r5, r4, r0  @ main_end_input_ptr is number of bytes left to process
2811	ldp	r6, r7, [r0], #16          @ AES block 4k+4 - load plaintext
2812#ifdef __ARMEB__
2813	rev	r6, r6
2814	rev	r7, r7
2815#endif
2816	eor	r6, r6, r13                    @ AES block 4k+4 - round 12 low
2817	eor	r7, r7, r14                    @ AES block 4k+4 - round 12 high
2818
2819	fmov	d4, r6                              @ AES block 4k+4 - mov low
2820
2821	fmov	v4.d[1], r7                          @ AES block 4k+4 - mov high
2822	cmp	r5, #48
2823
2824	eor	q5, q4, q0                         @ AES block 4k+4 - result
2825
2826	ext	q8, v11.16b, v11.16b, #8                    @ prepare final partial tag
2827	bgt	.L192_enc_blocks_more_than_3
2828
2829	sub	r12, r12, #1
2830	movi	v10.8b, #0
2831
2832	mov	q3, q2
2833	movi	q9, #0
2834	cmp	r5, #32
2835
2836	mov	q2, q1
2837	movi	v11.8b, #0
2838	bgt	.L192_enc_blocks_more_than_2
2839
2840	sub	r12, r12, #1
2841
2842	mov	q3, q1
2843	cmp	r5, #16
2844	bgt	.L192_enc_blocks_more_than_1
2845
2846	sub	r12, r12, #1
2847	b	.L192_enc_blocks_less_than_1
2848.L192_enc_blocks_more_than_3:@ blocks left >  3
2849	st1	{ q5}, [r2], #16                    @ AES final-3 block  - store result
2850
2851	ldp	r6, r7, [r0], #16          @ AES final-2 block - load input low & high
2852#ifdef __ARMEB__
2853	rev	r6, r6
2854	rev	r7, r7
2855#endif
2856	rev64	q4, q5                                   @ GHASH final-3 block
2857
2858	eor	r6, r6, r13                    @ AES final-2 block - round 12 low
2859	eor	q4, q4, q8                          @ feed in partial tag
2860
2861	eor	r7, r7, r14                    @ AES final-2 block - round 12 high
2862	fmov	d5, r6                                @ AES final-2 block - mov low
2863
2864	fmov	v5.d[1], r7                            @ AES final-2 block - mov high
2865
2866	mov	d22, v4.d[1]                                @ GHASH final-3 block - mid
2867
2868	pmull	v11.1q, q4, v15.1d                      @ GHASH final-3 block - low
2869
2870	mov	d10, v17.d[1]                              @ GHASH final-3 block - mid
2871
2872	eor	v22.8b, v22.8b, q4                     @ GHASH final-3 block - mid
2873
2874	movi	q8, #0                                       @ suppress further partial tag feed in
2875
2876	pmull2	v9.1q, q4, v15.2d                      @ GHASH final-3 block - high
2877
2878	pmull	v10.1q, v22.1d, v10.1d                   @ GHASH final-3 block - mid
2879	eor	q5, q5, q1                           @ AES final-2 block - result
2880.L192_enc_blocks_more_than_2:@ blocks left >  2
2881
2882	st1	{ q5}, [r2], #16                    @ AES final-2 block - store result
2883
2884	rev64	q4, q5                                   @ GHASH final-2 block
2885	ldp	r6, r7, [r0], #16          @ AES final-1 block - load input low & high
2886#ifdef __ARMEB__
2887	rev	r6, r6
2888	rev	r7, r7
2889#endif
2890	eor	q4, q4, q8                          @ feed in partial tag
2891
2892	eor	r7, r7, r14                    @ AES final-1 block - round 12 high
2893
2894	pmull2	v20.1q, q4, v14.2d                         @ GHASH final-2 block - high
2895	mov	d22, v4.d[1]                                @ GHASH final-2 block - mid
2896
2897	pmull	v21.1q, q4, v14.1d                         @ GHASH final-2 block - low
2898	eor	r6, r6, r13                    @ AES final-1 block - round 12 low
2899
2900	fmov	d5, r6                                @ AES final-1 block - mov low
2901
2902	fmov	v5.d[1], r7                            @ AES final-1 block - mov high
2903	eor	q9, q9, v20.16b                           @ GHASH final-2 block - high
2904	eor	v22.8b, v22.8b, q4                     @ GHASH final-2 block - mid
2905
2906	eor	v11.16b, v11.16b, v21.16b                           @ GHASH final-2 block - low
2907
2908	pmull	v22.1q, v22.1d, v17.1d                     @ GHASH final-2 block - mid
2909
2910	movi	q8, #0                                       @ suppress further partial tag feed in
2911
2912	eor	q5, q5, q2                           @ AES final-1 block - result
2913
2914	eor	v10.16b, v10.16b, v22.16b                      @ GHASH final-2 block - mid
2915.L192_enc_blocks_more_than_1:@ blocks left >  1
2916
2917	st1	{ q5}, [r2], #16                    @ AES final-1 block - store result
2918
2919	ldp	r6, r7, [r0], #16          @ AES final block - load input low & high
2920#ifdef __ARMEB__
2921	rev	r6, r6
2922	rev	r7, r7
2923#endif
2924	rev64	q4, q5                                   @ GHASH final-1 block
2925
2926	eor	r6, r6, r13                    @ AES final block - round 12 low
2927	eor	q4, q4, q8                          @ feed in partial tag
2928	movi	q8, #0                                       @ suppress further partial tag feed in
2929
2930	mov	d22, v4.d[1]                                @ GHASH final-1 block - mid
2931
2932	eor	v22.8b, v22.8b, q4                     @ GHASH final-1 block - mid
2933	eor	r7, r7, r14                    @ AES final block - round 12 high
2934	fmov	d5, r6                                @ AES final block - mov low
2935
2936	pmull2	v20.1q, q4, v13.2d                         @ GHASH final-1 block - high
2937	fmov	v5.d[1], r7                            @ AES final block - mov high
2938
2939	ins	v22.d[1], v22.d[0]                           @ GHASH final-1 block - mid
2940
2941	eor	q9, q9, v20.16b                           @ GHASH final-1 block - high
2942
2943	pmull	v21.1q, q4, v13.1d                         @ GHASH final-1 block - low
2944
2945	pmull2	v22.1q, v22.2d, v16.2d                     @ GHASH final-1 block - mid
2946
2947	eor	q5, q5, q3                           @ AES final block - result
2948
2949	eor	v11.16b, v11.16b, v21.16b                           @ GHASH final-1 block - low
2950
2951	eor	v10.16b, v10.16b, v22.16b                      @ GHASH final-1 block - mid
2952.L192_enc_blocks_less_than_1:@ blocks left <= 1
2953
2954	ld1	{ v18.16b}, [r2]                           @ load existing bytes where the possibly partial last block is to be stored
2955#ifndef __ARMEB__
2956	rev	r9, r12
2957#else
2958	mov	r9, r12
2959#endif
2960	and	r1, r1, #127                   @ bit_length %= 128
2961
2962	sub	r1, r1, #128                   @ bit_length -= 128
2963	mvn	r14, xzr                                     @ rk12_h = 0xffffffffffffffff
2964
2965	neg	r1, r1                         @ bit_length = 128 - #bits in input (in range [1,128])
2966	mvn	r13, xzr                                     @ rk12_l = 0xffffffffffffffff
2967
2968	and	r1, r1, #127                   @ bit_length %= 128
2969
2970	lsr	r14, r14, r1                    @ rk12_h is mask for top 64b of last block
2971	cmp	r1, #64
2972
2973	csel	r6, r13, r14, lt
2974	csel	r7, r14, xzr, lt
2975
2976	fmov	d0, r6                                @ ctr0b is mask for last block
2977
2978	fmov	v0.d[1], r7
2979
2980	and	q5, q5, q0                           @ possibly partial last block has zeroes in highest bits
2981
2982	rev64	q4, q5                                   @ GHASH final block
2983
2984	eor	q4, q4, q8                          @ feed in partial tag
2985
2986	mov	d8, v4.d[1]                                 @ GHASH final block - mid
2987
2988	pmull	v21.1q, q4, v12.1d                         @ GHASH final block - low
2989
2990	pmull2	v20.1q, q4, v12.2d                         @ GHASH final block - high
2991
2992	eor	q8, q8, q4                         @ GHASH final block - mid
2993
2994	eor	v11.16b, v11.16b, v21.16b                           @ GHASH final block - low
2995
2996	eor	q9, q9, v20.16b                           @ GHASH final block - high
2997
2998	pmull	v8.1q, q8, v16.1d                         @ GHASH final block - mid
2999
3000	eor	v10.16b, v10.16b, q8                        @ GHASH final block - mid
3001	movi	q8, #0xc2
3002
3003	eor	v30.16b, v11.16b, q9                        @ MODULO - karatsuba tidy up
3004
3005	shl	d8, d8, #56              @ mod_constant
3006
3007	bif	q5, v18.16b, q0                             @ insert existing bytes in top end of result before storing
3008
3009	eor	v10.16b, v10.16b, v30.16b                        @ MODULO - karatsuba tidy up
3010
3011	pmull	v31.1q, q9, q8           @ MODULO - top 64b align with mid
3012
3013	ext	q9, q9, q9, #8                    @ MODULO - other top alignment
3014
3015	eor	v10.16b, v10.16b, v31.16b                     @ MODULO - fold into mid
3016
3017	eor	v10.16b, v10.16b, q9                        @ MODULO - fold into mid
3018
3019	pmull	v9.1q, v10.1d, q8           @ MODULO - mid 64b align with low
3020
3021	ext	v10.16b, v10.16b, v10.16b, #8                    @ MODULO - other mid alignment
3022
3023	eor	v11.16b, v11.16b, q9                        @ MODULO - fold into low
3024	str	r9, [r16, #12]                         @ store the updated counter
3025
3026	st1	{ q5}, [r2]                         @ store all 16B
3027
3028	eor	v11.16b, v11.16b, v10.16b                        @ MODULO - fold into low
3029	ext	v11.16b, v11.16b, v11.16b, #8
3030	rev64	v11.16b, v11.16b
3031	mov	r0, r15
3032	st1	{ v11.16b }, [r3]
3033
3034	ldp	r21, r22, [sp, #16]
3035	ldp	r23, r24, [sp, #32]
3036	ldp	d8, d9, [sp, #48]
3037	ldp	d10, d11, [sp, #64]
3038	ldp	d12, d13, [sp, #80]
3039	ldp	d14, d15, [sp, #96]
3040	ldp	r19, r20, [sp], #112
3041	RET
3042
3043.L192_enc_ret:
3044	mov	r0, #0x0
3045	RET
3046.size	aes_gcm_enc_192_kernel,.-aes_gcm_enc_192_kernel
3047.globl	aes_gcm_dec_192_kernel
3048.type	aes_gcm_dec_192_kernel,%function
3049.align	4
3050aes_gcm_dec_192_kernel:
3051	cbz	r1, .L192_dec_ret
3052	stp	r19, r20, [sp, #-112]!
3053	mov	r16, r4
3054	mov	r8, r5
3055	stp	r21, r22, [sp, #16]
3056	stp	r23, r24, [sp, #32]
3057	stp	d8, d9, [sp, #48]
3058	stp	d10, d11, [sp, #64]
3059	stp	d12, d13, [sp, #80]
3060	stp	d14, d15, [sp, #96]
3061
3062	add	r4, r0, r1, lsr #3   @ end_input_ptr
3063	ldp	r10, r11, [r16]              @ ctr96_b64, ctr96_t32
3064#ifdef __ARMEB__
3065	rev	r10, r10
3066	rev	r11, r11
3067#endif
3068	ldp	r13, r14, [r8, #192]                     @ load rk12
3069#ifdef __ARMEB__
3070	ror	r13, r13, #32
3071	ror	r14, r14, #32
3072#endif
3073	ld1	{ q0}, [r16]                             @ special case vector load initial counter so we can start first AES block as quickly as possible
3074
3075	ld1	{v18.4s}, [r8], #16                                  @ load rk0
3076
3077	lsr	r5, r1, #3              @ byte_len
3078	mov	r15, r5
3079	ld1	{v19.4s}, [r8], #16                               @ load rk1
3080
3081	lsr	r12, r11, #32
3082	orr	r11, r11, r11
3083	fmov	d3, r10                               @ CTR block 3
3084
3085	rev	r12, r12                                @ rev_ctr32
3086	fmov	d1, r10                               @ CTR block 1
3087
3088	add	r12, r12, #1                            @ increment rev_ctr32
3089	ld1	{v20.4s}, [r8], #16                               @ load rk2
3090
3091	aese	q0, v18.16b
3092	aesmc	q0, q0          @ AES block 0 - round 0
3093	rev	r9, r12                                 @ CTR block 1
3094
3095	add	r12, r12, #1                            @ CTR block 1
3096	orr	r9, r11, r9, lsl #32            @ CTR block 1
3097	ld1	{v21.4s}, [r8], #16                               @ load rk3
3098
3099	fmov	v1.d[1], r9                               @ CTR block 1
3100	rev	r9, r12                                 @ CTR block 2
3101	add	r12, r12, #1                            @ CTR block 2
3102
3103	fmov	d2, r10                               @ CTR block 2
3104	orr	r9, r11, r9, lsl #32            @ CTR block 2
3105
3106	fmov	v2.d[1], r9                               @ CTR block 2
3107	rev	r9, r12                                 @ CTR block 3
3108
3109	aese	q0, v19.16b
3110	aesmc	q0, q0          @ AES block 0 - round 1
3111	orr	r9, r11, r9, lsl #32            @ CTR block 3
3112
3113	fmov	v3.d[1], r9                               @ CTR block 3
3114
3115	ld1	{v22.4s}, [r8], #16                               @ load rk4
3116
3117	aese	q0, v20.16b
3118	aesmc	q0, q0          @ AES block 0 - round 2
3119
3120	aese	q2, v18.16b
3121	aesmc	q2, q2          @ AES block 2 - round 0
3122	ld1	{v23.4s}, [r8], #16                               @ load rk5
3123
3124	aese	q1, v18.16b
3125	aesmc	q1, q1          @ AES block 1 - round 0
3126	ldr	q15, [r3, #112]                        @ load h4l | h4h
3127#ifndef __ARMEB__
3128	ext	v15.16b, v15.16b, v15.16b, #8
3129#endif
3130	aese	q3, v18.16b
3131	aesmc	q3, q3          @ AES block 3 - round 0
3132	ldr	q13, [r3, #64]                         @ load h2l | h2h
3133#ifndef __ARMEB__
3134	ext	v13.16b, v13.16b, v13.16b, #8
3135#endif
3136	aese	q2, v19.16b
3137	aesmc	q2, q2          @ AES block 2 - round 1
3138	ldr	q14, [r3, #80]                         @ load h3l | h3h
3139#ifndef __ARMEB__
3140	ext	v14.16b, v14.16b, v14.16b, #8
3141#endif
3142	aese	q1, v19.16b
3143	aesmc	q1, q1          @ AES block 1 - round 1
3144
3145	aese	q3, v19.16b
3146	aesmc	q3, q3          @ AES block 3 - round 1
3147	ldr	q12, [r3, #32]                         @ load h1l | h1h
3148#ifndef __ARMEB__
3149	ext	v12.16b, v12.16b, v12.16b, #8
3150#endif
3151	aese	q2, v20.16b
3152	aesmc	q2, q2          @ AES block 2 - round 2
3153	ld1	{v24.4s}, [r8], #16                               @ load rk6
3154
3155	aese	q0, v21.16b
3156	aesmc	q0, q0          @ AES block 0 - round 3
3157	ld1	{v25.4s}, [r8], #16                               @ load rk7
3158
3159	aese	q1, v20.16b
3160	aesmc	q1, q1          @ AES block 1 - round 2
3161	ld1	{v26.4s}, [r8], #16                               @ load rk8
3162
3163	aese	q3, v20.16b
3164	aesmc	q3, q3          @ AES block 3 - round 2
3165	ld1	{v27.4s}, [r8], #16                               @ load rk9
3166
3167	aese	q2, v21.16b
3168	aesmc	q2, q2          @ AES block 2 - round 3
3169	ld1	{ v11.16b}, [r3]
3170	ext	v11.16b, v11.16b, v11.16b, #8
3171	rev64	v11.16b, v11.16b
3172
3173	aese	q1, v21.16b
3174	aesmc	q1, q1          @ AES block 1 - round 3
3175	add	r12, r12, #1                            @ CTR block 3
3176
3177	aese	q3, v21.16b
3178	aesmc	q3, q3          @ AES block 3 - round 3
3179	trn1	q9, v14.2d,    v15.2d                      @ h4h | h3h
3180
3181	aese	q0, v22.16b
3182	aesmc	q0, q0          @ AES block 0 - round 4
3183	ld1	{v28.4s}, [r8], #16                              @ load rk10
3184
3185	aese	q1, v22.16b
3186	aesmc	q1, q1          @ AES block 1 - round 4
3187	trn2	v17.2d,  v14.2d,    v15.2d                      @ h4l | h3l
3188
3189	aese	q2, v22.16b
3190	aesmc	q2, q2          @ AES block 2 - round 4
3191
3192	aese	q3, v22.16b
3193	aesmc	q3, q3          @ AES block 3 - round 4
3194	trn2	v16.2d,  v12.2d,    v13.2d                      @ h2l | h1l
3195
3196	aese	q0, v23.16b
3197	aesmc	q0, q0          @ AES block 0 - round 5
3198	ld1	{v29.4s}, [r8], #16                              @ load rk11
3199
3200	aese	q1, v23.16b
3201	aesmc	q1, q1          @ AES block 1 - round 5
3202
3203	aese	q2, v23.16b
3204	aesmc	q2, q2          @ AES block 2 - round 5
3205
3206	aese	q3, v23.16b
3207	aesmc	q3, q3          @ AES block 3 - round 5
3208
3209	aese	q0, v24.16b
3210	aesmc	q0, q0          @ AES block 0 - round 6
3211
3212	aese	q2, v24.16b
3213	aesmc	q2, q2          @ AES block 2 - round 6
3214
3215	aese	q3, v24.16b
3216	aesmc	q3, q3          @ AES block 3 - round 6
3217
3218	aese	q0, v25.16b
3219	aesmc	q0, q0          @ AES block 0 - round 7
3220
3221	aese	q2, v25.16b
3222	aesmc	q2, q2          @ AES block 2 - round 7
3223
3224	aese	q3, v25.16b
3225	aesmc	q3, q3          @ AES block 3 - round 7
3226
3227	aese	q1, v24.16b
3228	aesmc	q1, q1          @ AES block 1 - round 6
3229
3230	aese	q2, v26.16b
3231	aesmc	q2, q2          @ AES block 2 - round 8
3232
3233	aese	q3, v26.16b
3234	aesmc	q3, q3          @ AES block 3 - round 8
3235
3236	aese	q1, v25.16b
3237	aesmc	q1, q1          @ AES block 1 - round 7
3238
3239	aese	q2, v27.16b
3240	aesmc	q2, q2          @ AES block 2 - round 9
3241
3242	aese	q3, v27.16b
3243	aesmc	q3, q3          @ AES block 3 - round 9
3244
3245	aese	q1, v26.16b
3246	aesmc	q1, q1          @ AES block 1 - round 8
3247	sub	r5, r5, #1      @ byte_len - 1
3248
3249	aese	q0, v26.16b
3250	aesmc	q0, q0          @ AES block 0 - round 8
3251	and	r5, r5, #0xffffffffffffffc0    @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
3252
3253	aese	q3, v28.16b
3254	aesmc	q3, q3          @ AES block 3 - round 10
3255	add	r5, r5, r0
3256
3257	aese	q1, v27.16b
3258	aesmc	q1, q1          @ AES block 1 - round 9
3259	cmp	r0, r5                   @ check if we have <= 4 blocks
3260
3261	aese	q0, v27.16b
3262	aesmc	q0, q0          @ AES block 0 - round 9
3263	trn1	q8,    v12.2d,    v13.2d                      @ h2h | h1h
3264
3265	aese	q3, v29.16b                                     @ AES block 3 - round 11
3266
3267	aese	q2, v28.16b
3268	aesmc	q2, q2          @ AES block 2 - round 10
3269
3270	aese	q1, v28.16b
3271	aesmc	q1, q1          @ AES block 1 - round 10
3272
3273	aese	q0, v28.16b
3274	aesmc	q0, q0          @ AES block 0 - round 10
3275	eor	v16.16b, v16.16b, q8                     @ h2k | h1k
3276
3277	aese	q2, v29.16b                                     @ AES block 2 - round 11
3278
3279	aese	q1, v29.16b                                     @ AES block 1 - round 11
3280	eor	v17.16b, v17.16b, q9                  @ h4k | h3k
3281
3282	aese	q0, v29.16b                                     @ AES block 0 - round 11
3283	bge	.L192_dec_tail                                    @ handle tail
3284
3285	ld1	{q4, q5}, [r0], #32               @ AES block 0,1 - load ciphertext
3286
3287	eor	q1, q5, q1                            @ AES block 1 - result
3288
3289	eor	q0, q4, q0                            @ AES block 0 - result
3290	rev	r9, r12                                 @ CTR block 4
3291	ld1	{q6, q7}, [r0], #32               @ AES block 2,3 - load ciphertext
3292
3293	mov	r19, v1.d[0]                            @ AES block 1 - mov low
3294
3295	mov	r20, v1.d[1]                            @ AES block 1 - mov high
3296
3297	mov	r6, v0.d[0]                            @ AES block 0 - mov low
3298	orr	r9, r11, r9, lsl #32            @ CTR block 4
3299	add	r12, r12, #1                            @ CTR block 4
3300
3301	mov	r7, v0.d[1]                            @ AES block 0 - mov high
3302	rev64	q4, q4                                    @ GHASH block 0
3303
3304	fmov	d0, r10                               @ CTR block 4
3305	rev64	q5, q5                                    @ GHASH block 1
3306	cmp	r0, r5                   @ check if we have <= 8 blocks
3307
3308	eor	r19, r19, r13                   @ AES block 1 - round 12 low
3309#ifdef __ARMEB__
3310	rev	r19, r19
3311#endif
3312	fmov	v0.d[1], r9                               @ CTR block 4
3313	rev	r9, r12                                 @ CTR block 5
3314
3315	orr	r9, r11, r9, lsl #32            @ CTR block 5
3316	fmov	d1, r10                               @ CTR block 5
3317	eor	r20, r20, r14                   @ AES block 1 - round 12 high
3318#ifdef __ARMEB__
3319	rev	r20, r20
3320#endif
3321	add	r12, r12, #1                            @ CTR block 5
3322	fmov	v1.d[1], r9                               @ CTR block 5
3323	eor	r6, r6, r13                   @ AES block 0 - round 12 low
3324#ifdef __ARMEB__
3325	rev	r6, r6
3326#endif
3327	rev	r9, r12                                 @ CTR block 6
3328	eor	r7, r7, r14                   @ AES block 0 - round 12 high
3329#ifdef __ARMEB__
3330	rev	r7, r7
3331#endif
3332	stp	r6, r7, [r2], #16        @ AES block 0 - store result
3333	orr	r9, r11, r9, lsl #32            @ CTR block 6
3334
3335	stp	r19, r20, [r2], #16        @ AES block 1 - store result
3336
3337	add	r12, r12, #1                            @ CTR block 6
3338	eor	q2, q6, q2                            @ AES block 2 - result
3339	bge	.L192_dec_prepretail                              @ do prepretail
3340
.L192_dec_main_loop:@ main loop start
@ One pass of the four-blocks-per-iteration decrypt pipeline. Three streams
@ of work are interleaved instruction by instruction:
@   * AES-CTR: counter blocks q0-q3 go through aese/aesmc with v18-v28 and a
@     final aese with v29; the last XOR (with r13/r14) is done on the two
@     64-bit halves in integer registers just before each stp.
@   * GHASH: ciphertext blocks q4-q7 are rev64-ed and multiplied (pmull and
@     pmull2) by v15/v14/v13/v12, with v17/v16 for the middle terms, into
@     v9 (high), v10 (mid) and v11 (low); the MODULO steps then fold v9 and
@     v10 back into v11.
@   * I/O: four 16-byte results are stored through r2 and four new ciphertext
@     blocks are loaded through r0, each access post-incrementing by 16.
@ The work is software-pipelined: results for blocks 4k+2 and 4k+3 are
@ finished at the top of the following pass (or in the prepretail).
@ NOTE(review): v12-v29, r10, r11, r13 and r14 are set up before this label,
@ outside the visible region; their roles are inferred from the inline
@ comments - confirm against the function prologue.
@ NOTE(review): rN/qN names are mixed with AArch64-style operands such as
@ v11.16b - looks like a mechanical register rename; confirm the target
@ assembler accepts this form.
	aese	q1, v18.16b
	aesmc	q1, q1          @ AES block 4k+5 - round 0
	ext	v11.16b, v11.16b, v11.16b, #8                     @ PRE 0

	pmull	v31.1q, q5, v14.1d                          @ GHASH block 4k+1 - low
	mov	r21, v2.d[0]                            @ AES block 4k+2 - mov low

	mov	r22, v2.d[1]                            @ AES block 4k+2 - mov high
	eor	q3, q7, q3                            @ AES block 4k+3 - result
	rev64	q7, q7                                    @ GHASH block 4k+3

	aese	q1, v19.16b
	aesmc	q1, q1          @ AES block 4k+5 - round 1
	fmov	d2, r10                               @ CTR block 4k+6

	aese	q0, v18.16b
	aesmc	q0, q0          @ AES block 4k+4 - round 0
	eor	q4, q4, v11.16b                           @ PRE 1

	pmull2	v30.1q, q5, v14.2d                          @ GHASH block 4k+1 - high
	fmov	v2.d[1], r9                               @ CTR block 4k+6

	aese	q1, v20.16b
	aesmc	q1, q1          @ AES block 4k+5 - round 2
	mov	r24, v3.d[1]                            @ AES block 4k+3 - mov high

	aese	q0, v19.16b
	aesmc	q0, q0          @ AES block 4k+4 - round 1
	mov	r23, v3.d[0]                            @ AES block 4k+3 - mov low

	pmull2	v9.1q, q4, v15.2d                       @ GHASH block 4k - high
	fmov	d3, r10                               @ CTR block 4k+7
	mov	d8, v4.d[1]                                  @ GHASH block 4k - mid

	pmull	v11.1q, q4, v15.1d                       @ GHASH block 4k - low
	mov	d10, v17.d[1]                               @ GHASH block 4k - mid
	rev	r9, r12                                 @ CTR block 4k+7

	aese	q2, v18.16b
	aesmc	q2, q2          @ AES block 4k+6 - round 0
	orr	r9, r11, r9, lsl #32            @ CTR block 4k+7

	fmov	v3.d[1], r9                               @ CTR block 4k+7
	eor	q8, q8, q4                          @ GHASH block 4k - mid
	mov	d4, v5.d[1]                                  @ GHASH block 4k+1 - mid

	aese	q1, v21.16b
	aesmc	q1, q1          @ AES block 4k+5 - round 3

	aese	q0, v20.16b
	aesmc	q0, q0          @ AES block 4k+4 - round 2
	eor	r22, r22, r14                   @ AES block 4k+2 - round 12 high
#ifdef __ARMEB__
	rev	r22, r22
#endif
	aese	q2, v19.16b
	aesmc	q2, q2          @ AES block 4k+6 - round 1
	eor	q4, q4, q5                          @ GHASH block 4k+1 - mid

	pmull	v10.1q, q8, v10.1d                      @ GHASH block 4k - mid

	aese	q3, v18.16b
	aesmc	q3, q3          @ AES block 4k+7 - round 0
	rev64	q6, q6                                    @ GHASH block 4k+2

	aese	q2, v20.16b
	aesmc	q2, q2          @ AES block 4k+6 - round 2

	pmull	v4.1q, q4, v17.1d                          @ GHASH block 4k+1 - mid
	eor	v11.16b, v11.16b, v31.16b                         @ GHASH block 4k+1 - low
	eor	r21, r21, r13                   @ AES block 4k+2 - round 12 low
#ifdef __ARMEB__
	rev	r21, r21
#endif
	aese	q1, v22.16b
	aesmc	q1, q1          @ AES block 4k+5 - round 4

	aese	q0, v21.16b
	aesmc	q0, q0          @ AES block 4k+4 - round 3

	eor	v10.16b, v10.16b, q4                         @ GHASH block 4k+1 - mid
	mov	d31, v6.d[1]                                  @ GHASH block 4k+2 - mid

	aese	q3, v19.16b
	aesmc	q3, q3          @ AES block 4k+7 - round 1
	eor	q9, q9, v30.16b                         @ GHASH block 4k+1 - high

	aese	q0, v22.16b
	aesmc	q0, q0          @ AES block 4k+4 - round 4

	pmull2	v30.1q, q6, v13.2d                          @ GHASH block 4k+2 - high
	eor	v31.8b, v31.8b, q6                          @ GHASH block 4k+2 - mid

	pmull	v8.1q, q6, v13.1d                          @ GHASH block 4k+2 - low

	aese	q0, v23.16b
	aesmc	q0, q0          @ AES block 4k+4 - round 5

	eor	q9, q9, v30.16b                         @ GHASH block 4k+2 - high
	mov	d30, v7.d[1]                                  @ GHASH block 4k+3 - mid

	aese	q1, v23.16b
	aesmc	q1, q1          @ AES block 4k+5 - round 5

	pmull2	v5.1q, q7, v12.2d                          @ GHASH block 4k+3 - high

	aese	q3, v20.16b
	aesmc	q3, q3          @ AES block 4k+7 - round 2
	eor	v30.8b, v30.8b, q7                          @ GHASH block 4k+3 - mid

	aese	q1, v24.16b
	aesmc	q1, q1          @ AES block 4k+5 - round 6

	aese	q0, v24.16b
	aesmc	q0, q0          @ AES block 4k+4 - round 6
	ins	v31.d[1], v31.d[0]                                @ GHASH block 4k+2 - mid

	aese	q3, v21.16b
	aesmc	q3, q3          @ AES block 4k+7 - round 3

	pmull	v30.1q, v30.1d, v16.1d                          @ GHASH block 4k+3 - mid
	eor	v11.16b, v11.16b, q8                         @ GHASH block 4k+2 - low

	aese	q0, v25.16b
	aesmc	q0, q0          @ AES block 4k+4 - round 7

	pmull2	v31.1q, v31.2d, v16.2d                          @ GHASH block 4k+2 - mid
	eor	q9, q9, q5                         @ GHASH block 4k+3 - high

	aese	q1, v25.16b
	aesmc	q1, q1          @ AES block 4k+5 - round 7

	aese	q0, v26.16b
	aesmc	q0, q0          @ AES block 4k+4 - round 8
	movi	q8, #0xc2                                       @ MODULO - constant byte, moved into place by the shl #56 below

	pmull	v6.1q, q7, v12.1d                          @ GHASH block 4k+3 - low

	aese	q1, v26.16b
	aesmc	q1, q1          @ AES block 4k+5 - round 8
	eor	v10.16b, v10.16b, v31.16b                         @ GHASH block 4k+2 - mid

	aese	q2, v21.16b
	aesmc	q2, q2          @ AES block 4k+6 - round 3

	aese	q0, v27.16b
	aesmc	q0, q0          @ AES block 4k+4 - round 9
	eor	v11.16b, v11.16b, q6                         @ GHASH block 4k+3 - low

	aese	q3, v22.16b
	aesmc	q3, q3          @ AES block 4k+7 - round 4

	aese	q2, v22.16b
	aesmc	q2, q2          @ AES block 4k+6 - round 4
	eor	v10.16b, v10.16b, v30.16b                         @ GHASH block 4k+3 - mid

	aese	q0, v28.16b
	aesmc	q0, q0          @ AES block 4k+4 - round 10

	aese	q1, v27.16b
	aesmc	q1, q1          @ AES block 4k+5 - round 9
	eor	v30.16b, v11.16b, q9                         @ MODULO - karatsuba tidy up

	aese	q2, v23.16b
	aesmc	q2, q2          @ AES block 4k+6 - round 5

	aese	q3, v23.16b
	aesmc	q3, q3          @ AES block 4k+7 - round 5
	shl	d8, d8, #56               @ mod_constant

	aese	q1, v28.16b
	aesmc	q1, q1          @ AES block 4k+5 - round 10

	aese	q2, v24.16b
	aesmc	q2, q2          @ AES block 4k+6 - round 6
	ld1	{q4}, [r0], #16                       @ AES block 4k+4 - load ciphertext

	aese	q3, v24.16b
	aesmc	q3, q3          @ AES block 4k+7 - round 6
	eor	v10.16b, v10.16b, v30.16b                         @ MODULO - karatsuba tidy up

	pmull	v31.1q, q9, q8            @ MODULO - top 64b align with mid
	ld1	{q5}, [r0], #16                       @ AES block 4k+5 - load ciphertext
	eor	r23, r23, r13                   @ AES block 4k+3 - round 12 low
#ifdef __ARMEB__
	rev	r23, r23
#endif
	aese	q2, v25.16b
	aesmc	q2, q2          @ AES block 4k+6 - round 7
	ext	q9, q9, q9, #8                     @ MODULO - other top alignment

	aese	q0, v29.16b                                     @ AES block 4k+4 - round 11
	add	r12, r12, #1                            @ CTR block 4k+7

	aese	q3, v25.16b
	aesmc	q3, q3          @ AES block 4k+7 - round 7
	eor	v10.16b, v10.16b, v31.16b                      @ MODULO - fold into mid

	aese	q2, v26.16b
	aesmc	q2, q2          @ AES block 4k+6 - round 8
	ld1	{q6}, [r0], #16                       @ AES block 4k+6 - load ciphertext

	aese	q1, v29.16b                                     @ AES block 4k+5 - round 11
	ld1	{q7}, [r0], #16                       @ AES block 4k+7 - load ciphertext
	rev	r9, r12                                 @ CTR block 4k+8

	aese	q3, v26.16b
	aesmc	q3, q3          @ AES block 4k+7 - round 8
	stp	r21, r22, [r2], #16        @ AES block 4k+2 - store result

	aese	q2, v27.16b
	aesmc	q2, q2          @ AES block 4k+6 - round 9
	eor	v10.16b, v10.16b, q9                         @ MODULO - fold into mid

	@ Flags set here are consumed by the blt at the bottom of the loop; none
	@ of the instructions in between is a flag-setting form.
	cmp	r0, r5                   @ .LOOP CONTROL

	eor	q0, q4, q0                            @ AES block 4k+4 - result
	eor	r24, r24, r14                   @ AES block 4k+3 - round 12 high
#ifdef __ARMEB__
	rev	r24, r24
#endif
	eor	q1, q5, q1                            @ AES block 4k+5 - result

	aese	q2, v28.16b
	aesmc	q2, q2          @ AES block 4k+6 - round 10
	orr	r9, r11, r9, lsl #32            @ CTR block 4k+8

	aese	q3, v27.16b
	aesmc	q3, q3          @ AES block 4k+7 - round 9

	pmull	v8.1q, v10.1d, q8     @ MODULO - mid 64b align with low
	mov	r19, v1.d[0]                            @ AES block 4k+5 - mov low

	mov	r6, v0.d[0]                            @ AES block 4k+4 - mov low
	stp	r23, r24, [r2], #16        @ AES block 4k+3 - store result
	rev64	q5, q5                                    @ GHASH block 4k+5

	aese	q2, v29.16b                                     @ AES block 4k+6 - round 11
	mov	r7, v0.d[1]                            @ AES block 4k+4 - mov high

	aese	q3, v28.16b
	aesmc	q3, q3          @ AES block 4k+7 - round 10
	mov	r20, v1.d[1]                            @ AES block 4k+5 - mov high

	fmov	d0, r10                               @ CTR block 4k+8
	add	r12, r12, #1                            @ CTR block 4k+8
	ext	v10.16b, v10.16b, v10.16b, #8                     @ MODULO - other mid alignment

	eor	q2, q6, q2                            @ AES block 4k+6 - result
	fmov	v0.d[1], r9                               @ CTR block 4k+8
	rev	r9, r12                                 @ CTR block 4k+9

	eor	r6, r6, r13                   @ AES block 4k+4 - round 12 low
#ifdef __ARMEB__
	rev	r6, r6
#endif
	orr	r9, r11, r9, lsl #32            @ CTR block 4k+9
	eor	v11.16b, v11.16b, q8               @ MODULO - fold into low

	fmov	d1, r10                               @ CTR block 4k+9
	add	r12, r12, #1                            @ CTR block 4k+9
	eor	r19, r19, r13                   @ AES block 4k+5 - round 12 low
#ifdef __ARMEB__
	rev	r19, r19
#endif
	fmov	v1.d[1], r9                               @ CTR block 4k+9
	rev	r9, r12                                 @ CTR block 4k+10
	eor	r20, r20, r14                   @ AES block 4k+5 - round 12 high
#ifdef __ARMEB__
	rev	r20, r20
#endif
	eor	r7, r7, r14                   @ AES block 4k+4 - round 12 high
#ifdef __ARMEB__
	rev	r7, r7
#endif
	stp	r6, r7, [r2], #16        @ AES block 4k+4 - store result
	eor	v11.16b, v11.16b, v10.16b                         @ MODULO - fold into low

	add	r12, r12, #1                            @ CTR block 4k+10
	rev64	q4, q4                                    @ GHASH block 4k+4
	orr	r9, r11, r9, lsl #32            @ CTR block 4k+10

	aese	q3, v29.16b                                     @ AES block 4k+7 - round 11
	stp	r19, r20, [r2], #16        @ AES block 4k+5 - store result
	@ NOTE(review): blt is a signed comparison applied to two addresses;
	@ confirm this is intended rather than an unsigned condition.
	blt	.L192_dec_main_loop
3627
.L192_dec_prepretail:@ PREPRETAIL
@ Drains the pipeline without reading further input. Reached by falling out
@ of the main loop, or directly through the bge just before the loop label.
@ Work done here:
@   * finishes and stores the two pending result blocks 4k+2 and 4k+3
@     (r21/r22 and r23/r24) through r2;
@   * folds the four already-loaded ciphertext blocks q4-q7 into the GHASH
@     accumulators v9/v10/v11 and performs the MODULO reduction into v11;
@   * runs counter blocks q0-q3 through aese/aesmc with v18-v28 and a final
@     aese with v29, leaving four keystream blocks for the tail to consume.
@ There is no ld1 from r0 in this region; the next load is in the tail.
@ NOTE(review): v12-v29, r10, r11, r13 and r14 are initialised outside the
@ visible region; their roles are taken from the inline comments.
	mov	r22, v2.d[1]                            @ AES block 4k+2 - mov high
	ext	v11.16b, v11.16b, v11.16b, #8                     @ PRE 0
	eor	q3, q7, q3                            @ AES block 4k+3 - result

	aese	q1, v18.16b
	aesmc	q1, q1          @ AES block 4k+5 - round 0
	mov	r21, v2.d[0]                            @ AES block 4k+2 - mov low

	aese	q0, v18.16b
	aesmc	q0, q0          @ AES block 4k+4 - round 0
	mov	d10, v17.d[1]                               @ GHASH block 4k - mid

	eor	q4, q4, v11.16b                           @ PRE 1
	fmov	d2, r10                               @ CTR block 4k+6

	aese	q1, v19.16b
	aesmc	q1, q1          @ AES block 4k+5 - round 1
	mov	r23, v3.d[0]                            @ AES block 4k+3 - mov low

	aese	q0, v19.16b
	aesmc	q0, q0          @ AES block 4k+4 - round 1
	mov	r24, v3.d[1]                            @ AES block 4k+3 - mov high

	pmull	v11.1q, q4, v15.1d                       @ GHASH block 4k - low
	mov	d8, v4.d[1]                                  @ GHASH block 4k - mid
	fmov	d3, r10                               @ CTR block 4k+7

	aese	q1, v20.16b
	aesmc	q1, q1          @ AES block 4k+5 - round 2
	rev64	q6, q6                                    @ GHASH block 4k+2

	pmull2	v9.1q, q4, v15.2d                       @ GHASH block 4k - high
	fmov	v2.d[1], r9                               @ CTR block 4k+6
	rev	r9, r12                                 @ CTR block 4k+7

	orr	r9, r11, r9, lsl #32            @ CTR block 4k+7
	eor	q8, q8, q4                          @ GHASH block 4k - mid
	mov	d4, v5.d[1]                                  @ GHASH block 4k+1 - mid

	pmull	v31.1q, q5, v14.1d                          @ GHASH block 4k+1 - low
	eor	r24, r24, r14                   @ AES block 4k+3 - round 12 high
#ifdef __ARMEB__
	rev	r24, r24
#endif
	fmov	v3.d[1], r9                               @ CTR block 4k+7

	aese	q0, v20.16b
	aesmc	q0, q0          @ AES block 4k+4 - round 2
	eor	r21, r21, r13                   @ AES block 4k+2 - round 12 low
#ifdef __ARMEB__
	rev	r21, r21
#endif
	pmull2	v30.1q, q5, v14.2d                          @ GHASH block 4k+1 - high
	eor	r22, r22, r14                   @ AES block 4k+2 - round 12 high
#ifdef __ARMEB__
	rev	r22, r22
#endif
	eor	q4, q4, q5                          @ GHASH block 4k+1 - mid

	pmull	v10.1q, q8, v10.1d                      @ GHASH block 4k - mid
	eor	r23, r23, r13                   @ AES block 4k+3 - round 12 low
#ifdef __ARMEB__
	rev	r23, r23
#endif
	stp	r21, r22, [r2], #16        @ AES block 4k+2 - store result

	rev64	q7, q7                                    @ GHASH block 4k+3
	stp	r23, r24, [r2], #16        @ AES block 4k+3 - store result

	aese	q3, v18.16b
	aesmc	q3, q3          @ AES block 4k+7 - round 0
	eor	q9, q9, v30.16b                         @ GHASH block 4k+1 - high

	pmull	v4.1q, q4, v17.1d                          @ GHASH block 4k+1 - mid
	add	r12, r12, #1                            @ CTR block 4k+7

	pmull2	v30.1q, q6, v13.2d                          @ GHASH block 4k+2 - high
	eor	v11.16b, v11.16b, v31.16b                         @ GHASH block 4k+1 - low

	aese	q2, v18.16b
	aesmc	q2, q2          @ AES block 4k+6 - round 0

	eor	v10.16b, v10.16b, q4                         @ GHASH block 4k+1 - mid
	mov	d31, v6.d[1]                                  @ GHASH block 4k+2 - mid

	aese	q3, v19.16b
	aesmc	q3, q3          @ AES block 4k+7 - round 1

	aese	q2, v19.16b
	aesmc	q2, q2          @ AES block 4k+6 - round 1
	eor	q9, q9, v30.16b                         @ GHASH block 4k+2 - high

	eor	v31.8b, v31.8b, q6                          @ GHASH block 4k+2 - mid

	pmull	v8.1q, q6, v13.1d                          @ GHASH block 4k+2 - low

	aese	q2, v20.16b
	aesmc	q2, q2          @ AES block 4k+6 - round 2
	mov	d30, v7.d[1]                                  @ GHASH block 4k+3 - mid

	aese	q3, v20.16b
	aesmc	q3, q3          @ AES block 4k+7 - round 2
	ins	v31.d[1], v31.d[0]                                @ GHASH block 4k+2 - mid

	pmull	v6.1q, q7, v12.1d                          @ GHASH block 4k+3 - low

	aese	q0, v21.16b
	aesmc	q0, q0          @ AES block 4k+4 - round 3
	eor	v30.8b, v30.8b, q7                          @ GHASH block 4k+3 - mid

	aese	q1, v21.16b
	aesmc	q1, q1          @ AES block 4k+5 - round 3

	pmull2	v31.1q, v31.2d, v16.2d                          @ GHASH block 4k+2 - mid
	eor	v11.16b, v11.16b, q8                         @ GHASH block 4k+2 - low

	aese	q0, v22.16b
	aesmc	q0, q0          @ AES block 4k+4 - round 4

	pmull2	v5.1q, q7, v12.2d                          @ GHASH block 4k+3 - high
	movi	q8, #0xc2                                       @ MODULO - constant byte, moved into place by the shl #56 below

	pmull	v30.1q, v30.1d, v16.1d                          @ GHASH block 4k+3 - mid

	aese	q2, v21.16b
	aesmc	q2, q2          @ AES block 4k+6 - round 3

	shl	d8, d8, #56               @ mod_constant
	eor	q9, q9, q5                         @ GHASH block 4k+3 - high

	aese	q0, v23.16b
	aesmc	q0, q0          @ AES block 4k+4 - round 5
	eor	v10.16b, v10.16b, v31.16b                         @ GHASH block 4k+2 - mid

	aese	q2, v22.16b
	aesmc	q2, q2          @ AES block 4k+6 - round 4

	pmull	v31.1q, q9, q8            @ MODULO - top 64b align with mid
	eor	v11.16b, v11.16b, q6                         @ GHASH block 4k+3 - low

	aese	q0, v24.16b
	aesmc	q0, q0          @ AES block 4k+4 - round 6

	aese	q3, v21.16b
	aesmc	q3, q3          @ AES block 4k+7 - round 3
	eor	v10.16b, v10.16b, v30.16b                         @ GHASH block 4k+3 - mid

	aese	q2, v23.16b
	aesmc	q2, q2          @ AES block 4k+6 - round 5

	aese	q0, v25.16b
	aesmc	q0, q0          @ AES block 4k+4 - round 7
	eor	v30.16b, v11.16b, q9                         @ MODULO - karatsuba tidy up

	aese	q3, v22.16b
	aesmc	q3, q3          @ AES block 4k+7 - round 4

	aese	q2, v24.16b
	aesmc	q2, q2          @ AES block 4k+6 - round 6
	ext	q9, q9, q9, #8                     @ MODULO - other top alignment

	aese	q0, v26.16b
	aesmc	q0, q0          @ AES block 4k+4 - round 8

	aese	q3, v23.16b
	aesmc	q3, q3          @ AES block 4k+7 - round 5
	eor	v10.16b, v10.16b, v30.16b                         @ MODULO - karatsuba tidy up

	aese	q1, v22.16b
	aesmc	q1, q1          @ AES block 4k+5 - round 4

	aese	q2, v25.16b
	aesmc	q2, q2          @ AES block 4k+6 - round 7

	aese	q0, v27.16b
	aesmc	q0, q0          @ AES block 4k+4 - round 9

	aese	q1, v23.16b
	aesmc	q1, q1          @ AES block 4k+5 - round 5

	aese	q3, v24.16b
	aesmc	q3, q3          @ AES block 4k+7 - round 6
	eor	v10.16b, v10.16b, v31.16b                      @ MODULO - fold into mid

	aese	q0, v28.16b
	aesmc	q0, q0          @ AES block 4k+4 - round 10

	aese	q1, v24.16b
	aesmc	q1, q1          @ AES block 4k+5 - round 6

	aese	q3, v25.16b
	aesmc	q3, q3          @ AES block 4k+7 - round 7

	aese	q2, v26.16b
	aesmc	q2, q2          @ AES block 4k+6 - round 8
	eor	v10.16b, v10.16b, q9                         @ MODULO - fold into mid

	aese	q1, v25.16b
	aesmc	q1, q1          @ AES block 4k+5 - round 7

	aese	q3, v26.16b
	aesmc	q3, q3          @ AES block 4k+7 - round 8

	aese	q2, v27.16b
	aesmc	q2, q2          @ AES block 4k+6 - round 9

	aese	q1, v26.16b
	aesmc	q1, q1          @ AES block 4k+5 - round 8

	aese	q3, v27.16b
	aesmc	q3, q3          @ AES block 4k+7 - round 9

	pmull	v8.1q, v10.1d, q8     @ MODULO - mid 64b align with low

	aese	q1, v27.16b
	aesmc	q1, q1          @ AES block 4k+5 - round 9

	aese	q2, v28.16b
	aesmc	q2, q2          @ AES block 4k+6 - round 10

	aese	q3, v28.16b
	aesmc	q3, q3          @ AES block 4k+7 - round 10
	ext	v10.16b, v10.16b, v10.16b, #8                     @ MODULO - other mid alignment

	aese	q1, v28.16b
	aesmc	q1, q1          @ AES block 4k+5 - round 10

	aese	q0, v29.16b                                     @ AES block 4k+4 - round 11 (no aesmc after the last aese)
	eor	v11.16b, v11.16b, q8               @ MODULO - fold into low

	aese	q2, v29.16b                                     @ AES block 4k+6 - round 11

	aese	q1, v29.16b                                     @ AES block 4k+5 - round 11

	aese	q3, v29.16b                                     @ AES block 4k+7 - round 11

	eor	v11.16b, v11.16b, v10.16b                         @ MODULO - fold into low
.L192_dec_tail:@ TAIL
@ Handles the last one to four blocks. The first remaining ciphertext block
@ is decrypted unconditionally with keystream q0; the byte count left in r5
@ then selects the entry point into the chain of final-block stages below.
@ On the shorter paths the unused keystream registers are shifted down
@ (q1/q2 into q2/q3) so the later stages always read q2 and q3, and r12 is
@ decremented once per keystream block that will not be consumed.
@ NOTE(review): r4 is assumed to be the end-of-input pointer computed at
@ function entry, which is outside the visible region - confirm.

	sub	r5, r4, r0   @ main_end_input_ptr is number of bytes left to process
	ld1	{ q5}, [r0], #16                      @ AES block 4k+4 - load ciphertext

	eor	q0, q5, q0                            @ AES block 4k+4 - result

	mov	r7, v0.d[1]                            @ AES block 4k+4 - mov high

	mov	r6, v0.d[0]                            @ AES block 4k+4 - mov low

	ext	q8, v11.16b, v11.16b, #8                     @ prepare final partial tag

	cmp	r5, #48                                 @ more than three blocks (48 bytes) left?

	eor	r7, r7, r14                   @ AES block 4k+4 - round 12 high
#ifdef __ARMEB__
	rev	r7, r7
#endif
	eor	r6, r6, r13                   @ AES block 4k+4 - round 12 low
#ifdef __ARMEB__
	rev	r6, r6
#endif
	bgt	.L192_dec_blocks_more_than_3

	@ Three blocks or fewer: the more_than_3 stage, which would otherwise
	@ initialise v11/v9/v10 with plain pmull writes, is skipped, so the
	@ accumulators are cleared here for the eor-accumulate stages that follow.
	movi	v11.8b, #0
	movi	q9, #0

	mov	q3, q2                                  @ shift keystream down: q3 <- q2
	mov	q2, q1                                  @ shift keystream down: q2 <- q1
	sub	r12, r12, #1                            @ one counter block will go unused

	movi	v10.8b, #0
	cmp	r5, #32                                 @ more than two blocks left?
	bgt	.L192_dec_blocks_more_than_2

	mov	q3, q1                                  @ final-block stage reads q3
	cmp	r5, #16                                 @ more than one block left?
	sub	r12, r12, #1                            @ second unused counter block

	bgt	.L192_dec_blocks_more_than_1

	sub	r12, r12, #1                            @ third unused counter block
	b	.L192_dec_blocks_less_than_1
.L192_dec_blocks_more_than_3:@ blocks left >  3
@ Final-3 stage: stores the result already held in r6/r7, hashes the
@ ciphertext block it came from, and decrypts the next block with q1.
@ The GHASH products are written with plain pmull/pmull2 (not accumulated),
@ so this stage initialises v11 (low), v9 (high) and v10 (mid).
@ v22 is used as a GHASH temporary here; no aese in this stage reads it.
@ Falls through into the more_than_2 stage.
	rev64	q4, q5                                    @ GHASH final-3 block
	ld1	{ q5}, [r0], #16                      @ AES final-2 block - load ciphertext

	stp	r6, r7, [r2], #16        @ AES final-3 block  - store result

	eor	q4, q4, q8                           @ feed in partial tag

	eor	q0, q5, q1                            @ AES final-2 block - result

	pmull	v11.1q, q4, v15.1d                       @ GHASH final-3 block - low
	mov	r6, v0.d[0]                            @ AES final-2 block - mov low
	mov	d22, v4.d[1]                                 @ GHASH final-3 block - mid

	mov	r7, v0.d[1]                            @ AES final-2 block - mov high

	mov	d10, v17.d[1]                               @ GHASH final-3 block - mid
	eor	v22.8b, v22.8b, q4                      @ GHASH final-3 block - mid

	pmull2	v9.1q, q4, v15.2d                       @ GHASH final-3 block - high

	eor	r6, r6, r13                   @ AES final-2 block - round 12 low
#ifdef __ARMEB__
	rev	r6, r6
#endif
	movi	q8, #0                                        @ suppress further partial tag feed in

	pmull	v10.1q, v22.1d, v10.1d                    @ GHASH final-3 block - mid
	eor	r7, r7, r14                   @ AES final-2 block - round 12 high
#ifdef __ARMEB__
	rev	r7, r7
#endif
.L192_dec_blocks_more_than_2:@ blocks left >  2
@ Final-2 stage: stores the result held in r6/r7, hashes the ciphertext
@ block it came from using v14 (and v17 for the middle term), and decrypts
@ the next block with q2. The products are XOR-accumulated into v11 (low),
@ v9 (high) and v10 (mid), so those must already be valid on entry: either
@ written by the more_than_3 stage or zeroed by the tail dispatch code.
@ v20-v22 are used as GHASH temporaries; no aese in this stage reads them.
@ Falls through into the more_than_1 stage.

	rev64	q4, q5                                    @ GHASH final-2 block
	ld1	{ q5}, [r0], #16                      @ AES final-1 block - load ciphertext

	eor	q4, q4, q8                           @ feed in partial tag

	movi	q8, #0                                        @ suppress further partial tag feed in

	eor	q0, q5, q2                            @ AES final-1 block - result

	mov	d22, v4.d[1]                                 @ GHASH final-2 block - mid

	pmull	v21.1q, q4, v14.1d                          @ GHASH final-2 block - low

	stp	r6, r7, [r2], #16        @ AES final-2 block  - store result

	eor	v22.8b, v22.8b, q4                      @ GHASH final-2 block - mid
	mov	r7, v0.d[1]                            @ AES final-1 block - mov high

	eor	v11.16b, v11.16b, v21.16b                            @ GHASH final-2 block - low
	mov	r6, v0.d[0]                            @ AES final-1 block - mov low

	pmull2	v20.1q, q4, v14.2d                          @ GHASH final-2 block - high

	pmull	v22.1q, v22.1d, v17.1d                      @ GHASH final-2 block - mid

	eor	q9, q9, v20.16b                            @ GHASH final-2 block - high
	eor	r7, r7, r14                   @ AES final-1 block - round 12 high
#ifdef __ARMEB__
	rev	r7, r7
#endif
	eor	r6, r6, r13                   @ AES final-1 block - round 12 low
#ifdef __ARMEB__
	rev	r6, r6
#endif
	eor	v10.16b, v10.16b, v22.16b                       @ GHASH final-2 block - mid
.L192_dec_blocks_more_than_1:@ blocks left >  1
@ Final-1 stage: stores the result held in r6/r7, hashes the ciphertext
@ block it came from using v13 (and the upper half of v16 for the middle
@ term, hence the ins that duplicates v22.d[0] into v22.d[1] before pmull2),
@ and decrypts the last block with q3. Products are XOR-accumulated into
@ v11 (low), v9 (high) and v10 (mid).
@ v20-v22 are used as GHASH temporaries; no aese in this stage reads them.
@ Falls through into the less_than_1 stage with the last result in r6/r7.

	rev64	q4, q5                                    @ GHASH final-1 block

	eor	q4, q4, q8                           @ feed in partial tag
	ld1	{ q5}, [r0], #16                      @ AES final block - load ciphertext

	mov	d22, v4.d[1]                                 @ GHASH final-1 block - mid

	pmull2	v20.1q, q4, v13.2d                          @ GHASH final-1 block - high

	eor	q0, q5, q3                            @ AES final block - result
	stp	r6, r7, [r2], #16        @ AES final-1 block  - store result

	eor	v22.8b, v22.8b, q4                      @ GHASH final-1 block - mid

	eor	q9, q9, v20.16b                            @ GHASH final-1 block - high

	pmull	v21.1q, q4, v13.1d                          @ GHASH final-1 block - low
	mov	r7, v0.d[1]                            @ AES final block - mov high

	ins	v22.d[1], v22.d[0]                            @ GHASH final-1 block - mid
	mov	r6, v0.d[0]                            @ AES final block - mov low

	pmull2	v22.1q, v22.2d, v16.2d                      @ GHASH final-1 block - mid

	movi	q8, #0                                        @ suppress further partial tag feed in
	eor	v11.16b, v11.16b, v21.16b                            @ GHASH final-1 block - low
	eor	r7, r7, r14                   @ AES final block - round 12 high
#ifdef __ARMEB__
	rev	r7, r7
#endif
	eor	r6, r6, r13                   @ AES final block - round 12 low
#ifdef __ARMEB__
	rev	r6, r6
#endif
	eor	v10.16b, v10.16b, v22.16b                       @ GHASH final-1 block - mid
.L192_dec_blocks_less_than_1:@ blocks left <= 1
@ Last (possibly partial) block, final GHASH reduction and function epilogue.
@   * A 128-bit mask is built in r9 (low half) and r10 (high half) from the
@     bit length in r1. Result bits under the mask come from r6/r7; the rest
@     are kept from the 16 bytes already present at [r2].
@   * The same mask (copied to q0) clears the unused part of the ciphertext
@     block q5 before it is hashed.
@   * The 32-bit counter in r12 is written back to offset 12 of [r16].
@   * The last GHASH products are accumulated, reduced with the 0xc2 << 56
@     constant, and the tag state v11 is stored to [r3].
@   * r0 is set from r15 as the return value, r19-r24 and d8-d15 are reloaded
@     from the 112-byte stack frame, and the frame is released.
@ r13 and r14 are overwritten with all-ones here; they are not used as key
@ material after this point.
@ NOTE(review): r15 is presumably the byte length saved at function entry,
@ and r16 the counter block pointer - both are set outside the visible
@ region; confirm.

	mvn	r13, xzr                                      @ rk12_l = 0xffffffffffffffff
	ldp	r4, r5, [r2]  @ load existing bytes we need to not overwrite
	and	r1, r1, #127                    @ bit_length %= 128

	sub	r1, r1, #128                    @ bit_length -= 128

	neg	r1, r1                          @ bit_length = 128 - #bits in input (in range [1,128])

	and	r1, r1, #127                    @ bit_length %= 128
	mvn	r14, xzr                                      @ rk12_h = 0xffffffffffffffff

	lsr	r14, r14, r1                     @ rk12_h is mask for top 64b of last block
	cmp	r1, #64

	csel	r9, r13, r14, lt                        @ low-half mask: all ones if r1 < 64, else the shifted mask
	csel	r10, r14, xzr, lt                       @ high-half mask: the shifted mask if r1 < 64, else zero

	fmov	d0, r9                                   @ ctr0b is mask for last block
	and	r6, r6, r9                              @ keep only the valid low result bits
	bic	r4, r4, r9           @ mask out low existing bytes

	orr	r6, r6, r4                              @ merge result with preserved existing low bytes
	mov	v0.d[1], r10                            @ q0 now holds the full 128-bit mask
#ifndef __ARMEB__
	rev	r9, r12
#else
	mov	r9, r12
#endif

	and	q5, q5, q0                            @ possibly partial last block has zeroes in highest bits
	str	r9, [r16, #12]                          @ store the updated counter

	rev64	q4, q5                                    @ GHASH final block

	eor	q4, q4, q8                           @ feed in partial tag
	bic	r5, r5, r10 @ mask out high existing bytes

	and	r7, r7, r10                             @ keep only the valid high result bits

	pmull2	v20.1q, q4, v12.2d                          @ GHASH final block - high
	mov	d8, v4.d[1]                                  @ GHASH final block - mid

	pmull	v21.1q, q4, v12.1d                          @ GHASH final block - low

	eor	q8, q8, q4                          @ GHASH final block - mid

	eor	q9, q9, v20.16b                            @ GHASH final block - high

	pmull	v8.1q, q8, v16.1d                          @ GHASH final block - mid

	eor	v11.16b, v11.16b, v21.16b                            @ GHASH final block - low

	eor	v10.16b, v10.16b, q8                         @ GHASH final block - mid
	movi	q8, #0xc2                                       @ MODULO - constant byte, moved into place by the shl #56 below

	eor	v30.16b, v11.16b, q9                         @ MODULO - karatsuba tidy up

	shl	d8, d8, #56               @ mod_constant

	eor	v10.16b, v10.16b, v30.16b                         @ MODULO - karatsuba tidy up

	pmull	v31.1q, q9, q8            @ MODULO - top 64b align with mid
	orr	r7, r7, r5                              @ merge result with preserved existing high bytes
	stp	r6, r7, [r2]                            @ store last block (no post-increment)

	ext	q9, q9, q9, #8                     @ MODULO - other top alignment

	eor	v10.16b, v10.16b, v31.16b                      @ MODULO - fold into mid

	eor	v10.16b, v10.16b, q9                         @ MODULO - fold into mid

	pmull	v8.1q, v10.1d, q8     @ MODULO - mid 64b align with low

	eor	v11.16b, v11.16b, q8               @ MODULO - fold into low

	ext	v10.16b, v10.16b, v10.16b, #8                     @ MODULO - other mid alignment

	eor	v11.16b, v11.16b, v10.16b                         @ MODULO - fold into low
	ext	v11.16b, v11.16b, v11.16b, #8                     @ swap the two 64-bit halves of the tag state
	rev64	v11.16b, v11.16b                                  @ byte-reverse each half before storing
	mov	r0, r15                                 @ return value
	st1	{ v11.16b }, [r3]                       @ write the tag state back

	ldp	r21, r22, [sp, #16]                     @ reload saved registers from the frame
	ldp	r23, r24, [sp, #32]
	ldp	d8, d9, [sp, #48]
	ldp	d10, d11, [sp, #64]
	ldp	d12, d13, [sp, #80]
	ldp	d14, d15, [sp, #96]
	ldp	r19, r20, [sp], #112                    @ reload r19/r20 and release the 112-byte frame
	RET
4109
.L192_dec_ret:
@ Alternate exit that returns 0 in r0. Nothing is restored from the stack on
@ this path. NOTE(review): the branch that targets this label is outside the
@ visible region; presumably it is an early-out taken before the stack frame
@ is created - confirm.
	mov	r0, #0x0
	RET
4113.size	aes_gcm_dec_192_kernel,.-aes_gcm_dec_192_kernel
4114.globl	aes_gcm_enc_256_kernel
4115.type	aes_gcm_enc_256_kernel,%function
4116.align	4
4117aes_gcm_enc_256_kernel:
4118	cbz	r1, .L256_enc_ret
4119	stp	r19, r20, [sp, #-112]!
4120	mov	r16, r4
4121	mov	r8, r5
4122	stp	r21, r22, [sp, #16]
4123	stp	r23, r24, [sp, #32]
4124	stp	d8, d9, [sp, #48]
4125	stp	d10, d11, [sp, #64]
4126	stp	d12, d13, [sp, #80]
4127	stp	d14, d15, [sp, #96]
4128
4129	add	r4, r0, r1, lsr #3   @ end_input_ptr
4130	lsr	r5, r1, #3              @ byte_len
4131	mov	r15, r5
4132	ldp	r10, r11, [r16]              @ ctr96_b64, ctr96_t32
4133#ifdef __ARMEB__
4134	rev	r10, r10
4135	rev	r11, r11
4136#endif
4137	ldp	r13, r14, [r8, #224]                     @ load rk14
4138#ifdef __ARMEB__
4139	ror	r13, r13, #32
4140	ror	r14, r14, #32
4141#endif
4142	ld1	{ q0}, [r16]                             @ special case vector load initial counter so we can start first AES block as quickly as possible
4143	sub	r5, r5, #1      @ byte_len - 1
4144
4145	ld1	{v18.4s}, [r8], #16                               @ load rk0
4146	and	r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
4147
4148	ld1	{v19.4s}, [r8], #16                               @ load rk1
4149	add	r5, r5, r0
4150
4151	lsr	r12, r11, #32
4152	fmov	d2, r10                               @ CTR block 2
4153	orr	r11, r11, r11
4154
4155	rev	r12, r12                                @ rev_ctr32
4156	cmp	r0, r5                   @ check if we have <= 4 blocks
4157	fmov	d1, r10                               @ CTR block 1
4158
4159	aese	q0, v18.16b
4160	aesmc	q0, q0          @ AES block 0 - round 0
4161	add	r12, r12, #1                            @ increment rev_ctr32
4162
4163	rev	r9, r12                                 @ CTR block 1
4164	fmov	d3, r10                               @ CTR block 3
4165
4166	orr	r9, r11, r9, lsl #32            @ CTR block 1
4167	add	r12, r12, #1                            @ CTR block 1
4168	ld1	{v20.4s}, [r8], #16                               @ load rk2
4169
4170	fmov	v1.d[1], r9                               @ CTR block 1
4171	rev	r9, r12                                 @ CTR block 2
4172	add	r12, r12, #1                            @ CTR block 2
4173
4174	orr	r9, r11, r9, lsl #32            @ CTR block 2
4175	ld1	{v21.4s}, [r8], #16                               @ load rk3
4176
4177	fmov	v2.d[1], r9                               @ CTR block 2
4178	rev	r9, r12                                 @ CTR block 3
4179
4180	aese	q0, v19.16b
4181	aesmc	q0, q0          @ AES block 0 - round 1
4182	orr	r9, r11, r9, lsl #32            @ CTR block 3
4183
4184	fmov	v3.d[1], r9                               @ CTR block 3
4185
4186	aese	q1, v18.16b
4187	aesmc	q1, q1          @ AES block 1 - round 0
4188	ld1	{v22.4s}, [r8], #16                               @ load rk4
4189
4190	aese	q0, v20.16b
4191	aesmc	q0, q0          @ AES block 0 - round 2
4192	ld1	{v23.4s}, [r8], #16                               @ load rk5
4193
4194	aese	q2, v18.16b
4195	aesmc	q2, q2          @ AES block 2 - round 0
4196	ld1	{v24.4s}, [r8], #16                               @ load rk6
4197
4198	aese	q1, v19.16b
4199	aesmc	q1, q1          @ AES block 1 - round 1
4200	ldr	q14, [r3, #80]                         @ load h3l | h3h
4201#ifndef __ARMEB__
4202	ext	v14.16b, v14.16b, v14.16b, #8
4203#endif
4204	aese	q3, v18.16b
4205	aesmc	q3, q3          @ AES block 3 - round 0
4206	ld1	{v25.4s}, [r8], #16                               @ load rk7
4207
4208	aese	q2, v19.16b
4209	aesmc	q2, q2          @ AES block 2 - round 1
4210	ld1	{v26.4s}, [r8], #16                               @ load rk8
4211
4212	aese	q1, v20.16b
4213	aesmc	q1, q1          @ AES block 1 - round 2
4214	ldr	q13, [r3, #64]                         @ load h2l | h2h
4215#ifndef __ARMEB__
4216	ext	v13.16b, v13.16b, v13.16b, #8
4217#endif
4218	aese	q3, v19.16b
4219	aesmc	q3, q3          @ AES block 3 - round 1
4220	ld1	{v27.4s}, [r8], #16                               @ load rk9
4221
4222	aese	q2, v20.16b
4223	aesmc	q2, q2          @ AES block 2 - round 2
4224	ldr	q15, [r3, #112]                        @ load h4l | h4h
4225#ifndef __ARMEB__
4226	ext	v15.16b, v15.16b, v15.16b, #8
4227#endif
4228	aese	q1, v21.16b
4229	aesmc	q1, q1          @ AES block 1 - round 3
4230	ld1	{v28.4s}, [r8], #16                              @ load rk10
4231
4232	aese	q3, v20.16b
4233	aesmc	q3, q3          @ AES block 3 - round 2
4234	ld1	{v29.4s}, [r8], #16                              @ load rk11
4235
4236	aese	q2, v21.16b
4237	aesmc	q2, q2          @ AES block 2 - round 3
4238	add	r12, r12, #1                            @ CTR block 3
4239
4240	aese	q0, v21.16b
4241	aesmc	q0, q0          @ AES block 0 - round 3
4242
4243	aese	q3, v21.16b
4244	aesmc	q3, q3          @ AES block 3 - round 3
4245	ld1	{ v11.16b}, [r3]
4246	ext	v11.16b, v11.16b, v11.16b, #8
4247	rev64	v11.16b, v11.16b
4248
4249	aese	q2, v22.16b
4250	aesmc	q2, q2          @ AES block 2 - round 4
4251
4252	aese	q0, v22.16b
4253	aesmc	q0, q0          @ AES block 0 - round 4
4254
4255	aese	q1, v22.16b
4256	aesmc	q1, q1          @ AES block 1 - round 4
4257
4258	aese	q3, v22.16b
4259	aesmc	q3, q3          @ AES block 3 - round 4
4260
4261	aese	q0, v23.16b
4262	aesmc	q0, q0          @ AES block 0 - round 5
4263
4264	aese	q1, v23.16b
4265	aesmc	q1, q1          @ AES block 1 - round 5
4266
4267	aese	q3, v23.16b
4268	aesmc	q3, q3          @ AES block 3 - round 5
4269
4270	aese	q2, v23.16b
4271	aesmc	q2, q2          @ AES block 2 - round 5
4272
4273	aese	q1, v24.16b
4274	aesmc	q1, q1          @ AES block 1 - round 6
4275	trn2	v17.2d,  v14.2d,    v15.2d                      @ h4l | h3l
4276
4277	aese	q3, v24.16b
4278	aesmc	q3, q3          @ AES block 3 - round 6
4279	ld1	{v30.4s}, [r8], #16                              @ load rk12
4280
4281	aese	q0, v24.16b
4282	aesmc	q0, q0          @ AES block 0 - round 6
4283	ldr	q12, [r3, #32]                         @ load h1l | h1h
4284#ifndef __ARMEB__
4285	ext	v12.16b, v12.16b, v12.16b, #8
4286#endif
4287	aese	q2, v24.16b
4288	aesmc	q2, q2          @ AES block 2 - round 6
4289	ld1	{v31.4s}, [r8], #16                              @ load rk13
4290
4291	aese	q1, v25.16b
4292	aesmc	q1, q1          @ AES block 1 - round 7
4293	trn1	q9, v14.2d,    v15.2d                      @ h4h | h3h
4294
4295	aese	q0, v25.16b
4296	aesmc	q0, q0          @ AES block 0 - round 7
4297
4298	aese	q2, v25.16b
4299	aesmc	q2, q2          @ AES block 2 - round 7
4300
4301	aese	q3, v25.16b
4302	aesmc	q3, q3          @ AES block 3 - round 7
4303	trn2	v16.2d,  v12.2d,    v13.2d                      @ h2l | h1l
4304
4305	aese	q1, v26.16b
4306	aesmc	q1, q1          @ AES block 1 - round 8
4307
4308	aese	q2, v26.16b
4309	aesmc	q2, q2          @ AES block 2 - round 8
4310
4311	aese	q3, v26.16b
4312	aesmc	q3, q3          @ AES block 3 - round 8
4313
4314	aese	q1, v27.16b
4315	aesmc	q1, q1          @ AES block 1 - round 9
4316
4317	aese	q2, v27.16b
4318	aesmc	q2, q2          @ AES block 2 - round 9
4319
4320	aese	q0, v26.16b
4321	aesmc	q0, q0          @ AES block 0 - round 8
4322
4323	aese	q1, v28.16b
4324	aesmc	q1, q1          @ AES block 1 - round 10
4325
4326	aese	q3, v27.16b
4327	aesmc	q3, q3          @ AES block 3 - round 9
4328
4329	aese	q0, v27.16b
4330	aesmc	q0, q0          @ AES block 0 - round 9
4331
4332	aese	q2, v28.16b
4333	aesmc	q2, q2          @ AES block 2 - round 10
4334
4335	aese	q3, v28.16b
4336	aesmc	q3, q3          @ AES block 3 - round 10
4337
4338	aese	q1, v29.16b
4339	aesmc	q1, q1          @ AES block 1 - round 11
4340
4341	aese	q2, v29.16b
4342	aesmc	q2, q2          @ AES block 2 - round 11
4343
4344	aese	q0, v28.16b
4345	aesmc	q0, q0          @ AES block 0 - round 10
4346
4347	aese	q1, v30.16b
4348	aesmc	q1, q1          @ AES block 1 - round 12
4349
4350	aese	q2, v30.16b
4351	aesmc	q2, q2          @ AES block 2 - round 12
4352
4353	aese	q0, v29.16b
4354	aesmc	q0, q0          @ AES block 0 - round 11
4355	eor	v17.16b, v17.16b, q9                  @ h4k | h3k
4356
4357	aese	q3, v29.16b
4358	aesmc	q3, q3          @ AES block 3 - round 11
4359
4360	aese	q2, v31.16b                                     @ AES block 2 - round 13
4361	trn1	q8,    v12.2d,    v13.2d                      @ h2h | h1h
4362
4363	aese	q0, v30.16b
4364	aesmc	q0, q0          @ AES block 0 - round 12
4365
4366	aese	q3, v30.16b
4367	aesmc	q3, q3          @ AES block 3 - round 12
4368
4369	aese	q1, v31.16b                                     @ AES block 1 - round 13
4370
4371	aese	q0, v31.16b                                     @ AES block 0 - round 13
4372	cmp	r0, r5                   @ input_ptr vs main_end_input_ptr: sets the flags tested by the bge below (no earlier visible instruction sets them)
4373	aese	q3, v31.16b                                     @ AES block 3 - round 13
4374	eor	v16.16b, v16.16b, q8                     @ h2k | h1k
4375	bge	.L256_enc_tail                                    @ handle tail: taken when r0 >= r5, i.e. no full pass of the main loop is possible
4376
4377	ldp	r19, r20, [r0, #16]           @ AES block 1 - load plaintext
4378#ifdef __ARMEB__
4379	rev	r19, r19
4380	rev	r20, r20
4381#endif
4382	rev	r9, r12                                 @ CTR block 4
4383	ldp	r6, r7, [r0, #0]            @ AES block 0 - load plaintext
4384#ifdef __ARMEB__
4385	rev	r6, r6
4386	rev	r7, r7
4387#endif
4388	ldp	r23, r24, [r0, #48]           @ AES block 3 - load plaintext
4389#ifdef __ARMEB__
4390	rev	r23, r23
4391	rev	r24, r24
4392#endif
4393	ldp	r21, r22, [r0, #32]           @ AES block 2 - load plaintext
4394#ifdef __ARMEB__
4395	rev	r21, r21
4396	rev	r22, r22
4397#endif
4398	add	r0, r0, #64                       @ AES input_ptr update
4399
4400	eor	r19, r19, r13                     @ AES block 1 - round 14 low
4401	eor	r20, r20, r14                     @ AES block 1 - round 14 high
4402
4403	fmov	d5, r19                               @ AES block 1 - mov low
4404	eor	r6, r6, r13                     @ AES block 0 - round 14 low
4405
4406	eor	r7, r7, r14                     @ AES block 0 - round 14 high
4407	eor	r24, r24, r14                     @ AES block 3 - round 14 high
4408	fmov	d4, r6                               @ AES block 0 - mov low
4409
4410	cmp	r0, r5                   @ check if we have <= 8 blocks
4411	fmov	v4.d[1], r7                           @ AES block 0 - mov high
4412	eor	r23, r23, r13                     @ AES block 3 - round 14 low
4413
4414	eor	r21, r21, r13                     @ AES block 2 - round 14 low
4415	fmov	v5.d[1], r20                           @ AES block 1 - mov high
4416
4417	fmov	d6, r21                               @ AES block 2 - mov low
4418	add	r12, r12, #1                            @ CTR block 4
4419
4420	orr	r9, r11, r9, lsl #32            @ CTR block 4
4421	fmov	d7, r23                               @ AES block 3 - mov low
4422	eor	r22, r22, r14                     @ AES block 2 - round 14 high
4423
4424	fmov	v6.d[1], r22                           @ AES block 2 - mov high
4425
4426	eor	q4, q4, q0                          @ AES block 0 - result
4427	fmov	d0, r10                               @ CTR block 4
4428
4429	fmov	v0.d[1], r9                               @ CTR block 4
4430	rev	r9, r12                                 @ CTR block 5
4431	add	r12, r12, #1                            @ CTR block 5
4432
4433	eor	q5, q5, q1                          @ AES block 1 - result
4434	fmov	d1, r10                               @ CTR block 5
4435	orr	r9, r11, r9, lsl #32            @ CTR block 5
4436
4437	fmov	v1.d[1], r9                               @ CTR block 5
4438	rev	r9, r12                                 @ CTR block 6
4439	st1	{ q4}, [r2], #16                     @ AES block 0 - store result
4440
4441	fmov	v7.d[1], r24                           @ AES block 3 - mov high
4442	orr	r9, r11, r9, lsl #32            @ CTR block 6
4443	eor	q6, q6, q2                          @ AES block 2 - result
4444
4445	st1	{ q5}, [r2], #16                     @ AES block 1 - store result
4446
4447	add	r12, r12, #1                            @ CTR block 6
4448	fmov	d2, r10                               @ CTR block 6
4449
4450	fmov	v2.d[1], r9                               @ CTR block 6
4451	st1	{ q6}, [r2], #16                     @ AES block 2 - store result
4452	rev	r9, r12                                 @ CTR block 7
4453
4454	orr	r9, r11, r9, lsl #32            @ CTR block 7
4455
4456	eor	q7, q7, q3                          @ AES block 3 - result
4457	st1	{ q7}, [r2], #16                     @ AES block 3 - store result
4458	bge	.L256_enc_prepretail                               @ do prepretail
4459
4460.L256_enc_main_loop:@ main loop start: 14 AES rounds (keys v18-v31) on the four counter blocks in q0-q3, interleaved with GHASH of the four most recently stored output blocks in q4-q7
4461	aese	q0, v18.16b
4462	aesmc	q0, q0          @ AES block 4k+4 - round 0
4463	rev64	q4, q4                                    @ GHASH block 4k (only t0 is free)
4464
4465	aese	q1, v18.16b
4466	aesmc	q1, q1          @ AES block 4k+5 - round 0
4467	fmov	d3, r10                               @ CTR block 4k+3
4468
4469	aese	q2, v18.16b
4470	aesmc	q2, q2          @ AES block 4k+6 - round 0
4471	ext	v11.16b, v11.16b, v11.16b, #8                     @ PRE 0
4472
4473	aese	q0, v19.16b
4474	aesmc	q0, q0          @ AES block 4k+4 - round 1
4475	fmov	v3.d[1], r9                               @ CTR block 4k+3
4476
4477	aese	q1, v19.16b
4478	aesmc	q1, q1          @ AES block 4k+5 - round 1
4479	ldp	r23, r24, [r0, #48]           @ AES block 4k+7 - load plaintext
4480#ifdef __ARMEB__
4481	rev	r23, r23
4482	rev	r24, r24
4483#endif
4484	aese	q2, v19.16b
4485	aesmc	q2, q2          @ AES block 4k+6 - round 1
4486	ldp	r21, r22, [r0, #32]           @ AES block 4k+6 - load plaintext
4487#ifdef __ARMEB__
4488	rev	r21, r21
4489	rev	r22, r22
4490#endif
4491	aese	q0, v20.16b
4492	aesmc	q0, q0          @ AES block 4k+4 - round 2
4493	eor	q4, q4, v11.16b                           @ PRE 1
4494	@ PRE 0/1: the running hash in v11 (halves swapped by the ext above) is XORed into block 4k before that block is multiplied
4495	aese	q1, v20.16b
4496	aesmc	q1, q1          @ AES block 4k+5 - round 2
4497
4498	aese	q3, v18.16b
4499	aesmc	q3, q3          @ AES block 4k+7 - round 0
4500	eor	r23, r23, r13                     @ AES block 4k+7 - round 14 low
4501	@ plaintext is XORed with r13/r14 (the "round 14" key words) in general registers; the vector eor with the AES state later in the pass produces the output block
4502	aese	q0, v21.16b
4503	aesmc	q0, q0          @ AES block 4k+4 - round 3
4504	mov	d10, v17.d[1]                               @ GHASH block 4k - mid
4505
4506	pmull2	v9.1q, q4, v15.2d                       @ GHASH block 4k - high
4507	eor	r22, r22, r14                     @ AES block 4k+6 - round 14 high
4508	mov	d8, v4.d[1]                                  @ GHASH block 4k - mid
4509
4510	aese	q3, v19.16b
4511	aesmc	q3, q3          @ AES block 4k+7 - round 1
4512	rev64	q5, q5                                    @ GHASH block 4k+1 (t0 and t1 free)
4513
4514	aese	q0, v22.16b
4515	aesmc	q0, q0          @ AES block 4k+4 - round 4
4516
4517	pmull	v11.1q, q4, v15.1d                       @ GHASH block 4k - low
4518	eor	q8, q8, q4                          @ GHASH block 4k - mid
4519
4520	aese	q2, v20.16b
4521	aesmc	q2, q2          @ AES block 4k+6 - round 2
4522
4523	aese	q0, v23.16b
4524	aesmc	q0, q0          @ AES block 4k+4 - round 5
4525	rev64	q7, q7                                    @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
4526
4527	pmull2	v4.1q, q5, v14.2d                          @ GHASH block 4k+1 - high
4528
4529	pmull	v10.1q, q8, v10.1d                      @ GHASH block 4k - mid
4530	rev64	q6, q6                                    @ GHASH block 4k+2 (t0, t1, and t2 free)
4531
4532	pmull	v8.1q, q5, v14.1d                          @ GHASH block 4k+1 - low
4533
4534	eor	q9, q9, q4                         @ GHASH block 4k+1 - high
4535	mov	d4, v5.d[1]                                  @ GHASH block 4k+1 - mid
4536
4537	aese	q1, v21.16b
4538	aesmc	q1, q1          @ AES block 4k+5 - round 3
4539
4540	aese	q3, v20.16b
4541	aesmc	q3, q3          @ AES block 4k+7 - round 2
4542	eor	v11.16b, v11.16b, q8                         @ GHASH block 4k+1 - low
4543
4544	aese	q2, v21.16b
4545	aesmc	q2, q2          @ AES block 4k+6 - round 3
4546
4547	aese	q1, v22.16b
4548	aesmc	q1, q1          @ AES block 4k+5 - round 4
4549	mov	d8, v6.d[1]                                  @ GHASH block 4k+2 - mid
4550
4551	aese	q3, v21.16b
4552	aesmc	q3, q3          @ AES block 4k+7 - round 3
4553	eor	q4, q4, q5                          @ GHASH block 4k+1 - mid
4554
4555	aese	q2, v22.16b
4556	aesmc	q2, q2          @ AES block 4k+6 - round 4
4557
4558	aese	q0, v24.16b
4559	aesmc	q0, q0          @ AES block 4k+4 - round 6
4560	eor	q8, q8, q6                          @ GHASH block 4k+2 - mid
4561
4562	aese	q3, v22.16b
4563	aesmc	q3, q3          @ AES block 4k+7 - round 4
4564
4565	pmull	v4.1q, q4, v17.1d                          @ GHASH block 4k+1 - mid
4566
4567	aese	q0, v25.16b
4568	aesmc	q0, q0          @ AES block 4k+4 - round 7
4569
4570	aese	q3, v23.16b
4571	aesmc	q3, q3          @ AES block 4k+7 - round 5
4572	ins	v8.d[1], v8.d[0]                                @ GHASH block 4k+2 - mid
4573
4574	aese	q1, v23.16b
4575	aesmc	q1, q1          @ AES block 4k+5 - round 5
4576
4577	aese	q0, v26.16b
4578	aesmc	q0, q0          @ AES block 4k+4 - round 8
4579
4580	aese	q2, v23.16b
4581	aesmc	q2, q2          @ AES block 4k+6 - round 5
4582
4583	aese	q1, v24.16b
4584	aesmc	q1, q1          @ AES block 4k+5 - round 6
4585	eor	v10.16b, v10.16b, q4                         @ GHASH block 4k+1 - mid
4586
4587	pmull2	v4.1q, q6, v13.2d                          @ GHASH block 4k+2 - high
4588
4589	pmull	v5.1q, q6, v13.1d                          @ GHASH block 4k+2 - low
4590
4591	aese	q1, v25.16b
4592	aesmc	q1, q1          @ AES block 4k+5 - round 7
4593
4594	pmull	v6.1q, q7, v12.1d                          @ GHASH block 4k+3 - low
4595	eor	q9, q9, q4                         @ GHASH block 4k+2 - high
4596
4597	aese	q3, v24.16b
4598	aesmc	q3, q3          @ AES block 4k+7 - round 6
4599	ldp	r19, r20, [r0, #16]           @ AES block 4k+5 - load plaintext
4600#ifdef __ARMEB__
4601	rev	r19, r19
4602	rev	r20, r20
4603#endif
4604	aese	q1, v26.16b
4605	aesmc	q1, q1          @ AES block 4k+5 - round 8
4606	mov	d4, v7.d[1]                                  @ GHASH block 4k+3 - mid
4607
4608	aese	q2, v24.16b
4609	aesmc	q2, q2          @ AES block 4k+6 - round 6
4610	eor	v11.16b, v11.16b, q5                         @ GHASH block 4k+2 - low
4611
4612	pmull2	v8.1q, q8, v16.2d                          @ GHASH block 4k+2 - mid
4613
4614	pmull2	v5.1q, q7, v12.2d                          @ GHASH block 4k+3 - high
4615	eor	q4, q4, q7                          @ GHASH block 4k+3 - mid
4616
4617	aese	q2, v25.16b
4618	aesmc	q2, q2          @ AES block 4k+6 - round 7
4619	eor	r19, r19, r13                     @ AES block 4k+5 - round 14 low
4620
4621	aese	q1, v27.16b
4622	aesmc	q1, q1          @ AES block 4k+5 - round 9
4623	eor	v10.16b, v10.16b, q8                         @ GHASH block 4k+2 - mid
4624
4625	aese	q3, v25.16b
4626	aesmc	q3, q3          @ AES block 4k+7 - round 7
4627	eor	r21, r21, r13                     @ AES block 4k+6 - round 14 low
4628
4629	aese	q0, v27.16b
4630	aesmc	q0, q0          @ AES block 4k+4 - round 9
4631	movi	q8, #0xc2
4632	@ q8 was last read as a GHASH temporary just above; from here it is rebuilt as the reduction constant (movi here, shl #56 below)
4633	pmull	v4.1q, q4, v16.1d                          @ GHASH block 4k+3 - mid
4634	eor	q9, q9, q5                         @ GHASH block 4k+3 - high
4635	fmov	d5, r19                               @ AES block 4k+5 - mov low
4636
4637	aese	q2, v26.16b
4638	aesmc	q2, q2          @ AES block 4k+6 - round 8
4639	ldp	r6, r7, [r0, #0]            @ AES block 4k+4 - load plaintext
4640#ifdef __ARMEB__
4641	rev	r6, r6
4642	rev	r7, r7
4643#endif
4644	aese	q0, v28.16b
4645	aesmc	q0, q0          @ AES block 4k+4 - round 10
4646	shl	d8, d8, #56               @ mod_constant
4647
4648	aese	q3, v26.16b
4649	aesmc	q3, q3          @ AES block 4k+7 - round 8
4650	eor	v11.16b, v11.16b, q6                         @ GHASH block 4k+3 - low
4651
4652	aese	q2, v27.16b
4653	aesmc	q2, q2          @ AES block 4k+6 - round 9
4654
4655	aese	q1, v28.16b
4656	aesmc	q1, q1          @ AES block 4k+5 - round 10
4657	eor	v10.16b, v10.16b, q4                         @ GHASH block 4k+3 - mid
4658
4659	aese	q3, v27.16b
4660	aesmc	q3, q3          @ AES block 4k+7 - round 9
4661	add	r12, r12, #1                            @ CTR block 4k+3
4662
4663	aese	q0, v29.16b
4664	aesmc	q0, q0          @ AES block 4k+4 - round 11
4665	eor	q4, v11.16b, q9                         @ MODULO - karatsuba tidy up
4666	@ the MODULO steps from here on fold the high (q9) and mid (v10) accumulators into v11, which ends the pass as the new running hash
4667	aese	q1, v29.16b
4668	aesmc	q1, q1          @ AES block 4k+5 - round 11
4669	add	r0, r0, #64                       @ AES input_ptr update
4670
4671	pmull	v7.1q, q9, q8            @ MODULO - top 64b align with mid
4672	rev	r9, r12                                 @ CTR block 4k+8
4673	ext	q9, q9, q9, #8                     @ MODULO - other top alignment
4674
4675	aese	q2, v28.16b
4676	aesmc	q2, q2          @ AES block 4k+6 - round 10
4677	eor	r6, r6, r13                     @ AES block 4k+4 - round 14 low
4678
4679	aese	q1, v30.16b
4680	aesmc	q1, q1          @ AES block 4k+5 - round 12
4681	eor	v10.16b, v10.16b, q4                         @ MODULO - karatsuba tidy up
4682
4683	aese	q3, v28.16b
4684	aesmc	q3, q3          @ AES block 4k+7 - round 10
4685	eor	r7, r7, r14                     @ AES block 4k+4 - round 14 high
4686
4687	fmov	d4, r6                               @ AES block 4k+4 - mov low
4688	orr	r9, r11, r9, lsl #32            @ CTR block 4k+8
4689	eor	q7, q9, q7                   @ MODULO - fold into mid
4690
4691	aese	q0, v30.16b
4692	aesmc	q0, q0          @ AES block 4k+4 - round 12
4693	eor	r20, r20, r14                     @ AES block 4k+5 - round 14 high
4694
4695	aese	q2, v29.16b
4696	aesmc	q2, q2          @ AES block 4k+6 - round 11
4697	eor	r24, r24, r14                     @ AES block 4k+7 - round 14 high
4698
4699	aese	q3, v29.16b
4700	aesmc	q3, q3          @ AES block 4k+7 - round 11
4701	add	r12, r12, #1                            @ CTR block 4k+8
4702
4703	aese	q0, v31.16b                                     @ AES block 4k+4 - round 13
4704	fmov	v4.d[1], r7                           @ AES block 4k+4 - mov high
4705	eor	v10.16b, v10.16b, q7                      @ MODULO - fold into mid
4706
4707	aese	q2, v30.16b
4708	aesmc	q2, q2          @ AES block 4k+6 - round 12
4709	fmov	d7, r23                               @ AES block 4k+7 - mov low
4710
4711	aese	q1, v31.16b                                     @ AES block 4k+5 - round 13
4712	fmov	v5.d[1], r20                           @ AES block 4k+5 - mov high
4713
4714	fmov	d6, r21                               @ AES block 4k+6 - mov low
4715	cmp	r0, r5                   @ .LOOP CONTROL
4716	@ the flags set by this cmp are consumed by the blt at the bottom of the loop; no instruction in between writes them
4717	fmov	v6.d[1], r22                           @ AES block 4k+6 - mov high
4718
4719	pmull	v9.1q, v10.1d, q8            @ MODULO - mid 64b align with low
4720	eor	q4, q4, q0                          @ AES block 4k+4 - result
4721	fmov	d0, r10                               @ CTR block 4k+8
4722	@ each new counter block is r10 in the low lane and (rev(r12) << 32 | r11) in the high lane; r12 is incremented once per block
4723	fmov	v0.d[1], r9                               @ CTR block 4k+8
4724	rev	r9, r12                                 @ CTR block 4k+9
4725	add	r12, r12, #1                            @ CTR block 4k+9
4726
4727	eor	q5, q5, q1                          @ AES block 4k+5 - result
4728	fmov	d1, r10                               @ CTR block 4k+9
4729	orr	r9, r11, r9, lsl #32            @ CTR block 4k+9
4730
4731	aese	q3, v30.16b
4732	aesmc	q3, q3          @ AES block 4k+7 - round 12
4733	fmov	v1.d[1], r9                               @ CTR block 4k+9
4734
4735	aese	q2, v31.16b                                     @ AES block 4k+6 - round 13
4736	rev	r9, r12                                 @ CTR block 4k+10
4737	st1	{ q4}, [r2], #16                     @ AES block 4k+4 - store result
4738
4739	orr	r9, r11, r9, lsl #32            @ CTR block 4k+10
4740	eor	v11.16b, v11.16b, q9                         @ MODULO - fold into low
4741	fmov	v7.d[1], r24                           @ AES block 4k+7 - mov high
4742
4743	ext	v10.16b, v10.16b, v10.16b, #8                     @ MODULO - other mid alignment
4744	st1	{ q5}, [r2], #16                     @ AES block 4k+5 - store result
4745	add	r12, r12, #1                            @ CTR block 4k+10
4746
4747	aese	q3, v31.16b                                     @ AES block 4k+7 - round 13
4748	eor	q6, q6, q2                          @ AES block 4k+6 - result
4749	fmov	d2, r10                               @ CTR block 4k+10
4750
4751	st1	{ q6}, [r2], #16                     @ AES block 4k+6 - store result
4752	fmov	v2.d[1], r9                               @ CTR block 4k+10
4753	rev	r9, r12                                 @ CTR block 4k+11
4754
4755	eor	v11.16b, v11.16b, v10.16b                         @ MODULO - fold into low
4756	orr	r9, r11, r9, lsl #32            @ CTR block 4k+11
4757
4758	eor	q7, q7, q3                          @ AES block 4k+7 - result
4759	st1	{ q7}, [r2], #16                     @ AES block 4k+7 - store result
4760	blt	.L256_enc_main_loop                               @ loop while r0 < r5 (flags from the .LOOP CONTROL cmp above)
4761
4762.L256_enc_prepretail:@ PREPRETAIL: same 14 AES rounds on q0-q3 and GHASH of q4-q7 as the main loop, but loads no plaintext and stores nothing; falls through into the tail
4763	aese	q1, v18.16b
4764	aesmc	q1, q1          @ AES block 4k+5 - round 0
4765	rev64	q6, q6                                    @ GHASH block 4k+2 (t0, t1, and t2 free)
4766
4767	aese	q2, v18.16b
4768	aesmc	q2, q2          @ AES block 4k+6 - round 0
4769	fmov	d3, r10                               @ CTR block 4k+3
4770
4771	aese	q0, v18.16b
4772	aesmc	q0, q0          @ AES block 4k+4 - round 0
4773	rev64	q4, q4                                    @ GHASH block 4k (only t0 is free)
4774
4775	fmov	v3.d[1], r9                               @ CTR block 4k+3
4776	ext	v11.16b, v11.16b, v11.16b, #8                     @ PRE 0
4777
4778	aese	q2, v19.16b
4779	aesmc	q2, q2          @ AES block 4k+6 - round 1
4780
4781	aese	q0, v19.16b
4782	aesmc	q0, q0          @ AES block 4k+4 - round 1
4783
4784	eor	q4, q4, v11.16b                           @ PRE 1
4785	rev64	q5, q5                                    @ GHASH block 4k+1 (t0 and t1 free)
4786
4787	aese	q2, v20.16b
4788	aesmc	q2, q2          @ AES block 4k+6 - round 2
4789
4790	aese	q3, v18.16b
4791	aesmc	q3, q3          @ AES block 4k+7 - round 0
4792	mov	d10, v17.d[1]                               @ GHASH block 4k - mid
4793
4794	aese	q1, v19.16b
4795	aesmc	q1, q1          @ AES block 4k+5 - round 1
4796
4797	pmull	v11.1q, q4, v15.1d                       @ GHASH block 4k - low
4798	mov	d8, v4.d[1]                                  @ GHASH block 4k - mid
4799
4800	pmull2	v9.1q, q4, v15.2d                       @ GHASH block 4k - high
4801
4802	aese	q2, v21.16b
4803	aesmc	q2, q2          @ AES block 4k+6 - round 3
4804
4805	aese	q1, v20.16b
4806	aesmc	q1, q1          @ AES block 4k+5 - round 2
4807	eor	q8, q8, q4                          @ GHASH block 4k - mid
4808
4809	aese	q0, v20.16b
4810	aesmc	q0, q0          @ AES block 4k+4 - round 2
4811
4812	aese	q3, v19.16b
4813	aesmc	q3, q3          @ AES block 4k+7 - round 1
4814
4815	aese	q1, v21.16b
4816	aesmc	q1, q1          @ AES block 4k+5 - round 3
4817
4818	pmull	v10.1q, q8, v10.1d                      @ GHASH block 4k - mid
4819
4820	pmull2	v4.1q, q5, v14.2d                          @ GHASH block 4k+1 - high
4821
4822	pmull	v8.1q, q5, v14.1d                          @ GHASH block 4k+1 - low
4823
4824	aese	q3, v20.16b
4825	aesmc	q3, q3          @ AES block 4k+7 - round 2
4826
4827	eor	q9, q9, q4                         @ GHASH block 4k+1 - high
4828	mov	d4, v5.d[1]                                  @ GHASH block 4k+1 - mid
4829
4830	aese	q0, v21.16b
4831	aesmc	q0, q0          @ AES block 4k+4 - round 3
4832	eor	v11.16b, v11.16b, q8                         @ GHASH block 4k+1 - low
4833
4834	aese	q3, v21.16b
4835	aesmc	q3, q3          @ AES block 4k+7 - round 3
4836
4837	eor	q4, q4, q5                          @ GHASH block 4k+1 - mid
4838	mov	d8, v6.d[1]                                  @ GHASH block 4k+2 - mid
4839
4840	aese	q0, v22.16b
4841	aesmc	q0, q0          @ AES block 4k+4 - round 4
4842	rev64	q7, q7                                    @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
4843
4844	aese	q3, v22.16b
4845	aesmc	q3, q3          @ AES block 4k+7 - round 4
4846
4847	pmull	v4.1q, q4, v17.1d                          @ GHASH block 4k+1 - mid
4848	eor	q8, q8, q6                          @ GHASH block 4k+2 - mid
4849	add	r12, r12, #1                            @ CTR block 4k+3
4850
4851	pmull	v5.1q, q6, v13.1d                          @ GHASH block 4k+2 - low
4852
4853	aese	q3, v23.16b
4854	aesmc	q3, q3          @ AES block 4k+7 - round 5
4855
4856	aese	q2, v22.16b
4857	aesmc	q2, q2          @ AES block 4k+6 - round 4
4858	eor	v10.16b, v10.16b, q4                         @ GHASH block 4k+1 - mid
4859
4860	pmull2	v4.1q, q6, v13.2d                          @ GHASH block 4k+2 - high
4861
4862	eor	v11.16b, v11.16b, q5                         @ GHASH block 4k+2 - low
4863	ins	v8.d[1], v8.d[0]                                @ GHASH block 4k+2 - mid
4864
4865	aese	q2, v23.16b
4866	aesmc	q2, q2          @ AES block 4k+6 - round 5
4867
4868	eor	q9, q9, q4                         @ GHASH block 4k+2 - high
4869	mov	d4, v7.d[1]                                  @ GHASH block 4k+3 - mid
4870
4871	aese	q1, v22.16b
4872	aesmc	q1, q1          @ AES block 4k+5 - round 4
4873
4874	pmull2	v8.1q, q8, v16.2d                          @ GHASH block 4k+2 - mid
4875
4876	eor	q4, q4, q7                          @ GHASH block 4k+3 - mid
4877
4878	pmull2	v5.1q, q7, v12.2d                          @ GHASH block 4k+3 - high
4879
4880	aese	q1, v23.16b
4881	aesmc	q1, q1          @ AES block 4k+5 - round 5
4882
4883	pmull	v4.1q, q4, v16.1d                          @ GHASH block 4k+3 - mid
4884	eor	v10.16b, v10.16b, q8                         @ GHASH block 4k+2 - mid
4885
4886	aese	q0, v23.16b
4887	aesmc	q0, q0          @ AES block 4k+4 - round 5
4888
4889	aese	q1, v24.16b
4890	aesmc	q1, q1          @ AES block 4k+5 - round 6
4891
4892	aese	q2, v24.16b
4893	aesmc	q2, q2          @ AES block 4k+6 - round 6
4894
4895	aese	q0, v24.16b
4896	aesmc	q0, q0          @ AES block 4k+4 - round 6
4897	movi	q8, #0xc2
4898
4899	aese	q3, v24.16b
4900	aesmc	q3, q3          @ AES block 4k+7 - round 6
4901
4902	aese	q1, v25.16b
4903	aesmc	q1, q1          @ AES block 4k+5 - round 7
4904	eor	q9, q9, q5                         @ GHASH block 4k+3 - high
4905
4906	aese	q0, v25.16b
4907	aesmc	q0, q0          @ AES block 4k+4 - round 7
4908
4909	aese	q3, v25.16b
4910	aesmc	q3, q3          @ AES block 4k+7 - round 7
4911	shl	d8, d8, #56               @ mod_constant
4912
4913	aese	q1, v26.16b
4914	aesmc	q1, q1          @ AES block 4k+5 - round 8
4915	eor	v10.16b, v10.16b, q4                         @ GHASH block 4k+3 - mid
4916
4917	pmull	v6.1q, q7, v12.1d                          @ GHASH block 4k+3 - low
4918
4919	aese	q3, v26.16b
4920	aesmc	q3, q3          @ AES block 4k+7 - round 8
4921
4922	aese	q1, v27.16b
4923	aesmc	q1, q1          @ AES block 4k+5 - round 9
4924
4925	aese	q0, v26.16b
4926	aesmc	q0, q0          @ AES block 4k+4 - round 8
4927	eor	v11.16b, v11.16b, q6                         @ GHASH block 4k+3 - low
4928
4929	aese	q3, v27.16b
4930	aesmc	q3, q3          @ AES block 4k+7 - round 9
4931
4932	eor	v10.16b, v10.16b, q9                         @ karatsuba tidy up
4933	@ the uncommented eor/pmull/ext lines from here to the end of this section reduce q9/v10/v11 into v11 (compare the MODULO steps in the main loop)
4934	pmull	v4.1q, q9, q8
4935	ext	q9, q9, q9, #8
4936
4937	aese	q3, v28.16b
4938	aesmc	q3, q3          @ AES block 4k+7 - round 10
4939
4940	aese	q2, v25.16b
4941	aesmc	q2, q2          @ AES block 4k+6 - round 7
4942	eor	v10.16b, v10.16b, v11.16b
4943
4944	aese	q1, v28.16b
4945	aesmc	q1, q1          @ AES block 4k+5 - round 10
4946
4947	aese	q0, v27.16b
4948	aesmc	q0, q0          @ AES block 4k+4 - round 9
4949
4950	aese	q2, v26.16b
4951	aesmc	q2, q2          @ AES block 4k+6 - round 8
4952
4953	aese	q1, v29.16b
4954	aesmc	q1, q1          @ AES block 4k+5 - round 11
4955	eor	v10.16b, v10.16b, q4
4956
4957	aese	q0, v28.16b
4958	aesmc	q0, q0          @ AES block 4k+4 - round 10
4959
4960	aese	q2, v27.16b
4961	aesmc	q2, q2          @ AES block 4k+6 - round 9
4962
4963	aese	q1, v30.16b
4964	aesmc	q1, q1          @ AES block 4k+5 - round 12
4965
4966	aese	q0, v29.16b
4967	aesmc	q0, q0          @ AES block 4k+4 - round 11
4968	eor	v10.16b, v10.16b, q9
4969
4970	aese	q3, v29.16b
4971	aesmc	q3, q3          @ AES block 4k+7 - round 11
4972
4973	aese	q2, v28.16b
4974	aesmc	q2, q2          @ AES block 4k+6 - round 10
4975
4976	aese	q0, v30.16b
4977	aesmc	q0, q0          @ AES block 4k+4 - round 12
4978
4979	pmull	v4.1q, v10.1d, q8
4980
4981	aese	q2, v29.16b
4982	aesmc	q2, q2          @ AES block 4k+6 - round 11
4983	ext	v10.16b, v10.16b, v10.16b, #8
4984
4985	aese	q3, v30.16b
4986	aesmc	q3, q3          @ AES block 4k+7 - round 12
4987
4988	aese	q1, v31.16b                                     @ AES block 4k+5 - round 13
4989	eor	v11.16b, v11.16b, q4
4990
4991	aese	q2, v30.16b
4992	aesmc	q2, q2          @ AES block 4k+6 - round 12
4993
4994	aese	q3, v31.16b                                     @ AES block 4k+7 - round 13
4995
4996	aese	q0, v31.16b                                     @ AES block 4k+4 - round 13
4997
4998	aese	q2, v31.16b                                     @ AES block 4k+6 - round 13
4999	eor	v11.16b, v11.16b, v10.16b                         @ final fold into low: v11 is the hash value the tail reads next
5000.L256_enc_tail:@ TAIL: encrypt the first remaining block with q0, then dispatch on how many bytes are left
5001	@ reached by branch or by fall-through from the prepretail; q0-q3 hold finished AES states, v11 the running hash
5002	ext	q8, v11.16b, v11.16b, #8                     @ prepare final partial tag
5003	sub	r5, r4, r0   @ main_end_input_ptr is number of bytes left to process
5004	ldp	r6, r7, [r0], #16           @ AES block 4k+4 - load plaintext
5005#ifdef __ARMEB__
5006	rev	r6, r6
5007	rev	r7, r7
5008#endif
5009	eor	r6, r6, r13                     @ AES block 4k+4 - round 14 low
5010	eor	r7, r7, r14                     @ AES block 4k+4 - round 14 high
5011
5012	cmp	r5, #48
5013	fmov	d4, r6                               @ AES block 4k+4 - mov low
5014
5015	fmov	v4.d[1], r7                           @ AES block 4k+4 - mov high
5016
5017	eor	q5, q4, q0                          @ AES block 4k+4 - result
5018	bgt	.L256_enc_blocks_more_than_3
5019	@ <= 48 bytes left: zero the GHASH accumulators (v11, q9, v10), since the labels entered from here accumulate with eor and the old hash is carried in q8
5020	cmp	r5, #32
5021	mov	q3, q2
5022	movi	v11.8b, #0
5023
5024	movi	q9, #0
5025	sub	r12, r12, #1
5026	@ r12 drops by one here and again on each later fall-through - NOTE(review): presumably one per counter block whose keystream goes unused; confirm where r12 is written back
5027	mov	q2, q1
5028	movi	v10.8b, #0
5029	bgt	.L256_enc_blocks_more_than_2
5030	@ <= 32 bytes left; q2 was set to q1 above because .L256_enc_blocks_more_than_2 takes its keystream from q2
5031	mov	q3, q1
5032	sub	r12, r12, #1
5033	cmp	r5, #16
5034
5035	bgt	.L256_enc_blocks_more_than_1
5036	@ <= 16 bytes left
5037	sub	r12, r12, #1
5038	b	.L256_enc_blocks_less_than_1
5039.L256_enc_blocks_more_than_3:@ blocks left >  3
5040	st1	{ q5}, [r2], #16                    @ AES final-3 block  - store result
5041	@ the stored block is then hashed: byte-reversed, XORed with the pending hash in q8, and multiplied by v15 into freshly assigned low/high/mid accumulators (v11, v9, v10)
5042	ldp	r6, r7, [r0], #16          @ AES final-2 block - load input low & high
5043#ifdef __ARMEB__
5044	rev	r6, r6
5045	rev	r7, r7
5046#endif
5047	rev64	q4, q5                                   @ GHASH final-3 block
5048
5049	eor	r6, r6, r13                    @ AES final-2 block - round 14 low
5050	eor	q4, q4, q8                          @ feed in partial tag
5051
5052	eor	r7, r7, r14                    @ AES final-2 block - round 14 high
5053
5054	mov	d22, v4.d[1]                                @ GHASH final-3 block - mid
5055	fmov	d5, r6                                @ AES final-2 block - mov low
5056
5057	fmov	v5.d[1], r7                            @ AES final-2 block - mov high
5058
5059	eor	v22.8b, v22.8b, q4                     @ GHASH final-3 block - mid
5060	movi	q8, #0                                       @ suppress further partial tag feed in
5061
5062	mov	d10, v17.d[1]                              @ GHASH final-3 block - mid
5063
5064	pmull	v11.1q, q4, v15.1d                      @ GHASH final-3 block - low
5065
5066	pmull2	v9.1q, q4, v15.2d                      @ GHASH final-3 block - high
5067
5068	pmull	v10.1q, v22.1d, v10.1d                   @ GHASH final-3 block - mid
5069	eor	q5, q5, q1                           @ AES final-2 block - result
5070.L256_enc_blocks_more_than_2:@ blocks left >  2
5071	@ entered by fall-through or directly from the tail; q5 is the output block to store and hash, q2 the keystream for the next block
5072	st1	{ q5}, [r2], #16                    @ AES final-2 block - store result
5073
5074	ldp	r6, r7, [r0], #16          @ AES final-1 block - load input low & high
5075#ifdef __ARMEB__
5076	rev	r6, r6
5077	rev	r7, r7
5078#endif
5079	rev64	q4, q5                                   @ GHASH final-2 block
5080
5081	eor	r6, r6, r13                    @ AES final-1 block - round 14 low
5082	eor	q4, q4, q8                          @ feed in partial tag
5083
5084	fmov	d5, r6                                @ AES final-1 block - mov low
5085	eor	r7, r7, r14                    @ AES final-1 block - round 14 high
5086
5087	fmov	v5.d[1], r7                            @ AES final-1 block - mov high
5088
5089	movi	q8, #0                                       @ suppress further partial tag feed in
5090
5091	pmull2	v20.1q, q4, v14.2d                         @ GHASH final-2 block - high
5092	mov	d22, v4.d[1]                                @ GHASH final-2 block - mid
5093
5094	pmull	v21.1q, q4, v14.1d                         @ GHASH final-2 block - low
5095
5096	eor	v22.8b, v22.8b, q4                     @ GHASH final-2 block - mid
5097
5098	eor	q5, q5, q2                           @ AES final-1 block - result
5099	@ the three products are XORed into q9/v11/v10 rather than assigned, so those accumulators must already be valid (zeroed by the tail, or set by the final-3 block)
5100	eor	q9, q9, v20.16b                           @ GHASH final-2 block - high
5101
5102	pmull	v22.1q, v22.1d, v17.1d                     @ GHASH final-2 block - mid
5103
5104	eor	v11.16b, v11.16b, v21.16b                           @ GHASH final-2 block - low
5105
5106	eor	v10.16b, v10.16b, v22.16b                      @ GHASH final-2 block - mid
5107.L256_enc_blocks_more_than_1:@ blocks left >  1: store the final-1 result in q5, accumulate its GHASH products into v9/v11/v10, then build the final block result in q5
5108
5109	st1	{ q5}, [r2], #16                    @ AES final-1 block - store result, output pointer r2 += 16
5110
5111	rev64	q4, q5                                   @ GHASH final-1 block - byte-reverse each 64b half of the block just stored
5112
5113	ldp	r6, r7, [r0], #16          @ AES final block - load input low & high, input pointer r0 += 16
5114#ifdef __ARMEB__
5115	rev	r6, r6                             @ big-endian build only: byte-swap the loaded low half
5116	rev	r7, r7                             @ big-endian build only: byte-swap the loaded high half
5117#endif
5118	eor	q4, q4, q8                          @ feed in partial tag (q8 is zero if an earlier stage already consumed it)
5119
5120	movi	q8, #0                                       @ suppress further partial tag feed in
5121
5122	eor	r6, r6, r13                    @ AES final block - round 14 low
5123	mov	d22, v4.d[1]                                @ GHASH final-1 block - mid (start from the upper 64b of the hashed block)
5124
5125	pmull2	v20.1q, q4, v13.2d                         @ GHASH final-1 block - high
5126	eor	r7, r7, r14                    @ AES final block - round 14 high
5127
5128	eor	v22.8b, v22.8b, q4                     @ GHASH final-1 block - mid (upper 64b ^ lower 64b)
5129
5130	eor	q9, q9, v20.16b                           @ GHASH final-1 block - high (accumulate)
5131
5132	ins	v22.d[1], v22.d[0]                           @ GHASH final-1 block - mid (copy into the upper lane so pmull2 below can use it)
5133	fmov	d5, r6                                @ AES final block - mov low
5134
5135	fmov	v5.d[1], r7                            @ AES final block - mov high
5136
5137	pmull2	v22.1q, v22.2d, v16.2d                     @ GHASH final-1 block - mid (multiplier is the upper 64b of v16)
5138
5139	pmull	v21.1q, q4, v13.1d                         @ GHASH final-1 block - low
5140
5141	eor	q5, q5, q3                           @ AES final block - result (q5 ^ q3)
5142	eor	v10.16b, v10.16b, v22.16b                      @ GHASH final-1 block - mid (accumulate)
5143
5144	eor	v11.16b, v11.16b, v21.16b                           @ GHASH final-1 block - low (accumulate)
5145.L256_enc_blocks_less_than_1:@ blocks left <= 1: mask the possibly partial last block, GHASH it, reduce the sums, store block/counter/tag, restore registers and return
5146
5147	and	r1, r1, #127                   @ bit_length %= 128
5148
5149	mvn	r13, xzr                                     @ rk14_l = 0xffffffffffffffff (r13 is reused as an all-ones mask from here on)
5150	sub	r1, r1, #128                   @ bit_length -= 128
5151
5152	neg	r1, r1                         @ bit_length = 128 - #bits in input (in range [1,128])
5153	ld1	{ v18.16b}, [r2]                           @ load existing bytes where the possibly partial last block is to be stored
5154
5155	mvn	r14, xzr                                     @ rk14_h = 0xffffffffffffffff (r14 is reused as a mask from here on)
5156	and	r1, r1, #127                   @ bit_length %= 128 (r1 = number of top bits to discard, 0 for a full block)
5157
5158	lsr	r14, r14, r1                    @ rk14_h is mask for top 64b of last block
5159	cmp	r1, #64                         @ flags select which 64b half receives the shifted mask
5160
5161	csel	r6, r13, r14, lt               @ low mask: all ones if r1 < 64, otherwise the shifted mask
5162	csel	r7, r14, xzr, lt               @ high mask: the shifted mask if r1 < 64, otherwise zero
5163
5164	fmov	d0, r6                                @ ctr0b is mask for last block
5165
5166	fmov	v0.d[1], r7                            @ upper 64b of the mask
5167
5168	and	q5, q5, q0                           @ possibly partial last block has zeroes in highest bits
5169
5170	rev64	q4, q5                                   @ GHASH final block
5171
5172	eor	q4, q4, q8                          @ feed in partial tag
5173
5174	bif	q5, v18.16b, q0                             @ insert existing bytes in top end of result before storing
5175
5176	pmull2	v20.1q, q4, v12.2d                         @ GHASH final block - high
5177	mov	d8, v4.d[1]                                 @ GHASH final block - mid
5178#ifndef __ARMEB__
5179	rev	r9, r12                                @ little-endian build: byte-swap the counter in r12 into r9 for the store below
5180#else
5181	mov	r9, r12                                @ big-endian build: copy the counter in r12 to r9 unchanged
5182#endif
5183
5184	pmull	v21.1q, q4, v12.1d                         @ GHASH final block - low
5185
5186	eor	q9, q9, v20.16b                           @ GHASH final block - high
5187	eor	q8, q8, q4                         @ GHASH final block - mid
5188
5189	pmull	v8.1q, q8, v16.1d                         @ GHASH final block - mid
5190
5191	eor	v11.16b, v11.16b, v21.16b                           @ GHASH final block - low
5192
5193	eor	v10.16b, v10.16b, q8                        @ GHASH final block - mid
5194	movi	q8, #0xc2                                    @ seed for mod_constant (shifted into place by the shl below)
5195
5196	eor	q4, v11.16b, q9                        @ MODULO - karatsuba tidy up
5197
5198	shl	d8, d8, #56              @ mod_constant
5199
5200	eor	v10.16b, v10.16b, q4                        @ MODULO - karatsuba tidy up
5201
5202	pmull	v7.1q, q9, q8           @ MODULO - top 64b align with mid
5203
5204	ext	q9, q9, q9, #8                    @ MODULO - other top alignment
5205
5206	eor	v10.16b, v10.16b, q7                     @ MODULO - fold into mid
5207
5208	eor	v10.16b, v10.16b, q9                        @ MODULO - fold into mid
5209
5210	pmull	v9.1q, v10.1d, q8           @ MODULO - mid 64b align with low
5211
5212	ext	v10.16b, v10.16b, v10.16b, #8                    @ MODULO - other mid alignment
5213
5214	str	r9, [r16, #12]                         @ store the updated counter
5215
5216	st1	{ q5}, [r2]                         @ store all 16B
5217	eor	v11.16b, v11.16b, q9                        @ MODULO - fold into low
5218
5219	eor	v11.16b, v11.16b, v10.16b                        @ MODULO - fold into low
5220	ext	v11.16b, v11.16b, v11.16b, #8                    @ swap the two 64b halves of the reduced value
5221	rev64	v11.16b, v11.16b                         @ byte-reverse each 64b half
5222	mov	r0, r15                                @ return value = r15 (presumably the byte length saved at entry; entry is outside this view - TODO confirm)
5223	st1	{ v11.16b }, [r3]                          @ write the reduced value back to [r3]
5224
5225	ldp	r21, r22, [sp, #16]                    @ reload r21/r22 from the stack frame
5226	ldp	r23, r24, [sp, #32]                    @ reload r23/r24
5227	ldp	d8, d9, [sp, #48]                      @ reload d8/d9
5228	ldp	d10, d11, [sp, #64]                    @ reload d10/d11
5229	ldp	d12, d13, [sp, #80]                    @ reload d12/d13
5230	ldp	d14, d15, [sp, #96]                    @ reload d14/d15
5231	ldp	r19, r20, [sp], #112                   @ reload r19/r20 and release the 112-byte frame
5232	RET
5233
5234.L256_enc_ret:@ early exit that returns 0 without touching the stack; presumably the target of an entry length check outside this view - TODO confirm
5235	mov	r0, #0x0                               @ return value = 0
5236	RET
5237.size	aes_gcm_enc_256_kernel,.-aes_gcm_enc_256_kernel
5238.globl	aes_gcm_dec_256_kernel
5239.type	aes_gcm_dec_256_kernel,%function
5240.align	4
5241aes_gcm_dec_256_kernel:
5242	cbz	r1, .L256_dec_ret
5243	stp	r19, r20, [sp, #-112]!
5244	mov	r16, r4
5245	mov	r8, r5
5246	stp	r21, r22, [sp, #16]
5247	stp	r23, r24, [sp, #32]
5248	stp	d8, d9, [sp, #48]
5249	stp	d10, d11, [sp, #64]
5250	stp	d12, d13, [sp, #80]
5251	stp	d14, d15, [sp, #96]
5252
5253	lsr	r5, r1, #3              @ byte_len
5254	mov	r15, r5
5255	ldp	r10, r11, [r16]              @ ctr96_b64, ctr96_t32
5256#ifdef __ARMEB__
5257	rev	r10, r10
5258	rev	r11, r11
5259#endif
5260	ldp	r13, r14, [r8, #224]                     @ load rk14
5261#ifdef __ARMEB__
5262	ror	r14, r14, #32
5263	ror	r13, r13, #32
5264#endif
5265	ld1	{v18.4s}, [r8], #16                               @ load rk0
5266	sub	r5, r5, #1      @ byte_len - 1
5267
5268	ld1	{v19.4s}, [r8], #16                               @ load rk1
5269	and	r5, r5, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
5270
5271	add	r4, r0, r1, lsr #3   @ end_input_ptr
5272	ld1	{v20.4s}, [r8], #16                               @ load rk2
5273
5274	lsr	r12, r11, #32
5275	ld1	{v21.4s}, [r8], #16                               @ load rk3
5276	orr	r11, r11, r11
5277
5278	ld1	{v22.4s}, [r8], #16                               @ load rk4
5279	add	r5, r5, r0
5280	rev	r12, r12                                @ rev_ctr32
5281
5282	add	r12, r12, #1                            @ increment rev_ctr32
5283	fmov	d3, r10                               @ CTR block 3
5284
5285	rev	r9, r12                                 @ CTR block 1
5286	add	r12, r12, #1                            @ CTR block 1
5287	fmov	d1, r10                               @ CTR block 1
5288
5289	orr	r9, r11, r9, lsl #32            @ CTR block 1
5290	ld1	{ q0}, [r16]                             @ special case vector load initial counter so we can start first AES block as quickly as possible
5291
5292	fmov	v1.d[1], r9                               @ CTR block 1
5293	rev	r9, r12                                 @ CTR block 2
5294	add	r12, r12, #1                            @ CTR block 2
5295
5296	fmov	d2, r10                               @ CTR block 2
5297	orr	r9, r11, r9, lsl #32            @ CTR block 2
5298
5299	fmov	v2.d[1], r9                               @ CTR block 2
5300	rev	r9, r12                                 @ CTR block 3
5301
5302	orr	r9, r11, r9, lsl #32            @ CTR block 3
5303	ld1	{v23.4s}, [r8], #16                               @ load rk5
5304
5305	fmov	v3.d[1], r9                               @ CTR block 3
5306	add	r12, r12, #1                            @ CTR block 3
5307
5308	ld1	{v24.4s}, [r8], #16                               @ load rk6
5309
5310	ld1	{v25.4s}, [r8], #16                               @ load rk7
5311
5312	ld1	{v26.4s}, [r8], #16                               @ load rk8
5313
5314	aese	q0, v18.16b
5315	aesmc	q0, q0          @ AES block 0 - round 0
5316	ldr	q14, [r3, #80]                         @ load h3l | h3h
5317#ifndef __ARMEB__
5318	ext	v14.16b, v14.16b, v14.16b, #8
5319#endif
5320
5321	aese	q3, v18.16b
5322	aesmc	q3, q3          @ AES block 3 - round 0
5323	ldr	q15, [r3, #112]                        @ load h4l | h4h
5324#ifndef __ARMEB__
5325	ext	v15.16b, v15.16b, v15.16b, #8
5326#endif
5327
5328	aese	q1, v18.16b
5329	aesmc	q1, q1          @ AES block 1 - round 0
5330	ldr	q13, [r3, #64]                         @ load h2l | h2h
5331#ifndef __ARMEB__
5332	ext	v13.16b, v13.16b, v13.16b, #8
5333#endif
5334
5335	aese	q2, v18.16b
5336	aesmc	q2, q2          @ AES block 2 - round 0
5337	ld1	{v27.4s}, [r8], #16                                 @ load rk9
5338
5339	aese	q0, v19.16b
5340	aesmc	q0, q0          @ AES block 0 - round 1
5341
5342	aese	q1, v19.16b
5343	aesmc	q1, q1          @ AES block 1 - round 1
5344	ld1	{ v11.16b}, [r3]
5345	ext	v11.16b, v11.16b, v11.16b, #8
5346	rev64	v11.16b, v11.16b
5347
5348	aese	q2, v19.16b
5349	aesmc	q2, q2          @ AES block 2 - round 1
5350	ld1	{v28.4s}, [r8], #16                              @ load rk10
5351
5352	aese	q3, v19.16b
5353	aesmc	q3, q3          @ AES block 3 - round 1
5354	ld1	{v29.4s}, [r8], #16                              @ load rk11
5355
5356	aese	q0, v20.16b
5357	aesmc	q0, q0          @ AES block 0 - round 2
5358	ldr	q12, [r3, #32]                         @ load h1l | h1h
5359#ifndef __ARMEB__
5360	ext	v12.16b, v12.16b, v12.16b, #8
5361#endif
5362	aese	q2, v20.16b
5363	aesmc	q2, q2          @ AES block 2 - round 2
5364	ld1	{v30.4s}, [r8], #16                              @ load rk12
5365
5366	aese	q3, v20.16b
5367	aesmc	q3, q3          @ AES block 3 - round 2
5368
5369	aese	q0, v21.16b
5370	aesmc	q0, q0          @ AES block 0 - round 3
5371
5372	aese	q1, v20.16b
5373	aesmc	q1, q1          @ AES block 1 - round 2
5374
5375	aese	q3, v21.16b
5376	aesmc	q3, q3          @ AES block 3 - round 3
5377
5378	aese	q0, v22.16b
5379	aesmc	q0, q0          @ AES block 0 - round 4
5380	cmp	r0, r5                   @ check if we have <= 4 blocks
5381
5382	aese	q2, v21.16b
5383	aesmc	q2, q2          @ AES block 2 - round 3
5384
5385	aese	q1, v21.16b
5386	aesmc	q1, q1          @ AES block 1 - round 3
5387
5388	aese	q3, v22.16b
5389	aesmc	q3, q3          @ AES block 3 - round 4
5390
5391	aese	q2, v22.16b
5392	aesmc	q2, q2          @ AES block 2 - round 4
5393
5394	aese	q1, v22.16b
5395	aesmc	q1, q1          @ AES block 1 - round 4
5396
5397	aese	q3, v23.16b
5398	aesmc	q3, q3          @ AES block 3 - round 5
5399
5400	aese	q0, v23.16b
5401	aesmc	q0, q0          @ AES block 0 - round 5
5402
5403	aese	q1, v23.16b
5404	aesmc	q1, q1          @ AES block 1 - round 5
5405
5406	aese	q2, v23.16b
5407	aesmc	q2, q2          @ AES block 2 - round 5
5408
5409	aese	q0, v24.16b
5410	aesmc	q0, q0          @ AES block 0 - round 6
5411
5412	aese	q3, v24.16b
5413	aesmc	q3, q3          @ AES block 3 - round 6
5414
5415	aese	q1, v24.16b
5416	aesmc	q1, q1          @ AES block 1 - round 6
5417
5418	aese	q2, v24.16b
5419	aesmc	q2, q2          @ AES block 2 - round 6
5420
5421	aese	q0, v25.16b
5422	aesmc	q0, q0          @ AES block 0 - round 7
5423
5424	aese	q1, v25.16b
5425	aesmc	q1, q1          @ AES block 1 - round 7
5426
5427	aese	q3, v25.16b
5428	aesmc	q3, q3          @ AES block 3 - round 7
5429
5430	aese	q0, v26.16b
5431	aesmc	q0, q0          @ AES block 0 - round 8
5432
5433	aese	q2, v25.16b
5434	aesmc	q2, q2          @ AES block 2 - round 7
5435
5436	aese	q3, v26.16b
5437	aesmc	q3, q3          @ AES block 3 - round 8
5438
5439	aese	q1, v26.16b
5440	aesmc	q1, q1          @ AES block 1 - round 8
5441
5442	aese	q0, v27.16b
5443	aesmc	q0, q0          @ AES block 0 - round 9
5444
5445	aese	q2, v26.16b
5446	aesmc	q2, q2          @ AES block 2 - round 8
5447	ld1	{v31.4s}, [r8], #16                             @ load rk13
5448
5449	aese	q1, v27.16b
5450	aesmc	q1, q1          @ AES block 1 - round 9
5451
5452	aese	q0, v28.16b
5453	aesmc	q0, q0          @ AES block 0 - round 10
5454
5455	aese	q3, v27.16b
5456	aesmc	q3, q3          @ AES block 3 - round 9
5457
5458	aese	q1, v28.16b
5459	aesmc	q1, q1          @ AES block 1 - round 10
5460
5461	aese	q2, v27.16b
5462	aesmc	q2, q2          @ AES block 2 - round 9
5463
5464	aese	q3, v28.16b
5465	aesmc	q3, q3          @ AES block 3 - round 10
5466
5467	aese	q0, v29.16b
5468	aesmc	q0, q0          @ AES block 0 - round 11
5469
5470	aese	q2, v28.16b
5471	aesmc	q2, q2          @ AES block 2 - round 10
5472
5473	aese	q3, v29.16b
5474	aesmc	q3, q3          @ AES block 3 - round 11
5475
5476	aese	q1, v29.16b
5477	aesmc	q1, q1          @ AES block 1 - round 11
5478
5479	aese	q2, v29.16b
5480	aesmc	q2, q2          @ AES block 2 - round 11
5481
5482	trn1	q9, v14.2d,    v15.2d                      @ h4h | h3h
5483
5484	trn2	v17.2d,  v14.2d,    v15.2d                      @ h4l | h3l
5485
5486	trn1	q8,    v12.2d,    v13.2d                      @ h2h | h1h
5487	trn2	v16.2d,  v12.2d,    v13.2d                      @ h2l | h1l
5488
5489	aese	q1, v30.16b
5490	aesmc	q1, q1          @ AES block 1 - round 12
5491
5492	aese	q0, v30.16b
5493	aesmc	q0, q0          @ AES block 0 - round 12
5494
5495	aese	q2, v30.16b
5496	aesmc	q2, q2          @ AES block 2 - round 12
5497
5498	aese	q3, v30.16b
5499	aesmc	q3, q3          @ AES block 3 - round 12
5500	eor	v17.16b, v17.16b, q9                  @ h4k | h3k
5501
5502	aese	q1, v31.16b                                     @ AES block 1 - round 13
5503
5504	aese	q2, v31.16b                                     @ AES block 2 - round 13
5505	eor	v16.16b, v16.16b, q8                     @ h2k | h1k
5506
5507	aese	q3, v31.16b                                     @ AES block 3 - round 13
5508
5509	aese	q0, v31.16b                                     @ AES block 0 - round 13
5510	bge	.L256_dec_tail                                    @ handle tail
5511
5512	ld1	{q4, q5}, [r0], #32               @ AES block 0,1 - load ciphertext
5513
5514	rev	r9, r12                                 @ CTR block 4
5515
5516	eor	q0, q4, q0                            @ AES block 0 - result
5517
5518	eor	q1, q5, q1                            @ AES block 1 - result
5519	rev64	q5, q5                                    @ GHASH block 1
5520	ld1	{q6}, [r0], #16                       @ AES block 2 - load ciphertext
5521
5522	mov	r7, v0.d[1]                            @ AES block 0 - mov high
5523
5524	mov	r6, v0.d[0]                            @ AES block 0 - mov low
5525	rev64	q4, q4                                    @ GHASH block 0
5526	add	r12, r12, #1                            @ CTR block 4
5527
5528	fmov	d0, r10                               @ CTR block 4
5529	orr	r9, r11, r9, lsl #32            @ CTR block 4
5530
5531	fmov	v0.d[1], r9                               @ CTR block 4
5532	rev	r9, r12                                 @ CTR block 5
5533	add	r12, r12, #1                            @ CTR block 5
5534
5535	mov	r19, v1.d[0]                            @ AES block 1 - mov low
5536
5537	orr	r9, r11, r9, lsl #32            @ CTR block 5
5538	mov	r20, v1.d[1]                            @ AES block 1 - mov high
5539	eor	r7, r7, r14                   @ AES block 0 - round 14 high
5540#ifdef __ARMEB__
5541	rev	r7, r7
5542#endif
5543	eor	r6, r6, r13                   @ AES block 0 - round 14 low
5544#ifdef __ARMEB__
5545	rev	r6, r6
5546#endif
5547	stp	r6, r7, [r2], #16        @ AES block 0 - store result
5548	fmov	d1, r10                               @ CTR block 5
5549
5550	ld1	{q7}, [r0], #16                       @ AES block 3 - load ciphertext
5551
5552	fmov	v1.d[1], r9                               @ CTR block 5
5553	rev	r9, r12                                 @ CTR block 6
5554	add	r12, r12, #1                            @ CTR block 6
5555
5556	eor	r19, r19, r13                   @ AES block 1 - round 14 low
5557#ifdef __ARMEB__
5558	rev	r19, r19
5559#endif
5560	orr	r9, r11, r9, lsl #32            @ CTR block 6
5561
5562	eor	r20, r20, r14                   @ AES block 1 - round 14 high
5563#ifdef __ARMEB__
5564	rev	r20, r20
5565#endif
5566	stp	r19, r20, [r2], #16        @ AES block 1 - store result
5567
5568	eor	q2, q6, q2                            @ AES block 2 - result
5569	cmp	r0, r5                   @ check if we have <= 8 blocks
5570	bge	.L256_dec_prepretail                              @ do prepretail
5571
5572.L256_dec_main_loop:@ main loop start
5573	mov	r21, v2.d[0]                            @ AES block 4k+2 - mov low
5574	ext	v11.16b, v11.16b, v11.16b, #8                     @ PRE 0
5575	eor	q3, q7, q3                            @ AES block 4k+3 - result
5576
5577	aese	q0, v18.16b
5578	aesmc	q0, q0          @ AES block 4k+4 - round 0
5579	mov	r22, v2.d[1]                            @ AES block 4k+2 - mov high
5580
5581	aese	q1, v18.16b
5582	aesmc	q1, q1          @ AES block 4k+5 - round 0
5583	fmov	d2, r10                               @ CTR block 4k+6
5584
5585	fmov	v2.d[1], r9                               @ CTR block 4k+6
5586	eor	q4, q4, v11.16b                           @ PRE 1
5587	rev	r9, r12                                 @ CTR block 4k+7
5588
5589	aese	q0, v19.16b
5590	aesmc	q0, q0          @ AES block 4k+4 - round 1
5591	mov	r24, v3.d[1]                            @ AES block 4k+3 - mov high
5592
5593	aese	q1, v19.16b
5594	aesmc	q1, q1          @ AES block 4k+5 - round 1
5595	mov	r23, v3.d[0]                            @ AES block 4k+3 - mov low
5596
5597	pmull2	v9.1q, q4, v15.2d                       @ GHASH block 4k - high
5598	mov	d8, v4.d[1]                                  @ GHASH block 4k - mid
5599	fmov	d3, r10                               @ CTR block 4k+7
5600
5601	aese	q0, v20.16b
5602	aesmc	q0, q0          @ AES block 4k+4 - round 2
5603	orr	r9, r11, r9, lsl #32            @ CTR block 4k+7
5604
5605	aese	q2, v18.16b
5606	aesmc	q2, q2          @ AES block 4k+6 - round 0
5607	fmov	v3.d[1], r9                               @ CTR block 4k+7
5608
5609	aese	q1, v20.16b
5610	aesmc	q1, q1          @ AES block 4k+5 - round 2
5611	eor	q8, q8, q4                          @ GHASH block 4k - mid
5612
5613	aese	q0, v21.16b
5614	aesmc	q0, q0          @ AES block 4k+4 - round 3
5615	eor	r22, r22, r14                   @ AES block 4k+2 - round 14 high
5616#ifdef __ARMEB__
5617	rev	r22, r22
5618#endif
5619	aese	q2, v19.16b
5620	aesmc	q2, q2          @ AES block 4k+6 - round 1
5621	mov	d10, v17.d[1]                               @ GHASH block 4k - mid
5622
5623	aese	q1, v21.16b
5624	aesmc	q1, q1          @ AES block 4k+5 - round 3
5625	rev64	q6, q6                                    @ GHASH block 4k+2
5626
5627	aese	q3, v18.16b
5628	aesmc	q3, q3          @ AES block 4k+7 - round 0
5629	eor	r21, r21, r13                   @ AES block 4k+2 - round 14 low
5630#ifdef __ARMEB__
5631	rev	r21, r21
5632#endif
5633	aese	q2, v20.16b
5634	aesmc	q2, q2          @ AES block 4k+6 - round 2
5635	stp	r21, r22, [r2], #16        @ AES block 4k+2 - store result
5636
5637	pmull	v11.1q, q4, v15.1d                       @ GHASH block 4k - low
5638
5639	pmull2	v4.1q, q5, v14.2d                          @ GHASH block 4k+1 - high
5640
5641	aese	q2, v21.16b
5642	aesmc	q2, q2          @ AES block 4k+6 - round 3
5643	rev64	q7, q7                                    @ GHASH block 4k+3
5644
5645	pmull	v10.1q, q8, v10.1d                      @ GHASH block 4k - mid
5646	eor	r23, r23, r13                   @ AES block 4k+3 - round 14 low
5647#ifdef __ARMEB__
5648	rev	r23, r23
5649#endif
5650	pmull	v8.1q, q5, v14.1d                          @ GHASH block 4k+1 - low
5651	eor	r24, r24, r14                   @ AES block 4k+3 - round 14 high
5652#ifdef __ARMEB__
5653	rev	r24, r24
5654#endif
5655	eor	q9, q9, q4                         @ GHASH block 4k+1 - high
5656
5657	aese	q2, v22.16b
5658	aesmc	q2, q2          @ AES block 4k+6 - round 4
5659
5660	aese	q3, v19.16b
5661	aesmc	q3, q3          @ AES block 4k+7 - round 1
5662	mov	d4, v5.d[1]                                  @ GHASH block 4k+1 - mid
5663
5664	aese	q0, v22.16b
5665	aesmc	q0, q0          @ AES block 4k+4 - round 4
5666	eor	v11.16b, v11.16b, q8                         @ GHASH block 4k+1 - low
5667
5668	aese	q2, v23.16b
5669	aesmc	q2, q2          @ AES block 4k+6 - round 5
5670	add	r12, r12, #1                            @ CTR block 4k+7
5671
5672	aese	q3, v20.16b
5673	aesmc	q3, q3          @ AES block 4k+7 - round 2
5674	mov	d8, v6.d[1]                                  @ GHASH block 4k+2 - mid
5675
5676	aese	q1, v22.16b
5677	aesmc	q1, q1          @ AES block 4k+5 - round 4
5678	eor	q4, q4, q5                          @ GHASH block 4k+1 - mid
5679
5680	pmull	v5.1q, q6, v13.1d                          @ GHASH block 4k+2 - low
5681
5682	aese	q3, v21.16b
5683	aesmc	q3, q3          @ AES block 4k+7 - round 3
5684	eor	q8, q8, q6                          @ GHASH block 4k+2 - mid
5685
5686	aese	q1, v23.16b
5687	aesmc	q1, q1          @ AES block 4k+5 - round 5
5688
5689	aese	q0, v23.16b
5690	aesmc	q0, q0          @ AES block 4k+4 - round 5
5691	eor	v11.16b, v11.16b, q5                         @ GHASH block 4k+2 - low
5692
5693	pmull	v4.1q, q4, v17.1d                          @ GHASH block 4k+1 - mid
5694	rev	r9, r12                                 @ CTR block 4k+8
5695
5696	aese	q1, v24.16b
5697	aesmc	q1, q1          @ AES block 4k+5 - round 6
5698	ins	v8.d[1], v8.d[0]                                @ GHASH block 4k+2 - mid
5699
5700	aese	q0, v24.16b
5701	aesmc	q0, q0          @ AES block 4k+4 - round 6
5702	add	r12, r12, #1                            @ CTR block 4k+8
5703
5704	aese	q3, v22.16b
5705	aesmc	q3, q3          @ AES block 4k+7 - round 4
5706
5707	aese	q1, v25.16b
5708	aesmc	q1, q1          @ AES block 4k+5 - round 7
5709	eor	v10.16b, v10.16b, q4                         @ GHASH block 4k+1 - mid
5710
5711	aese	q0, v25.16b
5712	aesmc	q0, q0          @ AES block 4k+4 - round 7
5713
5714	pmull2	v4.1q, q6, v13.2d                          @ GHASH block 4k+2 - high
5715	mov	d6, v7.d[1]                                  @ GHASH block 4k+3 - mid
5716
5717	aese	q3, v23.16b
5718	aesmc	q3, q3          @ AES block 4k+7 - round 5
5719
5720	pmull2	v8.1q, q8, v16.2d                          @ GHASH block 4k+2 - mid
5721
5722	aese	q0, v26.16b
5723	aesmc	q0, q0          @ AES block 4k+4 - round 8
5724	eor	q9, q9, q4                         @ GHASH block 4k+2 - high
5725
5726	aese	q3, v24.16b
5727	aesmc	q3, q3          @ AES block 4k+7 - round 6
5728
5729	pmull	v4.1q, q7, v12.1d                          @ GHASH block 4k+3 - low
5730	orr	r9, r11, r9, lsl #32            @ CTR block 4k+8
5731	eor	v10.16b, v10.16b, q8                         @ GHASH block 4k+2 - mid
5732
5733	pmull2	v5.1q, q7, v12.2d                          @ GHASH block 4k+3 - high
5734
5735	aese	q0, v27.16b
5736	aesmc	q0, q0          @ AES block 4k+4 - round 9
5737	eor	q6, q6, q7                          @ GHASH block 4k+3 - mid
5738
5739	aese	q1, v26.16b
5740	aesmc	q1, q1          @ AES block 4k+5 - round 8
5741
5742	aese	q2, v24.16b
5743	aesmc	q2, q2          @ AES block 4k+6 - round 6
5744	eor	q9, q9, q5                         @ GHASH block 4k+3 - high
5745
5746	aese	q0, v28.16b
5747	aesmc	q0, q0          @ AES block 4k+4 - round 10
5748
5749	pmull	v6.1q, q6, v16.1d                          @ GHASH block 4k+3 - mid
5750	movi	q8, #0xc2
5751
5752	aese	q2, v25.16b
5753	aesmc	q2, q2          @ AES block 4k+6 - round 7
5754	eor	v11.16b, v11.16b, q4                         @ GHASH block 4k+3 - low
5755
5756	aese	q0, v29.16b
5757	aesmc	q0, q0          @ AES block 4k+4 - round 11
5758
5759	aese	q3, v25.16b
5760	aesmc	q3, q3          @ AES block 4k+7 - round 7
5761	shl	d8, d8, #56               @ mod_constant
5762
5763	aese	q2, v26.16b
5764	aesmc	q2, q2          @ AES block 4k+6 - round 8
5765	eor	v10.16b, v10.16b, q6                         @ GHASH block 4k+3 - mid
5766
5767	aese	q0, v30.16b
5768	aesmc	q0, q0          @ AES block 4k+4 - round 12
5769
5770	pmull	v7.1q, q9, q8            @ MODULO - top 64b align with mid
5771	eor	q6, v11.16b, q9                         @ MODULO - karatsuba tidy up
5772
5773	aese	q1, v27.16b
5774	aesmc	q1, q1          @ AES block 4k+5 - round 9
5775	ld1	{q4}, [r0], #16                       @ AES block 4k+4 - load ciphertext
5776
5777	aese	q0, v31.16b                                     @ AES block 4k+4 - round 13
5778	ext	q9, q9, q9, #8                     @ MODULO - other top alignment
5779
5780	aese	q1, v28.16b
5781	aesmc	q1, q1          @ AES block 4k+5 - round 10
5782	eor	v10.16b, v10.16b, q6                         @ MODULO - karatsuba tidy up
5783
5784	aese	q2, v27.16b
5785	aesmc	q2, q2          @ AES block 4k+6 - round 9
5786	ld1	{q5}, [r0], #16                       @ AES block 4k+5 - load ciphertext
5787
5788	aese	q3, v26.16b
5789	aesmc	q3, q3          @ AES block 4k+7 - round 8
5790	eor	q0, q4, q0                            @ AES block 4k+4 - result
5791
5792	aese	q1, v29.16b
5793	aesmc	q1, q1          @ AES block 4k+5 - round 11
5794	stp	r23, r24, [r2], #16        @ AES block 4k+3 - store result
5795
5796	aese	q2, v28.16b
5797	aesmc	q2, q2          @ AES block 4k+6 - round 10
5798	eor	v10.16b, v10.16b, q7                      @ MODULO - fold into mid
5799
5800	aese	q3, v27.16b
5801	aesmc	q3, q3          @ AES block 4k+7 - round 9
5802	ld1	{q6}, [r0], #16                       @ AES block 4k+6 - load ciphertext
5803
5804	aese	q1, v30.16b
5805	aesmc	q1, q1          @ AES block 4k+5 - round 12
5806	ld1	{q7}, [r0], #16                       @ AES block 4k+7 - load ciphertext
5807
5808	aese	q2, v29.16b
5809	aesmc	q2, q2          @ AES block 4k+6 - round 11
5810	mov	r7, v0.d[1]                            @ AES block 4k+4 - mov high
5811
5812	aese	q3, v28.16b
5813	aesmc	q3, q3          @ AES block 4k+7 - round 10
5814	eor	v10.16b, v10.16b, q9                         @ MODULO - fold into mid
5815
5816	aese	q1, v31.16b                                     @ AES block 4k+5 - round 13
5817	mov	r6, v0.d[0]                            @ AES block 4k+4 - mov low
5818
5819	aese	q2, v30.16b
5820	aesmc	q2, q2          @ AES block 4k+6 - round 12
5821	fmov	d0, r10                               @ CTR block 4k+8
5822
5823	aese	q3, v29.16b
5824	aesmc	q3, q3          @ AES block 4k+7 - round 11
5825	fmov	v0.d[1], r9                               @ CTR block 4k+8
5826
5827	pmull	v8.1q, v10.1d, q8     @ MODULO - mid 64b align with low
5828	eor	q1, q5, q1                            @ AES block 4k+5 - result
5829	rev	r9, r12                                 @ CTR block 4k+9
5830
5831	aese	q2, v31.16b                                     @ AES block 4k+6 - round 13
5832	orr	r9, r11, r9, lsl #32            @ CTR block 4k+9
5833	cmp	r0, r5                   @ .LOOP CONTROL
5834
5835	add	r12, r12, #1                            @ CTR block 4k+9
5836
5837	eor	r6, r6, r13                   @ AES block 4k+4 - round 14 low
5838#ifdef __ARMEB__
5839	rev	r6, r6
5840#endif
5841	eor	r7, r7, r14                   @ AES block 4k+4 - round 14 high
5842#ifdef __ARMEB__
5843	rev	r7, r7
5844#endif
5845	mov	r20, v1.d[1]                            @ AES block 4k+5 - mov high
5846	eor	q2, q6, q2                            @ AES block 4k+6 - result
5847	eor	v11.16b, v11.16b, q8               @ MODULO - fold into low
5848
5849	aese	q3, v30.16b
5850	aesmc	q3, q3          @ AES block 4k+7 - round 12
5851	mov	r19, v1.d[0]                            @ AES block 4k+5 - mov low
5852
5853	fmov	d1, r10                               @ CTR block 4k+9
5854	ext	v10.16b, v10.16b, v10.16b, #8                     @ MODULO - other mid alignment
5855
5856	fmov	v1.d[1], r9                               @ CTR block 4k+9
5857	rev	r9, r12                                 @ CTR block 4k+10
5858	add	r12, r12, #1                            @ CTR block 4k+10
5859
5860	aese	q3, v31.16b                                     @ AES block 4k+7 - round 13
5861	orr	r9, r11, r9, lsl #32            @ CTR block 4k+10
5862
5863	rev64	q5, q5                                    @ GHASH block 4k+5
5864	eor	r20, r20, r14                   @ AES block 4k+5 - round 14 high
5865#ifdef __ARMEB__
5866	rev	r20, r20
5867#endif
5868	stp	r6, r7, [r2], #16        @ AES block 4k+4 - store result
5869
5870	eor	r19, r19, r13                   @ AES block 4k+5 - round 14 low
5871#ifdef __ARMEB__
5872	rev	r19, r19
5873#endif
5874	stp	r19, r20, [r2], #16        @ AES block 4k+5 - store result
5875
5876	rev64	q4, q4                                    @ GHASH block 4k+4
5877	eor	v11.16b, v11.16b, v10.16b                         @ MODULO - fold into low
5878	blt	.L256_dec_main_loop
5879
5880
5881.L256_dec_prepretail:@ PREPRETAIL
@ Straight-line stage reached by fall-through from the loop above when its
@ blt is not taken (other entry branches may exist outside this view).
@ Three independent streams of work are interleaved here:
@   AES    - q0-q3 each pass through aese+aesmc with v18-v30 (rounds 0-12)
@            and a final aese with v31 (round 13, no aesmc).  The last round
@            key is not applied in vector form; block halves are XORed with
@            r13 and r14 in general registers instead.
@   GHASH  - the byte-reversed blocks q4-q7 are multiplied with pmull and
@            pmull2 by v15, v14, v13 and v12; v17 and v16 supply the operands
@            for the middle products.  Low, middle and high partial products
@            accumulate in v11, v10 and q9.
@   MODULO - the constant built in q8 (0xc2, then shl by 56) folds q9 and v10
@            back into v11, so v11 holds the reduced accumulator on exit.
@ The results for blocks 4k+2 and 4k+3 are moved to r21-r24, XORed with
@ r13 and r14, and stored through r2 (post-incremented by 16 per block).
@ Counter blocks 4k+6 and 4k+7 are rebuilt in v2 and v3 from r10 and r9.
@ Execution falls through into .L256_dec_tail.
5882	ext	v11.16b, v11.16b, v11.16b, #8                     @ PRE 0
5883	mov	r21, v2.d[0]                            @ AES block 4k+2 - mov low
5884	eor	q3, q7, q3                            @ AES block 4k+3 - result
5885
5886	aese	q0, v18.16b
5887	aesmc	q0, q0          @ AES block 4k+4 - round 0
5888	mov	r22, v2.d[1]                            @ AES block 4k+2 - mov high
5889
5890	aese	q1, v18.16b
5891	aesmc	q1, q1          @ AES block 4k+5 - round 0
@ v2 has been copied out to r21 and r22 above, so it can now be reused for
@ the next counter block.
5892	fmov	d2, r10                               @ CTR block 4k+6
5893
5894	fmov	v2.d[1], r9                               @ CTR block 4k+6
5895	rev	r9, r12                                 @ CTR block 4k+7
@ PRE 1 folds the rotated running accumulator (v11) into block 4k before it
@ is multiplied.
5896	eor	q4, q4, v11.16b                           @ PRE 1
5897
5898	rev64	q6, q6                                    @ GHASH block 4k+2
5899	orr	r9, r11, r9, lsl #32            @ CTR block 4k+7
5900	mov	r23, v3.d[0]                            @ AES block 4k+3 - mov low
5901
5902	aese	q1, v19.16b
5903	aesmc	q1, q1          @ AES block 4k+5 - round 1
5904	mov	r24, v3.d[1]                            @ AES block 4k+3 - mov high
5905
5906	pmull	v11.1q, q4, v15.1d                       @ GHASH block 4k - low
5907	mov	d8, v4.d[1]                                  @ GHASH block 4k - mid
5908	fmov	d3, r10                               @ CTR block 4k+7
5909
5910	pmull2	v9.1q, q4, v15.2d                       @ GHASH block 4k - high
5911	fmov	v3.d[1], r9                               @ CTR block 4k+7
5912
5913	aese	q2, v18.16b
5914	aesmc	q2, q2          @ AES block 4k+6 - round 0
5915	mov	d10, v17.d[1]                               @ GHASH block 4k - mid
5916
5917	aese	q0, v19.16b
5918	aesmc	q0, q0          @ AES block 4k+4 - round 1
5919	eor	q8, q8, q4                          @ GHASH block 4k - mid
5920
5921	pmull2	v4.1q, q5, v14.2d                          @ GHASH block 4k+1 - high
5922
5923	aese	q2, v19.16b
5924	aesmc	q2, q2          @ AES block 4k+6 - round 1
5925	rev64	q7, q7                                    @ GHASH block 4k+3
5926
5927	aese	q3, v18.16b
5928	aesmc	q3, q3          @ AES block 4k+7 - round 0
5929
5930	pmull	v10.1q, q8, v10.1d                      @ GHASH block 4k - mid
5931	eor	q9, q9, q4                         @ GHASH block 4k+1 - high
5932
5933	pmull	v8.1q, q5, v14.1d                          @ GHASH block 4k+1 - low
5934
5935	aese	q3, v19.16b
5936	aesmc	q3, q3          @ AES block 4k+7 - round 1
5937	mov	d4, v5.d[1]                                  @ GHASH block 4k+1 - mid
5938
5939	aese	q0, v20.16b
5940	aesmc	q0, q0          @ AES block 4k+4 - round 2
5941
5942	aese	q1, v20.16b
5943	aesmc	q1, q1          @ AES block 4k+5 - round 2
5944	eor	v11.16b, v11.16b, q8                         @ GHASH block 4k+1 - low
5945
5946	aese	q2, v20.16b
5947	aesmc	q2, q2          @ AES block 4k+6 - round 2
5948
5949	aese	q0, v21.16b
5950	aesmc	q0, q0          @ AES block 4k+4 - round 3
5951	mov	d8, v6.d[1]                                  @ GHASH block 4k+2 - mid
5952
5953	aese	q3, v20.16b
5954	aesmc	q3, q3          @ AES block 4k+7 - round 2
5955	eor	q4, q4, q5                          @ GHASH block 4k+1 - mid
5956
5957	pmull	v5.1q, q6, v13.1d                          @ GHASH block 4k+2 - low
5958
5959	aese	q0, v22.16b
5960	aesmc	q0, q0          @ AES block 4k+4 - round 4
5961
5962	aese	q3, v21.16b
5963	aesmc	q3, q3          @ AES block 4k+7 - round 3
5964	eor	q8, q8, q6                          @ GHASH block 4k+2 - mid
5965
5966	pmull	v4.1q, q4, v17.1d                          @ GHASH block 4k+1 - mid
5967
5968	aese	q0, v23.16b
5969	aesmc	q0, q0          @ AES block 4k+4 - round 5
5970	eor	v11.16b, v11.16b, q5                         @ GHASH block 4k+2 - low
5971
5972	aese	q3, v22.16b
5973	aesmc	q3, q3          @ AES block 4k+7 - round 4
5974
5975	pmull2	v5.1q, q7, v12.2d                          @ GHASH block 4k+3 - high
5976	eor	v10.16b, v10.16b, q4                         @ GHASH block 4k+1 - mid
5977
5978	pmull2	v4.1q, q6, v13.2d                          @ GHASH block 4k+2 - high
5979
5980	aese	q3, v23.16b
5981	aesmc	q3, q3          @ AES block 4k+7 - round 5
@ The middle term is duplicated into the upper half so that the pmull2 below
@ can pair it with the upper half of v16.
5982	ins	v8.d[1], v8.d[0]                                @ GHASH block 4k+2 - mid
5983
5984	aese	q2, v21.16b
5985	aesmc	q2, q2          @ AES block 4k+6 - round 3
5986
5987	aese	q1, v21.16b
5988	aesmc	q1, q1          @ AES block 4k+5 - round 3
5989	eor	q9, q9, q4                         @ GHASH block 4k+2 - high
5990
5991	pmull	v4.1q, q7, v12.1d                          @ GHASH block 4k+3 - low
5992
5993	aese	q2, v22.16b
5994	aesmc	q2, q2          @ AES block 4k+6 - round 4
5995	mov	d6, v7.d[1]                                  @ GHASH block 4k+3 - mid
5996
5997	aese	q1, v22.16b
5998	aesmc	q1, q1          @ AES block 4k+5 - round 4
5999
6000	pmull2	v8.1q, q8, v16.2d                          @ GHASH block 4k+2 - mid
6001
6002	aese	q2, v23.16b
6003	aesmc	q2, q2          @ AES block 4k+6 - round 5
6004	eor	q6, q6, q7                          @ GHASH block 4k+3 - mid
6005
6006	aese	q1, v23.16b
6007	aesmc	q1, q1          @ AES block 4k+5 - round 5
6008
6009	aese	q3, v24.16b
6010	aesmc	q3, q3          @ AES block 4k+7 - round 6
6011	eor	v10.16b, v10.16b, q8                         @ GHASH block 4k+2 - mid
6012
6013	aese	q2, v24.16b
6014	aesmc	q2, q2          @ AES block 4k+6 - round 6
6015
6016	aese	q0, v24.16b
6017	aesmc	q0, q0          @ AES block 4k+4 - round 6
@ q8 is free again after its last GHASH use above; it now starts to become
@ the reduction constant (completed by the shl on d8 further down).
6018	movi	q8, #0xc2
6019
6020	aese	q1, v24.16b
6021	aesmc	q1, q1          @ AES block 4k+5 - round 6
6022	eor	v11.16b, v11.16b, q4                         @ GHASH block 4k+3 - low
6023
6024	pmull	v6.1q, q6, v16.1d                          @ GHASH block 4k+3 - mid
6025
6026	aese	q3, v25.16b
6027	aesmc	q3, q3          @ AES block 4k+7 - round 7
6028	eor	q9, q9, q5                         @ GHASH block 4k+3 - high
6029
6030	aese	q1, v25.16b
6031	aesmc	q1, q1          @ AES block 4k+5 - round 7
6032
6033	aese	q0, v25.16b
6034	aesmc	q0, q0          @ AES block 4k+4 - round 7
6035	eor	v10.16b, v10.16b, q6                         @ GHASH block 4k+3 - mid
6036
6037	aese	q3, v26.16b
6038	aesmc	q3, q3          @ AES block 4k+7 - round 8
6039
6040	aese	q2, v25.16b
6041	aesmc	q2, q2          @ AES block 4k+6 - round 7
@ From here on the three partial sums are complete; the remaining vector
@ work on v9-v11 is the reduction back into v11.
6042	eor	q6, v11.16b, q9                         @ MODULO - karatsuba tidy up
6043
6044	aese	q1, v26.16b
6045	aesmc	q1, q1          @ AES block 4k+5 - round 8
6046
6047	aese	q0, v26.16b
6048	aesmc	q0, q0          @ AES block 4k+4 - round 8
6049	shl	d8, d8, #56               @ mod_constant
6050
6051	aese	q2, v26.16b
6052	aesmc	q2, q2          @ AES block 4k+6 - round 8
6053
6054	aese	q1, v27.16b
6055	aesmc	q1, q1          @ AES block 4k+5 - round 9
6056	eor	v10.16b, v10.16b, q6                         @ MODULO - karatsuba tidy up
6057
6058	pmull	v7.1q, q9, q8            @ MODULO - top 64b align with mid
6059
6060	aese	q2, v27.16b
6061	aesmc	q2, q2          @ AES block 4k+6 - round 9
6062	ext	q9, q9, q9, #8                     @ MODULO - other top alignment
6063
6064	aese	q3, v27.16b
6065	aesmc	q3, q3          @ AES block 4k+7 - round 9
6066
6067	aese	q0, v27.16b
6068	aesmc	q0, q0          @ AES block 4k+4 - round 9
6069	eor	v10.16b, v10.16b, q7                      @ MODULO - fold into mid
6070
6071	aese	q2, v28.16b
6072	aesmc	q2, q2          @ AES block 4k+6 - round 10
6073
6074	aese	q3, v28.16b
6075	aesmc	q3, q3          @ AES block 4k+7 - round 10
6076
6077	aese	q0, v28.16b
6078	aesmc	q0, q0          @ AES block 4k+4 - round 10
@ Last-round-key XOR for the two carried blocks (r21-r24) is done in general
@ registers; on __ARMEB__ builds each half is byte-reversed before the store.
6079	eor	r22, r22, r14                   @ AES block 4k+2 - round 14 high
6080#ifdef __ARMEB__
6081	rev	r22, r22
6082#endif
6083	aese	q1, v28.16b
6084	aesmc	q1, q1          @ AES block 4k+5 - round 10
6085	eor	r23, r23, r13                   @ AES block 4k+3 - round 14 low
6086#ifdef __ARMEB__
6087	rev	r23, r23
6088#endif
6089	aese	q2, v29.16b
6090	aesmc	q2, q2          @ AES block 4k+6 - round 11
6091	eor	v10.16b, v10.16b, q9                         @ MODULO - fold into mid
6092
6093	aese	q0, v29.16b
6094	aesmc	q0, q0          @ AES block 4k+4 - round 11
6095	add	r12, r12, #1                            @ CTR block 4k+7
6096
6097	aese	q1, v29.16b
6098	aesmc	q1, q1          @ AES block 4k+5 - round 11
6099	eor	r21, r21, r13                   @ AES block 4k+2 - round 14 low
6100#ifdef __ARMEB__
6101	rev	r21, r21
6102#endif
6103
6104	aese	q2, v30.16b
6105	aesmc	q2, q2          @ AES block 4k+6 - round 12
6106
6107	pmull	v8.1q, v10.1d, q8     @ MODULO - mid 64b align with low
6108	eor	r24, r24, r14                   @ AES block 4k+3 - round 14 high
6109#ifdef __ARMEB__
6110	rev	r24, r24
6111#endif
6112
6113	aese	q3, v29.16b
6114	aesmc	q3, q3          @ AES block 4k+7 - round 11
6115	stp	r21, r22, [r2], #16        @ AES block 4k+2 - store result
6116
6117	aese	q1, v30.16b
6118	aesmc	q1, q1          @ AES block 4k+5 - round 12
6119	ext	v10.16b, v10.16b, v10.16b, #8                     @ MODULO - other mid alignment
6120
6121	aese	q0, v30.16b
6122	aesmc	q0, q0          @ AES block 4k+4 - round 12
6123	stp	r23, r24, [r2], #16        @ AES block 4k+3 - store result
6124
6125	aese	q3, v30.16b
6126	aesmc	q3, q3          @ AES block 4k+7 - round 12
6127	eor	v11.16b, v11.16b, q8               @ MODULO - fold into low
6128
@ Round 13 is aese only: no aesmc follows for any of the four blocks.
6129	aese	q1, v31.16b                                     @ AES block 4k+5 - round 13
6130
6131	aese	q0, v31.16b                                     @ AES block 4k+4 - round 13
6132
6133	aese	q3, v31.16b                                     @ AES block 4k+7 - round 13
6134
6135	aese	q2, v31.16b                                     @ AES block 4k+6 - round 13
6136	eor	v11.16b, v11.16b, v10.16b                         @ MODULO - fold into low
6137.L256_dec_tail:@ TAIL
@ Dispatcher for the remaining input.  r5 is set to r4 - r0, the number of
@ bytes still to process (r4 is presumably the end-of-input pointer set up
@ outside this view - confirm).  The first remaining ciphertext block is
@ loaded into q5 and combined with q0, and its two halves are left in r6 and
@ r7 with r13 and r14 already XORed in, ready to be stored by whichever
@ stage runs next.  q8 receives the rotated accumulator v11 so that it can
@ be fed into the first block that gets hashed.
@ Dispatch on r5 (signed bgt):
@   more than 48 -> .L256_dec_blocks_more_than_3
@   more than 32 -> .L256_dec_blocks_more_than_2
@   more than 16 -> .L256_dec_blocks_more_than_1
@   otherwise    -> .L256_dec_blocks_less_than_1
@ Each stage that is skipped costs one decrement of r12, and q1/q2 are
@ shifted along into q2/q3 so that the entered stage finds the next unused
@ AES output block in the register it reads.
6138
6139	sub	r5, r4, r0   @ main_end_input_ptr is number of bytes left to process
6140	ld1	{ q5}, [r0], #16                      @ AES block 4k+4 - load ciphertext
6141
6142	eor	q0, q5, q0                            @ AES block 4k+4 - result
6143
6144	mov	r6, v0.d[0]                            @ AES block 4k+4 - mov low
6145
6146	mov	r7, v0.d[1]                            @ AES block 4k+4 - mov high
6147	ext	q8, v11.16b, v11.16b, #8                     @ prepare final partial tag
6148
@ The flags from this cmp are consumed by the bgt after the two eor blocks;
@ nothing in between is a flag-setting form.
6149	cmp	r5, #48
6150
6151	eor	r6, r6, r13                   @ AES block 4k+4 - round 14 low
6152#ifdef __ARMEB__
6153	rev	r6, r6
6154#endif
6155
6156	eor	r7, r7, r14                   @ AES block 4k+4 - round 14 high
6157#ifdef __ARMEB__
6158	rev	r7, r7
6159#endif
6160	bgt	.L256_dec_blocks_more_than_3
6161
@ At most 3 blocks remain: the accumulators v10, v11 and q9 are cleared here
@ because the skipped stage is the one that would have initialised them; the
@ running tag survives in q8.
6162	sub	r12, r12, #1
6163	mov	q3, q2
6164	movi	v10.8b, #0
6165
6166	movi	v11.8b, #0
6167	cmp	r5, #32
6168
6169	movi	q9, #0
6170	mov	q2, q1
6171	bgt	.L256_dec_blocks_more_than_2
6172
@ At most 2 blocks remain.
6173	sub	r12, r12, #1
6174
6175	mov	q3, q1
6176	cmp	r5, #16
6177	bgt	.L256_dec_blocks_more_than_1
6178
@ At most 1 block remains.
6179	sub	r12, r12, #1
6180	b	.L256_dec_blocks_less_than_1
6181.L256_dec_blocks_more_than_3:@ blocks left >  3
@ Final-3 block.  Stores the result already held in r6 and r7, hashes the
@ ciphertext block in q5 (byte-reversed into q4 and XORed with the partial
@ tag in q8) and prepares the next result from freshly loaded ciphertext
@ combined with q1.
@ GHASH here writes v9 (high), v10 (mid) and v11 (low) directly instead of
@ accumulating, using v15 and the upper half of v17.  q8 is zeroed afterwards
@ so later stages do not add the partial tag a second time.
@ Falls through into .L256_dec_blocks_more_than_2.
6182	rev64	q4, q5                                   @ GHASH final-3 block
6183	ld1	{ q5}, [r0], #16                     @ AES final-2 block - load ciphertext
6184
6185	stp	r6, r7, [r2], #16       @ AES final-3 block  - store result
6186
6187	mov	d10, v17.d[1]                              @ GHASH final-3 block - mid
6188
6189	eor	q4, q4, q8                          @ feed in partial tag
6190
6191	eor	q0, q5, q1                           @ AES final-2 block - result
6192
6193	mov	d22, v4.d[1]                                @ GHASH final-3 block - mid
6194
6195	mov	r6, v0.d[0]                           @ AES final-2 block - mov low
6196
6197	mov	r7, v0.d[1]                           @ AES final-2 block - mov high
6198
6199	eor	v22.8b, v22.8b, q4                     @ GHASH final-3 block - mid
6200
6201	movi	q8, #0                                       @ suppress further partial tag feed in
6202
6203	pmull2	v9.1q, q4, v15.2d                      @ GHASH final-3 block - high
6204
6205	pmull	v10.1q, v22.1d, v10.1d                   @ GHASH final-3 block - mid
6206	eor	r6, r6, r13                  @ AES final-2 block - round 14 low
6207#ifdef __ARMEB__
6208	rev	r6, r6
6209#endif
6210
6211	pmull	v11.1q, q4, v15.1d                      @ GHASH final-3 block - low
6212	eor	r7, r7, r14                  @ AES final-2 block - round 14 high
6213#ifdef __ARMEB__
6214	rev	r7, r7
6215#endif
6216.L256_dec_blocks_more_than_2:@ blocks left >  2
@ Final-2 block.  Entered by fall-through from the stage above or by the
@ branch in .L256_dec_tail.  Stores the result held in r6 and r7, hashes the
@ ciphertext block in q5 and prepares the next result from newly loaded
@ ciphertext combined with q2.
@ GHASH uses v14 and the lower half of v17; the products are formed in the
@ temporaries v20 (high), v21 (low) and v22 (mid) and then XORed into q9,
@ v11 and v10.  q8 is zeroed after it has been fed in.
@ Falls through into .L256_dec_blocks_more_than_1.
6217
6218	rev64	q4, q5                                   @ GHASH final-2 block
6219	ld1	{ q5}, [r0], #16                     @ AES final-1 block - load ciphertext
6220
6221	eor	q4, q4, q8                          @ feed in partial tag
6222	stp	r6, r7, [r2], #16       @ AES final-2 block  - store result
6223
6224	eor	q0, q5, q2                           @ AES final-1 block - result
6225
6226	mov	d22, v4.d[1]                                @ GHASH final-2 block - mid
6227
6228	pmull	v21.1q, q4, v14.1d                         @ GHASH final-2 block - low
6229
6230	pmull2	v20.1q, q4, v14.2d                         @ GHASH final-2 block - high
6231
6232	eor	v22.8b, v22.8b, q4                     @ GHASH final-2 block - mid
6233	mov	r6, v0.d[0]                           @ AES final-1 block - mov low
6234
6235	mov	r7, v0.d[1]                           @ AES final-1 block - mov high
6236	eor	v11.16b, v11.16b, v21.16b                           @ GHASH final-2 block - low
6237	movi	q8, #0                                       @ suppress further partial tag feed in
6238
6239	pmull	v22.1q, v22.1d, v17.1d                     @ GHASH final-2 block - mid
6240
6241	eor	q9, q9, v20.16b                           @ GHASH final-2 block - high
6242	eor	r6, r6, r13                  @ AES final-1 block - round 14 low
6243#ifdef __ARMEB__
6244	rev	r6, r6
6245#endif
6246
6247	eor	v10.16b, v10.16b, v22.16b                      @ GHASH final-2 block - mid
6248	eor	r7, r7, r14                  @ AES final-1 block - round 14 high
6249#ifdef __ARMEB__
6250	rev	r7, r7
6251#endif
6252.L256_dec_blocks_more_than_1:@ blocks left >  1
@ Final-1 block.  Entered by fall-through from the stage above or by the
@ branch in .L256_dec_tail.  Stores the result held in r6 and r7, hashes the
@ ciphertext block in q5 and prepares the last result from newly loaded
@ ciphertext combined with q3.  That last result is left in r6 and r7
@ without being stored; the next stage masks and stores it.
@ GHASH uses v13 and the upper half of v16: the middle term is duplicated
@ into both halves of v22 so that pmull2 can be used.  Products go through
@ v20, v21 and v22 into q9, v11 and v10.
@ Falls through into .L256_dec_blocks_less_than_1.
6253
6254	stp	r6, r7, [r2], #16       @ AES final-1 block  - store result
6255	rev64	q4, q5                                   @ GHASH final-1 block
6256
6257	ld1	{ q5}, [r0], #16                     @ AES final block - load ciphertext
6258
6259	eor	q4, q4, q8                          @ feed in partial tag
6260	movi	q8, #0                                       @ suppress further partial tag feed in
6261
6262	mov	d22, v4.d[1]                                @ GHASH final-1 block - mid
6263
6264	eor	q0, q5, q3                           @ AES final block - result
6265
6266	pmull2	v20.1q, q4, v13.2d                         @ GHASH final-1 block - high
6267
6268	eor	v22.8b, v22.8b, q4                     @ GHASH final-1 block - mid
6269
6270	pmull	v21.1q, q4, v13.1d                         @ GHASH final-1 block - low
6271	mov	r6, v0.d[0]                           @ AES final block - mov low
6272
6273	ins	v22.d[1], v22.d[0]                           @ GHASH final-1 block - mid
6274
6275	mov	r7, v0.d[1]                           @ AES final block - mov high
6276
6277	pmull2	v22.1q, v22.2d, v16.2d                     @ GHASH final-1 block - mid
6278	eor	r6, r6, r13                  @ AES final block - round 14 low
6279#ifdef __ARMEB__
6280	rev	r6, r6
6281#endif
6282	eor	v11.16b, v11.16b, v21.16b                           @ GHASH final-1 block - low
6283
6284	eor	q9, q9, v20.16b                           @ GHASH final-1 block - high
6285
6286	eor	v10.16b, v10.16b, v22.16b                      @ GHASH final-1 block - mid
6287	eor	r7, r7, r14                  @ AES final block - round 14 high
6288#ifdef __ARMEB__
6289	rev	r7, r7
6290#endif
6291.L256_dec_blocks_less_than_1:@ blocks left <= 1
@ Last, possibly partial, block followed by the function epilogue.
@ Steps:
@   1. Turn the bit length in r1 into a count of unused bits in the final
@      block and build a 128-bit keep-mask in r9 (low half) and r10 (high
@      half).  r13 and r14 are no longer needed as key words and are reused
@      as all-ones values for this.
@   2. Merge the result in r6 and r7 with the 16 bytes already present at
@      r2: masked-in bits come from the result, all other bits keep their
@      existing value.  The merged block is stored at r2 without
@      post-increment.
@   3. Apply the same mask to the ciphertext block in q5, hash it with v12
@      and the lower half of v16, and run the final reduction into v11.
@   4. Store the counter word derived from r12 at r16 + 12 and the rotated,
@      byte-reversed v11 at r3.
@   5. Return r15 in r0 after restoring r19-r24 and d8-d15 from a 112-byte
@      stack frame (r15 and the frame are set up outside this view;
@      presumably r15 is the processed byte length - confirm).
6292
6293	and	r1, r1, #127                   @ bit_length %= 128
6294	mvn	r14, xzr                                     @ rk14_h = 0xffffffffffffffff
6295
6296	sub	r1, r1, #128                   @ bit_length -= 128
6297	mvn	r13, xzr                                     @ rk14_l = 0xffffffffffffffff
6298
6299	ldp	r4, r5, [r2] @ load existing bytes we need to not overwrite
6300	neg	r1, r1                         @ bit_length = 128 - #bits in input (in range [1,128])
6301
@ After this and, r1 is in the range 0 to 127; 0 means the block is full.
6302	and	r1, r1, #127                   @ bit_length %= 128
6303
@ NOTE(review): for r1 of 64 or more this relies on the register shift using
@ only the low 6 bits of r1, as AArch64 LSR-by-register does - confirm.
6304	lsr	r14, r14, r1                    @ rk14_h is mask for top 64b of last block
6305	cmp	r1, #64
6306
@ r1 below 64: low mask r9 = all ones, high mask r10 = shifted r14.
@ r1 64 or above: low mask r9 = shifted r14, high mask r10 = 0.
6307	csel	r9, r13, r14, lt
6308	csel	r10, r14, xzr, lt
6309
6310	fmov	d0, r9                                  @ ctr0b is mask for last block
6311	and	r6, r6, r9
6312
6313	mov	v0.d[1], r10
6314	bic	r4, r4, r9          @ mask out low existing bytes
6315
@ r9 is reused from here on: it now carries the counter word that is stored
@ at r16 + 12 further down.
6316#ifndef __ARMEB__
6317	rev	r9, r12
6318#else
6319	mov	r9, r12
6320#endif
6321
6322	bic	r5, r5, r10      @ mask out high existing bytes
6323
6324	orr	r6, r6, r4
6325
6326	and	r7, r7, r10
6327
6328	orr	r7, r7, r5
6329
6330	and	q5, q5, q0                            @ possibly partial last block has zeroes in highest bits
6331
6332	rev64	q4, q5                                    @ GHASH final block
6333
6334	eor	q4, q4, q8                           @ feed in partial tag
6335
6336	pmull	v21.1q, q4, v12.1d                          @ GHASH final block - low
6337
6338	mov	d8, v4.d[1]                                  @ GHASH final block - mid
6339
6340	eor	q8, q8, q4                          @ GHASH final block - mid
6341
6342	pmull2	v20.1q, q4, v12.2d                          @ GHASH final block - high
6343
6344	pmull	v8.1q, q8, v16.1d                          @ GHASH final block - mid
6345
6346	eor	q9, q9, v20.16b                            @ GHASH final block - high
6347
6348	eor	v11.16b, v11.16b, v21.16b                            @ GHASH final block - low
6349
6350	eor	v10.16b, v10.16b, q8                         @ GHASH final block - mid
@ Same reduction sequence as in the stage before the tail: q8 becomes the
@ constant, q9 is folded into v10, then v10 is folded into v11.
6351	movi	q8, #0xc2
6352
6353	eor	q6, v11.16b, q9                         @ MODULO - karatsuba tidy up
6354
6355	shl	d8, d8, #56               @ mod_constant
6356
6357	eor	v10.16b, v10.16b, q6                         @ MODULO - karatsuba tidy up
6358
6359	pmull	v7.1q, q9, q8            @ MODULO - top 64b align with mid
6360
6361	ext	q9, q9, q9, #8                     @ MODULO - other top alignment
6362
6363	eor	v10.16b, v10.16b, q7                      @ MODULO - fold into mid
6364
6365	eor	v10.16b, v10.16b, q9                         @ MODULO - fold into mid
6366
6367	pmull	v8.1q, v10.1d, q8     @ MODULO - mid 64b align with low
6368
6369	ext	v10.16b, v10.16b, v10.16b, #8                     @ MODULO - other mid alignment
6370
6371	eor	v11.16b, v11.16b, q8               @ MODULO - fold into low
6372
@ Merged final block: written in place, r2 is not advanced.
6373	stp	r6, r7, [r2]
6374
6375	str	r9, [r16, #12]                          @ store the updated counter
6376
6377	eor	v11.16b, v11.16b, v10.16b                         @ MODULO - fold into low
@ The ext by 8 plus rev64 puts v11 into the byte order used in memory at r3
@ (the visible start of the file applies the same pair right after loading
@ from r3).
6378	ext	v11.16b, v11.16b, v11.16b, #8
6379	rev64	v11.16b, v11.16b
6380	mov	r0, r15
6381	st1	{ v11.16b }, [r3]
6382
@ Epilogue: reload the saved registers and release the 112-byte frame with
@ the final post-indexed ldp.
6383	ldp	r21, r22, [sp, #16]
6384	ldp	r23, r24, [sp, #32]
6385	ldp	d8, d9, [sp, #48]
6386	ldp	d10, d11, [sp, #64]
6387	ldp	d12, d13, [sp, #80]
6388	ldp	d14, d15, [sp, #96]
6389	ldp	r19, r20, [sp], #112
6390	RET
6391
6392.L256_dec_ret:
@ Early-exit path: returns 0 in r0 without touching the stack.  The branch
@ that reaches this label is outside this view (presumably the zero-length
@ check at function entry - confirm).
6393	mov	r0, #0x0
6394	RET
6395.size	aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel
@ NUL-terminated ASCII identification string embedded after the code; it
@ begins with the words GHASH for ARMv8.
6396.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
6397.align	2
6398.align	2
6399#endif
6400