#include <machine/asm.h>
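/*
 * Multi-buffer AES-CBC for x86-64 using AES-NI: a 4-lane SSE path and an
 * 8-lane AVX path, selected at run time via OPENSSL_ia32cap_P.  Perlasm-style
 * generated code in the form of OpenSSL's aesni-mb-x86_64 module.
 */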
.text



.globl	aesni_multi_cbc_encrypt
.type	aesni_multi_cbc_encrypt,@function
.align	32
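/*
 * aesni_multi_cbc_encrypt(descriptors, key, num)
 *
 * Calling convention as inferred from the code below (SysV AMD64):
 * %rdi points to an array of per-stream descriptors
 * {input ptr, output ptr, 32-bit block count, 16-byte IV}, %rsi is the
 * expanded AES key schedule (240(%rsi) holds the round count), and %edx
 * is the buffer-count argument.  With %edx >= 2 and AVX advertised in
 * OPENSSL_ia32cap_P, control jumps to the 8-lane AVX variant; otherwise
 * the 4-lane SSE path below is used.
 */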
aesni_multi_cbc_encrypt:
.cfi_startproc
	cmpl	$2,%edx
	jb	.Lenc_non_avx
	movl	OPENSSL_ia32cap_P+4(%rip),%ecx
	testl	$268435456,%ecx
	jnz	_avx_cbc_enc_shortcut
	jmp	.Lenc_non_avx
.align	16
.Lenc_non_avx:
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56






	subq	$48,%rsp
	andq	$-64,%rsp
	movq	%rax,16(%rsp)
.cfi_escape	0x0f,0x05,0x77,0x10,0x06,0x23,0x08
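	/*
	 * Local frame (%rsp is aligned down to 64 bytes):
	 *   16(%rsp)            original stack pointer, restored on exit
	 *   24(%rsp)            outer loop counter (the num argument)
	 *   32(%rsp)..44(%rsp)  per-lane remaining block counts
	 * The .cfi_escape above encodes the frame with a
	 * DW_CFA_def_cfa_expression.
	 */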

.Lenc4x_body:
	movdqu	(%rsi),%xmm12
	leaq	120(%rsi),%rsi
	leaq	80(%rdi),%rdi

.Lenc4x_loop_grande:
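	/*
	 * Per 4-lane group: load each stream's input/output pointers, block
	 * count and IV.  %edx becomes the maximum block count across the four
	 * lanes; lanes with no data get their pointers redirected to the
	 * scratch area at %rsp so the shared loop can still run.
	 */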
	movl	%edx,24(%rsp)
	xorl	%edx,%edx
	movl	-64(%rdi),%ecx
	movq	-80(%rdi),%r8
	cmpl	%edx,%ecx
	movq	-72(%rdi),%r12
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	movdqu	-56(%rdi),%xmm2
	movl	%ecx,32(%rsp)
	cmovleq	%rsp,%r8
	movl	-24(%rdi),%ecx
	movq	-40(%rdi),%r9
	cmpl	%edx,%ecx
	movq	-32(%rdi),%r13
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	movdqu	-16(%rdi),%xmm3
	movl	%ecx,36(%rsp)
	cmovleq	%rsp,%r9
	movl	16(%rdi),%ecx
	movq	0(%rdi),%r10
	cmpl	%edx,%ecx
	movq	8(%rdi),%r14
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	movdqu	24(%rdi),%xmm4
	movl	%ecx,40(%rsp)
	cmovleq	%rsp,%r10
	movl	56(%rdi),%ecx
	movq	40(%rdi),%r11
	cmpl	%edx,%ecx
	movq	48(%rdi),%r15
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	movdqu	64(%rdi),%xmm5
	movl	%ecx,44(%rsp)
	cmovleq	%rsp,%r11
	testl	%edx,%edx
	jz	.Lenc4x_done

	movups	16-120(%rsi),%xmm1
	pxor	%xmm12,%xmm2
	movups	32-120(%rsi),%xmm0
	pxor	%xmm12,%xmm3
	movl	240-120(%rsi),%eax
	pxor	%xmm12,%xmm4
	movdqu	(%r8),%xmm6
	pxor	%xmm12,%xmm5
	movdqu	(%r9),%xmm7
	pxor	%xmm6,%xmm2
	movdqu	(%r10),%xmm8
	pxor	%xmm7,%xmm3
	movdqu	(%r11),%xmm9
	pxor	%xmm8,%xmm4
	pxor	%xmm9,%xmm5
	movdqa	32(%rsp),%xmm10
	xorq	%rbx,%rbx
	jmp	.Loop_enc4x

.align	32
.Loop_enc4x:
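	/*
	 * One pass = one ciphertext block per lane, four lanes interleaved.
	 * The .byte 102,15,56,220,xx sequences are hand-encoded
	 * aesenc xmm,xmm instructions (and ...,221,... further down is
	 * aesenclast), kept in byte form so the file assembles on toolchains
	 * without AES-NI mnemonics.  Lanes that run out of data have their
	 * pointers switched to scratch via the cmov instructions below.
	 */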
	addq	$16,%rbx
	leaq	16(%rsp),%rbp
	movl	$1,%ecx
	subq	%rbx,%rbp

.byte	102,15,56,220,209
	prefetcht0	31(%r8,%rbx,1)
	prefetcht0	31(%r9,%rbx,1)
.byte	102,15,56,220,217
	prefetcht0	31(%r10,%rbx,1)
	prefetcht0	31(%r11,%rbx,1)
.byte	102,15,56,220,225
.byte	102,15,56,220,233
	movups	48-120(%rsi),%xmm1
	cmpl	32(%rsp),%ecx
.byte	102,15,56,220,208
.byte	102,15,56,220,216
.byte	102,15,56,220,224
	cmovgeq	%rbp,%r8
	cmovgq	%rbp,%r12
.byte	102,15,56,220,232
	movups	-56(%rsi),%xmm0
	cmpl	36(%rsp),%ecx
.byte	102,15,56,220,209
.byte	102,15,56,220,217
.byte	102,15,56,220,225
	cmovgeq	%rbp,%r9
	cmovgq	%rbp,%r13
.byte	102,15,56,220,233
	movups	-40(%rsi),%xmm1
	cmpl	40(%rsp),%ecx
.byte	102,15,56,220,208
.byte	102,15,56,220,216
.byte	102,15,56,220,224
	cmovgeq	%rbp,%r10
	cmovgq	%rbp,%r14
.byte	102,15,56,220,232
	movups	-24(%rsi),%xmm0
	cmpl	44(%rsp),%ecx
.byte	102,15,56,220,209
.byte	102,15,56,220,217
.byte	102,15,56,220,225
	cmovgeq	%rbp,%r11
	cmovgq	%rbp,%r15
.byte	102,15,56,220,233
	movups	-8(%rsi),%xmm1
	movdqa	%xmm10,%xmm11
.byte	102,15,56,220,208
	prefetcht0	15(%r12,%rbx,1)
	prefetcht0	15(%r13,%rbx,1)
.byte	102,15,56,220,216
	prefetcht0	15(%r14,%rbx,1)
	prefetcht0	15(%r15,%rbx,1)
.byte	102,15,56,220,224
.byte	102,15,56,220,232
	movups	128-120(%rsi),%xmm0
	pxor	%xmm12,%xmm12

.byte	102,15,56,220,209
	pcmpgtd	%xmm12,%xmm11
	movdqu	-120(%rsi),%xmm12
.byte	102,15,56,220,217
	paddd	%xmm11,%xmm10
	movdqa	%xmm10,32(%rsp)
.byte	102,15,56,220,225
.byte	102,15,56,220,233
	movups	144-120(%rsi),%xmm1

	cmpl	$11,%eax

.byte	102,15,56,220,208
.byte	102,15,56,220,216
.byte	102,15,56,220,224
.byte	102,15,56,220,232
	movups	160-120(%rsi),%xmm0

	jb	.Lenc4x_tail

.byte	102,15,56,220,209
.byte	102,15,56,220,217
.byte	102,15,56,220,225
.byte	102,15,56,220,233
	movups	176-120(%rsi),%xmm1

.byte	102,15,56,220,208
.byte	102,15,56,220,216
.byte	102,15,56,220,224
.byte	102,15,56,220,232
	movups	192-120(%rsi),%xmm0

	je	.Lenc4x_tail

.byte	102,15,56,220,209
.byte	102,15,56,220,217
.byte	102,15,56,220,225
.byte	102,15,56,220,233
	movups	208-120(%rsi),%xmm1

.byte	102,15,56,220,208
.byte	102,15,56,220,216
.byte	102,15,56,220,224
.byte	102,15,56,220,232
	movups	224-120(%rsi),%xmm0
	jmp	.Lenc4x_tail

.align	32
.Lenc4x_tail:
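	/*
	 * Last two rounds: finish with aesenclast (.byte ...,221,...), load
	 * the next plaintext block per lane pre-XORed with round key 0
	 * (%xmm12), store the ciphertext just produced, then fold the
	 * pre-whitened plaintext into it to form the next CBC input.
	 */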
.byte	102,15,56,220,209
.byte	102,15,56,220,217
.byte	102,15,56,220,225
.byte	102,15,56,220,233
	movdqu	(%r8,%rbx,1),%xmm6
	movdqu	16-120(%rsi),%xmm1

.byte	102,15,56,221,208
	movdqu	(%r9,%rbx,1),%xmm7
	pxor	%xmm12,%xmm6
.byte	102,15,56,221,216
	movdqu	(%r10,%rbx,1),%xmm8
	pxor	%xmm12,%xmm7
.byte	102,15,56,221,224
	movdqu	(%r11,%rbx,1),%xmm9
	pxor	%xmm12,%xmm8
.byte	102,15,56,221,232
	movdqu	32-120(%rsi),%xmm0
	pxor	%xmm12,%xmm9

	movups	%xmm2,-16(%r12,%rbx,1)
	pxor	%xmm6,%xmm2
	movups	%xmm3,-16(%r13,%rbx,1)
	pxor	%xmm7,%xmm3
	movups	%xmm4,-16(%r14,%rbx,1)
	pxor	%xmm8,%xmm4
	movups	%xmm5,-16(%r15,%rbx,1)
	pxor	%xmm9,%xmm5

	decl	%edx
	jnz	.Loop_enc4x

	movq	16(%rsp),%rax
.cfi_def_cfa	%rax,8
	movl	24(%rsp),%edx










	leaq	160(%rdi),%rdi
	decl	%edx
	jnz	.Lenc4x_loop_grande

.Lenc4x_done:
	movq	-48(%rax),%r15
.cfi_restore	%r15
	movq	-40(%rax),%r14
.cfi_restore	%r14
	movq	-32(%rax),%r13
.cfi_restore	%r13
	movq	-24(%rax),%r12
.cfi_restore	%r12
	movq	-16(%rax),%rbp
.cfi_restore	%rbp
	movq	-8(%rax),%rbx
.cfi_restore	%rbx
	leaq	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lenc4x_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt

.globl	aesni_multi_cbc_decrypt
.type	aesni_multi_cbc_decrypt,@function
.align	32
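/*
 * aesni_multi_cbc_decrypt(descriptors, key, num)
 *
 * Same argument layout as the encrypt entry point above.  Dispatches to
 * the 8-lane AVX variant when %edx >= 2 and AVX is available; the 4-lane
 * SSE path follows.
 */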
aesni_multi_cbc_decrypt:
.cfi_startproc
	cmpl	$2,%edx
	jb	.Ldec_non_avx
	movl	OPENSSL_ia32cap_P+4(%rip),%ecx
	testl	$268435456,%ecx
	jnz	_avx_cbc_dec_shortcut
	jmp	.Ldec_non_avx
.align	16
.Ldec_non_avx:
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56






	subq	$48,%rsp
	andq	$-64,%rsp
	movq	%rax,16(%rsp)
.cfi_escape	0x0f,0x05,0x77,0x10,0x06,0x23,0x08

.Ldec4x_body:
	movdqu	(%rsi),%xmm12
	leaq	120(%rsi),%rsi
	leaq	80(%rdi),%rdi

.Ldec4x_loop_grande:
	movl	%edx,24(%rsp)
	xorl	%edx,%edx
	movl	-64(%rdi),%ecx
	movq	-80(%rdi),%r8
	cmpl	%edx,%ecx
	movq	-72(%rdi),%r12
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	movdqu	-56(%rdi),%xmm6
	movl	%ecx,32(%rsp)
	cmovleq	%rsp,%r8
	movl	-24(%rdi),%ecx
	movq	-40(%rdi),%r9
	cmpl	%edx,%ecx
	movq	-32(%rdi),%r13
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	movdqu	-16(%rdi),%xmm7
	movl	%ecx,36(%rsp)
	cmovleq	%rsp,%r9
	movl	16(%rdi),%ecx
	movq	0(%rdi),%r10
	cmpl	%edx,%ecx
	movq	8(%rdi),%r14
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	movdqu	24(%rdi),%xmm8
	movl	%ecx,40(%rsp)
	cmovleq	%rsp,%r10
	movl	56(%rdi),%ecx
	movq	40(%rdi),%r11
	cmpl	%edx,%ecx
	movq	48(%rdi),%r15
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	movdqu	64(%rdi),%xmm9
	movl	%ecx,44(%rsp)
	cmovleq	%rsp,%r11
	testl	%edx,%edx
	jz	.Ldec4x_done

	movups	16-120(%rsi),%xmm1
	movups	32-120(%rsi),%xmm0
	movl	240-120(%rsi),%eax
	movdqu	(%r8),%xmm2
	movdqu	(%r9),%xmm3
	pxor	%xmm12,%xmm2
	movdqu	(%r10),%xmm4
	pxor	%xmm12,%xmm3
	movdqu	(%r11),%xmm5
	pxor	%xmm12,%xmm4
	pxor	%xmm12,%xmm5
	movdqa	32(%rsp),%xmm10
	xorq	%rbx,%rbx
	jmp	.Loop_dec4x

.align	32
.Loop_dec4x:
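	/*
	 * Four-lane interleaved AES decryption; the .byte 102,15,56,222,xx
	 * sequences are hand-encoded aesdec xmm,xmm instructions
	 * (...,223,... in the tail is aesdeclast).
	 */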
	addq	$16,%rbx
	leaq	16(%rsp),%rbp
	movl	$1,%ecx
	subq	%rbx,%rbp

.byte	102,15,56,222,209
	prefetcht0	31(%r8,%rbx,1)
	prefetcht0	31(%r9,%rbx,1)
.byte	102,15,56,222,217
	prefetcht0	31(%r10,%rbx,1)
	prefetcht0	31(%r11,%rbx,1)
.byte	102,15,56,222,225
.byte	102,15,56,222,233
	movups	48-120(%rsi),%xmm1
	cmpl	32(%rsp),%ecx
.byte	102,15,56,222,208
.byte	102,15,56,222,216
.byte	102,15,56,222,224
	cmovgeq	%rbp,%r8
	cmovgq	%rbp,%r12
.byte	102,15,56,222,232
	movups	-56(%rsi),%xmm0
	cmpl	36(%rsp),%ecx
.byte	102,15,56,222,209
.byte	102,15,56,222,217
.byte	102,15,56,222,225
	cmovgeq	%rbp,%r9
	cmovgq	%rbp,%r13
.byte	102,15,56,222,233
	movups	-40(%rsi),%xmm1
	cmpl	40(%rsp),%ecx
.byte	102,15,56,222,208
.byte	102,15,56,222,216
.byte	102,15,56,222,224
	cmovgeq	%rbp,%r10
	cmovgq	%rbp,%r14
.byte	102,15,56,222,232
	movups	-24(%rsi),%xmm0
	cmpl	44(%rsp),%ecx
.byte	102,15,56,222,209
.byte	102,15,56,222,217
.byte	102,15,56,222,225
	cmovgeq	%rbp,%r11
	cmovgq	%rbp,%r15
.byte	102,15,56,222,233
	movups	-8(%rsi),%xmm1
	movdqa	%xmm10,%xmm11
.byte	102,15,56,222,208
	prefetcht0	15(%r12,%rbx,1)
	prefetcht0	15(%r13,%rbx,1)
.byte	102,15,56,222,216
	prefetcht0	15(%r14,%rbx,1)
	prefetcht0	15(%r15,%rbx,1)
.byte	102,15,56,222,224
.byte	102,15,56,222,232
	movups	128-120(%rsi),%xmm0
	pxor	%xmm12,%xmm12

.byte	102,15,56,222,209
	pcmpgtd	%xmm12,%xmm11
	movdqu	-120(%rsi),%xmm12
.byte	102,15,56,222,217
	paddd	%xmm11,%xmm10
	movdqa	%xmm10,32(%rsp)
.byte	102,15,56,222,225
.byte	102,15,56,222,233
	movups	144-120(%rsi),%xmm1

	cmpl	$11,%eax

.byte	102,15,56,222,208
.byte	102,15,56,222,216
.byte	102,15,56,222,224
.byte	102,15,56,222,232
	movups	160-120(%rsi),%xmm0

	jb	.Ldec4x_tail

.byte	102,15,56,222,209
.byte	102,15,56,222,217
.byte	102,15,56,222,225
.byte	102,15,56,222,233
	movups	176-120(%rsi),%xmm1

.byte	102,15,56,222,208
.byte	102,15,56,222,216
.byte	102,15,56,222,224
.byte	102,15,56,222,232
	movups	192-120(%rsi),%xmm0

	je	.Ldec4x_tail

.byte	102,15,56,222,209
.byte	102,15,56,222,217
.byte	102,15,56,222,225
.byte	102,15,56,222,233
	movups	208-120(%rsi),%xmm1

.byte	102,15,56,222,208
.byte	102,15,56,222,216
.byte	102,15,56,222,224
.byte	102,15,56,222,232
	movups	224-120(%rsi),%xmm0
	jmp	.Ldec4x_tail

.align	32
.Ldec4x_tail:
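	/*
	 * Final rounds: the last round key (%xmm0) is pre-XORed into each
	 * lane's previous ciphertext block (%xmm6..%xmm9) so that aesdeclast
	 * completes the last round and the CBC un-chaining in one step.  The
	 * plaintext is stored, the just-consumed ciphertext becomes the new
	 * chaining value, and the next ciphertext block is loaded pre-XORed
	 * with round key 0 (%xmm12).
	 */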
.byte	102,15,56,222,209
.byte	102,15,56,222,217
.byte	102,15,56,222,225
	pxor	%xmm0,%xmm6
	pxor	%xmm0,%xmm7
.byte	102,15,56,222,233
	movdqu	16-120(%rsi),%xmm1
	pxor	%xmm0,%xmm8
	pxor	%xmm0,%xmm9
	movdqu	32-120(%rsi),%xmm0

.byte	102,15,56,223,214
.byte	102,15,56,223,223
	movdqu	-16(%r8,%rbx,1),%xmm6
	movdqu	-16(%r9,%rbx,1),%xmm7
.byte	102,65,15,56,223,224
.byte	102,65,15,56,223,233
	movdqu	-16(%r10,%rbx,1),%xmm8
	movdqu	-16(%r11,%rbx,1),%xmm9

	movups	%xmm2,-16(%r12,%rbx,1)
	movdqu	(%r8,%rbx,1),%xmm2
	movups	%xmm3,-16(%r13,%rbx,1)
	movdqu	(%r9,%rbx,1),%xmm3
	pxor	%xmm12,%xmm2
	movups	%xmm4,-16(%r14,%rbx,1)
	movdqu	(%r10,%rbx,1),%xmm4
	pxor	%xmm12,%xmm3
	movups	%xmm5,-16(%r15,%rbx,1)
	movdqu	(%r11,%rbx,1),%xmm5
	pxor	%xmm12,%xmm4
	pxor	%xmm12,%xmm5

	decl	%edx
	jnz	.Loop_dec4x

	movq	16(%rsp),%rax
.cfi_def_cfa	%rax,8
	movl	24(%rsp),%edx

	leaq	160(%rdi),%rdi
	decl	%edx
	jnz	.Ldec4x_loop_grande

.Ldec4x_done:
	movq	-48(%rax),%r15
.cfi_restore	%r15
	movq	-40(%rax),%r14
.cfi_restore	%r14
	movq	-32(%rax),%r13
.cfi_restore	%r13
	movq	-24(%rax),%r12
.cfi_restore	%r12
	movq	-16(%rax),%rbp
.cfi_restore	%rbp
	movq	-8(%rax),%rbx
.cfi_restore	%rbx
	leaq	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Ldec4x_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
.type	aesni_multi_cbc_encrypt_avx,@function
.align	32
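/*
 * aesni_multi_cbc_encrypt_avx: 8-lane AVX variant, reached through the
 * _avx_cbc_enc_shortcut entry.  The eight descriptors use the same layout
 * as in the SSE path; each lane's output-minus-input distance is cached at
 * 64(%rsp)..120(%rsp) and its block counter at 32(%rsp)..60(%rsp).
 */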
aesni_multi_cbc_encrypt_avx:
.cfi_startproc
_avx_cbc_enc_shortcut:
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56








	subq	$192,%rsp
	andq	$-128,%rsp
	movq	%rax,16(%rsp)
.cfi_escape	0x0f,0x05,0x77,0x10,0x06,0x23,0x08

.Lenc8x_body:
	vzeroupper
	vmovdqu	(%rsi),%xmm15
	leaq	120(%rsi),%rsi
	leaq	160(%rdi),%rdi
	shrl	$1,%edx

.Lenc8x_loop_grande:

	xorl	%edx,%edx
	movl	-144(%rdi),%ecx
	movq	-160(%rdi),%r8
	cmpl	%edx,%ecx
	movq	-152(%rdi),%rbx
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	vmovdqu	-136(%rdi),%xmm2
	movl	%ecx,32(%rsp)
	cmovleq	%rsp,%r8
	subq	%r8,%rbx
	movq	%rbx,64(%rsp)
	movl	-104(%rdi),%ecx
	movq	-120(%rdi),%r9
	cmpl	%edx,%ecx
	movq	-112(%rdi),%rbp
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	vmovdqu	-96(%rdi),%xmm3
	movl	%ecx,36(%rsp)
	cmovleq	%rsp,%r9
	subq	%r9,%rbp
	movq	%rbp,72(%rsp)
	movl	-64(%rdi),%ecx
	movq	-80(%rdi),%r10
	cmpl	%edx,%ecx
	movq	-72(%rdi),%rbp
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	vmovdqu	-56(%rdi),%xmm4
	movl	%ecx,40(%rsp)
	cmovleq	%rsp,%r10
	subq	%r10,%rbp
	movq	%rbp,80(%rsp)
	movl	-24(%rdi),%ecx
	movq	-40(%rdi),%r11
	cmpl	%edx,%ecx
	movq	-32(%rdi),%rbp
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	vmovdqu	-16(%rdi),%xmm5
	movl	%ecx,44(%rsp)
	cmovleq	%rsp,%r11
	subq	%r11,%rbp
	movq	%rbp,88(%rsp)
	movl	16(%rdi),%ecx
	movq	0(%rdi),%r12
	cmpl	%edx,%ecx
	movq	8(%rdi),%rbp
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	vmovdqu	24(%rdi),%xmm6
	movl	%ecx,48(%rsp)
	cmovleq	%rsp,%r12
	subq	%r12,%rbp
	movq	%rbp,96(%rsp)
	movl	56(%rdi),%ecx
	movq	40(%rdi),%r13
	cmpl	%edx,%ecx
	movq	48(%rdi),%rbp
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	vmovdqu	64(%rdi),%xmm7
	movl	%ecx,52(%rsp)
	cmovleq	%rsp,%r13
	subq	%r13,%rbp
	movq	%rbp,104(%rsp)
	movl	96(%rdi),%ecx
	movq	80(%rdi),%r14
	cmpl	%edx,%ecx
	movq	88(%rdi),%rbp
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	vmovdqu	104(%rdi),%xmm8
	movl	%ecx,56(%rsp)
	cmovleq	%rsp,%r14
	subq	%r14,%rbp
	movq	%rbp,112(%rsp)
	movl	136(%rdi),%ecx
	movq	120(%rdi),%r15
	cmpl	%edx,%ecx
	movq	128(%rdi),%rbp
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	vmovdqu	144(%rdi),%xmm9
	movl	%ecx,60(%rsp)
	cmovleq	%rsp,%r15
	subq	%r15,%rbp
	movq	%rbp,120(%rsp)
	testl	%edx,%edx
	jz	.Lenc8x_done

	vmovups	16-120(%rsi),%xmm1
	vmovups	32-120(%rsi),%xmm0
	movl	240-120(%rsi),%eax

	vpxor	(%r8),%xmm15,%xmm10
	leaq	128(%rsp),%rbp
	vpxor	(%r9),%xmm15,%xmm11
	vpxor	(%r10),%xmm15,%xmm12
	vpxor	(%r11),%xmm15,%xmm13
	vpxor	%xmm10,%xmm2,%xmm2
	vpxor	(%r12),%xmm15,%xmm10
	vpxor	%xmm11,%xmm3,%xmm3
	vpxor	(%r13),%xmm15,%xmm11
	vpxor	%xmm12,%xmm4,%xmm4
	vpxor	(%r14),%xmm15,%xmm12
	vpxor	%xmm13,%xmm5,%xmm5
	vpxor	(%r15),%xmm15,%xmm13
	vpxor	%xmm10,%xmm6,%xmm6
	movl	$1,%ecx
	vpxor	%xmm11,%xmm7,%xmm7
	vpxor	%xmm12,%xmm8,%xmm8
	vpxor	%xmm13,%xmm9,%xmm9
	jmp	.Loop_enc8x

.align	32
.Loop_enc8x:
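	/*
	 * Eight CBC streams interleaved, one vaesenc per lane per round key,
	 * alternating between %xmm1 and %xmm0 as the round-key register so
	 * the next key can be loaded while the current one is in use.  Each
	 * lane's next input block is pre-XORed with round key 0 (%xmm15) and
	 * parked at (%rbp) for the following iteration.
	 */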
	vaesenc	%xmm1,%xmm2,%xmm2
	cmpl	32+0(%rsp),%ecx
	vaesenc	%xmm1,%xmm3,%xmm3
	prefetcht0	31(%r8)
	vaesenc	%xmm1,%xmm4,%xmm4
	vaesenc	%xmm1,%xmm5,%xmm5
	leaq	(%r8,%rbx,1),%rbx
	cmovgeq	%rsp,%r8
	vaesenc	%xmm1,%xmm6,%xmm6
	cmovgq	%rsp,%rbx
	vaesenc	%xmm1,%xmm7,%xmm7
	subq	%r8,%rbx
	vaesenc	%xmm1,%xmm8,%xmm8
	vpxor	16(%r8),%xmm15,%xmm10
	movq	%rbx,64+0(%rsp)
	vaesenc	%xmm1,%xmm9,%xmm9
	vmovups	-72(%rsi),%xmm1
	leaq	16(%r8,%rbx,1),%r8
	vmovdqu	%xmm10,0(%rbp)
	vaesenc	%xmm0,%xmm2,%xmm2
	cmpl	32+4(%rsp),%ecx
	movq	64+8(%rsp),%rbx
	vaesenc	%xmm0,%xmm3,%xmm3
	prefetcht0	31(%r9)
	vaesenc	%xmm0,%xmm4,%xmm4
	vaesenc	%xmm0,%xmm5,%xmm5
	leaq	(%r9,%rbx,1),%rbx
	cmovgeq	%rsp,%r9
	vaesenc	%xmm0,%xmm6,%xmm6
	cmovgq	%rsp,%rbx
	vaesenc	%xmm0,%xmm7,%xmm7
	subq	%r9,%rbx
	vaesenc	%xmm0,%xmm8,%xmm8
	vpxor	16(%r9),%xmm15,%xmm11
	movq	%rbx,64+8(%rsp)
	vaesenc	%xmm0,%xmm9,%xmm9
	vmovups	-56(%rsi),%xmm0
	leaq	16(%r9,%rbx,1),%r9
	vmovdqu	%xmm11,16(%rbp)
	vaesenc	%xmm1,%xmm2,%xmm2
	cmpl	32+8(%rsp),%ecx
	movq	64+16(%rsp),%rbx
	vaesenc	%xmm1,%xmm3,%xmm3
	prefetcht0	31(%r10)
	vaesenc	%xmm1,%xmm4,%xmm4
	prefetcht0	15(%r8)
	vaesenc	%xmm1,%xmm5,%xmm5
	leaq	(%r10,%rbx,1),%rbx
	cmovgeq	%rsp,%r10
	vaesenc	%xmm1,%xmm6,%xmm6
	cmovgq	%rsp,%rbx
	vaesenc	%xmm1,%xmm7,%xmm7
	subq	%r10,%rbx
	vaesenc	%xmm1,%xmm8,%xmm8
	vpxor	16(%r10),%xmm15,%xmm12
	movq	%rbx,64+16(%rsp)
	vaesenc	%xmm1,%xmm9,%xmm9
	vmovups	-40(%rsi),%xmm1
	leaq	16(%r10,%rbx,1),%r10
	vmovdqu	%xmm12,32(%rbp)
	vaesenc	%xmm0,%xmm2,%xmm2
	cmpl	32+12(%rsp),%ecx
	movq	64+24(%rsp),%rbx
	vaesenc	%xmm0,%xmm3,%xmm3
	prefetcht0	31(%r11)
	vaesenc	%xmm0,%xmm4,%xmm4
	prefetcht0	15(%r9)
	vaesenc	%xmm0,%xmm5,%xmm5
	leaq	(%r11,%rbx,1),%rbx
	cmovgeq	%rsp,%r11
	vaesenc	%xmm0,%xmm6,%xmm6
	cmovgq	%rsp,%rbx
	vaesenc	%xmm0,%xmm7,%xmm7
	subq	%r11,%rbx
	vaesenc	%xmm0,%xmm8,%xmm8
	vpxor	16(%r11),%xmm15,%xmm13
	movq	%rbx,64+24(%rsp)
	vaesenc	%xmm0,%xmm9,%xmm9
	vmovups	-24(%rsi),%xmm0
	leaq	16(%r11,%rbx,1),%r11
	vmovdqu	%xmm13,48(%rbp)
	vaesenc	%xmm1,%xmm2,%xmm2
	cmpl	32+16(%rsp),%ecx
	movq	64+32(%rsp),%rbx
	vaesenc	%xmm1,%xmm3,%xmm3
	prefetcht0	31(%r12)
	vaesenc	%xmm1,%xmm4,%xmm4
	prefetcht0	15(%r10)
	vaesenc	%xmm1,%xmm5,%xmm5
	leaq	(%r12,%rbx,1),%rbx
	cmovgeq	%rsp,%r12
	vaesenc	%xmm1,%xmm6,%xmm6
	cmovgq	%rsp,%rbx
	vaesenc	%xmm1,%xmm7,%xmm7
	subq	%r12,%rbx
	vaesenc	%xmm1,%xmm8,%xmm8
	vpxor	16(%r12),%xmm15,%xmm10
	movq	%rbx,64+32(%rsp)
	vaesenc	%xmm1,%xmm9,%xmm9
	vmovups	-8(%rsi),%xmm1
	leaq	16(%r12,%rbx,1),%r12
	vaesenc	%xmm0,%xmm2,%xmm2
	cmpl	32+20(%rsp),%ecx
	movq	64+40(%rsp),%rbx
	vaesenc	%xmm0,%xmm3,%xmm3
	prefetcht0	31(%r13)
	vaesenc	%xmm0,%xmm4,%xmm4
	prefetcht0	15(%r11)
	vaesenc	%xmm0,%xmm5,%xmm5
	leaq	(%rbx,%r13,1),%rbx
	cmovgeq	%rsp,%r13
	vaesenc	%xmm0,%xmm6,%xmm6
	cmovgq	%rsp,%rbx
	vaesenc	%xmm0,%xmm7,%xmm7
	subq	%r13,%rbx
	vaesenc	%xmm0,%xmm8,%xmm8
	vpxor	16(%r13),%xmm15,%xmm11
	movq	%rbx,64+40(%rsp)
	vaesenc	%xmm0,%xmm9,%xmm9
	vmovups	8(%rsi),%xmm0
	leaq	16(%r13,%rbx,1),%r13
	vaesenc	%xmm1,%xmm2,%xmm2
	cmpl	32+24(%rsp),%ecx
	movq	64+48(%rsp),%rbx
	vaesenc	%xmm1,%xmm3,%xmm3
	prefetcht0	31(%r14)
	vaesenc	%xmm1,%xmm4,%xmm4
	prefetcht0	15(%r12)
	vaesenc	%xmm1,%xmm5,%xmm5
	leaq	(%r14,%rbx,1),%rbx
	cmovgeq	%rsp,%r14
	vaesenc	%xmm1,%xmm6,%xmm6
	cmovgq	%rsp,%rbx
	vaesenc	%xmm1,%xmm7,%xmm7
	subq	%r14,%rbx
	vaesenc	%xmm1,%xmm8,%xmm8
	vpxor	16(%r14),%xmm15,%xmm12
	movq	%rbx,64+48(%rsp)
	vaesenc	%xmm1,%xmm9,%xmm9
	vmovups	24(%rsi),%xmm1
	leaq	16(%r14,%rbx,1),%r14
	vaesenc	%xmm0,%xmm2,%xmm2
	cmpl	32+28(%rsp),%ecx
	movq	64+56(%rsp),%rbx
	vaesenc	%xmm0,%xmm3,%xmm3
	prefetcht0	31(%r15)
	vaesenc	%xmm0,%xmm4,%xmm4
	prefetcht0	15(%r13)
	vaesenc	%xmm0,%xmm5,%xmm5
	leaq	(%r15,%rbx,1),%rbx
	cmovgeq	%rsp,%r15
	vaesenc	%xmm0,%xmm6,%xmm6
	cmovgq	%rsp,%rbx
	vaesenc	%xmm0,%xmm7,%xmm7
	subq	%r15,%rbx
	vaesenc	%xmm0,%xmm8,%xmm8
	vpxor	16(%r15),%xmm15,%xmm13
	movq	%rbx,64+56(%rsp)
	vaesenc	%xmm0,%xmm9,%xmm9
	vmovups	40(%rsi),%xmm0
	leaq	16(%r15,%rbx,1),%r15
	vmovdqu	32(%rsp),%xmm14
	prefetcht0	15(%r14)
	prefetcht0	15(%r15)
	cmpl	$11,%eax
	jb	.Lenc8x_tail

	vaesenc	%xmm1,%xmm2,%xmm2
	vaesenc	%xmm1,%xmm3,%xmm3
	vaesenc	%xmm1,%xmm4,%xmm4
	vaesenc	%xmm1,%xmm5,%xmm5
	vaesenc	%xmm1,%xmm6,%xmm6
	vaesenc	%xmm1,%xmm7,%xmm7
	vaesenc	%xmm1,%xmm8,%xmm8
	vaesenc	%xmm1,%xmm9,%xmm9
	vmovups	176-120(%rsi),%xmm1

	vaesenc	%xmm0,%xmm2,%xmm2
	vaesenc	%xmm0,%xmm3,%xmm3
	vaesenc	%xmm0,%xmm4,%xmm4
	vaesenc	%xmm0,%xmm5,%xmm5
	vaesenc	%xmm0,%xmm6,%xmm6
	vaesenc	%xmm0,%xmm7,%xmm7
	vaesenc	%xmm0,%xmm8,%xmm8
	vaesenc	%xmm0,%xmm9,%xmm9
	vmovups	192-120(%rsi),%xmm0
	je	.Lenc8x_tail

	vaesenc	%xmm1,%xmm2,%xmm2
	vaesenc	%xmm1,%xmm3,%xmm3
	vaesenc	%xmm1,%xmm4,%xmm4
	vaesenc	%xmm1,%xmm5,%xmm5
	vaesenc	%xmm1,%xmm6,%xmm6
	vaesenc	%xmm1,%xmm7,%xmm7
	vaesenc	%xmm1,%xmm8,%xmm8
	vaesenc	%xmm1,%xmm9,%xmm9
	vmovups	208-120(%rsi),%xmm1

	vaesenc	%xmm0,%xmm2,%xmm2
	vaesenc	%xmm0,%xmm3,%xmm3
	vaesenc	%xmm0,%xmm4,%xmm4
	vaesenc	%xmm0,%xmm5,%xmm5
	vaesenc	%xmm0,%xmm6,%xmm6
	vaesenc	%xmm0,%xmm7,%xmm7
	vaesenc	%xmm0,%xmm8,%xmm8
	vaesenc	%xmm0,%xmm9,%xmm9
	vmovups	224-120(%rsi),%xmm0

.Lenc8x_tail:
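	/*
	 * Last round: vaesenclast, update the per-lane block counters held in
	 * 32(%rsp) and 48(%rsp), write out each lane's ciphertext block and
	 * XOR in the pre-whitened next plaintext block parked at (%rbp) to
	 * form the next CBC input.
	 */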
	vaesenc	%xmm1,%xmm2,%xmm2
	vpxor	%xmm15,%xmm15,%xmm15
	vaesenc	%xmm1,%xmm3,%xmm3
	vaesenc	%xmm1,%xmm4,%xmm4
	vpcmpgtd	%xmm15,%xmm14,%xmm15
	vaesenc	%xmm1,%xmm5,%xmm5
	vaesenc	%xmm1,%xmm6,%xmm6
	vpaddd	%xmm14,%xmm15,%xmm15
	vmovdqu	48(%rsp),%xmm14
	vaesenc	%xmm1,%xmm7,%xmm7
	movq	64(%rsp),%rbx
	vaesenc	%xmm1,%xmm8,%xmm8
	vaesenc	%xmm1,%xmm9,%xmm9
	vmovups	16-120(%rsi),%xmm1

	vaesenclast	%xmm0,%xmm2,%xmm2
	vmovdqa	%xmm15,32(%rsp)
	vpxor	%xmm15,%xmm15,%xmm15
	vaesenclast	%xmm0,%xmm3,%xmm3
	vaesenclast	%xmm0,%xmm4,%xmm4
	vpcmpgtd	%xmm15,%xmm14,%xmm15
	vaesenclast	%xmm0,%xmm5,%xmm5
	vaesenclast	%xmm0,%xmm6,%xmm6
	vpaddd	%xmm15,%xmm14,%xmm14
	vmovdqu	-120(%rsi),%xmm15
	vaesenclast	%xmm0,%xmm7,%xmm7
	vaesenclast	%xmm0,%xmm8,%xmm8
	vmovdqa	%xmm14,48(%rsp)
	vaesenclast	%xmm0,%xmm9,%xmm9
	vmovups	32-120(%rsi),%xmm0

	vmovups	%xmm2,-16(%r8)
	subq	%rbx,%r8
	vpxor	0(%rbp),%xmm2,%xmm2
	vmovups	%xmm3,-16(%r9)
	subq	72(%rsp),%r9
	vpxor	16(%rbp),%xmm3,%xmm3
	vmovups	%xmm4,-16(%r10)
	subq	80(%rsp),%r10
	vpxor	32(%rbp),%xmm4,%xmm4
	vmovups	%xmm5,-16(%r11)
	subq	88(%rsp),%r11
	vpxor	48(%rbp),%xmm5,%xmm5
	vmovups	%xmm6,-16(%r12)
	subq	96(%rsp),%r12
	vpxor	%xmm10,%xmm6,%xmm6
	vmovups	%xmm7,-16(%r13)
	subq	104(%rsp),%r13
	vpxor	%xmm11,%xmm7,%xmm7
	vmovups	%xmm8,-16(%r14)
	subq	112(%rsp),%r14
	vpxor	%xmm12,%xmm8,%xmm8
	vmovups	%xmm9,-16(%r15)
	subq	120(%rsp),%r15
	vpxor	%xmm13,%xmm9,%xmm9

	decl	%edx
	jnz	.Loop_enc8x

	movq	16(%rsp),%rax
.cfi_def_cfa	%rax,8




.Lenc8x_done:
	vzeroupper
	movq	-48(%rax),%r15
.cfi_restore	%r15
	movq	-40(%rax),%r14
.cfi_restore	%r14
	movq	-32(%rax),%r13
.cfi_restore	%r13
	movq	-24(%rax),%r12
.cfi_restore	%r12
	movq	-16(%rax),%rbp
.cfi_restore	%rbp
	movq	-8(%rax),%rbx
.cfi_restore	%rbx
	leaq	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lenc8x_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx

.type	aesni_multi_cbc_decrypt_avx,@function
.align	32
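/*
 * aesni_multi_cbc_decrypt_avx: 8-lane AVX variant, reached through the
 * _avx_cbc_dec_shortcut entry.  The extra stack space holds the eight
 * chaining values at 192(%rsp)..304(%rsp) and a staging area for incoming
 * ciphertext blocks starting at 128(%rsp).
 */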
aesni_multi_cbc_decrypt_avx:
.cfi_startproc
_avx_cbc_dec_shortcut:
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56









	subq	$256,%rsp
	andq	$-256,%rsp
	subq	$192,%rsp
	movq	%rax,16(%rsp)
.cfi_escape	0x0f,0x05,0x77,0x10,0x06,0x23,0x08

.Ldec8x_body:
	vzeroupper
	vmovdqu	(%rsi),%xmm15
	leaq	120(%rsi),%rsi
	leaq	160(%rdi),%rdi
	shrl	$1,%edx

.Ldec8x_loop_grande:

	xorl	%edx,%edx
	movl	-144(%rdi),%ecx
	movq	-160(%rdi),%r8
	cmpl	%edx,%ecx
	movq	-152(%rdi),%rbx
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	vmovdqu	-136(%rdi),%xmm2
	movl	%ecx,32(%rsp)
	cmovleq	%rsp,%r8
	subq	%r8,%rbx
	movq	%rbx,64(%rsp)
	vmovdqu	%xmm2,192(%rsp)
	movl	-104(%rdi),%ecx
	movq	-120(%rdi),%r9
	cmpl	%edx,%ecx
	movq	-112(%rdi),%rbp
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	vmovdqu	-96(%rdi),%xmm3
	movl	%ecx,36(%rsp)
	cmovleq	%rsp,%r9
	subq	%r9,%rbp
	movq	%rbp,72(%rsp)
	vmovdqu	%xmm3,208(%rsp)
	movl	-64(%rdi),%ecx
	movq	-80(%rdi),%r10
	cmpl	%edx,%ecx
	movq	-72(%rdi),%rbp
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	vmovdqu	-56(%rdi),%xmm4
	movl	%ecx,40(%rsp)
	cmovleq	%rsp,%r10
	subq	%r10,%rbp
	movq	%rbp,80(%rsp)
	vmovdqu	%xmm4,224(%rsp)
	movl	-24(%rdi),%ecx
	movq	-40(%rdi),%r11
	cmpl	%edx,%ecx
	movq	-32(%rdi),%rbp
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	vmovdqu	-16(%rdi),%xmm5
	movl	%ecx,44(%rsp)
	cmovleq	%rsp,%r11
	subq	%r11,%rbp
	movq	%rbp,88(%rsp)
	vmovdqu	%xmm5,240(%rsp)
	movl	16(%rdi),%ecx
	movq	0(%rdi),%r12
	cmpl	%edx,%ecx
	movq	8(%rdi),%rbp
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	vmovdqu	24(%rdi),%xmm6
	movl	%ecx,48(%rsp)
	cmovleq	%rsp,%r12
	subq	%r12,%rbp
	movq	%rbp,96(%rsp)
	vmovdqu	%xmm6,256(%rsp)
	movl	56(%rdi),%ecx
	movq	40(%rdi),%r13
	cmpl	%edx,%ecx
	movq	48(%rdi),%rbp
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	vmovdqu	64(%rdi),%xmm7
	movl	%ecx,52(%rsp)
	cmovleq	%rsp,%r13
	subq	%r13,%rbp
	movq	%rbp,104(%rsp)
	vmovdqu	%xmm7,272(%rsp)
	movl	96(%rdi),%ecx
	movq	80(%rdi),%r14
	cmpl	%edx,%ecx
	movq	88(%rdi),%rbp
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	vmovdqu	104(%rdi),%xmm8
	movl	%ecx,56(%rsp)
	cmovleq	%rsp,%r14
	subq	%r14,%rbp
	movq	%rbp,112(%rsp)
	vmovdqu	%xmm8,288(%rsp)
	movl	136(%rdi),%ecx
	movq	120(%rdi),%r15
	cmpl	%edx,%ecx
	movq	128(%rdi),%rbp
	cmovgl	%ecx,%edx
	testl	%ecx,%ecx
	vmovdqu	144(%rdi),%xmm9
	movl	%ecx,60(%rsp)
	cmovleq	%rsp,%r15
	subq	%r15,%rbp
	movq	%rbp,120(%rsp)
	vmovdqu	%xmm9,304(%rsp)
	testl	%edx,%edx
	jz	.Ldec8x_done

	vmovups	16-120(%rsi),%xmm1
	vmovups	32-120(%rsi),%xmm0
	movl	240-120(%rsi),%eax
	leaq	192+128(%rsp),%rbp

	vmovdqu	(%r8),%xmm2
	vmovdqu	(%r9),%xmm3
	vmovdqu	(%r10),%xmm4
	vmovdqu	(%r11),%xmm5
	vmovdqu	(%r12),%xmm6
	vmovdqu	(%r13),%xmm7
	vmovdqu	(%r14),%xmm8
	vmovdqu	(%r15),%xmm9
	vmovdqu	%xmm2,0(%rbp)
	vpxor	%xmm15,%xmm2,%xmm2
	vmovdqu	%xmm3,16(%rbp)
	vpxor	%xmm15,%xmm3,%xmm3
	vmovdqu	%xmm4,32(%rbp)
	vpxor	%xmm15,%xmm4,%xmm4
	vmovdqu	%xmm5,48(%rbp)
	vpxor	%xmm15,%xmm5,%xmm5
	vmovdqu	%xmm6,64(%rbp)
	vpxor	%xmm15,%xmm6,%xmm6
	vmovdqu	%xmm7,80(%rbp)
	vpxor	%xmm15,%xmm7,%xmm7
	vmovdqu	%xmm8,96(%rbp)
	vpxor	%xmm15,%xmm8,%xmm8
	vmovdqu	%xmm9,112(%rbp)
	vpxor	%xmm15,%xmm9,%xmm9
	xorq	$0x80,%rbp
	movl	$1,%ecx
	jmp	.Loop_dec8x

.align	32
.Loop_dec8x:
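	/*
	 * Eight-lane interleaved vaesdec rounds; each lane's next ciphertext
	 * block is fetched into %xmm10..%xmm13 while the current blocks are
	 * being decrypted (the first four lanes stage theirs at
	 * 128(%rsp)..176(%rsp)).
	 */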
	vaesdec	%xmm1,%xmm2,%xmm2
	cmpl	32+0(%rsp),%ecx
	vaesdec	%xmm1,%xmm3,%xmm3
	prefetcht0	31(%r8)
	vaesdec	%xmm1,%xmm4,%xmm4
	vaesdec	%xmm1,%xmm5,%xmm5
	leaq	(%r8,%rbx,1),%rbx
	cmovgeq	%rsp,%r8
	vaesdec	%xmm1,%xmm6,%xmm6
	cmovgq	%rsp,%rbx
	vaesdec	%xmm1,%xmm7,%xmm7
	subq	%r8,%rbx
	vaesdec	%xmm1,%xmm8,%xmm8
	vmovdqu	16(%r8),%xmm10
	movq	%rbx,64+0(%rsp)
	vaesdec	%xmm1,%xmm9,%xmm9
	vmovups	-72(%rsi),%xmm1
	leaq	16(%r8,%rbx,1),%r8
	vmovdqu	%xmm10,128(%rsp)
	vaesdec	%xmm0,%xmm2,%xmm2
	cmpl	32+4(%rsp),%ecx
	movq	64+8(%rsp),%rbx
	vaesdec	%xmm0,%xmm3,%xmm3
	prefetcht0	31(%r9)
	vaesdec	%xmm0,%xmm4,%xmm4
	vaesdec	%xmm0,%xmm5,%xmm5
	leaq	(%r9,%rbx,1),%rbx
	cmovgeq	%rsp,%r9
	vaesdec	%xmm0,%xmm6,%xmm6
	cmovgq	%rsp,%rbx
	vaesdec	%xmm0,%xmm7,%xmm7
	subq	%r9,%rbx
	vaesdec	%xmm0,%xmm8,%xmm8
	vmovdqu	16(%r9),%xmm11
	movq	%rbx,64+8(%rsp)
	vaesdec	%xmm0,%xmm9,%xmm9
	vmovups	-56(%rsi),%xmm0
	leaq	16(%r9,%rbx,1),%r9
	vmovdqu	%xmm11,144(%rsp)
	vaesdec	%xmm1,%xmm2,%xmm2
	cmpl	32+8(%rsp),%ecx
	movq	64+16(%rsp),%rbx
	vaesdec	%xmm1,%xmm3,%xmm3
	prefetcht0	31(%r10)
	vaesdec	%xmm1,%xmm4,%xmm4
	prefetcht0	15(%r8)
	vaesdec	%xmm1,%xmm5,%xmm5
	leaq	(%r10,%rbx,1),%rbx
	cmovgeq	%rsp,%r10
	vaesdec	%xmm1,%xmm6,%xmm6
	cmovgq	%rsp,%rbx
	vaesdec	%xmm1,%xmm7,%xmm7
	subq	%r10,%rbx
	vaesdec	%xmm1,%xmm8,%xmm8
	vmovdqu	16(%r10),%xmm12
	movq	%rbx,64+16(%rsp)
	vaesdec	%xmm1,%xmm9,%xmm9
	vmovups	-40(%rsi),%xmm1
	leaq	16(%r10,%rbx,1),%r10
	vmovdqu	%xmm12,160(%rsp)
	vaesdec	%xmm0,%xmm2,%xmm2
	cmpl	32+12(%rsp),%ecx
	movq	64+24(%rsp),%rbx
	vaesdec	%xmm0,%xmm3,%xmm3
	prefetcht0	31(%r11)
	vaesdec	%xmm0,%xmm4,%xmm4
	prefetcht0	15(%r9)
	vaesdec	%xmm0,%xmm5,%xmm5
	leaq	(%r11,%rbx,1),%rbx
	cmovgeq	%rsp,%r11
	vaesdec	%xmm0,%xmm6,%xmm6
	cmovgq	%rsp,%rbx
	vaesdec	%xmm0,%xmm7,%xmm7
	subq	%r11,%rbx
	vaesdec	%xmm0,%xmm8,%xmm8
	vmovdqu	16(%r11),%xmm13
	movq	%rbx,64+24(%rsp)
	vaesdec	%xmm0,%xmm9,%xmm9
	vmovups	-24(%rsi),%xmm0
	leaq	16(%r11,%rbx,1),%r11
	vmovdqu	%xmm13,176(%rsp)
	vaesdec	%xmm1,%xmm2,%xmm2
	cmpl	32+16(%rsp),%ecx
	movq	64+32(%rsp),%rbx
	vaesdec	%xmm1,%xmm3,%xmm3
	prefetcht0	31(%r12)
	vaesdec	%xmm1,%xmm4,%xmm4
	prefetcht0	15(%r10)
	vaesdec	%xmm1,%xmm5,%xmm5
	leaq	(%r12,%rbx,1),%rbx
	cmovgeq	%rsp,%r12
	vaesdec	%xmm1,%xmm6,%xmm6
	cmovgq	%rsp,%rbx
	vaesdec	%xmm1,%xmm7,%xmm7
	subq	%r12,%rbx
	vaesdec	%xmm1,%xmm8,%xmm8
	vmovdqu	16(%r12),%xmm10
	movq	%rbx,64+32(%rsp)
	vaesdec	%xmm1,%xmm9,%xmm9
	vmovups	-8(%rsi),%xmm1
	leaq	16(%r12,%rbx,1),%r12
	vaesdec	%xmm0,%xmm2,%xmm2
	cmpl	32+20(%rsp),%ecx
	movq	64+40(%rsp),%rbx
	vaesdec	%xmm0,%xmm3,%xmm3
	prefetcht0	31(%r13)
	vaesdec	%xmm0,%xmm4,%xmm4
	prefetcht0	15(%r11)
	vaesdec	%xmm0,%xmm5,%xmm5
	leaq	(%rbx,%r13,1),%rbx
	cmovgeq	%rsp,%r13
	vaesdec	%xmm0,%xmm6,%xmm6
	cmovgq	%rsp,%rbx
	vaesdec	%xmm0,%xmm7,%xmm7
	subq	%r13,%rbx
	vaesdec	%xmm0,%xmm8,%xmm8
	vmovdqu	16(%r13),%xmm11
	movq	%rbx,64+40(%rsp)
	vaesdec	%xmm0,%xmm9,%xmm9
	vmovups	8(%rsi),%xmm0
	leaq	16(%r13,%rbx,1),%r13
	vaesdec	%xmm1,%xmm2,%xmm2
	cmpl	32+24(%rsp),%ecx
	movq	64+48(%rsp),%rbx
	vaesdec	%xmm1,%xmm3,%xmm3
	prefetcht0	31(%r14)
	vaesdec	%xmm1,%xmm4,%xmm4
	prefetcht0	15(%r12)
	vaesdec	%xmm1,%xmm5,%xmm5
	leaq	(%r14,%rbx,1),%rbx
	cmovgeq	%rsp,%r14
	vaesdec	%xmm1,%xmm6,%xmm6
	cmovgq	%rsp,%rbx
	vaesdec	%xmm1,%xmm7,%xmm7
	subq	%r14,%rbx
	vaesdec	%xmm1,%xmm8,%xmm8
	vmovdqu	16(%r14),%xmm12
	movq	%rbx,64+48(%rsp)
	vaesdec	%xmm1,%xmm9,%xmm9
	vmovups	24(%rsi),%xmm1
	leaq	16(%r14,%rbx,1),%r14
	vaesdec	%xmm0,%xmm2,%xmm2
	cmpl	32+28(%rsp),%ecx
	movq	64+56(%rsp),%rbx
	vaesdec	%xmm0,%xmm3,%xmm3
	prefetcht0	31(%r15)
	vaesdec	%xmm0,%xmm4,%xmm4
	prefetcht0	15(%r13)
	vaesdec	%xmm0,%xmm5,%xmm5
	leaq	(%r15,%rbx,1),%rbx
	cmovgeq	%rsp,%r15
	vaesdec	%xmm0,%xmm6,%xmm6
	cmovgq	%rsp,%rbx
	vaesdec	%xmm0,%xmm7,%xmm7
	subq	%r15,%rbx
	vaesdec	%xmm0,%xmm8,%xmm8
	vmovdqu	16(%r15),%xmm13
	movq	%rbx,64+56(%rsp)
	vaesdec	%xmm0,%xmm9,%xmm9
	vmovups	40(%rsi),%xmm0
	leaq	16(%r15,%rbx,1),%r15
	vmovdqu	32(%rsp),%xmm14
	prefetcht0	15(%r14)
	prefetcht0	15(%r15)
	cmpl	$11,%eax
	jb	.Ldec8x_tail

	vaesdec	%xmm1,%xmm2,%xmm2
	vaesdec	%xmm1,%xmm3,%xmm3
	vaesdec	%xmm1,%xmm4,%xmm4
	vaesdec	%xmm1,%xmm5,%xmm5
	vaesdec	%xmm1,%xmm6,%xmm6
	vaesdec	%xmm1,%xmm7,%xmm7
	vaesdec	%xmm1,%xmm8,%xmm8
	vaesdec	%xmm1,%xmm9,%xmm9
	vmovups	176-120(%rsi),%xmm1

	vaesdec	%xmm0,%xmm2,%xmm2
	vaesdec	%xmm0,%xmm3,%xmm3
	vaesdec	%xmm0,%xmm4,%xmm4
	vaesdec	%xmm0,%xmm5,%xmm5
	vaesdec	%xmm0,%xmm6,%xmm6
	vaesdec	%xmm0,%xmm7,%xmm7
	vaesdec	%xmm0,%xmm8,%xmm8
	vaesdec	%xmm0,%xmm9,%xmm9
	vmovups	192-120(%rsi),%xmm0
	je	.Ldec8x_tail

	vaesdec	%xmm1,%xmm2,%xmm2
	vaesdec	%xmm1,%xmm3,%xmm3
	vaesdec	%xmm1,%xmm4,%xmm4
	vaesdec	%xmm1,%xmm5,%xmm5
	vaesdec	%xmm1,%xmm6,%xmm6
	vaesdec	%xmm1,%xmm7,%xmm7
	vaesdec	%xmm1,%xmm8,%xmm8
	vaesdec	%xmm1,%xmm9,%xmm9
	vmovups	208-120(%rsi),%xmm1

	vaesdec	%xmm0,%xmm2,%xmm2
	vaesdec	%xmm0,%xmm3,%xmm3
	vaesdec	%xmm0,%xmm4,%xmm4
	vaesdec	%xmm0,%xmm5,%xmm5
	vaesdec	%xmm0,%xmm6,%xmm6
	vaesdec	%xmm0,%xmm7,%xmm7
	vaesdec	%xmm0,%xmm8,%xmm8
	vaesdec	%xmm0,%xmm9,%xmm9
	vmovups	224-120(%rsi),%xmm0

.Ldec8x_tail:
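	/*
	 * Final round: vaesdeclast, XOR each result with the previous
	 * ciphertext block saved at (%rbp) to undo CBC chaining, store the
	 * plaintext, and rotate the just-consumed ciphertext blocks in as
	 * the next chaining values; xorq $128,%rbp flips between the two
	 * halves of the offload area.
	 */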
	vaesdec	%xmm1,%xmm2,%xmm2
	vpxor	%xmm15,%xmm15,%xmm15
	vaesdec	%xmm1,%xmm3,%xmm3
	vaesdec	%xmm1,%xmm4,%xmm4
	vpcmpgtd	%xmm15,%xmm14,%xmm15
	vaesdec	%xmm1,%xmm5,%xmm5
	vaesdec	%xmm1,%xmm6,%xmm6
	vpaddd	%xmm14,%xmm15,%xmm15
	vmovdqu	48(%rsp),%xmm14
	vaesdec	%xmm1,%xmm7,%xmm7
	movq	64(%rsp),%rbx
	vaesdec	%xmm1,%xmm8,%xmm8
	vaesdec	%xmm1,%xmm9,%xmm9
	vmovups	16-120(%rsi),%xmm1

	vaesdeclast	%xmm0,%xmm2,%xmm2
	vmovdqa	%xmm15,32(%rsp)
	vpxor	%xmm15,%xmm15,%xmm15
	vaesdeclast	%xmm0,%xmm3,%xmm3
	vpxor	0(%rbp),%xmm2,%xmm2
	vaesdeclast	%xmm0,%xmm4,%xmm4
	vpxor	16(%rbp),%xmm3,%xmm3
	vpcmpgtd	%xmm15,%xmm14,%xmm15
	vaesdeclast	%xmm0,%xmm5,%xmm5
	vpxor	32(%rbp),%xmm4,%xmm4
	vaesdeclast	%xmm0,%xmm6,%xmm6
	vpxor	48(%rbp),%xmm5,%xmm5
	vpaddd	%xmm15,%xmm14,%xmm14
	vmovdqu	-120(%rsi),%xmm15
	vaesdeclast	%xmm0,%xmm7,%xmm7
	vpxor	64(%rbp),%xmm6,%xmm6
	vaesdeclast	%xmm0,%xmm8,%xmm8
	vpxor	80(%rbp),%xmm7,%xmm7
	vmovdqa	%xmm14,48(%rsp)
	vaesdeclast	%xmm0,%xmm9,%xmm9
	vpxor	96(%rbp),%xmm8,%xmm8
	vmovups	32-120(%rsi),%xmm0

	vmovups	%xmm2,-16(%r8)
	subq	%rbx,%r8
	vmovdqu	128+0(%rsp),%xmm2
	vpxor	112(%rbp),%xmm9,%xmm9
	vmovups	%xmm3,-16(%r9)
	subq	72(%rsp),%r9
	vmovdqu	%xmm2,0(%rbp)
	vpxor	%xmm15,%xmm2,%xmm2
	vmovdqu	128+16(%rsp),%xmm3
	vmovups	%xmm4,-16(%r10)
	subq	80(%rsp),%r10
	vmovdqu	%xmm3,16(%rbp)
	vpxor	%xmm15,%xmm3,%xmm3
	vmovdqu	128+32(%rsp),%xmm4
	vmovups	%xmm5,-16(%r11)
	subq	88(%rsp),%r11
	vmovdqu	%xmm4,32(%rbp)
	vpxor	%xmm15,%xmm4,%xmm4
	vmovdqu	128+48(%rsp),%xmm5
	vmovups	%xmm6,-16(%r12)
	subq	96(%rsp),%r12
	vmovdqu	%xmm5,48(%rbp)
	vpxor	%xmm15,%xmm5,%xmm5
	vmovdqu	%xmm10,64(%rbp)
	vpxor	%xmm10,%xmm15,%xmm6
	vmovups	%xmm7,-16(%r13)
	subq	104(%rsp),%r13
	vmovdqu	%xmm11,80(%rbp)
	vpxor	%xmm11,%xmm15,%xmm7
	vmovups	%xmm8,-16(%r14)
	subq	112(%rsp),%r14
	vmovdqu	%xmm12,96(%rbp)
	vpxor	%xmm12,%xmm15,%xmm8
	vmovups	%xmm9,-16(%r15)
	subq	120(%rsp),%r15
	vmovdqu	%xmm13,112(%rbp)
	vpxor	%xmm13,%xmm15,%xmm9

	xorq	$128,%rbp
	decl	%edx
	jnz	.Loop_dec8x

	movq	16(%rsp),%rax
.cfi_def_cfa	%rax,8




.Ldec8x_done:
	vzeroupper
	movq	-48(%rax),%r15
.cfi_restore	%r15
	movq	-40(%rax),%r14
.cfi_restore	%r14
	movq	-32(%rax),%r13
.cfi_restore	%r13
	movq	-24(%rax),%r12
.cfi_restore	%r12
	movq	-16(%rax),%rbp
.cfi_restore	%rbp
	movq	-8(%rax),%rbx
.cfi_restore	%rbx
	leaq	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Ldec8x_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx