1	# $FreeBSD: head/secure/lib/libcrypto/amd64/aesni-x86_64.S 298998 2016-05-03 18:50:10Z jkim $
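# This file appears to be the assembled output of OpenSSL's aesni-x86_64.pl
# perlasm script as built for FreeBSD's libcrypto; changes are normally made
# in the Perl source rather than in this generated file.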
2.text
3
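# Single-block AES encryption. As used below (SysV AMD64 ABI), %rdi appears to
# hold the input block, %rsi the output block, and %rdx the expanded key
# schedule, with the round count kept at offset 240 of the key structure.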
4.globl	aesni_encrypt
5.type	aesni_encrypt,@function
6.align	16
7aesni_encrypt:
8	movups	(%rdi),%xmm2
9	movl	240(%rdx),%eax
10	movups	(%rdx),%xmm0
11	movups	16(%rdx),%xmm1
12	leaq	32(%rdx),%rdx
13	xorps	%xmm0,%xmm2
14.Loop_enc1_1:
15.byte	102,15,56,220,209
16	decl	%eax
17	movups	(%rdx),%xmm1
18	leaq	16(%rdx),%rdx
19	jnz	.Loop_enc1_1
20.byte	102,15,56,221,209
21	pxor	%xmm0,%xmm0
22	pxor	%xmm1,%xmm1
23	movups	%xmm2,(%rsi)
24	pxor	%xmm2,%xmm2
25	.byte	0xf3,0xc3
26.size	aesni_encrypt,.-aesni_encrypt
27
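# Single-block AES decryption; same register usage as aesni_encrypt above
# (%rdi = in, %rsi = out, %rdx = key schedule, round count at 240(%rdx)).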
28.globl	aesni_decrypt
29.type	aesni_decrypt,@function
30.align	16
31aesni_decrypt:
32	movups	(%rdi),%xmm2
33	movl	240(%rdx),%eax
34	movups	(%rdx),%xmm0
35	movups	16(%rdx),%xmm1
36	leaq	32(%rdx),%rdx
37	xorps	%xmm0,%xmm2
38.Loop_dec1_2:
39.byte	102,15,56,222,209
40	decl	%eax
41	movups	(%rdx),%xmm1
42	leaq	16(%rdx),%rdx
43	jnz	.Loop_dec1_2
44.byte	102,15,56,223,209
45	pxor	%xmm0,%xmm0
46	pxor	%xmm1,%xmm1
47	movups	%xmm2,(%rsi)
48	pxor	%xmm2,%xmm2
49	.byte	0xf3,0xc3
50.size	aesni_decrypt,.-aesni_decrypt
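# The _aesni_{en,de}crypt{2,3,4,6,8} helpers below interleave several
# AESENC/AESDEC streams to hide instruction latency. They appear to expect
# %rcx = key schedule, %eax = round count, and the data blocks in %xmm2 and
# up; %xmm0/%xmm1 serve as round-key scratch registers.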
51.type	_aesni_encrypt2,@function
52.align	16
53_aesni_encrypt2:
54	movups	(%rcx),%xmm0
55	shll	$4,%eax
56	movups	16(%rcx),%xmm1
57	xorps	%xmm0,%xmm2
58	xorps	%xmm0,%xmm3
59	movups	32(%rcx),%xmm0
60	leaq	32(%rcx,%rax,1),%rcx
61	negq	%rax
62	addq	$16,%rax
63
64.Lenc_loop2:
65.byte	102,15,56,220,209
66.byte	102,15,56,220,217
67	movups	(%rcx,%rax,1),%xmm1
68	addq	$32,%rax
69.byte	102,15,56,220,208
70.byte	102,15,56,220,216
71	movups	-16(%rcx,%rax,1),%xmm0
72	jnz	.Lenc_loop2
73
74.byte	102,15,56,220,209
75.byte	102,15,56,220,217
76.byte	102,15,56,221,208
77.byte	102,15,56,221,216
78	.byte	0xf3,0xc3
79.size	_aesni_encrypt2,.-_aesni_encrypt2
80.type	_aesni_decrypt2,@function
81.align	16
82_aesni_decrypt2:
83	movups	(%rcx),%xmm0
84	shll	$4,%eax
85	movups	16(%rcx),%xmm1
86	xorps	%xmm0,%xmm2
87	xorps	%xmm0,%xmm3
88	movups	32(%rcx),%xmm0
89	leaq	32(%rcx,%rax,1),%rcx
90	negq	%rax
91	addq	$16,%rax
92
93.Ldec_loop2:
94.byte	102,15,56,222,209
95.byte	102,15,56,222,217
96	movups	(%rcx,%rax,1),%xmm1
97	addq	$32,%rax
98.byte	102,15,56,222,208
99.byte	102,15,56,222,216
100	movups	-16(%rcx,%rax,1),%xmm0
101	jnz	.Ldec_loop2
102
103.byte	102,15,56,222,209
104.byte	102,15,56,222,217
105.byte	102,15,56,223,208
106.byte	102,15,56,223,216
107	.byte	0xf3,0xc3
108.size	_aesni_decrypt2,.-_aesni_decrypt2
109.type	_aesni_encrypt3,@function
110.align	16
111_aesni_encrypt3:
112	movups	(%rcx),%xmm0
113	shll	$4,%eax
114	movups	16(%rcx),%xmm1
115	xorps	%xmm0,%xmm2
116	xorps	%xmm0,%xmm3
117	xorps	%xmm0,%xmm4
118	movups	32(%rcx),%xmm0
119	leaq	32(%rcx,%rax,1),%rcx
120	negq	%rax
121	addq	$16,%rax
122
123.Lenc_loop3:
124.byte	102,15,56,220,209
125.byte	102,15,56,220,217
126.byte	102,15,56,220,225
127	movups	(%rcx,%rax,1),%xmm1
128	addq	$32,%rax
129.byte	102,15,56,220,208
130.byte	102,15,56,220,216
131.byte	102,15,56,220,224
132	movups	-16(%rcx,%rax,1),%xmm0
133	jnz	.Lenc_loop3
134
135.byte	102,15,56,220,209
136.byte	102,15,56,220,217
137.byte	102,15,56,220,225
138.byte	102,15,56,221,208
139.byte	102,15,56,221,216
140.byte	102,15,56,221,224
141	.byte	0xf3,0xc3
142.size	_aesni_encrypt3,.-_aesni_encrypt3
143.type	_aesni_decrypt3,@function
144.align	16
145_aesni_decrypt3:
146	movups	(%rcx),%xmm0
147	shll	$4,%eax
148	movups	16(%rcx),%xmm1
149	xorps	%xmm0,%xmm2
150	xorps	%xmm0,%xmm3
151	xorps	%xmm0,%xmm4
152	movups	32(%rcx),%xmm0
153	leaq	32(%rcx,%rax,1),%rcx
154	negq	%rax
155	addq	$16,%rax
156
157.Ldec_loop3:
158.byte	102,15,56,222,209
159.byte	102,15,56,222,217
160.byte	102,15,56,222,225
161	movups	(%rcx,%rax,1),%xmm1
162	addq	$32,%rax
163.byte	102,15,56,222,208
164.byte	102,15,56,222,216
165.byte	102,15,56,222,224
166	movups	-16(%rcx,%rax,1),%xmm0
167	jnz	.Ldec_loop3
168
169.byte	102,15,56,222,209
170.byte	102,15,56,222,217
171.byte	102,15,56,222,225
172.byte	102,15,56,223,208
173.byte	102,15,56,223,216
174.byte	102,15,56,223,224
175	.byte	0xf3,0xc3
176.size	_aesni_decrypt3,.-_aesni_decrypt3
177.type	_aesni_encrypt4,@function
178.align	16
179_aesni_encrypt4:
180	movups	(%rcx),%xmm0
181	shll	$4,%eax
182	movups	16(%rcx),%xmm1
183	xorps	%xmm0,%xmm2
184	xorps	%xmm0,%xmm3
185	xorps	%xmm0,%xmm4
186	xorps	%xmm0,%xmm5
187	movups	32(%rcx),%xmm0
188	leaq	32(%rcx,%rax,1),%rcx
189	negq	%rax
190.byte	0x0f,0x1f,0x00
191	addq	$16,%rax
192
193.Lenc_loop4:
194.byte	102,15,56,220,209
195.byte	102,15,56,220,217
196.byte	102,15,56,220,225
197.byte	102,15,56,220,233
198	movups	(%rcx,%rax,1),%xmm1
199	addq	$32,%rax
200.byte	102,15,56,220,208
201.byte	102,15,56,220,216
202.byte	102,15,56,220,224
203.byte	102,15,56,220,232
204	movups	-16(%rcx,%rax,1),%xmm0
205	jnz	.Lenc_loop4
206
207.byte	102,15,56,220,209
208.byte	102,15,56,220,217
209.byte	102,15,56,220,225
210.byte	102,15,56,220,233
211.byte	102,15,56,221,208
212.byte	102,15,56,221,216
213.byte	102,15,56,221,224
214.byte	102,15,56,221,232
215	.byte	0xf3,0xc3
216.size	_aesni_encrypt4,.-_aesni_encrypt4
217.type	_aesni_decrypt4,@function
218.align	16
219_aesni_decrypt4:
220	movups	(%rcx),%xmm0
221	shll	$4,%eax
222	movups	16(%rcx),%xmm1
223	xorps	%xmm0,%xmm2
224	xorps	%xmm0,%xmm3
225	xorps	%xmm0,%xmm4
226	xorps	%xmm0,%xmm5
227	movups	32(%rcx),%xmm0
228	leaq	32(%rcx,%rax,1),%rcx
229	negq	%rax
230.byte	0x0f,0x1f,0x00
231	addq	$16,%rax
232
233.Ldec_loop4:
234.byte	102,15,56,222,209
235.byte	102,15,56,222,217
236.byte	102,15,56,222,225
237.byte	102,15,56,222,233
238	movups	(%rcx,%rax,1),%xmm1
239	addq	$32,%rax
240.byte	102,15,56,222,208
241.byte	102,15,56,222,216
242.byte	102,15,56,222,224
243.byte	102,15,56,222,232
244	movups	-16(%rcx,%rax,1),%xmm0
245	jnz	.Ldec_loop4
246
247.byte	102,15,56,222,209
248.byte	102,15,56,222,217
249.byte	102,15,56,222,225
250.byte	102,15,56,222,233
251.byte	102,15,56,223,208
252.byte	102,15,56,223,216
253.byte	102,15,56,223,224
254.byte	102,15,56,223,232
255	.byte	0xf3,0xc3
256.size	_aesni_decrypt4,.-_aesni_decrypt4
257.type	_aesni_encrypt6,@function
258.align	16
259_aesni_encrypt6:
260	movups	(%rcx),%xmm0
261	shll	$4,%eax
262	movups	16(%rcx),%xmm1
263	xorps	%xmm0,%xmm2
264	pxor	%xmm0,%xmm3
265	pxor	%xmm0,%xmm4
266.byte	102,15,56,220,209
267	leaq	32(%rcx,%rax,1),%rcx
268	negq	%rax
269.byte	102,15,56,220,217
270	pxor	%xmm0,%xmm5
271	pxor	%xmm0,%xmm6
272.byte	102,15,56,220,225
273	pxor	%xmm0,%xmm7
274	movups	(%rcx,%rax,1),%xmm0
275	addq	$16,%rax
276	jmp	.Lenc_loop6_enter
277.align	16
278.Lenc_loop6:
279.byte	102,15,56,220,209
280.byte	102,15,56,220,217
281.byte	102,15,56,220,225
282.Lenc_loop6_enter:
283.byte	102,15,56,220,233
284.byte	102,15,56,220,241
285.byte	102,15,56,220,249
286	movups	(%rcx,%rax,1),%xmm1
287	addq	$32,%rax
288.byte	102,15,56,220,208
289.byte	102,15,56,220,216
290.byte	102,15,56,220,224
291.byte	102,15,56,220,232
292.byte	102,15,56,220,240
293.byte	102,15,56,220,248
294	movups	-16(%rcx,%rax,1),%xmm0
295	jnz	.Lenc_loop6
296
297.byte	102,15,56,220,209
298.byte	102,15,56,220,217
299.byte	102,15,56,220,225
300.byte	102,15,56,220,233
301.byte	102,15,56,220,241
302.byte	102,15,56,220,249
303.byte	102,15,56,221,208
304.byte	102,15,56,221,216
305.byte	102,15,56,221,224
306.byte	102,15,56,221,232
307.byte	102,15,56,221,240
308.byte	102,15,56,221,248
309	.byte	0xf3,0xc3
310.size	_aesni_encrypt6,.-_aesni_encrypt6
311.type	_aesni_decrypt6,@function
312.align	16
313_aesni_decrypt6:
314	movups	(%rcx),%xmm0
315	shll	$4,%eax
316	movups	16(%rcx),%xmm1
317	xorps	%xmm0,%xmm2
318	pxor	%xmm0,%xmm3
319	pxor	%xmm0,%xmm4
320.byte	102,15,56,222,209
321	leaq	32(%rcx,%rax,1),%rcx
322	negq	%rax
323.byte	102,15,56,222,217
324	pxor	%xmm0,%xmm5
325	pxor	%xmm0,%xmm6
326.byte	102,15,56,222,225
327	pxor	%xmm0,%xmm7
328	movups	(%rcx,%rax,1),%xmm0
329	addq	$16,%rax
330	jmp	.Ldec_loop6_enter
331.align	16
332.Ldec_loop6:
333.byte	102,15,56,222,209
334.byte	102,15,56,222,217
335.byte	102,15,56,222,225
336.Ldec_loop6_enter:
337.byte	102,15,56,222,233
338.byte	102,15,56,222,241
339.byte	102,15,56,222,249
340	movups	(%rcx,%rax,1),%xmm1
341	addq	$32,%rax
342.byte	102,15,56,222,208
343.byte	102,15,56,222,216
344.byte	102,15,56,222,224
345.byte	102,15,56,222,232
346.byte	102,15,56,222,240
347.byte	102,15,56,222,248
348	movups	-16(%rcx,%rax,1),%xmm0
349	jnz	.Ldec_loop6
350
351.byte	102,15,56,222,209
352.byte	102,15,56,222,217
353.byte	102,15,56,222,225
354.byte	102,15,56,222,233
355.byte	102,15,56,222,241
356.byte	102,15,56,222,249
357.byte	102,15,56,223,208
358.byte	102,15,56,223,216
359.byte	102,15,56,223,224
360.byte	102,15,56,223,232
361.byte	102,15,56,223,240
362.byte	102,15,56,223,248
363	.byte	0xf3,0xc3
364.size	_aesni_decrypt6,.-_aesni_decrypt6
365.type	_aesni_encrypt8,@function
366.align	16
367_aesni_encrypt8:
368	movups	(%rcx),%xmm0
369	shll	$4,%eax
370	movups	16(%rcx),%xmm1
371	xorps	%xmm0,%xmm2
372	xorps	%xmm0,%xmm3
373	pxor	%xmm0,%xmm4
374	pxor	%xmm0,%xmm5
375	pxor	%xmm0,%xmm6
376	leaq	32(%rcx,%rax,1),%rcx
377	negq	%rax
378.byte	102,15,56,220,209
379	pxor	%xmm0,%xmm7
380	pxor	%xmm0,%xmm8
381.byte	102,15,56,220,217
382	pxor	%xmm0,%xmm9
383	movups	(%rcx,%rax,1),%xmm0
384	addq	$16,%rax
385	jmp	.Lenc_loop8_inner
386.align	16
387.Lenc_loop8:
388.byte	102,15,56,220,209
389.byte	102,15,56,220,217
390.Lenc_loop8_inner:
391.byte	102,15,56,220,225
392.byte	102,15,56,220,233
393.byte	102,15,56,220,241
394.byte	102,15,56,220,249
395.byte	102,68,15,56,220,193
396.byte	102,68,15,56,220,201
397.Lenc_loop8_enter:
398	movups	(%rcx,%rax,1),%xmm1
399	addq	$32,%rax
400.byte	102,15,56,220,208
401.byte	102,15,56,220,216
402.byte	102,15,56,220,224
403.byte	102,15,56,220,232
404.byte	102,15,56,220,240
405.byte	102,15,56,220,248
406.byte	102,68,15,56,220,192
407.byte	102,68,15,56,220,200
408	movups	-16(%rcx,%rax,1),%xmm0
409	jnz	.Lenc_loop8
410
411.byte	102,15,56,220,209
412.byte	102,15,56,220,217
413.byte	102,15,56,220,225
414.byte	102,15,56,220,233
415.byte	102,15,56,220,241
416.byte	102,15,56,220,249
417.byte	102,68,15,56,220,193
418.byte	102,68,15,56,220,201
419.byte	102,15,56,221,208
420.byte	102,15,56,221,216
421.byte	102,15,56,221,224
422.byte	102,15,56,221,232
423.byte	102,15,56,221,240
424.byte	102,15,56,221,248
425.byte	102,68,15,56,221,192
426.byte	102,68,15,56,221,200
427	.byte	0xf3,0xc3
428.size	_aesni_encrypt8,.-_aesni_encrypt8
429.type	_aesni_decrypt8,@function
430.align	16
431_aesni_decrypt8:
432	movups	(%rcx),%xmm0
433	shll	$4,%eax
434	movups	16(%rcx),%xmm1
435	xorps	%xmm0,%xmm2
436	xorps	%xmm0,%xmm3
437	pxor	%xmm0,%xmm4
438	pxor	%xmm0,%xmm5
439	pxor	%xmm0,%xmm6
440	leaq	32(%rcx,%rax,1),%rcx
441	negq	%rax
442.byte	102,15,56,222,209
443	pxor	%xmm0,%xmm7
444	pxor	%xmm0,%xmm8
445.byte	102,15,56,222,217
446	pxor	%xmm0,%xmm9
447	movups	(%rcx,%rax,1),%xmm0
448	addq	$16,%rax
449	jmp	.Ldec_loop8_inner
450.align	16
451.Ldec_loop8:
452.byte	102,15,56,222,209
453.byte	102,15,56,222,217
454.Ldec_loop8_inner:
455.byte	102,15,56,222,225
456.byte	102,15,56,222,233
457.byte	102,15,56,222,241
458.byte	102,15,56,222,249
459.byte	102,68,15,56,222,193
460.byte	102,68,15,56,222,201
461.Ldec_loop8_enter:
462	movups	(%rcx,%rax,1),%xmm1
463	addq	$32,%rax
464.byte	102,15,56,222,208
465.byte	102,15,56,222,216
466.byte	102,15,56,222,224
467.byte	102,15,56,222,232
468.byte	102,15,56,222,240
469.byte	102,15,56,222,248
470.byte	102,68,15,56,222,192
471.byte	102,68,15,56,222,200
472	movups	-16(%rcx,%rax,1),%xmm0
473	jnz	.Ldec_loop8
474
475.byte	102,15,56,222,209
476.byte	102,15,56,222,217
477.byte	102,15,56,222,225
478.byte	102,15,56,222,233
479.byte	102,15,56,222,241
480.byte	102,15,56,222,249
481.byte	102,68,15,56,222,193
482.byte	102,68,15,56,222,201
483.byte	102,15,56,223,208
484.byte	102,15,56,223,216
485.byte	102,15,56,223,224
486.byte	102,15,56,223,232
487.byte	102,15,56,223,240
488.byte	102,15,56,223,248
489.byte	102,68,15,56,223,192
490.byte	102,68,15,56,223,200
491	.byte	0xf3,0xc3
492.size	_aesni_decrypt8,.-_aesni_decrypt8
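# ECB bulk routine: judging by the register usage below, %rdi = in, %rsi = out,
# %rdx = byte length (rounded down to whole 16-byte blocks), %rcx = key
# schedule, and %r8d selects encryption (non-zero) or decryption (zero).
# Eight blocks are processed per main-loop iteration, with shorter tails
# handled by the dedicated 1..7-block paths.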
493.globl	aesni_ecb_encrypt
494.type	aesni_ecb_encrypt,@function
495.align	16
496aesni_ecb_encrypt:
497	andq	$-16,%rdx
498	jz	.Lecb_ret
499
500	movl	240(%rcx),%eax
501	movups	(%rcx),%xmm0
502	movq	%rcx,%r11
503	movl	%eax,%r10d
504	testl	%r8d,%r8d
505	jz	.Lecb_decrypt
506
507	cmpq	$0x80,%rdx
508	jb	.Lecb_enc_tail
509
510	movdqu	(%rdi),%xmm2
511	movdqu	16(%rdi),%xmm3
512	movdqu	32(%rdi),%xmm4
513	movdqu	48(%rdi),%xmm5
514	movdqu	64(%rdi),%xmm6
515	movdqu	80(%rdi),%xmm7
516	movdqu	96(%rdi),%xmm8
517	movdqu	112(%rdi),%xmm9
518	leaq	128(%rdi),%rdi
519	subq	$0x80,%rdx
520	jmp	.Lecb_enc_loop8_enter
521.align	16
522.Lecb_enc_loop8:
523	movups	%xmm2,(%rsi)
524	movq	%r11,%rcx
525	movdqu	(%rdi),%xmm2
526	movl	%r10d,%eax
527	movups	%xmm3,16(%rsi)
528	movdqu	16(%rdi),%xmm3
529	movups	%xmm4,32(%rsi)
530	movdqu	32(%rdi),%xmm4
531	movups	%xmm5,48(%rsi)
532	movdqu	48(%rdi),%xmm5
533	movups	%xmm6,64(%rsi)
534	movdqu	64(%rdi),%xmm6
535	movups	%xmm7,80(%rsi)
536	movdqu	80(%rdi),%xmm7
537	movups	%xmm8,96(%rsi)
538	movdqu	96(%rdi),%xmm8
539	movups	%xmm9,112(%rsi)
540	leaq	128(%rsi),%rsi
541	movdqu	112(%rdi),%xmm9
542	leaq	128(%rdi),%rdi
543.Lecb_enc_loop8_enter:
544
545	call	_aesni_encrypt8
546
547	subq	$0x80,%rdx
548	jnc	.Lecb_enc_loop8
549
550	movups	%xmm2,(%rsi)
551	movq	%r11,%rcx
552	movups	%xmm3,16(%rsi)
553	movl	%r10d,%eax
554	movups	%xmm4,32(%rsi)
555	movups	%xmm5,48(%rsi)
556	movups	%xmm6,64(%rsi)
557	movups	%xmm7,80(%rsi)
558	movups	%xmm8,96(%rsi)
559	movups	%xmm9,112(%rsi)
560	leaq	128(%rsi),%rsi
561	addq	$0x80,%rdx
562	jz	.Lecb_ret
563
564.Lecb_enc_tail:
565	movups	(%rdi),%xmm2
566	cmpq	$0x20,%rdx
567	jb	.Lecb_enc_one
568	movups	16(%rdi),%xmm3
569	je	.Lecb_enc_two
570	movups	32(%rdi),%xmm4
571	cmpq	$0x40,%rdx
572	jb	.Lecb_enc_three
573	movups	48(%rdi),%xmm5
574	je	.Lecb_enc_four
575	movups	64(%rdi),%xmm6
576	cmpq	$0x60,%rdx
577	jb	.Lecb_enc_five
578	movups	80(%rdi),%xmm7
579	je	.Lecb_enc_six
580	movdqu	96(%rdi),%xmm8
581	xorps	%xmm9,%xmm9
582	call	_aesni_encrypt8
583	movups	%xmm2,(%rsi)
584	movups	%xmm3,16(%rsi)
585	movups	%xmm4,32(%rsi)
586	movups	%xmm5,48(%rsi)
587	movups	%xmm6,64(%rsi)
588	movups	%xmm7,80(%rsi)
589	movups	%xmm8,96(%rsi)
590	jmp	.Lecb_ret
591.align	16
592.Lecb_enc_one:
593	movups	(%rcx),%xmm0
594	movups	16(%rcx),%xmm1
595	leaq	32(%rcx),%rcx
596	xorps	%xmm0,%xmm2
597.Loop_enc1_3:
598.byte	102,15,56,220,209
599	decl	%eax
600	movups	(%rcx),%xmm1
601	leaq	16(%rcx),%rcx
602	jnz	.Loop_enc1_3
603.byte	102,15,56,221,209
604	movups	%xmm2,(%rsi)
605	jmp	.Lecb_ret
606.align	16
607.Lecb_enc_two:
608	call	_aesni_encrypt2
609	movups	%xmm2,(%rsi)
610	movups	%xmm3,16(%rsi)
611	jmp	.Lecb_ret
612.align	16
613.Lecb_enc_three:
614	call	_aesni_encrypt3
615	movups	%xmm2,(%rsi)
616	movups	%xmm3,16(%rsi)
617	movups	%xmm4,32(%rsi)
618	jmp	.Lecb_ret
619.align	16
620.Lecb_enc_four:
621	call	_aesni_encrypt4
622	movups	%xmm2,(%rsi)
623	movups	%xmm3,16(%rsi)
624	movups	%xmm4,32(%rsi)
625	movups	%xmm5,48(%rsi)
626	jmp	.Lecb_ret
627.align	16
628.Lecb_enc_five:
629	xorps	%xmm7,%xmm7
630	call	_aesni_encrypt6
631	movups	%xmm2,(%rsi)
632	movups	%xmm3,16(%rsi)
633	movups	%xmm4,32(%rsi)
634	movups	%xmm5,48(%rsi)
635	movups	%xmm6,64(%rsi)
636	jmp	.Lecb_ret
637.align	16
638.Lecb_enc_six:
639	call	_aesni_encrypt6
640	movups	%xmm2,(%rsi)
641	movups	%xmm3,16(%rsi)
642	movups	%xmm4,32(%rsi)
643	movups	%xmm5,48(%rsi)
644	movups	%xmm6,64(%rsi)
645	movups	%xmm7,80(%rsi)
646	jmp	.Lecb_ret
647
648.align	16
649.Lecb_decrypt:
650	cmpq	$0x80,%rdx
651	jb	.Lecb_dec_tail
652
653	movdqu	(%rdi),%xmm2
654	movdqu	16(%rdi),%xmm3
655	movdqu	32(%rdi),%xmm4
656	movdqu	48(%rdi),%xmm5
657	movdqu	64(%rdi),%xmm6
658	movdqu	80(%rdi),%xmm7
659	movdqu	96(%rdi),%xmm8
660	movdqu	112(%rdi),%xmm9
661	leaq	128(%rdi),%rdi
662	subq	$0x80,%rdx
663	jmp	.Lecb_dec_loop8_enter
664.align	16
665.Lecb_dec_loop8:
666	movups	%xmm2,(%rsi)
667	movq	%r11,%rcx
668	movdqu	(%rdi),%xmm2
669	movl	%r10d,%eax
670	movups	%xmm3,16(%rsi)
671	movdqu	16(%rdi),%xmm3
672	movups	%xmm4,32(%rsi)
673	movdqu	32(%rdi),%xmm4
674	movups	%xmm5,48(%rsi)
675	movdqu	48(%rdi),%xmm5
676	movups	%xmm6,64(%rsi)
677	movdqu	64(%rdi),%xmm6
678	movups	%xmm7,80(%rsi)
679	movdqu	80(%rdi),%xmm7
680	movups	%xmm8,96(%rsi)
681	movdqu	96(%rdi),%xmm8
682	movups	%xmm9,112(%rsi)
683	leaq	128(%rsi),%rsi
684	movdqu	112(%rdi),%xmm9
685	leaq	128(%rdi),%rdi
686.Lecb_dec_loop8_enter:
687
688	call	_aesni_decrypt8
689
690	movups	(%r11),%xmm0
691	subq	$0x80,%rdx
692	jnc	.Lecb_dec_loop8
693
694	movups	%xmm2,(%rsi)
695	pxor	%xmm2,%xmm2
696	movq	%r11,%rcx
697	movups	%xmm3,16(%rsi)
698	pxor	%xmm3,%xmm3
699	movl	%r10d,%eax
700	movups	%xmm4,32(%rsi)
701	pxor	%xmm4,%xmm4
702	movups	%xmm5,48(%rsi)
703	pxor	%xmm5,%xmm5
704	movups	%xmm6,64(%rsi)
705	pxor	%xmm6,%xmm6
706	movups	%xmm7,80(%rsi)
707	pxor	%xmm7,%xmm7
708	movups	%xmm8,96(%rsi)
709	pxor	%xmm8,%xmm8
710	movups	%xmm9,112(%rsi)
711	pxor	%xmm9,%xmm9
712	leaq	128(%rsi),%rsi
713	addq	$0x80,%rdx
714	jz	.Lecb_ret
715
716.Lecb_dec_tail:
717	movups	(%rdi),%xmm2
718	cmpq	$0x20,%rdx
719	jb	.Lecb_dec_one
720	movups	16(%rdi),%xmm3
721	je	.Lecb_dec_two
722	movups	32(%rdi),%xmm4
723	cmpq	$0x40,%rdx
724	jb	.Lecb_dec_three
725	movups	48(%rdi),%xmm5
726	je	.Lecb_dec_four
727	movups	64(%rdi),%xmm6
728	cmpq	$0x60,%rdx
729	jb	.Lecb_dec_five
730	movups	80(%rdi),%xmm7
731	je	.Lecb_dec_six
732	movups	96(%rdi),%xmm8
733	movups	(%rcx),%xmm0
734	xorps	%xmm9,%xmm9
735	call	_aesni_decrypt8
736	movups	%xmm2,(%rsi)
737	pxor	%xmm2,%xmm2
738	movups	%xmm3,16(%rsi)
739	pxor	%xmm3,%xmm3
740	movups	%xmm4,32(%rsi)
741	pxor	%xmm4,%xmm4
742	movups	%xmm5,48(%rsi)
743	pxor	%xmm5,%xmm5
744	movups	%xmm6,64(%rsi)
745	pxor	%xmm6,%xmm6
746	movups	%xmm7,80(%rsi)
747	pxor	%xmm7,%xmm7
748	movups	%xmm8,96(%rsi)
749	pxor	%xmm8,%xmm8
750	pxor	%xmm9,%xmm9
751	jmp	.Lecb_ret
752.align	16
753.Lecb_dec_one:
754	movups	(%rcx),%xmm0
755	movups	16(%rcx),%xmm1
756	leaq	32(%rcx),%rcx
757	xorps	%xmm0,%xmm2
758.Loop_dec1_4:
759.byte	102,15,56,222,209
760	decl	%eax
761	movups	(%rcx),%xmm1
762	leaq	16(%rcx),%rcx
763	jnz	.Loop_dec1_4
764.byte	102,15,56,223,209
765	movups	%xmm2,(%rsi)
766	pxor	%xmm2,%xmm2
767	jmp	.Lecb_ret
768.align	16
769.Lecb_dec_two:
770	call	_aesni_decrypt2
771	movups	%xmm2,(%rsi)
772	pxor	%xmm2,%xmm2
773	movups	%xmm3,16(%rsi)
774	pxor	%xmm3,%xmm3
775	jmp	.Lecb_ret
776.align	16
777.Lecb_dec_three:
778	call	_aesni_decrypt3
779	movups	%xmm2,(%rsi)
780	pxor	%xmm2,%xmm2
781	movups	%xmm3,16(%rsi)
782	pxor	%xmm3,%xmm3
783	movups	%xmm4,32(%rsi)
784	pxor	%xmm4,%xmm4
785	jmp	.Lecb_ret
786.align	16
787.Lecb_dec_four:
788	call	_aesni_decrypt4
789	movups	%xmm2,(%rsi)
790	pxor	%xmm2,%xmm2
791	movups	%xmm3,16(%rsi)
792	pxor	%xmm3,%xmm3
793	movups	%xmm4,32(%rsi)
794	pxor	%xmm4,%xmm4
795	movups	%xmm5,48(%rsi)
796	pxor	%xmm5,%xmm5
797	jmp	.Lecb_ret
798.align	16
799.Lecb_dec_five:
800	xorps	%xmm7,%xmm7
801	call	_aesni_decrypt6
802	movups	%xmm2,(%rsi)
803	pxor	%xmm2,%xmm2
804	movups	%xmm3,16(%rsi)
805	pxor	%xmm3,%xmm3
806	movups	%xmm4,32(%rsi)
807	pxor	%xmm4,%xmm4
808	movups	%xmm5,48(%rsi)
809	pxor	%xmm5,%xmm5
810	movups	%xmm6,64(%rsi)
811	pxor	%xmm6,%xmm6
812	pxor	%xmm7,%xmm7
813	jmp	.Lecb_ret
814.align	16
815.Lecb_dec_six:
816	call	_aesni_decrypt6
817	movups	%xmm2,(%rsi)
818	pxor	%xmm2,%xmm2
819	movups	%xmm3,16(%rsi)
820	pxor	%xmm3,%xmm3
821	movups	%xmm4,32(%rsi)
822	pxor	%xmm4,%xmm4
823	movups	%xmm5,48(%rsi)
824	pxor	%xmm5,%xmm5
825	movups	%xmm6,64(%rsi)
826	pxor	%xmm6,%xmm6
827	movups	%xmm7,80(%rsi)
828	pxor	%xmm7,%xmm7
829
830.Lecb_ret:
831	xorps	%xmm0,%xmm0
832	pxor	%xmm1,%xmm1
833	.byte	0xf3,0xc3
834.size	aesni_ecb_encrypt,.-aesni_ecb_encrypt
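# CCM (64-bit nonce) encryption: the code below reads blocks from %rdi, writes
# to %rsi, and treats %rdx as a block count; %rcx is the key schedule, %r8 the
# counter block (byte-swapped via .Lbswap_mask and stepped via .Lincrement64),
# and %r9 the running CBC-MAC block, which is updated in place.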
835.globl	aesni_ccm64_encrypt_blocks
836.type	aesni_ccm64_encrypt_blocks,@function
837.align	16
838aesni_ccm64_encrypt_blocks:
839	movl	240(%rcx),%eax
840	movdqu	(%r8),%xmm6
841	movdqa	.Lincrement64(%rip),%xmm9
842	movdqa	.Lbswap_mask(%rip),%xmm7
843
844	shll	$4,%eax
845	movl	$16,%r10d
846	leaq	0(%rcx),%r11
847	movdqu	(%r9),%xmm3
848	movdqa	%xmm6,%xmm2
849	leaq	32(%rcx,%rax,1),%rcx
850.byte	102,15,56,0,247
851	subq	%rax,%r10
852	jmp	.Lccm64_enc_outer
853.align	16
854.Lccm64_enc_outer:
855	movups	(%r11),%xmm0
856	movq	%r10,%rax
857	movups	(%rdi),%xmm8
858
859	xorps	%xmm0,%xmm2
860	movups	16(%r11),%xmm1
861	xorps	%xmm8,%xmm0
862	xorps	%xmm0,%xmm3
863	movups	32(%r11),%xmm0
864
865.Lccm64_enc2_loop:
866.byte	102,15,56,220,209
867.byte	102,15,56,220,217
868	movups	(%rcx,%rax,1),%xmm1
869	addq	$32,%rax
870.byte	102,15,56,220,208
871.byte	102,15,56,220,216
872	movups	-16(%rcx,%rax,1),%xmm0
873	jnz	.Lccm64_enc2_loop
874.byte	102,15,56,220,209
875.byte	102,15,56,220,217
876	paddq	%xmm9,%xmm6
877	decq	%rdx
878.byte	102,15,56,221,208
879.byte	102,15,56,221,216
880
881	leaq	16(%rdi),%rdi
882	xorps	%xmm2,%xmm8
883	movdqa	%xmm6,%xmm2
884	movups	%xmm8,(%rsi)
885.byte	102,15,56,0,215
886	leaq	16(%rsi),%rsi
887	jnz	.Lccm64_enc_outer
888
889	pxor	%xmm0,%xmm0
890	pxor	%xmm1,%xmm1
891	pxor	%xmm2,%xmm2
892	movups	%xmm3,(%r9)
893	pxor	%xmm3,%xmm3
894	pxor	%xmm8,%xmm8
895	pxor	%xmm6,%xmm6
896	.byte	0xf3,0xc3
897.size	aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
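# CCM (64-bit nonce) decryption: same argument layout as the encrypt routine
# above; the CBC-MAC kept at (%r9) is recomputed over the recovered plaintext.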
898.globl	aesni_ccm64_decrypt_blocks
899.type	aesni_ccm64_decrypt_blocks,@function
900.align	16
901aesni_ccm64_decrypt_blocks:
902	movl	240(%rcx),%eax
903	movups	(%r8),%xmm6
904	movdqu	(%r9),%xmm3
905	movdqa	.Lincrement64(%rip),%xmm9
906	movdqa	.Lbswap_mask(%rip),%xmm7
907
908	movaps	%xmm6,%xmm2
909	movl	%eax,%r10d
910	movq	%rcx,%r11
911.byte	102,15,56,0,247
912	movups	(%rcx),%xmm0
913	movups	16(%rcx),%xmm1
914	leaq	32(%rcx),%rcx
915	xorps	%xmm0,%xmm2
916.Loop_enc1_5:
917.byte	102,15,56,220,209
918	decl	%eax
919	movups	(%rcx),%xmm1
920	leaq	16(%rcx),%rcx
921	jnz	.Loop_enc1_5
922.byte	102,15,56,221,209
923	shll	$4,%r10d
924	movl	$16,%eax
925	movups	(%rdi),%xmm8
926	paddq	%xmm9,%xmm6
927	leaq	16(%rdi),%rdi
928	subq	%r10,%rax
929	leaq	32(%r11,%r10,1),%rcx
930	movq	%rax,%r10
931	jmp	.Lccm64_dec_outer
932.align	16
933.Lccm64_dec_outer:
934	xorps	%xmm2,%xmm8
935	movdqa	%xmm6,%xmm2
936	movups	%xmm8,(%rsi)
937	leaq	16(%rsi),%rsi
938.byte	102,15,56,0,215
939
940	subq	$1,%rdx
941	jz	.Lccm64_dec_break
942
943	movups	(%r11),%xmm0
944	movq	%r10,%rax
945	movups	16(%r11),%xmm1
946	xorps	%xmm0,%xmm8
947	xorps	%xmm0,%xmm2
948	xorps	%xmm8,%xmm3
949	movups	32(%r11),%xmm0
950	jmp	.Lccm64_dec2_loop
951.align	16
952.Lccm64_dec2_loop:
953.byte	102,15,56,220,209
954.byte	102,15,56,220,217
955	movups	(%rcx,%rax,1),%xmm1
956	addq	$32,%rax
957.byte	102,15,56,220,208
958.byte	102,15,56,220,216
959	movups	-16(%rcx,%rax,1),%xmm0
960	jnz	.Lccm64_dec2_loop
961	movups	(%rdi),%xmm8
962	paddq	%xmm9,%xmm6
963.byte	102,15,56,220,209
964.byte	102,15,56,220,217
965.byte	102,15,56,221,208
966.byte	102,15,56,221,216
967	leaq	16(%rdi),%rdi
968	jmp	.Lccm64_dec_outer
969
970.align	16
971.Lccm64_dec_break:
972
973	movl	240(%r11),%eax
974	movups	(%r11),%xmm0
975	movups	16(%r11),%xmm1
976	xorps	%xmm0,%xmm8
977	leaq	32(%r11),%r11
978	xorps	%xmm8,%xmm3
979.Loop_enc1_6:
980.byte	102,15,56,220,217
981	decl	%eax
982	movups	(%r11),%xmm1
983	leaq	16(%r11),%r11
984	jnz	.Loop_enc1_6
985.byte	102,15,56,221,217
986	pxor	%xmm0,%xmm0
987	pxor	%xmm1,%xmm1
988	pxor	%xmm2,%xmm2
989	movups	%xmm3,(%r9)
990	pxor	%xmm3,%xmm3
991	pxor	%xmm8,%xmm8
992	pxor	%xmm6,%xmm6
993	.byte	0xf3,0xc3
994.size	aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
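# CTR mode with a 32-bit big-endian counter in the last word of the IV. As used
# below: %rdi = in, %rsi = out, %rdx = number of 16-byte blocks, %rcx = key
# schedule, %r8 = initial counter block. The bulk path keeps eight counter
# blocks on the stack and falls back to a 6-block variant when the
# OPENSSL_ia32cap_P capability flags indicate it.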
995.globl	aesni_ctr32_encrypt_blocks
996.type	aesni_ctr32_encrypt_blocks,@function
997.align	16
998aesni_ctr32_encrypt_blocks:
999	cmpq	$1,%rdx
1000	jne	.Lctr32_bulk
1001
1002
1003
1004	movups	(%r8),%xmm2
1005	movups	(%rdi),%xmm3
1006	movl	240(%rcx),%edx
1007	movups	(%rcx),%xmm0
1008	movups	16(%rcx),%xmm1
1009	leaq	32(%rcx),%rcx
1010	xorps	%xmm0,%xmm2
1011.Loop_enc1_7:
1012.byte	102,15,56,220,209
1013	decl	%edx
1014	movups	(%rcx),%xmm1
1015	leaq	16(%rcx),%rcx
1016	jnz	.Loop_enc1_7
1017.byte	102,15,56,221,209
1018	pxor	%xmm0,%xmm0
1019	pxor	%xmm1,%xmm1
1020	xorps	%xmm3,%xmm2
1021	pxor	%xmm3,%xmm3
1022	movups	%xmm2,(%rsi)
1023	xorps	%xmm2,%xmm2
1024	jmp	.Lctr32_epilogue
1025
1026.align	16
1027.Lctr32_bulk:
1028	leaq	(%rsp),%rax
1029	pushq	%rbp
1030	subq	$128,%rsp
1031	andq	$-16,%rsp
1032	leaq	-8(%rax),%rbp
1033
1034
1035
1036
1037	movdqu	(%r8),%xmm2
1038	movdqu	(%rcx),%xmm0
1039	movl	12(%r8),%r8d
1040	pxor	%xmm0,%xmm2
1041	movl	12(%rcx),%r11d
1042	movdqa	%xmm2,0(%rsp)
1043	bswapl	%r8d
1044	movdqa	%xmm2,%xmm3
1045	movdqa	%xmm2,%xmm4
1046	movdqa	%xmm2,%xmm5
1047	movdqa	%xmm2,64(%rsp)
1048	movdqa	%xmm2,80(%rsp)
1049	movdqa	%xmm2,96(%rsp)
1050	movq	%rdx,%r10
1051	movdqa	%xmm2,112(%rsp)
1052
1053	leaq	1(%r8),%rax
1054	leaq	2(%r8),%rdx
1055	bswapl	%eax
1056	bswapl	%edx
1057	xorl	%r11d,%eax
1058	xorl	%r11d,%edx
1059.byte	102,15,58,34,216,3
1060	leaq	3(%r8),%rax
1061	movdqa	%xmm3,16(%rsp)
1062.byte	102,15,58,34,226,3
1063	bswapl	%eax
1064	movq	%r10,%rdx
1065	leaq	4(%r8),%r10
1066	movdqa	%xmm4,32(%rsp)
1067	xorl	%r11d,%eax
1068	bswapl	%r10d
1069.byte	102,15,58,34,232,3
1070	xorl	%r11d,%r10d
1071	movdqa	%xmm5,48(%rsp)
1072	leaq	5(%r8),%r9
1073	movl	%r10d,64+12(%rsp)
1074	bswapl	%r9d
1075	leaq	6(%r8),%r10
1076	movl	240(%rcx),%eax
1077	xorl	%r11d,%r9d
1078	bswapl	%r10d
1079	movl	%r9d,80+12(%rsp)
1080	xorl	%r11d,%r10d
1081	leaq	7(%r8),%r9
1082	movl	%r10d,96+12(%rsp)
1083	bswapl	%r9d
1084	movl	OPENSSL_ia32cap_P+4(%rip),%r10d
1085	xorl	%r11d,%r9d
1086	andl	$71303168,%r10d
1087	movl	%r9d,112+12(%rsp)
1088
1089	movups	16(%rcx),%xmm1
1090
1091	movdqa	64(%rsp),%xmm6
1092	movdqa	80(%rsp),%xmm7
1093
1094	cmpq	$8,%rdx
1095	jb	.Lctr32_tail
1096
1097	subq	$6,%rdx
1098	cmpl	$4194304,%r10d
1099	je	.Lctr32_6x
1100
1101	leaq	128(%rcx),%rcx
1102	subq	$2,%rdx
1103	jmp	.Lctr32_loop8
1104
1105.align	16
1106.Lctr32_6x:
1107	shll	$4,%eax
1108	movl	$48,%r10d
1109	bswapl	%r11d
1110	leaq	32(%rcx,%rax,1),%rcx
1111	subq	%rax,%r10
1112	jmp	.Lctr32_loop6
1113
1114.align	16
1115.Lctr32_loop6:
1116	addl	$6,%r8d
1117	movups	-48(%rcx,%r10,1),%xmm0
1118.byte	102,15,56,220,209
1119	movl	%r8d,%eax
1120	xorl	%r11d,%eax
1121.byte	102,15,56,220,217
1122.byte	0x0f,0x38,0xf1,0x44,0x24,12
1123	leal	1(%r8),%eax
1124.byte	102,15,56,220,225
1125	xorl	%r11d,%eax
1126.byte	0x0f,0x38,0xf1,0x44,0x24,28
1127.byte	102,15,56,220,233
1128	leal	2(%r8),%eax
1129	xorl	%r11d,%eax
1130.byte	102,15,56,220,241
1131.byte	0x0f,0x38,0xf1,0x44,0x24,44
1132	leal	3(%r8),%eax
1133.byte	102,15,56,220,249
1134	movups	-32(%rcx,%r10,1),%xmm1
1135	xorl	%r11d,%eax
1136
1137.byte	102,15,56,220,208
1138.byte	0x0f,0x38,0xf1,0x44,0x24,60
1139	leal	4(%r8),%eax
1140.byte	102,15,56,220,216
1141	xorl	%r11d,%eax
1142.byte	0x0f,0x38,0xf1,0x44,0x24,76
1143.byte	102,15,56,220,224
1144	leal	5(%r8),%eax
1145	xorl	%r11d,%eax
1146.byte	102,15,56,220,232
1147.byte	0x0f,0x38,0xf1,0x44,0x24,92
1148	movq	%r10,%rax
1149.byte	102,15,56,220,240
1150.byte	102,15,56,220,248
1151	movups	-16(%rcx,%r10,1),%xmm0
1152
1153	call	.Lenc_loop6
1154
1155	movdqu	(%rdi),%xmm8
1156	movdqu	16(%rdi),%xmm9
1157	movdqu	32(%rdi),%xmm10
1158	movdqu	48(%rdi),%xmm11
1159	movdqu	64(%rdi),%xmm12
1160	movdqu	80(%rdi),%xmm13
1161	leaq	96(%rdi),%rdi
1162	movups	-64(%rcx,%r10,1),%xmm1
1163	pxor	%xmm2,%xmm8
1164	movaps	0(%rsp),%xmm2
1165	pxor	%xmm3,%xmm9
1166	movaps	16(%rsp),%xmm3
1167	pxor	%xmm4,%xmm10
1168	movaps	32(%rsp),%xmm4
1169	pxor	%xmm5,%xmm11
1170	movaps	48(%rsp),%xmm5
1171	pxor	%xmm6,%xmm12
1172	movaps	64(%rsp),%xmm6
1173	pxor	%xmm7,%xmm13
1174	movaps	80(%rsp),%xmm7
1175	movdqu	%xmm8,(%rsi)
1176	movdqu	%xmm9,16(%rsi)
1177	movdqu	%xmm10,32(%rsi)
1178	movdqu	%xmm11,48(%rsi)
1179	movdqu	%xmm12,64(%rsi)
1180	movdqu	%xmm13,80(%rsi)
1181	leaq	96(%rsi),%rsi
1182
1183	subq	$6,%rdx
1184	jnc	.Lctr32_loop6
1185
1186	addq	$6,%rdx
1187	jz	.Lctr32_done
1188
1189	leal	-48(%r10),%eax
1190	leaq	-80(%rcx,%r10,1),%rcx
1191	negl	%eax
1192	shrl	$4,%eax
1193	jmp	.Lctr32_tail
1194
1195.align	32
1196.Lctr32_loop8:
1197	addl	$8,%r8d
1198	movdqa	96(%rsp),%xmm8
1199.byte	102,15,56,220,209
1200	movl	%r8d,%r9d
1201	movdqa	112(%rsp),%xmm9
1202.byte	102,15,56,220,217
1203	bswapl	%r9d
1204	movups	32-128(%rcx),%xmm0
1205.byte	102,15,56,220,225
1206	xorl	%r11d,%r9d
1207	nop
1208.byte	102,15,56,220,233
1209	movl	%r9d,0+12(%rsp)
1210	leaq	1(%r8),%r9
1211.byte	102,15,56,220,241
1212.byte	102,15,56,220,249
1213.byte	102,68,15,56,220,193
1214.byte	102,68,15,56,220,201
1215	movups	48-128(%rcx),%xmm1
1216	bswapl	%r9d
1217.byte	102,15,56,220,208
1218.byte	102,15,56,220,216
1219	xorl	%r11d,%r9d
1220.byte	0x66,0x90
1221.byte	102,15,56,220,224
1222.byte	102,15,56,220,232
1223	movl	%r9d,16+12(%rsp)
1224	leaq	2(%r8),%r9
1225.byte	102,15,56,220,240
1226.byte	102,15,56,220,248
1227.byte	102,68,15,56,220,192
1228.byte	102,68,15,56,220,200
1229	movups	64-128(%rcx),%xmm0
1230	bswapl	%r9d
1231.byte	102,15,56,220,209
1232.byte	102,15,56,220,217
1233	xorl	%r11d,%r9d
1234.byte	0x66,0x90
1235.byte	102,15,56,220,225
1236.byte	102,15,56,220,233
1237	movl	%r9d,32+12(%rsp)
1238	leaq	3(%r8),%r9
1239.byte	102,15,56,220,241
1240.byte	102,15,56,220,249
1241.byte	102,68,15,56,220,193
1242.byte	102,68,15,56,220,201
1243	movups	80-128(%rcx),%xmm1
1244	bswapl	%r9d
1245.byte	102,15,56,220,208
1246.byte	102,15,56,220,216
1247	xorl	%r11d,%r9d
1248.byte	0x66,0x90
1249.byte	102,15,56,220,224
1250.byte	102,15,56,220,232
1251	movl	%r9d,48+12(%rsp)
1252	leaq	4(%r8),%r9
1253.byte	102,15,56,220,240
1254.byte	102,15,56,220,248
1255.byte	102,68,15,56,220,192
1256.byte	102,68,15,56,220,200
1257	movups	96-128(%rcx),%xmm0
1258	bswapl	%r9d
1259.byte	102,15,56,220,209
1260.byte	102,15,56,220,217
1261	xorl	%r11d,%r9d
1262.byte	0x66,0x90
1263.byte	102,15,56,220,225
1264.byte	102,15,56,220,233
1265	movl	%r9d,64+12(%rsp)
1266	leaq	5(%r8),%r9
1267.byte	102,15,56,220,241
1268.byte	102,15,56,220,249
1269.byte	102,68,15,56,220,193
1270.byte	102,68,15,56,220,201
1271	movups	112-128(%rcx),%xmm1
1272	bswapl	%r9d
1273.byte	102,15,56,220,208
1274.byte	102,15,56,220,216
1275	xorl	%r11d,%r9d
1276.byte	0x66,0x90
1277.byte	102,15,56,220,224
1278.byte	102,15,56,220,232
1279	movl	%r9d,80+12(%rsp)
1280	leaq	6(%r8),%r9
1281.byte	102,15,56,220,240
1282.byte	102,15,56,220,248
1283.byte	102,68,15,56,220,192
1284.byte	102,68,15,56,220,200
1285	movups	128-128(%rcx),%xmm0
1286	bswapl	%r9d
1287.byte	102,15,56,220,209
1288.byte	102,15,56,220,217
1289	xorl	%r11d,%r9d
1290.byte	0x66,0x90
1291.byte	102,15,56,220,225
1292.byte	102,15,56,220,233
1293	movl	%r9d,96+12(%rsp)
1294	leaq	7(%r8),%r9
1295.byte	102,15,56,220,241
1296.byte	102,15,56,220,249
1297.byte	102,68,15,56,220,193
1298.byte	102,68,15,56,220,201
1299	movups	144-128(%rcx),%xmm1
1300	bswapl	%r9d
1301.byte	102,15,56,220,208
1302.byte	102,15,56,220,216
1303.byte	102,15,56,220,224
1304	xorl	%r11d,%r9d
1305	movdqu	0(%rdi),%xmm10
1306.byte	102,15,56,220,232
1307	movl	%r9d,112+12(%rsp)
1308	cmpl	$11,%eax
1309.byte	102,15,56,220,240
1310.byte	102,15,56,220,248
1311.byte	102,68,15,56,220,192
1312.byte	102,68,15,56,220,200
1313	movups	160-128(%rcx),%xmm0
1314
1315	jb	.Lctr32_enc_done
1316
1317.byte	102,15,56,220,209
1318.byte	102,15,56,220,217
1319.byte	102,15,56,220,225
1320.byte	102,15,56,220,233
1321.byte	102,15,56,220,241
1322.byte	102,15,56,220,249
1323.byte	102,68,15,56,220,193
1324.byte	102,68,15,56,220,201
1325	movups	176-128(%rcx),%xmm1
1326
1327.byte	102,15,56,220,208
1328.byte	102,15,56,220,216
1329.byte	102,15,56,220,224
1330.byte	102,15,56,220,232
1331.byte	102,15,56,220,240
1332.byte	102,15,56,220,248
1333.byte	102,68,15,56,220,192
1334.byte	102,68,15,56,220,200
1335	movups	192-128(%rcx),%xmm0
1336	je	.Lctr32_enc_done
1337
1338.byte	102,15,56,220,209
1339.byte	102,15,56,220,217
1340.byte	102,15,56,220,225
1341.byte	102,15,56,220,233
1342.byte	102,15,56,220,241
1343.byte	102,15,56,220,249
1344.byte	102,68,15,56,220,193
1345.byte	102,68,15,56,220,201
1346	movups	208-128(%rcx),%xmm1
1347
1348.byte	102,15,56,220,208
1349.byte	102,15,56,220,216
1350.byte	102,15,56,220,224
1351.byte	102,15,56,220,232
1352.byte	102,15,56,220,240
1353.byte	102,15,56,220,248
1354.byte	102,68,15,56,220,192
1355.byte	102,68,15,56,220,200
1356	movups	224-128(%rcx),%xmm0
1357	jmp	.Lctr32_enc_done
1358
1359.align	16
1360.Lctr32_enc_done:
1361	movdqu	16(%rdi),%xmm11
1362	pxor	%xmm0,%xmm10
1363	movdqu	32(%rdi),%xmm12
1364	pxor	%xmm0,%xmm11
1365	movdqu	48(%rdi),%xmm13
1366	pxor	%xmm0,%xmm12
1367	movdqu	64(%rdi),%xmm14
1368	pxor	%xmm0,%xmm13
1369	movdqu	80(%rdi),%xmm15
1370	pxor	%xmm0,%xmm14
1371	pxor	%xmm0,%xmm15
1372.byte	102,15,56,220,209
1373.byte	102,15,56,220,217
1374.byte	102,15,56,220,225
1375.byte	102,15,56,220,233
1376.byte	102,15,56,220,241
1377.byte	102,15,56,220,249
1378.byte	102,68,15,56,220,193
1379.byte	102,68,15,56,220,201
1380	movdqu	96(%rdi),%xmm1
1381	leaq	128(%rdi),%rdi
1382
1383.byte	102,65,15,56,221,210
1384	pxor	%xmm0,%xmm1
1385	movdqu	112-128(%rdi),%xmm10
1386.byte	102,65,15,56,221,219
1387	pxor	%xmm0,%xmm10
1388	movdqa	0(%rsp),%xmm11
1389.byte	102,65,15,56,221,228
1390.byte	102,65,15,56,221,237
1391	movdqa	16(%rsp),%xmm12
1392	movdqa	32(%rsp),%xmm13
1393.byte	102,65,15,56,221,246
1394.byte	102,65,15,56,221,255
1395	movdqa	48(%rsp),%xmm14
1396	movdqa	64(%rsp),%xmm15
1397.byte	102,68,15,56,221,193
1398	movdqa	80(%rsp),%xmm0
1399	movups	16-128(%rcx),%xmm1
1400.byte	102,69,15,56,221,202
1401
1402	movups	%xmm2,(%rsi)
1403	movdqa	%xmm11,%xmm2
1404	movups	%xmm3,16(%rsi)
1405	movdqa	%xmm12,%xmm3
1406	movups	%xmm4,32(%rsi)
1407	movdqa	%xmm13,%xmm4
1408	movups	%xmm5,48(%rsi)
1409	movdqa	%xmm14,%xmm5
1410	movups	%xmm6,64(%rsi)
1411	movdqa	%xmm15,%xmm6
1412	movups	%xmm7,80(%rsi)
1413	movdqa	%xmm0,%xmm7
1414	movups	%xmm8,96(%rsi)
1415	movups	%xmm9,112(%rsi)
1416	leaq	128(%rsi),%rsi
1417
1418	subq	$8,%rdx
1419	jnc	.Lctr32_loop8
1420
1421	addq	$8,%rdx
1422	jz	.Lctr32_done
1423	leaq	-128(%rcx),%rcx
1424
1425.Lctr32_tail:
1426
1427
1428	leaq	16(%rcx),%rcx
1429	cmpq	$4,%rdx
1430	jb	.Lctr32_loop3
1431	je	.Lctr32_loop4
1432
1433
1434	shll	$4,%eax
1435	movdqa	96(%rsp),%xmm8
1436	pxor	%xmm9,%xmm9
1437
1438	movups	16(%rcx),%xmm0
1439.byte	102,15,56,220,209
1440.byte	102,15,56,220,217
1441	leaq	32-16(%rcx,%rax,1),%rcx
1442	negq	%rax
1443.byte	102,15,56,220,225
1444	addq	$16,%rax
1445	movups	(%rdi),%xmm10
1446.byte	102,15,56,220,233
1447.byte	102,15,56,220,241
1448	movups	16(%rdi),%xmm11
1449	movups	32(%rdi),%xmm12
1450.byte	102,15,56,220,249
1451.byte	102,68,15,56,220,193
1452
1453	call	.Lenc_loop8_enter
1454
1455	movdqu	48(%rdi),%xmm13
1456	pxor	%xmm10,%xmm2
1457	movdqu	64(%rdi),%xmm10
1458	pxor	%xmm11,%xmm3
1459	movdqu	%xmm2,(%rsi)
1460	pxor	%xmm12,%xmm4
1461	movdqu	%xmm3,16(%rsi)
1462	pxor	%xmm13,%xmm5
1463	movdqu	%xmm4,32(%rsi)
1464	pxor	%xmm10,%xmm6
1465	movdqu	%xmm5,48(%rsi)
1466	movdqu	%xmm6,64(%rsi)
1467	cmpq	$6,%rdx
1468	jb	.Lctr32_done
1469
1470	movups	80(%rdi),%xmm11
1471	xorps	%xmm11,%xmm7
1472	movups	%xmm7,80(%rsi)
1473	je	.Lctr32_done
1474
1475	movups	96(%rdi),%xmm12
1476	xorps	%xmm12,%xmm8
1477	movups	%xmm8,96(%rsi)
1478	jmp	.Lctr32_done
1479
1480.align	32
1481.Lctr32_loop4:
1482.byte	102,15,56,220,209
1483	leaq	16(%rcx),%rcx
1484	decl	%eax
1485.byte	102,15,56,220,217
1486.byte	102,15,56,220,225
1487.byte	102,15,56,220,233
1488	movups	(%rcx),%xmm1
1489	jnz	.Lctr32_loop4
1490.byte	102,15,56,221,209
1491.byte	102,15,56,221,217
1492	movups	(%rdi),%xmm10
1493	movups	16(%rdi),%xmm11
1494.byte	102,15,56,221,225
1495.byte	102,15,56,221,233
1496	movups	32(%rdi),%xmm12
1497	movups	48(%rdi),%xmm13
1498
1499	xorps	%xmm10,%xmm2
1500	movups	%xmm2,(%rsi)
1501	xorps	%xmm11,%xmm3
1502	movups	%xmm3,16(%rsi)
1503	pxor	%xmm12,%xmm4
1504	movdqu	%xmm4,32(%rsi)
1505	pxor	%xmm13,%xmm5
1506	movdqu	%xmm5,48(%rsi)
1507	jmp	.Lctr32_done
1508
1509.align	32
1510.Lctr32_loop3:
1511.byte	102,15,56,220,209
1512	leaq	16(%rcx),%rcx
1513	decl	%eax
1514.byte	102,15,56,220,217
1515.byte	102,15,56,220,225
1516	movups	(%rcx),%xmm1
1517	jnz	.Lctr32_loop3
1518.byte	102,15,56,221,209
1519.byte	102,15,56,221,217
1520.byte	102,15,56,221,225
1521
1522	movups	(%rdi),%xmm10
1523	xorps	%xmm10,%xmm2
1524	movups	%xmm2,(%rsi)
1525	cmpq	$2,%rdx
1526	jb	.Lctr32_done
1527
1528	movups	16(%rdi),%xmm11
1529	xorps	%xmm11,%xmm3
1530	movups	%xmm3,16(%rsi)
1531	je	.Lctr32_done
1532
1533	movups	32(%rdi),%xmm12
1534	xorps	%xmm12,%xmm4
1535	movups	%xmm4,32(%rsi)
1536
1537.Lctr32_done:
1538	xorps	%xmm0,%xmm0
1539	xorl	%r11d,%r11d
1540	pxor	%xmm1,%xmm1
1541	pxor	%xmm2,%xmm2
1542	pxor	%xmm3,%xmm3
1543	pxor	%xmm4,%xmm4
1544	pxor	%xmm5,%xmm5
1545	pxor	%xmm6,%xmm6
1546	pxor	%xmm7,%xmm7
1547	movaps	%xmm0,0(%rsp)
1548	pxor	%xmm8,%xmm8
1549	movaps	%xmm0,16(%rsp)
1550	pxor	%xmm9,%xmm9
1551	movaps	%xmm0,32(%rsp)
1552	pxor	%xmm10,%xmm10
1553	movaps	%xmm0,48(%rsp)
1554	pxor	%xmm11,%xmm11
1555	movaps	%xmm0,64(%rsp)
1556	pxor	%xmm12,%xmm12
1557	movaps	%xmm0,80(%rsp)
1558	pxor	%xmm13,%xmm13
1559	movaps	%xmm0,96(%rsp)
1560	pxor	%xmm14,%xmm14
1561	movaps	%xmm0,112(%rsp)
1562	pxor	%xmm15,%xmm15
1563	leaq	(%rbp),%rsp
1564	popq	%rbp
1565.Lctr32_epilogue:
1566	.byte	0xf3,0xc3
1567.size	aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
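# XTS encryption: judging by the register usage below, %rdi = in, %rsi = out,
# %rdx = byte length, %rcx = data key schedule, %r8 = tweak key schedule and
# %r9 = initial tweak block. The tweak is first encrypted with the %r8 key,
# then multiplied by x in GF(2^128) per block using the .Lxts_magic constant;
# a trailing partial block is handled with ciphertext stealing.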
1568.globl	aesni_xts_encrypt
1569.type	aesni_xts_encrypt,@function
1570.align	16
1571aesni_xts_encrypt:
1572	leaq	(%rsp),%rax
1573	pushq	%rbp
1574	subq	$112,%rsp
1575	andq	$-16,%rsp
1576	leaq	-8(%rax),%rbp
1577	movups	(%r9),%xmm2
1578	movl	240(%r8),%eax
1579	movl	240(%rcx),%r10d
1580	movups	(%r8),%xmm0
1581	movups	16(%r8),%xmm1
1582	leaq	32(%r8),%r8
1583	xorps	%xmm0,%xmm2
1584.Loop_enc1_8:
1585.byte	102,15,56,220,209
1586	decl	%eax
1587	movups	(%r8),%xmm1
1588	leaq	16(%r8),%r8
1589	jnz	.Loop_enc1_8
1590.byte	102,15,56,221,209
1591	movups	(%rcx),%xmm0
1592	movq	%rcx,%r11
1593	movl	%r10d,%eax
1594	shll	$4,%r10d
1595	movq	%rdx,%r9
1596	andq	$-16,%rdx
1597
1598	movups	16(%rcx,%r10,1),%xmm1
1599
1600	movdqa	.Lxts_magic(%rip),%xmm8
1601	movdqa	%xmm2,%xmm15
1602	pshufd	$0x5f,%xmm2,%xmm9
1603	pxor	%xmm0,%xmm1
1604	movdqa	%xmm9,%xmm14
1605	paddd	%xmm9,%xmm9
1606	movdqa	%xmm15,%xmm10
1607	psrad	$31,%xmm14
1608	paddq	%xmm15,%xmm15
1609	pand	%xmm8,%xmm14
1610	pxor	%xmm0,%xmm10
1611	pxor	%xmm14,%xmm15
1612	movdqa	%xmm9,%xmm14
1613	paddd	%xmm9,%xmm9
1614	movdqa	%xmm15,%xmm11
1615	psrad	$31,%xmm14
1616	paddq	%xmm15,%xmm15
1617	pand	%xmm8,%xmm14
1618	pxor	%xmm0,%xmm11
1619	pxor	%xmm14,%xmm15
1620	movdqa	%xmm9,%xmm14
1621	paddd	%xmm9,%xmm9
1622	movdqa	%xmm15,%xmm12
1623	psrad	$31,%xmm14
1624	paddq	%xmm15,%xmm15
1625	pand	%xmm8,%xmm14
1626	pxor	%xmm0,%xmm12
1627	pxor	%xmm14,%xmm15
1628	movdqa	%xmm9,%xmm14
1629	paddd	%xmm9,%xmm9
1630	movdqa	%xmm15,%xmm13
1631	psrad	$31,%xmm14
1632	paddq	%xmm15,%xmm15
1633	pand	%xmm8,%xmm14
1634	pxor	%xmm0,%xmm13
1635	pxor	%xmm14,%xmm15
1636	movdqa	%xmm15,%xmm14
1637	psrad	$31,%xmm9
1638	paddq	%xmm15,%xmm15
1639	pand	%xmm8,%xmm9
1640	pxor	%xmm0,%xmm14
1641	pxor	%xmm9,%xmm15
1642	movaps	%xmm1,96(%rsp)
1643
1644	subq	$96,%rdx
1645	jc	.Lxts_enc_short
1646
1647	movl	$16+96,%eax
1648	leaq	32(%r11,%r10,1),%rcx
1649	subq	%r10,%rax
1650	movups	16(%r11),%xmm1
1651	movq	%rax,%r10
1652	leaq	.Lxts_magic(%rip),%r8
1653	jmp	.Lxts_enc_grandloop
1654
1655.align	32
1656.Lxts_enc_grandloop:
1657	movdqu	0(%rdi),%xmm2
1658	movdqa	%xmm0,%xmm8
1659	movdqu	16(%rdi),%xmm3
1660	pxor	%xmm10,%xmm2
1661	movdqu	32(%rdi),%xmm4
1662	pxor	%xmm11,%xmm3
1663.byte	102,15,56,220,209
1664	movdqu	48(%rdi),%xmm5
1665	pxor	%xmm12,%xmm4
1666.byte	102,15,56,220,217
1667	movdqu	64(%rdi),%xmm6
1668	pxor	%xmm13,%xmm5
1669.byte	102,15,56,220,225
1670	movdqu	80(%rdi),%xmm7
1671	pxor	%xmm15,%xmm8
1672	movdqa	96(%rsp),%xmm9
1673	pxor	%xmm14,%xmm6
1674.byte	102,15,56,220,233
1675	movups	32(%r11),%xmm0
1676	leaq	96(%rdi),%rdi
1677	pxor	%xmm8,%xmm7
1678
1679	pxor	%xmm9,%xmm10
1680.byte	102,15,56,220,241
1681	pxor	%xmm9,%xmm11
1682	movdqa	%xmm10,0(%rsp)
1683.byte	102,15,56,220,249
1684	movups	48(%r11),%xmm1
1685	pxor	%xmm9,%xmm12
1686
1687.byte	102,15,56,220,208
1688	pxor	%xmm9,%xmm13
1689	movdqa	%xmm11,16(%rsp)
1690.byte	102,15,56,220,216
1691	pxor	%xmm9,%xmm14
1692	movdqa	%xmm12,32(%rsp)
1693.byte	102,15,56,220,224
1694.byte	102,15,56,220,232
1695	pxor	%xmm9,%xmm8
1696	movdqa	%xmm14,64(%rsp)
1697.byte	102,15,56,220,240
1698.byte	102,15,56,220,248
1699	movups	64(%r11),%xmm0
1700	movdqa	%xmm8,80(%rsp)
1701	pshufd	$0x5f,%xmm15,%xmm9
1702	jmp	.Lxts_enc_loop6
1703.align	32
1704.Lxts_enc_loop6:
1705.byte	102,15,56,220,209
1706.byte	102,15,56,220,217
1707.byte	102,15,56,220,225
1708.byte	102,15,56,220,233
1709.byte	102,15,56,220,241
1710.byte	102,15,56,220,249
1711	movups	-64(%rcx,%rax,1),%xmm1
1712	addq	$32,%rax
1713
1714.byte	102,15,56,220,208
1715.byte	102,15,56,220,216
1716.byte	102,15,56,220,224
1717.byte	102,15,56,220,232
1718.byte	102,15,56,220,240
1719.byte	102,15,56,220,248
1720	movups	-80(%rcx,%rax,1),%xmm0
1721	jnz	.Lxts_enc_loop6
1722
1723	movdqa	(%r8),%xmm8
1724	movdqa	%xmm9,%xmm14
1725	paddd	%xmm9,%xmm9
1726.byte	102,15,56,220,209
1727	paddq	%xmm15,%xmm15
1728	psrad	$31,%xmm14
1729.byte	102,15,56,220,217
1730	pand	%xmm8,%xmm14
1731	movups	(%r11),%xmm10
1732.byte	102,15,56,220,225
1733.byte	102,15,56,220,233
1734.byte	102,15,56,220,241
1735	pxor	%xmm14,%xmm15
1736	movaps	%xmm10,%xmm11
1737.byte	102,15,56,220,249
1738	movups	-64(%rcx),%xmm1
1739
1740	movdqa	%xmm9,%xmm14
1741.byte	102,15,56,220,208
1742	paddd	%xmm9,%xmm9
1743	pxor	%xmm15,%xmm10
1744.byte	102,15,56,220,216
1745	psrad	$31,%xmm14
1746	paddq	%xmm15,%xmm15
1747.byte	102,15,56,220,224
1748.byte	102,15,56,220,232
1749	pand	%xmm8,%xmm14
1750	movaps	%xmm11,%xmm12
1751.byte	102,15,56,220,240
1752	pxor	%xmm14,%xmm15
1753	movdqa	%xmm9,%xmm14
1754.byte	102,15,56,220,248
1755	movups	-48(%rcx),%xmm0
1756
1757	paddd	%xmm9,%xmm9
1758.byte	102,15,56,220,209
1759	pxor	%xmm15,%xmm11
1760	psrad	$31,%xmm14
1761.byte	102,15,56,220,217
1762	paddq	%xmm15,%xmm15
1763	pand	%xmm8,%xmm14
1764.byte	102,15,56,220,225
1765.byte	102,15,56,220,233
1766	movdqa	%xmm13,48(%rsp)
1767	pxor	%xmm14,%xmm15
1768.byte	102,15,56,220,241
1769	movaps	%xmm12,%xmm13
1770	movdqa	%xmm9,%xmm14
1771.byte	102,15,56,220,249
1772	movups	-32(%rcx),%xmm1
1773
1774	paddd	%xmm9,%xmm9
1775.byte	102,15,56,220,208
1776	pxor	%xmm15,%xmm12
1777	psrad	$31,%xmm14
1778.byte	102,15,56,220,216
1779	paddq	%xmm15,%xmm15
1780	pand	%xmm8,%xmm14
1781.byte	102,15,56,220,224
1782.byte	102,15,56,220,232
1783.byte	102,15,56,220,240
1784	pxor	%xmm14,%xmm15
1785	movaps	%xmm13,%xmm14
1786.byte	102,15,56,220,248
1787
1788	movdqa	%xmm9,%xmm0
1789	paddd	%xmm9,%xmm9
1790.byte	102,15,56,220,209
1791	pxor	%xmm15,%xmm13
1792	psrad	$31,%xmm0
1793.byte	102,15,56,220,217
1794	paddq	%xmm15,%xmm15
1795	pand	%xmm8,%xmm0
1796.byte	102,15,56,220,225
1797.byte	102,15,56,220,233
1798	pxor	%xmm0,%xmm15
1799	movups	(%r11),%xmm0
1800.byte	102,15,56,220,241
1801.byte	102,15,56,220,249
1802	movups	16(%r11),%xmm1
1803
1804	pxor	%xmm15,%xmm14
1805.byte	102,15,56,221,84,36,0
1806	psrad	$31,%xmm9
1807	paddq	%xmm15,%xmm15
1808.byte	102,15,56,221,92,36,16
1809.byte	102,15,56,221,100,36,32
1810	pand	%xmm8,%xmm9
1811	movq	%r10,%rax
1812.byte	102,15,56,221,108,36,48
1813.byte	102,15,56,221,116,36,64
1814.byte	102,15,56,221,124,36,80
1815	pxor	%xmm9,%xmm15
1816
1817	leaq	96(%rsi),%rsi
1818	movups	%xmm2,-96(%rsi)
1819	movups	%xmm3,-80(%rsi)
1820	movups	%xmm4,-64(%rsi)
1821	movups	%xmm5,-48(%rsi)
1822	movups	%xmm6,-32(%rsi)
1823	movups	%xmm7,-16(%rsi)
1824	subq	$96,%rdx
1825	jnc	.Lxts_enc_grandloop
1826
1827	movl	$16+96,%eax
1828	subl	%r10d,%eax
1829	movq	%r11,%rcx
1830	shrl	$4,%eax
1831
1832.Lxts_enc_short:
1833
1834	movl	%eax,%r10d
1835	pxor	%xmm0,%xmm10
1836	addq	$96,%rdx
1837	jz	.Lxts_enc_done
1838
1839	pxor	%xmm0,%xmm11
1840	cmpq	$0x20,%rdx
1841	jb	.Lxts_enc_one
1842	pxor	%xmm0,%xmm12
1843	je	.Lxts_enc_two
1844
1845	pxor	%xmm0,%xmm13
1846	cmpq	$0x40,%rdx
1847	jb	.Lxts_enc_three
1848	pxor	%xmm0,%xmm14
1849	je	.Lxts_enc_four
1850
1851	movdqu	(%rdi),%xmm2
1852	movdqu	16(%rdi),%xmm3
1853	movdqu	32(%rdi),%xmm4
1854	pxor	%xmm10,%xmm2
1855	movdqu	48(%rdi),%xmm5
1856	pxor	%xmm11,%xmm3
1857	movdqu	64(%rdi),%xmm6
1858	leaq	80(%rdi),%rdi
1859	pxor	%xmm12,%xmm4
1860	pxor	%xmm13,%xmm5
1861	pxor	%xmm14,%xmm6
1862	pxor	%xmm7,%xmm7
1863
1864	call	_aesni_encrypt6
1865
1866	xorps	%xmm10,%xmm2
1867	movdqa	%xmm15,%xmm10
1868	xorps	%xmm11,%xmm3
1869	xorps	%xmm12,%xmm4
1870	movdqu	%xmm2,(%rsi)
1871	xorps	%xmm13,%xmm5
1872	movdqu	%xmm3,16(%rsi)
1873	xorps	%xmm14,%xmm6
1874	movdqu	%xmm4,32(%rsi)
1875	movdqu	%xmm5,48(%rsi)
1876	movdqu	%xmm6,64(%rsi)
1877	leaq	80(%rsi),%rsi
1878	jmp	.Lxts_enc_done
1879
1880.align	16
1881.Lxts_enc_one:
1882	movups	(%rdi),%xmm2
1883	leaq	16(%rdi),%rdi
1884	xorps	%xmm10,%xmm2
1885	movups	(%rcx),%xmm0
1886	movups	16(%rcx),%xmm1
1887	leaq	32(%rcx),%rcx
1888	xorps	%xmm0,%xmm2
1889.Loop_enc1_9:
1890.byte	102,15,56,220,209
1891	decl	%eax
1892	movups	(%rcx),%xmm1
1893	leaq	16(%rcx),%rcx
1894	jnz	.Loop_enc1_9
1895.byte	102,15,56,221,209
1896	xorps	%xmm10,%xmm2
1897	movdqa	%xmm11,%xmm10
1898	movups	%xmm2,(%rsi)
1899	leaq	16(%rsi),%rsi
1900	jmp	.Lxts_enc_done
1901
1902.align	16
1903.Lxts_enc_two:
1904	movups	(%rdi),%xmm2
1905	movups	16(%rdi),%xmm3
1906	leaq	32(%rdi),%rdi
1907	xorps	%xmm10,%xmm2
1908	xorps	%xmm11,%xmm3
1909
1910	call	_aesni_encrypt2
1911
1912	xorps	%xmm10,%xmm2
1913	movdqa	%xmm12,%xmm10
1914	xorps	%xmm11,%xmm3
1915	movups	%xmm2,(%rsi)
1916	movups	%xmm3,16(%rsi)
1917	leaq	32(%rsi),%rsi
1918	jmp	.Lxts_enc_done
1919
1920.align	16
1921.Lxts_enc_three:
1922	movups	(%rdi),%xmm2
1923	movups	16(%rdi),%xmm3
1924	movups	32(%rdi),%xmm4
1925	leaq	48(%rdi),%rdi
1926	xorps	%xmm10,%xmm2
1927	xorps	%xmm11,%xmm3
1928	xorps	%xmm12,%xmm4
1929
1930	call	_aesni_encrypt3
1931
1932	xorps	%xmm10,%xmm2
1933	movdqa	%xmm13,%xmm10
1934	xorps	%xmm11,%xmm3
1935	xorps	%xmm12,%xmm4
1936	movups	%xmm2,(%rsi)
1937	movups	%xmm3,16(%rsi)
1938	movups	%xmm4,32(%rsi)
1939	leaq	48(%rsi),%rsi
1940	jmp	.Lxts_enc_done
1941
1942.align	16
1943.Lxts_enc_four:
1944	movups	(%rdi),%xmm2
1945	movups	16(%rdi),%xmm3
1946	movups	32(%rdi),%xmm4
1947	xorps	%xmm10,%xmm2
1948	movups	48(%rdi),%xmm5
1949	leaq	64(%rdi),%rdi
1950	xorps	%xmm11,%xmm3
1951	xorps	%xmm12,%xmm4
1952	xorps	%xmm13,%xmm5
1953
1954	call	_aesni_encrypt4
1955
1956	pxor	%xmm10,%xmm2
1957	movdqa	%xmm14,%xmm10
1958	pxor	%xmm11,%xmm3
1959	pxor	%xmm12,%xmm4
1960	movdqu	%xmm2,(%rsi)
1961	pxor	%xmm13,%xmm5
1962	movdqu	%xmm3,16(%rsi)
1963	movdqu	%xmm4,32(%rsi)
1964	movdqu	%xmm5,48(%rsi)
1965	leaq	64(%rsi),%rsi
1966	jmp	.Lxts_enc_done
1967
1968.align	16
1969.Lxts_enc_done:
1970	andq	$15,%r9
1971	jz	.Lxts_enc_ret
1972	movq	%r9,%rdx
1973
1974.Lxts_enc_steal:
1975	movzbl	(%rdi),%eax
1976	movzbl	-16(%rsi),%ecx
1977	leaq	1(%rdi),%rdi
1978	movb	%al,-16(%rsi)
1979	movb	%cl,0(%rsi)
1980	leaq	1(%rsi),%rsi
1981	subq	$1,%rdx
1982	jnz	.Lxts_enc_steal
1983
1984	subq	%r9,%rsi
1985	movq	%r11,%rcx
1986	movl	%r10d,%eax
1987
1988	movups	-16(%rsi),%xmm2
1989	xorps	%xmm10,%xmm2
1990	movups	(%rcx),%xmm0
1991	movups	16(%rcx),%xmm1
1992	leaq	32(%rcx),%rcx
1993	xorps	%xmm0,%xmm2
1994.Loop_enc1_10:
1995.byte	102,15,56,220,209
1996	decl	%eax
1997	movups	(%rcx),%xmm1
1998	leaq	16(%rcx),%rcx
1999	jnz	.Loop_enc1_10
2000.byte	102,15,56,221,209
2001	xorps	%xmm10,%xmm2
2002	movups	%xmm2,-16(%rsi)
2003
2004.Lxts_enc_ret:
2005	xorps	%xmm0,%xmm0
2006	pxor	%xmm1,%xmm1
2007	pxor	%xmm2,%xmm2
2008	pxor	%xmm3,%xmm3
2009	pxor	%xmm4,%xmm4
2010	pxor	%xmm5,%xmm5
2011	pxor	%xmm6,%xmm6
2012	pxor	%xmm7,%xmm7
2013	movaps	%xmm0,0(%rsp)
2014	pxor	%xmm8,%xmm8
2015	movaps	%xmm0,16(%rsp)
2016	pxor	%xmm9,%xmm9
2017	movaps	%xmm0,32(%rsp)
2018	pxor	%xmm10,%xmm10
2019	movaps	%xmm0,48(%rsp)
2020	pxor	%xmm11,%xmm11
2021	movaps	%xmm0,64(%rsp)
2022	pxor	%xmm12,%xmm12
2023	movaps	%xmm0,80(%rsp)
2024	pxor	%xmm13,%xmm13
2025	movaps	%xmm0,96(%rsp)
2026	pxor	%xmm14,%xmm14
2027	pxor	%xmm15,%xmm15
2028	leaq	(%rbp),%rsp
2029	popq	%rbp
2030.Lxts_enc_epilogue:
2031	.byte	0xf3,0xc3
2032.size	aesni_xts_encrypt,.-aesni_xts_encrypt
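# XTS decryption: same argument layout as aesni_xts_encrypt above; the trailing
# partial block, if any, is handled with ciphertext stealing (.Lxts_dec_steal).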
2033.globl	aesni_xts_decrypt
2034.type	aesni_xts_decrypt,@function
2035.align	16
2036aesni_xts_decrypt:
2037	leaq	(%rsp),%rax
2038	pushq	%rbp
2039	subq	$112,%rsp
2040	andq	$-16,%rsp
2041	leaq	-8(%rax),%rbp
2042	movups	(%r9),%xmm2
2043	movl	240(%r8),%eax
2044	movl	240(%rcx),%r10d
2045	movups	(%r8),%xmm0
2046	movups	16(%r8),%xmm1
2047	leaq	32(%r8),%r8
2048	xorps	%xmm0,%xmm2
2049.Loop_enc1_11:
2050.byte	102,15,56,220,209
2051	decl	%eax
2052	movups	(%r8),%xmm1
2053	leaq	16(%r8),%r8
2054	jnz	.Loop_enc1_11
2055.byte	102,15,56,221,209
2056	xorl	%eax,%eax
2057	testq	$15,%rdx
2058	setnz	%al
2059	shlq	$4,%rax
2060	subq	%rax,%rdx
2061
2062	movups	(%rcx),%xmm0
2063	movq	%rcx,%r11
2064	movl	%r10d,%eax
2065	shll	$4,%r10d
2066	movq	%rdx,%r9
2067	andq	$-16,%rdx
2068
2069	movups	16(%rcx,%r10,1),%xmm1
2070
2071	movdqa	.Lxts_magic(%rip),%xmm8
2072	movdqa	%xmm2,%xmm15
2073	pshufd	$0x5f,%xmm2,%xmm9
2074	pxor	%xmm0,%xmm1
2075	movdqa	%xmm9,%xmm14
2076	paddd	%xmm9,%xmm9
2077	movdqa	%xmm15,%xmm10
2078	psrad	$31,%xmm14
2079	paddq	%xmm15,%xmm15
2080	pand	%xmm8,%xmm14
2081	pxor	%xmm0,%xmm10
2082	pxor	%xmm14,%xmm15
2083	movdqa	%xmm9,%xmm14
2084	paddd	%xmm9,%xmm9
2085	movdqa	%xmm15,%xmm11
2086	psrad	$31,%xmm14
2087	paddq	%xmm15,%xmm15
2088	pand	%xmm8,%xmm14
2089	pxor	%xmm0,%xmm11
2090	pxor	%xmm14,%xmm15
2091	movdqa	%xmm9,%xmm14
2092	paddd	%xmm9,%xmm9
2093	movdqa	%xmm15,%xmm12
2094	psrad	$31,%xmm14
2095	paddq	%xmm15,%xmm15
2096	pand	%xmm8,%xmm14
2097	pxor	%xmm0,%xmm12
2098	pxor	%xmm14,%xmm15
2099	movdqa	%xmm9,%xmm14
2100	paddd	%xmm9,%xmm9
2101	movdqa	%xmm15,%xmm13
2102	psrad	$31,%xmm14
2103	paddq	%xmm15,%xmm15
2104	pand	%xmm8,%xmm14
2105	pxor	%xmm0,%xmm13
2106	pxor	%xmm14,%xmm15
2107	movdqa	%xmm15,%xmm14
2108	psrad	$31,%xmm9
2109	paddq	%xmm15,%xmm15
2110	pand	%xmm8,%xmm9
2111	pxor	%xmm0,%xmm14
2112	pxor	%xmm9,%xmm15
2113	movaps	%xmm1,96(%rsp)
2114
2115	subq	$96,%rdx
2116	jc	.Lxts_dec_short
2117
2118	movl	$16+96,%eax
2119	leaq	32(%r11,%r10,1),%rcx
2120	subq	%r10,%rax
2121	movups	16(%r11),%xmm1
2122	movq	%rax,%r10
2123	leaq	.Lxts_magic(%rip),%r8
2124	jmp	.Lxts_dec_grandloop
2125
2126.align	32
2127.Lxts_dec_grandloop:
2128	movdqu	0(%rdi),%xmm2
2129	movdqa	%xmm0,%xmm8
2130	movdqu	16(%rdi),%xmm3
2131	pxor	%xmm10,%xmm2
2132	movdqu	32(%rdi),%xmm4
2133	pxor	%xmm11,%xmm3
2134.byte	102,15,56,222,209
2135	movdqu	48(%rdi),%xmm5
2136	pxor	%xmm12,%xmm4
2137.byte	102,15,56,222,217
2138	movdqu	64(%rdi),%xmm6
2139	pxor	%xmm13,%xmm5
2140.byte	102,15,56,222,225
2141	movdqu	80(%rdi),%xmm7
2142	pxor	%xmm15,%xmm8
2143	movdqa	96(%rsp),%xmm9
2144	pxor	%xmm14,%xmm6
2145.byte	102,15,56,222,233
2146	movups	32(%r11),%xmm0
2147	leaq	96(%rdi),%rdi
2148	pxor	%xmm8,%xmm7
2149
2150	pxor	%xmm9,%xmm10
2151.byte	102,15,56,222,241
2152	pxor	%xmm9,%xmm11
2153	movdqa	%xmm10,0(%rsp)
2154.byte	102,15,56,222,249
2155	movups	48(%r11),%xmm1
2156	pxor	%xmm9,%xmm12
2157
2158.byte	102,15,56,222,208
2159	pxor	%xmm9,%xmm13
2160	movdqa	%xmm11,16(%rsp)
2161.byte	102,15,56,222,216
2162	pxor	%xmm9,%xmm14
2163	movdqa	%xmm12,32(%rsp)
2164.byte	102,15,56,222,224
2165.byte	102,15,56,222,232
2166	pxor	%xmm9,%xmm8
2167	movdqa	%xmm14,64(%rsp)
2168.byte	102,15,56,222,240
2169.byte	102,15,56,222,248
2170	movups	64(%r11),%xmm0
2171	movdqa	%xmm8,80(%rsp)
2172	pshufd	$0x5f,%xmm15,%xmm9
2173	jmp	.Lxts_dec_loop6
2174.align	32
2175.Lxts_dec_loop6:
2176.byte	102,15,56,222,209
2177.byte	102,15,56,222,217
2178.byte	102,15,56,222,225
2179.byte	102,15,56,222,233
2180.byte	102,15,56,222,241
2181.byte	102,15,56,222,249
2182	movups	-64(%rcx,%rax,1),%xmm1
2183	addq	$32,%rax
2184
2185.byte	102,15,56,222,208
2186.byte	102,15,56,222,216
2187.byte	102,15,56,222,224
2188.byte	102,15,56,222,232
2189.byte	102,15,56,222,240
2190.byte	102,15,56,222,248
2191	movups	-80(%rcx,%rax,1),%xmm0
2192	jnz	.Lxts_dec_loop6
2193
2194	movdqa	(%r8),%xmm8
2195	movdqa	%xmm9,%xmm14
2196	paddd	%xmm9,%xmm9
2197.byte	102,15,56,222,209
2198	paddq	%xmm15,%xmm15
2199	psrad	$31,%xmm14
2200.byte	102,15,56,222,217
2201	pand	%xmm8,%xmm14
2202	movups	(%r11),%xmm10
2203.byte	102,15,56,222,225
2204.byte	102,15,56,222,233
2205.byte	102,15,56,222,241
2206	pxor	%xmm14,%xmm15
2207	movaps	%xmm10,%xmm11
2208.byte	102,15,56,222,249
2209	movups	-64(%rcx),%xmm1
2210
2211	movdqa	%xmm9,%xmm14
2212.byte	102,15,56,222,208
2213	paddd	%xmm9,%xmm9
2214	pxor	%xmm15,%xmm10
2215.byte	102,15,56,222,216
2216	psrad	$31,%xmm14
2217	paddq	%xmm15,%xmm15
2218.byte	102,15,56,222,224
2219.byte	102,15,56,222,232
2220	pand	%xmm8,%xmm14
2221	movaps	%xmm11,%xmm12
2222.byte	102,15,56,222,240
2223	pxor	%xmm14,%xmm15
2224	movdqa	%xmm9,%xmm14
2225.byte	102,15,56,222,248
2226	movups	-48(%rcx),%xmm0
2227
2228	paddd	%xmm9,%xmm9
2229.byte	102,15,56,222,209
2230	pxor	%xmm15,%xmm11
2231	psrad	$31,%xmm14
2232.byte	102,15,56,222,217
2233	paddq	%xmm15,%xmm15
2234	pand	%xmm8,%xmm14
2235.byte	102,15,56,222,225
2236.byte	102,15,56,222,233
2237	movdqa	%xmm13,48(%rsp)
2238	pxor	%xmm14,%xmm15
2239.byte	102,15,56,222,241
2240	movaps	%xmm12,%xmm13
2241	movdqa	%xmm9,%xmm14
2242.byte	102,15,56,222,249
2243	movups	-32(%rcx),%xmm1
2244
2245	paddd	%xmm9,%xmm9
2246.byte	102,15,56,222,208
2247	pxor	%xmm15,%xmm12
2248	psrad	$31,%xmm14
2249.byte	102,15,56,222,216
2250	paddq	%xmm15,%xmm15
2251	pand	%xmm8,%xmm14
2252.byte	102,15,56,222,224
2253.byte	102,15,56,222,232
2254.byte	102,15,56,222,240
2255	pxor	%xmm14,%xmm15
2256	movaps	%xmm13,%xmm14
2257.byte	102,15,56,222,248
2258
2259	movdqa	%xmm9,%xmm0
2260	paddd	%xmm9,%xmm9
2261.byte	102,15,56,222,209
2262	pxor	%xmm15,%xmm13
2263	psrad	$31,%xmm0
2264.byte	102,15,56,222,217
2265	paddq	%xmm15,%xmm15
2266	pand	%xmm8,%xmm0
2267.byte	102,15,56,222,225
2268.byte	102,15,56,222,233
2269	pxor	%xmm0,%xmm15
2270	movups	(%r11),%xmm0
2271.byte	102,15,56,222,241
2272.byte	102,15,56,222,249
2273	movups	16(%r11),%xmm1
2274
2275	pxor	%xmm15,%xmm14
2276.byte	102,15,56,223,84,36,0
2277	psrad	$31,%xmm9
2278	paddq	%xmm15,%xmm15
2279.byte	102,15,56,223,92,36,16
2280.byte	102,15,56,223,100,36,32
2281	pand	%xmm8,%xmm9
2282	movq	%r10,%rax
2283.byte	102,15,56,223,108,36,48
2284.byte	102,15,56,223,116,36,64
2285.byte	102,15,56,223,124,36,80
2286	pxor	%xmm9,%xmm15
2287
2288	leaq	96(%rsi),%rsi
2289	movups	%xmm2,-96(%rsi)
2290	movups	%xmm3,-80(%rsi)
2291	movups	%xmm4,-64(%rsi)
2292	movups	%xmm5,-48(%rsi)
2293	movups	%xmm6,-32(%rsi)
2294	movups	%xmm7,-16(%rsi)
2295	subq	$96,%rdx
2296	jnc	.Lxts_dec_grandloop
2297
2298	movl	$16+96,%eax
2299	subl	%r10d,%eax
2300	movq	%r11,%rcx
2301	shrl	$4,%eax
2302
2303.Lxts_dec_short:
2304
2305	movl	%eax,%r10d
2306	pxor	%xmm0,%xmm10
2307	pxor	%xmm0,%xmm11
2308	addq	$96,%rdx
2309	jz	.Lxts_dec_done
2310
2311	pxor	%xmm0,%xmm12
2312	cmpq	$0x20,%rdx
2313	jb	.Lxts_dec_one
2314	pxor	%xmm0,%xmm13
2315	je	.Lxts_dec_two
2316
2317	pxor	%xmm0,%xmm14
2318	cmpq	$0x40,%rdx
2319	jb	.Lxts_dec_three
2320	je	.Lxts_dec_four
2321
2322	movdqu	(%rdi),%xmm2
2323	movdqu	16(%rdi),%xmm3
2324	movdqu	32(%rdi),%xmm4
2325	pxor	%xmm10,%xmm2
2326	movdqu	48(%rdi),%xmm5
2327	pxor	%xmm11,%xmm3
2328	movdqu	64(%rdi),%xmm6
2329	leaq	80(%rdi),%rdi
2330	pxor	%xmm12,%xmm4
2331	pxor	%xmm13,%xmm5
2332	pxor	%xmm14,%xmm6
2333
2334	call	_aesni_decrypt6
2335
2336	xorps	%xmm10,%xmm2
2337	xorps	%xmm11,%xmm3
2338	xorps	%xmm12,%xmm4
2339	movdqu	%xmm2,(%rsi)
2340	xorps	%xmm13,%xmm5
2341	movdqu	%xmm3,16(%rsi)
2342	xorps	%xmm14,%xmm6
2343	movdqu	%xmm4,32(%rsi)
2344	pxor	%xmm14,%xmm14
2345	movdqu	%xmm5,48(%rsi)
2346	pcmpgtd	%xmm15,%xmm14
2347	movdqu	%xmm6,64(%rsi)
2348	leaq	80(%rsi),%rsi
2349	pshufd	$0x13,%xmm14,%xmm11
2350	andq	$15,%r9
2351	jz	.Lxts_dec_ret
2352
2353	movdqa	%xmm15,%xmm10
2354	paddq	%xmm15,%xmm15
2355	pand	%xmm8,%xmm11
2356	pxor	%xmm15,%xmm11
2357	jmp	.Lxts_dec_done2
2358
2359.align	16
2360.Lxts_dec_one:
2361	movups	(%rdi),%xmm2
2362	leaq	16(%rdi),%rdi
2363	xorps	%xmm10,%xmm2
2364	movups	(%rcx),%xmm0
2365	movups	16(%rcx),%xmm1
2366	leaq	32(%rcx),%rcx
2367	xorps	%xmm0,%xmm2
2368.Loop_dec1_12:
2369.byte	102,15,56,222,209
2370	decl	%eax
2371	movups	(%rcx),%xmm1
2372	leaq	16(%rcx),%rcx
2373	jnz	.Loop_dec1_12
2374.byte	102,15,56,223,209
2375	xorps	%xmm10,%xmm2
2376	movdqa	%xmm11,%xmm10
2377	movups	%xmm2,(%rsi)
2378	movdqa	%xmm12,%xmm11
2379	leaq	16(%rsi),%rsi
2380	jmp	.Lxts_dec_done
2381
2382.align	16
2383.Lxts_dec_two:
2384	movups	(%rdi),%xmm2
2385	movups	16(%rdi),%xmm3
2386	leaq	32(%rdi),%rdi
2387	xorps	%xmm10,%xmm2
2388	xorps	%xmm11,%xmm3
2389
2390	call	_aesni_decrypt2
2391
2392	xorps	%xmm10,%xmm2
2393	movdqa	%xmm12,%xmm10
2394	xorps	%xmm11,%xmm3
2395	movdqa	%xmm13,%xmm11
2396	movups	%xmm2,(%rsi)
2397	movups	%xmm3,16(%rsi)
2398	leaq	32(%rsi),%rsi
2399	jmp	.Lxts_dec_done
2400
2401.align	16
2402.Lxts_dec_three:
2403	movups	(%rdi),%xmm2
2404	movups	16(%rdi),%xmm3
2405	movups	32(%rdi),%xmm4
2406	leaq	48(%rdi),%rdi
2407	xorps	%xmm10,%xmm2
2408	xorps	%xmm11,%xmm3
2409	xorps	%xmm12,%xmm4
2410
2411	call	_aesni_decrypt3
2412
2413	xorps	%xmm10,%xmm2
2414	movdqa	%xmm13,%xmm10
2415	xorps	%xmm11,%xmm3
2416	movdqa	%xmm14,%xmm11
2417	xorps	%xmm12,%xmm4
2418	movups	%xmm2,(%rsi)
2419	movups	%xmm3,16(%rsi)
2420	movups	%xmm4,32(%rsi)
2421	leaq	48(%rsi),%rsi
2422	jmp	.Lxts_dec_done
2423
2424.align	16
2425.Lxts_dec_four:
2426	movups	(%rdi),%xmm2
2427	movups	16(%rdi),%xmm3
2428	movups	32(%rdi),%xmm4
2429	xorps	%xmm10,%xmm2
2430	movups	48(%rdi),%xmm5
2431	leaq	64(%rdi),%rdi
2432	xorps	%xmm11,%xmm3
2433	xorps	%xmm12,%xmm4
2434	xorps	%xmm13,%xmm5
2435
2436	call	_aesni_decrypt4
2437
2438	pxor	%xmm10,%xmm2
2439	movdqa	%xmm14,%xmm10
2440	pxor	%xmm11,%xmm3
2441	movdqa	%xmm15,%xmm11
2442	pxor	%xmm12,%xmm4
2443	movdqu	%xmm2,(%rsi)
2444	pxor	%xmm13,%xmm5
2445	movdqu	%xmm3,16(%rsi)
2446	movdqu	%xmm4,32(%rsi)
2447	movdqu	%xmm5,48(%rsi)
2448	leaq	64(%rsi),%rsi
2449	jmp	.Lxts_dec_done
2450
2451.align	16
2452.Lxts_dec_done:
2453	andq	$15,%r9
2454	jz	.Lxts_dec_ret
2455.Lxts_dec_done2:
2456	movq	%r9,%rdx
2457	movq	%r11,%rcx
2458	movl	%r10d,%eax
2459
2460	movups	(%rdi),%xmm2
2461	xorps	%xmm11,%xmm2
2462	movups	(%rcx),%xmm0
2463	movups	16(%rcx),%xmm1
2464	leaq	32(%rcx),%rcx
2465	xorps	%xmm0,%xmm2
2466.Loop_dec1_13:
2467.byte	102,15,56,222,209
2468	decl	%eax
2469	movups	(%rcx),%xmm1
2470	leaq	16(%rcx),%rcx
2471	jnz	.Loop_dec1_13
2472.byte	102,15,56,223,209
2473	xorps	%xmm11,%xmm2
2474	movups	%xmm2,(%rsi)
2475
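# XTS ciphertext stealing: the byte-swap loop below interleaves the block
# just produced at (%rsi) with the len%16 trailing input bytes, and the
# reassembled block is then decrypted once more with the remaining tweak.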
2476.Lxts_dec_steal:
2477	movzbl	16(%rdi),%eax
2478	movzbl	(%rsi),%ecx
2479	leaq	1(%rdi),%rdi
2480	movb	%al,(%rsi)
2481	movb	%cl,16(%rsi)
2482	leaq	1(%rsi),%rsi
2483	subq	$1,%rdx
2484	jnz	.Lxts_dec_steal
2485
2486	subq	%r9,%rsi
2487	movq	%r11,%rcx
2488	movl	%r10d,%eax
2489
2490	movups	(%rsi),%xmm2
2491	xorps	%xmm10,%xmm2
2492	movups	(%rcx),%xmm0
2493	movups	16(%rcx),%xmm1
2494	leaq	32(%rcx),%rcx
2495	xorps	%xmm0,%xmm2
2496.Loop_dec1_14:
2497.byte	102,15,56,222,209
2498	decl	%eax
2499	movups	(%rcx),%xmm1
2500	leaq	16(%rcx),%rcx
2501	jnz	.Loop_dec1_14
2502.byte	102,15,56,223,209
2503	xorps	%xmm10,%xmm2
2504	movups	%xmm2,(%rsi)
2505
2506.Lxts_dec_ret:
2507	xorps	%xmm0,%xmm0
2508	pxor	%xmm1,%xmm1
2509	pxor	%xmm2,%xmm2
2510	pxor	%xmm3,%xmm3
2511	pxor	%xmm4,%xmm4
2512	pxor	%xmm5,%xmm5
2513	pxor	%xmm6,%xmm6
2514	pxor	%xmm7,%xmm7
2515	movaps	%xmm0,0(%rsp)
2516	pxor	%xmm8,%xmm8
2517	movaps	%xmm0,16(%rsp)
2518	pxor	%xmm9,%xmm9
2519	movaps	%xmm0,32(%rsp)
2520	pxor	%xmm10,%xmm10
2521	movaps	%xmm0,48(%rsp)
2522	pxor	%xmm11,%xmm11
2523	movaps	%xmm0,64(%rsp)
2524	pxor	%xmm12,%xmm12
2525	movaps	%xmm0,80(%rsp)
2526	pxor	%xmm13,%xmm13
2527	movaps	%xmm0,96(%rsp)
2528	pxor	%xmm14,%xmm14
2529	pxor	%xmm15,%xmm15
2530	leaq	(%rbp),%rsp
2531	popq	%rbp
2532.Lxts_dec_epilogue:
2533	.byte	0xf3,0xc3
2534.size	aesni_xts_decrypt,.-aesni_xts_decrypt
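# aesni_cbc_encrypt below appears to follow the usual OpenSSL prototype
# (inferred from the SysV AMD64 register usage, not stated in this file):
#
#   void aesni_cbc_encrypt(const unsigned char *in,  /* %rdi */
#                          unsigned char *out,       /* %rsi */
#                          size_t length,            /* %rdx */
#                          const AES_KEY *key,       /* %rcx, rounds at 240(%rcx) */
#                          unsigned char *ivec,      /* %r8  */
#                          int enc);                 /* %r9d, 0 = decrypt */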
2535.globl	aesni_cbc_encrypt
2536.type	aesni_cbc_encrypt,@function
2537.align	16
2538aesni_cbc_encrypt:
2539	testq	%rdx,%rdx
2540	jz	.Lcbc_ret
2541
2542	movl	240(%rcx),%r10d
2543	movq	%rcx,%r11
2544	testl	%r9d,%r9d
2545	jz	.Lcbc_decrypt
2546
2547	movups	(%r8),%xmm2
2548	movl	%r10d,%eax
2549	cmpq	$16,%rdx
2550	jb	.Lcbc_enc_tail
2551	subq	$16,%rdx
2552	jmp	.Lcbc_enc_loop
2553.align	16
2554.Lcbc_enc_loop:
2555	movups	(%rdi),%xmm3
2556	leaq	16(%rdi),%rdi
2557
2558	movups	(%rcx),%xmm0
2559	movups	16(%rcx),%xmm1
2560	xorps	%xmm0,%xmm3
2561	leaq	32(%rcx),%rcx
2562	xorps	%xmm3,%xmm2
2563.Loop_enc1_15:
2564.byte	102,15,56,220,209
2565	decl	%eax
2566	movups	(%rcx),%xmm1
2567	leaq	16(%rcx),%rcx
2568	jnz	.Loop_enc1_15
2569.byte	102,15,56,221,209
2570	movl	%r10d,%eax
2571	movq	%r11,%rcx
2572	movups	%xmm2,0(%rsi)
2573	leaq	16(%rsi),%rsi
2574	subq	$16,%rdx
2575	jnc	.Lcbc_enc_loop
2576	addq	$16,%rdx
2577	jnz	.Lcbc_enc_tail
2578	pxor	%xmm0,%xmm0
2579	pxor	%xmm1,%xmm1
2580	movups	%xmm2,(%r8)
2581	pxor	%xmm2,%xmm2
2582	pxor	%xmm3,%xmm3
2583	jmp	.Lcbc_ret
2584
2585.Lcbc_enc_tail:
2586	movq	%rdx,%rcx
2587	xchgq	%rdi,%rsi
2588.long	0x9066A4F3
2589	movl	$16,%ecx
2590	subq	%rdx,%rcx
2591	xorl	%eax,%eax
2592.long	0x9066AAF3
2593	leaq	-16(%rdi),%rdi
2594	movl	%r10d,%eax
2595	movq	%rdi,%rsi
2596	movq	%r11,%rcx
2597	xorq	%rdx,%rdx
2598	jmp	.Lcbc_enc_loop
2599
2600.align	16
2601.Lcbc_decrypt:
2602	cmpq	$16,%rdx
2603	jne	.Lcbc_decrypt_bulk
2604
2605
2606
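# Fast path for a single 16-byte block: decrypt in place without the stack
# frame used by .Lcbc_decrypt_bulk; the input ciphertext (saved in %xmm4)
# becomes the new IV written back to (%r8).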
2607	movdqu	(%rdi),%xmm2
2608	movdqu	(%r8),%xmm3
2609	movdqa	%xmm2,%xmm4
2610	movups	(%rcx),%xmm0
2611	movups	16(%rcx),%xmm1
2612	leaq	32(%rcx),%rcx
2613	xorps	%xmm0,%xmm2
2614.Loop_dec1_16:
2615.byte	102,15,56,222,209
2616	decl	%r10d
2617	movups	(%rcx),%xmm1
2618	leaq	16(%rcx),%rcx
2619	jnz	.Loop_dec1_16
2620.byte	102,15,56,223,209
2621	pxor	%xmm0,%xmm0
2622	pxor	%xmm1,%xmm1
2623	movdqu	%xmm4,(%r8)
2624	xorps	%xmm3,%xmm2
2625	pxor	%xmm3,%xmm3
2626	movups	%xmm2,(%rsi)
2627	pxor	%xmm2,%xmm2
2628	jmp	.Lcbc_ret
2629.align	16
2630.Lcbc_decrypt_bulk:
2631	leaq	(%rsp),%rax
2632	pushq	%rbp
2633	subq	$16,%rsp
2634	andq	$-16,%rsp
2635	leaq	-8(%rax),%rbp
2636	movups	(%r8),%xmm10
2637	movl	%r10d,%eax
2638	cmpq	$0x50,%rdx
2639	jbe	.Lcbc_dec_tail
2640
2641	movups	(%rcx),%xmm0
2642	movdqu	0(%rdi),%xmm2
2643	movdqu	16(%rdi),%xmm3
2644	movdqa	%xmm2,%xmm11
2645	movdqu	32(%rdi),%xmm4
2646	movdqa	%xmm3,%xmm12
2647	movdqu	48(%rdi),%xmm5
2648	movdqa	%xmm4,%xmm13
2649	movdqu	64(%rdi),%xmm6
2650	movdqa	%xmm5,%xmm14
2651	movdqu	80(%rdi),%xmm7
2652	movdqa	%xmm6,%xmm15
2653	movl	OPENSSL_ia32cap_P+4(%rip),%r9d
2654	cmpq	$0x70,%rdx
2655	jbe	.Lcbc_dec_six_or_seven
2656
2657	andl	$71303168,%r9d
2658	subq	$0x50,%rdx
2659	cmpl	$4194304,%r9d
2660	je	.Lcbc_dec_loop6_enter
2661	subq	$0x20,%rdx
2662	leaq	112(%rcx),%rcx
2663	jmp	.Lcbc_dec_loop8_enter
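# Main CBC-decrypt loop: eight blocks per iteration, with the aesdec rounds
# for %xmm2-%xmm9 interleaved to hide latency.  The saved ciphertext blocks
# are pre-XORed with the last round key so that aesdeclast performs the
# final round and the CBC chaining XOR in one step; %r11 (set by the
# setnc/shlq sequence below) points at the ciphertext to preload for the
# next pass, or back at the current batch on the final pass.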
2664.align	16
2665.Lcbc_dec_loop8:
2666	movups	%xmm9,(%rsi)
2667	leaq	16(%rsi),%rsi
2668.Lcbc_dec_loop8_enter:
2669	movdqu	96(%rdi),%xmm8
2670	pxor	%xmm0,%xmm2
2671	movdqu	112(%rdi),%xmm9
2672	pxor	%xmm0,%xmm3
2673	movups	16-112(%rcx),%xmm1
2674	pxor	%xmm0,%xmm4
2675	xorq	%r11,%r11
2676	cmpq	$0x70,%rdx
2677	pxor	%xmm0,%xmm5
2678	pxor	%xmm0,%xmm6
2679	pxor	%xmm0,%xmm7
2680	pxor	%xmm0,%xmm8
2681
2682.byte	102,15,56,222,209
2683	pxor	%xmm0,%xmm9
2684	movups	32-112(%rcx),%xmm0
2685.byte	102,15,56,222,217
2686.byte	102,15,56,222,225
2687.byte	102,15,56,222,233
2688.byte	102,15,56,222,241
2689.byte	102,15,56,222,249
2690.byte	102,68,15,56,222,193
2691	setnc	%r11b
2692	shlq	$7,%r11
2693.byte	102,68,15,56,222,201
2694	addq	%rdi,%r11
2695	movups	48-112(%rcx),%xmm1
2696.byte	102,15,56,222,208
2697.byte	102,15,56,222,216
2698.byte	102,15,56,222,224
2699.byte	102,15,56,222,232
2700.byte	102,15,56,222,240
2701.byte	102,15,56,222,248
2702.byte	102,68,15,56,222,192
2703.byte	102,68,15,56,222,200
2704	movups	64-112(%rcx),%xmm0
2705	nop
2706.byte	102,15,56,222,209
2707.byte	102,15,56,222,217
2708.byte	102,15,56,222,225
2709.byte	102,15,56,222,233
2710.byte	102,15,56,222,241
2711.byte	102,15,56,222,249
2712.byte	102,68,15,56,222,193
2713.byte	102,68,15,56,222,201
2714	movups	80-112(%rcx),%xmm1
2715	nop
2716.byte	102,15,56,222,208
2717.byte	102,15,56,222,216
2718.byte	102,15,56,222,224
2719.byte	102,15,56,222,232
2720.byte	102,15,56,222,240
2721.byte	102,15,56,222,248
2722.byte	102,68,15,56,222,192
2723.byte	102,68,15,56,222,200
2724	movups	96-112(%rcx),%xmm0
2725	nop
2726.byte	102,15,56,222,209
2727.byte	102,15,56,222,217
2728.byte	102,15,56,222,225
2729.byte	102,15,56,222,233
2730.byte	102,15,56,222,241
2731.byte	102,15,56,222,249
2732.byte	102,68,15,56,222,193
2733.byte	102,68,15,56,222,201
2734	movups	112-112(%rcx),%xmm1
2735	nop
2736.byte	102,15,56,222,208
2737.byte	102,15,56,222,216
2738.byte	102,15,56,222,224
2739.byte	102,15,56,222,232
2740.byte	102,15,56,222,240
2741.byte	102,15,56,222,248
2742.byte	102,68,15,56,222,192
2743.byte	102,68,15,56,222,200
2744	movups	128-112(%rcx),%xmm0
2745	nop
2746.byte	102,15,56,222,209
2747.byte	102,15,56,222,217
2748.byte	102,15,56,222,225
2749.byte	102,15,56,222,233
2750.byte	102,15,56,222,241
2751.byte	102,15,56,222,249
2752.byte	102,68,15,56,222,193
2753.byte	102,68,15,56,222,201
2754	movups	144-112(%rcx),%xmm1
2755	cmpl	$11,%eax
2756.byte	102,15,56,222,208
2757.byte	102,15,56,222,216
2758.byte	102,15,56,222,224
2759.byte	102,15,56,222,232
2760.byte	102,15,56,222,240
2761.byte	102,15,56,222,248
2762.byte	102,68,15,56,222,192
2763.byte	102,68,15,56,222,200
2764	movups	160-112(%rcx),%xmm0
2765	jb	.Lcbc_dec_done
2766.byte	102,15,56,222,209
2767.byte	102,15,56,222,217
2768.byte	102,15,56,222,225
2769.byte	102,15,56,222,233
2770.byte	102,15,56,222,241
2771.byte	102,15,56,222,249
2772.byte	102,68,15,56,222,193
2773.byte	102,68,15,56,222,201
2774	movups	176-112(%rcx),%xmm1
2775	nop
2776.byte	102,15,56,222,208
2777.byte	102,15,56,222,216
2778.byte	102,15,56,222,224
2779.byte	102,15,56,222,232
2780.byte	102,15,56,222,240
2781.byte	102,15,56,222,248
2782.byte	102,68,15,56,222,192
2783.byte	102,68,15,56,222,200
2784	movups	192-112(%rcx),%xmm0
2785	je	.Lcbc_dec_done
2786.byte	102,15,56,222,209
2787.byte	102,15,56,222,217
2788.byte	102,15,56,222,225
2789.byte	102,15,56,222,233
2790.byte	102,15,56,222,241
2791.byte	102,15,56,222,249
2792.byte	102,68,15,56,222,193
2793.byte	102,68,15,56,222,201
2794	movups	208-112(%rcx),%xmm1
2795	nop
2796.byte	102,15,56,222,208
2797.byte	102,15,56,222,216
2798.byte	102,15,56,222,224
2799.byte	102,15,56,222,232
2800.byte	102,15,56,222,240
2801.byte	102,15,56,222,248
2802.byte	102,68,15,56,222,192
2803.byte	102,68,15,56,222,200
2804	movups	224-112(%rcx),%xmm0
2805	jmp	.Lcbc_dec_done
2806.align	16
2807.Lcbc_dec_done:
2808.byte	102,15,56,222,209
2809.byte	102,15,56,222,217
2810	pxor	%xmm0,%xmm10
2811	pxor	%xmm0,%xmm11
2812.byte	102,15,56,222,225
2813.byte	102,15,56,222,233
2814	pxor	%xmm0,%xmm12
2815	pxor	%xmm0,%xmm13
2816.byte	102,15,56,222,241
2817.byte	102,15,56,222,249
2818	pxor	%xmm0,%xmm14
2819	pxor	%xmm0,%xmm15
2820.byte	102,68,15,56,222,193
2821.byte	102,68,15,56,222,201
2822	movdqu	80(%rdi),%xmm1
2823
2824.byte	102,65,15,56,223,210
2825	movdqu	96(%rdi),%xmm10
2826	pxor	%xmm0,%xmm1
2827.byte	102,65,15,56,223,219
2828	pxor	%xmm0,%xmm10
2829	movdqu	112(%rdi),%xmm0
2830.byte	102,65,15,56,223,228
2831	leaq	128(%rdi),%rdi
2832	movdqu	0(%r11),%xmm11
2833.byte	102,65,15,56,223,237
2834.byte	102,65,15,56,223,246
2835	movdqu	16(%r11),%xmm12
2836	movdqu	32(%r11),%xmm13
2837.byte	102,65,15,56,223,255
2838.byte	102,68,15,56,223,193
2839	movdqu	48(%r11),%xmm14
2840	movdqu	64(%r11),%xmm15
2841.byte	102,69,15,56,223,202
2842	movdqa	%xmm0,%xmm10
2843	movdqu	80(%r11),%xmm1
2844	movups	-112(%rcx),%xmm0
2845
2846	movups	%xmm2,(%rsi)
2847	movdqa	%xmm11,%xmm2
2848	movups	%xmm3,16(%rsi)
2849	movdqa	%xmm12,%xmm3
2850	movups	%xmm4,32(%rsi)
2851	movdqa	%xmm13,%xmm4
2852	movups	%xmm5,48(%rsi)
2853	movdqa	%xmm14,%xmm5
2854	movups	%xmm6,64(%rsi)
2855	movdqa	%xmm15,%xmm6
2856	movups	%xmm7,80(%rsi)
2857	movdqa	%xmm1,%xmm7
2858	movups	%xmm8,96(%rsi)
2859	leaq	112(%rsi),%rsi
2860
2861	subq	$0x80,%rdx
2862	ja	.Lcbc_dec_loop8
2863
2864	movaps	%xmm9,%xmm2
2865	leaq	-112(%rcx),%rcx
2866	addq	$0x70,%rdx
2867	jle	.Lcbc_dec_clear_tail_collected
2868	movups	%xmm9,(%rsi)
2869	leaq	16(%rsi),%rsi
2870	cmpq	$0x50,%rdx
2871	jbe	.Lcbc_dec_tail
2872
2873	movaps	%xmm11,%xmm2
2874.Lcbc_dec_six_or_seven:
2875	cmpq	$0x60,%rdx
2876	ja	.Lcbc_dec_seven
2877
2878	movaps	%xmm7,%xmm8
2879	call	_aesni_decrypt6
2880	pxor	%xmm10,%xmm2
2881	movaps	%xmm8,%xmm10
2882	pxor	%xmm11,%xmm3
2883	movdqu	%xmm2,(%rsi)
2884	pxor	%xmm12,%xmm4
2885	movdqu	%xmm3,16(%rsi)
2886	pxor	%xmm3,%xmm3
2887	pxor	%xmm13,%xmm5
2888	movdqu	%xmm4,32(%rsi)
2889	pxor	%xmm4,%xmm4
2890	pxor	%xmm14,%xmm6
2891	movdqu	%xmm5,48(%rsi)
2892	pxor	%xmm5,%xmm5
2893	pxor	%xmm15,%xmm7
2894	movdqu	%xmm6,64(%rsi)
2895	pxor	%xmm6,%xmm6
2896	leaq	80(%rsi),%rsi
2897	movdqa	%xmm7,%xmm2
2898	pxor	%xmm7,%xmm7
2899	jmp	.Lcbc_dec_tail_collected
2900
2901.align	16
2902.Lcbc_dec_seven:
2903	movups	96(%rdi),%xmm8
2904	xorps	%xmm9,%xmm9
2905	call	_aesni_decrypt8
2906	movups	80(%rdi),%xmm9
2907	pxor	%xmm10,%xmm2
2908	movups	96(%rdi),%xmm10
2909	pxor	%xmm11,%xmm3
2910	movdqu	%xmm2,(%rsi)
2911	pxor	%xmm12,%xmm4
2912	movdqu	%xmm3,16(%rsi)
2913	pxor	%xmm3,%xmm3
2914	pxor	%xmm13,%xmm5
2915	movdqu	%xmm4,32(%rsi)
2916	pxor	%xmm4,%xmm4
2917	pxor	%xmm14,%xmm6
2918	movdqu	%xmm5,48(%rsi)
2919	pxor	%xmm5,%xmm5
2920	pxor	%xmm15,%xmm7
2921	movdqu	%xmm6,64(%rsi)
2922	pxor	%xmm6,%xmm6
2923	pxor	%xmm9,%xmm8
2924	movdqu	%xmm7,80(%rsi)
2925	pxor	%xmm7,%xmm7
2926	leaq	96(%rsi),%rsi
2927	movdqa	%xmm8,%xmm2
2928	pxor	%xmm8,%xmm8
2929	pxor	%xmm9,%xmm9
2930	jmp	.Lcbc_dec_tail_collected
2931
2932.align	16
2933.Lcbc_dec_loop6:
2934	movups	%xmm7,(%rsi)
2935	leaq	16(%rsi),%rsi
2936	movdqu	0(%rdi),%xmm2
2937	movdqu	16(%rdi),%xmm3
2938	movdqa	%xmm2,%xmm11
2939	movdqu	32(%rdi),%xmm4
2940	movdqa	%xmm3,%xmm12
2941	movdqu	48(%rdi),%xmm5
2942	movdqa	%xmm4,%xmm13
2943	movdqu	64(%rdi),%xmm6
2944	movdqa	%xmm5,%xmm14
2945	movdqu	80(%rdi),%xmm7
2946	movdqa	%xmm6,%xmm15
2947.Lcbc_dec_loop6_enter:
2948	leaq	96(%rdi),%rdi
2949	movdqa	%xmm7,%xmm8
2950
2951	call	_aesni_decrypt6
2952
2953	pxor	%xmm10,%xmm2
2954	movdqa	%xmm8,%xmm10
2955	pxor	%xmm11,%xmm3
2956	movdqu	%xmm2,(%rsi)
2957	pxor	%xmm12,%xmm4
2958	movdqu	%xmm3,16(%rsi)
2959	pxor	%xmm13,%xmm5
2960	movdqu	%xmm4,32(%rsi)
2961	pxor	%xmm14,%xmm6
2962	movq	%r11,%rcx
2963	movdqu	%xmm5,48(%rsi)
2964	pxor	%xmm15,%xmm7
2965	movl	%r10d,%eax
2966	movdqu	%xmm6,64(%rsi)
2967	leaq	80(%rsi),%rsi
2968	subq	$0x60,%rdx
2969	ja	.Lcbc_dec_loop6
2970
2971	movdqa	%xmm7,%xmm2
2972	addq	$0x50,%rdx
2973	jle	.Lcbc_dec_clear_tail_collected
2974	movups	%xmm7,(%rsi)
2975	leaq	16(%rsi),%rsi
2976
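# At most 0x50 bytes are left here; each case below leaves the final
# plaintext block in %xmm2 and the corresponding ciphertext (the new IV)
# in %xmm10 before falling through to .Lcbc_dec_tail_collected.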
2977.Lcbc_dec_tail:
2978	movups	(%rdi),%xmm2
2979	subq	$0x10,%rdx
2980	jbe	.Lcbc_dec_one
2981
2982	movups	16(%rdi),%xmm3
2983	movaps	%xmm2,%xmm11
2984	subq	$0x10,%rdx
2985	jbe	.Lcbc_dec_two
2986
2987	movups	32(%rdi),%xmm4
2988	movaps	%xmm3,%xmm12
2989	subq	$0x10,%rdx
2990	jbe	.Lcbc_dec_three
2991
2992	movups	48(%rdi),%xmm5
2993	movaps	%xmm4,%xmm13
2994	subq	$0x10,%rdx
2995	jbe	.Lcbc_dec_four
2996
2997	movups	64(%rdi),%xmm6
2998	movaps	%xmm5,%xmm14
2999	movaps	%xmm6,%xmm15
3000	xorps	%xmm7,%xmm7
3001	call	_aesni_decrypt6
3002	pxor	%xmm10,%xmm2
3003	movaps	%xmm15,%xmm10
3004	pxor	%xmm11,%xmm3
3005	movdqu	%xmm2,(%rsi)
3006	pxor	%xmm12,%xmm4
3007	movdqu	%xmm3,16(%rsi)
3008	pxor	%xmm3,%xmm3
3009	pxor	%xmm13,%xmm5
3010	movdqu	%xmm4,32(%rsi)
3011	pxor	%xmm4,%xmm4
3012	pxor	%xmm14,%xmm6
3013	movdqu	%xmm5,48(%rsi)
3014	pxor	%xmm5,%xmm5
3015	leaq	64(%rsi),%rsi
3016	movdqa	%xmm6,%xmm2
3017	pxor	%xmm6,%xmm6
3018	pxor	%xmm7,%xmm7
3019	subq	$0x10,%rdx
3020	jmp	.Lcbc_dec_tail_collected
3021
3022.align	16
3023.Lcbc_dec_one:
3024	movaps	%xmm2,%xmm11
3025	movups	(%rcx),%xmm0
3026	movups	16(%rcx),%xmm1
3027	leaq	32(%rcx),%rcx
3028	xorps	%xmm0,%xmm2
3029.Loop_dec1_17:
3030.byte	102,15,56,222,209
3031	decl	%eax
3032	movups	(%rcx),%xmm1
3033	leaq	16(%rcx),%rcx
3034	jnz	.Loop_dec1_17
3035.byte	102,15,56,223,209
3036	xorps	%xmm10,%xmm2
3037	movaps	%xmm11,%xmm10
3038	jmp	.Lcbc_dec_tail_collected
3039.align	16
3040.Lcbc_dec_two:
3041	movaps	%xmm3,%xmm12
3042	call	_aesni_decrypt2
3043	pxor	%xmm10,%xmm2
3044	movaps	%xmm12,%xmm10
3045	pxor	%xmm11,%xmm3
3046	movdqu	%xmm2,(%rsi)
3047	movdqa	%xmm3,%xmm2
3048	pxor	%xmm3,%xmm3
3049	leaq	16(%rsi),%rsi
3050	jmp	.Lcbc_dec_tail_collected
3051.align	16
3052.Lcbc_dec_three:
3053	movaps	%xmm4,%xmm13
3054	call	_aesni_decrypt3
3055	pxor	%xmm10,%xmm2
3056	movaps	%xmm13,%xmm10
3057	pxor	%xmm11,%xmm3
3058	movdqu	%xmm2,(%rsi)
3059	pxor	%xmm12,%xmm4
3060	movdqu	%xmm3,16(%rsi)
3061	pxor	%xmm3,%xmm3
3062	movdqa	%xmm4,%xmm2
3063	pxor	%xmm4,%xmm4
3064	leaq	32(%rsi),%rsi
3065	jmp	.Lcbc_dec_tail_collected
3066.align	16
3067.Lcbc_dec_four:
3068	movaps	%xmm5,%xmm14
3069	call	_aesni_decrypt4
3070	pxor	%xmm10,%xmm2
3071	movaps	%xmm14,%xmm10
3072	pxor	%xmm11,%xmm3
3073	movdqu	%xmm2,(%rsi)
3074	pxor	%xmm12,%xmm4
3075	movdqu	%xmm3,16(%rsi)
3076	pxor	%xmm3,%xmm3
3077	pxor	%xmm13,%xmm5
3078	movdqu	%xmm4,32(%rsi)
3079	pxor	%xmm4,%xmm4
3080	movdqa	%xmm5,%xmm2
3081	pxor	%xmm5,%xmm5
3082	leaq	48(%rsi),%rsi
3083	jmp	.Lcbc_dec_tail_collected
3084
3085.align	16
3086.Lcbc_dec_clear_tail_collected:
3087	pxor	%xmm3,%xmm3
3088	pxor	%xmm4,%xmm4
3089	pxor	%xmm5,%xmm5
3090	pxor	%xmm6,%xmm6
3091	pxor	%xmm7,%xmm7
3092	pxor	%xmm8,%xmm8
3093	pxor	%xmm9,%xmm9
3094.Lcbc_dec_tail_collected:
3095	movups	%xmm10,(%r8)
3096	andq	$15,%rdx
3097	jnz	.Lcbc_dec_tail_partial
3098	movups	%xmm2,(%rsi)
3099	pxor	%xmm2,%xmm2
3100	jmp	.Lcbc_dec_ret
3101.align	16
3102.Lcbc_dec_tail_partial:
3103	movaps	%xmm2,(%rsp)
3104	pxor	%xmm2,%xmm2
3105	movq	$16,%rcx
3106	movq	%rsi,%rdi
3107	subq	%rdx,%rcx
3108	leaq	(%rsp),%rsi
3109.long	0x9066A4F3
3110	movdqa	%xmm2,(%rsp)
3111
3112.Lcbc_dec_ret:
3113	xorps	%xmm0,%xmm0
3114	pxor	%xmm1,%xmm1
3115	leaq	(%rbp),%rsp
3116	popq	%rbp
3117.Lcbc_ret:
3118	.byte	0xf3,0xc3
3119.size	aesni_cbc_encrypt,.-aesni_cbc_encrypt
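# aesni_set_decrypt_key builds the encryption schedule via
# __aesni_set_encrypt_key and then converts it in place: the outer round
# keys are swapped end-for-end and the inner ones run through aesimc
# (the .byte 102,15,56,219 sequences) to obtain the equivalent-inverse-
# cipher form.  Presumed prototype, matching the usual OpenSSL one:
#   int aesni_set_decrypt_key(const unsigned char *userKey, int bits,
#                             AES_KEY *key);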
3120.globl	aesni_set_decrypt_key
3121.type	aesni_set_decrypt_key,@function
3122.align	16
3123aesni_set_decrypt_key:
3124.byte	0x48,0x83,0xEC,0x08
3125	call	__aesni_set_encrypt_key
3126	shll	$4,%esi
3127	testl	%eax,%eax
3128	jnz	.Ldec_key_ret
3129	leaq	16(%rdx,%rsi,1),%rdi
3130
3131	movups	(%rdx),%xmm0
3132	movups	(%rdi),%xmm1
3133	movups	%xmm0,(%rdi)
3134	movups	%xmm1,(%rdx)
3135	leaq	16(%rdx),%rdx
3136	leaq	-16(%rdi),%rdi
3137
3138.Ldec_key_inverse:
3139	movups	(%rdx),%xmm0
3140	movups	(%rdi),%xmm1
3141.byte	102,15,56,219,192
3142.byte	102,15,56,219,201
3143	leaq	16(%rdx),%rdx
3144	leaq	-16(%rdi),%rdi
3145	movups	%xmm0,16(%rdi)
3146	movups	%xmm1,-16(%rdx)
3147	cmpq	%rdx,%rdi
3148	ja	.Ldec_key_inverse
3149
3150	movups	(%rdx),%xmm0
3151.byte	102,15,56,219,192
3152	pxor	%xmm1,%xmm1
3153	movups	%xmm0,(%rdi)
3154	pxor	%xmm0,%xmm0
3155.Ldec_key_ret:
3156	addq	$8,%rsp
3157	.byte	0xf3,0xc3
3158.LSEH_end_set_decrypt_key:
3159.size	aesni_set_decrypt_key,.-aesni_set_decrypt_key
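# aesni_set_encrypt_key / __aesni_set_encrypt_key expand the user key with
# aeskeygenassist (.byte 102,15,58,223,...), or with the pshufb/aesenclast
# based .L*rounds_alt paths when the OPENSSL_ia32cap_P bits checked at
# entry request it.  Returns 0 on success, -1 for NULL arguments, -2 for a
# key size other than 128/192/256 bits.  Presumed prototype:
#   int aesni_set_encrypt_key(const unsigned char *userKey, int bits,
#                             AES_KEY *key);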
3160.globl	aesni_set_encrypt_key
3161.type	aesni_set_encrypt_key,@function
3162.align	16
3163aesni_set_encrypt_key:
3164__aesni_set_encrypt_key:
3165.byte	0x48,0x83,0xEC,0x08
3166	movq	$-1,%rax
3167	testq	%rdi,%rdi
3168	jz	.Lenc_key_ret
3169	testq	%rdx,%rdx
3170	jz	.Lenc_key_ret
3171
3172	movl	$268437504,%r10d
3173	movups	(%rdi),%xmm0
3174	xorps	%xmm4,%xmm4
3175	andl	OPENSSL_ia32cap_P+4(%rip),%r10d
3176	leaq	16(%rdx),%rax
3177	cmpl	$256,%esi
3178	je	.L14rounds
3179	cmpl	$192,%esi
3180	je	.L12rounds
3181	cmpl	$128,%esi
3182	jne	.Lbad_keybits
3183
3184.L10rounds:
3185	movl	$9,%esi
3186	cmpl	$268435456,%r10d
3187	je	.L10rounds_alt
3188
3189	movups	%xmm0,(%rdx)
3190.byte	102,15,58,223,200,1
3191	call	.Lkey_expansion_128_cold
3192.byte	102,15,58,223,200,2
3193	call	.Lkey_expansion_128
3194.byte	102,15,58,223,200,4
3195	call	.Lkey_expansion_128
3196.byte	102,15,58,223,200,8
3197	call	.Lkey_expansion_128
3198.byte	102,15,58,223,200,16
3199	call	.Lkey_expansion_128
3200.byte	102,15,58,223,200,32
3201	call	.Lkey_expansion_128
3202.byte	102,15,58,223,200,64
3203	call	.Lkey_expansion_128
3204.byte	102,15,58,223,200,128
3205	call	.Lkey_expansion_128
3206.byte	102,15,58,223,200,27
3207	call	.Lkey_expansion_128
3208.byte	102,15,58,223,200,54
3209	call	.Lkey_expansion_128
3210	movups	%xmm0,(%rax)
3211	movl	%esi,80(%rax)
3212	xorl	%eax,%eax
3213	jmp	.Lenc_key_ret
3214
3215.align	16
3216.L10rounds_alt:
3217	movdqa	.Lkey_rotate(%rip),%xmm5
3218	movl	$8,%r10d
3219	movdqa	.Lkey_rcon1(%rip),%xmm4
3220	movdqa	%xmm0,%xmm2
3221	movdqu	%xmm0,(%rdx)
3222	jmp	.Loop_key128
3223
3224.align	16
3225.Loop_key128:
3226.byte	102,15,56,0,197
3227.byte	102,15,56,221,196
3228	pslld	$1,%xmm4
3229	leaq	16(%rax),%rax
3230
3231	movdqa	%xmm2,%xmm3
3232	pslldq	$4,%xmm2
3233	pxor	%xmm2,%xmm3
3234	pslldq	$4,%xmm2
3235	pxor	%xmm2,%xmm3
3236	pslldq	$4,%xmm2
3237	pxor	%xmm3,%xmm2
3238
3239	pxor	%xmm2,%xmm0
3240	movdqu	%xmm0,-16(%rax)
3241	movdqa	%xmm0,%xmm2
3242
3243	decl	%r10d
3244	jnz	.Loop_key128
3245
3246	movdqa	.Lkey_rcon1b(%rip),%xmm4
3247
3248.byte	102,15,56,0,197
3249.byte	102,15,56,221,196
3250	pslld	$1,%xmm4
3251
3252	movdqa	%xmm2,%xmm3
3253	pslldq	$4,%xmm2
3254	pxor	%xmm2,%xmm3
3255	pslldq	$4,%xmm2
3256	pxor	%xmm2,%xmm3
3257	pslldq	$4,%xmm2
3258	pxor	%xmm3,%xmm2
3259
3260	pxor	%xmm2,%xmm0
3261	movdqu	%xmm0,(%rax)
3262
3263	movdqa	%xmm0,%xmm2
3264.byte	102,15,56,0,197
3265.byte	102,15,56,221,196
3266
3267	movdqa	%xmm2,%xmm3
3268	pslldq	$4,%xmm2
3269	pxor	%xmm2,%xmm3
3270	pslldq	$4,%xmm2
3271	pxor	%xmm2,%xmm3
3272	pslldq	$4,%xmm2
3273	pxor	%xmm3,%xmm2
3274
3275	pxor	%xmm2,%xmm0
3276	movdqu	%xmm0,16(%rax)
3277
3278	movl	%esi,96(%rax)
3279	xorl	%eax,%eax
3280	jmp	.Lenc_key_ret
3281
3282.align	16
3283.L12rounds:
3284	movq	16(%rdi),%xmm2
3285	movl	$11,%esi
3286	cmpl	$268435456,%r10d
3287	je	.L12rounds_alt
3288
3289	movups	%xmm0,(%rdx)
3290.byte	102,15,58,223,202,1
3291	call	.Lkey_expansion_192a_cold
3292.byte	102,15,58,223,202,2
3293	call	.Lkey_expansion_192b
3294.byte	102,15,58,223,202,4
3295	call	.Lkey_expansion_192a
3296.byte	102,15,58,223,202,8
3297	call	.Lkey_expansion_192b
3298.byte	102,15,58,223,202,16
3299	call	.Lkey_expansion_192a
3300.byte	102,15,58,223,202,32
3301	call	.Lkey_expansion_192b
3302.byte	102,15,58,223,202,64
3303	call	.Lkey_expansion_192a
3304.byte	102,15,58,223,202,128
3305	call	.Lkey_expansion_192b
3306	movups	%xmm0,(%rax)
3307	movl	%esi,48(%rax)
3308	xorq	%rax,%rax
3309	jmp	.Lenc_key_ret
3310
3311.align	16
3312.L12rounds_alt:
3313	movdqa	.Lkey_rotate192(%rip),%xmm5
3314	movdqa	.Lkey_rcon1(%rip),%xmm4
3315	movl	$8,%r10d
3316	movdqu	%xmm0,(%rdx)
3317	jmp	.Loop_key192
3318
3319.align	16
3320.Loop_key192:
3321	movq	%xmm2,0(%rax)
3322	movdqa	%xmm2,%xmm1
3323.byte	102,15,56,0,213
3324.byte	102,15,56,221,212
3325	pslld	$1,%xmm4
3326	leaq	24(%rax),%rax
3327
3328	movdqa	%xmm0,%xmm3
3329	pslldq	$4,%xmm0
3330	pxor	%xmm0,%xmm3
3331	pslldq	$4,%xmm0
3332	pxor	%xmm0,%xmm3
3333	pslldq	$4,%xmm0
3334	pxor	%xmm3,%xmm0
3335
3336	pshufd	$0xff,%xmm0,%xmm3
3337	pxor	%xmm1,%xmm3
3338	pslldq	$4,%xmm1
3339	pxor	%xmm1,%xmm3
3340
3341	pxor	%xmm2,%xmm0
3342	pxor	%xmm3,%xmm2
3343	movdqu	%xmm0,-16(%rax)
3344
3345	decl	%r10d
3346	jnz	.Loop_key192
3347
3348	movl	%esi,32(%rax)
3349	xorl	%eax,%eax
3350	jmp	.Lenc_key_ret
3351
3352.align	16
3353.L14rounds:
3354	movups	16(%rdi),%xmm2
3355	movl	$13,%esi
3356	leaq	16(%rax),%rax
3357	cmpl	$268435456,%r10d
3358	je	.L14rounds_alt
3359
3360	movups	%xmm0,(%rdx)
3361	movups	%xmm2,16(%rdx)
3362.byte	102,15,58,223,202,1
3363	call	.Lkey_expansion_256a_cold
3364.byte	102,15,58,223,200,1
3365	call	.Lkey_expansion_256b
3366.byte	102,15,58,223,202,2
3367	call	.Lkey_expansion_256a
3368.byte	102,15,58,223,200,2
3369	call	.Lkey_expansion_256b
3370.byte	102,15,58,223,202,4
3371	call	.Lkey_expansion_256a
3372.byte	102,15,58,223,200,4
3373	call	.Lkey_expansion_256b
3374.byte	102,15,58,223,202,8
3375	call	.Lkey_expansion_256a
3376.byte	102,15,58,223,200,8
3377	call	.Lkey_expansion_256b
3378.byte	102,15,58,223,202,16
3379	call	.Lkey_expansion_256a
3380.byte	102,15,58,223,200,16
3381	call	.Lkey_expansion_256b
3382.byte	102,15,58,223,202,32
3383	call	.Lkey_expansion_256a
3384.byte	102,15,58,223,200,32
3385	call	.Lkey_expansion_256b
3386.byte	102,15,58,223,202,64
3387	call	.Lkey_expansion_256a
3388	movups	%xmm0,(%rax)
3389	movl	%esi,16(%rax)
3390	xorq	%rax,%rax
3391	jmp	.Lenc_key_ret
3392
3393.align	16
3394.L14rounds_alt:
3395	movdqa	.Lkey_rotate(%rip),%xmm5
3396	movdqa	.Lkey_rcon1(%rip),%xmm4
3397	movl	$7,%r10d
3398	movdqu	%xmm0,0(%rdx)
3399	movdqa	%xmm2,%xmm1
3400	movdqu	%xmm2,16(%rdx)
3401	jmp	.Loop_key256
3402
3403.align	16
3404.Loop_key256:
3405.byte	102,15,56,0,213
3406.byte	102,15,56,221,212
3407
3408	movdqa	%xmm0,%xmm3
3409	pslldq	$4,%xmm0
3410	pxor	%xmm0,%xmm3
3411	pslldq	$4,%xmm0
3412	pxor	%xmm0,%xmm3
3413	pslldq	$4,%xmm0
3414	pxor	%xmm3,%xmm0
3415	pslld	$1,%xmm4
3416
3417	pxor	%xmm2,%xmm0
3418	movdqu	%xmm0,(%rax)
3419
3420	decl	%r10d
3421	jz	.Ldone_key256
3422
3423	pshufd	$0xff,%xmm0,%xmm2
3424	pxor	%xmm3,%xmm3
3425.byte	102,15,56,221,211
3426
3427	movdqa	%xmm1,%xmm3
3428	pslldq	$4,%xmm1
3429	pxor	%xmm1,%xmm3
3430	pslldq	$4,%xmm1
3431	pxor	%xmm1,%xmm3
3432	pslldq	$4,%xmm1
3433	pxor	%xmm3,%xmm1
3434
3435	pxor	%xmm1,%xmm2
3436	movdqu	%xmm2,16(%rax)
3437	leaq	32(%rax),%rax
3438	movdqa	%xmm2,%xmm1
3439
3440	jmp	.Loop_key256
3441
3442.Ldone_key256:
3443	movl	%esi,16(%rax)
3444	xorl	%eax,%eax
3445	jmp	.Lenc_key_ret
3446
3447.align	16
3448.Lbad_keybits:
3449	movq	$-2,%rax
3450.Lenc_key_ret:
3451	pxor	%xmm0,%xmm0
3452	pxor	%xmm1,%xmm1
3453	pxor	%xmm2,%xmm2
3454	pxor	%xmm3,%xmm3
3455	pxor	%xmm4,%xmm4
3456	pxor	%xmm5,%xmm5
3457	addq	$8,%rsp
3458	.byte	0xf3,0xc3
3459.LSEH_end_set_encrypt_key:
3460
3461.align	16
3462.Lkey_expansion_128:
3463	movups	%xmm0,(%rax)
3464	leaq	16(%rax),%rax
3465.Lkey_expansion_128_cold:
3466	shufps	$16,%xmm0,%xmm4
3467	xorps	%xmm4,%xmm0
3468	shufps	$140,%xmm0,%xmm4
3469	xorps	%xmm4,%xmm0
3470	shufps	$255,%xmm1,%xmm1
3471	xorps	%xmm1,%xmm0
3472	.byte	0xf3,0xc3
3473
3474.align	16
3475.Lkey_expansion_192a:
3476	movups	%xmm0,(%rax)
3477	leaq	16(%rax),%rax
3478.Lkey_expansion_192a_cold:
3479	movaps	%xmm2,%xmm5
3480.Lkey_expansion_192b_warm:
3481	shufps	$16,%xmm0,%xmm4
3482	movdqa	%xmm2,%xmm3
3483	xorps	%xmm4,%xmm0
3484	shufps	$140,%xmm0,%xmm4
3485	pslldq	$4,%xmm3
3486	xorps	%xmm4,%xmm0
3487	pshufd	$85,%xmm1,%xmm1
3488	pxor	%xmm3,%xmm2
3489	pxor	%xmm1,%xmm0
3490	pshufd	$255,%xmm0,%xmm3
3491	pxor	%xmm3,%xmm2
3492	.byte	0xf3,0xc3
3493
3494.align	16
3495.Lkey_expansion_192b:
3496	movaps	%xmm0,%xmm3
3497	shufps	$68,%xmm0,%xmm5
3498	movups	%xmm5,(%rax)
3499	shufps	$78,%xmm2,%xmm3
3500	movups	%xmm3,16(%rax)
3501	leaq	32(%rax),%rax
3502	jmp	.Lkey_expansion_192b_warm
3503
3504.align	16
3505.Lkey_expansion_256a:
3506	movups	%xmm2,(%rax)
3507	leaq	16(%rax),%rax
3508.Lkey_expansion_256a_cold:
3509	shufps	$16,%xmm0,%xmm4
3510	xorps	%xmm4,%xmm0
3511	shufps	$140,%xmm0,%xmm4
3512	xorps	%xmm4,%xmm0
3513	shufps	$255,%xmm1,%xmm1
3514	xorps	%xmm1,%xmm0
3515	.byte	0xf3,0xc3
3516
3517.align	16
3518.Lkey_expansion_256b:
3519	movups	%xmm0,(%rax)
3520	leaq	16(%rax),%rax
3521
3522	shufps	$16,%xmm2,%xmm4
3523	xorps	%xmm4,%xmm2
3524	shufps	$140,%xmm2,%xmm4
3525	xorps	%xmm4,%xmm2
3526	shufps	$170,%xmm1,%xmm1
3527	xorps	%xmm1,%xmm2
3528	.byte	0xf3,0xc3
3529.size	aesni_set_encrypt_key,.-aesni_set_encrypt_key
3530.size	__aesni_set_encrypt_key,.-__aesni_set_encrypt_key
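# Constant pool: .Lbswap_mask is the byte-reversal shuffle for pshufb,
# .Lincrement32/.Lincrement64/.Lincrement1 are counter increments used by
# the CTR-mode code elsewhere in this file, .Lxts_magic holds the GF(2^128)
# feedback value 0x87 for XTS tweak doubling, and .Lkey_rotate*/.Lkey_rcon*
# feed the alternative key-schedule paths above.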
3531.align	64
3532.Lbswap_mask:
3533.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
3534.Lincrement32:
3535.long	6,6,6,0
3536.Lincrement64:
3537.long	1,0,0,0
3538.Lxts_magic:
3539.long	0x87,0,1,0
3540.Lincrement1:
3541.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3542.Lkey_rotate:
3543.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
3544.Lkey_rotate192:
3545.long	0x04070605,0x04070605,0x04070605,0x04070605
3546.Lkey_rcon1:
3547.long	1,1,1,1
3548.Lkey_rcon1b:
3549.long	0x1b,0x1b,0x1b,0x1b
3550
3551.byte	65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
3552.align	64
3553