1/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/aesni-x86_64.S 305153 2016-08-31 20:33:59Z jkim $ */
2/* Do not modify. This file is auto-generated from aesni-x86_64.pl. */
3.text
4
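/*
 * aesni_encrypt and aesni_decrypt below process a single 16-byte block
 * with the AES-NI instructions.  Judging from the code (and presumably
 * matching OpenSSL's C prototypes), %rdi is the input block, %rsi the
 * output block and %rdx the expanded AES_KEY, whose round count sits at
 * offset 240.  The .byte sequences are hand-encoded aesenc/aesenclast
 * (and aesdec/aesdeclast) forms for assemblers without AES-NI
 * mnemonics, and the trailing pxor instructions scrub key material from
 * the XMM registers before returning.
 */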
5.globl	aesni_encrypt
6.type	aesni_encrypt,@function
7.align	16
8aesni_encrypt:
9	movups	(%rdi),%xmm2
10	movl	240(%rdx),%eax
11	movups	(%rdx),%xmm0
12	movups	16(%rdx),%xmm1
13	leaq	32(%rdx),%rdx
14	xorps	%xmm0,%xmm2
15.Loop_enc1_1:
16.byte	102,15,56,220,209
17	decl	%eax
18	movups	(%rdx),%xmm1
19	leaq	16(%rdx),%rdx
20	jnz	.Loop_enc1_1
21.byte	102,15,56,221,209
22	pxor	%xmm0,%xmm0
23	pxor	%xmm1,%xmm1
24	movups	%xmm2,(%rsi)
25	pxor	%xmm2,%xmm2
26	.byte	0xf3,0xc3
27.size	aesni_encrypt,.-aesni_encrypt
28
29.globl	aesni_decrypt
30.type	aesni_decrypt,@function
31.align	16
32aesni_decrypt:
33	movups	(%rdi),%xmm2
34	movl	240(%rdx),%eax
35	movups	(%rdx),%xmm0
36	movups	16(%rdx),%xmm1
37	leaq	32(%rdx),%rdx
38	xorps	%xmm0,%xmm2
39.Loop_dec1_2:
40.byte	102,15,56,222,209
41	decl	%eax
42	movups	(%rdx),%xmm1
43	leaq	16(%rdx),%rdx
44	jnz	.Loop_dec1_2
45.byte	102,15,56,223,209
46	pxor	%xmm0,%xmm0
47	pxor	%xmm1,%xmm1
48	movups	%xmm2,(%rsi)
49	pxor	%xmm2,%xmm2
50	.byte	0xf3,0xc3
51.size	aesni_decrypt,.-aesni_decrypt
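/*
 * _aesni_encrypt{2,3,4,6,8} and _aesni_decrypt{2,3,4,6,8} are local
 * helpers that pipeline two to eight independent blocks through the
 * AES rounds.  The convention used here is %rcx = key schedule,
 * %eax = round count, blocks in %xmm2..%xmm9, with %xmm0/%xmm1
 * clobbered by round keys.  Interleaving several aesenc/aesdec chains
 * helps hide the instruction latency; the stray 0x0f,0x1f,0x00 bytes
 * appear to be multi-byte NOPs inserted for code alignment.
 */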
52.type	_aesni_encrypt2,@function
53.align	16
54_aesni_encrypt2:
55	movups	(%rcx),%xmm0
56	shll	$4,%eax
57	movups	16(%rcx),%xmm1
58	xorps	%xmm0,%xmm2
59	xorps	%xmm0,%xmm3
60	movups	32(%rcx),%xmm0
61	leaq	32(%rcx,%rax,1),%rcx
62	negq	%rax
63	addq	$16,%rax
64
65.Lenc_loop2:
66.byte	102,15,56,220,209
67.byte	102,15,56,220,217
68	movups	(%rcx,%rax,1),%xmm1
69	addq	$32,%rax
70.byte	102,15,56,220,208
71.byte	102,15,56,220,216
72	movups	-16(%rcx,%rax,1),%xmm0
73	jnz	.Lenc_loop2
74
75.byte	102,15,56,220,209
76.byte	102,15,56,220,217
77.byte	102,15,56,221,208
78.byte	102,15,56,221,216
79	.byte	0xf3,0xc3
80.size	_aesni_encrypt2,.-_aesni_encrypt2
81.type	_aesni_decrypt2,@function
82.align	16
83_aesni_decrypt2:
84	movups	(%rcx),%xmm0
85	shll	$4,%eax
86	movups	16(%rcx),%xmm1
87	xorps	%xmm0,%xmm2
88	xorps	%xmm0,%xmm3
89	movups	32(%rcx),%xmm0
90	leaq	32(%rcx,%rax,1),%rcx
91	negq	%rax
92	addq	$16,%rax
93
94.Ldec_loop2:
95.byte	102,15,56,222,209
96.byte	102,15,56,222,217
97	movups	(%rcx,%rax,1),%xmm1
98	addq	$32,%rax
99.byte	102,15,56,222,208
100.byte	102,15,56,222,216
101	movups	-16(%rcx,%rax,1),%xmm0
102	jnz	.Ldec_loop2
103
104.byte	102,15,56,222,209
105.byte	102,15,56,222,217
106.byte	102,15,56,223,208
107.byte	102,15,56,223,216
108	.byte	0xf3,0xc3
109.size	_aesni_decrypt2,.-_aesni_decrypt2
110.type	_aesni_encrypt3,@function
111.align	16
112_aesni_encrypt3:
113	movups	(%rcx),%xmm0
114	shll	$4,%eax
115	movups	16(%rcx),%xmm1
116	xorps	%xmm0,%xmm2
117	xorps	%xmm0,%xmm3
118	xorps	%xmm0,%xmm4
119	movups	32(%rcx),%xmm0
120	leaq	32(%rcx,%rax,1),%rcx
121	negq	%rax
122	addq	$16,%rax
123
124.Lenc_loop3:
125.byte	102,15,56,220,209
126.byte	102,15,56,220,217
127.byte	102,15,56,220,225
128	movups	(%rcx,%rax,1),%xmm1
129	addq	$32,%rax
130.byte	102,15,56,220,208
131.byte	102,15,56,220,216
132.byte	102,15,56,220,224
133	movups	-16(%rcx,%rax,1),%xmm0
134	jnz	.Lenc_loop3
135
136.byte	102,15,56,220,209
137.byte	102,15,56,220,217
138.byte	102,15,56,220,225
139.byte	102,15,56,221,208
140.byte	102,15,56,221,216
141.byte	102,15,56,221,224
142	.byte	0xf3,0xc3
143.size	_aesni_encrypt3,.-_aesni_encrypt3
144.type	_aesni_decrypt3,@function
145.align	16
146_aesni_decrypt3:
147	movups	(%rcx),%xmm0
148	shll	$4,%eax
149	movups	16(%rcx),%xmm1
150	xorps	%xmm0,%xmm2
151	xorps	%xmm0,%xmm3
152	xorps	%xmm0,%xmm4
153	movups	32(%rcx),%xmm0
154	leaq	32(%rcx,%rax,1),%rcx
155	negq	%rax
156	addq	$16,%rax
157
158.Ldec_loop3:
159.byte	102,15,56,222,209
160.byte	102,15,56,222,217
161.byte	102,15,56,222,225
162	movups	(%rcx,%rax,1),%xmm1
163	addq	$32,%rax
164.byte	102,15,56,222,208
165.byte	102,15,56,222,216
166.byte	102,15,56,222,224
167	movups	-16(%rcx,%rax,1),%xmm0
168	jnz	.Ldec_loop3
169
170.byte	102,15,56,222,209
171.byte	102,15,56,222,217
172.byte	102,15,56,222,225
173.byte	102,15,56,223,208
174.byte	102,15,56,223,216
175.byte	102,15,56,223,224
176	.byte	0xf3,0xc3
177.size	_aesni_decrypt3,.-_aesni_decrypt3
178.type	_aesni_encrypt4,@function
179.align	16
180_aesni_encrypt4:
181	movups	(%rcx),%xmm0
182	shll	$4,%eax
183	movups	16(%rcx),%xmm1
184	xorps	%xmm0,%xmm2
185	xorps	%xmm0,%xmm3
186	xorps	%xmm0,%xmm4
187	xorps	%xmm0,%xmm5
188	movups	32(%rcx),%xmm0
189	leaq	32(%rcx,%rax,1),%rcx
190	negq	%rax
191.byte	0x0f,0x1f,0x00
192	addq	$16,%rax
193
194.Lenc_loop4:
195.byte	102,15,56,220,209
196.byte	102,15,56,220,217
197.byte	102,15,56,220,225
198.byte	102,15,56,220,233
199	movups	(%rcx,%rax,1),%xmm1
200	addq	$32,%rax
201.byte	102,15,56,220,208
202.byte	102,15,56,220,216
203.byte	102,15,56,220,224
204.byte	102,15,56,220,232
205	movups	-16(%rcx,%rax,1),%xmm0
206	jnz	.Lenc_loop4
207
208.byte	102,15,56,220,209
209.byte	102,15,56,220,217
210.byte	102,15,56,220,225
211.byte	102,15,56,220,233
212.byte	102,15,56,221,208
213.byte	102,15,56,221,216
214.byte	102,15,56,221,224
215.byte	102,15,56,221,232
216	.byte	0xf3,0xc3
217.size	_aesni_encrypt4,.-_aesni_encrypt4
218.type	_aesni_decrypt4,@function
219.align	16
220_aesni_decrypt4:
221	movups	(%rcx),%xmm0
222	shll	$4,%eax
223	movups	16(%rcx),%xmm1
224	xorps	%xmm0,%xmm2
225	xorps	%xmm0,%xmm3
226	xorps	%xmm0,%xmm4
227	xorps	%xmm0,%xmm5
228	movups	32(%rcx),%xmm0
229	leaq	32(%rcx,%rax,1),%rcx
230	negq	%rax
231.byte	0x0f,0x1f,0x00
232	addq	$16,%rax
233
234.Ldec_loop4:
235.byte	102,15,56,222,209
236.byte	102,15,56,222,217
237.byte	102,15,56,222,225
238.byte	102,15,56,222,233
239	movups	(%rcx,%rax,1),%xmm1
240	addq	$32,%rax
241.byte	102,15,56,222,208
242.byte	102,15,56,222,216
243.byte	102,15,56,222,224
244.byte	102,15,56,222,232
245	movups	-16(%rcx,%rax,1),%xmm0
246	jnz	.Ldec_loop4
247
248.byte	102,15,56,222,209
249.byte	102,15,56,222,217
250.byte	102,15,56,222,225
251.byte	102,15,56,222,233
252.byte	102,15,56,223,208
253.byte	102,15,56,223,216
254.byte	102,15,56,223,224
255.byte	102,15,56,223,232
256	.byte	0xf3,0xc3
257.size	_aesni_decrypt4,.-_aesni_decrypt4
258.type	_aesni_encrypt6,@function
259.align	16
260_aesni_encrypt6:
261	movups	(%rcx),%xmm0
262	shll	$4,%eax
263	movups	16(%rcx),%xmm1
264	xorps	%xmm0,%xmm2
265	pxor	%xmm0,%xmm3
266	pxor	%xmm0,%xmm4
267.byte	102,15,56,220,209
268	leaq	32(%rcx,%rax,1),%rcx
269	negq	%rax
270.byte	102,15,56,220,217
271	pxor	%xmm0,%xmm5
272	pxor	%xmm0,%xmm6
273.byte	102,15,56,220,225
274	pxor	%xmm0,%xmm7
275	movups	(%rcx,%rax,1),%xmm0
276	addq	$16,%rax
277	jmp	.Lenc_loop6_enter
278.align	16
279.Lenc_loop6:
280.byte	102,15,56,220,209
281.byte	102,15,56,220,217
282.byte	102,15,56,220,225
283.Lenc_loop6_enter:
284.byte	102,15,56,220,233
285.byte	102,15,56,220,241
286.byte	102,15,56,220,249
287	movups	(%rcx,%rax,1),%xmm1
288	addq	$32,%rax
289.byte	102,15,56,220,208
290.byte	102,15,56,220,216
291.byte	102,15,56,220,224
292.byte	102,15,56,220,232
293.byte	102,15,56,220,240
294.byte	102,15,56,220,248
295	movups	-16(%rcx,%rax,1),%xmm0
296	jnz	.Lenc_loop6
297
298.byte	102,15,56,220,209
299.byte	102,15,56,220,217
300.byte	102,15,56,220,225
301.byte	102,15,56,220,233
302.byte	102,15,56,220,241
303.byte	102,15,56,220,249
304.byte	102,15,56,221,208
305.byte	102,15,56,221,216
306.byte	102,15,56,221,224
307.byte	102,15,56,221,232
308.byte	102,15,56,221,240
309.byte	102,15,56,221,248
310	.byte	0xf3,0xc3
311.size	_aesni_encrypt6,.-_aesni_encrypt6
312.type	_aesni_decrypt6,@function
313.align	16
314_aesni_decrypt6:
315	movups	(%rcx),%xmm0
316	shll	$4,%eax
317	movups	16(%rcx),%xmm1
318	xorps	%xmm0,%xmm2
319	pxor	%xmm0,%xmm3
320	pxor	%xmm0,%xmm4
321.byte	102,15,56,222,209
322	leaq	32(%rcx,%rax,1),%rcx
323	negq	%rax
324.byte	102,15,56,222,217
325	pxor	%xmm0,%xmm5
326	pxor	%xmm0,%xmm6
327.byte	102,15,56,222,225
328	pxor	%xmm0,%xmm7
329	movups	(%rcx,%rax,1),%xmm0
330	addq	$16,%rax
331	jmp	.Ldec_loop6_enter
332.align	16
333.Ldec_loop6:
334.byte	102,15,56,222,209
335.byte	102,15,56,222,217
336.byte	102,15,56,222,225
337.Ldec_loop6_enter:
338.byte	102,15,56,222,233
339.byte	102,15,56,222,241
340.byte	102,15,56,222,249
341	movups	(%rcx,%rax,1),%xmm1
342	addq	$32,%rax
343.byte	102,15,56,222,208
344.byte	102,15,56,222,216
345.byte	102,15,56,222,224
346.byte	102,15,56,222,232
347.byte	102,15,56,222,240
348.byte	102,15,56,222,248
349	movups	-16(%rcx,%rax,1),%xmm0
350	jnz	.Ldec_loop6
351
352.byte	102,15,56,222,209
353.byte	102,15,56,222,217
354.byte	102,15,56,222,225
355.byte	102,15,56,222,233
356.byte	102,15,56,222,241
357.byte	102,15,56,222,249
358.byte	102,15,56,223,208
359.byte	102,15,56,223,216
360.byte	102,15,56,223,224
361.byte	102,15,56,223,232
362.byte	102,15,56,223,240
363.byte	102,15,56,223,248
364	.byte	0xf3,0xc3
365.size	_aesni_decrypt6,.-_aesni_decrypt6
366.type	_aesni_encrypt8,@function
367.align	16
368_aesni_encrypt8:
369	movups	(%rcx),%xmm0
370	shll	$4,%eax
371	movups	16(%rcx),%xmm1
372	xorps	%xmm0,%xmm2
373	xorps	%xmm0,%xmm3
374	pxor	%xmm0,%xmm4
375	pxor	%xmm0,%xmm5
376	pxor	%xmm0,%xmm6
377	leaq	32(%rcx,%rax,1),%rcx
378	negq	%rax
379.byte	102,15,56,220,209
380	pxor	%xmm0,%xmm7
381	pxor	%xmm0,%xmm8
382.byte	102,15,56,220,217
383	pxor	%xmm0,%xmm9
384	movups	(%rcx,%rax,1),%xmm0
385	addq	$16,%rax
386	jmp	.Lenc_loop8_inner
387.align	16
388.Lenc_loop8:
389.byte	102,15,56,220,209
390.byte	102,15,56,220,217
391.Lenc_loop8_inner:
392.byte	102,15,56,220,225
393.byte	102,15,56,220,233
394.byte	102,15,56,220,241
395.byte	102,15,56,220,249
396.byte	102,68,15,56,220,193
397.byte	102,68,15,56,220,201
398.Lenc_loop8_enter:
399	movups	(%rcx,%rax,1),%xmm1
400	addq	$32,%rax
401.byte	102,15,56,220,208
402.byte	102,15,56,220,216
403.byte	102,15,56,220,224
404.byte	102,15,56,220,232
405.byte	102,15,56,220,240
406.byte	102,15,56,220,248
407.byte	102,68,15,56,220,192
408.byte	102,68,15,56,220,200
409	movups	-16(%rcx,%rax,1),%xmm0
410	jnz	.Lenc_loop8
411
412.byte	102,15,56,220,209
413.byte	102,15,56,220,217
414.byte	102,15,56,220,225
415.byte	102,15,56,220,233
416.byte	102,15,56,220,241
417.byte	102,15,56,220,249
418.byte	102,68,15,56,220,193
419.byte	102,68,15,56,220,201
420.byte	102,15,56,221,208
421.byte	102,15,56,221,216
422.byte	102,15,56,221,224
423.byte	102,15,56,221,232
424.byte	102,15,56,221,240
425.byte	102,15,56,221,248
426.byte	102,68,15,56,221,192
427.byte	102,68,15,56,221,200
428	.byte	0xf3,0xc3
429.size	_aesni_encrypt8,.-_aesni_encrypt8
430.type	_aesni_decrypt8,@function
431.align	16
432_aesni_decrypt8:
433	movups	(%rcx),%xmm0
434	shll	$4,%eax
435	movups	16(%rcx),%xmm1
436	xorps	%xmm0,%xmm2
437	xorps	%xmm0,%xmm3
438	pxor	%xmm0,%xmm4
439	pxor	%xmm0,%xmm5
440	pxor	%xmm0,%xmm6
441	leaq	32(%rcx,%rax,1),%rcx
442	negq	%rax
443.byte	102,15,56,222,209
444	pxor	%xmm0,%xmm7
445	pxor	%xmm0,%xmm8
446.byte	102,15,56,222,217
447	pxor	%xmm0,%xmm9
448	movups	(%rcx,%rax,1),%xmm0
449	addq	$16,%rax
450	jmp	.Ldec_loop8_inner
451.align	16
452.Ldec_loop8:
453.byte	102,15,56,222,209
454.byte	102,15,56,222,217
455.Ldec_loop8_inner:
456.byte	102,15,56,222,225
457.byte	102,15,56,222,233
458.byte	102,15,56,222,241
459.byte	102,15,56,222,249
460.byte	102,68,15,56,222,193
461.byte	102,68,15,56,222,201
462.Ldec_loop8_enter:
463	movups	(%rcx,%rax,1),%xmm1
464	addq	$32,%rax
465.byte	102,15,56,222,208
466.byte	102,15,56,222,216
467.byte	102,15,56,222,224
468.byte	102,15,56,222,232
469.byte	102,15,56,222,240
470.byte	102,15,56,222,248
471.byte	102,68,15,56,222,192
472.byte	102,68,15,56,222,200
473	movups	-16(%rcx,%rax,1),%xmm0
474	jnz	.Ldec_loop8
475
476.byte	102,15,56,222,209
477.byte	102,15,56,222,217
478.byte	102,15,56,222,225
479.byte	102,15,56,222,233
480.byte	102,15,56,222,241
481.byte	102,15,56,222,249
482.byte	102,68,15,56,222,193
483.byte	102,68,15,56,222,201
484.byte	102,15,56,223,208
485.byte	102,15,56,223,216
486.byte	102,15,56,223,224
487.byte	102,15,56,223,232
488.byte	102,15,56,223,240
489.byte	102,15,56,223,248
490.byte	102,68,15,56,223,192
491.byte	102,68,15,56,223,200
492	.byte	0xf3,0xc3
493.size	_aesni_decrypt8,.-_aesni_decrypt8
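/*
 * aesni_ecb_encrypt(in, out, length, key, enc), presumably with the
 * usual SysV mapping %rdi/%rsi/%rdx/%rcx/%r8d.  The length is rounded
 * down to a whole number of blocks, the bulk loop runs eight blocks per
 * iteration through _aesni_encrypt8 or _aesni_decrypt8, and the tail
 * falls through to the one- to seven-block cases below.
 */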
494.globl	aesni_ecb_encrypt
495.type	aesni_ecb_encrypt,@function
496.align	16
497aesni_ecb_encrypt:
498	andq	$-16,%rdx
499	jz	.Lecb_ret
500
501	movl	240(%rcx),%eax
502	movups	(%rcx),%xmm0
503	movq	%rcx,%r11
504	movl	%eax,%r10d
505	testl	%r8d,%r8d
506	jz	.Lecb_decrypt
507
508	cmpq	$0x80,%rdx
509	jb	.Lecb_enc_tail
510
511	movdqu	(%rdi),%xmm2
512	movdqu	16(%rdi),%xmm3
513	movdqu	32(%rdi),%xmm4
514	movdqu	48(%rdi),%xmm5
515	movdqu	64(%rdi),%xmm6
516	movdqu	80(%rdi),%xmm7
517	movdqu	96(%rdi),%xmm8
518	movdqu	112(%rdi),%xmm9
519	leaq	128(%rdi),%rdi
520	subq	$0x80,%rdx
521	jmp	.Lecb_enc_loop8_enter
522.align	16
523.Lecb_enc_loop8:
524	movups	%xmm2,(%rsi)
525	movq	%r11,%rcx
526	movdqu	(%rdi),%xmm2
527	movl	%r10d,%eax
528	movups	%xmm3,16(%rsi)
529	movdqu	16(%rdi),%xmm3
530	movups	%xmm4,32(%rsi)
531	movdqu	32(%rdi),%xmm4
532	movups	%xmm5,48(%rsi)
533	movdqu	48(%rdi),%xmm5
534	movups	%xmm6,64(%rsi)
535	movdqu	64(%rdi),%xmm6
536	movups	%xmm7,80(%rsi)
537	movdqu	80(%rdi),%xmm7
538	movups	%xmm8,96(%rsi)
539	movdqu	96(%rdi),%xmm8
540	movups	%xmm9,112(%rsi)
541	leaq	128(%rsi),%rsi
542	movdqu	112(%rdi),%xmm9
543	leaq	128(%rdi),%rdi
544.Lecb_enc_loop8_enter:
545
546	call	_aesni_encrypt8
547
548	subq	$0x80,%rdx
549	jnc	.Lecb_enc_loop8
550
551	movups	%xmm2,(%rsi)
552	movq	%r11,%rcx
553	movups	%xmm3,16(%rsi)
554	movl	%r10d,%eax
555	movups	%xmm4,32(%rsi)
556	movups	%xmm5,48(%rsi)
557	movups	%xmm6,64(%rsi)
558	movups	%xmm7,80(%rsi)
559	movups	%xmm8,96(%rsi)
560	movups	%xmm9,112(%rsi)
561	leaq	128(%rsi),%rsi
562	addq	$0x80,%rdx
563	jz	.Lecb_ret
564
565.Lecb_enc_tail:
566	movups	(%rdi),%xmm2
567	cmpq	$0x20,%rdx
568	jb	.Lecb_enc_one
569	movups	16(%rdi),%xmm3
570	je	.Lecb_enc_two
571	movups	32(%rdi),%xmm4
572	cmpq	$0x40,%rdx
573	jb	.Lecb_enc_three
574	movups	48(%rdi),%xmm5
575	je	.Lecb_enc_four
576	movups	64(%rdi),%xmm6
577	cmpq	$0x60,%rdx
578	jb	.Lecb_enc_five
579	movups	80(%rdi),%xmm7
580	je	.Lecb_enc_six
581	movdqu	96(%rdi),%xmm8
582	xorps	%xmm9,%xmm9
583	call	_aesni_encrypt8
584	movups	%xmm2,(%rsi)
585	movups	%xmm3,16(%rsi)
586	movups	%xmm4,32(%rsi)
587	movups	%xmm5,48(%rsi)
588	movups	%xmm6,64(%rsi)
589	movups	%xmm7,80(%rsi)
590	movups	%xmm8,96(%rsi)
591	jmp	.Lecb_ret
592.align	16
593.Lecb_enc_one:
594	movups	(%rcx),%xmm0
595	movups	16(%rcx),%xmm1
596	leaq	32(%rcx),%rcx
597	xorps	%xmm0,%xmm2
598.Loop_enc1_3:
599.byte	102,15,56,220,209
600	decl	%eax
601	movups	(%rcx),%xmm1
602	leaq	16(%rcx),%rcx
603	jnz	.Loop_enc1_3
604.byte	102,15,56,221,209
605	movups	%xmm2,(%rsi)
606	jmp	.Lecb_ret
607.align	16
608.Lecb_enc_two:
609	call	_aesni_encrypt2
610	movups	%xmm2,(%rsi)
611	movups	%xmm3,16(%rsi)
612	jmp	.Lecb_ret
613.align	16
614.Lecb_enc_three:
615	call	_aesni_encrypt3
616	movups	%xmm2,(%rsi)
617	movups	%xmm3,16(%rsi)
618	movups	%xmm4,32(%rsi)
619	jmp	.Lecb_ret
620.align	16
621.Lecb_enc_four:
622	call	_aesni_encrypt4
623	movups	%xmm2,(%rsi)
624	movups	%xmm3,16(%rsi)
625	movups	%xmm4,32(%rsi)
626	movups	%xmm5,48(%rsi)
627	jmp	.Lecb_ret
628.align	16
629.Lecb_enc_five:
630	xorps	%xmm7,%xmm7
631	call	_aesni_encrypt6
632	movups	%xmm2,(%rsi)
633	movups	%xmm3,16(%rsi)
634	movups	%xmm4,32(%rsi)
635	movups	%xmm5,48(%rsi)
636	movups	%xmm6,64(%rsi)
637	jmp	.Lecb_ret
638.align	16
639.Lecb_enc_six:
640	call	_aesni_encrypt6
641	movups	%xmm2,(%rsi)
642	movups	%xmm3,16(%rsi)
643	movups	%xmm4,32(%rsi)
644	movups	%xmm5,48(%rsi)
645	movups	%xmm6,64(%rsi)
646	movups	%xmm7,80(%rsi)
647	jmp	.Lecb_ret
648
649.align	16
650.Lecb_decrypt:
651	cmpq	$0x80,%rdx
652	jb	.Lecb_dec_tail
653
654	movdqu	(%rdi),%xmm2
655	movdqu	16(%rdi),%xmm3
656	movdqu	32(%rdi),%xmm4
657	movdqu	48(%rdi),%xmm5
658	movdqu	64(%rdi),%xmm6
659	movdqu	80(%rdi),%xmm7
660	movdqu	96(%rdi),%xmm8
661	movdqu	112(%rdi),%xmm9
662	leaq	128(%rdi),%rdi
663	subq	$0x80,%rdx
664	jmp	.Lecb_dec_loop8_enter
665.align	16
666.Lecb_dec_loop8:
667	movups	%xmm2,(%rsi)
668	movq	%r11,%rcx
669	movdqu	(%rdi),%xmm2
670	movl	%r10d,%eax
671	movups	%xmm3,16(%rsi)
672	movdqu	16(%rdi),%xmm3
673	movups	%xmm4,32(%rsi)
674	movdqu	32(%rdi),%xmm4
675	movups	%xmm5,48(%rsi)
676	movdqu	48(%rdi),%xmm5
677	movups	%xmm6,64(%rsi)
678	movdqu	64(%rdi),%xmm6
679	movups	%xmm7,80(%rsi)
680	movdqu	80(%rdi),%xmm7
681	movups	%xmm8,96(%rsi)
682	movdqu	96(%rdi),%xmm8
683	movups	%xmm9,112(%rsi)
684	leaq	128(%rsi),%rsi
685	movdqu	112(%rdi),%xmm9
686	leaq	128(%rdi),%rdi
687.Lecb_dec_loop8_enter:
688
689	call	_aesni_decrypt8
690
691	movups	(%r11),%xmm0
692	subq	$0x80,%rdx
693	jnc	.Lecb_dec_loop8
694
695	movups	%xmm2,(%rsi)
696	pxor	%xmm2,%xmm2
697	movq	%r11,%rcx
698	movups	%xmm3,16(%rsi)
699	pxor	%xmm3,%xmm3
700	movl	%r10d,%eax
701	movups	%xmm4,32(%rsi)
702	pxor	%xmm4,%xmm4
703	movups	%xmm5,48(%rsi)
704	pxor	%xmm5,%xmm5
705	movups	%xmm6,64(%rsi)
706	pxor	%xmm6,%xmm6
707	movups	%xmm7,80(%rsi)
708	pxor	%xmm7,%xmm7
709	movups	%xmm8,96(%rsi)
710	pxor	%xmm8,%xmm8
711	movups	%xmm9,112(%rsi)
712	pxor	%xmm9,%xmm9
713	leaq	128(%rsi),%rsi
714	addq	$0x80,%rdx
715	jz	.Lecb_ret
716
717.Lecb_dec_tail:
718	movups	(%rdi),%xmm2
719	cmpq	$0x20,%rdx
720	jb	.Lecb_dec_one
721	movups	16(%rdi),%xmm3
722	je	.Lecb_dec_two
723	movups	32(%rdi),%xmm4
724	cmpq	$0x40,%rdx
725	jb	.Lecb_dec_three
726	movups	48(%rdi),%xmm5
727	je	.Lecb_dec_four
728	movups	64(%rdi),%xmm6
729	cmpq	$0x60,%rdx
730	jb	.Lecb_dec_five
731	movups	80(%rdi),%xmm7
732	je	.Lecb_dec_six
733	movups	96(%rdi),%xmm8
734	movups	(%rcx),%xmm0
735	xorps	%xmm9,%xmm9
736	call	_aesni_decrypt8
737	movups	%xmm2,(%rsi)
738	pxor	%xmm2,%xmm2
739	movups	%xmm3,16(%rsi)
740	pxor	%xmm3,%xmm3
741	movups	%xmm4,32(%rsi)
742	pxor	%xmm4,%xmm4
743	movups	%xmm5,48(%rsi)
744	pxor	%xmm5,%xmm5
745	movups	%xmm6,64(%rsi)
746	pxor	%xmm6,%xmm6
747	movups	%xmm7,80(%rsi)
748	pxor	%xmm7,%xmm7
749	movups	%xmm8,96(%rsi)
750	pxor	%xmm8,%xmm8
751	pxor	%xmm9,%xmm9
752	jmp	.Lecb_ret
753.align	16
754.Lecb_dec_one:
755	movups	(%rcx),%xmm0
756	movups	16(%rcx),%xmm1
757	leaq	32(%rcx),%rcx
758	xorps	%xmm0,%xmm2
759.Loop_dec1_4:
760.byte	102,15,56,222,209
761	decl	%eax
762	movups	(%rcx),%xmm1
763	leaq	16(%rcx),%rcx
764	jnz	.Loop_dec1_4
765.byte	102,15,56,223,209
766	movups	%xmm2,(%rsi)
767	pxor	%xmm2,%xmm2
768	jmp	.Lecb_ret
769.align	16
770.Lecb_dec_two:
771	call	_aesni_decrypt2
772	movups	%xmm2,(%rsi)
773	pxor	%xmm2,%xmm2
774	movups	%xmm3,16(%rsi)
775	pxor	%xmm3,%xmm3
776	jmp	.Lecb_ret
777.align	16
778.Lecb_dec_three:
779	call	_aesni_decrypt3
780	movups	%xmm2,(%rsi)
781	pxor	%xmm2,%xmm2
782	movups	%xmm3,16(%rsi)
783	pxor	%xmm3,%xmm3
784	movups	%xmm4,32(%rsi)
785	pxor	%xmm4,%xmm4
786	jmp	.Lecb_ret
787.align	16
788.Lecb_dec_four:
789	call	_aesni_decrypt4
790	movups	%xmm2,(%rsi)
791	pxor	%xmm2,%xmm2
792	movups	%xmm3,16(%rsi)
793	pxor	%xmm3,%xmm3
794	movups	%xmm4,32(%rsi)
795	pxor	%xmm4,%xmm4
796	movups	%xmm5,48(%rsi)
797	pxor	%xmm5,%xmm5
798	jmp	.Lecb_ret
799.align	16
800.Lecb_dec_five:
801	xorps	%xmm7,%xmm7
802	call	_aesni_decrypt6
803	movups	%xmm2,(%rsi)
804	pxor	%xmm2,%xmm2
805	movups	%xmm3,16(%rsi)
806	pxor	%xmm3,%xmm3
807	movups	%xmm4,32(%rsi)
808	pxor	%xmm4,%xmm4
809	movups	%xmm5,48(%rsi)
810	pxor	%xmm5,%xmm5
811	movups	%xmm6,64(%rsi)
812	pxor	%xmm6,%xmm6
813	pxor	%xmm7,%xmm7
814	jmp	.Lecb_ret
815.align	16
816.Lecb_dec_six:
817	call	_aesni_decrypt6
818	movups	%xmm2,(%rsi)
819	pxor	%xmm2,%xmm2
820	movups	%xmm3,16(%rsi)
821	pxor	%xmm3,%xmm3
822	movups	%xmm4,32(%rsi)
823	pxor	%xmm4,%xmm4
824	movups	%xmm5,48(%rsi)
825	pxor	%xmm5,%xmm5
826	movups	%xmm6,64(%rsi)
827	pxor	%xmm6,%xmm6
828	movups	%xmm7,80(%rsi)
829	pxor	%xmm7,%xmm7
830
831.Lecb_ret:
832	xorps	%xmm0,%xmm0
833	pxor	%xmm1,%xmm1
834	.byte	0xf3,0xc3
835.size	aesni_ecb_encrypt,.-aesni_ecb_encrypt
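/*
 * aesni_ccm64_encrypt_blocks and aesni_ccm64_decrypt_blocks implement
 * CCM with a 64-bit counter (.Lincrement64), presumably with the
 * prototype (in, out, blocks, key, ivec, cmac): %rdi = in, %rsi = out,
 * %rdx = block count, %rcx = key, %r8 = counter block, %r9 = CMAC
 * block.  Each iteration encrypts the counter and folds the plaintext
 * into the CBC-MAC state, so the payload and the MAC are handled in a
 * single pass.
 */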
836.globl	aesni_ccm64_encrypt_blocks
837.type	aesni_ccm64_encrypt_blocks,@function
838.align	16
839aesni_ccm64_encrypt_blocks:
840	movl	240(%rcx),%eax
841	movdqu	(%r8),%xmm6
842	movdqa	.Lincrement64(%rip),%xmm9
843	movdqa	.Lbswap_mask(%rip),%xmm7
844
845	shll	$4,%eax
846	movl	$16,%r10d
847	leaq	0(%rcx),%r11
848	movdqu	(%r9),%xmm3
849	movdqa	%xmm6,%xmm2
850	leaq	32(%rcx,%rax,1),%rcx
851.byte	102,15,56,0,247
852	subq	%rax,%r10
853	jmp	.Lccm64_enc_outer
854.align	16
855.Lccm64_enc_outer:
856	movups	(%r11),%xmm0
857	movq	%r10,%rax
858	movups	(%rdi),%xmm8
859
860	xorps	%xmm0,%xmm2
861	movups	16(%r11),%xmm1
862	xorps	%xmm8,%xmm0
863	xorps	%xmm0,%xmm3
864	movups	32(%r11),%xmm0
865
866.Lccm64_enc2_loop:
867.byte	102,15,56,220,209
868.byte	102,15,56,220,217
869	movups	(%rcx,%rax,1),%xmm1
870	addq	$32,%rax
871.byte	102,15,56,220,208
872.byte	102,15,56,220,216
873	movups	-16(%rcx,%rax,1),%xmm0
874	jnz	.Lccm64_enc2_loop
875.byte	102,15,56,220,209
876.byte	102,15,56,220,217
877	paddq	%xmm9,%xmm6
878	decq	%rdx
879.byte	102,15,56,221,208
880.byte	102,15,56,221,216
881
882	leaq	16(%rdi),%rdi
883	xorps	%xmm2,%xmm8
884	movdqa	%xmm6,%xmm2
885	movups	%xmm8,(%rsi)
886.byte	102,15,56,0,215
887	leaq	16(%rsi),%rsi
888	jnz	.Lccm64_enc_outer
889
890	pxor	%xmm0,%xmm0
891	pxor	%xmm1,%xmm1
892	pxor	%xmm2,%xmm2
893	movups	%xmm3,(%r9)
894	pxor	%xmm3,%xmm3
895	pxor	%xmm8,%xmm8
896	pxor	%xmm6,%xmm6
897	.byte	0xf3,0xc3
898.size	aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
899.globl	aesni_ccm64_decrypt_blocks
900.type	aesni_ccm64_decrypt_blocks,@function
901.align	16
902aesni_ccm64_decrypt_blocks:
903	movl	240(%rcx),%eax
904	movups	(%r8),%xmm6
905	movdqu	(%r9),%xmm3
906	movdqa	.Lincrement64(%rip),%xmm9
907	movdqa	.Lbswap_mask(%rip),%xmm7
908
909	movaps	%xmm6,%xmm2
910	movl	%eax,%r10d
911	movq	%rcx,%r11
912.byte	102,15,56,0,247
913	movups	(%rcx),%xmm0
914	movups	16(%rcx),%xmm1
915	leaq	32(%rcx),%rcx
916	xorps	%xmm0,%xmm2
917.Loop_enc1_5:
918.byte	102,15,56,220,209
919	decl	%eax
920	movups	(%rcx),%xmm1
921	leaq	16(%rcx),%rcx
922	jnz	.Loop_enc1_5
923.byte	102,15,56,221,209
924	shll	$4,%r10d
925	movl	$16,%eax
926	movups	(%rdi),%xmm8
927	paddq	%xmm9,%xmm6
928	leaq	16(%rdi),%rdi
929	subq	%r10,%rax
930	leaq	32(%r11,%r10,1),%rcx
931	movq	%rax,%r10
932	jmp	.Lccm64_dec_outer
933.align	16
934.Lccm64_dec_outer:
935	xorps	%xmm2,%xmm8
936	movdqa	%xmm6,%xmm2
937	movups	%xmm8,(%rsi)
938	leaq	16(%rsi),%rsi
939.byte	102,15,56,0,215
940
941	subq	$1,%rdx
942	jz	.Lccm64_dec_break
943
944	movups	(%r11),%xmm0
945	movq	%r10,%rax
946	movups	16(%r11),%xmm1
947	xorps	%xmm0,%xmm8
948	xorps	%xmm0,%xmm2
949	xorps	%xmm8,%xmm3
950	movups	32(%r11),%xmm0
951	jmp	.Lccm64_dec2_loop
952.align	16
953.Lccm64_dec2_loop:
954.byte	102,15,56,220,209
955.byte	102,15,56,220,217
956	movups	(%rcx,%rax,1),%xmm1
957	addq	$32,%rax
958.byte	102,15,56,220,208
959.byte	102,15,56,220,216
960	movups	-16(%rcx,%rax,1),%xmm0
961	jnz	.Lccm64_dec2_loop
962	movups	(%rdi),%xmm8
963	paddq	%xmm9,%xmm6
964.byte	102,15,56,220,209
965.byte	102,15,56,220,217
966.byte	102,15,56,221,208
967.byte	102,15,56,221,216
968	leaq	16(%rdi),%rdi
969	jmp	.Lccm64_dec_outer
970
971.align	16
972.Lccm64_dec_break:
973
974	movl	240(%r11),%eax
975	movups	(%r11),%xmm0
976	movups	16(%r11),%xmm1
977	xorps	%xmm0,%xmm8
978	leaq	32(%r11),%r11
979	xorps	%xmm8,%xmm3
980.Loop_enc1_6:
981.byte	102,15,56,220,217
982	decl	%eax
983	movups	(%r11),%xmm1
984	leaq	16(%r11),%r11
985	jnz	.Loop_enc1_6
986.byte	102,15,56,221,217
987	pxor	%xmm0,%xmm0
988	pxor	%xmm1,%xmm1
989	pxor	%xmm2,%xmm2
990	movups	%xmm3,(%r9)
991	pxor	%xmm3,%xmm3
992	pxor	%xmm8,%xmm8
993	pxor	%xmm6,%xmm6
994	.byte	0xf3,0xc3
995.size	aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
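/*
 * aesni_ctr32_encrypt_blocks(in, out, blocks, key, ivec): CTR mode with
 * a 32-bit big-endian counter in the last word of the counter block
 * (%r8).  %rdi = in, %rsi = out, %rdx = block count, %rcx = key.  A
 * single block is special-cased up front; the bulk path keeps eight
 * pre-incremented counter blocks on the stack and, depending on what
 * OPENSSL_ia32cap_P reports, may switch to a six-block variant.
 */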
996.globl	aesni_ctr32_encrypt_blocks
997.type	aesni_ctr32_encrypt_blocks,@function
998.align	16
999aesni_ctr32_encrypt_blocks:
1000	cmpq	$1,%rdx
1001	jne	.Lctr32_bulk
1002
1003
1004
1005	movups	(%r8),%xmm2
1006	movups	(%rdi),%xmm3
1007	movl	240(%rcx),%edx
1008	movups	(%rcx),%xmm0
1009	movups	16(%rcx),%xmm1
1010	leaq	32(%rcx),%rcx
1011	xorps	%xmm0,%xmm2
1012.Loop_enc1_7:
1013.byte	102,15,56,220,209
1014	decl	%edx
1015	movups	(%rcx),%xmm1
1016	leaq	16(%rcx),%rcx
1017	jnz	.Loop_enc1_7
1018.byte	102,15,56,221,209
1019	pxor	%xmm0,%xmm0
1020	pxor	%xmm1,%xmm1
1021	xorps	%xmm3,%xmm2
1022	pxor	%xmm3,%xmm3
1023	movups	%xmm2,(%rsi)
1024	xorps	%xmm2,%xmm2
1025	jmp	.Lctr32_epilogue
1026
1027.align	16
1028.Lctr32_bulk:
1029	leaq	(%rsp),%rax
1030	pushq	%rbp
1031	subq	$128,%rsp
1032	andq	$-16,%rsp
1033	leaq	-8(%rax),%rbp
1034
1035
1036
1037
1038	movdqu	(%r8),%xmm2
1039	movdqu	(%rcx),%xmm0
1040	movl	12(%r8),%r8d
1041	pxor	%xmm0,%xmm2
1042	movl	12(%rcx),%r11d
1043	movdqa	%xmm2,0(%rsp)
1044	bswapl	%r8d
1045	movdqa	%xmm2,%xmm3
1046	movdqa	%xmm2,%xmm4
1047	movdqa	%xmm2,%xmm5
1048	movdqa	%xmm2,64(%rsp)
1049	movdqa	%xmm2,80(%rsp)
1050	movdqa	%xmm2,96(%rsp)
1051	movq	%rdx,%r10
1052	movdqa	%xmm2,112(%rsp)
1053
1054	leaq	1(%r8),%rax
1055	leaq	2(%r8),%rdx
1056	bswapl	%eax
1057	bswapl	%edx
1058	xorl	%r11d,%eax
1059	xorl	%r11d,%edx
1060.byte	102,15,58,34,216,3
1061	leaq	3(%r8),%rax
1062	movdqa	%xmm3,16(%rsp)
1063.byte	102,15,58,34,226,3
1064	bswapl	%eax
1065	movq	%r10,%rdx
1066	leaq	4(%r8),%r10
1067	movdqa	%xmm4,32(%rsp)
1068	xorl	%r11d,%eax
1069	bswapl	%r10d
1070.byte	102,15,58,34,232,3
1071	xorl	%r11d,%r10d
1072	movdqa	%xmm5,48(%rsp)
1073	leaq	5(%r8),%r9
1074	movl	%r10d,64+12(%rsp)
1075	bswapl	%r9d
1076	leaq	6(%r8),%r10
1077	movl	240(%rcx),%eax
1078	xorl	%r11d,%r9d
1079	bswapl	%r10d
1080	movl	%r9d,80+12(%rsp)
1081	xorl	%r11d,%r10d
1082	leaq	7(%r8),%r9
1083	movl	%r10d,96+12(%rsp)
1084	bswapl	%r9d
1085	movl	OPENSSL_ia32cap_P+4(%rip),%r10d
1086	xorl	%r11d,%r9d
1087	andl	$71303168,%r10d
1088	movl	%r9d,112+12(%rsp)
1089
1090	movups	16(%rcx),%xmm1
1091
1092	movdqa	64(%rsp),%xmm6
1093	movdqa	80(%rsp),%xmm7
1094
1095	cmpq	$8,%rdx
1096	jb	.Lctr32_tail
1097
1098	subq	$6,%rdx
1099	cmpl	$4194304,%r10d
1100	je	.Lctr32_6x
1101
1102	leaq	128(%rcx),%rcx
1103	subq	$2,%rdx
1104	jmp	.Lctr32_loop8
1105
1106.align	16
1107.Lctr32_6x:
1108	shll	$4,%eax
1109	movl	$48,%r10d
1110	bswapl	%r11d
1111	leaq	32(%rcx,%rax,1),%rcx
1112	subq	%rax,%r10
1113	jmp	.Lctr32_loop6
1114
1115.align	16
1116.Lctr32_loop6:
1117	addl	$6,%r8d
1118	movups	-48(%rcx,%r10,1),%xmm0
1119.byte	102,15,56,220,209
1120	movl	%r8d,%eax
1121	xorl	%r11d,%eax
1122.byte	102,15,56,220,217
1123.byte	0x0f,0x38,0xf1,0x44,0x24,12
1124	leal	1(%r8),%eax
1125.byte	102,15,56,220,225
1126	xorl	%r11d,%eax
1127.byte	0x0f,0x38,0xf1,0x44,0x24,28
1128.byte	102,15,56,220,233
1129	leal	2(%r8),%eax
1130	xorl	%r11d,%eax
1131.byte	102,15,56,220,241
1132.byte	0x0f,0x38,0xf1,0x44,0x24,44
1133	leal	3(%r8),%eax
1134.byte	102,15,56,220,249
1135	movups	-32(%rcx,%r10,1),%xmm1
1136	xorl	%r11d,%eax
1137
1138.byte	102,15,56,220,208
1139.byte	0x0f,0x38,0xf1,0x44,0x24,60
1140	leal	4(%r8),%eax
1141.byte	102,15,56,220,216
1142	xorl	%r11d,%eax
1143.byte	0x0f,0x38,0xf1,0x44,0x24,76
1144.byte	102,15,56,220,224
1145	leal	5(%r8),%eax
1146	xorl	%r11d,%eax
1147.byte	102,15,56,220,232
1148.byte	0x0f,0x38,0xf1,0x44,0x24,92
1149	movq	%r10,%rax
1150.byte	102,15,56,220,240
1151.byte	102,15,56,220,248
1152	movups	-16(%rcx,%r10,1),%xmm0
1153
1154	call	.Lenc_loop6
1155
1156	movdqu	(%rdi),%xmm8
1157	movdqu	16(%rdi),%xmm9
1158	movdqu	32(%rdi),%xmm10
1159	movdqu	48(%rdi),%xmm11
1160	movdqu	64(%rdi),%xmm12
1161	movdqu	80(%rdi),%xmm13
1162	leaq	96(%rdi),%rdi
1163	movups	-64(%rcx,%r10,1),%xmm1
1164	pxor	%xmm2,%xmm8
1165	movaps	0(%rsp),%xmm2
1166	pxor	%xmm3,%xmm9
1167	movaps	16(%rsp),%xmm3
1168	pxor	%xmm4,%xmm10
1169	movaps	32(%rsp),%xmm4
1170	pxor	%xmm5,%xmm11
1171	movaps	48(%rsp),%xmm5
1172	pxor	%xmm6,%xmm12
1173	movaps	64(%rsp),%xmm6
1174	pxor	%xmm7,%xmm13
1175	movaps	80(%rsp),%xmm7
1176	movdqu	%xmm8,(%rsi)
1177	movdqu	%xmm9,16(%rsi)
1178	movdqu	%xmm10,32(%rsi)
1179	movdqu	%xmm11,48(%rsi)
1180	movdqu	%xmm12,64(%rsi)
1181	movdqu	%xmm13,80(%rsi)
1182	leaq	96(%rsi),%rsi
1183
1184	subq	$6,%rdx
1185	jnc	.Lctr32_loop6
1186
1187	addq	$6,%rdx
1188	jz	.Lctr32_done
1189
1190	leal	-48(%r10),%eax
1191	leaq	-80(%rcx,%r10,1),%rcx
1192	negl	%eax
1193	shrl	$4,%eax
1194	jmp	.Lctr32_tail
1195
1196.align	32
1197.Lctr32_loop8:
1198	addl	$8,%r8d
1199	movdqa	96(%rsp),%xmm8
1200.byte	102,15,56,220,209
1201	movl	%r8d,%r9d
1202	movdqa	112(%rsp),%xmm9
1203.byte	102,15,56,220,217
1204	bswapl	%r9d
1205	movups	32-128(%rcx),%xmm0
1206.byte	102,15,56,220,225
1207	xorl	%r11d,%r9d
1208	nop
1209.byte	102,15,56,220,233
1210	movl	%r9d,0+12(%rsp)
1211	leaq	1(%r8),%r9
1212.byte	102,15,56,220,241
1213.byte	102,15,56,220,249
1214.byte	102,68,15,56,220,193
1215.byte	102,68,15,56,220,201
1216	movups	48-128(%rcx),%xmm1
1217	bswapl	%r9d
1218.byte	102,15,56,220,208
1219.byte	102,15,56,220,216
1220	xorl	%r11d,%r9d
1221.byte	0x66,0x90
1222.byte	102,15,56,220,224
1223.byte	102,15,56,220,232
1224	movl	%r9d,16+12(%rsp)
1225	leaq	2(%r8),%r9
1226.byte	102,15,56,220,240
1227.byte	102,15,56,220,248
1228.byte	102,68,15,56,220,192
1229.byte	102,68,15,56,220,200
1230	movups	64-128(%rcx),%xmm0
1231	bswapl	%r9d
1232.byte	102,15,56,220,209
1233.byte	102,15,56,220,217
1234	xorl	%r11d,%r9d
1235.byte	0x66,0x90
1236.byte	102,15,56,220,225
1237.byte	102,15,56,220,233
1238	movl	%r9d,32+12(%rsp)
1239	leaq	3(%r8),%r9
1240.byte	102,15,56,220,241
1241.byte	102,15,56,220,249
1242.byte	102,68,15,56,220,193
1243.byte	102,68,15,56,220,201
1244	movups	80-128(%rcx),%xmm1
1245	bswapl	%r9d
1246.byte	102,15,56,220,208
1247.byte	102,15,56,220,216
1248	xorl	%r11d,%r9d
1249.byte	0x66,0x90
1250.byte	102,15,56,220,224
1251.byte	102,15,56,220,232
1252	movl	%r9d,48+12(%rsp)
1253	leaq	4(%r8),%r9
1254.byte	102,15,56,220,240
1255.byte	102,15,56,220,248
1256.byte	102,68,15,56,220,192
1257.byte	102,68,15,56,220,200
1258	movups	96-128(%rcx),%xmm0
1259	bswapl	%r9d
1260.byte	102,15,56,220,209
1261.byte	102,15,56,220,217
1262	xorl	%r11d,%r9d
1263.byte	0x66,0x90
1264.byte	102,15,56,220,225
1265.byte	102,15,56,220,233
1266	movl	%r9d,64+12(%rsp)
1267	leaq	5(%r8),%r9
1268.byte	102,15,56,220,241
1269.byte	102,15,56,220,249
1270.byte	102,68,15,56,220,193
1271.byte	102,68,15,56,220,201
1272	movups	112-128(%rcx),%xmm1
1273	bswapl	%r9d
1274.byte	102,15,56,220,208
1275.byte	102,15,56,220,216
1276	xorl	%r11d,%r9d
1277.byte	0x66,0x90
1278.byte	102,15,56,220,224
1279.byte	102,15,56,220,232
1280	movl	%r9d,80+12(%rsp)
1281	leaq	6(%r8),%r9
1282.byte	102,15,56,220,240
1283.byte	102,15,56,220,248
1284.byte	102,68,15,56,220,192
1285.byte	102,68,15,56,220,200
1286	movups	128-128(%rcx),%xmm0
1287	bswapl	%r9d
1288.byte	102,15,56,220,209
1289.byte	102,15,56,220,217
1290	xorl	%r11d,%r9d
1291.byte	0x66,0x90
1292.byte	102,15,56,220,225
1293.byte	102,15,56,220,233
1294	movl	%r9d,96+12(%rsp)
1295	leaq	7(%r8),%r9
1296.byte	102,15,56,220,241
1297.byte	102,15,56,220,249
1298.byte	102,68,15,56,220,193
1299.byte	102,68,15,56,220,201
1300	movups	144-128(%rcx),%xmm1
1301	bswapl	%r9d
1302.byte	102,15,56,220,208
1303.byte	102,15,56,220,216
1304.byte	102,15,56,220,224
1305	xorl	%r11d,%r9d
1306	movdqu	0(%rdi),%xmm10
1307.byte	102,15,56,220,232
1308	movl	%r9d,112+12(%rsp)
1309	cmpl	$11,%eax
1310.byte	102,15,56,220,240
1311.byte	102,15,56,220,248
1312.byte	102,68,15,56,220,192
1313.byte	102,68,15,56,220,200
1314	movups	160-128(%rcx),%xmm0
1315
1316	jb	.Lctr32_enc_done
1317
1318.byte	102,15,56,220,209
1319.byte	102,15,56,220,217
1320.byte	102,15,56,220,225
1321.byte	102,15,56,220,233
1322.byte	102,15,56,220,241
1323.byte	102,15,56,220,249
1324.byte	102,68,15,56,220,193
1325.byte	102,68,15,56,220,201
1326	movups	176-128(%rcx),%xmm1
1327
1328.byte	102,15,56,220,208
1329.byte	102,15,56,220,216
1330.byte	102,15,56,220,224
1331.byte	102,15,56,220,232
1332.byte	102,15,56,220,240
1333.byte	102,15,56,220,248
1334.byte	102,68,15,56,220,192
1335.byte	102,68,15,56,220,200
1336	movups	192-128(%rcx),%xmm0
1337	je	.Lctr32_enc_done
1338
1339.byte	102,15,56,220,209
1340.byte	102,15,56,220,217
1341.byte	102,15,56,220,225
1342.byte	102,15,56,220,233
1343.byte	102,15,56,220,241
1344.byte	102,15,56,220,249
1345.byte	102,68,15,56,220,193
1346.byte	102,68,15,56,220,201
1347	movups	208-128(%rcx),%xmm1
1348
1349.byte	102,15,56,220,208
1350.byte	102,15,56,220,216
1351.byte	102,15,56,220,224
1352.byte	102,15,56,220,232
1353.byte	102,15,56,220,240
1354.byte	102,15,56,220,248
1355.byte	102,68,15,56,220,192
1356.byte	102,68,15,56,220,200
1357	movups	224-128(%rcx),%xmm0
1358	jmp	.Lctr32_enc_done
1359
1360.align	16
1361.Lctr32_enc_done:
1362	movdqu	16(%rdi),%xmm11
1363	pxor	%xmm0,%xmm10
1364	movdqu	32(%rdi),%xmm12
1365	pxor	%xmm0,%xmm11
1366	movdqu	48(%rdi),%xmm13
1367	pxor	%xmm0,%xmm12
1368	movdqu	64(%rdi),%xmm14
1369	pxor	%xmm0,%xmm13
1370	movdqu	80(%rdi),%xmm15
1371	pxor	%xmm0,%xmm14
1372	pxor	%xmm0,%xmm15
1373.byte	102,15,56,220,209
1374.byte	102,15,56,220,217
1375.byte	102,15,56,220,225
1376.byte	102,15,56,220,233
1377.byte	102,15,56,220,241
1378.byte	102,15,56,220,249
1379.byte	102,68,15,56,220,193
1380.byte	102,68,15,56,220,201
1381	movdqu	96(%rdi),%xmm1
1382	leaq	128(%rdi),%rdi
1383
1384.byte	102,65,15,56,221,210
1385	pxor	%xmm0,%xmm1
1386	movdqu	112-128(%rdi),%xmm10
1387.byte	102,65,15,56,221,219
1388	pxor	%xmm0,%xmm10
1389	movdqa	0(%rsp),%xmm11
1390.byte	102,65,15,56,221,228
1391.byte	102,65,15,56,221,237
1392	movdqa	16(%rsp),%xmm12
1393	movdqa	32(%rsp),%xmm13
1394.byte	102,65,15,56,221,246
1395.byte	102,65,15,56,221,255
1396	movdqa	48(%rsp),%xmm14
1397	movdqa	64(%rsp),%xmm15
1398.byte	102,68,15,56,221,193
1399	movdqa	80(%rsp),%xmm0
1400	movups	16-128(%rcx),%xmm1
1401.byte	102,69,15,56,221,202
1402
1403	movups	%xmm2,(%rsi)
1404	movdqa	%xmm11,%xmm2
1405	movups	%xmm3,16(%rsi)
1406	movdqa	%xmm12,%xmm3
1407	movups	%xmm4,32(%rsi)
1408	movdqa	%xmm13,%xmm4
1409	movups	%xmm5,48(%rsi)
1410	movdqa	%xmm14,%xmm5
1411	movups	%xmm6,64(%rsi)
1412	movdqa	%xmm15,%xmm6
1413	movups	%xmm7,80(%rsi)
1414	movdqa	%xmm0,%xmm7
1415	movups	%xmm8,96(%rsi)
1416	movups	%xmm9,112(%rsi)
1417	leaq	128(%rsi),%rsi
1418
1419	subq	$8,%rdx
1420	jnc	.Lctr32_loop8
1421
1422	addq	$8,%rdx
1423	jz	.Lctr32_done
1424	leaq	-128(%rcx),%rcx
1425
1426.Lctr32_tail:
1427
1428
1429	leaq	16(%rcx),%rcx
1430	cmpq	$4,%rdx
1431	jb	.Lctr32_loop3
1432	je	.Lctr32_loop4
1433
1434
1435	shll	$4,%eax
1436	movdqa	96(%rsp),%xmm8
1437	pxor	%xmm9,%xmm9
1438
1439	movups	16(%rcx),%xmm0
1440.byte	102,15,56,220,209
1441.byte	102,15,56,220,217
1442	leaq	32-16(%rcx,%rax,1),%rcx
1443	negq	%rax
1444.byte	102,15,56,220,225
1445	addq	$16,%rax
1446	movups	(%rdi),%xmm10
1447.byte	102,15,56,220,233
1448.byte	102,15,56,220,241
1449	movups	16(%rdi),%xmm11
1450	movups	32(%rdi),%xmm12
1451.byte	102,15,56,220,249
1452.byte	102,68,15,56,220,193
1453
1454	call	.Lenc_loop8_enter
1455
1456	movdqu	48(%rdi),%xmm13
1457	pxor	%xmm10,%xmm2
1458	movdqu	64(%rdi),%xmm10
1459	pxor	%xmm11,%xmm3
1460	movdqu	%xmm2,(%rsi)
1461	pxor	%xmm12,%xmm4
1462	movdqu	%xmm3,16(%rsi)
1463	pxor	%xmm13,%xmm5
1464	movdqu	%xmm4,32(%rsi)
1465	pxor	%xmm10,%xmm6
1466	movdqu	%xmm5,48(%rsi)
1467	movdqu	%xmm6,64(%rsi)
1468	cmpq	$6,%rdx
1469	jb	.Lctr32_done
1470
1471	movups	80(%rdi),%xmm11
1472	xorps	%xmm11,%xmm7
1473	movups	%xmm7,80(%rsi)
1474	je	.Lctr32_done
1475
1476	movups	96(%rdi),%xmm12
1477	xorps	%xmm12,%xmm8
1478	movups	%xmm8,96(%rsi)
1479	jmp	.Lctr32_done
1480
1481.align	32
1482.Lctr32_loop4:
1483.byte	102,15,56,220,209
1484	leaq	16(%rcx),%rcx
1485	decl	%eax
1486.byte	102,15,56,220,217
1487.byte	102,15,56,220,225
1488.byte	102,15,56,220,233
1489	movups	(%rcx),%xmm1
1490	jnz	.Lctr32_loop4
1491.byte	102,15,56,221,209
1492.byte	102,15,56,221,217
1493	movups	(%rdi),%xmm10
1494	movups	16(%rdi),%xmm11
1495.byte	102,15,56,221,225
1496.byte	102,15,56,221,233
1497	movups	32(%rdi),%xmm12
1498	movups	48(%rdi),%xmm13
1499
1500	xorps	%xmm10,%xmm2
1501	movups	%xmm2,(%rsi)
1502	xorps	%xmm11,%xmm3
1503	movups	%xmm3,16(%rsi)
1504	pxor	%xmm12,%xmm4
1505	movdqu	%xmm4,32(%rsi)
1506	pxor	%xmm13,%xmm5
1507	movdqu	%xmm5,48(%rsi)
1508	jmp	.Lctr32_done
1509
1510.align	32
1511.Lctr32_loop3:
1512.byte	102,15,56,220,209
1513	leaq	16(%rcx),%rcx
1514	decl	%eax
1515.byte	102,15,56,220,217
1516.byte	102,15,56,220,225
1517	movups	(%rcx),%xmm1
1518	jnz	.Lctr32_loop3
1519.byte	102,15,56,221,209
1520.byte	102,15,56,221,217
1521.byte	102,15,56,221,225
1522
1523	movups	(%rdi),%xmm10
1524	xorps	%xmm10,%xmm2
1525	movups	%xmm2,(%rsi)
1526	cmpq	$2,%rdx
1527	jb	.Lctr32_done
1528
1529	movups	16(%rdi),%xmm11
1530	xorps	%xmm11,%xmm3
1531	movups	%xmm3,16(%rsi)
1532	je	.Lctr32_done
1533
1534	movups	32(%rdi),%xmm12
1535	xorps	%xmm12,%xmm4
1536	movups	%xmm4,32(%rsi)
1537
1538.Lctr32_done:
1539	xorps	%xmm0,%xmm0
1540	xorl	%r11d,%r11d
1541	pxor	%xmm1,%xmm1
1542	pxor	%xmm2,%xmm2
1543	pxor	%xmm3,%xmm3
1544	pxor	%xmm4,%xmm4
1545	pxor	%xmm5,%xmm5
1546	pxor	%xmm6,%xmm6
1547	pxor	%xmm7,%xmm7
1548	movaps	%xmm0,0(%rsp)
1549	pxor	%xmm8,%xmm8
1550	movaps	%xmm0,16(%rsp)
1551	pxor	%xmm9,%xmm9
1552	movaps	%xmm0,32(%rsp)
1553	pxor	%xmm10,%xmm10
1554	movaps	%xmm0,48(%rsp)
1555	pxor	%xmm11,%xmm11
1556	movaps	%xmm0,64(%rsp)
1557	pxor	%xmm12,%xmm12
1558	movaps	%xmm0,80(%rsp)
1559	pxor	%xmm13,%xmm13
1560	movaps	%xmm0,96(%rsp)
1561	pxor	%xmm14,%xmm14
1562	movaps	%xmm0,112(%rsp)
1563	pxor	%xmm15,%xmm15
1564	leaq	(%rbp),%rsp
1565	popq	%rbp
1566.Lctr32_epilogue:
1567	.byte	0xf3,0xc3
1568.size	aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
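/*
 * aesni_xts_encrypt(in, out, length, key1, key2, iv), presumably with
 * %rdi = in, %rsi = out, %rdx = byte length, %rcx = data key,
 * %r8 = tweak key and %r9 = 16-byte tweak value.  The tweak is first
 * encrypted with the tweak key, the grand loop then processes six
 * blocks at a time while multiplying the tweak by x in GF(2^128)
 * (.Lxts_magic), and a trailing partial block is handled by ciphertext
 * stealing at .Lxts_enc_steal.
 */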
1569.globl	aesni_xts_encrypt
1570.type	aesni_xts_encrypt,@function
1571.align	16
1572aesni_xts_encrypt:
1573	leaq	(%rsp),%rax
1574	pushq	%rbp
1575	subq	$112,%rsp
1576	andq	$-16,%rsp
1577	leaq	-8(%rax),%rbp
1578	movups	(%r9),%xmm2
1579	movl	240(%r8),%eax
1580	movl	240(%rcx),%r10d
1581	movups	(%r8),%xmm0
1582	movups	16(%r8),%xmm1
1583	leaq	32(%r8),%r8
1584	xorps	%xmm0,%xmm2
1585.Loop_enc1_8:
1586.byte	102,15,56,220,209
1587	decl	%eax
1588	movups	(%r8),%xmm1
1589	leaq	16(%r8),%r8
1590	jnz	.Loop_enc1_8
1591.byte	102,15,56,221,209
1592	movups	(%rcx),%xmm0
1593	movq	%rcx,%r11
1594	movl	%r10d,%eax
1595	shll	$4,%r10d
1596	movq	%rdx,%r9
1597	andq	$-16,%rdx
1598
1599	movups	16(%rcx,%r10,1),%xmm1
1600
1601	movdqa	.Lxts_magic(%rip),%xmm8
1602	movdqa	%xmm2,%xmm15
1603	pshufd	$0x5f,%xmm2,%xmm9
1604	pxor	%xmm0,%xmm1
1605	movdqa	%xmm9,%xmm14
1606	paddd	%xmm9,%xmm9
1607	movdqa	%xmm15,%xmm10
1608	psrad	$31,%xmm14
1609	paddq	%xmm15,%xmm15
1610	pand	%xmm8,%xmm14
1611	pxor	%xmm0,%xmm10
1612	pxor	%xmm14,%xmm15
1613	movdqa	%xmm9,%xmm14
1614	paddd	%xmm9,%xmm9
1615	movdqa	%xmm15,%xmm11
1616	psrad	$31,%xmm14
1617	paddq	%xmm15,%xmm15
1618	pand	%xmm8,%xmm14
1619	pxor	%xmm0,%xmm11
1620	pxor	%xmm14,%xmm15
1621	movdqa	%xmm9,%xmm14
1622	paddd	%xmm9,%xmm9
1623	movdqa	%xmm15,%xmm12
1624	psrad	$31,%xmm14
1625	paddq	%xmm15,%xmm15
1626	pand	%xmm8,%xmm14
1627	pxor	%xmm0,%xmm12
1628	pxor	%xmm14,%xmm15
1629	movdqa	%xmm9,%xmm14
1630	paddd	%xmm9,%xmm9
1631	movdqa	%xmm15,%xmm13
1632	psrad	$31,%xmm14
1633	paddq	%xmm15,%xmm15
1634	pand	%xmm8,%xmm14
1635	pxor	%xmm0,%xmm13
1636	pxor	%xmm14,%xmm15
1637	movdqa	%xmm15,%xmm14
1638	psrad	$31,%xmm9
1639	paddq	%xmm15,%xmm15
1640	pand	%xmm8,%xmm9
1641	pxor	%xmm0,%xmm14
1642	pxor	%xmm9,%xmm15
1643	movaps	%xmm1,96(%rsp)
1644
1645	subq	$96,%rdx
1646	jc	.Lxts_enc_short
1647
1648	movl	$16+96,%eax
1649	leaq	32(%r11,%r10,1),%rcx
1650	subq	%r10,%rax
1651	movups	16(%r11),%xmm1
1652	movq	%rax,%r10
1653	leaq	.Lxts_magic(%rip),%r8
1654	jmp	.Lxts_enc_grandloop
1655
1656.align	32
1657.Lxts_enc_grandloop:
1658	movdqu	0(%rdi),%xmm2
1659	movdqa	%xmm0,%xmm8
1660	movdqu	16(%rdi),%xmm3
1661	pxor	%xmm10,%xmm2
1662	movdqu	32(%rdi),%xmm4
1663	pxor	%xmm11,%xmm3
1664.byte	102,15,56,220,209
1665	movdqu	48(%rdi),%xmm5
1666	pxor	%xmm12,%xmm4
1667.byte	102,15,56,220,217
1668	movdqu	64(%rdi),%xmm6
1669	pxor	%xmm13,%xmm5
1670.byte	102,15,56,220,225
1671	movdqu	80(%rdi),%xmm7
1672	pxor	%xmm15,%xmm8
1673	movdqa	96(%rsp),%xmm9
1674	pxor	%xmm14,%xmm6
1675.byte	102,15,56,220,233
1676	movups	32(%r11),%xmm0
1677	leaq	96(%rdi),%rdi
1678	pxor	%xmm8,%xmm7
1679
1680	pxor	%xmm9,%xmm10
1681.byte	102,15,56,220,241
1682	pxor	%xmm9,%xmm11
1683	movdqa	%xmm10,0(%rsp)
1684.byte	102,15,56,220,249
1685	movups	48(%r11),%xmm1
1686	pxor	%xmm9,%xmm12
1687
1688.byte	102,15,56,220,208
1689	pxor	%xmm9,%xmm13
1690	movdqa	%xmm11,16(%rsp)
1691.byte	102,15,56,220,216
1692	pxor	%xmm9,%xmm14
1693	movdqa	%xmm12,32(%rsp)
1694.byte	102,15,56,220,224
1695.byte	102,15,56,220,232
1696	pxor	%xmm9,%xmm8
1697	movdqa	%xmm14,64(%rsp)
1698.byte	102,15,56,220,240
1699.byte	102,15,56,220,248
1700	movups	64(%r11),%xmm0
1701	movdqa	%xmm8,80(%rsp)
1702	pshufd	$0x5f,%xmm15,%xmm9
1703	jmp	.Lxts_enc_loop6
1704.align	32
1705.Lxts_enc_loop6:
1706.byte	102,15,56,220,209
1707.byte	102,15,56,220,217
1708.byte	102,15,56,220,225
1709.byte	102,15,56,220,233
1710.byte	102,15,56,220,241
1711.byte	102,15,56,220,249
1712	movups	-64(%rcx,%rax,1),%xmm1
1713	addq	$32,%rax
1714
1715.byte	102,15,56,220,208
1716.byte	102,15,56,220,216
1717.byte	102,15,56,220,224
1718.byte	102,15,56,220,232
1719.byte	102,15,56,220,240
1720.byte	102,15,56,220,248
1721	movups	-80(%rcx,%rax,1),%xmm0
1722	jnz	.Lxts_enc_loop6
1723
1724	movdqa	(%r8),%xmm8
1725	movdqa	%xmm9,%xmm14
1726	paddd	%xmm9,%xmm9
1727.byte	102,15,56,220,209
1728	paddq	%xmm15,%xmm15
1729	psrad	$31,%xmm14
1730.byte	102,15,56,220,217
1731	pand	%xmm8,%xmm14
1732	movups	(%r11),%xmm10
1733.byte	102,15,56,220,225
1734.byte	102,15,56,220,233
1735.byte	102,15,56,220,241
1736	pxor	%xmm14,%xmm15
1737	movaps	%xmm10,%xmm11
1738.byte	102,15,56,220,249
1739	movups	-64(%rcx),%xmm1
1740
1741	movdqa	%xmm9,%xmm14
1742.byte	102,15,56,220,208
1743	paddd	%xmm9,%xmm9
1744	pxor	%xmm15,%xmm10
1745.byte	102,15,56,220,216
1746	psrad	$31,%xmm14
1747	paddq	%xmm15,%xmm15
1748.byte	102,15,56,220,224
1749.byte	102,15,56,220,232
1750	pand	%xmm8,%xmm14
1751	movaps	%xmm11,%xmm12
1752.byte	102,15,56,220,240
1753	pxor	%xmm14,%xmm15
1754	movdqa	%xmm9,%xmm14
1755.byte	102,15,56,220,248
1756	movups	-48(%rcx),%xmm0
1757
1758	paddd	%xmm9,%xmm9
1759.byte	102,15,56,220,209
1760	pxor	%xmm15,%xmm11
1761	psrad	$31,%xmm14
1762.byte	102,15,56,220,217
1763	paddq	%xmm15,%xmm15
1764	pand	%xmm8,%xmm14
1765.byte	102,15,56,220,225
1766.byte	102,15,56,220,233
1767	movdqa	%xmm13,48(%rsp)
1768	pxor	%xmm14,%xmm15
1769.byte	102,15,56,220,241
1770	movaps	%xmm12,%xmm13
1771	movdqa	%xmm9,%xmm14
1772.byte	102,15,56,220,249
1773	movups	-32(%rcx),%xmm1
1774
1775	paddd	%xmm9,%xmm9
1776.byte	102,15,56,220,208
1777	pxor	%xmm15,%xmm12
1778	psrad	$31,%xmm14
1779.byte	102,15,56,220,216
1780	paddq	%xmm15,%xmm15
1781	pand	%xmm8,%xmm14
1782.byte	102,15,56,220,224
1783.byte	102,15,56,220,232
1784.byte	102,15,56,220,240
1785	pxor	%xmm14,%xmm15
1786	movaps	%xmm13,%xmm14
1787.byte	102,15,56,220,248
1788
1789	movdqa	%xmm9,%xmm0
1790	paddd	%xmm9,%xmm9
1791.byte	102,15,56,220,209
1792	pxor	%xmm15,%xmm13
1793	psrad	$31,%xmm0
1794.byte	102,15,56,220,217
1795	paddq	%xmm15,%xmm15
1796	pand	%xmm8,%xmm0
1797.byte	102,15,56,220,225
1798.byte	102,15,56,220,233
1799	pxor	%xmm0,%xmm15
1800	movups	(%r11),%xmm0
1801.byte	102,15,56,220,241
1802.byte	102,15,56,220,249
1803	movups	16(%r11),%xmm1
1804
1805	pxor	%xmm15,%xmm14
1806.byte	102,15,56,221,84,36,0
1807	psrad	$31,%xmm9
1808	paddq	%xmm15,%xmm15
1809.byte	102,15,56,221,92,36,16
1810.byte	102,15,56,221,100,36,32
1811	pand	%xmm8,%xmm9
1812	movq	%r10,%rax
1813.byte	102,15,56,221,108,36,48
1814.byte	102,15,56,221,116,36,64
1815.byte	102,15,56,221,124,36,80
1816	pxor	%xmm9,%xmm15
1817
1818	leaq	96(%rsi),%rsi
1819	movups	%xmm2,-96(%rsi)
1820	movups	%xmm3,-80(%rsi)
1821	movups	%xmm4,-64(%rsi)
1822	movups	%xmm5,-48(%rsi)
1823	movups	%xmm6,-32(%rsi)
1824	movups	%xmm7,-16(%rsi)
1825	subq	$96,%rdx
1826	jnc	.Lxts_enc_grandloop
1827
1828	movl	$16+96,%eax
1829	subl	%r10d,%eax
1830	movq	%r11,%rcx
1831	shrl	$4,%eax
1832
1833.Lxts_enc_short:
1834
1835	movl	%eax,%r10d
1836	pxor	%xmm0,%xmm10
1837	addq	$96,%rdx
1838	jz	.Lxts_enc_done
1839
1840	pxor	%xmm0,%xmm11
1841	cmpq	$0x20,%rdx
1842	jb	.Lxts_enc_one
1843	pxor	%xmm0,%xmm12
1844	je	.Lxts_enc_two
1845
1846	pxor	%xmm0,%xmm13
1847	cmpq	$0x40,%rdx
1848	jb	.Lxts_enc_three
1849	pxor	%xmm0,%xmm14
1850	je	.Lxts_enc_four
1851
1852	movdqu	(%rdi),%xmm2
1853	movdqu	16(%rdi),%xmm3
1854	movdqu	32(%rdi),%xmm4
1855	pxor	%xmm10,%xmm2
1856	movdqu	48(%rdi),%xmm5
1857	pxor	%xmm11,%xmm3
1858	movdqu	64(%rdi),%xmm6
1859	leaq	80(%rdi),%rdi
1860	pxor	%xmm12,%xmm4
1861	pxor	%xmm13,%xmm5
1862	pxor	%xmm14,%xmm6
1863	pxor	%xmm7,%xmm7
1864
1865	call	_aesni_encrypt6
1866
1867	xorps	%xmm10,%xmm2
1868	movdqa	%xmm15,%xmm10
1869	xorps	%xmm11,%xmm3
1870	xorps	%xmm12,%xmm4
1871	movdqu	%xmm2,(%rsi)
1872	xorps	%xmm13,%xmm5
1873	movdqu	%xmm3,16(%rsi)
1874	xorps	%xmm14,%xmm6
1875	movdqu	%xmm4,32(%rsi)
1876	movdqu	%xmm5,48(%rsi)
1877	movdqu	%xmm6,64(%rsi)
1878	leaq	80(%rsi),%rsi
1879	jmp	.Lxts_enc_done
1880
1881.align	16
1882.Lxts_enc_one:
1883	movups	(%rdi),%xmm2
1884	leaq	16(%rdi),%rdi
1885	xorps	%xmm10,%xmm2
1886	movups	(%rcx),%xmm0
1887	movups	16(%rcx),%xmm1
1888	leaq	32(%rcx),%rcx
1889	xorps	%xmm0,%xmm2
1890.Loop_enc1_9:
1891.byte	102,15,56,220,209
1892	decl	%eax
1893	movups	(%rcx),%xmm1
1894	leaq	16(%rcx),%rcx
1895	jnz	.Loop_enc1_9
1896.byte	102,15,56,221,209
1897	xorps	%xmm10,%xmm2
1898	movdqa	%xmm11,%xmm10
1899	movups	%xmm2,(%rsi)
1900	leaq	16(%rsi),%rsi
1901	jmp	.Lxts_enc_done
1902
1903.align	16
1904.Lxts_enc_two:
1905	movups	(%rdi),%xmm2
1906	movups	16(%rdi),%xmm3
1907	leaq	32(%rdi),%rdi
1908	xorps	%xmm10,%xmm2
1909	xorps	%xmm11,%xmm3
1910
1911	call	_aesni_encrypt2
1912
1913	xorps	%xmm10,%xmm2
1914	movdqa	%xmm12,%xmm10
1915	xorps	%xmm11,%xmm3
1916	movups	%xmm2,(%rsi)
1917	movups	%xmm3,16(%rsi)
1918	leaq	32(%rsi),%rsi
1919	jmp	.Lxts_enc_done
1920
1921.align	16
1922.Lxts_enc_three:
1923	movups	(%rdi),%xmm2
1924	movups	16(%rdi),%xmm3
1925	movups	32(%rdi),%xmm4
1926	leaq	48(%rdi),%rdi
1927	xorps	%xmm10,%xmm2
1928	xorps	%xmm11,%xmm3
1929	xorps	%xmm12,%xmm4
1930
1931	call	_aesni_encrypt3
1932
1933	xorps	%xmm10,%xmm2
1934	movdqa	%xmm13,%xmm10
1935	xorps	%xmm11,%xmm3
1936	xorps	%xmm12,%xmm4
1937	movups	%xmm2,(%rsi)
1938	movups	%xmm3,16(%rsi)
1939	movups	%xmm4,32(%rsi)
1940	leaq	48(%rsi),%rsi
1941	jmp	.Lxts_enc_done
1942
1943.align	16
1944.Lxts_enc_four:
1945	movups	(%rdi),%xmm2
1946	movups	16(%rdi),%xmm3
1947	movups	32(%rdi),%xmm4
1948	xorps	%xmm10,%xmm2
1949	movups	48(%rdi),%xmm5
1950	leaq	64(%rdi),%rdi
1951	xorps	%xmm11,%xmm3
1952	xorps	%xmm12,%xmm4
1953	xorps	%xmm13,%xmm5
1954
1955	call	_aesni_encrypt4
1956
1957	pxor	%xmm10,%xmm2
1958	movdqa	%xmm14,%xmm10
1959	pxor	%xmm11,%xmm3
1960	pxor	%xmm12,%xmm4
1961	movdqu	%xmm2,(%rsi)
1962	pxor	%xmm13,%xmm5
1963	movdqu	%xmm3,16(%rsi)
1964	movdqu	%xmm4,32(%rsi)
1965	movdqu	%xmm5,48(%rsi)
1966	leaq	64(%rsi),%rsi
1967	jmp	.Lxts_enc_done
1968
1969.align	16
1970.Lxts_enc_done:
1971	andq	$15,%r9
1972	jz	.Lxts_enc_ret
1973	movq	%r9,%rdx
1974
1975.Lxts_enc_steal:
1976	movzbl	(%rdi),%eax
1977	movzbl	-16(%rsi),%ecx
1978	leaq	1(%rdi),%rdi
1979	movb	%al,-16(%rsi)
1980	movb	%cl,0(%rsi)
1981	leaq	1(%rsi),%rsi
1982	subq	$1,%rdx
1983	jnz	.Lxts_enc_steal
1984
1985	subq	%r9,%rsi
1986	movq	%r11,%rcx
1987	movl	%r10d,%eax
1988
1989	movups	-16(%rsi),%xmm2
1990	xorps	%xmm10,%xmm2
1991	movups	(%rcx),%xmm0
1992	movups	16(%rcx),%xmm1
1993	leaq	32(%rcx),%rcx
1994	xorps	%xmm0,%xmm2
1995.Loop_enc1_10:
1996.byte	102,15,56,220,209
1997	decl	%eax
1998	movups	(%rcx),%xmm1
1999	leaq	16(%rcx),%rcx
2000	jnz	.Loop_enc1_10
2001.byte	102,15,56,221,209
2002	xorps	%xmm10,%xmm2
2003	movups	%xmm2,-16(%rsi)
2004
2005.Lxts_enc_ret:
2006	xorps	%xmm0,%xmm0
2007	pxor	%xmm1,%xmm1
2008	pxor	%xmm2,%xmm2
2009	pxor	%xmm3,%xmm3
2010	pxor	%xmm4,%xmm4
2011	pxor	%xmm5,%xmm5
2012	pxor	%xmm6,%xmm6
2013	pxor	%xmm7,%xmm7
2014	movaps	%xmm0,0(%rsp)
2015	pxor	%xmm8,%xmm8
2016	movaps	%xmm0,16(%rsp)
2017	pxor	%xmm9,%xmm9
2018	movaps	%xmm0,32(%rsp)
2019	pxor	%xmm10,%xmm10
2020	movaps	%xmm0,48(%rsp)
2021	pxor	%xmm11,%xmm11
2022	movaps	%xmm0,64(%rsp)
2023	pxor	%xmm12,%xmm12
2024	movaps	%xmm0,80(%rsp)
2025	pxor	%xmm13,%xmm13
2026	movaps	%xmm0,96(%rsp)
2027	pxor	%xmm14,%xmm14
2028	pxor	%xmm15,%xmm15
2029	leaq	(%rbp),%rsp
2030	popq	%rbp
2031.Lxts_enc_epilogue:
2032	.byte	0xf3,0xc3
2033.size	aesni_xts_encrypt,.-aesni_xts_encrypt
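/*
 * aesni_xts_decrypt uses the same argument layout as aesni_xts_encrypt
 * above.  When the length is not a multiple of 16 it holds one full
 * block back so that ciphertext stealing (.Lxts_dec_steal) can
 * reconstruct the final partial block.
 */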
2034.globl	aesni_xts_decrypt
2035.type	aesni_xts_decrypt,@function
2036.align	16
2037aesni_xts_decrypt:
2038	leaq	(%rsp),%rax
2039	pushq	%rbp
2040	subq	$112,%rsp
2041	andq	$-16,%rsp
2042	leaq	-8(%rax),%rbp
2043	movups	(%r9),%xmm2
2044	movl	240(%r8),%eax
2045	movl	240(%rcx),%r10d
2046	movups	(%r8),%xmm0
2047	movups	16(%r8),%xmm1
2048	leaq	32(%r8),%r8
2049	xorps	%xmm0,%xmm2
2050.Loop_enc1_11:
2051.byte	102,15,56,220,209
2052	decl	%eax
2053	movups	(%r8),%xmm1
2054	leaq	16(%r8),%r8
2055	jnz	.Loop_enc1_11
2056.byte	102,15,56,221,209
2057	xorl	%eax,%eax
2058	testq	$15,%rdx
2059	setnz	%al
2060	shlq	$4,%rax
2061	subq	%rax,%rdx
2062
2063	movups	(%rcx),%xmm0
2064	movq	%rcx,%r11
2065	movl	%r10d,%eax
2066	shll	$4,%r10d
2067	movq	%rdx,%r9
2068	andq	$-16,%rdx
2069
2070	movups	16(%rcx,%r10,1),%xmm1
2071
2072	movdqa	.Lxts_magic(%rip),%xmm8
2073	movdqa	%xmm2,%xmm15
2074	pshufd	$0x5f,%xmm2,%xmm9
2075	pxor	%xmm0,%xmm1
2076	movdqa	%xmm9,%xmm14
2077	paddd	%xmm9,%xmm9
2078	movdqa	%xmm15,%xmm10
2079	psrad	$31,%xmm14
2080	paddq	%xmm15,%xmm15
2081	pand	%xmm8,%xmm14
2082	pxor	%xmm0,%xmm10
2083	pxor	%xmm14,%xmm15
2084	movdqa	%xmm9,%xmm14
2085	paddd	%xmm9,%xmm9
2086	movdqa	%xmm15,%xmm11
2087	psrad	$31,%xmm14
2088	paddq	%xmm15,%xmm15
2089	pand	%xmm8,%xmm14
2090	pxor	%xmm0,%xmm11
2091	pxor	%xmm14,%xmm15
2092	movdqa	%xmm9,%xmm14
2093	paddd	%xmm9,%xmm9
2094	movdqa	%xmm15,%xmm12
2095	psrad	$31,%xmm14
2096	paddq	%xmm15,%xmm15
2097	pand	%xmm8,%xmm14
2098	pxor	%xmm0,%xmm12
2099	pxor	%xmm14,%xmm15
2100	movdqa	%xmm9,%xmm14
2101	paddd	%xmm9,%xmm9
2102	movdqa	%xmm15,%xmm13
2103	psrad	$31,%xmm14
2104	paddq	%xmm15,%xmm15
2105	pand	%xmm8,%xmm14
2106	pxor	%xmm0,%xmm13
2107	pxor	%xmm14,%xmm15
2108	movdqa	%xmm15,%xmm14
2109	psrad	$31,%xmm9
2110	paddq	%xmm15,%xmm15
2111	pand	%xmm8,%xmm9
2112	pxor	%xmm0,%xmm14
2113	pxor	%xmm9,%xmm15
2114	movaps	%xmm1,96(%rsp)
2115
2116	subq	$96,%rdx
2117	jc	.Lxts_dec_short
2118
2119	movl	$16+96,%eax
2120	leaq	32(%r11,%r10,1),%rcx
2121	subq	%r10,%rax
2122	movups	16(%r11),%xmm1
2123	movq	%rax,%r10
2124	leaq	.Lxts_magic(%rip),%r8
2125	jmp	.Lxts_dec_grandloop
2126
2127.align	32
2128.Lxts_dec_grandloop:
2129	movdqu	0(%rdi),%xmm2
2130	movdqa	%xmm0,%xmm8
2131	movdqu	16(%rdi),%xmm3
2132	pxor	%xmm10,%xmm2
2133	movdqu	32(%rdi),%xmm4
2134	pxor	%xmm11,%xmm3
2135.byte	102,15,56,222,209
2136	movdqu	48(%rdi),%xmm5
2137	pxor	%xmm12,%xmm4
2138.byte	102,15,56,222,217
2139	movdqu	64(%rdi),%xmm6
2140	pxor	%xmm13,%xmm5
2141.byte	102,15,56,222,225
2142	movdqu	80(%rdi),%xmm7
2143	pxor	%xmm15,%xmm8
2144	movdqa	96(%rsp),%xmm9
2145	pxor	%xmm14,%xmm6
2146.byte	102,15,56,222,233
2147	movups	32(%r11),%xmm0
2148	leaq	96(%rdi),%rdi
2149	pxor	%xmm8,%xmm7
2150
2151	pxor	%xmm9,%xmm10
2152.byte	102,15,56,222,241
2153	pxor	%xmm9,%xmm11
2154	movdqa	%xmm10,0(%rsp)
2155.byte	102,15,56,222,249
2156	movups	48(%r11),%xmm1
2157	pxor	%xmm9,%xmm12
2158
2159.byte	102,15,56,222,208
2160	pxor	%xmm9,%xmm13
2161	movdqa	%xmm11,16(%rsp)
2162.byte	102,15,56,222,216
2163	pxor	%xmm9,%xmm14
2164	movdqa	%xmm12,32(%rsp)
2165.byte	102,15,56,222,224
2166.byte	102,15,56,222,232
2167	pxor	%xmm9,%xmm8
2168	movdqa	%xmm14,64(%rsp)
2169.byte	102,15,56,222,240
2170.byte	102,15,56,222,248
2171	movups	64(%r11),%xmm0
2172	movdqa	%xmm8,80(%rsp)
2173	pshufd	$0x5f,%xmm15,%xmm9
2174	jmp	.Lxts_dec_loop6
2175.align	32
2176.Lxts_dec_loop6:
2177.byte	102,15,56,222,209
2178.byte	102,15,56,222,217
2179.byte	102,15,56,222,225
2180.byte	102,15,56,222,233
2181.byte	102,15,56,222,241
2182.byte	102,15,56,222,249
2183	movups	-64(%rcx,%rax,1),%xmm1
2184	addq	$32,%rax
2185
2186.byte	102,15,56,222,208
2187.byte	102,15,56,222,216
2188.byte	102,15,56,222,224
2189.byte	102,15,56,222,232
2190.byte	102,15,56,222,240
2191.byte	102,15,56,222,248
2192	movups	-80(%rcx,%rax,1),%xmm0
2193	jnz	.Lxts_dec_loop6
2194
2195	movdqa	(%r8),%xmm8
2196	movdqa	%xmm9,%xmm14
2197	paddd	%xmm9,%xmm9
2198.byte	102,15,56,222,209
2199	paddq	%xmm15,%xmm15
2200	psrad	$31,%xmm14
2201.byte	102,15,56,222,217
2202	pand	%xmm8,%xmm14
2203	movups	(%r11),%xmm10
2204.byte	102,15,56,222,225
2205.byte	102,15,56,222,233
2206.byte	102,15,56,222,241
2207	pxor	%xmm14,%xmm15
2208	movaps	%xmm10,%xmm11
2209.byte	102,15,56,222,249
2210	movups	-64(%rcx),%xmm1
2211
2212	movdqa	%xmm9,%xmm14
2213.byte	102,15,56,222,208
2214	paddd	%xmm9,%xmm9
2215	pxor	%xmm15,%xmm10
2216.byte	102,15,56,222,216
2217	psrad	$31,%xmm14
2218	paddq	%xmm15,%xmm15
2219.byte	102,15,56,222,224
2220.byte	102,15,56,222,232
2221	pand	%xmm8,%xmm14
2222	movaps	%xmm11,%xmm12
2223.byte	102,15,56,222,240
2224	pxor	%xmm14,%xmm15
2225	movdqa	%xmm9,%xmm14
2226.byte	102,15,56,222,248
2227	movups	-48(%rcx),%xmm0
2228
2229	paddd	%xmm9,%xmm9
2230.byte	102,15,56,222,209
2231	pxor	%xmm15,%xmm11
2232	psrad	$31,%xmm14
2233.byte	102,15,56,222,217
2234	paddq	%xmm15,%xmm15
2235	pand	%xmm8,%xmm14
2236.byte	102,15,56,222,225
2237.byte	102,15,56,222,233
2238	movdqa	%xmm13,48(%rsp)
2239	pxor	%xmm14,%xmm15
2240.byte	102,15,56,222,241
2241	movaps	%xmm12,%xmm13
2242	movdqa	%xmm9,%xmm14
2243.byte	102,15,56,222,249
2244	movups	-32(%rcx),%xmm1
2245
2246	paddd	%xmm9,%xmm9
2247.byte	102,15,56,222,208
2248	pxor	%xmm15,%xmm12
2249	psrad	$31,%xmm14
2250.byte	102,15,56,222,216
2251	paddq	%xmm15,%xmm15
2252	pand	%xmm8,%xmm14
2253.byte	102,15,56,222,224
2254.byte	102,15,56,222,232
2255.byte	102,15,56,222,240
2256	pxor	%xmm14,%xmm15
2257	movaps	%xmm13,%xmm14
2258.byte	102,15,56,222,248
2259
2260	movdqa	%xmm9,%xmm0
2261	paddd	%xmm9,%xmm9
2262.byte	102,15,56,222,209
2263	pxor	%xmm15,%xmm13
2264	psrad	$31,%xmm0
2265.byte	102,15,56,222,217
2266	paddq	%xmm15,%xmm15
2267	pand	%xmm8,%xmm0
2268.byte	102,15,56,222,225
2269.byte	102,15,56,222,233
2270	pxor	%xmm0,%xmm15
2271	movups	(%r11),%xmm0
2272.byte	102,15,56,222,241
2273.byte	102,15,56,222,249
2274	movups	16(%r11),%xmm1
2275
2276	pxor	%xmm15,%xmm14
2277.byte	102,15,56,223,84,36,0
2278	psrad	$31,%xmm9
2279	paddq	%xmm15,%xmm15
2280.byte	102,15,56,223,92,36,16
2281.byte	102,15,56,223,100,36,32
2282	pand	%xmm8,%xmm9
2283	movq	%r10,%rax
2284.byte	102,15,56,223,108,36,48
2285.byte	102,15,56,223,116,36,64
2286.byte	102,15,56,223,124,36,80
2287	pxor	%xmm9,%xmm15
2288
2289	leaq	96(%rsi),%rsi
2290	movups	%xmm2,-96(%rsi)
2291	movups	%xmm3,-80(%rsi)
2292	movups	%xmm4,-64(%rsi)
2293	movups	%xmm5,-48(%rsi)
2294	movups	%xmm6,-32(%rsi)
2295	movups	%xmm7,-16(%rsi)
2296	subq	$96,%rdx
2297	jnc	.Lxts_dec_grandloop
2298
2299	movl	$16+96,%eax
2300	subl	%r10d,%eax
2301	movq	%r11,%rcx
2302	shrl	$4,%eax
2303
2304.Lxts_dec_short:
2305
2306	movl	%eax,%r10d
2307	pxor	%xmm0,%xmm10
2308	pxor	%xmm0,%xmm11
2309	addq	$96,%rdx
2310	jz	.Lxts_dec_done
2311
2312	pxor	%xmm0,%xmm12
2313	cmpq	$0x20,%rdx
2314	jb	.Lxts_dec_one
2315	pxor	%xmm0,%xmm13
2316	je	.Lxts_dec_two
2317
2318	pxor	%xmm0,%xmm14
2319	cmpq	$0x40,%rdx
2320	jb	.Lxts_dec_three
2321	je	.Lxts_dec_four
2322
2323	movdqu	(%rdi),%xmm2
2324	movdqu	16(%rdi),%xmm3
2325	movdqu	32(%rdi),%xmm4
2326	pxor	%xmm10,%xmm2
2327	movdqu	48(%rdi),%xmm5
2328	pxor	%xmm11,%xmm3
2329	movdqu	64(%rdi),%xmm6
2330	leaq	80(%rdi),%rdi
2331	pxor	%xmm12,%xmm4
2332	pxor	%xmm13,%xmm5
2333	pxor	%xmm14,%xmm6
2334
2335	call	_aesni_decrypt6
2336
2337	xorps	%xmm10,%xmm2
2338	xorps	%xmm11,%xmm3
2339	xorps	%xmm12,%xmm4
2340	movdqu	%xmm2,(%rsi)
2341	xorps	%xmm13,%xmm5
2342	movdqu	%xmm3,16(%rsi)
2343	xorps	%xmm14,%xmm6
2344	movdqu	%xmm4,32(%rsi)
2345	pxor	%xmm14,%xmm14
2346	movdqu	%xmm5,48(%rsi)
2347	pcmpgtd	%xmm15,%xmm14
2348	movdqu	%xmm6,64(%rsi)
2349	leaq	80(%rsi),%rsi
2350	pshufd	$0x13,%xmm14,%xmm11
2351	andq	$15,%r9
2352	jz	.Lxts_dec_ret
2353
2354	movdqa	%xmm15,%xmm10
2355	paddq	%xmm15,%xmm15
2356	pand	%xmm8,%xmm11
2357	pxor	%xmm15,%xmm11
2358	jmp	.Lxts_dec_done2
2359
2360.align	16
2361.Lxts_dec_one:
2362	movups	(%rdi),%xmm2
2363	leaq	16(%rdi),%rdi
2364	xorps	%xmm10,%xmm2
2365	movups	(%rcx),%xmm0
2366	movups	16(%rcx),%xmm1
2367	leaq	32(%rcx),%rcx
2368	xorps	%xmm0,%xmm2
2369.Loop_dec1_12:
2370.byte	102,15,56,222,209
2371	decl	%eax
2372	movups	(%rcx),%xmm1
2373	leaq	16(%rcx),%rcx
2374	jnz	.Loop_dec1_12
2375.byte	102,15,56,223,209
2376	xorps	%xmm10,%xmm2
2377	movdqa	%xmm11,%xmm10
2378	movups	%xmm2,(%rsi)
2379	movdqa	%xmm12,%xmm11
2380	leaq	16(%rsi),%rsi
2381	jmp	.Lxts_dec_done
2382
2383.align	16
2384.Lxts_dec_two:
2385	movups	(%rdi),%xmm2
2386	movups	16(%rdi),%xmm3
2387	leaq	32(%rdi),%rdi
2388	xorps	%xmm10,%xmm2
2389	xorps	%xmm11,%xmm3
2390
2391	call	_aesni_decrypt2
2392
2393	xorps	%xmm10,%xmm2
2394	movdqa	%xmm12,%xmm10
2395	xorps	%xmm11,%xmm3
2396	movdqa	%xmm13,%xmm11
2397	movups	%xmm2,(%rsi)
2398	movups	%xmm3,16(%rsi)
2399	leaq	32(%rsi),%rsi
2400	jmp	.Lxts_dec_done
2401
2402.align	16
2403.Lxts_dec_three:
2404	movups	(%rdi),%xmm2
2405	movups	16(%rdi),%xmm3
2406	movups	32(%rdi),%xmm4
2407	leaq	48(%rdi),%rdi
2408	xorps	%xmm10,%xmm2
2409	xorps	%xmm11,%xmm3
2410	xorps	%xmm12,%xmm4
2411
2412	call	_aesni_decrypt3
2413
2414	xorps	%xmm10,%xmm2
2415	movdqa	%xmm13,%xmm10
2416	xorps	%xmm11,%xmm3
2417	movdqa	%xmm14,%xmm11
2418	xorps	%xmm12,%xmm4
2419	movups	%xmm2,(%rsi)
2420	movups	%xmm3,16(%rsi)
2421	movups	%xmm4,32(%rsi)
2422	leaq	48(%rsi),%rsi
2423	jmp	.Lxts_dec_done
2424
2425.align	16
2426.Lxts_dec_four:
2427	movups	(%rdi),%xmm2
2428	movups	16(%rdi),%xmm3
2429	movups	32(%rdi),%xmm4
2430	xorps	%xmm10,%xmm2
2431	movups	48(%rdi),%xmm5
2432	leaq	64(%rdi),%rdi
2433	xorps	%xmm11,%xmm3
2434	xorps	%xmm12,%xmm4
2435	xorps	%xmm13,%xmm5
2436
2437	call	_aesni_decrypt4
2438
2439	pxor	%xmm10,%xmm2
2440	movdqa	%xmm14,%xmm10
2441	pxor	%xmm11,%xmm3
2442	movdqa	%xmm15,%xmm11
2443	pxor	%xmm12,%xmm4
2444	movdqu	%xmm2,(%rsi)
2445	pxor	%xmm13,%xmm5
2446	movdqu	%xmm3,16(%rsi)
2447	movdqu	%xmm4,32(%rsi)
2448	movdqu	%xmm5,48(%rsi)
2449	leaq	64(%rsi),%rsi
2450	jmp	.Lxts_dec_done
2451
2452.align	16
2453.Lxts_dec_done:
2454	andq	$15,%r9
2455	jz	.Lxts_dec_ret
2456.Lxts_dec_done2:
2457	movq	%r9,%rdx
2458	movq	%r11,%rcx
2459	movl	%r10d,%eax
2460
2461	movups	(%rdi),%xmm2
2462	xorps	%xmm11,%xmm2
2463	movups	(%rcx),%xmm0
2464	movups	16(%rcx),%xmm1
2465	leaq	32(%rcx),%rcx
2466	xorps	%xmm0,%xmm2
2467.Loop_dec1_13:
2468.byte	102,15,56,222,209
2469	decl	%eax
2470	movups	(%rcx),%xmm1
2471	leaq	16(%rcx),%rcx
2472	jnz	.Loop_dec1_13
2473.byte	102,15,56,223,209
2474	xorps	%xmm11,%xmm2
2475	movups	%xmm2,(%rsi)
2476
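/*
 * Ciphertext stealing for a final partial block: the block just written
 * above was decrypted with the next tweak (%xmm11); the loop below copies
 * the %r9 trailing ciphertext bytes into that block and moves the
 * displaced bytes out as the final partial output, then the reassembled
 * block is decrypted again with tweak %xmm10.
 */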
2477.Lxts_dec_steal:
2478	movzbl	16(%rdi),%eax
2479	movzbl	(%rsi),%ecx
2480	leaq	1(%rdi),%rdi
2481	movb	%al,(%rsi)
2482	movb	%cl,16(%rsi)
2483	leaq	1(%rsi),%rsi
2484	subq	$1,%rdx
2485	jnz	.Lxts_dec_steal
2486
2487	subq	%r9,%rsi
2488	movq	%r11,%rcx
2489	movl	%r10d,%eax
2490
2491	movups	(%rsi),%xmm2
2492	xorps	%xmm10,%xmm2
2493	movups	(%rcx),%xmm0
2494	movups	16(%rcx),%xmm1
2495	leaq	32(%rcx),%rcx
2496	xorps	%xmm0,%xmm2
2497.Loop_dec1_14:
2498.byte	102,15,56,222,209
2499	decl	%eax
2500	movups	(%rcx),%xmm1
2501	leaq	16(%rcx),%rcx
2502	jnz	.Loop_dec1_14
2503.byte	102,15,56,223,209
2504	xorps	%xmm10,%xmm2
2505	movups	%xmm2,(%rsi)
2506
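/*
 * Common exit: wipe every XMM register and the tweak scratch area on the
 * stack so no key schedule, tweak or plaintext material is left behind.
 */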
2507.Lxts_dec_ret:
2508	xorps	%xmm0,%xmm0
2509	pxor	%xmm1,%xmm1
2510	pxor	%xmm2,%xmm2
2511	pxor	%xmm3,%xmm3
2512	pxor	%xmm4,%xmm4
2513	pxor	%xmm5,%xmm5
2514	pxor	%xmm6,%xmm6
2515	pxor	%xmm7,%xmm7
2516	movaps	%xmm0,0(%rsp)
2517	pxor	%xmm8,%xmm8
2518	movaps	%xmm0,16(%rsp)
2519	pxor	%xmm9,%xmm9
2520	movaps	%xmm0,32(%rsp)
2521	pxor	%xmm10,%xmm10
2522	movaps	%xmm0,48(%rsp)
2523	pxor	%xmm11,%xmm11
2524	movaps	%xmm0,64(%rsp)
2525	pxor	%xmm12,%xmm12
2526	movaps	%xmm0,80(%rsp)
2527	pxor	%xmm13,%xmm13
2528	movaps	%xmm0,96(%rsp)
2529	pxor	%xmm14,%xmm14
2530	pxor	%xmm15,%xmm15
2531	leaq	(%rbp),%rsp
2532	popq	%rbp
2533.Lxts_dec_epilogue:
2534	.byte	0xf3,0xc3
2535.size	aesni_xts_decrypt,.-aesni_xts_decrypt
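/*
 * void aesni_cbc_encrypt(const void *in, void *out, size_t length,
 *                        const AES_KEY *key, unsigned char *ivec, int enc);
 * SysV AMD64 arguments: %rdi=in, %rsi=out, %rdx=length, %rcx=key,
 * %r8=ivec, %r9d=enc; a non-zero enc selects encryption, zero decryption.
 */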
2536.globl	aesni_cbc_encrypt
2537.type	aesni_cbc_encrypt,@function
2538.align	16
2539aesni_cbc_encrypt:
2540	testq	%rdx,%rdx
2541	jz	.Lcbc_ret
2542
2543	movl	240(%rcx),%r10d
2544	movq	%rcx,%r11
2545	testl	%r9d,%r9d
2546	jz	.Lcbc_decrypt
2547
2548	movups	(%r8),%xmm2
2549	movl	%r10d,%eax
2550	cmpq	$16,%rdx
2551	jb	.Lcbc_enc_tail
2552	subq	$16,%rdx
2553	jmp	.Lcbc_enc_loop
2554.align	16
2555.Lcbc_enc_loop:
2556	movups	(%rdi),%xmm3
2557	leaq	16(%rdi),%rdi
2558
2559	movups	(%rcx),%xmm0
2560	movups	16(%rcx),%xmm1
2561	xorps	%xmm0,%xmm3
2562	leaq	32(%rcx),%rcx
2563	xorps	%xmm3,%xmm2
2564.Loop_enc1_15:
2565.byte	102,15,56,220,209
2566	decl	%eax
2567	movups	(%rcx),%xmm1
2568	leaq	16(%rcx),%rcx
2569	jnz	.Loop_enc1_15
2570.byte	102,15,56,221,209
2571	movl	%r10d,%eax
2572	movq	%r11,%rcx
2573	movups	%xmm2,0(%rsi)
2574	leaq	16(%rsi),%rsi
2575	subq	$16,%rdx
2576	jnc	.Lcbc_enc_loop
2577	addq	$16,%rdx
2578	jnz	.Lcbc_enc_tail
2579	pxor	%xmm0,%xmm0
2580	pxor	%xmm1,%xmm1
2581	movups	%xmm2,(%r8)
2582	pxor	%xmm2,%xmm2
2583	pxor	%xmm3,%xmm3
2584	jmp	.Lcbc_ret
2585
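/*
 * Partial final block on the encrypt side: the remaining bytes are copied
 * into place and the block is zero-padded to 16 bytes.  The .long words
 * below are hand-encoded "rep movsb" (0xf3 0xa4) and "rep stosb"
 * (0xf3 0xaa), each followed by a two-byte nop.
 */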
2586.Lcbc_enc_tail:
2587	movq	%rdx,%rcx
2588	xchgq	%rdi,%rsi
2589.long	0x9066A4F3
2590	movl	$16,%ecx
2591	subq	%rdx,%rcx
2592	xorl	%eax,%eax
2593.long	0x9066AAF3
2594	leaq	-16(%rdi),%rdi
2595	movl	%r10d,%eax
2596	movq	%rdi,%rsi
2597	movq	%r11,%rcx
2598	xorq	%rdx,%rdx
2599	jmp	.Lcbc_enc_loop
2600
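/*
 * CBC decryption.  A single 16-byte input is handled inline below without
 * setting up a stack frame; anything longer goes through the bulk path.
 */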
2601.align	16
2602.Lcbc_decrypt:
2603	cmpq	$16,%rdx
2604	jne	.Lcbc_decrypt_bulk
2605
2606
2607
2608	movdqu	(%rdi),%xmm2
2609	movdqu	(%r8),%xmm3
2610	movdqa	%xmm2,%xmm4
2611	movups	(%rcx),%xmm0
2612	movups	16(%rcx),%xmm1
2613	leaq	32(%rcx),%rcx
2614	xorps	%xmm0,%xmm2
2615.Loop_dec1_16:
2616.byte	102,15,56,222,209
2617	decl	%r10d
2618	movups	(%rcx),%xmm1
2619	leaq	16(%rcx),%rcx
2620	jnz	.Loop_dec1_16
2621.byte	102,15,56,223,209
2622	pxor	%xmm0,%xmm0
2623	pxor	%xmm1,%xmm1
2624	movdqu	%xmm4,(%r8)
2625	xorps	%xmm3,%xmm2
2626	pxor	%xmm3,%xmm3
2627	movups	%xmm2,(%rsi)
2628	pxor	%xmm2,%xmm2
2629	jmp	.Lcbc_ret
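/*
 * Bulk CBC decryption: a 16-byte aligned scratch slot is reserved for the
 * partial-block tail, the running IV/previous ciphertext lives in %xmm10,
 * and up to eight blocks are decrypted per iteration with the rounds
 * interleaved across blocks (the .byte 102,...,222/223 runs are
 * hand-encoded aesdec/aesdeclast, kept as bytes for older assemblers).
 * OPENSSL_ia32cap_P is consulted to fall back to a six-block loop on
 * cores where the eight-way interleave presumably does not pay off.
 */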
2630.align	16
2631.Lcbc_decrypt_bulk:
2632	leaq	(%rsp),%rax
2633	pushq	%rbp
2634	subq	$16,%rsp
2635	andq	$-16,%rsp
2636	leaq	-8(%rax),%rbp
2637	movups	(%r8),%xmm10
2638	movl	%r10d,%eax
2639	cmpq	$0x50,%rdx
2640	jbe	.Lcbc_dec_tail
2641
2642	movups	(%rcx),%xmm0
2643	movdqu	0(%rdi),%xmm2
2644	movdqu	16(%rdi),%xmm3
2645	movdqa	%xmm2,%xmm11
2646	movdqu	32(%rdi),%xmm4
2647	movdqa	%xmm3,%xmm12
2648	movdqu	48(%rdi),%xmm5
2649	movdqa	%xmm4,%xmm13
2650	movdqu	64(%rdi),%xmm6
2651	movdqa	%xmm5,%xmm14
2652	movdqu	80(%rdi),%xmm7
2653	movdqa	%xmm6,%xmm15
2654	movl	OPENSSL_ia32cap_P+4(%rip),%r9d
2655	cmpq	$0x70,%rdx
2656	jbe	.Lcbc_dec_six_or_seven
2657
2658	andl	$71303168,%r9d
2659	subq	$0x50,%rdx
2660	cmpl	$4194304,%r9d
2661	je	.Lcbc_dec_loop6_enter
2662	subq	$0x20,%rdx
2663	leaq	112(%rcx),%rcx
2664	jmp	.Lcbc_dec_loop8_enter
2665.align	16
2666.Lcbc_dec_loop8:
2667	movups	%xmm9,(%rsi)
2668	leaq	16(%rsi),%rsi
2669.Lcbc_dec_loop8_enter:
2670	movdqu	96(%rdi),%xmm8
2671	pxor	%xmm0,%xmm2
2672	movdqu	112(%rdi),%xmm9
2673	pxor	%xmm0,%xmm3
2674	movups	16-112(%rcx),%xmm1
2675	pxor	%xmm0,%xmm4
2676	xorq	%r11,%r11
2677	cmpq	$0x70,%rdx
2678	pxor	%xmm0,%xmm5
2679	pxor	%xmm0,%xmm6
2680	pxor	%xmm0,%xmm7
2681	pxor	%xmm0,%xmm8
2682
2683.byte	102,15,56,222,209
2684	pxor	%xmm0,%xmm9
2685	movups	32-112(%rcx),%xmm0
2686.byte	102,15,56,222,217
2687.byte	102,15,56,222,225
2688.byte	102,15,56,222,233
2689.byte	102,15,56,222,241
2690.byte	102,15,56,222,249
2691.byte	102,68,15,56,222,193
2692	setnc	%r11b
2693	shlq	$7,%r11
2694.byte	102,68,15,56,222,201
2695	addq	%rdi,%r11
2696	movups	48-112(%rcx),%xmm1
2697.byte	102,15,56,222,208
2698.byte	102,15,56,222,216
2699.byte	102,15,56,222,224
2700.byte	102,15,56,222,232
2701.byte	102,15,56,222,240
2702.byte	102,15,56,222,248
2703.byte	102,68,15,56,222,192
2704.byte	102,68,15,56,222,200
2705	movups	64-112(%rcx),%xmm0
2706	nop
2707.byte	102,15,56,222,209
2708.byte	102,15,56,222,217
2709.byte	102,15,56,222,225
2710.byte	102,15,56,222,233
2711.byte	102,15,56,222,241
2712.byte	102,15,56,222,249
2713.byte	102,68,15,56,222,193
2714.byte	102,68,15,56,222,201
2715	movups	80-112(%rcx),%xmm1
2716	nop
2717.byte	102,15,56,222,208
2718.byte	102,15,56,222,216
2719.byte	102,15,56,222,224
2720.byte	102,15,56,222,232
2721.byte	102,15,56,222,240
2722.byte	102,15,56,222,248
2723.byte	102,68,15,56,222,192
2724.byte	102,68,15,56,222,200
2725	movups	96-112(%rcx),%xmm0
2726	nop
2727.byte	102,15,56,222,209
2728.byte	102,15,56,222,217
2729.byte	102,15,56,222,225
2730.byte	102,15,56,222,233
2731.byte	102,15,56,222,241
2732.byte	102,15,56,222,249
2733.byte	102,68,15,56,222,193
2734.byte	102,68,15,56,222,201
2735	movups	112-112(%rcx),%xmm1
2736	nop
2737.byte	102,15,56,222,208
2738.byte	102,15,56,222,216
2739.byte	102,15,56,222,224
2740.byte	102,15,56,222,232
2741.byte	102,15,56,222,240
2742.byte	102,15,56,222,248
2743.byte	102,68,15,56,222,192
2744.byte	102,68,15,56,222,200
2745	movups	128-112(%rcx),%xmm0
2746	nop
2747.byte	102,15,56,222,209
2748.byte	102,15,56,222,217
2749.byte	102,15,56,222,225
2750.byte	102,15,56,222,233
2751.byte	102,15,56,222,241
2752.byte	102,15,56,222,249
2753.byte	102,68,15,56,222,193
2754.byte	102,68,15,56,222,201
2755	movups	144-112(%rcx),%xmm1
2756	cmpl	$11,%eax
2757.byte	102,15,56,222,208
2758.byte	102,15,56,222,216
2759.byte	102,15,56,222,224
2760.byte	102,15,56,222,232
2761.byte	102,15,56,222,240
2762.byte	102,15,56,222,248
2763.byte	102,68,15,56,222,192
2764.byte	102,68,15,56,222,200
2765	movups	160-112(%rcx),%xmm0
2766	jb	.Lcbc_dec_done
2767.byte	102,15,56,222,209
2768.byte	102,15,56,222,217
2769.byte	102,15,56,222,225
2770.byte	102,15,56,222,233
2771.byte	102,15,56,222,241
2772.byte	102,15,56,222,249
2773.byte	102,68,15,56,222,193
2774.byte	102,68,15,56,222,201
2775	movups	176-112(%rcx),%xmm1
2776	nop
2777.byte	102,15,56,222,208
2778.byte	102,15,56,222,216
2779.byte	102,15,56,222,224
2780.byte	102,15,56,222,232
2781.byte	102,15,56,222,240
2782.byte	102,15,56,222,248
2783.byte	102,68,15,56,222,192
2784.byte	102,68,15,56,222,200
2785	movups	192-112(%rcx),%xmm0
2786	je	.Lcbc_dec_done
2787.byte	102,15,56,222,209
2788.byte	102,15,56,222,217
2789.byte	102,15,56,222,225
2790.byte	102,15,56,222,233
2791.byte	102,15,56,222,241
2792.byte	102,15,56,222,249
2793.byte	102,68,15,56,222,193
2794.byte	102,68,15,56,222,201
2795	movups	208-112(%rcx),%xmm1
2796	nop
2797.byte	102,15,56,222,208
2798.byte	102,15,56,222,216
2799.byte	102,15,56,222,224
2800.byte	102,15,56,222,232
2801.byte	102,15,56,222,240
2802.byte	102,15,56,222,248
2803.byte	102,68,15,56,222,192
2804.byte	102,68,15,56,222,200
2805	movups	224-112(%rcx),%xmm0
2806	jmp	.Lcbc_dec_done
2807.align	16
2808.Lcbc_dec_done:
2809.byte	102,15,56,222,209
2810.byte	102,15,56,222,217
2811	pxor	%xmm0,%xmm10
2812	pxor	%xmm0,%xmm11
2813.byte	102,15,56,222,225
2814.byte	102,15,56,222,233
2815	pxor	%xmm0,%xmm12
2816	pxor	%xmm0,%xmm13
2817.byte	102,15,56,222,241
2818.byte	102,15,56,222,249
2819	pxor	%xmm0,%xmm14
2820	pxor	%xmm0,%xmm15
2821.byte	102,68,15,56,222,193
2822.byte	102,68,15,56,222,201
2823	movdqu	80(%rdi),%xmm1
2824
2825.byte	102,65,15,56,223,210
2826	movdqu	96(%rdi),%xmm10
2827	pxor	%xmm0,%xmm1
2828.byte	102,65,15,56,223,219
2829	pxor	%xmm0,%xmm10
2830	movdqu	112(%rdi),%xmm0
2831.byte	102,65,15,56,223,228
2832	leaq	128(%rdi),%rdi
2833	movdqu	0(%r11),%xmm11
2834.byte	102,65,15,56,223,237
2835.byte	102,65,15,56,223,246
2836	movdqu	16(%r11),%xmm12
2837	movdqu	32(%r11),%xmm13
2838.byte	102,65,15,56,223,255
2839.byte	102,68,15,56,223,193
2840	movdqu	48(%r11),%xmm14
2841	movdqu	64(%r11),%xmm15
2842.byte	102,69,15,56,223,202
2843	movdqa	%xmm0,%xmm10
2844	movdqu	80(%r11),%xmm1
2845	movups	-112(%rcx),%xmm0
2846
2847	movups	%xmm2,(%rsi)
2848	movdqa	%xmm11,%xmm2
2849	movups	%xmm3,16(%rsi)
2850	movdqa	%xmm12,%xmm3
2851	movups	%xmm4,32(%rsi)
2852	movdqa	%xmm13,%xmm4
2853	movups	%xmm5,48(%rsi)
2854	movdqa	%xmm14,%xmm5
2855	movups	%xmm6,64(%rsi)
2856	movdqa	%xmm15,%xmm6
2857	movups	%xmm7,80(%rsi)
2858	movdqa	%xmm1,%xmm7
2859	movups	%xmm8,96(%rsi)
2860	leaq	112(%rsi),%rsi
2861
2862	subq	$0x80,%rdx
2863	ja	.Lcbc_dec_loop8
2864
2865	movaps	%xmm9,%xmm2
2866	leaq	-112(%rcx),%rcx
2867	addq	$0x70,%rdx
2868	jle	.Lcbc_dec_clear_tail_collected
2869	movups	%xmm9,(%rsi)
2870	leaq	16(%rsi),%rsi
2871	cmpq	$0x50,%rdx
2872	jbe	.Lcbc_dec_tail
2873
2874	movaps	%xmm11,%xmm2
2875.Lcbc_dec_six_or_seven:
2876	cmpq	$0x60,%rdx
2877	ja	.Lcbc_dec_seven
2878
2879	movaps	%xmm7,%xmm8
2880	call	_aesni_decrypt6
2881	pxor	%xmm10,%xmm2
2882	movaps	%xmm8,%xmm10
2883	pxor	%xmm11,%xmm3
2884	movdqu	%xmm2,(%rsi)
2885	pxor	%xmm12,%xmm4
2886	movdqu	%xmm3,16(%rsi)
2887	pxor	%xmm3,%xmm3
2888	pxor	%xmm13,%xmm5
2889	movdqu	%xmm4,32(%rsi)
2890	pxor	%xmm4,%xmm4
2891	pxor	%xmm14,%xmm6
2892	movdqu	%xmm5,48(%rsi)
2893	pxor	%xmm5,%xmm5
2894	pxor	%xmm15,%xmm7
2895	movdqu	%xmm6,64(%rsi)
2896	pxor	%xmm6,%xmm6
2897	leaq	80(%rsi),%rsi
2898	movdqa	%xmm7,%xmm2
2899	pxor	%xmm7,%xmm7
2900	jmp	.Lcbc_dec_tail_collected
2901
2902.align	16
2903.Lcbc_dec_seven:
2904	movups	96(%rdi),%xmm8
2905	xorps	%xmm9,%xmm9
2906	call	_aesni_decrypt8
2907	movups	80(%rdi),%xmm9
2908	pxor	%xmm10,%xmm2
2909	movups	96(%rdi),%xmm10
2910	pxor	%xmm11,%xmm3
2911	movdqu	%xmm2,(%rsi)
2912	pxor	%xmm12,%xmm4
2913	movdqu	%xmm3,16(%rsi)
2914	pxor	%xmm3,%xmm3
2915	pxor	%xmm13,%xmm5
2916	movdqu	%xmm4,32(%rsi)
2917	pxor	%xmm4,%xmm4
2918	pxor	%xmm14,%xmm6
2919	movdqu	%xmm5,48(%rsi)
2920	pxor	%xmm5,%xmm5
2921	pxor	%xmm15,%xmm7
2922	movdqu	%xmm6,64(%rsi)
2923	pxor	%xmm6,%xmm6
2924	pxor	%xmm9,%xmm8
2925	movdqu	%xmm7,80(%rsi)
2926	pxor	%xmm7,%xmm7
2927	leaq	96(%rsi),%rsi
2928	movdqa	%xmm8,%xmm2
2929	pxor	%xmm8,%xmm8
2930	pxor	%xmm9,%xmm9
2931	jmp	.Lcbc_dec_tail_collected
2932
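/*
 * Six-blocks-per-iteration variant selected by the capability check above.
 */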
2933.align	16
2934.Lcbc_dec_loop6:
2935	movups	%xmm7,(%rsi)
2936	leaq	16(%rsi),%rsi
2937	movdqu	0(%rdi),%xmm2
2938	movdqu	16(%rdi),%xmm3
2939	movdqa	%xmm2,%xmm11
2940	movdqu	32(%rdi),%xmm4
2941	movdqa	%xmm3,%xmm12
2942	movdqu	48(%rdi),%xmm5
2943	movdqa	%xmm4,%xmm13
2944	movdqu	64(%rdi),%xmm6
2945	movdqa	%xmm5,%xmm14
2946	movdqu	80(%rdi),%xmm7
2947	movdqa	%xmm6,%xmm15
2948.Lcbc_dec_loop6_enter:
2949	leaq	96(%rdi),%rdi
2950	movdqa	%xmm7,%xmm8
2951
2952	call	_aesni_decrypt6
2953
2954	pxor	%xmm10,%xmm2
2955	movdqa	%xmm8,%xmm10
2956	pxor	%xmm11,%xmm3
2957	movdqu	%xmm2,(%rsi)
2958	pxor	%xmm12,%xmm4
2959	movdqu	%xmm3,16(%rsi)
2960	pxor	%xmm13,%xmm5
2961	movdqu	%xmm4,32(%rsi)
2962	pxor	%xmm14,%xmm6
2963	movq	%r11,%rcx
2964	movdqu	%xmm5,48(%rsi)
2965	pxor	%xmm15,%xmm7
2966	movl	%r10d,%eax
2967	movdqu	%xmm6,64(%rsi)
2968	leaq	80(%rsi),%rsi
2969	subq	$0x60,%rdx
2970	ja	.Lcbc_dec_loop6
2971
2972	movdqa	%xmm7,%xmm2
2973	addq	$0x50,%rdx
2974	jle	.Lcbc_dec_clear_tail_collected
2975	movups	%xmm7,(%rsi)
2976	leaq	16(%rsi),%rsi
2977
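/*
 * Tail: decrypt the last one to five whole blocks; a trailing partial
 * block is staged through the stack slot at .Lcbc_dec_tail_partial and
 * copied out with rep movsb (.long 0x9066A4F3).
 */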
2978.Lcbc_dec_tail:
2979	movups	(%rdi),%xmm2
2980	subq	$0x10,%rdx
2981	jbe	.Lcbc_dec_one
2982
2983	movups	16(%rdi),%xmm3
2984	movaps	%xmm2,%xmm11
2985	subq	$0x10,%rdx
2986	jbe	.Lcbc_dec_two
2987
2988	movups	32(%rdi),%xmm4
2989	movaps	%xmm3,%xmm12
2990	subq	$0x10,%rdx
2991	jbe	.Lcbc_dec_three
2992
2993	movups	48(%rdi),%xmm5
2994	movaps	%xmm4,%xmm13
2995	subq	$0x10,%rdx
2996	jbe	.Lcbc_dec_four
2997
2998	movups	64(%rdi),%xmm6
2999	movaps	%xmm5,%xmm14
3000	movaps	%xmm6,%xmm15
3001	xorps	%xmm7,%xmm7
3002	call	_aesni_decrypt6
3003	pxor	%xmm10,%xmm2
3004	movaps	%xmm15,%xmm10
3005	pxor	%xmm11,%xmm3
3006	movdqu	%xmm2,(%rsi)
3007	pxor	%xmm12,%xmm4
3008	movdqu	%xmm3,16(%rsi)
3009	pxor	%xmm3,%xmm3
3010	pxor	%xmm13,%xmm5
3011	movdqu	%xmm4,32(%rsi)
3012	pxor	%xmm4,%xmm4
3013	pxor	%xmm14,%xmm6
3014	movdqu	%xmm5,48(%rsi)
3015	pxor	%xmm5,%xmm5
3016	leaq	64(%rsi),%rsi
3017	movdqa	%xmm6,%xmm2
3018	pxor	%xmm6,%xmm6
3019	pxor	%xmm7,%xmm7
3020	subq	$0x10,%rdx
3021	jmp	.Lcbc_dec_tail_collected
3022
3023.align	16
3024.Lcbc_dec_one:
3025	movaps	%xmm2,%xmm11
3026	movups	(%rcx),%xmm0
3027	movups	16(%rcx),%xmm1
3028	leaq	32(%rcx),%rcx
3029	xorps	%xmm0,%xmm2
3030.Loop_dec1_17:
3031.byte	102,15,56,222,209
3032	decl	%eax
3033	movups	(%rcx),%xmm1
3034	leaq	16(%rcx),%rcx
3035	jnz	.Loop_dec1_17
3036.byte	102,15,56,223,209
3037	xorps	%xmm10,%xmm2
3038	movaps	%xmm11,%xmm10
3039	jmp	.Lcbc_dec_tail_collected
3040.align	16
3041.Lcbc_dec_two:
3042	movaps	%xmm3,%xmm12
3043	call	_aesni_decrypt2
3044	pxor	%xmm10,%xmm2
3045	movaps	%xmm12,%xmm10
3046	pxor	%xmm11,%xmm3
3047	movdqu	%xmm2,(%rsi)
3048	movdqa	%xmm3,%xmm2
3049	pxor	%xmm3,%xmm3
3050	leaq	16(%rsi),%rsi
3051	jmp	.Lcbc_dec_tail_collected
3052.align	16
3053.Lcbc_dec_three:
3054	movaps	%xmm4,%xmm13
3055	call	_aesni_decrypt3
3056	pxor	%xmm10,%xmm2
3057	movaps	%xmm13,%xmm10
3058	pxor	%xmm11,%xmm3
3059	movdqu	%xmm2,(%rsi)
3060	pxor	%xmm12,%xmm4
3061	movdqu	%xmm3,16(%rsi)
3062	pxor	%xmm3,%xmm3
3063	movdqa	%xmm4,%xmm2
3064	pxor	%xmm4,%xmm4
3065	leaq	32(%rsi),%rsi
3066	jmp	.Lcbc_dec_tail_collected
3067.align	16
3068.Lcbc_dec_four:
3069	movaps	%xmm5,%xmm14
3070	call	_aesni_decrypt4
3071	pxor	%xmm10,%xmm2
3072	movaps	%xmm14,%xmm10
3073	pxor	%xmm11,%xmm3
3074	movdqu	%xmm2,(%rsi)
3075	pxor	%xmm12,%xmm4
3076	movdqu	%xmm3,16(%rsi)
3077	pxor	%xmm3,%xmm3
3078	pxor	%xmm13,%xmm5
3079	movdqu	%xmm4,32(%rsi)
3080	pxor	%xmm4,%xmm4
3081	movdqa	%xmm5,%xmm2
3082	pxor	%xmm5,%xmm5
3083	leaq	48(%rsi),%rsi
3084	jmp	.Lcbc_dec_tail_collected
3085
3086.align	16
3087.Lcbc_dec_clear_tail_collected:
3088	pxor	%xmm3,%xmm3
3089	pxor	%xmm4,%xmm4
3090	pxor	%xmm5,%xmm5
3091	pxor	%xmm6,%xmm6
3092	pxor	%xmm7,%xmm7
3093	pxor	%xmm8,%xmm8
3094	pxor	%xmm9,%xmm9
3095.Lcbc_dec_tail_collected:
3096	movups	%xmm10,(%r8)
3097	andq	$15,%rdx
3098	jnz	.Lcbc_dec_tail_partial
3099	movups	%xmm2,(%rsi)
3100	pxor	%xmm2,%xmm2
3101	jmp	.Lcbc_dec_ret
3102.align	16
3103.Lcbc_dec_tail_partial:
3104	movaps	%xmm2,(%rsp)
3105	pxor	%xmm2,%xmm2
3106	movq	$16,%rcx
3107	movq	%rsi,%rdi
3108	subq	%rdx,%rcx
3109	leaq	(%rsp),%rsi
3110.long	0x9066A4F3
3111	movdqa	%xmm2,(%rsp)
3112
3113.Lcbc_dec_ret:
3114	xorps	%xmm0,%xmm0
3115	pxor	%xmm1,%xmm1
3116	leaq	(%rbp),%rsp
3117	popq	%rbp
3118.Lcbc_ret:
3119	.byte	0xf3,0xc3
3120.size	aesni_cbc_encrypt,.-aesni_cbc_encrypt
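/*
 * int aesni_set_decrypt_key(const unsigned char *userKey, int bits,
 *                           AES_KEY *key);
 * Builds the encryption schedule via __aesni_set_encrypt_key, then swaps
 * the round keys end-for-end and applies aesimc (.byte 102,15,56,219,...)
 * to the inner ones to form the decryption schedule.
 */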
3121.globl	aesni_set_decrypt_key
3122.type	aesni_set_decrypt_key,@function
3123.align	16
3124aesni_set_decrypt_key:
3125.byte	0x48,0x83,0xEC,0x08
3126	call	__aesni_set_encrypt_key
3127	shll	$4,%esi
3128	testl	%eax,%eax
3129	jnz	.Ldec_key_ret
3130	leaq	16(%rdx,%rsi,1),%rdi
3131
3132	movups	(%rdx),%xmm0
3133	movups	(%rdi),%xmm1
3134	movups	%xmm0,(%rdi)
3135	movups	%xmm1,(%rdx)
3136	leaq	16(%rdx),%rdx
3137	leaq	-16(%rdi),%rdi
3138
3139.Ldec_key_inverse:
3140	movups	(%rdx),%xmm0
3141	movups	(%rdi),%xmm1
3142.byte	102,15,56,219,192
3143.byte	102,15,56,219,201
3144	leaq	16(%rdx),%rdx
3145	leaq	-16(%rdi),%rdi
3146	movups	%xmm0,16(%rdi)
3147	movups	%xmm1,-16(%rdx)
3148	cmpq	%rdx,%rdi
3149	ja	.Ldec_key_inverse
3150
3151	movups	(%rdx),%xmm0
3152.byte	102,15,56,219,192
3153	pxor	%xmm1,%xmm1
3154	movups	%xmm0,(%rdi)
3155	pxor	%xmm0,%xmm0
3156.Ldec_key_ret:
3157	addq	$8,%rsp
3158	.byte	0xf3,0xc3
3159.LSEH_end_set_decrypt_key:
3160.size	aesni_set_decrypt_key,.-aesni_set_decrypt_key
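/*
 * int aesni_set_encrypt_key(const unsigned char *userKey, int bits,
 *                           AES_KEY *key);
 * Returns 0 on success, -1 for a NULL argument and -2 for an unsupported
 * key length; the round count consumed by the aesni_* routines is stored
 * at offset 240 of the schedule.  OPENSSL_ia32cap_P selects between the
 * aeskeygenassist-based expansion and the _alt variants below.
 */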
3161.globl	aesni_set_encrypt_key
3162.type	aesni_set_encrypt_key,@function
3163.align	16
3164aesni_set_encrypt_key:
3165__aesni_set_encrypt_key:
3166.byte	0x48,0x83,0xEC,0x08
3167	movq	$-1,%rax
3168	testq	%rdi,%rdi
3169	jz	.Lenc_key_ret
3170	testq	%rdx,%rdx
3171	jz	.Lenc_key_ret
3172
3173	movl	$268437504,%r10d
3174	movups	(%rdi),%xmm0
3175	xorps	%xmm4,%xmm4
3176	andl	OPENSSL_ia32cap_P+4(%rip),%r10d
3177	leaq	16(%rdx),%rax
3178	cmpl	$256,%esi
3179	je	.L14rounds
3180	cmpl	$192,%esi
3181	je	.L12rounds
3182	cmpl	$128,%esi
3183	jne	.Lbad_keybits
3184
3185.L10rounds:
3186	movl	$9,%esi
3187	cmpl	$268435456,%r10d
3188	je	.L10rounds_alt
3189
3190	movups	%xmm0,(%rdx)
3191.byte	102,15,58,223,200,1
3192	call	.Lkey_expansion_128_cold
3193.byte	102,15,58,223,200,2
3194	call	.Lkey_expansion_128
3195.byte	102,15,58,223,200,4
3196	call	.Lkey_expansion_128
3197.byte	102,15,58,223,200,8
3198	call	.Lkey_expansion_128
3199.byte	102,15,58,223,200,16
3200	call	.Lkey_expansion_128
3201.byte	102,15,58,223,200,32
3202	call	.Lkey_expansion_128
3203.byte	102,15,58,223,200,64
3204	call	.Lkey_expansion_128
3205.byte	102,15,58,223,200,128
3206	call	.Lkey_expansion_128
3207.byte	102,15,58,223,200,27
3208	call	.Lkey_expansion_128
3209.byte	102,15,58,223,200,54
3210	call	.Lkey_expansion_128
3211	movups	%xmm0,(%rax)
3212	movl	%esi,80(%rax)
3213	xorl	%eax,%eax
3214	jmp	.Lenc_key_ret
3215
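/*
 * The _alt expansions avoid aeskeygenassist: pshufb (.byte 102,15,56,0,...)
 * broadcasts the rotated last word of the previous round key and
 * aesenclast (.byte 102,15,56,221,...) against the rcon in %xmm4 supplies
 * SubWord, with the rcon doubled by pslld $1 each round.
 */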
3216.align	16
3217.L10rounds_alt:
3218	movdqa	.Lkey_rotate(%rip),%xmm5
3219	movl	$8,%r10d
3220	movdqa	.Lkey_rcon1(%rip),%xmm4
3221	movdqa	%xmm0,%xmm2
3222	movdqu	%xmm0,(%rdx)
3223	jmp	.Loop_key128
3224
3225.align	16
3226.Loop_key128:
3227.byte	102,15,56,0,197
3228.byte	102,15,56,221,196
3229	pslld	$1,%xmm4
3230	leaq	16(%rax),%rax
3231
3232	movdqa	%xmm2,%xmm3
3233	pslldq	$4,%xmm2
3234	pxor	%xmm2,%xmm3
3235	pslldq	$4,%xmm2
3236	pxor	%xmm2,%xmm3
3237	pslldq	$4,%xmm2
3238	pxor	%xmm3,%xmm2
3239
3240	pxor	%xmm2,%xmm0
3241	movdqu	%xmm0,-16(%rax)
3242	movdqa	%xmm0,%xmm2
3243
3244	decl	%r10d
3245	jnz	.Loop_key128
3246
3247	movdqa	.Lkey_rcon1b(%rip),%xmm4
3248
3249.byte	102,15,56,0,197
3250.byte	102,15,56,221,196
3251	pslld	$1,%xmm4
3252
3253	movdqa	%xmm2,%xmm3
3254	pslldq	$4,%xmm2
3255	pxor	%xmm2,%xmm3
3256	pslldq	$4,%xmm2
3257	pxor	%xmm2,%xmm3
3258	pslldq	$4,%xmm2
3259	pxor	%xmm3,%xmm2
3260
3261	pxor	%xmm2,%xmm0
3262	movdqu	%xmm0,(%rax)
3263
3264	movdqa	%xmm0,%xmm2
3265.byte	102,15,56,0,197
3266.byte	102,15,56,221,196
3267
3268	movdqa	%xmm2,%xmm3
3269	pslldq	$4,%xmm2
3270	pxor	%xmm2,%xmm3
3271	pslldq	$4,%xmm2
3272	pxor	%xmm2,%xmm3
3273	pslldq	$4,%xmm2
3274	pxor	%xmm3,%xmm2
3275
3276	pxor	%xmm2,%xmm0
3277	movdqu	%xmm0,16(%rax)
3278
3279	movl	%esi,96(%rax)
3280	xorl	%eax,%eax
3281	jmp	.Lenc_key_ret
3282
3283.align	16
3284.L12rounds:
3285	movq	16(%rdi),%xmm2
3286	movl	$11,%esi
3287	cmpl	$268435456,%r10d
3288	je	.L12rounds_alt
3289
3290	movups	%xmm0,(%rdx)
3291.byte	102,15,58,223,202,1
3292	call	.Lkey_expansion_192a_cold
3293.byte	102,15,58,223,202,2
3294	call	.Lkey_expansion_192b
3295.byte	102,15,58,223,202,4
3296	call	.Lkey_expansion_192a
3297.byte	102,15,58,223,202,8
3298	call	.Lkey_expansion_192b
3299.byte	102,15,58,223,202,16
3300	call	.Lkey_expansion_192a
3301.byte	102,15,58,223,202,32
3302	call	.Lkey_expansion_192b
3303.byte	102,15,58,223,202,64
3304	call	.Lkey_expansion_192a
3305.byte	102,15,58,223,202,128
3306	call	.Lkey_expansion_192b
3307	movups	%xmm0,(%rax)
3308	movl	%esi,48(%rax)
3309	xorq	%rax,%rax
3310	jmp	.Lenc_key_ret
3311
3312.align	16
3313.L12rounds_alt:
3314	movdqa	.Lkey_rotate192(%rip),%xmm5
3315	movdqa	.Lkey_rcon1(%rip),%xmm4
3316	movl	$8,%r10d
3317	movdqu	%xmm0,(%rdx)
3318	jmp	.Loop_key192
3319
3320.align	16
3321.Loop_key192:
3322	movq	%xmm2,0(%rax)
3323	movdqa	%xmm2,%xmm1
3324.byte	102,15,56,0,213
3325.byte	102,15,56,221,212
3326	pslld	$1,%xmm4
3327	leaq	24(%rax),%rax
3328
3329	movdqa	%xmm0,%xmm3
3330	pslldq	$4,%xmm0
3331	pxor	%xmm0,%xmm3
3332	pslldq	$4,%xmm0
3333	pxor	%xmm0,%xmm3
3334	pslldq	$4,%xmm0
3335	pxor	%xmm3,%xmm0
3336
3337	pshufd	$0xff,%xmm0,%xmm3
3338	pxor	%xmm1,%xmm3
3339	pslldq	$4,%xmm1
3340	pxor	%xmm1,%xmm3
3341
3342	pxor	%xmm2,%xmm0
3343	pxor	%xmm3,%xmm2
3344	movdqu	%xmm0,-16(%rax)
3345
3346	decl	%r10d
3347	jnz	.Loop_key192
3348
3349	movl	%esi,32(%rax)
3350	xorl	%eax,%eax
3351	jmp	.Lenc_key_ret
3352
3353.align	16
3354.L14rounds:
3355	movups	16(%rdi),%xmm2
3356	movl	$13,%esi
3357	leaq	16(%rax),%rax
3358	cmpl	$268435456,%r10d
3359	je	.L14rounds_alt
3360
3361	movups	%xmm0,(%rdx)
3362	movups	%xmm2,16(%rdx)
3363.byte	102,15,58,223,202,1
3364	call	.Lkey_expansion_256a_cold
3365.byte	102,15,58,223,200,1
3366	call	.Lkey_expansion_256b
3367.byte	102,15,58,223,202,2
3368	call	.Lkey_expansion_256a
3369.byte	102,15,58,223,200,2
3370	call	.Lkey_expansion_256b
3371.byte	102,15,58,223,202,4
3372	call	.Lkey_expansion_256a
3373.byte	102,15,58,223,200,4
3374	call	.Lkey_expansion_256b
3375.byte	102,15,58,223,202,8
3376	call	.Lkey_expansion_256a
3377.byte	102,15,58,223,200,8
3378	call	.Lkey_expansion_256b
3379.byte	102,15,58,223,202,16
3380	call	.Lkey_expansion_256a
3381.byte	102,15,58,223,200,16
3382	call	.Lkey_expansion_256b
3383.byte	102,15,58,223,202,32
3384	call	.Lkey_expansion_256a
3385.byte	102,15,58,223,200,32
3386	call	.Lkey_expansion_256b
3387.byte	102,15,58,223,202,64
3388	call	.Lkey_expansion_256a
3389	movups	%xmm0,(%rax)
3390	movl	%esi,16(%rax)
3391	xorq	%rax,%rax
3392	jmp	.Lenc_key_ret
3393
3394.align	16
3395.L14rounds_alt:
3396	movdqa	.Lkey_rotate(%rip),%xmm5
3397	movdqa	.Lkey_rcon1(%rip),%xmm4
3398	movl	$7,%r10d
3399	movdqu	%xmm0,0(%rdx)
3400	movdqa	%xmm2,%xmm1
3401	movdqu	%xmm2,16(%rdx)
3402	jmp	.Loop_key256
3403
3404.align	16
3405.Loop_key256:
3406.byte	102,15,56,0,213
3407.byte	102,15,56,221,212
3408
3409	movdqa	%xmm0,%xmm3
3410	pslldq	$4,%xmm0
3411	pxor	%xmm0,%xmm3
3412	pslldq	$4,%xmm0
3413	pxor	%xmm0,%xmm3
3414	pslldq	$4,%xmm0
3415	pxor	%xmm3,%xmm0
3416	pslld	$1,%xmm4
3417
3418	pxor	%xmm2,%xmm0
3419	movdqu	%xmm0,(%rax)
3420
3421	decl	%r10d
3422	jz	.Ldone_key256
3423
3424	pshufd	$0xff,%xmm0,%xmm2
3425	pxor	%xmm3,%xmm3
3426.byte	102,15,56,221,211
3427
3428	movdqa	%xmm1,%xmm3
3429	pslldq	$4,%xmm1
3430	pxor	%xmm1,%xmm3
3431	pslldq	$4,%xmm1
3432	pxor	%xmm1,%xmm3
3433	pslldq	$4,%xmm1
3434	pxor	%xmm3,%xmm1
3435
3436	pxor	%xmm1,%xmm2
3437	movdqu	%xmm2,16(%rax)
3438	leaq	32(%rax),%rax
3439	movdqa	%xmm2,%xmm1
3440
3441	jmp	.Loop_key256
3442
3443.Ldone_key256:
3444	movl	%esi,16(%rax)
3445	xorl	%eax,%eax
3446	jmp	.Lenc_key_ret
3447
3448.align	16
3449.Lbad_keybits:
3450	movq	$-2,%rax
3451.Lenc_key_ret:
3452	pxor	%xmm0,%xmm0
3453	pxor	%xmm1,%xmm1
3454	pxor	%xmm2,%xmm2
3455	pxor	%xmm3,%xmm3
3456	pxor	%xmm4,%xmm4
3457	pxor	%xmm5,%xmm5
3458	addq	$8,%rsp
3459	.byte	0xf3,0xc3
3460.LSEH_end_set_encrypt_key:
3461
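/*
 * Key-expansion helpers for the aeskeygenassist path: each performs one
 * FIPS-197 expansion step for its key size, consuming the aeskeygenassist
 * result in %xmm1 and using %xmm4 (zeroed by the caller) as shuffle
 * scratch.
 */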
3462.align	16
3463.Lkey_expansion_128:
3464	movups	%xmm0,(%rax)
3465	leaq	16(%rax),%rax
3466.Lkey_expansion_128_cold:
3467	shufps	$16,%xmm0,%xmm4
3468	xorps	%xmm4,%xmm0
3469	shufps	$140,%xmm0,%xmm4
3470	xorps	%xmm4,%xmm0
3471	shufps	$255,%xmm1,%xmm1
3472	xorps	%xmm1,%xmm0
3473	.byte	0xf3,0xc3
3474
3475.align	16
3476.Lkey_expansion_192a:
3477	movups	%xmm0,(%rax)
3478	leaq	16(%rax),%rax
3479.Lkey_expansion_192a_cold:
3480	movaps	%xmm2,%xmm5
3481.Lkey_expansion_192b_warm:
3482	shufps	$16,%xmm0,%xmm4
3483	movdqa	%xmm2,%xmm3
3484	xorps	%xmm4,%xmm0
3485	shufps	$140,%xmm0,%xmm4
3486	pslldq	$4,%xmm3
3487	xorps	%xmm4,%xmm0
3488	pshufd	$85,%xmm1,%xmm1
3489	pxor	%xmm3,%xmm2
3490	pxor	%xmm1,%xmm0
3491	pshufd	$255,%xmm0,%xmm3
3492	pxor	%xmm3,%xmm2
3493	.byte	0xf3,0xc3
3494
3495.align	16
3496.Lkey_expansion_192b:
3497	movaps	%xmm0,%xmm3
3498	shufps	$68,%xmm0,%xmm5
3499	movups	%xmm5,(%rax)
3500	shufps	$78,%xmm2,%xmm3
3501	movups	%xmm3,16(%rax)
3502	leaq	32(%rax),%rax
3503	jmp	.Lkey_expansion_192b_warm
3504
3505.align	16
3506.Lkey_expansion_256a:
3507	movups	%xmm2,(%rax)
3508	leaq	16(%rax),%rax
3509.Lkey_expansion_256a_cold:
3510	shufps	$16,%xmm0,%xmm4
3511	xorps	%xmm4,%xmm0
3512	shufps	$140,%xmm0,%xmm4
3513	xorps	%xmm4,%xmm0
3514	shufps	$255,%xmm1,%xmm1
3515	xorps	%xmm1,%xmm0
3516	.byte	0xf3,0xc3
3517
3518.align	16
3519.Lkey_expansion_256b:
3520	movups	%xmm0,(%rax)
3521	leaq	16(%rax),%rax
3522
3523	shufps	$16,%xmm2,%xmm4
3524	xorps	%xmm4,%xmm2
3525	shufps	$140,%xmm2,%xmm4
3526	xorps	%xmm4,%xmm2
3527	shufps	$170,%xmm1,%xmm1
3528	xorps	%xmm1,%xmm2
3529	.byte	0xf3,0xc3
3530.size	aesni_set_encrypt_key,.-aesni_set_encrypt_key
3531.size	__aesni_set_encrypt_key,.-__aesni_set_encrypt_key
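/*
 * Constant pool: byte-swap mask and counter increments for the CTR code,
 * the 0x87 feedback constant used to advance XTS tweaks, rotate/rcon
 * vectors for the _alt key schedules, and the CRYPTOGAMS credit string.
 */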
3532.align	64
3533.Lbswap_mask:
3534.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
3535.Lincrement32:
3536.long	6,6,6,0
3537.Lincrement64:
3538.long	1,0,0,0
3539.Lxts_magic:
3540.long	0x87,0,1,0
3541.Lincrement1:
3542.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3543.Lkey_rotate:
3544.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
3545.Lkey_rotate192:
3546.long	0x04070605,0x04070605,0x04070605,0x04070605
3547.Lkey_rcon1:
3548.long	1,1,1,1
3549.Lkey_rcon1b:
3550.long	0x1b,0x1b,0x1b,0x1b
3551
3552.byte	65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
3553.align	64
3554