aesni-mb-x86_64.S revision 290207
1	# $FreeBSD: head/secure/lib/libcrypto/amd64/aesni-mb-x86_64.S 290207 2015-10-30 20:51:33Z jkim $
2.text
3
4
5
# aesni_multi_cbc_encrypt -- four-lane interleaved AES-NI CBC encryption.
#
# NOTE(review): the decimal fused to the front of each original line is a
# listing artifact carried over from the source view; kept verbatim here.
#
# Inputs, as consumed by the code below (System V AMD64 argument registers):
#   %rdi  array of 40-byte lane descriptors, read four at a time
#         (160 bytes per group):
#           +0   input pointer
#           +8   output pointer
#           +16  signed 32-bit count of 16-byte blocks
#           +24  16-byte chaining value (IV)
#   %rsi  key schedule: 16-byte round keys from +0, round-count field
#         (32-bit) at +240
#   %edx  number of four-descriptor groups; assumed >= 1, a zero value is
#         not checked here -- TODO confirm with callers
#
# Register roles inside the loops:
#   %r8-%r11    lane 0..3 input pointers
#   %r12-%r15   lane 0..3 output pointers
#   %xmm2-%xmm5 lane 0..3 cipher state
#   %xmm6-%xmm9 lane 0..3 next input block
#   %xmm10      four 32-bit remaining-block counters (mirrors 32(%rsp))
#   %xmm11      per-lane "counter > 0" mask
#   %xmm12      round-0 key (briefly zeroed for the counter compare)
#   %xmm0/%xmm1 alternating round keys
#   %rbx        byte offset into every lane, +16 per block
#   %rbp        sink pointer substituted for finished lanes
#   %eax        round-count field, %edx blocks left in this group
#
# Stack frame (64-byte aligned):
#   0(%rsp)   16 bytes: dummy input for empty lanes, sink for stores
#   16(%rsp)  entry value of %rsp
#   24(%rsp)  remaining group count
#   32(%rsp)  four 32-bit lane counters
#
# A group whose four counts are all <= 0 ends the function at once; later
# groups are then not visited.
# Callee-saved %rbx, %rbp, %r12-%r15 are pushed on entry and reloaded on exit.
6.globl	aesni_multi_cbc_encrypt
7.type	aesni_multi_cbc_encrypt,@function
8.align	32
9aesni_multi_cbc_encrypt:
10	movq	%rsp,%rax	# keep entry %rsp for the epilogue
11	pushq	%rbx
12	pushq	%rbp
13	pushq	%r12
14	pushq	%r13
15	pushq	%r14
16	pushq	%r15
17
18
19
20
21
22
23	subq	$48,%rsp	# scratch frame
24	andq	$-64,%rsp	# align frame to 64 bytes
25	movq	%rax,16(%rsp)	# stash entry %rsp
26
27.Lenc4x_body:
28	movdqu	(%rsi),%xmm12	# round-0 key
29	leaq	120(%rsi),%rsi	# bias key pointer; later offsets are written as N-120
30	leaq	80(%rdi),%rdi	# bias descriptor pointer to the middle of the group
31
# Per group: load four descriptors, record each count at 32..44(%rsp),
# track the largest count in %edx, and point empty lanes at the stack.
32.Lenc4x_loop_grande:
33	movl	%edx,24(%rsp)	# save remaining group count
34	xorl	%edx,%edx	# running maximum of lane counts
35	movl	-64(%rdi),%ecx	# lane 0 block count
36	movq	-80(%rdi),%r8	# lane 0 input
37	cmpl	%edx,%ecx
38	movq	-72(%rdi),%r12	# lane 0 output
39	cmovgl	%ecx,%edx	# signed max
40	testl	%ecx,%ecx
41	movdqu	-56(%rdi),%xmm2	# lane 0 IV
42	movl	%ecx,32(%rsp)
43	cmovleq	%rsp,%r8	# count <= 0: read dummy input from the stack
44	movl	-24(%rdi),%ecx	# lane 1 block count
45	movq	-40(%rdi),%r9	# lane 1 input
46	cmpl	%edx,%ecx
47	movq	-32(%rdi),%r13	# lane 1 output
48	cmovgl	%ecx,%edx
49	testl	%ecx,%ecx
50	movdqu	-16(%rdi),%xmm3	# lane 1 IV
51	movl	%ecx,36(%rsp)
52	cmovleq	%rsp,%r9
53	movl	16(%rdi),%ecx	# lane 2 block count
54	movq	0(%rdi),%r10	# lane 2 input
55	cmpl	%edx,%ecx
56	movq	8(%rdi),%r14	# lane 2 output
57	cmovgl	%ecx,%edx
58	testl	%ecx,%ecx
59	movdqu	24(%rdi),%xmm4	# lane 2 IV
60	movl	%ecx,40(%rsp)
61	cmovleq	%rsp,%r10
62	movl	56(%rdi),%ecx	# lane 3 block count
63	movq	40(%rdi),%r11	# lane 3 input
64	cmpl	%edx,%ecx
65	movq	48(%rdi),%r15	# lane 3 output
66	cmovgl	%ecx,%edx
67	testl	%ecx,%ecx
68	movdqu	64(%rdi),%xmm5	# lane 3 IV
69	movl	%ecx,44(%rsp)
70	cmovleq	%rsp,%r11
71	testl	%edx,%edx
72	jz	.Lenc4x_done	# no lane has work; %rax still holds entry %rsp
73
# Prime the pipeline: state = IV ^ round-0 key ^ first input block.
74	movups	16-120(%rsi),%xmm1	# round key 1
75	pxor	%xmm12,%xmm2
76	movups	32-120(%rsi),%xmm0	# round key 2
77	pxor	%xmm12,%xmm3
78	movl	240-120(%rsi),%eax	# round-count field
79	pxor	%xmm12,%xmm4
80	movdqu	(%r8),%xmm6
81	pxor	%xmm12,%xmm5
82	movdqu	(%r9),%xmm7
83	pxor	%xmm6,%xmm2
84	movdqu	(%r10),%xmm8
85	pxor	%xmm7,%xmm3
86	movdqu	(%r11),%xmm9
87	pxor	%xmm8,%xmm4
88	pxor	%xmm9,%xmm5
89	movdqa	32(%rsp),%xmm10	# lane counters as one vector
90	xorq	%rbx,%rbx	# byte offset = 0
91	jmp	.Loop_enc4x
92
# One pass encrypts one block per lane; it runs %edx (max count) times.
93.align	32
94.Loop_enc4x:
95	addq	$16,%rbx
96	leaq	16(%rsp),%rbp
97	movl	$1,%ecx
98	subq	%rbx,%rbp	# sink: (%rbp,%rbx) = 16(%rsp), -16(%rbp,%rbx) = (%rsp)
99
100.byte	102,15,56,220,209	# aesenc %xmm1,%xmm2
101	prefetcht0	31(%r8,%rbx,1)	# prefetch lane 0 input
102	prefetcht0	31(%r9,%rbx,1)	# prefetch lane 1 input
103.byte	102,15,56,220,217	# aesenc %xmm1,%xmm3
104	prefetcht0	31(%r10,%rbx,1)	# prefetch lane 2 input
105	prefetcht0	31(%r11,%rbx,1)	# prefetch lane 3 input
106.byte	102,15,56,220,225	# aesenc %xmm1,%xmm4
107.byte	102,15,56,220,233	# aesenc %xmm1,%xmm5
108	movups	48-120(%rsi),%xmm1
109	cmpl	32(%rsp),%ecx	# flags from 1 - lane 0 counter
110.byte	102,15,56,220,208	# aesenc %xmm0,%xmm2
111.byte	102,15,56,220,216	# aesenc %xmm0,%xmm3
112.byte	102,15,56,220,224	# aesenc %xmm0,%xmm4
113	cmovgeq	%rbp,%r8	# counter <= 1: no further input, read the sink
114	cmovgq	%rbp,%r12	# counter <= 0: lane finished, store to the sink
115.byte	102,15,56,220,232	# aesenc %xmm0,%xmm5
116	movups	-56(%rsi),%xmm0
117	cmpl	36(%rsp),%ecx	# lane 1 counter
118.byte	102,15,56,220,209	# aesenc %xmm1,%xmm2
119.byte	102,15,56,220,217	# aesenc %xmm1,%xmm3
120.byte	102,15,56,220,225	# aesenc %xmm1,%xmm4
121	cmovgeq	%rbp,%r9
122	cmovgq	%rbp,%r13
123.byte	102,15,56,220,233	# aesenc %xmm1,%xmm5
124	movups	-40(%rsi),%xmm1
125	cmpl	40(%rsp),%ecx	# lane 2 counter
126.byte	102,15,56,220,208	# aesenc %xmm0,%xmm2
127.byte	102,15,56,220,216	# aesenc %xmm0,%xmm3
128.byte	102,15,56,220,224	# aesenc %xmm0,%xmm4
129	cmovgeq	%rbp,%r10
130	cmovgq	%rbp,%r14
131.byte	102,15,56,220,232	# aesenc %xmm0,%xmm5
132	movups	-24(%rsi),%xmm0
133	cmpl	44(%rsp),%ecx	# lane 3 counter
134.byte	102,15,56,220,209	# aesenc %xmm1,%xmm2
135.byte	102,15,56,220,217	# aesenc %xmm1,%xmm3
136.byte	102,15,56,220,225	# aesenc %xmm1,%xmm4
137	cmovgeq	%rbp,%r11
138	cmovgq	%rbp,%r15
139.byte	102,15,56,220,233	# aesenc %xmm1,%xmm5
140	movups	-8(%rsi),%xmm1
141	movdqa	%xmm10,%xmm11	# copy counters for the mask
142.byte	102,15,56,220,208	# aesenc %xmm0,%xmm2
143	prefetcht0	15(%r12,%rbx,1)	# prefetch the four output locations
144	prefetcht0	15(%r13,%rbx,1)
145.byte	102,15,56,220,216	# aesenc %xmm0,%xmm3
146	prefetcht0	15(%r14,%rbx,1)
147	prefetcht0	15(%r15,%rbx,1)
148.byte	102,15,56,220,224	# aesenc %xmm0,%xmm4
149.byte	102,15,56,220,232	# aesenc %xmm0,%xmm5
150	movups	128-120(%rsi),%xmm0
151	pxor	%xmm12,%xmm12	# zero, used as the compare operand
152
153.byte	102,15,56,220,209	# aesenc %xmm1,%xmm2
154	pcmpgtd	%xmm12,%xmm11	# mask = -1 where counter > 0
155	movdqu	-120(%rsi),%xmm12	# reload round-0 key
156.byte	102,15,56,220,217	# aesenc %xmm1,%xmm3
157	paddd	%xmm11,%xmm10	# decrement only the positive counters
158	movdqa	%xmm10,32(%rsp)	# publish counters for the next pass
159.byte	102,15,56,220,225	# aesenc %xmm1,%xmm4
160.byte	102,15,56,220,233	# aesenc %xmm1,%xmm5
161	movups	144-120(%rsi),%xmm1
162
163	cmpl	$11,%eax	# flags survive the SSE/AES instructions below
164
165.byte	102,15,56,220,208	# aesenc %xmm0,%xmm2
166.byte	102,15,56,220,216	# aesenc %xmm0,%xmm3
167.byte	102,15,56,220,224	# aesenc %xmm0,%xmm4
168.byte	102,15,56,220,232	# aesenc %xmm0,%xmm5
169	movups	160-120(%rsi),%xmm0
170
171	jb	.Lenc4x_tail	# round-count field < 11: shortest schedule
172
173.byte	102,15,56,220,209	# aesenc %xmm1,%xmm2
174.byte	102,15,56,220,217	# aesenc %xmm1,%xmm3
175.byte	102,15,56,220,225	# aesenc %xmm1,%xmm4
176.byte	102,15,56,220,233	# aesenc %xmm1,%xmm5
177	movups	176-120(%rsi),%xmm1
178
179.byte	102,15,56,220,208	# aesenc %xmm0,%xmm2
180.byte	102,15,56,220,216	# aesenc %xmm0,%xmm3
181.byte	102,15,56,220,224	# aesenc %xmm0,%xmm4
182.byte	102,15,56,220,232	# aesenc %xmm0,%xmm5
183	movups	192-120(%rsi),%xmm0
184
185	je	.Lenc4x_tail	# round-count field == 11: middle schedule
186
187.byte	102,15,56,220,209	# aesenc %xmm1,%xmm2
188.byte	102,15,56,220,217	# aesenc %xmm1,%xmm3
189.byte	102,15,56,220,225	# aesenc %xmm1,%xmm4
190.byte	102,15,56,220,233	# aesenc %xmm1,%xmm5
191	movups	208-120(%rsi),%xmm1
192
193.byte	102,15,56,220,208	# aesenc %xmm0,%xmm2
194.byte	102,15,56,220,216	# aesenc %xmm0,%xmm3
195.byte	102,15,56,220,224	# aesenc %xmm0,%xmm4
196.byte	102,15,56,220,232	# aesenc %xmm0,%xmm5
197	movups	224-120(%rsi),%xmm0
198	jmp	.Lenc4x_tail
199
# Tail: last two rounds, store ciphertext, and fold the next input block
# (already xored with the round-0 key) into the state for CBC chaining.
200.align	32
201.Lenc4x_tail:
202.byte	102,15,56,220,209	# aesenc %xmm1,%xmm2
203.byte	102,15,56,220,217	# aesenc %xmm1,%xmm3
204.byte	102,15,56,220,225	# aesenc %xmm1,%xmm4
205.byte	102,15,56,220,233	# aesenc %xmm1,%xmm5
206	movdqu	(%r8,%rbx,1),%xmm6	# next lane 0 input (sink data once cancelled)
207	movdqu	16-120(%rsi),%xmm1	# round key 1 for the next pass
208
209.byte	102,15,56,221,208	# aesenclast %xmm0,%xmm2
210	movdqu	(%r9,%rbx,1),%xmm7
211	pxor	%xmm12,%xmm6
212.byte	102,15,56,221,216	# aesenclast %xmm0,%xmm3
213	movdqu	(%r10,%rbx,1),%xmm8
214	pxor	%xmm12,%xmm7
215.byte	102,15,56,221,224	# aesenclast %xmm0,%xmm4
216	movdqu	(%r11,%rbx,1),%xmm9
217	pxor	%xmm12,%xmm8
218.byte	102,15,56,221,232	# aesenclast %xmm0,%xmm5
219	movdqu	32-120(%rsi),%xmm0	# round key 2 for the next pass
220	pxor	%xmm12,%xmm9
221
222	movups	%xmm2,-16(%r12,%rbx,1)	# emit ciphertext block
223	pxor	%xmm6,%xmm2	# chain into the next block
224	movups	%xmm3,-16(%r13,%rbx,1)
225	pxor	%xmm7,%xmm3
226	movups	%xmm4,-16(%r14,%rbx,1)
227	pxor	%xmm8,%xmm4
228	movups	%xmm5,-16(%r15,%rbx,1)
229	pxor	%xmm9,%xmm5
230
231	decl	%edx
232	jnz	.Loop_enc4x
233
234	movq	16(%rsp),%rax	# %eax held the round count; restore entry %rsp
235	movl	24(%rsp),%edx	# remaining group count
236
237
238
239
240
241
242
243
244
245
246	leaq	160(%rdi),%rdi	# advance to the next four descriptors
247	decl	%edx
248	jnz	.Lenc4x_loop_grande
249
# Epilogue: reload callee-saved registers relative to the entry %rsp.
250.Lenc4x_done:
251	movq	-48(%rax),%r15
252	movq	-40(%rax),%r14
253	movq	-32(%rax),%r13
254	movq	-24(%rax),%r12
255	movq	-16(%rax),%rbp
256	movq	-8(%rax),%rbx
257	leaq	(%rax),%rsp
258.Lenc4x_epilogue:
259	.byte	0xf3,0xc3	# rep ret
260.size	aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt
261
# aesni_multi_cbc_decrypt -- four-lane interleaved AES-NI CBC decryption.
#
# NOTE(review): the decimal fused to the front of each original line is a
# listing artifact carried over from the source view; kept verbatim here.
#
# Inputs, as consumed by the code below (System V AMD64 argument registers):
#   %rdi  array of 40-byte lane descriptors, read four at a time
#         (160 bytes per group):
#           +0   input pointer
#           +8   output pointer
#           +16  signed 32-bit count of 16-byte blocks
#           +24  16-byte chaining value (IV)
#   %rsi  key schedule: 16-byte round keys from +0, round-count field
#         (32-bit) at +240; presumably a decryption schedule -- confirm
#         against the caller
#   %edx  number of four-descriptor groups; assumed >= 1, a zero value is
#         not checked here -- TODO confirm with callers
#
# Register roles inside the loops:
#   %r8-%r11    lane 0..3 input pointers
#   %r12-%r15   lane 0..3 output pointers
#   %xmm2-%xmm5 lane 0..3 cipher state
#   %xmm6-%xmm9 lane 0..3 chaining value (IV, then previous input block)
#   %xmm10      four 32-bit remaining-block counters (mirrors 32(%rsp))
#   %xmm11      per-lane "counter > 0" mask
#   %xmm12      round-0 key (briefly zeroed for the counter compare)
#   %xmm0/%xmm1 alternating round keys
#   %rbx        byte offset into every lane, +16 per block
#   %rbp        sink pointer substituted for finished lanes
#   %eax        round-count field, %edx blocks left in this group
#
# Stack frame (64-byte aligned):
#   0(%rsp)   16 bytes: dummy input for empty lanes, sink for stores
#   16(%rsp)  entry value of %rsp
#   24(%rsp)  remaining group count
#   32(%rsp)  four 32-bit lane counters
#
# A group whose four counts are all <= 0 ends the function at once; later
# groups are then not visited.
# Callee-saved %rbx, %rbp, %r12-%r15 are pushed on entry and reloaded on exit.
262.globl	aesni_multi_cbc_decrypt
263.type	aesni_multi_cbc_decrypt,@function
264.align	32
265aesni_multi_cbc_decrypt:
266	movq	%rsp,%rax	# keep entry %rsp for the epilogue
267	pushq	%rbx
268	pushq	%rbp
269	pushq	%r12
270	pushq	%r13
271	pushq	%r14
272	pushq	%r15
273
274
275
276
277
278
279	subq	$48,%rsp	# scratch frame
280	andq	$-64,%rsp	# align frame to 64 bytes
281	movq	%rax,16(%rsp)	# stash entry %rsp
282
283.Ldec4x_body:
284	movdqu	(%rsi),%xmm12	# round-0 key
285	leaq	120(%rsi),%rsi	# bias key pointer; later offsets are written as N-120
286	leaq	80(%rdi),%rdi	# bias descriptor pointer to the middle of the group
287
# Per group: load four descriptors, record each count at 32..44(%rsp),
# track the largest count in %edx, and point empty lanes at the stack.
288.Ldec4x_loop_grande:
289	movl	%edx,24(%rsp)	# save remaining group count
290	xorl	%edx,%edx	# running maximum of lane counts
291	movl	-64(%rdi),%ecx	# lane 0 block count
292	movq	-80(%rdi),%r8	# lane 0 input
293	cmpl	%edx,%ecx
294	movq	-72(%rdi),%r12	# lane 0 output
295	cmovgl	%ecx,%edx	# signed max
296	testl	%ecx,%ecx
297	movdqu	-56(%rdi),%xmm6	# lane 0 IV
298	movl	%ecx,32(%rsp)
299	cmovleq	%rsp,%r8	# count <= 0: read dummy input from the stack
300	movl	-24(%rdi),%ecx	# lane 1 block count
301	movq	-40(%rdi),%r9	# lane 1 input
302	cmpl	%edx,%ecx
303	movq	-32(%rdi),%r13	# lane 1 output
304	cmovgl	%ecx,%edx
305	testl	%ecx,%ecx
306	movdqu	-16(%rdi),%xmm7	# lane 1 IV
307	movl	%ecx,36(%rsp)
308	cmovleq	%rsp,%r9
309	movl	16(%rdi),%ecx	# lane 2 block count
310	movq	0(%rdi),%r10	# lane 2 input
311	cmpl	%edx,%ecx
312	movq	8(%rdi),%r14	# lane 2 output
313	cmovgl	%ecx,%edx
314	testl	%ecx,%ecx
315	movdqu	24(%rdi),%xmm8	# lane 2 IV
316	movl	%ecx,40(%rsp)
317	cmovleq	%rsp,%r10
318	movl	56(%rdi),%ecx	# lane 3 block count
319	movq	40(%rdi),%r11	# lane 3 input
320	cmpl	%edx,%ecx
321	movq	48(%rdi),%r15	# lane 3 output
322	cmovgl	%ecx,%edx
323	testl	%ecx,%ecx
324	movdqu	64(%rdi),%xmm9	# lane 3 IV
325	movl	%ecx,44(%rsp)
326	cmovleq	%rsp,%r11
327	testl	%edx,%edx
328	jz	.Ldec4x_done	# no lane has work; %rax still holds entry %rsp
329
# Prime the pipeline: state = first input block ^ round-0 key.
330	movups	16-120(%rsi),%xmm1	# round key 1
331	movups	32-120(%rsi),%xmm0	# round key 2
332	movl	240-120(%rsi),%eax	# round-count field
333	movdqu	(%r8),%xmm2
334	movdqu	(%r9),%xmm3
335	pxor	%xmm12,%xmm2
336	movdqu	(%r10),%xmm4
337	pxor	%xmm12,%xmm3
338	movdqu	(%r11),%xmm5
339	pxor	%xmm12,%xmm4
340	pxor	%xmm12,%xmm5
341	movdqa	32(%rsp),%xmm10	# lane counters as one vector
342	xorq	%rbx,%rbx	# byte offset = 0
343	jmp	.Loop_dec4x
344
# One pass decrypts one block per lane; it runs %edx (max count) times.
345.align	32
346.Loop_dec4x:
347	addq	$16,%rbx
348	leaq	16(%rsp),%rbp
349	movl	$1,%ecx
350	subq	%rbx,%rbp	# sink: (%rbp,%rbx) = 16(%rsp), -16(%rbp,%rbx) = (%rsp)
351
352.byte	102,15,56,222,209	# aesdec %xmm1,%xmm2
353	prefetcht0	31(%r8,%rbx,1)	# prefetch lane 0 input
354	prefetcht0	31(%r9,%rbx,1)	# prefetch lane 1 input
355.byte	102,15,56,222,217	# aesdec %xmm1,%xmm3
356	prefetcht0	31(%r10,%rbx,1)	# prefetch lane 2 input
357	prefetcht0	31(%r11,%rbx,1)	# prefetch lane 3 input
358.byte	102,15,56,222,225	# aesdec %xmm1,%xmm4
359.byte	102,15,56,222,233	# aesdec %xmm1,%xmm5
360	movups	48-120(%rsi),%xmm1
361	cmpl	32(%rsp),%ecx	# flags from 1 - lane 0 counter
362.byte	102,15,56,222,208	# aesdec %xmm0,%xmm2
363.byte	102,15,56,222,216	# aesdec %xmm0,%xmm3
364.byte	102,15,56,222,224	# aesdec %xmm0,%xmm4
365	cmovgeq	%rbp,%r8	# counter <= 1: no further input, read the sink
366	cmovgq	%rbp,%r12	# counter <= 0: lane finished, store to the sink
367.byte	102,15,56,222,232	# aesdec %xmm0,%xmm5
368	movups	-56(%rsi),%xmm0
369	cmpl	36(%rsp),%ecx	# lane 1 counter
370.byte	102,15,56,222,209	# aesdec %xmm1,%xmm2
371.byte	102,15,56,222,217	# aesdec %xmm1,%xmm3
372.byte	102,15,56,222,225	# aesdec %xmm1,%xmm4
373	cmovgeq	%rbp,%r9
374	cmovgq	%rbp,%r13
375.byte	102,15,56,222,233	# aesdec %xmm1,%xmm5
376	movups	-40(%rsi),%xmm1
377	cmpl	40(%rsp),%ecx	# lane 2 counter
378.byte	102,15,56,222,208	# aesdec %xmm0,%xmm2
379.byte	102,15,56,222,216	# aesdec %xmm0,%xmm3
380.byte	102,15,56,222,224	# aesdec %xmm0,%xmm4
381	cmovgeq	%rbp,%r10
382	cmovgq	%rbp,%r14
383.byte	102,15,56,222,232	# aesdec %xmm0,%xmm5
384	movups	-24(%rsi),%xmm0
385	cmpl	44(%rsp),%ecx	# lane 3 counter
386.byte	102,15,56,222,209	# aesdec %xmm1,%xmm2
387.byte	102,15,56,222,217	# aesdec %xmm1,%xmm3
388.byte	102,15,56,222,225	# aesdec %xmm1,%xmm4
389	cmovgeq	%rbp,%r11
390	cmovgq	%rbp,%r15
391.byte	102,15,56,222,233	# aesdec %xmm1,%xmm5
392	movups	-8(%rsi),%xmm1
393	movdqa	%xmm10,%xmm11	# copy counters for the mask
394.byte	102,15,56,222,208	# aesdec %xmm0,%xmm2
395	prefetcht0	15(%r12,%rbx,1)	# prefetch the four output locations
396	prefetcht0	15(%r13,%rbx,1)
397.byte	102,15,56,222,216	# aesdec %xmm0,%xmm3
398	prefetcht0	15(%r14,%rbx,1)
399	prefetcht0	15(%r15,%rbx,1)
400.byte	102,15,56,222,224	# aesdec %xmm0,%xmm4
401.byte	102,15,56,222,232	# aesdec %xmm0,%xmm5
402	movups	128-120(%rsi),%xmm0
403	pxor	%xmm12,%xmm12	# zero, used as the compare operand
404
405.byte	102,15,56,222,209	# aesdec %xmm1,%xmm2
406	pcmpgtd	%xmm12,%xmm11	# mask = -1 where counter > 0
407	movdqu	-120(%rsi),%xmm12	# reload round-0 key
408.byte	102,15,56,222,217	# aesdec %xmm1,%xmm3
409	paddd	%xmm11,%xmm10	# decrement only the positive counters
410	movdqa	%xmm10,32(%rsp)	# publish counters for the next pass
411.byte	102,15,56,222,225	# aesdec %xmm1,%xmm4
412.byte	102,15,56,222,233	# aesdec %xmm1,%xmm5
413	movups	144-120(%rsi),%xmm1
414
415	cmpl	$11,%eax	# flags survive the SSE/AES instructions below
416
417.byte	102,15,56,222,208	# aesdec %xmm0,%xmm2
418.byte	102,15,56,222,216	# aesdec %xmm0,%xmm3
419.byte	102,15,56,222,224	# aesdec %xmm0,%xmm4
420.byte	102,15,56,222,232	# aesdec %xmm0,%xmm5
421	movups	160-120(%rsi),%xmm0
422
423	jb	.Ldec4x_tail	# round-count field < 11: shortest schedule
424
425.byte	102,15,56,222,209	# aesdec %xmm1,%xmm2
426.byte	102,15,56,222,217	# aesdec %xmm1,%xmm3
427.byte	102,15,56,222,225	# aesdec %xmm1,%xmm4
428.byte	102,15,56,222,233	# aesdec %xmm1,%xmm5
429	movups	176-120(%rsi),%xmm1
430
431.byte	102,15,56,222,208	# aesdec %xmm0,%xmm2
432.byte	102,15,56,222,216	# aesdec %xmm0,%xmm3
433.byte	102,15,56,222,224	# aesdec %xmm0,%xmm4
434.byte	102,15,56,222,232	# aesdec %xmm0,%xmm5
435	movups	192-120(%rsi),%xmm0
436
437	je	.Ldec4x_tail	# round-count field == 11: middle schedule
438
439.byte	102,15,56,222,209	# aesdec %xmm1,%xmm2
440.byte	102,15,56,222,217	# aesdec %xmm1,%xmm3
441.byte	102,15,56,222,225	# aesdec %xmm1,%xmm4
442.byte	102,15,56,222,233	# aesdec %xmm1,%xmm5
443	movups	208-120(%rsi),%xmm1
444
445.byte	102,15,56,222,208	# aesdec %xmm0,%xmm2
446.byte	102,15,56,222,216	# aesdec %xmm0,%xmm3
447.byte	102,15,56,222,224	# aesdec %xmm0,%xmm4
448.byte	102,15,56,222,232	# aesdec %xmm0,%xmm5
449	movups	224-120(%rsi),%xmm0
450	jmp	.Ldec4x_tail
451
# Tail: the last round key is xored into each chaining value so a single
# aesdeclast applies the final round and the CBC xor together; then store
# the result and start the next block.
452.align	32
453.Ldec4x_tail:
454.byte	102,15,56,222,209	# aesdec %xmm1,%xmm2
455.byte	102,15,56,222,217	# aesdec %xmm1,%xmm3
456.byte	102,15,56,222,225	# aesdec %xmm1,%xmm4
457	pxor	%xmm0,%xmm6	# chaining value ^ last round key
458	pxor	%xmm0,%xmm7
459.byte	102,15,56,222,233	# aesdec %xmm1,%xmm5
460	movdqu	16-120(%rsi),%xmm1	# round key 1 for the next pass
461	pxor	%xmm0,%xmm8
462	pxor	%xmm0,%xmm9
463	movdqu	32-120(%rsi),%xmm0	# round key 2 for the next pass
464
465.byte	102,15,56,223,214	# aesdeclast %xmm6,%xmm2
466.byte	102,15,56,223,223	# aesdeclast %xmm7,%xmm3
467	movdqu	-16(%r8,%rbx,1),%xmm6	# block just consumed becomes the next chaining value;
468	movdqu	-16(%r9,%rbx,1),%xmm7	# it is read before the stores below
469.byte	102,65,15,56,223,224	# aesdeclast %xmm8,%xmm4
470.byte	102,65,15,56,223,233	# aesdeclast %xmm9,%xmm5
471	movdqu	-16(%r10,%rbx,1),%xmm8
472	movdqu	-16(%r11,%rbx,1),%xmm9
473
474	movups	%xmm2,-16(%r12,%rbx,1)	# emit output block
475	movdqu	(%r8,%rbx,1),%xmm2	# next input block (sink data once cancelled)
476	movups	%xmm3,-16(%r13,%rbx,1)
477	movdqu	(%r9,%rbx,1),%xmm3
478	pxor	%xmm12,%xmm2	# apply round-0 key
479	movups	%xmm4,-16(%r14,%rbx,1)
480	movdqu	(%r10,%rbx,1),%xmm4
481	pxor	%xmm12,%xmm3
482	movups	%xmm5,-16(%r15,%rbx,1)
483	movdqu	(%r11,%rbx,1),%xmm5
484	pxor	%xmm12,%xmm4
485	pxor	%xmm12,%xmm5
486
487	decl	%edx
488	jnz	.Loop_dec4x
489
490	movq	16(%rsp),%rax	# %eax held the round count; restore entry %rsp
491	movl	24(%rsp),%edx	# remaining group count
492
493	leaq	160(%rdi),%rdi	# advance to the next four descriptors
494	decl	%edx
495	jnz	.Ldec4x_loop_grande
496
# Epilogue: reload callee-saved registers relative to the entry %rsp.
497.Ldec4x_done:
498	movq	-48(%rax),%r15
499	movq	-40(%rax),%r14
500	movq	-32(%rax),%r13
501	movq	-24(%rax),%r12
502	movq	-16(%rax),%rbp
503	movq	-8(%rax),%rbx
504	leaq	(%rax),%rsp
505.Ldec4x_epilogue:
506	.byte	0xf3,0xc3	# rep ret
507.size	aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
508