/* poly1305-x86_64.S revision 1.3 */
#include <machine/asm.h>
.text



.globl	poly1305_init
.hidden	poly1305_init
.globl	poly1305_blocks
.hidden	poly1305_blocks
.globl	poly1305_emit
.hidden	poly1305_emit

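/*
 * poly1305_init(state, key, func_table): zeroes the 130-bit accumulator,
 * picks the blocks/emit implementations from OPENSSL_ia32cap_P (scalar,
 * AVX, AVX2, or the base 2^44 VPMADD52 path on AVX-512 IFMA parts) and
 * stores the clamped key r.  Returns 1 when a key was installed.
 */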
.type	poly1305_init,@function
.align	32
poly1305_init:
.cfi_startproc
	xorq	%rax,%rax
	movq	%rax,0(%rdi)
	movq	%rax,8(%rdi)
	movq	%rax,16(%rdi)

	cmpq	$0,%rsi
	je	.Lno_key

	leaq	poly1305_blocks(%rip),%r10
	leaq	poly1305_emit(%rip),%r11
	movq	OPENSSL_ia32cap_P+4(%rip),%r9
	leaq	poly1305_blocks_avx(%rip),%rax
	leaq	poly1305_emit_avx(%rip),%rcx
	btq	$28,%r9
	cmovcq	%rax,%r10
	cmovcq	%rcx,%r11
	leaq	poly1305_blocks_avx2(%rip),%rax
	btq	$37,%r9
	cmovcq	%rax,%r10
	movq	$2149646336,%rax
	shrq	$32,%r9
	andq	%rax,%r9
	cmpq	%rax,%r9
	je	.Linit_base2_44
	movq	$0x0ffffffc0fffffff,%rax
	movq	$0x0ffffffc0ffffffc,%rcx
	andq	0(%rsi),%rax
	andq	8(%rsi),%rcx
	movq	%rax,24(%rdi)
	movq	%rcx,32(%rdi)
	movq	%r10,0(%rdx)
	movq	%r11,8(%rdx)
	movl	$1,%eax
.Lno_key:
	.byte	0xf3,0xc3
.cfi_endproc
.size	poly1305_init,.-poly1305_init

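/*
 * poly1305_blocks(state, input, len, padbit): scalar path.  The hash is
 * kept in base 2^64 in %r14:%rbx:%rbp; each 16-byte block is added in
 * and the sum multiplied by r modulo 2^130-5, folding the excess above
 * bit 130 back in via *5 = *4 + *1.
 */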
.type	poly1305_blocks,@function
.align	32
poly1305_blocks:
.cfi_startproc
.Lblocks:
	shrq	$4,%rdx
	jz	.Lno_data

	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lblocks_body:

	movq	%rdx,%r15

	movq	24(%rdi),%r11
	movq	32(%rdi),%r13

	movq	0(%rdi),%r14
	movq	8(%rdi),%rbx
	movq	16(%rdi),%rbp

	movq	%r13,%r12
	shrq	$2,%r13
	movq	%r12,%rax
	addq	%r12,%r13
	jmp	.Loop

.align	32
.Loop:
	addq	0(%rsi),%r14
	adcq	8(%rsi),%rbx
	leaq	16(%rsi),%rsi
	adcq	%rcx,%rbp
	mulq	%r14
	movq	%rax,%r9
	movq	%r11,%rax
	movq	%rdx,%r10

	mulq	%r14
	movq	%rax,%r14
	movq	%r11,%rax
	movq	%rdx,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	%r13,%rax
	adcq	%rdx,%r10

	mulq	%rbx
	movq	%rbp,%rbx
	addq	%rax,%r14
	adcq	%rdx,%r8

	imulq	%r13,%rbx
	addq	%rbx,%r9
	movq	%r8,%rbx
	adcq	$0,%r10

	imulq	%r11,%rbp
	addq	%r9,%rbx
	movq	$-4,%rax
	adcq	%rbp,%r10

	andq	%r10,%rax
	movq	%r10,%rbp
	shrq	$2,%r10
	andq	$3,%rbp
	addq	%r10,%rax
	addq	%rax,%r14
	adcq	$0,%rbx
	adcq	$0,%rbp
	movq	%r12,%rax
	decq	%r15
	jnz	.Loop

	movq	%r14,0(%rdi)
	movq	%rbx,8(%rdi)
	movq	%rbp,16(%rdi)

	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbp
.cfi_restore	%rbp
	movq	40(%rsp),%rbx
.cfi_restore	%rbx
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lno_data:
.Lblocks_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	poly1305_blocks,.-poly1305_blocks

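/*
 * poly1305_emit(state, mac, nonce): adds 5 to the accumulator to test
 * for h >= 2^130-5, keeps the reduced value if the carry reaches bit
 * 130, then adds the nonce and stores the 16-byte tag.
 */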
.type	poly1305_emit,@function
.align	32
poly1305_emit:
.cfi_startproc
.Lemit:
	movq	0(%rdi),%r8
	movq	8(%rdi),%r9
	movq	16(%rdi),%r10

	movq	%r8,%rax
	addq	$5,%r8
	movq	%r9,%rcx
	adcq	$0,%r9
	adcq	$0,%r10
	shrq	$2,%r10
	cmovnzq	%r8,%rax
	cmovnzq	%r9,%rcx

	addq	0(%rdx),%rax
	adcq	8(%rdx),%rcx
	movq	%rax,0(%rsi)
	movq	%rcx,8(%rsi)

	.byte	0xf3,0xc3
.cfi_endproc
.size	poly1305_emit,.-poly1305_emit
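/*
 * __poly1305_block: one h *= r step (mod 2^130-5) on the base 2^64
 * state in %r14:%rbx:%rbp; expects r0 in %r11, r1 in %rax and
 * s1 = r1 + (r1 >> 2) in %r13.  Callers add the message block first.
 */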
.type	__poly1305_block,@function
.align	32
__poly1305_block:
.cfi_startproc
	mulq	%r14
	movq	%rax,%r9
	movq	%r11,%rax
	movq	%rdx,%r10

	mulq	%r14
	movq	%rax,%r14
	movq	%r11,%rax
	movq	%rdx,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	%r13,%rax
	adcq	%rdx,%r10

	mulq	%rbx
	movq	%rbp,%rbx
	addq	%rax,%r14
	adcq	%rdx,%r8

	imulq	%r13,%rbx
	addq	%rbx,%r9
	movq	%r8,%rbx
	adcq	$0,%r10

	imulq	%r11,%rbp
	addq	%r9,%rbx
	movq	$-4,%rax
	adcq	%rbp,%r10

	andq	%r10,%rax
	movq	%r10,%rbp
	shrq	$2,%r10
	andq	$3,%rbp
	addq	%r10,%rax
	addq	%rax,%r14
	adcq	$0,%rbx
	adcq	$0,%rbp
	.byte	0xf3,0xc3
.cfi_endproc
.size	__poly1305_block,.-__poly1305_block

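/*
 * __poly1305_init_avx: computes r^2, r^3 and r^4 via __poly1305_block,
 * splits each power into 26-bit limbs and stores them, along with the
 * *5 pre-multiples, in the table consumed by the SIMD paths below.
 */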
.type	__poly1305_init_avx,@function
.align	32
__poly1305_init_avx:
.cfi_startproc
	movq	%r11,%r14
	movq	%r12,%rbx
	xorq	%rbp,%rbp

	leaq	48+64(%rdi),%rdi

	movq	%r12,%rax
	call	__poly1305_block

	movl	$0x3ffffff,%eax
	movl	$0x3ffffff,%edx
	movq	%r14,%r8
	andl	%r14d,%eax
	movq	%r11,%r9
	andl	%r11d,%edx
	movl	%eax,-64(%rdi)
	shrq	$26,%r8
	movl	%edx,-60(%rdi)
	shrq	$26,%r9

	movl	$0x3ffffff,%eax
	movl	$0x3ffffff,%edx
	andl	%r8d,%eax
	andl	%r9d,%edx
	movl	%eax,-48(%rdi)
	leal	(%rax,%rax,4),%eax
	movl	%edx,-44(%rdi)
	leal	(%rdx,%rdx,4),%edx
	movl	%eax,-32(%rdi)
	shrq	$26,%r8
	movl	%edx,-28(%rdi)
	shrq	$26,%r9

	movq	%rbx,%rax
	movq	%r12,%rdx
	shlq	$12,%rax
	shlq	$12,%rdx
	orq	%r8,%rax
	orq	%r9,%rdx
	andl	$0x3ffffff,%eax
	andl	$0x3ffffff,%edx
	movl	%eax,-16(%rdi)
	leal	(%rax,%rax,4),%eax
	movl	%edx,-12(%rdi)
	leal	(%rdx,%rdx,4),%edx
	movl	%eax,0(%rdi)
	movq	%rbx,%r8
	movl	%edx,4(%rdi)
	movq	%r12,%r9

	movl	$0x3ffffff,%eax
	movl	$0x3ffffff,%edx
	shrq	$14,%r8
	shrq	$14,%r9
	andl	%r8d,%eax
	andl	%r9d,%edx
	movl	%eax,16(%rdi)
	leal	(%rax,%rax,4),%eax
	movl	%edx,20(%rdi)
	leal	(%rdx,%rdx,4),%edx
	movl	%eax,32(%rdi)
	shrq	$26,%r8
	movl	%edx,36(%rdi)
	shrq	$26,%r9

	movq	%rbp,%rax
	shlq	$24,%rax
	orq	%rax,%r8
	movl	%r8d,48(%rdi)
	leaq	(%r8,%r8,4),%r8
	movl	%r9d,52(%rdi)
	leaq	(%r9,%r9,4),%r9
	movl	%r8d,64(%rdi)
	movl	%r9d,68(%rdi)

	movq	%r12,%rax
	call	__poly1305_block

	movl	$0x3ffffff,%eax
	movq	%r14,%r8
	andl	%r14d,%eax
	shrq	$26,%r8
	movl	%eax,-52(%rdi)

	movl	$0x3ffffff,%edx
	andl	%r8d,%edx
	movl	%edx,-36(%rdi)
	leal	(%rdx,%rdx,4),%edx
	shrq	$26,%r8
	movl	%edx,-20(%rdi)

	movq	%rbx,%rax
	shlq	$12,%rax
	orq	%r8,%rax
	andl	$0x3ffffff,%eax
	movl	%eax,-4(%rdi)
	leal	(%rax,%rax,4),%eax
	movq	%rbx,%r8
	movl	%eax,12(%rdi)

	movl	$0x3ffffff,%edx
	shrq	$14,%r8
	andl	%r8d,%edx
	movl	%edx,28(%rdi)
	leal	(%rdx,%rdx,4),%edx
	shrq	$26,%r8
	movl	%edx,44(%rdi)

	movq	%rbp,%rax
	shlq	$24,%rax
	orq	%rax,%r8
	movl	%r8d,60(%rdi)
	leaq	(%r8,%r8,4),%r8
	movl	%r8d,76(%rdi)

	movq	%r12,%rax
	call	__poly1305_block

	movl	$0x3ffffff,%eax
	movq	%r14,%r8
	andl	%r14d,%eax
	shrq	$26,%r8
	movl	%eax,-56(%rdi)

	movl	$0x3ffffff,%edx
	andl	%r8d,%edx
	movl	%edx,-40(%rdi)
	leal	(%rdx,%rdx,4),%edx
	shrq	$26,%r8
	movl	%edx,-24(%rdi)

	movq	%rbx,%rax
	shlq	$12,%rax
	orq	%r8,%rax
	andl	$0x3ffffff,%eax
	movl	%eax,-8(%rdi)
	leal	(%rax,%rax,4),%eax
	movq	%rbx,%r8
	movl	%eax,8(%rdi)

	movl	$0x3ffffff,%edx
	shrq	$14,%r8
	andl	%r8d,%edx
	movl	%edx,24(%rdi)
	leal	(%rdx,%rdx,4),%edx
	shrq	$26,%r8
	movl	%edx,40(%rdi)

	movq	%rbp,%rax
	shlq	$24,%rax
	orq	%rax,%r8
	movl	%r8d,56(%rdi)
	leaq	(%r8,%r8,4),%r8
	movl	%r8d,72(%rdi)

	leaq	-48-64(%rdi),%rdi
	.byte	0xf3,0xc3
.cfi_endproc
.size	__poly1305_init_avx,.-__poly1305_init_avx

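/*
 * poly1305_blocks_avx: 2-way SIMD path in base 2^26.  Short inputs fall
 * back to the scalar .Lblocks; otherwise the base 2^64 state is split
 * into 26-bit limbs and two blocks are processed per iteration with
 * lazy carry propagation.
 */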
.type	poly1305_blocks_avx,@function
.align	32
poly1305_blocks_avx:
.cfi_startproc
	movl	20(%rdi),%r8d
	cmpq	$128,%rdx
	jae	.Lblocks_avx
	testl	%r8d,%r8d
	jz	.Lblocks

.Lblocks_avx:
	andq	$-16,%rdx
	jz	.Lno_data_avx

	vzeroupper

	testl	%r8d,%r8d
	jz	.Lbase2_64_avx

	testq	$31,%rdx
	jz	.Leven_avx

	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lblocks_avx_body:

	movq	%rdx,%r15

	movq	0(%rdi),%r8
	movq	8(%rdi),%r9
	movl	16(%rdi),%ebp

	movq	24(%rdi),%r11
	movq	32(%rdi),%r13


	movl	%r8d,%r14d
	andq	$-2147483648,%r8
	movq	%r9,%r12
	movl	%r9d,%ebx
	andq	$-2147483648,%r9

	shrq	$6,%r8
	shlq	$52,%r12
	addq	%r8,%r14
	shrq	$12,%rbx
	shrq	$18,%r9
	addq	%r12,%r14
	adcq	%r9,%rbx

	movq	%rbp,%r8
	shlq	$40,%r8
	shrq	$24,%rbp
	addq	%r8,%rbx
	adcq	$0,%rbp

	movq	$-4,%r9
	movq	%rbp,%r8
	andq	%rbp,%r9
	shrq	$2,%r8
	andq	$3,%rbp
	addq	%r9,%r8
	addq	%r8,%r14
	adcq	$0,%rbx
	adcq	$0,%rbp

	movq	%r13,%r12
	movq	%r13,%rax
	shrq	$2,%r13
	addq	%r12,%r13

	addq	0(%rsi),%r14
	adcq	8(%rsi),%rbx
	leaq	16(%rsi),%rsi
	adcq	%rcx,%rbp

	call	__poly1305_block

	testq	%rcx,%rcx
	jz	.Lstore_base2_64_avx


	movq	%r14,%rax
	movq	%r14,%rdx
	shrq	$52,%r14
	movq	%rbx,%r11
	movq	%rbx,%r12
	shrq	$26,%rdx
	andq	$0x3ffffff,%rax
	shlq	$12,%r11
	andq	$0x3ffffff,%rdx
	shrq	$14,%rbx
	orq	%r11,%r14
	shlq	$24,%rbp
	andq	$0x3ffffff,%r14
	shrq	$40,%r12
	andq	$0x3ffffff,%rbx
	orq	%r12,%rbp

	subq	$16,%r15
	jz	.Lstore_base2_26_avx

	vmovd	%eax,%xmm0
	vmovd	%edx,%xmm1
	vmovd	%r14d,%xmm2
	vmovd	%ebx,%xmm3
	vmovd	%ebp,%xmm4
	jmp	.Lproceed_avx

.align	32
.Lstore_base2_64_avx:
	movq	%r14,0(%rdi)
	movq	%rbx,8(%rdi)
	movq	%rbp,16(%rdi)
	jmp	.Ldone_avx

.align	16
.Lstore_base2_26_avx:
	movl	%eax,0(%rdi)
	movl	%edx,4(%rdi)
	movl	%r14d,8(%rdi)
	movl	%ebx,12(%rdi)
	movl	%ebp,16(%rdi)
.align	16
.Ldone_avx:
	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbp
.cfi_restore	%rbp
	movq	40(%rsp),%rbx
.cfi_restore	%rbx
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lno_data_avx:
.Lblocks_avx_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc

.align	32
.Lbase2_64_avx:
.cfi_startproc
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lbase2_64_avx_body:

	movq	%rdx,%r15

	movq	24(%rdi),%r11
	movq	32(%rdi),%r13

	movq	0(%rdi),%r14
	movq	8(%rdi),%rbx
	movl	16(%rdi),%ebp

	movq	%r13,%r12
	movq	%r13,%rax
	shrq	$2,%r13
	addq	%r12,%r13

	testq	$31,%rdx
	jz	.Linit_avx

	addq	0(%rsi),%r14
	adcq	8(%rsi),%rbx
	leaq	16(%rsi),%rsi
	adcq	%rcx,%rbp
	subq	$16,%r15

	call	__poly1305_block

.Linit_avx:

	movq	%r14,%rax
	movq	%r14,%rdx
	shrq	$52,%r14
	movq	%rbx,%r8
	movq	%rbx,%r9
	shrq	$26,%rdx
	andq	$0x3ffffff,%rax
	shlq	$12,%r8
	andq	$0x3ffffff,%rdx
	shrq	$14,%rbx
	orq	%r8,%r14
	shlq	$24,%rbp
	andq	$0x3ffffff,%r14
	shrq	$40,%r9
	andq	$0x3ffffff,%rbx
	orq	%r9,%rbp

	vmovd	%eax,%xmm0
	vmovd	%edx,%xmm1
	vmovd	%r14d,%xmm2
	vmovd	%ebx,%xmm3
	vmovd	%ebp,%xmm4
	movl	$1,20(%rdi)

	call	__poly1305_init_avx

.Lproceed_avx:
	movq	%r15,%rdx

	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbp
.cfi_restore	%rbp
	movq	40(%rsp),%rbx
.cfi_restore	%rbx
	leaq	48(%rsp),%rax
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lbase2_64_avx_epilogue:
	jmp	.Ldo_avx
.cfi_endproc

.align	32
.Leven_avx:
.cfi_startproc
	vmovd	0(%rdi),%xmm0
	vmovd	4(%rdi),%xmm1
	vmovd	8(%rdi),%xmm2
	vmovd	12(%rdi),%xmm3
	vmovd	16(%rdi),%xmm4

.Ldo_avx:
	leaq	-88(%rsp),%r11
.cfi_def_cfa	%r11,0x60
	subq	$0x178,%rsp
	subq	$64,%rdx
	leaq	-32(%rsi),%rax
	cmovcq	%rax,%rsi

	vmovdqu	48(%rdi),%xmm14
	leaq	112(%rdi),%rdi
	leaq	.Lconst(%rip),%rcx



	vmovdqu	32(%rsi),%xmm5
	vmovdqu	48(%rsi),%xmm6
	vmovdqa	64(%rcx),%xmm15

	vpsrldq	$6,%xmm5,%xmm7
	vpsrldq	$6,%xmm6,%xmm8
	vpunpckhqdq	%xmm6,%xmm5,%xmm9
	vpunpcklqdq	%xmm6,%xmm5,%xmm5
	vpunpcklqdq	%xmm8,%xmm7,%xmm8

	vpsrlq	$40,%xmm9,%xmm9
	vpsrlq	$26,%xmm5,%xmm6
	vpand	%xmm15,%xmm5,%xmm5
	vpsrlq	$4,%xmm8,%xmm7
	vpand	%xmm15,%xmm6,%xmm6
	vpsrlq	$30,%xmm8,%xmm8
	vpand	%xmm15,%xmm7,%xmm7
	vpand	%xmm15,%xmm8,%xmm8
	vpor	32(%rcx),%xmm9,%xmm9

	jbe	.Lskip_loop_avx


	vmovdqu	-48(%rdi),%xmm11
	vmovdqu	-32(%rdi),%xmm12
	vpshufd	$0xEE,%xmm14,%xmm13
	vpshufd	$0x44,%xmm14,%xmm10
	vmovdqa	%xmm13,-144(%r11)
	vmovdqa	%xmm10,0(%rsp)
	vpshufd	$0xEE,%xmm11,%xmm14
	vmovdqu	-16(%rdi),%xmm10
	vpshufd	$0x44,%xmm11,%xmm11
	vmovdqa	%xmm14,-128(%r11)
	vmovdqa	%xmm11,16(%rsp)
	vpshufd	$0xEE,%xmm12,%xmm13
	vmovdqu	0(%rdi),%xmm11
	vpshufd	$0x44,%xmm12,%xmm12
	vmovdqa	%xmm13,-112(%r11)
	vmovdqa	%xmm12,32(%rsp)
	vpshufd	$0xEE,%xmm10,%xmm14
	vmovdqu	16(%rdi),%xmm12
	vpshufd	$0x44,%xmm10,%xmm10
	vmovdqa	%xmm14,-96(%r11)
	vmovdqa	%xmm10,48(%rsp)
	vpshufd	$0xEE,%xmm11,%xmm13
	vmovdqu	32(%rdi),%xmm10
	vpshufd	$0x44,%xmm11,%xmm11
	vmovdqa	%xmm13,-80(%r11)
	vmovdqa	%xmm11,64(%rsp)
	vpshufd	$0xEE,%xmm12,%xmm14
	vmovdqu	48(%rdi),%xmm11
	vpshufd	$0x44,%xmm12,%xmm12
	vmovdqa	%xmm14,-64(%r11)
	vmovdqa	%xmm12,80(%rsp)
	vpshufd	$0xEE,%xmm10,%xmm13
	vmovdqu	64(%rdi),%xmm12
	vpshufd	$0x44,%xmm10,%xmm10
	vmovdqa	%xmm13,-48(%r11)
	vmovdqa	%xmm10,96(%rsp)
	vpshufd	$0xEE,%xmm11,%xmm14
	vpshufd	$0x44,%xmm11,%xmm11
	vmovdqa	%xmm14,-32(%r11)
	vmovdqa	%xmm11,112(%rsp)
	vpshufd	$0xEE,%xmm12,%xmm13
	vmovdqa	0(%rsp),%xmm14
	vpshufd	$0x44,%xmm12,%xmm12
	vmovdqa	%xmm13,-16(%r11)
	vmovdqa	%xmm12,128(%rsp)

	jmp	.Loop_avx

.align	32
.Loop_avx:




















	vpmuludq	%xmm5,%xmm14,%xmm10
	vpmuludq	%xmm6,%xmm14,%xmm11
	vmovdqa	%xmm2,32(%r11)
	vpmuludq	%xmm7,%xmm14,%xmm12
	vmovdqa	16(%rsp),%xmm2
	vpmuludq	%xmm8,%xmm14,%xmm13
	vpmuludq	%xmm9,%xmm14,%xmm14

	vmovdqa	%xmm0,0(%r11)
	vpmuludq	32(%rsp),%xmm9,%xmm0
	vmovdqa	%xmm1,16(%r11)
	vpmuludq	%xmm8,%xmm2,%xmm1
	vpaddq	%xmm0,%xmm10,%xmm10
	vpaddq	%xmm1,%xmm14,%xmm14
	vmovdqa	%xmm3,48(%r11)
	vpmuludq	%xmm7,%xmm2,%xmm0
	vpmuludq	%xmm6,%xmm2,%xmm1
	vpaddq	%xmm0,%xmm13,%xmm13
	vmovdqa	48(%rsp),%xmm3
	vpaddq	%xmm1,%xmm12,%xmm12
	vmovdqa	%xmm4,64(%r11)
	vpmuludq	%xmm5,%xmm2,%xmm2
	vpmuludq	%xmm7,%xmm3,%xmm0
	vpaddq	%xmm2,%xmm11,%xmm11

	vmovdqa	64(%rsp),%xmm4
	vpaddq	%xmm0,%xmm14,%xmm14
	vpmuludq	%xmm6,%xmm3,%xmm1
	vpmuludq	%xmm5,%xmm3,%xmm3
	vpaddq	%xmm1,%xmm13,%xmm13
	vmovdqa	80(%rsp),%xmm2
	vpaddq	%xmm3,%xmm12,%xmm12
	vpmuludq	%xmm9,%xmm4,%xmm0
	vpmuludq	%xmm8,%xmm4,%xmm4
	vpaddq	%xmm0,%xmm11,%xmm11
	vmovdqa	96(%rsp),%xmm3
	vpaddq	%xmm4,%xmm10,%xmm10

	vmovdqa	128(%rsp),%xmm4
	vpmuludq	%xmm6,%xmm2,%xmm1
	vpmuludq	%xmm5,%xmm2,%xmm2
	vpaddq	%xmm1,%xmm14,%xmm14
	vpaddq	%xmm2,%xmm13,%xmm13
	vpmuludq	%xmm9,%xmm3,%xmm0
	vpmuludq	%xmm8,%xmm3,%xmm1
	vpaddq	%xmm0,%xmm12,%xmm12
	vmovdqu	0(%rsi),%xmm0
	vpaddq	%xmm1,%xmm11,%xmm11
	vpmuludq	%xmm7,%xmm3,%xmm3
	vpmuludq	%xmm7,%xmm4,%xmm7
	vpaddq	%xmm3,%xmm10,%xmm10

	vmovdqu	16(%rsi),%xmm1
	vpaddq	%xmm7,%xmm11,%xmm11
	vpmuludq	%xmm8,%xmm4,%xmm8
	vpmuludq	%xmm9,%xmm4,%xmm9
	vpsrldq	$6,%xmm0,%xmm2
	vpaddq	%xmm8,%xmm12,%xmm12
	vpaddq	%xmm9,%xmm13,%xmm13
	vpsrldq	$6,%xmm1,%xmm3
	vpmuludq	112(%rsp),%xmm5,%xmm9
	vpmuludq	%xmm6,%xmm4,%xmm5
	vpunpckhqdq	%xmm1,%xmm0,%xmm4
	vpaddq	%xmm9,%xmm14,%xmm14
	vmovdqa	-144(%r11),%xmm9
	vpaddq	%xmm5,%xmm10,%xmm10

	vpunpcklqdq	%xmm1,%xmm0,%xmm0
	vpunpcklqdq	%xmm3,%xmm2,%xmm3


	vpsrldq	$5,%xmm4,%xmm4
	vpsrlq	$26,%xmm0,%xmm1
	vpand	%xmm15,%xmm0,%xmm0
	vpsrlq	$4,%xmm3,%xmm2
	vpand	%xmm15,%xmm1,%xmm1
	vpand	0(%rcx),%xmm4,%xmm4
	vpsrlq	$30,%xmm3,%xmm3
	vpand	%xmm15,%xmm2,%xmm2
	vpand	%xmm15,%xmm3,%xmm3
	vpor	32(%rcx),%xmm4,%xmm4

	vpaddq	0(%r11),%xmm0,%xmm0
	vpaddq	16(%r11),%xmm1,%xmm1
	vpaddq	32(%r11),%xmm2,%xmm2
	vpaddq	48(%r11),%xmm3,%xmm3
	vpaddq	64(%r11),%xmm4,%xmm4

	leaq	32(%rsi),%rax
	leaq	64(%rsi),%rsi
	subq	$64,%rdx
	cmovcq	%rax,%rsi










	vpmuludq	%xmm0,%xmm9,%xmm5
	vpmuludq	%xmm1,%xmm9,%xmm6
	vpaddq	%xmm5,%xmm10,%xmm10
	vpaddq	%xmm6,%xmm11,%xmm11
	vmovdqa	-128(%r11),%xmm7
	vpmuludq	%xmm2,%xmm9,%xmm5
	vpmuludq	%xmm3,%xmm9,%xmm6
	vpaddq	%xmm5,%xmm12,%xmm12
	vpaddq	%xmm6,%xmm13,%xmm13
	vpmuludq	%xmm4,%xmm9,%xmm9
	vpmuludq	-112(%r11),%xmm4,%xmm5
	vpaddq	%xmm9,%xmm14,%xmm14

	vpaddq	%xmm5,%xmm10,%xmm10
	vpmuludq	%xmm2,%xmm7,%xmm6
	vpmuludq	%xmm3,%xmm7,%xmm5
	vpaddq	%xmm6,%xmm13,%xmm13
	vmovdqa	-96(%r11),%xmm8
	vpaddq	%xmm5,%xmm14,%xmm14
	vpmuludq	%xmm1,%xmm7,%xmm6
	vpmuludq	%xmm0,%xmm7,%xmm7
	vpaddq	%xmm6,%xmm12,%xmm12
	vpaddq	%xmm7,%xmm11,%xmm11

	vmovdqa	-80(%r11),%xmm9
	vpmuludq	%xmm2,%xmm8,%xmm5
	vpmuludq	%xmm1,%xmm8,%xmm6
	vpaddq	%xmm5,%xmm14,%xmm14
	vpaddq	%xmm6,%xmm13,%xmm13
	vmovdqa	-64(%r11),%xmm7
	vpmuludq	%xmm0,%xmm8,%xmm8
	vpmuludq	%xmm4,%xmm9,%xmm5
	vpaddq	%xmm8,%xmm12,%xmm12
	vpaddq	%xmm5,%xmm11,%xmm11
	vmovdqa	-48(%r11),%xmm8
	vpmuludq	%xmm3,%xmm9,%xmm9
	vpmuludq	%xmm1,%xmm7,%xmm6
	vpaddq	%xmm9,%xmm10,%xmm10

	vmovdqa	-16(%r11),%xmm9
	vpaddq	%xmm6,%xmm14,%xmm14
	vpmuludq	%xmm0,%xmm7,%xmm7
	vpmuludq	%xmm4,%xmm8,%xmm5
	vpaddq	%xmm7,%xmm13,%xmm13
	vpaddq	%xmm5,%xmm12,%xmm12
	vmovdqu	32(%rsi),%xmm5
	vpmuludq	%xmm3,%xmm8,%xmm7
	vpmuludq	%xmm2,%xmm8,%xmm8
	vpaddq	%xmm7,%xmm11,%xmm11
	vmovdqu	48(%rsi),%xmm6
	vpaddq	%xmm8,%xmm10,%xmm10

	vpmuludq	%xmm2,%xmm9,%xmm2
	vpmuludq	%xmm3,%xmm9,%xmm3
	vpsrldq	$6,%xmm5,%xmm7
	vpaddq	%xmm2,%xmm11,%xmm11
	vpmuludq	%xmm4,%xmm9,%xmm4
	vpsrldq	$6,%xmm6,%xmm8
	vpaddq	%xmm3,%xmm12,%xmm2
	vpaddq	%xmm4,%xmm13,%xmm3
	vpmuludq	-32(%r11),%xmm0,%xmm4
	vpmuludq	%xmm1,%xmm9,%xmm0
	vpunpckhqdq	%xmm6,%xmm5,%xmm9
	vpaddq	%xmm4,%xmm14,%xmm4
	vpaddq	%xmm0,%xmm10,%xmm0

	vpunpcklqdq	%xmm6,%xmm5,%xmm5
	vpunpcklqdq	%xmm8,%xmm7,%xmm8


	vpsrldq	$5,%xmm9,%xmm9
	vpsrlq	$26,%xmm5,%xmm6
	vmovdqa	0(%rsp),%xmm14
	vpand	%xmm15,%xmm5,%xmm5
	vpsrlq	$4,%xmm8,%xmm7
	vpand	%xmm15,%xmm6,%xmm6
	vpand	0(%rcx),%xmm9,%xmm9
	vpsrlq	$30,%xmm8,%xmm8
	vpand	%xmm15,%xmm7,%xmm7
	vpand	%xmm15,%xmm8,%xmm8
	vpor	32(%rcx),%xmm9,%xmm9





	vpsrlq	$26,%xmm3,%xmm13
	vpand	%xmm15,%xmm3,%xmm3
	vpaddq	%xmm13,%xmm4,%xmm4

	vpsrlq	$26,%xmm0,%xmm10
	vpand	%xmm15,%xmm0,%xmm0
	vpaddq	%xmm10,%xmm11,%xmm1

	vpsrlq	$26,%xmm4,%xmm10
	vpand	%xmm15,%xmm4,%xmm4

	vpsrlq	$26,%xmm1,%xmm11
	vpand	%xmm15,%xmm1,%xmm1
	vpaddq	%xmm11,%xmm2,%xmm2

	vpaddq	%xmm10,%xmm0,%xmm0
	vpsllq	$2,%xmm10,%xmm10
	vpaddq	%xmm10,%xmm0,%xmm0

	vpsrlq	$26,%xmm2,%xmm12
	vpand	%xmm15,%xmm2,%xmm2
	vpaddq	%xmm12,%xmm3,%xmm3

	vpsrlq	$26,%xmm0,%xmm10
	vpand	%xmm15,%xmm0,%xmm0
	vpaddq	%xmm10,%xmm1,%xmm1

	vpsrlq	$26,%xmm3,%xmm13
	vpand	%xmm15,%xmm3,%xmm3
	vpaddq	%xmm13,%xmm4,%xmm4

	ja	.Loop_avx

.Lskip_loop_avx:



	vpshufd	$0x10,%xmm14,%xmm14
	addq	$32,%rdx
	jnz	.Long_tail_avx

	vpaddq	%xmm2,%xmm7,%xmm7
	vpaddq	%xmm0,%xmm5,%xmm5
	vpaddq	%xmm1,%xmm6,%xmm6
	vpaddq	%xmm3,%xmm8,%xmm8
	vpaddq	%xmm4,%xmm9,%xmm9

.Long_tail_avx:
	vmovdqa	%xmm2,32(%r11)
	vmovdqa	%xmm0,0(%r11)
	vmovdqa	%xmm1,16(%r11)
	vmovdqa	%xmm3,48(%r11)
	vmovdqa	%xmm4,64(%r11)







	vpmuludq	%xmm7,%xmm14,%xmm12
	vpmuludq	%xmm5,%xmm14,%xmm10
	vpshufd	$0x10,-48(%rdi),%xmm2
	vpmuludq	%xmm6,%xmm14,%xmm11
	vpmuludq	%xmm8,%xmm14,%xmm13
	vpmuludq	%xmm9,%xmm14,%xmm14

	vpmuludq	%xmm8,%xmm2,%xmm0
	vpaddq	%xmm0,%xmm14,%xmm14
	vpshufd	$0x10,-32(%rdi),%xmm3
	vpmuludq	%xmm7,%xmm2,%xmm1
	vpaddq	%xmm1,%xmm13,%xmm13
	vpshufd	$0x10,-16(%rdi),%xmm4
	vpmuludq	%xmm6,%xmm2,%xmm0
	vpaddq	%xmm0,%xmm12,%xmm12
	vpmuludq	%xmm5,%xmm2,%xmm2
	vpaddq	%xmm2,%xmm11,%xmm11
	vpmuludq	%xmm9,%xmm3,%xmm3
	vpaddq	%xmm3,%xmm10,%xmm10

	vpshufd	$0x10,0(%rdi),%xmm2
	vpmuludq	%xmm7,%xmm4,%xmm1
	vpaddq	%xmm1,%xmm14,%xmm14
	vpmuludq	%xmm6,%xmm4,%xmm0
	vpaddq	%xmm0,%xmm13,%xmm13
	vpshufd	$0x10,16(%rdi),%xmm3
	vpmuludq	%xmm5,%xmm4,%xmm4
	vpaddq	%xmm4,%xmm12,%xmm12
	vpmuludq	%xmm9,%xmm2,%xmm1
	vpaddq	%xmm1,%xmm11,%xmm11
	vpshufd	$0x10,32(%rdi),%xmm4
	vpmuludq	%xmm8,%xmm2,%xmm2
	vpaddq	%xmm2,%xmm10,%xmm10

	vpmuludq	%xmm6,%xmm3,%xmm0
	vpaddq	%xmm0,%xmm14,%xmm14
	vpmuludq	%xmm5,%xmm3,%xmm3
	vpaddq	%xmm3,%xmm13,%xmm13
	vpshufd	$0x10,48(%rdi),%xmm2
	vpmuludq	%xmm9,%xmm4,%xmm1
	vpaddq	%xmm1,%xmm12,%xmm12
	vpshufd	$0x10,64(%rdi),%xmm3
	vpmuludq	%xmm8,%xmm4,%xmm0
	vpaddq	%xmm0,%xmm11,%xmm11
	vpmuludq	%xmm7,%xmm4,%xmm4
	vpaddq	%xmm4,%xmm10,%xmm10

	vpmuludq	%xmm5,%xmm2,%xmm2
	vpaddq	%xmm2,%xmm14,%xmm14
	vpmuludq	%xmm9,%xmm3,%xmm1
	vpaddq	%xmm1,%xmm13,%xmm13
	vpmuludq	%xmm8,%xmm3,%xmm0
	vpaddq	%xmm0,%xmm12,%xmm12
	vpmuludq	%xmm7,%xmm3,%xmm1
	vpaddq	%xmm1,%xmm11,%xmm11
	vpmuludq	%xmm6,%xmm3,%xmm3
	vpaddq	%xmm3,%xmm10,%xmm10

	jz	.Lshort_tail_avx

	vmovdqu	0(%rsi),%xmm0
	vmovdqu	16(%rsi),%xmm1

	vpsrldq	$6,%xmm0,%xmm2
	vpsrldq	$6,%xmm1,%xmm3
	vpunpckhqdq	%xmm1,%xmm0,%xmm4
	vpunpcklqdq	%xmm1,%xmm0,%xmm0
	vpunpcklqdq	%xmm3,%xmm2,%xmm3

	vpsrlq	$40,%xmm4,%xmm4
	vpsrlq	$26,%xmm0,%xmm1
	vpand	%xmm15,%xmm0,%xmm0
	vpsrlq	$4,%xmm3,%xmm2
	vpand	%xmm15,%xmm1,%xmm1
	vpsrlq	$30,%xmm3,%xmm3
	vpand	%xmm15,%xmm2,%xmm2
	vpand	%xmm15,%xmm3,%xmm3
	vpor	32(%rcx),%xmm4,%xmm4

	vpshufd	$0x32,-64(%rdi),%xmm9
	vpaddq	0(%r11),%xmm0,%xmm0
	vpaddq	16(%r11),%xmm1,%xmm1
	vpaddq	32(%r11),%xmm2,%xmm2
	vpaddq	48(%r11),%xmm3,%xmm3
	vpaddq	64(%r11),%xmm4,%xmm4




	vpmuludq	%xmm0,%xmm9,%xmm5
	vpaddq	%xmm5,%xmm10,%xmm10
	vpmuludq	%xmm1,%xmm9,%xmm6
	vpaddq	%xmm6,%xmm11,%xmm11
	vpmuludq	%xmm2,%xmm9,%xmm5
	vpaddq	%xmm5,%xmm12,%xmm12
	vpshufd	$0x32,-48(%rdi),%xmm7
	vpmuludq	%xmm3,%xmm9,%xmm6
	vpaddq	%xmm6,%xmm13,%xmm13
	vpmuludq	%xmm4,%xmm9,%xmm9
	vpaddq	%xmm9,%xmm14,%xmm14

	vpmuludq	%xmm3,%xmm7,%xmm5
	vpaddq	%xmm5,%xmm14,%xmm14
	vpshufd	$0x32,-32(%rdi),%xmm8
	vpmuludq	%xmm2,%xmm7,%xmm6
	vpaddq	%xmm6,%xmm13,%xmm13
	vpshufd	$0x32,-16(%rdi),%xmm9
	vpmuludq	%xmm1,%xmm7,%xmm5
	vpaddq	%xmm5,%xmm12,%xmm12
	vpmuludq	%xmm0,%xmm7,%xmm7
	vpaddq	%xmm7,%xmm11,%xmm11
	vpmuludq	%xmm4,%xmm8,%xmm8
	vpaddq	%xmm8,%xmm10,%xmm10

	vpshufd	$0x32,0(%rdi),%xmm7
	vpmuludq	%xmm2,%xmm9,%xmm6
	vpaddq	%xmm6,%xmm14,%xmm14
	vpmuludq	%xmm1,%xmm9,%xmm5
	vpaddq	%xmm5,%xmm13,%xmm13
	vpshufd	$0x32,16(%rdi),%xmm8
	vpmuludq	%xmm0,%xmm9,%xmm9
	vpaddq	%xmm9,%xmm12,%xmm12
	vpmuludq	%xmm4,%xmm7,%xmm6
	vpaddq	%xmm6,%xmm11,%xmm11
	vpshufd	$0x32,32(%rdi),%xmm9
	vpmuludq	%xmm3,%xmm7,%xmm7
	vpaddq	%xmm7,%xmm10,%xmm10

	vpmuludq	%xmm1,%xmm8,%xmm5
	vpaddq	%xmm5,%xmm14,%xmm14
	vpmuludq	%xmm0,%xmm8,%xmm8
	vpaddq	%xmm8,%xmm13,%xmm13
	vpshufd	$0x32,48(%rdi),%xmm7
	vpmuludq	%xmm4,%xmm9,%xmm6
	vpaddq	%xmm6,%xmm12,%xmm12
	vpshufd	$0x32,64(%rdi),%xmm8
	vpmuludq	%xmm3,%xmm9,%xmm5
	vpaddq	%xmm5,%xmm11,%xmm11
	vpmuludq	%xmm2,%xmm9,%xmm9
	vpaddq	%xmm9,%xmm10,%xmm10

	vpmuludq	%xmm0,%xmm7,%xmm7
	vpaddq	%xmm7,%xmm14,%xmm14
	vpmuludq	%xmm4,%xmm8,%xmm6
	vpaddq	%xmm6,%xmm13,%xmm13
	vpmuludq	%xmm3,%xmm8,%xmm5
	vpaddq	%xmm5,%xmm12,%xmm12
	vpmuludq	%xmm2,%xmm8,%xmm6
	vpaddq	%xmm6,%xmm11,%xmm11
	vpmuludq	%xmm1,%xmm8,%xmm8
	vpaddq	%xmm8,%xmm10,%xmm10

.Lshort_tail_avx:



	vpsrldq	$8,%xmm14,%xmm9
	vpsrldq	$8,%xmm13,%xmm8
	vpsrldq	$8,%xmm11,%xmm6
	vpsrldq	$8,%xmm10,%xmm5
	vpsrldq	$8,%xmm12,%xmm7
	vpaddq	%xmm8,%xmm13,%xmm13
	vpaddq	%xmm9,%xmm14,%xmm14
	vpaddq	%xmm5,%xmm10,%xmm10
	vpaddq	%xmm6,%xmm11,%xmm11
	vpaddq	%xmm7,%xmm12,%xmm12




	vpsrlq	$26,%xmm13,%xmm3
	vpand	%xmm15,%xmm13,%xmm13
	vpaddq	%xmm3,%xmm14,%xmm14

	vpsrlq	$26,%xmm10,%xmm0
	vpand	%xmm15,%xmm10,%xmm10
	vpaddq	%xmm0,%xmm11,%xmm11

	vpsrlq	$26,%xmm14,%xmm4
	vpand	%xmm15,%xmm14,%xmm14

	vpsrlq	$26,%xmm11,%xmm1
	vpand	%xmm15,%xmm11,%xmm11
	vpaddq	%xmm1,%xmm12,%xmm12

	vpaddq	%xmm4,%xmm10,%xmm10
	vpsllq	$2,%xmm4,%xmm4
	vpaddq	%xmm4,%xmm10,%xmm10

	vpsrlq	$26,%xmm12,%xmm2
	vpand	%xmm15,%xmm12,%xmm12
	vpaddq	%xmm2,%xmm13,%xmm13

	vpsrlq	$26,%xmm10,%xmm0
	vpand	%xmm15,%xmm10,%xmm10
	vpaddq	%xmm0,%xmm11,%xmm11

	vpsrlq	$26,%xmm13,%xmm3
	vpand	%xmm15,%xmm13,%xmm13
	vpaddq	%xmm3,%xmm14,%xmm14

	vmovd	%xmm10,-112(%rdi)
	vmovd	%xmm11,-108(%rdi)
	vmovd	%xmm12,-104(%rdi)
	vmovd	%xmm13,-100(%rdi)
	vmovd	%xmm14,-96(%rdi)
	leaq	88(%r11),%rsp
.cfi_def_cfa	%rsp,8
	vzeroupper
	.byte	0xf3,0xc3
.cfi_endproc
.size	poly1305_blocks_avx,.-poly1305_blocks_avx

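/*
 * poly1305_emit_avx: jumps to .Lemit when the state is still in base
 * 2^64; otherwise folds the 26-bit limbs back into base 2^64 and then
 * reduces and adds the nonce exactly as poly1305_emit does.
 */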
.type	poly1305_emit_avx,@function
.align	32
poly1305_emit_avx:
.cfi_startproc
	cmpl	$0,20(%rdi)
	je	.Lemit

	movl	0(%rdi),%eax
	movl	4(%rdi),%ecx
	movl	8(%rdi),%r8d
	movl	12(%rdi),%r11d
	movl	16(%rdi),%r10d

	shlq	$26,%rcx
	movq	%r8,%r9
	shlq	$52,%r8
	addq	%rcx,%rax
	shrq	$12,%r9
	addq	%rax,%r8
	adcq	$0,%r9

	shlq	$14,%r11
	movq	%r10,%rax
	shrq	$24,%r10
	addq	%r11,%r9
	shlq	$40,%rax
	addq	%rax,%r9
	adcq	$0,%r10

	movq	%r10,%rax
	movq	%r10,%rcx
	andq	$3,%r10
	shrq	$2,%rax
	andq	$-4,%rcx
	addq	%rcx,%rax
	addq	%rax,%r8
	adcq	$0,%r9
	adcq	$0,%r10

	movq	%r8,%rax
	addq	$5,%r8
	movq	%r9,%rcx
	adcq	$0,%r9
	adcq	$0,%r10
	shrq	$2,%r10
	cmovnzq	%r8,%rax
	cmovnzq	%r9,%rcx

	addq	0(%rdx),%rax
	adcq	8(%rdx),%rcx
	movq	%rax,0(%rsi)
	movq	%rcx,8(%rsi)

	.byte	0xf3,0xc3
.cfi_endproc
.size	poly1305_emit_avx,.-poly1305_emit_avx
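/*
 * poly1305_blocks_avx2: 4-way SIMD path processing 64 bytes per
 * .Loop_avx2 iteration, dispatching to poly1305_blocks_avx512 for
 * inputs of 512 bytes or more when the required AVX-512 features
 * are present.
 */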
.type	poly1305_blocks_avx2,@function
.align	32
poly1305_blocks_avx2:
.cfi_startproc
	movl	20(%rdi),%r8d
	cmpq	$128,%rdx
	jae	.Lblocks_avx2
	testl	%r8d,%r8d
	jz	.Lblocks

.Lblocks_avx2:
	andq	$-16,%rdx
	jz	.Lno_data_avx2

	vzeroupper

	testl	%r8d,%r8d
	jz	.Lbase2_64_avx2

	testq	$63,%rdx
	jz	.Leven_avx2

	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lblocks_avx2_body:

	movq	%rdx,%r15

	movq	0(%rdi),%r8
	movq	8(%rdi),%r9
	movl	16(%rdi),%ebp

	movq	24(%rdi),%r11
	movq	32(%rdi),%r13


	movl	%r8d,%r14d
	andq	$-2147483648,%r8
	movq	%r9,%r12
	movl	%r9d,%ebx
	andq	$-2147483648,%r9

	shrq	$6,%r8
	shlq	$52,%r12
	addq	%r8,%r14
	shrq	$12,%rbx
	shrq	$18,%r9
	addq	%r12,%r14
	adcq	%r9,%rbx

	movq	%rbp,%r8
	shlq	$40,%r8
	shrq	$24,%rbp
	addq	%r8,%rbx
	adcq	$0,%rbp

	movq	$-4,%r9
	movq	%rbp,%r8
	andq	%rbp,%r9
	shrq	$2,%r8
	andq	$3,%rbp
	addq	%r9,%r8
	addq	%r8,%r14
	adcq	$0,%rbx
	adcq	$0,%rbp

	movq	%r13,%r12
	movq	%r13,%rax
	shrq	$2,%r13
	addq	%r12,%r13

.Lbase2_26_pre_avx2:
	addq	0(%rsi),%r14
	adcq	8(%rsi),%rbx
	leaq	16(%rsi),%rsi
	adcq	%rcx,%rbp
	subq	$16,%r15

	call	__poly1305_block
	movq	%r12,%rax

	testq	$63,%r15
	jnz	.Lbase2_26_pre_avx2

	testq	%rcx,%rcx
	jz	.Lstore_base2_64_avx2


	movq	%r14,%rax
	movq	%r14,%rdx
	shrq	$52,%r14
	movq	%rbx,%r11
	movq	%rbx,%r12
	shrq	$26,%rdx
	andq	$0x3ffffff,%rax
	shlq	$12,%r11
	andq	$0x3ffffff,%rdx
	shrq	$14,%rbx
	orq	%r11,%r14
	shlq	$24,%rbp
	andq	$0x3ffffff,%r14
	shrq	$40,%r12
	andq	$0x3ffffff,%rbx
	orq	%r12,%rbp

	testq	%r15,%r15
	jz	.Lstore_base2_26_avx2

	vmovd	%eax,%xmm0
	vmovd	%edx,%xmm1
	vmovd	%r14d,%xmm2
	vmovd	%ebx,%xmm3
	vmovd	%ebp,%xmm4
	jmp	.Lproceed_avx2

.align	32
.Lstore_base2_64_avx2:
	movq	%r14,0(%rdi)
	movq	%rbx,8(%rdi)
	movq	%rbp,16(%rdi)
	jmp	.Ldone_avx2

.align	16
.Lstore_base2_26_avx2:
	movl	%eax,0(%rdi)
	movl	%edx,4(%rdi)
	movl	%r14d,8(%rdi)
	movl	%ebx,12(%rdi)
	movl	%ebp,16(%rdi)
.align	16
.Ldone_avx2:
	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbp
.cfi_restore	%rbp
	movq	40(%rsp),%rbx
.cfi_restore	%rbx
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lno_data_avx2:
.Lblocks_avx2_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc

.align	32
.Lbase2_64_avx2:
.cfi_startproc
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
.Lbase2_64_avx2_body:

	movq	%rdx,%r15

	movq	24(%rdi),%r11
	movq	32(%rdi),%r13

	movq	0(%rdi),%r14
	movq	8(%rdi),%rbx
	movl	16(%rdi),%ebp

	movq	%r13,%r12
	movq	%r13,%rax
	shrq	$2,%r13
	addq	%r12,%r13

	testq	$63,%rdx
	jz	.Linit_avx2

.Lbase2_64_pre_avx2:
	addq	0(%rsi),%r14
	adcq	8(%rsi),%rbx
	leaq	16(%rsi),%rsi
	adcq	%rcx,%rbp
	subq	$16,%r15

	call	__poly1305_block
	movq	%r12,%rax

	testq	$63,%r15
	jnz	.Lbase2_64_pre_avx2

.Linit_avx2:

	movq	%r14,%rax
	movq	%r14,%rdx
	shrq	$52,%r14
	movq	%rbx,%r8
	movq	%rbx,%r9
	shrq	$26,%rdx
	andq	$0x3ffffff,%rax
	shlq	$12,%r8
	andq	$0x3ffffff,%rdx
	shrq	$14,%rbx
	orq	%r8,%r14
	shlq	$24,%rbp
	andq	$0x3ffffff,%r14
	shrq	$40,%r9
	andq	$0x3ffffff,%rbx
	orq	%r9,%rbp

	vmovd	%eax,%xmm0
	vmovd	%edx,%xmm1
	vmovd	%r14d,%xmm2
	vmovd	%ebx,%xmm3
	vmovd	%ebp,%xmm4
	movl	$1,20(%rdi)

	call	__poly1305_init_avx

.Lproceed_avx2:
	movq	%r15,%rdx
	movl	OPENSSL_ia32cap_P+8(%rip),%r10d
	movl	$3221291008,%r11d

	movq	0(%rsp),%r15
.cfi_restore	%r15
	movq	8(%rsp),%r14
.cfi_restore	%r14
	movq	16(%rsp),%r13
.cfi_restore	%r13
	movq	24(%rsp),%r12
.cfi_restore	%r12
	movq	32(%rsp),%rbp
.cfi_restore	%rbp
	movq	40(%rsp),%rbx
.cfi_restore	%rbx
	leaq	48(%rsp),%rax
	leaq	48(%rsp),%rsp
.cfi_adjust_cfa_offset	-48
.Lbase2_64_avx2_epilogue:
	jmp	.Ldo_avx2
.cfi_endproc

.align	32
.Leven_avx2:
.cfi_startproc
	movl	OPENSSL_ia32cap_P+8(%rip),%r10d
	vmovd	0(%rdi),%xmm0
	vmovd	4(%rdi),%xmm1
	vmovd	8(%rdi),%xmm2
	vmovd	12(%rdi),%xmm3
	vmovd	16(%rdi),%xmm4

.Ldo_avx2:
	cmpq	$512,%rdx
	jb	.Lskip_avx512
	andl	%r11d,%r10d
	testl	$65536,%r10d
	jnz	.Lblocks_avx512
.Lskip_avx512:
	leaq	-8(%rsp),%r11
.cfi_def_cfa	%r11,16
	subq	$0x128,%rsp
	leaq	.Lconst(%rip),%rcx
	leaq	48+64(%rdi),%rdi
	vmovdqa	96(%rcx),%ymm7


	vmovdqu	-64(%rdi),%xmm9
	andq	$-512,%rsp
	vmovdqu	-48(%rdi),%xmm10
	vmovdqu	-32(%rdi),%xmm6
	vmovdqu	-16(%rdi),%xmm11
	vmovdqu	0(%rdi),%xmm12
	vmovdqu	16(%rdi),%xmm13
	leaq	144(%rsp),%rax
	vmovdqu	32(%rdi),%xmm14
	vpermd	%ymm9,%ymm7,%ymm9
	vmovdqu	48(%rdi),%xmm15
	vpermd	%ymm10,%ymm7,%ymm10
	vmovdqu	64(%rdi),%xmm5
	vpermd	%ymm6,%ymm7,%ymm6
	vmovdqa	%ymm9,0(%rsp)
	vpermd	%ymm11,%ymm7,%ymm11
	vmovdqa	%ymm10,32-144(%rax)
	vpermd	%ymm12,%ymm7,%ymm12
	vmovdqa	%ymm6,64-144(%rax)
	vpermd	%ymm13,%ymm7,%ymm13
	vmovdqa	%ymm11,96-144(%rax)
	vpermd	%ymm14,%ymm7,%ymm14
	vmovdqa	%ymm12,128-144(%rax)
	vpermd	%ymm15,%ymm7,%ymm15
	vmovdqa	%ymm13,160-144(%rax)
	vpermd	%ymm5,%ymm7,%ymm5
	vmovdqa	%ymm14,192-144(%rax)
	vmovdqa	%ymm15,224-144(%rax)
	vmovdqa	%ymm5,256-144(%rax)
	vmovdqa	64(%rcx),%ymm5



	vmovdqu	0(%rsi),%xmm7
	vmovdqu	16(%rsi),%xmm8
	vinserti128	$1,32(%rsi),%ymm7,%ymm7
	vinserti128	$1,48(%rsi),%ymm8,%ymm8
	leaq	64(%rsi),%rsi

	vpsrldq	$6,%ymm7,%ymm9
	vpsrldq	$6,%ymm8,%ymm10
	vpunpckhqdq	%ymm8,%ymm7,%ymm6
	vpunpcklqdq	%ymm10,%ymm9,%ymm9
	vpunpcklqdq	%ymm8,%ymm7,%ymm7

	vpsrlq	$30,%ymm9,%ymm10
	vpsrlq	$4,%ymm9,%ymm9
	vpsrlq	$26,%ymm7,%ymm8
	vpsrlq	$40,%ymm6,%ymm6
	vpand	%ymm5,%ymm9,%ymm9
	vpand	%ymm5,%ymm7,%ymm7
	vpand	%ymm5,%ymm8,%ymm8
	vpand	%ymm5,%ymm10,%ymm10
	vpor	32(%rcx),%ymm6,%ymm6

	vpaddq	%ymm2,%ymm9,%ymm2
	subq	$64,%rdx
	jz	.Ltail_avx2
	jmp	.Loop_avx2

.align	32
.Loop_avx2:








	vpaddq	%ymm0,%ymm7,%ymm0
	vmovdqa	0(%rsp),%ymm7
	vpaddq	%ymm1,%ymm8,%ymm1
	vmovdqa	32(%rsp),%ymm8
	vpaddq	%ymm3,%ymm10,%ymm3
	vmovdqa	96(%rsp),%ymm9
	vpaddq	%ymm4,%ymm6,%ymm4
	vmovdqa	48(%rax),%ymm10
	vmovdqa	112(%rax),%ymm5
















	vpmuludq	%ymm2,%ymm7,%ymm13
	vpmuludq	%ymm2,%ymm8,%ymm14
	vpmuludq	%ymm2,%ymm9,%ymm15
	vpmuludq	%ymm2,%ymm10,%ymm11
	vpmuludq	%ymm2,%ymm5,%ymm12

	vpmuludq	%ymm0,%ymm8,%ymm6
	vpmuludq	%ymm1,%ymm8,%ymm2
	vpaddq	%ymm6,%ymm12,%ymm12
	vpaddq	%ymm2,%ymm13,%ymm13
	vpmuludq	%ymm3,%ymm8,%ymm6
	vpmuludq	64(%rsp),%ymm4,%ymm2
	vpaddq	%ymm6,%ymm15,%ymm15
	vpaddq	%ymm2,%ymm11,%ymm11
	vmovdqa	-16(%rax),%ymm8

	vpmuludq	%ymm0,%ymm7,%ymm6
	vpmuludq	%ymm1,%ymm7,%ymm2
	vpaddq	%ymm6,%ymm11,%ymm11
	vpaddq	%ymm2,%ymm12,%ymm12
	vpmuludq	%ymm3,%ymm7,%ymm6
	vpmuludq	%ymm4,%ymm7,%ymm2
	vmovdqu	0(%rsi),%xmm7
	vpaddq	%ymm6,%ymm14,%ymm14
	vpaddq	%ymm2,%ymm15,%ymm15
	vinserti128	$1,32(%rsi),%ymm7,%ymm7

	vpmuludq	%ymm3,%ymm8,%ymm6
	vpmuludq	%ymm4,%ymm8,%ymm2
	vmovdqu	16(%rsi),%xmm8
	vpaddq	%ymm6,%ymm11,%ymm11
	vpaddq	%ymm2,%ymm12,%ymm12
	vmovdqa	16(%rax),%ymm2
	vpmuludq	%ymm1,%ymm9,%ymm6
	vpmuludq	%ymm0,%ymm9,%ymm9
	vpaddq	%ymm6,%ymm14,%ymm14
	vpaddq	%ymm9,%ymm13,%ymm13
	vinserti128	$1,48(%rsi),%ymm8,%ymm8
	leaq	64(%rsi),%rsi

	vpmuludq	%ymm1,%ymm2,%ymm6
	vpmuludq	%ymm0,%ymm2,%ymm2
	vpsrldq	$6,%ymm7,%ymm9
	vpaddq	%ymm6,%ymm15,%ymm15
	vpaddq	%ymm2,%ymm14,%ymm14
	vpmuludq	%ymm3,%ymm10,%ymm6
	vpmuludq	%ymm4,%ymm10,%ymm2
	vpsrldq	$6,%ymm8,%ymm10
	vpaddq	%ymm6,%ymm12,%ymm12
	vpaddq	%ymm2,%ymm13,%ymm13
	vpunpckhqdq	%ymm8,%ymm7,%ymm6

	vpmuludq	%ymm3,%ymm5,%ymm3
	vpmuludq	%ymm4,%ymm5,%ymm4
	vpunpcklqdq	%ymm8,%ymm7,%ymm7
	vpaddq	%ymm3,%ymm13,%ymm2
	vpaddq	%ymm4,%ymm14,%ymm3
	vpunpcklqdq	%ymm10,%ymm9,%ymm10
	vpmuludq	80(%rax),%ymm0,%ymm4
	vpmuludq	%ymm1,%ymm5,%ymm0
	vmovdqa	64(%rcx),%ymm5
	vpaddq	%ymm4,%ymm15,%ymm4
	vpaddq	%ymm0,%ymm11,%ymm0




	vpsrlq	$26,%ymm3,%ymm14
	vpand	%ymm5,%ymm3,%ymm3
	vpaddq	%ymm14,%ymm4,%ymm4

	vpsrlq	$26,%ymm0,%ymm11
	vpand	%ymm5,%ymm0,%ymm0
	vpaddq	%ymm11,%ymm12,%ymm1

	vpsrlq	$26,%ymm4,%ymm15
	vpand	%ymm5,%ymm4,%ymm4

	vpsrlq	$4,%ymm10,%ymm9

	vpsrlq	$26,%ymm1,%ymm12
	vpand	%ymm5,%ymm1,%ymm1
	vpaddq	%ymm12,%ymm2,%ymm2

	vpaddq	%ymm15,%ymm0,%ymm0
	vpsllq	$2,%ymm15,%ymm15
	vpaddq	%ymm15,%ymm0,%ymm0

	vpand	%ymm5,%ymm9,%ymm9
	vpsrlq	$26,%ymm7,%ymm8

	vpsrlq	$26,%ymm2,%ymm13
	vpand	%ymm5,%ymm2,%ymm2
	vpaddq	%ymm13,%ymm3,%ymm3

	vpaddq	%ymm9,%ymm2,%ymm2
	vpsrlq	$30,%ymm10,%ymm10

	vpsrlq	$26,%ymm0,%ymm11
	vpand	%ymm5,%ymm0,%ymm0
	vpaddq	%ymm11,%ymm1,%ymm1

	vpsrlq	$40,%ymm6,%ymm6

	vpsrlq	$26,%ymm3,%ymm14
	vpand	%ymm5,%ymm3,%ymm3
	vpaddq	%ymm14,%ymm4,%ymm4

	vpand	%ymm5,%ymm7,%ymm7
	vpand	%ymm5,%ymm8,%ymm8
	vpand	%ymm5,%ymm10,%ymm10
	vpor	32(%rcx),%ymm6,%ymm6

	subq	$64,%rdx
	jnz	.Loop_avx2

.byte	0x66,0x90
.Ltail_avx2:







	vpaddq	%ymm0,%ymm7,%ymm0
	vmovdqu	4(%rsp),%ymm7
	vpaddq	%ymm1,%ymm8,%ymm1
	vmovdqu	36(%rsp),%ymm8
	vpaddq	%ymm3,%ymm10,%ymm3
	vmovdqu	100(%rsp),%ymm9
	vpaddq	%ymm4,%ymm6,%ymm4
	vmovdqu	52(%rax),%ymm10
	vmovdqu	116(%rax),%ymm5

	vpmuludq	%ymm2,%ymm7,%ymm13
	vpmuludq	%ymm2,%ymm8,%ymm14
	vpmuludq	%ymm2,%ymm9,%ymm15
	vpmuludq	%ymm2,%ymm10,%ymm11
	vpmuludq	%ymm2,%ymm5,%ymm12

	vpmuludq	%ymm0,%ymm8,%ymm6
	vpmuludq	%ymm1,%ymm8,%ymm2
	vpaddq	%ymm6,%ymm12,%ymm12
	vpaddq	%ymm2,%ymm13,%ymm13
	vpmuludq	%ymm3,%ymm8,%ymm6
	vpmuludq	68(%rsp),%ymm4,%ymm2
	vpaddq	%ymm6,%ymm15,%ymm15
	vpaddq	%ymm2,%ymm11,%ymm11

	vpmuludq	%ymm0,%ymm7,%ymm6
	vpmuludq	%ymm1,%ymm7,%ymm2
	vpaddq	%ymm6,%ymm11,%ymm11
	vmovdqu	-12(%rax),%ymm8
	vpaddq	%ymm2,%ymm12,%ymm12
	vpmuludq	%ymm3,%ymm7,%ymm6
	vpmuludq	%ymm4,%ymm7,%ymm2
	vpaddq	%ymm6,%ymm14,%ymm14
	vpaddq	%ymm2,%ymm15,%ymm15

	vpmuludq	%ymm3,%ymm8,%ymm6
	vpmuludq	%ymm4,%ymm8,%ymm2
	vpaddq	%ymm6,%ymm11,%ymm11
	vpaddq	%ymm2,%ymm12,%ymm12
	vmovdqu	20(%rax),%ymm2
	vpmuludq	%ymm1,%ymm9,%ymm6
	vpmuludq	%ymm0,%ymm9,%ymm9
	vpaddq	%ymm6,%ymm14,%ymm14
	vpaddq	%ymm9,%ymm13,%ymm13

	vpmuludq	%ymm1,%ymm2,%ymm6
	vpmuludq	%ymm0,%ymm2,%ymm2
	vpaddq	%ymm6,%ymm15,%ymm15
	vpaddq	%ymm2,%ymm14,%ymm14
	vpmuludq	%ymm3,%ymm10,%ymm6
	vpmuludq	%ymm4,%ymm10,%ymm2
	vpaddq	%ymm6,%ymm12,%ymm12
	vpaddq	%ymm2,%ymm13,%ymm13

	vpmuludq	%ymm3,%ymm5,%ymm3
	vpmuludq	%ymm4,%ymm5,%ymm4
	vpaddq	%ymm3,%ymm13,%ymm2
	vpaddq	%ymm4,%ymm14,%ymm3
	vpmuludq	84(%rax),%ymm0,%ymm4
	vpmuludq	%ymm1,%ymm5,%ymm0
	vmovdqa	64(%rcx),%ymm5
	vpaddq	%ymm4,%ymm15,%ymm4
	vpaddq	%ymm0,%ymm11,%ymm0




	vpsrldq	$8,%ymm12,%ymm8
	vpsrldq	$8,%ymm2,%ymm9
	vpsrldq	$8,%ymm3,%ymm10
	vpsrldq	$8,%ymm4,%ymm6
	vpsrldq	$8,%ymm0,%ymm7
	vpaddq	%ymm8,%ymm12,%ymm12
	vpaddq	%ymm9,%ymm2,%ymm2
	vpaddq	%ymm10,%ymm3,%ymm3
	vpaddq	%ymm6,%ymm4,%ymm4
	vpaddq	%ymm7,%ymm0,%ymm0

	vpermq	$0x2,%ymm3,%ymm10
	vpermq	$0x2,%ymm4,%ymm6
	vpermq	$0x2,%ymm0,%ymm7
	vpermq	$0x2,%ymm12,%ymm8
	vpermq	$0x2,%ymm2,%ymm9
	vpaddq	%ymm10,%ymm3,%ymm3
	vpaddq	%ymm6,%ymm4,%ymm4
	vpaddq	%ymm7,%ymm0,%ymm0
	vpaddq	%ymm8,%ymm12,%ymm12
	vpaddq	%ymm9,%ymm2,%ymm2




	vpsrlq	$26,%ymm3,%ymm14
	vpand	%ymm5,%ymm3,%ymm3
	vpaddq	%ymm14,%ymm4,%ymm4

	vpsrlq	$26,%ymm0,%ymm11
	vpand	%ymm5,%ymm0,%ymm0
	vpaddq	%ymm11,%ymm12,%ymm1

	vpsrlq	$26,%ymm4,%ymm15
	vpand	%ymm5,%ymm4,%ymm4

	vpsrlq	$26,%ymm1,%ymm12
	vpand	%ymm5,%ymm1,%ymm1
	vpaddq	%ymm12,%ymm2,%ymm2

	vpaddq	%ymm15,%ymm0,%ymm0
	vpsllq	$2,%ymm15,%ymm15
	vpaddq	%ymm15,%ymm0,%ymm0

	vpsrlq	$26,%ymm2,%ymm13
	vpand	%ymm5,%ymm2,%ymm2
	vpaddq	%ymm13,%ymm3,%ymm3

	vpsrlq	$26,%ymm0,%ymm11
	vpand	%ymm5,%ymm0,%ymm0
	vpaddq	%ymm11,%ymm1,%ymm1

	vpsrlq	$26,%ymm3,%ymm14
	vpand	%ymm5,%ymm3,%ymm3
	vpaddq	%ymm14,%ymm4,%ymm4

	vmovd	%xmm0,-112(%rdi)
	vmovd	%xmm1,-108(%rdi)
	vmovd	%xmm2,-104(%rdi)
	vmovd	%xmm3,-100(%rdi)
	vmovd	%xmm4,-96(%rdi)
	leaq	8(%r11),%rsp
.cfi_def_cfa	%rsp,8
	vzeroupper
	.byte	0xf3,0xc3
.cfi_endproc
.size	poly1305_blocks_avx2,.-poly1305_blocks_avx2
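/*
 * poly1305_blocks_avx512: 8-way AVX-512F path.  Extends the stored key
 * powers up to r^8 with vector multiplies, consumes 128 bytes per
 * .Loop_avx512 iteration and re-enters .Ltail_avx2 for the leftovers.
 */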
.type	poly1305_blocks_avx512,@function
.align	32
poly1305_blocks_avx512:
.cfi_startproc
.Lblocks_avx512:
	movl	$15,%eax
	kmovw	%eax,%k2
	leaq	-8(%rsp),%r11
.cfi_def_cfa	%r11,16
	subq	$0x128,%rsp
	leaq	.Lconst(%rip),%rcx
	leaq	48+64(%rdi),%rdi
	vmovdqa	96(%rcx),%ymm9


	vmovdqu	-64(%rdi),%xmm11
	andq	$-512,%rsp
	vmovdqu	-48(%rdi),%xmm12
	movq	$0x20,%rax
	vmovdqu	-32(%rdi),%xmm7
	vmovdqu	-16(%rdi),%xmm13
	vmovdqu	0(%rdi),%xmm8
	vmovdqu	16(%rdi),%xmm14
	vmovdqu	32(%rdi),%xmm10
	vmovdqu	48(%rdi),%xmm15
	vmovdqu	64(%rdi),%xmm6
	vpermd	%zmm11,%zmm9,%zmm16
	vpbroadcastq	64(%rcx),%zmm5
	vpermd	%zmm12,%zmm9,%zmm17
	vpermd	%zmm7,%zmm9,%zmm21
	vpermd	%zmm13,%zmm9,%zmm18
	vmovdqa64	%zmm16,0(%rsp){%k2}
	vpsrlq	$32,%zmm16,%zmm7
	vpermd	%zmm8,%zmm9,%zmm22
	vmovdqu64	%zmm17,0(%rsp,%rax,1){%k2}
	vpsrlq	$32,%zmm17,%zmm8
	vpermd	%zmm14,%zmm9,%zmm19
	vmovdqa64	%zmm21,64(%rsp){%k2}
	vpermd	%zmm10,%zmm9,%zmm23
	vpermd	%zmm15,%zmm9,%zmm20
	vmovdqu64	%zmm18,64(%rsp,%rax,1){%k2}
	vpermd	%zmm6,%zmm9,%zmm24
	vmovdqa64	%zmm22,128(%rsp){%k2}
	vmovdqu64	%zmm19,128(%rsp,%rax,1){%k2}
	vmovdqa64	%zmm23,192(%rsp){%k2}
	vmovdqu64	%zmm20,192(%rsp,%rax,1){%k2}
	vmovdqa64	%zmm24,256(%rsp){%k2}










	vpmuludq	%zmm7,%zmm16,%zmm11
	vpmuludq	%zmm7,%zmm17,%zmm12
	vpmuludq	%zmm7,%zmm18,%zmm13
	vpmuludq	%zmm7,%zmm19,%zmm14
	vpmuludq	%zmm7,%zmm20,%zmm15
	vpsrlq	$32,%zmm18,%zmm9

	vpmuludq	%zmm8,%zmm24,%zmm25
	vpmuludq	%zmm8,%zmm16,%zmm26
	vpmuludq	%zmm8,%zmm17,%zmm27
	vpmuludq	%zmm8,%zmm18,%zmm28
	vpmuludq	%zmm8,%zmm19,%zmm29
	vpsrlq	$32,%zmm19,%zmm10
	vpaddq	%zmm25,%zmm11,%zmm11
	vpaddq	%zmm26,%zmm12,%zmm12
	vpaddq	%zmm27,%zmm13,%zmm13
	vpaddq	%zmm28,%zmm14,%zmm14
	vpaddq	%zmm29,%zmm15,%zmm15

	vpmuludq	%zmm9,%zmm23,%zmm25
	vpmuludq	%zmm9,%zmm24,%zmm26
	vpmuludq	%zmm9,%zmm17,%zmm28
	vpmuludq	%zmm9,%zmm18,%zmm29
	vpmuludq	%zmm9,%zmm16,%zmm27
	vpsrlq	$32,%zmm20,%zmm6
	vpaddq	%zmm25,%zmm11,%zmm11
	vpaddq	%zmm26,%zmm12,%zmm12
	vpaddq	%zmm28,%zmm14,%zmm14
	vpaddq	%zmm29,%zmm15,%zmm15
	vpaddq	%zmm27,%zmm13,%zmm13

	vpmuludq	%zmm10,%zmm22,%zmm25
	vpmuludq	%zmm10,%zmm16,%zmm28
	vpmuludq	%zmm10,%zmm17,%zmm29
	vpmuludq	%zmm10,%zmm23,%zmm26
	vpmuludq	%zmm10,%zmm24,%zmm27
	vpaddq	%zmm25,%zmm11,%zmm11
	vpaddq	%zmm28,%zmm14,%zmm14
	vpaddq	%zmm29,%zmm15,%zmm15
	vpaddq	%zmm26,%zmm12,%zmm12
	vpaddq	%zmm27,%zmm13,%zmm13

	vpmuludq	%zmm6,%zmm24,%zmm28
	vpmuludq	%zmm6,%zmm16,%zmm29
	vpmuludq	%zmm6,%zmm21,%zmm25
	vpmuludq	%zmm6,%zmm22,%zmm26
	vpmuludq	%zmm6,%zmm23,%zmm27
	vpaddq	%zmm28,%zmm14,%zmm14
	vpaddq	%zmm29,%zmm15,%zmm15
	vpaddq	%zmm25,%zmm11,%zmm11
	vpaddq	%zmm26,%zmm12,%zmm12
	vpaddq	%zmm27,%zmm13,%zmm13



	vmovdqu64	0(%rsi),%zmm10
	vmovdqu64	64(%rsi),%zmm6
	leaq	128(%rsi),%rsi




	vpsrlq	$26,%zmm14,%zmm28
	vpandq	%zmm5,%zmm14,%zmm14
	vpaddq	%zmm28,%zmm15,%zmm15

	vpsrlq	$26,%zmm11,%zmm25
	vpandq	%zmm5,%zmm11,%zmm11
	vpaddq	%zmm25,%zmm12,%zmm12

	vpsrlq	$26,%zmm15,%zmm29
	vpandq	%zmm5,%zmm15,%zmm15

	vpsrlq	$26,%zmm12,%zmm26
	vpandq	%zmm5,%zmm12,%zmm12
	vpaddq	%zmm26,%zmm13,%zmm13

	vpaddq	%zmm29,%zmm11,%zmm11
	vpsllq	$2,%zmm29,%zmm29
	vpaddq	%zmm29,%zmm11,%zmm11

	vpsrlq	$26,%zmm13,%zmm27
	vpandq	%zmm5,%zmm13,%zmm13
	vpaddq	%zmm27,%zmm14,%zmm14

	vpsrlq	$26,%zmm11,%zmm25
	vpandq	%zmm5,%zmm11,%zmm11
	vpaddq	%zmm25,%zmm12,%zmm12

	vpsrlq	$26,%zmm14,%zmm28
	vpandq	%zmm5,%zmm14,%zmm14
	vpaddq	%zmm28,%zmm15,%zmm15





	vpunpcklqdq	%zmm6,%zmm10,%zmm7
	vpunpckhqdq	%zmm6,%zmm10,%zmm6






	vmovdqa32	128(%rcx),%zmm25
	movl	$0x7777,%eax
	kmovw	%eax,%k1

	vpermd	%zmm16,%zmm25,%zmm16
	vpermd	%zmm17,%zmm25,%zmm17
	vpermd	%zmm18,%zmm25,%zmm18
	vpermd	%zmm19,%zmm25,%zmm19
	vpermd	%zmm20,%zmm25,%zmm20

	vpermd	%zmm11,%zmm25,%zmm16{%k1}
	vpermd	%zmm12,%zmm25,%zmm17{%k1}
	vpermd	%zmm13,%zmm25,%zmm18{%k1}
	vpermd	%zmm14,%zmm25,%zmm19{%k1}
	vpermd	%zmm15,%zmm25,%zmm20{%k1}

	vpslld	$2,%zmm17,%zmm21
	vpslld	$2,%zmm18,%zmm22
	vpslld	$2,%zmm19,%zmm23
	vpslld	$2,%zmm20,%zmm24
	vpaddd	%zmm17,%zmm21,%zmm21
	vpaddd	%zmm18,%zmm22,%zmm22
	vpaddd	%zmm19,%zmm23,%zmm23
	vpaddd	%zmm20,%zmm24,%zmm24

	vpbroadcastq	32(%rcx),%zmm30

	vpsrlq	$52,%zmm7,%zmm9
	vpsllq	$12,%zmm6,%zmm10
	vporq	%zmm10,%zmm9,%zmm9
	vpsrlq	$26,%zmm7,%zmm8
	vpsrlq	$14,%zmm6,%zmm10
	vpsrlq	$40,%zmm6,%zmm6
	vpandq	%zmm5,%zmm9,%zmm9
	vpandq	%zmm5,%zmm7,%zmm7




	vpaddq	%zmm2,%zmm9,%zmm2
	subq	$192,%rdx
	jbe	.Ltail_avx512
	jmp	.Loop_avx512

.align	32
.Loop_avx512:




























	vpmuludq	%zmm2,%zmm17,%zmm14
	vpaddq	%zmm0,%zmm7,%zmm0
	vpmuludq	%zmm2,%zmm18,%zmm15
	vpandq	%zmm5,%zmm8,%zmm8
	vpmuludq	%zmm2,%zmm23,%zmm11
	vpandq	%zmm5,%zmm10,%zmm10
	vpmuludq	%zmm2,%zmm24,%zmm12
	vporq	%zmm30,%zmm6,%zmm6
	vpmuludq	%zmm2,%zmm16,%zmm13
	vpaddq	%zmm1,%zmm8,%zmm1
	vpaddq	%zmm3,%zmm10,%zmm3
	vpaddq	%zmm4,%zmm6,%zmm4

	vmovdqu64	0(%rsi),%zmm10
	vmovdqu64	64(%rsi),%zmm6
	leaq	128(%rsi),%rsi
	vpmuludq	%zmm0,%zmm19,%zmm28
	vpmuludq	%zmm0,%zmm20,%zmm29
	vpmuludq	%zmm0,%zmm16,%zmm25
	vpmuludq	%zmm0,%zmm17,%zmm26
	vpaddq	%zmm28,%zmm14,%zmm14
	vpaddq	%zmm29,%zmm15,%zmm15
	vpaddq	%zmm25,%zmm11,%zmm11
	vpaddq	%zmm26,%zmm12,%zmm12

	vpmuludq	%zmm1,%zmm18,%zmm28
	vpmuludq	%zmm1,%zmm19,%zmm29
	vpmuludq	%zmm1,%zmm24,%zmm25
	vpmuludq	%zmm0,%zmm18,%zmm27
	vpaddq	%zmm28,%zmm14,%zmm14
	vpaddq	%zmm29,%zmm15,%zmm15
	vpaddq	%zmm25,%zmm11,%zmm11
	vpaddq	%zmm27,%zmm13,%zmm13

	vpunpcklqdq	%zmm6,%zmm10,%zmm7
	vpunpckhqdq	%zmm6,%zmm10,%zmm6

	vpmuludq	%zmm3,%zmm16,%zmm28
	vpmuludq	%zmm3,%zmm17,%zmm29
	vpmuludq	%zmm1,%zmm16,%zmm26
	vpmuludq	%zmm1,%zmm17,%zmm27
	vpaddq	%zmm28,%zmm14,%zmm14
	vpaddq	%zmm29,%zmm15,%zmm15
	vpaddq	%zmm26,%zmm12,%zmm12
	vpaddq	%zmm27,%zmm13,%zmm13

	vpmuludq	%zmm4,%zmm24,%zmm28
	vpmuludq	%zmm4,%zmm16,%zmm29
	vpmuludq	%zmm3,%zmm22,%zmm25
	vpmuludq	%zmm3,%zmm23,%zmm26
	vpaddq	%zmm28,%zmm14,%zmm14
	vpmuludq	%zmm3,%zmm24,%zmm27
	vpaddq	%zmm29,%zmm15,%zmm15
	vpaddq	%zmm25,%zmm11,%zmm11
	vpaddq	%zmm26,%zmm12,%zmm12
	vpaddq	%zmm27,%zmm13,%zmm13

	vpmuludq	%zmm4,%zmm21,%zmm25
	vpmuludq	%zmm4,%zmm22,%zmm26
	vpmuludq	%zmm4,%zmm23,%zmm27
	vpaddq	%zmm25,%zmm11,%zmm0
	vpaddq	%zmm26,%zmm12,%zmm1
	vpaddq	%zmm27,%zmm13,%zmm2




	vpsrlq	$52,%zmm7,%zmm9
	vpsllq	$12,%zmm6,%zmm10

	vpsrlq	$26,%zmm14,%zmm3
	vpandq	%zmm5,%zmm14,%zmm14
	vpaddq	%zmm3,%zmm15,%zmm4

	vporq	%zmm10,%zmm9,%zmm9

	vpsrlq	$26,%zmm0,%zmm11
	vpandq	%zmm5,%zmm0,%zmm0
	vpaddq	%zmm11,%zmm1,%zmm1

	vpandq	%zmm5,%zmm9,%zmm9

	vpsrlq	$26,%zmm4,%zmm15
	vpandq	%zmm5,%zmm4,%zmm4

	vpsrlq	$26,%zmm1,%zmm12
	vpandq	%zmm5,%zmm1,%zmm1
	vpaddq	%zmm12,%zmm2,%zmm2

	vpaddq	%zmm15,%zmm0,%zmm0
	vpsllq	$2,%zmm15,%zmm15
	vpaddq	%zmm15,%zmm0,%zmm0

	vpaddq	%zmm9,%zmm2,%zmm2
	vpsrlq	$26,%zmm7,%zmm8

	vpsrlq	$26,%zmm2,%zmm13
	vpandq	%zmm5,%zmm2,%zmm2
	vpaddq	%zmm13,%zmm14,%zmm3

	vpsrlq	$14,%zmm6,%zmm10

	vpsrlq	$26,%zmm0,%zmm11
	vpandq	%zmm5,%zmm0,%zmm0
	vpaddq	%zmm11,%zmm1,%zmm1

	vpsrlq	$40,%zmm6,%zmm6

	vpsrlq	$26,%zmm3,%zmm14
	vpandq	%zmm5,%zmm3,%zmm3
	vpaddq	%zmm14,%zmm4,%zmm4

	vpandq	%zmm5,%zmm7,%zmm7




	subq	$128,%rdx
	ja	.Loop_avx512

.Ltail_avx512:





	vpsrlq	$32,%zmm16,%zmm16
	vpsrlq	$32,%zmm17,%zmm17
	vpsrlq	$32,%zmm18,%zmm18
	vpsrlq	$32,%zmm23,%zmm23
	vpsrlq	$32,%zmm24,%zmm24
	vpsrlq	$32,%zmm19,%zmm19
	vpsrlq	$32,%zmm20,%zmm20
	vpsrlq	$32,%zmm21,%zmm21
	vpsrlq	$32,%zmm22,%zmm22



	leaq	(%rsi,%rdx,1),%rsi


	vpaddq	%zmm0,%zmm7,%zmm0

	vpmuludq	%zmm2,%zmm17,%zmm14
	vpmuludq	%zmm2,%zmm18,%zmm15
	vpmuludq	%zmm2,%zmm23,%zmm11
	vpandq	%zmm5,%zmm8,%zmm8
	vpmuludq	%zmm2,%zmm24,%zmm12
	vpandq	%zmm5,%zmm10,%zmm10
	vpmuludq	%zmm2,%zmm16,%zmm13
	vporq	%zmm30,%zmm6,%zmm6
	vpaddq	%zmm1,%zmm8,%zmm1
	vpaddq	%zmm3,%zmm10,%zmm3
	vpaddq	%zmm4,%zmm6,%zmm4

	vmovdqu	0(%rsi),%xmm7
	vpmuludq	%zmm0,%zmm19,%zmm28
	vpmuludq	%zmm0,%zmm20,%zmm29
	vpmuludq	%zmm0,%zmm16,%zmm25
	vpmuludq	%zmm0,%zmm17,%zmm26
	vpaddq	%zmm28,%zmm14,%zmm14
	vpaddq	%zmm29,%zmm15,%zmm15
	vpaddq	%zmm25,%zmm11,%zmm11
	vpaddq	%zmm26,%zmm12,%zmm12

	vmovdqu	16(%rsi),%xmm8
	vpmuludq	%zmm1,%zmm18,%zmm28
	vpmuludq	%zmm1,%zmm19,%zmm29
	vpmuludq	%zmm1,%zmm24,%zmm25
	vpmuludq	%zmm0,%zmm18,%zmm27
	vpaddq	%zmm28,%zmm14,%zmm14
	vpaddq	%zmm29,%zmm15,%zmm15
	vpaddq	%zmm25,%zmm11,%zmm11
	vpaddq	%zmm27,%zmm13,%zmm13

	vinserti128	$1,32(%rsi),%ymm7,%ymm7
	vpmuludq	%zmm3,%zmm16,%zmm28
	vpmuludq	%zmm3,%zmm17,%zmm29
	vpmuludq	%zmm1,%zmm16,%zmm26
	vpmuludq	%zmm1,%zmm17,%zmm27
	vpaddq	%zmm28,%zmm14,%zmm14
	vpaddq	%zmm29,%zmm15,%zmm15
	vpaddq	%zmm26,%zmm12,%zmm12
	vpaddq	%zmm27,%zmm13,%zmm13

	vinserti128	$1,48(%rsi),%ymm8,%ymm8
	vpmuludq	%zmm4,%zmm24,%zmm28
	vpmuludq	%zmm4,%zmm16,%zmm29
	vpmuludq	%zmm3,%zmm22,%zmm25
	vpmuludq	%zmm3,%zmm23,%zmm26
	vpmuludq	%zmm3,%zmm24,%zmm27
	vpaddq	%zmm28,%zmm14,%zmm3
	vpaddq	%zmm29,%zmm15,%zmm15
	vpaddq	%zmm25,%zmm11,%zmm11
	vpaddq	%zmm26,%zmm12,%zmm12
	vpaddq	%zmm27,%zmm13,%zmm13

	vpmuludq	%zmm4,%zmm21,%zmm25
	vpmuludq	%zmm4,%zmm22,%zmm26
	vpmuludq	%zmm4,%zmm23,%zmm27
	vpaddq	%zmm25,%zmm11,%zmm0
	vpaddq	%zmm26,%zmm12,%zmm1
	vpaddq	%zmm27,%zmm13,%zmm2




	movl	$1,%eax
	vpermq	$0xb1,%zmm3,%zmm14
	vpermq	$0xb1,%zmm15,%zmm4
	vpermq	$0xb1,%zmm0,%zmm11
	vpermq	$0xb1,%zmm1,%zmm12
	vpermq	$0xb1,%zmm2,%zmm13
	vpaddq	%zmm14,%zmm3,%zmm3
	vpaddq	%zmm15,%zmm4,%zmm4
	vpaddq	%zmm11,%zmm0,%zmm0
	vpaddq	%zmm12,%zmm1,%zmm1
	vpaddq	%zmm13,%zmm2,%zmm2

	kmovw	%eax,%k3
	vpermq	$0x2,%zmm3,%zmm14
	vpermq	$0x2,%zmm4,%zmm15
	vpermq	$0x2,%zmm0,%zmm11
	vpermq	$0x2,%zmm1,%zmm12
	vpermq	$0x2,%zmm2,%zmm13
	vpaddq	%zmm14,%zmm3,%zmm3
	vpaddq	%zmm15,%zmm4,%zmm4
	vpaddq	%zmm11,%zmm0,%zmm0
	vpaddq	%zmm12,%zmm1,%zmm1
	vpaddq	%zmm13,%zmm2,%zmm2

	vextracti64x4	$0x1,%zmm3,%ymm14
	vextracti64x4	$0x1,%zmm4,%ymm15
	vextracti64x4	$0x1,%zmm0,%ymm11
	vextracti64x4	$0x1,%zmm1,%ymm12
	vextracti64x4	$0x1,%zmm2,%ymm13
	vpaddq	%zmm14,%zmm3,%zmm3{%k3}{z}
	vpaddq	%zmm15,%zmm4,%zmm4{%k3}{z}
	vpaddq	%zmm11,%zmm0,%zmm0{%k3}{z}
	vpaddq	%zmm12,%zmm1,%zmm1{%k3}{z}
	vpaddq	%zmm13,%zmm2,%zmm2{%k3}{z}



	vpsrlq	$26,%ymm3,%ymm14
	vpand	%ymm5,%ymm3,%ymm3
	vpsrldq	$6,%ymm7,%ymm9
	vpsrldq	$6,%ymm8,%ymm10
	vpunpckhqdq	%ymm8,%ymm7,%ymm6
	vpaddq	%ymm14,%ymm4,%ymm4

	vpsrlq	$26,%ymm0,%ymm11
	vpand	%ymm5,%ymm0,%ymm0
	vpunpcklqdq	%ymm10,%ymm9,%ymm9
	vpunpcklqdq	%ymm8,%ymm7,%ymm7
	vpaddq	%ymm11,%ymm1,%ymm1

	vpsrlq	$26,%ymm4,%ymm15
	vpand	%ymm5,%ymm4,%ymm4

	vpsrlq	$26,%ymm1,%ymm12
	vpand	%ymm5,%ymm1,%ymm1
	vpsrlq	$30,%ymm9,%ymm10
	vpsrlq	$4,%ymm9,%ymm9
	vpaddq	%ymm12,%ymm2,%ymm2

	vpaddq	%ymm15,%ymm0,%ymm0
	vpsllq	$2,%ymm15,%ymm15
	vpsrlq	$26,%ymm7,%ymm8
	vpsrlq	$40,%ymm6,%ymm6
	vpaddq	%ymm15,%ymm0,%ymm0

	vpsrlq	$26,%ymm2,%ymm13
	vpand	%ymm5,%ymm2,%ymm2
	vpand	%ymm5,%ymm9,%ymm9
	vpand	%ymm5,%ymm7,%ymm7
	vpaddq	%ymm13,%ymm3,%ymm3

	vpsrlq	$26,%ymm0,%ymm11
	vpand	%ymm5,%ymm0,%ymm0
	vpaddq	%ymm2,%ymm9,%ymm2
	vpand	%ymm5,%ymm8,%ymm8
	vpaddq	%ymm11,%ymm1,%ymm1

	vpsrlq	$26,%ymm3,%ymm14
	vpand	%ymm5,%ymm3,%ymm3
	vpand	%ymm5,%ymm10,%ymm10
	vpor	32(%rcx),%ymm6,%ymm6
	vpaddq	%ymm14,%ymm4,%ymm4

	leaq	144(%rsp),%rax
	addq	$64,%rdx
	jnz	.Ltail_avx2

	vpsubq	%ymm9,%ymm2,%ymm2
	vmovd	%xmm0,-112(%rdi)
	vmovd	%xmm1,-108(%rdi)
	vmovd	%xmm2,-104(%rdi)
	vmovd	%xmm3,-100(%rdi)
	vmovd	%xmm4,-96(%rdi)
	vzeroall
	leaq	8(%r11),%rsp
.cfi_def_cfa	%rsp,8
	.byte	0xf3,0xc3
.cfi_endproc
.size	poly1305_blocks_avx512,.-poly1305_blocks_avx512
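/*
 * poly1305_init_base2_44: key setup for the VPMADD52 code.  Stores the
 * clamped r as 44/44/42-bit limbs plus their *20 pre-multiples and
 * points the function table at the base 2^44 blocks/emit routines.
 */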
.type	poly1305_init_base2_44,@function
.align	32
poly1305_init_base2_44:
.cfi_startproc
	xorq	%rax,%rax
	movq	%rax,0(%rdi)
	movq	%rax,8(%rdi)
	movq	%rax,16(%rdi)

.Linit_base2_44:
	leaq	poly1305_blocks_vpmadd52(%rip),%r10
	leaq	poly1305_emit_base2_44(%rip),%r11

	movq	$0x0ffffffc0fffffff,%rax
	movq	$0x0ffffffc0ffffffc,%rcx
	andq	0(%rsi),%rax
	movq	$0x00000fffffffffff,%r8
	andq	8(%rsi),%rcx
	movq	$0x00000fffffffffff,%r9
	andq	%rax,%r8
	shrdq	$44,%rcx,%rax
	movq	%r8,40(%rdi)
	andq	%r9,%rax
	shrq	$24,%rcx
	movq	%rax,48(%rdi)
	leaq	(%rax,%rax,4),%rax
	movq	%rcx,56(%rdi)
	shlq	$2,%rax
	leaq	(%rcx,%rcx,4),%rcx
	shlq	$2,%rcx
	movq	%rax,24(%rdi)
	movq	%rcx,32(%rdi)
	movq	$-1,64(%rdi)
	movq	%r10,0(%rdx)
	movq	%r11,8(%rdx)
	movl	$1,%eax
	.byte	0xf3,0xc3
.cfi_endproc
.size	poly1305_init_base2_44,.-poly1305_init_base2_44
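/*
 * poly1305_blocks_vpmadd52: AVX-512 IFMA path in base 2^44, built on
 * vpmadd52luq/vpmadd52huq.  Processes a few leading blocks one at a
 * time (apparently to settle the key powers and the block count), then
 * continues in poly1305_blocks_vpmadd52_4x.
 */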
.type	poly1305_blocks_vpmadd52,@function
.align	32
poly1305_blocks_vpmadd52:
.cfi_startproc
	shrq	$4,%rdx
	jz	.Lno_data_vpmadd52

	shlq	$40,%rcx
	movq	64(%rdi),%r8






	movq	$3,%rax
	movq	$1,%r10
	cmpq	$4,%rdx
	cmovaeq	%r10,%rax
	testq	%r8,%r8
	cmovnsq	%r10,%rax

	andq	%rdx,%rax
	jz	.Lblocks_vpmadd52_4x

	subq	%rax,%rdx
	movl	$7,%r10d
	movl	$1,%r11d
	kmovw	%r10d,%k7
	leaq	.L2_44_inp_permd(%rip),%r10
	kmovw	%r11d,%k1

	vmovq	%rcx,%xmm21
	vmovdqa64	0(%r10),%ymm19
	vmovdqa64	32(%r10),%ymm20
	vpermq	$0xcf,%ymm21,%ymm21
	vmovdqa64	64(%r10),%ymm22

	vmovdqu64	0(%rdi),%ymm16{%k7}{z}
	vmovdqu64	40(%rdi),%ymm3{%k7}{z}
	vmovdqu64	32(%rdi),%ymm4{%k7}{z}
	vmovdqu64	24(%rdi),%ymm5{%k7}{z}

	vmovdqa64	96(%r10),%ymm23
	vmovdqa64	128(%r10),%ymm24

	jmp	.Loop_vpmadd52

.align	32
.Loop_vpmadd52:
	vmovdqu32	0(%rsi),%xmm18
	leaq	16(%rsi),%rsi

	vpermd	%ymm18,%ymm19,%ymm18
	vpsrlvq	%ymm20,%ymm18,%ymm18
	vpandq	%ymm22,%ymm18,%ymm18
	vporq	%ymm21,%ymm18,%ymm18

	vpaddq	%ymm18,%ymm16,%ymm16

	vpermq	$0,%ymm16,%ymm0{%k7}{z}
	vpermq	$85,%ymm16,%ymm1{%k7}{z}
	vpermq	$170,%ymm16,%ymm2{%k7}{z}

	vpxord	%ymm16,%ymm16,%ymm16
	vpxord	%ymm17,%ymm17,%ymm17

	vpmadd52luq	%ymm3,%ymm0,%ymm16
	vpmadd52huq	%ymm3,%ymm0,%ymm17

	vpmadd52luq	%ymm4,%ymm1,%ymm16
	vpmadd52huq	%ymm4,%ymm1,%ymm17

	vpmadd52luq	%ymm5,%ymm2,%ymm16
	vpmadd52huq	%ymm5,%ymm2,%ymm17

	vpsrlvq	%ymm23,%ymm16,%ymm18
	vpsllvq	%ymm24,%ymm17,%ymm17
	vpandq	%ymm22,%ymm16,%ymm16

	vpaddq	%ymm18,%ymm17,%ymm17

	vpermq	$147,%ymm17,%ymm17

	vpaddq	%ymm17,%ymm16,%ymm16

	vpsrlvq	%ymm23,%ymm16,%ymm18
	vpandq	%ymm22,%ymm16,%ymm16

	vpermq	$147,%ymm18,%ymm18

	vpaddq	%ymm18,%ymm16,%ymm16

	vpermq	$147,%ymm16,%ymm18{%k1}{z}

	vpaddq	%ymm18,%ymm16,%ymm16
	vpsllq	$2,%ymm18,%ymm18

	vpaddq	%ymm18,%ymm16,%ymm16

	decq	%rax
	jnz	.Loop_vpmadd52

	vmovdqu64	%ymm16,0(%rdi){%k7}

	testq	%rdx,%rdx
	jnz	.Lblocks_vpmadd52_4x

.Lno_data_vpmadd52:
	.byte	0xf3,0xc3
.cfi_endproc
.size	poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52
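/*
 * Everything from here down leans on the AVX-512 IFMA instructions:
 * VPMADD52LUQ and VPMADD52HUQ multiply the low 52 bits of each 64-bit
 * lane of the two sources and accumulate the low/high 52 bits of the
 * 104-bit product into the destination.  Per-lane semantics, sketched
 * in C (names are ours, for illustration only):
 *
 *	#include <stdint.h>
 *
 *	static void madd52(uint64_t *lo, uint64_t *hi,
 *	    uint64_t a, uint64_t b)
 *	{
 *		const uint64_t mask52 = ((uint64_t)1 << 52) - 1;
 *		unsigned __int128 p =
 *		    (unsigned __int128)(a & mask52) * (b & mask52);
 *		*lo += (uint64_t)(p & mask52);	// VPMADD52LUQ
 *		*hi += (uint64_t)(p >> 52);	// VPMADD52HUQ
 *	}
 *
 * With 44/44/42-bit limbs every partial product fits comfortably in
 * the 52-bit halves, so several terms can be accumulated before a
 * carry pass is required.
 */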
.type	poly1305_blocks_vpmadd52_4x,@function
.align	32
poly1305_blocks_vpmadd52_4x:
.cfi_startproc
	shrq	$4,%rdx
	jz	.Lno_data_vpmadd52_4x

	shlq	$40,%rcx
	movq	64(%rdi),%r8

.Lblocks_vpmadd52_4x:
	vpbroadcastq	%rcx,%ymm31

	vmovdqa64	.Lx_mask44(%rip),%ymm28
	movl	$5,%eax
	vmovdqa64	.Lx_mask42(%rip),%ymm29
	kmovw	%eax,%k1

	testq	%r8,%r8
	js	.Linit_vpmadd52

	vmovq	0(%rdi),%xmm0
	vmovq	8(%rdi),%xmm1
	vmovq	16(%rdi),%xmm2

	testq	$3,%rdx
	jnz	.Lblocks_vpmadd52_2x_do

.Lblocks_vpmadd52_4x_do:
	vpbroadcastq	64(%rdi),%ymm3
	vpbroadcastq	96(%rdi),%ymm4
	vpbroadcastq	128(%rdi),%ymm5
	vpbroadcastq	160(%rdi),%ymm16

.Lblocks_vpmadd52_4x_key_loaded:
	vpsllq	$2,%ymm5,%ymm17
	vpaddq	%ymm5,%ymm17,%ymm17
	vpsllq	$2,%ymm17,%ymm17

	testq	$7,%rdx
	jz	.Lblocks_vpmadd52_8x

	vmovdqu64	0(%rsi),%ymm26
	vmovdqu64	32(%rsi),%ymm27
	leaq	64(%rsi),%rsi

	vpunpcklqdq	%ymm27,%ymm26,%ymm25
	vpunpckhqdq	%ymm27,%ymm26,%ymm27



	vpsrlq	$24,%ymm27,%ymm26
	vporq	%ymm31,%ymm26,%ymm26
	vpaddq	%ymm26,%ymm2,%ymm2
	vpandq	%ymm28,%ymm25,%ymm24
	vpsrlq	$44,%ymm25,%ymm25
	vpsllq	$20,%ymm27,%ymm27
	vporq	%ymm27,%ymm25,%ymm25
	vpandq	%ymm28,%ymm25,%ymm25

	subq	$4,%rdx
	jz	.Ltail_vpmadd52_4x
	jmp	.Loop_vpmadd52_4x
	ud2

.align	32
.Linit_vpmadd52:
	vmovq	24(%rdi),%xmm16
	vmovq	56(%rdi),%xmm2
	vmovq	32(%rdi),%xmm17
	vmovq	40(%rdi),%xmm3
	vmovq	48(%rdi),%xmm4

	vmovdqa	%ymm3,%ymm0
	vmovdqa	%ymm4,%ymm1
	vmovdqa	%ymm2,%ymm5

	movl	$2,%eax

.Lmul_init_vpmadd52:
	vpxorq	%ymm18,%ymm18,%ymm18
	vpmadd52luq	%ymm2,%ymm16,%ymm18
	vpxorq	%ymm19,%ymm19,%ymm19
	vpmadd52huq	%ymm2,%ymm16,%ymm19
	vpxorq	%ymm20,%ymm20,%ymm20
	vpmadd52luq	%ymm2,%ymm17,%ymm20
	vpxorq	%ymm21,%ymm21,%ymm21
	vpmadd52huq	%ymm2,%ymm17,%ymm21
	vpxorq	%ymm22,%ymm22,%ymm22
	vpmadd52luq	%ymm2,%ymm3,%ymm22
	vpxorq	%ymm23,%ymm23,%ymm23
	vpmadd52huq	%ymm2,%ymm3,%ymm23

	vpmadd52luq	%ymm0,%ymm3,%ymm18
	vpmadd52huq	%ymm0,%ymm3,%ymm19
	vpmadd52luq	%ymm0,%ymm4,%ymm20
	vpmadd52huq	%ymm0,%ymm4,%ymm21
	vpmadd52luq	%ymm0,%ymm5,%ymm22
	vpmadd52huq	%ymm0,%ymm5,%ymm23

	vpmadd52luq	%ymm1,%ymm17,%ymm18
	vpmadd52huq	%ymm1,%ymm17,%ymm19
	vpmadd52luq	%ymm1,%ymm3,%ymm20
	vpmadd52huq	%ymm1,%ymm3,%ymm21
	vpmadd52luq	%ymm1,%ymm4,%ymm22
	vpmadd52huq	%ymm1,%ymm4,%ymm23



	vpsrlq	$44,%ymm18,%ymm30
	vpsllq	$8,%ymm19,%ymm19
	vpandq	%ymm28,%ymm18,%ymm0
	vpaddq	%ymm30,%ymm19,%ymm19

	vpaddq	%ymm19,%ymm20,%ymm20

	vpsrlq	$44,%ymm20,%ymm30
	vpsllq	$8,%ymm21,%ymm21
	vpandq	%ymm28,%ymm20,%ymm1
	vpaddq	%ymm30,%ymm21,%ymm21

	vpaddq	%ymm21,%ymm22,%ymm22

	vpsrlq	$42,%ymm22,%ymm30
	vpsllq	$10,%ymm23,%ymm23
	vpandq	%ymm29,%ymm22,%ymm2
	vpaddq	%ymm30,%ymm23,%ymm23

	vpaddq	%ymm23,%ymm0,%ymm0
	vpsllq	$2,%ymm23,%ymm23

	vpaddq	%ymm23,%ymm0,%ymm0

	vpsrlq	$44,%ymm0,%ymm30
	vpandq	%ymm28,%ymm0,%ymm0

	vpaddq	%ymm30,%ymm1,%ymm1

	decl	%eax
	jz	.Ldone_init_vpmadd52

	vpunpcklqdq	%ymm4,%ymm1,%ymm4
	vpbroadcastq	%xmm1,%xmm1
	vpunpcklqdq	%ymm5,%ymm2,%ymm5
	vpbroadcastq	%xmm2,%xmm2
	vpunpcklqdq	%ymm3,%ymm0,%ymm3
	vpbroadcastq	%xmm0,%xmm0

	vpsllq	$2,%ymm4,%ymm16
	vpsllq	$2,%ymm5,%ymm17
	vpaddq	%ymm4,%ymm16,%ymm16
	vpaddq	%ymm5,%ymm17,%ymm17
	vpsllq	$2,%ymm16,%ymm16
	vpsllq	$2,%ymm17,%ymm17

	jmp	.Lmul_init_vpmadd52
	ud2

.align	32
.Ldone_init_vpmadd52:
	vinserti128	$1,%xmm4,%ymm1,%ymm4
	vinserti128	$1,%xmm5,%ymm2,%ymm5
	vinserti128	$1,%xmm3,%ymm0,%ymm3

	vpermq	$216,%ymm4,%ymm4
	vpermq	$216,%ymm5,%ymm5
	vpermq	$216,%ymm3,%ymm3

	vpsllq	$2,%ymm4,%ymm16
	vpaddq	%ymm4,%ymm16,%ymm16
	vpsllq	$2,%ymm16,%ymm16

	vmovq	0(%rdi),%xmm0
	vmovq	8(%rdi),%xmm1
	vmovq	16(%rdi),%xmm2

	testq	$3,%rdx
	jnz	.Ldone_init_vpmadd52_2x

	vmovdqu64	%ymm3,64(%rdi)
	vpbroadcastq	%xmm3,%ymm3
	vmovdqu64	%ymm4,96(%rdi)
	vpbroadcastq	%xmm4,%ymm4
	vmovdqu64	%ymm5,128(%rdi)
	vpbroadcastq	%xmm5,%ymm5
	vmovdqu64	%ymm16,160(%rdi)
	vpbroadcastq	%xmm16,%ymm16

	jmp	.Lblocks_vpmadd52_4x_key_loaded
	ud2

.align	32
.Ldone_init_vpmadd52_2x:
	vmovdqu64	%ymm3,64(%rdi)
	vpsrldq	$8,%ymm3,%ymm3
	vmovdqu64	%ymm4,96(%rdi)
	vpsrldq	$8,%ymm4,%ymm4
	vmovdqu64	%ymm5,128(%rdi)
	vpsrldq	$8,%ymm5,%ymm5
	vmovdqu64	%ymm16,160(%rdi)
	vpsrldq	$8,%ymm16,%ymm16
	jmp	.Lblocks_vpmadd52_2x_key_loaded
	ud2

.align	32
.Lblocks_vpmadd52_2x_do:
	vmovdqu64	128+8(%rdi),%ymm5{%k1}{z}
	vmovdqu64	160+8(%rdi),%ymm16{%k1}{z}
	vmovdqu64	64+8(%rdi),%ymm3{%k1}{z}
	vmovdqu64	96+8(%rdi),%ymm4{%k1}{z}

.Lblocks_vpmadd52_2x_key_loaded:
	vmovdqu64	0(%rsi),%ymm26
	vpxorq	%ymm27,%ymm27,%ymm27
	leaq	32(%rsi),%rsi

	vpunpcklqdq	%ymm27,%ymm26,%ymm25
	vpunpckhqdq	%ymm27,%ymm26,%ymm27



	vpsrlq	$24,%ymm27,%ymm26
	vporq	%ymm31,%ymm26,%ymm26
	vpaddq	%ymm26,%ymm2,%ymm2
	vpandq	%ymm28,%ymm25,%ymm24
	vpsrlq	$44,%ymm25,%ymm25
	vpsllq	$20,%ymm27,%ymm27
	vporq	%ymm27,%ymm25,%ymm25
	vpandq	%ymm28,%ymm25,%ymm25

	jmp	.Ltail_vpmadd52_2x
	ud2

.align	32
.Loop_vpmadd52_4x:

	vpaddq	%ymm24,%ymm0,%ymm0
	vpaddq	%ymm25,%ymm1,%ymm1

	vpxorq	%ymm18,%ymm18,%ymm18
	vpmadd52luq	%ymm2,%ymm16,%ymm18
	vpxorq	%ymm19,%ymm19,%ymm19
	vpmadd52huq	%ymm2,%ymm16,%ymm19
	vpxorq	%ymm20,%ymm20,%ymm20
	vpmadd52luq	%ymm2,%ymm17,%ymm20
	vpxorq	%ymm21,%ymm21,%ymm21
	vpmadd52huq	%ymm2,%ymm17,%ymm21
	vpxorq	%ymm22,%ymm22,%ymm22
	vpmadd52luq	%ymm2,%ymm3,%ymm22
	vpxorq	%ymm23,%ymm23,%ymm23
	vpmadd52huq	%ymm2,%ymm3,%ymm23

	vmovdqu64	0(%rsi),%ymm26
	vmovdqu64	32(%rsi),%ymm27
	leaq	64(%rsi),%rsi
	vpmadd52luq	%ymm0,%ymm3,%ymm18
	vpmadd52huq	%ymm0,%ymm3,%ymm19
	vpmadd52luq	%ymm0,%ymm4,%ymm20
	vpmadd52huq	%ymm0,%ymm4,%ymm21
	vpmadd52luq	%ymm0,%ymm5,%ymm22
	vpmadd52huq	%ymm0,%ymm5,%ymm23

	vpunpcklqdq	%ymm27,%ymm26,%ymm25
	vpunpckhqdq	%ymm27,%ymm26,%ymm27
	vpmadd52luq	%ymm1,%ymm17,%ymm18
	vpmadd52huq	%ymm1,%ymm17,%ymm19
	vpmadd52luq	%ymm1,%ymm3,%ymm20
	vpmadd52huq	%ymm1,%ymm3,%ymm21
	vpmadd52luq	%ymm1,%ymm4,%ymm22
	vpmadd52huq	%ymm1,%ymm4,%ymm23



	vpsrlq	$44,%ymm18,%ymm30
	vpsllq	$8,%ymm19,%ymm19
	vpandq	%ymm28,%ymm18,%ymm0
	vpaddq	%ymm30,%ymm19,%ymm19

	vpsrlq	$24,%ymm27,%ymm26
	vporq	%ymm31,%ymm26,%ymm26
	vpaddq	%ymm19,%ymm20,%ymm20

	vpsrlq	$44,%ymm20,%ymm30
	vpsllq	$8,%ymm21,%ymm21
	vpandq	%ymm28,%ymm20,%ymm1
	vpaddq	%ymm30,%ymm21,%ymm21

	vpandq	%ymm28,%ymm25,%ymm24
	vpsrlq	$44,%ymm25,%ymm25
	vpsllq	$20,%ymm27,%ymm27
	vpaddq	%ymm21,%ymm22,%ymm22

	vpsrlq	$42,%ymm22,%ymm30
	vpsllq	$10,%ymm23,%ymm23
	vpandq	%ymm29,%ymm22,%ymm2
	vpaddq	%ymm30,%ymm23,%ymm23

	vpaddq	%ymm26,%ymm2,%ymm2
	vpaddq	%ymm23,%ymm0,%ymm0
	vpsllq	$2,%ymm23,%ymm23

	vpaddq	%ymm23,%ymm0,%ymm0
	vporq	%ymm27,%ymm25,%ymm25
	vpandq	%ymm28,%ymm25,%ymm25

	vpsrlq	$44,%ymm0,%ymm30
	vpandq	%ymm28,%ymm0,%ymm0

	vpaddq	%ymm30,%ymm1,%ymm1

	subq	$4,%rdx
	jnz	.Loop_vpmadd52_4x

.Ltail_vpmadd52_4x:
	vmovdqu64	128(%rdi),%ymm5
	vmovdqu64	160(%rdi),%ymm16
	vmovdqu64	64(%rdi),%ymm3
	vmovdqu64	96(%rdi),%ymm4

.Ltail_vpmadd52_2x:
	vpsllq	$2,%ymm5,%ymm17
	vpaddq	%ymm5,%ymm17,%ymm17
	vpsllq	$2,%ymm17,%ymm17


	vpaddq	%ymm24,%ymm0,%ymm0
	vpaddq	%ymm25,%ymm1,%ymm1

	vpxorq	%ymm18,%ymm18,%ymm18
	vpmadd52luq	%ymm2,%ymm16,%ymm18
	vpxorq	%ymm19,%ymm19,%ymm19
	vpmadd52huq	%ymm2,%ymm16,%ymm19
	vpxorq	%ymm20,%ymm20,%ymm20
	vpmadd52luq	%ymm2,%ymm17,%ymm20
	vpxorq	%ymm21,%ymm21,%ymm21
	vpmadd52huq	%ymm2,%ymm17,%ymm21
	vpxorq	%ymm22,%ymm22,%ymm22
	vpmadd52luq	%ymm2,%ymm3,%ymm22
	vpxorq	%ymm23,%ymm23,%ymm23
	vpmadd52huq	%ymm2,%ymm3,%ymm23

	vpmadd52luq	%ymm0,%ymm3,%ymm18
	vpmadd52huq	%ymm0,%ymm3,%ymm19
	vpmadd52luq	%ymm0,%ymm4,%ymm20
	vpmadd52huq	%ymm0,%ymm4,%ymm21
	vpmadd52luq	%ymm0,%ymm5,%ymm22
	vpmadd52huq	%ymm0,%ymm5,%ymm23

	vpmadd52luq	%ymm1,%ymm17,%ymm18
	vpmadd52huq	%ymm1,%ymm17,%ymm19
	vpmadd52luq	%ymm1,%ymm3,%ymm20
	vpmadd52huq	%ymm1,%ymm3,%ymm21
	vpmadd52luq	%ymm1,%ymm4,%ymm22
	vpmadd52huq	%ymm1,%ymm4,%ymm23




	movl	$1,%eax
	kmovw	%eax,%k1
	vpsrldq	$8,%ymm18,%ymm24
	vpsrldq	$8,%ymm19,%ymm0
	vpsrldq	$8,%ymm20,%ymm25
	vpsrldq	$8,%ymm21,%ymm1
	vpaddq	%ymm24,%ymm18,%ymm18
	vpaddq	%ymm0,%ymm19,%ymm19
	vpsrldq	$8,%ymm22,%ymm26
	vpsrldq	$8,%ymm23,%ymm2
	vpaddq	%ymm25,%ymm20,%ymm20
	vpaddq	%ymm1,%ymm21,%ymm21
	vpermq	$0x2,%ymm18,%ymm24
	vpermq	$0x2,%ymm19,%ymm0
	vpaddq	%ymm26,%ymm22,%ymm22
	vpaddq	%ymm2,%ymm23,%ymm23

	vpermq	$0x2,%ymm20,%ymm25
	vpermq	$0x2,%ymm21,%ymm1
	vpaddq	%ymm24,%ymm18,%ymm18{%k1}{z}
	vpaddq	%ymm0,%ymm19,%ymm19{%k1}{z}
	vpermq	$0x2,%ymm22,%ymm26
	vpermq	$0x2,%ymm23,%ymm2
	vpaddq	%ymm25,%ymm20,%ymm20{%k1}{z}
	vpaddq	%ymm1,%ymm21,%ymm21{%k1}{z}
	vpaddq	%ymm26,%ymm22,%ymm22{%k1}{z}
	vpaddq	%ymm2,%ymm23,%ymm23{%k1}{z}



	vpsrlq	$44,%ymm18,%ymm30
	vpsllq	$8,%ymm19,%ymm19
	vpandq	%ymm28,%ymm18,%ymm0
	vpaddq	%ymm30,%ymm19,%ymm19

	vpaddq	%ymm19,%ymm20,%ymm20

	vpsrlq	$44,%ymm20,%ymm30
	vpsllq	$8,%ymm21,%ymm21
	vpandq	%ymm28,%ymm20,%ymm1
	vpaddq	%ymm30,%ymm21,%ymm21

	vpaddq	%ymm21,%ymm22,%ymm22

	vpsrlq	$42,%ymm22,%ymm30
	vpsllq	$10,%ymm23,%ymm23
	vpandq	%ymm29,%ymm22,%ymm2
	vpaddq	%ymm30,%ymm23,%ymm23

	vpaddq	%ymm23,%ymm0,%ymm0
	vpsllq	$2,%ymm23,%ymm23

	vpaddq	%ymm23,%ymm0,%ymm0

	vpsrlq	$44,%ymm0,%ymm30
	vpandq	%ymm28,%ymm0,%ymm0

	vpaddq	%ymm30,%ymm1,%ymm1


	subq	$2,%rdx
	ja	.Lblocks_vpmadd52_4x_do

	vmovq	%xmm0,0(%rdi)
	vmovq	%xmm1,8(%rdi)
	vmovq	%xmm2,16(%rdi)
	vzeroall

.Lno_data_vpmadd52_4x:
	.byte	0xf3,0xc3
.cfi_endproc
.size	poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x
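/*
 * The carry pass repeated throughout the 4x code normalizes a column
 * that is held as a (lo,hi) pair of 52-bit halves; because the limbs
 * are 44/44/42 bits wide, hi must be rescaled by 2^8 (2^10 for the
 * 42-bit column) before it joins the carry-out of lo.  In scalar form
 * (a sketch only; names are ours):
 *
 *	#include <stdint.h>
 *
 *	static void reduce_base2_44(uint64_t lo[3], uint64_t hi[3],
 *	    uint64_t h[3])
 *	{
 *		const uint64_t mask44 = ((uint64_t)1 << 44) - 1;
 *		const uint64_t mask42 = ((uint64_t)1 << 42) - 1;
 *		uint64_t c;
 *		c = (lo[0] >> 44) + (hi[0] << 8);  h[0] = lo[0] & mask44;
 *		lo[1] += c;
 *		c = (lo[1] >> 44) + (hi[1] << 8);  h[1] = lo[1] & mask44;
 *		lo[2] += c;
 *		c = (lo[2] >> 42) + (hi[2] << 10); h[2] = lo[2] & mask42;
 *		h[0] += c * 5;		// fold 2^130 back in: 2^130 == 5
 *		c = h[0] >> 44;  h[0] &= mask44;
 *		h[1] += c;		// h is nearly normalized again
 *	}
 */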
.type	poly1305_blocks_vpmadd52_8x,@function
.align	32
poly1305_blocks_vpmadd52_8x:
.cfi_startproc
	shrq	$4,%rdx
	jz	.Lno_data_vpmadd52_8x

	shlq	$40,%rcx
	movq	64(%rdi),%r8

	vmovdqa64	.Lx_mask44(%rip),%ymm28
	vmovdqa64	.Lx_mask42(%rip),%ymm29

	testq	%r8,%r8
	js	.Linit_vpmadd52

	vmovq	0(%rdi),%xmm0
	vmovq	8(%rdi),%xmm1
	vmovq	16(%rdi),%xmm2

.Lblocks_vpmadd52_8x:



	vmovdqu64	128(%rdi),%ymm5
	vmovdqu64	160(%rdi),%ymm16
	vmovdqu64	64(%rdi),%ymm3
	vmovdqu64	96(%rdi),%ymm4

	vpsllq	$2,%ymm5,%ymm17
	vpaddq	%ymm5,%ymm17,%ymm17
	vpsllq	$2,%ymm17,%ymm17

	vpbroadcastq	%xmm5,%ymm8
	vpbroadcastq	%xmm3,%ymm6
	vpbroadcastq	%xmm4,%ymm7

	vpxorq	%ymm18,%ymm18,%ymm18
	vpmadd52luq	%ymm8,%ymm16,%ymm18
	vpxorq	%ymm19,%ymm19,%ymm19
	vpmadd52huq	%ymm8,%ymm16,%ymm19
	vpxorq	%ymm20,%ymm20,%ymm20
	vpmadd52luq	%ymm8,%ymm17,%ymm20
	vpxorq	%ymm21,%ymm21,%ymm21
	vpmadd52huq	%ymm8,%ymm17,%ymm21
	vpxorq	%ymm22,%ymm22,%ymm22
	vpmadd52luq	%ymm8,%ymm3,%ymm22
	vpxorq	%ymm23,%ymm23,%ymm23
	vpmadd52huq	%ymm8,%ymm3,%ymm23

	vpmadd52luq	%ymm6,%ymm3,%ymm18
	vpmadd52huq	%ymm6,%ymm3,%ymm19
	vpmadd52luq	%ymm6,%ymm4,%ymm20
	vpmadd52huq	%ymm6,%ymm4,%ymm21
	vpmadd52luq	%ymm6,%ymm5,%ymm22
	vpmadd52huq	%ymm6,%ymm5,%ymm23

	vpmadd52luq	%ymm7,%ymm17,%ymm18
	vpmadd52huq	%ymm7,%ymm17,%ymm19
	vpmadd52luq	%ymm7,%ymm3,%ymm20
	vpmadd52huq	%ymm7,%ymm3,%ymm21
	vpmadd52luq	%ymm7,%ymm4,%ymm22
	vpmadd52huq	%ymm7,%ymm4,%ymm23



	vpsrlq	$44,%ymm18,%ymm30
	vpsllq	$8,%ymm19,%ymm19
	vpandq	%ymm28,%ymm18,%ymm6
	vpaddq	%ymm30,%ymm19,%ymm19

	vpaddq	%ymm19,%ymm20,%ymm20

	vpsrlq	$44,%ymm20,%ymm30
	vpsllq	$8,%ymm21,%ymm21
	vpandq	%ymm28,%ymm20,%ymm7
	vpaddq	%ymm30,%ymm21,%ymm21

	vpaddq	%ymm21,%ymm22,%ymm22

	vpsrlq	$42,%ymm22,%ymm30
	vpsllq	$10,%ymm23,%ymm23
	vpandq	%ymm29,%ymm22,%ymm8
	vpaddq	%ymm30,%ymm23,%ymm23

	vpaddq	%ymm23,%ymm6,%ymm6
	vpsllq	$2,%ymm23,%ymm23

	vpaddq	%ymm23,%ymm6,%ymm6

	vpsrlq	$44,%ymm6,%ymm30
	vpandq	%ymm28,%ymm6,%ymm6

	vpaddq	%ymm30,%ymm7,%ymm7





	vpunpcklqdq	%ymm5,%ymm8,%ymm26
	vpunpckhqdq	%ymm5,%ymm8,%ymm5
	vpunpcklqdq	%ymm3,%ymm6,%ymm24
	vpunpckhqdq	%ymm3,%ymm6,%ymm3
	vpunpcklqdq	%ymm4,%ymm7,%ymm25
	vpunpckhqdq	%ymm4,%ymm7,%ymm4
	vshufi64x2	$0x44,%zmm5,%zmm26,%zmm8
	vshufi64x2	$0x44,%zmm3,%zmm24,%zmm6
	vshufi64x2	$0x44,%zmm4,%zmm25,%zmm7

	vmovdqu64	0(%rsi),%zmm26
	vmovdqu64	64(%rsi),%zmm27
	leaq	128(%rsi),%rsi

	vpsllq	$2,%zmm8,%zmm10
	vpsllq	$2,%zmm7,%zmm9
	vpaddq	%zmm8,%zmm10,%zmm10
	vpaddq	%zmm7,%zmm9,%zmm9
	vpsllq	$2,%zmm10,%zmm10
	vpsllq	$2,%zmm9,%zmm9

	vpbroadcastq	%rcx,%zmm31
	vpbroadcastq	%xmm28,%zmm28
	vpbroadcastq	%xmm29,%zmm29

	vpbroadcastq	%xmm9,%zmm16
	vpbroadcastq	%xmm10,%zmm17
	vpbroadcastq	%xmm6,%zmm3
	vpbroadcastq	%xmm7,%zmm4
	vpbroadcastq	%xmm8,%zmm5

	vpunpcklqdq	%zmm27,%zmm26,%zmm25
	vpunpckhqdq	%zmm27,%zmm26,%zmm27



	vpsrlq	$24,%zmm27,%zmm26
	vporq	%zmm31,%zmm26,%zmm26
	vpaddq	%zmm26,%zmm2,%zmm2
	vpandq	%zmm28,%zmm25,%zmm24
	vpsrlq	$44,%zmm25,%zmm25
	vpsllq	$20,%zmm27,%zmm27
	vporq	%zmm27,%zmm25,%zmm25
	vpandq	%zmm28,%zmm25,%zmm25

	subq	$8,%rdx
	jz	.Ltail_vpmadd52_8x
	jmp	.Loop_vpmadd52_8x

.align	32
.Loop_vpmadd52_8x:

	vpaddq	%zmm24,%zmm0,%zmm0
	vpaddq	%zmm25,%zmm1,%zmm1

	vpxorq	%zmm18,%zmm18,%zmm18
	vpmadd52luq	%zmm2,%zmm16,%zmm18
	vpxorq	%zmm19,%zmm19,%zmm19
	vpmadd52huq	%zmm2,%zmm16,%zmm19
	vpxorq	%zmm20,%zmm20,%zmm20
	vpmadd52luq	%zmm2,%zmm17,%zmm20
	vpxorq	%zmm21,%zmm21,%zmm21
	vpmadd52huq	%zmm2,%zmm17,%zmm21
	vpxorq	%zmm22,%zmm22,%zmm22
	vpmadd52luq	%zmm2,%zmm3,%zmm22
	vpxorq	%zmm23,%zmm23,%zmm23
	vpmadd52huq	%zmm2,%zmm3,%zmm23

	vmovdqu64	0(%rsi),%zmm26
	vmovdqu64	64(%rsi),%zmm27
	leaq	128(%rsi),%rsi
	vpmadd52luq	%zmm0,%zmm3,%zmm18
	vpmadd52huq	%zmm0,%zmm3,%zmm19
	vpmadd52luq	%zmm0,%zmm4,%zmm20
	vpmadd52huq	%zmm0,%zmm4,%zmm21
	vpmadd52luq	%zmm0,%zmm5,%zmm22
	vpmadd52huq	%zmm0,%zmm5,%zmm23

	vpunpcklqdq	%zmm27,%zmm26,%zmm25
	vpunpckhqdq	%zmm27,%zmm26,%zmm27
	vpmadd52luq	%zmm1,%zmm17,%zmm18
	vpmadd52huq	%zmm1,%zmm17,%zmm19
	vpmadd52luq	%zmm1,%zmm3,%zmm20
	vpmadd52huq	%zmm1,%zmm3,%zmm21
	vpmadd52luq	%zmm1,%zmm4,%zmm22
	vpmadd52huq	%zmm1,%zmm4,%zmm23



	vpsrlq	$44,%zmm18,%zmm30
	vpsllq	$8,%zmm19,%zmm19
	vpandq	%zmm28,%zmm18,%zmm0
	vpaddq	%zmm30,%zmm19,%zmm19

	vpsrlq	$24,%zmm27,%zmm26
	vporq	%zmm31,%zmm26,%zmm26
	vpaddq	%zmm19,%zmm20,%zmm20

	vpsrlq	$44,%zmm20,%zmm30
	vpsllq	$8,%zmm21,%zmm21
	vpandq	%zmm28,%zmm20,%zmm1
	vpaddq	%zmm30,%zmm21,%zmm21

	vpandq	%zmm28,%zmm25,%zmm24
	vpsrlq	$44,%zmm25,%zmm25
	vpsllq	$20,%zmm27,%zmm27
	vpaddq	%zmm21,%zmm22,%zmm22

	vpsrlq	$42,%zmm22,%zmm30
	vpsllq	$10,%zmm23,%zmm23
	vpandq	%zmm29,%zmm22,%zmm2
	vpaddq	%zmm30,%zmm23,%zmm23

	vpaddq	%zmm26,%zmm2,%zmm2
	vpaddq	%zmm23,%zmm0,%zmm0
	vpsllq	$2,%zmm23,%zmm23

	vpaddq	%zmm23,%zmm0,%zmm0
	vporq	%zmm27,%zmm25,%zmm25
	vpandq	%zmm28,%zmm25,%zmm25

	vpsrlq	$44,%zmm0,%zmm30
	vpandq	%zmm28,%zmm0,%zmm0

	vpaddq	%zmm30,%zmm1,%zmm1

	subq	$8,%rdx
	jnz	.Loop_vpmadd52_8x

.Ltail_vpmadd52_8x:

	vpaddq	%zmm24,%zmm0,%zmm0
	vpaddq	%zmm25,%zmm1,%zmm1

	vpxorq	%zmm18,%zmm18,%zmm18
	vpmadd52luq	%zmm2,%zmm9,%zmm18
	vpxorq	%zmm19,%zmm19,%zmm19
	vpmadd52huq	%zmm2,%zmm9,%zmm19
	vpxorq	%zmm20,%zmm20,%zmm20
	vpmadd52luq	%zmm2,%zmm10,%zmm20
	vpxorq	%zmm21,%zmm21,%zmm21
	vpmadd52huq	%zmm2,%zmm10,%zmm21
	vpxorq	%zmm22,%zmm22,%zmm22
	vpmadd52luq	%zmm2,%zmm6,%zmm22
	vpxorq	%zmm23,%zmm23,%zmm23
	vpmadd52huq	%zmm2,%zmm6,%zmm23

	vpmadd52luq	%zmm0,%zmm6,%zmm18
	vpmadd52huq	%zmm0,%zmm6,%zmm19
	vpmadd52luq	%zmm0,%zmm7,%zmm20
	vpmadd52huq	%zmm0,%zmm7,%zmm21
	vpmadd52luq	%zmm0,%zmm8,%zmm22
	vpmadd52huq	%zmm0,%zmm8,%zmm23

	vpmadd52luq	%zmm1,%zmm10,%zmm18
	vpmadd52huq	%zmm1,%zmm10,%zmm19
	vpmadd52luq	%zmm1,%zmm6,%zmm20
	vpmadd52huq	%zmm1,%zmm6,%zmm21
	vpmadd52luq	%zmm1,%zmm7,%zmm22
	vpmadd52huq	%zmm1,%zmm7,%zmm23




	movl	$1,%eax
	kmovw	%eax,%k1
	vpsrldq	$8,%zmm18,%zmm24
	vpsrldq	$8,%zmm19,%zmm0
	vpsrldq	$8,%zmm20,%zmm25
	vpsrldq	$8,%zmm21,%zmm1
	vpaddq	%zmm24,%zmm18,%zmm18
	vpaddq	%zmm0,%zmm19,%zmm19
	vpsrldq	$8,%zmm22,%zmm26
	vpsrldq	$8,%zmm23,%zmm2
	vpaddq	%zmm25,%zmm20,%zmm20
	vpaddq	%zmm1,%zmm21,%zmm21
	vpermq	$0x2,%zmm18,%zmm24
	vpermq	$0x2,%zmm19,%zmm0
	vpaddq	%zmm26,%zmm22,%zmm22
	vpaddq	%zmm2,%zmm23,%zmm23

	vpermq	$0x2,%zmm20,%zmm25
	vpermq	$0x2,%zmm21,%zmm1
	vpaddq	%zmm24,%zmm18,%zmm18
	vpaddq	%zmm0,%zmm19,%zmm19
	vpermq	$0x2,%zmm22,%zmm26
	vpermq	$0x2,%zmm23,%zmm2
	vpaddq	%zmm25,%zmm20,%zmm20
	vpaddq	%zmm1,%zmm21,%zmm21
	vextracti64x4	$1,%zmm18,%ymm24
	vextracti64x4	$1,%zmm19,%ymm0
	vpaddq	%zmm26,%zmm22,%zmm22
	vpaddq	%zmm2,%zmm23,%zmm23

	vextracti64x4	$1,%zmm20,%ymm25
	vextracti64x4	$1,%zmm21,%ymm1
	vextracti64x4	$1,%zmm22,%ymm26
	vextracti64x4	$1,%zmm23,%ymm2
	vpaddq	%ymm24,%ymm18,%ymm18{%k1}{z}
	vpaddq	%ymm0,%ymm19,%ymm19{%k1}{z}
	vpaddq	%ymm25,%ymm20,%ymm20{%k1}{z}
	vpaddq	%ymm1,%ymm21,%ymm21{%k1}{z}
	vpaddq	%ymm26,%ymm22,%ymm22{%k1}{z}
	vpaddq	%ymm2,%ymm23,%ymm23{%k1}{z}



	vpsrlq	$44,%ymm18,%ymm30
	vpsllq	$8,%ymm19,%ymm19
	vpandq	%ymm28,%ymm18,%ymm0
	vpaddq	%ymm30,%ymm19,%ymm19

	vpaddq	%ymm19,%ymm20,%ymm20

	vpsrlq	$44,%ymm20,%ymm30
	vpsllq	$8,%ymm21,%ymm21
	vpandq	%ymm28,%ymm20,%ymm1
	vpaddq	%ymm30,%ymm21,%ymm21

	vpaddq	%ymm21,%ymm22,%ymm22

	vpsrlq	$42,%ymm22,%ymm30
	vpsllq	$10,%ymm23,%ymm23
	vpandq	%ymm29,%ymm22,%ymm2
	vpaddq	%ymm30,%ymm23,%ymm23

	vpaddq	%ymm23,%ymm0,%ymm0
	vpsllq	$2,%ymm23,%ymm23

	vpaddq	%ymm23,%ymm0,%ymm0

	vpsrlq	$44,%ymm0,%ymm30
	vpandq	%ymm28,%ymm0,%ymm0

	vpaddq	%ymm30,%ymm1,%ymm1



	vmovq	%xmm0,0(%rdi)
	vmovq	%xmm1,8(%rdi)
	vmovq	%xmm2,16(%rdi)
	vzeroall

.Lno_data_vpmadd52_8x:
	.byte	0xf3,0xc3
.cfi_endproc
.size	poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x
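/*
 * The 8x path is the same Horner evaluation taken with stride 8: the
 * prologue above extends the r^1..r^4 table to r^5..r^8, the main loop
 * multiplies all eight lanes by r^8, and the tail weights lane i by
 * r^(8-i) before the lanes are summed.  Schematically (fe_mul/fe_add
 * are hypothetical field helpers, not part of this file):
 *
 *	// for each group of eight blocks k:
 *	//	lane[i] = fe_mul(fe_add(lane[i], m[8*k + i]), r8);
 *	// for the last group, then combine:
 *	//	lane[i] = fe_mul(fe_add(lane[i], m_last[i]), r_pow[8 - i]);
 *	//	h = lane[0] + lane[1] + ... + lane[7];
 */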
.type	poly1305_emit_base2_44,@function
.align	32
poly1305_emit_base2_44:
.cfi_startproc
	movq	0(%rdi),%r8
	movq	8(%rdi),%r9
	movq	16(%rdi),%r10

	movq	%r9,%rax
	shrq	$20,%r9
	shlq	$44,%rax
	movq	%r10,%rcx
	shrq	$40,%r10
	shlq	$24,%rcx

	addq	%rax,%r8
	adcq	%rcx,%r9
	adcq	$0,%r10

	movq	%r8,%rax
	addq	$5,%r8
	movq	%r9,%rcx
	adcq	$0,%r9
	adcq	$0,%r10
	shrq	$2,%r10
	cmovnzq	%r8,%rax
	cmovnzq	%r9,%rcx

	addq	0(%rdx),%rax
	adcq	8(%rdx),%rcx
	movq	%rax,0(%rsi)
	movq	%rcx,8(%rsi)

	.byte	0xf3,0xc3
.cfi_endproc
.size	poly1305_emit_base2_44,.-poly1305_emit_base2_44
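/*
 * poly1305_emit_base2_44 first repacks the 44/44/42-bit limbs into two
 * 64-bit words plus the bits above 2^128, then reduces modulo
 * p = 2^130 - 5 with the standard trick: if h + 5 carries into bit 130
 * then h >= p, and h + 5 (taken mod 2^128) already equals h - p.  A C
 * sketch of the final step (our names; nonce is the s half of the
 * key):
 *
 *	#include <stdint.h>
 *
 *	static void emit(uint64_t h0, uint64_t h1, uint64_t h2,
 *	    const uint64_t nonce[2], uint64_t mac[2])
 *	{
 *		unsigned __int128 t;
 *		uint64_t g0, g1, g2;
 *		t = (unsigned __int128)h0 + 5;
 *		g0 = (uint64_t)t;
 *		t = (unsigned __int128)h1 + (uint64_t)(t >> 64);
 *		g1 = (uint64_t)t;
 *		g2 = h2 + (uint64_t)(t >> 64);
 *		if (g2 >> 2) {		// bit 130 set, i.e. h >= p
 *			h0 = g0;
 *			h1 = g1;
 *		}
 *		t = (unsigned __int128)h0 + nonce[0];
 *		mac[0] = (uint64_t)t;	// little-endian tag words
 *		mac[1] = h1 + nonce[1] + (uint64_t)(t >> 64);
 *	}
 */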
.align	64
.Lconst:
.Lmask24:
.long	0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
.L129:
.long	16777216,0,16777216,0,16777216,0,16777216,0
.Lmask26:
.long	0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
.Lpermd_avx2:
.long	2,2,2,3,2,0,2,1
.Lpermd_avx512:
.long	0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7

.L2_44_inp_permd:
.long	0,1,1,2,2,3,7,7
.L2_44_inp_shift:
.quad	0,12,24,64
.L2_44_mask:
.quad	0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
.L2_44_shift_rgt:
.quad	44,44,42,64
.L2_44_shift_lft:
.quad	8,8,10,64

.align	64
.Lx_mask44:
.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.Lx_mask42:
.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	16
.globl	xor128_encrypt_n_pad
.type	xor128_encrypt_n_pad,@function
.align	16
xor128_encrypt_n_pad:
.cfi_startproc
	subq	%rdx,%rsi
	subq	%rdx,%rdi
	movq	%rcx,%r10
	shrq	$4,%rcx
	jz	.Ltail_enc
	nop
.Loop_enc_xmm:
	movdqu	(%rsi,%rdx,1),%xmm0
	pxor	(%rdx),%xmm0
	movdqu	%xmm0,(%rdi,%rdx,1)
	movdqa	%xmm0,(%rdx)
	leaq	16(%rdx),%rdx
	decq	%rcx
	jnz	.Loop_enc_xmm

	andq	$15,%r10
	jz	.Ldone_enc

.Ltail_enc:
	movq	$16,%rcx
	subq	%r10,%rcx
	xorl	%eax,%eax
.Loop_enc_byte:
	movb	(%rsi,%rdx,1),%al
	xorb	(%rdx),%al
	movb	%al,(%rdi,%rdx,1)
	movb	%al,(%rdx)
	leaq	1(%rdx),%rdx
	decq	%r10
	jnz	.Loop_enc_byte

	xorl	%eax,%eax
.Loop_enc_pad:
	movb	%al,(%rdx)
	leaq	1(%rdx),%rdx
	decq	%rcx
	jnz	.Loop_enc_pad

.Ldone_enc:
	movq	%rdx,%rax
	.byte	0xf3,0xc3
.cfi_endproc
.size	xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
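/*
 * xor128_encrypt_n_pad XORs len bytes of input against the key-stream
 * buffer passed in %rdx, writes the result to the output, keeps the
 * resulting ciphertext in that buffer, zero-pads a trailing partial
 * block out to 16 bytes, and returns the advanced buffer pointer.
 * Behaviourally (a C sketch of what the code above does; the prototype
 * is our reading of the register usage, not a declared API):
 *
 *	#include <stddef.h>
 *	#include <stdint.h>
 *
 *	static uint8_t *xor128_encrypt_n_pad(uint8_t *out,
 *	    const uint8_t *in, uint8_t *otp, size_t len)
 *	{
 *		size_t i;
 *		for (i = 0; i < len; i++) {
 *			otp[i] ^= in[i];	// ciphertext replaces the pad
 *			out[i] = otp[i];
 *		}
 *		for (; i % 16; i++)		// zero-pad the last block
 *			otp[i] = 0;
 *		return otp + i;
 *	}
 */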

.globl	xor128_decrypt_n_pad
.type	xor128_decrypt_n_pad,@function
.align	16
xor128_decrypt_n_pad:
.cfi_startproc
	subq	%rdx,%rsi
	subq	%rdx,%rdi
	movq	%rcx,%r10
	shrq	$4,%rcx
	jz	.Ltail_dec
	nop
.Loop_dec_xmm:
	movdqu	(%rsi,%rdx,1),%xmm0
	movdqa	(%rdx),%xmm1
	pxor	%xmm0,%xmm1
	movdqu	%xmm1,(%rdi,%rdx,1)
	movdqa	%xmm0,(%rdx)
	leaq	16(%rdx),%rdx
	decq	%rcx
	jnz	.Loop_dec_xmm

	pxor	%xmm1,%xmm1
	andq	$15,%r10
	jz	.Ldone_dec

.Ltail_dec:
	movq	$16,%rcx
	subq	%r10,%rcx
	xorl	%eax,%eax
	xorq	%r11,%r11
.Loop_dec_byte:
	movb	(%rsi,%rdx,1),%r11b
	movb	(%rdx),%al
	xorb	%r11b,%al
	movb	%al,(%rdi,%rdx,1)
	movb	%r11b,(%rdx)
	leaq	1(%rdx),%rdx
	decq	%r10
	jnz	.Loop_dec_byte

	xorl	%eax,%eax
.Loop_dec_pad:
	movb	%al,(%rdx)
	leaq	1(%rdx),%rdx
	decq	%rcx
	jnz	.Loop_dec_pad

.Ldone_dec:
	movq	%rdx,%rax
	.byte	0xf3,0xc3
.cfi_endproc
.size	xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
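/*
 * xor128_decrypt_n_pad mirrors the routine above, except that it is
 * the incoming ciphertext that is written back into the buffer, so in
 * both directions the buffer ends up holding (padded) ciphertext for a
 * subsequent Poly1305 pass.  The per-byte step, in the notation of the
 * sketch above:
 *
 *	c = in[i]; out[i] = otp[i] ^ c; otp[i] = c;
 */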