1#include <machine/asm.h>
2.text
3.align	64
/*
 * poly1305_init(arg0, arg1, arg2) -- i386, all arguments on the stack.
 *
 *   arg0  state block. Bytes 0..23 are zeroed; bytes 24..39 receive the
 *         four clamped key words.
 *   arg1  pointer to 16 key bytes, or NULL.
 *   arg2  pointer to two 32-bit slots that receive the addresses of the
 *         "blocks" and "emit" routines selected below.
 *
 * Returns %eax = 0 when arg1 is NULL (state is still zeroed), otherwise 1.
 * %ebp/%ebx/%esi/%edi are saved and restored; %ecx/%edx are clobbered.
 */
4.globl	poly1305_init
5.type	poly1305_init,@function
6.align	16
7poly1305_init:
8.L_poly1305_init_begin:
9	#ifdef __CET__
10
/* Bytes F3 0F 1E FB encode ENDBR32 (indirect-branch landing pad). */
11.byte	243,15,30,251
12	#endif
13
14	pushl	%ebp
15	pushl	%ebx
16	pushl	%esi
17	pushl	%edi
/* Four pushes + return address: arg0 is at 20(%esp), arg1 at 24, arg2 at 28. */
18	movl	20(%esp),%edi
19	movl	24(%esp),%esi
20	movl	28(%esp),%ebp
/* Zero the first six words of the state; %eax = 0 doubles as the
   return value for the NULL-key exit. */
21	xorl	%eax,%eax
22	movl	%eax,(%edi)
23	movl	%eax,4(%edi)
24	movl	%eax,8(%edi)
25	movl	%eax,12(%edi)
26	movl	%eax,16(%edi)
27	movl	%eax,20(%edi)
28	cmpl	$0,%esi
29	je	.L000nokey
/* call/pop yields the runtime address of .L001pic_point in %ebx so the
   symbol addresses below can be formed position-independently. */
30	call	.L001pic_point
31.L001pic_point:
32	popl	%ebx
/* Default to the scalar routines. */
33	leal	poly1305_blocks-.L001pic_point(%ebx),%eax
34	leal	poly1305_emit-.L001pic_point(%ebx),%edx
35	leal	OPENSSL_ia32cap_P-.L001pic_point(%ebx),%edi
/* 83886080 = 0x05000000: bits 24 and 26 of the first capability word must
   both be set to take the SSE2 routines.
   NOTE(review): presumably the FXSR and SSE2 CPUID bits -- confirm against
   the OPENSSL_ia32cap documentation. */
36	movl	(%edi),%ecx
37	andl	$83886080,%ecx
38	cmpl	$83886080,%ecx
39	jne	.L002no_sse2
40	leal	_poly1305_blocks_sse2-.L001pic_point(%ebx),%eax
41	leal	_poly1305_emit_sse2-.L001pic_point(%ebx),%edx
42.L002no_sse2:
/* %edi was reused for the capability pointer; reload the state pointer. */
43	movl	20(%esp),%edi
/* Publish the chosen entry points through arg2. */
44	movl	%eax,(%ebp)
45	movl	%edx,4(%ebp)
/* Load the 16 key bytes as four words and clamp them:
   268435455 = 0x0fffffff, 268435452 = 0x0ffffffc. */
46	movl	(%esi),%eax
47	movl	4(%esi),%ebx
48	movl	8(%esi),%ecx
49	movl	12(%esi),%edx
50	andl	$268435455,%eax
51	andl	$268435452,%ebx
52	andl	$268435452,%ecx
53	andl	$268435452,%edx
54	movl	%eax,24(%edi)
55	movl	%ebx,28(%edi)
56	movl	%ecx,32(%edi)
57	movl	%edx,36(%edi)
58	movl	$1,%eax
59.L000nokey:
60	popl	%edi
61	popl	%esi
62	popl	%ebx
63	popl	%ebp
64	ret
65.size	poly1305_init,.-.L_poly1305_init_begin
/*
 * poly1305_blocks(arg0, arg1, arg2, arg3) -- scalar path, i386 stack args.
 *
 *   arg0  state block: words 0..4 hold the accumulator h0..h4 (base 2^32),
 *         words at +24..+36 hold the key words r0..r3.
 *   arg1  input pointer.
 *   arg2  input length in bytes.
 *   arg3  value added (with carry) into the fifth accumulator word for
 *         every 16-byte block.
 *
 * Each iteration adds one 16-byte block to h, multiplies by r and folds
 * the bits above 2^130 back in multiplied by 5. Nothing is returned; the
 * updated h0..h4 are written back to arg0.
 *
 * Frame after "subl $64,%esp":
 *     0(%esp),12(%esp),16(%esp)  copies of h0, h3, h4 for the current block
 *    20/24/28(%esp)              low three result words
 *    36..48(%esp)                r0..r3
 *    52..60(%esp)                s1..s3 where s_i = r_i + (r_i >> 2)
 *    84(%esp)                    arg0
 *    92(%esp)                    arg2 slot, overwritten with the end pointer
 *    96(%esp)                    arg3
 */
66.globl	poly1305_blocks
67.type	poly1305_blocks,@function
68.align	16
69poly1305_blocks:
70.L_poly1305_blocks_begin:
71	#ifdef __CET__
72
/* Bytes F3 0F 1E FB encode ENDBR32. */
73.byte	243,15,30,251
74	#endif
75
76	pushl	%ebp
77	pushl	%ebx
78	pushl	%esi
79	pushl	%edi
80	movl	20(%esp),%edi
81	movl	24(%esp),%esi
82	movl	28(%esp),%ecx
/* Secondary entry: _poly1305_blocks_sse2 jumps here with the same four
   registers pushed and %edi/%esi/%ecx already loaded. */
83.Lenter_blocks:
/* NOTE(review): the mask is -15 (0xfffffff1), which leaves bit 0 of the
   length intact. The loop below terminates on pointer equality after
   16-byte steps, so a length with bit 0 set would never match the end
   pointer. Presumably callers only pass multiples of 16 -- confirm. */
84	andl	$-15,%ecx
85	jz	.L003nodata
86	subl	$64,%esp
87	movl	24(%edi),%eax
88	movl	28(%edi),%ebx
/* %ebp = input + length = end pointer; parked in the arg2 stack slot. */
89	leal	(%esi,%ecx,1),%ebp
90	movl	32(%edi),%ecx
91	movl	36(%edi),%edx
92	movl	%ebp,92(%esp)
/* From here on %ebp is the running input pointer. */
93	movl	%esi,%ebp
/* Spill r0..r3 and precompute s1..s3 = r_i + (r_i >> 2). */
94	movl	%eax,36(%esp)
95	movl	%ebx,%eax
96	shrl	$2,%eax
97	movl	%ebx,40(%esp)
98	addl	%ebx,%eax
99	movl	%ecx,%ebx
100	shrl	$2,%ebx
101	movl	%ecx,44(%esp)
102	addl	%ecx,%ebx
103	movl	%edx,%ecx
104	shrl	$2,%ecx
105	movl	%edx,48(%esp)
106	addl	%edx,%ecx
107	movl	%eax,52(%esp)
108	movl	%ebx,56(%esp)
109	movl	%ecx,60(%esp)
/* Load the accumulator: h0..h4 live in %eax,%ebx,%ecx,%esi,%edi. */
110	movl	(%edi),%eax
111	movl	4(%edi),%ebx
112	movl	8(%edi),%ecx
113	movl	12(%edi),%esi
114	movl	16(%edi),%edi
115	jmp	.L004loop
116.align	32
117.L004loop:
/* h += block; leal advances the pointer without disturbing the carry
   chain, and arg3 is added into h4 together with the final carry. */
118	addl	(%ebp),%eax
119	adcl	4(%ebp),%ebx
120	adcl	8(%ebp),%ecx
121	adcl	12(%ebp),%esi
122	leal	16(%ebp),%ebp
123	adcl	96(%esp),%edi
124	movl	%eax,(%esp)
125	movl	%esi,12(%esp)
/* d0 = h0*r0 + h1*s3 + h2*s2 + h3*s1, accumulated in %esi:%edi. */
126	mull	36(%esp)
127	movl	%edi,16(%esp)
128	movl	%eax,%edi
129	movl	%ebx,%eax
130	movl	%edx,%esi
131	mull	60(%esp)
132	addl	%eax,%edi
133	movl	%ecx,%eax
134	adcl	%edx,%esi
135	mull	56(%esp)
136	addl	%eax,%edi
137	movl	12(%esp),%eax
138	adcl	%edx,%esi
139	mull	52(%esp)
140	addl	%eax,%edi
141	movl	(%esp),%eax
142	adcl	%edx,%esi
/* d1 = carry + h0*r1 + h1*r0 + h2*s3 + h3*s2 + h4*s1; low word of d0 is
   parked at 20(%esp). The h4 term uses the 32-bit imull. */
143	mull	40(%esp)
144	movl	%edi,20(%esp)
145	xorl	%edi,%edi
146	addl	%eax,%esi
147	movl	%ebx,%eax
148	adcl	%edx,%edi
149	mull	36(%esp)
150	addl	%eax,%esi
151	movl	%ecx,%eax
152	adcl	%edx,%edi
153	mull	60(%esp)
154	addl	%eax,%esi
155	movl	12(%esp),%eax
156	adcl	%edx,%edi
157	mull	56(%esp)
158	addl	%eax,%esi
159	movl	16(%esp),%eax
160	adcl	%edx,%edi
161	imull	52(%esp),%eax
162	addl	%eax,%esi
163	movl	(%esp),%eax
164	adcl	$0,%edi
/* d2 = carry + h0*r2 + h1*r1 + h2*r0 + h3*s3 + h4*s2; d1 parked at 24(%esp). */
165	mull	44(%esp)
166	movl	%esi,24(%esp)
167	xorl	%esi,%esi
168	addl	%eax,%edi
169	movl	%ebx,%eax
170	adcl	%edx,%esi
171	mull	40(%esp)
172	addl	%eax,%edi
173	movl	%ecx,%eax
174	adcl	%edx,%esi
175	mull	36(%esp)
176	addl	%eax,%edi
177	movl	12(%esp),%eax
178	adcl	%edx,%esi
179	mull	60(%esp)
180	addl	%eax,%edi
181	movl	16(%esp),%eax
182	adcl	%edx,%esi
183	imull	56(%esp),%eax
184	addl	%eax,%edi
185	movl	(%esp),%eax
186	adcl	$0,%esi
/* d3 = carry + h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*s3; d2 parked at 28(%esp). */
187	mull	48(%esp)
188	movl	%edi,28(%esp)
189	xorl	%edi,%edi
190	addl	%eax,%esi
191	movl	%ebx,%eax
192	adcl	%edx,%edi
193	mull	44(%esp)
194	addl	%eax,%esi
195	movl	%ecx,%eax
196	adcl	%edx,%edi
197	mull	40(%esp)
198	addl	%eax,%esi
199	movl	12(%esp),%eax
200	adcl	%edx,%edi
201	mull	36(%esp)
202	addl	%eax,%esi
203	movl	16(%esp),%ecx
204	adcl	%edx,%edi
205	movl	%ecx,%edx
206	imull	60(%esp),%ecx
207	addl	%ecx,%esi
208	movl	20(%esp),%eax
209	adcl	$0,%edi
/* d4 = carry + h4*r0. */
210	imull	36(%esp),%edx
211	addl	%edi,%edx
212	movl	24(%esp),%ebx
213	movl	28(%esp),%ecx
/* Reduce: keep the low two bits of d4 as the new h4 and add
   (d4 >> 2) * 5 back into the bottom, rippling the carry upward. */
214	movl	%edx,%edi
215	shrl	$2,%edx
216	andl	$3,%edi
217	leal	(%edx,%edx,4),%edx
218	addl	%edx,%eax
219	adcl	$0,%ebx
220	adcl	$0,%ecx
221	adcl	$0,%esi
222	adcl	$0,%edi
/* Loop until the input pointer equals the saved end pointer. */
223	cmpl	92(%esp),%ebp
224	jne	.L004loop
/* Reload arg0 (still relative to the 64-byte frame) and store h0..h4. */
225	movl	84(%esp),%edx
226	addl	$64,%esp
227	movl	%eax,(%edx)
228	movl	%ebx,4(%edx)
229	movl	%ecx,8(%edx)
230	movl	%esi,12(%edx)
231	movl	%edi,16(%edx)
232.L003nodata:
233	popl	%edi
234	popl	%esi
235	popl	%ebx
236	popl	%ebp
237	ret
238.size	poly1305_blocks,.-.L_poly1305_blocks_begin
/*
 * poly1305_emit(arg0, arg1, arg2) -- scalar path, i386 stack args.
 *
 *   arg0  state block; words 0..4 are the accumulator h0..h4 (base 2^32).
 *   arg1  16-byte output buffer (also used as scratch while selecting).
 *   arg2  pointer to four 32-bit words that are added to the result.
 *
 * Computes g = h + 5. If the sum reaches bit 130 (bit 2 of the fifth
 * word) the low 128 bits of g are selected, otherwise the low 128 bits
 * of h. The selection is done with an all-ones/all-zeros mask instead of
 * a branch. The four words at arg2 are then added with carry and the
 * 16-byte result is stored to arg1.
 */
239.globl	poly1305_emit
240.type	poly1305_emit,@function
241.align	16
242poly1305_emit:
243.L_poly1305_emit_begin:
244	#ifdef __CET__
245
/* Bytes F3 0F 1E FB encode ENDBR32. */
246.byte	243,15,30,251
247	#endif
248
249	pushl	%ebp
250	pushl	%ebx
251	pushl	%esi
252	pushl	%edi
253	movl	20(%esp),%ebp
/* Secondary entry: _poly1305_emit_sse2 jumps here with the same four
   registers pushed and %ebp = arg0. */
254.Lenter_emit:
255	movl	24(%esp),%edi
256	movl	(%ebp),%eax
257	movl	4(%ebp),%ebx
258	movl	8(%ebp),%ecx
259	movl	12(%ebp),%edx
260	movl	16(%ebp),%esi
/* g = h + 5 across all five words. */
261	addl	$5,%eax
262	adcl	$0,%ebx
263	adcl	$0,%ecx
264	adcl	$0,%edx
265	adcl	$0,%esi
/* %esi = -(g >> 130): all ones when g overflowed 2^130, else zero. */
266	shrl	$2,%esi
267	negl	%esi
/* Park (g & mask) in the output buffer. */
268	andl	%esi,%eax
269	andl	%esi,%ebx
270	andl	%esi,%ecx
271	andl	%esi,%edx
272	movl	%eax,(%edi)
273	movl	%ebx,4(%edi)
274	movl	%ecx,8(%edi)
275	movl	%edx,12(%edi)
/* Invert the mask and merge in (h & ~mask). */
276	notl	%esi
277	movl	(%ebp),%eax
278	movl	4(%ebp),%ebx
279	movl	8(%ebp),%ecx
280	movl	12(%ebp),%edx
/* %ebp is repurposed: from here it points at arg2. */
281	movl	28(%esp),%ebp
282	andl	%esi,%eax
283	andl	%esi,%ebx
284	andl	%esi,%ecx
285	andl	%esi,%edx
286	orl	(%edi),%eax
287	orl	4(%edi),%ebx
288	orl	8(%edi),%ecx
289	orl	12(%edi),%edx
/* 128-bit add of the words at arg2; the carry out of the top word is dropped. */
290	addl	(%ebp),%eax
291	adcl	4(%ebp),%ebx
292	adcl	8(%ebp),%ecx
293	adcl	12(%ebp),%edx
294	movl	%eax,(%edi)
295	movl	%ebx,4(%edi)
296	movl	%ecx,8(%edi)
297	movl	%edx,12(%edi)
298	popl	%edi
299	popl	%esi
300	popl	%ebx
301	popl	%ebp
302	ret
303.size	poly1305_emit,.-.L_poly1305_emit_begin
/*
 * _poly1305_init_sse2 -- file-local helper with a register-based calling
 * convention (it is not declared .globl and reads no stack arguments).
 *
 *   In:   %edi = state block (the 16 key bytes are read from 24(%edi))
 *         %ebx = address of .Lconst_sse2 (64(%ebx) is the 0x3ffffff mask)
 *   Out:  a table written to 48(%edi)..191(%edi); %edi is restored.
 *         %xmm7 still holds the 26-bit mask on return.
 *   Clobbers: %ebp (used to save %esp), %ecx, %edx, %xmm0-%xmm6, flags.
 *
 * The key is split into five 26-bit limbs and multiplied by itself in two
 * passes. Tracing the lanes through the final shift/or/pshufd $141 gives,
 * for each limb, one 16-byte entry whose dwords are r^4, r^3, r^2, r^1.
 * Five such entries are stored at +48..+127 (relative to the state) and
 * four more at +128..+191 holding limbs 1..4 multiplied by 5.
 *
 * Scratch frame (224 bytes, 16-byte aligned):
 *     0..79(%esp)    current multiplicand limbs 0..4
 *    80..143(%esp)   5 * limbs 1..4
 *   144..223(%esp)   (%edx) multiplier limbs, low qword copied to both lanes
 */
304.align	32
305.type	_poly1305_init_sse2,@function
306.align	16
307_poly1305_init_sse2:
308	#ifdef __CET__
309
/* Bytes F3 0F 1E FB encode ENDBR32. */
310.byte	243,15,30,251
311	#endif
312
313	movdqu	24(%edi),%xmm4
/* Bias %edi by 48 so the table stores below use small offsets; undone
   before returning. */
314	leal	48(%edi),%edi
315	movl	%esp,%ebp
316	subl	$224,%esp
317	andl	$-16,%esp
/* movq loads only the low 64 bits of the mask, so the upper lane of every
   limb is zero after the pand instructions below. */
318	movq	64(%ebx),%xmm7
/* Split the 128-bit key into 26-bit limbs at bit offsets 0, 26, 52
   (6 bytes + 4 bits), 78 (6 bytes + 30 bits) and 104 (13 bytes). */
319	movdqa	%xmm4,%xmm0
320	movdqa	%xmm4,%xmm1
321	movdqa	%xmm4,%xmm2
322	pand	%xmm7,%xmm0
323	psrlq	$26,%xmm1
324	psrldq	$6,%xmm2
325	pand	%xmm7,%xmm1
326	movdqa	%xmm2,%xmm3
327	psrlq	$4,%xmm2
328	psrlq	$30,%xmm3
329	pand	%xmm7,%xmm2
330	pand	%xmm7,%xmm3
331	psrldq	$13,%xmm4
332	leal	144(%esp),%edx
/* Two passes through the multiply below. */
333	movl	$2,%ecx
334.L005square:
/* Save the current limbs as the multiplicand. */
335	movdqa	%xmm0,(%esp)
336	movdqa	%xmm1,16(%esp)
337	movdqa	%xmm2,32(%esp)
338	movdqa	%xmm3,48(%esp)
339	movdqa	%xmm4,64(%esp)
/* 5*x computed as (x << 2) + x for limbs 1..4. */
340	movdqa	%xmm1,%xmm6
341	movdqa	%xmm2,%xmm5
342	pslld	$2,%xmm6
343	pslld	$2,%xmm5
344	paddd	%xmm1,%xmm6
345	paddd	%xmm2,%xmm5
346	movdqa	%xmm6,80(%esp)
347	movdqa	%xmm5,96(%esp)
348	movdqa	%xmm3,%xmm6
349	movdqa	%xmm4,%xmm5
350	pslld	$2,%xmm6
351	pslld	$2,%xmm5
352	paddd	%xmm3,%xmm6
353	paddd	%xmm4,%xmm5
354	movdqa	%xmm6,112(%esp)
355	movdqa	%xmm5,128(%esp)
/* pshufd $68 copies the low qword into both lanes: the multiplier is the
   low-lane value, applied to both lanes of the multiplicand. */
356	pshufd	$68,%xmm0,%xmm6
357	movdqa	%xmm1,%xmm5
358	pshufd	$68,%xmm1,%xmm1
359	pshufd	$68,%xmm2,%xmm2
360	pshufd	$68,%xmm3,%xmm3
361	pshufd	$68,%xmm4,%xmm4
362	movdqa	%xmm6,(%edx)
363	movdqa	%xmm1,16(%edx)
364	movdqa	%xmm2,32(%edx)
365	movdqa	%xmm3,48(%edx)
366	movdqa	%xmm4,64(%edx)
/* 5x5 limb product; %xmm0..%xmm4 accumulate result limbs 0..4. Terms that
   would land above limb 4 use the 5x copies at 80..128(%esp) instead. */
367	pmuludq	%xmm0,%xmm4
368	pmuludq	%xmm0,%xmm3
369	pmuludq	%xmm0,%xmm2
370	pmuludq	%xmm0,%xmm1
371	pmuludq	%xmm6,%xmm0
372	movdqa	%xmm5,%xmm6
373	pmuludq	48(%edx),%xmm5
374	movdqa	%xmm6,%xmm7
375	pmuludq	32(%edx),%xmm6
376	paddq	%xmm5,%xmm4
377	movdqa	%xmm7,%xmm5
378	pmuludq	16(%edx),%xmm7
379	paddq	%xmm6,%xmm3
380	movdqa	80(%esp),%xmm6
381	pmuludq	(%edx),%xmm5
382	paddq	%xmm7,%xmm2
383	pmuludq	64(%edx),%xmm6
384	movdqa	32(%esp),%xmm7
385	paddq	%xmm5,%xmm1
386	movdqa	%xmm7,%xmm5
387	pmuludq	32(%edx),%xmm7
388	paddq	%xmm6,%xmm0
389	movdqa	%xmm5,%xmm6
390	pmuludq	16(%edx),%xmm5
391	paddq	%xmm7,%xmm4
392	movdqa	96(%esp),%xmm7
393	pmuludq	(%edx),%xmm6
394	paddq	%xmm5,%xmm3
395	movdqa	%xmm7,%xmm5
396	pmuludq	64(%edx),%xmm7
397	paddq	%xmm6,%xmm2
398	pmuludq	48(%edx),%xmm5
399	movdqa	48(%esp),%xmm6
400	paddq	%xmm7,%xmm1
401	movdqa	%xmm6,%xmm7
402	pmuludq	16(%edx),%xmm6
403	paddq	%xmm5,%xmm0
404	movdqa	112(%esp),%xmm5
405	pmuludq	(%edx),%xmm7
406	paddq	%xmm6,%xmm4
407	movdqa	%xmm5,%xmm6
408	pmuludq	64(%edx),%xmm5
409	paddq	%xmm7,%xmm3
410	movdqa	%xmm6,%xmm7
411	pmuludq	48(%edx),%xmm6
412	paddq	%xmm5,%xmm2
413	pmuludq	32(%edx),%xmm7
414	movdqa	64(%esp),%xmm5
415	paddq	%xmm6,%xmm1
416	movdqa	128(%esp),%xmm6
417	pmuludq	(%edx),%xmm5
418	paddq	%xmm7,%xmm0
419	movdqa	%xmm6,%xmm7
420	pmuludq	64(%edx),%xmm6
421	paddq	%xmm5,%xmm4
422	movdqa	%xmm7,%xmm5
423	pmuludq	16(%edx),%xmm7
424	paddq	%xmm6,%xmm3
425	movdqa	%xmm5,%xmm6
426	pmuludq	32(%edx),%xmm5
427	paddq	%xmm7,%xmm0
428	pmuludq	48(%edx),%xmm6
/* Reload the full 128-bit 26-bit mask for the carry pass. */
429	movdqa	64(%ebx),%xmm7
430	paddq	%xmm5,%xmm1
431	paddq	%xmm6,%xmm2
/* Carry propagation between limbs: each limb is masked to 26 bits and its
   excess is added to the next one; the excess of limb 4 is folded into
   limb 0 multiplied by 5 (x + (x << 2)). */
432	movdqa	%xmm3,%xmm5
433	pand	%xmm7,%xmm3
434	psrlq	$26,%xmm5
435	paddq	%xmm4,%xmm5
436	movdqa	%xmm0,%xmm6
437	pand	%xmm7,%xmm0
438	psrlq	$26,%xmm6
439	movdqa	%xmm5,%xmm4
440	paddq	%xmm1,%xmm6
441	psrlq	$26,%xmm5
442	pand	%xmm7,%xmm4
443	movdqa	%xmm6,%xmm1
444	psrlq	$26,%xmm6
445	paddd	%xmm5,%xmm0
446	psllq	$2,%xmm5
447	paddq	%xmm2,%xmm6
448	paddq	%xmm0,%xmm5
449	pand	%xmm7,%xmm1
450	movdqa	%xmm6,%xmm2
451	psrlq	$26,%xmm6
452	pand	%xmm7,%xmm2
453	paddd	%xmm3,%xmm6
454	movdqa	%xmm5,%xmm0
455	psrlq	$26,%xmm5
456	movdqa	%xmm6,%xmm3
457	psrlq	$26,%xmm6
458	pand	%xmm7,%xmm0
459	paddd	%xmm5,%xmm1
460	pand	%xmm7,%xmm3
461	paddd	%xmm6,%xmm4
462	decl	%ecx
463	jz	.L006square_break
/* After pass 1: low lane = r^2 (just computed), high lane = the saved r,
   so pass 2 yields r^4 in the low lane and r^3 in the high lane. */
464	punpcklqdq	(%esp),%xmm0
465	punpcklqdq	16(%esp),%xmm1
466	punpcklqdq	32(%esp),%xmm2
467	punpcklqdq	48(%esp),%xmm3
468	punpcklqdq	64(%esp),%xmm4
469	jmp	.L005square
470.L006square_break:
/* Interleave the pass-2 products (shifted into the high dwords) with the
   saved pass-2 inputs, then pshufd $141 reorders each entry's dwords to
   r^4, r^3, r^2, r^1. */
471	psllq	$32,%xmm0
472	psllq	$32,%xmm1
473	psllq	$32,%xmm2
474	psllq	$32,%xmm3
475	psllq	$32,%xmm4
476	por	(%esp),%xmm0
477	por	16(%esp),%xmm1
478	por	32(%esp),%xmm2
479	por	48(%esp),%xmm3
480	por	64(%esp),%xmm4
481	pshufd	$141,%xmm0,%xmm0
482	pshufd	$141,%xmm1,%xmm1
483	pshufd	$141,%xmm2,%xmm2
484	pshufd	$141,%xmm3,%xmm3
485	pshufd	$141,%xmm4,%xmm4
/* Table entries for limbs 0..4 (unaligned stores: the state block's
   alignment is not established by this code). */
486	movdqu	%xmm0,(%edi)
487	movdqu	%xmm1,16(%edi)
488	movdqu	%xmm2,32(%edi)
489	movdqu	%xmm3,48(%edi)
490	movdqu	%xmm4,64(%edi)
/* Followed by 5 * limbs 1..4. */
491	movdqa	%xmm1,%xmm6
492	movdqa	%xmm2,%xmm5
493	pslld	$2,%xmm6
494	pslld	$2,%xmm5
495	paddd	%xmm1,%xmm6
496	paddd	%xmm2,%xmm5
497	movdqu	%xmm6,80(%edi)
498	movdqu	%xmm5,96(%edi)
499	movdqa	%xmm3,%xmm6
500	movdqa	%xmm4,%xmm5
501	pslld	$2,%xmm6
502	pslld	$2,%xmm5
503	paddd	%xmm3,%xmm6
504	paddd	%xmm4,%xmm5
505	movdqu	%xmm6,112(%edi)
506	movdqu	%xmm5,128(%edi)
/* Drop the scratch frame and undo the +48 bias on %edi. */
507	movl	%ebp,%esp
508	leal	-48(%edi),%edi
509	ret
510.size	_poly1305_init_sse2,.-_poly1305_init_sse2
/*
 * _poly1305_blocks_sse2(arg0, arg1, arg2, arg3) -- SSE2 path; same stack
 * arguments as poly1305_blocks (state, input, length, per-block value that
 * is shifted left by 24 and added to limb 4).
 *
 * The word at 20(arg0) is a representation flag:
 *     0      h0..h4 at arg0 are base-2^32 words (scalar layout)
 *     != 0   h0..h4 at arg0 are five 26-bit limbs
 *
 * Flow:
 *   - length &= -16; return if zero.
 *   - length < 64 and flag == 0: tail-jump into the scalar loop at
 *     .Lenter_blocks (same push frame, %edi/%esi/%ecx already set).
 *   - otherwise, if flag == 0: call _poly1305_init_sse2 to build the
 *     power table at arg0+48, convert h to 26-bit limbs, set flag = 1.
 *   - if (length & 31) != 0: absorb one leading block using the r^1 table
 *     column so an even number of blocks remains.
 *   - process the rest two blocks per 128-bit register (one block per
 *     64-bit lane), 64 bytes per loop iteration, then the tails.
 *   - store the low dword of each limb register back to arg0.
 *
 * Working frame (528 bytes, 16-byte aligned, old %esp kept in %ebp):
 *     0..79(%esp)      scratch limbs / partial sums
 *    80..159(%esp)     accumulator limbs h0..h4 between iterations
 *   160..239(%esp)     (%eax) limbs of the pair currently being multiplied
 *   240..383(%esp)     (-144(%edx)) table columns r^2 / r^1, 9 entries
 *   384..527(%esp)     ((%edx))     table columns r^4 / r^3, 9 entries
 */
511.align	32
512.type	_poly1305_blocks_sse2,@function
513.align	16
514_poly1305_blocks_sse2:
515	#ifdef __CET__
516
/* Bytes F3 0F 1E FB encode ENDBR32. */
517.byte	243,15,30,251
518	#endif
519
520	pushl	%ebp
521	pushl	%ebx
522	pushl	%esi
523	pushl	%edi
524	movl	20(%esp),%edi
525	movl	24(%esp),%esi
526	movl	28(%esp),%ecx
/* %eax = representation flag. */
527	movl	20(%edi),%eax
528	andl	$-16,%ecx
529	jz	.L007nodata
530	cmpl	$64,%ecx
531	jae	.L008enter_sse2
/* Short input while still in base 2^32: let the scalar code handle it. */
532	testl	%eax,%eax
533	jz	.Lenter_blocks
534.align	16
535.L008enter_sse2:
/* PIC: %ebx = runtime address of .Lconst_sse2. */
536	call	.L009pic_point
537.L009pic_point:
538	popl	%ebx
539	leal	.Lconst_sse2-.L009pic_point(%ebx),%ebx
540	testl	%eax,%eax
541	jnz	.L010base2_26
/* First SSE2 use: build the table (clobbers %ebp/%ecx/%edx; leaves the
   26-bit mask in %xmm7, which the code after .L011base2_32 relies on). */
542	call	_poly1305_init_sse2
/* Convert h from base 2^32 to 26-bit limbs using overlapping unaligned
   loads at byte offsets 0, 3, 6, 9, 13 (bit offsets 0, 24, 48, 72, 104)
   followed by shifts of 0, 2, 4, 6, 0. 67108863 = 0x3ffffff. */
543	movl	(%edi),%eax
544	movl	3(%edi),%ecx
545	movl	6(%edi),%edx
546	movl	9(%edi),%esi
547	movl	13(%edi),%ebp
/* Mark the state as base 2^26 from now on. */
548	movl	$1,20(%edi)
549	shrl	$2,%ecx
550	andl	$67108863,%eax
551	shrl	$4,%edx
552	andl	$67108863,%ecx
553	shrl	$6,%esi
554	andl	$67108863,%edx
555	movd	%eax,%xmm0
556	movd	%ecx,%xmm1
557	movd	%edx,%xmm2
558	movd	%esi,%xmm3
559	movd	%ebp,%xmm4
/* Reload the input pointer and length that were used as scratch above.
   NOTE(review): this reloads the raw length from the stack, without the
   "andl $-16" applied at function entry. Presumably callers only pass
   multiples of 16 -- confirm. */
560	movl	24(%esp),%esi
561	movl	28(%esp),%ecx
562	jmp	.L011base2_32
563.align	16
564.L010base2_26:
/* State is already in 26-bit limbs: load them and the mask directly. */
565	movd	(%edi),%xmm0
566	movd	4(%edi),%xmm1
567	movd	8(%edi),%xmm2
568	movd	12(%edi),%xmm3
569	movd	16(%edi),%xmm4
570	movdqa	64(%ebx),%xmm7
571.L011base2_32:
/* %eax = arg3 << 24, i.e. positioned at bit 24 of limb 4. */
572	movl	32(%esp),%eax
573	movl	%esp,%ebp
574	subl	$528,%esp
575	andl	$-16,%esp
/* %edi = table base (state + 48) for the rest of the function. */
576	leal	48(%edi),%edi
577	shll	$24,%eax
578	testl	$31,%ecx
579	jz	.L012even
/* ---- One leading block: h = (h + block) * r ---- */
/* Split the 16 input bytes into 26-bit limbs and add them to h. */
580	movdqu	(%esi),%xmm6
581	leal	16(%esi),%esi
582	movdqa	%xmm6,%xmm5
583	pand	%xmm7,%xmm6
584	paddd	%xmm6,%xmm0
585	movdqa	%xmm5,%xmm6
586	psrlq	$26,%xmm5
587	psrldq	$6,%xmm6
588	pand	%xmm7,%xmm5
589	paddd	%xmm5,%xmm1
590	movdqa	%xmm6,%xmm5
591	psrlq	$4,%xmm6
592	pand	%xmm7,%xmm6
593	paddd	%xmm6,%xmm2
594	movdqa	%xmm5,%xmm6
595	psrlq	$30,%xmm5
596	pand	%xmm7,%xmm5
597	psrldq	$7,%xmm6
598	paddd	%xmm5,%xmm3
599	movd	%eax,%xmm5
600	paddd	%xmm6,%xmm4
/* Table dword offsets 12, 28, 44, 60, 76 are the fourth dword of each
   limb entry (the r^1 column); 92..140 are the 5*r^1 limbs. */
601	movd	12(%edi),%xmm6
602	paddd	%xmm5,%xmm4
603	movdqa	%xmm0,(%esp)
604	movdqa	%xmm1,16(%esp)
605	movdqa	%xmm2,32(%esp)
606	movdqa	%xmm3,48(%esp)
607	movdqa	%xmm4,64(%esp)
608	pmuludq	%xmm6,%xmm0
609	pmuludq	%xmm6,%xmm1
610	pmuludq	%xmm6,%xmm2
611	movd	28(%edi),%xmm5
612	pmuludq	%xmm6,%xmm3
613	pmuludq	%xmm6,%xmm4
614	movdqa	%xmm5,%xmm6
615	pmuludq	48(%esp),%xmm5
616	movdqa	%xmm6,%xmm7
617	pmuludq	32(%esp),%xmm6
618	paddq	%xmm5,%xmm4
619	movdqa	%xmm7,%xmm5
620	pmuludq	16(%esp),%xmm7
621	paddq	%xmm6,%xmm3
622	movd	92(%edi),%xmm6
623	pmuludq	(%esp),%xmm5
624	paddq	%xmm7,%xmm2
625	pmuludq	64(%esp),%xmm6
626	movd	44(%edi),%xmm7
627	paddq	%xmm5,%xmm1
628	movdqa	%xmm7,%xmm5
629	pmuludq	32(%esp),%xmm7
630	paddq	%xmm6,%xmm0
631	movdqa	%xmm5,%xmm6
632	pmuludq	16(%esp),%xmm5
633	paddq	%xmm7,%xmm4
634	movd	108(%edi),%xmm7
635	pmuludq	(%esp),%xmm6
636	paddq	%xmm5,%xmm3
637	movdqa	%xmm7,%xmm5
638	pmuludq	64(%esp),%xmm7
639	paddq	%xmm6,%xmm2
640	pmuludq	48(%esp),%xmm5
641	movd	60(%edi),%xmm6
642	paddq	%xmm7,%xmm1
643	movdqa	%xmm6,%xmm7
644	pmuludq	16(%esp),%xmm6
645	paddq	%xmm5,%xmm0
646	movd	124(%edi),%xmm5
647	pmuludq	(%esp),%xmm7
648	paddq	%xmm6,%xmm4
649	movdqa	%xmm5,%xmm6
650	pmuludq	64(%esp),%xmm5
651	paddq	%xmm7,%xmm3
652	movdqa	%xmm6,%xmm7
653	pmuludq	48(%esp),%xmm6
654	paddq	%xmm5,%xmm2
655	pmuludq	32(%esp),%xmm7
656	movd	76(%edi),%xmm5
657	paddq	%xmm6,%xmm1
658	movd	140(%edi),%xmm6
659	pmuludq	(%esp),%xmm5
660	paddq	%xmm7,%xmm0
661	movdqa	%xmm6,%xmm7
662	pmuludq	64(%esp),%xmm6
663	paddq	%xmm5,%xmm4
664	movdqa	%xmm7,%xmm5
665	pmuludq	16(%esp),%xmm7
666	paddq	%xmm6,%xmm3
667	movdqa	%xmm5,%xmm6
668	pmuludq	32(%esp),%xmm5
669	paddq	%xmm7,%xmm0
670	pmuludq	48(%esp),%xmm6
671	movdqa	64(%ebx),%xmm7
672	paddq	%xmm5,%xmm1
673	paddq	%xmm6,%xmm2
/* Carry propagation between limbs (same sequence as in _poly1305_init_sse2). */
674	movdqa	%xmm3,%xmm5
675	pand	%xmm7,%xmm3
676	psrlq	$26,%xmm5
677	paddq	%xmm4,%xmm5
678	movdqa	%xmm0,%xmm6
679	pand	%xmm7,%xmm0
680	psrlq	$26,%xmm6
681	movdqa	%xmm5,%xmm4
682	paddq	%xmm1,%xmm6
683	psrlq	$26,%xmm5
684	pand	%xmm7,%xmm4
685	movdqa	%xmm6,%xmm1
686	psrlq	$26,%xmm6
687	paddd	%xmm5,%xmm0
688	psllq	$2,%xmm5
689	paddq	%xmm2,%xmm6
690	paddq	%xmm0,%xmm5
691	pand	%xmm7,%xmm1
692	movdqa	%xmm6,%xmm2
693	psrlq	$26,%xmm6
694	pand	%xmm7,%xmm2
695	paddd	%xmm3,%xmm6
696	movdqa	%xmm5,%xmm0
697	psrlq	$26,%xmm5
698	movdqa	%xmm6,%xmm3
699	psrlq	$26,%xmm6
700	pand	%xmm7,%xmm0
701	paddd	%xmm5,%xmm1
702	pand	%xmm7,%xmm3
703	paddd	%xmm6,%xmm4
704	subl	$16,%ecx
705	jz	.L013done
/* ---- Even number of blocks remaining ---- */
706.L012even:
707	leal	384(%esp),%edx
708	leal	-32(%esi),%eax
/* The flags from this subtraction are consumed by the cmovb just below
   and by the "jbe .L014skip_loop" at the end of this section; none of
   the intervening SSE/lea/mov instructions modify EFLAGS. */
709	subl	$64,%ecx
/* Expand each of the nine table entries {r^4, r^3, r^2, r^1} into two
   16-byte stack slots: pshufd $68 keeps the r^4:r^3 dword pair in both
   qwords (stored at (%edx)+16k), pshufd $238 keeps the r^2:r^1 pair
   (stored at -144(%edx)+16k). */
710	movdqu	(%edi),%xmm5
711	pshufd	$68,%xmm5,%xmm6
/* Fewer than 64 bytes left: step %esi back by 32 so that the loads at
   32(%esi)/48(%esi) below address the current pair of blocks. */
712	cmovbl	%eax,%esi
713	pshufd	$238,%xmm5,%xmm5
714	movdqa	%xmm6,(%edx)
715	leal	160(%esp),%eax
716	movdqu	16(%edi),%xmm6
717	movdqa	%xmm5,-144(%edx)
718	pshufd	$68,%xmm6,%xmm5
719	pshufd	$238,%xmm6,%xmm6
720	movdqa	%xmm5,16(%edx)
721	movdqu	32(%edi),%xmm5
722	movdqa	%xmm6,-128(%edx)
723	pshufd	$68,%xmm5,%xmm6
724	pshufd	$238,%xmm5,%xmm5
725	movdqa	%xmm6,32(%edx)
726	movdqu	48(%edi),%xmm6
727	movdqa	%xmm5,-112(%edx)
728	pshufd	$68,%xmm6,%xmm5
729	pshufd	$238,%xmm6,%xmm6
730	movdqa	%xmm5,48(%edx)
731	movdqu	64(%edi),%xmm5
732	movdqa	%xmm6,-96(%edx)
733	pshufd	$68,%xmm5,%xmm6
734	pshufd	$238,%xmm5,%xmm5
735	movdqa	%xmm6,64(%edx)
736	movdqu	80(%edi),%xmm6
737	movdqa	%xmm5,-80(%edx)
738	pshufd	$68,%xmm6,%xmm5
739	pshufd	$238,%xmm6,%xmm6
740	movdqa	%xmm5,80(%edx)
741	movdqu	96(%edi),%xmm5
742	movdqa	%xmm6,-64(%edx)
743	pshufd	$68,%xmm5,%xmm6
744	pshufd	$238,%xmm5,%xmm5
745	movdqa	%xmm6,96(%edx)
746	movdqu	112(%edi),%xmm6
747	movdqa	%xmm5,-48(%edx)
748	pshufd	$68,%xmm6,%xmm5
749	pshufd	$238,%xmm6,%xmm6
750	movdqa	%xmm5,112(%edx)
751	movdqu	128(%edi),%xmm5
752	movdqa	%xmm6,-32(%edx)
753	pshufd	$68,%xmm5,%xmm6
754	pshufd	$238,%xmm5,%xmm5
755	movdqa	%xmm6,128(%edx)
756	movdqa	%xmm5,-16(%edx)
/* Load two 16-byte blocks and transpose them into limb form: each 64-bit
   lane of %xmm5,%xmm6,%xmm2,%xmm3,%xmm4 holds one limb of one block.
   h2..h4 are parked at 112..144(%esp) to free registers. */
757	movdqu	32(%esi),%xmm5
758	movdqu	48(%esi),%xmm6
759	leal	32(%esi),%esi
760	movdqa	%xmm2,112(%esp)
761	movdqa	%xmm3,128(%esp)
762	movdqa	%xmm4,144(%esp)
763	movdqa	%xmm5,%xmm2
764	movdqa	%xmm6,%xmm3
765	psrldq	$6,%xmm2
766	psrldq	$6,%xmm3
767	movdqa	%xmm5,%xmm4
768	punpcklqdq	%xmm3,%xmm2
769	punpckhqdq	%xmm6,%xmm4
770	punpcklqdq	%xmm6,%xmm5
771	movdqa	%xmm2,%xmm3
772	psrlq	$4,%xmm2
773	psrlq	$30,%xmm3
774	movdqa	%xmm5,%xmm6
775	psrlq	$40,%xmm4
776	psrlq	$26,%xmm6
777	pand	%xmm7,%xmm5
778	pand	%xmm7,%xmm6
779	pand	%xmm7,%xmm2
780	pand	%xmm7,%xmm3
/* OR in the 1 << 24 constant at (%ebx) for limb 4 of both blocks. */
781	por	(%ebx),%xmm4
782	movdqa	%xmm0,80(%esp)
783	movdqa	%xmm1,96(%esp)
/* Flags still come from "subl $64,%ecx": skip the main loop unless more
   than 64 bytes were available. */
784	jbe	.L014skip_loop
785	jmp	.L015loop
786.align	32
/* ---- Main loop: 64 bytes (two pairs of blocks) per iteration ---- */
787.L015loop:
/* First half: multiply the pair currently in registers by the r^2 column
   (slots at -144(%edx)..-16(%edx)). */
788	movdqa	-144(%edx),%xmm7
789	movdqa	%xmm6,16(%eax)
790	movdqa	%xmm2,32(%eax)
791	movdqa	%xmm3,48(%eax)
792	movdqa	%xmm4,64(%eax)
793	movdqa	%xmm5,%xmm1
794	pmuludq	%xmm7,%xmm5
795	movdqa	%xmm6,%xmm0
796	pmuludq	%xmm7,%xmm6
797	pmuludq	%xmm7,%xmm2
798	pmuludq	%xmm7,%xmm3
799	pmuludq	%xmm7,%xmm4
800	pmuludq	-16(%edx),%xmm0
801	movdqa	%xmm1,%xmm7
802	pmuludq	-128(%edx),%xmm1
803	paddq	%xmm5,%xmm0
804	movdqa	%xmm7,%xmm5
805	pmuludq	-112(%edx),%xmm7
806	paddq	%xmm6,%xmm1
807	movdqa	%xmm5,%xmm6
808	pmuludq	-96(%edx),%xmm5
809	paddq	%xmm7,%xmm2
810	movdqa	16(%eax),%xmm7
811	pmuludq	-80(%edx),%xmm6
812	paddq	%xmm5,%xmm3
813	movdqa	%xmm7,%xmm5
814	pmuludq	-128(%edx),%xmm7
815	paddq	%xmm6,%xmm4
816	movdqa	%xmm5,%xmm6
817	pmuludq	-112(%edx),%xmm5
818	paddq	%xmm7,%xmm2
819	movdqa	32(%eax),%xmm7
820	pmuludq	-96(%edx),%xmm6
821	paddq	%xmm5,%xmm3
822	movdqa	%xmm7,%xmm5
823	pmuludq	-32(%edx),%xmm7
824	paddq	%xmm6,%xmm4
825	movdqa	%xmm5,%xmm6
826	pmuludq	-16(%edx),%xmm5
827	paddq	%xmm7,%xmm0
828	movdqa	%xmm6,%xmm7
829	pmuludq	-128(%edx),%xmm6
830	paddq	%xmm5,%xmm1
831	movdqa	48(%eax),%xmm5
832	pmuludq	-112(%edx),%xmm7
833	paddq	%xmm6,%xmm3
834	movdqa	%xmm5,%xmm6
835	pmuludq	-48(%edx),%xmm5
836	paddq	%xmm7,%xmm4
837	movdqa	%xmm6,%xmm7
838	pmuludq	-32(%edx),%xmm6
839	paddq	%xmm5,%xmm0
840	movdqa	%xmm7,%xmm5
841	pmuludq	-16(%edx),%xmm7
842	paddq	%xmm6,%xmm1
843	movdqa	64(%eax),%xmm6
844	pmuludq	-128(%edx),%xmm5
845	paddq	%xmm7,%xmm2
846	movdqa	%xmm6,%xmm7
847	pmuludq	-16(%edx),%xmm6
848	paddq	%xmm5,%xmm4
849	movdqa	%xmm7,%xmm5
850	pmuludq	-64(%edx),%xmm7
851	paddq	%xmm6,%xmm3
852	movdqa	%xmm5,%xmm6
853	pmuludq	-48(%edx),%xmm5
854	paddq	%xmm7,%xmm0
855	movdqa	64(%ebx),%xmm7
856	pmuludq	-32(%edx),%xmm6
857	paddq	%xmm5,%xmm1
858	paddq	%xmm6,%xmm2
/* Load and transpose the preceding pair of blocks (at -32(%esi)); partial
   products 2..4 are parked at 32..64(%esp) meanwhile. */
859	movdqu	-32(%esi),%xmm5
860	movdqu	-16(%esi),%xmm6
861	leal	32(%esi),%esi
862	movdqa	%xmm2,32(%esp)
863	movdqa	%xmm3,48(%esp)
864	movdqa	%xmm4,64(%esp)
865	movdqa	%xmm5,%xmm2
866	movdqa	%xmm6,%xmm3
867	psrldq	$6,%xmm2
868	psrldq	$6,%xmm3
869	movdqa	%xmm5,%xmm4
870	punpcklqdq	%xmm3,%xmm2
871	punpckhqdq	%xmm6,%xmm4
872	punpcklqdq	%xmm6,%xmm5
873	movdqa	%xmm2,%xmm3
874	psrlq	$4,%xmm2
875	psrlq	$30,%xmm3
876	movdqa	%xmm5,%xmm6
877	psrlq	$40,%xmm4
878	psrlq	$26,%xmm6
879	pand	%xmm7,%xmm5
880	pand	%xmm7,%xmm6
881	pand	%xmm7,%xmm2
882	pand	%xmm7,%xmm3
883	por	(%ebx),%xmm4
884	leal	-32(%esi),%eax
/* Loop counter update; its flags feed the cmovb below and the
   "ja .L015loop" at the bottom of the loop. */
885	subl	$64,%ecx
/* Add the running accumulator h to this pair. */
886	paddd	80(%esp),%xmm5
887	paddd	96(%esp),%xmm6
888	paddd	112(%esp),%xmm2
889	paddd	128(%esp),%xmm3
890	paddd	144(%esp),%xmm4
891	cmovbl	%eax,%esi
892	leal	160(%esp),%eax
/* Second half: multiply (h + pair) by the r^4 column (slots at
   (%edx)..128(%edx)) and add the first-half partial products. */
893	movdqa	(%edx),%xmm7
894	movdqa	%xmm1,16(%esp)
895	movdqa	%xmm6,16(%eax)
896	movdqa	%xmm2,32(%eax)
897	movdqa	%xmm3,48(%eax)
898	movdqa	%xmm4,64(%eax)
899	movdqa	%xmm5,%xmm1
900	pmuludq	%xmm7,%xmm5
901	paddq	%xmm0,%xmm5
902	movdqa	%xmm6,%xmm0
903	pmuludq	%xmm7,%xmm6
904	pmuludq	%xmm7,%xmm2
905	pmuludq	%xmm7,%xmm3
906	pmuludq	%xmm7,%xmm4
907	paddq	16(%esp),%xmm6
908	paddq	32(%esp),%xmm2
909	paddq	48(%esp),%xmm3
910	paddq	64(%esp),%xmm4
911	pmuludq	128(%edx),%xmm0
912	movdqa	%xmm1,%xmm7
913	pmuludq	16(%edx),%xmm1
914	paddq	%xmm5,%xmm0
915	movdqa	%xmm7,%xmm5
916	pmuludq	32(%edx),%xmm7
917	paddq	%xmm6,%xmm1
918	movdqa	%xmm5,%xmm6
919	pmuludq	48(%edx),%xmm5
920	paddq	%xmm7,%xmm2
921	movdqa	16(%eax),%xmm7
922	pmuludq	64(%edx),%xmm6
923	paddq	%xmm5,%xmm3
924	movdqa	%xmm7,%xmm5
925	pmuludq	16(%edx),%xmm7
926	paddq	%xmm6,%xmm4
927	movdqa	%xmm5,%xmm6
928	pmuludq	32(%edx),%xmm5
929	paddq	%xmm7,%xmm2
930	movdqa	32(%eax),%xmm7
931	pmuludq	48(%edx),%xmm6
932	paddq	%xmm5,%xmm3
933	movdqa	%xmm7,%xmm5
934	pmuludq	112(%edx),%xmm7
935	paddq	%xmm6,%xmm4
936	movdqa	%xmm5,%xmm6
937	pmuludq	128(%edx),%xmm5
938	paddq	%xmm7,%xmm0
939	movdqa	%xmm6,%xmm7
940	pmuludq	16(%edx),%xmm6
941	paddq	%xmm5,%xmm1
942	movdqa	48(%eax),%xmm5
943	pmuludq	32(%edx),%xmm7
944	paddq	%xmm6,%xmm3
945	movdqa	%xmm5,%xmm6
946	pmuludq	96(%edx),%xmm5
947	paddq	%xmm7,%xmm4
948	movdqa	%xmm6,%xmm7
949	pmuludq	112(%edx),%xmm6
950	paddq	%xmm5,%xmm0
951	movdqa	%xmm7,%xmm5
952	pmuludq	128(%edx),%xmm7
953	paddq	%xmm6,%xmm1
954	movdqa	64(%eax),%xmm6
955	pmuludq	16(%edx),%xmm5
956	paddq	%xmm7,%xmm2
957	movdqa	%xmm6,%xmm7
958	pmuludq	128(%edx),%xmm6
959	paddq	%xmm5,%xmm4
960	movdqa	%xmm7,%xmm5
961	pmuludq	80(%edx),%xmm7
962	paddq	%xmm6,%xmm3
963	movdqa	%xmm5,%xmm6
964	pmuludq	96(%edx),%xmm5
965	paddq	%xmm7,%xmm0
966	movdqa	64(%ebx),%xmm7
967	pmuludq	112(%edx),%xmm6
968	paddq	%xmm5,%xmm1
969	paddq	%xmm6,%xmm2
/* Carry propagation between limbs. */
970	movdqa	%xmm3,%xmm5
971	pand	%xmm7,%xmm3
972	psrlq	$26,%xmm5
973	paddq	%xmm4,%xmm5
974	movdqa	%xmm0,%xmm6
975	pand	%xmm7,%xmm0
976	psrlq	$26,%xmm6
977	movdqa	%xmm5,%xmm4
978	paddq	%xmm1,%xmm6
979	psrlq	$26,%xmm5
980	pand	%xmm7,%xmm4
981	movdqa	%xmm6,%xmm1
982	psrlq	$26,%xmm6
983	paddd	%xmm5,%xmm0
984	psllq	$2,%xmm5
985	paddq	%xmm2,%xmm6
986	paddq	%xmm0,%xmm5
987	pand	%xmm7,%xmm1
988	movdqa	%xmm6,%xmm2
989	psrlq	$26,%xmm6
990	pand	%xmm7,%xmm2
991	paddd	%xmm3,%xmm6
992	movdqa	%xmm5,%xmm0
993	psrlq	$26,%xmm5
994	movdqa	%xmm6,%xmm3
995	psrlq	$26,%xmm6
996	pand	%xmm7,%xmm0
997	paddd	%xmm5,%xmm1
998	pand	%xmm7,%xmm3
999	paddd	%xmm6,%xmm4
/* Pre-load and transpose the next pair for the following iteration (or
   for the tail), parking h in 80..144(%esp). */
1000	movdqu	32(%esi),%xmm5
1001	movdqu	48(%esi),%xmm6
1002	leal	32(%esi),%esi
1003	movdqa	%xmm2,112(%esp)
1004	movdqa	%xmm3,128(%esp)
1005	movdqa	%xmm4,144(%esp)
1006	movdqa	%xmm5,%xmm2
1007	movdqa	%xmm6,%xmm3
1008	psrldq	$6,%xmm2
1009	psrldq	$6,%xmm3
1010	movdqa	%xmm5,%xmm4
1011	punpcklqdq	%xmm3,%xmm2
1012	punpckhqdq	%xmm6,%xmm4
1013	punpcklqdq	%xmm6,%xmm5
1014	movdqa	%xmm2,%xmm3
1015	psrlq	$4,%xmm2
1016	psrlq	$30,%xmm3
1017	movdqa	%xmm5,%xmm6
1018	psrlq	$40,%xmm4
1019	psrlq	$26,%xmm6
1020	pand	%xmm7,%xmm5
1021	pand	%xmm7,%xmm6
1022	pand	%xmm7,%xmm2
1023	pand	%xmm7,%xmm3
1024	por	(%ebx),%xmm4
1025	movdqa	%xmm0,80(%esp)
1026	movdqa	%xmm1,96(%esp)
/* Flags are still those of "subl $64,%ecx" above. */
1027	ja	.L015loop
/* ---- Tail ---- */
1028.L014skip_loop:
/* pshufd $16 places dword 0 in the low lane and dword 1 in the high lane,
   so the two lanes are multiplied by different powers: r^2 / r^1 when
   reading the -144(%edx) group. */
1029	pshufd	$16,-144(%edx),%xmm7
/* %ecx == 0 after this add means exactly one pair is left: fold h into it
   here. Otherwise two pairs are left (long tail) and h is added to the
   second-to-last pair further down. The Z flag from this add is also
   consumed by "jz .L017short_tail" after the multiply. */
1030	addl	$32,%ecx
1031	jnz	.L016long_tail
1032	paddd	%xmm0,%xmm5
1033	paddd	%xmm1,%xmm6
1034	paddd	112(%esp),%xmm2
1035	paddd	128(%esp),%xmm3
1036	paddd	144(%esp),%xmm4
1037.L016long_tail:
/* Multiply the final pair lane-wise by r^2 (low lane) and r^1 (high lane). */
1038	movdqa	%xmm5,(%eax)
1039	movdqa	%xmm6,16(%eax)
1040	movdqa	%xmm2,32(%eax)
1041	movdqa	%xmm3,48(%eax)
1042	movdqa	%xmm4,64(%eax)
1043	pmuludq	%xmm7,%xmm5
1044	pmuludq	%xmm7,%xmm6
1045	pmuludq	%xmm7,%xmm2
1046	movdqa	%xmm5,%xmm0
1047	pshufd	$16,-128(%edx),%xmm5
1048	pmuludq	%xmm7,%xmm3
1049	movdqa	%xmm6,%xmm1
1050	pmuludq	%xmm7,%xmm4
1051	movdqa	%xmm5,%xmm6
1052	pmuludq	48(%eax),%xmm5
1053	movdqa	%xmm6,%xmm7
1054	pmuludq	32(%eax),%xmm6
1055	paddq	%xmm5,%xmm4
1056	movdqa	%xmm7,%xmm5
1057	pmuludq	16(%eax),%xmm7
1058	paddq	%xmm6,%xmm3
1059	pshufd	$16,-64(%edx),%xmm6
1060	pmuludq	(%eax),%xmm5
1061	paddq	%xmm7,%xmm2
1062	pmuludq	64(%eax),%xmm6
1063	pshufd	$16,-112(%edx),%xmm7
1064	paddq	%xmm5,%xmm1
1065	movdqa	%xmm7,%xmm5
1066	pmuludq	32(%eax),%xmm7
1067	paddq	%xmm6,%xmm0
1068	movdqa	%xmm5,%xmm6
1069	pmuludq	16(%eax),%xmm5
1070	paddq	%xmm7,%xmm4
1071	pshufd	$16,-48(%edx),%xmm7
1072	pmuludq	(%eax),%xmm6
1073	paddq	%xmm5,%xmm3
1074	movdqa	%xmm7,%xmm5
1075	pmuludq	64(%eax),%xmm7
1076	paddq	%xmm6,%xmm2
1077	pmuludq	48(%eax),%xmm5
1078	pshufd	$16,-96(%edx),%xmm6
1079	paddq	%xmm7,%xmm1
1080	movdqa	%xmm6,%xmm7
1081	pmuludq	16(%eax),%xmm6
1082	paddq	%xmm5,%xmm0
1083	pshufd	$16,-32(%edx),%xmm5
1084	pmuludq	(%eax),%xmm7
1085	paddq	%xmm6,%xmm4
1086	movdqa	%xmm5,%xmm6
1087	pmuludq	64(%eax),%xmm5
1088	paddq	%xmm7,%xmm3
1089	movdqa	%xmm6,%xmm7
1090	pmuludq	48(%eax),%xmm6
1091	paddq	%xmm5,%xmm2
1092	pmuludq	32(%eax),%xmm7
1093	pshufd	$16,-80(%edx),%xmm5
1094	paddq	%xmm6,%xmm1
1095	pshufd	$16,-16(%edx),%xmm6
1096	pmuludq	(%eax),%xmm5
1097	paddq	%xmm7,%xmm0
1098	movdqa	%xmm6,%xmm7
1099	pmuludq	64(%eax),%xmm6
1100	paddq	%xmm5,%xmm4
1101	movdqa	%xmm7,%xmm5
1102	pmuludq	16(%eax),%xmm7
1103	paddq	%xmm6,%xmm3
1104	movdqa	%xmm5,%xmm6
1105	pmuludq	32(%eax),%xmm5
1106	paddq	%xmm7,%xmm0
1107	pmuludq	48(%eax),%xmm6
1108	movdqa	64(%ebx),%xmm7
1109	paddq	%xmm5,%xmm1
1110	paddq	%xmm6,%xmm2
/* Z flag from "addl $32,%ecx": set means there is no second pair. */
1111	jz	.L017short_tail
/* Long tail: load the preceding pair, add h, multiply lane-wise by
   r^4 (low lane) / r^3 (high lane) from the (%edx) group and accumulate
   onto the products computed above. */
1112	movdqu	-32(%esi),%xmm5
1113	movdqu	-16(%esi),%xmm6
1114	leal	32(%esi),%esi
1115	movdqa	%xmm2,32(%esp)
1116	movdqa	%xmm3,48(%esp)
1117	movdqa	%xmm4,64(%esp)
1118	movdqa	%xmm5,%xmm2
1119	movdqa	%xmm6,%xmm3
1120	psrldq	$6,%xmm2
1121	psrldq	$6,%xmm3
1122	movdqa	%xmm5,%xmm4
1123	punpcklqdq	%xmm3,%xmm2
1124	punpckhqdq	%xmm6,%xmm4
1125	punpcklqdq	%xmm6,%xmm5
1126	movdqa	%xmm2,%xmm3
1127	psrlq	$4,%xmm2
1128	psrlq	$30,%xmm3
1129	movdqa	%xmm5,%xmm6
1130	psrlq	$40,%xmm4
1131	psrlq	$26,%xmm6
1132	pand	%xmm7,%xmm5
1133	pand	%xmm7,%xmm6
1134	pand	%xmm7,%xmm2
1135	pand	%xmm7,%xmm3
1136	por	(%ebx),%xmm4
1137	pshufd	$16,(%edx),%xmm7
1138	paddd	80(%esp),%xmm5
1139	paddd	96(%esp),%xmm6
1140	paddd	112(%esp),%xmm2
1141	paddd	128(%esp),%xmm3
1142	paddd	144(%esp),%xmm4
1143	movdqa	%xmm5,(%esp)
1144	pmuludq	%xmm7,%xmm5
1145	movdqa	%xmm6,16(%esp)
1146	pmuludq	%xmm7,%xmm6
1147	paddq	%xmm5,%xmm0
1148	movdqa	%xmm2,%xmm5
1149	pmuludq	%xmm7,%xmm2
1150	paddq	%xmm6,%xmm1
1151	movdqa	%xmm3,%xmm6
1152	pmuludq	%xmm7,%xmm3
1153	paddq	32(%esp),%xmm2
1154	movdqa	%xmm5,32(%esp)
1155	pshufd	$16,16(%edx),%xmm5
1156	paddq	48(%esp),%xmm3
1157	movdqa	%xmm6,48(%esp)
1158	movdqa	%xmm4,%xmm6
1159	pmuludq	%xmm7,%xmm4
1160	paddq	64(%esp),%xmm4
1161	movdqa	%xmm6,64(%esp)
1162	movdqa	%xmm5,%xmm6
1163	pmuludq	48(%esp),%xmm5
1164	movdqa	%xmm6,%xmm7
1165	pmuludq	32(%esp),%xmm6
1166	paddq	%xmm5,%xmm4
1167	movdqa	%xmm7,%xmm5
1168	pmuludq	16(%esp),%xmm7
1169	paddq	%xmm6,%xmm3
1170	pshufd	$16,80(%edx),%xmm6
1171	pmuludq	(%esp),%xmm5
1172	paddq	%xmm7,%xmm2
1173	pmuludq	64(%esp),%xmm6
1174	pshufd	$16,32(%edx),%xmm7
1175	paddq	%xmm5,%xmm1
1176	movdqa	%xmm7,%xmm5
1177	pmuludq	32(%esp),%xmm7
1178	paddq	%xmm6,%xmm0
1179	movdqa	%xmm5,%xmm6
1180	pmuludq	16(%esp),%xmm5
1181	paddq	%xmm7,%xmm4
1182	pshufd	$16,96(%edx),%xmm7
1183	pmuludq	(%esp),%xmm6
1184	paddq	%xmm5,%xmm3
1185	movdqa	%xmm7,%xmm5
1186	pmuludq	64(%esp),%xmm7
1187	paddq	%xmm6,%xmm2
1188	pmuludq	48(%esp),%xmm5
1189	pshufd	$16,48(%edx),%xmm6
1190	paddq	%xmm7,%xmm1
1191	movdqa	%xmm6,%xmm7
1192	pmuludq	16(%esp),%xmm6
1193	paddq	%xmm5,%xmm0
1194	pshufd	$16,112(%edx),%xmm5
1195	pmuludq	(%esp),%xmm7
1196	paddq	%xmm6,%xmm4
1197	movdqa	%xmm5,%xmm6
1198	pmuludq	64(%esp),%xmm5
1199	paddq	%xmm7,%xmm3
1200	movdqa	%xmm6,%xmm7
1201	pmuludq	48(%esp),%xmm6
1202	paddq	%xmm5,%xmm2
1203	pmuludq	32(%esp),%xmm7
1204	pshufd	$16,64(%edx),%xmm5
1205	paddq	%xmm6,%xmm1
1206	pshufd	$16,128(%edx),%xmm6
1207	pmuludq	(%esp),%xmm5
1208	paddq	%xmm7,%xmm0
1209	movdqa	%xmm6,%xmm7
1210	pmuludq	64(%esp),%xmm6
1211	paddq	%xmm5,%xmm4
1212	movdqa	%xmm7,%xmm5
1213	pmuludq	16(%esp),%xmm7
1214	paddq	%xmm6,%xmm3
1215	movdqa	%xmm5,%xmm6
1216	pmuludq	32(%esp),%xmm5
1217	paddq	%xmm7,%xmm0
1218	pmuludq	48(%esp),%xmm6
1219	movdqa	64(%ebx),%xmm7
1220	paddq	%xmm5,%xmm1
1221	paddq	%xmm6,%xmm2
1222.L017short_tail:
/* Horizontal add: pshufd $78 swaps the two 64-bit lanes, so each paddq
   sums the high and low lane of a limb. A final carry pass follows. */
1223	pshufd	$78,%xmm4,%xmm6
1224	pshufd	$78,%xmm3,%xmm5
1225	paddq	%xmm6,%xmm4
1226	paddq	%xmm5,%xmm3
1227	pshufd	$78,%xmm0,%xmm6
1228	pshufd	$78,%xmm1,%xmm5
1229	paddq	%xmm6,%xmm0
1230	paddq	%xmm5,%xmm1
1231	pshufd	$78,%xmm2,%xmm6
1232	movdqa	%xmm3,%xmm5
1233	pand	%xmm7,%xmm3
1234	psrlq	$26,%xmm5
1235	paddq	%xmm6,%xmm2
1236	paddq	%xmm4,%xmm5
1237	movdqa	%xmm0,%xmm6
1238	pand	%xmm7,%xmm0
1239	psrlq	$26,%xmm6
1240	movdqa	%xmm5,%xmm4
1241	paddq	%xmm1,%xmm6
1242	psrlq	$26,%xmm5
1243	pand	%xmm7,%xmm4
1244	movdqa	%xmm6,%xmm1
1245	psrlq	$26,%xmm6
1246	paddd	%xmm5,%xmm0
1247	psllq	$2,%xmm5
1248	paddq	%xmm2,%xmm6
1249	paddq	%xmm0,%xmm5
1250	pand	%xmm7,%xmm1
1251	movdqa	%xmm6,%xmm2
1252	psrlq	$26,%xmm6
1253	pand	%xmm7,%xmm2
1254	paddd	%xmm3,%xmm6
1255	movdqa	%xmm5,%xmm0
1256	psrlq	$26,%xmm5
1257	movdqa	%xmm6,%xmm3
1258	psrlq	$26,%xmm6
1259	pand	%xmm7,%xmm0
1260	paddd	%xmm5,%xmm1
1261	pand	%xmm7,%xmm3
1262	paddd	%xmm6,%xmm4
1263.L013done:
/* %edi is state+48, so -48..-32(%edi) are h0..h4 in the state block. */
1264	movd	%xmm0,-48(%edi)
1265	movd	%xmm1,-44(%edi)
1266	movd	%xmm2,-40(%edi)
1267	movd	%xmm3,-36(%edi)
1268	movd	%xmm4,-32(%edi)
/* Drop the aligned working frame. */
1269	movl	%ebp,%esp
1270.L007nodata:
1271	popl	%edi
1272	popl	%esi
1273	popl	%ebx
1274	popl	%ebp
1275	ret
1276.size	_poly1305_blocks_sse2,.-_poly1305_blocks_sse2
/*
 * _poly1305_emit_sse2(arg0, arg1, arg2) -- same stack arguments as
 * poly1305_emit (state, 16-byte output, four words to add).
 *
 * If the flag word at 20(arg0) is zero the accumulator is still in the
 * scalar base-2^32 layout and control transfers to .Lenter_emit inside
 * poly1305_emit (same push frame, %ebp = arg0). Otherwise the five 26-bit
 * limbs are recombined into 32-bit words, the bits above 2^130 are folded
 * back in multiplied by 5, and the same "h + 5 / select / add" sequence
 * as poly1305_emit is performed. %xmm0..%xmm3 are used only as temporary
 * storage for h while h + 5 is computed.
 */
1277.align	32
1278.type	_poly1305_emit_sse2,@function
1279.align	16
1280_poly1305_emit_sse2:
1281	#ifdef __CET__
1282
/* Bytes F3 0F 1E FB encode ENDBR32. */
1283.byte	243,15,30,251
1284	#endif
1285
1286	pushl	%ebp
1287	pushl	%ebx
1288	pushl	%esi
1289	pushl	%edi
1290	movl	20(%esp),%ebp
/* Flag == 0: state was never converted to base 2^26; use the scalar emit. */
1291	cmpl	$0,20(%ebp)
1292	je	.Lenter_emit
1293	movl	(%ebp),%eax
1294	movl	4(%ebp),%edi
1295	movl	8(%ebp),%ecx
1296	movl	12(%ebp),%edx
1297	movl	16(%ebp),%esi
/* Recombine limbs into words: limb k contributes at bit 26*k, so it is
   split with shift pairs (26,6), (20,12), (14,18), (8,24) across adjacent
   32-bit words, with carries propagated by adcl. */
1298	movl	%edi,%ebx
1299	shll	$26,%edi
1300	shrl	$6,%ebx
1301	addl	%edi,%eax
1302	movl	%ecx,%edi
1303	adcl	$0,%ebx
1304	shll	$20,%edi
1305	shrl	$12,%ecx
1306	addl	%edi,%ebx
1307	movl	%edx,%edi
1308	adcl	$0,%ecx
1309	shll	$14,%edi
1310	shrl	$18,%edx
1311	addl	%edi,%ecx
1312	movl	%esi,%edi
1313	adcl	$0,%edx
1314	shll	$8,%edi
1315	shrl	$24,%esi
1316	addl	%edi,%edx
1317	adcl	$0,%esi
/* %esi holds bits 128 and up: keep the low two, fold (%esi >> 2) * 5 back
   into the bottom word. */
1318	movl	%esi,%edi
1319	andl	$3,%esi
1320	shrl	$2,%edi
1321	leal	(%edi,%edi,4),%ebp
/* %edi = arg1 (output), %ebp = arg2 (addend) from here on. */
1322	movl	24(%esp),%edi
1323	addl	%ebp,%eax
1324	movl	28(%esp),%ebp
1325	adcl	$0,%ebx
1326	adcl	$0,%ecx
1327	adcl	$0,%edx
1328	adcl	$0,%esi
/* Stash h in %xmm0..%xmm3 (movd does not touch the carry flag) while
   computing g = h + 5 in the integer registers. */
1329	movd	%eax,%xmm0
1330	addl	$5,%eax
1331	movd	%ebx,%xmm1
1332	adcl	$0,%ebx
1333	movd	%ecx,%xmm2
1334	adcl	$0,%ecx
1335	movd	%edx,%xmm3
1336	adcl	$0,%edx
1337	adcl	$0,%esi
/* %esi = -(g >> 130): all ones when g overflowed 2^130, else zero. */
1338	shrl	$2,%esi
1339	negl	%esi
1340	andl	%esi,%eax
1341	andl	%esi,%ebx
1342	andl	%esi,%ecx
1343	andl	%esi,%edx
/* Park (g & mask) in the output buffer and recover h from the XMM regs. */
1344	movl	%eax,(%edi)
1345	movd	%xmm0,%eax
1346	movl	%ebx,4(%edi)
1347	movd	%xmm1,%ebx
1348	movl	%ecx,8(%edi)
1349	movd	%xmm2,%ecx
1350	movl	%edx,12(%edi)
1351	movd	%xmm3,%edx
/* result = (h & ~mask) | (g & mask). */
1352	notl	%esi
1353	andl	%esi,%eax
1354	andl	%esi,%ebx
1355	orl	(%edi),%eax
1356	andl	%esi,%ecx
1357	orl	4(%edi),%ebx
1358	andl	%esi,%edx
1359	orl	8(%edi),%ecx
1360	orl	12(%edi),%edx
/* 128-bit add of the words at arg2; the stores are interleaved with the
   adcl chain (movl does not affect the carry flag). */
1361	addl	(%ebp),%eax
1362	adcl	4(%ebp),%ebx
1363	movl	%eax,(%edi)
1364	adcl	8(%ebp),%ecx
1365	movl	%ebx,4(%edi)
1366	adcl	12(%ebp),%edx
1367	movl	%ecx,8(%edi)
1368	movl	%edx,12(%edi)
1369	popl	%edi
1370	popl	%esi
1371	popl	%ebx
1372	popl	%ebp
1373	ret
1374.size	_poly1305_emit_sse2,.-_poly1305_emit_sse2
/*
 * Constant table addressed through %ebx by the SSE2 routines.
 *   +0   16777216 = 1 << 24 in the low dword of each 64-bit lane; OR-ed
 *        into limb 4 of every input block ("por (%ebx),%xmm4").
 *   +32  zeros (not referenced by the code in this file).
 *   +64  67108863 = 0x3ffffff, the 26-bit limb mask.
 *   +96  0x0fffffff, 0x0ffffffc x3: the same values poly1305_init uses as
 *        immediates to clamp the key (not referenced via %ebx in this file).
 */
1375.align	64
1376.Lconst_sse2:
1377.long	16777216,0,16777216,0,16777216,0,16777216,0
1378.long	0,0,0,0,0,0,0,0
1379.long	67108863,0,67108863,0,67108863,0,67108863,0
1380.long	268435455,268435452,268435452,268435452
/* NUL-terminated ASCII identification string:
   "Poly1305 for x86, CRYPTOGAMS by <appro@openssl.org>". */
1381.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54
1382.byte	44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
1383.byte	60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
1384.byte	114,103,62,0
1385.align	4
/* Common-symbol declaration (16 bytes, 4-byte aligned) of the capability
   vector read by poly1305_init. */
1386.comm	OPENSSL_ia32cap_P,16,4
1387
/*
 * ELF note in .note.gnu.property. Layout as emitted below:
 *   namesz = 4 ("GNU\0"), descsz = size of the property record,
 *   type   = 5,
 *   one property record: type 0xc0000002, datasz 4, data 3, padded to 4.
 * NOTE(review): type 5 / 0xc0000002 / value 3 presumably correspond to
 * NT_GNU_PROPERTY_TYPE_0 / GNU_PROPERTY_X86_FEATURE_1_AND / IBT|SHSTK --
 * confirm against the x86 psABI. Unlike the ENDBR32 bytes in the
 * functions above, this note is not guarded by #ifdef __CET__ here.
 */
1388	.section ".note.gnu.property", "a"
1389	.p2align 2
/* Name size: bytes between local labels 0 and 1. */
1390	.long 1f - 0f
/* Descriptor size: bytes between local labels 1 and 4. */
1391	.long 4f - 1f
/* Note type. */
1392	.long 5
13930:
1394	.asciz "GNU"
13951:
1396	.p2align 2
/* Property type, followed by the size of its data (labels 2..3). */
1397	.long 0xc0000002
1398	.long 3f - 2f
13992:
/* Property data word. */
1400	.long 3
14013:
1402	.p2align 2
14034:
1404