/* rsaz-x86_64.S revision 356290 */
1/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/rsaz-x86_64.S 356290 2020-01-02 21:35:28Z jkim $ */
2/* Do not modify. This file is auto-generated from rsaz-x86_64.pl. */
3.text
4
5
6
/*
 * rsaz_512_sqr(out, inp, mod, n0, times)
 *   %rdi = out      result, 8 qwords (written by __rsaz_512_subtract)
 *   %rsi = inp      input, 8 qwords
 *   %rdx = mod      modulus, 8 qwords (parked in %xmm1 for the duration)
 *   %rcx = n0       Montgomery constant, saved at 128(%rsp)
 *                   (presumably -mod^-1 mod 2^64 -- standard for this
 *                   reduction shape; see __rsaz_512_reduce)
 *   %r8d = times    number of consecutive Montgomery squarings
 *
 * Repeatedly Montgomery-squares inp, feeding each result back as the
 * next iteration's input (out is copied to %rsi at the bottom of the
 * loop).  Two code paths: a classic mul/adc path (.Loop_sqr) and a
 * MULX/ADCX/ADOX path (.Loop_sqrx) selected at run time when both the
 * BMI2 and ADX CPUID bits ($0x80100 in OPENSSL_ia32cap_P+8) are set.
 * Frame: 128 bytes of temporaries + saved n0 at 128(%rsp) + loop
 * counter at 128+8(%rsp).
 */
.globl	rsaz_512_sqr
.type	rsaz_512_sqr,@function
.align	32
rsaz_512_sqr:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	subq	$128+24,%rsp
.Lsqr_body:
.byte	102,72,15,110,202	/* movq %rdx,%xmm1 -- park modulus pointer */
	movq	(%rsi),%rdx
	movq	8(%rsi),%rax
	movq	%rcx,128(%rsp)		/* save n0 */
	movl	$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	$0x80100,%r11d		/* BMI2+ADX available? */
	je	.Loop_sqrx
	jmp	.Loop_sqr

/*
 * Plain-integer path: schoolbook 512-bit squaring.  For each word a[i]
 * the cross products a[i]*a[i+1..7] are accumulated, the running sum is
 * doubled, and the diagonal term a[i]^2 is folded in; tmp[0..15] is
 * built up at 0..120(%rsp).
 */
.align	32
.Loop_sqr:
	movl	%r8d,128+8(%rsp)	/* stash iteration counter */

	/* row 0: a[0]*a[1..7] */
	movq	%rdx,%rbx		/* rbx = a[0] */
	movq	%rax,%rbp		/* rbp = a[1] */
	mulq	%rdx
	movq	%rax,%r8
	movq	16(%rsi),%rax
	movq	%rdx,%r9

	mulq	%rbx
	addq	%rax,%r9
	movq	24(%rsi),%rax
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r10
	movq	32(%rsi),%rax
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r11
	movq	40(%rsi),%rax
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r12
	movq	48(%rsi),%rax
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r13
	movq	56(%rsi),%rax
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	addq	%rax,%r14
	movq	%rbx,%rax
	adcq	$0,%rdx

	/* double low cross term, add diagonal a[0]^2 -> tmp[0..1] */
	xorq	%rcx,%rcx
	addq	%r8,%r8
	movq	%rdx,%r15
	adcq	$0,%rcx

	mulq	%rax
	addq	%r8,%rdx
	adcq	$0,%rcx

	movq	%rax,(%rsp)
	movq	%rdx,8(%rsp)

	/* row 1: a[1]*a[2..7] */
	movq	16(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r10
	movq	24(%rsi),%rax
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%rbp
	addq	%rax,%r11
	movq	32(%rsi),%rax
	adcq	$0,%rdx
	addq	%rbx,%r11
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%rbp
	addq	%rax,%r12
	movq	40(%rsi),%rax
	adcq	$0,%rdx
	addq	%rbx,%r12
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%rbp
	addq	%rax,%r13
	movq	48(%rsi),%rax
	adcq	$0,%rdx
	addq	%rbx,%r13
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%rbp
	addq	%rax,%r14
	movq	56(%rsi),%rax
	adcq	$0,%rdx
	addq	%rbx,%r14
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%rbp
	addq	%rax,%r15
	movq	%rbp,%rax
	adcq	$0,%rdx
	addq	%rbx,%r15
	adcq	$0,%rdx

	/* double, fold in a[1]^2 -> tmp[2..3] */
	xorq	%rbx,%rbx
	addq	%r9,%r9
	movq	%rdx,%r8
	adcq	%r10,%r10
	adcq	$0,%rbx

	mulq	%rax

	addq	%rcx,%rax
	movq	16(%rsi),%rbp		/* next multiplier a[2] */
	addq	%rax,%r9
	movq	24(%rsi),%rax
	adcq	%rdx,%r10
	adcq	$0,%rbx

	movq	%r9,16(%rsp)
	movq	%r10,24(%rsp)

	/* row 2: a[2]*a[3..7] */
	mulq	%rbp
	addq	%rax,%r12
	movq	32(%rsi),%rax
	movq	%rdx,%rcx
	adcq	$0,%rcx

	mulq	%rbp
	addq	%rax,%r13
	movq	40(%rsi),%rax
	adcq	$0,%rdx
	addq	%rcx,%r13
	movq	%rdx,%rcx
	adcq	$0,%rcx

	mulq	%rbp
	addq	%rax,%r14
	movq	48(%rsi),%rax
	adcq	$0,%rdx
	addq	%rcx,%r14
	movq	%rdx,%rcx
	adcq	$0,%rcx

	mulq	%rbp
	addq	%rax,%r15
	movq	56(%rsi),%rax
	adcq	$0,%rdx
	addq	%rcx,%r15
	movq	%rdx,%rcx
	adcq	$0,%rcx

	mulq	%rbp
	addq	%rax,%r8
	movq	%rbp,%rax
	adcq	$0,%rdx
	addq	%rcx,%r8
	adcq	$0,%rdx

	/* double, fold in a[2]^2 -> tmp[4..5] */
	xorq	%rcx,%rcx
	addq	%r11,%r11
	movq	%rdx,%r9
	adcq	%r12,%r12
	adcq	$0,%rcx

	mulq	%rax

	addq	%rbx,%rax
	movq	24(%rsi),%r10		/* next multiplier a[3] */
	addq	%rax,%r11
	movq	32(%rsi),%rax
	adcq	%rdx,%r12
	adcq	$0,%rcx

	movq	%r11,32(%rsp)
	movq	%r12,40(%rsp)

	/* row 3: a[3]*a[4..7]; a[4],a[5],a[6] cached in r11/r12/rbp */
	movq	%rax,%r11
	mulq	%r10
	addq	%rax,%r14
	movq	40(%rsi),%rax
	movq	%rdx,%rbx
	adcq	$0,%rbx

	movq	%rax,%r12
	mulq	%r10
	addq	%rax,%r15
	movq	48(%rsi),%rax
	adcq	$0,%rdx
	addq	%rbx,%r15
	movq	%rdx,%rbx
	adcq	$0,%rbx

	movq	%rax,%rbp
	mulq	%r10
	addq	%rax,%r8
	movq	56(%rsi),%rax
	adcq	$0,%rdx
	addq	%rbx,%r8
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%r10
	addq	%rax,%r9
	movq	%r10,%rax
	adcq	$0,%rdx
	addq	%rbx,%r9
	adcq	$0,%rdx

	/* double, fold in a[3]^2 -> tmp[6..7] */
	xorq	%rbx,%rbx
	addq	%r13,%r13
	movq	%rdx,%r10
	adcq	%r14,%r14
	adcq	$0,%rbx

	mulq	%rax

	addq	%rcx,%rax
	addq	%rax,%r13
	movq	%r12,%rax
	adcq	%rdx,%r14
	adcq	$0,%rbx

	movq	%r13,48(%rsp)
	movq	%r14,56(%rsp)

	/* row 4: a[4]*a[5..7] (a[4] in %r11) */
	mulq	%r11
	addq	%rax,%r8
	movq	%rbp,%rax
	movq	%rdx,%rcx
	adcq	$0,%rcx

	mulq	%r11
	addq	%rax,%r9
	movq	56(%rsi),%rax
	adcq	$0,%rdx
	addq	%rcx,%r9
	movq	%rdx,%rcx
	adcq	$0,%rcx

	movq	%rax,%r14		/* cache a[7] */
	mulq	%r11
	addq	%rax,%r10
	movq	%r11,%rax
	adcq	$0,%rdx
	addq	%rcx,%r10
	adcq	$0,%rdx

	/* double, fold in a[4]^2 -> tmp[8..9] */
	xorq	%rcx,%rcx
	addq	%r15,%r15
	movq	%rdx,%r11
	adcq	%r8,%r8
	adcq	$0,%rcx

	mulq	%rax

	addq	%rbx,%rax
	addq	%rax,%r15
	movq	%rbp,%rax
	adcq	%rdx,%r8
	adcq	$0,%rcx

	movq	%r15,64(%rsp)
	movq	%r8,72(%rsp)

	/* row 5: a[5]*a[6..7] (a[5] in %r12) */
	mulq	%r12
	addq	%rax,%r10
	movq	%r14,%rax
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%r12
	addq	%rax,%r11
	movq	%r12,%rax
	adcq	$0,%rdx
	addq	%rbx,%r11
	adcq	$0,%rdx

	/* double, fold in a[5]^2 -> tmp[10..11] */
	xorq	%rbx,%rbx
	addq	%r9,%r9
	movq	%rdx,%r12
	adcq	%r10,%r10
	adcq	$0,%rbx

	mulq	%rax

	addq	%rcx,%rax
	addq	%rax,%r9
	movq	%r14,%rax
	adcq	%rdx,%r10
	adcq	$0,%rbx

	movq	%r9,80(%rsp)
	movq	%r10,88(%rsp)

	/* row 6: a[6]*a[7] (a[6] in %rbp) */
	mulq	%rbp
	addq	%rax,%r12
	movq	%rbp,%rax
	adcq	$0,%rdx

	/* double, fold in a[6]^2 -> tmp[12..13] */
	xorq	%rcx,%rcx
	addq	%r11,%r11
	movq	%rdx,%r13
	adcq	%r12,%r12
	adcq	$0,%rcx

	mulq	%rax

	addq	%rbx,%rax
	addq	%rax,%r11
	movq	%r14,%rax
	adcq	%rdx,%r12
	adcq	$0,%rcx

	movq	%r11,96(%rsp)
	movq	%r12,104(%rsp)

	/* final diagonal a[7]^2 -> tmp[14..15] */
	xorq	%rbx,%rbx
	addq	%r13,%r13
	adcq	$0,%rbx

	mulq	%rax

	addq	%rcx,%rax
	addq	%r13,%rax
	adcq	%rbx,%rdx

	/* load low half of the 1024-bit square, restore modulus ptr */
	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15
.byte	102,72,15,126,205	/* movq %xmm1,%rbp -- modulus ptr for reduce */

	movq	%rax,112(%rsp)
	movq	%rdx,120(%rsp)

	call	__rsaz_512_reduce

	/* add the high half, then conditionally subtract the modulus */
	addq	64(%rsp),%r8
	adcq	72(%rsp),%r9
	adcq	80(%rsp),%r10
	adcq	88(%rsp),%r11
	adcq	96(%rsp),%r12
	adcq	104(%rsp),%r13
	adcq	112(%rsp),%r14
	adcq	120(%rsp),%r15
	sbbq	%rcx,%rcx		/* rcx = 0 or -1: borrow mask */

	call	__rsaz_512_subtract

	/* feed the result back in as next iteration's input */
	movq	%r8,%rdx
	movq	%r9,%rax
	movl	128+8(%rsp),%r8d
	movq	%rdi,%rsi

	decl	%r8d
	jnz	.Loop_sqr
	jmp	.Lsqr_tail

/*
 * BMI2/ADX path: same squaring using MULX (flag-free multiply) with the
 * ADCX/ADOX dual carry chains.  %rbp is kept zero as a carry sink; %rdi
 * is used as scratch, so the out pointer is parked in %xmm0.
 */
.align	32
.Loop_sqrx:
	movl	%r8d,128+8(%rsp)	/* stash iteration counter */
.byte	102,72,15,110,199	/* movq %rdi,%xmm0 -- park out ptr */

	mulxq	%rax,%r8,%r9		/* a[0]*a[1] */
	movq	%rax,%rbx

	mulxq	16(%rsi),%rcx,%r10
	xorq	%rbp,%rbp		/* rbp = 0 carry sink; clears CF/OF */

	mulxq	24(%rsi),%rax,%r11
	adcxq	%rcx,%r9

.byte	0xc4,0x62,0xf3,0xf6,0xa6,0x20,0x00,0x00,0x00	/* mulxq 32(%rsi),%rcx,%r12 */
	adcxq	%rax,%r10

.byte	0xc4,0x62,0xfb,0xf6,0xae,0x28,0x00,0x00,0x00	/* mulxq 40(%rsi),%rax,%r13 */
	adcxq	%rcx,%r11

	mulxq	48(%rsi),%rcx,%r14
	adcxq	%rax,%r12
	adcxq	%rcx,%r13

	mulxq	56(%rsi),%rax,%r15
	adcxq	%rax,%r14
	adcxq	%rbp,%r15

	mulxq	%rdx,%rax,%rdi		/* a[0]^2 */
	movq	%rbx,%rdx		/* rdx = a[1] for next row */
	xorq	%rcx,%rcx
	adoxq	%r8,%r8
	adcxq	%rdi,%r8
	adoxq	%rbp,%rcx
	adcxq	%rbp,%rcx

	movq	%rax,(%rsp)
	movq	%r8,8(%rsp)

	/* row 1: a[1]*a[2..7] */
.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x10,0x00,0x00,0x00	/* mulxq 16(%rsi),%rax,%rbx */
	adoxq	%rax,%r10
	adcxq	%rbx,%r11

	mulxq	24(%rsi),%rdi,%r8
	adoxq	%rdi,%r11
.byte	0x66			/* padding prefix (alignment) */
	adcxq	%r8,%r12

	mulxq	32(%rsi),%rax,%rbx
	adoxq	%rax,%r12
	adcxq	%rbx,%r13

	mulxq	40(%rsi),%rdi,%r8
	adoxq	%rdi,%r13
	adcxq	%r8,%r14

.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00	/* mulxq 48(%rsi),%rax,%rbx */
	adoxq	%rax,%r14
	adcxq	%rbx,%r15

.byte	0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00	/* mulxq 56(%rsi),%rdi,%r8 */
	adoxq	%rdi,%r15
	adcxq	%rbp,%r8
	mulxq	%rdx,%rax,%rdi		/* a[1]^2 */
	adoxq	%rbp,%r8
.byte	0x48,0x8b,0x96,0x10,0x00,0x00,0x00	/* movq 16(%rsi),%rdx */

	xorq	%rbx,%rbx
	adoxq	%r9,%r9

	adcxq	%rcx,%rax
	adoxq	%r10,%r10
	adcxq	%rax,%r9
	adoxq	%rbp,%rbx
	adcxq	%rdi,%r10
	adcxq	%rbp,%rbx

	movq	%r9,16(%rsp)
.byte	0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00	/* movq %r10,24(%rsp) */

	/* row 2: a[2]*a[3..7] */
	mulxq	24(%rsi),%rdi,%r9
	adoxq	%rdi,%r12
	adcxq	%r9,%r13

	mulxq	32(%rsi),%rax,%rcx
	adoxq	%rax,%r13
	adcxq	%rcx,%r14

.byte	0xc4,0x62,0xc3,0xf6,0x8e,0x28,0x00,0x00,0x00	/* mulxq 40(%rsi),%rdi,%r9 */
	adoxq	%rdi,%r14
	adcxq	%r9,%r15

.byte	0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00	/* mulxq 48(%rsi),%rax,%rcx */
	adoxq	%rax,%r15
	adcxq	%rcx,%r8

	mulxq	56(%rsi),%rdi,%r9
	adoxq	%rdi,%r8
	adcxq	%rbp,%r9
	mulxq	%rdx,%rax,%rdi		/* a[2]^2 */
	adoxq	%rbp,%r9
	movq	24(%rsi),%rdx

	xorq	%rcx,%rcx
	adoxq	%r11,%r11

	adcxq	%rbx,%rax
	adoxq	%r12,%r12
	adcxq	%rax,%r11
	adoxq	%rbp,%rcx
	adcxq	%rdi,%r12
	adcxq	%rbp,%rcx

	movq	%r11,32(%rsp)
	movq	%r12,40(%rsp)

	/* row 3: a[3]*a[4..7] */
	mulxq	32(%rsi),%rax,%rbx
	adoxq	%rax,%r14
	adcxq	%rbx,%r15

	mulxq	40(%rsi),%rdi,%r10
	adoxq	%rdi,%r15
	adcxq	%r10,%r8

	mulxq	48(%rsi),%rax,%rbx
	adoxq	%rax,%r8
	adcxq	%rbx,%r9

	mulxq	56(%rsi),%rdi,%r10
	adoxq	%rdi,%r9
	adcxq	%rbp,%r10
	mulxq	%rdx,%rax,%rdi		/* a[3]^2 */
	adoxq	%rbp,%r10
	movq	32(%rsi),%rdx

	xorq	%rbx,%rbx
	adoxq	%r13,%r13

	adcxq	%rcx,%rax
	adoxq	%r14,%r14
	adcxq	%rax,%r13
	adoxq	%rbp,%rbx
	adcxq	%rdi,%r14
	adcxq	%rbp,%rbx

	movq	%r13,48(%rsp)
	movq	%r14,56(%rsp)

	/* row 4: a[4]*a[5..7] */
	mulxq	40(%rsi),%rdi,%r11
	adoxq	%rdi,%r8
	adcxq	%r11,%r9

	mulxq	48(%rsi),%rax,%rcx
	adoxq	%rax,%r9
	adcxq	%rcx,%r10

	mulxq	56(%rsi),%rdi,%r11
	adoxq	%rdi,%r10
	adcxq	%rbp,%r11
	mulxq	%rdx,%rax,%rdi		/* a[4]^2 */
	movq	40(%rsi),%rdx
	adoxq	%rbp,%r11

	xorq	%rcx,%rcx
	adoxq	%r15,%r15

	adcxq	%rbx,%rax
	adoxq	%r8,%r8
	adcxq	%rax,%r15
	adoxq	%rbp,%rcx
	adcxq	%rdi,%r8
	adcxq	%rbp,%rcx

	movq	%r15,64(%rsp)
	movq	%r8,72(%rsp)

	/* row 5: a[5]*a[6..7] */
.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00	/* mulxq 48(%rsi),%rax,%rbx */
	adoxq	%rax,%r10
	adcxq	%rbx,%r11

.byte	0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00	/* mulxq 56(%rsi),%rdi,%r12 */
	adoxq	%rdi,%r11
	adcxq	%rbp,%r12
	mulxq	%rdx,%rax,%rdi		/* a[5]^2 */
	adoxq	%rbp,%r12
	movq	48(%rsi),%rdx

	xorq	%rbx,%rbx
	adoxq	%r9,%r9

	adcxq	%rcx,%rax
	adoxq	%r10,%r10
	adcxq	%rax,%r9
	adcxq	%rdi,%r10
	adoxq	%rbp,%rbx
	adcxq	%rbp,%rbx

	movq	%r9,80(%rsp)
	movq	%r10,88(%rsp)

	/* row 6: a[6]*a[7], diagonal a[6]^2 */
.byte	0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00	/* mulxq 56(%rsi),%rax,%r13 */
	adoxq	%rax,%r12
	adoxq	%rbp,%r13

	mulxq	%rdx,%rax,%rdi		/* a[6]^2 */
	xorq	%rcx,%rcx
	movq	56(%rsi),%rdx
	adoxq	%r11,%r11

	adcxq	%rbx,%rax
	adoxq	%r12,%r12
	adcxq	%rax,%r11
	adoxq	%rbp,%rcx
	adcxq	%rdi,%r12
	adcxq	%rbp,%rcx

.byte	0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00		/* movq %r11,96(%rsp) */
.byte	0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00		/* movq %r12,104(%rsp) */

	/* final diagonal a[7]^2 -> tmp[14..15] */
	mulxq	%rdx,%rax,%rdx
	xorq	%rbx,%rbx
	adoxq	%r13,%r13

	adcxq	%rcx,%rax
	adoxq	%rbp,%rbx
	adcxq	%r13,%rax
	adcxq	%rdx,%rbx

.byte	102,72,15,126,199	/* movq %xmm0,%rdi -- restore out ptr */
.byte	102,72,15,126,205	/* movq %xmm1,%rbp -- modulus ptr for reduce */

	movq	128(%rsp),%rdx		/* rdx = n0 for __rsaz_512_reducex */
	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	movq	%rax,112(%rsp)
	movq	%rbx,120(%rsp)

	call	__rsaz_512_reducex

	/* add the high half, then conditionally subtract the modulus */
	addq	64(%rsp),%r8
	adcq	72(%rsp),%r9
	adcq	80(%rsp),%r10
	adcq	88(%rsp),%r11
	adcq	96(%rsp),%r12
	adcq	104(%rsp),%r13
	adcq	112(%rsp),%r14
	adcq	120(%rsp),%r15
	sbbq	%rcx,%rcx		/* rcx = 0 or -1: borrow mask */

	call	__rsaz_512_subtract

	/* feed the result back in as next iteration's input */
	movq	%r8,%rdx
	movq	%r9,%rax
	movl	128+8(%rsp),%r8d
	movq	%rdi,%rsi

	decl	%r8d
	jnz	.Loop_sqrx

.Lsqr_tail:

	/* restore callee-saved registers and return */
	leaq	128+24+48(%rsp),%rax
	movq	-48(%rax),%r15
	movq	-40(%rax),%r14
	movq	-32(%rax),%r13
	movq	-24(%rax),%r12
	movq	-16(%rax),%rbp
	movq	-8(%rax),%rbx
	leaq	(%rax),%rsp
.Lsqr_epilogue:
	.byte	0xf3,0xc3	/* rep ret */
.size	rsaz_512_sqr,.-rsaz_512_sqr
/*
 * rsaz_512_mul(out, a, b, mod, n0)
 *   %rdi = out   result, 8 qwords (parked in %xmm0 across the multiply)
 *   %rsi = a     multiplicand, 8 qwords
 *   %rdx = b     multiplier, 8 qwords
 *   %rcx = mod   modulus, 8 qwords (parked in %xmm1)
 *   %r8  = n0    Montgomery constant, saved at 128(%rsp)
 *
 * One Montgomery multiplication: 512x512 -> 1024-bit product on the
 * stack, Montgomery reduction, add of the high half, and a conditional
 * subtraction of the modulus.  Dispatches at run time between the
 * mul/adc helper (__rsaz_512_mul) and the MULX/ADX helper
 * (__rsaz_512_mulx) on the BMI2+ADX CPUID bits.
 */
.globl	rsaz_512_mul
.type	rsaz_512_mul,@function
.align	32
rsaz_512_mul:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	subq	$128+24,%rsp
.Lmul_body:
.byte	102,72,15,110,199	/* movq %rdi,%xmm0 -- park out ptr */
.byte	102,72,15,110,201	/* movq %rcx,%xmm1 -- park modulus ptr */
	movq	%r8,128(%rsp)		/* save n0 */
	movl	$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	$0x80100,%r11d		/* BMI2+ADX available? */
	je	.Lmulx
	movq	(%rdx),%rbx		/* rbx = b[0] */
	movq	%rdx,%rbp		/* rbp = b for __rsaz_512_mul */
	call	__rsaz_512_mul

.byte	102,72,15,126,199	/* movq %xmm0,%rdi -- restore out ptr */
.byte	102,72,15,126,205	/* movq %xmm1,%rbp -- modulus ptr */

	/* low half of the product into r8..r15 for reduction */
	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	call	__rsaz_512_reduce
	jmp	.Lmul_tail

.align	32
.Lmulx:
	movq	%rdx,%rbp		/* rbp = b */
	movq	(%rdx),%rdx		/* rdx = b[0], implicit mulx operand */
	call	__rsaz_512_mulx

.byte	102,72,15,126,199	/* movq %xmm0,%rdi -- restore out ptr */
.byte	102,72,15,126,205	/* movq %xmm1,%rbp -- modulus ptr */

	movq	128(%rsp),%rdx		/* rdx = n0 for __rsaz_512_reducex */
	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	call	__rsaz_512_reducex
.Lmul_tail:
	/* add high half, conditionally subtract the modulus, store */
	addq	64(%rsp),%r8
	adcq	72(%rsp),%r9
	adcq	80(%rsp),%r10
	adcq	88(%rsp),%r11
	adcq	96(%rsp),%r12
	adcq	104(%rsp),%r13
	adcq	112(%rsp),%r14
	adcq	120(%rsp),%r15
	sbbq	%rcx,%rcx		/* rcx = 0 or -1: borrow mask */

	call	__rsaz_512_subtract

	/* restore callee-saved registers and return */
	leaq	128+24+48(%rsp),%rax
	movq	-48(%rax),%r15
	movq	-40(%rax),%r14
	movq	-32(%rax),%r13
	movq	-24(%rax),%r12
	movq	-16(%rax),%rbp
	movq	-8(%rax),%rbx
	leaq	(%rax),%rsp
.Lmul_epilogue:
	.byte	0xf3,0xc3	/* rep ret */
.size	rsaz_512_mul,.-rsaz_512_mul
/*
 * rsaz_512_mul_gather4(out, a, table, mod, n0, power)
 *   %rdi = out    result, 8 qwords
 *   %rsi = a      multiplicand, 8 qwords
 *   %rdx = table  table of 16 interleaved 512-bit values (stride 8 bytes
 *                 per entry within each 128-byte row)
 *   %rcx = mod    modulus pointer, spilled to 128+16(%rsp)
 *   %r8  = n0     Montgomery constant, spilled to 128(%rsp)
 *   %r9d = power  index 0..15 of the table entry to multiply by
 *
 * Montgomery-multiplies a by table[power].  The table entry is gathered
 * one 64-bit word per 128-byte row using SSE2 compare masks (xmm0-xmm7
 * select the lane matching `power`), so the memory access pattern is
 * independent of `power` -- a cache-timing countermeasure.  Dispatches
 * between a mul/adc gather loop and a MULX/ADX gather loop on the
 * BMI2+ADX CPUID bits.
 */
.globl	rsaz_512_mul_gather4
.type	rsaz_512_mul_gather4,@function
.align	32
rsaz_512_mul_gather4:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	subq	$152,%rsp
.Lmul_gather4_body:
	/* build 8 lane-select masks: xmm_i = (i*2,i*2+1 == power) ? ~0 : 0 */
	movd	%r9d,%xmm8
	movdqa	.Linc+16(%rip),%xmm1
	movdqa	.Linc(%rip),%xmm0

	pshufd	$0,%xmm8,%xmm8		/* broadcast power */
	movdqa	%xmm1,%xmm7
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm8,%xmm0
	movdqa	%xmm7,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm8,%xmm1
	movdqa	%xmm7,%xmm4
	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm8,%xmm2
	movdqa	%xmm7,%xmm5
	paddd	%xmm3,%xmm4
	pcmpeqd	%xmm8,%xmm3
	movdqa	%xmm7,%xmm6
	paddd	%xmm4,%xmm5
	pcmpeqd	%xmm8,%xmm4
	paddd	%xmm5,%xmm6
	pcmpeqd	%xmm8,%xmm5
	paddd	%xmm6,%xmm7
	pcmpeqd	%xmm8,%xmm6
	pcmpeqd	%xmm8,%xmm7

	/* gather b[0]: AND every table lane with its mask, OR together */
	movdqa	0(%rdx),%xmm8
	movdqa	16(%rdx),%xmm9
	movdqa	32(%rdx),%xmm10
	movdqa	48(%rdx),%xmm11
	pand	%xmm0,%xmm8
	movdqa	64(%rdx),%xmm12
	pand	%xmm1,%xmm9
	movdqa	80(%rdx),%xmm13
	pand	%xmm2,%xmm10
	movdqa	96(%rdx),%xmm14
	pand	%xmm3,%xmm11
	movdqa	112(%rdx),%xmm15
	leaq	128(%rdx),%rbp		/* rbp = next table row */
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	$0x4e,%xmm8,%xmm9	/* fold high qword onto low */
	por	%xmm9,%xmm8
	movl	$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	$0x80100,%r11d		/* BMI2+ADX available? */
	je	.Lmulx_gather
.byte	102,76,15,126,195	/* movq %xmm8,%rbx -- gathered b[0] */

	movq	%r8,128(%rsp)		/* save n0 */
	movq	%rdi,128+8(%rsp)	/* save out ptr */
	movq	%rcx,128+16(%rsp)	/* save modulus ptr */

	/* first row: a[0..7] * b[0] -> (%rsp), r8..r15 */
	movq	(%rsi),%rax
	movq	8(%rsi),%rcx
	mulq	%rbx
	movq	%rax,(%rsp)
	movq	%rcx,%rax
	movq	%rdx,%r8

	mulq	%rbx
	addq	%rax,%r8
	movq	16(%rsi),%rax
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r9
	movq	24(%rsi),%rax
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r10
	movq	32(%rsi),%rax
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r11
	movq	40(%rsi),%rax
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r12
	movq	48(%rsi),%rax
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r13
	movq	56(%rsi),%rax
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	addq	%rax,%r14
	movq	(%rsi),%rax
	movq	%rdx,%r15
	adcq	$0,%r15

	leaq	8(%rsp),%rdi		/* rdi = product write cursor */
	movl	$7,%ecx			/* 7 remaining rows */
	jmp	.Loop_mul_gather

/*
 * Per-iteration: gather the next b[i] with the same constant-time mask
 * scheme, then accumulate a[0..7]*b[i] into the running product.
 */
.align	32
.Loop_mul_gather:
	movdqa	0(%rbp),%xmm8
	movdqa	16(%rbp),%xmm9
	movdqa	32(%rbp),%xmm10
	movdqa	48(%rbp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	64(%rbp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	80(%rbp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	96(%rbp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	112(%rbp),%xmm15
	leaq	128(%rbp),%rbp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
.byte	102,76,15,126,195	/* movq %xmm8,%rbx -- gathered b[i] */

	mulq	%rbx
	addq	%rax,%r8
	movq	8(%rsi),%rax
	movq	%r8,(%rdi)		/* emit lowest limb */
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rsi),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rsi),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rsi),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	addq	%rax,%r15
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	leaq	8(%rdi),%rdi

	decl	%ecx
	jnz	.Loop_mul_gather

	/* store the top 8 limbs of the 1024-bit product */
	movq	%r8,(%rdi)
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	movq	128+8(%rsp),%rdi	/* restore out ptr */
	movq	128+16(%rsp),%rbp	/* restore modulus ptr */

	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	call	__rsaz_512_reduce
	jmp	.Lmul_gather_tail

/*
 * BMI2/ADX variant of the same gather-multiply.  %rdi is a zero carry
 * sink here; %rcx counts iterations from -7 up to 0.
 */
.align	32
.Lmulx_gather:
.byte	102,76,15,126,194	/* movq %xmm8,%rdx -- gathered b[0] */

	movq	%r8,128(%rsp)		/* save n0 */
	movq	%rdi,128+8(%rsp)	/* save out ptr */
	movq	%rcx,128+16(%rsp)	/* save modulus ptr */

	mulxq	(%rsi),%rbx,%r8
	movq	%rbx,(%rsp)
	xorl	%edi,%edi		/* rdi = 0 carry sink; clears CF */

	mulxq	8(%rsi),%rax,%r9

	mulxq	16(%rsi),%rbx,%r10
	adcxq	%rax,%r8

	mulxq	24(%rsi),%rax,%r11
	adcxq	%rbx,%r9

	mulxq	32(%rsi),%rbx,%r12
	adcxq	%rax,%r10

	mulxq	40(%rsi),%rax,%r13
	adcxq	%rbx,%r11

	mulxq	48(%rsi),%rbx,%r14
	adcxq	%rax,%r12

	mulxq	56(%rsi),%rax,%r15
	adcxq	%rbx,%r13
	adcxq	%rax,%r14
.byte	0x67			/* padding prefix (alignment) */
	movq	%r8,%rbx
	adcxq	%rdi,%r15

	movq	$-7,%rcx
	jmp	.Loop_mulx_gather

.align	32
.Loop_mulx_gather:
	movdqa	0(%rbp),%xmm8
	movdqa	16(%rbp),%xmm9
	movdqa	32(%rbp),%xmm10
	movdqa	48(%rbp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	64(%rbp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	80(%rbp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	96(%rbp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	112(%rbp),%xmm15
	leaq	128(%rbp),%rbp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
.byte	102,76,15,126,194	/* movq %xmm8,%rdx -- gathered b[i] */

.byte	0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00	/* mulxq 0(%rsi),%rax,%r8 */
	adcxq	%rax,%rbx
	adoxq	%r9,%r8

	mulxq	8(%rsi),%rax,%r9
	adcxq	%rax,%r8
	adoxq	%r10,%r9

	mulxq	16(%rsi),%rax,%r10
	adcxq	%rax,%r9
	adoxq	%r11,%r10

.byte	0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00	/* mulxq 24(%rsi),%rax,%r11 */
	adcxq	%rax,%r10
	adoxq	%r12,%r11

	mulxq	32(%rsi),%rax,%r12
	adcxq	%rax,%r11
	adoxq	%r13,%r12

	mulxq	40(%rsi),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	/* mulxq 48(%rsi),%rax,%r14 */
	adcxq	%rax,%r13
.byte	0x67			/* padding prefix (alignment) */
	adoxq	%r15,%r14

	mulxq	56(%rsi),%rax,%r15
	movq	%rbx,64(%rsp,%rcx,8)	/* emit lowest limb */
	adcxq	%rax,%r14
	adoxq	%rdi,%r15
	movq	%r8,%rbx
	adcxq	%rdi,%r15

	incq	%rcx			/* counts -7..0 */
	jnz	.Loop_mulx_gather

	/* store the top 8 limbs of the 1024-bit product */
	movq	%r8,64(%rsp)
	movq	%r9,64+8(%rsp)
	movq	%r10,64+16(%rsp)
	movq	%r11,64+24(%rsp)
	movq	%r12,64+32(%rsp)
	movq	%r13,64+40(%rsp)
	movq	%r14,64+48(%rsp)
	movq	%r15,64+56(%rsp)

	movq	128(%rsp),%rdx		/* rdx = n0 for __rsaz_512_reducex */
	movq	128+8(%rsp),%rdi	/* restore out ptr */
	movq	128+16(%rsp),%rbp	/* restore modulus ptr */

	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	call	__rsaz_512_reducex

.Lmul_gather_tail:
	/* add high half, conditionally subtract the modulus, store */
	addq	64(%rsp),%r8
	adcq	72(%rsp),%r9
	adcq	80(%rsp),%r10
	adcq	88(%rsp),%r11
	adcq	96(%rsp),%r12
	adcq	104(%rsp),%r13
	adcq	112(%rsp),%r14
	adcq	120(%rsp),%r15
	sbbq	%rcx,%rcx		/* rcx = 0 or -1: borrow mask */

	call	__rsaz_512_subtract

	/* restore callee-saved registers and return */
	leaq	128+24+48(%rsp),%rax
	movq	-48(%rax),%r15
	movq	-40(%rax),%r14
	movq	-32(%rax),%r13
	movq	-24(%rax),%r12
	movq	-16(%rax),%rbp
	movq	-8(%rax),%rbx
	leaq	(%rax),%rsp
.Lmul_gather4_epilogue:
	.byte	0xf3,0xc3	/* rep ret */
.size	rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
/*
 * rsaz_512_mul_scatter4(out, a, mod, n0, table, power)
 *   %rdi = out    also used as the multiplier source b (see below)
 *   %rsi = a      multiplicand, 8 qwords
 *   %rdx = mod    modulus (parked in %xmm1)
 *   %rcx = n0     Montgomery constant, saved at 128(%rsp)
 *   %r8  = table  scatter destination base
 *   %r9d = power  entry index; the result is scattered to
 *                 table + power*8 with a 128-byte stride per limb
 *
 * Montgomery-multiplies out by a (out doubles as the b operand: it is
 * copied to %rbp / dereferenced for b[0]) and scatters the 8 result
 * limbs into the gather table layout used by rsaz_512_mul_gather4.
 * Same run-time BMI2+ADX dispatch as the other entry points.
 */
.globl	rsaz_512_mul_scatter4
.type	rsaz_512_mul_scatter4,@function
.align	32
rsaz_512_mul_scatter4:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	movl	%r9d,%r9d		/* zero-extend power to 64 bits */
	subq	$128+24,%rsp
.Lmul_scatter4_body:
	leaq	(%r8,%r9,8),%r8		/* r8 = &table[power] */
.byte	102,72,15,110,199	/* movq %rdi,%xmm0 -- park out/b ptr */
.byte	102,72,15,110,202	/* movq %rdx,%xmm1 -- park modulus ptr */
.byte	102,73,15,110,208	/* movq %r8,%xmm2  -- park scatter ptr */
	movq	%rcx,128(%rsp)		/* save n0 */

	movq	%rdi,%rbp		/* rbp = b for __rsaz_512_mul */
	movl	$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	$0x80100,%r11d		/* BMI2+ADX available? */
	je	.Lmulx_scatter
	movq	(%rdi),%rbx		/* rbx = b[0] */
	call	__rsaz_512_mul

.byte	102,72,15,126,199	/* movq %xmm0,%rdi -- restore out ptr */
.byte	102,72,15,126,205	/* movq %xmm1,%rbp -- modulus ptr */

	/* low half of the product into r8..r15 for reduction */
	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	call	__rsaz_512_reduce
	jmp	.Lmul_scatter_tail

.align	32
.Lmulx_scatter:
	movq	(%rdi),%rdx		/* rdx = b[0], implicit mulx operand */
	call	__rsaz_512_mulx

.byte	102,72,15,126,199	/* movq %xmm0,%rdi -- restore out ptr */
.byte	102,72,15,126,205	/* movq %xmm1,%rbp -- modulus ptr */

	movq	128(%rsp),%rdx		/* rdx = n0 for __rsaz_512_reducex */
	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	call	__rsaz_512_reducex

.Lmul_scatter_tail:
	/* add high half, conditionally subtract the modulus */
	addq	64(%rsp),%r8
	adcq	72(%rsp),%r9
	adcq	80(%rsp),%r10
	adcq	88(%rsp),%r11
	adcq	96(%rsp),%r12
	adcq	104(%rsp),%r13
	adcq	112(%rsp),%r14
	adcq	120(%rsp),%r15
.byte	102,72,15,126,214	/* movq %xmm2,%rsi -- scatter destination */
	sbbq	%rcx,%rcx		/* rcx = 0 or -1: borrow mask */

	call	__rsaz_512_subtract

	/* scatter limbs at a 128-byte stride (gather-table layout) */
	movq	%r8,0(%rsi)
	movq	%r9,128(%rsi)
	movq	%r10,256(%rsi)
	movq	%r11,384(%rsi)
	movq	%r12,512(%rsi)
	movq	%r13,640(%rsi)
	movq	%r14,768(%rsi)
	movq	%r15,896(%rsi)

	/* restore callee-saved registers and return */
	leaq	128+24+48(%rsp),%rax
	movq	-48(%rax),%r15
	movq	-40(%rax),%r14
	movq	-32(%rax),%r13
	movq	-24(%rax),%r12
	movq	-16(%rax),%rbp
	movq	-8(%rax),%rbx
	leaq	(%rax),%rsp
.Lmul_scatter4_epilogue:
	.byte	0xf3,0xc3	/* rep ret */
.size	rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
/*
 * rsaz_512_mul_by_one(out, a, mod, n0)
 *   %rdi = out   result, 8 qwords
 *   %rsi = a     input, 8 qwords (loaded into r8..r15)
 *   %rdx = mod   modulus pointer (moved to %rbp for the reducers)
 *   %rcx = n0    Montgomery constant, saved at 128(%rsp)
 *
 * Montgomery reduction of a 1024-bit value whose high half is zero --
 * i.e. conversion of `a` out of Montgomery form (multiply by 1).  The
 * 128-byte temporary area is zeroed (only 112 bytes via movdqa; the
 * last 16 are the 112/120(%rsp) slots the reducers treat as the high
 * limbs' staging area left from the frame -- zeroed by the seven
 * movdqa stores covering 0..111 plus reads of 64..127 being the
 * cleared region).  No conditional subtract: result is stored as-is.
 */
.globl	rsaz_512_mul_by_one
.type	rsaz_512_mul_by_one,@function
.align	32
rsaz_512_mul_by_one:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	subq	$128+24,%rsp
.Lmul_by_one_body:
	movl	OPENSSL_ia32cap_P+8(%rip),%eax
	movq	%rdx,%rbp		/* rbp = modulus for the reducers */
	movq	%rcx,128(%rsp)		/* save n0 */

	movq	(%rsi),%r8
	pxor	%xmm0,%xmm0
	movq	8(%rsi),%r9
	movq	16(%rsi),%r10
	movq	24(%rsi),%r11
	movq	32(%rsi),%r12
	movq	40(%rsi),%r13
	movq	48(%rsi),%r14
	movq	56(%rsi),%r15

	/* zero the temporary area (high half of the virtual product) */
	movdqa	%xmm0,(%rsp)
	movdqa	%xmm0,16(%rsp)
	movdqa	%xmm0,32(%rsp)
	movdqa	%xmm0,48(%rsp)
	movdqa	%xmm0,64(%rsp)
	movdqa	%xmm0,80(%rsp)
	movdqa	%xmm0,96(%rsp)
	andl	$0x80100,%eax
	cmpl	$0x80100,%eax		/* BMI2+ADX available? */
	je	.Lby_one_callx
	call	__rsaz_512_reduce
	jmp	.Lby_one_tail
.align	32
.Lby_one_callx:
	movq	128(%rsp),%rdx		/* rdx = n0 for __rsaz_512_reducex */
	call	__rsaz_512_reducex
.Lby_one_tail:
	/* store the reduced value */
	movq	%r8,(%rdi)
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	/* restore callee-saved registers and return */
	leaq	128+24+48(%rsp),%rax
	movq	-48(%rax),%r15
	movq	-40(%rax),%r14
	movq	-32(%rax),%r13
	movq	-24(%rax),%r12
	movq	-16(%rax),%rbp
	movq	-8(%rax),%rbx
	leaq	(%rax),%rsp
.Lmul_by_one_epilogue:
	.byte	0xf3,0xc3	/* rep ret */
.size	rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
/*
 * __rsaz_512_reduce -- Montgomery reduction, mul/adc flavour.
 *   In:  r8..r15 = low 8 limbs of the value to reduce
 *        %rbp    = modulus pointer
 *        caller's 128(%rsp) = n0 (read here as 128+8(%rsp): the +8
 *        compensates for the return address pushed by `call`)
 *   Out: r8..r15 = reduced value (caller still adds the product's
 *        high half and conditionally subtracts the modulus)
 *   Clobbers: rax, rbx, rcx, rdx, rsi, flags
 *
 * Eight iterations; each computes m = limb0 * n0, adds m*mod, and
 * shifts the window down one limb (classic word-by-word Montgomery
 * reduction: `negq %r8` regenerates the carry out of the cancelled
 * low limb).
 */
.type	__rsaz_512_reduce,@function
.align	32
__rsaz_512_reduce:
	movq	%r8,%rbx
	imulq	128+8(%rsp),%rbx	/* rbx = m = r8 * n0 */
	movq	0(%rbp),%rax
	movl	$8,%ecx
	jmp	.Lreduction_loop

.align	32
.Lreduction_loop:
	mulq	%rbx			/* m * mod[0]; low half cancels r8 */
	movq	8(%rbp),%rax
	negq	%r8			/* recreate carry from the cancel */
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	128+8(%rsp),%rsi	/* rsi = n0 (for next m, computed early) */

	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rbp),%rax
	adcq	$0,%rdx
	imulq	%r8,%rsi		/* rsi = next m = new limb0 * n0 */
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	%rsi,%rbx		/* rotate in next m */
	addq	%rax,%r15
	movq	0(%rbp),%rax
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jne	.Lreduction_loop

	.byte	0xf3,0xc3	/* rep ret */
.size	__rsaz_512_reduce,.-__rsaz_512_reduce
/*
 * __rsaz_512_reducex -- Montgomery reduction, MULX/ADCX/ADOX flavour.
 *   In:  r8..r15 = low 8 limbs of the value to reduce
 *        %rbp    = modulus pointer
 *        %rdx    = n0 (also seeded as the first m via imul below)
 *        caller's 128(%rsp) = n0 (read here as 128+8(%rsp))
 *   Out: r8..r15 = reduced value
 *   Clobbers: rax, rbx, rcx, rdx, rsi (zeroed as carry sink), flags
 *
 * Same algorithm as __rsaz_512_reduce but with dual carry chains; the
 * next iteration's m is precomputed mid-chain via a mulx against n0
 * kept at caller's 128(%rsp).
 */
.type	__rsaz_512_reducex,@function
.align	32
__rsaz_512_reducex:

	imulq	%r8,%rdx		/* rdx = m = r8 * n0 */
	xorq	%rsi,%rsi		/* rsi = 0 carry sink; clears CF/OF */
	movl	$8,%ecx
	jmp	.Lreduction_loopx

.align	32
.Lreduction_loopx:
	movq	%r8,%rbx
	mulxq	0(%rbp),%rax,%r8	/* m*mod[0]; low cancels old limb0 */
	adcxq	%rbx,%rax
	adoxq	%r9,%r8

	mulxq	8(%rbp),%rax,%r9
	adcxq	%rax,%r8
	adoxq	%r10,%r9

	mulxq	16(%rbp),%rbx,%r10
	adcxq	%rbx,%r9
	adoxq	%r11,%r10

	mulxq	24(%rbp),%rbx,%r11
	adcxq	%rbx,%r10
	adoxq	%r12,%r11

.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	/* mulxq 32(%rbp),%rbx,%r12 */
	movq	%rdx,%rax		/* rax = current m */
	movq	%r8,%rdx
	adcxq	%rbx,%r11
	adoxq	%r13,%r12

	mulxq	128+8(%rsp),%rbx,%rdx	/* rbx = next m = new limb0 * n0 */
	movq	%rax,%rdx		/* restore current m */

	mulxq	40(%rbp),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

.byte	0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00	/* mulxq 48(%rbp),%rax,%r14 */
	adcxq	%rax,%r13
	adoxq	%r15,%r14

	mulxq	56(%rbp),%rax,%r15
	movq	%rbx,%rdx		/* rotate in next m */
	adcxq	%rax,%r14
	adoxq	%rsi,%r15
	adcxq	%rsi,%r15

	decl	%ecx
	jne	.Lreduction_loopx

	.byte	0xf3,0xc3	/* rep ret */
.size	__rsaz_512_reducex,.-__rsaz_512_reducex
/*
 * __rsaz_512_subtract -- constant-time conditional subtract of the
 * modulus.
 *   In:  r8..r15 = candidate result
 *        %rdi    = out pointer
 *        %rbp    = modulus pointer
 *        %rcx    = mask: all-ones to subtract the modulus, zero to
 *                  keep the value (callers produce it with sbb)
 *   Out: out[0..7] = result; r8..r15 = same value
 *   Clobbers: flags
 *
 * Adds (~mod + 1) & mask, i.e. (-mod) & mask, via neg/not of the limbs
 * masked with %rcx -- no branches, so timing is independent of whether
 * the subtraction happens.
 */
.type	__rsaz_512_subtract,@function
.align	32
__rsaz_512_subtract:
	/* store the unadjusted value first */
	movq	%r8,(%rdi)
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	/* r8..r15 = (-mod) & mask : negate limb 0, complement the rest */
	movq	0(%rbp),%r8
	movq	8(%rbp),%r9
	negq	%r8
	notq	%r9
	andq	%rcx,%r8
	movq	16(%rbp),%r10
	andq	%rcx,%r9
	notq	%r10
	movq	24(%rbp),%r11
	andq	%rcx,%r10
	notq	%r11
	movq	32(%rbp),%r12
	andq	%rcx,%r11
	notq	%r12
	movq	40(%rbp),%r13
	andq	%rcx,%r12
	notq	%r13
	movq	48(%rbp),%r14
	andq	%rcx,%r13
	notq	%r14
	movq	56(%rbp),%r15
	andq	%rcx,%r14
	notq	%r15
	andq	%rcx,%r15

	/* result += (-mod) & mask  (adds 0 when mask is clear) */
	addq	(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15

	movq	%r8,(%rdi)
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	.byte	0xf3,0xc3	/* rep ret */
.size	__rsaz_512_subtract,.-__rsaz_512_subtract
/*
 * __rsaz_512_mul -- 512x512 -> 1024-bit multiply, mul/adc flavour.
 *   In:  %rsi = a (8 qwords)
 *        %rbp = b (8 qwords)
 *        %rbx = b[0] (preloaded by the caller)
 *        caller's (%rsp) .. 120(%rsp) = product destination; written
 *        via %rdi starting at 8(%rsp) here (= caller's (%rsp), since
 *        the call pushed a return address)
 *   Out: 16-limb product in the caller's temporary area
 *   Clobbers: rax, rbx, rcx, rdx, rdi, r8-r15, flags; advances %rbp
 *             past b (8 limbs)
 *
 * Row 0 is unrolled before the loop; the remaining 7 rows each emit
 * one finished low limb and keep the running 8-limb window in r8..r15.
 */
.type	__rsaz_512_mul,@function
.align	32
__rsaz_512_mul:
	leaq	8(%rsp),%rdi		/* product cursor (caller's (%rsp)) */

	/* row 0: a[0..7] * b[0] */
	movq	(%rsi),%rax
	mulq	%rbx
	movq	%rax,(%rdi)
	movq	8(%rsi),%rax
	movq	%rdx,%r8

	mulq	%rbx
	addq	%rax,%r8
	movq	16(%rsi),%rax
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r9
	movq	24(%rsi),%rax
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r10
	movq	32(%rsi),%rax
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r11
	movq	40(%rsi),%rax
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r12
	movq	48(%rsi),%rax
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r13
	movq	56(%rsi),%rax
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	addq	%rax,%r14
	movq	(%rsi),%rax
	movq	%rdx,%r15
	adcq	$0,%r15

	leaq	8(%rbp),%rbp		/* advance to b[1] */
	leaq	8(%rdi),%rdi

	movl	$7,%ecx			/* 7 remaining rows */
	jmp	.Loop_mul

.align	32
.Loop_mul:
	movq	(%rbp),%rbx		/* rbx = b[i] */
	mulq	%rbx
	addq	%rax,%r8
	movq	8(%rsi),%rax
	movq	%r8,(%rdi)		/* emit lowest limb */
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rsi),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rsi),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rsi),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	leaq	8(%rbp),%rbp		/* advance to b[i+1] */
	adcq	$0,%r14

	mulq	%rbx
	addq	%rax,%r15
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	leaq	8(%rdi),%rdi

	decl	%ecx
	jnz	.Loop_mul

	/* store the top 8 limbs */
	movq	%r8,(%rdi)
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	.byte	0xf3,0xc3	/* rep ret */
.size	__rsaz_512_mul,.-__rsaz_512_mul
# __rsaz_512_mulx: 8x8-limb (512x512-bit) schoolbook multiply using BMI2
# MULX with dual ADCX/ADOX carry chains (the dispatcher selects this path
# only when the CPU reports the required extensions).
# In:   %rsi = a[0..7], %rbp = b[0..7],
#       %rdx = b[0] -- MULX takes its multiplier implicitly from %rdx;
#       presumably preloaded by the caller (this code only reloads %rdx
#       from 8(%rbp) onward) -- confirm at call sites.
# Out:  16-limb product stored at 8(%rsp) .. 8+120(%rsp).
# Clobbers: %rax, %rbx, %rcx, %rdx, %rdi (zeroed), %r8-%r15, flags.
.type	__rsaz_512_mulx,@function
.align	32
__rsaz_512_mulx:
	mulxq	(%rsi),%rbx,%r8		# a[0]*b[0]: low -> %rbx, high -> %r8
	movq	$-6,%rcx		# loop counter, runs -6..-1 (see .Loop_mulx)

	mulxq	8(%rsi),%rax,%r9	# a[1]*b[0]
	movq	%rbx,8(%rsp)		# store product limb 0

	mulxq	16(%rsi),%rbx,%r10	# a[2]*b[0]
	adcq	%rax,%r8

	mulxq	24(%rsi),%rax,%r11	# a[3]*b[0]
	adcq	%rbx,%r9

	mulxq	32(%rsi),%rbx,%r12	# a[4]*b[0]
	adcq	%rax,%r10

	mulxq	40(%rsi),%rax,%r13	# a[5]*b[0]
	adcq	%rbx,%r11

	mulxq	48(%rsi),%rbx,%r14	# a[6]*b[0]
	adcq	%rax,%r12

	mulxq	56(%rsi),%rax,%r15	# a[7]*b[0]
	movq	8(%rbp),%rdx		# next multiplier limb: b[1]
	adcq	%rbx,%r13
	adcq	%rax,%r14
	adcq	$0,%r15

	xorq	%rdi,%rdi		# %rdi = 0 (carry source); also clears CF and OF
	jmp	.Loop_mulx

# One iteration folds a[0..7]*b[j] into the running window %r8..%r15,
# retiring one finished limb (%rbx) to the stack per pass.  ADCX and
# ADOX propagate two independent carry chains (CF and OF) in parallel.
.align	32
.Loop_mulx:
	movq	%r8,%rbx
	mulxq	(%rsi),%rax,%r8
	adcxq	%rax,%rbx
	adoxq	%r9,%r8

	mulxq	8(%rsi),%rax,%r9
	adcxq	%rax,%r8
	adoxq	%r10,%r9

	mulxq	16(%rsi),%rax,%r10
	adcxq	%rax,%r9
	adoxq	%r11,%r10

	mulxq	24(%rsi),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11

# hand-encoded mulxq 32(%rsi),%rax,%r12 with a redundant 0x3e (DS) prefix
# (presumably for code-size/alignment) -- decoded from the VEX bytes and
# consistent with the 24(%rsi)/40(%rsi) neighbours.
.byte	0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00
	adcxq	%rax,%r11
	adoxq	%r13,%r12

	mulxq	40(%rsi),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

	mulxq	48(%rsi),%rax,%r14
	adcxq	%rax,%r13
	adoxq	%r15,%r14

	mulxq	56(%rsi),%rax,%r15
	movq	64(%rbp,%rcx,8),%rdx	# load next b limb (first pass: 16(%rbp) = b[2])
	movq	%rbx,8+64-8(%rsp,%rcx,8)	# retire finished limb to the stack
	adcxq	%rax,%r14
	adoxq	%rdi,%r15		# flush OF chain into %r15 ...
	adcxq	%rdi,%r15		# ... then CF chain; both flags now clear

	incq	%rcx
	jnz	.Loop_mulx

# Final pass: multiply by the last b limb already in %rdx (no further load).
	movq	%r8,%rbx
	mulxq	(%rsi),%rax,%r8
	adcxq	%rax,%rbx
	adoxq	%r9,%r8

# hand-encoded mulxq 8(%rsi),%rax,%r9 (decoded from VEX bytes)
.byte	0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00
	adcxq	%rax,%r8
	adoxq	%r10,%r9

# hand-encoded mulxq 16(%rsi),%rax,%r10 (decoded from VEX bytes)
.byte	0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00
	adcxq	%rax,%r9
	adoxq	%r11,%r10

	mulxq	24(%rsi),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11

	mulxq	32(%rsi),%rax,%r12
	adcxq	%rax,%r11
	adoxq	%r13,%r12

	mulxq	40(%rsi),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

# hand-encoded mulxq 48(%rsi),%rax,%r14 (decoded from VEX bytes)
.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00
	adcxq	%rax,%r13
	adoxq	%r15,%r14

# hand-encoded mulxq 56(%rsi),%rax,%r15 (decoded from VEX bytes)
.byte	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00
	adcxq	%rax,%r14
	adoxq	%rdi,%r15
	adcxq	%rdi,%r15

# Store the top half of the product: limbs 8..15 of the 16-limb result.
	movq	%rbx,8+64-8(%rsp)
	movq	%r8,8+64(%rsp)
	movq	%r9,8+64+8(%rsp)
	movq	%r10,8+64+16(%rsp)
	movq	%r11,8+64+24(%rsp)
	movq	%r12,8+64+32(%rsp)
	movq	%r13,8+64+40(%rsp)
	movq	%r14,8+64+48(%rsp)
	movq	%r15,8+64+56(%rsp)

	.byte	0xf3,0xc3		# rep ret (two-byte return)
.size	__rsaz_512_mulx,.-__rsaz_512_mulx
# rsaz_512_scatter4: store a 512-bit value into slot %rdx of a table.
# In:  %rdi = table base, %rsi = src[8] (eight 64-bit limbs), %rdx = slot index.
# Limb i is written to %rdi + %rdx*8 + i*128, i.e. table entries are
# interleaved at a 128-byte stride (sixteen 8-byte slots per row) --
# the exact layout rsaz_512_gather4 below reads back with masked loads.
.globl	rsaz_512_scatter4
.type	rsaz_512_scatter4,@function
.align	16
rsaz_512_scatter4:
	leaq	(%rdi,%rdx,8),%rdi	# point at this entry's slot in row 0
	movl	$8,%r9d			# eight limbs to store
	jmp	.Loop_scatter
.align	16
.Loop_scatter:
	movq	(%rsi),%rax
	leaq	8(%rsi),%rsi
	movq	%rax,(%rdi)
	leaq	128(%rdi),%rdi		# advance one row (128-byte stride)
	decl	%r9d
	jnz	.Loop_scatter
	.byte	0xf3,0xc3		# rep ret (two-byte return)
.size	rsaz_512_scatter4,.-rsaz_512_scatter4
1815
# rsaz_512_gather4: constant-time gather of one 512-bit entry from a
# table laid out by rsaz_512_scatter4 (16 entries interleaved per
# 128-byte row).
# In:  %rdi = out[8], %rsi = table base, %edx = entry index (compared
#      against slot ids 0..15 built below, so presumably 0 <= index < 16
#      -- confirm at call sites).
# Every table byte is read regardless of the index; the wanted qword is
# selected with PCMPEQD masks and PAND/POR, so the memory access pattern
# is independent of the secret index (cache-timing defense).
.globl	rsaz_512_gather4
.type	rsaz_512_gather4,@function
.align	16
rsaz_512_gather4:
	movd	%edx,%xmm8		# xmm8 = index
	movdqa	.Linc+16(%rip),%xmm1	# {2,2,2,2}: per-step slot-id increment
	movdqa	.Linc(%rip),%xmm0	# {0,0,1,1}: ids for slots 0 and 1

	pshufd	$0,%xmm8,%xmm8		# broadcast index to all four dwords
# Build eight masks: xmmK becomes all-ones in the 64-bit lane whose slot
# id (2K or 2K+1) equals the index, zero elsewhere.  Id vectors step
# {0,0,1,1} -> {2,2,3,3} -> ... -> {14,14,15,15} by repeated paddd.
	movdqa	%xmm1,%xmm7		# keep {2,2,2,2} as the running increment
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1		# ids {2,2,3,3}
	pcmpeqd	%xmm8,%xmm0		# mask for slots 0/1
	movdqa	%xmm7,%xmm3
	paddd	%xmm1,%xmm2		# ids {4,4,5,5}
	pcmpeqd	%xmm8,%xmm1		# mask for slots 2/3
	movdqa	%xmm7,%xmm4
	paddd	%xmm2,%xmm3		# ids {6,6,7,7}
	pcmpeqd	%xmm8,%xmm2		# mask for slots 4/5
	movdqa	%xmm7,%xmm5
	paddd	%xmm3,%xmm4		# ids {8,8,9,9}
	pcmpeqd	%xmm8,%xmm3		# mask for slots 6/7
	movdqa	%xmm7,%xmm6
	paddd	%xmm4,%xmm5		# ids {10,10,11,11}
	pcmpeqd	%xmm8,%xmm4		# mask for slots 8/9
	paddd	%xmm5,%xmm6		# ids {12,12,13,13}
	pcmpeqd	%xmm8,%xmm5		# mask for slots 10/11
	paddd	%xmm6,%xmm7		# ids {14,14,15,15}
	pcmpeqd	%xmm8,%xmm6		# mask for slots 12/13
	pcmpeqd	%xmm8,%xmm7		# mask for slots 14/15
	movl	$8,%r9d			# eight limbs to gather
	jmp	.Loop_gather
.align	16
.Loop_gather:
# Load the whole 128-byte row, AND each 16-byte chunk with its slot
# mask, then OR everything down to the single selected qword.
	movdqa	0(%rsi),%xmm8
	movdqa	16(%rsi),%xmm9
	movdqa	32(%rsi),%xmm10
	movdqa	48(%rsi),%xmm11
	pand	%xmm0,%xmm8
	movdqa	64(%rsi),%xmm12
	pand	%xmm1,%xmm9
	movdqa	80(%rsi),%xmm13
	pand	%xmm2,%xmm10
	movdqa	96(%rsi),%xmm14
	pand	%xmm3,%xmm11
	movdqa	112(%rsi),%xmm15
	leaq	128(%rsi),%rsi		# next row
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	$0x4e,%xmm8,%xmm9	# swap qword halves ...
	por	%xmm9,%xmm8		# ... so the selected qword lands in the low half
	movq	%xmm8,(%rdi)
	leaq	8(%rdi),%rdi
	decl	%r9d
	jnz	.Loop_gather
	.byte	0xf3,0xc3		# rep ret (two-byte return)
.LSEH_end_rsaz_512_gather4:
.size	rsaz_512_gather4,.-rsaz_512_gather4
1884
.align	64
# Slot-id seed constants for rsaz_512_gather4:
#   .Linc+0  = {0,0,1,1} -- dword-pair ids for table slots 0 and 1
#   .Linc+16 = {2,2,2,2} -- increment applied to derive ids up to 15
.Linc:
.long	0,0, 1,1
.long	2,2, 2,2
1889