/* rsaz-x86_64.S revision 305153 */
1/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/rsaz-x86_64.S 305153 2016-08-31 20:33:59Z jkim $ */
2/* Do not modify. This file is auto-generated from rsaz-x86_64.pl. */
3.text
4
5
6
/*
 * void rsaz_512_sqr(u64 out[8], const u64 inp[8], const u64 mod[8],
 *                   u64 n0, int times)
 * SysV AMD64: rdi=out, rsi=inp, rdx=mod, rcx=n0, r8d=repeat count.
 * Montgomery-squares the 512-bit (8x64-bit limb) input `times` times,
 * feeding each result back as the next input (out is reused as inp via
 * the rdi->rsi copy at the bottom of the loop).
 * Two code paths: a mulq path (.Loop_sqr) and a mulx/ADX path
 * (.Loop_sqrx), selected once by the CPUID capability test below.
 * Stack frame: 128 bytes of product scratch + saved n0 at 128(%rsp)
 * + loop counter at 128+8(%rsp).
 */
.globl	rsaz_512_sqr
.type	rsaz_512_sqr,@function
.align	32
rsaz_512_sqr:
	pushq	%rbx			/* save callee-saved registers */
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	subq	$128+24,%rsp		/* 128-byte product buffer + spills */
.Lsqr_body:
	movq	%rdx,%rbp		/* rbp = modulus pointer (used by reduce/subtract) */
	movq	(%rsi),%rdx		/* preload a[0] (mulx implicit operand) */
	movq	8(%rsi),%rax		/* preload a[1] */
	movq	%rcx,128(%rsp)		/* stash n0 for __rsaz_512_reduce */
	/* dispatch on OPENSSL_ia32cap_P word 2: 0x80100 = BMI2|ADX bits
	 * (presumably bits 8 and 19 of CPUID.7.EBX — see OPENSSL_ia32cap(3)) */
	movl	$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	$0x80100,%r11d
	je	.Loop_sqrx
	jmp	.Loop_sqr

.align	32
.Loop_sqr:				/* one squaring pass, mulq flavour */
	movl	%r8d,128+8(%rsp)	/* save remaining iteration count */

	/* row a[0]*a[1..7]: partial products into r9..r15 */
	movq	%rdx,%rbx
	mulq	%rdx
	movq	%rax,%r8
	movq	16(%rsi),%rax
	movq	%rdx,%r9

	mulq	%rbx
	addq	%rax,%r9
	movq	24(%rsi),%rax
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r10
	movq	32(%rsi),%rax
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r11
	movq	40(%rsi),%rax
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r12
	movq	48(%rsi),%rax
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r13
	movq	56(%rsi),%rax
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	addq	%rax,%r14
	movq	%rbx,%rax
	movq	%rdx,%r15
	adcq	$0,%r15

	/* double the off-diagonal terms and fold in a[0]^2 */
	addq	%r8,%r8
	movq	%r9,%rcx
	adcq	%r9,%r9

	mulq	%rax
	movq	%rax,(%rsp)		/* out limb 0 */
	addq	%rdx,%r8
	adcq	$0,%r9

	movq	%r8,8(%rsp)		/* out limb 1 */
	shrq	$63,%rcx		/* carry of the doubling, for next row */


	/* row a[1]*a[2..7] */
	movq	8(%rsi),%r8
	movq	16(%rsi),%rax
	mulq	%r8
	addq	%rax,%r10
	movq	24(%rsi),%rax
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%r8
	addq	%rax,%r11
	movq	32(%rsi),%rax
	adcq	$0,%rdx
	addq	%rbx,%r11
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%r8
	addq	%rax,%r12
	movq	40(%rsi),%rax
	adcq	$0,%rdx
	addq	%rbx,%r12
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%r8
	addq	%rax,%r13
	movq	48(%rsi),%rax
	adcq	$0,%rdx
	addq	%rbx,%r13
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%r8
	addq	%rax,%r14
	movq	56(%rsi),%rax
	adcq	$0,%rdx
	addq	%rbx,%r14
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%r8
	addq	%rax,%r15
	movq	%r8,%rax
	adcq	$0,%rdx
	addq	%rbx,%r15
	movq	%rdx,%r8
	movq	%r10,%rdx
	adcq	$0,%r8

	/* double + add a[1]^2; out limbs 2,3 */
	addq	%rdx,%rdx
	leaq	(%rcx,%r10,2),%r10	/* r10 = 2*r10 + saved doubling carry */
	movq	%r11,%rbx
	adcq	%r11,%r11

	mulq	%rax
	addq	%rax,%r9
	adcq	%rdx,%r10
	adcq	$0,%r11

	movq	%r9,16(%rsp)
	movq	%r10,24(%rsp)
	shrq	$63,%rbx


	/* row a[2]*a[3..7] */
	movq	16(%rsi),%r9
	movq	24(%rsi),%rax
	mulq	%r9
	addq	%rax,%r12
	movq	32(%rsi),%rax
	movq	%rdx,%rcx
	adcq	$0,%rcx

	mulq	%r9
	addq	%rax,%r13
	movq	40(%rsi),%rax
	adcq	$0,%rdx
	addq	%rcx,%r13
	movq	%rdx,%rcx
	adcq	$0,%rcx

	mulq	%r9
	addq	%rax,%r14
	movq	48(%rsi),%rax
	adcq	$0,%rdx
	addq	%rcx,%r14
	movq	%rdx,%rcx
	adcq	$0,%rcx

	mulq	%r9
	movq	%r12,%r10
	leaq	(%rbx,%r12,2),%r12	/* double r12 with incoming carry */
	addq	%rax,%r15
	movq	56(%rsi),%rax
	adcq	$0,%rdx
	addq	%rcx,%r15
	movq	%rdx,%rcx
	adcq	$0,%rcx

	mulq	%r9
	shrq	$63,%r10
	addq	%rax,%r8
	movq	%r9,%rax
	adcq	$0,%rdx
	addq	%rcx,%r8
	movq	%rdx,%r9
	adcq	$0,%r9

	movq	%r13,%rcx
	leaq	(%r10,%r13,2),%r13

	/* add a[2]^2; out limbs 4,5 */
	mulq	%rax
	addq	%rax,%r11
	adcq	%rdx,%r12
	adcq	$0,%r13

	movq	%r11,32(%rsp)
	movq	%r12,40(%rsp)
	shrq	$63,%rcx


	/* row a[3]*a[4..7] */
	movq	24(%rsi),%r10
	movq	32(%rsi),%rax
	mulq	%r10
	addq	%rax,%r14
	movq	40(%rsi),%rax
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%r10
	addq	%rax,%r15
	movq	48(%rsi),%rax
	adcq	$0,%rdx
	addq	%rbx,%r15
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%r10
	movq	%r14,%r12
	leaq	(%rcx,%r14,2),%r14
	addq	%rax,%r8
	movq	56(%rsi),%rax
	adcq	$0,%rdx
	addq	%rbx,%r8
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%r10
	shrq	$63,%r12
	addq	%rax,%r9
	movq	%r10,%rax
	adcq	$0,%rdx
	addq	%rbx,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	movq	%r15,%rbx
	leaq	(%r12,%r15,2),%r15

	/* add a[3]^2; out limbs 6,7 */
	mulq	%rax
	addq	%rax,%r13
	adcq	%rdx,%r14
	adcq	$0,%r15

	movq	%r13,48(%rsp)
	movq	%r14,56(%rsp)
	shrq	$63,%rbx


	/* row a[4]*a[5..7] */
	movq	32(%rsi),%r11
	movq	40(%rsi),%rax
	mulq	%r11
	addq	%rax,%r8
	movq	48(%rsi),%rax
	movq	%rdx,%rcx
	adcq	$0,%rcx

	mulq	%r11
	addq	%rax,%r9
	movq	56(%rsi),%rax
	adcq	$0,%rdx
	movq	%r8,%r12
	leaq	(%rbx,%r8,2),%r8
	addq	%rcx,%r9
	movq	%rdx,%rcx
	adcq	$0,%rcx

	mulq	%r11
	shrq	$63,%r12
	addq	%rax,%r10
	movq	%r11,%rax
	adcq	$0,%rdx
	addq	%rcx,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	movq	%r9,%rcx
	leaq	(%r12,%r9,2),%r9

	/* add a[4]^2; out limbs 8,9 */
	mulq	%rax
	addq	%rax,%r15
	adcq	%rdx,%r8
	adcq	$0,%r9

	movq	%r15,64(%rsp)
	movq	%r8,72(%rsp)
	shrq	$63,%rcx


	/* row a[5]*a[6..7] */
	movq	40(%rsi),%r12
	movq	48(%rsi),%rax
	mulq	%r12
	addq	%rax,%r10
	movq	56(%rsi),%rax
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%r12
	addq	%rax,%r11
	movq	%r12,%rax
	movq	%r10,%r15
	leaq	(%rcx,%r10,2),%r10
	adcq	$0,%rdx
	shrq	$63,%r15
	addq	%rbx,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	movq	%r11,%rbx
	leaq	(%r15,%r11,2),%r11

	/* add a[5]^2; out limbs 10,11 */
	mulq	%rax
	addq	%rax,%r9
	adcq	%rdx,%r10
	adcq	$0,%r11

	movq	%r9,80(%rsp)
	movq	%r10,88(%rsp)


	/* row a[6]*a[7], then a[6]^2; out limbs 12,13 */
	movq	48(%rsi),%r13
	movq	56(%rsi),%rax
	mulq	%r13
	addq	%rax,%r12
	movq	%r13,%rax
	movq	%rdx,%r13
	adcq	$0,%r13

	xorq	%r14,%r14
	shlq	$1,%rbx
	adcq	%r12,%r12
	adcq	%r13,%r13
	adcq	%r14,%r14

	mulq	%rax
	addq	%rax,%r11
	adcq	%rdx,%r12
	adcq	$0,%r13

	movq	%r11,96(%rsp)
	movq	%r12,104(%rsp)


	/* final a[7]^2; out limbs 14,15 */
	movq	56(%rsi),%rax
	mulq	%rax
	addq	%rax,%r13
	adcq	$0,%rdx

	addq	%rdx,%r14

	movq	%r13,112(%rsp)
	movq	%r14,120(%rsp)

	/* load low half of the 1024-bit square and Montgomery-reduce it */
	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	call	__rsaz_512_reduce

	/* add the high half, capture top carry as a mask in rcx */
	addq	64(%rsp),%r8
	adcq	72(%rsp),%r9
	adcq	80(%rsp),%r10
	adcq	88(%rsp),%r11
	adcq	96(%rsp),%r12
	adcq	104(%rsp),%r13
	adcq	112(%rsp),%r14
	adcq	120(%rsp),%r15
	sbbq	%rcx,%rcx		/* rcx = 0 or -1 (borrow mask) */

	call	__rsaz_512_subtract	/* conditional final subtraction, stores to (%rdi) */

	movq	%r8,%rdx		/* result limb 0/1 become next pass's a[0]/a[1] */
	movq	%r9,%rax
	movl	128+8(%rsp),%r8d	/* reload iteration count */
	movq	%rdi,%rsi		/* next input = this output */

	decl	%r8d
	jnz	.Loop_sqr
	jmp	.Lsqr_tail

.align	32
.Loop_sqrx:				/* one squaring pass, mulx/adcx/adox flavour */
	movl	%r8d,128+8(%rsp)	/* save remaining iteration count */
.byte	102,72,15,110,199		/* movq %rdi,%xmm0 — park out ptr in xmm */
.byte	102,72,15,110,205		/* movq %rbp,%xmm1 — park mod ptr in xmm */

	/* row a[0]*a[0..7] via mulx (rdx = a[0]) */
	mulxq	%rax,%r8,%r9

	mulxq	16(%rsi),%rcx,%r10
	xorq	%rbp,%rbp		/* rbp = 0, also clears CF/OF for adcx/adox */

	mulxq	24(%rsi),%rax,%r11
	adcxq	%rcx,%r9

	mulxq	32(%rsi),%rcx,%r12
	adcxq	%rax,%r10

	mulxq	40(%rsi),%rax,%r13
	adcxq	%rcx,%r11

.byte	0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00	/* explicitly-encoded mulx, 48(%rsi) */
	adcxq	%rax,%r12
	adcxq	%rcx,%r13

.byte	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00	/* explicitly-encoded mulx, 56(%rsi) */
	adcxq	%rax,%r14
	adcxq	%rbp,%r15

	movq	%r9,%rcx
	shldq	$1,%r8,%r9		/* double the off-diagonal pair */
	shlq	$1,%r8

	xorl	%ebp,%ebp
	mulxq	%rdx,%rax,%rdx		/* a[0]^2 */
	adcxq	%rdx,%r8
	movq	8(%rsi),%rdx		/* next multiplier: a[1] */
	adcxq	%rbp,%r9

	movq	%rax,(%rsp)		/* out limbs 0,1 */
	movq	%r8,8(%rsp)


	/* row a[1]*a[2..7] */
	mulxq	16(%rsi),%rax,%rbx
	adoxq	%rax,%r10
	adcxq	%rbx,%r11

.byte	0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00	/* explicitly-encoded mulx, 24(%rsi) */
	adoxq	%rdi,%r11
	adcxq	%r8,%r12

	mulxq	32(%rsi),%rax,%rbx
	adoxq	%rax,%r12
	adcxq	%rbx,%r13

	mulxq	40(%rsi),%rdi,%r8
	adoxq	%rdi,%r13
	adcxq	%r8,%r14

.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00	/* explicitly-encoded mulx, 48(%rsi) */
	adoxq	%rax,%r14
	adcxq	%rbx,%r15

.byte	0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00	/* explicitly-encoded mulx, 56(%rsi) */
	adoxq	%rdi,%r15
	adcxq	%rbp,%r8
	adoxq	%rbp,%r8

	movq	%r11,%rbx
	shldq	$1,%r10,%r11
	shldq	$1,%rcx,%r10

	xorl	%ebp,%ebp
	mulxq	%rdx,%rax,%rcx		/* a[1]^2 */
	movq	16(%rsi),%rdx		/* next multiplier: a[2] */
	adcxq	%rax,%r9
	adcxq	%rcx,%r10
	adcxq	%rbp,%r11

	movq	%r9,16(%rsp)		/* out limbs 2,3 */
.byte	0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00		/* explicitly-encoded movq %r10,24(%rsp) */


	/* row a[2]*a[3..7] */
.byte	0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00	/* explicitly-encoded mulx, 24(%rsi) */
	adoxq	%rdi,%r12
	adcxq	%r9,%r13

	mulxq	32(%rsi),%rax,%rcx
	adoxq	%rax,%r13
	adcxq	%rcx,%r14

	mulxq	40(%rsi),%rdi,%r9
	adoxq	%rdi,%r14
	adcxq	%r9,%r15

.byte	0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00	/* explicitly-encoded mulx, 48(%rsi) */
	adoxq	%rax,%r15
	adcxq	%rcx,%r8

.byte	0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00	/* explicitly-encoded mulx, 56(%rsi) */
	adoxq	%rdi,%r8
	adcxq	%rbp,%r9
	adoxq	%rbp,%r9

	movq	%r13,%rcx
	shldq	$1,%r12,%r13
	shldq	$1,%rbx,%r12

	xorl	%ebp,%ebp
	mulxq	%rdx,%rax,%rdx		/* a[2]^2 */
	adcxq	%rax,%r11
	adcxq	%rdx,%r12
	movq	24(%rsi),%rdx		/* next multiplier: a[3] */
	adcxq	%rbp,%r13

	movq	%r11,32(%rsp)		/* out limbs 4,5 */
.byte	0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00		/* explicitly-encoded movq %r12,40(%rsp) */


	/* row a[3]*a[4..7] */
.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00	/* explicitly-encoded mulx, 32(%rsi) */
	adoxq	%rax,%r14
	adcxq	%rbx,%r15

	mulxq	40(%rsi),%rdi,%r10
	adoxq	%rdi,%r15
	adcxq	%r10,%r8

	mulxq	48(%rsi),%rax,%rbx
	adoxq	%rax,%r8
	adcxq	%rbx,%r9

	mulxq	56(%rsi),%rdi,%r10
	adoxq	%rdi,%r9
	adcxq	%rbp,%r10
	adoxq	%rbp,%r10

.byte	0x66				/* padding prefix (alignment) */
	movq	%r15,%rbx
	shldq	$1,%r14,%r15
	shldq	$1,%rcx,%r14

	xorl	%ebp,%ebp
	mulxq	%rdx,%rax,%rdx		/* a[3]^2 */
	adcxq	%rax,%r13
	adcxq	%rdx,%r14
	movq	32(%rsi),%rdx		/* next multiplier: a[4] */
	adcxq	%rbp,%r15

	movq	%r13,48(%rsp)		/* out limbs 6,7 */
	movq	%r14,56(%rsp)


	/* row a[4]*a[5..7] */
.byte	0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00	/* explicitly-encoded mulx, 40(%rsi) */
	adoxq	%rdi,%r8
	adcxq	%r11,%r9

	mulxq	48(%rsi),%rax,%rcx
	adoxq	%rax,%r9
	adcxq	%rcx,%r10

	mulxq	56(%rsi),%rdi,%r11
	adoxq	%rdi,%r10
	adcxq	%rbp,%r11
	adoxq	%rbp,%r11

	movq	%r9,%rcx
	shldq	$1,%r8,%r9
	shldq	$1,%rbx,%r8

	xorl	%ebp,%ebp
	mulxq	%rdx,%rax,%rdx		/* a[4]^2 */
	adcxq	%rax,%r15
	adcxq	%rdx,%r8
	movq	40(%rsi),%rdx		/* next multiplier: a[5] */
	adcxq	%rbp,%r9

	movq	%r15,64(%rsp)		/* out limbs 8,9 */
	movq	%r8,72(%rsp)


	/* row a[5]*a[6..7] */
.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00	/* explicitly-encoded mulx, 48(%rsi) */
	adoxq	%rax,%r10
	adcxq	%rbx,%r11

.byte	0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00	/* explicitly-encoded mulx, 56(%rsi) */
	adoxq	%rdi,%r11
	adcxq	%rbp,%r12
	adoxq	%rbp,%r12

	movq	%r11,%rbx
	shldq	$1,%r10,%r11
	shldq	$1,%rcx,%r10

	xorl	%ebp,%ebp
	mulxq	%rdx,%rax,%rdx		/* a[5]^2 */
	adcxq	%rax,%r9
	adcxq	%rdx,%r10
	movq	48(%rsi),%rdx		/* next multiplier: a[6] */
	adcxq	%rbp,%r11

	movq	%r9,80(%rsp)		/* out limbs 10,11 */
	movq	%r10,88(%rsp)


	/* row a[6]*a[7], then a[6]^2 */
.byte	0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00	/* explicitly-encoded mulx, 56(%rsi) */
	adoxq	%rax,%r12
	adoxq	%rbp,%r13

	xorq	%r14,%r14
	shldq	$1,%r13,%r14
	shldq	$1,%r12,%r13
	shldq	$1,%rbx,%r12

	xorl	%ebp,%ebp
	mulxq	%rdx,%rax,%rdx		/* a[6]^2 */
	adcxq	%rax,%r11
	adcxq	%rdx,%r12
	movq	56(%rsi),%rdx		/* last multiplier: a[7] */
	adcxq	%rbp,%r13

.byte	0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00		/* explicitly-encoded movq %r11,96(%rsp) */
.byte	0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00		/* explicitly-encoded movq %r12,104(%rsp) */


	/* a[7]^2; out limbs 14,15 */
	mulxq	%rdx,%rax,%rdx
	adoxq	%rax,%r13
	adoxq	%rbp,%rdx

.byte	0x66				/* padding prefix (alignment) */
	addq	%rdx,%r14

	movq	%r13,112(%rsp)
	movq	%r14,120(%rsp)
.byte	102,72,15,126,199		/* movq %xmm0,%rdi — restore out ptr */
.byte	102,72,15,126,205		/* movq %xmm1,%rbp — restore mod ptr */

	/* load low half and reduce (mulx flavour wants n0 in rdx) */
	movq	128(%rsp),%rdx
	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	call	__rsaz_512_reducex

	addq	64(%rsp),%r8
	adcq	72(%rsp),%r9
	adcq	80(%rsp),%r10
	adcq	88(%rsp),%r11
	adcq	96(%rsp),%r12
	adcq	104(%rsp),%r13
	adcq	112(%rsp),%r14
	adcq	120(%rsp),%r15
	sbbq	%rcx,%rcx		/* rcx = 0 or -1 (borrow mask) */

	call	__rsaz_512_subtract

	movq	%r8,%rdx		/* feed result limbs back as next input */
	movq	%r9,%rax
	movl	128+8(%rsp),%r8d
	movq	%rdi,%rsi

	decl	%r8d
	jnz	.Loop_sqrx

.Lsqr_tail:

	/* restore callee-saved registers and return */
	leaq	128+24+48(%rsp),%rax
	movq	-48(%rax),%r15
	movq	-40(%rax),%r14
	movq	-32(%rax),%r13
	movq	-24(%rax),%r12
	movq	-16(%rax),%rbp
	movq	-8(%rax),%rbx
	leaq	(%rax),%rsp
.Lsqr_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
.size	rsaz_512_sqr,.-rsaz_512_sqr
/*
 * void rsaz_512_mul(u64 out[8], const u64 a[8], const u64 b[8],
 *                   const u64 mod[8], u64 n0)
 * SysV AMD64: rdi=out, rsi=a, rdx=b, rcx=mod, r8=n0.
 * 512-bit Montgomery multiplication: schoolbook multiply, reduce,
 * add high half, conditional subtraction. Dispatches between the
 * mulq path (__rsaz_512_mul) and the mulx path (__rsaz_512_mulx)
 * on the ADX/BMI2 capability bits.
 */
.globl	rsaz_512_mul
.type	rsaz_512_mul,@function
.align	32
rsaz_512_mul:
	pushq	%rbx			/* save callee-saved registers */
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	subq	$128+24,%rsp		/* product buffer + spills */
.Lmul_body:
.byte	102,72,15,110,199		/* movq %rdi,%xmm0 — park out ptr */
.byte	102,72,15,110,201		/* movq %rcx,%xmm1 — park mod ptr */
	movq	%r8,128(%rsp)		/* stash n0 for the reduction */
	/* 0x80100 = ADX|BMI2 bits of OPENSSL_ia32cap_P word 2 */
	movl	$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	$0x80100,%r11d
	je	.Lmulx
	movq	(%rdx),%rbx		/* mulq path: rbx = b[0], rbp = b */
	movq	%rdx,%rbp
	call	__rsaz_512_mul

.byte	102,72,15,126,199		/* movq %xmm0,%rdi — restore out ptr */
.byte	102,72,15,126,205		/* movq %xmm1,%rbp — restore mod ptr */

	/* low half of the 1024-bit product -> r8..r15, then reduce */
	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	call	__rsaz_512_reduce
	jmp	.Lmul_tail

.align	32
.Lmulx:					/* mulx path: rbp = b, rdx = b[0] */
	movq	%rdx,%rbp
	movq	(%rdx),%rdx
	call	__rsaz_512_mulx

.byte	102,72,15,126,199		/* movq %xmm0,%rdi */
.byte	102,72,15,126,205		/* movq %xmm1,%rbp */

	movq	128(%rsp),%rdx		/* n0 in rdx for reducex */
	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	call	__rsaz_512_reducex
.Lmul_tail:
	/* add high half of the product; rcx = borrow mask for subtract */
	addq	64(%rsp),%r8
	adcq	72(%rsp),%r9
	adcq	80(%rsp),%r10
	adcq	88(%rsp),%r11
	adcq	96(%rsp),%r12
	adcq	104(%rsp),%r13
	adcq	112(%rsp),%r14
	adcq	120(%rsp),%r15
	sbbq	%rcx,%rcx

	call	__rsaz_512_subtract

	/* restore callee-saved registers and return */
	leaq	128+24+48(%rsp),%rax
	movq	-48(%rax),%r15
	movq	-40(%rax),%r14
	movq	-32(%rax),%r13
	movq	-24(%rax),%r12
	movq	-16(%rax),%rbp
	movq	-8(%rax),%rbx
	leaq	(%rax),%rsp
.Lmul_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
.size	rsaz_512_mul,.-rsaz_512_mul
/*
 * void rsaz_512_mul_gather4(u64 out[8], const u64 a[8], const void *table,
 *                           const u64 mod[8], u64 n0, int power)
 * SysV AMD64: rdi=out, rsi=a, rdx=table, rcx=mod, r8=n0, r9d=power.
 * Montgomery-multiplies `a` by the `power`-th 512-bit entry of a
 * 16-entry interleaved table. The table entry is gathered in constant
 * time: SSE2 compare masks derived from `power` (via the .Linc
 * constants, defined elsewhere in this file) select one column out of
 * every 128-byte stride, so all 16 entries are always touched —
 * a cache-timing countermeasure.
 */
.globl	rsaz_512_mul_gather4
.type	rsaz_512_mul_gather4,@function
.align	32
rsaz_512_mul_gather4:
	pushq	%rbx			/* save callee-saved registers */
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	subq	$152,%rsp		/* product buffer + spills (keeps xmm loads aligned) */
.Lmul_gather4_body:
	/* build 8 compare masks xmm0..xmm7: mask i is all-ones iff
	 * power == corresponding lane counter, else all-zeros */
	movd	%r9d,%xmm8
	movdqa	.Linc+16(%rip),%xmm1
	movdqa	.Linc(%rip),%xmm0

	pshufd	$0,%xmm8,%xmm8		/* broadcast power to all lanes */
	movdqa	%xmm1,%xmm7
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm8,%xmm0
	movdqa	%xmm7,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm8,%xmm1
	movdqa	%xmm7,%xmm4
	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm8,%xmm2
	movdqa	%xmm7,%xmm5
	paddd	%xmm3,%xmm4
	pcmpeqd	%xmm8,%xmm3
	movdqa	%xmm7,%xmm6
	paddd	%xmm4,%xmm5
	pcmpeqd	%xmm8,%xmm4
	paddd	%xmm5,%xmm6
	pcmpeqd	%xmm8,%xmm5
	paddd	%xmm6,%xmm7
	pcmpeqd	%xmm8,%xmm6
	pcmpeqd	%xmm8,%xmm7

	/* gather first table limb: read ALL 128 bytes, AND with masks,
	 * OR-reduce to the single selected 64-bit value */
	movdqa	0(%rdx),%xmm8
	movdqa	16(%rdx),%xmm9
	movdqa	32(%rdx),%xmm10
	movdqa	48(%rdx),%xmm11
	pand	%xmm0,%xmm8
	movdqa	64(%rdx),%xmm12
	pand	%xmm1,%xmm9
	movdqa	80(%rdx),%xmm13
	pand	%xmm2,%xmm10
	movdqa	96(%rdx),%xmm14
	pand	%xmm3,%xmm11
	movdqa	112(%rdx),%xmm15
	leaq	128(%rdx),%rbp		/* rbp = next table stride */
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	$0x4e,%xmm8,%xmm9	/* fold high qword into low */
	por	%xmm9,%xmm8
	/* 0x80100 = ADX|BMI2 bits of OPENSSL_ia32cap_P word 2 */
	movl	$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	$0x80100,%r11d
	je	.Lmulx_gather
.byte	102,76,15,126,195		/* movq %xmm8,%rbx — gathered b[0] */

	movq	%r8,128(%rsp)		/* stash n0, out ptr, mod ptr */
	movq	%rdi,128+8(%rsp)
	movq	%rcx,128+16(%rsp)

	/* mulq path: first row a[0..7] * b[0] */
	movq	(%rsi),%rax
	movq	8(%rsi),%rcx
	mulq	%rbx
	movq	%rax,(%rsp)
	movq	%rcx,%rax
	movq	%rdx,%r8

	mulq	%rbx
	addq	%rax,%r8
	movq	16(%rsi),%rax
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r9
	movq	24(%rsi),%rax
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r10
	movq	32(%rsi),%rax
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r11
	movq	40(%rsi),%rax
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r12
	movq	48(%rsi),%rax
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r13
	movq	56(%rsi),%rax
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	addq	%rax,%r14
	movq	(%rsi),%rax
	movq	%rdx,%r15
	adcq	$0,%r15

	leaq	8(%rsp),%rdi		/* rdi = product write cursor */
	movl	$7,%ecx			/* 7 remaining rows */
	jmp	.Loop_mul_gather

.align	32
.Loop_mul_gather:
	/* constant-time gather of the next b limb (same mask scheme) */
	movdqa	0(%rbp),%xmm8
	movdqa	16(%rbp),%xmm9
	movdqa	32(%rbp),%xmm10
	movdqa	48(%rbp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	64(%rbp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	80(%rbp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	96(%rbp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	112(%rbp),%xmm15
	leaq	128(%rbp),%rbp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
.byte	102,76,15,126,195		/* movq %xmm8,%rbx — gathered b[i] */

	/* row a[0..7] * b[i], accumulated into r8..r15 */
	mulq	%rbx
	addq	%rax,%r8
	movq	8(%rsi),%rax
	movq	%r8,(%rdi)		/* emit lowest finished limb */
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rsi),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rsi),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rsi),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	addq	%rax,%r15
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	leaq	8(%rdi),%rdi

	decl	%ecx
	jnz	.Loop_mul_gather

	/* store high half of the 1024-bit product */
	movq	%r8,(%rdi)
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	movq	128+8(%rsp),%rdi	/* restore out ptr */
	movq	128+16(%rsp),%rbp	/* restore mod ptr */

	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	call	__rsaz_512_reduce
	jmp	.Lmul_gather_tail

.align	32
.Lmulx_gather:				/* mulx/adcx/adox flavour */
.byte	102,76,15,126,194		/* movq %xmm8,%rdx — gathered b[0] */

	movq	%r8,128(%rsp)		/* stash n0, out ptr, mod ptr */
	movq	%rdi,128+8(%rsp)
	movq	%rcx,128+16(%rsp)

	/* first row a[0..7] * b[0] */
	mulxq	(%rsi),%rbx,%r8
	movq	%rbx,(%rsp)
	xorl	%edi,%edi		/* rdi = 0, clears CF/OF */

	mulxq	8(%rsi),%rax,%r9

	mulxq	16(%rsi),%rbx,%r10
	adcxq	%rax,%r8

	mulxq	24(%rsi),%rax,%r11
	adcxq	%rbx,%r9

	mulxq	32(%rsi),%rbx,%r12
	adcxq	%rax,%r10

	mulxq	40(%rsi),%rax,%r13
	adcxq	%rbx,%r11

	mulxq	48(%rsi),%rbx,%r14
	adcxq	%rax,%r12

	mulxq	56(%rsi),%rax,%r15
	adcxq	%rbx,%r13
	adcxq	%rax,%r14
.byte	0x67				/* padding prefix (alignment) */
	movq	%r8,%rbx
	adcxq	%rdi,%r15

	movq	$-7,%rcx		/* counts -7..-1: 7 remaining rows */
	jmp	.Loop_mulx_gather

.align	32
.Loop_mulx_gather:
	/* constant-time gather of the next b limb */
	movdqa	0(%rbp),%xmm8
	movdqa	16(%rbp),%xmm9
	movdqa	32(%rbp),%xmm10
	movdqa	48(%rbp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	64(%rbp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	80(%rbp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	96(%rbp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	112(%rbp),%xmm15
	leaq	128(%rbp),%rbp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
.byte	102,76,15,126,194		/* movq %xmm8,%rdx — gathered b[i] */

	/* row a[0..7] * b[i], dual carry chains via adcx/adox */
.byte	0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00	/* explicitly-encoded mulx, (%rsi) */
	adcxq	%rax,%rbx
	adoxq	%r9,%r8

	mulxq	8(%rsi),%rax,%r9
	adcxq	%rax,%r8
	adoxq	%r10,%r9

	mulxq	16(%rsi),%rax,%r10
	adcxq	%rax,%r9
	adoxq	%r11,%r10

.byte	0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00	/* explicitly-encoded mulx, 24(%rsi) */
	adcxq	%rax,%r10
	adoxq	%r12,%r11

	mulxq	32(%rsi),%rax,%r12
	adcxq	%rax,%r11
	adoxq	%r13,%r12

	mulxq	40(%rsi),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	/* explicitly-encoded mulx, 48(%rsi) */
	adcxq	%rax,%r13
.byte	0x67				/* padding prefix (alignment) */
	adoxq	%r15,%r14

	mulxq	56(%rsi),%rax,%r15
	movq	%rbx,64(%rsp,%rcx,8)	/* emit finished limb (rcx is negative index) */
	adcxq	%rax,%r14
	adoxq	%rdi,%r15
	movq	%r8,%rbx
	adcxq	%rdi,%r15		/* rdi is 0: collapses both carry chains */

	incq	%rcx
	jnz	.Loop_mulx_gather

	/* store high half of the product */
	movq	%r8,64(%rsp)
	movq	%r9,64+8(%rsp)
	movq	%r10,64+16(%rsp)
	movq	%r11,64+24(%rsp)
	movq	%r12,64+32(%rsp)
	movq	%r13,64+40(%rsp)
	movq	%r14,64+48(%rsp)
	movq	%r15,64+56(%rsp)

	movq	128(%rsp),%rdx		/* n0 for reducex */
	movq	128+8(%rsp),%rdi	/* restore out ptr */
	movq	128+16(%rsp),%rbp	/* restore mod ptr */

	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	call	__rsaz_512_reducex

.Lmul_gather_tail:
	/* add high half; rcx = borrow mask for the conditional subtract */
	addq	64(%rsp),%r8
	adcq	72(%rsp),%r9
	adcq	80(%rsp),%r10
	adcq	88(%rsp),%r11
	adcq	96(%rsp),%r12
	adcq	104(%rsp),%r13
	adcq	112(%rsp),%r14
	adcq	120(%rsp),%r15
	sbbq	%rcx,%rcx

	call	__rsaz_512_subtract

	/* restore callee-saved registers and return */
	leaq	128+24+48(%rsp),%rax
	movq	-48(%rax),%r15
	movq	-40(%rax),%r14
	movq	-32(%rax),%r13
	movq	-24(%rax),%r12
	movq	-16(%rax),%rbp
	movq	-8(%rax),%rbx
	leaq	(%rax),%rsp
.Lmul_gather4_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
.size	rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
/*
 * void rsaz_512_mul_scatter4(u64 out[8], const u64 a[8], const u64 mod[8],
 *                            u64 n0, u64 *table, int power)
 * SysV AMD64: rdi=out, rsi=a, rdx=mod, rcx=n0, r8=table, r9d=power.
 * Montgomery-multiplies out by a (note: out is also the multiplier
 * source here — (%rdi) is loaded as b) and scatters the eight result
 * limbs into the table at 128-byte stride starting at table[power],
 * matching the layout rsaz_512_mul_gather4 reads back.
 */
.globl	rsaz_512_mul_scatter4
.type	rsaz_512_mul_scatter4,@function
.align	32
rsaz_512_mul_scatter4:
	pushq	%rbx			/* save callee-saved registers */
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	movl	%r9d,%r9d		/* zero-extend power */
	subq	$128+24,%rsp		/* product buffer + spills */
.Lmul_scatter4_body:
	leaq	(%r8,%r9,8),%r8		/* r8 = &table[power] */
.byte	102,72,15,110,199		/* movq %rdi,%xmm0 — park out ptr */
.byte	102,72,15,110,202		/* movq %rdx,%xmm1 — park mod ptr */
.byte	102,73,15,110,208		/* movq %r8,%xmm2 — park scatter base */
	movq	%rcx,128(%rsp)		/* stash n0 */

	movq	%rdi,%rbp		/* rbp = multiplier vector (= out) */
	/* 0x80100 = ADX|BMI2 bits of OPENSSL_ia32cap_P word 2 */
	movl	$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	$0x80100,%r11d
	je	.Lmulx_scatter
	movq	(%rdi),%rbx		/* mulq path: rbx = b[0] */
	call	__rsaz_512_mul

.byte	102,72,15,126,199		/* movq %xmm0,%rdi */
.byte	102,72,15,126,205		/* movq %xmm1,%rbp */

	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	call	__rsaz_512_reduce
	jmp	.Lmul_scatter_tail

.align	32
.Lmulx_scatter:
	movq	(%rdi),%rdx		/* mulx path: rdx = b[0] */
	call	__rsaz_512_mulx

.byte	102,72,15,126,199		/* movq %xmm0,%rdi */
.byte	102,72,15,126,205		/* movq %xmm1,%rbp */

	movq	128(%rsp),%rdx		/* n0 for reducex */
	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	call	__rsaz_512_reducex

.Lmul_scatter_tail:
	/* add high half; conditional final subtraction */
	addq	64(%rsp),%r8
	adcq	72(%rsp),%r9
	adcq	80(%rsp),%r10
	adcq	88(%rsp),%r11
	adcq	96(%rsp),%r12
	adcq	104(%rsp),%r13
	adcq	112(%rsp),%r14
	adcq	120(%rsp),%r15
.byte	102,72,15,126,214		/* movq %xmm2,%rsi — scatter base */
	sbbq	%rcx,%rcx		/* borrow mask */

	call	__rsaz_512_subtract

	/* scatter result limbs at 128-byte stride (gather4 layout) */
	movq	%r8,0(%rsi)
	movq	%r9,128(%rsi)
	movq	%r10,256(%rsi)
	movq	%r11,384(%rsi)
	movq	%r12,512(%rsi)
	movq	%r13,640(%rsi)
	movq	%r14,768(%rsi)
	movq	%r15,896(%rsi)

	/* restore callee-saved registers and return */
	leaq	128+24+48(%rsp),%rax
	movq	-48(%rax),%r15
	movq	-40(%rax),%r14
	movq	-32(%rax),%r13
	movq	-24(%rax),%r12
	movq	-16(%rax),%rbp
	movq	-8(%rax),%rbx
	leaq	(%rax),%rsp
.Lmul_scatter4_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
.size	rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
/*
 * void rsaz_512_mul_by_one(u64 out[8], const u64 inp[8],
 *                          const u64 mod[8], u64 n0)
 * SysV AMD64: rdi=out, rsi=inp, rdx=mod, rcx=n0.
 * Montgomery reduction of inp with a zeroed high half — i.e. converts
 * a value out of Montgomery form (multiply by 1 and reduce).
 * Dispatches between the mulq and mulx reduction helpers on the
 * ADX/BMI2 capability bits.
 */
.globl	rsaz_512_mul_by_one
.type	rsaz_512_mul_by_one,@function
.align	32
rsaz_512_mul_by_one:
	pushq	%rbx			/* save callee-saved registers */
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	subq	$128+24,%rsp		/* scratch frame */
.Lmul_by_one_body:
	movl	OPENSSL_ia32cap_P+8(%rip),%eax	/* capability word for dispatch below */
	movq	%rdx,%rbp		/* rbp = modulus pointer */
	movq	%rcx,128(%rsp)		/* stash n0 */

	/* load input limbs; zero 112 bytes of scratch (the implicit
	 * high half of the product is all-zero) */
	movq	(%rsi),%r8
	pxor	%xmm0,%xmm0
	movq	8(%rsi),%r9
	movq	16(%rsi),%r10
	movq	24(%rsi),%r11
	movq	32(%rsi),%r12
	movq	40(%rsi),%r13
	movq	48(%rsi),%r14
	movq	56(%rsi),%r15

	movdqa	%xmm0,(%rsp)
	movdqa	%xmm0,16(%rsp)
	movdqa	%xmm0,32(%rsp)
	movdqa	%xmm0,48(%rsp)
	movdqa	%xmm0,64(%rsp)
	movdqa	%xmm0,80(%rsp)
	movdqa	%xmm0,96(%rsp)
	andl	$0x80100,%eax		/* ADX|BMI2 bits */
	cmpl	$0x80100,%eax
	je	.Lby_one_callx
	call	__rsaz_512_reduce
	jmp	.Lby_one_tail
.align	32
.Lby_one_callx:
	movq	128(%rsp),%rdx		/* n0 in rdx for reducex */
	call	__rsaz_512_reducex
.Lby_one_tail:
	/* reduced value is already < mod here: store directly */
	movq	%r8,(%rdi)
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	/* restore callee-saved registers and return */
	leaq	128+24+48(%rsp),%rax
	movq	-48(%rax),%r15
	movq	-40(%rax),%r14
	movq	-32(%rax),%r13
	movq	-24(%rax),%r12
	movq	-16(%rax),%rbp
	movq	-8(%rax),%rbx
	leaq	(%rax),%rsp
.Lmul_by_one_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
.size	rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
/*
 * __rsaz_512_reduce — Montgomery reduction, mulq flavour.
 * In:  r8..r15 = 8-limb value to reduce, rbp = modulus,
 *      128+8(%rsp) = n0 (-1/mod[0] mod 2^64), caller's frame.
 * Out: r8..r15 = reduced value (caller still adds the product's
 *      high half and does the conditional subtraction).
 * Clobbers rax, rbx, rcx, rdx, rsi, flags.
 * Eight iterations, one per limb: multiply the modulus by
 * m = limb0 * n0 and add, shifting the accumulator down one limb.
 */
.type	__rsaz_512_reduce,@function
.align	32
__rsaz_512_reduce:
	movq	%r8,%rbx
	imulq	128+8(%rsp),%rbx	/* rbx = m = r8 * n0 */
	movq	0(%rbp),%rax
	movl	$8,%ecx			/* 8 reduction rounds */
	jmp	.Lreduction_loop

.align	32
.Lreduction_loop:
	/* m*mod[0] + r8 == 0 mod 2^64 by construction; negq sets CF
	 * to the carry that the discarded low limb would produce */
	mulq	%rbx
	movq	8(%rbp),%rax
	negq	%r8
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	128+8(%rsp),%rsi	/* reload n0 for the next round's m */


	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rbp),%rax
	adcq	$0,%rdx
	imulq	%r8,%rsi		/* rsi = next m (overlapped with the chain) */
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	%rsi,%rbx		/* switch to next round's multiplier */
	addq	%rax,%r15
	movq	0(%rbp),%rax
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jne	.Lreduction_loop

	.byte	0xf3,0xc3		/* rep ret */
.size	__rsaz_512_reduce,.-__rsaz_512_reduce
/*
 * __rsaz_512_reducex — Montgomery reduction, mulx/adcx/adox flavour.
 * In:  r8..r15 = 8-limb value to reduce, rbp = modulus, rdx = n0,
 *      128+8(%rsp) = n0 (re-read each round), caller's frame.
 * Out: r8..r15 = reduced value.
 * Clobbers rax, rbx, rcx, rsi, rdx, flags.
 * Same algorithm as __rsaz_512_reduce, but runs two independent
 * carry chains (CF via adcx, OF via adox) per round; rdx doubles as
 * the implicit mulx multiplier m, swapped each round.
 */
.type	__rsaz_512_reducex,@function
.align	32
__rsaz_512_reducex:

	imulq	%r8,%rdx		/* rdx = m = r8 * n0 */
	xorq	%rsi,%rsi		/* rsi = 0; also clears CF/OF */
	movl	$8,%ecx			/* 8 reduction rounds */
	jmp	.Lreduction_loopx

.align	32
.Lreduction_loopx:
	movq	%r8,%rbx
	mulxq	0(%rbp),%rax,%r8
	adcxq	%rbx,%rax		/* low limb cancels to 0, carry kept in CF */
	adoxq	%r9,%r8

	mulxq	8(%rbp),%rax,%r9
	adcxq	%rax,%r8
	adoxq	%r10,%r9

	mulxq	16(%rbp),%rbx,%r10
	adcxq	%rbx,%r9
	adoxq	%r11,%r10

	mulxq	24(%rbp),%rbx,%r11
	adcxq	%rbx,%r10
	adoxq	%r12,%r11

.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	/* explicitly-encoded mulx, 32(%rbp) */
	movq	%rdx,%rax		/* save current m */
	movq	%r8,%rdx
	adcxq	%rbx,%r11
	adoxq	%r13,%r12

	mulxq	128+8(%rsp),%rbx,%rdx	/* rbx = next round's m = r8 * n0 */
	movq	%rax,%rdx		/* restore current m */

	mulxq	40(%rbp),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

.byte	0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00	/* explicitly-encoded mulx, 48(%rbp) */
	adcxq	%rax,%r13
	adoxq	%r15,%r14

	mulxq	56(%rbp),%rax,%r15
	movq	%rbx,%rdx		/* switch to next round's m */
	adcxq	%rax,%r14
	adoxq	%rsi,%r15		/* rsi is 0: collapse both carry chains */
	adcxq	%rsi,%r15

	decl	%ecx
	jne	.Lreduction_loopx

	.byte	0xf3,0xc3		/* rep ret */
.size	__rsaz_512_reducex,.-__rsaz_512_reducex
/*
 * __rsaz_512_subtract — constant-time conditional subtraction.
 * In:  r8..r15 = candidate result, rbp = modulus, rdi = out,
 *      rcx = all-ones mask if the modulus must be subtracted,
 *            all-zeros otherwise.
 * Out: result (possibly minus mod) in r8..r15 and stored at (%rdi).
 * Clobbers flags only beyond the result registers.
 * Computes result + (-mod & mask): two's-complement negation of the
 * modulus (neg lowest limb, not the rest) ANDed with the mask, then
 * added — same instruction stream for both cases, so no data-
 * dependent branch (timing-attack countermeasure).
 */
.type	__rsaz_512_subtract,@function
.align	32
__rsaz_512_subtract:
	movq	%r8,(%rdi)		/* store unconditionally first */
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	/* build (-mod) & mask limb by limb */
	movq	0(%rbp),%r8
	movq	8(%rbp),%r9
	negq	%r8			/* two's complement: negate limb 0, ... */
	notq	%r9			/* ...complement the higher limbs */
	andq	%rcx,%r8
	movq	16(%rbp),%r10
	andq	%rcx,%r9
	notq	%r10
	movq	24(%rbp),%r11
	andq	%rcx,%r10
	notq	%r11
	movq	32(%rbp),%r12
	andq	%rcx,%r11
	notq	%r12
	movq	40(%rbp),%r13
	andq	%rcx,%r12
	notq	%r13
	movq	48(%rbp),%r14
	andq	%rcx,%r13
	notq	%r14
	movq	56(%rbp),%r15
	andq	%rcx,%r14
	notq	%r15
	andq	%rcx,%r15

	/* result += (-mod & mask); the not/neg split supplies the
	 * implicit +1 of the two's complement through the carry chain */
	addq	(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15

	movq	%r8,(%rdi)		/* store final result */
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	.byte	0xf3,0xc3		/* rep ret */
.size	__rsaz_512_subtract,.-__rsaz_512_subtract
/*
 * __rsaz_512_mul — 512x512-bit schoolbook multiply, mulq flavour.
 * In:  rsi = a (8 limbs), rbp = b (8 limbs), rbx = b[0] preloaded,
 *      caller's 128-byte scratch at 8(%rsp)... (after return address).
 * Out: full 1024-bit product stored at 8(%rsp)..8+120(%rsp).
 * Clobbers rax, rbx, rcx, rdx, rdi, r8-r15, flags.
 * First row (a * b[0]) initializes r8..r15; each of the remaining
 * 7 rows accumulates a * b[i], emitting one finished low limb.
 */
.type	__rsaz_512_mul,@function
.align	32
__rsaz_512_mul:
	leaq	8(%rsp),%rdi		/* rdi = product write cursor */

	/* row 0: a[0..7] * b[0] */
	movq	(%rsi),%rax
	mulq	%rbx
	movq	%rax,(%rdi)		/* product limb 0 */
	movq	8(%rsi),%rax
	movq	%rdx,%r8

	mulq	%rbx
	addq	%rax,%r8
	movq	16(%rsi),%rax
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r9
	movq	24(%rsi),%rax
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r10
	movq	32(%rsi),%rax
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r11
	movq	40(%rsi),%rax
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r12
	movq	48(%rsi),%rax
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r13
	movq	56(%rsi),%rax
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	addq	%rax,%r14
	movq	(%rsi),%rax		/* reload a[0] for the next row */
	movq	%rdx,%r15
	adcq	$0,%r15

	leaq	8(%rbp),%rbp		/* advance to b[1] */
	leaq	8(%rdi),%rdi

	movl	$7,%ecx			/* 7 remaining rows */
	jmp	.Loop_mul

.align	32
.Loop_mul:
	/* row i: accumulate a[0..7] * b[i] into r8..r15 */
	movq	(%rbp),%rbx
	mulq	%rbx
	addq	%rax,%r8
	movq	8(%rsi),%rax
	movq	%r8,(%rdi)		/* emit lowest finished limb */
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rsi),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rsi),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rsi),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	leaq	8(%rbp),%rbp		/* advance to b[i+1] */
	adcq	$0,%r14

	mulq	%rbx
	addq	%rax,%r15
	movq	(%rsi),%rax		/* reload a[0] */
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	leaq	8(%rdi),%rdi

	decl	%ecx
	jnz	.Loop_mul

	/* store high half of the 1024-bit product */
	movq	%r8,(%rdi)
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	.byte	0xf3,0xc3		/* rep ret */
.size	__rsaz_512_mul,.-__rsaz_512_mul
.type	__rsaz_512_mulx,@function
.align	32
/*
 * __rsaz_512_mulx: 512x512 -> 1024-bit multiply using MULX plus the
 * ADX dual carry chains (adcx uses CF, adox uses OF, so two carry
 * streams run in parallel).  mulx multiplies by %rdx implicitly and
 * leaves flags untouched, which lets the adc/adcx/adox chains thread
 * through it.  On entry %rdx = b[0], %rsi -> 8-limb operand a,
 * %rbp -> multiplier b; product limbs are written to the stack
 * starting at 8(%rsp).  NOTE(review): register/stack setup is done by
 * the callers outside this routine - confirm against rsaz-x86_64.pl.
 */
__rsaz_512_mulx:
	mulxq	(%rsi),%rbx,%r8		/* a[0]*b[0]: rbx = lo, r8 = hi */
	movq	$-6,%rcx		/* counts b[2]..b[7] up to zero */

	mulxq	8(%rsi),%rax,%r9
	movq	%rbx,8(%rsp)		/* store product limb 0 */

	mulxq	16(%rsi),%rbx,%r10
	adcq	%rax,%r8

	mulxq	24(%rsi),%rax,%r11
	adcq	%rbx,%r9

	mulxq	32(%rsi),%rbx,%r12
	adcq	%rax,%r10

	mulxq	40(%rsi),%rax,%r13
	adcq	%rbx,%r11

	mulxq	48(%rsi),%rbx,%r14
	adcq	%rax,%r12

	mulxq	56(%rsi),%rax,%r15
	movq	8(%rbp),%rdx		/* rdx = b[1] for the first loop pass */
	adcq	%rbx,%r13
	adcq	%rax,%r14
	adcq	$0,%r15

	xorq	%rdi,%rdi		/* rdi = constant 0; also clears CF and OF */
	jmp	.Loop_mulx

.align	32
/*
 * Per-pass: multiply a[0..7] by the current b limb (in %rdx) and fold
 * into the accumulator r8..r15 using interleaved adcx/adox chains.
 * The finished low limb (rbx) is spilled to the stack each pass.
 */
.Loop_mulx:
	movq	%r8,%rbx
	mulxq	(%rsi),%rax,%r8
	adcxq	%rax,%rbx
	adoxq	%r9,%r8

	mulxq	8(%rsi),%rax,%r9
	adcxq	%rax,%r8
	adoxq	%r10,%r9

	mulxq	16(%rsi),%rax,%r10
	adcxq	%rax,%r9
	adoxq	%r11,%r10

	mulxq	24(%rsi),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11

.byte	0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00	/* mulxq 32(%rsi),%rax,%r12 (ds-prefixed for old assemblers) */
	adcxq	%rax,%r11
	adoxq	%r13,%r12

	mulxq	40(%rsi),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

	mulxq	48(%rsi),%rax,%r14
	adcxq	%rax,%r13
	adoxq	%r15,%r14

	mulxq	56(%rsi),%rax,%r15
	movq	64(%rbp,%rcx,8),%rdx	/* rdx = next b limb (rcx = -6..-1) */
	movq	%rbx,8+64-8(%rsp,%rcx,8)	/* spill finished low limb */
	adcxq	%rax,%r14
	adoxq	%rdi,%r15		/* close the OF chain with +0 */
	adcxq	%rdi,%r15		/* close the CF chain with +0 */

	incq	%rcx
	jnz	.Loop_mulx

	/* final pass: b[7], same pattern, no further b limb to load */
	movq	%r8,%rbx
	mulxq	(%rsi),%rax,%r8
	adcxq	%rax,%rbx
	adoxq	%r9,%r8

.byte	0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00	/* mulxq 8(%rsi),%rax,%r9 */
	adcxq	%rax,%r8
	adoxq	%r10,%r9

.byte	0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00	/* mulxq 16(%rsi),%rax,%r10 */
	adcxq	%rax,%r9
	adoxq	%r11,%r10

	mulxq	24(%rsi),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11

	mulxq	32(%rsi),%rax,%r12
	adcxq	%rax,%r11
	adoxq	%r13,%r12

	mulxq	40(%rsi),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	/* mulxq 48(%rsi),%rax,%r14 */
	adcxq	%rax,%r13
	adoxq	%r15,%r14

.byte	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00	/* mulxq 56(%rsi),%rax,%r15 */
	adcxq	%rax,%r14
	adoxq	%rdi,%r15
	adcxq	%rdi,%r15

	/* store upper eight product limbs (limbs 8..15) */
	movq	%rbx,8+64-8(%rsp)
	movq	%r8,8+64(%rsp)
	movq	%r9,8+64+8(%rsp)
	movq	%r10,8+64+16(%rsp)
	movq	%r11,8+64+24(%rsp)
	movq	%r12,8+64+32(%rsp)
	movq	%r13,8+64+40(%rsp)
	movq	%r14,8+64+48(%rsp)
	movq	%r15,8+64+56(%rsp)

	.byte	0xf3,0xc3		/* rep ret */
.size	__rsaz_512_mulx,.-__rsaz_512_mulx
.globl	rsaz_512_scatter4
.type	rsaz_512_scatter4,@function
.align	16
/*
 * void rsaz_512_scatter4(void *tbl, const void *val, int power)
 * SysV AMD64: rdi = table, rsi = 8-limb source, rdx = entry index.
 * Stores the 8 qwords of val into table entry `power`, interleaved
 * with a 128-byte stride (qword j of entry n lands at n*8 + j*128),
 * the layout that rsaz_512_gather4 reads back.
 */
rsaz_512_scatter4:
	leaq	(%rdi,%rdx,8),%rdi	/* rdi = &tbl[power] (qword slot) */
	movl	$8,%r9d			/* 8 limbs to scatter */
	jmp	.Loop_scatter
.align	16
.Loop_scatter:
	movq	(%rsi),%rax
	leaq	8(%rsi),%rsi
	movq	%rax,(%rdi)
	leaq	128(%rdi),%rdi		/* next limb slot, 128 bytes away */
	decl	%r9d
	jnz	.Loop_scatter
	.byte	0xf3,0xc3		/* rep ret */
.size	rsaz_512_scatter4,.-rsaz_512_scatter4
1802
.globl	rsaz_512_gather4
.type	rsaz_512_gather4,@function
.align	16
/*
 * void rsaz_512_gather4(void *out, const void *tbl, int power)
 * SysV AMD64: rdi = 8-limb destination, rsi = scattered table,
 * edx = entry index.  Constant-time gather: every iteration reads all
 * 128 bytes of candidates and selects the wanted qword with pre-built
 * equality masks, so the memory access pattern is independent of the
 * (secret) index — a cache-timing countermeasure.
 */
rsaz_512_gather4:
	movd	%edx,%xmm8		/* xmm8 = requested index */
	movdqa	.Linc+16(%rip),%xmm1	/* {2,2,2,2}: per-step increment */
	movdqa	.Linc(%rip),%xmm0	/* {0,0,1,1}: lane ids 0 and 1 */

	pshufd	$0,%xmm8,%xmm8		/* broadcast index to all dwords */
	movdqa	%xmm1,%xmm7
	movdqa	%xmm1,%xmm2
	/*
	 * Build xmm0..xmm7: xmmK compares lane ids {2K, 2K+1} against the
	 * index, leaving an all-ones mask in exactly the matching 64-bit
	 * lane (paddd generates the next id pair before each pcmpeqd).
	 */
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm8,%xmm0
	movdqa	%xmm7,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm8,%xmm1
	movdqa	%xmm7,%xmm4
	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm8,%xmm2
	movdqa	%xmm7,%xmm5
	paddd	%xmm3,%xmm4
	pcmpeqd	%xmm8,%xmm3
	movdqa	%xmm7,%xmm6
	paddd	%xmm4,%xmm5
	pcmpeqd	%xmm8,%xmm4
	paddd	%xmm5,%xmm6
	pcmpeqd	%xmm8,%xmm5
	paddd	%xmm6,%xmm7
	pcmpeqd	%xmm8,%xmm6
	pcmpeqd	%xmm8,%xmm7
	movl	$8,%r9d			/* 8 limbs to gather */
	jmp	.Loop_gather
.align	16
.Loop_gather:
	/* touch all 16 candidate qwords, then mask-select the match */
	movdqa	0(%rsi),%xmm8
	movdqa	16(%rsi),%xmm9
	movdqa	32(%rsi),%xmm10
	movdqa	48(%rsi),%xmm11
	pand	%xmm0,%xmm8
	movdqa	64(%rsi),%xmm12
	pand	%xmm1,%xmm9
	movdqa	80(%rsi),%xmm13
	pand	%xmm2,%xmm10
	movdqa	96(%rsi),%xmm14
	pand	%xmm3,%xmm11
	movdqa	112(%rsi),%xmm15
	leaq	128(%rsi),%rsi		/* next limb row of the table */
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	$0x4e,%xmm8,%xmm9	/* swap the two qword halves */
	por	%xmm9,%xmm8		/* fold so the low qword holds the hit */
	movq	%xmm8,(%rdi)
	leaq	8(%rdi),%rdi
	decl	%r9d
	jnz	.Loop_gather
	.byte	0xf3,0xc3		/* rep ret */
.LSEH_end_rsaz_512_gather4:
.size	rsaz_512_gather4,.-rsaz_512_gather4
1871
.align	64
/* Lane-id seed {0,0,1,1} and increment {2,2,2,2} for the
 * rsaz_512_gather4 mask-generation chain above. */
.Linc:
.long	0,0, 1,1
.long	2,2, 2,2
1876