/* rsaz-x86_64.S revision 1.2 */
1#include <machine/asm.h>
2.text
3
4
5
6.globl	rsaz_512_sqr
7.type	rsaz_512_sqr,@function
8.align	32
9rsaz_512_sqr:
10	pushq	%rbx
11	pushq	%rbp
12	pushq	%r12
13	pushq	%r13
14	pushq	%r14
15	pushq	%r15
16
17	subq	$128+24,%rsp
18.Lsqr_body:
19	movq	%rdx,%rbp
20	movq	(%rsi),%rdx
21	movq	8(%rsi),%rax
22	movq	%rcx,128(%rsp)
23	movl	$0x80100,%r11d
24	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
25	cmpl	$0x80100,%r11d
26	je	.Loop_sqrx
27	jmp	.Loop_sqr
28
29.align	32
30.Loop_sqr:
31	movl	%r8d,128+8(%rsp)
32
33	movq	%rdx,%rbx
34	mulq	%rdx
35	movq	%rax,%r8
36	movq	16(%rsi),%rax
37	movq	%rdx,%r9
38
39	mulq	%rbx
40	addq	%rax,%r9
41	movq	24(%rsi),%rax
42	movq	%rdx,%r10
43	adcq	$0,%r10
44
45	mulq	%rbx
46	addq	%rax,%r10
47	movq	32(%rsi),%rax
48	movq	%rdx,%r11
49	adcq	$0,%r11
50
51	mulq	%rbx
52	addq	%rax,%r11
53	movq	40(%rsi),%rax
54	movq	%rdx,%r12
55	adcq	$0,%r12
56
57	mulq	%rbx
58	addq	%rax,%r12
59	movq	48(%rsi),%rax
60	movq	%rdx,%r13
61	adcq	$0,%r13
62
63	mulq	%rbx
64	addq	%rax,%r13
65	movq	56(%rsi),%rax
66	movq	%rdx,%r14
67	adcq	$0,%r14
68
69	mulq	%rbx
70	addq	%rax,%r14
71	movq	%rbx,%rax
72	movq	%rdx,%r15
73	adcq	$0,%r15
74
75	addq	%r8,%r8
76	movq	%r9,%rcx
77	adcq	%r9,%r9
78
79	mulq	%rax
80	movq	%rax,(%rsp)
81	addq	%rdx,%r8
82	adcq	$0,%r9
83
84	movq	%r8,8(%rsp)
85	shrq	$63,%rcx
86
87
88	movq	8(%rsi),%r8
89	movq	16(%rsi),%rax
90	mulq	%r8
91	addq	%rax,%r10
92	movq	24(%rsi),%rax
93	movq	%rdx,%rbx
94	adcq	$0,%rbx
95
96	mulq	%r8
97	addq	%rax,%r11
98	movq	32(%rsi),%rax
99	adcq	$0,%rdx
100	addq	%rbx,%r11
101	movq	%rdx,%rbx
102	adcq	$0,%rbx
103
104	mulq	%r8
105	addq	%rax,%r12
106	movq	40(%rsi),%rax
107	adcq	$0,%rdx
108	addq	%rbx,%r12
109	movq	%rdx,%rbx
110	adcq	$0,%rbx
111
112	mulq	%r8
113	addq	%rax,%r13
114	movq	48(%rsi),%rax
115	adcq	$0,%rdx
116	addq	%rbx,%r13
117	movq	%rdx,%rbx
118	adcq	$0,%rbx
119
120	mulq	%r8
121	addq	%rax,%r14
122	movq	56(%rsi),%rax
123	adcq	$0,%rdx
124	addq	%rbx,%r14
125	movq	%rdx,%rbx
126	adcq	$0,%rbx
127
128	mulq	%r8
129	addq	%rax,%r15
130	movq	%r8,%rax
131	adcq	$0,%rdx
132	addq	%rbx,%r15
133	movq	%rdx,%r8
134	movq	%r10,%rdx
135	adcq	$0,%r8
136
137	addq	%rdx,%rdx
138	leaq	(%rcx,%r10,2),%r10
139	movq	%r11,%rbx
140	adcq	%r11,%r11
141
142	mulq	%rax
143	addq	%rax,%r9
144	adcq	%rdx,%r10
145	adcq	$0,%r11
146
147	movq	%r9,16(%rsp)
148	movq	%r10,24(%rsp)
149	shrq	$63,%rbx
150
151
152	movq	16(%rsi),%r9
153	movq	24(%rsi),%rax
154	mulq	%r9
155	addq	%rax,%r12
156	movq	32(%rsi),%rax
157	movq	%rdx,%rcx
158	adcq	$0,%rcx
159
160	mulq	%r9
161	addq	%rax,%r13
162	movq	40(%rsi),%rax
163	adcq	$0,%rdx
164	addq	%rcx,%r13
165	movq	%rdx,%rcx
166	adcq	$0,%rcx
167
168	mulq	%r9
169	addq	%rax,%r14
170	movq	48(%rsi),%rax
171	adcq	$0,%rdx
172	addq	%rcx,%r14
173	movq	%rdx,%rcx
174	adcq	$0,%rcx
175
176	mulq	%r9
177	movq	%r12,%r10
178	leaq	(%rbx,%r12,2),%r12
179	addq	%rax,%r15
180	movq	56(%rsi),%rax
181	adcq	$0,%rdx
182	addq	%rcx,%r15
183	movq	%rdx,%rcx
184	adcq	$0,%rcx
185
186	mulq	%r9
187	shrq	$63,%r10
188	addq	%rax,%r8
189	movq	%r9,%rax
190	adcq	$0,%rdx
191	addq	%rcx,%r8
192	movq	%rdx,%r9
193	adcq	$0,%r9
194
195	movq	%r13,%rcx
196	leaq	(%r10,%r13,2),%r13
197
198	mulq	%rax
199	addq	%rax,%r11
200	adcq	%rdx,%r12
201	adcq	$0,%r13
202
203	movq	%r11,32(%rsp)
204	movq	%r12,40(%rsp)
205	shrq	$63,%rcx
206
207
208	movq	24(%rsi),%r10
209	movq	32(%rsi),%rax
210	mulq	%r10
211	addq	%rax,%r14
212	movq	40(%rsi),%rax
213	movq	%rdx,%rbx
214	adcq	$0,%rbx
215
216	mulq	%r10
217	addq	%rax,%r15
218	movq	48(%rsi),%rax
219	adcq	$0,%rdx
220	addq	%rbx,%r15
221	movq	%rdx,%rbx
222	adcq	$0,%rbx
223
224	mulq	%r10
225	movq	%r14,%r12
226	leaq	(%rcx,%r14,2),%r14
227	addq	%rax,%r8
228	movq	56(%rsi),%rax
229	adcq	$0,%rdx
230	addq	%rbx,%r8
231	movq	%rdx,%rbx
232	adcq	$0,%rbx
233
234	mulq	%r10
235	shrq	$63,%r12
236	addq	%rax,%r9
237	movq	%r10,%rax
238	adcq	$0,%rdx
239	addq	%rbx,%r9
240	movq	%rdx,%r10
241	adcq	$0,%r10
242
243	movq	%r15,%rbx
244	leaq	(%r12,%r15,2),%r15
245
246	mulq	%rax
247	addq	%rax,%r13
248	adcq	%rdx,%r14
249	adcq	$0,%r15
250
251	movq	%r13,48(%rsp)
252	movq	%r14,56(%rsp)
253	shrq	$63,%rbx
254
255
256	movq	32(%rsi),%r11
257	movq	40(%rsi),%rax
258	mulq	%r11
259	addq	%rax,%r8
260	movq	48(%rsi),%rax
261	movq	%rdx,%rcx
262	adcq	$0,%rcx
263
264	mulq	%r11
265	addq	%rax,%r9
266	movq	56(%rsi),%rax
267	adcq	$0,%rdx
268	movq	%r8,%r12
269	leaq	(%rbx,%r8,2),%r8
270	addq	%rcx,%r9
271	movq	%rdx,%rcx
272	adcq	$0,%rcx
273
274	mulq	%r11
275	shrq	$63,%r12
276	addq	%rax,%r10
277	movq	%r11,%rax
278	adcq	$0,%rdx
279	addq	%rcx,%r10
280	movq	%rdx,%r11
281	adcq	$0,%r11
282
283	movq	%r9,%rcx
284	leaq	(%r12,%r9,2),%r9
285
286	mulq	%rax
287	addq	%rax,%r15
288	adcq	%rdx,%r8
289	adcq	$0,%r9
290
291	movq	%r15,64(%rsp)
292	movq	%r8,72(%rsp)
293	shrq	$63,%rcx
294
295
296	movq	40(%rsi),%r12
297	movq	48(%rsi),%rax
298	mulq	%r12
299	addq	%rax,%r10
300	movq	56(%rsi),%rax
301	movq	%rdx,%rbx
302	adcq	$0,%rbx
303
304	mulq	%r12
305	addq	%rax,%r11
306	movq	%r12,%rax
307	movq	%r10,%r15
308	leaq	(%rcx,%r10,2),%r10
309	adcq	$0,%rdx
310	shrq	$63,%r15
311	addq	%rbx,%r11
312	movq	%rdx,%r12
313	adcq	$0,%r12
314
315	movq	%r11,%rbx
316	leaq	(%r15,%r11,2),%r11
317
318	mulq	%rax
319	addq	%rax,%r9
320	adcq	%rdx,%r10
321	adcq	$0,%r11
322
323	movq	%r9,80(%rsp)
324	movq	%r10,88(%rsp)
325
326
327	movq	48(%rsi),%r13
328	movq	56(%rsi),%rax
329	mulq	%r13
330	addq	%rax,%r12
331	movq	%r13,%rax
332	movq	%rdx,%r13
333	adcq	$0,%r13
334
335	xorq	%r14,%r14
336	shlq	$1,%rbx
337	adcq	%r12,%r12
338	adcq	%r13,%r13
339	adcq	%r14,%r14
340
341	mulq	%rax
342	addq	%rax,%r11
343	adcq	%rdx,%r12
344	adcq	$0,%r13
345
346	movq	%r11,96(%rsp)
347	movq	%r12,104(%rsp)
348
349
350	movq	56(%rsi),%rax
351	mulq	%rax
352	addq	%rax,%r13
353	adcq	$0,%rdx
354
355	addq	%rdx,%r14
356
357	movq	%r13,112(%rsp)
358	movq	%r14,120(%rsp)
359
360	movq	(%rsp),%r8
361	movq	8(%rsp),%r9
362	movq	16(%rsp),%r10
363	movq	24(%rsp),%r11
364	movq	32(%rsp),%r12
365	movq	40(%rsp),%r13
366	movq	48(%rsp),%r14
367	movq	56(%rsp),%r15
368
369	call	__rsaz_512_reduce
370
371	addq	64(%rsp),%r8
372	adcq	72(%rsp),%r9
373	adcq	80(%rsp),%r10
374	adcq	88(%rsp),%r11
375	adcq	96(%rsp),%r12
376	adcq	104(%rsp),%r13
377	adcq	112(%rsp),%r14
378	adcq	120(%rsp),%r15
379	sbbq	%rcx,%rcx
380
381	call	__rsaz_512_subtract
382
383	movq	%r8,%rdx
384	movq	%r9,%rax
385	movl	128+8(%rsp),%r8d
386	movq	%rdi,%rsi
387
388	decl	%r8d
389	jnz	.Loop_sqr
390	jmp	.Lsqr_tail
391
392.align	32
393.Loop_sqrx:
394	movl	%r8d,128+8(%rsp)
395.byte	102,72,15,110,199
396.byte	102,72,15,110,205
397
398	mulxq	%rax,%r8,%r9
399
400	mulxq	16(%rsi),%rcx,%r10
401	xorq	%rbp,%rbp
402
403	mulxq	24(%rsi),%rax,%r11
404	adcxq	%rcx,%r9
405
406	mulxq	32(%rsi),%rcx,%r12
407	adcxq	%rax,%r10
408
409	mulxq	40(%rsi),%rax,%r13
410	adcxq	%rcx,%r11
411
412.byte	0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00
413	adcxq	%rax,%r12
414	adcxq	%rcx,%r13
415
416.byte	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00
417	adcxq	%rax,%r14
418	adcxq	%rbp,%r15
419
420	movq	%r9,%rcx
421	shldq	$1,%r8,%r9
422	shlq	$1,%r8
423
424	xorl	%ebp,%ebp
425	mulxq	%rdx,%rax,%rdx
426	adcxq	%rdx,%r8
427	movq	8(%rsi),%rdx
428	adcxq	%rbp,%r9
429
430	movq	%rax,(%rsp)
431	movq	%r8,8(%rsp)
432
433
434	mulxq	16(%rsi),%rax,%rbx
435	adoxq	%rax,%r10
436	adcxq	%rbx,%r11
437
438.byte	0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00
439	adoxq	%rdi,%r11
440	adcxq	%r8,%r12
441
442	mulxq	32(%rsi),%rax,%rbx
443	adoxq	%rax,%r12
444	adcxq	%rbx,%r13
445
446	mulxq	40(%rsi),%rdi,%r8
447	adoxq	%rdi,%r13
448	adcxq	%r8,%r14
449
450.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00
451	adoxq	%rax,%r14
452	adcxq	%rbx,%r15
453
454.byte	0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00
455	adoxq	%rdi,%r15
456	adcxq	%rbp,%r8
457	adoxq	%rbp,%r8
458
459	movq	%r11,%rbx
460	shldq	$1,%r10,%r11
461	shldq	$1,%rcx,%r10
462
463	xorl	%ebp,%ebp
464	mulxq	%rdx,%rax,%rcx
465	movq	16(%rsi),%rdx
466	adcxq	%rax,%r9
467	adcxq	%rcx,%r10
468	adcxq	%rbp,%r11
469
470	movq	%r9,16(%rsp)
471.byte	0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00
472
473
474.byte	0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00
475	adoxq	%rdi,%r12
476	adcxq	%r9,%r13
477
478	mulxq	32(%rsi),%rax,%rcx
479	adoxq	%rax,%r13
480	adcxq	%rcx,%r14
481
482	mulxq	40(%rsi),%rdi,%r9
483	adoxq	%rdi,%r14
484	adcxq	%r9,%r15
485
486.byte	0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00
487	adoxq	%rax,%r15
488	adcxq	%rcx,%r8
489
490.byte	0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00
491	adoxq	%rdi,%r8
492	adcxq	%rbp,%r9
493	adoxq	%rbp,%r9
494
495	movq	%r13,%rcx
496	shldq	$1,%r12,%r13
497	shldq	$1,%rbx,%r12
498
499	xorl	%ebp,%ebp
500	mulxq	%rdx,%rax,%rdx
501	adcxq	%rax,%r11
502	adcxq	%rdx,%r12
503	movq	24(%rsi),%rdx
504	adcxq	%rbp,%r13
505
506	movq	%r11,32(%rsp)
507.byte	0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00
508
509
510.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00
511	adoxq	%rax,%r14
512	adcxq	%rbx,%r15
513
514	mulxq	40(%rsi),%rdi,%r10
515	adoxq	%rdi,%r15
516	adcxq	%r10,%r8
517
518	mulxq	48(%rsi),%rax,%rbx
519	adoxq	%rax,%r8
520	adcxq	%rbx,%r9
521
522	mulxq	56(%rsi),%rdi,%r10
523	adoxq	%rdi,%r9
524	adcxq	%rbp,%r10
525	adoxq	%rbp,%r10
526
527.byte	0x66
528	movq	%r15,%rbx
529	shldq	$1,%r14,%r15
530	shldq	$1,%rcx,%r14
531
532	xorl	%ebp,%ebp
533	mulxq	%rdx,%rax,%rdx
534	adcxq	%rax,%r13
535	adcxq	%rdx,%r14
536	movq	32(%rsi),%rdx
537	adcxq	%rbp,%r15
538
539	movq	%r13,48(%rsp)
540	movq	%r14,56(%rsp)
541
542
543.byte	0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00
544	adoxq	%rdi,%r8
545	adcxq	%r11,%r9
546
547	mulxq	48(%rsi),%rax,%rcx
548	adoxq	%rax,%r9
549	adcxq	%rcx,%r10
550
551	mulxq	56(%rsi),%rdi,%r11
552	adoxq	%rdi,%r10
553	adcxq	%rbp,%r11
554	adoxq	%rbp,%r11
555
556	movq	%r9,%rcx
557	shldq	$1,%r8,%r9
558	shldq	$1,%rbx,%r8
559
560	xorl	%ebp,%ebp
561	mulxq	%rdx,%rax,%rdx
562	adcxq	%rax,%r15
563	adcxq	%rdx,%r8
564	movq	40(%rsi),%rdx
565	adcxq	%rbp,%r9
566
567	movq	%r15,64(%rsp)
568	movq	%r8,72(%rsp)
569
570
571.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00
572	adoxq	%rax,%r10
573	adcxq	%rbx,%r11
574
575.byte	0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00
576	adoxq	%rdi,%r11
577	adcxq	%rbp,%r12
578	adoxq	%rbp,%r12
579
580	movq	%r11,%rbx
581	shldq	$1,%r10,%r11
582	shldq	$1,%rcx,%r10
583
584	xorl	%ebp,%ebp
585	mulxq	%rdx,%rax,%rdx
586	adcxq	%rax,%r9
587	adcxq	%rdx,%r10
588	movq	48(%rsi),%rdx
589	adcxq	%rbp,%r11
590
591	movq	%r9,80(%rsp)
592	movq	%r10,88(%rsp)
593
594
595.byte	0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00
596	adoxq	%rax,%r12
597	adoxq	%rbp,%r13
598
599	xorq	%r14,%r14
600	shldq	$1,%r13,%r14
601	shldq	$1,%r12,%r13
602	shldq	$1,%rbx,%r12
603
604	xorl	%ebp,%ebp
605	mulxq	%rdx,%rax,%rdx
606	adcxq	%rax,%r11
607	adcxq	%rdx,%r12
608	movq	56(%rsi),%rdx
609	adcxq	%rbp,%r13
610
611.byte	0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00
612.byte	0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00
613
614
615	mulxq	%rdx,%rax,%rdx
616	adoxq	%rax,%r13
617	adoxq	%rbp,%rdx
618
619.byte	0x66
620	addq	%rdx,%r14
621
622	movq	%r13,112(%rsp)
623	movq	%r14,120(%rsp)
624.byte	102,72,15,126,199
625.byte	102,72,15,126,205
626
627	movq	128(%rsp),%rdx
628	movq	(%rsp),%r8
629	movq	8(%rsp),%r9
630	movq	16(%rsp),%r10
631	movq	24(%rsp),%r11
632	movq	32(%rsp),%r12
633	movq	40(%rsp),%r13
634	movq	48(%rsp),%r14
635	movq	56(%rsp),%r15
636
637	call	__rsaz_512_reducex
638
639	addq	64(%rsp),%r8
640	adcq	72(%rsp),%r9
641	adcq	80(%rsp),%r10
642	adcq	88(%rsp),%r11
643	adcq	96(%rsp),%r12
644	adcq	104(%rsp),%r13
645	adcq	112(%rsp),%r14
646	adcq	120(%rsp),%r15
647	sbbq	%rcx,%rcx
648
649	call	__rsaz_512_subtract
650
651	movq	%r8,%rdx
652	movq	%r9,%rax
653	movl	128+8(%rsp),%r8d
654	movq	%rdi,%rsi
655
656	decl	%r8d
657	jnz	.Loop_sqrx
658
659.Lsqr_tail:
660
661	leaq	128+24+48(%rsp),%rax
662	movq	-48(%rax),%r15
663	movq	-40(%rax),%r14
664	movq	-32(%rax),%r13
665	movq	-24(%rax),%r12
666	movq	-16(%rax),%rbp
667	movq	-8(%rax),%rbx
668	leaq	(%rax),%rsp
669.Lsqr_epilogue:
670	.byte	0xf3,0xc3
671.size	rsaz_512_sqr,.-rsaz_512_sqr
/*
 * rsaz_512_mul(ret, a, b, mod, n0)
 * SysV AMD64: %rdi = ret[8], %rsi = a[8], %rdx = b[8], %rcx = mod[8],
 * %r8 = n0 (-mod^-1 mod 2^64).  (Names inferred -- confirm against
 * rsaz_exp.c.)  ret = a * b * R^-1 mod `mod' (Montgomery product).
 *
 * %rdi and %rcx are stashed in %xmm0/%xmm1 across the raw multiply and
 * restored before reduction; n0 lives at 128(%rsp) (read as 128+8(%rsp)
 * inside the called helpers because of their pushed return address).
 */
.globl	rsaz_512_mul
.type	rsaz_512_mul,@function
.align	32
rsaz_512_mul:
	pushq	%rbx			/* save callee-saved registers */
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	subq	$128+24,%rsp
.Lmul_body:
.byte	102,72,15,110,199		/* movq %rdi,%xmm0: stash output ptr */
.byte	102,72,15,110,201		/* movq %rcx,%xmm1: stash modulus ptr */
	movq	%r8,128(%rsp)		/* stash n0 */
	/* dispatch on BMI2+ADX capability bits, as in rsaz_512_sqr */
	movl	$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	$0x80100,%r11d
	je	.Lmulx
	movq	(%rdx),%rbx		/* b[0] */
	movq	%rdx,%rbp		/* %rbp = b, consumed by __rsaz_512_mul */
	call	__rsaz_512_mul		/* 16-limb product -> (%rsp)..120(%rsp) */

.byte	102,72,15,126,199		/* movq %xmm0,%rdi: restore output ptr */
.byte	102,72,15,126,205		/* movq %xmm1,%rbp: %rbp = modulus now */

	movq	(%rsp),%r8		/* low half of product into registers */
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	call	__rsaz_512_reduce
	jmp	.Lmul_tail

.align	32
.Lmulx:
	movq	%rdx,%rbp		/* %rbp = b for __rsaz_512_mulx */
	movq	(%rdx),%rdx		/* b[0] as mulx multiplier */
	call	__rsaz_512_mulx

.byte	102,72,15,126,199		/* movq %xmm0,%rdi */
.byte	102,72,15,126,205		/* movq %xmm1,%rbp: modulus */

	movq	128(%rsp),%rdx		/* n0 for __rsaz_512_reducex */
	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	call	__rsaz_512_reducex
.Lmul_tail:
	/* add high half of the product, then conditionally subtract mod */
	addq	64(%rsp),%r8
	adcq	72(%rsp),%r9
	adcq	80(%rsp),%r10
	adcq	88(%rsp),%r11
	adcq	96(%rsp),%r12
	adcq	104(%rsp),%r13
	adcq	112(%rsp),%r14
	adcq	120(%rsp),%r15
	sbbq	%rcx,%rcx		/* %rcx = -carry mask */

	call	__rsaz_512_subtract

	leaq	128+24+48(%rsp),%rax	/* unwind frame + 6 saved registers */
	movq	-48(%rax),%r15
	movq	-40(%rax),%r14
	movq	-32(%rax),%r13
	movq	-24(%rax),%r12
	movq	-16(%rax),%rbp
	movq	-8(%rax),%rbx
	leaq	(%rax),%rsp
.Lmul_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
.size	rsaz_512_mul,.-rsaz_512_mul
/*
 * rsaz_512_mul_gather4(ret, a, tbl, mod, n0, power)
 * SysV AMD64: %rdi = ret[8], %rsi = a[8], %rdx = table base, %rcx = mod[8],
 * %r8 = n0, %r9d = table index.  (Names inferred -- confirm against
 * rsaz_exp.c.)  ret = a * tbl[power] * R^-1 mod `mod'.
 *
 * The table entry is gathered with SSE mask/AND/OR over *every* table
 * element (entries interleaved with a 128-byte stride), so the memory
 * access pattern is independent of `power' -- presumably a cache-timing
 * side-channel countermeasure.  .Linc is a constant defined elsewhere in
 * this file (outside this view).
 */
.globl	rsaz_512_mul_gather4
.type	rsaz_512_mul_gather4,@function
.align	32
rsaz_512_mul_gather4:
	pushq	%rbx			/* save callee-saved registers */
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	subq	$152,%rsp
.Lmul_gather4_body:
	/* build 8 compare masks: xmm0..xmm7 = (index == power) ? ~0 : 0 */
	movd	%r9d,%xmm8
	movdqa	.Linc+16(%rip),%xmm1
	movdqa	.Linc(%rip),%xmm0

	pshufd	$0,%xmm8,%xmm8		/* broadcast power */
	movdqa	%xmm1,%xmm7
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm8,%xmm0
	movdqa	%xmm7,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm8,%xmm1
	movdqa	%xmm7,%xmm4
	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm8,%xmm2
	movdqa	%xmm7,%xmm5
	paddd	%xmm3,%xmm4
	pcmpeqd	%xmm8,%xmm3
	movdqa	%xmm7,%xmm6
	paddd	%xmm4,%xmm5
	pcmpeqd	%xmm8,%xmm4
	paddd	%xmm5,%xmm6
	pcmpeqd	%xmm8,%xmm5
	paddd	%xmm6,%xmm7
	pcmpeqd	%xmm8,%xmm6
	pcmpeqd	%xmm8,%xmm7

	/* gather b[0]: mask-AND all 8 candidate chunks, OR them together */
	movdqa	0(%rdx),%xmm8
	movdqa	16(%rdx),%xmm9
	movdqa	32(%rdx),%xmm10
	movdqa	48(%rdx),%xmm11
	pand	%xmm0,%xmm8
	movdqa	64(%rdx),%xmm12
	pand	%xmm1,%xmm9
	movdqa	80(%rdx),%xmm13
	pand	%xmm2,%xmm10
	movdqa	96(%rdx),%xmm14
	pand	%xmm3,%xmm11
	movdqa	112(%rdx),%xmm15
	leaq	128(%rdx),%rbp		/* %rbp -> next table row */
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	$0x4e,%xmm8,%xmm9	/* fold high qword onto low */
	por	%xmm9,%xmm8
	/* dispatch on BMI2+ADX capability bits */
	movl	$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	$0x80100,%r11d
	je	.Lmulx_gather
.byte	102,76,15,126,195		/* movq %xmm8,%rbx: gathered b[0] */

	movq	%r8,128(%rsp)		/* stash n0, ret, mod */
	movq	%rdi,128+8(%rsp)
	movq	%rcx,128+16(%rsp)

	/* first row: a[0..7] * b[0], low limb to (%rsp) */
	movq	(%rsi),%rax
	movq	8(%rsi),%rcx
	mulq	%rbx
	movq	%rax,(%rsp)
	movq	%rcx,%rax
	movq	%rdx,%r8

	mulq	%rbx
	addq	%rax,%r8
	movq	16(%rsi),%rax
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r9
	movq	24(%rsi),%rax
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r10
	movq	32(%rsi),%rax
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r11
	movq	40(%rsi),%rax
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r12
	movq	48(%rsi),%rax
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r13
	movq	56(%rsi),%rax
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	addq	%rax,%r14
	movq	(%rsi),%rax
	movq	%rdx,%r15
	adcq	$0,%r15

	leaq	8(%rsp),%rdi		/* product cursor */
	movl	$7,%ecx			/* 7 remaining rows */
	jmp	.Loop_mul_gather

.align	32
.Loop_mul_gather:
	/* gather next b[i] exactly as above, from the advanced row %rbp */
	movdqa	0(%rbp),%xmm8
	movdqa	16(%rbp),%xmm9
	movdqa	32(%rbp),%xmm10
	movdqa	48(%rbp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	64(%rbp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	80(%rbp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	96(%rbp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	112(%rbp),%xmm15
	leaq	128(%rbp),%rbp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
.byte	102,76,15,126,195		/* movq %xmm8,%rbx: gathered b[i] */

	/* multiply-accumulate row: acc = acc + a[0..7]*b[i], shift one limb out */
	mulq	%rbx
	addq	%rax,%r8
	movq	8(%rsi),%rax
	movq	%r8,(%rdi)
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rsi),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rsi),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rsi),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	addq	%rax,%r15
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	leaq	8(%rdi),%rdi

	decl	%ecx
	jnz	.Loop_mul_gather

	/* store the high 8 limbs of the 16-limb product */
	movq	%r8,(%rdi)
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	movq	128+8(%rsp),%rdi	/* restore ret */
	movq	128+16(%rsp),%rbp	/* %rbp = modulus */

	movq	(%rsp),%r8		/* low half for reduction */
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	call	__rsaz_512_reduce
	jmp	.Lmul_gather_tail

.align	32
.Lmulx_gather:
.byte	102,76,15,126,194		/* movq %xmm8,%rdx: gathered b[0] */

	movq	%r8,128(%rsp)		/* stash n0, ret, mod */
	movq	%rdi,128+8(%rsp)
	movq	%rcx,128+16(%rsp)

	/* first row with mulx: a[0..7] * b[0] */
	mulxq	(%rsi),%rbx,%r8
	movq	%rbx,(%rsp)
	xorl	%edi,%edi		/* %rdi = 0, clear CF/OF */

	mulxq	8(%rsi),%rax,%r9

	mulxq	16(%rsi),%rbx,%r10
	adcxq	%rax,%r8

	mulxq	24(%rsi),%rax,%r11
	adcxq	%rbx,%r9

	mulxq	32(%rsi),%rbx,%r12
	adcxq	%rax,%r10

	mulxq	40(%rsi),%rax,%r13
	adcxq	%rbx,%r11

	mulxq	48(%rsi),%rbx,%r14
	adcxq	%rax,%r12

	mulxq	56(%rsi),%rax,%r15
	adcxq	%rbx,%r13
	adcxq	%rax,%r14
.byte	0x67				/* padding prefix (alignment) */
	movq	%r8,%rbx
	adcxq	%rdi,%r15

	movq	$-7,%rcx		/* counts -7..-1: 7 remaining rows */
	jmp	.Loop_mulx_gather

.align	32
.Loop_mulx_gather:
	/* gather next b[i] (same constant-access-pattern scheme) */
	movdqa	0(%rbp),%xmm8
	movdqa	16(%rbp),%xmm9
	movdqa	32(%rbp),%xmm10
	movdqa	48(%rbp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	64(%rbp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	80(%rbp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	96(%rbp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	112(%rbp),%xmm15
	leaq	128(%rbp),%rbp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
.byte	102,76,15,126,194		/* movq %xmm8,%rdx: gathered b[i] */

	/* dual-carry-chain multiply-accumulate row */
.byte	0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00	/* mulxq (%rsi),%rax,%r8 */
	adcxq	%rax,%rbx
	adoxq	%r9,%r8

	mulxq	8(%rsi),%rax,%r9
	adcxq	%rax,%r8
	adoxq	%r10,%r9

	mulxq	16(%rsi),%rax,%r10
	adcxq	%rax,%r9
	adoxq	%r11,%r10

.byte	0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00	/* mulxq 24(%rsi),%rax,%r11 */
	adcxq	%rax,%r10
	adoxq	%r12,%r11

	mulxq	32(%rsi),%rax,%r12
	adcxq	%rax,%r11
	adoxq	%r13,%r12

	mulxq	40(%rsi),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	/* mulxq 48(%rsi),%rax,%r14 */
	adcxq	%rax,%r13
.byte	0x67				/* padding prefix (alignment) */
	adoxq	%r15,%r14

	mulxq	56(%rsi),%rax,%r15
	movq	%rbx,64(%rsp,%rcx,8)	/* emit finished limb (rcx in -7..-1) */
	adcxq	%rax,%r14
	adoxq	%rdi,%r15
	movq	%r8,%rbx
	adcxq	%rdi,%r15

	incq	%rcx
	jnz	.Loop_mulx_gather

	/* high half of the product */
	movq	%r8,64(%rsp)
	movq	%r9,64+8(%rsp)
	movq	%r10,64+16(%rsp)
	movq	%r11,64+24(%rsp)
	movq	%r12,64+32(%rsp)
	movq	%r13,64+40(%rsp)
	movq	%r14,64+48(%rsp)
	movq	%r15,64+56(%rsp)

	movq	128(%rsp),%rdx		/* n0 for __rsaz_512_reducex */
	movq	128+8(%rsp),%rdi	/* restore ret */
	movq	128+16(%rsp),%rbp	/* %rbp = modulus */

	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	call	__rsaz_512_reducex

.Lmul_gather_tail:
	/* add high half, conditional final subtraction, restore registers */
	addq	64(%rsp),%r8
	adcq	72(%rsp),%r9
	adcq	80(%rsp),%r10
	adcq	88(%rsp),%r11
	adcq	96(%rsp),%r12
	adcq	104(%rsp),%r13
	adcq	112(%rsp),%r14
	adcq	120(%rsp),%r15
	sbbq	%rcx,%rcx		/* %rcx = -carry mask */

	call	__rsaz_512_subtract

	leaq	128+24+48(%rsp),%rax
	movq	-48(%rax),%r15
	movq	-40(%rax),%r14
	movq	-32(%rax),%r13
	movq	-24(%rax),%r12
	movq	-16(%rax),%rbp
	movq	-8(%rax),%rbx
	leaq	(%rax),%rsp
.Lmul_gather4_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
.size	rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
/*
 * rsaz_512_mul_scatter4(out, a, mod, n0, tbl, power)
 * SysV AMD64: %rdi = out[8] (also the second multiplicand -- the code
 * multiplies *%rdi by *%rsi), %rsi = a[8], %rdx = mod[8], %rcx = n0,
 * %r8 = scatter table base, %r9d = table index.  (Names inferred --
 * confirm against rsaz_exp.c.)
 *
 * Computes out = out * a * R^-1 mod `mod' and additionally scatters the
 * result into tbl[power] with a 128-byte stride between limbs.
 */
.globl	rsaz_512_mul_scatter4
.type	rsaz_512_mul_scatter4,@function
.align	32
rsaz_512_mul_scatter4:
	pushq	%rbx			/* save callee-saved registers */
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	movl	%r9d,%r9d		/* zero-extend power to 64 bits */
	subq	$128+24,%rsp
.Lmul_scatter4_body:
	leaq	(%r8,%r9,8),%r8		/* %r8 -> tbl entry for `power' */
.byte	102,72,15,110,199		/* movq %rdi,%xmm0: stash out ptr */
.byte	102,72,15,110,202		/* movq %rdx,%xmm1: stash modulus ptr */
.byte	102,73,15,110,208		/* movq %r8,%xmm2: stash scatter ptr */
	movq	%rcx,128(%rsp)		/* stash n0 */

	movq	%rdi,%rbp		/* %rbp = out: second multiplicand */
	/* dispatch on BMI2+ADX capability bits */
	movl	$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	$0x80100,%r11d
	je	.Lmulx_scatter
	movq	(%rdi),%rbx		/* out[0] */
	call	__rsaz_512_mul		/* product -> (%rsp)..120(%rsp) */

.byte	102,72,15,126,199		/* movq %xmm0,%rdi: restore out ptr */
.byte	102,72,15,126,205		/* movq %xmm1,%rbp: %rbp = modulus */

	movq	(%rsp),%r8		/* low half for reduction */
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	call	__rsaz_512_reduce
	jmp	.Lmul_scatter_tail

.align	32
.Lmulx_scatter:
	movq	(%rdi),%rdx		/* out[0] as mulx multiplier */
	call	__rsaz_512_mulx

.byte	102,72,15,126,199		/* movq %xmm0,%rdi */
.byte	102,72,15,126,205		/* movq %xmm1,%rbp: modulus */

	movq	128(%rsp),%rdx		/* n0 for __rsaz_512_reducex */
	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	call	__rsaz_512_reducex

.Lmul_scatter_tail:
	addq	64(%rsp),%r8		/* fold in high half of product */
	adcq	72(%rsp),%r9
	adcq	80(%rsp),%r10
	adcq	88(%rsp),%r11
	adcq	96(%rsp),%r12
	adcq	104(%rsp),%r13
	adcq	112(%rsp),%r14
	adcq	120(%rsp),%r15
.byte	102,72,15,126,214		/* movq %xmm2,%rsi: scatter destination */
	sbbq	%rcx,%rcx		/* %rcx = -carry mask */

	call	__rsaz_512_subtract	/* also writes result to (%rdi) */

	/* scatter result limbs into the table, 128-byte stride */
	movq	%r8,0(%rsi)
	movq	%r9,128(%rsi)
	movq	%r10,256(%rsi)
	movq	%r11,384(%rsi)
	movq	%r12,512(%rsi)
	movq	%r13,640(%rsi)
	movq	%r14,768(%rsi)
	movq	%r15,896(%rsi)

	leaq	128+24+48(%rsp),%rax	/* restore callee-saved registers */
	movq	-48(%rax),%r15
	movq	-40(%rax),%r14
	movq	-32(%rax),%r13
	movq	-24(%rax),%r12
	movq	-16(%rax),%rbp
	movq	-8(%rax),%rbx
	leaq	(%rax),%rsp
.Lmul_scatter4_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
.size	rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
/*
 * rsaz_512_mul_by_one(ret, a, mod, n0)
 * SysV AMD64: %rdi = ret[8], %rsi = a[8], %rdx = mod[8], %rcx = n0.
 * (Names inferred -- confirm against rsaz_exp.c.)
 *
 * Montgomery-reduces `a' with an implicit zero high half, i.e. presumably
 * converts out of Montgomery form (ret = a * R^-1 mod `mod') -- confirm
 * against the caller in rsaz_exp.c.  No final conditional subtraction is
 * performed here; the reduced limbs are stored directly.
 */
.globl	rsaz_512_mul_by_one
.type	rsaz_512_mul_by_one,@function
.align	32
rsaz_512_mul_by_one:
	pushq	%rbx			/* save callee-saved registers */
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	subq	$128+24,%rsp
.Lmul_by_one_body:
	movl	OPENSSL_ia32cap_P+8(%rip),%eax	/* capability word for dispatch */
	movq	%rdx,%rbp		/* %rbp = modulus */
	movq	%rcx,128(%rsp)		/* stash n0 */

	movq	(%rsi),%r8		/* load a[0..7] into the limb registers */
	pxor	%xmm0,%xmm0
	movq	8(%rsi),%r9
	movq	16(%rsi),%r10
	movq	24(%rsi),%r11
	movq	32(%rsi),%r12
	movq	40(%rsi),%r13
	movq	48(%rsi),%r14
	movq	56(%rsi),%r15

	/* zero the scratch product area on the stack */
	movdqa	%xmm0,(%rsp)
	movdqa	%xmm0,16(%rsp)
	movdqa	%xmm0,32(%rsp)
	movdqa	%xmm0,48(%rsp)
	movdqa	%xmm0,64(%rsp)
	movdqa	%xmm0,80(%rsp)
	movdqa	%xmm0,96(%rsp)
	andl	$0x80100,%eax		/* BMI2+ADX => mulx reduction */
	cmpl	$0x80100,%eax
	je	.Lby_one_callx
	call	__rsaz_512_reduce
	jmp	.Lby_one_tail
.align	32
.Lby_one_callx:
	movq	128(%rsp),%rdx		/* n0 for __rsaz_512_reducex */
	call	__rsaz_512_reducex
.Lby_one_tail:
	movq	%r8,(%rdi)		/* store reduced limbs */
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	leaq	128+24+48(%rsp),%rax	/* restore callee-saved registers */
	movq	-48(%rax),%r15
	movq	-40(%rax),%r14
	movq	-32(%rax),%r13
	movq	-24(%rax),%r12
	movq	-16(%rax),%rbp
	movq	-8(%rax),%rbx
	leaq	(%rax),%rsp
.Lmul_by_one_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
.size	rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
/*
 * __rsaz_512_reduce -- Montgomery reduction, mul/adc flavour.
 * In:  %r8..%r15 = low 8 limbs of the value to reduce, %rbp = modulus,
 *      caller's 128(%rsp) = n0 (read here as 128+8(%rsp): the call that
 *      got us here pushed a return address, moving %rsp down by 8).
 * Out: %r8..%r15 = reduced low half; the caller adds the high 8 limbs
 *      (still on its stack) afterwards.
 * Clobbers %rax, %rbx, %rcx, %rdx, %rsi.
 *
 * Each of the 8 iterations folds one limb: m = r8 * n0 mod 2^64, then
 * adds m * mod, which cancels the bottom limb by construction.
 */
.type	__rsaz_512_reduce,@function
.align	32
__rsaz_512_reduce:
	movq	%r8,%rbx
	imulq	128+8(%rsp),%rbx	/* %rbx = m = r8 * n0 mod 2^64 */
	movq	0(%rbp),%rax
	movl	$8,%ecx			/* 8 limbs to fold */
	jmp	.Lreduction_loop

.align	32
.Lreduction_loop:
	mulq	%rbx			/* m * mod[0]: low limb == -r8 mod 2^64 */
	movq	8(%rbp),%rax
	negq	%r8			/* discard low limb, keep just its carry */
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx			/* m * mod[1] */
	addq	%rax,%r9
	movq	16(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx			/* m * mod[2] */
	addq	%rax,%r10
	movq	24(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx			/* m * mod[3] */
	addq	%rax,%r11
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	128+8(%rsp),%rsi	/* n0, for next iteration's m */


	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbx			/* m * mod[4] */
	addq	%rax,%r12
	movq	40(%rbp),%rax
	adcq	$0,%rdx
	imulq	%r8,%rsi		/* precompute next m = r8 * n0 */
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx			/* m * mod[5] */
	addq	%rax,%r13
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx			/* m * mod[6] */
	addq	%rax,%r14
	movq	56(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx			/* m * mod[7] */
	movq	%rsi,%rbx		/* roll in the precomputed next m */
	addq	%rax,%r15
	movq	0(%rbp),%rax
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jne	.Lreduction_loop

	.byte	0xf3,0xc3		/* rep ret */
.size	__rsaz_512_reduce,.-__rsaz_512_reduce
/*
 * __rsaz_512_reducex -- Montgomery reduction, MULX/ADCX/ADOX flavour.
 * In:  %r8..%r15 = low 8 limbs of the value to reduce, %rbp = modulus,
 *      %rdx = n0; n0 is also re-read from the caller's 128(%rsp)
 *      (128+8(%rsp) here, after this call's pushed return address).
 * Out: %r8..%r15 = reduced low half.  Clobbers %rax,%rbx,%rcx,%rdx,%rsi.
 *
 * Same algorithm as __rsaz_512_reduce, using two interleaved carry
 * chains; %rdx alternates between the current m and n0.  The ".byte"
 * lines are hand-encoded mulx instructions (decoded alongside).
 */
.type	__rsaz_512_reducex,@function
.align	32
__rsaz_512_reducex:

	imulq	%r8,%rdx		/* %rdx = m = r8 * n0 mod 2^64 */
	xorq	%rsi,%rsi		/* zero, and clear CF/OF */
	movl	$8,%ecx			/* 8 limbs to fold */
	jmp	.Lreduction_loopx

.align	32
.Lreduction_loopx:
	movq	%r8,%rbx
	mulxq	0(%rbp),%rax,%r8	/* m * mod[0]; low limb cancels r8 */
	adcxq	%rbx,%rax
	adoxq	%r9,%r8

	mulxq	8(%rbp),%rax,%r9	/* m * mod[1] */
	adcxq	%rax,%r8
	adoxq	%r10,%r9

	mulxq	16(%rbp),%rbx,%r10	/* m * mod[2] */
	adcxq	%rbx,%r9
	adoxq	%r11,%r10

	mulxq	24(%rbp),%rbx,%r11	/* m * mod[3] */
	adcxq	%rbx,%r10
	adoxq	%r12,%r11

.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	/* mulxq 32(%rbp),%rbx,%r12 */
	movq	%rdx,%rax		/* save current m */
	movq	%r8,%rdx
	adcxq	%rbx,%r11
	adoxq	%r13,%r12

	mulxq	128+8(%rsp),%rbx,%rdx	/* next m = new r8 * n0 */
	movq	%rax,%rdx		/* restore current m */

	mulxq	40(%rbp),%rax,%r13	/* m * mod[5] */
	adcxq	%rax,%r12
	adoxq	%r14,%r13

.byte	0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00	/* mulxq 48(%rbp),%rax,%r14 */
	adcxq	%rax,%r13
	adoxq	%r15,%r14

	mulxq	56(%rbp),%rax,%r15	/* m * mod[7] */
	movq	%rbx,%rdx		/* roll in the next m */
	adcxq	%rax,%r14
	adoxq	%rsi,%r15
	adcxq	%rsi,%r15

	decl	%ecx
	jne	.Lreduction_loopx

	.byte	0xf3,0xc3		/* rep ret */
.size	__rsaz_512_reducex,.-__rsaz_512_reducex
/*
 * __rsaz_512_subtract -- conditional final subtraction of the modulus.
 * In:  %r8..%r15 = result limbs, %rdi = output, %rbp = modulus,
 *      %rcx = mask, 0 or all-ones (callers set it with sbbq %rcx,%rcx
 *      from the carry of their final addition).
 * Out: (%rdi) = result - (mask ? mod : 0); %r8..%r15 updated likewise.
 *
 * -mod is formed limb-wise as neg(mod[0]), not(mod[1..7]) -- two's
 * complement with the +1 folded into limb 0 -- each limb ANDed with the
 * mask, then added with carry.  The store/reload through (%rdi) keeps
 * the sequence branch-free.
 */
.type	__rsaz_512_subtract,@function
.align	32
__rsaz_512_subtract:
	movq	%r8,(%rdi)		/* write unadjusted result first */
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	/* build (-mod) & mask limb by limb */
	movq	0(%rbp),%r8
	movq	8(%rbp),%r9
	negq	%r8
	notq	%r9
	andq	%rcx,%r8
	movq	16(%rbp),%r10
	andq	%rcx,%r9
	notq	%r10
	movq	24(%rbp),%r11
	andq	%rcx,%r10
	notq	%r11
	movq	32(%rbp),%r12
	andq	%rcx,%r11
	notq	%r12
	movq	40(%rbp),%r13
	andq	%rcx,%r12
	notq	%r13
	movq	48(%rbp),%r14
	andq	%rcx,%r13
	notq	%r14
	movq	56(%rbp),%r15
	andq	%rcx,%r14
	notq	%r15
	andq	%rcx,%r15

	/* result += (-mod) & mask */
	addq	(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15

	movq	%r8,(%rdi)		/* write adjusted result */
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	.byte	0xf3,0xc3		/* rep ret */
.size	__rsaz_512_subtract,.-__rsaz_512_subtract
/*
 * __rsaz_512_mul -- schoolbook 8x8-limb multiply, mul/adc flavour.
 * In:  %rsi = a[8], %rbp = b[8], %rbx = b[0] (preloaded by caller).
 * Out: 16-limb product at 8(%rsp)..128(%rsp) in this frame, i.e. at
 *      (%rsp)..120(%rsp) in the caller's frame (offset by this call's
 *      pushed return address).  %rbp is advanced past b.
 * Clobbers %rax, %rcx, %rdx, %rdi, %r8..%r15.
 */
.type	__rsaz_512_mul,@function
.align	32
__rsaz_512_mul:
	leaq	8(%rsp),%rdi		/* product cursor (caller's (%rsp)) */

	/* row 0: a[0..7] * b[0]; low limb straight to memory */
	movq	(%rsi),%rax
	mulq	%rbx
	movq	%rax,(%rdi)
	movq	8(%rsi),%rax
	movq	%rdx,%r8

	mulq	%rbx
	addq	%rax,%r8
	movq	16(%rsi),%rax
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r9
	movq	24(%rsi),%rax
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r10
	movq	32(%rsi),%rax
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r11
	movq	40(%rsi),%rax
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r12
	movq	48(%rsi),%rax
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r13
	movq	56(%rsi),%rax
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	addq	%rax,%r14
	movq	(%rsi),%rax		/* reload a[0] for the next row */
	movq	%rdx,%r15
	adcq	$0,%r15

	leaq	8(%rbp),%rbp		/* advance to b[1] */
	leaq	8(%rdi),%rdi

	movl	$7,%ecx			/* 7 remaining rows */
	jmp	.Loop_mul

.align	32
.Loop_mul:
	/* row i: acc += a[0..7]*b[i]; lowest finished limb goes to memory */
	movq	(%rbp),%rbx
	mulq	%rbx
	addq	%rax,%r8
	movq	8(%rsi),%rax
	movq	%r8,(%rdi)
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rsi),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rsi),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rsi),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	leaq	8(%rbp),%rbp		/* advance to b[i+1] */
	adcq	$0,%r14

	mulq	%rbx
	addq	%rax,%r15
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	leaq	8(%rdi),%rdi

	decl	%ecx
	jnz	.Loop_mul

	/* store the high 8 limbs still held in registers */
	movq	%r8,(%rdi)
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	.byte	0xf3,0xc3		/* rep ret */
.size	__rsaz_512_mul,.-__rsaz_512_mul
/*
 * __rsaz_512_mulx -- schoolbook 8x8-limb multiply, MULX/ADCX/ADOX flavour.
 * In:  %rsi = a[8], %rbp = b[8], %rdx = b[0] (preloaded by caller).
 * Out: 16-limb product at 8(%rsp)..128(%rsp) in this frame, i.e. at
 *      (%rsp)..120(%rsp) in the caller's frame.
 * Clobbers %rax, %rbx, %rcx, %rdx, %rdi, %r8..%r15.
 * The ".byte" lines are hand-encoded mulx instructions (decoded alongside).
 */
.type	__rsaz_512_mulx,@function
.align	32
__rsaz_512_mulx:
	/* row 0: a[0..7] * b[0] (plain adc chain suffices here) */
	mulxq	(%rsi),%rbx,%r8
	movq	$-6,%rcx		/* loop counter: b[1]..b[6] via -6..-1 */

	mulxq	8(%rsi),%rax,%r9
	movq	%rbx,8(%rsp)		/* product limb 0 (caller's (%rsp)) */

	mulxq	16(%rsi),%rbx,%r10
	adcq	%rax,%r8

	mulxq	24(%rsi),%rax,%r11
	adcq	%rbx,%r9

	mulxq	32(%rsi),%rbx,%r12
	adcq	%rax,%r10

	mulxq	40(%rsi),%rax,%r13
	adcq	%rbx,%r11

	mulxq	48(%rsi),%rbx,%r14
	adcq	%rax,%r12

	mulxq	56(%rsi),%rax,%r15
	movq	8(%rbp),%rdx		/* next multiplier: b[1] */
	adcq	%rbx,%r13
	adcq	%rax,%r14
	adcq	$0,%r15

	xorq	%rdi,%rdi		/* zero, and clear CF/OF */
	jmp	.Loop_mulx

.align	32
.Loop_mulx:
	/* row i: dual-carry-chain acc += a[0..7]*b[i], emit lowest limb */
	movq	%r8,%rbx
	mulxq	(%rsi),%rax,%r8
	adcxq	%rax,%rbx
	adoxq	%r9,%r8

	mulxq	8(%rsi),%rax,%r9
	adcxq	%rax,%r8
	adoxq	%r10,%r9

	mulxq	16(%rsi),%rax,%r10
	adcxq	%rax,%r9
	adoxq	%r11,%r10

	mulxq	24(%rsi),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11

.byte	0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00	/* mulxq 32(%rsi),%rax,%r12 */
	adcxq	%rax,%r11
	adoxq	%r13,%r12

	mulxq	40(%rsi),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

	mulxq	48(%rsi),%rax,%r14
	adcxq	%rax,%r13
	adoxq	%r15,%r14

	mulxq	56(%rsi),%rax,%r15
	movq	64(%rbp,%rcx,8),%rdx	/* next multiplier b[i+1] (rcx<0) */
	movq	%rbx,8+64-8(%rsp,%rcx,8)	/* emit finished limb */
	adcxq	%rax,%r14
	adoxq	%rdi,%r15
	adcxq	%rdi,%r15

	incq	%rcx
	jnz	.Loop_mulx

	/* final row: b[7] (no further multiplier to load) */
	movq	%r8,%rbx
	mulxq	(%rsi),%rax,%r8
	adcxq	%rax,%rbx
	adoxq	%r9,%r8

.byte	0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00	/* mulxq 8(%rsi),%rax,%r9 */
	adcxq	%rax,%r8
	adoxq	%r10,%r9

.byte	0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00	/* mulxq 16(%rsi),%rax,%r10 */
	adcxq	%rax,%r9
	adoxq	%r11,%r10

	mulxq	24(%rsi),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11

	mulxq	32(%rsi),%rax,%r12
	adcxq	%rax,%r11
	adoxq	%r13,%r12

	mulxq	40(%rsi),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	/* mulxq 48(%rsi),%rax,%r14 */
	adcxq	%rax,%r13
	adoxq	%r15,%r14

.byte	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00	/* mulxq 56(%rsi),%rax,%r15 */
	adcxq	%rax,%r14
	adoxq	%rdi,%r15
	adcxq	%rdi,%r15

	/* store product limbs 7..15 */
	movq	%rbx,8+64-8(%rsp)
	movq	%r8,8+64(%rsp)
	movq	%r9,8+64+8(%rsp)
	movq	%r10,8+64+16(%rsp)
	movq	%r11,8+64+24(%rsp)
	movq	%r12,8+64+32(%rsp)
	movq	%r13,8+64+40(%rsp)
	movq	%r14,8+64+48(%rsp)
	movq	%r15,8+64+56(%rsp)

	.byte	0xf3,0xc3		/* rep ret */
.size	__rsaz_512_mulx,.-__rsaz_512_mulx
/*
 * void rsaz_512_scatter4(void *tbl, const void *inp, int power)
 *
 * Store the 8 qwords at inp into table column `power`: qword i goes to
 * tbl + power*8 + i*128.  Consecutive qwords of one entry are 128 bytes
 * apart, the interleaved layout that rsaz_512_gather4 reads back.
 *
 *	%rdi = tbl, %rsi = inp, %rdx = power
 */
.globl	rsaz_512_scatter4
.type	rsaz_512_scatter4,@function
.align	16
rsaz_512_scatter4:
	leaq	(%rdi,%rdx,8),%rdi	/* point at column `power` */
	xorl	%eax,%eax		/* qword index i = 0 */
.align	16
.Loop_scatter:
	movq	(%rsi,%rax,8),%r9	/* fetch inp[i] */
	movq	%r9,(%rdi)		/* store into row i of the column */
	leaq	128(%rdi),%rdi		/* advance one 128-byte row */
	addl	$1,%eax
	cmpl	$8,%eax
	jb	.Loop_scatter		/* all 8 qwords of the entry */
	.byte	0xf3,0xc3		/* rep ret */
.size	rsaz_512_scatter4,.-rsaz_512_scatter4
1801
1802.globl	rsaz_512_gather4
1803.type	rsaz_512_gather4,@function
1804.align	16
1805rsaz_512_gather4:
1806	movd	%edx,%xmm8
1807	movdqa	.Linc+16(%rip),%xmm1
1808	movdqa	.Linc(%rip),%xmm0
1809
1810	pshufd	$0,%xmm8,%xmm8
1811	movdqa	%xmm1,%xmm7
1812	movdqa	%xmm1,%xmm2
1813	paddd	%xmm0,%xmm1
1814	pcmpeqd	%xmm8,%xmm0
1815	movdqa	%xmm7,%xmm3
1816	paddd	%xmm1,%xmm2
1817	pcmpeqd	%xmm8,%xmm1
1818	movdqa	%xmm7,%xmm4
1819	paddd	%xmm2,%xmm3
1820	pcmpeqd	%xmm8,%xmm2
1821	movdqa	%xmm7,%xmm5
1822	paddd	%xmm3,%xmm4
1823	pcmpeqd	%xmm8,%xmm3
1824	movdqa	%xmm7,%xmm6
1825	paddd	%xmm4,%xmm5
1826	pcmpeqd	%xmm8,%xmm4
1827	paddd	%xmm5,%xmm6
1828	pcmpeqd	%xmm8,%xmm5
1829	paddd	%xmm6,%xmm7
1830	pcmpeqd	%xmm8,%xmm6
1831	pcmpeqd	%xmm8,%xmm7
1832	movl	$8,%r9d
1833	jmp	.Loop_gather
1834.align	16
1835.Loop_gather:
1836	movdqa	0(%rsi),%xmm8
1837	movdqa	16(%rsi),%xmm9
1838	movdqa	32(%rsi),%xmm10
1839	movdqa	48(%rsi),%xmm11
1840	pand	%xmm0,%xmm8
1841	movdqa	64(%rsi),%xmm12
1842	pand	%xmm1,%xmm9
1843	movdqa	80(%rsi),%xmm13
1844	pand	%xmm2,%xmm10
1845	movdqa	96(%rsi),%xmm14
1846	pand	%xmm3,%xmm11
1847	movdqa	112(%rsi),%xmm15
1848	leaq	128(%rsi),%rsi
1849	pand	%xmm4,%xmm12
1850	pand	%xmm5,%xmm13
1851	pand	%xmm6,%xmm14
1852	pand	%xmm7,%xmm15
1853	por	%xmm10,%xmm8
1854	por	%xmm11,%xmm9
1855	por	%xmm12,%xmm8
1856	por	%xmm13,%xmm9
1857	por	%xmm14,%xmm8
1858	por	%xmm15,%xmm9
1859
1860	por	%xmm9,%xmm8
1861	pshufd	$0x4e,%xmm8,%xmm9
1862	por	%xmm9,%xmm8
1863	movq	%xmm8,(%rdi)
1864	leaq	8(%rdi),%rdi
1865	decl	%r9d
1866	jnz	.Loop_gather
1867	.byte	0xf3,0xc3
1868.LSEH_end_rsaz_512_gather4:
1869.size	rsaz_512_gather4,.-rsaz_512_gather4
1870
.align	64
/*
 * .Linc: constants for rsaz_512_gather4's mask setup.
 * First row  {0,0,1,1}: initial per-dword entry indices (two qword lanes).
 * Second row {2,2,2,2}: increment applied to step to the next index pair.
 */
.Linc:
.long	0,0, 1,1
.long	2,2, 2,2
1875