/* rsaz-x86_64.S revision 1.1 */
/*
 * x86-64 assembly, AT&T (GAS) syntax, run through the C preprocessor.
 * The exported routines below take their inputs in rdi, rsi, rdx, rcx,
 * r8, r9 and save/restore rbx, rbp, r12-r15.
 */
#include <machine/asm.h>
.text

/*
 * rsaz_512_sqr
 *
 * Repeatedly squares an 8-qword (512-bit) value and reduces it.
 *
 * In:   rdi = output buffer (8 qwords, written by __rsaz_512_subtract)
 *       rsi = input operand (8 qwords)
 *       rdx = 8-qword operand used by the reduce/subtract helpers
 *             (presumably the modulus -- confirm against the caller)
 *       rcx = 64-bit multiplier constant handed to __rsaz_512_reduce
 *       r8d = number of squarings; decremented before it is tested,
 *             so it must be >= 1
 * Out:  result of the last round stored at (rdi)
 *
 * Stack frame (after the six pushes and the 152-byte adjustment):
 *       0(%rsp)..120(%rsp)   16-qword square of the input
 *       128(%rsp)            saved rcx (read by __rsaz_512_reduce)
 *       128+8(%rsp)          remaining loop count
 *
 * After the first round the output buffer becomes the input
 * (rsi = rdi), so each round squares the previous result.
 */
.globl	rsaz_512_sqr
.type	rsaz_512_sqr,@function
.align	32
rsaz_512_sqr:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	subq	$128+24,%rsp
.Lsqr_body:
	/* rbp = helper operand; rdx/rax = first two input limbs */
	movq	%rdx,%rbp
	movq	(%rsi),%rdx
	movq	8(%rsi),%rax
	movq	%rcx,128(%rsp)
	jmp	.Loop_sqr

.align	32
.Loop_sqr:
	movl	%r8d,128+8(%rsp)

	/*
	 * Pass 1: cross products a[0]*a[1..7] accumulate in r8..r15,
	 * are doubled, a[0]^2 is added, and limbs 0 and 1 are stored.
	 */
	movq	%rdx,%rbx
	mulq	%rdx
	movq	%rax,%r8
	movq	16(%rsi),%rax
	movq	%rdx,%r9

	mulq	%rbx
	addq	%rax,%r9
	movq	24(%rsi),%rax
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r10
	movq	32(%rsi),%rax
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r11
	movq	40(%rsi),%rax
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r12
	movq	48(%rsi),%rax
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r13
	movq	56(%rsi),%rax
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	addq	%rax,%r14
	movq	%rbx,%rax
	movq	%rdx,%r15
	adcq	$0,%r15

	/* double r8:r9; the bit shifted out of r9 is kept in rcx */
	addq	%r8,%r8
	movq	%r9,%rcx
	adcq	%r9,%r9

	mulq	%rax
	movq	%rax,(%rsp)
	addq	%rdx,%r8
	adcq	$0,%r9

	movq	%r8,8(%rsp)
	shrq	$63,%rcx

	/* Pass 2: a[1]*a[2..7]; stores limbs 2 and 3 */
	movq	8(%rsi),%r8
	movq	16(%rsi),%rax
	mulq	%r8
	addq	%rax,%r10
	movq	24(%rsi),%rax
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%r8
	addq	%rax,%r11
	movq	32(%rsi),%rax
	adcq	$0,%rdx
	addq	%rbx,%r11
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%r8
	addq	%rax,%r12
	movq	40(%rsi),%rax
	adcq	$0,%rdx
	addq	%rbx,%r12
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%r8
	addq	%rax,%r13
	movq	48(%rsi),%rax
	adcq	$0,%rdx
	addq	%rbx,%r13
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%r8
	addq	%rax,%r14
	movq	56(%rsi),%rax
	adcq	$0,%rdx
	addq	%rbx,%r14
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%r8
	addq	%rax,%r15
	movq	%r8,%rax
	adcq	$0,%rdx
	addq	%rbx,%r15
	movq	%rdx,%r8
	movq	%r10,%rdx
	adcq	$0,%r8

	/* addq only produces CF = top bit of r10; leaq does the doubling */
	addq	%rdx,%rdx
	leaq	(%rcx,%r10,2),%r10
	movq	%r11,%rbx
	adcq	%r11,%r11

	mulq	%rax
	addq	%rax,%r9
	adcq	%rdx,%r10
	adcq	$0,%r11

	movq	%r9,16(%rsp)
	movq	%r10,24(%rsp)
	shrq	$63,%rbx

	/* Pass 3: a[2]*a[3..7]; stores limbs 4 and 5 */
	movq	16(%rsi),%r9
	movq	24(%rsi),%rax
	mulq	%r9
	addq	%rax,%r12
	movq	32(%rsi),%rax
	movq	%rdx,%rcx
	adcq	$0,%rcx

	mulq	%r9
	addq	%rax,%r13
	movq	40(%rsi),%rax
	adcq	$0,%rdx
	addq	%rcx,%r13
	movq	%rdx,%rcx
	adcq	$0,%rcx

	mulq	%r9
	addq	%rax,%r14
	movq	48(%rsi),%rax
	adcq	$0,%rdx
	addq	%rcx,%r14
	movq	%rdx,%rcx
	adcq	$0,%rcx

	mulq	%r9
	movq	%r12,%r10
	leaq	(%rbx,%r12,2),%r12
	addq	%rax,%r15
	movq	56(%rsi),%rax
	adcq	$0,%rdx
	addq	%rcx,%r15
	movq	%rdx,%rcx
	adcq	$0,%rcx

	mulq	%r9
	shrq	$63,%r10
	addq	%rax,%r8
	movq	%r9,%rax
	adcq	$0,%rdx
	addq	%rcx,%r8
	movq	%rdx,%r9
	adcq	$0,%r9

	movq	%r13,%rcx
	leaq	(%r10,%r13,2),%r13

	mulq	%rax
	addq	%rax,%r11
	adcq	%rdx,%r12
	adcq	$0,%r13

	movq	%r11,32(%rsp)
	movq	%r12,40(%rsp)
	shrq	$63,%rcx

	/* Pass 4: a[3]*a[4..7]; stores limbs 6 and 7 */
	movq	24(%rsi),%r10
	movq	32(%rsi),%rax
	mulq	%r10
	addq	%rax,%r14
	movq	40(%rsi),%rax
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%r10
	addq	%rax,%r15
	movq	48(%rsi),%rax
	adcq	$0,%rdx
	addq	%rbx,%r15
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%r10
	movq	%r14,%r12
	leaq	(%rcx,%r14,2),%r14
	addq	%rax,%r8
	movq	56(%rsi),%rax
	adcq	$0,%rdx
	addq	%rbx,%r8
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%r10
	shrq	$63,%r12
	addq	%rax,%r9
	movq	%r10,%rax
	adcq	$0,%rdx
	addq	%rbx,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	movq	%r15,%rbx
	leaq	(%r12,%r15,2),%r15

	mulq	%rax
	addq	%rax,%r13
	adcq	%rdx,%r14
	adcq	$0,%r15

	movq	%r13,48(%rsp)
	movq	%r14,56(%rsp)
	shrq	$63,%rbx

	/* Pass 5: a[4]*a[5..7]; stores limbs 8 and 9 */
	movq	32(%rsi),%r11
	movq	40(%rsi),%rax
	mulq	%r11
	addq	%rax,%r8
	movq	48(%rsi),%rax
	movq	%rdx,%rcx
	adcq	$0,%rcx

	mulq	%r11
	addq	%rax,%r9
	movq	56(%rsi),%rax
	adcq	$0,%rdx
	movq	%r8,%r12
	leaq	(%rbx,%r8,2),%r8
	addq	%rcx,%r9
	movq	%rdx,%rcx
	adcq	$0,%rcx

	mulq	%r11
	shrq	$63,%r12
	addq	%rax,%r10
	movq	%r11,%rax
	adcq	$0,%rdx
	addq	%rcx,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	movq	%r9,%rcx
	leaq	(%r12,%r9,2),%r9

	mulq	%rax
	addq	%rax,%r15
	adcq	%rdx,%r8
	adcq	$0,%r9

	movq	%r15,64(%rsp)
	movq	%r8,72(%rsp)
	shrq	$63,%rcx

	/* Pass 6: a[5]*a[6..7]; stores limbs 10 and 11 */
	movq	40(%rsi),%r12
	movq	48(%rsi),%rax
	mulq	%r12
	addq	%rax,%r10
	movq	56(%rsi),%rax
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%r12
	addq	%rax,%r11
	movq	%r12,%rax
	movq	%r10,%r15
	leaq	(%rcx,%r10,2),%r10
	adcq	$0,%rdx
	shrq	$63,%r15
	addq	%rbx,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	movq	%r11,%rbx
	leaq	(%r15,%r11,2),%r11

	mulq	%rax
	addq	%rax,%r9
	adcq	%rdx,%r10
	adcq	$0,%r11

	movq	%r9,80(%rsp)
	movq	%r10,88(%rsp)

	/* Pass 7: a[6]*a[7]; stores limbs 12 and 13 */
	movq	48(%rsi),%r13
	movq	56(%rsi),%rax
	mulq	%r13
	addq	%rax,%r12
	movq	%r13,%rax
	movq	%rdx,%r13
	adcq	$0,%r13

	/* shlq moves the saved top bit of rbx into CF for the doubling chain */
	xorq	%r14,%r14
	shlq	$1,%rbx
	adcq	%r12,%r12
	adcq	%r13,%r13
	adcq	%r14,%r14

	mulq	%rax
	addq	%rax,%r11
	adcq	%rdx,%r12
	adcq	$0,%r13

	movq	%r11,96(%rsp)
	movq	%r12,104(%rsp)

	/* Pass 8: a[7]^2; stores limbs 14 and 15 */
	movq	56(%rsi),%rax
	mulq	%rax
	addq	%rax,%r13
	adcq	$0,%rdx

	addq	%rdx,%r14

	movq	%r13,112(%rsp)
	movq	%r14,120(%rsp)

	/* reduce the low eight limbs, passed in r8..r15 */
	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	call	__rsaz_512_reduce

	/* add the high eight limbs; rcx = all-ones if that add carried out */
	addq	64(%rsp),%r8
	adcq	72(%rsp),%r9
	adcq	80(%rsp),%r10
	adcq	88(%rsp),%r11
	adcq	96(%rsp),%r12
	adcq	104(%rsp),%r13
	adcq	112(%rsp),%r14
	adcq	120(%rsp),%r15
	sbbq	%rcx,%rcx

	call	__rsaz_512_subtract

	/*
	 * Set up the next round: rdx/rax = limbs 0 and 1 of the result,
	 * and the output buffer becomes the input.
	 */
	movq	%r8,%rdx
	movq	%r9,%rax
	movl	128+8(%rsp),%r8d
	movq	%rdi,%rsi

	decl	%r8d
	jnz	.Loop_sqr

	/* rax = rsp value at entry minus 8; reload the six saved registers */
	leaq	128+24+48(%rsp),%rax
	movq	-48(%rax),%r15
	movq	-40(%rax),%r14
	movq	-32(%rax),%r13
	movq	-24(%rax),%r12
	movq	-16(%rax),%rbp
	movq	-8(%rax),%rbx
	leaq	(%rax),%rsp
.Lsqr_epilogue:
	/* 0xf3,0xc3 = "rep ret" */
	.byte	0xf3,0xc3
.size	rsaz_512_sqr,.-rsaz_512_sqr
/*
 * rsaz_512_mul
 *
 * Multiplies two 8-qword operands and reduces the 16-qword product.
 *
 * In:   rdi = output buffer (8 qwords)
 *       rsi = first operand (8 qwords)
 *       rdx = second operand (8 qwords)
 *       rcx = 8-qword operand used by the reduce/subtract helpers
 *             (presumably the modulus -- confirm against the caller)
 *       r8  = 64-bit multiplier constant handed to __rsaz_512_reduce
 * Out:  result stored at (rdi)
 *
 * rdi and rcx are parked in xmm0/xmm1 across __rsaz_512_mul, which
 * uses rdi as a write cursor and rbp as the second-operand pointer.
 */
.globl	rsaz_512_mul
.type	rsaz_512_mul,@function
.align	32
rsaz_512_mul:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	subq	$128+24,%rsp
.Lmul_body:
/* movq %rdi,%xmm0 */
.byte	102,72,15,110,199
/* movq %rcx,%xmm1 */
.byte	102,72,15,110,201
	movq	%r8,128(%rsp)
	movq	(%rdx),%rbx
	movq	%rdx,%rbp
	call	__rsaz_512_mul

/* movq %xmm0,%rdi */
.byte	102,72,15,126,199
/* movq %xmm1,%rbp */
.byte	102,72,15,126,205

	/* low eight limbs of the product go to the reduction in r8..r15 */
	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	call	__rsaz_512_reduce
	/* add the high eight limbs; rcx = all-ones if that add carried out */
	addq	64(%rsp),%r8
	adcq	72(%rsp),%r9
	adcq	80(%rsp),%r10
	adcq	88(%rsp),%r11
	adcq	96(%rsp),%r12
	adcq	104(%rsp),%r13
	adcq	112(%rsp),%r14
	adcq	120(%rsp),%r15
	sbbq	%rcx,%rcx

	call	__rsaz_512_subtract

	/* rax = rsp value at entry minus 8; reload the six saved registers */
	leaq	128+24+48(%rsp),%rax
	movq	-48(%rax),%r15
	movq	-40(%rax),%r14
	movq	-32(%rax),%r13
	movq	-24(%rax),%r12
	movq	-16(%rax),%rbp
	movq	-8(%rax),%rbx
	leaq	(%rax),%rsp
.Lmul_epilogue:
	/* 0xf3,0xc3 = "rep ret" */
	.byte	0xf3,0xc3
.size	rsaz_512_mul,.-rsaz_512_mul
/*
 * rsaz_512_mul_gather4
 *
 * Multiplies an 8-qword operand by an 8-qword value gathered from a
 * table, then reduces the 16-qword product.
 *
 * In:   rdi = output buffer (8 qwords)
 *       rsi = first operand (8 qwords)
 *       rdx = table; eight rows of 128 bytes (16 qwords) each, read
 *             with movdqa, so it must be 16-byte aligned
 *       rcx = 8-qword operand used by the reduce/subtract helpers
 *             (presumably the modulus -- confirm against the caller)
 *       r8  = 64-bit multiplier constant handed to __rsaz_512_reduce
 *       r9d = index of the table entry to use (0..15)
 * Out:  result stored at (rdi)
 *
 * Stack frame: 0..120(%rsp) product, 128(%rsp) saved r8,
 * 128+8(%rsp) saved rdi, 128+16(%rsp) saved rcx.
 *
 * Selection is done with masks, not with an indexed load: every row
 * is read in full and ANDed with xmm0..xmm7, so the addresses that
 * are touched do not depend on r9d.
 */
.globl	rsaz_512_mul_gather4
.type	rsaz_512_mul_gather4,@function
.align	32
rsaz_512_mul_gather4:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	subq	$152,%rsp
.Lmul_gather4_body:
	/*
	 * Build the masks. xmm0..xmm7 start as the dword counters
	 * {0,0,1,1}, {2,2,3,3}, ... {14,14,15,15} (from .Linc) and are
	 * compared with the broadcast index, leaving all-ones in the
	 * one 64-bit lane whose counter equals r9d.
	 */
	movd	%r9d,%xmm8
	movdqa	.Linc+16(%rip),%xmm1
	movdqa	.Linc(%rip),%xmm0

	pshufd	$0,%xmm8,%xmm8
	movdqa	%xmm1,%xmm7
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm8,%xmm0
	movdqa	%xmm7,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm8,%xmm1
	movdqa	%xmm7,%xmm4
	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm8,%xmm2
	movdqa	%xmm7,%xmm5
	paddd	%xmm3,%xmm4
	pcmpeqd	%xmm8,%xmm3
	movdqa	%xmm7,%xmm6
	paddd	%xmm4,%xmm5
	pcmpeqd	%xmm8,%xmm4
	paddd	%xmm5,%xmm6
	pcmpeqd	%xmm8,%xmm5
	paddd	%xmm6,%xmm7
	pcmpeqd	%xmm8,%xmm6
	pcmpeqd	%xmm8,%xmm7

	/* gather limb 0 of the table operand from row 0 */
	movdqa	0(%rdx),%xmm8
	movdqa	16(%rdx),%xmm9
	movdqa	32(%rdx),%xmm10
	movdqa	48(%rdx),%xmm11
	pand	%xmm0,%xmm8
	movdqa	64(%rdx),%xmm12
	pand	%xmm1,%xmm9
	movdqa	80(%rdx),%xmm13
	pand	%xmm2,%xmm10
	movdqa	96(%rdx),%xmm14
	pand	%xmm3,%xmm11
	movdqa	112(%rdx),%xmm15
	leaq	128(%rdx),%rbp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	/* fold both 64-bit halves together so the low qword holds the limb */
	por	%xmm9,%xmm8
	pshufd	$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
/* movq %xmm8,%rbx */
.byte	102,76,15,126,195

	movq	%r8,128(%rsp)
	movq	%rdi,128+8(%rsp)
	movq	%rcx,128+16(%rsp)

	/* first row of the product: a[0..7] * rbx; limb 0 goes to (%rsp) */
	movq	(%rsi),%rax
	movq	8(%rsi),%rcx
	mulq	%rbx
	movq	%rax,(%rsp)
	movq	%rcx,%rax
	movq	%rdx,%r8

	mulq	%rbx
	addq	%rax,%r8
	movq	16(%rsi),%rax
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r9
	movq	24(%rsi),%rax
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r10
	movq	32(%rsi),%rax
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r11
	movq	40(%rsi),%rax
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r12
	movq	48(%rsi),%rax
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r13
	movq	56(%rsi),%rax
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	addq	%rax,%r14
	movq	(%rsi),%rax
	movq	%rdx,%r15
	adcq	$0,%r15

	/* rdi = write cursor into the product; seven more rows follow */
	leaq	8(%rsp),%rdi
	movl	$7,%ecx
	jmp	.Loop_mul_gather

.align	32
.Loop_mul_gather:
	/* gather the next limb of the table operand; rbp steps one row */
	movdqa	0(%rbp),%xmm8
	movdqa	16(%rbp),%xmm9
	movdqa	32(%rbp),%xmm10
	movdqa	48(%rbp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	64(%rbp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	80(%rbp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	96(%rbp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	112(%rbp),%xmm15
	leaq	128(%rbp),%rbp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
/* movq %xmm8,%rbx */
.byte	102,76,15,126,195

	/* accumulate a[0..7] * rbx; the finished low limb goes to (%rdi) */
	mulq	%rbx
	addq	%rax,%r8
	movq	8(%rsi),%rax
	movq	%r8,(%rdi)
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rsi),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rsi),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rsi),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	addq	%rax,%r15
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	leaq	8(%rdi),%rdi

	decl	%ecx
	jnz	.Loop_mul_gather

	/* flush the top eight limbs of the product */
	movq	%r8,(%rdi)
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	/* restore the output pointer; rbp = the original rcx argument */
	movq	128+8(%rsp),%rdi
	movq	128+16(%rsp),%rbp

	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	call	__rsaz_512_reduce
	/* add the high eight limbs; rcx = all-ones if that add carried out */
	addq	64(%rsp),%r8
	adcq	72(%rsp),%r9
	adcq	80(%rsp),%r10
	adcq	88(%rsp),%r11
	adcq	96(%rsp),%r12
	adcq	104(%rsp),%r13
	adcq	112(%rsp),%r14
	adcq	120(%rsp),%r15
	sbbq	%rcx,%rcx

	call	__rsaz_512_subtract

	/* rax = rsp value at entry minus 8; reload the six saved registers */
	leaq	128+24+48(%rsp),%rax
	movq	-48(%rax),%r15
	movq	-40(%rax),%r14
	movq	-32(%rax),%r13
	movq	-24(%rax),%r12
	movq	-16(%rax),%rbp
	movq	-8(%rax),%rbx
	leaq	(%rax),%rsp
.Lmul_gather4_epilogue:
	/* 0xf3,0xc3 = "rep ret" */
	.byte	0xf3,0xc3
.size	rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
/*
 * rsaz_512_mul_scatter4
 *
 * Multiplies the 8-qword value at (rdi) by the 8-qword operand at
 * (rsi), reduces the product, stores it back to (rdi) and also
 * scatters it into a table.
 *
 * In:   rdi = in/out buffer (8 qwords); also the second multiplicand
 *       rsi = first multiplicand (8 qwords)
 *       rdx = 8-qword operand used by the reduce/subtract helpers
 *             (presumably the modulus -- confirm against the caller)
 *       rcx = 64-bit multiplier constant handed to __rsaz_512_reduce
 *       r8  = table base
 *       r9d = table index; limb i is stored at r8 + 8*index + 128*i
 * Out:  result at (rdi) (via __rsaz_512_subtract) and in the table
 */
.globl	rsaz_512_mul_scatter4
.type	rsaz_512_mul_scatter4,@function
.align	32
rsaz_512_mul_scatter4:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	/* zero-extend the 32-bit index before it is used in an address */
	movl	%r9d,%r9d
	subq	$128+24,%rsp
.Lmul_scatter4_body:
	leaq	(%r8,%r9,8),%r8
/* movq %rdi,%xmm0 */
.byte	102,72,15,110,199
/* movq %rdx,%xmm1 */
.byte	102,72,15,110,202
/* movq %r8,%xmm2 */
.byte	102,73,15,110,208
	movq	%rcx,128(%rsp)

	movq	%rdi,%rbp
	movq	(%rdi),%rbx
	call	__rsaz_512_mul

/* movq %xmm0,%rdi */
.byte	102,72,15,126,199
/* movq %xmm1,%rbp */
.byte	102,72,15,126,205

	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	call	__rsaz_512_reduce
	/* add the high eight limbs; rcx = all-ones if that add carried out */
	addq	64(%rsp),%r8
	adcq	72(%rsp),%r9
	adcq	80(%rsp),%r10
	adcq	88(%rsp),%r11
	adcq	96(%rsp),%r12
	adcq	104(%rsp),%r13
	adcq	112(%rsp),%r14
	adcq	120(%rsp),%r15
/* movq %xmm2,%rsi -- table slot; the move leaves CF intact for sbbq */
.byte	102,72,15,126,214
	sbbq	%rcx,%rcx

	call	__rsaz_512_subtract

	/* scatter the result, one limb per 128-byte row */
	movq	%r8,0(%rsi)
	movq	%r9,128(%rsi)
	movq	%r10,256(%rsi)
	movq	%r11,384(%rsi)
	movq	%r12,512(%rsi)
	movq	%r13,640(%rsi)
	movq	%r14,768(%rsi)
	movq	%r15,896(%rsi)

	/* rax = rsp value at entry minus 8; reload the six saved registers */
	leaq	128+24+48(%rsp),%rax
	movq	-48(%rax),%r15
	movq	-40(%rax),%r14
	movq	-32(%rax),%r13
	movq	-24(%rax),%r12
	movq	-16(%rax),%rbp
	movq	-8(%rax),%rbx
	leaq	(%rax),%rsp
.Lmul_scatter4_epilogue:
	/* 0xf3,0xc3 = "rep ret" */
	.byte	0xf3,0xc3
.size	rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
/*
 * rsaz_512_mul_by_one
 *
 * Runs __rsaz_512_reduce on an 8-qword input and stores the eight
 * result limbs; no upper half is added and no final subtraction is
 * performed.
 *
 * In:   rdi = output buffer (8 qwords)
 *       rsi = input (8 qwords)
 *       rdx = 8-qword operand used by the reduce helper
 *             (presumably the modulus -- confirm against the caller)
 *       rcx = 64-bit multiplier constant handed to __rsaz_512_reduce
 * Out:  r8..r15 after the reduction stored at (rdi)
 */
.globl	rsaz_512_mul_by_one
.type	rsaz_512_mul_by_one,@function
.align	32
rsaz_512_mul_by_one:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	subq	$128+24,%rsp
.Lmul_by_one_body:
	movq	%rdx,%rbp
	movq	%rcx,128(%rsp)

	movq	(%rsi),%r8
	pxor	%xmm0,%xmm0
	movq	8(%rsi),%r9
	movq	16(%rsi),%r10
	movq	24(%rsi),%r11
	movq	32(%rsi),%r12
	movq	40(%rsi),%r13
	movq	48(%rsi),%r14
	movq	56(%rsi),%r15

	/* zero 0..111(%rsp); nothing in this routine reads it back */
	movdqa	%xmm0,(%rsp)
	movdqa	%xmm0,16(%rsp)
	movdqa	%xmm0,32(%rsp)
	movdqa	%xmm0,48(%rsp)
	movdqa	%xmm0,64(%rsp)
	movdqa	%xmm0,80(%rsp)
	movdqa	%xmm0,96(%rsp)
	call	__rsaz_512_reduce
	movq	%r8,(%rdi)
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	/* rax = rsp value at entry minus 8; reload the six saved registers */
	leaq	128+24+48(%rsp),%rax
	movq	-48(%rax),%r15
	movq	-40(%rax),%r14
	movq	-32(%rax),%r13
	movq	-24(%rax),%r12
	movq	-16(%rax),%rbp
	movq	-8(%rax),%rbx
	leaq	(%rax),%rsp
.Lmul_by_one_epilogue:
	/* 0xf3,0xc3 = "rep ret" */
	.byte	0xf3,0xc3
.size	rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
/*
 * __rsaz_512_reduce (file-local helper)
 *
 * Eight rounds of word-by-word reduction on the value in r8..r15.
 * Each round picks m = r8 * k, adds m * mod[0..7] to r8..r15 and
 * drops the lowest limb, so the value moves down one limb per round.
 * This has the shape of a Montgomery reduction -- confirm against
 * the caller.
 *
 * In:   r8..r15 = eight limbs to reduce (r8 lowest)
 *       rbp     = mod, 8 qwords
 *       128+8(%rsp) = k. The extra 8 accounts for the return address
 *                 pushed by the call, i.e. this is the caller's
 *                 128(%rsp).
 * Out:  r8..r15 = reduced limbs
 * Clobbers: rax, rbx, rcx, rdx, rsi, flags
 */
.type	__rsaz_512_reduce,@function
.align	32
__rsaz_512_reduce:
	movq	%r8,%rbx
	imulq	128+8(%rsp),%rbx
	movq	0(%rbp),%rax
	movl	$8,%ecx
	jmp	.Lreduction_loop

.align	32
.Lreduction_loop:
	/*
	 * The low limb of m*mod[0] + r8 is never formed. negq sets
	 * CF = (r8 != 0), which stands in for its carry out --
	 * presumably valid because k is chosen so that limb sums to
	 * zero; TODO confirm.
	 */
	mulq	%rbx
	movq	8(%rbp),%rax
	negq	%r8
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	/* r8 is final for this round: start computing the next m in rsi */
	movq	128+8(%rsp),%rsi


	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rbp),%rax
	adcq	$0,%rdx
	imulq	%r8,%rsi
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	/* last multiply with the current m; rbx then takes the next one */
	mulq	%rbx
	movq	%rsi,%rbx
	addq	%rax,%r15
	movq	0(%rbp),%rax
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jne	.Lreduction_loop

	/* 0xf3,0xc3 = "rep ret" */
	.byte	0xf3,0xc3
.size	__rsaz_512_reduce,.-__rsaz_512_reduce
/*
 * __rsaz_512_subtract (file-local helper)
 *
 * Stores r8..r15 to (rdi), then adds ((-mod) & mask) to the stored
 * value and stores the sum again. With mask = 0 the value is left
 * unchanged; with mask = all-ones mod is subtracted. No branch
 * depends on the mask.
 *
 * -mod is formed as neg(mod[0]) with not(mod[1..7]); that equals the
 * two's complement of mod only when mod[0] is nonzero.
 *
 * In:   r8..r15 = value (r8 lowest)
 *       rdi     = output buffer (8 qwords)
 *       rbp     = mod, 8 qwords
 *       rcx     = mask, 0 or all-ones (callers produce it with sbbq)
 * Out:  (rdi) and r8..r15 = final value
 * Clobbers: flags
 */
.type	__rsaz_512_subtract,@function
.align	32
__rsaz_512_subtract:
	movq	%r8,(%rdi)
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	/* r8..r15 = (-mod) & mask */
	movq	0(%rbp),%r8
	movq	8(%rbp),%r9
	negq	%r8
	notq	%r9
	andq	%rcx,%r8
	movq	16(%rbp),%r10
	andq	%rcx,%r9
	notq	%r10
	movq	24(%rbp),%r11
	andq	%rcx,%r10
	notq	%r11
	movq	32(%rbp),%r12
	andq	%rcx,%r11
	notq	%r12
	movq	40(%rbp),%r13
	andq	%rcx,%r12
	notq	%r13
	movq	48(%rbp),%r14
	andq	%rcx,%r13
	notq	%r14
	movq	56(%rbp),%r15
	andq	%rcx,%r14
	notq	%r15
	andq	%rcx,%r15

	/* add the stored value */
	addq	(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15

	movq	%r8,(%rdi)
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	/* 0xf3,0xc3 = "rep ret" */
	.byte	0xf3,0xc3
.size	__rsaz_512_subtract,.-__rsaz_512_subtract
/*
 * __rsaz_512_mul (file-local helper)
 *
 * Multiplies two 8-qword operands into a 16-qword product, one row
 * per limb of the second operand.
 *
 * In:   rsi = first operand a[0..7]
 *       rbp = second operand b[0..7]
 *       rbx = b[0], already loaded by the caller
 * Out:  16-qword product written from 8(%rsp) upward, i.e. at the
 *       caller's 0(%rsp)..120(%rsp) (the extra 8 is the return
 *       address pushed by the call)
 * Clobbers: rax, rbx, rcx, rdx, rdi, rbp, r8..r15, flags
 */
.type	__rsaz_512_mul,@function
.align	32
__rsaz_512_mul:
	leaq	8(%rsp),%rdi

	/* row 0: a[0..7] * b[0]; limb 0 is stored, the rest stay in r8..r15 */
	movq	(%rsi),%rax
	mulq	%rbx
	movq	%rax,(%rdi)
	movq	8(%rsi),%rax
	movq	%rdx,%r8

	mulq	%rbx
	addq	%rax,%r8
	movq	16(%rsi),%rax
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r9
	movq	24(%rsi),%rax
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r10
	movq	32(%rsi),%rax
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r11
	movq	40(%rsi),%rax
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r12
	movq	48(%rsi),%rax
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r13
	movq	56(%rsi),%rax
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	addq	%rax,%r14
	movq	(%rsi),%rax
	movq	%rdx,%r15
	adcq	$0,%r15

	leaq	8(%rbp),%rbp
	leaq	8(%rdi),%rdi

	/* seven more rows, for b[1..7] */
	movl	$7,%ecx
	jmp	.Loop_mul

.align	32
.Loop_mul:
	/* accumulate a[0..7] * b[i]; the finished low limb goes to (%rdi) */
	movq	(%rbp),%rbx
	mulq	%rbx
	addq	%rax,%r8
	movq	8(%rsi),%rax
	movq	%r8,(%rdi)
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rsi),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rsi),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rsi),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	/* leaq leaves the flags alone, so the pending carry survives */
	leaq	8(%rbp),%rbp
	adcq	$0,%r14

	mulq	%rbx
	addq	%rax,%r15
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	leaq	8(%rdi),%rdi

	decl	%ecx
	jnz	.Loop_mul

	/* flush the top eight limbs of the product */
	movq	%r8,(%rdi)
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	/* 0xf3,0xc3 = "rep ret" */
	.byte	0xf3,0xc3
.size	__rsaz_512_mul,.-__rsaz_512_mul
/*
 * rsaz_512_scatter4
 *
 * Copies eight consecutive qwords into a table with a 128-byte
 * stride: source limb i goes to rdi + 8*rdx + 128*i.
 *
 * In:   rdi = table base
 *       rsi = source (8 qwords)
 *       rdx = table index. The full 64-bit register is used in the
 *             address, so the caller must have extended it.
 * Clobbers: rax, rsi, rdi, r9, flags
 */
.globl	rsaz_512_scatter4
.type	rsaz_512_scatter4,@function
.align	16
rsaz_512_scatter4:
	leaq	(%rdi,%rdx,8),%rdi
	movl	$8,%r9d
	jmp	.Loop_scatter
.align	16
.Loop_scatter:
	movq	(%rsi),%rax
	leaq	8(%rsi),%rsi
	movq	%rax,(%rdi)
	leaq	128(%rdi),%rdi
	decl	%r9d
	jnz	.Loop_scatter
	/* 0xf3,0xc3 = "rep ret" */
	.byte	0xf3,0xc3
.size	rsaz_512_scatter4,.-rsaz_512_scatter4

/*
 * rsaz_512_gather4
 *
 * Reads one 8-qword entry out of a table of eight 128-byte rows
 * (16 qwords per row); output limb i is taken from row i.
 *
 * In:   rdi = output buffer (8 qwords)
 *       rsi = table, read with movdqa, so it must be 16-byte aligned
 *       edx = index of the entry to fetch (0..15)
 *
 * Selection is done with masks, not with an indexed load: every row
 * is read in full and ANDed with xmm0..xmm7, so the addresses that
 * are touched do not depend on edx.
 *
 * Clobbers: rsi, rdi, r9, xmm0..xmm15, flags
 */
.globl	rsaz_512_gather4
.type	rsaz_512_gather4,@function
.align	16
rsaz_512_gather4:
	/*
	 * Build the masks. xmm0..xmm7 start as the dword counters
	 * {0,0,1,1}, {2,2,3,3}, ... {14,14,15,15} (from .Linc) and are
	 * compared with the broadcast index, leaving all-ones in the
	 * one 64-bit lane whose counter equals edx.
	 */
	movd	%edx,%xmm8
	movdqa	.Linc+16(%rip),%xmm1
	movdqa	.Linc(%rip),%xmm0

	pshufd	$0,%xmm8,%xmm8
	movdqa	%xmm1,%xmm7
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm8,%xmm0
	movdqa	%xmm7,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm8,%xmm1
	movdqa	%xmm7,%xmm4
	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm8,%xmm2
	movdqa	%xmm7,%xmm5
	paddd	%xmm3,%xmm4
	pcmpeqd	%xmm8,%xmm3
	movdqa	%xmm7,%xmm6
	paddd	%xmm4,%xmm5
	pcmpeqd	%xmm8,%xmm4
	paddd	%xmm5,%xmm6
	pcmpeqd	%xmm8,%xmm5
	paddd	%xmm6,%xmm7
	pcmpeqd	%xmm8,%xmm6
	pcmpeqd	%xmm8,%xmm7
	movl	$8,%r9d
	jmp	.Loop_gather
.align	16
.Loop_gather:
	/* mask one full row and OR the pieces down to a single qword */
	movdqa	0(%rsi),%xmm8
	movdqa	16(%rsi),%xmm9
	movdqa	32(%rsi),%xmm10
	movdqa	48(%rsi),%xmm11
	pand	%xmm0,%xmm8
	movdqa	64(%rsi),%xmm12
	pand	%xmm1,%xmm9
	movdqa	80(%rsi),%xmm13
	pand	%xmm2,%xmm10
	movdqa	96(%rsi),%xmm14
	pand	%xmm3,%xmm11
	movdqa	112(%rsi),%xmm15
	leaq	128(%rsi),%rsi
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	/* fold both 64-bit halves together so the low qword holds the limb */
	por	%xmm9,%xmm8
	pshufd	$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
	movq	%xmm8,(%rdi)
	leaq	8(%rdi),%rdi
	decl	%r9d
	jnz	.Loop_gather
	/* 0xf3,0xc3 = "rep ret" */
	.byte	0xf3,0xc3
.LSEH_end_rsaz_512_gather4:
.size	rsaz_512_gather4,.-rsaz_512_gather4

/*
 * Seed constants for the gather masks: .Linc holds the dword counters
 * {0,0,1,1} and .Linc+16 the per-step increment {2,2,2,2}. Both are
 * loaded with movdqa, hence the alignment.
 */
.align	64
.Linc:
.long	0,0, 1,1
.long	2,2, 2,2
1221