/* x86_64-mont.S revision 312826 */
/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/x86_64-mont.S 312826 2017-01-26 19:14:14Z jkim $ */
/* Do not modify. This file is auto-generated from x86_64-mont.pl. */
.text

/*
 * bn_mul_mont -- Montgomery multiplication: public entry point plus the
 * generic one-word-at-a-time implementation.
 *
 * Registers on entry, as the code below uses them:
 *   %rdi  result vector; num 64-bit words are written
 *   %rsi  first operand vector (num words)
 *   %rdx  second operand vector (num words)
 *   %rcx  modulus vector (num words)
 *   %r8   pointer to one 64-bit word from which each round's reduction
 *         multiplier is derived (presumably OpenSSL's n0 -- confirm
 *         against the caller)
 *   %r9d  num, the vector length in words (upper half of %r9 is ignored)
 * Returns 1 in %rax.  %rbx, %rbp and %r12-%r15 are pushed on entry and
 * reloaded from the caller's frame before returning.
 *
 * Scratch frame on the generic path: tp[0..num] starts at (%rsp) and the
 * caller's %rsp is kept at tp[num+1].
 */
.globl	bn_mul_mont
.type	bn_mul_mont,@function
.align	16
bn_mul_mont:
	movl	%r9d,%r9d	/* zero-extend num to 64 bits */
	movq	%rsp,%rax	/* caller's %rsp, stored in the frame later */
	testl	$3,%r9d
	jnz	.Lmul_enter	/* num not a multiple of 4: generic path */
	cmpl	$8,%r9d
	jb	.Lmul_enter	/* num < 8: generic path */
	movl	OPENSSL_ia32cap_P+8(%rip),%r11d	/* capability word tested at .Lmul4x_enter */
	cmpq	%rsi,%rdx
	jne	.Lmul4x_enter	/* operand pointers differ: 4x multiply */
	testl	$7,%r9d
	jz	.Lsqr8x_enter	/* same pointer and num % 8 == 0: squaring path */
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	/* %r10 = (%rsp - 8*(num+2)) rounded down to a 1024-byte boundary */
	negq	%r9
	movq	%rsp,%r11
	leaq	-16(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10

	/*
	 * Lower %rsp to %r10 in steps of at most 4096 bytes, reading one
	 * word at every step so that each page of the new frame is touched
	 * in descending order.
	 */
	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

.align	16
.Lmul_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	movq	%rax,8(%rsp,%r9,8)	/* tp[num+1] = caller's %rsp */
.Lmul_body:
	/*
	 * Round 0.  %r12 = second operand, %r8 = word loaded through the
	 * %r8 pointer, %rbx = current word of the second operand,
	 * %r14 = outer index i, %r15 = inner index j.
	 */
	movq	%rdx,%r12
	movq	(%r8),%r8
	movq	(%r12),%rbx
	movq	(%rsi),%rax

	xorq	%r14,%r14
	xorq	%r15,%r15

	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp	/* %rbp = this round's reduction multiplier */
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10	/* low word is only needed for its carry */
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.L1st_enter

.align	16
.L1st:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.L1st_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	1(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.L1st

	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13
	movq	%r10,%r11

	/* fold the two remaining high words into tp[num-1] and tp[num] */
	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	jmp	.Louter
.align	16
.Louter:
	/*
	 * Rounds 1..num-1: same shape as round 0, but the previous contents
	 * of tp are added in and the result is stored one word lower.
	 */
	movq	(%r12,%r14,8),%rbx
	xorq	%r15,%r15
	movq	%r8,%rbp
	movq	(%rsp),%r10
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.Linner_enter

.align	16
.Linner:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.Linner_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.Linner

	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10	/* previous tp[num] */
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	cmpq	%r9,%r14
	jb	.Louter

	/*
	 * Final reduction: result = tp - modulus, written to the result
	 * vector.  The xorq clears CF for the first sbbq; leaq and decq in
	 * the loop leave CF untouched, so the borrow chains across words.
	 */
	xorq	%r14,%r14
	movq	(%rsp),%rax
	leaq	(%rsp),%rsi
	movq	%r9,%r15
	jmp	.Lsub
.align	16
.Lsub:	sbbq	(%rcx,%r14,8),%rax
	movq	%rax,(%rdi,%r14,8)
	movq	8(%rsi,%r14,8),%rax
	leaq	1(%r14),%r14
	decq	%r15
	jnz	.Lsub

	/*
	 * %rax = tp[num] minus the final borrow, used as an and-mask.
	 * Branch-free select: %rsi = (tp & mask) | (result & ~mask).
	 */
	sbbq	$0,%rax
	xorq	%r14,%r14
	andq	%rax,%rsi
	notq	%rax
	movq	%rdi,%rcx
	andq	%rax,%rcx
	movq	%r9,%r15
	orq	%rcx,%rsi
.align	16
.Lcopy:
	/* copy the selected vector out and overwrite tp[i] with the index i */
	movq	(%rsi,%r14,8),%rax
	movq	%r14,(%rsp,%r14,8)
	movq	%rax,(%rdi,%r14,8)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	.Lcopy

	movq	8(%rsp,%r9,8),%rsi	/* caller's %rsp saved at tp[num+1] */
	movq	$1,%rax
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lmul_epilogue:
	.byte	0xf3,0xc3	/* rep ret */
.size	bn_mul_mont,.-bn_mul_mont
/*
 * bn_mul4x_mont -- Montgomery multiplication with the inner loops unrolled
 * four words per iteration.  Same argument registers as bn_mul_mont:
 * %rdi result, %rsi / %rdx operands, %rcx modulus, %r8 pointer to the
 * word the reduction multiplier is derived from, %r9d = num.
 * Returns 1 in %rax.
 *
 * bn_mul_mont jumps to .Lmul4x_enter with %rax = caller's %rsp and %r11d
 * loaded from OPENSSL_ia32cap_P+8.  Nothing in this block loads %r11d on
 * the fall-through path from the bn_mul4x_mont label itself.
 *
 * Scratch frame: tp[0..num] at (%rsp), caller's %rsp at tp[num+1], the
 * result pointer at tp[num+2] (because %rdi is reused as an accumulator).
 */
.type	bn_mul4x_mont,@function
.align	16
bn_mul4x_mont:
	movl	%r9d,%r9d
	movq	%rsp,%rax
.Lmul4x_enter:
	/*
	 * If bits 8 and 19 of the capability word are both set, use the
	 * mulx/adcx/adox implementation instead (presumably the BMI2 and
	 * ADX feature bits -- confirm against the OPENSSL_ia32cap layout).
	 */
	andl	$0x80100,%r11d
	cmpl	$0x80100,%r11d
	je	.Lmulx4x_enter
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	/* %r10 = (%rsp - 8*(num+4)) rounded down to a 1024-byte boundary */
	negq	%r9
	movq	%rsp,%r11
	leaq	-32(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10

	/* lower %rsp to %r10, touching one word per 4096-byte step */
	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul4x_page_walk
	jmp	.Lmul4x_page_walk_done

.Lmul4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul4x_page_walk
.Lmul4x_page_walk_done:

	movq	%rax,8(%rsp,%r9,8)	/* tp[num+1] = caller's %rsp */
.Lmul4x_body:
	movq	%rdi,16(%rsp,%r9,8)	/* tp[num+2] = result pointer */
	/*
	 * Round 0.  %r12 = second operand, %rbx = its current word,
	 * %rbp = reduction multiplier, %r14 = outer index, %r15 = inner
	 * index, %rdi / %r13 alternate as the running carry.
	 */
	movq	%rdx,%r12
	movq	(%r8),%r8
	movq	(%r12),%rbx
	movq	(%rsi),%rax

	xorq	%r14,%r14
	xorq	%r15,%r15

	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdi,(%rsp)
	movq	%rdx,%r13
	jmp	.L1st4x
.align	16
.L1st4x:
	/* four words of round 0 per iteration */
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	4(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	.L1st4x

	/* tail of round 0: last two words, then the top carry word */
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)

	leaq	1(%r14),%r14
.align	4
.Louter4x:
	/* rounds 1..num-1: as round 0, with the previous tp words added in */
	movq	(%r12,%r14,8),%rbx
	xorq	%r15,%r15
	movq	(%rsp),%r10
	movq	%r8,%rbp
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%rsp),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdi,(%rsp)
	movq	%rdx,%r13
	jmp	.Linner4x
.align	16
.Linner4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	4(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	.Linner4x

	/* tail of the round: last two words plus the previous top word */
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	1(%r14),%r14
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%rsp,%r9,8),%r13
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)

	cmpq	%r9,%r14
	jb	.Louter4x
	/*
	 * Final reduction: result = tp - modulus, four words per iteration.
	 * %r9 temporarily holds num/4.  subq starts the borrow chain; leaq
	 * and decq inside the loop do not modify CF.
	 */
	movq	16(%rsp,%r9,8),%rdi	/* reload the result pointer */
	movq	0(%rsp),%rax
	pxor	%xmm0,%xmm0
	movq	8(%rsp),%rdx
	shrq	$2,%r9
	leaq	(%rsp),%rsi
	xorq	%r14,%r14

	subq	0(%rcx),%rax
	movq	16(%rsi),%rbx
	movq	24(%rsi),%rbp
	sbbq	8(%rcx),%rdx
	leaq	-1(%r9),%r15
	jmp	.Lsub4x
.align	16
.Lsub4x:
	movq	%rax,0(%rdi,%r14,8)
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	16(%rcx,%r14,8),%rbx
	movq	32(%rsi,%r14,8),%rax
	movq	40(%rsi,%r14,8),%rdx
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)
	movq	%rbp,24(%rdi,%r14,8)
	sbbq	32(%rcx,%r14,8),%rax
	movq	48(%rsi,%r14,8),%rbx
	movq	56(%rsi,%r14,8),%rbp
	sbbq	40(%rcx,%r14,8),%rdx
	leaq	4(%r14),%r14
	decq	%r15
	jnz	.Lsub4x

	movq	%rax,0(%rdi,%r14,8)
	movq	32(%rsi,%r14,8),%rax	/* tp[num], the top carry word */
	sbbq	16(%rcx,%r14,8),%rbx
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)

	/*
	 * %rax = tp[num] minus the final borrow, used as an and-mask.
	 * Branch-free select: %rsi = (tp & mask) | (result & ~mask).
	 */
	sbbq	$0,%rax
	movq	%rbp,24(%rdi,%r14,8)
	xorq	%r14,%r14
	andq	%rax,%rsi
	notq	%rax
	movq	%rdi,%rcx
	andq	%rax,%rcx
	leaq	-1(%r9),%r15
	orq	%rcx,%rsi

	/* copy the selected vector to the result and zero tp, 16 bytes at a time */
	movdqu	(%rsi),%xmm1
	movdqa	%xmm0,(%rsp)
	movdqu	%xmm1,(%rdi)
	jmp	.Lcopy4x
.align	16
.Lcopy4x:
	movdqu	16(%rsi,%r14,1),%xmm2
	movdqu	32(%rsi,%r14,1),%xmm1
	movdqa	%xmm0,16(%rsp,%r14,1)
	movdqu	%xmm2,16(%rdi,%r14,1)
	movdqa	%xmm0,32(%rsp,%r14,1)
	movdqu	%xmm1,32(%rdi,%r14,1)
	leaq	32(%r14),%r14
	decq	%r15
	jnz	.Lcopy4x

	shlq	$2,%r9	/* back to num words */
	movdqu	16(%rsi,%r14,1),%xmm2
	movdqa	%xmm0,16(%rsp,%r14,1)
	movdqu	%xmm2,16(%rdi,%r14,1)
	movq	8(%rsp,%r9,8),%rsi	/* caller's %rsp saved at tp[num+1] */
	movq	$1,%rax
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lmul4x_epilogue:
	.byte	0xf3,0xc3	/* rep ret */
.size	bn_mul4x_mont,.-bn_mul4x_mont
657
658
659
/*
 * bn_sqr8x_mont -- Montgomery squaring wrapper.  bn_mul_mont jumps to
 * .Lsqr8x_enter when both operand pointers are equal and num is a
 * multiple of 8.  Argument registers are as for bn_mul_mont: %rdi result,
 * %rsi operand, %rcx modulus, %r8 pointer to the reduction word,
 * %r9d = num.  Returns 1 in %rax.
 *
 * The squaring itself is done by bn_sqr8x_internal or bn_sqrx8x_internal,
 * which are not defined in this block; this code sets up the frame,
 * calls one of them, then performs the final subtraction and a masked
 * copy into the result.  Frame slots written here: 32(%rsp) = word
 * loaded through %r8, 40(%rsp) = caller's %rsp.
 */
.type	bn_sqr8x_mont,@function
.align	32
bn_sqr8x_mont:
	movq	%rsp,%rax
.Lsqr8x_enter:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
.Lsqr8x_prologue:

	movl	%r9d,%r10d
	shll	$3,%r9d	/* %r9 = num*8 (bytes) */
	shlq	$3+2,%r10	/* %r10 = num*32 */
	negq	%r9	/* %r9 = -num*8 */

	/*
	 * Choose the frame base %rbp below %rsp, sized 64 + 16*num bytes,
	 * adjusted by the distance (mod 4096) between that candidate and the
	 * operand at %rsi.  NOTE(review): presumably this keeps the frame
	 * from aliasing the input modulo 4096 -- confirm against the
	 * generator script's comments.
	 */
	leaq	-64(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	movq	(%r8),%r8
	subq	%rsi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lsqr8x_sp_alt
	subq	%r11,%rbp
	leaq	-64(%rbp,%r9,2),%rbp
	jmp	.Lsqr8x_sp_done

.align	32
.Lsqr8x_sp_alt:
	leaq	4096-64(,%r9,2),%r10
	leaq	-64(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11	/* clamp the adjustment to 0 if the subtraction borrowed */
	subq	%r11,%rbp
.Lsqr8x_sp_done:
	andq	$-64,%rbp	/* 64-byte align the frame base */
	/* lower %rsp to %rbp, touching one word per 4096-byte step */
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lsqr8x_page_walk
	jmp	.Lsqr8x_page_walk_done

.align	16
.Lsqr8x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lsqr8x_page_walk
.Lsqr8x_page_walk_done:

	movq	%r9,%r10	/* %r10 = -num*8 */
	negq	%r9	/* %r9 = num*8 */

	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)	/* caller's %rsp */
.Lsqr8x_body:

.byte	102,72,15,110,209	/* movq %rcx,%xmm2 */
	pxor	%xmm0,%xmm0
.byte	102,72,15,110,207	/* movq %rdi,%xmm1 (result pointer) */
.byte	102,73,15,110,218	/* movq %r10,%xmm3 */
	/* both bits 8 and 19 of the capability word set: use the mulx-based helper */
	movl	OPENSSL_ia32cap_P+8(%rip),%eax
	andl	$0x80100,%eax
	cmpl	$0x80100,%eax
	jne	.Lsqr8x_nox

	call	bn_sqrx8x_internal

	/*
	 * NOTE(review): the registers consumed below (%r8, %rcx, %rbp, %rax)
	 * are outputs of the helper, whose contract is not visible in this
	 * file section -- verify against its definition.
	 */
	leaq	(%r8,%rcx,1),%rbx
	movq	%rcx,%r9
	movq	%rcx,%rdx
.byte	102,72,15,126,207	/* movq %xmm1,%rdi (result pointer) */
	sarq	$3+2,%rcx	/* loop counter, stepped with incq until it reaches zero */
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_nox:
	call	bn_sqr8x_internal

	/*
	 * NOTE(review): %rdi, %r9, %rbp and %rax are taken as left by the
	 * helper; its contract is not visible here -- verify.
	 */
	leaq	(%rdi,%r9,1),%rbx
	movq	%r9,%rcx
	movq	%r9,%rdx
.byte	102,72,15,126,207	/* movq %xmm1,%rdi (result pointer) */
	sarq	$3+2,%rcx
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_sub:
	/*
	 * result[] = (%rbx)[] - (%rbp)[], four words per iteration.  The
	 * first sbbq sees CF as left by the sarq above; leaq and incq do
	 * not modify CF, so the borrow chains across iterations.
	 */
	movq	0(%rbx),%r12
	movq	8(%rbx),%r13
	movq	16(%rbx),%r14
	movq	24(%rbx),%r15
	leaq	32(%rbx),%rbx
	sbbq	0(%rbp),%r12
	sbbq	8(%rbp),%r13
	sbbq	16(%rbp),%r14
	sbbq	24(%rbp),%r15
	leaq	32(%rbp),%rbp
	movq	%r12,0(%rdi)
	movq	%r13,8(%rdi)
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi
	incq	%rcx
	jnz	.Lsqr8x_sub

	sbbq	$0,%rax	/* %rax minus the final borrow becomes the select mask */
	leaq	(%rbx,%r9,1),%rbx	/* step both pointers by %r9 for the copy loop */
	leaq	(%rdi,%r9,1),%rdi

.byte	102,72,15,110,200	/* movq %rax,%xmm1 */
	pxor	%xmm0,%xmm0
	pshufd	$0,%xmm1,%xmm1	/* broadcast the low dword of the mask */
	movq	40(%rsp),%rsi	/* caller's %rsp */
	jmp	.Lsqr8x_cond_copy

.align	32
.Lsqr8x_cond_copy:
	/*
	 * Per 32 bytes: load the temporary (%rbx) and the subtracted result
	 * (%rdi), zero the temporary and the 32 bytes at offset %rdx from
	 * it, then store (temporary & mask) | (result & ~mask) to the
	 * result.  pcmpeqd against zeroed %xmm0 yields the complement of an
	 * all-zeros/all-ones mask.
	 */
	movdqa	0(%rbx),%xmm2
	movdqa	16(%rbx),%xmm3
	leaq	32(%rbx),%rbx
	movdqu	0(%rdi),%xmm4
	movdqu	16(%rdi),%xmm5
	leaq	32(%rdi),%rdi
	movdqa	%xmm0,-32(%rbx)
	movdqa	%xmm0,-16(%rbx)
	movdqa	%xmm0,-32(%rbx,%rdx,1)
	movdqa	%xmm0,-16(%rbx,%rdx,1)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-32(%rdi)
	movdqu	%xmm5,-16(%rdi)
	addq	$32,%r9
	jnz	.Lsqr8x_cond_copy

	movq	$1,%rax
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lsqr8x_epilogue:
	.byte	0xf3,0xc3	/* rep ret */
.size	bn_sqr8x_mont,.-bn_sqr8x_mont
/*
 * bn_mulx4x_mont -- Montgomery multiplication using mulx with two
 * independent carry chains (adcx on CF, adox on OF).  bn_mul4x_mont
 * jumps to .Lmulx4x_enter when bits 8 and 19 of the capability word are
 * both set.  Argument registers are as for bn_mul_mont: %rdi result,
 * %rsi / %rdx operands, %rcx modulus, %r8 pointer to the reduction word,
 * %r9d = num.  Returns 1 in %rax.
 *
 * Frame slots written below (frame base is 128-byte aligned):
 *    0(%rsp)  num*8, the vector length in bytes
 *    8(%rsp)  pointer to the next word of the second operand
 *   16(%rsp)  end of the second operand (%rdx + num*8)
 *   24(%rsp)  word loaded through %r8
 *   32(%rsp)  result pointer
 *   40(%rsp)  caller's %rsp
 *   48(%rsp)  num/4 - 1, the inner loop trip count
 *   64(%rsp)  tp[], the running result
 */
.type	bn_mulx4x_mont,@function
.align	32
bn_mulx4x_mont:
	movq	%rsp,%rax
.Lmulx4x_enter:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
.Lmulx4x_prologue:

	shll	$3,%r9d	/* %r9 = num*8 (bytes) */
	xorq	%r10,%r10
	subq	%r9,%r10	/* %r10 = -num*8 */
	movq	(%r8),%r8
	leaq	-72(%rsp,%r10,1),%rbp
	andq	$-128,%rbp
	/* lower %rsp to %rbp, touching one word per 4096-byte step */
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
	jmp	.Lmulx4x_page_walk_done

.align	16
.Lmulx4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
.Lmulx4x_page_walk_done:

	leaq	(%rdx,%r9,1),%r10	/* end of the second operand */

	/* populate the frame slots listed in the header comment */
	movq	%r9,0(%rsp)
	shrq	$5,%r9
	movq	%r10,16(%rsp)
	subq	$1,%r9
	movq	%r8,24(%rsp)
	movq	%rdi,32(%rsp)
	movq	%rax,40(%rsp)
	movq	%r9,48(%rsp)
	jmp	.Lmulx4x_body

.align	32
.Lmulx4x_body:
	/*
	 * Round 0, first four words.  %rdx is mulx's implicit multiplier:
	 * it alternates between the second-operand word (kept in %r9) and
	 * the reduction multiplier (kept in %r8).  %rbx walks tp, offset by
	 * 32 bytes from the slot being written.
	 */
	leaq	8(%rdx),%rdi
	movq	(%rdx),%rdx
	leaq	64+32(%rsp),%rbx
	movq	%rdx,%r9

	mulxq	0(%rsi),%r8,%rax
	mulxq	8(%rsi),%r11,%r14
	addq	%rax,%r11
	movq	%rdi,8(%rsp)
	mulxq	16(%rsi),%r12,%r13
	adcq	%r14,%r12
	adcq	$0,%r13

	movq	%r8,%rdi
	imulq	24(%rsp),%r8	/* %r8 = reduction multiplier for this round */
	xorq	%rbp,%rbp	/* %rbp = 0; also clears CF and OF */

	mulxq	24(%rsi),%rax,%r14
	movq	%r8,%rdx
	leaq	32(%rsi),%rsi
	adcxq	%rax,%r13
	adcxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%rdi
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00	/* mulxq 16(%rcx),%rax,%r12 with a 32-bit displacement */
	movq	48(%rsp),%rdi
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r11,-24(%rbx)
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r12,-16(%rbx)

	jmp	.Lmulx4x_1st

.align	32
.Lmulx4x_1st:
	/* remaining words of round 0, four per iteration; %rdi counts down */
	adcxq	%rbp,%r15
	mulxq	0(%rsi),%r10,%rax
	adcxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
.byte	0x67,0x67	/* two 0x67 prefix bytes ahead of the next instruction (presumably padding) */
	movq	%r8,%rdx
	adcxq	%rax,%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-32(%rbx)
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_1st

	movq	0(%rsp),%rax	/* num*8 */
	movq	8(%rsp),%rdi	/* next word of the second operand */
	adcq	%rbp,%r15
	addq	%r15,%r14
	sbbq	%r15,%r15	/* %r15 = 0 or -1: carry out of the top word */
	movq	%r14,-8(%rbx)
	jmp	.Lmulx4x_outer

.align	32
.Lmulx4x_outer:
	/*
	 * Rounds 1..num-1.  Rewind %rsi and %rcx by num*8 bytes, store the
	 * previous round's top carry word, and restart %rbx at tp.
	 */
	movq	(%rdi),%rdx
	leaq	8(%rdi),%rdi
	subq	%rax,%rsi
	movq	%r15,(%rbx)
	leaq	64+32(%rsp),%rbx
	subq	%rax,%rcx

	mulxq	0(%rsi),%r8,%r11
	xorl	%ebp,%ebp	/* %rbp = 0; clears CF and OF */
	movq	%rdx,%r9
	mulxq	8(%rsi),%r14,%r12
	adoxq	-32(%rbx),%r8
	adcxq	%r14,%r11
	mulxq	16(%rsi),%r15,%r13
	adoxq	-24(%rbx),%r11
	adcxq	%r15,%r12
	adoxq	-16(%rbx),%r12
	adcxq	%rbp,%r13
	adoxq	%rbp,%r13

	movq	%rdi,8(%rsp)
	movq	%r8,%r15
	imulq	24(%rsp),%r8	/* %r8 = reduction multiplier for this round */
	xorl	%ebp,%ebp

	mulxq	24(%rsi),%rax,%r14
	movq	%r8,%rdx
	adcxq	%rax,%r13
	adoxq	-8(%rbx),%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	adoxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r11,-24(%rbx)
	leaq	32(%rcx),%rcx
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	movq	48(%rsp),%rdi
	movq	%r12,-16(%rbx)

	jmp	.Lmulx4x_inner

.align	32
.Lmulx4x_inner:
	/* four words per iteration, adding in the previous tp words */
	mulxq	0(%rsi),%r10,%rax
	adcxq	%rbp,%r15
	adoxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	0(%rbx),%r10
	adoxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	8(%rbx),%r11
	adoxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
	movq	%r8,%rdx
	adcxq	16(%rbx),%r12
	adoxq	%rax,%r13
	adcxq	24(%rbx),%r13
	adoxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx
	adcxq	%rbp,%r14

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r11,-32(%rbx)
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_inner

	movq	0(%rsp),%rax	/* num*8 */
	movq	8(%rsp),%rdi	/* next word of the second operand */
	adcq	%rbp,%r15
	subq	0(%rbx),%rbp	/* CF = 1 iff the stored top carry word is nonzero */
	adcq	%r15,%r14
	sbbq	%r15,%r15	/* %r15 = 0 or -1: new top carry */
	movq	%r14,-8(%rbx)

	cmpq	16(%rsp),%rdi
	jne	.Lmulx4x_outer

	/*
	 * Final reduction: result = tp - modulus, four words per iteration.
	 * %r15 becomes the top carry as 0 or 1; %rax = num/4 iterations.
	 * The first sbbq sees CF as left by the shrq; leaq and decq in the
	 * loop do not modify CF.
	 */
	leaq	64(%rsp),%rbx
	subq	%rax,%rcx
	negq	%r15
	movq	%rax,%rdx
	shrq	$3+2,%rax
	movq	32(%rsp),%rdi	/* result pointer */
	jmp	.Lmulx4x_sub

.align	32
.Lmulx4x_sub:
	movq	0(%rbx),%r11
	movq	8(%rbx),%r12
	movq	16(%rbx),%r13
	movq	24(%rbx),%r14
	leaq	32(%rbx),%rbx
	sbbq	0(%rcx),%r11
	sbbq	8(%rcx),%r12
	sbbq	16(%rcx),%r13
	sbbq	24(%rcx),%r14
	leaq	32(%rcx),%rcx
	movq	%r11,0(%rdi)
	movq	%r12,8(%rdi)
	movq	%r13,16(%rdi)
	movq	%r14,24(%rdi)
	leaq	32(%rdi),%rdi
	decq	%rax
	jnz	.Lmulx4x_sub

	sbbq	$0,%r15	/* top carry minus final borrow: the select mask */
	leaq	64(%rsp),%rbx
	subq	%rdx,%rdi	/* rewind the result pointer */

.byte	102,73,15,110,207	/* movq %r15,%xmm1 */
	pxor	%xmm0,%xmm0
	pshufd	$0,%xmm1,%xmm1	/* broadcast the low dword of the mask */
	movq	40(%rsp),%rsi	/* caller's %rsp */
	jmp	.Lmulx4x_cond_copy

.align	32
.Lmulx4x_cond_copy:
	/*
	 * Per 32 bytes: load tp and the subtracted result, zero tp, then
	 * store (tp & mask) | (result & ~mask) to the result.  pcmpeqd
	 * against zeroed %xmm0 yields the complement of an
	 * all-zeros/all-ones mask.
	 */
	movdqa	0(%rbx),%xmm2
	movdqa	16(%rbx),%xmm3
	leaq	32(%rbx),%rbx
	movdqu	0(%rdi),%xmm4
	movdqu	16(%rdi),%xmm5
	leaq	32(%rdi),%rdi
	movdqa	%xmm0,-32(%rbx)
	movdqa	%xmm0,-16(%rbx)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-32(%rdi)
	movdqu	%xmm5,-16(%rdi)
	subq	$32,%rdx
	jnz	.Lmulx4x_cond_copy

	movq	%rdx,(%rbx)	/* %rdx is 0 here: clear the word just past tp */

	movq	$1,%rax
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lmulx4x_epilogue:
	.byte	0xf3,0xc3	/* rep ret */
.size	bn_mulx4x_mont,.-bn_mulx4x_mont
/* NUL-terminated ASCII: "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro@openssl.org>" */
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	16