x86_64-mont.S revision 337982
1/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/x86_64-mont.S 337982 2018-08-17 18:32:53Z jkim $ */
2/* Do not modify. This file is auto-generated from x86_64-mont.pl. */
3.text
4
5
6
/*
 * bn_mul_mont -- public entry point (AT&T syntax, x86-64).
 *
 * Inputs arrive in the six System V AMD64 integer-argument registers:
 *   %rdi  destination word array (written by .Lsub / .Lcopy)
 *   %rsi  first operand word array
 *   %rdx  second operand word array (kept in %r12 on the generic path)
 *   %rcx  word array subtracted from the result in .Lsub
 *         (presumably the modulus -- confirm against x86_64-mont.pl)
 *   %r8   pointer to one 64-bit constant, dereferenced once
 *         (presumably the Montgomery n0 value -- confirm)
 *   %r9d  number of 64-bit words
 * Returns 1 in %rax.  Callee-saved registers are pushed on entry and
 * reloaded from the saved frame before returning.
 *
 * The first instructions only dispatch: the generic word-at-a-time code
 * below handles counts that are not a multiple of 4 or are below 8;
 * everything else goes to the 4x or 8x-squaring paths.
 */
7.globl	bn_mul_mont
8.type	bn_mul_mont,@function
9.align	16
10bn_mul_mont:
11	movl	%r9d,%r9d	/* 32-bit move zero-extends the word count into %r9 */
12	movq	%rsp,%rax	/* remember caller's %rsp; stored in the frame later */
13	testl	$3,%r9d
14	jnz	.Lmul_enter	/* count not a multiple of 4 -> generic path */
15	cmpl	$8,%r9d
16	jb	.Lmul_enter	/* fewer than 8 words -> generic path */
17	movl	OPENSSL_ia32cap_P+8(%rip),%r11d	/* capability word, tested at .Lmul4x_enter */
18	cmpq	%rsi,%rdx
19	jne	.Lmul4x_enter	/* operands are different arrays -> 4x multiply */
20	testl	$7,%r9d
21	jz	.Lsqr8x_enter	/* same array and count multiple of 8 -> squaring path */
22	jmp	.Lmul4x_enter
23
24.align	16
25.Lmul_enter:
26	pushq	%rbx
27	pushq	%rbp
28	pushq	%r12
29	pushq	%r13
30	pushq	%r14
31	pushq	%r15
32
/* Carve out a temporary vector below %rsp: %r10 = %rsp - 8*(count+2), rounded down to 1024. */
33	negq	%r9
34	movq	%rsp,%r11
35	leaq	-16(%rsp,%r9,8),%r10
36	negq	%r9
37	andq	$-1024,%r10
38
39
40
41
42
43
44
/*
 * Move %rsp down to %r10 while reading one word at every 4096-byte step,
 * so each page between the old and new stack pointer is touched in order.
 */
45	subq	%r10,%r11
46	andq	$-4096,%r11
47	leaq	(%r10,%r11,1),%rsp
48	movq	(%rsp),%r11
49	cmpq	%r10,%rsp
50	ja	.Lmul_page_walk
51	jmp	.Lmul_page_walk_done
52
53.align	16
54.Lmul_page_walk:
55	leaq	-4096(%rsp),%rsp
56	movq	(%rsp),%r11
57	cmpq	%r10,%rsp
58	ja	.Lmul_page_walk
59.Lmul_page_walk_done:
60
61	movq	%rax,8(%rsp,%r9,8)	/* save caller's %rsp just above the temporary vector */
62.Lmul_body:
63	movq	%rdx,%r12	/* %rdx is clobbered by mulq, keep second operand in %r12 */
64	movq	(%r8),%r8	/* load the 64-bit constant itself */
65	movq	(%r12),%rbx	/* %rbx = word 0 of second operand */
66	movq	(%rsi),%rax	/* %rax = word 0 of first operand */
67
68	xorq	%r14,%r14	/* %r14 = outer index (word of second operand) */
69	xorq	%r15,%r15	/* %r15 = inner index */
70
71	movq	%r8,%rbp
72	mulq	%rbx	/* %rdx:%rax = %rax * %rbx */
73	movq	%rax,%r10
74	movq	(%rcx),%rax
75
76	imulq	%r10,%rbp	/* %rbp = low 64 bits of constant * low product word */
77	movq	%rdx,%r11
78
79	mulq	%rbp
80	addq	%rax,%r10	/* low word is not stored; only the carry is used */
81	movq	8(%rsi),%rax
82	adcq	$0,%rdx
83	movq	%rdx,%r13
84
85	leaq	1(%r15),%r15
86	jmp	.L1st_enter
87
/* First pass: fills the temporary vector on the stack for outer index 0. */
88.align	16
89.L1st:
90	addq	%rax,%r13
91	movq	(%rsi,%r15,8),%rax
92	adcq	$0,%rdx
93	addq	%r11,%r13
94	movq	%r10,%r11
95	adcq	$0,%rdx
96	movq	%r13,-16(%rsp,%r15,8)	/* store finished word of the temporary vector */
97	movq	%rdx,%r13
98
99.L1st_enter:
100	mulq	%rbx
101	addq	%rax,%r11
102	movq	(%rcx,%r15,8),%rax
103	adcq	$0,%rdx
104	leaq	1(%r15),%r15
105	movq	%rdx,%r10
106
107	mulq	%rbp
108	cmpq	%r9,%r15
109	jne	.L1st
110
111	addq	%rax,%r13
112	movq	(%rsi),%rax	/* reload word 0 of first operand for the next pass */
113	adcq	$0,%rdx
114	addq	%r11,%r13
115	adcq	$0,%rdx
116	movq	%r13,-16(%rsp,%r15,8)
117	movq	%rdx,%r13
118	movq	%r10,%r11
119
120	xorq	%rdx,%rdx
121	addq	%r11,%r13
122	adcq	$0,%rdx
123	movq	%r13,-8(%rsp,%r9,8)	/* top word of the temporary vector */
124	movq	%rdx,(%rsp,%r9,8)	/* carry word above it */
125
126	leaq	1(%r14),%r14
127	jmp	.Louter
/* Remaining passes: one per further word of the second operand, accumulating into the temporary vector. */
128.align	16
129.Louter:
130	movq	(%r12,%r14,8),%rbx	/* %rbx = current word of second operand */
131	xorq	%r15,%r15
132	movq	%r8,%rbp
133	movq	(%rsp),%r10	/* word 0 of the temporary vector */
134	mulq	%rbx
135	addq	%rax,%r10
136	movq	(%rcx),%rax
137	adcq	$0,%rdx
138
139	imulq	%r10,%rbp
140	movq	%rdx,%r11
141
142	mulq	%rbp
143	addq	%rax,%r10
144	movq	8(%rsi),%rax
145	adcq	$0,%rdx
146	movq	8(%rsp),%r10
147	movq	%rdx,%r13
148
149	leaq	1(%r15),%r15
150	jmp	.Linner_enter
151
152.align	16
153.Linner:
154	addq	%rax,%r13
155	movq	(%rsi,%r15,8),%rax
156	adcq	$0,%rdx
157	addq	%r10,%r13
158	movq	(%rsp,%r15,8),%r10
159	adcq	$0,%rdx
160	movq	%r13,-16(%rsp,%r15,8)
161	movq	%rdx,%r13
162
163.Linner_enter:
164	mulq	%rbx
165	addq	%rax,%r11
166	movq	(%rcx,%r15,8),%rax
167	adcq	$0,%rdx
168	addq	%r11,%r10
169	movq	%rdx,%r11
170	adcq	$0,%r11
171	leaq	1(%r15),%r15
172
173	mulq	%rbp
174	cmpq	%r9,%r15
175	jne	.Linner
176
177	addq	%rax,%r13
178	movq	(%rsi),%rax
179	adcq	$0,%rdx
180	addq	%r10,%r13
181	movq	(%rsp,%r15,8),%r10	/* previous carry word of the temporary vector */
182	adcq	$0,%rdx
183	movq	%r13,-16(%rsp,%r15,8)
184	movq	%rdx,%r13
185
186	xorq	%rdx,%rdx
187	addq	%r11,%r13
188	adcq	$0,%rdx
189	addq	%r10,%r13
190	adcq	$0,%rdx
191	movq	%r13,-8(%rsp,%r9,8)
192	movq	%rdx,(%rsp,%r9,8)
193
194	leaq	1(%r14),%r14
195	cmpq	%r9,%r14
196	jb	.Louter
197
198	xorq	%r14,%r14	/* index = 0; xor also clears CF for the sbbq chain below */
199	movq	(%rsp),%rax
200	movq	%r9,%r15
201
/*
 * destination[i] = temporary[i] - (%rcx)[i] with borrow.  The borrow lives
 * in CF across iterations: leaq and movq leave flags alone and decq does
 * not modify CF.
 */
202.align	16
203.Lsub:	sbbq	(%rcx,%r14,8),%rax
204	movq	%rax,(%rdi,%r14,8)
205	movq	8(%rsp,%r14,8),%rax
206	leaq	1(%r14),%r14
207	decq	%r15
208	jnz	.Lsub
209
210	sbbq	$0,%rax	/* carry word minus final borrow; used as a select mask (expected 0 or all-ones) */
211	movq	$-1,%rbx
212	xorq	%rax,%rbx	/* %rbx = complement of the mask */
213	xorq	%r14,%r14
214	movq	%r9,%r15
215
/*
 * Branch-free selection: destination[i] = (destination[i] & ~mask) |
 * (temporary[i] & mask).  Each temporary word is overwritten with the
 * value of %r9 after it has been read.
 */
216.Lcopy:
217	movq	(%rdi,%r14,8),%rcx
218	movq	(%rsp,%r14,8),%rdx
219	andq	%rbx,%rcx
220	andq	%rax,%rdx
221	movq	%r9,(%rsp,%r14,8)
222	orq	%rcx,%rdx
223	movq	%rdx,(%rdi,%r14,8)
224	leaq	1(%r14),%r14
225	subq	$1,%r15
226	jnz	.Lcopy
227
228	movq	8(%rsp,%r9,8),%rsi	/* %rsi = caller's %rsp saved at .Lmul_page_walk_done */
229	movq	$1,%rax	/* return value */
230	movq	-48(%rsi),%r15	/* reload the six registers pushed at .Lmul_enter */
231	movq	-40(%rsi),%r14
232	movq	-32(%rsi),%r13
233	movq	-24(%rsi),%r12
234	movq	-16(%rsi),%rbp
235	movq	-8(%rsi),%rbx
236	leaq	(%rsi),%rsp
237.Lmul_epilogue:
238	.byte	0xf3,0xc3	/* encodes "rep ret" */
239.size	bn_mul_mont,.-bn_mul_mont
/*
 * bn_mul4x_mont -- file-local variant whose loops are unrolled to handle
 * four words per iteration.  Same register inputs as bn_mul_mont
 * (%rdi destination, %rsi/%rdx operands, %rcx array subtracted at the end,
 * %r8 pointer to a 64-bit constant, %r9d word count); returns 1 in %rax.
 *
 * Within this file it is reached through .Lmul4x_enter from bn_mul_mont,
 * which has already zero-extended %r9, copied %rsp to %rax and loaded
 * OPENSSL_ia32cap_P+8 into %r11d.
 * NOTE(review): falling in through the bn_mul4x_mont label itself would
 * reach the %r11d test without %r11d having been loaded here -- confirm no
 * caller uses that label directly.
 */
240.type	bn_mul4x_mont,@function
241.align	16
242bn_mul4x_mont:
243	movl	%r9d,%r9d
244	movq	%rsp,%rax
245.Lmul4x_enter:
246	andl	$0x80100,%r11d	/* bits 8 and 19 of the capability word; presumably BMI2 and ADX -- confirm */
247	cmpl	$0x80100,%r11d
248	je	.Lmulx4x_enter	/* both bits set -> MULX/ADCX/ADOX implementation */
249	pushq	%rbx
250	pushq	%rbp
251	pushq	%r12
252	pushq	%r13
253	pushq	%r14
254	pushq	%r15
255
/* Temporary vector: %r10 = %rsp - 8*(count+4), rounded down to 1024. */
256	negq	%r9
257	movq	%rsp,%r11
258	leaq	-32(%rsp,%r9,8),%r10
259	negq	%r9
260	andq	$-1024,%r10
261
/* Lower %rsp to %r10, reading one word per 4096-byte step on the way down. */
262	subq	%r10,%r11
263	andq	$-4096,%r11
264	leaq	(%r10,%r11,1),%rsp
265	movq	(%rsp),%r11
266	cmpq	%r10,%rsp
267	ja	.Lmul4x_page_walk
268	jmp	.Lmul4x_page_walk_done
269
270.Lmul4x_page_walk:
271	leaq	-4096(%rsp),%rsp
272	movq	(%rsp),%r11
273	cmpq	%r10,%rsp
274	ja	.Lmul4x_page_walk
275.Lmul4x_page_walk_done:
276
277	movq	%rax,8(%rsp,%r9,8)	/* save caller's %rsp above the temporary vector */
278.Lmul4x_body:
279	movq	%rdi,16(%rsp,%r9,8)	/* save destination pointer; %rdi is reused as an accumulator */
280	movq	%rdx,%r12	/* second operand pointer (mulq clobbers %rdx) */
281	movq	(%r8),%r8	/* load the 64-bit constant itself */
282	movq	(%r12),%rbx	/* word 0 of second operand */
283	movq	(%rsi),%rax	/* word 0 of first operand */
284
285	xorq	%r14,%r14	/* outer index */
286	xorq	%r15,%r15	/* inner index */
287
288	movq	%r8,%rbp
289	mulq	%rbx
290	movq	%rax,%r10
291	movq	(%rcx),%rax
292
293	imulq	%r10,%rbp	/* %rbp = low 64 bits of constant * low product word */
294	movq	%rdx,%r11
295
296	mulq	%rbp
297	addq	%rax,%r10	/* low word is not stored; only the carry is used */
298	movq	8(%rsi),%rax
299	adcq	$0,%rdx
300	movq	%rdx,%rdi
301
302	mulq	%rbx
303	addq	%rax,%r11
304	movq	8(%rcx),%rax
305	adcq	$0,%rdx
306	movq	%rdx,%r10
307
308	mulq	%rbp
309	addq	%rax,%rdi
310	movq	16(%rsi),%rax
311	adcq	$0,%rdx
312	addq	%r11,%rdi
313	leaq	4(%r15),%r15
314	adcq	$0,%rdx
315	movq	%rdi,(%rsp)	/* word 0 of the temporary vector */
316	movq	%rdx,%r13
317	jmp	.L1st4x
/* First pass, four words per iteration; %r13 and %rdi alternate as the running carry. */
318.align	16
319.L1st4x:
320	mulq	%rbx
321	addq	%rax,%r10
322	movq	-16(%rcx,%r15,8),%rax
323	adcq	$0,%rdx
324	movq	%rdx,%r11
325
326	mulq	%rbp
327	addq	%rax,%r13
328	movq	-8(%rsi,%r15,8),%rax
329	adcq	$0,%rdx
330	addq	%r10,%r13
331	adcq	$0,%rdx
332	movq	%r13,-24(%rsp,%r15,8)
333	movq	%rdx,%rdi
334
335	mulq	%rbx
336	addq	%rax,%r11
337	movq	-8(%rcx,%r15,8),%rax
338	adcq	$0,%rdx
339	movq	%rdx,%r10
340
341	mulq	%rbp
342	addq	%rax,%rdi
343	movq	(%rsi,%r15,8),%rax
344	adcq	$0,%rdx
345	addq	%r11,%rdi
346	adcq	$0,%rdx
347	movq	%rdi,-16(%rsp,%r15,8)
348	movq	%rdx,%r13
349
350	mulq	%rbx
351	addq	%rax,%r10
352	movq	(%rcx,%r15,8),%rax
353	adcq	$0,%rdx
354	movq	%rdx,%r11
355
356	mulq	%rbp
357	addq	%rax,%r13
358	movq	8(%rsi,%r15,8),%rax
359	adcq	$0,%rdx
360	addq	%r10,%r13
361	adcq	$0,%rdx
362	movq	%r13,-8(%rsp,%r15,8)
363	movq	%rdx,%rdi
364
365	mulq	%rbx
366	addq	%rax,%r11
367	movq	8(%rcx,%r15,8),%rax
368	adcq	$0,%rdx
369	leaq	4(%r15),%r15
370	movq	%rdx,%r10
371
372	mulq	%rbp
373	addq	%rax,%rdi
374	movq	-16(%rsi,%r15,8),%rax
375	adcq	$0,%rdx
376	addq	%r11,%rdi
377	adcq	$0,%rdx
378	movq	%rdi,-32(%rsp,%r15,8)
379	movq	%rdx,%r13
380	cmpq	%r9,%r15
381	jb	.L1st4x
382
/* Tail of the first pass: the last two words plus the carry words. */
383	mulq	%rbx
384	addq	%rax,%r10
385	movq	-16(%rcx,%r15,8),%rax
386	adcq	$0,%rdx
387	movq	%rdx,%r11
388
389	mulq	%rbp
390	addq	%rax,%r13
391	movq	-8(%rsi,%r15,8),%rax
392	adcq	$0,%rdx
393	addq	%r10,%r13
394	adcq	$0,%rdx
395	movq	%r13,-24(%rsp,%r15,8)
396	movq	%rdx,%rdi
397
398	mulq	%rbx
399	addq	%rax,%r11
400	movq	-8(%rcx,%r15,8),%rax
401	adcq	$0,%rdx
402	movq	%rdx,%r10
403
404	mulq	%rbp
405	addq	%rax,%rdi
406	movq	(%rsi),%rax	/* reload word 0 of first operand for the next pass */
407	adcq	$0,%rdx
408	addq	%r11,%rdi
409	adcq	$0,%rdx
410	movq	%rdi,-16(%rsp,%r15,8)
411	movq	%rdx,%r13
412
413	xorq	%rdi,%rdi
414	addq	%r10,%r13
415	adcq	$0,%rdi
416	movq	%r13,-8(%rsp,%r15,8)
417	movq	%rdi,(%rsp,%r15,8)	/* carry word above the temporary vector */
418
419	leaq	1(%r14),%r14
/* Remaining passes: same structure, additionally adding in the existing temporary vector words. */
420.align	4
421.Louter4x:
422	movq	(%r12,%r14,8),%rbx	/* current word of second operand */
423	xorq	%r15,%r15
424	movq	(%rsp),%r10
425	movq	%r8,%rbp
426	mulq	%rbx
427	addq	%rax,%r10
428	movq	(%rcx),%rax
429	adcq	$0,%rdx
430
431	imulq	%r10,%rbp
432	movq	%rdx,%r11
433
434	mulq	%rbp
435	addq	%rax,%r10
436	movq	8(%rsi),%rax
437	adcq	$0,%rdx
438	movq	%rdx,%rdi
439
440	mulq	%rbx
441	addq	%rax,%r11
442	movq	8(%rcx),%rax
443	adcq	$0,%rdx
444	addq	8(%rsp),%r11
445	adcq	$0,%rdx
446	movq	%rdx,%r10
447
448	mulq	%rbp
449	addq	%rax,%rdi
450	movq	16(%rsi),%rax
451	adcq	$0,%rdx
452	addq	%r11,%rdi
453	leaq	4(%r15),%r15
454	adcq	$0,%rdx
455	movq	%rdi,(%rsp)
456	movq	%rdx,%r13
457	jmp	.Linner4x
458.align	16
459.Linner4x:
460	mulq	%rbx
461	addq	%rax,%r10
462	movq	-16(%rcx,%r15,8),%rax
463	adcq	$0,%rdx
464	addq	-16(%rsp,%r15,8),%r10
465	adcq	$0,%rdx
466	movq	%rdx,%r11
467
468	mulq	%rbp
469	addq	%rax,%r13
470	movq	-8(%rsi,%r15,8),%rax
471	adcq	$0,%rdx
472	addq	%r10,%r13
473	adcq	$0,%rdx
474	movq	%r13,-24(%rsp,%r15,8)
475	movq	%rdx,%rdi
476
477	mulq	%rbx
478	addq	%rax,%r11
479	movq	-8(%rcx,%r15,8),%rax
480	adcq	$0,%rdx
481	addq	-8(%rsp,%r15,8),%r11
482	adcq	$0,%rdx
483	movq	%rdx,%r10
484
485	mulq	%rbp
486	addq	%rax,%rdi
487	movq	(%rsi,%r15,8),%rax
488	adcq	$0,%rdx
489	addq	%r11,%rdi
490	adcq	$0,%rdx
491	movq	%rdi,-16(%rsp,%r15,8)
492	movq	%rdx,%r13
493
494	mulq	%rbx
495	addq	%rax,%r10
496	movq	(%rcx,%r15,8),%rax
497	adcq	$0,%rdx
498	addq	(%rsp,%r15,8),%r10
499	adcq	$0,%rdx
500	movq	%rdx,%r11
501
502	mulq	%rbp
503	addq	%rax,%r13
504	movq	8(%rsi,%r15,8),%rax
505	adcq	$0,%rdx
506	addq	%r10,%r13
507	adcq	$0,%rdx
508	movq	%r13,-8(%rsp,%r15,8)
509	movq	%rdx,%rdi
510
511	mulq	%rbx
512	addq	%rax,%r11
513	movq	8(%rcx,%r15,8),%rax
514	adcq	$0,%rdx
515	addq	8(%rsp,%r15,8),%r11
516	adcq	$0,%rdx
517	leaq	4(%r15),%r15
518	movq	%rdx,%r10
519
520	mulq	%rbp
521	addq	%rax,%rdi
522	movq	-16(%rsi,%r15,8),%rax
523	adcq	$0,%rdx
524	addq	%r11,%rdi
525	adcq	$0,%rdx
526	movq	%rdi,-32(%rsp,%r15,8)
527	movq	%rdx,%r13
528	cmpq	%r9,%r15
529	jb	.Linner4x
530
531	mulq	%rbx
532	addq	%rax,%r10
533	movq	-16(%rcx,%r15,8),%rax
534	adcq	$0,%rdx
535	addq	-16(%rsp,%r15,8),%r10
536	adcq	$0,%rdx
537	movq	%rdx,%r11
538
539	mulq	%rbp
540	addq	%rax,%r13
541	movq	-8(%rsi,%r15,8),%rax
542	adcq	$0,%rdx
543	addq	%r10,%r13
544	adcq	$0,%rdx
545	movq	%r13,-24(%rsp,%r15,8)
546	movq	%rdx,%rdi
547
548	mulq	%rbx
549	addq	%rax,%r11
550	movq	-8(%rcx,%r15,8),%rax
551	adcq	$0,%rdx
552	addq	-8(%rsp,%r15,8),%r11
553	adcq	$0,%rdx
554	leaq	1(%r14),%r14	/* advance outer index (leaq leaves flags untouched) */
555	movq	%rdx,%r10
556
557	mulq	%rbp
558	addq	%rax,%rdi
559	movq	(%rsi),%rax
560	adcq	$0,%rdx
561	addq	%r11,%rdi
562	adcq	$0,%rdx
563	movq	%rdi,-16(%rsp,%r15,8)
564	movq	%rdx,%r13
565
566	xorq	%rdi,%rdi
567	addq	%r10,%r13
568	adcq	$0,%rdi
569	addq	(%rsp,%r9,8),%r13	/* add the carry word left by the previous pass */
570	adcq	$0,%rdi
571	movq	%r13,-8(%rsp,%r15,8)
572	movq	%rdi,(%rsp,%r15,8)
573
574	cmpq	%r9,%r14
575	jb	.Louter4x
/*
 * Subtract the (%rcx) array from the temporary vector into the destination,
 * four words per iteration.  subq starts the borrow chain; it is carried in
 * CF through the loop (leaq/movq leave flags alone, decq does not modify CF).
 */
576	movq	16(%rsp,%r9,8),%rdi	/* restore destination pointer saved at .Lmul4x_body */
577	leaq	-4(%r9),%r15
578	movq	0(%rsp),%rax
579	movq	8(%rsp),%rdx
580	shrq	$2,%r15	/* loop count = (count-4)/4; last four words are handled after the loop */
581	leaq	(%rsp),%rsi	/* %rsi = base of the temporary vector */
582	xorq	%r14,%r14
583
584	subq	0(%rcx),%rax
585	movq	16(%rsi),%rbx
586	movq	24(%rsi),%rbp
587	sbbq	8(%rcx),%rdx
588
589.Lsub4x:
590	movq	%rax,0(%rdi,%r14,8)
591	movq	%rdx,8(%rdi,%r14,8)
592	sbbq	16(%rcx,%r14,8),%rbx
593	movq	32(%rsi,%r14,8),%rax
594	movq	40(%rsi,%r14,8),%rdx
595	sbbq	24(%rcx,%r14,8),%rbp
596	movq	%rbx,16(%rdi,%r14,8)
597	movq	%rbp,24(%rdi,%r14,8)
598	sbbq	32(%rcx,%r14,8),%rax
599	movq	48(%rsi,%r14,8),%rbx
600	movq	56(%rsi,%r14,8),%rbp
601	sbbq	40(%rcx,%r14,8),%rdx
602	leaq	4(%r14),%r14
603	decq	%r15
604	jnz	.Lsub4x
605
606	movq	%rax,0(%rdi,%r14,8)
607	movq	32(%rsi,%r14,8),%rax	/* carry word stored above the temporary vector */
608	sbbq	16(%rcx,%r14,8),%rbx
609	movq	%rdx,8(%rdi,%r14,8)
610	sbbq	24(%rcx,%r14,8),%rbp
611	movq	%rbx,16(%rdi,%r14,8)
612
613	sbbq	$0,%rax	/* carry word minus final borrow -> select mask (expected 0 or all-ones) */
614	movq	%rbp,24(%rdi,%r14,8)
615	pxor	%xmm0,%xmm0	/* zero, used to wipe the temporary vector */
616.byte	102,72,15,110,224	/* encodes movq %rax,%xmm4 */
617	pcmpeqd	%xmm5,%xmm5	/* all ones */
618	pshufd	$0,%xmm4,%xmm4	/* broadcast low dword of the mask to all lanes */
619	movq	%r9,%r15
620	pxor	%xmm4,%xmm5	/* %xmm5 = complement of the mask */
621	shrq	$2,%r15	/* count/4 iterations of 32 bytes each */
622	xorl	%eax,%eax	/* byte offset */
623
624	jmp	.Lcopy4x
/*
 * Branch-free selection, 32 bytes per iteration:
 * destination = (temporary & mask) | (destination & ~mask); the temporary
 * words are overwritten with zero as they are consumed.  movdqa is used on
 * the stack side, which relies on the 1024-byte frame alignment set up above.
 */
625.align	16
626.Lcopy4x:
627	movdqa	(%rsp,%rax,1),%xmm1
628	movdqu	(%rdi,%rax,1),%xmm2
629	pand	%xmm4,%xmm1
630	pand	%xmm5,%xmm2
631	movdqa	16(%rsp,%rax,1),%xmm3
632	movdqa	%xmm0,(%rsp,%rax,1)
633	por	%xmm2,%xmm1
634	movdqu	16(%rdi,%rax,1),%xmm2
635	movdqu	%xmm1,(%rdi,%rax,1)
636	pand	%xmm4,%xmm3
637	pand	%xmm5,%xmm2
638	movdqa	%xmm0,16(%rsp,%rax,1)
639	por	%xmm2,%xmm3
640	movdqu	%xmm3,16(%rdi,%rax,1)
641	leaq	32(%rax),%rax
642	decq	%r15
643	jnz	.Lcopy4x
644	movq	8(%rsp,%r9,8),%rsi	/* caller's %rsp saved at .Lmul4x_page_walk_done */
645	movq	$1,%rax	/* return value */
646	movq	-48(%rsi),%r15	/* reload the six registers pushed on entry */
647	movq	-40(%rsi),%r14
648	movq	-32(%rsi),%r13
649	movq	-24(%rsi),%r12
650	movq	-16(%rsi),%rbp
651	movq	-8(%rsi),%rbx
652	leaq	(%rsi),%rsp
653.Lmul4x_epilogue:
654	.byte	0xf3,0xc3	/* encodes "rep ret" */
655.size	bn_mul4x_mont,.-bn_mul4x_mont
656
657
658
/*
 * bn_sqr8x_mont -- file-local wrapper used when both operand pointers are
 * equal and the word count is a multiple of 8 (see the dispatch in
 * bn_mul_mont).  It builds a stack frame, calls bn_sqrx8x_internal or
 * bn_sqr8x_internal (both defined outside this file), then performs the
 * final subtraction and branch-free selection into the destination.
 * Returns 1 in %rax.
 *
 * NOTE(review): the code after each call relies on register values left by
 * the internal routine (%rdi/%r8, %r9/%rcx, %rbp, %rax).  Those contracts
 * are not visible in this file -- confirm against the routine's source.
 */
659.type	bn_sqr8x_mont,@function
660.align	32
661bn_sqr8x_mont:
662	movq	%rsp,%rax
663.Lsqr8x_enter:
664	pushq	%rbx
665	pushq	%rbp
666	pushq	%r12
667	pushq	%r13
668	pushq	%r14
669	pushq	%r15
670.Lsqr8x_prologue:
671
672	movl	%r9d,%r10d
673	shll	$3,%r9d	/* %r9 = 8*count (bytes) */
674	shlq	$3+2,%r10	/* %r10 = 32*count */
675	negq	%r9	/* %r9 = -8*count */
676
677
678
679
680
681
/*
 * Choose the frame base in %rbp from the distance between the candidate
 * frame (%rsp - 64 - 16*count) and %rsi, taken modulo 4096.
 * NOTE(review): the rationale for this placement is not stated in this
 * file -- see x86_64-mont.pl.
 */
682	leaq	-64(%rsp,%r9,2),%r11
683	movq	%rsp,%rbp
684	movq	(%r8),%r8	/* load the 64-bit constant itself */
685	subq	%rsi,%r11
686	andq	$4095,%r11
687	cmpq	%r11,%r10
688	jb	.Lsqr8x_sp_alt
689	subq	%r11,%rbp
690	leaq	-64(%rbp,%r9,2),%rbp
691	jmp	.Lsqr8x_sp_done
692
693.align	32
694.Lsqr8x_sp_alt:
695	leaq	4096-64(,%r9,2),%r10
696	leaq	-64(%rbp,%r9,2),%rbp
697	subq	%r10,%r11
698	movq	$0,%r10	/* movq rather than xor: flags from subq must survive for cmovc */
699	cmovcq	%r10,%r11	/* clamp to 0 when the subtraction borrowed */
700	subq	%r11,%rbp
701.Lsqr8x_sp_done:
702	andq	$-64,%rbp	/* 64-byte align the frame */
/* Lower %rsp to %rbp, reading one word per 4096-byte step on the way down. */
703	movq	%rsp,%r11
704	subq	%rbp,%r11
705	andq	$-4096,%r11
706	leaq	(%r11,%rbp,1),%rsp
707	movq	(%rsp),%r10
708	cmpq	%rbp,%rsp
709	ja	.Lsqr8x_page_walk
710	jmp	.Lsqr8x_page_walk_done
711
712.align	16
713.Lsqr8x_page_walk:
714	leaq	-4096(%rsp),%rsp
715	movq	(%rsp),%r10
716	cmpq	%rbp,%rsp
717	ja	.Lsqr8x_page_walk
718.Lsqr8x_page_walk_done:
719
720	movq	%r9,%r10	/* %r10 = -8*count */
721	negq	%r9	/* %r9 = +8*count */
722
723	movq	%r8,32(%rsp)	/* frame slot: the 64-bit constant */
724	movq	%rax,40(%rsp)	/* frame slot: caller's %rsp */
725.Lsqr8x_body:
726
727.byte	102,72,15,110,209	/* encodes movq %rcx,%xmm2 */
728	pxor	%xmm0,%xmm0
729.byte	102,72,15,110,207	/* encodes movq %rdi,%xmm1 (stash destination pointer) */
730.byte	102,73,15,110,218	/* encodes movq %r10,%xmm3 */
731	movl	OPENSSL_ia32cap_P+8(%rip),%eax
732	andl	$0x80100,%eax	/* bits 8 and 19; presumably BMI2 and ADX -- confirm */
733	cmpl	$0x80100,%eax
734	jne	.Lsqr8x_nox
735
736	call	bn_sqrx8x_internal
737
738
739
740
/* After the MULX-based routine: tail pointer is formed from %r8 + %rcx. */
741	leaq	(%r8,%rcx,1),%rbx
742	movq	%rcx,%r9
743	movq	%rcx,%rdx
744.byte	102,72,15,126,207	/* encodes movq %xmm1,%rdi (restore destination pointer) */
745	sarq	$3+2,%rcx	/* loop counter in units of 4 words; counts up to zero in .Lsqr8x_sub */
746	jmp	.Lsqr8x_sub
747
748.align	32
749.Lsqr8x_nox:
750	call	bn_sqr8x_internal
751
752
753
754
/* After the generic routine: tail pointer is formed from %rdi + %r9. */
755	leaq	(%rdi,%r9,1),%rbx
756	movq	%r9,%rcx
757	movq	%r9,%rdx
758.byte	102,72,15,126,207	/* encodes movq %xmm1,%rdi (restore destination pointer) */
759	sarq	$3+2,%rcx	/* loop counter in units of 4 words; counts up to zero in .Lsqr8x_sub */
760	jmp	.Lsqr8x_sub
761
/*
 * destination = (%rbx)[] - (%rbp)[] with borrow, four words per iteration.
 * The borrow is carried in CF: leaq/movq leave flags alone and incq does
 * not modify CF.
 */
762.align	32
763.Lsqr8x_sub:
764	movq	0(%rbx),%r12
765	movq	8(%rbx),%r13
766	movq	16(%rbx),%r14
767	movq	24(%rbx),%r15
768	leaq	32(%rbx),%rbx
769	sbbq	0(%rbp),%r12
770	sbbq	8(%rbp),%r13
771	sbbq	16(%rbp),%r14
772	sbbq	24(%rbp),%r15
773	leaq	32(%rbp),%rbp
774	movq	%r12,0(%rdi)
775	movq	%r13,8(%rdi)
776	movq	%r14,16(%rdi)
777	movq	%r15,24(%rdi)
778	leaq	32(%rdi),%rdi
779	incq	%rcx
780	jnz	.Lsqr8x_sub
781
782	sbbq	$0,%rax	/* %rax (as left by the internal routine) minus final borrow -> select mask */
783	leaq	(%rbx,%r9,1),%rbx	/* rewind source pointer by adding %r9 */
784	leaq	(%rdi,%r9,1),%rdi	/* rewind destination pointer likewise */
785
786.byte	102,72,15,110,200	/* encodes movq %rax,%xmm1 */
787	pxor	%xmm0,%xmm0
788	pshufd	$0,%xmm1,%xmm1	/* broadcast low dword of the mask */
789	movq	40(%rsp),%rsi	/* caller's %rsp from the frame */
790	jmp	.Lsqr8x_cond_copy
791
/*
 * Branch-free selection, 32 bytes per iteration:
 * destination = (source & mask) | (destination & ~mask), where ~mask is
 * produced by comparing the mask against zero.  The source words, and the
 * words at offset %rdx from them, are overwritten with zero.
 */
792.align	32
793.Lsqr8x_cond_copy:
794	movdqa	0(%rbx),%xmm2
795	movdqa	16(%rbx),%xmm3
796	leaq	32(%rbx),%rbx
797	movdqu	0(%rdi),%xmm4
798	movdqu	16(%rdi),%xmm5
799	leaq	32(%rdi),%rdi
800	movdqa	%xmm0,-32(%rbx)
801	movdqa	%xmm0,-16(%rbx)
802	movdqa	%xmm0,-32(%rbx,%rdx,1)
803	movdqa	%xmm0,-16(%rbx,%rdx,1)
804	pcmpeqd	%xmm1,%xmm0	/* %xmm0 = all-ones in lanes where the mask is zero */
805	pand	%xmm1,%xmm2
806	pand	%xmm1,%xmm3
807	pand	%xmm0,%xmm4
808	pand	%xmm0,%xmm5
809	pxor	%xmm0,%xmm0	/* back to zero for the next iteration's wipes */
810	por	%xmm2,%xmm4
811	por	%xmm3,%xmm5
812	movdqu	%xmm4,-32(%rdi)
813	movdqu	%xmm5,-16(%rdi)
814	addq	$32,%r9	/* loop ends when %r9 reaches zero */
815	jnz	.Lsqr8x_cond_copy
816
817	movq	$1,%rax	/* return value */
818	movq	-48(%rsi),%r15	/* reload the six registers pushed on entry */
819	movq	-40(%rsi),%r14
820	movq	-32(%rsi),%r13
821	movq	-24(%rsi),%r12
822	movq	-16(%rsi),%rbp
823	movq	-8(%rsi),%rbx
824	leaq	(%rsi),%rsp
825.Lsqr8x_epilogue:
826	.byte	0xf3,0xc3	/* encodes "rep ret" */
827.size	bn_sqr8x_mont,.-bn_sqr8x_mont
/*
 * bn_mulx4x_mont -- file-local variant built on MULX with two independent
 * carry chains (ADCX uses CF, ADOX uses OF).  Reached through
 * .Lmulx4x_enter from .Lmul4x_enter when the capability test there passes.
 * Same register inputs as bn_mul_mont; returns 1 in %rax.
 *
 * Frame layout established below (offsets from the new %rsp):
 *    0  8*count (byte length)
 *    8  pointer to the next word of the second operand
 *   16  end pointer of the second operand
 *   24  the 64-bit constant loaded through %r8
 *   32  destination pointer
 *   40  caller's %rsp
 *   48  inner loop count, count/4 - 1
 *   64  start of the temporary vector
 *
 * In AT&T syntax "mulxq src,lo,hi" multiplies src by %rdx and writes the
 * low half to the second operand and the high half to the third.
 */
828.type	bn_mulx4x_mont,@function
829.align	32
830bn_mulx4x_mont:
831	movq	%rsp,%rax
832.Lmulx4x_enter:
833	pushq	%rbx
834	pushq	%rbp
835	pushq	%r12
836	pushq	%r13
837	pushq	%r14
838	pushq	%r15
839.Lmulx4x_prologue:
840
841	shll	$3,%r9d	/* %r9 = 8*count (bytes) */
842	xorq	%r10,%r10
843	subq	%r9,%r10	/* %r10 = -8*count */
844	movq	(%r8),%r8	/* load the 64-bit constant itself */
845	leaq	-72(%rsp,%r10,1),%rbp
846	andq	$-128,%rbp	/* 128-byte align the frame */
/* Lower %rsp to %rbp, reading one word per 4096-byte step on the way down. */
847	movq	%rsp,%r11
848	subq	%rbp,%r11
849	andq	$-4096,%r11
850	leaq	(%r11,%rbp,1),%rsp
851	movq	(%rsp),%r10
852	cmpq	%rbp,%rsp
853	ja	.Lmulx4x_page_walk
854	jmp	.Lmulx4x_page_walk_done
855
856.align	16
857.Lmulx4x_page_walk:
858	leaq	-4096(%rsp),%rsp
859	movq	(%rsp),%r10
860	cmpq	%rbp,%rsp
861	ja	.Lmulx4x_page_walk
862.Lmulx4x_page_walk_done:
863
864	leaq	(%rdx,%r9,1),%r10	/* end pointer of the second operand */
865
866
867
868
869
870
871
872
873
874
875
876
877	movq	%r9,0(%rsp)
878	shrq	$5,%r9	/* bytes/32 = count/4 */
879	movq	%r10,16(%rsp)
880	subq	$1,%r9
881	movq	%r8,24(%rsp)
882	movq	%rdi,32(%rsp)
883	movq	%rax,40(%rsp)
884	movq	%r9,48(%rsp)
885	jmp	.Lmulx4x_body
886
887.align	32
888.Lmulx4x_body:
889	leaq	8(%rdx),%rdi	/* pointer to the next word of the second operand */
890	movq	(%rdx),%rdx	/* %rdx = word 0 of second operand (implicit MULX multiplier) */
891	leaq	64+32(%rsp),%rbx	/* %rbx walks the temporary vector, biased by 32 bytes */
892	movq	%rdx,%r9	/* keep a copy; %rdx alternates between this and %r8 */
893
894	mulxq	0(%rsi),%r8,%rax
895	mulxq	8(%rsi),%r11,%r14
896	addq	%rax,%r11
897	movq	%rdi,8(%rsp)
898	mulxq	16(%rsi),%r12,%r13
899	adcq	%r14,%r12
900	adcq	$0,%r13
901
902	movq	%r8,%rdi
903	imulq	24(%rsp),%r8	/* %r8 = low product word * constant (low 64 bits) */
904	xorq	%rbp,%rbp	/* %rbp = 0; also clears CF and OF for the two carry chains */
905
906	mulxq	24(%rsi),%rax,%r14
907	movq	%r8,%rdx	/* switch multiplier to %r8 for the (%rcx) products */
908	leaq	32(%rsi),%rsi
909	adcxq	%rax,%r13
910	adcxq	%rbp,%r14
911
912	mulxq	0(%rcx),%rax,%r10
913	adcxq	%rax,%rdi	/* low word is not stored; only the carry is used */
914	adoxq	%r11,%r10
915	mulxq	8(%rcx),%rax,%r11
916	adcxq	%rax,%r10
917	adoxq	%r12,%r11
918.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00	/* encodes mulxq 16(%rcx),%rax,%r12 with a 32-bit displacement */
919	movq	48(%rsp),%rdi	/* inner loop count */
920	movq	%r10,-32(%rbx)
921	adcxq	%rax,%r11
922	adoxq	%r13,%r12
923	mulxq	24(%rcx),%rax,%r15
924	movq	%r9,%rdx	/* switch multiplier back to the second-operand word */
925	movq	%r11,-24(%rbx)
926	adcxq	%rax,%r12
927	adoxq	%rbp,%r15
928	leaq	32(%rcx),%rcx
929	movq	%r12,-16(%rbx)
930
931	jmp	.Lmulx4x_1st
932
/* First pass: four words per iteration, %rdi iterations. */
933.align	32
934.Lmulx4x_1st:
935	adcxq	%rbp,%r15
936	mulxq	0(%rsi),%r10,%rax
937	adcxq	%r14,%r10
938	mulxq	8(%rsi),%r11,%r14
939	adcxq	%rax,%r11
940	mulxq	16(%rsi),%r12,%rax
941	adcxq	%r14,%r12
942	mulxq	24(%rsi),%r13,%r14
943.byte	0x67,0x67	/* two 0x67 prefix bytes in front of the next instruction; presumably padding -- confirm */
944	movq	%r8,%rdx
945	adcxq	%rax,%r13
946	adcxq	%rbp,%r14
947	leaq	32(%rsi),%rsi
948	leaq	32(%rbx),%rbx
949
950	adoxq	%r15,%r10
951	mulxq	0(%rcx),%rax,%r15
952	adcxq	%rax,%r10
953	adoxq	%r15,%r11
954	mulxq	8(%rcx),%rax,%r15
955	adcxq	%rax,%r11
956	adoxq	%r15,%r12
957	mulxq	16(%rcx),%rax,%r15
958	movq	%r10,-40(%rbx)
959	adcxq	%rax,%r12
960	movq	%r11,-32(%rbx)
961	adoxq	%r15,%r13
962	mulxq	24(%rcx),%rax,%r15
963	movq	%r9,%rdx
964	movq	%r12,-24(%rbx)
965	adcxq	%rax,%r13
966	adoxq	%rbp,%r15
967	leaq	32(%rcx),%rcx
968	movq	%r13,-16(%rbx)
969
970	decq	%rdi	/* decq does not modify CF, so the ADCX chain survives the loop test */
971	jnz	.Lmulx4x_1st
972
973	movq	0(%rsp),%rax	/* byte length, used to rewind %rsi and %rcx */
974	movq	8(%rsp),%rdi	/* pointer to the next word of the second operand */
975	adcq	%rbp,%r15
976	addq	%r15,%r14
977	sbbq	%r15,%r15	/* %r15 = 0 or -1 according to the carry out of the top word */
978	movq	%r14,-8(%rbx)
979	jmp	.Lmulx4x_outer
980
/* One iteration per remaining word of the second operand. */
981.align	32
982.Lmulx4x_outer:
983	movq	(%rdi),%rdx	/* next word of second operand */
984	leaq	8(%rdi),%rdi
985	subq	%rax,%rsi	/* rewind first operand pointer */
986	movq	%r15,(%rbx)	/* store top carry (0 or -1) above the temporary vector */
987	leaq	64+32(%rsp),%rbx
988	subq	%rax,%rcx	/* rewind (%rcx) array pointer */
989
990	mulxq	0(%rsi),%r8,%r11
991	xorl	%ebp,%ebp	/* %rbp = 0; clears CF and OF */
992	movq	%rdx,%r9
993	mulxq	8(%rsi),%r14,%r12
994	adoxq	-32(%rbx),%r8
995	adcxq	%r14,%r11
996	mulxq	16(%rsi),%r15,%r13
997	adoxq	-24(%rbx),%r11
998	adcxq	%r15,%r12
999	adoxq	-16(%rbx),%r12
1000	adcxq	%rbp,%r13
1001	adoxq	%rbp,%r13
1002
1003	movq	%rdi,8(%rsp)
1004	movq	%r8,%r15
1005	imulq	24(%rsp),%r8
1006	xorl	%ebp,%ebp	/* restart both carry chains */
1007
1008	mulxq	24(%rsi),%rax,%r14
1009	movq	%r8,%rdx
1010	adcxq	%rax,%r13
1011	adoxq	-8(%rbx),%r13
1012	adcxq	%rbp,%r14
1013	leaq	32(%rsi),%rsi
1014	adoxq	%rbp,%r14
1015
1016	mulxq	0(%rcx),%rax,%r10
1017	adcxq	%rax,%r15	/* low word is not stored; only the carry is used */
1018	adoxq	%r11,%r10
1019	mulxq	8(%rcx),%rax,%r11
1020	adcxq	%rax,%r10
1021	adoxq	%r12,%r11
1022	mulxq	16(%rcx),%rax,%r12
1023	movq	%r10,-32(%rbx)
1024	adcxq	%rax,%r11
1025	adoxq	%r13,%r12
1026	mulxq	24(%rcx),%rax,%r15
1027	movq	%r9,%rdx
1028	movq	%r11,-24(%rbx)
1029	leaq	32(%rcx),%rcx
1030	adcxq	%rax,%r12
1031	adoxq	%rbp,%r15
1032	movq	48(%rsp),%rdi	/* inner loop count */
1033	movq	%r12,-16(%rbx)
1034
1035	jmp	.Lmulx4x_inner
1036
1037.align	32
1038.Lmulx4x_inner:
1039	mulxq	0(%rsi),%r10,%rax
1040	adcxq	%rbp,%r15
1041	adoxq	%r14,%r10
1042	mulxq	8(%rsi),%r11,%r14
1043	adcxq	0(%rbx),%r10
1044	adoxq	%rax,%r11
1045	mulxq	16(%rsi),%r12,%rax
1046	adcxq	8(%rbx),%r11
1047	adoxq	%r14,%r12
1048	mulxq	24(%rsi),%r13,%r14
1049	movq	%r8,%rdx
1050	adcxq	16(%rbx),%r12
1051	adoxq	%rax,%r13
1052	adcxq	24(%rbx),%r13
1053	adoxq	%rbp,%r14
1054	leaq	32(%rsi),%rsi
1055	leaq	32(%rbx),%rbx
1056	adcxq	%rbp,%r14
1057
1058	adoxq	%r15,%r10
1059	mulxq	0(%rcx),%rax,%r15
1060	adcxq	%rax,%r10
1061	adoxq	%r15,%r11
1062	mulxq	8(%rcx),%rax,%r15
1063	adcxq	%rax,%r11
1064	adoxq	%r15,%r12
1065	mulxq	16(%rcx),%rax,%r15
1066	movq	%r10,-40(%rbx)
1067	adcxq	%rax,%r12
1068	adoxq	%r15,%r13
1069	mulxq	24(%rcx),%rax,%r15
1070	movq	%r9,%rdx
1071	movq	%r11,-32(%rbx)
1072	movq	%r12,-24(%rbx)
1073	adcxq	%rax,%r13
1074	adoxq	%rbp,%r15
1075	leaq	32(%rcx),%rcx
1076	movq	%r13,-16(%rbx)
1077
1078	decq	%rdi
1079	jnz	.Lmulx4x_inner
1080
1081	movq	0(%rsp),%rax
1082	movq	8(%rsp),%rdi
1083	adcq	%rbp,%r15
1084	subq	0(%rbx),%rbp	/* 0 - stored carry (0 or -1): sets CF when the stored carry was -1 */
1085	adcq	%r15,%r14
1086	sbbq	%r15,%r15	/* new top carry as 0 or -1 */
1087	movq	%r14,-8(%rbx)
1088
1089	cmpq	16(%rsp),%rdi	/* reached the end of the second operand? */
1090	jne	.Lmulx4x_outer
1091
1092	leaq	64(%rsp),%rbx	/* start of the temporary vector */
1093	subq	%rax,%rcx	/* rewind (%rcx) array pointer */
1094	negq	%r15	/* top carry as 0 or 1 */
1095	movq	%rax,%rdx	/* byte length, counted down in the copy loop */
1096	shrq	$3+2,%rax	/* bytes/32 = count/4 iterations */
1097	movq	32(%rsp),%rdi	/* destination pointer from the frame */
1098	jmp	.Lmulx4x_sub
1099
/*
 * destination = temporary - (%rcx)[] with borrow, four words per iteration.
 * The borrow is carried in CF: leaq/movq leave flags alone and decq does
 * not modify CF.
 */
1100.align	32
1101.Lmulx4x_sub:
1102	movq	0(%rbx),%r11
1103	movq	8(%rbx),%r12
1104	movq	16(%rbx),%r13
1105	movq	24(%rbx),%r14
1106	leaq	32(%rbx),%rbx
1107	sbbq	0(%rcx),%r11
1108	sbbq	8(%rcx),%r12
1109	sbbq	16(%rcx),%r13
1110	sbbq	24(%rcx),%r14
1111	leaq	32(%rcx),%rcx
1112	movq	%r11,0(%rdi)
1113	movq	%r12,8(%rdi)
1114	movq	%r13,16(%rdi)
1115	movq	%r14,24(%rdi)
1116	leaq	32(%rdi),%rdi
1117	decq	%rax
1118	jnz	.Lmulx4x_sub
1119
1120	sbbq	$0,%r15	/* top carry minus final borrow -> select mask (expected 0 or all-ones) */
1121	leaq	64(%rsp),%rbx
1122	subq	%rdx,%rdi	/* rewind destination pointer */
1123
1124.byte	102,73,15,110,207	/* encodes movq %r15,%xmm1 */
1125	pxor	%xmm0,%xmm0
1126	pshufd	$0,%xmm1,%xmm1	/* broadcast low dword of the mask */
1127	movq	40(%rsp),%rsi	/* caller's %rsp from the frame */
1128	jmp	.Lmulx4x_cond_copy
1129
/*
 * Branch-free selection, 32 bytes per iteration:
 * destination = (temporary & mask) | (destination & ~mask); temporary words
 * are overwritten with zero as they are consumed.
 */
1130.align	32
1131.Lmulx4x_cond_copy:
1132	movdqa	0(%rbx),%xmm2
1133	movdqa	16(%rbx),%xmm3
1134	leaq	32(%rbx),%rbx
1135	movdqu	0(%rdi),%xmm4
1136	movdqu	16(%rdi),%xmm5
1137	leaq	32(%rdi),%rdi
1138	movdqa	%xmm0,-32(%rbx)
1139	movdqa	%xmm0,-16(%rbx)
1140	pcmpeqd	%xmm1,%xmm0	/* %xmm0 = all-ones in lanes where the mask is zero */
1141	pand	%xmm1,%xmm2
1142	pand	%xmm1,%xmm3
1143	pand	%xmm0,%xmm4
1144	pand	%xmm0,%xmm5
1145	pxor	%xmm0,%xmm0	/* back to zero for the next iteration's wipes */
1146	por	%xmm2,%xmm4
1147	por	%xmm3,%xmm5
1148	movdqu	%xmm4,-32(%rdi)
1149	movdqu	%xmm5,-16(%rdi)
1150	subq	$32,%rdx
1151	jnz	.Lmulx4x_cond_copy
1152
1153	movq	%rdx,(%rbx)	/* %rdx is zero here: clears the carry word above the temporary vector */
1154
1155	movq	$1,%rax	/* return value */
1156	movq	-48(%rsi),%r15	/* reload the six registers pushed on entry */
1157	movq	-40(%rsi),%r14
1158	movq	-32(%rsi),%r13
1159	movq	-24(%rsi),%r12
1160	movq	-16(%rsi),%rbp
1161	movq	-8(%rsi),%rbx
1162	leaq	(%rsi),%rsp
1163.Lmulx4x_epilogue:
1164	.byte	0xf3,0xc3	/* encodes "rep ret" */
1165.size	bn_mulx4x_mont,.-bn_mulx4x_mont
/* NUL-terminated ASCII: "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro@openssl.org>" */
1166.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1167.align	16
1168