/* x86_64-mont.S revision 305153 */
/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/x86_64-mont.S 305153 2016-08-31 20:33:59Z jkim $ */
/* Do not modify. This file is auto-generated from x86_64-mont.pl. */
.text

/*
 * bn_mul_mont -- Montgomery multiplication, entry point and generic
 * one-word-at-a-time path.  AT&T syntax.
 *
 * Register use on entry, as read by the code below:
 *   %rdi  destination, receives %r9 64-bit words (.Lsub / .Lcopy)
 *   %rsi  first multiplicand, indexed (%rsi,%r15,8)
 *   %rdx  second multiplicand, kept in %r12 and indexed by %r14
 *   %rcx  modulus, indexed (%rcx,%r15,8)
 *   %r8   pointer to a single word; that word is multiplied into the
 *         low limb of every round to obtain the reduction multiplier
 *   %r9d  number of 64-bit words
 * NOTE(review): this matches OpenSSL's
 *   bn_mul_mont(rp, ap, bp, np, n0, num)
 * under the System V AMD64 convention -- confirm against the C caller.
 *
 * Returns 1 in %rax.  %rbx, %rbp and %r12-%r15 are saved and restored.
 * The temporary vector of num+2 words lives on the stack and is
 * overwritten before returning.
 */
.globl	bn_mul_mont
.type	bn_mul_mont,@function
.align	16
bn_mul_mont:
	/*
	 * Dispatch on the word count:
	 *   num % 4 != 0 or num < 8      -> generic code below
	 *   %rdx != %rsi                 -> .Lmul4x_enter
	 *   %rdx == %rsi and num % 8 == 0 -> .Lsqr8x_enter
	 *   otherwise                    -> .Lmul4x_enter
	 * %r11d carries the third word of OPENSSL_ia32cap_P into
	 * .Lmul4x_enter, which tests it.
	 */
	testl	$3,%r9d
	jnz	.Lmul_enter
	cmpl	$8,%r9d
	jb	.Lmul_enter
	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpq	%rsi,%rdx
	jne	.Lmul4x_enter
	testl	$7,%r9d
	jz	.Lsqr8x_enter
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	/* Zero-extend num, then carve num+2 words below %rsp and round
	 * the new stack pointer down to a 1 KiB boundary. */
	movl	%r9d,%r9d
	leaq	2(%r9),%r10
	movq	%rsp,%r11
	negq	%r10
	leaq	(%rsp,%r10,8),%rsp
	andq	$-1024,%rsp

	/* tp[num+1] = stack pointer as it was right after the pushes. */
	movq	%r11,8(%rsp,%r9,8)
.Lmul_body:

	/*
	 * Read one word in every 4 KiB step from the old stack top down
	 * to the new %rsp (the loop ends when the offset in %r11 borrows).
	 * Presumably this makes the OS grow the stack page by page.
	 */
	subq	%rsp,%r11
	andq	$-4096,%r11
.Lmul_page_walk:
	movq	(%rsp,%r11,1),%r10
	subq	$4096,%r11
	/* raw prefix bytes (0x66, 0x2e) emitted in front of the jnc */
.byte	0x66,0x2e
	jnc	.Lmul_page_walk

	/* mulq clobbers %rdx, so the b pointer moves to %r12; %r8 is
	 * replaced by the word it pointed to. */
	movq	%rdx,%r12
	movq	(%r8),%r8
	movq	(%r12),%rbx
	movq	(%rsi),%rax

	/* %r14 = outer index i, %r15 = inner index j */
	xorq	%r14,%r14
	xorq	%r15,%r15

	/* First round (i = 0): %rbp = low(a[0]*b[0]) * n0-word. */
	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp
	movq	%rdx,%r11

	/* The low word of this sum is dropped; only its carry is kept. */
	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.L1st_enter

.align	16
.L1st:
	/* tp[j-2] = modulus product + multiplicand product + carry */
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.L1st_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	1(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.L1st

	/* Tail of the first round: store tp[num-2], tp[num-1] and the
	 * carry word tp[num]. */
	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13
	movq	%r10,%r11

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	jmp	.Louter
.align	16
.Louter:
	/* Rounds i = 1 .. num-1: same as the first round, but each
	 * product is also accumulated with the previous tp[] contents. */
	movq	(%r12,%r14,8),%rbx
	xorq	%r15,%r15
	movq	%r8,%rbp
	movq	(%rsp),%r10
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.Linner_enter

.align	16
.Linner:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.Linner_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.Linner

	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	/* Fold the previous top word tp[num] (in %r10) into the new top. */
	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	cmpq	%r9,%r14
	jb	.Louter

	/*
	 * Final reduction: write tp - modulus to the destination.  The xorq
	 * leaves CF clear for the first sbbq; leaq and decq keep CF intact
	 * between iterations.
	 */
	xorq	%r14,%r14
	movq	(%rsp),%rax
	leaq	(%rsp),%rsi
	movq	%r9,%r15
	jmp	.Lsub
.align	16
.Lsub:	sbbq	(%rcx,%r14,8),%rax
	movq	%rax,(%rdi,%r14,8)
	movq	8(%rsi,%r14,8),%rax
	leaq	1(%r14),%r14
	decq	%r15
	jnz	.Lsub

	/*
	 * %rax = tp[num] - borrow, used as a mask:
	 *   all ones -> the subtraction went negative, keep tp
	 *   zero     -> keep the difference already stored at %rdi
	 * %rsi becomes the address of whichever vector is kept.
	 */
	sbbq	$0,%rax
	xorq	%r14,%r14
	andq	%rax,%rsi
	notq	%rax
	movq	%rdi,%rcx
	andq	%rax,%rcx
	movq	%r9,%r15
	orq	%rcx,%rsi
.align	16
.Lcopy:
	/* Copy the kept vector to the destination and overwrite each
	 * temporary word with the loop index. */
	movq	(%rsi,%r14,8),%rax
	movq	%r14,(%rsp,%r14,8)
	movq	%rax,(%rdi,%r14,8)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	.Lcopy

	/* Reload the stack pointer saved in tp[num+1], return 1, and
	 * restore the six pushed registers. */
	movq	8(%rsp,%r9,8),%rsi
	movq	$1,%rax
	movq	(%rsi),%r15
	movq	8(%rsi),%r14
	movq	16(%rsi),%r13
	movq	24(%rsi),%r12
	movq	32(%rsi),%rbp
	movq	40(%rsi),%rbx
	leaq	48(%rsi),%rsp
.Lmul_epilogue:
	/* 0xf3,0xc3 encodes "rep ret" */
	.byte	0xf3,0xc3
.size	bn_mul_mont,.-bn_mul_mont
/*
 * bn_mul4x_mont -- Montgomery multiplication, inner loops unrolled four
 * words per iteration.  Not exported; reached through .Lmul4x_enter from
 * bn_mul_mont with the same register arguments (%rdi destination, %rsi and
 * %rdx multiplicands, %rcx modulus, %r8 pointer to the n0 word, %r9d word
 * count) and with %r11d holding the third word of OPENSSL_ia32cap_P.
 *
 * If both bits of mask 0x80100 are set in %r11d control passes on to
 * .Lmulx4x_enter (presumably the BMI2 and ADX feature bits -- that path
 * uses mulx/adcx/adox).
 *
 * Stack frame: num+4 words.  tp[0..num] is the running product,
 * tp[num+1] the saved stack pointer and tp[num+2] the saved %rdi, because
 * %rdi doubles as an accumulator inside the loops.
 * Returns 1 in %rax; the temporary vector is zeroed before returning.
 */
.type	bn_mul4x_mont,@function
.align	16
bn_mul4x_mont:
.Lmul4x_enter:
	andl	$0x80100,%r11d
	cmpl	$0x80100,%r11d
	je	.Lmulx4x_enter
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	/* Zero-extend num, reserve num+4 words, align down to 1 KiB. */
	movl	%r9d,%r9d
	leaq	4(%r9),%r10
	movq	%rsp,%r11
	negq	%r10
	leaq	(%rsp,%r10,8),%rsp
	andq	$-1024,%rsp

	/* tp[num+1] = stack pointer as it was right after the pushes. */
	movq	%r11,8(%rsp,%r9,8)
.Lmul4x_body:
	/* Read one word per 4 KiB step from the old stack top down to
	 * the new %rsp. */
	subq	%rsp,%r11
	andq	$-4096,%r11
.Lmul4x_page_walk:
	movq	(%rsp,%r11,1),%r10
	subq	$4096,%r11
	/* raw prefix byte (0x2e) emitted in front of the jnc */
.byte	0x2e
	jnc	.Lmul4x_page_walk

	/* Park the destination pointer; mulq clobbers %rdx so the b
	 * pointer moves to %r12; %r8 becomes the n0 word itself. */
	movq	%rdi,16(%rsp,%r9,8)
	movq	%rdx,%r12
	movq	(%r8),%r8
	movq	(%r12),%rbx
	movq	(%rsi),%rax

	/* %r14 = outer index i, %r15 = inner index j */
	xorq	%r14,%r14
	xorq	%r15,%r15

	/* First round (i = 0): %rbp = low(a[0]*b[0]) * n0-word. */
	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp
	movq	%rdx,%r11

	/* Low word of this sum is dropped, only the carry survives. */
	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdi,(%rsp)
	movq	%rdx,%r13
	jmp	.L1st4x
.align	16
.L1st4x:
	/* Four (a[j]*b[0], modulus[j]*m) pairs per iteration; the
	 * carries alternate between %r10/%r11 and %r13/%rdi. */
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	4(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	.L1st4x

	/* Tail of the first round: the last two word pairs, then the
	 * top word and the carry word tp[num]. */
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)

	leaq	1(%r14),%r14
.align	4
.Louter4x:
	/* Rounds i = 1 .. num-1: as above, with every product also
	 * accumulated with the previous contents of tp[]. */
	movq	(%r12,%r14,8),%rbx
	xorq	%r15,%r15
	movq	(%rsp),%r10
	movq	%r8,%rbp
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%rsp),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdi,(%rsp)
	movq	%rdx,%r13
	jmp	.Linner4x
.align	16
.Linner4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	4(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	.Linner4x

	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	1(%r14),%r14
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	/* New top word = carries + previous tp[num]; new tp[num] = the
	 * carry out of that sum. */
	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%rsp,%r9,8),%r13
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)

	cmpq	%r9,%r14
	jb	.Louter4x

	/*
	 * Final reduction: destination = tp - modulus, four words per
	 * iteration.  %rdi is reloaded from the frame, %r9 temporarily
	 * becomes num/4, %xmm0 is zeroed for the wipe further down.  The
	 * subq starts the borrow chain; leaq and decq keep CF intact.
	 */
	movq	16(%rsp,%r9,8),%rdi
	movq	0(%rsp),%rax
	pxor	%xmm0,%xmm0
	movq	8(%rsp),%rdx
	shrq	$2,%r9
	leaq	(%rsp),%rsi
	xorq	%r14,%r14

	subq	0(%rcx),%rax
	movq	16(%rsi),%rbx
	movq	24(%rsi),%rbp
	sbbq	8(%rcx),%rdx
	leaq	-1(%r9),%r15
	jmp	.Lsub4x
.align	16
.Lsub4x:
	movq	%rax,0(%rdi,%r14,8)
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	16(%rcx,%r14,8),%rbx
	movq	32(%rsi,%r14,8),%rax
	movq	40(%rsi,%r14,8),%rdx
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)
	movq	%rbp,24(%rdi,%r14,8)
	sbbq	32(%rcx,%r14,8),%rax
	movq	48(%rsi,%r14,8),%rbx
	movq	56(%rsi,%r14,8),%rbp
	sbbq	40(%rcx,%r14,8),%rdx
	leaq	4(%r14),%r14
	decq	%r15
	jnz	.Lsub4x

	movq	%rax,0(%rdi,%r14,8)
	movq	32(%rsi,%r14,8),%rax
	sbbq	16(%rcx,%r14,8),%rbx
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)

	/*
	 * %rax = tp[num] - borrow, used as a mask:
	 *   all ones -> the subtraction went negative, keep tp
	 *   zero     -> keep the difference already stored at %rdi
	 * %rsi becomes the address of whichever vector is kept.
	 */
	sbbq	$0,%rax
	movq	%rbp,24(%rdi,%r14,8)
	xorq	%r14,%r14
	andq	%rax,%rsi
	notq	%rax
	movq	%rdi,%rcx
	andq	%rax,%rcx
	leaq	-1(%r9),%r15
	orq	%rcx,%rsi

	/* Copy the kept vector to the destination 16 bytes at a time and
	 * zero the temporary vector as we go (%r14 is a byte offset here). */
	movdqu	(%rsi),%xmm1
	movdqa	%xmm0,(%rsp)
	movdqu	%xmm1,(%rdi)
	jmp	.Lcopy4x
.align	16
.Lcopy4x:
	movdqu	16(%rsi,%r14,1),%xmm2
	movdqu	32(%rsi,%r14,1),%xmm1
	movdqa	%xmm0,16(%rsp,%r14,1)
	movdqu	%xmm2,16(%rdi,%r14,1)
	movdqa	%xmm0,32(%rsp,%r14,1)
	movdqu	%xmm1,32(%rdi,%r14,1)
	leaq	32(%r14),%r14
	decq	%r15
	jnz	.Lcopy4x

	/* Restore num in %r9, move the last 16 bytes, then reload the
	 * saved stack pointer from tp[num+1] and return 1. */
	shlq	$2,%r9
	movdqu	16(%rsi,%r14,1),%xmm2
	movdqa	%xmm0,16(%rsp,%r14,1)
	movdqu	%xmm2,16(%rdi,%r14,1)
	movq	8(%rsp,%r9,8),%rsi
	movq	$1,%rax
	movq	(%rsi),%r15
	movq	8(%rsi),%r14
	movq	16(%rsi),%r13
	movq	24(%rsi),%r12
	movq	32(%rsi),%rbp
	movq	40(%rsi),%rbx
	leaq	48(%rsi),%rsp
.Lmul4x_epilogue:
	/* 0xf3,0xc3 encodes "rep ret" */
	.byte	0xf3,0xc3
.size	bn_mul4x_mont,.-bn_mul4x_mont



/*
 * bn_sqr8x_mont -- Montgomery squaring wrapper.  Not exported; reached
 * through .Lsqr8x_enter from bn_mul_mont when both multiplicand pointers
 * are equal and the word count is a multiple of 8.
 *
 * The squaring and reduction are done by bn_sqr8x_internal or
 * bn_sqrx8x_internal (neither is defined in this file); this wrapper
 * builds the stack frame, picks the callee, then performs the final
 * subtraction of the modulus and a masked copy into the destination.
 *
 * Frame: 32(%rsp) = n0 word, 40(%rsp) = %rsp as it was on entry.
 * Values parked in SSE registers around the call:
 *   %xmm1 = %rdi (destination), %xmm2 = %rcx, %xmm3 = %r10 (-8*num).
 * Returns 1 in %rax.
 */
.type	bn_sqr8x_mont,@function
.align	32
bn_sqr8x_mont:
.Lsqr8x_enter:
	movq	%rsp,%rax
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	/* %r9 = -8*num (negated byte length), %r10 = 32*num */
	movl	%r9d,%r10d
	shll	$3,%r9d
	shlq	$3+2,%r10
	negq	%r9

	/*
	 * Frame size is 64 + 2*8*num bytes.  %r11 = (candidate frame
	 * address - %rsi) mod 4096; the frame is then pushed further down
	 * by an amount derived from %r11.  Presumably this keeps the frame
	 * from aliasing the input at %rsi modulo the page size.
	 */
	leaq	-64(%rsp,%r9,2),%r11
	movq	(%r8),%r8
	subq	%rsi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lsqr8x_sp_alt
	subq	%r11,%rsp
	leaq	-64(%rsp,%r9,2),%rsp
	jmp	.Lsqr8x_sp_done

.align	32
.Lsqr8x_sp_alt:
	leaq	4096-64(,%r9,2),%r10
	leaq	-64(%rsp,%r9,2),%rsp
	subq	%r10,%r11
	/* movq rather than xor: the borrow from subq must survive for
	 * the cmovc that clamps %r11 to zero. */
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rsp
.Lsqr8x_sp_done:
	andq	$-64,%rsp
	/* Read one word per 4 KiB step from the entry %rsp down to the
	 * new %rsp. */
	movq	%rax,%r11
	subq	%rsp,%r11
	andq	$-4096,%r11
.Lsqr8x_page_walk:
	movq	(%rsp,%r11,1),%r10
	subq	$4096,%r11
	/* raw prefix byte (0x2e) emitted in front of the jnc */
.byte	0x2e
	jnc	.Lsqr8x_page_walk

	/* %r10 = -8*num, %r9 = +8*num */
	movq	%r9,%r10
	negq	%r9

	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)
.Lsqr8x_body:

	/* movq %rcx,%xmm2 */
.byte	102,72,15,110,209
	pxor	%xmm0,%xmm0
	/* movq %rdi,%xmm1 */
.byte	102,72,15,110,207
	/* movq %r10,%xmm3 */
.byte	102,73,15,110,218
	/* Take the bn_sqrx8x_internal path only when both bits of mask
	 * 0x80100 are set in the third word of OPENSSL_ia32cap_P. */
	movl	OPENSSL_ia32cap_P+8(%rip),%eax
	andl	$0x80100,%eax
	cmpl	$0x80100,%eax
	jne	.Lsqr8x_nox

	call	bn_sqrx8x_internal

	/* The callee's register outputs (%r8, %rcx, %rax, %rbp) are used
	 * as-is; their meaning is defined by bn_sqrx8x_internal.  The
	 * loops below count %rcx and %r9 up to zero, so both are negative
	 * here. */
	leaq	(%r8,%rcx,1),%rbx
	movq	%rcx,%r9
	movq	%rcx,%rdx
	/* movq %xmm1,%rdi -- reload the destination pointer */
.byte	102,72,15,126,207
	sarq	$3+2,%rcx
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_nox:
	call	bn_sqr8x_internal

	/* As above, with the callee's outputs in %rdi and %r9. */
	leaq	(%rdi,%r9,1),%rbx
	movq	%r9,%rcx
	movq	%r9,%rdx
	/* movq %xmm1,%rdi -- reload the destination pointer */
.byte	102,72,15,126,207
	sarq	$3+2,%rcx
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_sub:
	/* destination = (%rbx) - (%rbp), four words per iteration.  The
	 * borrow chain relies on CF as left by the sarq above; leaq and
	 * incq keep CF intact between iterations. */
	movq	0(%rbx),%r12
	movq	8(%rbx),%r13
	movq	16(%rbx),%r14
	movq	24(%rbx),%r15
	leaq	32(%rbx),%rbx
	sbbq	0(%rbp),%r12
	sbbq	8(%rbp),%r13
	sbbq	16(%rbp),%r14
	sbbq	24(%rbp),%r15
	leaq	32(%rbp),%rbp
	movq	%r12,0(%rdi)
	movq	%r13,8(%rdi)
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi
	incq	%rcx
	jnz	.Lsqr8x_sub

	/* %rax (left by the callee) minus the final borrow becomes the
	 * select mask; both pointers are rewound by %r9. */
	sbbq	$0,%rax
	leaq	(%rbx,%r9,1),%rbx
	leaq	(%rdi,%r9,1),%rdi

	/* movq %rax,%xmm1, then broadcast its low dword to all lanes */
.byte	102,72,15,110,200
	pxor	%xmm0,%xmm0
	pshufd	$0,%xmm1,%xmm1
	movq	40(%rsp),%rsi
	jmp	.Lsqr8x_cond_copy

.align	32
.Lsqr8x_cond_copy:
	/*
	 * Branch-free select, 32 bytes per iteration:
	 *   destination = (temporary & mask) | (destination & ~mask)
	 * with mask = %xmm1 and ~mask built in %xmm0 by pcmpeqd against
	 * zero.  The temporary words at %rbx and at %rbx+%rdx are zeroed
	 * on the way.
	 */
	movdqa	0(%rbx),%xmm2
	movdqa	16(%rbx),%xmm3
	leaq	32(%rbx),%rbx
	movdqu	0(%rdi),%xmm4
	movdqu	16(%rdi),%xmm5
	leaq	32(%rdi),%rdi
	movdqa	%xmm0,-32(%rbx)
	movdqa	%xmm0,-16(%rbx)
	movdqa	%xmm0,-32(%rbx,%rdx,1)
	movdqa	%xmm0,-16(%rbx,%rdx,1)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-32(%rdi)
	movdqu	%xmm5,-16(%rdi)
	addq	$32,%r9
	jnz	.Lsqr8x_cond_copy

	/* Return 1; %rsi holds the entry %rsp, the pushed registers sit
	 * just below it. */
	movq	$1,%rax
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lsqr8x_epilogue:
	/* 0xf3,0xc3 encodes "rep ret" */
	.byte	0xf3,0xc3
.size	bn_sqr8x_mont,.-bn_sqr8x_mont
/*
 * bn_mulx4x_mont -- Montgomery multiplication using mulx/adcx/adox, four
 * words per inner iteration.  Not exported; reached through
 * .Lmulx4x_enter from bn_mul4x_mont with the original register arguments
 * (%rdi destination, %rsi and %rdx multiplicands, %rcx modulus, %r8
 * pointer to the n0 word, %r9d word count).
 *
 * adcx carries through CF only and adox through OF only, so two carry
 * chains run interleaved; "xor %ebp,%ebp" clears both flags and leaves
 * %rbp = 0 as the zero addend that terminates a chain.
 *
 * Frame layout (all relative to the new %rsp):
 *    0  byte length of the operands (8*num)
 *    8  pointer to the next word of the second multiplicand
 *   16  end of the second multiplicand
 *   24  n0 word
 *   32  destination pointer
 *   40  %rsp as it was on entry
 *   48  inner-loop count, num/4 - 1
 *   64  temporary product vector
 *
 * Fix relative to the generated original (CVE-2016-7055): in
 * .Lmulx4x_outer the OF chain was closed on %r12 with "adox %rbp,%r12"
 * and the flags were then cleared, so a carry out of %r12 was lost before
 * -16(%rbx) was added in a fresh chain.  The previous word is now added
 * inside the first chain and both carries are absorbed by %r13 before the
 * flags are cleared.  This file is generated from x86_64-mont.pl; the
 * generator needs the same change or it will be lost on regeneration.
 *
 * Returns 1 in %rax.
 */
.type	bn_mulx4x_mont,@function
.align	32
bn_mulx4x_mont:
.Lmulx4x_enter:
	movq	%rsp,%rax
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	/* %r9 = 8*num; reserve 72 + 8*num bytes, align down to 128. */
	shll	$3,%r9d
	/* raw 0x67 prefix emitted in front of the xorq */
.byte	0x67
	xorq	%r10,%r10
	subq	%r9,%r10
	movq	(%r8),%r8
	leaq	-72(%rsp,%r10,1),%rsp
	andq	$-128,%rsp
	/* Read one word per 4 KiB step from the entry %rsp down to the
	 * new %rsp. */
	movq	%rax,%r11
	subq	%rsp,%r11
	andq	$-4096,%r11
.Lmulx4x_page_walk:
	movq	(%rsp,%r11,1),%r10
	subq	$4096,%r11
	/* raw prefix bytes (0x66, 0x2e) emitted in front of the jnc */
.byte	0x66,0x2e
	jnc	.Lmulx4x_page_walk

	/* %r10 = end of the second multiplicand */
	leaq	(%rdx,%r9,1),%r10

	/* Fill in the frame described in the header. */
	movq	%r9,0(%rsp)
	shrq	$5,%r9
	movq	%r10,16(%rsp)
	subq	$1,%r9
	movq	%r8,24(%rsp)
	movq	%rdi,32(%rsp)
	movq	%rax,40(%rsp)
	movq	%r9,48(%rsp)
	jmp	.Lmulx4x_body

.align	32
.Lmulx4x_body:
	/* First round: %rdx = %r9 = b[0], %rdi = address of b[1],
	 * %rbx = temporary vector + 32. */
	leaq	8(%rdx),%rdi
	movq	(%rdx),%rdx
	leaq	64+32(%rsp),%rbx
	movq	%rdx,%r9

	mulxq	0(%rsi),%r8,%rax
	mulxq	8(%rsi),%r11,%r14
	addq	%rax,%r11
	movq	%rdi,8(%rsp)
	mulxq	16(%rsi),%r12,%r13
	adcq	%r14,%r12
	adcq	$0,%r13

	/* %r8 = low word * n0 word (the reduction multiplier); the
	 * original low word is kept in %rdi.  xorq clears CF and OF. */
	movq	%r8,%rdi
	imulq	24(%rsp),%r8
	xorq	%rbp,%rbp

	mulxq	24(%rsi),%rax,%r14
	movq	%r8,%rdx
	leaq	32(%rsi),%rsi
	adcxq	%rax,%r13
	adcxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%rdi
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	/* hand-encoded: mulxq 16(%rcx),%rax,%r12 */
.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00
	movq	48(%rsp),%rdi
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r11,-24(%rbx)
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r12,-16(%rbx)

	jmp	.Lmulx4x_1st

.align	32
.Lmulx4x_1st:
	/* Four words of a[]*b[0] (CF chain), then four words of
	 * modulus[]*m (both chains); %rdi counts iterations. */
	adcxq	%rbp,%r15
	mulxq	0(%rsi),%r10,%rax
	adcxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
	/* raw 0x67,0x67 prefixes emitted in front of the movq */
.byte	0x67,0x67
	movq	%r8,%rdx
	adcxq	%rax,%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-32(%rbx)
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_1st

	/* Close the first round: %rax = byte length, %rdi = address of
	 * the next b word, %r15 = 0 or -1 for the top-most carry. */
	movq	0(%rsp),%rax
	movq	8(%rsp),%rdi
	adcq	%rbp,%r15
	addq	%r15,%r14
	sbbq	%r15,%r15
	movq	%r14,-8(%rbx)
	jmp	.Lmulx4x_outer

.align	32
.Lmulx4x_outer:
	/* Load b[i], advance the b pointer, rewind the a and modulus
	 * pointers by the byte length and store the previous top-most
	 * carry after the temporary vector. */
	movq	(%rdi),%rdx
	leaq	8(%rdi),%rdi
	subq	%rax,%rsi
	movq	%r15,(%rbx)
	leaq	64+32(%rsp),%rbx
	subq	%rax,%rcx

	/*
	 * a[0..2]*b[i] + t[0..2]: products on the CF chain, previous
	 * temporary words on the OF chain.  Both chains end in %r13
	 * before imulq and the xorl below clobber the flags.
	 */
	mulxq	0(%rsi),%r8,%r11
	xorl	%ebp,%ebp
	movq	%rdx,%r9
	mulxq	8(%rsi),%r14,%r12
	adoxq	-32(%rbx),%r8
	adcxq	%r14,%r11
	mulxq	16(%rsi),%r15,%r13
	adoxq	-24(%rbx),%r11
	adcxq	%r15,%r12
	adoxq	-16(%rbx),%r12
	adcxq	%rbp,%r13
	adoxq	%rbp,%r13

	/* Save the b pointer; %r8 = low word * n0 word, the original low
	 * word stays in %r15. */
	movq	%rdi,8(%rsp)
	movq	%r8,%r15
	imulq	24(%rsp),%r8
	xorl	%ebp,%ebp

	mulxq	24(%rsi),%rax,%r14
	movq	%r8,%rdx
	adcxq	%rax,%r13
	adoxq	-8(%rbx),%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	adoxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r11,-24(%rbx)
	leaq	32(%rcx),%rcx
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	movq	48(%rsp),%rdi
	movq	%r12,-16(%rbx)

	jmp	.Lmulx4x_inner

.align	32
.Lmulx4x_inner:
	/* As .Lmulx4x_1st, with the previous temporary words added on
	 * the CF chain. */
	mulxq	0(%rsi),%r10,%rax
	adcxq	%rbp,%r15
	adoxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	0(%rbx),%r10
	adoxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	8(%rbx),%r11
	adoxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
	movq	%r8,%rdx
	adcxq	16(%rbx),%r12
	adoxq	%rax,%r13
	adcxq	24(%rbx),%r13
	adoxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx
	adcxq	%rbp,%r14

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r11,-32(%rbx)
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_inner

	/* Close the round.  %rbp is zero, so "subq 0(%rbx),%rbp" sets CF
	 * exactly when the stored top-most carry word is non-zero; the
	 * adcq then folds it in and sbbq produces the new one. */
	movq	0(%rsp),%rax
	movq	8(%rsp),%rdi
	adcq	%rbp,%r15
	subq	0(%rbx),%rbp
	adcq	%r15,%r14
	sbbq	%r15,%r15
	movq	%r14,-8(%rbx)

	cmpq	16(%rsp),%rdi
	jne	.Lmulx4x_outer

	/* Final reduction.  %r15 becomes the top-most carry as 0 or 1,
	 * %rdx the byte length, %rax the number of 32-byte groups, %rdi
	 * the destination.  shrq leaves CF = the last bit shifted out. */
	leaq	64(%rsp),%rbx
	subq	%rax,%rcx
	negq	%r15
	movq	%rax,%rdx
	shrq	$3+2,%rax
	movq	32(%rsp),%rdi
	jmp	.Lmulx4x_sub

.align	32
.Lmulx4x_sub:
	/* destination = temporary - modulus, four words per iteration;
	 * leaq and decq keep CF intact between iterations. */
	movq	0(%rbx),%r11
	movq	8(%rbx),%r12
	movq	16(%rbx),%r13
	movq	24(%rbx),%r14
	leaq	32(%rbx),%rbx
	sbbq	0(%rcx),%r11
	sbbq	8(%rcx),%r12
	sbbq	16(%rcx),%r13
	sbbq	24(%rcx),%r14
	leaq	32(%rcx),%rcx
	movq	%r11,0(%rdi)
	movq	%r12,8(%rdi)
	movq	%r13,16(%rdi)
	movq	%r14,24(%rdi)
	leaq	32(%rdi),%rdi
	decq	%rax
	jnz	.Lmulx4x_sub

	/* %r15 = top-most carry - borrow: all ones means the temporary
	 * is the result, zero means the difference is. */
	sbbq	$0,%r15
	leaq	64(%rsp),%rbx
	subq	%rdx,%rdi

	/* movq %r15,%xmm1, then broadcast its low dword to all lanes */
.byte	102,73,15,110,207
	pxor	%xmm0,%xmm0
	pshufd	$0,%xmm1,%xmm1
	movq	40(%rsp),%rsi
	jmp	.Lmulx4x_cond_copy

.align	32
.Lmulx4x_cond_copy:
	/*
	 * Branch-free select, 32 bytes per iteration:
	 *   destination = (temporary & mask) | (destination & ~mask)
	 * with mask = %xmm1 and ~mask built in %xmm0 by pcmpeqd against
	 * zero.  The temporary vector is zeroed on the way.
	 */
	movdqa	0(%rbx),%xmm2
	movdqa	16(%rbx),%xmm3
	leaq	32(%rbx),%rbx
	movdqu	0(%rdi),%xmm4
	movdqu	16(%rdi),%xmm5
	leaq	32(%rdi),%rdi
	movdqa	%xmm0,-32(%rbx)
	movdqa	%xmm0,-16(%rbx)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-32(%rdi)
	movdqu	%xmm5,-16(%rdi)
	subq	$32,%rdx
	jnz	.Lmulx4x_cond_copy

	/* %rdx is zero here: clear the carry word after the vector. */
	movq	%rdx,(%rbx)

	/* Return 1; %rsi holds the entry %rsp, the pushed registers sit
	 * just below it. */
	movq	$1,%rax
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lmulx4x_epilogue:
	/* 0xf3,0xc3 encodes "rep ret" */
	.byte	0xf3,0xc3
.size	bn_mulx4x_mont,.-bn_mulx4x_mont
/* ASCII: "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro@openssl.org>" */
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	16
1135