x86_64-mont5.S revision 1.7
1#include <machine/asm.h>
2.text
3
4
5
6.globl	bn_mul_mont_gather5
7.type	bn_mul_mont_gather5,@function
8.align	64
9bn_mul_mont_gather5:
10.cfi_startproc
11	movl	%r9d,%r9d
12	movq	%rsp,%rax
13.cfi_def_cfa_register	%rax
14	testl	$7,%r9d
15	jnz	.Lmul_enter
16	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
17	jmp	.Lmul4x_enter
18
19.align	16
20.Lmul_enter:
21	movd	8(%rsp),%xmm5
22	pushq	%rbx
23.cfi_offset	%rbx,-16
24	pushq	%rbp
25.cfi_offset	%rbp,-24
26	pushq	%r12
27.cfi_offset	%r12,-32
28	pushq	%r13
29.cfi_offset	%r13,-40
30	pushq	%r14
31.cfi_offset	%r14,-48
32	pushq	%r15
33.cfi_offset	%r15,-56
34
35	negq	%r9
36	movq	%rsp,%r11
37	leaq	-280(%rsp,%r9,8),%r10
38	negq	%r9
39	andq	$-1024,%r10
40
41
42
43
44
45
46
47
48
49	subq	%r10,%r11
50	andq	$-4096,%r11
51	leaq	(%r10,%r11,1),%rsp
52	movq	(%rsp),%r11
53	cmpq	%r10,%rsp
54	ja	.Lmul_page_walk
55	jmp	.Lmul_page_walk_done
56
57.Lmul_page_walk:
58	leaq	-4096(%rsp),%rsp
59	movq	(%rsp),%r11
60	cmpq	%r10,%rsp
61	ja	.Lmul_page_walk
62.Lmul_page_walk_done:
63
64	leaq	.Linc(%rip),%r10
65	movq	%rax,8(%rsp,%r9,8)
66.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
67.Lmul_body:
68
69	leaq	128(%rdx),%r12
70	movdqa	0(%r10),%xmm0
71	movdqa	16(%r10),%xmm1
72	leaq	24-112(%rsp,%r9,8),%r10
73	andq	$-16,%r10
74
75	pshufd	$0,%xmm5,%xmm5
76	movdqa	%xmm1,%xmm4
77	movdqa	%xmm1,%xmm2
78	paddd	%xmm0,%xmm1
79	pcmpeqd	%xmm5,%xmm0
80.byte	0x67
81	movdqa	%xmm4,%xmm3
82	paddd	%xmm1,%xmm2
83	pcmpeqd	%xmm5,%xmm1
84	movdqa	%xmm0,112(%r10)
85	movdqa	%xmm4,%xmm0
86
87	paddd	%xmm2,%xmm3
88	pcmpeqd	%xmm5,%xmm2
89	movdqa	%xmm1,128(%r10)
90	movdqa	%xmm4,%xmm1
91
92	paddd	%xmm3,%xmm0
93	pcmpeqd	%xmm5,%xmm3
94	movdqa	%xmm2,144(%r10)
95	movdqa	%xmm4,%xmm2
96
97	paddd	%xmm0,%xmm1
98	pcmpeqd	%xmm5,%xmm0
99	movdqa	%xmm3,160(%r10)
100	movdqa	%xmm4,%xmm3
101	paddd	%xmm1,%xmm2
102	pcmpeqd	%xmm5,%xmm1
103	movdqa	%xmm0,176(%r10)
104	movdqa	%xmm4,%xmm0
105
106	paddd	%xmm2,%xmm3
107	pcmpeqd	%xmm5,%xmm2
108	movdqa	%xmm1,192(%r10)
109	movdqa	%xmm4,%xmm1
110
111	paddd	%xmm3,%xmm0
112	pcmpeqd	%xmm5,%xmm3
113	movdqa	%xmm2,208(%r10)
114	movdqa	%xmm4,%xmm2
115
116	paddd	%xmm0,%xmm1
117	pcmpeqd	%xmm5,%xmm0
118	movdqa	%xmm3,224(%r10)
119	movdqa	%xmm4,%xmm3
120	paddd	%xmm1,%xmm2
121	pcmpeqd	%xmm5,%xmm1
122	movdqa	%xmm0,240(%r10)
123	movdqa	%xmm4,%xmm0
124
125	paddd	%xmm2,%xmm3
126	pcmpeqd	%xmm5,%xmm2
127	movdqa	%xmm1,256(%r10)
128	movdqa	%xmm4,%xmm1
129
130	paddd	%xmm3,%xmm0
131	pcmpeqd	%xmm5,%xmm3
132	movdqa	%xmm2,272(%r10)
133	movdqa	%xmm4,%xmm2
134
135	paddd	%xmm0,%xmm1
136	pcmpeqd	%xmm5,%xmm0
137	movdqa	%xmm3,288(%r10)
138	movdqa	%xmm4,%xmm3
139	paddd	%xmm1,%xmm2
140	pcmpeqd	%xmm5,%xmm1
141	movdqa	%xmm0,304(%r10)
142
143	paddd	%xmm2,%xmm3
144.byte	0x67
145	pcmpeqd	%xmm5,%xmm2
146	movdqa	%xmm1,320(%r10)
147
148	pcmpeqd	%xmm5,%xmm3
149	movdqa	%xmm2,336(%r10)
150	pand	64(%r12),%xmm0
151
152	pand	80(%r12),%xmm1
153	pand	96(%r12),%xmm2
154	movdqa	%xmm3,352(%r10)
155	pand	112(%r12),%xmm3
156	por	%xmm2,%xmm0
157	por	%xmm3,%xmm1
158	movdqa	-128(%r12),%xmm4
159	movdqa	-112(%r12),%xmm5
160	movdqa	-96(%r12),%xmm2
161	pand	112(%r10),%xmm4
162	movdqa	-80(%r12),%xmm3
163	pand	128(%r10),%xmm5
164	por	%xmm4,%xmm0
165	pand	144(%r10),%xmm2
166	por	%xmm5,%xmm1
167	pand	160(%r10),%xmm3
168	por	%xmm2,%xmm0
169	por	%xmm3,%xmm1
170	movdqa	-64(%r12),%xmm4
171	movdqa	-48(%r12),%xmm5
172	movdqa	-32(%r12),%xmm2
173	pand	176(%r10),%xmm4
174	movdqa	-16(%r12),%xmm3
175	pand	192(%r10),%xmm5
176	por	%xmm4,%xmm0
177	pand	208(%r10),%xmm2
178	por	%xmm5,%xmm1
179	pand	224(%r10),%xmm3
180	por	%xmm2,%xmm0
181	por	%xmm3,%xmm1
182	movdqa	0(%r12),%xmm4
183	movdqa	16(%r12),%xmm5
184	movdqa	32(%r12),%xmm2
185	pand	240(%r10),%xmm4
186	movdqa	48(%r12),%xmm3
187	pand	256(%r10),%xmm5
188	por	%xmm4,%xmm0
189	pand	272(%r10),%xmm2
190	por	%xmm5,%xmm1
191	pand	288(%r10),%xmm3
192	por	%xmm2,%xmm0
193	por	%xmm3,%xmm1
194	por	%xmm1,%xmm0
195	pshufd	$0x4e,%xmm0,%xmm1
196	por	%xmm1,%xmm0
197	leaq	256(%r12),%r12
198.byte	102,72,15,126,195
199
200	movq	(%r8),%r8
201	movq	(%rsi),%rax
202
203	xorq	%r14,%r14
204	xorq	%r15,%r15
205
206	movq	%r8,%rbp
207	mulq	%rbx
208	movq	%rax,%r10
209	movq	(%rcx),%rax
210
211	imulq	%r10,%rbp
212	movq	%rdx,%r11
213
214	mulq	%rbp
215	addq	%rax,%r10
216	movq	8(%rsi),%rax
217	adcq	$0,%rdx
218	movq	%rdx,%r13
219
220	leaq	1(%r15),%r15
221	jmp	.L1st_enter
222
223.align	16
224.L1st:
225	addq	%rax,%r13
226	movq	(%rsi,%r15,8),%rax
227	adcq	$0,%rdx
228	addq	%r11,%r13
229	movq	%r10,%r11
230	adcq	$0,%rdx
231	movq	%r13,-16(%rsp,%r15,8)
232	movq	%rdx,%r13
233
234.L1st_enter:
235	mulq	%rbx
236	addq	%rax,%r11
237	movq	(%rcx,%r15,8),%rax
238	adcq	$0,%rdx
239	leaq	1(%r15),%r15
240	movq	%rdx,%r10
241
242	mulq	%rbp
243	cmpq	%r9,%r15
244	jne	.L1st
245
246
247	addq	%rax,%r13
248	adcq	$0,%rdx
249	addq	%r11,%r13
250	adcq	$0,%rdx
251	movq	%r13,-16(%rsp,%r9,8)
252	movq	%rdx,%r13
253	movq	%r10,%r11
254
255	xorq	%rdx,%rdx
256	addq	%r11,%r13
257	adcq	$0,%rdx
258	movq	%r13,-8(%rsp,%r9,8)
259	movq	%rdx,(%rsp,%r9,8)
260
261	leaq	1(%r14),%r14
262	jmp	.Louter
263.align	16
264.Louter:
265	leaq	24+128(%rsp,%r9,8),%rdx
266	andq	$-16,%rdx
267	pxor	%xmm4,%xmm4
268	pxor	%xmm5,%xmm5
269	movdqa	-128(%r12),%xmm0
270	movdqa	-112(%r12),%xmm1
271	movdqa	-96(%r12),%xmm2
272	movdqa	-80(%r12),%xmm3
273	pand	-128(%rdx),%xmm0
274	pand	-112(%rdx),%xmm1
275	por	%xmm0,%xmm4
276	pand	-96(%rdx),%xmm2
277	por	%xmm1,%xmm5
278	pand	-80(%rdx),%xmm3
279	por	%xmm2,%xmm4
280	por	%xmm3,%xmm5
281	movdqa	-64(%r12),%xmm0
282	movdqa	-48(%r12),%xmm1
283	movdqa	-32(%r12),%xmm2
284	movdqa	-16(%r12),%xmm3
285	pand	-64(%rdx),%xmm0
286	pand	-48(%rdx),%xmm1
287	por	%xmm0,%xmm4
288	pand	-32(%rdx),%xmm2
289	por	%xmm1,%xmm5
290	pand	-16(%rdx),%xmm3
291	por	%xmm2,%xmm4
292	por	%xmm3,%xmm5
293	movdqa	0(%r12),%xmm0
294	movdqa	16(%r12),%xmm1
295	movdqa	32(%r12),%xmm2
296	movdqa	48(%r12),%xmm3
297	pand	0(%rdx),%xmm0
298	pand	16(%rdx),%xmm1
299	por	%xmm0,%xmm4
300	pand	32(%rdx),%xmm2
301	por	%xmm1,%xmm5
302	pand	48(%rdx),%xmm3
303	por	%xmm2,%xmm4
304	por	%xmm3,%xmm5
305	movdqa	64(%r12),%xmm0
306	movdqa	80(%r12),%xmm1
307	movdqa	96(%r12),%xmm2
308	movdqa	112(%r12),%xmm3
309	pand	64(%rdx),%xmm0
310	pand	80(%rdx),%xmm1
311	por	%xmm0,%xmm4
312	pand	96(%rdx),%xmm2
313	por	%xmm1,%xmm5
314	pand	112(%rdx),%xmm3
315	por	%xmm2,%xmm4
316	por	%xmm3,%xmm5
317	por	%xmm5,%xmm4
318	pshufd	$0x4e,%xmm4,%xmm0
319	por	%xmm4,%xmm0
320	leaq	256(%r12),%r12
321
322	movq	(%rsi),%rax
323.byte	102,72,15,126,195
324
325	xorq	%r15,%r15
326	movq	%r8,%rbp
327	movq	(%rsp),%r10
328
329	mulq	%rbx
330	addq	%rax,%r10
331	movq	(%rcx),%rax
332	adcq	$0,%rdx
333
334	imulq	%r10,%rbp
335	movq	%rdx,%r11
336
337	mulq	%rbp
338	addq	%rax,%r10
339	movq	8(%rsi),%rax
340	adcq	$0,%rdx
341	movq	8(%rsp),%r10
342	movq	%rdx,%r13
343
344	leaq	1(%r15),%r15
345	jmp	.Linner_enter
346
347.align	16
348.Linner:
349	addq	%rax,%r13
350	movq	(%rsi,%r15,8),%rax
351	adcq	$0,%rdx
352	addq	%r10,%r13
353	movq	(%rsp,%r15,8),%r10
354	adcq	$0,%rdx
355	movq	%r13,-16(%rsp,%r15,8)
356	movq	%rdx,%r13
357
358.Linner_enter:
359	mulq	%rbx
360	addq	%rax,%r11
361	movq	(%rcx,%r15,8),%rax
362	adcq	$0,%rdx
363	addq	%r11,%r10
364	movq	%rdx,%r11
365	adcq	$0,%r11
366	leaq	1(%r15),%r15
367
368	mulq	%rbp
369	cmpq	%r9,%r15
370	jne	.Linner
371
372	addq	%rax,%r13
373	adcq	$0,%rdx
374	addq	%r10,%r13
375	movq	(%rsp,%r9,8),%r10
376	adcq	$0,%rdx
377	movq	%r13,-16(%rsp,%r9,8)
378	movq	%rdx,%r13
379
380	xorq	%rdx,%rdx
381	addq	%r11,%r13
382	adcq	$0,%rdx
383	addq	%r10,%r13
384	adcq	$0,%rdx
385	movq	%r13,-8(%rsp,%r9,8)
386	movq	%rdx,(%rsp,%r9,8)
387
388	leaq	1(%r14),%r14
389	cmpq	%r9,%r14
390	jb	.Louter
391
392	xorq	%r14,%r14
393	movq	(%rsp),%rax
394	leaq	(%rsp),%rsi
395	movq	%r9,%r15
396	jmp	.Lsub
397.align	16
398.Lsub:	sbbq	(%rcx,%r14,8),%rax
399	movq	%rax,(%rdi,%r14,8)
400	movq	8(%rsi,%r14,8),%rax
401	leaq	1(%r14),%r14
402	decq	%r15
403	jnz	.Lsub
404
405	sbbq	$0,%rax
406	movq	$-1,%rbx
407	xorq	%rax,%rbx
408	xorq	%r14,%r14
409	movq	%r9,%r15
410
411.Lcopy:
412	movq	(%rdi,%r14,8),%rcx
413	movq	(%rsp,%r14,8),%rdx
414	andq	%rbx,%rcx
415	andq	%rax,%rdx
416	movq	%r14,(%rsp,%r14,8)
417	orq	%rcx,%rdx
418	movq	%rdx,(%rdi,%r14,8)
419	leaq	1(%r14),%r14
420	subq	$1,%r15
421	jnz	.Lcopy
422
423	movq	8(%rsp,%r9,8),%rsi
424.cfi_def_cfa	%rsi,8
425	movq	$1,%rax
426
427	movq	-48(%rsi),%r15
428.cfi_restore	%r15
429	movq	-40(%rsi),%r14
430.cfi_restore	%r14
431	movq	-32(%rsi),%r13
432.cfi_restore	%r13
433	movq	-24(%rsi),%r12
434.cfi_restore	%r12
435	movq	-16(%rsi),%rbp
436.cfi_restore	%rbp
437	movq	-8(%rsi),%rbx
438.cfi_restore	%rbx
439	leaq	(%rsi),%rsp
440.cfi_def_cfa_register	%rsp
441.Lmul_epilogue:
442	.byte	0xf3,0xc3
443.cfi_endproc
444.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
445.type	bn_mul4x_mont_gather5,@function
446.align	32
447bn_mul4x_mont_gather5:
448.cfi_startproc
449.byte	0x67
450	movq	%rsp,%rax
451.cfi_def_cfa_register	%rax
452.Lmul4x_enter:
453	andl	$0x80108,%r11d
454	cmpl	$0x80108,%r11d
455	je	.Lmulx4x_enter
456	pushq	%rbx
457.cfi_offset	%rbx,-16
458	pushq	%rbp
459.cfi_offset	%rbp,-24
460	pushq	%r12
461.cfi_offset	%r12,-32
462	pushq	%r13
463.cfi_offset	%r13,-40
464	pushq	%r14
465.cfi_offset	%r14,-48
466	pushq	%r15
467.cfi_offset	%r15,-56
468.Lmul4x_prologue:
469
470.byte	0x67
471	shll	$3,%r9d
472	leaq	(%r9,%r9,2),%r10
473	negq	%r9
474
475
476
477
478
479
480
481
482
483
484	leaq	-320(%rsp,%r9,2),%r11
485	movq	%rsp,%rbp
486	subq	%rdi,%r11
487	andq	$4095,%r11
488	cmpq	%r11,%r10
489	jb	.Lmul4xsp_alt
490	subq	%r11,%rbp
491	leaq	-320(%rbp,%r9,2),%rbp
492	jmp	.Lmul4xsp_done
493
494.align	32
495.Lmul4xsp_alt:
496	leaq	4096-320(,%r9,2),%r10
497	leaq	-320(%rbp,%r9,2),%rbp
498	subq	%r10,%r11
499	movq	$0,%r10
500	cmovcq	%r10,%r11
501	subq	%r11,%rbp
502.Lmul4xsp_done:
503	andq	$-64,%rbp
504	movq	%rsp,%r11
505	subq	%rbp,%r11
506	andq	$-4096,%r11
507	leaq	(%r11,%rbp,1),%rsp
508	movq	(%rsp),%r10
509	cmpq	%rbp,%rsp
510	ja	.Lmul4x_page_walk
511	jmp	.Lmul4x_page_walk_done
512
513.Lmul4x_page_walk:
514	leaq	-4096(%rsp),%rsp
515	movq	(%rsp),%r10
516	cmpq	%rbp,%rsp
517	ja	.Lmul4x_page_walk
518.Lmul4x_page_walk_done:
519
520	negq	%r9
521
522	movq	%rax,40(%rsp)
523.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
524.Lmul4x_body:
525
526	call	mul4x_internal
527
528	movq	40(%rsp),%rsi
529.cfi_def_cfa	%rsi,8
530	movq	$1,%rax
531
532	movq	-48(%rsi),%r15
533.cfi_restore	%r15
534	movq	-40(%rsi),%r14
535.cfi_restore	%r14
536	movq	-32(%rsi),%r13
537.cfi_restore	%r13
538	movq	-24(%rsi),%r12
539.cfi_restore	%r12
540	movq	-16(%rsi),%rbp
541.cfi_restore	%rbp
542	movq	-8(%rsi),%rbx
543.cfi_restore	%rbx
544	leaq	(%rsi),%rsp
545.cfi_def_cfa_register	%rsp
546.Lmul4x_epilogue:
547	.byte	0xf3,0xc3
548.cfi_endproc
549.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
550
551.type	mul4x_internal,@function
552.align	32
553mul4x_internal:
554	shlq	$5,%r9
555	movd	8(%rax),%xmm5
556	leaq	.Linc(%rip),%rax
557	leaq	128(%rdx,%r9,1),%r13
558	shrq	$5,%r9
559	movdqa	0(%rax),%xmm0
560	movdqa	16(%rax),%xmm1
561	leaq	88-112(%rsp,%r9,1),%r10
562	leaq	128(%rdx),%r12
563
564	pshufd	$0,%xmm5,%xmm5
565	movdqa	%xmm1,%xmm4
566.byte	0x67,0x67
567	movdqa	%xmm1,%xmm2
568	paddd	%xmm0,%xmm1
569	pcmpeqd	%xmm5,%xmm0
570.byte	0x67
571	movdqa	%xmm4,%xmm3
572	paddd	%xmm1,%xmm2
573	pcmpeqd	%xmm5,%xmm1
574	movdqa	%xmm0,112(%r10)
575	movdqa	%xmm4,%xmm0
576
577	paddd	%xmm2,%xmm3
578	pcmpeqd	%xmm5,%xmm2
579	movdqa	%xmm1,128(%r10)
580	movdqa	%xmm4,%xmm1
581
582	paddd	%xmm3,%xmm0
583	pcmpeqd	%xmm5,%xmm3
584	movdqa	%xmm2,144(%r10)
585	movdqa	%xmm4,%xmm2
586
587	paddd	%xmm0,%xmm1
588	pcmpeqd	%xmm5,%xmm0
589	movdqa	%xmm3,160(%r10)
590	movdqa	%xmm4,%xmm3
591	paddd	%xmm1,%xmm2
592	pcmpeqd	%xmm5,%xmm1
593	movdqa	%xmm0,176(%r10)
594	movdqa	%xmm4,%xmm0
595
596	paddd	%xmm2,%xmm3
597	pcmpeqd	%xmm5,%xmm2
598	movdqa	%xmm1,192(%r10)
599	movdqa	%xmm4,%xmm1
600
601	paddd	%xmm3,%xmm0
602	pcmpeqd	%xmm5,%xmm3
603	movdqa	%xmm2,208(%r10)
604	movdqa	%xmm4,%xmm2
605
606	paddd	%xmm0,%xmm1
607	pcmpeqd	%xmm5,%xmm0
608	movdqa	%xmm3,224(%r10)
609	movdqa	%xmm4,%xmm3
610	paddd	%xmm1,%xmm2
611	pcmpeqd	%xmm5,%xmm1
612	movdqa	%xmm0,240(%r10)
613	movdqa	%xmm4,%xmm0
614
615	paddd	%xmm2,%xmm3
616	pcmpeqd	%xmm5,%xmm2
617	movdqa	%xmm1,256(%r10)
618	movdqa	%xmm4,%xmm1
619
620	paddd	%xmm3,%xmm0
621	pcmpeqd	%xmm5,%xmm3
622	movdqa	%xmm2,272(%r10)
623	movdqa	%xmm4,%xmm2
624
625	paddd	%xmm0,%xmm1
626	pcmpeqd	%xmm5,%xmm0
627	movdqa	%xmm3,288(%r10)
628	movdqa	%xmm4,%xmm3
629	paddd	%xmm1,%xmm2
630	pcmpeqd	%xmm5,%xmm1
631	movdqa	%xmm0,304(%r10)
632
633	paddd	%xmm2,%xmm3
634.byte	0x67
635	pcmpeqd	%xmm5,%xmm2
636	movdqa	%xmm1,320(%r10)
637
638	pcmpeqd	%xmm5,%xmm3
639	movdqa	%xmm2,336(%r10)
640	pand	64(%r12),%xmm0
641
642	pand	80(%r12),%xmm1
643	pand	96(%r12),%xmm2
644	movdqa	%xmm3,352(%r10)
645	pand	112(%r12),%xmm3
646	por	%xmm2,%xmm0
647	por	%xmm3,%xmm1
648	movdqa	-128(%r12),%xmm4
649	movdqa	-112(%r12),%xmm5
650	movdqa	-96(%r12),%xmm2
651	pand	112(%r10),%xmm4
652	movdqa	-80(%r12),%xmm3
653	pand	128(%r10),%xmm5
654	por	%xmm4,%xmm0
655	pand	144(%r10),%xmm2
656	por	%xmm5,%xmm1
657	pand	160(%r10),%xmm3
658	por	%xmm2,%xmm0
659	por	%xmm3,%xmm1
660	movdqa	-64(%r12),%xmm4
661	movdqa	-48(%r12),%xmm5
662	movdqa	-32(%r12),%xmm2
663	pand	176(%r10),%xmm4
664	movdqa	-16(%r12),%xmm3
665	pand	192(%r10),%xmm5
666	por	%xmm4,%xmm0
667	pand	208(%r10),%xmm2
668	por	%xmm5,%xmm1
669	pand	224(%r10),%xmm3
670	por	%xmm2,%xmm0
671	por	%xmm3,%xmm1
672	movdqa	0(%r12),%xmm4
673	movdqa	16(%r12),%xmm5
674	movdqa	32(%r12),%xmm2
675	pand	240(%r10),%xmm4
676	movdqa	48(%r12),%xmm3
677	pand	256(%r10),%xmm5
678	por	%xmm4,%xmm0
679	pand	272(%r10),%xmm2
680	por	%xmm5,%xmm1
681	pand	288(%r10),%xmm3
682	por	%xmm2,%xmm0
683	por	%xmm3,%xmm1
684	por	%xmm1,%xmm0
685	pshufd	$0x4e,%xmm0,%xmm1
686	por	%xmm1,%xmm0
687	leaq	256(%r12),%r12
688.byte	102,72,15,126,195
689
690	movq	%r13,16+8(%rsp)
691	movq	%rdi,56+8(%rsp)
692
693	movq	(%r8),%r8
694	movq	(%rsi),%rax
695	leaq	(%rsi,%r9,1),%rsi
696	negq	%r9
697
698	movq	%r8,%rbp
699	mulq	%rbx
700	movq	%rax,%r10
701	movq	(%rcx),%rax
702
703	imulq	%r10,%rbp
704	leaq	64+8(%rsp),%r14
705	movq	%rdx,%r11
706
707	mulq	%rbp
708	addq	%rax,%r10
709	movq	8(%rsi,%r9,1),%rax
710	adcq	$0,%rdx
711	movq	%rdx,%rdi
712
713	mulq	%rbx
714	addq	%rax,%r11
715	movq	8(%rcx),%rax
716	adcq	$0,%rdx
717	movq	%rdx,%r10
718
719	mulq	%rbp
720	addq	%rax,%rdi
721	movq	16(%rsi,%r9,1),%rax
722	adcq	$0,%rdx
723	addq	%r11,%rdi
724	leaq	32(%r9),%r15
725	leaq	32(%rcx),%rcx
726	adcq	$0,%rdx
727	movq	%rdi,(%r14)
728	movq	%rdx,%r13
729	jmp	.L1st4x
730
731.align	32
732.L1st4x:
733	mulq	%rbx
734	addq	%rax,%r10
735	movq	-16(%rcx),%rax
736	leaq	32(%r14),%r14
737	adcq	$0,%rdx
738	movq	%rdx,%r11
739
740	mulq	%rbp
741	addq	%rax,%r13
742	movq	-8(%rsi,%r15,1),%rax
743	adcq	$0,%rdx
744	addq	%r10,%r13
745	adcq	$0,%rdx
746	movq	%r13,-24(%r14)
747	movq	%rdx,%rdi
748
749	mulq	%rbx
750	addq	%rax,%r11
751	movq	-8(%rcx),%rax
752	adcq	$0,%rdx
753	movq	%rdx,%r10
754
755	mulq	%rbp
756	addq	%rax,%rdi
757	movq	(%rsi,%r15,1),%rax
758	adcq	$0,%rdx
759	addq	%r11,%rdi
760	adcq	$0,%rdx
761	movq	%rdi,-16(%r14)
762	movq	%rdx,%r13
763
764	mulq	%rbx
765	addq	%rax,%r10
766	movq	0(%rcx),%rax
767	adcq	$0,%rdx
768	movq	%rdx,%r11
769
770	mulq	%rbp
771	addq	%rax,%r13
772	movq	8(%rsi,%r15,1),%rax
773	adcq	$0,%rdx
774	addq	%r10,%r13
775	adcq	$0,%rdx
776	movq	%r13,-8(%r14)
777	movq	%rdx,%rdi
778
779	mulq	%rbx
780	addq	%rax,%r11
781	movq	8(%rcx),%rax
782	adcq	$0,%rdx
783	movq	%rdx,%r10
784
785	mulq	%rbp
786	addq	%rax,%rdi
787	movq	16(%rsi,%r15,1),%rax
788	adcq	$0,%rdx
789	addq	%r11,%rdi
790	leaq	32(%rcx),%rcx
791	adcq	$0,%rdx
792	movq	%rdi,(%r14)
793	movq	%rdx,%r13
794
795	addq	$32,%r15
796	jnz	.L1st4x
797
798	mulq	%rbx
799	addq	%rax,%r10
800	movq	-16(%rcx),%rax
801	leaq	32(%r14),%r14
802	adcq	$0,%rdx
803	movq	%rdx,%r11
804
805	mulq	%rbp
806	addq	%rax,%r13
807	movq	-8(%rsi),%rax
808	adcq	$0,%rdx
809	addq	%r10,%r13
810	adcq	$0,%rdx
811	movq	%r13,-24(%r14)
812	movq	%rdx,%rdi
813
814	mulq	%rbx
815	addq	%rax,%r11
816	movq	-8(%rcx),%rax
817	adcq	$0,%rdx
818	movq	%rdx,%r10
819
820	mulq	%rbp
821	addq	%rax,%rdi
822	movq	(%rsi,%r9,1),%rax
823	adcq	$0,%rdx
824	addq	%r11,%rdi
825	adcq	$0,%rdx
826	movq	%rdi,-16(%r14)
827	movq	%rdx,%r13
828
829	leaq	(%rcx,%r9,1),%rcx
830
831	xorq	%rdi,%rdi
832	addq	%r10,%r13
833	adcq	$0,%rdi
834	movq	%r13,-8(%r14)
835
836	jmp	.Louter4x
837
838.align	32
839.Louter4x:
840	leaq	16+128(%r14),%rdx
841	pxor	%xmm4,%xmm4
842	pxor	%xmm5,%xmm5
843	movdqa	-128(%r12),%xmm0
844	movdqa	-112(%r12),%xmm1
845	movdqa	-96(%r12),%xmm2
846	movdqa	-80(%r12),%xmm3
847	pand	-128(%rdx),%xmm0
848	pand	-112(%rdx),%xmm1
849	por	%xmm0,%xmm4
850	pand	-96(%rdx),%xmm2
851	por	%xmm1,%xmm5
852	pand	-80(%rdx),%xmm3
853	por	%xmm2,%xmm4
854	por	%xmm3,%xmm5
855	movdqa	-64(%r12),%xmm0
856	movdqa	-48(%r12),%xmm1
857	movdqa	-32(%r12),%xmm2
858	movdqa	-16(%r12),%xmm3
859	pand	-64(%rdx),%xmm0
860	pand	-48(%rdx),%xmm1
861	por	%xmm0,%xmm4
862	pand	-32(%rdx),%xmm2
863	por	%xmm1,%xmm5
864	pand	-16(%rdx),%xmm3
865	por	%xmm2,%xmm4
866	por	%xmm3,%xmm5
867	movdqa	0(%r12),%xmm0
868	movdqa	16(%r12),%xmm1
869	movdqa	32(%r12),%xmm2
870	movdqa	48(%r12),%xmm3
871	pand	0(%rdx),%xmm0
872	pand	16(%rdx),%xmm1
873	por	%xmm0,%xmm4
874	pand	32(%rdx),%xmm2
875	por	%xmm1,%xmm5
876	pand	48(%rdx),%xmm3
877	por	%xmm2,%xmm4
878	por	%xmm3,%xmm5
879	movdqa	64(%r12),%xmm0
880	movdqa	80(%r12),%xmm1
881	movdqa	96(%r12),%xmm2
882	movdqa	112(%r12),%xmm3
883	pand	64(%rdx),%xmm0
884	pand	80(%rdx),%xmm1
885	por	%xmm0,%xmm4
886	pand	96(%rdx),%xmm2
887	por	%xmm1,%xmm5
888	pand	112(%rdx),%xmm3
889	por	%xmm2,%xmm4
890	por	%xmm3,%xmm5
891	por	%xmm5,%xmm4
892	pshufd	$0x4e,%xmm4,%xmm0
893	por	%xmm4,%xmm0
894	leaq	256(%r12),%r12
895.byte	102,72,15,126,195
896
897	movq	(%r14,%r9,1),%r10
898	movq	%r8,%rbp
899	mulq	%rbx
900	addq	%rax,%r10
901	movq	(%rcx),%rax
902	adcq	$0,%rdx
903
904	imulq	%r10,%rbp
905	movq	%rdx,%r11
906	movq	%rdi,(%r14)
907
908	leaq	(%r14,%r9,1),%r14
909
910	mulq	%rbp
911	addq	%rax,%r10
912	movq	8(%rsi,%r9,1),%rax
913	adcq	$0,%rdx
914	movq	%rdx,%rdi
915
916	mulq	%rbx
917	addq	%rax,%r11
918	movq	8(%rcx),%rax
919	adcq	$0,%rdx
920	addq	8(%r14),%r11
921	adcq	$0,%rdx
922	movq	%rdx,%r10
923
924	mulq	%rbp
925	addq	%rax,%rdi
926	movq	16(%rsi,%r9,1),%rax
927	adcq	$0,%rdx
928	addq	%r11,%rdi
929	leaq	32(%r9),%r15
930	leaq	32(%rcx),%rcx
931	adcq	$0,%rdx
932	movq	%rdx,%r13
933	jmp	.Linner4x
934
935.align	32
936.Linner4x:
937	mulq	%rbx
938	addq	%rax,%r10
939	movq	-16(%rcx),%rax
940	adcq	$0,%rdx
941	addq	16(%r14),%r10
942	leaq	32(%r14),%r14
943	adcq	$0,%rdx
944	movq	%rdx,%r11
945
946	mulq	%rbp
947	addq	%rax,%r13
948	movq	-8(%rsi,%r15,1),%rax
949	adcq	$0,%rdx
950	addq	%r10,%r13
951	adcq	$0,%rdx
952	movq	%rdi,-32(%r14)
953	movq	%rdx,%rdi
954
955	mulq	%rbx
956	addq	%rax,%r11
957	movq	-8(%rcx),%rax
958	adcq	$0,%rdx
959	addq	-8(%r14),%r11
960	adcq	$0,%rdx
961	movq	%rdx,%r10
962
963	mulq	%rbp
964	addq	%rax,%rdi
965	movq	(%rsi,%r15,1),%rax
966	adcq	$0,%rdx
967	addq	%r11,%rdi
968	adcq	$0,%rdx
969	movq	%r13,-24(%r14)
970	movq	%rdx,%r13
971
972	mulq	%rbx
973	addq	%rax,%r10
974	movq	0(%rcx),%rax
975	adcq	$0,%rdx
976	addq	(%r14),%r10
977	adcq	$0,%rdx
978	movq	%rdx,%r11
979
980	mulq	%rbp
981	addq	%rax,%r13
982	movq	8(%rsi,%r15,1),%rax
983	adcq	$0,%rdx
984	addq	%r10,%r13
985	adcq	$0,%rdx
986	movq	%rdi,-16(%r14)
987	movq	%rdx,%rdi
988
989	mulq	%rbx
990	addq	%rax,%r11
991	movq	8(%rcx),%rax
992	adcq	$0,%rdx
993	addq	8(%r14),%r11
994	adcq	$0,%rdx
995	movq	%rdx,%r10
996
997	mulq	%rbp
998	addq	%rax,%rdi
999	movq	16(%rsi,%r15,1),%rax
1000	adcq	$0,%rdx
1001	addq	%r11,%rdi
1002	leaq	32(%rcx),%rcx
1003	adcq	$0,%rdx
1004	movq	%r13,-8(%r14)
1005	movq	%rdx,%r13
1006
1007	addq	$32,%r15
1008	jnz	.Linner4x
1009
1010	mulq	%rbx
1011	addq	%rax,%r10
1012	movq	-16(%rcx),%rax
1013	adcq	$0,%rdx
1014	addq	16(%r14),%r10
1015	leaq	32(%r14),%r14
1016	adcq	$0,%rdx
1017	movq	%rdx,%r11
1018
1019	mulq	%rbp
1020	addq	%rax,%r13
1021	movq	-8(%rsi),%rax
1022	adcq	$0,%rdx
1023	addq	%r10,%r13
1024	adcq	$0,%rdx
1025	movq	%rdi,-32(%r14)
1026	movq	%rdx,%rdi
1027
1028	mulq	%rbx
1029	addq	%rax,%r11
1030	movq	%rbp,%rax
1031	movq	-8(%rcx),%rbp
1032	adcq	$0,%rdx
1033	addq	-8(%r14),%r11
1034	adcq	$0,%rdx
1035	movq	%rdx,%r10
1036
1037	mulq	%rbp
1038	addq	%rax,%rdi
1039	movq	(%rsi,%r9,1),%rax
1040	adcq	$0,%rdx
1041	addq	%r11,%rdi
1042	adcq	$0,%rdx
1043	movq	%r13,-24(%r14)
1044	movq	%rdx,%r13
1045
1046	movq	%rdi,-16(%r14)
1047	leaq	(%rcx,%r9,1),%rcx
1048
1049	xorq	%rdi,%rdi
1050	addq	%r10,%r13
1051	adcq	$0,%rdi
1052	addq	(%r14),%r13
1053	adcq	$0,%rdi
1054	movq	%r13,-8(%r14)
1055
1056	cmpq	16+8(%rsp),%r12
1057	jb	.Louter4x
1058	xorq	%rax,%rax
1059	subq	%r13,%rbp
1060	adcq	%r15,%r15
1061	orq	%r15,%rdi
1062	subq	%rdi,%rax
1063	leaq	(%r14,%r9,1),%rbx
1064	movq	(%rcx),%r12
1065	leaq	(%rcx),%rbp
1066	movq	%r9,%rcx
1067	sarq	$3+2,%rcx
1068	movq	56+8(%rsp),%rdi
1069	decq	%r12
1070	xorq	%r10,%r10
1071	movq	8(%rbp),%r13
1072	movq	16(%rbp),%r14
1073	movq	24(%rbp),%r15
1074	jmp	.Lsqr4x_sub_entry
1075.size	mul4x_internal,.-mul4x_internal
1076.globl	bn_power5
1077.type	bn_power5,@function
1078.align	32
1079bn_power5:
1080.cfi_startproc
1081	movq	%rsp,%rax
1082.cfi_def_cfa_register	%rax
1083	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
1084	andl	$0x80108,%r11d
1085	cmpl	$0x80108,%r11d
1086	je	.Lpowerx5_enter
1087	pushq	%rbx
1088.cfi_offset	%rbx,-16
1089	pushq	%rbp
1090.cfi_offset	%rbp,-24
1091	pushq	%r12
1092.cfi_offset	%r12,-32
1093	pushq	%r13
1094.cfi_offset	%r13,-40
1095	pushq	%r14
1096.cfi_offset	%r14,-48
1097	pushq	%r15
1098.cfi_offset	%r15,-56
1099.Lpower5_prologue:
1100
1101	shll	$3,%r9d
1102	leal	(%r9,%r9,2),%r10d
1103	negq	%r9
1104	movq	(%r8),%r8
1105
1106
1107
1108
1109
1110
1111
1112
1113	leaq	-320(%rsp,%r9,2),%r11
1114	movq	%rsp,%rbp
1115	subq	%rdi,%r11
1116	andq	$4095,%r11
1117	cmpq	%r11,%r10
1118	jb	.Lpwr_sp_alt
1119	subq	%r11,%rbp
1120	leaq	-320(%rbp,%r9,2),%rbp
1121	jmp	.Lpwr_sp_done
1122
1123.align	32
1124.Lpwr_sp_alt:
1125	leaq	4096-320(,%r9,2),%r10
1126	leaq	-320(%rbp,%r9,2),%rbp
1127	subq	%r10,%r11
1128	movq	$0,%r10
1129	cmovcq	%r10,%r11
1130	subq	%r11,%rbp
1131.Lpwr_sp_done:
1132	andq	$-64,%rbp
1133	movq	%rsp,%r11
1134	subq	%rbp,%r11
1135	andq	$-4096,%r11
1136	leaq	(%r11,%rbp,1),%rsp
1137	movq	(%rsp),%r10
1138	cmpq	%rbp,%rsp
1139	ja	.Lpwr_page_walk
1140	jmp	.Lpwr_page_walk_done
1141
1142.Lpwr_page_walk:
1143	leaq	-4096(%rsp),%rsp
1144	movq	(%rsp),%r10
1145	cmpq	%rbp,%rsp
1146	ja	.Lpwr_page_walk
1147.Lpwr_page_walk_done:
1148
1149	movq	%r9,%r10
1150	negq	%r9
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161	movq	%r8,32(%rsp)
1162	movq	%rax,40(%rsp)
1163.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
1164.Lpower5_body:
1165.byte	102,72,15,110,207
1166.byte	102,72,15,110,209
1167.byte	102,73,15,110,218
1168.byte	102,72,15,110,226
1169
1170	call	__bn_sqr8x_internal
1171	call	__bn_post4x_internal
1172	call	__bn_sqr8x_internal
1173	call	__bn_post4x_internal
1174	call	__bn_sqr8x_internal
1175	call	__bn_post4x_internal
1176	call	__bn_sqr8x_internal
1177	call	__bn_post4x_internal
1178	call	__bn_sqr8x_internal
1179	call	__bn_post4x_internal
1180
1181.byte	102,72,15,126,209
1182.byte	102,72,15,126,226
1183	movq	%rsi,%rdi
1184	movq	40(%rsp),%rax
1185	leaq	32(%rsp),%r8
1186
1187	call	mul4x_internal
1188
1189	movq	40(%rsp),%rsi
1190.cfi_def_cfa	%rsi,8
1191	movq	$1,%rax
1192	movq	-48(%rsi),%r15
1193.cfi_restore	%r15
1194	movq	-40(%rsi),%r14
1195.cfi_restore	%r14
1196	movq	-32(%rsi),%r13
1197.cfi_restore	%r13
1198	movq	-24(%rsi),%r12
1199.cfi_restore	%r12
1200	movq	-16(%rsi),%rbp
1201.cfi_restore	%rbp
1202	movq	-8(%rsi),%rbx
1203.cfi_restore	%rbx
1204	leaq	(%rsi),%rsp
1205.cfi_def_cfa_register	%rsp
1206.Lpower5_epilogue:
1207	.byte	0xf3,0xc3
1208.cfi_endproc
1209.size	bn_power5,.-bn_power5
1210
1211.globl	bn_sqr8x_internal
1212.hidden	bn_sqr8x_internal
1213.type	bn_sqr8x_internal,@function
1214.align	32
1215bn_sqr8x_internal:
1216__bn_sqr8x_internal:
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290	leaq	32(%r10),%rbp
1291	leaq	(%rsi,%r9,1),%rsi
1292
1293	movq	%r9,%rcx
1294
1295
1296	movq	-32(%rsi,%rbp,1),%r14
1297	leaq	48+8(%rsp,%r9,2),%rdi
1298	movq	-24(%rsi,%rbp,1),%rax
1299	leaq	-32(%rdi,%rbp,1),%rdi
1300	movq	-16(%rsi,%rbp,1),%rbx
1301	movq	%rax,%r15
1302
1303	mulq	%r14
1304	movq	%rax,%r10
1305	movq	%rbx,%rax
1306	movq	%rdx,%r11
1307	movq	%r10,-24(%rdi,%rbp,1)
1308
1309	mulq	%r14
1310	addq	%rax,%r11
1311	movq	%rbx,%rax
1312	adcq	$0,%rdx
1313	movq	%r11,-16(%rdi,%rbp,1)
1314	movq	%rdx,%r10
1315
1316
1317	movq	-8(%rsi,%rbp,1),%rbx
1318	mulq	%r15
1319	movq	%rax,%r12
1320	movq	%rbx,%rax
1321	movq	%rdx,%r13
1322
1323	leaq	(%rbp),%rcx
1324	mulq	%r14
1325	addq	%rax,%r10
1326	movq	%rbx,%rax
1327	movq	%rdx,%r11
1328	adcq	$0,%r11
1329	addq	%r12,%r10
1330	adcq	$0,%r11
1331	movq	%r10,-8(%rdi,%rcx,1)
1332	jmp	.Lsqr4x_1st
1333
1334.align	32
1335.Lsqr4x_1st:
1336	movq	(%rsi,%rcx,1),%rbx
1337	mulq	%r15
1338	addq	%rax,%r13
1339	movq	%rbx,%rax
1340	movq	%rdx,%r12
1341	adcq	$0,%r12
1342
1343	mulq	%r14
1344	addq	%rax,%r11
1345	movq	%rbx,%rax
1346	movq	8(%rsi,%rcx,1),%rbx
1347	movq	%rdx,%r10
1348	adcq	$0,%r10
1349	addq	%r13,%r11
1350	adcq	$0,%r10
1351
1352
1353	mulq	%r15
1354	addq	%rax,%r12
1355	movq	%rbx,%rax
1356	movq	%r11,(%rdi,%rcx,1)
1357	movq	%rdx,%r13
1358	adcq	$0,%r13
1359
1360	mulq	%r14
1361	addq	%rax,%r10
1362	movq	%rbx,%rax
1363	movq	16(%rsi,%rcx,1),%rbx
1364	movq	%rdx,%r11
1365	adcq	$0,%r11
1366	addq	%r12,%r10
1367	adcq	$0,%r11
1368
1369	mulq	%r15
1370	addq	%rax,%r13
1371	movq	%rbx,%rax
1372	movq	%r10,8(%rdi,%rcx,1)
1373	movq	%rdx,%r12
1374	adcq	$0,%r12
1375
1376	mulq	%r14
1377	addq	%rax,%r11
1378	movq	%rbx,%rax
1379	movq	24(%rsi,%rcx,1),%rbx
1380	movq	%rdx,%r10
1381	adcq	$0,%r10
1382	addq	%r13,%r11
1383	adcq	$0,%r10
1384
1385
1386	mulq	%r15
1387	addq	%rax,%r12
1388	movq	%rbx,%rax
1389	movq	%r11,16(%rdi,%rcx,1)
1390	movq	%rdx,%r13
1391	adcq	$0,%r13
1392	leaq	32(%rcx),%rcx
1393
1394	mulq	%r14
1395	addq	%rax,%r10
1396	movq	%rbx,%rax
1397	movq	%rdx,%r11
1398	adcq	$0,%r11
1399	addq	%r12,%r10
1400	adcq	$0,%r11
1401	movq	%r10,-8(%rdi,%rcx,1)
1402
1403	cmpq	$0,%rcx
1404	jne	.Lsqr4x_1st
1405
1406	mulq	%r15
1407	addq	%rax,%r13
1408	leaq	16(%rbp),%rbp
1409	adcq	$0,%rdx
1410	addq	%r11,%r13
1411	adcq	$0,%rdx
1412
1413	movq	%r13,(%rdi)
1414	movq	%rdx,%r12
1415	movq	%rdx,8(%rdi)
1416	jmp	.Lsqr4x_outer
1417
1418.align	32
1419.Lsqr4x_outer:
1420	movq	-32(%rsi,%rbp,1),%r14
1421	leaq	48+8(%rsp,%r9,2),%rdi
1422	movq	-24(%rsi,%rbp,1),%rax
1423	leaq	-32(%rdi,%rbp,1),%rdi
1424	movq	-16(%rsi,%rbp,1),%rbx
1425	movq	%rax,%r15
1426
1427	mulq	%r14
1428	movq	-24(%rdi,%rbp,1),%r10
1429	addq	%rax,%r10
1430	movq	%rbx,%rax
1431	adcq	$0,%rdx
1432	movq	%r10,-24(%rdi,%rbp,1)
1433	movq	%rdx,%r11
1434
1435	mulq	%r14
1436	addq	%rax,%r11
1437	movq	%rbx,%rax
1438	adcq	$0,%rdx
1439	addq	-16(%rdi,%rbp,1),%r11
1440	movq	%rdx,%r10
1441	adcq	$0,%r10
1442	movq	%r11,-16(%rdi,%rbp,1)
1443
1444	xorq	%r12,%r12
1445
1446	movq	-8(%rsi,%rbp,1),%rbx
1447	mulq	%r15
1448	addq	%rax,%r12
1449	movq	%rbx,%rax
1450	adcq	$0,%rdx
1451	addq	-8(%rdi,%rbp,1),%r12
1452	movq	%rdx,%r13
1453	adcq	$0,%r13
1454
1455	mulq	%r14
1456	addq	%rax,%r10
1457	movq	%rbx,%rax
1458	adcq	$0,%rdx
1459	addq	%r12,%r10
1460	movq	%rdx,%r11
1461	adcq	$0,%r11
1462	movq	%r10,-8(%rdi,%rbp,1)
1463
1464	leaq	(%rbp),%rcx
1465	jmp	.Lsqr4x_inner
1466
1467.align	32
1468.Lsqr4x_inner:
1469	movq	(%rsi,%rcx,1),%rbx
1470	mulq	%r15
1471	addq	%rax,%r13
1472	movq	%rbx,%rax
1473	movq	%rdx,%r12
1474	adcq	$0,%r12
1475	addq	(%rdi,%rcx,1),%r13
1476	adcq	$0,%r12
1477
1478.byte	0x67
1479	mulq	%r14
1480	addq	%rax,%r11
1481	movq	%rbx,%rax
1482	movq	8(%rsi,%rcx,1),%rbx
1483	movq	%rdx,%r10
1484	adcq	$0,%r10
1485	addq	%r13,%r11
1486	adcq	$0,%r10
1487
1488	mulq	%r15
1489	addq	%rax,%r12
1490	movq	%r11,(%rdi,%rcx,1)
1491	movq	%rbx,%rax
1492	movq	%rdx,%r13
1493	adcq	$0,%r13
1494	addq	8(%rdi,%rcx,1),%r12
1495	leaq	16(%rcx),%rcx
1496	adcq	$0,%r13
1497
1498	mulq	%r14
1499	addq	%rax,%r10
1500	movq	%rbx,%rax
1501	adcq	$0,%rdx
1502	addq	%r12,%r10
1503	movq	%rdx,%r11
1504	adcq	$0,%r11
1505	movq	%r10,-8(%rdi,%rcx,1)
1506
1507	cmpq	$0,%rcx
1508	jne	.Lsqr4x_inner
1509
1510.byte	0x67
1511	mulq	%r15
1512	addq	%rax,%r13
1513	adcq	$0,%rdx
1514	addq	%r11,%r13
1515	adcq	$0,%rdx
1516
1517	movq	%r13,(%rdi)
1518	movq	%rdx,%r12
1519	movq	%rdx,8(%rdi)
1520
1521	addq	$16,%rbp
1522	jnz	.Lsqr4x_outer
1523
1524
1525	movq	-32(%rsi),%r14
1526	leaq	48+8(%rsp,%r9,2),%rdi
1527	movq	-24(%rsi),%rax
1528	leaq	-32(%rdi,%rbp,1),%rdi
1529	movq	-16(%rsi),%rbx
1530	movq	%rax,%r15
1531
1532	mulq	%r14
1533	addq	%rax,%r10
1534	movq	%rbx,%rax
1535	movq	%rdx,%r11
1536	adcq	$0,%r11
1537
1538	mulq	%r14
1539	addq	%rax,%r11
1540	movq	%rbx,%rax
1541	movq	%r10,-24(%rdi)
1542	movq	%rdx,%r10
1543	adcq	$0,%r10
1544	addq	%r13,%r11
1545	movq	-8(%rsi),%rbx
1546	adcq	$0,%r10
1547
1548	mulq	%r15
1549	addq	%rax,%r12
1550	movq	%rbx,%rax
1551	movq	%r11,-16(%rdi)
1552	movq	%rdx,%r13
1553	adcq	$0,%r13
1554
1555	mulq	%r14
1556	addq	%rax,%r10
1557	movq	%rbx,%rax
1558	movq	%rdx,%r11
1559	adcq	$0,%r11
1560	addq	%r12,%r10
1561	adcq	$0,%r11
1562	movq	%r10,-8(%rdi)
1563
1564	mulq	%r15
1565	addq	%rax,%r13
1566	movq	-16(%rsi),%rax
1567	adcq	$0,%rdx
1568	addq	%r11,%r13
1569	adcq	$0,%rdx
1570
1571	movq	%r13,(%rdi)
1572	movq	%rdx,%r12
1573	movq	%rdx,8(%rdi)
1574
1575	mulq	%rbx
1576	addq	$16,%rbp
1577	xorq	%r14,%r14
1578	subq	%r9,%rbp
1579	xorq	%r15,%r15
1580
1581	addq	%r12,%rax
1582	adcq	$0,%rdx
1583	movq	%rax,8(%rdi)
1584	movq	%rdx,16(%rdi)
1585	movq	%r15,24(%rdi)
1586
1587	movq	-16(%rsi,%rbp,1),%rax
1588	leaq	48+8(%rsp),%rdi
1589	xorq	%r10,%r10
1590	movq	8(%rdi),%r11
1591
1592	leaq	(%r14,%r10,2),%r12
1593	shrq	$63,%r10
1594	leaq	(%rcx,%r11,2),%r13
1595	shrq	$63,%r11
1596	orq	%r10,%r13
1597	movq	16(%rdi),%r10
1598	movq	%r11,%r14
1599	mulq	%rax
1600	negq	%r15
1601	movq	24(%rdi),%r11
1602	adcq	%rax,%r12
1603	movq	-8(%rsi,%rbp,1),%rax
1604	movq	%r12,(%rdi)
1605	adcq	%rdx,%r13
1606
1607	leaq	(%r14,%r10,2),%rbx
1608	movq	%r13,8(%rdi)
1609	sbbq	%r15,%r15
1610	shrq	$63,%r10
1611	leaq	(%rcx,%r11,2),%r8
1612	shrq	$63,%r11
1613	orq	%r10,%r8
1614	movq	32(%rdi),%r10
1615	movq	%r11,%r14
1616	mulq	%rax
1617	negq	%r15
1618	movq	40(%rdi),%r11
1619	adcq	%rax,%rbx
1620	movq	0(%rsi,%rbp,1),%rax
1621	movq	%rbx,16(%rdi)
1622	adcq	%rdx,%r8
1623	leaq	16(%rbp),%rbp
1624	movq	%r8,24(%rdi)
1625	sbbq	%r15,%r15
1626	leaq	64(%rdi),%rdi
1627	jmp	.Lsqr4x_shift_n_add
1628
1629.align	32
1630.Lsqr4x_shift_n_add:
1631	leaq	(%r14,%r10,2),%r12
1632	shrq	$63,%r10
1633	leaq	(%rcx,%r11,2),%r13
1634	shrq	$63,%r11
1635	orq	%r10,%r13
1636	movq	-16(%rdi),%r10
1637	movq	%r11,%r14
1638	mulq	%rax
1639	negq	%r15
1640	movq	-8(%rdi),%r11
1641	adcq	%rax,%r12
1642	movq	-8(%rsi,%rbp,1),%rax
1643	movq	%r12,-32(%rdi)
1644	adcq	%rdx,%r13
1645
1646	leaq	(%r14,%r10,2),%rbx
1647	movq	%r13,-24(%rdi)
1648	sbbq	%r15,%r15
1649	shrq	$63,%r10
1650	leaq	(%rcx,%r11,2),%r8
1651	shrq	$63,%r11
1652	orq	%r10,%r8
1653	movq	0(%rdi),%r10
1654	movq	%r11,%r14
1655	mulq	%rax
1656	negq	%r15
1657	movq	8(%rdi),%r11
1658	adcq	%rax,%rbx
1659	movq	0(%rsi,%rbp,1),%rax
1660	movq	%rbx,-16(%rdi)
1661	adcq	%rdx,%r8
1662
1663	leaq	(%r14,%r10,2),%r12
1664	movq	%r8,-8(%rdi)
1665	sbbq	%r15,%r15
1666	shrq	$63,%r10
1667	leaq	(%rcx,%r11,2),%r13
1668	shrq	$63,%r11
1669	orq	%r10,%r13
1670	movq	16(%rdi),%r10
1671	movq	%r11,%r14
1672	mulq	%rax
1673	negq	%r15
1674	movq	24(%rdi),%r11
1675	adcq	%rax,%r12
1676	movq	8(%rsi,%rbp,1),%rax
1677	movq	%r12,0(%rdi)
1678	adcq	%rdx,%r13
1679
1680	leaq	(%r14,%r10,2),%rbx
1681	movq	%r13,8(%rdi)
1682	sbbq	%r15,%r15
1683	shrq	$63,%r10
1684	leaq	(%rcx,%r11,2),%r8
1685	shrq	$63,%r11
1686	orq	%r10,%r8
1687	movq	32(%rdi),%r10
1688	movq	%r11,%r14
1689	mulq	%rax
1690	negq	%r15
1691	movq	40(%rdi),%r11
1692	adcq	%rax,%rbx
1693	movq	16(%rsi,%rbp,1),%rax
1694	movq	%rbx,16(%rdi)
1695	adcq	%rdx,%r8
1696	movq	%r8,24(%rdi)
1697	sbbq	%r15,%r15
1698	leaq	64(%rdi),%rdi
1699	addq	$32,%rbp
1700	jnz	.Lsqr4x_shift_n_add
1701
1702	leaq	(%r14,%r10,2),%r12
1703.byte	0x67
1704	shrq	$63,%r10
1705	leaq	(%rcx,%r11,2),%r13
1706	shrq	$63,%r11
1707	orq	%r10,%r13
1708	movq	-16(%rdi),%r10
1709	movq	%r11,%r14
1710	mulq	%rax
1711	negq	%r15
1712	movq	-8(%rdi),%r11
1713	adcq	%rax,%r12
1714	movq	-8(%rsi),%rax
1715	movq	%r12,-32(%rdi)
1716	adcq	%rdx,%r13
1717
1718	leaq	(%r14,%r10,2),%rbx
1719	movq	%r13,-24(%rdi)
1720	sbbq	%r15,%r15
1721	shrq	$63,%r10
1722	leaq	(%rcx,%r11,2),%r8
1723	shrq	$63,%r11
1724	orq	%r10,%r8
1725	mulq	%rax
1726	negq	%r15
1727	adcq	%rax,%rbx
1728	adcq	%rdx,%r8
1729	movq	%rbx,-16(%rdi)
1730	movq	%r8,-8(%rdi)
1731.byte	102,72,15,126,213
1732__bn_sqr8x_reduction:
1733	xorq	%rax,%rax
1734	leaq	(%r9,%rbp,1),%rcx
1735	leaq	48+8(%rsp,%r9,2),%rdx
1736	movq	%rcx,0+8(%rsp)
1737	leaq	48+8(%rsp,%r9,1),%rdi
1738	movq	%rdx,8+8(%rsp)
1739	negq	%r9
1740	jmp	.L8x_reduction_loop
1741
1742.align	32
1743.L8x_reduction_loop:
1744	leaq	(%rdi,%r9,1),%rdi
1745.byte	0x66
1746	movq	0(%rdi),%rbx
1747	movq	8(%rdi),%r9
1748	movq	16(%rdi),%r10
1749	movq	24(%rdi),%r11
1750	movq	32(%rdi),%r12
1751	movq	40(%rdi),%r13
1752	movq	48(%rdi),%r14
1753	movq	56(%rdi),%r15
1754	movq	%rax,(%rdx)
1755	leaq	64(%rdi),%rdi
1756
1757.byte	0x67
1758	movq	%rbx,%r8
1759	imulq	32+8(%rsp),%rbx
1760	movq	0(%rbp),%rax
1761	movl	$8,%ecx
1762	jmp	.L8x_reduce
1763
1764.align	32
1765.L8x_reduce:
1766	mulq	%rbx
1767	movq	8(%rbp),%rax
1768	negq	%r8
1769	movq	%rdx,%r8
1770	adcq	$0,%r8
1771
1772	mulq	%rbx
1773	addq	%rax,%r9
1774	movq	16(%rbp),%rax
1775	adcq	$0,%rdx
1776	addq	%r9,%r8
1777	movq	%rbx,48-8+8(%rsp,%rcx,8)
1778	movq	%rdx,%r9
1779	adcq	$0,%r9
1780
1781	mulq	%rbx
1782	addq	%rax,%r10
1783	movq	24(%rbp),%rax
1784	adcq	$0,%rdx
1785	addq	%r10,%r9
1786	movq	32+8(%rsp),%rsi
1787	movq	%rdx,%r10
1788	adcq	$0,%r10
1789
1790	mulq	%rbx
1791	addq	%rax,%r11
1792	movq	32(%rbp),%rax
1793	adcq	$0,%rdx
1794	imulq	%r8,%rsi
1795	addq	%r11,%r10
1796	movq	%rdx,%r11
1797	adcq	$0,%r11
1798
1799	mulq	%rbx
1800	addq	%rax,%r12
1801	movq	40(%rbp),%rax
1802	adcq	$0,%rdx
1803	addq	%r12,%r11
1804	movq	%rdx,%r12
1805	adcq	$0,%r12
1806
1807	mulq	%rbx
1808	addq	%rax,%r13
1809	movq	48(%rbp),%rax
1810	adcq	$0,%rdx
1811	addq	%r13,%r12
1812	movq	%rdx,%r13
1813	adcq	$0,%r13
1814
1815	mulq	%rbx
1816	addq	%rax,%r14
1817	movq	56(%rbp),%rax
1818	adcq	$0,%rdx
1819	addq	%r14,%r13
1820	movq	%rdx,%r14
1821	adcq	$0,%r14
1822
1823	mulq	%rbx
1824	movq	%rsi,%rbx
1825	addq	%rax,%r15
1826	movq	0(%rbp),%rax
1827	adcq	$0,%rdx
1828	addq	%r15,%r14
1829	movq	%rdx,%r15
1830	adcq	$0,%r15
1831
1832	decl	%ecx
1833	jnz	.L8x_reduce
1834
1835	leaq	64(%rbp),%rbp
1836	xorq	%rax,%rax
1837	movq	8+8(%rsp),%rdx
1838	cmpq	0+8(%rsp),%rbp
1839	jae	.L8x_no_tail
1840
1841.byte	0x66
1842	addq	0(%rdi),%r8
1843	adcq	8(%rdi),%r9
1844	adcq	16(%rdi),%r10
1845	adcq	24(%rdi),%r11
1846	adcq	32(%rdi),%r12
1847	adcq	40(%rdi),%r13
1848	adcq	48(%rdi),%r14
1849	adcq	56(%rdi),%r15
1850	sbbq	%rsi,%rsi
1851
1852	movq	48+56+8(%rsp),%rbx
1853	movl	$8,%ecx
1854	movq	0(%rbp),%rax
1855	jmp	.L8x_tail
1856
1857.align	32
1858.L8x_tail:
1859	mulq	%rbx
1860	addq	%rax,%r8
1861	movq	8(%rbp),%rax
1862	movq	%r8,(%rdi)
1863	movq	%rdx,%r8
1864	adcq	$0,%r8
1865
1866	mulq	%rbx
1867	addq	%rax,%r9
1868	movq	16(%rbp),%rax
1869	adcq	$0,%rdx
1870	addq	%r9,%r8
1871	leaq	8(%rdi),%rdi
1872	movq	%rdx,%r9
1873	adcq	$0,%r9
1874
1875	mulq	%rbx
1876	addq	%rax,%r10
1877	movq	24(%rbp),%rax
1878	adcq	$0,%rdx
1879	addq	%r10,%r9
1880	movq	%rdx,%r10
1881	adcq	$0,%r10
1882
1883	mulq	%rbx
1884	addq	%rax,%r11
1885	movq	32(%rbp),%rax
1886	adcq	$0,%rdx
1887	addq	%r11,%r10
1888	movq	%rdx,%r11
1889	adcq	$0,%r11
1890
1891	mulq	%rbx
1892	addq	%rax,%r12
1893	movq	40(%rbp),%rax
1894	adcq	$0,%rdx
1895	addq	%r12,%r11
1896	movq	%rdx,%r12
1897	adcq	$0,%r12
1898
1899	mulq	%rbx
1900	addq	%rax,%r13
1901	movq	48(%rbp),%rax
1902	adcq	$0,%rdx
1903	addq	%r13,%r12
1904	movq	%rdx,%r13
1905	adcq	$0,%r13
1906
1907	mulq	%rbx
1908	addq	%rax,%r14
1909	movq	56(%rbp),%rax
1910	adcq	$0,%rdx
1911	addq	%r14,%r13
1912	movq	%rdx,%r14
1913	adcq	$0,%r14
1914
1915	mulq	%rbx
1916	movq	48-16+8(%rsp,%rcx,8),%rbx
1917	addq	%rax,%r15
1918	adcq	$0,%rdx
1919	addq	%r15,%r14
1920	movq	0(%rbp),%rax
1921	movq	%rdx,%r15
1922	adcq	$0,%r15
1923
1924	decl	%ecx
1925	jnz	.L8x_tail
1926
1927	leaq	64(%rbp),%rbp
1928	movq	8+8(%rsp),%rdx
1929	cmpq	0+8(%rsp),%rbp
1930	jae	.L8x_tail_done
1931
1932	movq	48+56+8(%rsp),%rbx
1933	negq	%rsi
1934	movq	0(%rbp),%rax
1935	adcq	0(%rdi),%r8
1936	adcq	8(%rdi),%r9
1937	adcq	16(%rdi),%r10
1938	adcq	24(%rdi),%r11
1939	adcq	32(%rdi),%r12
1940	adcq	40(%rdi),%r13
1941	adcq	48(%rdi),%r14
1942	adcq	56(%rdi),%r15
1943	sbbq	%rsi,%rsi
1944
1945	movl	$8,%ecx
1946	jmp	.L8x_tail
1947
1948.align	32
1949.L8x_tail_done:
1950	xorq	%rax,%rax
1951	addq	(%rdx),%r8
1952	adcq	$0,%r9
1953	adcq	$0,%r10
1954	adcq	$0,%r11
1955	adcq	$0,%r12
1956	adcq	$0,%r13
1957	adcq	$0,%r14
1958	adcq	$0,%r15
1959	adcq	$0,%rax
1960
1961	negq	%rsi
1962.L8x_no_tail:
1963	adcq	0(%rdi),%r8
1964	adcq	8(%rdi),%r9
1965	adcq	16(%rdi),%r10
1966	adcq	24(%rdi),%r11
1967	adcq	32(%rdi),%r12
1968	adcq	40(%rdi),%r13
1969	adcq	48(%rdi),%r14
1970	adcq	56(%rdi),%r15
1971	adcq	$0,%rax
1972	movq	-8(%rbp),%rcx
1973	xorq	%rsi,%rsi
1974
1975.byte	102,72,15,126,213
1976
1977	movq	%r8,0(%rdi)
1978	movq	%r9,8(%rdi)
1979.byte	102,73,15,126,217
1980	movq	%r10,16(%rdi)
1981	movq	%r11,24(%rdi)
1982	movq	%r12,32(%rdi)
1983	movq	%r13,40(%rdi)
1984	movq	%r14,48(%rdi)
1985	movq	%r15,56(%rdi)
1986	leaq	64(%rdi),%rdi
1987
1988	cmpq	%rdx,%rdi
1989	jb	.L8x_reduction_loop
1990	.byte	0xf3,0xc3
1991.size	bn_sqr8x_internal,.-bn_sqr8x_internal
1992.type	__bn_post4x_internal,@function
1993.align	32
1994__bn_post4x_internal:
1995	movq	0(%rbp),%r12
1996	leaq	(%rdi,%r9,1),%rbx
1997	movq	%r9,%rcx
1998.byte	102,72,15,126,207
1999	negq	%rax
2000.byte	102,72,15,126,206
2001	sarq	$3+2,%rcx
2002	decq	%r12
2003	xorq	%r10,%r10
2004	movq	8(%rbp),%r13
2005	movq	16(%rbp),%r14
2006	movq	24(%rbp),%r15
2007	jmp	.Lsqr4x_sub_entry
2008
2009.align	16
2010.Lsqr4x_sub:
2011	movq	0(%rbp),%r12
2012	movq	8(%rbp),%r13
2013	movq	16(%rbp),%r14
2014	movq	24(%rbp),%r15
2015.Lsqr4x_sub_entry:
2016	leaq	32(%rbp),%rbp
2017	notq	%r12
2018	notq	%r13
2019	notq	%r14
2020	notq	%r15
2021	andq	%rax,%r12
2022	andq	%rax,%r13
2023	andq	%rax,%r14
2024	andq	%rax,%r15
2025
2026	negq	%r10
2027	adcq	0(%rbx),%r12
2028	adcq	8(%rbx),%r13
2029	adcq	16(%rbx),%r14
2030	adcq	24(%rbx),%r15
2031	movq	%r12,0(%rdi)
2032	leaq	32(%rbx),%rbx
2033	movq	%r13,8(%rdi)
2034	sbbq	%r10,%r10
2035	movq	%r14,16(%rdi)
2036	movq	%r15,24(%rdi)
2037	leaq	32(%rdi),%rdi
2038
2039	incq	%rcx
2040	jnz	.Lsqr4x_sub
2041
2042	movq	%r9,%r10
2043	negq	%r9
2044	.byte	0xf3,0xc3
2045.size	__bn_post4x_internal,.-__bn_post4x_internal
2046.globl	bn_from_montgomery
2047.type	bn_from_montgomery,@function
2048.align	32
2049bn_from_montgomery:
2050	testl	$7,%r9d
2051	jz	bn_from_mont8x
2052	xorl	%eax,%eax
2053	.byte	0xf3,0xc3
2054.size	bn_from_montgomery,.-bn_from_montgomery
2055
2056.type	bn_from_mont8x,@function
2057.align	32
2058bn_from_mont8x:
2059.cfi_startproc
2060.byte	0x67
2061	movq	%rsp,%rax
2062.cfi_def_cfa_register	%rax
2063	pushq	%rbx
2064.cfi_offset	%rbx,-16
2065	pushq	%rbp
2066.cfi_offset	%rbp,-24
2067	pushq	%r12
2068.cfi_offset	%r12,-32
2069	pushq	%r13
2070.cfi_offset	%r13,-40
2071	pushq	%r14
2072.cfi_offset	%r14,-48
2073	pushq	%r15
2074.cfi_offset	%r15,-56
2075.Lfrom_prologue:
2076
2077	shll	$3,%r9d
2078	leaq	(%r9,%r9,2),%r10
2079	negq	%r9
2080	movq	(%r8),%r8
2081
2082
2083
2084
2085
2086
2087
2088
2089	leaq	-320(%rsp,%r9,2),%r11
2090	movq	%rsp,%rbp
2091	subq	%rdi,%r11
2092	andq	$4095,%r11
2093	cmpq	%r11,%r10
2094	jb	.Lfrom_sp_alt
2095	subq	%r11,%rbp
2096	leaq	-320(%rbp,%r9,2),%rbp
2097	jmp	.Lfrom_sp_done
2098
2099.align	32
2100.Lfrom_sp_alt:
2101	leaq	4096-320(,%r9,2),%r10
2102	leaq	-320(%rbp,%r9,2),%rbp
2103	subq	%r10,%r11
2104	movq	$0,%r10
2105	cmovcq	%r10,%r11
2106	subq	%r11,%rbp
2107.Lfrom_sp_done:
2108	andq	$-64,%rbp
2109	movq	%rsp,%r11
2110	subq	%rbp,%r11
2111	andq	$-4096,%r11
2112	leaq	(%r11,%rbp,1),%rsp
2113	movq	(%rsp),%r10
2114	cmpq	%rbp,%rsp
2115	ja	.Lfrom_page_walk
2116	jmp	.Lfrom_page_walk_done
2117
2118.Lfrom_page_walk:
2119	leaq	-4096(%rsp),%rsp
2120	movq	(%rsp),%r10
2121	cmpq	%rbp,%rsp
2122	ja	.Lfrom_page_walk
2123.Lfrom_page_walk_done:
2124
2125	movq	%r9,%r10
2126	negq	%r9
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137	movq	%r8,32(%rsp)
2138	movq	%rax,40(%rsp)
2139.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
2140.Lfrom_body:
2141	movq	%r9,%r11
2142	leaq	48(%rsp),%rax
2143	pxor	%xmm0,%xmm0
2144	jmp	.Lmul_by_1
2145
2146.align	32
2147.Lmul_by_1:
2148	movdqu	(%rsi),%xmm1
2149	movdqu	16(%rsi),%xmm2
2150	movdqu	32(%rsi),%xmm3
2151	movdqa	%xmm0,(%rax,%r9,1)
2152	movdqu	48(%rsi),%xmm4
2153	movdqa	%xmm0,16(%rax,%r9,1)
2154.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00
2155	movdqa	%xmm1,(%rax)
2156	movdqa	%xmm0,32(%rax,%r9,1)
2157	movdqa	%xmm2,16(%rax)
2158	movdqa	%xmm0,48(%rax,%r9,1)
2159	movdqa	%xmm3,32(%rax)
2160	movdqa	%xmm4,48(%rax)
2161	leaq	64(%rax),%rax
2162	subq	$64,%r11
2163	jnz	.Lmul_by_1
2164
2165.byte	102,72,15,110,207
2166.byte	102,72,15,110,209
2167.byte	0x67
2168	movq	%rcx,%rbp
2169.byte	102,73,15,110,218
2170	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
2171	andl	$0x80108,%r11d
2172	cmpl	$0x80108,%r11d
2173	jne	.Lfrom_mont_nox
2174
2175	leaq	(%rax,%r9,1),%rdi
2176	call	__bn_sqrx8x_reduction
2177	call	__bn_postx4x_internal
2178
2179	pxor	%xmm0,%xmm0
2180	leaq	48(%rsp),%rax
2181	jmp	.Lfrom_mont_zero
2182
2183.align	32
2184.Lfrom_mont_nox:
2185	call	__bn_sqr8x_reduction
2186	call	__bn_post4x_internal
2187
2188	pxor	%xmm0,%xmm0
2189	leaq	48(%rsp),%rax
2190	jmp	.Lfrom_mont_zero
2191
2192.align	32
2193.Lfrom_mont_zero:
2194	movq	40(%rsp),%rsi
2195.cfi_def_cfa	%rsi,8
2196	movdqa	%xmm0,0(%rax)
2197	movdqa	%xmm0,16(%rax)
2198	movdqa	%xmm0,32(%rax)
2199	movdqa	%xmm0,48(%rax)
2200	leaq	64(%rax),%rax
2201	subq	$32,%r9
2202	jnz	.Lfrom_mont_zero
2203
2204	movq	$1,%rax
2205	movq	-48(%rsi),%r15
2206.cfi_restore	%r15
2207	movq	-40(%rsi),%r14
2208.cfi_restore	%r14
2209	movq	-32(%rsi),%r13
2210.cfi_restore	%r13
2211	movq	-24(%rsi),%r12
2212.cfi_restore	%r12
2213	movq	-16(%rsi),%rbp
2214.cfi_restore	%rbp
2215	movq	-8(%rsi),%rbx
2216.cfi_restore	%rbx
2217	leaq	(%rsi),%rsp
2218.cfi_def_cfa_register	%rsp
2219.Lfrom_epilogue:
2220	.byte	0xf3,0xc3
2221.cfi_endproc
2222.size	bn_from_mont8x,.-bn_from_mont8x
2223.type	bn_mulx4x_mont_gather5,@function
2224.align	32
2225bn_mulx4x_mont_gather5:
2226.cfi_startproc
2227	movq	%rsp,%rax
2228.cfi_def_cfa_register	%rax
2229.Lmulx4x_enter:
2230	pushq	%rbx
2231.cfi_offset	%rbx,-16
2232	pushq	%rbp
2233.cfi_offset	%rbp,-24
2234	pushq	%r12
2235.cfi_offset	%r12,-32
2236	pushq	%r13
2237.cfi_offset	%r13,-40
2238	pushq	%r14
2239.cfi_offset	%r14,-48
2240	pushq	%r15
2241.cfi_offset	%r15,-56
2242.Lmulx4x_prologue:
2243
2244	shll	$3,%r9d
2245	leaq	(%r9,%r9,2),%r10
2246	negq	%r9
2247	movq	(%r8),%r8
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258	leaq	-320(%rsp,%r9,2),%r11
2259	movq	%rsp,%rbp
2260	subq	%rdi,%r11
2261	andq	$4095,%r11
2262	cmpq	%r11,%r10
2263	jb	.Lmulx4xsp_alt
2264	subq	%r11,%rbp
2265	leaq	-320(%rbp,%r9,2),%rbp
2266	jmp	.Lmulx4xsp_done
2267
2268.Lmulx4xsp_alt:
2269	leaq	4096-320(,%r9,2),%r10
2270	leaq	-320(%rbp,%r9,2),%rbp
2271	subq	%r10,%r11
2272	movq	$0,%r10
2273	cmovcq	%r10,%r11
2274	subq	%r11,%rbp
2275.Lmulx4xsp_done:
2276	andq	$-64,%rbp
2277	movq	%rsp,%r11
2278	subq	%rbp,%r11
2279	andq	$-4096,%r11
2280	leaq	(%r11,%rbp,1),%rsp
2281	movq	(%rsp),%r10
2282	cmpq	%rbp,%rsp
2283	ja	.Lmulx4x_page_walk
2284	jmp	.Lmulx4x_page_walk_done
2285
2286.Lmulx4x_page_walk:
2287	leaq	-4096(%rsp),%rsp
2288	movq	(%rsp),%r10
2289	cmpq	%rbp,%rsp
2290	ja	.Lmulx4x_page_walk
2291.Lmulx4x_page_walk_done:
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305	movq	%r8,32(%rsp)
2306	movq	%rax,40(%rsp)
2307.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
2308.Lmulx4x_body:
2309	call	mulx4x_internal
2310
2311	movq	40(%rsp),%rsi
2312.cfi_def_cfa	%rsi,8
2313	movq	$1,%rax
2314
2315	movq	-48(%rsi),%r15
2316.cfi_restore	%r15
2317	movq	-40(%rsi),%r14
2318.cfi_restore	%r14
2319	movq	-32(%rsi),%r13
2320.cfi_restore	%r13
2321	movq	-24(%rsi),%r12
2322.cfi_restore	%r12
2323	movq	-16(%rsi),%rbp
2324.cfi_restore	%rbp
2325	movq	-8(%rsi),%rbx
2326.cfi_restore	%rbx
2327	leaq	(%rsi),%rsp
2328.cfi_def_cfa_register	%rsp
2329.Lmulx4x_epilogue:
2330	.byte	0xf3,0xc3
2331.cfi_endproc
2332.size	bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
2333
2334.type	mulx4x_internal,@function
2335.align	32
2336mulx4x_internal:
2337	movq	%r9,8(%rsp)
2338	movq	%r9,%r10
2339	negq	%r9
2340	shlq	$5,%r9
2341	negq	%r10
2342	leaq	128(%rdx,%r9,1),%r13
2343	shrq	$5+5,%r9
2344	movd	8(%rax),%xmm5
2345	subq	$1,%r9
2346	leaq	.Linc(%rip),%rax
2347	movq	%r13,16+8(%rsp)
2348	movq	%r9,24+8(%rsp)
2349	movq	%rdi,56+8(%rsp)
2350	movdqa	0(%rax),%xmm0
2351	movdqa	16(%rax),%xmm1
2352	leaq	88-112(%rsp,%r10,1),%r10
2353	leaq	128(%rdx),%rdi
2354
2355	pshufd	$0,%xmm5,%xmm5
2356	movdqa	%xmm1,%xmm4
2357.byte	0x67
2358	movdqa	%xmm1,%xmm2
2359.byte	0x67
2360	paddd	%xmm0,%xmm1
2361	pcmpeqd	%xmm5,%xmm0
2362	movdqa	%xmm4,%xmm3
2363	paddd	%xmm1,%xmm2
2364	pcmpeqd	%xmm5,%xmm1
2365	movdqa	%xmm0,112(%r10)
2366	movdqa	%xmm4,%xmm0
2367
2368	paddd	%xmm2,%xmm3
2369	pcmpeqd	%xmm5,%xmm2
2370	movdqa	%xmm1,128(%r10)
2371	movdqa	%xmm4,%xmm1
2372
2373	paddd	%xmm3,%xmm0
2374	pcmpeqd	%xmm5,%xmm3
2375	movdqa	%xmm2,144(%r10)
2376	movdqa	%xmm4,%xmm2
2377
2378	paddd	%xmm0,%xmm1
2379	pcmpeqd	%xmm5,%xmm0
2380	movdqa	%xmm3,160(%r10)
2381	movdqa	%xmm4,%xmm3
2382	paddd	%xmm1,%xmm2
2383	pcmpeqd	%xmm5,%xmm1
2384	movdqa	%xmm0,176(%r10)
2385	movdqa	%xmm4,%xmm0
2386
2387	paddd	%xmm2,%xmm3
2388	pcmpeqd	%xmm5,%xmm2
2389	movdqa	%xmm1,192(%r10)
2390	movdqa	%xmm4,%xmm1
2391
2392	paddd	%xmm3,%xmm0
2393	pcmpeqd	%xmm5,%xmm3
2394	movdqa	%xmm2,208(%r10)
2395	movdqa	%xmm4,%xmm2
2396
2397	paddd	%xmm0,%xmm1
2398	pcmpeqd	%xmm5,%xmm0
2399	movdqa	%xmm3,224(%r10)
2400	movdqa	%xmm4,%xmm3
2401	paddd	%xmm1,%xmm2
2402	pcmpeqd	%xmm5,%xmm1
2403	movdqa	%xmm0,240(%r10)
2404	movdqa	%xmm4,%xmm0
2405
2406	paddd	%xmm2,%xmm3
2407	pcmpeqd	%xmm5,%xmm2
2408	movdqa	%xmm1,256(%r10)
2409	movdqa	%xmm4,%xmm1
2410
2411	paddd	%xmm3,%xmm0
2412	pcmpeqd	%xmm5,%xmm3
2413	movdqa	%xmm2,272(%r10)
2414	movdqa	%xmm4,%xmm2
2415
2416	paddd	%xmm0,%xmm1
2417	pcmpeqd	%xmm5,%xmm0
2418	movdqa	%xmm3,288(%r10)
2419	movdqa	%xmm4,%xmm3
2420.byte	0x67
2421	paddd	%xmm1,%xmm2
2422	pcmpeqd	%xmm5,%xmm1
2423	movdqa	%xmm0,304(%r10)
2424
2425	paddd	%xmm2,%xmm3
2426	pcmpeqd	%xmm5,%xmm2
2427	movdqa	%xmm1,320(%r10)
2428
2429	pcmpeqd	%xmm5,%xmm3
2430	movdqa	%xmm2,336(%r10)
2431
2432	pand	64(%rdi),%xmm0
2433	pand	80(%rdi),%xmm1
2434	pand	96(%rdi),%xmm2
2435	movdqa	%xmm3,352(%r10)
2436	pand	112(%rdi),%xmm3
2437	por	%xmm2,%xmm0
2438	por	%xmm3,%xmm1
2439	movdqa	-128(%rdi),%xmm4
2440	movdqa	-112(%rdi),%xmm5
2441	movdqa	-96(%rdi),%xmm2
2442	pand	112(%r10),%xmm4
2443	movdqa	-80(%rdi),%xmm3
2444	pand	128(%r10),%xmm5
2445	por	%xmm4,%xmm0
2446	pand	144(%r10),%xmm2
2447	por	%xmm5,%xmm1
2448	pand	160(%r10),%xmm3
2449	por	%xmm2,%xmm0
2450	por	%xmm3,%xmm1
2451	movdqa	-64(%rdi),%xmm4
2452	movdqa	-48(%rdi),%xmm5
2453	movdqa	-32(%rdi),%xmm2
2454	pand	176(%r10),%xmm4
2455	movdqa	-16(%rdi),%xmm3
2456	pand	192(%r10),%xmm5
2457	por	%xmm4,%xmm0
2458	pand	208(%r10),%xmm2
2459	por	%xmm5,%xmm1
2460	pand	224(%r10),%xmm3
2461	por	%xmm2,%xmm0
2462	por	%xmm3,%xmm1
2463	movdqa	0(%rdi),%xmm4
2464	movdqa	16(%rdi),%xmm5
2465	movdqa	32(%rdi),%xmm2
2466	pand	240(%r10),%xmm4
2467	movdqa	48(%rdi),%xmm3
2468	pand	256(%r10),%xmm5
2469	por	%xmm4,%xmm0
2470	pand	272(%r10),%xmm2
2471	por	%xmm5,%xmm1
2472	pand	288(%r10),%xmm3
2473	por	%xmm2,%xmm0
2474	por	%xmm3,%xmm1
2475	pxor	%xmm1,%xmm0
2476	pshufd	$0x4e,%xmm0,%xmm1
2477	por	%xmm1,%xmm0
2478	leaq	256(%rdi),%rdi
2479.byte	102,72,15,126,194
2480	leaq	64+32+8(%rsp),%rbx
2481
2482	movq	%rdx,%r9
2483	mulxq	0(%rsi),%r8,%rax
2484	mulxq	8(%rsi),%r11,%r12
2485	addq	%rax,%r11
2486	mulxq	16(%rsi),%rax,%r13
2487	adcq	%rax,%r12
2488	adcq	$0,%r13
2489	mulxq	24(%rsi),%rax,%r14
2490
2491	movq	%r8,%r15
2492	imulq	32+8(%rsp),%r8
2493	xorq	%rbp,%rbp
2494	movq	%r8,%rdx
2495
2496	movq	%rdi,8+8(%rsp)
2497
2498	leaq	32(%rsi),%rsi
2499	adcxq	%rax,%r13
2500	adcxq	%rbp,%r14
2501
2502	mulxq	0(%rcx),%rax,%r10
2503	adcxq	%rax,%r15
2504	adoxq	%r11,%r10
2505	mulxq	8(%rcx),%rax,%r11
2506	adcxq	%rax,%r10
2507	adoxq	%r12,%r11
2508	mulxq	16(%rcx),%rax,%r12
2509	movq	24+8(%rsp),%rdi
2510	movq	%r10,-32(%rbx)
2511	adcxq	%rax,%r11
2512	adoxq	%r13,%r12
2513	mulxq	24(%rcx),%rax,%r15
2514	movq	%r9,%rdx
2515	movq	%r11,-24(%rbx)
2516	adcxq	%rax,%r12
2517	adoxq	%rbp,%r15
2518	leaq	32(%rcx),%rcx
2519	movq	%r12,-16(%rbx)
2520	jmp	.Lmulx4x_1st
2521
2522.align	32
2523.Lmulx4x_1st:
2524	adcxq	%rbp,%r15
2525	mulxq	0(%rsi),%r10,%rax
2526	adcxq	%r14,%r10
2527	mulxq	8(%rsi),%r11,%r14
2528	adcxq	%rax,%r11
2529	mulxq	16(%rsi),%r12,%rax
2530	adcxq	%r14,%r12
2531	mulxq	24(%rsi),%r13,%r14
2532.byte	0x67,0x67
2533	movq	%r8,%rdx
2534	adcxq	%rax,%r13
2535	adcxq	%rbp,%r14
2536	leaq	32(%rsi),%rsi
2537	leaq	32(%rbx),%rbx
2538
2539	adoxq	%r15,%r10
2540	mulxq	0(%rcx),%rax,%r15
2541	adcxq	%rax,%r10
2542	adoxq	%r15,%r11
2543	mulxq	8(%rcx),%rax,%r15
2544	adcxq	%rax,%r11
2545	adoxq	%r15,%r12
2546	mulxq	16(%rcx),%rax,%r15
2547	movq	%r10,-40(%rbx)
2548	adcxq	%rax,%r12
2549	movq	%r11,-32(%rbx)
2550	adoxq	%r15,%r13
2551	mulxq	24(%rcx),%rax,%r15
2552	movq	%r9,%rdx
2553	movq	%r12,-24(%rbx)
2554	adcxq	%rax,%r13
2555	adoxq	%rbp,%r15
2556	leaq	32(%rcx),%rcx
2557	movq	%r13,-16(%rbx)
2558
2559	decq	%rdi
2560	jnz	.Lmulx4x_1st
2561
2562	movq	8(%rsp),%rax
2563	adcq	%rbp,%r15
2564	leaq	(%rsi,%rax,1),%rsi
2565	addq	%r15,%r14
2566	movq	8+8(%rsp),%rdi
2567	adcq	%rbp,%rbp
2568	movq	%r14,-8(%rbx)
2569	jmp	.Lmulx4x_outer
2570
2571.align	32
2572.Lmulx4x_outer:
2573	leaq	16-256(%rbx),%r10
2574	pxor	%xmm4,%xmm4
2575.byte	0x67,0x67
2576	pxor	%xmm5,%xmm5
2577	movdqa	-128(%rdi),%xmm0
2578	movdqa	-112(%rdi),%xmm1
2579	movdqa	-96(%rdi),%xmm2
2580	pand	256(%r10),%xmm0
2581	movdqa	-80(%rdi),%xmm3
2582	pand	272(%r10),%xmm1
2583	por	%xmm0,%xmm4
2584	pand	288(%r10),%xmm2
2585	por	%xmm1,%xmm5
2586	pand	304(%r10),%xmm3
2587	por	%xmm2,%xmm4
2588	por	%xmm3,%xmm5
2589	movdqa	-64(%rdi),%xmm0
2590	movdqa	-48(%rdi),%xmm1
2591	movdqa	-32(%rdi),%xmm2
2592	pand	320(%r10),%xmm0
2593	movdqa	-16(%rdi),%xmm3
2594	pand	336(%r10),%xmm1
2595	por	%xmm0,%xmm4
2596	pand	352(%r10),%xmm2
2597	por	%xmm1,%xmm5
2598	pand	368(%r10),%xmm3
2599	por	%xmm2,%xmm4
2600	por	%xmm3,%xmm5
2601	movdqa	0(%rdi),%xmm0
2602	movdqa	16(%rdi),%xmm1
2603	movdqa	32(%rdi),%xmm2
2604	pand	384(%r10),%xmm0
2605	movdqa	48(%rdi),%xmm3
2606	pand	400(%r10),%xmm1
2607	por	%xmm0,%xmm4
2608	pand	416(%r10),%xmm2
2609	por	%xmm1,%xmm5
2610	pand	432(%r10),%xmm3
2611	por	%xmm2,%xmm4
2612	por	%xmm3,%xmm5
2613	movdqa	64(%rdi),%xmm0
2614	movdqa	80(%rdi),%xmm1
2615	movdqa	96(%rdi),%xmm2
2616	pand	448(%r10),%xmm0
2617	movdqa	112(%rdi),%xmm3
2618	pand	464(%r10),%xmm1
2619	por	%xmm0,%xmm4
2620	pand	480(%r10),%xmm2
2621	por	%xmm1,%xmm5
2622	pand	496(%r10),%xmm3
2623	por	%xmm2,%xmm4
2624	por	%xmm3,%xmm5
2625	por	%xmm5,%xmm4
2626	pshufd	$0x4e,%xmm4,%xmm0
2627	por	%xmm4,%xmm0
2628	leaq	256(%rdi),%rdi
2629.byte	102,72,15,126,194
2630
2631	movq	%rbp,(%rbx)
2632	leaq	32(%rbx,%rax,1),%rbx
2633	mulxq	0(%rsi),%r8,%r11
2634	xorq	%rbp,%rbp
2635	movq	%rdx,%r9
2636	mulxq	8(%rsi),%r14,%r12
2637	adoxq	-32(%rbx),%r8
2638	adcxq	%r14,%r11
2639	mulxq	16(%rsi),%r15,%r13
2640	adoxq	-24(%rbx),%r11
2641	adcxq	%r15,%r12
2642	mulxq	24(%rsi),%rdx,%r14
2643	adoxq	-16(%rbx),%r12
2644	adcxq	%rdx,%r13
2645	leaq	(%rcx,%rax,1),%rcx
2646	leaq	32(%rsi),%rsi
2647	adoxq	-8(%rbx),%r13
2648	adcxq	%rbp,%r14
2649	adoxq	%rbp,%r14
2650
2651	movq	%r8,%r15
2652	imulq	32+8(%rsp),%r8
2653
2654	movq	%r8,%rdx
2655	xorq	%rbp,%rbp
2656	movq	%rdi,8+8(%rsp)
2657
2658	mulxq	0(%rcx),%rax,%r10
2659	adcxq	%rax,%r15
2660	adoxq	%r11,%r10
2661	mulxq	8(%rcx),%rax,%r11
2662	adcxq	%rax,%r10
2663	adoxq	%r12,%r11
2664	mulxq	16(%rcx),%rax,%r12
2665	adcxq	%rax,%r11
2666	adoxq	%r13,%r12
2667	mulxq	24(%rcx),%rax,%r15
2668	movq	%r9,%rdx
2669	movq	24+8(%rsp),%rdi
2670	movq	%r10,-32(%rbx)
2671	adcxq	%rax,%r12
2672	movq	%r11,-24(%rbx)
2673	adoxq	%rbp,%r15
2674	movq	%r12,-16(%rbx)
2675	leaq	32(%rcx),%rcx
2676	jmp	.Lmulx4x_inner
2677
2678.align	32
2679.Lmulx4x_inner:
2680	mulxq	0(%rsi),%r10,%rax
2681	adcxq	%rbp,%r15
2682	adoxq	%r14,%r10
2683	mulxq	8(%rsi),%r11,%r14
2684	adcxq	0(%rbx),%r10
2685	adoxq	%rax,%r11
2686	mulxq	16(%rsi),%r12,%rax
2687	adcxq	8(%rbx),%r11
2688	adoxq	%r14,%r12
2689	mulxq	24(%rsi),%r13,%r14
2690	movq	%r8,%rdx
2691	adcxq	16(%rbx),%r12
2692	adoxq	%rax,%r13
2693	adcxq	24(%rbx),%r13
2694	adoxq	%rbp,%r14
2695	leaq	32(%rsi),%rsi
2696	leaq	32(%rbx),%rbx
2697	adcxq	%rbp,%r14
2698
2699	adoxq	%r15,%r10
2700	mulxq	0(%rcx),%rax,%r15
2701	adcxq	%rax,%r10
2702	adoxq	%r15,%r11
2703	mulxq	8(%rcx),%rax,%r15
2704	adcxq	%rax,%r11
2705	adoxq	%r15,%r12
2706	mulxq	16(%rcx),%rax,%r15
2707	movq	%r10,-40(%rbx)
2708	adcxq	%rax,%r12
2709	adoxq	%r15,%r13
2710	movq	%r11,-32(%rbx)
2711	mulxq	24(%rcx),%rax,%r15
2712	movq	%r9,%rdx
2713	leaq	32(%rcx),%rcx
2714	movq	%r12,-24(%rbx)
2715	adcxq	%rax,%r13
2716	adoxq	%rbp,%r15
2717	movq	%r13,-16(%rbx)
2718
2719	decq	%rdi
2720	jnz	.Lmulx4x_inner
2721
2722	movq	0+8(%rsp),%rax
2723	adcq	%rbp,%r15
2724	subq	0(%rbx),%rdi
2725	movq	8+8(%rsp),%rdi
2726	movq	16+8(%rsp),%r10
2727	adcq	%r15,%r14
2728	leaq	(%rsi,%rax,1),%rsi
2729	adcq	%rbp,%rbp
2730	movq	%r14,-8(%rbx)
2731
2732	cmpq	%r10,%rdi
2733	jb	.Lmulx4x_outer
2734
2735	movq	-8(%rcx),%r10
2736	movq	%rbp,%r8
2737	movq	(%rcx,%rax,1),%r12
2738	leaq	(%rcx,%rax,1),%rbp
2739	movq	%rax,%rcx
2740	leaq	(%rbx,%rax,1),%rdi
2741	xorl	%eax,%eax
2742	xorq	%r15,%r15
2743	subq	%r14,%r10
2744	adcq	%r15,%r15
2745	orq	%r15,%r8
2746	sarq	$3+2,%rcx
2747	subq	%r8,%rax
2748	movq	56+8(%rsp),%rdx
2749	decq	%r12
2750	movq	8(%rbp),%r13
2751	xorq	%r8,%r8
2752	movq	16(%rbp),%r14
2753	movq	24(%rbp),%r15
2754	jmp	.Lsqrx4x_sub_entry
2755.size	mulx4x_internal,.-mulx4x_internal
2756.type	bn_powerx5,@function
2757.align	32
2758bn_powerx5:
2759.cfi_startproc
2760	movq	%rsp,%rax
2761.cfi_def_cfa_register	%rax
2762.Lpowerx5_enter:
2763	pushq	%rbx
2764.cfi_offset	%rbx,-16
2765	pushq	%rbp
2766.cfi_offset	%rbp,-24
2767	pushq	%r12
2768.cfi_offset	%r12,-32
2769	pushq	%r13
2770.cfi_offset	%r13,-40
2771	pushq	%r14
2772.cfi_offset	%r14,-48
2773	pushq	%r15
2774.cfi_offset	%r15,-56
2775.Lpowerx5_prologue:
2776
2777	shll	$3,%r9d
2778	leaq	(%r9,%r9,2),%r10
2779	negq	%r9
2780	movq	(%r8),%r8
2781
2782
2783
2784
2785
2786
2787
2788
2789	leaq	-320(%rsp,%r9,2),%r11
2790	movq	%rsp,%rbp
2791	subq	%rdi,%r11
2792	andq	$4095,%r11
2793	cmpq	%r11,%r10
2794	jb	.Lpwrx_sp_alt
2795	subq	%r11,%rbp
2796	leaq	-320(%rbp,%r9,2),%rbp
2797	jmp	.Lpwrx_sp_done
2798
2799.align	32
2800.Lpwrx_sp_alt:
2801	leaq	4096-320(,%r9,2),%r10
2802	leaq	-320(%rbp,%r9,2),%rbp
2803	subq	%r10,%r11
2804	movq	$0,%r10
2805	cmovcq	%r10,%r11
2806	subq	%r11,%rbp
2807.Lpwrx_sp_done:
2808	andq	$-64,%rbp
2809	movq	%rsp,%r11
2810	subq	%rbp,%r11
2811	andq	$-4096,%r11
2812	leaq	(%r11,%rbp,1),%rsp
2813	movq	(%rsp),%r10
2814	cmpq	%rbp,%rsp
2815	ja	.Lpwrx_page_walk
2816	jmp	.Lpwrx_page_walk_done
2817
2818.Lpwrx_page_walk:
2819	leaq	-4096(%rsp),%rsp
2820	movq	(%rsp),%r10
2821	cmpq	%rbp,%rsp
2822	ja	.Lpwrx_page_walk
2823.Lpwrx_page_walk_done:
2824
2825	movq	%r9,%r10
2826	negq	%r9
2827
2828
2829
2830
2831
2832
2833
2834
2835
2836
2837
2838
2839	pxor	%xmm0,%xmm0
2840.byte	102,72,15,110,207
2841.byte	102,72,15,110,209
2842.byte	102,73,15,110,218
2843.byte	102,72,15,110,226
2844	movq	%r8,32(%rsp)
2845	movq	%rax,40(%rsp)
2846.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
2847.Lpowerx5_body:
2848
2849	call	__bn_sqrx8x_internal
2850	call	__bn_postx4x_internal
2851	call	__bn_sqrx8x_internal
2852	call	__bn_postx4x_internal
2853	call	__bn_sqrx8x_internal
2854	call	__bn_postx4x_internal
2855	call	__bn_sqrx8x_internal
2856	call	__bn_postx4x_internal
2857	call	__bn_sqrx8x_internal
2858	call	__bn_postx4x_internal
2859
2860	movq	%r10,%r9
2861	movq	%rsi,%rdi
2862.byte	102,72,15,126,209
2863.byte	102,72,15,126,226
2864	movq	40(%rsp),%rax
2865
2866	call	mulx4x_internal
2867
2868	movq	40(%rsp),%rsi
2869.cfi_def_cfa	%rsi,8
2870	movq	$1,%rax
2871
2872	movq	-48(%rsi),%r15
2873.cfi_restore	%r15
2874	movq	-40(%rsi),%r14
2875.cfi_restore	%r14
2876	movq	-32(%rsi),%r13
2877.cfi_restore	%r13
2878	movq	-24(%rsi),%r12
2879.cfi_restore	%r12
2880	movq	-16(%rsi),%rbp
2881.cfi_restore	%rbp
2882	movq	-8(%rsi),%rbx
2883.cfi_restore	%rbx
2884	leaq	(%rsi),%rsp
2885.cfi_def_cfa_register	%rsp
2886.Lpowerx5_epilogue:
2887	.byte	0xf3,0xc3
2888.cfi_endproc
2889.size	bn_powerx5,.-bn_powerx5
2890
2891.globl	bn_sqrx8x_internal
2892.hidden	bn_sqrx8x_internal
2893.type	bn_sqrx8x_internal,@function
2894.align	32
2895bn_sqrx8x_internal:
2896__bn_sqrx8x_internal:
2897
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937	leaq	48+8(%rsp),%rdi
2938	leaq	(%rsi,%r9,1),%rbp
2939	movq	%r9,0+8(%rsp)
2940	movq	%rbp,8+8(%rsp)
2941	jmp	.Lsqr8x_zero_start
2942
2943.align	32
2944.byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
2945.Lsqrx8x_zero:
2946.byte	0x3e
2947	movdqa	%xmm0,0(%rdi)
2948	movdqa	%xmm0,16(%rdi)
2949	movdqa	%xmm0,32(%rdi)
2950	movdqa	%xmm0,48(%rdi)
2951.Lsqr8x_zero_start:
2952	movdqa	%xmm0,64(%rdi)
2953	movdqa	%xmm0,80(%rdi)
2954	movdqa	%xmm0,96(%rdi)
2955	movdqa	%xmm0,112(%rdi)
2956	leaq	128(%rdi),%rdi
2957	subq	$64,%r9
2958	jnz	.Lsqrx8x_zero
2959
2960	movq	0(%rsi),%rdx
2961
2962	xorq	%r10,%r10
2963	xorq	%r11,%r11
2964	xorq	%r12,%r12
2965	xorq	%r13,%r13
2966	xorq	%r14,%r14
2967	xorq	%r15,%r15
2968	leaq	48+8(%rsp),%rdi
2969	xorq	%rbp,%rbp
2970	jmp	.Lsqrx8x_outer_loop
2971
2972.align	32
2973.Lsqrx8x_outer_loop:
2974	mulxq	8(%rsi),%r8,%rax
2975	adcxq	%r9,%r8
2976	adoxq	%rax,%r10
2977	mulxq	16(%rsi),%r9,%rax
2978	adcxq	%r10,%r9
2979	adoxq	%rax,%r11
2980.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
2981	adcxq	%r11,%r10
2982	adoxq	%rax,%r12
2983.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
2984	adcxq	%r12,%r11
2985	adoxq	%rax,%r13
2986	mulxq	40(%rsi),%r12,%rax
2987	adcxq	%r13,%r12
2988	adoxq	%rax,%r14
2989	mulxq	48(%rsi),%r13,%rax
2990	adcxq	%r14,%r13
2991	adoxq	%r15,%rax
2992	mulxq	56(%rsi),%r14,%r15
2993	movq	8(%rsi),%rdx
2994	adcxq	%rax,%r14
2995	adoxq	%rbp,%r15
2996	adcq	64(%rdi),%r15
2997	movq	%r8,8(%rdi)
2998	movq	%r9,16(%rdi)
2999	sbbq	%rcx,%rcx
3000	xorq	%rbp,%rbp
3001
3002
3003	mulxq	16(%rsi),%r8,%rbx
3004	mulxq	24(%rsi),%r9,%rax
3005	adcxq	%r10,%r8
3006	adoxq	%rbx,%r9
3007	mulxq	32(%rsi),%r10,%rbx
3008	adcxq	%r11,%r9
3009	adoxq	%rax,%r10
3010.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
3011	adcxq	%r12,%r10
3012	adoxq	%rbx,%r11
3013.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
3014	adcxq	%r13,%r11
3015	adoxq	%r14,%r12
3016.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
3017	movq	16(%rsi),%rdx
3018	adcxq	%rax,%r12
3019	adoxq	%rbx,%r13
3020	adcxq	%r15,%r13
3021	adoxq	%rbp,%r14
3022	adcxq	%rbp,%r14
3023
3024	movq	%r8,24(%rdi)
3025	movq	%r9,32(%rdi)
3026
3027	mulxq	24(%rsi),%r8,%rbx
3028	mulxq	32(%rsi),%r9,%rax
3029	adcxq	%r10,%r8
3030	adoxq	%rbx,%r9
3031	mulxq	40(%rsi),%r10,%rbx
3032	adcxq	%r11,%r9
3033	adoxq	%rax,%r10
3034.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
3035	adcxq	%r12,%r10
3036	adoxq	%r13,%r11
3037.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
3038.byte	0x3e
3039	movq	24(%rsi),%rdx
3040	adcxq	%rbx,%r11
3041	adoxq	%rax,%r12
3042	adcxq	%r14,%r12
3043	movq	%r8,40(%rdi)
3044	movq	%r9,48(%rdi)
3045	mulxq	32(%rsi),%r8,%rax
3046	adoxq	%rbp,%r13
3047	adcxq	%rbp,%r13
3048
3049	mulxq	40(%rsi),%r9,%rbx
3050	adcxq	%r10,%r8
3051	adoxq	%rax,%r9
3052	mulxq	48(%rsi),%r10,%rax
3053	adcxq	%r11,%r9
3054	adoxq	%r12,%r10
3055	mulxq	56(%rsi),%r11,%r12
3056	movq	32(%rsi),%rdx
3057	movq	40(%rsi),%r14
3058	adcxq	%rbx,%r10
3059	adoxq	%rax,%r11
3060	movq	48(%rsi),%r15
3061	adcxq	%r13,%r11
3062	adoxq	%rbp,%r12
3063	adcxq	%rbp,%r12
3064
3065	movq	%r8,56(%rdi)
3066	movq	%r9,64(%rdi)
3067
3068	mulxq	%r14,%r9,%rax
3069	movq	56(%rsi),%r8
3070	adcxq	%r10,%r9
3071	mulxq	%r15,%r10,%rbx
3072	adoxq	%rax,%r10
3073	adcxq	%r11,%r10
3074	mulxq	%r8,%r11,%rax
3075	movq	%r14,%rdx
3076	adoxq	%rbx,%r11
3077	adcxq	%r12,%r11
3078
3079	adcxq	%rbp,%rax
3080
3081	mulxq	%r15,%r14,%rbx
3082	mulxq	%r8,%r12,%r13
3083	movq	%r15,%rdx
3084	leaq	64(%rsi),%rsi
3085	adcxq	%r14,%r11
3086	adoxq	%rbx,%r12
3087	adcxq	%rax,%r12
3088	adoxq	%rbp,%r13
3089
3090.byte	0x67,0x67
3091	mulxq	%r8,%r8,%r14
3092	adcxq	%r8,%r13
3093	adcxq	%rbp,%r14
3094
3095	cmpq	8+8(%rsp),%rsi
3096	je	.Lsqrx8x_outer_break
3097
3098	negq	%rcx
3099	movq	$-8,%rcx
3100	movq	%rbp,%r15
3101	movq	64(%rdi),%r8
3102	adcxq	72(%rdi),%r9
3103	adcxq	80(%rdi),%r10
3104	adcxq	88(%rdi),%r11
3105	adcq	96(%rdi),%r12
3106	adcq	104(%rdi),%r13
3107	adcq	112(%rdi),%r14
3108	adcq	120(%rdi),%r15
3109	leaq	(%rsi),%rbp
3110	leaq	128(%rdi),%rdi
3111	sbbq	%rax,%rax
3112
3113	movq	-64(%rsi),%rdx
3114	movq	%rax,16+8(%rsp)
3115	movq	%rdi,24+8(%rsp)
3116
3117
3118	xorl	%eax,%eax
3119	jmp	.Lsqrx8x_loop
3120
3121.align	32
3122.Lsqrx8x_loop:
3123	movq	%r8,%rbx
3124	mulxq	0(%rbp),%rax,%r8
3125	adcxq	%rax,%rbx
3126	adoxq	%r9,%r8
3127
3128	mulxq	8(%rbp),%rax,%r9
3129	adcxq	%rax,%r8
3130	adoxq	%r10,%r9
3131
3132	mulxq	16(%rbp),%rax,%r10
3133	adcxq	%rax,%r9
3134	adoxq	%r11,%r10
3135
3136	mulxq	24(%rbp),%rax,%r11
3137	adcxq	%rax,%r10
3138	adoxq	%r12,%r11
3139
3140.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
3141	adcxq	%rax,%r11
3142	adoxq	%r13,%r12
3143
3144	mulxq	40(%rbp),%rax,%r13
3145	adcxq	%rax,%r12
3146	adoxq	%r14,%r13
3147
3148	mulxq	48(%rbp),%rax,%r14
3149	movq	%rbx,(%rdi,%rcx,8)
3150	movl	$0,%ebx
3151	adcxq	%rax,%r13
3152	adoxq	%r15,%r14
3153
3154.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
3155	movq	8(%rsi,%rcx,8),%rdx
3156	adcxq	%rax,%r14
3157	adoxq	%rbx,%r15
3158	adcxq	%rbx,%r15
3159
3160.byte	0x67
3161	incq	%rcx
3162	jnz	.Lsqrx8x_loop
3163
3164	leaq	64(%rbp),%rbp
3165	movq	$-8,%rcx
3166	cmpq	8+8(%rsp),%rbp
3167	je	.Lsqrx8x_break
3168
3169	subq	16+8(%rsp),%rbx
3170.byte	0x66
3171	movq	-64(%rsi),%rdx
3172	adcxq	0(%rdi),%r8
3173	adcxq	8(%rdi),%r9
3174	adcq	16(%rdi),%r10
3175	adcq	24(%rdi),%r11
3176	adcq	32(%rdi),%r12
3177	adcq	40(%rdi),%r13
3178	adcq	48(%rdi),%r14
3179	adcq	56(%rdi),%r15
3180	leaq	64(%rdi),%rdi
3181.byte	0x67
3182	sbbq	%rax,%rax
3183	xorl	%ebx,%ebx
3184	movq	%rax,16+8(%rsp)
3185	jmp	.Lsqrx8x_loop
3186
3187.align	32
3188.Lsqrx8x_break:
3189	xorq	%rbp,%rbp
3190	subq	16+8(%rsp),%rbx
3191	adcxq	%rbp,%r8
3192	movq	24+8(%rsp),%rcx
3193	adcxq	%rbp,%r9
3194	movq	0(%rsi),%rdx
3195	adcq	$0,%r10
3196	movq	%r8,0(%rdi)
3197	adcq	$0,%r11
3198	adcq	$0,%r12
3199	adcq	$0,%r13
3200	adcq	$0,%r14
3201	adcq	$0,%r15
3202	cmpq	%rcx,%rdi
3203	je	.Lsqrx8x_outer_loop
3204
3205	movq	%r9,8(%rdi)
3206	movq	8(%rcx),%r9
3207	movq	%r10,16(%rdi)
3208	movq	16(%rcx),%r10
3209	movq	%r11,24(%rdi)
3210	movq	24(%rcx),%r11
3211	movq	%r12,32(%rdi)
3212	movq	32(%rcx),%r12
3213	movq	%r13,40(%rdi)
3214	movq	40(%rcx),%r13
3215	movq	%r14,48(%rdi)
3216	movq	48(%rcx),%r14
3217	movq	%r15,56(%rdi)
3218	movq	56(%rcx),%r15
3219	movq	%rcx,%rdi
3220	jmp	.Lsqrx8x_outer_loop
3221
3222.align	32
3223.Lsqrx8x_outer_break:
3224	movq	%r9,72(%rdi)
3225.byte	102,72,15,126,217
3226	movq	%r10,80(%rdi)
3227	movq	%r11,88(%rdi)
3228	movq	%r12,96(%rdi)
3229	movq	%r13,104(%rdi)
3230	movq	%r14,112(%rdi)
3231	leaq	48+8(%rsp),%rdi
3232	movq	(%rsi,%rcx,1),%rdx
3233
3234	movq	8(%rdi),%r11
3235	xorq	%r10,%r10
3236	movq	0+8(%rsp),%r9
3237	adoxq	%r11,%r11
3238	movq	16(%rdi),%r12
3239	movq	24(%rdi),%r13
3240
3241
3242.align	32
3243.Lsqrx4x_shift_n_add:
3244	mulxq	%rdx,%rax,%rbx
3245	adoxq	%r12,%r12
3246	adcxq	%r10,%rax
3247.byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
3248.byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
3249	adoxq	%r13,%r13
3250	adcxq	%r11,%rbx
3251	movq	40(%rdi),%r11
3252	movq	%rax,0(%rdi)
3253	movq	%rbx,8(%rdi)
3254
3255	mulxq	%rdx,%rax,%rbx
3256	adoxq	%r10,%r10
3257	adcxq	%r12,%rax
3258	movq	16(%rsi,%rcx,1),%rdx
3259	movq	48(%rdi),%r12
3260	adoxq	%r11,%r11
3261	adcxq	%r13,%rbx
3262	movq	56(%rdi),%r13
3263	movq	%rax,16(%rdi)
3264	movq	%rbx,24(%rdi)
3265
3266	mulxq	%rdx,%rax,%rbx
3267	adoxq	%r12,%r12
3268	adcxq	%r10,%rax
3269	movq	24(%rsi,%rcx,1),%rdx
3270	leaq	32(%rcx),%rcx
3271	movq	64(%rdi),%r10
3272	adoxq	%r13,%r13
3273	adcxq	%r11,%rbx
3274	movq	72(%rdi),%r11
3275	movq	%rax,32(%rdi)
3276	movq	%rbx,40(%rdi)
3277
3278	mulxq	%rdx,%rax,%rbx
3279	adoxq	%r10,%r10
3280	adcxq	%r12,%rax
3281	jrcxz	.Lsqrx4x_shift_n_add_break
3282.byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
3283	adoxq	%r11,%r11
3284	adcxq	%r13,%rbx
3285	movq	80(%rdi),%r12
3286	movq	88(%rdi),%r13
3287	movq	%rax,48(%rdi)
3288	movq	%rbx,56(%rdi)
3289	leaq	64(%rdi),%rdi
3290	nop
3291	jmp	.Lsqrx4x_shift_n_add
3292
3293.align	32
3294.Lsqrx4x_shift_n_add_break:
3295	adcxq	%r13,%rbx
3296	movq	%rax,48(%rdi)
3297	movq	%rbx,56(%rdi)
3298	leaq	64(%rdi),%rdi
3299.byte	102,72,15,126,213
3300__bn_sqrx8x_reduction:
3301	xorl	%eax,%eax
3302	movq	32+8(%rsp),%rbx
3303	movq	48+8(%rsp),%rdx
3304	leaq	-64(%rbp,%r9,1),%rcx
3305
3306	movq	%rcx,0+8(%rsp)
3307	movq	%rdi,8+8(%rsp)
3308
3309	leaq	48+8(%rsp),%rdi
3310	jmp	.Lsqrx8x_reduction_loop
3311
3312.align	32
3313.Lsqrx8x_reduction_loop:
3314	movq	8(%rdi),%r9
3315	movq	16(%rdi),%r10
3316	movq	24(%rdi),%r11
3317	movq	32(%rdi),%r12
3318	movq	%rdx,%r8
3319	imulq	%rbx,%rdx
3320	movq	40(%rdi),%r13
3321	movq	48(%rdi),%r14
3322	movq	56(%rdi),%r15
3323	movq	%rax,24+8(%rsp)
3324
3325	leaq	64(%rdi),%rdi
3326	xorq	%rsi,%rsi
3327	movq	$-8,%rcx
3328	jmp	.Lsqrx8x_reduce
3329
3330.align	32
3331.Lsqrx8x_reduce:
3332	movq	%r8,%rbx
3333	mulxq	0(%rbp),%rax,%r8
3334	adcxq	%rbx,%rax
3335	adoxq	%r9,%r8
3336
3337	mulxq	8(%rbp),%rbx,%r9
3338	adcxq	%rbx,%r8
3339	adoxq	%r10,%r9
3340
3341	mulxq	16(%rbp),%rbx,%r10
3342	adcxq	%rbx,%r9
3343	adoxq	%r11,%r10
3344
3345	mulxq	24(%rbp),%rbx,%r11
3346	adcxq	%rbx,%r10
3347	adoxq	%r12,%r11
3348
3349.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
3350	movq	%rdx,%rax
3351	movq	%r8,%rdx
3352	adcxq	%rbx,%r11
3353	adoxq	%r13,%r12
3354
3355	mulxq	32+8(%rsp),%rbx,%rdx
3356	movq	%rax,%rdx
3357	movq	%rax,64+48+8(%rsp,%rcx,8)
3358
3359	mulxq	40(%rbp),%rax,%r13
3360	adcxq	%rax,%r12
3361	adoxq	%r14,%r13
3362
3363	mulxq	48(%rbp),%rax,%r14
3364	adcxq	%rax,%r13
3365	adoxq	%r15,%r14
3366
3367	mulxq	56(%rbp),%rax,%r15
3368	movq	%rbx,%rdx
3369	adcxq	%rax,%r14
3370	adoxq	%rsi,%r15
3371	adcxq	%rsi,%r15
3372
3373.byte	0x67,0x67,0x67
3374	incq	%rcx
3375	jnz	.Lsqrx8x_reduce
3376
3377	movq	%rsi,%rax
3378	cmpq	0+8(%rsp),%rbp
3379	jae	.Lsqrx8x_no_tail
3380
3381	movq	48+8(%rsp),%rdx
3382	addq	0(%rdi),%r8
3383	leaq	64(%rbp),%rbp
3384	movq	$-8,%rcx
3385	adcxq	8(%rdi),%r9
3386	adcxq	16(%rdi),%r10
3387	adcq	24(%rdi),%r11
3388	adcq	32(%rdi),%r12
3389	adcq	40(%rdi),%r13
3390	adcq	48(%rdi),%r14
3391	adcq	56(%rdi),%r15
3392	leaq	64(%rdi),%rdi
3393	sbbq	%rax,%rax
3394
3395	xorq	%rsi,%rsi
3396	movq	%rax,16+8(%rsp)
3397	jmp	.Lsqrx8x_tail
3398
3399.align	32
3400.Lsqrx8x_tail:
3401	movq	%r8,%rbx
3402	mulxq	0(%rbp),%rax,%r8
3403	adcxq	%rax,%rbx
3404	adoxq	%r9,%r8
3405
3406	mulxq	8(%rbp),%rax,%r9
3407	adcxq	%rax,%r8
3408	adoxq	%r10,%r9
3409
3410	mulxq	16(%rbp),%rax,%r10
3411	adcxq	%rax,%r9
3412	adoxq	%r11,%r10
3413
3414	mulxq	24(%rbp),%rax,%r11
3415	adcxq	%rax,%r10
3416	adoxq	%r12,%r11
3417
3418.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
3419	adcxq	%rax,%r11
3420	adoxq	%r13,%r12
3421
3422	mulxq	40(%rbp),%rax,%r13
3423	adcxq	%rax,%r12
3424	adoxq	%r14,%r13
3425
3426	mulxq	48(%rbp),%rax,%r14
3427	adcxq	%rax,%r13
3428	adoxq	%r15,%r14
3429
3430	mulxq	56(%rbp),%rax,%r15
3431	movq	72+48+8(%rsp,%rcx,8),%rdx
3432	adcxq	%rax,%r14
3433	adoxq	%rsi,%r15
3434	movq	%rbx,(%rdi,%rcx,8)
3435	movq	%r8,%rbx
3436	adcxq	%rsi,%r15
3437
3438	incq	%rcx
3439	jnz	.Lsqrx8x_tail
3440
3441	cmpq	0+8(%rsp),%rbp
3442	jae	.Lsqrx8x_tail_done
3443
3444	subq	16+8(%rsp),%rsi
3445	movq	48+8(%rsp),%rdx
3446	leaq	64(%rbp),%rbp
3447	adcq	0(%rdi),%r8
3448	adcq	8(%rdi),%r9
3449	adcq	16(%rdi),%r10
3450	adcq	24(%rdi),%r11
3451	adcq	32(%rdi),%r12
3452	adcq	40(%rdi),%r13
3453	adcq	48(%rdi),%r14
3454	adcq	56(%rdi),%r15
3455	leaq	64(%rdi),%rdi
3456	sbbq	%rax,%rax
3457	subq	$8,%rcx
3458
3459	xorq	%rsi,%rsi
3460	movq	%rax,16+8(%rsp)
3461	jmp	.Lsqrx8x_tail
3462
3463.align	32
3464.Lsqrx8x_tail_done:
3465	xorq	%rax,%rax
3466	addq	24+8(%rsp),%r8
3467	adcq	$0,%r9
3468	adcq	$0,%r10
3469	adcq	$0,%r11
3470	adcq	$0,%r12
3471	adcq	$0,%r13
3472	adcq	$0,%r14
3473	adcq	$0,%r15
3474	adcq	$0,%rax
3475
3476	subq	16+8(%rsp),%rsi
3477.Lsqrx8x_no_tail:
3478	adcq	0(%rdi),%r8
3479.byte	102,72,15,126,217
3480	adcq	8(%rdi),%r9
3481	movq	56(%rbp),%rsi
3482.byte	102,72,15,126,213
3483	adcq	16(%rdi),%r10
3484	adcq	24(%rdi),%r11
3485	adcq	32(%rdi),%r12
3486	adcq	40(%rdi),%r13
3487	adcq	48(%rdi),%r14
3488	adcq	56(%rdi),%r15
3489	adcq	$0,%rax
3490
3491	movq	32+8(%rsp),%rbx
3492	movq	64(%rdi,%rcx,1),%rdx
3493
3494	movq	%r8,0(%rdi)
3495	leaq	64(%rdi),%r8
3496	movq	%r9,8(%rdi)
3497	movq	%r10,16(%rdi)
3498	movq	%r11,24(%rdi)
3499	movq	%r12,32(%rdi)
3500	movq	%r13,40(%rdi)
3501	movq	%r14,48(%rdi)
3502	movq	%r15,56(%rdi)
3503
3504	leaq	64(%rdi,%rcx,1),%rdi
3505	cmpq	8+8(%rsp),%r8
3506	jb	.Lsqrx8x_reduction_loop
3507	.byte	0xf3,0xc3
3508.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
3509.align	32
3510__bn_postx4x_internal:
3511	movq	0(%rbp),%r12
3512	movq	%rcx,%r10
3513	movq	%rcx,%r9
3514	negq	%rax
3515	sarq	$3+2,%rcx
3516
3517.byte	102,72,15,126,202
3518.byte	102,72,15,126,206
3519	decq	%r12
3520	movq	8(%rbp),%r13
3521	xorq	%r8,%r8
3522	movq	16(%rbp),%r14
3523	movq	24(%rbp),%r15
3524	jmp	.Lsqrx4x_sub_entry
3525
3526.align	16
3527.Lsqrx4x_sub:
3528	movq	0(%rbp),%r12
3529	movq	8(%rbp),%r13
3530	movq	16(%rbp),%r14
3531	movq	24(%rbp),%r15
3532.Lsqrx4x_sub_entry:
3533	andnq	%rax,%r12,%r12
3534	leaq	32(%rbp),%rbp
3535	andnq	%rax,%r13,%r13
3536	andnq	%rax,%r14,%r14
3537	andnq	%rax,%r15,%r15
3538
3539	negq	%r8
3540	adcq	0(%rdi),%r12
3541	adcq	8(%rdi),%r13
3542	adcq	16(%rdi),%r14
3543	adcq	24(%rdi),%r15
3544	movq	%r12,0(%rdx)
3545	leaq	32(%rdi),%rdi
3546	movq	%r13,8(%rdx)
3547	sbbq	%r8,%r8
3548	movq	%r14,16(%rdx)
3549	movq	%r15,24(%rdx)
3550	leaq	32(%rdx),%rdx
3551
3552	incq	%rcx
3553	jnz	.Lsqrx4x_sub
3554
3555	negq	%r9
3556
3557	.byte	0xf3,0xc3
3558.size	__bn_postx4x_internal,.-__bn_postx4x_internal
3559.globl	bn_get_bits5
3560.type	bn_get_bits5,@function
3561.align	16
3562bn_get_bits5:
3563	leaq	0(%rdi),%r10
3564	leaq	1(%rdi),%r11
3565	movl	%esi,%ecx
3566	shrl	$4,%esi
3567	andl	$15,%ecx
3568	leal	-8(%rcx),%eax
3569	cmpl	$11,%ecx
3570	cmovaq	%r11,%r10
3571	cmoval	%eax,%ecx
3572	movzwl	(%r10,%rsi,2),%eax
3573	shrl	%cl,%eax
3574	andl	$31,%eax
3575	.byte	0xf3,0xc3
3576.size	bn_get_bits5,.-bn_get_bits5
3577
3578.globl	bn_scatter5
3579.type	bn_scatter5,@function
3580.align	16
3581bn_scatter5:
3582	cmpl	$0,%esi
3583	jz	.Lscatter_epilogue
3584	leaq	(%rdx,%rcx,8),%rdx
3585.Lscatter:
3586	movq	(%rdi),%rax
3587	leaq	8(%rdi),%rdi
3588	movq	%rax,(%rdx)
3589	leaq	256(%rdx),%rdx
3590	subl	$1,%esi
3591	jnz	.Lscatter
3592.Lscatter_epilogue:
3593	.byte	0xf3,0xc3
3594.size	bn_scatter5,.-bn_scatter5
3595
3596.globl	bn_gather5
3597.type	bn_gather5,@function
3598.align	32
3599bn_gather5:
3600.LSEH_begin_bn_gather5:
3601
3602.byte	0x4c,0x8d,0x14,0x24
3603.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00
3604	leaq	.Linc(%rip),%rax
3605	andq	$-16,%rsp
3606
3607	movd	%ecx,%xmm5
3608	movdqa	0(%rax),%xmm0
3609	movdqa	16(%rax),%xmm1
3610	leaq	128(%rdx),%r11
3611	leaq	128(%rsp),%rax
3612
3613	pshufd	$0,%xmm5,%xmm5
3614	movdqa	%xmm1,%xmm4
3615	movdqa	%xmm1,%xmm2
3616	paddd	%xmm0,%xmm1
3617	pcmpeqd	%xmm5,%xmm0
3618	movdqa	%xmm4,%xmm3
3619
3620	paddd	%xmm1,%xmm2
3621	pcmpeqd	%xmm5,%xmm1
3622	movdqa	%xmm0,-128(%rax)
3623	movdqa	%xmm4,%xmm0
3624
3625	paddd	%xmm2,%xmm3
3626	pcmpeqd	%xmm5,%xmm2
3627	movdqa	%xmm1,-112(%rax)
3628	movdqa	%xmm4,%xmm1
3629
3630	paddd	%xmm3,%xmm0
3631	pcmpeqd	%xmm5,%xmm3
3632	movdqa	%xmm2,-96(%rax)
3633	movdqa	%xmm4,%xmm2
3634	paddd	%xmm0,%xmm1
3635	pcmpeqd	%xmm5,%xmm0
3636	movdqa	%xmm3,-80(%rax)
3637	movdqa	%xmm4,%xmm3
3638
3639	paddd	%xmm1,%xmm2
3640	pcmpeqd	%xmm5,%xmm1
3641	movdqa	%xmm0,-64(%rax)
3642	movdqa	%xmm4,%xmm0
3643
3644	paddd	%xmm2,%xmm3
3645	pcmpeqd	%xmm5,%xmm2
3646	movdqa	%xmm1,-48(%rax)
3647	movdqa	%xmm4,%xmm1
3648
3649	paddd	%xmm3,%xmm0
3650	pcmpeqd	%xmm5,%xmm3
3651	movdqa	%xmm2,-32(%rax)
3652	movdqa	%xmm4,%xmm2
3653	paddd	%xmm0,%xmm1
3654	pcmpeqd	%xmm5,%xmm0
3655	movdqa	%xmm3,-16(%rax)
3656	movdqa	%xmm4,%xmm3
3657
3658	paddd	%xmm1,%xmm2
3659	pcmpeqd	%xmm5,%xmm1
3660	movdqa	%xmm0,0(%rax)
3661	movdqa	%xmm4,%xmm0
3662
3663	paddd	%xmm2,%xmm3
3664	pcmpeqd	%xmm5,%xmm2
3665	movdqa	%xmm1,16(%rax)
3666	movdqa	%xmm4,%xmm1
3667
3668	paddd	%xmm3,%xmm0
3669	pcmpeqd	%xmm5,%xmm3
3670	movdqa	%xmm2,32(%rax)
3671	movdqa	%xmm4,%xmm2
3672	paddd	%xmm0,%xmm1
3673	pcmpeqd	%xmm5,%xmm0
3674	movdqa	%xmm3,48(%rax)
3675	movdqa	%xmm4,%xmm3
3676
3677	paddd	%xmm1,%xmm2
3678	pcmpeqd	%xmm5,%xmm1
3679	movdqa	%xmm0,64(%rax)
3680	movdqa	%xmm4,%xmm0
3681
3682	paddd	%xmm2,%xmm3
3683	pcmpeqd	%xmm5,%xmm2
3684	movdqa	%xmm1,80(%rax)
3685	movdqa	%xmm4,%xmm1
3686
3687	paddd	%xmm3,%xmm0
3688	pcmpeqd	%xmm5,%xmm3
3689	movdqa	%xmm2,96(%rax)
3690	movdqa	%xmm4,%xmm2
3691	movdqa	%xmm3,112(%rax)
3692	jmp	.Lgather
3693
3694.align	32
3695.Lgather:
3696	pxor	%xmm4,%xmm4
3697	pxor	%xmm5,%xmm5
3698	movdqa	-128(%r11),%xmm0
3699	movdqa	-112(%r11),%xmm1
3700	movdqa	-96(%r11),%xmm2
3701	pand	-128(%rax),%xmm0
3702	movdqa	-80(%r11),%xmm3
3703	pand	-112(%rax),%xmm1
3704	por	%xmm0,%xmm4
3705	pand	-96(%rax),%xmm2
3706	por	%xmm1,%xmm5
3707	pand	-80(%rax),%xmm3
3708	por	%xmm2,%xmm4
3709	por	%xmm3,%xmm5
3710	movdqa	-64(%r11),%xmm0
3711	movdqa	-48(%r11),%xmm1
3712	movdqa	-32(%r11),%xmm2
3713	pand	-64(%rax),%xmm0
3714	movdqa	-16(%r11),%xmm3
3715	pand	-48(%rax),%xmm1
3716	por	%xmm0,%xmm4
3717	pand	-32(%rax),%xmm2
3718	por	%xmm1,%xmm5
3719	pand	-16(%rax),%xmm3
3720	por	%xmm2,%xmm4
3721	por	%xmm3,%xmm5
3722	movdqa	0(%r11),%xmm0
3723	movdqa	16(%r11),%xmm1
3724	movdqa	32(%r11),%xmm2
3725	pand	0(%rax),%xmm0
3726	movdqa	48(%r11),%xmm3
3727	pand	16(%rax),%xmm1
3728	por	%xmm0,%xmm4
3729	pand	32(%rax),%xmm2
3730	por	%xmm1,%xmm5
3731	pand	48(%rax),%xmm3
3732	por	%xmm2,%xmm4
3733	por	%xmm3,%xmm5
3734	movdqa	64(%r11),%xmm0
3735	movdqa	80(%r11),%xmm1
3736	movdqa	96(%r11),%xmm2
3737	pand	64(%rax),%xmm0
3738	movdqa	112(%r11),%xmm3
3739	pand	80(%rax),%xmm1
3740	por	%xmm0,%xmm4
3741	pand	96(%rax),%xmm2
3742	por	%xmm1,%xmm5
3743	pand	112(%rax),%xmm3
3744	por	%xmm2,%xmm4
3745	por	%xmm3,%xmm5
3746	por	%xmm5,%xmm4
3747	leaq	256(%r11),%r11
3748	pshufd	$0x4e,%xmm4,%xmm0
3749	por	%xmm4,%xmm0
3750	movq	%xmm0,(%rdi)
3751	leaq	8(%rdi),%rdi
3752	subl	$1,%esi
3753	jnz	.Lgather
3754
3755	leaq	(%r10),%rsp
3756	.byte	0xf3,0xc3
3757.LSEH_end_bn_gather5:
3758.size	bn_gather5,.-bn_gather5
3759.align	64
3760.Linc:
3761.long	0,0, 1,1
3762.long	2,2, 2,2
3763.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
3764