#include <machine/asm.h>
.text



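/*
 * Constant-time Montgomery multiplication with a 5-bit gather, apparently
 * the perlasm output of OpenSSL's x86_64-mont5 module.  Assumed SysV
 * argument layout for bn_mul_mont_gather5: %rdi=rp, %rsi=ap, %rdx=bp
 * (pre-scattered power table), %rcx=np, %r8=&n0, %r9d=num (limb count),
 * 8(%rsp)=power (table index).  Limb counts divisible by 8 are routed to
 * the unrolled 4x/MULX paths; everything else takes .Lmul_enter below.
 */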
.globl	bn_mul_mont_gather5
.type	bn_mul_mont_gather5,@function
.align	64
bn_mul_mont_gather5:
.cfi_startproc
	movl	%r9d,%r9d
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
	testl	$7,%r9d
	jnz	.Lmul_enter
	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	movd	8(%rsp),%xmm5
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

	negq	%r9
	movq	%rsp,%r11
	leaq	-280(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10









	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

.Lmul_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	leaq	.Linc(%rip),%r10
	movq	%rax,8(%rsp,%r9,8)
.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
.Lmul_body:

	leaq	128(%rdx),%r12
	movdqa	0(%r10),%xmm0
	movdqa	16(%r10),%xmm1
	leaq	24-112(%rsp,%r9,8),%r10
	andq	$-16,%r10

	pshufd	$0,%xmm5,%xmm5
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
.byte	0x67
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,112(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,128(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,144(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,160(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,176(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,192(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,208(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,224(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,240(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,256(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,272(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,288(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,304(%r10)

	paddd	%xmm2,%xmm3
.byte	0x67
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,320(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,336(%r10)
	pand	64(%r12),%xmm0

	pand	80(%r12),%xmm1
	pand	96(%r12),%xmm2
	movdqa	%xmm3,352(%r10)
	pand	112(%r12),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-128(%r12),%xmm4
	movdqa	-112(%r12),%xmm5
	movdqa	-96(%r12),%xmm2
	pand	112(%r10),%xmm4
	movdqa	-80(%r12),%xmm3
	pand	128(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	144(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	160(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-64(%r12),%xmm4
	movdqa	-48(%r12),%xmm5
	movdqa	-32(%r12),%xmm2
	pand	176(%r10),%xmm4
	movdqa	-16(%r12),%xmm3
	pand	192(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	208(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	224(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	0(%r12),%xmm4
	movdqa	16(%r12),%xmm5
	movdqa	32(%r12),%xmm2
	pand	240(%r10),%xmm4
	movdqa	48(%r12),%xmm3
	pand	256(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	272(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	288(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	por	%xmm1,%xmm0
	pshufd	$0x4e,%xmm0,%xmm1
	por	%xmm1,%xmm0
	leaq	256(%r12),%r12
.byte	102,72,15,126,195

	movq	(%r8),%r8
	movq	(%rsi),%rax

	xorq	%r14,%r14
	xorq	%r15,%r15

	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.L1st_enter

.align	16
.L1st:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.L1st_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	1(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.L1st


	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r9,8)
	movq	%rdx,%r13
	movq	%r10,%r11

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	jmp	.Louter
.align	16
.Louter:
	leaq	24+128(%rsp,%r9,8),%rdx
	andq	$-16,%rdx
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	movdqa	-128(%r12),%xmm0
	movdqa	-112(%r12),%xmm1
	movdqa	-96(%r12),%xmm2
	movdqa	-80(%r12),%xmm3
	pand	-128(%rdx),%xmm0
	pand	-112(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r12),%xmm0
	movdqa	-48(%r12),%xmm1
	movdqa	-32(%r12),%xmm2
	movdqa	-16(%r12),%xmm3
	pand	-64(%rdx),%xmm0
	pand	-48(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r12),%xmm0
	movdqa	16(%r12),%xmm1
	movdqa	32(%r12),%xmm2
	movdqa	48(%r12),%xmm3
	pand	0(%rdx),%xmm0
	pand	16(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r12),%xmm0
	movdqa	80(%r12),%xmm1
	movdqa	96(%r12),%xmm2
	movdqa	112(%r12),%xmm3
	pand	64(%rdx),%xmm0
	pand	80(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	pshufd	$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	leaq	256(%r12),%r12

	movq	(%rsi),%rax
.byte	102,72,15,126,195

	xorq	%r15,%r15
	movq	%r8,%rbp
	movq	(%rsp),%r10

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.Linner_enter

.align	16
.Linner:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.Linner_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.Linner

	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r9,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r9,8)
	movq	%rdx,%r13

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	cmpq	%r9,%r14
	jb	.Louter

	xorq	%r14,%r14
	movq	(%rsp),%rax
	leaq	(%rsp),%rsi
	movq	%r9,%r15
	jmp	.Lsub
.align	16
.Lsub:	sbbq	(%rcx,%r14,8),%rax
	movq	%rax,(%rdi,%r14,8)
	movq	8(%rsi,%r14,8),%rax
	leaq	1(%r14),%r14
	decq	%r15
	jnz	.Lsub

	sbbq	$0,%rax
	movq	$-1,%rbx
	xorq	%rax,%rbx
	xorq	%r14,%r14
	movq	%r9,%r15

.Lcopy:
	movq	(%rdi,%r14,8),%rcx
	movq	(%rsp,%r14,8),%rdx
	andq	%rbx,%rcx
	andq	%rax,%rdx
	movq	%r14,(%rsp,%r14,8)
	orq	%rcx,%rdx
	movq	%rdx,(%rdi,%r14,8)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	.Lcopy

	movq	8(%rsp,%r9,8),%rsi
.cfi_def_cfa	%rsi,8
	movq	$1,%rax

	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
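/*
 * bn_mul4x_mont_gather5 — 4-way unrolled variant, entered from the
 * dispatcher above for limb counts divisible by 8.  The $0x80108 mask on
 * OPENSSL_ia32cap_P word 2 appears to test the BMI1/BMI2/ADX feature
 * bits (an assumption based on the usual OpenSSL capability encoding);
 * when all are present, control diverts to the MULX path at
 * .Lmulx4x_enter.
 */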
.type	bn_mul4x_mont_gather5,@function
.align	32
bn_mul4x_mont_gather5:
.cfi_startproc
.byte	0x67
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lmul4x_enter:
	andl	$0x80108,%r11d
	cmpl	$0x80108,%r11d
	je	.Lmulx4x_enter
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lmul4x_prologue:

.byte	0x67
	shll	$3,%r9d
	leaq	(%r9,%r9,2),%r10
	negq	%r9










	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lmul4xsp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lmul4xsp_done

.align	32
.Lmul4xsp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lmul4xsp_done:
	andq	$-64,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmul4x_page_walk
	jmp	.Lmul4x_page_walk_done

.Lmul4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmul4x_page_walk
.Lmul4x_page_walk_done:

	negq	%r9

	movq	%rax,40(%rsp)
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lmul4x_body:

	call	mul4x_internal

	movq	40(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movq	$1,%rax

	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul4x_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5

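/*
 * mul4x_internal — body shared by bn_mul4x_mont_gather5 and bn_power5:
 * constant-time SSE2 gather of bp[power] (the pcmpeqd mask cascade),
 * then the 4x Montgomery multiply/reduce loops, finishing by jumping
 * into the .Lsqr4x_sub_entry conditional-subtraction tail.
 */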
.type	mul4x_internal,@function
.align	32
mul4x_internal:
.cfi_startproc
	shlq	$5,%r9
	movd	8(%rax),%xmm5
	leaq	.Linc(%rip),%rax
	leaq	128(%rdx,%r9,1),%r13
	shrq	$5,%r9
	movdqa	0(%rax),%xmm0
	movdqa	16(%rax),%xmm1
	leaq	88-112(%rsp,%r9,1),%r10
	leaq	128(%rdx),%r12

	pshufd	$0,%xmm5,%xmm5
	movdqa	%xmm1,%xmm4
.byte	0x67,0x67
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
.byte	0x67
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,112(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,128(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,144(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,160(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,176(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,192(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,208(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,224(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,240(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,256(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,272(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,288(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,304(%r10)

	paddd	%xmm2,%xmm3
.byte	0x67
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,320(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,336(%r10)
	pand	64(%r12),%xmm0

	pand	80(%r12),%xmm1
	pand	96(%r12),%xmm2
	movdqa	%xmm3,352(%r10)
	pand	112(%r12),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-128(%r12),%xmm4
	movdqa	-112(%r12),%xmm5
	movdqa	-96(%r12),%xmm2
	pand	112(%r10),%xmm4
	movdqa	-80(%r12),%xmm3
	pand	128(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	144(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	160(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-64(%r12),%xmm4
	movdqa	-48(%r12),%xmm5
	movdqa	-32(%r12),%xmm2
	pand	176(%r10),%xmm4
	movdqa	-16(%r12),%xmm3
	pand	192(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	208(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	224(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	0(%r12),%xmm4
	movdqa	16(%r12),%xmm5
	movdqa	32(%r12),%xmm2
	pand	240(%r10),%xmm4
	movdqa	48(%r12),%xmm3
	pand	256(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	272(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	288(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	por	%xmm1,%xmm0
	pshufd	$0x4e,%xmm0,%xmm1
	por	%xmm1,%xmm0
	leaq	256(%r12),%r12
.byte	102,72,15,126,195

	movq	%r13,16+8(%rsp)
	movq	%rdi,56+8(%rsp)

	movq	(%r8),%r8
	movq	(%rsi),%rax
	leaq	(%rsi,%r9,1),%rsi
	negq	%r9

	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp
	leaq	64+8(%rsp),%r14
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%r9),%r15
	leaq	32(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdi,(%r14)
	movq	%rdx,%r13
	jmp	.L1st4x

.align	32
.L1st4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx),%rax
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	0(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdi,(%r14)
	movq	%rdx,%r13

	addq	$32,%r15
	jnz	.L1st4x

	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx),%rax
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%r13

	leaq	(%rcx,%r9,1),%rcx

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	movq	%r13,-8(%r14)

	jmp	.Louter4x

.align	32
.Louter4x:
	leaq	16+128(%r14),%rdx
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	movdqa	-128(%r12),%xmm0
	movdqa	-112(%r12),%xmm1
	movdqa	-96(%r12),%xmm2
	movdqa	-80(%r12),%xmm3
	pand	-128(%rdx),%xmm0
	pand	-112(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r12),%xmm0
	movdqa	-48(%r12),%xmm1
	movdqa	-32(%r12),%xmm2
	movdqa	-16(%r12),%xmm3
	pand	-64(%rdx),%xmm0
	pand	-48(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r12),%xmm0
	movdqa	16(%r12),%xmm1
	movdqa	32(%r12),%xmm2
	movdqa	48(%r12),%xmm3
	pand	0(%rdx),%xmm0
	pand	16(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r12),%xmm0
	movdqa	80(%r12),%xmm1
	movdqa	96(%r12),%xmm2
	movdqa	112(%r12),%xmm3
	pand	64(%rdx),%xmm0
	pand	80(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	pshufd	$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	leaq	256(%r12),%r12
.byte	102,72,15,126,195

	movq	(%r14,%r9,1),%r10
	movq	%r8,%rbp
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp
	movq	%rdx,%r11
	movq	%rdi,(%r14)

	leaq	(%r14,%r9,1),%r14

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%r9),%r15
	leaq	32(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdx,%r13
	jmp	.Linner4x

.align	32
.Linner4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx),%rax
	adcq	$0,%rdx
	addq	16(%r14),%r10
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-32(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx),%rax
	adcq	$0,%rdx
	addq	-8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	0(%rcx),%rax
	adcq	$0,%rdx
	addq	(%r14),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%rcx),%rcx
	adcq	$0,%rdx
	movq	%r13,-8(%r14)
	movq	%rdx,%r13

	addq	$32,%r15
	jnz	.Linner4x

	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx),%rax
	adcq	$0,%rdx
	addq	16(%r14),%r10
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-32(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	%rbp,%rax
	movq	-8(%rcx),%rbp
	adcq	$0,%rdx
	addq	-8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%r13

	movq	%rdi,-16(%r14)
	leaq	(%rcx,%r9,1),%rcx

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%r14),%r13
	adcq	$0,%rdi
	movq	%r13,-8(%r14)

	cmpq	16+8(%rsp),%r12
	jb	.Louter4x
	xorq	%rax,%rax
	subq	%r13,%rbp
	adcq	%r15,%r15
	orq	%r15,%rdi
	subq	%rdi,%rax
	leaq	(%r14,%r9,1),%rbx
	movq	(%rcx),%r12
	leaq	(%rcx),%rbp
	movq	%r9,%rcx
	sarq	$3+2,%rcx
	movq	56+8(%rsp),%rdi
	decq	%r12
	xorq	%r10,%r10
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqr4x_sub_entry
.cfi_endproc
.size	mul4x_internal,.-mul4x_internal
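/*
 * bn_power5 — five back-to-back Montgomery squarings followed by one
 * gather-multiply, i.e. rp = ap^32 * tbl[power] in the Montgomery
 * domain: the primitive behind 5-bit fixed-window exponentiation.
 */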
.globl	bn_power5
.type	bn_power5,@function
.align	32
bn_power5:
.cfi_startproc
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
	andl	$0x80108,%r11d
	cmpl	$0x80108,%r11d
	je	.Lpowerx5_enter
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lpower5_prologue:

	shll	$3,%r9d
	leal	(%r9,%r9,2),%r10d
	negq	%r9
	movq	(%r8),%r8








	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lpwr_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lpwr_sp_done

.align	32
.Lpwr_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lpwr_sp_done:
	andq	$-64,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lpwr_page_walk
	jmp	.Lpwr_page_walk_done

.Lpwr_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lpwr_page_walk
.Lpwr_page_walk_done:

	movq	%r9,%r10
	negq	%r9










	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lpower5_body:
.byte	102,72,15,110,207
.byte	102,72,15,110,209
.byte	102,73,15,110,218
.byte	102,72,15,110,226

	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal

.byte	102,72,15,126,209
.byte	102,72,15,126,226
	movq	%rsi,%rdi
	movq	40(%rsp),%rax
	leaq	32(%rsp),%r8

	call	mul4x_internal

	movq	40(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movq	$1,%rax
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lpower5_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	bn_power5,.-bn_power5

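/*
 * bn_sqr8x_internal — Montgomery squaring: the cross products are
 * computed once (.Lsqr4x_1st/.Lsqr4x_outer/.Lsqr4x_inner), doubled and
 * merged with the diagonal squares in .Lsqr4x_shift_n_add, then reduced
 * eight limbs per pass by __bn_sqr8x_reduction.
 */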
.globl	bn_sqr8x_internal
.hidden	bn_sqr8x_internal
.type	bn_sqr8x_internal,@function
.align	32
bn_sqr8x_internal:
__bn_sqr8x_internal:
.cfi_startproc践








































































	leaq	32(%r10),%rbp
	leaq	(%rsi,%r9,1),%rsi

	movq	%r9,%rcx


	movq	-32(%rsi,%rbp,1),%r14
	leaq	48+8(%rsp,%r9,2),%rdi
	movq	-24(%rsi,%rbp,1),%rax
	leaq	-32(%rdi,%rbp,1),%rdi
	movq	-16(%rsi,%rbp,1),%rbx
	movq	%rax,%r15

	mulq	%r14
	movq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	movq	%r10,-24(%rdi,%rbp,1)

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	adcq	$0,%rdx
	movq	%r11,-16(%rdi,%rbp,1)
	movq	%rdx,%r10


	movq	-8(%rsi,%rbp,1),%rbx
	mulq	%r15
	movq	%rax,%r12
	movq	%rbx,%rax
	movq	%rdx,%r13

	leaq	(%rbp),%rcx
	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	adcq	$0,%r11
	addq	%r12,%r10
	adcq	$0,%r11
	movq	%r10,-8(%rdi,%rcx,1)
	jmp	.Lsqr4x_1st

.align	32
.Lsqr4x_1st:
	movq	(%rsi,%rcx,1),%rbx
	mulq	%r15
	addq	%rax,%r13
	movq	%rbx,%rax
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	movq	8(%rsi,%rcx,1),%rbx
	movq	%rdx,%r10
	adcq	$0,%r10
	addq	%r13,%r11
	adcq	$0,%r10


	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	movq	%r11,(%rdi,%rcx,1)
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	16(%rsi,%rcx,1),%rbx
	movq	%rdx,%r11
	adcq	$0,%r11
	addq	%r12,%r10
	adcq	$0,%r11

	mulq	%r15
	addq	%rax,%r13
	movq	%rbx,%rax
	movq	%r10,8(%rdi,%rcx,1)
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	movq	24(%rsi,%rcx,1),%rbx
	movq	%rdx,%r10
	adcq	$0,%r10
	addq	%r13,%r11
	adcq	$0,%r10


	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	movq	%r11,16(%rdi,%rcx,1)
	movq	%rdx,%r13
	adcq	$0,%r13
	leaq	32(%rcx),%rcx

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	adcq	$0,%r11
	addq	%r12,%r10
	adcq	$0,%r11
	movq	%r10,-8(%rdi,%rcx,1)

	cmpq	$0,%rcx
	jne	.Lsqr4x_1st

	mulq	%r15
	addq	%rax,%r13
	leaq	16(%rbp),%rbp
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx

	movq	%r13,(%rdi)
	movq	%rdx,%r12
	movq	%rdx,8(%rdi)
	jmp	.Lsqr4x_outer

.align	32
.Lsqr4x_outer:
	movq	-32(%rsi,%rbp,1),%r14
	leaq	48+8(%rsp,%r9,2),%rdi
	movq	-24(%rsi,%rbp,1),%rax
	leaq	-32(%rdi,%rbp,1),%rdi
	movq	-16(%rsi,%rbp,1),%rbx
	movq	%rax,%r15

	mulq	%r14
	movq	-24(%rdi,%rbp,1),%r10
	addq	%rax,%r10
	movq	%rbx,%rax
	adcq	$0,%rdx
	movq	%r10,-24(%rdi,%rbp,1)
	movq	%rdx,%r11

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	adcq	$0,%rdx
	addq	-16(%rdi,%rbp,1),%r11
	movq	%rdx,%r10
	adcq	$0,%r10
	movq	%r11,-16(%rdi,%rbp,1)

	xorq	%r12,%r12

	movq	-8(%rsi,%rbp,1),%rbx
	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	adcq	$0,%rdx
	addq	-8(%rdi,%rbp,1),%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	adcq	$0,%rdx
	addq	%r12,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	movq	%r10,-8(%rdi,%rbp,1)

	leaq	(%rbp),%rcx
	jmp	.Lsqr4x_inner

.align	32
.Lsqr4x_inner:
	movq	(%rsi,%rcx,1),%rbx
	mulq	%r15
	addq	%rax,%r13
	movq	%rbx,%rax
	movq	%rdx,%r12
	adcq	$0,%r12
	addq	(%rdi,%rcx,1),%r13
	adcq	$0,%r12

.byte	0x67
	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	movq	8(%rsi,%rcx,1),%rbx
	movq	%rdx,%r10
	adcq	$0,%r10
	addq	%r13,%r11
	adcq	$0,%r10

	mulq	%r15
	addq	%rax,%r12
	movq	%r11,(%rdi,%rcx,1)
	movq	%rbx,%rax
	movq	%rdx,%r13
	adcq	$0,%r13
	addq	8(%rdi,%rcx,1),%r12
	leaq	16(%rcx),%rcx
	adcq	$0,%r13

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	adcq	$0,%rdx
	addq	%r12,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	movq	%r10,-8(%rdi,%rcx,1)

	cmpq	$0,%rcx
	jne	.Lsqr4x_inner

.byte	0x67
	mulq	%r15
	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx

	movq	%r13,(%rdi)
	movq	%rdx,%r12
	movq	%rdx,8(%rdi)

	addq	$16,%rbp
	jnz	.Lsqr4x_outer


	movq	-32(%rsi),%r14
	leaq	48+8(%rsp,%r9,2),%rdi
	movq	-24(%rsi),%rax
	leaq	-32(%rdi,%rbp,1),%rdi
	movq	-16(%rsi),%rbx
	movq	%rax,%r15

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	movq	%r10,-24(%rdi)
	movq	%rdx,%r10
	adcq	$0,%r10
	addq	%r13,%r11
	movq	-8(%rsi),%rbx
	adcq	$0,%r10

	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	movq	%r11,-16(%rdi)
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	adcq	$0,%r11
	addq	%r12,%r10
	adcq	$0,%r11
	movq	%r10,-8(%rdi)

	mulq	%r15
	addq	%rax,%r13
	movq	-16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx

	movq	%r13,(%rdi)
	movq	%rdx,%r12
	movq	%rdx,8(%rdi)

	mulq	%rbx
	addq	$16,%rbp
	xorq	%r14,%r14
	subq	%r9,%rbp
	xorq	%r15,%r15

	addq	%r12,%rax
	adcq	$0,%rdx
	movq	%rax,8(%rdi)
	movq	%rdx,16(%rdi)
	movq	%r15,24(%rdi)

	movq	-16(%rsi,%rbp,1),%rax
	leaq	48+8(%rsp),%rdi
	xorq	%r10,%r10
	movq	8(%rdi),%r11

	leaq	(%r14,%r10,2),%r12
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	24(%rdi),%r11
	adcq	%rax,%r12
	movq	-8(%rsi,%rbp,1),%rax
	movq	%r12,(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	movq	32(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	40(%rdi),%r11
	adcq	%rax,%rbx
	movq	0(%rsi,%rbp,1),%rax
	movq	%rbx,16(%rdi)
	adcq	%rdx,%r8
	leaq	16(%rbp),%rbp
	movq	%r8,24(%rdi)
	sbbq	%r15,%r15
	leaq	64(%rdi),%rdi
	jmp	.Lsqr4x_shift_n_add

.align	32
.Lsqr4x_shift_n_add:
	leaq	(%r14,%r10,2),%r12
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	-16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	-8(%rdi),%r11
	adcq	%rax,%r12
	movq	-8(%rsi,%rbp,1),%rax
	movq	%r12,-32(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,-24(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	movq	0(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	8(%rdi),%r11
	adcq	%rax,%rbx
	movq	0(%rsi,%rbp,1),%rax
	movq	%rbx,-16(%rdi)
	adcq	%rdx,%r8

	leaq	(%r14,%r10,2),%r12
	movq	%r8,-8(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	24(%rdi),%r11
	adcq	%rax,%r12
	movq	8(%rsi,%rbp,1),%rax
	movq	%r12,0(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	movq	32(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	40(%rdi),%r11
	adcq	%rax,%rbx
	movq	16(%rsi,%rbp,1),%rax
	movq	%rbx,16(%rdi)
	adcq	%rdx,%r8
	movq	%r8,24(%rdi)
	sbbq	%r15,%r15
	leaq	64(%rdi),%rdi
	addq	$32,%rbp
	jnz	.Lsqr4x_shift_n_add

	leaq	(%r14,%r10,2),%r12
.byte	0x67
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	-16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	-8(%rdi),%r11
	adcq	%rax,%r12
	movq	-8(%rsi),%rax
	movq	%r12,-32(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,-24(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	mulq	%rax
	negq	%r15
	adcq	%rax,%rbx
	adcq	%rdx,%r8
	movq	%rbx,-16(%rdi)
	movq	%r8,-8(%rdi)
.byte	102,72,15,126,213
__bn_sqr8x_reduction:
	xorq	%rax,%rax
	leaq	(%r9,%rbp,1),%rcx
	leaq	48+8(%rsp,%r9,2),%rdx
	movq	%rcx,0+8(%rsp)
	leaq	48+8(%rsp,%r9,1),%rdi
	movq	%rdx,8+8(%rsp)
	negq	%r9
	jmp	.L8x_reduction_loop

.align	32
.L8x_reduction_loop:
	leaq	(%rdi,%r9,1),%rdi
.byte	0x66
	movq	0(%rdi),%rbx
	movq	8(%rdi),%r9
	movq	16(%rdi),%r10
	movq	24(%rdi),%r11
	movq	32(%rdi),%r12
	movq	40(%rdi),%r13
	movq	48(%rdi),%r14
	movq	56(%rdi),%r15
	movq	%rax,(%rdx)
	leaq	64(%rdi),%rdi

.byte	0x67
	movq	%rbx,%r8
	imulq	32+8(%rsp),%rbx
	movq	0(%rbp),%rax
	movl	$8,%ecx
	jmp	.L8x_reduce

.align	32
.L8x_reduce:
	mulq	%rbx
	movq	8(%rbp),%rax
	negq	%r8
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	movq	%rbx,48-8+8(%rsp,%rcx,8)
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	32+8(%rsp),%rsi
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	imulq	%r8,%rsi
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rbp),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	%rsi,%rbx
	addq	%rax,%r15
	movq	0(%rbp),%rax
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	.L8x_reduce

	leaq	64(%rbp),%rbp
	xorq	%rax,%rax
	movq	8+8(%rsp),%rdx
	cmpq	0+8(%rsp),%rbp
	jae	.L8x_no_tail

.byte	0x66
	addq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi

	movq	48+56+8(%rsp),%rbx
	movl	$8,%ecx
	movq	0(%rbp),%rax
	jmp	.L8x_tail

.align	32
.L8x_tail:
	mulq	%rbx
	addq	%rax,%r8
	movq	8(%rbp),%rax
	movq	%r8,(%rdi)
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	leaq	8(%rdi),%rdi
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rbp),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	48-16+8(%rsp,%rcx,8),%rbx
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	0(%rbp),%rax
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	.L8x_tail

	leaq	64(%rbp),%rbp
	movq	8+8(%rsp),%rdx
	cmpq	0+8(%rsp),%rbp
	jae	.L8x_tail_done

	movq	48+56+8(%rsp),%rbx
	negq	%rsi
	movq	0(%rbp),%rax
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi

	movl	$8,%ecx
	jmp	.L8x_tail

.align	32
.L8x_tail_done:
	xorq	%rax,%rax
	addq	(%rdx),%r8
	adcq	$0,%r9
	adcq	$0,%r10
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15
	adcq	$0,%rax

	negq	%rsi
.L8x_no_tail:
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	adcq	$0,%rax
	movq	-8(%rbp),%rcx
	xorq	%rsi,%rsi

.byte	102,72,15,126,213

	movq	%r8,0(%rdi)
	movq	%r9,8(%rdi)
.byte	102,73,15,126,217
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)
	leaq	64(%rdi),%rdi

	cmpq	%rdx,%rdi
	jb	.L8x_reduction_loop
	.byte	0xf3,0xc3
.cfi_endproc
.size	bn_sqr8x_internal,.-bn_sqr8x_internal
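/*
 * __bn_post4x_internal — final constant-time conditional subtraction of
 * the modulus, four limbs per iteration: %rax holds an all-ones or
 * all-zeros mask, so the not/and/adc sequence below adds either -np or
 * 0 to the result without branching on secret data.
 */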
.type	__bn_post4x_internal,@function
.align	32
__bn_post4x_internal:
.cfi_startproc
	movq	0(%rbp),%r12
	leaq	(%rdi,%r9,1),%rbx
	movq	%r9,%rcx
.byte	102,72,15,126,207
	negq	%rax
.byte	102,72,15,126,206
	sarq	$3+2,%rcx
	decq	%r12
	xorq	%r10,%r10
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqr4x_sub_entry

.align	16
.Lsqr4x_sub:
	movq	0(%rbp),%r12
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
.Lsqr4x_sub_entry:
	leaq	32(%rbp),%rbp
	notq	%r12
	notq	%r13
	notq	%r14
	notq	%r15
	andq	%rax,%r12
	andq	%rax,%r13
	andq	%rax,%r14
	andq	%rax,%r15

	negq	%r10
	adcq	0(%rbx),%r12
	adcq	8(%rbx),%r13
	adcq	16(%rbx),%r14
	adcq	24(%rbx),%r15
	movq	%r12,0(%rdi)
	leaq	32(%rbx),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r10,%r10
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi

	incq	%rcx
	jnz	.Lsqr4x_sub

	movq	%r9,%r10
	negq	%r9
	.byte	0xf3,0xc3
.cfi_endproc
.size	__bn_post4x_internal,.-__bn_post4x_internal
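/*
 * bn_from_montgomery — leave the Montgomery domain (rp = ap/R mod np).
 * Only limb counts divisible by 8 are handled, via the tail call to
 * bn_from_mont8x; other sizes return 0 so the caller can fall back.
 */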
.globl	bn_from_montgomery
.type	bn_from_montgomery,@function
.align	32
bn_from_montgomery:
.cfi_startproc
	testl	$7,%r9d
	jz	bn_from_mont8x
	xorl	%eax,%eax
	.byte	0xf3,0xc3
.cfi_endproc
.size	bn_from_montgomery,.-bn_from_montgomery

.type	bn_from_mont8x,@function
.align	32
bn_from_mont8x:
.cfi_startproc
.byte	0x67
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lfrom_prologue:

	shll	$3,%r9d
	leaq	(%r9,%r9,2),%r10
	negq	%r9
	movq	(%r8),%r8








	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lfrom_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lfrom_sp_done

.align	32
.Lfrom_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lfrom_sp_done:
	andq	$-64,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lfrom_page_walk
	jmp	.Lfrom_page_walk_done

.Lfrom_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lfrom_page_walk
.Lfrom_page_walk_done:

	movq	%r9,%r10
	negq	%r9









	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lfrom_body:
	movq	%r9,%r11
	leaq	48(%rsp),%rax
	pxor	%xmm0,%xmm0
	jmp	.Lmul_by_1

.align	32
.Lmul_by_1:
	movdqu	(%rsi),%xmm1
	movdqu	16(%rsi),%xmm2
	movdqu	32(%rsi),%xmm3
	movdqa	%xmm0,(%rax,%r9,1)
	movdqu	48(%rsi),%xmm4
	movdqa	%xmm0,16(%rax,%r9,1)
.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00
	movdqa	%xmm1,(%rax)
	movdqa	%xmm0,32(%rax,%r9,1)
	movdqa	%xmm2,16(%rax)
	movdqa	%xmm0,48(%rax,%r9,1)
	movdqa	%xmm3,32(%rax)
	movdqa	%xmm4,48(%rax)
	leaq	64(%rax),%rax
	subq	$64,%r11
	jnz	.Lmul_by_1

.byte	102,72,15,110,207
.byte	102,72,15,110,209
.byte	0x67
	movq	%rcx,%rbp
.byte	102,73,15,110,218
	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
	andl	$0x80108,%r11d
	cmpl	$0x80108,%r11d
	jne	.Lfrom_mont_nox

	leaq	(%rax,%r9,1),%rdi
	call	__bn_sqrx8x_reduction
	call	__bn_postx4x_internal

	pxor	%xmm0,%xmm0
	leaq	48(%rsp),%rax
	jmp	.Lfrom_mont_zero

.align	32
.Lfrom_mont_nox:
	call	__bn_sqr8x_reduction
	call	__bn_post4x_internal

	pxor	%xmm0,%xmm0
	leaq	48(%rsp),%rax
	jmp	.Lfrom_mont_zero

.align	32
.Lfrom_mont_zero:
	movq	40(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm0,16(%rax)
	movdqa	%xmm0,32(%rax)
	movdqa	%xmm0,48(%rax)
	leaq	64(%rax),%rax
	subq	$32,%r9
	jnz	.Lfrom_mont_zero

	movq	$1,%rax
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lfrom_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	bn_from_mont8x,.-bn_from_mont8x
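/*
 * bn_mulx4x_mont_gather5 — MULX/ADCX/ADOX flavour of the 4x multiply,
 * reached from the dispatcher when the ia32cap test found BMI2+ADX.
 */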
.type	bn_mulx4x_mont_gather5,@function
.align	32
bn_mulx4x_mont_gather5:
.cfi_startproc
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lmulx4x_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lmulx4x_prologue:

	shll	$3,%r9d
	leaq	(%r9,%r9,2),%r10
	negq	%r9
	movq	(%r8),%r8










	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lmulx4xsp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lmulx4xsp_done

.Lmulx4xsp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lmulx4xsp_done:
	andq	$-64,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
	jmp	.Lmulx4x_page_walk_done

.Lmulx4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
.Lmulx4x_page_walk_done:













	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lmulx4x_body:
	call	mulx4x_internal

	movq	40(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movq	$1,%rax

	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmulx4x_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5

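/*
 * mulx4x_internal — MULX/ADX counterpart of mul4x_internal: two
 * independent carry chains (CF via adcx, OF via adox) let the multiply
 * and reduction limbs interleave without partial-flag stalls.
 */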
.type	mulx4x_internal,@function
.align	32
mulx4x_internal:
.cfi_startproc
	movq	%r9,8(%rsp)
	movq	%r9,%r10
	negq	%r9
	shlq	$5,%r9
	negq	%r10
	leaq	128(%rdx,%r9,1),%r13
	shrq	$5+5,%r9
	movd	8(%rax),%xmm5
	subq	$1,%r9
	leaq	.Linc(%rip),%rax
	movq	%r13,16+8(%rsp)
	movq	%r9,24+8(%rsp)
	movq	%rdi,56+8(%rsp)
	movdqa	0(%rax),%xmm0
	movdqa	16(%rax),%xmm1
	leaq	88-112(%rsp,%r10,1),%r10
	leaq	128(%rdx),%rdi

	pshufd	$0,%xmm5,%xmm5
	movdqa	%xmm1,%xmm4
.byte	0x67
	movdqa	%xmm1,%xmm2
.byte	0x67
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,112(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,128(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,144(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,160(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,176(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,192(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,208(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,224(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,240(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,256(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,272(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,288(%r10)
	movdqa	%xmm4,%xmm3
.byte	0x67
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,304(%r10)

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,320(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,336(%r10)

	pand	64(%rdi),%xmm0
	pand	80(%rdi),%xmm1
	pand	96(%rdi),%xmm2
	movdqa	%xmm3,352(%r10)
	pand	112(%rdi),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-128(%rdi),%xmm4
	movdqa	-112(%rdi),%xmm5
	movdqa	-96(%rdi),%xmm2
	pand	112(%r10),%xmm4
	movdqa	-80(%rdi),%xmm3
	pand	128(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	144(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	160(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-64(%rdi),%xmm4
	movdqa	-48(%rdi),%xmm5
	movdqa	-32(%rdi),%xmm2
	pand	176(%r10),%xmm4
	movdqa	-16(%rdi),%xmm3
	pand	192(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	208(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	224(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	0(%rdi),%xmm4
	movdqa	16(%rdi),%xmm5
	movdqa	32(%rdi),%xmm2
	pand	240(%r10),%xmm4
	movdqa	48(%rdi),%xmm3
	pand	256(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	272(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	288(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	pxor	%xmm1,%xmm0
	pshufd	$0x4e,%xmm0,%xmm1
	por	%xmm1,%xmm0
	leaq	256(%rdi),%rdi
.byte	102,72,15,126,194
	leaq	64+32+8(%rsp),%rbx

	movq	%rdx,%r9
	mulxq	0(%rsi),%r8,%rax
	mulxq	8(%rsi),%r11,%r12
	addq	%rax,%r11
	mulxq	16(%rsi),%rax,%r13
	adcq	%rax,%r12
	adcq	$0,%r13
	mulxq	24(%rsi),%rax,%r14

	movq	%r8,%r15
	imulq	32+8(%rsp),%r8
	xorq	%rbp,%rbp
	movq	%r8,%rdx

	movq	%rdi,8+8(%rsp)

	leaq	32(%rsi),%rsi
	adcxq	%rax,%r13
	adcxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	movq	24+8(%rsp),%rdi
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r11,-24(%rbx)
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r12,-16(%rbx)
	jmp	.Lmulx4x_1st

.align	32
.Lmulx4x_1st:
	adcxq	%rbp,%r15
	mulxq	0(%rsi),%r10,%rax
	adcxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
.byte	0x67,0x67
	movq	%r8,%rdx
	adcxq	%rax,%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-32(%rbx)
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_1st

	movq	8(%rsp),%rax
	adcq	%rbp,%r15
	leaq	(%rsi,%rax,1),%rsi
	addq	%r15,%r14
	movq	8+8(%rsp),%rdi
	adcq	%rbp,%rbp
	movq	%r14,-8(%rbx)
	jmp	.Lmulx4x_outer

.align	32
.Lmulx4x_outer:
	leaq	16-256(%rbx),%r10
	pxor	%xmm4,%xmm4
.byte	0x67,0x67
	pxor	%xmm5,%xmm5
	movdqa	-128(%rdi),%xmm0
	movdqa	-112(%rdi),%xmm1
	movdqa	-96(%rdi),%xmm2
	pand	256(%r10),%xmm0
	movdqa	-80(%rdi),%xmm3
	pand	272(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	288(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	304(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%rdi),%xmm0
	movdqa	-48(%rdi),%xmm1
	movdqa	-32(%rdi),%xmm2
	pand	320(%r10),%xmm0
	movdqa	-16(%rdi),%xmm3
	pand	336(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	352(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	368(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%rdi),%xmm0
	movdqa	16(%rdi),%xmm1
	movdqa	32(%rdi),%xmm2
	pand	384(%r10),%xmm0
	movdqa	48(%rdi),%xmm3
	pand	400(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	416(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	432(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%rdi),%xmm0
	movdqa	80(%rdi),%xmm1
	movdqa	96(%rdi),%xmm2
	pand	448(%r10),%xmm0
	movdqa	112(%rdi),%xmm3
	pand	464(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	480(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	496(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	pshufd	$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	leaq	256(%rdi),%rdi
.byte	102,72,15,126,194

	movq	%rbp,(%rbx)
	leaq	32(%rbx,%rax,1),%rbx
	mulxq	0(%rsi),%r8,%r11
	xorq	%rbp,%rbp
	movq	%rdx,%r9
	mulxq	8(%rsi),%r14,%r12
	adoxq	-32(%rbx),%r8
	adcxq	%r14,%r11
	mulxq	16(%rsi),%r15,%r13
	adoxq	-24(%rbx),%r11
	adcxq	%r15,%r12
	mulxq	24(%rsi),%rdx,%r14
	adoxq	-16(%rbx),%r12
	adcxq	%rdx,%r13
	leaq	(%rcx,%rax,1),%rcx
	leaq	32(%rsi),%rsi
	adoxq	-8(%rbx),%r13
	adcxq	%rbp,%r14
	adoxq	%rbp,%r14

	movq	%r8,%r15
	imulq	32+8(%rsp),%r8

	movq	%r8,%rdx
	xorq	%rbp,%rbp
	movq	%rdi,8+8(%rsp)

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	24+8(%rsp),%rdi
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-24(%rbx)
	adoxq	%rbp,%r15
	movq	%r12,-16(%rbx)
	leaq	32(%rcx),%rcx
	jmp	.Lmulx4x_inner

.align	32
.Lmulx4x_inner:
	mulxq	0(%rsi),%r10,%rax
	adcxq	%rbp,%r15
	adoxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	0(%rbx),%r10
	adoxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	8(%rbx),%r11
	adoxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
	movq	%r8,%rdx
	adcxq	16(%rbx),%r12
	adoxq	%rax,%r13
	adcxq	24(%rbx),%r13
	adoxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx
	adcxq	%rbp,%r14

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	adoxq	%r15,%r13
	movq	%r11,-32(%rbx)
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	leaq	32(%rcx),%rcx
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_inner

	movq	0+8(%rsp),%rax
	adcq	%rbp,%r15
	subq	0(%rbx),%rdi
	movq	8+8(%rsp),%rdi
	movq	16+8(%rsp),%r10
	adcq	%r15,%r14
	leaq	(%rsi,%rax,1),%rsi
	adcq	%rbp,%rbp
	movq	%r14,-8(%rbx)

	cmpq	%r10,%rdi
	jb	.Lmulx4x_outer

	movq	-8(%rcx),%r10
	movq	%rbp,%r8
	movq	(%rcx,%rax,1),%r12
	leaq	(%rcx,%rax,1),%rbp
	movq	%rax,%rcx
	leaq	(%rbx,%rax,1),%rdi
	xorl	%eax,%eax
	xorq	%r15,%r15
	subq	%r14,%r10
	adcq	%r15,%r15
	orq	%r15,%r8
	sarq	$3+2,%rcx
	subq	%r8,%rax
	movq	56+8(%rsp),%rdx
	decq	%r12
	movq	8(%rbp),%r13
	xorq	%r8,%r8
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqrx4x_sub_entry
.cfi_endproc
.size	mulx4x_internal,.-mulx4x_internal
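/*
 * bn_powerx5 — MULX/ADX counterpart of bn_power5: five squarings via
 * __bn_sqrx8x_internal, then one gather-multiply via mulx4x_internal.
 */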
.type	bn_powerx5,@function
.align	32
bn_powerx5:
.cfi_startproc
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lpowerx5_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lpowerx5_prologue:

	shll	$3,%r9d
	leaq	(%r9,%r9,2),%r10
	negq	%r9
	movq	(%r8),%r8








	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lpwrx_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lpwrx_sp_done

.align	32
.Lpwrx_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lpwrx_sp_done:
	andq	$-64,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lpwrx_page_walk
	jmp	.Lpwrx_page_walk_done

.Lpwrx_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lpwrx_page_walk
.Lpwrx_page_walk_done:

	movq	%r9,%r10
	negq	%r9












	pxor	%xmm0,%xmm0
.byte	102,72,15,110,207
.byte	102,72,15,110,209
.byte	102,73,15,110,218
.byte	102,72,15,110,226
	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lpowerx5_body:

	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal

	movq	%r10,%r9
	movq	%rsi,%rdi
.byte	102,72,15,126,209
.byte	102,72,15,126,226
	movq	40(%rsp),%rax

	call	mulx4x_internal

	movq	40(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movq	$1,%rax

	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lpowerx5_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	bn_powerx5,.-bn_powerx5

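/*
 * bn_sqrx8x_internal — MULX/ADX Montgomery squaring, structurally the
 * same as bn_sqr8x_internal but with dual carry chains throughout.
 */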
2901.globl	bn_sqrx8x_internal
2902.hidden	bn_sqrx8x_internal
2903.type	bn_sqrx8x_internal,@function
2904.align	32
2905bn_sqrx8x_internal:
2906__bn_sqrx8x_internal:
2907.cfi_startproc
	leaq	48+8(%rsp),%rdi
	leaq	(%rsi,%r9,1),%rbp
	movq	%r9,0+8(%rsp)
	movq	%rbp,8+8(%rsp)
	jmp	.Lsqr8x_zero_start

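# Clear the temporary result area, 64 bytes per iteration.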
.align	32
.byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00	# multi-byte nop
.Lsqrx8x_zero:
.byte	0x3e
	movdqa	%xmm0,0(%rdi)
	movdqa	%xmm0,16(%rdi)
	movdqa	%xmm0,32(%rdi)
	movdqa	%xmm0,48(%rdi)
.Lsqr8x_zero_start:
	movdqa	%xmm0,64(%rdi)
	movdqa	%xmm0,80(%rdi)
	movdqa	%xmm0,96(%rdi)
	movdqa	%xmm0,112(%rdi)
	leaq	128(%rdi),%rdi
	subq	$64,%r9
	jnz	.Lsqrx8x_zero

	movq	0(%rsi),%rdx

	xorq	%r10,%r10
	xorq	%r11,%r11
	xorq	%r12,%r12
	xorq	%r13,%r13
	xorq	%r14,%r14
	xorq	%r15,%r15
	leaq	48+8(%rsp),%rdi
	xorq	%rbp,%rbp
	jmp	.Lsqrx8x_outer_loop

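# Outer loop: multiply the current source limb (loaded into %rdx, as
# mulx requires) by the limbs above it. The .byte 0xc4,0xe2,...
# sequences are hand-encoded mulx instructions for pre-BMI2
# assemblers; their mnemonics are noted alongside.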
.align	32
.Lsqrx8x_outer_loop:
	mulxq	8(%rsi),%r8,%rax
	adcxq	%r9,%r8
	adoxq	%rax,%r10
	mulxq	16(%rsi),%r9,%rax
	adcxq	%r10,%r9
	adoxq	%rax,%r11
.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00	# mulxq 24(%rsi),%r10,%rax
	adcxq	%r11,%r10
	adoxq	%rax,%r12
.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00	# mulxq 32(%rsi),%r11,%rax
	adcxq	%r12,%r11
	adoxq	%rax,%r13
	mulxq	40(%rsi),%r12,%rax
	adcxq	%r13,%r12
	adoxq	%rax,%r14
	mulxq	48(%rsi),%r13,%rax
	adcxq	%r14,%r13
	adoxq	%r15,%rax
	mulxq	56(%rsi),%r14,%r15
	movq	8(%rsi),%rdx
	adcxq	%rax,%r14
	adoxq	%rbp,%r15
	adcq	64(%rdi),%r15
	movq	%r8,8(%rdi)
	movq	%r9,16(%rdi)
	sbbq	%rcx,%rcx
	xorq	%rbp,%rbp


	mulxq	16(%rsi),%r8,%rbx
	mulxq	24(%rsi),%r9,%rax
	adcxq	%r10,%r8
	adoxq	%rbx,%r9
	mulxq	32(%rsi),%r10,%rbx
	adcxq	%r11,%r9
	adoxq	%rax,%r10
.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00	# mulxq 40(%rsi),%r11,%rax
	adcxq	%r12,%r10
	adoxq	%rbx,%r11
.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulxq 48(%rsi),%r12,%rbx
	adcxq	%r13,%r11
	adoxq	%r14,%r12
.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00	# mulxq 56(%rsi),%r13,%r14
	movq	16(%rsi),%rdx
	adcxq	%rax,%r12
	adoxq	%rbx,%r13
	adcxq	%r15,%r13
	adoxq	%rbp,%r14
	adcxq	%rbp,%r14

	movq	%r8,24(%rdi)
	movq	%r9,32(%rdi)

	mulxq	24(%rsi),%r8,%rbx
	mulxq	32(%rsi),%r9,%rax
	adcxq	%r10,%r8
	adoxq	%rbx,%r9
	mulxq	40(%rsi),%r10,%rbx
	adcxq	%r11,%r9
	adoxq	%rax,%r10
.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00	# mulxq 48(%rsi),%r11,%rax
	adcxq	%r12,%r10
	adoxq	%r13,%r11
.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00	# mulxq 56(%rsi),%r12,%r13
.byte	0x3e
	movq	24(%rsi),%rdx
	adcxq	%rbx,%r11
	adoxq	%rax,%r12
	adcxq	%r14,%r12
	movq	%r8,40(%rdi)
	movq	%r9,48(%rdi)
	mulxq	32(%rsi),%r8,%rax
	adoxq	%rbp,%r13
	adcxq	%rbp,%r13

	mulxq	40(%rsi),%r9,%rbx
	adcxq	%r10,%r8
	adoxq	%rax,%r9
	mulxq	48(%rsi),%r10,%rax
	adcxq	%r11,%r9
	adoxq	%r12,%r10
	mulxq	56(%rsi),%r11,%r12
	movq	32(%rsi),%rdx
	movq	40(%rsi),%r14
	adcxq	%rbx,%r10
	adoxq	%rax,%r11
	movq	48(%rsi),%r15
	adcxq	%r13,%r11
	adoxq	%rbp,%r12
	adcxq	%rbp,%r12

	movq	%r8,56(%rdi)
	movq	%r9,64(%rdi)

	mulxq	%r14,%r9,%rax
	movq	56(%rsi),%r8
	adcxq	%r10,%r9
	mulxq	%r15,%r10,%rbx
	adoxq	%rax,%r10
	adcxq	%r11,%r10
	mulxq	%r8,%r11,%rax
	movq	%r14,%rdx
	adoxq	%rbx,%r11
	adcxq	%r12,%r11

	adcxq	%rbp,%rax

	mulxq	%r15,%r14,%rbx
	mulxq	%r8,%r12,%r13
	movq	%r15,%rdx
	leaq	64(%rsi),%rsi
	adcxq	%r14,%r11
	adoxq	%rbx,%r12
	adcxq	%rax,%r12
	adoxq	%rbp,%r13

.byte	0x67,0x67
	mulxq	%r8,%r8,%r14
	adcxq	%r8,%r13
	adcxq	%rbp,%r14

	cmpq	8+8(%rsp),%rsi
	je	.Lsqrx8x_outer_break

	negq	%rcx
	movq	$-8,%rcx
	movq	%rbp,%r15
	movq	64(%rdi),%r8
	adcxq	72(%rdi),%r9
	adcxq	80(%rdi),%r10
	adcxq	88(%rdi),%r11
	adcq	96(%rdi),%r12
	adcq	104(%rdi),%r13
	adcq	112(%rdi),%r14
	adcq	120(%rdi),%r15
	leaq	(%rsi),%rbp
	leaq	128(%rdi),%rdi
	sbbq	%rax,%rax

	movq	-64(%rsi),%rdx
	movq	%rax,16+8(%rsp)
	movq	%rdi,24+8(%rsp)


	xorl	%eax,%eax
	jmp	.Lsqrx8x_loop

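# Inner multiply-accumulate loop: adcx and adox drive two independent
# carry chains (CF and OF), so the two additions per partial product
# interleave without serializing on a single flag.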
.align	32
.Lsqrx8x_loop:
	movq	%r8,%rbx
	mulxq	0(%rbp),%rax,%r8
	adcxq	%rax,%rbx
	adoxq	%r9,%r8

	mulxq	8(%rbp),%rax,%r9
	adcxq	%rax,%r8
	adoxq	%r10,%r9

	mulxq	16(%rbp),%rax,%r10
	adcxq	%rax,%r9
	adoxq	%r11,%r10

	mulxq	24(%rbp),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11

.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulxq 32(%rbp),%rax,%r12
	adcxq	%rax,%r11
	adoxq	%r13,%r12

	mulxq	40(%rbp),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

	mulxq	48(%rbp),%rax,%r14
	movq	%rbx,(%rdi,%rcx,8)
	movl	$0,%ebx
	adcxq	%rax,%r13
	adoxq	%r15,%r14

.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00	# mulxq 56(%rbp),%rax,%r15
	movq	8(%rsi,%rcx,8),%rdx
	adcxq	%rax,%r14
	adoxq	%rbx,%r15
	adcxq	%rbx,%r15

.byte	0x67
	incq	%rcx
	jnz	.Lsqrx8x_loop

	leaq	64(%rbp),%rbp
	movq	$-8,%rcx
	cmpq	8+8(%rsp),%rbp
	je	.Lsqrx8x_break

	subq	16+8(%rsp),%rbx
.byte	0x66
	movq	-64(%rsi),%rdx
	adcxq	0(%rdi),%r8
	adcxq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	leaq	64(%rdi),%rdi
.byte	0x67
	sbbq	%rax,%rax
	xorl	%ebx,%ebx
	movq	%rax,16+8(%rsp)
	jmp	.Lsqrx8x_loop

.align	32
.Lsqrx8x_break:
	xorq	%rbp,%rbp
	subq	16+8(%rsp),%rbx
	adcxq	%rbp,%r8
	movq	24+8(%rsp),%rcx
	adcxq	%rbp,%r9
	movq	0(%rsi),%rdx
	adcq	$0,%r10
	movq	%r8,0(%rdi)
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15
	cmpq	%rcx,%rdi
	je	.Lsqrx8x_outer_loop

	movq	%r9,8(%rdi)
	movq	8(%rcx),%r9
	movq	%r10,16(%rdi)
	movq	16(%rcx),%r10
	movq	%r11,24(%rdi)
	movq	24(%rcx),%r11
	movq	%r12,32(%rdi)
	movq	32(%rcx),%r12
	movq	%r13,40(%rdi)
	movq	40(%rcx),%r13
	movq	%r14,48(%rdi)
	movq	48(%rcx),%r14
	movq	%r15,56(%rdi)
	movq	56(%rcx),%r15
	movq	%rcx,%rdi
	jmp	.Lsqrx8x_outer_loop

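# Off-diagonal pass complete: store the top limbs of t[] and move on
# to the shift-and-add phase.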
.align	32
.Lsqrx8x_outer_break:
	movq	%r9,72(%rdi)
.byte	102,72,15,126,217	# movq %xmm3,%rcx
	movq	%r10,80(%rdi)
	movq	%r11,88(%rdi)
	movq	%r12,96(%rdi)
	movq	%r13,104(%rdi)
	movq	%r14,112(%rdi)
	leaq	48+8(%rsp),%rdi
	movq	(%rsi,%rcx,1),%rdx

	movq	8(%rdi),%r11
	xorq	%r10,%r10
	movq	0+8(%rsp),%r9
	adoxq	%r11,%r11
	movq	16(%rdi),%r12
	movq	24(%rdi),%r13


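# Shift-and-add pass: the off-diagonal limbs are doubled via adox
# self-additions while mulxq %rdx,%rax,%rbx squares the current input
# limb, and adcx folds the squares into the doubled sum.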
.align	32
.Lsqrx4x_shift_n_add:
	mulxq	%rdx,%rax,%rbx
	adoxq	%r12,%r12
	adcxq	%r10,%rax
.byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00	# movq 8(%rsi,%rcx,1),%rdx
.byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00	# movq 32(%rdi),%r10
	adoxq	%r13,%r13
	adcxq	%r11,%rbx
	movq	40(%rdi),%r11
	movq	%rax,0(%rdi)
	movq	%rbx,8(%rdi)

	mulxq	%rdx,%rax,%rbx
	adoxq	%r10,%r10
	adcxq	%r12,%rax
	movq	16(%rsi,%rcx,1),%rdx
	movq	48(%rdi),%r12
	adoxq	%r11,%r11
	adcxq	%r13,%rbx
	movq	56(%rdi),%r13
	movq	%rax,16(%rdi)
	movq	%rbx,24(%rdi)

	mulxq	%rdx,%rax,%rbx
	adoxq	%r12,%r12
	adcxq	%r10,%rax
	movq	24(%rsi,%rcx,1),%rdx
	leaq	32(%rcx),%rcx
	movq	64(%rdi),%r10
	adoxq	%r13,%r13
	adcxq	%r11,%rbx
	movq	72(%rdi),%r11
	movq	%rax,32(%rdi)
	movq	%rbx,40(%rdi)

	mulxq	%rdx,%rax,%rbx
	adoxq	%r10,%r10
	adcxq	%r12,%rax
	jrcxz	.Lsqrx4x_shift_n_add_break
.byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00	# movq (%rsi,%rcx,1),%rdx
	adoxq	%r11,%r11
	adcxq	%r13,%rbx
	movq	80(%rdi),%r12
	movq	88(%rdi),%r13
	movq	%rax,48(%rdi)
	movq	%rbx,56(%rdi)
	leaq	64(%rdi),%rdi
	nop
	jmp	.Lsqrx4x_shift_n_add

.align	32
.Lsqrx4x_shift_n_add_break:
	adcxq	%r13,%rbx
	movq	%rax,48(%rdi)
	movq	%rbx,56(%rdi)
	leaq	64(%rdi),%rdi
.byte	102,72,15,126,213	# movq %xmm2,%rbp
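
# Montgomery reduction of the double-width square: %rbx holds n0
# (-n^{-1} mod 2^64) and %rbp the modulus; each pass folds one 8-limb
# chunk of t[] by adding m*n where m = t[0]*n0 mod 2^64.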
__bn_sqrx8x_reduction:
	xorl	%eax,%eax
	movq	32+8(%rsp),%rbx
	movq	48+8(%rsp),%rdx
	leaq	-64(%rbp,%r9,1),%rcx

	movq	%rcx,0+8(%rsp)
	movq	%rdi,8+8(%rsp)

	leaq	48+8(%rsp),%rdi
	jmp	.Lsqrx8x_reduction_loop

.align	32
.Lsqrx8x_reduction_loop:
	movq	8(%rdi),%r9
	movq	16(%rdi),%r10
	movq	24(%rdi),%r11
	movq	32(%rdi),%r12
	movq	%rdx,%r8
	imulq	%rbx,%rdx
	movq	40(%rdi),%r13
	movq	48(%rdi),%r14
	movq	56(%rdi),%r15
	movq	%rax,24+8(%rsp)

	leaq	64(%rdi),%rdi
	xorq	%rsi,%rsi
	movq	$-8,%rcx
	jmp	.Lsqrx8x_reduce

.align	32
.Lsqrx8x_reduce:
	movq	%r8,%rbx
	mulxq	0(%rbp),%rax,%r8
	adcxq	%rbx,%rax
	adoxq	%r9,%r8

	mulxq	8(%rbp),%rbx,%r9
	adcxq	%rbx,%r8
	adoxq	%r10,%r9

	mulxq	16(%rbp),%rbx,%r10
	adcxq	%rbx,%r9
	adoxq	%r11,%r10

	mulxq	24(%rbp),%rbx,%r11
	adcxq	%rbx,%r10
	adoxq	%r12,%r11

.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulxq 32(%rbp),%rbx,%r12
	movq	%rdx,%rax
	movq	%r8,%rdx
	adcxq	%rbx,%r11
	adoxq	%r13,%r12

	mulxq	32+8(%rsp),%rbx,%rdx
	movq	%rax,%rdx
	movq	%rax,64+48+8(%rsp,%rcx,8)

	mulxq	40(%rbp),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

	mulxq	48(%rbp),%rax,%r14
	adcxq	%rax,%r13
	adoxq	%r15,%r14

	mulxq	56(%rbp),%rax,%r15
	movq	%rbx,%rdx
	adcxq	%rax,%r14
	adoxq	%rsi,%r15
	adcxq	%rsi,%r15

.byte	0x67,0x67,0x67
	incq	%rcx
	jnz	.Lsqrx8x_reduce

	movq	%rsi,%rax
	cmpq	0+8(%rsp),%rbp
	jae	.Lsqrx8x_no_tail

	movq	48+8(%rsp),%rdx
	addq	0(%rdi),%r8
	leaq	64(%rbp),%rbp
	movq	$-8,%rcx
	adcxq	8(%rdi),%r9
	adcxq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	leaq	64(%rdi),%rdi
	sbbq	%rax,%rax

	xorq	%rsi,%rsi
	movq	%rax,16+8(%rsp)
	jmp	.Lsqrx8x_tail

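# Tail: continue folding the reduction across the remaining upper
# limbs of t[].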
.align	32
.Lsqrx8x_tail:
	movq	%r8,%rbx
	mulxq	0(%rbp),%rax,%r8
	adcxq	%rax,%rbx
	adoxq	%r9,%r8

	mulxq	8(%rbp),%rax,%r9
	adcxq	%rax,%r8
	adoxq	%r10,%r9

	mulxq	16(%rbp),%rax,%r10
	adcxq	%rax,%r9
	adoxq	%r11,%r10

	mulxq	24(%rbp),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11

.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulxq 32(%rbp),%rax,%r12
	adcxq	%rax,%r11
	adoxq	%r13,%r12

	mulxq	40(%rbp),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

	mulxq	48(%rbp),%rax,%r14
	adcxq	%rax,%r13
	adoxq	%r15,%r14

	mulxq	56(%rbp),%rax,%r15
	movq	72+48+8(%rsp,%rcx,8),%rdx
	adcxq	%rax,%r14
	adoxq	%rsi,%r15
	movq	%rbx,(%rdi,%rcx,8)
	movq	%r8,%rbx
	adcxq	%rsi,%r15

	incq	%rcx
	jnz	.Lsqrx8x_tail

	cmpq	0+8(%rsp),%rbp
	jae	.Lsqrx8x_tail_done

	subq	16+8(%rsp),%rsi
	movq	48+8(%rsp),%rdx
	leaq	64(%rbp),%rbp
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	leaq	64(%rdi),%rdi
	sbbq	%rax,%rax
	subq	$8,%rcx

	xorq	%rsi,%rsi
	movq	%rax,16+8(%rsp)
	jmp	.Lsqrx8x_tail

.align	32
.Lsqrx8x_tail_done:
	xorq	%rax,%rax
	addq	24+8(%rsp),%r8
	adcq	$0,%r9
	adcq	$0,%r10
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15
	adcq	$0,%rax

	subq	16+8(%rsp),%rsi
.Lsqrx8x_no_tail:
	adcq	0(%rdi),%r8
.byte	102,72,15,126,217	# movq %xmm3,%rcx
	adcq	8(%rdi),%r9
	movq	56(%rbp),%rsi
.byte	102,72,15,126,213	# movq %xmm2,%rbp
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	adcq	$0,%rax

	movq	32+8(%rsp),%rbx
	movq	64(%rdi,%rcx,1),%rdx

	movq	%r8,0(%rdi)
	leaq	64(%rdi),%r8
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	leaq	64(%rdi,%rcx,1),%rdi
	cmpq	8+8(%rsp),%r8
	jb	.Lsqrx8x_reduction_loop
	.byte	0xf3,0xc3	# repz ret
.cfi_endproc
.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
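
# Constant-time final subtraction: %rax becomes an all-ones mask when
# the result must be reduced; andn masks the modulus words so the adc
# chain adds either -n or 0 to t, subtracting the modulus only when
# needed, without a branch that depends on secret data.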
.align	32
__bn_postx4x_internal:
.cfi_startproc
	movq	0(%rbp),%r12
	movq	%rcx,%r10
	movq	%rcx,%r9
	negq	%rax
	sarq	$3+2,%rcx

.byte	102,72,15,126,202	# movq %xmm1,%rdx
.byte	102,72,15,126,206	# movq %xmm1,%rsi
	decq	%r12
	movq	8(%rbp),%r13
	xorq	%r8,%r8
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqrx4x_sub_entry

.align	16
.Lsqrx4x_sub:
	movq	0(%rbp),%r12
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
.Lsqrx4x_sub_entry:
	andnq	%rax,%r12,%r12
	leaq	32(%rbp),%rbp
	andnq	%rax,%r13,%r13
	andnq	%rax,%r14,%r14
	andnq	%rax,%r15,%r15

	negq	%r8
	adcq	0(%rdi),%r12
	adcq	8(%rdi),%r13
	adcq	16(%rdi),%r14
	adcq	24(%rdi),%r15
	movq	%r12,0(%rdx)
	leaq	32(%rdi),%rdi
	movq	%r13,8(%rdx)
	sbbq	%r8,%r8
	movq	%r14,16(%rdx)
	movq	%r15,24(%rdx)
	leaq	32(%rdx),%rdx

	incq	%rcx
	jnz	.Lsqrx4x_sub

	negq	%r9

	.byte	0xf3,0xc3	# repz ret
.cfi_endproc
.size	__bn_postx4x_internal,.-__bn_postx4x_internal
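
# bn_get_bits5(ap, off): extract the 5-bit window starting at bit
# offset off. A 16-bit load is used; when the window would straddle a
# 16-bit boundary (bit position in the word > 11), the load starts one
# byte later and the shift count drops by 8.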
.globl	bn_get_bits5
.type	bn_get_bits5,@function
.align	16
bn_get_bits5:
.cfi_startproc
	leaq	0(%rdi),%r10
	leaq	1(%rdi),%r11
	movl	%esi,%ecx
	shrl	$4,%esi
	andl	$15,%ecx
	leal	-8(%rcx),%eax
	cmpl	$11,%ecx
	cmovaq	%r11,%r10
	cmoval	%eax,%ecx
	movzwl	(%r10,%rsi,2),%eax
	shrl	%cl,%eax
	andl	$31,%eax
	.byte	0xf3,0xc3	# repz ret
.cfi_endproc
.size	bn_get_bits5,.-bn_get_bits5

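# bn_scatter5(inp, num, tbl, idx): store num words into column idx of
# the power table; consecutive words of one value sit 256 bytes apart,
# so the table interleaves 32 values.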
.globl	bn_scatter5
.type	bn_scatter5,@function
.align	16
bn_scatter5:
.cfi_startproc
	cmpl	$0,%esi
	jz	.Lscatter_epilogue
	leaq	(%rdx,%rcx,8),%rdx
.Lscatter:
	movq	(%rdi),%rax
	leaq	8(%rdi),%rdi
	movq	%rax,(%rdx)
	leaq	256(%rdx),%rdx
	subl	$1,%esi
	jnz	.Lscatter
.Lscatter_epilogue:
	.byte	0xf3,0xc3	# repz ret
.cfi_endproc
.size	bn_scatter5,.-bn_scatter5

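# bn_gather5(out, num, tbl, idx): constant-time gather. A 256-byte
# mask table is built on the stack by comparing indices 0..31
# (generated from .Linc) against idx; every table line is then read
# and AND/OR-combined, so the memory access pattern is independent of
# the secret index and leaks nothing to cache-timing attacks.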
.globl	bn_gather5
.type	bn_gather5,@function
.align	32
bn_gather5:
.LSEH_begin_bn_gather5:
.cfi_startproc

.byte	0x4c,0x8d,0x14,0x24	# leaq (%rsp),%r10
.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00	# subq $0x108,%rsp
	leaq	.Linc(%rip),%rax
	andq	$-16,%rsp

	movd	%ecx,%xmm5
	movdqa	0(%rax),%xmm0
	movdqa	16(%rax),%xmm1
	leaq	128(%rdx),%r11
	leaq	128(%rsp),%rax

	pshufd	$0,%xmm5,%xmm5
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,-128(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,-112(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,-96(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,-80(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,-64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,-48(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,-32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,-16(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,16(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,48(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,80(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,96(%rax)
	movdqa	%xmm4,%xmm2
	movdqa	%xmm3,112(%rax)
	jmp	.Lgather

.align	32
.Lgather:
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	movdqa	-128(%r11),%xmm0
	movdqa	-112(%r11),%xmm1
	movdqa	-96(%r11),%xmm2
	pand	-128(%rax),%xmm0
	movdqa	-80(%r11),%xmm3
	pand	-112(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r11),%xmm0
	movdqa	-48(%r11),%xmm1
	movdqa	-32(%r11),%xmm2
	pand	-64(%rax),%xmm0
	movdqa	-16(%r11),%xmm3
	pand	-48(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r11),%xmm0
	movdqa	16(%r11),%xmm1
	movdqa	32(%r11),%xmm2
	pand	0(%rax),%xmm0
	movdqa	48(%r11),%xmm3
	pand	16(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r11),%xmm0
	movdqa	80(%r11),%xmm1
	movdqa	96(%r11),%xmm2
	pand	64(%rax),%xmm0
	movdqa	112(%r11),%xmm3
	pand	80(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	leaq	256(%r11),%r11
	pshufd	$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	movq	%xmm0,(%rdi)
	leaq	8(%rdi),%rdi
	subl	$1,%esi
	jnz	.Lgather

	leaq	(%r10),%rsp
	.byte	0xf3,0xc3	# repz ret
.LSEH_end_bn_gather5:
.cfi_endproc
.size	bn_gather5,.-bn_gather5
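
# Vector constants for the index comparison above: {0,0,1,1} seeds the
# first pair of indices and {2,2,2,2} is the per-step increment.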
.align	64
.Linc:
.long	0,0, 1,1
.long	2,2, 2,2
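# "Montgomery Multiplication with scatter/gather for x86_64,
# CRYPTOGAMS by <appro@openssl.org>"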
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
