/* x86_64-mont5.S, revision 1.8 */
#include <machine/asm.h>
.text

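/*
 * bn_mul_mont_gather5 -- Montgomery multiplication fused with a
 * constant-time gather of the b operand from a table of 32 entries
 * (the "power" argument selects the entry).  Believed-correct C
 * prototype, per OpenSSL's x86_64-mont5.pl:
 *
 *	void bn_mul_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap,
 *		const BN_ULONG *bp, const BN_ULONG *np,
 *		const BN_ULONG *n0, int num, int power);
 *
 * When num is divisible by 8 the 4x-unrolled path is taken; otherwise
 * the generic word-by-word loop below is used.
 */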
.globl	bn_mul_mont_gather5
.type	bn_mul_mont_gather5,@function
.align	64
bn_mul_mont_gather5:
.cfi_startproc
	movl	%r9d,%r9d
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
	testl	$7,%r9d
	jnz	.Lmul_enter
	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	movd	8(%rsp),%xmm5
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

	negq	%r9
	movq	%rsp,%r11
	leaq	-280(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10

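/*
 * Carve the scratch area out below %rsp and, because the allocation
 * can span several pages, touch every 4096-byte page on the way down
 * (.Lmul_page_walk) so the kernel's stack guard page is never
 * skipped over.
 */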
	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

.Lmul_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	leaq	.Linc(%rip),%r10
	movq	%rax,8(%rsp,%r9,8)
.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
.Lmul_body:

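/*
 * Build 16 128-bit select masks: %xmm5 holds the requested index,
 * .Linc supplies running counters, and each pcmpeqd leaves all-ones
 * only in the matching lane.  The masks are spilled to the stack,
 * then ANDed against all 32 table entries and ORed together, so
 * every cache line of the table is read regardless of the index.
 * The .byte 102,72,15,126,195 run is movq %xmm0,%rbx with a pinned
 * encoding.
 */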
	leaq	128(%rdx),%r12
	movdqa	0(%r10),%xmm0
	movdqa	16(%r10),%xmm1
	leaq	24-112(%rsp,%r9,8),%r10
	andq	$-16,%r10

	pshufd	$0,%xmm5,%xmm5
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
.byte	0x67
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,112(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,128(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,144(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,160(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,176(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,192(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,208(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,224(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,240(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,256(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,272(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,288(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,304(%r10)

	paddd	%xmm2,%xmm3
.byte	0x67
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,320(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,336(%r10)
	pand	64(%r12),%xmm0

	pand	80(%r12),%xmm1
	pand	96(%r12),%xmm2
	movdqa	%xmm3,352(%r10)
	pand	112(%r12),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-128(%r12),%xmm4
	movdqa	-112(%r12),%xmm5
	movdqa	-96(%r12),%xmm2
	pand	112(%r10),%xmm4
	movdqa	-80(%r12),%xmm3
	pand	128(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	144(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	160(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-64(%r12),%xmm4
	movdqa	-48(%r12),%xmm5
	movdqa	-32(%r12),%xmm2
	pand	176(%r10),%xmm4
	movdqa	-16(%r12),%xmm3
	pand	192(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	208(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	224(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	0(%r12),%xmm4
	movdqa	16(%r12),%xmm5
	movdqa	32(%r12),%xmm2
	pand	240(%r10),%xmm4
	movdqa	48(%r12),%xmm3
	pand	256(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	272(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	288(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	por	%xmm1,%xmm0
	pshufd	$0x4e,%xmm0,%xmm1
	por	%xmm1,%xmm0
	leaq	256(%r12),%r12
.byte	102,72,15,126,195

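/*
 * First pass: %rbx now holds the gathered b[0].  This is the usual
 * word-serial Montgomery step: accumulate a[]*b[0], fold in n[]*m
 * with m = (low word * n0) mod 2^64, and keep the running result in
 * the scratch area.
 */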
	movq	(%r8),%r8
	movq	(%rsi),%rax

	xorq	%r14,%r14
	xorq	%r15,%r15

	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.L1st_enter

.align	16
.L1st:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.L1st_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	1(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.L1st


	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r9,8)
	movq	%rdx,%r13
	movq	%r10,%r11

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	jmp	.Louter
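/*
 * Outer loop: gather the next b[i] through the same mask table, then
 * repeat the multiply/reduce pass, this time also folding in the
 * previous iteration's partial result from the scratch area.
 */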
.align	16
.Louter:
	leaq	24+128(%rsp,%r9,8),%rdx
	andq	$-16,%rdx
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	movdqa	-128(%r12),%xmm0
	movdqa	-112(%r12),%xmm1
	movdqa	-96(%r12),%xmm2
	movdqa	-80(%r12),%xmm3
	pand	-128(%rdx),%xmm0
	pand	-112(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r12),%xmm0
	movdqa	-48(%r12),%xmm1
	movdqa	-32(%r12),%xmm2
	movdqa	-16(%r12),%xmm3
	pand	-64(%rdx),%xmm0
	pand	-48(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r12),%xmm0
	movdqa	16(%r12),%xmm1
	movdqa	32(%r12),%xmm2
	movdqa	48(%r12),%xmm3
	pand	0(%rdx),%xmm0
	pand	16(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r12),%xmm0
	movdqa	80(%r12),%xmm1
	movdqa	96(%r12),%xmm2
	movdqa	112(%r12),%xmm3
	pand	64(%rdx),%xmm0
	pand	80(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	pshufd	$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	leaq	256(%r12),%r12

	movq	(%rsi),%rax
.byte	102,72,15,126,195

	xorq	%r15,%r15
	movq	%r8,%rbp
	movq	(%rsp),%r10

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.Linner_enter

.align	16
.Linner:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.Linner_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.Linner

	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r9,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r9,8)
	movq	%rdx,%r13

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	cmpq	%r9,%r14
	jb	.Louter

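/*
 * Final reduction: subtract the modulus once, then use the borrow to
 * select between the subtracted and unsubtracted results with AND/OR
 * masks, so the choice leaks nothing through branches.  The .Lcopy
 * loop also overwrites the scratch words as the result is copied out.
 */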
	xorq	%r14,%r14
	movq	(%rsp),%rax
	leaq	(%rsp),%rsi
	movq	%r9,%r15
	jmp	.Lsub
.align	16
.Lsub:	sbbq	(%rcx,%r14,8),%rax
	movq	%rax,(%rdi,%r14,8)
	movq	8(%rsi,%r14,8),%rax
	leaq	1(%r14),%r14
	decq	%r15
	jnz	.Lsub

	sbbq	$0,%rax
	movq	$-1,%rbx
	xorq	%rax,%rbx
	xorq	%r14,%r14
	movq	%r9,%r15

.Lcopy:
	movq	(%rdi,%r14,8),%rcx
	movq	(%rsp,%r14,8),%rdx
	andq	%rbx,%rcx
	andq	%rax,%rdx
	movq	%r14,(%rsp,%r14,8)
	orq	%rcx,%rdx
	movq	%rdx,(%rdi,%r14,8)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	.Lcopy

	movq	8(%rsp,%r9,8),%rsi
.cfi_def_cfa	%rsi,8
	movq	$1,%rax

	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
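/*
 * 4x-unrolled variant, entered from bn_mul_mont_gather5 when num is
 * a multiple of 8.  %r11d carries an OPENSSL_ia32cap_P word; the
 * $0x80108 mask tests the BMI1/BMI2/ADX feature bits and, when all
 * three are set, control transfers to the MULX/ADCX/ADOX
 * implementation (bn_mulx4x_mont_gather5).
 */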
.type	bn_mul4x_mont_gather5,@function
.align	32
bn_mul4x_mont_gather5:
.cfi_startproc
.byte	0x67
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lmul4x_enter:
	andl	$0x80108,%r11d
	cmpl	$0x80108,%r11d
	je	.Lmulx4x_enter
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lmul4x_prologue:

.byte	0x67
	shll	$3,%r9d
	leaq	(%r9,%r9,2),%r10
	negq	%r9

	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lmul4xsp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lmul4xsp_done

.align	32
.Lmul4xsp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lmul4xsp_done:
	andq	$-64,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmul4x_page_walk
	jmp	.Lmul4x_page_walk_done

.Lmul4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmul4x_page_walk
.Lmul4x_page_walk_done:

	negq	%r9

	movq	%rax,40(%rsp)
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lmul4x_body:

	call	mul4x_internal

	movq	40(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movq	$1,%rax

	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul4x_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5

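/*
 * mul4x_internal -- body of the 4x path, shared with bn_power5.  The
 * gather/mask setup mirrors the scalar path (8(%rax) picks the
 * stack-passed power argument up through the saved stack pointer),
 * after which each pass of .L1st4x/.Linner4x retires four 64-bit
 * product words.
 */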
.type	mul4x_internal,@function
.align	32
mul4x_internal:
	shlq	$5,%r9
	movd	8(%rax),%xmm5
	leaq	.Linc(%rip),%rax
	leaq	128(%rdx,%r9,1),%r13
	shrq	$5,%r9
	movdqa	0(%rax),%xmm0
	movdqa	16(%rax),%xmm1
	leaq	88-112(%rsp,%r9,1),%r10
	leaq	128(%rdx),%r12

	pshufd	$0,%xmm5,%xmm5
	movdqa	%xmm1,%xmm4
.byte	0x67,0x67
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
.byte	0x67
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,112(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,128(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,144(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,160(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,176(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,192(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,208(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,224(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,240(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,256(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,272(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,288(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,304(%r10)

	paddd	%xmm2,%xmm3
.byte	0x67
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,320(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,336(%r10)
	pand	64(%r12),%xmm0

	pand	80(%r12),%xmm1
	pand	96(%r12),%xmm2
	movdqa	%xmm3,352(%r10)
	pand	112(%r12),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-128(%r12),%xmm4
	movdqa	-112(%r12),%xmm5
	movdqa	-96(%r12),%xmm2
	pand	112(%r10),%xmm4
	movdqa	-80(%r12),%xmm3
	pand	128(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	144(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	160(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-64(%r12),%xmm4
	movdqa	-48(%r12),%xmm5
	movdqa	-32(%r12),%xmm2
	pand	176(%r10),%xmm4
	movdqa	-16(%r12),%xmm3
	pand	192(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	208(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	224(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	0(%r12),%xmm4
	movdqa	16(%r12),%xmm5
	movdqa	32(%r12),%xmm2
	pand	240(%r10),%xmm4
	movdqa	48(%r12),%xmm3
	pand	256(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	272(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	288(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	por	%xmm1,%xmm0
	pshufd	$0x4e,%xmm0,%xmm1
	por	%xmm1,%xmm0
	leaq	256(%r12),%r12
.byte	102,72,15,126,195

	movq	%r13,16+8(%rsp)
	movq	%rdi,56+8(%rsp)

	movq	(%r8),%r8
	movq	(%rsi),%rax
	leaq	(%rsi,%r9,1),%rsi
	negq	%r9

	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp
	leaq	64+8(%rsp),%r14
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%r9),%r15
	leaq	32(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdi,(%r14)
	movq	%rdx,%r13
	jmp	.L1st4x

.align	32
.L1st4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx),%rax
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	0(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdi,(%r14)
	movq	%rdx,%r13

	addq	$32,%r15
	jnz	.L1st4x

	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx),%rax
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%r13

	leaq	(%rcx,%r9,1),%rcx

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	movq	%r13,-8(%r14)

	jmp	.Louter4x

.align	32
.Louter4x:
	leaq	16+128(%r14),%rdx
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	movdqa	-128(%r12),%xmm0
	movdqa	-112(%r12),%xmm1
	movdqa	-96(%r12),%xmm2
	movdqa	-80(%r12),%xmm3
	pand	-128(%rdx),%xmm0
	pand	-112(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r12),%xmm0
	movdqa	-48(%r12),%xmm1
	movdqa	-32(%r12),%xmm2
	movdqa	-16(%r12),%xmm3
	pand	-64(%rdx),%xmm0
	pand	-48(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r12),%xmm0
	movdqa	16(%r12),%xmm1
	movdqa	32(%r12),%xmm2
	movdqa	48(%r12),%xmm3
	pand	0(%rdx),%xmm0
	pand	16(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r12),%xmm0
	movdqa	80(%r12),%xmm1
	movdqa	96(%r12),%xmm2
	movdqa	112(%r12),%xmm3
	pand	64(%rdx),%xmm0
	pand	80(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	pshufd	$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	leaq	256(%r12),%r12
.byte	102,72,15,126,195

	movq	(%r14,%r9,1),%r10
	movq	%r8,%rbp
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp
	movq	%rdx,%r11
	movq	%rdi,(%r14)

	leaq	(%r14,%r9,1),%r14

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%r9),%r15
	leaq	32(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdx,%r13
	jmp	.Linner4x

.align	32
.Linner4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx),%rax
	adcq	$0,%rdx
	addq	16(%r14),%r10
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-32(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx),%rax
	adcq	$0,%rdx
	addq	-8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	0(%rcx),%rax
	adcq	$0,%rdx
	addq	(%r14),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%rcx),%rcx
	adcq	$0,%rdx
	movq	%r13,-8(%r14)
	movq	%rdx,%r13

	addq	$32,%r15
	jnz	.Linner4x

	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx),%rax
	adcq	$0,%rdx
	addq	16(%r14),%r10
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-32(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	%rbp,%rax
	movq	-8(%rcx),%rbp
	adcq	$0,%rdx
	addq	-8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%r13

	movq	%rdi,-16(%r14)
	leaq	(%rcx,%r9,1),%rcx

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%r14),%r13
	adcq	$0,%rdi
	movq	%r13,-8(%r14)

	cmpq	16+8(%rsp),%r12
	jb	.Louter4x
	xorq	%rax,%rax
	subq	%r13,%rbp
	adcq	%r15,%r15
	orq	%r15,%rdi
	subq	%rdi,%rax
	leaq	(%r14,%r9,1),%rbx
	movq	(%rcx),%r12
	leaq	(%rcx),%rbp
	movq	%r9,%rcx
	sarq	$3+2,%rcx
	movq	56+8(%rsp),%rdi
	decq	%r12
	xorq	%r10,%r10
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqr4x_sub_entry
.size	mul4x_internal,.-mul4x_internal
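/*
 * bn_power5 -- five consecutive Montgomery squarings followed by one
 * Montgomery multiplication with a gathered table entry, i.e. one
 * window step of a constant-time fixed-window (2^5) modular
 * exponentiation.  Same argument layout as bn_mul_mont_gather5.
 */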
.globl	bn_power5
.type	bn_power5,@function
.align	32
bn_power5:
.cfi_startproc
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
	andl	$0x80108,%r11d
	cmpl	$0x80108,%r11d
	je	.Lpowerx5_enter
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lpower5_prologue:

	shll	$3,%r9d
	leal	(%r9,%r9,2),%r10d
	negq	%r9
	movq	(%r8),%r8

	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lpwr_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lpwr_sp_done

.align	32
.Lpwr_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lpwr_sp_done:
	andq	$-64,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lpwr_page_walk
	jmp	.Lpwr_page_walk_done

.Lpwr_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lpwr_page_walk
.Lpwr_page_walk_done:

	movq	%r9,%r10
	negq	%r9

	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lpower5_body:
.byte	102,72,15,110,207
.byte	102,72,15,110,209
.byte	102,73,15,110,218
.byte	102,72,15,110,226

	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal

.byte	102,72,15,126,209
.byte	102,72,15,126,226
	movq	%rsi,%rdi
	movq	40(%rsp),%rax
	leaq	32(%rsp),%r8

	call	mul4x_internal

	movq	40(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movq	$1,%rax
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lpower5_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	bn_power5,.-bn_power5

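/*
 * bn_sqr8x_internal -- Montgomery squaring, eight limbs per pass.
 * Roughly: the cross products a[i]*a[j] (i<j) are accumulated first,
 * then doubled and merged with the diagonal squares a[i]^2 in
 * .Lsqr4x_shift_n_add, after which __bn_sqr8x_reduction folds the
 * modulus back in.
 */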
.globl	bn_sqr8x_internal
.hidden	bn_sqr8x_internal
.type	bn_sqr8x_internal,@function
.align	32
bn_sqr8x_internal:
__bn_sqr8x_internal:

	leaq	32(%r10),%rbp
	leaq	(%rsi,%r9,1),%rsi

	movq	%r9,%rcx

	movq	-32(%rsi,%rbp,1),%r14
	leaq	48+8(%rsp,%r9,2),%rdi
	movq	-24(%rsi,%rbp,1),%rax
	leaq	-32(%rdi,%rbp,1),%rdi
	movq	-16(%rsi,%rbp,1),%rbx
	movq	%rax,%r15

	mulq	%r14
	movq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	movq	%r10,-24(%rdi,%rbp,1)

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	adcq	$0,%rdx
	movq	%r11,-16(%rdi,%rbp,1)
	movq	%rdx,%r10

	movq	-8(%rsi,%rbp,1),%rbx
	mulq	%r15
	movq	%rax,%r12
	movq	%rbx,%rax
	movq	%rdx,%r13

	leaq	(%rbp),%rcx
	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	adcq	$0,%r11
	addq	%r12,%r10
	adcq	$0,%r11
	movq	%r10,-8(%rdi,%rcx,1)
	jmp	.Lsqr4x_1st

.align	32
.Lsqr4x_1st:
	movq	(%rsi,%rcx,1),%rbx
	mulq	%r15
	addq	%rax,%r13
	movq	%rbx,%rax
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	movq	8(%rsi,%rcx,1),%rbx
	movq	%rdx,%r10
	adcq	$0,%r10
	addq	%r13,%r11
	adcq	$0,%r10


	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	movq	%r11,(%rdi,%rcx,1)
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	16(%rsi,%rcx,1),%rbx
	movq	%rdx,%r11
	adcq	$0,%r11
	addq	%r12,%r10
	adcq	$0,%r11

	mulq	%r15
	addq	%rax,%r13
	movq	%rbx,%rax
	movq	%r10,8(%rdi,%rcx,1)
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	movq	24(%rsi,%rcx,1),%rbx
	movq	%rdx,%r10
	adcq	$0,%r10
	addq	%r13,%r11
	adcq	$0,%r10


	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	movq	%r11,16(%rdi,%rcx,1)
	movq	%rdx,%r13
	adcq	$0,%r13
	leaq	32(%rcx),%rcx

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	adcq	$0,%r11
	addq	%r12,%r10
	adcq	$0,%r11
	movq	%r10,-8(%rdi,%rcx,1)

	cmpq	$0,%rcx
	jne	.Lsqr4x_1st

	mulq	%r15
	addq	%rax,%r13
	leaq	16(%rbp),%rbp
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx

	movq	%r13,(%rdi)
	movq	%rdx,%r12
	movq	%rdx,8(%rdi)
	jmp	.Lsqr4x_outer

.align	32
.Lsqr4x_outer:
	movq	-32(%rsi,%rbp,1),%r14
	leaq	48+8(%rsp,%r9,2),%rdi
	movq	-24(%rsi,%rbp,1),%rax
	leaq	-32(%rdi,%rbp,1),%rdi
	movq	-16(%rsi,%rbp,1),%rbx
	movq	%rax,%r15

	mulq	%r14
	movq	-24(%rdi,%rbp,1),%r10
	addq	%rax,%r10
	movq	%rbx,%rax
	adcq	$0,%rdx
	movq	%r10,-24(%rdi,%rbp,1)
	movq	%rdx,%r11

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	adcq	$0,%rdx
	addq	-16(%rdi,%rbp,1),%r11
	movq	%rdx,%r10
	adcq	$0,%r10
	movq	%r11,-16(%rdi,%rbp,1)

	xorq	%r12,%r12

	movq	-8(%rsi,%rbp,1),%rbx
	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	adcq	$0,%rdx
	addq	-8(%rdi,%rbp,1),%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	adcq	$0,%rdx
	addq	%r12,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	movq	%r10,-8(%rdi,%rbp,1)

	leaq	(%rbp),%rcx
	jmp	.Lsqr4x_inner

.align	32
.Lsqr4x_inner:
	movq	(%rsi,%rcx,1),%rbx
	mulq	%r15
	addq	%rax,%r13
	movq	%rbx,%rax
	movq	%rdx,%r12
	adcq	$0,%r12
	addq	(%rdi,%rcx,1),%r13
	adcq	$0,%r12

.byte	0x67
	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	movq	8(%rsi,%rcx,1),%rbx
	movq	%rdx,%r10
	adcq	$0,%r10
	addq	%r13,%r11
	adcq	$0,%r10

	mulq	%r15
	addq	%rax,%r12
	movq	%r11,(%rdi,%rcx,1)
	movq	%rbx,%rax
	movq	%rdx,%r13
	adcq	$0,%r13
	addq	8(%rdi,%rcx,1),%r12
	leaq	16(%rcx),%rcx
	adcq	$0,%r13

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	adcq	$0,%rdx
	addq	%r12,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	movq	%r10,-8(%rdi,%rcx,1)

	cmpq	$0,%rcx
	jne	.Lsqr4x_inner

.byte	0x67
	mulq	%r15
	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx

	movq	%r13,(%rdi)
	movq	%rdx,%r12
	movq	%rdx,8(%rdi)

	addq	$16,%rbp
	jnz	.Lsqr4x_outer


	movq	-32(%rsi),%r14
	leaq	48+8(%rsp,%r9,2),%rdi
	movq	-24(%rsi),%rax
	leaq	-32(%rdi,%rbp,1),%rdi
	movq	-16(%rsi),%rbx
	movq	%rax,%r15

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	movq	%r10,-24(%rdi)
	movq	%rdx,%r10
	adcq	$0,%r10
	addq	%r13,%r11
	movq	-8(%rsi),%rbx
	adcq	$0,%r10

	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	movq	%r11,-16(%rdi)
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	adcq	$0,%r11
	addq	%r12,%r10
	adcq	$0,%r11
	movq	%r10,-8(%rdi)

	mulq	%r15
	addq	%rax,%r13
	movq	-16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx

	movq	%r13,(%rdi)
	movq	%rdx,%r12
	movq	%rdx,8(%rdi)

	mulq	%rbx
	addq	$16,%rbp
	xorq	%r14,%r14
	subq	%r9,%rbp
	xorq	%r15,%r15

	addq	%r12,%rax
	adcq	$0,%rdx
	movq	%rax,8(%rdi)
	movq	%rdx,16(%rdi)
	movq	%r15,24(%rdi)

	movq	-16(%rsi,%rbp,1),%rax
	leaq	48+8(%rsp),%rdi
	xorq	%r10,%r10
	movq	8(%rdi),%r11

	leaq	(%r14,%r10,2),%r12
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	24(%rdi),%r11
	adcq	%rax,%r12
	movq	-8(%rsi,%rbp,1),%rax
	movq	%r12,(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	movq	32(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	40(%rdi),%r11
	adcq	%rax,%rbx
	movq	0(%rsi,%rbp,1),%rax
	movq	%rbx,16(%rdi)
	adcq	%rdx,%r8
	leaq	16(%rbp),%rbp
	movq	%r8,24(%rdi)
	sbbq	%r15,%r15
	leaq	64(%rdi),%rdi
	jmp	.Lsqr4x_shift_n_add

.align	32
.Lsqr4x_shift_n_add:
	leaq	(%r14,%r10,2),%r12
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	-16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	-8(%rdi),%r11
	adcq	%rax,%r12
	movq	-8(%rsi,%rbp,1),%rax
	movq	%r12,-32(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,-24(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	movq	0(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	8(%rdi),%r11
	adcq	%rax,%rbx
	movq	0(%rsi,%rbp,1),%rax
	movq	%rbx,-16(%rdi)
	adcq	%rdx,%r8

	leaq	(%r14,%r10,2),%r12
	movq	%r8,-8(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	24(%rdi),%r11
	adcq	%rax,%r12
	movq	8(%rsi,%rbp,1),%rax
	movq	%r12,0(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	movq	32(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	40(%rdi),%r11
	adcq	%rax,%rbx
	movq	16(%rsi,%rbp,1),%rax
	movq	%rbx,16(%rdi)
	adcq	%rdx,%r8
	movq	%r8,24(%rdi)
	sbbq	%r15,%r15
	leaq	64(%rdi),%rdi
	addq	$32,%rbp
	jnz	.Lsqr4x_shift_n_add

	leaq	(%r14,%r10,2),%r12
.byte	0x67
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	-16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	-8(%rdi),%r11
	adcq	%rax,%r12
	movq	-8(%rsi),%rax
	movq	%r12,-32(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,-24(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	mulq	%rax
	negq	%r15
	adcq	%rax,%rbx
	adcq	%rdx,%r8
	movq	%rbx,-16(%rdi)
	movq	%r8,-8(%rdi)
.byte	102,72,15,126,213
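/*
 * __bn_sqr8x_reduction -- Montgomery-reduce the double-width square
 * sitting in the scratch area, eight limbs of the modulus per round
 * (%rbp walks n[], 32+8(%rsp) holds n0).  The pinned-encoding movq
 * above reloads the modulus pointer from %xmm2.
 */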
__bn_sqr8x_reduction:
	xorq	%rax,%rax
	leaq	(%r9,%rbp,1),%rcx
	leaq	48+8(%rsp,%r9,2),%rdx
	movq	%rcx,0+8(%rsp)
	leaq	48+8(%rsp,%r9,1),%rdi
	movq	%rdx,8+8(%rsp)
	negq	%r9
	jmp	.L8x_reduction_loop

.align	32
.L8x_reduction_loop:
	leaq	(%rdi,%r9,1),%rdi
.byte	0x66
	movq	0(%rdi),%rbx
	movq	8(%rdi),%r9
	movq	16(%rdi),%r10
	movq	24(%rdi),%r11
	movq	32(%rdi),%r12
	movq	40(%rdi),%r13
	movq	48(%rdi),%r14
	movq	56(%rdi),%r15
	movq	%rax,(%rdx)
	leaq	64(%rdi),%rdi

.byte	0x67
	movq	%rbx,%r8
	imulq	32+8(%rsp),%rbx
	movq	0(%rbp),%rax
	movl	$8,%ecx
	jmp	.L8x_reduce

.align	32
.L8x_reduce:
	mulq	%rbx
	movq	8(%rbp),%rax
	negq	%r8
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	movq	%rbx,48-8+8(%rsp,%rcx,8)
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	32+8(%rsp),%rsi
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	imulq	%r8,%rsi
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rbp),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	%rsi,%rbx
	addq	%rax,%r15
	movq	0(%rbp),%rax
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	.L8x_reduce

	leaq	64(%rbp),%rbp
	xorq	%rax,%rax
	movq	8+8(%rsp),%rdx
	cmpq	0+8(%rsp),%rbp
	jae	.L8x_no_tail

.byte	0x66
	addq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi

	movq	48+56+8(%rsp),%rbx
	movl	$8,%ecx
	movq	0(%rbp),%rax
	jmp	.L8x_tail

.align	32
.L8x_tail:
	mulq	%rbx
	addq	%rax,%r8
	movq	8(%rbp),%rax
	movq	%r8,(%rdi)
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	leaq	8(%rdi),%rdi
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rbp),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	48-16+8(%rsp,%rcx,8),%rbx
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	0(%rbp),%rax
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	.L8x_tail

	leaq	64(%rbp),%rbp
	movq	8+8(%rsp),%rdx
	cmpq	0+8(%rsp),%rbp
	jae	.L8x_tail_done

	movq	48+56+8(%rsp),%rbx
	negq	%rsi
	movq	0(%rbp),%rax
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi

	movl	$8,%ecx
	jmp	.L8x_tail

.align	32
.L8x_tail_done:
	xorq	%rax,%rax
	addq	(%rdx),%r8
	adcq	$0,%r9
	adcq	$0,%r10
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15
	adcq	$0,%rax

	negq	%rsi
.L8x_no_tail:
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	adcq	$0,%rax
	movq	-8(%rbp),%rcx
	xorq	%rsi,%rsi

.byte	102,72,15,126,213

	movq	%r8,0(%rdi)
	movq	%r9,8(%rdi)
.byte	102,73,15,126,217
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)
	leaq	64(%rdi),%rdi

	cmpq	%rdx,%rdi
	jb	.L8x_reduction_loop
	.byte	0xf3,0xc3
.size	bn_sqr8x_internal,.-bn_sqr8x_internal
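/*
 * __bn_post4x_internal -- final step after the 8x reduction:
 * conditionally subtracts the modulus (the all-ones/zero mask in
 * %rax selects n[] or 0) while copying the result to rp, four limbs
 * per iteration of .Lsqr4x_sub.
 */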
.type	__bn_post4x_internal,@function
.align	32
__bn_post4x_internal:
	movq	0(%rbp),%r12
	leaq	(%rdi,%r9,1),%rbx
	movq	%r9,%rcx
.byte	102,72,15,126,207
	negq	%rax
.byte	102,72,15,126,206
	sarq	$3+2,%rcx
	decq	%r12
	xorq	%r10,%r10
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqr4x_sub_entry

.align	16
.Lsqr4x_sub:
	movq	0(%rbp),%r12
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
.Lsqr4x_sub_entry:
	leaq	32(%rbp),%rbp
	notq	%r12
	notq	%r13
	notq	%r14
	notq	%r15
	andq	%rax,%r12
	andq	%rax,%r13
	andq	%rax,%r14
	andq	%rax,%r15

	negq	%r10
	adcq	0(%rbx),%r12
	adcq	8(%rbx),%r13
	adcq	16(%rbx),%r14
	adcq	24(%rbx),%r15
	movq	%r12,0(%rdi)
	leaq	32(%rbx),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r10,%r10
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi

	incq	%rcx
	jnz	.Lsqr4x_sub

	movq	%r9,%r10
	negq	%r9
	.byte	0xf3,0xc3
.size	__bn_post4x_internal,.-__bn_post4x_internal
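/*
 * bn_from_montgomery -- convert out of Montgomery form by Montgomery-
 * reducing the zero-padded input (effectively a multiply by 1).
 * Only num divisible by 8 is handled; otherwise 0 is returned so the
 * caller can fall back, and 1 is returned on success.
 */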
.globl	bn_from_montgomery
.type	bn_from_montgomery,@function
.align	32
bn_from_montgomery:
	testl	$7,%r9d
	jz	bn_from_mont8x
	xorl	%eax,%eax
	.byte	0xf3,0xc3
.size	bn_from_montgomery,.-bn_from_montgomery

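/*
 * bn_from_mont8x: copy the input into the scratch area with a
 * zero-filled upper half (.Lmul_by_1), run one 8x reduction plus the
 * conditional subtraction, then wipe the scratch.  The MULX/ADX
 * reduction is used when the CPU supports it.
 */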
.type	bn_from_mont8x,@function
.align	32
bn_from_mont8x:
.cfi_startproc
.byte	0x67
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lfrom_prologue:

	shll	$3,%r9d
	leaq	(%r9,%r9,2),%r10
	negq	%r9
	movq	(%r8),%r8

	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lfrom_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lfrom_sp_done

.align	32
.Lfrom_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lfrom_sp_done:
	andq	$-64,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lfrom_page_walk
	jmp	.Lfrom_page_walk_done

.Lfrom_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lfrom_page_walk
.Lfrom_page_walk_done:

	movq	%r9,%r10
	negq	%r9

	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lfrom_body:
	movq	%r9,%r11
	leaq	48(%rsp),%rax
	pxor	%xmm0,%xmm0
	jmp	.Lmul_by_1

.align	32
.Lmul_by_1:
	movdqu	(%rsi),%xmm1
	movdqu	16(%rsi),%xmm2
	movdqu	32(%rsi),%xmm3
	movdqa	%xmm0,(%rax,%r9,1)
	movdqu	48(%rsi),%xmm4
	movdqa	%xmm0,16(%rax,%r9,1)
.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00
	movdqa	%xmm1,(%rax)
	movdqa	%xmm0,32(%rax,%r9,1)
	movdqa	%xmm2,16(%rax)
	movdqa	%xmm0,48(%rax,%r9,1)
	movdqa	%xmm3,32(%rax)
	movdqa	%xmm4,48(%rax)
	leaq	64(%rax),%rax
	subq	$64,%r11
	jnz	.Lmul_by_1

.byte	102,72,15,110,207
.byte	102,72,15,110,209
.byte	0x67
	movq	%rcx,%rbp
.byte	102,73,15,110,218
	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
	andl	$0x80108,%r11d
	cmpl	$0x80108,%r11d
	jne	.Lfrom_mont_nox

	leaq	(%rax,%r9,1),%rdi
	call	__bn_sqrx8x_reduction
	call	__bn_postx4x_internal

	pxor	%xmm0,%xmm0
	leaq	48(%rsp),%rax
	jmp	.Lfrom_mont_zero

.align	32
.Lfrom_mont_nox:
	call	__bn_sqr8x_reduction
	call	__bn_post4x_internal

	pxor	%xmm0,%xmm0
	leaq	48(%rsp),%rax
	jmp	.Lfrom_mont_zero

.align	32
.Lfrom_mont_zero:
	movq	40(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm0,16(%rax)
	movdqa	%xmm0,32(%rax)
	movdqa	%xmm0,48(%rax)
	leaq	64(%rax),%rax
	subq	$32,%r9
	jnz	.Lfrom_mont_zero

	movq	$1,%rax
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lfrom_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	bn_from_mont8x,.-bn_from_mont8x
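/*
 * bn_mulx4x_mont_gather5 -- same interface as bn_mul_mont_gather5,
 * but built on BMI2 MULX and the ADX carry chains (ADCX/ADOX),
 * reached via the cpuid dispatch above.
 */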
.type	bn_mulx4x_mont_gather5,@function
.align	32
bn_mulx4x_mont_gather5:
.cfi_startproc
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lmulx4x_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lmulx4x_prologue:

	shll	$3,%r9d
	leaq	(%r9,%r9,2),%r10
	negq	%r9
	movq	(%r8),%r8

	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lmulx4xsp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lmulx4xsp_done

.Lmulx4xsp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lmulx4xsp_done:
	andq	$-64,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
	jmp	.Lmulx4x_page_walk_done

.Lmulx4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
.Lmulx4x_page_walk_done:

	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lmulx4x_body:
	call	mulx4x_internal

	movq	40(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movq	$1,%rax

	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmulx4x_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5

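/*
 * mulx4x_internal -- MULX/ADX workhorse shared with bn_powerx5.
 * MULX leaves the flags untouched, so two independent carry chains
 * (CF via ADCX, OF via ADOX) allow two additions per product word
 * without serializing on a single carry flag.
 */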
.type	mulx4x_internal,@function
.align	32
mulx4x_internal:
	movq	%r9,8(%rsp)
	movq	%r9,%r10
	negq	%r9
	shlq	$5,%r9
	negq	%r10
	leaq	128(%rdx,%r9,1),%r13
	shrq	$5+5,%r9
	movd	8(%rax),%xmm5
	subq	$1,%r9
	leaq	.Linc(%rip),%rax
	movq	%r13,16+8(%rsp)
	movq	%r9,24+8(%rsp)
	movq	%rdi,56+8(%rsp)
	movdqa	0(%rax),%xmm0
	movdqa	16(%rax),%xmm1
	leaq	88-112(%rsp,%r10,1),%r10
	leaq	128(%rdx),%rdi

	pshufd	$0,%xmm5,%xmm5
	movdqa	%xmm1,%xmm4
.byte	0x67
	movdqa	%xmm1,%xmm2
.byte	0x67
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,112(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,128(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,144(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,160(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,176(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,192(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,208(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,224(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,240(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,256(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,272(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,288(%r10)
	movdqa	%xmm4,%xmm3
.byte	0x67
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,304(%r10)

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,320(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,336(%r10)

	pand	64(%rdi),%xmm0
	pand	80(%rdi),%xmm1
	pand	96(%rdi),%xmm2
	movdqa	%xmm3,352(%r10)
	pand	112(%rdi),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-128(%rdi),%xmm4
	movdqa	-112(%rdi),%xmm5
	movdqa	-96(%rdi),%xmm2
	pand	112(%r10),%xmm4
	movdqa	-80(%rdi),%xmm3
	pand	128(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	144(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	160(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-64(%rdi),%xmm4
	movdqa	-48(%rdi),%xmm5
	movdqa	-32(%rdi),%xmm2
	pand	176(%r10),%xmm4
	movdqa	-16(%rdi),%xmm3
	pand	192(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	208(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	224(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	0(%rdi),%xmm4
	movdqa	16(%rdi),%xmm5
	movdqa	32(%rdi),%xmm2
	pand	240(%r10),%xmm4
	movdqa	48(%rdi),%xmm3
	pand	256(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	272(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	288(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	pxor	%xmm1,%xmm0
	pshufd	$0x4e,%xmm0,%xmm1
	por	%xmm1,%xmm0
	leaq	256(%rdi),%rdi
.byte	102,72,15,126,194
	leaq	64+32+8(%rsp),%rbx

	movq	%rdx,%r9
	mulxq	0(%rsi),%r8,%rax
	mulxq	8(%rsi),%r11,%r12
	addq	%rax,%r11
	mulxq	16(%rsi),%rax,%r13
	adcq	%rax,%r12
	adcq	$0,%r13
	mulxq	24(%rsi),%rax,%r14

	movq	%r8,%r15
	imulq	32+8(%rsp),%r8
	xorq	%rbp,%rbp
	movq	%r8,%rdx

	movq	%rdi,8+8(%rsp)

	leaq	32(%rsi),%rsi
	adcxq	%rax,%r13
	adcxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	movq	24+8(%rsp),%rdi
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r11,-24(%rbx)
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r12,-16(%rbx)
	jmp	.Lmulx4x_1st

.align	32
.Lmulx4x_1st:
	adcxq	%rbp,%r15
	mulxq	0(%rsi),%r10,%rax
	adcxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
.byte	0x67,0x67
	movq	%r8,%rdx
	adcxq	%rax,%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-32(%rbx)
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_1st

	movq	8(%rsp),%rax
	adcq	%rbp,%r15
	leaq	(%rsi,%rax,1),%rsi
	addq	%r15,%r14
	movq	8+8(%rsp),%rdi
	adcq	%rbp,%rbp
	movq	%r14,-8(%rbx)
	jmp	.Lmulx4x_outer

.align	32
.Lmulx4x_outer:
	leaq	16-256(%rbx),%r10
	pxor	%xmm4,%xmm4
.byte	0x67,0x67
	pxor	%xmm5,%xmm5
	movdqa	-128(%rdi),%xmm0
	movdqa	-112(%rdi),%xmm1
	movdqa	-96(%rdi),%xmm2
	pand	256(%r10),%xmm0
	movdqa	-80(%rdi),%xmm3
	pand	272(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	288(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	304(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%rdi),%xmm0
	movdqa	-48(%rdi),%xmm1
	movdqa	-32(%rdi),%xmm2
	pand	320(%r10),%xmm0
	movdqa	-16(%rdi),%xmm3
	pand	336(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	352(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	368(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%rdi),%xmm0
	movdqa	16(%rdi),%xmm1
	movdqa	32(%rdi),%xmm2
	pand	384(%r10),%xmm0
	movdqa	48(%rdi),%xmm3
	pand	400(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	416(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	432(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%rdi),%xmm0
	movdqa	80(%rdi),%xmm1
	movdqa	96(%rdi),%xmm2
	pand	448(%r10),%xmm0
	movdqa	112(%rdi),%xmm3
	pand	464(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	480(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	496(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	pshufd	$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	leaq	256(%rdi),%rdi
.byte	102,72,15,126,194

	movq	%rbp,(%rbx)
	leaq	32(%rbx,%rax,1),%rbx
	mulxq	0(%rsi),%r8,%r11
	xorq	%rbp,%rbp
	movq	%rdx,%r9
	mulxq	8(%rsi),%r14,%r12
	adoxq	-32(%rbx),%r8
	adcxq	%r14,%r11
	mulxq	16(%rsi),%r15,%r13
	adoxq	-24(%rbx),%r11
	adcxq	%r15,%r12
	mulxq	24(%rsi),%rdx,%r14
	adoxq	-16(%rbx),%r12
	adcxq	%rdx,%r13
	leaq	(%rcx,%rax,1),%rcx
	leaq	32(%rsi),%rsi
	adoxq	-8(%rbx),%r13
	adcxq	%rbp,%r14
	adoxq	%rbp,%r14

	movq	%r8,%r15
	imulq	32+8(%rsp),%r8

	movq	%r8,%rdx
	xorq	%rbp,%rbp
	movq	%rdi,8+8(%rsp)

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	24+8(%rsp),%rdi
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-24(%rbx)
	adoxq	%rbp,%r15
	movq	%r12,-16(%rbx)
	leaq	32(%rcx),%rcx
	jmp	.Lmulx4x_inner

.align	32
.Lmulx4x_inner:
	mulxq	0(%rsi),%r10,%rax
	adcxq	%rbp,%r15
	adoxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	0(%rbx),%r10
	adoxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	8(%rbx),%r11
	adoxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
	movq	%r8,%rdx
	adcxq	16(%rbx),%r12
	adoxq	%rax,%r13
	adcxq	24(%rbx),%r13
	adoxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx
	adcxq	%rbp,%r14

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	adoxq	%r15,%r13
	movq	%r11,-32(%rbx)
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	leaq	32(%rcx),%rcx
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_inner

	movq	0+8(%rsp),%rax
	adcq	%rbp,%r15
	subq	0(%rbx),%rdi
	movq	8+8(%rsp),%rdi
	movq	16+8(%rsp),%r10
	adcq	%r15,%r14
	leaq	(%rsi,%rax,1),%rsi
	adcq	%rbp,%rbp
	movq	%r14,-8(%rbx)

	cmpq	%r10,%rdi
	jb	.Lmulx4x_outer

	movq	-8(%rcx),%r10
	movq	%rbp,%r8
	movq	(%rcx,%rax,1),%r12
	leaq	(%rcx,%rax,1),%rbp
	movq	%rax,%rcx
	leaq	(%rbx,%rax,1),%rdi
	xorl	%eax,%eax
	xorq	%r15,%r15
	subq	%r14,%r10
	adcq	%r15,%r15
	orq	%r15,%r8
	sarq	$3+2,%rcx
	subq	%r8,%rax
	movq	56+8(%rsp),%rdx
	decq	%r12
	movq	8(%rbp),%r13
	xorq	%r8,%r8
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqrx4x_sub_entry
.size	mulx4x_internal,.-mulx4x_internal
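/*
 * bn_powerx5 -- MULX/ADX twin of bn_power5: five squarings through
 * __bn_sqrx8x_internal, then one gathered multiplication via
 * mulx4x_internal.
 */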
.type	bn_powerx5,@function
.align	32
bn_powerx5:
.cfi_startproc
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lpowerx5_enter:
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lpowerx5_prologue:

	shll	$3,%r9d
	leaq	(%r9,%r9,2),%r10
	negq	%r9
	movq	(%r8),%r8

	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lpwrx_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lpwrx_sp_done

.align	32
.Lpwrx_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lpwrx_sp_done:
	andq	$-64,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lpwrx_page_walk
	jmp	.Lpwrx_page_walk_done

.Lpwrx_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lpwrx_page_walk
.Lpwrx_page_walk_done:

	movq	%r9,%r10
	negq	%r9

	pxor	%xmm0,%xmm0
.byte	102,72,15,110,207
.byte	102,72,15,110,209
.byte	102,73,15,110,218
.byte	102,72,15,110,226
	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lpowerx5_body:

	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal

	movq	%r10,%r9
	movq	%rsi,%rdi
.byte	102,72,15,126,209
.byte	102,72,15,126,226
	movq	40(%rsp),%rax

	call	mulx4x_internal

	movq	40(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movq	$1,%rax

	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lpowerx5_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	bn_powerx5,.-bn_powerx5

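/*
 * bn_sqrx8x_internal -- MULX/ADX Montgomery squaring.  The scratch
 * area is zeroed first (.Lsqrx8x_zero), then the cross products are
 * accumulated eight limbs per row in .Lsqrx8x_outer_loop; the longer
 * .byte runs appear to pin specific mulx encodings.
 */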
2891.globl	bn_sqrx8x_internal
2892.hidden	bn_sqrx8x_internal
2893.type	bn_sqrx8x_internal,@function
2894.align	32
2895bn_sqrx8x_internal:
2896__bn_sqrx8x_internal:
2897.cfi_startproc
2898
2899
2900
2901
2902
2903
2904
2905
2906
2907
2908
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938	leaq	48+8(%rsp),%rdi
2939	leaq	(%rsi,%r9,1),%rbp
2940	movq	%r9,0+8(%rsp)
2941	movq	%rbp,8+8(%rsp)
2942	jmp	.Lsqr8x_zero_start
2943
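/*
 * Clear the t[] area first.  The .byte line below is multi-byte nop
 * padding, and the stray 0x3e is a harmless ds: prefix, both
 * evidently present only for code alignment.
 */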
2944.align	32
2945.byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
2946.Lsqrx8x_zero:
2947.byte	0x3e
2948	movdqa	%xmm0,0(%rdi)
2949	movdqa	%xmm0,16(%rdi)
2950	movdqa	%xmm0,32(%rdi)
2951	movdqa	%xmm0,48(%rdi)
2952.Lsqr8x_zero_start:
2953	movdqa	%xmm0,64(%rdi)
2954	movdqa	%xmm0,80(%rdi)
2955	movdqa	%xmm0,96(%rdi)
2956	movdqa	%xmm0,112(%rdi)
2957	leaq	128(%rdi),%rdi
2958	subq	$64,%r9
2959	jnz	.Lsqrx8x_zero
2960
2961	movq	0(%rsi),%rdx
2962
2963	xorq	%r10,%r10
2964	xorq	%r11,%r11
2965	xorq	%r12,%r12
2966	xorq	%r13,%r13
2967	xorq	%r14,%r14
2968	xorq	%r15,%r15
2969	leaq	48+8(%rsp),%rdi
2970	xorq	%rbp,%rbp
2971	jmp	.Lsqrx8x_outer_loop
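/*
 * Pass 1, outer loop: with %rdx = a[i], form the cross products
 * a[i]*a[i+1..i+7] under two interleaved carry chains (adcx/adox).
 * The C4-prefixed .byte runs are ordinary mulx instructions, merely
 * emitted with 32-bit displacements.
 */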
2972
2973.align	32
2974.Lsqrx8x_outer_loop:
2975	mulxq	8(%rsi),%r8,%rax
2976	adcxq	%r9,%r8
2977	adoxq	%rax,%r10
2978	mulxq	16(%rsi),%r9,%rax
2979	adcxq	%r10,%r9
2980	adoxq	%rax,%r11
2981.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
2982	adcxq	%r11,%r10
2983	adoxq	%rax,%r12
2984.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
2985	adcxq	%r12,%r11
2986	adoxq	%rax,%r13
2987	mulxq	40(%rsi),%r12,%rax
2988	adcxq	%r13,%r12
2989	adoxq	%rax,%r14
2990	mulxq	48(%rsi),%r13,%rax
2991	adcxq	%r14,%r13
2992	adoxq	%r15,%rax
2993	mulxq	56(%rsi),%r14,%r15
2994	movq	8(%rsi),%rdx
2995	adcxq	%rax,%r14
2996	adoxq	%rbp,%r15
2997	adcq	64(%rdi),%r15
2998	movq	%r8,8(%rdi)
2999	movq	%r9,16(%rdi)
3000	sbbq	%rcx,%rcx
3001	xorq	%rbp,%rbp
3002
3003
3004	mulxq	16(%rsi),%r8,%rbx
3005	mulxq	24(%rsi),%r9,%rax
3006	adcxq	%r10,%r8
3007	adoxq	%rbx,%r9
3008	mulxq	32(%rsi),%r10,%rbx
3009	adcxq	%r11,%r9
3010	adoxq	%rax,%r10
3011.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
3012	adcxq	%r12,%r10
3013	adoxq	%rbx,%r11
3014.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
3015	adcxq	%r13,%r11
3016	adoxq	%r14,%r12
3017.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
3018	movq	16(%rsi),%rdx
3019	adcxq	%rax,%r12
3020	adoxq	%rbx,%r13
3021	adcxq	%r15,%r13
3022	adoxq	%rbp,%r14
3023	adcxq	%rbp,%r14
3024
3025	movq	%r8,24(%rdi)
3026	movq	%r9,32(%rdi)
3027
3028	mulxq	24(%rsi),%r8,%rbx
3029	mulxq	32(%rsi),%r9,%rax
3030	adcxq	%r10,%r8
3031	adoxq	%rbx,%r9
3032	mulxq	40(%rsi),%r10,%rbx
3033	adcxq	%r11,%r9
3034	adoxq	%rax,%r10
3035.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
3036	adcxq	%r12,%r10
3037	adoxq	%r13,%r11
3038.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
3039.byte	0x3e
3040	movq	24(%rsi),%rdx
3041	adcxq	%rbx,%r11
3042	adoxq	%rax,%r12
3043	adcxq	%r14,%r12
3044	movq	%r8,40(%rdi)
3045	movq	%r9,48(%rdi)
3046	mulxq	32(%rsi),%r8,%rax
3047	adoxq	%rbp,%r13
3048	adcxq	%rbp,%r13
3049
3050	mulxq	40(%rsi),%r9,%rbx
3051	adcxq	%r10,%r8
3052	adoxq	%rax,%r9
3053	mulxq	48(%rsi),%r10,%rax
3054	adcxq	%r11,%r9
3055	adoxq	%r12,%r10
3056	mulxq	56(%rsi),%r11,%r12
3057	movq	32(%rsi),%rdx
3058	movq	40(%rsi),%r14
3059	adcxq	%rbx,%r10
3060	adoxq	%rax,%r11
3061	movq	48(%rsi),%r15
3062	adcxq	%r13,%r11
3063	adoxq	%rbp,%r12
3064	adcxq	%rbp,%r12
3065
3066	movq	%r8,56(%rdi)
3067	movq	%r9,64(%rdi)
3068
3069	mulxq	%r14,%r9,%rax
3070	movq	56(%rsi),%r8
3071	adcxq	%r10,%r9
3072	mulxq	%r15,%r10,%rbx
3073	adoxq	%rax,%r10
3074	adcxq	%r11,%r10
3075	mulxq	%r8,%r11,%rax
3076	movq	%r14,%rdx
3077	adoxq	%rbx,%r11
3078	adcxq	%r12,%r11
3079
3080	adcxq	%rbp,%rax
3081
3082	mulxq	%r15,%r14,%rbx
3083	mulxq	%r8,%r12,%r13
3084	movq	%r15,%rdx
3085	leaq	64(%rsi),%rsi
3086	adcxq	%r14,%r11
3087	adoxq	%rbx,%r12
3088	adcxq	%rax,%r12
3089	adoxq	%rbp,%r13
3090
3091.byte	0x67,0x67
3092	mulxq	%r8,%r8,%r14
3093	adcxq	%r8,%r13
3094	adcxq	%rbp,%r14
3095
3096	cmpq	8+8(%rsp),%rsi
3097	je	.Lsqrx8x_outer_break
3098
3099	negq	%rcx
3100	movq	$-8,%rcx
3101	movq	%rbp,%r15
3102	movq	64(%rdi),%r8
3103	adcxq	72(%rdi),%r9
3104	adcxq	80(%rdi),%r10
3105	adcxq	88(%rdi),%r11
3106	adcq	96(%rdi),%r12
3107	adcq	104(%rdi),%r13
3108	adcq	112(%rdi),%r14
3109	adcq	120(%rdi),%r15
3110	leaq	(%rsi),%rbp
3111	leaq	128(%rdi),%rdi
3112	sbbq	%rax,%rax
3113
3114	movq	-64(%rsi),%rdx
3115	movq	%rax,16+8(%rsp)
3116	movq	%rdi,24+8(%rsp)
3117
3118
3119	xorl	%eax,%eax
3120	jmp	.Lsqrx8x_loop
3121
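/*
 * Pass 1, inner loop: multiply the next eight limbs at (%rbp) by the
 * current a[i] in %rdx, accumulating into the %r8..%r15 window; rows
 * already in t[] are folded back in at each 64-byte boundary.
 */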
3122.align	32
3123.Lsqrx8x_loop:
3124	movq	%r8,%rbx
3125	mulxq	0(%rbp),%rax,%r8
3126	adcxq	%rax,%rbx
3127	adoxq	%r9,%r8
3128
3129	mulxq	8(%rbp),%rax,%r9
3130	adcxq	%rax,%r8
3131	adoxq	%r10,%r9
3132
3133	mulxq	16(%rbp),%rax,%r10
3134	adcxq	%rax,%r9
3135	adoxq	%r11,%r10
3136
3137	mulxq	24(%rbp),%rax,%r11
3138	adcxq	%rax,%r10
3139	adoxq	%r12,%r11
3140
3141.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
3142	adcxq	%rax,%r11
3143	adoxq	%r13,%r12
3144
3145	mulxq	40(%rbp),%rax,%r13
3146	adcxq	%rax,%r12
3147	adoxq	%r14,%r13
3148
3149	mulxq	48(%rbp),%rax,%r14
3150	movq	%rbx,(%rdi,%rcx,8)
3151	movl	$0,%ebx
3152	adcxq	%rax,%r13
3153	adoxq	%r15,%r14
3154
3155.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
3156	movq	8(%rsi,%rcx,8),%rdx
3157	adcxq	%rax,%r14
3158	adoxq	%rbx,%r15
3159	adcxq	%rbx,%r15
3160
3161.byte	0x67
3162	incq	%rcx
3163	jnz	.Lsqrx8x_loop
3164
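/*
 * Row boundary: either fold the carry saved at 16+8(%rsp) back in and
 * continue with the next 64-byte chunk, or take .Lsqrx8x_break to
 * spill the window into t[] and restart the outer loop one row down.
 */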
3165	leaq	64(%rbp),%rbp
3166	movq	$-8,%rcx
3167	cmpq	8+8(%rsp),%rbp
3168	je	.Lsqrx8x_break
3169
3170	subq	16+8(%rsp),%rbx
3171.byte	0x66
3172	movq	-64(%rsi),%rdx
3173	adcxq	0(%rdi),%r8
3174	adcxq	8(%rdi),%r9
3175	adcq	16(%rdi),%r10
3176	adcq	24(%rdi),%r11
3177	adcq	32(%rdi),%r12
3178	adcq	40(%rdi),%r13
3179	adcq	48(%rdi),%r14
3180	adcq	56(%rdi),%r15
3181	leaq	64(%rdi),%rdi
3182.byte	0x67
3183	sbbq	%rax,%rax
3184	xorl	%ebx,%ebx
3185	movq	%rax,16+8(%rsp)
3186	jmp	.Lsqrx8x_loop
3187
3188.align	32
3189.Lsqrx8x_break:
3190	xorq	%rbp,%rbp
3191	subq	16+8(%rsp),%rbx
3192	adcxq	%rbp,%r8
3193	movq	24+8(%rsp),%rcx
3194	adcxq	%rbp,%r9
3195	movq	0(%rsi),%rdx
3196	adcq	$0,%r10
3197	movq	%r8,0(%rdi)
3198	adcq	$0,%r11
3199	adcq	$0,%r12
3200	adcq	$0,%r13
3201	adcq	$0,%r14
3202	adcq	$0,%r15
3203	cmpq	%rcx,%rdi
3204	je	.Lsqrx8x_outer_loop
3205
3206	movq	%r9,8(%rdi)
3207	movq	8(%rcx),%r9
3208	movq	%r10,16(%rdi)
3209	movq	16(%rcx),%r10
3210	movq	%r11,24(%rdi)
3211	movq	24(%rcx),%r11
3212	movq	%r12,32(%rdi)
3213	movq	32(%rcx),%r12
3214	movq	%r13,40(%rdi)
3215	movq	40(%rcx),%r13
3216	movq	%r14,48(%rdi)
3217	movq	48(%rcx),%r14
3218	movq	%r15,56(%rdi)
3219	movq	56(%rcx),%r15
3220	movq	%rcx,%rdi
3221	jmp	.Lsqrx8x_outer_loop
3222
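/*
 * End of pass 1: store the last window above t[] and prime the
 * shift-and-add pass (the .byte below encodes movq %xmm3,%rcx; the
 * callers stash -num*8 there).
 */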
3223.align	32
3224.Lsqrx8x_outer_break:
3225	movq	%r9,72(%rdi)
3226.byte	102,72,15,126,217
3227	movq	%r10,80(%rdi)
3228	movq	%r11,88(%rdi)
3229	movq	%r12,96(%rdi)
3230	movq	%r13,104(%rdi)
3231	movq	%r14,112(%rdi)
3232	leaq	48+8(%rsp),%rdi
3233	movq	(%rsi,%rcx,1),%rdx
3234
3235	movq	8(%rdi),%r11
3236	xorq	%r10,%r10
3237	movq	0+8(%rsp),%r9
3238	adoxq	%r11,%r11
3239	movq	16(%rdi),%r12
3240	movq	24(%rdi),%r13
3241
3242
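/*
 * Pass 2: adox of a register with itself shifts the accumulated cross
 * products left by one bit (the doubling), while mulxq %rdx,%rax,%rbx
 * forms the diagonal square a[i]^2, %rdx holding a[i].  The .byte
 * runs encode movq 8(%rsi,%rcx,1),%rdx and movq 32(%rdi),%r10 with
 * 32-bit displacements.
 */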
3243.align	32
3244.Lsqrx4x_shift_n_add:
3245	mulxq	%rdx,%rax,%rbx
3246	adoxq	%r12,%r12
3247	adcxq	%r10,%rax
3248.byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
3249.byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
3250	adoxq	%r13,%r13
3251	adcxq	%r11,%rbx
3252	movq	40(%rdi),%r11
3253	movq	%rax,0(%rdi)
3254	movq	%rbx,8(%rdi)
3255
3256	mulxq	%rdx,%rax,%rbx
3257	adoxq	%r10,%r10
3258	adcxq	%r12,%rax
3259	movq	16(%rsi,%rcx,1),%rdx
3260	movq	48(%rdi),%r12
3261	adoxq	%r11,%r11
3262	adcxq	%r13,%rbx
3263	movq	56(%rdi),%r13
3264	movq	%rax,16(%rdi)
3265	movq	%rbx,24(%rdi)
3266
3267	mulxq	%rdx,%rax,%rbx
3268	adoxq	%r12,%r12
3269	adcxq	%r10,%rax
3270	movq	24(%rsi,%rcx,1),%rdx
3271	leaq	32(%rcx),%rcx
3272	movq	64(%rdi),%r10
3273	adoxq	%r13,%r13
3274	adcxq	%r11,%rbx
3275	movq	72(%rdi),%r11
3276	movq	%rax,32(%rdi)
3277	movq	%rbx,40(%rdi)
3278
3279	mulxq	%rdx,%rax,%rbx
3280	adoxq	%r10,%r10
3281	adcxq	%r12,%rax
3282	jrcxz	.Lsqrx4x_shift_n_add_break
3283.byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
3284	adoxq	%r11,%r11
3285	adcxq	%r13,%rbx
3286	movq	80(%rdi),%r12
3287	movq	88(%rdi),%r13
3288	movq	%rax,48(%rdi)
3289	movq	%rbx,56(%rdi)
3290	leaq	64(%rdi),%rdi
3291	nop
3292	jmp	.Lsqrx4x_shift_n_add
3293
3294.align	32
3295.Lsqrx4x_shift_n_add_break:
3296	adcxq	%r13,%rbx
3297	movq	%rax,48(%rdi)
3298	movq	%rbx,56(%rdi)
3299	leaq	64(%rdi),%rdi
3300.byte	102,72,15,126,213
3301__bn_sqrx8x_reduction:
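/*
 * Montgomery reduction, eight limbs per round: for each t[i] a
 * multiplier m = t[i]*n0 mod 2^64 is formed (n0 lives at 32+8(%rsp)),
 * and m*n is accumulated so that the bottom limb cancels.  The .byte
 * just above encodes movq %xmm2,%rbp, restoring the modulus pointer.
 */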
3302	xorl	%eax,%eax
3303	movq	32+8(%rsp),%rbx
3304	movq	48+8(%rsp),%rdx
3305	leaq	-64(%rbp,%r9,1),%rcx
3306
3307	movq	%rcx,0+8(%rsp)
3308	movq	%rdi,8+8(%rsp)
3309
3310	leaq	48+8(%rsp),%rdi
3311	jmp	.Lsqrx8x_reduction_loop
3312
3313.align	32
3314.Lsqrx8x_reduction_loop:
3315	movq	8(%rdi),%r9
3316	movq	16(%rdi),%r10
3317	movq	24(%rdi),%r11
3318	movq	32(%rdi),%r12
3319	movq	%rdx,%r8
3320	imulq	%rbx,%rdx
3321	movq	40(%rdi),%r13
3322	movq	48(%rdi),%r14
3323	movq	56(%rdi),%r15
3324	movq	%rax,24+8(%rsp)
3325
3326	leaq	64(%rdi),%rdi
3327	xorq	%rsi,%rsi
3328	movq	$-8,%rcx
3329	jmp	.Lsqrx8x_reduce
3330
3331.align	32
3332.Lsqrx8x_reduce:
3333	movq	%r8,%rbx
3334	mulxq	0(%rbp),%rax,%r8
3335	adcxq	%rbx,%rax
3336	adoxq	%r9,%r8
3337
3338	mulxq	8(%rbp),%rbx,%r9
3339	adcxq	%rbx,%r8
3340	adoxq	%r10,%r9
3341
3342	mulxq	16(%rbp),%rbx,%r10
3343	adcxq	%rbx,%r9
3344	adoxq	%r11,%r10
3345
3346	mulxq	24(%rbp),%rbx,%r11
3347	adcxq	%rbx,%r10
3348	adoxq	%r12,%r11
3349
3350.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
3351	movq	%rdx,%rax
3352	movq	%r8,%rdx
3353	adcxq	%rbx,%r11
3354	adoxq	%r13,%r12
3355
3356	mulxq	32+8(%rsp),%rbx,%rdx
3357	movq	%rax,%rdx
3358	movq	%rax,64+48+8(%rsp,%rcx,8)
3359
3360	mulxq	40(%rbp),%rax,%r13
3361	adcxq	%rax,%r12
3362	adoxq	%r14,%r13
3363
3364	mulxq	48(%rbp),%rax,%r14
3365	adcxq	%rax,%r13
3366	adoxq	%r15,%r14
3367
3368	mulxq	56(%rbp),%rax,%r15
3369	movq	%rbx,%rdx
3370	adcxq	%rax,%r14
3371	adoxq	%rsi,%r15
3372	adcxq	%rsi,%r15
3373
3374.byte	0x67,0x67,0x67
3375	incq	%rcx
3376	jnz	.Lsqrx8x_reduce
3377
3378	movq	%rsi,%rax
3379	cmpq	0+8(%rsp),%rbp
3380	jae	.Lsqrx8x_no_tail
3381
3382	movq	48+8(%rsp),%rdx
3383	addq	0(%rdi),%r8
3384	leaq	64(%rbp),%rbp
3385	movq	$-8,%rcx
3386	adcxq	8(%rdi),%r9
3387	adcxq	16(%rdi),%r10
3388	adcq	24(%rdi),%r11
3389	adcq	32(%rdi),%r12
3390	adcq	40(%rdi),%r13
3391	adcq	48(%rdi),%r14
3392	adcq	56(%rdi),%r15
3393	leaq	64(%rdi),%rdi
3394	sbbq	%rax,%rax
3395
3396	xorq	%rsi,%rsi
3397	movq	%rax,16+8(%rsp)
3398	jmp	.Lsqrx8x_tail
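/*
 * Reduction tail: propagate the pending m_i*n products across the
 * upper limbs of t[], re-injecting the carry saved at 16+8(%rsp).
 */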
3399
3400.align	32
3401.Lsqrx8x_tail:
3402	movq	%r8,%rbx
3403	mulxq	0(%rbp),%rax,%r8
3404	adcxq	%rax,%rbx
3405	adoxq	%r9,%r8
3406
3407	mulxq	8(%rbp),%rax,%r9
3408	adcxq	%rax,%r8
3409	adoxq	%r10,%r9
3410
3411	mulxq	16(%rbp),%rax,%r10
3412	adcxq	%rax,%r9
3413	adoxq	%r11,%r10
3414
3415	mulxq	24(%rbp),%rax,%r11
3416	adcxq	%rax,%r10
3417	adoxq	%r12,%r11
3418
3419.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
3420	adcxq	%rax,%r11
3421	adoxq	%r13,%r12
3422
3423	mulxq	40(%rbp),%rax,%r13
3424	adcxq	%rax,%r12
3425	adoxq	%r14,%r13
3426
3427	mulxq	48(%rbp),%rax,%r14
3428	adcxq	%rax,%r13
3429	adoxq	%r15,%r14
3430
3431	mulxq	56(%rbp),%rax,%r15
3432	movq	72+48+8(%rsp,%rcx,8),%rdx
3433	adcxq	%rax,%r14
3434	adoxq	%rsi,%r15
3435	movq	%rbx,(%rdi,%rcx,8)
3436	movq	%r8,%rbx
3437	adcxq	%rsi,%r15
3438
3439	incq	%rcx
3440	jnz	.Lsqrx8x_tail
3441
3442	cmpq	0+8(%rsp),%rbp
3443	jae	.Lsqrx8x_tail_done
3444
3445	subq	16+8(%rsp),%rsi
3446	movq	48+8(%rsp),%rdx
3447	leaq	64(%rbp),%rbp
3448	adcq	0(%rdi),%r8
3449	adcq	8(%rdi),%r9
3450	adcq	16(%rdi),%r10
3451	adcq	24(%rdi),%r11
3452	adcq	32(%rdi),%r12
3453	adcq	40(%rdi),%r13
3454	adcq	48(%rdi),%r14
3455	adcq	56(%rdi),%r15
3456	leaq	64(%rdi),%rdi
3457	sbbq	%rax,%rax
3458	subq	$8,%rcx
3459
3460	xorq	%rsi,%rsi
3461	movq	%rax,16+8(%rsp)
3462	jmp	.Lsqrx8x_tail
3463
3464.align	32
3465.Lsqrx8x_tail_done:
3466	xorq	%rax,%rax
3467	addq	24+8(%rsp),%r8
3468	adcq	$0,%r9
3469	adcq	$0,%r10
3470	adcq	$0,%r11
3471	adcq	$0,%r12
3472	adcq	$0,%r13
3473	adcq	$0,%r14
3474	adcq	$0,%r15
3475	adcq	$0,%rax
3476
3477	subq	16+8(%rsp),%rsi
3478.Lsqrx8x_no_tail:
3479	adcq	0(%rdi),%r8
3480.byte	102,72,15,126,217
3481	adcq	8(%rdi),%r9
3482	movq	56(%rbp),%rsi
3483.byte	102,72,15,126,213
3484	adcq	16(%rdi),%r10
3485	adcq	24(%rdi),%r11
3486	adcq	32(%rdi),%r12
3487	adcq	40(%rdi),%r13
3488	adcq	48(%rdi),%r14
3489	adcq	56(%rdi),%r15
3490	adcq	$0,%rax
3491
3492	movq	32+8(%rsp),%rbx
3493	movq	64(%rdi,%rcx,1),%rdx
3494
3495	movq	%r8,0(%rdi)
3496	leaq	64(%rdi),%r8
3497	movq	%r9,8(%rdi)
3498	movq	%r10,16(%rdi)
3499	movq	%r11,24(%rdi)
3500	movq	%r12,32(%rdi)
3501	movq	%r13,40(%rdi)
3502	movq	%r14,48(%rdi)
3503	movq	%r15,56(%rdi)
3504
3505	leaq	64(%rdi,%rcx,1),%rdi
3506	cmpq	8+8(%rsp),%r8
3507	jb	.Lsqrx8x_reduction_loop
3508	.byte	0xf3,0xc3
3509.cfi_endproc
3510.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
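/*
 * __bn_postx4x_internal: conditional final subtraction.  negq turns
 * the reduction's top carry in %rax into a 0/-1 mask; andnq with that
 * mask yields either the complemented modulus limb or zero, so the
 * adc chain computes t - n or simply copies t, four limbs per
 * iteration.  The .byte runs encode movq %xmm1,%rdx and
 * movq %xmm1,%rsi (xmm1 holds the result pointer).
 */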
3511.align	32
3512__bn_postx4x_internal:
3513	movq	0(%rbp),%r12
3514	movq	%rcx,%r10
3515	movq	%rcx,%r9
3516	negq	%rax
3517	sarq	$3+2,%rcx
3518
3519.byte	102,72,15,126,202
3520.byte	102,72,15,126,206
3521	decq	%r12
3522	movq	8(%rbp),%r13
3523	xorq	%r8,%r8
3524	movq	16(%rbp),%r14
3525	movq	24(%rbp),%r15
3526	jmp	.Lsqrx4x_sub_entry
3527
3528.align	16
3529.Lsqrx4x_sub:
3530	movq	0(%rbp),%r12
3531	movq	8(%rbp),%r13
3532	movq	16(%rbp),%r14
3533	movq	24(%rbp),%r15
3534.Lsqrx4x_sub_entry:
3535	andnq	%rax,%r12,%r12
3536	leaq	32(%rbp),%rbp
3537	andnq	%rax,%r13,%r13
3538	andnq	%rax,%r14,%r14
3539	andnq	%rax,%r15,%r15
3540
3541	negq	%r8
3542	adcq	0(%rdi),%r12
3543	adcq	8(%rdi),%r13
3544	adcq	16(%rdi),%r14
3545	adcq	24(%rdi),%r15
3546	movq	%r12,0(%rdx)
3547	leaq	32(%rdi),%rdi
3548	movq	%r13,8(%rdx)
3549	sbbq	%r8,%r8
3550	movq	%r14,16(%rdx)
3551	movq	%r15,24(%rdx)
3552	leaq	32(%rdx),%rdx
3553
3554	incq	%rcx
3555	jnz	.Lsqrx4x_sub
3556
3557	negq	%r9
3558
3559	.byte	0xf3,0xc3
3560.size	__bn_postx4x_internal,.-__bn_postx4x_internal
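/*
 * bn_get_bits5(ap, off): extract the 5-bit window at bit offset
 * `off'.  A 16-bit word is loaded either at the containing word or
 * one byte further along, so the window never straddles the load;
 * the value is shifted down and masked with 31.
 */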
3561.globl	bn_get_bits5
3562.type	bn_get_bits5,@function
3563.align	16
3564bn_get_bits5:
3565	leaq	0(%rdi),%r10
3566	leaq	1(%rdi),%r11
3567	movl	%esi,%ecx
3568	shrl	$4,%esi
3569	andl	$15,%ecx
3570	leal	-8(%rcx),%eax
3571	cmpl	$11,%ecx
3572	cmovaq	%r11,%r10
3573	cmoval	%eax,%ecx
3574	movzwl	(%r10,%rsi,2),%eax
3575	shrl	%cl,%eax
3576	andl	$31,%eax
3577	.byte	0xf3,0xc3
3578.size	bn_get_bits5,.-bn_get_bits5
3579
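/*
 * bn_scatter5(inp, num, table, idx): store the num-limb value into
 * column idx of the power table, one limb every 256 bytes (32 powers
 * of 8 bytes each), interleaving the precomputed powers limb by limb.
 */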
3580.globl	bn_scatter5
3581.type	bn_scatter5,@function
3582.align	16
3583bn_scatter5:
3584	cmpl	$0,%esi
3585	jz	.Lscatter_epilogue
3586	leaq	(%rdx,%rcx,8),%rdx
3587.Lscatter:
3588	movq	(%rdi),%rax
3589	leaq	8(%rdi),%rdi
3590	movq	%rax,(%rdx)
3591	leaq	256(%rdx),%rdx
3592	subl	$1,%esi
3593	jnz	.Lscatter
3594.Lscatter_epilogue:
3595	.byte	0xf3,0xc3
3596.size	bn_scatter5,.-bn_scatter5
3597
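/*
 * bn_gather5(out, num, table, idx): constant-time gather of entry idx
 * from the 32 interleaved powers.  The two .byte runs below encode
 * leaq (%rsp),%r10 and subq $0x108,%rsp; the pcmpeqd cascade builds
 * sixteen 128-bit masks from idx, and each limb is reassembled by
 * AND/OR over the full 256-byte row, so the access pattern does not
 * depend on the (secret) index.
 */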
3598.globl	bn_gather5
3599.type	bn_gather5,@function
3600.align	32
3601bn_gather5:
3602.LSEH_begin_bn_gather5:
3603
3604.byte	0x4c,0x8d,0x14,0x24
3605.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00
3606	leaq	.Linc(%rip),%rax
3607	andq	$-16,%rsp
3608
3609	movd	%ecx,%xmm5
3610	movdqa	0(%rax),%xmm0
3611	movdqa	16(%rax),%xmm1
3612	leaq	128(%rdx),%r11
3613	leaq	128(%rsp),%rax
3614
3615	pshufd	$0,%xmm5,%xmm5
3616	movdqa	%xmm1,%xmm4
3617	movdqa	%xmm1,%xmm2
3618	paddd	%xmm0,%xmm1
3619	pcmpeqd	%xmm5,%xmm0
3620	movdqa	%xmm4,%xmm3
3621
3622	paddd	%xmm1,%xmm2
3623	pcmpeqd	%xmm5,%xmm1
3624	movdqa	%xmm0,-128(%rax)
3625	movdqa	%xmm4,%xmm0
3626
3627	paddd	%xmm2,%xmm3
3628	pcmpeqd	%xmm5,%xmm2
3629	movdqa	%xmm1,-112(%rax)
3630	movdqa	%xmm4,%xmm1
3631
3632	paddd	%xmm3,%xmm0
3633	pcmpeqd	%xmm5,%xmm3
3634	movdqa	%xmm2,-96(%rax)
3635	movdqa	%xmm4,%xmm2
3636	paddd	%xmm0,%xmm1
3637	pcmpeqd	%xmm5,%xmm0
3638	movdqa	%xmm3,-80(%rax)
3639	movdqa	%xmm4,%xmm3
3640
3641	paddd	%xmm1,%xmm2
3642	pcmpeqd	%xmm5,%xmm1
3643	movdqa	%xmm0,-64(%rax)
3644	movdqa	%xmm4,%xmm0
3645
3646	paddd	%xmm2,%xmm3
3647	pcmpeqd	%xmm5,%xmm2
3648	movdqa	%xmm1,-48(%rax)
3649	movdqa	%xmm4,%xmm1
3650
3651	paddd	%xmm3,%xmm0
3652	pcmpeqd	%xmm5,%xmm3
3653	movdqa	%xmm2,-32(%rax)
3654	movdqa	%xmm4,%xmm2
3655	paddd	%xmm0,%xmm1
3656	pcmpeqd	%xmm5,%xmm0
3657	movdqa	%xmm3,-16(%rax)
3658	movdqa	%xmm4,%xmm3
3659
3660	paddd	%xmm1,%xmm2
3661	pcmpeqd	%xmm5,%xmm1
3662	movdqa	%xmm0,0(%rax)
3663	movdqa	%xmm4,%xmm0
3664
3665	paddd	%xmm2,%xmm3
3666	pcmpeqd	%xmm5,%xmm2
3667	movdqa	%xmm1,16(%rax)
3668	movdqa	%xmm4,%xmm1
3669
3670	paddd	%xmm3,%xmm0
3671	pcmpeqd	%xmm5,%xmm3
3672	movdqa	%xmm2,32(%rax)
3673	movdqa	%xmm4,%xmm2
3674	paddd	%xmm0,%xmm1
3675	pcmpeqd	%xmm5,%xmm0
3676	movdqa	%xmm3,48(%rax)
3677	movdqa	%xmm4,%xmm3
3678
3679	paddd	%xmm1,%xmm2
3680	pcmpeqd	%xmm5,%xmm1
3681	movdqa	%xmm0,64(%rax)
3682	movdqa	%xmm4,%xmm0
3683
3684	paddd	%xmm2,%xmm3
3685	pcmpeqd	%xmm5,%xmm2
3686	movdqa	%xmm1,80(%rax)
3687	movdqa	%xmm4,%xmm1
3688
3689	paddd	%xmm3,%xmm0
3690	pcmpeqd	%xmm5,%xmm3
3691	movdqa	%xmm2,96(%rax)
3692	movdqa	%xmm4,%xmm2
3693	movdqa	%xmm3,112(%rax)
3694	jmp	.Lgather
3695
3696.align	32
3697.Lgather:
3698	pxor	%xmm4,%xmm4
3699	pxor	%xmm5,%xmm5
3700	movdqa	-128(%r11),%xmm0
3701	movdqa	-112(%r11),%xmm1
3702	movdqa	-96(%r11),%xmm2
3703	pand	-128(%rax),%xmm0
3704	movdqa	-80(%r11),%xmm3
3705	pand	-112(%rax),%xmm1
3706	por	%xmm0,%xmm4
3707	pand	-96(%rax),%xmm2
3708	por	%xmm1,%xmm5
3709	pand	-80(%rax),%xmm3
3710	por	%xmm2,%xmm4
3711	por	%xmm3,%xmm5
3712	movdqa	-64(%r11),%xmm0
3713	movdqa	-48(%r11),%xmm1
3714	movdqa	-32(%r11),%xmm2
3715	pand	-64(%rax),%xmm0
3716	movdqa	-16(%r11),%xmm3
3717	pand	-48(%rax),%xmm1
3718	por	%xmm0,%xmm4
3719	pand	-32(%rax),%xmm2
3720	por	%xmm1,%xmm5
3721	pand	-16(%rax),%xmm3
3722	por	%xmm2,%xmm4
3723	por	%xmm3,%xmm5
3724	movdqa	0(%r11),%xmm0
3725	movdqa	16(%r11),%xmm1
3726	movdqa	32(%r11),%xmm2
3727	pand	0(%rax),%xmm0
3728	movdqa	48(%r11),%xmm3
3729	pand	16(%rax),%xmm1
3730	por	%xmm0,%xmm4
3731	pand	32(%rax),%xmm2
3732	por	%xmm1,%xmm5
3733	pand	48(%rax),%xmm3
3734	por	%xmm2,%xmm4
3735	por	%xmm3,%xmm5
3736	movdqa	64(%r11),%xmm0
3737	movdqa	80(%r11),%xmm1
3738	movdqa	96(%r11),%xmm2
3739	pand	64(%rax),%xmm0
3740	movdqa	112(%r11),%xmm3
3741	pand	80(%rax),%xmm1
3742	por	%xmm0,%xmm4
3743	pand	96(%rax),%xmm2
3744	por	%xmm1,%xmm5
3745	pand	112(%rax),%xmm3
3746	por	%xmm2,%xmm4
3747	por	%xmm3,%xmm5
3748	por	%xmm5,%xmm4
3749	leaq	256(%r11),%r11
3750	pshufd	$0x4e,%xmm4,%xmm0
3751	por	%xmm4,%xmm0
3752	movq	%xmm0,(%rdi)
3753	leaq	8(%rdi),%rdi
3754	subl	$1,%esi
3755	jnz	.Lgather
3756
3757	leaq	(%r10),%rsp
3758	.byte	0xf3,0xc3
3759.LSEH_end_bn_gather5:
3760.size	bn_gather5,.-bn_gather5
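/*
 * .Linc: seed {0,0,1,1} and increment {2,2,2,2} used to generate the
 * lane indices that the pcmpeqd cascades compare against the power
 * index.
 */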
3761.align	64
3762.Linc:
3763.long	0,0, 1,1
3764.long	2,2, 2,2
3765.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
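/*
 * The .byte string above spells the ident
 * "Montgomery Multiplication with scatter/gather for x86_64,
 * CRYPTOGAMS by <appro@openssl.org>".
 */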
3766