/* x86_64-mont5.S revision 1.5 */
#include <machine/asm.h>
.text



6.globl	bn_mul_mont_gather5
7.type	bn_mul_mont_gather5,@function
8.align	64
9bn_mul_mont_gather5:
10	movl	%r9d,%r9d
11	movq	%rsp,%rax
12	testl	$7,%r9d
13	jnz	.Lmul_enter
14	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
15	jmp	.Lmul4x_enter
16
17.align	16
18.Lmul_enter:
19	movd	8(%rsp),%xmm5
20	pushq	%rbx
21	pushq	%rbp
22	pushq	%r12
23	pushq	%r13
24	pushq	%r14
25	pushq	%r15
26
27	negq	%r9
28	movq	%rsp,%r11
29	leaq	-280(%rsp,%r9,8),%r10
30	negq	%r9
31	andq	$-1024,%r10
32
33
34
35
36
37
38
39	subq	%r10,%r11
40	andq	$-4096,%r11
41	leaq	(%r10,%r11,1),%rsp
42	movq	(%rsp),%r11
43	cmpq	%r10,%rsp
44	ja	.Lmul_page_walk
45	jmp	.Lmul_page_walk_done
46
47.Lmul_page_walk:
48	leaq	-4096(%rsp),%rsp
49	movq	(%rsp),%r11
50	cmpq	%r10,%rsp
51	ja	.Lmul_page_walk
52.Lmul_page_walk_done:
53
54	leaq	.Linc(%rip),%r10
55	movq	%rax,8(%rsp,%r9,8)
56.Lmul_body:
57
58	leaq	128(%rdx),%r12
59	movdqa	0(%r10),%xmm0
60	movdqa	16(%r10),%xmm1
61	leaq	24-112(%rsp,%r9,8),%r10
62	andq	$-16,%r10
63
64	pshufd	$0,%xmm5,%xmm5
65	movdqa	%xmm1,%xmm4
66	movdqa	%xmm1,%xmm2
67	paddd	%xmm0,%xmm1
68	pcmpeqd	%xmm5,%xmm0
69.byte	0x67
70	movdqa	%xmm4,%xmm3
71	paddd	%xmm1,%xmm2
72	pcmpeqd	%xmm5,%xmm1
73	movdqa	%xmm0,112(%r10)
74	movdqa	%xmm4,%xmm0
75
76	paddd	%xmm2,%xmm3
77	pcmpeqd	%xmm5,%xmm2
78	movdqa	%xmm1,128(%r10)
79	movdqa	%xmm4,%xmm1
80
81	paddd	%xmm3,%xmm0
82	pcmpeqd	%xmm5,%xmm3
83	movdqa	%xmm2,144(%r10)
84	movdqa	%xmm4,%xmm2
85
86	paddd	%xmm0,%xmm1
87	pcmpeqd	%xmm5,%xmm0
88	movdqa	%xmm3,160(%r10)
89	movdqa	%xmm4,%xmm3
90	paddd	%xmm1,%xmm2
91	pcmpeqd	%xmm5,%xmm1
92	movdqa	%xmm0,176(%r10)
93	movdqa	%xmm4,%xmm0
94
95	paddd	%xmm2,%xmm3
96	pcmpeqd	%xmm5,%xmm2
97	movdqa	%xmm1,192(%r10)
98	movdqa	%xmm4,%xmm1
99
100	paddd	%xmm3,%xmm0
101	pcmpeqd	%xmm5,%xmm3
102	movdqa	%xmm2,208(%r10)
103	movdqa	%xmm4,%xmm2
104
105	paddd	%xmm0,%xmm1
106	pcmpeqd	%xmm5,%xmm0
107	movdqa	%xmm3,224(%r10)
108	movdqa	%xmm4,%xmm3
109	paddd	%xmm1,%xmm2
110	pcmpeqd	%xmm5,%xmm1
111	movdqa	%xmm0,240(%r10)
112	movdqa	%xmm4,%xmm0
113
114	paddd	%xmm2,%xmm3
115	pcmpeqd	%xmm5,%xmm2
116	movdqa	%xmm1,256(%r10)
117	movdqa	%xmm4,%xmm1
118
119	paddd	%xmm3,%xmm0
120	pcmpeqd	%xmm5,%xmm3
121	movdqa	%xmm2,272(%r10)
122	movdqa	%xmm4,%xmm2
123
124	paddd	%xmm0,%xmm1
125	pcmpeqd	%xmm5,%xmm0
126	movdqa	%xmm3,288(%r10)
127	movdqa	%xmm4,%xmm3
128	paddd	%xmm1,%xmm2
129	pcmpeqd	%xmm5,%xmm1
130	movdqa	%xmm0,304(%r10)
131
132	paddd	%xmm2,%xmm3
133.byte	0x67
134	pcmpeqd	%xmm5,%xmm2
135	movdqa	%xmm1,320(%r10)
136
137	pcmpeqd	%xmm5,%xmm3
138	movdqa	%xmm2,336(%r10)
139	pand	64(%r12),%xmm0
140
141	pand	80(%r12),%xmm1
142	pand	96(%r12),%xmm2
143	movdqa	%xmm3,352(%r10)
144	pand	112(%r12),%xmm3
145	por	%xmm2,%xmm0
146	por	%xmm3,%xmm1
147	movdqa	-128(%r12),%xmm4
148	movdqa	-112(%r12),%xmm5
149	movdqa	-96(%r12),%xmm2
150	pand	112(%r10),%xmm4
151	movdqa	-80(%r12),%xmm3
152	pand	128(%r10),%xmm5
153	por	%xmm4,%xmm0
154	pand	144(%r10),%xmm2
155	por	%xmm5,%xmm1
156	pand	160(%r10),%xmm3
157	por	%xmm2,%xmm0
158	por	%xmm3,%xmm1
159	movdqa	-64(%r12),%xmm4
160	movdqa	-48(%r12),%xmm5
161	movdqa	-32(%r12),%xmm2
162	pand	176(%r10),%xmm4
163	movdqa	-16(%r12),%xmm3
164	pand	192(%r10),%xmm5
165	por	%xmm4,%xmm0
166	pand	208(%r10),%xmm2
167	por	%xmm5,%xmm1
168	pand	224(%r10),%xmm3
169	por	%xmm2,%xmm0
170	por	%xmm3,%xmm1
171	movdqa	0(%r12),%xmm4
172	movdqa	16(%r12),%xmm5
173	movdqa	32(%r12),%xmm2
174	pand	240(%r10),%xmm4
175	movdqa	48(%r12),%xmm3
176	pand	256(%r10),%xmm5
177	por	%xmm4,%xmm0
178	pand	272(%r10),%xmm2
179	por	%xmm5,%xmm1
180	pand	288(%r10),%xmm3
181	por	%xmm2,%xmm0
182	por	%xmm3,%xmm1
183	por	%xmm1,%xmm0
184	pshufd	$0x4e,%xmm0,%xmm1
185	por	%xmm1,%xmm0
186	leaq	256(%r12),%r12
187.byte	102,72,15,126,195
188
189	movq	(%r8),%r8
190	movq	(%rsi),%rax
191
192	xorq	%r14,%r14
193	xorq	%r15,%r15
194
195	movq	%r8,%rbp
196	mulq	%rbx
197	movq	%rax,%r10
198	movq	(%rcx),%rax
199
200	imulq	%r10,%rbp
201	movq	%rdx,%r11
202
203	mulq	%rbp
204	addq	%rax,%r10
205	movq	8(%rsi),%rax
206	adcq	$0,%rdx
207	movq	%rdx,%r13
208
209	leaq	1(%r15),%r15
210	jmp	.L1st_enter
211
212.align	16
213.L1st:
214	addq	%rax,%r13
215	movq	(%rsi,%r15,8),%rax
216	adcq	$0,%rdx
217	addq	%r11,%r13
218	movq	%r10,%r11
219	adcq	$0,%rdx
220	movq	%r13,-16(%rsp,%r15,8)
221	movq	%rdx,%r13
222
223.L1st_enter:
224	mulq	%rbx
225	addq	%rax,%r11
226	movq	(%rcx,%r15,8),%rax
227	adcq	$0,%rdx
228	leaq	1(%r15),%r15
229	movq	%rdx,%r10
230
231	mulq	%rbp
232	cmpq	%r9,%r15
233	jne	.L1st
234
235
236	addq	%rax,%r13
237	adcq	$0,%rdx
238	addq	%r11,%r13
239	adcq	$0,%rdx
240	movq	%r13,-16(%rsp,%r9,8)
241	movq	%rdx,%r13
242	movq	%r10,%r11
243
244	xorq	%rdx,%rdx
245	addq	%r11,%r13
246	adcq	$0,%rdx
247	movq	%r13,-8(%rsp,%r9,8)
248	movq	%rdx,(%rsp,%r9,8)
249
250	leaq	1(%r14),%r14
251	jmp	.Louter
252.align	16
253.Louter:
254	leaq	24+128(%rsp,%r9,8),%rdx
255	andq	$-16,%rdx
256	pxor	%xmm4,%xmm4
257	pxor	%xmm5,%xmm5
258	movdqa	-128(%r12),%xmm0
259	movdqa	-112(%r12),%xmm1
260	movdqa	-96(%r12),%xmm2
261	movdqa	-80(%r12),%xmm3
262	pand	-128(%rdx),%xmm0
263	pand	-112(%rdx),%xmm1
264	por	%xmm0,%xmm4
265	pand	-96(%rdx),%xmm2
266	por	%xmm1,%xmm5
267	pand	-80(%rdx),%xmm3
268	por	%xmm2,%xmm4
269	por	%xmm3,%xmm5
270	movdqa	-64(%r12),%xmm0
271	movdqa	-48(%r12),%xmm1
272	movdqa	-32(%r12),%xmm2
273	movdqa	-16(%r12),%xmm3
274	pand	-64(%rdx),%xmm0
275	pand	-48(%rdx),%xmm1
276	por	%xmm0,%xmm4
277	pand	-32(%rdx),%xmm2
278	por	%xmm1,%xmm5
279	pand	-16(%rdx),%xmm3
280	por	%xmm2,%xmm4
281	por	%xmm3,%xmm5
282	movdqa	0(%r12),%xmm0
283	movdqa	16(%r12),%xmm1
284	movdqa	32(%r12),%xmm2
285	movdqa	48(%r12),%xmm3
286	pand	0(%rdx),%xmm0
287	pand	16(%rdx),%xmm1
288	por	%xmm0,%xmm4
289	pand	32(%rdx),%xmm2
290	por	%xmm1,%xmm5
291	pand	48(%rdx),%xmm3
292	por	%xmm2,%xmm4
293	por	%xmm3,%xmm5
294	movdqa	64(%r12),%xmm0
295	movdqa	80(%r12),%xmm1
296	movdqa	96(%r12),%xmm2
297	movdqa	112(%r12),%xmm3
298	pand	64(%rdx),%xmm0
299	pand	80(%rdx),%xmm1
300	por	%xmm0,%xmm4
301	pand	96(%rdx),%xmm2
302	por	%xmm1,%xmm5
303	pand	112(%rdx),%xmm3
304	por	%xmm2,%xmm4
305	por	%xmm3,%xmm5
306	por	%xmm5,%xmm4
307	pshufd	$0x4e,%xmm4,%xmm0
308	por	%xmm4,%xmm0
309	leaq	256(%r12),%r12
310
311	movq	(%rsi),%rax
312.byte	102,72,15,126,195
313
314	xorq	%r15,%r15
315	movq	%r8,%rbp
316	movq	(%rsp),%r10
317
318	mulq	%rbx
319	addq	%rax,%r10
320	movq	(%rcx),%rax
321	adcq	$0,%rdx
322
323	imulq	%r10,%rbp
324	movq	%rdx,%r11
325
326	mulq	%rbp
327	addq	%rax,%r10
328	movq	8(%rsi),%rax
329	adcq	$0,%rdx
330	movq	8(%rsp),%r10
331	movq	%rdx,%r13
332
333	leaq	1(%r15),%r15
334	jmp	.Linner_enter
335
336.align	16
337.Linner:
338	addq	%rax,%r13
339	movq	(%rsi,%r15,8),%rax
340	adcq	$0,%rdx
341	addq	%r10,%r13
342	movq	(%rsp,%r15,8),%r10
343	adcq	$0,%rdx
344	movq	%r13,-16(%rsp,%r15,8)
345	movq	%rdx,%r13
346
347.Linner_enter:
348	mulq	%rbx
349	addq	%rax,%r11
350	movq	(%rcx,%r15,8),%rax
351	adcq	$0,%rdx
352	addq	%r11,%r10
353	movq	%rdx,%r11
354	adcq	$0,%r11
355	leaq	1(%r15),%r15
356
357	mulq	%rbp
358	cmpq	%r9,%r15
359	jne	.Linner
360
361	addq	%rax,%r13
362	adcq	$0,%rdx
363	addq	%r10,%r13
364	movq	(%rsp,%r9,8),%r10
365	adcq	$0,%rdx
366	movq	%r13,-16(%rsp,%r9,8)
367	movq	%rdx,%r13
368
369	xorq	%rdx,%rdx
370	addq	%r11,%r13
371	adcq	$0,%rdx
372	addq	%r10,%r13
373	adcq	$0,%rdx
374	movq	%r13,-8(%rsp,%r9,8)
375	movq	%rdx,(%rsp,%r9,8)
376
377	leaq	1(%r14),%r14
378	cmpq	%r9,%r14
379	jb	.Louter
380
381	xorq	%r14,%r14
382	movq	(%rsp),%rax
383	leaq	(%rsp),%rsi
384	movq	%r9,%r15
385	jmp	.Lsub
386.align	16
387.Lsub:	sbbq	(%rcx,%r14,8),%rax
388	movq	%rax,(%rdi,%r14,8)
389	movq	8(%rsi,%r14,8),%rax
390	leaq	1(%r14),%r14
391	decq	%r15
392	jnz	.Lsub
393
394	sbbq	$0,%rax
395	xorq	%r14,%r14
396	andq	%rax,%rsi
397	notq	%rax
398	movq	%rdi,%rcx
399	andq	%rax,%rcx
400	movq	%r9,%r15
401	orq	%rcx,%rsi
402.align	16
403.Lcopy:
404	movq	(%rsi,%r14,8),%rax
405	movq	%r14,(%rsp,%r14,8)
406	movq	%rax,(%rdi,%r14,8)
407	leaq	1(%r14),%r14
408	subq	$1,%r15
409	jnz	.Lcopy
410
411	movq	8(%rsp,%r9,8),%rsi
412	movq	$1,%rax
413
414	movq	-48(%rsi),%r15
415	movq	-40(%rsi),%r14
416	movq	-32(%rsi),%r13
417	movq	-24(%rsi),%r12
418	movq	-16(%rsi),%rbp
419	movq	-8(%rsi),%rbx
420	leaq	(%rsi),%rsp
421.Lmul_epilogue:
422	.byte	0xf3,0xc3
423.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
424.type	bn_mul4x_mont_gather5,@function
425.align	32
426bn_mul4x_mont_gather5:
427.byte	0x67
428	movq	%rsp,%rax
429.Lmul4x_enter:
430	andl	$0x80108,%r11d
431	cmpl	$0x80108,%r11d
432	je	.Lmulx4x_enter
433	pushq	%rbx
434	pushq	%rbp
435	pushq	%r12
436	pushq	%r13
437	pushq	%r14
438	pushq	%r15
439.Lmul4x_prologue:
440
441.byte	0x67
442	shll	$3,%r9d
443	leaq	(%r9,%r9,2),%r10
444	negq	%r9
445
446
447
448
449
450
451
452
453
454
455	leaq	-320(%rsp,%r9,2),%r11
456	movq	%rsp,%rbp
457	subq	%rdi,%r11
458	andq	$4095,%r11
459	cmpq	%r11,%r10
460	jb	.Lmul4xsp_alt
461	subq	%r11,%rbp
462	leaq	-320(%rbp,%r9,2),%rbp
463	jmp	.Lmul4xsp_done
464
465.align	32
466.Lmul4xsp_alt:
467	leaq	4096-320(,%r9,2),%r10
468	leaq	-320(%rbp,%r9,2),%rbp
469	subq	%r10,%r11
470	movq	$0,%r10
471	cmovcq	%r10,%r11
472	subq	%r11,%rbp
473.Lmul4xsp_done:
474	andq	$-64,%rbp
475	movq	%rsp,%r11
476	subq	%rbp,%r11
477	andq	$-4096,%r11
478	leaq	(%r11,%rbp,1),%rsp
479	movq	(%rsp),%r10
480	cmpq	%rbp,%rsp
481	ja	.Lmul4x_page_walk
482	jmp	.Lmul4x_page_walk_done
483
484.Lmul4x_page_walk:
485	leaq	-4096(%rsp),%rsp
486	movq	(%rsp),%r10
487	cmpq	%rbp,%rsp
488	ja	.Lmul4x_page_walk
489.Lmul4x_page_walk_done:
490
491	negq	%r9
492
493	movq	%rax,40(%rsp)
494.Lmul4x_body:
495
496	call	mul4x_internal
497
498	movq	40(%rsp),%rsi
499	movq	$1,%rax
500
501	movq	-48(%rsi),%r15
502	movq	-40(%rsi),%r14
503	movq	-32(%rsi),%r13
504	movq	-24(%rsi),%r12
505	movq	-16(%rsi),%rbp
506	movq	-8(%rsi),%rbx
507	leaq	(%rsi),%rsp
508.Lmul4x_epilogue:
509	.byte	0xf3,0xc3
510.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
511
512.type	mul4x_internal,@function
513.align	32
514mul4x_internal:
515	shlq	$5,%r9
516	movd	8(%rax),%xmm5
517	leaq	.Linc(%rip),%rax
518	leaq	128(%rdx,%r9,1),%r13
519	shrq	$5,%r9
520	movdqa	0(%rax),%xmm0
521	movdqa	16(%rax),%xmm1
522	leaq	88-112(%rsp,%r9,1),%r10
523	leaq	128(%rdx),%r12
524
525	pshufd	$0,%xmm5,%xmm5
526	movdqa	%xmm1,%xmm4
527.byte	0x67,0x67
528	movdqa	%xmm1,%xmm2
529	paddd	%xmm0,%xmm1
530	pcmpeqd	%xmm5,%xmm0
531.byte	0x67
532	movdqa	%xmm4,%xmm3
533	paddd	%xmm1,%xmm2
534	pcmpeqd	%xmm5,%xmm1
535	movdqa	%xmm0,112(%r10)
536	movdqa	%xmm4,%xmm0
537
538	paddd	%xmm2,%xmm3
539	pcmpeqd	%xmm5,%xmm2
540	movdqa	%xmm1,128(%r10)
541	movdqa	%xmm4,%xmm1
542
543	paddd	%xmm3,%xmm0
544	pcmpeqd	%xmm5,%xmm3
545	movdqa	%xmm2,144(%r10)
546	movdqa	%xmm4,%xmm2
547
548	paddd	%xmm0,%xmm1
549	pcmpeqd	%xmm5,%xmm0
550	movdqa	%xmm3,160(%r10)
551	movdqa	%xmm4,%xmm3
552	paddd	%xmm1,%xmm2
553	pcmpeqd	%xmm5,%xmm1
554	movdqa	%xmm0,176(%r10)
555	movdqa	%xmm4,%xmm0
556
557	paddd	%xmm2,%xmm3
558	pcmpeqd	%xmm5,%xmm2
559	movdqa	%xmm1,192(%r10)
560	movdqa	%xmm4,%xmm1
561
562	paddd	%xmm3,%xmm0
563	pcmpeqd	%xmm5,%xmm3
564	movdqa	%xmm2,208(%r10)
565	movdqa	%xmm4,%xmm2
566
567	paddd	%xmm0,%xmm1
568	pcmpeqd	%xmm5,%xmm0
569	movdqa	%xmm3,224(%r10)
570	movdqa	%xmm4,%xmm3
571	paddd	%xmm1,%xmm2
572	pcmpeqd	%xmm5,%xmm1
573	movdqa	%xmm0,240(%r10)
574	movdqa	%xmm4,%xmm0
575
576	paddd	%xmm2,%xmm3
577	pcmpeqd	%xmm5,%xmm2
578	movdqa	%xmm1,256(%r10)
579	movdqa	%xmm4,%xmm1
580
581	paddd	%xmm3,%xmm0
582	pcmpeqd	%xmm5,%xmm3
583	movdqa	%xmm2,272(%r10)
584	movdqa	%xmm4,%xmm2
585
586	paddd	%xmm0,%xmm1
587	pcmpeqd	%xmm5,%xmm0
588	movdqa	%xmm3,288(%r10)
589	movdqa	%xmm4,%xmm3
590	paddd	%xmm1,%xmm2
591	pcmpeqd	%xmm5,%xmm1
592	movdqa	%xmm0,304(%r10)
593
594	paddd	%xmm2,%xmm3
595.byte	0x67
596	pcmpeqd	%xmm5,%xmm2
597	movdqa	%xmm1,320(%r10)
598
599	pcmpeqd	%xmm5,%xmm3
600	movdqa	%xmm2,336(%r10)
601	pand	64(%r12),%xmm0
602
603	pand	80(%r12),%xmm1
604	pand	96(%r12),%xmm2
605	movdqa	%xmm3,352(%r10)
606	pand	112(%r12),%xmm3
607	por	%xmm2,%xmm0
608	por	%xmm3,%xmm1
609	movdqa	-128(%r12),%xmm4
610	movdqa	-112(%r12),%xmm5
611	movdqa	-96(%r12),%xmm2
612	pand	112(%r10),%xmm4
613	movdqa	-80(%r12),%xmm3
614	pand	128(%r10),%xmm5
615	por	%xmm4,%xmm0
616	pand	144(%r10),%xmm2
617	por	%xmm5,%xmm1
618	pand	160(%r10),%xmm3
619	por	%xmm2,%xmm0
620	por	%xmm3,%xmm1
621	movdqa	-64(%r12),%xmm4
622	movdqa	-48(%r12),%xmm5
623	movdqa	-32(%r12),%xmm2
624	pand	176(%r10),%xmm4
625	movdqa	-16(%r12),%xmm3
626	pand	192(%r10),%xmm5
627	por	%xmm4,%xmm0
628	pand	208(%r10),%xmm2
629	por	%xmm5,%xmm1
630	pand	224(%r10),%xmm3
631	por	%xmm2,%xmm0
632	por	%xmm3,%xmm1
633	movdqa	0(%r12),%xmm4
634	movdqa	16(%r12),%xmm5
635	movdqa	32(%r12),%xmm2
636	pand	240(%r10),%xmm4
637	movdqa	48(%r12),%xmm3
638	pand	256(%r10),%xmm5
639	por	%xmm4,%xmm0
640	pand	272(%r10),%xmm2
641	por	%xmm5,%xmm1
642	pand	288(%r10),%xmm3
643	por	%xmm2,%xmm0
644	por	%xmm3,%xmm1
645	por	%xmm1,%xmm0
646	pshufd	$0x4e,%xmm0,%xmm1
647	por	%xmm1,%xmm0
648	leaq	256(%r12),%r12
649.byte	102,72,15,126,195
650
651	movq	%r13,16+8(%rsp)
652	movq	%rdi,56+8(%rsp)
653
654	movq	(%r8),%r8
655	movq	(%rsi),%rax
656	leaq	(%rsi,%r9,1),%rsi
657	negq	%r9
658
659	movq	%r8,%rbp
660	mulq	%rbx
661	movq	%rax,%r10
662	movq	(%rcx),%rax
663
664	imulq	%r10,%rbp
665	leaq	64+8(%rsp),%r14
666	movq	%rdx,%r11
667
668	mulq	%rbp
669	addq	%rax,%r10
670	movq	8(%rsi,%r9,1),%rax
671	adcq	$0,%rdx
672	movq	%rdx,%rdi
673
674	mulq	%rbx
675	addq	%rax,%r11
676	movq	8(%rcx),%rax
677	adcq	$0,%rdx
678	movq	%rdx,%r10
679
680	mulq	%rbp
681	addq	%rax,%rdi
682	movq	16(%rsi,%r9,1),%rax
683	adcq	$0,%rdx
684	addq	%r11,%rdi
685	leaq	32(%r9),%r15
686	leaq	32(%rcx),%rcx
687	adcq	$0,%rdx
688	movq	%rdi,(%r14)
689	movq	%rdx,%r13
690	jmp	.L1st4x
691
692.align	32
693.L1st4x:
694	mulq	%rbx
695	addq	%rax,%r10
696	movq	-16(%rcx),%rax
697	leaq	32(%r14),%r14
698	adcq	$0,%rdx
699	movq	%rdx,%r11
700
701	mulq	%rbp
702	addq	%rax,%r13
703	movq	-8(%rsi,%r15,1),%rax
704	adcq	$0,%rdx
705	addq	%r10,%r13
706	adcq	$0,%rdx
707	movq	%r13,-24(%r14)
708	movq	%rdx,%rdi
709
710	mulq	%rbx
711	addq	%rax,%r11
712	movq	-8(%rcx),%rax
713	adcq	$0,%rdx
714	movq	%rdx,%r10
715
716	mulq	%rbp
717	addq	%rax,%rdi
718	movq	(%rsi,%r15,1),%rax
719	adcq	$0,%rdx
720	addq	%r11,%rdi
721	adcq	$0,%rdx
722	movq	%rdi,-16(%r14)
723	movq	%rdx,%r13
724
725	mulq	%rbx
726	addq	%rax,%r10
727	movq	0(%rcx),%rax
728	adcq	$0,%rdx
729	movq	%rdx,%r11
730
731	mulq	%rbp
732	addq	%rax,%r13
733	movq	8(%rsi,%r15,1),%rax
734	adcq	$0,%rdx
735	addq	%r10,%r13
736	adcq	$0,%rdx
737	movq	%r13,-8(%r14)
738	movq	%rdx,%rdi
739
740	mulq	%rbx
741	addq	%rax,%r11
742	movq	8(%rcx),%rax
743	adcq	$0,%rdx
744	movq	%rdx,%r10
745
746	mulq	%rbp
747	addq	%rax,%rdi
748	movq	16(%rsi,%r15,1),%rax
749	adcq	$0,%rdx
750	addq	%r11,%rdi
751	leaq	32(%rcx),%rcx
752	adcq	$0,%rdx
753	movq	%rdi,(%r14)
754	movq	%rdx,%r13
755
756	addq	$32,%r15
757	jnz	.L1st4x
758
759	mulq	%rbx
760	addq	%rax,%r10
761	movq	-16(%rcx),%rax
762	leaq	32(%r14),%r14
763	adcq	$0,%rdx
764	movq	%rdx,%r11
765
766	mulq	%rbp
767	addq	%rax,%r13
768	movq	-8(%rsi),%rax
769	adcq	$0,%rdx
770	addq	%r10,%r13
771	adcq	$0,%rdx
772	movq	%r13,-24(%r14)
773	movq	%rdx,%rdi
774
775	mulq	%rbx
776	addq	%rax,%r11
777	movq	-8(%rcx),%rax
778	adcq	$0,%rdx
779	movq	%rdx,%r10
780
781	mulq	%rbp
782	addq	%rax,%rdi
783	movq	(%rsi,%r9,1),%rax
784	adcq	$0,%rdx
785	addq	%r11,%rdi
786	adcq	$0,%rdx
787	movq	%rdi,-16(%r14)
788	movq	%rdx,%r13
789
790	leaq	(%rcx,%r9,1),%rcx
791
792	xorq	%rdi,%rdi
793	addq	%r10,%r13
794	adcq	$0,%rdi
795	movq	%r13,-8(%r14)
796
797	jmp	.Louter4x
798
799.align	32
800.Louter4x:
801	leaq	16+128(%r14),%rdx
802	pxor	%xmm4,%xmm4
803	pxor	%xmm5,%xmm5
804	movdqa	-128(%r12),%xmm0
805	movdqa	-112(%r12),%xmm1
806	movdqa	-96(%r12),%xmm2
807	movdqa	-80(%r12),%xmm3
808	pand	-128(%rdx),%xmm0
809	pand	-112(%rdx),%xmm1
810	por	%xmm0,%xmm4
811	pand	-96(%rdx),%xmm2
812	por	%xmm1,%xmm5
813	pand	-80(%rdx),%xmm3
814	por	%xmm2,%xmm4
815	por	%xmm3,%xmm5
816	movdqa	-64(%r12),%xmm0
817	movdqa	-48(%r12),%xmm1
818	movdqa	-32(%r12),%xmm2
819	movdqa	-16(%r12),%xmm3
820	pand	-64(%rdx),%xmm0
821	pand	-48(%rdx),%xmm1
822	por	%xmm0,%xmm4
823	pand	-32(%rdx),%xmm2
824	por	%xmm1,%xmm5
825	pand	-16(%rdx),%xmm3
826	por	%xmm2,%xmm4
827	por	%xmm3,%xmm5
828	movdqa	0(%r12),%xmm0
829	movdqa	16(%r12),%xmm1
830	movdqa	32(%r12),%xmm2
831	movdqa	48(%r12),%xmm3
832	pand	0(%rdx),%xmm0
833	pand	16(%rdx),%xmm1
834	por	%xmm0,%xmm4
835	pand	32(%rdx),%xmm2
836	por	%xmm1,%xmm5
837	pand	48(%rdx),%xmm3
838	por	%xmm2,%xmm4
839	por	%xmm3,%xmm5
840	movdqa	64(%r12),%xmm0
841	movdqa	80(%r12),%xmm1
842	movdqa	96(%r12),%xmm2
843	movdqa	112(%r12),%xmm3
844	pand	64(%rdx),%xmm0
845	pand	80(%rdx),%xmm1
846	por	%xmm0,%xmm4
847	pand	96(%rdx),%xmm2
848	por	%xmm1,%xmm5
849	pand	112(%rdx),%xmm3
850	por	%xmm2,%xmm4
851	por	%xmm3,%xmm5
852	por	%xmm5,%xmm4
853	pshufd	$0x4e,%xmm4,%xmm0
854	por	%xmm4,%xmm0
855	leaq	256(%r12),%r12
856.byte	102,72,15,126,195
857
858	movq	(%r14,%r9,1),%r10
859	movq	%r8,%rbp
860	mulq	%rbx
861	addq	%rax,%r10
862	movq	(%rcx),%rax
863	adcq	$0,%rdx
864
865	imulq	%r10,%rbp
866	movq	%rdx,%r11
867	movq	%rdi,(%r14)
868
869	leaq	(%r14,%r9,1),%r14
870
871	mulq	%rbp
872	addq	%rax,%r10
873	movq	8(%rsi,%r9,1),%rax
874	adcq	$0,%rdx
875	movq	%rdx,%rdi
876
877	mulq	%rbx
878	addq	%rax,%r11
879	movq	8(%rcx),%rax
880	adcq	$0,%rdx
881	addq	8(%r14),%r11
882	adcq	$0,%rdx
883	movq	%rdx,%r10
884
885	mulq	%rbp
886	addq	%rax,%rdi
887	movq	16(%rsi,%r9,1),%rax
888	adcq	$0,%rdx
889	addq	%r11,%rdi
890	leaq	32(%r9),%r15
891	leaq	32(%rcx),%rcx
892	adcq	$0,%rdx
893	movq	%rdx,%r13
894	jmp	.Linner4x
895
896.align	32
897.Linner4x:
898	mulq	%rbx
899	addq	%rax,%r10
900	movq	-16(%rcx),%rax
901	adcq	$0,%rdx
902	addq	16(%r14),%r10
903	leaq	32(%r14),%r14
904	adcq	$0,%rdx
905	movq	%rdx,%r11
906
907	mulq	%rbp
908	addq	%rax,%r13
909	movq	-8(%rsi,%r15,1),%rax
910	adcq	$0,%rdx
911	addq	%r10,%r13
912	adcq	$0,%rdx
913	movq	%rdi,-32(%r14)
914	movq	%rdx,%rdi
915
916	mulq	%rbx
917	addq	%rax,%r11
918	movq	-8(%rcx),%rax
919	adcq	$0,%rdx
920	addq	-8(%r14),%r11
921	adcq	$0,%rdx
922	movq	%rdx,%r10
923
924	mulq	%rbp
925	addq	%rax,%rdi
926	movq	(%rsi,%r15,1),%rax
927	adcq	$0,%rdx
928	addq	%r11,%rdi
929	adcq	$0,%rdx
930	movq	%r13,-24(%r14)
931	movq	%rdx,%r13
932
933	mulq	%rbx
934	addq	%rax,%r10
935	movq	0(%rcx),%rax
936	adcq	$0,%rdx
937	addq	(%r14),%r10
938	adcq	$0,%rdx
939	movq	%rdx,%r11
940
941	mulq	%rbp
942	addq	%rax,%r13
943	movq	8(%rsi,%r15,1),%rax
944	adcq	$0,%rdx
945	addq	%r10,%r13
946	adcq	$0,%rdx
947	movq	%rdi,-16(%r14)
948	movq	%rdx,%rdi
949
950	mulq	%rbx
951	addq	%rax,%r11
952	movq	8(%rcx),%rax
953	adcq	$0,%rdx
954	addq	8(%r14),%r11
955	adcq	$0,%rdx
956	movq	%rdx,%r10
957
958	mulq	%rbp
959	addq	%rax,%rdi
960	movq	16(%rsi,%r15,1),%rax
961	adcq	$0,%rdx
962	addq	%r11,%rdi
963	leaq	32(%rcx),%rcx
964	adcq	$0,%rdx
965	movq	%r13,-8(%r14)
966	movq	%rdx,%r13
967
968	addq	$32,%r15
969	jnz	.Linner4x
970
971	mulq	%rbx
972	addq	%rax,%r10
973	movq	-16(%rcx),%rax
974	adcq	$0,%rdx
975	addq	16(%r14),%r10
976	leaq	32(%r14),%r14
977	adcq	$0,%rdx
978	movq	%rdx,%r11
979
980	mulq	%rbp
981	addq	%rax,%r13
982	movq	-8(%rsi),%rax
983	adcq	$0,%rdx
984	addq	%r10,%r13
985	adcq	$0,%rdx
986	movq	%rdi,-32(%r14)
987	movq	%rdx,%rdi
988
989	mulq	%rbx
990	addq	%rax,%r11
991	movq	%rbp,%rax
992	movq	-8(%rcx),%rbp
993	adcq	$0,%rdx
994	addq	-8(%r14),%r11
995	adcq	$0,%rdx
996	movq	%rdx,%r10
997
998	mulq	%rbp
999	addq	%rax,%rdi
1000	movq	(%rsi,%r9,1),%rax
1001	adcq	$0,%rdx
1002	addq	%r11,%rdi
1003	adcq	$0,%rdx
1004	movq	%r13,-24(%r14)
1005	movq	%rdx,%r13
1006
1007	movq	%rdi,-16(%r14)
1008	leaq	(%rcx,%r9,1),%rcx
1009
1010	xorq	%rdi,%rdi
1011	addq	%r10,%r13
1012	adcq	$0,%rdi
1013	addq	(%r14),%r13
1014	adcq	$0,%rdi
1015	movq	%r13,-8(%r14)
1016
1017	cmpq	16+8(%rsp),%r12
1018	jb	.Louter4x
1019	xorq	%rax,%rax
1020	subq	%r13,%rbp
1021	adcq	%r15,%r15
1022	orq	%r15,%rdi
1023	subq	%rdi,%rax
1024	leaq	(%r14,%r9,1),%rbx
1025	movq	(%rcx),%r12
1026	leaq	(%rcx),%rbp
1027	movq	%r9,%rcx
1028	sarq	$3+2,%rcx
1029	movq	56+8(%rsp),%rdi
1030	decq	%r12
1031	xorq	%r10,%r10
1032	movq	8(%rbp),%r13
1033	movq	16(%rbp),%r14
1034	movq	24(%rbp),%r15
1035	jmp	.Lsqr4x_sub_entry
1036.size	mul4x_internal,.-mul4x_internal
1037.globl	bn_power5
1038.type	bn_power5,@function
1039.align	32
1040bn_power5:
1041	movq	%rsp,%rax
1042	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
1043	andl	$0x80108,%r11d
1044	cmpl	$0x80108,%r11d
1045	je	.Lpowerx5_enter
1046	pushq	%rbx
1047	pushq	%rbp
1048	pushq	%r12
1049	pushq	%r13
1050	pushq	%r14
1051	pushq	%r15
1052.Lpower5_prologue:
1053
1054	shll	$3,%r9d
1055	leal	(%r9,%r9,2),%r10d
1056	negq	%r9
1057	movq	(%r8),%r8
1058
1059
1060
1061
1062
1063
1064
1065
1066	leaq	-320(%rsp,%r9,2),%r11
1067	movq	%rsp,%rbp
1068	subq	%rdi,%r11
1069	andq	$4095,%r11
1070	cmpq	%r11,%r10
1071	jb	.Lpwr_sp_alt
1072	subq	%r11,%rbp
1073	leaq	-320(%rbp,%r9,2),%rbp
1074	jmp	.Lpwr_sp_done
1075
1076.align	32
1077.Lpwr_sp_alt:
1078	leaq	4096-320(,%r9,2),%r10
1079	leaq	-320(%rbp,%r9,2),%rbp
1080	subq	%r10,%r11
1081	movq	$0,%r10
1082	cmovcq	%r10,%r11
1083	subq	%r11,%rbp
1084.Lpwr_sp_done:
1085	andq	$-64,%rbp
1086	movq	%rsp,%r11
1087	subq	%rbp,%r11
1088	andq	$-4096,%r11
1089	leaq	(%r11,%rbp,1),%rsp
1090	movq	(%rsp),%r10
1091	cmpq	%rbp,%rsp
1092	ja	.Lpwr_page_walk
1093	jmp	.Lpwr_page_walk_done
1094
1095.Lpwr_page_walk:
1096	leaq	-4096(%rsp),%rsp
1097	movq	(%rsp),%r10
1098	cmpq	%rbp,%rsp
1099	ja	.Lpwr_page_walk
1100.Lpwr_page_walk_done:
1101
1102	movq	%r9,%r10
1103	negq	%r9
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114	movq	%r8,32(%rsp)
1115	movq	%rax,40(%rsp)
1116.Lpower5_body:
1117.byte	102,72,15,110,207
1118.byte	102,72,15,110,209
1119.byte	102,73,15,110,218
1120.byte	102,72,15,110,226
1121
1122	call	__bn_sqr8x_internal
1123	call	__bn_post4x_internal
1124	call	__bn_sqr8x_internal
1125	call	__bn_post4x_internal
1126	call	__bn_sqr8x_internal
1127	call	__bn_post4x_internal
1128	call	__bn_sqr8x_internal
1129	call	__bn_post4x_internal
1130	call	__bn_sqr8x_internal
1131	call	__bn_post4x_internal
1132
1133.byte	102,72,15,126,209
1134.byte	102,72,15,126,226
1135	movq	%rsi,%rdi
1136	movq	40(%rsp),%rax
1137	leaq	32(%rsp),%r8
1138
1139	call	mul4x_internal
1140
1141	movq	40(%rsp),%rsi
1142	movq	$1,%rax
1143	movq	-48(%rsi),%r15
1144	movq	-40(%rsi),%r14
1145	movq	-32(%rsi),%r13
1146	movq	-24(%rsi),%r12
1147	movq	-16(%rsi),%rbp
1148	movq	-8(%rsi),%rbx
1149	leaq	(%rsi),%rsp
1150.Lpower5_epilogue:
1151	.byte	0xf3,0xc3
1152.size	bn_power5,.-bn_power5
1153
1154.globl	bn_sqr8x_internal
1155.hidden	bn_sqr8x_internal
1156.type	bn_sqr8x_internal,@function
1157.align	32
1158bn_sqr8x_internal:
1159__bn_sqr8x_internal:
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233	leaq	32(%r10),%rbp
1234	leaq	(%rsi,%r9,1),%rsi
1235
1236	movq	%r9,%rcx
1237
1238
1239	movq	-32(%rsi,%rbp,1),%r14
1240	leaq	48+8(%rsp,%r9,2),%rdi
1241	movq	-24(%rsi,%rbp,1),%rax
1242	leaq	-32(%rdi,%rbp,1),%rdi
1243	movq	-16(%rsi,%rbp,1),%rbx
1244	movq	%rax,%r15
1245
1246	mulq	%r14
1247	movq	%rax,%r10
1248	movq	%rbx,%rax
1249	movq	%rdx,%r11
1250	movq	%r10,-24(%rdi,%rbp,1)
1251
1252	mulq	%r14
1253	addq	%rax,%r11
1254	movq	%rbx,%rax
1255	adcq	$0,%rdx
1256	movq	%r11,-16(%rdi,%rbp,1)
1257	movq	%rdx,%r10
1258
1259
1260	movq	-8(%rsi,%rbp,1),%rbx
1261	mulq	%r15
1262	movq	%rax,%r12
1263	movq	%rbx,%rax
1264	movq	%rdx,%r13
1265
1266	leaq	(%rbp),%rcx
1267	mulq	%r14
1268	addq	%rax,%r10
1269	movq	%rbx,%rax
1270	movq	%rdx,%r11
1271	adcq	$0,%r11
1272	addq	%r12,%r10
1273	adcq	$0,%r11
1274	movq	%r10,-8(%rdi,%rcx,1)
1275	jmp	.Lsqr4x_1st
1276
1277.align	32
1278.Lsqr4x_1st:
1279	movq	(%rsi,%rcx,1),%rbx
1280	mulq	%r15
1281	addq	%rax,%r13
1282	movq	%rbx,%rax
1283	movq	%rdx,%r12
1284	adcq	$0,%r12
1285
1286	mulq	%r14
1287	addq	%rax,%r11
1288	movq	%rbx,%rax
1289	movq	8(%rsi,%rcx,1),%rbx
1290	movq	%rdx,%r10
1291	adcq	$0,%r10
1292	addq	%r13,%r11
1293	adcq	$0,%r10
1294
1295
1296	mulq	%r15
1297	addq	%rax,%r12
1298	movq	%rbx,%rax
1299	movq	%r11,(%rdi,%rcx,1)
1300	movq	%rdx,%r13
1301	adcq	$0,%r13
1302
1303	mulq	%r14
1304	addq	%rax,%r10
1305	movq	%rbx,%rax
1306	movq	16(%rsi,%rcx,1),%rbx
1307	movq	%rdx,%r11
1308	adcq	$0,%r11
1309	addq	%r12,%r10
1310	adcq	$0,%r11
1311
1312	mulq	%r15
1313	addq	%rax,%r13
1314	movq	%rbx,%rax
1315	movq	%r10,8(%rdi,%rcx,1)
1316	movq	%rdx,%r12
1317	adcq	$0,%r12
1318
1319	mulq	%r14
1320	addq	%rax,%r11
1321	movq	%rbx,%rax
1322	movq	24(%rsi,%rcx,1),%rbx
1323	movq	%rdx,%r10
1324	adcq	$0,%r10
1325	addq	%r13,%r11
1326	adcq	$0,%r10
1327
1328
1329	mulq	%r15
1330	addq	%rax,%r12
1331	movq	%rbx,%rax
1332	movq	%r11,16(%rdi,%rcx,1)
1333	movq	%rdx,%r13
1334	adcq	$0,%r13
1335	leaq	32(%rcx),%rcx
1336
1337	mulq	%r14
1338	addq	%rax,%r10
1339	movq	%rbx,%rax
1340	movq	%rdx,%r11
1341	adcq	$0,%r11
1342	addq	%r12,%r10
1343	adcq	$0,%r11
1344	movq	%r10,-8(%rdi,%rcx,1)
1345
1346	cmpq	$0,%rcx
1347	jne	.Lsqr4x_1st
1348
1349	mulq	%r15
1350	addq	%rax,%r13
1351	leaq	16(%rbp),%rbp
1352	adcq	$0,%rdx
1353	addq	%r11,%r13
1354	adcq	$0,%rdx
1355
1356	movq	%r13,(%rdi)
1357	movq	%rdx,%r12
1358	movq	%rdx,8(%rdi)
1359	jmp	.Lsqr4x_outer
1360
1361.align	32
1362.Lsqr4x_outer:
1363	movq	-32(%rsi,%rbp,1),%r14
1364	leaq	48+8(%rsp,%r9,2),%rdi
1365	movq	-24(%rsi,%rbp,1),%rax
1366	leaq	-32(%rdi,%rbp,1),%rdi
1367	movq	-16(%rsi,%rbp,1),%rbx
1368	movq	%rax,%r15
1369
1370	mulq	%r14
1371	movq	-24(%rdi,%rbp,1),%r10
1372	addq	%rax,%r10
1373	movq	%rbx,%rax
1374	adcq	$0,%rdx
1375	movq	%r10,-24(%rdi,%rbp,1)
1376	movq	%rdx,%r11
1377
1378	mulq	%r14
1379	addq	%rax,%r11
1380	movq	%rbx,%rax
1381	adcq	$0,%rdx
1382	addq	-16(%rdi,%rbp,1),%r11
1383	movq	%rdx,%r10
1384	adcq	$0,%r10
1385	movq	%r11,-16(%rdi,%rbp,1)
1386
1387	xorq	%r12,%r12
1388
1389	movq	-8(%rsi,%rbp,1),%rbx
1390	mulq	%r15
1391	addq	%rax,%r12
1392	movq	%rbx,%rax
1393	adcq	$0,%rdx
1394	addq	-8(%rdi,%rbp,1),%r12
1395	movq	%rdx,%r13
1396	adcq	$0,%r13
1397
1398	mulq	%r14
1399	addq	%rax,%r10
1400	movq	%rbx,%rax
1401	adcq	$0,%rdx
1402	addq	%r12,%r10
1403	movq	%rdx,%r11
1404	adcq	$0,%r11
1405	movq	%r10,-8(%rdi,%rbp,1)
1406
1407	leaq	(%rbp),%rcx
1408	jmp	.Lsqr4x_inner
1409
1410.align	32
1411.Lsqr4x_inner:
1412	movq	(%rsi,%rcx,1),%rbx
1413	mulq	%r15
1414	addq	%rax,%r13
1415	movq	%rbx,%rax
1416	movq	%rdx,%r12
1417	adcq	$0,%r12
1418	addq	(%rdi,%rcx,1),%r13
1419	adcq	$0,%r12
1420
1421.byte	0x67
1422	mulq	%r14
1423	addq	%rax,%r11
1424	movq	%rbx,%rax
1425	movq	8(%rsi,%rcx,1),%rbx
1426	movq	%rdx,%r10
1427	adcq	$0,%r10
1428	addq	%r13,%r11
1429	adcq	$0,%r10
1430
1431	mulq	%r15
1432	addq	%rax,%r12
1433	movq	%r11,(%rdi,%rcx,1)
1434	movq	%rbx,%rax
1435	movq	%rdx,%r13
1436	adcq	$0,%r13
1437	addq	8(%rdi,%rcx,1),%r12
1438	leaq	16(%rcx),%rcx
1439	adcq	$0,%r13
1440
1441	mulq	%r14
1442	addq	%rax,%r10
1443	movq	%rbx,%rax
1444	adcq	$0,%rdx
1445	addq	%r12,%r10
1446	movq	%rdx,%r11
1447	adcq	$0,%r11
1448	movq	%r10,-8(%rdi,%rcx,1)
1449
1450	cmpq	$0,%rcx
1451	jne	.Lsqr4x_inner
1452
1453.byte	0x67
1454	mulq	%r15
1455	addq	%rax,%r13
1456	adcq	$0,%rdx
1457	addq	%r11,%r13
1458	adcq	$0,%rdx
1459
1460	movq	%r13,(%rdi)
1461	movq	%rdx,%r12
1462	movq	%rdx,8(%rdi)
1463
1464	addq	$16,%rbp
1465	jnz	.Lsqr4x_outer
1466
1467
1468	movq	-32(%rsi),%r14
1469	leaq	48+8(%rsp,%r9,2),%rdi
1470	movq	-24(%rsi),%rax
1471	leaq	-32(%rdi,%rbp,1),%rdi
1472	movq	-16(%rsi),%rbx
1473	movq	%rax,%r15
1474
1475	mulq	%r14
1476	addq	%rax,%r10
1477	movq	%rbx,%rax
1478	movq	%rdx,%r11
1479	adcq	$0,%r11
1480
1481	mulq	%r14
1482	addq	%rax,%r11
1483	movq	%rbx,%rax
1484	movq	%r10,-24(%rdi)
1485	movq	%rdx,%r10
1486	adcq	$0,%r10
1487	addq	%r13,%r11
1488	movq	-8(%rsi),%rbx
1489	adcq	$0,%r10
1490
1491	mulq	%r15
1492	addq	%rax,%r12
1493	movq	%rbx,%rax
1494	movq	%r11,-16(%rdi)
1495	movq	%rdx,%r13
1496	adcq	$0,%r13
1497
1498	mulq	%r14
1499	addq	%rax,%r10
1500	movq	%rbx,%rax
1501	movq	%rdx,%r11
1502	adcq	$0,%r11
1503	addq	%r12,%r10
1504	adcq	$0,%r11
1505	movq	%r10,-8(%rdi)
1506
1507	mulq	%r15
1508	addq	%rax,%r13
1509	movq	-16(%rsi),%rax
1510	adcq	$0,%rdx
1511	addq	%r11,%r13
1512	adcq	$0,%rdx
1513
1514	movq	%r13,(%rdi)
1515	movq	%rdx,%r12
1516	movq	%rdx,8(%rdi)
1517
1518	mulq	%rbx
1519	addq	$16,%rbp
1520	xorq	%r14,%r14
1521	subq	%r9,%rbp
1522	xorq	%r15,%r15
1523
1524	addq	%r12,%rax
1525	adcq	$0,%rdx
1526	movq	%rax,8(%rdi)
1527	movq	%rdx,16(%rdi)
1528	movq	%r15,24(%rdi)
1529
1530	movq	-16(%rsi,%rbp,1),%rax
1531	leaq	48+8(%rsp),%rdi
1532	xorq	%r10,%r10
1533	movq	8(%rdi),%r11
1534
1535	leaq	(%r14,%r10,2),%r12
1536	shrq	$63,%r10
1537	leaq	(%rcx,%r11,2),%r13
1538	shrq	$63,%r11
1539	orq	%r10,%r13
1540	movq	16(%rdi),%r10
1541	movq	%r11,%r14
1542	mulq	%rax
1543	negq	%r15
1544	movq	24(%rdi),%r11
1545	adcq	%rax,%r12
1546	movq	-8(%rsi,%rbp,1),%rax
1547	movq	%r12,(%rdi)
1548	adcq	%rdx,%r13
1549
1550	leaq	(%r14,%r10,2),%rbx
1551	movq	%r13,8(%rdi)
1552	sbbq	%r15,%r15
1553	shrq	$63,%r10
1554	leaq	(%rcx,%r11,2),%r8
1555	shrq	$63,%r11
1556	orq	%r10,%r8
1557	movq	32(%rdi),%r10
1558	movq	%r11,%r14
1559	mulq	%rax
1560	negq	%r15
1561	movq	40(%rdi),%r11
1562	adcq	%rax,%rbx
1563	movq	0(%rsi,%rbp,1),%rax
1564	movq	%rbx,16(%rdi)
1565	adcq	%rdx,%r8
1566	leaq	16(%rbp),%rbp
1567	movq	%r8,24(%rdi)
1568	sbbq	%r15,%r15
1569	leaq	64(%rdi),%rdi
1570	jmp	.Lsqr4x_shift_n_add
1571
1572.align	32
1573.Lsqr4x_shift_n_add:
1574	leaq	(%r14,%r10,2),%r12
1575	shrq	$63,%r10
1576	leaq	(%rcx,%r11,2),%r13
1577	shrq	$63,%r11
1578	orq	%r10,%r13
1579	movq	-16(%rdi),%r10
1580	movq	%r11,%r14
1581	mulq	%rax
1582	negq	%r15
1583	movq	-8(%rdi),%r11
1584	adcq	%rax,%r12
1585	movq	-8(%rsi,%rbp,1),%rax
1586	movq	%r12,-32(%rdi)
1587	adcq	%rdx,%r13
1588
1589	leaq	(%r14,%r10,2),%rbx
1590	movq	%r13,-24(%rdi)
1591	sbbq	%r15,%r15
1592	shrq	$63,%r10
1593	leaq	(%rcx,%r11,2),%r8
1594	shrq	$63,%r11
1595	orq	%r10,%r8
1596	movq	0(%rdi),%r10
1597	movq	%r11,%r14
1598	mulq	%rax
1599	negq	%r15
1600	movq	8(%rdi),%r11
1601	adcq	%rax,%rbx
1602	movq	0(%rsi,%rbp,1),%rax
1603	movq	%rbx,-16(%rdi)
1604	adcq	%rdx,%r8
1605
1606	leaq	(%r14,%r10,2),%r12
1607	movq	%r8,-8(%rdi)
1608	sbbq	%r15,%r15
1609	shrq	$63,%r10
1610	leaq	(%rcx,%r11,2),%r13
1611	shrq	$63,%r11
1612	orq	%r10,%r13
1613	movq	16(%rdi),%r10
1614	movq	%r11,%r14
1615	mulq	%rax
1616	negq	%r15
1617	movq	24(%rdi),%r11
1618	adcq	%rax,%r12
1619	movq	8(%rsi,%rbp,1),%rax
1620	movq	%r12,0(%rdi)
1621	adcq	%rdx,%r13
1622
1623	leaq	(%r14,%r10,2),%rbx
1624	movq	%r13,8(%rdi)
1625	sbbq	%r15,%r15
1626	shrq	$63,%r10
1627	leaq	(%rcx,%r11,2),%r8
1628	shrq	$63,%r11
1629	orq	%r10,%r8
1630	movq	32(%rdi),%r10
1631	movq	%r11,%r14
1632	mulq	%rax
1633	negq	%r15
1634	movq	40(%rdi),%r11
1635	adcq	%rax,%rbx
1636	movq	16(%rsi,%rbp,1),%rax
1637	movq	%rbx,16(%rdi)
1638	adcq	%rdx,%r8
1639	movq	%r8,24(%rdi)
1640	sbbq	%r15,%r15
1641	leaq	64(%rdi),%rdi
1642	addq	$32,%rbp
1643	jnz	.Lsqr4x_shift_n_add
1644
1645	leaq	(%r14,%r10,2),%r12
1646.byte	0x67
1647	shrq	$63,%r10
1648	leaq	(%rcx,%r11,2),%r13
1649	shrq	$63,%r11
1650	orq	%r10,%r13
1651	movq	-16(%rdi),%r10
1652	movq	%r11,%r14
1653	mulq	%rax
1654	negq	%r15
1655	movq	-8(%rdi),%r11
1656	adcq	%rax,%r12
1657	movq	-8(%rsi),%rax
1658	movq	%r12,-32(%rdi)
1659	adcq	%rdx,%r13
1660
1661	leaq	(%r14,%r10,2),%rbx
1662	movq	%r13,-24(%rdi)
1663	sbbq	%r15,%r15
1664	shrq	$63,%r10
1665	leaq	(%rcx,%r11,2),%r8
1666	shrq	$63,%r11
1667	orq	%r10,%r8
1668	mulq	%rax
1669	negq	%r15
1670	adcq	%rax,%rbx
1671	adcq	%rdx,%r8
1672	movq	%rbx,-16(%rdi)
1673	movq	%r8,-8(%rdi)
1674.byte	102,72,15,126,213
/*
 * __bn_sqr8x_reduction — Montgomery reduction of the 2n-word product
 * accumulated in the on-stack temporary area.
 *
 * Register/stack roles (inferred from callers in this file, e.g.
 * bn_from_mont8x — NOTE(review): confirm against bn_sqr8x_internal too):
 *   %rbp        - pointer to the modulus n[]
 *   %r9         - byte length of the operand (n*8), negated below
 *   32+8(%rsp)  - n0 (Montgomery constant), stored by the caller at
 *                 32(%rsp) before the call (call pushed 8 bytes)
 *   48+8(%rsp)  - base of the temporary product area tp[]
 * Processes 8 words of tp[] per outer iteration using the plain
 * mul/adc carry chain (non-ADX path).  Flags are live across long
 * stretches; instruction order must not be disturbed.
 */
__bn_sqr8x_reduction:
	xorq	%rax,%rax
	leaq	(%r9,%rbp,1),%rcx
	leaq	48+8(%rsp,%r9,2),%rdx
	movq	%rcx,0+8(%rsp)
	leaq	48+8(%rsp,%r9,1),%rdi
	movq	%rdx,8+8(%rsp)
	negq	%r9
	jmp	.L8x_reduction_loop

.align	32
.L8x_reduction_loop:
	/* load next 8 words of tp[] into %rbx,%r9..%r15 */
	leaq	(%rdi,%r9,1),%rdi
.byte	0x66
	movq	0(%rdi),%rbx
	movq	8(%rdi),%r9
	movq	16(%rdi),%r10
	movq	24(%rdi),%r11
	movq	32(%rdi),%r12
	movq	40(%rdi),%r13
	movq	48(%rdi),%r14
	movq	56(%rdi),%r15
	movq	%rax,(%rdx)
	leaq	64(%rdi),%rdi

.byte	0x67
	movq	%rbx,%r8
	imulq	32+8(%rsp),%rbx		/* m = tp[0]*n0 mod 2^64 */
	movq	0(%rbp),%rax
	movl	$8,%ecx
	jmp	.L8x_reduce

.align	32
.L8x_reduce:
	/* one reduction step: add m*n[0..7] into the 8-word window;
	 * each mulq uses the implicit %rax/%rdx pair */
	mulq	%rbx
	movq	8(%rbp),%rax
	negq	%r8
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	movq	%rbx,48-8+8(%rsp,%rcx,8)	/* stash m for the tail pass */
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	32+8(%rsp),%rsi		/* reload n0 */
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	imulq	%r8,%rsi		/* next m = tp*n0, computed early */
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rbp),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	%rsi,%rbx		/* rotate in the precomputed m */
	addq	%rax,%r15
	movq	0(%rbp),%rax
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	.L8x_reduce

	leaq	64(%rbp),%rbp
	xorq	%rax,%rax
	movq	8+8(%rsp),%rdx
	cmpq	0+8(%rsp),%rbp		/* end of modulus reached? */
	jae	.L8x_no_tail

	/* fold in the next 8 words of tp[] and continue with the
	 * stashed m values over the remaining modulus words */
.byte	0x66
	addq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi		/* %rsi = -carry */

	movq	48+56+8(%rsp),%rbx
	movl	$8,%ecx
	movq	0(%rbp),%rax
	jmp	.L8x_tail

.align	32
.L8x_tail:
	mulq	%rbx
	addq	%rax,%r8
	movq	8(%rbp),%rax
	movq	%r8,(%rdi)		/* store reduced word */
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	leaq	8(%rdi),%rdi
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rbp),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	48-16+8(%rsp,%rcx,8),%rbx	/* next stashed m */
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	0(%rbp),%rax
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	.L8x_tail

	leaq	64(%rbp),%rbp
	movq	8+8(%rsp),%rdx
	cmpq	0+8(%rsp),%rbp
	jae	.L8x_tail_done

	/* more modulus words remain: absorb next tp[] chunk, restoring
	 * the saved borrow from %rsi first */
	movq	48+56+8(%rsp),%rbx
	negq	%rsi
	movq	0(%rbp),%rax
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi

	movl	$8,%ecx
	jmp	.L8x_tail

.align	32
.L8x_tail_done:
	/* fold top-word carry saved at (%rdx) into the window */
	xorq	%rax,%rax
	addq	(%rdx),%r8
	adcq	$0,%r9
	adcq	$0,%r10
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15
	adcq	$0,%rax

	negq	%rsi
.L8x_no_tail:
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	adcq	$0,%rax			/* %rax = top-most carry */
	movq	-8(%rbp),%rcx
	xorq	%rsi,%rsi

.byte	102,72,15,126,213		/* movq %xmm5,%rbp — restore modulus ptr (presumed; verify encoding) */

	movq	%r8,0(%rdi)
	movq	%r9,8(%rdi)
.byte	102,73,15,126,217		/* movq %xmm3,%r9 (presumed; verify encoding) */
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)
	leaq	64(%rdi),%rdi

	cmpq	%rdx,%rdi
	jb	.L8x_reduction_loop
	.byte	0xf3,0xc3		/* rep ret */
.size	bn_sqr8x_internal,.-bn_sqr8x_internal
/*
 * __bn_post4x_internal — final conditional subtraction after
 * reduction: writes result = tp - (n & mask) to rp, 4 words per
 * iteration.  %rax on entry is the top carry from the reduction;
 * after negq it acts as an all-ones/all-zero mask applied to the
 * complemented modulus words, so the subtraction happens only when
 * needed (branch-free, constant time).
 * Roles: %rbp = modulus, %rdi/%rbx walk result/temp, %r9 = n*8.
 */
.type	__bn_post4x_internal,@function
.align	32
__bn_post4x_internal:
	movq	0(%rbp),%r12
	leaq	(%rdi,%r9,1),%rbx
	movq	%r9,%rcx
.byte	102,72,15,126,207		/* movq %xmm1,%rdi — rp saved by caller (presumed; verify encoding) */
	negq	%rax			/* carry -> CF / mask setup */
.byte	102,72,15,126,206		/* movq %xmm1,%rsi (presumed; verify encoding) */
	sarq	$3+2,%rcx		/* iteration count = -n/4 */
	decq	%r12			/* dec+not below == negation of n[0] path */
	xorq	%r10,%r10
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqr4x_sub_entry

.align	16
.Lsqr4x_sub:
	movq	0(%rbp),%r12
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
.Lsqr4x_sub_entry:
	leaq	32(%rbp),%rbp
	notq	%r12
	notq	%r13
	notq	%r14
	notq	%r15
	andq	%rax,%r12		/* ~n[i] & mask */
	andq	%rax,%r13
	andq	%rax,%r14
	andq	%rax,%r15

	negq	%r10			/* restore borrow into CF */
	adcq	0(%rbx),%r12		/* tp[i] + (~n[i]&mask) + CF  == tp - n when mask set */
	adcq	8(%rbx),%r13
	adcq	16(%rbx),%r14
	adcq	24(%rbx),%r15
	movq	%r12,0(%rdi)
	leaq	32(%rbx),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r10,%r10		/* save borrow for next group */
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi

	incq	%rcx
	jnz	.Lsqr4x_sub

	movq	%r9,%r10
	negq	%r9			/* restore sign of %r9 for caller */
	.byte	0xf3,0xc3		/* rep ret */
.size	__bn_post4x_internal,.-__bn_post4x_internal
/*
 * bn_from_montgomery — convert out of Montgomery form.
 * Only handles operand sizes that are a multiple of 8 words: in that
 * case tail-jumps to bn_from_mont8x; otherwise returns 0 so the
 * caller falls back to a generic path.
 */
.globl	bn_from_montgomery
.type	bn_from_montgomery,@function
.align	32
bn_from_montgomery:
	testl	$7,%r9d			/* num % 8 != 0 ? */
	jz	bn_from_mont8x		/* tail call, same arguments */
	xorl	%eax,%eax		/* return 0: not supported here */
	.byte	0xf3,0xc3		/* rep ret */
.size	bn_from_montgomery,.-bn_from_montgomery
1998
/*
 * bn_from_mont8x — Montgomery conversion for num % 8 == 0.
 * Saves callee-saved registers, carves an aligned scratch frame below
 * %rsp (walking it page by page so the guard page is always touched),
 * copies the input into the lower half of the frame while zeroing the
 * upper half, then performs one Montgomery reduction (MULX/ADX path if
 * the CPU supports BMI2+ADX, else the mul/adc path), the conditional
 * subtraction, wipes the scratch area and returns 1.
 * Args (SysV): %rdi=rp %rsi=ap %rdx=unused here %rcx=np %r8=&n0 %r9=num.
 */
.type	bn_from_mont8x,@function
.align	32
bn_from_mont8x:
.byte	0x67
	movq	%rsp,%rax		/* remember original %rsp */
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
.Lfrom_prologue:

	shll	$3,%r9d			/* num *= 8: byte count */
	leaq	(%r9,%r9,2),%r10
	negq	%r9
	movq	(%r8),%r8		/* load n0 value */







	/* choose a frame location that avoids cache-bank aliasing with
	 * rp (classic mont5 trick: offset by (rp & 4095)) */
	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lfrom_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lfrom_sp_done

.align	32
.Lfrom_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lfrom_sp_done:
	andq	$-64,%rbp		/* 64-byte align the frame */
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10		/* probe */
	cmpq	%rbp,%rsp
	ja	.Lfrom_page_walk
	jmp	.Lfrom_page_walk_done

.Lfrom_page_walk:
	/* touch every page on the way down so the stack guard page
	 * is never skipped over */
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lfrom_page_walk
.Lfrom_page_walk_done:

	movq	%r9,%r10
	negq	%r9










	movq	%r8,32(%rsp)		/* save n0 */
	movq	%rax,40(%rsp)		/* save original %rsp for epilogue */
.Lfrom_body:
	movq	%r9,%r11
	leaq	48(%rsp),%rax
	pxor	%xmm0,%xmm0
	jmp	.Lmul_by_1

.align	32
.Lmul_by_1:
	/* tp[0..num-1] = ap[], tp[num..2*num-1] = 0 (i.e. a * 1 in the
	 * double-width product layout, ready for one reduction) */
	movdqu	(%rsi),%xmm1
	movdqu	16(%rsi),%xmm2
	movdqu	32(%rsi),%xmm3
	movdqa	%xmm0,(%rax,%r9,1)
	movdqu	48(%rsi),%xmm4
	movdqa	%xmm0,16(%rax,%r9,1)
.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00	/* leaq 64(%rsi),%rsi (long encoding, presumed) */
	movdqa	%xmm1,(%rax)
	movdqa	%xmm0,32(%rax,%r9,1)
	movdqa	%xmm2,16(%rax)
	movdqa	%xmm0,48(%rax,%r9,1)
	movdqa	%xmm3,32(%rax)
	movdqa	%xmm4,48(%rax)
	leaq	64(%rax),%rax
	subq	$64,%r11
	jnz	.Lmul_by_1

.byte	102,72,15,110,207		/* movq %rdi,%xmm1 — save rp (presumed; verify encoding) */
.byte	102,72,15,110,209		/* movq %rcx,%xmm2 — save np (presumed) */
.byte	0x67
	movq	%rcx,%rbp
.byte	102,73,15,110,218		/* movq %r10,%xmm3 (presumed) */
	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
	andl	$0x80108,%r11d		/* BMI2 + ADX + ... capability bits */
	cmpl	$0x80108,%r11d
	jne	.Lfrom_mont_nox

	leaq	(%rax,%r9,1),%rdi
	call	__bn_sqrx8x_reduction	/* MULX/ADX path */
	call	__bn_postx4x_internal

	pxor	%xmm0,%xmm0
	leaq	48(%rsp),%rax
	movq	40(%rsp),%rsi
	jmp	.Lfrom_mont_zero

.align	32
.Lfrom_mont_nox:
	call	__bn_sqr8x_reduction	/* legacy mul/adc path */
	call	__bn_post4x_internal

	pxor	%xmm0,%xmm0
	leaq	48(%rsp),%rax
	movq	40(%rsp),%rsi
	jmp	.Lfrom_mont_zero

.align	32
.Lfrom_mont_zero:
	/* wipe the scratch frame (it held secret intermediates) */
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm0,16(%rax)
	movdqa	%xmm0,32(%rax)
	movdqa	%xmm0,48(%rax)
	leaq	64(%rax),%rax
	subq	$32,%r9
	jnz	.Lfrom_mont_zero

	movq	$1,%rax			/* return 1: handled */
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp		/* restore caller's stack */
.Lfrom_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
.size	bn_from_mont8x,.-bn_from_mont8x
/*
 * bn_mulx4x_mont_gather5 — MULX/ADX Montgomery multiplication with
 * cache-timing-safe gather of the power table entry.  This is only a
 * frame-management wrapper: it saves callee-saved registers, allocates
 * an aligned, rp-disambiguated scratch frame with page-by-page stack
 * probing, stores n0 and the saved %rsp, then calls mulx4x_internal
 * and restores.  Returns 1 in %rax.
 */
.type	bn_mulx4x_mont_gather5,@function
.align	32
bn_mulx4x_mont_gather5:
	movq	%rsp,%rax		/* remember original %rsp */
.Lmulx4x_enter:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
.Lmulx4x_prologue:

	shll	$3,%r9d			/* num *= 8 */
	leaq	(%r9,%r9,2),%r10
	negq	%r9
	movq	(%r8),%r8		/* load n0 value */










	/* pick frame address so it does not alias rp modulo 4K */
	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lmulx4xsp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lmulx4xsp_done

.Lmulx4xsp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lmulx4xsp_done:
	andq	$-64,%rbp		/* 64-byte align */
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10		/* probe */
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
	jmp	.Lmulx4x_page_walk_done

.Lmulx4x_page_walk:
	/* touch each intervening page to keep the guard page honest */
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
.Lmulx4x_page_walk_done:












	movq	%r8,32(%rsp)		/* n0 */
	movq	%rax,40(%rsp)		/* original %rsp */
.Lmulx4x_body:
	call	mulx4x_internal

	movq	40(%rsp),%rsi
	movq	$1,%rax			/* return 1 */

	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lmulx4x_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
.size	bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
2241
/*
 * mulx4x_internal — core of the MULX/ADCX/ADOX Montgomery multiply
 * with 5-bit (32-entry) power table and constant-time gather.
 *
 * Structure:
 *   1. Build 32 SSE comparison masks from .Linc against the requested
 *      power index (loaded via movd 8(%rax)), stored at 112..352(%r10).
 *   2. Gather b[i] from the interleaved table at %rdx(+128) by AND-ing
 *      every entry with its mask and OR-ing — every entry is touched,
 *      so the access pattern is independent of the secret index.
 *   3. .Lmulx4x_1st / .Lmulx4x_outer / .Lmulx4x_inner: 4-word-wide
 *      Montgomery multiply-accumulate using the dual carry chains
 *      (adcx = CF chain, adox = OF chain); instruction order encodes
 *      the carry dependencies and must not be changed.
 *   4. Tail sets up the conditional subtraction and jumps into
 *      .Lsqrx4x_sub_entry (defined later in this file).
 * Register roles here: %rsi=ap %rcx=np %rdx=table/b-word %rbx=tp
 * %rbp=zero/carry helper; n0 at 32+8(%rsp).
 */
.type	mulx4x_internal,@function
.align	32
mulx4x_internal:
	movq	%r9,8(%rsp)		/* save num (bytes) */
	movq	%r9,%r10
	negq	%r9
	shlq	$5,%r9
	negq	%r10
	leaq	128(%rdx,%r9,1),%r13
	shrq	$5+5,%r9
	movd	8(%rax),%xmm5		/* requested power index */
	subq	$1,%r9
	leaq	.Linc(%rip),%rax
	movq	%r13,16+8(%rsp)		/* end-of-table sentinel */
	movq	%r9,24+8(%rsp)		/* inner-loop trip count */
	movq	%rdi,56+8(%rsp)		/* save rp */
	movdqa	0(%rax),%xmm0		/* .Linc: {0,1} then {2,2} step */
	movdqa	16(%rax),%xmm1
	leaq	88-112(%rsp,%r10,1),%r10
	leaq	128(%rdx),%rdi

	/* build pcmpeqd selection masks for indices 0..31; exactly one
	 * of the 16 stored mask rows has an all-ones lane */
	pshufd	$0,%xmm5,%xmm5
	movdqa	%xmm1,%xmm4
.byte	0x67
	movdqa	%xmm1,%xmm2
.byte	0x67
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,112(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,128(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,144(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,160(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,176(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,192(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,208(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,224(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,240(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,256(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,272(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,288(%r10)
	movdqa	%xmm4,%xmm3
.byte	0x67
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,304(%r10)

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,320(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,336(%r10)

	/* first gather: AND every table row with its mask, OR together */
	pand	64(%rdi),%xmm0
	pand	80(%rdi),%xmm1
	pand	96(%rdi),%xmm2
	movdqa	%xmm3,352(%r10)
	pand	112(%rdi),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-128(%rdi),%xmm4
	movdqa	-112(%rdi),%xmm5
	movdqa	-96(%rdi),%xmm2
	pand	112(%r10),%xmm4
	movdqa	-80(%rdi),%xmm3
	pand	128(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	144(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	160(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-64(%rdi),%xmm4
	movdqa	-48(%rdi),%xmm5
	movdqa	-32(%rdi),%xmm2
	pand	176(%r10),%xmm4
	movdqa	-16(%rdi),%xmm3
	pand	192(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	208(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	224(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	0(%rdi),%xmm4
	movdqa	16(%rdi),%xmm5
	movdqa	32(%rdi),%xmm2
	pand	240(%r10),%xmm4
	movdqa	48(%rdi),%xmm3
	pand	256(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	272(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	288(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	pxor	%xmm1,%xmm0
	pshufd	$0x4e,%xmm0,%xmm1	/* fold high qword onto low */
	por	%xmm1,%xmm0
	leaq	256(%rdi),%rdi
.byte	102,72,15,126,194		/* movq %xmm0,%rdx — gathered b[0] (presumed; verify encoding) */
	leaq	64+32+8(%rsp),%rbx

	/* first column: a[0..3]*b[0] and start of reduction */
	movq	%rdx,%r9
	mulxq	0(%rsi),%r8,%rax
	mulxq	8(%rsi),%r11,%r12
	addq	%rax,%r11
	mulxq	16(%rsi),%rax,%r13
	adcq	%rax,%r12
	adcq	$0,%r13
	mulxq	24(%rsi),%rax,%r14

	movq	%r8,%r15
	imulq	32+8(%rsp),%r8		/* m = lo*n0 */
	xorq	%rbp,%rbp		/* clear both carry chains */
	movq	%r8,%rdx

	movq	%rdi,8+8(%rsp)		/* save table cursor */

	leaq	32(%rsi),%rsi
	adcxq	%rax,%r13
	adcxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	movq	24+8(%rsp),%rdi
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r11,-24(%rbx)
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r12,-16(%rbx)
	jmp	.Lmulx4x_1st

.align	32
.Lmulx4x_1st:
	/* first pass over remaining a[]/n[] words, 4 at a time */
	adcxq	%rbp,%r15
	mulxq	0(%rsi),%r10,%rax
	adcxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
.byte	0x67,0x67
	movq	%r8,%rdx
	adcxq	%rax,%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-32(%rbx)
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_1st

	movq	8(%rsp),%rax		/* reload num */
	adcq	%rbp,%r15
	leaq	(%rsi,%rax,1),%rsi	/* rewind ap */
	addq	%r15,%r14
	movq	8+8(%rsp),%rdi
	adcq	%rbp,%rbp		/* save top carry in %rbp bit 0 */
	movq	%r14,-8(%rbx)
	jmp	.Lmulx4x_outer

.align	32
.Lmulx4x_outer:
	/* gather next b[i] — same constant-time mask scan as above */
	leaq	16-256(%rbx),%r10
	pxor	%xmm4,%xmm4
.byte	0x67,0x67
	pxor	%xmm5,%xmm5
	movdqa	-128(%rdi),%xmm0
	movdqa	-112(%rdi),%xmm1
	movdqa	-96(%rdi),%xmm2
	pand	256(%r10),%xmm0
	movdqa	-80(%rdi),%xmm3
	pand	272(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	288(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	304(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%rdi),%xmm0
	movdqa	-48(%rdi),%xmm1
	movdqa	-32(%rdi),%xmm2
	pand	320(%r10),%xmm0
	movdqa	-16(%rdi),%xmm3
	pand	336(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	352(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	368(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%rdi),%xmm0
	movdqa	16(%rdi),%xmm1
	movdqa	32(%rdi),%xmm2
	pand	384(%r10),%xmm0
	movdqa	48(%rdi),%xmm3
	pand	400(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	416(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	432(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%rdi),%xmm0
	movdqa	80(%rdi),%xmm1
	movdqa	96(%rdi),%xmm2
	pand	448(%r10),%xmm0
	movdqa	112(%rdi),%xmm3
	pand	464(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	480(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	496(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	pshufd	$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	leaq	256(%rdi),%rdi
.byte	102,72,15,126,194		/* movq %xmm0,%rdx — gathered b[i] (presumed) */

	movq	%rbp,(%rbx)		/* store saved top carry */
	leaq	32(%rbx,%rax,1),%rbx
	mulxq	0(%rsi),%r8,%r11
	xorq	%rbp,%rbp		/* reset both carry chains */
	movq	%rdx,%r9
	mulxq	8(%rsi),%r14,%r12
	adoxq	-32(%rbx),%r8		/* accumulate previous column */
	adcxq	%r14,%r11
	mulxq	16(%rsi),%r15,%r13
	adoxq	-24(%rbx),%r11
	adcxq	%r15,%r12
	mulxq	24(%rsi),%rdx,%r14
	adoxq	-16(%rbx),%r12
	adcxq	%rdx,%r13
	leaq	(%rcx,%rax,1),%rcx	/* rewind np */
	leaq	32(%rsi),%rsi
	adoxq	-8(%rbx),%r13
	adcxq	%rbp,%r14
	adoxq	%rbp,%r14

	movq	%r8,%r15
	imulq	32+8(%rsp),%r8		/* m = lo*n0 */

	movq	%r8,%rdx
	xorq	%rbp,%rbp
	movq	%rdi,8+8(%rsp)

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	24+8(%rsp),%rdi
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-24(%rbx)
	adoxq	%rbp,%r15
	movq	%r12,-16(%rbx)
	leaq	32(%rcx),%rcx
	jmp	.Lmulx4x_inner

.align	32
.Lmulx4x_inner:
	mulxq	0(%rsi),%r10,%rax
	adcxq	%rbp,%r15
	adoxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	0(%rbx),%r10
	adoxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	8(%rbx),%r11
	adoxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
	movq	%r8,%rdx
	adcxq	16(%rbx),%r12
	adoxq	%rax,%r13
	adcxq	24(%rbx),%r13
	adoxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx
	adcxq	%rbp,%r14

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	adoxq	%r15,%r13
	movq	%r11,-32(%rbx)
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	leaq	32(%rcx),%rcx
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_inner

	movq	0+8(%rsp),%rax		/* reload num */
	adcq	%rbp,%r15
	subq	0(%rbx),%rdi		/* sets CF from stored carry */
	movq	8+8(%rsp),%rdi
	movq	16+8(%rsp),%r10
	adcq	%r15,%r14
	leaq	(%rsi,%rax,1),%rsi	/* rewind ap */
	adcq	%rbp,%rbp
	movq	%r14,-8(%rbx)

	cmpq	%r10,%rdi		/* more table entries? */
	jb	.Lmulx4x_outer

	/* set up conditional subtraction and jump into the shared
	 * .Lsqrx4x_sub_entry tail (defined later in this file) */
	movq	-8(%rcx),%r10
	movq	%rbp,%r8
	movq	(%rcx,%rax,1),%r12
	leaq	(%rcx,%rax,1),%rbp	/* np base */
	movq	%rax,%rcx
	leaq	(%rbx,%rax,1),%rdi
	xorl	%eax,%eax
	xorq	%r15,%r15
	subq	%r14,%r10		/* compare top words */
	adcq	%r15,%r15
	orq	%r15,%r8
	sarq	$3+2,%rcx
	subq	%r8,%rax		/* %rax = 0 or -1 subtraction mask */
	movq	56+8(%rsp),%rdx		/* restore rp */
	decq	%r12
	movq	8(%rbp),%r13
	xorq	%r8,%r8
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqrx4x_sub_entry
.size	mulx4x_internal,.-mulx4x_internal
/*
 * bn_powerx5 — compute a^5 in Montgomery domain (MULX/ADX path):
 * five consecutive squarings... actually: prologue + frame setup,
 * then 5 x (__bn_sqrx8x_internal + __bn_postx4x_internal) — i.e.
 * five squarings — followed by one mulx4x_internal multiply.
 * Same page-walking stack-allocation pattern as the other wrappers.
 * Arguments are parked in %xmm1..%xmm4 across the calls.
 */
.type	bn_powerx5,@function
.align	32
bn_powerx5:
	movq	%rsp,%rax		/* remember original %rsp */
.Lpowerx5_enter:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
.Lpowerx5_prologue:

	shll	$3,%r9d			/* num *= 8 */
	leaq	(%r9,%r9,2),%r10
	negq	%r9
	movq	(%r8),%r8		/* load n0 value */








	/* frame placement disambiguated against rp modulo 4K */
	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lpwrx_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lpwrx_sp_done

.align	32
.Lpwrx_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lpwrx_sp_done:
	andq	$-64,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10		/* probe */
	cmpq	%rbp,%rsp
	ja	.Lpwrx_page_walk
	jmp	.Lpwrx_page_walk_done

.Lpwrx_page_walk:
	/* guard-page-safe descent */
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lpwrx_page_walk
.Lpwrx_page_walk_done:

	movq	%r9,%r10
	negq	%r9











	/* park arguments in xmm regs — they survive the internal calls */
	pxor	%xmm0,%xmm0
.byte	102,72,15,110,207		/* movq %rdi,%xmm1 — rp (presumed; verify encoding) */
.byte	102,72,15,110,209		/* movq %rcx,%xmm2 — np (presumed) */
.byte	102,73,15,110,218		/* movq %r10,%xmm3 — num (presumed) */
.byte	102,72,15,110,226		/* movq %rdx,%xmm4 — bp (presumed) */
	movq	%r8,32(%rsp)		/* n0 */
	movq	%rax,40(%rsp)		/* original %rsp */
.Lpowerx5_body:

	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal

	movq	%r10,%r9
	movq	%rsi,%rdi
.byte	102,72,15,126,209		/* movq %xmm2,%rcx — np back (presumed) */
.byte	102,72,15,126,226		/* movq %xmm4,%rdx — bp back (presumed) */
	movq	40(%rsp),%rax

	call	mulx4x_internal

	movq	40(%rsp),%rsi
	movq	$1,%rax			/* return 1 */

	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lpowerx5_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
.size	bn_powerx5,.-bn_powerx5
2780
2781.globl	bn_sqrx8x_internal
2782.hidden	bn_sqrx8x_internal
2783.type	bn_sqrx8x_internal,@function
2784.align	32
2785bn_sqrx8x_internal:
2786__bn_sqrx8x_internal:
2787
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827	leaq	48+8(%rsp),%rdi
2828	leaq	(%rsi,%r9,1),%rbp
2829	movq	%r9,0+8(%rsp)
2830	movq	%rbp,8+8(%rsp)
2831	jmp	.Lsqr8x_zero_start
2832
2833.align	32
2834.byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
2835.Lsqrx8x_zero:
2836.byte	0x3e
2837	movdqa	%xmm0,0(%rdi)
2838	movdqa	%xmm0,16(%rdi)
2839	movdqa	%xmm0,32(%rdi)
2840	movdqa	%xmm0,48(%rdi)
2841.Lsqr8x_zero_start:
2842	movdqa	%xmm0,64(%rdi)
2843	movdqa	%xmm0,80(%rdi)
2844	movdqa	%xmm0,96(%rdi)
2845	movdqa	%xmm0,112(%rdi)
2846	leaq	128(%rdi),%rdi
2847	subq	$64,%r9
2848	jnz	.Lsqrx8x_zero
2849
2850	movq	0(%rsi),%rdx
2851
2852	xorq	%r10,%r10
2853	xorq	%r11,%r11
2854	xorq	%r12,%r12
2855	xorq	%r13,%r13
2856	xorq	%r14,%r14
2857	xorq	%r15,%r15
2858	leaq	48+8(%rsp),%rdi
2859	xorq	%rbp,%rbp
2860	jmp	.Lsqrx8x_outer_loop
2861
2862.align	32
2863.Lsqrx8x_outer_loop:
2864	mulxq	8(%rsi),%r8,%rax
2865	adcxq	%r9,%r8
2866	adoxq	%rax,%r10
2867	mulxq	16(%rsi),%r9,%rax
2868	adcxq	%r10,%r9
2869	adoxq	%rax,%r11
2870.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
2871	adcxq	%r11,%r10
2872	adoxq	%rax,%r12
2873.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
2874	adcxq	%r12,%r11
2875	adoxq	%rax,%r13
2876	mulxq	40(%rsi),%r12,%rax
2877	adcxq	%r13,%r12
2878	adoxq	%rax,%r14
2879	mulxq	48(%rsi),%r13,%rax
2880	adcxq	%r14,%r13
2881	adoxq	%r15,%rax
2882	mulxq	56(%rsi),%r14,%r15
2883	movq	8(%rsi),%rdx
2884	adcxq	%rax,%r14
2885	adoxq	%rbp,%r15
2886	adcq	64(%rdi),%r15
2887	movq	%r8,8(%rdi)
2888	movq	%r9,16(%rdi)
2889	sbbq	%rcx,%rcx
2890	xorq	%rbp,%rbp
2891
2892
2893	mulxq	16(%rsi),%r8,%rbx
2894	mulxq	24(%rsi),%r9,%rax
2895	adcxq	%r10,%r8
2896	adoxq	%rbx,%r9
2897	mulxq	32(%rsi),%r10,%rbx
2898	adcxq	%r11,%r9
2899	adoxq	%rax,%r10
2900.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
2901	adcxq	%r12,%r10
2902	adoxq	%rbx,%r11
2903.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
2904	adcxq	%r13,%r11
2905	adoxq	%r14,%r12
2906.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
2907	movq	16(%rsi),%rdx
2908	adcxq	%rax,%r12
2909	adoxq	%rbx,%r13
2910	adcxq	%r15,%r13
2911	adoxq	%rbp,%r14
2912	adcxq	%rbp,%r14
2913
2914	movq	%r8,24(%rdi)
2915	movq	%r9,32(%rdi)
2916
2917	mulxq	24(%rsi),%r8,%rbx
2918	mulxq	32(%rsi),%r9,%rax
2919	adcxq	%r10,%r8
2920	adoxq	%rbx,%r9
2921	mulxq	40(%rsi),%r10,%rbx
2922	adcxq	%r11,%r9
2923	adoxq	%rax,%r10
2924.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
2925	adcxq	%r12,%r10
2926	adoxq	%r13,%r11
2927.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
2928.byte	0x3e
2929	movq	24(%rsi),%rdx
2930	adcxq	%rbx,%r11
2931	adoxq	%rax,%r12
2932	adcxq	%r14,%r12
2933	movq	%r8,40(%rdi)
2934	movq	%r9,48(%rdi)
2935	mulxq	32(%rsi),%r8,%rax
2936	adoxq	%rbp,%r13
2937	adcxq	%rbp,%r13
2938
2939	mulxq	40(%rsi),%r9,%rbx
2940	adcxq	%r10,%r8
2941	adoxq	%rax,%r9
2942	mulxq	48(%rsi),%r10,%rax
2943	adcxq	%r11,%r9
2944	adoxq	%r12,%r10
2945	mulxq	56(%rsi),%r11,%r12
2946	movq	32(%rsi),%rdx
2947	movq	40(%rsi),%r14
2948	adcxq	%rbx,%r10
2949	adoxq	%rax,%r11
2950	movq	48(%rsi),%r15
2951	adcxq	%r13,%r11
2952	adoxq	%rbp,%r12
2953	adcxq	%rbp,%r12
2954
2955	movq	%r8,56(%rdi)
2956	movq	%r9,64(%rdi)
2957
2958	mulxq	%r14,%r9,%rax
2959	movq	56(%rsi),%r8
2960	adcxq	%r10,%r9
2961	mulxq	%r15,%r10,%rbx
2962	adoxq	%rax,%r10
2963	adcxq	%r11,%r10
2964	mulxq	%r8,%r11,%rax
2965	movq	%r14,%rdx
2966	adoxq	%rbx,%r11
2967	adcxq	%r12,%r11
2968
2969	adcxq	%rbp,%rax
2970
2971	mulxq	%r15,%r14,%rbx
2972	mulxq	%r8,%r12,%r13
2973	movq	%r15,%rdx
2974	leaq	64(%rsi),%rsi
2975	adcxq	%r14,%r11
2976	adoxq	%rbx,%r12
2977	adcxq	%rax,%r12
2978	adoxq	%rbp,%r13
2979
2980.byte	0x67,0x67
2981	mulxq	%r8,%r8,%r14
2982	adcxq	%r8,%r13
2983	adcxq	%rbp,%r14
2984
2985	cmpq	8+8(%rsp),%rsi
2986	je	.Lsqrx8x_outer_break
2987
2988	negq	%rcx
2989	movq	$-8,%rcx
2990	movq	%rbp,%r15
2991	movq	64(%rdi),%r8
2992	adcxq	72(%rdi),%r9
2993	adcxq	80(%rdi),%r10
2994	adcxq	88(%rdi),%r11
2995	adcq	96(%rdi),%r12
2996	adcq	104(%rdi),%r13
2997	adcq	112(%rdi),%r14
2998	adcq	120(%rdi),%r15
2999	leaq	(%rsi),%rbp
3000	leaq	128(%rdi),%rdi
3001	sbbq	%rax,%rax
3002
3003	movq	-64(%rsi),%rdx
3004	movq	%rax,16+8(%rsp)
3005	movq	%rdi,24+8(%rsp)
3006
3007
3008	xorl	%eax,%eax
3009	jmp	.Lsqrx8x_loop
3010
3011.align	32
3012.Lsqrx8x_loop:
3013	movq	%r8,%rbx
3014	mulxq	0(%rbp),%rax,%r8
3015	adcxq	%rax,%rbx
3016	adoxq	%r9,%r8
3017
3018	mulxq	8(%rbp),%rax,%r9
3019	adcxq	%rax,%r8
3020	adoxq	%r10,%r9
3021
3022	mulxq	16(%rbp),%rax,%r10
3023	adcxq	%rax,%r9
3024	adoxq	%r11,%r10
3025
3026	mulxq	24(%rbp),%rax,%r11
3027	adcxq	%rax,%r10
3028	adoxq	%r12,%r11
3029
3030.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
3031	adcxq	%rax,%r11
3032	adoxq	%r13,%r12
3033
3034	mulxq	40(%rbp),%rax,%r13
3035	adcxq	%rax,%r12
3036	adoxq	%r14,%r13
3037
3038	mulxq	48(%rbp),%rax,%r14
3039	movq	%rbx,(%rdi,%rcx,8)
3040	movl	$0,%ebx
3041	adcxq	%rax,%r13
3042	adoxq	%r15,%r14
3043
3044.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
3045	movq	8(%rsi,%rcx,8),%rdx
3046	adcxq	%rax,%r14
3047	adoxq	%rbx,%r15
3048	adcxq	%rbx,%r15
3049
3050.byte	0x67
3051	incq	%rcx
3052	jnz	.Lsqrx8x_loop
3053
3054	leaq	64(%rbp),%rbp
3055	movq	$-8,%rcx
3056	cmpq	8+8(%rsp),%rbp
3057	je	.Lsqrx8x_break
3058
3059	subq	16+8(%rsp),%rbx
3060.byte	0x66
3061	movq	-64(%rsi),%rdx
3062	adcxq	0(%rdi),%r8
3063	adcxq	8(%rdi),%r9
3064	adcq	16(%rdi),%r10
3065	adcq	24(%rdi),%r11
3066	adcq	32(%rdi),%r12
3067	adcq	40(%rdi),%r13
3068	adcq	48(%rdi),%r14
3069	adcq	56(%rdi),%r15
3070	leaq	64(%rdi),%rdi
3071.byte	0x67
3072	sbbq	%rax,%rax
3073	xorl	%ebx,%ebx
3074	movq	%rax,16+8(%rsp)
3075	jmp	.Lsqrx8x_loop
3076
3077.align	32
3078.Lsqrx8x_break:
3079	subq	16+8(%rsp),%r8
3080	movq	24+8(%rsp),%rcx
3081	movq	0(%rsi),%rdx
3082	xorl	%ebp,%ebp
3083	movq	%r8,0(%rdi)
3084	cmpq	%rcx,%rdi
3085	je	.Lsqrx8x_outer_loop
3086
3087	movq	%r9,8(%rdi)
3088	movq	8(%rcx),%r9
3089	movq	%r10,16(%rdi)
3090	movq	16(%rcx),%r10
3091	movq	%r11,24(%rdi)
3092	movq	24(%rcx),%r11
3093	movq	%r12,32(%rdi)
3094	movq	32(%rcx),%r12
3095	movq	%r13,40(%rdi)
3096	movq	40(%rcx),%r13
3097	movq	%r14,48(%rdi)
3098	movq	48(%rcx),%r14
3099	movq	%r15,56(%rdi)
3100	movq	56(%rcx),%r15
3101	movq	%rcx,%rdi
3102	jmp	.Lsqrx8x_outer_loop
3103
3104.align	32
3105.Lsqrx8x_outer_break:
3106	movq	%r9,72(%rdi)
3107.byte	102,72,15,126,217
3108	movq	%r10,80(%rdi)
3109	movq	%r11,88(%rdi)
3110	movq	%r12,96(%rdi)
3111	movq	%r13,104(%rdi)
3112	movq	%r14,112(%rdi)
3113	leaq	48+8(%rsp),%rdi
3114	movq	(%rsi,%rcx,1),%rdx
3115
3116	movq	8(%rdi),%r11
3117	xorq	%r10,%r10
3118	movq	0+8(%rsp),%r9
3119	adoxq	%r11,%r11
3120	movq	16(%rdi),%r12
3121	movq	24(%rdi),%r13
3122
3123
3124.align	32
3125.Lsqrx4x_shift_n_add:
3126	mulxq	%rdx,%rax,%rbx
3127	adoxq	%r12,%r12
3128	adcxq	%r10,%rax
3129.byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
3130.byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
3131	adoxq	%r13,%r13
3132	adcxq	%r11,%rbx
3133	movq	40(%rdi),%r11
3134	movq	%rax,0(%rdi)
3135	movq	%rbx,8(%rdi)
3136
3137	mulxq	%rdx,%rax,%rbx
3138	adoxq	%r10,%r10
3139	adcxq	%r12,%rax
3140	movq	16(%rsi,%rcx,1),%rdx
3141	movq	48(%rdi),%r12
3142	adoxq	%r11,%r11
3143	adcxq	%r13,%rbx
3144	movq	56(%rdi),%r13
3145	movq	%rax,16(%rdi)
3146	movq	%rbx,24(%rdi)
3147
3148	mulxq	%rdx,%rax,%rbx
3149	adoxq	%r12,%r12
3150	adcxq	%r10,%rax
3151	movq	24(%rsi,%rcx,1),%rdx
3152	leaq	32(%rcx),%rcx
3153	movq	64(%rdi),%r10
3154	adoxq	%r13,%r13
3155	adcxq	%r11,%rbx
3156	movq	72(%rdi),%r11
3157	movq	%rax,32(%rdi)
3158	movq	%rbx,40(%rdi)
3159
3160	mulxq	%rdx,%rax,%rbx
3161	adoxq	%r10,%r10
3162	adcxq	%r12,%rax
3163	jrcxz	.Lsqrx4x_shift_n_add_break
3164.byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
3165	adoxq	%r11,%r11
3166	adcxq	%r13,%rbx
3167	movq	80(%rdi),%r12
3168	movq	88(%rdi),%r13
3169	movq	%rax,48(%rdi)
3170	movq	%rbx,56(%rdi)
3171	leaq	64(%rdi),%rdi
3172	nop
3173	jmp	.Lsqrx4x_shift_n_add
3174
3175.align	32
3176.Lsqrx4x_shift_n_add_break:
3177	adcxq	%r13,%rbx
3178	movq	%rax,48(%rdi)
3179	movq	%rbx,56(%rdi)
3180	leaq	64(%rdi),%rdi
3181.byte	102,72,15,126,213
3182__bn_sqrx8x_reduction:
3183	xorl	%eax,%eax
3184	movq	32+8(%rsp),%rbx
3185	movq	48+8(%rsp),%rdx
3186	leaq	-64(%rbp,%r9,1),%rcx
3187
3188	movq	%rcx,0+8(%rsp)
3189	movq	%rdi,8+8(%rsp)
3190
3191	leaq	48+8(%rsp),%rdi
3192	jmp	.Lsqrx8x_reduction_loop
3193
3194.align	32
3195.Lsqrx8x_reduction_loop:
3196	movq	8(%rdi),%r9
3197	movq	16(%rdi),%r10
3198	movq	24(%rdi),%r11
3199	movq	32(%rdi),%r12
3200	movq	%rdx,%r8
3201	imulq	%rbx,%rdx
3202	movq	40(%rdi),%r13
3203	movq	48(%rdi),%r14
3204	movq	56(%rdi),%r15
3205	movq	%rax,24+8(%rsp)
3206
3207	leaq	64(%rdi),%rdi
3208	xorq	%rsi,%rsi
3209	movq	$-8,%rcx
3210	jmp	.Lsqrx8x_reduce
3211
3212.align	32
3213.Lsqrx8x_reduce:
3214	movq	%r8,%rbx
3215	mulxq	0(%rbp),%rax,%r8
3216	adcxq	%rbx,%rax
3217	adoxq	%r9,%r8
3218
3219	mulxq	8(%rbp),%rbx,%r9
3220	adcxq	%rbx,%r8
3221	adoxq	%r10,%r9
3222
3223	mulxq	16(%rbp),%rbx,%r10
3224	adcxq	%rbx,%r9
3225	adoxq	%r11,%r10
3226
3227	mulxq	24(%rbp),%rbx,%r11
3228	adcxq	%rbx,%r10
3229	adoxq	%r12,%r11
3230
3231.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
3232	movq	%rdx,%rax
3233	movq	%r8,%rdx
3234	adcxq	%rbx,%r11
3235	adoxq	%r13,%r12
3236
3237	mulxq	32+8(%rsp),%rbx,%rdx
3238	movq	%rax,%rdx
3239	movq	%rax,64+48+8(%rsp,%rcx,8)
3240
3241	mulxq	40(%rbp),%rax,%r13
3242	adcxq	%rax,%r12
3243	adoxq	%r14,%r13
3244
3245	mulxq	48(%rbp),%rax,%r14
3246	adcxq	%rax,%r13
3247	adoxq	%r15,%r14
3248
3249	mulxq	56(%rbp),%rax,%r15
3250	movq	%rbx,%rdx
3251	adcxq	%rax,%r14
3252	adoxq	%rsi,%r15
3253	adcxq	%rsi,%r15
3254
3255.byte	0x67,0x67,0x67
3256	incq	%rcx
3257	jnz	.Lsqrx8x_reduce
3258
3259	movq	%rsi,%rax
3260	cmpq	0+8(%rsp),%rbp
3261	jae	.Lsqrx8x_no_tail
3262
3263	movq	48+8(%rsp),%rdx
3264	addq	0(%rdi),%r8
3265	leaq	64(%rbp),%rbp
3266	movq	$-8,%rcx
3267	adcxq	8(%rdi),%r9
3268	adcxq	16(%rdi),%r10
3269	adcq	24(%rdi),%r11
3270	adcq	32(%rdi),%r12
3271	adcq	40(%rdi),%r13
3272	adcq	48(%rdi),%r14
3273	adcq	56(%rdi),%r15
3274	leaq	64(%rdi),%rdi
3275	sbbq	%rax,%rax
3276
3277	xorq	%rsi,%rsi
3278	movq	%rax,16+8(%rsp)
3279	jmp	.Lsqrx8x_tail
3280
3281.align	32
3282.Lsqrx8x_tail:
3283	movq	%r8,%rbx
3284	mulxq	0(%rbp),%rax,%r8
3285	adcxq	%rax,%rbx
3286	adoxq	%r9,%r8
3287
3288	mulxq	8(%rbp),%rax,%r9
3289	adcxq	%rax,%r8
3290	adoxq	%r10,%r9
3291
3292	mulxq	16(%rbp),%rax,%r10
3293	adcxq	%rax,%r9
3294	adoxq	%r11,%r10
3295
3296	mulxq	24(%rbp),%rax,%r11
3297	adcxq	%rax,%r10
3298	adoxq	%r12,%r11
3299
3300.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
3301	adcxq	%rax,%r11
3302	adoxq	%r13,%r12
3303
3304	mulxq	40(%rbp),%rax,%r13
3305	adcxq	%rax,%r12
3306	adoxq	%r14,%r13
3307
3308	mulxq	48(%rbp),%rax,%r14
3309	adcxq	%rax,%r13
3310	adoxq	%r15,%r14
3311
3312	mulxq	56(%rbp),%rax,%r15
3313	movq	72+48+8(%rsp,%rcx,8),%rdx
3314	adcxq	%rax,%r14
3315	adoxq	%rsi,%r15
3316	movq	%rbx,(%rdi,%rcx,8)
3317	movq	%r8,%rbx
3318	adcxq	%rsi,%r15
3319
3320	incq	%rcx
3321	jnz	.Lsqrx8x_tail
3322
3323	cmpq	0+8(%rsp),%rbp
3324	jae	.Lsqrx8x_tail_done
3325
3326	subq	16+8(%rsp),%rsi
3327	movq	48+8(%rsp),%rdx
3328	leaq	64(%rbp),%rbp
3329	adcq	0(%rdi),%r8
3330	adcq	8(%rdi),%r9
3331	adcq	16(%rdi),%r10
3332	adcq	24(%rdi),%r11
3333	adcq	32(%rdi),%r12
3334	adcq	40(%rdi),%r13
3335	adcq	48(%rdi),%r14
3336	adcq	56(%rdi),%r15
3337	leaq	64(%rdi),%rdi
3338	sbbq	%rax,%rax
3339	subq	$8,%rcx
3340
3341	xorq	%rsi,%rsi
3342	movq	%rax,16+8(%rsp)
3343	jmp	.Lsqrx8x_tail
3344
3345.align	32
3346.Lsqrx8x_tail_done:
3347	xorq	%rax,%rax
3348	addq	24+8(%rsp),%r8
3349	adcq	$0,%r9
3350	adcq	$0,%r10
3351	adcq	$0,%r11
3352	adcq	$0,%r12
3353	adcq	$0,%r13
3354	adcq	$0,%r14
3355	adcq	$0,%r15
3356	adcq	$0,%rax
3357
3358	subq	16+8(%rsp),%rsi
3359.Lsqrx8x_no_tail:
3360	adcq	0(%rdi),%r8
3361.byte	102,72,15,126,217
3362	adcq	8(%rdi),%r9
3363	movq	56(%rbp),%rsi
3364.byte	102,72,15,126,213
3365	adcq	16(%rdi),%r10
3366	adcq	24(%rdi),%r11
3367	adcq	32(%rdi),%r12
3368	adcq	40(%rdi),%r13
3369	adcq	48(%rdi),%r14
3370	adcq	56(%rdi),%r15
3371	adcq	$0,%rax
3372
3373	movq	32+8(%rsp),%rbx
3374	movq	64(%rdi,%rcx,1),%rdx
3375
3376	movq	%r8,0(%rdi)
3377	leaq	64(%rdi),%r8
3378	movq	%r9,8(%rdi)
3379	movq	%r10,16(%rdi)
3380	movq	%r11,24(%rdi)
3381	movq	%r12,32(%rdi)
3382	movq	%r13,40(%rdi)
3383	movq	%r14,48(%rdi)
3384	movq	%r15,56(%rdi)
3385
3386	leaq	64(%rdi,%rcx,1),%rdi
3387	cmpq	8+8(%rsp),%r8
3388	jb	.Lsqrx8x_reduction_loop
3389	.byte	0xf3,0xc3
3390.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
.align	32
/*
 * __bn_postx4x_internal
 *
 * Post-processing pass for the x4x Montgomery code: performs the final
 * conditional subtraction of the modulus from the intermediate result,
 * branch-free (BMI2 ANDN builds a 0/-1 mask; the adc chain adds the
 * two's-complement of the modulus only when the mask is set).
 *
 * In (inferred from the visible code and the upstream x86_64-mont5.pl
 * layout -- NOTE(review): confirm against the caller):
 *   %rbp = modulus words, %rdi = temporary result,
 *   %rax = top carry from the reduction (0 or 1),
 *   %rcx = negated, scaled word count, %xmm1 = result pointer.
 * Out: canonical result stored through %rdx; %r9 = negated saved count.
 */
__bn_postx4x_internal:
	movq	0(%rbp),%r12		# n[0]
	movq	%rcx,%r10		# stash negative count
	movq	%rcx,%r9		# second copy, re-signed at exit
	negq	%rax			# carry 0/1 -> mask 0/-1
	sarq	$3+2,%rcx		# scale count to iterations (4 words/pass)

.byte	102,72,15,126,202	# movq %xmm1,%rdx: restore result pointer
.byte	102,72,15,126,206	# movq %xmm1,%rsi: same, for a back-to-back call
	decq	%r12			# n[0]-1 so ~(n[0]-1) == -n[0] after andn
	movq	8(%rbp),%r13
	xorq	%r8,%r8			# no borrow carried in yet
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqrx4x_sub_entry

.align	16
.Lsqrx4x_sub:
	movq	0(%rbp),%r12		# next 4 modulus words
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
.Lsqrx4x_sub_entry:
	andnq	%rax,%r12,%r12		# r12 = ~n[i] & mask (0 when mask clear)
	leaq	32(%rbp),%rbp
	andnq	%rax,%r13,%r13
	andnq	%rax,%r14,%r14
	andnq	%rax,%r15,%r15

	negq	%r8			# reload CF from the saved 0/-1 borrow
	adcq	0(%rdi),%r12		# tp[i] + (mask ? ~n[i] : 0) + CF,
	adcq	8(%rdi),%r13		#   i.e. tp - n when the mask is set
	adcq	16(%rdi),%r14
	adcq	24(%rdi),%r15
	movq	%r12,0(%rdx)		# emit canonical result words
	leaq	32(%rdi),%rdi
	movq	%r13,8(%rdx)
	sbbq	%r8,%r8			# park CF as 0/-1: the lea/inc below
	movq	%r14,16(%rdx)		#   must not be trusted to preserve it
	movq	%r15,24(%rdx)
	leaq	32(%rdx),%rdx

	incq	%rcx			# inc (not add) leaves CF alone anyway
	jnz	.Lsqrx4x_sub

	negq	%r9			# flip saved count's sign for the caller

	.byte	0xf3,0xc3		# rep ret
.size	__bn_postx4x_internal,.-__bn_postx4x_internal
.globl	bn_get_bits5
.type	bn_get_bits5,@function
.align	16
/*
 * bn_get_bits5(%rdi = byte array, %esi = bit offset) -> %eax
 *
 * Extracts the 5-bit window that starts at the given bit offset.
 * Branch-free on purpose (exponent windowing is secret-dependent):
 * when the in-word offset exceeds 11 the window would not fit in one
 * 16-bit load, so cmov switches to a base advanced by one byte and a
 * shift reduced by 8 instead of branching.
 */
bn_get_bits5:
	leaq	0(%rdi),%r10		# base for in-word offsets 0..11
	leaq	1(%rdi),%r11		# base+1 for offsets 12..15
	movl	%esi,%ecx
	shrl	$4,%esi			# %esi = index of 16-bit word
	andl	$15,%ecx		# %ecx = bit offset within that word
	leal	-8(%rcx),%eax		# candidate shift for the +1 base
	cmpl	$11,%ecx		# flags feed BOTH cmovs below; keep
	cmovaq	%r11,%r10		#   them adjacent to the compare
	cmoval	%eax,%ecx
	movzwl	(%r10,%rsi,2),%eax	# 16-bit window holding the 5 bits
	shrl	%cl,%eax
	andl	$31,%eax		# mask down to the 5-bit value
	.byte	0xf3,0xc3		# rep ret
.size	bn_get_bits5,.-bn_get_bits5
3459
.globl	bn_scatter5
.type	bn_scatter5,@function
.align	16
/*
 * bn_scatter5(%rdi = input words, %esi = word count,
 *             %rdx = power table, %rcx = table index)
 *
 * Stores %esi 64-bit words from %rdi into column %rcx of the table at
 * %rdx; successive words are placed 256 bytes apart, so one number
 * occupies a single 8-byte lane down the table rows.
 */
bn_scatter5:
	testl	%esi,%esi		# empty request: nothing to store
	jz	.Lscatter_epilogue
	leaq	(%rdx,%rcx,8),%rdx	# %rdx = row 0, column idx
.Lscatter:
	movq	(%rdi),%rax		# fetch next input word
	movq	%rax,(%rdx)		# ... and drop it into the table
	leaq	8(%rdi),%rdi
	leaq	256(%rdx),%rdx		# advance one table row
	decl	%esi
	jnz	.Lscatter
.Lscatter_epilogue:
	.byte	0xf3,0xc3		# rep ret
.size	bn_scatter5,.-bn_scatter5
3477
.globl	bn_gather5
.type	bn_gather5,@function
.align	32
/*
 * bn_gather5(%rdi = out, %esi = word count, %rdx = table, %ecx = index)
 *
 * Constant-time gather: reads one entry out of a 32-entry power table
 * without index-dependent memory addresses.  It first expands the index
 * into 256 bytes of per-lane equality masks on the stack (one 64-bit
 * all-ones lane for the selected index, zeros elsewhere), then AND/ORs
 * every table lane against those masks, so the access pattern is
 * identical for every index -- the cache-timing countermeasure this
 * function exists for.  Do not "simplify" into an indexed load.
 */
bn_gather5:
.LSEH_begin_bn_gather5:

/* Prologue kept as raw bytes so its encoding matches what the Win64
 * SEH metadata (see the .LSEH markers) expects:
 *   lea (%rsp),%r10   -- save incoming stack pointer
 *   sub $0x108,%rsp   -- room for the 256-byte mask area + slack */
.byte	0x4c,0x8d,0x14,0x24
.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00
	leaq	.Linc(%rip),%rax
	andq	$-16,%rsp		# movdqa below needs 16-byte alignment

	movd	%ecx,%xmm5
	movdqa	0(%rax),%xmm0		# {0,0,1,1} lane counters
	movdqa	16(%rax),%xmm1		# {2,2,2,2} per-step increment
	leaq	128(%rdx),%r11		# table biased by 128 for disp8 reach
	leaq	128(%rsp),%rax		# mask area, biased the same way

/* Mask generation: xmm5 = broadcast index; the interleaved paddd /
 * pcmpeqd chain walks counters 0..31 through the lanes and stores
 * sixteen 128-bit compare results at -128(%rax)..112(%rax). */
	pshufd	$0,%xmm5,%xmm5		# broadcast index to all dwords
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0		# lanes {0,1} == index ?
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1		# lanes {2,3} == index ?
	movdqa	%xmm0,-128(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,-112(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,-96(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,-80(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,-64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,-48(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,-32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,-16(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,16(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,48(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,80(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,96(%rax)
	movdqa	%xmm4,%xmm2
	movdqa	%xmm3,112(%rax)		# counters {30,31} mask, last of 16
	jmp	.Lgather

/* Each pass reads one full 256-byte table row (one 64-bit word of all
 * 32 entries), ANDs every lane with its mask and ORs them together,
 * leaving only the selected entry's word. */
.align	32
.Lgather:
	pxor	%xmm4,%xmm4		# OR-accumulators for the row
	pxor	%xmm5,%xmm5
	movdqa	-128(%r11),%xmm0
	movdqa	-112(%r11),%xmm1
	movdqa	-96(%r11),%xmm2
	pand	-128(%rax),%xmm0	# keep lane only where mask is set
	movdqa	-80(%r11),%xmm3
	pand	-112(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r11),%xmm0
	movdqa	-48(%r11),%xmm1
	movdqa	-32(%r11),%xmm2
	pand	-64(%rax),%xmm0
	movdqa	-16(%r11),%xmm3
	pand	-48(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r11),%xmm0
	movdqa	16(%r11),%xmm1
	movdqa	32(%r11),%xmm2
	pand	0(%rax),%xmm0
	movdqa	48(%r11),%xmm3
	pand	16(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r11),%xmm0
	movdqa	80(%r11),%xmm1
	movdqa	96(%r11),%xmm2
	pand	64(%rax),%xmm0
	movdqa	112(%r11),%xmm3
	pand	80(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4		# fold both accumulators
	leaq	256(%r11),%r11		# next table row
	pshufd	$0x4e,%xmm4,%xmm0	# swap qword halves ...
	por	%xmm4,%xmm0		# ... so the hit lands in the low qword
	movq	%xmm0,(%rdi)		# emit selected entry's next word
	leaq	8(%rdi),%rdi
	subl	$1,%esi
	jnz	.Lgather

	leaq	(%r10),%rsp		# restore %rsp saved by the prologue
	.byte	0xf3,0xc3		# rep ret
.LSEH_end_bn_gather5:
.size	bn_gather5,.-bn_gather5
.align	64
/* Seed/step vectors for the gather mask generators (used both by the
 * bn_gather5 code above and the mul_gather setup near the top of the
 * file): first 16 bytes seed the lane counters {0,0,1,1}, the next 16
 * are the per-step increment {2,2,2,2}. */
.Linc:
.long	0,0, 1,1		# initial counter pair, doubled per 64-bit lane
.long	2,2, 2,2		# increment applied each paddd step
/* ASCII, NUL-terminated: "Montgomery Multiplication with scatter/gather
 * for x86_64, CRYPTOGAMS by <appro@openssl.org>" */
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
