# x86_64-mont5.S revision 1.3 — generated output; do not edit by hand.
1#include <machine/asm.h>
2.text
3
4
5
6.globl	bn_mul_mont_gather5
7.type	bn_mul_mont_gather5,@function
8.align	64
9bn_mul_mont_gather5:
10	movl	%r9d,%r9d
11	movq	%rsp,%rax
12	testl	$7,%r9d
13	jnz	.Lmul_enter
14	jmp	.Lmul4x_enter
15
16.align	16
17.Lmul_enter:
18	movd	8(%rsp),%xmm5
19	pushq	%rbx
20	pushq	%rbp
21	pushq	%r12
22	pushq	%r13
23	pushq	%r14
24	pushq	%r15
25
26	negq	%r9
27	movq	%rsp,%r11
28	leaq	-280(%rsp,%r9,8),%r10
29	negq	%r9
30	andq	$-1024,%r10
31
32
33
34
35
36
37
38	subq	%r10,%r11
39	andq	$-4096,%r11
40	leaq	(%r10,%r11,1),%rsp
41	movq	(%rsp),%r11
42	cmpq	%r10,%rsp
43	ja	.Lmul_page_walk
44	jmp	.Lmul_page_walk_done
45
46.Lmul_page_walk:
47	leaq	-4096(%rsp),%rsp
48	movq	(%rsp),%r11
49	cmpq	%r10,%rsp
50	ja	.Lmul_page_walk
51.Lmul_page_walk_done:
52
53	leaq	.Linc(%rip),%r10
54	movq	%rax,8(%rsp,%r9,8)
55.Lmul_body:
56
57	leaq	128(%rdx),%r12
58	movdqa	0(%r10),%xmm0
59	movdqa	16(%r10),%xmm1
60	leaq	24-112(%rsp,%r9,8),%r10
61	andq	$-16,%r10
62
63	pshufd	$0,%xmm5,%xmm5
64	movdqa	%xmm1,%xmm4
65	movdqa	%xmm1,%xmm2
66	paddd	%xmm0,%xmm1
67	pcmpeqd	%xmm5,%xmm0
68.byte	0x67
69	movdqa	%xmm4,%xmm3
70	paddd	%xmm1,%xmm2
71	pcmpeqd	%xmm5,%xmm1
72	movdqa	%xmm0,112(%r10)
73	movdqa	%xmm4,%xmm0
74
75	paddd	%xmm2,%xmm3
76	pcmpeqd	%xmm5,%xmm2
77	movdqa	%xmm1,128(%r10)
78	movdqa	%xmm4,%xmm1
79
80	paddd	%xmm3,%xmm0
81	pcmpeqd	%xmm5,%xmm3
82	movdqa	%xmm2,144(%r10)
83	movdqa	%xmm4,%xmm2
84
85	paddd	%xmm0,%xmm1
86	pcmpeqd	%xmm5,%xmm0
87	movdqa	%xmm3,160(%r10)
88	movdqa	%xmm4,%xmm3
89	paddd	%xmm1,%xmm2
90	pcmpeqd	%xmm5,%xmm1
91	movdqa	%xmm0,176(%r10)
92	movdqa	%xmm4,%xmm0
93
94	paddd	%xmm2,%xmm3
95	pcmpeqd	%xmm5,%xmm2
96	movdqa	%xmm1,192(%r10)
97	movdqa	%xmm4,%xmm1
98
99	paddd	%xmm3,%xmm0
100	pcmpeqd	%xmm5,%xmm3
101	movdqa	%xmm2,208(%r10)
102	movdqa	%xmm4,%xmm2
103
104	paddd	%xmm0,%xmm1
105	pcmpeqd	%xmm5,%xmm0
106	movdqa	%xmm3,224(%r10)
107	movdqa	%xmm4,%xmm3
108	paddd	%xmm1,%xmm2
109	pcmpeqd	%xmm5,%xmm1
110	movdqa	%xmm0,240(%r10)
111	movdqa	%xmm4,%xmm0
112
113	paddd	%xmm2,%xmm3
114	pcmpeqd	%xmm5,%xmm2
115	movdqa	%xmm1,256(%r10)
116	movdqa	%xmm4,%xmm1
117
118	paddd	%xmm3,%xmm0
119	pcmpeqd	%xmm5,%xmm3
120	movdqa	%xmm2,272(%r10)
121	movdqa	%xmm4,%xmm2
122
123	paddd	%xmm0,%xmm1
124	pcmpeqd	%xmm5,%xmm0
125	movdqa	%xmm3,288(%r10)
126	movdqa	%xmm4,%xmm3
127	paddd	%xmm1,%xmm2
128	pcmpeqd	%xmm5,%xmm1
129	movdqa	%xmm0,304(%r10)
130
131	paddd	%xmm2,%xmm3
132.byte	0x67
133	pcmpeqd	%xmm5,%xmm2
134	movdqa	%xmm1,320(%r10)
135
136	pcmpeqd	%xmm5,%xmm3
137	movdqa	%xmm2,336(%r10)
138	pand	64(%r12),%xmm0
139
140	pand	80(%r12),%xmm1
141	pand	96(%r12),%xmm2
142	movdqa	%xmm3,352(%r10)
143	pand	112(%r12),%xmm3
144	por	%xmm2,%xmm0
145	por	%xmm3,%xmm1
146	movdqa	-128(%r12),%xmm4
147	movdqa	-112(%r12),%xmm5
148	movdqa	-96(%r12),%xmm2
149	pand	112(%r10),%xmm4
150	movdqa	-80(%r12),%xmm3
151	pand	128(%r10),%xmm5
152	por	%xmm4,%xmm0
153	pand	144(%r10),%xmm2
154	por	%xmm5,%xmm1
155	pand	160(%r10),%xmm3
156	por	%xmm2,%xmm0
157	por	%xmm3,%xmm1
158	movdqa	-64(%r12),%xmm4
159	movdqa	-48(%r12),%xmm5
160	movdqa	-32(%r12),%xmm2
161	pand	176(%r10),%xmm4
162	movdqa	-16(%r12),%xmm3
163	pand	192(%r10),%xmm5
164	por	%xmm4,%xmm0
165	pand	208(%r10),%xmm2
166	por	%xmm5,%xmm1
167	pand	224(%r10),%xmm3
168	por	%xmm2,%xmm0
169	por	%xmm3,%xmm1
170	movdqa	0(%r12),%xmm4
171	movdqa	16(%r12),%xmm5
172	movdqa	32(%r12),%xmm2
173	pand	240(%r10),%xmm4
174	movdqa	48(%r12),%xmm3
175	pand	256(%r10),%xmm5
176	por	%xmm4,%xmm0
177	pand	272(%r10),%xmm2
178	por	%xmm5,%xmm1
179	pand	288(%r10),%xmm3
180	por	%xmm2,%xmm0
181	por	%xmm3,%xmm1
182	por	%xmm1,%xmm0
183	pshufd	$0x4e,%xmm0,%xmm1
184	por	%xmm1,%xmm0
185	leaq	256(%r12),%r12
186.byte	102,72,15,126,195
187
188	movq	(%r8),%r8
189	movq	(%rsi),%rax
190
191	xorq	%r14,%r14
192	xorq	%r15,%r15
193
194	movq	%r8,%rbp
195	mulq	%rbx
196	movq	%rax,%r10
197	movq	(%rcx),%rax
198
199	imulq	%r10,%rbp
200	movq	%rdx,%r11
201
202	mulq	%rbp
203	addq	%rax,%r10
204	movq	8(%rsi),%rax
205	adcq	$0,%rdx
206	movq	%rdx,%r13
207
208	leaq	1(%r15),%r15
209	jmp	.L1st_enter
210
211.align	16
212.L1st:
213	addq	%rax,%r13
214	movq	(%rsi,%r15,8),%rax
215	adcq	$0,%rdx
216	addq	%r11,%r13
217	movq	%r10,%r11
218	adcq	$0,%rdx
219	movq	%r13,-16(%rsp,%r15,8)
220	movq	%rdx,%r13
221
222.L1st_enter:
223	mulq	%rbx
224	addq	%rax,%r11
225	movq	(%rcx,%r15,8),%rax
226	adcq	$0,%rdx
227	leaq	1(%r15),%r15
228	movq	%rdx,%r10
229
230	mulq	%rbp
231	cmpq	%r9,%r15
232	jne	.L1st
233
234
235	addq	%rax,%r13
236	adcq	$0,%rdx
237	addq	%r11,%r13
238	adcq	$0,%rdx
239	movq	%r13,-16(%rsp,%r9,8)
240	movq	%rdx,%r13
241	movq	%r10,%r11
242
243	xorq	%rdx,%rdx
244	addq	%r11,%r13
245	adcq	$0,%rdx
246	movq	%r13,-8(%rsp,%r9,8)
247	movq	%rdx,(%rsp,%r9,8)
248
249	leaq	1(%r14),%r14
250	jmp	.Louter
251.align	16
252.Louter:
253	leaq	24+128(%rsp,%r9,8),%rdx
254	andq	$-16,%rdx
255	pxor	%xmm4,%xmm4
256	pxor	%xmm5,%xmm5
257	movdqa	-128(%r12),%xmm0
258	movdqa	-112(%r12),%xmm1
259	movdqa	-96(%r12),%xmm2
260	movdqa	-80(%r12),%xmm3
261	pand	-128(%rdx),%xmm0
262	pand	-112(%rdx),%xmm1
263	por	%xmm0,%xmm4
264	pand	-96(%rdx),%xmm2
265	por	%xmm1,%xmm5
266	pand	-80(%rdx),%xmm3
267	por	%xmm2,%xmm4
268	por	%xmm3,%xmm5
269	movdqa	-64(%r12),%xmm0
270	movdqa	-48(%r12),%xmm1
271	movdqa	-32(%r12),%xmm2
272	movdqa	-16(%r12),%xmm3
273	pand	-64(%rdx),%xmm0
274	pand	-48(%rdx),%xmm1
275	por	%xmm0,%xmm4
276	pand	-32(%rdx),%xmm2
277	por	%xmm1,%xmm5
278	pand	-16(%rdx),%xmm3
279	por	%xmm2,%xmm4
280	por	%xmm3,%xmm5
281	movdqa	0(%r12),%xmm0
282	movdqa	16(%r12),%xmm1
283	movdqa	32(%r12),%xmm2
284	movdqa	48(%r12),%xmm3
285	pand	0(%rdx),%xmm0
286	pand	16(%rdx),%xmm1
287	por	%xmm0,%xmm4
288	pand	32(%rdx),%xmm2
289	por	%xmm1,%xmm5
290	pand	48(%rdx),%xmm3
291	por	%xmm2,%xmm4
292	por	%xmm3,%xmm5
293	movdqa	64(%r12),%xmm0
294	movdqa	80(%r12),%xmm1
295	movdqa	96(%r12),%xmm2
296	movdqa	112(%r12),%xmm3
297	pand	64(%rdx),%xmm0
298	pand	80(%rdx),%xmm1
299	por	%xmm0,%xmm4
300	pand	96(%rdx),%xmm2
301	por	%xmm1,%xmm5
302	pand	112(%rdx),%xmm3
303	por	%xmm2,%xmm4
304	por	%xmm3,%xmm5
305	por	%xmm5,%xmm4
306	pshufd	$0x4e,%xmm4,%xmm0
307	por	%xmm4,%xmm0
308	leaq	256(%r12),%r12
309
310	movq	(%rsi),%rax
311.byte	102,72,15,126,195
312
313	xorq	%r15,%r15
314	movq	%r8,%rbp
315	movq	(%rsp),%r10
316
317	mulq	%rbx
318	addq	%rax,%r10
319	movq	(%rcx),%rax
320	adcq	$0,%rdx
321
322	imulq	%r10,%rbp
323	movq	%rdx,%r11
324
325	mulq	%rbp
326	addq	%rax,%r10
327	movq	8(%rsi),%rax
328	adcq	$0,%rdx
329	movq	8(%rsp),%r10
330	movq	%rdx,%r13
331
332	leaq	1(%r15),%r15
333	jmp	.Linner_enter
334
335.align	16
336.Linner:
337	addq	%rax,%r13
338	movq	(%rsi,%r15,8),%rax
339	adcq	$0,%rdx
340	addq	%r10,%r13
341	movq	(%rsp,%r15,8),%r10
342	adcq	$0,%rdx
343	movq	%r13,-16(%rsp,%r15,8)
344	movq	%rdx,%r13
345
346.Linner_enter:
347	mulq	%rbx
348	addq	%rax,%r11
349	movq	(%rcx,%r15,8),%rax
350	adcq	$0,%rdx
351	addq	%r11,%r10
352	movq	%rdx,%r11
353	adcq	$0,%r11
354	leaq	1(%r15),%r15
355
356	mulq	%rbp
357	cmpq	%r9,%r15
358	jne	.Linner
359
360	addq	%rax,%r13
361	adcq	$0,%rdx
362	addq	%r10,%r13
363	movq	(%rsp,%r9,8),%r10
364	adcq	$0,%rdx
365	movq	%r13,-16(%rsp,%r9,8)
366	movq	%rdx,%r13
367
368	xorq	%rdx,%rdx
369	addq	%r11,%r13
370	adcq	$0,%rdx
371	addq	%r10,%r13
372	adcq	$0,%rdx
373	movq	%r13,-8(%rsp,%r9,8)
374	movq	%rdx,(%rsp,%r9,8)
375
376	leaq	1(%r14),%r14
377	cmpq	%r9,%r14
378	jb	.Louter
379
380	xorq	%r14,%r14
381	movq	(%rsp),%rax
382	leaq	(%rsp),%rsi
383	movq	%r9,%r15
384	jmp	.Lsub
385.align	16
386.Lsub:	sbbq	(%rcx,%r14,8),%rax
387	movq	%rax,(%rdi,%r14,8)
388	movq	8(%rsi,%r14,8),%rax
389	leaq	1(%r14),%r14
390	decq	%r15
391	jnz	.Lsub
392
393	sbbq	$0,%rax
394	xorq	%r14,%r14
395	andq	%rax,%rsi
396	notq	%rax
397	movq	%rdi,%rcx
398	andq	%rax,%rcx
399	movq	%r9,%r15
400	orq	%rcx,%rsi
401.align	16
402.Lcopy:
403	movq	(%rsi,%r14,8),%rax
404	movq	%r14,(%rsp,%r14,8)
405	movq	%rax,(%rdi,%r14,8)
406	leaq	1(%r14),%r14
407	subq	$1,%r15
408	jnz	.Lcopy
409
410	movq	8(%rsp,%r9,8),%rsi
411	movq	$1,%rax
412
413	movq	-48(%rsi),%r15
414	movq	-40(%rsi),%r14
415	movq	-32(%rsi),%r13
416	movq	-24(%rsi),%r12
417	movq	-16(%rsi),%rbp
418	movq	-8(%rsi),%rbx
419	leaq	(%rsi),%rsp
420.Lmul_epilogue:
421	.byte	0xf3,0xc3
422.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
423.type	bn_mul4x_mont_gather5,@function
424.align	32
425bn_mul4x_mont_gather5:
426.byte	0x67
427	movq	%rsp,%rax
428.Lmul4x_enter:
429	pushq	%rbx
430	pushq	%rbp
431	pushq	%r12
432	pushq	%r13
433	pushq	%r14
434	pushq	%r15
435.Lmul4x_prologue:
436
437.byte	0x67
438	shll	$3,%r9d
439	leaq	(%r9,%r9,2),%r10
440	negq	%r9
441
442
443
444
445
446
447
448
449
450
451	leaq	-320(%rsp,%r9,2),%r11
452	movq	%rsp,%rbp
453	subq	%rdi,%r11
454	andq	$4095,%r11
455	cmpq	%r11,%r10
456	jb	.Lmul4xsp_alt
457	subq	%r11,%rbp
458	leaq	-320(%rbp,%r9,2),%rbp
459	jmp	.Lmul4xsp_done
460
461.align	32
462.Lmul4xsp_alt:
463	leaq	4096-320(,%r9,2),%r10
464	leaq	-320(%rbp,%r9,2),%rbp
465	subq	%r10,%r11
466	movq	$0,%r10
467	cmovcq	%r10,%r11
468	subq	%r11,%rbp
469.Lmul4xsp_done:
470	andq	$-64,%rbp
471	movq	%rsp,%r11
472	subq	%rbp,%r11
473	andq	$-4096,%r11
474	leaq	(%r11,%rbp,1),%rsp
475	movq	(%rsp),%r10
476	cmpq	%rbp,%rsp
477	ja	.Lmul4x_page_walk
478	jmp	.Lmul4x_page_walk_done
479
480.Lmul4x_page_walk:
481	leaq	-4096(%rsp),%rsp
482	movq	(%rsp),%r10
483	cmpq	%rbp,%rsp
484	ja	.Lmul4x_page_walk
485.Lmul4x_page_walk_done:
486
487	negq	%r9
488
489	movq	%rax,40(%rsp)
490.Lmul4x_body:
491
492	call	mul4x_internal
493
494	movq	40(%rsp),%rsi
495	movq	$1,%rax
496
497	movq	-48(%rsi),%r15
498	movq	-40(%rsi),%r14
499	movq	-32(%rsi),%r13
500	movq	-24(%rsi),%r12
501	movq	-16(%rsi),%rbp
502	movq	-8(%rsi),%rbx
503	leaq	(%rsi),%rsp
504.Lmul4x_epilogue:
505	.byte	0xf3,0xc3
506.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
507
508.type	mul4x_internal,@function
509.align	32
510mul4x_internal:
511	shlq	$5,%r9
512	movd	8(%rax),%xmm5
513	leaq	.Linc(%rip),%rax
514	leaq	128(%rdx,%r9,1),%r13
515	shrq	$5,%r9
516	movdqa	0(%rax),%xmm0
517	movdqa	16(%rax),%xmm1
518	leaq	88-112(%rsp,%r9,1),%r10
519	leaq	128(%rdx),%r12
520
521	pshufd	$0,%xmm5,%xmm5
522	movdqa	%xmm1,%xmm4
523.byte	0x67,0x67
524	movdqa	%xmm1,%xmm2
525	paddd	%xmm0,%xmm1
526	pcmpeqd	%xmm5,%xmm0
527.byte	0x67
528	movdqa	%xmm4,%xmm3
529	paddd	%xmm1,%xmm2
530	pcmpeqd	%xmm5,%xmm1
531	movdqa	%xmm0,112(%r10)
532	movdqa	%xmm4,%xmm0
533
534	paddd	%xmm2,%xmm3
535	pcmpeqd	%xmm5,%xmm2
536	movdqa	%xmm1,128(%r10)
537	movdqa	%xmm4,%xmm1
538
539	paddd	%xmm3,%xmm0
540	pcmpeqd	%xmm5,%xmm3
541	movdqa	%xmm2,144(%r10)
542	movdqa	%xmm4,%xmm2
543
544	paddd	%xmm0,%xmm1
545	pcmpeqd	%xmm5,%xmm0
546	movdqa	%xmm3,160(%r10)
547	movdqa	%xmm4,%xmm3
548	paddd	%xmm1,%xmm2
549	pcmpeqd	%xmm5,%xmm1
550	movdqa	%xmm0,176(%r10)
551	movdqa	%xmm4,%xmm0
552
553	paddd	%xmm2,%xmm3
554	pcmpeqd	%xmm5,%xmm2
555	movdqa	%xmm1,192(%r10)
556	movdqa	%xmm4,%xmm1
557
558	paddd	%xmm3,%xmm0
559	pcmpeqd	%xmm5,%xmm3
560	movdqa	%xmm2,208(%r10)
561	movdqa	%xmm4,%xmm2
562
563	paddd	%xmm0,%xmm1
564	pcmpeqd	%xmm5,%xmm0
565	movdqa	%xmm3,224(%r10)
566	movdqa	%xmm4,%xmm3
567	paddd	%xmm1,%xmm2
568	pcmpeqd	%xmm5,%xmm1
569	movdqa	%xmm0,240(%r10)
570	movdqa	%xmm4,%xmm0
571
572	paddd	%xmm2,%xmm3
573	pcmpeqd	%xmm5,%xmm2
574	movdqa	%xmm1,256(%r10)
575	movdqa	%xmm4,%xmm1
576
577	paddd	%xmm3,%xmm0
578	pcmpeqd	%xmm5,%xmm3
579	movdqa	%xmm2,272(%r10)
580	movdqa	%xmm4,%xmm2
581
582	paddd	%xmm0,%xmm1
583	pcmpeqd	%xmm5,%xmm0
584	movdqa	%xmm3,288(%r10)
585	movdqa	%xmm4,%xmm3
586	paddd	%xmm1,%xmm2
587	pcmpeqd	%xmm5,%xmm1
588	movdqa	%xmm0,304(%r10)
589
590	paddd	%xmm2,%xmm3
591.byte	0x67
592	pcmpeqd	%xmm5,%xmm2
593	movdqa	%xmm1,320(%r10)
594
595	pcmpeqd	%xmm5,%xmm3
596	movdqa	%xmm2,336(%r10)
597	pand	64(%r12),%xmm0
598
599	pand	80(%r12),%xmm1
600	pand	96(%r12),%xmm2
601	movdqa	%xmm3,352(%r10)
602	pand	112(%r12),%xmm3
603	por	%xmm2,%xmm0
604	por	%xmm3,%xmm1
605	movdqa	-128(%r12),%xmm4
606	movdqa	-112(%r12),%xmm5
607	movdqa	-96(%r12),%xmm2
608	pand	112(%r10),%xmm4
609	movdqa	-80(%r12),%xmm3
610	pand	128(%r10),%xmm5
611	por	%xmm4,%xmm0
612	pand	144(%r10),%xmm2
613	por	%xmm5,%xmm1
614	pand	160(%r10),%xmm3
615	por	%xmm2,%xmm0
616	por	%xmm3,%xmm1
617	movdqa	-64(%r12),%xmm4
618	movdqa	-48(%r12),%xmm5
619	movdqa	-32(%r12),%xmm2
620	pand	176(%r10),%xmm4
621	movdqa	-16(%r12),%xmm3
622	pand	192(%r10),%xmm5
623	por	%xmm4,%xmm0
624	pand	208(%r10),%xmm2
625	por	%xmm5,%xmm1
626	pand	224(%r10),%xmm3
627	por	%xmm2,%xmm0
628	por	%xmm3,%xmm1
629	movdqa	0(%r12),%xmm4
630	movdqa	16(%r12),%xmm5
631	movdqa	32(%r12),%xmm2
632	pand	240(%r10),%xmm4
633	movdqa	48(%r12),%xmm3
634	pand	256(%r10),%xmm5
635	por	%xmm4,%xmm0
636	pand	272(%r10),%xmm2
637	por	%xmm5,%xmm1
638	pand	288(%r10),%xmm3
639	por	%xmm2,%xmm0
640	por	%xmm3,%xmm1
641	por	%xmm1,%xmm0
642	pshufd	$0x4e,%xmm0,%xmm1
643	por	%xmm1,%xmm0
644	leaq	256(%r12),%r12
645.byte	102,72,15,126,195
646
647	movq	%r13,16+8(%rsp)
648	movq	%rdi,56+8(%rsp)
649
650	movq	(%r8),%r8
651	movq	(%rsi),%rax
652	leaq	(%rsi,%r9,1),%rsi
653	negq	%r9
654
655	movq	%r8,%rbp
656	mulq	%rbx
657	movq	%rax,%r10
658	movq	(%rcx),%rax
659
660	imulq	%r10,%rbp
661	leaq	64+8(%rsp),%r14
662	movq	%rdx,%r11
663
664	mulq	%rbp
665	addq	%rax,%r10
666	movq	8(%rsi,%r9,1),%rax
667	adcq	$0,%rdx
668	movq	%rdx,%rdi
669
670	mulq	%rbx
671	addq	%rax,%r11
672	movq	8(%rcx),%rax
673	adcq	$0,%rdx
674	movq	%rdx,%r10
675
676	mulq	%rbp
677	addq	%rax,%rdi
678	movq	16(%rsi,%r9,1),%rax
679	adcq	$0,%rdx
680	addq	%r11,%rdi
681	leaq	32(%r9),%r15
682	leaq	32(%rcx),%rcx
683	adcq	$0,%rdx
684	movq	%rdi,(%r14)
685	movq	%rdx,%r13
686	jmp	.L1st4x
687
688.align	32
689.L1st4x:
690	mulq	%rbx
691	addq	%rax,%r10
692	movq	-16(%rcx),%rax
693	leaq	32(%r14),%r14
694	adcq	$0,%rdx
695	movq	%rdx,%r11
696
697	mulq	%rbp
698	addq	%rax,%r13
699	movq	-8(%rsi,%r15,1),%rax
700	adcq	$0,%rdx
701	addq	%r10,%r13
702	adcq	$0,%rdx
703	movq	%r13,-24(%r14)
704	movq	%rdx,%rdi
705
706	mulq	%rbx
707	addq	%rax,%r11
708	movq	-8(%rcx),%rax
709	adcq	$0,%rdx
710	movq	%rdx,%r10
711
712	mulq	%rbp
713	addq	%rax,%rdi
714	movq	(%rsi,%r15,1),%rax
715	adcq	$0,%rdx
716	addq	%r11,%rdi
717	adcq	$0,%rdx
718	movq	%rdi,-16(%r14)
719	movq	%rdx,%r13
720
721	mulq	%rbx
722	addq	%rax,%r10
723	movq	0(%rcx),%rax
724	adcq	$0,%rdx
725	movq	%rdx,%r11
726
727	mulq	%rbp
728	addq	%rax,%r13
729	movq	8(%rsi,%r15,1),%rax
730	adcq	$0,%rdx
731	addq	%r10,%r13
732	adcq	$0,%rdx
733	movq	%r13,-8(%r14)
734	movq	%rdx,%rdi
735
736	mulq	%rbx
737	addq	%rax,%r11
738	movq	8(%rcx),%rax
739	adcq	$0,%rdx
740	movq	%rdx,%r10
741
742	mulq	%rbp
743	addq	%rax,%rdi
744	movq	16(%rsi,%r15,1),%rax
745	adcq	$0,%rdx
746	addq	%r11,%rdi
747	leaq	32(%rcx),%rcx
748	adcq	$0,%rdx
749	movq	%rdi,(%r14)
750	movq	%rdx,%r13
751
752	addq	$32,%r15
753	jnz	.L1st4x
754
755	mulq	%rbx
756	addq	%rax,%r10
757	movq	-16(%rcx),%rax
758	leaq	32(%r14),%r14
759	adcq	$0,%rdx
760	movq	%rdx,%r11
761
762	mulq	%rbp
763	addq	%rax,%r13
764	movq	-8(%rsi),%rax
765	adcq	$0,%rdx
766	addq	%r10,%r13
767	adcq	$0,%rdx
768	movq	%r13,-24(%r14)
769	movq	%rdx,%rdi
770
771	mulq	%rbx
772	addq	%rax,%r11
773	movq	-8(%rcx),%rax
774	adcq	$0,%rdx
775	movq	%rdx,%r10
776
777	mulq	%rbp
778	addq	%rax,%rdi
779	movq	(%rsi,%r9,1),%rax
780	adcq	$0,%rdx
781	addq	%r11,%rdi
782	adcq	$0,%rdx
783	movq	%rdi,-16(%r14)
784	movq	%rdx,%r13
785
786	leaq	(%rcx,%r9,1),%rcx
787
788	xorq	%rdi,%rdi
789	addq	%r10,%r13
790	adcq	$0,%rdi
791	movq	%r13,-8(%r14)
792
793	jmp	.Louter4x
794
795.align	32
796.Louter4x:
797	leaq	16+128(%r14),%rdx
798	pxor	%xmm4,%xmm4
799	pxor	%xmm5,%xmm5
800	movdqa	-128(%r12),%xmm0
801	movdqa	-112(%r12),%xmm1
802	movdqa	-96(%r12),%xmm2
803	movdqa	-80(%r12),%xmm3
804	pand	-128(%rdx),%xmm0
805	pand	-112(%rdx),%xmm1
806	por	%xmm0,%xmm4
807	pand	-96(%rdx),%xmm2
808	por	%xmm1,%xmm5
809	pand	-80(%rdx),%xmm3
810	por	%xmm2,%xmm4
811	por	%xmm3,%xmm5
812	movdqa	-64(%r12),%xmm0
813	movdqa	-48(%r12),%xmm1
814	movdqa	-32(%r12),%xmm2
815	movdqa	-16(%r12),%xmm3
816	pand	-64(%rdx),%xmm0
817	pand	-48(%rdx),%xmm1
818	por	%xmm0,%xmm4
819	pand	-32(%rdx),%xmm2
820	por	%xmm1,%xmm5
821	pand	-16(%rdx),%xmm3
822	por	%xmm2,%xmm4
823	por	%xmm3,%xmm5
824	movdqa	0(%r12),%xmm0
825	movdqa	16(%r12),%xmm1
826	movdqa	32(%r12),%xmm2
827	movdqa	48(%r12),%xmm3
828	pand	0(%rdx),%xmm0
829	pand	16(%rdx),%xmm1
830	por	%xmm0,%xmm4
831	pand	32(%rdx),%xmm2
832	por	%xmm1,%xmm5
833	pand	48(%rdx),%xmm3
834	por	%xmm2,%xmm4
835	por	%xmm3,%xmm5
836	movdqa	64(%r12),%xmm0
837	movdqa	80(%r12),%xmm1
838	movdqa	96(%r12),%xmm2
839	movdqa	112(%r12),%xmm3
840	pand	64(%rdx),%xmm0
841	pand	80(%rdx),%xmm1
842	por	%xmm0,%xmm4
843	pand	96(%rdx),%xmm2
844	por	%xmm1,%xmm5
845	pand	112(%rdx),%xmm3
846	por	%xmm2,%xmm4
847	por	%xmm3,%xmm5
848	por	%xmm5,%xmm4
849	pshufd	$0x4e,%xmm4,%xmm0
850	por	%xmm4,%xmm0
851	leaq	256(%r12),%r12
852.byte	102,72,15,126,195
853
854	movq	(%r14,%r9,1),%r10
855	movq	%r8,%rbp
856	mulq	%rbx
857	addq	%rax,%r10
858	movq	(%rcx),%rax
859	adcq	$0,%rdx
860
861	imulq	%r10,%rbp
862	movq	%rdx,%r11
863	movq	%rdi,(%r14)
864
865	leaq	(%r14,%r9,1),%r14
866
867	mulq	%rbp
868	addq	%rax,%r10
869	movq	8(%rsi,%r9,1),%rax
870	adcq	$0,%rdx
871	movq	%rdx,%rdi
872
873	mulq	%rbx
874	addq	%rax,%r11
875	movq	8(%rcx),%rax
876	adcq	$0,%rdx
877	addq	8(%r14),%r11
878	adcq	$0,%rdx
879	movq	%rdx,%r10
880
881	mulq	%rbp
882	addq	%rax,%rdi
883	movq	16(%rsi,%r9,1),%rax
884	adcq	$0,%rdx
885	addq	%r11,%rdi
886	leaq	32(%r9),%r15
887	leaq	32(%rcx),%rcx
888	adcq	$0,%rdx
889	movq	%rdx,%r13
890	jmp	.Linner4x
891
892.align	32
893.Linner4x:
894	mulq	%rbx
895	addq	%rax,%r10
896	movq	-16(%rcx),%rax
897	adcq	$0,%rdx
898	addq	16(%r14),%r10
899	leaq	32(%r14),%r14
900	adcq	$0,%rdx
901	movq	%rdx,%r11
902
903	mulq	%rbp
904	addq	%rax,%r13
905	movq	-8(%rsi,%r15,1),%rax
906	adcq	$0,%rdx
907	addq	%r10,%r13
908	adcq	$0,%rdx
909	movq	%rdi,-32(%r14)
910	movq	%rdx,%rdi
911
912	mulq	%rbx
913	addq	%rax,%r11
914	movq	-8(%rcx),%rax
915	adcq	$0,%rdx
916	addq	-8(%r14),%r11
917	adcq	$0,%rdx
918	movq	%rdx,%r10
919
920	mulq	%rbp
921	addq	%rax,%rdi
922	movq	(%rsi,%r15,1),%rax
923	adcq	$0,%rdx
924	addq	%r11,%rdi
925	adcq	$0,%rdx
926	movq	%r13,-24(%r14)
927	movq	%rdx,%r13
928
929	mulq	%rbx
930	addq	%rax,%r10
931	movq	0(%rcx),%rax
932	adcq	$0,%rdx
933	addq	(%r14),%r10
934	adcq	$0,%rdx
935	movq	%rdx,%r11
936
937	mulq	%rbp
938	addq	%rax,%r13
939	movq	8(%rsi,%r15,1),%rax
940	adcq	$0,%rdx
941	addq	%r10,%r13
942	adcq	$0,%rdx
943	movq	%rdi,-16(%r14)
944	movq	%rdx,%rdi
945
946	mulq	%rbx
947	addq	%rax,%r11
948	movq	8(%rcx),%rax
949	adcq	$0,%rdx
950	addq	8(%r14),%r11
951	adcq	$0,%rdx
952	movq	%rdx,%r10
953
954	mulq	%rbp
955	addq	%rax,%rdi
956	movq	16(%rsi,%r15,1),%rax
957	adcq	$0,%rdx
958	addq	%r11,%rdi
959	leaq	32(%rcx),%rcx
960	adcq	$0,%rdx
961	movq	%r13,-8(%r14)
962	movq	%rdx,%r13
963
964	addq	$32,%r15
965	jnz	.Linner4x
966
967	mulq	%rbx
968	addq	%rax,%r10
969	movq	-16(%rcx),%rax
970	adcq	$0,%rdx
971	addq	16(%r14),%r10
972	leaq	32(%r14),%r14
973	adcq	$0,%rdx
974	movq	%rdx,%r11
975
976	mulq	%rbp
977	addq	%rax,%r13
978	movq	-8(%rsi),%rax
979	adcq	$0,%rdx
980	addq	%r10,%r13
981	adcq	$0,%rdx
982	movq	%rdi,-32(%r14)
983	movq	%rdx,%rdi
984
985	mulq	%rbx
986	addq	%rax,%r11
987	movq	%rbp,%rax
988	movq	-8(%rcx),%rbp
989	adcq	$0,%rdx
990	addq	-8(%r14),%r11
991	adcq	$0,%rdx
992	movq	%rdx,%r10
993
994	mulq	%rbp
995	addq	%rax,%rdi
996	movq	(%rsi,%r9,1),%rax
997	adcq	$0,%rdx
998	addq	%r11,%rdi
999	adcq	$0,%rdx
1000	movq	%r13,-24(%r14)
1001	movq	%rdx,%r13
1002
1003	movq	%rdi,-16(%r14)
1004	leaq	(%rcx,%r9,1),%rcx
1005
1006	xorq	%rdi,%rdi
1007	addq	%r10,%r13
1008	adcq	$0,%rdi
1009	addq	(%r14),%r13
1010	adcq	$0,%rdi
1011	movq	%r13,-8(%r14)
1012
1013	cmpq	16+8(%rsp),%r12
1014	jb	.Louter4x
1015	xorq	%rax,%rax
1016	subq	%r13,%rbp
1017	adcq	%r15,%r15
1018	orq	%r15,%rdi
1019	subq	%rdi,%rax
1020	leaq	(%r14,%r9,1),%rbx
1021	movq	(%rcx),%r12
1022	leaq	(%rcx),%rbp
1023	movq	%r9,%rcx
1024	sarq	$3+2,%rcx
1025	movq	56+8(%rsp),%rdi
1026	decq	%r12
1027	xorq	%r10,%r10
1028	movq	8(%rbp),%r13
1029	movq	16(%rbp),%r14
1030	movq	24(%rbp),%r15
1031	jmp	.Lsqr4x_sub_entry
1032.size	mul4x_internal,.-mul4x_internal
1033.globl	bn_power5
1034.type	bn_power5,@function
1035.align	32
1036bn_power5:
1037	movq	%rsp,%rax
1038	pushq	%rbx
1039	pushq	%rbp
1040	pushq	%r12
1041	pushq	%r13
1042	pushq	%r14
1043	pushq	%r15
1044.Lpower5_prologue:
1045
1046	shll	$3,%r9d
1047	leal	(%r9,%r9,2),%r10d
1048	negq	%r9
1049	movq	(%r8),%r8
1050
1051
1052
1053
1054
1055
1056
1057
1058	leaq	-320(%rsp,%r9,2),%r11
1059	movq	%rsp,%rbp
1060	subq	%rdi,%r11
1061	andq	$4095,%r11
1062	cmpq	%r11,%r10
1063	jb	.Lpwr_sp_alt
1064	subq	%r11,%rbp
1065	leaq	-320(%rbp,%r9,2),%rbp
1066	jmp	.Lpwr_sp_done
1067
1068.align	32
1069.Lpwr_sp_alt:
1070	leaq	4096-320(,%r9,2),%r10
1071	leaq	-320(%rbp,%r9,2),%rbp
1072	subq	%r10,%r11
1073	movq	$0,%r10
1074	cmovcq	%r10,%r11
1075	subq	%r11,%rbp
1076.Lpwr_sp_done:
1077	andq	$-64,%rbp
1078	movq	%rsp,%r11
1079	subq	%rbp,%r11
1080	andq	$-4096,%r11
1081	leaq	(%r11,%rbp,1),%rsp
1082	movq	(%rsp),%r10
1083	cmpq	%rbp,%rsp
1084	ja	.Lpwr_page_walk
1085	jmp	.Lpwr_page_walk_done
1086
1087.Lpwr_page_walk:
1088	leaq	-4096(%rsp),%rsp
1089	movq	(%rsp),%r10
1090	cmpq	%rbp,%rsp
1091	ja	.Lpwr_page_walk
1092.Lpwr_page_walk_done:
1093
1094	movq	%r9,%r10
1095	negq	%r9
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106	movq	%r8,32(%rsp)
1107	movq	%rax,40(%rsp)
1108.Lpower5_body:
1109.byte	102,72,15,110,207
1110.byte	102,72,15,110,209
1111.byte	102,73,15,110,218
1112.byte	102,72,15,110,226
1113
1114	call	__bn_sqr8x_internal
1115	call	__bn_post4x_internal
1116	call	__bn_sqr8x_internal
1117	call	__bn_post4x_internal
1118	call	__bn_sqr8x_internal
1119	call	__bn_post4x_internal
1120	call	__bn_sqr8x_internal
1121	call	__bn_post4x_internal
1122	call	__bn_sqr8x_internal
1123	call	__bn_post4x_internal
1124
1125.byte	102,72,15,126,209
1126.byte	102,72,15,126,226
1127	movq	%rsi,%rdi
1128	movq	40(%rsp),%rax
1129	leaq	32(%rsp),%r8
1130
1131	call	mul4x_internal
1132
1133	movq	40(%rsp),%rsi
1134	movq	$1,%rax
1135	movq	-48(%rsi),%r15
1136	movq	-40(%rsi),%r14
1137	movq	-32(%rsi),%r13
1138	movq	-24(%rsi),%r12
1139	movq	-16(%rsi),%rbp
1140	movq	-8(%rsi),%rbx
1141	leaq	(%rsi),%rsp
1142.Lpower5_epilogue:
1143	.byte	0xf3,0xc3
1144.size	bn_power5,.-bn_power5
1145
1146.globl	bn_sqr8x_internal
1147.hidden	bn_sqr8x_internal
1148.type	bn_sqr8x_internal,@function
1149.align	32
1150bn_sqr8x_internal:
1151__bn_sqr8x_internal:
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225	leaq	32(%r10),%rbp
1226	leaq	(%rsi,%r9,1),%rsi
1227
1228	movq	%r9,%rcx
1229
1230
1231	movq	-32(%rsi,%rbp,1),%r14
1232	leaq	48+8(%rsp,%r9,2),%rdi
1233	movq	-24(%rsi,%rbp,1),%rax
1234	leaq	-32(%rdi,%rbp,1),%rdi
1235	movq	-16(%rsi,%rbp,1),%rbx
1236	movq	%rax,%r15
1237
1238	mulq	%r14
1239	movq	%rax,%r10
1240	movq	%rbx,%rax
1241	movq	%rdx,%r11
1242	movq	%r10,-24(%rdi,%rbp,1)
1243
1244	mulq	%r14
1245	addq	%rax,%r11
1246	movq	%rbx,%rax
1247	adcq	$0,%rdx
1248	movq	%r11,-16(%rdi,%rbp,1)
1249	movq	%rdx,%r10
1250
1251
1252	movq	-8(%rsi,%rbp,1),%rbx
1253	mulq	%r15
1254	movq	%rax,%r12
1255	movq	%rbx,%rax
1256	movq	%rdx,%r13
1257
1258	leaq	(%rbp),%rcx
1259	mulq	%r14
1260	addq	%rax,%r10
1261	movq	%rbx,%rax
1262	movq	%rdx,%r11
1263	adcq	$0,%r11
1264	addq	%r12,%r10
1265	adcq	$0,%r11
1266	movq	%r10,-8(%rdi,%rcx,1)
1267	jmp	.Lsqr4x_1st
1268
1269.align	32
1270.Lsqr4x_1st:
1271	movq	(%rsi,%rcx,1),%rbx
1272	mulq	%r15
1273	addq	%rax,%r13
1274	movq	%rbx,%rax
1275	movq	%rdx,%r12
1276	adcq	$0,%r12
1277
1278	mulq	%r14
1279	addq	%rax,%r11
1280	movq	%rbx,%rax
1281	movq	8(%rsi,%rcx,1),%rbx
1282	movq	%rdx,%r10
1283	adcq	$0,%r10
1284	addq	%r13,%r11
1285	adcq	$0,%r10
1286
1287
1288	mulq	%r15
1289	addq	%rax,%r12
1290	movq	%rbx,%rax
1291	movq	%r11,(%rdi,%rcx,1)
1292	movq	%rdx,%r13
1293	adcq	$0,%r13
1294
1295	mulq	%r14
1296	addq	%rax,%r10
1297	movq	%rbx,%rax
1298	movq	16(%rsi,%rcx,1),%rbx
1299	movq	%rdx,%r11
1300	adcq	$0,%r11
1301	addq	%r12,%r10
1302	adcq	$0,%r11
1303
1304	mulq	%r15
1305	addq	%rax,%r13
1306	movq	%rbx,%rax
1307	movq	%r10,8(%rdi,%rcx,1)
1308	movq	%rdx,%r12
1309	adcq	$0,%r12
1310
1311	mulq	%r14
1312	addq	%rax,%r11
1313	movq	%rbx,%rax
1314	movq	24(%rsi,%rcx,1),%rbx
1315	movq	%rdx,%r10
1316	adcq	$0,%r10
1317	addq	%r13,%r11
1318	adcq	$0,%r10
1319
1320
1321	mulq	%r15
1322	addq	%rax,%r12
1323	movq	%rbx,%rax
1324	movq	%r11,16(%rdi,%rcx,1)
1325	movq	%rdx,%r13
1326	adcq	$0,%r13
1327	leaq	32(%rcx),%rcx
1328
1329	mulq	%r14
1330	addq	%rax,%r10
1331	movq	%rbx,%rax
1332	movq	%rdx,%r11
1333	adcq	$0,%r11
1334	addq	%r12,%r10
1335	adcq	$0,%r11
1336	movq	%r10,-8(%rdi,%rcx,1)
1337
1338	cmpq	$0,%rcx
1339	jne	.Lsqr4x_1st
1340
1341	mulq	%r15
1342	addq	%rax,%r13
1343	leaq	16(%rbp),%rbp
1344	adcq	$0,%rdx
1345	addq	%r11,%r13
1346	adcq	$0,%rdx
1347
1348	movq	%r13,(%rdi)
1349	movq	%rdx,%r12
1350	movq	%rdx,8(%rdi)
1351	jmp	.Lsqr4x_outer
1352
1353.align	32
1354.Lsqr4x_outer:
1355	movq	-32(%rsi,%rbp,1),%r14
1356	leaq	48+8(%rsp,%r9,2),%rdi
1357	movq	-24(%rsi,%rbp,1),%rax
1358	leaq	-32(%rdi,%rbp,1),%rdi
1359	movq	-16(%rsi,%rbp,1),%rbx
1360	movq	%rax,%r15
1361
1362	mulq	%r14
1363	movq	-24(%rdi,%rbp,1),%r10
1364	addq	%rax,%r10
1365	movq	%rbx,%rax
1366	adcq	$0,%rdx
1367	movq	%r10,-24(%rdi,%rbp,1)
1368	movq	%rdx,%r11
1369
1370	mulq	%r14
1371	addq	%rax,%r11
1372	movq	%rbx,%rax
1373	adcq	$0,%rdx
1374	addq	-16(%rdi,%rbp,1),%r11
1375	movq	%rdx,%r10
1376	adcq	$0,%r10
1377	movq	%r11,-16(%rdi,%rbp,1)
1378
1379	xorq	%r12,%r12
1380
1381	movq	-8(%rsi,%rbp,1),%rbx
1382	mulq	%r15
1383	addq	%rax,%r12
1384	movq	%rbx,%rax
1385	adcq	$0,%rdx
1386	addq	-8(%rdi,%rbp,1),%r12
1387	movq	%rdx,%r13
1388	adcq	$0,%r13
1389
1390	mulq	%r14
1391	addq	%rax,%r10
1392	movq	%rbx,%rax
1393	adcq	$0,%rdx
1394	addq	%r12,%r10
1395	movq	%rdx,%r11
1396	adcq	$0,%r11
1397	movq	%r10,-8(%rdi,%rbp,1)
1398
1399	leaq	(%rbp),%rcx
1400	jmp	.Lsqr4x_inner
1401
1402.align	32
1403.Lsqr4x_inner:
1404	movq	(%rsi,%rcx,1),%rbx
1405	mulq	%r15
1406	addq	%rax,%r13
1407	movq	%rbx,%rax
1408	movq	%rdx,%r12
1409	adcq	$0,%r12
1410	addq	(%rdi,%rcx,1),%r13
1411	adcq	$0,%r12
1412
1413.byte	0x67
1414	mulq	%r14
1415	addq	%rax,%r11
1416	movq	%rbx,%rax
1417	movq	8(%rsi,%rcx,1),%rbx
1418	movq	%rdx,%r10
1419	adcq	$0,%r10
1420	addq	%r13,%r11
1421	adcq	$0,%r10
1422
1423	mulq	%r15
1424	addq	%rax,%r12
1425	movq	%r11,(%rdi,%rcx,1)
1426	movq	%rbx,%rax
1427	movq	%rdx,%r13
1428	adcq	$0,%r13
1429	addq	8(%rdi,%rcx,1),%r12
1430	leaq	16(%rcx),%rcx
1431	adcq	$0,%r13
1432
1433	mulq	%r14
1434	addq	%rax,%r10
1435	movq	%rbx,%rax
1436	adcq	$0,%rdx
1437	addq	%r12,%r10
1438	movq	%rdx,%r11
1439	adcq	$0,%r11
1440	movq	%r10,-8(%rdi,%rcx,1)
1441
1442	cmpq	$0,%rcx
1443	jne	.Lsqr4x_inner
1444
1445.byte	0x67
1446	mulq	%r15
1447	addq	%rax,%r13
1448	adcq	$0,%rdx
1449	addq	%r11,%r13
1450	adcq	$0,%rdx
1451
1452	movq	%r13,(%rdi)
1453	movq	%rdx,%r12
1454	movq	%rdx,8(%rdi)
1455
1456	addq	$16,%rbp
1457	jnz	.Lsqr4x_outer
1458
1459
1460	movq	-32(%rsi),%r14
1461	leaq	48+8(%rsp,%r9,2),%rdi
1462	movq	-24(%rsi),%rax
1463	leaq	-32(%rdi,%rbp,1),%rdi
1464	movq	-16(%rsi),%rbx
1465	movq	%rax,%r15
1466
1467	mulq	%r14
1468	addq	%rax,%r10
1469	movq	%rbx,%rax
1470	movq	%rdx,%r11
1471	adcq	$0,%r11
1472
1473	mulq	%r14
1474	addq	%rax,%r11
1475	movq	%rbx,%rax
1476	movq	%r10,-24(%rdi)
1477	movq	%rdx,%r10
1478	adcq	$0,%r10
1479	addq	%r13,%r11
1480	movq	-8(%rsi),%rbx
1481	adcq	$0,%r10
1482
1483	mulq	%r15
1484	addq	%rax,%r12
1485	movq	%rbx,%rax
1486	movq	%r11,-16(%rdi)
1487	movq	%rdx,%r13
1488	adcq	$0,%r13
1489
1490	mulq	%r14
1491	addq	%rax,%r10
1492	movq	%rbx,%rax
1493	movq	%rdx,%r11
1494	adcq	$0,%r11
1495	addq	%r12,%r10
1496	adcq	$0,%r11
1497	movq	%r10,-8(%rdi)
1498
1499	mulq	%r15
1500	addq	%rax,%r13
1501	movq	-16(%rsi),%rax
1502	adcq	$0,%rdx
1503	addq	%r11,%r13
1504	adcq	$0,%rdx
1505
1506	movq	%r13,(%rdi)
1507	movq	%rdx,%r12
1508	movq	%rdx,8(%rdi)
1509
1510	mulq	%rbx
1511	addq	$16,%rbp
1512	xorq	%r14,%r14
1513	subq	%r9,%rbp
1514	xorq	%r15,%r15
1515
1516	addq	%r12,%rax
1517	adcq	$0,%rdx
1518	movq	%rax,8(%rdi)
1519	movq	%rdx,16(%rdi)
1520	movq	%r15,24(%rdi)
1521
1522	movq	-16(%rsi,%rbp,1),%rax
1523	leaq	48+8(%rsp),%rdi
1524	xorq	%r10,%r10
1525	movq	8(%rdi),%r11
1526
1527	leaq	(%r14,%r10,2),%r12
1528	shrq	$63,%r10
1529	leaq	(%rcx,%r11,2),%r13
1530	shrq	$63,%r11
1531	orq	%r10,%r13
1532	movq	16(%rdi),%r10
1533	movq	%r11,%r14
1534	mulq	%rax
1535	negq	%r15
1536	movq	24(%rdi),%r11
1537	adcq	%rax,%r12
1538	movq	-8(%rsi,%rbp,1),%rax
1539	movq	%r12,(%rdi)
1540	adcq	%rdx,%r13
1541
1542	leaq	(%r14,%r10,2),%rbx
1543	movq	%r13,8(%rdi)
1544	sbbq	%r15,%r15
1545	shrq	$63,%r10
1546	leaq	(%rcx,%r11,2),%r8
1547	shrq	$63,%r11
1548	orq	%r10,%r8
1549	movq	32(%rdi),%r10
1550	movq	%r11,%r14
1551	mulq	%rax
1552	negq	%r15
1553	movq	40(%rdi),%r11
1554	adcq	%rax,%rbx
1555	movq	0(%rsi,%rbp,1),%rax
1556	movq	%rbx,16(%rdi)
1557	adcq	%rdx,%r8
1558	leaq	16(%rbp),%rbp
1559	movq	%r8,24(%rdi)
1560	sbbq	%r15,%r15
1561	leaq	64(%rdi),%rdi
1562	jmp	.Lsqr4x_shift_n_add
1563
1564.align	32
1565.Lsqr4x_shift_n_add:
1566	leaq	(%r14,%r10,2),%r12
1567	shrq	$63,%r10
1568	leaq	(%rcx,%r11,2),%r13
1569	shrq	$63,%r11
1570	orq	%r10,%r13
1571	movq	-16(%rdi),%r10
1572	movq	%r11,%r14
1573	mulq	%rax
1574	negq	%r15
1575	movq	-8(%rdi),%r11
1576	adcq	%rax,%r12
1577	movq	-8(%rsi,%rbp,1),%rax
1578	movq	%r12,-32(%rdi)
1579	adcq	%rdx,%r13
1580
1581	leaq	(%r14,%r10,2),%rbx
1582	movq	%r13,-24(%rdi)
1583	sbbq	%r15,%r15
1584	shrq	$63,%r10
1585	leaq	(%rcx,%r11,2),%r8
1586	shrq	$63,%r11
1587	orq	%r10,%r8
1588	movq	0(%rdi),%r10
1589	movq	%r11,%r14
1590	mulq	%rax
1591	negq	%r15
1592	movq	8(%rdi),%r11
1593	adcq	%rax,%rbx
1594	movq	0(%rsi,%rbp,1),%rax
1595	movq	%rbx,-16(%rdi)
1596	adcq	%rdx,%r8
1597
1598	leaq	(%r14,%r10,2),%r12
1599	movq	%r8,-8(%rdi)
1600	sbbq	%r15,%r15
1601	shrq	$63,%r10
1602	leaq	(%rcx,%r11,2),%r13
1603	shrq	$63,%r11
1604	orq	%r10,%r13
1605	movq	16(%rdi),%r10
1606	movq	%r11,%r14
1607	mulq	%rax
1608	negq	%r15
1609	movq	24(%rdi),%r11
1610	adcq	%rax,%r12
1611	movq	8(%rsi,%rbp,1),%rax
1612	movq	%r12,0(%rdi)
1613	adcq	%rdx,%r13
1614
1615	leaq	(%r14,%r10,2),%rbx
1616	movq	%r13,8(%rdi)
1617	sbbq	%r15,%r15
1618	shrq	$63,%r10
1619	leaq	(%rcx,%r11,2),%r8
1620	shrq	$63,%r11
1621	orq	%r10,%r8
1622	movq	32(%rdi),%r10
1623	movq	%r11,%r14
1624	mulq	%rax
1625	negq	%r15
1626	movq	40(%rdi),%r11
1627	adcq	%rax,%rbx
1628	movq	16(%rsi,%rbp,1),%rax
1629	movq	%rbx,16(%rdi)
1630	adcq	%rdx,%r8
1631	movq	%r8,24(%rdi)
1632	sbbq	%r15,%r15
1633	leaq	64(%rdi),%rdi
1634	addq	$32,%rbp
1635	jnz	.Lsqr4x_shift_n_add
1636
1637	leaq	(%r14,%r10,2),%r12
1638.byte	0x67
1639	shrq	$63,%r10
1640	leaq	(%rcx,%r11,2),%r13
1641	shrq	$63,%r11
1642	orq	%r10,%r13
1643	movq	-16(%rdi),%r10
1644	movq	%r11,%r14
1645	mulq	%rax
1646	negq	%r15
1647	movq	-8(%rdi),%r11
1648	adcq	%rax,%r12
1649	movq	-8(%rsi),%rax
1650	movq	%r12,-32(%rdi)
1651	adcq	%rdx,%r13
1652
1653	leaq	(%r14,%r10,2),%rbx
1654	movq	%r13,-24(%rdi)
1655	sbbq	%r15,%r15
1656	shrq	$63,%r10
1657	leaq	(%rcx,%r11,2),%r8
1658	shrq	$63,%r11
1659	orq	%r10,%r8
1660	mulq	%rax
1661	negq	%r15
1662	adcq	%rax,%rbx
1663	adcq	%rdx,%r8
1664	movq	%rbx,-16(%rdi)
1665	movq	%r8,-8(%rdi)
1666.byte	102,72,15,126,213
# __bn_sqr8x_reduction — Montgomery reduction of the 2*num-word value in
# the stack temporary t[].  bn_sqr8x_internal falls through into this
# label; it is also call'ed directly (see bn_from_mont8x below).
#
# Register/stack contract (inferred from this chunk — confirm against the
# generating script):
#   %rbp       = modulus n[0..num-1]
#   %r9        = num*8, modulus length in bytes (negated below)
#   32+8(%rsp) = n0, i.e. -n[0]^-1 mod 2^64
#   48+8(%rsp) = t[], the 2*num-word value to reduce
# On return %rdi points just past the reduced words and %rax holds the
# final top-word carry.  All loops run fixed iteration counts with no
# secret-dependent branches (constant-time).
__bn_sqr8x_reduction:
	xorq	%rax,%rax			# rax = running top-word carry
	leaq	(%r9,%rbp,1),%rcx		# rcx = &n[num] (end of modulus)
	leaq	48+8(%rsp,%r9,2),%rdx		# rdx = &t[2*num] (end of t[])
	movq	%rcx,0+8(%rsp)			# save end-of-modulus sentinel
	leaq	48+8(%rsp,%r9,1),%rdi		# rdi = &t[num]
	movq	%rdx,8+8(%rsp)			# save end-of-t[] sentinel
	negq	%r9				# r9 = -num*8
	jmp	.L8x_reduction_loop

.align	32
.L8x_reduction_loop:
# Load the next 8-word window of t[] into rbx,r9..r15.
	leaq	(%rdi,%r9,1),%rdi
.byte	0x66
	movq	0(%rdi),%rbx
	movq	8(%rdi),%r9
	movq	16(%rdi),%r10
	movq	24(%rdi),%r11
	movq	32(%rdi),%r12
	movq	40(%rdi),%r13
	movq	48(%rdi),%r14
	movq	56(%rdi),%r15
	movq	%rax,(%rdx)			# flush previous top-word carry
	leaq	64(%rdi),%rdi

.byte	0x67
	movq	%rbx,%r8
	imulq	32+8(%rsp),%rbx			# rbx = m = t[0]*n0 mod 2^64
	movq	0(%rbp),%rax
	movl	$8,%ecx				# 8 inner iterations
	jmp	.L8x_reduce

.align	32
.L8x_reduce:
# One Montgomery step: fold m*n[0..7] into the window so the low word
# cancels; the eight m values are stashed on the stack for .L8x_tail.
	mulq	%rbx
	movq	8(%rbp),%rax
	negq	%r8				# r8 + m*n[0] == 0 mod 2^64; keep carry
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	movq	%rbx,48-8+8(%rsp,%rcx,8)	# save m for the tail pass
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	32+8(%rsp),%rsi			# rsi = n0 (for next m)
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	imulq	%r8,%rsi			# rsi = next m = r8*n0 mod 2^64
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rbp),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	%rsi,%rbx			# rbx = next m
	addq	%rax,%r15
	movq	0(%rbp),%rax
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	.L8x_reduce

	leaq	64(%rbp),%rbp			# next 8 modulus words
	xorq	%rax,%rax
	movq	8+8(%rsp),%rdx			# rdx = end-of-t[] sentinel
	cmpq	0+8(%rsp),%rbp			# modulus exhausted?
	jae	.L8x_no_tail

# Absorb the next 8 words of t[] before walking the rest of the modulus.
.byte	0x66
	addq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi			# rsi = -(carry), saved across loop

	movq	48+56+8(%rsp),%rbx		# rbx = saved m[7]
	movl	$8,%ecx
	movq	0(%rbp),%rax
	jmp	.L8x_tail

.align	32
.L8x_tail:
# Multiply the remaining modulus words by the eight saved m values,
# retiring one completed result word through %rdi per iteration.
	mulq	%rbx
	addq	%rax,%r8
	movq	8(%rbp),%rax
	movq	%r8,(%rdi)
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	leaq	8(%rdi),%rdi
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rbp),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	48-16+8(%rsp,%rcx,8),%rbx	# next saved m value
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	0(%rbp),%rax
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	.L8x_tail

	leaq	64(%rbp),%rbp
	movq	8+8(%rsp),%rdx
	cmpq	0+8(%rsp),%rbp			# more modulus to walk?
	jae	.L8x_tail_done

	movq	48+56+8(%rsp),%rbx		# reload m[7]
	negq	%rsi				# restore saved carry into CF
	movq	0(%rbp),%rax
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi			# save carry again

	movl	$8,%ecx
	jmp	.L8x_tail

.align	32
.L8x_tail_done:
	addq	(%rdx),%r8			# fold stored top-word carry back in
	adcq	$0,%r9
	adcq	$0,%r10
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15


	xorq	%rax,%rax

	negq	%rsi				# restore saved carry into CF
.L8x_no_tail:
# Add the top 8 words of t[] and record the final carry in %rax.
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	adcq	$0,%rax				# rax = carry out of this pass
	movq	-8(%rbp),%rcx			# rcx = n[num-1]
	xorq	%rsi,%rsi

.byte	102,72,15,126,213			# movq %xmm2,%rbp — restore modulus ptr

	movq	%r8,0(%rdi)
	movq	%r9,8(%rdi)
.byte	102,73,15,126,217			# movq %xmm3,%r9 — restore length
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)
	leaq	64(%rdi),%rdi

	cmpq	%rdx,%rdi			# more of t[] left to reduce?
	jb	.L8x_reduction_loop
	.byte	0xf3,0xc3			# repz ret
.size	bn_sqr8x_internal,.-bn_sqr8x_internal
.type	__bn_post4x_internal,@function
.align	32
# __bn_post4x_internal — final conditional subtraction of the modulus.
# In: %rbp = modulus n[], %rdi = source window, %r9 = num*8,
#     %rax = top carry from the reduction, %xmm1 = rp (output pointer).
# Computes rp[] = t[] - (n[] & mask) branchlessly, 4 words per group:
# the mask (-carry) selects the subtraction without any data-dependent
# branch, keeping the routine constant-time.
__bn_post4x_internal:
	movq	0(%rbp),%r12
	leaq	(%rdi,%r9,1),%rbx		# rbx = t[] read cursor
	movq	%r9,%rcx
.byte	102,72,15,126,207			# movq %xmm1,%rdi — rdi = rp
	negq	%rax				# rax = 0 or -1 subtraction mask
.byte	102,72,15,126,206			# movq %xmm1,%rsi
	sarq	$3+2,%rcx			# rcx = -(num/4): group counter
	decq	%r12				# ~(n[0]-1) == -n[0]: compensates CF=0 below
	xorq	%r10,%r10			# r10 = incoming borrow (none yet)
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqr4x_sub_entry

.align	16
.Lsqr4x_sub:
	movq	0(%rbp),%r12
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
.Lsqr4x_sub_entry:
	leaq	32(%rbp),%rbp
	notq	%r12
	notq	%r13
	notq	%r14
	notq	%r15
	andq	%rax,%r12			# either ~n[i] (mask=-1) or 0 (mask=0)
	andq	%rax,%r13
	andq	%rax,%r14
	andq	%rax,%r15

	negq	%r10				# set CF from saved borrow
	adcq	0(%rbx),%r12			# t[i] + ~n[i] + CF == t[i] - n[i]
	adcq	8(%rbx),%r13
	adcq	16(%rbx),%r14
	adcq	24(%rbx),%r15
	movq	%r12,0(%rdi)
	leaq	32(%rbx),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r10,%r10			# save borrow for next group
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi

	incq	%rcx
	jnz	.Lsqr4x_sub

	movq	%r9,%r10
	negq	%r9				# restore r9 = -num*8 for caller
	.byte	0xf3,0xc3			# repz ret
.size	__bn_post4x_internal,.-__bn_post4x_internal
.globl	bn_from_montgomery
.type	bn_from_montgomery,@function
.align	32
# bn_from_montgomery — convert out of Montgomery form.  Argument layout
# follows the other bn_*mont* entries in this file (%r9d = num).  Only
# num divisible by 8 is handled here: that case tail-jumps into
# bn_from_mont8x; otherwise return 0 so the caller can fall back to a
# generic code path.
bn_from_montgomery:
	testl	$7,%r9d				# num % 8 == 0 ?
	jz	bn_from_mont8x			# yes: take the dedicated 8x path
	xorl	%eax,%eax			# no: return 0 ("not handled")
	.byte	0xf3,0xc3			# repz ret
.size	bn_from_montgomery,.-bn_from_montgomery
1991
.type	bn_from_mont8x,@function
.align	32
# bn_from_mont8x — out-of-Montgomery conversion for num % 8 == 0.
# Copies ap[] into a stack temporary padded with num zero words, then
# performs one Montgomery reduction plus final conditional subtraction.
# Args (SysV, matching bn_from_montgomery above — confirm against header):
#   %rdi=rp, %rsi=ap, %rdx=bp (unused here), %rcx=np, %r8=&n0, %r9d=num.
# Returns 1 in %rax.
bn_from_mont8x:
.byte	0x67
	movq	%rsp,%rax			# rax = original rsp (for epilogue)
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
.Lfrom_prologue:

	shll	$3,%r9d				# r9 = num*8 (byte length)
	leaq	(%r9,%r9,2),%r10		# r10 = num*24
	negq	%r9
	movq	(%r8),%r8			# r8 = n0 value

# Place a frame of 2*num*8+320 bytes so that its distance from rp modulo
# 4096 avoids aliasing the output (cache/TLB conflict avoidance).






	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lfrom_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lfrom_sp_done

.align	32
.Lfrom_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lfrom_sp_done:
	andq	$-64,%rbp			# 64-byte align the frame base
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10			# probe the first page
	cmpq	%rbp,%rsp
	ja	.Lfrom_page_walk
	jmp	.Lfrom_page_walk_done

# Touch every page down to the new rsp so the guard page grows one page
# at a time (a large one-shot drop could skip past it).
.Lfrom_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lfrom_page_walk
.Lfrom_page_walk_done:

	movq	%r9,%r10			# r10 = -num*8 (saved for reduction)
	negq	%r9				# r9 = num*8 (positive again)








	movq	%r8,32(%rsp)			# frame: save n0
	movq	%rax,40(%rsp)			# frame: save original rsp
.Lfrom_body:
	movq	%r9,%r11			# r11 = bytes of ap left to copy
	leaq	48(%rsp),%rax			# rax = t[]
	pxor	%xmm0,%xmm0
	jmp	.Lmul_by_1

.align	32
# Copy ap[] into the low half of t[] while zeroing the mirrored slot in
# the high half: t[] becomes ap followed by num zero words ("mul by 1").
.Lmul_by_1:
	movdqu	(%rsi),%xmm1
	movdqu	16(%rsi),%xmm2
	movdqu	32(%rsi),%xmm3
	movdqa	%xmm0,(%rax,%r9,1)		# zero the high half
	movdqu	48(%rsi),%xmm4
	movdqa	%xmm0,16(%rax,%r9,1)
.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00	# leaq 0x40(%rsi),%rsi
	movdqa	%xmm1,(%rax)
	movdqa	%xmm0,32(%rax,%r9,1)
	movdqa	%xmm2,16(%rax)
	movdqa	%xmm0,48(%rax,%r9,1)
	movdqa	%xmm3,32(%rax)
	movdqa	%xmm4,48(%rax)
	leaq	64(%rax),%rax
	subq	$64,%r11
	jnz	.Lmul_by_1

.byte	102,72,15,110,207			# movq %rdi,%xmm1 — save rp
.byte	102,72,15,110,209			# movq %rcx,%xmm2 — save np
.byte	0x67
	movq	%rcx,%rbp			# rbp = modulus for the reduction
.byte	102,73,15,110,218			# movq %r10,%xmm3 — save -num*8
	call	__bn_sqr8x_reduction
	call	__bn_post4x_internal

	pxor	%xmm0,%xmm0
	leaq	48(%rsp),%rax
	movq	40(%rsp),%rsi			# rsi = saved original rsp
	jmp	.Lfrom_mont_zero

.align	32
# Wipe the stack temporary before releasing the frame (it held data
# derived from the operand).
.Lfrom_mont_zero:
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm0,16(%rax)
	movdqa	%xmm0,32(%rax)
	movdqa	%xmm0,48(%rax)
	leaq	64(%rax),%rax
	subq	$32,%r9				# 64 B/iter over 2*num words total
	jnz	.Lfrom_mont_zero

	movq	$1,%rax				# return 1: handled
	movq	-48(%rsi),%r15			# restore callee-saved registers
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp			# restore caller's stack pointer
.Lfrom_epilogue:
	.byte	0xf3,0xc3			# repz ret
.size	bn_from_mont8x,.-bn_from_mont8x
.globl	bn_get_bits5
.type	bn_get_bits5,@function
.align	16
# bn_get_bits5(ap=%rdi, off=%esi) — return in %eax the 5-bit window that
# starts at bit "off" of the byte array ap.  Reads a 16-bit word; when
# the window would spill past its top (bit offset > 11) the base is
# shifted by one byte instead, using cmov so there is no data-dependent
# branch.
bn_get_bits5:
	leaq	0(%rdi),%r10			# r10 = ap
	leaq	1(%rdi),%r11			# r11 = ap+1 (byte-shifted base)
	movl	%esi,%ecx
	shrl	$4,%esi				# esi = index of 16-bit word
	andl	$15,%ecx			# ecx = bit offset within that word
	leal	-8(%rcx),%eax			# eax = offset if base moves 1 byte
	cmpl	$11,%ecx
	cmovaq	%r11,%r10			# offset > 11: use shifted base
	cmoval	%eax,%ecx			#  ... and the adjusted offset
	movzwl	(%r10,%rsi,2),%eax
	shrl	%cl,%eax
	andl	$31,%eax			# keep the 5 bits
	.byte	0xf3,0xc3			# repz ret
.size	bn_get_bits5,.-bn_get_bits5
2144
.globl	bn_scatter5
.type	bn_scatter5,@function
.align	16
# bn_scatter5(inp=%rdi, num=%esi, tbl=%rdx, idx=%rcx) — store the num
# 64-bit words of inp into column idx of the power table: word i goes to
# tbl + idx*8 + i*256, the interleaved layout that bn_gather5 and the
# gather code at the top of this file read back.  num == 0 is a no-op.
bn_scatter5:
	cmpl	$0,%esi
	jz	.Lscatter_epilogue		# nothing to store
	leaq	(%rdx,%rcx,8),%rdx		# rdx = &tbl[idx]
.Lscatter:
	movq	(%rdi),%rax
	leaq	8(%rdi),%rdi
	movq	%rax,(%rdx)
	leaq	256(%rdx),%rdx			# next row (32 entries * 8 bytes)
	subl	$1,%esi
	jnz	.Lscatter
.Lscatter_epilogue:
	.byte	0xf3,0xc3			# repz ret
.size	bn_scatter5,.-bn_scatter5
2162
.globl	bn_gather5
.type	bn_gather5,@function
.align	32
# bn_gather5(out=%rdi, num=%esi, tbl=%rdx, idx=%ecx) — constant-time
# gather: copy table entry idx (0..31) into out[0..num-1].  For every
# output word it reads ALL 32 words of the table row and masks them, so
# the memory access pattern is independent of the secret index
# (cache-timing defence).
# NOTE(review): unlike bn_scatter5 there is no num == 0 guard; callers
# appear to always pass num > 0 — confirm.
bn_gather5:
.LSEH_begin_bn_gather5:

.byte	0x4c,0x8d,0x14,0x24			# leaq (%rsp),%r10 — save rsp
.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00	# subq $0x108,%rsp — mask buffer
	leaq	.Linc(%rip),%rax
	andq	$-16,%rsp			# 16-byte align for movdqa stores

	movd	%ecx,%xmm5
	movdqa	0(%rax),%xmm0			# {0,0, 1,1}
	movdqa	16(%rax),%xmm1			# {2,2, 2,2} increment
	leaq	128(%rdx),%r11			# r11 = tbl, biased by 128
	leaq	128(%rsp),%rax			# rax = mask area, biased by 128

	pshufd	$0,%xmm5,%xmm5			# broadcast idx to all dword lanes
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
# Build sixteen 128-bit masks at -128(%rax)..112(%rax): mask i has its
# low/high 64-bit half all-ones iff idx == 2i / 2i+1.  The add/compare
# chains are interleaved across registers to hide latency.
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,-128(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,-112(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,-96(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,-80(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,-64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,-48(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,-32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,-16(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,16(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,48(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,80(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,96(%rax)
	movdqa	%xmm4,%xmm2
	movdqa	%xmm3,112(%rax)
	jmp	.Lgather

.align	32
# Per output word: AND each of the 16 row vectors with its mask and OR
# everything together; exactly one mask is non-zero, so the surviving
# 64-bit half is the selected entry (folded down via pshufd).
.Lgather:
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	movdqa	-128(%r11),%xmm0
	movdqa	-112(%r11),%xmm1
	movdqa	-96(%r11),%xmm2
	pand	-128(%rax),%xmm0
	movdqa	-80(%r11),%xmm3
	pand	-112(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r11),%xmm0
	movdqa	-48(%r11),%xmm1
	movdqa	-32(%r11),%xmm2
	pand	-64(%rax),%xmm0
	movdqa	-16(%r11),%xmm3
	pand	-48(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r11),%xmm0
	movdqa	16(%r11),%xmm1
	movdqa	32(%r11),%xmm2
	pand	0(%rax),%xmm0
	movdqa	48(%r11),%xmm3
	pand	16(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r11),%xmm0
	movdqa	80(%r11),%xmm1
	movdqa	96(%r11),%xmm2
	pand	64(%rax),%xmm0
	movdqa	112(%r11),%xmm3
	pand	80(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	leaq	256(%r11),%r11			# next table row
	pshufd	$0x4e,%xmm4,%xmm0		# swap halves
	por	%xmm4,%xmm0			# fold selected half into low 64
	movq	%xmm0,(%rdi)
	leaq	8(%rdi),%rdi
	subl	$1,%esi
	jnz	.Lgather

	leaq	(%r10),%rsp			# restore rsp saved at entry
	.byte	0xf3,0xc3			# repz ret
.LSEH_end_bn_gather5:
.size	bn_gather5,.-bn_gather5
.align	64
# Increment vectors used when enumerating indices 0..31 to build the
# selection masks (bn_mul_mont_gather5 and bn_gather5 above).
.Linc:
.long	0,0, 1,1				# starting pair {0, 1}
.long	2,2, 2,2				# per-step increment
# ASCII attribution string: "Montgomery Multiplication with
# scatter/gather for x86_64, CRYPTOGAMS by <appro@openssl.org>"
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
2331