/* x86_64-mont5.S, revision 306195 */
/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/x86_64-mont5.S 306195 2016-09-22 14:57:48Z jkim $ */
/* Do not modify. This file is auto-generated from x86_64-mont5.pl. */
.text


#-----------------------------------------------------------------------
# int bn_mul_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap,
#                         const void *table, const BN_ULONG *np,
#                         const BN_ULONG *n0, int num, int power);
#
# ABI:  SysV AMD64.  rdi=rp, rsi=ap, rdx=table, rcx=np, r8=n0,
#       r9d=num (limb count), 8(%rsp)=power (gather index).
#
# Montgomery multiplication where the multiplier b is gathered from a
# 32-entry power table.  The gather reads ALL 32 table entries and
# masks them with SSE2 pcmpeqd-generated masks, so the memory access
# pattern does not depend on the secret index (cache-timing defense).
# When num is a multiple of 8, control transfers to the 4x (or MULX)
# code path; otherwise the scalar 1x loops below are used.
# NOTE(review): the scattered ".byte 0x67" bytes are deliberate
# padding/alignment prefixes emitted by the perlasm generator.
#-----------------------------------------------------------------------
.globl	bn_mul_mont_gather5
.type	bn_mul_mont_gather5,@function
.align	64
bn_mul_mont_gather5:
	movl	%r9d,%r9d
	movq	%rsp,%rax                # rax = original rsp, saved for epilogue
	testl	$7,%r9d
	jnz	.Lmul_enter              # num%8 != 0 -> scalar path
	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
	jmp	.Lmul4x_enter            # num%8 == 0 -> 4x/MULX path

.align	16
.Lmul_enter:
	movd	8(%rsp),%xmm5            # xmm5 = power (gather index)
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	negq	%r9
	movq	%rsp,%r11
	leaq	-280(%rsp,%r9,8),%r10    # carve num*8+280 bytes of scratch
	negq	%r9
	andq	$-1024,%r10              # align scratch to 1KB boundary

	# Walk the stack one page at a time down to the new rsp so that
	# each guard page is touched in order (avoids skipping the OS
	# stack guard when the allocation spans multiple pages).
	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

.Lmul_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11              # probe the page
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	leaq	.Linc(%rip),%r10
	movq	%rax,8(%rsp,%r9,8)       # stash original rsp above the tp[] vector
.Lmul_body:

	# Build 16 xmm masks (pcmpeqd against the requested index) so the
	# subsequent gather touches every table line regardless of power.
	leaq	128(%rdx),%r12           # r12 = &table[...] biased by 128
	movdqa	0(%r10),%xmm0            # .Linc: {0,1}
	movdqa	16(%r10),%xmm1           # {2,2}
	leaq	24-112(%rsp,%r9,8),%r10
	andq	$-16,%r10

	pshufd	$0,%xmm5,%xmm5           # broadcast index to all 4 dwords
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
.byte	0x67
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,112(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,128(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,144(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,160(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,176(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,192(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,208(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,224(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,240(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,256(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,272(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,288(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,304(%r10)

	paddd	%xmm2,%xmm3
.byte	0x67
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,320(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,336(%r10)
	pand	64(%r12),%xmm0

	pand	80(%r12),%xmm1
	pand	96(%r12),%xmm2
	movdqa	%xmm3,352(%r10)
	pand	112(%r12),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-128(%r12),%xmm4
	movdqa	-112(%r12),%xmm5
	movdqa	-96(%r12),%xmm2
	pand	112(%r10),%xmm4
	movdqa	-80(%r12),%xmm3
	pand	128(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	144(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	160(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-64(%r12),%xmm4
	movdqa	-48(%r12),%xmm5
	movdqa	-32(%r12),%xmm2
	pand	176(%r10),%xmm4
	movdqa	-16(%r12),%xmm3
	pand	192(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	208(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	224(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	0(%r12),%xmm4
	movdqa	16(%r12),%xmm5
	movdqa	32(%r12),%xmm2
	pand	240(%r10),%xmm4
	movdqa	48(%r12),%xmm3
	pand	256(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	272(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	288(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	por	%xmm1,%xmm0
	pshufd	$0x4e,%xmm0,%xmm1        # fold high qword into low
	por	%xmm1,%xmm0
	leaq	256(%r12),%r12
.byte	102,72,15,126,195                # movq %xmm0,%rbx: rbx = gathered b[0]

	movq	(%r8),%r8                # r8 = n0 (Montgomery constant)
	movq	(%rsi),%rax              # rax = ap[0]

	xorq	%r14,%r14                # r14 = outer index i
	xorq	%r15,%r15                # r15 = inner index j

	# First outer iteration: tp[] = ap[]*b[0] + m*np[] (Montgomery step)
	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp                # rbp = m = tp[0]*n0 mod 2^64
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10                # discard tp[0], keep carry
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.L1st_enter

.align	16
.L1st:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.L1st_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	1(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.L1st


	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r9,8)
	movq	%rdx,%r13
	movq	%r10,%r11

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)        # top carry word

	leaq	1(%r14),%r14
	jmp	.Louter
.align	16
.Louter:
	# Gather b[i] with the precomputed masks (again full-table scan).
	leaq	24+128(%rsp,%r9,8),%rdx
	andq	$-16,%rdx
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	movdqa	-128(%r12),%xmm0
	movdqa	-112(%r12),%xmm1
	movdqa	-96(%r12),%xmm2
	movdqa	-80(%r12),%xmm3
	pand	-128(%rdx),%xmm0
	pand	-112(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r12),%xmm0
	movdqa	-48(%r12),%xmm1
	movdqa	-32(%r12),%xmm2
	movdqa	-16(%r12),%xmm3
	pand	-64(%rdx),%xmm0
	pand	-48(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r12),%xmm0
	movdqa	16(%r12),%xmm1
	movdqa	32(%r12),%xmm2
	movdqa	48(%r12),%xmm3
	pand	0(%rdx),%xmm0
	pand	16(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r12),%xmm0
	movdqa	80(%r12),%xmm1
	movdqa	96(%r12),%xmm2
	movdqa	112(%r12),%xmm3
	pand	64(%rdx),%xmm0
	pand	80(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	pshufd	$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	leaq	256(%r12),%r12

	movq	(%rsi),%rax
.byte	102,72,15,126,195                # movq %xmm0,%rbx: rbx = b[i]

	xorq	%r15,%r15
	movq	%r8,%rbp
	movq	(%rsp),%r10

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp                # m = (tp[0]+ap[0]*b[i])*n0
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.Linner_enter

.align	16
.Linner:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.Linner_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.Linner

	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r9,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r9,8)
	movq	%rdx,%r13

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	cmpq	%r9,%r14
	jb	.Louter

	# Conditional final subtraction: rp = tp - np if tp >= np, else tp,
	# selected with masks (no secret-dependent branch).
	xorq	%r14,%r14
	movq	(%rsp),%rax
	leaq	(%rsp),%rsi
	movq	%r9,%r15
	jmp	.Lsub
.align	16
.Lsub:	sbbq	(%rcx,%r14,8),%rax
	movq	%rax,(%rdi,%r14,8)
	movq	8(%rsi,%r14,8),%rax
	leaq	1(%r14),%r14
	decq	%r15
	jnz	.Lsub

	sbbq	$0,%rax                  # rax = 0 if tp>=np, else all-ones
	xorq	%r14,%r14
	andq	%rax,%rsi                # select source: tp vs rp
	notq	%rax
	movq	%rdi,%rcx
	andq	%rax,%rcx
	movq	%r9,%r15
	orq	%rcx,%rsi
.align	16
.Lcopy:
	movq	(%rsi,%r14,8),%rax
	movq	%r14,(%rsp,%r14,8)       # wipe tp[] as we go
	movq	%rax,(%rdi,%r14,8)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	.Lcopy

	movq	8(%rsp,%r9,8),%rsi       # recover saved original rsp
	movq	$1,%rax                  # return 1

	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lmul_epilogue:
	.byte	0xf3,0xc3                # rep ret (AMD branch-predictor idiom)
.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
#-----------------------------------------------------------------------
# bn_mul4x_mont_gather5 — 4-limbs-at-a-time variant, reached from
# bn_mul_mont_gather5 when num%8==0.  Entered at .Lmul4x_enter with
# r11d = OPENSSL_ia32cap_P[2]; if BMI2/ADX (MULX path) is available
# ($0x80108 feature mask) it tail-jumps to .Lmulx4x_enter instead.
# Stack layout and register use follow the scalar variant; the real
# work is done by mul4x_internal.
#-----------------------------------------------------------------------
.type	bn_mul4x_mont_gather5,@function
.align	32
bn_mul4x_mont_gather5:
.byte	0x67
	movq	%rsp,%rax                # rax = original rsp for epilogue
.Lmul4x_enter:
	andl	$0x80108,%r11d           # BMI2|ADX|... feature bits
	cmpl	$0x80108,%r11d
	je	.Lmulx4x_enter           # prefer MULX/ADX implementation
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
.Lmul4x_prologue:

.byte	0x67
	shll	$3,%r9d                  # r9 = num*8 (byte count)
	leaq	(%r9,%r9,2),%r10         # r10 = num*24
	negq	%r9

	# Pick a scratch frame of 2*num*8+320 bytes, positioned so that it
	# does not alias the output rp modulo 4096 (cache-bank friendliness).
	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lmul4xsp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lmul4xsp_done

.align	32
.Lmul4xsp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lmul4xsp_done:
	andq	$-64,%rbp                # 64-byte align the frame
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmul4x_page_walk        # touch guard pages one at a time
	jmp	.Lmul4x_page_walk_done

.Lmul4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmul4x_page_walk
.Lmul4x_page_walk_done:

	negq	%r9

	movq	%rax,40(%rsp)            # save original rsp in the frame
.Lmul4x_body:

	call	mul4x_internal

	movq	40(%rsp),%rsi
	movq	$1,%rax                  # return 1

	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lmul4x_epilogue:
	.byte	0xf3,0xc3                # rep ret
.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5

#-----------------------------------------------------------------------
# mul4x_internal — inner worker for bn_mul4x_mont_gather5/bn_power5.
# In:  rsi=ap, rdx=table, rcx=np, r8=&n0, r9=num*8 (negated later),
#      rdi=rp, rax points at the caller frame (8(%rax)=power index).
# Processes four limbs per iteration; the multiplier words b[i] are
# gathered from the 32-entry table with full-table masked reads
# (constant-time).  Falls through into .Lsqr4x_sub_entry for the final
# conditional subtraction (defined elsewhere in this file).
# Clobbers essentially all GP registers; called with return rsp saved
# by the wrapper.
#-----------------------------------------------------------------------
.type	mul4x_internal,@function
.align	32
mul4x_internal:
	shlq	$5,%r9
	movd	8(%rax),%xmm5            # xmm5 = power (gather index)
	leaq	.Linc(%rip),%rax
	leaq	128(%rdx,%r9,1),%r13     # r13 = end-of-table sentinel
	shrq	$5,%r9
	movdqa	0(%rax),%xmm0            # {0,1}
	movdqa	16(%rax),%xmm1           # {2,2}
	leaq	88-112(%rsp,%r9,1),%r10  # mask storage area
	leaq	128(%rdx),%r12           # r12 = biased table pointer

	# Build the 16 selection masks via pcmpeqd against the index.
	pshufd	$0,%xmm5,%xmm5
	movdqa	%xmm1,%xmm4
.byte	0x67,0x67
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
.byte	0x67
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,112(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,128(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,144(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,160(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,176(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,192(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,208(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,224(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,240(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,256(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,272(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,288(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,304(%r10)

	paddd	%xmm2,%xmm3
.byte	0x67
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,320(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,336(%r10)
	pand	64(%r12),%xmm0

	pand	80(%r12),%xmm1
	pand	96(%r12),%xmm2
	movdqa	%xmm3,352(%r10)
	pand	112(%r12),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-128(%r12),%xmm4
	movdqa	-112(%r12),%xmm5
	movdqa	-96(%r12),%xmm2
	pand	112(%r10),%xmm4
	movdqa	-80(%r12),%xmm3
	pand	128(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	144(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	160(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-64(%r12),%xmm4
	movdqa	-48(%r12),%xmm5
	movdqa	-32(%r12),%xmm2
	pand	176(%r10),%xmm4
	movdqa	-16(%r12),%xmm3
	pand	192(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	208(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	224(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	0(%r12),%xmm4
	movdqa	16(%r12),%xmm5
	movdqa	32(%r12),%xmm2
	pand	240(%r10),%xmm4
	movdqa	48(%r12),%xmm3
	pand	256(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	272(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	288(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	por	%xmm1,%xmm0
	pshufd	$0x4e,%xmm0,%xmm1
	por	%xmm1,%xmm0
	leaq	256(%r12),%r12
.byte	102,72,15,126,195                # movq %xmm0,%rbx: rbx = b[0]

	movq	%r13,16+8(%rsp)          # save table-end sentinel
	movq	%rdi,56+8(%rsp)          # save rp

	movq	(%r8),%r8                # r8 = n0
	movq	(%rsi),%rax              # rax = ap[0]
	leaq	(%rsi,%r9,1),%rsi        # rsi = &ap[num]; indexed negatively
	negq	%r9

	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp                # m = ap[0]*b[0]*n0 mod 2^64
	leaq	64+8(%rsp),%r14          # r14 = tp[]
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%r9),%r15             # r15 = inner counter (negative bytes)
	leaq	32(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdi,(%r14)
	movq	%rdx,%r13
	jmp	.L1st4x

.align	32
.L1st4x:
	# Four limbs of ap[]*b[0] + m*np[] per round trip.
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx),%rax
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	0(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdi,(%r14)
	movq	%rdx,%r13

	addq	$32,%r15
	jnz	.L1st4x

	# Tail of the first pass (last four limbs, unrolled).
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx),%rax
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%r13

	leaq	(%rcx,%r9,1),%rcx        # rewind np

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi                  # rdi = top carry
	movq	%r13,-8(%r14)

	jmp	.Louter4x

.align	32
.Louter4x:
	# Gather b[i] (full-table masked read, constant-time).
	leaq	16+128(%r14),%rdx
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	movdqa	-128(%r12),%xmm0
	movdqa	-112(%r12),%xmm1
	movdqa	-96(%r12),%xmm2
	movdqa	-80(%r12),%xmm3
	pand	-128(%rdx),%xmm0
	pand	-112(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r12),%xmm0
	movdqa	-48(%r12),%xmm1
	movdqa	-32(%r12),%xmm2
	movdqa	-16(%r12),%xmm3
	pand	-64(%rdx),%xmm0
	pand	-48(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r12),%xmm0
	movdqa	16(%r12),%xmm1
	movdqa	32(%r12),%xmm2
	movdqa	48(%r12),%xmm3
	pand	0(%rdx),%xmm0
	pand	16(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r12),%xmm0
	movdqa	80(%r12),%xmm1
	movdqa	96(%r12),%xmm2
	movdqa	112(%r12),%xmm3
	pand	64(%rdx),%xmm0
	pand	80(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	pshufd	$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	leaq	256(%r12),%r12
.byte	102,72,15,126,195                # movq %xmm0,%rbx: rbx = b[i]

	movq	(%r14,%r9,1),%r10
	movq	%r8,%rbp
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp                # m for this outer round
	movq	%rdx,%r11
	movq	%rdi,(%r14)              # flush previous top carry

	leaq	(%r14,%r9,1),%r14        # rewind tp

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%r9),%r15
	leaq	32(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdx,%r13
	jmp	.Linner4x

.align	32
.Linner4x:
	# Four limbs of tp[] += ap[]*b[i] + m*np[] per round trip.
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx),%rax
	adcq	$0,%rdx
	addq	16(%r14),%r10
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-32(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx),%rax
	adcq	$0,%rdx
	addq	-8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	0(%rcx),%rax
	adcq	$0,%rdx
	addq	(%r14),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%rcx),%rcx
	adcq	$0,%rdx
	movq	%r13,-8(%r14)
	movq	%rdx,%r13

	addq	$32,%r15
	jnz	.Linner4x

	# Tail of the inner pass (last four limbs, unrolled).
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx),%rax
	adcq	$0,%rdx
	addq	16(%r14),%r10
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-32(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	%rbp,%rax
	movq	-8(%rcx),%rbp
	adcq	$0,%rdx
	addq	-8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%r13

	movq	%rdi,-16(%r14)
	leaq	(%rcx,%r9,1),%rcx        # rewind np

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%r14),%r13
	adcq	$0,%rdi
	movq	%r13,-8(%r14)

	cmpq	16+8(%rsp),%r12          # reached end of power table?
	jb	.Louter4x
	# Set up for the shared conditional-subtract tail and fall through
	# into .Lsqr4x_sub_entry (defined with the sqr8x code).
	xorq	%rax,%rax
	subq	%r13,%rbp
	adcq	%r15,%r15
	orq	%r15,%rdi
	subq	%rdi,%rax                # rax = borrow/select mask
	leaq	(%r14,%r9,1),%rbx        # rbx = tp
	movq	(%rcx),%r12
	leaq	(%rcx),%rbp              # rbp = np
	movq	%r9,%rcx
	sarq	$3+2,%rcx                # rcx = num/4 iterations
	movq	56+8(%rsp),%rdi          # rdi = rp
	decq	%r12
	xorq	%r10,%r10
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqr4x_sub_entry
.size	mul4x_internal,.-mul4x_internal
#-----------------------------------------------------------------------
# int bn_power5(BN_ULONG *rp, const BN_ULONG *ap, const void *table,
#               const BN_ULONG *np, const BN_ULONG *n0, int num,
#               int power);
#
# Computes rp = ap^(2^5) * table[power] mod np in Montgomery form:
# five back-to-back squarings (__bn_sqr8x_internal +
# __bn_post4x_internal) followed by one gathered multiplication
# (mul4x_internal).  Dispatches to the MULX/ADX variant
# (.Lpowerx5_enter) when the $0x80108 capability bits are all set.
#-----------------------------------------------------------------------
.globl	bn_power5
.type	bn_power5,@function
.align	32
bn_power5:
	movq	%rsp,%rax                # rax = original rsp for epilogue
	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
	andl	$0x80108,%r11d           # BMI2|ADX|... feature bits
	cmpl	$0x80108,%r11d
	je	.Lpowerx5_enter
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
.Lpower5_prologue:

	shll	$3,%r9d                  # r9 = num*8 (byte count)
	leal	(%r9,%r9,2),%r10d        # r10 = num*24
	negq	%r9
	movq	(%r8),%r8                # r8 = n0

	# Choose a 2*num*8+320-byte frame that avoids aliasing rp mod 4096
	# (same placement logic as the mul4x wrapper).
	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lpwr_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lpwr_sp_done

.align	32
.Lpwr_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lpwr_sp_done:
	andq	$-64,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lpwr_page_walk          # touch guard pages one at a time
	jmp	.Lpwr_page_walk_done

.Lpwr_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lpwr_page_walk
.Lpwr_page_walk_done:

	movq	%r9,%r10
	negq	%r9

	movq	%r8,32(%rsp)             # frame: 32=n0, 40=saved rsp
	movq	%rax,40(%rsp)
.Lpower5_body:
.byte	102,72,15,110,207                # movq %rdi,%xmm1  (save rp)
.byte	102,72,15,110,209                # movq %rcx,%xmm2  (save np)
.byte	102,73,15,110,218                # movq %r10,%xmm3  (save num*8)
.byte	102,72,15,110,226                # movq %rdx,%xmm4  (save table)

	# Five Montgomery squarings in place.
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal

.byte	102,72,15,126,209                # movq %xmm2,%rcx  (restore np)
.byte	102,72,15,126,226                # movq %xmm4,%rdx  (restore table)
	movq	%rsi,%rdi
	movq	40(%rsp),%rax
	leaq	32(%rsp),%r8

	call	mul4x_internal           # final gathered multiply

	movq	40(%rsp),%rsi
	movq	$1,%rax                  # return 1
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lpower5_epilogue:
	.byte	0xf3,0xc3                # rep ret
.size	bn_power5,.-bn_power5
1155.globl	bn_sqr8x_internal
1156.hidden	bn_sqr8x_internal
1157.type	bn_sqr8x_internal,@function
1158.align	32
1159bn_sqr8x_internal:
1160__bn_sqr8x_internal:
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234	leaq	32(%r10),%rbp
1235	leaq	(%rsi,%r9,1),%rsi
1236
1237	movq	%r9,%rcx
1238
1239
1240	movq	-32(%rsi,%rbp,1),%r14
1241	leaq	48+8(%rsp,%r9,2),%rdi
1242	movq	-24(%rsi,%rbp,1),%rax
1243	leaq	-32(%rdi,%rbp,1),%rdi
1244	movq	-16(%rsi,%rbp,1),%rbx
1245	movq	%rax,%r15
1246
1247	mulq	%r14
1248	movq	%rax,%r10
1249	movq	%rbx,%rax
1250	movq	%rdx,%r11
1251	movq	%r10,-24(%rdi,%rbp,1)
1252
1253	mulq	%r14
1254	addq	%rax,%r11
1255	movq	%rbx,%rax
1256	adcq	$0,%rdx
1257	movq	%r11,-16(%rdi,%rbp,1)
1258	movq	%rdx,%r10
1259
1260
1261	movq	-8(%rsi,%rbp,1),%rbx
1262	mulq	%r15
1263	movq	%rax,%r12
1264	movq	%rbx,%rax
1265	movq	%rdx,%r13
1266
1267	leaq	(%rbp),%rcx
1268	mulq	%r14
1269	addq	%rax,%r10
1270	movq	%rbx,%rax
1271	movq	%rdx,%r11
1272	adcq	$0,%r11
1273	addq	%r12,%r10
1274	adcq	$0,%r11
1275	movq	%r10,-8(%rdi,%rcx,1)
1276	jmp	.Lsqr4x_1st
1277
1278.align	32
1279.Lsqr4x_1st:
1280	movq	(%rsi,%rcx,1),%rbx
1281	mulq	%r15
1282	addq	%rax,%r13
1283	movq	%rbx,%rax
1284	movq	%rdx,%r12
1285	adcq	$0,%r12
1286
1287	mulq	%r14
1288	addq	%rax,%r11
1289	movq	%rbx,%rax
1290	movq	8(%rsi,%rcx,1),%rbx
1291	movq	%rdx,%r10
1292	adcq	$0,%r10
1293	addq	%r13,%r11
1294	adcq	$0,%r10
1295
1296
1297	mulq	%r15
1298	addq	%rax,%r12
1299	movq	%rbx,%rax
1300	movq	%r11,(%rdi,%rcx,1)
1301	movq	%rdx,%r13
1302	adcq	$0,%r13
1303
1304	mulq	%r14
1305	addq	%rax,%r10
1306	movq	%rbx,%rax
1307	movq	16(%rsi,%rcx,1),%rbx
1308	movq	%rdx,%r11
1309	adcq	$0,%r11
1310	addq	%r12,%r10
1311	adcq	$0,%r11
1312
1313	mulq	%r15
1314	addq	%rax,%r13
1315	movq	%rbx,%rax
1316	movq	%r10,8(%rdi,%rcx,1)
1317	movq	%rdx,%r12
1318	adcq	$0,%r12
1319
1320	mulq	%r14
1321	addq	%rax,%r11
1322	movq	%rbx,%rax
1323	movq	24(%rsi,%rcx,1),%rbx
1324	movq	%rdx,%r10
1325	adcq	$0,%r10
1326	addq	%r13,%r11
1327	adcq	$0,%r10
1328
1329
1330	mulq	%r15
1331	addq	%rax,%r12
1332	movq	%rbx,%rax
1333	movq	%r11,16(%rdi,%rcx,1)
1334	movq	%rdx,%r13
1335	adcq	$0,%r13
1336	leaq	32(%rcx),%rcx
1337
1338	mulq	%r14
1339	addq	%rax,%r10
1340	movq	%rbx,%rax
1341	movq	%rdx,%r11
1342	adcq	$0,%r11
1343	addq	%r12,%r10
1344	adcq	$0,%r11
1345	movq	%r10,-8(%rdi,%rcx,1)
1346
1347	cmpq	$0,%rcx
1348	jne	.Lsqr4x_1st
1349
1350	mulq	%r15
1351	addq	%rax,%r13
1352	leaq	16(%rbp),%rbp
1353	adcq	$0,%rdx
1354	addq	%r11,%r13
1355	adcq	$0,%rdx
1356
1357	movq	%r13,(%rdi)
1358	movq	%rdx,%r12
1359	movq	%rdx,8(%rdi)
1360	jmp	.Lsqr4x_outer
1361
1362.align	32
1363.Lsqr4x_outer:
1364	movq	-32(%rsi,%rbp,1),%r14
1365	leaq	48+8(%rsp,%r9,2),%rdi
1366	movq	-24(%rsi,%rbp,1),%rax
1367	leaq	-32(%rdi,%rbp,1),%rdi
1368	movq	-16(%rsi,%rbp,1),%rbx
1369	movq	%rax,%r15
1370
1371	mulq	%r14
1372	movq	-24(%rdi,%rbp,1),%r10
1373	addq	%rax,%r10
1374	movq	%rbx,%rax
1375	adcq	$0,%rdx
1376	movq	%r10,-24(%rdi,%rbp,1)
1377	movq	%rdx,%r11
1378
1379	mulq	%r14
1380	addq	%rax,%r11
1381	movq	%rbx,%rax
1382	adcq	$0,%rdx
1383	addq	-16(%rdi,%rbp,1),%r11
1384	movq	%rdx,%r10
1385	adcq	$0,%r10
1386	movq	%r11,-16(%rdi,%rbp,1)
1387
1388	xorq	%r12,%r12
1389
1390	movq	-8(%rsi,%rbp,1),%rbx
1391	mulq	%r15
1392	addq	%rax,%r12
1393	movq	%rbx,%rax
1394	adcq	$0,%rdx
1395	addq	-8(%rdi,%rbp,1),%r12
1396	movq	%rdx,%r13
1397	adcq	$0,%r13
1398
1399	mulq	%r14
1400	addq	%rax,%r10
1401	movq	%rbx,%rax
1402	adcq	$0,%rdx
1403	addq	%r12,%r10
1404	movq	%rdx,%r11
1405	adcq	$0,%r11
1406	movq	%r10,-8(%rdi,%rbp,1)
1407
1408	leaq	(%rbp),%rcx
1409	jmp	.Lsqr4x_inner
1410
1411.align	32
1412.Lsqr4x_inner:
1413	movq	(%rsi,%rcx,1),%rbx
1414	mulq	%r15
1415	addq	%rax,%r13
1416	movq	%rbx,%rax
1417	movq	%rdx,%r12
1418	adcq	$0,%r12
1419	addq	(%rdi,%rcx,1),%r13
1420	adcq	$0,%r12
1421
1422.byte	0x67
1423	mulq	%r14
1424	addq	%rax,%r11
1425	movq	%rbx,%rax
1426	movq	8(%rsi,%rcx,1),%rbx
1427	movq	%rdx,%r10
1428	adcq	$0,%r10
1429	addq	%r13,%r11
1430	adcq	$0,%r10
1431
1432	mulq	%r15
1433	addq	%rax,%r12
1434	movq	%r11,(%rdi,%rcx,1)
1435	movq	%rbx,%rax
1436	movq	%rdx,%r13
1437	adcq	$0,%r13
1438	addq	8(%rdi,%rcx,1),%r12
1439	leaq	16(%rcx),%rcx
1440	adcq	$0,%r13
1441
1442	mulq	%r14
1443	addq	%rax,%r10
1444	movq	%rbx,%rax
1445	adcq	$0,%rdx
1446	addq	%r12,%r10
1447	movq	%rdx,%r11
1448	adcq	$0,%r11
1449	movq	%r10,-8(%rdi,%rcx,1)
1450
1451	cmpq	$0,%rcx
1452	jne	.Lsqr4x_inner
1453
1454.byte	0x67
1455	mulq	%r15
1456	addq	%rax,%r13
1457	adcq	$0,%rdx
1458	addq	%r11,%r13
1459	adcq	$0,%rdx
1460
1461	movq	%r13,(%rdi)
1462	movq	%rdx,%r12
1463	movq	%rdx,8(%rdi)
1464
1465	addq	$16,%rbp
1466	jnz	.Lsqr4x_outer
1467
1468
1469	movq	-32(%rsi),%r14
1470	leaq	48+8(%rsp,%r9,2),%rdi
1471	movq	-24(%rsi),%rax
1472	leaq	-32(%rdi,%rbp,1),%rdi
1473	movq	-16(%rsi),%rbx
1474	movq	%rax,%r15
1475
1476	mulq	%r14
1477	addq	%rax,%r10
1478	movq	%rbx,%rax
1479	movq	%rdx,%r11
1480	adcq	$0,%r11
1481
1482	mulq	%r14
1483	addq	%rax,%r11
1484	movq	%rbx,%rax
1485	movq	%r10,-24(%rdi)
1486	movq	%rdx,%r10
1487	adcq	$0,%r10
1488	addq	%r13,%r11
1489	movq	-8(%rsi),%rbx
1490	adcq	$0,%r10
1491
1492	mulq	%r15
1493	addq	%rax,%r12
1494	movq	%rbx,%rax
1495	movq	%r11,-16(%rdi)
1496	movq	%rdx,%r13
1497	adcq	$0,%r13
1498
1499	mulq	%r14
1500	addq	%rax,%r10
1501	movq	%rbx,%rax
1502	movq	%rdx,%r11
1503	adcq	$0,%r11
1504	addq	%r12,%r10
1505	adcq	$0,%r11
1506	movq	%r10,-8(%rdi)
1507
1508	mulq	%r15
1509	addq	%rax,%r13
1510	movq	-16(%rsi),%rax
1511	adcq	$0,%rdx
1512	addq	%r11,%r13
1513	adcq	$0,%rdx
1514
1515	movq	%r13,(%rdi)
1516	movq	%rdx,%r12
1517	movq	%rdx,8(%rdi)
1518
1519	mulq	%rbx
1520	addq	$16,%rbp
1521	xorq	%r14,%r14
1522	subq	%r9,%rbp
1523	xorq	%r15,%r15
1524
1525	addq	%r12,%rax
1526	adcq	$0,%rdx
1527	movq	%rax,8(%rdi)
1528	movq	%rdx,16(%rdi)
1529	movq	%r15,24(%rdi)
1530
1531	movq	-16(%rsi,%rbp,1),%rax
1532	leaq	48+8(%rsp),%rdi
1533	xorq	%r10,%r10
1534	movq	8(%rdi),%r11
1535
1536	leaq	(%r14,%r10,2),%r12
1537	shrq	$63,%r10
1538	leaq	(%rcx,%r11,2),%r13
1539	shrq	$63,%r11
1540	orq	%r10,%r13
1541	movq	16(%rdi),%r10
1542	movq	%r11,%r14
1543	mulq	%rax
1544	negq	%r15
1545	movq	24(%rdi),%r11
1546	adcq	%rax,%r12
1547	movq	-8(%rsi,%rbp,1),%rax
1548	movq	%r12,(%rdi)
1549	adcq	%rdx,%r13
1550
1551	leaq	(%r14,%r10,2),%rbx
1552	movq	%r13,8(%rdi)
1553	sbbq	%r15,%r15
1554	shrq	$63,%r10
1555	leaq	(%rcx,%r11,2),%r8
1556	shrq	$63,%r11
1557	orq	%r10,%r8
1558	movq	32(%rdi),%r10
1559	movq	%r11,%r14
1560	mulq	%rax
1561	negq	%r15
1562	movq	40(%rdi),%r11
1563	adcq	%rax,%rbx
1564	movq	0(%rsi,%rbp,1),%rax
1565	movq	%rbx,16(%rdi)
1566	adcq	%rdx,%r8
1567	leaq	16(%rbp),%rbp
1568	movq	%r8,24(%rdi)
1569	sbbq	%r15,%r15
1570	leaq	64(%rdi),%rdi
1571	jmp	.Lsqr4x_shift_n_add
1572
1573.align	32
1574.Lsqr4x_shift_n_add:
1575	leaq	(%r14,%r10,2),%r12
1576	shrq	$63,%r10
1577	leaq	(%rcx,%r11,2),%r13
1578	shrq	$63,%r11
1579	orq	%r10,%r13
1580	movq	-16(%rdi),%r10
1581	movq	%r11,%r14
1582	mulq	%rax
1583	negq	%r15
1584	movq	-8(%rdi),%r11
1585	adcq	%rax,%r12
1586	movq	-8(%rsi,%rbp,1),%rax
1587	movq	%r12,-32(%rdi)
1588	adcq	%rdx,%r13
1589
1590	leaq	(%r14,%r10,2),%rbx
1591	movq	%r13,-24(%rdi)
1592	sbbq	%r15,%r15
1593	shrq	$63,%r10
1594	leaq	(%rcx,%r11,2),%r8
1595	shrq	$63,%r11
1596	orq	%r10,%r8
1597	movq	0(%rdi),%r10
1598	movq	%r11,%r14
1599	mulq	%rax
1600	negq	%r15
1601	movq	8(%rdi),%r11
1602	adcq	%rax,%rbx
1603	movq	0(%rsi,%rbp,1),%rax
1604	movq	%rbx,-16(%rdi)
1605	adcq	%rdx,%r8
1606
1607	leaq	(%r14,%r10,2),%r12
1608	movq	%r8,-8(%rdi)
1609	sbbq	%r15,%r15
1610	shrq	$63,%r10
1611	leaq	(%rcx,%r11,2),%r13
1612	shrq	$63,%r11
1613	orq	%r10,%r13
1614	movq	16(%rdi),%r10
1615	movq	%r11,%r14
1616	mulq	%rax
1617	negq	%r15
1618	movq	24(%rdi),%r11
1619	adcq	%rax,%r12
1620	movq	8(%rsi,%rbp,1),%rax
1621	movq	%r12,0(%rdi)
1622	adcq	%rdx,%r13
1623
1624	leaq	(%r14,%r10,2),%rbx
1625	movq	%r13,8(%rdi)
1626	sbbq	%r15,%r15
1627	shrq	$63,%r10
1628	leaq	(%rcx,%r11,2),%r8
1629	shrq	$63,%r11
1630	orq	%r10,%r8
1631	movq	32(%rdi),%r10
1632	movq	%r11,%r14
1633	mulq	%rax
1634	negq	%r15
1635	movq	40(%rdi),%r11
1636	adcq	%rax,%rbx
1637	movq	16(%rsi,%rbp,1),%rax
1638	movq	%rbx,16(%rdi)
1639	adcq	%rdx,%r8
1640	movq	%r8,24(%rdi)
1641	sbbq	%r15,%r15
1642	leaq	64(%rdi),%rdi
1643	addq	$32,%rbp
1644	jnz	.Lsqr4x_shift_n_add
1645
1646	leaq	(%r14,%r10,2),%r12
1647.byte	0x67
1648	shrq	$63,%r10
1649	leaq	(%rcx,%r11,2),%r13
1650	shrq	$63,%r11
1651	orq	%r10,%r13
1652	movq	-16(%rdi),%r10
1653	movq	%r11,%r14
1654	mulq	%rax
1655	negq	%r15
1656	movq	-8(%rdi),%r11
1657	adcq	%rax,%r12
1658	movq	-8(%rsi),%rax
1659	movq	%r12,-32(%rdi)
1660	adcq	%rdx,%r13
1661
1662	leaq	(%r14,%r10,2),%rbx
1663	movq	%r13,-24(%rdi)
1664	sbbq	%r15,%r15
1665	shrq	$63,%r10
1666	leaq	(%rcx,%r11,2),%r8
1667	shrq	$63,%r11
1668	orq	%r10,%r8
1669	mulq	%rax
1670	negq	%r15
1671	adcq	%rax,%rbx
1672	adcq	%rdx,%r8
1673	movq	%rbx,-16(%rdi)
1674	movq	%r8,-8(%rdi)
1675.byte	102,72,15,126,213
/*
 * __bn_sqr8x_reduction — Montgomery reduction of a 2*num-limb product
 * (generic mul-based path, used when MULX/ADX are unavailable).
 *
 * In (register/stack contract visible from this code):
 *   %rbp          = modulus pointer; limbs read at 0..56(%rbp), advanced by 64
 *   %r9           = num in bytes (negated internally)
 *   32+8(%rsp)    = n0, the Montgomery constant (-n^-1 mod 2^64)
 *   48+8(%rsp)... = the product/tp area being reduced in place
 * Uses %rsi as a 0/-1 borrow mask between 8-limb chunks (sbb/neg+adc).
 * Processes 8 limbs of the product per outer iteration; the multiplier
 * for each limb is formed with imul by n0 and saved on the stack so the
 * tail pass can reuse it.  NOTE(review): exact frame layout is defined by
 * the callers (bn_sqr8x_internal / bn_from_mont8x) — not visible here.
 */
__bn_sqr8x_reduction:
	xorq	%rax,%rax
	leaq	(%r9,%rbp,1),%rcx
	leaq	48+8(%rsp,%r9,2),%rdx
	movq	%rcx,0+8(%rsp)
	leaq	48+8(%rsp,%r9,1),%rdi
	movq	%rdx,8+8(%rsp)
	negq	%r9
	jmp	.L8x_reduction_loop

.align	32
.L8x_reduction_loop:
	/* load next 8 limbs of the value being reduced */
	leaq	(%rdi,%r9,1),%rdi
.byte	0x66
	movq	0(%rdi),%rbx
	movq	8(%rdi),%r9
	movq	16(%rdi),%r10
	movq	24(%rdi),%r11
	movq	32(%rdi),%r12
	movq	40(%rdi),%r13
	movq	48(%rdi),%r14
	movq	56(%rdi),%r15
	movq	%rax,(%rdx)
	leaq	64(%rdi),%rdi

.byte	0x67
	movq	%rbx,%r8
	imulq	32+8(%rsp),%rbx		/* rbx = m = limb * n0 */
	movq	0(%rbp),%rax
	movl	$8,%ecx
	jmp	.L8x_reduce

.align	32
.L8x_reduce:
	/* one limb of m * n[0..7]; the neg %r8 recreates the carry that
	 * makes limb + m*n[0] == 0 mod 2^64 */
	mulq	%rbx
	movq	8(%rbp),%rax
	negq	%r8
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	movq	%rbx,48-8+8(%rsp,%rcx,8)	/* stash m for the tail pass */
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	32+8(%rsp),%rsi
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	imulq	%r8,%rsi		/* next multiplier, computed early */
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rbp),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	%rsi,%rbx
	addq	%rax,%r15
	movq	0(%rbp),%rax
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	.L8x_reduce

	/* advance to next 8 modulus limbs; stop when past end of modulus */
	leaq	64(%rbp),%rbp
	xorq	%rax,%rax
	movq	8+8(%rsp),%rdx
	cmpq	0+8(%rsp),%rbp
	jae	.L8x_no_tail

.byte	0x66
	addq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi		/* rsi = 0/-1 carry mask */

	movq	48+56+8(%rsp),%rbx	/* reload first saved multiplier */
	movl	$8,%ecx
	movq	0(%rbp),%rax
	jmp	.L8x_tail

.align	32
.L8x_tail:
	/* tail pass: apply the 8 saved multipliers to the remaining
	 * modulus limbs, same mul/adc lattice as .L8x_reduce */
	mulq	%rbx
	addq	%rax,%r8
	movq	8(%rbp),%rax
	movq	%r8,(%rdi)
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	leaq	8(%rdi),%rdi
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rbp),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	48-16+8(%rsp,%rcx,8),%rbx	/* next saved multiplier */
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	0(%rbp),%rax
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	.L8x_tail

	leaq	64(%rbp),%rbp
	movq	8+8(%rsp),%rdx
	cmpq	0+8(%rsp),%rbp
	jae	.L8x_tail_done

	/* more modulus limbs remain: fold in next tp chunk and iterate */
	movq	48+56+8(%rsp),%rbx
	negq	%rsi			/* restore saved carry into CF */
	movq	0(%rbp),%rax
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi

	movl	$8,%ecx
	jmp	.L8x_tail

.align	32
.L8x_tail_done:
	addq	(%rdx),%r8		/* fold in top-most carry word */
	adcq	$0,%r9
	adcq	$0,%r10
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15


	xorq	%rax,%rax

	negq	%rsi
.L8x_no_tail:
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	adcq	$0,%rax			/* rax = top carry of the reduction */
	movq	-8(%rbp),%rcx
	xorq	%rsi,%rsi

.byte	102,72,15,126,213		/* movq %xmm2,%rbp (restore pointer) */

	/* store reduced 8-limb chunk back */
	movq	%r8,0(%rdi)
	movq	%r9,8(%rdi)
.byte	102,73,15,126,217		/* movq %xmm3,%r9 */
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)
	leaq	64(%rdi),%rdi

	cmpq	%rdx,%rdi
	jb	.L8x_reduction_loop
	.byte	0xf3,0xc3		/* rep ret */
.size	bn_sqr8x_internal,.-bn_sqr8x_internal
/*
 * __bn_post4x_internal — branch-free conditional final subtraction.
 * Computes rp[] = tp[] - (mask & np[]) with full borrow propagation,
 * 4 limbs per iteration, where %rax is an all-ones/all-zero mask
 * (negated on entry) selecting whether the modulus is subtracted.
 * No secret-dependent branches: the mask is applied with not/and.
 * In:  %rbp = modulus, %rdi = destination, %r9 = num (bytes),
 *      %xmm1 = saved pointer restored into both %rdi and %rsi
 *      (prepares for a back-to-back internal call — see callers).
 * Out: %r9 negated back, %r10 = num.
 */
.type	__bn_post4x_internal,@function
.align	32
__bn_post4x_internal:
	movq	0(%rbp),%r12
	leaq	(%rdi,%r9,1),%rbx
	movq	%r9,%rcx
.byte	102,72,15,126,207		/* movq %xmm1,%rdi */
	negq	%rax
.byte	102,72,15,126,206		/* movq %xmm1,%rsi */
	sarq	$3+2,%rcx		/* rcx = -(num limbs)/4 loop counter */
	decq	%r12			/* borrow-in trick for two's complement */
	xorq	%r10,%r10
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqr4x_sub_entry

.align	16
.Lsqr4x_sub:
	movq	0(%rbp),%r12
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
.Lsqr4x_sub_entry:
	leaq	32(%rbp),%rbp
	/* ~n & mask: adding this with carry == subtracting mask&n */
	notq	%r12
	notq	%r13
	notq	%r14
	notq	%r15
	andq	%rax,%r12
	andq	%rax,%r13
	andq	%rax,%r14
	andq	%rax,%r15

	negq	%r10			/* restore borrow into CF */
	adcq	0(%rbx),%r12
	adcq	8(%rbx),%r13
	adcq	16(%rbx),%r14
	adcq	24(%rbx),%r15
	movq	%r12,0(%rdi)
	leaq	32(%rbx),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r10,%r10		/* save borrow as 0/-1 */
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi

	incq	%rcx
	jnz	.Lsqr4x_sub

	movq	%r9,%r10
	negq	%r9
	.byte	0xf3,0xc3		/* rep ret */
.size	__bn_post4x_internal,.-__bn_post4x_internal
/*
 * bn_from_montgomery — public entry point.
 * Only element counts divisible by 8 are handled here: those tail-jump
 * into bn_from_mont8x (which immediately overwrites %rax, so preloading
 * %eax is harmless); anything else returns 0 ("not handled").
 */
.globl	bn_from_montgomery
.type	bn_from_montgomery,@function
.align	32
bn_from_montgomery:
	xorl	%eax,%eax		/* default return value: 0 */
	testl	$7,%r9d			/* num % 8 == 0 ? */
	jz	bn_from_mont8x		/* yes: tail-call the 8x routine */
	.byte	0xf3,0xc3		/* rep ret */
.size	bn_from_montgomery,.-bn_from_montgomery
2000
/*
 * bn_from_mont8x — convert from Montgomery representation for num
 * divisible by 8: copies the input into a zero-padded 2*num scratch
 * area, runs one Montgomery reduction, then a conditional final
 * subtraction, and wipes the scratch.
 * SysV args: rdi=rp, rsi=ap, rdx=bp(unused here), rcx=np, r8=&n0, r9=num.
 * Picks the MULX/ADX path when CPUID bits 0x80108 (BMI2+ADX+...) are all
 * set in OPENSSL_ia32cap_P[2]; otherwise the generic mul path.
 */
.type	bn_from_mont8x,@function
.align	32
bn_from_mont8x:
.byte	0x67
	movq	%rsp,%rax		/* remember original rsp for epilogue */
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
.Lfrom_prologue:

	shll	$3,%r9d			/* num in bytes */
	leaq	(%r9,%r9,2),%r10	/* r10 = 3*num */
	negq	%r9
	movq	(%r8),%r8		/* load n0 value */

	/* Choose a frame so tp and rp do not alias modulo 4096
	 * (avoids pathological cache/page aliasing). */
	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lfrom_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lfrom_sp_done

.align	32
.Lfrom_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lfrom_sp_done:
	andq	$-64,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lfrom_page_walk
	jmp	.Lfrom_page_walk_done

.Lfrom_page_walk:
	/* touch every page of the new frame (guard-page safety) */
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lfrom_page_walk
.Lfrom_page_walk_done:

	movq	%r9,%r10
	negq	%r9

	movq	%r8,32(%rsp)		/* save n0 */
	movq	%rax,40(%rsp)		/* save original rsp */
.Lfrom_body:
	movq	%r9,%r11
	leaq	48(%rsp),%rax
	pxor	%xmm0,%xmm0
	jmp	.Lmul_by_1

.align	32
.Lmul_by_1:
	/* copy a[] into low half of tp, zero the high half */
	movdqu	(%rsi),%xmm1
	movdqu	16(%rsi),%xmm2
	movdqu	32(%rsi),%xmm3
	movdqa	%xmm0,(%rax,%r9,1)
	movdqu	48(%rsi),%xmm4
	movdqa	%xmm0,16(%rax,%r9,1)
.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00	/* leaq 64(%rsi),%rsi */
	movdqa	%xmm1,(%rax)
	movdqa	%xmm0,32(%rax,%r9,1)
	movdqa	%xmm2,16(%rax)
	movdqa	%xmm0,48(%rax,%r9,1)
	movdqa	%xmm3,32(%rax)
	movdqa	%xmm4,48(%rax)
	leaq	64(%rax),%rax
	subq	$64,%r11
	jnz	.Lmul_by_1

.byte	102,72,15,110,207		/* movq %rdi,%xmm1 — stash rp */
.byte	102,72,15,110,209		/* movq %rcx,%xmm2 — stash np */
.byte	0x67
	movq	%rcx,%rbp
.byte	102,73,15,110,218		/* movq %r10,%xmm3 — stash num */
	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
	andl	$0x80108,%r11d
	cmpl	$0x80108,%r11d		/* MULX/ADX capable? */
	jne	.Lfrom_mont_nox

	leaq	(%rax,%r9,1),%rdi
	call	__bn_sqrx8x_reduction
	call	__bn_postx4x_internal

	pxor	%xmm0,%xmm0
	leaq	48(%rsp),%rax
	movq	40(%rsp),%rsi
	jmp	.Lfrom_mont_zero

.align	32
.Lfrom_mont_nox:
	call	__bn_sqr8x_reduction
	call	__bn_post4x_internal

	pxor	%xmm0,%xmm0
	leaq	48(%rsp),%rax
	movq	40(%rsp),%rsi
	jmp	.Lfrom_mont_zero

.align	32
.Lfrom_mont_zero:
	/* wipe the scratch area (don't leak intermediate values) */
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm0,16(%rax)
	movdqa	%xmm0,32(%rax)
	movdqa	%xmm0,48(%rax)
	leaq	64(%rax),%rax
	subq	$32,%r9
	jnz	.Lfrom_mont_zero

	movq	$1,%rax			/* return 1 */
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp		/* restore caller's stack */
.Lfrom_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
.size	bn_from_mont8x,.-bn_from_mont8x
/*
 * bn_mulx4x_mont_gather5 — MULX/ADX Montgomery multiplication with a
 * gather-5 (power table) B operand.  This wrapper only builds the
 * anti-aliased stack frame (same scheme as bn_from_mont8x), saves n0
 * and the original rsp, and delegates to mulx4x_internal.
 * SysV args: rdi=rp, rsi=ap, rdx=table, rcx=np, r8=&n0, r9=num.
 */
.type	bn_mulx4x_mont_gather5,@function
.align	32
bn_mulx4x_mont_gather5:
	movq	%rsp,%rax		/* remember original rsp */
.Lmulx4x_enter:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
.Lmulx4x_prologue:

	shll	$3,%r9d			/* num in bytes */
	leaq	(%r9,%r9,2),%r10	/* r10 = 3*num */
	negq	%r9
	movq	(%r8),%r8		/* load n0 value */

	/* pick frame so tp and rp don't alias modulo 4096 */
	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lmulx4xsp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lmulx4xsp_done

.Lmulx4xsp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lmulx4xsp_done:
	andq	$-64,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
	jmp	.Lmulx4x_page_walk_done

.Lmulx4x_page_walk:
	/* touch every page of the new frame (guard-page safety) */
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
.Lmulx4x_page_walk_done:

	movq	%r8,32(%rsp)		/* save n0 */
	movq	%rax,40(%rsp)		/* save original rsp */
.Lmulx4x_body:
	call	mulx4x_internal

	movq	40(%rsp),%rsi
	movq	$1,%rax			/* return 1 */

	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp		/* restore caller's stack */
.Lmulx4x_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
.size	bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
2243
/*
 * mulx4x_internal — core MULX/ADX Montgomery multiplication, 4 limbs
 * per iteration, with a constant-time gather of the B operand from a
 * 32-entry scattered power table (gather-5).
 *
 * The pshufd/paddd/pcmpeqd prologue expands the secret table index
 * (loaded via movd 8(%rax)) into 32 sixteen-byte compare masks against
 * .Linc; the pand/por sweeps then read EVERY table entry and mask-select
 * the wanted one, so no memory address depends on the secret index.
 * The mulx/adcx/adox body runs two independent carry chains (CF and OF)
 * interleaving a[i]*b and the Montgomery reduction by m = t*n0 mod 2^64.
 * Ends by computing the borrow mask and tail-jumping into
 * .Lsqrx4x_sub_entry for the conditional final subtraction.
 *
 * In: rdi=rp, rsi=ap, rdx=table+?, rcx=np, r9=num(bytes), rax=frame ptr,
 *     32+8(%rsp)=n0 (stored by the wrappers).  NOTE(review): register
 * contract inferred from this file's wrappers — confirm against the
 * perlasm source if modifying.
 */
.type	mulx4x_internal,@function
.align	32
mulx4x_internal:
	movq	%r9,8(%rsp)
	movq	%r9,%r10
	negq	%r9
	shlq	$5,%r9
	negq	%r10
	leaq	128(%rdx,%r9,1),%r13
	shrq	$5+5,%r9
	movd	8(%rax),%xmm5		/* xmm5 = secret table index */
	subq	$1,%r9
	leaq	.Linc(%rip),%rax
	movq	%r13,16+8(%rsp)
	movq	%r9,24+8(%rsp)
	movq	%rdi,56+8(%rsp)
	movdqa	0(%rax),%xmm0
	movdqa	16(%rax),%xmm1
	leaq	88-112(%rsp,%r10,1),%r10
	leaq	128(%rdx),%rdi

	/* build 32 compare masks: mask[i] = (i == index) ? ~0 : 0 */
	pshufd	$0,%xmm5,%xmm5
	movdqa	%xmm1,%xmm4
.byte	0x67
	movdqa	%xmm1,%xmm2
.byte	0x67
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,112(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,128(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,144(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,160(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,176(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,192(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,208(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,224(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,240(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,256(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,272(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,288(%r10)
	movdqa	%xmm4,%xmm3
.byte	0x67
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,304(%r10)

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,320(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,336(%r10)

	/* constant-time gather of b[0]: scan whole table under masks */
	pand	64(%rdi),%xmm0
	pand	80(%rdi),%xmm1
	pand	96(%rdi),%xmm2
	movdqa	%xmm3,352(%r10)
	pand	112(%rdi),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-128(%rdi),%xmm4
	movdqa	-112(%rdi),%xmm5
	movdqa	-96(%rdi),%xmm2
	pand	112(%r10),%xmm4
	movdqa	-80(%rdi),%xmm3
	pand	128(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	144(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	160(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-64(%rdi),%xmm4
	movdqa	-48(%rdi),%xmm5
	movdqa	-32(%rdi),%xmm2
	pand	176(%r10),%xmm4
	movdqa	-16(%rdi),%xmm3
	pand	192(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	208(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	224(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	0(%rdi),%xmm4
	movdqa	16(%rdi),%xmm5
	movdqa	32(%rdi),%xmm2
	pand	240(%r10),%xmm4
	movdqa	48(%rdi),%xmm3
	pand	256(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	272(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	288(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	pxor	%xmm1,%xmm0
	pshufd	$0x4e,%xmm0,%xmm1
	por	%xmm1,%xmm0
	leaq	256(%rdi),%rdi
.byte	102,72,15,126,194		/* movq %xmm0,%rdx — gathered b word */
	leaq	64+32+8(%rsp),%rbx

	/* first 4 limbs: a[0..3]*b0 plus start of reduction by m=t0*n0 */
	movq	%rdx,%r9
	mulxq	0(%rsi),%r8,%rax
	mulxq	8(%rsi),%r11,%r12
	addq	%rax,%r11
	mulxq	16(%rsi),%rax,%r13
	adcq	%rax,%r12
	adcq	$0,%r13
	mulxq	24(%rsi),%rax,%r14

	movq	%r8,%r15
	imulq	32+8(%rsp),%r8		/* m = t0 * n0 */
	xorq	%rbp,%rbp		/* rbp = 0 and clears CF/OF */
	movq	%r8,%rdx

	movq	%rdi,8+8(%rsp)

	leaq	32(%rsi),%rsi
	adcxq	%rax,%r13
	adcxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	movq	24+8(%rsp),%rdi
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r11,-24(%rbx)
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r12,-16(%rbx)
	jmp	.Lmulx4x_1st

.align	32
.Lmulx4x_1st:
	/* first outer pass: dual carry chains, a[] in CF chain,
	 * n[] reduction in OF chain */
	adcxq	%rbp,%r15
	mulxq	0(%rsi),%r10,%rax
	adcxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
.byte	0x67,0x67
	movq	%r8,%rdx
	adcxq	%rax,%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-32(%rbx)
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_1st

	movq	8(%rsp),%rax
	adcq	%rbp,%r15
	leaq	(%rsi,%rax,1),%rsi	/* rewind ap */
	addq	%r15,%r14
	movq	8+8(%rsp),%rdi
	adcq	%rbp,%rbp		/* capture top carry */
	movq	%r14,-8(%rbx)
	jmp	.Lmulx4x_outer

.align	32
.Lmulx4x_outer:
	/* gather next b word (same constant-time full-table scan) */
	leaq	16-256(%rbx),%r10
	pxor	%xmm4,%xmm4
.byte	0x67,0x67
	pxor	%xmm5,%xmm5
	movdqa	-128(%rdi),%xmm0
	movdqa	-112(%rdi),%xmm1
	movdqa	-96(%rdi),%xmm2
	pand	256(%r10),%xmm0
	movdqa	-80(%rdi),%xmm3
	pand	272(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	288(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	304(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%rdi),%xmm0
	movdqa	-48(%rdi),%xmm1
	movdqa	-32(%rdi),%xmm2
	pand	320(%r10),%xmm0
	movdqa	-16(%rdi),%xmm3
	pand	336(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	352(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	368(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%rdi),%xmm0
	movdqa	16(%rdi),%xmm1
	movdqa	32(%rdi),%xmm2
	pand	384(%r10),%xmm0
	movdqa	48(%rdi),%xmm3
	pand	400(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	416(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	432(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%rdi),%xmm0
	movdqa	80(%rdi),%xmm1
	movdqa	96(%rdi),%xmm2
	pand	448(%r10),%xmm0
	movdqa	112(%rdi),%xmm3
	pand	464(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	480(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	496(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	pshufd	$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	leaq	256(%rdi),%rdi
.byte	102,72,15,126,194		/* movq %xmm0,%rdx — next b word */

	movq	%rbp,(%rbx)		/* store carried top word */
	leaq	32(%rbx,%rax,1),%rbx
	mulxq	0(%rsi),%r8,%r11
	xorq	%rbp,%rbp		/* rbp = 0, clear CF/OF */
	movq	%rdx,%r9
	mulxq	8(%rsi),%r14,%r12
	adoxq	-32(%rbx),%r8		/* add previous tp[] in OF chain */
	adcxq	%r14,%r11
	mulxq	16(%rsi),%r15,%r13
	adoxq	-24(%rbx),%r11
	adcxq	%r15,%r12
	mulxq	24(%rsi),%rdx,%r14
	adoxq	-16(%rbx),%r12
	adcxq	%rdx,%r13
	leaq	(%rcx,%rax,1),%rcx	/* rewind np */
	leaq	32(%rsi),%rsi
	adoxq	-8(%rbx),%r13
	adcxq	%rbp,%r14
	adoxq	%rbp,%r14

	movq	%r8,%r15
	imulq	32+8(%rsp),%r8		/* m = t0 * n0 */

	movq	%r8,%rdx
	xorq	%rbp,%rbp
	movq	%rdi,8+8(%rsp)

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	24+8(%rsp),%rdi
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-24(%rbx)
	adoxq	%rbp,%r15
	movq	%r12,-16(%rbx)
	leaq	32(%rcx),%rcx
	jmp	.Lmulx4x_inner

.align	32
.Lmulx4x_inner:
	mulxq	0(%rsi),%r10,%rax
	adcxq	%rbp,%r15
	adoxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	0(%rbx),%r10
	adoxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	8(%rbx),%r11
	adoxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
	movq	%r8,%rdx
	adcxq	16(%rbx),%r12
	adoxq	%rax,%r13
	adcxq	24(%rbx),%r13
	adoxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx
	adcxq	%rbp,%r14

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	adoxq	%r15,%r13
	movq	%r11,-32(%rbx)
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	leaq	32(%rcx),%rcx
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_inner

	movq	0+8(%rsp),%rax
	adcq	%rbp,%r15
	subq	0(%rbx),%rdi		/* value discarded; sets CF for adc */
	movq	8+8(%rsp),%rdi
	movq	16+8(%rsp),%r10
	adcq	%r15,%r14
	leaq	(%rsi,%rax,1),%rsi	/* rewind ap */
	adcq	%rbp,%rbp		/* capture top carry */
	movq	%r14,-8(%rbx)

	cmpq	%r10,%rdi
	jb	.Lmulx4x_outer

	/* all b words consumed: set up borrow mask and fall through to
	 * the conditional final subtraction (.Lsqrx4x_sub_entry) */
	movq	-8(%rcx),%r10
	movq	%rbp,%r8
	movq	(%rcx,%rax,1),%r12
	leaq	(%rcx,%rax,1),%rbp	/* rbp = np */
	movq	%rax,%rcx
	leaq	(%rbx,%rax,1),%rdi
	xorl	%eax,%eax
	xorq	%r15,%r15
	subq	%r14,%r10		/* does top limb need subtraction? */
	adcq	%r15,%r15
	orq	%r15,%r8
	sarq	$3+2,%rcx
	subq	%r8,%rax		/* rax = 0 or -1 selection mask */
	movq	56+8(%rsp),%rdx		/* restore rp */
	decq	%r12			/* borrow-in for two's complement */
	movq	8(%rbp),%r13
	xorq	%r8,%r8
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqrx4x_sub_entry
.size	mulx4x_internal,.-mulx4x_internal
/*
 * bn_powerx5 — MULX/ADX path for one window-5 exponentiation step:
 * five back-to-back Montgomery squarings followed by one Montgomery
 * multiplication by a gathered table entry (i.e. r = a^32 * b mod n,
 * per the call sequence below).  Frame setup mirrors bn_from_mont8x.
 * SysV args: rdi=rp, rsi=ap, rdx=table, rcx=np, r8=&n0, r9=num.
 */
.type	bn_powerx5,@function
.align	32
bn_powerx5:
	movq	%rsp,%rax		/* remember original rsp */
.Lpowerx5_enter:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
.Lpowerx5_prologue:

	shll	$3,%r9d			/* num in bytes */
	leaq	(%r9,%r9,2),%r10	/* r10 = 3*num */
	negq	%r9
	movq	(%r8),%r8		/* load n0 value */

	/* pick frame so tp and rp don't alias modulo 4096 */
	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lpwrx_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lpwrx_sp_done

.align	32
.Lpwrx_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lpwrx_sp_done:
	andq	$-64,%rbp
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lpwrx_page_walk
	jmp	.Lpwrx_page_walk_done

.Lpwrx_page_walk:
	/* touch every page of the new frame (guard-page safety) */
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lpwrx_page_walk
.Lpwrx_page_walk_done:

	movq	%r9,%r10
	negq	%r9

	/* stash pointers in xmm regs so the internal calls can restore
	 * them without touching the stack */
	pxor	%xmm0,%xmm0
.byte	102,72,15,110,207		/* movq %rdi,%xmm1 — rp */
.byte	102,72,15,110,209		/* movq %rcx,%xmm2 — np */
.byte	102,73,15,110,218		/* movq %r10,%xmm3 — num */
.byte	102,72,15,110,226		/* movq %rdx,%xmm4 — table */
	movq	%r8,32(%rsp)		/* save n0 */
	movq	%rax,40(%rsp)		/* save original rsp */
.Lpowerx5_body:

	/* five Montgomery squarings, each followed by the conditional
	 * final subtraction */
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal

	movq	%r10,%r9
	movq	%rsi,%rdi
.byte	102,72,15,126,209		/* movq %xmm2,%rcx — restore np */
.byte	102,72,15,126,226		/* movq %xmm4,%rdx — restore table */
	movq	40(%rsp),%rax

	call	mulx4x_internal		/* final multiply by gathered b */

	movq	40(%rsp),%rsi
	movq	$1,%rax			/* return 1 */

	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp		/* restore caller's stack */
.Lpowerx5_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
.size	bn_powerx5,.-bn_powerx5
2782
2783.globl	bn_sqrx8x_internal
2784.hidden	bn_sqrx8x_internal
2785.type	bn_sqrx8x_internal,@function
2786.align	32
2787bn_sqrx8x_internal:
2788__bn_sqrx8x_internal:
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829	leaq	48+8(%rsp),%rdi
2830	leaq	(%rsi,%r9,1),%rbp
2831	movq	%r9,0+8(%rsp)
2832	movq	%rbp,8+8(%rsp)
2833	jmp	.Lsqr8x_zero_start
2834
2835.align	32
2836.byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
2837.Lsqrx8x_zero:
2838.byte	0x3e
2839	movdqa	%xmm0,0(%rdi)
2840	movdqa	%xmm0,16(%rdi)
2841	movdqa	%xmm0,32(%rdi)
2842	movdqa	%xmm0,48(%rdi)
2843.Lsqr8x_zero_start:
2844	movdqa	%xmm0,64(%rdi)
2845	movdqa	%xmm0,80(%rdi)
2846	movdqa	%xmm0,96(%rdi)
2847	movdqa	%xmm0,112(%rdi)
2848	leaq	128(%rdi),%rdi
2849	subq	$64,%r9
2850	jnz	.Lsqrx8x_zero
2851
2852	movq	0(%rsi),%rdx
2853
2854	xorq	%r10,%r10
2855	xorq	%r11,%r11
2856	xorq	%r12,%r12
2857	xorq	%r13,%r13
2858	xorq	%r14,%r14
2859	xorq	%r15,%r15
2860	leaq	48+8(%rsp),%rdi
2861	xorq	%rbp,%rbp
2862	jmp	.Lsqrx8x_outer_loop
2863
2864.align	32
2865.Lsqrx8x_outer_loop:
2866	mulxq	8(%rsi),%r8,%rax
2867	adcxq	%r9,%r8
2868	adoxq	%rax,%r10
2869	mulxq	16(%rsi),%r9,%rax
2870	adcxq	%r10,%r9
2871	adoxq	%rax,%r11
2872.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
2873	adcxq	%r11,%r10
2874	adoxq	%rax,%r12
2875.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
2876	adcxq	%r12,%r11
2877	adoxq	%rax,%r13
2878	mulxq	40(%rsi),%r12,%rax
2879	adcxq	%r13,%r12
2880	adoxq	%rax,%r14
2881	mulxq	48(%rsi),%r13,%rax
2882	adcxq	%r14,%r13
2883	adoxq	%r15,%rax
2884	mulxq	56(%rsi),%r14,%r15
2885	movq	8(%rsi),%rdx
2886	adcxq	%rax,%r14
2887	adoxq	%rbp,%r15
2888	adcq	64(%rdi),%r15
2889	movq	%r8,8(%rdi)
2890	movq	%r9,16(%rdi)
2891	sbbq	%rcx,%rcx
2892	xorq	%rbp,%rbp
2893
2894
2895	mulxq	16(%rsi),%r8,%rbx
2896	mulxq	24(%rsi),%r9,%rax
2897	adcxq	%r10,%r8
2898	adoxq	%rbx,%r9
2899	mulxq	32(%rsi),%r10,%rbx
2900	adcxq	%r11,%r9
2901	adoxq	%rax,%r10
2902.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
2903	adcxq	%r12,%r10
2904	adoxq	%rbx,%r11
2905.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
2906	adcxq	%r13,%r11
2907	adoxq	%r14,%r12
2908.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
2909	movq	16(%rsi),%rdx
2910	adcxq	%rax,%r12
2911	adoxq	%rbx,%r13
2912	adcxq	%r15,%r13
2913	adoxq	%rbp,%r14
2914	adcxq	%rbp,%r14
2915
2916	movq	%r8,24(%rdi)
2917	movq	%r9,32(%rdi)
2918
2919	mulxq	24(%rsi),%r8,%rbx
2920	mulxq	32(%rsi),%r9,%rax
2921	adcxq	%r10,%r8
2922	adoxq	%rbx,%r9
2923	mulxq	40(%rsi),%r10,%rbx
2924	adcxq	%r11,%r9
2925	adoxq	%rax,%r10
2926.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
2927	adcxq	%r12,%r10
2928	adoxq	%r13,%r11
2929.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
2930.byte	0x3e
2931	movq	24(%rsi),%rdx
2932	adcxq	%rbx,%r11
2933	adoxq	%rax,%r12
2934	adcxq	%r14,%r12
2935	movq	%r8,40(%rdi)
2936	movq	%r9,48(%rdi)
2937	mulxq	32(%rsi),%r8,%rax
2938	adoxq	%rbp,%r13
2939	adcxq	%rbp,%r13
2940
2941	mulxq	40(%rsi),%r9,%rbx
2942	adcxq	%r10,%r8
2943	adoxq	%rax,%r9
2944	mulxq	48(%rsi),%r10,%rax
2945	adcxq	%r11,%r9
2946	adoxq	%r12,%r10
2947	mulxq	56(%rsi),%r11,%r12
2948	movq	32(%rsi),%rdx
2949	movq	40(%rsi),%r14
2950	adcxq	%rbx,%r10
2951	adoxq	%rax,%r11
2952	movq	48(%rsi),%r15
2953	adcxq	%r13,%r11
2954	adoxq	%rbp,%r12
2955	adcxq	%rbp,%r12
2956
2957	movq	%r8,56(%rdi)
2958	movq	%r9,64(%rdi)
2959
2960	mulxq	%r14,%r9,%rax
2961	movq	56(%rsi),%r8
2962	adcxq	%r10,%r9
2963	mulxq	%r15,%r10,%rbx
2964	adoxq	%rax,%r10
2965	adcxq	%r11,%r10
2966	mulxq	%r8,%r11,%rax
2967	movq	%r14,%rdx
2968	adoxq	%rbx,%r11
2969	adcxq	%r12,%r11
2970
2971	adcxq	%rbp,%rax
2972
2973	mulxq	%r15,%r14,%rbx
2974	mulxq	%r8,%r12,%r13
2975	movq	%r15,%rdx
2976	leaq	64(%rsi),%rsi
2977	adcxq	%r14,%r11
2978	adoxq	%rbx,%r12
2979	adcxq	%rax,%r12
2980	adoxq	%rbp,%r13
2981
2982.byte	0x67,0x67
2983	mulxq	%r8,%r8,%r14
2984	adcxq	%r8,%r13
2985	adcxq	%rbp,%r14
2986
2987	cmpq	8+8(%rsp),%rsi
2988	je	.Lsqrx8x_outer_break
2989
2990	negq	%rcx
2991	movq	$-8,%rcx
2992	movq	%rbp,%r15
2993	movq	64(%rdi),%r8
2994	adcxq	72(%rdi),%r9
2995	adcxq	80(%rdi),%r10
2996	adcxq	88(%rdi),%r11
2997	adcq	96(%rdi),%r12
2998	adcq	104(%rdi),%r13
2999	adcq	112(%rdi),%r14
3000	adcq	120(%rdi),%r15
3001	leaq	(%rsi),%rbp
3002	leaq	128(%rdi),%rdi
3003	sbbq	%rax,%rax
3004
3005	movq	-64(%rsi),%rdx
3006	movq	%rax,16+8(%rsp)
3007	movq	%rdi,24+8(%rsp)
3008
3009
3010	xorl	%eax,%eax
3011	jmp	.Lsqrx8x_loop
3012
3013.align	32
3014.Lsqrx8x_loop:
3015	movq	%r8,%rbx
3016	mulxq	0(%rbp),%rax,%r8
3017	adcxq	%rax,%rbx
3018	adoxq	%r9,%r8
3019
3020	mulxq	8(%rbp),%rax,%r9
3021	adcxq	%rax,%r8
3022	adoxq	%r10,%r9
3023
3024	mulxq	16(%rbp),%rax,%r10
3025	adcxq	%rax,%r9
3026	adoxq	%r11,%r10
3027
3028	mulxq	24(%rbp),%rax,%r11
3029	adcxq	%rax,%r10
3030	adoxq	%r12,%r11
3031
3032.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
3033	adcxq	%rax,%r11
3034	adoxq	%r13,%r12
3035
3036	mulxq	40(%rbp),%rax,%r13
3037	adcxq	%rax,%r12
3038	adoxq	%r14,%r13
3039
3040	mulxq	48(%rbp),%rax,%r14
3041	movq	%rbx,(%rdi,%rcx,8)
3042	movl	$0,%ebx
3043	adcxq	%rax,%r13
3044	adoxq	%r15,%r14
3045
3046.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
3047	movq	8(%rsi,%rcx,8),%rdx
3048	adcxq	%rax,%r14
3049	adoxq	%rbx,%r15
3050	adcxq	%rbx,%r15
3051
3052.byte	0x67
3053	incq	%rcx
3054	jnz	.Lsqrx8x_loop
3055
3056	leaq	64(%rbp),%rbp
3057	movq	$-8,%rcx
3058	cmpq	8+8(%rsp),%rbp
3059	je	.Lsqrx8x_break
3060
3061	subq	16+8(%rsp),%rbx
3062.byte	0x66
3063	movq	-64(%rsi),%rdx
3064	adcxq	0(%rdi),%r8
3065	adcxq	8(%rdi),%r9
3066	adcq	16(%rdi),%r10
3067	adcq	24(%rdi),%r11
3068	adcq	32(%rdi),%r12
3069	adcq	40(%rdi),%r13
3070	adcq	48(%rdi),%r14
3071	adcq	56(%rdi),%r15
3072	leaq	64(%rdi),%rdi
3073.byte	0x67
3074	sbbq	%rax,%rax
3075	xorl	%ebx,%ebx
3076	movq	%rax,16+8(%rsp)
3077	jmp	.Lsqrx8x_loop
3078
3079.align	32
3080.Lsqrx8x_break:
3081	subq	16+8(%rsp),%r8
3082	movq	24+8(%rsp),%rcx
3083	movq	0(%rsi),%rdx
3084	xorl	%ebp,%ebp
3085	movq	%r8,0(%rdi)
3086	cmpq	%rcx,%rdi
3087	je	.Lsqrx8x_outer_loop
3088
3089	movq	%r9,8(%rdi)
3090	movq	8(%rcx),%r9
3091	movq	%r10,16(%rdi)
3092	movq	16(%rcx),%r10
3093	movq	%r11,24(%rdi)
3094	movq	24(%rcx),%r11
3095	movq	%r12,32(%rdi)
3096	movq	32(%rcx),%r12
3097	movq	%r13,40(%rdi)
3098	movq	40(%rcx),%r13
3099	movq	%r14,48(%rdi)
3100	movq	48(%rcx),%r14
3101	movq	%r15,56(%rdi)
3102	movq	56(%rcx),%r15
3103	movq	%rcx,%rdi
3104	jmp	.Lsqrx8x_outer_loop
3105
3106.align	32
3107.Lsqrx8x_outer_break:
3108	movq	%r9,72(%rdi)
3109.byte	102,72,15,126,217
3110	movq	%r10,80(%rdi)
3111	movq	%r11,88(%rdi)
3112	movq	%r12,96(%rdi)
3113	movq	%r13,104(%rdi)
3114	movq	%r14,112(%rdi)
3115	leaq	48+8(%rsp),%rdi
3116	movq	(%rsi,%rcx,1),%rdx
3117
3118	movq	8(%rdi),%r11
3119	xorq	%r10,%r10
3120	movq	0+8(%rsp),%r9
3121	adoxq	%r11,%r11
3122	movq	16(%rdi),%r12
3123	movq	24(%rdi),%r13
3124
3125
3126.align	32
3127.Lsqrx4x_shift_n_add:
3128	mulxq	%rdx,%rax,%rbx
3129	adoxq	%r12,%r12
3130	adcxq	%r10,%rax
3131.byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
3132.byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
3133	adoxq	%r13,%r13
3134	adcxq	%r11,%rbx
3135	movq	40(%rdi),%r11
3136	movq	%rax,0(%rdi)
3137	movq	%rbx,8(%rdi)
3138
3139	mulxq	%rdx,%rax,%rbx
3140	adoxq	%r10,%r10
3141	adcxq	%r12,%rax
3142	movq	16(%rsi,%rcx,1),%rdx
3143	movq	48(%rdi),%r12
3144	adoxq	%r11,%r11
3145	adcxq	%r13,%rbx
3146	movq	56(%rdi),%r13
3147	movq	%rax,16(%rdi)
3148	movq	%rbx,24(%rdi)
3149
3150	mulxq	%rdx,%rax,%rbx
3151	adoxq	%r12,%r12
3152	adcxq	%r10,%rax
3153	movq	24(%rsi,%rcx,1),%rdx
3154	leaq	32(%rcx),%rcx
3155	movq	64(%rdi),%r10
3156	adoxq	%r13,%r13
3157	adcxq	%r11,%rbx
3158	movq	72(%rdi),%r11
3159	movq	%rax,32(%rdi)
3160	movq	%rbx,40(%rdi)
3161
3162	mulxq	%rdx,%rax,%rbx
3163	adoxq	%r10,%r10
3164	adcxq	%r12,%rax
3165	jrcxz	.Lsqrx4x_shift_n_add_break
3166.byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
3167	adoxq	%r11,%r11
3168	adcxq	%r13,%rbx
3169	movq	80(%rdi),%r12
3170	movq	88(%rdi),%r13
3171	movq	%rax,48(%rdi)
3172	movq	%rbx,56(%rdi)
3173	leaq	64(%rdi),%rdi
3174	nop
3175	jmp	.Lsqrx4x_shift_n_add
3176
3177.align	32
3178.Lsqrx4x_shift_n_add_break:
3179	adcxq	%r13,%rbx
3180	movq	%rax,48(%rdi)
3181	movq	%rbx,56(%rdi)
3182	leaq	64(%rdi),%rdi
3183.byte	102,72,15,126,213
#------------------------------------------------------------------------
# __bn_sqrx8x_reduction — Montgomery reduction of the 2*num-word value in
# the temporary area, 8 limbs per outer iteration, using the BMI2/ADX
# dual-carry-chain idiom (MULX + ADCX/ADOX).
# NOTE(review): register roles inferred from the visible loads below —
# confirm against x86_64-mont5.pl:
#   %rbp          = modulus n[] (walked forward 64 bytes per tail pass)
#   32+8(%rsp)    = n0 (Montgomery constant -1/n[0] mod 2^64)
#   48+8(%rsp)    = first word of the value being reduced
#   %rdi          = cursor into the temporary vector
# Constant-time: no data-dependent branches; loop trip counts depend only
# on num.
#------------------------------------------------------------------------
__bn_sqrx8x_reduction:
	xorl	%eax,%eax			# top-word carry accumulator = 0
	movq	32+8(%rsp),%rbx			# rbx = n0
	movq	48+8(%rsp),%rdx			# rdx = t[0], seed for first multiplier
	leaq	-64(%rbp,%r9,1),%rcx		# rcx = end-of-modulus sentinel

	movq	%rcx,0+8(%rsp)			# save sentinel for the tail tests
	movq	%rdi,8+8(%rsp)			# save end-of-output pointer

	leaq	48+8(%rsp),%rdi			# rdi -> t[0]
	jmp	.Lsqrx8x_reduction_loop

.align	32
.Lsqrx8x_reduction_loop:
	# Load the next 8 limbs t[1..7] into r9..r15; t[0] is in rdx.
	movq	8(%rdi),%r9
	movq	16(%rdi),%r10
	movq	24(%rdi),%r11
	movq	32(%rdi),%r12
	movq	%rdx,%r8			# r8 = t[0]
	imulq	%rbx,%rdx			# rdx = m = t[0]*n0 mod 2^64
	movq	40(%rdi),%r13
	movq	48(%rdi),%r14
	movq	56(%rdi),%r15
	movq	%rax,24+8(%rsp)			# stash carry-in for this 8-limb group

	leaq	64(%rdi),%rdi
	xorq	%rsi,%rsi			# rsi = 0 (carry source; also clears CF/OF)
	movq	$-8,%rcx			# 8 inner iterations, counting up to 0
	jmp	.Lsqrx8x_reduce

.align	32
.Lsqrx8x_reduce:
	# One round: t[0..7] += m * n[0..7]; t[0] is annihilated by design
	# (ADCX drives the CF chain, ADOX the OF chain, interleaved).
	movq	%r8,%rbx
	mulxq	0(%rbp),%rax,%r8		# m*n[0]
	adcxq	%rbx,%rax			# low half + t[0] -> must produce 0
	adoxq	%r9,%r8

	mulxq	8(%rbp),%rbx,%r9		# m*n[1]
	adcxq	%rbx,%r8
	adoxq	%r10,%r9

	mulxq	16(%rbp),%rbx,%r10		# m*n[2]
	adcxq	%rbx,%r9
	adoxq	%r11,%r10

	mulxq	24(%rbp),%rbx,%r11		# m*n[3]
	adcxq	%rbx,%r10
	adoxq	%r12,%r11

.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulxq 32(%rbp),%rbx,%r12 (hand-encoded)
	movq	%rdx,%rax			# rax = current m
	movq	%r8,%rdx			# rdx = updated t[0] of next round
	adcxq	%rbx,%r11
	adoxq	%r13,%r12

	mulxq	32+8(%rsp),%rbx,%rdx		# rbx = next m = t[0]*n0 (high half discarded)
	movq	%rax,%rdx			# restore m for the remaining limbs
	movq	%rax,64+48+8(%rsp,%rcx,8)	# record m for the tail pass below

	mulxq	40(%rbp),%rax,%r13		# m*n[5]
	adcxq	%rax,%r12
	adoxq	%r14,%r13

	mulxq	48(%rbp),%rax,%r14		# m*n[6]
	adcxq	%rax,%r13
	adoxq	%r15,%r14

	mulxq	56(%rbp),%rax,%r15		# m*n[7]
	movq	%rbx,%rdx			# rdx = next round's multiplier m
	adcxq	%rax,%r14
	adoxq	%rsi,%r15			# fold OF chain into top limb (rsi=0)
	adcxq	%rsi,%r15			# fold CF chain into top limb

.byte	0x67,0x67,0x67				# addr-size prefixes: decoder alignment padding
	incq	%rcx
	jnz	.Lsqrx8x_reduce

	movq	%rsi,%rax			# rax = 0
	cmpq	0+8(%rsp),%rbp			# reached end of modulus?
	jae	.Lsqrx8x_no_tail

	# Tail: fold in the next 64 bytes of the temporary vector, then redo
	# the multiply-accumulate with the recorded m values.
	movq	48+8(%rsp),%rdx
	addq	0(%rdi),%r8
	leaq	64(%rbp),%rbp			# advance modulus window
	movq	$-8,%rcx
	adcxq	8(%rdi),%r9
	adcxq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	leaq	64(%rdi),%rdi
	sbbq	%rax,%rax			# rax = -CF (borrow mask from the adds)

	xorq	%rsi,%rsi			# clear CF/OF for the ADCX/ADOX chains
	movq	%rax,16+8(%rsp)			# save carry mask
	jmp	.Lsqrx8x_tail

.align	32
.Lsqrx8x_tail:
	# Same dual-chain round as .Lsqrx8x_reduce, but m comes from the
	# values recorded at 64+48+8(%rsp,...) and results stream out via rbx.
	movq	%r8,%rbx
	mulxq	0(%rbp),%rax,%r8
	adcxq	%rax,%rbx
	adoxq	%r9,%r8

	mulxq	8(%rbp),%rax,%r9
	adcxq	%rax,%r8
	adoxq	%r10,%r9

	mulxq	16(%rbp),%rax,%r10
	adcxq	%rax,%r9
	adoxq	%r11,%r10

	mulxq	24(%rbp),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11

.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulxq 32(%rbp),%rax,%r12 (hand-encoded)
	adcxq	%rax,%r11
	adoxq	%r13,%r12

	mulxq	40(%rbp),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

	mulxq	48(%rbp),%rax,%r14
	adcxq	%rax,%r13
	adoxq	%r15,%r14

	mulxq	56(%rbp),%rax,%r15
	movq	72+48+8(%rsp,%rcx,8),%rdx	# rdx = next recorded m
	adcxq	%rax,%r14
	adoxq	%rsi,%r15
	movq	%rbx,(%rdi,%rcx,8)		# store finished limb
	movq	%r8,%rbx
	adcxq	%rsi,%r15

	incq	%rcx
	jnz	.Lsqrx8x_tail

	cmpq	0+8(%rsp),%rbp			# more modulus words left?
	jae	.Lsqrx8x_tail_done

	subq	16+8(%rsp),%rsi			# re-arm CF from saved carry mask (rsi=0)
	movq	48+8(%rsp),%rdx
	leaq	64(%rbp),%rbp
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	leaq	64(%rdi),%rdi
	sbbq	%rax,%rax			# capture new borrow mask
	subq	$8,%rcx				# rcx = -8 again

	xorq	%rsi,%rsi
	movq	%rax,16+8(%rsp)
	jmp	.Lsqrx8x_tail

.align	32
.Lsqrx8x_tail_done:
	addq	24+8(%rsp),%r8			# fold in carry saved at loop top
	adcq	$0,%r9
	adcq	$0,%r10
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15


	movq	%rsi,%rax			# rax = 0

	subq	16+8(%rsp),%rsi			# set CF from saved carry mask
.Lsqrx8x_no_tail:
	# Final accumulation of the upper half plus top-word carry.
	adcq	0(%rdi),%r8
.byte	102,72,15,126,217			# movq %xmm3,%rcx (presumably -num saved earlier — confirm)
	adcq	8(%rdi),%r9
	movq	56(%rbp),%rsi			# top modulus word (loaded for caller — verify use)
.byte	102,72,15,126,213			# movq %xmm2,%rbp (presumably restore nptr — confirm)
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	adcq	%rax,%rax			# rax = final carry out

	movq	32+8(%rsp),%rbx			# rbx = n0 for next outer iteration
	movq	64(%rdi,%rcx,1),%rdx		# seed t[0] of next 8-limb group

	movq	%r8,0(%rdi)			# write back the reduced limbs
	leaq	64(%rdi),%r8
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	leaq	64(%rdi,%rcx,1),%rdi
	cmpq	8+8(%rsp),%r8			# processed the whole vector?
	jb	.Lsqrx8x_reduction_loop
	.byte	0xf3,0xc3			# rep ret (branch-predictor-friendly return)
3393.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
.align	32
#------------------------------------------------------------------------
# __bn_postx4x_internal — constant-time final step of Montgomery
# multiplication: computes rp[i] = t[i] - n[i] if the subtraction is
# required, else rp[i] = t[i], without branching on secret data.
# NOTE(review): roles inferred from the code — confirm against the .pl:
#   %rbp = modulus n[], %rdi = temporary t[], %rax = borrow (negated to a
#   0/-1 select mask below), %rcx = byte count (>>5 gives 4-limb groups),
#   %xmm1 = output pointer rp (moved into both rdx and rsi).
# Trick: ANDN gives (~(n[i]) & mask); for the first limb n[0] is first
# decremented so that ~(n[0]-1) = -n[0], making the ADC chain compute
# t[] + (-n[]) = t[] - n[] when mask is all-ones, and t[] + 0 otherwise.
#------------------------------------------------------------------------
__bn_postx4x_internal:
	movq	0(%rbp),%r12			# r12 = n[0]
	movq	%rcx,%r10
	movq	%rcx,%r9			# preserve count (restored via negq below)
	negq	%rax				# rax = 0 or all-ones select mask
	sarq	$3+2,%rcx			# rcx = -(number of 4-limb groups)

.byte	102,72,15,126,202			# movq %xmm1,%rdx (rp — TODO confirm)
.byte	102,72,15,126,206			# movq %xmm1,%rsi (rp copy — TODO confirm)
	decq	%r12				# n[0]-1: see ANDN trick above
	movq	8(%rbp),%r13
	xorq	%r8,%r8				# r8 = 0 => first negq %r8 clears CF
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqrx4x_sub_entry

.align	16
.Lsqrx4x_sub:
	movq	0(%rbp),%r12			# load next 4 modulus limbs
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
.Lsqrx4x_sub_entry:
	andnq	%rax,%r12,%r12			# r12 = ~n[i]   & mask (BMI1 ANDN)
	leaq	32(%rbp),%rbp
	andnq	%rax,%r13,%r13			# r13 = ~n[i+1] & mask
	andnq	%rax,%r14,%r14			# r14 = ~n[i+2] & mask
	andnq	%rax,%r15,%r15			# r15 = ~n[i+3] & mask

	negq	%r8				# restore CF from previous group's borrow
	adcq	0(%rdi),%r12			# t[i]   + masked(~n[i]) + carry
	adcq	8(%rdi),%r13
	adcq	16(%rdi),%r14
	adcq	24(%rdi),%r15
	movq	%r12,0(%rdx)			# store result limbs to rp
	leaq	32(%rdi),%rdi
	movq	%r13,8(%rdx)
	sbbq	%r8,%r8				# r8 = -CF: carry out of this group
	movq	%r14,16(%rdx)
	movq	%r15,24(%rdx)
	leaq	32(%rdx),%rdx

	incq	%rcx				# count groups up to zero
	jnz	.Lsqrx4x_sub

	negq	%r9				# restore caller's count sign

	.byte	0xf3,0xc3			# rep ret
.size	__bn_postx4x_internal,.-__bn_postx4x_internal
.globl	bn_get_bits5
.type	bn_get_bits5,@function
.align	16
#------------------------------------------------------------------------
# int bn_get_bits5(const void *a /*rdi*/, int bitpos /*esi*/)
# Returns the 5-bit window starting at bit index `bitpos` of the byte
# array `a` (value 0..31), SysV AMD64 ABI.
# A 16-bit load always holds the whole window when the in-word bit offset
# is <= 11; otherwise the window straddles the word boundary, so the load
# is shifted one byte forward and the offset reduced by 8.  Both cases
# are handled branchlessly with CMOVs.
#------------------------------------------------------------------------
bn_get_bits5:
	leaq	0(%rdi),%r10			# r10 = a      (normal base)
	leaq	1(%rdi),%r11			# r11 = a + 1  (straddle base)
	movl	%esi,%ecx
	shrl	$4,%esi				# esi = 16-bit word index
	andl	$15,%ecx			# ecx = bit offset within that word
	leal	-8(%rcx),%eax			# eax = offset - 8 (straddle case)
	cmpl	$11,%ecx			# window crosses the word boundary?
	cmovaq	%r11,%r10			# ... then read from a+1
	cmoval	%eax,%ecx			# ... with offset reduced by 8
	movzwl	(%r10,%rsi,2),%eax		# zero-extend the 16-bit window source
	shrl	%cl,%eax
	andl	$31,%eax			# keep exactly 5 bits
	.byte	0xf3,0xc3			# rep ret
.size	bn_get_bits5,.-bn_get_bits5
3462
.globl	bn_scatter5
.type	bn_scatter5,@function
.align	16
#------------------------------------------------------------------------
# void bn_scatter5(const BN_ULONG *inp /*rdi*/, size_t num /*esi*/,
#                  void *tbl /*rdx*/, size_t idx /*rcx*/)
# Scatters `num` 64-bit words from inp into column `idx` of a table of
# 32 big-number slots: word j goes to tbl + idx*8 + j*256.  The 256-byte
# stride interleaves the 32 table entries (the layout bn_gather5 reads
# back).  No-op when num == 0.
#------------------------------------------------------------------------
bn_scatter5:
	cmpl	$0,%esi
	jz	.Lscatter_epilogue		# nothing to store
	leaq	(%rdx,%rcx,8),%rdx		# rdx = &tbl[idx] (column base)
.Lscatter:
	movq	(%rdi),%rax
	leaq	8(%rdi),%rdi
	movq	%rax,(%rdx)
	leaq	256(%rdx),%rdx			# next word of this column
	subl	$1,%esi
	jnz	.Lscatter
.Lscatter_epilogue:
	.byte	0xf3,0xc3			# rep ret
.size	bn_scatter5,.-bn_scatter5
3480
.globl	bn_gather5
.type	bn_gather5,@function
.align	32
#------------------------------------------------------------------------
# void bn_gather5(BN_ULONG *out /*rdi*/, size_t num /*esi*/,
#                 void *tbl /*rdx*/, size_t idx /*ecx*/)
# CONSTANT-TIME gather: copies table entry `idx` (0..31) out of the
# 32-way interleaved table written by bn_scatter5, by touching ALL 32
# slots and masking — the memory access pattern is independent of idx,
# defeating cache-timing attacks.  Do not "optimize" into an indexed load.
# Phase 1 builds 32 per-slot 4-byte compare masks on the stack
# (mask[i] = i==idx ? ~0 : 0) via PCMPEQD against incrementing counters.
# Phase 2, per output word, ANDs every slot with its mask and ORs all
# lanes together.
#------------------------------------------------------------------------
bn_gather5:
.LSEH_begin_bn_gather5:

	# Prologue hand-encoded as .byte (fixed encoding; presumably so the
	# Win64 SEH tables can reference exact offsets — TODO confirm).
.byte	0x4c,0x8d,0x14,0x24			# lea (%rsp),%r10  — save original rsp
.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00	# sub $0x108,%rsp  — 264-byte mask area
	leaq	.Linc(%rip),%rax
	andq	$-16,%rsp			# 16-byte align for movdqa stores

	movd	%ecx,%xmm5			# xmm5 = idx
	movdqa	0(%rax),%xmm0			# xmm0 = {0,0,1,1}
	movdqa	16(%rax),%xmm1			# xmm1 = {2,2,2,2} (counter increment)
	leaq	128(%rdx),%r11			# r11 = tbl + 128 (bias for +/-128 offsets)
	leaq	128(%rsp),%rax			# rax = mask area + 128 (same bias)

	pshufd	$0,%xmm5,%xmm5			# broadcast idx to all 4 dwords
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
	# Unrolled mask generation: each PCMPEQD compares a running counter
	# pair {i,i,i+1,i+1} against idx; each PADDD advances a counter by 2.
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0			# mask for slots 0,1
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1			# mask for slots 2,3
	movdqa	%xmm0,-128(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2			# slots 4,5
	movdqa	%xmm1,-112(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3			# slots 6,7
	movdqa	%xmm2,-96(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0			# slots 8,9
	movdqa	%xmm3,-80(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1			# slots 10,11
	movdqa	%xmm0,-64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2			# slots 12,13
	movdqa	%xmm1,-48(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3			# slots 14,15
	movdqa	%xmm2,-32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0			# slots 16,17
	movdqa	%xmm3,-16(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1			# slots 18,19
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2			# slots 20,21
	movdqa	%xmm1,16(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3			# slots 22,23
	movdqa	%xmm2,32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0			# slots 24,25
	movdqa	%xmm3,48(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1			# slots 26,27
	movdqa	%xmm0,64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2			# slots 28,29
	movdqa	%xmm1,80(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3			# slots 30,31
	movdqa	%xmm2,96(%rax)
	movdqa	%xmm4,%xmm2
	movdqa	%xmm3,112(%rax)
	jmp	.Lgather

.align	32
.Lgather:
	# One 64-bit output word per iteration: accumulate (slot AND mask)
	# across all 32 slots into xmm4/xmm5.  Exactly one mask is non-zero.
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	movdqa	-128(%r11),%xmm0
	movdqa	-112(%r11),%xmm1
	movdqa	-96(%r11),%xmm2
	pand	-128(%rax),%xmm0
	movdqa	-80(%r11),%xmm3
	pand	-112(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r11),%xmm0
	movdqa	-48(%r11),%xmm1
	movdqa	-32(%r11),%xmm2
	pand	-64(%rax),%xmm0
	movdqa	-16(%r11),%xmm3
	pand	-48(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r11),%xmm0
	movdqa	16(%r11),%xmm1
	movdqa	32(%r11),%xmm2
	pand	0(%rax),%xmm0
	movdqa	48(%r11),%xmm3
	pand	16(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r11),%xmm0
	movdqa	80(%r11),%xmm1
	movdqa	96(%r11),%xmm2
	pand	64(%rax),%xmm0
	movdqa	112(%r11),%xmm3
	pand	80(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4			# xmm4 = selected 128-bit row
	leaq	256(%r11),%r11			# advance to next word of every slot
	pshufd	$0x4e,%xmm4,%xmm0		# swap qword halves
	por	%xmm4,%xmm0			# fold: low qword = the selected word
	movq	%xmm0,(%rdi)			# emit one 64-bit word
	leaq	8(%rdi),%rdi
	subl	$1,%esi
	jnz	.Lgather

	leaq	(%r10),%rsp			# restore original stack pointer
	.byte	0xf3,0xc3			# rep ret
.LSEH_end_bn_gather5:
.size	bn_gather5,.-bn_gather5
.align	64
# Counter seeds for bn_gather5/bn_mul_mont_gather5 mask generation:
# first 16 bytes = initial dword pair {0,0,1,1}, second 16 bytes =
# per-step increment {2,2,2,2}.
.Linc:
.long	0,0, 1,1
.long	2,2, 2,2
# ASCII identification string (not code):
# "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS
#  by <appro@openssl.org>", NUL-terminated.
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
3649