/* x86_64-mont5.S revision 312826 */
1/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/x86_64-mont5.S 312826 2017-01-26 19:14:14Z jkim $ */
2/* Do not modify. This file is auto-generated from x86_64-mont5.pl. */
3.text
4
5
6
7.globl	bn_mul_mont_gather5
8.type	bn_mul_mont_gather5,@function
9.align	64
10bn_mul_mont_gather5:
11	movl	%r9d,%r9d
12	movq	%rsp,%rax
13	testl	$7,%r9d
14	jnz	.Lmul_enter
15	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
16	jmp	.Lmul4x_enter
17
18.align	16
19.Lmul_enter:
20	movd	8(%rsp),%xmm5
21	pushq	%rbx
22	pushq	%rbp
23	pushq	%r12
24	pushq	%r13
25	pushq	%r14
26	pushq	%r15
27
28	negq	%r9
29	movq	%rsp,%r11
30	leaq	-280(%rsp,%r9,8),%r10
31	negq	%r9
32	andq	$-1024,%r10
33
34
35
36
37
38
39
40	subq	%r10,%r11
41	andq	$-4096,%r11
42	leaq	(%r10,%r11,1),%rsp
43	movq	(%rsp),%r11
44	cmpq	%r10,%rsp
45	ja	.Lmul_page_walk
46	jmp	.Lmul_page_walk_done
47
48.Lmul_page_walk:
49	leaq	-4096(%rsp),%rsp
50	movq	(%rsp),%r11
51	cmpq	%r10,%rsp
52	ja	.Lmul_page_walk
53.Lmul_page_walk_done:
54
55	leaq	.Linc(%rip),%r10
56	movq	%rax,8(%rsp,%r9,8)
57.Lmul_body:
58
59	leaq	128(%rdx),%r12
60	movdqa	0(%r10),%xmm0
61	movdqa	16(%r10),%xmm1
62	leaq	24-112(%rsp,%r9,8),%r10
63	andq	$-16,%r10
64
65	pshufd	$0,%xmm5,%xmm5
66	movdqa	%xmm1,%xmm4
67	movdqa	%xmm1,%xmm2
68	paddd	%xmm0,%xmm1
69	pcmpeqd	%xmm5,%xmm0
70.byte	0x67
71	movdqa	%xmm4,%xmm3
72	paddd	%xmm1,%xmm2
73	pcmpeqd	%xmm5,%xmm1
74	movdqa	%xmm0,112(%r10)
75	movdqa	%xmm4,%xmm0
76
77	paddd	%xmm2,%xmm3
78	pcmpeqd	%xmm5,%xmm2
79	movdqa	%xmm1,128(%r10)
80	movdqa	%xmm4,%xmm1
81
82	paddd	%xmm3,%xmm0
83	pcmpeqd	%xmm5,%xmm3
84	movdqa	%xmm2,144(%r10)
85	movdqa	%xmm4,%xmm2
86
87	paddd	%xmm0,%xmm1
88	pcmpeqd	%xmm5,%xmm0
89	movdqa	%xmm3,160(%r10)
90	movdqa	%xmm4,%xmm3
91	paddd	%xmm1,%xmm2
92	pcmpeqd	%xmm5,%xmm1
93	movdqa	%xmm0,176(%r10)
94	movdqa	%xmm4,%xmm0
95
96	paddd	%xmm2,%xmm3
97	pcmpeqd	%xmm5,%xmm2
98	movdqa	%xmm1,192(%r10)
99	movdqa	%xmm4,%xmm1
100
101	paddd	%xmm3,%xmm0
102	pcmpeqd	%xmm5,%xmm3
103	movdqa	%xmm2,208(%r10)
104	movdqa	%xmm4,%xmm2
105
106	paddd	%xmm0,%xmm1
107	pcmpeqd	%xmm5,%xmm0
108	movdqa	%xmm3,224(%r10)
109	movdqa	%xmm4,%xmm3
110	paddd	%xmm1,%xmm2
111	pcmpeqd	%xmm5,%xmm1
112	movdqa	%xmm0,240(%r10)
113	movdqa	%xmm4,%xmm0
114
115	paddd	%xmm2,%xmm3
116	pcmpeqd	%xmm5,%xmm2
117	movdqa	%xmm1,256(%r10)
118	movdqa	%xmm4,%xmm1
119
120	paddd	%xmm3,%xmm0
121	pcmpeqd	%xmm5,%xmm3
122	movdqa	%xmm2,272(%r10)
123	movdqa	%xmm4,%xmm2
124
125	paddd	%xmm0,%xmm1
126	pcmpeqd	%xmm5,%xmm0
127	movdqa	%xmm3,288(%r10)
128	movdqa	%xmm4,%xmm3
129	paddd	%xmm1,%xmm2
130	pcmpeqd	%xmm5,%xmm1
131	movdqa	%xmm0,304(%r10)
132
133	paddd	%xmm2,%xmm3
134.byte	0x67
135	pcmpeqd	%xmm5,%xmm2
136	movdqa	%xmm1,320(%r10)
137
138	pcmpeqd	%xmm5,%xmm3
139	movdqa	%xmm2,336(%r10)
140	pand	64(%r12),%xmm0
141
142	pand	80(%r12),%xmm1
143	pand	96(%r12),%xmm2
144	movdqa	%xmm3,352(%r10)
145	pand	112(%r12),%xmm3
146	por	%xmm2,%xmm0
147	por	%xmm3,%xmm1
148	movdqa	-128(%r12),%xmm4
149	movdqa	-112(%r12),%xmm5
150	movdqa	-96(%r12),%xmm2
151	pand	112(%r10),%xmm4
152	movdqa	-80(%r12),%xmm3
153	pand	128(%r10),%xmm5
154	por	%xmm4,%xmm0
155	pand	144(%r10),%xmm2
156	por	%xmm5,%xmm1
157	pand	160(%r10),%xmm3
158	por	%xmm2,%xmm0
159	por	%xmm3,%xmm1
160	movdqa	-64(%r12),%xmm4
161	movdqa	-48(%r12),%xmm5
162	movdqa	-32(%r12),%xmm2
163	pand	176(%r10),%xmm4
164	movdqa	-16(%r12),%xmm3
165	pand	192(%r10),%xmm5
166	por	%xmm4,%xmm0
167	pand	208(%r10),%xmm2
168	por	%xmm5,%xmm1
169	pand	224(%r10),%xmm3
170	por	%xmm2,%xmm0
171	por	%xmm3,%xmm1
172	movdqa	0(%r12),%xmm4
173	movdqa	16(%r12),%xmm5
174	movdqa	32(%r12),%xmm2
175	pand	240(%r10),%xmm4
176	movdqa	48(%r12),%xmm3
177	pand	256(%r10),%xmm5
178	por	%xmm4,%xmm0
179	pand	272(%r10),%xmm2
180	por	%xmm5,%xmm1
181	pand	288(%r10),%xmm3
182	por	%xmm2,%xmm0
183	por	%xmm3,%xmm1
184	por	%xmm1,%xmm0
185	pshufd	$0x4e,%xmm0,%xmm1
186	por	%xmm1,%xmm0
187	leaq	256(%r12),%r12
188.byte	102,72,15,126,195
189
190	movq	(%r8),%r8
191	movq	(%rsi),%rax
192
193	xorq	%r14,%r14
194	xorq	%r15,%r15
195
196	movq	%r8,%rbp
197	mulq	%rbx
198	movq	%rax,%r10
199	movq	(%rcx),%rax
200
201	imulq	%r10,%rbp
202	movq	%rdx,%r11
203
204	mulq	%rbp
205	addq	%rax,%r10
206	movq	8(%rsi),%rax
207	adcq	$0,%rdx
208	movq	%rdx,%r13
209
210	leaq	1(%r15),%r15
211	jmp	.L1st_enter
212
213.align	16
214.L1st:
215	addq	%rax,%r13
216	movq	(%rsi,%r15,8),%rax
217	adcq	$0,%rdx
218	addq	%r11,%r13
219	movq	%r10,%r11
220	adcq	$0,%rdx
221	movq	%r13,-16(%rsp,%r15,8)
222	movq	%rdx,%r13
223
224.L1st_enter:
225	mulq	%rbx
226	addq	%rax,%r11
227	movq	(%rcx,%r15,8),%rax
228	adcq	$0,%rdx
229	leaq	1(%r15),%r15
230	movq	%rdx,%r10
231
232	mulq	%rbp
233	cmpq	%r9,%r15
234	jne	.L1st
235
236
237	addq	%rax,%r13
238	adcq	$0,%rdx
239	addq	%r11,%r13
240	adcq	$0,%rdx
241	movq	%r13,-16(%rsp,%r9,8)
242	movq	%rdx,%r13
243	movq	%r10,%r11
244
245	xorq	%rdx,%rdx
246	addq	%r11,%r13
247	adcq	$0,%rdx
248	movq	%r13,-8(%rsp,%r9,8)
249	movq	%rdx,(%rsp,%r9,8)
250
251	leaq	1(%r14),%r14
252	jmp	.Louter
253.align	16
254.Louter:
255	leaq	24+128(%rsp,%r9,8),%rdx
256	andq	$-16,%rdx
257	pxor	%xmm4,%xmm4
258	pxor	%xmm5,%xmm5
259	movdqa	-128(%r12),%xmm0
260	movdqa	-112(%r12),%xmm1
261	movdqa	-96(%r12),%xmm2
262	movdqa	-80(%r12),%xmm3
263	pand	-128(%rdx),%xmm0
264	pand	-112(%rdx),%xmm1
265	por	%xmm0,%xmm4
266	pand	-96(%rdx),%xmm2
267	por	%xmm1,%xmm5
268	pand	-80(%rdx),%xmm3
269	por	%xmm2,%xmm4
270	por	%xmm3,%xmm5
271	movdqa	-64(%r12),%xmm0
272	movdqa	-48(%r12),%xmm1
273	movdqa	-32(%r12),%xmm2
274	movdqa	-16(%r12),%xmm3
275	pand	-64(%rdx),%xmm0
276	pand	-48(%rdx),%xmm1
277	por	%xmm0,%xmm4
278	pand	-32(%rdx),%xmm2
279	por	%xmm1,%xmm5
280	pand	-16(%rdx),%xmm3
281	por	%xmm2,%xmm4
282	por	%xmm3,%xmm5
283	movdqa	0(%r12),%xmm0
284	movdqa	16(%r12),%xmm1
285	movdqa	32(%r12),%xmm2
286	movdqa	48(%r12),%xmm3
287	pand	0(%rdx),%xmm0
288	pand	16(%rdx),%xmm1
289	por	%xmm0,%xmm4
290	pand	32(%rdx),%xmm2
291	por	%xmm1,%xmm5
292	pand	48(%rdx),%xmm3
293	por	%xmm2,%xmm4
294	por	%xmm3,%xmm5
295	movdqa	64(%r12),%xmm0
296	movdqa	80(%r12),%xmm1
297	movdqa	96(%r12),%xmm2
298	movdqa	112(%r12),%xmm3
299	pand	64(%rdx),%xmm0
300	pand	80(%rdx),%xmm1
301	por	%xmm0,%xmm4
302	pand	96(%rdx),%xmm2
303	por	%xmm1,%xmm5
304	pand	112(%rdx),%xmm3
305	por	%xmm2,%xmm4
306	por	%xmm3,%xmm5
307	por	%xmm5,%xmm4
308	pshufd	$0x4e,%xmm4,%xmm0
309	por	%xmm4,%xmm0
310	leaq	256(%r12),%r12
311
312	movq	(%rsi),%rax
313.byte	102,72,15,126,195
314
315	xorq	%r15,%r15
316	movq	%r8,%rbp
317	movq	(%rsp),%r10
318
319	mulq	%rbx
320	addq	%rax,%r10
321	movq	(%rcx),%rax
322	adcq	$0,%rdx
323
324	imulq	%r10,%rbp
325	movq	%rdx,%r11
326
327	mulq	%rbp
328	addq	%rax,%r10
329	movq	8(%rsi),%rax
330	adcq	$0,%rdx
331	movq	8(%rsp),%r10
332	movq	%rdx,%r13
333
334	leaq	1(%r15),%r15
335	jmp	.Linner_enter
336
337.align	16
338.Linner:
339	addq	%rax,%r13
340	movq	(%rsi,%r15,8),%rax
341	adcq	$0,%rdx
342	addq	%r10,%r13
343	movq	(%rsp,%r15,8),%r10
344	adcq	$0,%rdx
345	movq	%r13,-16(%rsp,%r15,8)
346	movq	%rdx,%r13
347
348.Linner_enter:
349	mulq	%rbx
350	addq	%rax,%r11
351	movq	(%rcx,%r15,8),%rax
352	adcq	$0,%rdx
353	addq	%r11,%r10
354	movq	%rdx,%r11
355	adcq	$0,%r11
356	leaq	1(%r15),%r15
357
358	mulq	%rbp
359	cmpq	%r9,%r15
360	jne	.Linner
361
362	addq	%rax,%r13
363	adcq	$0,%rdx
364	addq	%r10,%r13
365	movq	(%rsp,%r9,8),%r10
366	adcq	$0,%rdx
367	movq	%r13,-16(%rsp,%r9,8)
368	movq	%rdx,%r13
369
370	xorq	%rdx,%rdx
371	addq	%r11,%r13
372	adcq	$0,%rdx
373	addq	%r10,%r13
374	adcq	$0,%rdx
375	movq	%r13,-8(%rsp,%r9,8)
376	movq	%rdx,(%rsp,%r9,8)
377
378	leaq	1(%r14),%r14
379	cmpq	%r9,%r14
380	jb	.Louter
381
382	xorq	%r14,%r14
383	movq	(%rsp),%rax
384	leaq	(%rsp),%rsi
385	movq	%r9,%r15
386	jmp	.Lsub
387.align	16
388.Lsub:	sbbq	(%rcx,%r14,8),%rax
389	movq	%rax,(%rdi,%r14,8)
390	movq	8(%rsi,%r14,8),%rax
391	leaq	1(%r14),%r14
392	decq	%r15
393	jnz	.Lsub
394
395	sbbq	$0,%rax
396	xorq	%r14,%r14
397	andq	%rax,%rsi
398	notq	%rax
399	movq	%rdi,%rcx
400	andq	%rax,%rcx
401	movq	%r9,%r15
402	orq	%rcx,%rsi
403.align	16
404.Lcopy:
405	movq	(%rsi,%r14,8),%rax
406	movq	%r14,(%rsp,%r14,8)
407	movq	%rax,(%rdi,%r14,8)
408	leaq	1(%r14),%r14
409	subq	$1,%r15
410	jnz	.Lcopy
411
412	movq	8(%rsp,%r9,8),%rsi
413	movq	$1,%rax
414
415	movq	-48(%rsi),%r15
416	movq	-40(%rsi),%r14
417	movq	-32(%rsi),%r13
418	movq	-24(%rsi),%r12
419	movq	-16(%rsi),%rbp
420	movq	-8(%rsi),%rbx
421	leaq	(%rsi),%rsp
422.Lmul_epilogue:
423	.byte	0xf3,0xc3
424.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
425.type	bn_mul4x_mont_gather5,@function
426.align	32
427bn_mul4x_mont_gather5:
428.byte	0x67
429	movq	%rsp,%rax
430.Lmul4x_enter:
431	andl	$0x80108,%r11d
432	cmpl	$0x80108,%r11d
433	je	.Lmulx4x_enter
434	pushq	%rbx
435	pushq	%rbp
436	pushq	%r12
437	pushq	%r13
438	pushq	%r14
439	pushq	%r15
440.Lmul4x_prologue:
441
442.byte	0x67
443	shll	$3,%r9d
444	leaq	(%r9,%r9,2),%r10
445	negq	%r9
446
447
448
449
450
451
452
453
454
455
456	leaq	-320(%rsp,%r9,2),%r11
457	movq	%rsp,%rbp
458	subq	%rdi,%r11
459	andq	$4095,%r11
460	cmpq	%r11,%r10
461	jb	.Lmul4xsp_alt
462	subq	%r11,%rbp
463	leaq	-320(%rbp,%r9,2),%rbp
464	jmp	.Lmul4xsp_done
465
466.align	32
467.Lmul4xsp_alt:
468	leaq	4096-320(,%r9,2),%r10
469	leaq	-320(%rbp,%r9,2),%rbp
470	subq	%r10,%r11
471	movq	$0,%r10
472	cmovcq	%r10,%r11
473	subq	%r11,%rbp
474.Lmul4xsp_done:
475	andq	$-64,%rbp
476	movq	%rsp,%r11
477	subq	%rbp,%r11
478	andq	$-4096,%r11
479	leaq	(%r11,%rbp,1),%rsp
480	movq	(%rsp),%r10
481	cmpq	%rbp,%rsp
482	ja	.Lmul4x_page_walk
483	jmp	.Lmul4x_page_walk_done
484
485.Lmul4x_page_walk:
486	leaq	-4096(%rsp),%rsp
487	movq	(%rsp),%r10
488	cmpq	%rbp,%rsp
489	ja	.Lmul4x_page_walk
490.Lmul4x_page_walk_done:
491
492	negq	%r9
493
494	movq	%rax,40(%rsp)
495.Lmul4x_body:
496
497	call	mul4x_internal
498
499	movq	40(%rsp),%rsi
500	movq	$1,%rax
501
502	movq	-48(%rsi),%r15
503	movq	-40(%rsi),%r14
504	movq	-32(%rsi),%r13
505	movq	-24(%rsi),%r12
506	movq	-16(%rsi),%rbp
507	movq	-8(%rsi),%rbx
508	leaq	(%rsi),%rsp
509.Lmul4x_epilogue:
510	.byte	0xf3,0xc3
511.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
512
513.type	mul4x_internal,@function
514.align	32
515mul4x_internal:
516	shlq	$5,%r9
517	movd	8(%rax),%xmm5
518	leaq	.Linc(%rip),%rax
519	leaq	128(%rdx,%r9,1),%r13
520	shrq	$5,%r9
521	movdqa	0(%rax),%xmm0
522	movdqa	16(%rax),%xmm1
523	leaq	88-112(%rsp,%r9,1),%r10
524	leaq	128(%rdx),%r12
525
526	pshufd	$0,%xmm5,%xmm5
527	movdqa	%xmm1,%xmm4
528.byte	0x67,0x67
529	movdqa	%xmm1,%xmm2
530	paddd	%xmm0,%xmm1
531	pcmpeqd	%xmm5,%xmm0
532.byte	0x67
533	movdqa	%xmm4,%xmm3
534	paddd	%xmm1,%xmm2
535	pcmpeqd	%xmm5,%xmm1
536	movdqa	%xmm0,112(%r10)
537	movdqa	%xmm4,%xmm0
538
539	paddd	%xmm2,%xmm3
540	pcmpeqd	%xmm5,%xmm2
541	movdqa	%xmm1,128(%r10)
542	movdqa	%xmm4,%xmm1
543
544	paddd	%xmm3,%xmm0
545	pcmpeqd	%xmm5,%xmm3
546	movdqa	%xmm2,144(%r10)
547	movdqa	%xmm4,%xmm2
548
549	paddd	%xmm0,%xmm1
550	pcmpeqd	%xmm5,%xmm0
551	movdqa	%xmm3,160(%r10)
552	movdqa	%xmm4,%xmm3
553	paddd	%xmm1,%xmm2
554	pcmpeqd	%xmm5,%xmm1
555	movdqa	%xmm0,176(%r10)
556	movdqa	%xmm4,%xmm0
557
558	paddd	%xmm2,%xmm3
559	pcmpeqd	%xmm5,%xmm2
560	movdqa	%xmm1,192(%r10)
561	movdqa	%xmm4,%xmm1
562
563	paddd	%xmm3,%xmm0
564	pcmpeqd	%xmm5,%xmm3
565	movdqa	%xmm2,208(%r10)
566	movdqa	%xmm4,%xmm2
567
568	paddd	%xmm0,%xmm1
569	pcmpeqd	%xmm5,%xmm0
570	movdqa	%xmm3,224(%r10)
571	movdqa	%xmm4,%xmm3
572	paddd	%xmm1,%xmm2
573	pcmpeqd	%xmm5,%xmm1
574	movdqa	%xmm0,240(%r10)
575	movdqa	%xmm4,%xmm0
576
577	paddd	%xmm2,%xmm3
578	pcmpeqd	%xmm5,%xmm2
579	movdqa	%xmm1,256(%r10)
580	movdqa	%xmm4,%xmm1
581
582	paddd	%xmm3,%xmm0
583	pcmpeqd	%xmm5,%xmm3
584	movdqa	%xmm2,272(%r10)
585	movdqa	%xmm4,%xmm2
586
587	paddd	%xmm0,%xmm1
588	pcmpeqd	%xmm5,%xmm0
589	movdqa	%xmm3,288(%r10)
590	movdqa	%xmm4,%xmm3
591	paddd	%xmm1,%xmm2
592	pcmpeqd	%xmm5,%xmm1
593	movdqa	%xmm0,304(%r10)
594
595	paddd	%xmm2,%xmm3
596.byte	0x67
597	pcmpeqd	%xmm5,%xmm2
598	movdqa	%xmm1,320(%r10)
599
600	pcmpeqd	%xmm5,%xmm3
601	movdqa	%xmm2,336(%r10)
602	pand	64(%r12),%xmm0
603
604	pand	80(%r12),%xmm1
605	pand	96(%r12),%xmm2
606	movdqa	%xmm3,352(%r10)
607	pand	112(%r12),%xmm3
608	por	%xmm2,%xmm0
609	por	%xmm3,%xmm1
610	movdqa	-128(%r12),%xmm4
611	movdqa	-112(%r12),%xmm5
612	movdqa	-96(%r12),%xmm2
613	pand	112(%r10),%xmm4
614	movdqa	-80(%r12),%xmm3
615	pand	128(%r10),%xmm5
616	por	%xmm4,%xmm0
617	pand	144(%r10),%xmm2
618	por	%xmm5,%xmm1
619	pand	160(%r10),%xmm3
620	por	%xmm2,%xmm0
621	por	%xmm3,%xmm1
622	movdqa	-64(%r12),%xmm4
623	movdqa	-48(%r12),%xmm5
624	movdqa	-32(%r12),%xmm2
625	pand	176(%r10),%xmm4
626	movdqa	-16(%r12),%xmm3
627	pand	192(%r10),%xmm5
628	por	%xmm4,%xmm0
629	pand	208(%r10),%xmm2
630	por	%xmm5,%xmm1
631	pand	224(%r10),%xmm3
632	por	%xmm2,%xmm0
633	por	%xmm3,%xmm1
634	movdqa	0(%r12),%xmm4
635	movdqa	16(%r12),%xmm5
636	movdqa	32(%r12),%xmm2
637	pand	240(%r10),%xmm4
638	movdqa	48(%r12),%xmm3
639	pand	256(%r10),%xmm5
640	por	%xmm4,%xmm0
641	pand	272(%r10),%xmm2
642	por	%xmm5,%xmm1
643	pand	288(%r10),%xmm3
644	por	%xmm2,%xmm0
645	por	%xmm3,%xmm1
646	por	%xmm1,%xmm0
647	pshufd	$0x4e,%xmm0,%xmm1
648	por	%xmm1,%xmm0
649	leaq	256(%r12),%r12
650.byte	102,72,15,126,195
651
652	movq	%r13,16+8(%rsp)
653	movq	%rdi,56+8(%rsp)
654
655	movq	(%r8),%r8
656	movq	(%rsi),%rax
657	leaq	(%rsi,%r9,1),%rsi
658	negq	%r9
659
660	movq	%r8,%rbp
661	mulq	%rbx
662	movq	%rax,%r10
663	movq	(%rcx),%rax
664
665	imulq	%r10,%rbp
666	leaq	64+8(%rsp),%r14
667	movq	%rdx,%r11
668
669	mulq	%rbp
670	addq	%rax,%r10
671	movq	8(%rsi,%r9,1),%rax
672	adcq	$0,%rdx
673	movq	%rdx,%rdi
674
675	mulq	%rbx
676	addq	%rax,%r11
677	movq	8(%rcx),%rax
678	adcq	$0,%rdx
679	movq	%rdx,%r10
680
681	mulq	%rbp
682	addq	%rax,%rdi
683	movq	16(%rsi,%r9,1),%rax
684	adcq	$0,%rdx
685	addq	%r11,%rdi
686	leaq	32(%r9),%r15
687	leaq	32(%rcx),%rcx
688	adcq	$0,%rdx
689	movq	%rdi,(%r14)
690	movq	%rdx,%r13
691	jmp	.L1st4x
692
693.align	32
694.L1st4x:
695	mulq	%rbx
696	addq	%rax,%r10
697	movq	-16(%rcx),%rax
698	leaq	32(%r14),%r14
699	adcq	$0,%rdx
700	movq	%rdx,%r11
701
702	mulq	%rbp
703	addq	%rax,%r13
704	movq	-8(%rsi,%r15,1),%rax
705	adcq	$0,%rdx
706	addq	%r10,%r13
707	adcq	$0,%rdx
708	movq	%r13,-24(%r14)
709	movq	%rdx,%rdi
710
711	mulq	%rbx
712	addq	%rax,%r11
713	movq	-8(%rcx),%rax
714	adcq	$0,%rdx
715	movq	%rdx,%r10
716
717	mulq	%rbp
718	addq	%rax,%rdi
719	movq	(%rsi,%r15,1),%rax
720	adcq	$0,%rdx
721	addq	%r11,%rdi
722	adcq	$0,%rdx
723	movq	%rdi,-16(%r14)
724	movq	%rdx,%r13
725
726	mulq	%rbx
727	addq	%rax,%r10
728	movq	0(%rcx),%rax
729	adcq	$0,%rdx
730	movq	%rdx,%r11
731
732	mulq	%rbp
733	addq	%rax,%r13
734	movq	8(%rsi,%r15,1),%rax
735	adcq	$0,%rdx
736	addq	%r10,%r13
737	adcq	$0,%rdx
738	movq	%r13,-8(%r14)
739	movq	%rdx,%rdi
740
741	mulq	%rbx
742	addq	%rax,%r11
743	movq	8(%rcx),%rax
744	adcq	$0,%rdx
745	movq	%rdx,%r10
746
747	mulq	%rbp
748	addq	%rax,%rdi
749	movq	16(%rsi,%r15,1),%rax
750	adcq	$0,%rdx
751	addq	%r11,%rdi
752	leaq	32(%rcx),%rcx
753	adcq	$0,%rdx
754	movq	%rdi,(%r14)
755	movq	%rdx,%r13
756
757	addq	$32,%r15
758	jnz	.L1st4x
759
760	mulq	%rbx
761	addq	%rax,%r10
762	movq	-16(%rcx),%rax
763	leaq	32(%r14),%r14
764	adcq	$0,%rdx
765	movq	%rdx,%r11
766
767	mulq	%rbp
768	addq	%rax,%r13
769	movq	-8(%rsi),%rax
770	adcq	$0,%rdx
771	addq	%r10,%r13
772	adcq	$0,%rdx
773	movq	%r13,-24(%r14)
774	movq	%rdx,%rdi
775
776	mulq	%rbx
777	addq	%rax,%r11
778	movq	-8(%rcx),%rax
779	adcq	$0,%rdx
780	movq	%rdx,%r10
781
782	mulq	%rbp
783	addq	%rax,%rdi
784	movq	(%rsi,%r9,1),%rax
785	adcq	$0,%rdx
786	addq	%r11,%rdi
787	adcq	$0,%rdx
788	movq	%rdi,-16(%r14)
789	movq	%rdx,%r13
790
791	leaq	(%rcx,%r9,1),%rcx
792
793	xorq	%rdi,%rdi
794	addq	%r10,%r13
795	adcq	$0,%rdi
796	movq	%r13,-8(%r14)
797
798	jmp	.Louter4x
799
800.align	32
801.Louter4x:
802	leaq	16+128(%r14),%rdx
803	pxor	%xmm4,%xmm4
804	pxor	%xmm5,%xmm5
805	movdqa	-128(%r12),%xmm0
806	movdqa	-112(%r12),%xmm1
807	movdqa	-96(%r12),%xmm2
808	movdqa	-80(%r12),%xmm3
809	pand	-128(%rdx),%xmm0
810	pand	-112(%rdx),%xmm1
811	por	%xmm0,%xmm4
812	pand	-96(%rdx),%xmm2
813	por	%xmm1,%xmm5
814	pand	-80(%rdx),%xmm3
815	por	%xmm2,%xmm4
816	por	%xmm3,%xmm5
817	movdqa	-64(%r12),%xmm0
818	movdqa	-48(%r12),%xmm1
819	movdqa	-32(%r12),%xmm2
820	movdqa	-16(%r12),%xmm3
821	pand	-64(%rdx),%xmm0
822	pand	-48(%rdx),%xmm1
823	por	%xmm0,%xmm4
824	pand	-32(%rdx),%xmm2
825	por	%xmm1,%xmm5
826	pand	-16(%rdx),%xmm3
827	por	%xmm2,%xmm4
828	por	%xmm3,%xmm5
829	movdqa	0(%r12),%xmm0
830	movdqa	16(%r12),%xmm1
831	movdqa	32(%r12),%xmm2
832	movdqa	48(%r12),%xmm3
833	pand	0(%rdx),%xmm0
834	pand	16(%rdx),%xmm1
835	por	%xmm0,%xmm4
836	pand	32(%rdx),%xmm2
837	por	%xmm1,%xmm5
838	pand	48(%rdx),%xmm3
839	por	%xmm2,%xmm4
840	por	%xmm3,%xmm5
841	movdqa	64(%r12),%xmm0
842	movdqa	80(%r12),%xmm1
843	movdqa	96(%r12),%xmm2
844	movdqa	112(%r12),%xmm3
845	pand	64(%rdx),%xmm0
846	pand	80(%rdx),%xmm1
847	por	%xmm0,%xmm4
848	pand	96(%rdx),%xmm2
849	por	%xmm1,%xmm5
850	pand	112(%rdx),%xmm3
851	por	%xmm2,%xmm4
852	por	%xmm3,%xmm5
853	por	%xmm5,%xmm4
854	pshufd	$0x4e,%xmm4,%xmm0
855	por	%xmm4,%xmm0
856	leaq	256(%r12),%r12
857.byte	102,72,15,126,195
858
859	movq	(%r14,%r9,1),%r10
860	movq	%r8,%rbp
861	mulq	%rbx
862	addq	%rax,%r10
863	movq	(%rcx),%rax
864	adcq	$0,%rdx
865
866	imulq	%r10,%rbp
867	movq	%rdx,%r11
868	movq	%rdi,(%r14)
869
870	leaq	(%r14,%r9,1),%r14
871
872	mulq	%rbp
873	addq	%rax,%r10
874	movq	8(%rsi,%r9,1),%rax
875	adcq	$0,%rdx
876	movq	%rdx,%rdi
877
878	mulq	%rbx
879	addq	%rax,%r11
880	movq	8(%rcx),%rax
881	adcq	$0,%rdx
882	addq	8(%r14),%r11
883	adcq	$0,%rdx
884	movq	%rdx,%r10
885
886	mulq	%rbp
887	addq	%rax,%rdi
888	movq	16(%rsi,%r9,1),%rax
889	adcq	$0,%rdx
890	addq	%r11,%rdi
891	leaq	32(%r9),%r15
892	leaq	32(%rcx),%rcx
893	adcq	$0,%rdx
894	movq	%rdx,%r13
895	jmp	.Linner4x
896
897.align	32
898.Linner4x:
899	mulq	%rbx
900	addq	%rax,%r10
901	movq	-16(%rcx),%rax
902	adcq	$0,%rdx
903	addq	16(%r14),%r10
904	leaq	32(%r14),%r14
905	adcq	$0,%rdx
906	movq	%rdx,%r11
907
908	mulq	%rbp
909	addq	%rax,%r13
910	movq	-8(%rsi,%r15,1),%rax
911	adcq	$0,%rdx
912	addq	%r10,%r13
913	adcq	$0,%rdx
914	movq	%rdi,-32(%r14)
915	movq	%rdx,%rdi
916
917	mulq	%rbx
918	addq	%rax,%r11
919	movq	-8(%rcx),%rax
920	adcq	$0,%rdx
921	addq	-8(%r14),%r11
922	adcq	$0,%rdx
923	movq	%rdx,%r10
924
925	mulq	%rbp
926	addq	%rax,%rdi
927	movq	(%rsi,%r15,1),%rax
928	adcq	$0,%rdx
929	addq	%r11,%rdi
930	adcq	$0,%rdx
931	movq	%r13,-24(%r14)
932	movq	%rdx,%r13
933
934	mulq	%rbx
935	addq	%rax,%r10
936	movq	0(%rcx),%rax
937	adcq	$0,%rdx
938	addq	(%r14),%r10
939	adcq	$0,%rdx
940	movq	%rdx,%r11
941
942	mulq	%rbp
943	addq	%rax,%r13
944	movq	8(%rsi,%r15,1),%rax
945	adcq	$0,%rdx
946	addq	%r10,%r13
947	adcq	$0,%rdx
948	movq	%rdi,-16(%r14)
949	movq	%rdx,%rdi
950
951	mulq	%rbx
952	addq	%rax,%r11
953	movq	8(%rcx),%rax
954	adcq	$0,%rdx
955	addq	8(%r14),%r11
956	adcq	$0,%rdx
957	movq	%rdx,%r10
958
959	mulq	%rbp
960	addq	%rax,%rdi
961	movq	16(%rsi,%r15,1),%rax
962	adcq	$0,%rdx
963	addq	%r11,%rdi
964	leaq	32(%rcx),%rcx
965	adcq	$0,%rdx
966	movq	%r13,-8(%r14)
967	movq	%rdx,%r13
968
969	addq	$32,%r15
970	jnz	.Linner4x
971
972	mulq	%rbx
973	addq	%rax,%r10
974	movq	-16(%rcx),%rax
975	adcq	$0,%rdx
976	addq	16(%r14),%r10
977	leaq	32(%r14),%r14
978	adcq	$0,%rdx
979	movq	%rdx,%r11
980
981	mulq	%rbp
982	addq	%rax,%r13
983	movq	-8(%rsi),%rax
984	adcq	$0,%rdx
985	addq	%r10,%r13
986	adcq	$0,%rdx
987	movq	%rdi,-32(%r14)
988	movq	%rdx,%rdi
989
990	mulq	%rbx
991	addq	%rax,%r11
992	movq	%rbp,%rax
993	movq	-8(%rcx),%rbp
994	adcq	$0,%rdx
995	addq	-8(%r14),%r11
996	adcq	$0,%rdx
997	movq	%rdx,%r10
998
999	mulq	%rbp
1000	addq	%rax,%rdi
1001	movq	(%rsi,%r9,1),%rax
1002	adcq	$0,%rdx
1003	addq	%r11,%rdi
1004	adcq	$0,%rdx
1005	movq	%r13,-24(%r14)
1006	movq	%rdx,%r13
1007
1008	movq	%rdi,-16(%r14)
1009	leaq	(%rcx,%r9,1),%rcx
1010
1011	xorq	%rdi,%rdi
1012	addq	%r10,%r13
1013	adcq	$0,%rdi
1014	addq	(%r14),%r13
1015	adcq	$0,%rdi
1016	movq	%r13,-8(%r14)
1017
1018	cmpq	16+8(%rsp),%r12
1019	jb	.Louter4x
1020	xorq	%rax,%rax
1021	subq	%r13,%rbp
1022	adcq	%r15,%r15
1023	orq	%r15,%rdi
1024	subq	%rdi,%rax
1025	leaq	(%r14,%r9,1),%rbx
1026	movq	(%rcx),%r12
1027	leaq	(%rcx),%rbp
1028	movq	%r9,%rcx
1029	sarq	$3+2,%rcx
1030	movq	56+8(%rsp),%rdi
1031	decq	%r12
1032	xorq	%r10,%r10
1033	movq	8(%rbp),%r13
1034	movq	16(%rbp),%r14
1035	movq	24(%rbp),%r15
1036	jmp	.Lsqr4x_sub_entry
1037.size	mul4x_internal,.-mul4x_internal
1038.globl	bn_power5
1039.type	bn_power5,@function
1040.align	32
1041bn_power5:
1042	movq	%rsp,%rax
1043	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
1044	andl	$0x80108,%r11d
1045	cmpl	$0x80108,%r11d
1046	je	.Lpowerx5_enter
1047	pushq	%rbx
1048	pushq	%rbp
1049	pushq	%r12
1050	pushq	%r13
1051	pushq	%r14
1052	pushq	%r15
1053.Lpower5_prologue:
1054
1055	shll	$3,%r9d
1056	leal	(%r9,%r9,2),%r10d
1057	negq	%r9
1058	movq	(%r8),%r8
1059
1060
1061
1062
1063
1064
1065
1066
1067	leaq	-320(%rsp,%r9,2),%r11
1068	movq	%rsp,%rbp
1069	subq	%rdi,%r11
1070	andq	$4095,%r11
1071	cmpq	%r11,%r10
1072	jb	.Lpwr_sp_alt
1073	subq	%r11,%rbp
1074	leaq	-320(%rbp,%r9,2),%rbp
1075	jmp	.Lpwr_sp_done
1076
1077.align	32
1078.Lpwr_sp_alt:
1079	leaq	4096-320(,%r9,2),%r10
1080	leaq	-320(%rbp,%r9,2),%rbp
1081	subq	%r10,%r11
1082	movq	$0,%r10
1083	cmovcq	%r10,%r11
1084	subq	%r11,%rbp
1085.Lpwr_sp_done:
1086	andq	$-64,%rbp
1087	movq	%rsp,%r11
1088	subq	%rbp,%r11
1089	andq	$-4096,%r11
1090	leaq	(%r11,%rbp,1),%rsp
1091	movq	(%rsp),%r10
1092	cmpq	%rbp,%rsp
1093	ja	.Lpwr_page_walk
1094	jmp	.Lpwr_page_walk_done
1095
1096.Lpwr_page_walk:
1097	leaq	-4096(%rsp),%rsp
1098	movq	(%rsp),%r10
1099	cmpq	%rbp,%rsp
1100	ja	.Lpwr_page_walk
1101.Lpwr_page_walk_done:
1102
1103	movq	%r9,%r10
1104	negq	%r9
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115	movq	%r8,32(%rsp)
1116	movq	%rax,40(%rsp)
1117.Lpower5_body:
1118.byte	102,72,15,110,207
1119.byte	102,72,15,110,209
1120.byte	102,73,15,110,218
1121.byte	102,72,15,110,226
1122
1123	call	__bn_sqr8x_internal
1124	call	__bn_post4x_internal
1125	call	__bn_sqr8x_internal
1126	call	__bn_post4x_internal
1127	call	__bn_sqr8x_internal
1128	call	__bn_post4x_internal
1129	call	__bn_sqr8x_internal
1130	call	__bn_post4x_internal
1131	call	__bn_sqr8x_internal
1132	call	__bn_post4x_internal
1133
1134.byte	102,72,15,126,209
1135.byte	102,72,15,126,226
1136	movq	%rsi,%rdi
1137	movq	40(%rsp),%rax
1138	leaq	32(%rsp),%r8
1139
1140	call	mul4x_internal
1141
1142	movq	40(%rsp),%rsi
1143	movq	$1,%rax
1144	movq	-48(%rsi),%r15
1145	movq	-40(%rsi),%r14
1146	movq	-32(%rsi),%r13
1147	movq	-24(%rsi),%r12
1148	movq	-16(%rsi),%rbp
1149	movq	-8(%rsi),%rbx
1150	leaq	(%rsi),%rsp
1151.Lpower5_epilogue:
1152	.byte	0xf3,0xc3
1153.size	bn_power5,.-bn_power5
1154
1155.globl	bn_sqr8x_internal
1156.hidden	bn_sqr8x_internal
1157.type	bn_sqr8x_internal,@function
1158.align	32
1159bn_sqr8x_internal:
1160__bn_sqr8x_internal:
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234	leaq	32(%r10),%rbp
1235	leaq	(%rsi,%r9,1),%rsi
1236
1237	movq	%r9,%rcx
1238
1239
1240	movq	-32(%rsi,%rbp,1),%r14
1241	leaq	48+8(%rsp,%r9,2),%rdi
1242	movq	-24(%rsi,%rbp,1),%rax
1243	leaq	-32(%rdi,%rbp,1),%rdi
1244	movq	-16(%rsi,%rbp,1),%rbx
1245	movq	%rax,%r15
1246
1247	mulq	%r14
1248	movq	%rax,%r10
1249	movq	%rbx,%rax
1250	movq	%rdx,%r11
1251	movq	%r10,-24(%rdi,%rbp,1)
1252
1253	mulq	%r14
1254	addq	%rax,%r11
1255	movq	%rbx,%rax
1256	adcq	$0,%rdx
1257	movq	%r11,-16(%rdi,%rbp,1)
1258	movq	%rdx,%r10
1259
1260
1261	movq	-8(%rsi,%rbp,1),%rbx
1262	mulq	%r15
1263	movq	%rax,%r12
1264	movq	%rbx,%rax
1265	movq	%rdx,%r13
1266
1267	leaq	(%rbp),%rcx
1268	mulq	%r14
1269	addq	%rax,%r10
1270	movq	%rbx,%rax
1271	movq	%rdx,%r11
1272	adcq	$0,%r11
1273	addq	%r12,%r10
1274	adcq	$0,%r11
1275	movq	%r10,-8(%rdi,%rcx,1)
1276	jmp	.Lsqr4x_1st
1277
1278.align	32
1279.Lsqr4x_1st:
1280	movq	(%rsi,%rcx,1),%rbx
1281	mulq	%r15
1282	addq	%rax,%r13
1283	movq	%rbx,%rax
1284	movq	%rdx,%r12
1285	adcq	$0,%r12
1286
1287	mulq	%r14
1288	addq	%rax,%r11
1289	movq	%rbx,%rax
1290	movq	8(%rsi,%rcx,1),%rbx
1291	movq	%rdx,%r10
1292	adcq	$0,%r10
1293	addq	%r13,%r11
1294	adcq	$0,%r10
1295
1296
1297	mulq	%r15
1298	addq	%rax,%r12
1299	movq	%rbx,%rax
1300	movq	%r11,(%rdi,%rcx,1)
1301	movq	%rdx,%r13
1302	adcq	$0,%r13
1303
1304	mulq	%r14
1305	addq	%rax,%r10
1306	movq	%rbx,%rax
1307	movq	16(%rsi,%rcx,1),%rbx
1308	movq	%rdx,%r11
1309	adcq	$0,%r11
1310	addq	%r12,%r10
1311	adcq	$0,%r11
1312
1313	mulq	%r15
1314	addq	%rax,%r13
1315	movq	%rbx,%rax
1316	movq	%r10,8(%rdi,%rcx,1)
1317	movq	%rdx,%r12
1318	adcq	$0,%r12
1319
1320	mulq	%r14
1321	addq	%rax,%r11
1322	movq	%rbx,%rax
1323	movq	24(%rsi,%rcx,1),%rbx
1324	movq	%rdx,%r10
1325	adcq	$0,%r10
1326	addq	%r13,%r11
1327	adcq	$0,%r10
1328
1329
1330	mulq	%r15
1331	addq	%rax,%r12
1332	movq	%rbx,%rax
1333	movq	%r11,16(%rdi,%rcx,1)
1334	movq	%rdx,%r13
1335	adcq	$0,%r13
1336	leaq	32(%rcx),%rcx
1337
1338	mulq	%r14
1339	addq	%rax,%r10
1340	movq	%rbx,%rax
1341	movq	%rdx,%r11
1342	adcq	$0,%r11
1343	addq	%r12,%r10
1344	adcq	$0,%r11
1345	movq	%r10,-8(%rdi,%rcx,1)
1346
1347	cmpq	$0,%rcx
1348	jne	.Lsqr4x_1st
1349
1350	mulq	%r15
1351	addq	%rax,%r13
1352	leaq	16(%rbp),%rbp
1353	adcq	$0,%rdx
1354	addq	%r11,%r13
1355	adcq	$0,%rdx
1356
1357	movq	%r13,(%rdi)
1358	movq	%rdx,%r12
1359	movq	%rdx,8(%rdi)
1360	jmp	.Lsqr4x_outer
1361
1362.align	32
1363.Lsqr4x_outer:
1364	movq	-32(%rsi,%rbp,1),%r14
1365	leaq	48+8(%rsp,%r9,2),%rdi
1366	movq	-24(%rsi,%rbp,1),%rax
1367	leaq	-32(%rdi,%rbp,1),%rdi
1368	movq	-16(%rsi,%rbp,1),%rbx
1369	movq	%rax,%r15
1370
1371	mulq	%r14
1372	movq	-24(%rdi,%rbp,1),%r10
1373	addq	%rax,%r10
1374	movq	%rbx,%rax
1375	adcq	$0,%rdx
1376	movq	%r10,-24(%rdi,%rbp,1)
1377	movq	%rdx,%r11
1378
1379	mulq	%r14
1380	addq	%rax,%r11
1381	movq	%rbx,%rax
1382	adcq	$0,%rdx
1383	addq	-16(%rdi,%rbp,1),%r11
1384	movq	%rdx,%r10
1385	adcq	$0,%r10
1386	movq	%r11,-16(%rdi,%rbp,1)
1387
1388	xorq	%r12,%r12
1389
1390	movq	-8(%rsi,%rbp,1),%rbx
1391	mulq	%r15
1392	addq	%rax,%r12
1393	movq	%rbx,%rax
1394	adcq	$0,%rdx
1395	addq	-8(%rdi,%rbp,1),%r12
1396	movq	%rdx,%r13
1397	adcq	$0,%r13
1398
1399	mulq	%r14
1400	addq	%rax,%r10
1401	movq	%rbx,%rax
1402	adcq	$0,%rdx
1403	addq	%r12,%r10
1404	movq	%rdx,%r11
1405	adcq	$0,%r11
1406	movq	%r10,-8(%rdi,%rbp,1)
1407
1408	leaq	(%rbp),%rcx
1409	jmp	.Lsqr4x_inner
1410
1411.align	32
1412.Lsqr4x_inner:
1413	movq	(%rsi,%rcx,1),%rbx
1414	mulq	%r15
1415	addq	%rax,%r13
1416	movq	%rbx,%rax
1417	movq	%rdx,%r12
1418	adcq	$0,%r12
1419	addq	(%rdi,%rcx,1),%r13
1420	adcq	$0,%r12
1421
1422.byte	0x67
1423	mulq	%r14
1424	addq	%rax,%r11
1425	movq	%rbx,%rax
1426	movq	8(%rsi,%rcx,1),%rbx
1427	movq	%rdx,%r10
1428	adcq	$0,%r10
1429	addq	%r13,%r11
1430	adcq	$0,%r10
1431
1432	mulq	%r15
1433	addq	%rax,%r12
1434	movq	%r11,(%rdi,%rcx,1)
1435	movq	%rbx,%rax
1436	movq	%rdx,%r13
1437	adcq	$0,%r13
1438	addq	8(%rdi,%rcx,1),%r12
1439	leaq	16(%rcx),%rcx
1440	adcq	$0,%r13
1441
1442	mulq	%r14
1443	addq	%rax,%r10
1444	movq	%rbx,%rax
1445	adcq	$0,%rdx
1446	addq	%r12,%r10
1447	movq	%rdx,%r11
1448	adcq	$0,%r11
1449	movq	%r10,-8(%rdi,%rcx,1)
1450
1451	cmpq	$0,%rcx
1452	jne	.Lsqr4x_inner
1453
1454.byte	0x67
1455	mulq	%r15
1456	addq	%rax,%r13
1457	adcq	$0,%rdx
1458	addq	%r11,%r13
1459	adcq	$0,%rdx
1460
1461	movq	%r13,(%rdi)
1462	movq	%rdx,%r12
1463	movq	%rdx,8(%rdi)
1464
1465	addq	$16,%rbp
1466	jnz	.Lsqr4x_outer
1467
1468
1469	movq	-32(%rsi),%r14
1470	leaq	48+8(%rsp,%r9,2),%rdi
1471	movq	-24(%rsi),%rax
1472	leaq	-32(%rdi,%rbp,1),%rdi
1473	movq	-16(%rsi),%rbx
1474	movq	%rax,%r15
1475
1476	mulq	%r14
1477	addq	%rax,%r10
1478	movq	%rbx,%rax
1479	movq	%rdx,%r11
1480	adcq	$0,%r11
1481
1482	mulq	%r14
1483	addq	%rax,%r11
1484	movq	%rbx,%rax
1485	movq	%r10,-24(%rdi)
1486	movq	%rdx,%r10
1487	adcq	$0,%r10
1488	addq	%r13,%r11
1489	movq	-8(%rsi),%rbx
1490	adcq	$0,%r10
1491
1492	mulq	%r15
1493	addq	%rax,%r12
1494	movq	%rbx,%rax
1495	movq	%r11,-16(%rdi)
1496	movq	%rdx,%r13
1497	adcq	$0,%r13
1498
1499	mulq	%r14
1500	addq	%rax,%r10
1501	movq	%rbx,%rax
1502	movq	%rdx,%r11
1503	adcq	$0,%r11
1504	addq	%r12,%r10
1505	adcq	$0,%r11
1506	movq	%r10,-8(%rdi)
1507
1508	mulq	%r15
1509	addq	%rax,%r13
1510	movq	-16(%rsi),%rax
1511	adcq	$0,%rdx
1512	addq	%r11,%r13
1513	adcq	$0,%rdx
1514
1515	movq	%r13,(%rdi)
1516	movq	%rdx,%r12
1517	movq	%rdx,8(%rdi)
1518
1519	mulq	%rbx
1520	addq	$16,%rbp
1521	xorq	%r14,%r14
1522	subq	%r9,%rbp
1523	xorq	%r15,%r15
1524
1525	addq	%r12,%rax
1526	adcq	$0,%rdx
1527	movq	%rax,8(%rdi)
1528	movq	%rdx,16(%rdi)
1529	movq	%r15,24(%rdi)
1530
1531	movq	-16(%rsi,%rbp,1),%rax
1532	leaq	48+8(%rsp),%rdi
1533	xorq	%r10,%r10
1534	movq	8(%rdi),%r11
1535
1536	leaq	(%r14,%r10,2),%r12
1537	shrq	$63,%r10
1538	leaq	(%rcx,%r11,2),%r13
1539	shrq	$63,%r11
1540	orq	%r10,%r13
1541	movq	16(%rdi),%r10
1542	movq	%r11,%r14
1543	mulq	%rax
1544	negq	%r15
1545	movq	24(%rdi),%r11
1546	adcq	%rax,%r12
1547	movq	-8(%rsi,%rbp,1),%rax
1548	movq	%r12,(%rdi)
1549	adcq	%rdx,%r13
1550
1551	leaq	(%r14,%r10,2),%rbx
1552	movq	%r13,8(%rdi)
1553	sbbq	%r15,%r15
1554	shrq	$63,%r10
1555	leaq	(%rcx,%r11,2),%r8
1556	shrq	$63,%r11
1557	orq	%r10,%r8
1558	movq	32(%rdi),%r10
1559	movq	%r11,%r14
1560	mulq	%rax
1561	negq	%r15
1562	movq	40(%rdi),%r11
1563	adcq	%rax,%rbx
1564	movq	0(%rsi,%rbp,1),%rax
1565	movq	%rbx,16(%rdi)
1566	adcq	%rdx,%r8
1567	leaq	16(%rbp),%rbp
1568	movq	%r8,24(%rdi)
1569	sbbq	%r15,%r15
1570	leaq	64(%rdi),%rdi
1571	jmp	.Lsqr4x_shift_n_add
1572
1573.align	32
1574.Lsqr4x_shift_n_add:
1575	leaq	(%r14,%r10,2),%r12
1576	shrq	$63,%r10
1577	leaq	(%rcx,%r11,2),%r13
1578	shrq	$63,%r11
1579	orq	%r10,%r13
1580	movq	-16(%rdi),%r10
1581	movq	%r11,%r14
1582	mulq	%rax
1583	negq	%r15
1584	movq	-8(%rdi),%r11
1585	adcq	%rax,%r12
1586	movq	-8(%rsi,%rbp,1),%rax
1587	movq	%r12,-32(%rdi)
1588	adcq	%rdx,%r13
1589
1590	leaq	(%r14,%r10,2),%rbx
1591	movq	%r13,-24(%rdi)
1592	sbbq	%r15,%r15
1593	shrq	$63,%r10
1594	leaq	(%rcx,%r11,2),%r8
1595	shrq	$63,%r11
1596	orq	%r10,%r8
1597	movq	0(%rdi),%r10
1598	movq	%r11,%r14
1599	mulq	%rax
1600	negq	%r15
1601	movq	8(%rdi),%r11
1602	adcq	%rax,%rbx
1603	movq	0(%rsi,%rbp,1),%rax
1604	movq	%rbx,-16(%rdi)
1605	adcq	%rdx,%r8
1606
1607	leaq	(%r14,%r10,2),%r12
1608	movq	%r8,-8(%rdi)
1609	sbbq	%r15,%r15
1610	shrq	$63,%r10
1611	leaq	(%rcx,%r11,2),%r13
1612	shrq	$63,%r11
1613	orq	%r10,%r13
1614	movq	16(%rdi),%r10
1615	movq	%r11,%r14
1616	mulq	%rax
1617	negq	%r15
1618	movq	24(%rdi),%r11
1619	adcq	%rax,%r12
1620	movq	8(%rsi,%rbp,1),%rax
1621	movq	%r12,0(%rdi)
1622	adcq	%rdx,%r13
1623
1624	leaq	(%r14,%r10,2),%rbx
1625	movq	%r13,8(%rdi)
1626	sbbq	%r15,%r15
1627	shrq	$63,%r10
1628	leaq	(%rcx,%r11,2),%r8
1629	shrq	$63,%r11
1630	orq	%r10,%r8
1631	movq	32(%rdi),%r10
1632	movq	%r11,%r14
1633	mulq	%rax
1634	negq	%r15
1635	movq	40(%rdi),%r11
1636	adcq	%rax,%rbx
1637	movq	16(%rsi,%rbp,1),%rax
1638	movq	%rbx,16(%rdi)
1639	adcq	%rdx,%r8
1640	movq	%r8,24(%rdi)
1641	sbbq	%r15,%r15
1642	leaq	64(%rdi),%rdi
1643	addq	$32,%rbp
1644	jnz	.Lsqr4x_shift_n_add
1645
1646	leaq	(%r14,%r10,2),%r12
1647.byte	0x67
1648	shrq	$63,%r10
1649	leaq	(%rcx,%r11,2),%r13
1650	shrq	$63,%r11
1651	orq	%r10,%r13
1652	movq	-16(%rdi),%r10
1653	movq	%r11,%r14
1654	mulq	%rax
1655	negq	%r15
1656	movq	-8(%rdi),%r11
1657	adcq	%rax,%r12
1658	movq	-8(%rsi),%rax
1659	movq	%r12,-32(%rdi)
1660	adcq	%rdx,%r13
1661
1662	leaq	(%r14,%r10,2),%rbx
1663	movq	%r13,-24(%rdi)
1664	sbbq	%r15,%r15
1665	shrq	$63,%r10
1666	leaq	(%rcx,%r11,2),%r8
1667	shrq	$63,%r11
1668	orq	%r10,%r8
1669	mulq	%rax
1670	negq	%r15
1671	adcq	%rax,%rbx
1672	adcq	%rdx,%r8
1673	movq	%rbx,-16(%rdi)
1674	movq	%r8,-8(%rdi)
1675.byte	102,72,15,126,213
#-----------------------------------------------------------------------
# __bn_sqr8x_reduction
# Montgomery reduction of the (2*num)-word intermediate result produced
# by the squaring code above, 8 limbs per outer iteration, using the
# classic mul/adc instruction mix (non-MULX path).
#
# Register/stack usage visible in this block:
#   %rbp        -> modulus n[], read 8 limbs at a time
#   %rdi        -> current window of the intermediate result on the stack
#   32+8(%rsp)  =  presumably the Montgomery constant n0 (-n^-1 mod 2^64);
#                  each round multiplies the low limb by it (see imulq)
#   0+8(%rsp)   =  end-of-modulus sentinel, 8+8(%rsp) = top-of-result ptr
#   %r8-%r15    =  8-limb accumulator window
# Falls out with the reduced value stored back through %rdi and
# returns via `rep ret` (.byte 0xf3,0xc3); .size closes the enclosing
# bn_sqr8x_internal, which falls through into this label.
#-----------------------------------------------------------------------
__bn_sqr8x_reduction:
	xorq	%rax,%rax                # %rax accumulates the final top carry
	leaq	(%r9,%rbp,1),%rcx        # %rcx = &n[num] (end of modulus)
	leaq	48+8(%rsp,%r9,2),%rdx    # %rdx = top of the 2*num-word result
	movq	%rcx,0+8(%rsp)
	leaq	48+8(%rsp,%r9,1),%rdi
	movq	%rdx,8+8(%rsp)
	negq	%r9
	jmp	.L8x_reduction_loop

.align	32
.L8x_reduction_loop:
	leaq	(%rdi,%r9,1),%rdi
.byte	0x66                             # padding prefix from the generator
	movq	0(%rdi),%rbx             # load the next 8 limbs to reduce
	movq	8(%rdi),%r9
	movq	16(%rdi),%r10
	movq	24(%rdi),%r11
	movq	32(%rdi),%r12
	movq	40(%rdi),%r13
	movq	48(%rdi),%r14
	movq	56(%rdi),%r15
	movq	%rax,(%rdx)              # store running top carry
	leaq	64(%rdi),%rdi

.byte	0x67                             # padding prefix from the generator
	movq	%rbx,%r8
	imulq	32+8(%rsp),%rbx          # m = tp[0] * n0 (mod 2^64)
	movq	0(%rbp),%rax
	movl	$8,%ecx                  # 8 inner rounds per 8-limb window
	jmp	.L8x_reduce

.align	32
.L8x_reduce:
	# One round: add m * n[0..7] into the window; the low limb cancels
	# (negq %r8 sets CF so the discarded low product is accounted for).
	mulq	%rbx
	movq	8(%rbp),%rax
	negq	%r8
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	movq	%rbx,48-8+8(%rsp,%rcx,8) # stash m for the tail pass
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	32+8(%rsp),%rsi          # reload n0 for next m
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	imulq	%r8,%rsi                 # next m = new low limb * n0
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rbp),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	%rsi,%rbx                # switch to the next multiplier m
	addq	%rax,%r15
	movq	0(%rbp),%rax
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	.L8x_reduce

	leaq	64(%rbp),%rbp            # advance to the next 8 modulus limbs
	xorq	%rax,%rax
	movq	8+8(%rsp),%rdx
	cmpq	0+8(%rsp),%rbp           # past the end of n[]?
	jae	.L8x_no_tail

.byte	0x66                             # padding prefix from the generator
	addq	0(%rdi),%r8              # fold in the next result window
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi                # %rsi = -carry (borrow mask)

	movq	48+56+8(%rsp),%rbx       # reload first stashed multiplier m
	movl	$8,%ecx
	movq	0(%rbp),%rax
	jmp	.L8x_tail

.align	32
.L8x_tail:
	# Tail pass: continue m * n[] products across the remaining modulus
	# limbs, one stashed m per round (reloaded via %rcx index below).
	mulq	%rbx
	addq	%rax,%r8
	movq	8(%rbp),%rax
	movq	%r8,(%rdi)
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	leaq	8(%rdi),%rdi
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rbp),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	48-16+8(%rsp,%rcx,8),%rbx # next stashed multiplier m
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	0(%rbp),%rax
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	.L8x_tail

	leaq	64(%rbp),%rbp
	movq	8+8(%rsp),%rdx
	cmpq	0+8(%rsp),%rbp           # more modulus limbs left?
	jae	.L8x_tail_done

	movq	48+56+8(%rsp),%rbx
	negq	%rsi                     # restore saved carry into CF
	movq	0(%rbp),%rax
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi                # re-save carry as mask

	movl	$8,%ecx
	jmp	.L8x_tail

.align	32
.L8x_tail_done:
	xorq	%rax,%rax
	addq	(%rdx),%r8               # add back the stored top carry
	adcq	$0,%r9
	adcq	$0,%r10
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15
	adcq	$0,%rax                  # collect overall carry in %rax

	negq	%rsi                     # restore saved carry into CF
.L8x_no_tail:
	adcq	0(%rdi),%r8              # final fold of the result window
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	adcq	$0,%rax
	movq	-8(%rbp),%rcx            # %rcx = n[num-1] (top modulus limb)
	xorq	%rsi,%rsi

.byte	102,72,15,126,213                # movq %xmm2,%rbp (restore saved ptr)

	movq	%r8,0(%rdi)              # store reduced 8-limb window
	movq	%r9,8(%rdi)
.byte	102,73,15,126,217                # movq %xmm3,%r9 (restore saved num)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)
	leaq	64(%rdi),%rdi

	cmpq	%rdx,%rdi                # reached the top of the result?
	jb	.L8x_reduction_loop
	.byte	0xf3,0xc3                # rep ret (avoids AMD branch penalty)
.size	bn_sqr8x_internal,.-bn_sqr8x_internal
#-----------------------------------------------------------------------
# __bn_post4x_internal
# Branch-free final step of Montgomery reduction: conditionally
# subtracts the modulus from the reduced value, 4 limbs per iteration,
# without a data-dependent branch (constant-time).
#
# In:  %rbp = modulus n[], %rdi = output rp (via xmm1), %r9 = -num,
#      %rax = top-word carry (0 or 1) -> negated into an all-ones/zero
#      mask, %rbx = source (tp) via the lea below.
# The not/and/adc pattern adds either (-n[]) or 0 to tp[], i.e.
# performs tp - n only when the mask is all-ones.
#-----------------------------------------------------------------------
.type	__bn_post4x_internal,@function
.align	32
__bn_post4x_internal:
	movq	0(%rbp),%r12
	leaq	(%rdi,%r9,1),%rbx        # %rbx = tp (source to subtract from)
	movq	%r9,%rcx
.byte	102,72,15,126,207                # movq %xmm1,%rdi (restore rp)
	negq	%rax                     # carry -> 0 / all-ones mask
.byte	102,72,15,126,206                # movq %xmm1,%rsi
	sarq	$3+2,%rcx                # %rcx = num/32 (negative loop count)
	decq	%r12                     # dec+not below yields -n[0]:
	xorq	%r10,%r10                #   first adc has no carry-in
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqr4x_sub_entry

.align	16
.Lsqr4x_sub:
	movq	0(%rbp),%r12             # next 4 modulus limbs
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
.Lsqr4x_sub_entry:
	leaq	32(%rbp),%rbp
	notq	%r12                     # ~n (with dec above: -n on entry)
	notq	%r13
	notq	%r14
	notq	%r15
	andq	%rax,%r12                # mask: subtract n only if carry set
	andq	%rax,%r13
	andq	%rax,%r14
	andq	%rax,%r15

	negq	%r10                     # restore borrow from last iteration
	adcq	0(%rbx),%r12
	adcq	8(%rbx),%r13
	adcq	16(%rbx),%r14
	adcq	24(%rbx),%r15
	movq	%r12,0(%rdi)             # write 4 result limbs to rp
	leaq	32(%rbx),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r10,%r10                # save borrow for next iteration
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi

	incq	%rcx                     # negative count -> zero terminates
	jnz	.Lsqr4x_sub

	movq	%r9,%r10
	negq	%r9                      # restore %r9 = num (positive)
	.byte	0xf3,0xc3                # rep ret
.size	__bn_post4x_internal,.-__bn_post4x_internal
#-----------------------------------------------------------------------
# bn_from_montgomery
# Public entry: convert out of Montgomery form. Only handles sizes that
# are a multiple of 8 limbs — those tail-jump to bn_from_mont8x; any
# other size returns 0 (failure) so the caller falls back elsewhere.
# In:  %r9d = num (limb count); other args passed through unchanged.
# Out: %eax = 0 if num is not a multiple of 8 (else bn_from_mont8x's
#      return value, 1 on its success path).
#-----------------------------------------------------------------------
.globl	bn_from_montgomery
.type	bn_from_montgomery,@function
.align	32
bn_from_montgomery:
	testl	$7,%r9d                  # num % 8 == 0 ?
	jz	bn_from_mont8x           # tail-jump; args still in place
	xorl	%eax,%eax                # unsupported size -> return 0
	.byte	0xf3,0xc3                # rep ret
.size	bn_from_montgomery,.-bn_from_montgomery
1999
#-----------------------------------------------------------------------
# bn_from_mont8x
# Montgomery-form to ordinary-form conversion for num % 8 == 0.
# Copies the input into the stack frame padded with a zero upper half
# (.Lmul_by_1), then runs one Montgomery reduction — the MULX/ADX
# variant when CPUID.7.EBX advertises BMI1+BMI2+ADX (mask 0x80108 =
# bits 3/8/19), else the classic variant — and wipes the frame.
#
# SysV args: %rdi=rp %rsi=ap %rdx=bp %rcx=np %r8=&n0 %r9d=num.
# Saves all callee-saved GPRs; original %rsp kept in %rax and stored
# at 40(%rsp) for the epilogue. Returns 1 in %rax.
#-----------------------------------------------------------------------
.type	bn_from_mont8x,@function
.align	32
bn_from_mont8x:
.byte	0x67                             # padding prefix from the generator
	movq	%rsp,%rax                # remember original stack pointer
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
.Lfrom_prologue:

	shll	$3,%r9d                  # num *= 8 (bytes)
	leaq	(%r9,%r9,2),%r10         # %r10 = 3*num bytes
	negq	%r9
	movq	(%r8),%r8                # dereference &n0

	# Pick a frame location that avoids a 4K-aliasing conflict with rp
	# (compare low 12 bits of the distance to rp against 3*num).
	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lfrom_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lfrom_sp_done

.align	32
.Lfrom_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lfrom_sp_done:
	andq	$-64,%rbp                # 64-byte align the new frame
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lfrom_page_walk
	jmp	.Lfrom_page_walk_done

	# Touch every page down to the new %rsp so the guard page is never
	# skipped (stack-probe loop added in r312826).
.Lfrom_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lfrom_page_walk
.Lfrom_page_walk_done:

	movq	%r9,%r10
	negq	%r9

	movq	%r8,32(%rsp)             # save n0
	movq	%rax,40(%rsp)            # save original %rsp for epilogue
.Lfrom_body:
	movq	%r9,%r11
	leaq	48(%rsp),%rax
	pxor	%xmm0,%xmm0
	jmp	.Lmul_by_1

.align	32
.Lmul_by_1:
	# Copy a[] into the low half of the frame and zero the high half,
	# 64 bytes per iteration (this is the "multiply by 1" setup).
	movdqu	(%rsi),%xmm1
	movdqu	16(%rsi),%xmm2
	movdqu	32(%rsi),%xmm3
	movdqa	%xmm0,(%rax,%r9,1)       # zero upper-half slot
	movdqu	48(%rsi),%xmm4
	movdqa	%xmm0,16(%rax,%r9,1)
.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00  # leaq 64(%rsi),%rsi (long form)
	movdqa	%xmm1,(%rax)             # copy lower-half slot
	movdqa	%xmm0,32(%rax,%r9,1)
	movdqa	%xmm2,16(%rax)
	movdqa	%xmm0,48(%rax,%r9,1)
	movdqa	%xmm3,32(%rax)
	movdqa	%xmm4,48(%rax)
	leaq	64(%rax),%rax
	subq	$64,%r11
	jnz	.Lmul_by_1

.byte	102,72,15,110,207                # movq %rdi,%xmm1 (save rp)
.byte	102,72,15,110,209                # movq %rcx,%xmm2 (save np)
.byte	0x67                             # padding prefix from the generator
	movq	%rcx,%rbp                # %rbp = modulus for reduction
.byte	102,73,15,110,218                # movq %r10,%xmm3 (save num)
	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
	andl	$0x80108,%r11d           # BMI1|BMI2|ADX feature bits
	cmpl	$0x80108,%r11d
	jne	.Lfrom_mont_nox          # any missing -> classic path

	leaq	(%rax,%r9,1),%rdi
	call	__bn_sqrx8x_reduction    # MULX/ADX reduction
	call	__bn_postx4x_internal    # conditional final subtract

	pxor	%xmm0,%xmm0
	leaq	48(%rsp),%rax
	movq	40(%rsp),%rsi
	jmp	.Lfrom_mont_zero

.align	32
.Lfrom_mont_nox:
	call	__bn_sqr8x_reduction     # classic mul/adc reduction
	call	__bn_post4x_internal

	pxor	%xmm0,%xmm0
	leaq	48(%rsp),%rax
	movq	40(%rsp),%rsi
	jmp	.Lfrom_mont_zero

.align	32
.Lfrom_mont_zero:
	# Wipe the temporary frame (it held secret-dependent data).
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm0,16(%rax)
	movdqa	%xmm0,32(%rax)
	movdqa	%xmm0,48(%rax)
	leaq	64(%rax),%rax
	subq	$32,%r9
	jnz	.Lfrom_mont_zero

	movq	$1,%rax                  # return 1 (success)
	movq	-48(%rsi),%r15           # restore callee-saved registers
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp              # restore original stack pointer
.Lfrom_epilogue:
	.byte	0xf3,0xc3                # rep ret
.size	bn_from_mont8x,.-bn_from_mont8x
#-----------------------------------------------------------------------
# bn_mulx4x_mont_gather5
# MULX/ADX entry point for Montgomery multiplication with the
# cache-timing-safe gather of the 5-bit-indexed power of b. Sets up an
# aligned, page-probed stack frame and delegates to mulx4x_internal.
#
# SysV args: %rdi=rp %rsi=ap %rdx=bp %rcx=np %r8=&n0 %r9d=num.
# Original %rsp saved in %rax / 40(%rsp); n0 saved at 32(%rsp).
# Returns 1 in %rax.
#-----------------------------------------------------------------------
.type	bn_mulx4x_mont_gather5,@function
.align	32
bn_mulx4x_mont_gather5:
	movq	%rsp,%rax                # remember original stack pointer
.Lmulx4x_enter:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
.Lmulx4x_prologue:

	shll	$3,%r9d                  # num *= 8 (bytes)
	leaq	(%r9,%r9,2),%r10         # %r10 = 3*num bytes
	negq	%r9
	movq	(%r8),%r8                # dereference &n0

	# Frame placement avoiding 4K aliasing with rp (same scheme as
	# bn_from_mont8x above).
	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lmulx4xsp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lmulx4xsp_done

.Lmulx4xsp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lmulx4xsp_done:
	andq	$-64,%rbp                # 64-byte align the new frame
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
	jmp	.Lmulx4x_page_walk_done

	# Stack-probe loop: touch each page so the guard page is hit.
.Lmulx4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
.Lmulx4x_page_walk_done:

	movq	%r8,32(%rsp)             # save n0 for mulx4x_internal
	movq	%rax,40(%rsp)            # save original %rsp
.Lmulx4x_body:
	call	mulx4x_internal

	movq	40(%rsp),%rsi
	movq	$1,%rax                  # return 1 (success)

	movq	-48(%rsi),%r15           # restore callee-saved registers
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp              # restore original stack pointer
.Lmulx4x_epilogue:
	.byte	0xf3,0xc3                # rep ret
.size	bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
2242
#-----------------------------------------------------------------------
# mulx4x_internal
# Core MULX/ADCX/ADOX Montgomery multiplication, 4 limbs per step.
# The b[i] operand is fetched from a table of 32 pre-computed entries
# with a constant-time gather: pcmpeqd against .Linc builds 32 sixteen-
# byte select masks on the stack (written at 112..352(%r10) and up),
# then every table entry is pand/por-combined so memory access pattern
# is independent of the secret index (cache-timing defence).
#
# In:  %rsi=ap %rdx=bp(table) %rcx=np %rdi=rp %r9=num (bytes),
#      %rax -> struct whose 8(%rax) is the gather index,
#      32+8(%rsp)=n0, 8(%rsp) gets num.
# Uses dual carry chains: adcx (CF) for the a[]*b products, adox (OF)
# for folding in the modulus multiples — the defining MULX/ADX idiom.
# Exits by tail-jumping to .Lsqrx4x_sub_entry (conditional subtract).
#-----------------------------------------------------------------------
.type	mulx4x_internal,@function
.align	32
mulx4x_internal:
	movq	%r9,8(%rsp)              # save num
	movq	%r9,%r10
	negq	%r9
	shlq	$5,%r9
	negq	%r10
	leaq	128(%rdx,%r9,1),%r13     # end-of-table sentinel
	shrq	$5+5,%r9
	movd	8(%rax),%xmm5            # gather index, broadcast below
	subq	$1,%r9
	leaq	.Linc(%rip),%rax
	movq	%r13,16+8(%rsp)
	movq	%r9,24+8(%rsp)           # inner-loop trip count
	movq	%rdi,56+8(%rsp)          # save rp
	movdqa	0(%rax),%xmm0            # .Linc: {0,1} then {2,2} step
	movdqa	16(%rax),%xmm1
	leaq	88-112(%rsp,%r10,1),%r10 # mask scratch area
	leaq	128(%rdx),%rdi           # %rdi = &bp_table[middle]

	# Build 32 compare masks: xmm5 = index splat; each pcmpeqd leaves
	# all-ones in the lane whose counter equals the index.
	pshufd	$0,%xmm5,%xmm5
	movdqa	%xmm1,%xmm4
.byte	0x67                             # padding prefix from the generator
	movdqa	%xmm1,%xmm2
.byte	0x67                             # padding prefix from the generator
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,112(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,128(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,144(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,160(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,176(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,192(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,208(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,224(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,240(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,256(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,272(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,288(%r10)
	movdqa	%xmm4,%xmm3
.byte	0x67                             # padding prefix from the generator
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,304(%r10)

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,320(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,336(%r10)

	# First gather: AND every table entry with its mask, OR together —
	# touches all 32 entries regardless of the index (constant-time).
	pand	64(%rdi),%xmm0
	pand	80(%rdi),%xmm1
	pand	96(%rdi),%xmm2
	movdqa	%xmm3,352(%r10)
	pand	112(%rdi),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-128(%rdi),%xmm4
	movdqa	-112(%rdi),%xmm5
	movdqa	-96(%rdi),%xmm2
	pand	112(%r10),%xmm4
	movdqa	-80(%rdi),%xmm3
	pand	128(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	144(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	160(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-64(%rdi),%xmm4
	movdqa	-48(%rdi),%xmm5
	movdqa	-32(%rdi),%xmm2
	pand	176(%r10),%xmm4
	movdqa	-16(%rdi),%xmm3
	pand	192(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	208(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	224(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	0(%rdi),%xmm4
	movdqa	16(%rdi),%xmm5
	movdqa	32(%rdi),%xmm2
	pand	240(%r10),%xmm4
	movdqa	48(%rdi),%xmm3
	pand	256(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	272(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	288(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	pxor	%xmm1,%xmm0
	pshufd	$0x4e,%xmm0,%xmm1        # fold high qword onto low
	por	%xmm1,%xmm0
	leaq	256(%rdi),%rdi
.byte	102,72,15,126,194                # movq %xmm0,%rdx (gathered b[0])

	leaq	64+32+8(%rsp),%rbx       # %rbx -> result window tp[]

	# First 4-limb column: a[0..3] * b[0], then fold in m*n[0..3]
	# where m = t[0]*n0 (Montgomery multiplier).
	movq	%rdx,%r9                 # keep b[0] while %rdx holds m
	mulxq	0(%rsi),%r8,%rax
	mulxq	8(%rsi),%r11,%r12
	addq	%rax,%r11
	mulxq	16(%rsi),%rax,%r13
	adcq	%rax,%r12
	adcq	$0,%r13
	mulxq	24(%rsi),%rax,%r14

	movq	%r8,%r15
	imulq	32+8(%rsp),%r8           # m = t[0] * n0
	xorq	%rbp,%rbp                # %rbp = constant 0; clears CF/OF
	movq	%r8,%rdx

	movq	%rdi,8+8(%rsp)           # save table cursor

	leaq	32(%rsi),%rsi
	adcxq	%rax,%r13
	adcxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15                # low limb cancels (into %r15)
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	movq	24+8(%rsp),%rdi          # inner trip count
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx                 # back to multiplier b[0]
	movq	%r11,-24(%rbx)
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r12,-16(%rbx)
	jmp	.Lmulx4x_1st

.align	32
.Lmulx4x_1st:
	# First pass over a[]: accumulate a[j..j+3]*b[0] (CF chain) and
	# m*n[j..j+3] (OF chain) into the result window.
	adcxq	%rbp,%r15
	mulxq	0(%rsi),%r10,%rax
	adcxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
.byte	0x67,0x67                        # padding prefixes from the generator
	movq	%r8,%rdx                 # switch to multiplier m
	adcxq	%rax,%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-32(%rbx)
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx                 # back to multiplier b[0]
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_1st

	movq	8(%rsp),%rax             # %rax = num (bytes)
	adcq	%rbp,%r15
	leaq	(%rsi,%rax,1),%rsi       # rewind a[] pointer
	addq	%r15,%r14
	movq	8+8(%rsp),%rdi           # restore table cursor
	adcq	%rbp,%rbp                # %rbp = top carry
	movq	%r14,-8(%rbx)
	jmp	.Lmulx4x_outer

.align	32
.Lmulx4x_outer:
	# Gather the next b[i] from the table, again touching every entry
	# (the masks written earlier are reused via 256(%r10)-relative
	# addressing as %rbx advances).
	leaq	16-256(%rbx),%r10
	pxor	%xmm4,%xmm4
.byte	0x67,0x67                        # padding prefixes from the generator
	pxor	%xmm5,%xmm5
	movdqa	-128(%rdi),%xmm0
	movdqa	-112(%rdi),%xmm1
	movdqa	-96(%rdi),%xmm2
	pand	256(%r10),%xmm0
	movdqa	-80(%rdi),%xmm3
	pand	272(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	288(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	304(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%rdi),%xmm0
	movdqa	-48(%rdi),%xmm1
	movdqa	-32(%rdi),%xmm2
	pand	320(%r10),%xmm0
	movdqa	-16(%rdi),%xmm3
	pand	336(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	352(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	368(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%rdi),%xmm0
	movdqa	16(%rdi),%xmm1
	movdqa	32(%rdi),%xmm2
	pand	384(%r10),%xmm0
	movdqa	48(%rdi),%xmm3
	pand	400(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	416(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	432(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%rdi),%xmm0
	movdqa	80(%rdi),%xmm1
	movdqa	96(%rdi),%xmm2
	pand	448(%r10),%xmm0
	movdqa	112(%rdi),%xmm3
	pand	464(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	480(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	496(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	pshufd	$0x4e,%xmm4,%xmm0        # fold high qword onto low
	por	%xmm4,%xmm0
	leaq	256(%rdi),%rdi
.byte	102,72,15,126,194                # movq %xmm0,%rdx (gathered b[i])

	# Column pass: a[0..3]*b[i] added onto the previous window.
	movq	%rbp,(%rbx)              # store previous top carry
	leaq	32(%rbx,%rax,1),%rbx     # rewind result window
	mulxq	0(%rsi),%r8,%r11
	xorq	%rbp,%rbp                # zero + clear CF/OF
	movq	%rdx,%r9
	mulxq	8(%rsi),%r14,%r12
	adoxq	-32(%rbx),%r8
	adcxq	%r14,%r11
	mulxq	16(%rsi),%r15,%r13
	adoxq	-24(%rbx),%r11
	adcxq	%r15,%r12
	mulxq	24(%rsi),%rdx,%r14
	adoxq	-16(%rbx),%r12
	adcxq	%rdx,%r13
	leaq	(%rcx,%rax,1),%rcx       # rewind n[] pointer
	leaq	32(%rsi),%rsi
	adoxq	-8(%rbx),%r13
	adcxq	%rbp,%r14
	adoxq	%rbp,%r14

	movq	%r8,%r15
	imulq	32+8(%rsp),%r8           # m = t[0] * n0

	movq	%r8,%rdx
	xorq	%rbp,%rbp                # zero + clear CF/OF
	movq	%rdi,8+8(%rsp)           # save table cursor

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15                # low limb cancels
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx                 # back to multiplier b[i]
	movq	24+8(%rsp),%rdi          # inner trip count
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-24(%rbx)
	adoxq	%rbp,%r15
	movq	%r12,-16(%rbx)
	leaq	32(%rcx),%rcx
	jmp	.Lmulx4x_inner

.align	32
.Lmulx4x_inner:
	# Inner pass: like .Lmulx4x_1st but also folds in the previously
	# stored window (0..24(%rbx)) since this is an accumulation round.
	mulxq	0(%rsi),%r10,%rax
	adcxq	%rbp,%r15
	adoxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	0(%rbx),%r10
	adoxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	8(%rbx),%r11
	adoxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
	movq	%r8,%rdx                 # switch to multiplier m
	adcxq	16(%rbx),%r12
	adoxq	%rax,%r13
	adcxq	24(%rbx),%r13
	adoxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx
	adcxq	%rbp,%r14

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	adoxq	%r15,%r13
	movq	%r11,-32(%rbx)
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx                 # back to multiplier b[i]
	leaq	32(%rcx),%rcx
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_inner

	movq	0+8(%rsp),%rax           # %rax = num (bytes)
	adcq	%rbp,%r15
	subq	0(%rbx),%rdi             # %rdi was 0: sets CF from stored word
	movq	8+8(%rsp),%rdi           # restore table cursor
	movq	16+8(%rsp),%r10          # end-of-table sentinel
	adcq	%r15,%r14
	leaq	(%rsi,%rax,1),%rsi       # rewind a[] pointer
	adcq	%rbp,%rbp                # %rbp = top carry
	movq	%r14,-8(%rbx)

	cmpq	%r10,%rdi                # more b[i] entries to process?
	jb	.Lmulx4x_outer

	# Done multiplying: set up the masked conditional subtraction and
	# tail-jump into the shared .Lsqrx4x_sub code.
	movq	-8(%rcx),%r10            # top modulus limb
	movq	%rbp,%r8
	movq	(%rcx,%rax,1),%r12
	leaq	(%rcx,%rax,1),%rbp       # %rbp = n[] base
	movq	%rax,%rcx
	leaq	(%rbx,%rax,1),%rdi
	xorl	%eax,%eax
	xorq	%r15,%r15
	subq	%r14,%r10                # compare top limbs
	adcq	%r15,%r15
	orq	%r15,%r8
	sarq	$3+2,%rcx                # %rcx = num/32 (negative count)
	subq	%r8,%rax                 # %rax = 0 / all-ones subtract mask
	movq	56+8(%rsp),%rdx          # %rdx = rp
	decq	%r12                     # dec+not in subtract loop -> -n[0]
	movq	8(%rbp),%r13
	xorq	%r8,%r8
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqrx4x_sub_entry       # tail call; its ret returns for us
.size	mulx4x_internal,.-mulx4x_internal
#-----------------------------------------------------------------------
# bn_powerx5
# MULX/ADX path computing a^(2^5) * b reduction step used by fixed-
# window exponentiation: five consecutive squarings (each squaring
# followed by the conditional-subtract post-step), then one Montgomery
# multiplication via mulx4x_internal.
#
# SysV args: %rdi=rp %rsi=ap %rdx=bp %rcx=np %r8=&n0 %r9d=num.
# rdi/rcx/r10/rdx are parked in xmm1-xmm4 across the squarings (the
# hand-encoded .byte movq's below) because the callees clobber GPRs.
# Returns 1 in %rax.
#-----------------------------------------------------------------------
.type	bn_powerx5,@function
.align	32
bn_powerx5:
	movq	%rsp,%rax                # remember original stack pointer
.Lpowerx5_enter:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
.Lpowerx5_prologue:

	shll	$3,%r9d                  # num *= 8 (bytes)
	leaq	(%r9,%r9,2),%r10         # %r10 = 3*num bytes
	negq	%r9
	movq	(%r8),%r8                # dereference &n0

	# Frame placement avoiding 4K aliasing with rp (same scheme as the
	# other entry points in this file).
	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lpwrx_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lpwrx_sp_done

.align	32
.Lpwrx_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lpwrx_sp_done:
	andq	$-64,%rbp                # 64-byte align the new frame
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lpwrx_page_walk
	jmp	.Lpwrx_page_walk_done

	# Stack-probe loop: touch each page so the guard page is hit.
.Lpwrx_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lpwrx_page_walk
.Lpwrx_page_walk_done:

	movq	%r9,%r10
	negq	%r9

	# Park arguments in xmm registers across the five squarings.
	pxor	%xmm0,%xmm0
.byte	102,72,15,110,207                # movq %rdi,%xmm1 (save rp)
.byte	102,72,15,110,209                # movq %rcx,%xmm2 (save np)
.byte	102,73,15,110,218                # movq %r10,%xmm3 (save num)
.byte	102,72,15,110,226                # movq %rdx,%xmm4 (save bp)
	movq	%r8,32(%rsp)             # save n0
	movq	%rax,40(%rsp)            # save original %rsp
.Lpowerx5_body:

	call	__bn_sqrx8x_internal     # square #1
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal     # square #2
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal     # square #3
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal     # square #4
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal     # square #5
	call	__bn_postx4x_internal

	movq	%r10,%r9                 # restore num
	movq	%rsi,%rdi
.byte	102,72,15,126,209                # movq %xmm2,%rcx (restore np)
.byte	102,72,15,126,226                # movq %xmm4,%rdx (restore bp)
	movq	40(%rsp),%rax

	call	mulx4x_internal          # final Montgomery multiply

	movq	40(%rsp),%rsi
	movq	$1,%rax                  # return 1 (success)

	movq	-48(%rsi),%r15           # restore callee-saved registers
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp              # restore original stack pointer
.Lpowerx5_epilogue:
	.byte	0xf3,0xc3                # rep ret
.size	bn_powerx5,.-bn_powerx5
2781
2782.globl	bn_sqrx8x_internal
2783.hidden	bn_sqrx8x_internal
2784.type	bn_sqrx8x_internal,@function
2785.align	32
2786bn_sqrx8x_internal:
2787__bn_sqrx8x_internal:
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828	leaq	48+8(%rsp),%rdi
2829	leaq	(%rsi,%r9,1),%rbp
2830	movq	%r9,0+8(%rsp)
2831	movq	%rbp,8+8(%rsp)
2832	jmp	.Lsqr8x_zero_start
2833
2834.align	32
2835.byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
2836.Lsqrx8x_zero:
2837.byte	0x3e
2838	movdqa	%xmm0,0(%rdi)
2839	movdqa	%xmm0,16(%rdi)
2840	movdqa	%xmm0,32(%rdi)
2841	movdqa	%xmm0,48(%rdi)
2842.Lsqr8x_zero_start:
2843	movdqa	%xmm0,64(%rdi)
2844	movdqa	%xmm0,80(%rdi)
2845	movdqa	%xmm0,96(%rdi)
2846	movdqa	%xmm0,112(%rdi)
2847	leaq	128(%rdi),%rdi
2848	subq	$64,%r9
2849	jnz	.Lsqrx8x_zero
2850
2851	movq	0(%rsi),%rdx
2852
2853	xorq	%r10,%r10
2854	xorq	%r11,%r11
2855	xorq	%r12,%r12
2856	xorq	%r13,%r13
2857	xorq	%r14,%r14
2858	xorq	%r15,%r15
2859	leaq	48+8(%rsp),%rdi
2860	xorq	%rbp,%rbp
2861	jmp	.Lsqrx8x_outer_loop
2862
2863.align	32
2864.Lsqrx8x_outer_loop:
2865	mulxq	8(%rsi),%r8,%rax
2866	adcxq	%r9,%r8
2867	adoxq	%rax,%r10
2868	mulxq	16(%rsi),%r9,%rax
2869	adcxq	%r10,%r9
2870	adoxq	%rax,%r11
2871.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
2872	adcxq	%r11,%r10
2873	adoxq	%rax,%r12
2874.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
2875	adcxq	%r12,%r11
2876	adoxq	%rax,%r13
2877	mulxq	40(%rsi),%r12,%rax
2878	adcxq	%r13,%r12
2879	adoxq	%rax,%r14
2880	mulxq	48(%rsi),%r13,%rax
2881	adcxq	%r14,%r13
2882	adoxq	%r15,%rax
2883	mulxq	56(%rsi),%r14,%r15
2884	movq	8(%rsi),%rdx
2885	adcxq	%rax,%r14
2886	adoxq	%rbp,%r15
2887	adcq	64(%rdi),%r15
2888	movq	%r8,8(%rdi)
2889	movq	%r9,16(%rdi)
2890	sbbq	%rcx,%rcx
2891	xorq	%rbp,%rbp
2892
2893
2894	mulxq	16(%rsi),%r8,%rbx
2895	mulxq	24(%rsi),%r9,%rax
2896	adcxq	%r10,%r8
2897	adoxq	%rbx,%r9
2898	mulxq	32(%rsi),%r10,%rbx
2899	adcxq	%r11,%r9
2900	adoxq	%rax,%r10
2901.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
2902	adcxq	%r12,%r10
2903	adoxq	%rbx,%r11
2904.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
2905	adcxq	%r13,%r11
2906	adoxq	%r14,%r12
2907.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
2908	movq	16(%rsi),%rdx
2909	adcxq	%rax,%r12
2910	adoxq	%rbx,%r13
2911	adcxq	%r15,%r13
2912	adoxq	%rbp,%r14
2913	adcxq	%rbp,%r14
2914
2915	movq	%r8,24(%rdi)
2916	movq	%r9,32(%rdi)
2917
2918	mulxq	24(%rsi),%r8,%rbx
2919	mulxq	32(%rsi),%r9,%rax
2920	adcxq	%r10,%r8
2921	adoxq	%rbx,%r9
2922	mulxq	40(%rsi),%r10,%rbx
2923	adcxq	%r11,%r9
2924	adoxq	%rax,%r10
2925.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
2926	adcxq	%r12,%r10
2927	adoxq	%r13,%r11
2928.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
2929.byte	0x3e
2930	movq	24(%rsi),%rdx
2931	adcxq	%rbx,%r11
2932	adoxq	%rax,%r12
2933	adcxq	%r14,%r12
2934	movq	%r8,40(%rdi)
2935	movq	%r9,48(%rdi)
2936	mulxq	32(%rsi),%r8,%rax
2937	adoxq	%rbp,%r13
2938	adcxq	%rbp,%r13
2939
2940	mulxq	40(%rsi),%r9,%rbx
2941	adcxq	%r10,%r8
2942	adoxq	%rax,%r9
2943	mulxq	48(%rsi),%r10,%rax
2944	adcxq	%r11,%r9
2945	adoxq	%r12,%r10
2946	mulxq	56(%rsi),%r11,%r12
2947	movq	32(%rsi),%rdx
2948	movq	40(%rsi),%r14
2949	adcxq	%rbx,%r10
2950	adoxq	%rax,%r11
2951	movq	48(%rsi),%r15
2952	adcxq	%r13,%r11
2953	adoxq	%rbp,%r12
2954	adcxq	%rbp,%r12
2955
2956	movq	%r8,56(%rdi)
2957	movq	%r9,64(%rdi)
2958
2959	mulxq	%r14,%r9,%rax
2960	movq	56(%rsi),%r8
2961	adcxq	%r10,%r9
2962	mulxq	%r15,%r10,%rbx
2963	adoxq	%rax,%r10
2964	adcxq	%r11,%r10
2965	mulxq	%r8,%r11,%rax
2966	movq	%r14,%rdx
2967	adoxq	%rbx,%r11
2968	adcxq	%r12,%r11
2969
2970	adcxq	%rbp,%rax
2971
2972	mulxq	%r15,%r14,%rbx
2973	mulxq	%r8,%r12,%r13
2974	movq	%r15,%rdx
2975	leaq	64(%rsi),%rsi
2976	adcxq	%r14,%r11
2977	adoxq	%rbx,%r12
2978	adcxq	%rax,%r12
2979	adoxq	%rbp,%r13
2980
2981.byte	0x67,0x67
2982	mulxq	%r8,%r8,%r14
2983	adcxq	%r8,%r13
2984	adcxq	%rbp,%r14
2985
2986	cmpq	8+8(%rsp),%rsi
2987	je	.Lsqrx8x_outer_break
2988
2989	negq	%rcx
2990	movq	$-8,%rcx
2991	movq	%rbp,%r15
2992	movq	64(%rdi),%r8
2993	adcxq	72(%rdi),%r9
2994	adcxq	80(%rdi),%r10
2995	adcxq	88(%rdi),%r11
2996	adcq	96(%rdi),%r12
2997	adcq	104(%rdi),%r13
2998	adcq	112(%rdi),%r14
2999	adcq	120(%rdi),%r15
3000	leaq	(%rsi),%rbp
3001	leaq	128(%rdi),%rdi
3002	sbbq	%rax,%rax
3003
3004	movq	-64(%rsi),%rdx
3005	movq	%rax,16+8(%rsp)
3006	movq	%rdi,24+8(%rsp)
3007
3008
3009	xorl	%eax,%eax
3010	jmp	.Lsqrx8x_loop
3011
3012.align	32
3013.Lsqrx8x_loop:
3014	movq	%r8,%rbx
3015	mulxq	0(%rbp),%rax,%r8
3016	adcxq	%rax,%rbx
3017	adoxq	%r9,%r8
3018
3019	mulxq	8(%rbp),%rax,%r9
3020	adcxq	%rax,%r8
3021	adoxq	%r10,%r9
3022
3023	mulxq	16(%rbp),%rax,%r10
3024	adcxq	%rax,%r9
3025	adoxq	%r11,%r10
3026
3027	mulxq	24(%rbp),%rax,%r11
3028	adcxq	%rax,%r10
3029	adoxq	%r12,%r11
3030
3031.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
3032	adcxq	%rax,%r11
3033	adoxq	%r13,%r12
3034
3035	mulxq	40(%rbp),%rax,%r13
3036	adcxq	%rax,%r12
3037	adoxq	%r14,%r13
3038
3039	mulxq	48(%rbp),%rax,%r14
3040	movq	%rbx,(%rdi,%rcx,8)
3041	movl	$0,%ebx
3042	adcxq	%rax,%r13
3043	adoxq	%r15,%r14
3044
3045.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
3046	movq	8(%rsi,%rcx,8),%rdx
3047	adcxq	%rax,%r14
3048	adoxq	%rbx,%r15
3049	adcxq	%rbx,%r15
3050
3051.byte	0x67
3052	incq	%rcx
3053	jnz	.Lsqrx8x_loop
3054
3055	leaq	64(%rbp),%rbp
3056	movq	$-8,%rcx
3057	cmpq	8+8(%rsp),%rbp
3058	je	.Lsqrx8x_break
3059
3060	subq	16+8(%rsp),%rbx
3061.byte	0x66
3062	movq	-64(%rsi),%rdx
3063	adcxq	0(%rdi),%r8
3064	adcxq	8(%rdi),%r9
3065	adcq	16(%rdi),%r10
3066	adcq	24(%rdi),%r11
3067	adcq	32(%rdi),%r12
3068	adcq	40(%rdi),%r13
3069	adcq	48(%rdi),%r14
3070	adcq	56(%rdi),%r15
3071	leaq	64(%rdi),%rdi
3072.byte	0x67
3073	sbbq	%rax,%rax
3074	xorl	%ebx,%ebx
3075	movq	%rax,16+8(%rsp)
3076	jmp	.Lsqrx8x_loop
3077
3078.align	32
3079.Lsqrx8x_break:
3080	subq	16+8(%rsp),%r8
3081	movq	24+8(%rsp),%rcx
3082	movq	0(%rsi),%rdx
3083	xorl	%ebp,%ebp
3084	movq	%r8,0(%rdi)
3085	cmpq	%rcx,%rdi
3086	je	.Lsqrx8x_outer_loop
3087
3088	movq	%r9,8(%rdi)
3089	movq	8(%rcx),%r9
3090	movq	%r10,16(%rdi)
3091	movq	16(%rcx),%r10
3092	movq	%r11,24(%rdi)
3093	movq	24(%rcx),%r11
3094	movq	%r12,32(%rdi)
3095	movq	32(%rcx),%r12
3096	movq	%r13,40(%rdi)
3097	movq	40(%rcx),%r13
3098	movq	%r14,48(%rdi)
3099	movq	48(%rcx),%r14
3100	movq	%r15,56(%rdi)
3101	movq	56(%rcx),%r15
3102	movq	%rcx,%rdi
3103	jmp	.Lsqrx8x_outer_loop
3104
3105.align	32
3106.Lsqrx8x_outer_break:
3107	movq	%r9,72(%rdi)
3108.byte	102,72,15,126,217
3109	movq	%r10,80(%rdi)
3110	movq	%r11,88(%rdi)
3111	movq	%r12,96(%rdi)
3112	movq	%r13,104(%rdi)
3113	movq	%r14,112(%rdi)
3114	leaq	48+8(%rsp),%rdi
3115	movq	(%rsi,%rcx,1),%rdx
3116
3117	movq	8(%rdi),%r11
3118	xorq	%r10,%r10
3119	movq	0+8(%rsp),%r9
3120	adoxq	%r11,%r11
3121	movq	16(%rdi),%r12
3122	movq	24(%rdi),%r13
3123
3124
3125.align	32
3126.Lsqrx4x_shift_n_add:
3127	mulxq	%rdx,%rax,%rbx
3128	adoxq	%r12,%r12
3129	adcxq	%r10,%rax
3130.byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
3131.byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
3132	adoxq	%r13,%r13
3133	adcxq	%r11,%rbx
3134	movq	40(%rdi),%r11
3135	movq	%rax,0(%rdi)
3136	movq	%rbx,8(%rdi)
3137
3138	mulxq	%rdx,%rax,%rbx
3139	adoxq	%r10,%r10
3140	adcxq	%r12,%rax
3141	movq	16(%rsi,%rcx,1),%rdx
3142	movq	48(%rdi),%r12
3143	adoxq	%r11,%r11
3144	adcxq	%r13,%rbx
3145	movq	56(%rdi),%r13
3146	movq	%rax,16(%rdi)
3147	movq	%rbx,24(%rdi)
3148
3149	mulxq	%rdx,%rax,%rbx
3150	adoxq	%r12,%r12
3151	adcxq	%r10,%rax
3152	movq	24(%rsi,%rcx,1),%rdx
3153	leaq	32(%rcx),%rcx
3154	movq	64(%rdi),%r10
3155	adoxq	%r13,%r13
3156	adcxq	%r11,%rbx
3157	movq	72(%rdi),%r11
3158	movq	%rax,32(%rdi)
3159	movq	%rbx,40(%rdi)
3160
3161	mulxq	%rdx,%rax,%rbx
3162	adoxq	%r10,%r10
3163	adcxq	%r12,%rax
3164	jrcxz	.Lsqrx4x_shift_n_add_break
3165.byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
3166	adoxq	%r11,%r11
3167	adcxq	%r13,%rbx
3168	movq	80(%rdi),%r12
3169	movq	88(%rdi),%r13
3170	movq	%rax,48(%rdi)
3171	movq	%rbx,56(%rdi)
3172	leaq	64(%rdi),%rdi
3173	nop
3174	jmp	.Lsqrx4x_shift_n_add
3175
3176.align	32
3177.Lsqrx4x_shift_n_add_break:
3178	adcxq	%r13,%rbx
3179	movq	%rax,48(%rdi)
3180	movq	%rbx,56(%rdi)
3181	leaq	64(%rdi),%rdi
3182.byte	102,72,15,126,213
# __bn_sqrx8x_reduction — Montgomery reduction of the squaring result,
# BMI2/ADX path: mulx plus two independent carry chains (adcx drives CF,
# adox drives OF), processing 8 limbs of the modulus per inner iteration.
# Entered by fall-through from the squaring code above with:
#   %rbp = pointer into the modulus n[]
#   %rdi = destination pointer (stashed at 8+8(%rsp)); %rdi is then
#          repointed at the temporary result window at 48+8(%rsp)
#   %r9  = byte size of the operand, used to form the end-of-n sentinel
#   32+8(%rsp) = n0 (the Montgomery constant -n^{-1} mod 2^64)
#   48+8(%rsp) = least-significant limb of the value being reduced
# NOTE(review): the stack-frame layout is established by the caller outside
# this view — the offsets above reflect how they are used here only.
3183__bn_sqrx8x_reduction:
3184	xorl	%eax,%eax
3185	movq	32+8(%rsp),%rbx
3186	movq	48+8(%rsp),%rdx
3187	leaq	-64(%rbp,%r9,1),%rcx		# sentinel: last 8-limb chunk of n[]
3188
3189	movq	%rcx,0+8(%rsp)
3190	movq	%rdi,8+8(%rsp)
3191
3192	leaq	48+8(%rsp),%rdi
3193	jmp	.Lsqrx8x_reduction_loop
3194
3195.align	32
# Outer loop: load the next 8 limbs of the intermediate result and compute
# the first reduction multiplier rdx = limb * n0 for this 8-limb group.
3196.Lsqrx8x_reduction_loop:
3197	movq	8(%rdi),%r9
3198	movq	16(%rdi),%r10
3199	movq	24(%rdi),%r11
3200	movq	32(%rdi),%r12
3201	movq	%rdx,%r8
3202	imulq	%rbx,%rdx			# rdx = limb * n0
3203	movq	40(%rdi),%r13
3204	movq	48(%rdi),%r14
3205	movq	56(%rdi),%r15
3206	movq	%rax,24+8(%rsp)			# save top carry of previous group
3207
3208	leaq	64(%rdi),%rdi
3209	xorq	%rsi,%rsi			# rsi = 0: constant source and OF/CF reset
3210	movq	$-8,%rcx			# 8 inner iterations, counting -8..-1
3211	jmp	.Lsqrx8x_reduce
3212
3213.align	32
# Inner loop: rdx holds the current multiplier; multiply it by the 8 modulus
# limbs, accumulating into r8..r15 on two interleaved carry chains.
3214.Lsqrx8x_reduce:
3215	movq	%r8,%rbx
3216	mulxq	0(%rbp),%rax,%r8
3217	adcxq	%rbx,%rax			# low product + limb must annihilate to 0
3218	adoxq	%r9,%r8
3219
3220	mulxq	8(%rbp),%rbx,%r9
3221	adcxq	%rbx,%r8
3222	adoxq	%r10,%r9
3223
3224	mulxq	16(%rbp),%rbx,%r10
3225	adcxq	%rbx,%r9
3226	adoxq	%r11,%r10
3227
3228	mulxq	24(%rbp),%rbx,%r11
3229	adcxq	%rbx,%r10
3230	adoxq	%r12,%r11
3231
# Hand-encoded: mulxq 32(%rbp),%rbx,%r12 (emitted as bytes to pin the encoding).
3232.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
3233	movq	%rdx,%rax			# keep current multiplier
3234	movq	%r8,%rdx
3235	adcxq	%rbx,%r11
3236	adoxq	%r13,%r12
3237
3238	mulxq	32+8(%rsp),%rbx,%rdx		# rbx = next multiplier = r8 * n0
3239	movq	%rax,%rdx			# restore current multiplier
3240	movq	%rax,64+48+8(%rsp,%rcx,8)	# record multiplier for the tail pass
3241
3242	mulxq	40(%rbp),%rax,%r13
3243	adcxq	%rax,%r12
3244	adoxq	%r14,%r13
3245
3246	mulxq	48(%rbp),%rax,%r14
3247	adcxq	%rax,%r13
3248	adoxq	%r15,%r14
3249
3250	mulxq	56(%rbp),%rax,%r15
3251	movq	%rbx,%rdx			# switch to the next multiplier
3252	adcxq	%rax,%r14
3253	adoxq	%rsi,%r15			# fold OF chain into r15 (rsi = 0)
3254	adcxq	%rsi,%r15			# fold CF chain into r15
3255
# Three 0x67 (address-size) prefixes: benign padding to tune code alignment.
3256.byte	0x67,0x67,0x67
3257	incq	%rcx
3258	jnz	.Lsqrx8x_reduce
3259
3260	movq	%rsi,%rax			# rax = 0 (rsi is still zero here)
3261	cmpq	0+8(%rsp),%rbp			# reached last chunk of the modulus?
3262	jae	.Lsqrx8x_no_tail
3263
# Tail setup: add the next 8 result limbs and remember the resulting carry
# (as a 0/-1 mask in 16+8(%rsp)) before running the saved multipliers over
# the remaining modulus chunks.
3264	movq	48+8(%rsp),%rdx
3265	addq	0(%rdi),%r8
3266	leaq	64(%rbp),%rbp
3267	movq	$-8,%rcx
3268	adcxq	8(%rdi),%r9
3269	adcxq	16(%rdi),%r10
3270	adcq	24(%rdi),%r11
3271	adcq	32(%rdi),%r12
3272	adcq	40(%rdi),%r13
3273	adcq	48(%rdi),%r14
3274	adcq	56(%rdi),%r15
3275	leaq	64(%rdi),%rdi
3276	sbbq	%rax,%rax			# rax = -carry
3277
3278	xorq	%rsi,%rsi
3279	movq	%rax,16+8(%rsp)
3280	jmp	.Lsqrx8x_tail
3281
3282.align	32
# Tail loop: same dual-chain multiply-accumulate as .Lsqrx8x_reduce, but the
# multipliers are reloaded from the values saved at 64+48+8(%rsp,...) and the
# finished low limbs are stored back through (%rdi,%rcx,8).
3283.Lsqrx8x_tail:
3284	movq	%r8,%rbx
3285	mulxq	0(%rbp),%rax,%r8
3286	adcxq	%rax,%rbx
3287	adoxq	%r9,%r8
3288
3289	mulxq	8(%rbp),%rax,%r9
3290	adcxq	%rax,%r8
3291	adoxq	%r10,%r9
3292
3293	mulxq	16(%rbp),%rax,%r10
3294	adcxq	%rax,%r9
3295	adoxq	%r11,%r10
3296
3297	mulxq	24(%rbp),%rax,%r11
3298	adcxq	%rax,%r10
3299	adoxq	%r12,%r11
3300
# Hand-encoded: mulxq 32(%rbp),%rax,%r12.
3301.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
3302	adcxq	%rax,%r11
3303	adoxq	%r13,%r12
3304
3305	mulxq	40(%rbp),%rax,%r13
3306	adcxq	%rax,%r12
3307	adoxq	%r14,%r13
3308
3309	mulxq	48(%rbp),%rax,%r14
3310	adcxq	%rax,%r13
3311	adoxq	%r15,%r14
3312
3313	mulxq	56(%rbp),%rax,%r15
3314	movq	72+48+8(%rsp,%rcx,8),%rdx	# next saved multiplier
3315	adcxq	%rax,%r14
3316	adoxq	%rsi,%r15
3317	movq	%rbx,(%rdi,%rcx,8)		# store finished limb
3318	movq	%r8,%rbx
3319	adcxq	%rsi,%r15
3320
3321	incq	%rcx
3322	jnz	.Lsqrx8x_tail
3323
3324	cmpq	0+8(%rsp),%rbp			# more modulus chunks left?
3325	jae	.Lsqrx8x_tail_done
3326
# Re-inject the carry saved in 16+8(%rsp) (sets CF via the subtract of the
# 0/-1 mask from rsi=0), then absorb the next 8 result limbs and loop.
3327	subq	16+8(%rsp),%rsi
3328	movq	48+8(%rsp),%rdx
3329	leaq	64(%rbp),%rbp
3330	adcq	0(%rdi),%r8
3331	adcq	8(%rdi),%r9
3332	adcq	16(%rdi),%r10
3333	adcq	24(%rdi),%r11
3334	adcq	32(%rdi),%r12
3335	adcq	40(%rdi),%r13
3336	adcq	48(%rdi),%r14
3337	adcq	56(%rdi),%r15
3338	leaq	64(%rdi),%rdi
3339	sbbq	%rax,%rax
3340	subq	$8,%rcx
3341
3342	xorq	%rsi,%rsi
3343	movq	%rax,16+8(%rsp)
3344	jmp	.Lsqrx8x_tail
3345
3346.align	32
# All modulus chunks processed: add back the top carry saved at the start of
# the group and collapse the carry chain into rax.
3347.Lsqrx8x_tail_done:
3348	xorq	%rax,%rax
3349	addq	24+8(%rsp),%r8
3350	adcq	$0,%r9
3351	adcq	$0,%r10
3352	adcq	$0,%r11
3353	adcq	$0,%r12
3354	adcq	$0,%r13
3355	adcq	$0,%r14
3356	adcq	$0,%r15
3357	adcq	$0,%rax
3358
3359	subq	16+8(%rsp),%rsi			# restore pending carry into CF
3360.Lsqrx8x_no_tail:
3361	adcq	0(%rdi),%r8
# Hand-encoded: movq %xmm3,%rcx (66 48 0f 7e d9).
3362.byte	102,72,15,126,217
3363	adcq	8(%rdi),%r9
3364	movq	56(%rbp),%rsi			# top limb of n[], used by the caller
# Hand-encoded: movq %xmm2,%rbp (66 48 0f 7e d5).
3365.byte	102,72,15,126,213
3366	adcq	16(%rdi),%r10
3367	adcq	24(%rdi),%r11
3368	adcq	32(%rdi),%r12
3369	adcq	40(%rdi),%r13
3370	adcq	48(%rdi),%r14
3371	adcq	56(%rdi),%r15
3372	adcq	$0,%rax				# rax = final top carry
3373
3374	movq	32+8(%rsp),%rbx			# n0 again
3375	movq	64(%rdi,%rcx,1),%rdx		# first limb of the next group
3376
3377	movq	%r8,0(%rdi)			# store the 8 reduced limbs
3378	leaq	64(%rdi),%r8
3379	movq	%r9,8(%rdi)
3380	movq	%r10,16(%rdi)
3381	movq	%r11,24(%rdi)
3382	movq	%r12,32(%rdi)
3383	movq	%r13,40(%rdi)
3384	movq	%r14,48(%rdi)
3385	movq	%r15,56(%rdi)
3386
3387	leaq	64(%rdi,%rcx,1),%rdi
3388	cmpq	8+8(%rsp),%r8			# processed the whole result yet?
3389	jb	.Lsqrx8x_reduction_loop
3390	.byte	0xf3,0xc3			# repz ret (branch-predictor-friendly return)
3391.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
3392.align	32
# __bn_postx4x_internal — post-condition step: conditionally subtract the
# modulus from the reduced result, 4 limbs per iteration, using BMI2 andn
# with %rax as an all-ones/all-zero mask (negq %rax below turns the caller's
# 0/1 carry into 0/-1).  On entry (as used here):
#   %rbp = modulus n[], %rdi = reduced result, %rcx = -num (loop counter),
#   %rax = top-carry selector, %xmm1 = destination pointer (moved to rdx).
# NOTE(review): entry-register contract is inferred from usage in this view;
# the callers live elsewhere in the file.
3393__bn_postx4x_internal:
3394	movq	0(%rbp),%r12
3395	movq	%rcx,%r10
3396	movq	%rcx,%r9			# preserve count for the caller
3397	negq	%rax				# rax: 0 -> 0, 1 -> all-ones mask
3398	sarq	$3+2,%rcx			# rcx = count of 4-limb groups (negative)
3399
# Hand-encoded: movq %xmm1,%rdx ; movq %xmm1,%rsi (66 48 0f 7e ca / ce).
3400.byte	102,72,15,126,202
3401.byte	102,72,15,126,206
3402	decq	%r12				# first limb pre-decremented (folds in the -1
3403	movq	8(%rbp),%r13			# needed so that andn+adc computes r - (n&mask))
3404	xorq	%r8,%r8				# r8 = running borrow (0/-1)
3405	movq	16(%rbp),%r14
3406	movq	24(%rbp),%r15
3407	jmp	.Lsqrx4x_sub_entry
3408
3409.align	16
3410.Lsqrx4x_sub:
3411	movq	0(%rbp),%r12
3412	movq	8(%rbp),%r13
3413	movq	16(%rbp),%r14
3414	movq	24(%rbp),%r15
3415.Lsqrx4x_sub_entry:
# andn: rX = ~n_limb & mask, i.e. the one's-complement of n[] when the
# subtraction is selected, or 0 when it is not — branch-free selection.
3416	andnq	%rax,%r12,%r12
3417	leaq	32(%rbp),%rbp
3418	andnq	%rax,%r13,%r13
3419	andnq	%rax,%r14,%r14
3420	andnq	%rax,%r15,%r15
3421
3422	negq	%r8				# reload borrow into CF
3423	adcq	0(%rdi),%r12
3424	adcq	8(%rdi),%r13
3425	adcq	16(%rdi),%r14
3426	adcq	24(%rdi),%r15
3427	movq	%r12,0(%rdx)			# write final limbs to the destination
3428	leaq	32(%rdi),%rdi
3429	movq	%r13,8(%rdx)
3430	sbbq	%r8,%r8				# capture borrow for the next group
3431	movq	%r14,16(%rdx)
3432	movq	%r15,24(%rdx)
3433	leaq	32(%rdx),%rdx
3434
3435	incq	%rcx
3436	jnz	.Lsqrx4x_sub
3437
3438	negq	%r9				# r9 = num again, for the caller
3439
3440	.byte	0xf3,0xc3			# repz ret
3441.size	__bn_postx4x_internal,.-__bn_postx4x_internal
3442.globl	bn_get_bits5
3443.type	bn_get_bits5,@function
3444.align	16
# bn_get_bits5(const BN_ULONG *a /* rdi */, int bitpos /* esi */) -> eax
# Returns the 5-bit window of a[] starting at bit position `bitpos`.
# The array is read as 16-bit words; when the window would cross a word
# boundary (bit-in-word > 11), a word one byte further on is read instead
# and the in-word shift is reduced by 8.  Branch-free via cmov.
3445bn_get_bits5:
3446	leaq	0(%rdi),%r10			# base for the aligned-word case
3447	leaq	1(%rdi),%r11			# base shifted one byte, for the crossing case
3448	movl	%esi,%ecx
3449	shrl	$4,%esi				# esi = 16-bit word index
3450	andl	$15,%ecx			# ecx = bit offset within the word
3451	leal	-8(%rcx),%eax			# eax = offset adjusted for byte-shifted read
3452	cmpl	$11,%ecx			# window crosses the 16-bit word?
3453	cmovaq	%r11,%r10			# ...then read from the +1-byte base
3454	cmoval	%eax,%ecx			# ...with the shift reduced by 8
3455	movzwl	(%r10,%rsi,2),%eax
3456	shrl	%cl,%eax
3457	andl	$31,%eax			# keep 5 bits
3458	.byte	0xf3,0xc3			# repz ret
3459.size	bn_get_bits5,.-bn_get_bits5
3460
3461.globl	bn_scatter5
3462.type	bn_scatter5,@function
3463.align	16
# bn_scatter5(const BN_ULONG *in /* rdi */, int num /* esi */,
#             void *table /* rdx */, int idx /* rcx */)
# Stores `num` 64-bit limbs from in[] into column `idx` of the power table:
# limb i goes to table + idx*8 + i*256, i.e. consecutive limbs are 256 bytes
# apart so that bn_gather5 can later read them back with masked loads.
3464bn_scatter5:
3465	cmpl	$0,%esi
3466	jz	.Lscatter_epilogue		# num == 0: nothing to do
3467	leaq	(%rdx,%rcx,8),%rdx		# column start = table + idx*8
3468.Lscatter:
3469	movq	(%rdi),%rax
3470	leaq	8(%rdi),%rdi
3471	movq	%rax,(%rdx)
3472	leaq	256(%rdx),%rdx			# stride of 32 entries * 8 bytes
3473	subl	$1,%esi
3474	jnz	.Lscatter
3475.Lscatter_epilogue:
3476	.byte	0xf3,0xc3			# repz ret
3477.size	bn_scatter5,.-bn_scatter5
3478
3479.globl	bn_gather5
3480.type	bn_gather5,@function
3481.align	32
# bn_gather5(BN_ULONG *out /* rdi */, int num /* rsi */,
#            void *table /* rdx */, int idx /* ecx */)
# Cache-timing-safe gather: instead of loading only the selected table
# column, every one of the 32 columns is read and AND-masked, so the memory
# access pattern is independent of the secret index `idx`.  The 16 xmm masks
# (two 32-bit lanes per index) are built on the stack with pcmpeqd against
# the broadcast index, then applied in .Lgather.
3482bn_gather5:
3483.LSEH_begin_bn_gather5:
3484
# Hand-encoded prologue (fixed encoding for the matching Win64 SEH ranges):
#   lea (%rsp),%r10 ; sub $0x108,%rsp
3485.byte	0x4c,0x8d,0x14,0x24
3486.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00
3487	leaq	.Linc(%rip),%rax
3488	andq	$-16,%rsp			# align scratch area for movdqa
3489
3490	movd	%ecx,%xmm5
3491	movdqa	0(%rax),%xmm0			# {0,0,1,1}
3492	movdqa	16(%rax),%xmm1			# {2,2,2,2} — per-step increment
3493	leaq	128(%rdx),%r11			# bias table pointer to use ±128 disp8
3494	leaq	128(%rsp),%rax			# bias mask area likewise
3495
3496	pshufd	$0,%xmm5,%xmm5			# broadcast idx to all 4 lanes
3497	movdqa	%xmm1,%xmm4
3498	movdqa	%xmm1,%xmm2
# Mask generation: each pcmpeqd produces all-ones in the lane(s) whose
# counter equals idx, all-zeros elsewhere; counters advance by 2 per paddd.
3499	paddd	%xmm0,%xmm1
3500	pcmpeqd	%xmm5,%xmm0
3501	movdqa	%xmm4,%xmm3
3502
3503	paddd	%xmm1,%xmm2
3504	pcmpeqd	%xmm5,%xmm1
3505	movdqa	%xmm0,-128(%rax)
3506	movdqa	%xmm4,%xmm0
3507
3508	paddd	%xmm2,%xmm3
3509	pcmpeqd	%xmm5,%xmm2
3510	movdqa	%xmm1,-112(%rax)
3511	movdqa	%xmm4,%xmm1
3512
3513	paddd	%xmm3,%xmm0
3514	pcmpeqd	%xmm5,%xmm3
3515	movdqa	%xmm2,-96(%rax)
3516	movdqa	%xmm4,%xmm2
3517	paddd	%xmm0,%xmm1
3518	pcmpeqd	%xmm5,%xmm0
3519	movdqa	%xmm3,-80(%rax)
3520	movdqa	%xmm4,%xmm3
3521
3522	paddd	%xmm1,%xmm2
3523	pcmpeqd	%xmm5,%xmm1
3524	movdqa	%xmm0,-64(%rax)
3525	movdqa	%xmm4,%xmm0
3526
3527	paddd	%xmm2,%xmm3
3528	pcmpeqd	%xmm5,%xmm2
3529	movdqa	%xmm1,-48(%rax)
3530	movdqa	%xmm4,%xmm1
3531
3532	paddd	%xmm3,%xmm0
3533	pcmpeqd	%xmm5,%xmm3
3534	movdqa	%xmm2,-32(%rax)
3535	movdqa	%xmm4,%xmm2
3536	paddd	%xmm0,%xmm1
3537	pcmpeqd	%xmm5,%xmm0
3538	movdqa	%xmm3,-16(%rax)
3539	movdqa	%xmm4,%xmm3
3540
3541	paddd	%xmm1,%xmm2
3542	pcmpeqd	%xmm5,%xmm1
3543	movdqa	%xmm0,0(%rax)
3544	movdqa	%xmm4,%xmm0
3545
3546	paddd	%xmm2,%xmm3
3547	pcmpeqd	%xmm5,%xmm2
3548	movdqa	%xmm1,16(%rax)
3549	movdqa	%xmm4,%xmm1
3550
3551	paddd	%xmm3,%xmm0
3552	pcmpeqd	%xmm5,%xmm3
3553	movdqa	%xmm2,32(%rax)
3554	movdqa	%xmm4,%xmm2
3555	paddd	%xmm0,%xmm1
3556	pcmpeqd	%xmm5,%xmm0
3557	movdqa	%xmm3,48(%rax)
3558	movdqa	%xmm4,%xmm3
3559
3560	paddd	%xmm1,%xmm2
3561	pcmpeqd	%xmm5,%xmm1
3562	movdqa	%xmm0,64(%rax)
3563	movdqa	%xmm4,%xmm0
3564
3565	paddd	%xmm2,%xmm3
3566	pcmpeqd	%xmm5,%xmm2
3567	movdqa	%xmm1,80(%rax)
3568	movdqa	%xmm4,%xmm1
3569
3570	paddd	%xmm3,%xmm0
3571	pcmpeqd	%xmm5,%xmm3
3572	movdqa	%xmm2,96(%rax)
3573	movdqa	%xmm4,%xmm2
3574	movdqa	%xmm3,112(%rax)
3575	jmp	.Lgather
3576
3577.align	32
# Gather loop: per output limb, read all 32 table entries (16 xmm loads at
# %r11), AND each with its precomputed mask, OR everything together, fold
# the two 64-bit halves, and store the surviving limb.  Access pattern is
# identical for every idx value.
3578.Lgather:
3579	pxor	%xmm4,%xmm4
3580	pxor	%xmm5,%xmm5
3581	movdqa	-128(%r11),%xmm0
3582	movdqa	-112(%r11),%xmm1
3583	movdqa	-96(%r11),%xmm2
3584	pand	-128(%rax),%xmm0
3585	movdqa	-80(%r11),%xmm3
3586	pand	-112(%rax),%xmm1
3587	por	%xmm0,%xmm4
3588	pand	-96(%rax),%xmm2
3589	por	%xmm1,%xmm5
3590	pand	-80(%rax),%xmm3
3591	por	%xmm2,%xmm4
3592	por	%xmm3,%xmm5
3593	movdqa	-64(%r11),%xmm0
3594	movdqa	-48(%r11),%xmm1
3595	movdqa	-32(%r11),%xmm2
3596	pand	-64(%rax),%xmm0
3597	movdqa	-16(%r11),%xmm3
3598	pand	-48(%rax),%xmm1
3599	por	%xmm0,%xmm4
3600	pand	-32(%rax),%xmm2
3601	por	%xmm1,%xmm5
3602	pand	-16(%rax),%xmm3
3603	por	%xmm2,%xmm4
3604	por	%xmm3,%xmm5
3605	movdqa	0(%r11),%xmm0
3606	movdqa	16(%r11),%xmm1
3607	movdqa	32(%r11),%xmm2
3608	pand	0(%rax),%xmm0
3609	movdqa	48(%r11),%xmm3
3610	pand	16(%rax),%xmm1
3611	por	%xmm0,%xmm4
3612	pand	32(%rax),%xmm2
3613	por	%xmm1,%xmm5
3614	pand	48(%rax),%xmm3
3615	por	%xmm2,%xmm4
3616	por	%xmm3,%xmm5
3617	movdqa	64(%r11),%xmm0
3618	movdqa	80(%r11),%xmm1
3619	movdqa	96(%r11),%xmm2
3620	pand	64(%rax),%xmm0
3621	movdqa	112(%r11),%xmm3
3622	pand	80(%rax),%xmm1
3623	por	%xmm0,%xmm4
3624	pand	96(%rax),%xmm2
3625	por	%xmm1,%xmm5
3626	pand	112(%rax),%xmm3
3627	por	%xmm2,%xmm4
3628	por	%xmm3,%xmm5
3629	por	%xmm5,%xmm4
3630	leaq	256(%r11),%r11			# next limb row of the table
3631	pshufd	$0x4e,%xmm4,%xmm0		# swap 64-bit halves
3632	por	%xmm4,%xmm0			# fold: selected limb in low 64 bits
3633	movq	%xmm0,(%rdi)
3634	leaq	8(%rdi),%rdi
3635	subl	$1,%esi
3636	jnz	.Lgather
3637
3638	leaq	(%r10),%rsp			# restore the stack pointer saved in r10
3639	.byte	0xf3,0xc3			# repz ret
3640.LSEH_end_bn_gather5:
3641.size	bn_gather5,.-bn_gather5
3642.align	64
# .Linc — lane counters used when building the pcmpeqd selection masks:
# first vector seeds the counters {0,0,1,1}, second is the per-step
# increment {2,2,2,2} (two table indices are covered per 128-bit mask).
3643.Linc:
3644.long	0,0, 1,1
3645.long	2,2, 2,2
# ASCII banner: "Montgomery Multiplication with scatter/gather for x86_64,
# CRYPTOGAMS by <appro@openssl.org>"
3646.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
3647