x86_64-mont5.S revision 337982
1/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/x86_64-mont5.S 337982 2018-08-17 18:32:53Z jkim $ */
2/* Do not modify. This file is auto-generated from x86_64-mont5.pl. */
3.text
4
5
6
/*
 * bn_mul_mont_gather5(rp, ap, table, np, n0, num, power)
 *
 * Montgomery multiplication where the multiplier vector is fetched from a
 * pre-scattered table in constant time ("gather"), so the table index
 * ("power") never selects a cache line data-dependently.
 *
 * SysV AMD64 register roles as used below:
 *   rdi = rp   (result, num limbs)
 *   rsi = ap   (multiplicand, num limbs)
 *   rdx = bp   (scattered powers table; rows interleaved in 256-byte lines)
 *   rcx = np   (modulus, num limbs)
 *   r8  = &n0  (pointer to -np^-1 mod 2^64; dereferenced below)
 *   r9d = num  (limb count)
 *   8(%rsp) = power, the 7th (stack) argument: the gather index.
 * NOTE(review): roles inferred from register usage in this listing; confirm
 * against x86_64-mont5.pl.
 */
7.globl	bn_mul_mont_gather5
8.type	bn_mul_mont_gather5,@function
9.align	64
10bn_mul_mont_gather5:
/* Writing the 32-bit register zero-extends num into the full %r9. */
11	movl	%r9d,%r9d
/* Remember caller's %rsp in %rax; the epilogue restores from a saved copy. */
12	movq	%rsp,%rax
/* num not a multiple of 8 -> scalar path; otherwise the 4x unrolled path. */
13	testl	$7,%r9d
14	jnz	.Lmul_enter
/* Capability word for the 4x path's MULX/ADX dispatch (tested at .Lmul4x_enter). */
15	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
16	jmp	.Lmul4x_enter
17
18.align	16
19.Lmul_enter:
/* 7th argument (gather index) sits just above the return address. */
20	movd	8(%rsp),%xmm5
21	pushq	%rbx
22	pushq	%rbp
23	pushq	%r12
24	pushq	%r13
25	pushq	%r14
26	pushq	%r15
27
/* Reserve a 1 KiB-aligned scratch frame of roughly 8*num+280 bytes. */
28	negq	%r9
29	movq	%rsp,%r11
30	leaq	-280(%rsp,%r9,8),%r10
31	negq	%r9
32	andq	$-1024,%r10
33
34
35
36
37
38
39
40	subq	%r10,%r11
41	andq	$-4096,%r11
42	leaq	(%r10,%r11,1),%rsp
43	movq	(%rsp),%r11
44	cmpq	%r10,%rsp
45	ja	.Lmul_page_walk
46	jmp	.Lmul_page_walk_done
47
/* Touch each 4 KiB page of the new frame top-down so stack guard pages
 * are faulted in order rather than skipped. */
48.Lmul_page_walk:
49	leaq	-4096(%rsp),%rsp
50	movq	(%rsp),%r11
51	cmpq	%r10,%rsp
52	ja	.Lmul_page_walk
53.Lmul_page_walk_done:
54
/* .Linc (defined elsewhere in this file): increment vectors used to build
 * the 16 per-index comparison counters below. */
55	leaq	.Linc(%rip),%r10
/* Save caller's %rsp above the scratch vector for the epilogue. */
56	movq	%rax,8(%rsp,%r9,8)
57.Lmul_body:
58
59	leaq	128(%rdx),%r12
60	movdqa	0(%r10),%xmm0
61	movdqa	16(%r10),%xmm1
62	leaq	24-112(%rsp,%r9,8),%r10
63	andq	$-16,%r10
64
/* Broadcast the secret index into all four dwords of %xmm5, then compare
 * it against running counters (paddd chains) to produce sixteen 128-bit
 * masks: exactly one is all-ones.  Masks are parked at 112..352(%r10). */
65	pshufd	$0,%xmm5,%xmm5
66	movdqa	%xmm1,%xmm4
67	movdqa	%xmm1,%xmm2
68	paddd	%xmm0,%xmm1
69	pcmpeqd	%xmm5,%xmm0
/* 0x67 is a benign address-size prefix emitted by perlasm as alignment padding. */
70.byte	0x67
71	movdqa	%xmm4,%xmm3
72	paddd	%xmm1,%xmm2
73	pcmpeqd	%xmm5,%xmm1
74	movdqa	%xmm0,112(%r10)
75	movdqa	%xmm4,%xmm0
76
77	paddd	%xmm2,%xmm3
78	pcmpeqd	%xmm5,%xmm2
79	movdqa	%xmm1,128(%r10)
80	movdqa	%xmm4,%xmm1
81
82	paddd	%xmm3,%xmm0
83	pcmpeqd	%xmm5,%xmm3
84	movdqa	%xmm2,144(%r10)
85	movdqa	%xmm4,%xmm2
86
87	paddd	%xmm0,%xmm1
88	pcmpeqd	%xmm5,%xmm0
89	movdqa	%xmm3,160(%r10)
90	movdqa	%xmm4,%xmm3
91	paddd	%xmm1,%xmm2
92	pcmpeqd	%xmm5,%xmm1
93	movdqa	%xmm0,176(%r10)
94	movdqa	%xmm4,%xmm0
95
96	paddd	%xmm2,%xmm3
97	pcmpeqd	%xmm5,%xmm2
98	movdqa	%xmm1,192(%r10)
99	movdqa	%xmm4,%xmm1
100
101	paddd	%xmm3,%xmm0
102	pcmpeqd	%xmm5,%xmm3
103	movdqa	%xmm2,208(%r10)
104	movdqa	%xmm4,%xmm2
105
106	paddd	%xmm0,%xmm1
107	pcmpeqd	%xmm5,%xmm0
108	movdqa	%xmm3,224(%r10)
109	movdqa	%xmm4,%xmm3
110	paddd	%xmm1,%xmm2
111	pcmpeqd	%xmm5,%xmm1
112	movdqa	%xmm0,240(%r10)
113	movdqa	%xmm4,%xmm0
114
115	paddd	%xmm2,%xmm3
116	pcmpeqd	%xmm5,%xmm2
117	movdqa	%xmm1,256(%r10)
118	movdqa	%xmm4,%xmm1
119
120	paddd	%xmm3,%xmm0
121	pcmpeqd	%xmm5,%xmm3
122	movdqa	%xmm2,272(%r10)
123	movdqa	%xmm4,%xmm2
124
125	paddd	%xmm0,%xmm1
126	pcmpeqd	%xmm5,%xmm0
127	movdqa	%xmm3,288(%r10)
128	movdqa	%xmm4,%xmm3
129	paddd	%xmm1,%xmm2
130	pcmpeqd	%xmm5,%xmm1
131	movdqa	%xmm0,304(%r10)
132
133	paddd	%xmm2,%xmm3
134.byte	0x67
135	pcmpeqd	%xmm5,%xmm2
136	movdqa	%xmm1,320(%r10)
137
138	pcmpeqd	%xmm5,%xmm3
139	movdqa	%xmm2,336(%r10)
/* Constant-time gather: AND every 16-byte lane of the 256-byte table line
 * with its mask and OR everything together; all lines are read regardless
 * of the index, so the memory access pattern is index-independent. */
140	pand	64(%r12),%xmm0
141
142	pand	80(%r12),%xmm1
143	pand	96(%r12),%xmm2
144	movdqa	%xmm3,352(%r10)
145	pand	112(%r12),%xmm3
146	por	%xmm2,%xmm0
147	por	%xmm3,%xmm1
148	movdqa	-128(%r12),%xmm4
149	movdqa	-112(%r12),%xmm5
150	movdqa	-96(%r12),%xmm2
151	pand	112(%r10),%xmm4
152	movdqa	-80(%r12),%xmm3
153	pand	128(%r10),%xmm5
154	por	%xmm4,%xmm0
155	pand	144(%r10),%xmm2
156	por	%xmm5,%xmm1
157	pand	160(%r10),%xmm3
158	por	%xmm2,%xmm0
159	por	%xmm3,%xmm1
160	movdqa	-64(%r12),%xmm4
161	movdqa	-48(%r12),%xmm5
162	movdqa	-32(%r12),%xmm2
163	pand	176(%r10),%xmm4
164	movdqa	-16(%r12),%xmm3
165	pand	192(%r10),%xmm5
166	por	%xmm4,%xmm0
167	pand	208(%r10),%xmm2
168	por	%xmm5,%xmm1
169	pand	224(%r10),%xmm3
170	por	%xmm2,%xmm0
171	por	%xmm3,%xmm1
172	movdqa	0(%r12),%xmm4
173	movdqa	16(%r12),%xmm5
174	movdqa	32(%r12),%xmm2
175	pand	240(%r10),%xmm4
176	movdqa	48(%r12),%xmm3
177	pand	256(%r10),%xmm5
178	por	%xmm4,%xmm0
179	pand	272(%r10),%xmm2
180	por	%xmm5,%xmm1
181	pand	288(%r10),%xmm3
182	por	%xmm2,%xmm0
183	por	%xmm3,%xmm1
184	por	%xmm1,%xmm0
/* Fold high qword onto low qword: %xmm0 low 64 bits = selected b[] word. */
185	pshufd	$0x4e,%xmm0,%xmm1
186	por	%xmm1,%xmm0
187	leaq	256(%r12),%r12
/* 66 48 0F 7E C3 = movq %xmm0,%rbx (hand-encoded by perlasm). */
188.byte	102,72,15,126,195
189
/* %r8 = n0 value; %rbx = gathered multiplier word b[0]. */
190	movq	(%r8),%r8
191	movq	(%rsi),%rax
192
/* %r14 = outer index, %r15 = inner index. */
193	xorq	%r14,%r14
194	xorq	%r15,%r15
195
/* First outer iteration: tp = ap * b[0], interleaved with Montgomery
 * reduction by m = tp[0]*n0 mod 2^64 (classic word-serial CIOS form). */
196	movq	%r8,%rbp
197	mulq	%rbx
198	movq	%rax,%r10
199	movq	(%rcx),%rax
200
201	imulq	%r10,%rbp
202	movq	%rdx,%r11
203
204	mulq	%rbp
205	addq	%rax,%r10
206	movq	8(%rsi),%rax
207	adcq	$0,%rdx
208	movq	%rdx,%r13
209
210	leaq	1(%r15),%r15
211	jmp	.L1st_enter
212
213.align	16
214.L1st:
215	addq	%rax,%r13
216	movq	(%rsi,%r15,8),%rax
217	adcq	$0,%rdx
218	addq	%r11,%r13
219	movq	%r10,%r11
220	adcq	$0,%rdx
221	movq	%r13,-16(%rsp,%r15,8)
222	movq	%rdx,%r13
223
224.L1st_enter:
225	mulq	%rbx
226	addq	%rax,%r11
227	movq	(%rcx,%r15,8),%rax
228	adcq	$0,%rdx
229	leaq	1(%r15),%r15
230	movq	%rdx,%r10
231
232	mulq	%rbp
233	cmpq	%r9,%r15
234	jne	.L1st
235
236
/* Flush the last two limbs and the carry word of the first pass. */
237	addq	%rax,%r13
238	adcq	$0,%rdx
239	addq	%r11,%r13
240	adcq	$0,%rdx
241	movq	%r13,-16(%rsp,%r9,8)
242	movq	%rdx,%r13
243	movq	%r10,%r11
244
245	xorq	%rdx,%rdx
246	addq	%r11,%r13
247	adcq	$0,%rdx
248	movq	%r13,-8(%rsp,%r9,8)
249	movq	%rdx,(%rsp,%r9,8)
250
251	leaq	1(%r14),%r14
252	jmp	.Louter
253.align	16
/* Outer loop: gather b[i] for the next column (same constant-time mask
 * scheme; masks were left in the stack frame) and accumulate into tp[]. */
254.Louter:
255	leaq	24+128(%rsp,%r9,8),%rdx
256	andq	$-16,%rdx
257	pxor	%xmm4,%xmm4
258	pxor	%xmm5,%xmm5
259	movdqa	-128(%r12),%xmm0
260	movdqa	-112(%r12),%xmm1
261	movdqa	-96(%r12),%xmm2
262	movdqa	-80(%r12),%xmm3
263	pand	-128(%rdx),%xmm0
264	pand	-112(%rdx),%xmm1
265	por	%xmm0,%xmm4
266	pand	-96(%rdx),%xmm2
267	por	%xmm1,%xmm5
268	pand	-80(%rdx),%xmm3
269	por	%xmm2,%xmm4
270	por	%xmm3,%xmm5
271	movdqa	-64(%r12),%xmm0
272	movdqa	-48(%r12),%xmm1
273	movdqa	-32(%r12),%xmm2
274	movdqa	-16(%r12),%xmm3
275	pand	-64(%rdx),%xmm0
276	pand	-48(%rdx),%xmm1
277	por	%xmm0,%xmm4
278	pand	-32(%rdx),%xmm2
279	por	%xmm1,%xmm5
280	pand	-16(%rdx),%xmm3
281	por	%xmm2,%xmm4
282	por	%xmm3,%xmm5
283	movdqa	0(%r12),%xmm0
284	movdqa	16(%r12),%xmm1
285	movdqa	32(%r12),%xmm2
286	movdqa	48(%r12),%xmm3
287	pand	0(%rdx),%xmm0
288	pand	16(%rdx),%xmm1
289	por	%xmm0,%xmm4
290	pand	32(%rdx),%xmm2
291	por	%xmm1,%xmm5
292	pand	48(%rdx),%xmm3
293	por	%xmm2,%xmm4
294	por	%xmm3,%xmm5
295	movdqa	64(%r12),%xmm0
296	movdqa	80(%r12),%xmm1
297	movdqa	96(%r12),%xmm2
298	movdqa	112(%r12),%xmm3
299	pand	64(%rdx),%xmm0
300	pand	80(%rdx),%xmm1
301	por	%xmm0,%xmm4
302	pand	96(%rdx),%xmm2
303	por	%xmm1,%xmm5
304	pand	112(%rdx),%xmm3
305	por	%xmm2,%xmm4
306	por	%xmm3,%xmm5
307	por	%xmm5,%xmm4
308	pshufd	$0x4e,%xmm4,%xmm0
309	por	%xmm4,%xmm0
310	leaq	256(%r12),%r12
311
312	movq	(%rsi),%rax
/* movq %xmm0,%rbx — next multiplier word b[i]. */
313.byte	102,72,15,126,195
314
315	xorq	%r15,%r15
316	movq	%r8,%rbp
317	movq	(%rsp),%r10
318
/* tp += ap*b[i] + m*np, one limb per inner iteration, then shift down. */
319	mulq	%rbx
320	addq	%rax,%r10
321	movq	(%rcx),%rax
322	adcq	$0,%rdx
323
324	imulq	%r10,%rbp
325	movq	%rdx,%r11
326
327	mulq	%rbp
328	addq	%rax,%r10
329	movq	8(%rsi),%rax
330	adcq	$0,%rdx
331	movq	8(%rsp),%r10
332	movq	%rdx,%r13
333
334	leaq	1(%r15),%r15
335	jmp	.Linner_enter
336
337.align	16
338.Linner:
339	addq	%rax,%r13
340	movq	(%rsi,%r15,8),%rax
341	adcq	$0,%rdx
342	addq	%r10,%r13
343	movq	(%rsp,%r15,8),%r10
344	adcq	$0,%rdx
345	movq	%r13,-16(%rsp,%r15,8)
346	movq	%rdx,%r13
347
348.Linner_enter:
349	mulq	%rbx
350	addq	%rax,%r11
351	movq	(%rcx,%r15,8),%rax
352	adcq	$0,%rdx
353	addq	%r11,%r10
354	movq	%rdx,%r11
355	adcq	$0,%r11
356	leaq	1(%r15),%r15
357
358	mulq	%rbp
359	cmpq	%r9,%r15
360	jne	.Linner
361
362	addq	%rax,%r13
363	adcq	$0,%rdx
364	addq	%r10,%r13
365	movq	(%rsp,%r9,8),%r10
366	adcq	$0,%rdx
367	movq	%r13,-16(%rsp,%r9,8)
368	movq	%rdx,%r13
369
370	xorq	%rdx,%rdx
371	addq	%r11,%r13
372	adcq	$0,%rdx
373	addq	%r10,%r13
374	adcq	$0,%rdx
375	movq	%r13,-8(%rsp,%r9,8)
376	movq	%rdx,(%rsp,%r9,8)
377
378	leaq	1(%r14),%r14
379	cmpq	%r9,%r14
380	jb	.Louter
381
/* Final conditional subtraction: compute tp - np unconditionally ... */
382	xorq	%r14,%r14
383	movq	(%rsp),%rax
384	leaq	(%rsp),%rsi
385	movq	%r9,%r15
386	jmp	.Lsub
387.align	16
388.Lsub:	sbbq	(%rcx,%r14,8),%rax
389	movq	%rax,(%rdi,%r14,8)
390	movq	8(%rsi,%r14,8),%rax
391	leaq	1(%r14),%r14
392	decq	%r15
393	jnz	.Lsub
394
/* ... then select, branch-free: after sbb, %rax is all-ones when the
 * subtraction borrowed (tp < np, keep tp) and 0 otherwise; %rbx = ~%rax. */
395	sbbq	$0,%rax
396	movq	$-1,%rbx
397	xorq	%rax,%rbx
398	xorq	%r14,%r14
399	movq	%r9,%r15
400
/* Masked copy into rp[]; also scrubs the secret scratch limb by storing
 * the loop index over it. */
401.Lcopy:
402	movq	(%rdi,%r14,8),%rcx
403	movq	(%rsp,%r14,8),%rdx
404	andq	%rbx,%rcx
405	andq	%rax,%rdx
406	movq	%r14,(%rsp,%r14,8)
407	orq	%rcx,%rdx
408	movq	%rdx,(%rdi,%r14,8)
409	leaq	1(%r14),%r14
410	subq	$1,%r15
411	jnz	.Lcopy
412
/* Restore caller frame (saved at .Lmul_body time) and return 1. */
413	movq	8(%rsp,%r9,8),%rsi
414	movq	$1,%rax
415
416	movq	-48(%rsi),%r15
417	movq	-40(%rsi),%r14
418	movq	-32(%rsi),%r13
419	movq	-24(%rsi),%r12
420	movq	-16(%rsi),%rbp
421	movq	-8(%rsi),%rbx
422	leaq	(%rsi),%rsp
423.Lmul_epilogue:
/* F3 C3 = rep ret (branch-predictor-friendly return encoding). */
424	.byte	0xf3,0xc3
425.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
/*
 * bn_mul4x_mont_gather5 — 4x-unrolled variant of bn_mul_mont_gather5,
 * taken when num is a multiple of 8.  Same argument layout as the scalar
 * entry.  Entered either directly or via the tail-jump at .Lmul4x_enter,
 * which requires %r11d to hold OPENSSL_ia32cap_P[2] (loaded by the caller
 * path at bn_mul_mont_gather5).  Sets up an aligned scratch frame and
 * delegates the arithmetic to mul4x_internal.
 */
426.type	bn_mul4x_mont_gather5,@function
427.align	32
428bn_mul4x_mont_gather5:
429.byte	0x67
430	movq	%rsp,%rax
.Lmul4x_enter-marker: dispatch to the MULX/ADX implementation when both
431.Lmul4x_enter:
/* 0x80108 — capability bits gating the mulx4x path (presumably BMI2/ADX;
 * confirm against OPENSSL_ia32cap(3)).  .Lmulx4x_enter is defined
 * elsewhere in this file. */
432	andl	$0x80108,%r11d
433	cmpl	$0x80108,%r11d
434	je	.Lmulx4x_enter
435	pushq	%rbx
436	pushq	%rbp
437	pushq	%r12
438	pushq	%r13
439	pushq	%r14
440	pushq	%r15
441.Lmul4x_prologue:
442
443.byte	0x67
/* num in bytes; %r10 = 3*num*8, used to size/position the frame. */
444	shll	$3,%r9d
445	leaq	(%r9,%r9,2),%r10
446	negq	%r9
447
448
449
450
451
452
453
454
455
456
/* Choose a 64-byte-aligned frame whose distance from rp modulo 4096
 * avoids cache-bank aliasing between the scratch area and the output. */
457	leaq	-320(%rsp,%r9,2),%r11
458	movq	%rsp,%rbp
459	subq	%rdi,%r11
460	andq	$4095,%r11
461	cmpq	%r11,%r10
462	jb	.Lmul4xsp_alt
463	subq	%r11,%rbp
464	leaq	-320(%rbp,%r9,2),%rbp
465	jmp	.Lmul4xsp_done
466
467.align	32
468.Lmul4xsp_alt:
469	leaq	4096-320(,%r9,2),%r10
470	leaq	-320(%rbp,%r9,2),%rbp
471	subq	%r10,%r11
472	movq	$0,%r10
473	cmovcq	%r10,%r11
474	subq	%r11,%rbp
475.Lmul4xsp_done:
476	andq	$-64,%rbp
477	movq	%rsp,%r11
478	subq	%rbp,%r11
479	andq	$-4096,%r11
480	leaq	(%r11,%rbp,1),%rsp
481	movq	(%rsp),%r10
482	cmpq	%rbp,%rsp
483	ja	.Lmul4x_page_walk
484	jmp	.Lmul4x_page_walk_done
485
/* Probe each new 4 KiB stack page in order (guard-page safety). */
486.Lmul4x_page_walk:
487	leaq	-4096(%rsp),%rsp
488	movq	(%rsp),%r10
489	cmpq	%rbp,%rsp
490	ja	.Lmul4x_page_walk
491.Lmul4x_page_walk_done:
492
493	negq	%r9
494
/* 40(%rsp): caller's original %rsp, restored below. */
495	movq	%rax,40(%rsp)
496.Lmul4x_body:
497
498	call	mul4x_internal
499
500	movq	40(%rsp),%rsi
501	movq	$1,%rax
502
/* Restore callee-saved registers from the caller frame and return 1. */
503	movq	-48(%rsi),%r15
504	movq	-40(%rsi),%r14
505	movq	-32(%rsi),%r13
506	movq	-24(%rsi),%r12
507	movq	-16(%rsi),%rbp
508	movq	-8(%rsi),%rbx
509	leaq	(%rsi),%rsp
510.Lmul4x_epilogue:
511	.byte	0xf3,0xc3
512.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
513
/*
 * mul4x_internal — body of the 4x-unrolled Montgomery multiply, shared by
 * bn_mul4x_mont_gather5 and bn_power5.  Not a public entry point; it is
 * entered with the scratch frame already set up and expects:
 *   rdi = rp, rsi = ap, rdx = scattered b[] table, rcx = np, r8 = &n0,
 *   r9  = -num*8 (negated byte count, from the callers' negq),
 *   rax = caller's original %rsp, so 8(%rax) is the original 7th stack
 *         argument (the gather index "power").
 * NOTE(review): the shlq/shrq $5 dance on %r9 around the %r13 bound
 * computation mirrors x86_64-mont5.pl; confirm exact num encoding there.
 * Ends by tail-jumping to .Lsqr4x_sub_entry (defined elsewhere in this
 * file), the shared branch-free final-subtraction tail.
 */
514.type	mul4x_internal,@function
515.align	32
516mul4x_internal:
517	shlq	$5,%r9
/* Gather index from the original caller frame. */
518	movd	8(%rax),%xmm5
519	leaq	.Linc(%rip),%rax
/* %r13 = end-of-table sentinel; the outer loop below runs until %r12
 * (the walking table pointer) reaches it. */
520	leaq	128(%rdx,%r9,1),%r13
521	shrq	$5,%r9
522	movdqa	0(%rax),%xmm0
523	movdqa	16(%rax),%xmm1
524	leaq	88-112(%rsp,%r9,1),%r10
525	leaq	128(%rdx),%r12
526
/* Same constant-time mask construction as in bn_mul_mont_gather5:
 * broadcast the index, build 16 pcmpeqd masks, park them at
 * 112..352(%r10). */
527	pshufd	$0,%xmm5,%xmm5
528	movdqa	%xmm1,%xmm4
/* 0x67 bytes: benign address-size prefixes used as alignment padding. */
529.byte	0x67,0x67
530	movdqa	%xmm1,%xmm2
531	paddd	%xmm0,%xmm1
532	pcmpeqd	%xmm5,%xmm0
533.byte	0x67
534	movdqa	%xmm4,%xmm3
535	paddd	%xmm1,%xmm2
536	pcmpeqd	%xmm5,%xmm1
537	movdqa	%xmm0,112(%r10)
538	movdqa	%xmm4,%xmm0
539
540	paddd	%xmm2,%xmm3
541	pcmpeqd	%xmm5,%xmm2
542	movdqa	%xmm1,128(%r10)
543	movdqa	%xmm4,%xmm1
544
545	paddd	%xmm3,%xmm0
546	pcmpeqd	%xmm5,%xmm3
547	movdqa	%xmm2,144(%r10)
548	movdqa	%xmm4,%xmm2
549
550	paddd	%xmm0,%xmm1
551	pcmpeqd	%xmm5,%xmm0
552	movdqa	%xmm3,160(%r10)
553	movdqa	%xmm4,%xmm3
554	paddd	%xmm1,%xmm2
555	pcmpeqd	%xmm5,%xmm1
556	movdqa	%xmm0,176(%r10)
557	movdqa	%xmm4,%xmm0
558
559	paddd	%xmm2,%xmm3
560	pcmpeqd	%xmm5,%xmm2
561	movdqa	%xmm1,192(%r10)
562	movdqa	%xmm4,%xmm1
563
564	paddd	%xmm3,%xmm0
565	pcmpeqd	%xmm5,%xmm3
566	movdqa	%xmm2,208(%r10)
567	movdqa	%xmm4,%xmm2
568
569	paddd	%xmm0,%xmm1
570	pcmpeqd	%xmm5,%xmm0
571	movdqa	%xmm3,224(%r10)
572	movdqa	%xmm4,%xmm3
573	paddd	%xmm1,%xmm2
574	pcmpeqd	%xmm5,%xmm1
575	movdqa	%xmm0,240(%r10)
576	movdqa	%xmm4,%xmm0
577
578	paddd	%xmm2,%xmm3
579	pcmpeqd	%xmm5,%xmm2
580	movdqa	%xmm1,256(%r10)
581	movdqa	%xmm4,%xmm1
582
583	paddd	%xmm3,%xmm0
584	pcmpeqd	%xmm5,%xmm3
585	movdqa	%xmm2,272(%r10)
586	movdqa	%xmm4,%xmm2
587
588	paddd	%xmm0,%xmm1
589	pcmpeqd	%xmm5,%xmm0
590	movdqa	%xmm3,288(%r10)
591	movdqa	%xmm4,%xmm3
592	paddd	%xmm1,%xmm2
593	pcmpeqd	%xmm5,%xmm1
594	movdqa	%xmm0,304(%r10)
595
596	paddd	%xmm2,%xmm3
597.byte	0x67
598	pcmpeqd	%xmm5,%xmm2
599	movdqa	%xmm1,320(%r10)
600
601	pcmpeqd	%xmm5,%xmm3
602	movdqa	%xmm2,336(%r10)
/* Constant-time gather of b[0]: read every lane of the 256-byte line,
 * AND with masks, OR together. */
603	pand	64(%r12),%xmm0
604
605	pand	80(%r12),%xmm1
606	pand	96(%r12),%xmm2
607	movdqa	%xmm3,352(%r10)
608	pand	112(%r12),%xmm3
609	por	%xmm2,%xmm0
610	por	%xmm3,%xmm1
611	movdqa	-128(%r12),%xmm4
612	movdqa	-112(%r12),%xmm5
613	movdqa	-96(%r12),%xmm2
614	pand	112(%r10),%xmm4
615	movdqa	-80(%r12),%xmm3
616	pand	128(%r10),%xmm5
617	por	%xmm4,%xmm0
618	pand	144(%r10),%xmm2
619	por	%xmm5,%xmm1
620	pand	160(%r10),%xmm3
621	por	%xmm2,%xmm0
622	por	%xmm3,%xmm1
623	movdqa	-64(%r12),%xmm4
624	movdqa	-48(%r12),%xmm5
625	movdqa	-32(%r12),%xmm2
626	pand	176(%r10),%xmm4
627	movdqa	-16(%r12),%xmm3
628	pand	192(%r10),%xmm5
629	por	%xmm4,%xmm0
630	pand	208(%r10),%xmm2
631	por	%xmm5,%xmm1
632	pand	224(%r10),%xmm3
633	por	%xmm2,%xmm0
634	por	%xmm3,%xmm1
635	movdqa	0(%r12),%xmm4
636	movdqa	16(%r12),%xmm5
637	movdqa	32(%r12),%xmm2
638	pand	240(%r10),%xmm4
639	movdqa	48(%r12),%xmm3
640	pand	256(%r10),%xmm5
641	por	%xmm4,%xmm0
642	pand	272(%r10),%xmm2
643	por	%xmm5,%xmm1
644	pand	288(%r10),%xmm3
645	por	%xmm2,%xmm0
646	por	%xmm3,%xmm1
647	por	%xmm1,%xmm0
648	pshufd	$0x4e,%xmm0,%xmm1
649	por	%xmm1,%xmm0
650	leaq	256(%r12),%r12
/* 66 48 0F 7E C3 = movq %xmm0,%rbx (gathered multiplier word). */
651.byte	102,72,15,126,195
652
/* Stash the table-end sentinel and rp for the tail code. */
653	movq	%r13,16+8(%rsp)
654	movq	%rdi,56+8(%rsp)
655
656	movq	(%r8),%r8
657	movq	(%rsi),%rax
/* Bias ap so (%rsi,%r9,1) addresses limb 0 with negative %r9 counting up. */
658	leaq	(%rsi,%r9,1),%rsi
659	negq	%r9
660
/* First outer pass: tp = ap*b[0] with interleaved reduction word m
 * (%rbp) = tp[0]*n0; 4 limbs per .L1st4x iteration.  %r14 walks tp,
 * %rdi doubles as a limb accumulator here. */
661	movq	%r8,%rbp
662	mulq	%rbx
663	movq	%rax,%r10
664	movq	(%rcx),%rax
665
666	imulq	%r10,%rbp
667	leaq	64+8(%rsp),%r14
668	movq	%rdx,%r11
669
670	mulq	%rbp
671	addq	%rax,%r10
672	movq	8(%rsi,%r9,1),%rax
673	adcq	$0,%rdx
674	movq	%rdx,%rdi
675
676	mulq	%rbx
677	addq	%rax,%r11
678	movq	8(%rcx),%rax
679	adcq	$0,%rdx
680	movq	%rdx,%r10
681
682	mulq	%rbp
683	addq	%rax,%rdi
684	movq	16(%rsi,%r9,1),%rax
685	adcq	$0,%rdx
686	addq	%r11,%rdi
687	leaq	32(%r9),%r15
688	leaq	32(%rcx),%rcx
689	adcq	$0,%rdx
690	movq	%rdi,(%r14)
691	movq	%rdx,%r13
692	jmp	.L1st4x
693
694.align	32
695.L1st4x:
696	mulq	%rbx
697	addq	%rax,%r10
698	movq	-16(%rcx),%rax
699	leaq	32(%r14),%r14
700	adcq	$0,%rdx
701	movq	%rdx,%r11
702
703	mulq	%rbp
704	addq	%rax,%r13
705	movq	-8(%rsi,%r15,1),%rax
706	adcq	$0,%rdx
707	addq	%r10,%r13
708	adcq	$0,%rdx
709	movq	%r13,-24(%r14)
710	movq	%rdx,%rdi
711
712	mulq	%rbx
713	addq	%rax,%r11
714	movq	-8(%rcx),%rax
715	adcq	$0,%rdx
716	movq	%rdx,%r10
717
718	mulq	%rbp
719	addq	%rax,%rdi
720	movq	(%rsi,%r15,1),%rax
721	adcq	$0,%rdx
722	addq	%r11,%rdi
723	adcq	$0,%rdx
724	movq	%rdi,-16(%r14)
725	movq	%rdx,%r13
726
727	mulq	%rbx
728	addq	%rax,%r10
729	movq	0(%rcx),%rax
730	adcq	$0,%rdx
731	movq	%rdx,%r11
732
733	mulq	%rbp
734	addq	%rax,%r13
735	movq	8(%rsi,%r15,1),%rax
736	adcq	$0,%rdx
737	addq	%r10,%r13
738	adcq	$0,%rdx
739	movq	%r13,-8(%r14)
740	movq	%rdx,%rdi
741
742	mulq	%rbx
743	addq	%rax,%r11
744	movq	8(%rcx),%rax
745	adcq	$0,%rdx
746	movq	%rdx,%r10
747
748	mulq	%rbp
749	addq	%rax,%rdi
750	movq	16(%rsi,%r15,1),%rax
751	adcq	$0,%rdx
752	addq	%r11,%rdi
753	leaq	32(%rcx),%rcx
754	adcq	$0,%rdx
755	movq	%rdi,(%r14)
756	movq	%rdx,%r13
757
758	addq	$32,%r15
759	jnz	.L1st4x
760
/* Epilogue of the first pass: last 4 limbs outside the loop. */
761	mulq	%rbx
762	addq	%rax,%r10
763	movq	-16(%rcx),%rax
764	leaq	32(%r14),%r14
765	adcq	$0,%rdx
766	movq	%rdx,%r11
767
768	mulq	%rbp
769	addq	%rax,%r13
770	movq	-8(%rsi),%rax
771	adcq	$0,%rdx
772	addq	%r10,%r13
773	adcq	$0,%rdx
774	movq	%r13,-24(%r14)
775	movq	%rdx,%rdi
776
777	mulq	%rbx
778	addq	%rax,%r11
779	movq	-8(%rcx),%rax
780	adcq	$0,%rdx
781	movq	%rdx,%r10
782
783	mulq	%rbp
784	addq	%rax,%rdi
785	movq	(%rsi,%r9,1),%rax
786	adcq	$0,%rdx
787	addq	%r11,%rdi
788	adcq	$0,%rdx
789	movq	%rdi,-16(%r14)
790	movq	%rdx,%r13
791
/* Rewind np to its start for the next outer iteration. */
792	leaq	(%rcx,%r9,1),%rcx
793
794	xorq	%rdi,%rdi
795	addq	%r10,%r13
796	adcq	$0,%rdi
797	movq	%r13,-8(%r14)
798
799	jmp	.Louter4x
800
801.align	32
/* Outer loop: constant-time gather of the next b[] word (masks are read
 * back from the frame via %rdx), then tp += ap*b[i] + m*np. */
802.Louter4x:
803	leaq	16+128(%r14),%rdx
804	pxor	%xmm4,%xmm4
805	pxor	%xmm5,%xmm5
806	movdqa	-128(%r12),%xmm0
807	movdqa	-112(%r12),%xmm1
808	movdqa	-96(%r12),%xmm2
809	movdqa	-80(%r12),%xmm3
810	pand	-128(%rdx),%xmm0
811	pand	-112(%rdx),%xmm1
812	por	%xmm0,%xmm4
813	pand	-96(%rdx),%xmm2
814	por	%xmm1,%xmm5
815	pand	-80(%rdx),%xmm3
816	por	%xmm2,%xmm4
817	por	%xmm3,%xmm5
818	movdqa	-64(%r12),%xmm0
819	movdqa	-48(%r12),%xmm1
820	movdqa	-32(%r12),%xmm2
821	movdqa	-16(%r12),%xmm3
822	pand	-64(%rdx),%xmm0
823	pand	-48(%rdx),%xmm1
824	por	%xmm0,%xmm4
825	pand	-32(%rdx),%xmm2
826	por	%xmm1,%xmm5
827	pand	-16(%rdx),%xmm3
828	por	%xmm2,%xmm4
829	por	%xmm3,%xmm5
830	movdqa	0(%r12),%xmm0
831	movdqa	16(%r12),%xmm1
832	movdqa	32(%r12),%xmm2
833	movdqa	48(%r12),%xmm3
834	pand	0(%rdx),%xmm0
835	pand	16(%rdx),%xmm1
836	por	%xmm0,%xmm4
837	pand	32(%rdx),%xmm2
838	por	%xmm1,%xmm5
839	pand	48(%rdx),%xmm3
840	por	%xmm2,%xmm4
841	por	%xmm3,%xmm5
842	movdqa	64(%r12),%xmm0
843	movdqa	80(%r12),%xmm1
844	movdqa	96(%r12),%xmm2
845	movdqa	112(%r12),%xmm3
846	pand	64(%rdx),%xmm0
847	pand	80(%rdx),%xmm1
848	por	%xmm0,%xmm4
849	pand	96(%rdx),%xmm2
850	por	%xmm1,%xmm5
851	pand	112(%rdx),%xmm3
852	por	%xmm2,%xmm4
853	por	%xmm3,%xmm5
854	por	%xmm5,%xmm4
855	pshufd	$0x4e,%xmm4,%xmm0
856	por	%xmm4,%xmm0
857	leaq	256(%r12),%r12
/* movq %xmm0,%rbx — next multiplier word. */
858.byte	102,72,15,126,195
859
860	movq	(%r14,%r9,1),%r10
861	movq	%r8,%rbp
862	mulq	%rbx
863	addq	%rax,%r10
864	movq	(%rcx),%rax
865	adcq	$0,%rdx
866
867	imulq	%r10,%rbp
868	movq	%rdx,%r11
/* Flush the carry word of the previous pass into tp. */
869	movq	%rdi,(%r14)
870
871	leaq	(%r14,%r9,1),%r14
872
873	mulq	%rbp
874	addq	%rax,%r10
875	movq	8(%rsi,%r9,1),%rax
876	adcq	$0,%rdx
877	movq	%rdx,%rdi
878
879	mulq	%rbx
880	addq	%rax,%r11
881	movq	8(%rcx),%rax
882	adcq	$0,%rdx
883	addq	8(%r14),%r11
884	adcq	$0,%rdx
885	movq	%rdx,%r10
886
887	mulq	%rbp
888	addq	%rax,%rdi
889	movq	16(%rsi,%r9,1),%rax
890	adcq	$0,%rdx
891	addq	%r11,%rdi
892	leaq	32(%r9),%r15
893	leaq	32(%rcx),%rcx
894	adcq	$0,%rdx
895	movq	%rdx,%r13
896	jmp	.Linner4x
897
898.align	32
899.Linner4x:
900	mulq	%rbx
901	addq	%rax,%r10
902	movq	-16(%rcx),%rax
903	adcq	$0,%rdx
904	addq	16(%r14),%r10
905	leaq	32(%r14),%r14
906	adcq	$0,%rdx
907	movq	%rdx,%r11
908
909	mulq	%rbp
910	addq	%rax,%r13
911	movq	-8(%rsi,%r15,1),%rax
912	adcq	$0,%rdx
913	addq	%r10,%r13
914	adcq	$0,%rdx
915	movq	%rdi,-32(%r14)
916	movq	%rdx,%rdi
917
918	mulq	%rbx
919	addq	%rax,%r11
920	movq	-8(%rcx),%rax
921	adcq	$0,%rdx
922	addq	-8(%r14),%r11
923	adcq	$0,%rdx
924	movq	%rdx,%r10
925
926	mulq	%rbp
927	addq	%rax,%rdi
928	movq	(%rsi,%r15,1),%rax
929	adcq	$0,%rdx
930	addq	%r11,%rdi
931	adcq	$0,%rdx
932	movq	%r13,-24(%r14)
933	movq	%rdx,%r13
934
935	mulq	%rbx
936	addq	%rax,%r10
937	movq	0(%rcx),%rax
938	adcq	$0,%rdx
939	addq	(%r14),%r10
940	adcq	$0,%rdx
941	movq	%rdx,%r11
942
943	mulq	%rbp
944	addq	%rax,%r13
945	movq	8(%rsi,%r15,1),%rax
946	adcq	$0,%rdx
947	addq	%r10,%r13
948	adcq	$0,%rdx
949	movq	%rdi,-16(%r14)
950	movq	%rdx,%rdi
951
952	mulq	%rbx
953	addq	%rax,%r11
954	movq	8(%rcx),%rax
955	adcq	$0,%rdx
956	addq	8(%r14),%r11
957	adcq	$0,%rdx
958	movq	%rdx,%r10
959
960	mulq	%rbp
961	addq	%rax,%rdi
962	movq	16(%rsi,%r15,1),%rax
963	adcq	$0,%rdx
964	addq	%r11,%rdi
965	leaq	32(%rcx),%rcx
966	adcq	$0,%rdx
967	movq	%r13,-8(%r14)
968	movq	%rdx,%r13
969
970	addq	$32,%r15
971	jnz	.Linner4x
972
/* Inner-loop epilogue: last 4 limbs; note %rbp (reduction word m) is
 * briefly swapped through %rax for the final np multiply. */
973	mulq	%rbx
974	addq	%rax,%r10
975	movq	-16(%rcx),%rax
976	adcq	$0,%rdx
977	addq	16(%r14),%r10
978	leaq	32(%r14),%r14
979	adcq	$0,%rdx
980	movq	%rdx,%r11
981
982	mulq	%rbp
983	addq	%rax,%r13
984	movq	-8(%rsi),%rax
985	adcq	$0,%rdx
986	addq	%r10,%r13
987	adcq	$0,%rdx
988	movq	%rdi,-32(%r14)
989	movq	%rdx,%rdi
990
991	mulq	%rbx
992	addq	%rax,%r11
993	movq	%rbp,%rax
994	movq	-8(%rcx),%rbp
995	adcq	$0,%rdx
996	addq	-8(%r14),%r11
997	adcq	$0,%rdx
998	movq	%rdx,%r10
999
1000	mulq	%rbp
1001	addq	%rax,%rdi
1002	movq	(%rsi,%r9,1),%rax
1003	adcq	$0,%rdx
1004	addq	%r11,%rdi
1005	adcq	$0,%rdx
1006	movq	%r13,-24(%r14)
1007	movq	%rdx,%r13
1008
1009	movq	%rdi,-16(%r14)
1010	leaq	(%rcx,%r9,1),%rcx
1011
1012	xorq	%rdi,%rdi
1013	addq	%r10,%r13
1014	adcq	$0,%rdi
1015	addq	(%r14),%r13
1016	adcq	$0,%rdi
1017	movq	%r13,-8(%r14)
1018
/* Loop until the walking table pointer hits the sentinel saved above. */
1019	cmpq	16+8(%rsp),%r12
1020	jb	.Louter4x
/* Set up the borrow mask and operands for the shared constant-time
 * final subtraction, then tail-jump into it (.Lsqr4x_sub_entry is
 * defined elsewhere in this file). */
1021	xorq	%rax,%rax
1022	subq	%r13,%rbp
1023	adcq	%r15,%r15
1024	orq	%r15,%rdi
1025	subq	%rdi,%rax
1026	leaq	(%r14,%r9,1),%rbx
1027	movq	(%rcx),%r12
1028	leaq	(%rcx),%rbp
/* %rcx = num/32: the subtraction tail processes 4 limbs per iteration. */
1029	movq	%r9,%rcx
1030	sarq	$3+2,%rcx
1031	movq	56+8(%rsp),%rdi
1032	decq	%r12
1033	xorq	%r10,%r10
1034	movq	8(%rbp),%r13
1035	movq	16(%rbp),%r14
1036	movq	24(%rbp),%r15
1037	jmp	.Lsqr4x_sub_entry
1038.size	mul4x_internal,.-mul4x_internal
/*
 * bn_power5(rp, ap, table, np, n0, num, power)
 *
 * Computes five successive Montgomery squarings of ap followed by one
 * Montgomery multiplication by the constant-time-gathered table entry —
 * i.e. the a^32 * b step of a fixed-window (window = 5 bits)
 * exponentiation; presumably used by BN_mod_exp_mont_consttime.
 * Same argument layout as bn_mul_mont_gather5.  Dispatches to the
 * MULX/ADX variant (.Lpowerx5_enter, defined elsewhere) when the
 * 0x80108 capability bits are all present.
 */
1039.globl	bn_power5
1040.type	bn_power5,@function
1041.align	32
1042bn_power5:
/* Keep caller's %rsp for the epilogue (saved to 40(%rsp) below). */
1043	movq	%rsp,%rax
1044	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
1045	andl	$0x80108,%r11d
1046	cmpl	$0x80108,%r11d
1047	je	.Lpowerx5_enter
1048	pushq	%rbx
1049	pushq	%rbp
1050	pushq	%r12
1051	pushq	%r13
1052	pushq	%r14
1053	pushq	%r15
1054.Lpower5_prologue:
1055
/* num in bytes; %r10d = 3*num*8 for frame sizing; fetch n0 value now. */
1056	shll	$3,%r9d
1057	leal	(%r9,%r9,2),%r10d
1058	negq	%r9
1059	movq	(%r8),%r8
1060
1061
1062
1063
1064
1065
1066
/* Same rp-relative, 64-byte-aligned frame placement scheme as
 * bn_mul4x_mont_gather5 (avoids aliasing between scratch and output). */
1067	leaq	-320(%rsp,%r9,2),%r11
1068	movq	%rsp,%rbp
1069	subq	%rdi,%r11
1070	andq	$4095,%r11
1071	cmpq	%r11,%r10
1072	jb	.Lpwr_sp_alt
1073	subq	%r11,%rbp
1074	leaq	-320(%rbp,%r9,2),%rbp
1075	jmp	.Lpwr_sp_done
1076
1077.align	32
1078.Lpwr_sp_alt:
1079	leaq	4096-320(,%r9,2),%r10
1080	leaq	-320(%rbp,%r9,2),%rbp
1081	subq	%r10,%r11
1082	movq	$0,%r10
1083	cmovcq	%r10,%r11
1084	subq	%r11,%rbp
1085.Lpwr_sp_done:
1086	andq	$-64,%rbp
1087	movq	%rsp,%r11
1088	subq	%rbp,%r11
1089	andq	$-4096,%r11
1090	leaq	(%r11,%rbp,1),%rsp
1091	movq	(%rsp),%r10
1092	cmpq	%rbp,%rsp
1093	ja	.Lpwr_page_walk
1094	jmp	.Lpwr_page_walk_done
1095
/* Probe each new 4 KiB stack page in order (guard-page safety). */
1096.Lpwr_page_walk:
1097	leaq	-4096(%rsp),%rsp
1098	movq	(%rsp),%r10
1099	cmpq	%rbp,%rsp
1100	ja	.Lpwr_page_walk
1101.Lpwr_page_walk_done:
1102
1103	movq	%r9,%r10
1104	negq	%r9
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
/* Frame slots read back below and by the internal routines:
 * 32(%rsp) = n0 value, 40(%rsp) = caller's original %rsp. */
1115	movq	%r8,32(%rsp)
1116	movq	%rax,40(%rsp)
1117.Lpower5_body:
/* Park pointers/length in xmm regs so they survive the internal calls:
 * 66 48 0F 6E CF = movq %rdi,%xmm1   (rp)
 * 66 48 0F 6E D1 = movq %rcx,%xmm2   (np)
 * 66 4D 0F 6E DA = movq %r10,%xmm3   (num*8)
 * 66 48 0F 6E E2 = movq %rdx,%xmm4   (table) */
1118.byte	102,72,15,110,207
1119.byte	102,72,15,110,209
1120.byte	102,73,15,110,218
1121.byte	102,72,15,110,226
1122
/* Five Montgomery squarings: sqr8x computes the square, post4x folds
 * the reduction (both defined elsewhere in this file). */
1123	call	__bn_sqr8x_internal
1124	call	__bn_post4x_internal
1125	call	__bn_sqr8x_internal
1126	call	__bn_post4x_internal
1127	call	__bn_sqr8x_internal
1128	call	__bn_post4x_internal
1129	call	__bn_sqr8x_internal
1130	call	__bn_post4x_internal
1131	call	__bn_sqr8x_internal
1132	call	__bn_post4x_internal
1133
/* Recover np and the table pointer:
 * 66 48 0F 7E D1 = movq %xmm2,%rcx ; 66 48 0F 7E E2 = movq %xmm4,%rdx */
1134.byte	102,72,15,126,209
1135.byte	102,72,15,126,226
/* Multiply the squared result (now addressed via %rsi) by table[power];
 * %rax must be the original caller %rsp so mul4x_internal can reach the
 * 7th stack argument. */
1136	movq	%rsi,%rdi
1137	movq	40(%rsp),%rax
1138	leaq	32(%rsp),%r8
1139
1140	call	mul4x_internal
1141
/* Restore callee-saved registers from the caller frame and return 1. */
1142	movq	40(%rsp),%rsi
1143	movq	$1,%rax
1144	movq	-48(%rsi),%r15
1145	movq	-40(%rsi),%r14
1146	movq	-32(%rsi),%r13
1147	movq	-24(%rsi),%r12
1148	movq	-16(%rsi),%rbp
1149	movq	-8(%rsi),%rbx
1150	leaq	(%rsi),%rsp
1151.Lpower5_epilogue:
/* F3 C3 = rep ret. */
1152	.byte	0xf3,0xc3
1153.size	bn_power5,.-bn_power5
1155
1156.globl	bn_sqr8x_internal
1157.hidden	bn_sqr8x_internal
1158.type	bn_sqr8x_internal,@function
1159.align	32
1160bn_sqr8x_internal:
1161__bn_sqr8x_internal:
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235	leaq	32(%r10),%rbp
1236	leaq	(%rsi,%r9,1),%rsi
1237
1238	movq	%r9,%rcx
1239
1240
1241	movq	-32(%rsi,%rbp,1),%r14
1242	leaq	48+8(%rsp,%r9,2),%rdi
1243	movq	-24(%rsi,%rbp,1),%rax
1244	leaq	-32(%rdi,%rbp,1),%rdi
1245	movq	-16(%rsi,%rbp,1),%rbx
1246	movq	%rax,%r15
1247
1248	mulq	%r14
1249	movq	%rax,%r10
1250	movq	%rbx,%rax
1251	movq	%rdx,%r11
1252	movq	%r10,-24(%rdi,%rbp,1)
1253
1254	mulq	%r14
1255	addq	%rax,%r11
1256	movq	%rbx,%rax
1257	adcq	$0,%rdx
1258	movq	%r11,-16(%rdi,%rbp,1)
1259	movq	%rdx,%r10
1260
1261
1262	movq	-8(%rsi,%rbp,1),%rbx
1263	mulq	%r15
1264	movq	%rax,%r12
1265	movq	%rbx,%rax
1266	movq	%rdx,%r13
1267
1268	leaq	(%rbp),%rcx
1269	mulq	%r14
1270	addq	%rax,%r10
1271	movq	%rbx,%rax
1272	movq	%rdx,%r11
1273	adcq	$0,%r11
1274	addq	%r12,%r10
1275	adcq	$0,%r11
1276	movq	%r10,-8(%rdi,%rcx,1)
1277	jmp	.Lsqr4x_1st
1278
1279.align	32
1280.Lsqr4x_1st:
1281	movq	(%rsi,%rcx,1),%rbx
1282	mulq	%r15
1283	addq	%rax,%r13
1284	movq	%rbx,%rax
1285	movq	%rdx,%r12
1286	adcq	$0,%r12
1287
1288	mulq	%r14
1289	addq	%rax,%r11
1290	movq	%rbx,%rax
1291	movq	8(%rsi,%rcx,1),%rbx
1292	movq	%rdx,%r10
1293	adcq	$0,%r10
1294	addq	%r13,%r11
1295	adcq	$0,%r10
1296
1297
1298	mulq	%r15
1299	addq	%rax,%r12
1300	movq	%rbx,%rax
1301	movq	%r11,(%rdi,%rcx,1)
1302	movq	%rdx,%r13
1303	adcq	$0,%r13
1304
1305	mulq	%r14
1306	addq	%rax,%r10
1307	movq	%rbx,%rax
1308	movq	16(%rsi,%rcx,1),%rbx
1309	movq	%rdx,%r11
1310	adcq	$0,%r11
1311	addq	%r12,%r10
1312	adcq	$0,%r11
1313
1314	mulq	%r15
1315	addq	%rax,%r13
1316	movq	%rbx,%rax
1317	movq	%r10,8(%rdi,%rcx,1)
1318	movq	%rdx,%r12
1319	adcq	$0,%r12
1320
1321	mulq	%r14
1322	addq	%rax,%r11
1323	movq	%rbx,%rax
1324	movq	24(%rsi,%rcx,1),%rbx
1325	movq	%rdx,%r10
1326	adcq	$0,%r10
1327	addq	%r13,%r11
1328	adcq	$0,%r10
1329
1330
1331	mulq	%r15
1332	addq	%rax,%r12
1333	movq	%rbx,%rax
1334	movq	%r11,16(%rdi,%rcx,1)
1335	movq	%rdx,%r13
1336	adcq	$0,%r13
1337	leaq	32(%rcx),%rcx
1338
1339	mulq	%r14
1340	addq	%rax,%r10
1341	movq	%rbx,%rax
1342	movq	%rdx,%r11
1343	adcq	$0,%r11
1344	addq	%r12,%r10
1345	adcq	$0,%r11
1346	movq	%r10,-8(%rdi,%rcx,1)
1347
1348	cmpq	$0,%rcx
1349	jne	.Lsqr4x_1st
1350
1351	mulq	%r15
1352	addq	%rax,%r13
1353	leaq	16(%rbp),%rbp
1354	adcq	$0,%rdx
1355	addq	%r11,%r13
1356	adcq	$0,%rdx
1357
1358	movq	%r13,(%rdi)
1359	movq	%rdx,%r12
1360	movq	%rdx,8(%rdi)
1361	jmp	.Lsqr4x_outer
1362
1363.align	32
1364.Lsqr4x_outer:
1365	movq	-32(%rsi,%rbp,1),%r14
1366	leaq	48+8(%rsp,%r9,2),%rdi
1367	movq	-24(%rsi,%rbp,1),%rax
1368	leaq	-32(%rdi,%rbp,1),%rdi
1369	movq	-16(%rsi,%rbp,1),%rbx
1370	movq	%rax,%r15
1371
1372	mulq	%r14
1373	movq	-24(%rdi,%rbp,1),%r10
1374	addq	%rax,%r10
1375	movq	%rbx,%rax
1376	adcq	$0,%rdx
1377	movq	%r10,-24(%rdi,%rbp,1)
1378	movq	%rdx,%r11
1379
1380	mulq	%r14
1381	addq	%rax,%r11
1382	movq	%rbx,%rax
1383	adcq	$0,%rdx
1384	addq	-16(%rdi,%rbp,1),%r11
1385	movq	%rdx,%r10
1386	adcq	$0,%r10
1387	movq	%r11,-16(%rdi,%rbp,1)
1388
1389	xorq	%r12,%r12
1390
1391	movq	-8(%rsi,%rbp,1),%rbx
1392	mulq	%r15
1393	addq	%rax,%r12
1394	movq	%rbx,%rax
1395	adcq	$0,%rdx
1396	addq	-8(%rdi,%rbp,1),%r12
1397	movq	%rdx,%r13
1398	adcq	$0,%r13
1399
1400	mulq	%r14
1401	addq	%rax,%r10
1402	movq	%rbx,%rax
1403	adcq	$0,%rdx
1404	addq	%r12,%r10
1405	movq	%rdx,%r11
1406	adcq	$0,%r11
1407	movq	%r10,-8(%rdi,%rbp,1)
1408
1409	leaq	(%rbp),%rcx
1410	jmp	.Lsqr4x_inner
1411
1412.align	32
1413.Lsqr4x_inner:
1414	movq	(%rsi,%rcx,1),%rbx
1415	mulq	%r15
1416	addq	%rax,%r13
1417	movq	%rbx,%rax
1418	movq	%rdx,%r12
1419	adcq	$0,%r12
1420	addq	(%rdi,%rcx,1),%r13
1421	adcq	$0,%r12
1422
1423.byte	0x67
1424	mulq	%r14
1425	addq	%rax,%r11
1426	movq	%rbx,%rax
1427	movq	8(%rsi,%rcx,1),%rbx
1428	movq	%rdx,%r10
1429	adcq	$0,%r10
1430	addq	%r13,%r11
1431	adcq	$0,%r10
1432
1433	mulq	%r15
1434	addq	%rax,%r12
1435	movq	%r11,(%rdi,%rcx,1)
1436	movq	%rbx,%rax
1437	movq	%rdx,%r13
1438	adcq	$0,%r13
1439	addq	8(%rdi,%rcx,1),%r12
1440	leaq	16(%rcx),%rcx
1441	adcq	$0,%r13
1442
1443	mulq	%r14
1444	addq	%rax,%r10
1445	movq	%rbx,%rax
1446	adcq	$0,%rdx
1447	addq	%r12,%r10
1448	movq	%rdx,%r11
1449	adcq	$0,%r11
1450	movq	%r10,-8(%rdi,%rcx,1)
1451
1452	cmpq	$0,%rcx
1453	jne	.Lsqr4x_inner
1454
1455.byte	0x67
1456	mulq	%r15
1457	addq	%rax,%r13
1458	adcq	$0,%rdx
1459	addq	%r11,%r13
1460	adcq	$0,%rdx
1461
1462	movq	%r13,(%rdi)
1463	movq	%rdx,%r12
1464	movq	%rdx,8(%rdi)
1465
1466	addq	$16,%rbp
1467	jnz	.Lsqr4x_outer
1468
1469
1470	movq	-32(%rsi),%r14
1471	leaq	48+8(%rsp,%r9,2),%rdi
1472	movq	-24(%rsi),%rax
1473	leaq	-32(%rdi,%rbp,1),%rdi
1474	movq	-16(%rsi),%rbx
1475	movq	%rax,%r15
1476
1477	mulq	%r14
1478	addq	%rax,%r10
1479	movq	%rbx,%rax
1480	movq	%rdx,%r11
1481	adcq	$0,%r11
1482
1483	mulq	%r14
1484	addq	%rax,%r11
1485	movq	%rbx,%rax
1486	movq	%r10,-24(%rdi)
1487	movq	%rdx,%r10
1488	adcq	$0,%r10
1489	addq	%r13,%r11
1490	movq	-8(%rsi),%rbx
1491	adcq	$0,%r10
1492
1493	mulq	%r15
1494	addq	%rax,%r12
1495	movq	%rbx,%rax
1496	movq	%r11,-16(%rdi)
1497	movq	%rdx,%r13
1498	adcq	$0,%r13
1499
1500	mulq	%r14
1501	addq	%rax,%r10
1502	movq	%rbx,%rax
1503	movq	%rdx,%r11
1504	adcq	$0,%r11
1505	addq	%r12,%r10
1506	adcq	$0,%r11
1507	movq	%r10,-8(%rdi)
1508
1509	mulq	%r15
1510	addq	%rax,%r13
1511	movq	-16(%rsi),%rax
1512	adcq	$0,%rdx
1513	addq	%r11,%r13
1514	adcq	$0,%rdx
1515
1516	movq	%r13,(%rdi)
1517	movq	%rdx,%r12
1518	movq	%rdx,8(%rdi)
1519
1520	mulq	%rbx
1521	addq	$16,%rbp
1522	xorq	%r14,%r14
1523	subq	%r9,%rbp
1524	xorq	%r15,%r15
1525
1526	addq	%r12,%rax
1527	adcq	$0,%rdx
1528	movq	%rax,8(%rdi)
1529	movq	%rdx,16(%rdi)
1530	movq	%r15,24(%rdi)
1531
1532	movq	-16(%rsi,%rbp,1),%rax
1533	leaq	48+8(%rsp),%rdi
1534	xorq	%r10,%r10
1535	movq	8(%rdi),%r11
1536
1537	leaq	(%r14,%r10,2),%r12
1538	shrq	$63,%r10
1539	leaq	(%rcx,%r11,2),%r13
1540	shrq	$63,%r11
1541	orq	%r10,%r13
1542	movq	16(%rdi),%r10
1543	movq	%r11,%r14
1544	mulq	%rax
1545	negq	%r15
1546	movq	24(%rdi),%r11
1547	adcq	%rax,%r12
1548	movq	-8(%rsi,%rbp,1),%rax
1549	movq	%r12,(%rdi)
1550	adcq	%rdx,%r13
1551
1552	leaq	(%r14,%r10,2),%rbx
1553	movq	%r13,8(%rdi)
1554	sbbq	%r15,%r15
1555	shrq	$63,%r10
1556	leaq	(%rcx,%r11,2),%r8
1557	shrq	$63,%r11
1558	orq	%r10,%r8
1559	movq	32(%rdi),%r10
1560	movq	%r11,%r14
1561	mulq	%rax
1562	negq	%r15
1563	movq	40(%rdi),%r11
1564	adcq	%rax,%rbx
1565	movq	0(%rsi,%rbp,1),%rax
1566	movq	%rbx,16(%rdi)
1567	adcq	%rdx,%r8
1568	leaq	16(%rbp),%rbp
1569	movq	%r8,24(%rdi)
1570	sbbq	%r15,%r15
1571	leaq	64(%rdi),%rdi
1572	jmp	.Lsqr4x_shift_n_add
1573
1574.align	32
1575.Lsqr4x_shift_n_add:
1576	leaq	(%r14,%r10,2),%r12
1577	shrq	$63,%r10
1578	leaq	(%rcx,%r11,2),%r13
1579	shrq	$63,%r11
1580	orq	%r10,%r13
1581	movq	-16(%rdi),%r10
1582	movq	%r11,%r14
1583	mulq	%rax
1584	negq	%r15
1585	movq	-8(%rdi),%r11
1586	adcq	%rax,%r12
1587	movq	-8(%rsi,%rbp,1),%rax
1588	movq	%r12,-32(%rdi)
1589	adcq	%rdx,%r13
1590
1591	leaq	(%r14,%r10,2),%rbx
1592	movq	%r13,-24(%rdi)
1593	sbbq	%r15,%r15
1594	shrq	$63,%r10
1595	leaq	(%rcx,%r11,2),%r8
1596	shrq	$63,%r11
1597	orq	%r10,%r8
1598	movq	0(%rdi),%r10
1599	movq	%r11,%r14
1600	mulq	%rax
1601	negq	%r15
1602	movq	8(%rdi),%r11
1603	adcq	%rax,%rbx
1604	movq	0(%rsi,%rbp,1),%rax
1605	movq	%rbx,-16(%rdi)
1606	adcq	%rdx,%r8
1607
1608	leaq	(%r14,%r10,2),%r12
1609	movq	%r8,-8(%rdi)
1610	sbbq	%r15,%r15
1611	shrq	$63,%r10
1612	leaq	(%rcx,%r11,2),%r13
1613	shrq	$63,%r11
1614	orq	%r10,%r13
1615	movq	16(%rdi),%r10
1616	movq	%r11,%r14
1617	mulq	%rax
1618	negq	%r15
1619	movq	24(%rdi),%r11
1620	adcq	%rax,%r12
1621	movq	8(%rsi,%rbp,1),%rax
1622	movq	%r12,0(%rdi)
1623	adcq	%rdx,%r13
1624
1625	leaq	(%r14,%r10,2),%rbx
1626	movq	%r13,8(%rdi)
1627	sbbq	%r15,%r15
1628	shrq	$63,%r10
1629	leaq	(%rcx,%r11,2),%r8
1630	shrq	$63,%r11
1631	orq	%r10,%r8
1632	movq	32(%rdi),%r10
1633	movq	%r11,%r14
1634	mulq	%rax
1635	negq	%r15
1636	movq	40(%rdi),%r11
1637	adcq	%rax,%rbx
1638	movq	16(%rsi,%rbp,1),%rax
1639	movq	%rbx,16(%rdi)
1640	adcq	%rdx,%r8
1641	movq	%r8,24(%rdi)
1642	sbbq	%r15,%r15
1643	leaq	64(%rdi),%rdi
1644	addq	$32,%rbp
1645	jnz	.Lsqr4x_shift_n_add
1646
1647	leaq	(%r14,%r10,2),%r12
1648.byte	0x67
1649	shrq	$63,%r10
1650	leaq	(%rcx,%r11,2),%r13
1651	shrq	$63,%r11
1652	orq	%r10,%r13
1653	movq	-16(%rdi),%r10
1654	movq	%r11,%r14
1655	mulq	%rax
1656	negq	%r15
1657	movq	-8(%rdi),%r11
1658	adcq	%rax,%r12
1659	movq	-8(%rsi),%rax
1660	movq	%r12,-32(%rdi)
1661	adcq	%rdx,%r13
1662
1663	leaq	(%r14,%r10,2),%rbx
1664	movq	%r13,-24(%rdi)
1665	sbbq	%r15,%r15
1666	shrq	$63,%r10
1667	leaq	(%rcx,%r11,2),%r8
1668	shrq	$63,%r11
1669	orq	%r10,%r8
1670	mulq	%rax
1671	negq	%r15
1672	adcq	%rax,%rbx
1673	adcq	%rdx,%r8
1674	movq	%rbx,-16(%rdi)
1675	movq	%r8,-8(%rdi)
1676.byte	102,72,15,126,213
1677__bn_sqr8x_reduction:
1678	xorq	%rax,%rax
1679	leaq	(%r9,%rbp,1),%rcx
1680	leaq	48+8(%rsp,%r9,2),%rdx
1681	movq	%rcx,0+8(%rsp)
1682	leaq	48+8(%rsp,%r9,1),%rdi
1683	movq	%rdx,8+8(%rsp)
1684	negq	%r9
1685	jmp	.L8x_reduction_loop
1686
1687.align	32
1688.L8x_reduction_loop:
1689	leaq	(%rdi,%r9,1),%rdi
1690.byte	0x66
1691	movq	0(%rdi),%rbx
1692	movq	8(%rdi),%r9
1693	movq	16(%rdi),%r10
1694	movq	24(%rdi),%r11
1695	movq	32(%rdi),%r12
1696	movq	40(%rdi),%r13
1697	movq	48(%rdi),%r14
1698	movq	56(%rdi),%r15
1699	movq	%rax,(%rdx)
1700	leaq	64(%rdi),%rdi
1701
1702.byte	0x67
1703	movq	%rbx,%r8
1704	imulq	32+8(%rsp),%rbx
1705	movq	0(%rbp),%rax
1706	movl	$8,%ecx
1707	jmp	.L8x_reduce
1708
1709.align	32
1710.L8x_reduce:
1711	mulq	%rbx
1712	movq	8(%rbp),%rax
1713	negq	%r8
1714	movq	%rdx,%r8
1715	adcq	$0,%r8
1716
1717	mulq	%rbx
1718	addq	%rax,%r9
1719	movq	16(%rbp),%rax
1720	adcq	$0,%rdx
1721	addq	%r9,%r8
1722	movq	%rbx,48-8+8(%rsp,%rcx,8)
1723	movq	%rdx,%r9
1724	adcq	$0,%r9
1725
1726	mulq	%rbx
1727	addq	%rax,%r10
1728	movq	24(%rbp),%rax
1729	adcq	$0,%rdx
1730	addq	%r10,%r9
1731	movq	32+8(%rsp),%rsi
1732	movq	%rdx,%r10
1733	adcq	$0,%r10
1734
1735	mulq	%rbx
1736	addq	%rax,%r11
1737	movq	32(%rbp),%rax
1738	adcq	$0,%rdx
1739	imulq	%r8,%rsi
1740	addq	%r11,%r10
1741	movq	%rdx,%r11
1742	adcq	$0,%r11
1743
1744	mulq	%rbx
1745	addq	%rax,%r12
1746	movq	40(%rbp),%rax
1747	adcq	$0,%rdx
1748	addq	%r12,%r11
1749	movq	%rdx,%r12
1750	adcq	$0,%r12
1751
1752	mulq	%rbx
1753	addq	%rax,%r13
1754	movq	48(%rbp),%rax
1755	adcq	$0,%rdx
1756	addq	%r13,%r12
1757	movq	%rdx,%r13
1758	adcq	$0,%r13
1759
1760	mulq	%rbx
1761	addq	%rax,%r14
1762	movq	56(%rbp),%rax
1763	adcq	$0,%rdx
1764	addq	%r14,%r13
1765	movq	%rdx,%r14
1766	adcq	$0,%r14
1767
1768	mulq	%rbx
1769	movq	%rsi,%rbx
1770	addq	%rax,%r15
1771	movq	0(%rbp),%rax
1772	adcq	$0,%rdx
1773	addq	%r15,%r14
1774	movq	%rdx,%r15
1775	adcq	$0,%r15
1776
1777	decl	%ecx
1778	jnz	.L8x_reduce
1779
1780	leaq	64(%rbp),%rbp
1781	xorq	%rax,%rax
1782	movq	8+8(%rsp),%rdx
1783	cmpq	0+8(%rsp),%rbp
1784	jae	.L8x_no_tail
1785
1786.byte	0x66
1787	addq	0(%rdi),%r8
1788	adcq	8(%rdi),%r9
1789	adcq	16(%rdi),%r10
1790	adcq	24(%rdi),%r11
1791	adcq	32(%rdi),%r12
1792	adcq	40(%rdi),%r13
1793	adcq	48(%rdi),%r14
1794	adcq	56(%rdi),%r15
1795	sbbq	%rsi,%rsi
1796
1797	movq	48+56+8(%rsp),%rbx
1798	movl	$8,%ecx
1799	movq	0(%rbp),%rax
1800	jmp	.L8x_tail
1801
1802.align	32
1803.L8x_tail:
1804	mulq	%rbx
1805	addq	%rax,%r8
1806	movq	8(%rbp),%rax
1807	movq	%r8,(%rdi)
1808	movq	%rdx,%r8
1809	adcq	$0,%r8
1810
1811	mulq	%rbx
1812	addq	%rax,%r9
1813	movq	16(%rbp),%rax
1814	adcq	$0,%rdx
1815	addq	%r9,%r8
1816	leaq	8(%rdi),%rdi
1817	movq	%rdx,%r9
1818	adcq	$0,%r9
1819
1820	mulq	%rbx
1821	addq	%rax,%r10
1822	movq	24(%rbp),%rax
1823	adcq	$0,%rdx
1824	addq	%r10,%r9
1825	movq	%rdx,%r10
1826	adcq	$0,%r10
1827
1828	mulq	%rbx
1829	addq	%rax,%r11
1830	movq	32(%rbp),%rax
1831	adcq	$0,%rdx
1832	addq	%r11,%r10
1833	movq	%rdx,%r11
1834	adcq	$0,%r11
1835
1836	mulq	%rbx
1837	addq	%rax,%r12
1838	movq	40(%rbp),%rax
1839	adcq	$0,%rdx
1840	addq	%r12,%r11
1841	movq	%rdx,%r12
1842	adcq	$0,%r12
1843
1844	mulq	%rbx
1845	addq	%rax,%r13
1846	movq	48(%rbp),%rax
1847	adcq	$0,%rdx
1848	addq	%r13,%r12
1849	movq	%rdx,%r13
1850	adcq	$0,%r13
1851
1852	mulq	%rbx
1853	addq	%rax,%r14
1854	movq	56(%rbp),%rax
1855	adcq	$0,%rdx
1856	addq	%r14,%r13
1857	movq	%rdx,%r14
1858	adcq	$0,%r14
1859
1860	mulq	%rbx
1861	movq	48-16+8(%rsp,%rcx,8),%rbx
1862	addq	%rax,%r15
1863	adcq	$0,%rdx
1864	addq	%r15,%r14
1865	movq	0(%rbp),%rax
1866	movq	%rdx,%r15
1867	adcq	$0,%r15
1868
1869	decl	%ecx
1870	jnz	.L8x_tail
1871
1872	leaq	64(%rbp),%rbp
1873	movq	8+8(%rsp),%rdx
1874	cmpq	0+8(%rsp),%rbp
1875	jae	.L8x_tail_done
1876
1877	movq	48+56+8(%rsp),%rbx
1878	negq	%rsi
1879	movq	0(%rbp),%rax
1880	adcq	0(%rdi),%r8
1881	adcq	8(%rdi),%r9
1882	adcq	16(%rdi),%r10
1883	adcq	24(%rdi),%r11
1884	adcq	32(%rdi),%r12
1885	adcq	40(%rdi),%r13
1886	adcq	48(%rdi),%r14
1887	adcq	56(%rdi),%r15
1888	sbbq	%rsi,%rsi
1889
1890	movl	$8,%ecx
1891	jmp	.L8x_tail
1892
1893.align	32
1894.L8x_tail_done:
1895	xorq	%rax,%rax
1896	addq	(%rdx),%r8
1897	adcq	$0,%r9
1898	adcq	$0,%r10
1899	adcq	$0,%r11
1900	adcq	$0,%r12
1901	adcq	$0,%r13
1902	adcq	$0,%r14
1903	adcq	$0,%r15
1904	adcq	$0,%rax
1905
1906	negq	%rsi
1907.L8x_no_tail:
1908	adcq	0(%rdi),%r8
1909	adcq	8(%rdi),%r9
1910	adcq	16(%rdi),%r10
1911	adcq	24(%rdi),%r11
1912	adcq	32(%rdi),%r12
1913	adcq	40(%rdi),%r13
1914	adcq	48(%rdi),%r14
1915	adcq	56(%rdi),%r15
1916	adcq	$0,%rax
1917	movq	-8(%rbp),%rcx
1918	xorq	%rsi,%rsi
1919
1920.byte	102,72,15,126,213
1921
1922	movq	%r8,0(%rdi)
1923	movq	%r9,8(%rdi)
1924.byte	102,73,15,126,217
1925	movq	%r10,16(%rdi)
1926	movq	%r11,24(%rdi)
1927	movq	%r12,32(%rdi)
1928	movq	%r13,40(%rdi)
1929	movq	%r14,48(%rdi)
1930	movq	%r15,56(%rdi)
1931	leaq	64(%rdi),%rdi
1932
1933	cmpq	%rdx,%rdi
1934	jb	.L8x_reduction_loop
1935	.byte	0xf3,0xc3
1936.size	bn_sqr8x_internal,.-bn_sqr8x_internal
.type	__bn_post4x_internal,@function
.align	32
# __bn_post4x_internal - final conditional subtraction after an 8x
# squaring/reduction pass.  In constant time, stores rp[] = tp[] - n[] if a
# borrow/overflow was recorded, else rp[] = tp[].  The selection is done with
# an all-ones/all-zeros mask in %rax (never a data-dependent branch), which
# is what keeps this path resistant to timing side channels.
#
# NOTE(review): register roles inferred from the visible callers
# (bn_from_mont8x / bn_powerx5) and the generated code - confirm against
# x86_64-mont5.pl: %rbp = modulus n, %rdi = tp (scratch result), %r9 = num*8,
# %rax = 0/-1 selector, %xmm1 = saved rp pointer.
# This file is auto-generated; fix the perl source, not this output.
__bn_post4x_internal:
	movq	0(%rbp),%r12
	leaq	(%rdi,%r9,1),%rbx		# %rbx = tp + num*8 (read cursor)
	movq	%r9,%rcx
.byte	102,72,15,126,207			# movq %xmm1,%rdi (restore rp)
	negq	%rax				# %rax: 0 -> 0, 1 -> -1 subtract mask
.byte	102,72,15,126,206			# movq %xmm1,%rsi
	sarq	$3+2,%rcx			# %rcx = -(num/4): counts 4-limb groups
	decq	%r12				# n[0]-1, so ~(n[0]-1) = -n[0] below
	xorq	%r10,%r10			# clear saved borrow
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqr4x_sub_entry

.align	16
.Lsqr4x_sub:
	movq	0(%rbp),%r12			# load next 4 modulus limbs
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
.Lsqr4x_sub_entry:
	leaq	32(%rbp),%rbp
	notq	%r12				# two's-complement subtract:
	notq	%r13				#   tp - n == tp + ~n + 1
	notq	%r14
	notq	%r15
	andq	%rax,%r12			# mask: 0 (keep tp) or ~n[i] (subtract)
	andq	%rax,%r13
	andq	%rax,%r14
	andq	%rax,%r15

	negq	%r10				# reload inter-group borrow into CF
	adcq	0(%rbx),%r12			# tp[i] + masked(~n[i]) + carry
	adcq	8(%rbx),%r13
	adcq	16(%rbx),%r14
	adcq	24(%rbx),%r15
	movq	%r12,0(%rdi)			# store result limbs to rp
	leaq	32(%rbx),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r10,%r10			# capture borrow for next group
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi

	incq	%rcx
	jnz	.Lsqr4x_sub

	movq	%r9,%r10
	negq	%r9				# restore %r9 = -num*8 for the caller
	.byte	0xf3,0xc3			# rep ret (return-predictor friendly)
.size	__bn_post4x_internal,.-__bn_post4x_internal
.globl	bn_from_montgomery
.type	bn_from_montgomery,@function
.align	32
# bn_from_montgomery - dispatcher for conversion out of Montgomery form.
# SysV args (per the generated convention; confirm against x86_64-mont5.pl):
# %rdi=rp, %rsi=ap, %rdx=unused, %rcx=np, %r8=&n0, %r9d=num.
# If num is a multiple of 8, tail-call the optimized 8x implementation;
# otherwise return 0 so the C caller falls back to a generic path.
bn_from_montgomery:
	testl	$7,%r9d			# num % 8 == 0 ?
	jz	bn_from_mont8x		# yes: tail call, bn_from_mont8x returns 1
	xorl	%eax,%eax		# no: return 0 ("not handled here")
	.byte	0xf3,0xc3		# rep ret
.size	bn_from_montgomery,.-bn_from_montgomery
2000
.type	bn_from_mont8x,@function
.align	32
# bn_from_mont8x - convert ap[] (num a multiple of 8) out of Montgomery form
# into rp[].  Strategy: copy a into a zero-extended scratch frame ("mul by 1"),
# then run one Montgomery reduction (MULX/ADX flavour when the CPU supports
# BMI2+ADX, plain flavour otherwise) followed by the constant-time conditional
# subtraction, and finally wipe the scratch area.
# Clobbers per SysV; all callee-saved GPRs are pushed below.  Returns 1 in %rax.
bn_from_mont8x:
.byte	0x67				# addr-size prefix: alignment padding only
	movq	%rsp,%rax		# %rax = original %rsp (saved for epilogue)
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
.Lfrom_prologue:

	shll	$3,%r9d			# num *= 8: byte size of one operand
	leaq	(%r9,%r9,2),%r10	# %r10 = 3*num bytes
	negq	%r9
	movq	(%r8),%r8		# %r8 = n0 (load the scalar through ptr)

	# Frame selection: pick a stack address whose low 12 bits differ from
	# rp's, so tp and rp do not alias in L1 cache sets (perf, not safety).

	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lfrom_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp	# frame = 2*num + 320 bytes, de-aliased
	jmp	.Lfrom_sp_done

.align	32
.Lfrom_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lfrom_sp_done:
	andq	$-64,%rbp		# 64-byte align the new stack base
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10		# probe: fault in the first page
	cmpq	%rbp,%rsp
	ja	.Lfrom_page_walk
	jmp	.Lfrom_page_walk_done

# Touch every page down to the new %rsp so the kernel's stack guard-gap
# is never skipped over (stack-clash hardening).
.Lfrom_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lfrom_page_walk
.Lfrom_page_walk_done:

	movq	%r9,%r10
	negq	%r9			# %r9 = num*8 again (positive)

	# Frame layout (offsets from %rsp): +32 n0, +40 saved original %rsp,
	# +48 tp scratch of 2*num bytes.

	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)
.Lfrom_body:
	movq	%r9,%r11
	leaq	48(%rsp),%rax
	pxor	%xmm0,%xmm0
	jmp	.Lmul_by_1

# "Multiply by 1": copy ap into the low half of tp, zero the high half.
.align	32
.Lmul_by_1:
	movdqu	(%rsi),%xmm1
	movdqu	16(%rsi),%xmm2
	movdqu	32(%rsi),%xmm3
	movdqa	%xmm0,(%rax,%r9,1)	# zero tp[num + i]
	movdqu	48(%rsi),%xmm4
	movdqa	%xmm0,16(%rax,%r9,1)
.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00	# leaq 64(%rsi),%rsi (long encoding)
	movdqa	%xmm1,(%rax)		# tp[i] = ap[i]
	movdqa	%xmm0,32(%rax,%r9,1)
	movdqa	%xmm2,16(%rax)
	movdqa	%xmm0,48(%rax,%r9,1)
	movdqa	%xmm3,32(%rax)
	movdqa	%xmm4,48(%rax)
	leaq	64(%rax),%rax
	subq	$64,%r11
	jnz	.Lmul_by_1

.byte	102,72,15,110,207		# movq %rdi,%xmm1 (stash rp)
.byte	102,72,15,110,209		# movq %rcx,%xmm2 (stash np)
.byte	0x67				# padding prefix
	movq	%rcx,%rbp
.byte	102,73,15,110,218		# movq %r10,%xmm3 (stash -num*8)
	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
	andl	$0x80108,%r11d		# need BMI2 + ADX + (cap bit) for MULX path
	cmpl	$0x80108,%r11d
	jne	.Lfrom_mont_nox

	leaq	(%rax,%r9,1),%rdi
	call	__bn_sqrx8x_reduction	# MULX/ADCX/ADOX reduction
	call	__bn_postx4x_internal	# constant-time conditional subtract

	pxor	%xmm0,%xmm0
	leaq	48(%rsp),%rax
	movq	40(%rsp),%rsi		# %rsi = saved original %rsp
	jmp	.Lfrom_mont_zero

.align	32
.Lfrom_mont_nox:
	call	__bn_sqr8x_reduction	# plain mulq-based reduction
	call	__bn_post4x_internal

	pxor	%xmm0,%xmm0
	leaq	48(%rsp),%rax
	movq	40(%rsp),%rsi
	jmp	.Lfrom_mont_zero

# Wipe the scratch frame (it held secret-dependent intermediates).
.align	32
.Lfrom_mont_zero:
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm0,16(%rax)
	movdqa	%xmm0,32(%rax)
	movdqa	%xmm0,48(%rax)
	leaq	64(%rax),%rax
	subq	$32,%r9
	jnz	.Lfrom_mont_zero

	movq	$1,%rax			# return 1 (success)
	movq	-48(%rsi),%r15		# restore callee-saved registers
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp		# restore original stack pointer
.Lfrom_epilogue:
	.byte	0xf3,0xc3		# rep ret
.size	bn_from_mont8x,.-bn_from_mont8x
.type	bn_mulx4x_mont_gather5,@function
.align	32
# bn_mulx4x_mont_gather5 - Montgomery multiplication, BMI2/ADX (MULX with
# dual ADCX/ADOX carry chains) flavour, fetching the multiplicand bp[] from a
# "gather-5" power table so the table index never leaks through the cache.
# Sets up the de-aliased scratch frame (same scheme as bn_from_mont8x), then
# delegates all arithmetic to mulx4x_internal.  Returns 1 in %rax.
bn_mulx4x_mont_gather5:
	movq	%rsp,%rax		# %rax = original %rsp (saved at +40 below)
.Lmulx4x_enter:
	pushq	%rbx			# save all SysV callee-saved GPRs
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
.Lmulx4x_prologue:

	shll	$3,%r9d			# num *= 8 (bytes)
	leaq	(%r9,%r9,2),%r10	# %r10 = 3*num
	negq	%r9			# %r9 = -num*8
	movq	(%r8),%r8		# %r8 = n0 scalar

	# Choose a stack frame whose page offset differs from rp's so the
	# scratch tp never aliases rp in cache (see .pl source commentary).

	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lmulx4xsp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp	# 2*num + 320 byte frame
	jmp	.Lmulx4xsp_done

.Lmulx4xsp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lmulx4xsp_done:
	andq	$-64,%rbp		# 64-byte align frame base
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10		# probe first page
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
	jmp	.Lmulx4x_page_walk_done

# Walk the stack down a page at a time so the guard page is always hit.
.Lmulx4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
.Lmulx4x_page_walk_done:

	# Frame layout used by mulx4x_internal (offsets from %rsp):
	# +8 num*8, +16 end-of-table, +24 loop count, +32 n0, +40 saved %rsp,
	# +56 rp, +64... tp.

	movq	%r8,32(%rsp)		# stash n0
	movq	%rax,40(%rsp)		# stash original %rsp
.Lmulx4x_body:
	call	mulx4x_internal

	movq	40(%rsp),%rsi		# %rsi = original %rsp
	movq	$1,%rax			# return 1 (success)

	movq	-48(%rsi),%r15		# restore callee-saved registers
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lmulx4x_epilogue:
	.byte	0xf3,0xc3		# rep ret
.size	bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
2243
.type	mulx4x_internal,@function
.align	32
# mulx4x_internal - core of the MULX/ADX Montgomery multiply.  Processes the
# multiplicand four limbs at a time, interleaving the a[]*b[j] products with
# the n[]*m reduction step using the two independent carry chains of
# ADCX (CF) and ADOX (OF).  The b[j] values come from a 2x-interleaved power
# table; each lookup touches EVERY table line and selects with pcmpeqd masks,
# so the secret index never influences the cache access pattern.
#
# NOTE(review): roles per frame set up by bn_mulx4x_mont_gather5/bn_powerx5 -
# %rsi=ap, %rcx=np, %rdx=bp(+gather idx in 8(%rax)), %rdi=rp, %r9=num*8;
# confirm against x86_64-mont5.pl.  Do not reorder instructions here: the
# CF/OF chains and .byte padding prefixes are deliberate.
mulx4x_internal:
	movq	%r9,8(%rsp)		# save num*8
	movq	%r9,%r10
	negq	%r9
	shlq	$5,%r9
	negq	%r10
	leaq	128(%rdx,%r9,1),%r13	# %r13 = end of power table
	shrq	$5+5,%r9
	movd	8(%rax),%xmm5		# %xmm5 = gather index (broadcast below)
	subq	$1,%r9
	leaq	.Linc(%rip),%rax	# {0,1}/{2,2} increment constants
	movq	%r13,16+8(%rsp)
	movq	%r9,24+8(%rsp)		# outer-loop trip count
	movq	%rdi,56+8(%rsp)		# save rp
	movdqa	0(%rax),%xmm0
	movdqa	16(%rax),%xmm1
	leaq	88-112(%rsp,%r10,1),%r10	# mask scratch area
	leaq	128(%rdx),%rdi		# %rdi = table cursor

	# Build 16 16-byte compare masks: mask[i] = (i == index) ? ~0 : 0.
	pshufd	$0,%xmm5,%xmm5
	movdqa	%xmm1,%xmm4
.byte	0x67
	movdqa	%xmm1,%xmm2
.byte	0x67
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,112(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,128(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,144(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,160(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,176(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,192(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,208(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,224(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,240(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,256(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,272(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,288(%r10)
	movdqa	%xmm4,%xmm3
.byte	0x67
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,304(%r10)

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,320(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,336(%r10)

	# First gather of b[0]: AND every table line with its mask, OR the
	# results together - constant cache footprint regardless of index.
	pand	64(%rdi),%xmm0
	pand	80(%rdi),%xmm1
	pand	96(%rdi),%xmm2
	movdqa	%xmm3,352(%r10)
	pand	112(%rdi),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-128(%rdi),%xmm4
	movdqa	-112(%rdi),%xmm5
	movdqa	-96(%rdi),%xmm2
	pand	112(%r10),%xmm4
	movdqa	-80(%rdi),%xmm3
	pand	128(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	144(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	160(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-64(%rdi),%xmm4
	movdqa	-48(%rdi),%xmm5
	movdqa	-32(%rdi),%xmm2
	pand	176(%r10),%xmm4
	movdqa	-16(%rdi),%xmm3
	pand	192(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	208(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	224(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	0(%rdi),%xmm4
	movdqa	16(%rdi),%xmm5
	movdqa	32(%rdi),%xmm2
	pand	240(%r10),%xmm4
	movdqa	48(%rdi),%xmm3
	pand	256(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	272(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	288(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	pxor	%xmm1,%xmm0
	pshufd	$0x4e,%xmm0,%xmm1
	por	%xmm1,%xmm0		# collapse the two halves into one limb
	leaq	256(%rdi),%rdi
.byte	102,72,15,126,194		# movq %xmm0,%rdx : %rdx = b[0]
	leaq	64+32+8(%rsp),%rbx

	# First pass: tp = a[] * b[0], fused with the first reduction step
	# m = tp[0]*n0, tp += m*n[].  ADCX carries products, ADOX carries
	# the reduction - the two chains never touch the same flag.
	movq	%rdx,%r9
	mulxq	0(%rsi),%r8,%rax
	mulxq	8(%rsi),%r11,%r12
	addq	%rax,%r11
	mulxq	16(%rsi),%rax,%r13
	adcq	%rax,%r12
	adcq	$0,%r13
	mulxq	24(%rsi),%rax,%r14

	movq	%r8,%r15
	imulq	32+8(%rsp),%r8		# m = tp[0] * n0 (mod 2^64)
	xorq	%rbp,%rbp		# %rbp = 0; also clears CF and OF
	movq	%r8,%rdx

	movq	%rdi,8+8(%rsp)		# save table cursor

	leaq	32(%rsi),%rsi
	adcxq	%rax,%r13
	adcxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15		# tp[0] + m*n[0] == 0 by construction
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	movq	24+8(%rsp),%rdi
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r11,-24(%rbx)
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r12,-16(%rbx)
	jmp	.Lmulx4x_1st

# Remaining limbs of the b[0] pass, 4 at a time.
.align	32
.Lmulx4x_1st:
	adcxq	%rbp,%r15
	mulxq	0(%rsi),%r10,%rax
	adcxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
.byte	0x67,0x67			# padding prefixes (alignment)
	movq	%r8,%rdx
	adcxq	%rax,%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-32(%rbx)
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi			# inner trip counter
	jnz	.Lmulx4x_1st

	movq	8(%rsp),%rax		# %rax = num*8
	adcq	%rbp,%r15
	leaq	(%rsi,%rax,1),%rsi	# rewind ap
	addq	%r15,%r14
	movq	8+8(%rsp),%rdi		# restore table cursor
	adcq	%rbp,%rbp		# %rbp = top carry
	movq	%r14,-8(%rbx)
	jmp	.Lmulx4x_outer

# Outer loop: gather b[j] (same constant-footprint scan), then accumulate
# a[]*b[j] into tp and fold in the next reduction step.
.align	32
.Lmulx4x_outer:
	leaq	16-256(%rbx),%r10
	pxor	%xmm4,%xmm4
.byte	0x67,0x67
	pxor	%xmm5,%xmm5
	movdqa	-128(%rdi),%xmm0
	movdqa	-112(%rdi),%xmm1
	movdqa	-96(%rdi),%xmm2
	pand	256(%r10),%xmm0
	movdqa	-80(%rdi),%xmm3
	pand	272(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	288(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	304(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%rdi),%xmm0
	movdqa	-48(%rdi),%xmm1
	movdqa	-32(%rdi),%xmm2
	pand	320(%r10),%xmm0
	movdqa	-16(%rdi),%xmm3
	pand	336(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	352(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	368(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%rdi),%xmm0
	movdqa	16(%rdi),%xmm1
	movdqa	32(%rdi),%xmm2
	pand	384(%r10),%xmm0
	movdqa	48(%rdi),%xmm3
	pand	400(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	416(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	432(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%rdi),%xmm0
	movdqa	80(%rdi),%xmm1
	movdqa	96(%rdi),%xmm2
	pand	448(%r10),%xmm0
	movdqa	112(%rdi),%xmm3
	pand	464(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	480(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	496(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	pshufd	$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	leaq	256(%rdi),%rdi
.byte	102,72,15,126,194		# movq %xmm0,%rdx : %rdx = b[j]

	movq	%rbp,(%rbx)		# store carried-in top limb
	leaq	32(%rbx,%rax,1),%rbx	# rewind tp cursor
	mulxq	0(%rsi),%r8,%r11
	xorq	%rbp,%rbp		# zero %rbp and both carry flags
	movq	%rdx,%r9
	mulxq	8(%rsi),%r14,%r12
	adoxq	-32(%rbx),%r8		# add previous tp limbs via OF chain
	adcxq	%r14,%r11
	mulxq	16(%rsi),%r15,%r13
	adoxq	-24(%rbx),%r11
	adcxq	%r15,%r12
	mulxq	24(%rsi),%rdx,%r14
	adoxq	-16(%rbx),%r12
	adcxq	%rdx,%r13
	leaq	(%rcx,%rax,1),%rcx	# rewind np
	leaq	32(%rsi),%rsi
	adoxq	-8(%rbx),%r13
	adcxq	%rbp,%r14
	adoxq	%rbp,%r14

	movq	%r8,%r15
	imulq	32+8(%rsp),%r8		# m = tp[0] * n0

	movq	%r8,%rdx
	xorq	%rbp,%rbp		# clear CF/OF again for reduction chain
	movq	%rdi,8+8(%rsp)

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15		# cancels tp[0]
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	24+8(%rsp),%rdi
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-24(%rbx)
	adoxq	%rbp,%r15
	movq	%r12,-16(%rbx)
	leaq	32(%rcx),%rcx
	jmp	.Lmulx4x_inner

# Inner loop: tp[i..i+3] += a[i..i+3]*b[j] + m*n[i..i+3].
.align	32
.Lmulx4x_inner:
	mulxq	0(%rsi),%r10,%rax
	adcxq	%rbp,%r15
	adoxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	0(%rbx),%r10
	adoxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	8(%rbx),%r11
	adoxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
	movq	%r8,%rdx
	adcxq	16(%rbx),%r12
	adoxq	%rax,%r13
	adcxq	24(%rbx),%r13
	adoxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx
	adcxq	%rbp,%r14

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	adoxq	%r15,%r13
	movq	%r11,-32(%rbx)
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	leaq	32(%rcx),%rcx
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_inner

	movq	0+8(%rsp),%rax		# %rax = num*8
	adcq	%rbp,%r15
	subq	0(%rbx),%rdi		# %rdi was 0: sets CF from tp top limb
	movq	8+8(%rsp),%rdi		# restore table cursor
	movq	16+8(%rsp),%r10		# %r10 = end of table
	adcq	%r15,%r14
	leaq	(%rsi,%rax,1),%rsi	# rewind ap
	adcq	%rbp,%rbp		# %rbp = top carry for next round
	movq	%r14,-8(%rbx)

	cmpq	%r10,%rdi
	jb	.Lmulx4x_outer		# more b[j] limbs to process

	# Done multiplying: set up the constant-time conditional subtraction
	# (tp vs n) and jump into the shared .Lsqrx4x_sub tail.
	movq	-8(%rcx),%r10
	movq	%rbp,%r8
	movq	(%rcx,%rax,1),%r12
	leaq	(%rcx,%rax,1),%rbp	# %rbp = n (rewound)
	movq	%rax,%rcx
	leaq	(%rbx,%rax,1),%rdi	# %rdi = tp (rewound)
	xorl	%eax,%eax
	xorq	%r15,%r15
	subq	%r14,%r10		# compare top limbs: n[last] - tp[last]
	adcq	%r15,%r15
	orq	%r15,%r8		# borrow | top carry
	sarq	$3+2,%rcx		# group count for the sub loop
	subq	%r8,%rax		# %rax = 0 or -1 subtract mask
	movq	56+8(%rsp),%rdx		# %rdx = rp
	decq	%r12			# n[0]-1 (two's-complement trick)
	movq	8(%rbp),%r13
	xorq	%r8,%r8
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqrx4x_sub_entry	# shared constant-time subtract tail
.size	mulx4x_internal,.-mulx4x_internal
.type	bn_powerx5,@function
.align	32
# bn_powerx5 - MULX/ADX flavour of bn_power5: computes a^32 (five squarings)
# followed by one Montgomery multiplication by the gathered table entry.
# Used by the fixed-window exponentiation to process 5 exponent bits per
# gather.  Frame setup mirrors bn_mulx4x_mont_gather5.  Returns 1 in %rax.
bn_powerx5:
	movq	%rsp,%rax		# %rax = original %rsp (saved at +40)
.Lpowerx5_enter:
	pushq	%rbx			# save SysV callee-saved GPRs
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
.Lpowerx5_prologue:

	shll	$3,%r9d			# num *= 8 (bytes)
	leaq	(%r9,%r9,2),%r10	# %r10 = 3*num
	negq	%r9
	movq	(%r8),%r8		# %r8 = n0 scalar

	# De-alias the scratch frame from rp (same scheme as the other
	# entry points in this file).

	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lpwrx_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp	# 2*num + 320 byte frame
	jmp	.Lpwrx_sp_done

.align	32
.Lpwrx_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lpwrx_sp_done:
	andq	$-64,%rbp		# 64-byte align the frame
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10		# probe first page
	cmpq	%rbp,%rsp
	ja	.Lpwrx_page_walk
	jmp	.Lpwrx_page_walk_done

# Touch every page on the way down (stack-clash hardening).
.Lpwrx_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lpwrx_page_walk
.Lpwrx_page_walk_done:

	movq	%r9,%r10
	negq	%r9

	# Stash pointers in xmm registers so the squaring/reduction helpers
	# (which clobber all GPRs) can recover them.

	pxor	%xmm0,%xmm0
.byte	102,72,15,110,207		# movq %rdi,%xmm1 (rp)
.byte	102,72,15,110,209		# movq %rcx,%xmm2 (np)
.byte	102,73,15,110,218		# movq %r10,%xmm3 (num*8)
.byte	102,72,15,110,226		# movq %rdx,%xmm4 (bp/table)
	movq	%r8,32(%rsp)		# stash n0
	movq	%rax,40(%rsp)		# stash original %rsp
.Lpowerx5_body:

	call	__bn_sqrx8x_internal	# 5x: square + constant-time post-op
	call	__bn_postx4x_internal	#     == raise to the 32nd power
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal

	movq	%r10,%r9
	movq	%rsi,%rdi
.byte	102,72,15,126,209		# movq %xmm2,%rcx (restore np)
.byte	102,72,15,126,226		# movq %xmm4,%rdx (restore bp)
	movq	40(%rsp),%rax

	call	mulx4x_internal		# multiply by gathered power

	movq	40(%rsp),%rsi		# %rsi = original %rsp
	movq	$1,%rax			# return 1 (success)

	movq	-48(%rsi),%r15		# restore callee-saved registers
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lpowerx5_epilogue:
	.byte	0xf3,0xc3		# rep ret
.size	bn_powerx5,.-bn_powerx5
2782
2783.globl	bn_sqrx8x_internal
2784.hidden	bn_sqrx8x_internal
2785.type	bn_sqrx8x_internal,@function
2786.align	32
2787bn_sqrx8x_internal:
2788__bn_sqrx8x_internal:
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828
2829	leaq	48+8(%rsp),%rdi
2830	leaq	(%rsi,%r9,1),%rbp
2831	movq	%r9,0+8(%rsp)
2832	movq	%rbp,8+8(%rsp)
2833	jmp	.Lsqr8x_zero_start
2834
2835.align	32
2836.byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
2837.Lsqrx8x_zero:
2838.byte	0x3e
2839	movdqa	%xmm0,0(%rdi)
2840	movdqa	%xmm0,16(%rdi)
2841	movdqa	%xmm0,32(%rdi)
2842	movdqa	%xmm0,48(%rdi)
2843.Lsqr8x_zero_start:
2844	movdqa	%xmm0,64(%rdi)
2845	movdqa	%xmm0,80(%rdi)
2846	movdqa	%xmm0,96(%rdi)
2847	movdqa	%xmm0,112(%rdi)
2848	leaq	128(%rdi),%rdi
2849	subq	$64,%r9
2850	jnz	.Lsqrx8x_zero
2851
2852	movq	0(%rsi),%rdx
2853
2854	xorq	%r10,%r10
2855	xorq	%r11,%r11
2856	xorq	%r12,%r12
2857	xorq	%r13,%r13
2858	xorq	%r14,%r14
2859	xorq	%r15,%r15
2860	leaq	48+8(%rsp),%rdi
2861	xorq	%rbp,%rbp
2862	jmp	.Lsqrx8x_outer_loop
2863
2864.align	32
2865.Lsqrx8x_outer_loop:
2866	mulxq	8(%rsi),%r8,%rax
2867	adcxq	%r9,%r8
2868	adoxq	%rax,%r10
2869	mulxq	16(%rsi),%r9,%rax
2870	adcxq	%r10,%r9
2871	adoxq	%rax,%r11
2872.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
2873	adcxq	%r11,%r10
2874	adoxq	%rax,%r12
2875.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
2876	adcxq	%r12,%r11
2877	adoxq	%rax,%r13
2878	mulxq	40(%rsi),%r12,%rax
2879	adcxq	%r13,%r12
2880	adoxq	%rax,%r14
2881	mulxq	48(%rsi),%r13,%rax
2882	adcxq	%r14,%r13
2883	adoxq	%r15,%rax
2884	mulxq	56(%rsi),%r14,%r15
2885	movq	8(%rsi),%rdx
2886	adcxq	%rax,%r14
2887	adoxq	%rbp,%r15
2888	adcq	64(%rdi),%r15
2889	movq	%r8,8(%rdi)
2890	movq	%r9,16(%rdi)
2891	sbbq	%rcx,%rcx
2892	xorq	%rbp,%rbp
2893
2894
2895	mulxq	16(%rsi),%r8,%rbx
2896	mulxq	24(%rsi),%r9,%rax
2897	adcxq	%r10,%r8
2898	adoxq	%rbx,%r9
2899	mulxq	32(%rsi),%r10,%rbx
2900	adcxq	%r11,%r9
2901	adoxq	%rax,%r10
2902.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
2903	adcxq	%r12,%r10
2904	adoxq	%rbx,%r11
2905.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
2906	adcxq	%r13,%r11
2907	adoxq	%r14,%r12
2908.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
2909	movq	16(%rsi),%rdx
2910	adcxq	%rax,%r12
2911	adoxq	%rbx,%r13
2912	adcxq	%r15,%r13
2913	adoxq	%rbp,%r14
2914	adcxq	%rbp,%r14
2915
2916	movq	%r8,24(%rdi)
2917	movq	%r9,32(%rdi)
2918
2919	mulxq	24(%rsi),%r8,%rbx
2920	mulxq	32(%rsi),%r9,%rax
2921	adcxq	%r10,%r8
2922	adoxq	%rbx,%r9
2923	mulxq	40(%rsi),%r10,%rbx
2924	adcxq	%r11,%r9
2925	adoxq	%rax,%r10
2926.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
2927	adcxq	%r12,%r10
2928	adoxq	%r13,%r11
2929.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
2930.byte	0x3e
2931	movq	24(%rsi),%rdx
2932	adcxq	%rbx,%r11
2933	adoxq	%rax,%r12
2934	adcxq	%r14,%r12
2935	movq	%r8,40(%rdi)
2936	movq	%r9,48(%rdi)
2937	mulxq	32(%rsi),%r8,%rax
2938	adoxq	%rbp,%r13
2939	adcxq	%rbp,%r13
2940
2941	mulxq	40(%rsi),%r9,%rbx
2942	adcxq	%r10,%r8
2943	adoxq	%rax,%r9
2944	mulxq	48(%rsi),%r10,%rax
2945	adcxq	%r11,%r9
2946	adoxq	%r12,%r10
2947	mulxq	56(%rsi),%r11,%r12
2948	movq	32(%rsi),%rdx
2949	movq	40(%rsi),%r14
2950	adcxq	%rbx,%r10
2951	adoxq	%rax,%r11
2952	movq	48(%rsi),%r15
2953	adcxq	%r13,%r11
2954	adoxq	%rbp,%r12
2955	adcxq	%rbp,%r12
2956
2957	movq	%r8,56(%rdi)
2958	movq	%r9,64(%rdi)
2959
2960	mulxq	%r14,%r9,%rax
2961	movq	56(%rsi),%r8
2962	adcxq	%r10,%r9
2963	mulxq	%r15,%r10,%rbx
2964	adoxq	%rax,%r10
2965	adcxq	%r11,%r10
2966	mulxq	%r8,%r11,%rax
2967	movq	%r14,%rdx
2968	adoxq	%rbx,%r11
2969	adcxq	%r12,%r11
2970
2971	adcxq	%rbp,%rax
2972
2973	mulxq	%r15,%r14,%rbx
2974	mulxq	%r8,%r12,%r13
2975	movq	%r15,%rdx
2976	leaq	64(%rsi),%rsi
2977	adcxq	%r14,%r11
2978	adoxq	%rbx,%r12
2979	adcxq	%rax,%r12
2980	adoxq	%rbp,%r13
2981
2982.byte	0x67,0x67
2983	mulxq	%r8,%r8,%r14
2984	adcxq	%r8,%r13
2985	adcxq	%rbp,%r14
2986
2987	cmpq	8+8(%rsp),%rsi
2988	je	.Lsqrx8x_outer_break
2989
2990	negq	%rcx
2991	movq	$-8,%rcx
2992	movq	%rbp,%r15
2993	movq	64(%rdi),%r8
2994	adcxq	72(%rdi),%r9
2995	adcxq	80(%rdi),%r10
2996	adcxq	88(%rdi),%r11
2997	adcq	96(%rdi),%r12
2998	adcq	104(%rdi),%r13
2999	adcq	112(%rdi),%r14
3000	adcq	120(%rdi),%r15
3001	leaq	(%rsi),%rbp
3002	leaq	128(%rdi),%rdi
3003	sbbq	%rax,%rax
3004
3005	movq	-64(%rsi),%rdx
3006	movq	%rax,16+8(%rsp)
3007	movq	%rdi,24+8(%rsp)
3008
3009
3010	xorl	%eax,%eax
3011	jmp	.Lsqrx8x_loop
3012
3013.align	32
3014.Lsqrx8x_loop:
3015	movq	%r8,%rbx
3016	mulxq	0(%rbp),%rax,%r8
3017	adcxq	%rax,%rbx
3018	adoxq	%r9,%r8
3019
3020	mulxq	8(%rbp),%rax,%r9
3021	adcxq	%rax,%r8
3022	adoxq	%r10,%r9
3023
3024	mulxq	16(%rbp),%rax,%r10
3025	adcxq	%rax,%r9
3026	adoxq	%r11,%r10
3027
3028	mulxq	24(%rbp),%rax,%r11
3029	adcxq	%rax,%r10
3030	adoxq	%r12,%r11
3031
3032.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
3033	adcxq	%rax,%r11
3034	adoxq	%r13,%r12
3035
3036	mulxq	40(%rbp),%rax,%r13
3037	adcxq	%rax,%r12
3038	adoxq	%r14,%r13
3039
3040	mulxq	48(%rbp),%rax,%r14
3041	movq	%rbx,(%rdi,%rcx,8)
3042	movl	$0,%ebx
3043	adcxq	%rax,%r13
3044	adoxq	%r15,%r14
3045
3046.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
3047	movq	8(%rsi,%rcx,8),%rdx
3048	adcxq	%rax,%r14
3049	adoxq	%rbx,%r15
3050	adcxq	%rbx,%r15
3051
3052.byte	0x67
3053	incq	%rcx
3054	jnz	.Lsqrx8x_loop
3055
3056	leaq	64(%rbp),%rbp
3057	movq	$-8,%rcx
3058	cmpq	8+8(%rsp),%rbp
3059	je	.Lsqrx8x_break
3060
3061	subq	16+8(%rsp),%rbx
3062.byte	0x66
3063	movq	-64(%rsi),%rdx
3064	adcxq	0(%rdi),%r8
3065	adcxq	8(%rdi),%r9
3066	adcq	16(%rdi),%r10
3067	adcq	24(%rdi),%r11
3068	adcq	32(%rdi),%r12
3069	adcq	40(%rdi),%r13
3070	adcq	48(%rdi),%r14
3071	adcq	56(%rdi),%r15
3072	leaq	64(%rdi),%rdi
3073.byte	0x67
3074	sbbq	%rax,%rax
3075	xorl	%ebx,%ebx
3076	movq	%rax,16+8(%rsp)
3077	jmp	.Lsqrx8x_loop
3078
3079.align	32
3080.Lsqrx8x_break:
3081	xorq	%rbp,%rbp
3082	subq	16+8(%rsp),%rbx
3083	adcxq	%rbp,%r8
3084	movq	24+8(%rsp),%rcx
3085	adcxq	%rbp,%r9
3086	movq	0(%rsi),%rdx
3087	adcq	$0,%r10
3088	movq	%r8,0(%rdi)
3089	adcq	$0,%r11
3090	adcq	$0,%r12
3091	adcq	$0,%r13
3092	adcq	$0,%r14
3093	adcq	$0,%r15
3094	cmpq	%rcx,%rdi
3095	je	.Lsqrx8x_outer_loop
3096
3097	movq	%r9,8(%rdi)
3098	movq	8(%rcx),%r9
3099	movq	%r10,16(%rdi)
3100	movq	16(%rcx),%r10
3101	movq	%r11,24(%rdi)
3102	movq	24(%rcx),%r11
3103	movq	%r12,32(%rdi)
3104	movq	32(%rcx),%r12
3105	movq	%r13,40(%rdi)
3106	movq	40(%rcx),%r13
3107	movq	%r14,48(%rdi)
3108	movq	48(%rcx),%r14
3109	movq	%r15,56(%rdi)
3110	movq	56(%rcx),%r15
3111	movq	%rcx,%rdi
3112	jmp	.Lsqrx8x_outer_loop
3113
3114.align	32
3115.Lsqrx8x_outer_break:
3116	movq	%r9,72(%rdi)
3117.byte	102,72,15,126,217
3118	movq	%r10,80(%rdi)
3119	movq	%r11,88(%rdi)
3120	movq	%r12,96(%rdi)
3121	movq	%r13,104(%rdi)
3122	movq	%r14,112(%rdi)
3123	leaq	48+8(%rsp),%rdi
3124	movq	(%rsi,%rcx,1),%rdx
3125
3126	movq	8(%rdi),%r11
3127	xorq	%r10,%r10
3128	movq	0+8(%rsp),%r9
3129	adoxq	%r11,%r11
3130	movq	16(%rdi),%r12
3131	movq	24(%rdi),%r13
3132
3133
3134.align	32
3135.Lsqrx4x_shift_n_add:
3136	mulxq	%rdx,%rax,%rbx
3137	adoxq	%r12,%r12
3138	adcxq	%r10,%rax
3139.byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
3140.byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
3141	adoxq	%r13,%r13
3142	adcxq	%r11,%rbx
3143	movq	40(%rdi),%r11
3144	movq	%rax,0(%rdi)
3145	movq	%rbx,8(%rdi)
3146
3147	mulxq	%rdx,%rax,%rbx
3148	adoxq	%r10,%r10
3149	adcxq	%r12,%rax
3150	movq	16(%rsi,%rcx,1),%rdx
3151	movq	48(%rdi),%r12
3152	adoxq	%r11,%r11
3153	adcxq	%r13,%rbx
3154	movq	56(%rdi),%r13
3155	movq	%rax,16(%rdi)
3156	movq	%rbx,24(%rdi)
3157
3158	mulxq	%rdx,%rax,%rbx
3159	adoxq	%r12,%r12
3160	adcxq	%r10,%rax
3161	movq	24(%rsi,%rcx,1),%rdx
3162	leaq	32(%rcx),%rcx
3163	movq	64(%rdi),%r10
3164	adoxq	%r13,%r13
3165	adcxq	%r11,%rbx
3166	movq	72(%rdi),%r11
3167	movq	%rax,32(%rdi)
3168	movq	%rbx,40(%rdi)
3169
3170	mulxq	%rdx,%rax,%rbx
3171	adoxq	%r10,%r10
3172	adcxq	%r12,%rax
3173	jrcxz	.Lsqrx4x_shift_n_add_break
3174.byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
3175	adoxq	%r11,%r11
3176	adcxq	%r13,%rbx
3177	movq	80(%rdi),%r12
3178	movq	88(%rdi),%r13
3179	movq	%rax,48(%rdi)
3180	movq	%rbx,56(%rdi)
3181	leaq	64(%rdi),%rdi
3182	nop
3183	jmp	.Lsqrx4x_shift_n_add
3184
3185.align	32
3186.Lsqrx4x_shift_n_add_break:
3187	adcxq	%r13,%rbx
3188	movq	%rax,48(%rdi)
3189	movq	%rbx,56(%rdi)
3190	leaq	64(%rdi),%rdi
3191.byte	102,72,15,126,213
# __bn_sqrx8x_reduction: Montgomery reduction of the double-width square,
# 8 limbs at a time, using the BMI2/ADX instruction set (mulx plus the two
# independent carry chains adcx/CF and adox/OF).  Internal entry point:
# control falls through from the squaring code above.
#
# Register/stack roles as visible here (names per x86_64-mont5.pl
# conventions -- TODO confirm against the out-of-view caller):
#   %rbp         modulus pointer n[]
#   %rdi         tp, the 2*num-limb value being reduced
#   %r9          num*8 (operand byte length)
#   32+8(%rsp)   n0, presumably -1/n[0] mod 2^64
#   48+8(%rsp)   staging slot holding the current tp[0]
# On exit: reduced limbs stored back over tp, %rax = top-word carry.
__bn_sqrx8x_reduction:
	xorl	%eax,%eax		# running top-word carry = 0
	movq	32+8(%rsp),%rbx		# %rbx = n0
	movq	48+8(%rsp),%rdx		# %rdx = tp[0], mulx multiplicand
	leaq	-64(%rbp,%r9,1),%rcx	# end-of-modulus sentinel

	movq	%rcx,0+8(%rsp)		# save loop bound
	movq	%rdi,8+8(%rsp)		# save original tp (overall end check)

	leaq	48+8(%rsp),%rdi		# %rdi = scratch window over tp
	jmp	.Lsqrx8x_reduction_loop

.align	32
.Lsqrx8x_reduction_loop:
	# Load the next 8 tp limbs into %r8-%r15 (%r8 arrives in %rdx).
	movq	8(%rdi),%r9
	movq	16(%rdi),%r10
	movq	24(%rdi),%r11
	movq	32(%rdi),%r12
	movq	%rdx,%r8
	imulq	%rbx,%rdx		# m = tp[0]*n0 mod 2^64
	movq	40(%rdi),%r13
	movq	48(%rdi),%r14
	movq	56(%rdi),%r15
	movq	%rax,24+8(%rsp)		# stash top-word carry for tail_done

	leaq	64(%rdi),%rdi
	xorq	%rsi,%rsi		# %rsi = 0; also clears CF and OF
	movq	$-8,%rcx		# 8 iterations, counting up to zero
	jmp	.Lsqrx8x_reduce

.align	32
.Lsqrx8x_reduce:
	# One pass: tp[0..7] += m * n[0..7]; a fresh m is derived each
	# iteration and recorded for the tail passes below.
	movq	%r8,%rbx
	mulxq	0(%rbp),%rax,%r8	# m * n[0]
	adcxq	%rbx,%rax		# low half annihilates current limb (CF chain)
	adoxq	%r9,%r8			# fold next tp limb (OF chain)

	mulxq	8(%rbp),%rbx,%r9	# m * n[1]
	adcxq	%rbx,%r8
	adoxq	%r10,%r9

	mulxq	16(%rbp),%rbx,%r10	# m * n[2]
	adcxq	%rbx,%r9
	adoxq	%r11,%r10

	mulxq	24(%rbp),%rbx,%r11	# m * n[3]
	adcxq	%rbx,%r10
	adoxq	%r12,%r11

.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulxq 32(%rbp),%rbx,%r12 (hand-encoded)
	movq	%rdx,%rax		# preserve current m
	movq	%r8,%rdx
	adcxq	%rbx,%r11
	adoxq	%r13,%r12

	mulxq	32+8(%rsp),%rbx,%rdx	# next m = %r8 * n0 (low half -> %rbx)
	movq	%rax,%rdx		# restore current m
	movq	%rax,64+48+8(%rsp,%rcx,8)	# record m[i] for .Lsqrx8x_tail

	mulxq	40(%rbp),%rax,%r13	# m * n[5]
	adcxq	%rax,%r12
	adoxq	%r14,%r13

	mulxq	48(%rbp),%rax,%r14	# m * n[6]
	adcxq	%rax,%r13
	adoxq	%r15,%r14

	mulxq	56(%rbp),%rax,%r15	# m * n[7]
	movq	%rbx,%rdx		# switch to the next m
	adcxq	%rax,%r14
	adoxq	%rsi,%r15		# flush OF chain into %r15 (%rsi = 0)
	adcxq	%rsi,%r15		# flush CF chain into %r15

.byte	0x67,0x67,0x67			# addr-size prefixes: decoder-alignment padding
	incq	%rcx
	jnz	.Lsqrx8x_reduce

	movq	%rsi,%rax		# %rax = 0
	cmpq	0+8(%rsp),%rbp		# reached the end of the modulus?
	jae	.Lsqrx8x_no_tail

	# More modulus limbs remain: absorb the next 8 tp limbs and run
	# tail passes with the recorded m[i] values.
	movq	48+8(%rsp),%rdx
	addq	0(%rdi),%r8
	leaq	64(%rbp),%rbp
	movq	$-8,%rcx
	adcxq	8(%rdi),%r9
	adcxq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	leaq	64(%rdi),%rdi
	sbbq	%rax,%rax		# %rax = -(carry out)

	xorq	%rsi,%rsi		# clear CF/OF for the adcx/adox chains
	movq	%rax,16+8(%rsp)		# save deferred-carry mask
	jmp	.Lsqrx8x_tail

.align	32
.Lsqrx8x_tail:
	# tp[] += m[i] * n[] for modulus limbs beyond the first 8, reusing
	# the m[i] values recorded by .Lsqrx8x_reduce.
	movq	%r8,%rbx
	mulxq	0(%rbp),%rax,%r8
	adcxq	%rax,%rbx
	adoxq	%r9,%r8

	mulxq	8(%rbp),%rax,%r9
	adcxq	%rax,%r8
	adoxq	%r10,%r9

	mulxq	16(%rbp),%rax,%r10
	adcxq	%rax,%r9
	adoxq	%r11,%r10

	mulxq	24(%rbp),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11

.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulxq 32(%rbp),%rax,%r12 (hand-encoded)
	adcxq	%rax,%r11
	adoxq	%r13,%r12

	mulxq	40(%rbp),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

	mulxq	48(%rbp),%rax,%r14
	adcxq	%rax,%r13
	adoxq	%r15,%r14

	mulxq	56(%rbp),%rax,%r15
	movq	72+48+8(%rsp,%rcx,8),%rdx	# %rdx = next recorded m[i]
	adcxq	%rax,%r14
	adoxq	%rsi,%r15
	movq	%rbx,(%rdi,%rcx,8)	# store finished limb
	movq	%r8,%rbx
	adcxq	%rsi,%r15

	incq	%rcx
	jnz	.Lsqrx8x_tail

	cmpq	0+8(%rsp),%rbp		# end of modulus?
	jae	.Lsqrx8x_tail_done

	# Re-inject the deferred carry and absorb the next 8 tp limbs.
	subq	16+8(%rsp),%rsi		# %rsi==0: subtracting the mask sets CF
	movq	48+8(%rsp),%rdx
	leaq	64(%rbp),%rbp
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	leaq	64(%rdi),%rdi
	sbbq	%rax,%rax
	subq	$8,%rcx			# rewind m[i] index for the next tail pass

	xorq	%rsi,%rsi
	movq	%rax,16+8(%rsp)		# save new deferred-carry mask
	jmp	.Lsqrx8x_tail

.align	32
.Lsqrx8x_tail_done:
	# Fold in the top-word carry stashed at reduction-loop entry.
	xorq	%rax,%rax
	addq	24+8(%rsp),%r8
	adcq	$0,%r9
	adcq	$0,%r10
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15
	adcq	$0,%rax

	subq	16+8(%rsp),%rsi		# %rsi==0: restore deferred carry into CF
.Lsqrx8x_no_tail:
	# Add the reduced window back onto the upper half of tp.
	adcq	0(%rdi),%r8
.byte	102,72,15,126,217		# movq %xmm3,%rcx (hand-encoded; restores saved counter)
	adcq	8(%rdi),%r9
	movq	56(%rbp),%rsi		# last limb of current modulus window
.byte	102,72,15,126,213		# movq %xmm2,%rbp (hand-encoded; restores modulus ptr)
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	adcq	$0,%rax			# %rax = top-most carry

	movq	32+8(%rsp),%rbx		# reload n0
	movq	64(%rdi,%rcx,1),%rdx	# next window's tp[0] (%rcx is a negative byte offset)

	movq	%r8,0(%rdi)		# store the 8 reduced limbs
	leaq	64(%rdi),%r8
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	leaq	64(%rdi,%rcx,1),%rdi
	cmpq	8+8(%rsp),%r8		# processed the whole double-width value?
	jb	.Lsqrx8x_reduction_loop
	.byte	0xf3,0xc3		# rep ret (branch-predictor-friendly return)
.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
.align	32
# __bn_postx4x_internal: final conditional subtraction for the mulx/sqrx
# path.  Computes, 4 limbs per iteration and in constant time (cmov-free,
# branch pattern independent of the data):
#     rp[i] = tp[i] + (~n[i] & mask) + borrow
# where mask = all-ones iff the modulus must be subtracted, i.e.
# rp = tp - n when mask is set and rp = tp otherwise.
#
# In:  %rbp = modulus n[], %rdi = tp, %rax = top carry (negated into the
#      mask below), %rcx = negative byte count (TODO confirm: appears to
#      be -num*8, since sar $5 yields a negative counter incremented to 0),
#      %xmm1 = rp and residual-length copies in %xmm1/%xmm2/%xmm3
#      as set up by the out-of-view caller.
# Out: result in rp (%rdx); %r9 re-negated to a positive count on exit.
__bn_postx4x_internal:
	movq	0(%rbp),%r12		# n[0]
	movq	%rcx,%r10		# preserve count
	movq	%rcx,%r9
	negq	%rax			# %rax = 0 or all-ones selection mask
	sarq	$3+2,%rcx		# bytes -> groups of 4 limbs (loop counter)

.byte	102,72,15,126,202		# movq %xmm1,%rdx (hand-encoded): %rdx = rp
.byte	102,72,15,126,206		# movq %xmm1,%rsi (hand-encoded)
	decq	%r12			# first limb: n[0]-1, so ~(n[0]-1) = -n[0]
					# supplies the +1 of the two's complement
	movq	8(%rbp),%r13
	xorq	%r8,%r8			# no incoming borrow
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqrx4x_sub_entry

.align	16
.Lsqrx4x_sub:
	movq	0(%rbp),%r12		# next four modulus limbs
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
.Lsqrx4x_sub_entry:
	andnq	%rax,%r12,%r12		# %r12 = ~n[i] & mask  (BMI1 andn)
	leaq	32(%rbp),%rbp
	andnq	%rax,%r13,%r13
	andnq	%rax,%r14,%r14
	andnq	%rax,%r15,%r15

	negq	%r8			# restore saved borrow into CF
	adcq	0(%rdi),%r12		# tp[i] + (~n[i] & mask) + borrow
	adcq	8(%rdi),%r13
	adcq	16(%rdi),%r14
	adcq	24(%rdi),%r15
	movq	%r12,0(%rdx)		# store result limbs to rp
	leaq	32(%rdi),%rdi
	movq	%r13,8(%rdx)
	sbbq	%r8,%r8			# save borrow as 0/-1 mask
	movq	%r14,16(%rdx)
	movq	%r15,24(%rdx)
	leaq	32(%rdx),%rdx

	incq	%rcx
	jnz	.Lsqrx4x_sub

	negq	%r9			# restore positive byte count for caller

	.byte	0xf3,0xc3		# rep ret
.size	__bn_postx4x_internal,.-__bn_postx4x_internal
.globl	bn_get_bits5
.type	bn_get_bits5,@function
.align	16
# int bn_get_bits5(const void *ap, int off)
# Returns in %eax the 5-bit window of the bignum at %rdi starting at bit
# offset %esi.  Branchless (cmov only), so the window position does not
# affect the branch pattern.  A window whose in-word bit offset exceeds 11
# would straddle a 16-bit load, so the load is instead taken one byte
# later with the shift reduced by 8.
bn_get_bits5:
	leaq	0(%rdi),%r10		# base for in-word offsets 0..11
	leaq	1(%rdi),%r11		# byte-shifted base for offsets 12..15
	movl	%esi,%ecx
	shrl	$4,%esi			# %esi = 16-bit word index
	andl	$15,%ecx		# %ecx = bit offset within the word
	leal	-8(%rcx),%eax		# candidate reduced shift (offset-8)
	cmpl	$11,%ecx
	cmovaq	%r11,%r10		# offset > 11: window straddles the word;
	cmoval	%eax,%ecx		#   read one byte later, shift 8 less
	movzwl	(%r10,%rsi,2),%eax	# 16-bit load covering the window
	shrl	%cl,%eax
	andl	$31,%eax		# keep the low 5 bits
	.byte	0xf3,0xc3		# rep ret
.size	bn_get_bits5,.-bn_get_bits5
3469
.globl	bn_scatter5
.type	bn_scatter5,@function
.align	16
# void bn_scatter5(const void *inp, size_t num, void *tbl, size_t idx)
# Copies num 64-bit limbs from inp (%rdi) into column idx (%rcx) of the
# power table at tbl (%rdx): limb i is written to tbl + idx*8 + i*256,
# i.e. the table holds 32 interleaved 8-byte columns per 256-byte row
# (the layout bn_gather5 below reads back in constant time).
bn_scatter5:
	cmpl	$0,%esi
	jz	.Lscatter_epilogue	# num == 0: nothing to copy
	leaq	(%rdx,%rcx,8),%rdx	# &tbl[idx], column base
.Lscatter:
	movq	(%rdi),%rax
	leaq	8(%rdi),%rdi
	movq	%rax,(%rdx)
	leaq	256(%rdx),%rdx		# next row = 32 columns * 8 bytes
	subl	$1,%esi
	jnz	.Lscatter
.Lscatter_epilogue:
	.byte	0xf3,0xc3		# rep ret
.size	bn_scatter5,.-bn_scatter5
3487
.globl	bn_gather5
.type	bn_gather5,@function
.align	32
# void bn_gather5(void *out, size_t num, const void *tbl, size_t idx)
# Constant-time gather: extracts column idx (%ecx) from the power table
# written by bn_scatter5.  Every one of the 32 columns in each 256-byte
# row is loaded and masked, so the secret index never influences the
# memory-access pattern (cache-timing defense).  16 SSE2 compare masks
# (one per pair of columns) are precomputed on a 264-byte stack area.
bn_gather5:
.LSEH_begin_bn_gather5:

.byte	0x4c,0x8d,0x14,0x24		# leaq (%rsp),%r10 -- save %rsp (hand-encoded
.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00	# for Win64 SEH); subq $0x108,%rsp
	leaq	.Linc(%rip),%rax
	andq	$-16,%rsp		# 16-byte align the mask area for movdqa

	movd	%ecx,%xmm5
	movdqa	0(%rax),%xmm0		# {0,0, 1,1}
	movdqa	16(%rax),%xmm1		# {2,2, 2,2} increment vector
	leaq	128(%rdx),%r11		# bias both pointers by 128 so disp8
	leaq	128(%rsp),%rax		#   addressing spans the full 256 bytes

	pshufd	$0,%xmm5,%xmm5		# broadcast idx into all 4 dwords
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
	# Unrolled mask generation: mask[k] = pcmpeqd({2k,2k,2k+1,2k+1}, idx)
	# for k = 0..15, stored at -128(%rax)..112(%rax).  Each 16-byte mask
	# selects (at most) one 8-byte column out of the pair it covers.
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0		# columns 0,1
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1		# columns 2,3
	movdqa	%xmm0,-128(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,-112(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,-96(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,-80(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,-64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,-48(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,-32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,-16(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,16(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,48(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,80(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,96(%rax)
	movdqa	%xmm4,%xmm2
	movdqa	%xmm3,112(%rax)		# columns 30,31 -- all 16 masks done
	jmp	.Lgather

.align	32
.Lgather:
	# Per output limb: AND each of the 16 row chunks with its mask and
	# OR everything together; only the selected column survives.
	pxor	%xmm4,%xmm4		# OR-accumulators
	pxor	%xmm5,%xmm5
	movdqa	-128(%r11),%xmm0
	movdqa	-112(%r11),%xmm1
	movdqa	-96(%r11),%xmm2
	pand	-128(%rax),%xmm0
	movdqa	-80(%r11),%xmm3
	pand	-112(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r11),%xmm0
	movdqa	-48(%r11),%xmm1
	movdqa	-32(%r11),%xmm2
	pand	-64(%rax),%xmm0
	movdqa	-16(%r11),%xmm3
	pand	-48(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r11),%xmm0
	movdqa	16(%r11),%xmm1
	movdqa	32(%r11),%xmm2
	pand	0(%rax),%xmm0
	movdqa	48(%r11),%xmm3
	pand	16(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r11),%xmm0
	movdqa	80(%r11),%xmm1
	movdqa	96(%r11),%xmm2
	pand	64(%rax),%xmm0
	movdqa	112(%r11),%xmm3
	pand	80(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4		# merge the two accumulators
	leaq	256(%r11),%r11		# advance to the next table row
	pshufd	$0x4e,%xmm4,%xmm0	# swap qword halves ...
	por	%xmm4,%xmm0		# ... so the survivor lands in the low qword
	movq	%xmm0,(%rdi)		# emit one 64-bit limb
	leaq	8(%rdi),%rdi
	subl	$1,%esi
	jnz	.Lgather

	leaq	(%r10),%rsp		# restore the saved stack pointer
	.byte	0xf3,0xc3		# rep ret
.LSEH_end_bn_gather5:
.size	bn_gather5,.-bn_gather5
.align	64
# Increment vectors for bn_gather5's mask generation: the first quadword
# pair seeds the running index {0,0,1,1}, the second is the per-step
# increment {2,2,2,2} added before each pcmpeqd.
.Linc:
.long	0,0, 1,1
.long	2,2, 2,2
# ASCII: "Montgomery Multiplication with scatter/gather for x86_64,
# CRYPTOGAMS by <appro@openssl.org>" (NUL-terminated identification string)
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
3656