/* x86_64-mont5.S revision 1.4 */
1#include <machine/asm.h>
2.text
3
4
5
6.globl	bn_mul_mont_gather5
7.type	bn_mul_mont_gather5,@function
8.align	64
9bn_mul_mont_gather5:
10	movl	%r9d,%r9d
11	movq	%rsp,%rax
12	testl	$7,%r9d
13	jnz	.Lmul_enter
14	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
15	jmp	.Lmul4x_enter
16
17.align	16
18.Lmul_enter:
19	movd	8(%rsp),%xmm5
20	pushq	%rbx
21	pushq	%rbp
22	pushq	%r12
23	pushq	%r13
24	pushq	%r14
25	pushq	%r15
26
27	negq	%r9
28	movq	%rsp,%r11
29	leaq	-280(%rsp,%r9,8),%r10
30	negq	%r9
31	andq	$-1024,%r10
32
33
34
35
36
37
38
39	subq	%r10,%r11
40	andq	$-4096,%r11
41	leaq	(%r10,%r11,1),%rsp
42	movq	(%rsp),%r11
43	cmpq	%r10,%rsp
44	ja	.Lmul_page_walk
45	jmp	.Lmul_page_walk_done
46
47.Lmul_page_walk:
48	leaq	-4096(%rsp),%rsp
49	movq	(%rsp),%r11
50	cmpq	%r10,%rsp
51	ja	.Lmul_page_walk
52.Lmul_page_walk_done:
53
54	leaq	.Linc(%rip),%r10
55	movq	%rax,8(%rsp,%r9,8)
56.Lmul_body:
57
58	leaq	128(%rdx),%r12
59	movdqa	0(%r10),%xmm0
60	movdqa	16(%r10),%xmm1
61	leaq	24-112(%rsp,%r9,8),%r10
62	andq	$-16,%r10
63
64	pshufd	$0,%xmm5,%xmm5
65	movdqa	%xmm1,%xmm4
66	movdqa	%xmm1,%xmm2
67	paddd	%xmm0,%xmm1
68	pcmpeqd	%xmm5,%xmm0
69.byte	0x67
70	movdqa	%xmm4,%xmm3
71	paddd	%xmm1,%xmm2
72	pcmpeqd	%xmm5,%xmm1
73	movdqa	%xmm0,112(%r10)
74	movdqa	%xmm4,%xmm0
75
76	paddd	%xmm2,%xmm3
77	pcmpeqd	%xmm5,%xmm2
78	movdqa	%xmm1,128(%r10)
79	movdqa	%xmm4,%xmm1
80
81	paddd	%xmm3,%xmm0
82	pcmpeqd	%xmm5,%xmm3
83	movdqa	%xmm2,144(%r10)
84	movdqa	%xmm4,%xmm2
85
86	paddd	%xmm0,%xmm1
87	pcmpeqd	%xmm5,%xmm0
88	movdqa	%xmm3,160(%r10)
89	movdqa	%xmm4,%xmm3
90	paddd	%xmm1,%xmm2
91	pcmpeqd	%xmm5,%xmm1
92	movdqa	%xmm0,176(%r10)
93	movdqa	%xmm4,%xmm0
94
95	paddd	%xmm2,%xmm3
96	pcmpeqd	%xmm5,%xmm2
97	movdqa	%xmm1,192(%r10)
98	movdqa	%xmm4,%xmm1
99
100	paddd	%xmm3,%xmm0
101	pcmpeqd	%xmm5,%xmm3
102	movdqa	%xmm2,208(%r10)
103	movdqa	%xmm4,%xmm2
104
105	paddd	%xmm0,%xmm1
106	pcmpeqd	%xmm5,%xmm0
107	movdqa	%xmm3,224(%r10)
108	movdqa	%xmm4,%xmm3
109	paddd	%xmm1,%xmm2
110	pcmpeqd	%xmm5,%xmm1
111	movdqa	%xmm0,240(%r10)
112	movdqa	%xmm4,%xmm0
113
114	paddd	%xmm2,%xmm3
115	pcmpeqd	%xmm5,%xmm2
116	movdqa	%xmm1,256(%r10)
117	movdqa	%xmm4,%xmm1
118
119	paddd	%xmm3,%xmm0
120	pcmpeqd	%xmm5,%xmm3
121	movdqa	%xmm2,272(%r10)
122	movdqa	%xmm4,%xmm2
123
124	paddd	%xmm0,%xmm1
125	pcmpeqd	%xmm5,%xmm0
126	movdqa	%xmm3,288(%r10)
127	movdqa	%xmm4,%xmm3
128	paddd	%xmm1,%xmm2
129	pcmpeqd	%xmm5,%xmm1
130	movdqa	%xmm0,304(%r10)
131
132	paddd	%xmm2,%xmm3
133.byte	0x67
134	pcmpeqd	%xmm5,%xmm2
135	movdqa	%xmm1,320(%r10)
136
137	pcmpeqd	%xmm5,%xmm3
138	movdqa	%xmm2,336(%r10)
139	pand	64(%r12),%xmm0
140
141	pand	80(%r12),%xmm1
142	pand	96(%r12),%xmm2
143	movdqa	%xmm3,352(%r10)
144	pand	112(%r12),%xmm3
145	por	%xmm2,%xmm0
146	por	%xmm3,%xmm1
147	movdqa	-128(%r12),%xmm4
148	movdqa	-112(%r12),%xmm5
149	movdqa	-96(%r12),%xmm2
150	pand	112(%r10),%xmm4
151	movdqa	-80(%r12),%xmm3
152	pand	128(%r10),%xmm5
153	por	%xmm4,%xmm0
154	pand	144(%r10),%xmm2
155	por	%xmm5,%xmm1
156	pand	160(%r10),%xmm3
157	por	%xmm2,%xmm0
158	por	%xmm3,%xmm1
159	movdqa	-64(%r12),%xmm4
160	movdqa	-48(%r12),%xmm5
161	movdqa	-32(%r12),%xmm2
162	pand	176(%r10),%xmm4
163	movdqa	-16(%r12),%xmm3
164	pand	192(%r10),%xmm5
165	por	%xmm4,%xmm0
166	pand	208(%r10),%xmm2
167	por	%xmm5,%xmm1
168	pand	224(%r10),%xmm3
169	por	%xmm2,%xmm0
170	por	%xmm3,%xmm1
171	movdqa	0(%r12),%xmm4
172	movdqa	16(%r12),%xmm5
173	movdqa	32(%r12),%xmm2
174	pand	240(%r10),%xmm4
175	movdqa	48(%r12),%xmm3
176	pand	256(%r10),%xmm5
177	por	%xmm4,%xmm0
178	pand	272(%r10),%xmm2
179	por	%xmm5,%xmm1
180	pand	288(%r10),%xmm3
181	por	%xmm2,%xmm0
182	por	%xmm3,%xmm1
183	por	%xmm1,%xmm0
184	pshufd	$0x4e,%xmm0,%xmm1
185	por	%xmm1,%xmm0
186	leaq	256(%r12),%r12
187.byte	102,72,15,126,195
188
189	movq	(%r8),%r8
190	movq	(%rsi),%rax
191
192	xorq	%r14,%r14
193	xorq	%r15,%r15
194
195	movq	%r8,%rbp
196	mulq	%rbx
197	movq	%rax,%r10
198	movq	(%rcx),%rax
199
200	imulq	%r10,%rbp
201	movq	%rdx,%r11
202
203	mulq	%rbp
204	addq	%rax,%r10
205	movq	8(%rsi),%rax
206	adcq	$0,%rdx
207	movq	%rdx,%r13
208
209	leaq	1(%r15),%r15
210	jmp	.L1st_enter
211
212.align	16
213.L1st:
214	addq	%rax,%r13
215	movq	(%rsi,%r15,8),%rax
216	adcq	$0,%rdx
217	addq	%r11,%r13
218	movq	%r10,%r11
219	adcq	$0,%rdx
220	movq	%r13,-16(%rsp,%r15,8)
221	movq	%rdx,%r13
222
223.L1st_enter:
224	mulq	%rbx
225	addq	%rax,%r11
226	movq	(%rcx,%r15,8),%rax
227	adcq	$0,%rdx
228	leaq	1(%r15),%r15
229	movq	%rdx,%r10
230
231	mulq	%rbp
232	cmpq	%r9,%r15
233	jne	.L1st
234
235
236	addq	%rax,%r13
237	adcq	$0,%rdx
238	addq	%r11,%r13
239	adcq	$0,%rdx
240	movq	%r13,-16(%rsp,%r9,8)
241	movq	%rdx,%r13
242	movq	%r10,%r11
243
244	xorq	%rdx,%rdx
245	addq	%r11,%r13
246	adcq	$0,%rdx
247	movq	%r13,-8(%rsp,%r9,8)
248	movq	%rdx,(%rsp,%r9,8)
249
250	leaq	1(%r14),%r14
251	jmp	.Louter
252.align	16
253.Louter:
254	leaq	24+128(%rsp,%r9,8),%rdx
255	andq	$-16,%rdx
256	pxor	%xmm4,%xmm4
257	pxor	%xmm5,%xmm5
258	movdqa	-128(%r12),%xmm0
259	movdqa	-112(%r12),%xmm1
260	movdqa	-96(%r12),%xmm2
261	movdqa	-80(%r12),%xmm3
262	pand	-128(%rdx),%xmm0
263	pand	-112(%rdx),%xmm1
264	por	%xmm0,%xmm4
265	pand	-96(%rdx),%xmm2
266	por	%xmm1,%xmm5
267	pand	-80(%rdx),%xmm3
268	por	%xmm2,%xmm4
269	por	%xmm3,%xmm5
270	movdqa	-64(%r12),%xmm0
271	movdqa	-48(%r12),%xmm1
272	movdqa	-32(%r12),%xmm2
273	movdqa	-16(%r12),%xmm3
274	pand	-64(%rdx),%xmm0
275	pand	-48(%rdx),%xmm1
276	por	%xmm0,%xmm4
277	pand	-32(%rdx),%xmm2
278	por	%xmm1,%xmm5
279	pand	-16(%rdx),%xmm3
280	por	%xmm2,%xmm4
281	por	%xmm3,%xmm5
282	movdqa	0(%r12),%xmm0
283	movdqa	16(%r12),%xmm1
284	movdqa	32(%r12),%xmm2
285	movdqa	48(%r12),%xmm3
286	pand	0(%rdx),%xmm0
287	pand	16(%rdx),%xmm1
288	por	%xmm0,%xmm4
289	pand	32(%rdx),%xmm2
290	por	%xmm1,%xmm5
291	pand	48(%rdx),%xmm3
292	por	%xmm2,%xmm4
293	por	%xmm3,%xmm5
294	movdqa	64(%r12),%xmm0
295	movdqa	80(%r12),%xmm1
296	movdqa	96(%r12),%xmm2
297	movdqa	112(%r12),%xmm3
298	pand	64(%rdx),%xmm0
299	pand	80(%rdx),%xmm1
300	por	%xmm0,%xmm4
301	pand	96(%rdx),%xmm2
302	por	%xmm1,%xmm5
303	pand	112(%rdx),%xmm3
304	por	%xmm2,%xmm4
305	por	%xmm3,%xmm5
306	por	%xmm5,%xmm4
307	pshufd	$0x4e,%xmm4,%xmm0
308	por	%xmm4,%xmm0
309	leaq	256(%r12),%r12
310
311	movq	(%rsi),%rax
312.byte	102,72,15,126,195
313
314	xorq	%r15,%r15
315	movq	%r8,%rbp
316	movq	(%rsp),%r10
317
318	mulq	%rbx
319	addq	%rax,%r10
320	movq	(%rcx),%rax
321	adcq	$0,%rdx
322
323	imulq	%r10,%rbp
324	movq	%rdx,%r11
325
326	mulq	%rbp
327	addq	%rax,%r10
328	movq	8(%rsi),%rax
329	adcq	$0,%rdx
330	movq	8(%rsp),%r10
331	movq	%rdx,%r13
332
333	leaq	1(%r15),%r15
334	jmp	.Linner_enter
335
336.align	16
337.Linner:
338	addq	%rax,%r13
339	movq	(%rsi,%r15,8),%rax
340	adcq	$0,%rdx
341	addq	%r10,%r13
342	movq	(%rsp,%r15,8),%r10
343	adcq	$0,%rdx
344	movq	%r13,-16(%rsp,%r15,8)
345	movq	%rdx,%r13
346
347.Linner_enter:
348	mulq	%rbx
349	addq	%rax,%r11
350	movq	(%rcx,%r15,8),%rax
351	adcq	$0,%rdx
352	addq	%r11,%r10
353	movq	%rdx,%r11
354	adcq	$0,%r11
355	leaq	1(%r15),%r15
356
357	mulq	%rbp
358	cmpq	%r9,%r15
359	jne	.Linner
360
361	addq	%rax,%r13
362	adcq	$0,%rdx
363	addq	%r10,%r13
364	movq	(%rsp,%r9,8),%r10
365	adcq	$0,%rdx
366	movq	%r13,-16(%rsp,%r9,8)
367	movq	%rdx,%r13
368
369	xorq	%rdx,%rdx
370	addq	%r11,%r13
371	adcq	$0,%rdx
372	addq	%r10,%r13
373	adcq	$0,%rdx
374	movq	%r13,-8(%rsp,%r9,8)
375	movq	%rdx,(%rsp,%r9,8)
376
377	leaq	1(%r14),%r14
378	cmpq	%r9,%r14
379	jb	.Louter
380
381	xorq	%r14,%r14
382	movq	(%rsp),%rax
383	leaq	(%rsp),%rsi
384	movq	%r9,%r15
385	jmp	.Lsub
386.align	16
387.Lsub:	sbbq	(%rcx,%r14,8),%rax
388	movq	%rax,(%rdi,%r14,8)
389	movq	8(%rsi,%r14,8),%rax
390	leaq	1(%r14),%r14
391	decq	%r15
392	jnz	.Lsub
393
394	sbbq	$0,%rax
395	xorq	%r14,%r14
396	andq	%rax,%rsi
397	notq	%rax
398	movq	%rdi,%rcx
399	andq	%rax,%rcx
400	movq	%r9,%r15
401	orq	%rcx,%rsi
402.align	16
403.Lcopy:
404	movq	(%rsi,%r14,8),%rax
405	movq	%r14,(%rsp,%r14,8)
406	movq	%rax,(%rdi,%r14,8)
407	leaq	1(%r14),%r14
408	subq	$1,%r15
409	jnz	.Lcopy
410
411	movq	8(%rsp,%r9,8),%rsi
412	movq	$1,%rax
413
414	movq	-48(%rsi),%r15
415	movq	-40(%rsi),%r14
416	movq	-32(%rsi),%r13
417	movq	-24(%rsi),%r12
418	movq	-16(%rsi),%rbp
419	movq	-8(%rsi),%rbx
420	leaq	(%rsi),%rsp
421.Lmul_epilogue:
422	.byte	0xf3,0xc3
423.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
/*
 * bn_mul4x_mont_gather5 -- wrapper for the 4-way unrolled Montgomery
 * multiply: dispatches to the mulx/ADX variant when the capability bits
 * 0x80108 are all set, otherwise allocates the stack frame (choosing it
 * so it does not alias rp modulo 4096), page-walks the new stack, and
 * calls mul4x_internal.  Registers on entry as bn_mul_mont_gather5;
 * %r11d carries OPENSSL_ia32cap_P+8 (loaded by the jump-in path at
 * .Lmul4x_enter -- presumably also valid for direct calls; confirm
 * against callers).
 */
424.type	bn_mul4x_mont_gather5,@function
425.align	32
426bn_mul4x_mont_gather5:
427.byte	0x67	/* addr-size prefix: padding for decoder alignment */
428	movq	%rsp,%rax	/* keep caller %rsp for the epilogue */
429.Lmul4x_enter:
430	andl	$0x80108,%r11d	/* ADX/BMI2-class feature bits */
431	cmpl	$0x80108,%r11d
432	je	.Lmulx4x_enter	/* all present: use mulx code path */
433	pushq	%rbx
434	pushq	%rbp
435	pushq	%r12
436	pushq	%r13
437	pushq	%r14
438	pushq	%r15
439.Lmul4x_prologue:
440
441.byte	0x67
442	shll	$3,%r9d	/* num in bytes */
443	leaq	(%r9,%r9,2),%r10	/* 3*num bytes: minimum frame estimate */
444	negq	%r9
445
446
447
448
449
450
451
452
453
454
	/* pick the frame so that (frame - rp) mod 4096 avoids aliasing rp's
	 * cache/page offset; fall back to .Lmul4xsp_alt when the adjustment
	 * would not leave enough room */
455	leaq	-320(%rsp,%r9,2),%r11
456	movq	%rsp,%rbp
457	subq	%rdi,%r11
458	andq	$4095,%r11
459	cmpq	%r11,%r10
460	jb	.Lmul4xsp_alt
461	subq	%r11,%rbp
462	leaq	-320(%rbp,%r9,2),%rbp	/* frame = rsp - 2*num - 320 - bias */
463	jmp	.Lmul4xsp_done
464
465.align	32
466.Lmul4xsp_alt:
467	leaq	4096-320(,%r9,2),%r10
468	leaq	-320(%rbp,%r9,2),%rbp
469	subq	%r10,%r11
470	movq	$0,%r10
471	cmovcq	%r10,%r11	/* clamp negative adjustment to 0 */
472	subq	%r11,%rbp
473.Lmul4xsp_done:
474	andq	$-64,%rbp	/* 64-byte align the frame */
475	movq	%rsp,%r11
476	subq	%rbp,%r11
477	andq	$-4096,%r11
478	leaq	(%r11,%rbp,1),%rsp
479	movq	(%rsp),%r10
480	cmpq	%rbp,%rsp
481	ja	.Lmul4x_page_walk
482	jmp	.Lmul4x_page_walk_done
483
	/* touch each intervening page so the guard page is not skipped */
484.Lmul4x_page_walk:
485	leaq	-4096(%rsp),%rsp
486	movq	(%rsp),%r10
487	cmpq	%rbp,%rsp
488	ja	.Lmul4x_page_walk
489.Lmul4x_page_walk_done:
490
491	negq	%r9	/* %r9 = -num*8 for the internal routine */
492
493	movq	%rax,40(%rsp)	/* saved caller %rsp */
494.Lmul4x_body:
495
496	call	mul4x_internal
497
498	movq	40(%rsp),%rsi
499	movq	$1,%rax	/* return 1 */
500
501	movq	-48(%rsi),%r15	/* restore callee-saved registers */
502	movq	-40(%rsi),%r14
503	movq	-32(%rsi),%r13
504	movq	-24(%rsi),%r12
505	movq	-16(%rsi),%rbp
506	movq	-8(%rsi),%rbx
507	leaq	(%rsi),%rsp
508.Lmul4x_epilogue:
509	.byte	0xf3,0xc3	/* rep ret */
510.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
511
/*
 * mul4x_internal -- 4-way unrolled Montgomery multiplication core, used
 * by bn_mul4x_mont_gather5 and bn_power5.  Expects the caller's frame
 * already set up: %r9 = -num*8, %rax = original caller %rsp (the table
 * index is read from 8(%rax)), %rdx = bp table, %rsi = ap, %rcx = np,
 * %r8 = &n0, %rdi = rp.  Like the scalar path, b[i] is gathered with
 * full-table masked reads so the access pattern is index-independent.
 * Falls through into the shared conditional-subtract code at
 * .Lsqr4x_sub_entry (defined later in this file).
 */
512.type	mul4x_internal,@function
513.align	32
514mul4x_internal:
515	shlq	$5,%r9	/* scale for table-end computation below */
516	movd	8(%rax),%xmm5	/* secret table index from caller's stack */
517	leaq	.Linc(%rip),%rax
518	leaq	128(%rdx,%r9,1),%r13	/* end-of-table sentinel (loop bound) */
519	shrq	$5,%r9	/* back to -num*8 */
520	movdqa	0(%rax),%xmm0	/* .Linc counters, as in the scalar path */
521	movdqa	16(%rax),%xmm1
522	leaq	88-112(%rsp,%r9,1),%r10	/* mask scratch area */
523	leaq	128(%rdx),%r12	/* &bp[16] */
524
	/* build the 16 select masks (see bn_mul_mont_gather5 for the idea) */
525	pshufd	$0,%xmm5,%xmm5	/* broadcast index */
526	movdqa	%xmm1,%xmm4
527.byte	0x67,0x67	/* padding prefixes */
528	movdqa	%xmm1,%xmm2
529	paddd	%xmm0,%xmm1
530	pcmpeqd	%xmm5,%xmm0
531.byte	0x67
532	movdqa	%xmm4,%xmm3
533	paddd	%xmm1,%xmm2
534	pcmpeqd	%xmm5,%xmm1
535	movdqa	%xmm0,112(%r10)
536	movdqa	%xmm4,%xmm0
537
538	paddd	%xmm2,%xmm3
539	pcmpeqd	%xmm5,%xmm2
540	movdqa	%xmm1,128(%r10)
541	movdqa	%xmm4,%xmm1
542
543	paddd	%xmm3,%xmm0
544	pcmpeqd	%xmm5,%xmm3
545	movdqa	%xmm2,144(%r10)
546	movdqa	%xmm4,%xmm2
547
548	paddd	%xmm0,%xmm1
549	pcmpeqd	%xmm5,%xmm0
550	movdqa	%xmm3,160(%r10)
551	movdqa	%xmm4,%xmm3
552	paddd	%xmm1,%xmm2
553	pcmpeqd	%xmm5,%xmm1
554	movdqa	%xmm0,176(%r10)
555	movdqa	%xmm4,%xmm0
556
557	paddd	%xmm2,%xmm3
558	pcmpeqd	%xmm5,%xmm2
559	movdqa	%xmm1,192(%r10)
560	movdqa	%xmm4,%xmm1
561
562	paddd	%xmm3,%xmm0
563	pcmpeqd	%xmm5,%xmm3
564	movdqa	%xmm2,208(%r10)
565	movdqa	%xmm4,%xmm2
566
567	paddd	%xmm0,%xmm1
568	pcmpeqd	%xmm5,%xmm0
569	movdqa	%xmm3,224(%r10)
570	movdqa	%xmm4,%xmm3
571	paddd	%xmm1,%xmm2
572	pcmpeqd	%xmm5,%xmm1
573	movdqa	%xmm0,240(%r10)
574	movdqa	%xmm4,%xmm0
575
576	paddd	%xmm2,%xmm3
577	pcmpeqd	%xmm5,%xmm2
578	movdqa	%xmm1,256(%r10)
579	movdqa	%xmm4,%xmm1
580
581	paddd	%xmm3,%xmm0
582	pcmpeqd	%xmm5,%xmm3
583	movdqa	%xmm2,272(%r10)
584	movdqa	%xmm4,%xmm2
585
586	paddd	%xmm0,%xmm1
587	pcmpeqd	%xmm5,%xmm0
588	movdqa	%xmm3,288(%r10)
589	movdqa	%xmm4,%xmm3
590	paddd	%xmm1,%xmm2
591	pcmpeqd	%xmm5,%xmm1
592	movdqa	%xmm0,304(%r10)
593
594	paddd	%xmm2,%xmm3
595.byte	0x67
596	pcmpeqd	%xmm5,%xmm2
597	movdqa	%xmm1,320(%r10)
598
599	pcmpeqd	%xmm5,%xmm3
600	movdqa	%xmm2,336(%r10)
	/* gather b[0]: AND all rows with their masks, OR-accumulate */
601	pand	64(%r12),%xmm0
602
603	pand	80(%r12),%xmm1
604	pand	96(%r12),%xmm2
605	movdqa	%xmm3,352(%r10)
606	pand	112(%r12),%xmm3
607	por	%xmm2,%xmm0
608	por	%xmm3,%xmm1
609	movdqa	-128(%r12),%xmm4
610	movdqa	-112(%r12),%xmm5
611	movdqa	-96(%r12),%xmm2
612	pand	112(%r10),%xmm4
613	movdqa	-80(%r12),%xmm3
614	pand	128(%r10),%xmm5
615	por	%xmm4,%xmm0
616	pand	144(%r10),%xmm2
617	por	%xmm5,%xmm1
618	pand	160(%r10),%xmm3
619	por	%xmm2,%xmm0
620	por	%xmm3,%xmm1
621	movdqa	-64(%r12),%xmm4
622	movdqa	-48(%r12),%xmm5
623	movdqa	-32(%r12),%xmm2
624	pand	176(%r10),%xmm4
625	movdqa	-16(%r12),%xmm3
626	pand	192(%r10),%xmm5
627	por	%xmm4,%xmm0
628	pand	208(%r10),%xmm2
629	por	%xmm5,%xmm1
630	pand	224(%r10),%xmm3
631	por	%xmm2,%xmm0
632	por	%xmm3,%xmm1
633	movdqa	0(%r12),%xmm4
634	movdqa	16(%r12),%xmm5
635	movdqa	32(%r12),%xmm2
636	pand	240(%r10),%xmm4
637	movdqa	48(%r12),%xmm3
638	pand	256(%r10),%xmm5
639	por	%xmm4,%xmm0
640	pand	272(%r10),%xmm2
641	por	%xmm5,%xmm1
642	pand	288(%r10),%xmm3
643	por	%xmm2,%xmm0
644	por	%xmm3,%xmm1
645	por	%xmm1,%xmm0
646	pshufd	$0x4e,%xmm0,%xmm1
647	por	%xmm1,%xmm0
648	leaq	256(%r12),%r12
649.byte	102,72,15,126,195	/* movq %xmm0,%rbx -- gathered b[0] */
650
651	movq	%r13,16+8(%rsp)	/* table end (outer-loop bound) */
652	movq	%rdi,56+8(%rsp)	/* save rp */
653
	/* ---- first pass: tp = a * b[0] + m*n, 4 limbs per iteration ---- */
654	movq	(%r8),%r8	/* n0 = *n0p */
655	movq	(%rsi),%rax	/* a[0] */
656	leaq	(%rsi,%r9,1),%rsi	/* point %rsi at a[num]; index with -num */
657	negq	%r9
658
659	movq	%r8,%rbp
660	mulq	%rbx
661	movq	%rax,%r10
662	movq	(%rcx),%rax
663
664	imulq	%r10,%rbp	/* m = tp[0]*n0 mod 2^64 */
665	leaq	64+8(%rsp),%r14	/* %r14 = tp write pointer */
666	movq	%rdx,%r11
667
668	mulq	%rbp
669	addq	%rax,%r10	/* tp[0] annihilated, carry kept */
670	movq	8(%rsi,%r9,1),%rax
671	adcq	$0,%rdx
672	movq	%rdx,%rdi	/* %rdi reused as a carry limb here */
673
674	mulq	%rbx
675	addq	%rax,%r11
676	movq	8(%rcx),%rax
677	adcq	$0,%rdx
678	movq	%rdx,%r10
679
680	mulq	%rbp
681	addq	%rax,%rdi
682	movq	16(%rsi,%r9,1),%rax
683	adcq	$0,%rdx
684	addq	%r11,%rdi
685	leaq	32(%r9),%r15	/* j = 2 limbs in (negative index) */
686	leaq	32(%rcx),%rcx
687	adcq	$0,%rdx
688	movq	%rdi,(%r14)
689	movq	%rdx,%r13
690	jmp	.L1st4x
691
692.align	32
693.L1st4x:
	/* four limbs of a[j]*b[0] + n[j]*m per iteration */
694	mulq	%rbx
695	addq	%rax,%r10
696	movq	-16(%rcx),%rax
697	leaq	32(%r14),%r14
698	adcq	$0,%rdx
699	movq	%rdx,%r11
700
701	mulq	%rbp
702	addq	%rax,%r13
703	movq	-8(%rsi,%r15,1),%rax
704	adcq	$0,%rdx
705	addq	%r10,%r13
706	adcq	$0,%rdx
707	movq	%r13,-24(%r14)
708	movq	%rdx,%rdi
709
710	mulq	%rbx
711	addq	%rax,%r11
712	movq	-8(%rcx),%rax
713	adcq	$0,%rdx
714	movq	%rdx,%r10
715
716	mulq	%rbp
717	addq	%rax,%rdi
718	movq	(%rsi,%r15,1),%rax
719	adcq	$0,%rdx
720	addq	%r11,%rdi
721	adcq	$0,%rdx
722	movq	%rdi,-16(%r14)
723	movq	%rdx,%r13
724
725	mulq	%rbx
726	addq	%rax,%r10
727	movq	0(%rcx),%rax
728	adcq	$0,%rdx
729	movq	%rdx,%r11
730
731	mulq	%rbp
732	addq	%rax,%r13
733	movq	8(%rsi,%r15,1),%rax
734	adcq	$0,%rdx
735	addq	%r10,%r13
736	adcq	$0,%rdx
737	movq	%r13,-8(%r14)
738	movq	%rdx,%rdi
739
740	mulq	%rbx
741	addq	%rax,%r11
742	movq	8(%rcx),%rax
743	adcq	$0,%rdx
744	movq	%rdx,%r10
745
746	mulq	%rbp
747	addq	%rax,%rdi
748	movq	16(%rsi,%r15,1),%rax
749	adcq	$0,%rdx
750	addq	%r11,%rdi
751	leaq	32(%rcx),%rcx
752	adcq	$0,%rdx
753	movq	%rdi,(%r14)
754	movq	%rdx,%r13
755
756	addq	$32,%r15	/* j += 4 limbs; negative index reaches 0 */
757	jnz	.L1st4x
758
	/* first-pass tail: last four limbs and top carry */
759	mulq	%rbx
760	addq	%rax,%r10
761	movq	-16(%rcx),%rax
762	leaq	32(%r14),%r14
763	adcq	$0,%rdx
764	movq	%rdx,%r11
765
766	mulq	%rbp
767	addq	%rax,%r13
768	movq	-8(%rsi),%rax
769	adcq	$0,%rdx
770	addq	%r10,%r13
771	adcq	$0,%rdx
772	movq	%r13,-24(%r14)
773	movq	%rdx,%rdi
774
775	mulq	%rbx
776	addq	%rax,%r11
777	movq	-8(%rcx),%rax
778	adcq	$0,%rdx
779	movq	%rdx,%r10
780
781	mulq	%rbp
782	addq	%rax,%rdi
783	movq	(%rsi,%r9,1),%rax	/* reload a[0] for next outer pass */
784	adcq	$0,%rdx
785	addq	%r11,%rdi
786	adcq	$0,%rdx
787	movq	%rdi,-16(%r14)
788	movq	%rdx,%r13
789
790	leaq	(%rcx,%r9,1),%rcx	/* rewind np */
791
792	xorq	%rdi,%rdi
793	addq	%r10,%r13
794	adcq	$0,%rdi	/* %rdi = top carry */
795	movq	%r13,-8(%r14)
796
797	jmp	.Louter4x
798
799.align	32
800.Louter4x:
	/* gather b[i] (masked full-table read, masks at 16+128(%r14)) */
801	leaq	16+128(%r14),%rdx
802	pxor	%xmm4,%xmm4
803	pxor	%xmm5,%xmm5
804	movdqa	-128(%r12),%xmm0
805	movdqa	-112(%r12),%xmm1
806	movdqa	-96(%r12),%xmm2
807	movdqa	-80(%r12),%xmm3
808	pand	-128(%rdx),%xmm0
809	pand	-112(%rdx),%xmm1
810	por	%xmm0,%xmm4
811	pand	-96(%rdx),%xmm2
812	por	%xmm1,%xmm5
813	pand	-80(%rdx),%xmm3
814	por	%xmm2,%xmm4
815	por	%xmm3,%xmm5
816	movdqa	-64(%r12),%xmm0
817	movdqa	-48(%r12),%xmm1
818	movdqa	-32(%r12),%xmm2
819	movdqa	-16(%r12),%xmm3
820	pand	-64(%rdx),%xmm0
821	pand	-48(%rdx),%xmm1
822	por	%xmm0,%xmm4
823	pand	-32(%rdx),%xmm2
824	por	%xmm1,%xmm5
825	pand	-16(%rdx),%xmm3
826	por	%xmm2,%xmm4
827	por	%xmm3,%xmm5
828	movdqa	0(%r12),%xmm0
829	movdqa	16(%r12),%xmm1
830	movdqa	32(%r12),%xmm2
831	movdqa	48(%r12),%xmm3
832	pand	0(%rdx),%xmm0
833	pand	16(%rdx),%xmm1
834	por	%xmm0,%xmm4
835	pand	32(%rdx),%xmm2
836	por	%xmm1,%xmm5
837	pand	48(%rdx),%xmm3
838	por	%xmm2,%xmm4
839	por	%xmm3,%xmm5
840	movdqa	64(%r12),%xmm0
841	movdqa	80(%r12),%xmm1
842	movdqa	96(%r12),%xmm2
843	movdqa	112(%r12),%xmm3
844	pand	64(%rdx),%xmm0
845	pand	80(%rdx),%xmm1
846	por	%xmm0,%xmm4
847	pand	96(%rdx),%xmm2
848	por	%xmm1,%xmm5
849	pand	112(%rdx),%xmm3
850	por	%xmm2,%xmm4
851	por	%xmm3,%xmm5
852	por	%xmm5,%xmm4
853	pshufd	$0x4e,%xmm4,%xmm0
854	por	%xmm4,%xmm0
855	leaq	256(%r12),%r12
856.byte	102,72,15,126,195	/* movq %xmm0,%rbx -- gathered b[i] */
857
	/* ---- outer iterations: tp = (tp + a*b[i] + m*n) / 2^64 ---- */
858	movq	(%r14,%r9,1),%r10	/* tp[0] */
859	movq	%r8,%rbp
860	mulq	%rbx
861	addq	%rax,%r10
862	movq	(%rcx),%rax
863	adcq	$0,%rdx
864
865	imulq	%r10,%rbp	/* m */
866	movq	%rdx,%r11
867	movq	%rdi,(%r14)	/* flush previous top carry into tp */
868
869	leaq	(%r14,%r9,1),%r14	/* rewind tp write pointer */
870
871	mulq	%rbp
872	addq	%rax,%r10
873	movq	8(%rsi,%r9,1),%rax
874	adcq	$0,%rdx
875	movq	%rdx,%rdi
876
877	mulq	%rbx
878	addq	%rax,%r11
879	movq	8(%rcx),%rax
880	adcq	$0,%rdx
881	addq	8(%r14),%r11	/* += tp[1] */
882	adcq	$0,%rdx
883	movq	%rdx,%r10
884
885	mulq	%rbp
886	addq	%rax,%rdi
887	movq	16(%rsi,%r9,1),%rax
888	adcq	$0,%rdx
889	addq	%r11,%rdi
890	leaq	32(%r9),%r15
891	leaq	32(%rcx),%rcx
892	adcq	$0,%rdx
893	movq	%rdx,%r13
894	jmp	.Linner4x
895
896.align	32
897.Linner4x:
	/* four limbs of tp[j] + a[j]*b[i] + n[j]*m per iteration */
898	mulq	%rbx
899	addq	%rax,%r10
900	movq	-16(%rcx),%rax
901	adcq	$0,%rdx
902	addq	16(%r14),%r10
903	leaq	32(%r14),%r14
904	adcq	$0,%rdx
905	movq	%rdx,%r11
906
907	mulq	%rbp
908	addq	%rax,%r13
909	movq	-8(%rsi,%r15,1),%rax
910	adcq	$0,%rdx
911	addq	%r10,%r13
912	adcq	$0,%rdx
913	movq	%rdi,-32(%r14)	/* store limb produced two steps back */
914	movq	%rdx,%rdi
915
916	mulq	%rbx
917	addq	%rax,%r11
918	movq	-8(%rcx),%rax
919	adcq	$0,%rdx
920	addq	-8(%r14),%r11
921	adcq	$0,%rdx
922	movq	%rdx,%r10
923
924	mulq	%rbp
925	addq	%rax,%rdi
926	movq	(%rsi,%r15,1),%rax
927	adcq	$0,%rdx
928	addq	%r11,%rdi
929	adcq	$0,%rdx
930	movq	%r13,-24(%r14)
931	movq	%rdx,%r13
932
933	mulq	%rbx
934	addq	%rax,%r10
935	movq	0(%rcx),%rax
936	adcq	$0,%rdx
937	addq	(%r14),%r10
938	adcq	$0,%rdx
939	movq	%rdx,%r11
940
941	mulq	%rbp
942	addq	%rax,%r13
943	movq	8(%rsi,%r15,1),%rax
944	adcq	$0,%rdx
945	addq	%r10,%r13
946	adcq	$0,%rdx
947	movq	%rdi,-16(%r14)
948	movq	%rdx,%rdi
949
950	mulq	%rbx
951	addq	%rax,%r11
952	movq	8(%rcx),%rax
953	adcq	$0,%rdx
954	addq	8(%r14),%r11
955	adcq	$0,%rdx
956	movq	%rdx,%r10
957
958	mulq	%rbp
959	addq	%rax,%rdi
960	movq	16(%rsi,%r15,1),%rax
961	adcq	$0,%rdx
962	addq	%r11,%rdi
963	leaq	32(%rcx),%rcx
964	adcq	$0,%rdx
965	movq	%r13,-8(%r14)
966	movq	%rdx,%r13
967
968	addq	$32,%r15
969	jnz	.Linner4x
970
	/* inner-loop tail for the last four limbs */
971	mulq	%rbx
972	addq	%rax,%r10
973	movq	-16(%rcx),%rax
974	adcq	$0,%rdx
975	addq	16(%r14),%r10
976	leaq	32(%r14),%r14
977	adcq	$0,%rdx
978	movq	%rdx,%r11
979
980	mulq	%rbp
981	addq	%rax,%r13
982	movq	-8(%rsi),%rax
983	adcq	$0,%rdx
984	addq	%r10,%r13
985	adcq	$0,%rdx
986	movq	%rdi,-32(%r14)
987	movq	%rdx,%rdi
988
989	mulq	%rbx
990	addq	%rax,%r11
991	movq	%rbp,%rax	/* swap m into %rax ... */
992	movq	-8(%rcx),%rbp	/* ... and n[num-1] into %rbp */
993	adcq	$0,%rdx
994	addq	-8(%r14),%r11
995	adcq	$0,%rdx
996	movq	%rdx,%r10
997
998	mulq	%rbp	/* n[num-1]*m */
999	addq	%rax,%rdi
1000	movq	(%rsi,%r9,1),%rax	/* reload a[0] */
1001	adcq	$0,%rdx
1002	addq	%r11,%rdi
1003	adcq	$0,%rdx
1004	movq	%r13,-24(%r14)
1005	movq	%rdx,%r13
1006
1007	movq	%rdi,-16(%r14)
1008	leaq	(%rcx,%r9,1),%rcx	/* rewind np */
1009
1010	xorq	%rdi,%rdi
1011	addq	%r10,%r13
1012	adcq	$0,%rdi
1013	addq	(%r14),%r13	/* fold in previous top carry */
1014	adcq	$0,%rdi
1015	movq	%r13,-8(%r14)
1016
1017	cmpq	16+8(%rsp),%r12	/* reached table end? */
1018	jb	.Louter4x
	/* set up the branch-free conditional subtraction and jump to the
	 * shared sub/copy code (.Lsqr4x_sub_entry, later in this file):
	 * %rax = -(top carry), %rbx = tp, %rbp = np, %rcx = num/32 */
1019	xorq	%rax,%rax
1020	subq	%r13,%rbp	/* CF hint from (m - tp_top) */
1021	adcq	%r15,%r15	/* %r15 is 0 here; captures CF */
1022	orq	%r15,%rdi
1023	subq	%rdi,%rax	/* 0 or all-ones mask */
1024	leaq	(%r14,%r9,1),%rbx
1025	movq	(%rcx),%r12
1026	leaq	(%rcx),%rbp
1027	movq	%r9,%rcx
1028	sarq	$3+2,%rcx	/* -num*8 -> -num/4 (iteration count) */
1029	movq	56+8(%rsp),%rdi	/* restore rp */
1030	decq	%r12	/* n[0] - 1: feeds the sub entry's carry */
1031	xorq	%r10,%r10
1032	movq	8(%rbp),%r13
1033	movq	16(%rbp),%r14
1034	movq	24(%rbp),%r15
1035	jmp	.Lsqr4x_sub_entry
1036.size	mul4x_internal,.-mul4x_internal
/*
 * bn_power5 -- one fixed-window exponentiation step: five consecutive
 * Montgomery squarings (via __bn_sqr8x_internal + __bn_post4x_internal)
 * followed by one Montgomery multiplication by a table-gathered operand
 * (mul4x_internal), i.e. r = a^(2^5) * tbl[power] in the Montgomery
 * domain.  Register arguments as bn_mul_mont_gather5; dispatches to the
 * mulx/ADX variant when capability bits 0x80108 are present.  Frame
 * setup (anti-aliasing vs rp, page walk) mirrors bn_mul4x_mont_gather5.
 */
1037.globl	bn_power5
1038.type	bn_power5,@function
1039.align	32
1040bn_power5:
1041	movq	%rsp,%rax	/* keep caller %rsp for the epilogue */
1042	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
1043	andl	$0x80108,%r11d
1044	cmpl	$0x80108,%r11d
1045	je	.Lpowerx5_enter	/* ADX/BMI2 path */
1046	pushq	%rbx
1047	pushq	%rbp
1048	pushq	%r12
1049	pushq	%r13
1050	pushq	%r14
1051	pushq	%r15
1052.Lpower5_prologue:
1053
1054	shll	$3,%r9d	/* num in bytes */
1055	leal	(%r9,%r9,2),%r10d	/* 3*num: minimum frame estimate */
1056	negq	%r9
1057	movq	(%r8),%r8	/* n0 = *n0p */
1058
1059
1060
1061
1062
1063
1064
1065
	/* choose the frame so it does not alias rp modulo 4096 */
1066	leaq	-320(%rsp,%r9,2),%r11
1067	movq	%rsp,%rbp
1068	subq	%rdi,%r11
1069	andq	$4095,%r11
1070	cmpq	%r11,%r10
1071	jb	.Lpwr_sp_alt
1072	subq	%r11,%rbp
1073	leaq	-320(%rbp,%r9,2),%rbp
1074	jmp	.Lpwr_sp_done
1075
1076.align	32
1077.Lpwr_sp_alt:
1078	leaq	4096-320(,%r9,2),%r10
1079	leaq	-320(%rbp,%r9,2),%rbp
1080	subq	%r10,%r11
1081	movq	$0,%r10
1082	cmovcq	%r10,%r11	/* clamp negative adjustment to 0 */
1083	subq	%r11,%rbp
1084.Lpwr_sp_done:
1085	andq	$-64,%rbp	/* 64-byte align the frame */
1086	movq	%rsp,%r11
1087	subq	%rbp,%r11
1088	andq	$-4096,%r11
1089	leaq	(%r11,%rbp,1),%rsp
1090	movq	(%rsp),%r10
1091	cmpq	%rbp,%rsp
1092	ja	.Lpwr_page_walk
1093	jmp	.Lpwr_page_walk_done
1094
	/* probe each page down to the new %rsp (guard-page safety) */
1095.Lpwr_page_walk:
1096	leaq	-4096(%rsp),%rsp
1097	movq	(%rsp),%r10
1098	cmpq	%rbp,%rsp
1099	ja	.Lpwr_page_walk
1100.Lpwr_page_walk_done:
1101
1102	movq	%r9,%r10
1103	negq	%r9
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114	movq	%r8,32(%rsp)	/* n0 */
1115	movq	%rax,40(%rsp)	/* saved caller %rsp */
1116.Lpower5_body:
	/* park pointer args in xmm regs across the internal calls */
1117.byte	102,72,15,110,207	/* movq %rdi,%xmm1 -- rp */
1118.byte	102,72,15,110,209	/* movq %rcx,%xmm2 -- np */
1119.byte	102,73,15,110,218	/* movq %r10,%xmm3 -- num */
1120.byte	102,72,15,110,226	/* movq %rdx,%xmm4 -- bp/table */
1121
	/* five modular squarings: a -> a^32 (Montgomery domain) */
1122	call	__bn_sqr8x_internal
1123	call	__bn_post4x_internal
1124	call	__bn_sqr8x_internal
1125	call	__bn_post4x_internal
1126	call	__bn_sqr8x_internal
1127	call	__bn_post4x_internal
1128	call	__bn_sqr8x_internal
1129	call	__bn_post4x_internal
1130	call	__bn_sqr8x_internal
1131	call	__bn_post4x_internal
1132
1133.byte	102,72,15,126,209	/* movq %xmm2,%rcx -- np back */
1134.byte	102,72,15,126,226	/* movq %xmm4,%rdx -- table back */
1135	movq	%rsi,%rdi	/* multiply result in place */
1136	movq	40(%rsp),%rax	/* caller %rsp: mul4x reads index at 8(%rax) */
1137	leaq	32(%rsp),%r8	/* &n0 */
1138
1139	call	mul4x_internal	/* * tbl[power] */
1140
1141	movq	40(%rsp),%rsi
1142	movq	$1,%rax	/* return 1 */
1143	movq	-48(%rsi),%r15	/* restore callee-saved registers */
1144	movq	-40(%rsi),%r14
1145	movq	-32(%rsi),%r13
1146	movq	-24(%rsi),%r12
1147	movq	-16(%rsi),%rbp
1148	movq	-8(%rsi),%rbx
1149	leaq	(%rsi),%rsp
1150.Lpower5_epilogue:
1151	.byte	0xf3,0xc3	/* rep ret */
1152.size	bn_power5,.-bn_power5
1153
1154.globl	bn_sqr8x_internal
1155.hidden	bn_sqr8x_internal
1156.type	bn_sqr8x_internal,@function
1157.align	32
1158bn_sqr8x_internal:
1159__bn_sqr8x_internal:
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233	leaq	32(%r10),%rbp
1234	leaq	(%rsi,%r9,1),%rsi
1235
1236	movq	%r9,%rcx
1237
1238
1239	movq	-32(%rsi,%rbp,1),%r14
1240	leaq	48+8(%rsp,%r9,2),%rdi
1241	movq	-24(%rsi,%rbp,1),%rax
1242	leaq	-32(%rdi,%rbp,1),%rdi
1243	movq	-16(%rsi,%rbp,1),%rbx
1244	movq	%rax,%r15
1245
1246	mulq	%r14
1247	movq	%rax,%r10
1248	movq	%rbx,%rax
1249	movq	%rdx,%r11
1250	movq	%r10,-24(%rdi,%rbp,1)
1251
1252	mulq	%r14
1253	addq	%rax,%r11
1254	movq	%rbx,%rax
1255	adcq	$0,%rdx
1256	movq	%r11,-16(%rdi,%rbp,1)
1257	movq	%rdx,%r10
1258
1259
1260	movq	-8(%rsi,%rbp,1),%rbx
1261	mulq	%r15
1262	movq	%rax,%r12
1263	movq	%rbx,%rax
1264	movq	%rdx,%r13
1265
1266	leaq	(%rbp),%rcx
1267	mulq	%r14
1268	addq	%rax,%r10
1269	movq	%rbx,%rax
1270	movq	%rdx,%r11
1271	adcq	$0,%r11
1272	addq	%r12,%r10
1273	adcq	$0,%r11
1274	movq	%r10,-8(%rdi,%rcx,1)
1275	jmp	.Lsqr4x_1st
1276
1277.align	32
1278.Lsqr4x_1st:
1279	movq	(%rsi,%rcx,1),%rbx
1280	mulq	%r15
1281	addq	%rax,%r13
1282	movq	%rbx,%rax
1283	movq	%rdx,%r12
1284	adcq	$0,%r12
1285
1286	mulq	%r14
1287	addq	%rax,%r11
1288	movq	%rbx,%rax
1289	movq	8(%rsi,%rcx,1),%rbx
1290	movq	%rdx,%r10
1291	adcq	$0,%r10
1292	addq	%r13,%r11
1293	adcq	$0,%r10
1294
1295
1296	mulq	%r15
1297	addq	%rax,%r12
1298	movq	%rbx,%rax
1299	movq	%r11,(%rdi,%rcx,1)
1300	movq	%rdx,%r13
1301	adcq	$0,%r13
1302
1303	mulq	%r14
1304	addq	%rax,%r10
1305	movq	%rbx,%rax
1306	movq	16(%rsi,%rcx,1),%rbx
1307	movq	%rdx,%r11
1308	adcq	$0,%r11
1309	addq	%r12,%r10
1310	adcq	$0,%r11
1311
1312	mulq	%r15
1313	addq	%rax,%r13
1314	movq	%rbx,%rax
1315	movq	%r10,8(%rdi,%rcx,1)
1316	movq	%rdx,%r12
1317	adcq	$0,%r12
1318
1319	mulq	%r14
1320	addq	%rax,%r11
1321	movq	%rbx,%rax
1322	movq	24(%rsi,%rcx,1),%rbx
1323	movq	%rdx,%r10
1324	adcq	$0,%r10
1325	addq	%r13,%r11
1326	adcq	$0,%r10
1327
1328
1329	mulq	%r15
1330	addq	%rax,%r12
1331	movq	%rbx,%rax
1332	movq	%r11,16(%rdi,%rcx,1)
1333	movq	%rdx,%r13
1334	adcq	$0,%r13
1335	leaq	32(%rcx),%rcx
1336
1337	mulq	%r14
1338	addq	%rax,%r10
1339	movq	%rbx,%rax
1340	movq	%rdx,%r11
1341	adcq	$0,%r11
1342	addq	%r12,%r10
1343	adcq	$0,%r11
1344	movq	%r10,-8(%rdi,%rcx,1)
1345
1346	cmpq	$0,%rcx
1347	jne	.Lsqr4x_1st
1348
1349	mulq	%r15
1350	addq	%rax,%r13
1351	leaq	16(%rbp),%rbp
1352	adcq	$0,%rdx
1353	addq	%r11,%r13
1354	adcq	$0,%rdx
1355
1356	movq	%r13,(%rdi)
1357	movq	%rdx,%r12
1358	movq	%rdx,8(%rdi)
1359	jmp	.Lsqr4x_outer
1360
1361.align	32
1362.Lsqr4x_outer:
1363	movq	-32(%rsi,%rbp,1),%r14
1364	leaq	48+8(%rsp,%r9,2),%rdi
1365	movq	-24(%rsi,%rbp,1),%rax
1366	leaq	-32(%rdi,%rbp,1),%rdi
1367	movq	-16(%rsi,%rbp,1),%rbx
1368	movq	%rax,%r15
1369
1370	mulq	%r14
1371	movq	-24(%rdi,%rbp,1),%r10
1372	addq	%rax,%r10
1373	movq	%rbx,%rax
1374	adcq	$0,%rdx
1375	movq	%r10,-24(%rdi,%rbp,1)
1376	movq	%rdx,%r11
1377
1378	mulq	%r14
1379	addq	%rax,%r11
1380	movq	%rbx,%rax
1381	adcq	$0,%rdx
1382	addq	-16(%rdi,%rbp,1),%r11
1383	movq	%rdx,%r10
1384	adcq	$0,%r10
1385	movq	%r11,-16(%rdi,%rbp,1)
1386
1387	xorq	%r12,%r12
1388
1389	movq	-8(%rsi,%rbp,1),%rbx
1390	mulq	%r15
1391	addq	%rax,%r12
1392	movq	%rbx,%rax
1393	adcq	$0,%rdx
1394	addq	-8(%rdi,%rbp,1),%r12
1395	movq	%rdx,%r13
1396	adcq	$0,%r13
1397
1398	mulq	%r14
1399	addq	%rax,%r10
1400	movq	%rbx,%rax
1401	adcq	$0,%rdx
1402	addq	%r12,%r10
1403	movq	%rdx,%r11
1404	adcq	$0,%r11
1405	movq	%r10,-8(%rdi,%rbp,1)
1406
1407	leaq	(%rbp),%rcx
1408	jmp	.Lsqr4x_inner
1409
1410.align	32
1411.Lsqr4x_inner:
1412	movq	(%rsi,%rcx,1),%rbx
1413	mulq	%r15
1414	addq	%rax,%r13
1415	movq	%rbx,%rax
1416	movq	%rdx,%r12
1417	adcq	$0,%r12
1418	addq	(%rdi,%rcx,1),%r13
1419	adcq	$0,%r12
1420
1421.byte	0x67
1422	mulq	%r14
1423	addq	%rax,%r11
1424	movq	%rbx,%rax
1425	movq	8(%rsi,%rcx,1),%rbx
1426	movq	%rdx,%r10
1427	adcq	$0,%r10
1428	addq	%r13,%r11
1429	adcq	$0,%r10
1430
1431	mulq	%r15
1432	addq	%rax,%r12
1433	movq	%r11,(%rdi,%rcx,1)
1434	movq	%rbx,%rax
1435	movq	%rdx,%r13
1436	adcq	$0,%r13
1437	addq	8(%rdi,%rcx,1),%r12
1438	leaq	16(%rcx),%rcx
1439	adcq	$0,%r13
1440
1441	mulq	%r14
1442	addq	%rax,%r10
1443	movq	%rbx,%rax
1444	adcq	$0,%rdx
1445	addq	%r12,%r10
1446	movq	%rdx,%r11
1447	adcq	$0,%r11
1448	movq	%r10,-8(%rdi,%rcx,1)
1449
1450	cmpq	$0,%rcx
1451	jne	.Lsqr4x_inner
1452
1453.byte	0x67
1454	mulq	%r15
1455	addq	%rax,%r13
1456	adcq	$0,%rdx
1457	addq	%r11,%r13
1458	adcq	$0,%rdx
1459
1460	movq	%r13,(%rdi)
1461	movq	%rdx,%r12
1462	movq	%rdx,8(%rdi)
1463
1464	addq	$16,%rbp
1465	jnz	.Lsqr4x_outer
1466
1467
1468	movq	-32(%rsi),%r14
1469	leaq	48+8(%rsp,%r9,2),%rdi
1470	movq	-24(%rsi),%rax
1471	leaq	-32(%rdi,%rbp,1),%rdi
1472	movq	-16(%rsi),%rbx
1473	movq	%rax,%r15
1474
1475	mulq	%r14
1476	addq	%rax,%r10
1477	movq	%rbx,%rax
1478	movq	%rdx,%r11
1479	adcq	$0,%r11
1480
1481	mulq	%r14
1482	addq	%rax,%r11
1483	movq	%rbx,%rax
1484	movq	%r10,-24(%rdi)
1485	movq	%rdx,%r10
1486	adcq	$0,%r10
1487	addq	%r13,%r11
1488	movq	-8(%rsi),%rbx
1489	adcq	$0,%r10
1490
1491	mulq	%r15
1492	addq	%rax,%r12
1493	movq	%rbx,%rax
1494	movq	%r11,-16(%rdi)
1495	movq	%rdx,%r13
1496	adcq	$0,%r13
1497
1498	mulq	%r14
1499	addq	%rax,%r10
1500	movq	%rbx,%rax
1501	movq	%rdx,%r11
1502	adcq	$0,%r11
1503	addq	%r12,%r10
1504	adcq	$0,%r11
1505	movq	%r10,-8(%rdi)
1506
1507	mulq	%r15
1508	addq	%rax,%r13
1509	movq	-16(%rsi),%rax
1510	adcq	$0,%rdx
1511	addq	%r11,%r13
1512	adcq	$0,%rdx
1513
1514	movq	%r13,(%rdi)
1515	movq	%rdx,%r12
1516	movq	%rdx,8(%rdi)
1517
1518	mulq	%rbx
1519	addq	$16,%rbp
1520	xorq	%r14,%r14
1521	subq	%r9,%rbp
1522	xorq	%r15,%r15
1523
1524	addq	%r12,%rax
1525	adcq	$0,%rdx
1526	movq	%rax,8(%rdi)
1527	movq	%rdx,16(%rdi)
1528	movq	%r15,24(%rdi)
1529
1530	movq	-16(%rsi,%rbp,1),%rax
1531	leaq	48+8(%rsp),%rdi
1532	xorq	%r10,%r10
1533	movq	8(%rdi),%r11
1534
1535	leaq	(%r14,%r10,2),%r12
1536	shrq	$63,%r10
1537	leaq	(%rcx,%r11,2),%r13
1538	shrq	$63,%r11
1539	orq	%r10,%r13
1540	movq	16(%rdi),%r10
1541	movq	%r11,%r14
1542	mulq	%rax
1543	negq	%r15
1544	movq	24(%rdi),%r11
1545	adcq	%rax,%r12
1546	movq	-8(%rsi,%rbp,1),%rax
1547	movq	%r12,(%rdi)
1548	adcq	%rdx,%r13
1549
1550	leaq	(%r14,%r10,2),%rbx
1551	movq	%r13,8(%rdi)
1552	sbbq	%r15,%r15
1553	shrq	$63,%r10
1554	leaq	(%rcx,%r11,2),%r8
1555	shrq	$63,%r11
1556	orq	%r10,%r8
1557	movq	32(%rdi),%r10
1558	movq	%r11,%r14
1559	mulq	%rax
1560	negq	%r15
1561	movq	40(%rdi),%r11
1562	adcq	%rax,%rbx
1563	movq	0(%rsi,%rbp,1),%rax
1564	movq	%rbx,16(%rdi)
1565	adcq	%rdx,%r8
1566	leaq	16(%rbp),%rbp
1567	movq	%r8,24(%rdi)
1568	sbbq	%r15,%r15
1569	leaq	64(%rdi),%rdi
1570	jmp	.Lsqr4x_shift_n_add
1571
1572.align	32
1573.Lsqr4x_shift_n_add:
1574	leaq	(%r14,%r10,2),%r12
1575	shrq	$63,%r10
1576	leaq	(%rcx,%r11,2),%r13
1577	shrq	$63,%r11
1578	orq	%r10,%r13
1579	movq	-16(%rdi),%r10
1580	movq	%r11,%r14
1581	mulq	%rax
1582	negq	%r15
1583	movq	-8(%rdi),%r11
1584	adcq	%rax,%r12
1585	movq	-8(%rsi,%rbp,1),%rax
1586	movq	%r12,-32(%rdi)
1587	adcq	%rdx,%r13
1588
1589	leaq	(%r14,%r10,2),%rbx
1590	movq	%r13,-24(%rdi)
1591	sbbq	%r15,%r15
1592	shrq	$63,%r10
1593	leaq	(%rcx,%r11,2),%r8
1594	shrq	$63,%r11
1595	orq	%r10,%r8
1596	movq	0(%rdi),%r10
1597	movq	%r11,%r14
1598	mulq	%rax
1599	negq	%r15
1600	movq	8(%rdi),%r11
1601	adcq	%rax,%rbx
1602	movq	0(%rsi,%rbp,1),%rax
1603	movq	%rbx,-16(%rdi)
1604	adcq	%rdx,%r8
1605
1606	leaq	(%r14,%r10,2),%r12
1607	movq	%r8,-8(%rdi)
1608	sbbq	%r15,%r15
1609	shrq	$63,%r10
1610	leaq	(%rcx,%r11,2),%r13
1611	shrq	$63,%r11
1612	orq	%r10,%r13
1613	movq	16(%rdi),%r10
1614	movq	%r11,%r14
1615	mulq	%rax
1616	negq	%r15
1617	movq	24(%rdi),%r11
1618	adcq	%rax,%r12
1619	movq	8(%rsi,%rbp,1),%rax
1620	movq	%r12,0(%rdi)
1621	adcq	%rdx,%r13
1622
1623	leaq	(%r14,%r10,2),%rbx
1624	movq	%r13,8(%rdi)
1625	sbbq	%r15,%r15
1626	shrq	$63,%r10
1627	leaq	(%rcx,%r11,2),%r8
1628	shrq	$63,%r11
1629	orq	%r10,%r8
1630	movq	32(%rdi),%r10
1631	movq	%r11,%r14
1632	mulq	%rax
1633	negq	%r15
1634	movq	40(%rdi),%r11
1635	adcq	%rax,%rbx
1636	movq	16(%rsi,%rbp,1),%rax
1637	movq	%rbx,16(%rdi)
1638	adcq	%rdx,%r8
1639	movq	%r8,24(%rdi)
1640	sbbq	%r15,%r15
1641	leaq	64(%rdi),%rdi
1642	addq	$32,%rbp
1643	jnz	.Lsqr4x_shift_n_add
1644
1645	leaq	(%r14,%r10,2),%r12
1646.byte	0x67
1647	shrq	$63,%r10
1648	leaq	(%rcx,%r11,2),%r13
1649	shrq	$63,%r11
1650	orq	%r10,%r13
1651	movq	-16(%rdi),%r10
1652	movq	%r11,%r14
1653	mulq	%rax
1654	negq	%r15
1655	movq	-8(%rdi),%r11
1656	adcq	%rax,%r12
1657	movq	-8(%rsi),%rax
1658	movq	%r12,-32(%rdi)
1659	adcq	%rdx,%r13
1660
1661	leaq	(%r14,%r10,2),%rbx
1662	movq	%r13,-24(%rdi)
1663	sbbq	%r15,%r15
1664	shrq	$63,%r10
1665	leaq	(%rcx,%r11,2),%r8
1666	shrq	$63,%r11
1667	orq	%r10,%r8
1668	mulq	%rax
1669	negq	%r15
1670	adcq	%rax,%rbx
1671	adcq	%rdx,%r8
1672	movq	%rbx,-16(%rdi)
1673	movq	%r8,-8(%rdi)
1674.byte	102,72,15,126,213
1675__bn_sqr8x_reduction:
1676	xorq	%rax,%rax
1677	leaq	(%r9,%rbp,1),%rcx
1678	leaq	48+8(%rsp,%r9,2),%rdx
1679	movq	%rcx,0+8(%rsp)
1680	leaq	48+8(%rsp,%r9,1),%rdi
1681	movq	%rdx,8+8(%rsp)
1682	negq	%r9
1683	jmp	.L8x_reduction_loop
1684
1685.align	32
1686.L8x_reduction_loop:
1687	leaq	(%rdi,%r9,1),%rdi
1688.byte	0x66
1689	movq	0(%rdi),%rbx
1690	movq	8(%rdi),%r9
1691	movq	16(%rdi),%r10
1692	movq	24(%rdi),%r11
1693	movq	32(%rdi),%r12
1694	movq	40(%rdi),%r13
1695	movq	48(%rdi),%r14
1696	movq	56(%rdi),%r15
1697	movq	%rax,(%rdx)
1698	leaq	64(%rdi),%rdi
1699
1700.byte	0x67
1701	movq	%rbx,%r8
1702	imulq	32+8(%rsp),%rbx
1703	movq	0(%rbp),%rax
1704	movl	$8,%ecx
1705	jmp	.L8x_reduce
1706
1707.align	32
1708.L8x_reduce:
1709	mulq	%rbx
1710	movq	8(%rbp),%rax
1711	negq	%r8
1712	movq	%rdx,%r8
1713	adcq	$0,%r8
1714
1715	mulq	%rbx
1716	addq	%rax,%r9
1717	movq	16(%rbp),%rax
1718	adcq	$0,%rdx
1719	addq	%r9,%r8
1720	movq	%rbx,48-8+8(%rsp,%rcx,8)
1721	movq	%rdx,%r9
1722	adcq	$0,%r9
1723
1724	mulq	%rbx
1725	addq	%rax,%r10
1726	movq	24(%rbp),%rax
1727	adcq	$0,%rdx
1728	addq	%r10,%r9
1729	movq	32+8(%rsp),%rsi
1730	movq	%rdx,%r10
1731	adcq	$0,%r10
1732
1733	mulq	%rbx
1734	addq	%rax,%r11
1735	movq	32(%rbp),%rax
1736	adcq	$0,%rdx
1737	imulq	%r8,%rsi
1738	addq	%r11,%r10
1739	movq	%rdx,%r11
1740	adcq	$0,%r11
1741
1742	mulq	%rbx
1743	addq	%rax,%r12
1744	movq	40(%rbp),%rax
1745	adcq	$0,%rdx
1746	addq	%r12,%r11
1747	movq	%rdx,%r12
1748	adcq	$0,%r12
1749
1750	mulq	%rbx
1751	addq	%rax,%r13
1752	movq	48(%rbp),%rax
1753	adcq	$0,%rdx
1754	addq	%r13,%r12
1755	movq	%rdx,%r13
1756	adcq	$0,%r13
1757
1758	mulq	%rbx
1759	addq	%rax,%r14
1760	movq	56(%rbp),%rax
1761	adcq	$0,%rdx
1762	addq	%r14,%r13
1763	movq	%rdx,%r14
1764	adcq	$0,%r14
1765
1766	mulq	%rbx
1767	movq	%rsi,%rbx
1768	addq	%rax,%r15
1769	movq	0(%rbp),%rax
1770	adcq	$0,%rdx
1771	addq	%r15,%r14
1772	movq	%rdx,%r15
1773	adcq	$0,%r15
1774
1775	decl	%ecx
1776	jnz	.L8x_reduce
1777
1778	leaq	64(%rbp),%rbp
1779	xorq	%rax,%rax
1780	movq	8+8(%rsp),%rdx
1781	cmpq	0+8(%rsp),%rbp
1782	jae	.L8x_no_tail
1783
1784.byte	0x66
1785	addq	0(%rdi),%r8
1786	adcq	8(%rdi),%r9
1787	adcq	16(%rdi),%r10
1788	adcq	24(%rdi),%r11
1789	adcq	32(%rdi),%r12
1790	adcq	40(%rdi),%r13
1791	adcq	48(%rdi),%r14
1792	adcq	56(%rdi),%r15
1793	sbbq	%rsi,%rsi
1794
1795	movq	48+56+8(%rsp),%rbx
1796	movl	$8,%ecx
1797	movq	0(%rbp),%rax
1798	jmp	.L8x_tail
1799
1800.align	32
1801.L8x_tail:
1802	mulq	%rbx
1803	addq	%rax,%r8
1804	movq	8(%rbp),%rax
1805	movq	%r8,(%rdi)
1806	movq	%rdx,%r8
1807	adcq	$0,%r8
1808
1809	mulq	%rbx
1810	addq	%rax,%r9
1811	movq	16(%rbp),%rax
1812	adcq	$0,%rdx
1813	addq	%r9,%r8
1814	leaq	8(%rdi),%rdi
1815	movq	%rdx,%r9
1816	adcq	$0,%r9
1817
1818	mulq	%rbx
1819	addq	%rax,%r10
1820	movq	24(%rbp),%rax
1821	adcq	$0,%rdx
1822	addq	%r10,%r9
1823	movq	%rdx,%r10
1824	adcq	$0,%r10
1825
1826	mulq	%rbx
1827	addq	%rax,%r11
1828	movq	32(%rbp),%rax
1829	adcq	$0,%rdx
1830	addq	%r11,%r10
1831	movq	%rdx,%r11
1832	adcq	$0,%r11
1833
1834	mulq	%rbx
1835	addq	%rax,%r12
1836	movq	40(%rbp),%rax
1837	adcq	$0,%rdx
1838	addq	%r12,%r11
1839	movq	%rdx,%r12
1840	adcq	$0,%r12
1841
1842	mulq	%rbx
1843	addq	%rax,%r13
1844	movq	48(%rbp),%rax
1845	adcq	$0,%rdx
1846	addq	%r13,%r12
1847	movq	%rdx,%r13
1848	adcq	$0,%r13
1849
1850	mulq	%rbx
1851	addq	%rax,%r14
1852	movq	56(%rbp),%rax
1853	adcq	$0,%rdx
1854	addq	%r14,%r13
1855	movq	%rdx,%r14
1856	adcq	$0,%r14
1857
1858	mulq	%rbx
1859	movq	48-16+8(%rsp,%rcx,8),%rbx
1860	addq	%rax,%r15
1861	adcq	$0,%rdx
1862	addq	%r15,%r14
1863	movq	0(%rbp),%rax
1864	movq	%rdx,%r15
1865	adcq	$0,%r15
1866
1867	decl	%ecx
1868	jnz	.L8x_tail
1869
1870	leaq	64(%rbp),%rbp
1871	movq	8+8(%rsp),%rdx
1872	cmpq	0+8(%rsp),%rbp
1873	jae	.L8x_tail_done
1874
1875	movq	48+56+8(%rsp),%rbx
1876	negq	%rsi
1877	movq	0(%rbp),%rax
1878	adcq	0(%rdi),%r8
1879	adcq	8(%rdi),%r9
1880	adcq	16(%rdi),%r10
1881	adcq	24(%rdi),%r11
1882	adcq	32(%rdi),%r12
1883	adcq	40(%rdi),%r13
1884	adcq	48(%rdi),%r14
1885	adcq	56(%rdi),%r15
1886	sbbq	%rsi,%rsi
1887
1888	movl	$8,%ecx
1889	jmp	.L8x_tail
1890
1891.align	32
1892.L8x_tail_done:
1893	addq	(%rdx),%r8
1894	adcq	$0,%r9
1895	adcq	$0,%r10
1896	adcq	$0,%r11
1897	adcq	$0,%r12
1898	adcq	$0,%r13
1899	adcq	$0,%r14
1900	adcq	$0,%r15
1901
1902
1903	xorq	%rax,%rax
1904
1905	negq	%rsi
1906.L8x_no_tail:
1907	adcq	0(%rdi),%r8
1908	adcq	8(%rdi),%r9
1909	adcq	16(%rdi),%r10
1910	adcq	24(%rdi),%r11
1911	adcq	32(%rdi),%r12
1912	adcq	40(%rdi),%r13
1913	adcq	48(%rdi),%r14
1914	adcq	56(%rdi),%r15
1915	adcq	$0,%rax
1916	movq	-8(%rbp),%rcx
1917	xorq	%rsi,%rsi
1918
1919.byte	102,72,15,126,213
1920
1921	movq	%r8,0(%rdi)
1922	movq	%r9,8(%rdi)
1923.byte	102,73,15,126,217
1924	movq	%r10,16(%rdi)
1925	movq	%r11,24(%rdi)
1926	movq	%r12,32(%rdi)
1927	movq	%r13,40(%rdi)
1928	movq	%r14,48(%rdi)
1929	movq	%r15,56(%rdi)
1930	leaq	64(%rdi),%rdi
1931
1932	cmpq	%rdx,%rdi
1933	jb	.L8x_reduction_loop
1934	.byte	0xf3,0xc3
1935.size	bn_sqr8x_internal,.-bn_sqr8x_internal
/*
 * __bn_post4x_internal: constant-time final reduction after a squaring/
 * multiplication pass.  Computes rp[] = tp[] - (n[] & mask), four limbs per
 * iteration, via tp + (~n & mask) + carry so the memory access pattern is
 * identical whether or not the subtraction takes effect.
 *
 * In:  %rbp = modulus n[], %rax = 0/1 top-carry style flag (negated below
 *      into an all-zeros/all-ones mask), %r9 = byte count (negative here:
 *      the loop counter %rcx = %r9>>5 is driven to zero with incq),
 *      %rdi = tp on entry, result pointer parked in %xmm1.
 * Out: %r10 = %r9 as received, %r9 negated.  Clobbers %rbx,%rcx,%r12-%r15.
 */
.type	__bn_post4x_internal,@function
.align	32
__bn_post4x_internal:
	movq	0(%rbp),%r12		/* n[0] */
	leaq	(%rdi,%r9,1),%rbx	/* %rbx = source (tp) cursor */
	movq	%r9,%rcx
.byte	102,72,15,126,207		/* movq %xmm1,%rdi: destination rp */
	negq	%rax			/* 0/1 -> 0/-1 subtraction mask */
.byte	102,72,15,126,206		/* movq %xmm1,%rsi */
	sarq	$3+2,%rcx		/* negative count of 4-limb groups */
	decq	%r12			/* so NOT below yields -n[0]: supplies the
					   +1 of two's complement for the first
					   group (no incoming borrow) */
	xorq	%r10,%r10		/* no borrow yet */
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqr4x_sub_entry

.align	16
.Lsqr4x_sub:
	movq	0(%rbp),%r12		/* next four modulus limbs */
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
.Lsqr4x_sub_entry:
	leaq	32(%rbp),%rbp
	notq	%r12			/* ~n & mask: 0 or ~n per the flag */
	notq	%r13
	notq	%r14
	notq	%r15
	andq	%rax,%r12
	andq	%rax,%r13
	andq	%rax,%r14
	andq	%rax,%r15

	negq	%r10			/* re-raise CF from previous group's borrow */
	adcq	0(%rbx),%r12		/* tp + (~n & mask) + carry == tp - (n & mask) */
	adcq	8(%rbx),%r13
	adcq	16(%rbx),%r14
	adcq	24(%rbx),%r15
	movq	%r12,0(%rdi)
	leaq	32(%rbx),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r10,%r10		/* capture borrow for next group */
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi

	incq	%rcx
	jnz	.Lsqr4x_sub

	movq	%r9,%r10		/* hand byte count back in %r10 ... */
	negq	%r9			/* ... and flip %r9's sign for the caller */
	.byte	0xf3,0xc3		/* rep ret */
.size	__bn_post4x_internal,.-__bn_post4x_internal
/*
 * bn_from_montgomery(rp, ap, bp, np, n0p, num): public entry for converting
 * a value out of Montgomery representation.  Only num divisible by 8 is
 * handled (tail call to bn_from_mont8x); otherwise returns 0 so the caller
 * can fall back to a generic path.
 */
.globl	bn_from_montgomery
.type	bn_from_montgomery,@function
.align	32
bn_from_montgomery:
	testl	$7,%r9d			/* num % 8 == 0 ? */
	jz	bn_from_mont8x		/* tail call, all argument regs intact */
	xorl	%eax,%eax		/* unsupported size: return 0 */
	.byte	0xf3,0xc3		/* rep ret */
.size	bn_from_montgomery,.-bn_from_montgomery
1999
/*
 * bn_from_mont8x: out-of-Montgomery conversion for num % 8 == 0 (reached
 * only by tail call from bn_from_montgomery).  Copies ap[] into a scratch
 * tp[] with a zeroed upper half, then runs one full Montgomery reduction
 * over it (MULX/ADX flavour when the CPU supports it), and finally wipes
 * the scratch area.
 *
 * In:  %rdi = rp, %rsi = ap, %rcx = np, %r8 = &n0 (dereferenced below),
 *      %r9d = num in limbs (multiple of 8).  Returns 1 in %rax.
 */
.type	bn_from_mont8x,@function
.align	32
bn_from_mont8x:
.byte	0x67
	movq	%rsp,%rax		/* remember caller's %rsp */
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
.Lfrom_prologue:

	shll	$3,%r9d			/* num in bytes */
	leaq	(%r9,%r9,2),%r10	/* %r10 = 3*num bytes */
	negq	%r9
	movq	(%r8),%r8		/* load n0 value */

	/*
	 * Carve a 2*num+320-byte scratch frame below %rsp, biased so that
	 * the frame does not share 4K-page offsets with rp (avoids
	 * page/cache-bank aliasing between tp and the output).
	 */




	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lfrom_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lfrom_sp_done

.align	32
.Lfrom_sp_alt:
	/* desired offset would not fit; clamp the adjustment to >= 0 */
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lfrom_sp_done:
	andq	$-64,%rbp		/* 64-byte align the new stack */
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lfrom_page_walk
	jmp	.Lfrom_page_walk_done

.Lfrom_page_walk:
	/* touch each 4K page on the way down so guard pages can grow the stack */
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lfrom_page_walk
.Lfrom_page_walk_done:

	movq	%r9,%r10		/* %r10 = -num*8 */
	negq	%r9			/* %r9 = +num*8 */









	movq	%r8,32(%rsp)		/* n0 */
	movq	%rax,40(%rsp)		/* caller's %rsp */
.Lfrom_body:
	movq	%r9,%r11
	leaq	48(%rsp),%rax		/* tp */
	pxor	%xmm0,%xmm0
	jmp	.Lmul_by_1

.align	32
.Lmul_by_1:
	/* tp[0..num-1] = ap[], tp[num..2*num-1] = 0; 64 bytes per iteration */
	movdqu	(%rsi),%xmm1
	movdqu	16(%rsi),%xmm2
	movdqu	32(%rsi),%xmm3
	movdqa	%xmm0,(%rax,%r9,1)	/* zero the mirrored upper half */
	movdqu	48(%rsi),%xmm4
	movdqa	%xmm0,16(%rax,%r9,1)
.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00	/* leaq 64(%rsi),%rsi */
	movdqa	%xmm1,(%rax)
	movdqa	%xmm0,32(%rax,%r9,1)
	movdqa	%xmm2,16(%rax)
	movdqa	%xmm0,48(%rax,%r9,1)
	movdqa	%xmm3,32(%rax)
	movdqa	%xmm4,48(%rax)
	leaq	64(%rax),%rax
	subq	$64,%r11
	jnz	.Lmul_by_1

.byte	102,72,15,110,207		/* movq %rdi,%xmm1: park rp */
.byte	102,72,15,110,209		/* movq %rcx,%xmm2: park np */
.byte	0x67
	movq	%rcx,%rbp
.byte	102,73,15,110,218		/* movq %r10,%xmm3: park -num*8 */
	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
	andl	$0x80108,%r11d		/* MULX/ADX capability bits */
	cmpl	$0x80108,%r11d		/* need all of them for the x-path */
	jne	.Lfrom_mont_nox

	leaq	(%rax,%r9,1),%rdi
	call	__bn_sqrx8x_reduction
	call	__bn_postx4x_internal

	pxor	%xmm0,%xmm0
	leaq	48(%rsp),%rax
	movq	40(%rsp),%rsi
	jmp	.Lfrom_mont_zero

.align	32
.Lfrom_mont_nox:
	call	__bn_sqr8x_reduction
	call	__bn_post4x_internal

	pxor	%xmm0,%xmm0
	leaq	48(%rsp),%rax
	movq	40(%rsp),%rsi
	jmp	.Lfrom_mont_zero

.align	32
.Lfrom_mont_zero:
	/* wipe the whole 2*num-limb scratch area (64 bytes per iteration) */
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm0,16(%rax)
	movdqa	%xmm0,32(%rax)
	movdqa	%xmm0,48(%rax)
	leaq	64(%rax),%rax
	subq	$32,%r9
	jnz	.Lfrom_mont_zero

	movq	$1,%rax			/* return 1 */
	movq	-48(%rsi),%r15		/* restore callee-saved regs from the
					   frame below the caller's %rsp */
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lfrom_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
.size	bn_from_mont8x,.-bn_from_mont8x
/*
 * bn_mulx4x_mont_gather5(rp, ap, bp, np, n0p, num, power): MULX/ADX
 * Montgomery multiplication front end.  Sets up the aliasing-avoiding
 * scratch frame (same scheme as bn_from_mont8x) and delegates the real
 * work to mulx4x_internal.  Returns 1 in %rax.
 */
.type	bn_mulx4x_mont_gather5,@function
.align	32
bn_mulx4x_mont_gather5:
	movq	%rsp,%rax		/* remember caller's %rsp */
.Lmulx4x_enter:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
.Lmulx4x_prologue:

	shll	$3,%r9d			/* num in bytes */
	leaq	(%r9,%r9,2),%r10	/* 3*num */
	negq	%r9			/* -num*8 */
	movq	(%r8),%r8		/* n0 value */

	/*
	 * Pick a 2*num+320-byte frame whose page offset avoids colliding
	 * with rp modulo 4096 (cache/page aliasing countermeasure).
	 */







	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lmulx4xsp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lmulx4xsp_done

.Lmulx4xsp_alt:
	/* clamp the adjustment when the preferred offset does not fit */
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lmulx4xsp_done:
	andq	$-64,%rbp		/* 64-byte align */
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
	jmp	.Lmulx4x_page_walk_done

.Lmulx4x_page_walk:
	/* probe each page so the stack can grow past guard pages */
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmulx4x_page_walk
.Lmulx4x_page_walk_done:












	movq	%r8,32(%rsp)		/* n0, read by mulx4x_internal */
	movq	%rax,40(%rsp)		/* caller's %rsp (holds the stack args) */
.Lmulx4x_body:
	call	mulx4x_internal

	movq	40(%rsp),%rsi
	movq	$1,%rax			/* return 1 */

	movq	-48(%rsi),%r15		/* restore callee-saved registers */
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lmulx4x_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
.size	bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
2242
/*
 * mulx4x_internal: MULX/ADX Montgomery multiplication core with a
 * cache-timing-safe gather of b[] from a 32-entry interleaved table.
 *
 * In:  %rsi = ap, %rdx = bp table base, %rcx = np, %rdi = rp,
 *      %r9 = -num*8 (negative byte count), %rax = caller's original %rsp
 *      (so 8(%rax) is the "power"/gather-index stack argument),
 *      32+8(%rsp) = n0 (the caller stored it at 32(%rsp); +8 is the
 *      return address pushed by the call).
 *
 * Two independent carry chains run in parallel: adcx consumes/produces CF,
 * adox consumes/produces OF; "xorq %rbp,%rbp" clears both and %rbp stays 0
 * as the adc/adox zero source.  Exits by jumping into .Lsqrx4x_sub_entry
 * (the conditional-subtraction loop defined elsewhere in this file).
 */
.type	mulx4x_internal,@function
.align	32
mulx4x_internal:
	movq	%r9,8(%rsp)		/* save -num*8 */
	movq	%r9,%r10
	negq	%r9			/* +num*8 */
	shlq	$5,%r9			/* num*256 = total table size */
	negq	%r10			/* +num*8 */
	leaq	128(%rdx,%r9,1),%r13	/* end of the bp table (+128 bias) */
	shrq	$5+5,%r9		/* num/4 */
	movd	8(%rax),%xmm5		/* xmm5 = gather index (stack argument) */
	subq	$1,%r9			/* outer-loop trip count */
	leaq	.Linc(%rip),%rax
	movq	%r13,16+8(%rsp)
	movq	%r9,24+8(%rsp)
	movq	%rdi,56+8(%rsp)		/* save rp for the final phase */
	movdqa	0(%rax),%xmm0		/* .Linc: initial counter lanes */
	movdqa	16(%rax),%xmm1		/* per-step increment */
	leaq	88-112(%rsp,%r10,1),%r10	/* mask scatter area (biased -112) */
	leaq	128(%rdx),%rdi		/* %rdi walks the bp table */

	/*
	 * Build 16 16-byte compare masks (a one-hot selector over the 32
	 * interleaved table columns) by comparing an incrementing counter
	 * against the broadcast index in xmm5.  Data-independent addressing:
	 * the secret index only ever feeds pcmpeqd.
	 */
	pshufd	$0,%xmm5,%xmm5
	movdqa	%xmm1,%xmm4
.byte	0x67
	movdqa	%xmm1,%xmm2
.byte	0x67
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,112(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,128(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,144(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,160(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,176(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,192(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,208(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,224(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,240(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,256(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,272(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,288(%r10)
	movdqa	%xmm4,%xmm3
.byte	0x67
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,304(%r10)

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,320(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,336(%r10)

	/*
	 * First gather of b[0]: AND every table column with its mask lane
	 * and OR everything down to a single 64-bit value.  All 32 columns
	 * are always read, so the access pattern is index-independent.
	 */
	pand	64(%rdi),%xmm0
	pand	80(%rdi),%xmm1
	pand	96(%rdi),%xmm2
	movdqa	%xmm3,352(%r10)
	pand	112(%rdi),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-128(%rdi),%xmm4
	movdqa	-112(%rdi),%xmm5
	movdqa	-96(%rdi),%xmm2
	pand	112(%r10),%xmm4
	movdqa	-80(%rdi),%xmm3
	pand	128(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	144(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	160(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-64(%rdi),%xmm4
	movdqa	-48(%rdi),%xmm5
	movdqa	-32(%rdi),%xmm2
	pand	176(%r10),%xmm4
	movdqa	-16(%rdi),%xmm3
	pand	192(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	208(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	224(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	0(%rdi),%xmm4
	movdqa	16(%rdi),%xmm5
	movdqa	32(%rdi),%xmm2
	pand	240(%r10),%xmm4
	movdqa	48(%rdi),%xmm3
	pand	256(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	272(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	288(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	pxor	%xmm1,%xmm0		/* lanes are disjoint, so xor == or */
	pshufd	$0x4e,%xmm0,%xmm1	/* fold high qword onto low */
	por	%xmm1,%xmm0
	leaq	256(%rdi),%rdi
.byte	102,72,15,126,194		/* movq %xmm0,%rdx: %rdx = b[0] */
	leaq	64+32+8(%rsp),%rbx	/* %rbx = tp cursor */

	/* first 4-limb column: a[0..3]*b[0], then fold in m*n[0..3] */
	movq	%rdx,%r9		/* keep b[0] across the mulx of m */
	mulxq	0(%rsi),%r8,%rax
	mulxq	8(%rsi),%r11,%r12
	addq	%rax,%r11
	mulxq	16(%rsi),%rax,%r13
	adcq	%rax,%r12
	adcq	$0,%r13
	mulxq	24(%rsi),%rax,%r14

	movq	%r8,%r15
	imulq	32+8(%rsp),%r8		/* m = lo * n0 mod 2^64 */
	xorq	%rbp,%rbp		/* clear CF+OF; %rbp stays 0 */
	movq	%r8,%rdx

	movq	%rdi,8+8(%rsp)		/* save gather pointer */

	leaq	32(%rsi),%rsi
	adcxq	%rax,%r13
	adcxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10	/* + m*n[0..3]; kills the low limb */
	adcxq	%rax,%r15
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	movq	24+8(%rsp),%rdi		/* inner trip count */
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx		/* back to multiplying by b[0] */
	movq	%r11,-24(%rbx)
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r12,-16(%rbx)
	jmp	.Lmulx4x_1st

.align	32
.Lmulx4x_1st:
	/* first outer pass: tp[] = a[]*b[0] + m*n[], 4 limbs per iteration */
	adcxq	%rbp,%r15
	mulxq	0(%rsi),%r10,%rax
	adcxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
.byte	0x67,0x67
	movq	%r8,%rdx		/* switch multiplier to m */
	adcxq	%rax,%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-32(%rbx)
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx		/* back to b[0] */
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_1st

	movq	8(%rsp),%rax		/* -num*8: rewind ap below */
	adcq	%rbp,%r15
	leaq	(%rsi,%rax,1),%rsi
	addq	%r15,%r14
	movq	8+8(%rsp),%rdi		/* gather pointer */
	adcq	%rbp,%rbp		/* top carry */
	movq	%r14,-8(%rbx)
	jmp	.Lmulx4x_outer

.align	32
.Lmulx4x_outer:
	/* gather the next b[i] (masks were scattered relative to tp) */
	leaq	16-256(%rbx),%r10
	pxor	%xmm4,%xmm4
.byte	0x67,0x67
	pxor	%xmm5,%xmm5
	movdqa	-128(%rdi),%xmm0
	movdqa	-112(%rdi),%xmm1
	movdqa	-96(%rdi),%xmm2
	pand	256(%r10),%xmm0
	movdqa	-80(%rdi),%xmm3
	pand	272(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	288(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	304(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%rdi),%xmm0
	movdqa	-48(%rdi),%xmm1
	movdqa	-32(%rdi),%xmm2
	pand	320(%r10),%xmm0
	movdqa	-16(%rdi),%xmm3
	pand	336(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	352(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	368(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%rdi),%xmm0
	movdqa	16(%rdi),%xmm1
	movdqa	32(%rdi),%xmm2
	pand	384(%r10),%xmm0
	movdqa	48(%rdi),%xmm3
	pand	400(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	416(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	432(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%rdi),%xmm0
	movdqa	80(%rdi),%xmm1
	movdqa	96(%rdi),%xmm2
	pand	448(%r10),%xmm0
	movdqa	112(%rdi),%xmm3
	pand	464(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	480(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	496(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	pshufd	$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	leaq	256(%rdi),%rdi
.byte	102,72,15,126,194		/* movq %xmm0,%rdx: next b[i] */

	movq	%rbp,(%rbx)		/* store the running top carry */
	leaq	32(%rbx,%rax,1),%rbx	/* rewind tp cursor */
	mulxq	0(%rsi),%r8,%r11
	xorq	%rbp,%rbp		/* clear CF+OF; %rbp stays 0 */
	movq	%rdx,%r9
	mulxq	8(%rsi),%r14,%r12
	adoxq	-32(%rbx),%r8		/* accumulate previous tp[] */
	adcxq	%r14,%r11
	mulxq	16(%rsi),%r15,%r13
	adoxq	-24(%rbx),%r11
	adcxq	%r15,%r12
	mulxq	24(%rsi),%rdx,%r14
	adoxq	-16(%rbx),%r12
	adcxq	%rdx,%r13
	leaq	(%rcx,%rax,1),%rcx	/* rewind np */
	leaq	32(%rsi),%rsi
	adoxq	-8(%rbx),%r13
	adcxq	%rbp,%r14
	adoxq	%rbp,%r14

	movq	%r8,%r15
	imulq	32+8(%rsp),%r8		/* m = lo * n0 */

	movq	%r8,%rdx
	xorq	%rbp,%rbp		/* fresh carry chains for the reduce */
	movq	%rdi,8+8(%rsp)

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	24+8(%rsp),%rdi		/* inner trip count */
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-24(%rbx)
	adoxq	%rbp,%r15
	movq	%r12,-16(%rbx)
	leaq	32(%rcx),%rcx
	jmp	.Lmulx4x_inner

.align	32
.Lmulx4x_inner:
	/* steady state: tp[] += a[]*b[i] + m*n[], 4 limbs per iteration */
	mulxq	0(%rsi),%r10,%rax
	adcxq	%rbp,%r15
	adoxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	0(%rbx),%r10
	adoxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	8(%rbx),%r11
	adoxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
	movq	%r8,%rdx		/* switch multiplier to m */
	adcxq	16(%rbx),%r12
	adoxq	%rax,%r13
	adcxq	24(%rbx),%r13
	adoxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx
	adcxq	%rbp,%r14

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	adoxq	%r15,%r13
	movq	%r11,-32(%rbx)
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx		/* back to b[i] */
	leaq	32(%rcx),%rcx
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_inner

	movq	0+8(%rsp),%rax		/* -num*8 */
	adcq	%rbp,%r15
	subq	0(%rbx),%rdi		/* NOTE(review): only the borrow is used;
					   %rdi is reloaded on the next line */
	movq	8+8(%rsp),%rdi		/* gather pointer */
	movq	16+8(%rsp),%r10		/* table end sentinel */
	adcq	%r15,%r14
	leaq	(%rsi,%rax,1),%rsi	/* rewind ap */
	adcq	%rbp,%rbp
	movq	%r14,-8(%rbx)

	cmpq	%r10,%rdi
	jb	.Lmulx4x_outer

	/* done: decide whether tp >= n, then branch into the shared
	   constant-time conditional-subtraction loop */
	movq	-8(%rcx),%r10		/* top modulus limb */
	movq	%rbp,%r8		/* top carry */
	movq	(%rcx,%rax,1),%r12	/* n[0] */
	leaq	(%rcx,%rax,1),%rbp	/* rewind np */
	movq	%rax,%rcx
	leaq	(%rbx,%rax,1),%rdi	/* rewind tp */
	xorl	%eax,%eax
	xorq	%r15,%r15
	subq	%r14,%r10		/* compare top words */
	adcq	%r15,%r15
	orq	%r15,%r8
	sarq	$3+2,%rcx		/* negative 4-limb group count */
	subq	%r8,%rax		/* %rax = 0 or -1 subtraction mask */
	movq	56+8(%rsp),%rdx		/* restore rp */
	decq	%r12			/* pre-decrement n[0] for the NOT in
					   the subtraction loop (two's complement +1) */
	movq	8(%rbp),%r13
	xorq	%r8,%r8			/* no incoming borrow */
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqrx4x_sub_entry	/* defined elsewhere in this file */
.size	mulx4x_internal,.-mulx4x_internal
/*
 * bn_powerx5: MULX/ADX windowed-exponentiation step — five consecutive
 * Montgomery squarings (__bn_sqrx8x_internal + __bn_postx4x_internal)
 * followed by one Montgomery multiplication by a table entry gathered by
 * mulx4x_internal.  Frame setup mirrors bn_mulx4x_mont_gather5.
 * Returns 1 in %rax.
 */
.type	bn_powerx5,@function
.align	32
bn_powerx5:
	movq	%rsp,%rax		/* remember caller's %rsp */
.Lpowerx5_enter:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15
.Lpowerx5_prologue:

	shll	$3,%r9d			/* num in bytes */
	leaq	(%r9,%r9,2),%r10	/* 3*num */
	negq	%r9			/* -num*8 */
	movq	(%r8),%r8		/* n0 value */

	/*
	 * Frame placement avoiding 4K-page aliasing with rp, as in the
	 * other entry points.
	 */






	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lpwrx_sp_alt
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lpwrx_sp_done

.align	32
.Lpwrx_sp_alt:
	/* clamp adjustment when the preferred offset does not fit */
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lpwrx_sp_done:
	andq	$-64,%rbp		/* 64-byte align */
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lpwrx_page_walk
	jmp	.Lpwrx_page_walk_done

.Lpwrx_page_walk:
	/* probe pages downward so guard pages can grow the stack */
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lpwrx_page_walk
.Lpwrx_page_walk_done:

	movq	%r9,%r10		/* %r10 = -num*8 */
	negq	%r9			/* %r9 = +num*8 */











	pxor	%xmm0,%xmm0
.byte	102,72,15,110,207		/* movq %rdi,%xmm1: park rp */
.byte	102,72,15,110,209		/* movq %rcx,%xmm2: park np */
.byte	102,73,15,110,218		/* movq %r10,%xmm3: park -num*8 */
.byte	102,72,15,110,226		/* movq %rdx,%xmm4: park bp */
	movq	%r8,32(%rsp)		/* n0 for the callees */
	movq	%rax,40(%rsp)		/* caller's %rsp (stack args live above) */
.Lpowerx5_body:

	call	__bn_sqrx8x_internal	/* square #1 */
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal	/* square #2 */
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal	/* square #3 */
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal	/* square #4 */
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal	/* square #5 */
	call	__bn_postx4x_internal

	movq	%r10,%r9		/* -num*8, as mulx4x_internal expects */
	movq	%rsi,%rdi		/* squaring result feeds the multiply */
.byte	102,72,15,126,209		/* movq %xmm2,%rcx: restore np */
.byte	102,72,15,126,226		/* movq %xmm4,%rdx: restore bp */
	movq	40(%rsp),%rax		/* original %rsp: gather index at 8(%rax) */

	call	mulx4x_internal

	movq	40(%rsp),%rsi
	movq	$1,%rax			/* return 1 */

	movq	-48(%rsi),%r15		/* restore callee-saved registers */
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lpowerx5_epilogue:
	.byte	0xf3,0xc3		/* rep ret */
.size	bn_powerx5,.-bn_powerx5
2781
2782.globl	bn_sqrx8x_internal
2783.hidden	bn_sqrx8x_internal
2784.type	bn_sqrx8x_internal,@function
2785.align	32
2786bn_sqrx8x_internal:
2787__bn_sqrx8x_internal:
2788
2789
2790
2791
2792
2793
2794
2795
2796
2797
2798
2799
2800
2801
2802
2803
2804
2805
2806
2807
2808
2809
2810
2811
2812
2813
2814
2815
2816
2817
2818
2819
2820
2821
2822
2823
2824
2825
2826
2827
2828	leaq	48+8(%rsp),%rdi
2829	leaq	(%rsi,%r9,1),%rbp
2830	movq	%r9,0+8(%rsp)
2831	movq	%rbp,8+8(%rsp)
2832	jmp	.Lsqr8x_zero_start
2833
2834.align	32
2835.byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
2836.Lsqrx8x_zero:
2837.byte	0x3e
2838	movdqa	%xmm0,0(%rdi)
2839	movdqa	%xmm0,16(%rdi)
2840	movdqa	%xmm0,32(%rdi)
2841	movdqa	%xmm0,48(%rdi)
2842.Lsqr8x_zero_start:
2843	movdqa	%xmm0,64(%rdi)
2844	movdqa	%xmm0,80(%rdi)
2845	movdqa	%xmm0,96(%rdi)
2846	movdqa	%xmm0,112(%rdi)
2847	leaq	128(%rdi),%rdi
2848	subq	$64,%r9
2849	jnz	.Lsqrx8x_zero
2850
2851	movq	0(%rsi),%rdx
2852
2853	xorq	%r10,%r10
2854	xorq	%r11,%r11
2855	xorq	%r12,%r12
2856	xorq	%r13,%r13
2857	xorq	%r14,%r14
2858	xorq	%r15,%r15
2859	leaq	48+8(%rsp),%rdi
2860	xorq	%rbp,%rbp
2861	jmp	.Lsqrx8x_outer_loop
2862
2863.align	32
2864.Lsqrx8x_outer_loop:
2865	mulxq	8(%rsi),%r8,%rax
2866	adcxq	%r9,%r8
2867	adoxq	%rax,%r10
2868	mulxq	16(%rsi),%r9,%rax
2869	adcxq	%r10,%r9
2870	adoxq	%rax,%r11
2871.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
2872	adcxq	%r11,%r10
2873	adoxq	%rax,%r12
2874.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
2875	adcxq	%r12,%r11
2876	adoxq	%rax,%r13
2877	mulxq	40(%rsi),%r12,%rax
2878	adcxq	%r13,%r12
2879	adoxq	%rax,%r14
2880	mulxq	48(%rsi),%r13,%rax
2881	adcxq	%r14,%r13
2882	adoxq	%r15,%rax
2883	mulxq	56(%rsi),%r14,%r15
2884	movq	8(%rsi),%rdx
2885	adcxq	%rax,%r14
2886	adoxq	%rbp,%r15
2887	adcq	64(%rdi),%r15
2888	movq	%r8,8(%rdi)
2889	movq	%r9,16(%rdi)
2890	sbbq	%rcx,%rcx
2891	xorq	%rbp,%rbp
2892
2893
2894	mulxq	16(%rsi),%r8,%rbx
2895	mulxq	24(%rsi),%r9,%rax
2896	adcxq	%r10,%r8
2897	adoxq	%rbx,%r9
2898	mulxq	32(%rsi),%r10,%rbx
2899	adcxq	%r11,%r9
2900	adoxq	%rax,%r10
2901.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
2902	adcxq	%r12,%r10
2903	adoxq	%rbx,%r11
2904.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
2905	adcxq	%r13,%r11
2906	adoxq	%r14,%r12
2907.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
2908	movq	16(%rsi),%rdx
2909	adcxq	%rax,%r12
2910	adoxq	%rbx,%r13
2911	adcxq	%r15,%r13
2912	adoxq	%rbp,%r14
2913	adcxq	%rbp,%r14
2914
2915	movq	%r8,24(%rdi)
2916	movq	%r9,32(%rdi)
2917
2918	mulxq	24(%rsi),%r8,%rbx
2919	mulxq	32(%rsi),%r9,%rax
2920	adcxq	%r10,%r8
2921	adoxq	%rbx,%r9
2922	mulxq	40(%rsi),%r10,%rbx
2923	adcxq	%r11,%r9
2924	adoxq	%rax,%r10
2925.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
2926	adcxq	%r12,%r10
2927	adoxq	%r13,%r11
2928.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
2929.byte	0x3e
2930	movq	24(%rsi),%rdx
2931	adcxq	%rbx,%r11
2932	adoxq	%rax,%r12
2933	adcxq	%r14,%r12
2934	movq	%r8,40(%rdi)
2935	movq	%r9,48(%rdi)
2936	mulxq	32(%rsi),%r8,%rax
2937	adoxq	%rbp,%r13
2938	adcxq	%rbp,%r13
2939
2940	mulxq	40(%rsi),%r9,%rbx
2941	adcxq	%r10,%r8
2942	adoxq	%rax,%r9
2943	mulxq	48(%rsi),%r10,%rax
2944	adcxq	%r11,%r9
2945	adoxq	%r12,%r10
2946	mulxq	56(%rsi),%r11,%r12
2947	movq	32(%rsi),%rdx
2948	movq	40(%rsi),%r14
2949	adcxq	%rbx,%r10
2950	adoxq	%rax,%r11
2951	movq	48(%rsi),%r15
2952	adcxq	%r13,%r11
2953	adoxq	%rbp,%r12
2954	adcxq	%rbp,%r12
2955
2956	movq	%r8,56(%rdi)
2957	movq	%r9,64(%rdi)
2958
2959	mulxq	%r14,%r9,%rax
2960	movq	56(%rsi),%r8
2961	adcxq	%r10,%r9
2962	mulxq	%r15,%r10,%rbx
2963	adoxq	%rax,%r10
2964	adcxq	%r11,%r10
2965	mulxq	%r8,%r11,%rax
2966	movq	%r14,%rdx
2967	adoxq	%rbx,%r11
2968	adcxq	%r12,%r11
2969
2970	adcxq	%rbp,%rax
2971
2972	mulxq	%r15,%r14,%rbx
2973	mulxq	%r8,%r12,%r13
2974	movq	%r15,%rdx
2975	leaq	64(%rsi),%rsi
2976	adcxq	%r14,%r11
2977	adoxq	%rbx,%r12
2978	adcxq	%rax,%r12
2979	adoxq	%rbp,%r13
2980
2981.byte	0x67,0x67
2982	mulxq	%r8,%r8,%r14
2983	adcxq	%r8,%r13
2984	adcxq	%rbp,%r14
2985
2986	cmpq	8+8(%rsp),%rsi
2987	je	.Lsqrx8x_outer_break
2988
2989	negq	%rcx
2990	movq	$-8,%rcx
2991	movq	%rbp,%r15
2992	movq	64(%rdi),%r8
2993	adcxq	72(%rdi),%r9
2994	adcxq	80(%rdi),%r10
2995	adcxq	88(%rdi),%r11
2996	adcq	96(%rdi),%r12
2997	adcq	104(%rdi),%r13
2998	adcq	112(%rdi),%r14
2999	adcq	120(%rdi),%r15
3000	leaq	(%rsi),%rbp
3001	leaq	128(%rdi),%rdi
3002	sbbq	%rax,%rax
3003
3004	movq	-64(%rsi),%rdx
3005	movq	%rax,16+8(%rsp)
3006	movq	%rdi,24+8(%rsp)
3007
3008
3009	xorl	%eax,%eax
3010	jmp	.Lsqrx8x_loop
3011
3012.align	32
3013.Lsqrx8x_loop:
3014	movq	%r8,%rbx
3015	mulxq	0(%rbp),%rax,%r8
3016	adcxq	%rax,%rbx
3017	adoxq	%r9,%r8
3018
3019	mulxq	8(%rbp),%rax,%r9
3020	adcxq	%rax,%r8
3021	adoxq	%r10,%r9
3022
3023	mulxq	16(%rbp),%rax,%r10
3024	adcxq	%rax,%r9
3025	adoxq	%r11,%r10
3026
3027	mulxq	24(%rbp),%rax,%r11
3028	adcxq	%rax,%r10
3029	adoxq	%r12,%r11
3030
3031.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
3032	adcxq	%rax,%r11
3033	adoxq	%r13,%r12
3034
3035	mulxq	40(%rbp),%rax,%r13
3036	adcxq	%rax,%r12
3037	adoxq	%r14,%r13
3038
3039	mulxq	48(%rbp),%rax,%r14
3040	movq	%rbx,(%rdi,%rcx,8)
3041	movl	$0,%ebx
3042	adcxq	%rax,%r13
3043	adoxq	%r15,%r14
3044
3045.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
3046	movq	8(%rsi,%rcx,8),%rdx
3047	adcxq	%rax,%r14
3048	adoxq	%rbx,%r15
3049	adcxq	%rbx,%r15
3050
3051.byte	0x67
3052	incq	%rcx
3053	jnz	.Lsqrx8x_loop
3054
3055	leaq	64(%rbp),%rbp
3056	movq	$-8,%rcx
3057	cmpq	8+8(%rsp),%rbp
3058	je	.Lsqrx8x_break
3059
3060	subq	16+8(%rsp),%rbx
3061.byte	0x66
3062	movq	-64(%rsi),%rdx
3063	adcxq	0(%rdi),%r8
3064	adcxq	8(%rdi),%r9
3065	adcq	16(%rdi),%r10
3066	adcq	24(%rdi),%r11
3067	adcq	32(%rdi),%r12
3068	adcq	40(%rdi),%r13
3069	adcq	48(%rdi),%r14
3070	adcq	56(%rdi),%r15
3071	leaq	64(%rdi),%rdi
3072.byte	0x67
3073	sbbq	%rax,%rax
3074	xorl	%ebx,%ebx
3075	movq	%rax,16+8(%rsp)
3076	jmp	.Lsqrx8x_loop
3077
3078.align	32
3079.Lsqrx8x_break:
3080	subq	16+8(%rsp),%r8
3081	movq	24+8(%rsp),%rcx
3082	movq	0(%rsi),%rdx
3083	xorl	%ebp,%ebp
3084	movq	%r8,0(%rdi)
3085	cmpq	%rcx,%rdi
3086	je	.Lsqrx8x_outer_loop
3087
3088	movq	%r9,8(%rdi)
3089	movq	8(%rcx),%r9
3090	movq	%r10,16(%rdi)
3091	movq	16(%rcx),%r10
3092	movq	%r11,24(%rdi)
3093	movq	24(%rcx),%r11
3094	movq	%r12,32(%rdi)
3095	movq	32(%rcx),%r12
3096	movq	%r13,40(%rdi)
3097	movq	40(%rcx),%r13
3098	movq	%r14,48(%rdi)
3099	movq	48(%rcx),%r14
3100	movq	%r15,56(%rdi)
3101	movq	56(%rcx),%r15
3102	movq	%rcx,%rdi
3103	jmp	.Lsqrx8x_outer_loop
3104
3105.align	32
3106.Lsqrx8x_outer_break:
3107	movq	%r9,72(%rdi)
3108.byte	102,72,15,126,217
3109	movq	%r10,80(%rdi)
3110	movq	%r11,88(%rdi)
3111	movq	%r12,96(%rdi)
3112	movq	%r13,104(%rdi)
3113	movq	%r14,112(%rdi)
3114	leaq	48+8(%rsp),%rdi
3115	movq	(%rsi,%rcx,1),%rdx
3116
3117	movq	8(%rdi),%r11
3118	xorq	%r10,%r10
3119	movq	0+8(%rsp),%r9
3120	adoxq	%r11,%r11
3121	movq	16(%rdi),%r12
3122	movq	24(%rdi),%r13
3123
3124
3125.align	32
3126.Lsqrx4x_shift_n_add:
3127	mulxq	%rdx,%rax,%rbx
3128	adoxq	%r12,%r12
3129	adcxq	%r10,%rax
3130.byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
3131.byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
3132	adoxq	%r13,%r13
3133	adcxq	%r11,%rbx
3134	movq	40(%rdi),%r11
3135	movq	%rax,0(%rdi)
3136	movq	%rbx,8(%rdi)
3137
3138	mulxq	%rdx,%rax,%rbx
3139	adoxq	%r10,%r10
3140	adcxq	%r12,%rax
3141	movq	16(%rsi,%rcx,1),%rdx
3142	movq	48(%rdi),%r12
3143	adoxq	%r11,%r11
3144	adcxq	%r13,%rbx
3145	movq	56(%rdi),%r13
3146	movq	%rax,16(%rdi)
3147	movq	%rbx,24(%rdi)
3148
3149	mulxq	%rdx,%rax,%rbx
3150	adoxq	%r12,%r12
3151	adcxq	%r10,%rax
3152	movq	24(%rsi,%rcx,1),%rdx
3153	leaq	32(%rcx),%rcx
3154	movq	64(%rdi),%r10
3155	adoxq	%r13,%r13
3156	adcxq	%r11,%rbx
3157	movq	72(%rdi),%r11
3158	movq	%rax,32(%rdi)
3159	movq	%rbx,40(%rdi)
3160
3161	mulxq	%rdx,%rax,%rbx
3162	adoxq	%r10,%r10
3163	adcxq	%r12,%rax
3164	jrcxz	.Lsqrx4x_shift_n_add_break
3165.byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
3166	adoxq	%r11,%r11
3167	adcxq	%r13,%rbx
3168	movq	80(%rdi),%r12
3169	movq	88(%rdi),%r13
3170	movq	%rax,48(%rdi)
3171	movq	%rbx,56(%rdi)
3172	leaq	64(%rdi),%rdi
3173	nop
3174	jmp	.Lsqrx4x_shift_n_add
3175
3176.align	32
3177.Lsqrx4x_shift_n_add_break:
3178	adcxq	%r13,%rbx
3179	movq	%rax,48(%rdi)
3180	movq	%rbx,56(%rdi)
3181	leaq	64(%rdi),%rdi
3182.byte	102,72,15,126,213
3183__bn_sqrx8x_reduction:
3184	xorl	%eax,%eax
3185	movq	32+8(%rsp),%rbx
3186	movq	48+8(%rsp),%rdx
3187	leaq	-64(%rbp,%r9,1),%rcx
3188
3189	movq	%rcx,0+8(%rsp)
3190	movq	%rdi,8+8(%rsp)
3191
3192	leaq	48+8(%rsp),%rdi
3193	jmp	.Lsqrx8x_reduction_loop
3194
3195.align	32
3196.Lsqrx8x_reduction_loop:
3197	movq	8(%rdi),%r9
3198	movq	16(%rdi),%r10
3199	movq	24(%rdi),%r11
3200	movq	32(%rdi),%r12
3201	movq	%rdx,%r8
3202	imulq	%rbx,%rdx
3203	movq	40(%rdi),%r13
3204	movq	48(%rdi),%r14
3205	movq	56(%rdi),%r15
3206	movq	%rax,24+8(%rsp)
3207
3208	leaq	64(%rdi),%rdi
3209	xorq	%rsi,%rsi
3210	movq	$-8,%rcx
3211	jmp	.Lsqrx8x_reduce
3212
3213.align	32
3214.Lsqrx8x_reduce:
3215	movq	%r8,%rbx
3216	mulxq	0(%rbp),%rax,%r8
3217	adcxq	%rbx,%rax
3218	adoxq	%r9,%r8
3219
3220	mulxq	8(%rbp),%rbx,%r9
3221	adcxq	%rbx,%r8
3222	adoxq	%r10,%r9
3223
3224	mulxq	16(%rbp),%rbx,%r10
3225	adcxq	%rbx,%r9
3226	adoxq	%r11,%r10
3227
3228	mulxq	24(%rbp),%rbx,%r11
3229	adcxq	%rbx,%r10
3230	adoxq	%r12,%r11
3231
3232.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
3233	movq	%rdx,%rax
3234	movq	%r8,%rdx
3235	adcxq	%rbx,%r11
3236	adoxq	%r13,%r12
3237
3238	mulxq	32+8(%rsp),%rbx,%rdx
3239	movq	%rax,%rdx
3240	movq	%rax,64+48+8(%rsp,%rcx,8)
3241
3242	mulxq	40(%rbp),%rax,%r13
3243	adcxq	%rax,%r12
3244	adoxq	%r14,%r13
3245
3246	mulxq	48(%rbp),%rax,%r14
3247	adcxq	%rax,%r13
3248	adoxq	%r15,%r14
3249
3250	mulxq	56(%rbp),%rax,%r15
3251	movq	%rbx,%rdx
3252	adcxq	%rax,%r14
3253	adoxq	%rsi,%r15
3254	adcxq	%rsi,%r15
3255
3256.byte	0x67,0x67,0x67
3257	incq	%rcx
3258	jnz	.Lsqrx8x_reduce
3259
3260	movq	%rsi,%rax
3261	cmpq	0+8(%rsp),%rbp
3262	jae	.Lsqrx8x_no_tail
3263
3264	movq	48+8(%rsp),%rdx
3265	addq	0(%rdi),%r8
3266	leaq	64(%rbp),%rbp
3267	movq	$-8,%rcx
3268	adcxq	8(%rdi),%r9
3269	adcxq	16(%rdi),%r10
3270	adcq	24(%rdi),%r11
3271	adcq	32(%rdi),%r12
3272	adcq	40(%rdi),%r13
3273	adcq	48(%rdi),%r14
3274	adcq	56(%rdi),%r15
3275	leaq	64(%rdi),%rdi
3276	sbbq	%rax,%rax
3277
3278	xorq	%rsi,%rsi
3279	movq	%rax,16+8(%rsp)
3280	jmp	.Lsqrx8x_tail
3281
3282.align	32
3283.Lsqrx8x_tail:
3284	movq	%r8,%rbx
3285	mulxq	0(%rbp),%rax,%r8
3286	adcxq	%rax,%rbx
3287	adoxq	%r9,%r8
3288
3289	mulxq	8(%rbp),%rax,%r9
3290	adcxq	%rax,%r8
3291	adoxq	%r10,%r9
3292
3293	mulxq	16(%rbp),%rax,%r10
3294	adcxq	%rax,%r9
3295	adoxq	%r11,%r10
3296
3297	mulxq	24(%rbp),%rax,%r11
3298	adcxq	%rax,%r10
3299	adoxq	%r12,%r11
3300
3301.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
3302	adcxq	%rax,%r11
3303	adoxq	%r13,%r12
3304
3305	mulxq	40(%rbp),%rax,%r13
3306	adcxq	%rax,%r12
3307	adoxq	%r14,%r13
3308
3309	mulxq	48(%rbp),%rax,%r14
3310	adcxq	%rax,%r13
3311	adoxq	%r15,%r14
3312
3313	mulxq	56(%rbp),%rax,%r15
3314	movq	72+48+8(%rsp,%rcx,8),%rdx
3315	adcxq	%rax,%r14
3316	adoxq	%rsi,%r15
3317	movq	%rbx,(%rdi,%rcx,8)
3318	movq	%r8,%rbx
3319	adcxq	%rsi,%r15
3320
3321	incq	%rcx
3322	jnz	.Lsqrx8x_tail
3323
3324	cmpq	0+8(%rsp),%rbp
3325	jae	.Lsqrx8x_tail_done
3326
3327	subq	16+8(%rsp),%rsi
3328	movq	48+8(%rsp),%rdx
3329	leaq	64(%rbp),%rbp
3330	adcq	0(%rdi),%r8
3331	adcq	8(%rdi),%r9
3332	adcq	16(%rdi),%r10
3333	adcq	24(%rdi),%r11
3334	adcq	32(%rdi),%r12
3335	adcq	40(%rdi),%r13
3336	adcq	48(%rdi),%r14
3337	adcq	56(%rdi),%r15
3338	leaq	64(%rdi),%rdi
3339	sbbq	%rax,%rax
3340	subq	$8,%rcx
3341
3342	xorq	%rsi,%rsi
3343	movq	%rax,16+8(%rsp)
3344	jmp	.Lsqrx8x_tail
3345
3346.align	32
3347.Lsqrx8x_tail_done:
3348	addq	24+8(%rsp),%r8
3349	adcq	$0,%r9
3350	adcq	$0,%r10
3351	adcq	$0,%r11
3352	adcq	$0,%r12
3353	adcq	$0,%r13
3354	adcq	$0,%r14
3355	adcq	$0,%r15
3356
3357
3358	movq	%rsi,%rax
3359
3360	subq	16+8(%rsp),%rsi
3361.Lsqrx8x_no_tail:
3362	adcq	0(%rdi),%r8
3363.byte	102,72,15,126,217
3364	adcq	8(%rdi),%r9
3365	movq	56(%rbp),%rsi
3366.byte	102,72,15,126,213
3367	adcq	16(%rdi),%r10
3368	adcq	24(%rdi),%r11
3369	adcq	32(%rdi),%r12
3370	adcq	40(%rdi),%r13
3371	adcq	48(%rdi),%r14
3372	adcq	56(%rdi),%r15
3373	adcq	%rax,%rax
3374
3375	movq	32+8(%rsp),%rbx
3376	movq	64(%rdi,%rcx,1),%rdx
3377
3378	movq	%r8,0(%rdi)
3379	leaq	64(%rdi),%r8
3380	movq	%r9,8(%rdi)
3381	movq	%r10,16(%rdi)
3382	movq	%r11,24(%rdi)
3383	movq	%r12,32(%rdi)
3384	movq	%r13,40(%rdi)
3385	movq	%r14,48(%rdi)
3386	movq	%r15,56(%rdi)
3387
3388	leaq	64(%rdi,%rcx,1),%rdi
3389	cmpq	8+8(%rsp),%r8
3390	jb	.Lsqrx8x_reduction_loop
3391	.byte	0xf3,0xc3
3392.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
/*
 * __bn_postx4x_internal
 *
 * Constant-time final fix-up shared by the MULX/ADX (x4x/x8x) Montgomery
 * paths: writes rp = tp - n when the preceding reduction overflowed,
 * rp = tp otherwise, selected by a mask rather than a branch.
 *
 * In (roles inferred from the surrounding generated code; confirm
 * against the perlasm source x86_64-mont5.pl):
 *   %rbp   modulus n (advanced 32 bytes per iteration)
 *   %rdi   temporary result tp
 *   %rcx   size value; sarq $3+2 below converts it to a (negative)
 *          count of 4-limb iterations so that incq/jnz terminates at 0
 *   %rax   borrow flag from the reduction; negq turns it into an
 *          all-ones (subtract) / all-zeros (copy) mask for andn
 *   %xmm1  destination pointer rp, recovered into %rdx/%rsi below
 * Clobbers: %rax, %rcx, %rdx, %rsi, %r8-%r15, %rbp, %rdi, flags.
 */
3393.align	32
3394__bn_postx4x_internal:
3395	movq	0(%rbp),%r12
3396	movq	%rcx,%r10
3397	movq	%rcx,%r9
3398	negq	%rax
3399	sarq	$3+2,%rcx
3400
/* movq %xmm1,%rdx ; movq %xmm1,%rsi -- recover rp spilled to xmm1 */
3401.byte	102,72,15,126,202
3402.byte	102,72,15,126,206
/* first limb uses -n[0]: dec then andn gives (~(n[0]-1))&mask =
   (-n[0])&mask, because the adc chain below starts with carry clear */
3403	decq	%r12
3404	movq	8(%rbp),%r13
3405	xorq	%r8,%r8
3406	movq	16(%rbp),%r14
3407	movq	24(%rbp),%r15
3408	jmp	.Lsqrx4x_sub_entry
3409
3410.align	16
3411.Lsqrx4x_sub:
/* load the next four limbs of the modulus */
3412	movq	0(%rbp),%r12
3413	movq	8(%rbp),%r13
3414	movq	16(%rbp),%r14
3415	movq	24(%rbp),%r15
3416.Lsqrx4x_sub_entry:
/* rK = ~n[i] & mask: with mask==-1 the adc chain computes tp - n,
   with mask==0 it just copies tp (BMI2 andn) */
3417	andnq	%rax,%r12,%r12
3418	leaq	32(%rbp),%rbp
3419	andnq	%rax,%r13,%r13
3420	andnq	%rax,%r14,%r14
3421	andnq	%rax,%r15,%r15
3422
/* restore the carry saved in %r8, then rp[i] = tp[i] + (~n[i]&mask) + c */
3423	negq	%r8
3424	adcq	0(%rdi),%r12
3425	adcq	8(%rdi),%r13
3426	adcq	16(%rdi),%r14
3427	adcq	24(%rdi),%r15
3428	movq	%r12,0(%rdx)
3429	leaq	32(%rdi),%rdi
3430	movq	%r13,8(%rdx)
/* preserve CF across the loop-control instructions below */
3431	sbbq	%r8,%r8
3432	movq	%r14,16(%rdx)
3433	movq	%r15,24(%rdx)
3434	leaq	32(%rdx),%rdx
3435
3436	incq	%rcx
3437	jnz	.Lsqrx4x_sub
3438
/* hand the (negated) size back to the caller in %r9 */
3439	negq	%r9
3440
/* rep ret */
3441	.byte	0xf3,0xc3
3442.size	__bn_postx4x_internal,.-__bn_postx4x_internal
/*
 * bn_get_bits5
 *
 * In:  %rdi = little-endian bit string (the exponent), %esi = bit offset
 * Out: %eax = the 5-bit window starting at that bit offset (0..31)
 *
 * Branch-free: works on 16-bit loads.  %esi>>4 indexes the 16-bit word,
 * %ecx = offset&15 is the shift within it.  When the shift is > 11 the
 * 5-bit window would straddle the 16-bit load, so a cmov switches to the
 * byte-shifted base (%rdi+1) and reduces the shift by 8 instead -- same
 * bits, single aligned-enough load, no data-dependent branch.
 */
3443.globl	bn_get_bits5
3444.type	bn_get_bits5,@function
3445.align	16
3446bn_get_bits5:
3447	leaq	0(%rdi),%r10
3448	leaq	1(%rdi),%r11
3449	movl	%esi,%ecx
3450	shrl	$4,%esi
3451	andl	$15,%ecx
/* precompute the straddle-case shift (ecx-8); selected only if ecx>11 */
3452	leal	-8(%rcx),%eax
3453	cmpl	$11,%ecx
3454	cmovaq	%r11,%r10
3455	cmoval	%eax,%ecx
3456	movzwl	(%r10,%rsi,2),%eax
3457	shrl	%cl,%eax
3458	andl	$31,%eax
/* rep ret */
3459	.byte	0xf3,0xc3
3460.size	bn_get_bits5,.-bn_get_bits5
3461
/*
 * bn_scatter5
 *
 * In: %rdi = source limbs, %esi = limb count, %rdx = table base,
 *     %rcx = table-entry index
 *
 * Stores the number into column idx of the power table used by
 * bn_gather5: limb i is written to tbl + idx*8 + i*256, i.e. each
 * 256-byte row holds limb i of all 32 entries (the layout bn_gather5
 * reads back mask-wise).  No-op when the limb count is zero.
 */
3462.globl	bn_scatter5
3463.type	bn_scatter5,@function
3464.align	16
3465bn_scatter5:
3466	cmpl	$0,%esi
3467	jz	.Lscatter_epilogue
/* &tbl[idx] -- column start; advance by the 256-byte row stride below */
3468	leaq	(%rdx,%rcx,8),%rdx
3469.Lscatter:
3470	movq	(%rdi),%rax
3471	leaq	8(%rdi),%rdi
3472	movq	%rax,(%rdx)
3473	leaq	256(%rdx),%rdx
3474	subl	$1,%esi
3475	jnz	.Lscatter
3476.Lscatter_epilogue:
/* rep ret */
3477	.byte	0xf3,0xc3
3478.size	bn_scatter5,.-bn_scatter5
3479
/*
 * bn_gather5
 *
 * In: %rdi = output limbs, %esi = limb count, %rdx = power table,
 *     %ecx = entry index (0..31)
 *
 * Cache-timing-safe gather: rather than loading only entry idx, each
 * iteration reads an entire 256-byte table row (all 32 entries, laid
 * out by bn_scatter5) and masks away everything but the selected one,
 * so the memory access pattern is independent of the secret index.
 * Sixteen 128-bit select masks (two 64-bit entries per xmm) are built
 * once on the stack by comparing a running counter, stepped via .Linc,
 * against the broadcast index.
 */
3480.globl	bn_gather5
3481.type	bn_gather5,@function
3482.align	32
3483bn_gather5:
3484.LSEH_begin_bn_gather5:
3485
/* hand-encoded "leaq (%rsp),%r10 ; subq $0x108,%rsp": %r10 preserves the
   caller's stack pointer (restored at the end); 264 bytes are reserved
   for the 256-byte mask area.  Raw bytes presumably keep the prologue
   encoding stable for the Win64 SEH annotations -- see perlasm source. */
3486.byte	0x4c,0x8d,0x14,0x24
3487.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00
3488	leaq	.Linc(%rip),%rax
3489	andq	$-16,%rsp
3490
/* broadcast the entry index into all four dwords of %xmm5 */
3491	movd	%ecx,%xmm5
3492	movdqa	0(%rax),%xmm0
3493	movdqa	16(%rax),%xmm1
/* bias the table and mask pointers so all row offsets fit in disp8 */
3494	leaq	128(%rdx),%r11
3495	leaq	128(%rsp),%rax
3496
/* mask ladder: counter (xmm0/xmm1/...) starts at {0,0,1,1} and steps by
   {2,2,2,2} (%xmm4); pcmpeqd leaves all-ones in the 64-bit lane whose
   counter equals the index, all-zeros elsewhere.  The 16 resulting masks
   are stored at -128(%rax)..112(%rax). */
3497	pshufd	$0,%xmm5,%xmm5
3498	movdqa	%xmm1,%xmm4
3499	movdqa	%xmm1,%xmm2
3500	paddd	%xmm0,%xmm1
3501	pcmpeqd	%xmm5,%xmm0
3502	movdqa	%xmm4,%xmm3
3503
3504	paddd	%xmm1,%xmm2
3505	pcmpeqd	%xmm5,%xmm1
3506	movdqa	%xmm0,-128(%rax)
3507	movdqa	%xmm4,%xmm0
3508
3509	paddd	%xmm2,%xmm3
3510	pcmpeqd	%xmm5,%xmm2
3511	movdqa	%xmm1,-112(%rax)
3512	movdqa	%xmm4,%xmm1
3513
3514	paddd	%xmm3,%xmm0
3515	pcmpeqd	%xmm5,%xmm3
3516	movdqa	%xmm2,-96(%rax)
3517	movdqa	%xmm4,%xmm2
3518	paddd	%xmm0,%xmm1
3519	pcmpeqd	%xmm5,%xmm0
3520	movdqa	%xmm3,-80(%rax)
3521	movdqa	%xmm4,%xmm3
3522
3523	paddd	%xmm1,%xmm2
3524	pcmpeqd	%xmm5,%xmm1
3525	movdqa	%xmm0,-64(%rax)
3526	movdqa	%xmm4,%xmm0
3527
3528	paddd	%xmm2,%xmm3
3529	pcmpeqd	%xmm5,%xmm2
3530	movdqa	%xmm1,-48(%rax)
3531	movdqa	%xmm4,%xmm1
3532
3533	paddd	%xmm3,%xmm0
3534	pcmpeqd	%xmm5,%xmm3
3535	movdqa	%xmm2,-32(%rax)
3536	movdqa	%xmm4,%xmm2
3537	paddd	%xmm0,%xmm1
3538	pcmpeqd	%xmm5,%xmm0
3539	movdqa	%xmm3,-16(%rax)
3540	movdqa	%xmm4,%xmm3
3541
3542	paddd	%xmm1,%xmm2
3543	pcmpeqd	%xmm5,%xmm1
3544	movdqa	%xmm0,0(%rax)
3545	movdqa	%xmm4,%xmm0
3546
3547	paddd	%xmm2,%xmm3
3548	pcmpeqd	%xmm5,%xmm2
3549	movdqa	%xmm1,16(%rax)
3550	movdqa	%xmm4,%xmm1
3551
3552	paddd	%xmm3,%xmm0
3553	pcmpeqd	%xmm5,%xmm3
3554	movdqa	%xmm2,32(%rax)
3555	movdqa	%xmm4,%xmm2
3556	paddd	%xmm0,%xmm1
3557	pcmpeqd	%xmm5,%xmm0
3558	movdqa	%xmm3,48(%rax)
3559	movdqa	%xmm4,%xmm3
3560
3561	paddd	%xmm1,%xmm2
3562	pcmpeqd	%xmm5,%xmm1
3563	movdqa	%xmm0,64(%rax)
3564	movdqa	%xmm4,%xmm0
3565
3566	paddd	%xmm2,%xmm3
3567	pcmpeqd	%xmm5,%xmm2
3568	movdqa	%xmm1,80(%rax)
3569	movdqa	%xmm4,%xmm1
3570
3571	paddd	%xmm3,%xmm0
3572	pcmpeqd	%xmm5,%xmm3
3573	movdqa	%xmm2,96(%rax)
3574	movdqa	%xmm4,%xmm2
3575	movdqa	%xmm3,112(%rax)
3576	jmp	.Lgather
3577
/* one output limb per iteration: AND every 16-byte slot of the current
   256-byte row (%r11) with its mask (%rax), OR everything together in
   %xmm4/%xmm5 -- only the selected entry survives */
3578.align	32
3579.Lgather:
3580	pxor	%xmm4,%xmm4
3581	pxor	%xmm5,%xmm5
3582	movdqa	-128(%r11),%xmm0
3583	movdqa	-112(%r11),%xmm1
3584	movdqa	-96(%r11),%xmm2
3585	pand	-128(%rax),%xmm0
3586	movdqa	-80(%r11),%xmm3
3587	pand	-112(%rax),%xmm1
3588	por	%xmm0,%xmm4
3589	pand	-96(%rax),%xmm2
3590	por	%xmm1,%xmm5
3591	pand	-80(%rax),%xmm3
3592	por	%xmm2,%xmm4
3593	por	%xmm3,%xmm5
3594	movdqa	-64(%r11),%xmm0
3595	movdqa	-48(%r11),%xmm1
3596	movdqa	-32(%r11),%xmm2
3597	pand	-64(%rax),%xmm0
3598	movdqa	-16(%r11),%xmm3
3599	pand	-48(%rax),%xmm1
3600	por	%xmm0,%xmm4
3601	pand	-32(%rax),%xmm2
3602	por	%xmm1,%xmm5
3603	pand	-16(%rax),%xmm3
3604	por	%xmm2,%xmm4
3605	por	%xmm3,%xmm5
3606	movdqa	0(%r11),%xmm0
3607	movdqa	16(%r11),%xmm1
3608	movdqa	32(%r11),%xmm2
3609	pand	0(%rax),%xmm0
3610	movdqa	48(%r11),%xmm3
3611	pand	16(%rax),%xmm1
3612	por	%xmm0,%xmm4
3613	pand	32(%rax),%xmm2
3614	por	%xmm1,%xmm5
3615	pand	48(%rax),%xmm3
3616	por	%xmm2,%xmm4
3617	por	%xmm3,%xmm5
3618	movdqa	64(%r11),%xmm0
3619	movdqa	80(%r11),%xmm1
3620	movdqa	96(%r11),%xmm2
3621	pand	64(%rax),%xmm0
3622	movdqa	112(%r11),%xmm3
3623	pand	80(%rax),%xmm1
3624	por	%xmm0,%xmm4
3625	pand	96(%rax),%xmm2
3626	por	%xmm1,%xmm5
3627	pand	112(%rax),%xmm3
3628	por	%xmm2,%xmm4
3629	por	%xmm3,%xmm5
3630	por	%xmm5,%xmm4
/* advance to the next 256-byte row (next limb of every entry) */
3631	leaq	256(%r11),%r11
/* fold the two 64-bit halves; the unselected half is zero */
3632	pshufd	$0x4e,%xmm4,%xmm0
3633	por	%xmm4,%xmm0
3634	movq	%xmm0,(%rdi)
3635	leaq	8(%rdi),%rdi
3636	subl	$1,%esi
3637	jnz	.Lgather
3638
/* restore the caller's stack pointer saved in %r10 at entry */
3639	leaq	(%r10),%rsp
/* rep ret */
3640	.byte	0xf3,0xc3
3641.LSEH_end_bn_gather5:
3642.size	bn_gather5,.-bn_gather5
/* counter step vectors for the mask ladders above: the first paddd adds
   {0,0,1,1} (lane counters 0 and 1), every later step adds {2,2,2,2} */
3643.align	64
3644.Linc:
3645.long	0,0, 1,1
3646.long	2,2, 2,2
/* ASCII, NUL-terminated: "Montgomery Multiplication with scatter/gather
   for x86_64, CRYPTOGAMS by <appro@openssl.org>" */
3647.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
3648