/* x86_64-mont5.S revision 305153 */
1/* $FreeBSD: stable/11/secure/lib/libcrypto/amd64/x86_64-mont5.S 305153 2016-08-31 20:33:59Z jkim $ */
2/* Do not modify. This file is auto-generated from x86_64-mont5.pl. */
3.text
4
5
6
7.globl	bn_mul_mont_gather5
8.type	bn_mul_mont_gather5,@function
9.align	64
10bn_mul_mont_gather5:
11	testl	$7,%r9d
12	jnz	.Lmul_enter
13	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
14	jmp	.Lmul4x_enter
15
16.align	16
17.Lmul_enter:
18	movl	%r9d,%r9d
19	movq	%rsp,%rax
20	movd	8(%rsp),%xmm5
21	leaq	.Linc(%rip),%r10
22	pushq	%rbx
23	pushq	%rbp
24	pushq	%r12
25	pushq	%r13
26	pushq	%r14
27	pushq	%r15
28
29	leaq	2(%r9),%r11
30	negq	%r11
31	leaq	-264(%rsp,%r11,8),%rsp
32	andq	$-1024,%rsp
33
34	movq	%rax,8(%rsp,%r9,8)
35.Lmul_body:
36
37
38
39
40
41
42	subq	%rsp,%rax
43	andq	$-4096,%rax
44.Lmul_page_walk:
45	movq	(%rsp,%rax,1),%r11
46	subq	$4096,%rax
47.byte	0x2e
48	jnc	.Lmul_page_walk
49
50	leaq	128(%rdx),%r12
51	movdqa	0(%r10),%xmm0
52	movdqa	16(%r10),%xmm1
53	leaq	24-112(%rsp,%r9,8),%r10
54	andq	$-16,%r10
55
56	pshufd	$0,%xmm5,%xmm5
57	movdqa	%xmm1,%xmm4
58	movdqa	%xmm1,%xmm2
59	paddd	%xmm0,%xmm1
60	pcmpeqd	%xmm5,%xmm0
61.byte	0x67
62	movdqa	%xmm4,%xmm3
63	paddd	%xmm1,%xmm2
64	pcmpeqd	%xmm5,%xmm1
65	movdqa	%xmm0,112(%r10)
66	movdqa	%xmm4,%xmm0
67
68	paddd	%xmm2,%xmm3
69	pcmpeqd	%xmm5,%xmm2
70	movdqa	%xmm1,128(%r10)
71	movdqa	%xmm4,%xmm1
72
73	paddd	%xmm3,%xmm0
74	pcmpeqd	%xmm5,%xmm3
75	movdqa	%xmm2,144(%r10)
76	movdqa	%xmm4,%xmm2
77
78	paddd	%xmm0,%xmm1
79	pcmpeqd	%xmm5,%xmm0
80	movdqa	%xmm3,160(%r10)
81	movdqa	%xmm4,%xmm3
82	paddd	%xmm1,%xmm2
83	pcmpeqd	%xmm5,%xmm1
84	movdqa	%xmm0,176(%r10)
85	movdqa	%xmm4,%xmm0
86
87	paddd	%xmm2,%xmm3
88	pcmpeqd	%xmm5,%xmm2
89	movdqa	%xmm1,192(%r10)
90	movdqa	%xmm4,%xmm1
91
92	paddd	%xmm3,%xmm0
93	pcmpeqd	%xmm5,%xmm3
94	movdqa	%xmm2,208(%r10)
95	movdqa	%xmm4,%xmm2
96
97	paddd	%xmm0,%xmm1
98	pcmpeqd	%xmm5,%xmm0
99	movdqa	%xmm3,224(%r10)
100	movdqa	%xmm4,%xmm3
101	paddd	%xmm1,%xmm2
102	pcmpeqd	%xmm5,%xmm1
103	movdqa	%xmm0,240(%r10)
104	movdqa	%xmm4,%xmm0
105
106	paddd	%xmm2,%xmm3
107	pcmpeqd	%xmm5,%xmm2
108	movdqa	%xmm1,256(%r10)
109	movdqa	%xmm4,%xmm1
110
111	paddd	%xmm3,%xmm0
112	pcmpeqd	%xmm5,%xmm3
113	movdqa	%xmm2,272(%r10)
114	movdqa	%xmm4,%xmm2
115
116	paddd	%xmm0,%xmm1
117	pcmpeqd	%xmm5,%xmm0
118	movdqa	%xmm3,288(%r10)
119	movdqa	%xmm4,%xmm3
120	paddd	%xmm1,%xmm2
121	pcmpeqd	%xmm5,%xmm1
122	movdqa	%xmm0,304(%r10)
123
124	paddd	%xmm2,%xmm3
125.byte	0x67
126	pcmpeqd	%xmm5,%xmm2
127	movdqa	%xmm1,320(%r10)
128
129	pcmpeqd	%xmm5,%xmm3
130	movdqa	%xmm2,336(%r10)
131	pand	64(%r12),%xmm0
132
133	pand	80(%r12),%xmm1
134	pand	96(%r12),%xmm2
135	movdqa	%xmm3,352(%r10)
136	pand	112(%r12),%xmm3
137	por	%xmm2,%xmm0
138	por	%xmm3,%xmm1
139	movdqa	-128(%r12),%xmm4
140	movdqa	-112(%r12),%xmm5
141	movdqa	-96(%r12),%xmm2
142	pand	112(%r10),%xmm4
143	movdqa	-80(%r12),%xmm3
144	pand	128(%r10),%xmm5
145	por	%xmm4,%xmm0
146	pand	144(%r10),%xmm2
147	por	%xmm5,%xmm1
148	pand	160(%r10),%xmm3
149	por	%xmm2,%xmm0
150	por	%xmm3,%xmm1
151	movdqa	-64(%r12),%xmm4
152	movdqa	-48(%r12),%xmm5
153	movdqa	-32(%r12),%xmm2
154	pand	176(%r10),%xmm4
155	movdqa	-16(%r12),%xmm3
156	pand	192(%r10),%xmm5
157	por	%xmm4,%xmm0
158	pand	208(%r10),%xmm2
159	por	%xmm5,%xmm1
160	pand	224(%r10),%xmm3
161	por	%xmm2,%xmm0
162	por	%xmm3,%xmm1
163	movdqa	0(%r12),%xmm4
164	movdqa	16(%r12),%xmm5
165	movdqa	32(%r12),%xmm2
166	pand	240(%r10),%xmm4
167	movdqa	48(%r12),%xmm3
168	pand	256(%r10),%xmm5
169	por	%xmm4,%xmm0
170	pand	272(%r10),%xmm2
171	por	%xmm5,%xmm1
172	pand	288(%r10),%xmm3
173	por	%xmm2,%xmm0
174	por	%xmm3,%xmm1
175	por	%xmm1,%xmm0
176	pshufd	$0x4e,%xmm0,%xmm1
177	por	%xmm1,%xmm0
178	leaq	256(%r12),%r12
179.byte	102,72,15,126,195
180
181	movq	(%r8),%r8
182	movq	(%rsi),%rax
183
184	xorq	%r14,%r14
185	xorq	%r15,%r15
186
187	movq	%r8,%rbp
188	mulq	%rbx
189	movq	%rax,%r10
190	movq	(%rcx),%rax
191
192	imulq	%r10,%rbp
193	movq	%rdx,%r11
194
195	mulq	%rbp
196	addq	%rax,%r10
197	movq	8(%rsi),%rax
198	adcq	$0,%rdx
199	movq	%rdx,%r13
200
201	leaq	1(%r15),%r15
202	jmp	.L1st_enter
203
204.align	16
205.L1st:
206	addq	%rax,%r13
207	movq	(%rsi,%r15,8),%rax
208	adcq	$0,%rdx
209	addq	%r11,%r13
210	movq	%r10,%r11
211	adcq	$0,%rdx
212	movq	%r13,-16(%rsp,%r15,8)
213	movq	%rdx,%r13
214
215.L1st_enter:
216	mulq	%rbx
217	addq	%rax,%r11
218	movq	(%rcx,%r15,8),%rax
219	adcq	$0,%rdx
220	leaq	1(%r15),%r15
221	movq	%rdx,%r10
222
223	mulq	%rbp
224	cmpq	%r9,%r15
225	jne	.L1st
226
227
228	addq	%rax,%r13
229	adcq	$0,%rdx
230	addq	%r11,%r13
231	adcq	$0,%rdx
232	movq	%r13,-16(%rsp,%r9,8)
233	movq	%rdx,%r13
234	movq	%r10,%r11
235
236	xorq	%rdx,%rdx
237	addq	%r11,%r13
238	adcq	$0,%rdx
239	movq	%r13,-8(%rsp,%r9,8)
240	movq	%rdx,(%rsp,%r9,8)
241
242	leaq	1(%r14),%r14
243	jmp	.Louter
244.align	16
245.Louter:
246	leaq	24+128(%rsp,%r9,8),%rdx
247	andq	$-16,%rdx
248	pxor	%xmm4,%xmm4
249	pxor	%xmm5,%xmm5
250	movdqa	-128(%r12),%xmm0
251	movdqa	-112(%r12),%xmm1
252	movdqa	-96(%r12),%xmm2
253	movdqa	-80(%r12),%xmm3
254	pand	-128(%rdx),%xmm0
255	pand	-112(%rdx),%xmm1
256	por	%xmm0,%xmm4
257	pand	-96(%rdx),%xmm2
258	por	%xmm1,%xmm5
259	pand	-80(%rdx),%xmm3
260	por	%xmm2,%xmm4
261	por	%xmm3,%xmm5
262	movdqa	-64(%r12),%xmm0
263	movdqa	-48(%r12),%xmm1
264	movdqa	-32(%r12),%xmm2
265	movdqa	-16(%r12),%xmm3
266	pand	-64(%rdx),%xmm0
267	pand	-48(%rdx),%xmm1
268	por	%xmm0,%xmm4
269	pand	-32(%rdx),%xmm2
270	por	%xmm1,%xmm5
271	pand	-16(%rdx),%xmm3
272	por	%xmm2,%xmm4
273	por	%xmm3,%xmm5
274	movdqa	0(%r12),%xmm0
275	movdqa	16(%r12),%xmm1
276	movdqa	32(%r12),%xmm2
277	movdqa	48(%r12),%xmm3
278	pand	0(%rdx),%xmm0
279	pand	16(%rdx),%xmm1
280	por	%xmm0,%xmm4
281	pand	32(%rdx),%xmm2
282	por	%xmm1,%xmm5
283	pand	48(%rdx),%xmm3
284	por	%xmm2,%xmm4
285	por	%xmm3,%xmm5
286	movdqa	64(%r12),%xmm0
287	movdqa	80(%r12),%xmm1
288	movdqa	96(%r12),%xmm2
289	movdqa	112(%r12),%xmm3
290	pand	64(%rdx),%xmm0
291	pand	80(%rdx),%xmm1
292	por	%xmm0,%xmm4
293	pand	96(%rdx),%xmm2
294	por	%xmm1,%xmm5
295	pand	112(%rdx),%xmm3
296	por	%xmm2,%xmm4
297	por	%xmm3,%xmm5
298	por	%xmm5,%xmm4
299	pshufd	$0x4e,%xmm4,%xmm0
300	por	%xmm4,%xmm0
301	leaq	256(%r12),%r12
302
303	movq	(%rsi),%rax
304.byte	102,72,15,126,195
305
306	xorq	%r15,%r15
307	movq	%r8,%rbp
308	movq	(%rsp),%r10
309
310	mulq	%rbx
311	addq	%rax,%r10
312	movq	(%rcx),%rax
313	adcq	$0,%rdx
314
315	imulq	%r10,%rbp
316	movq	%rdx,%r11
317
318	mulq	%rbp
319	addq	%rax,%r10
320	movq	8(%rsi),%rax
321	adcq	$0,%rdx
322	movq	8(%rsp),%r10
323	movq	%rdx,%r13
324
325	leaq	1(%r15),%r15
326	jmp	.Linner_enter
327
328.align	16
329.Linner:
330	addq	%rax,%r13
331	movq	(%rsi,%r15,8),%rax
332	adcq	$0,%rdx
333	addq	%r10,%r13
334	movq	(%rsp,%r15,8),%r10
335	adcq	$0,%rdx
336	movq	%r13,-16(%rsp,%r15,8)
337	movq	%rdx,%r13
338
339.Linner_enter:
340	mulq	%rbx
341	addq	%rax,%r11
342	movq	(%rcx,%r15,8),%rax
343	adcq	$0,%rdx
344	addq	%r11,%r10
345	movq	%rdx,%r11
346	adcq	$0,%r11
347	leaq	1(%r15),%r15
348
349	mulq	%rbp
350	cmpq	%r9,%r15
351	jne	.Linner
352
353	addq	%rax,%r13
354	adcq	$0,%rdx
355	addq	%r10,%r13
356	movq	(%rsp,%r9,8),%r10
357	adcq	$0,%rdx
358	movq	%r13,-16(%rsp,%r9,8)
359	movq	%rdx,%r13
360
361	xorq	%rdx,%rdx
362	addq	%r11,%r13
363	adcq	$0,%rdx
364	addq	%r10,%r13
365	adcq	$0,%rdx
366	movq	%r13,-8(%rsp,%r9,8)
367	movq	%rdx,(%rsp,%r9,8)
368
369	leaq	1(%r14),%r14
370	cmpq	%r9,%r14
371	jb	.Louter
372
373	xorq	%r14,%r14
374	movq	(%rsp),%rax
375	leaq	(%rsp),%rsi
376	movq	%r9,%r15
377	jmp	.Lsub
378.align	16
379.Lsub:	sbbq	(%rcx,%r14,8),%rax
380	movq	%rax,(%rdi,%r14,8)
381	movq	8(%rsi,%r14,8),%rax
382	leaq	1(%r14),%r14
383	decq	%r15
384	jnz	.Lsub
385
386	sbbq	$0,%rax
387	xorq	%r14,%r14
388	andq	%rax,%rsi
389	notq	%rax
390	movq	%rdi,%rcx
391	andq	%rax,%rcx
392	movq	%r9,%r15
393	orq	%rcx,%rsi
394.align	16
395.Lcopy:
396	movq	(%rsi,%r14,8),%rax
397	movq	%r14,(%rsp,%r14,8)
398	movq	%rax,(%rdi,%r14,8)
399	leaq	1(%r14),%r14
400	subq	$1,%r15
401	jnz	.Lcopy
402
403	movq	8(%rsp,%r9,8),%rsi
404	movq	$1,%rax
405
406	movq	-48(%rsi),%r15
407	movq	-40(%rsi),%r14
408	movq	-32(%rsi),%r13
409	movq	-24(%rsi),%r12
410	movq	-16(%rsi),%rbp
411	movq	-8(%rsi),%rbx
412	leaq	(%rsi),%rsp
413.Lmul_epilogue:
414	.byte	0xf3,0xc3
415.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
#-----------------------------------------------------------------------
# bn_mul4x_mont_gather5 -- 4-way unrolled front end, entered when num
# is a multiple of 8 (%r11d already holds the ia32cap word).  If all
# 0x80108 feature bits are set, control transfers to the MULX/AD*X
# variant (.Lmulx4x_enter, defined elsewhere in this file).
# Allocates a frame whose offset is skewed relative to rp modulo 4096
# (avoids rp/tp aliasing in the same cache set), probes the new stack
# pages, then runs mul4x_internal.  Returns %rax = 1.
#-----------------------------------------------------------------------
.type	bn_mul4x_mont_gather5,@function
.align	32
bn_mul4x_mont_gather5:
.Lmul4x_enter:
	andl	$0x80108,%r11d
	cmpl	$0x80108,%r11d		# MULX/AD*X capable?
	je	.Lmulx4x_enter
.byte	0x67
	movq	%rsp,%rax		# original %rsp for the epilogue
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

.byte	0x67
	shll	$3,%r9d			# num in bytes
	leaq	(%r9,%r9,2),%r10	# 3*num bytes
	negq	%r9

	# Choose a frame start whose distance from rp modulo 4096 avoids
	# aliasing; fall back to the alternate layout when too close.
	leaq	-320(%rsp,%r9,2),%r11
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lmul4xsp_alt
	subq	%r11,%rsp
	leaq	-320(%rsp,%r9,2),%rsp
	jmp	.Lmul4xsp_done

.align	32
.Lmul4xsp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rsp,%r9,2),%rsp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rsp
.Lmul4xsp_done:
	andq	$-64,%rsp		# 64-byte-align the frame
	movq	%rax,%r11
	subq	%rsp,%r11
	andq	$-4096,%r11
.Lmul4x_page_walk:			# probe every page of the new frame
	movq	(%rsp,%r11,1),%r10
	subq	$4096,%r11
.byte	0x2e				# cs-prefix pad
	jnc	.Lmul4x_page_walk

	negq	%r9

	movq	%rax,40(%rsp)		# saved original %rsp
.Lmul4x_body:

	call	mul4x_internal

	movq	40(%rsp),%rsi
	movq	$1,%rax			# return 1

	movq	-48(%rsi),%r15		# restore callee-saved registers
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lmul4x_epilogue:
	.byte	0xf3,0xc3		# rep ret
.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
494
#-----------------------------------------------------------------------
# mul4x_internal -- 4-way unrolled Montgomery multiplication kernel.
# Private helper: entered from bn_mul4x_mont_gather5 and bn_power5 with
# the frame already established and %rax still holding the caller's
# original %rsp (the power index is reloaded from 8(%rax)).
# %r9 = num*8 on entry; %r12 walks the scattered bp table and the loop
# terminates when it passes the table end saved at 16+8(%rsp).
# The gathered b[i] arrives in %rbx via the hand-encoded movq; %rdi is
# reused inside as a rotating carry limb (rp is parked at 56+8(%rsp)).
# Ends by jumping to .Lsqr4x_sub_entry (shared conditional-subtract
# tail defined further down in this file).
#-----------------------------------------------------------------------
.type	mul4x_internal,@function
.align	32
mul4x_internal:
	shlq	$5,%r9			# num*8*32 = total table size in bytes
	movd	8(%rax),%xmm5		# power index from caller's original stack
	leaq	.Linc(%rip),%rax
	leaq	128(%rdx,%r9,1),%r13	# end of the 32-entry power table
	shrq	$5,%r9			# back to num*8
	movdqa	0(%rax),%xmm0
	movdqa	16(%rax),%xmm1
	leaq	88-112(%rsp,%r9,1),%r10	# mask scratch area
	leaq	128(%rdx),%r12		# table cursor, biased by 128 for disp8

	# Build sixteen comparison masks at 112..352(%r10): all-ones in
	# the lane(s) where the running counter equals the power index.
	pshufd	$0,%xmm5,%xmm5		# broadcast power index
	movdqa	%xmm1,%xmm4
.byte	0x67,0x67
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
.byte	0x67
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,112(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,128(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,144(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,160(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,176(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,192(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,208(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,224(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,240(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,256(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,272(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,288(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,304(%r10)

	paddd	%xmm2,%xmm3
.byte	0x67
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,320(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,336(%r10)
	# Constant-time gather of b[0]: mask every table line, OR together.
	pand	64(%r12),%xmm0

	pand	80(%r12),%xmm1
	pand	96(%r12),%xmm2
	movdqa	%xmm3,352(%r10)
	pand	112(%r12),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-128(%r12),%xmm4
	movdqa	-112(%r12),%xmm5
	movdqa	-96(%r12),%xmm2
	pand	112(%r10),%xmm4
	movdqa	-80(%r12),%xmm3
	pand	128(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	144(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	160(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-64(%r12),%xmm4
	movdqa	-48(%r12),%xmm5
	movdqa	-32(%r12),%xmm2
	pand	176(%r10),%xmm4
	movdqa	-16(%r12),%xmm3
	pand	192(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	208(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	224(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	0(%r12),%xmm4
	movdqa	16(%r12),%xmm5
	movdqa	32(%r12),%xmm2
	pand	240(%r10),%xmm4
	movdqa	48(%r12),%xmm3
	pand	256(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	272(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	288(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	por	%xmm1,%xmm0
	pshufd	$0x4e,%xmm0,%xmm1	# fold high qword onto low
	por	%xmm1,%xmm0
	leaq	256(%r12),%r12		# advance table cursor
.byte	102,72,15,126,195		# movq %xmm0,%rbx -- gathered b[0]

	movq	%r13,16+8(%rsp)		# table end = outer-loop bound
	movq	%rdi,56+8(%rsp)		# park rp (%rdi is reused as carry)

	movq	(%r8),%r8		# n0 value
	movq	(%rsi),%rax		# ap[0]
	leaq	(%rsi,%r9,1),%rsi	# point past ap[], index negatively
	negq	%r9

	movq	%r8,%rbp
	mulq	%rbx			# ap[0]*b[0]
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp		# m = tp[0]*n0 mod 2^64
	leaq	64+8(%rsp),%r14		# tp cursor
	movq	%rdx,%r11

	mulq	%rbp			# np[0]*m annihilates the low limb
	addq	%rax,%r10
	movq	8(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%r9),%r15		# j index, counts up to 0 by 32 bytes
	leaq	32(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdi,(%r14)
	movq	%rdx,%r13
	jmp	.L1st4x

.align	32
.L1st4x:				# four limbs of ap[]*b[0] + np[]*m per pass
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx),%rax
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	0(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdi,(%r14)
	movq	%rdx,%r13

	addq	$32,%r15
	jnz	.L1st4x

	# Tail: last four limbs of the first pass.
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx),%rax
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r9,1),%rax	# reload ap[0] for the next outer pass
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%r13

	leaq	(%rcx,%r9,1),%rcx	# rewind np

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi			# %rdi = top carry limb
	movq	%r13,-8(%r14)

	jmp	.Louter4x

.align	32
.Louter4x:
	# Per-b[i] pass: constant-time gather of b[i] (masks relative to
	# the current tp tail), then the 4-way multiply-accumulate loop.
	leaq	16+128(%r14),%rdx
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	movdqa	-128(%r12),%xmm0
	movdqa	-112(%r12),%xmm1
	movdqa	-96(%r12),%xmm2
	movdqa	-80(%r12),%xmm3
	pand	-128(%rdx),%xmm0
	pand	-112(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r12),%xmm0
	movdqa	-48(%r12),%xmm1
	movdqa	-32(%r12),%xmm2
	movdqa	-16(%r12),%xmm3
	pand	-64(%rdx),%xmm0
	pand	-48(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r12),%xmm0
	movdqa	16(%r12),%xmm1
	movdqa	32(%r12),%xmm2
	movdqa	48(%r12),%xmm3
	pand	0(%rdx),%xmm0
	pand	16(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r12),%xmm0
	movdqa	80(%r12),%xmm1
	movdqa	96(%r12),%xmm2
	movdqa	112(%r12),%xmm3
	pand	64(%rdx),%xmm0
	pand	80(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	pshufd	$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	leaq	256(%r12),%r12
.byte	102,72,15,126,195		# movq %xmm0,%rbx -- gathered b[i]

	movq	(%r14,%r9,1),%r10	# tp[0]
	movq	%r8,%rbp
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp		# m = tp[0]*n0 mod 2^64
	movq	%rdx,%r11
	movq	%rdi,(%r14)		# store previous top carry

	leaq	(%r14,%r9,1),%r14	# rewind tp

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%r9),%r15
	leaq	32(%rcx),%rcx
	adcq	$0,%rdx
	movq	%rdx,%r13
	jmp	.Linner4x

.align	32
.Linner4x:				# tp[j] += ap[j]*b[i] + np[j]*m, 4 limbs/pass
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx),%rax
	adcq	$0,%rdx
	addq	16(%r14),%r10
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-32(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx),%rax
	adcq	$0,%rdx
	addq	-8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	0(%rcx),%rax
	adcq	$0,%rdx
	addq	(%r14),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-16(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi,%r15,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	32(%rcx),%rcx
	adcq	$0,%rdx
	movq	%r13,-8(%r14)
	movq	%rdx,%r13

	addq	$32,%r15
	jnz	.Linner4x

	# Tail of the inner loop (last four limbs).
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx),%rax
	adcq	$0,%rdx
	addq	16(%r14),%r10
	leaq	32(%r14),%r14
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%rdi,-32(%r14)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	%rbp,%rax		# keep m in %rax, fetch np[-1] into %rbp
	movq	-8(%rcx),%rbp
	adcq	$0,%rdx
	addq	-8(%r14),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp			# m * np[-1]
	addq	%rax,%rdi
	movq	(%rsi,%r9,1),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%r13,-24(%r14)
	movq	%rdx,%r13

	movq	%rdi,-16(%r14)
	leaq	(%rcx,%r9,1),%rcx	# rewind np

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%r14),%r13		# fold in previous top carry
	adcq	$0,%rdi
	movq	%r13,-8(%r14)

	cmpq	16+8(%rsp),%r12		# consumed the whole power table?
	jb	.Louter4x
	# Set up the shared conditional-subtract tail: derive the select
	# mask in %rax from the final borrow/carry state.
	xorq	%rax,%rax
	subq	%r13,%rbp
	adcq	%r15,%r15
	orq	%r15,%rdi
	subq	%rdi,%rax		# %rax = 0 or -1 select mask
	leaq	(%r14,%r9,1),%rbx	# tp
	movq	(%rcx),%r12
	leaq	(%rcx),%rbp		# np
	movq	%r9,%rcx
	sarq	$3+2,%rcx		# byte count -> 4-limb group count
	movq	56+8(%rsp),%rdi		# rp
	decq	%r12			# prime np[0] for the sbb-based entry
	xorq	%r10,%r10
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqr4x_sub_entry	# shared tail, defined later in this file
.size	mul4x_internal,.-mul4x_internal
#-----------------------------------------------------------------------
# bn_power5 -- five consecutive Montgomery squarings followed by one
# gather-based Montgomery multiplication (the workhorse of the 5-bit
# windowed modular exponentiation).  Argument layout matches
# bn_mul_mont_gather5; dispatches to the MULX/AD*X variant
# (.Lpowerx5_enter) when all 0x80108 ia32cap bits are set.
# Frame setup (rp-skewed placement + page probing) mirrors
# bn_mul4x_mont_gather5.  Returns %rax = 1.
#-----------------------------------------------------------------------
.globl	bn_power5
.type	bn_power5,@function
.align	32
bn_power5:
	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
	andl	$0x80108,%r11d
	cmpl	$0x80108,%r11d		# MULX/AD*X capable?
	je	.Lpowerx5_enter
	movq	%rsp,%rax		# original %rsp for the epilogue
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	shll	$3,%r9d			# num in bytes
	leal	(%r9,%r9,2),%r10d	# 3*num bytes
	negq	%r9
	movq	(%r8),%r8		# n0 value

	# Skew the frame relative to rp modulo 4096 (as in mul4x).
	leaq	-320(%rsp,%r9,2),%r11
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lpwr_sp_alt
	subq	%r11,%rsp
	leaq	-320(%rsp,%r9,2),%rsp
	jmp	.Lpwr_sp_done

.align	32
.Lpwr_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rsp,%r9,2),%rsp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rsp
.Lpwr_sp_done:
	andq	$-64,%rsp
	movq	%rax,%r11
	subq	%rsp,%r11
	andq	$-4096,%r11
.Lpwr_page_walk:			# probe every page of the new frame
	movq	(%rsp,%r11,1),%r10
	subq	$4096,%r11
.byte	0x2e				# cs-prefix pad
	jnc	.Lpwr_page_walk

	movq	%r9,%r10
	negq	%r9

	movq	%r8,32(%rsp)		# n0 at a fixed slot for the callees
	movq	%rax,40(%rsp)		# saved original %rsp
.Lpower5_body:
	# Stash pointers in xmm registers across the squaring calls.
.byte	102,72,15,110,207		# movq %rdi,%xmm1 -- rp
.byte	102,72,15,110,209		# movq %rcx,%xmm2 -- np
.byte	102,73,15,110,218		# movq %r10,%xmm3 -- num*8
.byte	102,72,15,110,226		# movq %rdx,%xmm4 -- bp power table

	call	__bn_sqr8x_internal	# five modular squarings
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal
	call	__bn_sqr8x_internal
	call	__bn_post4x_internal

.byte	102,72,15,126,209		# movq %xmm2,%rcx -- restore np
.byte	102,72,15,126,226		# movq %xmm4,%rdx -- restore bp table
	movq	%rsi,%rdi
	movq	40(%rsp),%rax		# caller's %rsp (power index lives there)
	leaq	32(%rsp),%r8		# &n0 slot, dereferenced by the callee

	call	mul4x_internal		# multiply by the gathered power

	movq	40(%rsp),%rsi
	movq	$1,%rax			# return 1
	movq	-48(%rsi),%r15		# restore callee-saved registers
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lpower5_epilogue:
	.byte	0xf3,0xc3		# rep ret
.size	bn_power5,.-bn_power5
1127
1128.globl	bn_sqr8x_internal
1129.hidden	bn_sqr8x_internal
1130.type	bn_sqr8x_internal,@function
1131.align	32
1132bn_sqr8x_internal:
1133__bn_sqr8x_internal:
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207	leaq	32(%r10),%rbp
1208	leaq	(%rsi,%r9,1),%rsi
1209
1210	movq	%r9,%rcx
1211
1212
1213	movq	-32(%rsi,%rbp,1),%r14
1214	leaq	48+8(%rsp,%r9,2),%rdi
1215	movq	-24(%rsi,%rbp,1),%rax
1216	leaq	-32(%rdi,%rbp,1),%rdi
1217	movq	-16(%rsi,%rbp,1),%rbx
1218	movq	%rax,%r15
1219
1220	mulq	%r14
1221	movq	%rax,%r10
1222	movq	%rbx,%rax
1223	movq	%rdx,%r11
1224	movq	%r10,-24(%rdi,%rbp,1)
1225
1226	mulq	%r14
1227	addq	%rax,%r11
1228	movq	%rbx,%rax
1229	adcq	$0,%rdx
1230	movq	%r11,-16(%rdi,%rbp,1)
1231	movq	%rdx,%r10
1232
1233
1234	movq	-8(%rsi,%rbp,1),%rbx
1235	mulq	%r15
1236	movq	%rax,%r12
1237	movq	%rbx,%rax
1238	movq	%rdx,%r13
1239
1240	leaq	(%rbp),%rcx
1241	mulq	%r14
1242	addq	%rax,%r10
1243	movq	%rbx,%rax
1244	movq	%rdx,%r11
1245	adcq	$0,%r11
1246	addq	%r12,%r10
1247	adcq	$0,%r11
1248	movq	%r10,-8(%rdi,%rcx,1)
1249	jmp	.Lsqr4x_1st
1250
1251.align	32
1252.Lsqr4x_1st:
1253	movq	(%rsi,%rcx,1),%rbx
1254	mulq	%r15
1255	addq	%rax,%r13
1256	movq	%rbx,%rax
1257	movq	%rdx,%r12
1258	adcq	$0,%r12
1259
1260	mulq	%r14
1261	addq	%rax,%r11
1262	movq	%rbx,%rax
1263	movq	8(%rsi,%rcx,1),%rbx
1264	movq	%rdx,%r10
1265	adcq	$0,%r10
1266	addq	%r13,%r11
1267	adcq	$0,%r10
1268
1269
1270	mulq	%r15
1271	addq	%rax,%r12
1272	movq	%rbx,%rax
1273	movq	%r11,(%rdi,%rcx,1)
1274	movq	%rdx,%r13
1275	adcq	$0,%r13
1276
1277	mulq	%r14
1278	addq	%rax,%r10
1279	movq	%rbx,%rax
1280	movq	16(%rsi,%rcx,1),%rbx
1281	movq	%rdx,%r11
1282	adcq	$0,%r11
1283	addq	%r12,%r10
1284	adcq	$0,%r11
1285
1286	mulq	%r15
1287	addq	%rax,%r13
1288	movq	%rbx,%rax
1289	movq	%r10,8(%rdi,%rcx,1)
1290	movq	%rdx,%r12
1291	adcq	$0,%r12
1292
1293	mulq	%r14
1294	addq	%rax,%r11
1295	movq	%rbx,%rax
1296	movq	24(%rsi,%rcx,1),%rbx
1297	movq	%rdx,%r10
1298	adcq	$0,%r10
1299	addq	%r13,%r11
1300	adcq	$0,%r10
1301
1302
1303	mulq	%r15
1304	addq	%rax,%r12
1305	movq	%rbx,%rax
1306	movq	%r11,16(%rdi,%rcx,1)
1307	movq	%rdx,%r13
1308	adcq	$0,%r13
1309	leaq	32(%rcx),%rcx
1310
1311	mulq	%r14
1312	addq	%rax,%r10
1313	movq	%rbx,%rax
1314	movq	%rdx,%r11
1315	adcq	$0,%r11
1316	addq	%r12,%r10
1317	adcq	$0,%r11
1318	movq	%r10,-8(%rdi,%rcx,1)
1319
1320	cmpq	$0,%rcx
1321	jne	.Lsqr4x_1st
1322
1323	mulq	%r15
1324	addq	%rax,%r13
1325	leaq	16(%rbp),%rbp
1326	adcq	$0,%rdx
1327	addq	%r11,%r13
1328	adcq	$0,%rdx
1329
1330	movq	%r13,(%rdi)
1331	movq	%rdx,%r12
1332	movq	%rdx,8(%rdi)
1333	jmp	.Lsqr4x_outer
1334
1335.align	32
1336.Lsqr4x_outer:
1337	movq	-32(%rsi,%rbp,1),%r14
1338	leaq	48+8(%rsp,%r9,2),%rdi
1339	movq	-24(%rsi,%rbp,1),%rax
1340	leaq	-32(%rdi,%rbp,1),%rdi
1341	movq	-16(%rsi,%rbp,1),%rbx
1342	movq	%rax,%r15
1343
1344	mulq	%r14
1345	movq	-24(%rdi,%rbp,1),%r10
1346	addq	%rax,%r10
1347	movq	%rbx,%rax
1348	adcq	$0,%rdx
1349	movq	%r10,-24(%rdi,%rbp,1)
1350	movq	%rdx,%r11
1351
1352	mulq	%r14
1353	addq	%rax,%r11
1354	movq	%rbx,%rax
1355	adcq	$0,%rdx
1356	addq	-16(%rdi,%rbp,1),%r11
1357	movq	%rdx,%r10
1358	adcq	$0,%r10
1359	movq	%r11,-16(%rdi,%rbp,1)
1360
1361	xorq	%r12,%r12
1362
1363	movq	-8(%rsi,%rbp,1),%rbx
1364	mulq	%r15
1365	addq	%rax,%r12
1366	movq	%rbx,%rax
1367	adcq	$0,%rdx
1368	addq	-8(%rdi,%rbp,1),%r12
1369	movq	%rdx,%r13
1370	adcq	$0,%r13
1371
1372	mulq	%r14
1373	addq	%rax,%r10
1374	movq	%rbx,%rax
1375	adcq	$0,%rdx
1376	addq	%r12,%r10
1377	movq	%rdx,%r11
1378	adcq	$0,%r11
1379	movq	%r10,-8(%rdi,%rbp,1)
1380
1381	leaq	(%rbp),%rcx
1382	jmp	.Lsqr4x_inner
1383
1384.align	32
1385.Lsqr4x_inner:
1386	movq	(%rsi,%rcx,1),%rbx
1387	mulq	%r15
1388	addq	%rax,%r13
1389	movq	%rbx,%rax
1390	movq	%rdx,%r12
1391	adcq	$0,%r12
1392	addq	(%rdi,%rcx,1),%r13
1393	adcq	$0,%r12
1394
1395.byte	0x67
1396	mulq	%r14
1397	addq	%rax,%r11
1398	movq	%rbx,%rax
1399	movq	8(%rsi,%rcx,1),%rbx
1400	movq	%rdx,%r10
1401	adcq	$0,%r10
1402	addq	%r13,%r11
1403	adcq	$0,%r10
1404
1405	mulq	%r15
1406	addq	%rax,%r12
1407	movq	%r11,(%rdi,%rcx,1)
1408	movq	%rbx,%rax
1409	movq	%rdx,%r13
1410	adcq	$0,%r13
1411	addq	8(%rdi,%rcx,1),%r12
1412	leaq	16(%rcx),%rcx
1413	adcq	$0,%r13
1414
1415	mulq	%r14
1416	addq	%rax,%r10
1417	movq	%rbx,%rax
1418	adcq	$0,%rdx
1419	addq	%r12,%r10
1420	movq	%rdx,%r11
1421	adcq	$0,%r11
1422	movq	%r10,-8(%rdi,%rcx,1)
1423
1424	cmpq	$0,%rcx
1425	jne	.Lsqr4x_inner
1426
1427.byte	0x67
1428	mulq	%r15
1429	addq	%rax,%r13
1430	adcq	$0,%rdx
1431	addq	%r11,%r13
1432	adcq	$0,%rdx
1433
1434	movq	%r13,(%rdi)
1435	movq	%rdx,%r12
1436	movq	%rdx,8(%rdi)
1437
1438	addq	$16,%rbp
1439	jnz	.Lsqr4x_outer
1440
1441
1442	movq	-32(%rsi),%r14
1443	leaq	48+8(%rsp,%r9,2),%rdi
1444	movq	-24(%rsi),%rax
1445	leaq	-32(%rdi,%rbp,1),%rdi
1446	movq	-16(%rsi),%rbx
1447	movq	%rax,%r15
1448
1449	mulq	%r14
1450	addq	%rax,%r10
1451	movq	%rbx,%rax
1452	movq	%rdx,%r11
1453	adcq	$0,%r11
1454
1455	mulq	%r14
1456	addq	%rax,%r11
1457	movq	%rbx,%rax
1458	movq	%r10,-24(%rdi)
1459	movq	%rdx,%r10
1460	adcq	$0,%r10
1461	addq	%r13,%r11
1462	movq	-8(%rsi),%rbx
1463	adcq	$0,%r10
1464
1465	mulq	%r15
1466	addq	%rax,%r12
1467	movq	%rbx,%rax
1468	movq	%r11,-16(%rdi)
1469	movq	%rdx,%r13
1470	adcq	$0,%r13
1471
1472	mulq	%r14
1473	addq	%rax,%r10
1474	movq	%rbx,%rax
1475	movq	%rdx,%r11
1476	adcq	$0,%r11
1477	addq	%r12,%r10
1478	adcq	$0,%r11
1479	movq	%r10,-8(%rdi)
1480
1481	mulq	%r15
1482	addq	%rax,%r13
1483	movq	-16(%rsi),%rax
1484	adcq	$0,%rdx
1485	addq	%r11,%r13
1486	adcq	$0,%rdx
1487
1488	movq	%r13,(%rdi)
1489	movq	%rdx,%r12
1490	movq	%rdx,8(%rdi)
1491
1492	mulq	%rbx
1493	addq	$16,%rbp
1494	xorq	%r14,%r14
1495	subq	%r9,%rbp
1496	xorq	%r15,%r15
1497
1498	addq	%r12,%rax
1499	adcq	$0,%rdx
1500	movq	%rax,8(%rdi)
1501	movq	%rdx,16(%rdi)
1502	movq	%r15,24(%rdi)
1503
1504	movq	-16(%rsi,%rbp,1),%rax
1505	leaq	48+8(%rsp),%rdi
1506	xorq	%r10,%r10
1507	movq	8(%rdi),%r11
1508
1509	leaq	(%r14,%r10,2),%r12
1510	shrq	$63,%r10
1511	leaq	(%rcx,%r11,2),%r13
1512	shrq	$63,%r11
1513	orq	%r10,%r13
1514	movq	16(%rdi),%r10
1515	movq	%r11,%r14
1516	mulq	%rax
1517	negq	%r15
1518	movq	24(%rdi),%r11
1519	adcq	%rax,%r12
1520	movq	-8(%rsi,%rbp,1),%rax
1521	movq	%r12,(%rdi)
1522	adcq	%rdx,%r13
1523
1524	leaq	(%r14,%r10,2),%rbx
1525	movq	%r13,8(%rdi)
1526	sbbq	%r15,%r15
1527	shrq	$63,%r10
1528	leaq	(%rcx,%r11,2),%r8
1529	shrq	$63,%r11
1530	orq	%r10,%r8
1531	movq	32(%rdi),%r10
1532	movq	%r11,%r14
1533	mulq	%rax
1534	negq	%r15
1535	movq	40(%rdi),%r11
1536	adcq	%rax,%rbx
1537	movq	0(%rsi,%rbp,1),%rax
1538	movq	%rbx,16(%rdi)
1539	adcq	%rdx,%r8
1540	leaq	16(%rbp),%rbp
1541	movq	%r8,24(%rdi)
1542	sbbq	%r15,%r15
1543	leaq	64(%rdi),%rdi
1544	jmp	.Lsqr4x_shift_n_add
1545
1546.align	32
1547.Lsqr4x_shift_n_add:
1548	leaq	(%r14,%r10,2),%r12
1549	shrq	$63,%r10
1550	leaq	(%rcx,%r11,2),%r13
1551	shrq	$63,%r11
1552	orq	%r10,%r13
1553	movq	-16(%rdi),%r10
1554	movq	%r11,%r14
1555	mulq	%rax
1556	negq	%r15
1557	movq	-8(%rdi),%r11
1558	adcq	%rax,%r12
1559	movq	-8(%rsi,%rbp,1),%rax
1560	movq	%r12,-32(%rdi)
1561	adcq	%rdx,%r13
1562
1563	leaq	(%r14,%r10,2),%rbx
1564	movq	%r13,-24(%rdi)
1565	sbbq	%r15,%r15
1566	shrq	$63,%r10
1567	leaq	(%rcx,%r11,2),%r8
1568	shrq	$63,%r11
1569	orq	%r10,%r8
1570	movq	0(%rdi),%r10
1571	movq	%r11,%r14
1572	mulq	%rax
1573	negq	%r15
1574	movq	8(%rdi),%r11
1575	adcq	%rax,%rbx
1576	movq	0(%rsi,%rbp,1),%rax
1577	movq	%rbx,-16(%rdi)
1578	adcq	%rdx,%r8
1579
1580	leaq	(%r14,%r10,2),%r12
1581	movq	%r8,-8(%rdi)
1582	sbbq	%r15,%r15
1583	shrq	$63,%r10
1584	leaq	(%rcx,%r11,2),%r13
1585	shrq	$63,%r11
1586	orq	%r10,%r13
1587	movq	16(%rdi),%r10
1588	movq	%r11,%r14
1589	mulq	%rax
1590	negq	%r15
1591	movq	24(%rdi),%r11
1592	adcq	%rax,%r12
1593	movq	8(%rsi,%rbp,1),%rax
1594	movq	%r12,0(%rdi)
1595	adcq	%rdx,%r13
1596
1597	leaq	(%r14,%r10,2),%rbx
1598	movq	%r13,8(%rdi)
1599	sbbq	%r15,%r15
1600	shrq	$63,%r10
1601	leaq	(%rcx,%r11,2),%r8
1602	shrq	$63,%r11
1603	orq	%r10,%r8
1604	movq	32(%rdi),%r10
1605	movq	%r11,%r14
1606	mulq	%rax
1607	negq	%r15
1608	movq	40(%rdi),%r11
1609	adcq	%rax,%rbx
1610	movq	16(%rsi,%rbp,1),%rax
1611	movq	%rbx,16(%rdi)
1612	adcq	%rdx,%r8
1613	movq	%r8,24(%rdi)
1614	sbbq	%r15,%r15
1615	leaq	64(%rdi),%rdi
1616	addq	$32,%rbp
1617	jnz	.Lsqr4x_shift_n_add
1618
1619	leaq	(%r14,%r10,2),%r12
1620.byte	0x67
1621	shrq	$63,%r10
1622	leaq	(%rcx,%r11,2),%r13
1623	shrq	$63,%r11
1624	orq	%r10,%r13
1625	movq	-16(%rdi),%r10
1626	movq	%r11,%r14
1627	mulq	%rax
1628	negq	%r15
1629	movq	-8(%rdi),%r11
1630	adcq	%rax,%r12
1631	movq	-8(%rsi),%rax
1632	movq	%r12,-32(%rdi)
1633	adcq	%rdx,%r13
1634
1635	leaq	(%r14,%r10,2),%rbx
1636	movq	%r13,-24(%rdi)
1637	sbbq	%r15,%r15
1638	shrq	$63,%r10
1639	leaq	(%rcx,%r11,2),%r8
1640	shrq	$63,%r11
1641	orq	%r10,%r8
1642	mulq	%rax
1643	negq	%r15
1644	adcq	%rax,%rbx
1645	adcq	%rdx,%r8
1646	movq	%rbx,-16(%rdi)
1647	movq	%r8,-8(%rdi)
1648.byte	102,72,15,126,213
/*
 * __bn_sqr8x_reduction: secondary entry point inside bn_sqr8x_internal
 * (the .size directive at the end of this span closes that symbol).
 * Plain mul/adc (non-MULX) Montgomery reduction of the double-width
 * square held in the stack frame, processed 8 limbs per outer round.
 *
 * NOTE(review): this file is auto-generated from x86_64-mont5.pl and
 * marked "Do not modify" -- only comments were added, no instruction
 * was changed.  Register roles inferred from this chunk (confirm
 * against the generating perl script):
 *   %rbp          walks the modulus, 64 bytes (8 limbs) per round
 *   %rdi          walks the temporary result on the stack
 *   32+8(%rsp)    presumably n0 = -m^-1 mod 2^64 (see imulq below)
 *   0+8(%rsp)     end-of-modulus sentinel, 8+8(%rsp) end-of-result
 */
__bn_sqr8x_reduction:
	xorq	%rax,%rax
	leaq	(%r9,%rbp,1),%rcx
	leaq	48+8(%rsp,%r9,2),%rdx
	movq	%rcx,0+8(%rsp)
	leaq	48+8(%rsp,%r9,1),%rdi
	movq	%rdx,8+8(%rsp)
	negq	%r9
	jmp	.L8x_reduction_loop

.align	32
.L8x_reduction_loop:
	/* load the next 8 result limbs into r8..r15 (rbx = low limb) */
	leaq	(%rdi,%r9,1),%rdi
.byte	0x66
	movq	0(%rdi),%rbx
	movq	8(%rdi),%r9
	movq	16(%rdi),%r10
	movq	24(%rdi),%r11
	movq	32(%rdi),%r12
	movq	40(%rdi),%r13
	movq	48(%rdi),%r14
	movq	56(%rdi),%r15
	movq	%rax,(%rdx)
	leaq	64(%rdi),%rdi

.byte	0x67
	movq	%rbx,%r8
	/* rbx = low limb * n0 -> Montgomery multiplier for this round */
	imulq	32+8(%rsp),%rbx
	movq	0(%rbp),%rax
	movl	$8,%ecx
	jmp	.L8x_reduce

.align	32
.L8x_reduce:
	/* 8 iterations (%ecx): fold rbx * modulus limbs into r8..r15;
	 * the multiplier for each iteration is stashed on the stack for
	 * reuse by the .L8x_tail pass below */
	mulq	%rbx
	movq	8(%rbp),%rax
	negq	%r8
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	movq	%rbx,48-8+8(%rsp,%rcx,8)
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	32+8(%rsp),%rsi
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	/* precompute next multiplier: rsi = r8 * n0 (overlapped with muls) */
	imulq	%r8,%rsi
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rbp),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	%rsi,%rbx
	addq	%rax,%r15
	movq	0(%rbp),%rax
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	.L8x_reduce

	leaq	64(%rbp),%rbp
	xorq	%rax,%rax
	movq	8+8(%rsp),%rdx
	cmpq	0+8(%rsp),%rbp
	jae	.L8x_no_tail

	/* absorb the next 8 result limbs; rsi records the carry (0/-1) */
.byte	0x66
	addq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi

	movq	48+56+8(%rsp),%rbx
	movl	$8,%ecx
	movq	0(%rbp),%rax
	jmp	.L8x_tail

.align	32
.L8x_tail:
	/* propagate the reduction across the remaining limbs of the
	 * product, replaying the saved multipliers (48-16+8(%rsp,%rcx,8)) */
	mulq	%rbx
	addq	%rax,%r8
	movq	8(%rbp),%rax
	movq	%r8,(%rdi)
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	leaq	8(%rdi),%rdi
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rbp),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	48-16+8(%rsp,%rcx,8),%rbx
	addq	%rax,%r15
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	0(%rbp),%rax
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jnz	.L8x_tail

	leaq	64(%rbp),%rbp
	movq	8+8(%rsp),%rdx
	cmpq	0+8(%rsp),%rbp
	jae	.L8x_tail_done

	/* more limbs remain: restore the saved borrow (negq %rsi) and
	 * absorb the next 8 result limbs before looping back to the tail */
	movq	48+56+8(%rsp),%rbx
	negq	%rsi
	movq	0(%rbp),%rax
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	sbbq	%rsi,%rsi

	movl	$8,%ecx
	jmp	.L8x_tail

.align	32
.L8x_tail_done:
	/* fold in the top word saved at the start of the round */
	addq	(%rdx),%r8
	adcq	$0,%r9
	adcq	$0,%r10
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15


	xorq	%rax,%rax

	negq	%rsi
.L8x_no_tail:
	/* final carry fold for this round; %rax accumulates the top carry */
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	adcq	$0,%rax
	movq	-8(%rbp),%rcx
	xorq	%rsi,%rsi

	/* hand-encoded: movq %xmm2,%rbp (restore pointer stashed in xmm2) */
.byte	102,72,15,126,213

	movq	%r8,0(%rdi)
	movq	%r9,8(%rdi)
	/* hand-encoded: movq %xmm3,%r9 */
.byte	102,73,15,126,217
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)
	leaq	64(%rdi),%rdi

	cmpq	%rdx,%rdi
	jb	.L8x_reduction_loop
	.byte	0xf3,0xc3
.size	bn_sqr8x_internal,.-bn_sqr8x_internal
/*
 * __bn_post4x_internal: conditional final subtraction after the non-MULX
 * squaring/reduction.  Adds the masked complement of the modulus
 * ((~m[i] & mask) + carry) to the result 4 limbs at a time -- i.e.
 * subtracts the modulus only when the mask is all-ones -- writing the
 * canonical result to %rdi.
 * NOTE(review): %rax is expected to be a 0/-1 borrow mask on entry (it
 * is negated into CF/mask form here); confirm against x86_64-mont5.pl.
 */
.type	__bn_post4x_internal,@function
.align	32
__bn_post4x_internal:
	movq	0(%rbp),%r12
	leaq	(%rdi,%r9,1),%rbx
	movq	%r9,%rcx
	/* hand-encoded: movq %xmm1,%rdi (result pointer stashed in xmm1) */
.byte	102,72,15,126,207
	negq	%rax
	/* hand-encoded: movq %xmm1,%rsi (same value, for back-to-back call) */
.byte	102,72,15,126,206
	sarq	$3+2,%rcx
	decq	%r12
	xorq	%r10,%r10
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqr4x_sub_entry

.align	16
.Lsqr4x_sub:
	movq	0(%rbp),%r12
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
.Lsqr4x_sub_entry:
	leaq	32(%rbp),%rbp
	/* r12..r15 = ~m[i] & mask -- all-zero when no subtraction needed */
	notq	%r12
	notq	%r13
	notq	%r14
	notq	%r15
	andq	%rax,%r12
	andq	%rax,%r13
	andq	%rax,%r14
	andq	%rax,%r15

	/* reload borrow from r10 (0/-1), then masked add == conditional sub */
	negq	%r10
	adcq	0(%rbx),%r12
	adcq	8(%rbx),%r13
	adcq	16(%rbx),%r14
	adcq	24(%rbx),%r15
	movq	%r12,0(%rdi)
	leaq	32(%rbx),%rbx
	movq	%r13,8(%rdi)
	sbbq	%r10,%r10
	movq	%r14,16(%rdi)
	movq	%r15,24(%rdi)
	leaq	32(%rdi),%rdi

	incq	%rcx
	jnz	.Lsqr4x_sub

	movq	%r9,%r10
	negq	%r9
	.byte	0xf3,0xc3
.size	__bn_post4x_internal,.-__bn_post4x_internal
/*
 * bn_from_montgomery: public dispatcher.  Only handles limb counts that
 * are a multiple of 8 -- those tail-jump into bn_from_mont8x (which
 * returns 1); any other count returns 0 so the caller falls back to a
 * generic path.
 */
.globl	bn_from_montgomery
.type	bn_from_montgomery,@function
.align	32
bn_from_montgomery:
	testl	$7,%r9d
	jz	bn_from_mont8x
	xorl	%eax,%eax
	.byte	0xf3,0xc3
.size	bn_from_montgomery,.-bn_from_montgomery
1973
/*
 * bn_from_mont8x: out-of-Montgomery conversion for num%8 == 0 (reached
 * only by tail-jump from bn_from_montgomery).  Saves callee-saved
 * registers, carves out and page-probes a stack frame, copies the input
 * into the frame while zeroing the upper half (.Lmul_by_1), then runs
 * one Montgomery reduction + conditional subtraction -- choosing the
 * MULX/ADX helpers when the CPU supports them -- and finally wipes the
 * frame and returns 1.
 */
.type	bn_from_mont8x,@function
.align	32
bn_from_mont8x:
.byte	0x67
	movq	%rsp,%rax
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	shll	$3,%r9d
	leaq	(%r9,%r9,2),%r10
	negq	%r9
	movq	(%r8),%r8

	/* pick a frame placement that avoids 4K-aliasing with the output
	 * buffer (%rdi); alternate placement taken when it would alias */
	leaq	-320(%rsp,%r9,2),%r11
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lfrom_sp_alt
	subq	%r11,%rsp
	leaq	-320(%rsp,%r9,2),%rsp
	jmp	.Lfrom_sp_done

.align	32
.Lfrom_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rsp,%r9,2),%rsp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rsp
.Lfrom_sp_done:
	andq	$-64,%rsp
	movq	%rax,%r11
	subq	%rsp,%r11
	andq	$-4096,%r11
.Lfrom_page_walk:
	/* touch every page of the new frame so the guard page is grown */
	movq	(%rsp,%r11,1),%r10
	subq	$4096,%r11
.byte	0x2e
	jnc	.Lfrom_page_walk

	movq	%r9,%r10
	negq	%r9

	/* 32(%rsp) = n0, 40(%rsp) = saved original %rsp */
	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)
.Lfrom_body:
	movq	%r9,%r11
	leaq	48(%rsp),%rax
	pxor	%xmm0,%xmm0
	jmp	.Lmul_by_1

.align	32
.Lmul_by_1:
	/* copy input into low half of the frame, zero the high half */
	movdqu	(%rsi),%xmm1
	movdqu	16(%rsi),%xmm2
	movdqu	32(%rsi),%xmm3
	movdqa	%xmm0,(%rax,%r9,1)
	movdqu	48(%rsi),%xmm4
	movdqa	%xmm0,16(%rax,%r9,1)
	/* hand-encoded: leaq 64(%rsi),%rsi */
.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00
	movdqa	%xmm1,(%rax)
	movdqa	%xmm0,32(%rax,%r9,1)
	movdqa	%xmm2,16(%rax)
	movdqa	%xmm0,48(%rax,%r9,1)
	movdqa	%xmm3,32(%rax)
	movdqa	%xmm4,48(%rax)
	leaq	64(%rax),%rax
	subq	$64,%r11
	jnz	.Lmul_by_1

	/* hand-encoded: movq %rdi,%xmm1 ; movq %rcx,%xmm2 -- stash
	 * pointers for the reduction helpers */
.byte	102,72,15,110,207
.byte	102,72,15,110,209
.byte	0x67
	movq	%rcx,%rbp
	/* hand-encoded: movq %r10,%xmm3 */
.byte	102,73,15,110,218
	/* capability test -- presumably the ADX/BMI2 feature bits; when both
	 * are present take the MULX path (confirm bits in x86_64cpuid) */
	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
	andl	$0x80108,%r11d
	cmpl	$0x80108,%r11d
	jne	.Lfrom_mont_nox

	leaq	(%rax,%r9,1),%rdi
	call	__bn_sqrx8x_reduction
	call	__bn_postx4x_internal

	pxor	%xmm0,%xmm0
	leaq	48(%rsp),%rax
	movq	40(%rsp),%rsi
	jmp	.Lfrom_mont_zero

.align	32
.Lfrom_mont_nox:
	call	__bn_sqr8x_reduction
	call	__bn_post4x_internal

	pxor	%xmm0,%xmm0
	leaq	48(%rsp),%rax
	movq	40(%rsp),%rsi
	jmp	.Lfrom_mont_zero

.align	32
.Lfrom_mont_zero:
	/* scrub the temporary frame (contains secret intermediates) */
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm0,16(%rax)
	movdqa	%xmm0,32(%rax)
	movdqa	%xmm0,48(%rax)
	leaq	64(%rax),%rax
	subq	$32,%r9
	jnz	.Lfrom_mont_zero

	movq	$1,%rax
	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lfrom_epilogue:
	.byte	0xf3,0xc3
.size	bn_from_mont8x,.-bn_from_mont8x
/*
 * bn_mulx4x_mont_gather5: MULX/ADX-capable Montgomery multiplication
 * entry point.  Pure frame-management wrapper: saves callee-saved
 * registers, allocates and page-probes a stack frame (with the same
 * 4K-aliasing avoidance as the other entry points), stores n0 and the
 * saved %rsp, delegates all arithmetic to mulx4x_internal, then
 * restores registers and returns 1.
 */
.type	bn_mulx4x_mont_gather5,@function
.align	32
bn_mulx4x_mont_gather5:
.Lmulx4x_enter:
	movq	%rsp,%rax
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	shll	$3,%r9d
	leaq	(%r9,%r9,2),%r10
	negq	%r9
	movq	(%r8),%r8

	/* frame placement: avoid 4K aliasing with the output buffer */
	leaq	-320(%rsp,%r9,2),%r11
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lmulx4xsp_alt
	subq	%r11,%rsp
	leaq	-320(%rsp,%r9,2),%rsp
	jmp	.Lmulx4xsp_done

.Lmulx4xsp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rsp,%r9,2),%rsp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rsp
.Lmulx4xsp_done:
	andq	$-64,%rsp
	movq	%rax,%r11
	subq	%rsp,%r11
	andq	$-4096,%r11
.Lmulx4x_page_walk:
	/* touch every page of the new frame so the guard page is grown */
	movq	(%rsp,%r11,1),%r10
	subq	$4096,%r11
.byte	0x2e
	jnc	.Lmulx4x_page_walk

	/* 32(%rsp) = n0, 40(%rsp) = saved original %rsp */
	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)
.Lmulx4x_body:
	call	mulx4x_internal

	movq	40(%rsp),%rsi
	movq	$1,%rax

	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lmulx4x_epilogue:
	.byte	0xf3,0xc3
.size	bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
2198
/*
 * mulx4x_internal: MULX/ADX Montgomery multiplication core.
 * First builds 16 comparison masks on the stack from .Linc and the
 * requested index (pshufd/pcmpeqd sequence, masks stored at
 * 112..352(%r10)); each multiplier limb is then selected from the
 * 16-entry power table at %rdx(+128) by AND/OR-ing across *all*
 * entries, so no data-dependent address is ever formed (cache-timing
 * defence).  The selected limb arrives in %rdx via "movq %xmm0,%rdx"
 * and feeds a 4-way interleaved MULX multiply + reduce using the two
 * independent carry chains (ADCX=CF, ADOX=OF).  Falls through into
 * .Lsqrx4x_sub_entry (in __bn_postx4x_internal) for the final
 * conditional subtraction.
 * NOTE(review): auto-generated file -- comments only, no code changed.
 */
.type	mulx4x_internal,@function
.align	32
mulx4x_internal:
	movq	%r9,8(%rsp)
	movq	%r9,%r10
	negq	%r9
	shlq	$5,%r9
	negq	%r10
	leaq	128(%rdx,%r9,1),%r13
	shrq	$5+5,%r9
	movd	8(%rax),%xmm5
	subq	$1,%r9
	leaq	.Linc(%rip),%rax
	movq	%r13,16+8(%rsp)
	movq	%r9,24+8(%rsp)
	movq	%rdi,56+8(%rsp)
	movdqa	0(%rax),%xmm0
	movdqa	16(%rax),%xmm1
	leaq	88-112(%rsp,%r10,1),%r10
	leaq	128(%rdx),%rdi

	/* generate the 16 select masks: xmm5 holds the broadcast index,
	 * xmm0/xmm1/... count 0,1,2,... and pcmpeqd yields all-ones only
	 * for the matching table slot */
	pshufd	$0,%xmm5,%xmm5
	movdqa	%xmm1,%xmm4
.byte	0x67
	movdqa	%xmm1,%xmm2
.byte	0x67
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,112(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,128(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,144(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,160(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,176(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,192(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,208(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,224(%r10)
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,240(%r10)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,256(%r10)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,272(%r10)
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,288(%r10)
	movdqa	%xmm4,%xmm3
.byte	0x67
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,304(%r10)

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,320(%r10)

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,336(%r10)

	/* constant-time gather of b[0]: AND each table entry with its
	 * mask and OR everything together */
	pand	64(%rdi),%xmm0
	pand	80(%rdi),%xmm1
	pand	96(%rdi),%xmm2
	movdqa	%xmm3,352(%r10)
	pand	112(%rdi),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-128(%rdi),%xmm4
	movdqa	-112(%rdi),%xmm5
	movdqa	-96(%rdi),%xmm2
	pand	112(%r10),%xmm4
	movdqa	-80(%rdi),%xmm3
	pand	128(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	144(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	160(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-64(%rdi),%xmm4
	movdqa	-48(%rdi),%xmm5
	movdqa	-32(%rdi),%xmm2
	pand	176(%r10),%xmm4
	movdqa	-16(%rdi),%xmm3
	pand	192(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	208(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	224(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	0(%rdi),%xmm4
	movdqa	16(%rdi),%xmm5
	movdqa	32(%rdi),%xmm2
	pand	240(%r10),%xmm4
	movdqa	48(%rdi),%xmm3
	pand	256(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	272(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	288(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	pxor	%xmm1,%xmm0
	pshufd	$0x4e,%xmm0,%xmm1
	por	%xmm1,%xmm0
	leaq	256(%rdi),%rdi
	/* hand-encoded: movq %xmm0,%rdx -- the selected multiplier limb */
.byte	102,72,15,126,194
	leaq	64+32+8(%rsp),%rbx

	/* first 4 limbs: a * b[0], then fold in m * (t0 * n0) */
	movq	%rdx,%r9
	mulxq	0(%rsi),%r8,%rax
	mulxq	8(%rsi),%r11,%r12
	addq	%rax,%r11
	mulxq	16(%rsi),%rax,%r13
	adcq	%rax,%r12
	adcq	$0,%r13
	mulxq	24(%rsi),%rax,%r14

	movq	%r8,%r15
	imulq	32+8(%rsp),%r8
	xorq	%rbp,%rbp
	movq	%r8,%rdx

	movq	%rdi,8+8(%rsp)

	leaq	32(%rsi),%rsi
	adcxq	%rax,%r13
	adcxq	%rbp,%r14

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	movq	24+8(%rsp),%rdi
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r11,-24(%rbx)
	adcxq	%rax,%r12
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r12,-16(%rbx)
	jmp	.Lmulx4x_1st

.align	32
.Lmulx4x_1st:
	/* first outer round: no previous partial sums to add in */
	adcxq	%rbp,%r15
	mulxq	0(%rsi),%r10,%rax
	adcxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
.byte	0x67,0x67
	movq	%r8,%rdx
	adcxq	%rax,%r13
	adcxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-32(%rbx)
	adoxq	%r15,%r13
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	leaq	32(%rcx),%rcx
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_1st

	movq	8(%rsp),%rax
	adcq	%rbp,%r15
	leaq	(%rsi,%rax,1),%rsi
	addq	%r15,%r14
	movq	8+8(%rsp),%rdi
	adcq	%rbp,%rbp
	movq	%r14,-8(%rbx)
	jmp	.Lmulx4x_outer

.align	32
.Lmulx4x_outer:
	/* gather the next multiplier limb b[i] in constant time, exactly
	 * as above but with the masks read back from the stack */
	leaq	16-256(%rbx),%r10
	pxor	%xmm4,%xmm4
.byte	0x67,0x67
	pxor	%xmm5,%xmm5
	movdqa	-128(%rdi),%xmm0
	movdqa	-112(%rdi),%xmm1
	movdqa	-96(%rdi),%xmm2
	pand	256(%r10),%xmm0
	movdqa	-80(%rdi),%xmm3
	pand	272(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	288(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	304(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%rdi),%xmm0
	movdqa	-48(%rdi),%xmm1
	movdqa	-32(%rdi),%xmm2
	pand	320(%r10),%xmm0
	movdqa	-16(%rdi),%xmm3
	pand	336(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	352(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	368(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%rdi),%xmm0
	movdqa	16(%rdi),%xmm1
	movdqa	32(%rdi),%xmm2
	pand	384(%r10),%xmm0
	movdqa	48(%rdi),%xmm3
	pand	400(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	416(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	432(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%rdi),%xmm0
	movdqa	80(%rdi),%xmm1
	movdqa	96(%rdi),%xmm2
	pand	448(%r10),%xmm0
	movdqa	112(%rdi),%xmm3
	pand	464(%r10),%xmm1
	por	%xmm0,%xmm4
	pand	480(%r10),%xmm2
	por	%xmm1,%xmm5
	pand	496(%r10),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	pshufd	$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	leaq	256(%rdi),%rdi
	/* hand-encoded: movq %xmm0,%rdx -- the selected multiplier limb */
.byte	102,72,15,126,194

	movq	%rbp,(%rbx)
	leaq	32(%rbx,%rax,1),%rbx
	mulxq	0(%rsi),%r8,%r11
	xorq	%rbp,%rbp
	movq	%rdx,%r9
	mulxq	8(%rsi),%r14,%r12
	adoxq	-32(%rbx),%r8
	adcxq	%r14,%r11
	mulxq	16(%rsi),%r15,%r13
	adoxq	-24(%rbx),%r11
	adcxq	%r15,%r12
	mulxq	24(%rsi),%rdx,%r14
	adoxq	-16(%rbx),%r12
	adcxq	%rdx,%r13
	leaq	(%rcx,%rax,1),%rcx
	leaq	32(%rsi),%rsi
	adoxq	-8(%rbx),%r13
	adcxq	%rbp,%r14
	adoxq	%rbp,%r14

	/* Montgomery multiplier for this round: r8 = t0 * n0 */
	movq	%r8,%r15
	imulq	32+8(%rsp),%r8

	movq	%r8,%rdx
	xorq	%rbp,%rbp
	movq	%rdi,8+8(%rsp)

	mulxq	0(%rcx),%rax,%r10
	adcxq	%rax,%r15
	adoxq	%r11,%r10
	mulxq	8(%rcx),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11
	mulxq	16(%rcx),%rax,%r12
	adcxq	%rax,%r11
	adoxq	%r13,%r12
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	movq	24+8(%rsp),%rdi
	movq	%r10,-32(%rbx)
	adcxq	%rax,%r12
	movq	%r11,-24(%rbx)
	adoxq	%rbp,%r15
	movq	%r12,-16(%rbx)
	leaq	32(%rcx),%rcx
	jmp	.Lmulx4x_inner

.align	32
.Lmulx4x_inner:
	/* steady-state round: multiply, add previous partial sums from
	 * (%rbx), reduce -- dual ADCX/ADOX carry chains throughout */
	mulxq	0(%rsi),%r10,%rax
	adcxq	%rbp,%r15
	adoxq	%r14,%r10
	mulxq	8(%rsi),%r11,%r14
	adcxq	0(%rbx),%r10
	adoxq	%rax,%r11
	mulxq	16(%rsi),%r12,%rax
	adcxq	8(%rbx),%r11
	adoxq	%r14,%r12
	mulxq	24(%rsi),%r13,%r14
	movq	%r8,%rdx
	adcxq	16(%rbx),%r12
	adoxq	%rax,%r13
	adcxq	24(%rbx),%r13
	adoxq	%rbp,%r14
	leaq	32(%rsi),%rsi
	leaq	32(%rbx),%rbx
	adcxq	%rbp,%r14

	adoxq	%r15,%r10
	mulxq	0(%rcx),%rax,%r15
	adcxq	%rax,%r10
	adoxq	%r15,%r11
	mulxq	8(%rcx),%rax,%r15
	adcxq	%rax,%r11
	adoxq	%r15,%r12
	mulxq	16(%rcx),%rax,%r15
	movq	%r10,-40(%rbx)
	adcxq	%rax,%r12
	adoxq	%r15,%r13
	movq	%r11,-32(%rbx)
	mulxq	24(%rcx),%rax,%r15
	movq	%r9,%rdx
	leaq	32(%rcx),%rcx
	movq	%r12,-24(%rbx)
	adcxq	%rax,%r13
	adoxq	%rbp,%r15
	movq	%r13,-16(%rbx)

	decq	%rdi
	jnz	.Lmulx4x_inner

	movq	0+8(%rsp),%rax
	adcq	%rbp,%r15
	subq	0(%rbx),%rdi
	movq	8+8(%rsp),%rdi
	movq	16+8(%rsp),%r10
	adcq	%r15,%r14
	leaq	(%rsi,%rax,1),%rsi
	adcq	%rbp,%rbp
	movq	%r14,-8(%rbx)

	cmpq	%r10,%rdi
	jb	.Lmulx4x_outer

	/* all rounds done: build the 0/-1 subtraction mask in %rax and
	 * set up registers for the shared .Lsqrx4x_sub_entry tail */
	movq	-8(%rcx),%r10
	movq	%rbp,%r8
	movq	(%rcx,%rax,1),%r12
	leaq	(%rcx,%rax,1),%rbp
	movq	%rax,%rcx
	leaq	(%rbx,%rax,1),%rdi
	xorl	%eax,%eax
	xorq	%r15,%r15
	subq	%r14,%r10
	adcq	%r15,%r15
	orq	%r15,%r8
	sarq	$3+2,%rcx
	subq	%r8,%rax
	movq	56+8(%rsp),%rdx
	decq	%r12
	movq	8(%rbp),%r13
	xorq	%r8,%r8
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqrx4x_sub_entry
.size	mulx4x_internal,.-mulx4x_internal
/*
 * bn_powerx5: MULX/ADX variant of bn_power5 -- five back-to-back
 * squaring+post-subtract rounds followed by one gathered multiplication
 * (used by the windowed exponentiation code; see x86_64-mont5.pl).
 * This wrapper only manages the frame and register state; the
 * arithmetic lives in __bn_sqrx8x_internal / __bn_postx4x_internal /
 * mulx4x_internal.
 */
.type	bn_powerx5,@function
.align	32
bn_powerx5:
.Lpowerx5_enter:
	movq	%rsp,%rax
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	shll	$3,%r9d
	leaq	(%r9,%r9,2),%r10
	negq	%r9
	movq	(%r8),%r8

	/* frame placement: avoid 4K aliasing with the output buffer */
	leaq	-320(%rsp,%r9,2),%r11
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lpwrx_sp_alt
	subq	%r11,%rsp
	leaq	-320(%rsp,%r9,2),%rsp
	jmp	.Lpwrx_sp_done

.align	32
.Lpwrx_sp_alt:
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rsp,%r9,2),%rsp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rsp
.Lpwrx_sp_done:
	andq	$-64,%rsp
	movq	%rax,%r11
	subq	%rsp,%r11
	andq	$-4096,%r11
.Lpwrx_page_walk:
	/* touch every page of the new frame so the guard page is grown */
	movq	(%rsp,%r11,1),%r10
	subq	$4096,%r11
.byte	0x2e
	jnc	.Lpwrx_page_walk

	movq	%r9,%r10
	negq	%r9

	pxor	%xmm0,%xmm0
	/* hand-encoded pointer stashes for the helper calls:
	 * movq %rdi,%xmm1 ; movq %rcx,%xmm2 ; movq %r10,%xmm3 ;
	 * movq %rdx,%xmm4 */
.byte	102,72,15,110,207
.byte	102,72,15,110,209
.byte	102,73,15,110,218
.byte	102,72,15,110,226
	movq	%r8,32(%rsp)
	movq	%rax,40(%rsp)
.Lpowerx5_body:

	/* five squaring rounds, each followed by the conditional final
	 * subtraction */
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal
	call	__bn_sqrx8x_internal
	call	__bn_postx4x_internal

	movq	%r10,%r9
	movq	%rsi,%rdi
	/* hand-encoded: movq %xmm2,%rcx ; movq %xmm4,%rdx (restore) */
.byte	102,72,15,126,209
.byte	102,72,15,126,226
	movq	40(%rsp),%rax

	call	mulx4x_internal

	movq	40(%rsp),%rsi
	movq	$1,%rax

	movq	-48(%rsi),%r15
	movq	-40(%rsi),%r14
	movq	-32(%rsi),%r13
	movq	-24(%rsi),%r12
	movq	-16(%rsi),%rbp
	movq	-8(%rsi),%rbx
	leaq	(%rsi),%rsp
.Lpowerx5_epilogue:
	.byte	0xf3,0xc3
.size	bn_powerx5,.-bn_powerx5
2728
/*
 * bn_sqrx8x_internal: MULX/ADX squaring core.  Zeroes the on-stack
 * result area, accumulates the off-diagonal products 8 limbs at a time
 * (.Lsqrx8x_outer_loop / .Lsqrx8x_loop), doubles them while adding the
 * diagonal squares (.Lsqrx4x_shift_n_add), and then falls through into
 * the second entry point __bn_sqrx8x_reduction which performs the
 * Montgomery reduction (.Lsqrx8x_reduction_loop et al.).
 * NOTE(review): auto-generated from x86_64-mont5.pl ("Do not modify")
 * -- comments only, no instruction changed.  Inferred roles: %rsi =
 * input, %rbp = second operand pointer / later modulus, %rdi = on-stack
 * result, %r9 = byte count; confirm against the perl source.
 */
.globl	bn_sqrx8x_internal
.hidden	bn_sqrx8x_internal
.type	bn_sqrx8x_internal,@function
.align	32
bn_sqrx8x_internal:
__bn_sqrx8x_internal:

	leaq	48+8(%rsp),%rdi
	leaq	(%rsi,%r9,1),%rbp
	movq	%r9,0+8(%rsp)
	movq	%rbp,8+8(%rsp)
	jmp	.Lsqr8x_zero_start

.align	32
/* hand-encoded multi-byte nop (0x66 0x66 0x66 0x2e 0x0f 0x1f ...) used
 * purely as alignment padding */
.byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
.Lsqrx8x_zero:
.byte	0x3e
	movdqa	%xmm0,0(%rdi)
	movdqa	%xmm0,16(%rdi)
	movdqa	%xmm0,32(%rdi)
	movdqa	%xmm0,48(%rdi)
.Lsqr8x_zero_start:
	movdqa	%xmm0,64(%rdi)
	movdqa	%xmm0,80(%rdi)
	movdqa	%xmm0,96(%rdi)
	movdqa	%xmm0,112(%rdi)
	leaq	128(%rdi),%rdi
	subq	$64,%r9
	jnz	.Lsqrx8x_zero

	movq	0(%rsi),%rdx

	xorq	%r10,%r10
	xorq	%r11,%r11
	xorq	%r12,%r12
	xorq	%r13,%r13
	xorq	%r14,%r14
	xorq	%r15,%r15
	leaq	48+8(%rsp),%rdi
	xorq	%rbp,%rbp
	jmp	.Lsqrx8x_outer_loop

.align	32
.Lsqrx8x_outer_loop:
	/* a[0..7] cross products: a[i]*a[j] for j>i, one source limb per
	 * mulx chain; several mulx are hand-encoded (.byte 0xc4,...) */
	mulxq	8(%rsi),%r8,%rax
	adcxq	%r9,%r8
	adoxq	%rax,%r10
	mulxq	16(%rsi),%r9,%rax
	adcxq	%r10,%r9
	adoxq	%rax,%r11
.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
	adcxq	%r11,%r10
	adoxq	%rax,%r12
.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
	adcxq	%r12,%r11
	adoxq	%rax,%r13
	mulxq	40(%rsi),%r12,%rax
	adcxq	%r13,%r12
	adoxq	%rax,%r14
	mulxq	48(%rsi),%r13,%rax
	adcxq	%r14,%r13
	adoxq	%r15,%rax
	mulxq	56(%rsi),%r14,%r15
	movq	8(%rsi),%rdx
	adcxq	%rax,%r14
	adoxq	%rbp,%r15
	adcq	64(%rdi),%r15
	movq	%r8,8(%rdi)
	movq	%r9,16(%rdi)
	sbbq	%rcx,%rcx
	xorq	%rbp,%rbp


	mulxq	16(%rsi),%r8,%rbx
	mulxq	24(%rsi),%r9,%rax
	adcxq	%r10,%r8
	adoxq	%rbx,%r9
	mulxq	32(%rsi),%r10,%rbx
	adcxq	%r11,%r9
	adoxq	%rax,%r10
.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
	adcxq	%r12,%r10
	adoxq	%rbx,%r11
.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
	adcxq	%r13,%r11
	adoxq	%r14,%r12
.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
	movq	16(%rsi),%rdx
	adcxq	%rax,%r12
	adoxq	%rbx,%r13
	adcxq	%r15,%r13
	adoxq	%rbp,%r14
	adcxq	%rbp,%r14

	movq	%r8,24(%rdi)
	movq	%r9,32(%rdi)

	mulxq	24(%rsi),%r8,%rbx
	mulxq	32(%rsi),%r9,%rax
	adcxq	%r10,%r8
	adoxq	%rbx,%r9
	mulxq	40(%rsi),%r10,%rbx
	adcxq	%r11,%r9
	adoxq	%rax,%r10
.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
	adcxq	%r12,%r10
	adoxq	%r13,%r11
.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
.byte	0x3e
	movq	24(%rsi),%rdx
	adcxq	%rbx,%r11
	adoxq	%rax,%r12
	adcxq	%r14,%r12
	movq	%r8,40(%rdi)
	movq	%r9,48(%rdi)
	mulxq	32(%rsi),%r8,%rax
	adoxq	%rbp,%r13
	adcxq	%rbp,%r13

	mulxq	40(%rsi),%r9,%rbx
	adcxq	%r10,%r8
	adoxq	%rax,%r9
	mulxq	48(%rsi),%r10,%rax
	adcxq	%r11,%r9
	adoxq	%r12,%r10
	mulxq	56(%rsi),%r11,%r12
	movq	32(%rsi),%rdx
	movq	40(%rsi),%r14
	adcxq	%rbx,%r10
	adoxq	%rax,%r11
	movq	48(%rsi),%r15
	adcxq	%r13,%r11
	adoxq	%rbp,%r12
	adcxq	%rbp,%r12

	movq	%r8,56(%rdi)
	movq	%r9,64(%rdi)

	mulxq	%r14,%r9,%rax
	movq	56(%rsi),%r8
	adcxq	%r10,%r9
	mulxq	%r15,%r10,%rbx
	adoxq	%rax,%r10
	adcxq	%r11,%r10
	mulxq	%r8,%r11,%rax
	movq	%r14,%rdx
	adoxq	%rbx,%r11
	adcxq	%r12,%r11

	adcxq	%rbp,%rax

	mulxq	%r15,%r14,%rbx
	mulxq	%r8,%r12,%r13
	movq	%r15,%rdx
	leaq	64(%rsi),%rsi
	adcxq	%r14,%r11
	adoxq	%rbx,%r12
	adcxq	%rax,%r12
	adoxq	%rbp,%r13

.byte	0x67,0x67
	mulxq	%r8,%r8,%r14
	adcxq	%r8,%r13
	adcxq	%rbp,%r14

	cmpq	8+8(%rsp),%rsi
	je	.Lsqrx8x_outer_break

	negq	%rcx
	movq	$-8,%rcx
	movq	%rbp,%r15
	movq	64(%rdi),%r8
	adcxq	72(%rdi),%r9
	adcxq	80(%rdi),%r10
	adcxq	88(%rdi),%r11
	adcq	96(%rdi),%r12
	adcq	104(%rdi),%r13
	adcq	112(%rdi),%r14
	adcq	120(%rdi),%r15
	leaq	(%rsi),%rbp
	leaq	128(%rdi),%rdi
	sbbq	%rax,%rax

	movq	-64(%rsi),%rdx
	movq	%rax,16+8(%rsp)
	movq	%rdi,24+8(%rsp)


	xorl	%eax,%eax
	jmp	.Lsqrx8x_loop

.align	32
.Lsqrx8x_loop:
	/* multiply the current 8-limb window of a by the remaining limbs,
	 * accumulating into r8..r15 with dual carry chains */
	movq	%r8,%rbx
	mulxq	0(%rbp),%rax,%r8
	adcxq	%rax,%rbx
	adoxq	%r9,%r8

	mulxq	8(%rbp),%rax,%r9
	adcxq	%rax,%r8
	adoxq	%r10,%r9

	mulxq	16(%rbp),%rax,%r10
	adcxq	%rax,%r9
	adoxq	%r11,%r10

	mulxq	24(%rbp),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11

.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
	adcxq	%rax,%r11
	adoxq	%r13,%r12

	mulxq	40(%rbp),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

	mulxq	48(%rbp),%rax,%r14
	movq	%rbx,(%rdi,%rcx,8)
	movl	$0,%ebx
	adcxq	%rax,%r13
	adoxq	%r15,%r14

.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
	movq	8(%rsi,%rcx,8),%rdx
	adcxq	%rax,%r14
	adoxq	%rbx,%r15
	adcxq	%rbx,%r15

.byte	0x67
	incq	%rcx
	jnz	.Lsqrx8x_loop

	leaq	64(%rbp),%rbp
	movq	$-8,%rcx
	cmpq	8+8(%rsp),%rbp
	je	.Lsqrx8x_break

	/* restore saved carry (16+8(%rsp) holds 0/-1) and absorb the next
	 * 8 partial-result limbs before continuing */
	subq	16+8(%rsp),%rbx
.byte	0x66
	movq	-64(%rsi),%rdx
	adcxq	0(%rdi),%r8
	adcxq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	leaq	64(%rdi),%rdi
.byte	0x67
	sbbq	%rax,%rax
	xorl	%ebx,%ebx
	movq	%rax,16+8(%rsp)
	jmp	.Lsqrx8x_loop

.align	32
.Lsqrx8x_break:
	subq	16+8(%rsp),%r8
	movq	24+8(%rsp),%rcx
	movq	0(%rsi),%rdx
	xorl	%ebp,%ebp
	movq	%r8,0(%rdi)
	cmpq	%rcx,%rdi
	je	.Lsqrx8x_outer_loop

	/* flush r9..r15 and reload the saved window before re-entering
	 * the outer loop */
	movq	%r9,8(%rdi)
	movq	8(%rcx),%r9
	movq	%r10,16(%rdi)
	movq	16(%rcx),%r10
	movq	%r11,24(%rdi)
	movq	24(%rcx),%r11
	movq	%r12,32(%rdi)
	movq	32(%rcx),%r12
	movq	%r13,40(%rdi)
	movq	40(%rcx),%r13
	movq	%r14,48(%rdi)
	movq	48(%rcx),%r14
	movq	%r15,56(%rdi)
	movq	56(%rcx),%r15
	movq	%rcx,%rdi
	jmp	.Lsqrx8x_outer_loop

.align	32
.Lsqrx8x_outer_break:
	/* off-diagonal accumulation done: store the top words, then
	 * double everything and add the squared diagonal terms */
	movq	%r9,72(%rdi)
	/* hand-encoded: movq %xmm3,%rcx */
.byte	102,72,15,126,217
	movq	%r10,80(%rdi)
	movq	%r11,88(%rdi)
	movq	%r12,96(%rdi)
	movq	%r13,104(%rdi)
	movq	%r14,112(%rdi)
	leaq	48+8(%rsp),%rdi
	movq	(%rsi,%rcx,1),%rdx

	movq	8(%rdi),%r11
	xorq	%r10,%r10
	movq	0+8(%rsp),%r9
	adoxq	%r11,%r11
	movq	16(%rdi),%r12
	movq	24(%rdi),%r13


.align	32
.Lsqrx4x_shift_n_add:
	/* each step: shift two result limbs left by one (adox reg,reg)
	 * and add a[i]^2 (mulx %rdx,%rdx squares the current limb) */
	mulxq	%rdx,%rax,%rbx
	adoxq	%r12,%r12
	adcxq	%r10,%rax
	/* hand-encoded: movq 8(%rsi,%rcx,1),%rdx ; movq 32(%rdi),%r10 */
.byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
.byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
	adoxq	%r13,%r13
	adcxq	%r11,%rbx
	movq	40(%rdi),%r11
	movq	%rax,0(%rdi)
	movq	%rbx,8(%rdi)

	mulxq	%rdx,%rax,%rbx
	adoxq	%r10,%r10
	adcxq	%r12,%rax
	movq	16(%rsi,%rcx,1),%rdx
	movq	48(%rdi),%r12
	adoxq	%r11,%r11
	adcxq	%r13,%rbx
	movq	56(%rdi),%r13
	movq	%rax,16(%rdi)
	movq	%rbx,24(%rdi)

	mulxq	%rdx,%rax,%rbx
	adoxq	%r12,%r12
	adcxq	%r10,%rax
	movq	24(%rsi,%rcx,1),%rdx
	leaq	32(%rcx),%rcx
	movq	64(%rdi),%r10
	adoxq	%r13,%r13
	adcxq	%r11,%rbx
	movq	72(%rdi),%r11
	movq	%rax,32(%rdi)
	movq	%rbx,40(%rdi)

	mulxq	%rdx,%rax,%rbx
	adoxq	%r10,%r10
	adcxq	%r12,%rax
	jrcxz	.Lsqrx4x_shift_n_add_break
	/* hand-encoded: movq 0(%rsi,%rcx,1),%rdx */
.byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
	adoxq	%r11,%r11
	adcxq	%r13,%rbx
	movq	80(%rdi),%r12
	movq	88(%rdi),%r13
	movq	%rax,48(%rdi)
	movq	%rbx,56(%rdi)
	leaq	64(%rdi),%rdi
	nop
	jmp	.Lsqrx4x_shift_n_add

.align	32
.Lsqrx4x_shift_n_add_break:
	adcxq	%r13,%rbx
	movq	%rax,48(%rdi)
	movq	%rbx,56(%rdi)
	leaq	64(%rdi),%rdi
	/* hand-encoded: movq %xmm2,%rbp -- restore the modulus pointer
	 * before falling through into the reduction */
.byte	102,72,15,126,213
/*
 * __bn_sqrx8x_reduction: secondary entry point (also called directly
 * from bn_from_mont8x).  MULX/ADX Montgomery reduction of the
 * double-width square: %rbx = n0, %rbp = modulus, result in the frame.
 */
__bn_sqrx8x_reduction:
	xorl	%eax,%eax
	movq	32+8(%rsp),%rbx
	movq	48+8(%rsp),%rdx
	leaq	-64(%rbp,%r9,1),%rcx

	movq	%rcx,0+8(%rsp)
	movq	%rdi,8+8(%rsp)

	leaq	48+8(%rsp),%rdi
	jmp	.Lsqrx8x_reduction_loop

.align	32
.Lsqrx8x_reduction_loop:
	movq	8(%rdi),%r9
	movq	16(%rdi),%r10
	movq	24(%rdi),%r11
	movq	32(%rdi),%r12
	movq	%rdx,%r8
	/* rdx = t0 * n0 -> Montgomery multiplier for this round */
	imulq	%rbx,%rdx
	movq	40(%rdi),%r13
	movq	48(%rdi),%r14
	movq	56(%rdi),%r15
	movq	%rax,24+8(%rsp)

	leaq	64(%rdi),%rdi
	xorq	%rsi,%rsi
	movq	$-8,%rcx
	jmp	.Lsqrx8x_reduce

.align	32
.Lsqrx8x_reduce:
	/* 8 iterations: fold multiplier * modulus limbs into r8..r15;
	 * each round's multiplier is saved at 64+48+8(%rsp,%rcx,8) and
	 * the next one is computed in-line (mulx 32+8(%rsp)) */
	movq	%r8,%rbx
	mulxq	0(%rbp),%rax,%r8
	adcxq	%rbx,%rax
	adoxq	%r9,%r8

	mulxq	8(%rbp),%rbx,%r9
	adcxq	%rbx,%r8
	adoxq	%r10,%r9

	mulxq	16(%rbp),%rbx,%r10
	adcxq	%rbx,%r9
	adoxq	%r11,%r10

	mulxq	24(%rbp),%rbx,%r11
	adcxq	%rbx,%r10
	adoxq	%r12,%r11

.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
	movq	%rdx,%rax
	movq	%r8,%rdx
	adcxq	%rbx,%r11
	adoxq	%r13,%r12

	mulxq	32+8(%rsp),%rbx,%rdx
	movq	%rax,%rdx
	movq	%rax,64+48+8(%rsp,%rcx,8)

	mulxq	40(%rbp),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

	mulxq	48(%rbp),%rax,%r14
	adcxq	%rax,%r13
	adoxq	%r15,%r14

	mulxq	56(%rbp),%rax,%r15
	movq	%rbx,%rdx
	adcxq	%rax,%r14
	adoxq	%rsi,%r15
	adcxq	%rsi,%r15

.byte	0x67,0x67,0x67
	incq	%rcx
	jnz	.Lsqrx8x_reduce

	movq	%rsi,%rax
	cmpq	0+8(%rsp),%rbp
	jae	.Lsqrx8x_no_tail

	movq	48+8(%rsp),%rdx
	addq	0(%rdi),%r8
	leaq	64(%rbp),%rbp
	movq	$-8,%rcx
	adcxq	8(%rdi),%r9
	adcxq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	leaq	64(%rdi),%rdi
	sbbq	%rax,%rax

	xorq	%rsi,%rsi
	movq	%rax,16+8(%rsp)
	jmp	.Lsqrx8x_tail

.align	32
.Lsqrx8x_tail:
	/* propagate the reduction across the remaining product limbs,
	 * replaying the saved multipliers (72+48+8(%rsp,%rcx,8)) */
	movq	%r8,%rbx
	mulxq	0(%rbp),%rax,%r8
	adcxq	%rax,%rbx
	adoxq	%r9,%r8

	mulxq	8(%rbp),%rax,%r9
	adcxq	%rax,%r8
	adoxq	%r10,%r9

	mulxq	16(%rbp),%rax,%r10
	adcxq	%rax,%r9
	adoxq	%r11,%r10

	mulxq	24(%rbp),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11

.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
	adcxq	%rax,%r11
	adoxq	%r13,%r12

	mulxq	40(%rbp),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

	mulxq	48(%rbp),%rax,%r14
	adcxq	%rax,%r13
	adoxq	%r15,%r14

	mulxq	56(%rbp),%rax,%r15
	movq	72+48+8(%rsp,%rcx,8),%rdx
	adcxq	%rax,%r14
	adoxq	%rsi,%r15
	movq	%rbx,(%rdi,%rcx,8)
	movq	%r8,%rbx
	adcxq	%rsi,%r15

	incq	%rcx
	jnz	.Lsqrx8x_tail

	cmpq	0+8(%rsp),%rbp
	jae	.Lsqrx8x_tail_done

	/* restore saved carry and absorb the next 8 result limbs */
	subq	16+8(%rsp),%rsi
	movq	48+8(%rsp),%rdx
	leaq	64(%rbp),%rbp
	adcq	0(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	leaq	64(%rdi),%rdi
	sbbq	%rax,%rax
	subq	$8,%rcx

	xorq	%rsi,%rsi
	movq	%rax,16+8(%rsp)
	jmp	.Lsqrx8x_tail

.align	32
.Lsqrx8x_tail_done:
	/* fold in the top word saved at the start of the round */
	addq	24+8(%rsp),%r8
	adcq	$0,%r9
	adcq	$0,%r10
	adcq	$0,%r11
	adcq	$0,%r12
	adcq	$0,%r13
	adcq	$0,%r14
	adcq	$0,%r15


	movq	%rsi,%rax

	subq	16+8(%rsp),%rsi
.Lsqrx8x_no_tail:
	/* final carry fold; %rax accumulates the top carry */
	adcq	0(%rdi),%r8
	/* hand-encoded: movq %xmm3,%rcx */
.byte	102,72,15,126,217
	adcq	8(%rdi),%r9
	movq	56(%rbp),%rsi
	/* hand-encoded: movq %xmm2,%rbp */
.byte	102,72,15,126,213
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15
	adcq	%rax,%rax

	movq	32+8(%rsp),%rbx
	movq	64(%rdi,%rcx,1),%rdx

	movq	%r8,0(%rdi)
	leaq	64(%rdi),%r8
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	leaq	64(%rdi,%rcx,1),%rdi
	cmpq	8+8(%rsp),%r8
	jb	.Lsqrx8x_reduction_loop
	.byte	0xf3,0xc3
.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
.align	32
# __bn_postx4x_internal — final conditional-subtract step after the MULX/ADX
# (x4x) Montgomery code paths.
#
# Computes, 4 limbs at a time:  out[i] = tmp[i] - (mask ? n[i] : 0)
# where  tmp = (%rdi),  n = modulus at (%rbp),  out = (%rdx) (restored from
# %xmm1 below), and mask is derived from the top carry in %rax.
# The subtraction is done as tmp + (~n & mask) + carry, i.e. two's-complement
# addition of the masked complement, which keeps the code branch-free
# (constant time with respect to the mask value).
#
# In:  %rax = final carry (presumably 0/1 from the caller — confirm),
#      %rbp = modulus, %rdi = temporary result,
#      %rcx = presumably -8*num (negative byte count) — confirm against caller,
#      %xmm1 = stashed output pointer.
# Out: result stored at the pointer recovered into %rdx; %r9 = -%rcx_in.
# Clobbers: %rax, %rcx, %rdx, %rsi, %rdi, %r8, %r10, %r12-%r15, flags.
# Requires BMI2 (ANDN).
__bn_postx4x_internal:
	movq	0(%rbp),%r12            # preload n[0]
	movq	%rcx,%r10               # save original count
	movq	%rcx,%r9                # (negated and returned at the end)
	negq	%rax                    # carry 0/1 -> select mask 0/-1
	sarq	$3+2,%rcx               # %rcx >>= 5: negative count of 4-limb groups
.byte	102,72,15,126,202
.byte	102,72,15,126,206
# the two .byte sequences above encode: movq %xmm1,%rdx ; movq %xmm1,%rsi
# (recover pointer(s) stashed in %xmm1 by the caller — both read %xmm1)
	decq	%r12                    # n[0]-1: since ~(n0-1) == -n0, this folds the
	movq	8(%rbp),%r13            #   "+1" of the two's-complement subtraction
	xorq	%r8,%r8                 #   into the first limb (no carry-in yet)
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
	jmp	.Lsqrx4x_sub_entry

.align	16
.Lsqrx4x_sub:
	movq	0(%rbp),%r12            # load next 4 modulus limbs
	movq	8(%rbp),%r13
	movq	16(%rbp),%r14
	movq	24(%rbp),%r15
.Lsqrx4x_sub_entry:
	andnq	%rax,%r12,%r12          # r12 = ~n[i]   & mask  (ANDN: ~src1 & src2)
	leaq	32(%rbp),%rbp
	andnq	%rax,%r13,%r13          # r13 = ~n[i+1] & mask
	andnq	%rax,%r14,%r14          # r14 = ~n[i+2] & mask
	andnq	%rax,%r15,%r15          # r15 = ~n[i+3] & mask

	negq	%r8                     # restore CF from the previous group's borrow
	adcq	0(%rdi),%r12            # r12 = tmp[i] + (~n[i]&mask) + CF
	adcq	8(%rdi),%r13            #   == tmp[i] - n[i] - borrow  when mask==-1
	adcq	16(%rdi),%r14           #   == tmp[i] + carry          when mask==0
	adcq	24(%rdi),%r15
	movq	%r12,0(%rdx)            # store selected result limbs
	leaq	32(%rdi),%rdi
	movq	%r13,8(%rdx)
	sbbq	%r8,%r8                 # capture borrow as 0/-1 for the next group
	movq	%r14,16(%rdx)
	movq	%r15,24(%rdx)
	leaq	32(%rdx),%rdx

	incq	%rcx                    # advance negative group counter toward 0
	jnz	.Lsqrx4x_sub

	negq	%r9                     # return positive original count in %r9

	.byte	0xf3,0xc3               # rep ret
.size	__bn_postx4x_internal,.-__bn_postx4x_internal
.globl	bn_get_bits5
.type	bn_get_bits5,@function
.align	16
# int bn_get_bits5(const void *ap /* %rdi */, int off /* %esi */)
#
# Returns the 5-bit window of the bit string at ap starting at bit offset
# `off`, i.e. (bits >> off) & 31.  The load is done as a single 16-bit read;
# when the window would straddle the top of a 16-bit word (off%16 > 11),
# the read is shifted forward one byte and the shift count reduced by 8 so
# the 5 bits always fit inside the loaded word.  Branch-free via CMOV.
bn_get_bits5:
	leaq	0(%rdi),%r10            # r10 = ap           (aligned base)
	leaq	1(%rdi),%r11            # r11 = ap + 1 byte  (shifted base)
	movl	%esi,%ecx
	shrl	$4,%esi                 # esi = off / 16 : 16-bit word index
	andl	$15,%ecx                # ecx = off % 16 : bit within that word
	leal	-8(%rcx),%eax           # eax = (off%16) - 8 : shift for shifted base
	cmpl	$11,%ecx
	cmovaq	%r11,%r10               # if off%16 > 11: window straddles the word,
	cmoval	%eax,%ecx               #   so use base+1 and shift 8 bits less
	movzwl	(%r10,%rsi,2),%eax      # zero-extended 16-bit load containing window
	shrl	%cl,%eax
	andl	$31,%eax                # keep the low 5 bits
	.byte	0xf3,0xc3               # rep ret
.size	bn_get_bits5,.-bn_get_bits5
3408
.globl	bn_scatter5
.type	bn_scatter5,@function
.align	16
# void bn_scatter5(const void *inp /* %rdi */, size_t num /* %esi */,
#                  void *tbl /* %rdx */, size_t idx /* %rcx */)
#
# Stores `num` 64-bit limbs from inp into table slot `idx`, one limb every
# 256 bytes: tbl[idx*8 + i*256] = inp[i].  This interleaved layout is what
# bn_gather5 reads back with a full-table sweep, so a limb's location never
# depends on secret data within a cache line of neighbors.
bn_scatter5:
	cmpl	$0,%esi                 # num == 0: nothing to do
	jz	.Lscatter_epilogue
	leaq	(%rdx,%rcx,8),%rdx      # rdx = &tbl[idx]
.Lscatter:
	movq	(%rdi),%rax             # next input limb
	leaq	8(%rdi),%rdi
	movq	%rax,(%rdx)
	leaq	256(%rdx),%rdx          # 256-byte stride between a value's limbs
	subl	$1,%esi
	jnz	.Lscatter
.Lscatter_epilogue:
	.byte	0xf3,0xc3               # rep ret
.size	bn_scatter5,.-bn_scatter5
3426
.globl	bn_gather5
.type	bn_gather5,@function
.align	32
# void bn_gather5(void *out /* %rdi */, size_t num /* %esi */,
#                 const void *tbl /* %rdx */, size_t idx /* %ecx */)
#
# Constant-time gather: reads back the value bn_scatter5 stored at slot
# `idx` (out[i] = tbl[idx*8 + i*256]) WITHOUT making any secret-dependent
# memory access.  For each limb it touches all 256 bytes of the table row
# and isolates the wanted qword with PAND/POR against precomputed equality
# masks, so the access pattern is independent of idx (cache-timing safe).
bn_gather5:
.LSEH_begin_bn_gather5:

.byte	0x4c,0x8d,0x14,0x24
.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00
# the two .byte sequences above encode: lea (%rsp),%r10 ; sub $0x108,%rsp
# (save original %rsp in %r10, carve out 0x108 bytes for the mask table;
# byte-encoded so the Win64 SEH prologue matcher sees fixed opcodes)
	leaq	.Linc(%rip),%rax
	andq	$-16,%rsp               # 16-byte align for movdqa stores below

	movd	%ecx,%xmm5
	movdqa	0(%rax),%xmm0           # xmm0 = {0,0,1,1}: first index pair
	movdqa	16(%rax),%xmm1          # xmm1 = {2,2,2,2}: per-step increment
	leaq	128(%rdx),%r11          # r11 = tbl + 128 (biased, offsets -128..112)
	leaq	128(%rsp),%rax          # rax = mask area + 128 (same bias)

	pshufd	$0,%xmm5,%xmm5          # broadcast idx to all four dwords
# Build 16 x 128-bit masks at -128(%rax)..112(%rax).  The running counter
# steps through index pairs {0,1},{2,3},...,{30,31}; PCMPEQD leaves all-ones
# in exactly the qword whose index equals idx, zeros everywhere else.
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0             # mask for indices {0,1}
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1             # mask for indices {2,3}
	movdqa	%xmm0,-128(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,-112(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,-96(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,-80(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,-64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,-48(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,-32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,-16(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,0(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,16(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,32(%rax)
	movdqa	%xmm4,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,48(%rax)
	movdqa	%xmm4,%xmm3

	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,64(%rax)
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,80(%rax)
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,96(%rax)
	movdqa	%xmm4,%xmm2
	movdqa	%xmm3,112(%rax)         # last mask: indices {30,31}
	jmp	.Lgather

.align	32
# Per-limb gather: AND all sixteen 16-byte table chunks with their masks and
# OR everything together.  Only the slot whose index matched idx survives.
.Lgather:
	pxor	%xmm4,%xmm4             # xmm4/xmm5: two OR accumulators
	pxor	%xmm5,%xmm5
	movdqa	-128(%r11),%xmm0
	movdqa	-112(%r11),%xmm1
	movdqa	-96(%r11),%xmm2
	pand	-128(%rax),%xmm0
	movdqa	-80(%r11),%xmm3
	pand	-112(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r11),%xmm0
	movdqa	-48(%r11),%xmm1
	movdqa	-32(%r11),%xmm2
	pand	-64(%rax),%xmm0
	movdqa	-16(%r11),%xmm3
	pand	-48(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r11),%xmm0
	movdqa	16(%r11),%xmm1
	movdqa	32(%r11),%xmm2
	pand	0(%rax),%xmm0
	movdqa	48(%r11),%xmm3
	pand	16(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r11),%xmm0
	movdqa	80(%r11),%xmm1
	movdqa	96(%r11),%xmm2
	pand	64(%rax),%xmm0
	movdqa	112(%r11),%xmm3
	pand	80(%rax),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rax),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rax),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4             # merge both accumulators
	leaq	256(%r11),%r11          # next 256-byte table row
	pshufd	$0x4e,%xmm4,%xmm0       # swap qwords so the surviving half
	por	%xmm4,%xmm0             #   lands in the low 64 bits either way
	movq	%xmm0,(%rdi)            # emit one gathered limb
	leaq	8(%rdi),%rdi
	subl	$1,%esi
	jnz	.Lgather

	leaq	(%r10),%rsp             # restore original stack pointer
	.byte	0xf3,0xc3               # rep ret
.LSEH_end_bn_gather5:
.size	bn_gather5,.-bn_gather5
.align	64
# Increment vectors used by bn_mul_mont_gather5/bn_gather5 when building
# the PCMPEQD index masks: first pair {0,0,1,1} seeds the counter, second
# {2,2,2,2} is added each step to walk index pairs {0,1},{2,3},...,{30,31}.
.Linc:
.long	0,0, 1,1
.long	2,2, 2,2
# ASCII banner (NUL-terminated): "Montgomery Multiplication with
# scatter/gather for x86_64, CRYPTOGAMS by <appro@openssl.org>"
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
3595