# x86_64-mont.S revision 298999
	# $FreeBSD: stable/10/secure/lib/libcrypto/amd64/x86_64-mont.S 298999 2016-05-03 18:54:20Z jkim $
.text

# -----------------------------------------------------------------------
# bn_mul_mont - Montgomery multiplication, generic word-at-a-time path
# plus the dispatcher for the unrolled variants.
# GNU as, AT&T syntax; arguments arrive in the System V AMD64 registers.
#
# Register arguments, as the code below uses them:
#   %rdi  result vector (written in .Lsub and .Lcopy)
#   %rsi  first operand vector
#   %rdx  second operand vector (one word is consumed per outer pass)
#   %rcx  modulus vector (subtracted in .Lsub)
#   %r8   pointer to one word; that word times the current low word
#         gives the per-pass multiplier for the modulus
#   %r9d  number of 64-bit words in each vector
# Returns 1 in %rax.  rbx, rbp and r12-r15 are saved and restored.
# NOTE(review): presumably the C view is
#   bn_mul_mont(rp, ap, bp, np, n0, num) - confirm against the prototype.
#
# Dispatch: a word count that is a multiple of 4 and at least 8 goes to
# .Lsqr4x_enter when %rsi equals %rdx, otherwise to .Lmul4x_enter.
# Every other count is handled here, starting at .Lmul_enter.
#
# Scratch frame (tp) built by .Lmul_enter:
#   0(%rsp) ... 8*num-8(%rsp)   tp[0..num-1]
#   (%rsp,%r9,8)                tp[num], the top carry word
#   8(%rsp,%r9,8)               stack pointer as it was after the pushes
# -----------------------------------------------------------------------
.globl	bn_mul_mont
.type	bn_mul_mont,@function
.align	16
bn_mul_mont:
	testl	$3,%r9d
	jnz	.Lmul_enter
	cmpl	$8,%r9d
	jb	.Lmul_enter
	cmpq	%rsi,%rdx
	jne	.Lmul4x_enter
	jmp	.Lsqr4x_enter

.align	16
.Lmul_enter:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	# Zero-extend the word count, reserve num+2 words below the saved
	# registers and round the stack pointer down to a 1024-byte boundary.
	movl	%r9d,%r9d
	leaq	2(%r9),%r10
	movq	%rsp,%r11
	negq	%r10
	leaq	(%rsp,%r10,8),%rsp
	andq	$-1024,%rsp

	# Keep the post-push stack pointer for the epilogue.
	movq	%r11,8(%rsp,%r9,8)
.Lmul_body:

	# Page walk: read one word from every 4096-byte step between the old
	# and the new stack pointer, highest address first, ending at 0(%rsp).
	# The loop stops when the subtraction borrows.
	subq	%rsp,%r11
	andq	$-4096,%r11
.Lmul_page_walk:
	movq	(%rsp,%r11,1),%r10
	subq	$4096,%r11
	# Two prefix bytes emitted in front of the jnc (presumably padding or
	# a branch hint - confirm against the generator script).
.byte	0x66,0x2e
	jnc	.Lmul_page_walk

	# rdx is overwritten by every mulq, so the second operand pointer
	# moves to r12.  r8 is replaced by the word it pointed to.
	movq	%rdx,%r12
	movq	(%r8),%r8
	movq	(%r12),%rbx
	movq	(%rsi),%rax

	# r14 = outer index (word of the second operand), r15 = inner index.
	xorq	%r14,%r14
	xorq	%r15,%r15

	# First pass: tp = a * b[0] + n * m, shifted down one word, where
	# m (rbp) = low word of a[0]*b[0] times the r8 word.
	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	# The low word of this sum is discarded; only its carry is kept.
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.L1st_enter

.align	16
.L1st:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.L1st_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	1(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.L1st

	# Tail of the first pass; rax is reloaded with a[0] for the next pass.
	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13
	movq	%r10,%r11

	# Store the two top words: tp[num-1] and the carry word tp[num].
	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	jmp	.Louter
.align	16
.Louter:
	# Passes 1..num-1: same as the first pass, but the previous tp words
	# are added in as well.  rbx = current word of the second operand.
	movq	(%r12,%r14,8),%rbx
	xorq	%r15,%r15
	movq	%r8,%rbp
	movq	(%rsp),%r10
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.Linner_enter

.align	16
.Linner:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.Linner_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.Linner

	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	# Fold the previous top carry word (now in r10) into the new top words.
	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	cmpq	%r9,%r14
	jl	.Louter

	# Final step: write tp - modulus to the result.  The xorq also clears
	# CF, so the first sbbq starts without a borrow; leaq and decq inside
	# the loop leave CF alone.
	xorq	%r14,%r14
	movq	(%rsp),%rax
	leaq	(%rsp),%rsi
	movq	%r9,%r15
	jmp	.Lsub
.align	16
.Lsub:	sbbq	(%rcx,%r14,8),%rax
	movq	%rax,(%rdi,%r14,8)
	movq	8(%rsi,%r14,8),%rax
	leaq	1(%r14),%r14
	decq	%r15
	jnz	.Lsub

	# rax = tp[num] minus the final borrow; it is used as a mask to pick
	# the copy source without branching: rsi = (tp & rax) | (result & ~rax).
	sbbq	$0,%rax
	xorq	%r14,%r14
	andq	%rax,%rsi
	notq	%rax
	movq	%rdi,%rcx
	andq	%rax,%rcx
	movq	%r9,%r15
	orq	%rcx,%rsi
.align	16
.Lcopy:
	# Copy the selected vector to the result and overwrite each scratch
	# word with the loop index so the intermediate value is not left behind.
	movq	(%rsi,%r14,8),%rax
	movq	%r14,(%rsp,%r14,8)
	movq	%rax,(%rdi,%r14,8)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	.Lcopy

	# Epilogue: rsi = stack pointer saved after the pushes; reload the six
	# callee-saved registers from it and return 1.
	movq	8(%rsp,%r9,8),%rsi
	movq	$1,%rax
	movq	(%rsi),%r15
	movq	8(%rsi),%r14
	movq	16(%rsi),%r13
	movq	24(%rsi),%r12
	movq	32(%rsi),%rbp
	movq	40(%rsi),%rbx
	leaq	48(%rsi),%rsp
.Lmul_epilogue:
	# Encoded return: bytes f3 c3 (rep-prefixed ret).
	.byte	0xf3,0xc3
.size	bn_mul_mont,.-bn_mul_mont
# -----------------------------------------------------------------------
# bn_mul4x_mont - Montgomery multiplication with the inner loops
# unrolled four words per iteration.
#
# There is no .globl for this symbol; within this file it is entered by
# the jump to .Lmul4x_enter in bn_mul_mont, which only takes that jump
# when the word count is a multiple of 4 and at least 8.
#
# Register arguments (same layout as bn_mul_mont):
#   %rdi result, %rsi first operand, %rdx second operand, %rcx modulus,
#   %r8 pointer to the multiplier seed word, %r9d word count.
# Returns 1 in %rax.  rbx, rbp and r12-r15 are saved and restored.
#
# Scratch frame (num+4 words reserved, 1024-byte aligned):
#   0(%rsp) ...        tp[0..num-1]
#   (%rsp,%r9,8)       tp[num], the top carry word
#   8(%rsp,%r9,8)      stack pointer as it was after the pushes
#   16(%rsp,%r9,8)     saved %rdi (rdi doubles as an accumulator below)
# -----------------------------------------------------------------------
.type	bn_mul4x_mont,@function
.align	16
bn_mul4x_mont:
.Lmul4x_enter:
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	movl	%r9d,%r9d
	leaq	4(%r9),%r10
	movq	%rsp,%r11
	negq	%r10
	leaq	(%rsp,%r10,8),%rsp
	andq	$-1024,%rsp

	movq	%r11,8(%rsp,%r9,8)
.Lmul4x_body:
	# Page walk: read one word per 4096-byte step from the old stack
	# pointer down to the new one; stops when the subtraction borrows.
	subq	%rsp,%r11
	andq	$-4096,%r11
.Lmul4x_page_walk:
	movq	(%rsp,%r11,1),%r10
	subq	$4096,%r11
	# One prefix byte emitted in front of the jnc (presumably a branch
	# hint or padding - confirm against the generator script).
.byte	0x2e
	jnc	.Lmul4x_page_walk

	# Park the result pointer; r12 takes over the second operand pointer
	# and r8 is replaced by the word it pointed to.
	movq	%rdi,16(%rsp,%r9,8)
	movq	%rdx,%r12
	movq	(%r8),%r8
	movq	(%r12),%rbx
	movq	(%rsi),%rax

	# r14 = outer index, r15 = inner index.
	xorq	%r14,%r14
	xorq	%r15,%r15

	# First pass, words 0 and 1.  rbp = multiplier for the modulus,
	# derived from the low word of a[0]*b[0] and the r8 word.
	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdi,(%rsp)
	movq	%rdx,%r13
	jmp	.L1st4x
.align	16
.L1st4x:
	# First pass, four words per iteration; r15 runs 4 ahead of the
	# words being stored.
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	4(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jl	.L1st4x

	# Tail of the first pass: the last two words, then the top words.
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)

	leaq	1(%r14),%r14
.align	4
.Louter4x:
	# Passes 1..num-1: as the first pass, with the previous tp words
	# added into each partial sum.  rbx = current second-operand word.
	movq	(%r12,%r14,8),%rbx
	xorq	%r15,%r15
	movq	(%rsp),%r10
	movq	%r8,%rbp
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%rsp),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdi,(%rsp)
	movq	%rdx,%r13
	jmp	.Linner4x
.align	16
.Linner4x:
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	4(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jl	.Linner4x

	# Tail of an outer pass; the outer index is advanced here, in the
	# middle of the tail.
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	1(%r14),%r14
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	# Fold the previous top carry word tp[num] into the new top words.
	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%rsp,%r9,8),%r13
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)

	cmpq	%r9,%r14
	jl	.Louter4x

	# Final step: result = tp - modulus, four words per iteration.
	# rdi is restored from the frame, r9 becomes num/4 for the loop
	# counts, and xmm0 is zeroed for the scratch wipe further down.
	movq	16(%rsp,%r9,8),%rdi
	movq	0(%rsp),%rax
	pxor	%xmm0,%xmm0
	movq	8(%rsp),%rdx
	shrq	$2,%r9
	leaq	(%rsp),%rsi
	xorq	%r14,%r14

	subq	0(%rcx),%rax
	movq	16(%rsi),%rbx
	movq	24(%rsi),%rbp
	sbbq	8(%rcx),%rdx
	leaq	-1(%r9),%r15
	jmp	.Lsub4x
.align	16
.Lsub4x:
	# The borrow chain runs through CF across iterations; movq, leaq
	# and decq do not modify CF.
	movq	%rax,0(%rdi,%r14,8)
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	16(%rcx,%r14,8),%rbx
	movq	32(%rsi,%r14,8),%rax
	movq	40(%rsi,%r14,8),%rdx
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)
	movq	%rbp,24(%rdi,%r14,8)
	sbbq	32(%rcx,%r14,8),%rax
	movq	48(%rsi,%r14,8),%rbx
	movq	56(%rsi,%r14,8),%rbp
	sbbq	40(%rcx,%r14,8),%rdx
	leaq	4(%r14),%r14
	decq	%r15
	jnz	.Lsub4x

	movq	%rax,0(%rdi,%r14,8)
	movq	32(%rsi,%r14,8),%rax
	sbbq	16(%rcx,%r14,8),%rbx
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)

	# rax = tp[num] minus the final borrow, used as a select mask:
	# rsi = (tp & rax) | (result & ~rax), with no branch.
	sbbq	$0,%rax
	movq	%rbp,24(%rdi,%r14,8)
	xorq	%r14,%r14
	andq	%rax,%rsi
	notq	%rax
	movq	%rdi,%rcx
	andq	%rax,%rcx
	leaq	-1(%r9),%r15
	orq	%rcx,%rsi

	# Copy the selected vector to the result 16 bytes at a time while
	# storing zeros over tp[0..num-1].  The aligned stores to the scratch
	# rely on the 1024-byte alignment of the stack pointer set up above.
	movdqu	(%rsi),%xmm1
	movdqa	%xmm0,(%rsp)
	movdqu	%xmm1,(%rdi)
	jmp	.Lcopy4x
.align	16
.Lcopy4x:
	movdqu	16(%rsi,%r14,1),%xmm2
	movdqu	32(%rsi,%r14,1),%xmm1
	movdqa	%xmm0,16(%rsp,%r14,1)
	movdqu	%xmm2,16(%rdi,%r14,1)
	movdqa	%xmm0,32(%rsp,%r14,1)
	movdqu	%xmm1,32(%rdi,%r14,1)
	leaq	32(%r14),%r14
	decq	%r15
	jnz	.Lcopy4x

	# Restore r9 to the word count so the saved stack pointer can be
	# addressed, finish the last 16 bytes, then unwind and return 1.
	shlq	$2,%r9
	movdqu	16(%rsi,%r14,1),%xmm2
	movdqa	%xmm0,16(%rsp,%r14,1)
	movdqu	%xmm2,16(%rdi,%r14,1)
	movq	8(%rsp,%r9,8),%rsi
	movq	$1,%rax
	movq	(%rsi),%r15
	movq	8(%rsi),%r14
	movq	16(%rsi),%r13
	movq	24(%rsi),%r12
	movq	32(%rsi),%rbp
	movq	40(%rsi),%rbx
	leaq	48(%rsi),%rsp
.Lmul4x_epilogue:
	# Encoded return: bytes f3 c3 (rep-prefixed ret).
	.byte	0xf3,0xc3
.size	bn_mul4x_mont,.-bn_mul4x_mont
# -----------------------------------------------------------------------
# bn_sqr4x_mont - Montgomery squaring.
#
# There is no .globl for this symbol; within this file it is entered by
# the jump to .Lsqr4x_enter in bn_mul_mont, taken when both operand
# pointers are equal and the word count is a multiple of 4, at least 8.
#
# Register arguments on entry:
#   %rdi result, %rsi operand a, %rcx modulus, %r8 pointer to the
#   multiplier seed word, %r9d word count.  %rdx is not read.
# Returns 1 in %rax.  rbx, rbp and r12-r15 are saved and restored.
#
# Scratch frame (72 + 16*num bytes reserved, 1024-byte aligned):
#   0(%rsp)    num*8, the operand size in bytes (set before the reduction)
#   8(%rsp)    address just past t[2*num-1]     (set before the reduction)
#   32(%rsp)   saved %rdi (result pointer)
#   40(%rsp)   saved %rcx (modulus pointer)
#   48(%rsp)   the word %r8 pointed to
#   56(%rsp)   address of the six pushed registers
#   64(%rsp)   t[0 .. 2*num-1], the double-width product
#
# Phases:
#   1. .Lsqr4x_body .. : accumulate the cross products of distinct words
#      of a into t, two multiplier words (r14, r15) per outer step.
#   2. .Lsqr4x_shift_n_add: double t (shift left one bit across words)
#      and add the square of each word of a.
#   3. .Lsqr4x_mont_outer: reduce t by the modulus, two words per step.
#   4. .Lsqr4x_sub / .Lsqr4x_copy: subtract the modulus, select the
#      result without branching, copy it out and zero the scratch.
# -----------------------------------------------------------------------
.type	bn_sqr4x_mont,@function
.align	16
bn_sqr4x_mont:
.Lsqr4x_enter:
	# rax keeps the stack pointer from before the pushes.
	movq	%rsp,%rax
	pushq	%rbx
	pushq	%rbp
	pushq	%r12
	pushq	%r13
	pushq	%r14
	pushq	%r15

	# r9 = -(num*8); reserve 72 + 2*num*8 bytes and align down to 1024.
	shll	$3,%r9d
	movq	%rsp,%r11
	negq	%r9
	movq	(%r8),%r8
	leaq	-72(%rsp,%r9,2),%rsp
	andq	$-1024,%rsp

	# Page walk: read one word per 4096-byte step from the old stack
	# pointer down to the new one; stops when the subtraction borrows.
	subq	%rsp,%r11
	andq	$-4096,%r11
.Lsqr4x_page_walk:
	movq	(%rsp,%r11,1),%r10
	subq	$4096,%r11
	# One prefix byte emitted in front of the jnc (presumably a branch
	# hint or padding - confirm against the generator script).
.byte	0x2e
	jnc	.Lsqr4x_page_walk

	# r10 = -(num*8), r9 = num*8, r11 = address of the pushed registers.
	movq	%r9,%r10
	negq	%r9
	leaq	-48(%rax),%r11

	movq	%rdi,32(%rsp)
	movq	%rcx,40(%rsp)
	movq	%r8,48(%rsp)
	movq	%r11,56(%rsp)
.Lsqr4x_body:

	# Phase 1, first outer step.  rsi is moved to the end of a and all
	# indices (rbp, rcx) are negative byte offsets counting up to zero.
	# r14 and r15 hold the two multiplier words, rbx the running word.
	leaq	32(%r10),%rbp
	leaq	(%rsi,%r9,1),%rsi

	movq	%r9,%rcx

	movq	-32(%rsi,%rbp,1),%r14
	leaq	64(%rsp,%r9,2),%rdi
	movq	-24(%rsi,%rbp,1),%rax
	leaq	-32(%rdi,%rbp,1),%rdi
	movq	-16(%rsi,%rbp,1),%rbx
	movq	%rax,%r15

	mulq	%r14
	movq	%rax,%r10
	movq	%rbx,%rax
	movq	%rdx,%r11
	movq	%r10,-24(%rdi,%rbp,1)

	xorq	%r10,%r10
	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	adcq	%rdx,%r10
	movq	%r11,-16(%rdi,%rbp,1)

	leaq	-16(%rbp),%rcx

	movq	8(%rsi,%rcx,1),%rbx
	mulq	%r15
	movq	%rax,%r12
	movq	%rbx,%rax
	movq	%rdx,%r13

	xorq	%r11,%r11
	addq	%r12,%r10
	leaq	16(%rcx),%rcx
	adcq	$0,%r11
	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	adcq	%rdx,%r11
	movq	%r10,-8(%rdi,%rcx,1)
	jmp	.Lsqr4x_1st

.align	16
.Lsqr4x_1st:
	# Four words of a per iteration; each word is multiplied by both
	# r15 and r14 and the sums are stored (not added) into t.
	movq	(%rsi,%rcx,1),%rbx
	xorq	%r12,%r12
	mulq	%r15
	addq	%rax,%r13
	movq	%rbx,%rax
	adcq	%rdx,%r12

	xorq	%r10,%r10
	addq	%r13,%r11
	adcq	$0,%r10
	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	adcq	%rdx,%r10
	movq	%r11,(%rdi,%rcx,1)

	movq	8(%rsi,%rcx,1),%rbx
	xorq	%r13,%r13
	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	adcq	%rdx,%r13

	xorq	%r11,%r11
	addq	%r12,%r10
	adcq	$0,%r11
	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	adcq	%rdx,%r11
	movq	%r10,8(%rdi,%rcx,1)

	movq	16(%rsi,%rcx,1),%rbx
	xorq	%r12,%r12
	mulq	%r15
	addq	%rax,%r13
	movq	%rbx,%rax
	adcq	%rdx,%r12

	xorq	%r10,%r10
	addq	%r13,%r11
	adcq	$0,%r10
	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	adcq	%rdx,%r10
	movq	%r11,16(%rdi,%rcx,1)

	movq	24(%rsi,%rcx,1),%rbx
	xorq	%r13,%r13
	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	adcq	%rdx,%r13

	xorq	%r11,%r11
	addq	%r12,%r10
	leaq	32(%rcx),%rcx
	adcq	$0,%r11
	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	adcq	%rdx,%r11
	movq	%r10,-8(%rdi,%rcx,1)

	cmpq	$0,%rcx
	jne	.Lsqr4x_1st

	xorq	%r12,%r12
	addq	%r11,%r13
	adcq	$0,%r12
	mulq	%r15
	addq	%rax,%r13
	adcq	%rdx,%r12

	movq	%r13,(%rdi)
	leaq	16(%rbp),%rbp
	movq	%r12,8(%rdi)
	jmp	.Lsqr4x_outer

.align	16
.Lsqr4x_outer:
	# Later outer steps: the next pair of multiplier words; products are
	# now added to what t already holds.  rbp advances 16 bytes per step
	# and the loop ends when it reaches zero.
	movq	-32(%rsi,%rbp,1),%r14
	leaq	64(%rsp,%r9,2),%rdi
	movq	-24(%rsi,%rbp,1),%rax
	leaq	-32(%rdi,%rbp,1),%rdi
	movq	-16(%rsi,%rbp,1),%rbx
	movq	%rax,%r15

	movq	-24(%rdi,%rbp,1),%r10
	xorq	%r11,%r11
	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	adcq	%rdx,%r11
	movq	%r10,-24(%rdi,%rbp,1)

	xorq	%r10,%r10
	addq	-16(%rdi,%rbp,1),%r11
	adcq	$0,%r10
	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	adcq	%rdx,%r10
	movq	%r11,-16(%rdi,%rbp,1)

	leaq	-16(%rbp),%rcx
	xorq	%r12,%r12

	movq	8(%rsi,%rcx,1),%rbx
	xorq	%r13,%r13
	addq	8(%rdi,%rcx,1),%r12
	adcq	$0,%r13
	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	adcq	%rdx,%r13

	xorq	%r11,%r11
	addq	%r12,%r10
	adcq	$0,%r11
	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	adcq	%rdx,%r11
	movq	%r10,8(%rdi,%rcx,1)

	leaq	16(%rcx),%rcx
	jmp	.Lsqr4x_inner

.align	16
.Lsqr4x_inner:
	# Two words of a per iteration, accumulated into t.
	movq	(%rsi,%rcx,1),%rbx
	xorq	%r12,%r12
	addq	(%rdi,%rcx,1),%r13
	adcq	$0,%r12
	mulq	%r15
	addq	%rax,%r13
	movq	%rbx,%rax
	adcq	%rdx,%r12

	xorq	%r10,%r10
	addq	%r13,%r11
	adcq	$0,%r10
	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	adcq	%rdx,%r10
	movq	%r11,(%rdi,%rcx,1)

	movq	8(%rsi,%rcx,1),%rbx
	xorq	%r13,%r13
	addq	8(%rdi,%rcx,1),%r12
	adcq	$0,%r13
	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	adcq	%rdx,%r13

	xorq	%r11,%r11
	addq	%r12,%r10
	leaq	16(%rcx),%rcx
	adcq	$0,%r11
	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	adcq	%rdx,%r11
	movq	%r10,-8(%rdi,%rcx,1)

	cmpq	$0,%rcx
	jne	.Lsqr4x_inner

	xorq	%r12,%r12
	addq	%r11,%r13
	adcq	$0,%r12
	mulq	%r15
	addq	%rax,%r13
	adcq	%rdx,%r12

	movq	%r13,(%rdi)
	movq	%r12,8(%rdi)

	addq	$16,%rbp
	jnz	.Lsqr4x_outer

	# Last outer step, written out straight-line for the final four words
	# of a.  rbp is 0 here, and r10, r12, r13 are used as left by the
	# loop above.
	movq	-32(%rsi),%r14
	leaq	64(%rsp,%r9,2),%rdi
	movq	-24(%rsi),%rax
	leaq	-32(%rdi,%rbp,1),%rdi
	movq	-16(%rsi),%rbx
	movq	%rax,%r15

	xorq	%r11,%r11
	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	adcq	%rdx,%r11
	movq	%r10,-24(%rdi)

	xorq	%r10,%r10
	addq	%r13,%r11
	adcq	$0,%r10
	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	adcq	%rdx,%r10
	movq	%r11,-16(%rdi)

	movq	-8(%rsi),%rbx
	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	adcq	$0,%rdx

	xorq	%r11,%r11
	addq	%r12,%r10
	movq	%rdx,%r13
	adcq	$0,%r11
	mulq	%r14
	addq	%rax,%r10
	movq	%rbx,%rax
	adcq	%rdx,%r11
	movq	%r10,-8(%rdi)

	xorq	%r12,%r12
	addq	%r11,%r13
	adcq	$0,%r12
	mulq	%r15
	addq	%rax,%r13
	movq	-16(%rsi),%rax
	adcq	%rdx,%r12

	movq	%r13,(%rdi)
	movq	%r12,8(%rdi)

	# Final cross product of the two top words; rbp becomes -(num*8) + 16
	# for phase 2 and r14, r15 are cleared (shift carry and add carry).
	mulq	%rbx
	addq	$16,%rbp
	xorq	%r14,%r14
	subq	%r9,%rbp
	xorq	%r15,%r15

	addq	%r12,%rax
	adcq	$0,%rdx
	movq	%rax,8(%rdi)
	movq	%rdx,16(%rdi)
	movq	%r15,24(%rdi)

	# Phase 2 preamble.  Each step forms 2*t[k] plus the bit shifted out
	# of the previous word (kept in r14 / r10), then adds one half of a
	# word square from mulq rax.  rcx is still 0 from the inner-loop exit
	# test, so (%rcx,%r11,2) is simply 2*r11.  The add carry is parked in
	# r15 by sbbq and put back into CF by negq.
	movq	-16(%rsi,%rbp,1),%rax
	leaq	64(%rsp,%r9,2),%rdi
	xorq	%r10,%r10
	movq	-24(%rdi,%rbp,2),%r11

	leaq	(%r14,%r10,2),%r12
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	-16(%rdi,%rbp,2),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	-8(%rdi,%rbp,2),%r11
	adcq	%rax,%r12
	movq	-8(%rsi,%rbp,1),%rax
	movq	%r12,-32(%rdi,%rbp,2)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,-24(%rdi,%rbp,2)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	movq	0(%rdi,%rbp,2),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	8(%rdi,%rbp,2),%r11
	adcq	%rax,%rbx
	movq	0(%rsi,%rbp,1),%rax
	movq	%rbx,-16(%rdi,%rbp,2)
	adcq	%rdx,%r8
	leaq	16(%rbp),%rbp
	movq	%r8,-40(%rdi,%rbp,2)
	sbbq	%r15,%r15
	jmp	.Lsqr4x_shift_n_add

.align	16
.Lsqr4x_shift_n_add:
	# Four words of a (eight words of t) per iteration.
	leaq	(%r14,%r10,2),%r12
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	-16(%rdi,%rbp,2),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	-8(%rdi,%rbp,2),%r11
	adcq	%rax,%r12
	movq	-8(%rsi,%rbp,1),%rax
	movq	%r12,-32(%rdi,%rbp,2)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,-24(%rdi,%rbp,2)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	movq	0(%rdi,%rbp,2),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	8(%rdi,%rbp,2),%r11
	adcq	%rax,%rbx
	movq	0(%rsi,%rbp,1),%rax
	movq	%rbx,-16(%rdi,%rbp,2)
	adcq	%rdx,%r8

	leaq	(%r14,%r10,2),%r12
	movq	%r8,-8(%rdi,%rbp,2)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	16(%rdi,%rbp,2),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	24(%rdi,%rbp,2),%r11
	adcq	%rax,%r12
	movq	8(%rsi,%rbp,1),%rax
	movq	%r12,0(%rdi,%rbp,2)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,8(%rdi,%rbp,2)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	movq	32(%rdi,%rbp,2),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	40(%rdi,%rbp,2),%r11
	adcq	%rax,%rbx
	movq	16(%rsi,%rbp,1),%rax
	movq	%rbx,16(%rdi,%rbp,2)
	adcq	%rdx,%r8
	movq	%r8,24(%rdi,%rbp,2)
	sbbq	%r15,%r15
	addq	$32,%rbp
	jnz	.Lsqr4x_shift_n_add

	# Phase 2 tail: the last two words of a, i.e. the top four words of t.
	leaq	(%r14,%r10,2),%r12
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r13
	shrq	$63,%r11
	orq	%r10,%r13
	movq	-16(%rdi),%r10
	movq	%r11,%r14
	mulq	%rax
	negq	%r15
	movq	-8(%rdi),%r11
	adcq	%rax,%r12
	movq	-8(%rsi),%rax
	movq	%r12,-32(%rdi)
	adcq	%rdx,%r13

	leaq	(%r14,%r10,2),%rbx
	movq	%r13,-24(%rdi)
	sbbq	%r15,%r15
	shrq	$63,%r10
	leaq	(%rcx,%r11,2),%r8
	shrq	$63,%r11
	orq	%r10,%r8
	mulq	%rax
	negq	%r15
	adcq	%rax,%rbx
	adcq	%rdx,%r8
	movq	%rbx,-16(%rdi)
	movq	%r8,-8(%rdi)

	# Phase 3 setup.  rsi = end of the modulus, rcx = -(num*8) as the
	# running offset, rdi = middle of t, r8/r14 = multiplier seed word,
	# r10 = t[0], rbp = carry between outer steps.  The byte size and the
	# end-of-t address are parked at 0(%rsp) and 8(%rsp) because r9 is
	# reused for modulus words below.
	movq	40(%rsp),%rsi
	movq	48(%rsp),%r8
	xorq	%rcx,%rcx
	movq	%r9,0(%rsp)
	subq	%r9,%rcx
	movq	64(%rsp),%r10
	movq	%r8,%r14
	leaq	64(%rsp,%r9,2),%rax
	leaq	64(%rsp,%r9,1),%rdi
	movq	%rax,8(%rsp)
	leaq	(%rsi,%r9,1),%rsi
	xorq	%rbp,%rbp

	movq	0(%rsi,%rcx,1),%rax
	movq	8(%rsi,%rcx,1),%r9
	imulq	%r10,%r14
	movq	%rax,%rbx
	jmp	.Lsqr4x_mont_outer

.align	16
.Lsqr4x_mont_outer:
	# One outer step retires two words of t using two multipliers:
	# r14 (from the current low word) and r15 (from the next one).
	xorq	%r11,%r11
	mulq	%r14
	addq	%rax,%r10
	movq	%r9,%rax
	adcq	%rdx,%r11
	movq	%r8,%r15

	xorq	%r10,%r10
	addq	8(%rdi,%rcx,1),%r11
	adcq	$0,%r10
	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	adcq	%rdx,%r10

	imulq	%r11,%r15

	movq	16(%rsi,%rcx,1),%rbx
	xorq	%r13,%r13
	addq	%r11,%r12
	adcq	$0,%r13
	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	adcq	%rdx,%r13
	movq	%r12,8(%rdi,%rcx,1)

	xorq	%r11,%r11
	addq	16(%rdi,%rcx,1),%r10
	adcq	$0,%r11
	mulq	%r14
	addq	%rax,%r10
	movq	%r9,%rax
	adcq	%rdx,%r11

	movq	24(%rsi,%rcx,1),%r9
	xorq	%r12,%r12
	addq	%r10,%r13
	adcq	$0,%r12
	mulq	%r15
	addq	%rax,%r13
	movq	%r9,%rax
	adcq	%rdx,%r12
	movq	%r13,16(%rdi,%rcx,1)

	xorq	%r10,%r10
	addq	24(%rdi,%rcx,1),%r11
	leaq	32(%rcx),%rcx
	adcq	$0,%r10
	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	adcq	%rdx,%r10
	jmp	.Lsqr4x_mont_inner

.align	16
.Lsqr4x_mont_inner:
	# Four modulus words per iteration; rbx and r9 alternate as the
	# holder of the current modulus word.
	movq	(%rsi,%rcx,1),%rbx
	xorq	%r13,%r13
	addq	%r11,%r12
	adcq	$0,%r13
	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	adcq	%rdx,%r13
	movq	%r12,-8(%rdi,%rcx,1)

	xorq	%r11,%r11
	addq	(%rdi,%rcx,1),%r10
	adcq	$0,%r11
	mulq	%r14
	addq	%rax,%r10
	movq	%r9,%rax
	adcq	%rdx,%r11

	movq	8(%rsi,%rcx,1),%r9
	xorq	%r12,%r12
	addq	%r10,%r13
	adcq	$0,%r12
	mulq	%r15
	addq	%rax,%r13
	movq	%r9,%rax
	adcq	%rdx,%r12
	movq	%r13,(%rdi,%rcx,1)

	xorq	%r10,%r10
	addq	8(%rdi,%rcx,1),%r11
	adcq	$0,%r10
	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	adcq	%rdx,%r10

	movq	16(%rsi,%rcx,1),%rbx
	xorq	%r13,%r13
	addq	%r11,%r12
	adcq	$0,%r13
	mulq	%r15
	addq	%rax,%r12
	movq	%rbx,%rax
	adcq	%rdx,%r13
	movq	%r12,8(%rdi,%rcx,1)

	xorq	%r11,%r11
	addq	16(%rdi,%rcx,1),%r10
	adcq	$0,%r11
	mulq	%r14
	addq	%rax,%r10
	movq	%r9,%rax
	adcq	%rdx,%r11

	movq	24(%rsi,%rcx,1),%r9
	xorq	%r12,%r12
	addq	%r10,%r13
	adcq	$0,%r12
	mulq	%r15
	addq	%rax,%r13
	movq	%r9,%rax
	adcq	%rdx,%r12
	movq	%r13,16(%rdi,%rcx,1)

	xorq	%r10,%r10
	addq	24(%rdi,%rcx,1),%r11
	leaq	32(%rcx),%rcx
	adcq	$0,%r10
	mulq	%r14
	addq	%rax,%r11
	movq	%rbx,%rax
	adcq	%rdx,%r10
	cmpq	$0,%rcx
	jne	.Lsqr4x_mont_inner

	# End of an outer step: rewind rcx to -(num*8), fold in the carry from
	# the previous step (rbp), and derive the next r14 from the word that
	# becomes the new low word.
	subq	0(%rsp),%rcx
	movq	%r8,%r14

	xorq	%r13,%r13
	addq	%r11,%r12
	adcq	$0,%r13
	mulq	%r15
	addq	%rax,%r12
	movq	%r9,%rax
	adcq	%rdx,%r13
	movq	%r12,-8(%rdi)

	xorq	%r11,%r11
	addq	(%rdi),%r10
	adcq	$0,%r11
	movq	0(%rsi,%rcx,1),%rbx
	addq	%rbp,%r10
	adcq	$0,%r11

	imulq	16(%rdi,%rcx,1),%r14
	xorq	%r12,%r12
	movq	8(%rsi,%rcx,1),%r9
	addq	%r10,%r13
	movq	16(%rdi,%rcx,1),%r10
	adcq	$0,%r12
	mulq	%r15
	addq	%rax,%r13
	movq	%rbx,%rax
	adcq	%rdx,%r12
	movq	%r13,(%rdi)

	xorq	%rbp,%rbp
	addq	8(%rdi),%r12
	adcq	%rbp,%rbp
	addq	%r11,%r12
	leaq	16(%rdi),%rdi
	adcq	$0,%rbp
	movq	%r12,-8(%rdi)
	cmpq	8(%rsp),%rdi
	jb	.Lsqr4x_mont_outer

	# Phase 4.  The reduced value sits in the upper half of t (rbx); the
	# final carry goes into the word just past t.  r9 becomes num/4 for
	# the loop counts, rsi the modulus and rdi the result pointer.
	movq	0(%rsp),%r9
	movq	%rbp,(%rdi)
	movq	64(%rsp,%r9,1),%rax
	leaq	64(%rsp,%r9,1),%rbx
	movq	40(%rsp),%rsi
	shrq	$5,%r9
	movq	8(%rbx),%rdx
	xorq	%rbp,%rbp

	movq	32(%rsp),%rdi
	subq	0(%rsi),%rax
	movq	16(%rbx),%r10
	movq	24(%rbx),%r11
	sbbq	8(%rsi),%rdx
	leaq	-1(%r9),%rcx
	jmp	.Lsqr4x_sub
.align	16
.Lsqr4x_sub:
	# result = upper half of t minus the modulus, four words per
	# iteration; the borrow travels in CF, which movq, leaq and decq
	# leave unchanged.
	movq	%rax,0(%rdi,%rbp,8)
	movq	%rdx,8(%rdi,%rbp,8)
	sbbq	16(%rsi,%rbp,8),%r10
	movq	32(%rbx,%rbp,8),%rax
	movq	40(%rbx,%rbp,8),%rdx
	sbbq	24(%rsi,%rbp,8),%r11
	movq	%r10,16(%rdi,%rbp,8)
	movq	%r11,24(%rdi,%rbp,8)
	sbbq	32(%rsi,%rbp,8),%rax
	movq	48(%rbx,%rbp,8),%r10
	movq	56(%rbx,%rbp,8),%r11
	sbbq	40(%rsi,%rbp,8),%rdx
	leaq	4(%rbp),%rbp
	decq	%rcx
	jnz	.Lsqr4x_sub

	movq	%rax,0(%rdi,%rbp,8)
	movq	32(%rbx,%rbp,8),%rax
	sbbq	16(%rsi,%rbp,8),%r10
	movq	%rdx,8(%rdi,%rbp,8)
	sbbq	24(%rsi,%rbp,8),%r11
	movq	%r10,16(%rdi,%rbp,8)

	# rax = carry word minus the final borrow, used as a select mask:
	# rbx = (t_upper & rax) | (result & ~rax), with no branch.
	sbbq	$0,%rax
	movq	%r11,24(%rdi,%rbp,8)
	xorq	%rbp,%rbp
	andq	%rax,%rbx
	notq	%rax
	movq	%rdi,%rsi
	andq	%rax,%rsi
	leaq	-1(%r9),%rcx
	orq	%rsi,%rbx

	# Copy the selected vector to the result while storing zeros over the
	# scratch through two pointers: 64(%rsp) and rsi.
	# NOTE(review): r9 is num/4 here, so the two leaq steps put rsi at
	# 64(%rsp) + 4*num bytes, a quarter of the way into t rather than at
	# its midpoint.  Each zeroing stream spans 8*num bytes, so together
	# they cover bytes [0, 12*num) of the 16*num-byte scratch.  Confirm
	# whether leaving the top quarter unzeroed is intended.
	pxor	%xmm0,%xmm0
	leaq	64(%rsp,%r9,8),%rsi
	movdqu	(%rbx),%xmm1
	leaq	(%rsi,%r9,8),%rsi
	movdqa	%xmm0,64(%rsp)
	movdqa	%xmm0,(%rsi)
	movdqu	%xmm1,(%rdi)
	jmp	.Lsqr4x_copy
.align	16
.Lsqr4x_copy:
	movdqu	16(%rbx,%rbp,1),%xmm2
	movdqu	32(%rbx,%rbp,1),%xmm1
	movdqa	%xmm0,80(%rsp,%rbp,1)
	movdqa	%xmm0,96(%rsp,%rbp,1)
	movdqa	%xmm0,16(%rsi,%rbp,1)
	movdqa	%xmm0,32(%rsi,%rbp,1)
	movdqu	%xmm2,16(%rdi,%rbp,1)
	movdqu	%xmm1,32(%rdi,%rbp,1)
	leaq	32(%rbp),%rbp
	decq	%rcx
	jnz	.Lsqr4x_copy

	# Last 16 bytes, then unwind through the register save area recorded
	# at 56(%rsp) and return 1.
	movdqu	16(%rbx,%rbp,1),%xmm2
	movdqa	%xmm0,80(%rsp,%rbp,1)
	movdqa	%xmm0,16(%rsi,%rbp,1)
	movdqu	%xmm2,16(%rdi,%rbp,1)
	movq	56(%rsp),%rsi
	movq	$1,%rax
	movq	0(%rsi),%r15
	movq	8(%rsi),%r14
	movq	16(%rsi),%r13
	movq	24(%rsi),%r12
	movq	32(%rsi),%rbp
	movq	40(%rsi),%rbx
	leaq	48(%rsi),%rsp
.Lsqr4x_epilogue:
	# Encoded return: bytes f3 c3 (rep-prefixed ret).
	.byte	0xf3,0xc3
.size	bn_sqr4x_mont,.-bn_sqr4x_mont
# NUL-terminated identification string embedded after the code.  This
# emits the same bytes as the decimal .byte list it replaces.
.asciz	"Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro@openssl.org>"
.align	16