1183234Ssimon#!/usr/bin/env perl
2183234Ssimon
3183234Ssimon# ====================================================================
4238405Sjkim# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5183234Ssimon# project. The module is, however, dual licensed under OpenSSL and
6183234Ssimon# CRYPTOGAMS licenses depending on where you obtain it. For further
7183234Ssimon# details see http://www.openssl.org/~appro/cryptogams/.
8183234Ssimon# ====================================================================
9183234Ssimon
10183234Ssimon# October 2005.
11183234Ssimon#
12183234Ssimon# Montgomery multiplication routine for x86_64. While it gives modest
13183234Ssimon# 9% improvement of rsa4096 sign on Opteron, rsa512 sign runs more
14183234Ssimon# than twice, >2x, as fast. Most common rsa1024 sign is improved by
15183234Ssimon# respectable 50%. It remains to be seen if loop unrolling and
16183234Ssimon# dedicated squaring routine can provide further improvement...
17183234Ssimon
18238405Sjkim# July 2011.
19238405Sjkim#
20238405Sjkim# Add dedicated squaring procedure. Performance improvement varies
21238405Sjkim# from platform to platform, but in average it's ~5%/15%/25%/33%
22238405Sjkim# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
23183234Ssimon
24238405Sjkim# August 2011.
25238405Sjkim#
26238405Sjkim# Unroll and modulo-schedule inner loops in such manner that they
27238405Sjkim# are "fallen through" for input lengths of 8, which is critical for
28238405Sjkim# 1024-bit RSA *sign*. Average performance improvement in comparison
29238405Sjkim# to *initial* version of this module from 2005 is ~0%/30%/40%/45%
30238405Sjkim# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
31238405Sjkim
# Command-line handling: usage is "x86_64-mont.pl <flavour> <output>" where
# <flavour> selects the assembler dialect (elf, macosx, mingw64, nasm, masm).
# If the sole argument contains a dot it is taken to be the output file and
# the flavour is left undefined.
32238405Sjkim$flavour = shift;
33238405Sjkim$output  = shift;
34238405Sjkimif ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
35238405Sjkim
# Win64 targets are recognized by flavour ([nm]asm, mingw64) or by an
# .asm output extension.
36238405Sjkim$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
37238405Sjkim
# Locate the perlasm translator relative to this script's directory,
# then pipe all generated code ($code printed to STDOUT below) through it.
38183234Ssimon$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
39183234Ssimon( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
40183234Ssimon( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
41183234Ssimondie "can't locate x86_64-xlate.pl";
42183234Ssimon
43246772Sjkimopen OUT,"| \"$^X\" $xlate $flavour $output";
44246772Sjkim*STDOUT=*OUT;
45183234Ssimon
# Register allocation for the generic bn_mul_mont path.  The six C
# arguments arrive in the SysV AMD64 argument registers %rdi..%r9.
46183234Ssimon# int bn_mul_mont(
47183234Ssimon$rp="%rdi";	# BN_ULONG *rp,
48183234Ssimon$ap="%rsi";	# const BN_ULONG *ap,
49183234Ssimon$bp="%rdx";	# const BN_ULONG *bp,
50183234Ssimon$np="%rcx";	# const BN_ULONG *np,
51183234Ssimon$n0="%r8";	# const BN_ULONG *n0,
52183234Ssimon$num="%r9";	# int num);
# Scratch: per the inline comments in the code below, $lo0/$hi0 carry the
# ap[j]*bp[i] column and $hi1 the np[j]*m1 column; $i/$j are outer/inner
# loop counters; $m0 holds bp[i] and $m1 holds tp[0]*n0 per iteration.
53183234Ssimon$lo0="%r10";
54183234Ssimon$hi0="%r11";
55183234Ssimon$hi1="%r13";
56183234Ssimon$i="%r14";
57183234Ssimon$j="%r15";
58183234Ssimon$m0="%rbx";
59183234Ssimon$m1="%rbp";
60183234Ssimon
# bn_mul_mont: Montgomery multiplication entry point.  The dispatch below
# routes to the 4x-unrolled multiply (.Lmul4x_enter) when num is divisible
# by 4 and >= 8, and to the dedicated squaring code (.Lsqr4x_enter) when
# additionally ap == bp; otherwise the generic one-word-per-outer-iteration
# loop that follows is used.
61183234Ssimon$code=<<___;
62183234Ssimon.text
63183234Ssimon
64183234Ssimon.globl	bn_mul_mont
65183234Ssimon.type	bn_mul_mont,\@function,6
66183234Ssimon.align	16
67183234Ssimonbn_mul_mont:
68238405Sjkim	test	\$3,${num}d
69238405Sjkim	jnz	.Lmul_enter
70238405Sjkim	cmp	\$8,${num}d
71238405Sjkim	jb	.Lmul_enter
72238405Sjkim	cmp	$ap,$bp
73238405Sjkim	jne	.Lmul4x_enter
74238405Sjkim	jmp	.Lsqr4x_enter
75238405Sjkim
76238405Sjkim.align	16
77238405Sjkim.Lmul_enter:
# Save all SysV callee-saved GPRs, then carve the temporary vector tp[]
# out of the stack; the caller's %rsp is stashed at tp[num+1] so the
# epilogue can restore it after the 1KB-alignment below.
78183234Ssimon	push	%rbx
79183234Ssimon	push	%rbp
80183234Ssimon	push	%r12
81183234Ssimon	push	%r13
82183234Ssimon	push	%r14
83183234Ssimon	push	%r15
84183234Ssimon
85183234Ssimon	mov	${num}d,${num}d
86238405Sjkim	lea	2($num),%r10
87238405Sjkim	mov	%rsp,%r11
88238405Sjkim	neg	%r10
89238405Sjkim	lea	(%rsp,%r10,8),%rsp	# tp=alloca(8*(num+2))
90183234Ssimon	and	\$-1024,%rsp		# minimize TLB usage
91183234Ssimon
92238405Sjkim	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
93238405Sjkim.Lmul_body:
94238405Sjkim	mov	$bp,%r12		# reassign $bp
95238405Sjkim___
96238405Sjkim		$bp="%r12";
97238405Sjkim$code.=<<___;
# First outer iteration (i=0): tp[] does not exist yet, so this pass
# computes tp[] = ap[]*bp[0] + np[]*m1 directly.
98183234Ssimon	mov	($n0),$n0		# pull n0[0] value
99238405Sjkim	mov	($bp),$m0		# m0=bp[0]
100238405Sjkim	mov	($ap),%rax
101183234Ssimon
102183234Ssimon	xor	$i,$i			# i=0
103183234Ssimon	xor	$j,$j			# j=0
104183234Ssimon
105238405Sjkim	mov	$n0,$m1
106183234Ssimon	mulq	$m0			# ap[0]*bp[0]
107183234Ssimon	mov	%rax,$lo0
108238405Sjkim	mov	($np),%rax
109238405Sjkim
110238405Sjkim	imulq	$lo0,$m1		# "tp[0]"*n0
111183234Ssimon	mov	%rdx,$hi0
112183234Ssimon
113238405Sjkim	mulq	$m1			# np[0]*m1
114238405Sjkim	add	%rax,$lo0		# discarded
115238405Sjkim	mov	8($ap),%rax
116183234Ssimon	adc	\$0,%rdx
117183234Ssimon	mov	%rdx,$hi1
118183234Ssimon
119183234Ssimon	lea	1($j),$j		# j++
120238405Sjkim	jmp	.L1st_enter
121238405Sjkim
122238405Sjkim.align	16
123183234Ssimon.L1st:
124238405Sjkim	add	%rax,$hi1
125183234Ssimon	mov	($ap,$j,8),%rax
126238405Sjkim	adc	\$0,%rdx
127238405Sjkim	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
128238405Sjkim	mov	$lo0,$hi0
129238405Sjkim	adc	\$0,%rdx
130238405Sjkim	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
131238405Sjkim	mov	%rdx,$hi1
132238405Sjkim
133238405Sjkim.L1st_enter:
134183234Ssimon	mulq	$m0			# ap[j]*bp[0]
135238405Sjkim	add	%rax,$hi0
136238405Sjkim	mov	($np,$j,8),%rax
137183234Ssimon	adc	\$0,%rdx
138238405Sjkim	lea	1($j),$j		# j++
139238405Sjkim	mov	%rdx,$lo0
140183234Ssimon
141183234Ssimon	mulq	$m1			# np[j]*m1
142238405Sjkim	cmp	$num,$j
143238405Sjkim	jne	.L1st
144238405Sjkim
# Loop tail: fold in the last np[j]*m1 product and record the final
# carry word past the end of tp[].
145238405Sjkim	add	%rax,$hi1
146238405Sjkim	mov	($ap),%rax		# ap[0]
147183234Ssimon	adc	\$0,%rdx
148238405Sjkim	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
149183234Ssimon	adc	\$0,%rdx
150238405Sjkim	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
151183234Ssimon	mov	%rdx,$hi1
152238405Sjkim	mov	$lo0,$hi0
153183234Ssimon
154183234Ssimon	xor	%rdx,%rdx
155183234Ssimon	add	$hi0,$hi1
156183234Ssimon	adc	\$0,%rdx
157183234Ssimon	mov	$hi1,-8(%rsp,$num,8)
158183234Ssimon	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
159183234Ssimon
160183234Ssimon	lea	1($i),$i		# i++
161238405Sjkim	jmp	.Louter
162238405Sjkim.align	16
# Outer loop, iterations i=1..num-1: tp[] += ap[]*bp[i] + np[]*m1, shifted
# down one word (the Montgomery reduction step per bp[] word).
163183234Ssimon.Louter:
164238405Sjkim	mov	($bp,$i,8),$m0		# m0=bp[i]
165183234Ssimon	xor	$j,$j			# j=0
166238405Sjkim	mov	$n0,$m1
167238405Sjkim	mov	(%rsp),$lo0
168183234Ssimon	mulq	$m0			# ap[0]*bp[i]
169238405Sjkim	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
170238405Sjkim	mov	($np),%rax
171183234Ssimon	adc	\$0,%rdx
172238405Sjkim
173238405Sjkim	imulq	$lo0,$m1		# tp[0]*n0
174183234Ssimon	mov	%rdx,$hi0
175183234Ssimon
176238405Sjkim	mulq	$m1			# np[0]*m1
177238405Sjkim	add	%rax,$lo0		# discarded
178238405Sjkim	mov	8($ap),%rax
179238405Sjkim	adc	\$0,%rdx
180183234Ssimon	mov	8(%rsp),$lo0		# tp[1]
181183234Ssimon	mov	%rdx,$hi1
182183234Ssimon
183183234Ssimon	lea	1($j),$j		# j++
184238405Sjkim	jmp	.Linner_enter
185238405Sjkim
186238405Sjkim.align	16
187183234Ssimon.Linner:
188238405Sjkim	add	%rax,$hi1
189183234Ssimon	mov	($ap,$j,8),%rax
190238405Sjkim	adc	\$0,%rdx
191238405Sjkim	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
192238405Sjkim	mov	(%rsp,$j,8),$lo0
193238405Sjkim	adc	\$0,%rdx
194238405Sjkim	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
195238405Sjkim	mov	%rdx,$hi1
196238405Sjkim
197238405Sjkim.Linner_enter:
198183234Ssimon	mulq	$m0			# ap[j]*bp[i]
199238405Sjkim	add	%rax,$hi0
200183234Ssimon	mov	($np,$j,8),%rax
201183234Ssimon	adc	\$0,%rdx
202238405Sjkim	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
203183234Ssimon	mov	%rdx,$hi0
204238405Sjkim	adc	\$0,$hi0
205238405Sjkim	lea	1($j),$j		# j++
206183234Ssimon
207183234Ssimon	mulq	$m1			# np[j]*m1
208238405Sjkim	cmp	$num,$j
209238405Sjkim	jne	.Linner
210238405Sjkim
211238405Sjkim	add	%rax,$hi1
212238405Sjkim	mov	($ap),%rax		# ap[0]
213183234Ssimon	adc	\$0,%rdx
214238405Sjkim	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
215238405Sjkim	mov	(%rsp,$j,8),$lo0
216183234Ssimon	adc	\$0,%rdx
217238405Sjkim	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
218183234Ssimon	mov	%rdx,$hi1
219183234Ssimon
220183234Ssimon	xor	%rdx,%rdx
221183234Ssimon	add	$hi0,$hi1
222183234Ssimon	adc	\$0,%rdx
223183234Ssimon	add	$lo0,$hi1		# pull upmost overflow bit
224183234Ssimon	adc	\$0,%rdx
225183234Ssimon	mov	$hi1,-8(%rsp,$num,8)
226183234Ssimon	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
227183234Ssimon
228183234Ssimon	lea	1($i),$i		# i++
229183234Ssimon	cmp	$num,$i
230183234Ssimon	jl	.Louter
231183234Ssimon
# Final reduction: compute tp[] - np[] into rp[], then use the final
# borrow to select (branch-free, via mask) between tp[] and the
# subtracted result.  Note the CF discipline: lea/dec/mov inside .Lsub
# must not disturb the carry between sbb's.
232238405Sjkim	xor	$i,$i			# i=0 and clear CF!
233238405Sjkim	mov	(%rsp),%rax		# tp[0]
234183234Ssimon	lea	(%rsp),$ap		# borrow ap for tp
235238405Sjkim	mov	$num,$j			# j=num
236183234Ssimon	jmp	.Lsub
237183234Ssimon.align	16
238183234Ssimon.Lsub:	sbb	($np,$i,8),%rax
239183234Ssimon	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
240183234Ssimon	mov	8($ap,$i,8),%rax	# tp[i+1]
241183234Ssimon	lea	1($i),$i		# i++
242238405Sjkim	dec	$j			# doesn't affect CF!
243238405Sjkim	jnz	.Lsub
244183234Ssimon
245183234Ssimon	sbb	\$0,%rax		# handle upmost overflow bit
246238405Sjkim	xor	$i,$i
247183234Ssimon	and	%rax,$ap
248183234Ssimon	not	%rax
249183234Ssimon	mov	$rp,$np
250183234Ssimon	and	%rax,$np
251238405Sjkim	mov	$num,$j			# j=num
252183234Ssimon	or	$np,$ap			# ap=borrow?tp:rp
253183234Ssimon.align	16
254183234Ssimon.Lcopy:					# copy or in-place refresh
255238405Sjkim	mov	($ap,$i,8),%rax
256238405Sjkim	mov	$i,(%rsp,$i,8)		# zap temporary vector
257238405Sjkim	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
258238405Sjkim	lea	1($i),$i
259238405Sjkim	sub	\$1,$j
260238405Sjkim	jnz	.Lcopy
261238405Sjkim
# Epilogue: recover the caller's %rsp stashed at tp[num+1], restore
# callee-saved registers, return 1.
262238405Sjkim	mov	8(%rsp,$num,8),%rsi	# restore %rsp
263238405Sjkim	mov	\$1,%rax
264238405Sjkim	mov	(%rsi),%r15
265238405Sjkim	mov	8(%rsi),%r14
266238405Sjkim	mov	16(%rsi),%r13
267238405Sjkim	mov	24(%rsi),%r12
268238405Sjkim	mov	32(%rsi),%rbp
269238405Sjkim	mov	40(%rsi),%rbx
270238405Sjkim	lea	48(%rsi),%rsp
271238405Sjkim.Lmul_epilogue:
272238405Sjkim	ret
273238405Sjkim.size	bn_mul_mont,.-bn_mul_mont
274238405Sjkim___
# bn_mul4x_mont: 4x-unrolled Montgomery multiplication, reached from the
# bn_mul_mont dispatch when num is divisible by 4 and >= 8 and ap != bp.
# @A accumulates the ap[j]*bp[i] column, @N the np[j]*m1 column; the two
# registers in each pair alternate roles as the unrolled chain advances.
275238405Sjkim{{{
276238405Sjkimmy @A=("%r10","%r11");
277238405Sjkimmy @N=("%r13","%rdi");
278238405Sjkim$code.=<<___;
279238405Sjkim.type	bn_mul4x_mont,\@function,6
280238405Sjkim.align	16
281238405Sjkimbn_mul4x_mont:
282238405Sjkim.Lmul4x_enter:
283238405Sjkim	push	%rbx
284238405Sjkim	push	%rbp
285238405Sjkim	push	%r12
286238405Sjkim	push	%r13
287238405Sjkim	push	%r14
288238405Sjkim	push	%r15
289238405Sjkim
290238405Sjkim	mov	${num}d,${num}d
291238405Sjkim	lea	4($num),%r10
292238405Sjkim	mov	%rsp,%r11
293238405Sjkim	neg	%r10
294238405Sjkim	lea	(%rsp,%r10,8),%rsp	# tp=alloca(8*(num+4))
295238405Sjkim	and	\$-1024,%rsp		# minimize TLB usage
296238405Sjkim
297238405Sjkim	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
298238405Sjkim.Lmul4x_body:
# $rp is spilled to tp[num+2] because %rdi (= $rp) doubles as @N[1] in
# the unrolled loops below.
299238405Sjkim	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
300238405Sjkim	mov	%rdx,%r12		# reassign $bp
301238405Sjkim___
302238405Sjkim		$bp="%r12";
303238405Sjkim$code.=<<___;
# First outer iteration (i=0): tp[] = ap[]*bp[0] + np[]*m1, four words
# per inner iteration.
304238405Sjkim	mov	($n0),$n0		# pull n0[0] value
305238405Sjkim	mov	($bp),$m0		# m0=bp[0]
306238405Sjkim	mov	($ap),%rax
307238405Sjkim
308238405Sjkim	xor	$i,$i			# i=0
309238405Sjkim	xor	$j,$j			# j=0
310238405Sjkim
311238405Sjkim	mov	$n0,$m1
312238405Sjkim	mulq	$m0			# ap[0]*bp[0]
313238405Sjkim	mov	%rax,$A[0]
314238405Sjkim	mov	($np),%rax
315238405Sjkim
316238405Sjkim	imulq	$A[0],$m1		# "tp[0]"*n0
317238405Sjkim	mov	%rdx,$A[1]
318238405Sjkim
319238405Sjkim	mulq	$m1			# np[0]*m1
320238405Sjkim	add	%rax,$A[0]		# discarded
321238405Sjkim	mov	8($ap),%rax
322238405Sjkim	adc	\$0,%rdx
323238405Sjkim	mov	%rdx,$N[1]
324238405Sjkim
325238405Sjkim	mulq	$m0
326238405Sjkim	add	%rax,$A[1]
327238405Sjkim	mov	8($np),%rax
328238405Sjkim	adc	\$0,%rdx
329238405Sjkim	mov	%rdx,$A[0]
330238405Sjkim
331238405Sjkim	mulq	$m1
332238405Sjkim	add	%rax,$N[1]
333238405Sjkim	mov	16($ap),%rax
334238405Sjkim	adc	\$0,%rdx
335238405Sjkim	add	$A[1],$N[1]
336238405Sjkim	lea	4($j),$j		# j++
337238405Sjkim	adc	\$0,%rdx
338238405Sjkim	mov	$N[1],(%rsp)
339238405Sjkim	mov	%rdx,$N[0]
340238405Sjkim	jmp	.L1st4x
341238405Sjkim.align	16
342238405Sjkim.L1st4x:
343238405Sjkim	mulq	$m0			# ap[j]*bp[0]
344238405Sjkim	add	%rax,$A[0]
345238405Sjkim	mov	-16($np,$j,8),%rax
346238405Sjkim	adc	\$0,%rdx
347238405Sjkim	mov	%rdx,$A[1]
348238405Sjkim
349238405Sjkim	mulq	$m1			# np[j]*m1
350238405Sjkim	add	%rax,$N[0]
351238405Sjkim	mov	-8($ap,$j,8),%rax
352238405Sjkim	adc	\$0,%rdx
353238405Sjkim	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
354238405Sjkim	adc	\$0,%rdx
355238405Sjkim	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
356238405Sjkim	mov	%rdx,$N[1]
357238405Sjkim
358238405Sjkim	mulq	$m0			# ap[j]*bp[0]
359238405Sjkim	add	%rax,$A[1]
360238405Sjkim	mov	-8($np,$j,8),%rax
361238405Sjkim	adc	\$0,%rdx
362238405Sjkim	mov	%rdx,$A[0]
363238405Sjkim
364238405Sjkim	mulq	$m1			# np[j]*m1
365238405Sjkim	add	%rax,$N[1]
366183234Ssimon	mov	($ap,$j,8),%rax
367238405Sjkim	adc	\$0,%rdx
368238405Sjkim	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
369238405Sjkim	adc	\$0,%rdx
370238405Sjkim	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
371238405Sjkim	mov	%rdx,$N[0]
372238405Sjkim
373238405Sjkim	mulq	$m0			# ap[j]*bp[0]
374238405Sjkim	add	%rax,$A[0]
375238405Sjkim	mov	($np,$j,8),%rax
376238405Sjkim	adc	\$0,%rdx
377238405Sjkim	mov	%rdx,$A[1]
378238405Sjkim
379238405Sjkim	mulq	$m1			# np[j]*m1
380238405Sjkim	add	%rax,$N[0]
381238405Sjkim	mov	8($ap,$j,8),%rax
382238405Sjkim	adc	\$0,%rdx
383238405Sjkim	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
384238405Sjkim	adc	\$0,%rdx
385238405Sjkim	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
386238405Sjkim	mov	%rdx,$N[1]
387238405Sjkim
388238405Sjkim	mulq	$m0			# ap[j]*bp[0]
389238405Sjkim	add	%rax,$A[1]
390238405Sjkim	mov	8($np,$j,8),%rax
391238405Sjkim	adc	\$0,%rdx
392238405Sjkim	lea	4($j),$j		# j++
393238405Sjkim	mov	%rdx,$A[0]
394238405Sjkim
395238405Sjkim	mulq	$m1			# np[j]*m1
396238405Sjkim	add	%rax,$N[1]
397238405Sjkim	mov	-16($ap,$j,8),%rax
398238405Sjkim	adc	\$0,%rdx
399238405Sjkim	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
400238405Sjkim	adc	\$0,%rdx
401238405Sjkim	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
402238405Sjkim	mov	%rdx,$N[0]
403238405Sjkim	cmp	$num,$j
404238405Sjkim	jl	.L1st4x
405238405Sjkim
# Tail of the first pass: last two word-pairs plus the carry word stored
# past the end of tp[].
406238405Sjkim	mulq	$m0			# ap[j]*bp[0]
407238405Sjkim	add	%rax,$A[0]
408238405Sjkim	mov	-16($np,$j,8),%rax
409238405Sjkim	adc	\$0,%rdx
410238405Sjkim	mov	%rdx,$A[1]
411238405Sjkim
412238405Sjkim	mulq	$m1			# np[j]*m1
413238405Sjkim	add	%rax,$N[0]
414238405Sjkim	mov	-8($ap,$j,8),%rax
415238405Sjkim	adc	\$0,%rdx
416238405Sjkim	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
417238405Sjkim	adc	\$0,%rdx
418238405Sjkim	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
419238405Sjkim	mov	%rdx,$N[1]
420238405Sjkim
421238405Sjkim	mulq	$m0			# ap[j]*bp[0]
422238405Sjkim	add	%rax,$A[1]
423238405Sjkim	mov	-8($np,$j,8),%rax
424238405Sjkim	adc	\$0,%rdx
425238405Sjkim	mov	%rdx,$A[0]
426238405Sjkim
427238405Sjkim	mulq	$m1			# np[j]*m1
428238405Sjkim	add	%rax,$N[1]
429238405Sjkim	mov	($ap),%rax		# ap[0]
430238405Sjkim	adc	\$0,%rdx
431238405Sjkim	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
432238405Sjkim	adc	\$0,%rdx
433238405Sjkim	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
434238405Sjkim	mov	%rdx,$N[0]
435238405Sjkim
436238405Sjkim	xor	$N[1],$N[1]
437238405Sjkim	add	$A[0],$N[0]
438238405Sjkim	adc	\$0,$N[1]
439238405Sjkim	mov	$N[0],-8(%rsp,$j,8)
440238405Sjkim	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit
441238405Sjkim
442238405Sjkim	lea	1($i),$i		# i++
443238405Sjkim.align	4
# Outer loop, i=1..num-1: tp[] += ap[]*bp[i] + np[]*m1, four words per
# inner iteration, result shifted down one word (Montgomery reduction).
444238405Sjkim.Louter4x:
445238405Sjkim	mov	($bp,$i,8),$m0		# m0=bp[i]
446238405Sjkim	xor	$j,$j			# j=0
447238405Sjkim	mov	(%rsp),$A[0]
448238405Sjkim	mov	$n0,$m1
449238405Sjkim	mulq	$m0			# ap[0]*bp[i]
450238405Sjkim	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
451238405Sjkim	mov	($np),%rax
452238405Sjkim	adc	\$0,%rdx
453238405Sjkim
454238405Sjkim	imulq	$A[0],$m1		# tp[0]*n0
455238405Sjkim	mov	%rdx,$A[1]
456238405Sjkim
457238405Sjkim	mulq	$m1			# np[0]*m1
458238405Sjkim	add	%rax,$A[0]		# "$N[0]", discarded
459238405Sjkim	mov	8($ap),%rax
460238405Sjkim	adc	\$0,%rdx
461238405Sjkim	mov	%rdx,$N[1]
462238405Sjkim
463238405Sjkim	mulq	$m0			# ap[j]*bp[i]
464238405Sjkim	add	%rax,$A[1]
465238405Sjkim	mov	8($np),%rax
466238405Sjkim	adc	\$0,%rdx
467238405Sjkim	add	8(%rsp),$A[1]		# +tp[1]
468238405Sjkim	adc	\$0,%rdx
469238405Sjkim	mov	%rdx,$A[0]
470238405Sjkim
471238405Sjkim	mulq	$m1			# np[j]*m1
472238405Sjkim	add	%rax,$N[1]
473238405Sjkim	mov	16($ap),%rax
474238405Sjkim	adc	\$0,%rdx
475238405Sjkim	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
476238405Sjkim	lea	4($j),$j		# j+=2
477238405Sjkim	adc	\$0,%rdx
478238405Sjkim	mov	$N[1],(%rsp)		# tp[j-1]
479238405Sjkim	mov	%rdx,$N[0]
480238405Sjkim	jmp	.Linner4x
481238405Sjkim.align	16
482238405Sjkim.Linner4x:
483238405Sjkim	mulq	$m0			# ap[j]*bp[i]
484238405Sjkim	add	%rax,$A[0]
485238405Sjkim	mov	-16($np,$j,8),%rax
486238405Sjkim	adc	\$0,%rdx
487238405Sjkim	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
488238405Sjkim	adc	\$0,%rdx
489238405Sjkim	mov	%rdx,$A[1]
490238405Sjkim
491238405Sjkim	mulq	$m1			# np[j]*m1
492238405Sjkim	add	%rax,$N[0]
493238405Sjkim	mov	-8($ap,$j,8),%rax
494238405Sjkim	adc	\$0,%rdx
495238405Sjkim	add	$A[0],$N[0]
496238405Sjkim	adc	\$0,%rdx
497238405Sjkim	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
498238405Sjkim	mov	%rdx,$N[1]
499238405Sjkim
500238405Sjkim	mulq	$m0			# ap[j]*bp[i]
501238405Sjkim	add	%rax,$A[1]
502238405Sjkim	mov	-8($np,$j,8),%rax
503238405Sjkim	adc	\$0,%rdx
504238405Sjkim	add	-8(%rsp,$j,8),$A[1]
505238405Sjkim	adc	\$0,%rdx
506238405Sjkim	mov	%rdx,$A[0]
507238405Sjkim
508238405Sjkim	mulq	$m1			# np[j]*m1
509238405Sjkim	add	%rax,$N[1]
510238405Sjkim	mov	($ap,$j,8),%rax
511238405Sjkim	adc	\$0,%rdx
512238405Sjkim	add	$A[1],$N[1]
513238405Sjkim	adc	\$0,%rdx
514238405Sjkim	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
515238405Sjkim	mov	%rdx,$N[0]
516238405Sjkim
517238405Sjkim	mulq	$m0			# ap[j]*bp[i]
518238405Sjkim	add	%rax,$A[0]
519238405Sjkim	mov	($np,$j,8),%rax
520238405Sjkim	adc	\$0,%rdx
521238405Sjkim	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
522238405Sjkim	adc	\$0,%rdx
523238405Sjkim	mov	%rdx,$A[1]
524238405Sjkim
525238405Sjkim	mulq	$m1			# np[j]*m1
526238405Sjkim	add	%rax,$N[0]
527238405Sjkim	mov	8($ap,$j,8),%rax
528238405Sjkim	adc	\$0,%rdx
529238405Sjkim	add	$A[0],$N[0]
530238405Sjkim	adc	\$0,%rdx
531238405Sjkim	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
532238405Sjkim	mov	%rdx,$N[1]
533238405Sjkim
534238405Sjkim	mulq	$m0			# ap[j]*bp[i]
535238405Sjkim	add	%rax,$A[1]
536238405Sjkim	mov	8($np,$j,8),%rax
537238405Sjkim	adc	\$0,%rdx
538238405Sjkim	add	8(%rsp,$j,8),$A[1]
539238405Sjkim	adc	\$0,%rdx
540238405Sjkim	lea	4($j),$j		# j++
541238405Sjkim	mov	%rdx,$A[0]
542238405Sjkim
543238405Sjkim	mulq	$m1			# np[j]*m1
544238405Sjkim	add	%rax,$N[1]
545238405Sjkim	mov	-16($ap,$j,8),%rax
546238405Sjkim	adc	\$0,%rdx
547238405Sjkim	add	$A[1],$N[1]
548238405Sjkim	adc	\$0,%rdx
549238405Sjkim	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
550238405Sjkim	mov	%rdx,$N[0]
551238405Sjkim	cmp	$num,$j
552238405Sjkim	jl	.Linner4x
553238405Sjkim
# Tail of the outer iteration: last two word-pairs, fold in the previous
# carry word, store the new one.
554238405Sjkim	mulq	$m0			# ap[j]*bp[i]
555238405Sjkim	add	%rax,$A[0]
556238405Sjkim	mov	-16($np,$j,8),%rax
557238405Sjkim	adc	\$0,%rdx
558238405Sjkim	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
559238405Sjkim	adc	\$0,%rdx
560238405Sjkim	mov	%rdx,$A[1]
561238405Sjkim
562238405Sjkim	mulq	$m1			# np[j]*m1
563238405Sjkim	add	%rax,$N[0]
564238405Sjkim	mov	-8($ap,$j,8),%rax
565238405Sjkim	adc	\$0,%rdx
566238405Sjkim	add	$A[0],$N[0]
567238405Sjkim	adc	\$0,%rdx
568238405Sjkim	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
569238405Sjkim	mov	%rdx,$N[1]
570238405Sjkim
571238405Sjkim	mulq	$m0			# ap[j]*bp[i]
572238405Sjkim	add	%rax,$A[1]
573238405Sjkim	mov	-8($np,$j,8),%rax
574238405Sjkim	adc	\$0,%rdx
575238405Sjkim	add	-8(%rsp,$j,8),$A[1]
576238405Sjkim	adc	\$0,%rdx
577238405Sjkim	lea	1($i),$i		# i++
578238405Sjkim	mov	%rdx,$A[0]
579238405Sjkim
580238405Sjkim	mulq	$m1			# np[j]*m1
581238405Sjkim	add	%rax,$N[1]
582238405Sjkim	mov	($ap),%rax		# ap[0]
583238405Sjkim	adc	\$0,%rdx
584238405Sjkim	add	$A[1],$N[1]
585238405Sjkim	adc	\$0,%rdx
586238405Sjkim	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
587238405Sjkim	mov	%rdx,$N[0]
588238405Sjkim
589238405Sjkim	xor	$N[1],$N[1]
590238405Sjkim	add	$A[0],$N[0]
591238405Sjkim	adc	\$0,$N[1]
592238405Sjkim	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
593238405Sjkim	adc	\$0,$N[1]
594238405Sjkim	mov	$N[0],-8(%rsp,$j,8)
595238405Sjkim	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit
596238405Sjkim
597238405Sjkim	cmp	$num,$i
598238405Sjkim	jl	.Louter4x
599238405Sjkim___
# Final reduction: tp[] - np[] four words at a time, then a masked select
# between tp[] and the difference, and an SSE copy-back that also zeroes
# the temporary vector.  CF must survive across the mov/lea/dec inside
# .Lsub4x.
600238405Sjkim{
601238405Sjkimmy @ri=("%rax","%rdx",$m0,$m1);
602238405Sjkim$code.=<<___;
603238405Sjkim	mov	16(%rsp,$num,8),$rp	# restore $rp
604238405Sjkim	mov	0(%rsp),@ri[0]		# tp[0]
605238405Sjkim	pxor	%xmm0,%xmm0
606238405Sjkim	mov	8(%rsp),@ri[1]		# tp[1]
607238405Sjkim	shr	\$2,$num		# num/=4
608238405Sjkim	lea	(%rsp),$ap		# borrow ap for tp
609238405Sjkim	xor	$i,$i			# i=0 and clear CF!
610238405Sjkim
611238405Sjkim	sub	0($np),@ri[0]
612238405Sjkim	mov	16($ap),@ri[2]		# tp[2]
613238405Sjkim	mov	24($ap),@ri[3]		# tp[3]
614238405Sjkim	sbb	8($np),@ri[1]
615238405Sjkim	lea	-1($num),$j		# j=num/4-1
616238405Sjkim	jmp	.Lsub4x
617238405Sjkim.align	16
618238405Sjkim.Lsub4x:
619238405Sjkim	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
620238405Sjkim	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
621238405Sjkim	sbb	16($np,$i,8),@ri[2]
622238405Sjkim	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
623238405Sjkim	mov	40($ap,$i,8),@ri[1]
624238405Sjkim	sbb	24($np,$i,8),@ri[3]
625238405Sjkim	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
626238405Sjkim	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
627238405Sjkim	sbb	32($np,$i,8),@ri[0]
628238405Sjkim	mov	48($ap,$i,8),@ri[2]
629238405Sjkim	mov	56($ap,$i,8),@ri[3]
630238405Sjkim	sbb	40($np,$i,8),@ri[1]
631238405Sjkim	lea	4($i),$i		# i++
632238405Sjkim	dec	$j			# doesn't affect CF!
633238405Sjkim	jnz	.Lsub4x
634238405Sjkim
635238405Sjkim	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
636238405Sjkim	mov	32($ap,$i,8),@ri[0]	# load overflow bit
637238405Sjkim	sbb	16($np,$i,8),@ri[2]
638238405Sjkim	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
639238405Sjkim	sbb	24($np,$i,8),@ri[3]
640238405Sjkim	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
641238405Sjkim
642238405Sjkim	sbb	\$0,@ri[0]		# handle upmost overflow bit
643238405Sjkim	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
644238405Sjkim	xor	$i,$i			# i=0
645238405Sjkim	and	@ri[0],$ap
646238405Sjkim	not	@ri[0]
647238405Sjkim	mov	$rp,$np
648238405Sjkim	and	@ri[0],$np
649238405Sjkim	lea	-1($num),$j
650238405Sjkim	or	$np,$ap			# ap=borrow?tp:rp
651238405Sjkim
652238405Sjkim	movdqu	($ap),%xmm1
653238405Sjkim	movdqa	%xmm0,(%rsp)
654238405Sjkim	movdqu	%xmm1,($rp)
655238405Sjkim	jmp	.Lcopy4x
656238405Sjkim.align	16
657238405Sjkim.Lcopy4x:					# copy or in-place refresh
658238405Sjkim	movdqu	16($ap,$i),%xmm2
659238405Sjkim	movdqu	32($ap,$i),%xmm1
660238405Sjkim	movdqa	%xmm0,16(%rsp,$i)
661238405Sjkim	movdqu	%xmm2,16($rp,$i)
662238405Sjkim	movdqa	%xmm0,32(%rsp,$i)
663238405Sjkim	movdqu	%xmm1,32($rp,$i)
664238405Sjkim	lea	32($i),$i
665183234Ssimon	dec	$j
666238405Sjkim	jnz	.Lcopy4x
667183234Ssimon
668238405Sjkim	shl	\$2,$num
669238405Sjkim	movdqu	16($ap,$i),%xmm2
670238405Sjkim	movdqa	%xmm0,16(%rsp,$i)
671238405Sjkim	movdqu	%xmm2,16($rp,$i)
672238405Sjkim___
673238405Sjkim}
# Epilogue: recover the caller's %rsp stashed at tp[num+1], restore
# callee-saved registers, return 1.
674238405Sjkim$code.=<<___;
675238405Sjkim	mov	8(%rsp,$num,8),%rsi	# restore %rsp
676183234Ssimon	mov	\$1,%rax
677238405Sjkim	mov	(%rsi),%r15
678238405Sjkim	mov	8(%rsi),%r14
679238405Sjkim	mov	16(%rsi),%r13
680238405Sjkim	mov	24(%rsi),%r12
681238405Sjkim	mov	32(%rsi),%rbp
682238405Sjkim	mov	40(%rsi),%rbx
683238405Sjkim	lea	48(%rsi),%rsp
684238405Sjkim.Lmul4x_epilogue:
685238405Sjkim	ret
686238405Sjkim.size	bn_mul4x_mont,.-bn_mul4x_mont
687238405Sjkim___
688238405Sjkim}}}
689238405Sjkim{{{
690238405Sjkim######################################################################
691238405Sjkim# void bn_sqr4x_mont(
692238405Sjkimmy $rptr="%rdi";	# const BN_ULONG *rptr,
693238405Sjkimmy $aptr="%rsi";	# const BN_ULONG *aptr,
694238405Sjkimmy $bptr="%rdx";	# not used
695238405Sjkimmy $nptr="%rcx";	# const BN_ULONG *nptr,
696238405Sjkimmy $n0  ="%r8";		# const BN_ULONG *n0);
697238405Sjkimmy $num ="%r9";		# int num, has to be divisible by 4 and
698238405Sjkim			# not less than 8
699238405Sjkim
700238405Sjkimmy ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
701238405Sjkimmy @A0=("%r10","%r11");
702238405Sjkimmy @A1=("%r12","%r13");
703238405Sjkimmy ($a0,$a1,$ai)=("%r14","%r15","%rbx");
704238405Sjkim
705238405Sjkim$code.=<<___;
706238405Sjkim.type	bn_sqr4x_mont,\@function,6
707238405Sjkim.align	16
708238405Sjkimbn_sqr4x_mont:
709238405Sjkim.Lsqr4x_enter:
710238405Sjkim	push	%rbx
711238405Sjkim	push	%rbp
712238405Sjkim	push	%r12
713238405Sjkim	push	%r13
714238405Sjkim	push	%r14
715238405Sjkim	push	%r15
716238405Sjkim
717238405Sjkim	shl	\$3,${num}d		# convert $num to bytes
718238405Sjkim	xor	%r10,%r10
719238405Sjkim	mov	%rsp,%r11		# put aside %rsp
720238405Sjkim	sub	$num,%r10		# -$num
721238405Sjkim	mov	($n0),$n0		# *n0
722238405Sjkim	lea	-72(%rsp,%r10,2),%rsp	# alloca(frame+2*$num)
723238405Sjkim	and	\$-1024,%rsp		# minimize TLB usage
724238405Sjkim	##############################################################
725238405Sjkim	# Stack layout
726238405Sjkim	#
727238405Sjkim	# +0	saved $num, used in reduction section
728238405Sjkim	# +8	&t[2*$num], used in reduction section
729238405Sjkim	# +32	saved $rptr
730238405Sjkim	# +40	saved $nptr
731238405Sjkim	# +48	saved *n0
732238405Sjkim	# +56	saved %rsp
733238405Sjkim	# +64	t[2*$num]
734238405Sjkim	#
735238405Sjkim	mov	$rptr,32(%rsp)		# save $rptr
736238405Sjkim	mov	$nptr,40(%rsp)
737238405Sjkim	mov	$n0,  48(%rsp)
738238405Sjkim	mov	%r11, 56(%rsp)		# save original %rsp
739238405Sjkim.Lsqr4x_body:
740238405Sjkim	##############################################################
741238405Sjkim	# Squaring part:
742238405Sjkim	#
743238405Sjkim	# a) multiply-n-add everything but a[i]*a[i];
744238405Sjkim	# b) shift result of a) by 1 to the left and accumulate
745238405Sjkim	#    a[i]*a[i] products;
746238405Sjkim	#
747238405Sjkim	lea	32(%r10),$i		# $i=-($num-32)
748238405Sjkim	lea	($aptr,$num),$aptr	# end of a[] buffer, ($aptr,$i)=&ap[2]
749238405Sjkim
750238405Sjkim	mov	$num,$j			# $j=$num
751238405Sjkim
752238405Sjkim					# comments apply to $num==8 case
753238405Sjkim	mov	-32($aptr,$i),$a0	# a[0]
754238405Sjkim	lea	64(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
755238405Sjkim	mov	-24($aptr,$i),%rax	# a[1]
756238405Sjkim	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
757238405Sjkim	mov	-16($aptr,$i),$ai	# a[2]
758238405Sjkim	mov	%rax,$a1
759238405Sjkim
760238405Sjkim	mul	$a0			# a[1]*a[0]
761238405Sjkim	mov	%rax,$A0[0]		# a[1]*a[0]
762238405Sjkim	 mov	$ai,%rax		# a[2]
763238405Sjkim	mov	%rdx,$A0[1]
764238405Sjkim	mov	$A0[0],-24($tptr,$i)	# t[1]
765238405Sjkim
766238405Sjkim	xor	$A0[0],$A0[0]
767238405Sjkim	mul	$a0			# a[2]*a[0]
768238405Sjkim	add	%rax,$A0[1]
769238405Sjkim	 mov	$ai,%rax
770238405Sjkim	adc	%rdx,$A0[0]
771238405Sjkim	mov	$A0[1],-16($tptr,$i)	# t[2]
772238405Sjkim
773238405Sjkim	lea	-16($i),$j		# j=-16
774238405Sjkim
775238405Sjkim
776238405Sjkim	 mov	8($aptr,$j),$ai		# a[3]
777238405Sjkim	mul	$a1			# a[2]*a[1]
778238405Sjkim	mov	%rax,$A1[0]		# a[2]*a[1]+t[3]
779238405Sjkim	 mov	$ai,%rax
780238405Sjkim	mov	%rdx,$A1[1]
781238405Sjkim
782238405Sjkim	xor	$A0[1],$A0[1]
783238405Sjkim	add	$A1[0],$A0[0]
784238405Sjkim	 lea	16($j),$j
785238405Sjkim	adc	\$0,$A0[1]
786238405Sjkim	mul	$a0			# a[3]*a[0]
787238405Sjkim	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
788238405Sjkim	 mov	$ai,%rax
789238405Sjkim	adc	%rdx,$A0[1]
790238405Sjkim	mov	$A0[0],-8($tptr,$j)	# t[3]
791238405Sjkim	jmp	.Lsqr4x_1st
792238405Sjkim
793238405Sjkim.align	16
794238405Sjkim.Lsqr4x_1st:
795238405Sjkim	 mov	($aptr,$j),$ai		# a[4]
796238405Sjkim	xor	$A1[0],$A1[0]
797238405Sjkim	mul	$a1			# a[3]*a[1]
798238405Sjkim	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
799238405Sjkim	 mov	$ai,%rax
800238405Sjkim	adc	%rdx,$A1[0]
801238405Sjkim
802238405Sjkim	xor	$A0[0],$A0[0]
803238405Sjkim	add	$A1[1],$A0[1]
804238405Sjkim	adc	\$0,$A0[0]
805238405Sjkim	mul	$a0			# a[4]*a[0]
806238405Sjkim	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
807238405Sjkim	 mov	$ai,%rax		# a[3]
808238405Sjkim	adc	%rdx,$A0[0]
809238405Sjkim	mov	$A0[1],($tptr,$j)	# t[4]
810238405Sjkim
811238405Sjkim
812238405Sjkim	 mov	8($aptr,$j),$ai		# a[5]
813238405Sjkim	xor	$A1[1],$A1[1]
814238405Sjkim	mul	$a1			# a[4]*a[3]
815238405Sjkim	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
816238405Sjkim	 mov	$ai,%rax
817238405Sjkim	adc	%rdx,$A1[1]
818238405Sjkim
819238405Sjkim	xor	$A0[1],$A0[1]
820238405Sjkim	add	$A1[0],$A0[0]
821238405Sjkim	adc	\$0,$A0[1]
822238405Sjkim	mul	$a0			# a[5]*a[2]
823238405Sjkim	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
824238405Sjkim	 mov	$ai,%rax
825238405Sjkim	adc	%rdx,$A0[1]
826238405Sjkim	mov	$A0[0],8($tptr,$j)	# t[5]
827238405Sjkim
828238405Sjkim	 mov	16($aptr,$j),$ai	# a[6]
829238405Sjkim	xor	$A1[0],$A1[0]
830238405Sjkim	mul	$a1			# a[5]*a[3]
831238405Sjkim	add	%rax,$A1[1]		# a[5]*a[3]+t[6]
832238405Sjkim	 mov	$ai,%rax
833238405Sjkim	adc	%rdx,$A1[0]
834238405Sjkim
835238405Sjkim	xor	$A0[0],$A0[0]
836238405Sjkim	add	$A1[1],$A0[1]
837238405Sjkim	adc	\$0,$A0[0]
838238405Sjkim	mul	$a0			# a[6]*a[2]
839238405Sjkim	add	%rax,$A0[1]		# a[6]*a[2]+a[5]*a[3]+t[6]
840238405Sjkim	 mov	$ai,%rax		# a[3]
841238405Sjkim	adc	%rdx,$A0[0]
842238405Sjkim	mov	$A0[1],16($tptr,$j)	# t[6]
843238405Sjkim
844238405Sjkim
845238405Sjkim	 mov	24($aptr,$j),$ai	# a[7]
846238405Sjkim	xor	$A1[1],$A1[1]
847238405Sjkim	mul	$a1			# a[6]*a[5]
848238405Sjkim	add	%rax,$A1[0]		# a[6]*a[5]+t[7]
849238405Sjkim	 mov	$ai,%rax
850238405Sjkim	adc	%rdx,$A1[1]
851238405Sjkim
852238405Sjkim	xor	$A0[1],$A0[1]
853238405Sjkim	add	$A1[0],$A0[0]
854238405Sjkim	 lea	32($j),$j
855238405Sjkim	adc	\$0,$A0[1]
856238405Sjkim	mul	$a0			# a[7]*a[4]
857238405Sjkim	add	%rax,$A0[0]		# a[7]*a[4]+a[6]*a[5]+t[6]
858238405Sjkim	 mov	$ai,%rax
859238405Sjkim	adc	%rdx,$A0[1]
860238405Sjkim	mov	$A0[0],-8($tptr,$j)	# t[7]
861238405Sjkim
862238405Sjkim	cmp	\$0,$j
863238405Sjkim	jne	.Lsqr4x_1st
864238405Sjkim
865238405Sjkim	xor	$A1[0],$A1[0]
866238405Sjkim	add	$A0[1],$A1[1]
867238405Sjkim	adc	\$0,$A1[0]
868238405Sjkim	mul	$a1			# a[7]*a[5]
869238405Sjkim	add	%rax,$A1[1]
870238405Sjkim	adc	%rdx,$A1[0]
871238405Sjkim
872238405Sjkim	mov	$A1[1],($tptr)		# t[8]
873238405Sjkim	lea	16($i),$i
874238405Sjkim	mov	$A1[0],8($tptr)		# t[9]
875238405Sjkim	jmp	.Lsqr4x_outer
876238405Sjkim
877238405Sjkim.align	16
878238405Sjkim.Lsqr4x_outer:				# comments apply to $num==6 case
879238405Sjkim	mov	-32($aptr,$i),$a0	# a[0]
880238405Sjkim	lea	64(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
881238405Sjkim	mov	-24($aptr,$i),%rax	# a[1]
882238405Sjkim	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
883238405Sjkim	mov	-16($aptr,$i),$ai	# a[2]
884238405Sjkim	mov	%rax,$a1
885238405Sjkim
886238405Sjkim	mov	-24($tptr,$i),$A0[0]	# t[1]
887238405Sjkim	xor	$A0[1],$A0[1]
888238405Sjkim	mul	$a0			# a[1]*a[0]
889238405Sjkim	add	%rax,$A0[0]		# a[1]*a[0]+t[1]
890238405Sjkim	 mov	$ai,%rax		# a[2]
891238405Sjkim	adc	%rdx,$A0[1]
892238405Sjkim	mov	$A0[0],-24($tptr,$i)	# t[1]
893238405Sjkim
894238405Sjkim	xor	$A0[0],$A0[0]
895238405Sjkim	add	-16($tptr,$i),$A0[1]	# a[2]*a[0]+t[2]
896238405Sjkim	adc	\$0,$A0[0]
897238405Sjkim	mul	$a0			# a[2]*a[0]
898238405Sjkim	add	%rax,$A0[1]
899238405Sjkim	 mov	$ai,%rax
900238405Sjkim	adc	%rdx,$A0[0]
901238405Sjkim	mov	$A0[1],-16($tptr,$i)	# t[2]
902238405Sjkim
903238405Sjkim	lea	-16($i),$j		# j=-16
904238405Sjkim	xor	$A1[0],$A1[0]
905238405Sjkim
906238405Sjkim
907238405Sjkim	 mov	8($aptr,$j),$ai		# a[3]
908238405Sjkim	xor	$A1[1],$A1[1]
909238405Sjkim	add	8($tptr,$j),$A1[0]
910238405Sjkim	adc	\$0,$A1[1]
911238405Sjkim	mul	$a1			# a[2]*a[1]
912238405Sjkim	add	%rax,$A1[0]		# a[2]*a[1]+t[3]
913238405Sjkim	 mov	$ai,%rax
914238405Sjkim	adc	%rdx,$A1[1]
915238405Sjkim
916238405Sjkim	xor	$A0[1],$A0[1]
917238405Sjkim	add	$A1[0],$A0[0]
918238405Sjkim	adc	\$0,$A0[1]
919238405Sjkim	mul	$a0			# a[3]*a[0]
920238405Sjkim	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
921238405Sjkim	 mov	$ai,%rax
922238405Sjkim	adc	%rdx,$A0[1]
923238405Sjkim	mov	$A0[0],8($tptr,$j)	# t[3]
924238405Sjkim
925238405Sjkim	lea	16($j),$j
926238405Sjkim	jmp	.Lsqr4x_inner
927238405Sjkim
928238405Sjkim.align	16
929238405Sjkim.Lsqr4x_inner:
930238405Sjkim	 mov	($aptr,$j),$ai		# a[4]
931238405Sjkim	xor	$A1[0],$A1[0]
932238405Sjkim	add	($tptr,$j),$A1[1]
933238405Sjkim	adc	\$0,$A1[0]
934238405Sjkim	mul	$a1			# a[3]*a[1]
935238405Sjkim	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
936238405Sjkim	 mov	$ai,%rax
937238405Sjkim	adc	%rdx,$A1[0]
938238405Sjkim
939238405Sjkim	xor	$A0[0],$A0[0]
940238405Sjkim	add	$A1[1],$A0[1]
941238405Sjkim	adc	\$0,$A0[0]
942238405Sjkim	mul	$a0			# a[4]*a[0]
943238405Sjkim	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
944238405Sjkim	 mov	$ai,%rax		# a[3]
945238405Sjkim	adc	%rdx,$A0[0]
946238405Sjkim	mov	$A0[1],($tptr,$j)	# t[4]
947238405Sjkim
948238405Sjkim	 mov	8($aptr,$j),$ai		# a[5]
949238405Sjkim	xor	$A1[1],$A1[1]
950238405Sjkim	add	8($tptr,$j),$A1[0]
951238405Sjkim	adc	\$0,$A1[1]
952238405Sjkim	mul	$a1			# a[4]*a[3]
953238405Sjkim	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
954238405Sjkim	 mov	$ai,%rax
955238405Sjkim	adc	%rdx,$A1[1]
956238405Sjkim
957238405Sjkim	xor	$A0[1],$A0[1]
958238405Sjkim	add	$A1[0],$A0[0]
959238405Sjkim	lea	16($j),$j		# j++
960238405Sjkim	adc	\$0,$A0[1]
961238405Sjkim	mul	$a0			# a[5]*a[2]
962238405Sjkim	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
963238405Sjkim	 mov	$ai,%rax
964238405Sjkim	adc	%rdx,$A0[1]
965238405Sjkim	mov	$A0[0],-8($tptr,$j)	# t[5], "preloaded t[1]" below
966238405Sjkim
967238405Sjkim	cmp	\$0,$j
968238405Sjkim	jne	.Lsqr4x_inner
969238405Sjkim
970238405Sjkim	xor	$A1[0],$A1[0]
971238405Sjkim	add	$A0[1],$A1[1]
972238405Sjkim	adc	\$0,$A1[0]
973238405Sjkim	mul	$a1			# a[5]*a[3]
974238405Sjkim	add	%rax,$A1[1]
975238405Sjkim	adc	%rdx,$A1[0]
976238405Sjkim
977238405Sjkim	mov	$A1[1],($tptr)		# t[6], "preloaded t[2]" below
978238405Sjkim	mov	$A1[0],8($tptr)		# t[7], "preloaded t[3]" below
979238405Sjkim
980238405Sjkim	add	\$16,$i
981238405Sjkim	jnz	.Lsqr4x_outer
982238405Sjkim
983238405Sjkim					# comments apply to $num==4 case
984238405Sjkim	mov	-32($aptr),$a0		# a[0]
985238405Sjkim	lea	64(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
986238405Sjkim	mov	-24($aptr),%rax		# a[1]
987238405Sjkim	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
988238405Sjkim	mov	-16($aptr),$ai		# a[2]
989238405Sjkim	mov	%rax,$a1
990238405Sjkim
991238405Sjkim	xor	$A0[1],$A0[1]
992238405Sjkim	mul	$a0			# a[1]*a[0]
993238405Sjkim	add	%rax,$A0[0]		# a[1]*a[0]+t[1], preloaded t[1]
994238405Sjkim	 mov	$ai,%rax		# a[2]
995238405Sjkim	adc	%rdx,$A0[1]
996238405Sjkim	mov	$A0[0],-24($tptr)	# t[1]
997238405Sjkim
998238405Sjkim	xor	$A0[0],$A0[0]
999238405Sjkim	add	$A1[1],$A0[1]		# a[2]*a[0]+t[2], preloaded t[2]
1000238405Sjkim	adc	\$0,$A0[0]
1001238405Sjkim	mul	$a0			# a[2]*a[0]
1002238405Sjkim	add	%rax,$A0[1]
1003238405Sjkim	 mov	$ai,%rax
1004238405Sjkim	adc	%rdx,$A0[0]
1005238405Sjkim	mov	$A0[1],-16($tptr)	# t[2]
1006238405Sjkim
1007238405Sjkim	 mov	-8($aptr),$ai		# a[3]
1008238405Sjkim	mul	$a1			# a[2]*a[1]
1009238405Sjkim	add	%rax,$A1[0]		# a[2]*a[1]+t[3], preloaded t[3]
1010238405Sjkim	 mov	$ai,%rax
1011238405Sjkim	adc	\$0,%rdx
1012238405Sjkim
1013238405Sjkim	xor	$A0[1],$A0[1]
1014238405Sjkim	add	$A1[0],$A0[0]
1015238405Sjkim	 mov	%rdx,$A1[1]
1016238405Sjkim	adc	\$0,$A0[1]
1017238405Sjkim	mul	$a0			# a[3]*a[0]
1018238405Sjkim	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
1019238405Sjkim	 mov	$ai,%rax
1020238405Sjkim	adc	%rdx,$A0[1]
1021238405Sjkim	mov	$A0[0],-8($tptr)	# t[3]
1022238405Sjkim
1023238405Sjkim	xor	$A1[0],$A1[0]
1024238405Sjkim	add	$A0[1],$A1[1]
1025238405Sjkim	adc	\$0,$A1[0]
1026238405Sjkim	mul	$a1			# a[3]*a[1]
1027238405Sjkim	add	%rax,$A1[1]
1028238405Sjkim	 mov	-16($aptr),%rax		# a[2]
1029238405Sjkim	adc	%rdx,$A1[0]
1030238405Sjkim
1031238405Sjkim	mov	$A1[1],($tptr)		# t[4]
1032238405Sjkim	mov	$A1[0],8($tptr)		# t[5]
1033238405Sjkim
1034238405Sjkim	mul	$ai			# a[2]*a[3]
1035238405Sjkim___
{
# Shift-and-add phase of bn_sqr4x_mont: the tp[] vector accumulated so
# far holds the sum of cross products a[i]*a[j], i<j.  Here it is
# doubled in place (tp[] <<= 1) and the diagonal squares a[i]*a[i] are
# added on top, four output limbs per loop iteration.  $shift carries
# bit 63 of the previous odd limb into the next even limb (see the
# `shr \$63` / `lea ($shift,...,2)` pairs); $carry preserves the
# addition carry across iterations via the sbb/neg CF save/restore idiom.
my ($shift,$carry)=($a0,$a1);	# $a0/$a1 are free after the cross-product pass
my @S=(@A1,$ai,$n0);		# four scratch registers for the shifted limbs
$code.=<<___;
	 add	\$16,$i
	 xor	$shift,$shift
	 sub	$num,$i			# $i=16-$num
	 xor	$carry,$carry

	add	$A1[0],%rax		# t[5]
	adc	\$0,%rdx
	mov	%rax,8($tptr)		# t[5]
	mov	%rdx,16($tptr)		# t[6]
	mov	$carry,24($tptr)	# t[7]

	 mov	-16($aptr,$i),%rax	# a[0]
	lea	64(%rsp,$num,2),$tptr
	 xor	$A0[0],$A0[0]		# t[0]
	 mov	-24($tptr,$i,2),$A0[1]	# t[1]

	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[1]		# | t[2*i]>>63
	 mov	-16($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	 mov	-8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
	adc	%rax,$S[0]
	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
	mov	$S[0],-32($tptr,$i,2)
	adc	%rdx,$S[1]

	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
	 mov	$S[1],-24($tptr,$i,2)
	 sbb	$carry,$carry		# mov cf,$carry
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[3]		# | t[2*i]>>63
	 mov	0($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	 mov	8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
	adc	%rax,$S[2]
	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
	mov	$S[2],-16($tptr,$i,2)
	adc	%rdx,$S[3]
	lea	16($i),$i
	mov	$S[3],-40($tptr,$i,2)
	sbb	$carry,$carry		# mov cf,$carry
	jmp	.Lsqr4x_shift_n_add

.align	16
.Lsqr4x_shift_n_add:
	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[1]		# | t[2*i]>>63
	 mov	-16($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	 mov	-8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
	adc	%rax,$S[0]
	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
	mov	$S[0],-32($tptr,$i,2)
	adc	%rdx,$S[1]

	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
	 mov	$S[1],-24($tptr,$i,2)
	 sbb	$carry,$carry		# mov cf,$carry
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[3]		# | t[2*i]>>63
	 mov	0($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	 mov	8($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
	adc	%rax,$S[2]
	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
	mov	$S[2],-16($tptr,$i,2)
	adc	%rdx,$S[3]

	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
	 mov	$S[3],-8($tptr,$i,2)
	 sbb	$carry,$carry		# mov cf,$carry
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[1]		# | t[2*i]>>63
	 mov	16($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	 mov	24($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
	adc	%rax,$S[0]
	 mov	8($aptr,$i),%rax	# a[i+1]	# prefetch
	mov	$S[0],0($tptr,$i,2)
	adc	%rdx,$S[1]

	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
	 mov	$S[1],8($tptr,$i,2)
	 sbb	$carry,$carry		# mov cf,$carry
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[3]		# | t[2*i]>>63
	 mov	32($tptr,$i,2),$A0[0]	# t[2*i+2]	# prefetch
	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	 mov	40($tptr,$i,2),$A0[1]	# t[2*i+2+1]	# prefetch
	adc	%rax,$S[2]
	 mov	16($aptr,$i),%rax	# a[i+1]	# prefetch
	mov	$S[2],16($tptr,$i,2)
	adc	%rdx,$S[3]
	mov	$S[3],24($tptr,$i,2)
	sbb	$carry,$carry		# mov cf,$carry
	add	\$32,$i
	jnz	.Lsqr4x_shift_n_add

	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[1]		# | t[2*i]>>63
	 mov	-16($tptr),$A0[0]	# t[2*i+2]	# prefetch
	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	 mov	-8($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
	adc	%rax,$S[0]
	 mov	-8($aptr),%rax		# a[i+1]	# prefetch
	mov	$S[0],-32($tptr)
	adc	%rdx,$S[1]

	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1|shift
	 mov	$S[1],-24($tptr)
	 sbb	$carry,$carry		# mov cf,$carry
	shr	\$63,$A0[0]
	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
	shr	\$63,$A0[1]
	or	$A0[0],$S[3]		# | t[2*i]>>63
	mul	%rax			# a[i]*a[i]
	neg	$carry			# mov $carry,cf
	adc	%rax,$S[2]
	adc	%rdx,$S[3]
	mov	$S[2],-16($tptr)
	mov	$S[3],-8($tptr)
___
# End of shift-and-add: tp[0..2*$num-1] now holds the full square a*a.
}
1194238405Sjkim##############################################################
1195238405Sjkim# Montgomery reduction part, "word-by-word" algorithm.
1196238405Sjkim#
{
# Word-by-word Montgomery reduction of the 2*$num-limb square in tp[].
# Two reduction multipliers are kept in flight: $m0 = t[0]*n0 for the
# current outer iteration and $m1 computed one limb later, so the two
# n[]*m multiply streams interleave.  Lines tagged "modsched" hoist
# work for the *next* outer iteration into the tail of the current one.
my ($topbit,$nptr)=("%rbp",$aptr);	# $aptr is no longer needed; reuse it for n[]
my ($m0,$m1)=($a0,$a1);			# interleaved reduction multipliers
my @Ni=("%rbx","%r9");			# two-deep cache of n[] limbs
$code.=<<___;
	mov	40(%rsp),$nptr		# restore $nptr
	mov	48(%rsp),$n0		# restore *n0
	xor	$j,$j
	mov	$num,0(%rsp)		# save $num
	sub	$num,$j			# $j=-$num
	 mov	64(%rsp),$A0[0]		# t[0]		# modsched #
	 mov	$n0,$m0			#		# modsched #
	lea	64(%rsp,$num,2),%rax	# end of t[] buffer
	lea	64(%rsp,$num),$tptr	# end of t[] window
	mov	%rax,8(%rsp)		# save end of t[] buffer
	lea	($nptr,$num),$nptr	# end of n[] buffer
	xor	$topbit,$topbit		# $topbit=0

	mov	0($nptr,$j),%rax	# n[0]		# modsched #
	mov	8($nptr,$j),$Ni[1]	# n[1]		# modsched #
	 imulq	$A0[0],$m0		# m0=t[0]*n0	# modsched #
	 mov	%rax,$Ni[0]		#		# modsched #
	jmp	.Lsqr4x_mont_outer

.align	16
.Lsqr4x_mont_outer:
	xor	$A0[1],$A0[1]
	mul	$m0			# n[0]*m0
	add	%rax,$A0[0]		# n[0]*m0+t[0]
	 mov	$Ni[1],%rax
	adc	%rdx,$A0[1]
	mov	$n0,$m1

	xor	$A0[0],$A0[0]
	add	8($tptr,$j),$A0[1]
	adc	\$0,$A0[0]
	mul	$m0			# n[1]*m0
	add	%rax,$A0[1]		# n[1]*m0+t[1]
	 mov	$Ni[0],%rax
	adc	%rdx,$A0[0]

	imulq	$A0[1],$m1

	mov	16($nptr,$j),$Ni[0]	# n[2]
	xor	$A1[1],$A1[1]
	add	$A0[1],$A1[0]
	adc	\$0,$A1[1]
	mul	$m1			# n[0]*m1
	add	%rax,$A1[0]		# n[0]*m1+"t[1]"
	 mov	$Ni[0],%rax
	adc	%rdx,$A1[1]
	mov	$A1[0],8($tptr,$j)	# "t[1]"

	xor	$A0[1],$A0[1]
	add	16($tptr,$j),$A0[0]
	adc	\$0,$A0[1]
	mul	$m0			# n[2]*m0
	add	%rax,$A0[0]		# n[2]*m0+t[2]
	 mov	$Ni[1],%rax
	adc	%rdx,$A0[1]

	mov	24($nptr,$j),$Ni[1]	# n[3]
	xor	$A1[0],$A1[0]
	add	$A0[0],$A1[1]
	adc	\$0,$A1[0]
	mul	$m1			# n[1]*m1
	add	%rax,$A1[1]		# n[1]*m1+"t[2]"
	 mov	$Ni[1],%rax
	adc	%rdx,$A1[0]
	mov	$A1[1],16($tptr,$j)	# "t[2]"

	xor	$A0[0],$A0[0]
	add	24($tptr,$j),$A0[1]
	lea	32($j),$j
	adc	\$0,$A0[0]
	mul	$m0			# n[3]*m0
	add	%rax,$A0[1]		# n[3]*m0+t[3]
	 mov	$Ni[0],%rax
	adc	%rdx,$A0[0]
	jmp	.Lsqr4x_mont_inner

.align	16
.Lsqr4x_mont_inner:
	mov	($nptr,$j),$Ni[0]	# n[4]
	xor	$A1[1],$A1[1]
	add	$A0[1],$A1[0]
	adc	\$0,$A1[1]
	mul	$m1			# n[2]*m1
	add	%rax,$A1[0]		# n[2]*m1+"t[3]"
	 mov	$Ni[0],%rax
	adc	%rdx,$A1[1]
	mov	$A1[0],-8($tptr,$j)	# "t[3]"

	xor	$A0[1],$A0[1]
	add	($tptr,$j),$A0[0]
	adc	\$0,$A0[1]
	mul	$m0			# n[4]*m0
	add	%rax,$A0[0]		# n[4]*m0+t[4]
	 mov	$Ni[1],%rax
	adc	%rdx,$A0[1]

	mov	8($nptr,$j),$Ni[1]	# n[5]
	xor	$A1[0],$A1[0]
	add	$A0[0],$A1[1]
	adc	\$0,$A1[0]
	mul	$m1			# n[3]*m1
	add	%rax,$A1[1]		# n[3]*m1+"t[4]"
	 mov	$Ni[1],%rax
	adc	%rdx,$A1[0]
	mov	$A1[1],($tptr,$j)	# "t[4]"

	xor	$A0[0],$A0[0]
	add	8($tptr,$j),$A0[1]
	adc	\$0,$A0[0]
	mul	$m0			# n[5]*m0
	add	%rax,$A0[1]		# n[5]*m0+t[5]
	 mov	$Ni[0],%rax
	adc	%rdx,$A0[0]


	mov	16($nptr,$j),$Ni[0]	# n[6]
	xor	$A1[1],$A1[1]
	add	$A0[1],$A1[0]
	adc	\$0,$A1[1]
	mul	$m1			# n[4]*m1
	add	%rax,$A1[0]		# n[4]*m1+"t[5]"
	 mov	$Ni[0],%rax
	adc	%rdx,$A1[1]
	mov	$A1[0],8($tptr,$j)	# "t[5]"

	xor	$A0[1],$A0[1]
	add	16($tptr,$j),$A0[0]
	adc	\$0,$A0[1]
	mul	$m0			# n[6]*m0
	add	%rax,$A0[0]		# n[6]*m0+t[6]
	 mov	$Ni[1],%rax
	adc	%rdx,$A0[1]

	mov	24($nptr,$j),$Ni[1]	# n[7]
	xor	$A1[0],$A1[0]
	add	$A0[0],$A1[1]
	adc	\$0,$A1[0]
	mul	$m1			# n[5]*m1
	add	%rax,$A1[1]		# n[5]*m1+"t[6]"
	 mov	$Ni[1],%rax
	adc	%rdx,$A1[0]
	mov	$A1[1],16($tptr,$j)	# "t[6]"

	xor	$A0[0],$A0[0]
	add	24($tptr,$j),$A0[1]
	lea	32($j),$j
	adc	\$0,$A0[0]
	mul	$m0			# n[7]*m0
	add	%rax,$A0[1]		# n[7]*m0+t[7]
	 mov	$Ni[0],%rax
	adc	%rdx,$A0[0]
	cmp	\$0,$j
	jne	.Lsqr4x_mont_inner

	 sub	0(%rsp),$j		# $j=-$num	# modsched #
	 mov	$n0,$m0			#		# modsched #

	xor	$A1[1],$A1[1]
	add	$A0[1],$A1[0]
	adc	\$0,$A1[1]
	mul	$m1			# n[6]*m1
	add	%rax,$A1[0]		# n[6]*m1+"t[7]"
	mov	$Ni[1],%rax
	adc	%rdx,$A1[1]
	mov	$A1[0],-8($tptr)	# "t[7]"

	xor	$A0[1],$A0[1]
	add	($tptr),$A0[0]		# +t[8]
	adc	\$0,$A0[1]
	 mov	0($nptr,$j),$Ni[0]	# n[0]		# modsched #
	add	$topbit,$A0[0]
	adc	\$0,$A0[1]

	 imulq	16($tptr,$j),$m0	# m0=t[0]*n0	# modsched #
	xor	$A1[0],$A1[0]
	 mov	8($nptr,$j),$Ni[1]	# n[1]		# modsched #
	add	$A0[0],$A1[1]
	 mov	16($tptr,$j),$A0[0]	# t[0]		# modsched #
	adc	\$0,$A1[0]
	mul	$m1			# n[7]*m1
	add	%rax,$A1[1]		# n[7]*m1+"t[8]"
	 mov	$Ni[0],%rax		#		# modsched #
	adc	%rdx,$A1[0]
	mov	$A1[1],($tptr)		# "t[8]"

	xor	$topbit,$topbit
	add	8($tptr),$A1[0]		# +t[9]
	adc	$topbit,$topbit
	add	$A0[1],$A1[0]
	lea	16($tptr),$tptr		# "t[$num]>>128"
	adc	\$0,$topbit
	mov	$A1[0],-8($tptr)	# "t[9]"
	cmp	8(%rsp),$tptr		# are we done?
	jb	.Lsqr4x_mont_outer

	mov	0(%rsp),$num		# restore $num
	mov	$topbit,($tptr)		# save $topbit
___
# Reduced result now sits in the upper half of tp[], with the final
# carry recorded at ($tptr) for the post-condition step below.
}
1401238405Sjkim##############################################################
1402238405Sjkim# Post-condition, 4x unrolled copy from bn_mul_mont
1403238405Sjkim#
{
# Post-condition (borrowed from bn_mul_mont): compute tp - np with a
# 4x-unrolled sbb chain, use the final borrow to select tp or tp-np
# branchlessly (and/not/or mask trick), then copy the result to rp[]
# while zapping the temporary vector with SSE stores.  CF is live
# across the whole .Lsqr4x_sub loop ("dec $j doesn't affect CF!").
my ($tptr,$nptr)=("%rbx",$aptr);	# reuse freed registers for the copy loop
my @ri=("%rax","%rdx","%r10","%r11");	# four result limbs in flight
$code.=<<___;
	mov	64(%rsp,$num),@ri[0]	# tp[0]
	lea	64(%rsp,$num),$tptr	# upper half of t[2*$num] holds result
	mov	40(%rsp),$nptr		# restore $nptr
	shr	\$5,$num		# num/4
	mov	8($tptr),@ri[1]		# t[1]
	xor	$i,$i			# i=0 and clear CF!

	mov	32(%rsp),$rptr		# restore $rptr
	sub	0($nptr),@ri[0]
	mov	16($tptr),@ri[2]	# t[2]
	mov	24($tptr),@ri[3]	# t[3]
	sbb	8($nptr),@ri[1]
	lea	-1($num),$j		# j=num/4-1
	jmp	.Lsqr4x_sub
.align	16
.Lsqr4x_sub:
	mov	@ri[0],0($rptr,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[1],8($rptr,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	16($nptr,$i,8),@ri[2]
	mov	32($tptr,$i,8),@ri[0]	# tp[i+1]
	mov	40($tptr,$i,8),@ri[1]
	sbb	24($nptr,$i,8),@ri[3]
	mov	@ri[2],16($rptr,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[3],24($rptr,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	32($nptr,$i,8),@ri[0]
	mov	48($tptr,$i,8),@ri[2]
	mov	56($tptr,$i,8),@ri[3]
	sbb	40($nptr,$i,8),@ri[1]
	lea	4($i),$i		# i++
	dec	$j			# doesn't affect CF!
	jnz	.Lsqr4x_sub

	mov	@ri[0],0($rptr,$i,8)	# rp[i]=tp[i]-np[i]
	mov	32($tptr,$i,8),@ri[0]	# load overflow bit
	sbb	16($nptr,$i,8),@ri[2]
	mov	@ri[1],8($rptr,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	24($nptr,$i,8),@ri[3]
	mov	@ri[2],16($rptr,$i,8)	# rp[i]=tp[i]-np[i]

	sbb	\$0,@ri[0]		# handle upmost overflow bit
	mov	@ri[3],24($rptr,$i,8)	# rp[i]=tp[i]-np[i]
	xor	$i,$i			# i=0
	and	@ri[0],$tptr
	not	@ri[0]
	mov	$rptr,$nptr
	and	@ri[0],$nptr
	lea	-1($num),$j
	or	$nptr,$tptr		# tp=borrow?tp:rp

	pxor	%xmm0,%xmm0
	lea	64(%rsp,$num,8),$nptr
	movdqu	($tptr),%xmm1
	lea	($nptr,$num,8),$nptr
	movdqa	%xmm0,64(%rsp)		# zap lower half of temporary vector
	movdqa	%xmm0,($nptr)		# zap upper half of temporary vector
	movdqu	%xmm1,($rptr)
	jmp	.Lsqr4x_copy
.align	16
.Lsqr4x_copy:				# copy or in-place refresh
	movdqu	16($tptr,$i),%xmm2
	movdqu	32($tptr,$i),%xmm1
	movdqa	%xmm0,80(%rsp,$i)	# zap lower half of temporary vector
	movdqa	%xmm0,96(%rsp,$i)	# zap lower half of temporary vector
	movdqa	%xmm0,16($nptr,$i)	# zap upper half of temporary vector
	movdqa	%xmm0,32($nptr,$i)	# zap upper half of temporary vector
	movdqu	%xmm2,16($rptr,$i)
	movdqu	%xmm1,32($rptr,$i)
	lea	32($i),$i
	dec	$j
	jnz	.Lsqr4x_copy

	movdqu	16($tptr,$i),%xmm2
	movdqa	%xmm0,80(%rsp,$i)	# zap lower half of temporary vector
	movdqa	%xmm0,16($nptr,$i)	# zap upper half of temporary vector
	movdqu	%xmm2,16($rptr,$i)
___
}
# Common epilogue for bn_sqr4x_mont: restore the six callee-saved GP
# registers from the save area recorded at 56(%rsp), unwind the stack
# and return 1 (success).
$code.=<<___;
	mov	56(%rsp),%rsi		# restore %rsp
	mov	\$1,%rax
	mov	0(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lsqr4x_epilogue:
	ret
.size	bn_sqr4x_mont,.-bn_sqr4x_mont
___
1499238405Sjkim}}}
# Identification string embedded in the object file; note the escaped
# \@ so Perl does not interpolate inside the heredoc.
$code.=<<___;
.asciz	"Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___
1504238405Sjkim
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Win64 structured-exception-handling support: custom unwind handlers
# plus .pdata/.xdata tables, emitted only for Windows targets.  The
# handlers restore the callee-saved registers into the CONTEXT record
# when an exception unwinds through the mont routines; the numeric
# offsets (120, 152, 248, ...) are fixed CONTEXT/DISPATCHER_CONTEXT
# field offsets of the Microsoft x64 ABI.
if ($win64) {
# Handler argument registers per the Microsoft x64 calling convention.
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	mul_handler,\@abi-omnipotent
.align	16
mul_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	192($context),%r10	# pull $num
	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
	lea	48(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	jmp	.Lcommon_seh_tail
.size	mul_handler,.-mul_handler

.type	sqr_handler,\@abi-omnipotent
.align	16
sqr_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lsqr4x_body(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lsqr_body
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lsqr4x_epilogue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lsqr_epilogue
	jae	.Lcommon_seh_tail

	mov	56(%rax),%rax		# pull saved stack pointer
	lea	48(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	sqr_handler,.-sqr_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_bn_mul_mont
	.rva	.LSEH_end_bn_mul_mont
	.rva	.LSEH_info_bn_mul_mont

	.rva	.LSEH_begin_bn_mul4x_mont
	.rva	.LSEH_end_bn_mul4x_mont
	.rva	.LSEH_info_bn_mul4x_mont

	.rva	.LSEH_begin_bn_sqr4x_mont
	.rva	.LSEH_end_bn_sqr4x_mont
	.rva	.LSEH_info_bn_sqr4x_mont

.section	.xdata
.align	8
.LSEH_info_bn_mul_mont:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul_body,.Lmul_epilogue	# HandlerData[]
.LSEH_info_bn_mul4x_mont:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
.LSEH_info_bn_sqr4x_mont:
	.byte	9,0,0,0
	.rva	sqr_handler
___
}
# Emit the generated assembly.  Check the status of the final close:
# if STDOUT is a file on a full disk (or a broken pipe) the buffered
# output can be lost silently, leaving a truncated .s file for the
# assembler — fail loudly instead.
print $code;
close STDOUT or die "error closing STDOUT: $!";
1682