x86_64-mont.pl revision 312826
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# October 2005.
#
# Montgomery multiplication routine for x86_64. While it gives a modest
# 9% improvement for rsa4096 sign on Opteron, rsa512 sign runs more
# than twice (>2x) as fast. The most common case, rsa1024 sign, is
# improved by a respectable 50%. It remains to be seen whether loop
# unrolling and a dedicated squaring routine can provide further
# improvement...

# July 2011.
#
# Add dedicated squaring procedure. Performance improvement varies
# from platform to platform, but on average it's ~5%/15%/25%/33%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.

# August 2011.
#
# Unroll and modulo-schedule the inner loops in such a manner that they
# are "fallen through" for input length of 8, which is critical for
# 1024-bit RSA *sign*. Average performance improvement in comparison
# to the *initial* version of this module from 2005 is ~0%/30%/40%/45%
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.

# June 2013.
#
# Optimize reduction in squaring procedure and improve 1024+-bit RSA
# sign performance by 10-16% on Intel Sandy Bridge and later
# (virtually the same on non-Intel processors).

# August 2013.
#
# Add MULX/ADOX/ADCX code path.

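# Below is a minimal reference model of what the generated code computes:
# word-serial Montgomery multiplication over $num 64-bit limbs, returning
# ap*bp*2^(-64*num) mod np. It is purely illustrative (a sketch of the
# textbook algorithm, never called by this module) and uses Math::BigInt
# limbs to sidestep native 64-bit overflow; the name is arbitrary.
use Math::BigInt;

sub __mont_mul_reference {
	my ($a,$b,$n,$n0,$num) = @_;	# refs to arrays of Math::BigInt limbs,
					# $n0 = -$n->[0]^-1 mod 2^64 (Math::BigInt)
	my $mask = Math::BigInt->bone()->blsft(64)->bsub(1);
	my @t = map { Math::BigInt->bzero() } (0..$num);

	for (my $i=0; $i<$num; $i++) {
		my $hi = Math::BigInt->bzero();
		for (my $j=0; $j<$num; $j++) {		# t += ap[j]*bp[i]
			my $s = $t[$j] + $a->[$j] * $b->[$i] + $hi;
			$t[$j] = $s->copy()->band($mask);
			$hi = $s->brsft(64);
		}
		$t[$num] = $t[$num] + $hi;

		my $m = ($t[0] * $n0)->band($mask);	# t[0]+m*np[0]==0 mod 2^64
		$hi = Math::BigInt->bzero();
		for (my $j=0; $j<$num; $j++) {		# t += m*np[j]
			my $s = $t[$j] + $m * $n->[$j] + $hi;
			$t[$j] = $s->copy()->band($mask);
			$hi = $s->brsft(64);
		}
		$t[$num] = $t[$num] + $hi;

		shift @t;				# exact division by 2^64
		push @t, Math::BigInt->bzero();
	}

	# final conditional subtraction, cf. .Lsub/.Lcopy below
	my ($acc,$mod) = (Math::BigInt->bzero(),Math::BigInt->bzero());
	$acc->blsft(64)->badd($t[$_])   for (reverse 0..$num);
	$mod->blsft(64)->badd($n->[$_]) for (reverse 0..$num-1);
	$acc->bsub($mod) if ($acc->bcmp($mod) >= 0);
	return $acc;				# == ap*bp*2^(-64*num) mod np
}
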
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.23);
}

if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.10);
}

if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$addx = ($1>=12);
}

if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$addx = ($ver>=3.03);
}
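
# $addx records whether the assembler can encode MULX/ADCX/ADOX (BMI2+ADX):
# binutils 2.23+, nasm 2.10+, MSVC ml64 12+ or clang 3.3+, per the probes
# above. It only gates *emission* of the MULX code paths (bn_mulx4x_mont and
# the bn_sqrx8x_internal call); run-time dispatch still consults
# OPENSSL_ia32cap_P before branching to them.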

# int bn_mul_mont(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num);
$lo0="%r10";
$hi0="%r11";
$hi1="%r13";
$i="%r14";
$j="%r15";
$m0="%rbx";
$m1="%rbp";

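# That is, the full prototype is
#	int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
#			const BN_ULONG *np, const BN_ULONG *n0, int num);
# where np is the odd modulus of num 64-bit limbs, n0[0] holds the
# pre-computed -np[0]^-1 mod 2^64 and rp receives ap*bp*2^(-64*num) mod np.
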
$code=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	bn_mul_mont
.type	bn_mul_mont,\@function,6
.align	16
bn_mul_mont:
	mov	${num}d,${num}d
	mov	%rsp,%rax
	test	\$3,${num}d
	jnz	.Lmul_enter
	cmp	\$8,${num}d
	jb	.Lmul_enter
___
$code.=<<___ if ($addx);
	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
___
$code.=<<___;
	cmp	$ap,$bp
	jne	.Lmul4x_enter
	test	\$7,${num}d
	jz	.Lsqr8x_enter
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	neg	$num
	mov	%rsp,%r11
	lea	-16(%rsp,$num,8),%r10	# future alloca(8*(num+2))
	neg	$num			# restore $num
	and	\$-1024,%r10		# minimize TLB usage

	# Some OSes, *cough*-dows, insist on the stack being "wired" to
	# physical memory in a strictly sequential manner, i.e. if a stack
	# allocation spans two pages, then a reference to the farthest one
	# can be punished with SEGV. But page walking can do good even on
	# other OSes, because it guarantees that a rogue thread hits the
	# guard page before it can do damage to an innocent one...
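	#
	# The walk below touches one word per 4KB page between the old and
	# the new stack pointer, from the top down, so every page of the
	# freshly claimed area has been referenced before it is used.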
	sub	%r10,%r11
	and	\$-4096,%r11
	lea	(%r10,%r11),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

.align	16
.Lmul_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul_body:
	mov	$bp,%r12		# reassign $bp
___
		$bp="%r12";
$code.=<<___;
	mov	($n0),$n0		# pull n0[0] value
	mov	($bp),$m0		# m0=bp[0]
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$lo0
	mov	($np),%rax

	imulq	$lo0,$m1		# "tp[0]"*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.L1st_enter

.align	16
.L1st:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	mov	$lo0,$hi0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.L1st_enter:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	lea	1($j),$j		# j++
	mov	%rdx,$lo0

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.L1st

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1
	mov	$lo0,$hi0

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	jmp	.Louter
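	# Each pass of .Louter computes tp = (tp + ap*bp[i] + m1*np)/2^64,
	# with m1 = (tp[0]+ap[0]*bp[i])*n0 mod 2^64 chosen so that the
	# division is exact; the "upmost overflow bit" parked in tp[num]
	# is folded back in on the next pass (and ultimately consumed by
	# the final subtraction).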
.align	16
.Louter:
	mov	($bp,$i,8),$m0		# m0=bp[i]
	xor	$j,$j			# j=0
	mov	$n0,$m1
	mov	(%rsp),$lo0
	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	imulq	$lo0,$m1		# tp[0]*n0
	mov	%rdx,$hi0

	mulq	$m1			# np[0]*m1
	add	%rax,$lo0		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	8(%rsp),$lo0		# tp[1]
	mov	%rdx,$hi1

	lea	1($j),$j		# j++
	jmp	.Linner_enter

.align	16
.Linner:
	add	%rax,$hi1
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

.Linner_enter:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$hi0
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
	mov	%rdx,$hi0
	adc	\$0,$hi0
	lea	1($j),$j		# j++

	mulq	$m1			# np[j]*m1
	cmp	$num,$j
	jne	.Linner

	add	%rax,$hi1
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
	mov	(%rsp,$j,8),$lo0
	adc	\$0,%rdx
	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$hi1

	xor	%rdx,%rdx
	add	$hi0,$hi1
	adc	\$0,%rdx
	add	$lo0,$hi1		# pull upmost overflow bit
	adc	\$0,%rdx
	mov	$hi1,-8(%rsp,$num,8)
	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
	cmp	$num,$i
	jb	.Louter

	xor	$i,$i			# i=0 and clear CF!
	mov	(%rsp),%rax		# tp[0]
	lea	(%rsp),$ap		# borrow ap for tp
	mov	$num,$j			# j=num
	jmp	.Lsub
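	# Final reduction: subtract np from tp with borrow propagation. If
	# the subtraction borrows past the upmost overflow bit, the result
	# is tp itself, otherwise it is the difference already stored in rp.
	# .Lcopy makes that choice branch-free by masking the two source
	# pointers with the borrow, so the access pattern does not depend on
	# the data; it also zaps the temporary vector as it copies.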
.align	16
.Lsub:	sbb	($np,$i,8),%rax
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
	mov	8($ap,$i,8),%rax	# tp[i+1]
	lea	1($i),$i		# i++
	dec	$j			# doesn't affect CF!
	jnz	.Lsub

	sbb	\$0,%rax		# handle upmost overflow bit
	xor	$i,$i
	and	%rax,$ap
	not	%rax
	mov	$rp,$np
	and	%rax,$np
	mov	$num,$j			# j=num
	or	$np,$ap			# ap=borrow?tp:rp
.align	16
.Lcopy:					# copy or in-place refresh
	mov	($ap,$i,8),%rax
	mov	$i,(%rsp,$i,8)		# zap temporary vector
	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
	lea	1($i),$i
	sub	\$1,$j
	jnz	.Lcopy

	mov	8(%rsp,$num,8),%rsi	# restore %rsp
	mov	\$1,%rax
	mov	-48(%rsi),%r15
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lmul_epilogue:
	ret
.size	bn_mul_mont,.-bn_mul_mont
___
{{{
my @A=("%r10","%r11");
my @N=("%r13","%rdi");
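# @A[] and @N[] are pairs of registers used as alternating accumulators for
# the ap[j]*bp[i] and np[j]*m1 partial products; the unrolled body below
# switches between the two elements of each pair from one limb to the next,
# so a new multiply can be issued while the previous result is still being
# folded into tp[].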
$code.=<<___;
.type	bn_mul4x_mont,\@function,6
.align	16
bn_mul4x_mont:
	mov	${num}d,${num}d
	mov	%rsp,%rax
.Lmul4x_enter:
___
$code.=<<___ if ($addx);
	and	\$0x80100,%r11d
	cmp	\$0x80100,%r11d
	je	.Lmulx4x_enter
___
$code.=<<___;
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15

	neg	$num
	mov	%rsp,%r11
	lea	-32(%rsp,$num,8),%r10	# future alloca(8*(num+4))
	neg	$num			# restore
	and	\$-1024,%r10		# minimize TLB usage

	sub	%r10,%r11
	and	\$-4096,%r11
	lea	(%r10,%r11),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul4x_page_walk
	jmp	.Lmul4x_page_walk_done

.Lmul4x_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r11
	cmp	%r10,%rsp
	ja	.Lmul4x_page_walk
.Lmul4x_page_walk_done:

	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
.Lmul4x_body:
	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
	mov	%rdx,%r12		# reassign $bp
___
		$bp="%r12";
$code.=<<___;
	mov	($n0),$n0		# pull n0[0] value
	mov	($bp),$m0		# m0=bp[0]
	mov	($ap),%rax

	xor	$i,$i			# i=0
	xor	$j,$j			# j=0

	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[0]
	mov	%rax,$A[0]
	mov	($np),%rax

	imulq	$A[0],$m1		# "tp[0]"*n0
	mov	%rdx,$A[1]

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	lea	4($j),$j		# j++
	adc	\$0,%rdx
	mov	$N[1],(%rsp)
	mov	%rdx,$N[0]
	jmp	.L1st4x
.align	16
.L1st4x:
	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	lea	4($j),$j		# j++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jb	.L1st4x

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[0]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	lea	1($i),$i		# i++
.align	4
.Louter4x:
	mov	($bp,$i,8),$m0		# m0=bp[i]
	xor	$j,$j			# j=0
	mov	(%rsp),$A[0]
	mov	$n0,$m1
	mulq	$m0			# ap[0]*bp[i]
	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
	mov	($np),%rax
	adc	\$0,%rdx

	imulq	$A[0],$m1		# tp[0]*n0
	mov	%rdx,$A[1]

	mulq	$m1			# np[0]*m1
	add	%rax,$A[0]		# "$N[0]", discarded
	mov	8($ap),%rax
	adc	\$0,%rdx
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np),%rax
	adc	\$0,%rdx
	add	8(%rsp),$A[1]		# +tp[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	16($ap),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
	lea	4($j),$j		# j+=2
	adc	\$0,%rdx
	mov	$N[1],(%rsp)		# tp[j-1]
	mov	%rdx,$N[0]
	jmp	.Linner4x
.align	16
.Linner4x:
	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	($np,$j,8),%rax
	adc	\$0,%rdx
	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	8($np,$j,8),%rax
	adc	\$0,%rdx
	add	8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	4($j),$j		# j++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	-16($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]
	cmp	$num,$j
	jb	.Linner4x

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[0]
	mov	-16($np,$j,8),%rax
	adc	\$0,%rdx
	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
	adc	\$0,%rdx
	mov	%rdx,$A[1]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[0]
	mov	-8($ap,$j,8),%rax
	adc	\$0,%rdx
	add	$A[0],$N[0]
	adc	\$0,%rdx
	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[1]

	mulq	$m0			# ap[j]*bp[i]
	add	%rax,$A[1]
	mov	-8($np,$j,8),%rax
	adc	\$0,%rdx
	add	-8(%rsp,$j,8),$A[1]
	adc	\$0,%rdx
	lea	1($i),$i		# i++
	mov	%rdx,$A[0]

	mulq	$m1			# np[j]*m1
	add	%rax,$N[1]
	mov	($ap),%rax		# ap[0]
	adc	\$0,%rdx
	add	$A[1],$N[1]
	adc	\$0,%rdx
	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
	mov	%rdx,$N[0]

	xor	$N[1],$N[1]
	add	$A[0],$N[0]
	adc	\$0,$N[1]
	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
	adc	\$0,$N[1]
	mov	$N[0],-8(%rsp,$j,8)
	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit

	cmp	$num,$i
	jb	.Louter4x
___
{
my @ri=("%rax","%rdx",$m0,$m1);
$code.=<<___;
	mov	16(%rsp,$num,8),$rp	# restore $rp
	mov	0(%rsp),@ri[0]		# tp[0]
	pxor	%xmm0,%xmm0
	mov	8(%rsp),@ri[1]		# tp[1]
	shr	\$2,$num		# num/=4
	lea	(%rsp),$ap		# borrow ap for tp
	xor	$i,$i			# i=0 and clear CF!

	sub	0($np),@ri[0]
	mov	16($ap),@ri[2]		# tp[2]
	mov	24($ap),@ri[3]		# tp[3]
	sbb	8($np),@ri[1]
	lea	-1($num),$j		# j=num/4-1
	jmp	.Lsub4x
.align	16
.Lsub4x:
	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	16($np,$i,8),@ri[2]
	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
	mov	40($ap,$i,8),@ri[1]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	32($np,$i,8),@ri[0]
	mov	48($ap,$i,8),@ri[2]
	mov	56($ap,$i,8),@ri[3]
	sbb	40($np,$i,8),@ri[1]
	lea	4($i),$i		# i++
	dec	$j			# doesn't affect CF!
	jnz	.Lsub4x

	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
	mov	32($ap,$i,8),@ri[0]	# load overflow bit
	sbb	16($np,$i,8),@ri[2]
	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
	sbb	24($np,$i,8),@ri[3]
	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]

	sbb	\$0,@ri[0]		# handle upmost overflow bit
	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
	xor	$i,$i			# i=0
	and	@ri[0],$ap
	not	@ri[0]
	mov	$rp,$np
	and	@ri[0],$np
	lea	-1($num),$j
	or	$np,$ap			# ap=borrow?tp:rp

	movdqu	($ap),%xmm1
	movdqa	%xmm0,(%rsp)
	movdqu	%xmm1,($rp)
	jmp	.Lcopy4x
.align	16
.Lcopy4x:					# copy or in-place refresh
	movdqu	16($ap,$i),%xmm2
	movdqu	32($ap,$i),%xmm1
	movdqa	%xmm0,16(%rsp,$i)
	movdqu	%xmm2,16($rp,$i)
	movdqa	%xmm0,32(%rsp,$i)
	movdqu	%xmm1,32($rp,$i)
	lea	32($i),$i
	dec	$j
	jnz	.Lcopy4x

	shl	\$2,$num
	movdqu	16($ap,$i),%xmm2
	movdqa	%xmm0,16(%rsp,$i)
	movdqu	%xmm2,16($rp,$i)
___
}
$code.=<<___;
	mov	8(%rsp,$num,8),%rsi	# restore %rsp
	mov	\$1,%rax
	mov	-48(%rsi),%r15
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lmul4x_epilogue:
	ret
.size	bn_mul4x_mont,.-bn_mul4x_mont
___
}}}
{{{
######################################################################
# void bn_sqr8x_mont(
my $rptr="%rdi";	# const BN_ULONG *rptr,
my $aptr="%rsi";	# const BN_ULONG *aptr,
my $bptr="%rdx";	# not used
my $nptr="%rcx";	# const BN_ULONG *nptr,
my $n0  ="%r8";		# const BN_ULONG *n0);
my $num ="%r9";		# int num, has to be divisible by 8

my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
my @A0=("%r10","%r11");
my @A1=("%r12","%r13");
my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
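
# bn_sqr8x_mont itself only sets up the stack frame, dispatches and performs
# the final subtraction/copy; the actual squaring and reduction are done by
# bn_sqr8x_internal (or bn_sqrx8x_internal on ADCX/ADOX-capable CPUs), both
# provided by the x86_64-mont5 module, which leave the 2*num-limb result in
# t[] on the stack.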

$code.=<<___	if ($addx);
.extern	bn_sqrx8x_internal		# see x86_64-mont5 module
___
$code.=<<___;
.extern	bn_sqr8x_internal		# see x86_64-mont5 module

.type	bn_sqr8x_mont,\@function,6
.align	32
bn_sqr8x_mont:
	mov	%rsp,%rax
.Lsqr8x_enter:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
.Lsqr8x_prologue:

	mov	${num}d,%r10d
	shl	\$3,${num}d		# convert $num to bytes
	shl	\$3+2,%r10		# 4*$num
	neg	$num

	##############################################################
	# ensure that stack frame doesn't alias with $aptr modulo
	# 4096. this is done to allow memory disambiguation logic
	# do its job.
	#
	lea	-64(%rsp,$num,2),%r11
	mov	%rsp,%rbp
	mov	($n0),$n0		# *n0
	sub	$aptr,%r11
	and	\$4095,%r11
	cmp	%r11,%r10
	jb	.Lsqr8x_sp_alt
	sub	%r11,%rbp		# align with $aptr
	lea	-64(%rbp,$num,2),%rbp	# future alloca(frame+2*$num)
	jmp	.Lsqr8x_sp_done

.align	32
.Lsqr8x_sp_alt:
	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
	lea	-64(%rbp,$num,2),%rbp	# future alloca(frame+2*$num)
	sub	%r10,%r11
	mov	\$0,%r10
	cmovc	%r10,%r11
	sub	%r11,%rbp
.Lsqr8x_sp_done:
	and	\$-64,%rbp
	mov	%rsp,%r11
	sub	%rbp,%r11
	and	\$-4096,%r11
	lea	(%rbp,%r11),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lsqr8x_page_walk
	jmp	.Lsqr8x_page_walk_done

.align	16
.Lsqr8x_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lsqr8x_page_walk
.Lsqr8x_page_walk_done:

	mov	$num,%r10
	neg	$num

	mov	$n0,  32(%rsp)
	mov	%rax, 40(%rsp)		# save original %rsp
.Lsqr8x_body:

	movq	$nptr, %xmm2		# save pointer to modulus
	pxor	%xmm0,%xmm0
	movq	$rptr,%xmm1		# save $rptr
	movq	%r10, %xmm3		# -$num
___
$code.=<<___ if ($addx);
	mov	OPENSSL_ia32cap_P+8(%rip),%eax
	and	\$0x80100,%eax
	cmp	\$0x80100,%eax
	jne	.Lsqr8x_nox

	call	bn_sqrx8x_internal	# see x86_64-mont5 module
					# %rax	top-most carry
					# %rbp	nptr
					# %rcx	-8*num
					# %r8	end of tp[2*num]
	lea	(%r8,%rcx),%rbx
	mov	%rcx,$num
	mov	%rcx,%rdx
	movq	%xmm1,$rptr
	sar	\$3+2,%rcx		# %cf=0
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_nox:
___
$code.=<<___;
	call	bn_sqr8x_internal	# see x86_64-mont5 module
					# %rax	top-most carry
					# %rbp	nptr
					# %r8	-8*num
					# %rdi	end of tp[2*num]
	lea	(%rdi,$num),%rbx
	mov	$num,%rcx
	mov	$num,%rdx
	movq	%xmm1,$rptr
	sar	\$3+2,%rcx		# %cf=0
	jmp	.Lsqr8x_sub

.align	32
.Lsqr8x_sub:
	mov	8*0(%rbx),%r12
	mov	8*1(%rbx),%r13
	mov	8*2(%rbx),%r14
	mov	8*3(%rbx),%r15
	lea	8*4(%rbx),%rbx
	sbb	8*0(%rbp),%r12
	sbb	8*1(%rbp),%r13
	sbb	8*2(%rbp),%r14
	sbb	8*3(%rbp),%r15
	lea	8*4(%rbp),%rbp
	mov	%r12,8*0($rptr)
	mov	%r13,8*1($rptr)
	mov	%r14,8*2($rptr)
	mov	%r15,8*3($rptr)
	lea	8*4($rptr),$rptr
	inc	%rcx			# preserves %cf
	jnz	.Lsqr8x_sub

	sbb	\$0,%rax		# top-most carry
	lea	(%rbx,$num),%rbx	# rewind
	lea	($rptr,$num),$rptr	# rewind

	movq	%rax,%xmm1
	pxor	%xmm0,%xmm0
	pshufd	\$0,%xmm1,%xmm1
	mov	40(%rsp),%rsi		# restore %rsp
	jmp	.Lsqr8x_cond_copy
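
	# %xmm1 now broadcasts the top-most borrow: all ones selects the
	# unreduced t[] (via %rbx), all zeros selects the subtracted copy
	# already written to rp.  The copy loop below also zeroes t[] as it
	# goes, so no intermediate values are left behind on the stack.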

.align	32
.Lsqr8x_cond_copy:
	movdqa	16*0(%rbx),%xmm2
	movdqa	16*1(%rbx),%xmm3
	lea	16*2(%rbx),%rbx
	movdqu	16*0($rptr),%xmm4
	movdqu	16*1($rptr),%xmm5
	lea	16*2($rptr),$rptr
	movdqa	%xmm0,-16*2(%rbx)	# zero tp
	movdqa	%xmm0,-16*1(%rbx)
	movdqa	%xmm0,-16*2(%rbx,%rdx)
	movdqa	%xmm0,-16*1(%rbx,%rdx)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-16*2($rptr)
	movdqu	%xmm5,-16*1($rptr)
	add	\$32,$num
	jnz	.Lsqr8x_cond_copy

	mov	\$1,%rax
	mov	-48(%rsi),%r15
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lsqr8x_epilogue:
	ret
.size	bn_sqr8x_mont,.-bn_sqr8x_mont
___
}}}

if ($addx) {{{
my $bp="%rdx";	# original value

$code.=<<___;
.type	bn_mulx4x_mont,\@function,6
.align	32
bn_mulx4x_mont:
	mov	%rsp,%rax
.Lmulx4x_enter:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
.Lmulx4x_prologue:

	shl	\$3,${num}d		# convert $num to bytes
	xor	%r10,%r10
	sub	$num,%r10		# -$num
	mov	($n0),$n0		# *n0
	lea	-72(%rsp,%r10),%rbp	# future alloca(frame+$num+8)
	and	\$-128,%rbp
	mov	%rsp,%r11
	sub	%rbp,%r11
	and	\$-4096,%r11
	lea	(%rbp,%r11),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lmulx4x_page_walk
	jmp	.Lmulx4x_page_walk_done

.align	16
.Lmulx4x_page_walk:
	lea	-4096(%rsp),%rsp
	mov	(%rsp),%r10
	cmp	%rbp,%rsp
	ja	.Lmulx4x_page_walk
.Lmulx4x_page_walk_done:

	lea	($bp,$num),%r10
	##############################################################
	# Stack layout
	# +0	num
	# +8	off-loaded &b[i]
	# +16	end of b[num]
	# +24	saved n0
	# +32	saved rp
	# +40	saved %rsp
	# +48	inner counter
	# +56
	# +64	tmp[num+1]
	#
	mov	$num,0(%rsp)		# save $num
	shr	\$5,$num
	mov	%r10,16(%rsp)		# end of b[num]
	sub	\$1,$num
	mov	$n0, 24(%rsp)		# save *n0
	mov	$rp, 32(%rsp)		# save $rp
	mov	%rax,40(%rsp)		# save original %rsp
	mov	$num,48(%rsp)		# inner counter
	jmp	.Lmulx4x_body

.align	32
.Lmulx4x_body:
___
my ($aptr, $bptr, $nptr, $tptr, $mi,  $bi,  $zero, $num)=
   ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
my $rptr=$bptr;
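# The MULX/ADCX/ADOX code keeps two independent carry chains in flight:
# MULX multiplies without touching the flags, ADCX adds using only CF and
# ADOX adds using only OF, so the ap[j]*b[i] accumulation and the np[j]*m1
# reduction can be interleaved without spilling and reloading carries.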
$code.=<<___;
	lea	8($bp),$bptr
	mov	($bp),%rdx		# b[0], $bp==%rdx actually
	lea	64+32(%rsp),$tptr
	mov	%rdx,$bi

	mulx	0*8($aptr),$mi,%rax	# a[0]*b[0]
	mulx	1*8($aptr),%r11,%r14	# a[1]*b[0]
	add	%rax,%r11
	mov	$bptr,8(%rsp)		# off-load &b[i]
	mulx	2*8($aptr),%r12,%r13	# ...
	adc	%r14,%r12
	adc	\$0,%r13

	mov	$mi,$bptr		# borrow $bptr
	imulq	24(%rsp),$mi		# "t[0]"*n0
	xor	$zero,$zero		# cf=0, of=0

	mulx	3*8($aptr),%rax,%r14
	 mov	$mi,%rdx
	lea	4*8($aptr),$aptr
	adcx	%rax,%r13
	adcx	$zero,%r14		# cf=0

	mulx	0*8($nptr),%rax,%r10
	adcx	%rax,$bptr		# discarded
	adox	%r11,%r10
	mulx	1*8($nptr),%rax,%r11
	adcx	%rax,%r10
	adox	%r12,%r11
	.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00	# mulx	2*8($nptr),%rax,%r12
	mov	48(%rsp),$bptr		# counter value
	mov	%r10,-4*8($tptr)
	adcx	%rax,%r11
	adox	%r13,%r12
	mulx	3*8($nptr),%rax,%r15
	 mov	$bi,%rdx
	mov	%r11,-3*8($tptr)
	adcx	%rax,%r12
	adox	$zero,%r15		# of=0
	lea	4*8($nptr),$nptr
	mov	%r12,-2*8($tptr)

	jmp	.Lmulx4x_1st

.align	32
.Lmulx4x_1st:
	adcx	$zero,%r15		# cf=0, modulo-scheduled
	mulx	0*8($aptr),%r10,%rax	# a[4]*b[0]
	adcx	%r14,%r10
	mulx	1*8($aptr),%r11,%r14	# a[5]*b[0]
	adcx	%rax,%r11
	mulx	2*8($aptr),%r12,%rax	# ...
	adcx	%r14,%r12
	mulx	3*8($aptr),%r13,%r14
	 .byte	0x67,0x67
	 mov	$mi,%rdx
	adcx	%rax,%r13
	adcx	$zero,%r14		# cf=0
	lea	4*8($aptr),$aptr
	lea	4*8($tptr),$tptr

	adox	%r15,%r10
	mulx	0*8($nptr),%rax,%r15
	adcx	%rax,%r10
	adox	%r15,%r11
	mulx	1*8($nptr),%rax,%r15
	adcx	%rax,%r11
	adox	%r15,%r12
	mulx	2*8($nptr),%rax,%r15
	mov	%r10,-5*8($tptr)
	adcx	%rax,%r12
	mov	%r11,-4*8($tptr)
	adox	%r15,%r13
	mulx	3*8($nptr),%rax,%r15
	 mov	$bi,%rdx
	mov	%r12,-3*8($tptr)
	adcx	%rax,%r13
	adox	$zero,%r15
	lea	4*8($nptr),$nptr
	mov	%r13,-2*8($tptr)

	dec	$bptr			# of=0, pass cf
	jnz	.Lmulx4x_1st

	mov	0(%rsp),$num		# load num
	mov	8(%rsp),$bptr		# re-load &b[i]
	adc	$zero,%r15		# modulo-scheduled
	add	%r15,%r14
	sbb	%r15,%r15		# top-most carry
	mov	%r14,-1*8($tptr)
	jmp	.Lmulx4x_outer

.align	32
.Lmulx4x_outer:
	mov	($bptr),%rdx		# b[i]
	lea	8($bptr),$bptr		# b++
	sub	$num,$aptr		# rewind $aptr
	mov	%r15,($tptr)		# save top-most carry
	lea	64+4*8(%rsp),$tptr
	sub	$num,$nptr		# rewind $nptr

	mulx	0*8($aptr),$mi,%r11	# a[0]*b[i]
	xor	%ebp,%ebp		# xor	$zero,$zero	# cf=0, of=0
	mov	%rdx,$bi
	mulx	1*8($aptr),%r14,%r12	# a[1]*b[i]
	adox	-4*8($tptr),$mi
	adcx	%r14,%r11
	mulx	2*8($aptr),%r15,%r13	# ...
	adox	-3*8($tptr),%r11
	adcx	%r15,%r12
	adox	-2*8($tptr),%r12
	adcx	$zero,%r13
	adox	$zero,%r13

	mov	$bptr,8(%rsp)		# off-load &b[i]
	mov	$mi,%r15
	imulq	24(%rsp),$mi		# "t[0]"*n0
	xor	%ebp,%ebp		# xor	$zero,$zero	# cf=0, of=0

	mulx	3*8($aptr),%rax,%r14
	 mov	$mi,%rdx
	adcx	%rax,%r13
	adox	-1*8($tptr),%r13
	adcx	$zero,%r14
	lea	4*8($aptr),$aptr
	adox	$zero,%r14

	mulx	0*8($nptr),%rax,%r10
	adcx	%rax,%r15		# discarded
	adox	%r11,%r10
	mulx	1*8($nptr),%rax,%r11
	adcx	%rax,%r10
	adox	%r12,%r11
	mulx	2*8($nptr),%rax,%r12
	mov	%r10,-4*8($tptr)
	adcx	%rax,%r11
	adox	%r13,%r12
	mulx	3*8($nptr),%rax,%r15
	 mov	$bi,%rdx
	mov	%r11,-3*8($tptr)
	lea	4*8($nptr),$nptr
	adcx	%rax,%r12
	adox	$zero,%r15		# of=0
	mov	48(%rsp),$bptr		# counter value
	mov	%r12,-2*8($tptr)

	jmp	.Lmulx4x_inner

.align	32
.Lmulx4x_inner:
	mulx	0*8($aptr),%r10,%rax	# a[4]*b[i]
	adcx	$zero,%r15		# cf=0, modulo-scheduled
	adox	%r14,%r10
	mulx	1*8($aptr),%r11,%r14	# a[5]*b[i]
	adcx	0*8($tptr),%r10
	adox	%rax,%r11
	mulx	2*8($aptr),%r12,%rax	# ...
	adcx	1*8($tptr),%r11
	adox	%r14,%r12
	mulx	3*8($aptr),%r13,%r14
	 mov	$mi,%rdx
	adcx	2*8($tptr),%r12
	adox	%rax,%r13
	adcx	3*8($tptr),%r13
	adox	$zero,%r14		# of=0
	lea	4*8($aptr),$aptr
	lea	4*8($tptr),$tptr
	adcx	$zero,%r14		# cf=0

	adox	%r15,%r10
	mulx	0*8($nptr),%rax,%r15
	adcx	%rax,%r10
	adox	%r15,%r11
	mulx	1*8($nptr),%rax,%r15
	adcx	%rax,%r11
	adox	%r15,%r12
	mulx	2*8($nptr),%rax,%r15
	mov	%r10,-5*8($tptr)
	adcx	%rax,%r12
	adox	%r15,%r13
	mulx	3*8($nptr),%rax,%r15
	 mov	$bi,%rdx
	mov	%r11,-4*8($tptr)
	mov	%r12,-3*8($tptr)
	adcx	%rax,%r13
	adox	$zero,%r15
	lea	4*8($nptr),$nptr
	mov	%r13,-2*8($tptr)

	dec	$bptr			# of=0, pass cf
	jnz	.Lmulx4x_inner

	mov	0(%rsp),$num		# load num
	mov	8(%rsp),$bptr		# re-load &b[i]
	adc	$zero,%r15		# modulo-scheduled
	sub	0*8($tptr),$zero	# pull top-most carry
	adc	%r15,%r14
	sbb	%r15,%r15		# top-most carry
	mov	%r14,-1*8($tptr)

	cmp	16(%rsp),$bptr
	jne	.Lmulx4x_outer

	lea	64(%rsp),$tptr
	sub	$num,$nptr		# rewind $nptr
	neg	%r15
	mov	$num,%rdx
	shr	\$3+2,$num		# %cf=0
	mov	32(%rsp),$rptr		# restore rp
	jmp	.Lmulx4x_sub

.align	32
.Lmulx4x_sub:
	mov	8*0($tptr),%r11
	mov	8*1($tptr),%r12
	mov	8*2($tptr),%r13
	mov	8*3($tptr),%r14
	lea	8*4($tptr),$tptr
	sbb	8*0($nptr),%r11
	sbb	8*1($nptr),%r12
	sbb	8*2($nptr),%r13
	sbb	8*3($nptr),%r14
	lea	8*4($nptr),$nptr
	mov	%r11,8*0($rptr)
	mov	%r12,8*1($rptr)
	mov	%r13,8*2($rptr)
	mov	%r14,8*3($rptr)
	lea	8*4($rptr),$rptr
	dec	$num			# preserves %cf
	jnz	.Lmulx4x_sub

	sbb	\$0,%r15		# top-most carry
	lea	64(%rsp),$tptr
	sub	%rdx,$rptr		# rewind

	movq	%r15,%xmm1
	pxor	%xmm0,%xmm0
	pshufd	\$0,%xmm1,%xmm1
	mov	40(%rsp),%rsi		# restore %rsp
	jmp	.Lmulx4x_cond_copy

.align	32
.Lmulx4x_cond_copy:
	movdqa	16*0($tptr),%xmm2
	movdqa	16*1($tptr),%xmm3
	lea	16*2($tptr),$tptr
	movdqu	16*0($rptr),%xmm4
	movdqu	16*1($rptr),%xmm5
	lea	16*2($rptr),$rptr
	movdqa	%xmm0,-16*2($tptr)	# zero tp
	movdqa	%xmm0,-16*1($tptr)
	pcmpeqd	%xmm1,%xmm0
	pand	%xmm1,%xmm2
	pand	%xmm1,%xmm3
	pand	%xmm0,%xmm4
	pand	%xmm0,%xmm5
	pxor	%xmm0,%xmm0
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqu	%xmm4,-16*2($rptr)
	movdqu	%xmm5,-16*1($rptr)
	sub	\$32,%rdx
	jnz	.Lmulx4x_cond_copy

	mov	%rdx,($tptr)

	mov	\$1,%rax
	mov	-48(%rsi),%r15
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lmulx4x_epilogue:
	ret
.size	bn_mulx4x_mont,.-bn_mulx4x_mont
___
}}}
$code.=<<___;
.asciz	"Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
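
# mul_handler and sqr_handler implement Win64 structured-exception unwinding
# for the routines above: if the fault address lies between the prologue and
# epilogue labels recorded in HandlerData, they recover the caller's %rsp
# (stashed at 8(%rsp,$num,8) by the mul paths and at 40(%rsp) by the
# sqr8x/mulx4x paths), restore the pushed non-volatile registers into the
# CONTEXT record and then let RtlVirtualUnwind continue the unwind.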

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	mul_handler,\@abi-omnipotent
.align	16
mul_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	192($context),%r10	# pull $num
	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer

	jmp	.Lcommon_pop_regs
.size	mul_handler,.-mul_handler

.type	sqr_handler,\@abi-omnipotent
.align	16
sqr_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<.Lsqr_body
	jb	.Lcommon_seh_tail

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# body label
	cmp	%r10,%rbx		# context->Rip>=.Lsqr_epilogue
	jb	.Lcommon_pop_regs

	mov	152($context),%rax	# pull context->Rsp

	mov	8(%r11),%r10d		# HandlerData[2]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lsqr_epilogue
	jae	.Lcommon_seh_tail

	mov	40(%rax),%rax		# pull saved stack pointer

.Lcommon_pop_regs:
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	sqr_handler,.-sqr_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_bn_mul_mont
	.rva	.LSEH_end_bn_mul_mont
	.rva	.LSEH_info_bn_mul_mont

	.rva	.LSEH_begin_bn_mul4x_mont
	.rva	.LSEH_end_bn_mul4x_mont
	.rva	.LSEH_info_bn_mul4x_mont

	.rva	.LSEH_begin_bn_sqr8x_mont
	.rva	.LSEH_end_bn_sqr8x_mont
	.rva	.LSEH_info_bn_sqr8x_mont
___
$code.=<<___ if ($addx);
	.rva	.LSEH_begin_bn_mulx4x_mont
	.rva	.LSEH_end_bn_mulx4x_mont
	.rva	.LSEH_info_bn_mulx4x_mont
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_bn_mul_mont:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul_body,.Lmul_epilogue	# HandlerData[]
.LSEH_info_bn_mul4x_mont:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
.LSEH_info_bn_sqr8x_mont:
	.byte	9,0,0,0
	.rva	sqr_handler
	.rva	.Lsqr8x_prologue,.Lsqr8x_body,.Lsqr8x_epilogue		# HandlerData[]
.align	8
___
$code.=<<___ if ($addx);
.LSEH_info_bn_mulx4x_mont:
	.byte	9,0,0,0
	.rva	sqr_handler
	.rva	.Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue	# HandlerData[]
.align	8
___
}

print $code;
close STDOUT;