x86_64-mont5.pl revision 296279
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# August 2011.
11#
12# Companion to x86_64-mont.pl that optimizes cache-timing attack
13# countermeasures. The subroutines are produced by replacing bp[i]
14# references in their x86_64-mont.pl counterparts with cache-neutral
15# references to powers table computed in BN_mod_exp_mont_consttime.
16# In addition, a subroutine that scatters elements of the powers table
17# is implemented, so that scatter-/gathering can be tuned without
18# bn_exp.c modifications.
19
20# August 2013.
21#
22# Add MULX/AD*X code paths and additional interfaces to optimize for
23# branch prediction unit. For input lengths that are multiples of 8
24# the np argument is not just modulus value, but one interleaved
25# with 0. This is to optimize post-condition...
26
# Command line is "[flavour] [output]".  A first argument that contains a
# dot is taken to be the output file name, and the flavour is then left
# undefined.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# The Win64 calling convention is selected by a masm/nasm/mingw64 flavour
# or by an output file name ending in .asm.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator: first in the directory of this
# script, then in ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the
# translator.  The translator path is quoted so that a directory name
# containing spaces survives the shell, and a failure to start the pipe is
# fatal instead of being silently ignored.
open OUT,"| \"$^X\" \"$xlate\" $flavour $output"
	or die "can't call $xlate: $!";
*STDOUT=*OUT;

# $addx ends up true when the toolchain is able to assemble the
# MULX/ADCX/ADOX code paths; it gates the "if ($addx)" heredocs below.
# Probe 1: GNU assembler, reached through the C compiler driver.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.23);
}

# Probe 2: NASM, only for Win64 builds that asked for it.
if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.10);
}

# Probe 3: Microsoft ml64, only for Win64 builds that asked for it.
if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$addx = ($1>=12);
}

# Probe 4: clang/LLVM integrated assembler.  The major version is matched
# as a multi-digit number and "clang" may appear anywhere in the banner,
# so that releases 10 and later as well as vendor-prefixed banners
# ("Apple clang version ...") are recognised.
if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$addx = ($ver>=3.03);
}
60
# Register bindings for bn_mul_mont_gather5.  The first six names are the
# function's arguments; the rest are scratch registers used by the
# generated multiplication loops.
61# int bn_mul_mont_gather5(
62$rp="%rdi";	# BN_ULONG *rp,
63$ap="%rsi";	# const BN_ULONG *ap,
64$bp="%rdx";	# const BN_ULONG *bp,
65$np="%rcx";	# const BN_ULONG *np,
66$n0="%r8";	# const BN_ULONG *n0,
67$num="%r9";	# int num,
68		# int idx);	# 0 to 2^5-1, "index" in $bp holding
69				# pre-computed powers of a', interlaced
70				# in such manner that b[0] is $bp[idx],
71				# b[1] is [2^5+idx], etc.
72$lo0="%r10";	# accumulator word of the ap[j]*bp[i] chain
73$hi0="%r11";	# second accumulator word of the ap[j]*bp[i] chain
74$hi1="%r13";	# accumulator word of the np[j]*m1 chain
75$i="%r14";	# outer loop counter
76$j="%r15";	# inner loop counter
77$m0="%rbx";	# holds the gathered bp[i]
78$m1="%rbp";	# holds tp[0]*n0
79
# Entry point of bn_mul_mont_gather5.  If any of the low three bits of num
# is set the generic code at .Lmul_enter is used; otherwise control is
# transferred to .Lmul4x_enter.
80$code=<<___;
81.text
82
83.extern	OPENSSL_ia32cap_P
84
85.globl	bn_mul_mont_gather5
86.type	bn_mul_mont_gather5,\@function,6
87.align	64
88bn_mul_mont_gather5:
89	test	\$7,${num}d
90	jnz	.Lmul_enter
91___
# Only when the assembler supports the AD*X path: preload the capability
# word into %r11d, which the code at .Lmul4x_enter tests.
92$code.=<<___ if ($addx);
93	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
94___
# Generic path: save callee-saved registers, carve the tp[] frame plus
# room for the mask out of the stack, and remember the original %rsp in
# tp[num+1].
95$code.=<<___;
96	jmp	.Lmul4x_enter
97
98.align	16
99.Lmul_enter:
100	mov	${num}d,${num}d
101	mov	%rsp,%rax
102	movd	`($win64?56:8)`(%rsp),%xmm5	# load 7th argument
103	lea	.Linc(%rip),%r10
104	push	%rbx
105	push	%rbp
106	push	%r12
107	push	%r13
108	push	%r14
109	push	%r15
110
111	lea	2($num),%r11
112	neg	%r11
113	lea	-264(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+2)+256+8)
114	and	\$-1024,%rsp		# minimize TLB usage
115
116	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
117.Lmul_body:
118	lea	128($bp),%r12		# reassign $bp (+size optimization)
119___
# From here on $bp names %r12, which the emitted code above has pointed
# 128 bytes past the caller's bp, so displacements -128..+112 cover one
# $STRIDE-byte (256-byte) row of the table.
# NOTE(review): $N is not referenced anywhere in the visible part of the
# file.
120		$bp="%r12";
121		$STRIDE=2**5*8;		# 5 is "window size"
122		$N=$STRIDE/4;		# should match cache line size
# Load the two constant vectors from .Linc, compute the 16-byte aligned
# address the masks are stored relative to (%r10), and broadcast the index
# argument across %xmm5.
123$code.=<<___;
124	movdqa	0(%r10),%xmm0		# 00000001000000010000000000000000
125	movdqa	16(%r10),%xmm1		# 00000002000000020000000200000002
126	lea	24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization)
127	and	\$-16,%r10
128
129	pshufd	\$0,%xmm5,%xmm5		# broadcast index
130	movdqa	%xmm1,%xmm4
131	movdqa	%xmm1,%xmm2
132___
133########################################################################
134# calculate mask by comparing 0..31 to index and save result to stack
135#
# Each pcmpeqd compares a counter vector with the broadcast index held in
# %xmm5, and the resulting 16-byte mask is stored at 16*n+112(%r10).
# %xmm4 keeps a copy of the second .Linc vector and is used to re-seed
# the counter registers.
136$code.=<<___;
137	paddd	%xmm0,%xmm1
138	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
139	.byte	0x67
140	movdqa	%xmm4,%xmm3
141___
# Emitted three times (k = 0, 4, 8); every round stores four masks, so
# mask slots 0..11 are written here.
142for($k=0;$k<$STRIDE/16-4;$k+=4) {
143$code.=<<___;
144	paddd	%xmm1,%xmm2
145	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
146	movdqa	%xmm0,`16*($k+0)+112`(%r10)
147	movdqa	%xmm4,%xmm0
148
149	paddd	%xmm2,%xmm3
150	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
151	movdqa	%xmm1,`16*($k+1)+112`(%r10)
152	movdqa	%xmm4,%xmm1
153
154	paddd	%xmm3,%xmm0
155	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
156	movdqa	%xmm2,`16*($k+2)+112`(%r10)
157	movdqa	%xmm4,%xmm2
158
159	paddd	%xmm0,%xmm1
160	pcmpeqd	%xmm5,%xmm0
161	movdqa	%xmm3,`16*($k+3)+112`(%r10)
162	movdqa	%xmm4,%xmm3
163___
164}
# $k is 12 here.  The last four masks (slots 12..15) are stored and, while
# still in registers, immediately ANDed with table chunks 12..15 to start
# the gather of b[0] in %xmm0/%xmm1.
165$code.=<<___;				# last iteration can be optimized
166	paddd	%xmm1,%xmm2
167	pcmpeqd	%xmm5,%xmm1
168	movdqa	%xmm0,`16*($k+0)+112`(%r10)
169
170	paddd	%xmm2,%xmm3
171	.byte	0x67
172	pcmpeqd	%xmm5,%xmm2
173	movdqa	%xmm1,`16*($k+1)+112`(%r10)
174
175	pcmpeqd	%xmm5,%xmm3
176	movdqa	%xmm2,`16*($k+2)+112`(%r10)
177	pand	`16*($k+0)-128`($bp),%xmm0	# while it's still in register
178
179	pand	`16*($k+1)-128`($bp),%xmm1
180	pand	`16*($k+2)-128`($bp),%xmm2
181	movdqa	%xmm3,`16*($k+3)+112`(%r10)
182	pand	`16*($k+3)-128`($bp),%xmm3
183	por	%xmm2,%xmm0
184	por	%xmm3,%xmm1
185___
# Gather of b[0], remaining part: table chunks 0..11 (k = 0, 4, 8) are
# loaded, ANDed with the masks saved at 16*n+112(%r10) and ORed into the
# %xmm0/%xmm1 accumulators.  Chunks 12..15 were already folded in by the
# preceding heredoc.  Every chunk of the row is read regardless of the
# index value.
186for($k=0;$k<$STRIDE/16-4;$k+=4) {
187$code.=<<___;
188	movdqa	`16*($k+0)-128`($bp),%xmm4
189	movdqa	`16*($k+1)-128`($bp),%xmm5
190	movdqa	`16*($k+2)-128`($bp),%xmm2
191	pand	`16*($k+0)+112`(%r10),%xmm4
192	movdqa	`16*($k+3)-128`($bp),%xmm3
193	pand	`16*($k+1)+112`(%r10),%xmm5
194	por	%xmm4,%xmm0
195	pand	`16*($k+2)+112`(%r10),%xmm2
196	por	%xmm5,%xmm1
197	pand	`16*($k+3)+112`(%r10),%xmm3
198	por	%xmm2,%xmm0
199	por	%xmm3,%xmm1
200___
201}
202$code.=<<___;
203	por	%xmm1,%xmm0
204	pshufd	\$0x4e,%xmm0,%xmm1
205	por	%xmm1,%xmm0
206	lea	$STRIDE($bp),$bp
207	movq	%xmm0,$m0		# m0=bp[0]
208
209	mov	($n0),$n0		# pull n0[0] value
210	mov	($ap),%rax
211
212	xor	$i,$i			# i=0
213	xor	$j,$j			# j=0
214
215	mov	$n0,$m1
216	mulq	$m0			# ap[0]*bp[0]
217	mov	%rax,$lo0
218	mov	($np),%rax
219
220	imulq	$lo0,$m1		# "tp[0]"*n0
221	mov	%rdx,$hi0
222
223	mulq	$m1			# np[0]*m1
224	add	%rax,$lo0		# discarded
225	mov	8($ap),%rax
226	adc	\$0,%rdx
227	mov	%rdx,$hi1
228
229	lea	1($j),$j		# j++
230	jmp	.L1st_enter
231
232.align	16
233.L1st:
234	add	%rax,$hi1
235	mov	($ap,$j,8),%rax
236	adc	\$0,%rdx
237	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
238	mov	$lo0,$hi0
239	adc	\$0,%rdx
240	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
241	mov	%rdx,$hi1
242
243.L1st_enter:
244	mulq	$m0			# ap[j]*bp[0]
245	add	%rax,$hi0
246	mov	($np,$j,8),%rax
247	adc	\$0,%rdx
248	lea	1($j),$j		# j++
249	mov	%rdx,$lo0
250
251	mulq	$m1			# np[j]*m1
252	cmp	$num,$j
253	jne	.L1st			# note that upon exit $j==$num, so
254					# they can be used interchangeably
255
256	add	%rax,$hi1
257	adc	\$0,%rdx
258	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
259	adc	\$0,%rdx
260	mov	$hi1,-16(%rsp,$num,8)	# tp[num-1]
261	mov	%rdx,$hi1
262	mov	$lo0,$hi0
263
264	xor	%rdx,%rdx
265	add	$hi0,$hi1
266	adc	\$0,%rdx
267	mov	$hi1,-8(%rsp,$num,8)
268	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
269
270	lea	1($i),$i		# i++
271	jmp	.Louter
272.align	16
273.Louter:
274	lea	24+128(%rsp,$num,8),%rdx	# where 256-byte mask is (+size optimization)
275	and	\$-16,%rdx
276	pxor	%xmm4,%xmm4
277	pxor	%xmm5,%xmm5
278___
279for($k=0;$k<$STRIDE/16;$k+=4) {
280$code.=<<___;
281	movdqa	`16*($k+0)-128`($bp),%xmm0
282	movdqa	`16*($k+1)-128`($bp),%xmm1
283	movdqa	`16*($k+2)-128`($bp),%xmm2
284	movdqa	`16*($k+3)-128`($bp),%xmm3
285	pand	`16*($k+0)-128`(%rdx),%xmm0
286	pand	`16*($k+1)-128`(%rdx),%xmm1
287	por	%xmm0,%xmm4
288	pand	`16*($k+2)-128`(%rdx),%xmm2
289	por	%xmm1,%xmm5
290	pand	`16*($k+3)-128`(%rdx),%xmm3
291	por	%xmm2,%xmm4
292	por	%xmm3,%xmm5
293___
294}
295$code.=<<___;
296	por	%xmm5,%xmm4
297	pshufd	\$0x4e,%xmm4,%xmm0
298	por	%xmm4,%xmm0
299	lea	$STRIDE($bp),$bp
300
301	mov	($ap),%rax		# ap[0]
302	movq	%xmm0,$m0		# m0=bp[i]
303
304	xor	$j,$j			# j=0
305	mov	$n0,$m1
306	mov	(%rsp),$lo0
307
308	mulq	$m0			# ap[0]*bp[i]
309	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
310	mov	($np),%rax
311	adc	\$0,%rdx
312
313	imulq	$lo0,$m1		# tp[0]*n0
314	mov	%rdx,$hi0
315
316	mulq	$m1			# np[0]*m1
317	add	%rax,$lo0		# discarded
318	mov	8($ap),%rax
319	adc	\$0,%rdx
320	mov	8(%rsp),$lo0		# tp[1]
321	mov	%rdx,$hi1
322
323	lea	1($j),$j		# j++
324	jmp	.Linner_enter
325
326.align	16
327.Linner:
328	add	%rax,$hi1
329	mov	($ap,$j,8),%rax
330	adc	\$0,%rdx
331	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
332	mov	(%rsp,$j,8),$lo0
333	adc	\$0,%rdx
334	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
335	mov	%rdx,$hi1
336
337.Linner_enter:
338	mulq	$m0			# ap[j]*bp[i]
339	add	%rax,$hi0
340	mov	($np,$j,8),%rax
341	adc	\$0,%rdx
342	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
343	mov	%rdx,$hi0
344	adc	\$0,$hi0
345	lea	1($j),$j		# j++
346
347	mulq	$m1			# np[j]*m1
348	cmp	$num,$j
349	jne	.Linner			# note that upon exit $j==$num, so
350					# they can be used interchangeably
351	add	%rax,$hi1
352	adc	\$0,%rdx
353	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
354	mov	(%rsp,$num,8),$lo0
355	adc	\$0,%rdx
356	mov	$hi1,-16(%rsp,$num,8)	# tp[num-1]
357	mov	%rdx,$hi1
358
359	xor	%rdx,%rdx
360	add	$hi0,$hi1
361	adc	\$0,%rdx
362	add	$lo0,$hi1		# pull upmost overflow bit
363	adc	\$0,%rdx
364	mov	$hi1,-8(%rsp,$num,8)
365	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
366
367	lea	1($i),$i		# i++
368	cmp	$num,$i
369	jb	.Louter
370
371	xor	$i,$i			# i=0 and clear CF!
372	mov	(%rsp),%rax		# tp[0]
373	lea	(%rsp),$ap		# borrow ap for tp
374	mov	$num,$j			# j=num
375	jmp	.Lsub
376.align	16
377.Lsub:	sbb	($np,$i,8),%rax
378	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
379	mov	8($ap,$i,8),%rax	# tp[i+1]
380	lea	1($i),$i		# i++
381	dec	$j			# doesn't affect CF!
382	jnz	.Lsub
383
384	sbb	\$0,%rax		# handle upmost overflow bit
385	xor	$i,$i
386	and	%rax,$ap
387	not	%rax
388	mov	$rp,$np
389	and	%rax,$np
390	mov	$num,$j			# j=num
391	or	$np,$ap			# ap=borrow?tp:rp
392.align	16
393.Lcopy:					# copy or in-place refresh
394	mov	($ap,$i,8),%rax
395	mov	$i,(%rsp,$i,8)		# zap temporary vector
396	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
397	lea	1($i),$i
398	sub	\$1,$j
399	jnz	.Lcopy
400
401	mov	8(%rsp,$num,8),%rsi	# restore %rsp
402	mov	\$1,%rax
403
404	mov	-48(%rsi),%r15
405	mov	-40(%rsi),%r14
406	mov	-32(%rsi),%r13
407	mov	-24(%rsi),%r12
408	mov	-16(%rsi),%rbp
409	mov	-8(%rsi),%rbx
410	lea	(%rsi),%rsp
411.Lmul_epilogue:
412	ret
413.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
414___
415{{{
416my @A=("%r10","%r11");
417my @N=("%r13","%rdi");
418$code.=<<___;
419.type	bn_mul4x_mont_gather5,\@function,6
420.align	32
421bn_mul4x_mont_gather5:
422.Lmul4x_enter:
423___
424$code.=<<___ if ($addx);
425	and	\$0x80108,%r11d
426	cmp	\$0x80108,%r11d		# check for AD*X+BMI2+BMI1
427	je	.Lmulx4x_enter
428___
429$code.=<<___;
430	.byte	0x67
431	mov	%rsp,%rax
432	push	%rbx
433	push	%rbp
434	push	%r12
435	push	%r13
436	push	%r14
437	push	%r15
438
439	.byte	0x67
440	shl	\$3,${num}d		# convert $num to bytes
441	lea	($num,$num,2),%r10	# 3*$num in bytes
442	neg	$num			# -$num
443
444	##############################################################
445	# Ensure that stack frame doesn't alias with $rptr+3*$num
446	# modulo 4096, which covers ret[num], am[num] and n[num]
447	# (see bn_exp.c). This is done to allow memory disambiguation
448	# logic do its magic. [Extra [num] is allocated in order
449	# to align with bn_power5's frame, which is cleansed after
450	# completing exponentiation. Extra 256 bytes is for power mask
451	# calculated from 7th argument, the index.]
452	#
453	lea	-320(%rsp,$num,2),%r11
454	sub	$rp,%r11
455	and	\$4095,%r11
456	cmp	%r11,%r10
457	jb	.Lmul4xsp_alt
458	sub	%r11,%rsp		# align with $rp
459	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*num*8+256)
460	jmp	.Lmul4xsp_done
461
462.align	32
463.Lmul4xsp_alt:
464	lea	4096-320(,$num,2),%r10
465	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*num*8+256)
466	sub	%r10,%r11
467	mov	\$0,%r10
468	cmovc	%r10,%r11
469	sub	%r11,%rsp
470.Lmul4xsp_done:
471	and	\$-64,%rsp
472	neg	$num
473
474	mov	%rax,40(%rsp)
475.Lmul4x_body:
476
477	call	mul4x_internal
478
479	mov	40(%rsp),%rsi		# restore %rsp
480	mov	\$1,%rax
481
482	mov	-48(%rsi),%r15
483	mov	-40(%rsi),%r14
484	mov	-32(%rsi),%r13
485	mov	-24(%rsi),%r12
486	mov	-16(%rsi),%rbp
487	mov	-8(%rsi),%rbx
488	lea	(%rsi),%rsp
489.Lmul4x_epilogue:
490	ret
491.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
492
493.type	mul4x_internal,\@abi-omnipotent
494.align	32
495mul4x_internal:
496	shl	\$5,$num		# $num was in bytes
497	movd	`($win64?56:8)`(%rax),%xmm5	# load 7th argument, index
498	lea	.Linc(%rip),%rax
499	lea	128(%rdx,$num),%r13	# end of powers table (+size optimization)
500	shr	\$5,$num		# restore $num
501___
502		$bp="%r12";
503		$STRIDE=2**5*8;		# 5 is "window size"
504		$N=$STRIDE/4;		# should match cache line size
505		$tp=$i;
506$code.=<<___;
507	movdqa	0(%rax),%xmm0		# 00000001000000010000000000000000
508	movdqa	16(%rax),%xmm1		# 00000002000000020000000200000002
509	lea	88-112(%rsp,$num),%r10	# place the mask after tp[num+1] (+ICache optimization)
510	lea	128(%rdx),$bp		# size optimization
511
512	pshufd	\$0,%xmm5,%xmm5		# broadcast index
513	movdqa	%xmm1,%xmm4
514	.byte	0x67,0x67
515	movdqa	%xmm1,%xmm2
516___
517########################################################################
518# calculate mask by comparing 0..31 to index and save result to stack
519#
520$code.=<<___;
521	paddd	%xmm0,%xmm1
522	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
523	.byte	0x67
524	movdqa	%xmm4,%xmm3
525___
526for($i=0;$i<$STRIDE/16-4;$i+=4) {
527$code.=<<___;
528	paddd	%xmm1,%xmm2
529	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
530	movdqa	%xmm0,`16*($i+0)+112`(%r10)
531	movdqa	%xmm4,%xmm0
532
533	paddd	%xmm2,%xmm3
534	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
535	movdqa	%xmm1,`16*($i+1)+112`(%r10)
536	movdqa	%xmm4,%xmm1
537
538	paddd	%xmm3,%xmm0
539	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
540	movdqa	%xmm2,`16*($i+2)+112`(%r10)
541	movdqa	%xmm4,%xmm2
542
543	paddd	%xmm0,%xmm1
544	pcmpeqd	%xmm5,%xmm0
545	movdqa	%xmm3,`16*($i+3)+112`(%r10)
546	movdqa	%xmm4,%xmm3
547___
548}
549$code.=<<___;				# last iteration can be optimized
550	paddd	%xmm1,%xmm2
551	pcmpeqd	%xmm5,%xmm1
552	movdqa	%xmm0,`16*($i+0)+112`(%r10)
553
554	paddd	%xmm2,%xmm3
555	.byte	0x67
556	pcmpeqd	%xmm5,%xmm2
557	movdqa	%xmm1,`16*($i+1)+112`(%r10)
558
559	pcmpeqd	%xmm5,%xmm3
560	movdqa	%xmm2,`16*($i+2)+112`(%r10)
561	pand	`16*($i+0)-128`($bp),%xmm0	# while it's still in register
562
563	pand	`16*($i+1)-128`($bp),%xmm1
564	pand	`16*($i+2)-128`($bp),%xmm2
565	movdqa	%xmm3,`16*($i+3)+112`(%r10)
566	pand	`16*($i+3)-128`($bp),%xmm3
567	por	%xmm2,%xmm0
568	por	%xmm3,%xmm1
569___
570for($i=0;$i<$STRIDE/16-4;$i+=4) {
571$code.=<<___;
572	movdqa	`16*($i+0)-128`($bp),%xmm4
573	movdqa	`16*($i+1)-128`($bp),%xmm5
574	movdqa	`16*($i+2)-128`($bp),%xmm2
575	pand	`16*($i+0)+112`(%r10),%xmm4
576	movdqa	`16*($i+3)-128`($bp),%xmm3
577	pand	`16*($i+1)+112`(%r10),%xmm5
578	por	%xmm4,%xmm0
579	pand	`16*($i+2)+112`(%r10),%xmm2
580	por	%xmm5,%xmm1
581	pand	`16*($i+3)+112`(%r10),%xmm3
582	por	%xmm2,%xmm0
583	por	%xmm3,%xmm1
584___
585}
586$code.=<<___;
587	por	%xmm1,%xmm0
588	pshufd	\$0x4e,%xmm0,%xmm1
589	por	%xmm1,%xmm0
590	lea	$STRIDE($bp),$bp
591	movq	%xmm0,$m0		# m0=bp[0]
592
593	mov	%r13,16+8(%rsp)		# save end of b[num]
594	mov	$rp, 56+8(%rsp)		# save $rp
595
596	mov	($n0),$n0		# pull n0[0] value
597	mov	($ap),%rax
598	lea	($ap,$num),$ap		# end of a[num]
599	neg	$num
600
601	mov	$n0,$m1
602	mulq	$m0			# ap[0]*bp[0]
603	mov	%rax,$A[0]
604	mov	($np),%rax
605
606	imulq	$A[0],$m1		# "tp[0]"*n0
607	lea	64+8(%rsp),$tp
608	mov	%rdx,$A[1]
609
610	mulq	$m1			# np[0]*m1
611	add	%rax,$A[0]		# discarded
612	mov	8($ap,$num),%rax
613	adc	\$0,%rdx
614	mov	%rdx,$N[1]
615
616	mulq	$m0
617	add	%rax,$A[1]
618	mov	8*1($np),%rax
619	adc	\$0,%rdx
620	mov	%rdx,$A[0]
621
622	mulq	$m1
623	add	%rax,$N[1]
624	mov	16($ap,$num),%rax
625	adc	\$0,%rdx
626	add	$A[1],$N[1]
627	lea	4*8($num),$j		# j=4
628	lea	8*4($np),$np
629	adc	\$0,%rdx
630	mov	$N[1],($tp)
631	mov	%rdx,$N[0]
632	jmp	.L1st4x
633
634.align	32
635.L1st4x:
636	mulq	$m0			# ap[j]*bp[0]
637	add	%rax,$A[0]
638	mov	-8*2($np),%rax
639	lea	32($tp),$tp
640	adc	\$0,%rdx
641	mov	%rdx,$A[1]
642
643	mulq	$m1			# np[j]*m1
644	add	%rax,$N[0]
645	mov	-8($ap,$j),%rax
646	adc	\$0,%rdx
647	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
648	adc	\$0,%rdx
649	mov	$N[0],-24($tp)		# tp[j-1]
650	mov	%rdx,$N[1]
651
652	mulq	$m0			# ap[j]*bp[0]
653	add	%rax,$A[1]
654	mov	-8*1($np),%rax
655	adc	\$0,%rdx
656	mov	%rdx,$A[0]
657
658	mulq	$m1			# np[j]*m1
659	add	%rax,$N[1]
660	mov	($ap,$j),%rax
661	adc	\$0,%rdx
662	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
663	adc	\$0,%rdx
664	mov	$N[1],-16($tp)		# tp[j-1]
665	mov	%rdx,$N[0]
666
667	mulq	$m0			# ap[j]*bp[0]
668	add	%rax,$A[0]
669	mov	8*0($np),%rax
670	adc	\$0,%rdx
671	mov	%rdx,$A[1]
672
673	mulq	$m1			# np[j]*m1
674	add	%rax,$N[0]
675	mov	8($ap,$j),%rax
676	adc	\$0,%rdx
677	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
678	adc	\$0,%rdx
679	mov	$N[0],-8($tp)		# tp[j-1]
680	mov	%rdx,$N[1]
681
682	mulq	$m0			# ap[j]*bp[0]
683	add	%rax,$A[1]
684	mov	8*1($np),%rax
685	adc	\$0,%rdx
686	mov	%rdx,$A[0]
687
688	mulq	$m1			# np[j]*m1
689	add	%rax,$N[1]
690	mov	16($ap,$j),%rax
691	adc	\$0,%rdx
692	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
693	lea	8*4($np),$np
694	adc	\$0,%rdx
695	mov	$N[1],($tp)		# tp[j-1]
696	mov	%rdx,$N[0]
697
698	add	\$32,$j			# j+=4
699	jnz	.L1st4x
700
701	mulq	$m0			# ap[j]*bp[0]
702	add	%rax,$A[0]
703	mov	-8*2($np),%rax
704	lea	32($tp),$tp
705	adc	\$0,%rdx
706	mov	%rdx,$A[1]
707
708	mulq	$m1			# np[j]*m1
709	add	%rax,$N[0]
710	mov	-8($ap),%rax
711	adc	\$0,%rdx
712	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
713	adc	\$0,%rdx
714	mov	$N[0],-24($tp)		# tp[j-1]
715	mov	%rdx,$N[1]
716
717	mulq	$m0			# ap[j]*bp[0]
718	add	%rax,$A[1]
719	mov	-8*1($np),%rax
720	adc	\$0,%rdx
721	mov	%rdx,$A[0]
722
723	mulq	$m1			# np[j]*m1
724	add	%rax,$N[1]
725	mov	($ap,$num),%rax		# ap[0]
726	adc	\$0,%rdx
727	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
728	adc	\$0,%rdx
729	mov	$N[1],-16($tp)		# tp[j-1]
730	mov	%rdx,$N[0]
731
732	lea	($np,$num),$np		# rewind $np
733
734	xor	$N[1],$N[1]
735	add	$A[0],$N[0]
736	adc	\$0,$N[1]
737	mov	$N[0],-8($tp)
738
739	jmp	.Louter4x
740
741.align	32
742.Louter4x:
743	lea	16+128($tp),%rdx	# where 256-byte mask is (+size optimization)
744	pxor	%xmm4,%xmm4
745	pxor	%xmm5,%xmm5
746___
747for($i=0;$i<$STRIDE/16;$i+=4) {
748$code.=<<___;
749	movdqa	`16*($i+0)-128`($bp),%xmm0
750	movdqa	`16*($i+1)-128`($bp),%xmm1
751	movdqa	`16*($i+2)-128`($bp),%xmm2
752	movdqa	`16*($i+3)-128`($bp),%xmm3
753	pand	`16*($i+0)-128`(%rdx),%xmm0
754	pand	`16*($i+1)-128`(%rdx),%xmm1
755	por	%xmm0,%xmm4
756	pand	`16*($i+2)-128`(%rdx),%xmm2
757	por	%xmm1,%xmm5
758	pand	`16*($i+3)-128`(%rdx),%xmm3
759	por	%xmm2,%xmm4
760	por	%xmm3,%xmm5
761___
762}
763$code.=<<___;
764	por	%xmm5,%xmm4
765	pshufd	\$0x4e,%xmm4,%xmm0
766	por	%xmm4,%xmm0
767	lea	$STRIDE($bp),$bp
768	movq	%xmm0,$m0		# m0=bp[i]
769
770	mov	($tp,$num),$A[0]
771	mov	$n0,$m1
772	mulq	$m0			# ap[0]*bp[i]
773	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
774	mov	($np),%rax
775	adc	\$0,%rdx
776
777	imulq	$A[0],$m1		# tp[0]*n0
778	mov	%rdx,$A[1]
779	mov	$N[1],($tp)		# store upmost overflow bit
780
781	lea	($tp,$num),$tp		# rewind $tp
782
783	mulq	$m1			# np[0]*m1
784	add	%rax,$A[0]		# "$N[0]", discarded
785	mov	8($ap,$num),%rax
786	adc	\$0,%rdx
787	mov	%rdx,$N[1]
788
789	mulq	$m0			# ap[j]*bp[i]
790	add	%rax,$A[1]
791	mov	8*1($np),%rax
792	adc	\$0,%rdx
793	add	8($tp),$A[1]		# +tp[1]
794	adc	\$0,%rdx
795	mov	%rdx,$A[0]
796
797	mulq	$m1			# np[j]*m1
798	add	%rax,$N[1]
799	mov	16($ap,$num),%rax
800	adc	\$0,%rdx
801	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
802	lea	4*8($num),$j		# j=4
803	lea	8*4($np),$np
804	adc	\$0,%rdx
805	mov	%rdx,$N[0]
806	jmp	.Linner4x
807
808.align	32
809.Linner4x:
810	mulq	$m0			# ap[j]*bp[i]
811	add	%rax,$A[0]
812	mov	-8*2($np),%rax
813	adc	\$0,%rdx
814	add	16($tp),$A[0]		# ap[j]*bp[i]+tp[j]
815	lea	32($tp),$tp
816	adc	\$0,%rdx
817	mov	%rdx,$A[1]
818
819	mulq	$m1			# np[j]*m1
820	add	%rax,$N[0]
821	mov	-8($ap,$j),%rax
822	adc	\$0,%rdx
823	add	$A[0],$N[0]
824	adc	\$0,%rdx
825	mov	$N[1],-32($tp)		# tp[j-1]
826	mov	%rdx,$N[1]
827
828	mulq	$m0			# ap[j]*bp[i]
829	add	%rax,$A[1]
830	mov	-8*1($np),%rax
831	adc	\$0,%rdx
832	add	-8($tp),$A[1]
833	adc	\$0,%rdx
834	mov	%rdx,$A[0]
835
836	mulq	$m1			# np[j]*m1
837	add	%rax,$N[1]
838	mov	($ap,$j),%rax
839	adc	\$0,%rdx
840	add	$A[1],$N[1]
841	adc	\$0,%rdx
842	mov	$N[0],-24($tp)		# tp[j-1]
843	mov	%rdx,$N[0]
844
845	mulq	$m0			# ap[j]*bp[i]
846	add	%rax,$A[0]
847	mov	8*0($np),%rax
848	adc	\$0,%rdx
849	add	($tp),$A[0]		# ap[j]*bp[i]+tp[j]
850	adc	\$0,%rdx
851	mov	%rdx,$A[1]
852
853	mulq	$m1			# np[j]*m1
854	add	%rax,$N[0]
855	mov	8($ap,$j),%rax
856	adc	\$0,%rdx
857	add	$A[0],$N[0]
858	adc	\$0,%rdx
859	mov	$N[1],-16($tp)		# tp[j-1]
860	mov	%rdx,$N[1]
861
862	mulq	$m0			# ap[j]*bp[i]
863	add	%rax,$A[1]
864	mov	8*1($np),%rax
865	adc	\$0,%rdx
866	add	8($tp),$A[1]
867	adc	\$0,%rdx
868	mov	%rdx,$A[0]
869
870	mulq	$m1			# np[j]*m1
871	add	%rax,$N[1]
872	mov	16($ap,$j),%rax
873	adc	\$0,%rdx
874	add	$A[1],$N[1]
875	lea	8*4($np),$np
876	adc	\$0,%rdx
877	mov	$N[0],-8($tp)		# tp[j-1]
878	mov	%rdx,$N[0]
879
880	add	\$32,$j			# j+=4
881	jnz	.Linner4x
882
883	mulq	$m0			# ap[j]*bp[i]
884	add	%rax,$A[0]
885	mov	-8*2($np),%rax
886	adc	\$0,%rdx
887	add	16($tp),$A[0]		# ap[j]*bp[i]+tp[j]
888	lea	32($tp),$tp
889	adc	\$0,%rdx
890	mov	%rdx,$A[1]
891
892	mulq	$m1			# np[j]*m1
893	add	%rax,$N[0]
894	mov	-8($ap),%rax
895	adc	\$0,%rdx
896	add	$A[0],$N[0]
897	adc	\$0,%rdx
898	mov	$N[1],-32($tp)		# tp[j-1]
899	mov	%rdx,$N[1]
900
901	mulq	$m0			# ap[j]*bp[i]
902	add	%rax,$A[1]
903	mov	$m1,%rax
904	mov	-8*1($np),$m1
905	adc	\$0,%rdx
906	add	-8($tp),$A[1]
907	adc	\$0,%rdx
908	mov	%rdx,$A[0]
909
910	mulq	$m1			# np[j]*m1
911	add	%rax,$N[1]
912	mov	($ap,$num),%rax		# ap[0]
913	adc	\$0,%rdx
914	add	$A[1],$N[1]
915	adc	\$0,%rdx
916	mov	$N[0],-24($tp)		# tp[j-1]
917	mov	%rdx,$N[0]
918
919	mov	$N[1],-16($tp)		# tp[j-1]
920	lea	($np,$num),$np		# rewind $np
921
922	xor	$N[1],$N[1]
923	add	$A[0],$N[0]
924	adc	\$0,$N[1]
925	add	($tp),$N[0]		# pull upmost overflow bit
926	adc	\$0,$N[1]		# upmost overflow bit
927	mov	$N[0],-8($tp)
928
929	cmp	16+8(%rsp),$bp
930	jb	.Louter4x
931___
# Tail of mul4x_internal.  The condition is the constant 1, so only the
# first arm is ever appended to $code: it loads the registers named in its
# comments (tptr, nptr, rptr) and jumps to .Lsqr4x_sub_entry, a label
# defined outside this section, to perform the final subtraction.
932if (1) {
933$code.=<<___;
934	xor	%rax,%rax
935	sub	$N[0],$m1		# compare top-most words
936	adc	$j,$j			# $j is zero
937	or	$j,$N[1]
938	sub	$N[1],%rax		# %rax=-$N[1]
939	lea	($tp,$num),%rbx		# tptr in .sqr4x_sub
940	mov	($np),%r12
941	lea	($np),%rbp		# nptr in .sqr4x_sub
942	mov	%r9,%rcx
943	sar	\$3+2,%rcx
944	mov	56+8(%rsp),%rdi		# rptr in .sqr4x_sub
945	dec	%r12			# so that after 'not' we get -n[0]
946	xor	%r10,%r10
947	mov	8*1(%rbp),%r13
948	mov	8*2(%rbp),%r14
949	mov	8*3(%rbp),%r15
950	jmp	.Lsqr4x_sub_entry
951___
952} else {
# Never emitted (see the constant condition above): an in-line subtraction
# loop, .Lsub4x, that processes four words per iteration.  It steps
# through $np in 16-byte strides while $tp and $rp advance in 8-byte
# strides.
# NOTE(review): the 16-byte stride looks like the zero-interleaved np
# layout mentioned in the file header -- confirm before re-enabling.
953my @ri=("%rax",$bp,$m0,$m1);
954my $rp="%rdx";
955$code.=<<___
956	xor	\$1,$N[1]
957	lea	($tp,$num),$tp		# rewind $tp
958	sar	\$5,$num		# cf=0
959	lea	($np,$N[1],8),$np
960	mov	56+8(%rsp),$rp		# restore $rp
961	jmp	.Lsub4x
962
963.align	32
964.Lsub4x:
965	.byte	0x66
966	mov	8*0($tp),@ri[0]
967	mov	8*1($tp),@ri[1]
968	.byte	0x66
969	sbb	16*0($np),@ri[0]
970	mov	8*2($tp),@ri[2]
971	sbb	16*1($np),@ri[1]
972	mov	3*8($tp),@ri[3]
973	lea	4*8($tp),$tp
974	sbb	16*2($np),@ri[2]
975	mov	@ri[0],8*0($rp)
976	sbb	16*3($np),@ri[3]
977	lea	16*4($np),$np
978	mov	@ri[1],8*1($rp)
979	mov	@ri[2],8*2($rp)
980	mov	@ri[3],8*3($rp)
981	lea	8*4($rp),$rp
982
983	inc	$num
984	jnz	.Lsub4x
985
986	ret
987___
988}
989$code.=<<___;
990.size	mul4x_internal,.-mul4x_internal
991___
992}}}
993{{{
994######################################################################
995# void bn_power5(
996my $rptr="%rdi";	# BN_ULONG *rptr,
997my $aptr="%rsi";	# const BN_ULONG *aptr,
998my $bptr="%rdx";	# const void *table,
999my $nptr="%rcx";	# const BN_ULONG *nptr,
1000my $n0  ="%r8";		# const BN_ULONG *n0);
1001my $num ="%r9";		# int num, has to be divisible by 8
1002			# int pwr
1003
1004my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
1005my @A0=("%r10","%r11");
1006my @A1=("%r12","%r13");
1007my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
1008
1009$code.=<<___;
1010.globl	bn_power5
1011.type	bn_power5,\@function,6
1012.align	32
1013bn_power5:
1014___
1015$code.=<<___ if ($addx);
1016	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
1017	and	\$0x80108,%r11d
1018	cmp	\$0x80108,%r11d		# check for AD*X+BMI2+BMI1
1019	je	.Lpowerx5_enter
1020___
1021$code.=<<___;
1022	mov	%rsp,%rax
1023	push	%rbx
1024	push	%rbp
1025	push	%r12
1026	push	%r13
1027	push	%r14
1028	push	%r15
1029
1030	shl	\$3,${num}d		# convert $num to bytes
1031	lea	($num,$num,2),%r10d	# 3*$num
1032	neg	$num
1033	mov	($n0),$n0		# *n0
1034
1035	##############################################################
1036	# Ensure that stack frame doesn't alias with $rptr+3*$num
1037	# modulo 4096, which covers ret[num], am[num] and n[num]
1038	# (see bn_exp.c). This is done to allow memory disambiguation
1039	# logic do its magic. [Extra 256 bytes is for power mask
1040	# calculated from 7th argument, the index.]
1041	#
1042	lea	-320(%rsp,$num,2),%r11
1043	sub	$rptr,%r11
1044	and	\$4095,%r11
1045	cmp	%r11,%r10
1046	jb	.Lpwr_sp_alt
1047	sub	%r11,%rsp		# align with $aptr
1048	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*num*8+256)
1049	jmp	.Lpwr_sp_done
1050
1051.align	32
1052.Lpwr_sp_alt:
1053	lea	4096-320(,$num,2),%r10
1054	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*num*8+256)
1055	sub	%r10,%r11
1056	mov	\$0,%r10
1057	cmovc	%r10,%r11
1058	sub	%r11,%rsp
1059.Lpwr_sp_done:
1060	and	\$-64,%rsp
1061	mov	$num,%r10
1062	neg	$num
1063
1064	##############################################################
1065	# Stack layout
1066	#
1067	# +0	saved $num, used in reduction section
1068	# +8	&t[2*$num], used in reduction section
1069	# +32	saved *n0
1070	# +40	saved %rsp
1071	# +48	t[2*$num]
1072	#
1073	mov	$n0,  32(%rsp)
1074	mov	%rax, 40(%rsp)		# save original %rsp
1075.Lpower5_body:
1076	movq	$rptr,%xmm1		# save $rptr, used in sqr8x
1077	movq	$nptr,%xmm2		# save $nptr
1078	movq	%r10, %xmm3		# -$num, used in sqr8x
1079	movq	$bptr,%xmm4
1080
1081	call	__bn_sqr8x_internal
1082	call	__bn_post4x_internal
1083	call	__bn_sqr8x_internal
1084	call	__bn_post4x_internal
1085	call	__bn_sqr8x_internal
1086	call	__bn_post4x_internal
1087	call	__bn_sqr8x_internal
1088	call	__bn_post4x_internal
1089	call	__bn_sqr8x_internal
1090	call	__bn_post4x_internal
1091
1092	movq	%xmm2,$nptr
1093	movq	%xmm4,$bptr
1094	mov	$aptr,$rptr
1095	mov	40(%rsp),%rax
1096	lea	32(%rsp),$n0
1097
1098	call	mul4x_internal
1099
1100	mov	40(%rsp),%rsi		# restore %rsp
1101	mov	\$1,%rax
1102	mov	-48(%rsi),%r15
1103	mov	-40(%rsi),%r14
1104	mov	-32(%rsi),%r13
1105	mov	-24(%rsi),%r12
1106	mov	-16(%rsi),%rbp
1107	mov	-8(%rsi),%rbx
1108	lea	(%rsi),%rsp
1109.Lpower5_epilogue:
1110	ret
1111.size	bn_power5,.-bn_power5
1112
1113.globl	bn_sqr8x_internal
1114.hidden	bn_sqr8x_internal
1115.type	bn_sqr8x_internal,\@abi-omnipotent
1116.align	32
1117bn_sqr8x_internal:
1118__bn_sqr8x_internal:
1119	##############################################################
1120	# Squaring part:
1121	#
1122	# a) multiply-n-add everything but a[i]*a[i];
1123	# b) shift result of a) by 1 to the left and accumulate
1124	#    a[i]*a[i] products;
1125	#
1126	##############################################################
1127	#                                                     a[1]a[0]
1128	#                                                 a[2]a[0]
1129	#                                             a[3]a[0]
1130	#                                             a[2]a[1]
1131	#                                         a[4]a[0]
1132	#                                         a[3]a[1]
1133	#                                     a[5]a[0]
1134	#                                     a[4]a[1]
1135	#                                     a[3]a[2]
1136	#                                 a[6]a[0]
1137	#                                 a[5]a[1]
1138	#                                 a[4]a[2]
1139	#                             a[7]a[0]
1140	#                             a[6]a[1]
1141	#                             a[5]a[2]
1142	#                             a[4]a[3]
1143	#                         a[7]a[1]
1144	#                         a[6]a[2]
1145	#                         a[5]a[3]
1146	#                     a[7]a[2]
1147	#                     a[6]a[3]
1148	#                     a[5]a[4]
1149	#                 a[7]a[3]
1150	#                 a[6]a[4]
1151	#             a[7]a[4]
1152	#             a[6]a[5]
1153	#         a[7]a[5]
1154	#     a[7]a[6]
1155	#                                                     a[1]a[0]
1156	#                                                 a[2]a[0]
1157	#                                             a[3]a[0]
1158	#                                         a[4]a[0]
1159	#                                     a[5]a[0]
1160	#                                 a[6]a[0]
1161	#                             a[7]a[0]
1162	#                                             a[2]a[1]
1163	#                                         a[3]a[1]
1164	#                                     a[4]a[1]
1165	#                                 a[5]a[1]
1166	#                             a[6]a[1]
1167	#                         a[7]a[1]
1168	#                                     a[3]a[2]
1169	#                                 a[4]a[2]
1170	#                             a[5]a[2]
1171	#                         a[6]a[2]
1172	#                     a[7]a[2]
1173	#                             a[4]a[3]
1174	#                         a[5]a[3]
1175	#                     a[6]a[3]
1176	#                 a[7]a[3]
1177	#                     a[5]a[4]
1178	#                 a[6]a[4]
1179	#             a[7]a[4]
1180	#             a[6]a[5]
1181	#         a[7]a[5]
1182	#     a[7]a[6]
1183	#                                                         a[0]a[0]
1184	#                                                 a[1]a[1]
1185	#                                         a[2]a[2]
1186	#                                 a[3]a[3]
1187	#                         a[4]a[4]
1188	#                 a[5]a[5]
1189	#         a[6]a[6]
1190	# a[7]a[7]
1191
1192	lea	32(%r10),$i		# $i=-($num-32)
1193	lea	($aptr,$num),$aptr	# end of a[] buffer, ($aptr,$i)=&ap[2]
1194
1195	mov	$num,$j			# $j=$num
1196
1197					# comments apply to $num==8 case
1198	mov	-32($aptr,$i),$a0	# a[0]
1199	lea	48+8(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
1200	mov	-24($aptr,$i),%rax	# a[1]
1201	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
1202	mov	-16($aptr,$i),$ai	# a[2]
1203	mov	%rax,$a1
1204
1205	mul	$a0			# a[1]*a[0]
1206	mov	%rax,$A0[0]		# a[1]*a[0]
1207	 mov	$ai,%rax		# a[2]
1208	mov	%rdx,$A0[1]
1209	mov	$A0[0],-24($tptr,$i)	# t[1]
1210
1211	mul	$a0			# a[2]*a[0]
1212	add	%rax,$A0[1]
1213	 mov	$ai,%rax
1214	adc	\$0,%rdx
1215	mov	$A0[1],-16($tptr,$i)	# t[2]
1216	mov	%rdx,$A0[0]
1217
1218
1219	 mov	-8($aptr,$i),$ai	# a[3]
1220	mul	$a1			# a[2]*a[1]
1221	mov	%rax,$A1[0]		# a[2]*a[1]+t[3]
1222	 mov	$ai,%rax
1223	mov	%rdx,$A1[1]
1224
1225	 lea	($i),$j
1226	mul	$a0			# a[3]*a[0]
1227	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
1228	 mov	$ai,%rax
1229	mov	%rdx,$A0[1]
1230	adc	\$0,$A0[1]
1231	add	$A1[0],$A0[0]
1232	adc	\$0,$A0[1]
1233	mov	$A0[0],-8($tptr,$j)	# t[3]
1234	jmp	.Lsqr4x_1st
1235
1236.align	32
1237.Lsqr4x_1st:
1238	 mov	($aptr,$j),$ai		# a[4]
1239	mul	$a1			# a[3]*a[1]
1240	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
1241	 mov	$ai,%rax
1242	mov	%rdx,$A1[0]
1243	adc	\$0,$A1[0]
1244
1245	mul	$a0			# a[4]*a[0]
1246	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
1247	 mov	$ai,%rax		# a[3]
1248	 mov	8($aptr,$j),$ai		# a[5]
1249	mov	%rdx,$A0[0]
1250	adc	\$0,$A0[0]
1251	add	$A1[1],$A0[1]
1252	adc	\$0,$A0[0]
1253
1254
1255	mul	$a1			# a[4]*a[3]
1256	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
1257	 mov	$ai,%rax
1258	 mov	$A0[1],($tptr,$j)	# t[4]
1259	mov	%rdx,$A1[1]
1260	adc	\$0,$A1[1]
1261
1262	mul	$a0			# a[5]*a[2]
1263	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
1264	 mov	$ai,%rax
1265	 mov	16($aptr,$j),$ai	# a[6]
1266	mov	%rdx,$A0[1]
1267	adc	\$0,$A0[1]
1268	add	$A1[0],$A0[0]
1269	adc	\$0,$A0[1]
1270
1271	mul	$a1			# a[5]*a[3]
1272	add	%rax,$A1[1]		# a[5]*a[3]+t[6]
1273	 mov	$ai,%rax
1274	 mov	$A0[0],8($tptr,$j)	# t[5]
1275	mov	%rdx,$A1[0]
1276	adc	\$0,$A1[0]
1277
1278	mul	$a0			# a[6]*a[2]
1279	add	%rax,$A0[1]		# a[6]*a[2]+a[5]*a[3]+t[6]
1280	 mov	$ai,%rax		# a[3]
1281	 mov	24($aptr,$j),$ai	# a[7]
1282	mov	%rdx,$A0[0]
1283	adc	\$0,$A0[0]
1284	add	$A1[1],$A0[1]
1285	adc	\$0,$A0[0]
1286
1287
1288	mul	$a1			# a[6]*a[5]
1289	add	%rax,$A1[0]		# a[6]*a[5]+t[7]
1290	 mov	$ai,%rax
1291	 mov	$A0[1],16($tptr,$j)	# t[6]
1292	mov	%rdx,$A1[1]
1293	adc	\$0,$A1[1]
1294	 lea	32($j),$j
1295
1296	mul	$a0			# a[7]*a[4]
1297	add	%rax,$A0[0]		# a[7]*a[4]+a[6]*a[5]+t[6]
1298	 mov	$ai,%rax
1299	mov	%rdx,$A0[1]
1300	adc	\$0,$A0[1]
1301	add	$A1[0],$A0[0]
1302	adc	\$0,$A0[1]
1303	mov	$A0[0],-8($tptr,$j)	# t[7]
1304
1305	cmp	\$0,$j
1306	jne	.Lsqr4x_1st
1307
1308	mul	$a1			# a[7]*a[5]
1309	add	%rax,$A1[1]
1310	lea	16($i),$i
1311	adc	\$0,%rdx
1312	add	$A0[1],$A1[1]
1313	adc	\$0,%rdx
1314
1315	mov	$A1[1],($tptr)		# t[8]
1316	mov	%rdx,$A1[0]
1317	mov	%rdx,8($tptr)		# t[9]
1318	jmp	.Lsqr4x_outer
1319
1320.align	32
1321.Lsqr4x_outer:				# comments apply to $num==6 case
1322	mov	-32($aptr,$i),$a0	# a[0]
1323	lea	48+8(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
1324	mov	-24($aptr,$i),%rax	# a[1]
1325	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
1326	mov	-16($aptr,$i),$ai	# a[2]
1327	mov	%rax,$a1
1328
1329	mul	$a0			# a[1]*a[0]
1330	mov	-24($tptr,$i),$A0[0]	# t[1]
1331	add	%rax,$A0[0]		# a[1]*a[0]+t[1]
1332	 mov	$ai,%rax		# a[2]
1333	adc	\$0,%rdx
1334	mov	$A0[0],-24($tptr,$i)	# t[1]
1335	mov	%rdx,$A0[1]
1336
1337	mul	$a0			# a[2]*a[0]
1338	add	%rax,$A0[1]
1339	 mov	$ai,%rax
1340	adc	\$0,%rdx
1341	add	-16($tptr,$i),$A0[1]	# a[2]*a[0]+t[2]
1342	mov	%rdx,$A0[0]
1343	adc	\$0,$A0[0]
1344	mov	$A0[1],-16($tptr,$i)	# t[2]
1345
1346	xor	$A1[0],$A1[0]
1347
1348	 mov	-8($aptr,$i),$ai	# a[3]
1349	mul	$a1			# a[2]*a[1]
1350	add	%rax,$A1[0]		# a[2]*a[1]+t[3]
1351	 mov	$ai,%rax
1352	adc	\$0,%rdx
1353	add	-8($tptr,$i),$A1[0]
1354	mov	%rdx,$A1[1]
1355	adc	\$0,$A1[1]
1356
1357	mul	$a0			# a[3]*a[0]
1358	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
1359	 mov	$ai,%rax
1360	adc	\$0,%rdx
1361	add	$A1[0],$A0[0]
1362	mov	%rdx,$A0[1]
1363	adc	\$0,$A0[1]
1364	mov	$A0[0],-8($tptr,$i)	# t[3]
1365
1366	lea	($i),$j
1367	jmp	.Lsqr4x_inner
1368
1369.align	32
1370.Lsqr4x_inner:
1371	 mov	($aptr,$j),$ai		# a[4]
1372	mul	$a1			# a[3]*a[1]
1373	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
1374	 mov	$ai,%rax
1375	mov	%rdx,$A1[0]
1376	adc	\$0,$A1[0]
1377	add	($tptr,$j),$A1[1]
1378	adc	\$0,$A1[0]
1379
1380	.byte	0x67
1381	mul	$a0			# a[4]*a[0]
1382	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
1383	 mov	$ai,%rax		# a[3]
1384	 mov	8($aptr,$j),$ai		# a[5]
1385	mov	%rdx,$A0[0]
1386	adc	\$0,$A0[0]
1387	add	$A1[1],$A0[1]
1388	adc	\$0,$A0[0]
1389
1390	mul	$a1			# a[4]*a[3]
1391	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
1392	mov	$A0[1],($tptr,$j)	# t[4]
1393	 mov	$ai,%rax
1394	mov	%rdx,$A1[1]
1395	adc	\$0,$A1[1]
1396	add	8($tptr,$j),$A1[0]
1397	lea	16($j),$j		# j++
1398	adc	\$0,$A1[1]
1399
1400	mul	$a0			# a[5]*a[2]
1401	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
1402	 mov	$ai,%rax
1403	adc	\$0,%rdx
1404	add	$A1[0],$A0[0]
1405	mov	%rdx,$A0[1]
1406	adc	\$0,$A0[1]
1407	mov	$A0[0],-8($tptr,$j)	# t[5], "preloaded t[1]" below
1408
1409	cmp	\$0,$j
1410	jne	.Lsqr4x_inner
1411
1412	.byte	0x67
1413	mul	$a1			# a[5]*a[3]
1414	add	%rax,$A1[1]
1415	adc	\$0,%rdx
1416	add	$A0[1],$A1[1]
1417	adc	\$0,%rdx
1418
1419	mov	$A1[1],($tptr)		# t[6], "preloaded t[2]" below
1420	mov	%rdx,$A1[0]
1421	mov	%rdx,8($tptr)		# t[7], "preloaded t[3]" below
1422
1423	add	\$16,$i
1424	jnz	.Lsqr4x_outer
1425
1426					# comments apply to $num==4 case
1427	mov	-32($aptr),$a0		# a[0]
1428	lea	48+8(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
1429	mov	-24($aptr),%rax		# a[1]
1430	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
1431	mov	-16($aptr),$ai		# a[2]
1432	mov	%rax,$a1
1433
1434	mul	$a0			# a[1]*a[0]
1435	add	%rax,$A0[0]		# a[1]*a[0]+t[1], preloaded t[1]
1436	 mov	$ai,%rax		# a[2]
1437	mov	%rdx,$A0[1]
1438	adc	\$0,$A0[1]
1439
1440	mul	$a0			# a[2]*a[0]
1441	add	%rax,$A0[1]
1442	 mov	$ai,%rax
1443	 mov	$A0[0],-24($tptr)	# t[1]
1444	mov	%rdx,$A0[0]
1445	adc	\$0,$A0[0]
1446	add	$A1[1],$A0[1]		# a[2]*a[0]+t[2], preloaded t[2]
1447	 mov	-8($aptr),$ai		# a[3]
1448	adc	\$0,$A0[0]
1449
1450	mul	$a1			# a[2]*a[1]
1451	add	%rax,$A1[0]		# a[2]*a[1]+t[3], preloaded t[3]
1452	 mov	$ai,%rax
1453	 mov	$A0[1],-16($tptr)	# t[2]
1454	mov	%rdx,$A1[1]
1455	adc	\$0,$A1[1]
1456
1457	mul	$a0			# a[3]*a[0]
1458	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
1459	 mov	$ai,%rax
1460	mov	%rdx,$A0[1]
1461	adc	\$0,$A0[1]
1462	add	$A1[0],$A0[0]
1463	adc	\$0,$A0[1]
1464	mov	$A0[0],-8($tptr)	# t[3]
1465
1466	mul	$a1			# a[3]*a[1]
1467	add	%rax,$A1[1]
1468	 mov	-16($aptr),%rax		# a[2]
1469	adc	\$0,%rdx
1470	add	$A0[1],$A1[1]
1471	adc	\$0,%rdx
1472
1473	mov	$A1[1],($tptr)		# t[4]
1474	mov	%rdx,$A1[0]
1475	mov	%rdx,8($tptr)		# t[5]
1476
1477	mul	$ai			# a[2]*a[3]
1478___
1479{
# Shift-and-add tail of the squaring code emitted above.
#
# The heredoc below first finishes the last cross product (a[2]*a[3] is
# added into t[5]/t[6] and t[7] is cleared), then walks t[] from the
# bottom (48+8(%rsp)) upwards.  Every pair of t[] words is doubled: the
# word is shifted left by one with "lea (...,reg,2)" and bit 63 of the
# preceding word is carried along in $shift.  The square a[i]*a[i] is
# then added on top of the doubled pair.
#
# The carry of that addition has to survive the flag-clobbering
# instructions between two steps, so it is parked in $carry with "sbb"
# (0 or -1) and moved back into CF with "neg" right before the next
# "adc" pair.
#
# $shift/$carry reuse the registers of $a0/$a1, and @S names the four
# result registers, reusing @A1, $ai and $n0.
#
# NOTE(review): the "lea ($j,...,2)" forms use $j as the base register,
# so $j is presumably zero at this point (the loops emitted above are
# left on "$j == 0") -- confirm against the part of the function that
# precedes this scope.
1480my ($shift,$carry)=($a0,$a1);
1481my @S=(@A1,$ai,$n0);
1482$code.=<<___;
1483	 add	\$16,$i
1484	 xor	$shift,$shift
1485	 sub	$num,$i			# $i=16-$num
1486	 xor	$carry,$carry
1487
1488	add	$A1[0],%rax		# t[5]
1489	adc	\$0,%rdx
1490	mov	%rax,8($tptr)		# t[5]
1491	mov	%rdx,16($tptr)		# t[6]
1492	mov	$carry,24($tptr)	# t[7]
1493
1494	 mov	-16($aptr,$i),%rax	# a[0]
1495	lea	48+8(%rsp),$tptr
1496	 xor	$A0[0],$A0[0]		# t[0]
1497	 mov	8($tptr),$A0[1]		# t[1]
1498
1499	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1500	shr	\$63,$A0[0]
1501	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1502	shr	\$63,$A0[1]
1503	or	$A0[0],$S[1]		# | t[2*i]>>63
1504	 mov	16($tptr),$A0[0]	# t[2*i+2]	# prefetch
1505	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1506	mul	%rax			# a[i]*a[i]
1507	neg	$carry			# mov $carry,cf
1508	 mov	24($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1509	adc	%rax,$S[0]
1510	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
1511	mov	$S[0],($tptr)
1512	adc	%rdx,$S[1]
1513
1514	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
1515	 mov	$S[1],8($tptr)
1516	 sbb	$carry,$carry		# mov cf,$carry
1517	shr	\$63,$A0[0]
1518	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1519	shr	\$63,$A0[1]
1520	or	$A0[0],$S[3]		# | t[2*i]>>63
1521	 mov	32($tptr),$A0[0]	# t[2*i+2]	# prefetch
1522	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1523	mul	%rax			# a[i]*a[i]
1524	neg	$carry			# mov $carry,cf
1525	 mov	40($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1526	adc	%rax,$S[2]
1527	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
1528	mov	$S[2],16($tptr)
1529	adc	%rdx,$S[3]
1530	lea	16($i),$i
1531	mov	$S[3],24($tptr)
1532	sbb	$carry,$carry		# mov cf,$carry
1533	lea	64($tptr),$tptr
1534	jmp	.Lsqr4x_shift_n_add
1535
1536.align	32
1537.Lsqr4x_shift_n_add:
1538	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1539	shr	\$63,$A0[0]
1540	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1541	shr	\$63,$A0[1]
1542	or	$A0[0],$S[1]		# | t[2*i]>>63
1543	 mov	-16($tptr),$A0[0]	# t[2*i+2]	# prefetch
1544	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1545	mul	%rax			# a[i]*a[i]
1546	neg	$carry			# mov $carry,cf
1547	 mov	-8($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1548	adc	%rax,$S[0]
1549	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
1550	mov	$S[0],-32($tptr)
1551	adc	%rdx,$S[1]
1552
1553	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
1554	 mov	$S[1],-24($tptr)
1555	 sbb	$carry,$carry		# mov cf,$carry
1556	shr	\$63,$A0[0]
1557	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1558	shr	\$63,$A0[1]
1559	or	$A0[0],$S[3]		# | t[2*i]>>63
1560	 mov	0($tptr),$A0[0]		# t[2*i+2]	# prefetch
1561	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1562	mul	%rax			# a[i]*a[i]
1563	neg	$carry			# mov $carry,cf
1564	 mov	8($tptr),$A0[1]		# t[2*i+2+1]	# prefetch
1565	adc	%rax,$S[2]
1566	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
1567	mov	$S[2],-16($tptr)
1568	adc	%rdx,$S[3]
1569
1570	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1571	 mov	$S[3],-8($tptr)
1572	 sbb	$carry,$carry		# mov cf,$carry
1573	shr	\$63,$A0[0]
1574	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1575	shr	\$63,$A0[1]
1576	or	$A0[0],$S[1]		# | t[2*i]>>63
1577	 mov	16($tptr),$A0[0]	# t[2*i+2]	# prefetch
1578	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1579	mul	%rax			# a[i]*a[i]
1580	neg	$carry			# mov $carry,cf
1581	 mov	24($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1582	adc	%rax,$S[0]
1583	 mov	8($aptr,$i),%rax	# a[i+1]	# prefetch
1584	mov	$S[0],0($tptr)
1585	adc	%rdx,$S[1]
1586
1587	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
1588	 mov	$S[1],8($tptr)
1589	 sbb	$carry,$carry		# mov cf,$carry
1590	shr	\$63,$A0[0]
1591	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1592	shr	\$63,$A0[1]
1593	or	$A0[0],$S[3]		# | t[2*i]>>63
1594	 mov	32($tptr),$A0[0]	# t[2*i+2]	# prefetch
1595	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1596	mul	%rax			# a[i]*a[i]
1597	neg	$carry			# mov $carry,cf
1598	 mov	40($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1599	adc	%rax,$S[2]
1600	 mov	16($aptr,$i),%rax	# a[i+1]	# prefetch
1601	mov	$S[2],16($tptr)
1602	adc	%rdx,$S[3]
1603	mov	$S[3],24($tptr)
1604	sbb	$carry,$carry		# mov cf,$carry
1605	lea	64($tptr),$tptr
1606	add	\$32,$i
1607	jnz	.Lsqr4x_shift_n_add
1608
1609	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1610	.byte	0x67
1611	shr	\$63,$A0[0]
1612	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1613	shr	\$63,$A0[1]
1614	or	$A0[0],$S[1]		# | t[2*i]>>63
1615	 mov	-16($tptr),$A0[0]	# t[2*i+2]	# prefetch
1616	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1617	mul	%rax			# a[i]*a[i]
1618	neg	$carry			# mov $carry,cf
1619	 mov	-8($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1620	adc	%rax,$S[0]
1621	 mov	-8($aptr),%rax		# a[i+1]	# prefetch
1622	mov	$S[0],-32($tptr)
1623	adc	%rdx,$S[1]
1624
1625	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1|shift
1626	 mov	$S[1],-24($tptr)
1627	 sbb	$carry,$carry		# mov cf,$carry
1628	shr	\$63,$A0[0]
1629	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1630	shr	\$63,$A0[1]
1631	or	$A0[0],$S[3]		# | t[2*i]>>63
1632	mul	%rax			# a[i]*a[i]
1633	neg	$carry			# mov $carry,cf
1634	adc	%rax,$S[2]
1635	adc	%rdx,$S[3]
1636	mov	$S[2],-16($tptr)
1637	mov	$S[3],-8($tptr)
1638___
1639}
1640######################################################################
1641# Montgomery reduction part, "word-by-word" algorithm.
1642#
1643# This new path is inspired by multiple submissions from Intel, by
1644# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
1645# Vinodh Gopal...
1646{
1647my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx");
# Emits __bn_sqr8x_reduction (also the fall-through tail of
# bn_sqr8x_internal): word-by-word Montgomery reduction of t[2*num],
# eight words at a time.
#
# Stack slots used (the extra "+8" presumably accounts for the return
# address, as this code is entered by "call"):
#   0+8(%rsp)	end of n[]
#   8+8(%rsp)	end of t[]; the word there holds the top-most carry bit
#   32+8(%rsp)	n0
#   48+8(%rsp)	t[]; the eight slots below each t[] window are used to
#		put aside the multipliers n0*t[i]
#
# .L8x_reduction_loop	picks the next 8-word window of t[].
# .L8x_reduce		derives eight multipliers m = n0*t[i], folds
#			n[0..7]*m into the window and saves every m.
# .L8x_tail		applies the saved multipliers to each further
#			8-word chunk of n[].
# .L8x_tail_done	adds the carry bit saved by the previous window.
# .L8x_no_tail		adds the upper half of the window, stores the
#			result and leaves the new top-most carry in %rax.
#
# On return %rax holds the top-most carry, $nptr and $num are reloaded
# from %xmm2 and %xmm3.
#
# Carry fix in .L8x_tail_done: the chain "add (%rdx),%r8; adc \$0,%r9 ...
# adc \$0,%r15" can carry out of %r15.  The carry used to be dropped,
# because %rax was cleared after the chain and "neg $carry" overwrites
# CF.  %rax is now cleared up front and picks the carry up with an extra
# "adc \$0,%rax", so it ends up in the top-most carry.
1648
1649$code.=<<___;
1650	movq	%xmm2,$nptr
1651__bn_sqr8x_reduction:
1652	xor	%rax,%rax
1653	lea	($nptr,$num),%rcx	# end of n[]
1654	lea	48+8(%rsp,$num,2),%rdx	# end of t[] buffer
1655	mov	%rcx,0+8(%rsp)
1656	lea	48+8(%rsp,$num),$tptr	# end of initial t[] window
1657	mov	%rdx,8+8(%rsp)
1658	neg	$num
1659	jmp	.L8x_reduction_loop
1660
1661.align	32
1662.L8x_reduction_loop:
1663	lea	($tptr,$num),$tptr	# start of current t[] window
1664	.byte	0x66
1665	mov	8*0($tptr),$m0
1666	mov	8*1($tptr),%r9
1667	mov	8*2($tptr),%r10
1668	mov	8*3($tptr),%r11
1669	mov	8*4($tptr),%r12
1670	mov	8*5($tptr),%r13
1671	mov	8*6($tptr),%r14
1672	mov	8*7($tptr),%r15
1673	mov	%rax,(%rdx)		# store top-most carry bit
1674	lea	8*8($tptr),$tptr
1675
1676	.byte	0x67
1677	mov	$m0,%r8
1678	imulq	32+8(%rsp),$m0		# n0*a[0]
1679	mov	8*0($nptr),%rax		# n[0]
1680	mov	\$8,%ecx
1681	jmp	.L8x_reduce
1682
1683.align	32
1684.L8x_reduce:
1685	mulq	$m0
1686	 mov	8*1($nptr),%rax		# n[1]
1687	neg	%r8
1688	mov	%rdx,%r8
1689	adc	\$0,%r8
1690
1691	mulq	$m0
1692	add	%rax,%r9
1693	 mov	8*2($nptr),%rax
1694	adc	\$0,%rdx
1695	add	%r9,%r8
1696	 mov	$m0,48-8+8(%rsp,%rcx,8)	# put aside n0*a[i]
1697	mov	%rdx,%r9
1698	adc	\$0,%r9
1699
1700	mulq	$m0
1701	add	%rax,%r10
1702	 mov	8*3($nptr),%rax
1703	adc	\$0,%rdx
1704	add	%r10,%r9
1705	 mov	32+8(%rsp),$carry	# pull n0, borrow $carry
1706	mov	%rdx,%r10
1707	adc	\$0,%r10
1708
1709	mulq	$m0
1710	add	%rax,%r11
1711	 mov	8*4($nptr),%rax
1712	adc	\$0,%rdx
1713	 imulq	%r8,$carry		# modulo-scheduled
1714	add	%r11,%r10
1715	mov	%rdx,%r11
1716	adc	\$0,%r11
1717
1718	mulq	$m0
1719	add	%rax,%r12
1720	 mov	8*5($nptr),%rax
1721	adc	\$0,%rdx
1722	add	%r12,%r11
1723	mov	%rdx,%r12
1724	adc	\$0,%r12
1725
1726	mulq	$m0
1727	add	%rax,%r13
1728	 mov	8*6($nptr),%rax
1729	adc	\$0,%rdx
1730	add	%r13,%r12
1731	mov	%rdx,%r13
1732	adc	\$0,%r13
1733
1734	mulq	$m0
1735	add	%rax,%r14
1736	 mov	8*7($nptr),%rax
1737	adc	\$0,%rdx
1738	add	%r14,%r13
1739	mov	%rdx,%r14
1740	adc	\$0,%r14
1741
1742	mulq	$m0
1743	 mov	$carry,$m0		# n0*a[i]
1744	add	%rax,%r15
1745	 mov	8*0($nptr),%rax		# n[0]
1746	adc	\$0,%rdx
1747	add	%r15,%r14
1748	mov	%rdx,%r15
1749	adc	\$0,%r15
1750
1751	dec	%ecx
1752	jnz	.L8x_reduce
1753
1754	lea	8*8($nptr),$nptr
1755	xor	%rax,%rax
1756	mov	8+8(%rsp),%rdx		# pull end of t[]
1757	cmp	0+8(%rsp),$nptr		# end of n[]?
1758	jae	.L8x_no_tail
1759
1760	.byte	0x66
1761	add	8*0($tptr),%r8
1762	adc	8*1($tptr),%r9
1763	adc	8*2($tptr),%r10
1764	adc	8*3($tptr),%r11
1765	adc	8*4($tptr),%r12
1766	adc	8*5($tptr),%r13
1767	adc	8*6($tptr),%r14
1768	adc	8*7($tptr),%r15
1769	sbb	$carry,$carry		# top carry
1770
1771	mov	48+56+8(%rsp),$m0	# pull n0*a[0]
1772	mov	\$8,%ecx
1773	mov	8*0($nptr),%rax
1774	jmp	.L8x_tail
1775
1776.align	32
1777.L8x_tail:
1778	mulq	$m0
1779	add	%rax,%r8
1780	 mov	8*1($nptr),%rax
1781	 mov	%r8,($tptr)		# save result
1782	mov	%rdx,%r8
1783	adc	\$0,%r8
1784
1785	mulq	$m0
1786	add	%rax,%r9
1787	 mov	8*2($nptr),%rax
1788	adc	\$0,%rdx
1789	add	%r9,%r8
1790	 lea	8($tptr),$tptr		# $tptr++
1791	mov	%rdx,%r9
1792	adc	\$0,%r9
1793
1794	mulq	$m0
1795	add	%rax,%r10
1796	 mov	8*3($nptr),%rax
1797	adc	\$0,%rdx
1798	add	%r10,%r9
1799	mov	%rdx,%r10
1800	adc	\$0,%r10
1801
1802	mulq	$m0
1803	add	%rax,%r11
1804	 mov	8*4($nptr),%rax
1805	adc	\$0,%rdx
1806	add	%r11,%r10
1807	mov	%rdx,%r11
1808	adc	\$0,%r11
1809
1810	mulq	$m0
1811	add	%rax,%r12
1812	 mov	8*5($nptr),%rax
1813	adc	\$0,%rdx
1814	add	%r12,%r11
1815	mov	%rdx,%r12
1816	adc	\$0,%r12
1817
1818	mulq	$m0
1819	add	%rax,%r13
1820	 mov	8*6($nptr),%rax
1821	adc	\$0,%rdx
1822	add	%r13,%r12
1823	mov	%rdx,%r13
1824	adc	\$0,%r13
1825
1826	mulq	$m0
1827	add	%rax,%r14
1828	 mov	8*7($nptr),%rax
1829	adc	\$0,%rdx
1830	add	%r14,%r13
1831	mov	%rdx,%r14
1832	adc	\$0,%r14
1833
1834	mulq	$m0
1835	 mov	48-16+8(%rsp,%rcx,8),$m0# pull n0*a[i]
1836	add	%rax,%r15
1837	adc	\$0,%rdx
1838	add	%r15,%r14
1839	 mov	8*0($nptr),%rax		# pull n[0]
1840	mov	%rdx,%r15
1841	adc	\$0,%r15
1842
1843	dec	%ecx
1844	jnz	.L8x_tail
1845
1846	lea	8*8($nptr),$nptr
1847	mov	8+8(%rsp),%rdx		# pull end of t[]
1848	cmp	0+8(%rsp),$nptr		# end of n[]?
1849	jae	.L8x_tail_done		# break out of loop
1850
1851	 mov	48+56+8(%rsp),$m0	# pull n0*a[0]
1852	neg	$carry
1853	 mov	8*0($nptr),%rax		# pull n[0]
1854	adc	8*0($tptr),%r8
1855	adc	8*1($tptr),%r9
1856	adc	8*2($tptr),%r10
1857	adc	8*3($tptr),%r11
1858	adc	8*4($tptr),%r12
1859	adc	8*5($tptr),%r13
1860	adc	8*6($tptr),%r14
1861	adc	8*7($tptr),%r15
1862	sbb	$carry,$carry		# top carry
1863
1864	mov	\$8,%ecx
1865	jmp	.L8x_tail
1866
1867.align	32
1868.L8x_tail_done:
1869	xor	%rax,%rax		# cleared first, it collects the carry
1870	add	(%rdx),%r8		# can this overflow?
1871	adc	\$0,%r9
1872	adc	\$0,%r10
1873	adc	\$0,%r11
1874	adc	\$0,%r12
1875	adc	\$0,%r13
1876	adc	\$0,%r14
1877	adc	\$0,%r15		# may carry out of the window...
1878	adc	\$0,%rax		# ...so keep it as top-most carry
1879					# instead of dropping it
1880
1881	neg	$carry
1882.L8x_no_tail:
1883	adc	8*0($tptr),%r8
1884	adc	8*1($tptr),%r9
1885	adc	8*2($tptr),%r10
1886	adc	8*3($tptr),%r11
1887	adc	8*4($tptr),%r12
1888	adc	8*5($tptr),%r13
1889	adc	8*6($tptr),%r14
1890	adc	8*7($tptr),%r15
1891	adc	\$0,%rax		# top-most carry
1892	 mov	-8($nptr),%rcx		# np[num-1]
1893	 xor	$carry,$carry
1894
1895	movq	%xmm2,$nptr		# restore $nptr
1896
1897	mov	%r8,8*0($tptr)		# store top 512 bits
1898	mov	%r9,8*1($tptr)
1899	 movq	%xmm3,$num		# $num is %r9, can't be moved upwards
1900	mov	%r10,8*2($tptr)
1901	mov	%r11,8*3($tptr)
1902	mov	%r12,8*4($tptr)
1903	mov	%r13,8*5($tptr)
1904	mov	%r14,8*6($tptr)
1905	mov	%r15,8*7($tptr)
1906	lea	8*8($tptr),$tptr
1907
1908	cmp	%rdx,$tptr		# end of t[]?
1909	jb	.L8x_reduction_loop
1910	ret
1911.size	bn_sqr8x_internal,.-bn_sqr8x_internal
1912___
1913}
1914##############################################################
1915# Post-condition, 4x unrolled
1916#
1917{
# Emits __bn_post4x_internal: the final conditional subtraction of the
# modulus, four words per iteration.
#
# %rax is negated on entry and then used as an AND mask, so callers are
# presumably expected to pass the top-most carry (0 or 1) left behind
# by the reduction code -- confirm at the call sites.
#
# Each iteration loads four words of n[], complements them, masks them
# with %rax and adds them to four words of t[], writing the sums to
# $rptr.  n[0] is decremented before the complement so that the first
# word contributes -n[0]; together with the complemented upper words
# this adds -n when the mask is all ones and adds zero otherwise.
#
# The carry between iterations is kept in %r10 ("sbb" to save it,
# "neg" to put it back into CF).  %rcx is $num shifted right
# arithmetically by 5 and counted up to zero with "inc", so $num is a
# negative byte count on entry.
#
# Before returning, $num is copied to %r10 and negated again, and $aptr
# has been loaded from %xmm1 ("prepare for back-to-back call").
#
# $tptr/$nptr are rebound to %rbx/%rbp for this scope; $rptr, $aptr and
# $num come from the enclosing scope.
1918my ($tptr,$nptr)=("%rbx","%rbp");
1919$code.=<<___;
1920.type	__bn_post4x_internal,\@abi-omnipotent
1921.align	32
1922__bn_post4x_internal:
1923	mov	8*0($nptr),%r12
1924	lea	(%rdi,$num),$tptr	# %rdi was $tptr above
1925	mov	$num,%rcx
1926	movq	%xmm1,$rptr		# restore $rptr
1927	neg	%rax
1928	movq	%xmm1,$aptr		# prepare for back-to-back call
1929	sar	\$3+2,%rcx
1930	dec	%r12			# so that after 'not' we get -n[0]
1931	xor	%r10,%r10
1932	mov	8*1($nptr),%r13
1933	mov	8*2($nptr),%r14
1934	mov	8*3($nptr),%r15
1935	jmp	.Lsqr4x_sub_entry
1936
1937.align	16
1938.Lsqr4x_sub:
1939	mov	8*0($nptr),%r12
1940	mov	8*1($nptr),%r13
1941	mov	8*2($nptr),%r14
1942	mov	8*3($nptr),%r15
1943.Lsqr4x_sub_entry:
1944	lea	8*4($nptr),$nptr
1945	not	%r12
1946	not	%r13
1947	not	%r14
1948	not	%r15
1949	and	%rax,%r12
1950	and	%rax,%r13
1951	and	%rax,%r14
1952	and	%rax,%r15
1953
1954	neg	%r10			# mov %r10,%cf
1955	adc	8*0($tptr),%r12
1956	adc	8*1($tptr),%r13
1957	adc	8*2($tptr),%r14
1958	adc	8*3($tptr),%r15
1959	mov	%r12,8*0($rptr)
1960	lea	8*4($tptr),$tptr
1961	mov	%r13,8*1($rptr)
1962	sbb	%r10,%r10		# mov %cf,%r10
1963	mov	%r14,8*2($rptr)
1964	mov	%r15,8*3($rptr)
1965	lea	8*4($rptr),$rptr
1966
1967	inc	%rcx			# pass %cf
1968	jnz	.Lsqr4x_sub
1969
1970	mov	$num,%r10		# prepare for back-to-back call
1971	neg	$num			# restore $num
1972	ret
1973.size	__bn_post4x_internal,.-__bn_post4x_internal
1974___
1975}
1976{
# Emits bn_from_montgomery and its worker bn_from_mont8x.
#
# bn_from_montgomery tests the low three bits of the num argument
# (%r9d, or 48(%rsp) for Win64 flavours).  If num is a multiple of 8 it
# jumps to bn_from_mont8x, otherwise it returns 0 in %eax.
#
# bn_from_mont8x:
#   - saves the callee-saved registers and converts $num to bytes;
#   - allocates a 64-byte aligned frame, positioned as described by the
#     comment inside the heredoc;
#   - .Lmul_by_1 copies a[] into the lower half of t[] and zeroes the
#     upper half, 64 bytes per iteration;
#   - parks $rptr, $nptr and -num in %xmm1, %xmm2 and %xmm3, then calls
#     the reduction and post-condition subroutines;
#   - .Lfrom_mont_zero overwrites the frame with zeros starting at
#     48(%rsp), 64 bytes per iteration, before the registers are
#     restored and 1 is returned in %rax.
1977$code.=<<___;
1978.globl	bn_from_montgomery
1979.type	bn_from_montgomery,\@abi-omnipotent
1980.align	32
1981bn_from_montgomery:
1982	testl	\$7,`($win64?"48(%rsp)":"%r9d")`
1983	jz	bn_from_mont8x
1984	xor	%eax,%eax
1985	ret
1986.size	bn_from_montgomery,.-bn_from_montgomery
1987
1988.type	bn_from_mont8x,\@function,6
1989.align	32
1990bn_from_mont8x:
1991	.byte	0x67
1992	mov	%rsp,%rax
1993	push	%rbx
1994	push	%rbp
1995	push	%r12
1996	push	%r13
1997	push	%r14
1998	push	%r15
1999
2000	shl	\$3,${num}d		# convert $num to bytes
2001	lea	($num,$num,2),%r10	# 3*$num in bytes
2002	neg	$num
2003	mov	($n0),$n0		# *n0
2004
2005	##############################################################
2006	# Ensure that stack frame doesn't alias with $rptr+3*$num
2007	# modulo 4096, which covers ret[num], am[num] and n[num]
2008	# (see bn_exp.c). The stack is allocated to aligned with
2009	# bn_power5's frame, and as bn_from_montgomery happens to be
2010	# last operation, we use the opportunity to cleanse it.
2011	#
2012	lea	-320(%rsp,$num,2),%r11
2013	sub	$rptr,%r11
2014	and	\$4095,%r11
2015	cmp	%r11,%r10
2016	jb	.Lfrom_sp_alt
2017	sub	%r11,%rsp		# align with $aptr
2018	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*$num*8+256)
2019	jmp	.Lfrom_sp_done
2020
2021.align	32
2022.Lfrom_sp_alt:
2023	lea	4096-320(,$num,2),%r10
2024	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*$num*8+256)
2025	sub	%r10,%r11
2026	mov	\$0,%r10
2027	cmovc	%r10,%r11
2028	sub	%r11,%rsp
2029.Lfrom_sp_done:
2030	and	\$-64,%rsp
2031	mov	$num,%r10
2032	neg	$num
2033
2034	##############################################################
2035	# Stack layout
2036	#
2037	# +0	saved $num, used in reduction section
2038	# +8	&t[2*$num], used in reduction section
2039	# +32	saved *n0
2040	# +40	saved %rsp
2041	# +48	t[2*$num]
2042	#
2043	mov	$n0,  32(%rsp)
2044	mov	%rax, 40(%rsp)		# save original %rsp
2045.Lfrom_body:
2046	mov	$num,%r11
2047	lea	48(%rsp),%rax
2048	pxor	%xmm0,%xmm0
2049	jmp	.Lmul_by_1
2050
2051.align	32
2052.Lmul_by_1:
2053	movdqu	($aptr),%xmm1
2054	movdqu	16($aptr),%xmm2
2055	movdqu	32($aptr),%xmm3
2056	movdqa	%xmm0,(%rax,$num)
2057	movdqu	48($aptr),%xmm4
2058	movdqa	%xmm0,16(%rax,$num)
2059	.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00	# lea	64($aptr),$aptr
2060	movdqa	%xmm1,(%rax)
2061	movdqa	%xmm0,32(%rax,$num)
2062	movdqa	%xmm2,16(%rax)
2063	movdqa	%xmm0,48(%rax,$num)
2064	movdqa	%xmm3,32(%rax)
2065	movdqa	%xmm4,48(%rax)
2066	lea	64(%rax),%rax
2067	sub	\$64,%r11
2068	jnz	.Lmul_by_1
2069
2070	movq	$rptr,%xmm1
2071	movq	$nptr,%xmm2
2072	.byte	0x67
2073	mov	$nptr,%rbp
2074	movq	%r10, %xmm3		# -num
2075___
# Only emitted when the assembler supports the AD*X instructions: pick
# the MULX/AD*X reduction and post-condition subroutines at run time if
# all capability bits in the mask 0x80108 are set in the word at
# OPENSSL_ia32cap_P+8, and fall through to the generic path otherwise.
2076$code.=<<___ if ($addx);
2077	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
2078	and	\$0x80108,%r11d
2079	cmp	\$0x80108,%r11d		# check for AD*X+BMI2+BMI1
2080	jne	.Lfrom_mont_nox
2081
2082	lea	(%rax,$num),$rptr
2083	call	__bn_sqrx8x_reduction
2084	call	__bn_postx4x_internal
2085
2086	pxor	%xmm0,%xmm0
2087	lea	48(%rsp),%rax
2088	mov	40(%rsp),%rsi		# restore %rsp
2089	jmp	.Lfrom_mont_zero
2090
2091.align	32
2092.Lfrom_mont_nox:
2093___
# Generic path, followed by the shared frame cleansing and epilogue.
2094$code.=<<___;
2095	call	__bn_sqr8x_reduction
2096	call	__bn_post4x_internal
2097
2098	pxor	%xmm0,%xmm0
2099	lea	48(%rsp),%rax
2100	mov	40(%rsp),%rsi		# restore %rsp
2101	jmp	.Lfrom_mont_zero
2102
2103.align	32
2104.Lfrom_mont_zero:
2105	movdqa	%xmm0,16*0(%rax)
2106	movdqa	%xmm0,16*1(%rax)
2107	movdqa	%xmm0,16*2(%rax)
2108	movdqa	%xmm0,16*3(%rax)
2109	lea	16*4(%rax),%rax
2110	sub	\$32,$num
2111	jnz	.Lfrom_mont_zero
2112
2113	mov	\$1,%rax
2114	mov	-48(%rsi),%r15
2115	mov	-40(%rsi),%r14
2116	mov	-32(%rsi),%r13
2117	mov	-24(%rsi),%r12
2118	mov	-16(%rsi),%rbp
2119	mov	-8(%rsi),%rbx
2120	lea	(%rsi),%rsp
2121.Lfrom_epilogue:
2122	ret
2123.size	bn_from_mont8x,.-bn_from_mont8x
2124___
2125}
2126}}}
2127
2128if ($addx) {{{
2129my $bp="%rdx";	# restore original value
2130
# Emits bn_mulx4x_mont_gather5 and the first instructions of
# mulx4x_internal.
#
# bn_mulx4x_mont_gather5 is a thin frame around mulx4x_internal: it
# saves the callee-saved registers, converts $num to bytes, loads *n0,
# allocates a 64-byte aligned frame (see the comment inside the
# heredoc), stores n0 and the original %rsp at 32(%rsp) and 40(%rsp),
# calls mulx4x_internal and finally restores the registers and returns
# 1 in %rax.
#
# mulx4x_internal addresses the same frame with an extra +8, presumably
# for its own return address.  Its prologue below saves -$num, computes
# the end of the powers table ($bp + 32*$num bytes), turns $num into
# the inner loop counter and loads the 7th argument (the table index)
# into %xmm5 through %rax, which still holds the caller's original
# %rsp.
2131$code.=<<___;
2132.type	bn_mulx4x_mont_gather5,\@function,6
2133.align	32
2134bn_mulx4x_mont_gather5:
2135.Lmulx4x_enter:
2136	mov	%rsp,%rax
2137	push	%rbx
2138	push	%rbp
2139	push	%r12
2140	push	%r13
2141	push	%r14
2142	push	%r15
2143
2144	shl	\$3,${num}d		# convert $num to bytes
2145	lea	($num,$num,2),%r10	# 3*$num in bytes
2146	neg	$num			# -$num
2147	mov	($n0),$n0		# *n0
2148
2149	##############################################################
2150	# Ensure that stack frame doesn't alias with $rptr+3*$num
2151	# modulo 4096, which covers ret[num], am[num] and n[num]
2152	# (see bn_exp.c). This is done to allow memory disambiguation
2153	# logic do its magic. [Extra [num] is allocated in order
2154	# to align with bn_power5's frame, which is cleansed after
2155	# completing exponentiation. Extra 256 bytes is for power mask
2156	# calculated from 7th argument, the index.]
2157	#
2158	lea	-320(%rsp,$num,2),%r11
2159	sub	$rp,%r11
2160	and	\$4095,%r11
2161	cmp	%r11,%r10
2162	jb	.Lmulx4xsp_alt
2163	sub	%r11,%rsp		# align with $aptr
2164	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*$num*8+256)
2165	jmp	.Lmulx4xsp_done
2166
2167.Lmulx4xsp_alt:
2168	lea	4096-320(,$num,2),%r10
2169	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*$num*8+256)
2170	sub	%r10,%r11
2171	mov	\$0,%r10
2172	cmovc	%r10,%r11
2173	sub	%r11,%rsp
2174.Lmulx4xsp_done:
2175	and	\$-64,%rsp		# ensure alignment
2176	##############################################################
2177	# Stack layout
2178	# +0	-num
2179	# +8	off-loaded &b[i]
2180	# +16	end of b[num]
2181	# +24	inner counter
2182	# +32	saved n0
2183	# +40	saved %rsp
2184	# +48
2185	# +56	saved rp
2186	# +64	tmp[num+1]
2187	#
2188	mov	$n0, 32(%rsp)		# save *n0
2189	mov	%rax,40(%rsp)		# save original %rsp
2190.Lmulx4x_body:
2191	call	mulx4x_internal
2192
2193	mov	40(%rsp),%rsi		# restore %rsp
2194	mov	\$1,%rax
2195
2196	mov	-48(%rsi),%r15
2197	mov	-40(%rsi),%r14
2198	mov	-32(%rsi),%r13
2199	mov	-24(%rsi),%r12
2200	mov	-16(%rsi),%rbp
2201	mov	-8(%rsi),%rbx
2202	lea	(%rsi),%rsp
2203.Lmulx4x_epilogue:
2204	ret
2205.size	bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
2206
2207.type	mulx4x_internal,\@abi-omnipotent
2208.align	32
2209mulx4x_internal:
2210	mov	$num,8(%rsp)		# save -$num (it was in bytes)
2211	mov	$num,%r10
2212	neg	$num			# restore $num
2213	shl	\$5,$num
2214	neg	%r10			# restore $num
2215	lea	128($bp,$num),%r13	# end of powers table (+size optimization)
2216	shr	\$5+5,$num
2217	movd	`($win64?56:8)`(%rax),%xmm5	# load 7th argument
2218	sub	\$1,$num
2219	lea	.Linc(%rip),%rax
2220	mov	%r13,16+8(%rsp)		# end of b[num]
2221	mov	$num,24+8(%rsp)		# inner counter
2222	mov	$rp, 56+8(%rsp)		# save $rp
2223___
2224my ($aptr, $bptr, $nptr, $tptr, $mi,  $bi,  $zero, $num)=
2225   ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
2226my $rptr=$bptr;
2227my $STRIDE=2**5*8;		# 5 is "window size"
2228my $N=$STRIDE/4;		# should match cache line size
# From here on the names above are rebound for the body of
# mulx4x_internal: $num now stands for %rax (it holds -num in bytes
# once reloaded from the stack), $bptr walks the powers table and
# doubles as the inner loop counter, and $zero is a register kept at
# zero for the adcx/adox chains.
#
# The remaining heredocs emit, in order:
#   1. the 256-byte selection mask, built on the stack by comparing
#      running counters against the broadcast index in %xmm5;
#   2. the gather of bp[0], AND-ing every table entry with its mask and
#      OR-ing the results together, so all entries are read regardless
#      of the index;
#   3. .Lmulx4x_1st, the first pass a[]*bp[0] interleaved with the
#      reduction by n[];
#   4. .Lmulx4x_outer/.Lmulx4x_inner, the same for every further bp[i],
#      with the gather repeated from the saved mask;
#   5. preparation of the registers and a jump into the shared
#      post-condition code at .Lsqrx4x_sub_entry.
2229$code.=<<___;
2230	movdqa	0(%rax),%xmm0		# 00000001000000010000000000000000
2231	movdqa	16(%rax),%xmm1		# 00000002000000020000000200000002
2232	lea	88-112(%rsp,%r10),%r10	# place the mask after tp[num+1] (+ICache optimizaton)
2233	lea	128($bp),$bptr		# size optimization
2234
2235	pshufd	\$0,%xmm5,%xmm5		# broadcast index
2236	movdqa	%xmm1,%xmm4
2237	.byte	0x67
2238	movdqa	%xmm1,%xmm2
2239___
2240########################################################################
2241# calculate mask by comparing 0..31 to index and save result to stack
2242#
2243$code.=<<___;
2244	.byte	0x67
2245	paddd	%xmm0,%xmm1
2246	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
2247	movdqa	%xmm4,%xmm3
2248___
# Unrolled by four: every pass stores four 16-byte compare results at
# 112(%r10)+16*$i and advances the running counters.  The last group
# of four is emitted separately after the loop.
2249for($i=0;$i<$STRIDE/16-4;$i+=4) {
2250$code.=<<___;
2251	paddd	%xmm1,%xmm2
2252	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
2253	movdqa	%xmm0,`16*($i+0)+112`(%r10)
2254	movdqa	%xmm4,%xmm0
2255
2256	paddd	%xmm2,%xmm3
2257	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
2258	movdqa	%xmm1,`16*($i+1)+112`(%r10)
2259	movdqa	%xmm4,%xmm1
2260
2261	paddd	%xmm3,%xmm0
2262	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
2263	movdqa	%xmm2,`16*($i+2)+112`(%r10)
2264	movdqa	%xmm4,%xmm2
2265
2266	paddd	%xmm0,%xmm1
2267	pcmpeqd	%xmm5,%xmm0
2268	movdqa	%xmm3,`16*($i+3)+112`(%r10)
2269	movdqa	%xmm4,%xmm3
2270___
2271}
# $i keeps its final loop value here ($STRIDE/16-4, i.e. 12), so the
# offsets below address the last four mask slots and the last four
# table entries, which are gathered while their masks are still in
# registers.
2272$code.=<<___;				# last iteration can be optimized
2273	.byte	0x67
2274	paddd	%xmm1,%xmm2
2275	pcmpeqd	%xmm5,%xmm1
2276	movdqa	%xmm0,`16*($i+0)+112`(%r10)
2277
2278	paddd	%xmm2,%xmm3
2279	pcmpeqd	%xmm5,%xmm2
2280	movdqa	%xmm1,`16*($i+1)+112`(%r10)
2281
2282	pcmpeqd	%xmm5,%xmm3
2283	movdqa	%xmm2,`16*($i+2)+112`(%r10)
2284
2285	pand	`16*($i+0)-128`($bptr),%xmm0	# while it's still in register
2286	pand	`16*($i+1)-128`($bptr),%xmm1
2287	pand	`16*($i+2)-128`($bptr),%xmm2
2288	movdqa	%xmm3,`16*($i+3)+112`(%r10)
2289	pand	`16*($i+3)-128`($bptr),%xmm3
2290	por	%xmm2,%xmm0
2291	por	%xmm3,%xmm1
2292___
# Gather the remaining table entries (all but the last four): each one
# is AND-ed with its saved mask and accumulated into %xmm0/%xmm1.
2293for($i=0;$i<$STRIDE/16-4;$i+=4) {
2294$code.=<<___;
2295	movdqa	`16*($i+0)-128`($bptr),%xmm4
2296	movdqa	`16*($i+1)-128`($bptr),%xmm5
2297	movdqa	`16*($i+2)-128`($bptr),%xmm2
2298	pand	`16*($i+0)+112`(%r10),%xmm4
2299	movdqa	`16*($i+3)-128`($bptr),%xmm3
2300	pand	`16*($i+1)+112`(%r10),%xmm5
2301	por	%xmm4,%xmm0
2302	pand	`16*($i+2)+112`(%r10),%xmm2
2303	por	%xmm5,%xmm1
2304	pand	`16*($i+3)+112`(%r10),%xmm3
2305	por	%xmm2,%xmm0
2306	por	%xmm3,%xmm1
2307___
2308}
# Fold the two accumulators into bp[0], then run the first
# multiplication pass (.Lmulx4x_1st) and set up .Lmulx4x_outer.
2309$code.=<<___;
2310	pxor	%xmm1,%xmm0
2311	pshufd	\$0x4e,%xmm0,%xmm1
2312	por	%xmm1,%xmm0
2313	lea	$STRIDE($bptr),$bptr
2314	movq	%xmm0,%rdx		# bp[0]
2315	lea	64+8*4+8(%rsp),$tptr
2316
2317	mov	%rdx,$bi
2318	mulx	0*8($aptr),$mi,%rax	# a[0]*b[0]
2319	mulx	1*8($aptr),%r11,%r12	# a[1]*b[0]
2320	add	%rax,%r11
2321	mulx	2*8($aptr),%rax,%r13	# ...
2322	adc	%rax,%r12
2323	adc	\$0,%r13
2324	mulx	3*8($aptr),%rax,%r14
2325
2326	mov	$mi,%r15
2327	imulq	32+8(%rsp),$mi		# "t[0]"*n0
2328	xor	$zero,$zero		# cf=0, of=0
2329	mov	$mi,%rdx
2330
2331	mov	$bptr,8+8(%rsp)		# off-load &b[i]
2332
2333	lea	4*8($aptr),$aptr
2334	adcx	%rax,%r13
2335	adcx	$zero,%r14		# cf=0
2336
2337	mulx	0*8($nptr),%rax,%r10
2338	adcx	%rax,%r15		# discarded
2339	adox	%r11,%r10
2340	mulx	1*8($nptr),%rax,%r11
2341	adcx	%rax,%r10
2342	adox	%r12,%r11
2343	mulx	2*8($nptr),%rax,%r12
2344	mov	24+8(%rsp),$bptr	# counter value
2345	mov	%r10,-8*4($tptr)
2346	adcx	%rax,%r11
2347	adox	%r13,%r12
2348	mulx	3*8($nptr),%rax,%r15
2349	 mov	$bi,%rdx
2350	mov	%r11,-8*3($tptr)
2351	adcx	%rax,%r12
2352	adox	$zero,%r15		# of=0
2353	lea	4*8($nptr),$nptr
2354	mov	%r12,-8*2($tptr)
2355	jmp	.Lmulx4x_1st
2356
2357.align	32
2358.Lmulx4x_1st:
2359	adcx	$zero,%r15		# cf=0, modulo-scheduled
2360	mulx	0*8($aptr),%r10,%rax	# a[4]*b[0]
2361	adcx	%r14,%r10
2362	mulx	1*8($aptr),%r11,%r14	# a[5]*b[0]
2363	adcx	%rax,%r11
2364	mulx	2*8($aptr),%r12,%rax	# ...
2365	adcx	%r14,%r12
2366	mulx	3*8($aptr),%r13,%r14
2367	 .byte	0x67,0x67
2368	 mov	$mi,%rdx
2369	adcx	%rax,%r13
2370	adcx	$zero,%r14		# cf=0
2371	lea	4*8($aptr),$aptr
2372	lea	4*8($tptr),$tptr
2373
2374	adox	%r15,%r10
2375	mulx	0*8($nptr),%rax,%r15
2376	adcx	%rax,%r10
2377	adox	%r15,%r11
2378	mulx	1*8($nptr),%rax,%r15
2379	adcx	%rax,%r11
2380	adox	%r15,%r12
2381	mulx	2*8($nptr),%rax,%r15
2382	mov	%r10,-5*8($tptr)
2383	adcx	%rax,%r12
2384	mov	%r11,-4*8($tptr)
2385	adox	%r15,%r13
2386	mulx	3*8($nptr),%rax,%r15
2387	 mov	$bi,%rdx
2388	mov	%r12,-3*8($tptr)
2389	adcx	%rax,%r13
2390	adox	$zero,%r15
2391	lea	4*8($nptr),$nptr
2392	mov	%r13,-2*8($tptr)
2393
2394	dec	$bptr			# of=0, pass cf
2395	jnz	.Lmulx4x_1st
2396
2397	mov	8(%rsp),$num		# load -num
2398	adc	$zero,%r15		# modulo-scheduled
2399	lea	($aptr,$num),$aptr	# rewind $aptr
2400	add	%r15,%r14
2401	mov	8+8(%rsp),$bptr		# re-load &b[i]
2402	adc	$zero,$zero		# top-most carry
2403	mov	%r14,-1*8($tptr)
2404	jmp	.Lmulx4x_outer
2405
2406.align	32
2407.Lmulx4x_outer:
2408	lea	16-256($tptr),%r10	# where 256-byte mask is (+density control)
2409	pxor	%xmm4,%xmm4
2410	.byte	0x67,0x67
2411	pxor	%xmm5,%xmm5
2412___
# Gather bp[i] for the outer loop: all $STRIDE/16 table entries are
# AND-ed with the mask saved on the stack and accumulated into
# %xmm4/%xmm5.
2413for($i=0;$i<$STRIDE/16;$i+=4) {
2414$code.=<<___;
2415	movdqa	`16*($i+0)-128`($bptr),%xmm0
2416	movdqa	`16*($i+1)-128`($bptr),%xmm1
2417	movdqa	`16*($i+2)-128`($bptr),%xmm2
2418	pand	`16*($i+0)+256`(%r10),%xmm0
2419	movdqa	`16*($i+3)-128`($bptr),%xmm3
2420	pand	`16*($i+1)+256`(%r10),%xmm1
2421	por	%xmm0,%xmm4
2422	pand	`16*($i+2)+256`(%r10),%xmm2
2423	por	%xmm1,%xmm5
2424	pand	`16*($i+3)+256`(%r10),%xmm3
2425	por	%xmm2,%xmm4
2426	por	%xmm3,%xmm5
2427___
2428}
# Rest of the outer loop, the inner loop, and the hand-over to the
# shared post-condition code (.Lsqrx4x_sub_entry, defined elsewhere in
# this file).
2429$code.=<<___;
2430	por	%xmm5,%xmm4
2431	pshufd	\$0x4e,%xmm4,%xmm0
2432	por	%xmm4,%xmm0
2433	lea	$STRIDE($bptr),$bptr
2434	movq	%xmm0,%rdx		# m0=bp[i]
2435
2436	mov	$zero,($tptr)		# save top-most carry
2437	lea	4*8($tptr,$num),$tptr	# rewind $tptr
2438	mulx	0*8($aptr),$mi,%r11	# a[0]*b[i]
2439	xor	$zero,$zero		# cf=0, of=0
2440	mov	%rdx,$bi
2441	mulx	1*8($aptr),%r14,%r12	# a[1]*b[i]
2442	adox	-4*8($tptr),$mi		# +t[0]
2443	adcx	%r14,%r11
2444	mulx	2*8($aptr),%r15,%r13	# ...
2445	adox	-3*8($tptr),%r11
2446	adcx	%r15,%r12
2447	mulx	3*8($aptr),%rdx,%r14
2448	adox	-2*8($tptr),%r12
2449	adcx	%rdx,%r13
2450	lea	($nptr,$num),$nptr	# rewind $nptr
2451	lea	4*8($aptr),$aptr
2452	adox	-1*8($tptr),%r13
2453	adcx	$zero,%r14
2454	adox	$zero,%r14
2455
2456	mov	$mi,%r15
2457	imulq	32+8(%rsp),$mi		# "t[0]"*n0
2458
2459	mov	$mi,%rdx
2460	xor	$zero,$zero		# cf=0, of=0
2461	mov	$bptr,8+8(%rsp)		# off-load &b[i]
2462
2463	mulx	0*8($nptr),%rax,%r10
2464	adcx	%rax,%r15		# discarded
2465	adox	%r11,%r10
2466	mulx	1*8($nptr),%rax,%r11
2467	adcx	%rax,%r10
2468	adox	%r12,%r11
2469	mulx	2*8($nptr),%rax,%r12
2470	adcx	%rax,%r11
2471	adox	%r13,%r12
2472	mulx	3*8($nptr),%rax,%r15
2473	 mov	$bi,%rdx
2474	mov	24+8(%rsp),$bptr	# counter value
2475	mov	%r10,-8*4($tptr)
2476	adcx	%rax,%r12
2477	mov	%r11,-8*3($tptr)
2478	adox	$zero,%r15		# of=0
2479	mov	%r12,-8*2($tptr)
2480	lea	4*8($nptr),$nptr
2481	jmp	.Lmulx4x_inner
2482
2483.align	32
2484.Lmulx4x_inner:
2485	mulx	0*8($aptr),%r10,%rax	# a[4]*b[i]
2486	adcx	$zero,%r15		# cf=0, modulo-scheduled
2487	adox	%r14,%r10
2488	mulx	1*8($aptr),%r11,%r14	# a[5]*b[i]
2489	adcx	0*8($tptr),%r10
2490	adox	%rax,%r11
2491	mulx	2*8($aptr),%r12,%rax	# ...
2492	adcx	1*8($tptr),%r11
2493	adox	%r14,%r12
2494	mulx	3*8($aptr),%r13,%r14
2495	 mov	$mi,%rdx
2496	adcx	2*8($tptr),%r12
2497	adox	%rax,%r13
2498	adcx	3*8($tptr),%r13
2499	adox	$zero,%r14		# of=0
2500	lea	4*8($aptr),$aptr
2501	lea	4*8($tptr),$tptr
2502	adcx	$zero,%r14		# cf=0
2503
2504	adox	%r15,%r10
2505	mulx	0*8($nptr),%rax,%r15
2506	adcx	%rax,%r10
2507	adox	%r15,%r11
2508	mulx	1*8($nptr),%rax,%r15
2509	adcx	%rax,%r11
2510	adox	%r15,%r12
2511	mulx	2*8($nptr),%rax,%r15
2512	mov	%r10,-5*8($tptr)
2513	adcx	%rax,%r12
2514	adox	%r15,%r13
2515	mov	%r11,-4*8($tptr)
2516	mulx	3*8($nptr),%rax,%r15
2517	 mov	$bi,%rdx
2518	lea	4*8($nptr),$nptr
2519	mov	%r12,-3*8($tptr)
2520	adcx	%rax,%r13
2521	adox	$zero,%r15
2522	mov	%r13,-2*8($tptr)
2523
2524	dec	$bptr			# of=0, pass cf
2525	jnz	.Lmulx4x_inner
2526
2527	mov	0+8(%rsp),$num		# load -num
2528	adc	$zero,%r15		# modulo-scheduled
2529	sub	0*8($tptr),$bptr	# pull top-most carry to %cf
2530	mov	8+8(%rsp),$bptr		# re-load &b[i]
2531	mov	16+8(%rsp),%r10
2532	adc	%r15,%r14
2533	lea	($aptr,$num),$aptr	# rewind $aptr
2534	adc	$zero,$zero		# top-most carry
2535	mov	%r14,-1*8($tptr)
2536
2537	cmp	%r10,$bptr
2538	jb	.Lmulx4x_outer
2539
2540	mov	-8($nptr),%r10
2541	mov	$zero,%r8
2542	mov	($nptr,$num),%r12
2543	lea	($nptr,$num),%rbp	# rewind $nptr
2544	mov	$num,%rcx
2545	lea	($tptr,$num),%rdi	# rewind $tptr
2546	xor	%eax,%eax
2547	xor	%r15,%r15
2548	sub	%r14,%r10		# compare top-most words
2549	adc	%r15,%r15
2550	or	%r15,%r8
2551	sar	\$3+2,%rcx
2552	sub	%r8,%rax		# %rax=-%r8
2553	mov	56+8(%rsp),%rdx		# restore rp
2554	dec	%r12			# so that after 'not' we get -n[0]
2555	mov	8*1(%rbp),%r13
2556	xor	%r8,%r8
2557	mov	8*2(%rbp),%r14
2558	mov	8*3(%rbp),%r15
2559	jmp	.Lsqrx4x_sub_entry	# common post-condition
2560.size	mulx4x_internal,.-mulx4x_internal
2561___
2562}{
2563######################################################################
# Register assignment for bn_powerx5, the MULX/ADCX/ADOX code path emitted
# below.  The first six arguments are bound to the System V argument
# registers; the seventh (pwr) is not given a register here.
2564# void bn_powerx5(
2565my $rptr="%rdi";	# BN_ULONG *rptr,
2566my $aptr="%rsi";	# const BN_ULONG *aptr,
2567my $bptr="%rdx";	# const void *table,
2568my $nptr="%rcx";	# const BN_ULONG *nptr,
2569my $n0  ="%r8";		# const BN_ULONG *n0,
2570my $num ="%r9";		# int num, has to be divisible by 8
2571			# int pwr);
2572
# Scratch names shared by the sub-sections that follow.  $tptr aliases
# $rptr (%rdi).  @A0/@A1 are the register pairs used by the shift-and-add
# pass; ($a0,$a1,$ai) are not referenced in the code visible in this section.
2573my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
2574my @A0=("%r10","%r11");
2575my @A1=("%r12","%r13");
2576my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
2577
2578$code.=<<___;
# bn_powerx5: five back-to-back Montgomery squarings (each followed by the
# conditional-subtraction post-condition) and then one multiplication by an
# entry gathered from the powers table via mulx4x_internal.  Returns 1 in %rax.
2579.type	bn_powerx5,\@function,6
2580.align	32
2581bn_powerx5:
2582.Lpowerx5_enter:
	# Prologue: keep the caller's stack pointer in %rax and save the six
	# callee-saved registers; the epilogue reloads them at negative
	# offsets from that saved pointer.
2583	mov	%rsp,%rax
2584	push	%rbx
2585	push	%rbp
2586	push	%r12
2587	push	%r13
2588	push	%r14
2589	push	%r15
2590
2591	shl	\$3,${num}d		# convert $num to bytes
2592	lea	($num,$num,2),%r10	# 3*$num in bytes
2593	neg	$num			# negated byte count, used as index below
2594	mov	($n0),$n0		# *n0
2595
2596	##############################################################
2597	# Ensure that stack frame doesn't alias with $rptr+3*$num
2598	# modulo 4096, which covers ret[num], am[num] and n[num]
2599	# (see bn_exp.c). This is done to allow memory disambiguation
2600	# logic do its magic. [Extra 256 bytes is for power mask
2601	# calculated from 7th argument, the index.]
2602	#
2603	lea	-320(%rsp,$num,2),%r11
2604	sub	$rptr,%r11
2605	and	\$4095,%r11		# distance to $rptr modulo page size
2606	cmp	%r11,%r10
2607	jb	.Lpwrx_sp_alt
2608	sub	%r11,%rsp		# align with $aptr
2609	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*$num*8+256)
2610	jmp	.Lpwrx_sp_done
2611
2612.align	32
2613.Lpwrx_sp_alt:
2614	lea	4096-320(,$num,2),%r10
2615	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*$num*8+256)
2616	sub	%r10,%r11
2617	mov	\$0,%r10
2618	cmovc	%r10,%r11		# clamp adjustment to 0 if the subtraction borrowed
2619	sub	%r11,%rsp
2620.Lpwrx_sp_done:
2621	and	\$-64,%rsp		# round frame down to a 64-byte boundary
2622	mov	$num,%r10		# %r10 keeps the negated byte count
2623	neg	$num			# $num is positive (bytes) again
2624
2625	##############################################################
2626	# Stack layout
2627	#
2628	# +0	saved $num, used in reduction section
2629	# +8	&t[2*$num], used in reduction section
2630	# +16	intermediate carry bit
2631	# +24	top-most carry bit, used in reduction section
2632	# +32	saved *n0
2633	# +40	saved %rsp
2634	# +48	t[2*$num]
2635	#
	# The called subroutines address these slots with an extra +8,
	# which accounts for the return address pushed by each call.
2636	pxor	%xmm0,%xmm0		# zero, used by the squaring code to clear t[]
2637	movq	$rptr,%xmm1		# save $rptr
2638	movq	$nptr,%xmm2		# save $nptr
2639	movq	%r10, %xmm3		# -$num
2640	movq	$bptr,%xmm4		# save table pointer for the final multiplication
2641	mov	$n0,  32(%rsp)
2642	mov	%rax, 40(%rsp)		# save original %rsp
2643.Lpowerx5_body:
2644
	# Five squarings, each one squaring-plus-reduction followed by the
	# post-condition that writes the result back through the saved rptr.
2645	call	__bn_sqrx8x_internal
2646	call	__bn_postx4x_internal
2647	call	__bn_sqrx8x_internal
2648	call	__bn_postx4x_internal
2649	call	__bn_sqrx8x_internal
2650	call	__bn_postx4x_internal
2651	call	__bn_sqrx8x_internal
2652	call	__bn_postx4x_internal
2653	call	__bn_sqrx8x_internal
2654	call	__bn_postx4x_internal
2655
	# Set up registers for the multiplication by the table entry.
2656	mov	%r10,$num		# -num
2657	mov	$aptr,$rptr
2658	movq	%xmm2,$nptr
2659	movq	%xmm4,$bptr
2660	mov	40(%rsp),%rax
2661
2662	call	mulx4x_internal
2663
2664	mov	40(%rsp),%rsi		# restore %rsp
2665	mov	\$1,%rax		# return value
2666
	# Reload callee-saved registers from below the original stack pointer.
2667	mov	-48(%rsi),%r15
2668	mov	-40(%rsi),%r14
2669	mov	-32(%rsi),%r13
2670	mov	-24(%rsi),%r12
2671	mov	-16(%rsi),%rbp
2672	mov	-8(%rsi),%rbx
2673	lea	(%rsi),%rsp
2674.Lpowerx5_epilogue:
2675	ret
2676.size	bn_powerx5,.-bn_powerx5
2677
# bn_sqrx8x_internal: squaring of the operand addressed by %rsi using
# MULX/ADCX/ADOX, followed (further below) by Montgomery reduction.  It works
# on the stack frame prepared by the caller; frame offsets are written as
# N+8 because of the return address pushed by the call.
2678.globl	bn_sqrx8x_internal
2679.hidden	bn_sqrx8x_internal
2680.type	bn_sqrx8x_internal,\@abi-omnipotent
2681.align	32
2682bn_sqrx8x_internal:
2683__bn_sqrx8x_internal:
2684	##################################################################
2685	# Squaring part:
2686	#
2687	# a) multiply-n-add everything but a[i]*a[i];
2688	# b) shift result of a) by 1 to the left and accumulate
2689	#    a[i]*a[i] products;
2690	#
2691	##################################################################
2692	# a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
2693	#                                                     a[1]a[0]
2694	#                                                 a[2]a[0]
2695	#                                             a[3]a[0]
2696	#                                             a[2]a[1]
2697	#                                         a[3]a[1]
2698	#                                     a[3]a[2]
2699	#
2700	#                                         a[4]a[0]
2701	#                                     a[5]a[0]
2702	#                                 a[6]a[0]
2703	#                             a[7]a[0]
2704	#                                     a[4]a[1]
2705	#                                 a[5]a[1]
2706	#                             a[6]a[1]
2707	#                         a[7]a[1]
2708	#                                 a[4]a[2]
2709	#                             a[5]a[2]
2710	#                         a[6]a[2]
2711	#                     a[7]a[2]
2712	#                             a[4]a[3]
2713	#                         a[5]a[3]
2714	#                     a[6]a[3]
2715	#                 a[7]a[3]
2716	#
2717	#                     a[5]a[4]
2718	#                 a[6]a[4]
2719	#             a[7]a[4]
2720	#             a[6]a[5]
2721	#         a[7]a[5]
2722	#     a[7]a[6]
2723	# a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
2724___
2725{
# $zero and $aaptr deliberately share %rbp: the register holds the secondary
# operand pointer while it is needed and is cleared when used as zero.
2726my ($zero,$carry)=("%rbp","%rcx");
2727my $aaptr=$zero;
2728$code.=<<___;
2729	lea	48+8(%rsp),$tptr	# start of t[] in the frame
2730	lea	($aptr,$num),$aaptr	# one past the last word of a[]
2731	mov	$num,0+8(%rsp)			# save $num
2732	mov	$aaptr,8+8(%rsp)		# save end of $aptr
2733	jmp	.Lsqr8x_zero_start
2734
	# Clear t[] with the zero in %xmm0 (the caller is expected to have
	# cleared it).  The loop is entered at its midpoint, so the first 64
	# bytes of t[] are not written; after that 128 bytes are cleared
	# for every 64 subtracted from the byte count.
2735.align	32
2736.byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
2737.Lsqrx8x_zero:
2738	.byte	0x3e
2739	movdqa	%xmm0,0*8($tptr)
2740	movdqa	%xmm0,2*8($tptr)
2741	movdqa	%xmm0,4*8($tptr)
2742	movdqa	%xmm0,6*8($tptr)
2743.Lsqr8x_zero_start:			# aligned at 32
2744	movdqa	%xmm0,8*8($tptr)
2745	movdqa	%xmm0,10*8($tptr)
2746	movdqa	%xmm0,12*8($tptr)
2747	movdqa	%xmm0,14*8($tptr)
2748	lea	16*8($tptr),$tptr
2749	sub	\$64,$num
2750	jnz	.Lsqrx8x_zero
2751
	# Prime the accumulators for the first pass of the outer loop.
2752	mov	0*8($aptr),%rdx		# a[0], modulo-scheduled
2753	#xor	%r9,%r9			# t[1], ex-$num, zero already
2754	xor	%r10,%r10
2755	xor	%r11,%r11
2756	xor	%r12,%r12
2757	xor	%r13,%r13
2758	xor	%r14,%r14
2759	xor	%r15,%r15
2760	lea	48+8(%rsp),$tptr	# rewind to start of t[]
2761	xor	$zero,$zero		# cf=0, of=0
2762	jmp	.Lsqrx8x_outer_loop
2763
	# Outer loop: cross products a[i]*a[j], i<j, within the current group
	# of eight words at %rsi (the triangles in the diagram above).  Two
	# independent carry chains run in parallel: adcx uses CF, adox uses OF.
	# %rdx always holds the current multiplier word.
2764.align	32
2765.Lsqrx8x_outer_loop:
2766	mulx	1*8($aptr),%r8,%rax	# a[1]*a[0]
2767	adcx	%r9,%r8			# a[1]*a[0]+=t[1]
2768	adox	%rax,%r10
2769	mulx	2*8($aptr),%r9,%rax	# a[2]*a[0]
2770	adcx	%r10,%r9
2771	adox	%rax,%r11
2772	.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00	# mulx	3*8($aptr),%r10,%rax	# ...
2773	adcx	%r11,%r10
2774	adox	%rax,%r12
2775	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00	# mulx	4*8($aptr),%r11,%rax
2776	adcx	%r12,%r11
2777	adox	%rax,%r13
2778	mulx	5*8($aptr),%r12,%rax
2779	adcx	%r13,%r12
2780	adox	%rax,%r14
2781	mulx	6*8($aptr),%r13,%rax
2782	adcx	%r14,%r13
2783	adox	%r15,%rax
2784	mulx	7*8($aptr),%r14,%r15
2785	 mov	1*8($aptr),%rdx		# a[1]
2786	adcx	%rax,%r14
2787	adox	$zero,%r15
2788	adc	8*8($tptr),%r15
2789	mov	%r8,1*8($tptr)		# t[1]
2790	mov	%r9,2*8($tptr)		# t[2]
2791	sbb	$carry,$carry		# mov %cf,$carry
2792	xor	$zero,$zero		# cf=0, of=0
2793
2794
	# Row for multiplier a[1].
2795	mulx	2*8($aptr),%r8,%rbx	# a[2]*a[1]
2796	mulx	3*8($aptr),%r9,%rax	# a[3]*a[1]
2797	adcx	%r10,%r8
2798	adox	%rbx,%r9
2799	mulx	4*8($aptr),%r10,%rbx	# ...
2800	adcx	%r11,%r9
2801	adox	%rax,%r10
2802	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00	# mulx	5*8($aptr),%r11,%rax
2803	adcx	%r12,%r10
2804	adox	%rbx,%r11
2805	.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx	6*8($aptr),%r12,%rbx
2806	adcx	%r13,%r11
2807	adox	%r14,%r12
2808	.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00	# mulx	7*8($aptr),%r13,%r14
2809	 mov	2*8($aptr),%rdx		# a[2]
2810	adcx	%rax,%r12
2811	adox	%rbx,%r13
2812	adcx	%r15,%r13
2813	adox	$zero,%r14		# of=0
2814	adcx	$zero,%r14		# cf=0
2815
2816	mov	%r8,3*8($tptr)		# t[3]
2817	mov	%r9,4*8($tptr)		# t[4]
2818
	# Row for multiplier a[2].
2819	mulx	3*8($aptr),%r8,%rbx	# a[3]*a[2]
2820	mulx	4*8($aptr),%r9,%rax	# a[4]*a[2]
2821	adcx	%r10,%r8
2822	adox	%rbx,%r9
2823	mulx	5*8($aptr),%r10,%rbx	# ...
2824	adcx	%r11,%r9
2825	adox	%rax,%r10
2826	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00	# mulx	6*8($aptr),%r11,%rax
2827	adcx	%r12,%r10
2828	adox	%r13,%r11
2829	.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00	# mulx	7*8($aptr),%r12,%r13
2830	.byte	0x3e
2831	 mov	3*8($aptr),%rdx		# a[3]
2832	adcx	%rbx,%r11
2833	adox	%rax,%r12
2834	adcx	%r14,%r12
2835	mov	%r8,5*8($tptr)		# t[5]
2836	mov	%r9,6*8($tptr)		# t[6]
2837	 mulx	4*8($aptr),%r8,%rax	# a[4]*a[3]
2838	adox	$zero,%r13		# of=0
2839	adcx	$zero,%r13		# cf=0
2840
	# Row for multiplier a[3].
2841	mulx	5*8($aptr),%r9,%rbx	# a[5]*a[3]
2842	adcx	%r10,%r8
2843	adox	%rax,%r9
2844	mulx	6*8($aptr),%r10,%rax	# ...
2845	adcx	%r11,%r9
2846	adox	%r12,%r10
2847	mulx	7*8($aptr),%r11,%r12
2848	 mov	4*8($aptr),%rdx		# a[4]
2849	 mov	5*8($aptr),%r14		# a[5]
2850	adcx	%rbx,%r10
2851	adox	%rax,%r11
2852	 mov	6*8($aptr),%r15		# a[6]
2853	adcx	%r13,%r11
2854	adox	$zero,%r12		# of=0
2855	adcx	$zero,%r12		# cf=0
2856
2857	mov	%r8,7*8($tptr)		# t[7]
2858	mov	%r9,8*8($tptr)		# t[8]
2859
	# Rows for a[4], a[5] and a[6]; the remaining multiplicands are
	# held in registers (%r14=a[5], %r15=a[6], %r8=a[7]).
2860	mulx	%r14,%r9,%rax		# a[5]*a[4]
2861	 mov	7*8($aptr),%r8		# a[7]
2862	adcx	%r10,%r9
2863	mulx	%r15,%r10,%rbx		# a[6]*a[4]
2864	adox	%rax,%r10
2865	adcx	%r11,%r10
2866	mulx	%r8,%r11,%rax		# a[7]*a[4]
2867	 mov	%r14,%rdx		# a[5]
2868	adox	%rbx,%r11
2869	adcx	%r12,%r11
2870	#adox	$zero,%rax		# of=0
2871	adcx	$zero,%rax		# cf=0
2872
2873	mulx	%r15,%r14,%rbx		# a[6]*a[5]
2874	mulx	%r8,%r12,%r13		# a[7]*a[5]
2875	 mov	%r15,%rdx		# a[6]
2876	 lea	8*8($aptr),$aptr
2877	adcx	%r14,%r11
2878	adox	%rbx,%r12
2879	adcx	%rax,%r12
2880	adox	$zero,%r13
2881
2882	.byte	0x67,0x67
2883	mulx	%r8,%r8,%r14		# a[7]*a[6]
2884	adcx	%r8,%r13
2885	adcx	$zero,%r14
2886
	# Leave once the advanced pointer reaches the saved end of a[].
2887	cmp	8+8(%rsp),$aptr
2888	je	.Lsqrx8x_outer_break
2889
	# More words remain: fold in t[8..15], save the carry and the
	# advanced t[] pointer in the frame, and multiply the group just
	# processed by the higher groups in the inner loop.
2890	neg	$carry			# mov $carry,%cf
2891	mov	\$-8,%rcx
2892	mov	$zero,%r15
2893	mov	8*8($tptr),%r8
2894	adcx	9*8($tptr),%r9		# +=t[9]
2895	adcx	10*8($tptr),%r10	# ...
2896	adcx	11*8($tptr),%r11
2897	adc	12*8($tptr),%r12
2898	adc	13*8($tptr),%r13
2899	adc	14*8($tptr),%r14
2900	adc	15*8($tptr),%r15
2901	lea	($aptr),$aaptr
2902	lea	2*64($tptr),$tptr
2903	sbb	%rax,%rax		# mov %cf,$carry
2904
2905	mov	-64($aptr),%rdx		# a[0]
2906	mov	%rax,16+8(%rsp)		# offload $carry
2907	mov	$tptr,24+8(%rsp)
2908
2909	#lea	8*8($tptr),$tptr	# see 2*8*8($tptr) above
2910	xor	%eax,%eax		# cf=0, of=0
2911	jmp	.Lsqrx8x_loop
2912
	# Inner loop: multiply one word of the previously processed group
	# (in %rdx) by the eight words at %rbp and accumulate into
	# %r8..%r15.  %rcx counts from -8 up to 0; each pass retires the
	# lowest accumulator word to t[] and fetches the next multiplier.
2913.align	32
2914.Lsqrx8x_loop:
2915	mov	%r8,%rbx
2916	mulx	0*8($aaptr),%rax,%r8	# a[8]*a[i]
2917	adcx	%rax,%rbx		# +=t[8]
2918	adox	%r9,%r8
2919
2920	mulx	1*8($aaptr),%rax,%r9	# ...
2921	adcx	%rax,%r8
2922	adox	%r10,%r9
2923
2924	mulx	2*8($aaptr),%rax,%r10
2925	adcx	%rax,%r9
2926	adox	%r11,%r10
2927
2928	mulx	3*8($aaptr),%rax,%r11
2929	adcx	%rax,%r10
2930	adox	%r12,%r11
2931
2932	.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	4*8($aaptr),%rax,%r12
2933	adcx	%rax,%r11
2934	adox	%r13,%r12
2935
2936	mulx	5*8($aaptr),%rax,%r13
2937	adcx	%rax,%r12
2938	adox	%r14,%r13
2939
2940	mulx	6*8($aaptr),%rax,%r14
2941	 mov	%rbx,($tptr,%rcx,8)	# store t[8+i]
2942	 mov	\$0,%ebx
2943	adcx	%rax,%r13
2944	adox	%r15,%r14
2945
2946	.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00	# mulx	7*8($aaptr),%rax,%r15
2947	 mov	8($aptr,%rcx,8),%rdx	# a[i]
2948	adcx	%rax,%r14
2949	adox	%rbx,%r15		# %rbx is 0, of=0
2950	adcx	%rbx,%r15		# cf=0
2951
2952	.byte	0x67
2953	inc	%rcx			# of=0
2954	jnz	.Lsqrx8x_loop
2955
	# Advance to the next group of eight multiplicand words.
2956	lea	8*8($aaptr),$aaptr
2957	mov	\$-8,%rcx
2958	cmp	8+8(%rsp),$aaptr	# done?
2959	je	.Lsqrx8x_break
2960
	# Not done: restore the saved carry, add the next eight words of
	# t[] into the accumulators and save the new carry.
2961	sub	16+8(%rsp),%rbx		# mov 16(%rsp),%cf
2962	.byte	0x66
2963	mov	-64($aptr),%rdx
2964	adcx	0*8($tptr),%r8
2965	adcx	1*8($tptr),%r9
2966	adc	2*8($tptr),%r10
2967	adc	3*8($tptr),%r11
2968	adc	4*8($tptr),%r12
2969	adc	5*8($tptr),%r13
2970	adc	6*8($tptr),%r14
2971	adc	7*8($tptr),%r15
2972	lea	8*8($tptr),$tptr
2973	.byte	0x67
2974	sbb	%rax,%rax		# mov %cf,%rax
2975	xor	%ebx,%ebx		# cf=0, of=0
2976	mov	%rax,16+8(%rsp)		# offload carry
2977	jmp	.Lsqrx8x_loop
2978
	# All higher groups done for this row block.  If the write position
	# equals the t[] pointer saved before the inner loop, the accumulators
	# already are the next window; otherwise flush them to t[] and reload
	# the window at the saved pointer before resuming the outer loop.
2979.align	32
2980.Lsqrx8x_break:
2981	sub	16+8(%rsp),%r8		# consume last carry
2982	mov	24+8(%rsp),$carry	# initial $tptr, borrow $carry
2983	mov	0*8($aptr),%rdx		# a[8], modulo-scheduled
2984	xor	%ebp,%ebp		# xor	$zero,$zero
2985	mov	%r8,0*8($tptr)
2986	cmp	$carry,$tptr		# cf=0, of=0
2987	je	.Lsqrx8x_outer_loop
2988
2989	mov	%r9,1*8($tptr)
2990	 mov	1*8($carry),%r9
2991	mov	%r10,2*8($tptr)
2992	 mov	2*8($carry),%r10
2993	mov	%r11,3*8($tptr)
2994	 mov	3*8($carry),%r11
2995	mov	%r12,4*8($tptr)
2996	 mov	4*8($carry),%r12
2997	mov	%r13,5*8($tptr)
2998	 mov	5*8($carry),%r13
2999	mov	%r14,6*8($tptr)
3000	 mov	6*8($carry),%r14
3001	mov	%r15,7*8($tptr)
3002	 mov	7*8($carry),%r15
3003	mov	$carry,$tptr
3004	jmp	.Lsqrx8x_outer_loop
3005
	# Last group finished: store the remaining cross-product words and
	# fetch the negated byte count kept in %xmm3.
3006.align	32
3007.Lsqrx8x_outer_break:
3008	mov	%r9,9*8($tptr)		# t[9]
3009	 movq	%xmm3,%rcx		# -$num
3010	mov	%r10,10*8($tptr)	# ...
3011	mov	%r11,11*8($tptr)
3012	mov	%r12,12*8($tptr)
3013	mov	%r13,13*8($tptr)
3014	mov	%r14,14*8($tptr)
3015___
3016}{
# $i (%rcx) is a negative byte offset from the end of a[]; it is advanced by
# 32 per pass and the loop exits through jrcxz when it reaches zero.
3017my $i="%rcx";
3018$code.=<<___;
3019	lea	48+8(%rsp),$tptr
3020	mov	($aptr,$i),%rdx		# a[0]
3021
3022	mov	8($tptr),$A0[1]		# t[1]
3023	xor	$A0[0],$A0[0]		# t[0], of=0, cf=0
3024	mov	0+8(%rsp),$num		# restore $num
3025	adox	$A0[1],$A0[1]
3026	 mov	16($tptr),$A1[0]	# t[2]	# prefetch
3027	 mov	24($tptr),$A1[1]	# t[3]	# prefetch
3028	#jmp	.Lsqrx4x_shift_n_add	# happens to be aligned
3029
	# Step b) of the squaring: double the cross-product sum (adox of a
	# register with itself adds it to itself plus OF, i.e. shifts left by
	# one bit through the OF chain) and add the squares a[i]*a[i] (mulx
	# with %rdx as both operands) through the CF chain.  Four words of
	# a[] and eight words of t[] are handled per pass.
3030.align	32
3031.Lsqrx4x_shift_n_add:
3032	mulx	%rdx,%rax,%rbx
3033	 adox	$A1[0],$A1[0]
3034	adcx	$A0[0],%rax
3035	 .byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00	# mov	8($aptr,$i),%rdx	# a[i+1]	# prefetch
3036	 .byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00	# mov	32($tptr),$A0[0]	# t[2*i+4]	# prefetch
3037	 adox	$A1[1],$A1[1]
3038	adcx	$A0[1],%rbx
3039	 mov	40($tptr),$A0[1]		# t[2*i+4+1]	# prefetch
3040	mov	%rax,0($tptr)
3041	mov	%rbx,8($tptr)
3042
3043	mulx	%rdx,%rax,%rbx
3044	 adox	$A0[0],$A0[0]
3045	adcx	$A1[0],%rax
3046	 mov	16($aptr,$i),%rdx	# a[i+2]	# prefetch
3047	 mov	48($tptr),$A1[0]	# t[2*i+6]	# prefetch
3048	 adox	$A0[1],$A0[1]
3049	adcx	$A1[1],%rbx
3050	 mov	56($tptr),$A1[1]	# t[2*i+6+1]	# prefetch
3051	mov	%rax,16($tptr)
3052	mov	%rbx,24($tptr)
3053
3054	mulx	%rdx,%rax,%rbx
3055	 adox	$A1[0],$A1[0]
3056	adcx	$A0[0],%rax
3057	 mov	24($aptr,$i),%rdx	# a[i+3]	# prefetch
3058	 lea	32($i),$i
3059	 mov	64($tptr),$A0[0]	# t[2*i+8]	# prefetch
3060	 adox	$A1[1],$A1[1]
3061	adcx	$A0[1],%rbx
3062	 mov	72($tptr),$A0[1]	# t[2*i+8+1]	# prefetch
3063	mov	%rax,32($tptr)
3064	mov	%rbx,40($tptr)
3065
3066	mulx	%rdx,%rax,%rbx
3067	 adox	$A0[0],$A0[0]
3068	adcx	$A1[0],%rax
3069	jrcxz	.Lsqrx4x_shift_n_add_break	# offset reached zero; jrcxz leaves flags intact
3070	 .byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00	# mov	0($aptr,$i),%rdx	# a[i+4]	# prefetch
3071	 adox	$A0[1],$A0[1]
3072	adcx	$A1[1],%rbx
3073	 mov	80($tptr),$A1[0]	# t[2*i+10]	# prefetch
3074	 mov	88($tptr),$A1[1]	# t[2*i+10+1]	# prefetch
3075	mov	%rax,48($tptr)
3076	mov	%rbx,56($tptr)
3077	lea	64($tptr),$tptr
3078	nop
3079	jmp	.Lsqrx4x_shift_n_add
3080
	# Final two words of the last pass.
3081.align	32
3082.Lsqrx4x_shift_n_add_break:
3083	adcx	$A1[1],%rbx
3084	mov	%rax,48($tptr)
3085	mov	%rbx,56($tptr)
3086	lea	64($tptr),$tptr		# end of t[] buffer
3087___
3088}
3089######################################################################
3090# Montgomery reduction part, "word-by-word" algorithm.
3091#
3092# This new path is inspired by multiple submissions from Intel, by
3093# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
3094# Vinodh Gopal...
3095{
# In this section %rbp walks the modulus, %rsi ($carry) doubles as a zero
# register and carry scratch, and %rdx holds the current multiplier.
3096my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx");
3097
3098$code.=<<___;
3099	movq	%xmm2,$nptr
3100__bn_sqrx8x_reduction:
3101	xor	%eax,%eax		# initial top-most carry bit
3102	mov	32+8(%rsp),%rbx		# n0
3103	mov	48+8(%rsp),%rdx		# "%r8", 8*0($tptr)
3104	lea	-8*8($nptr,$num),%rcx	# end of n[]
3105	#lea	48+8(%rsp,$num,2),$tptr	# end of t[] buffer
3106	mov	%rcx, 0+8(%rsp)		# save end of n[]
3107	mov	$tptr,8+8(%rsp)		# save end of t[]
3108
3109	lea	48+8(%rsp),$tptr		# initial t[] window
3110	jmp	.Lsqrx8x_reduction_loop
3111
	# One pass per eight-word window of t[]: load the window into
	# %r8..%r15 (its lowest word arrives modulo-scheduled in %rdx) and
	# park the running top-most carry in the frame.
3112.align	32
3113.Lsqrx8x_reduction_loop:
3114	mov	8*1($tptr),%r9
3115	mov	8*2($tptr),%r10
3116	mov	8*3($tptr),%r11
3117	mov	8*4($tptr),%r12
3118	mov	%rdx,%r8
3119	imulq	%rbx,%rdx		# n0*a[i]
3120	mov	8*5($tptr),%r13
3121	mov	8*6($tptr),%r14
3122	mov	8*7($tptr),%r15
3123	mov	%rax,24+8(%rsp)		# store top-most carry bit
3124
3125	lea	8*8($tptr),$tptr
3126	xor	$carry,$carry		# cf=0,of=0
3127	mov	\$-8,%rcx
3128	jmp	.Lsqrx8x_reduce
3129
	# Eight iterations (%rcx from -8 to 0).  Each multiplies the first
	# eight modulus words by the current multiplier, which cancels the
	# lowest accumulator word, derives the next multiplier from the new
	# lowest word times n0, and puts the multiplier aside in the frame
	# for the tail loop.
3130.align	32
3131.Lsqrx8x_reduce:
3132	mov	%r8, %rbx
3133	mulx	8*0($nptr),%rax,%r8	# n[0]
3134	adcx	%rbx,%rax		# discarded
3135	adox	%r9,%r8
3136
3137	mulx	8*1($nptr),%rbx,%r9	# n[1]
3138	adcx	%rbx,%r8
3139	adox	%r10,%r9
3140
3141	mulx	8*2($nptr),%rbx,%r10
3142	adcx	%rbx,%r9
3143	adox	%r11,%r10
3144
3145	mulx	8*3($nptr),%rbx,%r11
3146	adcx	%rbx,%r10
3147	adox	%r12,%r11
3148
3149	.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	8*4($nptr),%rbx,%r12
3150	 mov	%rdx,%rax
3151	 mov	%r8,%rdx
3152	adcx	%rbx,%r11
3153	adox	%r13,%r12
3154
3155	 mulx	32+8(%rsp),%rbx,%rdx	# %rdx discarded
3156	 mov	%rax,%rdx
3157	 mov	%rax,64+48+8(%rsp,%rcx,8)	# put aside n0*a[i]
3158
3159	mulx	8*5($nptr),%rax,%r13
3160	adcx	%rax,%r12
3161	adox	%r14,%r13
3162
3163	mulx	8*6($nptr),%rax,%r14
3164	adcx	%rax,%r13
3165	adox	%r15,%r14
3166
3167	mulx	8*7($nptr),%rax,%r15
3168	 mov	%rbx,%rdx
3169	adcx	%rax,%r14
3170	adox	$carry,%r15		# $carry is 0
3171	adcx	$carry,%r15		# cf=0
3172
3173	.byte	0x67,0x67,0x67
3174	inc	%rcx			# of=0
3175	jnz	.Lsqrx8x_reduce
3176
3177	mov	$carry,%rax		# xor	%rax,%rax
3178	cmp	0+8(%rsp),$nptr		# end of n[]?
3179	jae	.Lsqrx8x_no_tail
3180
	# The modulus is longer than eight words: add the next window of
	# t[] to the accumulators and enter the tail loop.
3181	mov	48+8(%rsp),%rdx		# pull n0*a[0]
3182	add	8*0($tptr),%r8
3183	lea	8*8($nptr),$nptr
3184	mov	\$-8,%rcx
3185	adcx	8*1($tptr),%r9
3186	adcx	8*2($tptr),%r10
3187	adc	8*3($tptr),%r11
3188	adc	8*4($tptr),%r12
3189	adc	8*5($tptr),%r13
3190	adc	8*6($tptr),%r14
3191	adc	8*7($tptr),%r15
3192	lea	8*8($tptr),$tptr
3193	sbb	%rax,%rax		# top carry
3194
3195	xor	$carry,$carry		# of=0, cf=0
3196	mov	%rax,16+8(%rsp)
3197	jmp	.Lsqrx8x_tail
3198
	# Tail loop: multiply the current eight modulus words by each of the
	# eight multipliers put aside by the reduce loop, storing one result
	# word to t[] per iteration.
3199.align	32
3200.Lsqrx8x_tail:
3201	mov	%r8,%rbx
3202	mulx	8*0($nptr),%rax,%r8
3203	adcx	%rax,%rbx
3204	adox	%r9,%r8
3205
3206	mulx	8*1($nptr),%rax,%r9
3207	adcx	%rax,%r8
3208	adox	%r10,%r9
3209
3210	mulx	8*2($nptr),%rax,%r10
3211	adcx	%rax,%r9
3212	adox	%r11,%r10
3213
3214	mulx	8*3($nptr),%rax,%r11
3215	adcx	%rax,%r10
3216	adox	%r12,%r11
3217
3218	.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	8*4($nptr),%rax,%r12
3219	adcx	%rax,%r11
3220	adox	%r13,%r12
3221
3222	mulx	8*5($nptr),%rax,%r13
3223	adcx	%rax,%r12
3224	adox	%r14,%r13
3225
3226	mulx	8*6($nptr),%rax,%r14
3227	adcx	%rax,%r13
3228	adox	%r15,%r14
3229
3230	mulx	8*7($nptr),%rax,%r15
3231	 mov	72+48+8(%rsp,%rcx,8),%rdx	# pull n0*a[i]
3232	adcx	%rax,%r14
3233	adox	$carry,%r15
3234	 mov	%rbx,($tptr,%rcx,8)	# save result
3235	 mov	%r8,%rbx
3236	adcx	$carry,%r15		# cf=0
3237
3238	inc	%rcx			# of=0
3239	jnz	.Lsqrx8x_tail
3240
3241	cmp	0+8(%rsp),$nptr		# end of n[]?
3242	jae	.Lsqrx8x_tail_done	# break out of loop
3243
	# More modulus words: restore the saved carry, add the next window
	# of t[], save the new carry and go round again.
3244	sub	16+8(%rsp),$carry	# mov 16(%rsp),%cf
3245	 mov	48+8(%rsp),%rdx		# pull n0*a[0]
3246	 lea	8*8($nptr),$nptr
3247	adc	8*0($tptr),%r8
3248	adc	8*1($tptr),%r9
3249	adc	8*2($tptr),%r10
3250	adc	8*3($tptr),%r11
3251	adc	8*4($tptr),%r12
3252	adc	8*5($tptr),%r13
3253	adc	8*6($tptr),%r14
3254	adc	8*7($tptr),%r15
3255	lea	8*8($tptr),$tptr
3256	sbb	%rax,%rax
3257	sub	\$8,%rcx		# mov	\$-8,%rcx
3258
3259	xor	$carry,$carry		# of=0, cf=0
3260	mov	%rax,16+8(%rsp)
3261	jmp	.Lsqrx8x_tail
3262
3263.align	32
3264.Lsqrx8x_tail_done:
3265	add	24+8(%rsp),%r8		# can this overflow?
3266	adc	\$0,%r9
3267	adc	\$0,%r10
3268	adc	\$0,%r11
3269	adc	\$0,%r12
3270	adc	\$0,%r13
3271	adc	\$0,%r14
3272	adc	\$0,%r15		# can't overflow, because we
3273					# started with "overhung" part
3274					# of multiplication
3275	mov	$carry,%rax		# xor	%rax,%rax
3276
3277	sub	16+8(%rsp),$carry	# mov 16(%rsp),%cf
3278.Lsqrx8x_no_tail:			# %cf is 0 if jumped here
3279	adc	8*0($tptr),%r8
3280	 movq	%xmm3,%rcx
3281	adc	8*1($tptr),%r9
3282	 mov	8*7($nptr),$carry
3283	 movq	%xmm2,$nptr		# restore $nptr
3284	adc	8*2($tptr),%r10
3285	adc	8*3($tptr),%r11
3286	adc	8*4($tptr),%r12
3287	adc	8*5($tptr),%r13
3288	adc	8*6($tptr),%r14
3289	adc	8*7($tptr),%r15
3290	adc	%rax,%rax		# top-most carry
3291
3292	mov	32+8(%rsp),%rbx		# n0
3293	mov	8*8($tptr,%rcx),%rdx	# modulo-scheduled "%r8"
3294
3295	mov	%r8,8*0($tptr)		# store top 512 bits
3296	 lea	8*8($tptr),%r8		# borrow %r8
3297	mov	%r9,8*1($tptr)
3298	mov	%r10,8*2($tptr)
3299	mov	%r11,8*3($tptr)
3300	mov	%r12,8*4($tptr)
3301	mov	%r13,8*5($tptr)
3302	mov	%r14,8*6($tptr)
3303	mov	%r15,8*7($tptr)
3304
3305	lea	8*8($tptr,%rcx),$tptr	# start of current t[] window
3306	cmp	8+8(%rsp),%r8		# end of t[]?
3307	jb	.Lsqrx8x_reduction_loop
3308	ret
3309.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
3310___
3311}
3312##############################################################
3313# Post-condition, 4x unrolled
3314#
# Conditionally subtracts the modulus from the reduced value, selected by a
# mask instead of a branch, and writes the result to the output buffer.
3315{
3316my ($rptr,$nptr)=("%rdx","%rbp");
3317$code.=<<___;
3318.align	32
3319__bn_postx4x_internal:
	# On entry %rax is the top-most carry left by the reduction, %rcx
	# the negated byte count, %rbp the modulus and %rdi the value to
	# be post-processed.
3320	mov	8*0($nptr),%r12
3321	mov	%rcx,%r10		# -$num
3322	mov	%rcx,%r9		# -$num
3323	neg	%rax			# carry 0/1 becomes mask 0/all-ones
3324	sar	\$3+2,%rcx		# negated count of four-word groups
3325	#lea	48+8(%rsp,%r9),$tptr
3326	movq	%xmm1,$rptr		# restore $rptr
3327	movq	%xmm1,$aptr		# prepare for back-to-back call
3328	dec	%r12			# so that after 'not' we get -n[0]
3329	mov	8*1($nptr),%r13
3330	xor	%r8,%r8			# no incoming carry for the first group
3331	mov	8*2($nptr),%r14
3332	mov	8*3($nptr),%r15
3333	jmp	.Lsqrx4x_sub_entry
3334
3335.align	16
3336.Lsqrx4x_sub:
3337	mov	8*0($nptr),%r12
3338	mov	8*1($nptr),%r13
3339	mov	8*2($nptr),%r14
3340	mov	8*3($nptr),%r15
	# Shared entry point, also the target of a jump from the end of
	# mulx4x_internal.  andn yields the complemented modulus word ANDed
	# with the mask, so the additions below either subtract the modulus
	# (mask all-ones) or add zero (mask zero).
3341.Lsqrx4x_sub_entry:
3342	andn	%rax,%r12,%r12
3343	lea	8*4($nptr),$nptr
3344	andn	%rax,%r13,%r13
3345	andn	%rax,%r14,%r14
3346	andn	%rax,%r15,%r15
3347
3348	neg	%r8			# mov %r8,%cf
3349	adc	8*0($tptr),%r12
3350	adc	8*1($tptr),%r13
3351	adc	8*2($tptr),%r14
3352	adc	8*3($tptr),%r15
3353	mov	%r12,8*0($rptr)
3354	lea	8*4($tptr),$tptr
3355	mov	%r13,8*1($rptr)
3356	sbb	%r8,%r8			# mov %cf,%r8
3357	mov	%r14,8*2($rptr)
3358	mov	%r15,8*3($rptr)
3359	lea	8*4($rptr),$rptr
3360
3361	inc	%rcx
3362	jnz	.Lsqrx4x_sub
3363
3364	neg	%r9			# restore $num
3365
3366	ret
3367.size	__bn_postx4x_internal,.-__bn_postx4x_internal
3368___
3369}
3370}}}
3371{
# Argument registers for the three table helpers below; they are emitted as
# abi-omnipotent routines, so the Win64/Unix choice is made here in Perl.
3372my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order
3373				("%rdi","%esi","%rdx","%ecx");  # Unix order
3374my $out=$inp;
# Byte distance between consecutive words of one table entry: 2**5 entries
# of 8 bytes each are interleaved.
3375my $STRIDE=2**5*8;
3376my $N=$STRIDE/4;
3377
3378$code.=<<___;
# bn_get_bits5: returns in %eax the five bits of the byte array at the first
# argument that start at the bit offset given by the second argument.
3379.globl	bn_get_bits5
3380.type	bn_get_bits5,\@abi-omnipotent
3381.align	16
3382bn_get_bits5:
3383	lea	0($inp),%r10
3384	lea	1($inp),%r11		# same data viewed one byte further on
3385	mov	$num,%ecx
3386	shr	\$4,$num		# index of the 16-bit word
3387	and	\$15,%ecx		# bit position inside that word
3388	lea	-8(%ecx),%eax
3389	cmp	\$11,%ecx
	# Above position 11 the five bits would straddle the 16-bit word, so
	# read from the byte-shifted view and shift by eight bits less.
3390	cmova	%r11,%r10
3391	cmova	%eax,%ecx
3392	movzw	(%r10,$num,2),%eax
3393	shrl	%cl,%eax
3394	and	\$31,%eax
3395	ret
3396.size	bn_get_bits5,.-bn_get_bits5
3397
# bn_scatter5: copies the given number of 64-bit words from the input into
# the table, starting at slot idx (8 bytes per slot) and advancing 32*8 bytes
# per word, so word k of every entry shares one 256-byte row.  Does nothing
# when the word count is zero.
3398.globl	bn_scatter5
3399.type	bn_scatter5,\@abi-omnipotent
3400.align	16
3401bn_scatter5:
3402	cmp	\$0, $num
3403	jz	.Lscatter_epilogue
3404	lea	($tbl,$idx,8),$tbl	# first slot of the chosen entry
3405.Lscatter:
3406	mov	($inp),%rax
3407	lea	8($inp),$inp
3408	mov	%rax,($tbl)
3409	lea	32*8($tbl),$tbl		# next row
3410	sub	\$1,$num
3411	jnz	.Lscatter
3412.Lscatter_epilogue:
3413	ret
3414.size	bn_scatter5,.-bn_scatter5
3415
# bn_gather5: reads one entry out of the interleaved table without an
# index-dependent memory access pattern.  A 256-byte array of 32 quadword
# masks is built on the stack (all-ones only in the slot equal to idx); for
# every output word the whole 256-byte row is loaded, ANDed with the masks
# and ORed together.
3416.globl	bn_gather5
3417.type	bn_gather5,\@abi-omnipotent
3418.align	32
3419bn_gather5:
3420.LSEH_begin_bn_gather5:			# Win64 thing, but harmless in other cases
3421	# I can't trust assembler to use specific encoding:-(
	# (the prologue byte offsets are presumably what the unwind codes
	# in .LSEH_info_bn_gather5 refer to)
3422	.byte	0x4c,0x8d,0x14,0x24			#lea    (%rsp),%r10
3423	.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00	#sub	$0x108,%rsp
3424	lea	.Linc(%rip),%rax
3425	and	\$-16,%rsp		# shouldn't be formally required
3426
3427	movd	$idx,%xmm5
3428	movdqa	0(%rax),%xmm0		# 00000001000000010000000000000000
3429	movdqa	16(%rax),%xmm1		# 00000002000000020000000200000002
3430	lea	128($tbl),%r11		# size optimization
3431	lea	128(%rsp),%rax		# size optimization
3432
3433	pshufd	\$0,%xmm5,%xmm5		# broadcast $idx
3434	movdqa	%xmm1,%xmm4
3435	movdqa	%xmm1,%xmm2
3436___
3437########################################################################
3438# calculate mask by comparing 0..31 to $idx and save result to stack
3439#
# Each pass of this Perl loop emits code for four 16-byte mask stores (eight
# table slots).  The store of the fourth mask is deferred to the start of the
# next pass, or to the statement after the loop for the last one.
3440for($i=0;$i<$STRIDE/16;$i+=4) {
3441$code.=<<___;
3442	paddd	%xmm0,%xmm1
3443	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
3444___
3445$code.=<<___	if ($i);
3446	movdqa	%xmm3,`16*($i-1)-128`(%rax)
3447___
3448$code.=<<___;
3449	movdqa	%xmm4,%xmm3
3450
3451	paddd	%xmm1,%xmm2
3452	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
3453	movdqa	%xmm0,`16*($i+0)-128`(%rax)
3454	movdqa	%xmm4,%xmm0
3455
3456	paddd	%xmm2,%xmm3
3457	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
3458	movdqa	%xmm1,`16*($i+1)-128`(%rax)
3459	movdqa	%xmm4,%xmm1
3460
3461	paddd	%xmm3,%xmm0
3462	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
3463	movdqa	%xmm2,`16*($i+2)-128`(%rax)
3464	movdqa	%xmm4,%xmm2
3465___
3466}
3467$code.=<<___;
3468	movdqa	%xmm3,`16*($i-1)-128`(%rax)
3469	jmp	.Lgather
3470
3471.align	32
3472.Lgather:
3473	pxor	%xmm4,%xmm4
3474	pxor	%xmm5,%xmm5
3475___
# Emit the masked accumulation over one full 256-byte row, 64 bytes per
# pass of this Perl loop, using two accumulators (%xmm4 and %xmm5).
3476for($i=0;$i<$STRIDE/16;$i+=4) {
3477$code.=<<___;
3478	movdqa	`16*($i+0)-128`(%r11),%xmm0
3479	movdqa	`16*($i+1)-128`(%r11),%xmm1
3480	movdqa	`16*($i+2)-128`(%r11),%xmm2
3481	pand	`16*($i+0)-128`(%rax),%xmm0
3482	movdqa	`16*($i+3)-128`(%r11),%xmm3
3483	pand	`16*($i+1)-128`(%rax),%xmm1
3484	por	%xmm0,%xmm4
3485	pand	`16*($i+2)-128`(%rax),%xmm2
3486	por	%xmm1,%xmm5
3487	pand	`16*($i+3)-128`(%rax),%xmm3
3488	por	%xmm2,%xmm4
3489	por	%xmm3,%xmm5
3490___
3491}
3492$code.=<<___;
3493	por	%xmm5,%xmm4
3494	lea	$STRIDE(%r11),%r11
3495	pshufd	\$0x4e,%xmm4,%xmm0	# swap the two quadwords
3496	por	%xmm4,%xmm0		# fold them; the selected word ends up in the low half
3497	movq	%xmm0,($out)		# m0=bp[0]
3498	lea	8($out),$out
3499	sub	\$1,$num
3500	jnz	.Lgather
3501
3502	lea	(%r10),%rsp		# restore the stack pointer saved in the prologue
3503	ret
3504.LSEH_end_bn_gather5:
3505.size	bn_gather5,.-bn_gather5
3506___
3507}
3508$code.=<<___;
# Constants for the mask generation in bn_gather5: the first row is the
# starting index pair (0,0,1,1), the second row is the increment (2) that
# moves from one pair of slots to the next.  Each index is duplicated so
# that a dword compare yields a full quadword mask.
3509.align	64
3510.Linc:
3511	.long	0,0, 1,1
3512	.long	2,2, 2,2
3513.asciz	"Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
3514___
3515
3516# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
3517#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Win64 structured-exception handler shared by the routines listed in the
# .xdata section.  It recovers the caller's stack pointer and callee-saved
# registers from the routine's private frame, patches them into the CONTEXT
# record, and lets RtlVirtualUnwind continue the unwind.  HandlerData[] holds
# the RVAs of the routine's body and epilogue labels.
3518if ($win64) {
3519$rec="%rcx";
3520$frame="%rdx";
3521$context="%r8";
3522$disp="%r9";
3523
3524$code.=<<___;
3525.extern	__imp_RtlVirtualUnwind
3526.type	mul_handler,\@abi-omnipotent
3527.align	16
3528mul_handler:
3529	push	%rsi
3530	push	%rdi
3531	push	%rbx
3532	push	%rbp
3533	push	%r12
3534	push	%r13
3535	push	%r14
3536	push	%r15
3537	pushfq
3538	sub	\$64,%rsp
3539
3540	mov	120($context),%rax	# pull context->Rax
3541	mov	248($context),%rbx	# pull context->Rip
3542
3543	mov	8($disp),%rsi		# disp->ImageBase
3544	mov	56($disp),%r11		# disp->HandlerData
3545
	# Fault before the body label: nothing to restore beyond what
	# context->Rax already describes.
3546	mov	0(%r11),%r10d		# HandlerData[0]
3547	lea	(%rsi,%r10),%r10	# end of prologue label
3548	cmp	%r10,%rbx		# context->Rip<end of prologue label
3549	jb	.Lcommon_seh_tail
3550
3551	mov	152($context),%rax	# pull context->Rsp
3552
	# Fault at or after the epilogue label: the stack pointer in the
	# context is used as it is.
3553	mov	4(%r11),%r10d		# HandlerData[1]
3554	lea	(%rsi,%r10),%r10	# epilogue label
3555	cmp	%r10,%rbx		# context->Rip>=epilogue label
3556	jae	.Lcommon_seh_tail
3557
	# Inside a body.  Code located after .Lmul_epilogue keeps the
	# saved stack pointer at offset 40 of its frame; code before it
	# keeps it past an array indexed by the value read at context
	# offset 192 (labelled num below; presumably context->R9).
3558	lea	.Lmul_epilogue(%rip),%r10
3559	cmp	%r10,%rbx
3560	ja	.Lbody_40
3561
3562	mov	192($context),%r10	# pull $num
3563	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
3564
3565	jmp	.Lbody_proceed
3566
3567.Lbody_40:
3568	mov	40(%rax),%rax		# pull saved stack pointer
3569.Lbody_proceed:
	# Callee-saved registers sit just below the recovered stack pointer.
3570	mov	-8(%rax),%rbx
3571	mov	-16(%rax),%rbp
3572	mov	-24(%rax),%r12
3573	mov	-32(%rax),%r13
3574	mov	-40(%rax),%r14
3575	mov	-48(%rax),%r15
3576	mov	%rbx,144($context)	# restore context->Rbx
3577	mov	%rbp,160($context)	# restore context->Rbp
3578	mov	%r12,216($context)	# restore context->R12
3579	mov	%r13,224($context)	# restore context->R13
3580	mov	%r14,232($context)	# restore context->R14
3581	mov	%r15,240($context)	# restore context->R15
3582
3583.Lcommon_seh_tail:
3584	mov	8(%rax),%rdi
3585	mov	16(%rax),%rsi
3586	mov	%rax,152($context)	# restore context->Rsp
3587	mov	%rsi,168($context)	# restore context->Rsi
3588	mov	%rdi,176($context)	# restore context->Rdi
3589
	# Copy the patched CONTEXT into disp->ContextRecord.
3590	mov	40($disp),%rdi		# disp->ContextRecord
3591	mov	$context,%rsi		# context
3592	mov	\$154,%ecx		# sizeof(CONTEXT) in quadwords (rep movsq count)
3593	.long	0xa548f3fc		# cld; rep movsq
3594
	# RtlVirtualUnwind takes eight arguments; the fifth to eighth go
	# into the stack area reserved above.
3595	mov	$disp,%rsi
3596	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
3597	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
3598	mov	0(%rsi),%r8		# arg3, disp->ControlPc
3599	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
3600	mov	40(%rsi),%r10		# disp->ContextRecord
3601	lea	56(%rsi),%r11		# &disp->HandlerData
3602	lea	24(%rsi),%r12		# &disp->EstablisherFrame
3603	mov	%r10,32(%rsp)		# arg5
3604	mov	%r11,40(%rsp)		# arg6
3605	mov	%r12,48(%rsp)		# arg7
3606	mov	%rcx,56(%rsp)		# arg8, (NULL)
3607	call	*__imp_RtlVirtualUnwind(%rip)
3608
3609	mov	\$1,%eax		# ExceptionContinueSearch
3610	add	\$64,%rsp
3611	popfq
3612	pop	%r15
3613	pop	%r14
3614	pop	%r13
3615	pop	%r12
3616	pop	%rbp
3617	pop	%rbx
3618	pop	%rdi
3619	pop	%rsi
3620	ret
3621.size	mul_handler,.-mul_handler
3622
# Win64 unwind tables.  .pdata holds one (begin, end, unwind-info) RVA triple
# per function; .xdata holds the unwind info each triple points at.  Entries
# for the MULX/ADX routines are emitted only when the assembler supports them.
3623.section	.pdata
3624.align	4
3625	.rva	.LSEH_begin_bn_mul_mont_gather5
3626	.rva	.LSEH_end_bn_mul_mont_gather5
3627	.rva	.LSEH_info_bn_mul_mont_gather5
3628
3629	.rva	.LSEH_begin_bn_mul4x_mont_gather5
3630	.rva	.LSEH_end_bn_mul4x_mont_gather5
3631	.rva	.LSEH_info_bn_mul4x_mont_gather5
3632
3633	.rva	.LSEH_begin_bn_power5
3634	.rva	.LSEH_end_bn_power5
3635	.rva	.LSEH_info_bn_power5
3636
3637	.rva	.LSEH_begin_bn_from_mont8x
3638	.rva	.LSEH_end_bn_from_mont8x
3639	.rva	.LSEH_info_bn_from_mont8x
3640___
3641$code.=<<___ if ($addx);
3642	.rva	.LSEH_begin_bn_mulx4x_mont_gather5
3643	.rva	.LSEH_end_bn_mulx4x_mont_gather5
3644	.rva	.LSEH_info_bn_mulx4x_mont_gather5
3645
3646	.rva	.LSEH_begin_bn_powerx5
3647	.rva	.LSEH_end_bn_powerx5
3648	.rva	.LSEH_info_bn_powerx5
3649___
3650$code.=<<___;
3651	.rva	.LSEH_begin_bn_gather5
3652	.rva	.LSEH_end_bn_gather5
3653	.rva	.LSEH_info_bn_gather5
3654
# Each record below starts with the bytes 9,0,0,0 (version 1 with the
# exception-handler flag, no prologue and no unwind codes), followed by the
# handler RVA and the two label RVAs that mul_handler reads as HandlerData[].
3655.section	.xdata
3656.align	8
3657.LSEH_info_bn_mul_mont_gather5:
3658	.byte	9,0,0,0
3659	.rva	mul_handler
3660	.rva	.Lmul_body,.Lmul_epilogue		# HandlerData[]
3661.align	8
3662.LSEH_info_bn_mul4x_mont_gather5:
3663	.byte	9,0,0,0
3664	.rva	mul_handler
3665	.rva	.Lmul4x_body,.Lmul4x_epilogue		# HandlerData[]
3666.align	8
3667.LSEH_info_bn_power5:
3668	.byte	9,0,0,0
3669	.rva	mul_handler
3670	.rva	.Lpower5_body,.Lpower5_epilogue		# HandlerData[]
3671.align	8
3672.LSEH_info_bn_from_mont8x:
3673	.byte	9,0,0,0
3674	.rva	mul_handler
3675	.rva	.Lfrom_body,.Lfrom_epilogue		# HandlerData[]
3676___
3677$code.=<<___ if ($addx);
3678.align	8
3679.LSEH_info_bn_mulx4x_mont_gather5:
3680	.byte	9,0,0,0
3681	.rva	mul_handler
3682	.rva	.Lmulx4x_body,.Lmulx4x_epilogue		# HandlerData[]
3683.align	8
3684.LSEH_info_bn_powerx5:
3685	.byte	9,0,0,0
3686	.rva	mul_handler
3687	.rva	.Lpowerx5_body,.Lpowerx5_epilogue	# HandlerData[]
3688___
3689$code.=<<___;
# bn_gather5 has no handler; its fixed prologue is described with plain
# unwind codes instead.
3690.align	8
3691.LSEH_info_bn_gather5:
3692	.byte	0x01,0x0b,0x03,0x0a
3693	.byte	0x0b,0x01,0x21,0x00	# sub	rsp,0x108
3694	.byte	0x04,0xa3,0x00,0x00	# lea	r10,(rsp)
3695.align	8
3696___
3697}
3698
# Evaluate every backtick-quoted arithmetic expression embedded in the
# generated text (for example the displacement computations in bn_gather5).
3699$code =~ s/\`([^\`]*)\`/eval($1)/gem;
3700
# STDOUT is a pipe into the x86_64-xlate.pl translator.  Buffered write errors
# and a failing translator only become visible when the pipe is closed, so the
# result of close must be checked; otherwise a truncated or missing output
# file would go unnoticed by the build.
3701print $code;
3702close STDOUT or die "error closing STDOUT: $!";
3703