x86_64-mont5.pl revision 298998
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# August 2011.
#
# Companion to x86_64-mont.pl that optimizes cache-timing attack
# countermeasures. The subroutines are produced by replacing bp[i]
# references in their x86_64-mont.pl counterparts with cache-neutral
# references to a powers table computed in BN_mod_exp_mont_consttime.
# In addition, a subroutine that scatters elements of the powers table
# is implemented, so that scattering/gathering can be tuned without
# modifying bn_exp.c.

# August 2013.
#
# Add MULX/AD*X code paths and additional interfaces to optimize for
# the branch prediction unit. For input lengths that are multiples of 8
# the np argument is not just the modulus value, but one interleaved
# with 0. This is to optimize post-condition...

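# The net effect of the cache-neutral gather performed below can be modelled
# in plain Perl roughly as follows (an illustrative sketch only, not used by
# this script; limb j of power k is assumed to live at $tbl->[k + 32*j], as
# per the "interlaced" layout described above):
#
#	sub gather_ref {
#	    my ($tbl, $idx, $num) = @_;		# $idx is 0..2^5-1
#	    my @out = (0) x $num;
#	    for my $k (0 .. 31) {		# touch every table entry...
#		my $mask = ($k == $idx) ? ~0 : 0;	# ...but keep only one
#		$out[$_] |= $tbl->[$k + 32*$_] & $mask for (0 .. $num-1);
#	    }
#	    return @out;
#	}
#
# The assembly below computes the masks branchlessly with SSE2 compares, so
# the memory access pattern does not depend on the secret index.
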
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.23);
}

if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.10);
}

if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$addx = ($1>=12);
}

if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$addx = ($ver>=3.03);
}

# int bn_mul_mont_gather5(
$rp="%rdi";	# BN_ULONG *rp,
$ap="%rsi";	# const BN_ULONG *ap,
$bp="%rdx";	# const BN_ULONG *bp,
$np="%rcx";	# const BN_ULONG *np,
$n0="%r8";	# const BN_ULONG *n0,
$num="%r9";	# int num,
		# int idx);	# 0 to 2^5-1, "index" in $bp holding
				# pre-computed powers of a', interlaced
				# in such a manner that b[0] is $bp[idx],
				# b[1] is $bp[2^5+idx], etc.
$lo0="%r10";
$hi0="%r11";
$hi1="%r13";
$i="%r14";
$j="%r15";
$m0="%rbx";
$m1="%rbp";

$code=<<___;
81351280Sdim.text
82351280Sdim
83351280Sdim.extern	OPENSSL_ia32cap_P
84351280Sdim
85351280Sdim.globl	bn_mul_mont_gather5
86351280Sdim.type	bn_mul_mont_gather5,\@function,6
87351280Sdim.align	64
88351280Sdimbn_mul_mont_gather5:
89351280Sdim	test	\$7,${num}d
90351280Sdim	jnz	.Lmul_enter
91351280Sdim___
92351280Sdim$code.=<<___ if ($addx);
93351280Sdim	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
94351280Sdim___
95351280Sdim$code.=<<___;
96351280Sdim	jmp	.Lmul4x_enter
97351280Sdim
98351280Sdim.align	16
99351280Sdim.Lmul_enter:
100351280Sdim	mov	${num}d,${num}d
101351280Sdim	mov	%rsp,%rax
102351280Sdim	movd	`($win64?56:8)`(%rsp),%xmm5	# load 7th argument
103351280Sdim	lea	.Linc(%rip),%r10
104351280Sdim	push	%rbx
105351280Sdim	push	%rbp
106351280Sdim	push	%r12
107351280Sdim	push	%r13
108351280Sdim	push	%r14
109351280Sdim	push	%r15
110351280Sdim
111351280Sdim	lea	2($num),%r11
112351280Sdim	neg	%r11
113351280Sdim	lea	-264(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+2)+256+8)
114351280Sdim	and	\$-1024,%rsp		# minimize TLB usage
115351280Sdim
116351280Sdim	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
117351280Sdim.Lmul_body:
118351280Sdim	# Some OSes, *cough*-dows, insist on stack being "wired" to
119351280Sdim	# physical memory in strictly sequential manner, i.e. if stack
120351280Sdim	# allocation spans two pages, then reference to farmost one can
121351280Sdim	# be punishable by SEGV. But page walking can do good even on
122351280Sdim	# other OSes, because it guarantees that villain thread hits
123351280Sdim	# the guard page before it can make damage to innocent one...
124351280Sdim	sub	%rsp,%rax
125351280Sdim	and	\$-4096,%rax
126351280Sdim.Lmul_page_walk:
127351280Sdim	mov	(%rsp,%rax),%r11
128351280Sdim	sub	\$4096,%rax
129351280Sdim	.byte	0x2e			# predict non-taken
130351280Sdim	jnc	.Lmul_page_walk
131351280Sdim
132351280Sdim	lea	128($bp),%r12		# reassign $bp (+size optimization)
133351280Sdim___
134351280Sdim		$bp="%r12";
135351280Sdim		$STRIDE=2**5*8;		# 5 is "window size"
136351280Sdim		$N=$STRIDE/4;		# should match cache line size
137351280Sdim$code.=<<___;
138351280Sdim	movdqa	0(%r10),%xmm0		# 00000001000000010000000000000000
139351280Sdim	movdqa	16(%r10),%xmm1		# 00000002000000020000000200000002
140351280Sdim	lea	24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization)
141351280Sdim	and	\$-16,%r10
142351280Sdim
143351280Sdim	pshufd	\$0,%xmm5,%xmm5		# broadcast index
144351280Sdim	movdqa	%xmm1,%xmm4
145351280Sdim	movdqa	%xmm1,%xmm2
146351280Sdim___
147351280Sdim########################################################################
148351280Sdim# calculate mask by comparing 0..31 to index and save result to stack
149351280Sdim#
150351280Sdim$code.=<<___;
151351280Sdim	paddd	%xmm0,%xmm1
152351280Sdim	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
153351280Sdim	.byte	0x67
154351280Sdim	movdqa	%xmm4,%xmm3
155351280Sdim___
156351280Sdimfor($k=0;$k<$STRIDE/16-4;$k+=4) {
157351280Sdim$code.=<<___;
158351280Sdim	paddd	%xmm1,%xmm2
159351280Sdim	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
160351280Sdim	movdqa	%xmm0,`16*($k+0)+112`(%r10)
161351280Sdim	movdqa	%xmm4,%xmm0
162351280Sdim
163351280Sdim	paddd	%xmm2,%xmm3
164351280Sdim	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
165351280Sdim	movdqa	%xmm1,`16*($k+1)+112`(%r10)
166351280Sdim	movdqa	%xmm4,%xmm1
167351280Sdim
168351280Sdim	paddd	%xmm3,%xmm0
169351280Sdim	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
170351280Sdim	movdqa	%xmm2,`16*($k+2)+112`(%r10)
171351280Sdim	movdqa	%xmm4,%xmm2
172351280Sdim
173351280Sdim	paddd	%xmm0,%xmm1
174351280Sdim	pcmpeqd	%xmm5,%xmm0
175351280Sdim	movdqa	%xmm3,`16*($k+3)+112`(%r10)
176351280Sdim	movdqa	%xmm4,%xmm3
177351280Sdim___
178351280Sdim}
179351280Sdim$code.=<<___;				# last iteration can be optimized
180351280Sdim	paddd	%xmm1,%xmm2
181351280Sdim	pcmpeqd	%xmm5,%xmm1
182351280Sdim	movdqa	%xmm0,`16*($k+0)+112`(%r10)
183351280Sdim
184351280Sdim	paddd	%xmm2,%xmm3
185351280Sdim	.byte	0x67
186351280Sdim	pcmpeqd	%xmm5,%xmm2
187351280Sdim	movdqa	%xmm1,`16*($k+1)+112`(%r10)
188351280Sdim
189351280Sdim	pcmpeqd	%xmm5,%xmm3
190351280Sdim	movdqa	%xmm2,`16*($k+2)+112`(%r10)
191351280Sdim	pand	`16*($k+0)-128`($bp),%xmm0	# while it's still in register
192351280Sdim
193351280Sdim	pand	`16*($k+1)-128`($bp),%xmm1
194351280Sdim	pand	`16*($k+2)-128`($bp),%xmm2
195351280Sdim	movdqa	%xmm3,`16*($k+3)+112`(%r10)
196351280Sdim	pand	`16*($k+3)-128`($bp),%xmm3
197351280Sdim	por	%xmm2,%xmm0
198351280Sdim	por	%xmm3,%xmm1
199351280Sdim___
200351280Sdimfor($k=0;$k<$STRIDE/16-4;$k+=4) {
201351280Sdim$code.=<<___;
202351280Sdim	movdqa	`16*($k+0)-128`($bp),%xmm4
203351280Sdim	movdqa	`16*($k+1)-128`($bp),%xmm5
204351280Sdim	movdqa	`16*($k+2)-128`($bp),%xmm2
205351280Sdim	pand	`16*($k+0)+112`(%r10),%xmm4
206351280Sdim	movdqa	`16*($k+3)-128`($bp),%xmm3
207351280Sdim	pand	`16*($k+1)+112`(%r10),%xmm5
208351280Sdim	por	%xmm4,%xmm0
209351280Sdim	pand	`16*($k+2)+112`(%r10),%xmm2
210351280Sdim	por	%xmm5,%xmm1
211351280Sdim	pand	`16*($k+3)+112`(%r10),%xmm3
212351280Sdim	por	%xmm2,%xmm0
213351280Sdim	por	%xmm3,%xmm1
214351280Sdim___
215351280Sdim}
216351280Sdim$code.=<<___;
217351280Sdim	por	%xmm1,%xmm0
218351280Sdim	pshufd	\$0x4e,%xmm0,%xmm1
219351280Sdim	por	%xmm1,%xmm0
220351280Sdim	lea	$STRIDE($bp),$bp
221351280Sdim	movq	%xmm0,$m0		# m0=bp[0]
222351280Sdim
223351280Sdim	mov	($n0),$n0		# pull n0[0] value
224351280Sdim	mov	($ap),%rax
225351280Sdim
226351280Sdim	xor	$i,$i			# i=0
227351280Sdim	xor	$j,$j			# j=0
228
229	mov	$n0,$m1
230	mulq	$m0			# ap[0]*bp[0]
231	mov	%rax,$lo0
232	mov	($np),%rax
233
234	imulq	$lo0,$m1		# "tp[0]"*n0
235	mov	%rdx,$hi0
236
237	mulq	$m1			# np[0]*m1
238	add	%rax,$lo0		# discarded
239	mov	8($ap),%rax
240	adc	\$0,%rdx
241	mov	%rdx,$hi1
242
243	lea	1($j),$j		# j++
244	jmp	.L1st_enter
245
246.align	16
247.L1st:
248	add	%rax,$hi1
249	mov	($ap,$j,8),%rax
250	adc	\$0,%rdx
251	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
252	mov	$lo0,$hi0
253	adc	\$0,%rdx
254	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
255	mov	%rdx,$hi1
256
257.L1st_enter:
258	mulq	$m0			# ap[j]*bp[0]
259	add	%rax,$hi0
260	mov	($np,$j,8),%rax
261	adc	\$0,%rdx
262	lea	1($j),$j		# j++
263	mov	%rdx,$lo0
264
265	mulq	$m1			# np[j]*m1
266	cmp	$num,$j
267	jne	.L1st			# note that upon exit $j==$num, so
268					# they can be used interchangeably
269
270	add	%rax,$hi1
271	adc	\$0,%rdx
272	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
273	adc	\$0,%rdx
274	mov	$hi1,-16(%rsp,$num,8)	# tp[num-1]
275	mov	%rdx,$hi1
276	mov	$lo0,$hi0
277
278	xor	%rdx,%rdx
279	add	$hi0,$hi1
280	adc	\$0,%rdx
281	mov	$hi1,-8(%rsp,$num,8)
282	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
283
284	lea	1($i),$i		# i++
285	jmp	.Louter
286.align	16
287.Louter:
288	lea	24+128(%rsp,$num,8),%rdx	# where 256-byte mask is (+size optimization)
289	and	\$-16,%rdx
290	pxor	%xmm4,%xmm4
291	pxor	%xmm5,%xmm5
292___
293for($k=0;$k<$STRIDE/16;$k+=4) {
294$code.=<<___;
295	movdqa	`16*($k+0)-128`($bp),%xmm0
296	movdqa	`16*($k+1)-128`($bp),%xmm1
297	movdqa	`16*($k+2)-128`($bp),%xmm2
298	movdqa	`16*($k+3)-128`($bp),%xmm3
299	pand	`16*($k+0)-128`(%rdx),%xmm0
300	pand	`16*($k+1)-128`(%rdx),%xmm1
301	por	%xmm0,%xmm4
302	pand	`16*($k+2)-128`(%rdx),%xmm2
303	por	%xmm1,%xmm5
304	pand	`16*($k+3)-128`(%rdx),%xmm3
305	por	%xmm2,%xmm4
306	por	%xmm3,%xmm5
307___
308}
309$code.=<<___;
310	por	%xmm5,%xmm4
311	pshufd	\$0x4e,%xmm4,%xmm0
312	por	%xmm4,%xmm0
313	lea	$STRIDE($bp),$bp
314
315	mov	($ap),%rax		# ap[0]
316	movq	%xmm0,$m0		# m0=bp[i]
317
318	xor	$j,$j			# j=0
319	mov	$n0,$m1
320	mov	(%rsp),$lo0
321
322	mulq	$m0			# ap[0]*bp[i]
323	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
324	mov	($np),%rax
325	adc	\$0,%rdx
326
327	imulq	$lo0,$m1		# tp[0]*n0
328	mov	%rdx,$hi0
329
330	mulq	$m1			# np[0]*m1
331	add	%rax,$lo0		# discarded
332	mov	8($ap),%rax
333	adc	\$0,%rdx
334	mov	8(%rsp),$lo0		# tp[1]
335	mov	%rdx,$hi1
336
337	lea	1($j),$j		# j++
338	jmp	.Linner_enter
339
340.align	16
341.Linner:
342	add	%rax,$hi1
343	mov	($ap,$j,8),%rax
344	adc	\$0,%rdx
345	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
346	mov	(%rsp,$j,8),$lo0
347	adc	\$0,%rdx
348	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
349	mov	%rdx,$hi1
350
351.Linner_enter:
352	mulq	$m0			# ap[j]*bp[i]
353	add	%rax,$hi0
354	mov	($np,$j,8),%rax
355	adc	\$0,%rdx
356	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
357	mov	%rdx,$hi0
358	adc	\$0,$hi0
359	lea	1($j),$j		# j++
360
361	mulq	$m1			# np[j]*m1
362	cmp	$num,$j
363	jne	.Linner			# note that upon exit $j==$num, so
364					# they can be used interchangeably
365	add	%rax,$hi1
366	adc	\$0,%rdx
367	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
368	mov	(%rsp,$num,8),$lo0
369	adc	\$0,%rdx
370	mov	$hi1,-16(%rsp,$num,8)	# tp[num-1]
371	mov	%rdx,$hi1
372
373	xor	%rdx,%rdx
374	add	$hi0,$hi1
375	adc	\$0,%rdx
376	add	$lo0,$hi1		# pull upmost overflow bit
377	adc	\$0,%rdx
378	mov	$hi1,-8(%rsp,$num,8)
379	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
380
381	lea	1($i),$i		# i++
382	cmp	$num,$i
383	jb	.Louter
384
385	xor	$i,$i			# i=0 and clear CF!
386	mov	(%rsp),%rax		# tp[0]
387	lea	(%rsp),$ap		# borrow ap for tp
388	mov	$num,$j			# j=num
389	jmp	.Lsub
390.align	16
391.Lsub:	sbb	($np,$i,8),%rax
392	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
393	mov	8($ap,$i,8),%rax	# tp[i+1]
394	lea	1($i),$i		# i++
	dec	$j			# doesn't affect CF!
396	jnz	.Lsub
397
398	sbb	\$0,%rax		# handle upmost overflow bit
399	xor	$i,$i
400	and	%rax,$ap
401	not	%rax
402	mov	$rp,$np
403	and	%rax,$np
404	mov	$num,$j			# j=num
405	or	$np,$ap			# ap=borrow?tp:rp
406.align	16
407.Lcopy:					# copy or in-place refresh
408	mov	($ap,$i,8),%rax
409	mov	$i,(%rsp,$i,8)		# zap temporary vector
410	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
411	lea	1($i),$i
412	sub	\$1,$j
413	jnz	.Lcopy
414
415	mov	8(%rsp,$num,8),%rsi	# restore %rsp
416	mov	\$1,%rax
417
418	mov	-48(%rsi),%r15
419	mov	-40(%rsi),%r14
420	mov	-32(%rsi),%r13
421	mov	-24(%rsi),%r12
422	mov	-16(%rsi),%rbp
423	mov	-8(%rsi),%rbx
424	lea	(%rsi),%rsp
425.Lmul_epilogue:
426	ret
427.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
428___
429{{{
430my @A=("%r10","%r11");
431my @N=("%r13","%rdi");
432$code.=<<___;
433.type	bn_mul4x_mont_gather5,\@function,6
434.align	32
435bn_mul4x_mont_gather5:
436.Lmul4x_enter:
437___
438$code.=<<___ if ($addx);
439	and	\$0x80108,%r11d
440	cmp	\$0x80108,%r11d		# check for AD*X+BMI2+BMI1
441	je	.Lmulx4x_enter
442___
443$code.=<<___;
444	.byte	0x67
445	mov	%rsp,%rax
446	push	%rbx
447	push	%rbp
448	push	%r12
449	push	%r13
450	push	%r14
451	push	%r15
452
453	.byte	0x67
454	shl	\$3,${num}d		# convert $num to bytes
455	lea	($num,$num,2),%r10	# 3*$num in bytes
456	neg	$num			# -$num
457
	##############################################################
	# Ensure that the stack frame doesn't alias with $rptr+3*$num
	# modulo 4096, which covers ret[num], am[num] and n[num]
	# (see bn_exp.c). This is done to allow the memory disambiguation
	# logic to do its magic. [An extra [num] is allocated in order
	# to align with bn_power5's frame, which is cleansed after
	# completing the exponentiation. The extra 256 bytes are for the
	# power mask calculated from the 7th argument, the index.]
	#
467	lea	-320(%rsp,$num,2),%r11
468	sub	$rp,%r11
469	and	\$4095,%r11
470	cmp	%r11,%r10
471	jb	.Lmul4xsp_alt
472	sub	%r11,%rsp		# align with $rp
473	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*num*8+256)
474	jmp	.Lmul4xsp_done
475
476.align	32
477.Lmul4xsp_alt:
478	lea	4096-320(,$num,2),%r10
479	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*num*8+256)
480	sub	%r10,%r11
481	mov	\$0,%r10
482	cmovc	%r10,%r11
483	sub	%r11,%rsp
484.Lmul4xsp_done:
485	and	\$-64,%rsp
486	mov	%rax,%r11
487	sub	%rsp,%r11
488	and	\$-4096,%r11
489.Lmul4x_page_walk:
490	mov	(%rsp,%r11),%r10
491	sub	\$4096,%r11
492	.byte	0x2e			# predict non-taken
493	jnc	.Lmul4x_page_walk
494
495	neg	$num
496
497	mov	%rax,40(%rsp)
498.Lmul4x_body:
499
500	call	mul4x_internal
501
502	mov	40(%rsp),%rsi		# restore %rsp
503	mov	\$1,%rax
504
505	mov	-48(%rsi),%r15
506	mov	-40(%rsi),%r14
507	mov	-32(%rsi),%r13
508	mov	-24(%rsi),%r12
509	mov	-16(%rsi),%rbp
510	mov	-8(%rsi),%rbx
511	lea	(%rsi),%rsp
512.Lmul4x_epilogue:
513	ret
514.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
515
516.type	mul4x_internal,\@abi-omnipotent
517.align	32
518mul4x_internal:
519	shl	\$5,$num		# $num was in bytes
520	movd	`($win64?56:8)`(%rax),%xmm5	# load 7th argument, index
521	lea	.Linc(%rip),%rax
522	lea	128(%rdx,$num),%r13	# end of powers table (+size optimization)
523	shr	\$5,$num		# restore $num
524___
525		$bp="%r12";
526		$STRIDE=2**5*8;		# 5 is "window size"
527		$N=$STRIDE/4;		# should match cache line size
528		$tp=$i;
529$code.=<<___;
530	movdqa	0(%rax),%xmm0		# 00000001000000010000000000000000
531	movdqa	16(%rax),%xmm1		# 00000002000000020000000200000002
532	lea	88-112(%rsp,$num),%r10	# place the mask after tp[num+1] (+ICache optimization)
533	lea	128(%rdx),$bp		# size optimization
534
535	pshufd	\$0,%xmm5,%xmm5		# broadcast index
536	movdqa	%xmm1,%xmm4
537	.byte	0x67,0x67
538	movdqa	%xmm1,%xmm2
539___
540########################################################################
541# calculate mask by comparing 0..31 to index and save result to stack
542#
543$code.=<<___;
544	paddd	%xmm0,%xmm1
545	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
546	.byte	0x67
547	movdqa	%xmm4,%xmm3
548___
549for($i=0;$i<$STRIDE/16-4;$i+=4) {
550$code.=<<___;
551	paddd	%xmm1,%xmm2
552	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
553	movdqa	%xmm0,`16*($i+0)+112`(%r10)
554	movdqa	%xmm4,%xmm0
555
556	paddd	%xmm2,%xmm3
557	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
558	movdqa	%xmm1,`16*($i+1)+112`(%r10)
559	movdqa	%xmm4,%xmm1
560
561	paddd	%xmm3,%xmm0
562	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
563	movdqa	%xmm2,`16*($i+2)+112`(%r10)
564	movdqa	%xmm4,%xmm2
565
566	paddd	%xmm0,%xmm1
567	pcmpeqd	%xmm5,%xmm0
568	movdqa	%xmm3,`16*($i+3)+112`(%r10)
569	movdqa	%xmm4,%xmm3
570___
571}
572$code.=<<___;				# last iteration can be optimized
573	paddd	%xmm1,%xmm2
574	pcmpeqd	%xmm5,%xmm1
575	movdqa	%xmm0,`16*($i+0)+112`(%r10)
576
577	paddd	%xmm2,%xmm3
578	.byte	0x67
579	pcmpeqd	%xmm5,%xmm2
580	movdqa	%xmm1,`16*($i+1)+112`(%r10)
581
582	pcmpeqd	%xmm5,%xmm3
583	movdqa	%xmm2,`16*($i+2)+112`(%r10)
584	pand	`16*($i+0)-128`($bp),%xmm0	# while it's still in register
585
586	pand	`16*($i+1)-128`($bp),%xmm1
587	pand	`16*($i+2)-128`($bp),%xmm2
588	movdqa	%xmm3,`16*($i+3)+112`(%r10)
589	pand	`16*($i+3)-128`($bp),%xmm3
590	por	%xmm2,%xmm0
591	por	%xmm3,%xmm1
592___
593for($i=0;$i<$STRIDE/16-4;$i+=4) {
594$code.=<<___;
595	movdqa	`16*($i+0)-128`($bp),%xmm4
596	movdqa	`16*($i+1)-128`($bp),%xmm5
597	movdqa	`16*($i+2)-128`($bp),%xmm2
598	pand	`16*($i+0)+112`(%r10),%xmm4
599	movdqa	`16*($i+3)-128`($bp),%xmm3
600	pand	`16*($i+1)+112`(%r10),%xmm5
601	por	%xmm4,%xmm0
602	pand	`16*($i+2)+112`(%r10),%xmm2
603	por	%xmm5,%xmm1
604	pand	`16*($i+3)+112`(%r10),%xmm3
605	por	%xmm2,%xmm0
606	por	%xmm3,%xmm1
607___
608}
609$code.=<<___;
610	por	%xmm1,%xmm0
611	pshufd	\$0x4e,%xmm0,%xmm1
612	por	%xmm1,%xmm0
613	lea	$STRIDE($bp),$bp
614	movq	%xmm0,$m0		# m0=bp[0]
615
616	mov	%r13,16+8(%rsp)		# save end of b[num]
617	mov	$rp, 56+8(%rsp)		# save $rp
618
619	mov	($n0),$n0		# pull n0[0] value
620	mov	($ap),%rax
621	lea	($ap,$num),$ap		# end of a[num]
622	neg	$num
623
624	mov	$n0,$m1
625	mulq	$m0			# ap[0]*bp[0]
626	mov	%rax,$A[0]
627	mov	($np),%rax
628
629	imulq	$A[0],$m1		# "tp[0]"*n0
630	lea	64+8(%rsp),$tp
631	mov	%rdx,$A[1]
632
633	mulq	$m1			# np[0]*m1
634	add	%rax,$A[0]		# discarded
635	mov	8($ap,$num),%rax
636	adc	\$0,%rdx
637	mov	%rdx,$N[1]
638
639	mulq	$m0
640	add	%rax,$A[1]
641	mov	8*1($np),%rax
642	adc	\$0,%rdx
643	mov	%rdx,$A[0]
644
645	mulq	$m1
646	add	%rax,$N[1]
647	mov	16($ap,$num),%rax
648	adc	\$0,%rdx
649	add	$A[1],$N[1]
650	lea	4*8($num),$j		# j=4
651	lea	8*4($np),$np
652	adc	\$0,%rdx
653	mov	$N[1],($tp)
654	mov	%rdx,$N[0]
655	jmp	.L1st4x
656
657.align	32
658.L1st4x:
659	mulq	$m0			# ap[j]*bp[0]
660	add	%rax,$A[0]
661	mov	-8*2($np),%rax
662	lea	32($tp),$tp
663	adc	\$0,%rdx
664	mov	%rdx,$A[1]
665
666	mulq	$m1			# np[j]*m1
667	add	%rax,$N[0]
668	mov	-8($ap,$j),%rax
669	adc	\$0,%rdx
670	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
671	adc	\$0,%rdx
672	mov	$N[0],-24($tp)		# tp[j-1]
673	mov	%rdx,$N[1]
674
675	mulq	$m0			# ap[j]*bp[0]
676	add	%rax,$A[1]
677	mov	-8*1($np),%rax
678	adc	\$0,%rdx
679	mov	%rdx,$A[0]
680
681	mulq	$m1			# np[j]*m1
682	add	%rax,$N[1]
683	mov	($ap,$j),%rax
684	adc	\$0,%rdx
685	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
686	adc	\$0,%rdx
687	mov	$N[1],-16($tp)		# tp[j-1]
688	mov	%rdx,$N[0]
689
690	mulq	$m0			# ap[j]*bp[0]
691	add	%rax,$A[0]
692	mov	8*0($np),%rax
693	adc	\$0,%rdx
694	mov	%rdx,$A[1]
695
696	mulq	$m1			# np[j]*m1
697	add	%rax,$N[0]
698	mov	8($ap,$j),%rax
699	adc	\$0,%rdx
700	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
701	adc	\$0,%rdx
702	mov	$N[0],-8($tp)		# tp[j-1]
703	mov	%rdx,$N[1]
704
705	mulq	$m0			# ap[j]*bp[0]
706	add	%rax,$A[1]
707	mov	8*1($np),%rax
708	adc	\$0,%rdx
709	mov	%rdx,$A[0]
710
711	mulq	$m1			# np[j]*m1
712	add	%rax,$N[1]
713	mov	16($ap,$j),%rax
714	adc	\$0,%rdx
715	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
716	lea	8*4($np),$np
717	adc	\$0,%rdx
718	mov	$N[1],($tp)		# tp[j-1]
719	mov	%rdx,$N[0]
720
721	add	\$32,$j			# j+=4
722	jnz	.L1st4x
723
724	mulq	$m0			# ap[j]*bp[0]
725	add	%rax,$A[0]
726	mov	-8*2($np),%rax
727	lea	32($tp),$tp
728	adc	\$0,%rdx
729	mov	%rdx,$A[1]
730
731	mulq	$m1			# np[j]*m1
732	add	%rax,$N[0]
733	mov	-8($ap),%rax
734	adc	\$0,%rdx
735	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
736	adc	\$0,%rdx
737	mov	$N[0],-24($tp)		# tp[j-1]
738	mov	%rdx,$N[1]
739
740	mulq	$m0			# ap[j]*bp[0]
741	add	%rax,$A[1]
742	mov	-8*1($np),%rax
743	adc	\$0,%rdx
744	mov	%rdx,$A[0]
745
746	mulq	$m1			# np[j]*m1
747	add	%rax,$N[1]
748	mov	($ap,$num),%rax		# ap[0]
749	adc	\$0,%rdx
750	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
751	adc	\$0,%rdx
752	mov	$N[1],-16($tp)		# tp[j-1]
753	mov	%rdx,$N[0]
754
755	lea	($np,$num),$np		# rewind $np
756
757	xor	$N[1],$N[1]
758	add	$A[0],$N[0]
759	adc	\$0,$N[1]
760	mov	$N[0],-8($tp)
761
762	jmp	.Louter4x
763
764.align	32
765.Louter4x:
766	lea	16+128($tp),%rdx	# where 256-byte mask is (+size optimization)
767	pxor	%xmm4,%xmm4
768	pxor	%xmm5,%xmm5
769___
770for($i=0;$i<$STRIDE/16;$i+=4) {
771$code.=<<___;
772	movdqa	`16*($i+0)-128`($bp),%xmm0
773	movdqa	`16*($i+1)-128`($bp),%xmm1
774	movdqa	`16*($i+2)-128`($bp),%xmm2
775	movdqa	`16*($i+3)-128`($bp),%xmm3
776	pand	`16*($i+0)-128`(%rdx),%xmm0
777	pand	`16*($i+1)-128`(%rdx),%xmm1
778	por	%xmm0,%xmm4
779	pand	`16*($i+2)-128`(%rdx),%xmm2
780	por	%xmm1,%xmm5
781	pand	`16*($i+3)-128`(%rdx),%xmm3
782	por	%xmm2,%xmm4
783	por	%xmm3,%xmm5
784___
785}
786$code.=<<___;
787	por	%xmm5,%xmm4
788	pshufd	\$0x4e,%xmm4,%xmm0
789	por	%xmm4,%xmm0
790	lea	$STRIDE($bp),$bp
791	movq	%xmm0,$m0		# m0=bp[i]
792
793	mov	($tp,$num),$A[0]
794	mov	$n0,$m1
795	mulq	$m0			# ap[0]*bp[i]
796	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
797	mov	($np),%rax
798	adc	\$0,%rdx
799
800	imulq	$A[0],$m1		# tp[0]*n0
801	mov	%rdx,$A[1]
802	mov	$N[1],($tp)		# store upmost overflow bit
803
804	lea	($tp,$num),$tp		# rewind $tp
805
806	mulq	$m1			# np[0]*m1
807	add	%rax,$A[0]		# "$N[0]", discarded
808	mov	8($ap,$num),%rax
809	adc	\$0,%rdx
810	mov	%rdx,$N[1]
811
812	mulq	$m0			# ap[j]*bp[i]
813	add	%rax,$A[1]
814	mov	8*1($np),%rax
815	adc	\$0,%rdx
816	add	8($tp),$A[1]		# +tp[1]
817	adc	\$0,%rdx
818	mov	%rdx,$A[0]
819
820	mulq	$m1			# np[j]*m1
821	add	%rax,$N[1]
822	mov	16($ap,$num),%rax
823	adc	\$0,%rdx
824	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
825	lea	4*8($num),$j		# j=4
826	lea	8*4($np),$np
827	adc	\$0,%rdx
828	mov	%rdx,$N[0]
829	jmp	.Linner4x
830
831.align	32
832.Linner4x:
833	mulq	$m0			# ap[j]*bp[i]
834	add	%rax,$A[0]
835	mov	-8*2($np),%rax
836	adc	\$0,%rdx
837	add	16($tp),$A[0]		# ap[j]*bp[i]+tp[j]
838	lea	32($tp),$tp
839	adc	\$0,%rdx
840	mov	%rdx,$A[1]
841
842	mulq	$m1			# np[j]*m1
843	add	%rax,$N[0]
844	mov	-8($ap,$j),%rax
845	adc	\$0,%rdx
846	add	$A[0],$N[0]
847	adc	\$0,%rdx
848	mov	$N[1],-32($tp)		# tp[j-1]
849	mov	%rdx,$N[1]
850
851	mulq	$m0			# ap[j]*bp[i]
852	add	%rax,$A[1]
853	mov	-8*1($np),%rax
854	adc	\$0,%rdx
855	add	-8($tp),$A[1]
856	adc	\$0,%rdx
857	mov	%rdx,$A[0]
858
859	mulq	$m1			# np[j]*m1
860	add	%rax,$N[1]
861	mov	($ap,$j),%rax
862	adc	\$0,%rdx
863	add	$A[1],$N[1]
864	adc	\$0,%rdx
865	mov	$N[0],-24($tp)		# tp[j-1]
866	mov	%rdx,$N[0]
867
868	mulq	$m0			# ap[j]*bp[i]
869	add	%rax,$A[0]
870	mov	8*0($np),%rax
871	adc	\$0,%rdx
872	add	($tp),$A[0]		# ap[j]*bp[i]+tp[j]
873	adc	\$0,%rdx
874	mov	%rdx,$A[1]
875
876	mulq	$m1			# np[j]*m1
877	add	%rax,$N[0]
878	mov	8($ap,$j),%rax
879	adc	\$0,%rdx
880	add	$A[0],$N[0]
881	adc	\$0,%rdx
882	mov	$N[1],-16($tp)		# tp[j-1]
883	mov	%rdx,$N[1]
884
885	mulq	$m0			# ap[j]*bp[i]
886	add	%rax,$A[1]
887	mov	8*1($np),%rax
888	adc	\$0,%rdx
889	add	8($tp),$A[1]
890	adc	\$0,%rdx
891	mov	%rdx,$A[0]
892
893	mulq	$m1			# np[j]*m1
894	add	%rax,$N[1]
895	mov	16($ap,$j),%rax
896	adc	\$0,%rdx
897	add	$A[1],$N[1]
898	lea	8*4($np),$np
899	adc	\$0,%rdx
900	mov	$N[0],-8($tp)		# tp[j-1]
901	mov	%rdx,$N[0]
902
903	add	\$32,$j			# j+=4
904	jnz	.Linner4x
905
906	mulq	$m0			# ap[j]*bp[i]
907	add	%rax,$A[0]
908	mov	-8*2($np),%rax
909	adc	\$0,%rdx
910	add	16($tp),$A[0]		# ap[j]*bp[i]+tp[j]
911	lea	32($tp),$tp
912	adc	\$0,%rdx
913	mov	%rdx,$A[1]
914
915	mulq	$m1			# np[j]*m1
916	add	%rax,$N[0]
917	mov	-8($ap),%rax
918	adc	\$0,%rdx
919	add	$A[0],$N[0]
920	adc	\$0,%rdx
921	mov	$N[1],-32($tp)		# tp[j-1]
922	mov	%rdx,$N[1]
923
924	mulq	$m0			# ap[j]*bp[i]
925	add	%rax,$A[1]
926	mov	$m1,%rax
927	mov	-8*1($np),$m1
928	adc	\$0,%rdx
929	add	-8($tp),$A[1]
930	adc	\$0,%rdx
931	mov	%rdx,$A[0]
932
933	mulq	$m1			# np[j]*m1
934	add	%rax,$N[1]
935	mov	($ap,$num),%rax		# ap[0]
936	adc	\$0,%rdx
937	add	$A[1],$N[1]
938	adc	\$0,%rdx
939	mov	$N[0],-24($tp)		# tp[j-1]
940	mov	%rdx,$N[0]
941
942	mov	$N[1],-16($tp)		# tp[j-1]
943	lea	($np,$num),$np		# rewind $np
944
945	xor	$N[1],$N[1]
946	add	$A[0],$N[0]
947	adc	\$0,$N[1]
948	add	($tp),$N[0]		# pull upmost overflow bit
949	adc	\$0,$N[1]		# upmost overflow bit
950	mov	$N[0],-8($tp)
951
952	cmp	16+8(%rsp),$bp
953	jb	.Louter4x
954___
955if (1) {
956$code.=<<___;
957	xor	%rax,%rax
958	sub	$N[0],$m1		# compare top-most words
959	adc	$j,$j			# $j is zero
960	or	$j,$N[1]
961	sub	$N[1],%rax		# %rax=-$N[1]
962	lea	($tp,$num),%rbx		# tptr in .sqr4x_sub
963	mov	($np),%r12
964	lea	($np),%rbp		# nptr in .sqr4x_sub
965	mov	%r9,%rcx
966	sar	\$3+2,%rcx
967	mov	56+8(%rsp),%rdi		# rptr in .sqr4x_sub
968	dec	%r12			# so that after 'not' we get -n[0]
969	xor	%r10,%r10
970	mov	8*1(%rbp),%r13
971	mov	8*2(%rbp),%r14
972	mov	8*3(%rbp),%r15
973	jmp	.Lsqr4x_sub_entry
974___
975} else {
976my @ri=("%rax",$bp,$m0,$m1);
977my $rp="%rdx";
978$code.=<<___
979	xor	\$1,$N[1]
980	lea	($tp,$num),$tp		# rewind $tp
981	sar	\$5,$num		# cf=0
982	lea	($np,$N[1],8),$np
983	mov	56+8(%rsp),$rp		# restore $rp
984	jmp	.Lsub4x
985
986.align	32
987.Lsub4x:
988	.byte	0x66
989	mov	8*0($tp),@ri[0]
990	mov	8*1($tp),@ri[1]
991	.byte	0x66
992	sbb	16*0($np),@ri[0]
993	mov	8*2($tp),@ri[2]
994	sbb	16*1($np),@ri[1]
995	mov	3*8($tp),@ri[3]
996	lea	4*8($tp),$tp
997	sbb	16*2($np),@ri[2]
998	mov	@ri[0],8*0($rp)
999	sbb	16*3($np),@ri[3]
1000	lea	16*4($np),$np
1001	mov	@ri[1],8*1($rp)
1002	mov	@ri[2],8*2($rp)
1003	mov	@ri[3],8*3($rp)
1004	lea	8*4($rp),$rp
1005
1006	inc	$num
1007	jnz	.Lsub4x
1008
1009	ret
1010___
1011}
1012$code.=<<___;
1013.size	mul4x_internal,.-mul4x_internal
1014___
1015}}}
1016{{{
######################################################################
# void bn_power5(
my $rptr="%rdi";	# BN_ULONG *rptr,
my $aptr="%rsi";	# const BN_ULONG *aptr,
my $bptr="%rdx";	# const void *table,
my $nptr="%rcx";	# const BN_ULONG *nptr,
my $n0  ="%r8";		# const BN_ULONG *n0);
my $num ="%r9";		# int num, has to be divisible by 8
			# int pwr

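# The body below amounts to five back-to-back Montgomery squarings followed
# by one Montgomery multiplication with the table entry selected by pwr,
# i.e. roughly the following (an illustrative sketch with hypothetical
# helpers, not the actual C interface):
#
#	$r = $a;
#	for (1 .. 5) {
#	    $r = mont_sqr($r, $n, $n0);		# __bn_sqr8x_internal + post4x
#	}
#	$r = mont_mul($r, gather($table, $pwr), $n, $n0);	# mul4x_internal
#
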
1027my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
1028my @A0=("%r10","%r11");
1029my @A1=("%r12","%r13");
1030my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
1031
1032$code.=<<___;
1033.globl	bn_power5
1034.type	bn_power5,\@function,6
1035.align	32
1036bn_power5:
1037___
1038$code.=<<___ if ($addx);
1039	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
1040	and	\$0x80108,%r11d
1041	cmp	\$0x80108,%r11d		# check for AD*X+BMI2+BMI1
1042	je	.Lpowerx5_enter
1043___
1044$code.=<<___;
1045	mov	%rsp,%rax
1046	push	%rbx
1047	push	%rbp
1048	push	%r12
1049	push	%r13
1050	push	%r14
1051	push	%r15
1052
1053	shl	\$3,${num}d		# convert $num to bytes
1054	lea	($num,$num,2),%r10d	# 3*$num
1055	neg	$num
1056	mov	($n0),$n0		# *n0
1057
	##############################################################
	# Ensure that the stack frame doesn't alias with $rptr+3*$num
	# modulo 4096, which covers ret[num], am[num] and n[num]
	# (see bn_exp.c). This is done to allow the memory disambiguation
	# logic to do its magic. [The extra 256 bytes are for the power
	# mask calculated from the 7th argument, the index.]
	#
1065	lea	-320(%rsp,$num,2),%r11
1066	sub	$rptr,%r11
1067	and	\$4095,%r11
1068	cmp	%r11,%r10
1069	jb	.Lpwr_sp_alt
1070	sub	%r11,%rsp		# align with $aptr
1071	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*num*8+256)
1072	jmp	.Lpwr_sp_done
1073
1074.align	32
1075.Lpwr_sp_alt:
1076	lea	4096-320(,$num,2),%r10
1077	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*num*8+256)
1078	sub	%r10,%r11
1079	mov	\$0,%r10
1080	cmovc	%r10,%r11
1081	sub	%r11,%rsp
1082.Lpwr_sp_done:
1083	and	\$-64,%rsp
1084	mov	%rax,%r11
1085	sub	%rsp,%r11
1086	and	\$-4096,%r11
1087.Lpwr_page_walk:
1088	mov	(%rsp,%r11),%r10
1089	sub	\$4096,%r11
1090	.byte	0x2e			# predict non-taken
1091	jnc	.Lpwr_page_walk
1092
1093	mov	$num,%r10
1094	neg	$num
1095
1096	##############################################################
1097	# Stack layout
1098	#
1099	# +0	saved $num, used in reduction section
1100	# +8	&t[2*$num], used in reduction section
1101	# +32	saved *n0
1102	# +40	saved %rsp
1103	# +48	t[2*$num]
1104	#
1105	mov	$n0,  32(%rsp)
1106	mov	%rax, 40(%rsp)		# save original %rsp
1107.Lpower5_body:
1108	movq	$rptr,%xmm1		# save $rptr, used in sqr8x
1109	movq	$nptr,%xmm2		# save $nptr
1110	movq	%r10, %xmm3		# -$num, used in sqr8x
1111	movq	$bptr,%xmm4
1112
1113	call	__bn_sqr8x_internal
1114	call	__bn_post4x_internal
1115	call	__bn_sqr8x_internal
1116	call	__bn_post4x_internal
1117	call	__bn_sqr8x_internal
1118	call	__bn_post4x_internal
1119	call	__bn_sqr8x_internal
1120	call	__bn_post4x_internal
1121	call	__bn_sqr8x_internal
1122	call	__bn_post4x_internal
1123
1124	movq	%xmm2,$nptr
1125	movq	%xmm4,$bptr
1126	mov	$aptr,$rptr
1127	mov	40(%rsp),%rax
1128	lea	32(%rsp),$n0
1129
1130	call	mul4x_internal
1131
1132	mov	40(%rsp),%rsi		# restore %rsp
1133	mov	\$1,%rax
1134	mov	-48(%rsi),%r15
1135	mov	-40(%rsi),%r14
1136	mov	-32(%rsi),%r13
1137	mov	-24(%rsi),%r12
1138	mov	-16(%rsi),%rbp
1139	mov	-8(%rsi),%rbx
1140	lea	(%rsi),%rsp
1141.Lpower5_epilogue:
1142	ret
1143.size	bn_power5,.-bn_power5
1144
1145.globl	bn_sqr8x_internal
1146.hidden	bn_sqr8x_internal
1147.type	bn_sqr8x_internal,\@abi-omnipotent
1148.align	32
1149bn_sqr8x_internal:
1150__bn_sqr8x_internal:
	##############################################################
	# Squaring part:
	#
	# a) multiply-n-add everything but a[i]*a[i];
	# b) shift result of a) by 1 to the left and accumulate
	#    a[i]*a[i] products;
	#
	##############################################################
1159	#                                                     a[1]a[0]
1160	#                                                 a[2]a[0]
1161	#                                             a[3]a[0]
1162	#                                             a[2]a[1]
1163	#                                         a[4]a[0]
1164	#                                         a[3]a[1]
1165	#                                     a[5]a[0]
1166	#                                     a[4]a[1]
1167	#                                     a[3]a[2]
1168	#                                 a[6]a[0]
1169	#                                 a[5]a[1]
1170	#                                 a[4]a[2]
1171	#                             a[7]a[0]
1172	#                             a[6]a[1]
1173	#                             a[5]a[2]
1174	#                             a[4]a[3]
1175	#                         a[7]a[1]
1176	#                         a[6]a[2]
1177	#                         a[5]a[3]
1178	#                     a[7]a[2]
1179	#                     a[6]a[3]
1180	#                     a[5]a[4]
1181	#                 a[7]a[3]
1182	#                 a[6]a[4]
1183	#             a[7]a[4]
1184	#             a[6]a[5]
1185	#         a[7]a[5]
1186	#     a[7]a[6]
1187	#                                                     a[1]a[0]
1188	#                                                 a[2]a[0]
1189	#                                             a[3]a[0]
1190	#                                         a[4]a[0]
1191	#                                     a[5]a[0]
1192	#                                 a[6]a[0]
1193	#                             a[7]a[0]
1194	#                                             a[2]a[1]
1195	#                                         a[3]a[1]
1196	#                                     a[4]a[1]
1197	#                                 a[5]a[1]
1198	#                             a[6]a[1]
1199	#                         a[7]a[1]
1200	#                                     a[3]a[2]
1201	#                                 a[4]a[2]
1202	#                             a[5]a[2]
1203	#                         a[6]a[2]
1204	#                     a[7]a[2]
1205	#                             a[4]a[3]
1206	#                         a[5]a[3]
1207	#                     a[6]a[3]
1208	#                 a[7]a[3]
1209	#                     a[5]a[4]
1210	#                 a[6]a[4]
1211	#             a[7]a[4]
1212	#             a[6]a[5]
1213	#         a[7]a[5]
1214	#     a[7]a[6]
1215	#                                                         a[0]a[0]
1216	#                                                 a[1]a[1]
1217	#                                         a[2]a[2]
1218	#                                 a[3]a[3]
1219	#                         a[4]a[4]
1220	#                 a[5]a[5]
1221	#         a[6]a[6]
1222	# a[7]a[7]
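	#
	# In other words (an illustrative summary of the two passes, not the
	# generated code): the first pass accumulates every cross product
	# a[i]*a[j], i<j, into t[], and the shift-n-add pass then computes
	# t[] = 2*t[] + sum of a[i]*a[i], so that t[] = a[]^2 without
	# evaluating any cross product twice.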
1223
1224	lea	32(%r10),$i		# $i=-($num-32)
1225	lea	($aptr,$num),$aptr	# end of a[] buffer, ($aptr,$i)=&ap[2]
1226
1227	mov	$num,$j			# $j=$num
1228
1229					# comments apply to $num==8 case
1230	mov	-32($aptr,$i),$a0	# a[0]
1231	lea	48+8(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
1232	mov	-24($aptr,$i),%rax	# a[1]
1233	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
1234	mov	-16($aptr,$i),$ai	# a[2]
1235	mov	%rax,$a1
1236
1237	mul	$a0			# a[1]*a[0]
1238	mov	%rax,$A0[0]		# a[1]*a[0]
1239	 mov	$ai,%rax		# a[2]
1240	mov	%rdx,$A0[1]
1241	mov	$A0[0],-24($tptr,$i)	# t[1]
1242
1243	mul	$a0			# a[2]*a[0]
1244	add	%rax,$A0[1]
1245	 mov	$ai,%rax
1246	adc	\$0,%rdx
1247	mov	$A0[1],-16($tptr,$i)	# t[2]
1248	mov	%rdx,$A0[0]
1249
1250
1251	 mov	-8($aptr,$i),$ai	# a[3]
1252	mul	$a1			# a[2]*a[1]
1253	mov	%rax,$A1[0]		# a[2]*a[1]+t[3]
1254	 mov	$ai,%rax
1255	mov	%rdx,$A1[1]
1256
1257	 lea	($i),$j
1258	mul	$a0			# a[3]*a[0]
1259	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
1260	 mov	$ai,%rax
1261	mov	%rdx,$A0[1]
1262	adc	\$0,$A0[1]
1263	add	$A1[0],$A0[0]
1264	adc	\$0,$A0[1]
1265	mov	$A0[0],-8($tptr,$j)	# t[3]
1266	jmp	.Lsqr4x_1st
1267
1268.align	32
1269.Lsqr4x_1st:
1270	 mov	($aptr,$j),$ai		# a[4]
1271	mul	$a1			# a[3]*a[1]
1272	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
1273	 mov	$ai,%rax
1274	mov	%rdx,$A1[0]
1275	adc	\$0,$A1[0]
1276
1277	mul	$a0			# a[4]*a[0]
1278	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
1279	 mov	$ai,%rax		# a[3]
1280	 mov	8($aptr,$j),$ai		# a[5]
1281	mov	%rdx,$A0[0]
1282	adc	\$0,$A0[0]
1283	add	$A1[1],$A0[1]
1284	adc	\$0,$A0[0]
1285
1286
1287	mul	$a1			# a[4]*a[3]
1288	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
1289	 mov	$ai,%rax
1290	 mov	$A0[1],($tptr,$j)	# t[4]
1291	mov	%rdx,$A1[1]
1292	adc	\$0,$A1[1]
1293
1294	mul	$a0			# a[5]*a[2]
1295	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
1296	 mov	$ai,%rax
1297	 mov	16($aptr,$j),$ai	# a[6]
1298	mov	%rdx,$A0[1]
1299	adc	\$0,$A0[1]
1300	add	$A1[0],$A0[0]
1301	adc	\$0,$A0[1]
1302
1303	mul	$a1			# a[5]*a[3]
1304	add	%rax,$A1[1]		# a[5]*a[3]+t[6]
1305	 mov	$ai,%rax
1306	 mov	$A0[0],8($tptr,$j)	# t[5]
1307	mov	%rdx,$A1[0]
1308	adc	\$0,$A1[0]
1309
1310	mul	$a0			# a[6]*a[2]
1311	add	%rax,$A0[1]		# a[6]*a[2]+a[5]*a[3]+t[6]
1312	 mov	$ai,%rax		# a[3]
1313	 mov	24($aptr,$j),$ai	# a[7]
1314	mov	%rdx,$A0[0]
1315	adc	\$0,$A0[0]
1316	add	$A1[1],$A0[1]
1317	adc	\$0,$A0[0]
1318
1319
1320	mul	$a1			# a[6]*a[5]
1321	add	%rax,$A1[0]		# a[6]*a[5]+t[7]
1322	 mov	$ai,%rax
1323	 mov	$A0[1],16($tptr,$j)	# t[6]
1324	mov	%rdx,$A1[1]
1325	adc	\$0,$A1[1]
1326	 lea	32($j),$j
1327
1328	mul	$a0			# a[7]*a[4]
1329	add	%rax,$A0[0]		# a[7]*a[4]+a[6]*a[5]+t[6]
1330	 mov	$ai,%rax
1331	mov	%rdx,$A0[1]
1332	adc	\$0,$A0[1]
1333	add	$A1[0],$A0[0]
1334	adc	\$0,$A0[1]
1335	mov	$A0[0],-8($tptr,$j)	# t[7]
1336
1337	cmp	\$0,$j
1338	jne	.Lsqr4x_1st
1339
1340	mul	$a1			# a[7]*a[5]
1341	add	%rax,$A1[1]
1342	lea	16($i),$i
1343	adc	\$0,%rdx
1344	add	$A0[1],$A1[1]
1345	adc	\$0,%rdx
1346
1347	mov	$A1[1],($tptr)		# t[8]
1348	mov	%rdx,$A1[0]
1349	mov	%rdx,8($tptr)		# t[9]
1350	jmp	.Lsqr4x_outer
1351
1352.align	32
1353.Lsqr4x_outer:				# comments apply to $num==6 case
1354	mov	-32($aptr,$i),$a0	# a[0]
1355	lea	48+8(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
1356	mov	-24($aptr,$i),%rax	# a[1]
1357	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
1358	mov	-16($aptr,$i),$ai	# a[2]
1359	mov	%rax,$a1
1360
1361	mul	$a0			# a[1]*a[0]
1362	mov	-24($tptr,$i),$A0[0]	# t[1]
1363	add	%rax,$A0[0]		# a[1]*a[0]+t[1]
1364	 mov	$ai,%rax		# a[2]
1365	adc	\$0,%rdx
1366	mov	$A0[0],-24($tptr,$i)	# t[1]
1367	mov	%rdx,$A0[1]
1368
1369	mul	$a0			# a[2]*a[0]
1370	add	%rax,$A0[1]
1371	 mov	$ai,%rax
1372	adc	\$0,%rdx
1373	add	-16($tptr,$i),$A0[1]	# a[2]*a[0]+t[2]
1374	mov	%rdx,$A0[0]
1375	adc	\$0,$A0[0]
1376	mov	$A0[1],-16($tptr,$i)	# t[2]
1377
1378	xor	$A1[0],$A1[0]
1379
1380	 mov	-8($aptr,$i),$ai	# a[3]
1381	mul	$a1			# a[2]*a[1]
1382	add	%rax,$A1[0]		# a[2]*a[1]+t[3]
1383	 mov	$ai,%rax
1384	adc	\$0,%rdx
1385	add	-8($tptr,$i),$A1[0]
1386	mov	%rdx,$A1[1]
1387	adc	\$0,$A1[1]
1388
1389	mul	$a0			# a[3]*a[0]
1390	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
1391	 mov	$ai,%rax
1392	adc	\$0,%rdx
1393	add	$A1[0],$A0[0]
1394	mov	%rdx,$A0[1]
1395	adc	\$0,$A0[1]
1396	mov	$A0[0],-8($tptr,$i)	# t[3]
1397
1398	lea	($i),$j
1399	jmp	.Lsqr4x_inner
1400
1401.align	32
1402.Lsqr4x_inner:
1403	 mov	($aptr,$j),$ai		# a[4]
1404	mul	$a1			# a[3]*a[1]
1405	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
1406	 mov	$ai,%rax
1407	mov	%rdx,$A1[0]
1408	adc	\$0,$A1[0]
1409	add	($tptr,$j),$A1[1]
1410	adc	\$0,$A1[0]
1411
1412	.byte	0x67
1413	mul	$a0			# a[4]*a[0]
1414	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
1415	 mov	$ai,%rax		# a[3]
1416	 mov	8($aptr,$j),$ai		# a[5]
1417	mov	%rdx,$A0[0]
1418	adc	\$0,$A0[0]
1419	add	$A1[1],$A0[1]
1420	adc	\$0,$A0[0]
1421
1422	mul	$a1			# a[4]*a[3]
1423	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
1424	mov	$A0[1],($tptr,$j)	# t[4]
1425	 mov	$ai,%rax
1426	mov	%rdx,$A1[1]
1427	adc	\$0,$A1[1]
1428	add	8($tptr,$j),$A1[0]
1429	lea	16($j),$j		# j++
1430	adc	\$0,$A1[1]
1431
1432	mul	$a0			# a[5]*a[2]
1433	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
1434	 mov	$ai,%rax
1435	adc	\$0,%rdx
1436	add	$A1[0],$A0[0]
1437	mov	%rdx,$A0[1]
1438	adc	\$0,$A0[1]
1439	mov	$A0[0],-8($tptr,$j)	# t[5], "preloaded t[1]" below
1440
1441	cmp	\$0,$j
1442	jne	.Lsqr4x_inner
1443
1444	.byte	0x67
1445	mul	$a1			# a[5]*a[3]
1446	add	%rax,$A1[1]
1447	adc	\$0,%rdx
1448	add	$A0[1],$A1[1]
1449	adc	\$0,%rdx
1450
1451	mov	$A1[1],($tptr)		# t[6], "preloaded t[2]" below
1452	mov	%rdx,$A1[0]
1453	mov	%rdx,8($tptr)		# t[7], "preloaded t[3]" below
1454
1455	add	\$16,$i
1456	jnz	.Lsqr4x_outer
1457
1458					# comments apply to $num==4 case
1459	mov	-32($aptr),$a0		# a[0]
1460	lea	48+8(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
1461	mov	-24($aptr),%rax		# a[1]
1462	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
1463	mov	-16($aptr),$ai		# a[2]
1464	mov	%rax,$a1
1465
1466	mul	$a0			# a[1]*a[0]
1467	add	%rax,$A0[0]		# a[1]*a[0]+t[1], preloaded t[1]
1468	 mov	$ai,%rax		# a[2]
1469	mov	%rdx,$A0[1]
1470	adc	\$0,$A0[1]
1471
1472	mul	$a0			# a[2]*a[0]
1473	add	%rax,$A0[1]
1474	 mov	$ai,%rax
1475	 mov	$A0[0],-24($tptr)	# t[1]
1476	mov	%rdx,$A0[0]
1477	adc	\$0,$A0[0]
1478	add	$A1[1],$A0[1]		# a[2]*a[0]+t[2], preloaded t[2]
1479	 mov	-8($aptr),$ai		# a[3]
1480	adc	\$0,$A0[0]
1481
1482	mul	$a1			# a[2]*a[1]
1483	add	%rax,$A1[0]		# a[2]*a[1]+t[3], preloaded t[3]
1484	 mov	$ai,%rax
1485	 mov	$A0[1],-16($tptr)	# t[2]
1486	mov	%rdx,$A1[1]
1487	adc	\$0,$A1[1]
1488
1489	mul	$a0			# a[3]*a[0]
1490	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
1491	 mov	$ai,%rax
1492	mov	%rdx,$A0[1]
1493	adc	\$0,$A0[1]
1494	add	$A1[0],$A0[0]
1495	adc	\$0,$A0[1]
1496	mov	$A0[0],-8($tptr)	# t[3]
1497
1498	mul	$a1			# a[3]*a[1]
1499	add	%rax,$A1[1]
1500	 mov	-16($aptr),%rax		# a[2]
1501	adc	\$0,%rdx
1502	add	$A0[1],$A1[1]
1503	adc	\$0,%rdx
1504
1505	mov	$A1[1],($tptr)		# t[4]
1506	mov	%rdx,$A1[0]
1507	mov	%rdx,8($tptr)		# t[5]
1508
1509	mul	$ai			# a[2]*a[3]
1510___
1511{
1512my ($shift,$carry)=($a0,$a1);
1513my @S=(@A1,$ai,$n0);
1514$code.=<<___;
1515	 add	\$16,$i
1516	 xor	$shift,$shift
1517	 sub	$num,$i			# $i=16-$num
1518	 xor	$carry,$carry
1519
1520	add	$A1[0],%rax		# t[5]
1521	adc	\$0,%rdx
1522	mov	%rax,8($tptr)		# t[5]
1523	mov	%rdx,16($tptr)		# t[6]
1524	mov	$carry,24($tptr)	# t[7]
1525
1526	 mov	-16($aptr,$i),%rax	# a[0]
1527	lea	48+8(%rsp),$tptr
1528	 xor	$A0[0],$A0[0]		# t[0]
1529	 mov	8($tptr),$A0[1]		# t[1]
1530
1531	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1532	shr	\$63,$A0[0]
1533	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1534	shr	\$63,$A0[1]
1535	or	$A0[0],$S[1]		# | t[2*i]>>63
1536	 mov	16($tptr),$A0[0]	# t[2*i+2]	# prefetch
1537	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1538	mul	%rax			# a[i]*a[i]
1539	neg	$carry			# mov $carry,cf
1540	 mov	24($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1541	adc	%rax,$S[0]
1542	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
1543	mov	$S[0],($tptr)
1544	adc	%rdx,$S[1]
1545
1546	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
1547	 mov	$S[1],8($tptr)
1548	 sbb	$carry,$carry		# mov cf,$carry
1549	shr	\$63,$A0[0]
1550	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1551	shr	\$63,$A0[1]
1552	or	$A0[0],$S[3]		# | t[2*i]>>63
1553	 mov	32($tptr),$A0[0]	# t[2*i+2]	# prefetch
1554	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1555	mul	%rax			# a[i]*a[i]
1556	neg	$carry			# mov $carry,cf
1557	 mov	40($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1558	adc	%rax,$S[2]
1559	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
1560	mov	$S[2],16($tptr)
1561	adc	%rdx,$S[3]
1562	lea	16($i),$i
1563	mov	$S[3],24($tptr)
1564	sbb	$carry,$carry		# mov cf,$carry
1565	lea	64($tptr),$tptr
1566	jmp	.Lsqr4x_shift_n_add
1567
1568.align	32
1569.Lsqr4x_shift_n_add:
1570	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1571	shr	\$63,$A0[0]
1572	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1573	shr	\$63,$A0[1]
1574	or	$A0[0],$S[1]		# | t[2*i]>>63
1575	 mov	-16($tptr),$A0[0]	# t[2*i+2]	# prefetch
1576	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1577	mul	%rax			# a[i]*a[i]
1578	neg	$carry			# mov $carry,cf
1579	 mov	-8($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1580	adc	%rax,$S[0]
1581	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
1582	mov	$S[0],-32($tptr)
1583	adc	%rdx,$S[1]
1584
1585	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
1586	 mov	$S[1],-24($tptr)
1587	 sbb	$carry,$carry		# mov cf,$carry
1588	shr	\$63,$A0[0]
1589	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1590	shr	\$63,$A0[1]
1591	or	$A0[0],$S[3]		# | t[2*i]>>63
1592	 mov	0($tptr),$A0[0]		# t[2*i+2]	# prefetch
1593	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1594	mul	%rax			# a[i]*a[i]
1595	neg	$carry			# mov $carry,cf
1596	 mov	8($tptr),$A0[1]		# t[2*i+2+1]	# prefetch
1597	adc	%rax,$S[2]
1598	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
1599	mov	$S[2],-16($tptr)
1600	adc	%rdx,$S[3]
1601
1602	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1603	 mov	$S[3],-8($tptr)
1604	 sbb	$carry,$carry		# mov cf,$carry
1605	shr	\$63,$A0[0]
1606	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1607	shr	\$63,$A0[1]
1608	or	$A0[0],$S[1]		# | t[2*i]>>63
1609	 mov	16($tptr),$A0[0]	# t[2*i+2]	# prefetch
1610	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1611	mul	%rax			# a[i]*a[i]
1612	neg	$carry			# mov $carry,cf
1613	 mov	24($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1614	adc	%rax,$S[0]
1615	 mov	8($aptr,$i),%rax	# a[i+1]	# prefetch
1616	mov	$S[0],0($tptr)
1617	adc	%rdx,$S[1]
1618
1619	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
1620	 mov	$S[1],8($tptr)
1621	 sbb	$carry,$carry		# mov cf,$carry
1622	shr	\$63,$A0[0]
1623	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1624	shr	\$63,$A0[1]
1625	or	$A0[0],$S[3]		# | t[2*i]>>63
1626	 mov	32($tptr),$A0[0]	# t[2*i+2]	# prefetch
1627	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1628	mul	%rax			# a[i]*a[i]
1629	neg	$carry			# mov $carry,cf
1630	 mov	40($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1631	adc	%rax,$S[2]
1632	 mov	16($aptr,$i),%rax	# a[i+1]	# prefetch
1633	mov	$S[2],16($tptr)
1634	adc	%rdx,$S[3]
1635	mov	$S[3],24($tptr)
1636	sbb	$carry,$carry		# mov cf,$carry
1637	lea	64($tptr),$tptr
1638	add	\$32,$i
1639	jnz	.Lsqr4x_shift_n_add
1640
1641	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1642	.byte	0x67
1643	shr	\$63,$A0[0]
1644	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1645	shr	\$63,$A0[1]
1646	or	$A0[0],$S[1]		# | t[2*i]>>63
1647	 mov	-16($tptr),$A0[0]	# t[2*i+2]	# prefetch
1648	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1649	mul	%rax			# a[i]*a[i]
1650	neg	$carry			# mov $carry,cf
1651	 mov	-8($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1652	adc	%rax,$S[0]
1653	 mov	-8($aptr),%rax		# a[i+1]	# prefetch
1654	mov	$S[0],-32($tptr)
1655	adc	%rdx,$S[1]
1656
1657	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1|shift
1658	 mov	$S[1],-24($tptr)
1659	 sbb	$carry,$carry		# mov cf,$carry
1660	shr	\$63,$A0[0]
1661	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1662	shr	\$63,$A0[1]
1663	or	$A0[0],$S[3]		# | t[2*i]>>63
1664	mul	%rax			# a[i]*a[i]
1665	neg	$carry			# mov $carry,cf
1666	adc	%rax,$S[2]
1667	adc	%rdx,$S[3]
1668	mov	$S[2],-16($tptr)
1669	mov	$S[3],-8($tptr)
1670___
1671}
######################################################################
# Montgomery reduction part, "word-by-word" algorithm.
#
# This new path is inspired by multiple submissions from Intel, by
# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
# Vinodh Gopal...
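#
# A limb-level model of the reduction (illustrative only; the real code
# processes 8 limbs per iteration, keeps them in registers and would need
# wider-than-64-bit arithmetic, e.g. Math::BigInt, to run as plain Perl):
#
#	for (my $i = 0; $i < $num; $i++) {
#	    my $m = ($t[$i] * $n0) & $MASK;	# $n0 = -n[0]^-1 mod 2^64
#	    my $carry = 0;
#	    for (my $j = 0; $j < $num; $j++) {
#		my $s = $t[$i+$j] + $m*$n[$j] + $carry;
#		$t[$i+$j] = $s & $MASK;
#		$carry    = $s >> 64;
#	    }
#	    $t[$i+$num] += $carry;		# top-most carry, kept separately
#	}					# t[num..2*num-1] is the result,
#						# modulo one final subtraction of n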
1678{
1679my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx");
1680
1681$code.=<<___;
1682	movq	%xmm2,$nptr
1683__bn_sqr8x_reduction:
1684	xor	%rax,%rax
1685	lea	($nptr,$num),%rcx	# end of n[]
1686	lea	48+8(%rsp,$num,2),%rdx	# end of t[] buffer
1687	mov	%rcx,0+8(%rsp)
1688	lea	48+8(%rsp,$num),$tptr	# end of initial t[] window
1689	mov	%rdx,8+8(%rsp)
1690	neg	$num
1691	jmp	.L8x_reduction_loop
1692
1693.align	32
1694.L8x_reduction_loop:
1695	lea	($tptr,$num),$tptr	# start of current t[] window
1696	.byte	0x66
1697	mov	8*0($tptr),$m0
1698	mov	8*1($tptr),%r9
1699	mov	8*2($tptr),%r10
1700	mov	8*3($tptr),%r11
1701	mov	8*4($tptr),%r12
1702	mov	8*5($tptr),%r13
1703	mov	8*6($tptr),%r14
1704	mov	8*7($tptr),%r15
1705	mov	%rax,(%rdx)		# store top-most carry bit
1706	lea	8*8($tptr),$tptr
1707
1708	.byte	0x67
1709	mov	$m0,%r8
1710	imulq	32+8(%rsp),$m0		# n0*a[0]
1711	mov	8*0($nptr),%rax		# n[0]
1712	mov	\$8,%ecx
1713	jmp	.L8x_reduce
1714
1715.align	32
1716.L8x_reduce:
1717	mulq	$m0
1718	 mov	8*1($nptr),%rax		# n[1]
1719	neg	%r8
1720	mov	%rdx,%r8
1721	adc	\$0,%r8
1722
1723	mulq	$m0
1724	add	%rax,%r9
1725	 mov	8*2($nptr),%rax
1726	adc	\$0,%rdx
1727	add	%r9,%r8
1728	 mov	$m0,48-8+8(%rsp,%rcx,8)	# put aside n0*a[i]
1729	mov	%rdx,%r9
1730	adc	\$0,%r9
1731
1732	mulq	$m0
1733	add	%rax,%r10
1734	 mov	8*3($nptr),%rax
1735	adc	\$0,%rdx
1736	add	%r10,%r9
1737	 mov	32+8(%rsp),$carry	# pull n0, borrow $carry
1738	mov	%rdx,%r10
1739	adc	\$0,%r10
1740
1741	mulq	$m0
1742	add	%rax,%r11
1743	 mov	8*4($nptr),%rax
1744	adc	\$0,%rdx
1745	 imulq	%r8,$carry		# modulo-scheduled
1746	add	%r11,%r10
1747	mov	%rdx,%r11
1748	adc	\$0,%r11
1749
1750	mulq	$m0
1751	add	%rax,%r12
1752	 mov	8*5($nptr),%rax
1753	adc	\$0,%rdx
1754	add	%r12,%r11
1755	mov	%rdx,%r12
1756	adc	\$0,%r12
1757
1758	mulq	$m0
1759	add	%rax,%r13
1760	 mov	8*6($nptr),%rax
1761	adc	\$0,%rdx
1762	add	%r13,%r12
1763	mov	%rdx,%r13
1764	adc	\$0,%r13
1765
1766	mulq	$m0
1767	add	%rax,%r14
1768	 mov	8*7($nptr),%rax
1769	adc	\$0,%rdx
1770	add	%r14,%r13
1771	mov	%rdx,%r14
1772	adc	\$0,%r14
1773
1774	mulq	$m0
1775	 mov	$carry,$m0		# n0*a[i]
1776	add	%rax,%r15
1777	 mov	8*0($nptr),%rax		# n[0]
1778	adc	\$0,%rdx
1779	add	%r15,%r14
1780	mov	%rdx,%r15
1781	adc	\$0,%r15
1782
1783	dec	%ecx
1784	jnz	.L8x_reduce
1785
1786	lea	8*8($nptr),$nptr
1787	xor	%rax,%rax
1788	mov	8+8(%rsp),%rdx		# pull end of t[]
1789	cmp	0+8(%rsp),$nptr		# end of n[]?
1790	jae	.L8x_no_tail
1791
1792	.byte	0x66
1793	add	8*0($tptr),%r8
1794	adc	8*1($tptr),%r9
1795	adc	8*2($tptr),%r10
1796	adc	8*3($tptr),%r11
1797	adc	8*4($tptr),%r12
1798	adc	8*5($tptr),%r13
1799	adc	8*6($tptr),%r14
1800	adc	8*7($tptr),%r15
1801	sbb	$carry,$carry		# top carry
1802
1803	mov	48+56+8(%rsp),$m0	# pull n0*a[0]
1804	mov	\$8,%ecx
1805	mov	8*0($nptr),%rax
1806	jmp	.L8x_tail
1807
1808.align	32
1809.L8x_tail:
1810	mulq	$m0
1811	add	%rax,%r8
1812	 mov	8*1($nptr),%rax
1813	 mov	%r8,($tptr)		# save result
1814	mov	%rdx,%r8
1815	adc	\$0,%r8
1816
1817	mulq	$m0
1818	add	%rax,%r9
1819	 mov	8*2($nptr),%rax
1820	adc	\$0,%rdx
1821	add	%r9,%r8
1822	 lea	8($tptr),$tptr		# $tptr++
1823	mov	%rdx,%r9
1824	adc	\$0,%r9
1825
1826	mulq	$m0
1827	add	%rax,%r10
1828	 mov	8*3($nptr),%rax
1829	adc	\$0,%rdx
1830	add	%r10,%r9
1831	mov	%rdx,%r10
1832	adc	\$0,%r10
1833
1834	mulq	$m0
1835	add	%rax,%r11
1836	 mov	8*4($nptr),%rax
1837	adc	\$0,%rdx
1838	add	%r11,%r10
1839	mov	%rdx,%r11
1840	adc	\$0,%r11
1841
1842	mulq	$m0
1843	add	%rax,%r12
1844	 mov	8*5($nptr),%rax
1845	adc	\$0,%rdx
1846	add	%r12,%r11
1847	mov	%rdx,%r12
1848	adc	\$0,%r12
1849
1850	mulq	$m0
1851	add	%rax,%r13
1852	 mov	8*6($nptr),%rax
1853	adc	\$0,%rdx
1854	add	%r13,%r12
1855	mov	%rdx,%r13
1856	adc	\$0,%r13
1857
1858	mulq	$m0
1859	add	%rax,%r14
1860	 mov	8*7($nptr),%rax
1861	adc	\$0,%rdx
1862	add	%r14,%r13
1863	mov	%rdx,%r14
1864	adc	\$0,%r14
1865
1866	mulq	$m0
1867	 mov	48-16+8(%rsp,%rcx,8),$m0# pull n0*a[i]
1868	add	%rax,%r15
1869	adc	\$0,%rdx
1870	add	%r15,%r14
1871	 mov	8*0($nptr),%rax		# pull n[0]
1872	mov	%rdx,%r15
1873	adc	\$0,%r15
1874
1875	dec	%ecx
1876	jnz	.L8x_tail
1877
1878	lea	8*8($nptr),$nptr
1879	mov	8+8(%rsp),%rdx		# pull end of t[]
1880	cmp	0+8(%rsp),$nptr		# end of n[]?
1881	jae	.L8x_tail_done		# break out of loop
1882
1883	 mov	48+56+8(%rsp),$m0	# pull n0*a[0]
1884	neg	$carry
1885	 mov	8*0($nptr),%rax		# pull n[0]
1886	adc	8*0($tptr),%r8
1887	adc	8*1($tptr),%r9
1888	adc	8*2($tptr),%r10
1889	adc	8*3($tptr),%r11
1890	adc	8*4($tptr),%r12
1891	adc	8*5($tptr),%r13
1892	adc	8*6($tptr),%r14
1893	adc	8*7($tptr),%r15
1894	sbb	$carry,$carry		# top carry
1895
1896	mov	\$8,%ecx
1897	jmp	.L8x_tail
1898
1899.align	32
1900.L8x_tail_done:
1901	add	(%rdx),%r8		# can this overflow?
1902	adc	\$0,%r9
1903	adc	\$0,%r10
1904	adc	\$0,%r11
1905	adc	\$0,%r12
1906	adc	\$0,%r13
1907	adc	\$0,%r14
1908	adc	\$0,%r15		# can't overflow, because we
1909					# started with "overhung" part
1910					# of multiplication
1911	xor	%rax,%rax
1912
1913	neg	$carry
1914.L8x_no_tail:
1915	adc	8*0($tptr),%r8
1916	adc	8*1($tptr),%r9
1917	adc	8*2($tptr),%r10
1918	adc	8*3($tptr),%r11
1919	adc	8*4($tptr),%r12
1920	adc	8*5($tptr),%r13
1921	adc	8*6($tptr),%r14
1922	adc	8*7($tptr),%r15
1923	adc	\$0,%rax		# top-most carry
1924	 mov	-8($nptr),%rcx		# np[num-1]
1925	 xor	$carry,$carry
1926
1927	movq	%xmm2,$nptr		# restore $nptr
1928
1929	mov	%r8,8*0($tptr)		# store top 512 bits
1930	mov	%r9,8*1($tptr)
1931	 movq	%xmm3,$num		# $num is %r9, can't be moved upwards
1932	mov	%r10,8*2($tptr)
1933	mov	%r11,8*3($tptr)
1934	mov	%r12,8*4($tptr)
1935	mov	%r13,8*5($tptr)
1936	mov	%r14,8*6($tptr)
1937	mov	%r15,8*7($tptr)
1938	lea	8*8($tptr),$tptr
1939
1940	cmp	%rdx,$tptr		# end of t[]?
1941	jb	.L8x_reduction_loop
1942	ret
1943.size	bn_sqr8x_internal,.-bn_sqr8x_internal
1944___
1945}
##############################################################
# Post-condition, 4x unrolled
#
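# Conceptually (an illustrative sketch, not the exact register protocol):
# with a mask that is all ones when the reduced value still needs the
# modulus subtracted and zero otherwise, the loop below computes
# rp[j] = t[j] - (n[j] & mask) with an ordinary borrow chain, so the same
# memory is read and written regardless of the comparison result:
#
#	my $mask = $need_sub ? ~0 : 0;
#	my $borrow = 0;
#	for (my $j = 0; $j < $num; $j++) {
#	    my $d = $t[$j] - ($n[$j] & $mask) - $borrow;
#	    $borrow = $d < 0 ? 1 : 0;		# plays the role of the carry flag
#	    $rp[$j] = $d % 2**64;		# needs >64-bit ints to be exact
#	}
#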
1949{
1950my ($tptr,$nptr)=("%rbx","%rbp");
1951$code.=<<___;
1952.type	__bn_post4x_internal,\@abi-omnipotent
1953.align	32
1954__bn_post4x_internal:
1955	mov	8*0($nptr),%r12
1956	lea	(%rdi,$num),$tptr	# %rdi was $tptr above
1957	mov	$num,%rcx
1958	movq	%xmm1,$rptr		# restore $rptr
1959	neg	%rax
1960	movq	%xmm1,$aptr		# prepare for back-to-back call
1961	sar	\$3+2,%rcx
1962	dec	%r12			# so that after 'not' we get -n[0]
1963	xor	%r10,%r10
1964	mov	8*1($nptr),%r13
1965	mov	8*2($nptr),%r14
1966	mov	8*3($nptr),%r15
1967	jmp	.Lsqr4x_sub_entry
1968
1969.align	16
1970.Lsqr4x_sub:
1971	mov	8*0($nptr),%r12
1972	mov	8*1($nptr),%r13
1973	mov	8*2($nptr),%r14
1974	mov	8*3($nptr),%r15
1975.Lsqr4x_sub_entry:
1976	lea	8*4($nptr),$nptr
1977	not	%r12
1978	not	%r13
1979	not	%r14
1980	not	%r15
1981	and	%rax,%r12
1982	and	%rax,%r13
1983	and	%rax,%r14
1984	and	%rax,%r15
1985
1986	neg	%r10			# mov %r10,%cf
1987	adc	8*0($tptr),%r12
1988	adc	8*1($tptr),%r13
1989	adc	8*2($tptr),%r14
1990	adc	8*3($tptr),%r15
1991	mov	%r12,8*0($rptr)
1992	lea	8*4($tptr),$tptr
1993	mov	%r13,8*1($rptr)
1994	sbb	%r10,%r10		# mov %cf,%r10
1995	mov	%r14,8*2($rptr)
1996	mov	%r15,8*3($rptr)
1997	lea	8*4($rptr),$rptr
1998
1999	inc	%rcx			# pass %cf
2000	jnz	.Lsqr4x_sub
2001
2002	mov	$num,%r10		# prepare for back-to-back call
2003	neg	$num			# restore $num
2004	ret
2005.size	__bn_post4x_internal,.-__bn_post4x_internal
2006___
2007}
2008{
2009$code.=<<___;
2010.globl	bn_from_montgomery
2011.type	bn_from_montgomery,\@abi-omnipotent
2012.align	32
2013bn_from_montgomery:
2014	testl	\$7,`($win64?"48(%rsp)":"%r9d")`
2015	jz	bn_from_mont8x
2016	xor	%eax,%eax
2017	ret
2018.size	bn_from_montgomery,.-bn_from_montgomery
2019
2020.type	bn_from_mont8x,\@function,6
2021.align	32
2022bn_from_mont8x:
2023	.byte	0x67
2024	mov	%rsp,%rax
2025	push	%rbx
2026	push	%rbp
2027	push	%r12
2028	push	%r13
2029	push	%r14
2030	push	%r15
2031
2032	shl	\$3,${num}d		# convert $num to bytes
2033	lea	($num,$num,2),%r10	# 3*$num in bytes
2034	neg	$num
2035	mov	($n0),$n0		# *n0
2036
	##############################################################
	# Ensure that the stack frame doesn't alias with $rptr+3*$num
	# modulo 4096, which covers ret[num], am[num] and n[num]
	# (see bn_exp.c). The stack is allocated so as to align with
	# bn_power5's frame, and since bn_from_montgomery happens to be
	# the last operation, we use the opportunity to cleanse it.
	#
2044	lea	-320(%rsp,$num,2),%r11
2045	sub	$rptr,%r11
2046	and	\$4095,%r11
2047	cmp	%r11,%r10
2048	jb	.Lfrom_sp_alt
2049	sub	%r11,%rsp		# align with $aptr
2050	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*$num*8+256)
2051	jmp	.Lfrom_sp_done
2052
2053.align	32
2054.Lfrom_sp_alt:
2055	lea	4096-320(,$num,2),%r10
2056	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*$num*8+256)
2057	sub	%r10,%r11
2058	mov	\$0,%r10
2059	cmovc	%r10,%r11
2060	sub	%r11,%rsp
2061.Lfrom_sp_done:
2062	and	\$-64,%rsp
2063	mov	%rax,%r11
2064	sub	%rsp,%r11
2065	and	\$-4096,%r11
2066.Lfrom_page_walk:
2067	mov	(%rsp,%r11),%r10
2068	sub	\$4096,%r11
2069	.byte	0x2e			# predict non-taken
2070	jnc	.Lfrom_page_walk
2071
2072	mov	$num,%r10
2073	neg	$num
2074
2075	##############################################################
2076	# Stack layout
2077	#
2078	# +0	saved $num, used in reduction section
2079	# +8	&t[2*$num], used in reduction section
2080	# +32	saved *n0
2081	# +40	saved %rsp
2082	# +48	t[2*$num]
2083	#
2084	mov	$n0,  32(%rsp)
2085	mov	%rax, 40(%rsp)		# save original %rsp
2086.Lfrom_body:
2087	mov	$num,%r11
2088	lea	48(%rsp),%rax
2089	pxor	%xmm0,%xmm0
2090	jmp	.Lmul_by_1
2091
2092.align	32
2093.Lmul_by_1:
2094	movdqu	($aptr),%xmm1
2095	movdqu	16($aptr),%xmm2
2096	movdqu	32($aptr),%xmm3
2097	movdqa	%xmm0,(%rax,$num)
2098	movdqu	48($aptr),%xmm4
2099	movdqa	%xmm0,16(%rax,$num)
2100	.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00	# lea	64($aptr),$aptr
2101	movdqa	%xmm1,(%rax)
2102	movdqa	%xmm0,32(%rax,$num)
2103	movdqa	%xmm2,16(%rax)
2104	movdqa	%xmm0,48(%rax,$num)
2105	movdqa	%xmm3,32(%rax)
2106	movdqa	%xmm4,48(%rax)
2107	lea	64(%rax),%rax
2108	sub	\$64,%r11
2109	jnz	.Lmul_by_1
2110
2111	movq	$rptr,%xmm1
2112	movq	$nptr,%xmm2
2113	.byte	0x67
2114	mov	$nptr,%rbp
2115	movq	%r10, %xmm3		# -num
2116___
2117$code.=<<___ if ($addx);
2118	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
2119	and	\$0x80108,%r11d
2120	cmp	\$0x80108,%r11d		# check for AD*X+BMI2+BMI1
2121	jne	.Lfrom_mont_nox
2122
2123	lea	(%rax,$num),$rptr
2124	call	__bn_sqrx8x_reduction
2125	call	__bn_postx4x_internal
2126
2127	pxor	%xmm0,%xmm0
2128	lea	48(%rsp),%rax
2129	mov	40(%rsp),%rsi		# restore %rsp
2130	jmp	.Lfrom_mont_zero
2131
2132.align	32
2133.Lfrom_mont_nox:
2134___
2135$code.=<<___;
2136	call	__bn_sqr8x_reduction
2137	call	__bn_post4x_internal
2138
2139	pxor	%xmm0,%xmm0
2140	lea	48(%rsp),%rax
2141	mov	40(%rsp),%rsi		# restore %rsp
2142	jmp	.Lfrom_mont_zero
2143
2144.align	32
2145.Lfrom_mont_zero:
2146	movdqa	%xmm0,16*0(%rax)
2147	movdqa	%xmm0,16*1(%rax)
2148	movdqa	%xmm0,16*2(%rax)
2149	movdqa	%xmm0,16*3(%rax)
2150	lea	16*4(%rax),%rax
2151	sub	\$32,$num
2152	jnz	.Lfrom_mont_zero
2153
2154	mov	\$1,%rax
2155	mov	-48(%rsi),%r15
2156	mov	-40(%rsi),%r14
2157	mov	-32(%rsi),%r13
2158	mov	-24(%rsi),%r12
2159	mov	-16(%rsi),%rbp
2160	mov	-8(%rsi),%rbx
2161	lea	(%rsi),%rsp
2162.Lfrom_epilogue:
2163	ret
2164.size	bn_from_mont8x,.-bn_from_mont8x
2165___
2166}
2167}}}
2168
2169if ($addx) {{{
2170my $bp="%rdx";	# restore original value
2171
2172$code.=<<___;
2173.type	bn_mulx4x_mont_gather5,\@function,6
2174.align	32
2175bn_mulx4x_mont_gather5:
2176.Lmulx4x_enter:
2177	mov	%rsp,%rax
2178	push	%rbx
2179	push	%rbp
2180	push	%r12
2181	push	%r13
2182	push	%r14
2183	push	%r15
2184
2185	shl	\$3,${num}d		# convert $num to bytes
2186	lea	($num,$num,2),%r10	# 3*$num in bytes
2187	neg	$num			# -$num
2188	mov	($n0),$n0		# *n0
2189
2190	##############################################################
2191	# Ensure that stack frame doesn't alias with $rptr+3*$num
2192	# modulo 4096, which covers ret[num], am[num] and n[num]
2193	# (see bn_exp.c). This is done to allow memory disambiguation
2194	# logic to do its magic. [An extra [num] is allocated in order
2195	# to align with bn_power5's frame, which is cleansed after
2196	# completing exponentiation. An extra 256 bytes is for the power
2197	# mask calculated from the 7th argument, the index.]
2198	#
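	# [The 320 below is the 64-byte save area at the bottom of the
	# frame (offsets +0..+56 in the layout further down) plus those
	# 256 mask bytes, so the allocation is roughly 64 + 2*num*8 + 256
	# bytes before 64-byte alignment; e.g. about 448 bytes for eight
	# 64-bit limbs.]
	#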
2199	lea	-320(%rsp,$num,2),%r11
2200	sub	$rp,%r11
2201	and	\$4095,%r11
2202	cmp	%r11,%r10
2203	jb	.Lmulx4xsp_alt
2204	sub	%r11,%rsp		# align with $aptr
2205	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*$num*8+256)
2206	jmp	.Lmulx4xsp_done
2207
2208.Lmulx4xsp_alt:
2209	lea	4096-320(,$num,2),%r10
2210	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*$num*8+256)
2211	sub	%r10,%r11
2212	mov	\$0,%r10
2213	cmovc	%r10,%r11
2214	sub	%r11,%rsp
2215.Lmulx4xsp_done:
2216	and	\$-64,%rsp		# ensure alignment
2217	mov	%rax,%r11
2218	sub	%rsp,%r11
2219	and	\$-4096,%r11
2220.Lmulx4x_page_walk:
2221	mov	(%rsp,%r11),%r10
2222	sub	\$4096,%r11
2223	.byte	0x2e			# predict non-taken
2224	jnc	.Lmulx4x_page_walk
2225
2226	##############################################################
2227	# Stack layout
2228	# +0	-num
2229	# +8	off-loaded &b[i]
2230	# +16	end of b[num]
2231	# +24	inner counter
2232	# +32	saved n0
2233	# +40	saved %rsp
2234	# +48
2235	# +56	saved rp
2236	# +64	tmp[num+1]
2237	#
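	# mulx4x_internal below is, in essence, a 4-way unrolled variant of
	# word-serial Montgomery multiplication.  A minimal C sketch of the
	# algorithm it follows (hedged: mul_add is an illustrative helper
	# and the top-most carry handling is omitted):
	#
	#	for (i = 0; i < num; i++) {		/* one b word per pass  */
	#		carry = 0;
	#		for (j = 0; j < num; j++)	/* t += a[j]*b[i]       */
	#			t[j] = mul_add(t[j], a[j], b[i], &carry);
	#		t[num] += carry;
	#		m = t[0] * n0;			/* reduction multiplier */
	#		carry = 0;
	#		for (j = 0; j < num; j++)	/* t += n[j]*m, then    */
	#			t[j] = mul_add(t[j], n[j], m, &carry);
	#		t[num] += carry;		/* shift the whole t[]  */
	#		memmove(t, t+1, num*8);		/* down by one word     */
	#	}
	#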
2238	mov	$n0, 32(%rsp)		# save *n0
2239	mov	%rax,40(%rsp)		# save original %rsp
2240.Lmulx4x_body:
2241	call	mulx4x_internal
2242
2243	mov	40(%rsp),%rsi		# restore %rsp
2244	mov	\$1,%rax
2245
2246	mov	-48(%rsi),%r15
2247	mov	-40(%rsi),%r14
2248	mov	-32(%rsi),%r13
2249	mov	-24(%rsi),%r12
2250	mov	-16(%rsi),%rbp
2251	mov	-8(%rsi),%rbx
2252	lea	(%rsi),%rsp
2253.Lmulx4x_epilogue:
2254	ret
2255.size	bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
2256
2257.type	mulx4x_internal,\@abi-omnipotent
2258.align	32
2259mulx4x_internal:
2260	mov	$num,8(%rsp)		# save -$num (it was in bytes)
2261	mov	$num,%r10
2262	neg	$num			# restore $num
2263	shl	\$5,$num
2264	neg	%r10			# restore $num
2265	lea	128($bp,$num),%r13	# end of powers table (+size optimization)
2266	shr	\$5+5,$num
2267	movd	`($win64?56:8)`(%rax),%xmm5	# load 7th argument
2268	sub	\$1,$num
2269	lea	.Linc(%rip),%rax
2270	mov	%r13,16+8(%rsp)		# end of b[num]
2271	mov	$num,24+8(%rsp)		# inner counter
2272	mov	$rp, 56+8(%rsp)		# save $rp
2273___
2274my ($aptr, $bptr, $nptr, $tptr, $mi,  $bi,  $zero, $num)=
2275   ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
2276my $rptr=$bptr;
2277my $STRIDE=2**5*8;		# 5 is "window size"
2278my $N=$STRIDE/4;		# should match cache line size
2279$code.=<<___;
2280	movdqa	0(%rax),%xmm0		# 00000001000000010000000000000000
2281	movdqa	16(%rax),%xmm1		# 00000002000000020000000200000002
2282	lea	88-112(%rsp,%r10),%r10	# place the mask after tp[num+1] (+ICache optimization)
2283	lea	128($bp),$bptr		# size optimization
2284
2285	pshufd	\$0,%xmm5,%xmm5		# broadcast index
2286	movdqa	%xmm1,%xmm4
2287	.byte	0x67
2288	movdqa	%xmm1,%xmm2
2289___
2290########################################################################
2291# calculate mask by comparing 0..31 to index and save result to stack
2292#
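# In scalar terms the comparisons below build a 256-byte mask table,
#
#	for (k = 0; k < 32; k++)
#		mask[k] = (k == idx) ? ~(u64)0 : 0;
#
# (the code does this four 32-bit lanes at a time), so that each word of
# the selected power can later be assembled with AND/OR instead of a
# secret-dependent load.  This is a sketch of the intent, not the literal
# vector code.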
2293$code.=<<___;
2294	.byte	0x67
2295	paddd	%xmm0,%xmm1
2296	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
2297	movdqa	%xmm4,%xmm3
2298___
2299for($i=0;$i<$STRIDE/16-4;$i+=4) {
2300$code.=<<___;
2301	paddd	%xmm1,%xmm2
2302	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
2303	movdqa	%xmm0,`16*($i+0)+112`(%r10)
2304	movdqa	%xmm4,%xmm0
2305
2306	paddd	%xmm2,%xmm3
2307	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
2308	movdqa	%xmm1,`16*($i+1)+112`(%r10)
2309	movdqa	%xmm4,%xmm1
2310
2311	paddd	%xmm3,%xmm0
2312	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
2313	movdqa	%xmm2,`16*($i+2)+112`(%r10)
2314	movdqa	%xmm4,%xmm2
2315
2316	paddd	%xmm0,%xmm1
2317	pcmpeqd	%xmm5,%xmm0
2318	movdqa	%xmm3,`16*($i+3)+112`(%r10)
2319	movdqa	%xmm4,%xmm3
2320___
2321}
2322$code.=<<___;				# last iteration can be optimized
2323	.byte	0x67
2324	paddd	%xmm1,%xmm2
2325	pcmpeqd	%xmm5,%xmm1
2326	movdqa	%xmm0,`16*($i+0)+112`(%r10)
2327
2328	paddd	%xmm2,%xmm3
2329	pcmpeqd	%xmm5,%xmm2
2330	movdqa	%xmm1,`16*($i+1)+112`(%r10)
2331
2332	pcmpeqd	%xmm5,%xmm3
2333	movdqa	%xmm2,`16*($i+2)+112`(%r10)
2334
2335	pand	`16*($i+0)-128`($bptr),%xmm0	# while it's still in register
2336	pand	`16*($i+1)-128`($bptr),%xmm1
2337	pand	`16*($i+2)-128`($bptr),%xmm2
2338	movdqa	%xmm3,`16*($i+3)+112`(%r10)
2339	pand	`16*($i+3)-128`($bptr),%xmm3
2340	por	%xmm2,%xmm0
2341	por	%xmm3,%xmm1
2342___
2343for($i=0;$i<$STRIDE/16-4;$i+=4) {
2344$code.=<<___;
2345	movdqa	`16*($i+0)-128`($bptr),%xmm4
2346	movdqa	`16*($i+1)-128`($bptr),%xmm5
2347	movdqa	`16*($i+2)-128`($bptr),%xmm2
2348	pand	`16*($i+0)+112`(%r10),%xmm4
2349	movdqa	`16*($i+3)-128`($bptr),%xmm3
2350	pand	`16*($i+1)+112`(%r10),%xmm5
2351	por	%xmm4,%xmm0
2352	pand	`16*($i+2)+112`(%r10),%xmm2
2353	por	%xmm5,%xmm1
2354	pand	`16*($i+3)+112`(%r10),%xmm3
2355	por	%xmm2,%xmm0
2356	por	%xmm3,%xmm1
2357___
2358}
2359$code.=<<___;
2360	pxor	%xmm1,%xmm0
2361	pshufd	\$0x4e,%xmm0,%xmm1
2362	por	%xmm1,%xmm0
2363	lea	$STRIDE($bptr),$bptr
2364	movq	%xmm0,%rdx		# bp[0]
2365	lea	64+8*4+8(%rsp),$tptr
2366
2367	mov	%rdx,$bi
2368	mulx	0*8($aptr),$mi,%rax	# a[0]*b[0]
2369	mulx	1*8($aptr),%r11,%r12	# a[1]*b[0]
2370	add	%rax,%r11
2371	mulx	2*8($aptr),%rax,%r13	# ...
2372	adc	%rax,%r12
2373	adc	\$0,%r13
2374	mulx	3*8($aptr),%rax,%r14
2375
2376	mov	$mi,%r15
2377	imulq	32+8(%rsp),$mi		# "t[0]"*n0
2378	xor	$zero,$zero		# cf=0, of=0
2379	mov	$mi,%rdx
2380
2381	mov	$bptr,8+8(%rsp)		# off-load &b[i]
2382
2383	lea	4*8($aptr),$aptr
2384	adcx	%rax,%r13
2385	adcx	$zero,%r14		# cf=0
2386
2387	mulx	0*8($nptr),%rax,%r10
2388	adcx	%rax,%r15		# discarded
2389	adox	%r11,%r10
2390	mulx	1*8($nptr),%rax,%r11
2391	adcx	%rax,%r10
2392	adox	%r12,%r11
2393	mulx	2*8($nptr),%rax,%r12
2394	mov	24+8(%rsp),$bptr	# counter value
2395	mov	%r10,-8*4($tptr)
2396	adcx	%rax,%r11
2397	adox	%r13,%r12
2398	mulx	3*8($nptr),%rax,%r15
2399	 mov	$bi,%rdx
2400	mov	%r11,-8*3($tptr)
2401	adcx	%rax,%r12
2402	adox	$zero,%r15		# of=0
2403	lea	4*8($nptr),$nptr
2404	mov	%r12,-8*2($tptr)
2405	jmp	.Lmulx4x_1st
2406
2407.align	32
2408.Lmulx4x_1st:
2409	adcx	$zero,%r15		# cf=0, modulo-scheduled
2410	mulx	0*8($aptr),%r10,%rax	# a[4]*b[0]
2411	adcx	%r14,%r10
2412	mulx	1*8($aptr),%r11,%r14	# a[5]*b[0]
2413	adcx	%rax,%r11
2414	mulx	2*8($aptr),%r12,%rax	# ...
2415	adcx	%r14,%r12
2416	mulx	3*8($aptr),%r13,%r14
2417	 .byte	0x67,0x67
2418	 mov	$mi,%rdx
2419	adcx	%rax,%r13
2420	adcx	$zero,%r14		# cf=0
2421	lea	4*8($aptr),$aptr
2422	lea	4*8($tptr),$tptr
2423
2424	adox	%r15,%r10
2425	mulx	0*8($nptr),%rax,%r15
2426	adcx	%rax,%r10
2427	adox	%r15,%r11
2428	mulx	1*8($nptr),%rax,%r15
2429	adcx	%rax,%r11
2430	adox	%r15,%r12
2431	mulx	2*8($nptr),%rax,%r15
2432	mov	%r10,-5*8($tptr)
2433	adcx	%rax,%r12
2434	mov	%r11,-4*8($tptr)
2435	adox	%r15,%r13
2436	mulx	3*8($nptr),%rax,%r15
2437	 mov	$bi,%rdx
2438	mov	%r12,-3*8($tptr)
2439	adcx	%rax,%r13
2440	adox	$zero,%r15
2441	lea	4*8($nptr),$nptr
2442	mov	%r13,-2*8($tptr)
2443
2444	dec	$bptr			# of=0, pass cf
2445	jnz	.Lmulx4x_1st
2446
2447	mov	8(%rsp),$num		# load -num
2448	adc	$zero,%r15		# modulo-scheduled
2449	lea	($aptr,$num),$aptr	# rewind $aptr
2450	add	%r15,%r14
2451	mov	8+8(%rsp),$bptr		# re-load &b[i]
2452	adc	$zero,$zero		# top-most carry
2453	mov	%r14,-1*8($tptr)
2454	jmp	.Lmulx4x_outer
2455
2456.align	32
2457.Lmulx4x_outer:
2458	lea	16-256($tptr),%r10	# where 256-byte mask is (+density control)
2459	pxor	%xmm4,%xmm4
2460	.byte	0x67,0x67
2461	pxor	%xmm5,%xmm5
2462___
2463for($i=0;$i<$STRIDE/16;$i+=4) {
2464$code.=<<___;
2465	movdqa	`16*($i+0)-128`($bptr),%xmm0
2466	movdqa	`16*($i+1)-128`($bptr),%xmm1
2467	movdqa	`16*($i+2)-128`($bptr),%xmm2
2468	pand	`16*($i+0)+256`(%r10),%xmm0
2469	movdqa	`16*($i+3)-128`($bptr),%xmm3
2470	pand	`16*($i+1)+256`(%r10),%xmm1
2471	por	%xmm0,%xmm4
2472	pand	`16*($i+2)+256`(%r10),%xmm2
2473	por	%xmm1,%xmm5
2474	pand	`16*($i+3)+256`(%r10),%xmm3
2475	por	%xmm2,%xmm4
2476	por	%xmm3,%xmm5
2477___
2478}
2479$code.=<<___;
2480	por	%xmm5,%xmm4
2481	pshufd	\$0x4e,%xmm4,%xmm0
2482	por	%xmm4,%xmm0
2483	lea	$STRIDE($bptr),$bptr
2484	movq	%xmm0,%rdx		# m0=bp[i]
2485
2486	mov	$zero,($tptr)		# save top-most carry
2487	lea	4*8($tptr,$num),$tptr	# rewind $tptr
2488	mulx	0*8($aptr),$mi,%r11	# a[0]*b[i]
2489	xor	$zero,$zero		# cf=0, of=0
2490	mov	%rdx,$bi
2491	mulx	1*8($aptr),%r14,%r12	# a[1]*b[i]
2492	adox	-4*8($tptr),$mi		# +t[0]
2493	adcx	%r14,%r11
2494	mulx	2*8($aptr),%r15,%r13	# ...
2495	adox	-3*8($tptr),%r11
2496	adcx	%r15,%r12
2497	mulx	3*8($aptr),%rdx,%r14
2498	adox	-2*8($tptr),%r12
2499	adcx	%rdx,%r13
2500	lea	($nptr,$num),$nptr	# rewind $nptr
2501	lea	4*8($aptr),$aptr
2502	adox	-1*8($tptr),%r13
2503	adcx	$zero,%r14
2504	adox	$zero,%r14
2505
2506	mov	$mi,%r15
2507	imulq	32+8(%rsp),$mi		# "t[0]"*n0
2508
2509	mov	$mi,%rdx
2510	xor	$zero,$zero		# cf=0, of=0
2511	mov	$bptr,8+8(%rsp)		# off-load &b[i]
2512
2513	mulx	0*8($nptr),%rax,%r10
2514	adcx	%rax,%r15		# discarded
2515	adox	%r11,%r10
2516	mulx	1*8($nptr),%rax,%r11
2517	adcx	%rax,%r10
2518	adox	%r12,%r11
2519	mulx	2*8($nptr),%rax,%r12
2520	adcx	%rax,%r11
2521	adox	%r13,%r12
2522	mulx	3*8($nptr),%rax,%r15
2523	 mov	$bi,%rdx
2524	mov	24+8(%rsp),$bptr	# counter value
2525	mov	%r10,-8*4($tptr)
2526	adcx	%rax,%r12
2527	mov	%r11,-8*3($tptr)
2528	adox	$zero,%r15		# of=0
2529	mov	%r12,-8*2($tptr)
2530	lea	4*8($nptr),$nptr
2531	jmp	.Lmulx4x_inner
2532
2533.align	32
2534.Lmulx4x_inner:
2535	mulx	0*8($aptr),%r10,%rax	# a[4]*b[i]
2536	adcx	$zero,%r15		# cf=0, modulo-scheduled
2537	adox	%r14,%r10
2538	mulx	1*8($aptr),%r11,%r14	# a[5]*b[i]
2539	adcx	0*8($tptr),%r10
2540	adox	%rax,%r11
2541	mulx	2*8($aptr),%r12,%rax	# ...
2542	adcx	1*8($tptr),%r11
2543	adox	%r14,%r12
2544	mulx	3*8($aptr),%r13,%r14
2545	 mov	$mi,%rdx
2546	adcx	2*8($tptr),%r12
2547	adox	%rax,%r13
2548	adcx	3*8($tptr),%r13
2549	adox	$zero,%r14		# of=0
2550	lea	4*8($aptr),$aptr
2551	lea	4*8($tptr),$tptr
2552	adcx	$zero,%r14		# cf=0
2553
2554	adox	%r15,%r10
2555	mulx	0*8($nptr),%rax,%r15
2556	adcx	%rax,%r10
2557	adox	%r15,%r11
2558	mulx	1*8($nptr),%rax,%r15
2559	adcx	%rax,%r11
2560	adox	%r15,%r12
2561	mulx	2*8($nptr),%rax,%r15
2562	mov	%r10,-5*8($tptr)
2563	adcx	%rax,%r12
2564	adox	%r15,%r13
2565	mov	%r11,-4*8($tptr)
2566	mulx	3*8($nptr),%rax,%r15
2567	 mov	$bi,%rdx
2568	lea	4*8($nptr),$nptr
2569	mov	%r12,-3*8($tptr)
2570	adcx	%rax,%r13
2571	adox	$zero,%r15
2572	mov	%r13,-2*8($tptr)
2573
2574	dec	$bptr			# of=0, pass cf
2575	jnz	.Lmulx4x_inner
2576
2577	mov	0+8(%rsp),$num		# load -num
2578	adc	$zero,%r15		# modulo-scheduled
2579	sub	0*8($tptr),$bptr	# pull top-most carry to %cf
2580	mov	8+8(%rsp),$bptr		# re-load &b[i]
2581	mov	16+8(%rsp),%r10
2582	adc	%r15,%r14
2583	lea	($aptr,$num),$aptr	# rewind $aptr
2584	adc	$zero,$zero		# top-most carry
2585	mov	%r14,-1*8($tptr)
2586
2587	cmp	%r10,$bptr
2588	jb	.Lmulx4x_outer
2589
2590	mov	-8($nptr),%r10
2591	mov	$zero,%r8
2592	mov	($nptr,$num),%r12
2593	lea	($nptr,$num),%rbp	# rewind $nptr
2594	mov	$num,%rcx
2595	lea	($tptr,$num),%rdi	# rewind $tptr
2596	xor	%eax,%eax
2597	xor	%r15,%r15
2598	sub	%r14,%r10		# compare top-most words
2599	adc	%r15,%r15
2600	or	%r15,%r8
2601	sar	\$3+2,%rcx
2602	sub	%r8,%rax		# %rax=-%r8
2603	mov	56+8(%rsp),%rdx		# restore rp
2604	dec	%r12			# so that after 'not' we get -n[0]
2605	mov	8*1(%rbp),%r13
2606	xor	%r8,%r8
2607	mov	8*2(%rbp),%r14
2608	mov	8*3(%rbp),%r15
2609	jmp	.Lsqrx4x_sub_entry	# common post-condition
2610.size	mulx4x_internal,.-mulx4x_internal
2611___
2612}{
2613######################################################################
2614# void bn_power5(
2615my $rptr="%rdi";	# BN_ULONG *rptr,
2616my $aptr="%rsi";	# const BN_ULONG *aptr,
2617my $bptr="%rdx";	# const void *table,
2618my $nptr="%rcx";	# const BN_ULONG *nptr,
2619my $n0  ="%r8";		# const BN_ULONG *n0);
2620my $num ="%r9";		# int num, has to be divisible by 8
2621			# int pwr);
2622
2623my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
2624my @A0=("%r10","%r11");
2625my @A1=("%r12","%r13");
2626my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
2627
2628$code.=<<___;
2629.type	bn_powerx5,\@function,6
2630.align	32
2631bn_powerx5:
2632.Lpowerx5_enter:
2633	mov	%rsp,%rax
2634	push	%rbx
2635	push	%rbp
2636	push	%r12
2637	push	%r13
2638	push	%r14
2639	push	%r15
2640
2641	shl	\$3,${num}d		# convert $num to bytes
2642	lea	($num,$num,2),%r10	# 3*$num in bytes
2643	neg	$num
2644	mov	($n0),$n0		# *n0
2645
2646	##############################################################
2647	# Ensure that stack frame doesn't alias with $rptr+3*$num
2648	# modulo 4096, which covers ret[num], am[num] and n[num]
2649	# (see bn_exp.c). This is done to allow memory disambiguation
2650	# logic to do its magic. [An extra 256 bytes is for the power
2651	# mask calculated from the 7th argument, the index.]
2652	#
2653	lea	-320(%rsp,$num,2),%r11
2654	sub	$rptr,%r11
2655	and	\$4095,%r11
2656	cmp	%r11,%r10
2657	jb	.Lpwrx_sp_alt
2658	sub	%r11,%rsp		# align with $aptr
2659	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*$num*8+256)
2660	jmp	.Lpwrx_sp_done
2661
2662.align	32
2663.Lpwrx_sp_alt:
2664	lea	4096-320(,$num,2),%r10
2665	lea	-320(%rsp,$num,2),%rsp	# alloca(frame+2*$num*8+256)
2666	sub	%r10,%r11
2667	mov	\$0,%r10
2668	cmovc	%r10,%r11
2669	sub	%r11,%rsp
2670.Lpwrx_sp_done:
2671	and	\$-64,%rsp
2672	mov	%rax,%r11
2673	sub	%rsp,%r11
2674	and	\$-4096,%r11
2675.Lpwrx_page_walk:
2676	mov	(%rsp,%r11),%r10
2677	sub	\$4096,%r11
2678	.byte	0x2e			# predict non-taken
2679	jnc	.Lpwrx_page_walk
2680
2681	mov	$num,%r10
2682	neg	$num
2683
2684	##############################################################
2685	# Stack layout
2686	#
2687	# +0	saved $num, used in reduction section
2688	# +8	&t[2*$num], used in reduction section
2689	# +16	intermediate carry bit
2690	# +24	top-most carry bit, used in reduction section
2691	# +32	saved *n0
2692	# +40	saved %rsp
2693	# +48	t[2*$num]
2694	#
2695	pxor	%xmm0,%xmm0
2696	movq	$rptr,%xmm1		# save $rptr
2697	movq	$nptr,%xmm2		# save $nptr
2698	movq	%r10, %xmm3		# -$num
2699	movq	$bptr,%xmm4
2700	mov	$n0,  32(%rsp)
2701	mov	%rax, 40(%rsp)		# save original %rsp
2702.Lpowerx5_body:
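	##############################################################
	# The call sequence below appears to compute, in the Montgomery
	# domain,
	#
	#	r = a^(2^5) * tbl[idx] mod n
	#
	# i.e. five modular squarings followed by one multiplication by
	# the power gathered from the table - one 5-bit window of the
	# exponent as processed by BN_mod_exp_mont_consttime.
	#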
2703
2704	call	__bn_sqrx8x_internal
2705	call	__bn_postx4x_internal
2706	call	__bn_sqrx8x_internal
2707	call	__bn_postx4x_internal
2708	call	__bn_sqrx8x_internal
2709	call	__bn_postx4x_internal
2710	call	__bn_sqrx8x_internal
2711	call	__bn_postx4x_internal
2712	call	__bn_sqrx8x_internal
2713	call	__bn_postx4x_internal
2714
2715	mov	%r10,$num		# -num
2716	mov	$aptr,$rptr
2717	movq	%xmm2,$nptr
2718	movq	%xmm4,$bptr
2719	mov	40(%rsp),%rax
2720
2721	call	mulx4x_internal
2722
2723	mov	40(%rsp),%rsi		# restore %rsp
2724	mov	\$1,%rax
2725
2726	mov	-48(%rsi),%r15
2727	mov	-40(%rsi),%r14
2728	mov	-32(%rsi),%r13
2729	mov	-24(%rsi),%r12
2730	mov	-16(%rsi),%rbp
2731	mov	-8(%rsi),%rbx
2732	lea	(%rsi),%rsp
2733.Lpowerx5_epilogue:
2734	ret
2735.size	bn_powerx5,.-bn_powerx5
2736
2737.globl	bn_sqrx8x_internal
2738.hidden	bn_sqrx8x_internal
2739.type	bn_sqrx8x_internal,\@abi-omnipotent
2740.align	32
2741bn_sqrx8x_internal:
2742__bn_sqrx8x_internal:
2743	##################################################################
2744	# Squaring part:
2745	#
2746	# a) multiply-n-add everything but a[i]*a[i];
2747	# b) shift result of a) by 1 to the left and accumulate
2748	#    a[i]*a[i] products;
2749	#
2750	##################################################################
2751	# a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
2752	#                                                     a[1]a[0]
2753	#                                                 a[2]a[0]
2754	#                                             a[3]a[0]
2755	#                                             a[2]a[1]
2756	#                                         a[3]a[1]
2757	#                                     a[3]a[2]
2758	#
2759	#                                         a[4]a[0]
2760	#                                     a[5]a[0]
2761	#                                 a[6]a[0]
2762	#                             a[7]a[0]
2763	#                                     a[4]a[1]
2764	#                                 a[5]a[1]
2765	#                             a[6]a[1]
2766	#                         a[7]a[1]
2767	#                                 a[4]a[2]
2768	#                             a[5]a[2]
2769	#                         a[6]a[2]
2770	#                     a[7]a[2]
2771	#                             a[4]a[3]
2772	#                         a[5]a[3]
2773	#                     a[6]a[3]
2774	#                 a[7]a[3]
2775	#
2776	#                     a[5]a[4]
2777	#                 a[6]a[4]
2778	#             a[7]a[4]
2779	#             a[6]a[5]
2780	#         a[7]a[5]
2781	#     a[7]a[6]
2782	# a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
2783___
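# A minimal C-style sketch of the strategy above (add_mul and shift_left_1
# are illustrative helpers; carry propagation is not shown):
#
#	for (i = 0; i < num; i++)		/* a) off-diagonal products */
#		for (j = i+1; j < num; j++)
#			add_mul(t, i+j, a[i], a[j]);	/* t += a[i]*a[j]<<64*(i+j) */
#	shift_left_1(t, 2*num);			/* b) double the whole sum, */
#	for (i = 0; i < num; i++)		/*    then add the squares  */
#		add_mul(t, 2*i, a[i], a[i]);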
2784{
2785my ($zero,$carry)=("%rbp","%rcx");
2786my $aaptr=$zero;
2787$code.=<<___;
2788	lea	48+8(%rsp),$tptr
2789	lea	($aptr,$num),$aaptr
2790	mov	$num,0+8(%rsp)			# save $num
2791	mov	$aaptr,8+8(%rsp)		# save end of $aptr
2792	jmp	.Lsqr8x_zero_start
2793
2794.align	32
2795.byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
2796.Lsqrx8x_zero:
2797	.byte	0x3e
2798	movdqa	%xmm0,0*8($tptr)
2799	movdqa	%xmm0,2*8($tptr)
2800	movdqa	%xmm0,4*8($tptr)
2801	movdqa	%xmm0,6*8($tptr)
2802.Lsqr8x_zero_start:			# aligned at 32
2803	movdqa	%xmm0,8*8($tptr)
2804	movdqa	%xmm0,10*8($tptr)
2805	movdqa	%xmm0,12*8($tptr)
2806	movdqa	%xmm0,14*8($tptr)
2807	lea	16*8($tptr),$tptr
2808	sub	\$64,$num
2809	jnz	.Lsqrx8x_zero
2810
2811	mov	0*8($aptr),%rdx		# a[0], modulo-scheduled
2812	#xor	%r9,%r9			# t[1], ex-$num, zero already
2813	xor	%r10,%r10
2814	xor	%r11,%r11
2815	xor	%r12,%r12
2816	xor	%r13,%r13
2817	xor	%r14,%r14
2818	xor	%r15,%r15
2819	lea	48+8(%rsp),$tptr
2820	xor	$zero,$zero		# cf=0, of=0
2821	jmp	.Lsqrx8x_outer_loop
2822
2823.align	32
2824.Lsqrx8x_outer_loop:
2825	mulx	1*8($aptr),%r8,%rax	# a[1]*a[0]
2826	adcx	%r9,%r8			# a[1]*a[0]+=t[1]
2827	adox	%rax,%r10
2828	mulx	2*8($aptr),%r9,%rax	# a[2]*a[0]
2829	adcx	%r10,%r9
2830	adox	%rax,%r11
2831	.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00	# mulx	3*8($aptr),%r10,%rax	# ...
2832	adcx	%r11,%r10
2833	adox	%rax,%r12
2834	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00	# mulx	4*8($aptr),%r11,%rax
2835	adcx	%r12,%r11
2836	adox	%rax,%r13
2837	mulx	5*8($aptr),%r12,%rax
2838	adcx	%r13,%r12
2839	adox	%rax,%r14
2840	mulx	6*8($aptr),%r13,%rax
2841	adcx	%r14,%r13
2842	adox	%r15,%rax
2843	mulx	7*8($aptr),%r14,%r15
2844	 mov	1*8($aptr),%rdx		# a[1]
2845	adcx	%rax,%r14
2846	adox	$zero,%r15
2847	adc	8*8($tptr),%r15
2848	mov	%r8,1*8($tptr)		# t[1]
2849	mov	%r9,2*8($tptr)		# t[2]
2850	sbb	$carry,$carry		# mov %cf,$carry
2851	xor	$zero,$zero		# cf=0, of=0
2852
2853
2854	mulx	2*8($aptr),%r8,%rbx	# a[2]*a[1]
2855	mulx	3*8($aptr),%r9,%rax	# a[3]*a[1]
2856	adcx	%r10,%r8
2857	adox	%rbx,%r9
2858	mulx	4*8($aptr),%r10,%rbx	# ...
2859	adcx	%r11,%r9
2860	adox	%rax,%r10
2861	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00	# mulx	5*8($aptr),%r11,%rax
2862	adcx	%r12,%r10
2863	adox	%rbx,%r11
2864	.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx	6*8($aptr),%r12,%rbx
2865	adcx	%r13,%r11
2866	adox	%r14,%r12
2867	.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00	# mulx	7*8($aptr),%r13,%r14
2868	 mov	2*8($aptr),%rdx		# a[2]
2869	adcx	%rax,%r12
2870	adox	%rbx,%r13
2871	adcx	%r15,%r13
2872	adox	$zero,%r14		# of=0
2873	adcx	$zero,%r14		# cf=0
2874
2875	mov	%r8,3*8($tptr)		# t[3]
2876	mov	%r9,4*8($tptr)		# t[4]
2877
2878	mulx	3*8($aptr),%r8,%rbx	# a[3]*a[2]
2879	mulx	4*8($aptr),%r9,%rax	# a[4]*a[2]
2880	adcx	%r10,%r8
2881	adox	%rbx,%r9
2882	mulx	5*8($aptr),%r10,%rbx	# ...
2883	adcx	%r11,%r9
2884	adox	%rax,%r10
2885	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00	# mulx	6*8($aptr),%r11,%rax
2886	adcx	%r12,%r10
2887	adox	%r13,%r11
2888	.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00	# mulx	7*8($aptr),%r12,%r13
2889	.byte	0x3e
2890	 mov	3*8($aptr),%rdx		# a[3]
2891	adcx	%rbx,%r11
2892	adox	%rax,%r12
2893	adcx	%r14,%r12
2894	mov	%r8,5*8($tptr)		# t[5]
2895	mov	%r9,6*8($tptr)		# t[6]
2896	 mulx	4*8($aptr),%r8,%rax	# a[4]*a[3]
2897	adox	$zero,%r13		# of=0
2898	adcx	$zero,%r13		# cf=0
2899
2900	mulx	5*8($aptr),%r9,%rbx	# a[5]*a[3]
2901	adcx	%r10,%r8
2902	adox	%rax,%r9
2903	mulx	6*8($aptr),%r10,%rax	# ...
2904	adcx	%r11,%r9
2905	adox	%r12,%r10
2906	mulx	7*8($aptr),%r11,%r12
2907	 mov	4*8($aptr),%rdx		# a[4]
2908	 mov	5*8($aptr),%r14		# a[5]
2909	adcx	%rbx,%r10
2910	adox	%rax,%r11
2911	 mov	6*8($aptr),%r15		# a[6]
2912	adcx	%r13,%r11
2913	adox	$zero,%r12		# of=0
2914	adcx	$zero,%r12		# cf=0
2915
2916	mov	%r8,7*8($tptr)		# t[7]
2917	mov	%r9,8*8($tptr)		# t[8]
2918
2919	mulx	%r14,%r9,%rax		# a[5]*a[4]
2920	 mov	7*8($aptr),%r8		# a[7]
2921	adcx	%r10,%r9
2922	mulx	%r15,%r10,%rbx		# a[6]*a[4]
2923	adox	%rax,%r10
2924	adcx	%r11,%r10
2925	mulx	%r8,%r11,%rax		# a[7]*a[4]
2926	 mov	%r14,%rdx		# a[5]
2927	adox	%rbx,%r11
2928	adcx	%r12,%r11
2929	#adox	$zero,%rax		# of=0
2930	adcx	$zero,%rax		# cf=0
2931
2932	mulx	%r15,%r14,%rbx		# a[6]*a[5]
2933	mulx	%r8,%r12,%r13		# a[7]*a[5]
2934	 mov	%r15,%rdx		# a[6]
2935	 lea	8*8($aptr),$aptr
2936	adcx	%r14,%r11
2937	adox	%rbx,%r12
2938	adcx	%rax,%r12
2939	adox	$zero,%r13
2940
2941	.byte	0x67,0x67
2942	mulx	%r8,%r8,%r14		# a[7]*a[6]
2943	adcx	%r8,%r13
2944	adcx	$zero,%r14
2945
2946	cmp	8+8(%rsp),$aptr
2947	je	.Lsqrx8x_outer_break
2948
2949	neg	$carry			# mov $carry,%cf
2950	mov	\$-8,%rcx
2951	mov	$zero,%r15
2952	mov	8*8($tptr),%r8
2953	adcx	9*8($tptr),%r9		# +=t[9]
2954	adcx	10*8($tptr),%r10	# ...
2955	adcx	11*8($tptr),%r11
2956	adc	12*8($tptr),%r12
2957	adc	13*8($tptr),%r13
2958	adc	14*8($tptr),%r14
2959	adc	15*8($tptr),%r15
2960	lea	($aptr),$aaptr
2961	lea	2*64($tptr),$tptr
2962	sbb	%rax,%rax		# mov %cf,$carry
2963
2964	mov	-64($aptr),%rdx		# a[0]
2965	mov	%rax,16+8(%rsp)		# offload $carry
2966	mov	$tptr,24+8(%rsp)
2967
2968	#lea	8*8($tptr),$tptr	# see 2*8*8($tptr) above
2969	xor	%eax,%eax		# cf=0, of=0
2970	jmp	.Lsqrx8x_loop
2971
2972.align	32
2973.Lsqrx8x_loop:
2974	mov	%r8,%rbx
2975	mulx	0*8($aaptr),%rax,%r8	# a[8]*a[i]
2976	adcx	%rax,%rbx		# +=t[8]
2977	adox	%r9,%r8
2978
2979	mulx	1*8($aaptr),%rax,%r9	# ...
2980	adcx	%rax,%r8
2981	adox	%r10,%r9
2982
2983	mulx	2*8($aaptr),%rax,%r10
2984	adcx	%rax,%r9
2985	adox	%r11,%r10
2986
2987	mulx	3*8($aaptr),%rax,%r11
2988	adcx	%rax,%r10
2989	adox	%r12,%r11
2990
2991	.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	4*8($aaptr),%rax,%r12
2992	adcx	%rax,%r11
2993	adox	%r13,%r12
2994
2995	mulx	5*8($aaptr),%rax,%r13
2996	adcx	%rax,%r12
2997	adox	%r14,%r13
2998
2999	mulx	6*8($aaptr),%rax,%r14
3000	 mov	%rbx,($tptr,%rcx,8)	# store t[8+i]
3001	 mov	\$0,%ebx
3002	adcx	%rax,%r13
3003	adox	%r15,%r14
3004
3005	.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00	# mulx	7*8($aaptr),%rax,%r15
3006	 mov	8($aptr,%rcx,8),%rdx	# a[i]
3007	adcx	%rax,%r14
3008	adox	%rbx,%r15		# %rbx is 0, of=0
3009	adcx	%rbx,%r15		# cf=0
3010
3011	.byte	0x67
3012	inc	%rcx			# of=0
3013	jnz	.Lsqrx8x_loop
3014
3015	lea	8*8($aaptr),$aaptr
3016	mov	\$-8,%rcx
3017	cmp	8+8(%rsp),$aaptr	# done?
3018	je	.Lsqrx8x_break
3019
3020	sub	16+8(%rsp),%rbx		# mov 16(%rsp),%cf
3021	.byte	0x66
3022	mov	-64($aptr),%rdx
3023	adcx	0*8($tptr),%r8
3024	adcx	1*8($tptr),%r9
3025	adc	2*8($tptr),%r10
3026	adc	3*8($tptr),%r11
3027	adc	4*8($tptr),%r12
3028	adc	5*8($tptr),%r13
3029	adc	6*8($tptr),%r14
3030	adc	7*8($tptr),%r15
3031	lea	8*8($tptr),$tptr
3032	.byte	0x67
3033	sbb	%rax,%rax		# mov %cf,%rax
3034	xor	%ebx,%ebx		# cf=0, of=0
3035	mov	%rax,16+8(%rsp)		# offload carry
3036	jmp	.Lsqrx8x_loop
3037
3038.align	32
3039.Lsqrx8x_break:
3040	sub	16+8(%rsp),%r8		# consume last carry
3041	mov	24+8(%rsp),$carry	# initial $tptr, borrow $carry
3042	mov	0*8($aptr),%rdx		# a[8], modulo-scheduled
3043	xor	%ebp,%ebp		# xor	$zero,$zero
3044	mov	%r8,0*8($tptr)
3045	cmp	$carry,$tptr		# cf=0, of=0
3046	je	.Lsqrx8x_outer_loop
3047
3048	mov	%r9,1*8($tptr)
3049	 mov	1*8($carry),%r9
3050	mov	%r10,2*8($tptr)
3051	 mov	2*8($carry),%r10
3052	mov	%r11,3*8($tptr)
3053	 mov	3*8($carry),%r11
3054	mov	%r12,4*8($tptr)
3055	 mov	4*8($carry),%r12
3056	mov	%r13,5*8($tptr)
3057	 mov	5*8($carry),%r13
3058	mov	%r14,6*8($tptr)
3059	 mov	6*8($carry),%r14
3060	mov	%r15,7*8($tptr)
3061	 mov	7*8($carry),%r15
3062	mov	$carry,$tptr
3063	jmp	.Lsqrx8x_outer_loop
3064
3065.align	32
3066.Lsqrx8x_outer_break:
3067	mov	%r9,9*8($tptr)		# t[9]
3068	 movq	%xmm3,%rcx		# -$num
3069	mov	%r10,10*8($tptr)	# ...
3070	mov	%r11,11*8($tptr)
3071	mov	%r12,12*8($tptr)
3072	mov	%r13,13*8($tptr)
3073	mov	%r14,14*8($tptr)
3074___
3075}{
3076my $i="%rcx";
3077$code.=<<___;
3078	lea	48+8(%rsp),$tptr
3079	mov	($aptr,$i),%rdx		# a[0]
3080
3081	mov	8($tptr),$A0[1]		# t[1]
3082	xor	$A0[0],$A0[0]		# t[0], of=0, cf=0
3083	mov	0+8(%rsp),$num		# restore $num
3084	adox	$A0[1],$A0[1]
3085	 mov	16($tptr),$A1[0]	# t[2]	# prefetch
3086	 mov	24($tptr),$A1[1]	# t[3]	# prefetch
3087	#jmp	.Lsqrx4x_shift_n_add	# happens to be aligned
3088
3089.align	32
3090.Lsqrx4x_shift_n_add:
3091	mulx	%rdx,%rax,%rbx
3092	 adox	$A1[0],$A1[0]
3093	adcx	$A0[0],%rax
3094	 .byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00	# mov	8($aptr,$i),%rdx	# a[i+1]	# prefetch
3095	 .byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00	# mov	32($tptr),$A0[0]	# t[2*i+4]	# prefetch
3096	 adox	$A1[1],$A1[1]
3097	adcx	$A0[1],%rbx
3098	 mov	40($tptr),$A0[1]		# t[2*i+4+1]	# prefetch
3099	mov	%rax,0($tptr)
3100	mov	%rbx,8($tptr)
3101
3102	mulx	%rdx,%rax,%rbx
3103	 adox	$A0[0],$A0[0]
3104	adcx	$A1[0],%rax
3105	 mov	16($aptr,$i),%rdx	# a[i+2]	# prefetch
3106	 mov	48($tptr),$A1[0]	# t[2*i+6]	# prefetch
3107	 adox	$A0[1],$A0[1]
3108	adcx	$A1[1],%rbx
3109	 mov	56($tptr),$A1[1]	# t[2*i+6+1]	# prefetch
3110	mov	%rax,16($tptr)
3111	mov	%rbx,24($tptr)
3112
3113	mulx	%rdx,%rax,%rbx
3114	 adox	$A1[0],$A1[0]
3115	adcx	$A0[0],%rax
3116	 mov	24($aptr,$i),%rdx	# a[i+3]	# prefetch
3117	 lea	32($i),$i
3118	 mov	64($tptr),$A0[0]	# t[2*i+8]	# prefetch
3119	 adox	$A1[1],$A1[1]
3120	adcx	$A0[1],%rbx
3121	 mov	72($tptr),$A0[1]	# t[2*i+8+1]	# prefetch
3122	mov	%rax,32($tptr)
3123	mov	%rbx,40($tptr)
3124
3125	mulx	%rdx,%rax,%rbx
3126	 adox	$A0[0],$A0[0]
3127	adcx	$A1[0],%rax
3128	jrcxz	.Lsqrx4x_shift_n_add_break
3129	 .byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00	# mov	0($aptr,$i),%rdx	# a[i+4]	# prefetch
3130	 adox	$A0[1],$A0[1]
3131	adcx	$A1[1],%rbx
3132	 mov	80($tptr),$A1[0]	# t[2*i+10]	# prefetch
3133	 mov	88($tptr),$A1[1]	# t[2*i+10+1]	# prefetch
3134	mov	%rax,48($tptr)
3135	mov	%rbx,56($tptr)
3136	lea	64($tptr),$tptr
3137	nop
3138	jmp	.Lsqrx4x_shift_n_add
3139
3140.align	32
3141.Lsqrx4x_shift_n_add_break:
3142	adcx	$A1[1],%rbx
3143	mov	%rax,48($tptr)
3144	mov	%rbx,56($tptr)
3145	lea	64($tptr),$tptr		# end of t[] buffer
3146___
3147}
3148######################################################################
3149# Montgomery reduction part, "word-by-word" algorithm.
3150#
3151# This new path is inspired by multiple submissions from Intel, by
3152# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
3153# Vinodh Gopal...
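# A minimal C sketch of the word-by-word reduction (mul_add is an
# illustrative helper; carries above t[i+num] are not shown):
#
#	for (i = 0; i < num; i++) {
#		m = t[i] * n0;			/* multiplier, mod 2^64 */
#		carry = 0;
#		for (j = 0; j < num; j++)	/* t += m*n << 64*i     */
#			t[i+j] = mul_add(t[i+j], n[j], m, &carry);
#		t[i+num] += carry;
#	}
#	/* the reduced value is t[num..2*num-1]; a final conditional
#	   subtraction of n is done by the post-condition code */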
3154{
3155my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx");
3156
3157$code.=<<___;
3158	movq	%xmm2,$nptr
3159__bn_sqrx8x_reduction:
3160	xor	%eax,%eax		# initial top-most carry bit
3161	mov	32+8(%rsp),%rbx		# n0
3162	mov	48+8(%rsp),%rdx		# "%r8", 8*0($tptr)
3163	lea	-8*8($nptr,$num),%rcx	# end of n[]
3164	#lea	48+8(%rsp,$num,2),$tptr	# end of t[] buffer
3165	mov	%rcx, 0+8(%rsp)		# save end of n[]
3166	mov	$tptr,8+8(%rsp)		# save end of t[]
3167
3168	lea	48+8(%rsp),$tptr		# initial t[] window
3169	jmp	.Lsqrx8x_reduction_loop
3170
3171.align	32
3172.Lsqrx8x_reduction_loop:
3173	mov	8*1($tptr),%r9
3174	mov	8*2($tptr),%r10
3175	mov	8*3($tptr),%r11
3176	mov	8*4($tptr),%r12
3177	mov	%rdx,%r8
3178	imulq	%rbx,%rdx		# n0*a[i]
3179	mov	8*5($tptr),%r13
3180	mov	8*6($tptr),%r14
3181	mov	8*7($tptr),%r15
3182	mov	%rax,24+8(%rsp)		# store top-most carry bit
3183
3184	lea	8*8($tptr),$tptr
3185	xor	$carry,$carry		# cf=0,of=0
3186	mov	\$-8,%rcx
3187	jmp	.Lsqrx8x_reduce
3188
3189.align	32
3190.Lsqrx8x_reduce:
3191	mov	%r8, %rbx
3192	mulx	8*0($nptr),%rax,%r8	# n[0]
3193	adcx	%rbx,%rax		# discarded
3194	adox	%r9,%r8
3195
3196	mulx	8*1($nptr),%rbx,%r9	# n[1]
3197	adcx	%rbx,%r8
3198	adox	%r10,%r9
3199
3200	mulx	8*2($nptr),%rbx,%r10
3201	adcx	%rbx,%r9
3202	adox	%r11,%r10
3203
3204	mulx	8*3($nptr),%rbx,%r11
3205	adcx	%rbx,%r10
3206	adox	%r12,%r11
3207
3208	.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	8*4($nptr),%rbx,%r12
3209	 mov	%rdx,%rax
3210	 mov	%r8,%rdx
3211	adcx	%rbx,%r11
3212	adox	%r13,%r12
3213
3214	 mulx	32+8(%rsp),%rbx,%rdx	# %rdx discarded
3215	 mov	%rax,%rdx
3216	 mov	%rax,64+48+8(%rsp,%rcx,8)	# put aside n0*a[i]
3217
3218	mulx	8*5($nptr),%rax,%r13
3219	adcx	%rax,%r12
3220	adox	%r14,%r13
3221
3222	mulx	8*6($nptr),%rax,%r14
3223	adcx	%rax,%r13
3224	adox	%r15,%r14
3225
3226	mulx	8*7($nptr),%rax,%r15
3227	 mov	%rbx,%rdx
3228	adcx	%rax,%r14
3229	adox	$carry,%r15		# $carry is 0
3230	adcx	$carry,%r15		# cf=0
3231
3232	.byte	0x67,0x67,0x67
3233	inc	%rcx			# of=0
3234	jnz	.Lsqrx8x_reduce
3235
3236	mov	$carry,%rax		# xor	%rax,%rax
3237	cmp	0+8(%rsp),$nptr		# end of n[]?
3238	jae	.Lsqrx8x_no_tail
3239
3240	mov	48+8(%rsp),%rdx		# pull n0*a[0]
3241	add	8*0($tptr),%r8
3242	lea	8*8($nptr),$nptr
3243	mov	\$-8,%rcx
3244	adcx	8*1($tptr),%r9
3245	adcx	8*2($tptr),%r10
3246	adc	8*3($tptr),%r11
3247	adc	8*4($tptr),%r12
3248	adc	8*5($tptr),%r13
3249	adc	8*6($tptr),%r14
3250	adc	8*7($tptr),%r15
3251	lea	8*8($tptr),$tptr
3252	sbb	%rax,%rax		# top carry
3253
3254	xor	$carry,$carry		# of=0, cf=0
3255	mov	%rax,16+8(%rsp)
3256	jmp	.Lsqrx8x_tail
3257
3258.align	32
3259.Lsqrx8x_tail:
3260	mov	%r8,%rbx
3261	mulx	8*0($nptr),%rax,%r8
3262	adcx	%rax,%rbx
3263	adox	%r9,%r8
3264
3265	mulx	8*1($nptr),%rax,%r9
3266	adcx	%rax,%r8
3267	adox	%r10,%r9
3268
3269	mulx	8*2($nptr),%rax,%r10
3270	adcx	%rax,%r9
3271	adox	%r11,%r10
3272
3273	mulx	8*3($nptr),%rax,%r11
3274	adcx	%rax,%r10
3275	adox	%r12,%r11
3276
3277	.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	8*4($nptr),%rax,%r12
3278	adcx	%rax,%r11
3279	adox	%r13,%r12
3280
3281	mulx	8*5($nptr),%rax,%r13
3282	adcx	%rax,%r12
3283	adox	%r14,%r13
3284
3285	mulx	8*6($nptr),%rax,%r14
3286	adcx	%rax,%r13
3287	adox	%r15,%r14
3288
3289	mulx	8*7($nptr),%rax,%r15
3290	 mov	72+48+8(%rsp,%rcx,8),%rdx	# pull n0*a[i]
3291	adcx	%rax,%r14
3292	adox	$carry,%r15
3293	 mov	%rbx,($tptr,%rcx,8)	# save result
3294	 mov	%r8,%rbx
3295	adcx	$carry,%r15		# cf=0
3296
3297	inc	%rcx			# of=0
3298	jnz	.Lsqrx8x_tail
3299
3300	cmp	0+8(%rsp),$nptr		# end of n[]?
3301	jae	.Lsqrx8x_tail_done	# break out of loop
3302
3303	sub	16+8(%rsp),$carry	# mov 16(%rsp),%cf
3304	 mov	48+8(%rsp),%rdx		# pull n0*a[0]
3305	 lea	8*8($nptr),$nptr
3306	adc	8*0($tptr),%r8
3307	adc	8*1($tptr),%r9
3308	adc	8*2($tptr),%r10
3309	adc	8*3($tptr),%r11
3310	adc	8*4($tptr),%r12
3311	adc	8*5($tptr),%r13
3312	adc	8*6($tptr),%r14
3313	adc	8*7($tptr),%r15
3314	lea	8*8($tptr),$tptr
3315	sbb	%rax,%rax
3316	sub	\$8,%rcx		# mov	\$-8,%rcx
3317
3318	xor	$carry,$carry		# of=0, cf=0
3319	mov	%rax,16+8(%rsp)
3320	jmp	.Lsqrx8x_tail
3321
3322.align	32
3323.Lsqrx8x_tail_done:
3324	add	24+8(%rsp),%r8		# can this overflow?
3325	adc	\$0,%r9
3326	adc	\$0,%r10
3327	adc	\$0,%r11
3328	adc	\$0,%r12
3329	adc	\$0,%r13
3330	adc	\$0,%r14
3331	adc	\$0,%r15		# can't overflow, because we
3332					# started with "overhung" part
3333					# of multiplication
3334	mov	$carry,%rax		# xor	%rax,%rax
3335
3336	sub	16+8(%rsp),$carry	# mov 16(%rsp),%cf
3337.Lsqrx8x_no_tail:			# %cf is 0 if jumped here
3338	adc	8*0($tptr),%r8
3339	 movq	%xmm3,%rcx
3340	adc	8*1($tptr),%r9
3341	 mov	8*7($nptr),$carry
3342	 movq	%xmm2,$nptr		# restore $nptr
3343	adc	8*2($tptr),%r10
3344	adc	8*3($tptr),%r11
3345	adc	8*4($tptr),%r12
3346	adc	8*5($tptr),%r13
3347	adc	8*6($tptr),%r14
3348	adc	8*7($tptr),%r15
3349	adc	%rax,%rax		# top-most carry
3350
3351	mov	32+8(%rsp),%rbx		# n0
3352	mov	8*8($tptr,%rcx),%rdx	# modulo-scheduled "%r8"
3353
3354	mov	%r8,8*0($tptr)		# store top 512 bits
3355	 lea	8*8($tptr),%r8		# borrow %r8
3356	mov	%r9,8*1($tptr)
3357	mov	%r10,8*2($tptr)
3358	mov	%r11,8*3($tptr)
3359	mov	%r12,8*4($tptr)
3360	mov	%r13,8*5($tptr)
3361	mov	%r14,8*6($tptr)
3362	mov	%r15,8*7($tptr)
3363
3364	lea	8*8($tptr,%rcx),$tptr	# start of current t[] window
3365	cmp	8+8(%rsp),%r8		# end of t[]?
3366	jb	.Lsqrx8x_reduction_loop
3367	ret
3368.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
3369___
3370}
3371##############################################################
3372# Post-condition, 4x unrolled
3373#
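# This is the standard final step of Montgomery reduction: n is subtracted
# from the (possibly still >= n) result using a mask rather than a branch,
# so the operation stays constant-time.  Roughly, in C (illustrative names,
# not the literal borrow/andn trickery below):
#
#	mask = need_subtract ? ~(u64)0 : 0;
#	borrow = 0;
#	for (i = 0; i < num; i++)
#		r[i] = sub_borrow(t[i], n[i] & mask, &borrow);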
3374{
3375my ($rptr,$nptr)=("%rdx","%rbp");
3376$code.=<<___;
3377.align	32
3378__bn_postx4x_internal:
3379	mov	8*0($nptr),%r12
3380	mov	%rcx,%r10		# -$num
3381	mov	%rcx,%r9		# -$num
3382	neg	%rax
3383	sar	\$3+2,%rcx
3384	#lea	48+8(%rsp,%r9),$tptr
3385	movq	%xmm1,$rptr		# restore $rptr
3386	movq	%xmm1,$aptr		# prepare for back-to-back call
3387	dec	%r12			# so that after 'not' we get -n[0]
3388	mov	8*1($nptr),%r13
3389	xor	%r8,%r8
3390	mov	8*2($nptr),%r14
3391	mov	8*3($nptr),%r15
3392	jmp	.Lsqrx4x_sub_entry
3393
3394.align	16
3395.Lsqrx4x_sub:
3396	mov	8*0($nptr),%r12
3397	mov	8*1($nptr),%r13
3398	mov	8*2($nptr),%r14
3399	mov	8*3($nptr),%r15
3400.Lsqrx4x_sub_entry:
3401	andn	%rax,%r12,%r12
3402	lea	8*4($nptr),$nptr
3403	andn	%rax,%r13,%r13
3404	andn	%rax,%r14,%r14
3405	andn	%rax,%r15,%r15
3406
3407	neg	%r8			# mov %r8,%cf
3408	adc	8*0($tptr),%r12
3409	adc	8*1($tptr),%r13
3410	adc	8*2($tptr),%r14
3411	adc	8*3($tptr),%r15
3412	mov	%r12,8*0($rptr)
3413	lea	8*4($tptr),$tptr
3414	mov	%r13,8*1($rptr)
3415	sbb	%r8,%r8			# mov %cf,%r8
3416	mov	%r14,8*2($rptr)
3417	mov	%r15,8*3($rptr)
3418	lea	8*4($rptr),$rptr
3419
3420	inc	%rcx
3421	jnz	.Lsqrx4x_sub
3422
3423	neg	%r9			# restore $num
3424
3425	ret
3426.size	__bn_postx4x_internal,.-__bn_postx4x_internal
3427___
3428}
3429}}}
3430{
3431my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order
3432				("%rdi","%esi","%rdx","%ecx");  # Unix order
3433my $out=$inp;
3434my $STRIDE=2**5*8;
3435my $N=$STRIDE/4;
3436
3437$code.=<<___;
3438.globl	bn_get_bits5
3439.type	bn_get_bits5,\@abi-omnipotent
3440.align	16
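	# bn_get_bits5 appears to return the 5-bit window of the exponent
	# that starts at bit position "bits".  A C sketch (illustrative;
	# load_le16 stands for an unaligned little-endian 16-bit load):
	#
	#	int bn_get_bits5(const unsigned char *ap, int bits)
	#	{
	#		int wrd = bits / 16, off = bits % 16;
	#		const unsigned char *p = ap;
	#		if (off > 11) { p++, off -= 8; }   /* window spans words */
	#		return (load_le16(p + 2*wrd) >> off) & 31;
	#	}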
3441bn_get_bits5:
3442	lea	0($inp),%r10
3443	lea	1($inp),%r11
3444	mov	$num,%ecx
3445	shr	\$4,$num
3446	and	\$15,%ecx
3447	lea	-8(%ecx),%eax
3448	cmp	\$11,%ecx
3449	cmova	%r11,%r10
3450	cmova	%eax,%ecx
3451	movzw	(%r10,$num,2),%eax
3452	shrl	%cl,%eax
3453	and	\$31,%eax
3454	ret
3455.size	bn_get_bits5,.-bn_get_bits5
3456
3457.globl	bn_scatter5
3458.type	bn_scatter5,\@abi-omnipotent
3459.align	16
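	# bn_scatter5 stores the num words of inp into column idx of a
	# 32-column table, i.e. roughly (illustrative C):
	#
	#	for (i = 0; i < num; i++)
	#		tbl[32*i + idx] = inp[i];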
3460bn_scatter5:
3461	cmp	\$0, $num
3462	jz	.Lscatter_epilogue
3463	lea	($tbl,$idx,8),$tbl
3464.Lscatter:
3465	mov	($inp),%rax
3466	lea	8($inp),$inp
3467	mov	%rax,($tbl)
3468	lea	32*8($tbl),$tbl
3469	sub	\$1,$num
3470	jnz	.Lscatter
3471.Lscatter_epilogue:
3472	ret
3473.size	bn_scatter5,.-bn_scatter5
3474
3475.globl	bn_gather5
3476.type	bn_gather5,\@abi-omnipotent
3477.align	32
3478bn_gather5:
3479.LSEH_begin_bn_gather5:			# Win64 thing, but harmless in other cases
3480	# I can't trust assembler to use specific encoding:-(
3481	.byte	0x4c,0x8d,0x14,0x24			#lea    (%rsp),%r10
3482	.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00	#sub	$0x108,%rsp
3483	lea	.Linc(%rip),%rax
3484	and	\$-16,%rsp		# shouldn't be formally required
3485
3486	movd	$idx,%xmm5
3487	movdqa	0(%rax),%xmm0		# 00000001000000010000000000000000
3488	movdqa	16(%rax),%xmm1		# 00000002000000020000000200000002
3489	lea	128($tbl),%r11		# size optimization
3490	lea	128(%rsp),%rax		# size optimization
3491
3492	pshufd	\$0,%xmm5,%xmm5		# broadcast $idx
3493	movdqa	%xmm1,%xmm4
3494	movdqa	%xmm1,%xmm2
3495___
3496########################################################################
3497# calculate mask by comparing 0..31 to $idx and save result to stack
3498#
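# bn_gather5 is the constant-time counterpart of bn_scatter5: with the 32
# qword masks built below (mask[k] is all-ones iff k == idx), each output
# word is assembled as, in scalar terms (illustrative C; the real code
# works sixteen bytes at a time):
#
#	for (i = 0; i < num; i++) {
#		u64 w = 0;
#		for (k = 0; k < 32; k++)
#			w |= tbl[32*i + k] & mask[k];
#		out[i] = w;	/* == tbl[32*i + idx], no secret-dependent address */
#	}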
3499for($i=0;$i<$STRIDE/16;$i+=4) {
3500$code.=<<___;
3501	paddd	%xmm0,%xmm1
3502	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
3503___
3504$code.=<<___	if ($i);
3505	movdqa	%xmm3,`16*($i-1)-128`(%rax)
3506___
3507$code.=<<___;
3508	movdqa	%xmm4,%xmm3
3509
3510	paddd	%xmm1,%xmm2
3511	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
3512	movdqa	%xmm0,`16*($i+0)-128`(%rax)
3513	movdqa	%xmm4,%xmm0
3514
3515	paddd	%xmm2,%xmm3
3516	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
3517	movdqa	%xmm1,`16*($i+1)-128`(%rax)
3518	movdqa	%xmm4,%xmm1
3519
3520	paddd	%xmm3,%xmm0
3521	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
3522	movdqa	%xmm2,`16*($i+2)-128`(%rax)
3523	movdqa	%xmm4,%xmm2
3524___
3525}
3526$code.=<<___;
3527	movdqa	%xmm3,`16*($i-1)-128`(%rax)
3528	jmp	.Lgather
3529
3530.align	32
3531.Lgather:
3532	pxor	%xmm4,%xmm4
3533	pxor	%xmm5,%xmm5
3534___
3535for($i=0;$i<$STRIDE/16;$i+=4) {
3536$code.=<<___;
3537	movdqa	`16*($i+0)-128`(%r11),%xmm0
3538	movdqa	`16*($i+1)-128`(%r11),%xmm1
3539	movdqa	`16*($i+2)-128`(%r11),%xmm2
3540	pand	`16*($i+0)-128`(%rax),%xmm0
3541	movdqa	`16*($i+3)-128`(%r11),%xmm3
3542	pand	`16*($i+1)-128`(%rax),%xmm1
3543	por	%xmm0,%xmm4
3544	pand	`16*($i+2)-128`(%rax),%xmm2
3545	por	%xmm1,%xmm5
3546	pand	`16*($i+3)-128`(%rax),%xmm3
3547	por	%xmm2,%xmm4
3548	por	%xmm3,%xmm5
3549___
3550}
3551$code.=<<___;
3552	por	%xmm5,%xmm4
3553	lea	$STRIDE(%r11),%r11
3554	pshufd	\$0x4e,%xmm4,%xmm0
3555	por	%xmm4,%xmm0
3556	movq	%xmm0,($out)		# m0=bp[0]
3557	lea	8($out),$out
3558	sub	\$1,$num
3559	jnz	.Lgather
3560
3561	lea	(%r10),%rsp
3562	ret
3563.LSEH_end_bn_gather5:
3564.size	bn_gather5,.-bn_gather5
3565___
3566}
3567$code.=<<___;
3568.align	64
3569.Linc:
3570	.long	0,0, 1,1
3571	.long	2,2, 2,2
3572.asciz	"Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
3573___
3574
3575# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
3576#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
3577if ($win64) {
3578$rec="%rcx";
3579$frame="%rdx";
3580$context="%r8";
3581$disp="%r9";
3582
3583$code.=<<___;
3584.extern	__imp_RtlVirtualUnwind
3585.type	mul_handler,\@abi-omnipotent
3586.align	16
3587mul_handler:
3588	push	%rsi
3589	push	%rdi
3590	push	%rbx
3591	push	%rbp
3592	push	%r12
3593	push	%r13
3594	push	%r14
3595	push	%r15
3596	pushfq
3597	sub	\$64,%rsp
3598
3599	mov	120($context),%rax	# pull context->Rax
3600	mov	248($context),%rbx	# pull context->Rip
3601
3602	mov	8($disp),%rsi		# disp->ImageBase
3603	mov	56($disp),%r11		# disp->HandlerData
3604
3605	mov	0(%r11),%r10d		# HandlerData[0]
3606	lea	(%rsi,%r10),%r10	# end of prologue label
3607	cmp	%r10,%rbx		# context->Rip<end of prologue label
3608	jb	.Lcommon_seh_tail
3609
3610	mov	152($context),%rax	# pull context->Rsp
3611
3612	mov	4(%r11),%r10d		# HandlerData[1]
3613	lea	(%rsi,%r10),%r10	# epilogue label
3614	cmp	%r10,%rbx		# context->Rip>=epilogue label
3615	jae	.Lcommon_seh_tail
3616
3617	lea	.Lmul_epilogue(%rip),%r10
3618	cmp	%r10,%rbx
3619	ja	.Lbody_40
3620
3621	mov	192($context),%r10	# pull $num
3622	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
3623
3624	jmp	.Lbody_proceed
3625
3626.Lbody_40:
3627	mov	40(%rax),%rax		# pull saved stack pointer
3628.Lbody_proceed:
3629	mov	-8(%rax),%rbx
3630	mov	-16(%rax),%rbp
3631	mov	-24(%rax),%r12
3632	mov	-32(%rax),%r13
3633	mov	-40(%rax),%r14
3634	mov	-48(%rax),%r15
3635	mov	%rbx,144($context)	# restore context->Rbx
3636	mov	%rbp,160($context)	# restore context->Rbp
3637	mov	%r12,216($context)	# restore context->R12
3638	mov	%r13,224($context)	# restore context->R13
3639	mov	%r14,232($context)	# restore context->R14
3640	mov	%r15,240($context)	# restore context->R15
3641
3642.Lcommon_seh_tail:
3643	mov	8(%rax),%rdi
3644	mov	16(%rax),%rsi
3645	mov	%rax,152($context)	# restore context->Rsp
3646	mov	%rsi,168($context)	# restore context->Rsi
3647	mov	%rdi,176($context)	# restore context->Rdi
3648
3649	mov	40($disp),%rdi		# disp->ContextRecord
3650	mov	$context,%rsi		# context
3651	mov	\$154,%ecx		# sizeof(CONTEXT)
3652	.long	0xa548f3fc		# cld; rep movsq
3653
3654	mov	$disp,%rsi
3655	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
3656	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
3657	mov	0(%rsi),%r8		# arg3, disp->ControlPc
3658	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
3659	mov	40(%rsi),%r10		# disp->ContextRecord
3660	lea	56(%rsi),%r11		# &disp->HandlerData
3661	lea	24(%rsi),%r12		# &disp->EstablisherFrame
3662	mov	%r10,32(%rsp)		# arg5
3663	mov	%r11,40(%rsp)		# arg6
3664	mov	%r12,48(%rsp)		# arg7
3665	mov	%rcx,56(%rsp)		# arg8, (NULL)
3666	call	*__imp_RtlVirtualUnwind(%rip)
3667
3668	mov	\$1,%eax		# ExceptionContinueSearch
3669	add	\$64,%rsp
3670	popfq
3671	pop	%r15
3672	pop	%r14
3673	pop	%r13
3674	pop	%r12
3675	pop	%rbp
3676	pop	%rbx
3677	pop	%rdi
3678	pop	%rsi
3679	ret
3680.size	mul_handler,.-mul_handler
3681
3682.section	.pdata
3683.align	4
3684	.rva	.LSEH_begin_bn_mul_mont_gather5
3685	.rva	.LSEH_end_bn_mul_mont_gather5
3686	.rva	.LSEH_info_bn_mul_mont_gather5
3687
3688	.rva	.LSEH_begin_bn_mul4x_mont_gather5
3689	.rva	.LSEH_end_bn_mul4x_mont_gather5
3690	.rva	.LSEH_info_bn_mul4x_mont_gather5
3691
3692	.rva	.LSEH_begin_bn_power5
3693	.rva	.LSEH_end_bn_power5
3694	.rva	.LSEH_info_bn_power5
3695
3696	.rva	.LSEH_begin_bn_from_mont8x
3697	.rva	.LSEH_end_bn_from_mont8x
3698	.rva	.LSEH_info_bn_from_mont8x
3699___
3700$code.=<<___ if ($addx);
3701	.rva	.LSEH_begin_bn_mulx4x_mont_gather5
3702	.rva	.LSEH_end_bn_mulx4x_mont_gather5
3703	.rva	.LSEH_info_bn_mulx4x_mont_gather5
3704
3705	.rva	.LSEH_begin_bn_powerx5
3706	.rva	.LSEH_end_bn_powerx5
3707	.rva	.LSEH_info_bn_powerx5
3708___
3709$code.=<<___;
3710	.rva	.LSEH_begin_bn_gather5
3711	.rva	.LSEH_end_bn_gather5
3712	.rva	.LSEH_info_bn_gather5
3713
3714.section	.xdata
3715.align	8
3716.LSEH_info_bn_mul_mont_gather5:
3717	.byte	9,0,0,0
3718	.rva	mul_handler
3719	.rva	.Lmul_body,.Lmul_epilogue		# HandlerData[]
3720.align	8
3721.LSEH_info_bn_mul4x_mont_gather5:
3722	.byte	9,0,0,0
3723	.rva	mul_handler
3724	.rva	.Lmul4x_body,.Lmul4x_epilogue		# HandlerData[]
3725.align	8
3726.LSEH_info_bn_power5:
3727	.byte	9,0,0,0
3728	.rva	mul_handler
3729	.rva	.Lpower5_body,.Lpower5_epilogue		# HandlerData[]
3730.align	8
3731.LSEH_info_bn_from_mont8x:
3732	.byte	9,0,0,0
3733	.rva	mul_handler
3734	.rva	.Lfrom_body,.Lfrom_epilogue		# HandlerData[]
3735___
3736$code.=<<___ if ($addx);
3737.align	8
3738.LSEH_info_bn_mulx4x_mont_gather5:
3739	.byte	9,0,0,0
3740	.rva	mul_handler
3741	.rva	.Lmulx4x_body,.Lmulx4x_epilogue		# HandlerData[]
3742.align	8
3743.LSEH_info_bn_powerx5:
3744	.byte	9,0,0,0
3745	.rva	mul_handler
3746	.rva	.Lpowerx5_body,.Lpowerx5_epilogue	# HandlerData[]
3747___
3748$code.=<<___;
3749.align	8
3750.LSEH_info_bn_gather5:
3751	.byte	0x01,0x0b,0x03,0x0a
3752	.byte	0x0b,0x01,0x21,0x00	# sub	rsp,0x108
3753	.byte	0x04,0xa3,0x00,0x00	# lea	r10,(rsp)
3754.align	8
3755___
3756}
3757
3758$code =~ s/\`([^\`]*)\`/eval($1)/gem;
3759
3760print $code;
3761close STDOUT;
3762