x86_64-mont5.pl revision 325337
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# August 2011.
11#
12# Companion to x86_64-mont.pl that optimizes cache-timing attack
13# countermeasures. The subroutines are produced by replacing bp[i]
14# references in their x86_64-mont.pl counterparts with cache-neutral
15# references to the powers table computed in BN_mod_exp_mont_consttime.
16# In addition, a subroutine that scatters elements of the powers table
17# is implemented, so that scattering/gathering can be tuned without
18# modifying bn_exp.c.
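#
# A rough illustration of the gather idea in C terms (a sketch only, not
# the code generated below; names such as "gather" and "table" are made
# up for this example): instead of indexing the powers table with the
# secret index directly, every entry is read and masked, so the memory
# access pattern does not depend on the index:
#
#	BN_ULONG gather(const BN_ULONG table[32], int idx)
#	{
#		BN_ULONG acc = 0;
#		for (int i = 0; i < 32; i++) {
#			BN_ULONG mask = 0 - (BN_ULONG)(i == idx);
#			acc |= table[i] & mask;	/* mask is all-ones or zero */
#		}
#		return acc;
#	}
#
# The SSE2 code below does the same thing 128 bits at a time, building
# the masks with pcmpeqd and combining the selected entry with pand/por.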
19
20# August 2013.
21#
22# Add MULX/AD*X code paths and additional interfaces to optimize for
23# the branch prediction unit. For input lengths that are multiples of 8,
24# the np argument is not just the modulus, but the modulus interleaved
25# with 0. This is to optimize the post-condition...
26
27$flavour = shift;
28$output  = shift;
29if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
30
31$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
32
33$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
34( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
35( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
36die "can't locate x86_64-xlate.pl";
37
38open OUT,"| \"$^X\" $xlate $flavour $output";
39*STDOUT=*OUT;
40
41if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
42		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
43	$addx = ($1>=2.23);
44}
45
46if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
47	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
48	$addx = ($1>=2.10);
49}
50
51if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
52	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
53	$addx = ($1>=12);
54}
55
56if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9])\.([0-9]+)/) {
57	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
58	$addx = ($ver>=3.03);
59}
60
61# int bn_mul_mont_gather5(
62$rp="%rdi";	# BN_ULONG *rp,
63$ap="%rsi";	# const BN_ULONG *ap,
64$bp="%rdx";	# const BN_ULONG *bp,
65$np="%rcx";	# const BN_ULONG *np,
66$n0="%r8";	# const BN_ULONG *n0,
67$num="%r9";	# int num,
68		# int idx);	# 0 to 2^5-1, "index" in $bp holding
69				# pre-computed powers of a', interlaced
70				# in such a manner that b[0] is $bp[idx],
71				# b[1] is $bp[2^5+idx], etc.
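# For reference, the argument list above corresponds to the following C
# prototype (a restatement of these comments, not a quote from a header):
#
#	int bn_mul_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap,
#	                        const BN_ULONG *bp, const BN_ULONG *np,
#	                        const BN_ULONG *n0, int num, int idx);
#
# bp points at the scattered powers table and idx selects which power is
# gathered to serve as the effective multiplicand b[].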
72$lo0="%r10";
73$hi0="%r11";
74$hi1="%r13";
75$i="%r14";
76$j="%r15";
77$m0="%rbx";
78$m1="%rbp";
79
80$code=<<___;
81.text
82
83.extern	OPENSSL_ia32cap_P
84
85.globl	bn_mul_mont_gather5
86.type	bn_mul_mont_gather5,\@function,6
87.align	64
88bn_mul_mont_gather5:
89	mov	${num}d,${num}d
90	mov	%rsp,%rax
91	test	\$7,${num}d
92	jnz	.Lmul_enter
93___
94$code.=<<___ if ($addx);
95	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
96___
97$code.=<<___;
98	jmp	.Lmul4x_enter
99
100.align	16
101.Lmul_enter:
102	movd	`($win64?56:8)`(%rsp),%xmm5	# load 7th argument
103	push	%rbx
104	push	%rbp
105	push	%r12
106	push	%r13
107	push	%r14
108	push	%r15
109
110	neg	$num
111	mov	%rsp,%r11
112	lea	-280(%rsp,$num,8),%r10	# future alloca(8*(num+2)+256+8)
113	neg	$num			# restore $num
114	and	\$-1024,%r10		# minimize TLB usage
115
116	# Some OSes, *cough*-dows, insist on the stack being "wired" to
117	# physical memory in a strictly sequential manner, i.e. if a stack
118	# allocation spans two pages, then a reference to the farther one
119	# can be punished with a SEGV. But page walking does good even on
120	# other OSes, because it guarantees that a villainous thread hits
121	# the guard page before it can do damage to an innocent one...
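	# In C-like terms, the walk below amounts to (a sketch):
	#
	#	for (p = old_sp; p > new_sp; p -= 4096)
	#		(void)*(volatile char *)p;	/* touch each page */
	#
	# i.e. pages are touched from the old stack pointer down to the
	# new one, one page at a time.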
122	sub	%r10,%r11
123	and	\$-4096,%r11
124	lea	(%r10,%r11),%rsp
125	mov	(%rsp),%r11
126	cmp	%r10,%rsp
127	ja	.Lmul_page_walk
128	jmp	.Lmul_page_walk_done
129
130.Lmul_page_walk:
131	lea	-4096(%rsp),%rsp
132	mov	(%rsp),%r11
133	cmp	%r10,%rsp
134	ja	.Lmul_page_walk
135.Lmul_page_walk_done:
136
137	lea	.Linc(%rip),%r10
138	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
139.Lmul_body:
140
141	lea	128($bp),%r12		# reassign $bp (+size optimization)
142___
143		$bp="%r12";
144		$STRIDE=2**5*8;		# 5 is "window size"
145		$N=$STRIDE/4;		# should match cache line size
146$code.=<<___;
147	movdqa	0(%r10),%xmm0		# 00000001000000010000000000000000
148	movdqa	16(%r10),%xmm1		# 00000002000000020000000200000002
149	lea	24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization)
150	and	\$-16,%r10
151
152	pshufd	\$0,%xmm5,%xmm5		# broadcast index
153	movdqa	%xmm1,%xmm4
154	movdqa	%xmm1,%xmm2
155___
156########################################################################
157# calculate mask by comparing 0..31 to index and save result to stack
158#
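# In C-like terms, the sequence below fills a 256-byte mask on the stack
# (a sketch; the mask lives at offset 112 from %r10 and up):
#
#	uint64_t mask[32];			/* one lane per table index */
#	for (i = 0; i < 32; i++)
#		mask[i] = (i == idx) ? ~(uint64_t)0 : 0;
#
# Each pcmpeqd compares two consecutive candidate indices against the
# broadcast idx and yields the corresponding all-ones/all-zeros lanes.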
159$code.=<<___;
160	paddd	%xmm0,%xmm1
161	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
162	.byte	0x67
163	movdqa	%xmm4,%xmm3
164___
165for($k=0;$k<$STRIDE/16-4;$k+=4) {
166$code.=<<___;
167	paddd	%xmm1,%xmm2
168	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
169	movdqa	%xmm0,`16*($k+0)+112`(%r10)
170	movdqa	%xmm4,%xmm0
171
172	paddd	%xmm2,%xmm3
173	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
174	movdqa	%xmm1,`16*($k+1)+112`(%r10)
175	movdqa	%xmm4,%xmm1
176
177	paddd	%xmm3,%xmm0
178	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
179	movdqa	%xmm2,`16*($k+2)+112`(%r10)
180	movdqa	%xmm4,%xmm2
181
182	paddd	%xmm0,%xmm1
183	pcmpeqd	%xmm5,%xmm0
184	movdqa	%xmm3,`16*($k+3)+112`(%r10)
185	movdqa	%xmm4,%xmm3
186___
187}
188$code.=<<___;				# last iteration can be optimized
189	paddd	%xmm1,%xmm2
190	pcmpeqd	%xmm5,%xmm1
191	movdqa	%xmm0,`16*($k+0)+112`(%r10)
192
193	paddd	%xmm2,%xmm3
194	.byte	0x67
195	pcmpeqd	%xmm5,%xmm2
196	movdqa	%xmm1,`16*($k+1)+112`(%r10)
197
198	pcmpeqd	%xmm5,%xmm3
199	movdqa	%xmm2,`16*($k+2)+112`(%r10)
200	pand	`16*($k+0)-128`($bp),%xmm0	# while it's still in register
201
202	pand	`16*($k+1)-128`($bp),%xmm1
203	pand	`16*($k+2)-128`($bp),%xmm2
204	movdqa	%xmm3,`16*($k+3)+112`(%r10)
205	pand	`16*($k+3)-128`($bp),%xmm3
206	por	%xmm2,%xmm0
207	por	%xmm3,%xmm1
208___
209for($k=0;$k<$STRIDE/16-4;$k+=4) {
210$code.=<<___;
211	movdqa	`16*($k+0)-128`($bp),%xmm4
212	movdqa	`16*($k+1)-128`($bp),%xmm5
213	movdqa	`16*($k+2)-128`($bp),%xmm2
214	pand	`16*($k+0)+112`(%r10),%xmm4
215	movdqa	`16*($k+3)-128`($bp),%xmm3
216	pand	`16*($k+1)+112`(%r10),%xmm5
217	por	%xmm4,%xmm0
218	pand	`16*($k+2)+112`(%r10),%xmm2
219	por	%xmm5,%xmm1
220	pand	`16*($k+3)+112`(%r10),%xmm3
221	por	%xmm2,%xmm0
222	por	%xmm3,%xmm1
223___
224}
225$code.=<<___;
226	por	%xmm1,%xmm0
227	pshufd	\$0x4e,%xmm0,%xmm1
228	por	%xmm1,%xmm0
229	lea	$STRIDE($bp),$bp
230	movq	%xmm0,$m0		# m0=bp[0]
231
232	mov	($n0),$n0		# pull n0[0] value
233	mov	($ap),%rax
234
235	xor	$i,$i			# i=0
236	xor	$j,$j			# j=0
237
238	mov	$n0,$m1
239	mulq	$m0			# ap[0]*bp[0]
240	mov	%rax,$lo0
241	mov	($np),%rax
242
243	imulq	$lo0,$m1		# "tp[0]"*n0
244	mov	%rdx,$hi0
245
246	mulq	$m1			# np[0]*m1
247	add	%rax,$lo0		# discarded
248	mov	8($ap),%rax
249	adc	\$0,%rdx
250	mov	%rdx,$hi1
251
252	lea	1($j),$j		# j++
253	jmp	.L1st_enter
254
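	# What .L1st/.Louter below compute, in rough C terms (a sketch of
	# word-by-word Montgomery multiplication, not the exact register
	# assignment):
	#
	#	for (i = 0; i < num; i++) {		/* .Louter       */
	#		m0 = b[i];			/* gathered word */
	#		m1 = (tp[0] + a[0]*m0) * n0;	/* mod 2^64      */
	#		/* tp = (tp + m0*a + m1*n) >> 64, done one word at */
	#		/* a time so tp[j-1] is stored as tp[j] is formed  */
	#	}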
255.align	16
256.L1st:
257	add	%rax,$hi1
258	mov	($ap,$j,8),%rax
259	adc	\$0,%rdx
260	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
261	mov	$lo0,$hi0
262	adc	\$0,%rdx
263	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
264	mov	%rdx,$hi1
265
266.L1st_enter:
267	mulq	$m0			# ap[j]*bp[0]
268	add	%rax,$hi0
269	mov	($np,$j,8),%rax
270	adc	\$0,%rdx
271	lea	1($j),$j		# j++
272	mov	%rdx,$lo0
273
274	mulq	$m1			# np[j]*m1
275	cmp	$num,$j
276	jne	.L1st			# note that upon exit $j==$num, so
277					# they can be used interchangeably
278
279	add	%rax,$hi1
280	adc	\$0,%rdx
281	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
282	adc	\$0,%rdx
283	mov	$hi1,-16(%rsp,$num,8)	# tp[num-1]
284	mov	%rdx,$hi1
285	mov	$lo0,$hi0
286
287	xor	%rdx,%rdx
288	add	$hi0,$hi1
289	adc	\$0,%rdx
290	mov	$hi1,-8(%rsp,$num,8)
291	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
292
293	lea	1($i),$i		# i++
294	jmp	.Louter
295.align	16
296.Louter:
297	lea	24+128(%rsp,$num,8),%rdx	# where 256-byte mask is (+size optimization)
298	and	\$-16,%rdx
299	pxor	%xmm4,%xmm4
300	pxor	%xmm5,%xmm5
301___
302for($k=0;$k<$STRIDE/16;$k+=4) {
303$code.=<<___;
304	movdqa	`16*($k+0)-128`($bp),%xmm0
305	movdqa	`16*($k+1)-128`($bp),%xmm1
306	movdqa	`16*($k+2)-128`($bp),%xmm2
307	movdqa	`16*($k+3)-128`($bp),%xmm3
308	pand	`16*($k+0)-128`(%rdx),%xmm0
309	pand	`16*($k+1)-128`(%rdx),%xmm1
310	por	%xmm0,%xmm4
311	pand	`16*($k+2)-128`(%rdx),%xmm2
312	por	%xmm1,%xmm5
313	pand	`16*($k+3)-128`(%rdx),%xmm3
314	por	%xmm2,%xmm4
315	por	%xmm3,%xmm5
316___
317}
318$code.=<<___;
319	por	%xmm5,%xmm4
320	pshufd	\$0x4e,%xmm4,%xmm0
321	por	%xmm4,%xmm0
322	lea	$STRIDE($bp),$bp
323
324	mov	($ap),%rax		# ap[0]
325	movq	%xmm0,$m0		# m0=bp[i]
326
327	xor	$j,$j			# j=0
328	mov	$n0,$m1
329	mov	(%rsp),$lo0
330
331	mulq	$m0			# ap[0]*bp[i]
332	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
333	mov	($np),%rax
334	adc	\$0,%rdx
335
336	imulq	$lo0,$m1		# tp[0]*n0
337	mov	%rdx,$hi0
338
339	mulq	$m1			# np[0]*m1
340	add	%rax,$lo0		# discarded
341	mov	8($ap),%rax
342	adc	\$0,%rdx
343	mov	8(%rsp),$lo0		# tp[1]
344	mov	%rdx,$hi1
345
346	lea	1($j),$j		# j++
347	jmp	.Linner_enter
348
349.align	16
350.Linner:
351	add	%rax,$hi1
352	mov	($ap,$j,8),%rax
353	adc	\$0,%rdx
354	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
355	mov	(%rsp,$j,8),$lo0
356	adc	\$0,%rdx
357	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
358	mov	%rdx,$hi1
359
360.Linner_enter:
361	mulq	$m0			# ap[j]*bp[i]
362	add	%rax,$hi0
363	mov	($np,$j,8),%rax
364	adc	\$0,%rdx
365	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
366	mov	%rdx,$hi0
367	adc	\$0,$hi0
368	lea	1($j),$j		# j++
369
370	mulq	$m1			# np[j]*m1
371	cmp	$num,$j
372	jne	.Linner			# note that upon exit $j==$num, so
373					# they can be used interchangeably
374	add	%rax,$hi1
375	adc	\$0,%rdx
376	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
377	mov	(%rsp,$num,8),$lo0
378	adc	\$0,%rdx
379	mov	$hi1,-16(%rsp,$num,8)	# tp[num-1]
380	mov	%rdx,$hi1
381
382	xor	%rdx,%rdx
383	add	$hi0,$hi1
384	adc	\$0,%rdx
385	add	$lo0,$hi1		# pull upmost overflow bit
386	adc	\$0,%rdx
387	mov	$hi1,-8(%rsp,$num,8)
388	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
389
390	lea	1($i),$i		# i++
391	cmp	$num,$i
392	jb	.Louter
393
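	# The .Lsub/.Lcopy sequence below amounts to (a sketch):
	#
	#	borrow = subtract(rp, tp, np, num);	/* rp = tp - np      */
	#	src    = borrow ? tp : rp;		/* chosen via masks, */
	#						/* not via a branch  */
	#	for (i = 0; i < num; i++) { rp[i] = src[i]; tp[i] = 0; }
	#
	# i.e. the result is reduced by np only if the subtraction did not
	# underflow, and the temporary vector is zapped while copying.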
394	xor	$i,$i			# i=0 and clear CF!
395	mov	(%rsp),%rax		# tp[0]
396	lea	(%rsp),$ap		# borrow ap for tp
397	mov	$num,$j			# j=num
398	jmp	.Lsub
399.align	16
400.Lsub:	sbb	($np,$i,8),%rax
401	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
402	mov	8($ap,$i,8),%rax	# tp[i+1]
403	lea	1($i),$i		# i++
404	dec	$j			# doesn't affect CF!
405	jnz	.Lsub
406
407	sbb	\$0,%rax		# handle upmost overflow bit
408	xor	$i,$i
409	and	%rax,$ap
410	not	%rax
411	mov	$rp,$np
412	and	%rax,$np
413	mov	$num,$j			# j=num
414	or	$np,$ap			# ap=borrow?tp:rp
415.align	16
416.Lcopy:					# copy or in-place refresh
417	mov	($ap,$i,8),%rax
418	mov	$i,(%rsp,$i,8)		# zap temporary vector
419	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
420	lea	1($i),$i
421	sub	\$1,$j
422	jnz	.Lcopy
423
424	mov	8(%rsp,$num,8),%rsi	# restore %rsp
425	mov	\$1,%rax
426
427	mov	-48(%rsi),%r15
428	mov	-40(%rsi),%r14
429	mov	-32(%rsi),%r13
430	mov	-24(%rsi),%r12
431	mov	-16(%rsi),%rbp
432	mov	-8(%rsi),%rbx
433	lea	(%rsi),%rsp
434.Lmul_epilogue:
435	ret
436.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
437___
438{{{
439my @A=("%r10","%r11");
440my @N=("%r13","%rdi");
441$code.=<<___;
442.type	bn_mul4x_mont_gather5,\@function,6
443.align	32
444bn_mul4x_mont_gather5:
445	.byte	0x67
446	mov	%rsp,%rax
447.Lmul4x_enter:
448___
449$code.=<<___ if ($addx);
450	and	\$0x80108,%r11d
451	cmp	\$0x80108,%r11d		# check for AD*X+BMI2+BMI1
452	je	.Lmulx4x_enter
453___
454$code.=<<___;
455	push	%rbx
456	push	%rbp
457	push	%r12
458	push	%r13
459	push	%r14
460	push	%r15
461.Lmul4x_prologue:
462
463	.byte	0x67
464	shl	\$3,${num}d		# convert $num to bytes
465	lea	($num,$num,2),%r10	# 3*$num in bytes
466	neg	$num			# -$num
467
468	##############################################################
469	# Ensure that stack frame doesn't alias with $rptr+3*$num
470	# modulo 4096, which covers ret[num], am[num] and n[num]
471	# (see bn_exp.c). This is done to allow memory disambiguation
472# logic to do its magic. [An extra [num] is allocated in order
473# to align with bn_power5's frame, which is cleansed after
474# completing the exponentiation. An extra 256 bytes is for the power
475# mask calculated from the 7th argument, the index.]
476	#
477	lea	-320(%rsp,$num,2),%r11
478	mov	%rsp,%rbp
479	sub	$rp,%r11
480	and	\$4095,%r11
481	cmp	%r11,%r10
482	jb	.Lmul4xsp_alt
483	sub	%r11,%rbp		# align with $rp
484	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*num*8+256)
485	jmp	.Lmul4xsp_done
486
487.align	32
488.Lmul4xsp_alt:
489	lea	4096-320(,$num,2),%r10
490	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*num*8+256)
491	sub	%r10,%r11
492	mov	\$0,%r10
493	cmovc	%r10,%r11
494	sub	%r11,%rbp
495.Lmul4xsp_done:
496	and	\$-64,%rbp
497	mov	%rsp,%r11
498	sub	%rbp,%r11
499	and	\$-4096,%r11
500	lea	(%rbp,%r11),%rsp
501	mov	(%rsp),%r10
502	cmp	%rbp,%rsp
503	ja	.Lmul4x_page_walk
504	jmp	.Lmul4x_page_walk_done
505
506.Lmul4x_page_walk:
507	lea	-4096(%rsp),%rsp
508	mov	(%rsp),%r10
509	cmp	%rbp,%rsp
510	ja	.Lmul4x_page_walk
511.Lmul4x_page_walk_done:
512
513	neg	$num
514
515	mov	%rax,40(%rsp)
516.Lmul4x_body:
517
518	call	mul4x_internal
519
520	mov	40(%rsp),%rsi		# restore %rsp
521	mov	\$1,%rax
522
523	mov	-48(%rsi),%r15
524	mov	-40(%rsi),%r14
525	mov	-32(%rsi),%r13
526	mov	-24(%rsi),%r12
527	mov	-16(%rsi),%rbp
528	mov	-8(%rsi),%rbx
529	lea	(%rsi),%rsp
530.Lmul4x_epilogue:
531	ret
532.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
533
534.type	mul4x_internal,\@abi-omnipotent
535.align	32
536mul4x_internal:
537	shl	\$5,$num		# $num was in bytes
538	movd	`($win64?56:8)`(%rax),%xmm5	# load 7th argument, index
539	lea	.Linc(%rip),%rax
540	lea	128(%rdx,$num),%r13	# end of powers table (+size optimization)
541	shr	\$5,$num		# restore $num
542___
543		$bp="%r12";
544		$STRIDE=2**5*8;		# 5 is "window size"
545		$N=$STRIDE/4;		# should match cache line size
546		$tp=$i;
547$code.=<<___;
548	movdqa	0(%rax),%xmm0		# 00000001000000010000000000000000
549	movdqa	16(%rax),%xmm1		# 00000002000000020000000200000002
550	lea	88-112(%rsp,$num),%r10	# place the mask after tp[num+1] (+ICache optimization)
551	lea	128(%rdx),$bp		# size optimization
552
553	pshufd	\$0,%xmm5,%xmm5		# broadcast index
554	movdqa	%xmm1,%xmm4
555	.byte	0x67,0x67
556	movdqa	%xmm1,%xmm2
557___
558########################################################################
559# calculate mask by comparing 0..31 to index and save result to stack
560#
561$code.=<<___;
562	paddd	%xmm0,%xmm1
563	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
564	.byte	0x67
565	movdqa	%xmm4,%xmm3
566___
567for($i=0;$i<$STRIDE/16-4;$i+=4) {
568$code.=<<___;
569	paddd	%xmm1,%xmm2
570	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
571	movdqa	%xmm0,`16*($i+0)+112`(%r10)
572	movdqa	%xmm4,%xmm0
573
574	paddd	%xmm2,%xmm3
575	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
576	movdqa	%xmm1,`16*($i+1)+112`(%r10)
577	movdqa	%xmm4,%xmm1
578
579	paddd	%xmm3,%xmm0
580	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
581	movdqa	%xmm2,`16*($i+2)+112`(%r10)
582	movdqa	%xmm4,%xmm2
583
584	paddd	%xmm0,%xmm1
585	pcmpeqd	%xmm5,%xmm0
586	movdqa	%xmm3,`16*($i+3)+112`(%r10)
587	movdqa	%xmm4,%xmm3
588___
589}
590$code.=<<___;				# last iteration can be optimized
591	paddd	%xmm1,%xmm2
592	pcmpeqd	%xmm5,%xmm1
593	movdqa	%xmm0,`16*($i+0)+112`(%r10)
594
595	paddd	%xmm2,%xmm3
596	.byte	0x67
597	pcmpeqd	%xmm5,%xmm2
598	movdqa	%xmm1,`16*($i+1)+112`(%r10)
599
600	pcmpeqd	%xmm5,%xmm3
601	movdqa	%xmm2,`16*($i+2)+112`(%r10)
602	pand	`16*($i+0)-128`($bp),%xmm0	# while it's still in register
603
604	pand	`16*($i+1)-128`($bp),%xmm1
605	pand	`16*($i+2)-128`($bp),%xmm2
606	movdqa	%xmm3,`16*($i+3)+112`(%r10)
607	pand	`16*($i+3)-128`($bp),%xmm3
608	por	%xmm2,%xmm0
609	por	%xmm3,%xmm1
610___
611for($i=0;$i<$STRIDE/16-4;$i+=4) {
612$code.=<<___;
613	movdqa	`16*($i+0)-128`($bp),%xmm4
614	movdqa	`16*($i+1)-128`($bp),%xmm5
615	movdqa	`16*($i+2)-128`($bp),%xmm2
616	pand	`16*($i+0)+112`(%r10),%xmm4
617	movdqa	`16*($i+3)-128`($bp),%xmm3
618	pand	`16*($i+1)+112`(%r10),%xmm5
619	por	%xmm4,%xmm0
620	pand	`16*($i+2)+112`(%r10),%xmm2
621	por	%xmm5,%xmm1
622	pand	`16*($i+3)+112`(%r10),%xmm3
623	por	%xmm2,%xmm0
624	por	%xmm3,%xmm1
625___
626}
627$code.=<<___;
628	por	%xmm1,%xmm0
629	pshufd	\$0x4e,%xmm0,%xmm1
630	por	%xmm1,%xmm0
631	lea	$STRIDE($bp),$bp
632	movq	%xmm0,$m0		# m0=bp[0]
633
634	mov	%r13,16+8(%rsp)		# save end of b[num]
635	mov	$rp, 56+8(%rsp)		# save $rp
636
637	mov	($n0),$n0		# pull n0[0] value
638	mov	($ap),%rax
639	lea	($ap,$num),$ap		# end of a[num]
640	neg	$num
641
642	mov	$n0,$m1
643	mulq	$m0			# ap[0]*bp[0]
644	mov	%rax,$A[0]
645	mov	($np),%rax
646
647	imulq	$A[0],$m1		# "tp[0]"*n0
648	lea	64+8(%rsp),$tp
649	mov	%rdx,$A[1]
650
651	mulq	$m1			# np[0]*m1
652	add	%rax,$A[0]		# discarded
653	mov	8($ap,$num),%rax
654	adc	\$0,%rdx
655	mov	%rdx,$N[1]
656
657	mulq	$m0
658	add	%rax,$A[1]
659	mov	8*1($np),%rax
660	adc	\$0,%rdx
661	mov	%rdx,$A[0]
662
663	mulq	$m1
664	add	%rax,$N[1]
665	mov	16($ap,$num),%rax
666	adc	\$0,%rdx
667	add	$A[1],$N[1]
668	lea	4*8($num),$j		# j=4
669	lea	8*4($np),$np
670	adc	\$0,%rdx
671	mov	$N[1],($tp)
672	mov	%rdx,$N[0]
673	jmp	.L1st4x
674
675.align	32
676.L1st4x:
677	mulq	$m0			# ap[j]*bp[0]
678	add	%rax,$A[0]
679	mov	-8*2($np),%rax
680	lea	32($tp),$tp
681	adc	\$0,%rdx
682	mov	%rdx,$A[1]
683
684	mulq	$m1			# np[j]*m1
685	add	%rax,$N[0]
686	mov	-8($ap,$j),%rax
687	adc	\$0,%rdx
688	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
689	adc	\$0,%rdx
690	mov	$N[0],-24($tp)		# tp[j-1]
691	mov	%rdx,$N[1]
692
693	mulq	$m0			# ap[j]*bp[0]
694	add	%rax,$A[1]
695	mov	-8*1($np),%rax
696	adc	\$0,%rdx
697	mov	%rdx,$A[0]
698
699	mulq	$m1			# np[j]*m1
700	add	%rax,$N[1]
701	mov	($ap,$j),%rax
702	adc	\$0,%rdx
703	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
704	adc	\$0,%rdx
705	mov	$N[1],-16($tp)		# tp[j-1]
706	mov	%rdx,$N[0]
707
708	mulq	$m0			# ap[j]*bp[0]
709	add	%rax,$A[0]
710	mov	8*0($np),%rax
711	adc	\$0,%rdx
712	mov	%rdx,$A[1]
713
714	mulq	$m1			# np[j]*m1
715	add	%rax,$N[0]
716	mov	8($ap,$j),%rax
717	adc	\$0,%rdx
718	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
719	adc	\$0,%rdx
720	mov	$N[0],-8($tp)		# tp[j-1]
721	mov	%rdx,$N[1]
722
723	mulq	$m0			# ap[j]*bp[0]
724	add	%rax,$A[1]
725	mov	8*1($np),%rax
726	adc	\$0,%rdx
727	mov	%rdx,$A[0]
728
729	mulq	$m1			# np[j]*m1
730	add	%rax,$N[1]
731	mov	16($ap,$j),%rax
732	adc	\$0,%rdx
733	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
734	lea	8*4($np),$np
735	adc	\$0,%rdx
736	mov	$N[1],($tp)		# tp[j-1]
737	mov	%rdx,$N[0]
738
739	add	\$32,$j			# j+=4
740	jnz	.L1st4x
741
742	mulq	$m0			# ap[j]*bp[0]
743	add	%rax,$A[0]
744	mov	-8*2($np),%rax
745	lea	32($tp),$tp
746	adc	\$0,%rdx
747	mov	%rdx,$A[1]
748
749	mulq	$m1			# np[j]*m1
750	add	%rax,$N[0]
751	mov	-8($ap),%rax
752	adc	\$0,%rdx
753	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
754	adc	\$0,%rdx
755	mov	$N[0],-24($tp)		# tp[j-1]
756	mov	%rdx,$N[1]
757
758	mulq	$m0			# ap[j]*bp[0]
759	add	%rax,$A[1]
760	mov	-8*1($np),%rax
761	adc	\$0,%rdx
762	mov	%rdx,$A[0]
763
764	mulq	$m1			# np[j]*m1
765	add	%rax,$N[1]
766	mov	($ap,$num),%rax		# ap[0]
767	adc	\$0,%rdx
768	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
769	adc	\$0,%rdx
770	mov	$N[1],-16($tp)		# tp[j-1]
771	mov	%rdx,$N[0]
772
773	lea	($np,$num),$np		# rewind $np
774
775	xor	$N[1],$N[1]
776	add	$A[0],$N[0]
777	adc	\$0,$N[1]
778	mov	$N[0],-8($tp)
779
780	jmp	.Louter4x
781
782.align	32
783.Louter4x:
784	lea	16+128($tp),%rdx	# where 256-byte mask is (+size optimization)
785	pxor	%xmm4,%xmm4
786	pxor	%xmm5,%xmm5
787___
788for($i=0;$i<$STRIDE/16;$i+=4) {
789$code.=<<___;
790	movdqa	`16*($i+0)-128`($bp),%xmm0
791	movdqa	`16*($i+1)-128`($bp),%xmm1
792	movdqa	`16*($i+2)-128`($bp),%xmm2
793	movdqa	`16*($i+3)-128`($bp),%xmm3
794	pand	`16*($i+0)-128`(%rdx),%xmm0
795	pand	`16*($i+1)-128`(%rdx),%xmm1
796	por	%xmm0,%xmm4
797	pand	`16*($i+2)-128`(%rdx),%xmm2
798	por	%xmm1,%xmm5
799	pand	`16*($i+3)-128`(%rdx),%xmm3
800	por	%xmm2,%xmm4
801	por	%xmm3,%xmm5
802___
803}
804$code.=<<___;
805	por	%xmm5,%xmm4
806	pshufd	\$0x4e,%xmm4,%xmm0
807	por	%xmm4,%xmm0
808	lea	$STRIDE($bp),$bp
809	movq	%xmm0,$m0		# m0=bp[i]
810
811	mov	($tp,$num),$A[0]
812	mov	$n0,$m1
813	mulq	$m0			# ap[0]*bp[i]
814	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
815	mov	($np),%rax
816	adc	\$0,%rdx
817
818	imulq	$A[0],$m1		# tp[0]*n0
819	mov	%rdx,$A[1]
820	mov	$N[1],($tp)		# store upmost overflow bit
821
822	lea	($tp,$num),$tp		# rewind $tp
823
824	mulq	$m1			# np[0]*m1
825	add	%rax,$A[0]		# "$N[0]", discarded
826	mov	8($ap,$num),%rax
827	adc	\$0,%rdx
828	mov	%rdx,$N[1]
829
830	mulq	$m0			# ap[j]*bp[i]
831	add	%rax,$A[1]
832	mov	8*1($np),%rax
833	adc	\$0,%rdx
834	add	8($tp),$A[1]		# +tp[1]
835	adc	\$0,%rdx
836	mov	%rdx,$A[0]
837
838	mulq	$m1			# np[j]*m1
839	add	%rax,$N[1]
840	mov	16($ap,$num),%rax
841	adc	\$0,%rdx
842	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
843	lea	4*8($num),$j		# j=4
844	lea	8*4($np),$np
845	adc	\$0,%rdx
846	mov	%rdx,$N[0]
847	jmp	.Linner4x
848
849.align	32
850.Linner4x:
851	mulq	$m0			# ap[j]*bp[i]
852	add	%rax,$A[0]
853	mov	-8*2($np),%rax
854	adc	\$0,%rdx
855	add	16($tp),$A[0]		# ap[j]*bp[i]+tp[j]
856	lea	32($tp),$tp
857	adc	\$0,%rdx
858	mov	%rdx,$A[1]
859
860	mulq	$m1			# np[j]*m1
861	add	%rax,$N[0]
862	mov	-8($ap,$j),%rax
863	adc	\$0,%rdx
864	add	$A[0],$N[0]
865	adc	\$0,%rdx
866	mov	$N[1],-32($tp)		# tp[j-1]
867	mov	%rdx,$N[1]
868
869	mulq	$m0			# ap[j]*bp[i]
870	add	%rax,$A[1]
871	mov	-8*1($np),%rax
872	adc	\$0,%rdx
873	add	-8($tp),$A[1]
874	adc	\$0,%rdx
875	mov	%rdx,$A[0]
876
877	mulq	$m1			# np[j]*m1
878	add	%rax,$N[1]
879	mov	($ap,$j),%rax
880	adc	\$0,%rdx
881	add	$A[1],$N[1]
882	adc	\$0,%rdx
883	mov	$N[0],-24($tp)		# tp[j-1]
884	mov	%rdx,$N[0]
885
886	mulq	$m0			# ap[j]*bp[i]
887	add	%rax,$A[0]
888	mov	8*0($np),%rax
889	adc	\$0,%rdx
890	add	($tp),$A[0]		# ap[j]*bp[i]+tp[j]
891	adc	\$0,%rdx
892	mov	%rdx,$A[1]
893
894	mulq	$m1			# np[j]*m1
895	add	%rax,$N[0]
896	mov	8($ap,$j),%rax
897	adc	\$0,%rdx
898	add	$A[0],$N[0]
899	adc	\$0,%rdx
900	mov	$N[1],-16($tp)		# tp[j-1]
901	mov	%rdx,$N[1]
902
903	mulq	$m0			# ap[j]*bp[i]
904	add	%rax,$A[1]
905	mov	8*1($np),%rax
906	adc	\$0,%rdx
907	add	8($tp),$A[1]
908	adc	\$0,%rdx
909	mov	%rdx,$A[0]
910
911	mulq	$m1			# np[j]*m1
912	add	%rax,$N[1]
913	mov	16($ap,$j),%rax
914	adc	\$0,%rdx
915	add	$A[1],$N[1]
916	lea	8*4($np),$np
917	adc	\$0,%rdx
918	mov	$N[0],-8($tp)		# tp[j-1]
919	mov	%rdx,$N[0]
920
921	add	\$32,$j			# j+=4
922	jnz	.Linner4x
923
924	mulq	$m0			# ap[j]*bp[i]
925	add	%rax,$A[0]
926	mov	-8*2($np),%rax
927	adc	\$0,%rdx
928	add	16($tp),$A[0]		# ap[j]*bp[i]+tp[j]
929	lea	32($tp),$tp
930	adc	\$0,%rdx
931	mov	%rdx,$A[1]
932
933	mulq	$m1			# np[j]*m1
934	add	%rax,$N[0]
935	mov	-8($ap),%rax
936	adc	\$0,%rdx
937	add	$A[0],$N[0]
938	adc	\$0,%rdx
939	mov	$N[1],-32($tp)		# tp[j-1]
940	mov	%rdx,$N[1]
941
942	mulq	$m0			# ap[j]*bp[i]
943	add	%rax,$A[1]
944	mov	$m1,%rax
945	mov	-8*1($np),$m1
946	adc	\$0,%rdx
947	add	-8($tp),$A[1]
948	adc	\$0,%rdx
949	mov	%rdx,$A[0]
950
951	mulq	$m1			# np[j]*m1
952	add	%rax,$N[1]
953	mov	($ap,$num),%rax		# ap[0]
954	adc	\$0,%rdx
955	add	$A[1],$N[1]
956	adc	\$0,%rdx
957	mov	$N[0],-24($tp)		# tp[j-1]
958	mov	%rdx,$N[0]
959
960	mov	$N[1],-16($tp)		# tp[j-1]
961	lea	($np,$num),$np		# rewind $np
962
963	xor	$N[1],$N[1]
964	add	$A[0],$N[0]
965	adc	\$0,$N[1]
966	add	($tp),$N[0]		# pull upmost overflow bit
967	adc	\$0,$N[1]		# upmost overflow bit
968	mov	$N[0],-8($tp)
969
970	cmp	16+8(%rsp),$bp
971	jb	.Louter4x
972___
973if (1) {
974$code.=<<___;
975	xor	%rax,%rax
976	sub	$N[0],$m1		# compare top-most words
977	adc	$j,$j			# $j is zero
978	or	$j,$N[1]
979	sub	$N[1],%rax		# %rax=-$N[1]
980	lea	($tp,$num),%rbx		# tptr in .sqr4x_sub
981	mov	($np),%r12
982	lea	($np),%rbp		# nptr in .sqr4x_sub
983	mov	%r9,%rcx
984	sar	\$3+2,%rcx
985	mov	56+8(%rsp),%rdi		# rptr in .sqr4x_sub
986	dec	%r12			# so that after 'not' we get -n[0]
987	xor	%r10,%r10
988	mov	8*1(%rbp),%r13
989	mov	8*2(%rbp),%r14
990	mov	8*3(%rbp),%r15
991	jmp	.Lsqr4x_sub_entry
992___
993} else {
994my @ri=("%rax",$bp,$m0,$m1);
995my $rp="%rdx";
996$code.=<<___
997	xor	\$1,$N[1]
998	lea	($tp,$num),$tp		# rewind $tp
999	sar	\$5,$num		# cf=0
1000	lea	($np,$N[1],8),$np
1001	mov	56+8(%rsp),$rp		# restore $rp
1002	jmp	.Lsub4x
1003
1004.align	32
1005.Lsub4x:
1006	.byte	0x66
1007	mov	8*0($tp),@ri[0]
1008	mov	8*1($tp),@ri[1]
1009	.byte	0x66
1010	sbb	16*0($np),@ri[0]
1011	mov	8*2($tp),@ri[2]
1012	sbb	16*1($np),@ri[1]
1013	mov	3*8($tp),@ri[3]
1014	lea	4*8($tp),$tp
1015	sbb	16*2($np),@ri[2]
1016	mov	@ri[0],8*0($rp)
1017	sbb	16*3($np),@ri[3]
1018	lea	16*4($np),$np
1019	mov	@ri[1],8*1($rp)
1020	mov	@ri[2],8*2($rp)
1021	mov	@ri[3],8*3($rp)
1022	lea	8*4($rp),$rp
1023
1024	inc	$num
1025	jnz	.Lsub4x
1026
1027	ret
1028___
1029}
1030$code.=<<___;
1031.size	mul4x_internal,.-mul4x_internal
1032___
1033}}}
1034{{{
1035######################################################################
1036# void bn_power5(
1037my $rptr="%rdi";	# BN_ULONG *rptr,
1038my $aptr="%rsi";	# const BN_ULONG *aptr,
1039my $bptr="%rdx";	# const void *table,
1040my $nptr="%rcx";	# const BN_ULONG *nptr,
1041my $n0  ="%r8";		# const BN_ULONG *n0);
1042my $num ="%r9";		# int num, has to be divisible by 8
1043			# int pwr
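#
# bn_power5 performs one 5-bit fixed-window step of the exponentiation.
# Judging from the five back-to-back squarings and the final gather-and-
# multiply below (an inference from this file, not a quote from bn_exp.c),
# its effect is, with everything kept in Montgomery representation:
#
#	r = a^(2^5) * table[pwr]  (mod n)
#
# where table[] holds the scattered powers computed by
# BN_mod_exp_mont_consttime.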
1044
1045my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
1046my @A0=("%r10","%r11");
1047my @A1=("%r12","%r13");
1048my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
1049
1050$code.=<<___;
1051.globl	bn_power5
1052.type	bn_power5,\@function,6
1053.align	32
1054bn_power5:
1055	mov	%rsp,%rax
1056___
1057$code.=<<___ if ($addx);
1058	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
1059	and	\$0x80108,%r11d
1060	cmp	\$0x80108,%r11d		# check for AD*X+BMI2+BMI1
1061	je	.Lpowerx5_enter
1062___
1063$code.=<<___;
1064	push	%rbx
1065	push	%rbp
1066	push	%r12
1067	push	%r13
1068	push	%r14
1069	push	%r15
1070.Lpower5_prologue:
1071
1072	shl	\$3,${num}d		# convert $num to bytes
1073	lea	($num,$num,2),%r10d	# 3*$num
1074	neg	$num
1075	mov	($n0),$n0		# *n0
1076
1077	##############################################################
1078	# Ensure that stack frame doesn't alias with $rptr+3*$num
1079	# modulo 4096, which covers ret[num], am[num] and n[num]
1080	# (see bn_exp.c). This is done to allow memory disambiguation
1081# logic to do its magic. [An extra 256 bytes is for the power mask
1082# calculated from the 7th argument, the index.]
1083	#
1084	lea	-320(%rsp,$num,2),%r11
1085	mov	%rsp,%rbp
1086	sub	$rptr,%r11
1087	and	\$4095,%r11
1088	cmp	%r11,%r10
1089	jb	.Lpwr_sp_alt
1090	sub	%r11,%rbp		# align with $aptr
1091	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*num*8+256)
1092	jmp	.Lpwr_sp_done
1093
1094.align	32
1095.Lpwr_sp_alt:
1096	lea	4096-320(,$num,2),%r10
1097	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*num*8+256)
1098	sub	%r10,%r11
1099	mov	\$0,%r10
1100	cmovc	%r10,%r11
1101	sub	%r11,%rbp
1102.Lpwr_sp_done:
1103	and	\$-64,%rbp
1104	mov	%rsp,%r11
1105	sub	%rbp,%r11
1106	and	\$-4096,%r11
1107	lea	(%rbp,%r11),%rsp
1108	mov	(%rsp),%r10
1109	cmp	%rbp,%rsp
1110	ja	.Lpwr_page_walk
1111	jmp	.Lpwr_page_walk_done
1112
1113.Lpwr_page_walk:
1114	lea	-4096(%rsp),%rsp
1115	mov	(%rsp),%r10
1116	cmp	%rbp,%rsp
1117	ja	.Lpwr_page_walk
1118.Lpwr_page_walk_done:
1119
1120	mov	$num,%r10
1121	neg	$num
1122
1123	##############################################################
1124	# Stack layout
1125	#
1126	# +0	saved $num, used in reduction section
1127	# +8	&t[2*$num], used in reduction section
1128	# +32	saved *n0
1129	# +40	saved %rsp
1130	# +48	t[2*$num]
1131	#
1132	mov	$n0,  32(%rsp)
1133	mov	%rax, 40(%rsp)		# save original %rsp
1134.Lpower5_body:
1135	movq	$rptr,%xmm1		# save $rptr, used in sqr8x
1136	movq	$nptr,%xmm2		# save $nptr
1137	movq	%r10, %xmm3		# -$num, used in sqr8x
1138	movq	$bptr,%xmm4
1139
1140	call	__bn_sqr8x_internal
1141	call	__bn_post4x_internal
1142	call	__bn_sqr8x_internal
1143	call	__bn_post4x_internal
1144	call	__bn_sqr8x_internal
1145	call	__bn_post4x_internal
1146	call	__bn_sqr8x_internal
1147	call	__bn_post4x_internal
1148	call	__bn_sqr8x_internal
1149	call	__bn_post4x_internal
1150
1151	movq	%xmm2,$nptr
1152	movq	%xmm4,$bptr
1153	mov	$aptr,$rptr
1154	mov	40(%rsp),%rax
1155	lea	32(%rsp),$n0
1156
1157	call	mul4x_internal
1158
1159	mov	40(%rsp),%rsi		# restore %rsp
1160	mov	\$1,%rax
1161	mov	-48(%rsi),%r15
1162	mov	-40(%rsi),%r14
1163	mov	-32(%rsi),%r13
1164	mov	-24(%rsi),%r12
1165	mov	-16(%rsi),%rbp
1166	mov	-8(%rsi),%rbx
1167	lea	(%rsi),%rsp
1168.Lpower5_epilogue:
1169	ret
1170.size	bn_power5,.-bn_power5
1171
1172.globl	bn_sqr8x_internal
1173.hidden	bn_sqr8x_internal
1174.type	bn_sqr8x_internal,\@abi-omnipotent
1175.align	32
1176bn_sqr8x_internal:
1177__bn_sqr8x_internal:
1178	##############################################################
1179	# Squaring part:
1180	#
1181	# a) multiply-n-add everything but a[i]*a[i];
1182	# b) shift result of a) by 1 to the left and accumulate
1183	#    a[i]*a[i] products;
1184	#
1185	##############################################################
1186	#                                                     a[1]a[0]
1187	#                                                 a[2]a[0]
1188	#                                             a[3]a[0]
1189	#                                             a[2]a[1]
1190	#                                         a[4]a[0]
1191	#                                         a[3]a[1]
1192	#                                     a[5]a[0]
1193	#                                     a[4]a[1]
1194	#                                     a[3]a[2]
1195	#                                 a[6]a[0]
1196	#                                 a[5]a[1]
1197	#                                 a[4]a[2]
1198	#                             a[7]a[0]
1199	#                             a[6]a[1]
1200	#                             a[5]a[2]
1201	#                             a[4]a[3]
1202	#                         a[7]a[1]
1203	#                         a[6]a[2]
1204	#                         a[5]a[3]
1205	#                     a[7]a[2]
1206	#                     a[6]a[3]
1207	#                     a[5]a[4]
1208	#                 a[7]a[3]
1209	#                 a[6]a[4]
1210	#             a[7]a[4]
1211	#             a[6]a[5]
1212	#         a[7]a[5]
1213	#     a[7]a[6]
1214	#                                                     a[1]a[0]
1215	#                                                 a[2]a[0]
1216	#                                             a[3]a[0]
1217	#                                         a[4]a[0]
1218	#                                     a[5]a[0]
1219	#                                 a[6]a[0]
1220	#                             a[7]a[0]
1221	#                                             a[2]a[1]
1222	#                                         a[3]a[1]
1223	#                                     a[4]a[1]
1224	#                                 a[5]a[1]
1225	#                             a[6]a[1]
1226	#                         a[7]a[1]
1227	#                                     a[3]a[2]
1228	#                                 a[4]a[2]
1229	#                             a[5]a[2]
1230	#                         a[6]a[2]
1231	#                     a[7]a[2]
1232	#                             a[4]a[3]
1233	#                         a[5]a[3]
1234	#                     a[6]a[3]
1235	#                 a[7]a[3]
1236	#                     a[5]a[4]
1237	#                 a[6]a[4]
1238	#             a[7]a[4]
1239	#             a[6]a[5]
1240	#         a[7]a[5]
1241	#     a[7]a[6]
1242	#                                                         a[0]a[0]
1243	#                                                 a[1]a[1]
1244	#                                         a[2]a[2]
1245	#                                 a[3]a[3]
1246	#                         a[4]a[4]
1247	#                 a[5]a[5]
1248	#         a[6]a[6]
1249	# a[7]a[7]
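	# In C-like terms, phases a) and b) above amount to (a sketch):
	#
	#	/* a) each off-diagonal product computed once */
	#	for (i = 1; i < num; i++)
	#		for (j = 0; j < i; j++)
	#			add a[i]*a[j] into t at limb position i+j;
	#	/* b) t = 2*t, then add a[i]*a[i] at limb position 2*i */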
1250
1251	lea	32(%r10),$i		# $i=-($num-32)
1252	lea	($aptr,$num),$aptr	# end of a[] buffer, ($aptr,$i)=&ap[2]
1253
1254	mov	$num,$j			# $j=$num
1255
1256					# comments apply to $num==8 case
1257	mov	-32($aptr,$i),$a0	# a[0]
1258	lea	48+8(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
1259	mov	-24($aptr,$i),%rax	# a[1]
1260	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
1261	mov	-16($aptr,$i),$ai	# a[2]
1262	mov	%rax,$a1
1263
1264	mul	$a0			# a[1]*a[0]
1265	mov	%rax,$A0[0]		# a[1]*a[0]
1266	 mov	$ai,%rax		# a[2]
1267	mov	%rdx,$A0[1]
1268	mov	$A0[0],-24($tptr,$i)	# t[1]
1269
1270	mul	$a0			# a[2]*a[0]
1271	add	%rax,$A0[1]
1272	 mov	$ai,%rax
1273	adc	\$0,%rdx
1274	mov	$A0[1],-16($tptr,$i)	# t[2]
1275	mov	%rdx,$A0[0]
1276
1277
1278	 mov	-8($aptr,$i),$ai	# a[3]
1279	mul	$a1			# a[2]*a[1]
1280	mov	%rax,$A1[0]		# a[2]*a[1]+t[3]
1281	 mov	$ai,%rax
1282	mov	%rdx,$A1[1]
1283
1284	 lea	($i),$j
1285	mul	$a0			# a[3]*a[0]
1286	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
1287	 mov	$ai,%rax
1288	mov	%rdx,$A0[1]
1289	adc	\$0,$A0[1]
1290	add	$A1[0],$A0[0]
1291	adc	\$0,$A0[1]
1292	mov	$A0[0],-8($tptr,$j)	# t[3]
1293	jmp	.Lsqr4x_1st
1294
1295.align	32
1296.Lsqr4x_1st:
1297	 mov	($aptr,$j),$ai		# a[4]
1298	mul	$a1			# a[3]*a[1]
1299	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
1300	 mov	$ai,%rax
1301	mov	%rdx,$A1[0]
1302	adc	\$0,$A1[0]
1303
1304	mul	$a0			# a[4]*a[0]
1305	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
1306	 mov	$ai,%rax		# a[3]
1307	 mov	8($aptr,$j),$ai		# a[5]
1308	mov	%rdx,$A0[0]
1309	adc	\$0,$A0[0]
1310	add	$A1[1],$A0[1]
1311	adc	\$0,$A0[0]
1312
1313
1314	mul	$a1			# a[4]*a[3]
1315	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
1316	 mov	$ai,%rax
1317	 mov	$A0[1],($tptr,$j)	# t[4]
1318	mov	%rdx,$A1[1]
1319	adc	\$0,$A1[1]
1320
1321	mul	$a0			# a[5]*a[2]
1322	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
1323	 mov	$ai,%rax
1324	 mov	16($aptr,$j),$ai	# a[6]
1325	mov	%rdx,$A0[1]
1326	adc	\$0,$A0[1]
1327	add	$A1[0],$A0[0]
1328	adc	\$0,$A0[1]
1329
1330	mul	$a1			# a[5]*a[3]
1331	add	%rax,$A1[1]		# a[5]*a[3]+t[6]
1332	 mov	$ai,%rax
1333	 mov	$A0[0],8($tptr,$j)	# t[5]
1334	mov	%rdx,$A1[0]
1335	adc	\$0,$A1[0]
1336
1337	mul	$a0			# a[6]*a[2]
1338	add	%rax,$A0[1]		# a[6]*a[2]+a[5]*a[3]+t[6]
1339	 mov	$ai,%rax		# a[3]
1340	 mov	24($aptr,$j),$ai	# a[7]
1341	mov	%rdx,$A0[0]
1342	adc	\$0,$A0[0]
1343	add	$A1[1],$A0[1]
1344	adc	\$0,$A0[0]
1345
1346
1347	mul	$a1			# a[6]*a[5]
1348	add	%rax,$A1[0]		# a[6]*a[5]+t[7]
1349	 mov	$ai,%rax
1350	 mov	$A0[1],16($tptr,$j)	# t[6]
1351	mov	%rdx,$A1[1]
1352	adc	\$0,$A1[1]
1353	 lea	32($j),$j
1354
1355	mul	$a0			# a[7]*a[4]
1356	add	%rax,$A0[0]		# a[7]*a[4]+a[6]*a[5]+t[6]
1357	 mov	$ai,%rax
1358	mov	%rdx,$A0[1]
1359	adc	\$0,$A0[1]
1360	add	$A1[0],$A0[0]
1361	adc	\$0,$A0[1]
1362	mov	$A0[0],-8($tptr,$j)	# t[7]
1363
1364	cmp	\$0,$j
1365	jne	.Lsqr4x_1st
1366
1367	mul	$a1			# a[7]*a[5]
1368	add	%rax,$A1[1]
1369	lea	16($i),$i
1370	adc	\$0,%rdx
1371	add	$A0[1],$A1[1]
1372	adc	\$0,%rdx
1373
1374	mov	$A1[1],($tptr)		# t[8]
1375	mov	%rdx,$A1[0]
1376	mov	%rdx,8($tptr)		# t[9]
1377	jmp	.Lsqr4x_outer
1378
1379.align	32
1380.Lsqr4x_outer:				# comments apply to $num==6 case
1381	mov	-32($aptr,$i),$a0	# a[0]
1382	lea	48+8(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
1383	mov	-24($aptr,$i),%rax	# a[1]
1384	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
1385	mov	-16($aptr,$i),$ai	# a[2]
1386	mov	%rax,$a1
1387
1388	mul	$a0			# a[1]*a[0]
1389	mov	-24($tptr,$i),$A0[0]	# t[1]
1390	add	%rax,$A0[0]		# a[1]*a[0]+t[1]
1391	 mov	$ai,%rax		# a[2]
1392	adc	\$0,%rdx
1393	mov	$A0[0],-24($tptr,$i)	# t[1]
1394	mov	%rdx,$A0[1]
1395
1396	mul	$a0			# a[2]*a[0]
1397	add	%rax,$A0[1]
1398	 mov	$ai,%rax
1399	adc	\$0,%rdx
1400	add	-16($tptr,$i),$A0[1]	# a[2]*a[0]+t[2]
1401	mov	%rdx,$A0[0]
1402	adc	\$0,$A0[0]
1403	mov	$A0[1],-16($tptr,$i)	# t[2]
1404
1405	xor	$A1[0],$A1[0]
1406
1407	 mov	-8($aptr,$i),$ai	# a[3]
1408	mul	$a1			# a[2]*a[1]
1409	add	%rax,$A1[0]		# a[2]*a[1]+t[3]
1410	 mov	$ai,%rax
1411	adc	\$0,%rdx
1412	add	-8($tptr,$i),$A1[0]
1413	mov	%rdx,$A1[1]
1414	adc	\$0,$A1[1]
1415
1416	mul	$a0			# a[3]*a[0]
1417	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
1418	 mov	$ai,%rax
1419	adc	\$0,%rdx
1420	add	$A1[0],$A0[0]
1421	mov	%rdx,$A0[1]
1422	adc	\$0,$A0[1]
1423	mov	$A0[0],-8($tptr,$i)	# t[3]
1424
1425	lea	($i),$j
1426	jmp	.Lsqr4x_inner
1427
1428.align	32
1429.Lsqr4x_inner:
1430	 mov	($aptr,$j),$ai		# a[4]
1431	mul	$a1			# a[3]*a[1]
1432	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
1433	 mov	$ai,%rax
1434	mov	%rdx,$A1[0]
1435	adc	\$0,$A1[0]
1436	add	($tptr,$j),$A1[1]
1437	adc	\$0,$A1[0]
1438
1439	.byte	0x67
1440	mul	$a0			# a[4]*a[0]
1441	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
1442	 mov	$ai,%rax		# a[3]
1443	 mov	8($aptr,$j),$ai		# a[5]
1444	mov	%rdx,$A0[0]
1445	adc	\$0,$A0[0]
1446	add	$A1[1],$A0[1]
1447	adc	\$0,$A0[0]
1448
1449	mul	$a1			# a[4]*a[3]
1450	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
1451	mov	$A0[1],($tptr,$j)	# t[4]
1452	 mov	$ai,%rax
1453	mov	%rdx,$A1[1]
1454	adc	\$0,$A1[1]
1455	add	8($tptr,$j),$A1[0]
1456	lea	16($j),$j		# j++
1457	adc	\$0,$A1[1]
1458
1459	mul	$a0			# a[5]*a[2]
1460	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
1461	 mov	$ai,%rax
1462	adc	\$0,%rdx
1463	add	$A1[0],$A0[0]
1464	mov	%rdx,$A0[1]
1465	adc	\$0,$A0[1]
1466	mov	$A0[0],-8($tptr,$j)	# t[5], "preloaded t[1]" below
1467
1468	cmp	\$0,$j
1469	jne	.Lsqr4x_inner
1470
1471	.byte	0x67
1472	mul	$a1			# a[5]*a[3]
1473	add	%rax,$A1[1]
1474	adc	\$0,%rdx
1475	add	$A0[1],$A1[1]
1476	adc	\$0,%rdx
1477
1478	mov	$A1[1],($tptr)		# t[6], "preloaded t[2]" below
1479	mov	%rdx,$A1[0]
1480	mov	%rdx,8($tptr)		# t[7], "preloaded t[3]" below
1481
1482	add	\$16,$i
1483	jnz	.Lsqr4x_outer
1484
1485					# comments apply to $num==4 case
1486	mov	-32($aptr),$a0		# a[0]
1487	lea	48+8(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
1488	mov	-24($aptr),%rax		# a[1]
1489	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
1490	mov	-16($aptr),$ai		# a[2]
1491	mov	%rax,$a1
1492
1493	mul	$a0			# a[1]*a[0]
1494	add	%rax,$A0[0]		# a[1]*a[0]+t[1], preloaded t[1]
1495	 mov	$ai,%rax		# a[2]
1496	mov	%rdx,$A0[1]
1497	adc	\$0,$A0[1]
1498
1499	mul	$a0			# a[2]*a[0]
1500	add	%rax,$A0[1]
1501	 mov	$ai,%rax
1502	 mov	$A0[0],-24($tptr)	# t[1]
1503	mov	%rdx,$A0[0]
1504	adc	\$0,$A0[0]
1505	add	$A1[1],$A0[1]		# a[2]*a[0]+t[2], preloaded t[2]
1506	 mov	-8($aptr),$ai		# a[3]
1507	adc	\$0,$A0[0]
1508
1509	mul	$a1			# a[2]*a[1]
1510	add	%rax,$A1[0]		# a[2]*a[1]+t[3], preloaded t[3]
1511	 mov	$ai,%rax
1512	 mov	$A0[1],-16($tptr)	# t[2]
1513	mov	%rdx,$A1[1]
1514	adc	\$0,$A1[1]
1515
1516	mul	$a0			# a[3]*a[0]
1517	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
1518	 mov	$ai,%rax
1519	mov	%rdx,$A0[1]
1520	adc	\$0,$A0[1]
1521	add	$A1[0],$A0[0]
1522	adc	\$0,$A0[1]
1523	mov	$A0[0],-8($tptr)	# t[3]
1524
1525	mul	$a1			# a[3]*a[1]
1526	add	%rax,$A1[1]
1527	 mov	-16($aptr),%rax		# a[2]
1528	adc	\$0,%rdx
1529	add	$A0[1],$A1[1]
1530	adc	\$0,%rdx
1531
1532	mov	$A1[1],($tptr)		# t[4]
1533	mov	%rdx,$A1[0]
1534	mov	%rdx,8($tptr)		# t[5]
1535
1536	mul	$ai			# a[2]*a[3]
1537___
1538{
1539my ($shift,$carry)=($a0,$a1);
1540my @S=(@A1,$ai,$n0);
1541$code.=<<___;
1542	 add	\$16,$i
1543	 xor	$shift,$shift
1544	 sub	$num,$i			# $i=16-$num
1545	 xor	$carry,$carry
1546
1547	add	$A1[0],%rax		# t[5]
1548	adc	\$0,%rdx
1549	mov	%rax,8($tptr)		# t[5]
1550	mov	%rdx,16($tptr)		# t[6]
1551	mov	$carry,24($tptr)	# t[7]
1552
1553	 mov	-16($aptr,$i),%rax	# a[0]
1554	lea	48+8(%rsp),$tptr
1555	 xor	$A0[0],$A0[0]		# t[0]
1556	 mov	8($tptr),$A0[1]		# t[1]
1557
1558	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1559	shr	\$63,$A0[0]
1560	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1561	shr	\$63,$A0[1]
1562	or	$A0[0],$S[1]		# | t[2*i]>>63
1563	 mov	16($tptr),$A0[0]	# t[2*i+2]	# prefetch
1564	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1565	mul	%rax			# a[i]*a[i]
1566	neg	$carry			# mov $carry,cf
1567	 mov	24($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1568	adc	%rax,$S[0]
1569	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
1570	mov	$S[0],($tptr)
1571	adc	%rdx,$S[1]
1572
1573	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
1574	 mov	$S[1],8($tptr)
1575	 sbb	$carry,$carry		# mov cf,$carry
1576	shr	\$63,$A0[0]
1577	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1578	shr	\$63,$A0[1]
1579	or	$A0[0],$S[3]		# | t[2*i]>>63
1580	 mov	32($tptr),$A0[0]	# t[2*i+2]	# prefetch
1581	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1582	mul	%rax			# a[i]*a[i]
1583	neg	$carry			# mov $carry,cf
1584	 mov	40($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1585	adc	%rax,$S[2]
1586	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
1587	mov	$S[2],16($tptr)
1588	adc	%rdx,$S[3]
1589	lea	16($i),$i
1590	mov	$S[3],24($tptr)
1591	sbb	$carry,$carry		# mov cf,$carry
1592	lea	64($tptr),$tptr
1593	jmp	.Lsqr4x_shift_n_add
1594
1595.align	32
1596.Lsqr4x_shift_n_add:
1597	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1598	shr	\$63,$A0[0]
1599	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1600	shr	\$63,$A0[1]
1601	or	$A0[0],$S[1]		# | t[2*i]>>63
1602	 mov	-16($tptr),$A0[0]	# t[2*i+2]	# prefetch
1603	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1604	mul	%rax			# a[i]*a[i]
1605	neg	$carry			# mov $carry,cf
1606	 mov	-8($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1607	adc	%rax,$S[0]
1608	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
1609	mov	$S[0],-32($tptr)
1610	adc	%rdx,$S[1]
1611
1612	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
1613	 mov	$S[1],-24($tptr)
1614	 sbb	$carry,$carry		# mov cf,$carry
1615	shr	\$63,$A0[0]
1616	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1617	shr	\$63,$A0[1]
1618	or	$A0[0],$S[3]		# | t[2*i]>>63
1619	 mov	0($tptr),$A0[0]		# t[2*i+2]	# prefetch
1620	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1621	mul	%rax			# a[i]*a[i]
1622	neg	$carry			# mov $carry,cf
1623	 mov	8($tptr),$A0[1]		# t[2*i+2+1]	# prefetch
1624	adc	%rax,$S[2]
1625	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
1626	mov	$S[2],-16($tptr)
1627	adc	%rdx,$S[3]
1628
1629	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1630	 mov	$S[3],-8($tptr)
1631	 sbb	$carry,$carry		# mov cf,$carry
1632	shr	\$63,$A0[0]
1633	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1634	shr	\$63,$A0[1]
1635	or	$A0[0],$S[1]		# | t[2*i]>>63
1636	 mov	16($tptr),$A0[0]	# t[2*i+2]	# prefetch
1637	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1638	mul	%rax			# a[i]*a[i]
1639	neg	$carry			# mov $carry,cf
1640	 mov	24($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1641	adc	%rax,$S[0]
1642	 mov	8($aptr,$i),%rax	# a[i+1]	# prefetch
1643	mov	$S[0],0($tptr)
1644	adc	%rdx,$S[1]
1645
1646	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
1647	 mov	$S[1],8($tptr)
1648	 sbb	$carry,$carry		# mov cf,$carry
1649	shr	\$63,$A0[0]
1650	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1651	shr	\$63,$A0[1]
1652	or	$A0[0],$S[3]		# | t[2*i]>>63
1653	 mov	32($tptr),$A0[0]	# t[2*i+2]	# prefetch
1654	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1655	mul	%rax			# a[i]*a[i]
1656	neg	$carry			# mov $carry,cf
1657	 mov	40($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1658	adc	%rax,$S[2]
1659	 mov	16($aptr,$i),%rax	# a[i+1]	# prefetch
1660	mov	$S[2],16($tptr)
1661	adc	%rdx,$S[3]
1662	mov	$S[3],24($tptr)
1663	sbb	$carry,$carry		# mov cf,$carry
1664	lea	64($tptr),$tptr
1665	add	\$32,$i
1666	jnz	.Lsqr4x_shift_n_add
1667
1668	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1669	.byte	0x67
1670	shr	\$63,$A0[0]
1671	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1672	shr	\$63,$A0[1]
1673	or	$A0[0],$S[1]		# | t[2*i]>>63
1674	 mov	-16($tptr),$A0[0]	# t[2*i+2]	# prefetch
1675	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1676	mul	%rax			# a[i]*a[i]
1677	neg	$carry			# mov $carry,cf
1678	 mov	-8($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1679	adc	%rax,$S[0]
1680	 mov	-8($aptr),%rax		# a[i+1]	# prefetch
1681	mov	$S[0],-32($tptr)
1682	adc	%rdx,$S[1]
1683
1684	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1|shift
1685	 mov	$S[1],-24($tptr)
1686	 sbb	$carry,$carry		# mov cf,$carry
1687	shr	\$63,$A0[0]
1688	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1689	shr	\$63,$A0[1]
1690	or	$A0[0],$S[3]		# | t[2*i]>>63
1691	mul	%rax			# a[i]*a[i]
1692	neg	$carry			# mov $carry,cf
1693	adc	%rax,$S[2]
1694	adc	%rdx,$S[3]
1695	mov	$S[2],-16($tptr)
1696	mov	$S[3],-8($tptr)
1697___
1698}
1699######################################################################
1700# Montgomery reduction part, "word-by-word" algorithm.
1701#
1702# This new path is inspired by multiple submissions from Intel, by
1703# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
1704# Vinodh Gopal...
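#
# A rough C model of what follows (a sketch; the real code keeps eight
# limbs of t[] in registers %r8..%r15 and folds the remaining limbs in
# .L8x_tail):
#
#	for (i = 0; i < num; i++) {
#		m = t[i] * n0;			/* mod 2^64 */
#		carry = 0;
#		for (j = 0; j < num; j++)
#			(carry, t[i+j]) = t[i+j] + m*n[j] + carry;
#		add carry into t[i+num], propagating further as needed;
#	}
#	/* t[num..2*num-1] then holds the reduced value, up to the final
#	   conditional subtraction done in __bn_post4x_internal */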
1705{
1706my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx");
1707
1708$code.=<<___;
1709	movq	%xmm2,$nptr
1710__bn_sqr8x_reduction:
1711	xor	%rax,%rax
1712	lea	($nptr,$num),%rcx	# end of n[]
1713	lea	48+8(%rsp,$num,2),%rdx	# end of t[] buffer
1714	mov	%rcx,0+8(%rsp)
1715	lea	48+8(%rsp,$num),$tptr	# end of initial t[] window
1716	mov	%rdx,8+8(%rsp)
1717	neg	$num
1718	jmp	.L8x_reduction_loop
1719
1720.align	32
1721.L8x_reduction_loop:
1722	lea	($tptr,$num),$tptr	# start of current t[] window
1723	.byte	0x66
1724	mov	8*0($tptr),$m0
1725	mov	8*1($tptr),%r9
1726	mov	8*2($tptr),%r10
1727	mov	8*3($tptr),%r11
1728	mov	8*4($tptr),%r12
1729	mov	8*5($tptr),%r13
1730	mov	8*6($tptr),%r14
1731	mov	8*7($tptr),%r15
1732	mov	%rax,(%rdx)		# store top-most carry bit
1733	lea	8*8($tptr),$tptr
1734
1735	.byte	0x67
1736	mov	$m0,%r8
1737	imulq	32+8(%rsp),$m0		# n0*a[0]
1738	mov	8*0($nptr),%rax		# n[0]
1739	mov	\$8,%ecx
1740	jmp	.L8x_reduce
1741
1742.align	32
1743.L8x_reduce:
1744	mulq	$m0
1745	 mov	8*1($nptr),%rax		# n[1]
1746	neg	%r8
1747	mov	%rdx,%r8
1748	adc	\$0,%r8
1749
1750	mulq	$m0
1751	add	%rax,%r9
1752	 mov	8*2($nptr),%rax
1753	adc	\$0,%rdx
1754	add	%r9,%r8
1755	 mov	$m0,48-8+8(%rsp,%rcx,8)	# put aside n0*a[i]
1756	mov	%rdx,%r9
1757	adc	\$0,%r9
1758
1759	mulq	$m0
1760	add	%rax,%r10
1761	 mov	8*3($nptr),%rax
1762	adc	\$0,%rdx
1763	add	%r10,%r9
1764	 mov	32+8(%rsp),$carry	# pull n0, borrow $carry
1765	mov	%rdx,%r10
1766	adc	\$0,%r10
1767
1768	mulq	$m0
1769	add	%rax,%r11
1770	 mov	8*4($nptr),%rax
1771	adc	\$0,%rdx
1772	 imulq	%r8,$carry		# modulo-scheduled
1773	add	%r11,%r10
1774	mov	%rdx,%r11
1775	adc	\$0,%r11
1776
1777	mulq	$m0
1778	add	%rax,%r12
1779	 mov	8*5($nptr),%rax
1780	adc	\$0,%rdx
1781	add	%r12,%r11
1782	mov	%rdx,%r12
1783	adc	\$0,%r12
1784
1785	mulq	$m0
1786	add	%rax,%r13
1787	 mov	8*6($nptr),%rax
1788	adc	\$0,%rdx
1789	add	%r13,%r12
1790	mov	%rdx,%r13
1791	adc	\$0,%r13
1792
1793	mulq	$m0
1794	add	%rax,%r14
1795	 mov	8*7($nptr),%rax
1796	adc	\$0,%rdx
1797	add	%r14,%r13
1798	mov	%rdx,%r14
1799	adc	\$0,%r14
1800
1801	mulq	$m0
1802	 mov	$carry,$m0		# n0*a[i]
1803	add	%rax,%r15
1804	 mov	8*0($nptr),%rax		# n[0]
1805	adc	\$0,%rdx
1806	add	%r15,%r14
1807	mov	%rdx,%r15
1808	adc	\$0,%r15
1809
1810	dec	%ecx
1811	jnz	.L8x_reduce
1812
1813	lea	8*8($nptr),$nptr
1814	xor	%rax,%rax
1815	mov	8+8(%rsp),%rdx		# pull end of t[]
1816	cmp	0+8(%rsp),$nptr		# end of n[]?
1817	jae	.L8x_no_tail
1818
1819	.byte	0x66
1820	add	8*0($tptr),%r8
1821	adc	8*1($tptr),%r9
1822	adc	8*2($tptr),%r10
1823	adc	8*3($tptr),%r11
1824	adc	8*4($tptr),%r12
1825	adc	8*5($tptr),%r13
1826	adc	8*6($tptr),%r14
1827	adc	8*7($tptr),%r15
1828	sbb	$carry,$carry		# top carry
1829
1830	mov	48+56+8(%rsp),$m0	# pull n0*a[0]
1831	mov	\$8,%ecx
1832	mov	8*0($nptr),%rax
1833	jmp	.L8x_tail
1834
1835.align	32
1836.L8x_tail:
1837	mulq	$m0
1838	add	%rax,%r8
1839	 mov	8*1($nptr),%rax
1840	 mov	%r8,($tptr)		# save result
1841	mov	%rdx,%r8
1842	adc	\$0,%r8
1843
1844	mulq	$m0
1845	add	%rax,%r9
1846	 mov	8*2($nptr),%rax
1847	adc	\$0,%rdx
1848	add	%r9,%r8
1849	 lea	8($tptr),$tptr		# $tptr++
1850	mov	%rdx,%r9
1851	adc	\$0,%r9
1852
1853	mulq	$m0
1854	add	%rax,%r10
1855	 mov	8*3($nptr),%rax
1856	adc	\$0,%rdx
1857	add	%r10,%r9
1858	mov	%rdx,%r10
1859	adc	\$0,%r10
1860
1861	mulq	$m0
1862	add	%rax,%r11
1863	 mov	8*4($nptr),%rax
1864	adc	\$0,%rdx
1865	add	%r11,%r10
1866	mov	%rdx,%r11
1867	adc	\$0,%r11
1868
1869	mulq	$m0
1870	add	%rax,%r12
1871	 mov	8*5($nptr),%rax
1872	adc	\$0,%rdx
1873	add	%r12,%r11
1874	mov	%rdx,%r12
1875	adc	\$0,%r12
1876
1877	mulq	$m0
1878	add	%rax,%r13
1879	 mov	8*6($nptr),%rax
1880	adc	\$0,%rdx
1881	add	%r13,%r12
1882	mov	%rdx,%r13
1883	adc	\$0,%r13
1884
1885	mulq	$m0
1886	add	%rax,%r14
1887	 mov	8*7($nptr),%rax
1888	adc	\$0,%rdx
1889	add	%r14,%r13
1890	mov	%rdx,%r14
1891	adc	\$0,%r14
1892
1893	mulq	$m0
1894	 mov	48-16+8(%rsp,%rcx,8),$m0# pull n0*a[i]
1895	add	%rax,%r15
1896	adc	\$0,%rdx
1897	add	%r15,%r14
1898	 mov	8*0($nptr),%rax		# pull n[0]
1899	mov	%rdx,%r15
1900	adc	\$0,%r15
1901
1902	dec	%ecx
1903	jnz	.L8x_tail
1904
1905	lea	8*8($nptr),$nptr
1906	mov	8+8(%rsp),%rdx		# pull end of t[]
1907	cmp	0+8(%rsp),$nptr		# end of n[]?
1908	jae	.L8x_tail_done		# break out of loop
1909
1910	 mov	48+56+8(%rsp),$m0	# pull n0*a[0]
1911	neg	$carry
1912	 mov	8*0($nptr),%rax		# pull n[0]
1913	adc	8*0($tptr),%r8
1914	adc	8*1($tptr),%r9
1915	adc	8*2($tptr),%r10
1916	adc	8*3($tptr),%r11
1917	adc	8*4($tptr),%r12
1918	adc	8*5($tptr),%r13
1919	adc	8*6($tptr),%r14
1920	adc	8*7($tptr),%r15
1921	sbb	$carry,$carry		# top carry
1922
1923	mov	\$8,%ecx
1924	jmp	.L8x_tail
1925
1926.align	32
1927.L8x_tail_done:
1928	xor	%rax,%rax
1929	add	(%rdx),%r8		# can this overflow?
1930	adc	\$0,%r9
1931	adc	\$0,%r10
1932	adc	\$0,%r11
1933	adc	\$0,%r12
1934	adc	\$0,%r13
1935	adc	\$0,%r14
1936	adc	\$0,%r15
1937	adc	\$0,%rax
1938
1939	neg	$carry
1940.L8x_no_tail:
1941	adc	8*0($tptr),%r8
1942	adc	8*1($tptr),%r9
1943	adc	8*2($tptr),%r10
1944	adc	8*3($tptr),%r11
1945	adc	8*4($tptr),%r12
1946	adc	8*5($tptr),%r13
1947	adc	8*6($tptr),%r14
1948	adc	8*7($tptr),%r15
1949	adc	\$0,%rax		# top-most carry
1950	 mov	-8($nptr),%rcx		# np[num-1]
1951	 xor	$carry,$carry
1952
1953	movq	%xmm2,$nptr		# restore $nptr
1954
1955	mov	%r8,8*0($tptr)		# store top 512 bits
1956	mov	%r9,8*1($tptr)
1957	 movq	%xmm3,$num		# $num is %r9, can't be moved upwards
1958	mov	%r10,8*2($tptr)
1959	mov	%r11,8*3($tptr)
1960	mov	%r12,8*4($tptr)
1961	mov	%r13,8*5($tptr)
1962	mov	%r14,8*6($tptr)
1963	mov	%r15,8*7($tptr)
1964	lea	8*8($tptr),$tptr
1965
1966	cmp	%rdx,$tptr		# end of t[]?
1967	jb	.L8x_reduction_loop
1968	ret
1969.size	bn_sqr8x_internal,.-bn_sqr8x_internal
1970___
1971}
1972##############################################################
1973# Post-condition, 4x unrolled
1974#
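# What it amounts to, roughly (a sketch of the intent):
#
#	mask = 0 - need_subtract;		/* all-ones or zero    */
#	for (i = 0; i < num; i++)		/* borrow-propagating  */
#		rp[i] = t[i] - (n[i] & mask);
#
# i.e. a constant-time conditional subtraction of the modulus; the
# dec/not on n[0] plus the adc chain below implement the subtraction
# without a branch.
#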
1975{
1976my ($tptr,$nptr)=("%rbx","%rbp");
1977$code.=<<___;
1978.type	__bn_post4x_internal,\@abi-omnipotent
1979.align	32
1980__bn_post4x_internal:
1981	mov	8*0($nptr),%r12
1982	lea	(%rdi,$num),$tptr	# %rdi was $tptr above
1983	mov	$num,%rcx
1984	movq	%xmm1,$rptr		# restore $rptr
1985	neg	%rax
1986	movq	%xmm1,$aptr		# prepare for back-to-back call
1987	sar	\$3+2,%rcx
1988	dec	%r12			# so that after 'not' we get -n[0]
1989	xor	%r10,%r10
1990	mov	8*1($nptr),%r13
1991	mov	8*2($nptr),%r14
1992	mov	8*3($nptr),%r15
1993	jmp	.Lsqr4x_sub_entry
1994
1995.align	16
1996.Lsqr4x_sub:
1997	mov	8*0($nptr),%r12
1998	mov	8*1($nptr),%r13
1999	mov	8*2($nptr),%r14
2000	mov	8*3($nptr),%r15
2001.Lsqr4x_sub_entry:
2002	lea	8*4($nptr),$nptr
2003	not	%r12
2004	not	%r13
2005	not	%r14
2006	not	%r15
2007	and	%rax,%r12
2008	and	%rax,%r13
2009	and	%rax,%r14
2010	and	%rax,%r15
2011
2012	neg	%r10			# mov %r10,%cf
2013	adc	8*0($tptr),%r12
2014	adc	8*1($tptr),%r13
2015	adc	8*2($tptr),%r14
2016	adc	8*3($tptr),%r15
2017	mov	%r12,8*0($rptr)
2018	lea	8*4($tptr),$tptr
2019	mov	%r13,8*1($rptr)
2020	sbb	%r10,%r10		# mov %cf,%r10
2021	mov	%r14,8*2($rptr)
2022	mov	%r15,8*3($rptr)
2023	lea	8*4($rptr),$rptr
2024
2025	inc	%rcx			# pass %cf
2026	jnz	.Lsqr4x_sub
2027
2028	mov	$num,%r10		# prepare for back-to-back call
2029	neg	$num			# restore $num
2030	ret
2031.size	__bn_post4x_internal,.-__bn_post4x_internal
2032___
2033}
2034{
2035$code.=<<___;
2036.globl	bn_from_montgomery
2037.type	bn_from_montgomery,\@abi-omnipotent
2038.align	32
2039bn_from_montgomery:
2040	testl	\$7,`($win64?"48(%rsp)":"%r9d")`
2041	jz	bn_from_mont8x
2042	xor	%eax,%eax
2043	ret
2044.size	bn_from_montgomery,.-bn_from_montgomery
2045
2046.type	bn_from_mont8x,\@function,6
2047.align	32
2048bn_from_mont8x:
2049	.byte	0x67
2050	mov	%rsp,%rax
2051	push	%rbx
2052	push	%rbp
2053	push	%r12
2054	push	%r13
2055	push	%r14
2056	push	%r15
2057.Lfrom_prologue:
2058
2059	shl	\$3,${num}d		# convert $num to bytes
2060	lea	($num,$num,2),%r10	# 3*$num in bytes
2061	neg	$num
2062	mov	($n0),$n0		# *n0
2063
2064	##############################################################
2065	# Ensure that stack frame doesn't alias with $rptr+3*$num
2066	# modulo 4096, which covers ret[num], am[num] and n[num]
2067# (see bn_exp.c). The stack is allocated so as to align with
2068# bn_power5's frame, and as bn_from_montgomery happens to be the
2069# last operation, we use the opportunity to cleanse it.
2070	#
2071	lea	-320(%rsp,$num,2),%r11
2072	mov	%rsp,%rbp
2073	sub	$rptr,%r11
2074	and	\$4095,%r11
2075	cmp	%r11,%r10
2076	jb	.Lfrom_sp_alt
2077	sub	%r11,%rbp		# align with $aptr
2078	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*$num*8+256)
2079	jmp	.Lfrom_sp_done
2080
2081.align	32
2082.Lfrom_sp_alt:
2083	lea	4096-320(,$num,2),%r10
2084	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*$num*8+256)
2085	sub	%r10,%r11
2086	mov	\$0,%r10
2087	cmovc	%r10,%r11
2088	sub	%r11,%rbp
2089.Lfrom_sp_done:
2090	and	\$-64,%rbp
2091	mov	%rsp,%r11
2092	sub	%rbp,%r11
2093	and	\$-4096,%r11
2094	lea	(%rbp,%r11),%rsp
2095	mov	(%rsp),%r10
2096	cmp	%rbp,%rsp
2097	ja	.Lfrom_page_walk
2098	jmp	.Lfrom_page_walk_done
2099
2100.Lfrom_page_walk:
2101	lea	-4096(%rsp),%rsp
2102	mov	(%rsp),%r10
2103	cmp	%rbp,%rsp
2104	ja	.Lfrom_page_walk
2105.Lfrom_page_walk_done:
2106
2107	mov	$num,%r10
2108	neg	$num
2109
2110	##############################################################
2111	# Stack layout
2112	#
2113	# +0	saved $num, used in reduction section
2114	# +8	&t[2*$num], used in reduction section
2115	# +32	saved *n0
2116	# +40	saved %rsp
2117	# +48	t[2*$num]
2118	#
2119	mov	$n0,  32(%rsp)
2120	mov	%rax, 40(%rsp)		# save original %rsp
2121.Lfrom_body:
2122	mov	$num,%r11
2123	lea	48(%rsp),%rax
2124	pxor	%xmm0,%xmm0
2125	jmp	.Lmul_by_1
2126
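	# The loop below merely lays out the temporary as
	#
	#	t[0..num-1]     = a[0..num-1];
	#	t[num..2*num-1] = 0;
	#
	# so that the reduction that follows computes a * R^-1 mod n, i.e.
	# a Montgomery multiplication by 1 (hence the label).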
2127.align	32
2128.Lmul_by_1:
2129	movdqu	($aptr),%xmm1
2130	movdqu	16($aptr),%xmm2
2131	movdqu	32($aptr),%xmm3
2132	movdqa	%xmm0,(%rax,$num)
2133	movdqu	48($aptr),%xmm4
2134	movdqa	%xmm0,16(%rax,$num)
2135	.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00	# lea	64($aptr),$aptr
2136	movdqa	%xmm1,(%rax)
2137	movdqa	%xmm0,32(%rax,$num)
2138	movdqa	%xmm2,16(%rax)
2139	movdqa	%xmm0,48(%rax,$num)
2140	movdqa	%xmm3,32(%rax)
2141	movdqa	%xmm4,48(%rax)
2142	lea	64(%rax),%rax
2143	sub	\$64,%r11
2144	jnz	.Lmul_by_1
2145
2146	movq	$rptr,%xmm1
2147	movq	$nptr,%xmm2
2148	.byte	0x67
2149	mov	$nptr,%rbp
2150	movq	%r10, %xmm3		# -num
2151___
2152$code.=<<___ if ($addx);
2153	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
2154	and	\$0x80108,%r11d
2155	cmp	\$0x80108,%r11d		# check for AD*X+BMI2+BMI1
2156	jne	.Lfrom_mont_nox
2157
2158	lea	(%rax,$num),$rptr
2159	call	__bn_sqrx8x_reduction
2160	call	__bn_postx4x_internal
2161
2162	pxor	%xmm0,%xmm0
2163	lea	48(%rsp),%rax
2164	mov	40(%rsp),%rsi		# restore %rsp
2165	jmp	.Lfrom_mont_zero
2166
2167.align	32
2168.Lfrom_mont_nox:
2169___
2170$code.=<<___;
2171	call	__bn_sqr8x_reduction
2172	call	__bn_post4x_internal
2173
2174	pxor	%xmm0,%xmm0
2175	lea	48(%rsp),%rax
2176	mov	40(%rsp),%rsi		# restore %rsp
2177	jmp	.Lfrom_mont_zero
2178
2179.align	32
2180.Lfrom_mont_zero:
2181	movdqa	%xmm0,16*0(%rax)
2182	movdqa	%xmm0,16*1(%rax)
2183	movdqa	%xmm0,16*2(%rax)
2184	movdqa	%xmm0,16*3(%rax)
2185	lea	16*4(%rax),%rax
2186	sub	\$32,$num
2187	jnz	.Lfrom_mont_zero
2188
2189	mov	\$1,%rax
2190	mov	-48(%rsi),%r15
2191	mov	-40(%rsi),%r14
2192	mov	-32(%rsi),%r13
2193	mov	-24(%rsi),%r12
2194	mov	-16(%rsi),%rbp
2195	mov	-8(%rsi),%rbx
2196	lea	(%rsi),%rsp
2197.Lfrom_epilogue:
2198	ret
2199.size	bn_from_mont8x,.-bn_from_mont8x
2200___
2201}
2202}}}
2203
2204if ($addx) {{{
2205my $bp="%rdx";	# restore original value
2206
2207$code.=<<___;
2208.type	bn_mulx4x_mont_gather5,\@function,6
2209.align	32
2210bn_mulx4x_mont_gather5:
2211	mov	%rsp,%rax
2212.Lmulx4x_enter:
2213	push	%rbx
2214	push	%rbp
2215	push	%r12
2216	push	%r13
2217	push	%r14
2218	push	%r15
2219.Lmulx4x_prologue:
2220
2221	shl	\$3,${num}d		# convert $num to bytes
2222	lea	($num,$num,2),%r10	# 3*$num in bytes
2223	neg	$num			# -$num
2224	mov	($n0),$n0		# *n0
2225
2226	##############################################################
2227	# Ensure that stack frame doesn't alias with $rptr+3*$num
2228	# modulo 4096, which covers ret[num], am[num] and n[num]
2229	# (see bn_exp.c). This is done to allow memory disambiguation
2230# logic to do its magic. [An extra [num] is allocated in order
2231# to align with bn_power5's frame, which is cleansed after
2232# completing the exponentiation. An extra 256 bytes is for the power
2233# mask calculated from the 7th argument, the index.]
2234	#
2235	lea	-320(%rsp,$num,2),%r11
2236	mov	%rsp,%rbp
2237	sub	$rp,%r11
2238	and	\$4095,%r11
2239	cmp	%r11,%r10
2240	jb	.Lmulx4xsp_alt
2241	sub	%r11,%rbp		# align with $aptr
2242	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*$num*8+256)
2243	jmp	.Lmulx4xsp_done
2244
2245.Lmulx4xsp_alt:
2246	lea	4096-320(,$num,2),%r10
2247	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*$num*8+256)
2248	sub	%r10,%r11
2249	mov	\$0,%r10
2250	cmovc	%r10,%r11
2251	sub	%r11,%rbp
2252.Lmulx4xsp_done:
2253	and	\$-64,%rbp		# ensure alignment
2254	mov	%rsp,%r11
2255	sub	%rbp,%r11
2256	and	\$-4096,%r11
2257	lea	(%rbp,%r11),%rsp
2258	mov	(%rsp),%r10
2259	cmp	%rbp,%rsp
2260	ja	.Lmulx4x_page_walk
2261	jmp	.Lmulx4x_page_walk_done
2262
2263.Lmulx4x_page_walk:
2264	lea	-4096(%rsp),%rsp
2265	mov	(%rsp),%r10
2266	cmp	%rbp,%rsp
2267	ja	.Lmulx4x_page_walk
2268.Lmulx4x_page_walk_done:
2269
2270	##############################################################
2271	# Stack layout
2272	# +0	-num
2273	# +8	off-loaded &b[i]
2274	# +16	end of b[num]
2275	# +24	inner counter
2276	# +32	saved n0
2277	# +40	saved %rsp
2278	# +48
2279	# +56	saved rp
2280	# +64	tmp[num+1]
2281	#
2282	mov	$n0, 32(%rsp)		# save *n0
2283	mov	%rax,40(%rsp)		# save original %rsp
2284.Lmulx4x_body:
2285	call	mulx4x_internal
2286
2287	mov	40(%rsp),%rsi		# restore %rsp
2288	mov	\$1,%rax
2289
2290	mov	-48(%rsi),%r15
2291	mov	-40(%rsi),%r14
2292	mov	-32(%rsi),%r13
2293	mov	-24(%rsi),%r12
2294	mov	-16(%rsi),%rbp
2295	mov	-8(%rsi),%rbx
2296	lea	(%rsi),%rsp
2297.Lmulx4x_epilogue:
2298	ret
2299.size	bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
2300
2301.type	mulx4x_internal,\@abi-omnipotent
2302.align	32
2303mulx4x_internal:
2304	mov	$num,8(%rsp)		# save -$num (it was in bytes)
2305	mov	$num,%r10
2306	neg	$num			# restore $num
2307	shl	\$5,$num
2308	neg	%r10			# restore $num
2309	lea	128($bp,$num),%r13	# end of powers table (+size optimization)
2310	shr	\$5+5,$num
2311	movd	`($win64?56:8)`(%rax),%xmm5	# load 7th argument
2312	sub	\$1,$num
2313	lea	.Linc(%rip),%rax
2314	mov	%r13,16+8(%rsp)		# end of b[num]
2315	mov	$num,24+8(%rsp)		# inner counter
2316	mov	$rp, 56+8(%rsp)		# save $rp
2317___
2318my ($aptr, $bptr, $nptr, $tptr, $mi,  $bi,  $zero, $num)=
2319   ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
2320my $rptr=$bptr;
2321my $STRIDE=2**5*8;		# 5 is "window size"
2322my $N=$STRIDE/4;		# should match cache line size
2323$code.=<<___;
2324	movdqa	0(%rax),%xmm0		# 00000001000000010000000000000000
2325	movdqa	16(%rax),%xmm1		# 00000002000000020000000200000002
2326	lea	88-112(%rsp,%r10),%r10	# place the mask after tp[num+1] (+ICache optimization)
2327	lea	128($bp),$bptr		# size optimization
2328
2329	pshufd	\$0,%xmm5,%xmm5		# broadcast index
2330	movdqa	%xmm1,%xmm4
2331	.byte	0x67
2332	movdqa	%xmm1,%xmm2
2333___
2334########################################################################
2335# calculate mask by comparing 0..31 to index and save result to stack
2336#
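# Functionally, the mask build plus the pand/por gather below amount to the
# following branch-free selection: every one of the 32 table columns is read
# and masked, so the memory access pattern never depends on the secret index.
# A plain Perl reference sketch (illustrative only, not used by the generator,
# names made up):
#
#	sub gather_word_ref {
#	    my ($row, $idx) = @_;			# $row: 32 64-bit words, $idx: 0..31
#	    my $word = 0;
#	    for my $j (0 .. 31) {
#	        my $mask = ($j == $idx) ? ~0 : 0;	# the code builds this with pcmpeqd, branch-free
#	        $word |= $row->[$j] & $mask;		# pand/por accumulation
#	    }
#	    return $word;
#	}
#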
2337$code.=<<___;
2338	.byte	0x67
2339	paddd	%xmm0,%xmm1
2340	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
2341	movdqa	%xmm4,%xmm3
2342___
2343for($i=0;$i<$STRIDE/16-4;$i+=4) {
2344$code.=<<___;
2345	paddd	%xmm1,%xmm2
2346	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
2347	movdqa	%xmm0,`16*($i+0)+112`(%r10)
2348	movdqa	%xmm4,%xmm0
2349
2350	paddd	%xmm2,%xmm3
2351	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
2352	movdqa	%xmm1,`16*($i+1)+112`(%r10)
2353	movdqa	%xmm4,%xmm1
2354
2355	paddd	%xmm3,%xmm0
2356	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
2357	movdqa	%xmm2,`16*($i+2)+112`(%r10)
2358	movdqa	%xmm4,%xmm2
2359
2360	paddd	%xmm0,%xmm1
2361	pcmpeqd	%xmm5,%xmm0
2362	movdqa	%xmm3,`16*($i+3)+112`(%r10)
2363	movdqa	%xmm4,%xmm3
2364___
2365}
2366$code.=<<___;				# last iteration can be optimized
2367	.byte	0x67
2368	paddd	%xmm1,%xmm2
2369	pcmpeqd	%xmm5,%xmm1
2370	movdqa	%xmm0,`16*($i+0)+112`(%r10)
2371
2372	paddd	%xmm2,%xmm3
2373	pcmpeqd	%xmm5,%xmm2
2374	movdqa	%xmm1,`16*($i+1)+112`(%r10)
2375
2376	pcmpeqd	%xmm5,%xmm3
2377	movdqa	%xmm2,`16*($i+2)+112`(%r10)
2378
2379	pand	`16*($i+0)-128`($bptr),%xmm0	# while it's still in register
2380	pand	`16*($i+1)-128`($bptr),%xmm1
2381	pand	`16*($i+2)-128`($bptr),%xmm2
2382	movdqa	%xmm3,`16*($i+3)+112`(%r10)
2383	pand	`16*($i+3)-128`($bptr),%xmm3
2384	por	%xmm2,%xmm0
2385	por	%xmm3,%xmm1
2386___
2387for($i=0;$i<$STRIDE/16-4;$i+=4) {
2388$code.=<<___;
2389	movdqa	`16*($i+0)-128`($bptr),%xmm4
2390	movdqa	`16*($i+1)-128`($bptr),%xmm5
2391	movdqa	`16*($i+2)-128`($bptr),%xmm2
2392	pand	`16*($i+0)+112`(%r10),%xmm4
2393	movdqa	`16*($i+3)-128`($bptr),%xmm3
2394	pand	`16*($i+1)+112`(%r10),%xmm5
2395	por	%xmm4,%xmm0
2396	pand	`16*($i+2)+112`(%r10),%xmm2
2397	por	%xmm5,%xmm1
2398	pand	`16*($i+3)+112`(%r10),%xmm3
2399	por	%xmm2,%xmm0
2400	por	%xmm3,%xmm1
2401___
2402}
2403$code.=<<___;
2404	pxor	%xmm1,%xmm0
2405	pshufd	\$0x4e,%xmm0,%xmm1
2406	por	%xmm1,%xmm0
2407	lea	$STRIDE($bptr),$bptr
2408	movq	%xmm0,%rdx		# bp[0]
2409	lea	64+8*4+8(%rsp),$tptr
2410
2411	mov	%rdx,$bi
2412	mulx	0*8($aptr),$mi,%rax	# a[0]*b[0]
2413	mulx	1*8($aptr),%r11,%r12	# a[1]*b[0]
2414	add	%rax,%r11
2415	mulx	2*8($aptr),%rax,%r13	# ...
2416	adc	%rax,%r12
2417	adc	\$0,%r13
2418	mulx	3*8($aptr),%rax,%r14
2419
2420	mov	$mi,%r15
2421	imulq	32+8(%rsp),$mi		# "t[0]"*n0
2422	xor	$zero,$zero		# cf=0, of=0
2423	mov	$mi,%rdx
2424
2425	mov	$bptr,8+8(%rsp)		# off-load &b[i]
2426
2427	lea	4*8($aptr),$aptr
2428	adcx	%rax,%r13
2429	adcx	$zero,%r14		# cf=0
2430
2431	mulx	0*8($nptr),%rax,%r10
2432	adcx	%rax,%r15		# discarded
2433	adox	%r11,%r10
2434	mulx	1*8($nptr),%rax,%r11
2435	adcx	%rax,%r10
2436	adox	%r12,%r11
2437	mulx	2*8($nptr),%rax,%r12
2438	mov	24+8(%rsp),$bptr	# counter value
2439	mov	%r10,-8*4($tptr)
2440	adcx	%rax,%r11
2441	adox	%r13,%r12
2442	mulx	3*8($nptr),%rax,%r15
2443	 mov	$bi,%rdx
2444	mov	%r11,-8*3($tptr)
2445	adcx	%rax,%r12
2446	adox	$zero,%r15		# of=0
2447	lea	4*8($nptr),$nptr
2448	mov	%r12,-8*2($tptr)
2449	jmp	.Lmulx4x_1st
2450
2451.align	32
2452.Lmulx4x_1st:
2453	adcx	$zero,%r15		# cf=0, modulo-scheduled
2454	mulx	0*8($aptr),%r10,%rax	# a[4]*b[0]
2455	adcx	%r14,%r10
2456	mulx	1*8($aptr),%r11,%r14	# a[5]*b[0]
2457	adcx	%rax,%r11
2458	mulx	2*8($aptr),%r12,%rax	# ...
2459	adcx	%r14,%r12
2460	mulx	3*8($aptr),%r13,%r14
2461	 .byte	0x67,0x67
2462	 mov	$mi,%rdx
2463	adcx	%rax,%r13
2464	adcx	$zero,%r14		# cf=0
2465	lea	4*8($aptr),$aptr
2466	lea	4*8($tptr),$tptr
2467
2468	adox	%r15,%r10
2469	mulx	0*8($nptr),%rax,%r15
2470	adcx	%rax,%r10
2471	adox	%r15,%r11
2472	mulx	1*8($nptr),%rax,%r15
2473	adcx	%rax,%r11
2474	adox	%r15,%r12
2475	mulx	2*8($nptr),%rax,%r15
2476	mov	%r10,-5*8($tptr)
2477	adcx	%rax,%r12
2478	mov	%r11,-4*8($tptr)
2479	adox	%r15,%r13
2480	mulx	3*8($nptr),%rax,%r15
2481	 mov	$bi,%rdx
2482	mov	%r12,-3*8($tptr)
2483	adcx	%rax,%r13
2484	adox	$zero,%r15
2485	lea	4*8($nptr),$nptr
2486	mov	%r13,-2*8($tptr)
2487
2488	dec	$bptr			# of=0, pass cf
2489	jnz	.Lmulx4x_1st
2490
2491	mov	8(%rsp),$num		# load -num
2492	adc	$zero,%r15		# modulo-scheduled
2493	lea	($aptr,$num),$aptr	# rewind $aptr
2494	add	%r15,%r14
2495	mov	8+8(%rsp),$bptr		# re-load &b[i]
2496	adc	$zero,$zero		# top-most carry
2497	mov	%r14,-1*8($tptr)
2498	jmp	.Lmulx4x_outer
2499
2500.align	32
2501.Lmulx4x_outer:
2502	lea	16-256($tptr),%r10	# where 256-byte mask is (+density control)
2503	pxor	%xmm4,%xmm4
2504	.byte	0x67,0x67
2505	pxor	%xmm5,%xmm5
2506___
2507for($i=0;$i<$STRIDE/16;$i+=4) {
2508$code.=<<___;
2509	movdqa	`16*($i+0)-128`($bptr),%xmm0
2510	movdqa	`16*($i+1)-128`($bptr),%xmm1
2511	movdqa	`16*($i+2)-128`($bptr),%xmm2
2512	pand	`16*($i+0)+256`(%r10),%xmm0
2513	movdqa	`16*($i+3)-128`($bptr),%xmm3
2514	pand	`16*($i+1)+256`(%r10),%xmm1
2515	por	%xmm0,%xmm4
2516	pand	`16*($i+2)+256`(%r10),%xmm2
2517	por	%xmm1,%xmm5
2518	pand	`16*($i+3)+256`(%r10),%xmm3
2519	por	%xmm2,%xmm4
2520	por	%xmm3,%xmm5
2521___
2522}
2523$code.=<<___;
2524	por	%xmm5,%xmm4
2525	pshufd	\$0x4e,%xmm4,%xmm0
2526	por	%xmm4,%xmm0
2527	lea	$STRIDE($bptr),$bptr
2528	movq	%xmm0,%rdx		# m0=bp[i]
2529
2530	mov	$zero,($tptr)		# save top-most carry
2531	lea	4*8($tptr,$num),$tptr	# rewind $tptr
2532	mulx	0*8($aptr),$mi,%r11	# a[0]*b[i]
2533	xor	$zero,$zero		# cf=0, of=0
2534	mov	%rdx,$bi
2535	mulx	1*8($aptr),%r14,%r12	# a[1]*b[i]
2536	adox	-4*8($tptr),$mi		# +t[0]
2537	adcx	%r14,%r11
2538	mulx	2*8($aptr),%r15,%r13	# ...
2539	adox	-3*8($tptr),%r11
2540	adcx	%r15,%r12
2541	mulx	3*8($aptr),%rdx,%r14
2542	adox	-2*8($tptr),%r12
2543	adcx	%rdx,%r13
2544	lea	($nptr,$num),$nptr	# rewind $nptr
2545	lea	4*8($aptr),$aptr
2546	adox	-1*8($tptr),%r13
2547	adcx	$zero,%r14
2548	adox	$zero,%r14
2549
2550	mov	$mi,%r15
2551	imulq	32+8(%rsp),$mi		# "t[0]"*n0
2552
2553	mov	$mi,%rdx
2554	xor	$zero,$zero		# cf=0, of=0
2555	mov	$bptr,8+8(%rsp)		# off-load &b[i]
2556
2557	mulx	0*8($nptr),%rax,%r10
2558	adcx	%rax,%r15		# discarded
2559	adox	%r11,%r10
2560	mulx	1*8($nptr),%rax,%r11
2561	adcx	%rax,%r10
2562	adox	%r12,%r11
2563	mulx	2*8($nptr),%rax,%r12
2564	adcx	%rax,%r11
2565	adox	%r13,%r12
2566	mulx	3*8($nptr),%rax,%r15
2567	 mov	$bi,%rdx
2568	mov	24+8(%rsp),$bptr	# counter value
2569	mov	%r10,-8*4($tptr)
2570	adcx	%rax,%r12
2571	mov	%r11,-8*3($tptr)
2572	adox	$zero,%r15		# of=0
2573	mov	%r12,-8*2($tptr)
2574	lea	4*8($nptr),$nptr
2575	jmp	.Lmulx4x_inner
2576
2577.align	32
2578.Lmulx4x_inner:
2579	mulx	0*8($aptr),%r10,%rax	# a[4]*b[i]
2580	adcx	$zero,%r15		# cf=0, modulo-scheduled
2581	adox	%r14,%r10
2582	mulx	1*8($aptr),%r11,%r14	# a[5]*b[i]
2583	adcx	0*8($tptr),%r10
2584	adox	%rax,%r11
2585	mulx	2*8($aptr),%r12,%rax	# ...
2586	adcx	1*8($tptr),%r11
2587	adox	%r14,%r12
2588	mulx	3*8($aptr),%r13,%r14
2589	 mov	$mi,%rdx
2590	adcx	2*8($tptr),%r12
2591	adox	%rax,%r13
2592	adcx	3*8($tptr),%r13
2593	adox	$zero,%r14		# of=0
2594	lea	4*8($aptr),$aptr
2595	lea	4*8($tptr),$tptr
2596	adcx	$zero,%r14		# cf=0
2597
2598	adox	%r15,%r10
2599	mulx	0*8($nptr),%rax,%r15
2600	adcx	%rax,%r10
2601	adox	%r15,%r11
2602	mulx	1*8($nptr),%rax,%r15
2603	adcx	%rax,%r11
2604	adox	%r15,%r12
2605	mulx	2*8($nptr),%rax,%r15
2606	mov	%r10,-5*8($tptr)
2607	adcx	%rax,%r12
2608	adox	%r15,%r13
2609	mov	%r11,-4*8($tptr)
2610	mulx	3*8($nptr),%rax,%r15
2611	 mov	$bi,%rdx
2612	lea	4*8($nptr),$nptr
2613	mov	%r12,-3*8($tptr)
2614	adcx	%rax,%r13
2615	adox	$zero,%r15
2616	mov	%r13,-2*8($tptr)
2617
2618	dec	$bptr			# of=0, pass cf
2619	jnz	.Lmulx4x_inner
2620
2621	mov	0+8(%rsp),$num		# load -num
2622	adc	$zero,%r15		# modulo-scheduled
2623	sub	0*8($tptr),$bptr	# pull top-most carry to %cf
2624	mov	8+8(%rsp),$bptr		# re-load &b[i]
2625	mov	16+8(%rsp),%r10
2626	adc	%r15,%r14
2627	lea	($aptr,$num),$aptr	# rewind $aptr
2628	adc	$zero,$zero		# top-most carry
2629	mov	%r14,-1*8($tptr)
2630
2631	cmp	%r10,$bptr
2632	jb	.Lmulx4x_outer
2633
2634	mov	-8($nptr),%r10
2635	mov	$zero,%r8
2636	mov	($nptr,$num),%r12
2637	lea	($nptr,$num),%rbp	# rewind $nptr
2638	mov	$num,%rcx
2639	lea	($tptr,$num),%rdi	# rewind $tptr
2640	xor	%eax,%eax
2641	xor	%r15,%r15
2642	sub	%r14,%r10		# compare top-most words
2643	adc	%r15,%r15
2644	or	%r15,%r8
2645	sar	\$3+2,%rcx
2646	sub	%r8,%rax		# %rax=-%r8
2647	mov	56+8(%rsp),%rdx		# restore rp
2648	dec	%r12			# so that after 'not' we get -n[0]
2649	mov	8*1(%rbp),%r13
2650	xor	%r8,%r8
2651	mov	8*2(%rbp),%r14
2652	mov	8*3(%rbp),%r15
2653	jmp	.Lsqrx4x_sub_entry	# common post-condition
2654.size	mulx4x_internal,.-mulx4x_internal
2655___
2656}{
2657######################################################################
2658# void bn_power5(
2659my $rptr="%rdi";	# BN_ULONG *rptr,
2660my $aptr="%rsi";	# const BN_ULONG *aptr,
2661my $bptr="%rdx";	# const void *table,
2662my $nptr="%rcx";	# const BN_ULONG *nptr,
2663my $n0  ="%r8";		# const BN_ULONG *n0);
2664my $num ="%r9";		# int num, has to be divisible by 8
2665			# int pwr);
2666
2667my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
2668my @A0=("%r10","%r11");
2669my @A1=("%r12","%r13");
2670my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
2671
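# bn_powerx5 effectively performs five back-to-back Montgomery squarings of the
# input followed by one Montgomery multiplication by the power gathered from
# the table, i.e. it computes a^(2^5)*table[pwr] with everything kept in
# Montgomery form (see the call sequence further down). A Math::BigInt
# reference sketch of the same operation, with R = 2^(64*num) (illustrative
# only, not used by the generator, names made up):
#
#	use Math::BigInt;
#	sub mont_mul_ref {				# MontMul(x,y) = x*y*R^-1 mod n
#	    my ($x, $y, $n, $R) = @_;
#	    return ($x * $y * $R->copy->bmodinv($n)) % $n;
#	}
#	sub powerx5_ref {
#	    my ($a, $table, $pwr, $n, $R) = @_;
#	    $a = mont_mul_ref($a, $a, $n, $R) for (1 .. 5);	# five squarings
#	    return mont_mul_ref($a, $table->[$pwr], $n, $R);	# times the gathered power
#	}
#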
2672$code.=<<___;
2673.type	bn_powerx5,\@function,6
2674.align	32
2675bn_powerx5:
2676	mov	%rsp,%rax
2677.Lpowerx5_enter:
2678	push	%rbx
2679	push	%rbp
2680	push	%r12
2681	push	%r13
2682	push	%r14
2683	push	%r15
2684.Lpowerx5_prologue:
2685
2686	shl	\$3,${num}d		# convert $num to bytes
2687	lea	($num,$num,2),%r10	# 3*$num in bytes
2688	neg	$num
2689	mov	($n0),$n0		# *n0
2690
2691	##############################################################
2692	# Ensure that stack frame doesn't alias with $rptr+3*$num
2693	# modulo 4096, which covers ret[num], am[num] and n[num]
2694	# (see bn_exp.c). This is done to allow memory disambiguation
2695	# logic to do its magic. [Extra 256 bytes are for the power mask
2696	# calculated from the 7th argument, the index.]
2697	#
2698	lea	-320(%rsp,$num,2),%r11
2699	mov	%rsp,%rbp
2700	sub	$rptr,%r11
2701	and	\$4095,%r11
2702	cmp	%r11,%r10
2703	jb	.Lpwrx_sp_alt
2704	sub	%r11,%rbp		# align with $aptr
2705	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*$num*8+256)
2706	jmp	.Lpwrx_sp_done
2707
2708.align	32
2709.Lpwrx_sp_alt:
2710	lea	4096-320(,$num,2),%r10
2711	lea	-320(%rbp,$num,2),%rbp	# alloca(frame+2*$num*8+256)
2712	sub	%r10,%r11
2713	mov	\$0,%r10
2714	cmovc	%r10,%r11
2715	sub	%r11,%rbp
2716.Lpwrx_sp_done:
2717	and	\$-64,%rbp
2718	mov	%rsp,%r11
2719	sub	%rbp,%r11
2720	and	\$-4096,%r11
2721	lea	(%rbp,%r11),%rsp
2722	mov	(%rsp),%r10
2723	cmp	%rbp,%rsp
2724	ja	.Lpwrx_page_walk
2725	jmp	.Lpwrx_page_walk_done
2726
2727.Lpwrx_page_walk:
2728	lea	-4096(%rsp),%rsp
2729	mov	(%rsp),%r10
2730	cmp	%rbp,%rsp
2731	ja	.Lpwrx_page_walk
2732.Lpwrx_page_walk_done:
2733
2734	mov	$num,%r10
2735	neg	$num
2736
2737	##############################################################
2738	# Stack layout
2739	#
2740	# +0	saved $num, used in reduction section
2741	# +8	&t[2*$num], used in reduction section
2742	# +16	intermediate carry bit
2743	# +24	top-most carry bit, used in reduction section
2744	# +32	saved *n0
2745	# +40	saved %rsp
2746	# +48	t[2*$num]
2747	#
2748	pxor	%xmm0,%xmm0
2749	movq	$rptr,%xmm1		# save $rptr
2750	movq	$nptr,%xmm2		# save $nptr
2751	movq	%r10, %xmm3		# -$num
2752	movq	$bptr,%xmm4
2753	mov	$n0,  32(%rsp)
2754	mov	%rax, 40(%rsp)		# save original %rsp
2755.Lpowerx5_body:
2756
2757	call	__bn_sqrx8x_internal
2758	call	__bn_postx4x_internal
2759	call	__bn_sqrx8x_internal
2760	call	__bn_postx4x_internal
2761	call	__bn_sqrx8x_internal
2762	call	__bn_postx4x_internal
2763	call	__bn_sqrx8x_internal
2764	call	__bn_postx4x_internal
2765	call	__bn_sqrx8x_internal
2766	call	__bn_postx4x_internal
2767
2768	mov	%r10,$num		# -num
2769	mov	$aptr,$rptr
2770	movq	%xmm2,$nptr
2771	movq	%xmm4,$bptr
2772	mov	40(%rsp),%rax
2773
2774	call	mulx4x_internal
2775
2776	mov	40(%rsp),%rsi		# restore %rsp
2777	mov	\$1,%rax
2778
2779	mov	-48(%rsi),%r15
2780	mov	-40(%rsi),%r14
2781	mov	-32(%rsi),%r13
2782	mov	-24(%rsi),%r12
2783	mov	-16(%rsi),%rbp
2784	mov	-8(%rsi),%rbx
2785	lea	(%rsi),%rsp
2786.Lpowerx5_epilogue:
2787	ret
2788.size	bn_powerx5,.-bn_powerx5
2789
2790.globl	bn_sqrx8x_internal
2791.hidden	bn_sqrx8x_internal
2792.type	bn_sqrx8x_internal,\@abi-omnipotent
2793.align	32
2794bn_sqrx8x_internal:
2795__bn_sqrx8x_internal:
2796	##################################################################
2797	# Squaring part:
2798	#
2799	# a) multiply-n-add everything but a[i]*a[i];
2800	# b) shift result of a) by 1 to the left and accumulate
2801	#    a[i]*a[i] products;
2802	#
2803	##################################################################
2804	# a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
2805	#                                                     a[1]a[0]
2806	#                                                 a[2]a[0]
2807	#                                             a[3]a[0]
2808	#                                             a[2]a[1]
2809	#                                         a[3]a[1]
2810	#                                     a[3]a[2]
2811	#
2812	#                                         a[4]a[0]
2813	#                                     a[5]a[0]
2814	#                                 a[6]a[0]
2815	#                             a[7]a[0]
2816	#                                     a[4]a[1]
2817	#                                 a[5]a[1]
2818	#                             a[6]a[1]
2819	#                         a[7]a[1]
2820	#                                 a[4]a[2]
2821	#                             a[5]a[2]
2822	#                         a[6]a[2]
2823	#                     a[7]a[2]
2824	#                             a[4]a[3]
2825	#                         a[5]a[3]
2826	#                     a[6]a[3]
2827	#                 a[7]a[3]
2828	#
2829	#                     a[5]a[4]
2830	#                 a[6]a[4]
2831	#             a[7]a[4]
2832	#             a[6]a[5]
2833	#         a[7]a[5]
2834	#     a[7]a[6]
2835	# a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
2836___
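# In other words, the schedule above first accumulates each cross product
# a[i]*a[j] (i<j) exactly once, then doubles the whole thing and folds in the
# squares a[i]*a[i]. A Math::BigInt reference sketch of that strategy
# (illustrative only, not used by the generator, name made up; carries are not
# normalized here, which the assembly of course does limb by limb):
#
#	use Math::BigInt;
#	sub sqr_ref {
#	    my @a = @_;					# limbs, least significant first, as Math::BigInt
#	    my @t = map { Math::BigInt->new(0) } (1 .. 2*@a);
#	    for my $i (0 .. $#a) {			# a) off-diagonal products, each taken once
#	        $t[$i+$_] += $a[$i] * $a[$_] for ($i+1 .. $#a);
#	    }
#	    @t = map { $_ * 2 } @t;			# b) double them ...
#	    $t[2*$_] += $a[$_] * $a[$_] for (0 .. $#a);	#    ... and add the diagonal a[i]^2
#	    return @t;		# sum(t[k]*2^(64*k)) == (sum(a[i]*2^(64*i)))^2
#	}
#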
2837{
2838my ($zero,$carry)=("%rbp","%rcx");
2839my $aaptr=$zero;
2840$code.=<<___;
2841	lea	48+8(%rsp),$tptr
2842	lea	($aptr,$num),$aaptr
2843	mov	$num,0+8(%rsp)			# save $num
2844	mov	$aaptr,8+8(%rsp)		# save end of $aptr
2845	jmp	.Lsqr8x_zero_start
2846
2847.align	32
2848.byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
2849.Lsqrx8x_zero:
2850	.byte	0x3e
2851	movdqa	%xmm0,0*8($tptr)
2852	movdqa	%xmm0,2*8($tptr)
2853	movdqa	%xmm0,4*8($tptr)
2854	movdqa	%xmm0,6*8($tptr)
2855.Lsqr8x_zero_start:			# aligned at 32
2856	movdqa	%xmm0,8*8($tptr)
2857	movdqa	%xmm0,10*8($tptr)
2858	movdqa	%xmm0,12*8($tptr)
2859	movdqa	%xmm0,14*8($tptr)
2860	lea	16*8($tptr),$tptr
2861	sub	\$64,$num
2862	jnz	.Lsqrx8x_zero
2863
2864	mov	0*8($aptr),%rdx		# a[0], modulo-scheduled
2865	#xor	%r9,%r9			# t[1], ex-$num, zero already
2866	xor	%r10,%r10
2867	xor	%r11,%r11
2868	xor	%r12,%r12
2869	xor	%r13,%r13
2870	xor	%r14,%r14
2871	xor	%r15,%r15
2872	lea	48+8(%rsp),$tptr
2873	xor	$zero,$zero		# cf=0, of=0
2874	jmp	.Lsqrx8x_outer_loop
2875
2876.align	32
2877.Lsqrx8x_outer_loop:
2878	mulx	1*8($aptr),%r8,%rax	# a[1]*a[0]
2879	adcx	%r9,%r8			# a[1]*a[0]+=t[1]
2880	adox	%rax,%r10
2881	mulx	2*8($aptr),%r9,%rax	# a[2]*a[0]
2882	adcx	%r10,%r9
2883	adox	%rax,%r11
2884	.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00	# mulx	3*8($aptr),%r10,%rax	# ...
2885	adcx	%r11,%r10
2886	adox	%rax,%r12
2887	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00	# mulx	4*8($aptr),%r11,%rax
2888	adcx	%r12,%r11
2889	adox	%rax,%r13
2890	mulx	5*8($aptr),%r12,%rax
2891	adcx	%r13,%r12
2892	adox	%rax,%r14
2893	mulx	6*8($aptr),%r13,%rax
2894	adcx	%r14,%r13
2895	adox	%r15,%rax
2896	mulx	7*8($aptr),%r14,%r15
2897	 mov	1*8($aptr),%rdx		# a[1]
2898	adcx	%rax,%r14
2899	adox	$zero,%r15
2900	adc	8*8($tptr),%r15
2901	mov	%r8,1*8($tptr)		# t[1]
2902	mov	%r9,2*8($tptr)		# t[2]
2903	sbb	$carry,$carry		# mov %cf,$carry
2904	xor	$zero,$zero		# cf=0, of=0
2905
2906
2907	mulx	2*8($aptr),%r8,%rbx	# a[2]*a[1]
2908	mulx	3*8($aptr),%r9,%rax	# a[3]*a[1]
2909	adcx	%r10,%r8
2910	adox	%rbx,%r9
2911	mulx	4*8($aptr),%r10,%rbx	# ...
2912	adcx	%r11,%r9
2913	adox	%rax,%r10
2914	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00	# mulx	5*8($aptr),%r11,%rax
2915	adcx	%r12,%r10
2916	adox	%rbx,%r11
2917	.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx	6*8($aptr),%r12,%rbx
2918	adcx	%r13,%r11
2919	adox	%r14,%r12
2920	.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00	# mulx	7*8($aptr),%r13,%r14
2921	 mov	2*8($aptr),%rdx		# a[2]
2922	adcx	%rax,%r12
2923	adox	%rbx,%r13
2924	adcx	%r15,%r13
2925	adox	$zero,%r14		# of=0
2926	adcx	$zero,%r14		# cf=0
2927
2928	mov	%r8,3*8($tptr)		# t[3]
2929	mov	%r9,4*8($tptr)		# t[4]
2930
2931	mulx	3*8($aptr),%r8,%rbx	# a[3]*a[2]
2932	mulx	4*8($aptr),%r9,%rax	# a[4]*a[2]
2933	adcx	%r10,%r8
2934	adox	%rbx,%r9
2935	mulx	5*8($aptr),%r10,%rbx	# ...
2936	adcx	%r11,%r9
2937	adox	%rax,%r10
2938	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00	# mulx	6*8($aptr),%r11,%rax
2939	adcx	%r12,%r10
2940	adox	%r13,%r11
2941	.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00	# mulx	7*8($aptr),%r12,%r13
2942	.byte	0x3e
2943	 mov	3*8($aptr),%rdx		# a[3]
2944	adcx	%rbx,%r11
2945	adox	%rax,%r12
2946	adcx	%r14,%r12
2947	mov	%r8,5*8($tptr)		# t[5]
2948	mov	%r9,6*8($tptr)		# t[6]
2949	 mulx	4*8($aptr),%r8,%rax	# a[4]*a[3]
2950	adox	$zero,%r13		# of=0
2951	adcx	$zero,%r13		# cf=0
2952
2953	mulx	5*8($aptr),%r9,%rbx	# a[5]*a[3]
2954	adcx	%r10,%r8
2955	adox	%rax,%r9
2956	mulx	6*8($aptr),%r10,%rax	# ...
2957	adcx	%r11,%r9
2958	adox	%r12,%r10
2959	mulx	7*8($aptr),%r11,%r12
2960	 mov	4*8($aptr),%rdx		# a[4]
2961	 mov	5*8($aptr),%r14		# a[5]
2962	adcx	%rbx,%r10
2963	adox	%rax,%r11
2964	 mov	6*8($aptr),%r15		# a[6]
2965	adcx	%r13,%r11
2966	adox	$zero,%r12		# of=0
2967	adcx	$zero,%r12		# cf=0
2968
2969	mov	%r8,7*8($tptr)		# t[7]
2970	mov	%r9,8*8($tptr)		# t[8]
2971
2972	mulx	%r14,%r9,%rax		# a[5]*a[4]
2973	 mov	7*8($aptr),%r8		# a[7]
2974	adcx	%r10,%r9
2975	mulx	%r15,%r10,%rbx		# a[6]*a[4]
2976	adox	%rax,%r10
2977	adcx	%r11,%r10
2978	mulx	%r8,%r11,%rax		# a[7]*a[4]
2979	 mov	%r14,%rdx		# a[5]
2980	adox	%rbx,%r11
2981	adcx	%r12,%r11
2982	#adox	$zero,%rax		# of=0
2983	adcx	$zero,%rax		# cf=0
2984
2985	mulx	%r15,%r14,%rbx		# a[6]*a[5]
2986	mulx	%r8,%r12,%r13		# a[7]*a[5]
2987	 mov	%r15,%rdx		# a[6]
2988	 lea	8*8($aptr),$aptr
2989	adcx	%r14,%r11
2990	adox	%rbx,%r12
2991	adcx	%rax,%r12
2992	adox	$zero,%r13
2993
2994	.byte	0x67,0x67
2995	mulx	%r8,%r8,%r14		# a[7]*a[6]
2996	adcx	%r8,%r13
2997	adcx	$zero,%r14
2998
2999	cmp	8+8(%rsp),$aptr
3000	je	.Lsqrx8x_outer_break
3001
3002	neg	$carry			# mov $carry,%cf
3003	mov	\$-8,%rcx
3004	mov	$zero,%r15
3005	mov	8*8($tptr),%r8
3006	adcx	9*8($tptr),%r9		# +=t[9]
3007	adcx	10*8($tptr),%r10	# ...
3008	adcx	11*8($tptr),%r11
3009	adc	12*8($tptr),%r12
3010	adc	13*8($tptr),%r13
3011	adc	14*8($tptr),%r14
3012	adc	15*8($tptr),%r15
3013	lea	($aptr),$aaptr
3014	lea	2*64($tptr),$tptr
3015	sbb	%rax,%rax		# mov %cf,$carry
3016
3017	mov	-64($aptr),%rdx		# a[0]
3018	mov	%rax,16+8(%rsp)		# offload $carry
3019	mov	$tptr,24+8(%rsp)
3020
3021	#lea	8*8($tptr),$tptr	# see 2*8*8($tptr) above
3022	xor	%eax,%eax		# cf=0, of=0
3023	jmp	.Lsqrx8x_loop
3024
3025.align	32
3026.Lsqrx8x_loop:
3027	mov	%r8,%rbx
3028	mulx	0*8($aaptr),%rax,%r8	# a[8]*a[i]
3029	adcx	%rax,%rbx		# +=t[8]
3030	adox	%r9,%r8
3031
3032	mulx	1*8($aaptr),%rax,%r9	# ...
3033	adcx	%rax,%r8
3034	adox	%r10,%r9
3035
3036	mulx	2*8($aaptr),%rax,%r10
3037	adcx	%rax,%r9
3038	adox	%r11,%r10
3039
3040	mulx	3*8($aaptr),%rax,%r11
3041	adcx	%rax,%r10
3042	adox	%r12,%r11
3043
3044	.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	4*8($aaptr),%rax,%r12
3045	adcx	%rax,%r11
3046	adox	%r13,%r12
3047
3048	mulx	5*8($aaptr),%rax,%r13
3049	adcx	%rax,%r12
3050	adox	%r14,%r13
3051
3052	mulx	6*8($aaptr),%rax,%r14
3053	 mov	%rbx,($tptr,%rcx,8)	# store t[8+i]
3054	 mov	\$0,%ebx
3055	adcx	%rax,%r13
3056	adox	%r15,%r14
3057
3058	.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00	# mulx	7*8($aaptr),%rax,%r15
3059	 mov	8($aptr,%rcx,8),%rdx	# a[i]
3060	adcx	%rax,%r14
3061	adox	%rbx,%r15		# %rbx is 0, of=0
3062	adcx	%rbx,%r15		# cf=0
3063
3064	.byte	0x67
3065	inc	%rcx			# of=0
3066	jnz	.Lsqrx8x_loop
3067
3068	lea	8*8($aaptr),$aaptr
3069	mov	\$-8,%rcx
3070	cmp	8+8(%rsp),$aaptr	# done?
3071	je	.Lsqrx8x_break
3072
3073	sub	16+8(%rsp),%rbx		# mov 16(%rsp),%cf
3074	.byte	0x66
3075	mov	-64($aptr),%rdx
3076	adcx	0*8($tptr),%r8
3077	adcx	1*8($tptr),%r9
3078	adc	2*8($tptr),%r10
3079	adc	3*8($tptr),%r11
3080	adc	4*8($tptr),%r12
3081	adc	5*8($tptr),%r13
3082	adc	6*8($tptr),%r14
3083	adc	7*8($tptr),%r15
3084	lea	8*8($tptr),$tptr
3085	.byte	0x67
3086	sbb	%rax,%rax		# mov %cf,%rax
3087	xor	%ebx,%ebx		# cf=0, of=0
3088	mov	%rax,16+8(%rsp)		# offload carry
3089	jmp	.Lsqrx8x_loop
3090
3091.align	32
3092.Lsqrx8x_break:
3093	xor	$zero,$zero
3094	sub	16+8(%rsp),%rbx		# mov 16(%rsp),%cf
3095	adcx	$zero,%r8
3096	mov	24+8(%rsp),$carry	# initial $tptr, borrow $carry
3097	adcx	$zero,%r9
3098	mov	0*8($aptr),%rdx		# a[8], modulo-scheduled
3099	adc	\$0,%r10
3100	mov	%r8,0*8($tptr)
3101	adc	\$0,%r11
3102	adc	\$0,%r12
3103	adc	\$0,%r13
3104	adc	\$0,%r14
3105	adc	\$0,%r15
3106	cmp	$carry,$tptr		# cf=0, of=0
3107	je	.Lsqrx8x_outer_loop
3108
3109	mov	%r9,1*8($tptr)
3110	 mov	1*8($carry),%r9
3111	mov	%r10,2*8($tptr)
3112	 mov	2*8($carry),%r10
3113	mov	%r11,3*8($tptr)
3114	 mov	3*8($carry),%r11
3115	mov	%r12,4*8($tptr)
3116	 mov	4*8($carry),%r12
3117	mov	%r13,5*8($tptr)
3118	 mov	5*8($carry),%r13
3119	mov	%r14,6*8($tptr)
3120	 mov	6*8($carry),%r14
3121	mov	%r15,7*8($tptr)
3122	 mov	7*8($carry),%r15
3123	mov	$carry,$tptr
3124	jmp	.Lsqrx8x_outer_loop
3125
3126.align	32
3127.Lsqrx8x_outer_break:
3128	mov	%r9,9*8($tptr)		# t[9]
3129	 movq	%xmm3,%rcx		# -$num
3130	mov	%r10,10*8($tptr)	# ...
3131	mov	%r11,11*8($tptr)
3132	mov	%r12,12*8($tptr)
3133	mov	%r13,13*8($tptr)
3134	mov	%r14,14*8($tptr)
3135___
3136}{
3137my $i="%rcx";
3138$code.=<<___;
3139	lea	48+8(%rsp),$tptr
3140	mov	($aptr,$i),%rdx		# a[0]
3141
3142	mov	8($tptr),$A0[1]		# t[1]
3143	xor	$A0[0],$A0[0]		# t[0], of=0, cf=0
3144	mov	0+8(%rsp),$num		# restore $num
3145	adox	$A0[1],$A0[1]
3146	 mov	16($tptr),$A1[0]	# t[2]	# prefetch
3147	 mov	24($tptr),$A1[1]	# t[3]	# prefetch
3148	#jmp	.Lsqrx4x_shift_n_add	# happens to be aligned
3149
3150.align	32
3151.Lsqrx4x_shift_n_add:
3152	mulx	%rdx,%rax,%rbx
3153	 adox	$A1[0],$A1[0]
3154	adcx	$A0[0],%rax
3155	 .byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00	# mov	8($aptr,$i),%rdx	# a[i+1]	# prefetch
3156	 .byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00	# mov	32($tptr),$A0[0]	# t[2*i+4]	# prefetch
3157	 adox	$A1[1],$A1[1]
3158	adcx	$A0[1],%rbx
3159	 mov	40($tptr),$A0[1]		# t[2*i+4+1]	# prefetch
3160	mov	%rax,0($tptr)
3161	mov	%rbx,8($tptr)
3162
3163	mulx	%rdx,%rax,%rbx
3164	 adox	$A0[0],$A0[0]
3165	adcx	$A1[0],%rax
3166	 mov	16($aptr,$i),%rdx	# a[i+2]	# prefetch
3167	 mov	48($tptr),$A1[0]	# t[2*i+6]	# prefetch
3168	 adox	$A0[1],$A0[1]
3169	adcx	$A1[1],%rbx
3170	 mov	56($tptr),$A1[1]	# t[2*i+6+1]	# prefetch
3171	mov	%rax,16($tptr)
3172	mov	%rbx,24($tptr)
3173
3174	mulx	%rdx,%rax,%rbx
3175	 adox	$A1[0],$A1[0]
3176	adcx	$A0[0],%rax
3177	 mov	24($aptr,$i),%rdx	# a[i+3]	# prefetch
3178	 lea	32($i),$i
3179	 mov	64($tptr),$A0[0]	# t[2*i+8]	# prefetch
3180	 adox	$A1[1],$A1[1]
3181	adcx	$A0[1],%rbx
3182	 mov	72($tptr),$A0[1]	# t[2*i+8+1]	# prefetch
3183	mov	%rax,32($tptr)
3184	mov	%rbx,40($tptr)
3185
3186	mulx	%rdx,%rax,%rbx
3187	 adox	$A0[0],$A0[0]
3188	adcx	$A1[0],%rax
3189	jrcxz	.Lsqrx4x_shift_n_add_break
3190	 .byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00	# mov	0($aptr,$i),%rdx	# a[i+4]	# prefetch
3191	 adox	$A0[1],$A0[1]
3192	adcx	$A1[1],%rbx
3193	 mov	80($tptr),$A1[0]	# t[2*i+10]	# prefetch
3194	 mov	88($tptr),$A1[1]	# t[2*i+10+1]	# prefetch
3195	mov	%rax,48($tptr)
3196	mov	%rbx,56($tptr)
3197	lea	64($tptr),$tptr
3198	nop
3199	jmp	.Lsqrx4x_shift_n_add
3200
3201.align	32
3202.Lsqrx4x_shift_n_add_break:
3203	adcx	$A1[1],%rbx
3204	mov	%rax,48($tptr)
3205	mov	%rbx,56($tptr)
3206	lea	64($tptr),$tptr		# end of t[] buffer
3207___
3208}
3209######################################################################
3210# Montgomery reduction part, "word-by-word" algorithm.
3211#
3212# This new path is inspired by multiple submissions from Intel, by
3213# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
3214# Vinodh Gopal...
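#
# The word-by-word idea: for each of the num least significant words of t[],
# add the multiple m = t[i]*n0 mod 2^64 of n[] (shifted in by i words) that
# zeroes that word, then drop the num zeroed words, which is the division by
# R = 2^(64*num). A Math::BigInt reference sketch (illustrative only, not used
# by the generator, name made up; limbs least significant first):
#
#	use Math::BigInt;
#	sub mont_reduce_ref {
#	    my ($t, $n, $n0) = @_;	# $t: 2*num limbs, $n: num limbs, $n0 = -n[0]^-1 mod 2^64
#	    my $base = Math::BigInt->new(2)->bpow(64);
#	    my @t = map { Math::BigInt->new($_) } @$t;
#	    for my $i (0 .. $#$n) {
#	        my $m = ($t[$i] * $n0) % $base;		# only the low 64 bits of t[i]*n0 matter
#	        my $carry = Math::BigInt->new(0);
#	        for my $j (0 .. $#$n) {			# t += m*n*2^(64*i), limb by limb
#	            my $acc = $t[$i+$j] + $m * $n->[$j] + $carry;
#	            ($t[$i+$j], $carry) = ($acc % $base, $acc >> 64);
#	        }
#	        for (my $j = $i + @$n; $carry > 0; $j++) {	# propagate carry into the upper half
#	            my $acc = ($t[$j] // Math::BigInt->new(0)) + $carry;
#	            ($t[$j], $carry) = ($acc % $base, $acc >> 64);
#	        }
#	    }
#	    return @t[scalar(@$n) .. $#t];	# t[0..num-1] are zero now; a final t-n may still be needed
#	}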
3215{
3216my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx");
3217
3218$code.=<<___;
3219	movq	%xmm2,$nptr
3220__bn_sqrx8x_reduction:
3221	xor	%eax,%eax		# initial top-most carry bit
3222	mov	32+8(%rsp),%rbx		# n0
3223	mov	48+8(%rsp),%rdx		# "%r8", 8*0($tptr)
3224	lea	-8*8($nptr,$num),%rcx	# end of n[]
3225	#lea	48+8(%rsp,$num,2),$tptr	# end of t[] buffer
3226	mov	%rcx, 0+8(%rsp)		# save end of n[]
3227	mov	$tptr,8+8(%rsp)		# save end of t[]
3228
3229	lea	48+8(%rsp),$tptr		# initial t[] window
3230	jmp	.Lsqrx8x_reduction_loop
3231
3232.align	32
3233.Lsqrx8x_reduction_loop:
3234	mov	8*1($tptr),%r9
3235	mov	8*2($tptr),%r10
3236	mov	8*3($tptr),%r11
3237	mov	8*4($tptr),%r12
3238	mov	%rdx,%r8
3239	imulq	%rbx,%rdx		# n0*a[i]
3240	mov	8*5($tptr),%r13
3241	mov	8*6($tptr),%r14
3242	mov	8*7($tptr),%r15
3243	mov	%rax,24+8(%rsp)		# store top-most carry bit
3244
3245	lea	8*8($tptr),$tptr
3246	xor	$carry,$carry		# cf=0,of=0
3247	mov	\$-8,%rcx
3248	jmp	.Lsqrx8x_reduce
3249
3250.align	32
3251.Lsqrx8x_reduce:
3252	mov	%r8, %rbx
3253	mulx	8*0($nptr),%rax,%r8	# n[0]
3254	adcx	%rbx,%rax		# discarded
3255	adox	%r9,%r8
3256
3257	mulx	8*1($nptr),%rbx,%r9	# n[1]
3258	adcx	%rbx,%r8
3259	adox	%r10,%r9
3260
3261	mulx	8*2($nptr),%rbx,%r10
3262	adcx	%rbx,%r9
3263	adox	%r11,%r10
3264
3265	mulx	8*3($nptr),%rbx,%r11
3266	adcx	%rbx,%r10
3267	adox	%r12,%r11
3268
3269	.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	8*4($nptr),%rbx,%r12
3270	 mov	%rdx,%rax
3271	 mov	%r8,%rdx
3272	adcx	%rbx,%r11
3273	adox	%r13,%r12
3274
3275	 mulx	32+8(%rsp),%rbx,%rdx	# %rdx discarded
3276	 mov	%rax,%rdx
3277	 mov	%rax,64+48+8(%rsp,%rcx,8)	# put aside n0*a[i]
3278
3279	mulx	8*5($nptr),%rax,%r13
3280	adcx	%rax,%r12
3281	adox	%r14,%r13
3282
3283	mulx	8*6($nptr),%rax,%r14
3284	adcx	%rax,%r13
3285	adox	%r15,%r14
3286
3287	mulx	8*7($nptr),%rax,%r15
3288	 mov	%rbx,%rdx
3289	adcx	%rax,%r14
3290	adox	$carry,%r15		# $carry is 0
3291	adcx	$carry,%r15		# cf=0
3292
3293	.byte	0x67,0x67,0x67
3294	inc	%rcx			# of=0
3295	jnz	.Lsqrx8x_reduce
3296
3297	mov	$carry,%rax		# xor	%rax,%rax
3298	cmp	0+8(%rsp),$nptr		# end of n[]?
3299	jae	.Lsqrx8x_no_tail
3300
3301	mov	48+8(%rsp),%rdx		# pull n0*a[0]
3302	add	8*0($tptr),%r8
3303	lea	8*8($nptr),$nptr
3304	mov	\$-8,%rcx
3305	adcx	8*1($tptr),%r9
3306	adcx	8*2($tptr),%r10
3307	adc	8*3($tptr),%r11
3308	adc	8*4($tptr),%r12
3309	adc	8*5($tptr),%r13
3310	adc	8*6($tptr),%r14
3311	adc	8*7($tptr),%r15
3312	lea	8*8($tptr),$tptr
3313	sbb	%rax,%rax		# top carry
3314
3315	xor	$carry,$carry		# of=0, cf=0
3316	mov	%rax,16+8(%rsp)
3317	jmp	.Lsqrx8x_tail
3318
3319.align	32
3320.Lsqrx8x_tail:
3321	mov	%r8,%rbx
3322	mulx	8*0($nptr),%rax,%r8
3323	adcx	%rax,%rbx
3324	adox	%r9,%r8
3325
3326	mulx	8*1($nptr),%rax,%r9
3327	adcx	%rax,%r8
3328	adox	%r10,%r9
3329
3330	mulx	8*2($nptr),%rax,%r10
3331	adcx	%rax,%r9
3332	adox	%r11,%r10
3333
3334	mulx	8*3($nptr),%rax,%r11
3335	adcx	%rax,%r10
3336	adox	%r12,%r11
3337
3338	.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	8*4($nptr),%rax,%r12
3339	adcx	%rax,%r11
3340	adox	%r13,%r12
3341
3342	mulx	8*5($nptr),%rax,%r13
3343	adcx	%rax,%r12
3344	adox	%r14,%r13
3345
3346	mulx	8*6($nptr),%rax,%r14
3347	adcx	%rax,%r13
3348	adox	%r15,%r14
3349
3350	mulx	8*7($nptr),%rax,%r15
3351	 mov	72+48+8(%rsp,%rcx,8),%rdx	# pull n0*a[i]
3352	adcx	%rax,%r14
3353	adox	$carry,%r15
3354	 mov	%rbx,($tptr,%rcx,8)	# save result
3355	 mov	%r8,%rbx
3356	adcx	$carry,%r15		# cf=0
3357
3358	inc	%rcx			# of=0
3359	jnz	.Lsqrx8x_tail
3360
3361	cmp	0+8(%rsp),$nptr		# end of n[]?
3362	jae	.Lsqrx8x_tail_done	# break out of loop
3363
3364	sub	16+8(%rsp),$carry	# mov 16(%rsp),%cf
3365	 mov	48+8(%rsp),%rdx		# pull n0*a[0]
3366	 lea	8*8($nptr),$nptr
3367	adc	8*0($tptr),%r8
3368	adc	8*1($tptr),%r9
3369	adc	8*2($tptr),%r10
3370	adc	8*3($tptr),%r11
3371	adc	8*4($tptr),%r12
3372	adc	8*5($tptr),%r13
3373	adc	8*6($tptr),%r14
3374	adc	8*7($tptr),%r15
3375	lea	8*8($tptr),$tptr
3376	sbb	%rax,%rax
3377	sub	\$8,%rcx		# mov	\$-8,%rcx
3378
3379	xor	$carry,$carry		# of=0, cf=0
3380	mov	%rax,16+8(%rsp)
3381	jmp	.Lsqrx8x_tail
3382
3383.align	32
3384.Lsqrx8x_tail_done:
3385	xor	%rax,%rax
3386	add	24+8(%rsp),%r8		# can this overflow?
3387	adc	\$0,%r9
3388	adc	\$0,%r10
3389	adc	\$0,%r11
3390	adc	\$0,%r12
3391	adc	\$0,%r13
3392	adc	\$0,%r14
3393	adc	\$0,%r15
3394	adc	\$0,%rax
3395
3396	sub	16+8(%rsp),$carry	# mov 16(%rsp),%cf
3397.Lsqrx8x_no_tail:			# %cf is 0 if jumped here
3398	adc	8*0($tptr),%r8
3399	 movq	%xmm3,%rcx
3400	adc	8*1($tptr),%r9
3401	 mov	8*7($nptr),$carry
3402	 movq	%xmm2,$nptr		# restore $nptr
3403	adc	8*2($tptr),%r10
3404	adc	8*3($tptr),%r11
3405	adc	8*4($tptr),%r12
3406	adc	8*5($tptr),%r13
3407	adc	8*6($tptr),%r14
3408	adc	8*7($tptr),%r15
3409	adc	\$0,%rax		# top-most carry
3410
3411	mov	32+8(%rsp),%rbx		# n0
3412	mov	8*8($tptr,%rcx),%rdx	# modulo-scheduled "%r8"
3413
3414	mov	%r8,8*0($tptr)		# store top 512 bits
3415	 lea	8*8($tptr),%r8		# borrow %r8
3416	mov	%r9,8*1($tptr)
3417	mov	%r10,8*2($tptr)
3418	mov	%r11,8*3($tptr)
3419	mov	%r12,8*4($tptr)
3420	mov	%r13,8*5($tptr)
3421	mov	%r14,8*6($tptr)
3422	mov	%r15,8*7($tptr)
3423
3424	lea	8*8($tptr,%rcx),$tptr	# start of current t[] window
3425	cmp	8+8(%rsp),%r8		# end of t[]?
3426	jb	.Lsqrx8x_reduction_loop
3427	ret
3428.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
3429___
3430}
3431##############################################################
3432# Post-condition, 4x unrolled
3433#
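# This is the final conditional subtraction of the modulus, done without a
# data-dependent branch: %rax is turned into an all-ones or all-zeroes mask,
# and the loop adds (~n[i] & mask) through a carry chain (the initial dec
# supplies the +1), i.e. t-n when the mask is set and t unchanged otherwise.
# The net effect (illustrative sketch only, name made up):
#
#	sub postx4x_ref {
#	    my ($t, $n, $mask) = @_;	# Math::BigInt $t, $n; $mask: 0 or 1, prepared beforehand
#	    return $mask ? $t - $n : $t;	# done below as t + ((~n & -mask) + mask), limb-wise
#	}
#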
3434{
3435my ($rptr,$nptr)=("%rdx","%rbp");
3436$code.=<<___;
3437.align	32
3438__bn_postx4x_internal:
3439	mov	8*0($nptr),%r12
3440	mov	%rcx,%r10		# -$num
3441	mov	%rcx,%r9		# -$num
3442	neg	%rax
3443	sar	\$3+2,%rcx
3444	#lea	48+8(%rsp,%r9),$tptr
3445	movq	%xmm1,$rptr		# restore $rptr
3446	movq	%xmm1,$aptr		# prepare for back-to-back call
3447	dec	%r12			# so that after 'not' we get -n[0]
3448	mov	8*1($nptr),%r13
3449	xor	%r8,%r8
3450	mov	8*2($nptr),%r14
3451	mov	8*3($nptr),%r15
3452	jmp	.Lsqrx4x_sub_entry
3453
3454.align	16
3455.Lsqrx4x_sub:
3456	mov	8*0($nptr),%r12
3457	mov	8*1($nptr),%r13
3458	mov	8*2($nptr),%r14
3459	mov	8*3($nptr),%r15
3460.Lsqrx4x_sub_entry:
3461	andn	%rax,%r12,%r12
3462	lea	8*4($nptr),$nptr
3463	andn	%rax,%r13,%r13
3464	andn	%rax,%r14,%r14
3465	andn	%rax,%r15,%r15
3466
3467	neg	%r8			# mov %r8,%cf
3468	adc	8*0($tptr),%r12
3469	adc	8*1($tptr),%r13
3470	adc	8*2($tptr),%r14
3471	adc	8*3($tptr),%r15
3472	mov	%r12,8*0($rptr)
3473	lea	8*4($tptr),$tptr
3474	mov	%r13,8*1($rptr)
3475	sbb	%r8,%r8			# mov %cf,%r8
3476	mov	%r14,8*2($rptr)
3477	mov	%r15,8*3($rptr)
3478	lea	8*4($rptr),$rptr
3479
3480	inc	%rcx
3481	jnz	.Lsqrx4x_sub
3482
3483	neg	%r9			# restore $num
3484
3485	ret
3486.size	__bn_postx4x_internal,.-__bn_postx4x_internal
3487___
3488}
3489}}}
3490{
3491my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order
3492				("%rdi","%esi","%rdx","%ecx");  # Unix order
3493my $out=$inp;
3494my $STRIDE=2**5*8;
3495my $N=$STRIDE/4;
3496
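# bn_get_bits5 below returns the 5-bit exponent window that starts at bit
# position $num: the position is split into a 16-bit word index and an in-word
# bit offset, and the cmova pair steps the load address by one byte whenever
# the offset is above 11, i.e. whenever the window would otherwise cross the
# 16-bit load. A functionally equivalent Perl sketch (illustrative only, name
# made up; assumes the window lies inside the buffer):
#
#	sub get_bits5_ref {
#	    my ($bytes, $bitpos) = @_;		# $bytes: exponent, least significant byte first
#	    my $word = unpack("v", substr($bytes, $bitpos >> 3, 2) . "\0");	# 16 bits covering the window
#	    return ($word >> ($bitpos & 7)) & 0x1f;
#	}
#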
3497$code.=<<___;
3498.globl	bn_get_bits5
3499.type	bn_get_bits5,\@abi-omnipotent
3500.align	16
3501bn_get_bits5:
3502	lea	0($inp),%r10
3503	lea	1($inp),%r11
3504	mov	$num,%ecx
3505	shr	\$4,$num
3506	and	\$15,%ecx
3507	lea	-8(%ecx),%eax
3508	cmp	\$11,%ecx
3509	cmova	%r11,%r10
3510	cmova	%eax,%ecx
3511	movzw	(%r10,$num,2),%eax
3512	shrl	%cl,%eax
3513	and	\$31,%eax
3514	ret
3515.size	bn_get_bits5,.-bn_get_bits5
3516
3517.globl	bn_scatter5
3518.type	bn_scatter5,\@abi-omnipotent
3519.align	16
3520bn_scatter5:
3521	cmp	\$0, $num
3522	jz	.Lscatter_epilogue
3523	lea	($tbl,$idx,8),$tbl
3524.Lscatter:
3525	mov	($inp),%rax
3526	lea	8($inp),$inp
3527	mov	%rax,($tbl)
3528	lea	32*8($tbl),$tbl
3529	sub	\$1,$num
3530	jnz	.Lscatter
3531.Lscatter_epilogue:
3532	ret
3533.size	bn_scatter5,.-bn_scatter5
3534
3535.globl	bn_gather5
3536.type	bn_gather5,\@abi-omnipotent
3537.align	32
3538bn_gather5:
3539.LSEH_begin_bn_gather5:			# Win64 thing, but harmless in other cases
3540	# I can't trust assembler to use specific encoding:-(
3541	.byte	0x4c,0x8d,0x14,0x24			#lea    (%rsp),%r10
3542	.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00	#sub	$0x108,%rsp
3543	lea	.Linc(%rip),%rax
3544	and	\$-16,%rsp		# shouldn't be formally required
3545
3546	movd	$idx,%xmm5
3547	movdqa	0(%rax),%xmm0		# 00000001000000010000000000000000
3548	movdqa	16(%rax),%xmm1		# 00000002000000020000000200000002
3549	lea	128($tbl),%r11		# size optimization
3550	lea	128(%rsp),%rax		# size optimization
3551
3552	pshufd	\$0,%xmm5,%xmm5		# broadcast $idx
3553	movdqa	%xmm1,%xmm4
3554	movdqa	%xmm1,%xmm2
3555___
3556########################################################################
3557# calculate mask by comparing 0..31 to $idx and save result to stack
3558#
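# bn_scatter5 above stores limb i of a power at tbl[32*i+idx], so each limb of
# all 32 powers shares one 256-byte row. bn_gather5 reads every entry of each
# row and keeps only the column selected by the mask built below, which makes
# its access pattern independent of $idx. A Perl reference sketch of the pair
# (illustrative only, not used by the generator, names made up):
#
#	sub scatter5_ref {
#	    my ($tbl, $power, $idx) = @_;		# $power: array of limbs
#	    $tbl->[32*$_ + $idx] = $power->[$_] for (0 .. $#$power);
#	}
#	sub gather5_ref {
#	    my ($tbl, $num, $idx) = @_;
#	    my @out;
#	    for my $i (0 .. $num-1) {			# scan the full 32-entry row for every limb
#	        my $w = 0;
#	        $w |= $tbl->[32*$i + $_] & (($_ == $idx) ? ~0 : 0) for (0 .. 31);
#	        push @out, $w;
#	    }
#	    return @out;
#	}
#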
3559for($i=0;$i<$STRIDE/16;$i+=4) {
3560$code.=<<___;
3561	paddd	%xmm0,%xmm1
3562	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
3563___
3564$code.=<<___	if ($i);
3565	movdqa	%xmm3,`16*($i-1)-128`(%rax)
3566___
3567$code.=<<___;
3568	movdqa	%xmm4,%xmm3
3569
3570	paddd	%xmm1,%xmm2
3571	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
3572	movdqa	%xmm0,`16*($i+0)-128`(%rax)
3573	movdqa	%xmm4,%xmm0
3574
3575	paddd	%xmm2,%xmm3
3576	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
3577	movdqa	%xmm1,`16*($i+1)-128`(%rax)
3578	movdqa	%xmm4,%xmm1
3579
3580	paddd	%xmm3,%xmm0
3581	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
3582	movdqa	%xmm2,`16*($i+2)-128`(%rax)
3583	movdqa	%xmm4,%xmm2
3584___
3585}
3586$code.=<<___;
3587	movdqa	%xmm3,`16*($i-1)-128`(%rax)
3588	jmp	.Lgather
3589
3590.align	32
3591.Lgather:
3592	pxor	%xmm4,%xmm4
3593	pxor	%xmm5,%xmm5
3594___
3595for($i=0;$i<$STRIDE/16;$i+=4) {
3596$code.=<<___;
3597	movdqa	`16*($i+0)-128`(%r11),%xmm0
3598	movdqa	`16*($i+1)-128`(%r11),%xmm1
3599	movdqa	`16*($i+2)-128`(%r11),%xmm2
3600	pand	`16*($i+0)-128`(%rax),%xmm0
3601	movdqa	`16*($i+3)-128`(%r11),%xmm3
3602	pand	`16*($i+1)-128`(%rax),%xmm1
3603	por	%xmm0,%xmm4
3604	pand	`16*($i+2)-128`(%rax),%xmm2
3605	por	%xmm1,%xmm5
3606	pand	`16*($i+3)-128`(%rax),%xmm3
3607	por	%xmm2,%xmm4
3608	por	%xmm3,%xmm5
3609___
3610}
3611$code.=<<___;
3612	por	%xmm5,%xmm4
3613	lea	$STRIDE(%r11),%r11
3614	pshufd	\$0x4e,%xmm4,%xmm0
3615	por	%xmm4,%xmm0
3616	movq	%xmm0,($out)		# m0=bp[0]
3617	lea	8($out),$out
3618	sub	\$1,$num
3619	jnz	.Lgather
3620
3621	lea	(%r10),%rsp
3622	ret
3623.LSEH_end_bn_gather5:
3624.size	bn_gather5,.-bn_gather5
3625___
3626}
3627$code.=<<___;
3628.align	64
3629.Linc:
3630	.long	0,0, 1,1
3631	.long	2,2, 2,2
3632.asciz	"Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
3633___
3634
3635# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
3636#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
3637if ($win64) {
3638$rec="%rcx";
3639$frame="%rdx";
3640$context="%r8";
3641$disp="%r9";
3642
3643$code.=<<___;
3644.extern	__imp_RtlVirtualUnwind
3645.type	mul_handler,\@abi-omnipotent
3646.align	16
3647mul_handler:
3648	push	%rsi
3649	push	%rdi
3650	push	%rbx
3651	push	%rbp
3652	push	%r12
3653	push	%r13
3654	push	%r14
3655	push	%r15
3656	pushfq
3657	sub	\$64,%rsp
3658
3659	mov	120($context),%rax	# pull context->Rax
3660	mov	248($context),%rbx	# pull context->Rip
3661
3662	mov	8($disp),%rsi		# disp->ImageBase
3663	mov	56($disp),%r11		# disp->HandlerData
3664
3665	mov	0(%r11),%r10d		# HandlerData[0]
3666	lea	(%rsi,%r10),%r10	# end of prologue label
3667	cmp	%r10,%rbx		# context->Rip<end of prologue label
3668	jb	.Lcommon_seh_tail
3669
3670	mov	4(%r11),%r10d		# HandlerData[1]
3671	lea	(%rsi,%r10),%r10	# beginning of body label
3672	cmp	%r10,%rbx		# context->Rip<body label
3673	jb	.Lcommon_pop_regs
3674
3675	mov	152($context),%rax	# pull context->Rsp
3676
3677	mov	8(%r11),%r10d		# HandlerData[2]
3678	lea	(%rsi,%r10),%r10	# epilogue label
3679	cmp	%r10,%rbx		# context->Rip>=epilogue label
3680	jae	.Lcommon_seh_tail
3681
3682	lea	.Lmul_epilogue(%rip),%r10
3683	cmp	%r10,%rbx
3684	ja	.Lbody_40
3685
3686	mov	192($context),%r10	# pull $num
3687	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
3688
3689	jmp	.Lcommon_pop_regs
3690
3691.Lbody_40:
3692	mov	40(%rax),%rax		# pull saved stack pointer
3693.Lcommon_pop_regs:
3694	mov	-8(%rax),%rbx
3695	mov	-16(%rax),%rbp
3696	mov	-24(%rax),%r12
3697	mov	-32(%rax),%r13
3698	mov	-40(%rax),%r14
3699	mov	-48(%rax),%r15
3700	mov	%rbx,144($context)	# restore context->Rbx
3701	mov	%rbp,160($context)	# restore context->Rbp
3702	mov	%r12,216($context)	# restore context->R12
3703	mov	%r13,224($context)	# restore context->R13
3704	mov	%r14,232($context)	# restore context->R14
3705	mov	%r15,240($context)	# restore context->R15
3706
3707.Lcommon_seh_tail:
3708	mov	8(%rax),%rdi
3709	mov	16(%rax),%rsi
3710	mov	%rax,152($context)	# restore context->Rsp
3711	mov	%rsi,168($context)	# restore context->Rsi
3712	mov	%rdi,176($context)	# restore context->Rdi
3713
3714	mov	40($disp),%rdi		# disp->ContextRecord
3715	mov	$context,%rsi		# context
3716	mov	\$154,%ecx		# sizeof(CONTEXT)
3717	.long	0xa548f3fc		# cld; rep movsq
3718
3719	mov	$disp,%rsi
3720	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
3721	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
3722	mov	0(%rsi),%r8		# arg3, disp->ControlPc
3723	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
3724	mov	40(%rsi),%r10		# disp->ContextRecord
3725	lea	56(%rsi),%r11		# &disp->HandlerData
3726	lea	24(%rsi),%r12		# &disp->EstablisherFrame
3727	mov	%r10,32(%rsp)		# arg5
3728	mov	%r11,40(%rsp)		# arg6
3729	mov	%r12,48(%rsp)		# arg7
3730	mov	%rcx,56(%rsp)		# arg8, (NULL)
3731	call	*__imp_RtlVirtualUnwind(%rip)
3732
3733	mov	\$1,%eax		# ExceptionContinueSearch
3734	add	\$64,%rsp
3735	popfq
3736	pop	%r15
3737	pop	%r14
3738	pop	%r13
3739	pop	%r12
3740	pop	%rbp
3741	pop	%rbx
3742	pop	%rdi
3743	pop	%rsi
3744	ret
3745.size	mul_handler,.-mul_handler
3746
3747.section	.pdata
3748.align	4
3749	.rva	.LSEH_begin_bn_mul_mont_gather5
3750	.rva	.LSEH_end_bn_mul_mont_gather5
3751	.rva	.LSEH_info_bn_mul_mont_gather5
3752
3753	.rva	.LSEH_begin_bn_mul4x_mont_gather5
3754	.rva	.LSEH_end_bn_mul4x_mont_gather5
3755	.rva	.LSEH_info_bn_mul4x_mont_gather5
3756
3757	.rva	.LSEH_begin_bn_power5
3758	.rva	.LSEH_end_bn_power5
3759	.rva	.LSEH_info_bn_power5
3760
3761	.rva	.LSEH_begin_bn_from_mont8x
3762	.rva	.LSEH_end_bn_from_mont8x
3763	.rva	.LSEH_info_bn_from_mont8x
3764___
3765$code.=<<___ if ($addx);
3766	.rva	.LSEH_begin_bn_mulx4x_mont_gather5
3767	.rva	.LSEH_end_bn_mulx4x_mont_gather5
3768	.rva	.LSEH_info_bn_mulx4x_mont_gather5
3769
3770	.rva	.LSEH_begin_bn_powerx5
3771	.rva	.LSEH_end_bn_powerx5
3772	.rva	.LSEH_info_bn_powerx5
3773___
3774$code.=<<___;
3775	.rva	.LSEH_begin_bn_gather5
3776	.rva	.LSEH_end_bn_gather5
3777	.rva	.LSEH_info_bn_gather5
3778
3779.section	.xdata
3780.align	8
3781.LSEH_info_bn_mul_mont_gather5:
3782	.byte	9,0,0,0
3783	.rva	mul_handler
3784	.rva	.Lmul_body,.Lmul_body,.Lmul_epilogue		# HandlerData[]
3785.align	8
3786.LSEH_info_bn_mul4x_mont_gather5:
3787	.byte	9,0,0,0
3788	.rva	mul_handler
3789	.rva	.Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue		# HandlerData[]
3790.align	8
3791.LSEH_info_bn_power5:
3792	.byte	9,0,0,0
3793	.rva	mul_handler
3794	.rva	.Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue	# HandlerData[]
3795.align	8
3796.LSEH_info_bn_from_mont8x:
3797	.byte	9,0,0,0
3798	.rva	mul_handler
3799	.rva	.Lfrom_prologue,.Lfrom_body,.Lfrom_epilogue		# HandlerData[]
3800___
3801$code.=<<___ if ($addx);
3802.align	8
3803.LSEH_info_bn_mulx4x_mont_gather5:
3804	.byte	9,0,0,0
3805	.rva	mul_handler
3806	.rva	.Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue	# HandlerData[]
3807.align	8
3808.LSEH_info_bn_powerx5:
3809	.byte	9,0,0,0
3810	.rva	mul_handler
3811	.rva	.Lpowerx5_prologue,.Lpowerx5_body,.Lpowerx5_epilogue	# HandlerData[]
3812___
3813$code.=<<___;
3814.align	8
3815.LSEH_info_bn_gather5:
3816	.byte	0x01,0x0b,0x03,0x0a
3817	.byte	0x0b,0x01,0x21,0x00	# sub	rsp,0x108
3818	.byte	0x04,0xa3,0x00,0x00	# lea	r10,(rsp)
3819.align	8
3820___
3821}
3822
3823$code =~ s/\`([^\`]*)\`/eval($1)/gem;
3824
3825print $code;
3826close STDOUT;
3827