# x86_64-mont5.pl revision 337982
1#!/usr/bin/env perl
2
3# ====================================================================
4# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5# project. The module is, however, dual licensed under OpenSSL and
6# CRYPTOGAMS licenses depending on where you obtain it. For further
7# details see http://www.openssl.org/~appro/cryptogams/.
8# ====================================================================
9
10# August 2011.
11#
12# Companion to x86_64-mont.pl that optimizes cache-timing attack
13# countermeasures. The subroutines are produced by replacing bp[i]
14# references in their x86_64-mont.pl counterparts with cache-neutral
15# references to powers table computed in BN_mod_exp_mont_consttime.
16# In addition subroutine that scatters elements of the powers table
17# is implemented, so that scatter-/gathering can be tuned without
18# bn_exp.c modifications.
19
20# August 2013.
21#
22# Add MULX/AD*X code paths and additional interfaces to optimize for
23# branch prediction unit. For input lengths that are multiples of 8
24# the np argument is not just modulus value, but one interleaved
25# with 0. This is to optimize post-condition...
26
# Command line: [flavour] output.  If the first argument contains a dot it
# is taken to be the output file name and no flavour is assumed.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# The Win64 calling convention is selected by a nasm/masm/mingw64 flavour
# or by an output file ending in .asm.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate x86_64-xlate.pl, either next to this script or in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the
# translator.  Fail loudly if the pipe cannot be started rather than
# silently losing the generated code.
open OUT,"| \"$^X\" $xlate $flavour $output"
	or die "can't call $xlate: $!";
*STDOUT=*OUT;

# $addx becomes true when the assembler in use is recent enough for the
# MULX/AD*X code paths: GNU as >= 2.23, NASM >= 2.10, ml64 >= 12 or
# clang/LLVM >= 3.3.  Each probe is skipped once an earlier one succeeded.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.23);
}

if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$addx = ($1>=2.10);
}

if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$addx = ($1>=12);
}

# The major version is matched as a whole number so that clang 10 and
# later are recognised (a single [3-9] digit never matched "10.0"), and
# "clang version" is no longer anchored to the start of the output so
# that vendor-prefixed banners ("Apple clang version ...") match too.
if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) {
	my $ver = $2 + $3/100.0;	# 3.1->3.01, 3.10->3.10
	$addx = ($ver>=3.03);
}
60
# Register names for the arguments of
#
#   int bn_mul_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap,
#                           const BN_ULONG *bp, const BN_ULONG *np,
#                           const BN_ULONG *n0, int num, int idx);
#
# idx (0 to 2^5-1) is the "index" in $bp holding pre-computed powers of
# a', interlaced in such manner that b[0] is $bp[idx], b[1] is
# [2^5+idx], etc.  It has no register name here; the generated code
# loads it from the stack as the 7th argument.
($rp,$ap,$bp,$np,$n0,$num)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");

# Scratch registers used by the generic (non-4x) code path.
($lo0,$hi0,$hi1)=("%r10","%r11","%r13");
($i,$j)=("%r14","%r15");			# loop counters
($m0,$m1)=("%rbx","%rbp");			# $m0=bp[i], $m1="tp[0]"*n0
79
# Emit bn_mul_mont_gather5.  The entry stub zero-extends num and sends
# every num that is a multiple of 8 to .Lmul4x_enter; any other num takes
# the generic .Lmul_enter path emitted further down.
80$code=<<___;
81.text
82
83.extern	OPENSSL_ia32cap_P
84
85.globl	bn_mul_mont_gather5
86.type	bn_mul_mont_gather5,\@function,6
87.align	64
88bn_mul_mont_gather5:
89	mov	${num}d,${num}d
90	mov	%rsp,%rax
91	test	\$7,${num}d
92	jnz	.Lmul_enter
93___
# Only when the assembler supports MULX/AD*X: preload the capability word
# that is tested right after .Lmul4x_enter.
94$code.=<<___ if ($addx);
95	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
96___
# Generic path: save callee-saved registers, carve out the frame, touch
# the stack one page at a time, and point %r12 at bp+128.
97$code.=<<___;
98	jmp	.Lmul4x_enter
99
100.align	16
101.Lmul_enter:
102	movd	`($win64?56:8)`(%rsp),%xmm5	# load 7th argument
103	push	%rbx
104	push	%rbp
105	push	%r12
106	push	%r13
107	push	%r14
108	push	%r15
109
110	neg	$num
111	mov	%rsp,%r11
112	lea	-280(%rsp,$num,8),%r10	# future alloca(8*(num+2)+256+8)
113	neg	$num			# restore $num
114	and	\$-1024,%r10		# minimize TLB usage
115
116	# Some OSes, *cough*-dows, insist on stack being "wired" to
117	# physical memory in strictly sequential manner, i.e. if stack
118	# allocation spans two pages, then reference to farmost one can
119	# be punishable by SEGV. But page walking can do good even on
120	# other OSes, because it guarantees that villain thread hits
121	# the guard page before it can make damage to innocent one...
122	sub	%r10,%r11
123	and	\$-4096,%r11
124	lea	(%r10,%r11),%rsp
125	mov	(%rsp),%r11
126	cmp	%r10,%rsp
127	ja	.Lmul_page_walk
128	jmp	.Lmul_page_walk_done
129
130.Lmul_page_walk:
131	lea	-4096(%rsp),%rsp
132	mov	(%rsp),%r11
133	cmp	%r10,%rsp
134	ja	.Lmul_page_walk
135.Lmul_page_walk_done:
136
137	lea	.Linc(%rip),%r10
138	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
139.Lmul_body:
140
141	lea	128($bp),%r12		# reassign $bp (+size optimization)
142___
		# From here on $bp names %r12, which the code above set to
		# bp+128.  $STRIDE (2**5 entries of 8 bytes = 256 bytes) is
		# how far $bp is advanced after each gathered word of b.
143		$bp="%r12";
144		$STRIDE=2**5*8;		# 5 is "window size"
145		$N=$STRIDE/4;		# should match cache line size
146$code.=<<___;
147	movdqa	0(%r10),%xmm0		# 00000001000000010000000000000000
148	movdqa	16(%r10),%xmm1		# 00000002000000020000000200000002
149	lea	24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization)
150	and	\$-16,%r10
151
152	pshufd	\$0,%xmm5,%xmm5		# broadcast index
153	movdqa	%xmm1,%xmm4
154	movdqa	%xmm1,%xmm2
155___
156########################################################################
157# calculate mask by comparing 0..31 to index and save result to stack
158#
159$code.=<<___;
160	paddd	%xmm0,%xmm1
161	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
162	.byte	0x67
163	movdqa	%xmm4,%xmm3
164___
# Unrolled at generation time: each pass ($k=0,4,8) stores four 16-byte
# compare results at 112(%r10)+16*$k and following.
165for($k=0;$k<$STRIDE/16-4;$k+=4) {
166$code.=<<___;
167	paddd	%xmm1,%xmm2
168	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
169	movdqa	%xmm0,`16*($k+0)+112`(%r10)
170	movdqa	%xmm4,%xmm0
171
172	paddd	%xmm2,%xmm3
173	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
174	movdqa	%xmm1,`16*($k+1)+112`(%r10)
175	movdqa	%xmm4,%xmm1
176
177	paddd	%xmm3,%xmm0
178	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
179	movdqa	%xmm2,`16*($k+2)+112`(%r10)
180	movdqa	%xmm4,%xmm2
181
182	paddd	%xmm0,%xmm1
183	pcmpeqd	%xmm5,%xmm0
184	movdqa	%xmm3,`16*($k+3)+112`(%r10)
185	movdqa	%xmm4,%xmm3
186___
187}
# $k is 12 here, left over from the loop above: the last four compare
# results are stored and at once AND-ed with the matching table chunks.
188$code.=<<___;				# last iteration can be optimized
189	paddd	%xmm1,%xmm2
190	pcmpeqd	%xmm5,%xmm1
191	movdqa	%xmm0,`16*($k+0)+112`(%r10)
192
193	paddd	%xmm2,%xmm3
194	.byte	0x67
195	pcmpeqd	%xmm5,%xmm2
196	movdqa	%xmm1,`16*($k+1)+112`(%r10)
197
198	pcmpeqd	%xmm5,%xmm3
199	movdqa	%xmm2,`16*($k+2)+112`(%r10)
200	pand	`16*($k+0)-128`($bp),%xmm0	# while it's still in register
201
202	pand	`16*($k+1)-128`($bp),%xmm1
203	pand	`16*($k+2)-128`($bp),%xmm2
204	movdqa	%xmm3,`16*($k+3)+112`(%r10)
205	pand	`16*($k+3)-128`($bp),%xmm3
206	por	%xmm2,%xmm0
207	por	%xmm3,%xmm1
208___
# Fold in the remaining twelve 16-byte table chunks (0..11), each AND-ed
# with the compare result saved on the stack for it.
209for($k=0;$k<$STRIDE/16-4;$k+=4) {
210$code.=<<___;
211	movdqa	`16*($k+0)-128`($bp),%xmm4
212	movdqa	`16*($k+1)-128`($bp),%xmm5
213	movdqa	`16*($k+2)-128`($bp),%xmm2
214	pand	`16*($k+0)+112`(%r10),%xmm4
215	movdqa	`16*($k+3)-128`($bp),%xmm3
216	pand	`16*($k+1)+112`(%r10),%xmm5
217	por	%xmm4,%xmm0
218	pand	`16*($k+2)+112`(%r10),%xmm2
219	por	%xmm5,%xmm1
220	pand	`16*($k+3)+112`(%r10),%xmm3
221	por	%xmm2,%xmm0
222	por	%xmm3,%xmm1
223___
224}
# Collapse the accumulators into $m0=bp[0], advance $bp by $STRIDE and run
# the first pass over ap[]/np[] (.L1st).  This heredoc stops at the top of
# .Louter, where the next word of b is gathered.
225$code.=<<___;
226	por	%xmm1,%xmm0
227	pshufd	\$0x4e,%xmm0,%xmm1
228	por	%xmm1,%xmm0
229	lea	$STRIDE($bp),$bp
230	movq	%xmm0,$m0		# m0=bp[0]
231
232	mov	($n0),$n0		# pull n0[0] value
233	mov	($ap),%rax
234
235	xor	$i,$i			# i=0
236	xor	$j,$j			# j=0
237
238	mov	$n0,$m1
239	mulq	$m0			# ap[0]*bp[0]
240	mov	%rax,$lo0
241	mov	($np),%rax
242
243	imulq	$lo0,$m1		# "tp[0]"*n0
244	mov	%rdx,$hi0
245
246	mulq	$m1			# np[0]*m1
247	add	%rax,$lo0		# discarded
248	mov	8($ap),%rax
249	adc	\$0,%rdx
250	mov	%rdx,$hi1
251
252	lea	1($j),$j		# j++
253	jmp	.L1st_enter
254
255.align	16
256.L1st:
257	add	%rax,$hi1
258	mov	($ap,$j,8),%rax
259	adc	\$0,%rdx
260	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
261	mov	$lo0,$hi0
262	adc	\$0,%rdx
263	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
264	mov	%rdx,$hi1
265
266.L1st_enter:
267	mulq	$m0			# ap[j]*bp[0]
268	add	%rax,$hi0
269	mov	($np,$j,8),%rax
270	adc	\$0,%rdx
271	lea	1($j),$j		# j++
272	mov	%rdx,$lo0
273
274	mulq	$m1			# np[j]*m1
275	cmp	$num,$j
276	jne	.L1st			# note that upon exit $j==$num, so
277					# they can be used interchangeably
278
279	add	%rax,$hi1
280	adc	\$0,%rdx
281	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
282	adc	\$0,%rdx
283	mov	$hi1,-16(%rsp,$num,8)	# tp[num-1]
284	mov	%rdx,$hi1
285	mov	$lo0,$hi0
286
287	xor	%rdx,%rdx
288	add	$hi0,$hi1
289	adc	\$0,%rdx
290	mov	$hi1,-8(%rsp,$num,8)
291	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
292
293	lea	1($i),$i		# i++
294	jmp	.Louter
295.align	16
296.Louter:
297	lea	24+128(%rsp,$num,8),%rdx	# where 256-byte mask is (+size optimization)
298	and	\$-16,%rdx
299	pxor	%xmm4,%xmm4
300	pxor	%xmm5,%xmm5
301___
# Gather bp[i]: all sixteen 16-byte table chunks are AND-ed with the masks
# saved on the stack and OR-ed together, fully unrolled.
302for($k=0;$k<$STRIDE/16;$k+=4) {
303$code.=<<___;
304	movdqa	`16*($k+0)-128`($bp),%xmm0
305	movdqa	`16*($k+1)-128`($bp),%xmm1
306	movdqa	`16*($k+2)-128`($bp),%xmm2
307	movdqa	`16*($k+3)-128`($bp),%xmm3
308	pand	`16*($k+0)-128`(%rdx),%xmm0
309	pand	`16*($k+1)-128`(%rdx),%xmm1
310	por	%xmm0,%xmm4
311	pand	`16*($k+2)-128`(%rdx),%xmm2
312	por	%xmm1,%xmm5
313	pand	`16*($k+3)-128`(%rdx),%xmm3
314	por	%xmm2,%xmm4
315	por	%xmm3,%xmm5
316___
317}
# Finish the gather into $m0=bp[i], run the inner multiply-accumulate pass
# (.Linner) and loop back to .Louter while i<num.  Then subtract np
# (.Lsub), select between tp and the difference while zapping the
# temporary vector (.Lcopy), restore registers and return 1.
318$code.=<<___;
319	por	%xmm5,%xmm4
320	pshufd	\$0x4e,%xmm4,%xmm0
321	por	%xmm4,%xmm0
322	lea	$STRIDE($bp),$bp
323
324	mov	($ap),%rax		# ap[0]
325	movq	%xmm0,$m0		# m0=bp[i]
326
327	xor	$j,$j			# j=0
328	mov	$n0,$m1
329	mov	(%rsp),$lo0
330
331	mulq	$m0			# ap[0]*bp[i]
332	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
333	mov	($np),%rax
334	adc	\$0,%rdx
335
336	imulq	$lo0,$m1		# tp[0]*n0
337	mov	%rdx,$hi0
338
339	mulq	$m1			# np[0]*m1
340	add	%rax,$lo0		# discarded
341	mov	8($ap),%rax
342	adc	\$0,%rdx
343	mov	8(%rsp),$lo0		# tp[1]
344	mov	%rdx,$hi1
345
346	lea	1($j),$j		# j++
347	jmp	.Linner_enter
348
349.align	16
350.Linner:
351	add	%rax,$hi1
352	mov	($ap,$j,8),%rax
353	adc	\$0,%rdx
354	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
355	mov	(%rsp,$j,8),$lo0
356	adc	\$0,%rdx
357	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
358	mov	%rdx,$hi1
359
360.Linner_enter:
361	mulq	$m0			# ap[j]*bp[i]
362	add	%rax,$hi0
363	mov	($np,$j,8),%rax
364	adc	\$0,%rdx
365	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
366	mov	%rdx,$hi0
367	adc	\$0,$hi0
368	lea	1($j),$j		# j++
369
370	mulq	$m1			# np[j]*m1
371	cmp	$num,$j
372	jne	.Linner			# note that upon exit $j==$num, so
373					# they can be used interchangeably
374	add	%rax,$hi1
375	adc	\$0,%rdx
376	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
377	mov	(%rsp,$num,8),$lo0
378	adc	\$0,%rdx
379	mov	$hi1,-16(%rsp,$num,8)	# tp[num-1]
380	mov	%rdx,$hi1
381
382	xor	%rdx,%rdx
383	add	$hi0,$hi1
384	adc	\$0,%rdx
385	add	$lo0,$hi1		# pull upmost overflow bit
386	adc	\$0,%rdx
387	mov	$hi1,-8(%rsp,$num,8)
388	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
389
390	lea	1($i),$i		# i++
391	cmp	$num,$i
392	jb	.Louter
393
394	xor	$i,$i			# i=0 and clear CF!
395	mov	(%rsp),%rax		# tp[0]
396	lea	(%rsp),$ap		# borrow ap for tp
397	mov	$num,$j			# j=num
398	jmp	.Lsub
399.align	16
400.Lsub:	sbb	($np,$i,8),%rax
401	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
402	mov	8($ap,$i,8),%rax	# tp[i+1]
403	lea	1($i),$i		# i++
404	dec	$j			# doesnn't affect CF!
405	jnz	.Lsub
406
407	sbb	\$0,%rax		# handle upmost overflow bit
408	mov	\$-1,%rbx
409	xor	%rax,%rbx
410	xor	$i,$i
411	mov	$num,$j			# j=num
412
413.Lcopy:					# conditional copy
414	mov	($rp,$i,8),%rcx
415	mov	(%rsp,$i,8),%rdx
416	and	%rbx,%rcx
417	and	%rax,%rdx
418	mov	$i,(%rsp,$i,8)		# zap temporary vector
419	or	%rcx,%rdx
420	mov	%rdx,($rp,$i,8)		# rp[i]=tp[i]
421	lea	1($i),$i
422	sub	\$1,$j
423	jnz	.Lcopy
424
425	mov	8(%rsp,$num,8),%rsi	# restore %rsp
426	mov	\$1,%rax
427
428	mov	-48(%rsi),%r15
429	mov	-40(%rsi),%r14
430	mov	-32(%rsi),%r13
431	mov	-24(%rsi),%r12
432	mov	-16(%rsi),%rbp
433	mov	-8(%rsi),%rbx
434	lea	(%rsi),%rsp
435.Lmul_epilogue:
436	ret
437.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
438___
439{{{
# bn_mul4x_mont_gather5 is not exported; it is reached through
# .Lmul4x_enter from bn_mul_mont_gather5 when num is a multiple of 8.
# @A holds the running ap[j]*bp[i] words and @N the running np[j]*m1
# words in the 4x code.  $N[1] is %rdi, the same register as $rp, which
# is why mul4x_internal saves $rp on the stack.
440my @A=("%r10","%r11");
441my @N=("%r13","%rdi");
442$code.=<<___;
443.type	bn_mul4x_mont_gather5,\@function,6
444.align	32
445bn_mul4x_mont_gather5:
446	.byte	0x67
447	mov	%rsp,%rax
448.Lmul4x_enter:
449___
# Only when the assembler supports MULX/AD*X: divert to .Lmulx4x_enter if
# all of the tested capability bits are set in %r11d (loaded by the stub
# in bn_mul_mont_gather5 before it jumps to .Lmul4x_enter).
450$code.=<<___ if ($addx);
451	and	\$0x80108,%r11d
452	cmp	\$0x80108,%r11d		# check for AD*X+BMI2+BMI1
453	je	.Lmulx4x_enter
454___
# Prologue, frame placement, stack page walk, call to mul4x_internal and
# epilogue.  The same heredoc goes on to open mul4x_internal.
455$code.=<<___;
456	push	%rbx
457	push	%rbp
458	push	%r12
459	push	%r13
460	push	%r14
461	push	%r15
462.Lmul4x_prologue:
463
464	.byte	0x67
465	shl	\$3,${num}d		# convert $num to bytes
466	lea	($num,$num,2),%r10	# 3*$num in bytes
467	neg	$num			# -$num
468
469	##############################################################
470	# Ensure that stack frame doesn't alias with $rptr+3*$num
471	# modulo 4096, which covers ret[num], am[num] and n[num]
472	# (see bn_exp.c). This is done to allow memory disambiguation
473	# logic do its magic. [Extra [num] is allocated in order
474	# to align with bn_power5's frame, which is cleansed after
475	# completing exponentiation. Extra 256 bytes is for power mask
476	# calculated from 7th argument, the index.]
477	#
478	lea	-320(%rsp,$num,2),%r11
479	mov	%rsp,%rbp
480	sub	$rp,%r11
481	and	\$4095,%r11
482	cmp	%r11,%r10
483	jb	.Lmul4xsp_alt
484	sub	%r11,%rbp		# align with $rp
485	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*num*8+256)
486	jmp	.Lmul4xsp_done
487
488.align	32
489.Lmul4xsp_alt:
490	lea	4096-320(,$num,2),%r10
491	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*num*8+256)
492	sub	%r10,%r11
493	mov	\$0,%r10
494	cmovc	%r10,%r11
495	sub	%r11,%rbp
496.Lmul4xsp_done:
497	and	\$-64,%rbp
498	mov	%rsp,%r11
499	sub	%rbp,%r11
500	and	\$-4096,%r11
501	lea	(%rbp,%r11),%rsp
502	mov	(%rsp),%r10
503	cmp	%rbp,%rsp
504	ja	.Lmul4x_page_walk
505	jmp	.Lmul4x_page_walk_done
506
507.Lmul4x_page_walk:
508	lea	-4096(%rsp),%rsp
509	mov	(%rsp),%r10
510	cmp	%rbp,%rsp
511	ja	.Lmul4x_page_walk
512.Lmul4x_page_walk_done:
513
514	neg	$num
515
516	mov	%rax,40(%rsp)
517.Lmul4x_body:
518
519	call	mul4x_internal
520
521	mov	40(%rsp),%rsi		# restore %rsp
522	mov	\$1,%rax
523
524	mov	-48(%rsi),%r15
525	mov	-40(%rsi),%r14
526	mov	-32(%rsi),%r13
527	mov	-24(%rsi),%r12
528	mov	-16(%rsi),%rbp
529	mov	-8(%rsi),%rbx
530	lea	(%rsi),%rsp
531.Lmul4x_epilogue:
532	ret
533.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
534
535.type	mul4x_internal,\@abi-omnipotent
536.align	32
537mul4x_internal:
538	shl	\$5,$num		# $num was in bytes
539	movd	`($win64?56:8)`(%rax),%xmm5	# load 7th argument, index
540	lea	.Linc(%rip),%rax
541	lea	128(%rdx,$num),%r13	# end of powers table (+size optimization)
542	shr	\$5,$num		# restore $num
543___
		# $bp names %r12 (pointed at table+128 by the code below).
		# $tp takes over the register name held in $i, because $i
		# itself is reused as the Perl loop counter in the
		# generation loops that follow.
544		$bp="%r12";
545		$STRIDE=2**5*8;		# 5 is "window size"
546		$N=$STRIDE/4;		# should match cache line size
547		$tp=$i;
548$code.=<<___;
549	movdqa	0(%rax),%xmm0		# 00000001000000010000000000000000
550	movdqa	16(%rax),%xmm1		# 00000002000000020000000200000002
551	lea	88-112(%rsp,$num),%r10	# place the mask after tp[num+1] (+ICache optimization)
552	lea	128(%rdx),$bp		# size optimization
553
554	pshufd	\$0,%xmm5,%xmm5		# broadcast index
555	movdqa	%xmm1,%xmm4
556	.byte	0x67,0x67
557	movdqa	%xmm1,%xmm2
558___
559########################################################################
560# calculate mask by comparing 0..31 to index and save result to stack
561#
562$code.=<<___;
563	paddd	%xmm0,%xmm1
564	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
565	.byte	0x67
566	movdqa	%xmm4,%xmm3
567___
# Unrolled at generation time: each pass ($i=0,4,8) stores four 16-byte
# compare results at 112(%r10)+16*$i and following.
568for($i=0;$i<$STRIDE/16-4;$i+=4) {
569$code.=<<___;
570	paddd	%xmm1,%xmm2
571	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
572	movdqa	%xmm0,`16*($i+0)+112`(%r10)
573	movdqa	%xmm4,%xmm0
574
575	paddd	%xmm2,%xmm3
576	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
577	movdqa	%xmm1,`16*($i+1)+112`(%r10)
578	movdqa	%xmm4,%xmm1
579
580	paddd	%xmm3,%xmm0
581	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
582	movdqa	%xmm2,`16*($i+2)+112`(%r10)
583	movdqa	%xmm4,%xmm2
584
585	paddd	%xmm0,%xmm1
586	pcmpeqd	%xmm5,%xmm0
587	movdqa	%xmm3,`16*($i+3)+112`(%r10)
588	movdqa	%xmm4,%xmm3
589___
590}
# $i is 12 here, left over from the loop above: the last four compare
# results are stored and at once AND-ed with the matching table chunks.
591$code.=<<___;				# last iteration can be optimized
592	paddd	%xmm1,%xmm2
593	pcmpeqd	%xmm5,%xmm1
594	movdqa	%xmm0,`16*($i+0)+112`(%r10)
595
596	paddd	%xmm2,%xmm3
597	.byte	0x67
598	pcmpeqd	%xmm5,%xmm2
599	movdqa	%xmm1,`16*($i+1)+112`(%r10)
600
601	pcmpeqd	%xmm5,%xmm3
602	movdqa	%xmm2,`16*($i+2)+112`(%r10)
603	pand	`16*($i+0)-128`($bp),%xmm0	# while it's still in register
604
605	pand	`16*($i+1)-128`($bp),%xmm1
606	pand	`16*($i+2)-128`($bp),%xmm2
607	movdqa	%xmm3,`16*($i+3)+112`(%r10)
608	pand	`16*($i+3)-128`($bp),%xmm3
609	por	%xmm2,%xmm0
610	por	%xmm3,%xmm1
611___
# Fold in the remaining twelve 16-byte table chunks (0..11), each AND-ed
# with the compare result saved on the stack for it.
612for($i=0;$i<$STRIDE/16-4;$i+=4) {
613$code.=<<___;
614	movdqa	`16*($i+0)-128`($bp),%xmm4
615	movdqa	`16*($i+1)-128`($bp),%xmm5
616	movdqa	`16*($i+2)-128`($bp),%xmm2
617	pand	`16*($i+0)+112`(%r10),%xmm4
618	movdqa	`16*($i+3)-128`($bp),%xmm3
619	pand	`16*($i+1)+112`(%r10),%xmm5
620	por	%xmm4,%xmm0
621	pand	`16*($i+2)+112`(%r10),%xmm2
622	por	%xmm5,%xmm1
623	pand	`16*($i+3)+112`(%r10),%xmm3
624	por	%xmm2,%xmm0
625	por	%xmm3,%xmm1
626___
627}
# Collapse the accumulators into $m0=bp[0], save the end-of-table pointer
# and $rp on the stack, and run the first 4-way unrolled pass (.L1st4x).
# This heredoc stops at the top of .Louter4x, where the next word of b is
# gathered.
628$code.=<<___;
629	por	%xmm1,%xmm0
630	pshufd	\$0x4e,%xmm0,%xmm1
631	por	%xmm1,%xmm0
632	lea	$STRIDE($bp),$bp
633	movq	%xmm0,$m0		# m0=bp[0]
634
635	mov	%r13,16+8(%rsp)		# save end of b[num]
636	mov	$rp, 56+8(%rsp)		# save $rp
637
638	mov	($n0),$n0		# pull n0[0] value
639	mov	($ap),%rax
640	lea	($ap,$num),$ap		# end of a[num]
641	neg	$num
642
643	mov	$n0,$m1
644	mulq	$m0			# ap[0]*bp[0]
645	mov	%rax,$A[0]
646	mov	($np),%rax
647
648	imulq	$A[0],$m1		# "tp[0]"*n0
649	lea	64+8(%rsp),$tp
650	mov	%rdx,$A[1]
651
652	mulq	$m1			# np[0]*m1
653	add	%rax,$A[0]		# discarded
654	mov	8($ap,$num),%rax
655	adc	\$0,%rdx
656	mov	%rdx,$N[1]
657
658	mulq	$m0
659	add	%rax,$A[1]
660	mov	8*1($np),%rax
661	adc	\$0,%rdx
662	mov	%rdx,$A[0]
663
664	mulq	$m1
665	add	%rax,$N[1]
666	mov	16($ap,$num),%rax
667	adc	\$0,%rdx
668	add	$A[1],$N[1]
669	lea	4*8($num),$j		# j=4
670	lea	8*4($np),$np
671	adc	\$0,%rdx
672	mov	$N[1],($tp)
673	mov	%rdx,$N[0]
674	jmp	.L1st4x
675
676.align	32
677.L1st4x:
678	mulq	$m0			# ap[j]*bp[0]
679	add	%rax,$A[0]
680	mov	-8*2($np),%rax
681	lea	32($tp),$tp
682	adc	\$0,%rdx
683	mov	%rdx,$A[1]
684
685	mulq	$m1			# np[j]*m1
686	add	%rax,$N[0]
687	mov	-8($ap,$j),%rax
688	adc	\$0,%rdx
689	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
690	adc	\$0,%rdx
691	mov	$N[0],-24($tp)		# tp[j-1]
692	mov	%rdx,$N[1]
693
694	mulq	$m0			# ap[j]*bp[0]
695	add	%rax,$A[1]
696	mov	-8*1($np),%rax
697	adc	\$0,%rdx
698	mov	%rdx,$A[0]
699
700	mulq	$m1			# np[j]*m1
701	add	%rax,$N[1]
702	mov	($ap,$j),%rax
703	adc	\$0,%rdx
704	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
705	adc	\$0,%rdx
706	mov	$N[1],-16($tp)		# tp[j-1]
707	mov	%rdx,$N[0]
708
709	mulq	$m0			# ap[j]*bp[0]
710	add	%rax,$A[0]
711	mov	8*0($np),%rax
712	adc	\$0,%rdx
713	mov	%rdx,$A[1]
714
715	mulq	$m1			# np[j]*m1
716	add	%rax,$N[0]
717	mov	8($ap,$j),%rax
718	adc	\$0,%rdx
719	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
720	adc	\$0,%rdx
721	mov	$N[0],-8($tp)		# tp[j-1]
722	mov	%rdx,$N[1]
723
724	mulq	$m0			# ap[j]*bp[0]
725	add	%rax,$A[1]
726	mov	8*1($np),%rax
727	adc	\$0,%rdx
728	mov	%rdx,$A[0]
729
730	mulq	$m1			# np[j]*m1
731	add	%rax,$N[1]
732	mov	16($ap,$j),%rax
733	adc	\$0,%rdx
734	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
735	lea	8*4($np),$np
736	adc	\$0,%rdx
737	mov	$N[1],($tp)		# tp[j-1]
738	mov	%rdx,$N[0]
739
740	add	\$32,$j			# j+=4
741	jnz	.L1st4x
742
743	mulq	$m0			# ap[j]*bp[0]
744	add	%rax,$A[0]
745	mov	-8*2($np),%rax
746	lea	32($tp),$tp
747	adc	\$0,%rdx
748	mov	%rdx,$A[1]
749
750	mulq	$m1			# np[j]*m1
751	add	%rax,$N[0]
752	mov	-8($ap),%rax
753	adc	\$0,%rdx
754	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
755	adc	\$0,%rdx
756	mov	$N[0],-24($tp)		# tp[j-1]
757	mov	%rdx,$N[1]
758
759	mulq	$m0			# ap[j]*bp[0]
760	add	%rax,$A[1]
761	mov	-8*1($np),%rax
762	adc	\$0,%rdx
763	mov	%rdx,$A[0]
764
765	mulq	$m1			# np[j]*m1
766	add	%rax,$N[1]
767	mov	($ap,$num),%rax		# ap[0]
768	adc	\$0,%rdx
769	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
770	adc	\$0,%rdx
771	mov	$N[1],-16($tp)		# tp[j-1]
772	mov	%rdx,$N[0]
773
774	lea	($np,$num),$np		# rewind $np
775
776	xor	$N[1],$N[1]
777	add	$A[0],$N[0]
778	adc	\$0,$N[1]
779	mov	$N[0],-8($tp)
780
781	jmp	.Louter4x
782
783.align	32
784.Louter4x:
785	lea	16+128($tp),%rdx	# where 256-byte mask is (+size optimization)
786	pxor	%xmm4,%xmm4
787	pxor	%xmm5,%xmm5
788___
# Gather bp[i]: all sixteen 16-byte table chunks are AND-ed with the masks
# saved on the stack and OR-ed together, fully unrolled.
789for($i=0;$i<$STRIDE/16;$i+=4) {
790$code.=<<___;
791	movdqa	`16*($i+0)-128`($bp),%xmm0
792	movdqa	`16*($i+1)-128`($bp),%xmm1
793	movdqa	`16*($i+2)-128`($bp),%xmm2
794	movdqa	`16*($i+3)-128`($bp),%xmm3
795	pand	`16*($i+0)-128`(%rdx),%xmm0
796	pand	`16*($i+1)-128`(%rdx),%xmm1
797	por	%xmm0,%xmm4
798	pand	`16*($i+2)-128`(%rdx),%xmm2
799	por	%xmm1,%xmm5
800	pand	`16*($i+3)-128`(%rdx),%xmm3
801	por	%xmm2,%xmm4
802	por	%xmm3,%xmm5
803___
804}
# Finish the gather into $m0=bp[i] and run the 4-way unrolled inner pass
# (.Linner4x); loop back to .Louter4x until $bp reaches the end-of-table
# pointer saved at 16+8(%rsp).
805$code.=<<___;
806	por	%xmm5,%xmm4
807	pshufd	\$0x4e,%xmm4,%xmm0
808	por	%xmm4,%xmm0
809	lea	$STRIDE($bp),$bp
810	movq	%xmm0,$m0		# m0=bp[i]
811
812	mov	($tp,$num),$A[0]
813	mov	$n0,$m1
814	mulq	$m0			# ap[0]*bp[i]
815	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
816	mov	($np),%rax
817	adc	\$0,%rdx
818
819	imulq	$A[0],$m1		# tp[0]*n0
820	mov	%rdx,$A[1]
821	mov	$N[1],($tp)		# store upmost overflow bit
822
823	lea	($tp,$num),$tp		# rewind $tp
824
825	mulq	$m1			# np[0]*m1
826	add	%rax,$A[0]		# "$N[0]", discarded
827	mov	8($ap,$num),%rax
828	adc	\$0,%rdx
829	mov	%rdx,$N[1]
830
831	mulq	$m0			# ap[j]*bp[i]
832	add	%rax,$A[1]
833	mov	8*1($np),%rax
834	adc	\$0,%rdx
835	add	8($tp),$A[1]		# +tp[1]
836	adc	\$0,%rdx
837	mov	%rdx,$A[0]
838
839	mulq	$m1			# np[j]*m1
840	add	%rax,$N[1]
841	mov	16($ap,$num),%rax
842	adc	\$0,%rdx
843	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
844	lea	4*8($num),$j		# j=4
845	lea	8*4($np),$np
846	adc	\$0,%rdx
847	mov	%rdx,$N[0]
848	jmp	.Linner4x
849
850.align	32
851.Linner4x:
852	mulq	$m0			# ap[j]*bp[i]
853	add	%rax,$A[0]
854	mov	-8*2($np),%rax
855	adc	\$0,%rdx
856	add	16($tp),$A[0]		# ap[j]*bp[i]+tp[j]
857	lea	32($tp),$tp
858	adc	\$0,%rdx
859	mov	%rdx,$A[1]
860
861	mulq	$m1			# np[j]*m1
862	add	%rax,$N[0]
863	mov	-8($ap,$j),%rax
864	adc	\$0,%rdx
865	add	$A[0],$N[0]
866	adc	\$0,%rdx
867	mov	$N[1],-32($tp)		# tp[j-1]
868	mov	%rdx,$N[1]
869
870	mulq	$m0			# ap[j]*bp[i]
871	add	%rax,$A[1]
872	mov	-8*1($np),%rax
873	adc	\$0,%rdx
874	add	-8($tp),$A[1]
875	adc	\$0,%rdx
876	mov	%rdx,$A[0]
877
878	mulq	$m1			# np[j]*m1
879	add	%rax,$N[1]
880	mov	($ap,$j),%rax
881	adc	\$0,%rdx
882	add	$A[1],$N[1]
883	adc	\$0,%rdx
884	mov	$N[0],-24($tp)		# tp[j-1]
885	mov	%rdx,$N[0]
886
887	mulq	$m0			# ap[j]*bp[i]
888	add	%rax,$A[0]
889	mov	8*0($np),%rax
890	adc	\$0,%rdx
891	add	($tp),$A[0]		# ap[j]*bp[i]+tp[j]
892	adc	\$0,%rdx
893	mov	%rdx,$A[1]
894
895	mulq	$m1			# np[j]*m1
896	add	%rax,$N[0]
897	mov	8($ap,$j),%rax
898	adc	\$0,%rdx
899	add	$A[0],$N[0]
900	adc	\$0,%rdx
901	mov	$N[1],-16($tp)		# tp[j-1]
902	mov	%rdx,$N[1]
903
904	mulq	$m0			# ap[j]*bp[i]
905	add	%rax,$A[1]
906	mov	8*1($np),%rax
907	adc	\$0,%rdx
908	add	8($tp),$A[1]
909	adc	\$0,%rdx
910	mov	%rdx,$A[0]
911
912	mulq	$m1			# np[j]*m1
913	add	%rax,$N[1]
914	mov	16($ap,$j),%rax
915	adc	\$0,%rdx
916	add	$A[1],$N[1]
917	lea	8*4($np),$np
918	adc	\$0,%rdx
919	mov	$N[0],-8($tp)		# tp[j-1]
920	mov	%rdx,$N[0]
921
922	add	\$32,$j			# j+=4
923	jnz	.Linner4x
924
925	mulq	$m0			# ap[j]*bp[i]
926	add	%rax,$A[0]
927	mov	-8*2($np),%rax
928	adc	\$0,%rdx
929	add	16($tp),$A[0]		# ap[j]*bp[i]+tp[j]
930	lea	32($tp),$tp
931	adc	\$0,%rdx
932	mov	%rdx,$A[1]
933
934	mulq	$m1			# np[j]*m1
935	add	%rax,$N[0]
936	mov	-8($ap),%rax
937	adc	\$0,%rdx
938	add	$A[0],$N[0]
939	adc	\$0,%rdx
940	mov	$N[1],-32($tp)		# tp[j-1]
941	mov	%rdx,$N[1]
942
943	mulq	$m0			# ap[j]*bp[i]
944	add	%rax,$A[1]
945	mov	$m1,%rax
946	mov	-8*1($np),$m1
947	adc	\$0,%rdx
948	add	-8($tp),$A[1]
949	adc	\$0,%rdx
950	mov	%rdx,$A[0]
951
952	mulq	$m1			# np[j]*m1
953	add	%rax,$N[1]
954	mov	($ap,$num),%rax		# ap[0]
955	adc	\$0,%rdx
956	add	$A[1],$N[1]
957	adc	\$0,%rdx
958	mov	$N[0],-24($tp)		# tp[j-1]
959	mov	%rdx,$N[0]
960
961	mov	$N[1],-16($tp)		# tp[j-1]
962	lea	($np,$num),$np		# rewind $np
963
964	xor	$N[1],$N[1]
965	add	$A[0],$N[0]
966	adc	\$0,$N[1]
967	add	($tp),$N[0]		# pull upmost overflow bit
968	adc	\$0,$N[1]		# upmost overflow bit
969	mov	$N[0],-8($tp)
970
971	cmp	16+8(%rsp),$bp
972	jb	.Louter4x
973___
# Tail of mul4x_internal.  The enabled branch sets up registers and jumps
# to .Lsqr4x_sub_entry, which is defined outside this part of the file.
# The else branch holds an alternative subtraction loop (.Lsub4x) that is
# never emitted because the condition is the constant 1.
974if (1) {
975$code.=<<___;
976	xor	%rax,%rax
977	sub	$N[0],$m1		# compare top-most words
978	adc	$j,$j			# $j is zero
979	or	$j,$N[1]
980	sub	$N[1],%rax		# %rax=-$N[1]
981	lea	($tp,$num),%rbx		# tptr in .sqr4x_sub
982	mov	($np),%r12
983	lea	($np),%rbp		# nptr in .sqr4x_sub
984	mov	%r9,%rcx
985	sar	\$3+2,%rcx
986	mov	56+8(%rsp),%rdi		# rptr in .sqr4x_sub
987	dec	%r12			# so that after 'not' we get -n[0]
988	xor	%r10,%r10
989	mov	8*1(%rbp),%r13
990	mov	8*2(%rbp),%r14
991	mov	8*3(%rbp),%r15
992	jmp	.Lsqr4x_sub_entry
993___
994} else {
995my @ri=("%rax",$bp,$m0,$m1);
996my $rp="%rdx";
997$code.=<<___
998	xor	\$1,$N[1]
999	lea	($tp,$num),$tp		# rewind $tp
1000	sar	\$5,$num		# cf=0
1001	lea	($np,$N[1],8),$np
1002	mov	56+8(%rsp),$rp		# restore $rp
1003	jmp	.Lsub4x
1004
1005.align	32
1006.Lsub4x:
1007	.byte	0x66
1008	mov	8*0($tp),@ri[0]
1009	mov	8*1($tp),@ri[1]
1010	.byte	0x66
1011	sbb	16*0($np),@ri[0]
1012	mov	8*2($tp),@ri[2]
1013	sbb	16*1($np),@ri[1]
1014	mov	3*8($tp),@ri[3]
1015	lea	4*8($tp),$tp
1016	sbb	16*2($np),@ri[2]
1017	mov	@ri[0],8*0($rp)
1018	sbb	16*3($np),@ri[3]
1019	lea	16*4($np),$np
1020	mov	@ri[1],8*1($rp)
1021	mov	@ri[2],8*2($rp)
1022	mov	@ri[3],8*3($rp)
1023	lea	8*4($rp),$rp
1024
1025	inc	$num
1026	jnz	.Lsub4x
1027
1028	ret
1029___
1030}
1031$code.=<<___;
1032.size	mul4x_internal,.-mul4x_internal
1033___
1034}}}
1035{{{
1036######################################################################
1037# void bn_power5(
1038my $rptr="%rdi";	# BN_ULONG *rptr,
1039my $aptr="%rsi";	# const BN_ULONG *aptr,
1040my $bptr="%rdx";	# const void *table,
1041my $nptr="%rcx";	# const BN_ULONG *nptr,
1042my $n0  ="%r8";		# const BN_ULONG *n0);
1043my $num ="%r9";		# int num, has to be divisible by 8
1044			# int pwr
1045
1046my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
1047my @A0=("%r10","%r11");
1048my @A1=("%r12","%r13");
1049my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
1050
1051$code.=<<___;
1052.globl	bn_power5
1053.type	bn_power5,\@function,6
1054.align	32
1055bn_power5:
1056	mov	%rsp,%rax
1057___
1058$code.=<<___ if ($addx);
1059	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
1060	and	\$0x80108,%r11d
1061	cmp	\$0x80108,%r11d		# check for AD*X+BMI2+BMI1
1062	je	.Lpowerx5_enter
1063___
1064$code.=<<___;
1065	push	%rbx
1066	push	%rbp
1067	push	%r12
1068	push	%r13
1069	push	%r14
1070	push	%r15
1071.Lpower5_prologue:
1072
1073	shl	\$3,${num}d		# convert $num to bytes
1074	lea	($num,$num,2),%r10d	# 3*$num
1075	neg	$num
1076	mov	($n0),$n0		# *n0
1077
1078	##############################################################
1079	# Ensure that stack frame doesn't alias with $rptr+3*$num
1080	# modulo 4096, which covers ret[num], am[num] and n[num]
1081	# (see bn_exp.c). This is done to allow memory disambiguation
1082	# logic do its magic. [Extra 256 bytes is for power mask
1083	# calculated from 7th argument, the index.]
1084	#
1085	lea	-320(%rsp,$num,2),%r11
1086	mov	%rsp,%rbp
1087	sub	$rptr,%r11
1088	and	\$4095,%r11
1089	cmp	%r11,%r10
1090	jb	.Lpwr_sp_alt
1091	sub	%r11,%rbp		# align with $aptr
1092	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*num*8+256)
1093	jmp	.Lpwr_sp_done
1094
1095.align	32
1096.Lpwr_sp_alt:
1097	lea	4096-320(,$num,2),%r10
1098	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*num*8+256)
1099	sub	%r10,%r11
1100	mov	\$0,%r10
1101	cmovc	%r10,%r11
1102	sub	%r11,%rbp
1103.Lpwr_sp_done:
1104	and	\$-64,%rbp
1105	mov	%rsp,%r11
1106	sub	%rbp,%r11
1107	and	\$-4096,%r11
1108	lea	(%rbp,%r11),%rsp
1109	mov	(%rsp),%r10
1110	cmp	%rbp,%rsp
1111	ja	.Lpwr_page_walk
1112	jmp	.Lpwr_page_walk_done
1113
1114.Lpwr_page_walk:
1115	lea	-4096(%rsp),%rsp
1116	mov	(%rsp),%r10
1117	cmp	%rbp,%rsp
1118	ja	.Lpwr_page_walk
1119.Lpwr_page_walk_done:
1120
1121	mov	$num,%r10
1122	neg	$num
1123
1124	##############################################################
1125	# Stack layout
1126	#
1127	# +0	saved $num, used in reduction section
1128	# +8	&t[2*$num], used in reduction section
1129	# +32	saved *n0
1130	# +40	saved %rsp
1131	# +48	t[2*$num]
1132	#
1133	mov	$n0,  32(%rsp)
1134	mov	%rax, 40(%rsp)		# save original %rsp
1135.Lpower5_body:
1136	movq	$rptr,%xmm1		# save $rptr, used in sqr8x
1137	movq	$nptr,%xmm2		# save $nptr
1138	movq	%r10, %xmm3		# -$num, used in sqr8x
1139	movq	$bptr,%xmm4
1140
1141	call	__bn_sqr8x_internal
1142	call	__bn_post4x_internal
1143	call	__bn_sqr8x_internal
1144	call	__bn_post4x_internal
1145	call	__bn_sqr8x_internal
1146	call	__bn_post4x_internal
1147	call	__bn_sqr8x_internal
1148	call	__bn_post4x_internal
1149	call	__bn_sqr8x_internal
1150	call	__bn_post4x_internal
1151
1152	movq	%xmm2,$nptr
1153	movq	%xmm4,$bptr
1154	mov	$aptr,$rptr
1155	mov	40(%rsp),%rax
1156	lea	32(%rsp),$n0
1157
1158	call	mul4x_internal
1159
1160	mov	40(%rsp),%rsi		# restore %rsp
1161	mov	\$1,%rax
1162	mov	-48(%rsi),%r15
1163	mov	-40(%rsi),%r14
1164	mov	-32(%rsi),%r13
1165	mov	-24(%rsi),%r12
1166	mov	-16(%rsi),%rbp
1167	mov	-8(%rsi),%rbx
1168	lea	(%rsi),%rsp
1169.Lpower5_epilogue:
1170	ret
1171.size	bn_power5,.-bn_power5
1172
1173.globl	bn_sqr8x_internal
1174.hidden	bn_sqr8x_internal
1175.type	bn_sqr8x_internal,\@abi-omnipotent
1176.align	32
1177bn_sqr8x_internal:
1178__bn_sqr8x_internal:
1179	##############################################################
1180	# Squaring part:
1181	#
1182	# a) multiply-n-add everything but a[i]*a[i];
1183	# b) shift result of a) by 1 to the left and accumulate
1184	#    a[i]*a[i] products;
1185	#
1186	##############################################################
1187	#                                                     a[1]a[0]
1188	#                                                 a[2]a[0]
1189	#                                             a[3]a[0]
1190	#                                             a[2]a[1]
1191	#                                         a[4]a[0]
1192	#                                         a[3]a[1]
1193	#                                     a[5]a[0]
1194	#                                     a[4]a[1]
1195	#                                     a[3]a[2]
1196	#                                 a[6]a[0]
1197	#                                 a[5]a[1]
1198	#                                 a[4]a[2]
1199	#                             a[7]a[0]
1200	#                             a[6]a[1]
1201	#                             a[5]a[2]
1202	#                             a[4]a[3]
1203	#                         a[7]a[1]
1204	#                         a[6]a[2]
1205	#                         a[5]a[3]
1206	#                     a[7]a[2]
1207	#                     a[6]a[3]
1208	#                     a[5]a[4]
1209	#                 a[7]a[3]
1210	#                 a[6]a[4]
1211	#             a[7]a[4]
1212	#             a[6]a[5]
1213	#         a[7]a[5]
1214	#     a[7]a[6]
1215	#                                                     a[1]a[0]
1216	#                                                 a[2]a[0]
1217	#                                             a[3]a[0]
1218	#                                         a[4]a[0]
1219	#                                     a[5]a[0]
1220	#                                 a[6]a[0]
1221	#                             a[7]a[0]
1222	#                                             a[2]a[1]
1223	#                                         a[3]a[1]
1224	#                                     a[4]a[1]
1225	#                                 a[5]a[1]
1226	#                             a[6]a[1]
1227	#                         a[7]a[1]
1228	#                                     a[3]a[2]
1229	#                                 a[4]a[2]
1230	#                             a[5]a[2]
1231	#                         a[6]a[2]
1232	#                     a[7]a[2]
1233	#                             a[4]a[3]
1234	#                         a[5]a[3]
1235	#                     a[6]a[3]
1236	#                 a[7]a[3]
1237	#                     a[5]a[4]
1238	#                 a[6]a[4]
1239	#             a[7]a[4]
1240	#             a[6]a[5]
1241	#         a[7]a[5]
1242	#     a[7]a[6]
1243	#                                                         a[0]a[0]
1244	#                                                 a[1]a[1]
1245	#                                         a[2]a[2]
1246	#                                 a[3]a[3]
1247	#                         a[4]a[4]
1248	#                 a[5]a[5]
1249	#         a[6]a[6]
1250	# a[7]a[7]
1251
1252	lea	32(%r10),$i		# $i=-($num-32)
1253	lea	($aptr,$num),$aptr	# end of a[] buffer, ($aptr,$i)=&ap[2]
1254
1255	mov	$num,$j			# $j=$num
1256
1257					# comments apply to $num==8 case
1258	mov	-32($aptr,$i),$a0	# a[0]
1259	lea	48+8(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
1260	mov	-24($aptr,$i),%rax	# a[1]
1261	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
1262	mov	-16($aptr,$i),$ai	# a[2]
1263	mov	%rax,$a1
1264
1265	mul	$a0			# a[1]*a[0]
1266	mov	%rax,$A0[0]		# a[1]*a[0]
1267	 mov	$ai,%rax		# a[2]
1268	mov	%rdx,$A0[1]
1269	mov	$A0[0],-24($tptr,$i)	# t[1]
1270
1271	mul	$a0			# a[2]*a[0]
1272	add	%rax,$A0[1]
1273	 mov	$ai,%rax
1274	adc	\$0,%rdx
1275	mov	$A0[1],-16($tptr,$i)	# t[2]
1276	mov	%rdx,$A0[0]
1277
1278
1279	 mov	-8($aptr,$i),$ai	# a[3]
1280	mul	$a1			# a[2]*a[1]
1281	mov	%rax,$A1[0]		# a[2]*a[1]+t[3]
1282	 mov	$ai,%rax
1283	mov	%rdx,$A1[1]
1284
1285	 lea	($i),$j
1286	mul	$a0			# a[3]*a[0]
1287	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
1288	 mov	$ai,%rax
1289	mov	%rdx,$A0[1]
1290	adc	\$0,$A0[1]
1291	add	$A1[0],$A0[0]
1292	adc	\$0,$A0[1]
1293	mov	$A0[0],-8($tptr,$j)	# t[3]
1294	jmp	.Lsqr4x_1st
1295
1296.align	32
1297.Lsqr4x_1st:
1298	 mov	($aptr,$j),$ai		# a[4]
1299	mul	$a1			# a[3]*a[1]
1300	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
1301	 mov	$ai,%rax
1302	mov	%rdx,$A1[0]
1303	adc	\$0,$A1[0]
1304
1305	mul	$a0			# a[4]*a[0]
1306	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
1307	 mov	$ai,%rax		# a[3]
1308	 mov	8($aptr,$j),$ai		# a[5]
1309	mov	%rdx,$A0[0]
1310	adc	\$0,$A0[0]
1311	add	$A1[1],$A0[1]
1312	adc	\$0,$A0[0]
1313
1314
1315	mul	$a1			# a[4]*a[3]
1316	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
1317	 mov	$ai,%rax
1318	 mov	$A0[1],($tptr,$j)	# t[4]
1319	mov	%rdx,$A1[1]
1320	adc	\$0,$A1[1]
1321
1322	mul	$a0			# a[5]*a[2]
1323	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
1324	 mov	$ai,%rax
1325	 mov	16($aptr,$j),$ai	# a[6]
1326	mov	%rdx,$A0[1]
1327	adc	\$0,$A0[1]
1328	add	$A1[0],$A0[0]
1329	adc	\$0,$A0[1]
1330
1331	mul	$a1			# a[5]*a[3]
1332	add	%rax,$A1[1]		# a[5]*a[3]+t[6]
1333	 mov	$ai,%rax
1334	 mov	$A0[0],8($tptr,$j)	# t[5]
1335	mov	%rdx,$A1[0]
1336	adc	\$0,$A1[0]
1337
1338	mul	$a0			# a[6]*a[2]
1339	add	%rax,$A0[1]		# a[6]*a[2]+a[5]*a[3]+t[6]
1340	 mov	$ai,%rax		# a[3]
1341	 mov	24($aptr,$j),$ai	# a[7]
1342	mov	%rdx,$A0[0]
1343	adc	\$0,$A0[0]
1344	add	$A1[1],$A0[1]
1345	adc	\$0,$A0[0]
1346
1347
1348	mul	$a1			# a[6]*a[5]
1349	add	%rax,$A1[0]		# a[6]*a[5]+t[7]
1350	 mov	$ai,%rax
1351	 mov	$A0[1],16($tptr,$j)	# t[6]
1352	mov	%rdx,$A1[1]
1353	adc	\$0,$A1[1]
1354	 lea	32($j),$j
1355
1356	mul	$a0			# a[7]*a[4]
1357	add	%rax,$A0[0]		# a[7]*a[4]+a[6]*a[5]+t[6]
1358	 mov	$ai,%rax
1359	mov	%rdx,$A0[1]
1360	adc	\$0,$A0[1]
1361	add	$A1[0],$A0[0]
1362	adc	\$0,$A0[1]
1363	mov	$A0[0],-8($tptr,$j)	# t[7]
1364
1365	cmp	\$0,$j
1366	jne	.Lsqr4x_1st
1367
1368	mul	$a1			# a[7]*a[5]
1369	add	%rax,$A1[1]
1370	lea	16($i),$i
1371	adc	\$0,%rdx
1372	add	$A0[1],$A1[1]
1373	adc	\$0,%rdx
1374
1375	mov	$A1[1],($tptr)		# t[8]
1376	mov	%rdx,$A1[0]
1377	mov	%rdx,8($tptr)		# t[9]
1378	jmp	.Lsqr4x_outer
1379
1380.align	32
1381.Lsqr4x_outer:				# comments apply to $num==6 case
1382	mov	-32($aptr,$i),$a0	# a[0]
1383	lea	48+8(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
1384	mov	-24($aptr,$i),%rax	# a[1]
1385	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
1386	mov	-16($aptr,$i),$ai	# a[2]
1387	mov	%rax,$a1
1388
1389	mul	$a0			# a[1]*a[0]
1390	mov	-24($tptr,$i),$A0[0]	# t[1]
1391	add	%rax,$A0[0]		# a[1]*a[0]+t[1]
1392	 mov	$ai,%rax		# a[2]
1393	adc	\$0,%rdx
1394	mov	$A0[0],-24($tptr,$i)	# t[1]
1395	mov	%rdx,$A0[1]
1396
1397	mul	$a0			# a[2]*a[0]
1398	add	%rax,$A0[1]
1399	 mov	$ai,%rax
1400	adc	\$0,%rdx
1401	add	-16($tptr,$i),$A0[1]	# a[2]*a[0]+t[2]
1402	mov	%rdx,$A0[0]
1403	adc	\$0,$A0[0]
1404	mov	$A0[1],-16($tptr,$i)	# t[2]
1405
1406	xor	$A1[0],$A1[0]
1407
1408	 mov	-8($aptr,$i),$ai	# a[3]
1409	mul	$a1			# a[2]*a[1]
1410	add	%rax,$A1[0]		# a[2]*a[1]+t[3]
1411	 mov	$ai,%rax
1412	adc	\$0,%rdx
1413	add	-8($tptr,$i),$A1[0]
1414	mov	%rdx,$A1[1]
1415	adc	\$0,$A1[1]
1416
1417	mul	$a0			# a[3]*a[0]
1418	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
1419	 mov	$ai,%rax
1420	adc	\$0,%rdx
1421	add	$A1[0],$A0[0]
1422	mov	%rdx,$A0[1]
1423	adc	\$0,$A0[1]
1424	mov	$A0[0],-8($tptr,$i)	# t[3]
1425
1426	lea	($i),$j
1427	jmp	.Lsqr4x_inner
1428
1429.align	32
1430.Lsqr4x_inner:
1431	 mov	($aptr,$j),$ai		# a[4]
1432	mul	$a1			# a[3]*a[1]
1433	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
1434	 mov	$ai,%rax
1435	mov	%rdx,$A1[0]
1436	adc	\$0,$A1[0]
1437	add	($tptr,$j),$A1[1]
1438	adc	\$0,$A1[0]
1439
1440	.byte	0x67
1441	mul	$a0			# a[4]*a[0]
1442	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
1443	 mov	$ai,%rax		# a[3]
1444	 mov	8($aptr,$j),$ai		# a[5]
1445	mov	%rdx,$A0[0]
1446	adc	\$0,$A0[0]
1447	add	$A1[1],$A0[1]
1448	adc	\$0,$A0[0]
1449
1450	mul	$a1			# a[4]*a[3]
1451	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
1452	mov	$A0[1],($tptr,$j)	# t[4]
1453	 mov	$ai,%rax
1454	mov	%rdx,$A1[1]
1455	adc	\$0,$A1[1]
1456	add	8($tptr,$j),$A1[0]
1457	lea	16($j),$j		# j++
1458	adc	\$0,$A1[1]
1459
1460	mul	$a0			# a[5]*a[2]
1461	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
1462	 mov	$ai,%rax
1463	adc	\$0,%rdx
1464	add	$A1[0],$A0[0]
1465	mov	%rdx,$A0[1]
1466	adc	\$0,$A0[1]
1467	mov	$A0[0],-8($tptr,$j)	# t[5], "preloaded t[1]" below
1468
1469	cmp	\$0,$j
1470	jne	.Lsqr4x_inner
1471
1472	.byte	0x67
1473	mul	$a1			# a[5]*a[3]
1474	add	%rax,$A1[1]
1475	adc	\$0,%rdx
1476	add	$A0[1],$A1[1]
1477	adc	\$0,%rdx
1478
1479	mov	$A1[1],($tptr)		# t[6], "preloaded t[2]" below
1480	mov	%rdx,$A1[0]
1481	mov	%rdx,8($tptr)		# t[7], "preloaded t[3]" below
1482
1483	add	\$16,$i
1484	jnz	.Lsqr4x_outer
1485
1486					# comments apply to $num==4 case
1487	mov	-32($aptr),$a0		# a[0]
1488	lea	48+8(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
1489	mov	-24($aptr),%rax		# a[1]
1490	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
1491	mov	-16($aptr),$ai		# a[2]
1492	mov	%rax,$a1
1493
1494	mul	$a0			# a[1]*a[0]
1495	add	%rax,$A0[0]		# a[1]*a[0]+t[1], preloaded t[1]
1496	 mov	$ai,%rax		# a[2]
1497	mov	%rdx,$A0[1]
1498	adc	\$0,$A0[1]
1499
1500	mul	$a0			# a[2]*a[0]
1501	add	%rax,$A0[1]
1502	 mov	$ai,%rax
1503	 mov	$A0[0],-24($tptr)	# t[1]
1504	mov	%rdx,$A0[0]
1505	adc	\$0,$A0[0]
1506	add	$A1[1],$A0[1]		# a[2]*a[0]+t[2], preloaded t[2]
1507	 mov	-8($aptr),$ai		# a[3]
1508	adc	\$0,$A0[0]
1509
1510	mul	$a1			# a[2]*a[1]
1511	add	%rax,$A1[0]		# a[2]*a[1]+t[3], preloaded t[3]
1512	 mov	$ai,%rax
1513	 mov	$A0[1],-16($tptr)	# t[2]
1514	mov	%rdx,$A1[1]
1515	adc	\$0,$A1[1]
1516
1517	mul	$a0			# a[3]*a[0]
1518	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
1519	 mov	$ai,%rax
1520	mov	%rdx,$A0[1]
1521	adc	\$0,$A0[1]
1522	add	$A1[0],$A0[0]
1523	adc	\$0,$A0[1]
1524	mov	$A0[0],-8($tptr)	# t[3]
1525
1526	mul	$a1			# a[3]*a[1]
1527	add	%rax,$A1[1]
1528	 mov	-16($aptr),%rax		# a[2]
1529	adc	\$0,%rdx
1530	add	$A0[1],$A1[1]
1531	adc	\$0,%rdx
1532
1533	mov	$A1[1],($tptr)		# t[4]
1534	mov	%rdx,$A1[0]
1535	mov	%rdx,8($tptr)		# t[5]
1536
1537	mul	$ai			# a[2]*a[3]
1538___
1539{
# Final stage of the squaring half of bn_sqr8x_internal: "shift and add".
# According to the diagram and in-line comments that precede this scope,
# t[] holds the accumulated off-diagonal products a[i]*a[j] at this point.
# The code emitted below
#   1. folds in the last cross product (result of the mul issued on the
#      line just before this scope) and stores the top words of t[],
#      the very top one being written from the freshly zeroed $carry;
#   2. walks t[] from the bottom, two words per a[i], replacing each
#      pair with (t[2*i+1]:t[2*i])<<1 + a[i]*a[i].
#
# Names introduced here:
#   $shift (alias of $a0) - bit shifted out of the previous t[] pair,
#                           fed into the next pair through lea
#   $carry (alias of $a1) - carry flag parked as 0/-1 between steps;
#                           "neg $carry" reloads CF and
#                           "sbb $carry,$carry" saves it again
#   @S                    - four scratch registers (@A1, $ai, $n0) that
#                           receive the doubled words before the add
#
# The lea instructions for the odd words use $j as base register, i.e.
# they rely on $j being zero.  The loops above are left only after
# "cmp 0,$j / jne" falls through -- NOTE(review): confirm that no other
# path reaches this point with $j non-zero.
#
# .Lsqr4x_shift_n_add consumes four a[] words (eight t[] words, $tptr
# advances by 64, $i by 32) per iteration; two a[] words are peeled off
# before the loop and the last two are handled after it.
1540my ($shift,$carry)=($a0,$a1);
1541my @S=(@A1,$ai,$n0);
1542$code.=<<___;
1543	 add	\$16,$i
1544	 xor	$shift,$shift
1545	 sub	$num,$i			# $i=16-$num
1546	 xor	$carry,$carry
1547
1548	add	$A1[0],%rax		# t[5]
1549	adc	\$0,%rdx
1550	mov	%rax,8($tptr)		# t[5]
1551	mov	%rdx,16($tptr)		# t[6]
1552	mov	$carry,24($tptr)	# t[7]
1553
1554	 mov	-16($aptr,$i),%rax	# a[0]
1555	lea	48+8(%rsp),$tptr
1556	 xor	$A0[0],$A0[0]		# t[0]
1557	 mov	8($tptr),$A0[1]		# t[1]
1558
1559	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1560	shr	\$63,$A0[0]
1561	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1562	shr	\$63,$A0[1]
1563	or	$A0[0],$S[1]		# | t[2*i]>>63
1564	 mov	16($tptr),$A0[0]	# t[2*i+2]	# prefetch
1565	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1566	mul	%rax			# a[i]*a[i]
1567	neg	$carry			# mov $carry,cf
1568	 mov	24($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1569	adc	%rax,$S[0]
1570	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
1571	mov	$S[0],($tptr)
1572	adc	%rdx,$S[1]
1573
1574	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
1575	 mov	$S[1],8($tptr)
1576	 sbb	$carry,$carry		# mov cf,$carry
1577	shr	\$63,$A0[0]
1578	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1579	shr	\$63,$A0[1]
1580	or	$A0[0],$S[3]		# | t[2*i]>>63
1581	 mov	32($tptr),$A0[0]	# t[2*i+2]	# prefetch
1582	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1583	mul	%rax			# a[i]*a[i]
1584	neg	$carry			# mov $carry,cf
1585	 mov	40($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1586	adc	%rax,$S[2]
1587	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
1588	mov	$S[2],16($tptr)
1589	adc	%rdx,$S[3]
1590	lea	16($i),$i
1591	mov	$S[3],24($tptr)
1592	sbb	$carry,$carry		# mov cf,$carry
1593	lea	64($tptr),$tptr
1594	jmp	.Lsqr4x_shift_n_add
1595
1596.align	32
1597.Lsqr4x_shift_n_add:
1598	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1599	shr	\$63,$A0[0]
1600	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1601	shr	\$63,$A0[1]
1602	or	$A0[0],$S[1]		# | t[2*i]>>63
1603	 mov	-16($tptr),$A0[0]	# t[2*i+2]	# prefetch
1604	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1605	mul	%rax			# a[i]*a[i]
1606	neg	$carry			# mov $carry,cf
1607	 mov	-8($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1608	adc	%rax,$S[0]
1609	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
1610	mov	$S[0],-32($tptr)
1611	adc	%rdx,$S[1]
1612
1613	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
1614	 mov	$S[1],-24($tptr)
1615	 sbb	$carry,$carry		# mov cf,$carry
1616	shr	\$63,$A0[0]
1617	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1618	shr	\$63,$A0[1]
1619	or	$A0[0],$S[3]		# | t[2*i]>>63
1620	 mov	0($tptr),$A0[0]		# t[2*i+2]	# prefetch
1621	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1622	mul	%rax			# a[i]*a[i]
1623	neg	$carry			# mov $carry,cf
1624	 mov	8($tptr),$A0[1]		# t[2*i+2+1]	# prefetch
1625	adc	%rax,$S[2]
1626	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
1627	mov	$S[2],-16($tptr)
1628	adc	%rdx,$S[3]
1629
1630	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1631	 mov	$S[3],-8($tptr)
1632	 sbb	$carry,$carry		# mov cf,$carry
1633	shr	\$63,$A0[0]
1634	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1635	shr	\$63,$A0[1]
1636	or	$A0[0],$S[1]		# | t[2*i]>>63
1637	 mov	16($tptr),$A0[0]	# t[2*i+2]	# prefetch
1638	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1639	mul	%rax			# a[i]*a[i]
1640	neg	$carry			# mov $carry,cf
1641	 mov	24($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1642	adc	%rax,$S[0]
1643	 mov	8($aptr,$i),%rax	# a[i+1]	# prefetch
1644	mov	$S[0],0($tptr)
1645	adc	%rdx,$S[1]
1646
1647	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
1648	 mov	$S[1],8($tptr)
1649	 sbb	$carry,$carry		# mov cf,$carry
1650	shr	\$63,$A0[0]
1651	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1652	shr	\$63,$A0[1]
1653	or	$A0[0],$S[3]		# | t[2*i]>>63
1654	 mov	32($tptr),$A0[0]	# t[2*i+2]	# prefetch
1655	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1656	mul	%rax			# a[i]*a[i]
1657	neg	$carry			# mov $carry,cf
1658	 mov	40($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1659	adc	%rax,$S[2]
1660	 mov	16($aptr,$i),%rax	# a[i+1]	# prefetch
1661	mov	$S[2],16($tptr)
1662	adc	%rdx,$S[3]
1663	mov	$S[3],24($tptr)
1664	sbb	$carry,$carry		# mov cf,$carry
1665	lea	64($tptr),$tptr
1666	add	\$32,$i
1667	jnz	.Lsqr4x_shift_n_add
1668
1669	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
1670	.byte	0x67
1671	shr	\$63,$A0[0]
1672	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
1673	shr	\$63,$A0[1]
1674	or	$A0[0],$S[1]		# | t[2*i]>>63
1675	 mov	-16($tptr),$A0[0]	# t[2*i+2]	# prefetch
1676	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
1677	mul	%rax			# a[i]*a[i]
1678	neg	$carry			# mov $carry,cf
1679	 mov	-8($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
1680	adc	%rax,$S[0]
1681	 mov	-8($aptr),%rax		# a[i+1]	# prefetch
1682	mov	$S[0],-32($tptr)
1683	adc	%rdx,$S[1]
1684
1685	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1|shift
1686	 mov	$S[1],-24($tptr)
1687	 sbb	$carry,$carry		# mov cf,$carry
1688	shr	\$63,$A0[0]
1689	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
1690	shr	\$63,$A0[1]
1691	or	$A0[0],$S[3]		# | t[2*i]>>63
1692	mul	%rax			# a[i]*a[i]
1693	neg	$carry			# mov $carry,cf
1694	adc	%rax,$S[2]
1695	adc	%rdx,$S[3]
1696	mov	$S[2],-16($tptr)
1697	mov	$S[3],-8($tptr)
1698___
1699}
1700######################################################################
1701# Montgomery reduction part, "word-by-word" algorithm.
1702#
1703# This new path is inspired by multiple submissions from Intel, by
1704# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
1705# Vinodh Gopal...
1706{
# Word-by-word Montgomery reduction of the double-width value in t[],
# processed in windows of eight words.  This scope both finishes
# bn_sqr8x_internal (note the .size directive at the end) and provides
# the separate entry label __bn_sqr8x_reduction.
#
# Register roles inside this scope:
#   $nptr  (%rbp) - current position in n[]; loaded from %xmm2 on the
#                   fall-through path and restored from it per window
#   $tptr  (%rdi) - current position in t[]
#   $carry (%rsi) - scratch: first borrowed for n0 (modulo-scheduled
#                   imulq), later holds the saved carry as 0/-1
#   $m0    (%rbx) - current multiplier, n0 times the low word
#
# Frame slots are addressed with an extra +8 compared with the "Stack
# layout" tables used by the callers -- NOTE(review): presumably this
# accounts for the return address pushed by call; confirm.
#   0+8(%rsp)   end of n[]
#   8+8(%rsp)   end of t[]; the top-most carry is also stored there
#   32+8(%rsp)  n0
#   48+8(%rsp)+ start of t[]; .L8x_reduce puts the eight multipliers of
#               the current window aside here and .L8x_tail pulls them
#               back for the remaining words of n[]
#
# Flow per window: .L8x_reduce (8 iterations, first eight words of n[]),
# then .L8x_tail over the rest of n[] if any, then the carry from the
# previous window is added and the top eight words are stored.
# On return %rax holds the top-most carry (0 or 1) and $nptr/$num have
# been reloaded from %xmm2/%xmm3.
1707my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx");
1708
1709$code.=<<___;
1710	movq	%xmm2,$nptr
1711__bn_sqr8x_reduction:
1712	xor	%rax,%rax
1713	lea	($nptr,$num),%rcx	# end of n[]
1714	lea	48+8(%rsp,$num,2),%rdx	# end of t[] buffer
1715	mov	%rcx,0+8(%rsp)
1716	lea	48+8(%rsp,$num),$tptr	# end of initial t[] window
1717	mov	%rdx,8+8(%rsp)
1718	neg	$num
1719	jmp	.L8x_reduction_loop
1720
1721.align	32
1722.L8x_reduction_loop:
1723	lea	($tptr,$num),$tptr	# start of current t[] window
1724	.byte	0x66
1725	mov	8*0($tptr),$m0
1726	mov	8*1($tptr),%r9
1727	mov	8*2($tptr),%r10
1728	mov	8*3($tptr),%r11
1729	mov	8*4($tptr),%r12
1730	mov	8*5($tptr),%r13
1731	mov	8*6($tptr),%r14
1732	mov	8*7($tptr),%r15
1733	mov	%rax,(%rdx)		# store top-most carry bit
1734	lea	8*8($tptr),$tptr
1735
1736	.byte	0x67
1737	mov	$m0,%r8
1738	imulq	32+8(%rsp),$m0		# n0*a[0]
1739	mov	8*0($nptr),%rax		# n[0]
1740	mov	\$8,%ecx
1741	jmp	.L8x_reduce
1742
1743.align	32
1744.L8x_reduce:
1745	mulq	$m0
1746	 mov	8*1($nptr),%rax		# n[1]
1747	neg	%r8
1748	mov	%rdx,%r8
1749	adc	\$0,%r8
1750
1751	mulq	$m0
1752	add	%rax,%r9
1753	 mov	8*2($nptr),%rax
1754	adc	\$0,%rdx
1755	add	%r9,%r8
1756	 mov	$m0,48-8+8(%rsp,%rcx,8)	# put aside n0*a[i]
1757	mov	%rdx,%r9
1758	adc	\$0,%r9
1759
1760	mulq	$m0
1761	add	%rax,%r10
1762	 mov	8*3($nptr),%rax
1763	adc	\$0,%rdx
1764	add	%r10,%r9
1765	 mov	32+8(%rsp),$carry	# pull n0, borrow $carry
1766	mov	%rdx,%r10
1767	adc	\$0,%r10
1768
1769	mulq	$m0
1770	add	%rax,%r11
1771	 mov	8*4($nptr),%rax
1772	adc	\$0,%rdx
1773	 imulq	%r8,$carry		# modulo-scheduled
1774	add	%r11,%r10
1775	mov	%rdx,%r11
1776	adc	\$0,%r11
1777
1778	mulq	$m0
1779	add	%rax,%r12
1780	 mov	8*5($nptr),%rax
1781	adc	\$0,%rdx
1782	add	%r12,%r11
1783	mov	%rdx,%r12
1784	adc	\$0,%r12
1785
1786	mulq	$m0
1787	add	%rax,%r13
1788	 mov	8*6($nptr),%rax
1789	adc	\$0,%rdx
1790	add	%r13,%r12
1791	mov	%rdx,%r13
1792	adc	\$0,%r13
1793
1794	mulq	$m0
1795	add	%rax,%r14
1796	 mov	8*7($nptr),%rax
1797	adc	\$0,%rdx
1798	add	%r14,%r13
1799	mov	%rdx,%r14
1800	adc	\$0,%r14
1801
1802	mulq	$m0
1803	 mov	$carry,$m0		# n0*a[i]
1804	add	%rax,%r15
1805	 mov	8*0($nptr),%rax		# n[0]
1806	adc	\$0,%rdx
1807	add	%r15,%r14
1808	mov	%rdx,%r15
1809	adc	\$0,%r15
1810
1811	dec	%ecx
1812	jnz	.L8x_reduce
1813
1814	lea	8*8($nptr),$nptr
1815	xor	%rax,%rax
1816	mov	8+8(%rsp),%rdx		# pull end of t[]
1817	cmp	0+8(%rsp),$nptr		# end of n[]?
1818	jae	.L8x_no_tail
1819
1820	.byte	0x66
1821	add	8*0($tptr),%r8
1822	adc	8*1($tptr),%r9
1823	adc	8*2($tptr),%r10
1824	adc	8*3($tptr),%r11
1825	adc	8*4($tptr),%r12
1826	adc	8*5($tptr),%r13
1827	adc	8*6($tptr),%r14
1828	adc	8*7($tptr),%r15
1829	sbb	$carry,$carry		# top carry
1830
1831	mov	48+56+8(%rsp),$m0	# pull n0*a[0]
1832	mov	\$8,%ecx
1833	mov	8*0($nptr),%rax
1834	jmp	.L8x_tail
1835
1836.align	32
1837.L8x_tail:
1838	mulq	$m0
1839	add	%rax,%r8
1840	 mov	8*1($nptr),%rax
1841	 mov	%r8,($tptr)		# save result
1842	mov	%rdx,%r8
1843	adc	\$0,%r8
1844
1845	mulq	$m0
1846	add	%rax,%r9
1847	 mov	8*2($nptr),%rax
1848	adc	\$0,%rdx
1849	add	%r9,%r8
1850	 lea	8($tptr),$tptr		# $tptr++
1851	mov	%rdx,%r9
1852	adc	\$0,%r9
1853
1854	mulq	$m0
1855	add	%rax,%r10
1856	 mov	8*3($nptr),%rax
1857	adc	\$0,%rdx
1858	add	%r10,%r9
1859	mov	%rdx,%r10
1860	adc	\$0,%r10
1861
1862	mulq	$m0
1863	add	%rax,%r11
1864	 mov	8*4($nptr),%rax
1865	adc	\$0,%rdx
1866	add	%r11,%r10
1867	mov	%rdx,%r11
1868	adc	\$0,%r11
1869
1870	mulq	$m0
1871	add	%rax,%r12
1872	 mov	8*5($nptr),%rax
1873	adc	\$0,%rdx
1874	add	%r12,%r11
1875	mov	%rdx,%r12
1876	adc	\$0,%r12
1877
1878	mulq	$m0
1879	add	%rax,%r13
1880	 mov	8*6($nptr),%rax
1881	adc	\$0,%rdx
1882	add	%r13,%r12
1883	mov	%rdx,%r13
1884	adc	\$0,%r13
1885
1886	mulq	$m0
1887	add	%rax,%r14
1888	 mov	8*7($nptr),%rax
1889	adc	\$0,%rdx
1890	add	%r14,%r13
1891	mov	%rdx,%r14
1892	adc	\$0,%r14
1893
1894	mulq	$m0
1895	 mov	48-16+8(%rsp,%rcx,8),$m0# pull n0*a[i]
1896	add	%rax,%r15
1897	adc	\$0,%rdx
1898	add	%r15,%r14
1899	 mov	8*0($nptr),%rax		# pull n[0]
1900	mov	%rdx,%r15
1901	adc	\$0,%r15
1902
1903	dec	%ecx
1904	jnz	.L8x_tail
1905
1906	lea	8*8($nptr),$nptr
1907	mov	8+8(%rsp),%rdx		# pull end of t[]
1908	cmp	0+8(%rsp),$nptr		# end of n[]?
1909	jae	.L8x_tail_done		# break out of loop
1910
1911	 mov	48+56+8(%rsp),$m0	# pull n0*a[0]
1912	neg	$carry
1913	 mov	8*0($nptr),%rax		# pull n[0]
1914	adc	8*0($tptr),%r8
1915	adc	8*1($tptr),%r9
1916	adc	8*2($tptr),%r10
1917	adc	8*3($tptr),%r11
1918	adc	8*4($tptr),%r12
1919	adc	8*5($tptr),%r13
1920	adc	8*6($tptr),%r14
1921	adc	8*7($tptr),%r15
1922	sbb	$carry,$carry		# top carry
1923
1924	mov	\$8,%ecx
1925	jmp	.L8x_tail
1926
1927.align	32
1928.L8x_tail_done:
1929	xor	%rax,%rax
1930	add	(%rdx),%r8		# can this overflow?
1931	adc	\$0,%r9
1932	adc	\$0,%r10
1933	adc	\$0,%r11
1934	adc	\$0,%r12
1935	adc	\$0,%r13
1936	adc	\$0,%r14
1937	adc	\$0,%r15
1938	adc	\$0,%rax
1939
1940	neg	$carry
1941.L8x_no_tail:
1942	adc	8*0($tptr),%r8
1943	adc	8*1($tptr),%r9
1944	adc	8*2($tptr),%r10
1945	adc	8*3($tptr),%r11
1946	adc	8*4($tptr),%r12
1947	adc	8*5($tptr),%r13
1948	adc	8*6($tptr),%r14
1949	adc	8*7($tptr),%r15
1950	adc	\$0,%rax		# top-most carry
1951	 mov	-8($nptr),%rcx		# np[num-1]
1952	 xor	$carry,$carry
1953
1954	movq	%xmm2,$nptr		# restore $nptr
1955
1956	mov	%r8,8*0($tptr)		# store top 512 bits
1957	mov	%r9,8*1($tptr)
1958	 movq	%xmm3,$num		# $num is %r9, can't be moved upwards
1959	mov	%r10,8*2($tptr)
1960	mov	%r11,8*3($tptr)
1961	mov	%r12,8*4($tptr)
1962	mov	%r13,8*5($tptr)
1963	mov	%r14,8*6($tptr)
1964	mov	%r15,8*7($tptr)
1965	lea	8*8($tptr),$tptr
1966
1967	cmp	%rdx,$tptr		# end of t[]?
1968	jb	.L8x_reduction_loop
1969	ret
1970.size	bn_sqr8x_internal,.-bn_sqr8x_internal
1971___
1972}
1973##############################################################
1974# Post-condition, 4x unrolled
1975#
1976{
# __bn_post4x_internal: branch-free final subtraction, four words per
# loop iteration.
#
# Inputs as used by the code below:
#   %rax   - turned into a mask with "neg %rax"; the reduction code
#            leaves its top-most carry there, so the mask is expected to
#            be 0 or all-ones (NOTE(review): confirm for every caller)
#   %rdi   - t[] pointer left by the reduction; the source window is
#            %rdi+$num
#   $num   - used as a negative byte count here: "sar 3+2" yields a
#            negative count of four-word groups that "inc %rcx" runs up
#            to zero
#   $nptr  - modulus n[]
#   %xmm1  - destination pointer, copied to $rptr (and to $aptr for a
#            following call)
#
# Each word of n[] is complemented, and-ed with the mask and added to the
# matching word of t[] with carry.  n[0] is decremented first so that its
# complement equals -n[0]; together this adds the two's complement of
# n[] when the mask is set and zero otherwise.  The carry flag is
# kept in %r10 as 0/-1 between iterations (neg/sbb pair).
#
# On exit %r10 holds the incoming (negative) $num and $num itself has
# been negated back.
1977my ($tptr,$nptr)=("%rbx","%rbp");
1978$code.=<<___;
1979.type	__bn_post4x_internal,\@abi-omnipotent
1980.align	32
1981__bn_post4x_internal:
1982	mov	8*0($nptr),%r12
1983	lea	(%rdi,$num),$tptr	# %rdi was $tptr above
1984	mov	$num,%rcx
1985	movq	%xmm1,$rptr		# restore $rptr
1986	neg	%rax
1987	movq	%xmm1,$aptr		# prepare for back-to-back call
1988	sar	\$3+2,%rcx
1989	dec	%r12			# so that after 'not' we get -n[0]
1990	xor	%r10,%r10
1991	mov	8*1($nptr),%r13
1992	mov	8*2($nptr),%r14
1993	mov	8*3($nptr),%r15
1994	jmp	.Lsqr4x_sub_entry
1995
1996.align	16
1997.Lsqr4x_sub:
1998	mov	8*0($nptr),%r12
1999	mov	8*1($nptr),%r13
2000	mov	8*2($nptr),%r14
2001	mov	8*3($nptr),%r15
2002.Lsqr4x_sub_entry:
2003	lea	8*4($nptr),$nptr
2004	not	%r12
2005	not	%r13
2006	not	%r14
2007	not	%r15
2008	and	%rax,%r12
2009	and	%rax,%r13
2010	and	%rax,%r14
2011	and	%rax,%r15
2012
2013	neg	%r10			# mov %r10,%cf
2014	adc	8*0($tptr),%r12
2015	adc	8*1($tptr),%r13
2016	adc	8*2($tptr),%r14
2017	adc	8*3($tptr),%r15
2018	mov	%r12,8*0($rptr)
2019	lea	8*4($tptr),$tptr
2020	mov	%r13,8*1($rptr)
2021	sbb	%r10,%r10		# mov %cf,%r10
2022	mov	%r14,8*2($rptr)
2023	mov	%r15,8*3($rptr)
2024	lea	8*4($rptr),$rptr
2025
2026	inc	%rcx			# pass %cf
2027	jnz	.Lsqr4x_sub
2028
2029	mov	$num,%r10		# prepare for back-to-back call
2030	neg	$num			# restore $num
2031	ret
2032.size	__bn_post4x_internal,.-__bn_post4x_internal
2033___
2034}
2035{
# bn_from_montgomery / bn_from_mont8x.
#
# bn_from_montgomery is a thin dispatcher: it tests the low three bits
# of the sixth argument (48(%rsp) on Win64, %r9d otherwise, selected at
# generation time) and tail-jumps to bn_from_mont8x when they are zero;
# otherwise it returns 0 in %eax.
#
# bn_from_mont8x:
#   1. saves the callee-saved registers and keeps the original %rsp in
#      %rax (later stored at 40(%rsp));
#   2. converts $num to bytes and carves out the stack frame, nudging it
#      so that it does not alias $rptr modulo 4096, then touches every
#      page on the way down (.Lfrom_page_walk);
#   3. .Lmul_by_1 copies the input, 64 bytes per iteration, into the
#      t[] area at 48(%rsp) and zeroes the area $num bytes above it;
#      the .byte sequence is a hand-encoded lea, see its own comment;
#   4. parks $rptr, $nptr and -num in %xmm1, %xmm2, %xmm3 for the
#      helpers, then calls the reduction and post-condition routines;
#   5. .Lfrom_mont_zero wipes the frame (64 bytes per iteration while
#      $num drops by 32), restores the saved registers and returns 1.
2036$code.=<<___;
2037.globl	bn_from_montgomery
2038.type	bn_from_montgomery,\@abi-omnipotent
2039.align	32
2040bn_from_montgomery:
2041	testl	\$7,`($win64?"48(%rsp)":"%r9d")`
2042	jz	bn_from_mont8x
2043	xor	%eax,%eax
2044	ret
2045.size	bn_from_montgomery,.-bn_from_montgomery
2046
2047.type	bn_from_mont8x,\@function,6
2048.align	32
2049bn_from_mont8x:
2050	.byte	0x67
2051	mov	%rsp,%rax
2052	push	%rbx
2053	push	%rbp
2054	push	%r12
2055	push	%r13
2056	push	%r14
2057	push	%r15
2058.Lfrom_prologue:
2059
2060	shl	\$3,${num}d		# convert $num to bytes
2061	lea	($num,$num,2),%r10	# 3*$num in bytes
2062	neg	$num
2063	mov	($n0),$n0		# *n0
2064
2065	##############################################################
2066	# Ensure that stack frame doesn't alias with $rptr+3*$num
2067	# modulo 4096, which covers ret[num], am[num] and n[num]
2068	# (see bn_exp.c). The stack is allocated to aligned with
2069	# bn_power5's frame, and as bn_from_montgomery happens to be
2070	# last operation, we use the opportunity to cleanse it.
2071	#
2072	lea	-320(%rsp,$num,2),%r11
2073	mov	%rsp,%rbp
2074	sub	$rptr,%r11
2075	and	\$4095,%r11
2076	cmp	%r11,%r10
2077	jb	.Lfrom_sp_alt
2078	sub	%r11,%rbp		# align with $aptr
2079	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*$num*8+256)
2080	jmp	.Lfrom_sp_done
2081
2082.align	32
2083.Lfrom_sp_alt:
2084	lea	4096-320(,$num,2),%r10
2085	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*$num*8+256)
2086	sub	%r10,%r11
2087	mov	\$0,%r10
2088	cmovc	%r10,%r11
2089	sub	%r11,%rbp
2090.Lfrom_sp_done:
2091	and	\$-64,%rbp
2092	mov	%rsp,%r11
2093	sub	%rbp,%r11
2094	and	\$-4096,%r11
2095	lea	(%rbp,%r11),%rsp
2096	mov	(%rsp),%r10
2097	cmp	%rbp,%rsp
2098	ja	.Lfrom_page_walk
2099	jmp	.Lfrom_page_walk_done
2100
2101.Lfrom_page_walk:
2102	lea	-4096(%rsp),%rsp
2103	mov	(%rsp),%r10
2104	cmp	%rbp,%rsp
2105	ja	.Lfrom_page_walk
2106.Lfrom_page_walk_done:
2107
2108	mov	$num,%r10
2109	neg	$num
2110
2111	##############################################################
2112	# Stack layout
2113	#
2114	# +0	saved $num, used in reduction section
2115	# +8	&t[2*$num], used in reduction section
2116	# +32	saved *n0
2117	# +40	saved %rsp
2118	# +48	t[2*$num]
2119	#
2120	mov	$n0,  32(%rsp)
2121	mov	%rax, 40(%rsp)		# save original %rsp
2122.Lfrom_body:
2123	mov	$num,%r11
2124	lea	48(%rsp),%rax
2125	pxor	%xmm0,%xmm0
2126	jmp	.Lmul_by_1
2127
2128.align	32
2129.Lmul_by_1:
2130	movdqu	($aptr),%xmm1
2131	movdqu	16($aptr),%xmm2
2132	movdqu	32($aptr),%xmm3
2133	movdqa	%xmm0,(%rax,$num)
2134	movdqu	48($aptr),%xmm4
2135	movdqa	%xmm0,16(%rax,$num)
2136	.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00	# lea	64($aptr),$aptr
2137	movdqa	%xmm1,(%rax)
2138	movdqa	%xmm0,32(%rax,$num)
2139	movdqa	%xmm2,16(%rax)
2140	movdqa	%xmm0,48(%rax,$num)
2141	movdqa	%xmm3,32(%rax)
2142	movdqa	%xmm4,48(%rax)
2143	lea	64(%rax),%rax
2144	sub	\$64,%r11
2145	jnz	.Lmul_by_1
2146
2147	movq	$rptr,%xmm1
2148	movq	$nptr,%xmm2
2149	.byte	0x67
2150	mov	$nptr,%rbp
2151	movq	%r10, %xmm3		# -num
2152___
# AD*X/BMI variant: this fragment is appended only when $addx is set
# by the assembler capability probes near the top of the file.  At run
# time the emitted code still checks OPENSSL_ia32cap_P and falls through
# to .Lfrom_mont_nox when the required CPU features are absent.
2153$code.=<<___ if ($addx);
2154	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
2155	and	\$0x80108,%r11d
2156	cmp	\$0x80108,%r11d		# check for AD*X+BMI2+BMI1
2157	jne	.Lfrom_mont_nox
2158
2159	lea	(%rax,$num),$rptr
2160	call	__bn_sqrx8x_reduction
2161	call	__bn_postx4x_internal
2162
2163	pxor	%xmm0,%xmm0
2164	lea	48(%rsp),%rax
2165	mov	40(%rsp),%rsi		# restore %rsp
2166	jmp	.Lfrom_mont_zero
2167
2168.align	32
2169.Lfrom_mont_nox:
2170___
# Generic path (always emitted), followed by the shared frame-wiping
# loop and epilogue.
2171$code.=<<___;
2172	call	__bn_sqr8x_reduction
2173	call	__bn_post4x_internal
2174
2175	pxor	%xmm0,%xmm0
2176	lea	48(%rsp),%rax
2177	mov	40(%rsp),%rsi		# restore %rsp
2178	jmp	.Lfrom_mont_zero
2179
2180.align	32
2181.Lfrom_mont_zero:
2182	movdqa	%xmm0,16*0(%rax)
2183	movdqa	%xmm0,16*1(%rax)
2184	movdqa	%xmm0,16*2(%rax)
2185	movdqa	%xmm0,16*3(%rax)
2186	lea	16*4(%rax),%rax
2187	sub	\$32,$num
2188	jnz	.Lfrom_mont_zero
2189
2190	mov	\$1,%rax
2191	mov	-48(%rsi),%r15
2192	mov	-40(%rsi),%r14
2193	mov	-32(%rsi),%r13
2194	mov	-24(%rsi),%r12
2195	mov	-16(%rsi),%rbp
2196	mov	-8(%rsi),%rbx
2197	lea	(%rsi),%rsp
2198.Lfrom_epilogue:
2199	ret
2200.size	bn_from_mont8x,.-bn_from_mont8x
2201___
2202}
2203}}}
2204
2205if ($addx) {{{
2206my $bp="%rdx";	# restore original value
2207
2208$code.=<<___;
2209.type	bn_mulx4x_mont_gather5,\@function,6
2210.align	32
2211bn_mulx4x_mont_gather5:
2212	mov	%rsp,%rax
2213.Lmulx4x_enter:
2214	push	%rbx
2215	push	%rbp
2216	push	%r12
2217	push	%r13
2218	push	%r14
2219	push	%r15
2220.Lmulx4x_prologue:
2221
2222	shl	\$3,${num}d		# convert $num to bytes
2223	lea	($num,$num,2),%r10	# 3*$num in bytes
2224	neg	$num			# -$num
2225	mov	($n0),$n0		# *n0
2226
2227	##############################################################
2228	# Ensure that stack frame doesn't alias with $rptr+3*$num
2229	# modulo 4096, which covers ret[num], am[num] and n[num]
2230	# (see bn_exp.c). This is done to allow memory disambiguation
2231	# logic do its magic. [Extra [num] is allocated in order
2232	# to align with bn_power5's frame, which is cleansed after
2233	# completing exponentiation. Extra 256 bytes is for power mask
2234	# calculated from 7th argument, the index.]
2235	#
2236	lea	-320(%rsp,$num,2),%r11
2237	mov	%rsp,%rbp
2238	sub	$rp,%r11
2239	and	\$4095,%r11
2240	cmp	%r11,%r10
2241	jb	.Lmulx4xsp_alt
2242	sub	%r11,%rbp		# align with $aptr
2243	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*$num*8+256)
2244	jmp	.Lmulx4xsp_done
2245
2246.Lmulx4xsp_alt:
2247	lea	4096-320(,$num,2),%r10
2248	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*$num*8+256)
2249	sub	%r10,%r11
2250	mov	\$0,%r10
2251	cmovc	%r10,%r11
2252	sub	%r11,%rbp
2253.Lmulx4xsp_done:
2254	and	\$-64,%rbp		# ensure alignment
2255	mov	%rsp,%r11
2256	sub	%rbp,%r11
2257	and	\$-4096,%r11
2258	lea	(%rbp,%r11),%rsp
2259	mov	(%rsp),%r10
2260	cmp	%rbp,%rsp
2261	ja	.Lmulx4x_page_walk
2262	jmp	.Lmulx4x_page_walk_done
2263
2264.Lmulx4x_page_walk:
2265	lea	-4096(%rsp),%rsp
2266	mov	(%rsp),%r10
2267	cmp	%rbp,%rsp
2268	ja	.Lmulx4x_page_walk
2269.Lmulx4x_page_walk_done:
2270
2271	##############################################################
2272	# Stack layout
2273	# +0	-num
2274	# +8	off-loaded &b[i]
2275	# +16	end of b[num]
2276	# +24	inner counter
2277	# +32	saved n0
2278	# +40	saved %rsp
2279	# +48
2280	# +56	saved rp
2281	# +64	tmp[num+1]
2282	#
2283	mov	$n0, 32(%rsp)		# save *n0
2284	mov	%rax,40(%rsp)		# save original %rsp
2285.Lmulx4x_body:
2286	call	mulx4x_internal
2287
2288	mov	40(%rsp),%rsi		# restore %rsp
2289	mov	\$1,%rax
2290
2291	mov	-48(%rsi),%r15
2292	mov	-40(%rsi),%r14
2293	mov	-32(%rsi),%r13
2294	mov	-24(%rsi),%r12
2295	mov	-16(%rsi),%rbp
2296	mov	-8(%rsi),%rbx
2297	lea	(%rsi),%rsp
2298.Lmulx4x_epilogue:
2299	ret
2300.size	bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
2301
2302.type	mulx4x_internal,\@abi-omnipotent
2303.align	32
2304mulx4x_internal:
2305	mov	$num,8(%rsp)		# save -$num (it was in bytes)
2306	mov	$num,%r10
2307	neg	$num			# restore $num
2308	shl	\$5,$num
2309	neg	%r10			# restore $num
2310	lea	128($bp,$num),%r13	# end of powers table (+size optimization)
2311	shr	\$5+5,$num
2312	movd	`($win64?56:8)`(%rax),%xmm5	# load 7th argument
2313	sub	\$1,$num
2314	lea	.Linc(%rip),%rax
2315	mov	%r13,16+8(%rsp)		# end of b[num]
2316	mov	$num,24+8(%rsp)		# inner counter
2317	mov	$rp, 56+8(%rsp)		# save $rp
2318___
2319my ($aptr, $bptr, $nptr, $tptr, $mi,  $bi,  $zero, $num)=
2320   ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
2321my $rptr=$bptr;
2322my $STRIDE=2**5*8;		# 5 is "window size"
2323my $N=$STRIDE/4;		# should match cache line size
2324$code.=<<___;
2325	movdqa	0(%rax),%xmm0		# 00000001000000010000000000000000
2326	movdqa	16(%rax),%xmm1		# 00000002000000020000000200000002
2327	lea	88-112(%rsp,%r10),%r10	# place the mask after tp[num+1] (+ICache optimizaton)
2328	lea	128($bp),$bptr		# size optimization
2329
2330	pshufd	\$0,%xmm5,%xmm5		# broadcast index
2331	movdqa	%xmm1,%xmm4
2332	.byte	0x67
2333	movdqa	%xmm1,%xmm2
2334___
2335########################################################################
2336# calculate mask by comparing 0..31 to index and save result to stack
2337#
2338$code.=<<___;
2339	.byte	0x67
2340	paddd	%xmm0,%xmm1
2341	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
2342	movdqa	%xmm4,%xmm3
2343___
2344for($i=0;$i<$STRIDE/16-4;$i+=4) {
2345$code.=<<___;
2346	paddd	%xmm1,%xmm2
2347	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
2348	movdqa	%xmm0,`16*($i+0)+112`(%r10)
2349	movdqa	%xmm4,%xmm0
2350
2351	paddd	%xmm2,%xmm3
2352	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
2353	movdqa	%xmm1,`16*($i+1)+112`(%r10)
2354	movdqa	%xmm4,%xmm1
2355
2356	paddd	%xmm3,%xmm0
2357	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
2358	movdqa	%xmm2,`16*($i+2)+112`(%r10)
2359	movdqa	%xmm4,%xmm2
2360
2361	paddd	%xmm0,%xmm1
2362	pcmpeqd	%xmm5,%xmm0
2363	movdqa	%xmm3,`16*($i+3)+112`(%r10)
2364	movdqa	%xmm4,%xmm3
2365___
2366}
2367$code.=<<___;				# last iteration can be optimized
2368	.byte	0x67
2369	paddd	%xmm1,%xmm2
2370	pcmpeqd	%xmm5,%xmm1
2371	movdqa	%xmm0,`16*($i+0)+112`(%r10)
2372
2373	paddd	%xmm2,%xmm3
2374	pcmpeqd	%xmm5,%xmm2
2375	movdqa	%xmm1,`16*($i+1)+112`(%r10)
2376
2377	pcmpeqd	%xmm5,%xmm3
2378	movdqa	%xmm2,`16*($i+2)+112`(%r10)
2379
2380	pand	`16*($i+0)-128`($bptr),%xmm0	# while it's still in register
2381	pand	`16*($i+1)-128`($bptr),%xmm1
2382	pand	`16*($i+2)-128`($bptr),%xmm2
2383	movdqa	%xmm3,`16*($i+3)+112`(%r10)
2384	pand	`16*($i+3)-128`($bptr),%xmm3
2385	por	%xmm2,%xmm0
2386	por	%xmm3,%xmm1
2387___
2388for($i=0;$i<$STRIDE/16-4;$i+=4) {
2389$code.=<<___;
2390	movdqa	`16*($i+0)-128`($bptr),%xmm4
2391	movdqa	`16*($i+1)-128`($bptr),%xmm5
2392	movdqa	`16*($i+2)-128`($bptr),%xmm2
2393	pand	`16*($i+0)+112`(%r10),%xmm4
2394	movdqa	`16*($i+3)-128`($bptr),%xmm3
2395	pand	`16*($i+1)+112`(%r10),%xmm5
2396	por	%xmm4,%xmm0
2397	pand	`16*($i+2)+112`(%r10),%xmm2
2398	por	%xmm5,%xmm1
2399	pand	`16*($i+3)+112`(%r10),%xmm3
2400	por	%xmm2,%xmm0
2401	por	%xmm3,%xmm1
2402___
2403}
2404$code.=<<___;
2405	pxor	%xmm1,%xmm0
2406	pshufd	\$0x4e,%xmm0,%xmm1
2407	por	%xmm1,%xmm0
2408	lea	$STRIDE($bptr),$bptr
2409	movq	%xmm0,%rdx		# bp[0]
2410	lea	64+8*4+8(%rsp),$tptr
2411
2412	mov	%rdx,$bi
2413	mulx	0*8($aptr),$mi,%rax	# a[0]*b[0]
2414	mulx	1*8($aptr),%r11,%r12	# a[1]*b[0]
2415	add	%rax,%r11
2416	mulx	2*8($aptr),%rax,%r13	# ...
2417	adc	%rax,%r12
2418	adc	\$0,%r13
2419	mulx	3*8($aptr),%rax,%r14
2420
2421	mov	$mi,%r15
2422	imulq	32+8(%rsp),$mi		# "t[0]"*n0
2423	xor	$zero,$zero		# cf=0, of=0
2424	mov	$mi,%rdx
2425
2426	mov	$bptr,8+8(%rsp)		# off-load &b[i]
2427
2428	lea	4*8($aptr),$aptr
2429	adcx	%rax,%r13
2430	adcx	$zero,%r14		# cf=0
2431
2432	mulx	0*8($nptr),%rax,%r10
2433	adcx	%rax,%r15		# discarded
2434	adox	%r11,%r10
2435	mulx	1*8($nptr),%rax,%r11
2436	adcx	%rax,%r10
2437	adox	%r12,%r11
2438	mulx	2*8($nptr),%rax,%r12
2439	mov	24+8(%rsp),$bptr	# counter value
2440	mov	%r10,-8*4($tptr)
2441	adcx	%rax,%r11
2442	adox	%r13,%r12
2443	mulx	3*8($nptr),%rax,%r15
2444	 mov	$bi,%rdx
2445	mov	%r11,-8*3($tptr)
2446	adcx	%rax,%r12
2447	adox	$zero,%r15		# of=0
2448	lea	4*8($nptr),$nptr
2449	mov	%r12,-8*2($tptr)
2450	jmp	.Lmulx4x_1st
2451
2452.align	32
2453.Lmulx4x_1st:
2454	adcx	$zero,%r15		# cf=0, modulo-scheduled
2455	mulx	0*8($aptr),%r10,%rax	# a[4]*b[0]
2456	adcx	%r14,%r10
2457	mulx	1*8($aptr),%r11,%r14	# a[5]*b[0]
2458	adcx	%rax,%r11
2459	mulx	2*8($aptr),%r12,%rax	# ...
2460	adcx	%r14,%r12
2461	mulx	3*8($aptr),%r13,%r14
2462	 .byte	0x67,0x67
2463	 mov	$mi,%rdx
2464	adcx	%rax,%r13
2465	adcx	$zero,%r14		# cf=0
2466	lea	4*8($aptr),$aptr
2467	lea	4*8($tptr),$tptr
2468
2469	adox	%r15,%r10
2470	mulx	0*8($nptr),%rax,%r15
2471	adcx	%rax,%r10
2472	adox	%r15,%r11
2473	mulx	1*8($nptr),%rax,%r15
2474	adcx	%rax,%r11
2475	adox	%r15,%r12
2476	mulx	2*8($nptr),%rax,%r15
2477	mov	%r10,-5*8($tptr)
2478	adcx	%rax,%r12
2479	mov	%r11,-4*8($tptr)
2480	adox	%r15,%r13
2481	mulx	3*8($nptr),%rax,%r15
2482	 mov	$bi,%rdx
2483	mov	%r12,-3*8($tptr)
2484	adcx	%rax,%r13
2485	adox	$zero,%r15
2486	lea	4*8($nptr),$nptr
2487	mov	%r13,-2*8($tptr)
2488
2489	dec	$bptr			# of=0, pass cf
2490	jnz	.Lmulx4x_1st
2491
2492	mov	8(%rsp),$num		# load -num
2493	adc	$zero,%r15		# modulo-scheduled
2494	lea	($aptr,$num),$aptr	# rewind $aptr
2495	add	%r15,%r14
2496	mov	8+8(%rsp),$bptr		# re-load &b[i]
2497	adc	$zero,$zero		# top-most carry
2498	mov	%r14,-1*8($tptr)
2499	jmp	.Lmulx4x_outer
2500
2501.align	32
2502.Lmulx4x_outer:
2503	lea	16-256($tptr),%r10	# where 256-byte mask is (+density control)
2504	pxor	%xmm4,%xmm4
2505	.byte	0x67,0x67
2506	pxor	%xmm5,%xmm5
2507___
2508for($i=0;$i<$STRIDE/16;$i+=4) {
2509$code.=<<___;
2510	movdqa	`16*($i+0)-128`($bptr),%xmm0
2511	movdqa	`16*($i+1)-128`($bptr),%xmm1
2512	movdqa	`16*($i+2)-128`($bptr),%xmm2
2513	pand	`16*($i+0)+256`(%r10),%xmm0
2514	movdqa	`16*($i+3)-128`($bptr),%xmm3
2515	pand	`16*($i+1)+256`(%r10),%xmm1
2516	por	%xmm0,%xmm4
2517	pand	`16*($i+2)+256`(%r10),%xmm2
2518	por	%xmm1,%xmm5
2519	pand	`16*($i+3)+256`(%r10),%xmm3
2520	por	%xmm2,%xmm4
2521	por	%xmm3,%xmm5
2522___
2523}
2524$code.=<<___;
2525	por	%xmm5,%xmm4
2526	pshufd	\$0x4e,%xmm4,%xmm0
2527	por	%xmm4,%xmm0
2528	lea	$STRIDE($bptr),$bptr
2529	movq	%xmm0,%rdx		# m0=bp[i]
2530
2531	mov	$zero,($tptr)		# save top-most carry
2532	lea	4*8($tptr,$num),$tptr	# rewind $tptr
2533	mulx	0*8($aptr),$mi,%r11	# a[0]*b[i]
2534	xor	$zero,$zero		# cf=0, of=0
2535	mov	%rdx,$bi
2536	mulx	1*8($aptr),%r14,%r12	# a[1]*b[i]
2537	adox	-4*8($tptr),$mi		# +t[0]
2538	adcx	%r14,%r11
2539	mulx	2*8($aptr),%r15,%r13	# ...
2540	adox	-3*8($tptr),%r11
2541	adcx	%r15,%r12
2542	mulx	3*8($aptr),%rdx,%r14
2543	adox	-2*8($tptr),%r12
2544	adcx	%rdx,%r13
2545	lea	($nptr,$num),$nptr	# rewind $nptr
2546	lea	4*8($aptr),$aptr
2547	adox	-1*8($tptr),%r13
2548	adcx	$zero,%r14
2549	adox	$zero,%r14
2550
2551	mov	$mi,%r15
2552	imulq	32+8(%rsp),$mi		# "t[0]"*n0
2553
2554	mov	$mi,%rdx
2555	xor	$zero,$zero		# cf=0, of=0
2556	mov	$bptr,8+8(%rsp)		# off-load &b[i]
2557
2558	mulx	0*8($nptr),%rax,%r10
2559	adcx	%rax,%r15		# discarded
2560	adox	%r11,%r10
2561	mulx	1*8($nptr),%rax,%r11
2562	adcx	%rax,%r10
2563	adox	%r12,%r11
2564	mulx	2*8($nptr),%rax,%r12
2565	adcx	%rax,%r11
2566	adox	%r13,%r12
2567	mulx	3*8($nptr),%rax,%r15
2568	 mov	$bi,%rdx
2569	mov	24+8(%rsp),$bptr	# counter value
2570	mov	%r10,-8*4($tptr)
2571	adcx	%rax,%r12
2572	mov	%r11,-8*3($tptr)
2573	adox	$zero,%r15		# of=0
2574	mov	%r12,-8*2($tptr)
2575	lea	4*8($nptr),$nptr
2576	jmp	.Lmulx4x_inner
2577
2578.align	32
2579.Lmulx4x_inner:
2580	mulx	0*8($aptr),%r10,%rax	# a[4]*b[i]
2581	adcx	$zero,%r15		# cf=0, modulo-scheduled
2582	adox	%r14,%r10
2583	mulx	1*8($aptr),%r11,%r14	# a[5]*b[i]
2584	adcx	0*8($tptr),%r10
2585	adox	%rax,%r11
2586	mulx	2*8($aptr),%r12,%rax	# ...
2587	adcx	1*8($tptr),%r11
2588	adox	%r14,%r12
2589	mulx	3*8($aptr),%r13,%r14
2590	 mov	$mi,%rdx
2591	adcx	2*8($tptr),%r12
2592	adox	%rax,%r13
2593	adcx	3*8($tptr),%r13
2594	adox	$zero,%r14		# of=0
2595	lea	4*8($aptr),$aptr
2596	lea	4*8($tptr),$tptr
2597	adcx	$zero,%r14		# cf=0
2598
2599	adox	%r15,%r10
2600	mulx	0*8($nptr),%rax,%r15
2601	adcx	%rax,%r10
2602	adox	%r15,%r11
2603	mulx	1*8($nptr),%rax,%r15
2604	adcx	%rax,%r11
2605	adox	%r15,%r12
2606	mulx	2*8($nptr),%rax,%r15
2607	mov	%r10,-5*8($tptr)
2608	adcx	%rax,%r12
2609	adox	%r15,%r13
2610	mov	%r11,-4*8($tptr)
2611	mulx	3*8($nptr),%rax,%r15
2612	 mov	$bi,%rdx
2613	lea	4*8($nptr),$nptr
2614	mov	%r12,-3*8($tptr)
2615	adcx	%rax,%r13
2616	adox	$zero,%r15
2617	mov	%r13,-2*8($tptr)
2618
2619	dec	$bptr			# of=0, pass cf
2620	jnz	.Lmulx4x_inner
2621
2622	mov	0+8(%rsp),$num		# load -num
2623	adc	$zero,%r15		# modulo-scheduled
2624	sub	0*8($tptr),$bptr	# pull top-most carry to %cf
2625	mov	8+8(%rsp),$bptr		# re-load &b[i]
2626	mov	16+8(%rsp),%r10
2627	adc	%r15,%r14
2628	lea	($aptr,$num),$aptr	# rewind $aptr
2629	adc	$zero,$zero		# top-most carry
2630	mov	%r14,-1*8($tptr)
2631
2632	cmp	%r10,$bptr
2633	jb	.Lmulx4x_outer
2634
2635	mov	-8($nptr),%r10
2636	mov	$zero,%r8
2637	mov	($nptr,$num),%r12
2638	lea	($nptr,$num),%rbp	# rewind $nptr
2639	mov	$num,%rcx
2640	lea	($tptr,$num),%rdi	# rewind $tptr
2641	xor	%eax,%eax
2642	xor	%r15,%r15
2643	sub	%r14,%r10		# compare top-most words
2644	adc	%r15,%r15
2645	or	%r15,%r8
2646	sar	\$3+2,%rcx
2647	sub	%r8,%rax		# %rax=-%r8
2648	mov	56+8(%rsp),%rdx		# restore rp
2649	dec	%r12			# so that after 'not' we get -n[0]
2650	mov	8*1(%rbp),%r13
2651	xor	%r8,%r8
2652	mov	8*2(%rbp),%r14
2653	mov	8*3(%rbp),%r15
2654	jmp	.Lsqrx4x_sub_entry	# common post-condition
2655.size	mulx4x_internal,.-mulx4x_internal
2656___
2657}{
2658######################################################################
# bn_powerx5: MULX/ADCX/ADOX code path. As visible below, it sets up a
# stack frame, runs five rounds of __bn_sqrx8x_internal followed by
# __bn_postx4x_internal, then makes one mulx4x_internal call with the
# table pointer, and returns 1 in %rax. The declarations that follow
# name the argument registers.
#
2659# void bn_powerx5(
2660my $rptr="%rdi";	# BN_ULONG *rptr,
2661my $aptr="%rsi";	# const BN_ULONG *aptr,
2662my $bptr="%rdx";	# const void *table,
2663my $nptr="%rcx";	# const BN_ULONG *nptr,
2664my $n0  ="%r8";		# const BN_ULONG *n0,
2665my $num ="%r9";		# int num, has to be divisible by 8
2666			# int pwr);
2667
2668my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);	# $tptr shares %rdi with $rptr
2669my @A0=("%r10","%r11");		# t[] word pair used by the shift-n-add loop
2670my @A1=("%r12","%r13");		# second t[] word pair used by the shift-n-add loop
2671my ($a0,$a1,$ai)=("%r14","%r15","%rbx");	# no use visible in this section of the file
2672
2673$code.=<<___;
2674.type	bn_powerx5,\@function,6
2675.align	32
2676bn_powerx5:
2677	mov	%rsp,%rax		# keep entry %rsp, stored at 40(%rsp) below
2678.Lpowerx5_enter:
2679	push	%rbx
2680	push	%rbp
2681	push	%r12
2682	push	%r13
2683	push	%r14
2684	push	%r15
2685.Lpowerx5_prologue:
2686
2687	shl	\$3,${num}d		# convert $num to bytes
2688	lea	($num,$num,2),%r10	# 3*$num in bytes
2689	neg	$num			# negative byte count from here on
2690	mov	($n0),$n0		# *n0
2691
2692	##############################################################
2693	# Ensure that stack frame doesn't alias with $rptr+3*$num
2694	# modulo 4096, which covers ret[num], am[num] and n[num]
2695	# (see bn_exp.c). This is done to allow memory disambiguation
2696	# logic do its magic. [Extra 256 bytes is for power mask
2697	# calculated from 7th argument, the index.]
2698	#
2699	lea	-320(%rsp,$num,2),%r11
2700	mov	%rsp,%rbp
2701	sub	$rptr,%r11
2702	and	\$4095,%r11		# distance to the result buffer modulo 4096
2703	cmp	%r11,%r10
2704	jb	.Lpwrx_sp_alt		# taken when 3*num is below that distance
2705	sub	%r11,%rbp		# align with $aptr
2706	lea	-320(%rbp,$num,2),%rbp	# future alloca(frame+2*$num*8+256)
2707	jmp	.Lpwrx_sp_done
2708
2709.align	32
2710.Lpwrx_sp_alt:
2711	lea	4096-320(,$num,2),%r10
2712	lea	-320(%rbp,$num,2),%rbp	# alloca(frame+2*$num*8+256)
2713	sub	%r10,%r11
2714	mov	\$0,%r10
2715	cmovc	%r10,%r11		# clamp to 0 when the subtraction borrowed
2716	sub	%r11,%rbp
2717.Lpwrx_sp_done:
2718	and	\$-64,%rbp		# 64-byte align the frame base
2719	mov	%rsp,%r11
2720	sub	%rbp,%r11
2721	and	\$-4096,%r11		# whole pages between %rsp and the frame base
2722	lea	(%rbp,%r11),%rsp	# first step: land within one page of the frame base
2723	mov	(%rsp),%r10		# touch the page
2724	cmp	%rbp,%rsp
2725	ja	.Lpwrx_page_walk
2726	jmp	.Lpwrx_page_walk_done
2727
2728.Lpwrx_page_walk:
2729	lea	-4096(%rsp),%rsp	# go down one page at a time
2730	mov	(%rsp),%r10		# and read from each page on the way
2731	cmp	%rbp,%rsp
2732	ja	.Lpwrx_page_walk	# repeat while %rsp is above the frame base
2733.Lpwrx_page_walk_done:
2734
2735	mov	$num,%r10		# %r10 = negative byte count
2736	neg	$num			# $num = positive byte count again
2737
2738	##############################################################
2739	# Stack layout
2740	#
2741	# +0	saved $num, used in reduction section
2742	# +8	&t[2*$num], used in reduction section
2743	# +16	intermediate carry bit
2744	# +24	top-most carry bit, used in reduction section
2745	# +32	saved *n0
2746	# +40	saved %rsp
2747	# +48	t[2*$num]
2748	#
	# Offsets above are relative to %rsp in this function. The called
	# subroutines address the same slots with an extra +8, the
	# return address pushed by call sitting below them.
	#
2749	pxor	%xmm0,%xmm0		# zero, stored into t[] by the squaring subroutine
2750	movq	$rptr,%xmm1		# save $rptr
2751	movq	$nptr,%xmm2		# save $nptr
2752	movq	%r10, %xmm3		# -$num
2753	movq	$bptr,%xmm4		# save table pointer
2754	mov	$n0,  32(%rsp)
2755	mov	%rax, 40(%rsp)		# save original %rsp
2756.Lpowerx5_body:
2757
2758	call	__bn_sqrx8x_internal	# round 1: square and reduce
2759	call	__bn_postx4x_internal	# round 1: conditional subtraction, result to rptr
2760	call	__bn_sqrx8x_internal	# round 2
2761	call	__bn_postx4x_internal
2762	call	__bn_sqrx8x_internal	# round 3
2763	call	__bn_postx4x_internal
2764	call	__bn_sqrx8x_internal	# round 4
2765	call	__bn_postx4x_internal
2766	call	__bn_sqrx8x_internal	# round 5
2767	call	__bn_postx4x_internal
2768
2769	mov	%r10,$num		# -num
2770	mov	$aptr,$rptr		# $aptr was reloaded from %xmm1 by the post-condition routine
2771	movq	%xmm2,$nptr
2772	movq	%xmm4,$bptr
2773	mov	40(%rsp),%rax
2774
2775	call	mulx4x_internal
2776
2777	mov	40(%rsp),%rsi		# restore %rsp
2778	mov	\$1,%rax		# return value 1
2779
2780	mov	-48(%rsi),%r15
2781	mov	-40(%rsi),%r14
2782	mov	-32(%rsi),%r13
2783	mov	-24(%rsi),%r12
2784	mov	-16(%rsi),%rbp
2785	mov	-8(%rsi),%rbx
2786	lea	(%rsi),%rsp
2787.Lpowerx5_epilogue:
2788	ret
2789.size	bn_powerx5,.-bn_powerx5
2790
2791.globl	bn_sqrx8x_internal
2792.hidden	bn_sqrx8x_internal
2793.type	bn_sqrx8x_internal,\@abi-omnipotent
2794.align	32
2795bn_sqrx8x_internal:
2796__bn_sqrx8x_internal:
	# When called from bn_powerx5 in this file: %xmm0 is zero, %xmm2
	# holds the modulus pointer, %xmm3 holds -num in bytes, and the
	# caller frame slots are reached with an extra +8 because of the
	# pushed return address.
2797	##################################################################
2798	# Squaring part:
2799	#
2800	# a) multiply-n-add everything but a[i]*a[i];
2801	# b) shift result of a) by 1 to the left and accumulate
2802	#    a[i]*a[i] products;
2803	#
2804	##################################################################
2805	# a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
2806	#                                                     a[1]a[0]
2807	#                                                 a[2]a[0]
2808	#                                             a[3]a[0]
2809	#                                             a[2]a[1]
2810	#                                         a[3]a[1]
2811	#                                     a[3]a[2]
2812	#
2813	#                                         a[4]a[0]
2814	#                                     a[5]a[0]
2815	#                                 a[6]a[0]
2816	#                             a[7]a[0]
2817	#                                     a[4]a[1]
2818	#                                 a[5]a[1]
2819	#                             a[6]a[1]
2820	#                         a[7]a[1]
2821	#                                 a[4]a[2]
2822	#                             a[5]a[2]
2823	#                         a[6]a[2]
2824	#                     a[7]a[2]
2825	#                             a[4]a[3]
2826	#                         a[5]a[3]
2827	#                     a[6]a[3]
2828	#                 a[7]a[3]
2829	#
2830	#                     a[5]a[4]
2831	#                 a[6]a[4]
2832	#             a[7]a[4]
2833	#             a[6]a[5]
2834	#         a[7]a[5]
2835	#     a[7]a[6]
2836	# a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
2837___
2838{
2839my ($zero,$carry)=("%rbp","%rcx");
2840my $aaptr=$zero;	# shares %rbp with $zero
2841$code.=<<___;
2842	lea	48+8(%rsp),$tptr
2843	lea	($aptr,$num),$aaptr
2844	mov	$num,0+8(%rsp)			# save $num
2845	mov	$aaptr,8+8(%rsp)		# save end of $aptr
2846	jmp	.Lsqr8x_zero_start		# first pass enters mid-loop, so the low 64 bytes of t are not cleared
2847
2848.align	32
2849.byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
2850.Lsqrx8x_zero:
2851	.byte	0x3e
2852	movdqa	%xmm0,0*8($tptr)
2853	movdqa	%xmm0,2*8($tptr)
2854	movdqa	%xmm0,4*8($tptr)
2855	movdqa	%xmm0,6*8($tptr)
2856.Lsqr8x_zero_start:			# aligned at 32
2857	movdqa	%xmm0,8*8($tptr)
2858	movdqa	%xmm0,10*8($tptr)
2859	movdqa	%xmm0,12*8($tptr)
2860	movdqa	%xmm0,14*8($tptr)
2861	lea	16*8($tptr),$tptr	# 128 bytes of t per pass
2862	sub	\$64,$num		# 64 bytes of input length per pass
2863	jnz	.Lsqrx8x_zero
2864
2865	mov	0*8($aptr),%rdx		# a[0], modulo-scheduled
2866	#xor	%r9,%r9			# t[1], ex-$num, zero already
2867	xor	%r10,%r10
2868	xor	%r11,%r11
2869	xor	%r12,%r12
2870	xor	%r13,%r13
2871	xor	%r14,%r14
2872	xor	%r15,%r15
2873	lea	48+8(%rsp),$tptr
2874	xor	$zero,$zero		# cf=0, of=0
2875	jmp	.Lsqrx8x_outer_loop
2876
2877.align	32
2878.Lsqrx8x_outer_loop:
2879	mulx	1*8($aptr),%r8,%rax	# a[1]*a[0]
2880	adcx	%r9,%r8			# a[1]*a[0]+=t[1]
2881	adox	%rax,%r10
2882	mulx	2*8($aptr),%r9,%rax	# a[2]*a[0]
2883	adcx	%r10,%r9
2884	adox	%rax,%r11
2885	.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00	# mulx	3*8($aptr),%r10,%rax	# ...
2886	adcx	%r11,%r10
2887	adox	%rax,%r12
2888	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00	# mulx	4*8($aptr),%r11,%rax
2889	adcx	%r12,%r11
2890	adox	%rax,%r13
2891	mulx	5*8($aptr),%r12,%rax
2892	adcx	%r13,%r12
2893	adox	%rax,%r14
2894	mulx	6*8($aptr),%r13,%rax
2895	adcx	%r14,%r13
2896	adox	%r15,%rax
2897	mulx	7*8($aptr),%r14,%r15
2898	 mov	1*8($aptr),%rdx		# a[1]
2899	adcx	%rax,%r14
2900	adox	$zero,%r15
2901	adc	8*8($tptr),%r15
2902	mov	%r8,1*8($tptr)		# t[1]
2903	mov	%r9,2*8($tptr)		# t[2]
2904	sbb	$carry,$carry		# mov %cf,$carry
2905	xor	$zero,$zero		# cf=0, of=0
2906
2907
2908	mulx	2*8($aptr),%r8,%rbx	# a[2]*a[1]
2909	mulx	3*8($aptr),%r9,%rax	# a[3]*a[1]
2910	adcx	%r10,%r8
2911	adox	%rbx,%r9
2912	mulx	4*8($aptr),%r10,%rbx	# ...
2913	adcx	%r11,%r9
2914	adox	%rax,%r10
2915	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00	# mulx	5*8($aptr),%r11,%rax
2916	adcx	%r12,%r10
2917	adox	%rbx,%r11
2918	.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx	6*8($aptr),%r12,%rbx
2919	adcx	%r13,%r11
2920	adox	%r14,%r12
2921	.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00	# mulx	7*8($aptr),%r13,%r14
2922	 mov	2*8($aptr),%rdx		# a[2]
2923	adcx	%rax,%r12
2924	adox	%rbx,%r13
2925	adcx	%r15,%r13
2926	adox	$zero,%r14		# of=0
2927	adcx	$zero,%r14		# cf=0
2928
2929	mov	%r8,3*8($tptr)		# t[3]
2930	mov	%r9,4*8($tptr)		# t[4]
2931
2932	mulx	3*8($aptr),%r8,%rbx	# a[3]*a[2]
2933	mulx	4*8($aptr),%r9,%rax	# a[4]*a[2]
2934	adcx	%r10,%r8
2935	adox	%rbx,%r9
2936	mulx	5*8($aptr),%r10,%rbx	# ...
2937	adcx	%r11,%r9
2938	adox	%rax,%r10
2939	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00	# mulx	6*8($aptr),%r11,%rax
2940	adcx	%r12,%r10
2941	adox	%r13,%r11
2942	.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00	# mulx	7*8($aptr),%r12,%r13
2943	.byte	0x3e
2944	 mov	3*8($aptr),%rdx		# a[3]
2945	adcx	%rbx,%r11
2946	adox	%rax,%r12
2947	adcx	%r14,%r12
2948	mov	%r8,5*8($tptr)		# t[5]
2949	mov	%r9,6*8($tptr)		# t[6]
2950	 mulx	4*8($aptr),%r8,%rax	# a[4]*a[3]
2951	adox	$zero,%r13		# of=0
2952	adcx	$zero,%r13		# cf=0
2953
2954	mulx	5*8($aptr),%r9,%rbx	# a[5]*a[3]
2955	adcx	%r10,%r8
2956	adox	%rax,%r9
2957	mulx	6*8($aptr),%r10,%rax	# ...
2958	adcx	%r11,%r9
2959	adox	%r12,%r10
2960	mulx	7*8($aptr),%r11,%r12
2961	 mov	4*8($aptr),%rdx		# a[4]
2962	 mov	5*8($aptr),%r14		# a[5]
2963	adcx	%rbx,%r10
2964	adox	%rax,%r11
2965	 mov	6*8($aptr),%r15		# a[6]
2966	adcx	%r13,%r11
2967	adox	$zero,%r12		# of=0
2968	adcx	$zero,%r12		# cf=0
2969
2970	mov	%r8,7*8($tptr)		# t[7]
2971	mov	%r9,8*8($tptr)		# t[8]
2972
2973	mulx	%r14,%r9,%rax		# a[5]*a[4]
2974	 mov	7*8($aptr),%r8		# a[7]
2975	adcx	%r10,%r9
2976	mulx	%r15,%r10,%rbx		# a[6]*a[4]
2977	adox	%rax,%r10
2978	adcx	%r11,%r10
2979	mulx	%r8,%r11,%rax		# a[7]*a[4]
2980	 mov	%r14,%rdx		# a[5]
2981	adox	%rbx,%r11
2982	adcx	%r12,%r11
2983	#adox	$zero,%rax		# of=0
2984	adcx	$zero,%rax		# cf=0
2985
2986	mulx	%r15,%r14,%rbx		# a[6]*a[5]
2987	mulx	%r8,%r12,%r13		# a[7]*a[5]
2988	 mov	%r15,%rdx		# a[6]
2989	 lea	8*8($aptr),$aptr	# advance to the next 8 input words
2990	adcx	%r14,%r11
2991	adox	%rbx,%r12
2992	adcx	%rax,%r12
2993	adox	$zero,%r13
2994
2995	.byte	0x67,0x67
2996	mulx	%r8,%r8,%r14		# a[7]*a[6]
2997	adcx	%r8,%r13
2998	adcx	$zero,%r14
2999
3000	cmp	8+8(%rsp),$aptr		# reached the saved end of the input?
3001	je	.Lsqrx8x_outer_break
3002
3003	neg	$carry			# mov $carry,%cf
3004	mov	\$-8,%rcx		# inner loop counter, runs -8..-1
3005	mov	$zero,%r15
3006	mov	8*8($tptr),%r8
3007	adcx	9*8($tptr),%r9		# +=t[9]
3008	adcx	10*8($tptr),%r10	# ...
3009	adcx	11*8($tptr),%r11
3010	adc	12*8($tptr),%r12
3011	adc	13*8($tptr),%r13
3012	adc	14*8($tptr),%r14
3013	adc	15*8($tptr),%r15
3014	lea	($aptr),$aaptr
3015	lea	2*64($tptr),$tptr
3016	sbb	%rax,%rax		# mov %cf,$carry
3017
3018	mov	-64($aptr),%rdx		# a[0]
3019	mov	%rax,16+8(%rsp)		# offload $carry
3020	mov	$tptr,24+8(%rsp)	# remember this window start for the break path
3021
3022	#lea	8*8($tptr),$tptr	# see 2*8*8($tptr) above
3023	xor	%eax,%eax		# cf=0, of=0
3024	jmp	.Lsqrx8x_loop
3025
3026.align	32
3027.Lsqrx8x_loop:
3028	mov	%r8,%rbx
3029	mulx	0*8($aaptr),%rax,%r8	# a[8]*a[i]
3030	adcx	%rax,%rbx		# +=t[8]
3031	adox	%r9,%r8
3032
3033	mulx	1*8($aaptr),%rax,%r9	# ...
3034	adcx	%rax,%r8
3035	adox	%r10,%r9
3036
3037	mulx	2*8($aaptr),%rax,%r10
3038	adcx	%rax,%r9
3039	adox	%r11,%r10
3040
3041	mulx	3*8($aaptr),%rax,%r11
3042	adcx	%rax,%r10
3043	adox	%r12,%r11
3044
3045	.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	4*8($aaptr),%rax,%r12
3046	adcx	%rax,%r11
3047	adox	%r13,%r12
3048
3049	mulx	5*8($aaptr),%rax,%r13
3050	adcx	%rax,%r12
3051	adox	%r14,%r13
3052
3053	mulx	6*8($aaptr),%rax,%r14
3054	 mov	%rbx,($tptr,%rcx,8)	# store t[8+i]
3055	 mov	\$0,%ebx
3056	adcx	%rax,%r13
3057	adox	%r15,%r14
3058
3059	.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00	# mulx	7*8($aaptr),%rax,%r15
3060	 mov	8($aptr,%rcx,8),%rdx	# a[i]
3061	adcx	%rax,%r14
3062	adox	%rbx,%r15		# %rbx is 0, of=0
3063	adcx	%rbx,%r15		# cf=0
3064
3065	.byte	0x67
3066	inc	%rcx			# of=0
3067	jnz	.Lsqrx8x_loop
3068
3069	lea	8*8($aaptr),$aaptr
3070	mov	\$-8,%rcx
3071	cmp	8+8(%rsp),$aaptr	# done?
3072	je	.Lsqrx8x_break
3073
3074	sub	16+8(%rsp),%rbx		# mov 16(%rsp),%cf
3075	.byte	0x66
3076	mov	-64($aptr),%rdx
3077	adcx	0*8($tptr),%r8
3078	adcx	1*8($tptr),%r9
3079	adc	2*8($tptr),%r10
3080	adc	3*8($tptr),%r11
3081	adc	4*8($tptr),%r12
3082	adc	5*8($tptr),%r13
3083	adc	6*8($tptr),%r14
3084	adc	7*8($tptr),%r15
3085	lea	8*8($tptr),$tptr
3086	.byte	0x67
3087	sbb	%rax,%rax		# mov %cf,%rax
3088	xor	%ebx,%ebx		# cf=0, of=0
3089	mov	%rax,16+8(%rsp)		# offload carry
3090	jmp	.Lsqrx8x_loop
3091
3092.align	32
3093.Lsqrx8x_break:
3094	xor	$zero,$zero
3095	sub	16+8(%rsp),%rbx		# mov 16(%rsp),%cf
3096	adcx	$zero,%r8
3097	mov	24+8(%rsp),$carry	# initial $tptr, borrow $carry
3098	adcx	$zero,%r9
3099	mov	0*8($aptr),%rdx		# a[8], modulo-scheduled
3100	adc	\$0,%r10
3101	mov	%r8,0*8($tptr)
3102	adc	\$0,%r11
3103	adc	\$0,%r12
3104	adc	\$0,%r13
3105	adc	\$0,%r14
3106	adc	\$0,%r15
3107	cmp	$carry,$tptr		# cf=0, of=0
3108	je	.Lsqrx8x_outer_loop
3109
3110	mov	%r9,1*8($tptr)
3111	 mov	1*8($carry),%r9
3112	mov	%r10,2*8($tptr)
3113	 mov	2*8($carry),%r10
3114	mov	%r11,3*8($tptr)
3115	 mov	3*8($carry),%r11
3116	mov	%r12,4*8($tptr)
3117	 mov	4*8($carry),%r12
3118	mov	%r13,5*8($tptr)
3119	 mov	5*8($carry),%r13
3120	mov	%r14,6*8($tptr)
3121	 mov	6*8($carry),%r14
3122	mov	%r15,7*8($tptr)
3123	 mov	7*8($carry),%r15
3124	mov	$carry,$tptr		# continue from the remembered window start
3125	jmp	.Lsqrx8x_outer_loop
3126
3127.align	32
3128.Lsqrx8x_outer_break:
3129	mov	%r9,9*8($tptr)		# t[9]
3130	 movq	%xmm3,%rcx		# -$num
3131	mov	%r10,10*8($tptr)	# ...
3132	mov	%r11,11*8($tptr)
3133	mov	%r12,12*8($tptr)
3134	mov	%r13,13*8($tptr)
3135	mov	%r14,14*8($tptr)
3136___
3137}{
3138my $i="%rcx";
3139$code.=<<___;
3140	lea	48+8(%rsp),$tptr
3141	mov	($aptr,$i),%rdx		# a[0]
3142
3143	mov	8($tptr),$A0[1]		# t[1]
3144	xor	$A0[0],$A0[0]		# t[0], of=0, cf=0
3145	mov	0+8(%rsp),$num		# restore $num
3146	adox	$A0[1],$A0[1]
3147	 mov	16($tptr),$A1[0]	# t[2]	# prefetch
3148	 mov	24($tptr),$A1[1]	# t[3]	# prefetch
3149	#jmp	.Lsqrx4x_shift_n_add	# happens to be aligned
3150
3151.align	32
3152.Lsqrx4x_shift_n_add:
3153	mulx	%rdx,%rax,%rbx
3154	 adox	$A1[0],$A1[0]
3155	adcx	$A0[0],%rax
3156	 .byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00	# mov	8($aptr,$i),%rdx	# a[i+1]	# prefetch
3157	 .byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00	# mov	32($tptr),$A0[0]	# t[2*i+4]	# prefetch
3158	 adox	$A1[1],$A1[1]
3159	adcx	$A0[1],%rbx
3160	 mov	40($tptr),$A0[1]		# t[2*i+4+1]	# prefetch
3161	mov	%rax,0($tptr)
3162	mov	%rbx,8($tptr)
3163
3164	mulx	%rdx,%rax,%rbx
3165	 adox	$A0[0],$A0[0]
3166	adcx	$A1[0],%rax
3167	 mov	16($aptr,$i),%rdx	# a[i+2]	# prefetch
3168	 mov	48($tptr),$A1[0]	# t[2*i+6]	# prefetch
3169	 adox	$A0[1],$A0[1]
3170	adcx	$A1[1],%rbx
3171	 mov	56($tptr),$A1[1]	# t[2*i+6+1]	# prefetch
3172	mov	%rax,16($tptr)
3173	mov	%rbx,24($tptr)
3174
3175	mulx	%rdx,%rax,%rbx
3176	 adox	$A1[0],$A1[0]
3177	adcx	$A0[0],%rax
3178	 mov	24($aptr,$i),%rdx	# a[i+3]	# prefetch
3179	 lea	32($i),$i
3180	 mov	64($tptr),$A0[0]	# t[2*i+8]	# prefetch
3181	 adox	$A1[1],$A1[1]
3182	adcx	$A0[1],%rbx
3183	 mov	72($tptr),$A0[1]	# t[2*i+8+1]	# prefetch
3184	mov	%rax,32($tptr)
3185	mov	%rbx,40($tptr)
3186
3187	mulx	%rdx,%rax,%rbx
3188	 adox	$A0[0],$A0[0]
3189	adcx	$A1[0],%rax
3190	jrcxz	.Lsqrx4x_shift_n_add_break
3191	 .byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00	# mov	0($aptr,$i),%rdx	# a[i+4]	# prefetch
3192	 adox	$A0[1],$A0[1]
3193	adcx	$A1[1],%rbx
3194	 mov	80($tptr),$A1[0]	# t[2*i+10]	# prefetch
3195	 mov	88($tptr),$A1[1]	# t[2*i+10+1]	# prefetch
3196	mov	%rax,48($tptr)
3197	mov	%rbx,56($tptr)
3198	lea	64($tptr),$tptr
3199	nop
3200	jmp	.Lsqrx4x_shift_n_add
3201
3202.align	32
3203.Lsqrx4x_shift_n_add_break:
3204	adcx	$A1[1],%rbx
3205	mov	%rax,48($tptr)
3206	mov	%rbx,56($tptr)
3207	lea	64($tptr),$tptr		# end of t[] buffer
3208___
3209}
3210######################################################################
3211# Montgomery reduction part, "word-by-word" algorithm.
3212#
3213# This new path is inspired by multiple submissions from Intel, by
3214# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
3215# Vinodh Gopal...
#
# As written below: each pass of .Lsqrx8x_reduction_loop takes an
# 8-word window of t[], derives eight multipliers n0*t[i] one word at
# a time in .Lsqrx8x_reduce (saving them on the stack), and then
# .Lsqrx8x_tail applies those saved multipliers to the remaining
# 8-word groups of n[].
3216{
3217my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx");
3218
3219$code.=<<___;
3220	movq	%xmm2,$nptr		# modulus pointer saved by the caller
3221__bn_sqrx8x_reduction:
3222	xor	%eax,%eax		# initial top-most carry bit
3223	mov	32+8(%rsp),%rbx		# n0
3224	mov	48+8(%rsp),%rdx		# "%r8", 8*0($tptr)
3225	lea	-8*8($nptr,$num),%rcx	# end of n[]
3226	#lea	48+8(%rsp,$num,2),$tptr	# end of t[] buffer
3227	mov	%rcx, 0+8(%rsp)		# save end of n[]
3228	mov	$tptr,8+8(%rsp)		# save end of t[]
3229
3230	lea	48+8(%rsp),$tptr		# initial t[] window
3231	jmp	.Lsqrx8x_reduction_loop
3232
3233.align	32
3234.Lsqrx8x_reduction_loop:
3235	mov	8*1($tptr),%r9
3236	mov	8*2($tptr),%r10
3237	mov	8*3($tptr),%r11
3238	mov	8*4($tptr),%r12
3239	mov	%rdx,%r8
3240	imulq	%rbx,%rdx		# n0*a[i]
3241	mov	8*5($tptr),%r13
3242	mov	8*6($tptr),%r14
3243	mov	8*7($tptr),%r15
3244	mov	%rax,24+8(%rsp)		# store top-most carry bit
3245
3246	lea	8*8($tptr),$tptr
3247	xor	$carry,$carry		# cf=0,of=0
3248	mov	\$-8,%rcx		# counter runs -8..-1
3249	jmp	.Lsqrx8x_reduce
3250
3251.align	32
3252.Lsqrx8x_reduce:
3253	mov	%r8, %rbx
3254	mulx	8*0($nptr),%rax,%r8	# n[0]
3255	adcx	%rbx,%rax		# discarded
3256	adox	%r9,%r8
3257
3258	mulx	8*1($nptr),%rbx,%r9	# n[1]
3259	adcx	%rbx,%r8
3260	adox	%r10,%r9
3261
3262	mulx	8*2($nptr),%rbx,%r10
3263	adcx	%rbx,%r9
3264	adox	%r11,%r10
3265
3266	mulx	8*3($nptr),%rbx,%r11
3267	adcx	%rbx,%r10
3268	adox	%r12,%r11
3269
3270	.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	8*4($nptr),%rbx,%r12
3271	 mov	%rdx,%rax		# park the current multiplier
3272	 mov	%r8,%rdx		# new lowest word
3273	adcx	%rbx,%r11
3274	adox	%r13,%r12
3275
3276	 mulx	32+8(%rsp),%rbx,%rdx	# %rdx discarded
3277	 mov	%rax,%rdx		# back to the current multiplier
3278	 mov	%rax,64+48+8(%rsp,%rcx,8)	# put aside n0*a[i]
3279
3280	mulx	8*5($nptr),%rax,%r13
3281	adcx	%rax,%r12
3282	adox	%r14,%r13
3283
3284	mulx	8*6($nptr),%rax,%r14
3285	adcx	%rax,%r13
3286	adox	%r15,%r14
3287
3288	mulx	8*7($nptr),%rax,%r15
3289	 mov	%rbx,%rdx		# next multiplier, computed by the mulx with n0 above
3290	adcx	%rax,%r14
3291	adox	$carry,%r15		# $carry is 0
3292	adcx	$carry,%r15		# cf=0
3293
3294	.byte	0x67,0x67,0x67
3295	inc	%rcx			# of=0
3296	jnz	.Lsqrx8x_reduce
3297
3298	mov	$carry,%rax		# xor	%rax,%rax
3299	cmp	0+8(%rsp),$nptr		# end of n[]?
3300	jae	.Lsqrx8x_no_tail
3301
3302	mov	48+8(%rsp),%rdx		# pull n0*a[0]
3303	add	8*0($tptr),%r8
3304	lea	8*8($nptr),$nptr
3305	mov	\$-8,%rcx
3306	adcx	8*1($tptr),%r9
3307	adcx	8*2($tptr),%r10
3308	adc	8*3($tptr),%r11
3309	adc	8*4($tptr),%r12
3310	adc	8*5($tptr),%r13
3311	adc	8*6($tptr),%r14
3312	adc	8*7($tptr),%r15
3313	lea	8*8($tptr),$tptr
3314	sbb	%rax,%rax		# top carry
3315
3316	xor	$carry,$carry		# of=0, cf=0
3317	mov	%rax,16+8(%rsp)		# offload carry as 0 or -1
3318	jmp	.Lsqrx8x_tail
3319
3320.align	32
3321.Lsqrx8x_tail:
3322	mov	%r8,%rbx
3323	mulx	8*0($nptr),%rax,%r8
3324	adcx	%rax,%rbx
3325	adox	%r9,%r8
3326
3327	mulx	8*1($nptr),%rax,%r9
3328	adcx	%rax,%r8
3329	adox	%r10,%r9
3330
3331	mulx	8*2($nptr),%rax,%r10
3332	adcx	%rax,%r9
3333	adox	%r11,%r10
3334
3335	mulx	8*3($nptr),%rax,%r11
3336	adcx	%rax,%r10
3337	adox	%r12,%r11
3338
3339	.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	8*4($nptr),%rax,%r12
3340	adcx	%rax,%r11
3341	adox	%r13,%r12
3342
3343	mulx	8*5($nptr),%rax,%r13
3344	adcx	%rax,%r12
3345	adox	%r14,%r13
3346
3347	mulx	8*6($nptr),%rax,%r14
3348	adcx	%rax,%r13
3349	adox	%r15,%r14
3350
3351	mulx	8*7($nptr),%rax,%r15
3352	 mov	72+48+8(%rsp,%rcx,8),%rdx	# pull n0*a[i]
3353	adcx	%rax,%r14
3354	adox	$carry,%r15
3355	 mov	%rbx,($tptr,%rcx,8)	# save result
3356	 mov	%r8,%rbx
3357	adcx	$carry,%r15		# cf=0
3358
3359	inc	%rcx			# of=0
3360	jnz	.Lsqrx8x_tail
3361
3362	cmp	0+8(%rsp),$nptr		# end of n[]?
3363	jae	.Lsqrx8x_tail_done	# break out of loop
3364
3365	sub	16+8(%rsp),$carry	# mov 16(%rsp),%cf
3366	 mov	48+8(%rsp),%rdx		# pull n0*a[0]
3367	 lea	8*8($nptr),$nptr
3368	adc	8*0($tptr),%r8
3369	adc	8*1($tptr),%r9
3370	adc	8*2($tptr),%r10
3371	adc	8*3($tptr),%r11
3372	adc	8*4($tptr),%r12
3373	adc	8*5($tptr),%r13
3374	adc	8*6($tptr),%r14
3375	adc	8*7($tptr),%r15
3376	lea	8*8($tptr),$tptr
3377	sbb	%rax,%rax
3378	sub	\$8,%rcx		# mov	\$-8,%rcx
3379
3380	xor	$carry,$carry		# of=0, cf=0
3381	mov	%rax,16+8(%rsp)
3382	jmp	.Lsqrx8x_tail
3383
3384.align	32
3385.Lsqrx8x_tail_done:
3386	xor	%rax,%rax
3387	add	24+8(%rsp),%r8		# can this overflow?
3388	adc	\$0,%r9
3389	adc	\$0,%r10
3390	adc	\$0,%r11
3391	adc	\$0,%r12
3392	adc	\$0,%r13
3393	adc	\$0,%r14
3394	adc	\$0,%r15
3395	adc	\$0,%rax
3396
3397	sub	16+8(%rsp),$carry	# mov 16(%rsp),%cf
3398.Lsqrx8x_no_tail:			# %cf is 0 if jumped here
3399	adc	8*0($tptr),%r8
3400	 movq	%xmm3,%rcx		# negative byte count
3401	adc	8*1($tptr),%r9
3402	 mov	8*7($nptr),$carry
3403	 movq	%xmm2,$nptr		# restore $nptr
3404	adc	8*2($tptr),%r10
3405	adc	8*3($tptr),%r11
3406	adc	8*4($tptr),%r12
3407	adc	8*5($tptr),%r13
3408	adc	8*6($tptr),%r14
3409	adc	8*7($tptr),%r15
3410	adc	\$0,%rax		# top-most carry
3411
3412	mov	32+8(%rsp),%rbx		# n0
3413	mov	8*8($tptr,%rcx),%rdx	# modulo-scheduled "%r8"
3414
3415	mov	%r8,8*0($tptr)		# store top 512 bits
3416	 lea	8*8($tptr),%r8		# borrow %r8
3417	mov	%r9,8*1($tptr)
3418	mov	%r10,8*2($tptr)
3419	mov	%r11,8*3($tptr)
3420	mov	%r12,8*4($tptr)
3421	mov	%r13,8*5($tptr)
3422	mov	%r14,8*6($tptr)
3423	mov	%r15,8*7($tptr)
3424
3425	lea	8*8($tptr,%rcx),$tptr	# start of current t[] window
3426	cmp	8+8(%rsp),%r8		# end of t[]?
3427	jb	.Lsqrx8x_reduction_loop
3428	ret
3429.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
3430___
3431}
3432##############################################################
3433# Post-condition, 4x unrolled
3434#
# Copies the number at $tptr to the result buffer four words at a time,
# adding (~n[i] & mask) with carry, where mask is the negated %rax and
# n[0] is pre-decremented. With an all-ones mask this yields t - n;
# with a zero mask t is copied unchanged. (Assumes %rax enters as 0 or 1,
# the top-most carry left by the caller -- confirm against callers.)
# On return %r10 holds the negative byte count, %r9 the positive one,
# and $aptr points at the result for a back-to-back squaring call.
3435{
3436my ($rptr,$nptr)=("%rdx","%rbp");
3437$code.=<<___;
3438.align	32
3439__bn_postx4x_internal:
3440	mov	8*0($nptr),%r12
3441	mov	%rcx,%r10		# -$num
3442	mov	%rcx,%r9		# -$num
3443	neg	%rax			# turn the incoming value into the mask
3444	sar	\$3+2,%rcx		# negative count of 4-word groups
3445	#lea	48+8(%rsp,%r9),$tptr
3446	movq	%xmm1,$rptr		# restore $rptr
3447	movq	%xmm1,$aptr		# prepare for back-to-back call
3448	dec	%r12			# so that after 'not' we get -n[0]
3449	mov	8*1($nptr),%r13
3450	xor	%r8,%r8			# no incoming borrow for the first group
3451	mov	8*2($nptr),%r14
3452	mov	8*3($nptr),%r15
3453	jmp	.Lsqrx4x_sub_entry
3454
3455.align	16
3456.Lsqrx4x_sub:
3457	mov	8*0($nptr),%r12
3458	mov	8*1($nptr),%r13
3459	mov	8*2($nptr),%r14
3460	mov	8*3($nptr),%r15
3461.Lsqrx4x_sub_entry:
3462	andn	%rax,%r12,%r12		# (not n word) and mask
3463	lea	8*4($nptr),$nptr
3464	andn	%rax,%r13,%r13
3465	andn	%rax,%r14,%r14
3466	andn	%rax,%r15,%r15
3467
3468	neg	%r8			# mov %r8,%cf
3469	adc	8*0($tptr),%r12
3470	adc	8*1($tptr),%r13
3471	adc	8*2($tptr),%r14
3472	adc	8*3($tptr),%r15
3473	mov	%r12,8*0($rptr)
3474	lea	8*4($tptr),$tptr
3475	mov	%r13,8*1($rptr)
3476	sbb	%r8,%r8			# mov %cf,%r8
3477	mov	%r14,8*2($rptr)
3478	mov	%r15,8*3($rptr)
3479	lea	8*4($rptr),$rptr
3480
3481	inc	%rcx
3482	jnz	.Lsqrx4x_sub
3483
3484	neg	%r9			# restore $num
3485
3486	ret
3487.size	__bn_postx4x_internal,.-__bn_postx4x_internal
3488___
3489}
3490}}}
3491{
3492my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order
3493				("%rdi","%esi","%rdx","%ecx");  # Unix order
3494my $out=$inp;		# bn_gather5 writes through the same first-argument register
3495my $STRIDE=2**5*8;	# 256 bytes: 32 table entries of 8 bytes each per row
3496my $N=$STRIDE/4;	# 64; not referenced in the code visible in this scope
3497
3498$code.=<<___;
3499.globl	bn_get_bits5
3500.type	bn_get_bits5,\@abi-omnipotent
3501.align	16
3502bn_get_bits5:
	# Returns in %eax the 5 bits that start at bit offset num of the
	# array at inp. The array is read as 16-bit words; when the offset
	# within the word is above 11 the field would cross the word, so the
	# load is moved one byte up and the shift reduced by 8.
3503	lea	0($inp),%r10
3504	lea	1($inp),%r11		# same array, one byte further
3505	mov	$num,%ecx
3506	shr	\$4,$num		# index of the 16-bit word
3507	and	\$15,%ecx		# bit offset inside that word
3508	lea	-8(%ecx),%eax		# offset relative to the byte-shifted load
3509	cmp	\$11,%ecx
3510	cmova	%r11,%r10		# offset 12..15: use the byte-shifted base
3511	cmova	%eax,%ecx		# ... and the reduced shift count
3512	movzw	(%r10,$num,2),%eax
3513	shrl	%cl,%eax
3514	and	\$31,%eax		# keep 5 bits
3515	ret
3516.size	bn_get_bits5,.-bn_get_bits5
3517
3518.globl	bn_scatter5
3519.type	bn_scatter5,\@abi-omnipotent
3520.align	16
3521bn_scatter5:
	# Stores num 8-byte words from inp into the table: word k goes to
	# tbl + idx*8 + k*256. Returns immediately when num is zero.
3522	cmp	\$0, $num
3523	jz	.Lscatter_epilogue
3524	lea	($tbl,$idx,8),$tbl	# select the 8-byte slot idx inside each row
3525.Lscatter:
3526	mov	($inp),%rax
3527	lea	8($inp),$inp
3528	mov	%rax,($tbl)
3529	lea	32*8($tbl),$tbl		# next row, 256 bytes further
3530	sub	\$1,$num
3531	jnz	.Lscatter
3532.Lscatter_epilogue:
3533	ret
3534.size	bn_scatter5,.-bn_scatter5
3535
3536.globl	bn_gather5
3537.type	bn_gather5,\@abi-omnipotent
3538.align	32
3539bn_gather5:
	# Reads entry idx of the table into out, num words long. A 256-byte
	# mask is first built on the stack by comparing counters with idx.
	# Then for every output word the whole 256-byte table row is loaded,
	# and-ed with the mask and or-ed together, so each row is read in
	# full regardless of idx.
3540.LSEH_begin_bn_gather5:			# Win64 thing, but harmless in other cases
3541	# I can't trust assembler to use specific encoding:-(
3542	.byte	0x4c,0x8d,0x14,0x24			#lea    (%rsp),%r10
3543	.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00	#sub	$0x108,%rsp
3544	lea	.Linc(%rip),%rax
3545	and	\$-16,%rsp		# shouldn't be formally required
3546
3547	movd	$idx,%xmm5
3548	movdqa	0(%rax),%xmm0		# 00000001000000010000000000000000
3549	movdqa	16(%rax),%xmm1		# 00000002000000020000000200000002
3550	lea	128($tbl),%r11		# size optimization
3551	lea	128(%rsp),%rax		# size optimization
3552
3553	pshufd	\$0,%xmm5,%xmm5		# broadcast $idx
3554	movdqa	%xmm1,%xmm4
3555	movdqa	%xmm1,%xmm2
3556___
3557########################################################################
3558# calculate mask by comparing 0..31 to $idx and save result to stack
3559#
# Each pass of the Perl loop emits code for four 16-byte masks. The store
# of the fourth mask (xmm3) is emitted at the start of the next pass and
# once more after the loop.
3560for($i=0;$i<$STRIDE/16;$i+=4) {
3561$code.=<<___;
3562	paddd	%xmm0,%xmm1
3563	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
3564___
3565$code.=<<___	if ($i);
3566	movdqa	%xmm3,`16*($i-1)-128`(%rax)
3567___
3568$code.=<<___;
3569	movdqa	%xmm4,%xmm3
3570
3571	paddd	%xmm1,%xmm2
3572	pcmpeqd	%xmm5,%xmm1		# compare to 3,2
3573	movdqa	%xmm0,`16*($i+0)-128`(%rax)
3574	movdqa	%xmm4,%xmm0
3575
3576	paddd	%xmm2,%xmm3
3577	pcmpeqd	%xmm5,%xmm2		# compare to 5,4
3578	movdqa	%xmm1,`16*($i+1)-128`(%rax)
3579	movdqa	%xmm4,%xmm1
3580
3581	paddd	%xmm3,%xmm0
3582	pcmpeqd	%xmm5,%xmm3		# compare to 7,6
3583	movdqa	%xmm2,`16*($i+2)-128`(%rax)
3584	movdqa	%xmm4,%xmm2
3585___
3586}
3587$code.=<<___;
3588	movdqa	%xmm3,`16*($i-1)-128`(%rax)
3589	jmp	.Lgather
3590
3591.align	32
3592.Lgather:
3593	pxor	%xmm4,%xmm4
3594	pxor	%xmm5,%xmm5
3595___
# One pass of this Perl loop covers 64 bytes of the table row, so the
# emitted code reads the full row of $STRIDE bytes for every output word.
3596for($i=0;$i<$STRIDE/16;$i+=4) {
3597$code.=<<___;
3598	movdqa	`16*($i+0)-128`(%r11),%xmm0
3599	movdqa	`16*($i+1)-128`(%r11),%xmm1
3600	movdqa	`16*($i+2)-128`(%r11),%xmm2
3601	pand	`16*($i+0)-128`(%rax),%xmm0
3602	movdqa	`16*($i+3)-128`(%r11),%xmm3
3603	pand	`16*($i+1)-128`(%rax),%xmm1
3604	por	%xmm0,%xmm4
3605	pand	`16*($i+2)-128`(%rax),%xmm2
3606	por	%xmm1,%xmm5
3607	pand	`16*($i+3)-128`(%rax),%xmm3
3608	por	%xmm2,%xmm4
3609	por	%xmm3,%xmm5
3610___
3611}
3612$code.=<<___;
3613	por	%xmm5,%xmm4
3614	lea	$STRIDE(%r11),%r11
3615	pshufd	\$0x4e,%xmm4,%xmm0	# swap the two 64-bit halves
3616	por	%xmm4,%xmm0		# fold them into one 64-bit value
3617	movq	%xmm0,($out)		# m0=bp[0]
3618	lea	8($out),$out
3619	sub	\$1,$num
3620	jnz	.Lgather
3621
3622	lea	(%r10),%rsp		# %r10 holds the entry %rsp
3623	ret
3624.LSEH_end_bn_gather5:
3625.size	bn_gather5,.-bn_gather5
3626___
3627}
3628$code.=<<___;
3629.align	64
3630.Linc:
	# Constants loaded by bn_gather5 when it builds its selection mask.
3631	.long	0,0, 1,1		# starting counters 0 and 1, two dwords each
3632	.long	2,2, 2,2		# increment applied to step the counters
3633.asciz	"Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
3634___
3635
3636# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
3637#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
3638if ($win64) {
3639$rec="%rcx";
3640$frame="%rdx";
3641$context="%r8";
3642$disp="%r9";
3643
3644$code.=<<___;
3645.extern	__imp_RtlVirtualUnwind
3646.type	mul_handler,\@abi-omnipotent
3647.align	16
3648mul_handler:
	# Win64 exception handler. It compares context->Rip with up to three
	# image-relative labels from HandlerData to decide where the saved
	# stack pointer and the pushed non-volatile registers are, patches
	# them into the context, copies the context to disp->ContextRecord,
	# calls RtlVirtualUnwind and returns ExceptionContinueSearch.
	# The HandlerData tables themselves are outside this section.
3649	push	%rsi
3650	push	%rdi
3651	push	%rbx
3652	push	%rbp
3653	push	%r12
3654	push	%r13
3655	push	%r14
3656	push	%r15
3657	pushfq
3658	sub	\$64,%rsp
3659
3660	mov	120($context),%rax	# pull context->Rax
3661	mov	248($context),%rbx	# pull context->Rip
3662
3663	mov	8($disp),%rsi		# disp->ImageBase
3664	mov	56($disp),%r11		# disp->HandlerData
3665
3666	mov	0(%r11),%r10d		# HandlerData[0]
3667	lea	(%rsi,%r10),%r10	# end of prologue label
3668	cmp	%r10,%rbx		# context->Rip<end of prologue label
3669	jb	.Lcommon_seh_tail
3670
3671	mov	4(%r11),%r10d		# HandlerData[1]
3672	lea	(%rsi,%r10),%r10	# label from HandlerData[1] (NOTE(review): was commented as epilogue label, confirm)
3673	cmp	%r10,%rbx		# branch below is taken when context->Rip is lower than that label
3674	jb	.Lcommon_pop_regs	# %rax is still context->Rax on this path
3675
3676	mov	152($context),%rax	# pull context->Rsp
3677
3678	mov	8(%r11),%r10d		# HandlerData[2]
3679	lea	(%rsi,%r10),%r10	# epilogue label
3680	cmp	%r10,%rbx		# context->Rip>=epilogue label
3681	jae	.Lcommon_seh_tail
3682
3683	lea	.Lmul_epilogue(%rip),%r10
3684	cmp	%r10,%rbx
3685	ja	.Lbody_40		# Rip beyond .Lmul_epilogue: saved stack pointer is at 40(%rsp)
3686
3687	mov	192($context),%r10	# pull $num
3688	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
3689
3690	jmp	.Lcommon_pop_regs
3691
3692.Lbody_40:
3693	mov	40(%rax),%rax		# pull saved stack pointer
3694.Lcommon_pop_regs:
3695	mov	-8(%rax),%rbx
3696	mov	-16(%rax),%rbp
3697	mov	-24(%rax),%r12
3698	mov	-32(%rax),%r13
3699	mov	-40(%rax),%r14
3700	mov	-48(%rax),%r15
3701	mov	%rbx,144($context)	# restore context->Rbx
3702	mov	%rbp,160($context)	# restore context->Rbp
3703	mov	%r12,216($context)	# restore context->R12
3704	mov	%r13,224($context)	# restore context->R13
3705	mov	%r14,232($context)	# restore context->R14
3706	mov	%r15,240($context)	# restore context->R15
3707
3708.Lcommon_seh_tail:
3709	mov	8(%rax),%rdi
3710	mov	16(%rax),%rsi
3711	mov	%rax,152($context)	# restore context->Rsp
3712	mov	%rsi,168($context)	# restore context->Rsi
3713	mov	%rdi,176($context)	# restore context->Rdi
3714
3715	mov	40($disp),%rdi		# disp->ContextRecord
3716	mov	$context,%rsi		# context
3717	mov	\$154,%ecx		# sizeof(CONTEXT), counted in quadwords for movsq
3718	.long	0xa548f3fc		# cld; rep movsq
3719
3720	mov	$disp,%rsi
3721	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
3722	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
3723	mov	0(%rsi),%r8		# arg3, disp->ControlPc
3724	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
3725	mov	40(%rsi),%r10		# disp->ContextRecord
3726	lea	56(%rsi),%r11		# &disp->HandlerData
3727	lea	24(%rsi),%r12		# &disp->EstablisherFrame
3728	mov	%r10,32(%rsp)		# arg5
3729	mov	%r11,40(%rsp)		# arg6
3730	mov	%r12,48(%rsp)		# arg7
3731	mov	%rcx,56(%rsp)		# arg8, (NULL)
3732	call	*__imp_RtlVirtualUnwind(%rip)
3733
3734	mov	\$1,%eax		# ExceptionContinueSearch
3735	add	\$64,%rsp
3736	popfq
3737	pop	%r15
3738	pop	%r14
3739	pop	%r13
3740	pop	%r12
3741	pop	%rbp
3742	pop	%rbx
3743	pop	%rdi
3744	pop	%rsi
3745	ret
3746.size	mul_handler,.-mul_handler
3747
3748.section	.pdata
3749.align	4
3750	.rva	.LSEH_begin_bn_mul_mont_gather5
3751	.rva	.LSEH_end_bn_mul_mont_gather5
3752	.rva	.LSEH_info_bn_mul_mont_gather5
3753
3754	.rva	.LSEH_begin_bn_mul4x_mont_gather5
3755	.rva	.LSEH_end_bn_mul4x_mont_gather5
3756	.rva	.LSEH_info_bn_mul4x_mont_gather5
3757
3758	.rva	.LSEH_begin_bn_power5
3759	.rva	.LSEH_end_bn_power5
3760	.rva	.LSEH_info_bn_power5
3761
3762	.rva	.LSEH_begin_bn_from_mont8x
3763	.rva	.LSEH_end_bn_from_mont8x
3764	.rva	.LSEH_info_bn_from_mont8x
3765___
3766$code.=<<___ if ($addx);
3767	.rva	.LSEH_begin_bn_mulx4x_mont_gather5
3768	.rva	.LSEH_end_bn_mulx4x_mont_gather5
3769	.rva	.LSEH_info_bn_mulx4x_mont_gather5
3770
3771	.rva	.LSEH_begin_bn_powerx5
3772	.rva	.LSEH_end_bn_powerx5
3773	.rva	.LSEH_info_bn_powerx5
3774___
3775$code.=<<___;
3776	.rva	.LSEH_begin_bn_gather5
3777	.rva	.LSEH_end_bn_gather5
3778	.rva	.LSEH_info_bn_gather5
3779
3780.section	.xdata
3781.align	8
3782.LSEH_info_bn_mul_mont_gather5:
3783	.byte	9,0,0,0
3784	.rva	mul_handler
3785	.rva	.Lmul_body,.Lmul_body,.Lmul_epilogue		# HandlerData[]
3786.align	8
3787.LSEH_info_bn_mul4x_mont_gather5:
3788	.byte	9,0,0,0
3789	.rva	mul_handler
3790	.rva	.Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue		# HandlerData[]
3791.align	8
3792.LSEH_info_bn_power5:
3793	.byte	9,0,0,0
3794	.rva	mul_handler
3795	.rva	.Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue	# HandlerData[]
3796.align	8
3797.LSEH_info_bn_from_mont8x:
3798	.byte	9,0,0,0
3799	.rva	mul_handler
3800	.rva	.Lfrom_prologue,.Lfrom_body,.Lfrom_epilogue		# HandlerData[]
3801___
3802$code.=<<___ if ($addx);
3803.align	8
3804.LSEH_info_bn_mulx4x_mont_gather5:
3805	.byte	9,0,0,0
3806	.rva	mul_handler
3807	.rva	.Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue	# HandlerData[]
3808.align	8
3809.LSEH_info_bn_powerx5:
3810	.byte	9,0,0,0
3811	.rva	mul_handler
3812	.rva	.Lpowerx5_prologue,.Lpowerx5_body,.Lpowerx5_epilogue	# HandlerData[]
3813___
3814$code.=<<___;
3815.align	8
3816.LSEH_info_bn_gather5:
3817	.byte	0x01,0x0b,0x03,0x0a
3818	.byte	0x0b,0x01,0x21,0x00	# sub	rsp,0x108
3819	.byte	0x04,0xa3,0x00,0x00	# lea	r10,(rsp)
3820.align	8
3821___
3822}
3823
# Final stage of the generator.
#
# 1. Expand every back-tick quoted fragment in the accumulated assembly text
#    by evaluating it as a Perl expression and substituting the result.
# 2. Write the text to STDOUT, which was re-aliased at the top of the script
#    to a pipe feeding the x86_64-xlate.pl translator.
# 3. Close the pipe and fail loudly if that does not succeed.  Buffered write
#    errors and a non-zero exit status of the translator are only reported by
#    close; ignoring its result would let a truncated or empty output file
#    pass as a successful run.  (When the child merely exits non-zero, $! is
#    0 and the status is left in $?.)
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;
close STDOUT or die "error closing STDOUT: $!";
3828